dv-pipecat-ai 0.0.85.dev824__py3-none-any.whl → 0.0.85.dev858__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of dv-pipecat-ai might be problematic; see the package's registry page for more details.

Files changed (31):
  1. {dv_pipecat_ai-0.0.85.dev824.dist-info → dv_pipecat_ai-0.0.85.dev858.dist-info}/METADATA +2 -1
  2. {dv_pipecat_ai-0.0.85.dev824.dist-info → dv_pipecat_ai-0.0.85.dev858.dist-info}/RECORD +31 -29
  3. pipecat/audio/turn/smart_turn/local_smart_turn_v3.py +5 -1
  4. pipecat/frames/frames.py +22 -0
  5. pipecat/metrics/connection_metrics.py +45 -0
  6. pipecat/processors/aggregators/llm_response.py +15 -9
  7. pipecat/processors/dtmf_aggregator.py +17 -21
  8. pipecat/processors/frame_processor.py +44 -1
  9. pipecat/processors/metrics/frame_processor_metrics.py +108 -0
  10. pipecat/processors/transcript_processor.py +2 -1
  11. pipecat/serializers/__init__.py +2 -0
  12. pipecat/serializers/asterisk.py +16 -2
  13. pipecat/serializers/convox.py +2 -2
  14. pipecat/serializers/custom.py +2 -2
  15. pipecat/serializers/vi.py +326 -0
  16. pipecat/services/cartesia/tts.py +75 -10
  17. pipecat/services/deepgram/stt.py +317 -17
  18. pipecat/services/elevenlabs/stt.py +487 -19
  19. pipecat/services/elevenlabs/tts.py +28 -4
  20. pipecat/services/google/llm.py +26 -11
  21. pipecat/services/openai/base_llm.py +79 -14
  22. pipecat/services/salesforce/llm.py +64 -59
  23. pipecat/services/sarvam/tts.py +0 -1
  24. pipecat/services/soniox/stt.py +45 -10
  25. pipecat/services/vistaar/llm.py +97 -6
  26. pipecat/transcriptions/language.py +50 -0
  27. pipecat/transports/base_input.py +15 -11
  28. pipecat/transports/base_output.py +26 -3
  29. {dv_pipecat_ai-0.0.85.dev824.dist-info → dv_pipecat_ai-0.0.85.dev858.dist-info}/WHEEL +0 -0
  30. {dv_pipecat_ai-0.0.85.dev824.dist-info → dv_pipecat_ai-0.0.85.dev858.dist-info}/licenses/LICENSE +0 -0
  31. {dv_pipecat_ai-0.0.85.dev824.dist-info → dv_pipecat_ai-0.0.85.dev858.dist-info}/top_level.txt +0 -0
@@ -10,19 +10,27 @@ from typing import Any, AsyncGenerator, Dict, List, Optional
10
10
  from urllib.parse import urlencode
11
11
 
12
12
  import httpx
13
+ import jwt
13
14
  from loguru import logger
14
15
  from pydantic import BaseModel, Field
15
16
 
17
+ try:
18
+ import redis.asyncio as redis
19
+ REDIS_AVAILABLE = True
20
+ except ImportError:
21
+ REDIS_AVAILABLE = False
22
+ redis = None
23
+
16
24
  from pipecat.frames.frames import (
17
- EndFrame,
18
25
  CancelFrame,
26
+ EndFrame,
19
27
  Frame,
28
+ InterruptionFrame,
20
29
  LLMFullResponseEndFrame,
21
30
  LLMFullResponseStartFrame,
22
31
  LLMMessagesFrame,
23
32
  LLMTextFrame,
24
33
  LLMUpdateSettingsFrame,
25
- StartInterruptionFrame,
26
34
  )
27
35
  from pipecat.processors.aggregators.llm_response import (
28
36
  LLMAssistantAggregatorParams,
@@ -55,7 +63,9 @@ class VistaarLLMService(LLMService):
55
63
  Parameters:
56
64
  source_lang: Source language code (e.g., 'mr' for Marathi, 'hi' for Hindi).
57
65
  target_lang: Target language code for responses.
58
- session_id: Session ID for maintaining conversation context.
66
+ session_id: Session ID for maintaining conversation context (also used for JWT caching).
67
+ pre_query_response_phrases: List of phrases to say while waiting for response.
68
+ phone_number: Phone number for JWT subject claim.
59
69
  extra: Additional model-specific parameters
60
70
  """
61
71
 
@@ -63,6 +73,7 @@ class VistaarLLMService(LLMService):
63
73
  target_lang: Optional[str] = Field(default="mr")
64
74
  session_id: Optional[str] = Field(default=None)
65
75
  pre_query_response_phrases: Optional[List[str]] = Field(default_factory=list)
76
+ phone_number: Optional[str] = Field(default="UNKNOWN")
66
77
  extra: Optional[Dict[str, Any]] = Field(default_factory=dict)
67
78
 
68
79
  def __init__(
@@ -72,6 +83,9 @@ class VistaarLLMService(LLMService):
72
83
  params: Optional[InputParams] = None,
73
84
  timeout: float = 30.0,
74
85
  interim_timeout: float = 5.0,
86
+ redis_client: Optional[Any] = None, # redis.Redis type
87
+ jwt_private_key: Optional[str] = None,
88
+ jwt_token_expiry: int = 3600,
75
89
  **kwargs,
76
90
  ):
77
91
  """Initialize Vistaar LLM service.
@@ -81,6 +95,9 @@ class VistaarLLMService(LLMService):
81
95
  params: Input parameters for model configuration and behavior.
82
96
  timeout: Request timeout in seconds. Defaults to 30.0 seconds.
83
97
  interim_timeout: Time in seconds before sending interim message. Defaults to 5.0 seconds.
98
+ redis_client: Optional Redis client for JWT token caching.
99
+ jwt_private_key: Optional RSA private key in PEM format for JWT signing.
100
+ jwt_token_expiry: JWT token expiry time in seconds. Defaults to 3600 (1 hour).
84
101
  **kwargs: Additional arguments passed to the parent LLMService.
85
102
  """
86
103
  super().__init__(**kwargs)
@@ -95,6 +112,16 @@ class VistaarLLMService(LLMService):
95
112
  self._extra = params.extra if isinstance(params.extra, dict) else {}
96
113
  self._timeout = timeout
97
114
  self._interim_timeout = interim_timeout
115
+ self._phone_number = params.phone_number
116
+
117
+ # JWT authentication setup
118
+ self._redis_client = redis_client
119
+ self._jwt_private_key = jwt_private_key
120
+ self._jwt_token_expiry = jwt_token_expiry
121
+ self._jwt_issuer = "voice-provider"
122
+
123
+ if self._jwt_private_key and not self._redis_client:
124
+ logger.warning("JWT private key provided but no Redis client for caching. JWT auth will regenerate tokens on each request.")
98
125
 
99
126
  # Create an async HTTP client
100
127
  self._client = httpx.AsyncClient(timeout=httpx.Timeout(self._timeout), verify=False)
@@ -112,6 +139,53 @@ class VistaarLLMService(LLMService):
112
139
  f"Vistaar LLM initialized - Base URL: {self._base_url}, Session ID: {self._session_id}, Source Lang: {self._source_lang}, Target Lang: {self._target_lang}, Timeout: {self._timeout}s"
113
140
  )
114
141
 
142
+ async def _get_jwt_token(self) -> Optional[str]:
143
+ """Generate or retrieve a cached JWT token.
144
+
145
+ Returns:
146
+ JWT token string or None if JWT auth is not configured.
147
+ """
148
+ if not self._jwt_private_key:
149
+ return None
150
+
151
+ # Try to get from Redis cache if available
152
+ if self._redis_client and self._session_id:
153
+ redis_key = f"vistaar_jwt:{self._session_id}"
154
+ try:
155
+ cached_token = await self._redis_client.get(redis_key)
156
+ if cached_token:
157
+ logger.debug(f"Retrieved JWT token from Redis cache for session_id: {self._session_id}")
158
+ return cached_token.decode('utf-8') if isinstance(cached_token, bytes) else cached_token
159
+ except Exception as e:
160
+ logger.warning(f"Redis cache retrieval failed: {e}. Generating new token.")
161
+
162
+ # Generate new token
163
+ current_time = int(time.time())
164
+ payload = {
165
+ "sub": self._phone_number, # Subject identifier (phone number)
166
+ "iss": self._jwt_issuer, # Issuer
167
+ "iat": current_time, # Issued at timestamp
168
+ "exp": current_time + self._jwt_token_expiry # Expiration timestamp
169
+ }
170
+
171
+ token = jwt.encode(payload, self._jwt_private_key, algorithm="RS256")
172
+ logger.info(f"Generated new JWT token for {self._phone_number}, expires in {self._jwt_token_expiry}s")
173
+
174
+ # Cache in Redis if available
175
+ if self._redis_client and self._session_id:
176
+ redis_key = f"vistaar_jwt:{self._session_id}"
177
+ try:
178
+ await self._redis_client.setex(
179
+ redis_key,
180
+ self._jwt_token_expiry,
181
+ token
182
+ )
183
+ logger.debug(f"Cached JWT token in Redis for session_id: {self._session_id} with {self._jwt_token_expiry}s TTL")
184
+ except Exception as e:
185
+ logger.warning(f"Redis cache storage failed: {e}. Continuing without cache.")
186
+
187
+ return token
188
+
115
189
  async def _extract_messages_to_query(self, context: OpenAILLMContext) -> str:
116
190
  """Extract only the last user message from context.
117
191
 
@@ -259,9 +333,23 @@ class VistaarLLMService(LLMService):
259
333
  self._interim_in_progress = False
260
334
  self._interim_completion_event.clear() # Reset the event for new request
261
335
 
336
+ # Prepare headers with JWT authentication if configured
337
+ headers = {}
338
+ try:
339
+ jwt_token = await self._get_jwt_token()
340
+ if jwt_token:
341
+ headers["Authorization"] = f"Bearer {jwt_token}"
342
+ logger.debug(f"Added JWT authentication header for session_id: {self._session_id}")
343
+ except Exception as e:
344
+ logger.error(f"Failed to generate JWT token: {e}")
345
+ raise
346
+
347
+ await self.start_connection_metrics()
348
+
262
349
  try:
263
350
  # Use httpx to handle SSE streaming
264
- async with self._client.stream("GET", url) as response:
351
+ async with self._client.stream("GET", url, headers=headers) as response:
352
+ await self.stop_connection_metrics(success=True, connection_type="http")
265
353
  self._current_response = response # Store for potential cancellation
266
354
  response.raise_for_status()
267
355
 
@@ -279,14 +367,17 @@ class VistaarLLMService(LLMService):
279
367
  yield line
280
368
 
281
369
  except httpx.HTTPStatusError as e:
370
+ await self.stop_connection_metrics(success=False, error=f"HTTP {e.response.status_code}", connection_type="http")
282
371
  logger.error(
283
372
  f"Vistaar HTTP error - Status: {e.response.status_code}, URL: {url}, Response: {e.response.text if hasattr(e.response, 'text') else 'N/A'}"
284
373
  )
285
374
  raise
286
375
  except httpx.TimeoutException as e:
376
+ await self.stop_connection_metrics(success=False, error="Timeout", connection_type="http")
287
377
  logger.error(f"Vistaar timeout error - URL: {url}, Timeout: {self._timeout}s")
288
378
  raise
289
379
  except Exception as e:
380
+ await self.stop_connection_metrics(success=False, error=str(e), connection_type="http")
290
381
  logger.error(
291
382
  f"Vistaar unexpected error - Type: {type(e).__name__}, Message: {str(e)}, URL: {url}"
292
383
  )
@@ -391,7 +482,7 @@ class VistaarLLMService(LLMService):
391
482
  )
392
483
  await self.push_frame(frame, direction)
393
484
  return
394
- elif isinstance(frame, StartInterruptionFrame):
485
+ elif isinstance(frame, InterruptionFrame):
395
486
  await self._handle_interruption()
396
487
  await self.push_frame(frame, direction)
397
488
  return
@@ -467,4 +558,4 @@ class VistaarLLMService(LLMService):
467
558
 
468
559
  def can_generate_metrics(self) -> bool:
469
560
  """Check if this service can generate processing metrics."""
470
- return True
561
+ return True
@@ -569,3 +569,53 @@ class Language(StrEnum):
569
569
  # Zulu
570
570
  ZU = "zu"
571
571
  ZU_ZA = "zu-ZA"
572
+
573
+
574
+ def resolve_language(
575
+ language: Language, language_map: dict[Language, str], use_base_code: bool = True
576
+ ) -> str:
577
+ """Resolve a Language enum to a service-specific language code.
578
+
579
+ Checks the language map first, then falls back to extracting the appropriate
580
+ code format with a warning if not found in the verified list.
581
+
582
+ Args:
583
+ language: The Language enum value to convert.
584
+ language_map: Dictionary mapping Language enums to service language codes.
585
+ use_base_code: If True, extracts base code (e.g., 'en' from 'en-US').
586
+ If False, uses full language code as-is.
587
+
588
+ Returns:
589
+ The resolved language code for the service.
590
+
591
+ Examples::
592
+
593
+ # Service expecting base codes (e.g., Cartesia)
594
+ >>> LANGUAGE_MAP = {Language.EN: "en", Language.ES: "es"}
595
+ >>> resolve_language(Language.EN_US, LANGUAGE_MAP, use_base_code=True)
596
+ # Logs: "Language en-US not verified. Using base code 'en'."
597
+ "en"
598
+
599
+ # Service expecting full codes (e.g., AWS)
600
+ >>> LANGUAGE_MAP = {Language.EN_US: "en-US", Language.ES_ES: "es-ES"}
601
+ >>> resolve_language(Language.EN_GB, LANGUAGE_MAP, use_base_code=False)
602
+ # Logs: "Language en-GB not verified. Using 'en-GB'."
603
+ "en-GB"
604
+ """
605
+ # Check if language is in the verified map
606
+ result = language_map.get(language)
607
+
608
+ if result is not None:
609
+ return result
610
+
611
+ # Not in map - fall back with warning
612
+ lang_str = str(language.value)
613
+
614
+ if use_base_code:
615
+ # Extract base code (e.g., "en" from "en-US")
616
+ base_code = lang_str.split("-")[0].lower()
617
+ # logger.warning(f"Language {language.value} not verified. Using base code '{base_code}'.")
618
+ return base_code
619
+ else:
620
+ # logger.warning(f"Language {language.value} not verified. Using '{lang_str}'.")
621
+ return lang_str
@@ -297,6 +297,17 @@ class BaseInputTransport(FrameProcessor):
297
297
  elif isinstance(frame, EmulateUserStoppedSpeakingFrame):
298
298
  self.logger.debug("Emulating user stopped speaking")
299
299
  await self._handle_user_interruption(VADState.QUIET, emulated=True)
300
+ elif isinstance(frame, VADParamsUpdateFrame):
301
+ if self.vad_analyzer:
302
+ self.vad_analyzer.set_params(frame.params, self.logger)
303
+ speech_frame = SpeechControlParamsFrame(
304
+ vad_params=frame.params,
305
+ turn_params=self._params.turn_analyzer.params
306
+ if self._params.turn_analyzer
307
+ else None,
308
+ )
309
+ await self.push_frame(speech_frame)
310
+ await self.push_frame(frame, direction)
300
311
  # All other system frames
301
312
  elif isinstance(frame, SystemFrame):
302
313
  await self.push_frame(frame, direction)
@@ -309,16 +320,6 @@ class BaseInputTransport(FrameProcessor):
309
320
  elif isinstance(frame, StopFrame):
310
321
  await self.push_frame(frame, direction)
311
322
  await self.pause(frame)
312
- elif isinstance(frame, VADParamsUpdateFrame):
313
- if self.vad_analyzer:
314
- self.vad_analyzer.set_params(frame.params)
315
- speech_frame = SpeechControlParamsFrame(
316
- vad_params=frame.params,
317
- turn_params=self._params.turn_analyzer.params
318
- if self._params.turn_analyzer
319
- else None,
320
- )
321
- await self.push_frame(speech_frame)
322
323
  elif isinstance(frame, FilterUpdateSettingsFrame) and self._params.audio_in_filter:
323
324
  await self._params.audio_in_filter.process_frame(frame)
324
325
  # Other frames
@@ -444,7 +445,10 @@ class BaseInputTransport(FrameProcessor):
444
445
  await self._handle_user_interruption(VADState.QUIET)
445
446
 
446
447
  async def _run_turn_analyzer(
447
- self, frame: InputAudioRawFrame, vad_state: VADState, previous_vad_state: VADState
448
+ self,
449
+ frame: InputAudioRawFrame,
450
+ vad_state: VADState,
451
+ previous_vad_state: VADState,
448
452
  ):
449
453
  """Run turn analysis on audio frame and handle results."""
450
454
  is_speech = vad_state == VADState.SPEAKING or vad_state == VADState.STARTING
@@ -50,6 +50,11 @@ from pipecat.utils.time import nanoseconds_to_seconds
50
50
 
51
51
  # TODO: When we use GeminiMultimodalLiveLLMService, we need to change this to 0.35 but that creates issue for faster TTS.
52
52
  BOT_VAD_STOP_SECS = 0.30
53
+ # For the very first bot utterance (e.g., intro), we can safely
54
+ # detect end-of-speech sooner to improve responsiveness for the
55
+ # user’s first short reply. Keep conservative to avoid mid-utterance
56
+ # false stops when TTS streams quickly.
57
+ FIRST_BOT_VAD_STOP_SECS = 0.12
53
58
 
54
59
 
55
60
  class BaseOutputTransport(FrameProcessor):
@@ -406,6 +411,9 @@ class BaseOutputTransport(FrameProcessor):
406
411
  self._bot_speaking_frame_period = 0.2
407
412
  # Last time the bot actually spoke.
408
413
  self._bot_speech_last_time = 0
414
+ # Before the first stop event, we use a shorter silence
415
+ # threshold to make the first turn more responsive.
416
+ self._first_stop_pending = True
409
417
 
410
418
  self._audio_task: Optional[asyncio.Task] = None
411
419
  self._video_task: Optional[asyncio.Task] = None
@@ -631,6 +639,10 @@ class BaseOutputTransport(FrameProcessor):
631
639
 
632
640
  self._bot_speaking = False
633
641
 
642
+ # Mark that the first stop has been completed so subsequent
643
+ # stops use the regular (longer) VAD stop threshold.
644
+ self._first_stop_pending = False
645
+
634
646
  # Clean audio buffer (there could be tiny left overs if not multiple
635
647
  # to our output chunk size).
636
648
  self._audio_buffer = bytearray()
@@ -690,9 +702,14 @@ class BaseOutputTransport(FrameProcessor):
690
702
  async def without_mixer(vad_stop_secs: float) -> AsyncGenerator[Frame, None]:
691
703
  while True:
692
704
  try:
693
- frame = await asyncio.wait_for(
694
- self._audio_queue.get(), timeout=vad_stop_secs
705
+ # Use a shorter timeout only for the first bot stop to
706
+ # accelerate the initial turn handoff right after the intro.
707
+ timeout = (
708
+ FIRST_BOT_VAD_STOP_SECS
709
+ if getattr(self, "_first_stop_pending", True)
710
+ else BOT_VAD_STOP_SECS
695
711
  )
712
+ frame = await asyncio.wait_for(self._audio_queue.get(), timeout=timeout)
696
713
  yield frame
697
714
  self._audio_queue.task_done()
698
715
  except asyncio.TimeoutError:
@@ -713,7 +730,13 @@ class BaseOutputTransport(FrameProcessor):
713
730
  except asyncio.QueueEmpty:
714
731
  # Notify the bot stopped speaking upstream if necessary.
715
732
  diff_time = time.time() - last_frame_time
716
- if diff_time > vad_stop_secs:
733
+ # Use a shorter threshold for the first stop only.
734
+ current_stop_secs = (
735
+ FIRST_BOT_VAD_STOP_SECS
736
+ if getattr(self, "_first_stop_pending", True)
737
+ else BOT_VAD_STOP_SECS
738
+ )
739
+ if diff_time > current_stop_secs:
717
740
  await self._bot_stopped_speaking()
718
741
  # Generate an audio frame with only the mixer's part.
719
742
  frame = OutputAudioRawFrame(