dv-pipecat-ai 0.0.85.dev824__py3-none-any.whl → 0.0.85.dev858__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of dv-pipecat-ai might be problematic; see the package's registry page for more details.

Files changed (31):
  1. {dv_pipecat_ai-0.0.85.dev824.dist-info → dv_pipecat_ai-0.0.85.dev858.dist-info}/METADATA +2 -1
  2. {dv_pipecat_ai-0.0.85.dev824.dist-info → dv_pipecat_ai-0.0.85.dev858.dist-info}/RECORD +31 -29
  3. pipecat/audio/turn/smart_turn/local_smart_turn_v3.py +5 -1
  4. pipecat/frames/frames.py +22 -0
  5. pipecat/metrics/connection_metrics.py +45 -0
  6. pipecat/processors/aggregators/llm_response.py +15 -9
  7. pipecat/processors/dtmf_aggregator.py +17 -21
  8. pipecat/processors/frame_processor.py +44 -1
  9. pipecat/processors/metrics/frame_processor_metrics.py +108 -0
  10. pipecat/processors/transcript_processor.py +2 -1
  11. pipecat/serializers/__init__.py +2 -0
  12. pipecat/serializers/asterisk.py +16 -2
  13. pipecat/serializers/convox.py +2 -2
  14. pipecat/serializers/custom.py +2 -2
  15. pipecat/serializers/vi.py +326 -0
  16. pipecat/services/cartesia/tts.py +75 -10
  17. pipecat/services/deepgram/stt.py +317 -17
  18. pipecat/services/elevenlabs/stt.py +487 -19
  19. pipecat/services/elevenlabs/tts.py +28 -4
  20. pipecat/services/google/llm.py +26 -11
  21. pipecat/services/openai/base_llm.py +79 -14
  22. pipecat/services/salesforce/llm.py +64 -59
  23. pipecat/services/sarvam/tts.py +0 -1
  24. pipecat/services/soniox/stt.py +45 -10
  25. pipecat/services/vistaar/llm.py +97 -6
  26. pipecat/transcriptions/language.py +50 -0
  27. pipecat/transports/base_input.py +15 -11
  28. pipecat/transports/base_output.py +26 -3
  29. {dv_pipecat_ai-0.0.85.dev824.dist-info → dv_pipecat_ai-0.0.85.dev858.dist-info}/WHEEL +0 -0
  30. {dv_pipecat_ai-0.0.85.dev824.dist-info → dv_pipecat_ai-0.0.85.dev858.dist-info}/licenses/LICENSE +0 -0
  31. {dv_pipecat_ai-0.0.85.dev824.dist-info → dv_pipecat_ai-0.0.85.dev858.dist-info}/top_level.txt +0 -0
@@ -10,19 +10,27 @@ from typing import Any, AsyncGenerator, Dict, List, Optional
10
10
  from urllib.parse import urlencode
11
11
 
12
12
  import httpx
13
+ import jwt
13
14
  from loguru import logger
14
15
  from pydantic import BaseModel, Field
15
16
 
17
+ try:
18
+ import redis.asyncio as redis
19
+ REDIS_AVAILABLE = True
20
+ except ImportError:
21
+ REDIS_AVAILABLE = False
22
+ redis = None
23
+
16
24
  from pipecat.frames.frames import (
17
- EndFrame,
18
25
  CancelFrame,
26
+ EndFrame,
19
27
  Frame,
28
+ InterruptionFrame,
20
29
  LLMFullResponseEndFrame,
21
30
  LLMFullResponseStartFrame,
22
31
  LLMMessagesFrame,
23
32
  LLMTextFrame,
24
33
  LLMUpdateSettingsFrame,
25
- StartInterruptionFrame,
26
34
  )
27
35
  from pipecat.processors.aggregators.llm_response import (
28
36
  LLMAssistantAggregatorParams,
@@ -55,7 +63,9 @@ class VistaarLLMService(LLMService):
55
63
  Parameters:
56
64
  source_lang: Source language code (e.g., 'mr' for Marathi, 'hi' for Hindi).
57
65
  target_lang: Target language code for responses.
58
- session_id: Session ID for maintaining conversation context.
66
+ session_id: Session ID for maintaining conversation context (also used for JWT caching).
67
+ pre_query_response_phrases: List of phrases to say while waiting for response.
68
+ phone_number: Phone number for JWT subject claim.
59
69
  extra: Additional model-specific parameters
60
70
  """
61
71
 
@@ -63,6 +73,7 @@ class VistaarLLMService(LLMService):
63
73
  target_lang: Optional[str] = Field(default="mr")
64
74
  session_id: Optional[str] = Field(default=None)
65
75
  pre_query_response_phrases: Optional[List[str]] = Field(default_factory=list)
76
+ phone_number: Optional[str] = Field(default="UNKNOWN")
66
77
  extra: Optional[Dict[str, Any]] = Field(default_factory=dict)
67
78
 
68
79
  def __init__(
@@ -72,6 +83,9 @@ class VistaarLLMService(LLMService):
72
83
  params: Optional[InputParams] = None,
73
84
  timeout: float = 30.0,
74
85
  interim_timeout: float = 5.0,
86
+ redis_client: Optional[Any] = None, # redis.Redis type
87
+ jwt_private_key: Optional[str] = None,
88
+ jwt_token_expiry: int = 3600,
75
89
  **kwargs,
76
90
  ):
77
91
  """Initialize Vistaar LLM service.
@@ -81,6 +95,9 @@ class VistaarLLMService(LLMService):
81
95
  params: Input parameters for model configuration and behavior.
82
96
  timeout: Request timeout in seconds. Defaults to 30.0 seconds.
83
97
  interim_timeout: Time in seconds before sending interim message. Defaults to 5.0 seconds.
98
+ redis_client: Optional Redis client for JWT token caching.
99
+ jwt_private_key: Optional RSA private key in PEM format for JWT signing.
100
+ jwt_token_expiry: JWT token expiry time in seconds. Defaults to 3600 (1 hour).
84
101
  **kwargs: Additional arguments passed to the parent LLMService.
85
102
  """
86
103
  super().__init__(**kwargs)
@@ -95,6 +112,16 @@ class VistaarLLMService(LLMService):
95
112
  self._extra = params.extra if isinstance(params.extra, dict) else {}
96
113
  self._timeout = timeout
97
114
  self._interim_timeout = interim_timeout
115
+ self._phone_number = params.phone_number
116
+
117
+ # JWT authentication setup
118
+ self._redis_client = redis_client
119
+ self._jwt_private_key = jwt_private_key
120
+ self._jwt_token_expiry = jwt_token_expiry
121
+ self._jwt_issuer = "voice-provider"
122
+
123
+ if self._jwt_private_key and not self._redis_client:
124
+ logger.warning("JWT private key provided but no Redis client for caching. JWT auth will regenerate tokens on each request.")
98
125
 
99
126
  # Create an async HTTP client
100
127
  self._client = httpx.AsyncClient(timeout=httpx.Timeout(self._timeout), verify=False)
@@ -112,6 +139,53 @@ class VistaarLLMService(LLMService):
112
139
  f"Vistaar LLM initialized - Base URL: {self._base_url}, Session ID: {self._session_id}, Source Lang: {self._source_lang}, Target Lang: {self._target_lang}, Timeout: {self._timeout}s"
113
140
  )
114
141
 
142
+ async def _get_jwt_token(self) -> Optional[str]:
143
+ """Generate or retrieve a cached JWT token.
144
+
145
+ Returns:
146
+ JWT token string or None if JWT auth is not configured.
147
+ """
148
+ if not self._jwt_private_key:
149
+ return None
150
+
151
+ # Try to get from Redis cache if available
152
+ if self._redis_client and self._session_id:
153
+ redis_key = f"vistaar_jwt:{self._session_id}"
154
+ try:
155
+ cached_token = await self._redis_client.get(redis_key)
156
+ if cached_token:
157
+ logger.debug(f"Retrieved JWT token from Redis cache for session_id: {self._session_id}")
158
+ return cached_token.decode('utf-8') if isinstance(cached_token, bytes) else cached_token
159
+ except Exception as e:
160
+ logger.warning(f"Redis cache retrieval failed: {e}. Generating new token.")
161
+
162
+ # Generate new token
163
+ current_time = int(time.time())
164
+ payload = {
165
+ "sub": self._phone_number, # Subject identifier (phone number)
166
+ "iss": self._jwt_issuer, # Issuer
167
+ "iat": current_time, # Issued at timestamp
168
+ "exp": current_time + self._jwt_token_expiry # Expiration timestamp
169
+ }
170
+
171
+ token = jwt.encode(payload, self._jwt_private_key, algorithm="RS256")
172
+ logger.info(f"Generated new JWT token for {self._phone_number}, expires in {self._jwt_token_expiry}s")
173
+
174
+ # Cache in Redis if available
175
+ if self._redis_client and self._session_id:
176
+ redis_key = f"vistaar_jwt:{self._session_id}"
177
+ try:
178
+ await self._redis_client.setex(
179
+ redis_key,
180
+ self._jwt_token_expiry,
181
+ token
182
+ )
183
+ logger.debug(f"Cached JWT token in Redis for session_id: {self._session_id} with {self._jwt_token_expiry}s TTL")
184
+ except Exception as e:
185
+ logger.warning(f"Redis cache storage failed: {e}. Continuing without cache.")
186
+
187
+ return token
188
+
115
189
  async def _extract_messages_to_query(self, context: OpenAILLMContext) -> str:
116
190
  """Extract only the last user message from context.
117
191
 
@@ -259,9 +333,23 @@ class VistaarLLMService(LLMService):
259
333
  self._interim_in_progress = False
260
334
  self._interim_completion_event.clear() # Reset the event for new request
261
335
 
336
+ # Prepare headers with JWT authentication if configured
337
+ headers = {}
338
+ try:
339
+ jwt_token = await self._get_jwt_token()
340
+ if jwt_token:
341
+ headers["Authorization"] = f"Bearer {jwt_token}"
342
+ logger.debug(f"Added JWT authentication header for session_id: {self._session_id}")
343
+ except Exception as e:
344
+ logger.error(f"Failed to generate JWT token: {e}")
345
+ raise
346
+
347
+ await self.start_connection_metrics()
348
+
262
349
  try:
263
350
  # Use httpx to handle SSE streaming
264
- async with self._client.stream("GET", url) as response:
351
+ async with self._client.stream("GET", url, headers=headers) as response:
352
+ await self.stop_connection_metrics(success=True, connection_type="http")
265
353
  self._current_response = response # Store for potential cancellation
266
354
  response.raise_for_status()
267
355
 
@@ -279,14 +367,17 @@ class VistaarLLMService(LLMService):
279
367
  yield line
280
368
 
281
369
  except httpx.HTTPStatusError as e:
370
+ await self.stop_connection_metrics(success=False, error=f"HTTP {e.response.status_code}", connection_type="http")
282
371
  logger.error(
283
372
  f"Vistaar HTTP error - Status: {e.response.status_code}, URL: {url}, Response: {e.response.text if hasattr(e.response, 'text') else 'N/A'}"
284
373
  )
285
374
  raise
286
375
  except httpx.TimeoutException as e:
376
+ await self.stop_connection_metrics(success=False, error="Timeout", connection_type="http")
287
377
  logger.error(f"Vistaar timeout error - URL: {url}, Timeout: {self._timeout}s")
288
378
  raise
289
379
  except Exception as e:
380
+ await self.stop_connection_metrics(success=False, error=str(e), connection_type="http")
290
381
  logger.error(
291
382
  f"Vistaar unexpected error - Type: {type(e).__name__}, Message: {str(e)}, URL: {url}"
292
383
  )
@@ -391,7 +482,7 @@ class VistaarLLMService(LLMService):
391
482
  )
392
483
  await self.push_frame(frame, direction)
393
484
  return
394
- elif isinstance(frame, StartInterruptionFrame):
485
+ elif isinstance(frame, InterruptionFrame):
395
486
  await self._handle_interruption()
396
487
  await self.push_frame(frame, direction)
397
488
  return
@@ -467,4 +558,4 @@ class VistaarLLMService(LLMService):
467
558
 
468
559
  def can_generate_metrics(self) -> bool:
469
560
  """Check if this service can generate processing metrics."""
470
- return True
561
+ return True
@@ -569,3 +569,53 @@ class Language(StrEnum):
569
569
  # Zulu
570
570
  ZU = "zu"
571
571
  ZU_ZA = "zu-ZA"
572
+
573
+
574
+ def resolve_language(
575
+ language: Language, language_map: dict[Language, str], use_base_code: bool = True
576
+ ) -> str:
577
+ """Resolve a Language enum to a service-specific language code.
578
+
579
+ Checks the language map first, then falls back to extracting the appropriate
580
+ code format with a warning if not found in the verified list.
581
+
582
+ Args:
583
+ language: The Language enum value to convert.
584
+ language_map: Dictionary mapping Language enums to service language codes.
585
+ use_base_code: If True, extracts base code (e.g., 'en' from 'en-US').
586
+ If False, uses full language code as-is.
587
+
588
+ Returns:
589
+ The resolved language code for the service.
590
+
591
+ Examples::
592
+
593
+ # Service expecting base codes (e.g., Cartesia)
594
+ >>> LANGUAGE_MAP = {Language.EN: "en", Language.ES: "es"}
595
+ >>> resolve_language(Language.EN_US, LANGUAGE_MAP, use_base_code=True)
596
+ # Logs: "Language en-US not verified. Using base code 'en'."
597
+ "en"
598
+
599
+ # Service expecting full codes (e.g., AWS)
600
+ >>> LANGUAGE_MAP = {Language.EN_US: "en-US", Language.ES_ES: "es-ES"}
601
+ >>> resolve_language(Language.EN_GB, LANGUAGE_MAP, use_base_code=False)
602
+ # Logs: "Language en-GB not verified. Using 'en-GB'."
603
+ "en-GB"
604
+ """
605
+ # Check if language is in the verified map
606
+ result = language_map.get(language)
607
+
608
+ if result is not None:
609
+ return result
610
+
611
+ # Not in map - fall back with warning
612
+ lang_str = str(language.value)
613
+
614
+ if use_base_code:
615
+ # Extract base code (e.g., "en" from "en-US")
616
+ base_code = lang_str.split("-")[0].lower()
617
+ # logger.warning(f"Language {language.value} not verified. Using base code '{base_code}'.")
618
+ return base_code
619
+ else:
620
+ # logger.warning(f"Language {language.value} not verified. Using '{lang_str}'.")
621
+ return lang_str
@@ -297,6 +297,17 @@ class BaseInputTransport(FrameProcessor):
297
297
  elif isinstance(frame, EmulateUserStoppedSpeakingFrame):
298
298
  self.logger.debug("Emulating user stopped speaking")
299
299
  await self._handle_user_interruption(VADState.QUIET, emulated=True)
300
+ elif isinstance(frame, VADParamsUpdateFrame):
301
+ if self.vad_analyzer:
302
+ self.vad_analyzer.set_params(frame.params, self.logger)
303
+ speech_frame = SpeechControlParamsFrame(
304
+ vad_params=frame.params,
305
+ turn_params=self._params.turn_analyzer.params
306
+ if self._params.turn_analyzer
307
+ else None,
308
+ )
309
+ await self.push_frame(speech_frame)
310
+ await self.push_frame(frame, direction)
300
311
  # All other system frames
301
312
  elif isinstance(frame, SystemFrame):
302
313
  await self.push_frame(frame, direction)
@@ -309,16 +320,6 @@ class BaseInputTransport(FrameProcessor):
309
320
  elif isinstance(frame, StopFrame):
310
321
  await self.push_frame(frame, direction)
311
322
  await self.pause(frame)
312
- elif isinstance(frame, VADParamsUpdateFrame):
313
- if self.vad_analyzer:
314
- self.vad_analyzer.set_params(frame.params)
315
- speech_frame = SpeechControlParamsFrame(
316
- vad_params=frame.params,
317
- turn_params=self._params.turn_analyzer.params
318
- if self._params.turn_analyzer
319
- else None,
320
- )
321
- await self.push_frame(speech_frame)
322
323
  elif isinstance(frame, FilterUpdateSettingsFrame) and self._params.audio_in_filter:
323
324
  await self._params.audio_in_filter.process_frame(frame)
324
325
  # Other frames
@@ -444,7 +445,10 @@ class BaseInputTransport(FrameProcessor):
444
445
  await self._handle_user_interruption(VADState.QUIET)
445
446
 
446
447
  async def _run_turn_analyzer(
447
- self, frame: InputAudioRawFrame, vad_state: VADState, previous_vad_state: VADState
448
+ self,
449
+ frame: InputAudioRawFrame,
450
+ vad_state: VADState,
451
+ previous_vad_state: VADState,
448
452
  ):
449
453
  """Run turn analysis on audio frame and handle results."""
450
454
  is_speech = vad_state == VADState.SPEAKING or vad_state == VADState.STARTING
@@ -50,6 +50,11 @@ from pipecat.utils.time import nanoseconds_to_seconds
50
50
 
51
51
  # TODO: When we use GeminiMultimodalLiveLLMService, we need to change this to 0.35 but that creates issue for faster TTS.
52
52
  BOT_VAD_STOP_SECS = 0.30
53
+ # For the very first bot utterance (e.g., intro), we can safely
54
+ # detect end-of-speech sooner to improve responsiveness for the
55
+ # user’s first short reply. Keep conservative to avoid mid-utterance
56
+ # false stops when TTS streams quickly.
57
+ FIRST_BOT_VAD_STOP_SECS = 0.12
53
58
 
54
59
 
55
60
  class BaseOutputTransport(FrameProcessor):
@@ -406,6 +411,9 @@ class BaseOutputTransport(FrameProcessor):
406
411
  self._bot_speaking_frame_period = 0.2
407
412
  # Last time the bot actually spoke.
408
413
  self._bot_speech_last_time = 0
414
+ # Before the first stop event, we use a shorter silence
415
+ # threshold to make the first turn more responsive.
416
+ self._first_stop_pending = True
409
417
 
410
418
  self._audio_task: Optional[asyncio.Task] = None
411
419
  self._video_task: Optional[asyncio.Task] = None
@@ -631,6 +639,10 @@ class BaseOutputTransport(FrameProcessor):
631
639
 
632
640
  self._bot_speaking = False
633
641
 
642
+ # Mark that the first stop has been completed so subsequent
643
+ # stops use the regular (longer) VAD stop threshold.
644
+ self._first_stop_pending = False
645
+
634
646
  # Clean audio buffer (there could be tiny left overs if not multiple
635
647
  # to our output chunk size).
636
648
  self._audio_buffer = bytearray()
@@ -690,9 +702,14 @@ class BaseOutputTransport(FrameProcessor):
690
702
  async def without_mixer(vad_stop_secs: float) -> AsyncGenerator[Frame, None]:
691
703
  while True:
692
704
  try:
693
- frame = await asyncio.wait_for(
694
- self._audio_queue.get(), timeout=vad_stop_secs
705
+ # Use a shorter timeout only for the first bot stop to
706
+ # accelerate the initial turn handoff right after the intro.
707
+ timeout = (
708
+ FIRST_BOT_VAD_STOP_SECS
709
+ if getattr(self, "_first_stop_pending", True)
710
+ else BOT_VAD_STOP_SECS
695
711
  )
712
+ frame = await asyncio.wait_for(self._audio_queue.get(), timeout=timeout)
696
713
  yield frame
697
714
  self._audio_queue.task_done()
698
715
  except asyncio.TimeoutError:
@@ -713,7 +730,13 @@ class BaseOutputTransport(FrameProcessor):
713
730
  except asyncio.QueueEmpty:
714
731
  # Notify the bot stopped speaking upstream if necessary.
715
732
  diff_time = time.time() - last_frame_time
716
- if diff_time > vad_stop_secs:
733
+ # Use a shorter threshold for the first stop only.
734
+ current_stop_secs = (
735
+ FIRST_BOT_VAD_STOP_SECS
736
+ if getattr(self, "_first_stop_pending", True)
737
+ else BOT_VAD_STOP_SECS
738
+ )
739
+ if diff_time > current_stop_secs:
717
740
  await self._bot_stopped_speaking()
718
741
  # Generate an audio frame with only the mixer's part.
719
742
  frame = OutputAudioRawFrame(