dv-pipecat-ai 0.0.85.dev818__py3-none-any.whl → 0.0.85.dev858__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for reference only and reflects the changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of dv-pipecat-ai might be problematic. Click here for more details.
- {dv_pipecat_ai-0.0.85.dev818.dist-info → dv_pipecat_ai-0.0.85.dev858.dist-info}/METADATA +2 -1
- {dv_pipecat_ai-0.0.85.dev818.dist-info → dv_pipecat_ai-0.0.85.dev858.dist-info}/RECORD +32 -29
- pipecat/audio/turn/smart_turn/local_smart_turn_v3.py +5 -1
- pipecat/frames/frames.py +34 -0
- pipecat/metrics/connection_metrics.py +45 -0
- pipecat/processors/aggregators/llm_response.py +25 -4
- pipecat/processors/dtmf_aggregator.py +17 -21
- pipecat/processors/frame_processor.py +51 -8
- pipecat/processors/metrics/frame_processor_metrics.py +108 -0
- pipecat/processors/transcript_processor.py +22 -1
- pipecat/serializers/__init__.py +2 -0
- pipecat/serializers/asterisk.py +16 -2
- pipecat/serializers/convox.py +2 -2
- pipecat/serializers/custom.py +2 -2
- pipecat/serializers/vi.py +326 -0
- pipecat/services/cartesia/tts.py +75 -10
- pipecat/services/deepgram/stt.py +317 -17
- pipecat/services/elevenlabs/stt.py +487 -19
- pipecat/services/elevenlabs/tts.py +28 -4
- pipecat/services/google/llm.py +26 -11
- pipecat/services/openai/base_llm.py +79 -14
- pipecat/services/salesforce/llm.py +321 -86
- pipecat/services/sarvam/tts.py +0 -1
- pipecat/services/soniox/stt.py +45 -10
- pipecat/services/vistaar/llm.py +97 -6
- pipecat/transcriptions/language.py +50 -0
- pipecat/transports/base_input.py +15 -11
- pipecat/transports/base_output.py +29 -3
- pipecat/utils/redis.py +58 -0
- {dv_pipecat_ai-0.0.85.dev818.dist-info → dv_pipecat_ai-0.0.85.dev858.dist-info}/WHEEL +0 -0
- {dv_pipecat_ai-0.0.85.dev818.dist-info → dv_pipecat_ai-0.0.85.dev858.dist-info}/licenses/LICENSE +0 -0
- {dv_pipecat_ai-0.0.85.dev818.dist-info → dv_pipecat_ai-0.0.85.dev858.dist-info}/top_level.txt +0 -0
pipecat/services/soniox/stt.py
CHANGED
|
@@ -49,6 +49,33 @@ END_TOKEN = "<end>"
|
|
|
49
49
|
FINALIZED_TOKEN = "<fin>"
|
|
50
50
|
|
|
51
51
|
|
|
52
|
+
class SonioxContextGeneralItem(BaseModel):
|
|
53
|
+
"""Represents a key-value pair for structured general context information."""
|
|
54
|
+
|
|
55
|
+
key: str
|
|
56
|
+
value: str
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
class SonioxContextTranslationTerm(BaseModel):
|
|
60
|
+
"""Represents a custom translation mapping for ambiguous or domain-specific terms."""
|
|
61
|
+
|
|
62
|
+
source: str
|
|
63
|
+
target: str
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
class SonioxContextObject(BaseModel):
|
|
67
|
+
"""Context object for models with context_version 2, for Soniox stt-rt-v3-preview and higher.
|
|
68
|
+
|
|
69
|
+
Learn more about context in the documentation:
|
|
70
|
+
https://soniox.com/docs/stt/concepts/context
|
|
71
|
+
"""
|
|
72
|
+
|
|
73
|
+
general: Optional[List[SonioxContextGeneralItem]] = None
|
|
74
|
+
text: Optional[str] = None
|
|
75
|
+
terms: Optional[List[str]] = None
|
|
76
|
+
translation_terms: Optional[List[SonioxContextTranslationTerm]] = None
|
|
77
|
+
|
|
78
|
+
|
|
52
79
|
class SonioxInputParams(BaseModel):
|
|
53
80
|
"""Real-time transcription settings.
|
|
54
81
|
|
|
@@ -60,9 +87,9 @@ class SonioxInputParams(BaseModel):
|
|
|
60
87
|
audio_format: Audio format to use for transcription.
|
|
61
88
|
num_channels: Number of channels to use for transcription.
|
|
62
89
|
language_hints: List of language hints to use for transcription.
|
|
63
|
-
context: Customization for transcription.
|
|
64
|
-
|
|
65
|
-
|
|
90
|
+
context: Customization for transcription. String for models with context_version 1 and ContextObject for models with context_version 2.
|
|
91
|
+
enable_speaker_diarization: Whether to enable speaker diarization. Tokens are annotated with speaker IDs.
|
|
92
|
+
enable_language_identification: Whether to enable language identification. Tokens are annotated with language IDs.
|
|
66
93
|
client_reference_id: Client reference ID to use for transcription.
|
|
67
94
|
"""
|
|
68
95
|
|
|
@@ -72,10 +99,10 @@ class SonioxInputParams(BaseModel):
|
|
|
72
99
|
num_channels: Optional[int] = 1
|
|
73
100
|
|
|
74
101
|
language_hints: Optional[List[Language]] = None
|
|
75
|
-
context: Optional[str] = None
|
|
102
|
+
context: Optional[SonioxContextObject | str] = None
|
|
76
103
|
|
|
77
|
-
|
|
78
|
-
|
|
104
|
+
enable_speaker_diarization: Optional[bool] = False
|
|
105
|
+
enable_language_identification: Optional[bool] = False
|
|
79
106
|
|
|
80
107
|
client_reference_id: Optional[str] = None
|
|
81
108
|
|
|
@@ -173,6 +200,10 @@ class SonioxSTTService(STTService):
|
|
|
173
200
|
# Either one or the other is required.
|
|
174
201
|
enable_endpoint_detection = not self._vad_force_turn_endpoint
|
|
175
202
|
|
|
203
|
+
context = self._params.context
|
|
204
|
+
if isinstance(context, SonioxContextObject):
|
|
205
|
+
context = context.model_dump()
|
|
206
|
+
|
|
176
207
|
# Send the initial configuration message.
|
|
177
208
|
config = {
|
|
178
209
|
"api_key": self._api_key,
|
|
@@ -182,9 +213,9 @@ class SonioxSTTService(STTService):
|
|
|
182
213
|
"enable_endpoint_detection": enable_endpoint_detection,
|
|
183
214
|
"sample_rate": self.sample_rate,
|
|
184
215
|
"language_hints": _prepare_language_hints(self._params.language_hints),
|
|
185
|
-
"context":
|
|
186
|
-
"
|
|
187
|
-
"
|
|
216
|
+
"context": context,
|
|
217
|
+
"enable_speaker_diarization": self._params.enable_speaker_diarization,
|
|
218
|
+
"enable_language_identification": self._params.enable_language_identification,
|
|
188
219
|
"client_reference_id": self._params.client_reference_id,
|
|
189
220
|
}
|
|
190
221
|
|
|
@@ -210,6 +241,7 @@ class SonioxSTTService(STTService):
|
|
|
210
241
|
if self._receive_task != asyncio.current_task():
|
|
211
242
|
await self._receive_task
|
|
212
243
|
self._receive_task = None
|
|
244
|
+
self.logger.debug("Disconnected from Soniox STT")
|
|
213
245
|
|
|
214
246
|
async def stop(self, frame: EndFrame):
|
|
215
247
|
"""Stop the Soniox STT websocket connection.
|
|
@@ -351,7 +383,10 @@ class SonioxSTTService(STTService):
|
|
|
351
383
|
|
|
352
384
|
if self._final_transcription_buffer or non_final_transcription:
|
|
353
385
|
final_text = "".join(
|
|
354
|
-
map(
|
|
386
|
+
map(
|
|
387
|
+
lambda token: token["text"],
|
|
388
|
+
self._final_transcription_buffer,
|
|
389
|
+
)
|
|
355
390
|
)
|
|
356
391
|
non_final_text = "".join(
|
|
357
392
|
map(lambda token: token["text"], non_final_transcription)
|
pipecat/services/vistaar/llm.py
CHANGED
|
@@ -10,19 +10,27 @@ from typing import Any, AsyncGenerator, Dict, List, Optional
|
|
|
10
10
|
from urllib.parse import urlencode
|
|
11
11
|
|
|
12
12
|
import httpx
|
|
13
|
+
import jwt
|
|
13
14
|
from loguru import logger
|
|
14
15
|
from pydantic import BaseModel, Field
|
|
15
16
|
|
|
17
|
+
try:
|
|
18
|
+
import redis.asyncio as redis
|
|
19
|
+
REDIS_AVAILABLE = True
|
|
20
|
+
except ImportError:
|
|
21
|
+
REDIS_AVAILABLE = False
|
|
22
|
+
redis = None
|
|
23
|
+
|
|
16
24
|
from pipecat.frames.frames import (
|
|
17
|
-
EndFrame,
|
|
18
25
|
CancelFrame,
|
|
26
|
+
EndFrame,
|
|
19
27
|
Frame,
|
|
28
|
+
InterruptionFrame,
|
|
20
29
|
LLMFullResponseEndFrame,
|
|
21
30
|
LLMFullResponseStartFrame,
|
|
22
31
|
LLMMessagesFrame,
|
|
23
32
|
LLMTextFrame,
|
|
24
33
|
LLMUpdateSettingsFrame,
|
|
25
|
-
StartInterruptionFrame,
|
|
26
34
|
)
|
|
27
35
|
from pipecat.processors.aggregators.llm_response import (
|
|
28
36
|
LLMAssistantAggregatorParams,
|
|
@@ -55,7 +63,9 @@ class VistaarLLMService(LLMService):
|
|
|
55
63
|
Parameters:
|
|
56
64
|
source_lang: Source language code (e.g., 'mr' for Marathi, 'hi' for Hindi).
|
|
57
65
|
target_lang: Target language code for responses.
|
|
58
|
-
session_id: Session ID for maintaining conversation context.
|
|
66
|
+
session_id: Session ID for maintaining conversation context (also used for JWT caching).
|
|
67
|
+
pre_query_response_phrases: List of phrases to say while waiting for response.
|
|
68
|
+
phone_number: Phone number for JWT subject claim.
|
|
59
69
|
extra: Additional model-specific parameters
|
|
60
70
|
"""
|
|
61
71
|
|
|
@@ -63,6 +73,7 @@ class VistaarLLMService(LLMService):
|
|
|
63
73
|
target_lang: Optional[str] = Field(default="mr")
|
|
64
74
|
session_id: Optional[str] = Field(default=None)
|
|
65
75
|
pre_query_response_phrases: Optional[List[str]] = Field(default_factory=list)
|
|
76
|
+
phone_number: Optional[str] = Field(default="UNKNOWN")
|
|
66
77
|
extra: Optional[Dict[str, Any]] = Field(default_factory=dict)
|
|
67
78
|
|
|
68
79
|
def __init__(
|
|
@@ -72,6 +83,9 @@ class VistaarLLMService(LLMService):
|
|
|
72
83
|
params: Optional[InputParams] = None,
|
|
73
84
|
timeout: float = 30.0,
|
|
74
85
|
interim_timeout: float = 5.0,
|
|
86
|
+
redis_client: Optional[Any] = None, # redis.Redis type
|
|
87
|
+
jwt_private_key: Optional[str] = None,
|
|
88
|
+
jwt_token_expiry: int = 3600,
|
|
75
89
|
**kwargs,
|
|
76
90
|
):
|
|
77
91
|
"""Initialize Vistaar LLM service.
|
|
@@ -81,6 +95,9 @@ class VistaarLLMService(LLMService):
|
|
|
81
95
|
params: Input parameters for model configuration and behavior.
|
|
82
96
|
timeout: Request timeout in seconds. Defaults to 30.0 seconds.
|
|
83
97
|
interim_timeout: Time in seconds before sending interim message. Defaults to 5.0 seconds.
|
|
98
|
+
redis_client: Optional Redis client for JWT token caching.
|
|
99
|
+
jwt_private_key: Optional RSA private key in PEM format for JWT signing.
|
|
100
|
+
jwt_token_expiry: JWT token expiry time in seconds. Defaults to 3600 (1 hour).
|
|
84
101
|
**kwargs: Additional arguments passed to the parent LLMService.
|
|
85
102
|
"""
|
|
86
103
|
super().__init__(**kwargs)
|
|
@@ -95,6 +112,16 @@ class VistaarLLMService(LLMService):
|
|
|
95
112
|
self._extra = params.extra if isinstance(params.extra, dict) else {}
|
|
96
113
|
self._timeout = timeout
|
|
97
114
|
self._interim_timeout = interim_timeout
|
|
115
|
+
self._phone_number = params.phone_number
|
|
116
|
+
|
|
117
|
+
# JWT authentication setup
|
|
118
|
+
self._redis_client = redis_client
|
|
119
|
+
self._jwt_private_key = jwt_private_key
|
|
120
|
+
self._jwt_token_expiry = jwt_token_expiry
|
|
121
|
+
self._jwt_issuer = "voice-provider"
|
|
122
|
+
|
|
123
|
+
if self._jwt_private_key and not self._redis_client:
|
|
124
|
+
logger.warning("JWT private key provided but no Redis client for caching. JWT auth will regenerate tokens on each request.")
|
|
98
125
|
|
|
99
126
|
# Create an async HTTP client
|
|
100
127
|
self._client = httpx.AsyncClient(timeout=httpx.Timeout(self._timeout), verify=False)
|
|
@@ -112,6 +139,53 @@ class VistaarLLMService(LLMService):
|
|
|
112
139
|
f"Vistaar LLM initialized - Base URL: {self._base_url}, Session ID: {self._session_id}, Source Lang: {self._source_lang}, Target Lang: {self._target_lang}, Timeout: {self._timeout}s"
|
|
113
140
|
)
|
|
114
141
|
|
|
142
|
+
async def _get_jwt_token(self) -> Optional[str]:
|
|
143
|
+
"""Generate or retrieve a cached JWT token.
|
|
144
|
+
|
|
145
|
+
Returns:
|
|
146
|
+
JWT token string or None if JWT auth is not configured.
|
|
147
|
+
"""
|
|
148
|
+
if not self._jwt_private_key:
|
|
149
|
+
return None
|
|
150
|
+
|
|
151
|
+
# Try to get from Redis cache if available
|
|
152
|
+
if self._redis_client and self._session_id:
|
|
153
|
+
redis_key = f"vistaar_jwt:{self._session_id}"
|
|
154
|
+
try:
|
|
155
|
+
cached_token = await self._redis_client.get(redis_key)
|
|
156
|
+
if cached_token:
|
|
157
|
+
logger.debug(f"Retrieved JWT token from Redis cache for session_id: {self._session_id}")
|
|
158
|
+
return cached_token.decode('utf-8') if isinstance(cached_token, bytes) else cached_token
|
|
159
|
+
except Exception as e:
|
|
160
|
+
logger.warning(f"Redis cache retrieval failed: {e}. Generating new token.")
|
|
161
|
+
|
|
162
|
+
# Generate new token
|
|
163
|
+
current_time = int(time.time())
|
|
164
|
+
payload = {
|
|
165
|
+
"sub": self._phone_number, # Subject identifier (phone number)
|
|
166
|
+
"iss": self._jwt_issuer, # Issuer
|
|
167
|
+
"iat": current_time, # Issued at timestamp
|
|
168
|
+
"exp": current_time + self._jwt_token_expiry # Expiration timestamp
|
|
169
|
+
}
|
|
170
|
+
|
|
171
|
+
token = jwt.encode(payload, self._jwt_private_key, algorithm="RS256")
|
|
172
|
+
logger.info(f"Generated new JWT token for {self._phone_number}, expires in {self._jwt_token_expiry}s")
|
|
173
|
+
|
|
174
|
+
# Cache in Redis if available
|
|
175
|
+
if self._redis_client and self._session_id:
|
|
176
|
+
redis_key = f"vistaar_jwt:{self._session_id}"
|
|
177
|
+
try:
|
|
178
|
+
await self._redis_client.setex(
|
|
179
|
+
redis_key,
|
|
180
|
+
self._jwt_token_expiry,
|
|
181
|
+
token
|
|
182
|
+
)
|
|
183
|
+
logger.debug(f"Cached JWT token in Redis for session_id: {self._session_id} with {self._jwt_token_expiry}s TTL")
|
|
184
|
+
except Exception as e:
|
|
185
|
+
logger.warning(f"Redis cache storage failed: {e}. Continuing without cache.")
|
|
186
|
+
|
|
187
|
+
return token
|
|
188
|
+
|
|
115
189
|
async def _extract_messages_to_query(self, context: OpenAILLMContext) -> str:
|
|
116
190
|
"""Extract only the last user message from context.
|
|
117
191
|
|
|
@@ -259,9 +333,23 @@ class VistaarLLMService(LLMService):
|
|
|
259
333
|
self._interim_in_progress = False
|
|
260
334
|
self._interim_completion_event.clear() # Reset the event for new request
|
|
261
335
|
|
|
336
|
+
# Prepare headers with JWT authentication if configured
|
|
337
|
+
headers = {}
|
|
338
|
+
try:
|
|
339
|
+
jwt_token = await self._get_jwt_token()
|
|
340
|
+
if jwt_token:
|
|
341
|
+
headers["Authorization"] = f"Bearer {jwt_token}"
|
|
342
|
+
logger.debug(f"Added JWT authentication header for session_id: {self._session_id}")
|
|
343
|
+
except Exception as e:
|
|
344
|
+
logger.error(f"Failed to generate JWT token: {e}")
|
|
345
|
+
raise
|
|
346
|
+
|
|
347
|
+
await self.start_connection_metrics()
|
|
348
|
+
|
|
262
349
|
try:
|
|
263
350
|
# Use httpx to handle SSE streaming
|
|
264
|
-
async with self._client.stream("GET", url) as response:
|
|
351
|
+
async with self._client.stream("GET", url, headers=headers) as response:
|
|
352
|
+
await self.stop_connection_metrics(success=True, connection_type="http")
|
|
265
353
|
self._current_response = response # Store for potential cancellation
|
|
266
354
|
response.raise_for_status()
|
|
267
355
|
|
|
@@ -279,14 +367,17 @@ class VistaarLLMService(LLMService):
|
|
|
279
367
|
yield line
|
|
280
368
|
|
|
281
369
|
except httpx.HTTPStatusError as e:
|
|
370
|
+
await self.stop_connection_metrics(success=False, error=f"HTTP {e.response.status_code}", connection_type="http")
|
|
282
371
|
logger.error(
|
|
283
372
|
f"Vistaar HTTP error - Status: {e.response.status_code}, URL: {url}, Response: {e.response.text if hasattr(e.response, 'text') else 'N/A'}"
|
|
284
373
|
)
|
|
285
374
|
raise
|
|
286
375
|
except httpx.TimeoutException as e:
|
|
376
|
+
await self.stop_connection_metrics(success=False, error="Timeout", connection_type="http")
|
|
287
377
|
logger.error(f"Vistaar timeout error - URL: {url}, Timeout: {self._timeout}s")
|
|
288
378
|
raise
|
|
289
379
|
except Exception as e:
|
|
380
|
+
await self.stop_connection_metrics(success=False, error=str(e), connection_type="http")
|
|
290
381
|
logger.error(
|
|
291
382
|
f"Vistaar unexpected error - Type: {type(e).__name__}, Message: {str(e)}, URL: {url}"
|
|
292
383
|
)
|
|
@@ -391,7 +482,7 @@ class VistaarLLMService(LLMService):
|
|
|
391
482
|
)
|
|
392
483
|
await self.push_frame(frame, direction)
|
|
393
484
|
return
|
|
394
|
-
elif isinstance(frame,
|
|
485
|
+
elif isinstance(frame, InterruptionFrame):
|
|
395
486
|
await self._handle_interruption()
|
|
396
487
|
await self.push_frame(frame, direction)
|
|
397
488
|
return
|
|
@@ -467,4 +558,4 @@ class VistaarLLMService(LLMService):
|
|
|
467
558
|
|
|
468
559
|
def can_generate_metrics(self) -> bool:
|
|
469
560
|
"""Check if this service can generate processing metrics."""
|
|
470
|
-
return True
|
|
561
|
+
return True
|
|
@@ -569,3 +569,53 @@ class Language(StrEnum):
|
|
|
569
569
|
# Zulu
|
|
570
570
|
ZU = "zu"
|
|
571
571
|
ZU_ZA = "zu-ZA"
|
|
572
|
+
|
|
573
|
+
|
|
574
|
+
def resolve_language(
|
|
575
|
+
language: Language, language_map: dict[Language, str], use_base_code: bool = True
|
|
576
|
+
) -> str:
|
|
577
|
+
"""Resolve a Language enum to a service-specific language code.
|
|
578
|
+
|
|
579
|
+
Checks the language map first, then falls back to extracting the appropriate
|
|
580
|
+
code format with a warning if not found in the verified list.
|
|
581
|
+
|
|
582
|
+
Args:
|
|
583
|
+
language: The Language enum value to convert.
|
|
584
|
+
language_map: Dictionary mapping Language enums to service language codes.
|
|
585
|
+
use_base_code: If True, extracts base code (e.g., 'en' from 'en-US').
|
|
586
|
+
If False, uses full language code as-is.
|
|
587
|
+
|
|
588
|
+
Returns:
|
|
589
|
+
The resolved language code for the service.
|
|
590
|
+
|
|
591
|
+
Examples::
|
|
592
|
+
|
|
593
|
+
# Service expecting base codes (e.g., Cartesia)
|
|
594
|
+
>>> LANGUAGE_MAP = {Language.EN: "en", Language.ES: "es"}
|
|
595
|
+
>>> resolve_language(Language.EN_US, LANGUAGE_MAP, use_base_code=True)
|
|
596
|
+
# Logs: "Language en-US not verified. Using base code 'en'."
|
|
597
|
+
"en"
|
|
598
|
+
|
|
599
|
+
# Service expecting full codes (e.g., AWS)
|
|
600
|
+
>>> LANGUAGE_MAP = {Language.EN_US: "en-US", Language.ES_ES: "es-ES"}
|
|
601
|
+
>>> resolve_language(Language.EN_GB, LANGUAGE_MAP, use_base_code=False)
|
|
602
|
+
# Logs: "Language en-GB not verified. Using 'en-GB'."
|
|
603
|
+
"en-GB"
|
|
604
|
+
"""
|
|
605
|
+
# Check if language is in the verified map
|
|
606
|
+
result = language_map.get(language)
|
|
607
|
+
|
|
608
|
+
if result is not None:
|
|
609
|
+
return result
|
|
610
|
+
|
|
611
|
+
# Not in map - fall back with warning
|
|
612
|
+
lang_str = str(language.value)
|
|
613
|
+
|
|
614
|
+
if use_base_code:
|
|
615
|
+
# Extract base code (e.g., "en" from "en-US")
|
|
616
|
+
base_code = lang_str.split("-")[0].lower()
|
|
617
|
+
# logger.warning(f"Language {language.value} not verified. Using base code '{base_code}'.")
|
|
618
|
+
return base_code
|
|
619
|
+
else:
|
|
620
|
+
# logger.warning(f"Language {language.value} not verified. Using '{lang_str}'.")
|
|
621
|
+
return lang_str
|
pipecat/transports/base_input.py
CHANGED
|
@@ -297,6 +297,17 @@ class BaseInputTransport(FrameProcessor):
|
|
|
297
297
|
elif isinstance(frame, EmulateUserStoppedSpeakingFrame):
|
|
298
298
|
self.logger.debug("Emulating user stopped speaking")
|
|
299
299
|
await self._handle_user_interruption(VADState.QUIET, emulated=True)
|
|
300
|
+
elif isinstance(frame, VADParamsUpdateFrame):
|
|
301
|
+
if self.vad_analyzer:
|
|
302
|
+
self.vad_analyzer.set_params(frame.params, self.logger)
|
|
303
|
+
speech_frame = SpeechControlParamsFrame(
|
|
304
|
+
vad_params=frame.params,
|
|
305
|
+
turn_params=self._params.turn_analyzer.params
|
|
306
|
+
if self._params.turn_analyzer
|
|
307
|
+
else None,
|
|
308
|
+
)
|
|
309
|
+
await self.push_frame(speech_frame)
|
|
310
|
+
await self.push_frame(frame, direction)
|
|
300
311
|
# All other system frames
|
|
301
312
|
elif isinstance(frame, SystemFrame):
|
|
302
313
|
await self.push_frame(frame, direction)
|
|
@@ -309,16 +320,6 @@ class BaseInputTransport(FrameProcessor):
|
|
|
309
320
|
elif isinstance(frame, StopFrame):
|
|
310
321
|
await self.push_frame(frame, direction)
|
|
311
322
|
await self.pause(frame)
|
|
312
|
-
elif isinstance(frame, VADParamsUpdateFrame):
|
|
313
|
-
if self.vad_analyzer:
|
|
314
|
-
self.vad_analyzer.set_params(frame.params)
|
|
315
|
-
speech_frame = SpeechControlParamsFrame(
|
|
316
|
-
vad_params=frame.params,
|
|
317
|
-
turn_params=self._params.turn_analyzer.params
|
|
318
|
-
if self._params.turn_analyzer
|
|
319
|
-
else None,
|
|
320
|
-
)
|
|
321
|
-
await self.push_frame(speech_frame)
|
|
322
323
|
elif isinstance(frame, FilterUpdateSettingsFrame) and self._params.audio_in_filter:
|
|
323
324
|
await self._params.audio_in_filter.process_frame(frame)
|
|
324
325
|
# Other frames
|
|
@@ -444,7 +445,10 @@ class BaseInputTransport(FrameProcessor):
|
|
|
444
445
|
await self._handle_user_interruption(VADState.QUIET)
|
|
445
446
|
|
|
446
447
|
async def _run_turn_analyzer(
|
|
447
|
-
self,
|
|
448
|
+
self,
|
|
449
|
+
frame: InputAudioRawFrame,
|
|
450
|
+
vad_state: VADState,
|
|
451
|
+
previous_vad_state: VADState,
|
|
448
452
|
):
|
|
449
453
|
"""Run turn analysis on audio frame and handle results."""
|
|
450
454
|
is_speech = vad_state == VADState.SPEAKING or vad_state == VADState.STARTING
|
|
@@ -50,6 +50,11 @@ from pipecat.utils.time import nanoseconds_to_seconds
|
|
|
50
50
|
|
|
51
51
|
# TODO: When we use GeminiMultimodalLiveLLMService, we need to change this to 0.35 but that creates issue for faster TTS.
|
|
52
52
|
BOT_VAD_STOP_SECS = 0.30
|
|
53
|
+
# For the very first bot utterance (e.g., intro), we can safely
|
|
54
|
+
# detect end-of-speech sooner to improve responsiveness for the
|
|
55
|
+
# user’s first short reply. Keep conservative to avoid mid-utterance
|
|
56
|
+
# false stops when TTS streams quickly.
|
|
57
|
+
FIRST_BOT_VAD_STOP_SECS = 0.12
|
|
53
58
|
|
|
54
59
|
|
|
55
60
|
class BaseOutputTransport(FrameProcessor):
|
|
@@ -84,6 +89,7 @@ class BaseOutputTransport(FrameProcessor):
|
|
|
84
89
|
# us to send multiple streams at the same time if the transport allows
|
|
85
90
|
# it.
|
|
86
91
|
self._media_senders: Dict[Any, "BaseOutputTransport.MediaSender"] = {}
|
|
92
|
+
self._register_event_handler("on_output_terminated")
|
|
87
93
|
|
|
88
94
|
@property
|
|
89
95
|
def sample_rate(self) -> int:
|
|
@@ -301,10 +307,12 @@ class BaseOutputTransport(FrameProcessor):
|
|
|
301
307
|
await self.start(frame)
|
|
302
308
|
elif isinstance(frame, EndFrame):
|
|
303
309
|
await self.stop(frame)
|
|
310
|
+
await self._call_event_handler("on_output_terminated", frame)
|
|
304
311
|
# Keep pushing EndFrame down so all the pipeline stops nicely.
|
|
305
312
|
await self.push_frame(frame, direction)
|
|
306
313
|
elif isinstance(frame, CancelFrame):
|
|
307
314
|
await self.cancel(frame)
|
|
315
|
+
await self._call_event_handler("on_output_terminated", frame)
|
|
308
316
|
await self.push_frame(frame, direction)
|
|
309
317
|
elif isinstance(frame, InterruptionFrame):
|
|
310
318
|
await self.push_frame(frame, direction)
|
|
@@ -403,6 +411,9 @@ class BaseOutputTransport(FrameProcessor):
|
|
|
403
411
|
self._bot_speaking_frame_period = 0.2
|
|
404
412
|
# Last time the bot actually spoke.
|
|
405
413
|
self._bot_speech_last_time = 0
|
|
414
|
+
# Before the first stop event, we use a shorter silence
|
|
415
|
+
# threshold to make the first turn more responsive.
|
|
416
|
+
self._first_stop_pending = True
|
|
406
417
|
|
|
407
418
|
self._audio_task: Optional[asyncio.Task] = None
|
|
408
419
|
self._video_task: Optional[asyncio.Task] = None
|
|
@@ -628,6 +639,10 @@ class BaseOutputTransport(FrameProcessor):
|
|
|
628
639
|
|
|
629
640
|
self._bot_speaking = False
|
|
630
641
|
|
|
642
|
+
# Mark that the first stop has been completed so subsequent
|
|
643
|
+
# stops use the regular (longer) VAD stop threshold.
|
|
644
|
+
self._first_stop_pending = False
|
|
645
|
+
|
|
631
646
|
# Clean audio buffer (there could be tiny left overs if not multiple
|
|
632
647
|
# to our output chunk size).
|
|
633
648
|
self._audio_buffer = bytearray()
|
|
@@ -687,9 +702,14 @@ class BaseOutputTransport(FrameProcessor):
|
|
|
687
702
|
async def without_mixer(vad_stop_secs: float) -> AsyncGenerator[Frame, None]:
|
|
688
703
|
while True:
|
|
689
704
|
try:
|
|
690
|
-
|
|
691
|
-
|
|
705
|
+
# Use a shorter timeout only for the first bot stop to
|
|
706
|
+
# accelerate the initial turn handoff right after the intro.
|
|
707
|
+
timeout = (
|
|
708
|
+
FIRST_BOT_VAD_STOP_SECS
|
|
709
|
+
if getattr(self, "_first_stop_pending", True)
|
|
710
|
+
else BOT_VAD_STOP_SECS
|
|
692
711
|
)
|
|
712
|
+
frame = await asyncio.wait_for(self._audio_queue.get(), timeout=timeout)
|
|
693
713
|
yield frame
|
|
694
714
|
self._audio_queue.task_done()
|
|
695
715
|
except asyncio.TimeoutError:
|
|
@@ -710,7 +730,13 @@ class BaseOutputTransport(FrameProcessor):
|
|
|
710
730
|
except asyncio.QueueEmpty:
|
|
711
731
|
# Notify the bot stopped speaking upstream if necessary.
|
|
712
732
|
diff_time = time.time() - last_frame_time
|
|
713
|
-
|
|
733
|
+
# Use a shorter threshold for the first stop only.
|
|
734
|
+
current_stop_secs = (
|
|
735
|
+
FIRST_BOT_VAD_STOP_SECS
|
|
736
|
+
if getattr(self, "_first_stop_pending", True)
|
|
737
|
+
else BOT_VAD_STOP_SECS
|
|
738
|
+
)
|
|
739
|
+
if diff_time > current_stop_secs:
|
|
714
740
|
await self._bot_stopped_speaking()
|
|
715
741
|
# Generate an audio frame with only the mixer's part.
|
|
716
742
|
frame = OutputAudioRawFrame(
|
pipecat/utils/redis.py
ADDED
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
"""Async Redis helper utilities."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import Any, Optional, TYPE_CHECKING
|
|
6
|
+
|
|
7
|
+
from urllib.parse import urlparse
|
|
8
|
+
|
|
9
|
+
try:
|
|
10
|
+
import redis.asyncio as redis
|
|
11
|
+
except ImportError: # pragma: no cover - Redis is optional
|
|
12
|
+
redis = None
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
if TYPE_CHECKING: # pragma: no cover - typing aid
|
|
16
|
+
from redis.asyncio import Redis
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def create_async_redis_client(
|
|
20
|
+
url: Optional[str],
|
|
21
|
+
*,
|
|
22
|
+
decode_responses: bool = True,
|
|
23
|
+
encoding: str = "utf-8",
|
|
24
|
+
logger: Optional[Any] = None,
|
|
25
|
+
**kwargs,
|
|
26
|
+
) -> Optional["Redis"]:
|
|
27
|
+
"""Return a configured async Redis client or None if unavailable.
|
|
28
|
+
|
|
29
|
+
Args:
|
|
30
|
+
url: Redis connection URL.
|
|
31
|
+
decode_responses: Whether to decode responses to str.
|
|
32
|
+
encoding: Character encoding to use with decoded responses.
|
|
33
|
+
logger: Optional logger supporting .warning() for diagnostics.
|
|
34
|
+
**kwargs: Additional keyword arguments forwarded to Redis.from_url.
|
|
35
|
+
"""
|
|
36
|
+
if redis is None:
|
|
37
|
+
return None
|
|
38
|
+
|
|
39
|
+
if not url or url in {"redis_url", "REDIS_URL"}:
|
|
40
|
+
return None
|
|
41
|
+
|
|
42
|
+
parsed = urlparse(url)
|
|
43
|
+
connection_kwargs = {
|
|
44
|
+
"decode_responses": decode_responses,
|
|
45
|
+
"encoding": encoding,
|
|
46
|
+
}
|
|
47
|
+
connection_kwargs.update(kwargs)
|
|
48
|
+
|
|
49
|
+
if parsed.scheme == "rediss":
|
|
50
|
+
connection_kwargs.setdefault("ssl_cert_reqs", "none")
|
|
51
|
+
connection_kwargs.setdefault("ssl_check_hostname", False)
|
|
52
|
+
|
|
53
|
+
try:
|
|
54
|
+
return redis.Redis.from_url(url, **connection_kwargs)
|
|
55
|
+
except Exception as exc: # pragma: no cover - best effort logging
|
|
56
|
+
if logger is not None:
|
|
57
|
+
logger.warning(f"Failed to create Redis client: {exc}")
|
|
58
|
+
return None
|
|
File without changes
|
{dv_pipecat_ai-0.0.85.dev818.dist-info → dv_pipecat_ai-0.0.85.dev858.dist-info}/licenses/LICENSE
RENAMED
|
File without changes
|
{dv_pipecat_ai-0.0.85.dev818.dist-info → dv_pipecat_ai-0.0.85.dev858.dist-info}/top_level.txt
RENAMED
|
File without changes
|