dv-pipecat-ai 0.0.85.dev818__py3-none-any.whl → 0.0.85.dev858__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of dv-pipecat-ai might be problematic; see the release details below.

Files changed (32)
  1. {dv_pipecat_ai-0.0.85.dev818.dist-info → dv_pipecat_ai-0.0.85.dev858.dist-info}/METADATA +2 -1
  2. {dv_pipecat_ai-0.0.85.dev818.dist-info → dv_pipecat_ai-0.0.85.dev858.dist-info}/RECORD +32 -29
  3. pipecat/audio/turn/smart_turn/local_smart_turn_v3.py +5 -1
  4. pipecat/frames/frames.py +34 -0
  5. pipecat/metrics/connection_metrics.py +45 -0
  6. pipecat/processors/aggregators/llm_response.py +25 -4
  7. pipecat/processors/dtmf_aggregator.py +17 -21
  8. pipecat/processors/frame_processor.py +51 -8
  9. pipecat/processors/metrics/frame_processor_metrics.py +108 -0
  10. pipecat/processors/transcript_processor.py +22 -1
  11. pipecat/serializers/__init__.py +2 -0
  12. pipecat/serializers/asterisk.py +16 -2
  13. pipecat/serializers/convox.py +2 -2
  14. pipecat/serializers/custom.py +2 -2
  15. pipecat/serializers/vi.py +326 -0
  16. pipecat/services/cartesia/tts.py +75 -10
  17. pipecat/services/deepgram/stt.py +317 -17
  18. pipecat/services/elevenlabs/stt.py +487 -19
  19. pipecat/services/elevenlabs/tts.py +28 -4
  20. pipecat/services/google/llm.py +26 -11
  21. pipecat/services/openai/base_llm.py +79 -14
  22. pipecat/services/salesforce/llm.py +321 -86
  23. pipecat/services/sarvam/tts.py +0 -1
  24. pipecat/services/soniox/stt.py +45 -10
  25. pipecat/services/vistaar/llm.py +97 -6
  26. pipecat/transcriptions/language.py +50 -0
  27. pipecat/transports/base_input.py +15 -11
  28. pipecat/transports/base_output.py +29 -3
  29. pipecat/utils/redis.py +58 -0
  30. {dv_pipecat_ai-0.0.85.dev818.dist-info → dv_pipecat_ai-0.0.85.dev858.dist-info}/WHEEL +0 -0
  31. {dv_pipecat_ai-0.0.85.dev818.dist-info → dv_pipecat_ai-0.0.85.dev858.dist-info}/licenses/LICENSE +0 -0
  32. {dv_pipecat_ai-0.0.85.dev818.dist-info → dv_pipecat_ai-0.0.85.dev858.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,326 @@
1
+ #
2
+ # Copyright (c) 2024–2025, Daily
3
+ #
4
+ # SPDX-License-Identifier: BSD 2-Clause License
5
+ #
6
+
7
+ """Vodafone Idea (VI) WebSocket frame serializer for audio streaming and call management."""
8
+
9
+ import base64
10
+ import json
11
+ from datetime import datetime, timezone
12
+ from typing import Optional
13
+
14
+ from loguru import logger
15
+ from pydantic import BaseModel
16
+
17
+ from pipecat.audio.utils import create_default_resampler
18
+ from pipecat.frames.frames import (
19
+ AudioRawFrame,
20
+ CancelFrame,
21
+ EndFrame,
22
+ Frame,
23
+ InputAudioRawFrame,
24
+ InputDTMFFrame,
25
+ KeypadEntry,
26
+ StartFrame,
27
+ StartInterruptionFrame,
28
+ TransportMessageFrame,
29
+ TransportMessageUrgentFrame,
30
+ )
31
+ from pipecat.serializers.base_serializer import FrameSerializer, FrameSerializerType
32
+
33
+
34
class VIFrameSerializer(FrameSerializer):
    """Serializer for Vodafone Idea (VI) WebSocket protocol.

    This serializer handles converting between Pipecat frames and VI's WebSocket
    protocol for bidirectional audio streaming. It supports audio conversion, DTMF events,
    and real-time communication with VI telephony systems.

    VI WebSocket protocol requirements:

    - PCM audio format at 8kHz sample rate
    - 16-bit Linear PCM encoding
    - Base64 encoded audio payloads
    - JSON message format for control and media events
    - Bitrate: 128 Kbps

    Events (VI → Endpoint):

    - connected: WebSocket connection established
    - start: Stream session started with call/stream IDs
    - media: Audio data in Base64-encoded PCM
    - dtmf: Keypad digit pressed
    - stop: Stream ended
    - mark: Audio playback checkpoint confirmation

    Events (Endpoint → VI):

    - media: Send audio back to VI
    - mark: Request acknowledgment for audio playback
    - clear: Clear queued audio (interruption)
    - exit: Terminate session gracefully
    """

    class InputParams(BaseModel):
        """Configuration parameters for VIFrameSerializer.

        Attributes:
            vi_sample_rate: Sample rate used by VI, defaults to 8000 Hz (telephony standard).
            sample_rate: Optional override for pipeline input sample rate.
            auto_hang_up: Whether to automatically terminate call on EndFrame.
        """

        vi_sample_rate: int = 8000
        sample_rate: Optional[int] = None
        auto_hang_up: bool = False

    def __init__(
        self,
        stream_id: str,
        call_id: Optional[str] = None,
        params: Optional[InputParams] = None,
    ):
        """Initialize the VIFrameSerializer.

        Args:
            stream_id: The VI stream identifier.
            call_id: The associated VI call identifier.
            params: Configuration parameters.
        """
        self._stream_id = stream_id
        self._call_id = call_id
        self._params = params or VIFrameSerializer.InputParams()

        self._vi_sample_rate = self._params.vi_sample_rate
        # Pipeline input rate; resolved from the StartFrame in setup().
        self._sample_rate = 0
        self._call_ended = False

        self._resampler = create_default_resampler()

    @property
    def type(self) -> FrameSerializerType:
        """Gets the serializer type.

        Returns:
            The serializer type as TEXT for JSON WebSocket messages.
        """
        return FrameSerializerType.TEXT

    async def setup(self, frame: StartFrame):
        """Sets up the serializer with pipeline configuration.

        Args:
            frame: The StartFrame containing pipeline configuration.
        """
        self._sample_rate = self._params.sample_rate or frame.audio_in_sample_rate

    async def serialize(self, frame: Frame) -> str | bytes | None:
        """Serializes a Pipecat frame to VI WebSocket format.

        Handles conversion of various frame types to VI WebSocket messages.
        For EndFrames, initiates call termination if auto_hang_up is enabled.

        Args:
            frame: The Pipecat frame to serialize.

        Returns:
            Serialized data as JSON string, or None if the frame isn't handled.
        """
        if (
            self._params.auto_hang_up
            and not self._call_ended
            and isinstance(frame, (EndFrame, CancelFrame))
        ):
            self._call_ended = True
            # Return the exit event to terminate the VI session
            return await self._send_exit_event()

        elif isinstance(frame, StartInterruptionFrame):
            # Clear/interrupt command for VI - clears queued audio
            message = {
                "event": "clear",
                "stream_id": self._stream_id,
                "call_id": self._call_id,
            }
            logger.debug(f"VI: Sending clear event for stream_id: {self._stream_id}")
            return json.dumps(message)

        elif isinstance(frame, AudioRawFrame):
            if self._call_ended:
                logger.debug("VI SERIALIZE: Skipping audio - call has ended")
                return None

            # Convert PCM audio to VI format
            data = frame.audio

            # Resample to VI sample rate (8kHz)
            serialized_data = await self._resampler.resample(
                data, frame.sample_rate, self._vi_sample_rate
            )

            # Encode as base64 for transmission
            payload = base64.b64encode(serialized_data).decode("ascii")

            # VI expects media event format with Base64-encoded PCM audio
            timestamp = datetime.now(timezone.utc).isoformat().replace("+00:00", "Z")

            message = {
                "event": "media",
                "stream_id": self._stream_id,
                "media": {
                    "timestamp": timestamp,
                    "chunk": len(serialized_data),  # Chunk size in bytes
                    "payload": payload,
                },
            }

            # Log only metadata: dumping the whole message would emit the full
            # base64 payload for every audio chunk and flood the logs.
            logger.debug(
                f"VI: Sending media event ({len(serialized_data)} bytes) "
                f"for stream_id: {self._stream_id}"
            )

            return json.dumps(message)

        elif isinstance(frame, (TransportMessageFrame, TransportMessageUrgentFrame)):
            # Pass through transport messages (for mark events, etc.)
            return json.dumps(frame.message)

        return None

    async def _send_exit_event(self):
        """Send an exit event to VI to terminate the session gracefully.

        This method is called when auto_hang_up is enabled and an EndFrame or
        CancelFrame is received. The exit event allows IVR logic to continue
        after the WebSocket session ends.

        Returns:
            The exit event as a JSON string, or None if serialization failed.
        """
        try:
            exit_event = {
                "event": "exit",
                "stream_id": self._stream_id,
                "call_id": self._call_id,
                "timestamp": datetime.now(timezone.utc).isoformat().replace("+00:00", "Z"),
            }

            logger.info(
                f"VI auto_hang_up: Sending exit event for stream_id: {self._stream_id}, call_id: {self._call_id}"
            )
            return json.dumps(exit_event)
        except Exception as e:
            logger.error(f"VI auto_hang_up: Failed to create exit event: {e}")
            return None

    async def deserialize(self, data: str | bytes) -> Frame | None:
        """Deserializes VI WebSocket data to Pipecat frames.

        Handles conversion of VI media events to appropriate Pipecat frames.

        Args:
            data: The raw WebSocket data from VI.

        Returns:
            A Pipecat frame corresponding to the VI event, or None if unhandled.
        """
        try:
            message = json.loads(data)
        except json.JSONDecodeError:
            logger.error(f"Invalid JSON received from VI: {data}")
            return None

        # Log all incoming events for debugging and monitoring
        event = message.get("event")
        logger.debug(
            f"VI INCOMING EVENT: {event} - stream_id: {self._stream_id}, call_id: {self._call_id}"
        )

        if event == "media":
            # Handle incoming audio data from VI
            media = message.get("media", {})
            payload_base64 = media.get("payload")

            if not payload_base64:
                logger.warning("VI DESERIALIZE: No payload in VI media message")
                return None

            try:
                payload = base64.b64decode(payload_base64)
                chunk_size = len(payload)

                # Log chunk info (optional)
                logger.debug(
                    f"VI DESERIALIZE: Received audio from VI - {chunk_size} bytes at {self._vi_sample_rate}Hz"
                )

            except Exception as e:
                logger.error(f"VI DESERIALIZE: Error decoding VI audio payload: {e}")
                return None

            # Convert from VI sample rate (8kHz) to pipeline sample rate
            deserialized_data = await self._resampler.resample(
                payload,
                self._vi_sample_rate,
                self._sample_rate,
            )

            audio_frame = InputAudioRawFrame(
                audio=deserialized_data,
                num_channels=1,  # VI uses mono audio
                sample_rate=self._sample_rate,
            )
            return audio_frame

        elif event == "dtmf":
            # Handle DTMF events
            dtmf_data = message.get("dtmf", {})
            digit = dtmf_data.get("digit")

            if digit:
                try:
                    logger.info(f"VI: Received DTMF digit: {digit}")
                    return InputDTMFFrame(KeypadEntry(digit))
                except ValueError:
                    # Digit is not a valid KeypadEntry value; drop it.
                    logger.warning(f"Invalid DTMF digit from VI: {digit}")
            return None

        elif event == "connected":
            # Handle connection event
            logger.info(f"VI connection established: {message}")
            return None

        elif event == "start":
            # Handle stream start event
            logger.info(f"VI stream started: {message}")
            return None

        elif event == "stop":
            # Handle stream stop event
            logger.info(f"VI stream stopped: {message}")
            # Don't end the call here, wait for explicit exit or call end
            return None

        elif event == "mark":
            # Handle mark event - checkpoint confirming audio playback completion
            mark_data = message.get("mark", {})
            mark_name = mark_data.get("name", "unknown")
            logger.info(f"VI mark event received: {mark_name}")
            # Mark events are informational, no frame to return
            return None

        elif event == "error":
            # Handle error events
            error_msg = message.get("error", "Unknown error")
            logger.error(f"VI error: {error_msg}")
            return None

        elif event == "exit":
            # Handle exit event from VI
            logger.info("VI exit event received - terminating session")
            self._call_ended = True
            return CancelFrame()

        elif event in ("call_end", "callEnd"):
            # Handle call end event (if VI sends this)
            logger.info("VI call end event received")
            self._call_ended = True
            return CancelFrame()

        else:
            logger.debug(f"VI UNHANDLED EVENT: {event}")

        return None
@@ -15,7 +15,6 @@ from typing import AsyncGenerator, List, Literal, Optional, Union
15
15
  from loguru import logger
16
16
  from pydantic import BaseModel, Field
17
17
 
18
-
19
18
  from pipecat.frames.frames import (
20
19
  CancelFrame,
21
20
  EndFrame,
@@ -49,6 +48,26 @@ except ModuleNotFoundError as e:
49
48
  raise Exception(f"Missing module: {e}")
50
49
 
51
50
 
51
class GenerationConfig(BaseModel):
    """Generation parameters for Cartesia Sonic-3 models.

    Sonic-3 treats these values as guidance rather than hard constraints, so
    the model may deviate slightly to keep speech sounding natural. Test the
    settings against your own content for best results.

    Parameters:
        volume: Loudness multiplier for the synthesized speech. Valid range is
            [0.5, 2.0]; the service default is 1.0.
        speed: Rate multiplier for the synthesized speech. Valid range is
            [0.6, 1.5]; the service default is 1.0.
        emotion: A single emotion name that guides the emotional tone (e.g.
            "neutral", "angry", "excited", "content", "sad", "scared"); over
            60 emotions are supported. For best results use the recommended
            voices: Leo, Jace, Kyle, Gavin, Maya, Tessa, Dana, and Marian.
    """

    volume: Optional[float] = None
    speed: Optional[float] = None
    emotion: Optional[str] = None
+
70
+
52
71
  def language_to_cartesia_language(language: Language) -> Optional[str]:
53
72
  """Convert a Language enum to Cartesia language code.
54
73
 
@@ -74,6 +93,33 @@ def language_to_cartesia_language(language: Language) -> Optional[str]:
74
93
  Language.SV: "sv",
75
94
  Language.TR: "tr",
76
95
  Language.ZH: "zh",
96
+ Language.TL: "tl",
97
+ Language.BG: "bg",
98
+ Language.RO: "ro",
99
+ Language.AR: "ar",
100
+ Language.CS: "cs",
101
+ Language.EL: "el",
102
+ Language.FI: "fi",
103
+ Language.HR: "hr",
104
+ Language.MS: "ms",
105
+ Language.SK: "sk",
106
+ Language.DA: "da",
107
+ Language.TA: "ta",
108
+ Language.UK: "uk",
109
+ Language.HU: "hu",
110
+ Language.NO: "no",
111
+ Language.VI: "vi",
112
+ Language.BN: "bn",
113
+ Language.TH: "th",
114
+ Language.HE: "he",
115
+ Language.KA: "ka",
116
+ Language.ID: "id",
117
+ Language.TE: "te",
118
+ Language.GU: "gu",
119
+ Language.KN: "kn",
120
+ Language.ML: "ml",
121
+ Language.MR: "mr",
122
+ Language.PA: "pa",
77
123
  }
78
124
 
79
125
  result = BASE_LANGUAGES.get(language)
@@ -102,16 +148,20 @@ class CartesiaTTSService(AudioContextWordTTSService):
102
148
 
103
149
  Parameters:
104
150
  language: Language to use for synthesis.
105
- speed: Voice speed control.
106
- emotion: List of emotion controls.
151
+ speed: Voice speed control for non-Sonic-3 models (literal values).
152
+ emotion: List of emotion controls for non-Sonic-3 models.
107
153
 
108
154
  .. deprecated:: 0.0.68
109
155
  The `emotion` parameter is deprecated and will be removed in a future version.
156
+
157
+ generation_config: Generation configuration for Sonic-3 models. Includes volume,
158
+ speed (numeric), and emotion (string) parameters.
110
159
  """
111
160
 
112
161
  language: Optional[Language] = Language.EN
113
162
  speed: Optional[Literal["slow", "normal", "fast"]] = None
114
163
  emotion: Optional[List[str]] = []
164
+ generation_config: Optional[GenerationConfig] = None
115
165
 
116
166
  def __init__(
117
167
  self,
@@ -120,7 +170,7 @@ class CartesiaTTSService(AudioContextWordTTSService):
120
170
  voice_id: str,
121
171
  cartesia_version: str = "2025-04-16",
122
172
  url: str = "wss://api.cartesia.ai/tts/websocket",
123
- model: str = "sonic-2",
173
+ model: str = "sonic-3",
124
174
  sample_rate: Optional[int] = None,
125
175
  encoding: str = "pcm_s16le",
126
176
  container: str = "raw",
@@ -136,7 +186,7 @@ class CartesiaTTSService(AudioContextWordTTSService):
136
186
  voice_id: ID of the voice to use for synthesis.
137
187
  cartesia_version: API version string for Cartesia service.
138
188
  url: WebSocket URL for Cartesia TTS API.
139
- model: TTS model to use (e.g., "sonic-2").
189
+ model: TTS model to use (e.g., "sonic-3").
140
190
  sample_rate: Audio sample rate. If None, uses default.
141
191
  encoding: Audio encoding format.
142
192
  container: Audio container format.
@@ -180,6 +230,7 @@ class CartesiaTTSService(AudioContextWordTTSService):
180
230
  else "en",
181
231
  "speed": params.speed,
182
232
  "emotion": params.emotion,
233
+ "generation_config": params.generation_config,
183
234
  }
184
235
  self.set_model_name(model)
185
236
  self.set_voice(voice_id)
@@ -298,6 +349,11 @@ class CartesiaTTSService(AudioContextWordTTSService):
298
349
  if self._settings["speed"]:
299
350
  msg["speed"] = self._settings["speed"]
300
351
 
352
+ if self._settings["generation_config"]:
353
+ msg["generation_config"] = self._settings["generation_config"].model_dump(
354
+ exclude_none=True
355
+ )
356
+
301
357
  return json.dumps(msg)
302
358
 
303
359
  async def start(self, frame: StartFrame):
@@ -419,7 +475,6 @@ class CartesiaTTSService(AudioContextWordTTSService):
419
475
  logger.error(f"{self} error: {msg}")
420
476
  await self.push_frame(TTSStoppedFrame())
421
477
  await self.stop_all_metrics()
422
-
423
478
  await self.push_error(ErrorFrame(f"{self} error: {msg['error']}"))
424
479
  self._context_id = None
425
480
  else:
@@ -484,23 +539,27 @@ class CartesiaHttpTTSService(TTSService):
484
539
 
485
540
  Parameters:
486
541
  language: Language to use for synthesis.
487
- speed: Voice speed control.
488
- emotion: List of emotion controls.
542
+ speed: Voice speed control for non-Sonic-3 models (literal values).
543
+ emotion: List of emotion controls for non-Sonic-3 models.
489
544
 
490
545
  .. deprecated:: 0.0.68
491
546
  The `emotion` parameter is deprecated and will be removed in a future version.
547
+
548
+ generation_config: Generation configuration for Sonic-3 models. Includes volume,
549
+ speed (numeric), and emotion (string) parameters.
492
550
  """
493
551
 
494
552
  language: Optional[Language] = Language.EN
495
553
  speed: Optional[Literal["slow", "normal", "fast"]] = None
496
554
  emotion: Optional[List[str]] = Field(default_factory=list)
555
+ generation_config: Optional[GenerationConfig] = None
497
556
 
498
557
  def __init__(
499
558
  self,
500
559
  *,
501
560
  api_key: str,
502
561
  voice_id: str,
503
- model: str = "sonic-2",
562
+ model: str = "sonic-3",
504
563
  base_url: str = "https://api.cartesia.ai",
505
564
  cartesia_version: str = "2024-11-13",
506
565
  sample_rate: Optional[int] = None,
@@ -514,7 +573,7 @@ class CartesiaHttpTTSService(TTSService):
514
573
  Args:
515
574
  api_key: Cartesia API key for authentication.
516
575
  voice_id: ID of the voice to use for synthesis.
517
- model: TTS model to use (e.g., "sonic-2").
576
+ model: TTS model to use (e.g., "sonic-3").
518
577
  base_url: Base URL for Cartesia HTTP API.
519
578
  cartesia_version: API version string for Cartesia service.
520
579
  sample_rate: Audio sample rate. If None, uses default.
@@ -541,6 +600,7 @@ class CartesiaHttpTTSService(TTSService):
541
600
  else "en",
542
601
  "speed": params.speed,
543
602
  "emotion": params.emotion,
603
+ "generation_config": params.generation_config,
544
604
  }
545
605
  self.set_voice(voice_id)
546
606
  self.set_model_name(model)
@@ -634,6 +694,11 @@ class CartesiaHttpTTSService(TTSService):
634
694
  if self._settings["speed"]:
635
695
  payload["speed"] = self._settings["speed"]
636
696
 
697
+ if self._settings["generation_config"]:
698
+ payload["generation_config"] = self._settings["generation_config"].model_dump(
699
+ exclude_none=True
700
+ )
701
+
637
702
  yield TTSStartedFrame()
638
703
 
639
704
  session = await self._client._get_session()