dv-pipecat-ai 0.0.85.dev830__py3-none-any.whl → 0.0.85.dev832__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: dv-pipecat-ai
3
- Version: 0.0.85.dev830
3
+ Version: 0.0.85.dev832
4
4
  Summary: An open source framework for voice (and multimodal) assistants
5
5
  License-Expression: BSD-2-Clause
6
6
  Project-URL: Source, https://github.com/pipecat-ai/pipecat
@@ -1,4 +1,4 @@
1
- dv_pipecat_ai-0.0.85.dev830.dist-info/licenses/LICENSE,sha256=DWY2QGf2eMCFhuu2ChairtT6CB7BEFffNVhXWc4Od08,1301
1
+ dv_pipecat_ai-0.0.85.dev832.dist-info/licenses/LICENSE,sha256=DWY2QGf2eMCFhuu2ChairtT6CB7BEFffNVhXWc4Od08,1301
2
2
  pipecat/__init__.py,sha256=j0Xm6adxHhd7D06dIyyPV_GlBYLlBnTAERVvD_jAARQ,861
3
3
  pipecat/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
4
4
  pipecat/adapters/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -205,7 +205,7 @@ pipecat/services/azure/realtime/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm
205
205
  pipecat/services/azure/realtime/llm.py,sha256=MnDiw-YJP3kll1gbkta4z4vsWfWZ5oBprZCinMP9O0M,2385
206
206
  pipecat/services/cartesia/__init__.py,sha256=vzh0jBnfPwWdxFfV-tu0x1HFoOTgr9s91GYmD-CJUtY,284
207
207
  pipecat/services/cartesia/stt.py,sha256=00k9gQYo_xPKb-RRJ-RNV4LPFw-7xXiFU7ACFLYttWY,12388
208
- pipecat/services/cartesia/tts.py,sha256=EdpVJoDhZn7N5hj-VDsCaO-W2MsA78UzOdrHR4G7w08,24355
208
+ pipecat/services/cartesia/tts.py,sha256=I_OZCINywkDXmYzFL35MjSN8cAuNEaJs7nj0YB_obtc,27008
209
209
  pipecat/services/cerebras/__init__.py,sha256=5zBmqq9Zfcl-HC7ylekVS5qrRedbl1mAeEwUT-T-c_o,259
210
210
  pipecat/services/cerebras/llm.py,sha256=-yzSe_6YDGigwzES-LZS4vNXMPugmvsIYEpTySyr5nA,3047
211
211
  pipecat/services/deepgram/__init__.py,sha256=IjRtMI7WytRDdmYVpk2qDWClXUiNgdl7ZkvEAWg1eYE,304
@@ -415,7 +415,7 @@ pipecat/utils/tracing/service_decorators.py,sha256=fwzxFpi8DJl6BJbK74G0UEB4ccMJg
415
415
  pipecat/utils/tracing/setup.py,sha256=7TEgPNpq6M8lww8OQvf0P9FzYc5A30xICGklVA-fua0,2892
416
416
  pipecat/utils/tracing/turn_context_provider.py,sha256=ikon3plFOx0XbMrH6DdeHttNpb-U0gzMZIm3bWLc9eI,2485
417
417
  pipecat/utils/tracing/turn_trace_observer.py,sha256=dma16SBJpYSOE58YDWy89QzHyQFc_9gQZszKeWixuwc,9725
418
- dv_pipecat_ai-0.0.85.dev830.dist-info/METADATA,sha256=wPJAPffJo_L5wKNWKbIxlaBG09JAGKUTFl_qkLwmoPw,32924
419
- dv_pipecat_ai-0.0.85.dev830.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
420
- dv_pipecat_ai-0.0.85.dev830.dist-info/top_level.txt,sha256=kQzG20CxGf-nSsHmtXHx3hY2-8zHA3jYg8jk0TajqXc,8
421
- dv_pipecat_ai-0.0.85.dev830.dist-info/RECORD,,
418
+ dv_pipecat_ai-0.0.85.dev832.dist-info/METADATA,sha256=LjkA2HTlz8IiiToSkqkqztGsCqkbhgEdL6B0BXdOOLA,32924
419
+ dv_pipecat_ai-0.0.85.dev832.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
420
+ dv_pipecat_ai-0.0.85.dev832.dist-info/top_level.txt,sha256=kQzG20CxGf-nSsHmtXHx3hY2-8zHA3jYg8jk0TajqXc,8
421
+ dv_pipecat_ai-0.0.85.dev832.dist-info/RECORD,,
@@ -15,7 +15,6 @@ from typing import AsyncGenerator, List, Literal, Optional, Union
15
15
  from loguru import logger
16
16
  from pydantic import BaseModel, Field
17
17
 
18
-
19
18
  from pipecat.frames.frames import (
20
19
  CancelFrame,
21
20
  EndFrame,
@@ -49,6 +48,26 @@ except ModuleNotFoundError as e:
49
48
  raise Exception(f"Missing module: {e}")
50
49
 
51
50
 
51
+ class GenerationConfig(BaseModel):
52
+ """Configuration for Cartesia Sonic-3 generation parameters.
53
+
54
+ Sonic-3 interprets these parameters as guidance to ensure natural speech.
55
+ Test against your content for best results.
56
+
57
+ Parameters:
58
+ volume: Volume multiplier for generated speech. Valid range: [0.5, 2.0]. Default is 1.0.
59
+ speed: Speed multiplier for generated speech. Valid range: [0.6, 1.5]. Default is 1.0.
60
+ emotion: Single emotion string to guide the emotional tone. Examples include neutral,
61
+ angry, excited, content, sad, scared. Over 60 emotions are supported. For best
62
+ results, use with recommended voices: Leo, Jace, Kyle, Gavin, Maya, Tessa, Dana,
63
+ and Marian.
64
+ """
65
+
66
+ volume: Optional[float] = None
67
+ speed: Optional[float] = None
68
+ emotion: Optional[str] = None
69
+
70
+
52
71
  def language_to_cartesia_language(language: Language) -> Optional[str]:
53
72
  """Convert a Language enum to Cartesia language code.
54
73
 
@@ -74,6 +93,33 @@ def language_to_cartesia_language(language: Language) -> Optional[str]:
74
93
  Language.SV: "sv",
75
94
  Language.TR: "tr",
76
95
  Language.ZH: "zh",
96
+ Language.TL: "tl",
97
+ Language.BG: "bg",
98
+ Language.RO: "ro",
99
+ Language.AR: "ar",
100
+ Language.CS: "cs",
101
+ Language.EL: "el",
102
+ Language.FI: "fi",
103
+ Language.HR: "hr",
104
+ Language.MS: "ms",
105
+ Language.SK: "sk",
106
+ Language.DA: "da",
107
+ Language.TA: "ta",
108
+ Language.UK: "uk",
109
+ Language.HU: "hu",
110
+ Language.NO: "no",
111
+ Language.VI: "vi",
112
+ Language.BN: "bn",
113
+ Language.TH: "th",
114
+ Language.HE: "he",
115
+ Language.KA: "ka",
116
+ Language.ID: "id",
117
+ Language.TE: "te",
118
+ Language.GU: "gu",
119
+ Language.KN: "kn",
120
+ Language.ML: "ml",
121
+ Language.MR: "mr",
122
+ Language.PA: "pa",
77
123
  }
78
124
 
79
125
  result = BASE_LANGUAGES.get(language)
@@ -102,16 +148,20 @@ class CartesiaTTSService(AudioContextWordTTSService):
102
148
 
103
149
  Parameters:
104
150
  language: Language to use for synthesis.
105
- speed: Voice speed control.
106
- emotion: List of emotion controls.
151
+ speed: Voice speed control for non-Sonic-3 models (literal values).
152
+ emotion: List of emotion controls for non-Sonic-3 models.
107
153
 
108
154
  .. deprecated:: 0.0.68
109
155
  The `emotion` parameter is deprecated and will be removed in a future version.
156
+
157
+ generation_config: Generation configuration for Sonic-3 models. Includes volume,
158
+ speed (numeric), and emotion (string) parameters.
110
159
  """
111
160
 
112
161
  language: Optional[Language] = Language.EN
113
162
  speed: Optional[Literal["slow", "normal", "fast"]] = None
114
163
  emotion: Optional[List[str]] = []
164
+ generation_config: Optional[GenerationConfig] = None
115
165
 
116
166
  def __init__(
117
167
  self,
@@ -120,7 +170,7 @@ class CartesiaTTSService(AudioContextWordTTSService):
120
170
  voice_id: str,
121
171
  cartesia_version: str = "2025-04-16",
122
172
  url: str = "wss://api.cartesia.ai/tts/websocket",
123
- model: str = "sonic-2",
173
+ model: str = "sonic-3",
124
174
  sample_rate: Optional[int] = None,
125
175
  encoding: str = "pcm_s16le",
126
176
  container: str = "raw",
@@ -136,7 +186,7 @@ class CartesiaTTSService(AudioContextWordTTSService):
136
186
  voice_id: ID of the voice to use for synthesis.
137
187
  cartesia_version: API version string for Cartesia service.
138
188
  url: WebSocket URL for Cartesia TTS API.
139
- model: TTS model to use (e.g., "sonic-2").
189
+ model: TTS model to use (e.g., "sonic-3").
140
190
  sample_rate: Audio sample rate. If None, uses default.
141
191
  encoding: Audio encoding format.
142
192
  container: Audio container format.
@@ -180,6 +230,7 @@ class CartesiaTTSService(AudioContextWordTTSService):
180
230
  else "en",
181
231
  "speed": params.speed,
182
232
  "emotion": params.emotion,
233
+ "generation_config": params.generation_config,
183
234
  }
184
235
  self.set_model_name(model)
185
236
  self.set_voice(voice_id)
@@ -298,6 +349,11 @@ class CartesiaTTSService(AudioContextWordTTSService):
298
349
  if self._settings["speed"]:
299
350
  msg["speed"] = self._settings["speed"]
300
351
 
352
+ if self._settings["generation_config"]:
353
+ msg["generation_config"] = self._settings["generation_config"].model_dump(
354
+ exclude_none=True
355
+ )
356
+
301
357
  return json.dumps(msg)
302
358
 
303
359
  async def start(self, frame: StartFrame):
@@ -419,7 +475,6 @@ class CartesiaTTSService(AudioContextWordTTSService):
419
475
  logger.error(f"{self} error: {msg}")
420
476
  await self.push_frame(TTSStoppedFrame())
421
477
  await self.stop_all_metrics()
422
-
423
478
  await self.push_error(ErrorFrame(f"{self} error: {msg['error']}"))
424
479
  self._context_id = None
425
480
  else:
@@ -484,23 +539,27 @@ class CartesiaHttpTTSService(TTSService):
484
539
 
485
540
  Parameters:
486
541
  language: Language to use for synthesis.
487
- speed: Voice speed control.
488
- emotion: List of emotion controls.
542
+ speed: Voice speed control for non-Sonic-3 models (literal values).
543
+ emotion: List of emotion controls for non-Sonic-3 models.
489
544
 
490
545
  .. deprecated:: 0.0.68
491
546
  The `emotion` parameter is deprecated and will be removed in a future version.
547
+
548
+ generation_config: Generation configuration for Sonic-3 models. Includes volume,
549
+ speed (numeric), and emotion (string) parameters.
492
550
  """
493
551
 
494
552
  language: Optional[Language] = Language.EN
495
553
  speed: Optional[Literal["slow", "normal", "fast"]] = None
496
554
  emotion: Optional[List[str]] = Field(default_factory=list)
555
+ generation_config: Optional[GenerationConfig] = None
497
556
 
498
557
  def __init__(
499
558
  self,
500
559
  *,
501
560
  api_key: str,
502
561
  voice_id: str,
503
- model: str = "sonic-2",
562
+ model: str = "sonic-3",
504
563
  base_url: str = "https://api.cartesia.ai",
505
564
  cartesia_version: str = "2024-11-13",
506
565
  sample_rate: Optional[int] = None,
@@ -514,7 +573,7 @@ class CartesiaHttpTTSService(TTSService):
514
573
  Args:
515
574
  api_key: Cartesia API key for authentication.
516
575
  voice_id: ID of the voice to use for synthesis.
517
- model: TTS model to use (e.g., "sonic-2").
576
+ model: TTS model to use (e.g., "sonic-3").
518
577
  base_url: Base URL for Cartesia HTTP API.
519
578
  cartesia_version: API version string for Cartesia service.
520
579
  sample_rate: Audio sample rate. If None, uses default.
@@ -541,6 +600,7 @@ class CartesiaHttpTTSService(TTSService):
541
600
  else "en",
542
601
  "speed": params.speed,
543
602
  "emotion": params.emotion,
603
+ "generation_config": params.generation_config,
544
604
  }
545
605
  self.set_voice(voice_id)
546
606
  self.set_model_name(model)
@@ -634,6 +694,11 @@ class CartesiaHttpTTSService(TTSService):
634
694
  if self._settings["speed"]:
635
695
  payload["speed"] = self._settings["speed"]
636
696
 
697
+ if self._settings["generation_config"]:
698
+ payload["generation_config"] = self._settings["generation_config"].model_dump(
699
+ exclude_none=True
700
+ )
701
+
637
702
  yield TTSStartedFrame()
638
703
 
639
704
  session = await self._client._get_session()