dv-pipecat-ai: dv_pipecat_ai-0.0.85.dev830-py3-none-any.whl → dv_pipecat_ai-0.0.85.dev831-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of dv-pipecat-ai might be problematic. See the package registry's advisory listing for more details.

@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: dv-pipecat-ai
3
- Version: 0.0.85.dev830
3
+ Version: 0.0.85.dev831
4
4
  Summary: An open source framework for voice (and multimodal) assistants
5
5
  License-Expression: BSD-2-Clause
6
6
  Project-URL: Source, https://github.com/pipecat-ai/pipecat
@@ -1,4 +1,4 @@
1
- dv_pipecat_ai-0.0.85.dev830.dist-info/licenses/LICENSE,sha256=DWY2QGf2eMCFhuu2ChairtT6CB7BEFffNVhXWc4Od08,1301
1
+ dv_pipecat_ai-0.0.85.dev831.dist-info/licenses/LICENSE,sha256=DWY2QGf2eMCFhuu2ChairtT6CB7BEFffNVhXWc4Od08,1301
2
2
  pipecat/__init__.py,sha256=j0Xm6adxHhd7D06dIyyPV_GlBYLlBnTAERVvD_jAARQ,861
3
3
  pipecat/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
4
4
  pipecat/adapters/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -205,7 +205,7 @@ pipecat/services/azure/realtime/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm
205
205
  pipecat/services/azure/realtime/llm.py,sha256=MnDiw-YJP3kll1gbkta4z4vsWfWZ5oBprZCinMP9O0M,2385
206
206
  pipecat/services/cartesia/__init__.py,sha256=vzh0jBnfPwWdxFfV-tu0x1HFoOTgr9s91GYmD-CJUtY,284
207
207
  pipecat/services/cartesia/stt.py,sha256=00k9gQYo_xPKb-RRJ-RNV4LPFw-7xXiFU7ACFLYttWY,12388
208
- pipecat/services/cartesia/tts.py,sha256=EdpVJoDhZn7N5hj-VDsCaO-W2MsA78UzOdrHR4G7w08,24355
208
+ pipecat/services/cartesia/tts.py,sha256=Fh6hm5AUj2rNX8J4UOjHA7uAPIGcie1Dyxv5WBvV1OY,26279
209
209
  pipecat/services/cerebras/__init__.py,sha256=5zBmqq9Zfcl-HC7ylekVS5qrRedbl1mAeEwUT-T-c_o,259
210
210
  pipecat/services/cerebras/llm.py,sha256=-yzSe_6YDGigwzES-LZS4vNXMPugmvsIYEpTySyr5nA,3047
211
211
  pipecat/services/deepgram/__init__.py,sha256=IjRtMI7WytRDdmYVpk2qDWClXUiNgdl7ZkvEAWg1eYE,304
@@ -415,7 +415,7 @@ pipecat/utils/tracing/service_decorators.py,sha256=fwzxFpi8DJl6BJbK74G0UEB4ccMJg
415
415
  pipecat/utils/tracing/setup.py,sha256=7TEgPNpq6M8lww8OQvf0P9FzYc5A30xICGklVA-fua0,2892
416
416
  pipecat/utils/tracing/turn_context_provider.py,sha256=ikon3plFOx0XbMrH6DdeHttNpb-U0gzMZIm3bWLc9eI,2485
417
417
  pipecat/utils/tracing/turn_trace_observer.py,sha256=dma16SBJpYSOE58YDWy89QzHyQFc_9gQZszKeWixuwc,9725
418
- dv_pipecat_ai-0.0.85.dev830.dist-info/METADATA,sha256=wPJAPffJo_L5wKNWKbIxlaBG09JAGKUTFl_qkLwmoPw,32924
419
- dv_pipecat_ai-0.0.85.dev830.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
420
- dv_pipecat_ai-0.0.85.dev830.dist-info/top_level.txt,sha256=kQzG20CxGf-nSsHmtXHx3hY2-8zHA3jYg8jk0TajqXc,8
421
- dv_pipecat_ai-0.0.85.dev830.dist-info/RECORD,,
418
+ dv_pipecat_ai-0.0.85.dev831.dist-info/METADATA,sha256=5ahEs864DAPIEEiiv7-7Oa-vRhRN1Ede341NuqED3Sw,32924
419
+ dv_pipecat_ai-0.0.85.dev831.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
420
+ dv_pipecat_ai-0.0.85.dev831.dist-info/top_level.txt,sha256=kQzG20CxGf-nSsHmtXHx3hY2-8zHA3jYg8jk0TajqXc,8
421
+ dv_pipecat_ai-0.0.85.dev831.dist-info/RECORD,,
@@ -15,7 +15,6 @@ from typing import AsyncGenerator, List, Literal, Optional, Union
15
15
  from loguru import logger
16
16
  from pydantic import BaseModel, Field
17
17
 
18
-
19
18
  from pipecat.frames.frames import (
20
19
  CancelFrame,
21
20
  EndFrame,
@@ -49,6 +48,26 @@ except ModuleNotFoundError as e:
49
48
  raise Exception(f"Missing module: {e}")
50
49
 
51
50
 
51
+ class GenerationConfig(BaseModel):
52
+ """Configuration for Cartesia Sonic-3 generation parameters.
53
+
54
+ Sonic-3 interprets these parameters as guidance to ensure natural speech.
55
+ Test against your content for best results.
56
+
57
+ Parameters:
58
+ volume: Volume multiplier for generated speech. Valid range: [0.5, 2.0]. Default is 1.0.
59
+ speed: Speed multiplier for generated speech. Valid range: [0.6, 1.5]. Default is 1.0.
60
+ emotion: Single emotion string to guide the emotional tone. Examples include neutral,
61
+ angry, excited, content, sad, scared. Over 60 emotions are supported. For best
62
+ results, use with recommended voices: Leo, Jace, Kyle, Gavin, Maya, Tessa, Dana,
63
+ and Marian.
64
+ """
65
+
66
+ volume: Optional[float] = None
67
+ speed: Optional[float] = None
68
+ emotion: Optional[str] = None
69
+
70
+
52
71
  def language_to_cartesia_language(language: Language) -> Optional[str]:
53
72
  """Convert a Language enum to Cartesia language code.
54
73
 
@@ -102,16 +121,20 @@ class CartesiaTTSService(AudioContextWordTTSService):
102
121
 
103
122
  Parameters:
104
123
  language: Language to use for synthesis.
105
- speed: Voice speed control.
106
- emotion: List of emotion controls.
124
+ speed: Voice speed control for non-Sonic-3 models (literal values).
125
+ emotion: List of emotion controls for non-Sonic-3 models.
107
126
 
108
127
  .. deprecated:: 0.0.68
109
128
  The `emotion` parameter is deprecated and will be removed in a future version.
129
+
130
+ generation_config: Generation configuration for Sonic-3 models. Includes volume,
131
+ speed (numeric), and emotion (string) parameters.
110
132
  """
111
133
 
112
134
  language: Optional[Language] = Language.EN
113
135
  speed: Optional[Literal["slow", "normal", "fast"]] = None
114
136
  emotion: Optional[List[str]] = []
137
+ generation_config: Optional[GenerationConfig] = None
115
138
 
116
139
  def __init__(
117
140
  self,
@@ -120,7 +143,7 @@ class CartesiaTTSService(AudioContextWordTTSService):
120
143
  voice_id: str,
121
144
  cartesia_version: str = "2025-04-16",
122
145
  url: str = "wss://api.cartesia.ai/tts/websocket",
123
- model: str = "sonic-2",
146
+ model: str = "sonic-3",
124
147
  sample_rate: Optional[int] = None,
125
148
  encoding: str = "pcm_s16le",
126
149
  container: str = "raw",
@@ -136,7 +159,7 @@ class CartesiaTTSService(AudioContextWordTTSService):
136
159
  voice_id: ID of the voice to use for synthesis.
137
160
  cartesia_version: API version string for Cartesia service.
138
161
  url: WebSocket URL for Cartesia TTS API.
139
- model: TTS model to use (e.g., "sonic-2").
162
+ model: TTS model to use (e.g., "sonic-3").
140
163
  sample_rate: Audio sample rate. If None, uses default.
141
164
  encoding: Audio encoding format.
142
165
  container: Audio container format.
@@ -180,6 +203,7 @@ class CartesiaTTSService(AudioContextWordTTSService):
180
203
  else "en",
181
204
  "speed": params.speed,
182
205
  "emotion": params.emotion,
206
+ "generation_config": params.generation_config,
183
207
  }
184
208
  self.set_model_name(model)
185
209
  self.set_voice(voice_id)
@@ -298,6 +322,11 @@ class CartesiaTTSService(AudioContextWordTTSService):
298
322
  if self._settings["speed"]:
299
323
  msg["speed"] = self._settings["speed"]
300
324
 
325
+ if self._settings["generation_config"]:
326
+ msg["generation_config"] = self._settings["generation_config"].model_dump(
327
+ exclude_none=True
328
+ )
329
+
301
330
  return json.dumps(msg)
302
331
 
303
332
  async def start(self, frame: StartFrame):
@@ -419,7 +448,6 @@ class CartesiaTTSService(AudioContextWordTTSService):
419
448
  logger.error(f"{self} error: {msg}")
420
449
  await self.push_frame(TTSStoppedFrame())
421
450
  await self.stop_all_metrics()
422
-
423
451
  await self.push_error(ErrorFrame(f"{self} error: {msg['error']}"))
424
452
  self._context_id = None
425
453
  else:
@@ -484,23 +512,27 @@ class CartesiaHttpTTSService(TTSService):
484
512
 
485
513
  Parameters:
486
514
  language: Language to use for synthesis.
487
- speed: Voice speed control.
488
- emotion: List of emotion controls.
515
+ speed: Voice speed control for non-Sonic-3 models (literal values).
516
+ emotion: List of emotion controls for non-Sonic-3 models.
489
517
 
490
518
  .. deprecated:: 0.0.68
491
519
  The `emotion` parameter is deprecated and will be removed in a future version.
520
+
521
+ generation_config: Generation configuration for Sonic-3 models. Includes volume,
522
+ speed (numeric), and emotion (string) parameters.
492
523
  """
493
524
 
494
525
  language: Optional[Language] = Language.EN
495
526
  speed: Optional[Literal["slow", "normal", "fast"]] = None
496
527
  emotion: Optional[List[str]] = Field(default_factory=list)
528
+ generation_config: Optional[GenerationConfig] = None
497
529
 
498
530
  def __init__(
499
531
  self,
500
532
  *,
501
533
  api_key: str,
502
534
  voice_id: str,
503
- model: str = "sonic-2",
535
+ model: str = "sonic-3",
504
536
  base_url: str = "https://api.cartesia.ai",
505
537
  cartesia_version: str = "2024-11-13",
506
538
  sample_rate: Optional[int] = None,
@@ -514,7 +546,7 @@ class CartesiaHttpTTSService(TTSService):
514
546
  Args:
515
547
  api_key: Cartesia API key for authentication.
516
548
  voice_id: ID of the voice to use for synthesis.
517
- model: TTS model to use (e.g., "sonic-2").
549
+ model: TTS model to use (e.g., "sonic-3").
518
550
  base_url: Base URL for Cartesia HTTP API.
519
551
  cartesia_version: API version string for Cartesia service.
520
552
  sample_rate: Audio sample rate. If None, uses default.
@@ -541,6 +573,7 @@ class CartesiaHttpTTSService(TTSService):
541
573
  else "en",
542
574
  "speed": params.speed,
543
575
  "emotion": params.emotion,
576
+ "generation_config": params.generation_config,
544
577
  }
545
578
  self.set_voice(voice_id)
546
579
  self.set_model_name(model)
@@ -634,6 +667,11 @@ class CartesiaHttpTTSService(TTSService):
634
667
  if self._settings["speed"]:
635
668
  payload["speed"] = self._settings["speed"]
636
669
 
670
+ if self._settings["generation_config"]:
671
+ payload["generation_config"] = self._settings["generation_config"].model_dump(
672
+ exclude_none=True
673
+ )
674
+
637
675
  yield TTSStartedFrame()
638
676
 
639
677
  session = await self._client._get_session()