dv-pipecat-ai 0.0.85.dev830__py3-none-any.whl → 0.0.85.dev832__py3-none-any.whl
This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
- {dv_pipecat_ai-0.0.85.dev830.dist-info → dv_pipecat_ai-0.0.85.dev832.dist-info}/METADATA +1 -1
- {dv_pipecat_ai-0.0.85.dev830.dist-info → dv_pipecat_ai-0.0.85.dev832.dist-info}/RECORD +6 -6
- pipecat/services/cartesia/tts.py +75 -10
- {dv_pipecat_ai-0.0.85.dev830.dist-info → dv_pipecat_ai-0.0.85.dev832.dist-info}/WHEEL +0 -0
- {dv_pipecat_ai-0.0.85.dev830.dist-info → dv_pipecat_ai-0.0.85.dev832.dist-info}/licenses/LICENSE +0 -0
- {dv_pipecat_ai-0.0.85.dev830.dist-info → dv_pipecat_ai-0.0.85.dev832.dist-info}/top_level.txt +0 -0
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
dv_pipecat_ai-0.0.85.
|
|
1
|
+
dv_pipecat_ai-0.0.85.dev832.dist-info/licenses/LICENSE,sha256=DWY2QGf2eMCFhuu2ChairtT6CB7BEFffNVhXWc4Od08,1301
|
|
2
2
|
pipecat/__init__.py,sha256=j0Xm6adxHhd7D06dIyyPV_GlBYLlBnTAERVvD_jAARQ,861
|
|
3
3
|
pipecat/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
4
4
|
pipecat/adapters/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
@@ -205,7 +205,7 @@ pipecat/services/azure/realtime/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm
|
|
|
205
205
|
pipecat/services/azure/realtime/llm.py,sha256=MnDiw-YJP3kll1gbkta4z4vsWfWZ5oBprZCinMP9O0M,2385
|
|
206
206
|
pipecat/services/cartesia/__init__.py,sha256=vzh0jBnfPwWdxFfV-tu0x1HFoOTgr9s91GYmD-CJUtY,284
|
|
207
207
|
pipecat/services/cartesia/stt.py,sha256=00k9gQYo_xPKb-RRJ-RNV4LPFw-7xXiFU7ACFLYttWY,12388
|
|
208
|
-
pipecat/services/cartesia/tts.py,sha256=
|
|
208
|
+
pipecat/services/cartesia/tts.py,sha256=I_OZCINywkDXmYzFL35MjSN8cAuNEaJs7nj0YB_obtc,27008
|
|
209
209
|
pipecat/services/cerebras/__init__.py,sha256=5zBmqq9Zfcl-HC7ylekVS5qrRedbl1mAeEwUT-T-c_o,259
|
|
210
210
|
pipecat/services/cerebras/llm.py,sha256=-yzSe_6YDGigwzES-LZS4vNXMPugmvsIYEpTySyr5nA,3047
|
|
211
211
|
pipecat/services/deepgram/__init__.py,sha256=IjRtMI7WytRDdmYVpk2qDWClXUiNgdl7ZkvEAWg1eYE,304
|
|
@@ -415,7 +415,7 @@ pipecat/utils/tracing/service_decorators.py,sha256=fwzxFpi8DJl6BJbK74G0UEB4ccMJg
|
|
|
415
415
|
pipecat/utils/tracing/setup.py,sha256=7TEgPNpq6M8lww8OQvf0P9FzYc5A30xICGklVA-fua0,2892
|
|
416
416
|
pipecat/utils/tracing/turn_context_provider.py,sha256=ikon3plFOx0XbMrH6DdeHttNpb-U0gzMZIm3bWLc9eI,2485
|
|
417
417
|
pipecat/utils/tracing/turn_trace_observer.py,sha256=dma16SBJpYSOE58YDWy89QzHyQFc_9gQZszKeWixuwc,9725
|
|
418
|
-
dv_pipecat_ai-0.0.85.
|
|
419
|
-
dv_pipecat_ai-0.0.85.
|
|
420
|
-
dv_pipecat_ai-0.0.85.
|
|
421
|
-
dv_pipecat_ai-0.0.85.
|
|
418
|
+
dv_pipecat_ai-0.0.85.dev832.dist-info/METADATA,sha256=LjkA2HTlz8IiiToSkqkqztGsCqkbhgEdL6B0BXdOOLA,32924
|
|
419
|
+
dv_pipecat_ai-0.0.85.dev832.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
420
|
+
dv_pipecat_ai-0.0.85.dev832.dist-info/top_level.txt,sha256=kQzG20CxGf-nSsHmtXHx3hY2-8zHA3jYg8jk0TajqXc,8
|
|
421
|
+
dv_pipecat_ai-0.0.85.dev832.dist-info/RECORD,,
|
pipecat/services/cartesia/tts.py
CHANGED
|
@@ -15,7 +15,6 @@ from typing import AsyncGenerator, List, Literal, Optional, Union
|
|
|
15
15
|
from loguru import logger
|
|
16
16
|
from pydantic import BaseModel, Field
|
|
17
17
|
|
|
18
|
-
|
|
19
18
|
from pipecat.frames.frames import (
|
|
20
19
|
CancelFrame,
|
|
21
20
|
EndFrame,
|
|
@@ -49,6 +48,26 @@ except ModuleNotFoundError as e:
|
|
|
49
48
|
raise Exception(f"Missing module: {e}")
|
|
50
49
|
|
|
51
50
|
|
|
51
|
+
class GenerationConfig(BaseModel):
|
|
52
|
+
"""Configuration for Cartesia Sonic-3 generation parameters.
|
|
53
|
+
|
|
54
|
+
Sonic-3 interprets these parameters as guidance to ensure natural speech.
|
|
55
|
+
Test against your content for best results.
|
|
56
|
+
|
|
57
|
+
Parameters:
|
|
58
|
+
volume: Volume multiplier for generated speech. Valid range: [0.5, 2.0]. Default is 1.0.
|
|
59
|
+
speed: Speed multiplier for generated speech. Valid range: [0.6, 1.5]. Default is 1.0.
|
|
60
|
+
emotion: Single emotion string to guide the emotional tone. Examples include neutral,
|
|
61
|
+
angry, excited, content, sad, scared. Over 60 emotions are supported. For best
|
|
62
|
+
results, use with recommended voices: Leo, Jace, Kyle, Gavin, Maya, Tessa, Dana,
|
|
63
|
+
and Marian.
|
|
64
|
+
"""
|
|
65
|
+
|
|
66
|
+
volume: Optional[float] = None
|
|
67
|
+
speed: Optional[float] = None
|
|
68
|
+
emotion: Optional[str] = None
|
|
69
|
+
|
|
70
|
+
|
|
52
71
|
def language_to_cartesia_language(language: Language) -> Optional[str]:
|
|
53
72
|
"""Convert a Language enum to Cartesia language code.
|
|
54
73
|
|
|
@@ -74,6 +93,33 @@ def language_to_cartesia_language(language: Language) -> Optional[str]:
|
|
|
74
93
|
Language.SV: "sv",
|
|
75
94
|
Language.TR: "tr",
|
|
76
95
|
Language.ZH: "zh",
|
|
96
|
+
Language.TL: "tl",
|
|
97
|
+
Language.BG: "bg",
|
|
98
|
+
Language.RO: "ro",
|
|
99
|
+
Language.AR: "ar",
|
|
100
|
+
Language.CS: "cs",
|
|
101
|
+
Language.EL: "el",
|
|
102
|
+
Language.FI: "fi",
|
|
103
|
+
Language.HR: "hr",
|
|
104
|
+
Language.MS: "ms",
|
|
105
|
+
Language.SK: "sk",
|
|
106
|
+
Language.DA: "da",
|
|
107
|
+
Language.TA: "ta",
|
|
108
|
+
Language.UK: "uk",
|
|
109
|
+
Language.HU: "hu",
|
|
110
|
+
Language.NO: "no",
|
|
111
|
+
Language.VI: "vi",
|
|
112
|
+
Language.BN: "bn",
|
|
113
|
+
Language.TH: "th",
|
|
114
|
+
Language.HE: "he",
|
|
115
|
+
Language.KA: "ka",
|
|
116
|
+
Language.ID: "id",
|
|
117
|
+
Language.TE: "te",
|
|
118
|
+
Language.GU: "gu",
|
|
119
|
+
Language.KN: "kn",
|
|
120
|
+
Language.ML: "ml",
|
|
121
|
+
Language.MR: "mr",
|
|
122
|
+
Language.PA: "pa",
|
|
77
123
|
}
|
|
78
124
|
|
|
79
125
|
result = BASE_LANGUAGES.get(language)
|
|
@@ -102,16 +148,20 @@ class CartesiaTTSService(AudioContextWordTTSService):
|
|
|
102
148
|
|
|
103
149
|
Parameters:
|
|
104
150
|
language: Language to use for synthesis.
|
|
105
|
-
speed: Voice speed control.
|
|
106
|
-
emotion: List of emotion controls.
|
|
151
|
+
speed: Voice speed control for non-Sonic-3 models (literal values).
|
|
152
|
+
emotion: List of emotion controls for non-Sonic-3 models.
|
|
107
153
|
|
|
108
154
|
.. deprecated:: 0.0.68
|
|
109
155
|
The `emotion` parameter is deprecated and will be removed in a future version.
|
|
156
|
+
|
|
157
|
+
generation_config: Generation configuration for Sonic-3 models. Includes volume,
|
|
158
|
+
speed (numeric), and emotion (string) parameters.
|
|
110
159
|
"""
|
|
111
160
|
|
|
112
161
|
language: Optional[Language] = Language.EN
|
|
113
162
|
speed: Optional[Literal["slow", "normal", "fast"]] = None
|
|
114
163
|
emotion: Optional[List[str]] = []
|
|
164
|
+
generation_config: Optional[GenerationConfig] = None
|
|
115
165
|
|
|
116
166
|
def __init__(
|
|
117
167
|
self,
|
|
@@ -120,7 +170,7 @@ class CartesiaTTSService(AudioContextWordTTSService):
|
|
|
120
170
|
voice_id: str,
|
|
121
171
|
cartesia_version: str = "2025-04-16",
|
|
122
172
|
url: str = "wss://api.cartesia.ai/tts/websocket",
|
|
123
|
-
model: str = "sonic-
|
|
173
|
+
model: str = "sonic-3",
|
|
124
174
|
sample_rate: Optional[int] = None,
|
|
125
175
|
encoding: str = "pcm_s16le",
|
|
126
176
|
container: str = "raw",
|
|
@@ -136,7 +186,7 @@ class CartesiaTTSService(AudioContextWordTTSService):
|
|
|
136
186
|
voice_id: ID of the voice to use for synthesis.
|
|
137
187
|
cartesia_version: API version string for Cartesia service.
|
|
138
188
|
url: WebSocket URL for Cartesia TTS API.
|
|
139
|
-
model: TTS model to use (e.g., "sonic-
|
|
189
|
+
model: TTS model to use (e.g., "sonic-3").
|
|
140
190
|
sample_rate: Audio sample rate. If None, uses default.
|
|
141
191
|
encoding: Audio encoding format.
|
|
142
192
|
container: Audio container format.
|
|
@@ -180,6 +230,7 @@ class CartesiaTTSService(AudioContextWordTTSService):
|
|
|
180
230
|
else "en",
|
|
181
231
|
"speed": params.speed,
|
|
182
232
|
"emotion": params.emotion,
|
|
233
|
+
"generation_config": params.generation_config,
|
|
183
234
|
}
|
|
184
235
|
self.set_model_name(model)
|
|
185
236
|
self.set_voice(voice_id)
|
|
@@ -298,6 +349,11 @@ class CartesiaTTSService(AudioContextWordTTSService):
|
|
|
298
349
|
if self._settings["speed"]:
|
|
299
350
|
msg["speed"] = self._settings["speed"]
|
|
300
351
|
|
|
352
|
+
if self._settings["generation_config"]:
|
|
353
|
+
msg["generation_config"] = self._settings["generation_config"].model_dump(
|
|
354
|
+
exclude_none=True
|
|
355
|
+
)
|
|
356
|
+
|
|
301
357
|
return json.dumps(msg)
|
|
302
358
|
|
|
303
359
|
async def start(self, frame: StartFrame):
|
|
@@ -419,7 +475,6 @@ class CartesiaTTSService(AudioContextWordTTSService):
|
|
|
419
475
|
logger.error(f"{self} error: {msg}")
|
|
420
476
|
await self.push_frame(TTSStoppedFrame())
|
|
421
477
|
await self.stop_all_metrics()
|
|
422
|
-
|
|
423
478
|
await self.push_error(ErrorFrame(f"{self} error: {msg['error']}"))
|
|
424
479
|
self._context_id = None
|
|
425
480
|
else:
|
|
@@ -484,23 +539,27 @@ class CartesiaHttpTTSService(TTSService):
|
|
|
484
539
|
|
|
485
540
|
Parameters:
|
|
486
541
|
language: Language to use for synthesis.
|
|
487
|
-
speed: Voice speed control.
|
|
488
|
-
emotion: List of emotion controls.
|
|
542
|
+
speed: Voice speed control for non-Sonic-3 models (literal values).
|
|
543
|
+
emotion: List of emotion controls for non-Sonic-3 models.
|
|
489
544
|
|
|
490
545
|
.. deprecated:: 0.0.68
|
|
491
546
|
The `emotion` parameter is deprecated and will be removed in a future version.
|
|
547
|
+
|
|
548
|
+
generation_config: Generation configuration for Sonic-3 models. Includes volume,
|
|
549
|
+
speed (numeric), and emotion (string) parameters.
|
|
492
550
|
"""
|
|
493
551
|
|
|
494
552
|
language: Optional[Language] = Language.EN
|
|
495
553
|
speed: Optional[Literal["slow", "normal", "fast"]] = None
|
|
496
554
|
emotion: Optional[List[str]] = Field(default_factory=list)
|
|
555
|
+
generation_config: Optional[GenerationConfig] = None
|
|
497
556
|
|
|
498
557
|
def __init__(
|
|
499
558
|
self,
|
|
500
559
|
*,
|
|
501
560
|
api_key: str,
|
|
502
561
|
voice_id: str,
|
|
503
|
-
model: str = "sonic-
|
|
562
|
+
model: str = "sonic-3",
|
|
504
563
|
base_url: str = "https://api.cartesia.ai",
|
|
505
564
|
cartesia_version: str = "2024-11-13",
|
|
506
565
|
sample_rate: Optional[int] = None,
|
|
@@ -514,7 +573,7 @@ class CartesiaHttpTTSService(TTSService):
|
|
|
514
573
|
Args:
|
|
515
574
|
api_key: Cartesia API key for authentication.
|
|
516
575
|
voice_id: ID of the voice to use for synthesis.
|
|
517
|
-
model: TTS model to use (e.g., "sonic-
|
|
576
|
+
model: TTS model to use (e.g., "sonic-3").
|
|
518
577
|
base_url: Base URL for Cartesia HTTP API.
|
|
519
578
|
cartesia_version: API version string for Cartesia service.
|
|
520
579
|
sample_rate: Audio sample rate. If None, uses default.
|
|
@@ -541,6 +600,7 @@ class CartesiaHttpTTSService(TTSService):
|
|
|
541
600
|
else "en",
|
|
542
601
|
"speed": params.speed,
|
|
543
602
|
"emotion": params.emotion,
|
|
603
|
+
"generation_config": params.generation_config,
|
|
544
604
|
}
|
|
545
605
|
self.set_voice(voice_id)
|
|
546
606
|
self.set_model_name(model)
|
|
@@ -634,6 +694,11 @@ class CartesiaHttpTTSService(TTSService):
|
|
|
634
694
|
if self._settings["speed"]:
|
|
635
695
|
payload["speed"] = self._settings["speed"]
|
|
636
696
|
|
|
697
|
+
if self._settings["generation_config"]:
|
|
698
|
+
payload["generation_config"] = self._settings["generation_config"].model_dump(
|
|
699
|
+
exclude_none=True
|
|
700
|
+
)
|
|
701
|
+
|
|
637
702
|
yield TTSStartedFrame()
|
|
638
703
|
|
|
639
704
|
session = await self._client._get_session()
|
|
File without changes
|
{dv_pipecat_ai-0.0.85.dev830.dist-info → dv_pipecat_ai-0.0.85.dev832.dist-info}/licenses/LICENSE
RENAMED
|
File without changes
|
{dv_pipecat_ai-0.0.85.dev830.dist-info → dv_pipecat_ai-0.0.85.dev832.dist-info}/top_level.txt
RENAMED
|
File without changes
|