dv-pipecat-ai 0.0.82.dev857__py3-none-any.whl → 0.0.85.dev837__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of dv-pipecat-ai might be problematic. Click here for more details.
- {dv_pipecat_ai-0.0.82.dev857.dist-info → dv_pipecat_ai-0.0.85.dev837.dist-info}/METADATA +98 -130
- {dv_pipecat_ai-0.0.82.dev857.dist-info → dv_pipecat_ai-0.0.85.dev837.dist-info}/RECORD +192 -140
- pipecat/adapters/base_llm_adapter.py +38 -1
- pipecat/adapters/services/anthropic_adapter.py +9 -14
- pipecat/adapters/services/aws_nova_sonic_adapter.py +120 -5
- pipecat/adapters/services/bedrock_adapter.py +236 -13
- pipecat/adapters/services/gemini_adapter.py +12 -8
- pipecat/adapters/services/open_ai_adapter.py +19 -7
- pipecat/adapters/services/open_ai_realtime_adapter.py +5 -0
- pipecat/audio/dtmf/dtmf-0.wav +0 -0
- pipecat/audio/dtmf/dtmf-1.wav +0 -0
- pipecat/audio/dtmf/dtmf-2.wav +0 -0
- pipecat/audio/dtmf/dtmf-3.wav +0 -0
- pipecat/audio/dtmf/dtmf-4.wav +0 -0
- pipecat/audio/dtmf/dtmf-5.wav +0 -0
- pipecat/audio/dtmf/dtmf-6.wav +0 -0
- pipecat/audio/dtmf/dtmf-7.wav +0 -0
- pipecat/audio/dtmf/dtmf-8.wav +0 -0
- pipecat/audio/dtmf/dtmf-9.wav +0 -0
- pipecat/audio/dtmf/dtmf-pound.wav +0 -0
- pipecat/audio/dtmf/dtmf-star.wav +0 -0
- pipecat/audio/filters/krisp_viva_filter.py +193 -0
- pipecat/audio/filters/noisereduce_filter.py +15 -0
- pipecat/audio/turn/base_turn_analyzer.py +9 -1
- pipecat/audio/turn/smart_turn/base_smart_turn.py +14 -8
- pipecat/audio/turn/smart_turn/data/__init__.py +0 -0
- pipecat/audio/turn/smart_turn/data/smart-turn-v3.0.onnx +0 -0
- pipecat/audio/turn/smart_turn/http_smart_turn.py +6 -2
- pipecat/audio/turn/smart_turn/local_smart_turn.py +1 -1
- pipecat/audio/turn/smart_turn/local_smart_turn_v2.py +1 -1
- pipecat/audio/turn/smart_turn/local_smart_turn_v3.py +124 -0
- pipecat/audio/vad/data/README.md +10 -0
- pipecat/audio/vad/data/silero_vad_v2.onnx +0 -0
- pipecat/audio/vad/silero.py +9 -3
- pipecat/audio/vad/vad_analyzer.py +13 -1
- pipecat/extensions/voicemail/voicemail_detector.py +5 -5
- pipecat/frames/frames.py +277 -86
- pipecat/observers/loggers/debug_log_observer.py +3 -3
- pipecat/observers/loggers/llm_log_observer.py +7 -3
- pipecat/observers/loggers/user_bot_latency_log_observer.py +22 -10
- pipecat/pipeline/runner.py +18 -6
- pipecat/pipeline/service_switcher.py +64 -36
- pipecat/pipeline/task.py +125 -79
- pipecat/pipeline/tts_switcher.py +30 -0
- pipecat/processors/aggregators/dtmf_aggregator.py +2 -3
- pipecat/processors/aggregators/{gated_openai_llm_context.py → gated_llm_context.py} +9 -9
- pipecat/processors/aggregators/gated_open_ai_llm_context.py +12 -0
- pipecat/processors/aggregators/llm_context.py +40 -2
- pipecat/processors/aggregators/llm_response.py +32 -15
- pipecat/processors/aggregators/llm_response_universal.py +19 -15
- pipecat/processors/aggregators/user_response.py +6 -6
- pipecat/processors/aggregators/vision_image_frame.py +24 -2
- pipecat/processors/audio/audio_buffer_processor.py +43 -8
- pipecat/processors/dtmf_aggregator.py +174 -77
- pipecat/processors/filters/stt_mute_filter.py +17 -0
- pipecat/processors/frame_processor.py +110 -24
- pipecat/processors/frameworks/langchain.py +8 -2
- pipecat/processors/frameworks/rtvi.py +210 -68
- pipecat/processors/frameworks/strands_agents.py +170 -0
- pipecat/processors/logger.py +2 -2
- pipecat/processors/transcript_processor.py +26 -5
- pipecat/processors/user_idle_processor.py +35 -11
- pipecat/runner/daily.py +59 -20
- pipecat/runner/run.py +395 -93
- pipecat/runner/types.py +6 -4
- pipecat/runner/utils.py +51 -10
- pipecat/serializers/__init__.py +5 -1
- pipecat/serializers/asterisk.py +16 -2
- pipecat/serializers/convox.py +41 -4
- pipecat/serializers/custom.py +257 -0
- pipecat/serializers/exotel.py +5 -5
- pipecat/serializers/livekit.py +20 -0
- pipecat/serializers/plivo.py +5 -5
- pipecat/serializers/protobuf.py +6 -5
- pipecat/serializers/telnyx.py +2 -2
- pipecat/serializers/twilio.py +43 -23
- pipecat/serializers/vi.py +324 -0
- pipecat/services/ai_service.py +2 -6
- pipecat/services/anthropic/llm.py +2 -25
- pipecat/services/assemblyai/models.py +6 -0
- pipecat/services/assemblyai/stt.py +13 -5
- pipecat/services/asyncai/tts.py +5 -3
- pipecat/services/aws/__init__.py +1 -0
- pipecat/services/aws/llm.py +147 -105
- pipecat/services/aws/nova_sonic/__init__.py +0 -0
- pipecat/services/aws/nova_sonic/context.py +436 -0
- pipecat/services/aws/nova_sonic/frames.py +25 -0
- pipecat/services/aws/nova_sonic/llm.py +1265 -0
- pipecat/services/aws/stt.py +3 -3
- pipecat/services/aws_nova_sonic/__init__.py +19 -1
- pipecat/services/aws_nova_sonic/aws.py +11 -1151
- pipecat/services/aws_nova_sonic/context.py +8 -354
- pipecat/services/aws_nova_sonic/frames.py +13 -17
- pipecat/services/azure/llm.py +51 -1
- pipecat/services/azure/realtime/__init__.py +0 -0
- pipecat/services/azure/realtime/llm.py +65 -0
- pipecat/services/azure/stt.py +15 -0
- pipecat/services/cartesia/stt.py +77 -70
- pipecat/services/cartesia/tts.py +80 -13
- pipecat/services/deepgram/__init__.py +1 -0
- pipecat/services/deepgram/flux/__init__.py +0 -0
- pipecat/services/deepgram/flux/stt.py +640 -0
- pipecat/services/elevenlabs/__init__.py +4 -1
- pipecat/services/elevenlabs/stt.py +339 -0
- pipecat/services/elevenlabs/tts.py +87 -46
- pipecat/services/fish/tts.py +5 -2
- pipecat/services/gemini_multimodal_live/events.py +38 -524
- pipecat/services/gemini_multimodal_live/file_api.py +23 -173
- pipecat/services/gemini_multimodal_live/gemini.py +41 -1403
- pipecat/services/gladia/stt.py +56 -72
- pipecat/services/google/__init__.py +1 -0
- pipecat/services/google/gemini_live/__init__.py +3 -0
- pipecat/services/google/gemini_live/file_api.py +189 -0
- pipecat/services/google/gemini_live/llm.py +1582 -0
- pipecat/services/google/gemini_live/llm_vertex.py +184 -0
- pipecat/services/google/llm.py +15 -11
- pipecat/services/google/llm_openai.py +3 -3
- pipecat/services/google/llm_vertex.py +86 -16
- pipecat/services/google/stt.py +4 -0
- pipecat/services/google/tts.py +7 -3
- pipecat/services/heygen/api.py +2 -0
- pipecat/services/heygen/client.py +8 -4
- pipecat/services/heygen/video.py +2 -0
- pipecat/services/hume/__init__.py +5 -0
- pipecat/services/hume/tts.py +220 -0
- pipecat/services/inworld/tts.py +6 -6
- pipecat/services/llm_service.py +15 -5
- pipecat/services/lmnt/tts.py +4 -2
- pipecat/services/mcp_service.py +4 -2
- pipecat/services/mem0/memory.py +6 -5
- pipecat/services/mistral/llm.py +29 -8
- pipecat/services/moondream/vision.py +42 -16
- pipecat/services/neuphonic/tts.py +5 -2
- pipecat/services/openai/__init__.py +1 -0
- pipecat/services/openai/base_llm.py +27 -20
- pipecat/services/openai/realtime/__init__.py +0 -0
- pipecat/services/openai/realtime/context.py +272 -0
- pipecat/services/openai/realtime/events.py +1106 -0
- pipecat/services/openai/realtime/frames.py +37 -0
- pipecat/services/openai/realtime/llm.py +829 -0
- pipecat/services/openai/tts.py +49 -10
- pipecat/services/openai_realtime/__init__.py +27 -0
- pipecat/services/openai_realtime/azure.py +21 -0
- pipecat/services/openai_realtime/context.py +21 -0
- pipecat/services/openai_realtime/events.py +21 -0
- pipecat/services/openai_realtime/frames.py +21 -0
- pipecat/services/openai_realtime_beta/azure.py +16 -0
- pipecat/services/openai_realtime_beta/openai.py +17 -5
- pipecat/services/piper/tts.py +7 -9
- pipecat/services/playht/tts.py +34 -4
- pipecat/services/rime/tts.py +12 -12
- pipecat/services/riva/stt.py +3 -1
- pipecat/services/salesforce/__init__.py +9 -0
- pipecat/services/salesforce/llm.py +700 -0
- pipecat/services/sarvam/__init__.py +7 -0
- pipecat/services/sarvam/stt.py +540 -0
- pipecat/services/sarvam/tts.py +97 -13
- pipecat/services/simli/video.py +2 -2
- pipecat/services/speechmatics/stt.py +22 -10
- pipecat/services/stt_service.py +47 -0
- pipecat/services/tavus/video.py +2 -2
- pipecat/services/tts_service.py +75 -22
- pipecat/services/vision_service.py +7 -6
- pipecat/services/vistaar/llm.py +51 -9
- pipecat/tests/utils.py +4 -4
- pipecat/transcriptions/language.py +41 -1
- pipecat/transports/base_input.py +13 -34
- pipecat/transports/base_output.py +140 -104
- pipecat/transports/daily/transport.py +199 -26
- pipecat/transports/heygen/__init__.py +0 -0
- pipecat/transports/heygen/transport.py +381 -0
- pipecat/transports/livekit/transport.py +228 -63
- pipecat/transports/local/audio.py +6 -1
- pipecat/transports/local/tk.py +11 -2
- pipecat/transports/network/fastapi_websocket.py +1 -1
- pipecat/transports/smallwebrtc/connection.py +103 -19
- pipecat/transports/smallwebrtc/request_handler.py +246 -0
- pipecat/transports/smallwebrtc/transport.py +65 -23
- pipecat/transports/tavus/transport.py +23 -12
- pipecat/transports/websocket/client.py +41 -5
- pipecat/transports/websocket/fastapi.py +21 -11
- pipecat/transports/websocket/server.py +14 -7
- pipecat/transports/whatsapp/api.py +8 -0
- pipecat/transports/whatsapp/client.py +47 -0
- pipecat/utils/base_object.py +54 -22
- pipecat/utils/redis.py +58 -0
- pipecat/utils/string.py +13 -1
- pipecat/utils/tracing/service_decorators.py +21 -21
- pipecat/serializers/genesys.py +0 -95
- pipecat/services/google/test-google-chirp.py +0 -45
- pipecat/services/openai.py +0 -698
- {dv_pipecat_ai-0.0.82.dev857.dist-info → dv_pipecat_ai-0.0.85.dev837.dist-info}/WHEEL +0 -0
- {dv_pipecat_ai-0.0.82.dev857.dist-info → dv_pipecat_ai-0.0.85.dev837.dist-info}/licenses/LICENSE +0 -0
- {dv_pipecat_ai-0.0.82.dev857.dist-info → dv_pipecat_ai-0.0.85.dev837.dist-info}/top_level.txt +0 -0
- /pipecat/services/{aws_nova_sonic → aws/nova_sonic}/ready.wav +0 -0
pipecat/services/cartesia/stt.py
CHANGED
|
@@ -28,13 +28,12 @@ from pipecat.frames.frames import (
|
|
|
28
28
|
UserStoppedSpeakingFrame,
|
|
29
29
|
)
|
|
30
30
|
from pipecat.processors.frame_processor import FrameDirection
|
|
31
|
-
from pipecat.services.stt_service import STTService
|
|
31
|
+
from pipecat.services.stt_service import WebsocketSTTService
|
|
32
32
|
from pipecat.transcriptions.language import Language
|
|
33
33
|
from pipecat.utils.time import time_now_iso8601
|
|
34
34
|
from pipecat.utils.tracing.service_decorators import traced_stt
|
|
35
35
|
|
|
36
36
|
try:
|
|
37
|
-
import websockets
|
|
38
37
|
from websockets.asyncio.client import connect as websocket_connect
|
|
39
38
|
from websockets.protocol import State
|
|
40
39
|
except ModuleNotFoundError as e:
|
|
@@ -124,7 +123,7 @@ class CartesiaLiveOptions:
|
|
|
124
123
|
return cls(**json.loads(json_str))
|
|
125
124
|
|
|
126
125
|
|
|
127
|
-
class CartesiaSTTService(STTService):
|
|
126
|
+
class CartesiaSTTService(WebsocketSTTService):
|
|
128
127
|
"""Speech-to-text service using Cartesia Live API.
|
|
129
128
|
|
|
130
129
|
Provides real-time speech transcription through WebSocket connection
|
|
@@ -176,8 +175,7 @@ class CartesiaSTTService(STTService):
|
|
|
176
175
|
self.set_model_name(merged_options.model)
|
|
177
176
|
self._api_key = api_key
|
|
178
177
|
self._base_url = base_url or "api.cartesia.ai"
|
|
179
|
-
self.
|
|
180
|
-
self._receiver_task = None
|
|
178
|
+
self._receive_task = None
|
|
181
179
|
|
|
182
180
|
def can_generate_metrics(self) -> bool:
|
|
183
181
|
"""Check if the service can generate processing metrics.
|
|
@@ -214,6 +212,27 @@ class CartesiaSTTService(STTService):
|
|
|
214
212
|
await super().cancel(frame)
|
|
215
213
|
await self._disconnect()
|
|
216
214
|
|
|
215
|
+
async def start_metrics(self):
|
|
216
|
+
"""Start performance metrics collection for transcription processing."""
|
|
217
|
+
await self.start_ttfb_metrics()
|
|
218
|
+
await self.start_processing_metrics()
|
|
219
|
+
|
|
220
|
+
async def process_frame(self, frame: Frame, direction: FrameDirection):
|
|
221
|
+
"""Process incoming frames and handle speech events.
|
|
222
|
+
|
|
223
|
+
Args:
|
|
224
|
+
frame: The frame to process.
|
|
225
|
+
direction: Direction of frame flow in the pipeline.
|
|
226
|
+
"""
|
|
227
|
+
await super().process_frame(frame, direction)
|
|
228
|
+
|
|
229
|
+
if isinstance(frame, UserStartedSpeakingFrame):
|
|
230
|
+
await self.start_metrics()
|
|
231
|
+
elif isinstance(frame, UserStoppedSpeakingFrame):
|
|
232
|
+
# Send finalize command to flush the transcription session
|
|
233
|
+
if self._websocket and self._websocket.state is State.OPEN:
|
|
234
|
+
await self._websocket.send("finalize")
|
|
235
|
+
|
|
217
236
|
async def run_stt(self, audio: bytes) -> AsyncGenerator[Frame, None]:
|
|
218
237
|
"""Process audio data for speech-to-text transcription.
|
|
219
238
|
|
|
@@ -224,45 +243,71 @@ class CartesiaSTTService(STTService):
|
|
|
224
243
|
None - transcription results are handled via WebSocket responses.
|
|
225
244
|
"""
|
|
226
245
|
# If the connection is closed, due to timeout, we need to reconnect when the user starts speaking again
|
|
227
|
-
if not self.
|
|
246
|
+
if not self._websocket or self._websocket.state is State.CLOSED:
|
|
228
247
|
await self._connect()
|
|
229
248
|
|
|
230
|
-
await self.
|
|
249
|
+
await self._websocket.send(audio)
|
|
231
250
|
yield None
|
|
232
251
|
|
|
233
252
|
async def _connect(self):
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
253
|
+
await self._connect_websocket()
|
|
254
|
+
|
|
255
|
+
if self._websocket and not self._receive_task:
|
|
256
|
+
self._receive_task = asyncio.create_task(self._receive_task_handler(self._report_error))
|
|
238
257
|
|
|
258
|
+
async def _disconnect(self):
|
|
259
|
+
if self._receive_task:
|
|
260
|
+
await self.cancel_task(self._receive_task)
|
|
261
|
+
self._receive_task = None
|
|
262
|
+
|
|
263
|
+
await self._disconnect_websocket()
|
|
264
|
+
|
|
265
|
+
async def _connect_websocket(self):
|
|
239
266
|
try:
|
|
240
|
-
self.
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
|
|
267
|
+
if self._websocket and self._websocket.state is State.OPEN:
|
|
268
|
+
return
|
|
269
|
+
logger.debug("Connecting to Cartesia STT")
|
|
270
|
+
|
|
271
|
+
params = self._settings.to_dict()
|
|
272
|
+
ws_url = f"wss://{self._base_url}/stt/websocket?{urllib.parse.urlencode(params)}"
|
|
273
|
+
headers = {"Cartesia-Version": "2025-04-16", "X-API-Key": self._api_key}
|
|
274
|
+
|
|
275
|
+
self._websocket = await websocket_connect(ws_url, additional_headers=headers)
|
|
276
|
+
await self._call_event_handler("on_connected")
|
|
245
277
|
except Exception as e:
|
|
246
278
|
logger.error(f"{self}: unable to connect to Cartesia: {e}")
|
|
247
279
|
|
|
248
|
-
async def
|
|
280
|
+
async def _disconnect_websocket(self):
|
|
249
281
|
try:
|
|
250
|
-
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
message = await self._connection.recv()
|
|
255
|
-
try:
|
|
256
|
-
data = json.loads(message)
|
|
257
|
-
await self._process_response(data)
|
|
258
|
-
except json.JSONDecodeError:
|
|
259
|
-
logger.warning(f"Received non-JSON message: {message}")
|
|
260
|
-
except asyncio.CancelledError:
|
|
261
|
-
pass
|
|
262
|
-
except websockets.exceptions.ConnectionClosed as e:
|
|
263
|
-
logger.debug(f"WebSocket connection closed: {e}")
|
|
282
|
+
if self._websocket and self._websocket.state is State.OPEN:
|
|
283
|
+
logger.debug("Disconnecting from Cartesia STT")
|
|
284
|
+
await self._websocket.close()
|
|
264
285
|
except Exception as e:
|
|
265
|
-
logger.error(f"
|
|
286
|
+
logger.error(f"{self} error closing websocket: {e}")
|
|
287
|
+
finally:
|
|
288
|
+
self._websocket = None
|
|
289
|
+
await self._call_event_handler("on_disconnected")
|
|
290
|
+
|
|
291
|
+
def _get_websocket(self):
|
|
292
|
+
if self._websocket:
|
|
293
|
+
return self._websocket
|
|
294
|
+
raise Exception("Websocket not connected")
|
|
295
|
+
|
|
296
|
+
async def _process_messages(self):
|
|
297
|
+
async for message in self._get_websocket():
|
|
298
|
+
try:
|
|
299
|
+
data = json.loads(message)
|
|
300
|
+
await self._process_response(data)
|
|
301
|
+
except json.JSONDecodeError:
|
|
302
|
+
logger.warning(f"Received non-JSON message: {message}")
|
|
303
|
+
|
|
304
|
+
async def _receive_messages(self):
|
|
305
|
+
while True:
|
|
306
|
+
await self._process_messages()
|
|
307
|
+
# Cartesia times out after 5 minutes of inactivity (no keepalive
|
|
308
|
+
# mechanism is available). So, we try to reconnect.
|
|
309
|
+
logger.debug(f"{self} Cartesia connection was disconnected (timeout?), reconnecting")
|
|
310
|
+
await self._connect_websocket()
|
|
266
311
|
|
|
267
312
|
async def _process_response(self, data):
|
|
268
313
|
if "type" in data:
|
|
@@ -316,41 +361,3 @@ class CartesiaSTTService(STTService):
|
|
|
316
361
|
language,
|
|
317
362
|
)
|
|
318
363
|
)
|
|
319
|
-
|
|
320
|
-
async def _disconnect(self):
|
|
321
|
-
if self._receiver_task:
|
|
322
|
-
self._receiver_task.cancel()
|
|
323
|
-
try:
|
|
324
|
-
await self._receiver_task
|
|
325
|
-
except asyncio.CancelledError:
|
|
326
|
-
pass
|
|
327
|
-
except Exception as e:
|
|
328
|
-
logger.exception(f"Unexpected exception while cancelling task: {e}")
|
|
329
|
-
self._receiver_task = None
|
|
330
|
-
|
|
331
|
-
if self._connection and self._connection.state is State.OPEN:
|
|
332
|
-
logger.debug("Disconnecting from Cartesia")
|
|
333
|
-
|
|
334
|
-
await self._connection.close()
|
|
335
|
-
self._connection = None
|
|
336
|
-
|
|
337
|
-
async def start_metrics(self):
|
|
338
|
-
"""Start performance metrics collection for transcription processing."""
|
|
339
|
-
await self.start_ttfb_metrics()
|
|
340
|
-
await self.start_processing_metrics()
|
|
341
|
-
|
|
342
|
-
async def process_frame(self, frame: Frame, direction: FrameDirection):
|
|
343
|
-
"""Process incoming frames and handle speech events.
|
|
344
|
-
|
|
345
|
-
Args:
|
|
346
|
-
frame: The frame to process.
|
|
347
|
-
direction: Direction of frame flow in the pipeline.
|
|
348
|
-
"""
|
|
349
|
-
await super().process_frame(frame, direction)
|
|
350
|
-
|
|
351
|
-
if isinstance(frame, UserStartedSpeakingFrame):
|
|
352
|
-
await self.start_metrics()
|
|
353
|
-
elif isinstance(frame, UserStoppedSpeakingFrame):
|
|
354
|
-
# Send finalize command to flush the transcription session
|
|
355
|
-
if self._connection and self._connection.state is State.OPEN:
|
|
356
|
-
await self._connection.send("finalize")
|
pipecat/services/cartesia/tts.py
CHANGED
|
@@ -15,14 +15,13 @@ from typing import AsyncGenerator, List, Literal, Optional, Union
|
|
|
15
15
|
from loguru import logger
|
|
16
16
|
from pydantic import BaseModel, Field
|
|
17
17
|
|
|
18
|
-
|
|
19
18
|
from pipecat.frames.frames import (
|
|
20
19
|
CancelFrame,
|
|
21
20
|
EndFrame,
|
|
22
21
|
ErrorFrame,
|
|
23
22
|
Frame,
|
|
23
|
+
InterruptionFrame,
|
|
24
24
|
StartFrame,
|
|
25
|
-
StartInterruptionFrame,
|
|
26
25
|
TTSAudioRawFrame,
|
|
27
26
|
TTSStartedFrame,
|
|
28
27
|
TTSStoppedFrame,
|
|
@@ -49,6 +48,26 @@ except ModuleNotFoundError as e:
|
|
|
49
48
|
raise Exception(f"Missing module: {e}")
|
|
50
49
|
|
|
51
50
|
|
|
51
|
+
class GenerationConfig(BaseModel):
|
|
52
|
+
"""Configuration for Cartesia Sonic-3 generation parameters.
|
|
53
|
+
|
|
54
|
+
Sonic-3 interprets these parameters as guidance to ensure natural speech.
|
|
55
|
+
Test against your content for best results.
|
|
56
|
+
|
|
57
|
+
Parameters:
|
|
58
|
+
volume: Volume multiplier for generated speech. Valid range: [0.5, 2.0]. Default is 1.0.
|
|
59
|
+
speed: Speed multiplier for generated speech. Valid range: [0.6, 1.5]. Default is 1.0.
|
|
60
|
+
emotion: Single emotion string to guide the emotional tone. Examples include neutral,
|
|
61
|
+
angry, excited, content, sad, scared. Over 60 emotions are supported. For best
|
|
62
|
+
results, use with recommended voices: Leo, Jace, Kyle, Gavin, Maya, Tessa, Dana,
|
|
63
|
+
and Marian.
|
|
64
|
+
"""
|
|
65
|
+
|
|
66
|
+
volume: Optional[float] = None
|
|
67
|
+
speed: Optional[float] = None
|
|
68
|
+
emotion: Optional[str] = None
|
|
69
|
+
|
|
70
|
+
|
|
52
71
|
def language_to_cartesia_language(language: Language) -> Optional[str]:
|
|
53
72
|
"""Convert a Language enum to Cartesia language code.
|
|
54
73
|
|
|
@@ -74,6 +93,33 @@ def language_to_cartesia_language(language: Language) -> Optional[str]:
|
|
|
74
93
|
Language.SV: "sv",
|
|
75
94
|
Language.TR: "tr",
|
|
76
95
|
Language.ZH: "zh",
|
|
96
|
+
Language.TL: "tl",
|
|
97
|
+
Language.BG: "bg",
|
|
98
|
+
Language.RO: "ro",
|
|
99
|
+
Language.AR: "ar",
|
|
100
|
+
Language.CS: "cs",
|
|
101
|
+
Language.EL: "el",
|
|
102
|
+
Language.FI: "fi",
|
|
103
|
+
Language.HR: "hr",
|
|
104
|
+
Language.MS: "ms",
|
|
105
|
+
Language.SK: "sk",
|
|
106
|
+
Language.DA: "da",
|
|
107
|
+
Language.TA: "ta",
|
|
108
|
+
Language.UK: "uk",
|
|
109
|
+
Language.HU: "hu",
|
|
110
|
+
Language.NO: "no",
|
|
111
|
+
Language.VI: "vi",
|
|
112
|
+
Language.BN: "bn",
|
|
113
|
+
Language.TH: "th",
|
|
114
|
+
Language.HE: "he",
|
|
115
|
+
Language.KA: "ka",
|
|
116
|
+
Language.ID: "id",
|
|
117
|
+
Language.TE: "te",
|
|
118
|
+
Language.GU: "gu",
|
|
119
|
+
Language.KN: "kn",
|
|
120
|
+
Language.ML: "ml",
|
|
121
|
+
Language.MR: "mr",
|
|
122
|
+
Language.PA: "pa",
|
|
77
123
|
}
|
|
78
124
|
|
|
79
125
|
result = BASE_LANGUAGES.get(language)
|
|
@@ -102,16 +148,20 @@ class CartesiaTTSService(AudioContextWordTTSService):
|
|
|
102
148
|
|
|
103
149
|
Parameters:
|
|
104
150
|
language: Language to use for synthesis.
|
|
105
|
-
speed: Voice speed control.
|
|
106
|
-
emotion: List of emotion controls.
|
|
151
|
+
speed: Voice speed control for non-Sonic-3 models (literal values).
|
|
152
|
+
emotion: List of emotion controls for non-Sonic-3 models.
|
|
107
153
|
|
|
108
154
|
.. deprecated:: 0.0.68
|
|
109
155
|
The `emotion` parameter is deprecated and will be removed in a future version.
|
|
156
|
+
|
|
157
|
+
generation_config: Generation configuration for Sonic-3 models. Includes volume,
|
|
158
|
+
speed (numeric), and emotion (string) parameters.
|
|
110
159
|
"""
|
|
111
160
|
|
|
112
161
|
language: Optional[Language] = Language.EN
|
|
113
162
|
speed: Optional[Literal["slow", "normal", "fast"]] = None
|
|
114
163
|
emotion: Optional[List[str]] = []
|
|
164
|
+
generation_config: Optional[GenerationConfig] = None
|
|
115
165
|
|
|
116
166
|
def __init__(
|
|
117
167
|
self,
|
|
@@ -120,7 +170,7 @@ class CartesiaTTSService(AudioContextWordTTSService):
|
|
|
120
170
|
voice_id: str,
|
|
121
171
|
cartesia_version: str = "2025-04-16",
|
|
122
172
|
url: str = "wss://api.cartesia.ai/tts/websocket",
|
|
123
|
-
model: str = "sonic-
|
|
173
|
+
model: str = "sonic-3",
|
|
124
174
|
sample_rate: Optional[int] = None,
|
|
125
175
|
encoding: str = "pcm_s16le",
|
|
126
176
|
container: str = "raw",
|
|
@@ -136,7 +186,7 @@ class CartesiaTTSService(AudioContextWordTTSService):
|
|
|
136
186
|
voice_id: ID of the voice to use for synthesis.
|
|
137
187
|
cartesia_version: API version string for Cartesia service.
|
|
138
188
|
url: WebSocket URL for Cartesia TTS API.
|
|
139
|
-
model: TTS model to use (e.g., "sonic-
|
|
189
|
+
model: TTS model to use (e.g., "sonic-3").
|
|
140
190
|
sample_rate: Audio sample rate. If None, uses default.
|
|
141
191
|
encoding: Audio encoding format.
|
|
142
192
|
container: Audio container format.
|
|
@@ -180,6 +230,7 @@ class CartesiaTTSService(AudioContextWordTTSService):
|
|
|
180
230
|
else "en",
|
|
181
231
|
"speed": params.speed,
|
|
182
232
|
"emotion": params.emotion,
|
|
233
|
+
"generation_config": params.generation_config,
|
|
183
234
|
}
|
|
184
235
|
self.set_model_name(model)
|
|
185
236
|
self.set_voice(voice_id)
|
|
@@ -298,6 +349,11 @@ class CartesiaTTSService(AudioContextWordTTSService):
|
|
|
298
349
|
if self._settings["speed"]:
|
|
299
350
|
msg["speed"] = self._settings["speed"]
|
|
300
351
|
|
|
352
|
+
if self._settings["generation_config"]:
|
|
353
|
+
msg["generation_config"] = self._settings["generation_config"].model_dump(
|
|
354
|
+
exclude_none=True
|
|
355
|
+
)
|
|
356
|
+
|
|
301
357
|
return json.dumps(msg)
|
|
302
358
|
|
|
303
359
|
async def start(self, frame: StartFrame):
|
|
@@ -345,10 +401,11 @@ class CartesiaTTSService(AudioContextWordTTSService):
|
|
|
345
401
|
try:
|
|
346
402
|
if self._websocket and self._websocket.state is State.OPEN:
|
|
347
403
|
return
|
|
348
|
-
logger.debug("Connecting to Cartesia")
|
|
404
|
+
logger.debug("Connecting to Cartesia TTS")
|
|
349
405
|
self._websocket = await websocket_connect(
|
|
350
406
|
f"{self._url}?api_key={self._api_key}&cartesia_version={self._cartesia_version}"
|
|
351
407
|
)
|
|
408
|
+
await self._call_event_handler("on_connected")
|
|
352
409
|
except Exception as e:
|
|
353
410
|
logger.error(f"{self} initialization error: {e}")
|
|
354
411
|
self._websocket = None
|
|
@@ -366,13 +423,14 @@ class CartesiaTTSService(AudioContextWordTTSService):
|
|
|
366
423
|
finally:
|
|
367
424
|
self._context_id = None
|
|
368
425
|
self._websocket = None
|
|
426
|
+
await self._call_event_handler("on_disconnected")
|
|
369
427
|
|
|
370
428
|
def _get_websocket(self):
|
|
371
429
|
if self._websocket:
|
|
372
430
|
return self._websocket
|
|
373
431
|
raise Exception("Websocket not connected")
|
|
374
432
|
|
|
375
|
-
async def _handle_interruption(self, frame: StartInterruptionFrame, direction: FrameDirection):
|
|
433
|
+
async def _handle_interruption(self, frame: InterruptionFrame, direction: FrameDirection):
|
|
376
434
|
await super()._handle_interruption(frame, direction)
|
|
377
435
|
await self.stop_all_metrics()
|
|
378
436
|
if self._context_id:
|
|
@@ -417,7 +475,6 @@ class CartesiaTTSService(AudioContextWordTTSService):
|
|
|
417
475
|
logger.error(f"{self} error: {msg}")
|
|
418
476
|
await self.push_frame(TTSStoppedFrame())
|
|
419
477
|
await self.stop_all_metrics()
|
|
420
|
-
|
|
421
478
|
await self.push_error(ErrorFrame(f"{self} error: {msg['error']}"))
|
|
422
479
|
self._context_id = None
|
|
423
480
|
else:
|
|
@@ -482,23 +539,27 @@ class CartesiaHttpTTSService(TTSService):
|
|
|
482
539
|
|
|
483
540
|
Parameters:
|
|
484
541
|
language: Language to use for synthesis.
|
|
485
|
-
speed: Voice speed control.
|
|
486
|
-
emotion: List of emotion controls.
|
|
542
|
+
speed: Voice speed control for non-Sonic-3 models (literal values).
|
|
543
|
+
emotion: List of emotion controls for non-Sonic-3 models.
|
|
487
544
|
|
|
488
545
|
.. deprecated:: 0.0.68
|
|
489
546
|
The `emotion` parameter is deprecated and will be removed in a future version.
|
|
547
|
+
|
|
548
|
+
generation_config: Generation configuration for Sonic-3 models. Includes volume,
|
|
549
|
+
speed (numeric), and emotion (string) parameters.
|
|
490
550
|
"""
|
|
491
551
|
|
|
492
552
|
language: Optional[Language] = Language.EN
|
|
493
553
|
speed: Optional[Literal["slow", "normal", "fast"]] = None
|
|
494
554
|
emotion: Optional[List[str]] = Field(default_factory=list)
|
|
555
|
+
generation_config: Optional[GenerationConfig] = None
|
|
495
556
|
|
|
496
557
|
def __init__(
|
|
497
558
|
self,
|
|
498
559
|
*,
|
|
499
560
|
api_key: str,
|
|
500
561
|
voice_id: str,
|
|
501
|
-
model: str = "sonic-
|
|
562
|
+
model: str = "sonic-3",
|
|
502
563
|
base_url: str = "https://api.cartesia.ai",
|
|
503
564
|
cartesia_version: str = "2024-11-13",
|
|
504
565
|
sample_rate: Optional[int] = None,
|
|
@@ -512,7 +573,7 @@ class CartesiaHttpTTSService(TTSService):
|
|
|
512
573
|
Args:
|
|
513
574
|
api_key: Cartesia API key for authentication.
|
|
514
575
|
voice_id: ID of the voice to use for synthesis.
|
|
515
|
-
model: TTS model to use (e.g., "sonic-
|
|
576
|
+
model: TTS model to use (e.g., "sonic-3").
|
|
516
577
|
base_url: Base URL for Cartesia HTTP API.
|
|
517
578
|
cartesia_version: API version string for Cartesia service.
|
|
518
579
|
sample_rate: Audio sample rate. If None, uses default.
|
|
@@ -539,6 +600,7 @@ class CartesiaHttpTTSService(TTSService):
|
|
|
539
600
|
else "en",
|
|
540
601
|
"speed": params.speed,
|
|
541
602
|
"emotion": params.emotion,
|
|
603
|
+
"generation_config": params.generation_config,
|
|
542
604
|
}
|
|
543
605
|
self.set_voice(voice_id)
|
|
544
606
|
self.set_model_name(model)
|
|
@@ -632,6 +694,11 @@ class CartesiaHttpTTSService(TTSService):
|
|
|
632
694
|
if self._settings["speed"]:
|
|
633
695
|
payload["speed"] = self._settings["speed"]
|
|
634
696
|
|
|
697
|
+
if self._settings["generation_config"]:
|
|
698
|
+
payload["generation_config"] = self._settings["generation_config"].model_dump(
|
|
699
|
+
exclude_none=True
|
|
700
|
+
)
|
|
701
|
+
|
|
635
702
|
yield TTSStartedFrame()
|
|
636
703
|
|
|
637
704
|
session = await self._client._get_session()
|
|
File without changes
|