dv-pipecat-ai 0.0.82.dev857__py3-none-any.whl → 0.0.85.dev837__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of dv-pipecat-ai might be problematic. Click here for more details.

Files changed (195) hide show
  1. {dv_pipecat_ai-0.0.82.dev857.dist-info → dv_pipecat_ai-0.0.85.dev837.dist-info}/METADATA +98 -130
  2. {dv_pipecat_ai-0.0.82.dev857.dist-info → dv_pipecat_ai-0.0.85.dev837.dist-info}/RECORD +192 -140
  3. pipecat/adapters/base_llm_adapter.py +38 -1
  4. pipecat/adapters/services/anthropic_adapter.py +9 -14
  5. pipecat/adapters/services/aws_nova_sonic_adapter.py +120 -5
  6. pipecat/adapters/services/bedrock_adapter.py +236 -13
  7. pipecat/adapters/services/gemini_adapter.py +12 -8
  8. pipecat/adapters/services/open_ai_adapter.py +19 -7
  9. pipecat/adapters/services/open_ai_realtime_adapter.py +5 -0
  10. pipecat/audio/dtmf/dtmf-0.wav +0 -0
  11. pipecat/audio/dtmf/dtmf-1.wav +0 -0
  12. pipecat/audio/dtmf/dtmf-2.wav +0 -0
  13. pipecat/audio/dtmf/dtmf-3.wav +0 -0
  14. pipecat/audio/dtmf/dtmf-4.wav +0 -0
  15. pipecat/audio/dtmf/dtmf-5.wav +0 -0
  16. pipecat/audio/dtmf/dtmf-6.wav +0 -0
  17. pipecat/audio/dtmf/dtmf-7.wav +0 -0
  18. pipecat/audio/dtmf/dtmf-8.wav +0 -0
  19. pipecat/audio/dtmf/dtmf-9.wav +0 -0
  20. pipecat/audio/dtmf/dtmf-pound.wav +0 -0
  21. pipecat/audio/dtmf/dtmf-star.wav +0 -0
  22. pipecat/audio/filters/krisp_viva_filter.py +193 -0
  23. pipecat/audio/filters/noisereduce_filter.py +15 -0
  24. pipecat/audio/turn/base_turn_analyzer.py +9 -1
  25. pipecat/audio/turn/smart_turn/base_smart_turn.py +14 -8
  26. pipecat/audio/turn/smart_turn/data/__init__.py +0 -0
  27. pipecat/audio/turn/smart_turn/data/smart-turn-v3.0.onnx +0 -0
  28. pipecat/audio/turn/smart_turn/http_smart_turn.py +6 -2
  29. pipecat/audio/turn/smart_turn/local_smart_turn.py +1 -1
  30. pipecat/audio/turn/smart_turn/local_smart_turn_v2.py +1 -1
  31. pipecat/audio/turn/smart_turn/local_smart_turn_v3.py +124 -0
  32. pipecat/audio/vad/data/README.md +10 -0
  33. pipecat/audio/vad/data/silero_vad_v2.onnx +0 -0
  34. pipecat/audio/vad/silero.py +9 -3
  35. pipecat/audio/vad/vad_analyzer.py +13 -1
  36. pipecat/extensions/voicemail/voicemail_detector.py +5 -5
  37. pipecat/frames/frames.py +277 -86
  38. pipecat/observers/loggers/debug_log_observer.py +3 -3
  39. pipecat/observers/loggers/llm_log_observer.py +7 -3
  40. pipecat/observers/loggers/user_bot_latency_log_observer.py +22 -10
  41. pipecat/pipeline/runner.py +18 -6
  42. pipecat/pipeline/service_switcher.py +64 -36
  43. pipecat/pipeline/task.py +125 -79
  44. pipecat/pipeline/tts_switcher.py +30 -0
  45. pipecat/processors/aggregators/dtmf_aggregator.py +2 -3
  46. pipecat/processors/aggregators/{gated_openai_llm_context.py → gated_llm_context.py} +9 -9
  47. pipecat/processors/aggregators/gated_open_ai_llm_context.py +12 -0
  48. pipecat/processors/aggregators/llm_context.py +40 -2
  49. pipecat/processors/aggregators/llm_response.py +32 -15
  50. pipecat/processors/aggregators/llm_response_universal.py +19 -15
  51. pipecat/processors/aggregators/user_response.py +6 -6
  52. pipecat/processors/aggregators/vision_image_frame.py +24 -2
  53. pipecat/processors/audio/audio_buffer_processor.py +43 -8
  54. pipecat/processors/dtmf_aggregator.py +174 -77
  55. pipecat/processors/filters/stt_mute_filter.py +17 -0
  56. pipecat/processors/frame_processor.py +110 -24
  57. pipecat/processors/frameworks/langchain.py +8 -2
  58. pipecat/processors/frameworks/rtvi.py +210 -68
  59. pipecat/processors/frameworks/strands_agents.py +170 -0
  60. pipecat/processors/logger.py +2 -2
  61. pipecat/processors/transcript_processor.py +26 -5
  62. pipecat/processors/user_idle_processor.py +35 -11
  63. pipecat/runner/daily.py +59 -20
  64. pipecat/runner/run.py +395 -93
  65. pipecat/runner/types.py +6 -4
  66. pipecat/runner/utils.py +51 -10
  67. pipecat/serializers/__init__.py +5 -1
  68. pipecat/serializers/asterisk.py +16 -2
  69. pipecat/serializers/convox.py +41 -4
  70. pipecat/serializers/custom.py +257 -0
  71. pipecat/serializers/exotel.py +5 -5
  72. pipecat/serializers/livekit.py +20 -0
  73. pipecat/serializers/plivo.py +5 -5
  74. pipecat/serializers/protobuf.py +6 -5
  75. pipecat/serializers/telnyx.py +2 -2
  76. pipecat/serializers/twilio.py +43 -23
  77. pipecat/serializers/vi.py +324 -0
  78. pipecat/services/ai_service.py +2 -6
  79. pipecat/services/anthropic/llm.py +2 -25
  80. pipecat/services/assemblyai/models.py +6 -0
  81. pipecat/services/assemblyai/stt.py +13 -5
  82. pipecat/services/asyncai/tts.py +5 -3
  83. pipecat/services/aws/__init__.py +1 -0
  84. pipecat/services/aws/llm.py +147 -105
  85. pipecat/services/aws/nova_sonic/__init__.py +0 -0
  86. pipecat/services/aws/nova_sonic/context.py +436 -0
  87. pipecat/services/aws/nova_sonic/frames.py +25 -0
  88. pipecat/services/aws/nova_sonic/llm.py +1265 -0
  89. pipecat/services/aws/stt.py +3 -3
  90. pipecat/services/aws_nova_sonic/__init__.py +19 -1
  91. pipecat/services/aws_nova_sonic/aws.py +11 -1151
  92. pipecat/services/aws_nova_sonic/context.py +8 -354
  93. pipecat/services/aws_nova_sonic/frames.py +13 -17
  94. pipecat/services/azure/llm.py +51 -1
  95. pipecat/services/azure/realtime/__init__.py +0 -0
  96. pipecat/services/azure/realtime/llm.py +65 -0
  97. pipecat/services/azure/stt.py +15 -0
  98. pipecat/services/cartesia/stt.py +77 -70
  99. pipecat/services/cartesia/tts.py +80 -13
  100. pipecat/services/deepgram/__init__.py +1 -0
  101. pipecat/services/deepgram/flux/__init__.py +0 -0
  102. pipecat/services/deepgram/flux/stt.py +640 -0
  103. pipecat/services/elevenlabs/__init__.py +4 -1
  104. pipecat/services/elevenlabs/stt.py +339 -0
  105. pipecat/services/elevenlabs/tts.py +87 -46
  106. pipecat/services/fish/tts.py +5 -2
  107. pipecat/services/gemini_multimodal_live/events.py +38 -524
  108. pipecat/services/gemini_multimodal_live/file_api.py +23 -173
  109. pipecat/services/gemini_multimodal_live/gemini.py +41 -1403
  110. pipecat/services/gladia/stt.py +56 -72
  111. pipecat/services/google/__init__.py +1 -0
  112. pipecat/services/google/gemini_live/__init__.py +3 -0
  113. pipecat/services/google/gemini_live/file_api.py +189 -0
  114. pipecat/services/google/gemini_live/llm.py +1582 -0
  115. pipecat/services/google/gemini_live/llm_vertex.py +184 -0
  116. pipecat/services/google/llm.py +15 -11
  117. pipecat/services/google/llm_openai.py +3 -3
  118. pipecat/services/google/llm_vertex.py +86 -16
  119. pipecat/services/google/stt.py +4 -0
  120. pipecat/services/google/tts.py +7 -3
  121. pipecat/services/heygen/api.py +2 -0
  122. pipecat/services/heygen/client.py +8 -4
  123. pipecat/services/heygen/video.py +2 -0
  124. pipecat/services/hume/__init__.py +5 -0
  125. pipecat/services/hume/tts.py +220 -0
  126. pipecat/services/inworld/tts.py +6 -6
  127. pipecat/services/llm_service.py +15 -5
  128. pipecat/services/lmnt/tts.py +4 -2
  129. pipecat/services/mcp_service.py +4 -2
  130. pipecat/services/mem0/memory.py +6 -5
  131. pipecat/services/mistral/llm.py +29 -8
  132. pipecat/services/moondream/vision.py +42 -16
  133. pipecat/services/neuphonic/tts.py +5 -2
  134. pipecat/services/openai/__init__.py +1 -0
  135. pipecat/services/openai/base_llm.py +27 -20
  136. pipecat/services/openai/realtime/__init__.py +0 -0
  137. pipecat/services/openai/realtime/context.py +272 -0
  138. pipecat/services/openai/realtime/events.py +1106 -0
  139. pipecat/services/openai/realtime/frames.py +37 -0
  140. pipecat/services/openai/realtime/llm.py +829 -0
  141. pipecat/services/openai/tts.py +49 -10
  142. pipecat/services/openai_realtime/__init__.py +27 -0
  143. pipecat/services/openai_realtime/azure.py +21 -0
  144. pipecat/services/openai_realtime/context.py +21 -0
  145. pipecat/services/openai_realtime/events.py +21 -0
  146. pipecat/services/openai_realtime/frames.py +21 -0
  147. pipecat/services/openai_realtime_beta/azure.py +16 -0
  148. pipecat/services/openai_realtime_beta/openai.py +17 -5
  149. pipecat/services/piper/tts.py +7 -9
  150. pipecat/services/playht/tts.py +34 -4
  151. pipecat/services/rime/tts.py +12 -12
  152. pipecat/services/riva/stt.py +3 -1
  153. pipecat/services/salesforce/__init__.py +9 -0
  154. pipecat/services/salesforce/llm.py +700 -0
  155. pipecat/services/sarvam/__init__.py +7 -0
  156. pipecat/services/sarvam/stt.py +540 -0
  157. pipecat/services/sarvam/tts.py +97 -13
  158. pipecat/services/simli/video.py +2 -2
  159. pipecat/services/speechmatics/stt.py +22 -10
  160. pipecat/services/stt_service.py +47 -0
  161. pipecat/services/tavus/video.py +2 -2
  162. pipecat/services/tts_service.py +75 -22
  163. pipecat/services/vision_service.py +7 -6
  164. pipecat/services/vistaar/llm.py +51 -9
  165. pipecat/tests/utils.py +4 -4
  166. pipecat/transcriptions/language.py +41 -1
  167. pipecat/transports/base_input.py +13 -34
  168. pipecat/transports/base_output.py +140 -104
  169. pipecat/transports/daily/transport.py +199 -26
  170. pipecat/transports/heygen/__init__.py +0 -0
  171. pipecat/transports/heygen/transport.py +381 -0
  172. pipecat/transports/livekit/transport.py +228 -63
  173. pipecat/transports/local/audio.py +6 -1
  174. pipecat/transports/local/tk.py +11 -2
  175. pipecat/transports/network/fastapi_websocket.py +1 -1
  176. pipecat/transports/smallwebrtc/connection.py +103 -19
  177. pipecat/transports/smallwebrtc/request_handler.py +246 -0
  178. pipecat/transports/smallwebrtc/transport.py +65 -23
  179. pipecat/transports/tavus/transport.py +23 -12
  180. pipecat/transports/websocket/client.py +41 -5
  181. pipecat/transports/websocket/fastapi.py +21 -11
  182. pipecat/transports/websocket/server.py +14 -7
  183. pipecat/transports/whatsapp/api.py +8 -0
  184. pipecat/transports/whatsapp/client.py +47 -0
  185. pipecat/utils/base_object.py +54 -22
  186. pipecat/utils/redis.py +58 -0
  187. pipecat/utils/string.py +13 -1
  188. pipecat/utils/tracing/service_decorators.py +21 -21
  189. pipecat/serializers/genesys.py +0 -95
  190. pipecat/services/google/test-google-chirp.py +0 -45
  191. pipecat/services/openai.py +0 -698
  192. {dv_pipecat_ai-0.0.82.dev857.dist-info → dv_pipecat_ai-0.0.85.dev837.dist-info}/WHEEL +0 -0
  193. {dv_pipecat_ai-0.0.82.dev857.dist-info → dv_pipecat_ai-0.0.85.dev837.dist-info}/licenses/LICENSE +0 -0
  194. {dv_pipecat_ai-0.0.82.dev857.dist-info → dv_pipecat_ai-0.0.85.dev837.dist-info}/top_level.txt +0 -0
  195. /pipecat/services/{aws_nova_sonic → aws/nova_sonic}/ready.wav +0 -0
@@ -28,13 +28,12 @@ from pipecat.frames.frames import (
28
28
  UserStoppedSpeakingFrame,
29
29
  )
30
30
  from pipecat.processors.frame_processor import FrameDirection
31
- from pipecat.services.stt_service import STTService
31
+ from pipecat.services.stt_service import WebsocketSTTService
32
32
  from pipecat.transcriptions.language import Language
33
33
  from pipecat.utils.time import time_now_iso8601
34
34
  from pipecat.utils.tracing.service_decorators import traced_stt
35
35
 
36
36
  try:
37
- import websockets
38
37
  from websockets.asyncio.client import connect as websocket_connect
39
38
  from websockets.protocol import State
40
39
  except ModuleNotFoundError as e:
@@ -124,7 +123,7 @@ class CartesiaLiveOptions:
124
123
  return cls(**json.loads(json_str))
125
124
 
126
125
 
127
- class CartesiaSTTService(STTService):
126
+ class CartesiaSTTService(WebsocketSTTService):
128
127
  """Speech-to-text service using Cartesia Live API.
129
128
 
130
129
  Provides real-time speech transcription through WebSocket connection
@@ -176,8 +175,7 @@ class CartesiaSTTService(STTService):
176
175
  self.set_model_name(merged_options.model)
177
176
  self._api_key = api_key
178
177
  self._base_url = base_url or "api.cartesia.ai"
179
- self._connection = None
180
- self._receiver_task = None
178
+ self._receive_task = None
181
179
 
182
180
  def can_generate_metrics(self) -> bool:
183
181
  """Check if the service can generate processing metrics.
@@ -214,6 +212,27 @@ class CartesiaSTTService(STTService):
214
212
  await super().cancel(frame)
215
213
  await self._disconnect()
216
214
 
215
+ async def start_metrics(self):
216
+ """Start performance metrics collection for transcription processing."""
217
+ await self.start_ttfb_metrics()
218
+ await self.start_processing_metrics()
219
+
220
+ async def process_frame(self, frame: Frame, direction: FrameDirection):
221
+ """Process incoming frames and handle speech events.
222
+
223
+ Args:
224
+ frame: The frame to process.
225
+ direction: Direction of frame flow in the pipeline.
226
+ """
227
+ await super().process_frame(frame, direction)
228
+
229
+ if isinstance(frame, UserStartedSpeakingFrame):
230
+ await self.start_metrics()
231
+ elif isinstance(frame, UserStoppedSpeakingFrame):
232
+ # Send finalize command to flush the transcription session
233
+ if self._websocket and self._websocket.state is State.OPEN:
234
+ await self._websocket.send("finalize")
235
+
217
236
  async def run_stt(self, audio: bytes) -> AsyncGenerator[Frame, None]:
218
237
  """Process audio data for speech-to-text transcription.
219
238
 
@@ -224,45 +243,71 @@ class CartesiaSTTService(STTService):
224
243
  None - transcription results are handled via WebSocket responses.
225
244
  """
226
245
  # If the connection is closed, due to timeout, we need to reconnect when the user starts speaking again
227
- if not self._connection or self._connection.state is State.CLOSED:
246
+ if not self._websocket or self._websocket.state is State.CLOSED:
228
247
  await self._connect()
229
248
 
230
- await self._connection.send(audio)
249
+ await self._websocket.send(audio)
231
250
  yield None
232
251
 
233
252
  async def _connect(self):
234
- params = self._settings.to_dict()
235
- ws_url = f"wss://{self._base_url}/stt/websocket?{urllib.parse.urlencode(params)}"
236
- logger.debug(f"Connecting to Cartesia: {ws_url}")
237
- headers = {"Cartesia-Version": "2025-04-16", "X-API-Key": self._api_key}
253
+ await self._connect_websocket()
254
+
255
+ if self._websocket and not self._receive_task:
256
+ self._receive_task = asyncio.create_task(self._receive_task_handler(self._report_error))
238
257
 
258
+ async def _disconnect(self):
259
+ if self._receive_task:
260
+ await self.cancel_task(self._receive_task)
261
+ self._receive_task = None
262
+
263
+ await self._disconnect_websocket()
264
+
265
+ async def _connect_websocket(self):
239
266
  try:
240
- self._connection = await websocket_connect(ws_url, additional_headers=headers)
241
- # Setup the receiver task to handle the incoming messages from the Cartesia server
242
- if self._receiver_task is None or self._receiver_task.done():
243
- self._receiver_task = asyncio.create_task(self._receive_messages())
244
- logger.debug(f"Connected to Cartesia")
267
+ if self._websocket and self._websocket.state is State.OPEN:
268
+ return
269
+ logger.debug("Connecting to Cartesia STT")
270
+
271
+ params = self._settings.to_dict()
272
+ ws_url = f"wss://{self._base_url}/stt/websocket?{urllib.parse.urlencode(params)}"
273
+ headers = {"Cartesia-Version": "2025-04-16", "X-API-Key": self._api_key}
274
+
275
+ self._websocket = await websocket_connect(ws_url, additional_headers=headers)
276
+ await self._call_event_handler("on_connected")
245
277
  except Exception as e:
246
278
  logger.error(f"{self}: unable to connect to Cartesia: {e}")
247
279
 
248
- async def _receive_messages(self):
280
+ async def _disconnect_websocket(self):
249
281
  try:
250
- while True:
251
- if not self._connection or self._connection.state is State.CLOSED:
252
- break
253
-
254
- message = await self._connection.recv()
255
- try:
256
- data = json.loads(message)
257
- await self._process_response(data)
258
- except json.JSONDecodeError:
259
- logger.warning(f"Received non-JSON message: {message}")
260
- except asyncio.CancelledError:
261
- pass
262
- except websockets.exceptions.ConnectionClosed as e:
263
- logger.debug(f"WebSocket connection closed: {e}")
282
+ if self._websocket and self._websocket.state is State.OPEN:
283
+ logger.debug("Disconnecting from Cartesia STT")
284
+ await self._websocket.close()
264
285
  except Exception as e:
265
- logger.error(f"Error in message receiver: {e}")
286
+ logger.error(f"{self} error closing websocket: {e}")
287
+ finally:
288
+ self._websocket = None
289
+ await self._call_event_handler("on_disconnected")
290
+
291
+ def _get_websocket(self):
292
+ if self._websocket:
293
+ return self._websocket
294
+ raise Exception("Websocket not connected")
295
+
296
+ async def _process_messages(self):
297
+ async for message in self._get_websocket():
298
+ try:
299
+ data = json.loads(message)
300
+ await self._process_response(data)
301
+ except json.JSONDecodeError:
302
+ logger.warning(f"Received non-JSON message: {message}")
303
+
304
+ async def _receive_messages(self):
305
+ while True:
306
+ await self._process_messages()
307
+ # Cartesia times out after 5 minutes of inactivity (no keepalive
308
+ # mechanism is available). So, we try to reconnect.
309
+ logger.debug(f"{self} Cartesia connection was disconnected (timeout?), reconnecting")
310
+ await self._connect_websocket()
266
311
 
267
312
  async def _process_response(self, data):
268
313
  if "type" in data:
@@ -316,41 +361,3 @@ class CartesiaSTTService(STTService):
316
361
  language,
317
362
  )
318
363
  )
319
-
320
- async def _disconnect(self):
321
- if self._receiver_task:
322
- self._receiver_task.cancel()
323
- try:
324
- await self._receiver_task
325
- except asyncio.CancelledError:
326
- pass
327
- except Exception as e:
328
- logger.exception(f"Unexpected exception while cancelling task: {e}")
329
- self._receiver_task = None
330
-
331
- if self._connection and self._connection.state is State.OPEN:
332
- logger.debug("Disconnecting from Cartesia")
333
-
334
- await self._connection.close()
335
- self._connection = None
336
-
337
- async def start_metrics(self):
338
- """Start performance metrics collection for transcription processing."""
339
- await self.start_ttfb_metrics()
340
- await self.start_processing_metrics()
341
-
342
- async def process_frame(self, frame: Frame, direction: FrameDirection):
343
- """Process incoming frames and handle speech events.
344
-
345
- Args:
346
- frame: The frame to process.
347
- direction: Direction of frame flow in the pipeline.
348
- """
349
- await super().process_frame(frame, direction)
350
-
351
- if isinstance(frame, UserStartedSpeakingFrame):
352
- await self.start_metrics()
353
- elif isinstance(frame, UserStoppedSpeakingFrame):
354
- # Send finalize command to flush the transcription session
355
- if self._connection and self._connection.state is State.OPEN:
356
- await self._connection.send("finalize")
@@ -15,14 +15,13 @@ from typing import AsyncGenerator, List, Literal, Optional, Union
15
15
  from loguru import logger
16
16
  from pydantic import BaseModel, Field
17
17
 
18
-
19
18
  from pipecat.frames.frames import (
20
19
  CancelFrame,
21
20
  EndFrame,
22
21
  ErrorFrame,
23
22
  Frame,
23
+ InterruptionFrame,
24
24
  StartFrame,
25
- StartInterruptionFrame,
26
25
  TTSAudioRawFrame,
27
26
  TTSStartedFrame,
28
27
  TTSStoppedFrame,
@@ -49,6 +48,26 @@ except ModuleNotFoundError as e:
49
48
  raise Exception(f"Missing module: {e}")
50
49
 
51
50
 
51
+ class GenerationConfig(BaseModel):
52
+ """Configuration for Cartesia Sonic-3 generation parameters.
53
+
54
+ Sonic-3 interprets these parameters as guidance to ensure natural speech.
55
+ Test against your content for best results.
56
+
57
+ Parameters:
58
+ volume: Volume multiplier for generated speech. Valid range: [0.5, 2.0]. Default is 1.0.
59
+ speed: Speed multiplier for generated speech. Valid range: [0.6, 1.5]. Default is 1.0.
60
+ emotion: Single emotion string to guide the emotional tone. Examples include neutral,
61
+ angry, excited, content, sad, scared. Over 60 emotions are supported. For best
62
+ results, use with recommended voices: Leo, Jace, Kyle, Gavin, Maya, Tessa, Dana,
63
+ and Marian.
64
+ """
65
+
66
+ volume: Optional[float] = None
67
+ speed: Optional[float] = None
68
+ emotion: Optional[str] = None
69
+
70
+
52
71
  def language_to_cartesia_language(language: Language) -> Optional[str]:
53
72
  """Convert a Language enum to Cartesia language code.
54
73
 
@@ -74,6 +93,33 @@ def language_to_cartesia_language(language: Language) -> Optional[str]:
74
93
  Language.SV: "sv",
75
94
  Language.TR: "tr",
76
95
  Language.ZH: "zh",
96
+ Language.TL: "tl",
97
+ Language.BG: "bg",
98
+ Language.RO: "ro",
99
+ Language.AR: "ar",
100
+ Language.CS: "cs",
101
+ Language.EL: "el",
102
+ Language.FI: "fi",
103
+ Language.HR: "hr",
104
+ Language.MS: "ms",
105
+ Language.SK: "sk",
106
+ Language.DA: "da",
107
+ Language.TA: "ta",
108
+ Language.UK: "uk",
109
+ Language.HU: "hu",
110
+ Language.NO: "no",
111
+ Language.VI: "vi",
112
+ Language.BN: "bn",
113
+ Language.TH: "th",
114
+ Language.HE: "he",
115
+ Language.KA: "ka",
116
+ Language.ID: "id",
117
+ Language.TE: "te",
118
+ Language.GU: "gu",
119
+ Language.KN: "kn",
120
+ Language.ML: "ml",
121
+ Language.MR: "mr",
122
+ Language.PA: "pa",
77
123
  }
78
124
 
79
125
  result = BASE_LANGUAGES.get(language)
@@ -102,16 +148,20 @@ class CartesiaTTSService(AudioContextWordTTSService):
102
148
 
103
149
  Parameters:
104
150
  language: Language to use for synthesis.
105
- speed: Voice speed control.
106
- emotion: List of emotion controls.
151
+ speed: Voice speed control for non-Sonic-3 models (literal values).
152
+ emotion: List of emotion controls for non-Sonic-3 models.
107
153
 
108
154
  .. deprecated:: 0.0.68
109
155
  The `emotion` parameter is deprecated and will be removed in a future version.
156
+
157
+ generation_config: Generation configuration for Sonic-3 models. Includes volume,
158
+ speed (numeric), and emotion (string) parameters.
110
159
  """
111
160
 
112
161
  language: Optional[Language] = Language.EN
113
162
  speed: Optional[Literal["slow", "normal", "fast"]] = None
114
163
  emotion: Optional[List[str]] = []
164
+ generation_config: Optional[GenerationConfig] = None
115
165
 
116
166
  def __init__(
117
167
  self,
@@ -120,7 +170,7 @@ class CartesiaTTSService(AudioContextWordTTSService):
120
170
  voice_id: str,
121
171
  cartesia_version: str = "2025-04-16",
122
172
  url: str = "wss://api.cartesia.ai/tts/websocket",
123
- model: str = "sonic-2",
173
+ model: str = "sonic-3",
124
174
  sample_rate: Optional[int] = None,
125
175
  encoding: str = "pcm_s16le",
126
176
  container: str = "raw",
@@ -136,7 +186,7 @@ class CartesiaTTSService(AudioContextWordTTSService):
136
186
  voice_id: ID of the voice to use for synthesis.
137
187
  cartesia_version: API version string for Cartesia service.
138
188
  url: WebSocket URL for Cartesia TTS API.
139
- model: TTS model to use (e.g., "sonic-2").
189
+ model: TTS model to use (e.g., "sonic-3").
140
190
  sample_rate: Audio sample rate. If None, uses default.
141
191
  encoding: Audio encoding format.
142
192
  container: Audio container format.
@@ -180,6 +230,7 @@ class CartesiaTTSService(AudioContextWordTTSService):
180
230
  else "en",
181
231
  "speed": params.speed,
182
232
  "emotion": params.emotion,
233
+ "generation_config": params.generation_config,
183
234
  }
184
235
  self.set_model_name(model)
185
236
  self.set_voice(voice_id)
@@ -298,6 +349,11 @@ class CartesiaTTSService(AudioContextWordTTSService):
298
349
  if self._settings["speed"]:
299
350
  msg["speed"] = self._settings["speed"]
300
351
 
352
+ if self._settings["generation_config"]:
353
+ msg["generation_config"] = self._settings["generation_config"].model_dump(
354
+ exclude_none=True
355
+ )
356
+
301
357
  return json.dumps(msg)
302
358
 
303
359
  async def start(self, frame: StartFrame):
@@ -345,10 +401,11 @@ class CartesiaTTSService(AudioContextWordTTSService):
345
401
  try:
346
402
  if self._websocket and self._websocket.state is State.OPEN:
347
403
  return
348
- logger.debug("Connecting to Cartesia")
404
+ logger.debug("Connecting to Cartesia TTS")
349
405
  self._websocket = await websocket_connect(
350
406
  f"{self._url}?api_key={self._api_key}&cartesia_version={self._cartesia_version}"
351
407
  )
408
+ await self._call_event_handler("on_connected")
352
409
  except Exception as e:
353
410
  logger.error(f"{self} initialization error: {e}")
354
411
  self._websocket = None
@@ -366,13 +423,14 @@ class CartesiaTTSService(AudioContextWordTTSService):
366
423
  finally:
367
424
  self._context_id = None
368
425
  self._websocket = None
426
+ await self._call_event_handler("on_disconnected")
369
427
 
370
428
  def _get_websocket(self):
371
429
  if self._websocket:
372
430
  return self._websocket
373
431
  raise Exception("Websocket not connected")
374
432
 
375
- async def _handle_interruption(self, frame: StartInterruptionFrame, direction: FrameDirection):
433
+ async def _handle_interruption(self, frame: InterruptionFrame, direction: FrameDirection):
376
434
  await super()._handle_interruption(frame, direction)
377
435
  await self.stop_all_metrics()
378
436
  if self._context_id:
@@ -417,7 +475,6 @@ class CartesiaTTSService(AudioContextWordTTSService):
417
475
  logger.error(f"{self} error: {msg}")
418
476
  await self.push_frame(TTSStoppedFrame())
419
477
  await self.stop_all_metrics()
420
-
421
478
  await self.push_error(ErrorFrame(f"{self} error: {msg['error']}"))
422
479
  self._context_id = None
423
480
  else:
@@ -482,23 +539,27 @@ class CartesiaHttpTTSService(TTSService):
482
539
 
483
540
  Parameters:
484
541
  language: Language to use for synthesis.
485
- speed: Voice speed control.
486
- emotion: List of emotion controls.
542
+ speed: Voice speed control for non-Sonic-3 models (literal values).
543
+ emotion: List of emotion controls for non-Sonic-3 models.
487
544
 
488
545
  .. deprecated:: 0.0.68
489
546
  The `emotion` parameter is deprecated and will be removed in a future version.
547
+
548
+ generation_config: Generation configuration for Sonic-3 models. Includes volume,
549
+ speed (numeric), and emotion (string) parameters.
490
550
  """
491
551
 
492
552
  language: Optional[Language] = Language.EN
493
553
  speed: Optional[Literal["slow", "normal", "fast"]] = None
494
554
  emotion: Optional[List[str]] = Field(default_factory=list)
555
+ generation_config: Optional[GenerationConfig] = None
495
556
 
496
557
  def __init__(
497
558
  self,
498
559
  *,
499
560
  api_key: str,
500
561
  voice_id: str,
501
- model: str = "sonic-2",
562
+ model: str = "sonic-3",
502
563
  base_url: str = "https://api.cartesia.ai",
503
564
  cartesia_version: str = "2024-11-13",
504
565
  sample_rate: Optional[int] = None,
@@ -512,7 +573,7 @@ class CartesiaHttpTTSService(TTSService):
512
573
  Args:
513
574
  api_key: Cartesia API key for authentication.
514
575
  voice_id: ID of the voice to use for synthesis.
515
- model: TTS model to use (e.g., "sonic-2").
576
+ model: TTS model to use (e.g., "sonic-3").
516
577
  base_url: Base URL for Cartesia HTTP API.
517
578
  cartesia_version: API version string for Cartesia service.
518
579
  sample_rate: Audio sample rate. If None, uses default.
@@ -539,6 +600,7 @@ class CartesiaHttpTTSService(TTSService):
539
600
  else "en",
540
601
  "speed": params.speed,
541
602
  "emotion": params.emotion,
603
+ "generation_config": params.generation_config,
542
604
  }
543
605
  self.set_voice(voice_id)
544
606
  self.set_model_name(model)
@@ -632,6 +694,11 @@ class CartesiaHttpTTSService(TTSService):
632
694
  if self._settings["speed"]:
633
695
  payload["speed"] = self._settings["speed"]
634
696
 
697
+ if self._settings["generation_config"]:
698
+ payload["generation_config"] = self._settings["generation_config"].model_dump(
699
+ exclude_none=True
700
+ )
701
+
635
702
  yield TTSStartedFrame()
636
703
 
637
704
  session = await self._client._get_session()
@@ -8,6 +8,7 @@ import sys
8
8
 
9
9
  from pipecat.services import DeprecatedModuleProxy
10
10
 
11
+ from .flux import *
11
12
  from .stt import *
12
13
  from .tts import *
13
14
 
File without changes