dv-pipecat-ai 0.0.74.dev770__py3-none-any.whl → 0.0.82.dev776__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of dv-pipecat-ai might be problematic. Click here for more details.

Files changed (244) hide show
  1. {dv_pipecat_ai-0.0.74.dev770.dist-info → dv_pipecat_ai-0.0.82.dev776.dist-info}/METADATA +137 -93
  2. dv_pipecat_ai-0.0.82.dev776.dist-info/RECORD +340 -0
  3. pipecat/__init__.py +17 -0
  4. pipecat/adapters/base_llm_adapter.py +36 -1
  5. pipecat/adapters/schemas/direct_function.py +296 -0
  6. pipecat/adapters/schemas/function_schema.py +15 -6
  7. pipecat/adapters/schemas/tools_schema.py +55 -7
  8. pipecat/adapters/services/anthropic_adapter.py +22 -3
  9. pipecat/adapters/services/aws_nova_sonic_adapter.py +23 -3
  10. pipecat/adapters/services/bedrock_adapter.py +22 -3
  11. pipecat/adapters/services/gemini_adapter.py +16 -3
  12. pipecat/adapters/services/open_ai_adapter.py +17 -2
  13. pipecat/adapters/services/open_ai_realtime_adapter.py +23 -3
  14. pipecat/audio/filters/base_audio_filter.py +30 -6
  15. pipecat/audio/filters/koala_filter.py +37 -2
  16. pipecat/audio/filters/krisp_filter.py +59 -6
  17. pipecat/audio/filters/noisereduce_filter.py +37 -0
  18. pipecat/audio/interruptions/base_interruption_strategy.py +25 -5
  19. pipecat/audio/interruptions/min_words_interruption_strategy.py +21 -4
  20. pipecat/audio/mixers/base_audio_mixer.py +30 -7
  21. pipecat/audio/mixers/soundfile_mixer.py +53 -6
  22. pipecat/audio/resamplers/base_audio_resampler.py +17 -9
  23. pipecat/audio/resamplers/resampy_resampler.py +26 -1
  24. pipecat/audio/resamplers/soxr_resampler.py +32 -1
  25. pipecat/audio/resamplers/soxr_stream_resampler.py +101 -0
  26. pipecat/audio/utils.py +194 -1
  27. pipecat/audio/vad/silero.py +60 -3
  28. pipecat/audio/vad/vad_analyzer.py +114 -30
  29. pipecat/clocks/base_clock.py +19 -0
  30. pipecat/clocks/system_clock.py +25 -0
  31. pipecat/extensions/voicemail/__init__.py +0 -0
  32. pipecat/extensions/voicemail/voicemail_detector.py +707 -0
  33. pipecat/frames/frames.py +590 -156
  34. pipecat/metrics/metrics.py +64 -1
  35. pipecat/observers/base_observer.py +58 -19
  36. pipecat/observers/loggers/debug_log_observer.py +56 -64
  37. pipecat/observers/loggers/llm_log_observer.py +8 -1
  38. pipecat/observers/loggers/transcription_log_observer.py +19 -7
  39. pipecat/observers/loggers/user_bot_latency_log_observer.py +32 -5
  40. pipecat/observers/turn_tracking_observer.py +26 -1
  41. pipecat/pipeline/base_pipeline.py +5 -7
  42. pipecat/pipeline/base_task.py +52 -9
  43. pipecat/pipeline/parallel_pipeline.py +121 -177
  44. pipecat/pipeline/pipeline.py +129 -20
  45. pipecat/pipeline/runner.py +50 -1
  46. pipecat/pipeline/sync_parallel_pipeline.py +132 -32
  47. pipecat/pipeline/task.py +263 -280
  48. pipecat/pipeline/task_observer.py +85 -34
  49. pipecat/pipeline/to_be_updated/merge_pipeline.py +32 -2
  50. pipecat/processors/aggregators/dtmf_aggregator.py +29 -22
  51. pipecat/processors/aggregators/gated.py +25 -24
  52. pipecat/processors/aggregators/gated_openai_llm_context.py +22 -2
  53. pipecat/processors/aggregators/llm_response.py +398 -89
  54. pipecat/processors/aggregators/openai_llm_context.py +161 -13
  55. pipecat/processors/aggregators/sentence.py +25 -14
  56. pipecat/processors/aggregators/user_response.py +28 -3
  57. pipecat/processors/aggregators/vision_image_frame.py +24 -14
  58. pipecat/processors/async_generator.py +28 -0
  59. pipecat/processors/audio/audio_buffer_processor.py +78 -37
  60. pipecat/processors/consumer_processor.py +25 -6
  61. pipecat/processors/filters/frame_filter.py +23 -0
  62. pipecat/processors/filters/function_filter.py +30 -0
  63. pipecat/processors/filters/identity_filter.py +17 -2
  64. pipecat/processors/filters/null_filter.py +24 -1
  65. pipecat/processors/filters/stt_mute_filter.py +56 -21
  66. pipecat/processors/filters/wake_check_filter.py +46 -3
  67. pipecat/processors/filters/wake_notifier_filter.py +21 -3
  68. pipecat/processors/frame_processor.py +488 -131
  69. pipecat/processors/frameworks/langchain.py +38 -3
  70. pipecat/processors/frameworks/rtvi.py +719 -34
  71. pipecat/processors/gstreamer/pipeline_source.py +41 -0
  72. pipecat/processors/idle_frame_processor.py +26 -3
  73. pipecat/processors/logger.py +23 -0
  74. pipecat/processors/metrics/frame_processor_metrics.py +77 -4
  75. pipecat/processors/metrics/sentry.py +42 -4
  76. pipecat/processors/producer_processor.py +34 -14
  77. pipecat/processors/text_transformer.py +22 -10
  78. pipecat/processors/transcript_processor.py +48 -29
  79. pipecat/processors/user_idle_processor.py +31 -21
  80. pipecat/runner/__init__.py +1 -0
  81. pipecat/runner/daily.py +132 -0
  82. pipecat/runner/livekit.py +148 -0
  83. pipecat/runner/run.py +543 -0
  84. pipecat/runner/types.py +67 -0
  85. pipecat/runner/utils.py +515 -0
  86. pipecat/serializers/base_serializer.py +42 -0
  87. pipecat/serializers/exotel.py +17 -6
  88. pipecat/serializers/genesys.py +95 -0
  89. pipecat/serializers/livekit.py +33 -0
  90. pipecat/serializers/plivo.py +16 -15
  91. pipecat/serializers/protobuf.py +37 -1
  92. pipecat/serializers/telnyx.py +18 -17
  93. pipecat/serializers/twilio.py +32 -16
  94. pipecat/services/ai_service.py +5 -3
  95. pipecat/services/anthropic/llm.py +113 -43
  96. pipecat/services/assemblyai/models.py +63 -5
  97. pipecat/services/assemblyai/stt.py +64 -11
  98. pipecat/services/asyncai/__init__.py +0 -0
  99. pipecat/services/asyncai/tts.py +501 -0
  100. pipecat/services/aws/llm.py +185 -111
  101. pipecat/services/aws/stt.py +217 -23
  102. pipecat/services/aws/tts.py +118 -52
  103. pipecat/services/aws/utils.py +101 -5
  104. pipecat/services/aws_nova_sonic/aws.py +82 -64
  105. pipecat/services/aws_nova_sonic/context.py +15 -6
  106. pipecat/services/azure/common.py +10 -2
  107. pipecat/services/azure/image.py +32 -0
  108. pipecat/services/azure/llm.py +9 -7
  109. pipecat/services/azure/stt.py +65 -2
  110. pipecat/services/azure/tts.py +154 -23
  111. pipecat/services/cartesia/stt.py +125 -8
  112. pipecat/services/cartesia/tts.py +102 -38
  113. pipecat/services/cerebras/llm.py +15 -23
  114. pipecat/services/deepgram/stt.py +19 -11
  115. pipecat/services/deepgram/tts.py +36 -0
  116. pipecat/services/deepseek/llm.py +14 -23
  117. pipecat/services/elevenlabs/tts.py +330 -64
  118. pipecat/services/fal/image.py +43 -0
  119. pipecat/services/fal/stt.py +48 -10
  120. pipecat/services/fireworks/llm.py +14 -21
  121. pipecat/services/fish/tts.py +109 -9
  122. pipecat/services/gemini_multimodal_live/__init__.py +1 -0
  123. pipecat/services/gemini_multimodal_live/events.py +83 -2
  124. pipecat/services/gemini_multimodal_live/file_api.py +189 -0
  125. pipecat/services/gemini_multimodal_live/gemini.py +218 -21
  126. pipecat/services/gladia/config.py +17 -10
  127. pipecat/services/gladia/stt.py +82 -36
  128. pipecat/services/google/frames.py +40 -0
  129. pipecat/services/google/google.py +2 -0
  130. pipecat/services/google/image.py +39 -2
  131. pipecat/services/google/llm.py +176 -58
  132. pipecat/services/google/llm_openai.py +26 -4
  133. pipecat/services/google/llm_vertex.py +37 -15
  134. pipecat/services/google/rtvi.py +41 -0
  135. pipecat/services/google/stt.py +65 -17
  136. pipecat/services/google/test-google-chirp.py +45 -0
  137. pipecat/services/google/tts.py +390 -19
  138. pipecat/services/grok/llm.py +8 -6
  139. pipecat/services/groq/llm.py +8 -6
  140. pipecat/services/groq/stt.py +13 -9
  141. pipecat/services/groq/tts.py +40 -0
  142. pipecat/services/hamsa/__init__.py +9 -0
  143. pipecat/services/hamsa/stt.py +241 -0
  144. pipecat/services/heygen/__init__.py +5 -0
  145. pipecat/services/heygen/api.py +281 -0
  146. pipecat/services/heygen/client.py +620 -0
  147. pipecat/services/heygen/video.py +338 -0
  148. pipecat/services/image_service.py +5 -3
  149. pipecat/services/inworld/__init__.py +1 -0
  150. pipecat/services/inworld/tts.py +592 -0
  151. pipecat/services/llm_service.py +127 -45
  152. pipecat/services/lmnt/tts.py +80 -7
  153. pipecat/services/mcp_service.py +85 -44
  154. pipecat/services/mem0/memory.py +42 -13
  155. pipecat/services/minimax/tts.py +74 -15
  156. pipecat/services/mistral/__init__.py +0 -0
  157. pipecat/services/mistral/llm.py +185 -0
  158. pipecat/services/moondream/vision.py +55 -10
  159. pipecat/services/neuphonic/tts.py +275 -48
  160. pipecat/services/nim/llm.py +8 -6
  161. pipecat/services/ollama/llm.py +27 -7
  162. pipecat/services/openai/base_llm.py +54 -16
  163. pipecat/services/openai/image.py +30 -0
  164. pipecat/services/openai/llm.py +7 -5
  165. pipecat/services/openai/stt.py +13 -9
  166. pipecat/services/openai/tts.py +42 -10
  167. pipecat/services/openai_realtime_beta/azure.py +11 -9
  168. pipecat/services/openai_realtime_beta/context.py +7 -5
  169. pipecat/services/openai_realtime_beta/events.py +10 -7
  170. pipecat/services/openai_realtime_beta/openai.py +37 -18
  171. pipecat/services/openpipe/llm.py +30 -24
  172. pipecat/services/openrouter/llm.py +9 -7
  173. pipecat/services/perplexity/llm.py +15 -19
  174. pipecat/services/piper/tts.py +26 -12
  175. pipecat/services/playht/tts.py +227 -65
  176. pipecat/services/qwen/llm.py +8 -6
  177. pipecat/services/rime/tts.py +128 -17
  178. pipecat/services/riva/stt.py +160 -22
  179. pipecat/services/riva/tts.py +67 -2
  180. pipecat/services/sambanova/llm.py +19 -17
  181. pipecat/services/sambanova/stt.py +14 -8
  182. pipecat/services/sarvam/tts.py +60 -13
  183. pipecat/services/simli/video.py +82 -21
  184. pipecat/services/soniox/__init__.py +0 -0
  185. pipecat/services/soniox/stt.py +398 -0
  186. pipecat/services/speechmatics/stt.py +29 -17
  187. pipecat/services/stt_service.py +47 -11
  188. pipecat/services/tavus/video.py +94 -25
  189. pipecat/services/together/llm.py +8 -6
  190. pipecat/services/tts_service.py +77 -53
  191. pipecat/services/ultravox/stt.py +46 -43
  192. pipecat/services/vision_service.py +5 -3
  193. pipecat/services/websocket_service.py +12 -11
  194. pipecat/services/whisper/base_stt.py +58 -12
  195. pipecat/services/whisper/stt.py +69 -58
  196. pipecat/services/xtts/tts.py +59 -2
  197. pipecat/sync/base_notifier.py +19 -0
  198. pipecat/sync/event_notifier.py +24 -0
  199. pipecat/tests/utils.py +73 -5
  200. pipecat/transcriptions/language.py +24 -0
  201. pipecat/transports/base_input.py +112 -8
  202. pipecat/transports/base_output.py +235 -13
  203. pipecat/transports/base_transport.py +119 -0
  204. pipecat/transports/local/audio.py +76 -0
  205. pipecat/transports/local/tk.py +84 -0
  206. pipecat/transports/network/fastapi_websocket.py +174 -15
  207. pipecat/transports/network/small_webrtc.py +383 -39
  208. pipecat/transports/network/webrtc_connection.py +214 -8
  209. pipecat/transports/network/websocket_client.py +171 -1
  210. pipecat/transports/network/websocket_server.py +147 -9
  211. pipecat/transports/services/daily.py +792 -70
  212. pipecat/transports/services/helpers/daily_rest.py +122 -129
  213. pipecat/transports/services/livekit.py +339 -4
  214. pipecat/transports/services/tavus.py +273 -38
  215. pipecat/utils/asyncio/task_manager.py +92 -186
  216. pipecat/utils/base_object.py +83 -1
  217. pipecat/utils/network.py +2 -0
  218. pipecat/utils/string.py +114 -58
  219. pipecat/utils/text/base_text_aggregator.py +44 -13
  220. pipecat/utils/text/base_text_filter.py +46 -0
  221. pipecat/utils/text/markdown_text_filter.py +70 -14
  222. pipecat/utils/text/pattern_pair_aggregator.py +18 -14
  223. pipecat/utils/text/simple_text_aggregator.py +43 -2
  224. pipecat/utils/text/skip_tags_aggregator.py +21 -13
  225. pipecat/utils/time.py +36 -0
  226. pipecat/utils/tracing/class_decorators.py +32 -7
  227. pipecat/utils/tracing/conversation_context_provider.py +12 -2
  228. pipecat/utils/tracing/service_attributes.py +80 -64
  229. pipecat/utils/tracing/service_decorators.py +48 -21
  230. pipecat/utils/tracing/setup.py +13 -7
  231. pipecat/utils/tracing/turn_context_provider.py +12 -2
  232. pipecat/utils/tracing/turn_trace_observer.py +27 -0
  233. pipecat/utils/utils.py +14 -14
  234. dv_pipecat_ai-0.0.74.dev770.dist-info/RECORD +0 -319
  235. pipecat/examples/daily_runner.py +0 -64
  236. pipecat/examples/run.py +0 -265
  237. pipecat/utils/asyncio/watchdog_async_iterator.py +0 -72
  238. pipecat/utils/asyncio/watchdog_event.py +0 -42
  239. pipecat/utils/asyncio/watchdog_priority_queue.py +0 -48
  240. pipecat/utils/asyncio/watchdog_queue.py +0 -48
  241. {dv_pipecat_ai-0.0.74.dev770.dist-info → dv_pipecat_ai-0.0.82.dev776.dist-info}/WHEEL +0 -0
  242. {dv_pipecat_ai-0.0.74.dev770.dist-info → dv_pipecat_ai-0.0.82.dev776.dist-info}/licenses/LICENSE +0 -0
  243. {dv_pipecat_ai-0.0.74.dev770.dist-info → dv_pipecat_ai-0.0.82.dev776.dist-info}/top_level.txt +0 -0
  244. /pipecat/{examples → extensions}/__init__.py +0 -0
@@ -4,6 +4,8 @@
4
4
  # SPDX-License-Identifier: BSD 2-Clause License
5
5
  #
6
6
 
7
+ """Azure Cognitive Services Text-to-Speech service implementations."""
8
+
7
9
  import asyncio
8
10
  from typing import AsyncGenerator, Optional
9
11
 
@@ -21,8 +23,8 @@ from pipecat.frames.frames import (
21
23
  from pipecat.services.azure.common import language_to_azure_language
22
24
  from pipecat.services.tts_service import TTSService
23
25
  from pipecat.transcriptions.language import Language
24
- from pipecat.utils.utils import detect_language_from_script
25
26
  from pipecat.utils.tracing.service_decorators import traced_tts
27
+ from pipecat.utils.utils import detect_language_from_script
26
28
 
27
29
  try:
28
30
  from azure.cognitiveservices.speech import (
@@ -40,6 +42,15 @@ except ModuleNotFoundError as e:
40
42
 
41
43
 
42
44
  def sample_rate_to_output_format(sample_rate: int) -> SpeechSynthesisOutputFormat:
45
+ """Convert sample rate to Azure speech synthesis output format.
46
+
47
+ Args:
48
+ sample_rate: Sample rate in Hz.
49
+
50
+ Returns:
51
+ Corresponding Azure SpeechSynthesisOutputFormat enum value.
52
+ Defaults to Raw24Khz16BitMonoPcm if sample rate not found.
53
+ """
43
54
  sample_rate_map = {
44
55
  8000: SpeechSynthesisOutputFormat.Raw8Khz16BitMonoPcm,
45
56
  16000: SpeechSynthesisOutputFormat.Raw16Khz16BitMonoPcm,
@@ -52,7 +63,36 @@ def sample_rate_to_output_format(sample_rate: int) -> SpeechSynthesisOutputForma
52
63
 
53
64
 
54
65
  class AzureBaseTTSService(TTSService):
66
+ """Base class for Azure Cognitive Services text-to-speech implementations.
67
+
68
+ Provides common functionality for Azure TTS services including SSML
69
+ construction, voice configuration, and parameter management.
70
+ """
71
+
72
+ # Define SSML escape mappings based on SSML reserved characters
73
+ # See - https://learn.microsoft.com/en-us/azure/ai-services/speech-service/speech-synthesis-markup-structure
74
+ SSML_ESCAPE_CHARS = {
75
+ "&": "&amp;",
76
+ "<": "&lt;",
77
+ ">": "&gt;",
78
+ '"': "&quot;",
79
+ "'": "&apos;",
80
+ }
81
+
55
82
  class InputParams(BaseModel):
83
+ """Input parameters for Azure TTS voice configuration.
84
+
85
+ Parameters:
86
+ emphasis: Emphasis level for speech ("strong", "moderate", "reduced").
87
+ language: Language for synthesis. Defaults to English (US).
88
+ pitch: Voice pitch adjustment (e.g., "+10%", "-5Hz", "high").
89
+ rate: Speech rate multiplier. Defaults to "1.05".
90
+ role: Voice role for expression (e.g., "YoungAdultFemale").
91
+ style: Speaking style (e.g., "cheerful", "sad", "excited").
92
+ style_degree: Intensity of the speaking style (0.01 to 2.0).
93
+ volume: Volume level (e.g., "+20%", "loud", "x-soft").
94
+ """
95
+
56
96
  emphasis: Optional[str] = None
57
97
  language: Optional[Language] = Language.EN_US
58
98
  pitch: Optional[str] = None
@@ -75,6 +115,16 @@ class AzureBaseTTSService(TTSService):
75
115
  params: Optional[InputParams] = None,
76
116
  **kwargs,
77
117
  ):
118
+ """Initialize the Azure TTS service with configuration parameters.
119
+
120
+ Args:
121
+ api_key: Azure Cognitive Services subscription key.
122
+ region: Azure region identifier (e.g., "eastus", "westus2").
123
+ voice: Voice name to use for synthesis. Defaults to "en-US-SaraNeural".
124
+ sample_rate: Audio sample rate in Hz. If None, uses service default.
125
+ params: Voice and synthesis parameters configuration.
126
+ **kwargs: Additional arguments passed to parent TTSService.
127
+ """
78
128
  super().__init__(sample_rate=sample_rate, **kwargs)
79
129
 
80
130
  params = params or AzureBaseTTSService.InputParams()
@@ -138,9 +188,22 @@ class AzureBaseTTSService(TTSService):
138
188
  logger.debug(f"Final additional language map: {self._additional_lang_map}")
139
189
 
140
190
  def can_generate_metrics(self) -> bool:
191
+ """Check if this service can generate processing metrics.
192
+
193
+ Returns:
194
+ True, as Azure TTS service supports metrics generation.
195
+ """
141
196
  return True
142
197
 
143
198
  def language_to_service_language(self, language: Language) -> Optional[str]:
199
+ """Convert a Language enum to Azure language format.
200
+
201
+ Args:
202
+ language: The language to convert.
203
+
204
+ Returns:
205
+ The Azure-specific language code, or None if not supported.
206
+ """
144
207
  return language_to_azure_language(language)
145
208
 
146
209
  def _construct_ssml(self, text: str) -> str:
@@ -162,6 +225,10 @@ class AzureBaseTTSService(TTSService):
162
225
  )
163
226
 
164
227
  # 3. Construct SSML with the selected language and voice
228
+
229
+ # Escape special characters
230
+ escaped_text = self._escape_text(text)
231
+
165
232
  ssml = (
166
233
  f"<speak version='1.0' xml:lang='{target_language}' "
167
234
  "xmlns='http://www.w3.org/2001/10/synthesis' "
@@ -193,10 +260,10 @@ class AzureBaseTTSService(TTSService):
193
260
 
194
261
  if "Multilingual" in target_voice:
195
262
  ssml += f"<lang xml:lang='{target_language}'>"
196
- ssml += text
263
+ ssml += escaped_text
197
264
  ssml += "</lang>"
198
265
  else:
199
- ssml += text
266
+ ssml += escaped_text
200
267
 
201
268
  if self._settings["emphasis"]:
202
269
  ssml += "</emphasis>"
@@ -210,9 +277,42 @@ class AzureBaseTTSService(TTSService):
210
277
 
211
278
  return ssml
212
279
 
280
+ def _escape_text(self, text: str) -> str:
281
+ """Escapes XML/SSML reserved characters according to Microsoft documentation.
282
+
283
+ This method escapes the following characters:
284
+ - & becomes &amp;
285
+ - < becomes &lt;
286
+ - > becomes &gt;
287
+ - " becomes &quot;
288
+ - ' becomes &apos;
289
+
290
+ Args:
291
+ text: The text to escape.
292
+
293
+ Returns:
294
+ The escaped text.
295
+ """
296
+ escaped_text = text
297
+ for char, escape_code in AzureBaseTTSService.SSML_ESCAPE_CHARS.items():
298
+ escaped_text = escaped_text.replace(char, escape_code)
299
+ return escaped_text
300
+
213
301
 
214
302
  class AzureTTSService(AzureBaseTTSService):
303
+ """Azure Cognitive Services streaming TTS service.
304
+
305
+ Provides real-time text-to-speech synthesis using Azure's WebSocket-based
306
+ streaming API. Audio chunks are streamed as they become available for
307
+ lower latency playback.
308
+ """
309
+
215
310
  def __init__(self, **kwargs):
311
+ """Initialize the Azure streaming TTS service.
312
+
313
+ Args:
314
+ **kwargs: All arguments passed to AzureBaseTTSService parent class.
315
+ """
216
316
  super().__init__(**kwargs)
217
317
  self._speech_config = None
218
318
  self._speech_synthesizer = None
@@ -220,6 +320,11 @@ class AzureTTSService(AzureBaseTTSService):
220
320
  self._clear_audio = False
221
321
 
222
322
  async def start(self, frame: StartFrame):
323
+ """Start the Azure TTS service and initialize speech synthesizer.
324
+
325
+ Args:
326
+ frame: Start frame containing initialization parameters.
327
+ """
223
328
  await super().start(frame)
224
329
 
225
330
  if self._speech_config:
@@ -250,12 +355,12 @@ class AzureTTSService(AzureBaseTTSService):
250
355
  self._speech_synthesizer.synthesis_canceled.connect(self._handle_canceled)
251
356
 
252
357
  def _handle_synthesizing(self, evt):
253
- """Handle audio chunks as they arrive"""
358
+ """Handle audio chunks as they arrive."""
254
359
  if evt.result and evt.result.audio_data:
255
360
  self._audio_queue.put_nowait(evt.result.audio_data)
256
361
 
257
362
  def _handle_completed(self, evt):
258
- """Handle synthesis completion"""
363
+ """Handle synthesis completion."""
259
364
  self._audio_queue.put_nowait(None) # Signal completion
260
365
 
261
366
  def _handle_canceled(self, evt):
@@ -263,29 +368,30 @@ class AzureTTSService(AzureBaseTTSService):
263
368
  self.logger.error(f"Speech synthesis canceled: {evt.result.cancellation_details.reason}")
264
369
  self._audio_queue.put_nowait(None)
265
370
 
266
- async def clear_azure_audio(self):
267
- self.logger.debug("Flushing audio")
268
- self._clear_audio = True
269
- if self._speech_synthesizer is not None:
270
- future = self._speech_synthesizer.stop_speaking_async()
271
-
272
- async def wait_for_future_completion():
273
- loop = self.get_event_loop()
274
- await loop.run_in_executor(None, future.get)
275
-
276
- task = self.create_task(wait_for_future_completion())
277
- await self.wait_for_task(task)
278
- while not self._audio_queue.empty():
279
- try:
280
- self._audio_queue.get_nowait()
281
- except asyncio.QueueEmpty:
282
- break
283
- self._clear_audio = False
371
+ async def flush_audio(self):
372
+ """Flush any pending audio data."""
373
+ logger.trace(f"{self}: flushing audio")
284
374
 
285
375
  @traced_tts
286
376
  async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
377
+ """Generate speech from text using Azure's streaming synthesis.
378
+
379
+ Args:
380
+ text: The text to synthesize into speech.
381
+
382
+ Yields:
383
+ Frame: Audio frames containing synthesized speech data.
384
+ """
287
385
  text = text.lstrip()
288
386
  self.logger.debug(f"{self}: Generating TTS [{text}]")
387
+
388
+ # Clear the audio queue in case there's still audio in it, causing the next audio response
389
+ # to be cut off by the 'None' element returned at the end of the previous audio synthesis.
390
+ # Empty the audio queue before processing the new text
391
+ while not self._audio_queue.empty():
392
+ self._audio_queue.get_nowait()
393
+ self._audio_queue.task_done()
394
+
289
395
  try:
290
396
  if self._speech_synthesizer is None:
291
397
  error_msg = "Speech synthesizer not initialized."
@@ -324,12 +430,29 @@ class AzureTTSService(AzureBaseTTSService):
324
430
 
325
431
 
326
432
  class AzureHttpTTSService(AzureBaseTTSService):
433
+ """Azure Cognitive Services HTTP-based TTS service.
434
+
435
+ Provides text-to-speech synthesis using Azure's HTTP API for simpler,
436
+ non-streaming synthesis. Suitable for use cases where streaming is not
437
+ required and simpler integration is preferred.
438
+ """
439
+
327
440
  def __init__(self, **kwargs):
441
+ """Initialize the Azure HTTP TTS service.
442
+
443
+ Args:
444
+ **kwargs: All arguments passed to AzureBaseTTSService parent class.
445
+ """
328
446
  super().__init__(**kwargs)
329
447
  self._speech_config = None
330
448
  self._speech_synthesizer = None
331
449
 
332
450
  async def start(self, frame: StartFrame):
451
+ """Start the Azure HTTP TTS service and initialize speech synthesizer.
452
+
453
+ Args:
454
+ frame: Start frame containing initialization parameters.
455
+ """
333
456
  await super().start(frame)
334
457
 
335
458
  if self._speech_config:
@@ -349,6 +472,14 @@ class AzureHttpTTSService(AzureBaseTTSService):
349
472
 
350
473
  @traced_tts
351
474
  async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
475
+ """Generate speech from text using Azure's HTTP synthesis API.
476
+
477
+ Args:
478
+ text: The text to synthesize into speech.
479
+
480
+ Yields:
481
+ Frame: Audio frames containing the complete synthesized speech.
482
+ """
352
483
  logger.debug(f"{self}: Generating TTS [{text}]")
353
484
 
354
485
  await self.start_ttfb_metrics()
@@ -4,12 +4,17 @@
4
4
  # SPDX-License-Identifier: BSD 2-Clause License
5
5
  #
6
6
 
7
+ """Cartesia Speech-to-Text service implementation.
8
+
9
+ This module provides a WebSocket-based STT service that integrates with
10
+ the Cartesia Live transcription API for real-time speech recognition.
11
+ """
12
+
7
13
  import asyncio
8
14
  import json
9
15
  import urllib.parse
10
16
  from typing import AsyncGenerator, Optional
11
17
 
12
- import websockets
13
18
  from loguru import logger
14
19
 
15
20
  from pipecat.frames.frames import (
@@ -28,8 +33,23 @@ from pipecat.transcriptions.language import Language
28
33
  from pipecat.utils.time import time_now_iso8601
29
34
  from pipecat.utils.tracing.service_decorators import traced_stt
30
35
 
36
+ try:
37
+ import websockets
38
+ from websockets.asyncio.client import connect as websocket_connect
39
+ from websockets.protocol import State
40
+ except ModuleNotFoundError as e:
41
+ logger.error(f"Exception: {e}")
42
+ logger.error("In order to use Cartesia, you need to `pip install pipecat-ai[cartesia]`.")
43
+ raise Exception(f"Missing module: {e}")
44
+
31
45
 
32
46
  class CartesiaLiveOptions:
47
+ """Configuration options for Cartesia Live STT service.
48
+
49
+ Manages transcription parameters including model selection, language,
50
+ audio encoding format, and sample rate settings.
51
+ """
52
+
33
53
  def __init__(
34
54
  self,
35
55
  *,
@@ -39,6 +59,15 @@ class CartesiaLiveOptions:
39
59
  sample_rate: int = 16000,
40
60
  **kwargs,
41
61
  ):
62
+ """Initialize CartesiaLiveOptions with default or provided parameters.
63
+
64
+ Args:
65
+ model: The transcription model to use. Defaults to "ink-whisper".
66
+ language: Target language for transcription. Defaults to English.
67
+ encoding: Audio encoding format. Defaults to "pcm_s16le".
68
+ sample_rate: Audio sample rate in Hz. Defaults to 16000.
69
+ **kwargs: Additional parameters for the transcription service.
70
+ """
42
71
  self.model = model
43
72
  self.language = language
44
73
  self.encoding = encoding
@@ -46,6 +75,11 @@ class CartesiaLiveOptions:
46
75
  self.additional_params = kwargs
47
76
 
48
77
  def to_dict(self):
78
+ """Convert options to dictionary format.
79
+
80
+ Returns:
81
+ Dictionary containing all configuration parameters.
82
+ """
49
83
  params = {
50
84
  "model": self.model,
51
85
  "language": self.language if isinstance(self.language, str) else self.language.value,
@@ -56,19 +90,48 @@ class CartesiaLiveOptions:
56
90
  return params
57
91
 
58
92
  def items(self):
93
+ """Get configuration items as key-value pairs.
94
+
95
+ Returns:
96
+ Iterator of (key, value) tuples for all configuration parameters.
97
+ """
59
98
  return self.to_dict().items()
60
99
 
61
100
  def get(self, key, default=None):
101
+ """Get a configuration value by key.
102
+
103
+ Args:
104
+ key: The configuration parameter name to retrieve.
105
+ default: Default value if key is not found.
106
+
107
+ Returns:
108
+ The configuration value or default if not found.
109
+ """
62
110
  if hasattr(self, key):
63
111
  return getattr(self, key)
64
112
  return self.additional_params.get(key, default)
65
113
 
66
114
  @classmethod
67
115
  def from_json(cls, json_str: str) -> "CartesiaLiveOptions":
116
+ """Create options from JSON string.
117
+
118
+ Args:
119
+ json_str: JSON string containing configuration parameters.
120
+
121
+ Returns:
122
+ New CartesiaLiveOptions instance with parsed parameters.
123
+ """
68
124
  return cls(**json.loads(json_str))
69
125
 
70
126
 
71
127
  class CartesiaSTTService(STTService):
128
+ """Speech-to-text service using Cartesia Live API.
129
+
130
+ Provides real-time speech transcription through WebSocket connection
131
+ to Cartesia's Live transcription service. Supports both interim and
132
+ final transcriptions with configurable models and languages.
133
+ """
134
+
72
135
  def __init__(
73
136
  self,
74
137
  *,
@@ -78,6 +141,15 @@ class CartesiaSTTService(STTService):
78
141
  live_options: Optional[CartesiaLiveOptions] = None,
79
142
  **kwargs,
80
143
  ):
144
+ """Initialize CartesiaSTTService with API key and options.
145
+
146
+ Args:
147
+ api_key: Authentication key for Cartesia API.
148
+ base_url: Custom API endpoint URL. If empty, uses default.
149
+ sample_rate: Audio sample rate in Hz. Defaults to 16000.
150
+ live_options: Configuration options for transcription service.
151
+ **kwargs: Additional arguments passed to parent STTService.
152
+ """
81
153
  sample_rate = sample_rate or (live_options.sample_rate if live_options else None)
82
154
  super().__init__(sample_rate=sample_rate, **kwargs)
83
155
 
@@ -108,23 +180,51 @@ class CartesiaSTTService(STTService):
108
180
  self._receiver_task = None
109
181
 
110
182
  def can_generate_metrics(self) -> bool:
183
+ """Check if the service can generate processing metrics.
184
+
185
+ Returns:
186
+ True, indicating metrics are supported.
187
+ """
111
188
  return True
112
189
 
113
190
  async def start(self, frame: StartFrame):
191
+ """Start the STT service and establish connection.
192
+
193
+ Args:
194
+ frame: Frame indicating service should start.
195
+ """
114
196
  await super().start(frame)
115
197
  await self._connect()
116
198
 
117
199
  async def stop(self, frame: EndFrame):
200
+ """Stop the STT service and close connection.
201
+
202
+ Args:
203
+ frame: Frame indicating service should stop.
204
+ """
118
205
  await super().stop(frame)
119
206
  await self._disconnect()
120
207
 
121
208
  async def cancel(self, frame: CancelFrame):
209
+ """Cancel the STT service and close connection.
210
+
211
+ Args:
212
+ frame: Frame indicating service should be cancelled.
213
+ """
122
214
  await super().cancel(frame)
123
215
  await self._disconnect()
124
216
 
125
217
  async def run_stt(self, audio: bytes) -> AsyncGenerator[Frame, None]:
218
+ """Process audio data for speech-to-text transcription.
219
+
220
+ Args:
221
+ audio: Raw audio bytes to transcribe.
222
+
223
+ Yields:
224
+ None - transcription results are handled via WebSocket responses.
225
+ """
126
226
  # If the connection is closed, due to timeout, we need to reconnect when the user starts speaking again
127
- if not self._connection or self._connection.closed:
227
+ if not self._connection or self._connection.state is State.CLOSED:
128
228
  await self._connect()
129
229
 
130
230
  await self._connection.send(audio)
@@ -137,7 +237,7 @@ class CartesiaSTTService(STTService):
137
237
  headers = {"Cartesia-Version": "2025-04-16", "X-API-Key": self._api_key}
138
238
 
139
239
  try:
140
- self._connection = await websockets.connect(ws_url, extra_headers=headers)
240
+ self._connection = await websocket_connect(ws_url, additional_headers=headers)
141
241
  # Setup the receiver task to handle the incoming messages from the Cartesia server
142
242
  if self._receiver_task is None or self._receiver_task.done():
143
243
  self._receiver_task = asyncio.create_task(self._receive_messages())
@@ -148,7 +248,7 @@ class CartesiaSTTService(STTService):
148
248
  async def _receive_messages(self):
149
249
  try:
150
250
  while True:
151
- if not self._connection or self._connection.closed:
251
+ if not self._connection or self._connection.state is State.CLOSED:
152
252
  break
153
253
 
154
254
  message = await self._connection.recv()
@@ -197,14 +297,24 @@ class CartesiaSTTService(STTService):
197
297
  await self.stop_ttfb_metrics()
198
298
  if is_final:
199
299
  await self.push_frame(
200
- TranscriptionFrame(transcript, "", time_now_iso8601(), language)
300
+ TranscriptionFrame(
301
+ transcript,
302
+ self._user_id,
303
+ time_now_iso8601(),
304
+ language,
305
+ )
201
306
  )
202
307
  await self._handle_transcription(transcript, is_final, language)
203
308
  await self.stop_processing_metrics()
204
309
  else:
205
310
  # For interim transcriptions, just push the frame without tracing
206
311
  await self.push_frame(
207
- InterimTranscriptionFrame(transcript, "", time_now_iso8601(), language)
312
+ InterimTranscriptionFrame(
313
+ transcript,
314
+ self._user_id,
315
+ time_now_iso8601(),
316
+ language,
317
+ )
208
318
  )
209
319
 
210
320
  async def _disconnect(self):
@@ -218,22 +328,29 @@ class CartesiaSTTService(STTService):
218
328
  logger.exception(f"Unexpected exception while cancelling task: {e}")
219
329
  self._receiver_task = None
220
330
 
221
- if self._connection and self._connection.open:
331
+ if self._connection and self._connection.state is State.OPEN:
222
332
  logger.debug("Disconnecting from Cartesia")
223
333
 
224
334
  await self._connection.close()
225
335
  self._connection = None
226
336
 
227
337
  async def start_metrics(self):
338
+ """Start performance metrics collection for transcription processing."""
228
339
  await self.start_ttfb_metrics()
229
340
  await self.start_processing_metrics()
230
341
 
231
342
  async def process_frame(self, frame: Frame, direction: FrameDirection):
343
+ """Process incoming frames and handle speech events.
344
+
345
+ Args:
346
+ frame: The frame to process.
347
+ direction: Direction of frame flow in the pipeline.
348
+ """
232
349
  await super().process_frame(frame, direction)
233
350
 
234
351
  if isinstance(frame, UserStartedSpeakingFrame):
235
352
  await self.start_metrics()
236
353
  elif isinstance(frame, UserStoppedSpeakingFrame):
237
354
  # Send finalize command to flush the transcription session
238
- if self._connection and self._connection.open:
355
+ if self._connection and self._connection.state is State.OPEN:
239
356
  await self._connection.send("finalize")