dv-pipecat-ai 0.0.74.dev770 → 0.0.82.dev776 (py3-none-any wheels)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of dv-pipecat-ai might be problematic. Click here for more details.

Files changed (244)
  1. {dv_pipecat_ai-0.0.74.dev770.dist-info → dv_pipecat_ai-0.0.82.dev776.dist-info}/METADATA +137 -93
  2. dv_pipecat_ai-0.0.82.dev776.dist-info/RECORD +340 -0
  3. pipecat/__init__.py +17 -0
  4. pipecat/adapters/base_llm_adapter.py +36 -1
  5. pipecat/adapters/schemas/direct_function.py +296 -0
  6. pipecat/adapters/schemas/function_schema.py +15 -6
  7. pipecat/adapters/schemas/tools_schema.py +55 -7
  8. pipecat/adapters/services/anthropic_adapter.py +22 -3
  9. pipecat/adapters/services/aws_nova_sonic_adapter.py +23 -3
  10. pipecat/adapters/services/bedrock_adapter.py +22 -3
  11. pipecat/adapters/services/gemini_adapter.py +16 -3
  12. pipecat/adapters/services/open_ai_adapter.py +17 -2
  13. pipecat/adapters/services/open_ai_realtime_adapter.py +23 -3
  14. pipecat/audio/filters/base_audio_filter.py +30 -6
  15. pipecat/audio/filters/koala_filter.py +37 -2
  16. pipecat/audio/filters/krisp_filter.py +59 -6
  17. pipecat/audio/filters/noisereduce_filter.py +37 -0
  18. pipecat/audio/interruptions/base_interruption_strategy.py +25 -5
  19. pipecat/audio/interruptions/min_words_interruption_strategy.py +21 -4
  20. pipecat/audio/mixers/base_audio_mixer.py +30 -7
  21. pipecat/audio/mixers/soundfile_mixer.py +53 -6
  22. pipecat/audio/resamplers/base_audio_resampler.py +17 -9
  23. pipecat/audio/resamplers/resampy_resampler.py +26 -1
  24. pipecat/audio/resamplers/soxr_resampler.py +32 -1
  25. pipecat/audio/resamplers/soxr_stream_resampler.py +101 -0
  26. pipecat/audio/utils.py +194 -1
  27. pipecat/audio/vad/silero.py +60 -3
  28. pipecat/audio/vad/vad_analyzer.py +114 -30
  29. pipecat/clocks/base_clock.py +19 -0
  30. pipecat/clocks/system_clock.py +25 -0
  31. pipecat/extensions/voicemail/__init__.py +0 -0
  32. pipecat/extensions/voicemail/voicemail_detector.py +707 -0
  33. pipecat/frames/frames.py +590 -156
  34. pipecat/metrics/metrics.py +64 -1
  35. pipecat/observers/base_observer.py +58 -19
  36. pipecat/observers/loggers/debug_log_observer.py +56 -64
  37. pipecat/observers/loggers/llm_log_observer.py +8 -1
  38. pipecat/observers/loggers/transcription_log_observer.py +19 -7
  39. pipecat/observers/loggers/user_bot_latency_log_observer.py +32 -5
  40. pipecat/observers/turn_tracking_observer.py +26 -1
  41. pipecat/pipeline/base_pipeline.py +5 -7
  42. pipecat/pipeline/base_task.py +52 -9
  43. pipecat/pipeline/parallel_pipeline.py +121 -177
  44. pipecat/pipeline/pipeline.py +129 -20
  45. pipecat/pipeline/runner.py +50 -1
  46. pipecat/pipeline/sync_parallel_pipeline.py +132 -32
  47. pipecat/pipeline/task.py +263 -280
  48. pipecat/pipeline/task_observer.py +85 -34
  49. pipecat/pipeline/to_be_updated/merge_pipeline.py +32 -2
  50. pipecat/processors/aggregators/dtmf_aggregator.py +29 -22
  51. pipecat/processors/aggregators/gated.py +25 -24
  52. pipecat/processors/aggregators/gated_openai_llm_context.py +22 -2
  53. pipecat/processors/aggregators/llm_response.py +398 -89
  54. pipecat/processors/aggregators/openai_llm_context.py +161 -13
  55. pipecat/processors/aggregators/sentence.py +25 -14
  56. pipecat/processors/aggregators/user_response.py +28 -3
  57. pipecat/processors/aggregators/vision_image_frame.py +24 -14
  58. pipecat/processors/async_generator.py +28 -0
  59. pipecat/processors/audio/audio_buffer_processor.py +78 -37
  60. pipecat/processors/consumer_processor.py +25 -6
  61. pipecat/processors/filters/frame_filter.py +23 -0
  62. pipecat/processors/filters/function_filter.py +30 -0
  63. pipecat/processors/filters/identity_filter.py +17 -2
  64. pipecat/processors/filters/null_filter.py +24 -1
  65. pipecat/processors/filters/stt_mute_filter.py +56 -21
  66. pipecat/processors/filters/wake_check_filter.py +46 -3
  67. pipecat/processors/filters/wake_notifier_filter.py +21 -3
  68. pipecat/processors/frame_processor.py +488 -131
  69. pipecat/processors/frameworks/langchain.py +38 -3
  70. pipecat/processors/frameworks/rtvi.py +719 -34
  71. pipecat/processors/gstreamer/pipeline_source.py +41 -0
  72. pipecat/processors/idle_frame_processor.py +26 -3
  73. pipecat/processors/logger.py +23 -0
  74. pipecat/processors/metrics/frame_processor_metrics.py +77 -4
  75. pipecat/processors/metrics/sentry.py +42 -4
  76. pipecat/processors/producer_processor.py +34 -14
  77. pipecat/processors/text_transformer.py +22 -10
  78. pipecat/processors/transcript_processor.py +48 -29
  79. pipecat/processors/user_idle_processor.py +31 -21
  80. pipecat/runner/__init__.py +1 -0
  81. pipecat/runner/daily.py +132 -0
  82. pipecat/runner/livekit.py +148 -0
  83. pipecat/runner/run.py +543 -0
  84. pipecat/runner/types.py +67 -0
  85. pipecat/runner/utils.py +515 -0
  86. pipecat/serializers/base_serializer.py +42 -0
  87. pipecat/serializers/exotel.py +17 -6
  88. pipecat/serializers/genesys.py +95 -0
  89. pipecat/serializers/livekit.py +33 -0
  90. pipecat/serializers/plivo.py +16 -15
  91. pipecat/serializers/protobuf.py +37 -1
  92. pipecat/serializers/telnyx.py +18 -17
  93. pipecat/serializers/twilio.py +32 -16
  94. pipecat/services/ai_service.py +5 -3
  95. pipecat/services/anthropic/llm.py +113 -43
  96. pipecat/services/assemblyai/models.py +63 -5
  97. pipecat/services/assemblyai/stt.py +64 -11
  98. pipecat/services/asyncai/__init__.py +0 -0
  99. pipecat/services/asyncai/tts.py +501 -0
  100. pipecat/services/aws/llm.py +185 -111
  101. pipecat/services/aws/stt.py +217 -23
  102. pipecat/services/aws/tts.py +118 -52
  103. pipecat/services/aws/utils.py +101 -5
  104. pipecat/services/aws_nova_sonic/aws.py +82 -64
  105. pipecat/services/aws_nova_sonic/context.py +15 -6
  106. pipecat/services/azure/common.py +10 -2
  107. pipecat/services/azure/image.py +32 -0
  108. pipecat/services/azure/llm.py +9 -7
  109. pipecat/services/azure/stt.py +65 -2
  110. pipecat/services/azure/tts.py +154 -23
  111. pipecat/services/cartesia/stt.py +125 -8
  112. pipecat/services/cartesia/tts.py +102 -38
  113. pipecat/services/cerebras/llm.py +15 -23
  114. pipecat/services/deepgram/stt.py +19 -11
  115. pipecat/services/deepgram/tts.py +36 -0
  116. pipecat/services/deepseek/llm.py +14 -23
  117. pipecat/services/elevenlabs/tts.py +330 -64
  118. pipecat/services/fal/image.py +43 -0
  119. pipecat/services/fal/stt.py +48 -10
  120. pipecat/services/fireworks/llm.py +14 -21
  121. pipecat/services/fish/tts.py +109 -9
  122. pipecat/services/gemini_multimodal_live/__init__.py +1 -0
  123. pipecat/services/gemini_multimodal_live/events.py +83 -2
  124. pipecat/services/gemini_multimodal_live/file_api.py +189 -0
  125. pipecat/services/gemini_multimodal_live/gemini.py +218 -21
  126. pipecat/services/gladia/config.py +17 -10
  127. pipecat/services/gladia/stt.py +82 -36
  128. pipecat/services/google/frames.py +40 -0
  129. pipecat/services/google/google.py +2 -0
  130. pipecat/services/google/image.py +39 -2
  131. pipecat/services/google/llm.py +176 -58
  132. pipecat/services/google/llm_openai.py +26 -4
  133. pipecat/services/google/llm_vertex.py +37 -15
  134. pipecat/services/google/rtvi.py +41 -0
  135. pipecat/services/google/stt.py +65 -17
  136. pipecat/services/google/test-google-chirp.py +45 -0
  137. pipecat/services/google/tts.py +390 -19
  138. pipecat/services/grok/llm.py +8 -6
  139. pipecat/services/groq/llm.py +8 -6
  140. pipecat/services/groq/stt.py +13 -9
  141. pipecat/services/groq/tts.py +40 -0
  142. pipecat/services/hamsa/__init__.py +9 -0
  143. pipecat/services/hamsa/stt.py +241 -0
  144. pipecat/services/heygen/__init__.py +5 -0
  145. pipecat/services/heygen/api.py +281 -0
  146. pipecat/services/heygen/client.py +620 -0
  147. pipecat/services/heygen/video.py +338 -0
  148. pipecat/services/image_service.py +5 -3
  149. pipecat/services/inworld/__init__.py +1 -0
  150. pipecat/services/inworld/tts.py +592 -0
  151. pipecat/services/llm_service.py +127 -45
  152. pipecat/services/lmnt/tts.py +80 -7
  153. pipecat/services/mcp_service.py +85 -44
  154. pipecat/services/mem0/memory.py +42 -13
  155. pipecat/services/minimax/tts.py +74 -15
  156. pipecat/services/mistral/__init__.py +0 -0
  157. pipecat/services/mistral/llm.py +185 -0
  158. pipecat/services/moondream/vision.py +55 -10
  159. pipecat/services/neuphonic/tts.py +275 -48
  160. pipecat/services/nim/llm.py +8 -6
  161. pipecat/services/ollama/llm.py +27 -7
  162. pipecat/services/openai/base_llm.py +54 -16
  163. pipecat/services/openai/image.py +30 -0
  164. pipecat/services/openai/llm.py +7 -5
  165. pipecat/services/openai/stt.py +13 -9
  166. pipecat/services/openai/tts.py +42 -10
  167. pipecat/services/openai_realtime_beta/azure.py +11 -9
  168. pipecat/services/openai_realtime_beta/context.py +7 -5
  169. pipecat/services/openai_realtime_beta/events.py +10 -7
  170. pipecat/services/openai_realtime_beta/openai.py +37 -18
  171. pipecat/services/openpipe/llm.py +30 -24
  172. pipecat/services/openrouter/llm.py +9 -7
  173. pipecat/services/perplexity/llm.py +15 -19
  174. pipecat/services/piper/tts.py +26 -12
  175. pipecat/services/playht/tts.py +227 -65
  176. pipecat/services/qwen/llm.py +8 -6
  177. pipecat/services/rime/tts.py +128 -17
  178. pipecat/services/riva/stt.py +160 -22
  179. pipecat/services/riva/tts.py +67 -2
  180. pipecat/services/sambanova/llm.py +19 -17
  181. pipecat/services/sambanova/stt.py +14 -8
  182. pipecat/services/sarvam/tts.py +60 -13
  183. pipecat/services/simli/video.py +82 -21
  184. pipecat/services/soniox/__init__.py +0 -0
  185. pipecat/services/soniox/stt.py +398 -0
  186. pipecat/services/speechmatics/stt.py +29 -17
  187. pipecat/services/stt_service.py +47 -11
  188. pipecat/services/tavus/video.py +94 -25
  189. pipecat/services/together/llm.py +8 -6
  190. pipecat/services/tts_service.py +77 -53
  191. pipecat/services/ultravox/stt.py +46 -43
  192. pipecat/services/vision_service.py +5 -3
  193. pipecat/services/websocket_service.py +12 -11
  194. pipecat/services/whisper/base_stt.py +58 -12
  195. pipecat/services/whisper/stt.py +69 -58
  196. pipecat/services/xtts/tts.py +59 -2
  197. pipecat/sync/base_notifier.py +19 -0
  198. pipecat/sync/event_notifier.py +24 -0
  199. pipecat/tests/utils.py +73 -5
  200. pipecat/transcriptions/language.py +24 -0
  201. pipecat/transports/base_input.py +112 -8
  202. pipecat/transports/base_output.py +235 -13
  203. pipecat/transports/base_transport.py +119 -0
  204. pipecat/transports/local/audio.py +76 -0
  205. pipecat/transports/local/tk.py +84 -0
  206. pipecat/transports/network/fastapi_websocket.py +174 -15
  207. pipecat/transports/network/small_webrtc.py +383 -39
  208. pipecat/transports/network/webrtc_connection.py +214 -8
  209. pipecat/transports/network/websocket_client.py +171 -1
  210. pipecat/transports/network/websocket_server.py +147 -9
  211. pipecat/transports/services/daily.py +792 -70
  212. pipecat/transports/services/helpers/daily_rest.py +122 -129
  213. pipecat/transports/services/livekit.py +339 -4
  214. pipecat/transports/services/tavus.py +273 -38
  215. pipecat/utils/asyncio/task_manager.py +92 -186
  216. pipecat/utils/base_object.py +83 -1
  217. pipecat/utils/network.py +2 -0
  218. pipecat/utils/string.py +114 -58
  219. pipecat/utils/text/base_text_aggregator.py +44 -13
  220. pipecat/utils/text/base_text_filter.py +46 -0
  221. pipecat/utils/text/markdown_text_filter.py +70 -14
  222. pipecat/utils/text/pattern_pair_aggregator.py +18 -14
  223. pipecat/utils/text/simple_text_aggregator.py +43 -2
  224. pipecat/utils/text/skip_tags_aggregator.py +21 -13
  225. pipecat/utils/time.py +36 -0
  226. pipecat/utils/tracing/class_decorators.py +32 -7
  227. pipecat/utils/tracing/conversation_context_provider.py +12 -2
  228. pipecat/utils/tracing/service_attributes.py +80 -64
  229. pipecat/utils/tracing/service_decorators.py +48 -21
  230. pipecat/utils/tracing/setup.py +13 -7
  231. pipecat/utils/tracing/turn_context_provider.py +12 -2
  232. pipecat/utils/tracing/turn_trace_observer.py +27 -0
  233. pipecat/utils/utils.py +14 -14
  234. dv_pipecat_ai-0.0.74.dev770.dist-info/RECORD +0 -319
  235. pipecat/examples/daily_runner.py +0 -64
  236. pipecat/examples/run.py +0 -265
  237. pipecat/utils/asyncio/watchdog_async_iterator.py +0 -72
  238. pipecat/utils/asyncio/watchdog_event.py +0 -42
  239. pipecat/utils/asyncio/watchdog_priority_queue.py +0 -48
  240. pipecat/utils/asyncio/watchdog_queue.py +0 -48
  241. {dv_pipecat_ai-0.0.74.dev770.dist-info → dv_pipecat_ai-0.0.82.dev776.dist-info}/WHEEL +0 -0
  242. {dv_pipecat_ai-0.0.74.dev770.dist-info → dv_pipecat_ai-0.0.82.dev776.dist-info}/licenses/LICENSE +0 -0
  243. {dv_pipecat_ai-0.0.74.dev770.dist-info → dv_pipecat_ai-0.0.82.dev776.dist-info}/top_level.txt +0 -0
  244. /pipecat/{examples → extensions}/__init__.py +0 -0
@@ -4,6 +4,12 @@
4
4
  # SPDX-License-Identifier: BSD 2-Clause License
5
5
  #
6
6
 
7
+ """AWS Transcribe Speech-to-Text service implementation.
8
+
9
+ This module provides a WebSocket-based connection to AWS Transcribe for real-time
10
+ speech-to-text transcription with support for multiple languages and audio formats.
11
+ """
12
+
7
13
  import asyncio
8
14
  import json
9
15
  import os
@@ -30,6 +36,8 @@ from pipecat.utils.tracing.service_decorators import traced_stt
30
36
 
31
37
  try:
32
38
  import websockets
39
+ from websockets.asyncio.client import connect as websocket_connect
40
+ from websockets.protocol import State
33
41
  except ModuleNotFoundError as e:
34
42
  logger.error(f"Exception: {e}")
35
43
  logger.error("In order to use AWS services, you need to `pip install pipecat-ai[aws]`.")
@@ -37,6 +45,13 @@ except ModuleNotFoundError as e:
37
45
 
38
46
 
39
47
  class AWSTranscribeSTTService(STTService):
48
+ """AWS Transcribe Speech-to-Text service using WebSocket streaming.
49
+
50
+ Provides real-time speech transcription using AWS Transcribe's streaming API.
51
+ Supports multiple languages, configurable sample rates, and both interim and
52
+ final transcription results.
53
+ """
54
+
40
55
  def __init__(
41
56
  self,
42
57
  *,
@@ -48,6 +63,17 @@ class AWSTranscribeSTTService(STTService):
48
63
  language: Language = Language.EN,
49
64
  **kwargs,
50
65
  ):
66
+ """Initialize the AWS Transcribe STT service.
67
+
68
+ Args:
69
+ api_key: AWS secret access key. If None, uses AWS_SECRET_ACCESS_KEY environment variable.
70
+ aws_access_key_id: AWS access key ID. If None, uses AWS_ACCESS_KEY_ID environment variable.
71
+ aws_session_token: AWS session token for temporary credentials. If None, uses AWS_SESSION_TOKEN environment variable.
72
+ region: AWS region for the service. Defaults to "us-east-1".
73
+ sample_rate: Audio sample rate in Hz. Must be 8000 or 16000. Defaults to 16000.
74
+ language: Language for transcription. Defaults to English.
75
+ **kwargs: Additional arguments passed to parent STTService class.
76
+ """
51
77
  super().__init__(**kwargs)
52
78
 
53
79
  self._settings = {
@@ -79,14 +105,28 @@ class AWSTranscribeSTTService(STTService):
79
105
  self._receive_task = None
80
106
 
81
107
  def get_service_encoding(self, encoding: str) -> str:
82
- """Convert internal encoding format to AWS Transcribe format."""
108
+ """Convert internal encoding format to AWS Transcribe format.
109
+
110
+ Args:
111
+ encoding: Internal encoding format string.
112
+
113
+ Returns:
114
+ AWS Transcribe compatible encoding format.
115
+ """
83
116
  encoding_map = {
84
117
  "linear16": "pcm", # AWS expects "pcm" for 16-bit linear PCM
85
118
  }
86
119
  return encoding_map.get(encoding, encoding)
87
120
 
88
121
  async def start(self, frame: StartFrame):
89
- """Initialize the connection when the service starts."""
122
+ """Initialize the connection when the service starts.
123
+
124
+ Args:
125
+ frame: Start frame signaling service initialization.
126
+
127
+ Raises:
128
+ RuntimeError: If WebSocket connection cannot be established after retries.
129
+ """
90
130
  await super().start(frame)
91
131
  logger.info("Starting AWS Transcribe service...")
92
132
  retry_count = 0
@@ -95,7 +135,7 @@ class AWSTranscribeSTTService(STTService):
95
135
  while retry_count < max_retries:
96
136
  try:
97
137
  await self._connect()
98
- if self._ws_client and self._ws_client.open:
138
+ if self._ws_client and self._ws_client.state is State.OPEN:
99
139
  logger.info("Successfully established WebSocket connection")
100
140
  return
101
141
  logger.warning("WebSocket connection not established after connect")
@@ -108,18 +148,35 @@ class AWSTranscribeSTTService(STTService):
108
148
  raise RuntimeError("Failed to establish WebSocket connection after multiple attempts")
109
149
 
110
150
  async def stop(self, frame: EndFrame):
151
+ """Stop the service and disconnect from AWS Transcribe.
152
+
153
+ Args:
154
+ frame: End frame signaling service shutdown.
155
+ """
111
156
  await super().stop(frame)
112
157
  await self._disconnect()
113
158
 
114
159
  async def cancel(self, frame: CancelFrame):
160
+ """Cancel the service and disconnect from AWS Transcribe.
161
+
162
+ Args:
163
+ frame: Cancel frame signaling service cancellation.
164
+ """
115
165
  await super().cancel(frame)
116
166
  await self._disconnect()
117
167
 
118
168
  async def run_stt(self, audio: bytes) -> AsyncGenerator[Frame, None]:
119
- """Process audio data and send to AWS Transcribe"""
169
+ """Process audio data and send to AWS Transcribe.
170
+
171
+ Args:
172
+ audio: Raw audio bytes to transcribe.
173
+
174
+ Yields:
175
+ ErrorFrame: If processing fails or connection issues occur.
176
+ """
120
177
  try:
121
178
  # Ensure WebSocket is connected
122
- if not self._ws_client or not self._ws_client.open:
179
+ if not self._ws_client or self._ws_client.state is State.CLOSED:
123
180
  logger.debug("WebSocket not connected, attempting to reconnect...")
124
181
  try:
125
182
  await self._connect()
@@ -153,7 +210,7 @@ class AWSTranscribeSTTService(STTService):
153
210
 
154
211
  async def _connect(self):
155
212
  """Connect to AWS Transcribe with connection state management."""
156
- if self._ws_client and self._ws_client.open and self._receive_task:
213
+ if self._ws_client and self._ws_client.state is State.OPEN and self._receive_task:
157
214
  logger.debug(f"{self} Already connected")
158
215
  return
159
216
 
@@ -183,7 +240,7 @@ class AWSTranscribeSTTService(STTService):
183
240
  )
184
241
 
185
242
  # Add required headers
186
- extra_headers = {
243
+ additional_headers = {
187
244
  "Origin": "https://localhost",
188
245
  "Sec-WebSocket-Key": websocket_key,
189
246
  "Sec-WebSocket-Version": "13",
@@ -213,9 +270,9 @@ class AWSTranscribeSTTService(STTService):
213
270
  logger.debug(f"{self} Connecting to WebSocket with URL: {presigned_url[:100]}...")
214
271
 
215
272
  # Connect with the required headers and settings
216
- self._ws_client = await websockets.connect(
273
+ self._ws_client = await websocket_connect(
217
274
  presigned_url,
218
- extra_headers=extra_headers,
275
+ additional_headers=additional_headers,
219
276
  subprotocols=["mqtt"],
220
277
  ping_interval=None,
221
278
  ping_timeout=None,
@@ -244,7 +301,7 @@ class AWSTranscribeSTTService(STTService):
244
301
  self._receive_task = None
245
302
 
246
303
  try:
247
- if self._ws_client and self._ws_client.open:
304
+ if self._ws_client and self._ws_client.state is State.OPEN:
248
305
  # Send end-stream message
249
306
  end_stream = {"message-type": "event", "event": "end"}
250
307
  await self._ws_client.send(json.dumps(end_stream))
@@ -255,19 +312,158 @@ class AWSTranscribeSTTService(STTService):
255
312
  self._ws_client = None
256
313
 
257
314
  def language_to_service_language(self, language: Language) -> str | None:
258
- """Convert internal language enum to AWS Transcribe language code."""
315
+ """Convert internal language enum to AWS Transcribe language code.
316
+
317
+ Source:
318
+ https://docs.aws.amazon.com/transcribe/latest/dg/supported-languages.html
319
+ All language codes that support streaming are included.
320
+
321
+ Args:
322
+ language: Internal language enumeration value.
323
+
324
+ Returns:
325
+ AWS Transcribe compatible language code, or None if unsupported.
326
+ """
259
327
  language_map = {
260
- Language.EN: "en-US",
261
- Language.ES: "es-US",
262
- Language.FR: "fr-FR",
263
- Language.DE: "de-DE",
328
+ # Afrikaans
329
+ Language.AF: "af-ZA",
330
+ Language.AF_ZA: "af-ZA",
331
+ # Arabic
332
+ Language.AR: "ar-SA", # Default to Modern Standard Arabic
333
+ Language.AR_AE: "ar-AE", # Gulf Arabic
334
+ Language.AR_SA: "ar-SA", # Modern Standard Arabic
335
+ # Basque
336
+ Language.EU: "eu-ES",
337
+ Language.EU_ES: "eu-ES",
338
+ # Catalan
339
+ Language.CA: "ca-ES",
340
+ Language.CA_ES: "ca-ES",
341
+ # Chinese
342
+ Language.ZH: "zh-CN", # Default to Simplified
343
+ Language.ZH_CN: "zh-CN", # Simplified
344
+ Language.ZH_TW: "zh-TW", # Traditional
345
+ Language.ZH_HK: "zh-HK", # Cantonese (also yue-HK)
346
+ Language.YUE: "zh-HK", # Cantonese fallback
347
+ # Croatian
348
+ Language.HR: "hr-HR",
349
+ Language.HR_HR: "hr-HR",
350
+ # Czech
351
+ Language.CS: "cs-CZ",
352
+ Language.CS_CZ: "cs-CZ",
353
+ # Danish
354
+ Language.DA: "da-DK",
355
+ Language.DA_DK: "da-DK",
356
+ # Dutch
357
+ Language.NL: "nl-NL",
358
+ Language.NL_NL: "nl-NL",
359
+ # English
360
+ Language.EN: "en-US", # Default to US
361
+ Language.EN_AU: "en-AU", # Australian
362
+ Language.EN_GB: "en-GB", # British
363
+ Language.EN_IN: "en-IN", # Indian
364
+ Language.EN_IE: "en-IE", # Irish
365
+ Language.EN_NZ: "en-NZ", # New Zealand
366
+ # Note: Scottish (en-AB) and Welsh (en-WL) don't have direct Language enum matches
367
+ Language.EN_ZA: "en-ZA", # South African
368
+ Language.EN_US: "en-US", # US
369
+ # Persian/Farsi
370
+ Language.FA: "fa-IR",
371
+ Language.FA_IR: "fa-IR",
372
+ # Finnish
373
+ Language.FI: "fi-FI",
374
+ Language.FI_FI: "fi-FI",
375
+ # French
376
+ Language.FR: "fr-FR", # Default to France
377
+ Language.FR_FR: "fr-FR",
378
+ Language.FR_CA: "fr-CA", # Canadian
379
+ # Galician
380
+ Language.GL: "gl-ES",
381
+ Language.GL_ES: "gl-ES",
382
+ # Georgian
383
+ Language.KA: "ka-GE",
384
+ Language.KA_GE: "ka-GE",
385
+ # German
386
+ Language.DE: "de-DE", # Default to Germany
387
+ Language.DE_DE: "de-DE",
388
+ Language.DE_CH: "de-CH", # Swiss
389
+ # Greek
390
+ Language.EL: "el-GR",
391
+ Language.EL_GR: "el-GR",
392
+ # Hebrew
393
+ Language.HE: "he-IL",
394
+ Language.HE_IL: "he-IL",
395
+ # Hindi
396
+ Language.HI: "hi-IN",
397
+ Language.HI_IN: "hi-IN",
398
+ # Indonesian
399
+ Language.ID: "id-ID",
400
+ Language.ID_ID: "id-ID",
401
+ # Italian
264
402
  Language.IT: "it-IT",
265
- Language.PT: "pt-BR",
403
+ Language.IT_IT: "it-IT",
404
+ # Japanese
266
405
  Language.JA: "ja-JP",
406
+ Language.JA_JP: "ja-JP",
407
+ # Korean
267
408
  Language.KO: "ko-KR",
268
- Language.ZH: "zh-CN",
409
+ Language.KO_KR: "ko-KR",
410
+ # Latvian
411
+ Language.LV: "lv-LV",
412
+ Language.LV_LV: "lv-LV",
413
+ # Malay
414
+ Language.MS: "ms-MY",
415
+ Language.MS_MY: "ms-MY",
416
+ # Norwegian
417
+ Language.NB: "no-NO", # Norwegian Bokmål
418
+ Language.NB_NO: "no-NO",
419
+ Language.NO: "no-NO",
420
+ # Polish
269
421
  Language.PL: "pl-PL",
422
+ Language.PL_PL: "pl-PL",
423
+ # Portuguese
424
+ Language.PT: "pt-PT", # Default to Portugal
425
+ Language.PT_PT: "pt-PT",
426
+ Language.PT_BR: "pt-BR", # Brazilian
427
+ # Romanian
428
+ Language.RO: "ro-RO",
429
+ Language.RO_RO: "ro-RO",
430
+ # Russian
431
+ Language.RU: "ru-RU",
432
+ Language.RU_RU: "ru-RU",
433
+ # Serbian
434
+ Language.SR: "sr-RS",
435
+ Language.SR_RS: "sr-RS",
436
+ # Slovak
437
+ Language.SK: "sk-SK",
438
+ Language.SK_SK: "sk-SK",
439
+ # Somali
440
+ Language.SO: "so-SO",
441
+ Language.SO_SO: "so-SO",
442
+ # Spanish
443
+ Language.ES: "es-ES", # Default to Spain
444
+ Language.ES_ES: "es-ES",
445
+ Language.ES_US: "es-US", # US Spanish
446
+ # Swedish
447
+ Language.SV: "sv-SE",
448
+ Language.SV_SE: "sv-SE",
449
+ # Tagalog/Filipino
450
+ Language.TL: "tl-PH",
451
+ Language.FIL: "tl-PH", # Filipino maps to Tagalog
452
+ Language.FIL_PH: "tl-PH",
453
+ # Thai
454
+ Language.TH: "th-TH",
455
+ Language.TH_TH: "th-TH",
456
+ # Ukrainian
457
+ Language.UK: "uk-UA",
458
+ Language.UK_UA: "uk-UA",
459
+ # Vietnamese
460
+ Language.VI: "vi-VN",
461
+ Language.VI_VN: "vi-VN",
462
+ # Zulu
463
+ Language.ZU: "zu-ZA",
464
+ Language.ZU_ZA: "zu-ZA",
270
465
  }
466
+
271
467
  return language_map.get(language)
272
468
 
273
469
  @traced_stt
@@ -279,12 +475,12 @@ class AWSTranscribeSTTService(STTService):
279
475
  async def _receive_loop(self):
280
476
  """Background task to receive and process messages from AWS Transcribe."""
281
477
  while True:
282
- if not self._ws_client or not self._ws_client.open:
478
+ if not self._ws_client or self._ws_client.state is State.CLOSED:
283
479
  logger.warning(f"{self} WebSocket closed in receive loop")
284
480
  break
285
481
 
286
482
  try:
287
- response = await asyncio.wait_for(self._ws_client.recv(), timeout=1.0)
483
+ response = await self._ws_client.recv()
288
484
 
289
485
  headers, payload = decode_event(response)
290
486
 
@@ -304,7 +500,7 @@ class AWSTranscribeSTTService(STTService):
304
500
  await self.push_frame(
305
501
  TranscriptionFrame(
306
502
  transcript,
307
- "",
503
+ self._user_id,
308
504
  time_now_iso8601(),
309
505
  self._settings["language"],
310
506
  result=result,
@@ -320,7 +516,7 @@ class AWSTranscribeSTTService(STTService):
320
516
  await self.push_frame(
321
517
  InterimTranscriptionFrame(
322
518
  transcript,
323
- "",
519
+ self._user_id,
324
520
  time_now_iso8601(),
325
521
  self._settings["language"],
326
522
  result=result,
@@ -335,8 +531,6 @@ class AWSTranscribeSTTService(STTService):
335
531
  else:
336
532
  logger.debug(f"{self} Other message type received: {headers}")
337
533
  logger.debug(f"{self} Payload: {payload}")
338
- except asyncio.TimeoutError:
339
- self.reset_watchdog()
340
534
  except websockets.exceptions.ConnectionClosed as e:
341
535
  logger.error(
342
536
  f"{self} WebSocket connection closed in receive loop with code {e.code}: {e.reason}"
@@ -4,6 +4,12 @@
4
4
  # SPDX-License-Identifier: BSD 2-Clause License
5
5
  #
6
6
 
7
+ """AWS Polly text-to-speech service implementation.
8
+
9
+ This module provides integration with Amazon Polly for text-to-speech synthesis,
10
+ supporting multiple languages, voices, and SSML features.
11
+ """
12
+
7
13
  import asyncio
8
14
  import os
9
15
  from typing import AsyncGenerator, List, Optional
@@ -11,7 +17,7 @@ from typing import AsyncGenerator, List, Optional
11
17
  from loguru import logger
12
18
  from pydantic import BaseModel
13
19
 
14
- from pipecat.audio.utils import create_default_resampler
20
+ from pipecat.audio.utils import create_stream_resampler
15
21
  from pipecat.frames.frames import (
16
22
  ErrorFrame,
17
23
  Frame,
@@ -24,7 +30,7 @@ from pipecat.transcriptions.language import Language
24
30
  from pipecat.utils.tracing.service_decorators import traced_tts
25
31
 
26
32
  try:
27
- import boto3
33
+ import aioboto3
28
34
  from botocore.exceptions import BotoCoreError, ClientError
29
35
  except ModuleNotFoundError as e:
30
36
  logger.error(f"Exception: {e}")
@@ -33,6 +39,14 @@ except ModuleNotFoundError as e:
33
39
 
34
40
 
35
41
  def language_to_aws_language(language: Language) -> Optional[str]:
42
+ """Convert a Language enum to AWS Polly language code.
43
+
44
+ Args:
45
+ language: The Language enum value to convert.
46
+
47
+ Returns:
48
+ The corresponding AWS Polly language code, or None if not supported.
49
+ """
36
50
  language_map = {
37
51
  # Arabic
38
52
  Language.AR: "arb",
@@ -109,7 +123,25 @@ def language_to_aws_language(language: Language) -> Optional[str]:
109
123
 
110
124
 
111
125
  class AWSPollyTTSService(TTSService):
126
+ """AWS Polly text-to-speech service.
127
+
128
+ Provides text-to-speech synthesis using Amazon Polly with support for
129
+ multiple languages, voices, SSML features, and voice customization
130
+ options including prosody controls.
131
+ """
132
+
112
133
  class InputParams(BaseModel):
134
+ """Input parameters for AWS Polly TTS configuration.
135
+
136
+ Parameters:
137
+ engine: TTS engine to use ('standard', 'neural', etc.).
138
+ language: Language for synthesis. Defaults to English.
139
+ pitch: Voice pitch adjustment (for standard engine only).
140
+ rate: Speech rate adjustment.
141
+ volume: Voice volume adjustment.
142
+ lexicon_names: List of pronunciation lexicons to apply.
143
+ """
144
+
113
145
  engine: Optional[str] = None
114
146
  language: Optional[Language] = Language.EN
115
147
  pitch: Optional[str] = None
@@ -129,54 +161,73 @@ class AWSPollyTTSService(TTSService):
129
161
  params: Optional[InputParams] = None,
130
162
  **kwargs,
131
163
  ):
164
+ """Initializes the AWS Polly TTS service.
165
+
166
+ Args:
167
+ api_key: AWS secret access key. If None, uses AWS_SECRET_ACCESS_KEY environment variable.
168
+ aws_access_key_id: AWS access key ID. If None, uses AWS_ACCESS_KEY_ID environment variable.
169
+ aws_session_token: AWS session token for temporary credentials.
170
+ region: AWS region for Polly service. Defaults to 'us-east-1'.
171
+ voice_id: Voice ID to use for synthesis. Defaults to 'Joanna'.
172
+ sample_rate: Audio sample rate. If None, uses service default.
173
+ params: Additional input parameters for voice customization.
174
+ **kwargs: Additional arguments passed to parent TTSService class.
175
+ """
132
176
  super().__init__(sample_rate=sample_rate, **kwargs)
133
177
 
134
178
  params = params or AWSPollyTTSService.InputParams()
135
179
 
136
- self._polly_client = boto3.client(
137
- "polly",
138
- aws_access_key_id=aws_access_key_id,
139
- aws_secret_access_key=api_key,
140
- aws_session_token=aws_session_token,
141
- region_name=region,
142
- )
143
- self._settings = {
144
- "engine": params.engine,
145
- "language": self.language_to_service_language(params.language)
146
- if params.language
147
- else "en-US",
148
- "pitch": params.pitch,
149
- "rate": params.rate,
150
- "volume": params.volume,
151
- "lexicon_names": params.lexicon_names,
152
- }
153
-
154
- self._resampler = create_default_resampler()
155
-
156
- self.set_voice(voice_id)
157
-
158
180
  # Get credentials from environment variables if not provided
159
- self._credentials = {
181
+ self._aws_params = {
160
182
  "aws_access_key_id": aws_access_key_id or os.getenv("AWS_ACCESS_KEY_ID"),
161
183
  "aws_secret_access_key": api_key or os.getenv("AWS_SECRET_ACCESS_KEY"),
162
184
  "aws_session_token": aws_session_token or os.getenv("AWS_SESSION_TOKEN"),
163
- "region": region or os.getenv("AWS_REGION", "us-east-1"),
185
+ "region_name": region or os.getenv("AWS_REGION", "us-east-1"),
164
186
  }
165
187
 
166
188
  # Validate that we have the required credentials
167
189
  if (
168
- not self._credentials["aws_access_key_id"]
169
- or not self._credentials["aws_secret_access_key"]
190
+ not self._aws_params["aws_access_key_id"]
191
+ or not self._aws_params["aws_secret_access_key"]
170
192
  ):
171
193
  raise ValueError(
172
194
  "AWS credentials not found. Please provide them either through constructor parameters "
173
195
  "or set AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY environment variables."
174
196
  )
175
197
 
198
+ self._aws_session = aioboto3.Session()
199
+ self._settings = {
200
+ "engine": params.engine,
201
+ "language": self.language_to_service_language(params.language)
202
+ if params.language
203
+ else "en-US",
204
+ "pitch": params.pitch,
205
+ "rate": params.rate,
206
+ "volume": params.volume,
207
+ "lexicon_names": params.lexicon_names,
208
+ }
209
+
210
+ self._resampler = create_stream_resampler()
211
+
212
+ self.set_voice(voice_id)
213
+
176
214
  def can_generate_metrics(self) -> bool:
215
+ """Check if this service can generate processing metrics.
216
+
217
+ Returns:
218
+ True, as AWS Polly service supports metrics generation.
219
+ """
177
220
  return True
178
221
 
179
222
  def language_to_service_language(self, language: Language) -> Optional[str]:
223
+ """Convert a Language enum to AWS Polly language format.
224
+
225
+ Args:
226
+ language: The language to convert.
227
+
228
+ Returns:
229
+ The AWS Polly-specific language code, or None if not supported.
230
+ """
180
231
  return language_to_aws_language(language)
181
232
 
182
233
  def _construct_ssml(self, text: str) -> str:
@@ -214,13 +265,14 @@ class AWSPollyTTSService(TTSService):
214
265
 
215
266
  @traced_tts
216
267
  async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
217
- def read_audio_data(**args):
218
- response = self._polly_client.synthesize_speech(**args)
219
- if "AudioStream" in response:
220
- audio_data = response["AudioStream"].read()
221
- return audio_data
222
- return None
268
+ """Generate speech from text using AWS Polly.
223
269
 
270
+ Args:
271
+ text: The text to synthesize into speech.
272
+
273
+ Yields:
274
+ Frame: Audio frames containing the synthesized speech.
275
+ """
224
276
  logger.debug(f"{self}: Generating TTS [{text}]")
225
277
 
226
278
  try:
@@ -243,30 +295,32 @@ class AWSPollyTTSService(TTSService):
243
295
  # Filter out None values
244
296
  filtered_params = {k: v for k, v in params.items() if v is not None}
245
297
 
246
- audio_data = await asyncio.to_thread(read_audio_data, **filtered_params)
247
-
248
- if not audio_data:
249
- logger.error(f"{self} No audio data returned")
250
- yield None
251
- return
298
+ async with self._aws_session.client("polly", **self._aws_params) as polly:
299
+ response = await polly.synthesize_speech(**filtered_params)
300
+ if "AudioStream" in response:
301
+ # Get the streaming body and read it
302
+ stream = response["AudioStream"]
303
+ audio_data = await stream.read()
304
+ else:
305
+ logger.error(f"{self} No audio stream in response")
306
+ audio_data = None
252
307
 
253
- audio_data = await self._resampler.resample(audio_data, 16000, self.sample_rate)
308
+ audio_data = await self._resampler.resample(audio_data, 16000, self.sample_rate)
254
309
 
255
- await self.start_tts_usage_metrics(text)
310
+ await self.start_tts_usage_metrics(text)
256
311
 
257
- yield TTSStartedFrame()
312
+ yield TTSStartedFrame()
258
313
 
259
- CHUNK_SIZE = self.chunk_size
314
+ CHUNK_SIZE = self.chunk_size
260
315
 
261
- for i in range(0, len(audio_data), CHUNK_SIZE):
262
- chunk = audio_data[i : i + CHUNK_SIZE]
263
- if len(chunk) > 0:
264
- await self.stop_ttfb_metrics()
265
- frame = TTSAudioRawFrame(chunk, self.sample_rate, 1)
266
- yield frame
267
-
268
- yield TTSStoppedFrame()
316
+ for i in range(0, len(audio_data), CHUNK_SIZE):
317
+ chunk = audio_data[i : i + CHUNK_SIZE]
318
+ if len(chunk) > 0:
319
+ await self.stop_ttfb_metrics()
320
+ frame = TTSAudioRawFrame(chunk, self.sample_rate, 1)
321
+ yield frame
269
322
 
323
+ yield TTSStoppedFrame()
270
324
  except (BotoCoreError, ClientError) as error:
271
325
  logger.exception(f"{self} error generating TTS: {error}")
272
326
  error_message = f"AWS Polly TTS error: {str(error)}"
@@ -277,7 +331,19 @@ class AWSPollyTTSService(TTSService):
277
331
 
278
332
 
279
333
  class PollyTTSService(AWSPollyTTSService):
334
+ """Deprecated alias for AWSPollyTTSService.
335
+
336
+ .. deprecated:: 0.0.67
337
+ `PollyTTSService` is deprecated, use `AWSPollyTTSService` instead.
338
+
339
+ """
340
+
280
341
  def __init__(self, **kwargs):
342
+ """Initialize the deprecated PollyTTSService.
343
+
344
+ Args:
345
+ **kwargs: All arguments passed to AWSPollyTTSService.
346
+ """
281
347
  super().__init__(**kwargs)
282
348
 
283
349
  import warnings