dv-pipecat-ai 0.0.74.dev770__py3-none-any.whl → 0.0.82.dev776__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of dv-pipecat-ai might be problematic. More details are linked from the original diff page.

Files changed (244)
  1. {dv_pipecat_ai-0.0.74.dev770.dist-info → dv_pipecat_ai-0.0.82.dev776.dist-info}/METADATA +137 -93
  2. dv_pipecat_ai-0.0.82.dev776.dist-info/RECORD +340 -0
  3. pipecat/__init__.py +17 -0
  4. pipecat/adapters/base_llm_adapter.py +36 -1
  5. pipecat/adapters/schemas/direct_function.py +296 -0
  6. pipecat/adapters/schemas/function_schema.py +15 -6
  7. pipecat/adapters/schemas/tools_schema.py +55 -7
  8. pipecat/adapters/services/anthropic_adapter.py +22 -3
  9. pipecat/adapters/services/aws_nova_sonic_adapter.py +23 -3
  10. pipecat/adapters/services/bedrock_adapter.py +22 -3
  11. pipecat/adapters/services/gemini_adapter.py +16 -3
  12. pipecat/adapters/services/open_ai_adapter.py +17 -2
  13. pipecat/adapters/services/open_ai_realtime_adapter.py +23 -3
  14. pipecat/audio/filters/base_audio_filter.py +30 -6
  15. pipecat/audio/filters/koala_filter.py +37 -2
  16. pipecat/audio/filters/krisp_filter.py +59 -6
  17. pipecat/audio/filters/noisereduce_filter.py +37 -0
  18. pipecat/audio/interruptions/base_interruption_strategy.py +25 -5
  19. pipecat/audio/interruptions/min_words_interruption_strategy.py +21 -4
  20. pipecat/audio/mixers/base_audio_mixer.py +30 -7
  21. pipecat/audio/mixers/soundfile_mixer.py +53 -6
  22. pipecat/audio/resamplers/base_audio_resampler.py +17 -9
  23. pipecat/audio/resamplers/resampy_resampler.py +26 -1
  24. pipecat/audio/resamplers/soxr_resampler.py +32 -1
  25. pipecat/audio/resamplers/soxr_stream_resampler.py +101 -0
  26. pipecat/audio/utils.py +194 -1
  27. pipecat/audio/vad/silero.py +60 -3
  28. pipecat/audio/vad/vad_analyzer.py +114 -30
  29. pipecat/clocks/base_clock.py +19 -0
  30. pipecat/clocks/system_clock.py +25 -0
  31. pipecat/extensions/voicemail/__init__.py +0 -0
  32. pipecat/extensions/voicemail/voicemail_detector.py +707 -0
  33. pipecat/frames/frames.py +590 -156
  34. pipecat/metrics/metrics.py +64 -1
  35. pipecat/observers/base_observer.py +58 -19
  36. pipecat/observers/loggers/debug_log_observer.py +56 -64
  37. pipecat/observers/loggers/llm_log_observer.py +8 -1
  38. pipecat/observers/loggers/transcription_log_observer.py +19 -7
  39. pipecat/observers/loggers/user_bot_latency_log_observer.py +32 -5
  40. pipecat/observers/turn_tracking_observer.py +26 -1
  41. pipecat/pipeline/base_pipeline.py +5 -7
  42. pipecat/pipeline/base_task.py +52 -9
  43. pipecat/pipeline/parallel_pipeline.py +121 -177
  44. pipecat/pipeline/pipeline.py +129 -20
  45. pipecat/pipeline/runner.py +50 -1
  46. pipecat/pipeline/sync_parallel_pipeline.py +132 -32
  47. pipecat/pipeline/task.py +263 -280
  48. pipecat/pipeline/task_observer.py +85 -34
  49. pipecat/pipeline/to_be_updated/merge_pipeline.py +32 -2
  50. pipecat/processors/aggregators/dtmf_aggregator.py +29 -22
  51. pipecat/processors/aggregators/gated.py +25 -24
  52. pipecat/processors/aggregators/gated_openai_llm_context.py +22 -2
  53. pipecat/processors/aggregators/llm_response.py +398 -89
  54. pipecat/processors/aggregators/openai_llm_context.py +161 -13
  55. pipecat/processors/aggregators/sentence.py +25 -14
  56. pipecat/processors/aggregators/user_response.py +28 -3
  57. pipecat/processors/aggregators/vision_image_frame.py +24 -14
  58. pipecat/processors/async_generator.py +28 -0
  59. pipecat/processors/audio/audio_buffer_processor.py +78 -37
  60. pipecat/processors/consumer_processor.py +25 -6
  61. pipecat/processors/filters/frame_filter.py +23 -0
  62. pipecat/processors/filters/function_filter.py +30 -0
  63. pipecat/processors/filters/identity_filter.py +17 -2
  64. pipecat/processors/filters/null_filter.py +24 -1
  65. pipecat/processors/filters/stt_mute_filter.py +56 -21
  66. pipecat/processors/filters/wake_check_filter.py +46 -3
  67. pipecat/processors/filters/wake_notifier_filter.py +21 -3
  68. pipecat/processors/frame_processor.py +488 -131
  69. pipecat/processors/frameworks/langchain.py +38 -3
  70. pipecat/processors/frameworks/rtvi.py +719 -34
  71. pipecat/processors/gstreamer/pipeline_source.py +41 -0
  72. pipecat/processors/idle_frame_processor.py +26 -3
  73. pipecat/processors/logger.py +23 -0
  74. pipecat/processors/metrics/frame_processor_metrics.py +77 -4
  75. pipecat/processors/metrics/sentry.py +42 -4
  76. pipecat/processors/producer_processor.py +34 -14
  77. pipecat/processors/text_transformer.py +22 -10
  78. pipecat/processors/transcript_processor.py +48 -29
  79. pipecat/processors/user_idle_processor.py +31 -21
  80. pipecat/runner/__init__.py +1 -0
  81. pipecat/runner/daily.py +132 -0
  82. pipecat/runner/livekit.py +148 -0
  83. pipecat/runner/run.py +543 -0
  84. pipecat/runner/types.py +67 -0
  85. pipecat/runner/utils.py +515 -0
  86. pipecat/serializers/base_serializer.py +42 -0
  87. pipecat/serializers/exotel.py +17 -6
  88. pipecat/serializers/genesys.py +95 -0
  89. pipecat/serializers/livekit.py +33 -0
  90. pipecat/serializers/plivo.py +16 -15
  91. pipecat/serializers/protobuf.py +37 -1
  92. pipecat/serializers/telnyx.py +18 -17
  93. pipecat/serializers/twilio.py +32 -16
  94. pipecat/services/ai_service.py +5 -3
  95. pipecat/services/anthropic/llm.py +113 -43
  96. pipecat/services/assemblyai/models.py +63 -5
  97. pipecat/services/assemblyai/stt.py +64 -11
  98. pipecat/services/asyncai/__init__.py +0 -0
  99. pipecat/services/asyncai/tts.py +501 -0
  100. pipecat/services/aws/llm.py +185 -111
  101. pipecat/services/aws/stt.py +217 -23
  102. pipecat/services/aws/tts.py +118 -52
  103. pipecat/services/aws/utils.py +101 -5
  104. pipecat/services/aws_nova_sonic/aws.py +82 -64
  105. pipecat/services/aws_nova_sonic/context.py +15 -6
  106. pipecat/services/azure/common.py +10 -2
  107. pipecat/services/azure/image.py +32 -0
  108. pipecat/services/azure/llm.py +9 -7
  109. pipecat/services/azure/stt.py +65 -2
  110. pipecat/services/azure/tts.py +154 -23
  111. pipecat/services/cartesia/stt.py +125 -8
  112. pipecat/services/cartesia/tts.py +102 -38
  113. pipecat/services/cerebras/llm.py +15 -23
  114. pipecat/services/deepgram/stt.py +19 -11
  115. pipecat/services/deepgram/tts.py +36 -0
  116. pipecat/services/deepseek/llm.py +14 -23
  117. pipecat/services/elevenlabs/tts.py +330 -64
  118. pipecat/services/fal/image.py +43 -0
  119. pipecat/services/fal/stt.py +48 -10
  120. pipecat/services/fireworks/llm.py +14 -21
  121. pipecat/services/fish/tts.py +109 -9
  122. pipecat/services/gemini_multimodal_live/__init__.py +1 -0
  123. pipecat/services/gemini_multimodal_live/events.py +83 -2
  124. pipecat/services/gemini_multimodal_live/file_api.py +189 -0
  125. pipecat/services/gemini_multimodal_live/gemini.py +218 -21
  126. pipecat/services/gladia/config.py +17 -10
  127. pipecat/services/gladia/stt.py +82 -36
  128. pipecat/services/google/frames.py +40 -0
  129. pipecat/services/google/google.py +2 -0
  130. pipecat/services/google/image.py +39 -2
  131. pipecat/services/google/llm.py +176 -58
  132. pipecat/services/google/llm_openai.py +26 -4
  133. pipecat/services/google/llm_vertex.py +37 -15
  134. pipecat/services/google/rtvi.py +41 -0
  135. pipecat/services/google/stt.py +65 -17
  136. pipecat/services/google/test-google-chirp.py +45 -0
  137. pipecat/services/google/tts.py +390 -19
  138. pipecat/services/grok/llm.py +8 -6
  139. pipecat/services/groq/llm.py +8 -6
  140. pipecat/services/groq/stt.py +13 -9
  141. pipecat/services/groq/tts.py +40 -0
  142. pipecat/services/hamsa/__init__.py +9 -0
  143. pipecat/services/hamsa/stt.py +241 -0
  144. pipecat/services/heygen/__init__.py +5 -0
  145. pipecat/services/heygen/api.py +281 -0
  146. pipecat/services/heygen/client.py +620 -0
  147. pipecat/services/heygen/video.py +338 -0
  148. pipecat/services/image_service.py +5 -3
  149. pipecat/services/inworld/__init__.py +1 -0
  150. pipecat/services/inworld/tts.py +592 -0
  151. pipecat/services/llm_service.py +127 -45
  152. pipecat/services/lmnt/tts.py +80 -7
  153. pipecat/services/mcp_service.py +85 -44
  154. pipecat/services/mem0/memory.py +42 -13
  155. pipecat/services/minimax/tts.py +74 -15
  156. pipecat/services/mistral/__init__.py +0 -0
  157. pipecat/services/mistral/llm.py +185 -0
  158. pipecat/services/moondream/vision.py +55 -10
  159. pipecat/services/neuphonic/tts.py +275 -48
  160. pipecat/services/nim/llm.py +8 -6
  161. pipecat/services/ollama/llm.py +27 -7
  162. pipecat/services/openai/base_llm.py +54 -16
  163. pipecat/services/openai/image.py +30 -0
  164. pipecat/services/openai/llm.py +7 -5
  165. pipecat/services/openai/stt.py +13 -9
  166. pipecat/services/openai/tts.py +42 -10
  167. pipecat/services/openai_realtime_beta/azure.py +11 -9
  168. pipecat/services/openai_realtime_beta/context.py +7 -5
  169. pipecat/services/openai_realtime_beta/events.py +10 -7
  170. pipecat/services/openai_realtime_beta/openai.py +37 -18
  171. pipecat/services/openpipe/llm.py +30 -24
  172. pipecat/services/openrouter/llm.py +9 -7
  173. pipecat/services/perplexity/llm.py +15 -19
  174. pipecat/services/piper/tts.py +26 -12
  175. pipecat/services/playht/tts.py +227 -65
  176. pipecat/services/qwen/llm.py +8 -6
  177. pipecat/services/rime/tts.py +128 -17
  178. pipecat/services/riva/stt.py +160 -22
  179. pipecat/services/riva/tts.py +67 -2
  180. pipecat/services/sambanova/llm.py +19 -17
  181. pipecat/services/sambanova/stt.py +14 -8
  182. pipecat/services/sarvam/tts.py +60 -13
  183. pipecat/services/simli/video.py +82 -21
  184. pipecat/services/soniox/__init__.py +0 -0
  185. pipecat/services/soniox/stt.py +398 -0
  186. pipecat/services/speechmatics/stt.py +29 -17
  187. pipecat/services/stt_service.py +47 -11
  188. pipecat/services/tavus/video.py +94 -25
  189. pipecat/services/together/llm.py +8 -6
  190. pipecat/services/tts_service.py +77 -53
  191. pipecat/services/ultravox/stt.py +46 -43
  192. pipecat/services/vision_service.py +5 -3
  193. pipecat/services/websocket_service.py +12 -11
  194. pipecat/services/whisper/base_stt.py +58 -12
  195. pipecat/services/whisper/stt.py +69 -58
  196. pipecat/services/xtts/tts.py +59 -2
  197. pipecat/sync/base_notifier.py +19 -0
  198. pipecat/sync/event_notifier.py +24 -0
  199. pipecat/tests/utils.py +73 -5
  200. pipecat/transcriptions/language.py +24 -0
  201. pipecat/transports/base_input.py +112 -8
  202. pipecat/transports/base_output.py +235 -13
  203. pipecat/transports/base_transport.py +119 -0
  204. pipecat/transports/local/audio.py +76 -0
  205. pipecat/transports/local/tk.py +84 -0
  206. pipecat/transports/network/fastapi_websocket.py +174 -15
  207. pipecat/transports/network/small_webrtc.py +383 -39
  208. pipecat/transports/network/webrtc_connection.py +214 -8
  209. pipecat/transports/network/websocket_client.py +171 -1
  210. pipecat/transports/network/websocket_server.py +147 -9
  211. pipecat/transports/services/daily.py +792 -70
  212. pipecat/transports/services/helpers/daily_rest.py +122 -129
  213. pipecat/transports/services/livekit.py +339 -4
  214. pipecat/transports/services/tavus.py +273 -38
  215. pipecat/utils/asyncio/task_manager.py +92 -186
  216. pipecat/utils/base_object.py +83 -1
  217. pipecat/utils/network.py +2 -0
  218. pipecat/utils/string.py +114 -58
  219. pipecat/utils/text/base_text_aggregator.py +44 -13
  220. pipecat/utils/text/base_text_filter.py +46 -0
  221. pipecat/utils/text/markdown_text_filter.py +70 -14
  222. pipecat/utils/text/pattern_pair_aggregator.py +18 -14
  223. pipecat/utils/text/simple_text_aggregator.py +43 -2
  224. pipecat/utils/text/skip_tags_aggregator.py +21 -13
  225. pipecat/utils/time.py +36 -0
  226. pipecat/utils/tracing/class_decorators.py +32 -7
  227. pipecat/utils/tracing/conversation_context_provider.py +12 -2
  228. pipecat/utils/tracing/service_attributes.py +80 -64
  229. pipecat/utils/tracing/service_decorators.py +48 -21
  230. pipecat/utils/tracing/setup.py +13 -7
  231. pipecat/utils/tracing/turn_context_provider.py +12 -2
  232. pipecat/utils/tracing/turn_trace_observer.py +27 -0
  233. pipecat/utils/utils.py +14 -14
  234. dv_pipecat_ai-0.0.74.dev770.dist-info/RECORD +0 -319
  235. pipecat/examples/daily_runner.py +0 -64
  236. pipecat/examples/run.py +0 -265
  237. pipecat/utils/asyncio/watchdog_async_iterator.py +0 -72
  238. pipecat/utils/asyncio/watchdog_event.py +0 -42
  239. pipecat/utils/asyncio/watchdog_priority_queue.py +0 -48
  240. pipecat/utils/asyncio/watchdog_queue.py +0 -48
  241. {dv_pipecat_ai-0.0.74.dev770.dist-info → dv_pipecat_ai-0.0.82.dev776.dist-info}/WHEEL +0 -0
  242. {dv_pipecat_ai-0.0.74.dev770.dist-info → dv_pipecat_ai-0.0.82.dev776.dist-info}/licenses/LICENSE +0 -0
  243. {dv_pipecat_ai-0.0.74.dev770.dist-info → dv_pipecat_ai-0.0.82.dev776.dist-info}/top_level.txt +0 -0
  244. /pipecat/{examples → extensions}/__init__.py +0 -0
@@ -4,6 +4,12 @@
4
4
  # SPDX-License-Identifier: BSD 2-Clause License
5
5
  #
6
6
 
7
+ """ElevenLabs text-to-speech service implementations.
8
+
9
+ This module provides WebSocket and HTTP-based TTS services using ElevenLabs API
10
+ with support for streaming audio, word timestamps, and voice customization.
11
+ """
12
+
7
13
  import asyncio
8
14
  import base64
9
15
  import json
@@ -32,12 +38,13 @@ from pipecat.services.tts_service import (
32
38
  WordTTSService,
33
39
  )
34
40
  from pipecat.transcriptions.language import Language
35
- from pipecat.utils.asyncio.watchdog_async_iterator import WatchdogAsyncIterator
36
41
  from pipecat.utils.tracing.service_decorators import traced_tts
37
42
 
38
43
  # See .env.example for ElevenLabs configuration needed
39
44
  try:
40
45
  import websockets
46
+ from websockets.asyncio.client import connect as websocket_connect
47
+ from websockets.protocol import State
41
48
  except ModuleNotFoundError as e:
42
49
  logger.error(f"Exception: {e}")
43
50
  logger.error("In order to use ElevenLabs, you need to `pip install pipecat-ai[elevenlabs]`.")
@@ -57,6 +64,14 @@ ELEVENLABS_MULTILINGUAL_MODELS = {
57
64
 
58
65
 
59
66
  def language_to_elevenlabs_language(language: Language) -> Optional[str]:
67
+ """Convert a Language enum to ElevenLabs language code.
68
+
69
+ Args:
70
+ language: The Language enum value to convert.
71
+
72
+ Returns:
73
+ The corresponding ElevenLabs language code, or None if not supported.
74
+ """
60
75
  BASE_LANGUAGES = {
61
76
  Language.AR: "ar",
62
77
  Language.BG: "bg",
@@ -106,6 +121,14 @@ def language_to_elevenlabs_language(language: Language) -> Optional[str]:
106
121
 
107
122
 
108
123
  def output_format_from_sample_rate(sample_rate: int) -> str:
124
+ """Get the appropriate output format string for a given sample rate.
125
+
126
+ Args:
127
+ sample_rate: The audio sample rate in Hz.
128
+
129
+ Returns:
130
+ The ElevenLabs output format string.
131
+ """
109
132
  match sample_rate:
110
133
  case 8000:
111
134
  return "pcm_8000"
@@ -129,10 +152,10 @@ def build_elevenlabs_voice_settings(
129
152
  """Build voice settings dictionary for ElevenLabs based on provided settings.
130
153
 
131
154
  Args:
132
- settings: Dictionary containing voice settings parameters
155
+ settings: Dictionary containing voice settings parameters.
133
156
 
134
157
  Returns:
135
- Dictionary of voice settings or None if no valid settings are provided
158
+ Dictionary of voice settings or None if no valid settings are provided.
136
159
  """
137
160
  voice_setting_keys = ["stability", "similarity_boost", "style", "use_speaker_boost", "speed"]
138
161
 
@@ -151,26 +174,83 @@ def build_elevenlabs_voice_settings(
151
174
  def calculate_word_times(
152
175
  alignment_info: Mapping[str, Any], cumulative_time: float
153
176
  ) -> List[Tuple[str, float]]:
154
- zipped_times = list(zip(alignment_info["chars"], alignment_info["charStartTimesMs"]))
177
+ """Calculate word timestamps from character alignment information.
155
178
 
156
- words = "".join(alignment_info["chars"]).split(" ")
179
+ Args:
180
+ alignment_info: Character alignment data from ElevenLabs API.
181
+ cumulative_time: Base time offset for this chunk.
157
182
 
158
- # Calculate start time for each word. We do this by finding a space character
159
- # and using the previous word time, also taking into account there might not
160
- # be a space at the end.
161
- times = []
162
- for i, (a, b) in enumerate(zipped_times):
163
- if a == " " or i == len(zipped_times) - 1:
164
- t = cumulative_time + (zipped_times[i - 1][1] / 1000.0)
165
- times.append(t)
183
+ Returns:
184
+ List of (word, timestamp) tuples.
185
+ """
186
+ chars = alignment_info["chars"]
187
+ char_start_times_ms = alignment_info["charStartTimesMs"]
166
188
 
167
- word_times = list(zip(words, times))
189
+ if len(chars) != len(char_start_times_ms):
190
+ logger.error(
191
+ f"calculate_word_times: length mismatch - chars={len(chars)}, times={len(char_start_times_ms)}"
192
+ )
193
+ return []
194
+
195
+ # Build words and track their start positions
196
+ words = []
197
+ word_start_indices = []
198
+ current_word = ""
199
+ word_start_index = None
200
+
201
+ for i, char in enumerate(chars):
202
+ if char == " ":
203
+ # End of current word
204
+ if current_word: # Only add non-empty words
205
+ words.append(current_word)
206
+ word_start_indices.append(word_start_index)
207
+ current_word = ""
208
+ word_start_index = None
209
+ else:
210
+ # Building a word
211
+ if word_start_index is None: # First character of new word
212
+ word_start_index = i
213
+ current_word += char
214
+
215
+ # Handle the last word if there's no trailing space
216
+ if current_word and word_start_index is not None:
217
+ words.append(current_word)
218
+ word_start_indices.append(word_start_index)
219
+
220
+ # Calculate timestamps for each word
221
+ word_times = []
222
+ for word, start_idx in zip(words, word_start_indices):
223
+ # Convert from milliseconds to seconds and add cumulative offset
224
+ start_time_seconds = cumulative_time + (char_start_times_ms[start_idx] / 1000.0)
225
+ word_times.append((word, start_time_seconds))
168
226
 
169
227
  return word_times
170
228
 
171
229
 
172
230
  class ElevenLabsTTSService(AudioContextWordTTSService):
231
+ """ElevenLabs WebSocket-based TTS service with word timestamps.
232
+
233
+ Provides real-time text-to-speech using ElevenLabs' WebSocket streaming API.
234
+ Supports word-level timestamps, audio context management, and various voice
235
+ customization options including stability, similarity boost, and speed controls.
236
+ """
237
+
173
238
  class InputParams(BaseModel):
239
+ """Input parameters for ElevenLabs TTS configuration.
240
+
241
+ Parameters:
242
+ language: Language to use for synthesis.
243
+ stability: Voice stability control (0.0 to 1.0).
244
+ similarity_boost: Similarity boost control (0.0 to 1.0).
245
+ style: Style control for voice expression (0.0 to 1.0).
246
+ use_speaker_boost: Whether to use speaker boost enhancement.
247
+ speed: Voice speed control (0.7 to 1.2).
248
+ auto_mode: Whether to enable automatic mode optimization.
249
+ enable_ssml_parsing: Whether to parse SSML tags in text.
250
+ enable_logging: Whether to enable ElevenLabs logging.
251
+ apply_text_normalization: Text normalization mode ("auto", "on", "off").
252
+ """
253
+
174
254
  language: Optional[Language] = None
175
255
  stability: Optional[float] = None
176
256
  similarity_boost: Optional[float] = None
@@ -180,18 +260,32 @@ class ElevenLabsTTSService(AudioContextWordTTSService):
180
260
  auto_mode: Optional[bool] = True
181
261
  enable_ssml_parsing: Optional[bool] = None
182
262
  enable_logging: Optional[bool] = None
263
+ apply_text_normalization: Optional[Literal["auto", "on", "off"]] = None
183
264
 
184
265
  def __init__(
185
266
  self,
186
267
  *,
187
268
  api_key: str,
188
269
  voice_id: str,
189
- model: str = "eleven_flash_v2_5",
270
+ model: str = "eleven_turbo_v2_5",
190
271
  url: str = "wss://api.elevenlabs.io",
191
272
  sample_rate: Optional[int] = None,
192
273
  params: Optional[InputParams] = None,
274
+ aggregate_sentences: Optional[bool] = True,
193
275
  **kwargs,
194
276
  ):
277
+ """Initialize the ElevenLabs TTS service.
278
+
279
+ Args:
280
+ api_key: ElevenLabs API key for authentication.
281
+ voice_id: ID of the voice to use for synthesis.
282
+ model: TTS model to use (e.g., "eleven_turbo_v2_5").
283
+ url: WebSocket URL for ElevenLabs TTS API.
284
+ sample_rate: Audio sample rate. If None, uses default.
285
+ params: Additional input parameters for voice customization.
286
+ aggregate_sentences: Whether to aggregate sentences within the TTSService.
287
+ **kwargs: Additional arguments passed to the parent service.
288
+ """
195
289
  # Aggregating sentences still gives cleaner-sounding results and fewer
196
290
  # artifacts than streaming one word at a time. On average, waiting for a
197
291
  # full sentence should only "cost" us 15ms or so with GPT-4o or a Llama
@@ -207,7 +301,7 @@ class ElevenLabsTTSService(AudioContextWordTTSService):
207
301
  # speaking for a while, so we want the parent class to send TTSStopFrame
208
302
  # after a short period not receiving any audio.
209
303
  super().__init__(
210
- aggregate_sentences=True,
304
+ aggregate_sentences=aggregate_sentences,
211
305
  push_text_frames=False,
212
306
  push_stop_frames=True,
213
307
  pause_frame_processing=True,
@@ -231,6 +325,7 @@ class ElevenLabsTTSService(AudioContextWordTTSService):
231
325
  "auto_mode": str(params.auto_mode).lower(),
232
326
  "enable_ssml_parsing": params.enable_ssml_parsing,
233
327
  "enable_logging": params.enable_logging,
328
+ "apply_text_normalization": params.apply_text_normalization,
234
329
  }
235
330
  self.set_model_name(model)
236
331
  self.set_voice(voice_id)
@@ -248,43 +343,114 @@ class ElevenLabsTTSService(AudioContextWordTTSService):
248
343
  self._keepalive_task = None
249
344
 
250
345
  def can_generate_metrics(self) -> bool:
346
+ """Check if this service can generate processing metrics.
347
+
348
+ Returns:
349
+ True, as ElevenLabs service supports metrics generation.
350
+ """
251
351
  return True
252
352
 
253
353
  def language_to_service_language(self, language: Language) -> Optional[str]:
354
+ """Convert a Language enum to ElevenLabs language format.
355
+
356
+ Args:
357
+ language: The language to convert.
358
+
359
+ Returns:
360
+ The ElevenLabs-specific language code, or None if not supported.
361
+ """
254
362
  return language_to_elevenlabs_language(language)
255
363
 
256
364
  def _set_voice_settings(self):
257
365
  return build_elevenlabs_voice_settings(self._settings)
258
366
 
259
367
  async def set_model(self, model: str):
368
+ """Set the TTS model and reconnect.
369
+
370
+ Args:
371
+ model: The model name to use for synthesis.
372
+ """
260
373
  await super().set_model(model)
261
374
  self.logger.info(f"Switching TTS model to: [{model}]")
262
375
  await self._disconnect()
263
376
  await self._connect()
264
377
 
265
378
  async def _update_settings(self, settings: Mapping[str, Any]):
379
+ """Update service settings and reconnect if voice, model, or language changed."""
380
+ # Track previous values for settings that require reconnection
266
381
  prev_voice = self._voice_id
382
+ prev_model = self.model_name
383
+ prev_language = self._settings.get("language")
384
+ # Create snapshot of current voice settings to detect changes after update
385
+ prev_voice_settings = self._voice_settings.copy() if self._voice_settings else None
386
+
267
387
  await super()._update_settings(settings)
268
- if not prev_voice == self._voice_id:
269
- logger.info(f"Switching TTS voice to: [{self._voice_id}]")
388
+
389
+ # Update voice settings for the next context creation
390
+ self._voice_settings = self._set_voice_settings()
391
+
392
+ # Check if URL-level settings changed (these require reconnection)
393
+ url_changed = (
394
+ prev_voice != self._voice_id
395
+ or prev_model != self.model_name
396
+ or prev_language != self._settings.get("language")
397
+ )
398
+
399
+ # Check if only voice settings changed (speed, stability, etc.)
400
+ voice_settings_changed = prev_voice_settings != self._voice_settings
401
+
402
+ if url_changed:
403
+ # These settings are in the WebSocket URL, so we need to reconnect
404
+ logger.debug(
405
+ f"URL-level setting changed (voice/model/language), reconnecting WebSocket"
406
+ )
270
407
  await self._disconnect()
271
408
  await self._connect()
272
409
  self.logger.info(f"Switching TTS voice to: [{self._voice_id}]")
410
+ elif voice_settings_changed and self._context_id:
411
+ # Voice settings can be updated by closing current context
412
+ # so new one gets created with updated voice settings
413
+ logger.debug(f"Voice settings changed, closing current context to apply changes")
414
+ try:
415
+ if self._websocket:
416
+ await self._websocket.send(
417
+ json.dumps({"context_id": self._context_id, "close_context": True})
418
+ )
419
+ except Exception as e:
420
+ logger.warning(f"Error closing context for voice settings update: {e}")
421
+ self._context_id = None
422
+ self._started = False
273
423
 
274
424
  async def start(self, frame: StartFrame):
425
+ """Start the ElevenLabs TTS service.
426
+
427
+ Args:
428
+ frame: The start frame containing initialization parameters.
429
+ """
275
430
  await super().start(frame)
276
431
  self._output_format = output_format_from_sample_rate(self.sample_rate)
277
432
  await self._connect()
278
433
 
279
434
  async def stop(self, frame: EndFrame):
435
+ """Stop the ElevenLabs TTS service.
436
+
437
+ Args:
438
+ frame: The end frame.
439
+ """
280
440
  await super().stop(frame)
281
441
  await self._disconnect()
282
442
 
283
443
  async def cancel(self, frame: CancelFrame):
444
+ """Cancel the ElevenLabs TTS service.
445
+
446
+ Args:
447
+ frame: The cancel frame.
448
+ """
284
449
  await super().cancel(frame)
285
450
  await self._disconnect()
286
451
 
287
452
  async def flush_audio(self):
453
+ """Flush any pending audio and finalize the current context."""
288
454
  if not self._context_id or not self._websocket:
289
455
  return
290
456
  self.logger.trace(f"{self}: flushing audio")
@@ -292,6 +458,12 @@ class ElevenLabsTTSService(AudioContextWordTTSService):
292
458
  await self._websocket.send(json.dumps(msg))
293
459
 
294
460
  async def push_frame(self, frame: Frame, direction: FrameDirection = FrameDirection.DOWNSTREAM):
461
+ """Push a frame and handle state changes.
462
+
463
+ Args:
464
+ frame: The frame to push.
465
+ direction: The direction to push the frame.
466
+ """
295
467
  await super().push_frame(frame, direction)
296
468
  if isinstance(frame, (TTSStoppedFrame, StartInterruptionFrame)):
297
469
  self._started = False
@@ -320,7 +492,7 @@ class ElevenLabsTTSService(AudioContextWordTTSService):
320
492
 
321
493
  async def _connect_websocket(self):
322
494
  try:
323
- if self._websocket and self._websocket.open:
495
+ if self._websocket and self._websocket.state is State.OPEN:
324
496
  return
325
497
 
326
498
  self.logger.debug("Connecting to ElevenLabs")
@@ -336,6 +508,9 @@ class ElevenLabsTTSService(AudioContextWordTTSService):
336
508
  if self._settings["enable_logging"]:
337
509
  url += f"&enable_logging={self._settings['enable_logging']}"
338
510
 
511
+ if self._settings["apply_text_normalization"] is not None:
512
+ url += f"&apply_text_normalization={self._settings['apply_text_normalization']}"
513
+
339
514
  # Language can only be used with the ELEVENLABS_MULTILINGUAL_MODELS
340
515
  language = self._settings["language"]
341
516
  if model in ELEVENLABS_MULTILINGUAL_MODELS and language is not None:
@@ -347,8 +522,8 @@ class ElevenLabsTTSService(AudioContextWordTTSService):
347
522
  )
348
523
 
349
524
  # Set max websocket message size to 16MB for large audio responses
350
- self._websocket = await websockets.connect(
351
- url, max_size=16 * 1024 * 1024, extra_headers={"xi-api-key": self._api_key}
525
+ self._websocket = await websocket_connect(
526
+ url, max_size=16 * 1024 * 1024, additional_headers={"xi-api-key": self._api_key}
352
527
  )
353
528
 
354
529
  except Exception as e:
@@ -366,6 +541,7 @@ class ElevenLabsTTSService(AudioContextWordTTSService):
366
541
  if self._context_id:
367
542
  await self._websocket.send(json.dumps({"close_socket": True}))
368
543
  await self._websocket.close()
544
+ logger.debug("Disconnected from ElevenLabs")
369
545
  except Exception as e:
370
546
  self.logger.error(f"{self} error closing websocket: {e}")
371
547
 
@@ -375,6 +551,7 @@ class ElevenLabsTTSService(AudioContextWordTTSService):
375
551
  raise Exception("Websocket not connected")
376
552
 
377
553
  async def _handle_interruption(self, frame: StartInterruptionFrame, direction: FrameDirection):
554
+ """Handle interruption by closing the current context."""
378
555
  await super()._handle_interruption(frame, direction)
379
556
 
380
557
  # Close the current context when interrupted without closing the websocket
@@ -396,9 +573,8 @@ class ElevenLabsTTSService(AudioContextWordTTSService):
396
573
  self._started = False
397
574
 
398
575
  async def _receive_messages(self):
399
- async for message in WatchdogAsyncIterator(
400
- self._get_websocket(), manager=self.task_manager
401
- ):
576
+ """Handle incoming WebSocket messages from ElevenLabs."""
577
+ async for message in self._get_websocket():
402
578
  msg = json.loads(message)
403
579
 
404
580
  received_ctx_id = msg.get("contextId")
@@ -411,10 +587,18 @@ class ElevenLabsTTSService(AudioContextWordTTSService):
411
587
  continue
412
588
 
413
589
  # Check if this message belongs to the current context.
414
- # This should never happen, so warn about it.
415
590
  if not self.audio_context_available(received_ctx_id):
416
- logger.warning(f"Ignoring message from unavailable context: {received_ctx_id}")
417
- continue
591
+ if self._context_id == received_ctx_id:
592
+ logger.debug(
593
+ f"Received a delayed message, recreating the context: {self._context_id}"
594
+ )
595
+ await self.create_audio_context(self._context_id)
596
+ else:
597
+ # This can happen if a message is received _after_ we have closed a context
598
+ # due to user interruption but _before_ the `isFinal` message for the context
599
+ # is received.
600
+ logger.debug(f"Ignoring message from unavailable context: {received_ctx_id}")
601
+ continue
418
602
 
419
603
  if msg.get("audio"):
420
604
  await self.stop_ttfb_metrics()
@@ -423,18 +607,37 @@ class ElevenLabsTTSService(AudioContextWordTTSService):
423
607
  audio = base64.b64decode(msg["audio"])
424
608
  frame = TTSAudioRawFrame(audio, self.sample_rate, 1)
425
609
  await self.append_to_audio_context(received_ctx_id, frame)
610
+
426
611
  if msg.get("alignment"):
427
- word_times = calculate_word_times(msg["alignment"], self._cumulative_time)
428
- await self.add_word_timestamps(word_times)
429
- self._cumulative_time = word_times[-1][1]
612
+ alignment = msg["alignment"]
613
+ word_times = calculate_word_times(alignment, self._cumulative_time)
614
+
615
+ if word_times:
616
+ await self.add_word_timestamps(word_times)
617
+
618
+ # Calculate the actual end time of this audio chunk
619
+ char_start_times_ms = alignment.get("charStartTimesMs", [])
620
+ char_durations_ms = alignment.get("charDurationsMs", [])
621
+
622
+ if char_start_times_ms and char_durations_ms:
623
+ # End time = start time of last character + duration of last character
624
+ chunk_end_time_ms = char_start_times_ms[-1] + char_durations_ms[-1]
625
+ chunk_end_time_seconds = chunk_end_time_ms / 1000.0
626
+ self._cumulative_time += chunk_end_time_seconds
627
+ else:
628
+ # Fallback: use the last word's start time (current behavior)
629
+ self._cumulative_time = word_times[-1][1]
630
+ logger.warning(
631
+ "_receive_messages: using fallback timing method - consider investigating alignment data structure"
632
+ )
430
633
 
431
634
  async def _keepalive_task_handler(self):
432
- KEEPALIVE_SLEEP = 10 if self.task_manager.task_watchdog_enabled else 3
635
+ """Send periodic keepalive messages to maintain WebSocket connection."""
636
+ KEEPALIVE_SLEEP = 10
433
637
  while True:
434
- self.reset_watchdog()
435
638
  await asyncio.sleep(KEEPALIVE_SLEEP)
436
639
  try:
437
- if self._websocket and self._websocket.open:
640
+ if self._websocket and self._websocket.state is State.OPEN:
438
641
  if self._context_id:
439
642
  # Send keepalive with context ID to keep the connection alive
440
643
  keepalive_message = {
@@ -454,16 +657,25 @@ class ElevenLabsTTSService(AudioContextWordTTSService):
454
657
  break
455
658
 
456
659
  async def _send_text(self, text: str):
660
+ """Send text to the WebSocket for synthesis."""
457
661
  if self._websocket and self._context_id:
458
662
  msg = {"text": text, "context_id": self._context_id}
459
663
  await self._websocket.send(json.dumps(msg))
460
664
 
461
665
  @traced_tts
462
666
  async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
667
+ """Generate speech from text using ElevenLabs' streaming WebSocket API.
668
+
669
+ Args:
670
+ text: The text to synthesize into speech.
671
+
672
+ Yields:
673
+ Frame: Audio frames containing the synthesized speech.
674
+ """
463
675
  self.logger.debug(f"{self}: Generating TTS [{text}]")
464
676
 
465
677
  try:
466
- if not self._websocket or self._websocket.closed:
678
+ if not self._websocket or self._websocket.state is State.CLOSED:
467
679
  await self._connect()
468
680
  self.logger.debug("Connected to ElevenLabs")
469
681
 
@@ -473,9 +685,16 @@ class ElevenLabsTTSService(AudioContextWordTTSService):
473
685
  yield TTSStartedFrame()
474
686
  self._started = True
475
687
  self._cumulative_time = 0
476
- # Create new context ID and register it
477
- self._context_id = str(uuid.uuid4())
478
- await self.create_audio_context(self._context_id)
688
+ # If a context ID does not exist, create a new one and
689
+ # register it. If an ID exists, that means the Pipeline is
690
+ # configured for allow_interruptions=False, so continue
691
+ # using the current ID. When interruptions are enabled
692
+ # (e.g. allow_interruptions=True), user speech results in
693
+ # an interruption, which resets the context ID.
694
+ if not self._context_id:
695
+ self._context_id = str(uuid.uuid4())
696
+ if not self.audio_context_available(self._context_id):
697
+ await self.create_audio_context(self._context_id)
479
698
 
480
699
  # Initialize context with voice settings
481
700
  msg = {"text": " ", "context_id": self._context_id}
@@ -499,19 +718,27 @@ class ElevenLabsTTSService(AudioContextWordTTSService):
499
718
 
500
719
 
501
720
  class ElevenLabsHttpTTSService(WordTTSService):
502
- """ElevenLabs Text-to-Speech service using HTTP streaming with word timestamps.
721
+ """ElevenLabs HTTP-based TTS service with word timestamps.
503
722
 
504
- Args:
505
- api_key: ElevenLabs API key
506
- voice_id: ID of the voice to use
507
- aiohttp_session: aiohttp ClientSession
508
- model: Model ID (default: "eleven_flash_v2_5" for low latency)
509
- base_url: API base URL
510
- sample_rate: Output sample rate
511
- params: Additional parameters for voice configuration
723
+ Provides text-to-speech using ElevenLabs' HTTP streaming API for simpler,
724
+ non-WebSocket integration. Suitable for use cases where streaming WebSocket
725
+ connection is not required or desired.
512
726
  """
513
727
 
514
728
  class InputParams(BaseModel):
729
+ """Input parameters for ElevenLabs HTTP TTS configuration.
730
+
731
+ Parameters:
732
+ language: Language to use for synthesis.
733
+ optimize_streaming_latency: Latency optimization level (0-4).
734
+ stability: Voice stability control (0.0 to 1.0).
735
+ similarity_boost: Similarity boost control (0.0 to 1.0).
736
+ style: Style control for voice expression (0.0 to 1.0).
737
+ use_speaker_boost: Whether to use speaker boost enhancement.
738
+ speed: Voice speed control (0.25 to 4.0).
739
+ apply_text_normalization: Text normalization mode ("auto", "on", "off").
740
+ """
741
+
515
742
  language: Optional[Language] = None
516
743
  optimize_streaming_latency: Optional[int] = None
517
744
  stability: Optional[float] = None
@@ -519,6 +746,7 @@ class ElevenLabsHttpTTSService(WordTTSService):
519
746
  style: Optional[float] = None
520
747
  use_speaker_boost: Optional[bool] = None
521
748
  speed: Optional[float] = None
749
+ apply_text_normalization: Optional[Literal["auto", "on", "off"]] = None
522
750
 
523
751
  def __init__(
524
752
  self,
@@ -526,12 +754,24 @@ class ElevenLabsHttpTTSService(WordTTSService):
526
754
  api_key: str,
527
755
  voice_id: str,
528
756
  aiohttp_session: aiohttp.ClientSession,
529
- model: str = "eleven_flash_v2_5",
757
+ model: str = "eleven_turbo_v2_5",
530
758
  base_url: str = "https://api.elevenlabs.io",
531
759
  sample_rate: Optional[int] = None,
532
760
  params: Optional[InputParams] = None,
533
761
  **kwargs,
534
762
  ):
763
+ """Initialize the ElevenLabs HTTP TTS service.
764
+
765
+ Args:
766
+ api_key: ElevenLabs API key for authentication.
767
+ voice_id: ID of the voice to use for synthesis.
768
+ aiohttp_session: aiohttp ClientSession for HTTP requests.
769
+ model: TTS model to use (e.g., "eleven_turbo_v2_5").
770
+ base_url: Base URL for ElevenLabs HTTP API.
771
+ sample_rate: Audio sample rate. If None, uses default.
772
+ params: Additional input parameters for voice customization.
773
+ **kwargs: Additional arguments passed to the parent service.
774
+ """
535
775
  super().__init__(
536
776
  aggregate_sentences=True,
537
777
  push_text_frames=False,
@@ -557,6 +797,7 @@ class ElevenLabsHttpTTSService(WordTTSService):
557
797
  "style": params.style,
558
798
  "use_speaker_boost": params.use_speaker_boost,
559
799
  "speed": params.speed,
800
+ "apply_text_normalization": params.apply_text_normalization,
560
801
  }
561
802
  self.set_model_name(model)
562
803
  self.set_voice(voice_id)
@@ -571,11 +812,22 @@ class ElevenLabsHttpTTSService(WordTTSService):
571
812
  self._previous_text = ""
572
813
 
573
814
  def language_to_service_language(self, language: Language) -> Optional[str]:
574
- """Convert pipecat Language to ElevenLabs language code."""
815
+ """Convert pipecat Language to ElevenLabs language code.
816
+
817
+ Args:
818
+ language: The language to convert.
819
+
820
+ Returns:
821
+ The ElevenLabs-specific language code, or None if not supported.
822
+ """
575
823
  return language_to_elevenlabs_language(language)
576
824
 
577
825
  def can_generate_metrics(self) -> bool:
578
- """Indicate that this service can generate usage metrics."""
826
+ """Check if this service can generate processing metrics.
827
+
828
+ Returns:
829
+ True, as ElevenLabs HTTP service supports metrics generation.
830
+ """
579
831
  return True
580
832
 
581
833
  def _set_voice_settings(self):
@@ -589,12 +841,22 @@ class ElevenLabsHttpTTSService(WordTTSService):
589
841
  logger.debug(f"{self}: Reset internal state")
590
842
 
591
843
  async def start(self, frame: StartFrame):
592
- """Initialize the service upon receiving a StartFrame."""
844
+ """Start the ElevenLabs HTTP TTS service.
845
+
846
+ Args:
847
+ frame: The start frame containing initialization parameters.
848
+ """
593
849
  await super().start(frame)
594
850
  self._output_format = output_format_from_sample_rate(self.sample_rate)
595
851
  self._reset_state()
596
852
 
597
853
  async def push_frame(self, frame: Frame, direction: FrameDirection = FrameDirection.DOWNSTREAM):
854
+ """Push a frame and handle state changes.
855
+
856
+ Args:
857
+ frame: The frame to push.
858
+ direction: The direction to push the frame.
859
+ """
598
860
  await super().push_frame(frame, direction)
599
861
  if isinstance(frame, (StartInterruptionFrame, TTSStoppedFrame)):
600
862
  # Reset timing on interruption or stop
@@ -610,21 +872,23 @@ class ElevenLabsHttpTTSService(WordTTSService):
610
872
  def calculate_word_times(self, alignment_info: Mapping[str, Any]) -> List[Tuple[str, float]]:
611
873
  """Calculate word timing from character alignment data.
612
874
 
613
- Example input data:
614
- {
615
- "characters": [" ", "H", "e", "l", "l", "o", " ", "w", "o", "r", "l", "d"],
616
- "character_start_times_seconds": [0.0, 0.1, 0.15, 0.2, 0.25, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
617
- "character_end_times_seconds": [0.1, 0.15, 0.2, 0.25, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
618
- }
619
-
620
- Would produce word times (with cumulative_time=0):
621
- [("Hello", 0.1), ("world", 0.5)]
622
-
623
875
  Args:
624
- alignment_info: Character timing data from ElevenLabs
876
+ alignment_info: Character timing data from ElevenLabs.
625
877
 
626
878
  Returns:
627
- List of (word, timestamp) pairs
879
+ List of (word, timestamp) pairs.
880
+
881
+ Example input data::
882
+
883
+ {
884
+ "characters": [" ", "H", "e", "l", "l", "o", " ", "w", "o", "r", "l", "d"],
885
+ "character_start_times_seconds": [0.0, 0.1, 0.15, 0.2, 0.25, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
886
+ "character_end_times_seconds": [0.1, 0.15, 0.2, 0.25, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
887
+ }
888
+
889
+ Would produce word times (with cumulative_time=0)::
890
+
891
+ [("Hello", 0.1), ("world", 0.5)]
628
892
  """
629
893
  chars = alignment_info.get("characters", [])
630
894
  char_start_times = alignment_info.get("character_start_times_seconds", [])
@@ -675,10 +939,10 @@ class ElevenLabsHttpTTSService(WordTTSService):
675
939
  Includes previous text as context for better prosody continuity.
676
940
 
677
941
  Args:
678
- text: Text to convert to speech
942
+ text: Text to convert to speech.
679
943
 
680
944
  Yields:
681
- Audio and control frames
945
+ Frame: Audio and control frames containing the synthesized speech.
682
946
  """
683
947
  self.logger.debug(f"{self}: Generating TTS [{text}]")
684
948
 
@@ -717,6 +981,8 @@ class ElevenLabsHttpTTSService(WordTTSService):
717
981
  }
718
982
  if self._settings["optimize_streaming_latency"] is not None:
719
983
  params["optimize_streaming_latency"] = self._settings["optimize_streaming_latency"]
984
+ if self._settings["apply_text_normalization"] is not None:
985
+ params["apply_text_normalization"] = self._settings["apply_text_normalization"]
720
986
 
721
987
  self.logger.debug(f"ElevenLabs request - payload: {payload}, params: {params}")
722
988