dv-pipecat-ai 0.0.82.dev857__py3-none-any.whl → 0.0.85.dev837__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.
Files changed (195)
  1. {dv_pipecat_ai-0.0.82.dev857.dist-info → dv_pipecat_ai-0.0.85.dev837.dist-info}/METADATA +98 -130
  2. {dv_pipecat_ai-0.0.82.dev857.dist-info → dv_pipecat_ai-0.0.85.dev837.dist-info}/RECORD +192 -140
  3. pipecat/adapters/base_llm_adapter.py +38 -1
  4. pipecat/adapters/services/anthropic_adapter.py +9 -14
  5. pipecat/adapters/services/aws_nova_sonic_adapter.py +120 -5
  6. pipecat/adapters/services/bedrock_adapter.py +236 -13
  7. pipecat/adapters/services/gemini_adapter.py +12 -8
  8. pipecat/adapters/services/open_ai_adapter.py +19 -7
  9. pipecat/adapters/services/open_ai_realtime_adapter.py +5 -0
  10. pipecat/audio/dtmf/dtmf-0.wav +0 -0
  11. pipecat/audio/dtmf/dtmf-1.wav +0 -0
  12. pipecat/audio/dtmf/dtmf-2.wav +0 -0
  13. pipecat/audio/dtmf/dtmf-3.wav +0 -0
  14. pipecat/audio/dtmf/dtmf-4.wav +0 -0
  15. pipecat/audio/dtmf/dtmf-5.wav +0 -0
  16. pipecat/audio/dtmf/dtmf-6.wav +0 -0
  17. pipecat/audio/dtmf/dtmf-7.wav +0 -0
  18. pipecat/audio/dtmf/dtmf-8.wav +0 -0
  19. pipecat/audio/dtmf/dtmf-9.wav +0 -0
  20. pipecat/audio/dtmf/dtmf-pound.wav +0 -0
  21. pipecat/audio/dtmf/dtmf-star.wav +0 -0
  22. pipecat/audio/filters/krisp_viva_filter.py +193 -0
  23. pipecat/audio/filters/noisereduce_filter.py +15 -0
  24. pipecat/audio/turn/base_turn_analyzer.py +9 -1
  25. pipecat/audio/turn/smart_turn/base_smart_turn.py +14 -8
  26. pipecat/audio/turn/smart_turn/data/__init__.py +0 -0
  27. pipecat/audio/turn/smart_turn/data/smart-turn-v3.0.onnx +0 -0
  28. pipecat/audio/turn/smart_turn/http_smart_turn.py +6 -2
  29. pipecat/audio/turn/smart_turn/local_smart_turn.py +1 -1
  30. pipecat/audio/turn/smart_turn/local_smart_turn_v2.py +1 -1
  31. pipecat/audio/turn/smart_turn/local_smart_turn_v3.py +124 -0
  32. pipecat/audio/vad/data/README.md +10 -0
  33. pipecat/audio/vad/data/silero_vad_v2.onnx +0 -0
  34. pipecat/audio/vad/silero.py +9 -3
  35. pipecat/audio/vad/vad_analyzer.py +13 -1
  36. pipecat/extensions/voicemail/voicemail_detector.py +5 -5
  37. pipecat/frames/frames.py +277 -86
  38. pipecat/observers/loggers/debug_log_observer.py +3 -3
  39. pipecat/observers/loggers/llm_log_observer.py +7 -3
  40. pipecat/observers/loggers/user_bot_latency_log_observer.py +22 -10
  41. pipecat/pipeline/runner.py +18 -6
  42. pipecat/pipeline/service_switcher.py +64 -36
  43. pipecat/pipeline/task.py +125 -79
  44. pipecat/pipeline/tts_switcher.py +30 -0
  45. pipecat/processors/aggregators/dtmf_aggregator.py +2 -3
  46. pipecat/processors/aggregators/{gated_openai_llm_context.py → gated_llm_context.py} +9 -9
  47. pipecat/processors/aggregators/gated_open_ai_llm_context.py +12 -0
  48. pipecat/processors/aggregators/llm_context.py +40 -2
  49. pipecat/processors/aggregators/llm_response.py +32 -15
  50. pipecat/processors/aggregators/llm_response_universal.py +19 -15
  51. pipecat/processors/aggregators/user_response.py +6 -6
  52. pipecat/processors/aggregators/vision_image_frame.py +24 -2
  53. pipecat/processors/audio/audio_buffer_processor.py +43 -8
  54. pipecat/processors/dtmf_aggregator.py +174 -77
  55. pipecat/processors/filters/stt_mute_filter.py +17 -0
  56. pipecat/processors/frame_processor.py +110 -24
  57. pipecat/processors/frameworks/langchain.py +8 -2
  58. pipecat/processors/frameworks/rtvi.py +210 -68
  59. pipecat/processors/frameworks/strands_agents.py +170 -0
  60. pipecat/processors/logger.py +2 -2
  61. pipecat/processors/transcript_processor.py +26 -5
  62. pipecat/processors/user_idle_processor.py +35 -11
  63. pipecat/runner/daily.py +59 -20
  64. pipecat/runner/run.py +395 -93
  65. pipecat/runner/types.py +6 -4
  66. pipecat/runner/utils.py +51 -10
  67. pipecat/serializers/__init__.py +5 -1
  68. pipecat/serializers/asterisk.py +16 -2
  69. pipecat/serializers/convox.py +41 -4
  70. pipecat/serializers/custom.py +257 -0
  71. pipecat/serializers/exotel.py +5 -5
  72. pipecat/serializers/livekit.py +20 -0
  73. pipecat/serializers/plivo.py +5 -5
  74. pipecat/serializers/protobuf.py +6 -5
  75. pipecat/serializers/telnyx.py +2 -2
  76. pipecat/serializers/twilio.py +43 -23
  77. pipecat/serializers/vi.py +324 -0
  78. pipecat/services/ai_service.py +2 -6
  79. pipecat/services/anthropic/llm.py +2 -25
  80. pipecat/services/assemblyai/models.py +6 -0
  81. pipecat/services/assemblyai/stt.py +13 -5
  82. pipecat/services/asyncai/tts.py +5 -3
  83. pipecat/services/aws/__init__.py +1 -0
  84. pipecat/services/aws/llm.py +147 -105
  85. pipecat/services/aws/nova_sonic/__init__.py +0 -0
  86. pipecat/services/aws/nova_sonic/context.py +436 -0
  87. pipecat/services/aws/nova_sonic/frames.py +25 -0
  88. pipecat/services/aws/nova_sonic/llm.py +1265 -0
  89. pipecat/services/aws/stt.py +3 -3
  90. pipecat/services/aws_nova_sonic/__init__.py +19 -1
  91. pipecat/services/aws_nova_sonic/aws.py +11 -1151
  92. pipecat/services/aws_nova_sonic/context.py +8 -354
  93. pipecat/services/aws_nova_sonic/frames.py +13 -17
  94. pipecat/services/azure/llm.py +51 -1
  95. pipecat/services/azure/realtime/__init__.py +0 -0
  96. pipecat/services/azure/realtime/llm.py +65 -0
  97. pipecat/services/azure/stt.py +15 -0
  98. pipecat/services/cartesia/stt.py +77 -70
  99. pipecat/services/cartesia/tts.py +80 -13
  100. pipecat/services/deepgram/__init__.py +1 -0
  101. pipecat/services/deepgram/flux/__init__.py +0 -0
  102. pipecat/services/deepgram/flux/stt.py +640 -0
  103. pipecat/services/elevenlabs/__init__.py +4 -1
  104. pipecat/services/elevenlabs/stt.py +339 -0
  105. pipecat/services/elevenlabs/tts.py +87 -46
  106. pipecat/services/fish/tts.py +5 -2
  107. pipecat/services/gemini_multimodal_live/events.py +38 -524
  108. pipecat/services/gemini_multimodal_live/file_api.py +23 -173
  109. pipecat/services/gemini_multimodal_live/gemini.py +41 -1403
  110. pipecat/services/gladia/stt.py +56 -72
  111. pipecat/services/google/__init__.py +1 -0
  112. pipecat/services/google/gemini_live/__init__.py +3 -0
  113. pipecat/services/google/gemini_live/file_api.py +189 -0
  114. pipecat/services/google/gemini_live/llm.py +1582 -0
  115. pipecat/services/google/gemini_live/llm_vertex.py +184 -0
  116. pipecat/services/google/llm.py +15 -11
  117. pipecat/services/google/llm_openai.py +3 -3
  118. pipecat/services/google/llm_vertex.py +86 -16
  119. pipecat/services/google/stt.py +4 -0
  120. pipecat/services/google/tts.py +7 -3
  121. pipecat/services/heygen/api.py +2 -0
  122. pipecat/services/heygen/client.py +8 -4
  123. pipecat/services/heygen/video.py +2 -0
  124. pipecat/services/hume/__init__.py +5 -0
  125. pipecat/services/hume/tts.py +220 -0
  126. pipecat/services/inworld/tts.py +6 -6
  127. pipecat/services/llm_service.py +15 -5
  128. pipecat/services/lmnt/tts.py +4 -2
  129. pipecat/services/mcp_service.py +4 -2
  130. pipecat/services/mem0/memory.py +6 -5
  131. pipecat/services/mistral/llm.py +29 -8
  132. pipecat/services/moondream/vision.py +42 -16
  133. pipecat/services/neuphonic/tts.py +5 -2
  134. pipecat/services/openai/__init__.py +1 -0
  135. pipecat/services/openai/base_llm.py +27 -20
  136. pipecat/services/openai/realtime/__init__.py +0 -0
  137. pipecat/services/openai/realtime/context.py +272 -0
  138. pipecat/services/openai/realtime/events.py +1106 -0
  139. pipecat/services/openai/realtime/frames.py +37 -0
  140. pipecat/services/openai/realtime/llm.py +829 -0
  141. pipecat/services/openai/tts.py +49 -10
  142. pipecat/services/openai_realtime/__init__.py +27 -0
  143. pipecat/services/openai_realtime/azure.py +21 -0
  144. pipecat/services/openai_realtime/context.py +21 -0
  145. pipecat/services/openai_realtime/events.py +21 -0
  146. pipecat/services/openai_realtime/frames.py +21 -0
  147. pipecat/services/openai_realtime_beta/azure.py +16 -0
  148. pipecat/services/openai_realtime_beta/openai.py +17 -5
  149. pipecat/services/piper/tts.py +7 -9
  150. pipecat/services/playht/tts.py +34 -4
  151. pipecat/services/rime/tts.py +12 -12
  152. pipecat/services/riva/stt.py +3 -1
  153. pipecat/services/salesforce/__init__.py +9 -0
  154. pipecat/services/salesforce/llm.py +700 -0
  155. pipecat/services/sarvam/__init__.py +7 -0
  156. pipecat/services/sarvam/stt.py +540 -0
  157. pipecat/services/sarvam/tts.py +97 -13
  158. pipecat/services/simli/video.py +2 -2
  159. pipecat/services/speechmatics/stt.py +22 -10
  160. pipecat/services/stt_service.py +47 -0
  161. pipecat/services/tavus/video.py +2 -2
  162. pipecat/services/tts_service.py +75 -22
  163. pipecat/services/vision_service.py +7 -6
  164. pipecat/services/vistaar/llm.py +51 -9
  165. pipecat/tests/utils.py +4 -4
  166. pipecat/transcriptions/language.py +41 -1
  167. pipecat/transports/base_input.py +13 -34
  168. pipecat/transports/base_output.py +140 -104
  169. pipecat/transports/daily/transport.py +199 -26
  170. pipecat/transports/heygen/__init__.py +0 -0
  171. pipecat/transports/heygen/transport.py +381 -0
  172. pipecat/transports/livekit/transport.py +228 -63
  173. pipecat/transports/local/audio.py +6 -1
  174. pipecat/transports/local/tk.py +11 -2
  175. pipecat/transports/network/fastapi_websocket.py +1 -1
  176. pipecat/transports/smallwebrtc/connection.py +103 -19
  177. pipecat/transports/smallwebrtc/request_handler.py +246 -0
  178. pipecat/transports/smallwebrtc/transport.py +65 -23
  179. pipecat/transports/tavus/transport.py +23 -12
  180. pipecat/transports/websocket/client.py +41 -5
  181. pipecat/transports/websocket/fastapi.py +21 -11
  182. pipecat/transports/websocket/server.py +14 -7
  183. pipecat/transports/whatsapp/api.py +8 -0
  184. pipecat/transports/whatsapp/client.py +47 -0
  185. pipecat/utils/base_object.py +54 -22
  186. pipecat/utils/redis.py +58 -0
  187. pipecat/utils/string.py +13 -1
  188. pipecat/utils/tracing/service_decorators.py +21 -21
  189. pipecat/serializers/genesys.py +0 -95
  190. pipecat/services/google/test-google-chirp.py +0 -45
  191. pipecat/services/openai.py +0 -698
  192. {dv_pipecat_ai-0.0.82.dev857.dist-info → dv_pipecat_ai-0.0.85.dev837.dist-info}/WHEEL +0 -0
  193. {dv_pipecat_ai-0.0.82.dev857.dist-info → dv_pipecat_ai-0.0.85.dev837.dist-info}/licenses/LICENSE +0 -0
  194. {dv_pipecat_ai-0.0.82.dev857.dist-info → dv_pipecat_ai-0.0.85.dev837.dist-info}/top_level.txt +0 -0
  195. /pipecat/services/{aws_nova_sonic → aws/nova_sonic}/ready.wav +0 -0
pipecat/services/elevenlabs/stt.py

@@ -0,0 +1,339 @@
+#
+# Copyright (c) 2024–2025, Daily
+#
+# SPDX-License-Identifier: BSD 2-Clause License
+#
+
+"""ElevenLabs speech-to-text service implementation.
+
+This module provides integration with ElevenLabs' Speech-to-Text API for transcription
+using segmented audio processing. The service uploads audio files and receives
+transcription results directly.
+"""
+
+import io
+from typing import AsyncGenerator, Optional
+
+import aiohttp
+from loguru import logger
+from pydantic import BaseModel
+
+from pipecat.frames.frames import ErrorFrame, Frame, TranscriptionFrame
+from pipecat.services.stt_service import SegmentedSTTService
+from pipecat.transcriptions.language import Language
+from pipecat.utils.time import time_now_iso8601
+from pipecat.utils.tracing.service_decorators import traced_stt
+
+
+def language_to_elevenlabs_language(language: Language) -> Optional[str]:
+    """Convert a Language enum to ElevenLabs language code.
+
+    Source:
+        https://elevenlabs.io/docs/capabilities/speech-to-text
+
+    Args:
+        language: The Language enum value to convert.
+
+    Returns:
+        The corresponding ElevenLabs language code, or None if not supported.
+    """
+    BASE_LANGUAGES = {
+        Language.AF: "afr",  # Afrikaans
+        Language.AM: "amh",  # Amharic
+        Language.AR: "ara",  # Arabic
+        Language.HY: "hye",  # Armenian
+        Language.AS: "asm",  # Assamese
+        Language.AST: "ast",  # Asturian
+        Language.AZ: "aze",  # Azerbaijani
+        Language.BE: "bel",  # Belarusian
+        Language.BN: "ben",  # Bengali
+        Language.BS: "bos",  # Bosnian
+        Language.BG: "bul",  # Bulgarian
+        Language.MY: "mya",  # Burmese
+        Language.YUE: "yue",  # Cantonese
+        Language.CA: "cat",  # Catalan
+        Language.CEB: "ceb",  # Cebuano
+        Language.NY: "nya",  # Chichewa
+        Language.HR: "hrv",  # Croatian
+        Language.CS: "ces",  # Czech
+        Language.DA: "dan",  # Danish
+        Language.NL: "nld",  # Dutch
+        Language.EN: "eng",  # English
+        Language.ET: "est",  # Estonian
+        Language.FIL: "fil",  # Filipino
+        Language.FI: "fin",  # Finnish
+        Language.FR: "fra",  # French
+        Language.FF: "ful",  # Fulah
+        Language.GL: "glg",  # Galician
+        Language.LG: "lug",  # Ganda
+        Language.KA: "kat",  # Georgian
+        Language.DE: "deu",  # German
+        Language.EL: "ell",  # Greek
+        Language.GU: "guj",  # Gujarati
+        Language.HA: "hau",  # Hausa
+        Language.HE: "heb",  # Hebrew
+        Language.HI: "hin",  # Hindi
+        Language.HU: "hun",  # Hungarian
+        Language.IS: "isl",  # Icelandic
+        Language.IG: "ibo",  # Igbo
+        Language.ID: "ind",  # Indonesian
+        Language.GA: "gle",  # Irish
+        Language.IT: "ita",  # Italian
+        Language.JA: "jpn",  # Japanese
+        Language.JV: "jav",  # Javanese
+        Language.KEA: "kea",  # Kabuverdianu
+        Language.KN: "kan",  # Kannada
+        Language.KK: "kaz",  # Kazakh
+        Language.KM: "khm",  # Khmer
+        Language.KO: "kor",  # Korean
+        Language.KU: "kur",  # Kurdish
+        Language.KY: "kir",  # Kyrgyz
+        Language.LO: "lao",  # Lao
+        Language.LV: "lav",  # Latvian
+        Language.LN: "lin",  # Lingala
+        Language.LT: "lit",  # Lithuanian
+        Language.LUO: "luo",  # Luo
+        Language.LB: "ltz",  # Luxembourgish
+        Language.MK: "mkd",  # Macedonian
+        Language.MS: "msa",  # Malay
+        Language.ML: "mal",  # Malayalam
+        Language.MT: "mlt",  # Maltese
+        Language.ZH: "zho",  # Mandarin Chinese
+        Language.MI: "mri",  # Māori
+        Language.MR: "mar",  # Marathi
+        Language.MN: "mon",  # Mongolian
+        Language.NE: "nep",  # Nepali
+        Language.NSO: "nso",  # Northern Sotho
+        Language.NO: "nor",  # Norwegian
+        Language.OC: "oci",  # Occitan
+        Language.OR: "ori",  # Odia
+        Language.PS: "pus",  # Pashto
+        Language.FA: "fas",  # Persian
+        Language.PL: "pol",  # Polish
+        Language.PT: "por",  # Portuguese
+        Language.PA: "pan",  # Punjabi
+        Language.RO: "ron",  # Romanian
+        Language.RU: "rus",  # Russian
+        Language.SR: "srp",  # Serbian
+        Language.SN: "sna",  # Shona
+        Language.SD: "snd",  # Sindhi
+        Language.SK: "slk",  # Slovak
+        Language.SL: "slv",  # Slovenian
+        Language.SO: "som",  # Somali
+        Language.ES: "spa",  # Spanish
+        Language.SW: "swa",  # Swahili
+        Language.SV: "swe",  # Swedish
+        Language.TA: "tam",  # Tamil
+        Language.TG: "tgk",  # Tajik
+        Language.TE: "tel",  # Telugu
+        Language.TH: "tha",  # Thai
+        Language.TR: "tur",  # Turkish
+        Language.UK: "ukr",  # Ukrainian
+        Language.UMB: "umb",  # Umbundu
+        Language.UR: "urd",  # Urdu
+        Language.UZ: "uzb",  # Uzbek
+        Language.VI: "vie",  # Vietnamese
+        Language.CY: "cym",  # Welsh
+        Language.WO: "wol",  # Wolof
+        Language.XH: "xho",  # Xhosa
+        Language.ZU: "zul",  # Zulu
+    }
+
+    result = BASE_LANGUAGES.get(language)
+
+    # If not found in base languages, try to find the base language from a variant
+    if not result:
+        lang_str = str(language.value)
+        base_code = lang_str.split("-")[0].lower()
+        result = base_code if base_code in BASE_LANGUAGES.values() else None
+
+    return result
+
+
+class ElevenLabsSTTService(SegmentedSTTService):
+    """Speech-to-text service using ElevenLabs' file-based API.
+
+    This service uses ElevenLabs' Speech-to-Text API to perform transcription on audio
+    segments. It inherits from SegmentedSTTService to handle audio buffering and speech detection.
+    The service uploads audio files to ElevenLabs and receives transcription results directly.
+    """
+
+    class InputParams(BaseModel):
+        """Configuration parameters for ElevenLabs STT API.
+
+        Parameters:
+            language: Target language for transcription.
+            tag_audio_events: Whether to include audio events like (laughter) and (coughing) in the transcription.
+        """
+
+        language: Optional[Language] = None
+        tag_audio_events: bool = True
+
+    def __init__(
+        self,
+        *,
+        api_key: str,
+        aiohttp_session: aiohttp.ClientSession,
+        base_url: str = "https://api.elevenlabs.io",
+        model: str = "scribe_v1",
+        sample_rate: Optional[int] = None,
+        params: Optional[InputParams] = None,
+        **kwargs,
+    ):
+        """Initialize the ElevenLabs STT service.
+
+        Args:
+            api_key: ElevenLabs API key for authentication.
+            aiohttp_session: aiohttp ClientSession for HTTP requests.
+            base_url: Base URL for ElevenLabs API.
+            model: Model ID for transcription. Defaults to "scribe_v1".
+            sample_rate: Audio sample rate in Hz. If not provided, uses the pipeline's rate.
+            params: Configuration parameters for the STT service.
+            **kwargs: Additional arguments passed to SegmentedSTTService.
+        """
+        super().__init__(
+            sample_rate=sample_rate,
+            **kwargs,
+        )
+
+        params = params or ElevenLabsSTTService.InputParams()
+
+        self._api_key = api_key
+        self._base_url = base_url
+        self._session = aiohttp_session
+        self._model_id = model
+        self._tag_audio_events = params.tag_audio_events
+
+        self._settings = {
+            "language": self.language_to_service_language(params.language)
+            if params.language
+            else "eng",
+        }
+
+    def can_generate_metrics(self) -> bool:
+        """Check if the service can generate processing metrics.
+
+        Returns:
+            True, as ElevenLabs STT service supports metrics generation.
+        """
+        return True
+
+    def language_to_service_language(self, language: Language) -> Optional[str]:
+        """Convert a Language enum to ElevenLabs service-specific language code.
+
+        Args:
+            language: The language to convert.
+
+        Returns:
+            The ElevenLabs-specific language code, or None if not supported.
+        """
+        return language_to_elevenlabs_language(language)
+
+    async def set_language(self, language: Language):
+        """Set the transcription language.
+
+        Args:
+            language: The language to use for speech-to-text transcription.
+        """
+        self.logger.info(f"Switching STT language to: [{language}]")
+        self._settings["language"] = self.language_to_service_language(language)
+
+    async def set_model(self, model: str):
+        """Set the STT model.
+
+        Args:
+            model: The model name to use for transcription.
+
+        Note:
+            ElevenLabs STT API does not currently support model selection.
+            This method is provided for interface compatibility.
+        """
+        await super().set_model(model)
+        self.logger.info(f"Model setting [{model}] noted, but ElevenLabs STT uses default model")
+
+    async def _transcribe_audio(self, audio_data: bytes) -> dict:
+        """Upload audio data to ElevenLabs and get transcription result.
+
+        Args:
+            audio_data: Raw audio bytes in WAV format.
+
+        Returns:
+            The transcription result data.
+
+        Raises:
+            Exception: If transcription fails or returns an error.
+        """
+        url = f"{self._base_url}/v1/speech-to-text"
+        headers = {"xi-api-key": self._api_key}
+
+        # Create form data with the audio file
+        data = aiohttp.FormData()
+        data.add_field(
+            "file",
+            io.BytesIO(audio_data),
+            filename="audio.wav",
+            content_type="audio/x-wav",
+        )
+
+        # Add required model_id, language_code, and tag_audio_events
+        data.add_field("model_id", self._model_id)
+        data.add_field("language_code", self._settings["language"])
+        data.add_field("tag_audio_events", str(self._tag_audio_events).lower())
+
+        async with self._session.post(url, data=data, headers=headers) as response:
+            if response.status != 200:
+                error_text = await response.text()
+                self.logger.error(f"ElevenLabs transcription error: {error_text}")
+                raise Exception(f"Transcription failed with status {response.status}: {error_text}")
+
+            result = await response.json()
+            return result
+
+    @traced_stt
+    async def _handle_transcription(
+        self, transcript: str, is_final: bool, language: Optional[str] = None
+    ):
+        """Handle a transcription result with tracing."""
+        await self.stop_ttfb_metrics()
+        await self.stop_processing_metrics()
+
+    async def run_stt(self, audio: bytes) -> AsyncGenerator[Frame, None]:
+        """Transcribe an audio segment using ElevenLabs' STT API.
+
+        Args:
+            audio: Raw audio bytes in WAV format (already converted by base class).
+
+        Yields:
+            Frame: TranscriptionFrame containing the transcribed text, or ErrorFrame on failure.
+
+        Note:
+            The audio is already in WAV format from the SegmentedSTTService.
+            Only non-empty transcriptions are yielded.
+        """
+        try:
+            await self.start_processing_metrics()
+            await self.start_ttfb_metrics()
+
+            # Upload audio and get transcription result directly
+            result = await self._transcribe_audio(audio)
+
+            # Extract transcription text
+            text = result.get("text", "").strip()
+            if text:
+                # Use the language_code returned by the API
+                detected_language = result.get("language_code", "eng")
+
+                await self._handle_transcription(text, True, detected_language)
+                self.logger.debug(f"Transcription: [{text}]")
+
+                yield TranscriptionFrame(
+                    text,
+                    self._user_id,
+                    time_now_iso8601(),
+                    detected_language,
+                    result=result,
+                )
+
+        except Exception as e:
+            self.logger.error(f"ElevenLabs STT error: {e}")
+            yield ErrorFrame(f"ElevenLabs STT error: {str(e)}")
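
For orientation, a minimal sketch of how the new ElevenLabsSTTService might be wired up. The session handling and parameter values below are illustrative, not part of this diff:

    import aiohttp

    from pipecat.services.elevenlabs.stt import ElevenLabsSTTService
    from pipecat.transcriptions.language import Language

    async def make_stt() -> ElevenLabsSTTService:
        # SegmentedSTTService buffers VAD-segmented audio and calls run_stt()
        # once per utterance, so no streaming connection is needed.
        session = aiohttp.ClientSession()  # caller is responsible for closing it
        return ElevenLabsSTTService(
            api_key="ELEVENLABS_API_KEY",  # placeholder, not a real key
            aiohttp_session=session,
            params=ElevenLabsSTTService.InputParams(
                language=Language.EN,    # sent as language_code="eng"
                tag_audio_events=False,  # drop (laughter)-style annotations
            ),
        )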
pipecat/services/elevenlabs/tts.py

@@ -25,9 +25,9 @@ from pipecat.frames.frames import (
     EndFrame,
     ErrorFrame,
     Frame,
+    InterruptionFrame,
     LLMFullResponseEndFrame,
     StartFrame,
-    StartInterruptionFrame,
     TTSAudioRawFrame,
     TTSStartedFrame,
     TTSStoppedFrame,
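
Note the rename above: StartInterruptionFrame is replaced by InterruptionFrame throughout this file. Downstream code that matches on the old name needs the same one-line change; a hypothetical handler for illustration:

    from pipecat.frames.frames import InterruptionFrame  # was StartInterruptionFrame

    async def process_frame(self, frame, direction):
        # Hypothetical processor method, not part of this diff.
        if isinstance(frame, InterruptionFrame):
            ...  # cancel in-flight synthesis, reset per-utterance state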
@@ -172,16 +172,24 @@ def build_elevenlabs_voice_settings(


 def calculate_word_times(
-    alignment_info: Mapping[str, Any], cumulative_time: float
-) -> List[Tuple[str, float]]:
+    alignment_info: Mapping[str, Any],
+    cumulative_time: float,
+    partial_word: str = "",
+    partial_word_start_time: float = 0.0,
+) -> tuple[List[Tuple[str, float]], str, float]:
     """Calculate word timestamps from character alignment information.

     Args:
         alignment_info: Character alignment data from ElevenLabs API.
         cumulative_time: Base time offset for this chunk.
+        partial_word: Partial word carried over from previous chunk.
+        partial_word_start_time: Start time of the partial word.

     Returns:
-        List of (word, timestamp) tuples.
+        Tuple of (word_times, new_partial_word, new_partial_word_start_time):
+        - word_times: List of (word, timestamp) tuples for complete words
+        - new_partial_word: Incomplete word at end of chunk (empty if chunk ends with space)
+        - new_partial_word_start_time: Start time of the incomplete word
     """
     chars = alignment_info["chars"]
     char_start_times_ms = alignment_info["charStartTimesMs"]
@@ -190,41 +198,37 @@ def calculate_word_times(
         logger.error(
             f"calculate_word_times: length mismatch - chars={len(chars)}, times={len(char_start_times_ms)}"
         )
-        return []
+        return ([], partial_word, partial_word_start_time)

     # Build words and track their start positions
     words = []
-    word_start_indices = []
-    current_word = ""
-    word_start_index = None
+    word_start_times = []
+    current_word = partial_word  # Start with any partial word from previous chunk
+    word_start_time = partial_word_start_time if partial_word else None

     for i, char in enumerate(chars):
         if char == " ":
             # End of current word
             if current_word:  # Only add non-empty words
                 words.append(current_word)
-                word_start_indices.append(word_start_index)
+                word_start_times.append(word_start_time)
                 current_word = ""
-                word_start_index = None
+                word_start_time = None
         else:
             # Building a word
-            if word_start_index is None:  # First character of new word
-                word_start_index = i
+            if word_start_time is None:  # First character of new word
+                # Convert from milliseconds to seconds and add cumulative offset
+                word_start_time = cumulative_time + (char_start_times_ms[i] / 1000.0)
             current_word += char

-    # Handle the last word if there's no trailing space
-    if current_word and word_start_index is not None:
-        words.append(current_word)
-        word_start_indices.append(word_start_index)
+    # Build result for complete words
+    word_times = list(zip(words, word_start_times))

-    # Calculate timestamps for each word
-    word_times = []
-    for word, start_idx in zip(words, word_start_indices):
-        # Convert from milliseconds to seconds and add cumulative offset
-        start_time_seconds = cumulative_time + (char_start_times_ms[start_idx] / 1000.0)
-        word_times.append((word, start_time_seconds))
+    # Return any incomplete word at the end of this chunk
+    new_partial_word = current_word if current_word else ""
+    new_partial_word_start_time = word_start_time if word_start_time is not None else 0.0

-    return word_times
+    return (word_times, new_partial_word, new_partial_word_start_time)


 class ElevenLabsTTSService(AudioContextWordTTSService):
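
A worked example of the new carry-over contract (values are illustrative): a word split across two alignment chunks is returned as a partial by the first call and completed, with its original start time, by the second:

    from pipecat.services.elevenlabs.tts import calculate_word_times

    # Chunk 1: "Hello wo" - "wo" is cut off at the chunk boundary.
    a1 = {"chars": list("Hello wo"),
          "charStartTimesMs": [0, 50, 100, 150, 200, 250, 300, 350]}
    word_times, partial, partial_t = calculate_word_times(a1, cumulative_time=0.0)
    # word_times == [("Hello", 0.0)], partial == "wo", partial_t == 0.3

    # Chunk 2: "rld " completes the word; its start time is carried over.
    a2 = {"chars": list("rld "), "charStartTimesMs": [0, 40, 80, 120]}
    word_times, partial, partial_t = calculate_word_times(a2, 0.4, partial, partial_t)
    # word_times == [("world", 0.3)], partial == "" - nothing left dangling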
@@ -336,6 +340,9 @@ class ElevenLabsTTSService(AudioContextWordTTSService):
         # there's an interruption or TTSStoppedFrame.
         self._started = False
         self._cumulative_time = 0
+        # Track partial words that span across alignment chunks
+        self._partial_word = ""
+        self._partial_word_start_time = 0.0

         # Context management for v1 multi API
         self._context_id = None
@@ -465,7 +472,7 @@
             direction: The direction to push the frame.
         """
         await super().push_frame(frame, direction)
-        if isinstance(frame, (TTSStoppedFrame, StartInterruptionFrame)):
+        if isinstance(frame, (TTSStoppedFrame, InterruptionFrame)):
             self._started = False
             if isinstance(frame, TTSStoppedFrame):
                 await self.add_word_timestamps([("Reset", 0)])
@@ -526,6 +533,7 @@
                 url, max_size=16 * 1024 * 1024, additional_headers={"xi-api-key": self._api_key}
             )

+            await self._call_event_handler("on_connected")
         except Exception as e:
             self.logger.error(f"{self} initialization error: {e}")
             self._websocket = None
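
This hunk (together with the disconnect hunk below) surfaces the websocket lifecycle as service events. A sketch of how a caller might subscribe, assuming Pipecat's usual event_handler decorator and handler signature:

    tts = ElevenLabsTTSService(api_key="...", voice_id="...")

    @tts.event_handler("on_connected")
    async def on_connected(service):  # first argument assumed to be the service
        logger.info("ElevenLabs websocket connected")

    @tts.event_handler("on_disconnected")
    async def on_disconnected(service):
        logger.info("ElevenLabs websocket closed")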
@@ -544,13 +552,18 @@
             logger.debug("Disconnected from ElevenLabs")
         except Exception as e:
             self.logger.error(f"{self} error closing websocket: {e}")
+        finally:
+            self._started = False
+            self._context_id = None
+            self._websocket = None
+            await self._call_event_handler("on_disconnected")

     def _get_websocket(self):
         if self._websocket:
             return self._websocket
         raise Exception("Websocket not connected")

-    async def _handle_interruption(self, frame: StartInterruptionFrame, direction: FrameDirection):
+    async def _handle_interruption(self, frame: InterruptionFrame, direction: FrameDirection):
         """Handle interruption by closing the current context."""
         await super()._handle_interruption(frame, direction)

@@ -559,7 +572,7 @@
             logger.trace(f"Closing context {self._context_id} due to interruption")
             try:
                 # ElevenLabs requires that Pipecat manages the contexts and closes them
-                # when they're not longer in use. Since a StartInterruptionFrame is pushed
+                # when they're not longer in use. Since an InterruptionFrame is pushed
                 # every time the user speaks, we'll use this as a trigger to close the context
                 # and reset the state.
                 # Note: We do not need to call remove_audio_context here, as the context is
@@ -571,6 +584,8 @@
                 logger.error(f"Error closing context on interruption: {e}")
             self._context_id = None
             self._started = False
+            self._partial_word = ""
+            self._partial_word_start_time = 0.0

     async def _receive_messages(self):
         """Handle incoming WebSocket messages from ElevenLabs."""
@@ -610,7 +625,14 @@

                 if msg.get("alignment"):
                     alignment = msg["alignment"]
-                    word_times = calculate_word_times(alignment, self._cumulative_time)
+                    word_times, self._partial_word, self._partial_word_start_time = (
+                        calculate_word_times(
+                            alignment,
+                            self._cumulative_time,
+                            self._partial_word,
+                            self._partial_word_start_time,
+                        )
+                    )

                     if word_times:
                         await self.add_word_timestamps(word_times)
@@ -685,6 +707,8 @@
                 yield TTSStartedFrame()
                 self._started = True
                 self._cumulative_time = 0
+                self._partial_word = ""
+                self._partial_word_start_time = 0.0
                 # If a context ID does not exist, create a new one and
                 # register it. If an ID exists, that means the Pipeline is
                 # configured for allow_interruptions=False, so continue
@@ -758,6 +782,7 @@ class ElevenLabsHttpTTSService(WordTTSService):
         base_url: str = "https://api.elevenlabs.io",
         sample_rate: Optional[int] = None,
         params: Optional[InputParams] = None,
+        aggregate_sentences: Optional[bool] = True,
         **kwargs,
     ):
         """Initialize the ElevenLabs HTTP TTS service.
@@ -770,10 +795,11 @@
             base_url: Base URL for ElevenLabs HTTP API.
             sample_rate: Audio sample rate. If None, uses default.
             params: Additional input parameters for voice customization.
+            aggregate_sentences: Whether to aggregate sentences within the TTSService.
             **kwargs: Additional arguments passed to the parent service.
         """
         super().__init__(
-            aggregate_sentences=True,
+            aggregate_sentences=aggregate_sentences,
             push_text_frames=False,
             push_stop_frames=True,
             sample_rate=sample_rate,
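
The new aggregate_sentences flag lets callers opt out of the service's internal sentence aggregation, e.g. when an upstream processor already emits sentence-sized chunks. A hypothetical configuration:

    tts = ElevenLabsHttpTTSService(
        api_key="...",
        voice_id="...",
        aiohttp_session=session,        # session creation not shown
        aggregate_sentences=False,      # default True keeps the old behavior
    )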
@@ -811,6 +837,10 @@
         # Store previous text for context within a turn
         self._previous_text = ""

+        # Track partial words that span across alignment chunks
+        self._partial_word = ""
+        self._partial_word_start_time = 0.0
+
     def language_to_service_language(self, language: Language) -> Optional[str]:
         """Convert pipecat Language to ElevenLabs language code.

@@ -838,6 +868,8 @@
         self._cumulative_time = 0
         self._started = False
         self._previous_text = ""
+        self._partial_word = ""
+        self._partial_word_start_time = 0.0
         logger.debug(f"{self}: Reset internal state")

     async def start(self, frame: StartFrame):
@@ -858,7 +890,7 @@
             direction: The direction to push the frame.
         """
         await super().push_frame(frame, direction)
-        if isinstance(frame, (StartInterruptionFrame, TTSStoppedFrame)):
+        if isinstance(frame, (InterruptionFrame, TTSStoppedFrame)):
             # Reset timing on interruption or stop
             self._reset_state()

@@ -872,11 +904,13 @@ class ElevenLabsHttpTTSService(WordTTSService):
     def calculate_word_times(self, alignment_info: Mapping[str, Any]) -> List[Tuple[str, float]]:
         """Calculate word timing from character alignment data.

+        This method handles partial words that may span across multiple alignment chunks.
+
         Args:
             alignment_info: Character timing data from ElevenLabs.

         Returns:
-            List of (word, timestamp) pairs.
+            List of (word, timestamp) pairs for complete words in this chunk.

         Example input data::

@@ -902,30 +936,28 @@
         # Build the words and find their start times
         words = []
         word_start_times = []
-        current_word = ""
-        first_char_idx = -1
+        # Start with any partial word from previous chunk
+        current_word = self._partial_word
+        word_start_time = self._partial_word_start_time if self._partial_word else None

         for i, char in enumerate(chars):
             if char == " ":
                 if current_word:  # Only add non-empty words
                     words.append(current_word)
-                    # Use time of the first character of the word, offset by cumulative time
-                    word_start_times.append(
-                        self._cumulative_time + char_start_times[first_char_idx]
-                    )
+                    word_start_times.append(word_start_time)
                     current_word = ""
-                    first_char_idx = -1
+                    word_start_time = None
             else:
-                if not current_word:  # This is the first character of a new word
-                    first_char_idx = i
+                if word_start_time is None:  # First character of a new word
+                    # Use time of the first character of the word, offset by cumulative time
+                    word_start_time = self._cumulative_time + char_start_times[i]
                 current_word += char

-        # Don't forget the last word if there's no trailing space
-        if current_word and first_char_idx >= 0:
-            words.append(current_word)
-            word_start_times.append(self._cumulative_time + char_start_times[first_char_idx])
+        # Store any incomplete word at the end of this chunk
+        self._partial_word = current_word if current_word else ""
+        self._partial_word_start_time = word_start_time if word_start_time is not None else 0.0

-        # Create word-time pairs
+        # Create word-time pairs for complete words only
         word_times = list(zip(words, word_start_times))

         return word_times
@@ -961,6 +993,9 @@
         if self._voice_settings:
             payload["voice_settings"] = self._voice_settings

+        if self._settings["apply_text_normalization"] is not None:
+            payload["apply_text_normalization"] = self._settings["apply_text_normalization"]
+
         language = self._settings["language"]
         if self._model_name in ELEVENLABS_MULTILINGUAL_MODELS and language:
             payload["language_code"] = language
@@ -981,8 +1016,6 @@
         }
         if self._settings["optimize_streaming_latency"] is not None:
             params["optimize_streaming_latency"] = self._settings["optimize_streaming_latency"]
-        if self._settings["apply_text_normalization"] is not None:
-            params["apply_text_normalization"] = self._settings["apply_text_normalization"]

         self.logger.debug(f"ElevenLabs request - payload: {payload}, params: {params}")

@@ -1045,6 +1078,14 @@
                     logger.error(f"Error processing response: {e}", exc_info=True)
                     continue

+            # After processing all chunks, emit any remaining partial word
+            # since this is the end of the utterance
+            if self._partial_word:
+                final_word_time = [(self._partial_word, self._partial_word_start_time)]
+                await self.add_word_timestamps(final_word_time)
+                self._partial_word = ""
+                self._partial_word_start_time = 0.0
+
             # After processing all chunks, add the total utterance duration
             # to the cumulative time to ensure next utterance starts after this one
             if utterance_duration > 0: