dv-pipecat-ai 0.0.74.dev770__py3-none-any.whl → 0.0.82.dev776__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of dv-pipecat-ai has been flagged as potentially problematic by the registry.
- {dv_pipecat_ai-0.0.74.dev770.dist-info → dv_pipecat_ai-0.0.82.dev776.dist-info}/METADATA +137 -93
- dv_pipecat_ai-0.0.82.dev776.dist-info/RECORD +340 -0
- pipecat/__init__.py +17 -0
- pipecat/adapters/base_llm_adapter.py +36 -1
- pipecat/adapters/schemas/direct_function.py +296 -0
- pipecat/adapters/schemas/function_schema.py +15 -6
- pipecat/adapters/schemas/tools_schema.py +55 -7
- pipecat/adapters/services/anthropic_adapter.py +22 -3
- pipecat/adapters/services/aws_nova_sonic_adapter.py +23 -3
- pipecat/adapters/services/bedrock_adapter.py +22 -3
- pipecat/adapters/services/gemini_adapter.py +16 -3
- pipecat/adapters/services/open_ai_adapter.py +17 -2
- pipecat/adapters/services/open_ai_realtime_adapter.py +23 -3
- pipecat/audio/filters/base_audio_filter.py +30 -6
- pipecat/audio/filters/koala_filter.py +37 -2
- pipecat/audio/filters/krisp_filter.py +59 -6
- pipecat/audio/filters/noisereduce_filter.py +37 -0
- pipecat/audio/interruptions/base_interruption_strategy.py +25 -5
- pipecat/audio/interruptions/min_words_interruption_strategy.py +21 -4
- pipecat/audio/mixers/base_audio_mixer.py +30 -7
- pipecat/audio/mixers/soundfile_mixer.py +53 -6
- pipecat/audio/resamplers/base_audio_resampler.py +17 -9
- pipecat/audio/resamplers/resampy_resampler.py +26 -1
- pipecat/audio/resamplers/soxr_resampler.py +32 -1
- pipecat/audio/resamplers/soxr_stream_resampler.py +101 -0
- pipecat/audio/utils.py +194 -1
- pipecat/audio/vad/silero.py +60 -3
- pipecat/audio/vad/vad_analyzer.py +114 -30
- pipecat/clocks/base_clock.py +19 -0
- pipecat/clocks/system_clock.py +25 -0
- pipecat/extensions/voicemail/__init__.py +0 -0
- pipecat/extensions/voicemail/voicemail_detector.py +707 -0
- pipecat/frames/frames.py +590 -156
- pipecat/metrics/metrics.py +64 -1
- pipecat/observers/base_observer.py +58 -19
- pipecat/observers/loggers/debug_log_observer.py +56 -64
- pipecat/observers/loggers/llm_log_observer.py +8 -1
- pipecat/observers/loggers/transcription_log_observer.py +19 -7
- pipecat/observers/loggers/user_bot_latency_log_observer.py +32 -5
- pipecat/observers/turn_tracking_observer.py +26 -1
- pipecat/pipeline/base_pipeline.py +5 -7
- pipecat/pipeline/base_task.py +52 -9
- pipecat/pipeline/parallel_pipeline.py +121 -177
- pipecat/pipeline/pipeline.py +129 -20
- pipecat/pipeline/runner.py +50 -1
- pipecat/pipeline/sync_parallel_pipeline.py +132 -32
- pipecat/pipeline/task.py +263 -280
- pipecat/pipeline/task_observer.py +85 -34
- pipecat/pipeline/to_be_updated/merge_pipeline.py +32 -2
- pipecat/processors/aggregators/dtmf_aggregator.py +29 -22
- pipecat/processors/aggregators/gated.py +25 -24
- pipecat/processors/aggregators/gated_openai_llm_context.py +22 -2
- pipecat/processors/aggregators/llm_response.py +398 -89
- pipecat/processors/aggregators/openai_llm_context.py +161 -13
- pipecat/processors/aggregators/sentence.py +25 -14
- pipecat/processors/aggregators/user_response.py +28 -3
- pipecat/processors/aggregators/vision_image_frame.py +24 -14
- pipecat/processors/async_generator.py +28 -0
- pipecat/processors/audio/audio_buffer_processor.py +78 -37
- pipecat/processors/consumer_processor.py +25 -6
- pipecat/processors/filters/frame_filter.py +23 -0
- pipecat/processors/filters/function_filter.py +30 -0
- pipecat/processors/filters/identity_filter.py +17 -2
- pipecat/processors/filters/null_filter.py +24 -1
- pipecat/processors/filters/stt_mute_filter.py +56 -21
- pipecat/processors/filters/wake_check_filter.py +46 -3
- pipecat/processors/filters/wake_notifier_filter.py +21 -3
- pipecat/processors/frame_processor.py +488 -131
- pipecat/processors/frameworks/langchain.py +38 -3
- pipecat/processors/frameworks/rtvi.py +719 -34
- pipecat/processors/gstreamer/pipeline_source.py +41 -0
- pipecat/processors/idle_frame_processor.py +26 -3
- pipecat/processors/logger.py +23 -0
- pipecat/processors/metrics/frame_processor_metrics.py +77 -4
- pipecat/processors/metrics/sentry.py +42 -4
- pipecat/processors/producer_processor.py +34 -14
- pipecat/processors/text_transformer.py +22 -10
- pipecat/processors/transcript_processor.py +48 -29
- pipecat/processors/user_idle_processor.py +31 -21
- pipecat/runner/__init__.py +1 -0
- pipecat/runner/daily.py +132 -0
- pipecat/runner/livekit.py +148 -0
- pipecat/runner/run.py +543 -0
- pipecat/runner/types.py +67 -0
- pipecat/runner/utils.py +515 -0
- pipecat/serializers/base_serializer.py +42 -0
- pipecat/serializers/exotel.py +17 -6
- pipecat/serializers/genesys.py +95 -0
- pipecat/serializers/livekit.py +33 -0
- pipecat/serializers/plivo.py +16 -15
- pipecat/serializers/protobuf.py +37 -1
- pipecat/serializers/telnyx.py +18 -17
- pipecat/serializers/twilio.py +32 -16
- pipecat/services/ai_service.py +5 -3
- pipecat/services/anthropic/llm.py +113 -43
- pipecat/services/assemblyai/models.py +63 -5
- pipecat/services/assemblyai/stt.py +64 -11
- pipecat/services/asyncai/__init__.py +0 -0
- pipecat/services/asyncai/tts.py +501 -0
- pipecat/services/aws/llm.py +185 -111
- pipecat/services/aws/stt.py +217 -23
- pipecat/services/aws/tts.py +118 -52
- pipecat/services/aws/utils.py +101 -5
- pipecat/services/aws_nova_sonic/aws.py +82 -64
- pipecat/services/aws_nova_sonic/context.py +15 -6
- pipecat/services/azure/common.py +10 -2
- pipecat/services/azure/image.py +32 -0
- pipecat/services/azure/llm.py +9 -7
- pipecat/services/azure/stt.py +65 -2
- pipecat/services/azure/tts.py +154 -23
- pipecat/services/cartesia/stt.py +125 -8
- pipecat/services/cartesia/tts.py +102 -38
- pipecat/services/cerebras/llm.py +15 -23
- pipecat/services/deepgram/stt.py +19 -11
- pipecat/services/deepgram/tts.py +36 -0
- pipecat/services/deepseek/llm.py +14 -23
- pipecat/services/elevenlabs/tts.py +330 -64
- pipecat/services/fal/image.py +43 -0
- pipecat/services/fal/stt.py +48 -10
- pipecat/services/fireworks/llm.py +14 -21
- pipecat/services/fish/tts.py +109 -9
- pipecat/services/gemini_multimodal_live/__init__.py +1 -0
- pipecat/services/gemini_multimodal_live/events.py +83 -2
- pipecat/services/gemini_multimodal_live/file_api.py +189 -0
- pipecat/services/gemini_multimodal_live/gemini.py +218 -21
- pipecat/services/gladia/config.py +17 -10
- pipecat/services/gladia/stt.py +82 -36
- pipecat/services/google/frames.py +40 -0
- pipecat/services/google/google.py +2 -0
- pipecat/services/google/image.py +39 -2
- pipecat/services/google/llm.py +176 -58
- pipecat/services/google/llm_openai.py +26 -4
- pipecat/services/google/llm_vertex.py +37 -15
- pipecat/services/google/rtvi.py +41 -0
- pipecat/services/google/stt.py +65 -17
- pipecat/services/google/test-google-chirp.py +45 -0
- pipecat/services/google/tts.py +390 -19
- pipecat/services/grok/llm.py +8 -6
- pipecat/services/groq/llm.py +8 -6
- pipecat/services/groq/stt.py +13 -9
- pipecat/services/groq/tts.py +40 -0
- pipecat/services/hamsa/__init__.py +9 -0
- pipecat/services/hamsa/stt.py +241 -0
- pipecat/services/heygen/__init__.py +5 -0
- pipecat/services/heygen/api.py +281 -0
- pipecat/services/heygen/client.py +620 -0
- pipecat/services/heygen/video.py +338 -0
- pipecat/services/image_service.py +5 -3
- pipecat/services/inworld/__init__.py +1 -0
- pipecat/services/inworld/tts.py +592 -0
- pipecat/services/llm_service.py +127 -45
- pipecat/services/lmnt/tts.py +80 -7
- pipecat/services/mcp_service.py +85 -44
- pipecat/services/mem0/memory.py +42 -13
- pipecat/services/minimax/tts.py +74 -15
- pipecat/services/mistral/__init__.py +0 -0
- pipecat/services/mistral/llm.py +185 -0
- pipecat/services/moondream/vision.py +55 -10
- pipecat/services/neuphonic/tts.py +275 -48
- pipecat/services/nim/llm.py +8 -6
- pipecat/services/ollama/llm.py +27 -7
- pipecat/services/openai/base_llm.py +54 -16
- pipecat/services/openai/image.py +30 -0
- pipecat/services/openai/llm.py +7 -5
- pipecat/services/openai/stt.py +13 -9
- pipecat/services/openai/tts.py +42 -10
- pipecat/services/openai_realtime_beta/azure.py +11 -9
- pipecat/services/openai_realtime_beta/context.py +7 -5
- pipecat/services/openai_realtime_beta/events.py +10 -7
- pipecat/services/openai_realtime_beta/openai.py +37 -18
- pipecat/services/openpipe/llm.py +30 -24
- pipecat/services/openrouter/llm.py +9 -7
- pipecat/services/perplexity/llm.py +15 -19
- pipecat/services/piper/tts.py +26 -12
- pipecat/services/playht/tts.py +227 -65
- pipecat/services/qwen/llm.py +8 -6
- pipecat/services/rime/tts.py +128 -17
- pipecat/services/riva/stt.py +160 -22
- pipecat/services/riva/tts.py +67 -2
- pipecat/services/sambanova/llm.py +19 -17
- pipecat/services/sambanova/stt.py +14 -8
- pipecat/services/sarvam/tts.py +60 -13
- pipecat/services/simli/video.py +82 -21
- pipecat/services/soniox/__init__.py +0 -0
- pipecat/services/soniox/stt.py +398 -0
- pipecat/services/speechmatics/stt.py +29 -17
- pipecat/services/stt_service.py +47 -11
- pipecat/services/tavus/video.py +94 -25
- pipecat/services/together/llm.py +8 -6
- pipecat/services/tts_service.py +77 -53
- pipecat/services/ultravox/stt.py +46 -43
- pipecat/services/vision_service.py +5 -3
- pipecat/services/websocket_service.py +12 -11
- pipecat/services/whisper/base_stt.py +58 -12
- pipecat/services/whisper/stt.py +69 -58
- pipecat/services/xtts/tts.py +59 -2
- pipecat/sync/base_notifier.py +19 -0
- pipecat/sync/event_notifier.py +24 -0
- pipecat/tests/utils.py +73 -5
- pipecat/transcriptions/language.py +24 -0
- pipecat/transports/base_input.py +112 -8
- pipecat/transports/base_output.py +235 -13
- pipecat/transports/base_transport.py +119 -0
- pipecat/transports/local/audio.py +76 -0
- pipecat/transports/local/tk.py +84 -0
- pipecat/transports/network/fastapi_websocket.py +174 -15
- pipecat/transports/network/small_webrtc.py +383 -39
- pipecat/transports/network/webrtc_connection.py +214 -8
- pipecat/transports/network/websocket_client.py +171 -1
- pipecat/transports/network/websocket_server.py +147 -9
- pipecat/transports/services/daily.py +792 -70
- pipecat/transports/services/helpers/daily_rest.py +122 -129
- pipecat/transports/services/livekit.py +339 -4
- pipecat/transports/services/tavus.py +273 -38
- pipecat/utils/asyncio/task_manager.py +92 -186
- pipecat/utils/base_object.py +83 -1
- pipecat/utils/network.py +2 -0
- pipecat/utils/string.py +114 -58
- pipecat/utils/text/base_text_aggregator.py +44 -13
- pipecat/utils/text/base_text_filter.py +46 -0
- pipecat/utils/text/markdown_text_filter.py +70 -14
- pipecat/utils/text/pattern_pair_aggregator.py +18 -14
- pipecat/utils/text/simple_text_aggregator.py +43 -2
- pipecat/utils/text/skip_tags_aggregator.py +21 -13
- pipecat/utils/time.py +36 -0
- pipecat/utils/tracing/class_decorators.py +32 -7
- pipecat/utils/tracing/conversation_context_provider.py +12 -2
- pipecat/utils/tracing/service_attributes.py +80 -64
- pipecat/utils/tracing/service_decorators.py +48 -21
- pipecat/utils/tracing/setup.py +13 -7
- pipecat/utils/tracing/turn_context_provider.py +12 -2
- pipecat/utils/tracing/turn_trace_observer.py +27 -0
- pipecat/utils/utils.py +14 -14
- dv_pipecat_ai-0.0.74.dev770.dist-info/RECORD +0 -319
- pipecat/examples/daily_runner.py +0 -64
- pipecat/examples/run.py +0 -265
- pipecat/utils/asyncio/watchdog_async_iterator.py +0 -72
- pipecat/utils/asyncio/watchdog_event.py +0 -42
- pipecat/utils/asyncio/watchdog_priority_queue.py +0 -48
- pipecat/utils/asyncio/watchdog_queue.py +0 -48
- {dv_pipecat_ai-0.0.74.dev770.dist-info → dv_pipecat_ai-0.0.82.dev776.dist-info}/WHEEL +0 -0
- {dv_pipecat_ai-0.0.74.dev770.dist-info → dv_pipecat_ai-0.0.82.dev776.dist-info}/licenses/LICENSE +0 -0
- {dv_pipecat_ai-0.0.74.dev770.dist-info → dv_pipecat_ai-0.0.82.dev776.dist-info}/top_level.txt +0 -0
- /pipecat/{examples → extensions}/__init__.py +0 -0
--- a/pipecat/services/elevenlabs/tts.py
+++ b/pipecat/services/elevenlabs/tts.py
@@ -4,6 +4,12 @@
 # SPDX-License-Identifier: BSD 2-Clause License
 #
 
+"""ElevenLabs text-to-speech service implementations.
+
+This module provides WebSocket and HTTP-based TTS services using ElevenLabs API
+with support for streaming audio, word timestamps, and voice customization.
+"""
+
 import asyncio
 import base64
 import json
@@ -32,12 +38,13 @@ from pipecat.services.tts_service import (
     WordTTSService,
 )
 from pipecat.transcriptions.language import Language
-from pipecat.utils.asyncio.watchdog_async_iterator import WatchdogAsyncIterator
 from pipecat.utils.tracing.service_decorators import traced_tts
 
 # See .env.example for ElevenLabs configuration needed
 try:
     import websockets
+    from websockets.asyncio.client import connect as websocket_connect
+    from websockets.protocol import State
 except ModuleNotFoundError as e:
     logger.error(f"Exception: {e}")
     logger.error("In order to use ElevenLabs, you need to `pip install pipecat-ai[elevenlabs]`.")
@@ -57,6 +64,14 @@ ELEVENLABS_MULTILINGUAL_MODELS = {
 
 
 def language_to_elevenlabs_language(language: Language) -> Optional[str]:
+    """Convert a Language enum to ElevenLabs language code.
+
+    Args:
+        language: The Language enum value to convert.
+
+    Returns:
+        The corresponding ElevenLabs language code, or None if not supported.
+    """
     BASE_LANGUAGES = {
         Language.AR: "ar",
         Language.BG: "bg",
@@ -106,6 +121,14 @@ def language_to_elevenlabs_language(language: Language) -> Optional[str]:
 
 
 def output_format_from_sample_rate(sample_rate: int) -> str:
+    """Get the appropriate output format string for a given sample rate.
+
+    Args:
+        sample_rate: The audio sample rate in Hz.
+
+    Returns:
+        The ElevenLabs output format string.
+    """
     match sample_rate:
         case 8000:
             return "pcm_8000"
@@ -129,10 +152,10 @@ def build_elevenlabs_voice_settings(
     """Build voice settings dictionary for ElevenLabs based on provided settings.
 
     Args:
-        settings: Dictionary containing voice settings parameters
+        settings: Dictionary containing voice settings parameters.
 
     Returns:
-        Dictionary of voice settings or None if no valid settings are provided
+        Dictionary of voice settings or None if no valid settings are provided.
     """
     voice_setting_keys = ["stability", "similarity_boost", "style", "use_speaker_boost", "speed"]
 
@@ -151,26 +174,83 @@ def build_elevenlabs_voice_settings(
 def calculate_word_times(
     alignment_info: Mapping[str, Any], cumulative_time: float
 ) -> List[Tuple[str, float]]:
-    zipped_times = list(zip(alignment_info["chars"], alignment_info["charStartTimesMs"]))
+    """Calculate word timestamps from character alignment information.
 
-    words = "".join(alignment_info["chars"]).split(" ")
+    Args:
+        alignment_info: Character alignment data from ElevenLabs API.
+        cumulative_time: Base time offset for this chunk.
 
-    # Calculate start time for each word. We do this by finding a space character
-    # and using the previous word time, also taking into account there might not
-    # be a space at the end.
-    times = []
-    for i, (a, b) in enumerate(zipped_times):
-        if a == " " or i == len(zipped_times) - 1:
-            t = cumulative_time + (zipped_times[i - 1][1] / 1000.0)
-            times.append(t)
+    Returns:
+        List of (word, timestamp) tuples.
+    """
+    chars = alignment_info["chars"]
+    char_start_times_ms = alignment_info["charStartTimesMs"]
 
-    word_times = list(zip(words, times))
+    if len(chars) != len(char_start_times_ms):
+        logger.error(
+            f"calculate_word_times: length mismatch - chars={len(chars)}, times={len(char_start_times_ms)}"
+        )
+        return []
+
+    # Build words and track their start positions
+    words = []
+    word_start_indices = []
+    current_word = ""
+    word_start_index = None
+
+    for i, char in enumerate(chars):
+        if char == " ":
+            # End of current word
+            if current_word:  # Only add non-empty words
+                words.append(current_word)
+                word_start_indices.append(word_start_index)
+            current_word = ""
+            word_start_index = None
+        else:
+            # Building a word
+            if word_start_index is None:  # First character of new word
+                word_start_index = i
+            current_word += char
+
+    # Handle the last word if there's no trailing space
+    if current_word and word_start_index is not None:
+        words.append(current_word)
+        word_start_indices.append(word_start_index)
+
+    # Calculate timestamps for each word
+    word_times = []
+    for word, start_idx in zip(words, word_start_indices):
+        # Convert from milliseconds to seconds and add cumulative offset
+        start_time_seconds = cumulative_time + (char_start_times_ms[start_idx] / 1000.0)
+        word_times.append((word, start_time_seconds))
 
     return word_times
 
 
 class ElevenLabsTTSService(AudioContextWordTTSService):
+    """ElevenLabs WebSocket-based TTS service with word timestamps.
+
+    Provides real-time text-to-speech using ElevenLabs' WebSocket streaming API.
+    Supports word-level timestamps, audio context management, and various voice
+    customization options including stability, similarity boost, and speed controls.
+    """
+
     class InputParams(BaseModel):
+        """Input parameters for ElevenLabs TTS configuration.
+
+        Parameters:
+            language: Language to use for synthesis.
+            stability: Voice stability control (0.0 to 1.0).
+            similarity_boost: Similarity boost control (0.0 to 1.0).
+            style: Style control for voice expression (0.0 to 1.0).
+            use_speaker_boost: Whether to use speaker boost enhancement.
+            speed: Voice speed control (0.7 to 1.2).
+            auto_mode: Whether to enable automatic mode optimization.
+            enable_ssml_parsing: Whether to parse SSML tags in text.
+            enable_logging: Whether to enable ElevenLabs logging.
+            apply_text_normalization: Text normalization mode ("auto", "on", "off").
+        """
+
         language: Optional[Language] = None
         stability: Optional[float] = None
         similarity_boost: Optional[float] = None
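The rewritten `calculate_word_times` replaces the old space-scanning heuristic with an explicit grouping pass: characters are accumulated into words, and each word's timestamp is taken from its first character's start time. A standalone sketch of the same grouping logic (plain Python, independent of pipecat; the function name and sample data are illustrative only):

```python
# Sketch mirroring the grouping logic added above, not the package function itself.
def word_times_from_alignment(chars, char_start_times_ms, cumulative_time=0.0):
    words, starts = [], []
    current, start_idx = "", None
    for i, ch in enumerate(chars):
        if ch == " ":
            if current:  # close out the word we were building
                words.append(current)
                starts.append(start_idx)
            current, start_idx = "", None
        else:
            if start_idx is None:  # first character of a new word
                start_idx = i
            current += ch
    if current and start_idx is not None:  # last word, no trailing space
        words.append(current)
        starts.append(start_idx)
    # A word's timestamp is its first character's start time (ms -> s),
    # offset by the cumulative time of previous chunks.
    return [(w, cumulative_time + char_start_times_ms[i] / 1000.0) for w, i in zip(words, starts)]


chars = list("Hello world")  # 11 characters
starts_ms = [0, 50, 100, 150, 200, 250, 400, 450, 500, 550, 600]
print(word_times_from_alignment(chars, starts_ms))  # [('Hello', 0.0), ('world', 0.4)]
```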
@@ -180,18 +260,32 @@ class ElevenLabsTTSService(AudioContextWordTTSService):
         auto_mode: Optional[bool] = True
         enable_ssml_parsing: Optional[bool] = None
         enable_logging: Optional[bool] = None
+        apply_text_normalization: Optional[Literal["auto", "on", "off"]] = None
 
     def __init__(
         self,
         *,
         api_key: str,
         voice_id: str,
-        model: str = "eleven_flash_v2_5",
+        model: str = "eleven_turbo_v2_5",
         url: str = "wss://api.elevenlabs.io",
         sample_rate: Optional[int] = None,
         params: Optional[InputParams] = None,
+        aggregate_sentences: Optional[bool] = True,
         **kwargs,
     ):
+        """Initialize the ElevenLabs TTS service.
+
+        Args:
+            api_key: ElevenLabs API key for authentication.
+            voice_id: ID of the voice to use for synthesis.
+            model: TTS model to use (e.g., "eleven_turbo_v2_5").
+            url: WebSocket URL for ElevenLabs TTS API.
+            sample_rate: Audio sample rate. If None, uses default.
+            params: Additional input parameters for voice customization.
+            aggregate_sentences: Whether to aggregate sentences within the TTSService.
+            **kwargs: Additional arguments passed to the parent service.
+        """
         # Aggregating sentences still gives cleaner-sounding results and fewer
         # artifacts than streaming one word at a time. On average, waiting for a
         # full sentence should only "cost" us 15ms or so with GPT-4o or a Llama
@@ -207,7 +301,7 @@ class ElevenLabsTTSService(AudioContextWordTTSService):
         # speaking for a while, so we want the parent class to send TTSStopFrame
         # after a short period not receiving any audio.
         super().__init__(
-            aggregate_sentences=True,
+            aggregate_sentences=aggregate_sentences,
             push_text_frames=False,
             push_stop_frames=True,
             pause_frame_processing=True,
@@ -231,6 +325,7 @@ class ElevenLabsTTSService(AudioContextWordTTSService):
             "auto_mode": str(params.auto_mode).lower(),
             "enable_ssml_parsing": params.enable_ssml_parsing,
             "enable_logging": params.enable_logging,
+            "apply_text_normalization": params.apply_text_normalization,
         }
         self.set_model_name(model)
         self.set_voice(voice_id)
@@ -248,43 +343,114 @@ class ElevenLabsTTSService(AudioContextWordTTSService):
         self._keepalive_task = None
 
     def can_generate_metrics(self) -> bool:
+        """Check if this service can generate processing metrics.
+
+        Returns:
+            True, as ElevenLabs service supports metrics generation.
+        """
         return True
 
     def language_to_service_language(self, language: Language) -> Optional[str]:
+        """Convert a Language enum to ElevenLabs language format.
+
+        Args:
+            language: The language to convert.
+
+        Returns:
+            The ElevenLabs-specific language code, or None if not supported.
+        """
         return language_to_elevenlabs_language(language)
 
     def _set_voice_settings(self):
         return build_elevenlabs_voice_settings(self._settings)
 
     async def set_model(self, model: str):
+        """Set the TTS model and reconnect.
+
+        Args:
+            model: The model name to use for synthesis.
+        """
         await super().set_model(model)
         self.logger.info(f"Switching TTS model to: [{model}]")
         await self._disconnect()
         await self._connect()
 
     async def _update_settings(self, settings: Mapping[str, Any]):
+        """Update service settings and reconnect if voice, model, or language changed."""
+        # Track previous values for settings that require reconnection
         prev_voice = self._voice_id
+        prev_model = self.model_name
+        prev_language = self._settings.get("language")
+        # Create snapshot of current voice settings to detect changes after update
+        prev_voice_settings = self._voice_settings.copy() if self._voice_settings else None
+
         await super()._update_settings(settings)
-
-        if prev_voice != self._voice_id:
+
+        # Update voice settings for the next context creation
+        self._voice_settings = self._set_voice_settings()
+
+        # Check if URL-level settings changed (these require reconnection)
+        url_changed = (
+            prev_voice != self._voice_id
+            or prev_model != self.model_name
+            or prev_language != self._settings.get("language")
+        )
+
+        # Check if only voice settings changed (speed, stability, etc.)
+        voice_settings_changed = prev_voice_settings != self._voice_settings
+
+        if url_changed:
+            # These settings are in the WebSocket URL, so we need to reconnect
+            logger.debug(
+                f"URL-level setting changed (voice/model/language), reconnecting WebSocket"
+            )
             await self._disconnect()
             await self._connect()
             self.logger.info(f"Switching TTS voice to: [{self._voice_id}]")
+        elif voice_settings_changed and self._context_id:
+            # Voice settings can be updated by closing current context
+            # so new one gets created with updated voice settings
+            logger.debug(f"Voice settings changed, closing current context to apply changes")
+            try:
+                if self._websocket:
+                    await self._websocket.send(
+                        json.dumps({"context_id": self._context_id, "close_context": True})
+                    )
+            except Exception as e:
+                logger.warning(f"Error closing context for voice settings update: {e}")
+            self._context_id = None
+            self._started = False
 
     async def start(self, frame: StartFrame):
+        """Start the ElevenLabs TTS service.
+
+        Args:
+            frame: The start frame containing initialization parameters.
+        """
         await super().start(frame)
         self._output_format = output_format_from_sample_rate(self.sample_rate)
         await self._connect()
 
     async def stop(self, frame: EndFrame):
+        """Stop the ElevenLabs TTS service.
+
+        Args:
+            frame: The end frame.
+        """
         await super().stop(frame)
         await self._disconnect()
 
     async def cancel(self, frame: CancelFrame):
+        """Cancel the ElevenLabs TTS service.
+
+        Args:
+            frame: The cancel frame.
+        """
         await super().cancel(frame)
         await self._disconnect()
 
     async def flush_audio(self):
+        """Flush any pending audio and finalize the current context."""
         if not self._context_id or not self._websocket:
             return
         self.logger.trace(f"{self}: flushing audio")
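The new `_update_settings` path distinguishes two classes of changes: voice, model, and language are baked into the WebSocket URL and force a full reconnect, while voice settings (stability, speed, and so on) only require closing the current context so the next context is created with the updated values. A condensed sketch of that decision, using hypothetical before/after snapshots:

```python
# Hypothetical before/after values illustrating the two change classes.
prev = {"voice": "v1", "model": "eleven_turbo_v2_5", "language": "en",
        "voice_settings": {"stability": 0.5, "speed": 1.0}}
new = {"voice": "v1", "model": "eleven_turbo_v2_5", "language": "en",
       "voice_settings": {"stability": 0.5, "speed": 1.2}}

url_changed = any(prev[k] != new[k] for k in ("voice", "model", "language"))
voice_settings_changed = prev["voice_settings"] != new["voice_settings"]

if url_changed:
    action = "disconnect and reconnect"  # settings live in the WebSocket URL
elif voice_settings_changed:
    action = "close current context"     # next context picks up the new settings
else:
    action = "no-op"
print(action)  # -> close current context
```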
@@ -292,6 +458,12 @@ class ElevenLabsTTSService(AudioContextWordTTSService):
         await self._websocket.send(json.dumps(msg))
 
     async def push_frame(self, frame: Frame, direction: FrameDirection = FrameDirection.DOWNSTREAM):
+        """Push a frame and handle state changes.
+
+        Args:
+            frame: The frame to push.
+            direction: The direction to push the frame.
+        """
         await super().push_frame(frame, direction)
         if isinstance(frame, (TTSStoppedFrame, StartInterruptionFrame)):
             self._started = False
@@ -320,7 +492,7 @@ class ElevenLabsTTSService(AudioContextWordTTSService):
 
     async def _connect_websocket(self):
         try:
-            if self._websocket and self._websocket.open:
+            if self._websocket and self._websocket.state is State.OPEN:
                 return
 
             self.logger.debug("Connecting to ElevenLabs")
@@ -336,6 +508,9 @@ class ElevenLabsTTSService(AudioContextWordTTSService):
             if self._settings["enable_logging"]:
                 url += f"&enable_logging={self._settings['enable_logging']}"
 
+            if self._settings["apply_text_normalization"] is not None:
+                url += f"&apply_text_normalization={self._settings['apply_text_normalization']}"
+
             # Language can only be used with the ELEVENLABS_MULTILINGUAL_MODELS
             language = self._settings["language"]
             if model in ELEVENLABS_MULTILINGUAL_MODELS and language is not None:
@@ -347,8 +522,8 @@ class ElevenLabsTTSService(AudioContextWordTTSService):
                 )
 
             # Set max websocket message size to 16MB for large audio responses
-            self._websocket = await websockets.connect(
-                url, max_size=16 * 1024 * 1024, extra_headers={"xi-api-key": self._api_key}
+            self._websocket = await websocket_connect(
+                url, max_size=16 * 1024 * 1024, additional_headers={"xi-api-key": self._api_key}
             )
 
         except Exception as e:
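The connection-handling hunks track the websockets library's asyncio rewrite: `websockets.asyncio.client.connect` (websockets >= 13) replaces the legacy `websockets.connect`, the removed `.open`/`.closed` boolean properties give way to checking `.state` against `websockets.protocol.State`, and extra HTTP headers are passed as `additional_headers` instead of the legacy `extra_headers`. A minimal sketch of the new pattern (the echo-server URL is only an assumed test endpoint):

```python
import asyncio

from websockets.asyncio.client import connect  # websockets >= 13 asyncio client
from websockets.protocol import State


async def main():
    async with connect(
        "wss://echo.websocket.org",           # assumed public echo endpoint
        max_size=16 * 1024 * 1024,            # allow large messages, as in the diff
        additional_headers={"x-demo": "1"},   # replaces legacy extra_headers
    ) as ws:
        # .state replaces the legacy .open/.closed properties.
        if ws.state is State.OPEN:
            await ws.send("ping")
            print(await ws.recv())


asyncio.run(main())
```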
@@ -366,6 +541,7 @@ class ElevenLabsTTSService(AudioContextWordTTSService):
             if self._context_id:
                 await self._websocket.send(json.dumps({"close_socket": True}))
             await self._websocket.close()
+            logger.debug("Disconnected from ElevenLabs")
         except Exception as e:
             self.logger.error(f"{self} error closing websocket: {e}")
 
@@ -375,6 +551,7 @@ class ElevenLabsTTSService(AudioContextWordTTSService):
             raise Exception("Websocket not connected")
 
     async def _handle_interruption(self, frame: StartInterruptionFrame, direction: FrameDirection):
+        """Handle interruption by closing the current context."""
         await super()._handle_interruption(frame, direction)
 
         # Close the current context when interrupted without closing the websocket
@@ -396,9 +573,8 @@ class ElevenLabsTTSService(AudioContextWordTTSService):
         self._started = False
 
     async def _receive_messages(self):
-        async for message in WatchdogAsyncIterator(
-            self._get_websocket(), manager=self.task_manager
-        ):
+        """Handle incoming WebSocket messages from ElevenLabs."""
+        async for message in self._get_websocket():
             msg = json.loads(message)
 
             received_ctx_id = msg.get("contextId")
@@ -411,10 +587,18 @@ class ElevenLabsTTSService(AudioContextWordTTSService):
                 continue
 
             # Check if this message belongs to the current context.
-            # This should never happen, so warn about it.
             if not self.audio_context_available(received_ctx_id):
-                logger.warning(f"Ignoring message from unavailable context: {received_ctx_id}")
-                continue
+                if self._context_id == received_ctx_id:
+                    logger.debug(
+                        f"Received a delayed message, recreating the context: {self._context_id}"
+                    )
+                    await self.create_audio_context(self._context_id)
+                else:
+                    # This can happen if a message is received _after_ we have closed a context
+                    # due to user interruption but _before_ the `isFinal` message for the context
+                    # is received.
+                    logger.debug(f"Ignoring message from unavailable context: {received_ctx_id}")
+                    continue
 
             if msg.get("audio"):
                 await self.stop_ttfb_metrics()
@@ -423,18 +607,37 @@ class ElevenLabsTTSService(AudioContextWordTTSService):
                 audio = base64.b64decode(msg["audio"])
                 frame = TTSAudioRawFrame(audio, self.sample_rate, 1)
                 await self.append_to_audio_context(received_ctx_id, frame)
+
             if msg.get("alignment"):
-                word_times = calculate_word_times(msg["alignment"], self._cumulative_time)
-                await self.add_word_timestamps(word_times)
-                self._cumulative_time = word_times[-1][1]
+                alignment = msg["alignment"]
+                word_times = calculate_word_times(alignment, self._cumulative_time)
+
+                if word_times:
+                    await self.add_word_timestamps(word_times)
+
+                    # Calculate the actual end time of this audio chunk
+                    char_start_times_ms = alignment.get("charStartTimesMs", [])
+                    char_durations_ms = alignment.get("charDurationsMs", [])
+
+                    if char_start_times_ms and char_durations_ms:
+                        # End time = start time of last character + duration of last character
+                        chunk_end_time_ms = char_start_times_ms[-1] + char_durations_ms[-1]
+                        chunk_end_time_seconds = chunk_end_time_ms / 1000.0
+                        self._cumulative_time += chunk_end_time_seconds
+                    else:
+                        # Fallback: use the last word's start time (current behavior)
+                        self._cumulative_time = word_times[-1][1]
+                        logger.warning(
+                            "_receive_messages: using fallback timing method - consider investigating alignment data structure"
+                        )
 
     async def _keepalive_task_handler(self):
-        KEEPALIVE_SLEEP = 10 if self.task_manager.task_watchdog_enabled else 3
+        """Send periodic keepalive messages to maintain WebSocket connection."""
+        KEEPALIVE_SLEEP = 10
         while True:
-            self.reset_watchdog()
             await asyncio.sleep(KEEPALIVE_SLEEP)
             try:
-                if self._websocket and self._websocket.open:
+                if self._websocket and self._websocket.state is State.OPEN:
                     if self._context_id:
                         # Send keepalive with context ID to keep the connection alive
                         keepalive_message = {
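The end-of-chunk bookkeeping above is plain arithmetic: a chunk ends at the last character's start time plus its duration, and that value is accumulated so word timestamps from later chunks land on one continuous timeline. With made-up alignment values:

```python
# Made-up alignment values for one chunk (milliseconds).
char_start_times_ms = [0, 80, 160, 240]
char_durations_ms = [80, 80, 80, 120]

# End of this chunk = start of the last character + its duration.
chunk_end_ms = char_start_times_ms[-1] + char_durations_ms[-1]  # 240 + 120 = 360
cumulative_time = chunk_end_ms / 1000.0                         # 0.36 s

# A word starting 100 ms into the next chunk is stamped at the global time:
print(cumulative_time + 100 / 1000.0)  # 0.46
```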
@@ -454,16 +657,25 @@ class ElevenLabsTTSService(AudioContextWordTTSService):
                     break
 
     async def _send_text(self, text: str):
+        """Send text to the WebSocket for synthesis."""
        if self._websocket and self._context_id:
            msg = {"text": text, "context_id": self._context_id}
            await self._websocket.send(json.dumps(msg))
 
     @traced_tts
     async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
+        """Generate speech from text using ElevenLabs' streaming WebSocket API.
+
+        Args:
+            text: The text to synthesize into speech.
+
+        Yields:
+            Frame: Audio frames containing the synthesized speech.
+        """
         self.logger.debug(f"{self}: Generating TTS [{text}]")
 
         try:
-            if not self._websocket or self._websocket.closed:
+            if not self._websocket or self._websocket.state is State.CLOSED:
                 await self._connect()
                 self.logger.debug("Connected to ElevenLabs")
 
@@ -473,9 +685,16 @@ class ElevenLabsTTSService(AudioContextWordTTSService):
                 yield TTSStartedFrame()
                 self._started = True
                 self._cumulative_time = 0
-                #
-                self._context_id = str(uuid.uuid4())
-                await self.create_audio_context(self._context_id)
+                # If a context ID does not exist, create a new one and
+                # register it. If an ID exists, that means the Pipeline is
+                # configured for allow_interruptions=False, so continue
+                # using the current ID. When interruptions are enabled
+                # (e.g. allow_interruptions=True), user speech results in
+                # an interruption, which resets the context ID.
+                if not self._context_id:
+                    self._context_id = str(uuid.uuid4())
+                if not self.audio_context_available(self._context_id):
+                    await self.create_audio_context(self._context_id)
 
                 # Initialize context with voice settings
                 msg = {"text": " ", "context_id": self._context_id}
@@ -499,19 +718,27 @@ class ElevenLabsTTSService(AudioContextWordTTSService):
 
 
 class ElevenLabsHttpTTSService(WordTTSService):
-    """ElevenLabs
+    """ElevenLabs HTTP-based TTS service with word timestamps.
 
-
-
-
-        aiohttp_session: aiohttp ClientSession
-        model: Model ID (default: "eleven_flash_v2_5" for low latency)
-        base_url: API base URL
-        sample_rate: Output sample rate
-        params: Additional parameters for voice configuration
+    Provides text-to-speech using ElevenLabs' HTTP streaming API for simpler,
+    non-WebSocket integration. Suitable for use cases where streaming WebSocket
+    connection is not required or desired.
     """
 
     class InputParams(BaseModel):
+        """Input parameters for ElevenLabs HTTP TTS configuration.
+
+        Parameters:
+            language: Language to use for synthesis.
+            optimize_streaming_latency: Latency optimization level (0-4).
+            stability: Voice stability control (0.0 to 1.0).
+            similarity_boost: Similarity boost control (0.0 to 1.0).
+            style: Style control for voice expression (0.0 to 1.0).
+            use_speaker_boost: Whether to use speaker boost enhancement.
+            speed: Voice speed control (0.25 to 4.0).
+            apply_text_normalization: Text normalization mode ("auto", "on", "off").
+        """
+
         language: Optional[Language] = None
         optimize_streaming_latency: Optional[int] = None
         stability: Optional[float] = None
@@ -519,6 +746,7 @@ class ElevenLabsHttpTTSService(WordTTSService):
         style: Optional[float] = None
         use_speaker_boost: Optional[bool] = None
         speed: Optional[float] = None
+        apply_text_normalization: Optional[Literal["auto", "on", "off"]] = None
 
     def __init__(
         self,
@@ -526,12 +754,24 @@ class ElevenLabsHttpTTSService(WordTTSService):
         api_key: str,
         voice_id: str,
         aiohttp_session: aiohttp.ClientSession,
-        model: str = "eleven_flash_v2_5",
+        model: str = "eleven_turbo_v2_5",
         base_url: str = "https://api.elevenlabs.io",
         sample_rate: Optional[int] = None,
         params: Optional[InputParams] = None,
         **kwargs,
     ):
+        """Initialize the ElevenLabs HTTP TTS service.
+
+        Args:
+            api_key: ElevenLabs API key for authentication.
+            voice_id: ID of the voice to use for synthesis.
+            aiohttp_session: aiohttp ClientSession for HTTP requests.
+            model: TTS model to use (e.g., "eleven_turbo_v2_5").
+            base_url: Base URL for ElevenLabs HTTP API.
+            sample_rate: Audio sample rate. If None, uses default.
+            params: Additional input parameters for voice customization.
+            **kwargs: Additional arguments passed to the parent service.
+        """
         super().__init__(
             aggregate_sentences=True,
             push_text_frames=False,
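With `apply_text_normalization` now threaded through `InputParams`, the settings dictionary, and the request query parameters, normalization can be requested per service instance. An illustrative construction sketch (credentials are placeholders, and the "on"/"auto"/"off" semantics follow the ElevenLabs API documentation):

```python
import aiohttp

from pipecat.services.elevenlabs.tts import ElevenLabsHttpTTSService


async def make_tts(session: aiohttp.ClientSession) -> ElevenLabsHttpTTSService:
    # "on" asks ElevenLabs to normalize numbers, dates, etc. before synthesis;
    # "auto" leaves the decision to the service, "off" disables it.
    params = ElevenLabsHttpTTSService.InputParams(apply_text_normalization="on")
    return ElevenLabsHttpTTSService(
        api_key="ELEVENLABS_API_KEY",  # placeholder
        voice_id="VOICE_ID",           # placeholder
        aiohttp_session=session,
        params=params,
    )
```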
@@ -557,6 +797,7 @@ class ElevenLabsHttpTTSService(WordTTSService):
             "style": params.style,
             "use_speaker_boost": params.use_speaker_boost,
             "speed": params.speed,
+            "apply_text_normalization": params.apply_text_normalization,
         }
         self.set_model_name(model)
         self.set_voice(voice_id)
@@ -571,11 +812,22 @@ class ElevenLabsHttpTTSService(WordTTSService):
         self._previous_text = ""
 
     def language_to_service_language(self, language: Language) -> Optional[str]:
-        """Convert pipecat Language to ElevenLabs language code."""
+        """Convert pipecat Language to ElevenLabs language code.
+
+        Args:
+            language: The language to convert.
+
+        Returns:
+            The ElevenLabs-specific language code, or None if not supported.
+        """
         return language_to_elevenlabs_language(language)
 
     def can_generate_metrics(self) -> bool:
-        """
+        """Check if this service can generate processing metrics.
+
+        Returns:
+            True, as ElevenLabs HTTP service supports metrics generation.
+        """
         return True
 
     def _set_voice_settings(self):
@@ -589,12 +841,22 @@ class ElevenLabsHttpTTSService(WordTTSService):
         logger.debug(f"{self}: Reset internal state")
 
     async def start(self, frame: StartFrame):
-        """
+        """Start the ElevenLabs HTTP TTS service.
+
+        Args:
+            frame: The start frame containing initialization parameters.
+        """
         await super().start(frame)
         self._output_format = output_format_from_sample_rate(self.sample_rate)
         self._reset_state()
 
     async def push_frame(self, frame: Frame, direction: FrameDirection = FrameDirection.DOWNSTREAM):
+        """Push a frame and handle state changes.
+
+        Args:
+            frame: The frame to push.
+            direction: The direction to push the frame.
+        """
         await super().push_frame(frame, direction)
         if isinstance(frame, (StartInterruptionFrame, TTSStoppedFrame)):
             # Reset timing on interruption or stop
@@ -610,21 +872,23 @@ class ElevenLabsHttpTTSService(WordTTSService):
     def calculate_word_times(self, alignment_info: Mapping[str, Any]) -> List[Tuple[str, float]]:
         """Calculate word timing from character alignment data.
 
-        Example input data:
-            {
-                "characters": [" ", "H", "e", "l", "l", "o", " ", "w", "o", "r", "l", "d"],
-                "character_start_times_seconds": [0.0, 0.1, 0.15, 0.2, 0.25, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
-                "character_end_times_seconds": [0.1, 0.15, 0.2, 0.25, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
-            }
-
-        Would produce word times (with cumulative_time=0):
-            [("Hello", 0.1), ("world", 0.5)]
-
         Args:
-            alignment_info: Character timing data from ElevenLabs
+            alignment_info: Character timing data from ElevenLabs.
 
         Returns:
-            List of (word, timestamp) pairs
+            List of (word, timestamp) pairs.
+
+        Example input data::
+
+            {
+                "characters": [" ", "H", "e", "l", "l", "o", " ", "w", "o", "r", "l", "d"],
+                "character_start_times_seconds": [0.0, 0.1, 0.15, 0.2, 0.25, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
+                "character_end_times_seconds": [0.1, 0.15, 0.2, 0.25, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
+            }
+
+        Would produce word times (with cumulative_time=0)::
+
+            [("Hello", 0.1), ("world", 0.5)]
         """
         chars = alignment_info.get("characters", [])
         char_start_times = alignment_info.get("character_start_times_seconds", [])
@@ -675,10 +939,10 @@ class ElevenLabsHttpTTSService(WordTTSService):
         Includes previous text as context for better prosody continuity.
 
         Args:
-            text: Text to convert to speech
+            text: Text to convert to speech.
 
         Yields:
-            Audio and control frames
+            Frame: Audio and control frames containing the synthesized speech.
         """
         self.logger.debug(f"{self}: Generating TTS [{text}]")
 
@@ -717,6 +981,8 @@ class ElevenLabsHttpTTSService(WordTTSService):
         }
         if self._settings["optimize_streaming_latency"] is not None:
             params["optimize_streaming_latency"] = self._settings["optimize_streaming_latency"]
+        if self._settings["apply_text_normalization"] is not None:
+            params["apply_text_normalization"] = self._settings["apply_text_normalization"]
 
         self.logger.debug(f"ElevenLabs request - payload: {payload}, params: {params}")
 
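Taken together, the WebSocket-service changes surface two new constructor-level knobs: sentence aggregation is now an argument rather than hardcoded, and text normalization joins the voice parameters. An illustrative sketch (credentials are placeholders):

```python
from pipecat.services.elevenlabs.tts import ElevenLabsTTSService

tts = ElevenLabsTTSService(
    api_key="ELEVENLABS_API_KEY",  # placeholder
    voice_id="VOICE_ID",           # placeholder
    model="eleven_turbo_v2_5",     # the new default in this release
    params=ElevenLabsTTSService.InputParams(
        speed=1.1,
        apply_text_normalization="auto",
    ),
    aggregate_sentences=True,  # previously hardcoded, now configurable
)
```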