dv-pipecat-ai 0.0.74.dev770__py3-none-any.whl → 0.0.82.dev776__py3-none-any.whl
This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registries.
Potentially problematic release.
This version of dv-pipecat-ai might be problematic.
- {dv_pipecat_ai-0.0.74.dev770.dist-info → dv_pipecat_ai-0.0.82.dev776.dist-info}/METADATA +137 -93
- dv_pipecat_ai-0.0.82.dev776.dist-info/RECORD +340 -0
- pipecat/__init__.py +17 -0
- pipecat/adapters/base_llm_adapter.py +36 -1
- pipecat/adapters/schemas/direct_function.py +296 -0
- pipecat/adapters/schemas/function_schema.py +15 -6
- pipecat/adapters/schemas/tools_schema.py +55 -7
- pipecat/adapters/services/anthropic_adapter.py +22 -3
- pipecat/adapters/services/aws_nova_sonic_adapter.py +23 -3
- pipecat/adapters/services/bedrock_adapter.py +22 -3
- pipecat/adapters/services/gemini_adapter.py +16 -3
- pipecat/adapters/services/open_ai_adapter.py +17 -2
- pipecat/adapters/services/open_ai_realtime_adapter.py +23 -3
- pipecat/audio/filters/base_audio_filter.py +30 -6
- pipecat/audio/filters/koala_filter.py +37 -2
- pipecat/audio/filters/krisp_filter.py +59 -6
- pipecat/audio/filters/noisereduce_filter.py +37 -0
- pipecat/audio/interruptions/base_interruption_strategy.py +25 -5
- pipecat/audio/interruptions/min_words_interruption_strategy.py +21 -4
- pipecat/audio/mixers/base_audio_mixer.py +30 -7
- pipecat/audio/mixers/soundfile_mixer.py +53 -6
- pipecat/audio/resamplers/base_audio_resampler.py +17 -9
- pipecat/audio/resamplers/resampy_resampler.py +26 -1
- pipecat/audio/resamplers/soxr_resampler.py +32 -1
- pipecat/audio/resamplers/soxr_stream_resampler.py +101 -0
- pipecat/audio/utils.py +194 -1
- pipecat/audio/vad/silero.py +60 -3
- pipecat/audio/vad/vad_analyzer.py +114 -30
- pipecat/clocks/base_clock.py +19 -0
- pipecat/clocks/system_clock.py +25 -0
- pipecat/extensions/voicemail/__init__.py +0 -0
- pipecat/extensions/voicemail/voicemail_detector.py +707 -0
- pipecat/frames/frames.py +590 -156
- pipecat/metrics/metrics.py +64 -1
- pipecat/observers/base_observer.py +58 -19
- pipecat/observers/loggers/debug_log_observer.py +56 -64
- pipecat/observers/loggers/llm_log_observer.py +8 -1
- pipecat/observers/loggers/transcription_log_observer.py +19 -7
- pipecat/observers/loggers/user_bot_latency_log_observer.py +32 -5
- pipecat/observers/turn_tracking_observer.py +26 -1
- pipecat/pipeline/base_pipeline.py +5 -7
- pipecat/pipeline/base_task.py +52 -9
- pipecat/pipeline/parallel_pipeline.py +121 -177
- pipecat/pipeline/pipeline.py +129 -20
- pipecat/pipeline/runner.py +50 -1
- pipecat/pipeline/sync_parallel_pipeline.py +132 -32
- pipecat/pipeline/task.py +263 -280
- pipecat/pipeline/task_observer.py +85 -34
- pipecat/pipeline/to_be_updated/merge_pipeline.py +32 -2
- pipecat/processors/aggregators/dtmf_aggregator.py +29 -22
- pipecat/processors/aggregators/gated.py +25 -24
- pipecat/processors/aggregators/gated_openai_llm_context.py +22 -2
- pipecat/processors/aggregators/llm_response.py +398 -89
- pipecat/processors/aggregators/openai_llm_context.py +161 -13
- pipecat/processors/aggregators/sentence.py +25 -14
- pipecat/processors/aggregators/user_response.py +28 -3
- pipecat/processors/aggregators/vision_image_frame.py +24 -14
- pipecat/processors/async_generator.py +28 -0
- pipecat/processors/audio/audio_buffer_processor.py +78 -37
- pipecat/processors/consumer_processor.py +25 -6
- pipecat/processors/filters/frame_filter.py +23 -0
- pipecat/processors/filters/function_filter.py +30 -0
- pipecat/processors/filters/identity_filter.py +17 -2
- pipecat/processors/filters/null_filter.py +24 -1
- pipecat/processors/filters/stt_mute_filter.py +56 -21
- pipecat/processors/filters/wake_check_filter.py +46 -3
- pipecat/processors/filters/wake_notifier_filter.py +21 -3
- pipecat/processors/frame_processor.py +488 -131
- pipecat/processors/frameworks/langchain.py +38 -3
- pipecat/processors/frameworks/rtvi.py +719 -34
- pipecat/processors/gstreamer/pipeline_source.py +41 -0
- pipecat/processors/idle_frame_processor.py +26 -3
- pipecat/processors/logger.py +23 -0
- pipecat/processors/metrics/frame_processor_metrics.py +77 -4
- pipecat/processors/metrics/sentry.py +42 -4
- pipecat/processors/producer_processor.py +34 -14
- pipecat/processors/text_transformer.py +22 -10
- pipecat/processors/transcript_processor.py +48 -29
- pipecat/processors/user_idle_processor.py +31 -21
- pipecat/runner/__init__.py +1 -0
- pipecat/runner/daily.py +132 -0
- pipecat/runner/livekit.py +148 -0
- pipecat/runner/run.py +543 -0
- pipecat/runner/types.py +67 -0
- pipecat/runner/utils.py +515 -0
- pipecat/serializers/base_serializer.py +42 -0
- pipecat/serializers/exotel.py +17 -6
- pipecat/serializers/genesys.py +95 -0
- pipecat/serializers/livekit.py +33 -0
- pipecat/serializers/plivo.py +16 -15
- pipecat/serializers/protobuf.py +37 -1
- pipecat/serializers/telnyx.py +18 -17
- pipecat/serializers/twilio.py +32 -16
- pipecat/services/ai_service.py +5 -3
- pipecat/services/anthropic/llm.py +113 -43
- pipecat/services/assemblyai/models.py +63 -5
- pipecat/services/assemblyai/stt.py +64 -11
- pipecat/services/asyncai/__init__.py +0 -0
- pipecat/services/asyncai/tts.py +501 -0
- pipecat/services/aws/llm.py +185 -111
- pipecat/services/aws/stt.py +217 -23
- pipecat/services/aws/tts.py +118 -52
- pipecat/services/aws/utils.py +101 -5
- pipecat/services/aws_nova_sonic/aws.py +82 -64
- pipecat/services/aws_nova_sonic/context.py +15 -6
- pipecat/services/azure/common.py +10 -2
- pipecat/services/azure/image.py +32 -0
- pipecat/services/azure/llm.py +9 -7
- pipecat/services/azure/stt.py +65 -2
- pipecat/services/azure/tts.py +154 -23
- pipecat/services/cartesia/stt.py +125 -8
- pipecat/services/cartesia/tts.py +102 -38
- pipecat/services/cerebras/llm.py +15 -23
- pipecat/services/deepgram/stt.py +19 -11
- pipecat/services/deepgram/tts.py +36 -0
- pipecat/services/deepseek/llm.py +14 -23
- pipecat/services/elevenlabs/tts.py +330 -64
- pipecat/services/fal/image.py +43 -0
- pipecat/services/fal/stt.py +48 -10
- pipecat/services/fireworks/llm.py +14 -21
- pipecat/services/fish/tts.py +109 -9
- pipecat/services/gemini_multimodal_live/__init__.py +1 -0
- pipecat/services/gemini_multimodal_live/events.py +83 -2
- pipecat/services/gemini_multimodal_live/file_api.py +189 -0
- pipecat/services/gemini_multimodal_live/gemini.py +218 -21
- pipecat/services/gladia/config.py +17 -10
- pipecat/services/gladia/stt.py +82 -36
- pipecat/services/google/frames.py +40 -0
- pipecat/services/google/google.py +2 -0
- pipecat/services/google/image.py +39 -2
- pipecat/services/google/llm.py +176 -58
- pipecat/services/google/llm_openai.py +26 -4
- pipecat/services/google/llm_vertex.py +37 -15
- pipecat/services/google/rtvi.py +41 -0
- pipecat/services/google/stt.py +65 -17
- pipecat/services/google/test-google-chirp.py +45 -0
- pipecat/services/google/tts.py +390 -19
- pipecat/services/grok/llm.py +8 -6
- pipecat/services/groq/llm.py +8 -6
- pipecat/services/groq/stt.py +13 -9
- pipecat/services/groq/tts.py +40 -0
- pipecat/services/hamsa/__init__.py +9 -0
- pipecat/services/hamsa/stt.py +241 -0
- pipecat/services/heygen/__init__.py +5 -0
- pipecat/services/heygen/api.py +281 -0
- pipecat/services/heygen/client.py +620 -0
- pipecat/services/heygen/video.py +338 -0
- pipecat/services/image_service.py +5 -3
- pipecat/services/inworld/__init__.py +1 -0
- pipecat/services/inworld/tts.py +592 -0
- pipecat/services/llm_service.py +127 -45
- pipecat/services/lmnt/tts.py +80 -7
- pipecat/services/mcp_service.py +85 -44
- pipecat/services/mem0/memory.py +42 -13
- pipecat/services/minimax/tts.py +74 -15
- pipecat/services/mistral/__init__.py +0 -0
- pipecat/services/mistral/llm.py +185 -0
- pipecat/services/moondream/vision.py +55 -10
- pipecat/services/neuphonic/tts.py +275 -48
- pipecat/services/nim/llm.py +8 -6
- pipecat/services/ollama/llm.py +27 -7
- pipecat/services/openai/base_llm.py +54 -16
- pipecat/services/openai/image.py +30 -0
- pipecat/services/openai/llm.py +7 -5
- pipecat/services/openai/stt.py +13 -9
- pipecat/services/openai/tts.py +42 -10
- pipecat/services/openai_realtime_beta/azure.py +11 -9
- pipecat/services/openai_realtime_beta/context.py +7 -5
- pipecat/services/openai_realtime_beta/events.py +10 -7
- pipecat/services/openai_realtime_beta/openai.py +37 -18
- pipecat/services/openpipe/llm.py +30 -24
- pipecat/services/openrouter/llm.py +9 -7
- pipecat/services/perplexity/llm.py +15 -19
- pipecat/services/piper/tts.py +26 -12
- pipecat/services/playht/tts.py +227 -65
- pipecat/services/qwen/llm.py +8 -6
- pipecat/services/rime/tts.py +128 -17
- pipecat/services/riva/stt.py +160 -22
- pipecat/services/riva/tts.py +67 -2
- pipecat/services/sambanova/llm.py +19 -17
- pipecat/services/sambanova/stt.py +14 -8
- pipecat/services/sarvam/tts.py +60 -13
- pipecat/services/simli/video.py +82 -21
- pipecat/services/soniox/__init__.py +0 -0
- pipecat/services/soniox/stt.py +398 -0
- pipecat/services/speechmatics/stt.py +29 -17
- pipecat/services/stt_service.py +47 -11
- pipecat/services/tavus/video.py +94 -25
- pipecat/services/together/llm.py +8 -6
- pipecat/services/tts_service.py +77 -53
- pipecat/services/ultravox/stt.py +46 -43
- pipecat/services/vision_service.py +5 -3
- pipecat/services/websocket_service.py +12 -11
- pipecat/services/whisper/base_stt.py +58 -12
- pipecat/services/whisper/stt.py +69 -58
- pipecat/services/xtts/tts.py +59 -2
- pipecat/sync/base_notifier.py +19 -0
- pipecat/sync/event_notifier.py +24 -0
- pipecat/tests/utils.py +73 -5
- pipecat/transcriptions/language.py +24 -0
- pipecat/transports/base_input.py +112 -8
- pipecat/transports/base_output.py +235 -13
- pipecat/transports/base_transport.py +119 -0
- pipecat/transports/local/audio.py +76 -0
- pipecat/transports/local/tk.py +84 -0
- pipecat/transports/network/fastapi_websocket.py +174 -15
- pipecat/transports/network/small_webrtc.py +383 -39
- pipecat/transports/network/webrtc_connection.py +214 -8
- pipecat/transports/network/websocket_client.py +171 -1
- pipecat/transports/network/websocket_server.py +147 -9
- pipecat/transports/services/daily.py +792 -70
- pipecat/transports/services/helpers/daily_rest.py +122 -129
- pipecat/transports/services/livekit.py +339 -4
- pipecat/transports/services/tavus.py +273 -38
- pipecat/utils/asyncio/task_manager.py +92 -186
- pipecat/utils/base_object.py +83 -1
- pipecat/utils/network.py +2 -0
- pipecat/utils/string.py +114 -58
- pipecat/utils/text/base_text_aggregator.py +44 -13
- pipecat/utils/text/base_text_filter.py +46 -0
- pipecat/utils/text/markdown_text_filter.py +70 -14
- pipecat/utils/text/pattern_pair_aggregator.py +18 -14
- pipecat/utils/text/simple_text_aggregator.py +43 -2
- pipecat/utils/text/skip_tags_aggregator.py +21 -13
- pipecat/utils/time.py +36 -0
- pipecat/utils/tracing/class_decorators.py +32 -7
- pipecat/utils/tracing/conversation_context_provider.py +12 -2
- pipecat/utils/tracing/service_attributes.py +80 -64
- pipecat/utils/tracing/service_decorators.py +48 -21
- pipecat/utils/tracing/setup.py +13 -7
- pipecat/utils/tracing/turn_context_provider.py +12 -2
- pipecat/utils/tracing/turn_trace_observer.py +27 -0
- pipecat/utils/utils.py +14 -14
- dv_pipecat_ai-0.0.74.dev770.dist-info/RECORD +0 -319
- pipecat/examples/daily_runner.py +0 -64
- pipecat/examples/run.py +0 -265
- pipecat/utils/asyncio/watchdog_async_iterator.py +0 -72
- pipecat/utils/asyncio/watchdog_event.py +0 -42
- pipecat/utils/asyncio/watchdog_priority_queue.py +0 -48
- pipecat/utils/asyncio/watchdog_queue.py +0 -48
- {dv_pipecat_ai-0.0.74.dev770.dist-info → dv_pipecat_ai-0.0.82.dev776.dist-info}/WHEEL +0 -0
- {dv_pipecat_ai-0.0.74.dev770.dist-info → dv_pipecat_ai-0.0.82.dev776.dist-info}/licenses/LICENSE +0 -0
- {dv_pipecat_ai-0.0.74.dev770.dist-info → dv_pipecat_ai-0.0.82.dev776.dist-info}/top_level.txt +0 -0
- /pipecat/{examples → extensions}/__init__.py +0 -0
pipecat/services/azure/tts.py
CHANGED
@@ -4,6 +4,8 @@
 # SPDX-License-Identifier: BSD 2-Clause License
 #

+"""Azure Cognitive Services Text-to-Speech service implementations."""
+
 import asyncio
 from typing import AsyncGenerator, Optional

@@ -21,8 +23,8 @@ from pipecat.frames.frames import (
 from pipecat.services.azure.common import language_to_azure_language
 from pipecat.services.tts_service import TTSService
 from pipecat.transcriptions.language import Language
-from pipecat.utils.utils import detect_language_from_script
 from pipecat.utils.tracing.service_decorators import traced_tts
+from pipecat.utils.utils import detect_language_from_script

 try:
     from azure.cognitiveservices.speech import (
@@ -40,6 +42,15 @@ except ModuleNotFoundError as e:


 def sample_rate_to_output_format(sample_rate: int) -> SpeechSynthesisOutputFormat:
+    """Convert sample rate to Azure speech synthesis output format.
+
+    Args:
+        sample_rate: Sample rate in Hz.
+
+    Returns:
+        Corresponding Azure SpeechSynthesisOutputFormat enum value.
+        Defaults to Raw24Khz16BitMonoPcm if sample rate not found.
+    """
     sample_rate_map = {
         8000: SpeechSynthesisOutputFormat.Raw8Khz16BitMonoPcm,
         16000: SpeechSynthesisOutputFormat.Raw16Khz16BitMonoPcm,
@@ -52,7 +63,36 @@ def sample_rate_to_output_format(sample_rate: int) -> SpeechSynthesisOutputFormat:


 class AzureBaseTTSService(TTSService):
+    """Base class for Azure Cognitive Services text-to-speech implementations.
+
+    Provides common functionality for Azure TTS services including SSML
+    construction, voice configuration, and parameter management.
+    """
+
+    # Define SSML escape mappings based on SSML reserved characters
+    # See - https://learn.microsoft.com/en-us/azure/ai-services/speech-service/speech-synthesis-markup-structure
+    SSML_ESCAPE_CHARS = {
+        "&": "&amp;",
+        "<": "&lt;",
+        ">": "&gt;",
+        '"': "&quot;",
+        "'": "&apos;",
+    }
+
     class InputParams(BaseModel):
+        """Input parameters for Azure TTS voice configuration.
+
+        Parameters:
+            emphasis: Emphasis level for speech ("strong", "moderate", "reduced").
+            language: Language for synthesis. Defaults to English (US).
+            pitch: Voice pitch adjustment (e.g., "+10%", "-5Hz", "high").
+            rate: Speech rate multiplier. Defaults to "1.05".
+            role: Voice role for expression (e.g., "YoungAdultFemale").
+            style: Speaking style (e.g., "cheerful", "sad", "excited").
+            style_degree: Intensity of the speaking style (0.01 to 2.0).
+            volume: Volume level (e.g., "+20%", "loud", "x-soft").
+        """
+
         emphasis: Optional[str] = None
         language: Optional[Language] = Language.EN_US
         pitch: Optional[str] = None
@@ -75,6 +115,16 @@ class AzureBaseTTSService(TTSService):
         params: Optional[InputParams] = None,
         **kwargs,
     ):
+        """Initialize the Azure TTS service with configuration parameters.
+
+        Args:
+            api_key: Azure Cognitive Services subscription key.
+            region: Azure region identifier (e.g., "eastus", "westus2").
+            voice: Voice name to use for synthesis. Defaults to "en-US-SaraNeural".
+            sample_rate: Audio sample rate in Hz. If None, uses service default.
+            params: Voice and synthesis parameters configuration.
+            **kwargs: Additional arguments passed to parent TTSService.
+        """
         super().__init__(sample_rate=sample_rate, **kwargs)

         params = params or AzureBaseTTSService.InputParams()
@@ -138,9 +188,22 @@ class AzureBaseTTSService(TTSService):
         logger.debug(f"Final additional language map: {self._additional_lang_map}")

     def can_generate_metrics(self) -> bool:
+        """Check if this service can generate processing metrics.
+
+        Returns:
+            True, as Azure TTS service supports metrics generation.
+        """
         return True

     def language_to_service_language(self, language: Language) -> Optional[str]:
+        """Convert a Language enum to Azure language format.
+
+        Args:
+            language: The language to convert.
+
+        Returns:
+            The Azure-specific language code, or None if not supported.
+        """
         return language_to_azure_language(language)

     def _construct_ssml(self, text: str) -> str:
@@ -162,6 +225,10 @@ class AzureBaseTTSService(TTSService):
         )

         # 3. Construct SSML with the selected language and voice
+
+        # Escape special characters
+        escaped_text = self._escape_text(text)
+
         ssml = (
             f"<speak version='1.0' xml:lang='{target_language}' "
             "xmlns='http://www.w3.org/2001/10/synthesis' "
@@ -193,10 +260,10 @@ class AzureBaseTTSService(TTSService):

         if "Multilingual" in target_voice:
             ssml += f"<lang xml:lang='{target_language}'>"
-            ssml += text
+            ssml += escaped_text
             ssml += "</lang>"
         else:
-            ssml += text
+            ssml += escaped_text

         if self._settings["emphasis"]:
             ssml += "</emphasis>"
@@ -210,9 +277,42 @@ class AzureBaseTTSService(TTSService):

         return ssml

+    def _escape_text(self, text: str) -> str:
+        """Escapes XML/SSML reserved characters according to Microsoft documentation.
+
+        This method escapes the following characters:
+        - & becomes &amp;
+        - < becomes &lt;
+        - > becomes &gt;
+        - " becomes &quot;
+        - ' becomes &apos;
+
+        Args:
+            text: The text to escape.
+
+        Returns:
+            The escaped text.
+        """
+        escaped_text = text
+        for char, escape_code in AzureBaseTTSService.SSML_ESCAPE_CHARS.items():
+            escaped_text = escaped_text.replace(char, escape_code)
+        return escaped_text
+

 class AzureTTSService(AzureBaseTTSService):
+    """Azure Cognitive Services streaming TTS service.
+
+    Provides real-time text-to-speech synthesis using Azure's WebSocket-based
+    streaming API. Audio chunks are streamed as they become available for
+    lower latency playback.
+    """
+
     def __init__(self, **kwargs):
+        """Initialize the Azure streaming TTS service.
+
+        Args:
+            **kwargs: All arguments passed to AzureBaseTTSService parent class.
+        """
         super().__init__(**kwargs)
         self._speech_config = None
         self._speech_synthesizer = None
@@ -220,6 +320,11 @@ class AzureTTSService(AzureBaseTTSService):
         self._clear_audio = False

     async def start(self, frame: StartFrame):
+        """Start the Azure TTS service and initialize speech synthesizer.
+
+        Args:
+            frame: Start frame containing initialization parameters.
+        """
         await super().start(frame)

         if self._speech_config:
@@ -250,12 +355,12 @@ class AzureTTSService(AzureBaseTTSService):
         self._speech_synthesizer.synthesis_canceled.connect(self._handle_canceled)

     def _handle_synthesizing(self, evt):
-        """Handle audio chunks as they arrive"""
+        """Handle audio chunks as they arrive."""
         if evt.result and evt.result.audio_data:
             self._audio_queue.put_nowait(evt.result.audio_data)

     def _handle_completed(self, evt):
-        """Handle synthesis completion"""
+        """Handle synthesis completion."""
         self._audio_queue.put_nowait(None)  # Signal completion

     def _handle_canceled(self, evt):
@@ -263,29 +368,30 @@ class AzureTTSService(AzureBaseTTSService):
         self.logger.error(f"Speech synthesis canceled: {evt.result.cancellation_details.reason}")
         self._audio_queue.put_nowait(None)

-    async def flush_audio(self):
-        """Flush any pending audio data."""
-        self._clear_audio = True
-        if self._speech_synthesizer is not None:
-            future = self._speech_synthesizer.stop_speaking_async()
-
-            async def wait_for_future_completion():
-                loop = self.get_event_loop()
-                await loop.run_in_executor(None, future.get)
-
-            task = self.create_task(wait_for_future_completion())
-            await self.wait_for_task(task)
-            while not self._audio_queue.empty():
-                try:
-                    self._audio_queue.get_nowait()
-                except asyncio.QueueEmpty:
-                    break
-            self._clear_audio = False
+    async def flush_audio(self):
+        """Flush any pending audio data."""
+        logger.trace(f"{self}: flushing audio")

     @traced_tts
     async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
+        """Generate speech from text using Azure's streaming synthesis.
+
+        Args:
+            text: The text to synthesize into speech.
+
+        Yields:
+            Frame: Audio frames containing synthesized speech data.
+        """
         text = text.lstrip()
         self.logger.debug(f"{self}: Generating TTS [{text}]")
+
+        # Clear the audio queue in case there's still audio in it, causing the next audio response
+        # to be cut off by the 'None' element returned at the end of the previous audio synthesis.
+        # Empty the audio queue before processing the new text
+        while not self._audio_queue.empty():
+            self._audio_queue.get_nowait()
+            self._audio_queue.task_done()
+
         try:
             if self._speech_synthesizer is None:
                 error_msg = "Speech synthesizer not initialized."
@@ -324,12 +430,29 @@ class AzureTTSService(AzureBaseTTSService):


 class AzureHttpTTSService(AzureBaseTTSService):
+    """Azure Cognitive Services HTTP-based TTS service.
+
+    Provides text-to-speech synthesis using Azure's HTTP API for simpler,
+    non-streaming synthesis. Suitable for use cases where streaming is not
+    required and simpler integration is preferred.
+    """
+
     def __init__(self, **kwargs):
+        """Initialize the Azure HTTP TTS service.
+
+        Args:
+            **kwargs: All arguments passed to AzureBaseTTSService parent class.
+        """
         super().__init__(**kwargs)
         self._speech_config = None
         self._speech_synthesizer = None

     async def start(self, frame: StartFrame):
+        """Start the Azure HTTP TTS service and initialize speech synthesizer.
+
+        Args:
+            frame: Start frame containing initialization parameters.
+        """
         await super().start(frame)

         if self._speech_config:
@@ -349,6 +472,14 @@ class AzureHttpTTSService(AzureBaseTTSService):

     @traced_tts
     async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
+        """Generate speech from text using Azure's HTTP synthesis API.
+
+        Args:
+            text: The text to synthesize into speech.
+
+        Yields:
+            Frame: Audio frames containing the complete synthesized speech.
+        """
         logger.debug(f"{self}: Generating TTS [{text}]")

         await self.start_ttfb_metrics()
pipecat/services/cartesia/stt.py
CHANGED
@@ -4,12 +4,17 @@
 # SPDX-License-Identifier: BSD 2-Clause License
 #

+"""Cartesia Speech-to-Text service implementation.
+
+This module provides a WebSocket-based STT service that integrates with
+the Cartesia Live transcription API for real-time speech recognition.
+"""
+
 import asyncio
 import json
 import urllib.parse
 from typing import AsyncGenerator, Optional

-import websockets
 from loguru import logger

 from pipecat.frames.frames import (
@@ -28,8 +33,23 @@ from pipecat.transcriptions.language import Language
 from pipecat.utils.time import time_now_iso8601
 from pipecat.utils.tracing.service_decorators import traced_stt

+try:
+    import websockets
+    from websockets.asyncio.client import connect as websocket_connect
+    from websockets.protocol import State
+except ModuleNotFoundError as e:
+    logger.error(f"Exception: {e}")
+    logger.error("In order to use Cartesia, you need to `pip install pipecat-ai[cartesia]`.")
+    raise Exception(f"Missing module: {e}")
+

 class CartesiaLiveOptions:
+    """Configuration options for Cartesia Live STT service.
+
+    Manages transcription parameters including model selection, language,
+    audio encoding format, and sample rate settings.
+    """
+
     def __init__(
         self,
         *,
@@ -39,6 +59,15 @@ class CartesiaLiveOptions:
         sample_rate: int = 16000,
         **kwargs,
     ):
+        """Initialize CartesiaLiveOptions with default or provided parameters.
+
+        Args:
+            model: The transcription model to use. Defaults to "ink-whisper".
+            language: Target language for transcription. Defaults to English.
+            encoding: Audio encoding format. Defaults to "pcm_s16le".
+            sample_rate: Audio sample rate in Hz. Defaults to 16000.
+            **kwargs: Additional parameters for the transcription service.
+        """
         self.model = model
         self.language = language
         self.encoding = encoding
@@ -46,6 +75,11 @@ class CartesiaLiveOptions:
         self.additional_params = kwargs

     def to_dict(self):
+        """Convert options to dictionary format.
+
+        Returns:
+            Dictionary containing all configuration parameters.
+        """
         params = {
             "model": self.model,
             "language": self.language if isinstance(self.language, str) else self.language.value,
@@ -56,19 +90,48 @@ class CartesiaLiveOptions:
         return params

     def items(self):
+        """Get configuration items as key-value pairs.
+
+        Returns:
+            Iterator of (key, value) tuples for all configuration parameters.
+        """
         return self.to_dict().items()

     def get(self, key, default=None):
+        """Get a configuration value by key.
+
+        Args:
+            key: The configuration parameter name to retrieve.
+            default: Default value if key is not found.
+
+        Returns:
+            The configuration value or default if not found.
+        """
         if hasattr(self, key):
             return getattr(self, key)
         return self.additional_params.get(key, default)

     @classmethod
     def from_json(cls, json_str: str) -> "CartesiaLiveOptions":
+        """Create options from JSON string.
+
+        Args:
+            json_str: JSON string containing configuration parameters.
+
+        Returns:
+            New CartesiaLiveOptions instance with parsed parameters.
+        """
         return cls(**json.loads(json_str))


 class CartesiaSTTService(STTService):
+    """Speech-to-text service using Cartesia Live API.
+
+    Provides real-time speech transcription through WebSocket connection
+    to Cartesia's Live transcription service. Supports both interim and
+    final transcriptions with configurable models and languages.
+    """
+
     def __init__(
         self,
         *,
@@ -78,6 +141,15 @@ class CartesiaSTTService(STTService):
         live_options: Optional[CartesiaLiveOptions] = None,
         **kwargs,
     ):
+        """Initialize CartesiaSTTService with API key and options.
+
+        Args:
+            api_key: Authentication key for Cartesia API.
+            base_url: Custom API endpoint URL. If empty, uses default.
+            sample_rate: Audio sample rate in Hz. Defaults to 16000.
+            live_options: Configuration options for transcription service.
+            **kwargs: Additional arguments passed to parent STTService.
+        """
         sample_rate = sample_rate or (live_options.sample_rate if live_options else None)
         super().__init__(sample_rate=sample_rate, **kwargs)

@@ -108,23 +180,51 @@ class CartesiaSTTService(STTService):
         self._receiver_task = None

     def can_generate_metrics(self) -> bool:
+        """Check if the service can generate processing metrics.
+
+        Returns:
+            True, indicating metrics are supported.
+        """
         return True

     async def start(self, frame: StartFrame):
+        """Start the STT service and establish connection.
+
+        Args:
+            frame: Frame indicating service should start.
+        """
         await super().start(frame)
         await self._connect()

     async def stop(self, frame: EndFrame):
+        """Stop the STT service and close connection.
+
+        Args:
+            frame: Frame indicating service should stop.
+        """
         await super().stop(frame)
         await self._disconnect()

     async def cancel(self, frame: CancelFrame):
+        """Cancel the STT service and close connection.
+
+        Args:
+            frame: Frame indicating service should be cancelled.
+        """
         await super().cancel(frame)
         await self._disconnect()

     async def run_stt(self, audio: bytes) -> AsyncGenerator[Frame, None]:
+        """Process audio data for speech-to-text transcription.
+
+        Args:
+            audio: Raw audio bytes to transcribe.
+
+        Yields:
+            None - transcription results are handled via WebSocket responses.
+        """
         # If the connection is closed, due to timeout, we need to reconnect when the user starts speaking again
-        if not self._connection or self._connection.closed:
+        if not self._connection or self._connection.state is State.CLOSED:
             await self._connect()

         await self._connection.send(audio)
@@ -137,7 +237,7 @@ class CartesiaSTTService(STTService):
         headers = {"Cartesia-Version": "2025-04-16", "X-API-Key": self._api_key}

         try:
-            self._connection = await websockets.connect(ws_url, extra_headers=headers)
+            self._connection = await websocket_connect(ws_url, additional_headers=headers)
             # Setup the receiver task to handle the incoming messages from the Cartesia server
             if self._receiver_task is None or self._receiver_task.done():
                 self._receiver_task = asyncio.create_task(self._receive_messages())
@@ -148,7 +248,7 @@ class CartesiaSTTService(STTService):
     async def _receive_messages(self):
         try:
             while True:
-                if not self._connection or self._connection.closed:
+                if not self._connection or self._connection.state is State.CLOSED:
                     break

                 message = await self._connection.recv()
@@ -197,14 +297,24 @@ class CartesiaSTTService(STTService):
                 await self.stop_ttfb_metrics()
                 if is_final:
                     await self.push_frame(
-                        TranscriptionFrame(transcript, self._user_id, time_now_iso8601(), language)
+                        TranscriptionFrame(
+                            transcript,
+                            self._user_id,
+                            time_now_iso8601(),
+                            language,
+                        )
                     )
                     await self._handle_transcription(transcript, is_final, language)
                     await self.stop_processing_metrics()
                 else:
                     # For interim transcriptions, just push the frame without tracing
                     await self.push_frame(
-                        InterimTranscriptionFrame(transcript, self._user_id, time_now_iso8601(), language)
+                        InterimTranscriptionFrame(
+                            transcript,
+                            self._user_id,
+                            time_now_iso8601(),
+                            language,
+                        )
                     )

     async def _disconnect(self):
@@ -218,22 +328,29 @@ class CartesiaSTTService(STTService):
                 logger.exception(f"Unexpected exception while cancelling task: {e}")
             self._receiver_task = None

-        if self._connection and self._connection.open:
+        if self._connection and self._connection.state is State.OPEN:
             logger.debug("Disconnecting from Cartesia")

             await self._connection.close()
             self._connection = None

     async def start_metrics(self):
+        """Start performance metrics collection for transcription processing."""
         await self.start_ttfb_metrics()
         await self.start_processing_metrics()

     async def process_frame(self, frame: Frame, direction: FrameDirection):
+        """Process incoming frames and handle speech events.
+
+        Args:
+            frame: The frame to process.
+            direction: Direction of frame flow in the pipeline.
+        """
         await super().process_frame(frame, direction)

         if isinstance(frame, UserStartedSpeakingFrame):
             await self.start_metrics()
         elif isinstance(frame, UserStoppedSpeakingFrame):
             # Send finalize command to flush the transcription session
-            if self._connection and self._connection.open:
+            if self._connection and self._connection.state is State.OPEN:
                 await self._connection.send("finalize")