dv-pipecat-ai 0.0.74.dev770-py3-none-any.whl → 0.0.82.dev776-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of dv-pipecat-ai has been flagged as potentially problematic.
- {dv_pipecat_ai-0.0.74.dev770.dist-info → dv_pipecat_ai-0.0.82.dev776.dist-info}/METADATA +137 -93
- dv_pipecat_ai-0.0.82.dev776.dist-info/RECORD +340 -0
- pipecat/__init__.py +17 -0
- pipecat/adapters/base_llm_adapter.py +36 -1
- pipecat/adapters/schemas/direct_function.py +296 -0
- pipecat/adapters/schemas/function_schema.py +15 -6
- pipecat/adapters/schemas/tools_schema.py +55 -7
- pipecat/adapters/services/anthropic_adapter.py +22 -3
- pipecat/adapters/services/aws_nova_sonic_adapter.py +23 -3
- pipecat/adapters/services/bedrock_adapter.py +22 -3
- pipecat/adapters/services/gemini_adapter.py +16 -3
- pipecat/adapters/services/open_ai_adapter.py +17 -2
- pipecat/adapters/services/open_ai_realtime_adapter.py +23 -3
- pipecat/audio/filters/base_audio_filter.py +30 -6
- pipecat/audio/filters/koala_filter.py +37 -2
- pipecat/audio/filters/krisp_filter.py +59 -6
- pipecat/audio/filters/noisereduce_filter.py +37 -0
- pipecat/audio/interruptions/base_interruption_strategy.py +25 -5
- pipecat/audio/interruptions/min_words_interruption_strategy.py +21 -4
- pipecat/audio/mixers/base_audio_mixer.py +30 -7
- pipecat/audio/mixers/soundfile_mixer.py +53 -6
- pipecat/audio/resamplers/base_audio_resampler.py +17 -9
- pipecat/audio/resamplers/resampy_resampler.py +26 -1
- pipecat/audio/resamplers/soxr_resampler.py +32 -1
- pipecat/audio/resamplers/soxr_stream_resampler.py +101 -0
- pipecat/audio/utils.py +194 -1
- pipecat/audio/vad/silero.py +60 -3
- pipecat/audio/vad/vad_analyzer.py +114 -30
- pipecat/clocks/base_clock.py +19 -0
- pipecat/clocks/system_clock.py +25 -0
- pipecat/extensions/voicemail/__init__.py +0 -0
- pipecat/extensions/voicemail/voicemail_detector.py +707 -0
- pipecat/frames/frames.py +590 -156
- pipecat/metrics/metrics.py +64 -1
- pipecat/observers/base_observer.py +58 -19
- pipecat/observers/loggers/debug_log_observer.py +56 -64
- pipecat/observers/loggers/llm_log_observer.py +8 -1
- pipecat/observers/loggers/transcription_log_observer.py +19 -7
- pipecat/observers/loggers/user_bot_latency_log_observer.py +32 -5
- pipecat/observers/turn_tracking_observer.py +26 -1
- pipecat/pipeline/base_pipeline.py +5 -7
- pipecat/pipeline/base_task.py +52 -9
- pipecat/pipeline/parallel_pipeline.py +121 -177
- pipecat/pipeline/pipeline.py +129 -20
- pipecat/pipeline/runner.py +50 -1
- pipecat/pipeline/sync_parallel_pipeline.py +132 -32
- pipecat/pipeline/task.py +263 -280
- pipecat/pipeline/task_observer.py +85 -34
- pipecat/pipeline/to_be_updated/merge_pipeline.py +32 -2
- pipecat/processors/aggregators/dtmf_aggregator.py +29 -22
- pipecat/processors/aggregators/gated.py +25 -24
- pipecat/processors/aggregators/gated_openai_llm_context.py +22 -2
- pipecat/processors/aggregators/llm_response.py +398 -89
- pipecat/processors/aggregators/openai_llm_context.py +161 -13
- pipecat/processors/aggregators/sentence.py +25 -14
- pipecat/processors/aggregators/user_response.py +28 -3
- pipecat/processors/aggregators/vision_image_frame.py +24 -14
- pipecat/processors/async_generator.py +28 -0
- pipecat/processors/audio/audio_buffer_processor.py +78 -37
- pipecat/processors/consumer_processor.py +25 -6
- pipecat/processors/filters/frame_filter.py +23 -0
- pipecat/processors/filters/function_filter.py +30 -0
- pipecat/processors/filters/identity_filter.py +17 -2
- pipecat/processors/filters/null_filter.py +24 -1
- pipecat/processors/filters/stt_mute_filter.py +56 -21
- pipecat/processors/filters/wake_check_filter.py +46 -3
- pipecat/processors/filters/wake_notifier_filter.py +21 -3
- pipecat/processors/frame_processor.py +488 -131
- pipecat/processors/frameworks/langchain.py +38 -3
- pipecat/processors/frameworks/rtvi.py +719 -34
- pipecat/processors/gstreamer/pipeline_source.py +41 -0
- pipecat/processors/idle_frame_processor.py +26 -3
- pipecat/processors/logger.py +23 -0
- pipecat/processors/metrics/frame_processor_metrics.py +77 -4
- pipecat/processors/metrics/sentry.py +42 -4
- pipecat/processors/producer_processor.py +34 -14
- pipecat/processors/text_transformer.py +22 -10
- pipecat/processors/transcript_processor.py +48 -29
- pipecat/processors/user_idle_processor.py +31 -21
- pipecat/runner/__init__.py +1 -0
- pipecat/runner/daily.py +132 -0
- pipecat/runner/livekit.py +148 -0
- pipecat/runner/run.py +543 -0
- pipecat/runner/types.py +67 -0
- pipecat/runner/utils.py +515 -0
- pipecat/serializers/base_serializer.py +42 -0
- pipecat/serializers/exotel.py +17 -6
- pipecat/serializers/genesys.py +95 -0
- pipecat/serializers/livekit.py +33 -0
- pipecat/serializers/plivo.py +16 -15
- pipecat/serializers/protobuf.py +37 -1
- pipecat/serializers/telnyx.py +18 -17
- pipecat/serializers/twilio.py +32 -16
- pipecat/services/ai_service.py +5 -3
- pipecat/services/anthropic/llm.py +113 -43
- pipecat/services/assemblyai/models.py +63 -5
- pipecat/services/assemblyai/stt.py +64 -11
- pipecat/services/asyncai/__init__.py +0 -0
- pipecat/services/asyncai/tts.py +501 -0
- pipecat/services/aws/llm.py +185 -111
- pipecat/services/aws/stt.py +217 -23
- pipecat/services/aws/tts.py +118 -52
- pipecat/services/aws/utils.py +101 -5
- pipecat/services/aws_nova_sonic/aws.py +82 -64
- pipecat/services/aws_nova_sonic/context.py +15 -6
- pipecat/services/azure/common.py +10 -2
- pipecat/services/azure/image.py +32 -0
- pipecat/services/azure/llm.py +9 -7
- pipecat/services/azure/stt.py +65 -2
- pipecat/services/azure/tts.py +154 -23
- pipecat/services/cartesia/stt.py +125 -8
- pipecat/services/cartesia/tts.py +102 -38
- pipecat/services/cerebras/llm.py +15 -23
- pipecat/services/deepgram/stt.py +19 -11
- pipecat/services/deepgram/tts.py +36 -0
- pipecat/services/deepseek/llm.py +14 -23
- pipecat/services/elevenlabs/tts.py +330 -64
- pipecat/services/fal/image.py +43 -0
- pipecat/services/fal/stt.py +48 -10
- pipecat/services/fireworks/llm.py +14 -21
- pipecat/services/fish/tts.py +109 -9
- pipecat/services/gemini_multimodal_live/__init__.py +1 -0
- pipecat/services/gemini_multimodal_live/events.py +83 -2
- pipecat/services/gemini_multimodal_live/file_api.py +189 -0
- pipecat/services/gemini_multimodal_live/gemini.py +218 -21
- pipecat/services/gladia/config.py +17 -10
- pipecat/services/gladia/stt.py +82 -36
- pipecat/services/google/frames.py +40 -0
- pipecat/services/google/google.py +2 -0
- pipecat/services/google/image.py +39 -2
- pipecat/services/google/llm.py +176 -58
- pipecat/services/google/llm_openai.py +26 -4
- pipecat/services/google/llm_vertex.py +37 -15
- pipecat/services/google/rtvi.py +41 -0
- pipecat/services/google/stt.py +65 -17
- pipecat/services/google/test-google-chirp.py +45 -0
- pipecat/services/google/tts.py +390 -19
- pipecat/services/grok/llm.py +8 -6
- pipecat/services/groq/llm.py +8 -6
- pipecat/services/groq/stt.py +13 -9
- pipecat/services/groq/tts.py +40 -0
- pipecat/services/hamsa/__init__.py +9 -0
- pipecat/services/hamsa/stt.py +241 -0
- pipecat/services/heygen/__init__.py +5 -0
- pipecat/services/heygen/api.py +281 -0
- pipecat/services/heygen/client.py +620 -0
- pipecat/services/heygen/video.py +338 -0
- pipecat/services/image_service.py +5 -3
- pipecat/services/inworld/__init__.py +1 -0
- pipecat/services/inworld/tts.py +592 -0
- pipecat/services/llm_service.py +127 -45
- pipecat/services/lmnt/tts.py +80 -7
- pipecat/services/mcp_service.py +85 -44
- pipecat/services/mem0/memory.py +42 -13
- pipecat/services/minimax/tts.py +74 -15
- pipecat/services/mistral/__init__.py +0 -0
- pipecat/services/mistral/llm.py +185 -0
- pipecat/services/moondream/vision.py +55 -10
- pipecat/services/neuphonic/tts.py +275 -48
- pipecat/services/nim/llm.py +8 -6
- pipecat/services/ollama/llm.py +27 -7
- pipecat/services/openai/base_llm.py +54 -16
- pipecat/services/openai/image.py +30 -0
- pipecat/services/openai/llm.py +7 -5
- pipecat/services/openai/stt.py +13 -9
- pipecat/services/openai/tts.py +42 -10
- pipecat/services/openai_realtime_beta/azure.py +11 -9
- pipecat/services/openai_realtime_beta/context.py +7 -5
- pipecat/services/openai_realtime_beta/events.py +10 -7
- pipecat/services/openai_realtime_beta/openai.py +37 -18
- pipecat/services/openpipe/llm.py +30 -24
- pipecat/services/openrouter/llm.py +9 -7
- pipecat/services/perplexity/llm.py +15 -19
- pipecat/services/piper/tts.py +26 -12
- pipecat/services/playht/tts.py +227 -65
- pipecat/services/qwen/llm.py +8 -6
- pipecat/services/rime/tts.py +128 -17
- pipecat/services/riva/stt.py +160 -22
- pipecat/services/riva/tts.py +67 -2
- pipecat/services/sambanova/llm.py +19 -17
- pipecat/services/sambanova/stt.py +14 -8
- pipecat/services/sarvam/tts.py +60 -13
- pipecat/services/simli/video.py +82 -21
- pipecat/services/soniox/__init__.py +0 -0
- pipecat/services/soniox/stt.py +398 -0
- pipecat/services/speechmatics/stt.py +29 -17
- pipecat/services/stt_service.py +47 -11
- pipecat/services/tavus/video.py +94 -25
- pipecat/services/together/llm.py +8 -6
- pipecat/services/tts_service.py +77 -53
- pipecat/services/ultravox/stt.py +46 -43
- pipecat/services/vision_service.py +5 -3
- pipecat/services/websocket_service.py +12 -11
- pipecat/services/whisper/base_stt.py +58 -12
- pipecat/services/whisper/stt.py +69 -58
- pipecat/services/xtts/tts.py +59 -2
- pipecat/sync/base_notifier.py +19 -0
- pipecat/sync/event_notifier.py +24 -0
- pipecat/tests/utils.py +73 -5
- pipecat/transcriptions/language.py +24 -0
- pipecat/transports/base_input.py +112 -8
- pipecat/transports/base_output.py +235 -13
- pipecat/transports/base_transport.py +119 -0
- pipecat/transports/local/audio.py +76 -0
- pipecat/transports/local/tk.py +84 -0
- pipecat/transports/network/fastapi_websocket.py +174 -15
- pipecat/transports/network/small_webrtc.py +383 -39
- pipecat/transports/network/webrtc_connection.py +214 -8
- pipecat/transports/network/websocket_client.py +171 -1
- pipecat/transports/network/websocket_server.py +147 -9
- pipecat/transports/services/daily.py +792 -70
- pipecat/transports/services/helpers/daily_rest.py +122 -129
- pipecat/transports/services/livekit.py +339 -4
- pipecat/transports/services/tavus.py +273 -38
- pipecat/utils/asyncio/task_manager.py +92 -186
- pipecat/utils/base_object.py +83 -1
- pipecat/utils/network.py +2 -0
- pipecat/utils/string.py +114 -58
- pipecat/utils/text/base_text_aggregator.py +44 -13
- pipecat/utils/text/base_text_filter.py +46 -0
- pipecat/utils/text/markdown_text_filter.py +70 -14
- pipecat/utils/text/pattern_pair_aggregator.py +18 -14
- pipecat/utils/text/simple_text_aggregator.py +43 -2
- pipecat/utils/text/skip_tags_aggregator.py +21 -13
- pipecat/utils/time.py +36 -0
- pipecat/utils/tracing/class_decorators.py +32 -7
- pipecat/utils/tracing/conversation_context_provider.py +12 -2
- pipecat/utils/tracing/service_attributes.py +80 -64
- pipecat/utils/tracing/service_decorators.py +48 -21
- pipecat/utils/tracing/setup.py +13 -7
- pipecat/utils/tracing/turn_context_provider.py +12 -2
- pipecat/utils/tracing/turn_trace_observer.py +27 -0
- pipecat/utils/utils.py +14 -14
- dv_pipecat_ai-0.0.74.dev770.dist-info/RECORD +0 -319
- pipecat/examples/daily_runner.py +0 -64
- pipecat/examples/run.py +0 -265
- pipecat/utils/asyncio/watchdog_async_iterator.py +0 -72
- pipecat/utils/asyncio/watchdog_event.py +0 -42
- pipecat/utils/asyncio/watchdog_priority_queue.py +0 -48
- pipecat/utils/asyncio/watchdog_queue.py +0 -48
- {dv_pipecat_ai-0.0.74.dev770.dist-info → dv_pipecat_ai-0.0.82.dev776.dist-info}/WHEEL +0 -0
- {dv_pipecat_ai-0.0.74.dev770.dist-info → dv_pipecat_ai-0.0.82.dev776.dist-info}/licenses/LICENSE +0 -0
- {dv_pipecat_ai-0.0.74.dev770.dist-info → dv_pipecat_ai-0.0.82.dev776.dist-info}/top_level.txt +0 -0
- /pipecat/{examples → extensions}/__init__.py +0 -0
pipecat/services/cartesia/tts.py
CHANGED
@@ -30,15 +30,19 @@ from pipecat.frames.frames import (
 from pipecat.processors.frame_processor import FrameDirection
 from pipecat.services.tts_service import AudioContextWordTTSService, TTSService
 from pipecat.transcriptions.language import Language
-from pipecat.utils.asyncio.watchdog_async_iterator import WatchdogAsyncIterator
 from pipecat.utils.text.base_text_aggregator import BaseTextAggregator
 from pipecat.utils.text.skip_tags_aggregator import SkipTagsAggregator
 from pipecat.utils.tracing.service_decorators import traced_tts
 
+# Suppress regex warnings from pydub (used by cartesia)
+warnings.filterwarnings("ignore", message="invalid escape sequence", category=SyntaxWarning)
+
+
 # See .env.example for Cartesia configuration needed
 try:
-    import websockets
     from cartesia import AsyncCartesia
+    from websockets.asyncio.client import connect as websocket_connect
+    from websockets.protocol import State
 except ModuleNotFoundError as e:
     logger.error(f"Exception: {e}")
     logger.error("In order to use Cartesia, you need to `pip install pipecat-ai[cartesia]`.")
@@ -91,19 +95,6 @@ class CartesiaTTSService(AudioContextWordTTSService):
     Provides text-to-speech using Cartesia's streaming WebSocket API.
     Supports word-level timestamps, audio context management, and various voice
     customization options including speed and emotion controls.
-
-    Args:
-        api_key: Cartesia API key for authentication.
-        voice_id: ID of the voice to use for synthesis.
-        cartesia_version: API version string for Cartesia service.
-        url: WebSocket URL for Cartesia TTS API.
-        model: TTS model to use (e.g., "sonic-2").
-        sample_rate: Audio sample rate. If None, uses default.
-        encoding: Audio encoding format.
-        container: Audio container format.
-        params: Additional input parameters for voice customization.
-        text_aggregator: Custom text aggregator for processing input text.
-        **kwargs: Additional arguments passed to the parent service.
     """
 
     class InputParams(BaseModel):
@@ -112,7 +103,10 @@ class CartesiaTTSService(AudioContextWordTTSService):
         Parameters:
             language: Language to use for synthesis.
             speed: Voice speed control (string or float).
-            emotion: List of emotion controls
+            emotion: List of emotion controls.
+
+            .. deprecated:: 0.0.68
+                The `emotion` parameter is deprecated and will be removed in a future version.
         """
 
         language: Optional[Language] = Language.EN
@@ -132,8 +126,25 @@ class CartesiaTTSService(AudioContextWordTTSService):
         container: str = "raw",
         params: Optional[InputParams] = None,
         text_aggregator: Optional[BaseTextAggregator] = None,
+        aggregate_sentences: Optional[bool] = True,
         **kwargs,
     ):
+        """Initialize the Cartesia TTS service.
+
+        Args:
+            api_key: Cartesia API key for authentication.
+            voice_id: ID of the voice to use for synthesis.
+            cartesia_version: API version string for Cartesia service.
+            url: WebSocket URL for Cartesia TTS API.
+            model: TTS model to use (e.g., "sonic-2").
+            sample_rate: Audio sample rate. If None, uses default.
+            encoding: Audio encoding format.
+            container: Audio container format.
+            params: Additional input parameters for voice customization.
+            text_aggregator: Custom text aggregator for processing input text.
+            aggregate_sentences: Whether to aggregate sentences within the TTSService.
+            **kwargs: Additional arguments passed to the parent service.
+        """
         # Aggregating sentences still gives cleaner-sounding results and fewer
         # artifacts than streaming one word at a time. On average, waiting for a
         # full sentence should only "cost" us 15ms or so with GPT-4o or a Llama
@@ -145,7 +156,7 @@ class CartesiaTTSService(AudioContextWordTTSService):
         # can use those to generate text frames ourselves aligned with the
         # playout timing of the audio!
         super().__init__(
-            aggregate_sentences=
+            aggregate_sentences=aggregate_sentences,
             push_text_frames=False,
             pause_frame_processing=True,
             sample_rate=sample_rate,
@@ -205,6 +216,54 @@ class CartesiaTTSService(AudioContextWordTTSService):
         """
         return language_to_cartesia_language(language)
 
+    def _is_cjk_language(self, language: str) -> bool:
+        """Check if the given language is CJK (Chinese, Japanese, Korean).
+
+        Args:
+            language: The language code to check.
+
+        Returns:
+            True if the language is Chinese, Japanese, or Korean.
+        """
+        cjk_languages = {"zh", "ja", "ko"}
+        base_lang = language.split("-")[0].lower()
+        return base_lang in cjk_languages
+
+    def _process_word_timestamps_for_language(
+        self, words: List[str], starts: List[float]
+    ) -> List[tuple[str, float]]:
+        """Process word timestamps based on the current language.
+
+        For CJK languages, Cartesia groups related characters in the same timestamp message.
+        For example, in Japanese a single message might be `['こ', 'ん', 'に', 'ち', 'は', '。']`.
+        We combine these into single words so the downstream aggregator can add natural
+        spacing between meaningful units rather than individual characters.
+
+        For non-CJK languages, words are already properly separated and are used as-is.
+
+        Args:
+            words: List of words/characters from Cartesia.
+            starts: List of start timestamps for each word/character.
+
+        Returns:
+            List of (word, start_time) tuples processed for the language.
+        """
+        current_language = self._settings.get("language", "en")
+
+        # Check if this is a CJK language
+        if self._is_cjk_language(current_language):
+            # For CJK languages, combine all characters in this message into one word
+            # using the first character's start time
+            if words and starts:
+                combined_word = "".join(words)
+                first_start = starts[0]
+                return [(combined_word, first_start)]
+            else:
+                return []
+        else:
+            # For non-CJK languages, use as-is
+            return list(zip(words, starts))
+
     def _build_msg(
         self, text: str = "", continue_transcript: bool = True, add_timestamps: bool = True
     ):
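The hunk above adds two helpers whose logic is fully visible in the diff. A minimal standalone sketch of that CJK grouping behavior (module-level functions instead of methods; the sample strings are illustrative). The remaining hunks for this file continue below.

# Sketch of the CJK timestamp grouping added above, outside the class.
def is_cjk_language(language: str) -> bool:
    # "zh-CN" -> "zh"; only the base language tag matters.
    return language.split("-")[0].lower() in {"zh", "ja", "ko"}


def process_word_timestamps(language: str, words: list, starts: list):
    if is_cjk_language(language):
        # Cartesia sends one character per entry for CJK; merge the whole
        # message into a single "word" stamped with the first start time.
        return [("".join(words), starts[0])] if words and starts else []
    # Non-CJK words are already separated; pair each with its timestamp.
    return list(zip(words, starts))


print(process_word_timestamps("ja", ["こ", "ん", "に", "ち", "は", "。"], [0.0, 0.1, 0.2, 0.3, 0.4, 0.5]))
# [('こんにちは。', 0.0)]
print(process_word_timestamps("en-US", ["hello", "world"], [0.0, 0.4]))
# [('hello', 0.0), ('world', 0.4)]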
@@ -282,10 +341,10 @@ class CartesiaTTSService(AudioContextWordTTSService):
 
     async def _connect_websocket(self):
         try:
-            if self._websocket and self._websocket.
+            if self._websocket and self._websocket.state is State.OPEN:
                 return
             logger.debug("Connecting to Cartesia")
-            self._websocket = await
+            self._websocket = await websocket_connect(
                 f"{self._url}?api_key={self._api_key}&cartesia_version={self._cartesia_version}"
             )
         except Exception as e:
@@ -329,9 +388,7 @@ class CartesiaTTSService(AudioContextWordTTSService):
         self._context_id = None
 
     async def _receive_messages(self):
-        async for message in
-            self._get_websocket(), manager=self.task_manager
-        ):
+        async for message in self._get_websocket():
             msg = json.loads(message)
             if not msg or not self.audio_context_available(msg["context_id"]):
                 continue
@@ -340,9 +397,11 @@ class CartesiaTTSService(AudioContextWordTTSService):
                 await self.add_word_timestamps([("TTSStoppedFrame", 0), ("Reset", 0)])
                 await self.remove_audio_context(msg["context_id"])
             elif msg["type"] == "timestamps":
-
-
+                # Process the timestamps based on language before adding them
+                processed_timestamps = self._process_word_timestamps_for_language(
+                    msg["word_timestamps"]["words"], msg["word_timestamps"]["start"]
                 )
+                await self.add_word_timestamps(processed_timestamps)
             elif msg["type"] == "chunk":
                 await self.stop_ttfb_metrics()
                 self.start_word_timestamps()
@@ -375,7 +434,7 @@ class CartesiaTTSService(AudioContextWordTTSService):
         logger.debug(f"{self}: Generating TTS [{text}]")
 
         try:
-            if not self._websocket or self._websocket.
+            if not self._websocket or self._websocket.state is State.CLOSED:
                 await self._connect()
 
             if not self._context_id:
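These hunks migrate the service off the legacy top-level `websockets` API onto `websockets.asyncio.client.connect` with explicit `State` checks (the removed attribute accesses are truncated in this rendering and left as-is). A minimal sketch of the new pattern, assuming a placeholder endpoint URL; the HTTP-service hunks for this file follow below.

import asyncio

from websockets.asyncio.client import connect as websocket_connect
from websockets.protocol import State


async def main():
    # Hypothetical endpoint; Cartesia's real URL also carries query params.
    ws = await websocket_connect("wss://echo.example.com")
    try:
        if ws.state is State.OPEN:  # replaces the legacy boolean attribute check
            await ws.send("hello")
            print(await ws.recv())
    finally:
        await ws.close()
        assert ws.state is State.CLOSED


asyncio.run(main())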
@@ -406,18 +465,6 @@ class CartesiaHttpTTSService(TTSService):
     Provides text-to-speech using Cartesia's HTTP API for simpler, non-streaming
     synthesis. Suitable for use cases where streaming is not required and simpler
     integration is preferred.
-
-    Args:
-        api_key: Cartesia API key for authentication.
-        voice_id: ID of the voice to use for synthesis.
-        model: TTS model to use (e.g., "sonic-2").
-        base_url: Base URL for Cartesia HTTP API.
-        cartesia_version: API version string for Cartesia service.
-        sample_rate: Audio sample rate. If None, uses default.
-        encoding: Audio encoding format.
-        container: Audio container format.
-        params: Additional input parameters for voice customization.
-        **kwargs: Additional arguments passed to the parent TTSService.
     """
 
     class InputParams(BaseModel):
@@ -426,7 +473,10 @@ class CartesiaHttpTTSService(TTSService):
         Parameters:
             language: Language to use for synthesis.
             speed: Voice speed control (string or float).
-            emotion: List of emotion controls
+            emotion: List of emotion controls.
+
+            .. deprecated:: 0.0.68
+                The `emotion` parameter is deprecated and will be removed in a future version.
         """
 
         language: Optional[Language] = Language.EN
@@ -447,6 +497,20 @@ class CartesiaHttpTTSService(TTSService):
         params: Optional[InputParams] = None,
         **kwargs,
     ):
+        """Initialize the Cartesia HTTP TTS service.
+
+        Args:
+            api_key: Cartesia API key for authentication.
+            voice_id: ID of the voice to use for synthesis.
+            model: TTS model to use (e.g., "sonic-2").
+            base_url: Base URL for Cartesia HTTP API.
+            cartesia_version: API version string for Cartesia service.
+            sample_rate: Audio sample rate. If None, uses default.
+            encoding: Audio encoding format.
+            container: Audio container format.
+            params: Additional input parameters for voice customization.
+            **kwargs: Additional arguments passed to the parent TTSService.
+        """
         super().__init__(sample_rate=sample_rate, **kwargs)
 
         params = params or CartesiaHttpTTSService.InputParams()
pipecat/services/cerebras/llm.py
CHANGED
@@ -9,8 +9,7 @@
 from typing import List
 
 from loguru import logger
-from openai import
-from openai.types.chat import ChatCompletionChunk, ChatCompletionMessageParam
+from openai.types.chat import ChatCompletionMessageParam
 
 from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
 from pipecat.services.openai.llm import OpenAILLMService
@@ -21,12 +20,6 @@ class CerebrasLLMService(OpenAILLMService):
 
     This service extends OpenAILLMService to connect to Cerebras's API endpoint while
     maintaining full compatibility with OpenAI's interface and functionality.
-
-    Args:
-        api_key: The API key for accessing Cerebras's API.
-        base_url: The base URL for Cerebras API. Defaults to "https://api.cerebras.ai/v1".
-        model: The model identifier to use. Defaults to "llama-3.3-70b".
-        **kwargs: Additional keyword arguments passed to OpenAILLMService.
     """
 
     def __init__(
@@ -37,6 +30,14 @@ class CerebrasLLMService(OpenAILLMService):
         model: str = "llama-3.3-70b",
         **kwargs,
     ):
+        """Initialize the Cerebras LLM service.
+
+        Args:
+            api_key: The API key for accessing Cerebras's API.
+            base_url: The base URL for Cerebras API. Defaults to "https://api.cerebras.ai/v1".
+            model: The model identifier to use. Defaults to "llama-3.3-70b".
+            **kwargs: Additional keyword arguments passed to OpenAILLMService.
+        """
         super().__init__(api_key=api_key, base_url=base_url, model=model, **kwargs)
 
     def create_client(self, api_key=None, base_url=None, **kwargs):
@@ -53,20 +54,13 @@ class CerebrasLLMService(OpenAILLMService):
         logger.debug(f"Creating Cerebras client with api {base_url}")
         return super().create_client(api_key, base_url, **kwargs)
 
-
+    def build_chat_completion_params(
         self, context: OpenAILLMContext, messages: List[ChatCompletionMessageParam]
-    ) ->
-        """
+    ) -> dict:
+        """Build parameters for Cerebras chat completion request.
 
-
-
-                and other settings for the chat completion.
-            messages: The list of messages comprising
-                the conversation history and current request.
-
-        Returns:
-            A streaming response of chat completion
-                chunks that can be processed asynchronously.
+        Cerebras supports a subset of OpenAI parameters, focusing on core
+        completion settings without advanced features like frequency/presence penalties.
         """
         params = {
             "model": self.model_name,
@@ -81,6 +75,4 @@ class CerebrasLLMService(OpenAILLMService):
         }
 
         params.update(self._settings["extra"])
-
-        chunks = await self._client.chat.completions.create(**params)
-        return chunks
+        return params
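The key refactor here: the method now returns the request parameters as a dict instead of performing the streaming call itself (the removed `chat.completions.create` call presumably moves into the shared base implementation). A sketch of what an OpenAI-compatible backend subclass looks like under this pattern; the class name is hypothetical and the base-class call site is assumed from the diff. The DeepSeek diff below follows the same pattern.

from typing import List

from openai.types.chat import ChatCompletionMessageParam

from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
from pipecat.services.openai.llm import OpenAILLMService


class MyCompatLLMService(OpenAILLMService):  # hypothetical subclass
    def build_chat_completion_params(
        self, context: OpenAILLMContext, messages: List[ChatCompletionMessageParam]
    ) -> dict:
        # Advertise only the parameters this backend supports; the base
        # service executes the actual streaming completion request.
        params = {
            "model": self.model_name,
            "messages": messages,
            # Backend-specific keys (temperature, top_p, ...) go here.
        }
        params.update(self._settings["extra"])
        return params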
pipecat/services/deepgram/stt.py
CHANGED
@@ -15,6 +15,7 @@ from loguru import logger
 from pipecat.frames.frames import (
     CancelFrame,
     EndFrame,
+    ErrorFrame,
     Frame,
     InterimTranscriptionFrame,
     StartFrame,
@@ -50,15 +51,6 @@ class DeepgramSTTService(STTService):
     Provides real-time speech recognition using Deepgram's WebSocket API.
     Supports configurable models, languages, VAD events, and various audio
     processing options.
-
-    Args:
-        api_key: Deepgram API key for authentication.
-        url: Deprecated. Use base_url instead.
-        base_url: Custom Deepgram API base URL.
-        sample_rate: Audio sample rate. If None, uses default or live_options value.
-        live_options: Deepgram LiveOptions for detailed configuration.
-        addons: Additional Deepgram features to enable.
-        **kwargs: Additional arguments passed to the parent STTService.
     """
 
     def __init__(
@@ -72,6 +64,21 @@ class DeepgramSTTService(STTService):
         addons: Optional[Dict] = None,
         **kwargs,
     ):
+        """Initialize the Deepgram STT service.
+
+        Args:
+            api_key: Deepgram API key for authentication.
+            url: Custom Deepgram API base URL.
+
+                .. deprecated:: 0.0.64
+                    Parameter `url` is deprecated, use `base_url` instead.
+
+            base_url: Custom Deepgram API base URL.
+            sample_rate: Audio sample rate. If None, uses default or live_options value.
+            live_options: Deepgram LiveOptions for detailed configuration.
+            addons: Additional Deepgram features to enable.
+            **kwargs: Additional arguments passed to the parent STTService.
+        """
         sample_rate = sample_rate or (live_options.sample_rate if live_options else None)
         super().__init__(sample_rate=sample_rate, **kwargs)
 
@@ -279,6 +286,7 @@ class DeepgramSTTService(STTService):
     async def _on_error(self, *args, **kwargs):
         error: ErrorResponse = kwargs["error"]
         self.logger.warning(f"{self} connection error, will retry: {error}")
+        await self.push_error(ErrorFrame(f"{error}"))
         await self.stop_all_metrics()
         # NOTE(aleix): we don't disconnect (i.e. call finish on the connection)
         # because this triggers more errors internally in the Deepgram SDK. So,
@@ -316,7 +324,7 @@ class DeepgramSTTService(STTService):
             await self.push_frame(
                 TranscriptionFrame(
                     transcript,
-
+                    self._user_id,
                     time_now_iso8601(),
                     language,
                     result=result,
@@ -330,7 +338,7 @@ class DeepgramSTTService(STTService):
             await self.push_frame(
                 InterimTranscriptionFrame(
                     transcript,
-
+                    self._user_id,
                     time_now_iso8601(),
                     language,
                     result=result,
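Beyond the docstring moves, two behavioral changes stand out: connection errors now surface as an `ErrorFrame` via `push_error`, and transcription frames carry `self._user_id` (the replaced argument is elided in this rendering). A minimal hypothetical processor that reacts to error frames passing through it, following the usual pipecat custom-processor idiom:

from pipecat.frames.frames import ErrorFrame, Frame
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor


class ErrorWatcher(FrameProcessor):  # hypothetical
    """Logs any ErrorFrame that passes through this processor."""

    async def process_frame(self, frame: Frame, direction: FrameDirection):
        await super().process_frame(frame, direction)
        if isinstance(frame, ErrorFrame):
            print(f"error observed: {frame.error}")
        await self.push_frame(frame, direction)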
pipecat/services/deepgram/tts.py
CHANGED
@@ -4,6 +4,12 @@
 # SPDX-License-Identifier: BSD 2-Clause License
 #
 
+"""Deepgram text-to-speech service implementation.
+
+This module provides integration with Deepgram's text-to-speech API
+for generating speech from text using various voice models.
+"""
+
 from typing import AsyncGenerator, Optional
 
 from loguru import logger
@@ -27,6 +33,13 @@ except ModuleNotFoundError as e:
 
 
 class DeepgramTTSService(TTSService):
+    """Deepgram text-to-speech service.
+
+    Provides text-to-speech synthesis using Deepgram's streaming API.
+    Supports various voice models and audio encoding formats with
+    configurable sample rates and quality settings.
+    """
+
     def __init__(
         self,
         *,
@@ -37,6 +50,16 @@ class DeepgramTTSService(TTSService):
         encoding: str = "linear16",
         **kwargs,
     ):
+        """Initialize the Deepgram TTS service.
+
+        Args:
+            api_key: Deepgram API key for authentication.
+            voice: Voice model to use for synthesis. Defaults to "aura-2-helena-en".
+            base_url: Custom base URL for Deepgram API. Uses default if empty.
+            sample_rate: Audio sample rate in Hz. If None, uses service default.
+            encoding: Audio encoding format. Defaults to "linear16".
+            **kwargs: Additional arguments passed to parent TTSService class.
+        """
         super().__init__(sample_rate=sample_rate, **kwargs)
 
         self._settings = {
@@ -48,10 +71,23 @@ class DeepgramTTSService(TTSService):
         self._deepgram_client = DeepgramClient(api_key, config=client_options)
 
     def can_generate_metrics(self) -> bool:
+        """Check if the service can generate metrics.
+
+        Returns:
+            True, as Deepgram TTS service supports metrics generation.
+        """
        return True
 
     @traced_tts
     async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
+        """Generate speech from text using Deepgram's TTS API.
+
+        Args:
+            text: The text to synthesize into speech.
+
+        Yields:
+            Frame: Audio frames containing the synthesized speech, plus start/stop frames.
+        """
         logger.debug(f"{self}: Generating TTS [{text}]")
 
         options = SpeakOptions(
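This file's changes are documentation-only: a module docstring, a class docstring, and method docstrings for `can_generate_metrics` and `run_tts`. The `run_tts` docstring describes an async generator; a sketch of consuming it directly (placeholder API key; in practice the service runs inside a pipeline, which handles start-up details such as sample-rate negotiation):

import asyncio

from pipecat.frames.frames import TTSAudioRawFrame
from pipecat.services.deepgram.tts import DeepgramTTSService


async def main():
    tts = DeepgramTTSService(api_key="YOUR_DEEPGRAM_API_KEY")  # placeholder
    audio_bytes = 0
    async for frame in tts.run_tts("Hello from Deepgram!"):
        # Start/stop frames bracket the raw audio frames.
        if isinstance(frame, TTSAudioRawFrame):
            audio_bytes += len(frame.audio)
    print(f"received {audio_bytes} bytes of audio")


asyncio.run(main())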
pipecat/services/deepseek/llm.py
CHANGED
@@ -9,8 +9,7 @@
 from typing import List
 
 from loguru import logger
-from openai import
-from openai.types.chat import ChatCompletionChunk, ChatCompletionMessageParam
+from openai.types.chat import ChatCompletionMessageParam
 
 from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
 from pipecat.services.openai.llm import OpenAILLMService
@@ -21,12 +20,6 @@ class DeepSeekLLMService(OpenAILLMService):
 
     This service extends OpenAILLMService to connect to DeepSeek's API endpoint while
     maintaining full compatibility with OpenAI's interface and functionality.
-
-    Args:
-        api_key: The API key for accessing DeepSeek's API.
-        base_url: The base URL for DeepSeek API. Defaults to "https://api.deepseek.com/v1".
-        model: The model identifier to use. Defaults to "deepseek-chat".
-        **kwargs: Additional keyword arguments passed to OpenAILLMService.
     """
 
     def __init__(
@@ -37,6 +30,14 @@ class DeepSeekLLMService(OpenAILLMService):
         model: str = "deepseek-chat",
         **kwargs,
     ):
+        """Initialize the DeepSeek LLM service.
+
+        Args:
+            api_key: The API key for accessing DeepSeek's API.
+            base_url: The base URL for DeepSeek API. Defaults to "https://api.deepseek.com/v1".
+            model: The model identifier to use. Defaults to "deepseek-chat".
+            **kwargs: Additional keyword arguments passed to OpenAILLMService.
+        """
         super().__init__(api_key=api_key, base_url=base_url, model=model, **kwargs)
 
     def create_client(self, api_key=None, base_url=None, **kwargs):
@@ -53,20 +54,12 @@ class DeepSeekLLMService(OpenAILLMService):
         logger.debug(f"Creating DeepSeek client with api {base_url}")
         return super().create_client(api_key, base_url, **kwargs)
 
-
+    def _build_chat_completion_params(
         self, context: OpenAILLMContext, messages: List[ChatCompletionMessageParam]
-    ) ->
-        """
+    ) -> dict:
+        """Build parameters for DeepSeek chat completion request.
 
-
-            context: The context object containing tools configuration
-                and other settings for the chat completion.
-            messages: The list of messages comprising the conversation
-                history and current request.
-
-        Returns:
-            A streaming response of chat completion chunks that can be
-            processed asynchronously.
+        DeepSeek doesn't support some OpenAI parameters like seed and max_completion_tokens.
         """
         params = {
             "model": self.model_name,
@@ -83,6 +76,4 @@ class DeepSeekLLMService(OpenAILLMService):
         }
 
         params.update(self._settings["extra"])
-
-        chunks = await self._client.chat.completions.create(**params)
-        return chunks
+        return params