dv-pipecat-ai 0.0.74.dev770__py3-none-any.whl → 0.0.82.dev776__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of dv-pipecat-ai might be problematic.
- {dv_pipecat_ai-0.0.74.dev770.dist-info → dv_pipecat_ai-0.0.82.dev776.dist-info}/METADATA +137 -93
- dv_pipecat_ai-0.0.82.dev776.dist-info/RECORD +340 -0
- pipecat/__init__.py +17 -0
- pipecat/adapters/base_llm_adapter.py +36 -1
- pipecat/adapters/schemas/direct_function.py +296 -0
- pipecat/adapters/schemas/function_schema.py +15 -6
- pipecat/adapters/schemas/tools_schema.py +55 -7
- pipecat/adapters/services/anthropic_adapter.py +22 -3
- pipecat/adapters/services/aws_nova_sonic_adapter.py +23 -3
- pipecat/adapters/services/bedrock_adapter.py +22 -3
- pipecat/adapters/services/gemini_adapter.py +16 -3
- pipecat/adapters/services/open_ai_adapter.py +17 -2
- pipecat/adapters/services/open_ai_realtime_adapter.py +23 -3
- pipecat/audio/filters/base_audio_filter.py +30 -6
- pipecat/audio/filters/koala_filter.py +37 -2
- pipecat/audio/filters/krisp_filter.py +59 -6
- pipecat/audio/filters/noisereduce_filter.py +37 -0
- pipecat/audio/interruptions/base_interruption_strategy.py +25 -5
- pipecat/audio/interruptions/min_words_interruption_strategy.py +21 -4
- pipecat/audio/mixers/base_audio_mixer.py +30 -7
- pipecat/audio/mixers/soundfile_mixer.py +53 -6
- pipecat/audio/resamplers/base_audio_resampler.py +17 -9
- pipecat/audio/resamplers/resampy_resampler.py +26 -1
- pipecat/audio/resamplers/soxr_resampler.py +32 -1
- pipecat/audio/resamplers/soxr_stream_resampler.py +101 -0
- pipecat/audio/utils.py +194 -1
- pipecat/audio/vad/silero.py +60 -3
- pipecat/audio/vad/vad_analyzer.py +114 -30
- pipecat/clocks/base_clock.py +19 -0
- pipecat/clocks/system_clock.py +25 -0
- pipecat/extensions/voicemail/__init__.py +0 -0
- pipecat/extensions/voicemail/voicemail_detector.py +707 -0
- pipecat/frames/frames.py +590 -156
- pipecat/metrics/metrics.py +64 -1
- pipecat/observers/base_observer.py +58 -19
- pipecat/observers/loggers/debug_log_observer.py +56 -64
- pipecat/observers/loggers/llm_log_observer.py +8 -1
- pipecat/observers/loggers/transcription_log_observer.py +19 -7
- pipecat/observers/loggers/user_bot_latency_log_observer.py +32 -5
- pipecat/observers/turn_tracking_observer.py +26 -1
- pipecat/pipeline/base_pipeline.py +5 -7
- pipecat/pipeline/base_task.py +52 -9
- pipecat/pipeline/parallel_pipeline.py +121 -177
- pipecat/pipeline/pipeline.py +129 -20
- pipecat/pipeline/runner.py +50 -1
- pipecat/pipeline/sync_parallel_pipeline.py +132 -32
- pipecat/pipeline/task.py +263 -280
- pipecat/pipeline/task_observer.py +85 -34
- pipecat/pipeline/to_be_updated/merge_pipeline.py +32 -2
- pipecat/processors/aggregators/dtmf_aggregator.py +29 -22
- pipecat/processors/aggregators/gated.py +25 -24
- pipecat/processors/aggregators/gated_openai_llm_context.py +22 -2
- pipecat/processors/aggregators/llm_response.py +398 -89
- pipecat/processors/aggregators/openai_llm_context.py +161 -13
- pipecat/processors/aggregators/sentence.py +25 -14
- pipecat/processors/aggregators/user_response.py +28 -3
- pipecat/processors/aggregators/vision_image_frame.py +24 -14
- pipecat/processors/async_generator.py +28 -0
- pipecat/processors/audio/audio_buffer_processor.py +78 -37
- pipecat/processors/consumer_processor.py +25 -6
- pipecat/processors/filters/frame_filter.py +23 -0
- pipecat/processors/filters/function_filter.py +30 -0
- pipecat/processors/filters/identity_filter.py +17 -2
- pipecat/processors/filters/null_filter.py +24 -1
- pipecat/processors/filters/stt_mute_filter.py +56 -21
- pipecat/processors/filters/wake_check_filter.py +46 -3
- pipecat/processors/filters/wake_notifier_filter.py +21 -3
- pipecat/processors/frame_processor.py +488 -131
- pipecat/processors/frameworks/langchain.py +38 -3
- pipecat/processors/frameworks/rtvi.py +719 -34
- pipecat/processors/gstreamer/pipeline_source.py +41 -0
- pipecat/processors/idle_frame_processor.py +26 -3
- pipecat/processors/logger.py +23 -0
- pipecat/processors/metrics/frame_processor_metrics.py +77 -4
- pipecat/processors/metrics/sentry.py +42 -4
- pipecat/processors/producer_processor.py +34 -14
- pipecat/processors/text_transformer.py +22 -10
- pipecat/processors/transcript_processor.py +48 -29
- pipecat/processors/user_idle_processor.py +31 -21
- pipecat/runner/__init__.py +1 -0
- pipecat/runner/daily.py +132 -0
- pipecat/runner/livekit.py +148 -0
- pipecat/runner/run.py +543 -0
- pipecat/runner/types.py +67 -0
- pipecat/runner/utils.py +515 -0
- pipecat/serializers/base_serializer.py +42 -0
- pipecat/serializers/exotel.py +17 -6
- pipecat/serializers/genesys.py +95 -0
- pipecat/serializers/livekit.py +33 -0
- pipecat/serializers/plivo.py +16 -15
- pipecat/serializers/protobuf.py +37 -1
- pipecat/serializers/telnyx.py +18 -17
- pipecat/serializers/twilio.py +32 -16
- pipecat/services/ai_service.py +5 -3
- pipecat/services/anthropic/llm.py +113 -43
- pipecat/services/assemblyai/models.py +63 -5
- pipecat/services/assemblyai/stt.py +64 -11
- pipecat/services/asyncai/__init__.py +0 -0
- pipecat/services/asyncai/tts.py +501 -0
- pipecat/services/aws/llm.py +185 -111
- pipecat/services/aws/stt.py +217 -23
- pipecat/services/aws/tts.py +118 -52
- pipecat/services/aws/utils.py +101 -5
- pipecat/services/aws_nova_sonic/aws.py +82 -64
- pipecat/services/aws_nova_sonic/context.py +15 -6
- pipecat/services/azure/common.py +10 -2
- pipecat/services/azure/image.py +32 -0
- pipecat/services/azure/llm.py +9 -7
- pipecat/services/azure/stt.py +65 -2
- pipecat/services/azure/tts.py +154 -23
- pipecat/services/cartesia/stt.py +125 -8
- pipecat/services/cartesia/tts.py +102 -38
- pipecat/services/cerebras/llm.py +15 -23
- pipecat/services/deepgram/stt.py +19 -11
- pipecat/services/deepgram/tts.py +36 -0
- pipecat/services/deepseek/llm.py +14 -23
- pipecat/services/elevenlabs/tts.py +330 -64
- pipecat/services/fal/image.py +43 -0
- pipecat/services/fal/stt.py +48 -10
- pipecat/services/fireworks/llm.py +14 -21
- pipecat/services/fish/tts.py +109 -9
- pipecat/services/gemini_multimodal_live/__init__.py +1 -0
- pipecat/services/gemini_multimodal_live/events.py +83 -2
- pipecat/services/gemini_multimodal_live/file_api.py +189 -0
- pipecat/services/gemini_multimodal_live/gemini.py +218 -21
- pipecat/services/gladia/config.py +17 -10
- pipecat/services/gladia/stt.py +82 -36
- pipecat/services/google/frames.py +40 -0
- pipecat/services/google/google.py +2 -0
- pipecat/services/google/image.py +39 -2
- pipecat/services/google/llm.py +176 -58
- pipecat/services/google/llm_openai.py +26 -4
- pipecat/services/google/llm_vertex.py +37 -15
- pipecat/services/google/rtvi.py +41 -0
- pipecat/services/google/stt.py +65 -17
- pipecat/services/google/test-google-chirp.py +45 -0
- pipecat/services/google/tts.py +390 -19
- pipecat/services/grok/llm.py +8 -6
- pipecat/services/groq/llm.py +8 -6
- pipecat/services/groq/stt.py +13 -9
- pipecat/services/groq/tts.py +40 -0
- pipecat/services/hamsa/__init__.py +9 -0
- pipecat/services/hamsa/stt.py +241 -0
- pipecat/services/heygen/__init__.py +5 -0
- pipecat/services/heygen/api.py +281 -0
- pipecat/services/heygen/client.py +620 -0
- pipecat/services/heygen/video.py +338 -0
- pipecat/services/image_service.py +5 -3
- pipecat/services/inworld/__init__.py +1 -0
- pipecat/services/inworld/tts.py +592 -0
- pipecat/services/llm_service.py +127 -45
- pipecat/services/lmnt/tts.py +80 -7
- pipecat/services/mcp_service.py +85 -44
- pipecat/services/mem0/memory.py +42 -13
- pipecat/services/minimax/tts.py +74 -15
- pipecat/services/mistral/__init__.py +0 -0
- pipecat/services/mistral/llm.py +185 -0
- pipecat/services/moondream/vision.py +55 -10
- pipecat/services/neuphonic/tts.py +275 -48
- pipecat/services/nim/llm.py +8 -6
- pipecat/services/ollama/llm.py +27 -7
- pipecat/services/openai/base_llm.py +54 -16
- pipecat/services/openai/image.py +30 -0
- pipecat/services/openai/llm.py +7 -5
- pipecat/services/openai/stt.py +13 -9
- pipecat/services/openai/tts.py +42 -10
- pipecat/services/openai_realtime_beta/azure.py +11 -9
- pipecat/services/openai_realtime_beta/context.py +7 -5
- pipecat/services/openai_realtime_beta/events.py +10 -7
- pipecat/services/openai_realtime_beta/openai.py +37 -18
- pipecat/services/openpipe/llm.py +30 -24
- pipecat/services/openrouter/llm.py +9 -7
- pipecat/services/perplexity/llm.py +15 -19
- pipecat/services/piper/tts.py +26 -12
- pipecat/services/playht/tts.py +227 -65
- pipecat/services/qwen/llm.py +8 -6
- pipecat/services/rime/tts.py +128 -17
- pipecat/services/riva/stt.py +160 -22
- pipecat/services/riva/tts.py +67 -2
- pipecat/services/sambanova/llm.py +19 -17
- pipecat/services/sambanova/stt.py +14 -8
- pipecat/services/sarvam/tts.py +60 -13
- pipecat/services/simli/video.py +82 -21
- pipecat/services/soniox/__init__.py +0 -0
- pipecat/services/soniox/stt.py +398 -0
- pipecat/services/speechmatics/stt.py +29 -17
- pipecat/services/stt_service.py +47 -11
- pipecat/services/tavus/video.py +94 -25
- pipecat/services/together/llm.py +8 -6
- pipecat/services/tts_service.py +77 -53
- pipecat/services/ultravox/stt.py +46 -43
- pipecat/services/vision_service.py +5 -3
- pipecat/services/websocket_service.py +12 -11
- pipecat/services/whisper/base_stt.py +58 -12
- pipecat/services/whisper/stt.py +69 -58
- pipecat/services/xtts/tts.py +59 -2
- pipecat/sync/base_notifier.py +19 -0
- pipecat/sync/event_notifier.py +24 -0
- pipecat/tests/utils.py +73 -5
- pipecat/transcriptions/language.py +24 -0
- pipecat/transports/base_input.py +112 -8
- pipecat/transports/base_output.py +235 -13
- pipecat/transports/base_transport.py +119 -0
- pipecat/transports/local/audio.py +76 -0
- pipecat/transports/local/tk.py +84 -0
- pipecat/transports/network/fastapi_websocket.py +174 -15
- pipecat/transports/network/small_webrtc.py +383 -39
- pipecat/transports/network/webrtc_connection.py +214 -8
- pipecat/transports/network/websocket_client.py +171 -1
- pipecat/transports/network/websocket_server.py +147 -9
- pipecat/transports/services/daily.py +792 -70
- pipecat/transports/services/helpers/daily_rest.py +122 -129
- pipecat/transports/services/livekit.py +339 -4
- pipecat/transports/services/tavus.py +273 -38
- pipecat/utils/asyncio/task_manager.py +92 -186
- pipecat/utils/base_object.py +83 -1
- pipecat/utils/network.py +2 -0
- pipecat/utils/string.py +114 -58
- pipecat/utils/text/base_text_aggregator.py +44 -13
- pipecat/utils/text/base_text_filter.py +46 -0
- pipecat/utils/text/markdown_text_filter.py +70 -14
- pipecat/utils/text/pattern_pair_aggregator.py +18 -14
- pipecat/utils/text/simple_text_aggregator.py +43 -2
- pipecat/utils/text/skip_tags_aggregator.py +21 -13
- pipecat/utils/time.py +36 -0
- pipecat/utils/tracing/class_decorators.py +32 -7
- pipecat/utils/tracing/conversation_context_provider.py +12 -2
- pipecat/utils/tracing/service_attributes.py +80 -64
- pipecat/utils/tracing/service_decorators.py +48 -21
- pipecat/utils/tracing/setup.py +13 -7
- pipecat/utils/tracing/turn_context_provider.py +12 -2
- pipecat/utils/tracing/turn_trace_observer.py +27 -0
- pipecat/utils/utils.py +14 -14
- dv_pipecat_ai-0.0.74.dev770.dist-info/RECORD +0 -319
- pipecat/examples/daily_runner.py +0 -64
- pipecat/examples/run.py +0 -265
- pipecat/utils/asyncio/watchdog_async_iterator.py +0 -72
- pipecat/utils/asyncio/watchdog_event.py +0 -42
- pipecat/utils/asyncio/watchdog_priority_queue.py +0 -48
- pipecat/utils/asyncio/watchdog_queue.py +0 -48
- {dv_pipecat_ai-0.0.74.dev770.dist-info → dv_pipecat_ai-0.0.82.dev776.dist-info}/WHEEL +0 -0
- {dv_pipecat_ai-0.0.74.dev770.dist-info → dv_pipecat_ai-0.0.82.dev776.dist-info}/licenses/LICENSE +0 -0
- {dv_pipecat_ai-0.0.74.dev770.dist-info → dv_pipecat_ai-0.0.82.dev776.dist-info}/top_level.txt +0 -0
- /pipecat/{examples → extensions}/__init__.py +0 -0
pipecat/services/riva/tts.py
CHANGED
@@ -4,6 +4,12 @@
 # SPDX-License-Identifier: BSD 2-Clause License
 #
 
+"""NVIDIA Riva text-to-speech service implementation.
+
+This module provides integration with NVIDIA Riva's TTS services through
+gRPC API for high-quality speech synthesis.
+"""
+
 import asyncio
 import os
 from typing import AsyncGenerator, Mapping, Optional
@@ -37,7 +43,21 @@ RIVA_TTS_TIMEOUT_SECS = 5
 
 
 class RivaTTSService(TTSService):
+    """NVIDIA Riva text-to-speech service.
+
+    Provides high-quality text-to-speech synthesis using NVIDIA Riva's
+    cloud-based TTS models. Supports multiple voices, languages, and
+    configurable quality settings.
+    """
+
     class InputParams(BaseModel):
+        """Input parameters for Riva TTS configuration.
+
+        Parameters:
+            language: Language code for synthesis. Defaults to US English.
+            quality: Audio quality setting (0-100). Defaults to 20.
+        """
+
         language: Optional[Language] = Language.EN_US
         quality: Optional[int] = 20
 
@@ -55,6 +75,17 @@ class RivaTTSService(TTSService):
         params: Optional[InputParams] = None,
         **kwargs,
     ):
+        """Initialize the NVIDIA Riva TTS service.
+
+        Args:
+            api_key: NVIDIA API key for authentication.
+            server: gRPC server endpoint. Defaults to NVIDIA's cloud endpoint.
+            voice_id: Voice model identifier. Defaults to multilingual Ray voice.
+            sample_rate: Audio sample rate. If None, uses service default.
+            model_function_map: Dictionary containing function_id and model_name for the TTS model.
+            params: Additional configuration parameters for TTS synthesis.
+            **kwargs: Additional arguments passed to parent TTSService.
+        """
         super().__init__(sample_rate=sample_rate, **kwargs)
 
         params = params or RivaTTSService.InputParams()
@@ -82,6 +113,13 @@ class RivaTTSService(TTSService):
         )
 
     async def set_model(self, model: str):
+        """Attempt to set the TTS model.
+
+        Note: Model cannot be changed after initialization for Riva service.
+
+        Args:
+            model: The model name to set (operation not supported).
+        """
         logger.warning(f"Cannot set model after initialization. Set model and function id like so:")
         example = {"function_id": "<UUID>", "model_name": "<model_name>"}
         logger.warning(
@@ -90,6 +128,15 @@ class RivaTTSService(TTSService):
 
     @traced_tts
     async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
+        """Generate speech from text using NVIDIA Riva TTS.
+
+        Args:
+            text: The text to synthesize into speech.
+
+        Yields:
+            Frame: Audio frames containing the synthesized speech data.
+        """
+
         def read_audio_responses(queue: asyncio.Queue):
             def add_response(r):
                 asyncio.run_coroutine_threadsafe(queue.put(r), self.get_event_loop())
@@ -121,7 +168,7 @@ class RivaTTSService(TTSService):
             await asyncio.to_thread(read_audio_responses, queue)
 
             # Wait for the thread to start.
-            resp = await asyncio.wait_for(queue.get(), RIVA_TTS_TIMEOUT_SECS)
+            resp = await asyncio.wait_for(queue.get(), timeout=RIVA_TTS_TIMEOUT_SECS)
             while resp:
                 await self.stop_ttfb_metrics()
                 frame = TTSAudioRawFrame(
@@ -130,7 +177,7 @@ class RivaTTSService(TTSService):
                     num_channels=1,
                 )
                 yield frame
-                resp = await asyncio.wait_for(queue.get(), RIVA_TTS_TIMEOUT_SECS)
+                resp = await asyncio.wait_for(queue.get(), timeout=RIVA_TTS_TIMEOUT_SECS)
         except asyncio.TimeoutError:
             logger.error(f"{self} timeout waiting for audio response")
 
@@ -139,6 +186,13 @@ class RivaTTSService(TTSService):
 
 
 class FastPitchTTSService(RivaTTSService):
+    """Deprecated FastPitch TTS service.
+
+    .. deprecated:: 0.0.66
+        This class is deprecated. Use RivaTTSService instead for new implementations.
+        Provides backward compatibility for existing FastPitch TTS integrations.
+    """
+
     def __init__(
         self,
         *,
@@ -153,6 +207,17 @@ class FastPitchTTSService(RivaTTSService):
         params: Optional[RivaTTSService.InputParams] = None,
         **kwargs,
     ):
+        """Initialize the deprecated FastPitch TTS service.
+
+        Args:
+            api_key: NVIDIA API key for authentication.
+            server: gRPC server endpoint. Defaults to NVIDIA's cloud endpoint.
+            voice_id: Voice model identifier. Defaults to Female-1 voice.
+            sample_rate: Audio sample rate. If None, uses service default.
+            model_function_map: Dictionary containing function_id and model_name for FastPitch model.
+            params: Additional configuration parameters for TTS synthesis.
+            **kwargs: Additional arguments passed to parent RivaTTSService.
+        """
         super().__init__(
            api_key=api_key,
            server=server,
|
|
|
20
20
|
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
|
|
21
21
|
from pipecat.services.llm_service import FunctionCallFromLLM
|
|
22
22
|
from pipecat.services.openai.llm import OpenAILLMService
|
|
23
|
-
from pipecat.utils.asyncio.watchdog_async_iterator import WatchdogAsyncIterator
|
|
24
23
|
from pipecat.utils.tracing.service_decorators import traced_llm
|
|
25
24
|
|
|
26
25
|
|
|
@@ -29,12 +28,6 @@ class SambaNovaLLMService(OpenAILLMService): # type: ignore
|
|
|
29
28
|
|
|
30
29
|
This service extends OpenAILLMService to connect to SambaNova's API endpoint while
|
|
31
30
|
maintaining full compatibility with OpenAI's interface and functionality.
|
|
32
|
-
|
|
33
|
-
Args:
|
|
34
|
-
api_key: The API key for accessing SambaNova API.
|
|
35
|
-
model: The model identifier to use. Defaults to "Llama-4-Maverick-17B-128E-Instruct".
|
|
36
|
-
base_url: The base URL for SambaNova API. Defaults to "https://api.sambanova.ai/v1".
|
|
37
|
-
**kwargs: Additional keyword arguments passed to OpenAILLMService.
|
|
38
31
|
"""
|
|
39
32
|
|
|
40
33
|
def __init__(
|
|
@@ -45,6 +38,14 @@ class SambaNovaLLMService(OpenAILLMService): # type: ignore
|
|
|
45
38
|
base_url: str = "https://api.sambanova.ai/v1",
|
|
46
39
|
**kwargs: Dict[Any, Any],
|
|
47
40
|
) -> None:
|
|
41
|
+
"""Initialize SambaNova LLM service.
|
|
42
|
+
|
|
43
|
+
Args:
|
|
44
|
+
api_key: The API key for accessing SambaNova API.
|
|
45
|
+
model: The model identifier to use. Defaults to "Llama-4-Maverick-17B-128E-Instruct".
|
|
46
|
+
base_url: The base URL for SambaNova API. Defaults to "https://api.sambanova.ai/v1".
|
|
47
|
+
**kwargs: Additional keyword arguments passed to OpenAILLMService.
|
|
48
|
+
"""
|
|
48
49
|
super().__init__(api_key=api_key, base_url=base_url, model=model, **kwargs)
|
|
49
50
|
|
|
50
51
|
def create_client(
|
|
@@ -66,17 +67,20 @@ class SambaNovaLLMService(OpenAILLMService): # type: ignore
|
|
|
66
67
|
logger.debug(f"Creating SambaNova client with API {base_url}")
|
|
67
68
|
return super().create_client(api_key, base_url, **kwargs)
|
|
68
69
|
|
|
69
|
-
|
|
70
|
+
def build_chat_completion_params(
|
|
70
71
|
self, context: OpenAILLMContext, messages: List[ChatCompletionMessageParam]
|
|
71
|
-
) ->
|
|
72
|
-
"""
|
|
72
|
+
) -> dict:
|
|
73
|
+
"""Build parameters for SambaNova chat completion request.
|
|
74
|
+
|
|
75
|
+
SambaNova doesn't support some OpenAI parameters like frequency_penalty,
|
|
76
|
+
presence_penalty, and seed.
|
|
73
77
|
|
|
74
78
|
Args:
|
|
75
|
-
context:
|
|
76
|
-
messages: List of chat completion
|
|
79
|
+
context: The LLM context containing tools and configuration.
|
|
80
|
+
messages: List of chat completion messages to send.
|
|
77
81
|
|
|
78
82
|
Returns:
|
|
79
|
-
|
|
83
|
+
Dictionary of parameters for the chat completion request.
|
|
80
84
|
"""
|
|
81
85
|
params = {
|
|
82
86
|
"model": self.model_name,
|
|
@@ -92,9 +96,7 @@ class SambaNovaLLMService(OpenAILLMService): # type: ignore
|
|
|
92
96
|
}
|
|
93
97
|
|
|
94
98
|
params.update(self._settings["extra"])
|
|
95
|
-
|
|
96
|
-
chunks = await self._client.chat.completions.create(**params)
|
|
97
|
-
return chunks
|
|
99
|
+
return params
|
|
98
100
|
|
|
99
101
|
@traced_llm # type: ignore
|
|
100
102
|
async def _process_context(self, context: OpenAILLMContext) -> AsyncStream[ChatCompletionChunk]:
|
|
@@ -124,7 +126,7 @@ class SambaNovaLLMService(OpenAILLMService): # type: ignore
|
|
|
124
126
|
context
|
|
125
127
|
)
|
|
126
128
|
|
|
127
|
-
async for chunk in
|
|
129
|
+
async for chunk in chunk_stream:
|
|
128
130
|
if chunk.usage:
|
|
129
131
|
tokens = LLMTokenUsage(
|
|
130
132
|
prompt_tokens=chunk.usage.prompt_tokens,
|
|
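The refactor above moves request construction into build_chat_completion_params(), which now returns the parameter dict instead of awaiting the completion itself, and _process_context() iterates the stream directly now that the WatchdogAsyncIterator wrapper is removed. A hedged instantiation sketch using only the defaults stated in the docstring; the API key is a placeholder:

    from pipecat.services.sambanova.llm import SambaNovaLLMService

    llm = SambaNovaLLMService(
        api_key="...",  # placeholder SambaNova API key
        model="Llama-4-Maverick-17B-128E-Instruct",  # docstring default
        base_url="https://api.sambanova.ai/v1",  # docstring default
    )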
pipecat/services/sambanova/stt.py
CHANGED

@@ -4,6 +4,8 @@
 # SPDX-License-Identifier: BSD 2-Clause License
 #
 
+"""SambaNova's Speech-to-Text service implementation for real-time transcription."""
+
 from typing import Any, Optional
 
 from pipecat.services.whisper.base_stt import BaseWhisperSTTService, Transcription
@@ -12,16 +14,9 @@ from pipecat.transcriptions.language import Language
 
 class SambaNovaSTTService(BaseWhisperSTTService):  # type: ignore
     """SambaNova Whisper speech-to-text service.
+
     Uses SambaNova's Whisper API to convert audio to text.
     Requires a SambaNova API key set via the api_key parameter or SAMBANOVA_API_KEY environment variable.
-    Args:
-        model: Whisper model to use. Defaults to "Whisper-Large-v3".
-        api_key: SambaNova API key. Defaults to None.
-        base_url: API base URL. Defaults to "https://api.sambanova.ai/v1".
-        language: Language of the audio input. Defaults to English.
-        prompt: Optional text to guide the model's style or continue a previous segment.
-        temperature: Optional sampling temperature between 0 and 1. Defaults to 0.0.
-        **kwargs: Additional arguments passed to `pipecat.services.whisper.base_stt.BaseWhisperSTTService`.
     """
 
     def __init__(
@@ -35,6 +30,17 @@ class SambaNovaSTTService(BaseWhisperSTTService):  # type: ignore
         temperature: Optional[float] = None,
         **kwargs: Any,
     ) -> None:
+        """Initialize SambaNova STT service.
+
+        Args:
+            model: Whisper model to use. Defaults to "Whisper-Large-v3".
+            api_key: SambaNova API key. Defaults to None.
+            base_url: API base URL. Defaults to "https://api.sambanova.ai/v1".
+            language: Language of the audio input. Defaults to English.
+            prompt: Optional text to guide the model's style or continue a previous segment.
+            temperature: Optional sampling temperature between 0 and 1. Defaults to 0.0.
+            **kwargs: Additional arguments passed to `pipecat.services.whisper.base_stt.BaseWhisperSTTService`.
+        """
         super().__init__(
             model=model,
             api_key=api_key,
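A short usage sketch for the STT service, assuming only the constructor arguments listed in the new docstring; the key is a placeholder and can instead come from SAMBANOVA_API_KEY:

    from pipecat.services.sambanova.stt import SambaNovaSTTService
    from pipecat.transcriptions.language import Language

    stt = SambaNovaSTTService(
        api_key="...",  # placeholder; or set SAMBANOVA_API_KEY
        model="Whisper-Large-v3",  # docstring default
        language=Language.EN,  # docstring default is English
    )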
pipecat/services/sarvam/tts.py
CHANGED
@@ -4,6 +4,8 @@
 # SPDX-License-Identifier: BSD 2-Clause License
 #
 
+"""Sarvam AI text-to-speech service implementation."""
+
 import base64
 from typing import AsyncGenerator, Optional
 
@@ -25,7 +27,14 @@ from pipecat.utils.tracing.service_decorators import traced_tts
 
 
 def language_to_sarvam_language(language: Language) -> Optional[str]:
-    """Convert Pipecat Language enum to Sarvam AI language codes.
+    """Convert Pipecat Language enum to Sarvam AI language codes.
+
+    Args:
+        language: The Language enum value to convert.
+
+    Returns:
+        The corresponding Sarvam AI language code, or None if not supported.
+    """
     LANGUAGE_MAP = {
         Language.BN: "bn-IN",  # Bengali
         Language.EN: "en-IN",  # English (India)
@@ -50,17 +59,8 @@ class SarvamTTSService(TTSService):
    Indian languages. Provides control over voice characteristics like pitch, pace,
    and loudness.
 
-
-
-        voice_id: Speaker voice ID (e.g., "anushka", "meera").
-        model: TTS model to use ("bulbul:v1" or "bulbul:v2").
-        aiohttp_session: Shared aiohttp session for making requests.
-        base_url: Sarvam AI API base URL.
-        sample_rate: Audio sample rate in Hz (8000, 16000, 22050, 24000).
-        params: Additional voice and preprocessing parameters.
-
-    Example:
-        ```python
+    Example::
+
         tts = SarvamTTSService(
             api_key="your-api-key",
             voice_id="anushka",
@@ -72,10 +72,19 @@ class SarvamTTSService(TTSService):
                 pace=1.2
             )
         )
-        ```
     """
 
     class InputParams(BaseModel):
+        """Input parameters for Sarvam TTS configuration.
+
+        Parameters:
+            language: Language for synthesis. Defaults to English (India).
+            pitch: Voice pitch adjustment (-0.75 to 0.75). Defaults to 0.0.
+            pace: Speech pace multiplier (0.3 to 3.0). Defaults to 1.0.
+            loudness: Volume multiplier (0.1 to 3.0). Defaults to 1.0.
+            enable_preprocessing: Whether to enable text preprocessing. Defaults to False.
+        """
+
         language: Optional[Language] = Language.EN
         pitch: Optional[float] = Field(default=0.0, ge=-0.75, le=0.75)
         pace: Optional[float] = Field(default=1.0, ge=0.3, le=3.0)
@@ -94,6 +103,18 @@ class SarvamTTSService(TTSService):
         params: Optional[InputParams] = None,
         **kwargs,
     ):
+        """Initialize the Sarvam TTS service.
+
+        Args:
+            api_key: Sarvam AI API subscription key.
+            voice_id: Speaker voice ID (e.g., "anushka", "meera"). Defaults to "anushka".
+            model: TTS model to use ("bulbul:v1" or "bulbul:v2"). Defaults to "bulbul:v2".
+            aiohttp_session: Shared aiohttp session for making requests.
+            base_url: Sarvam AI API base URL. Defaults to "https://api.sarvam.ai".
+            sample_rate: Audio sample rate in Hz (8000, 16000, 22050, 24000). If None, uses default.
+            params: Additional voice and preprocessing parameters. If None, uses defaults.
+            **kwargs: Additional arguments passed to parent TTSService.
+        """
         super().__init__(sample_rate=sample_rate, **kwargs)
 
         params = params or SarvamTTSService.InputParams()
@@ -116,17 +137,43 @@ class SarvamTTSService(TTSService):
         self.set_voice(voice_id)
 
     def can_generate_metrics(self) -> bool:
+        """Check if this service can generate processing metrics.
+
+        Returns:
+            True, as Sarvam service supports metrics generation.
+        """
         return True
 
     def language_to_service_language(self, language: Language) -> Optional[str]:
+        """Convert a Language enum to Sarvam AI language format.
+
+        Args:
+            language: The language to convert.
+
+        Returns:
+            The Sarvam AI-specific language code, or None if not supported.
+        """
         return language_to_sarvam_language(language)
 
     async def start(self, frame: StartFrame):
+        """Start the Sarvam TTS service.
+
+        Args:
+            frame: The start frame containing initialization parameters.
+        """
         await super().start(frame)
         self._settings["sample_rate"] = self.sample_rate
 
     @traced_tts
     async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
+        """Generate speech from text using Sarvam AI's API.
+
+        Args:
+            text: The text to synthesize into speech.
+
+        Yields:
+            Frame: Audio frames containing the synthesized speech.
+        """
         logger.debug(f"{self}: Generating TTS [{text}]")
 
         try:
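The InputParams model above enforces its ranges with pydantic Field bounds, so out-of-range values fail at construction. A sketch of valid values; the loudness range is taken from the docstring since its Field line falls outside the hunk shown:

    from pipecat.services.sarvam.tts import SarvamTTSService

    params = SarvamTTSService.InputParams(
        pitch=0.25,    # validated range: -0.75 to 0.75
        pace=1.2,      # validated range: 0.3 to 3.0
        loudness=1.0,  # docstring range: 0.1 to 3.0
    )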
pipecat/services/simli/video.py
CHANGED
@@ -4,6 +4,8 @@
 # SPDX-License-Identifier: BSD 2-Clause License
 #
 
+"""Simli video service for real-time avatar generation."""
+
 import asyncio
 
 import numpy as np
@@ -16,9 +18,10 @@ from pipecat.frames.frames import (
     OutputImageRawFrame,
     StartInterruptionFrame,
     TTSAudioRawFrame,
+    TTSStoppedFrame,
+    UserStartedSpeakingFrame,
 )
 from pipecat.processors.frame_processor import FrameDirection, FrameProcessor, StartFrame
-from pipecat.utils.asyncio.watchdog_async_iterator import WatchdogAsyncIterator
 
 try:
     from av.audio.frame import AudioFrame
@@ -31,39 +34,68 @@ except ModuleNotFoundError as e:
 
 
 class SimliVideoService(FrameProcessor):
+    """Simli video service for real-time avatar generation.
+
+    Provides real-time avatar video generation by processing audio frames
+    and producing synchronized video output using the Simli API. Handles
+    audio resampling, video frame processing, and connection management.
+    """
+
     def __init__(
         self,
         simli_config: SimliConfig,
         use_turn_server: bool = False,
         latency_interval: int = 0,
+        simli_url: str = "https://api.simli.ai",
+        is_trinity_avatar: bool = False,
     ):
+        """Initialize the Simli video service.
+
+        Args:
+            simli_config: Configuration object for Simli client settings.
+            use_turn_server: Whether to use TURN server for connection. Defaults to False.
+            latency_interval: Latency interval setting for sending health checks to check the latency to Simli Servers. Defaults to 0.
+            simli_url: URL of the simli servers. Can be changed for custom deployments of enterprise users.
+            is_trinity_avatar: boolean to tell simli client that this is a Trinity avatar which reduces latency when using Trinity.
+
+        """
         super().__init__()
-        self.
+        self._initialized = False
+        simli_config.maxIdleTime += 5
+        simli_config.maxSessionLength += 5
+        self._simli_client = SimliClient(
+            simli_config,
+            use_turn_server,
+            latency_interval,
+            simliURL=simli_url,
+        )
 
-        self._pipecat_resampler_event = asyncio.Event()
         self._pipecat_resampler: AudioResampler = None
+        self._pipecat_resampler_event = asyncio.Event()
         self._simli_resampler = AudioResampler("s16", "mono", 16000)
 
-        self._initialized = False
         self._audio_task: asyncio.Task = None
         self._video_task: asyncio.Task = None
+        self._is_trinity_avatar = is_trinity_avatar
+        self._previously_interrupted = is_trinity_avatar
+        self._audio_buffer = bytearray()
 
     async def _start_connection(self):
+        """Start the connection to Simli service and begin processing tasks."""
         if not self._initialized:
             await self._simli_client.Initialize()
             self._initialized = True
 
         # Create task to consume and process audio and video
-
-
-
-        if not self._video_task:
-            self._video_task = self.create_task(self._consume_and_process_video())
+        await self._simli_client.sendSilence()
+        self._audio_task = self.create_task(self._consume_and_process_audio())
+        self._video_task = self.create_task(self._consume_and_process_video())
 
     async def _consume_and_process_audio(self):
+        """Consume audio frames from Simli and push them downstream."""
         await self._pipecat_resampler_event.wait()
         audio_iterator = self._simli_client.getAudioStreamIterator()
-        async for audio_frame in
+        async for audio_frame in audio_iterator:
             resampled_frames = self._pipecat_resampler.resample(audio_frame)
             for resampled_frame in resampled_frames:
                 audio_array = resampled_frame.to_ndarray()
@@ -78,9 +110,10 @@ class SimliVideoService(FrameProcessor):
                 )
 
     async def _consume_and_process_video(self):
+        """Consume video frames from Simli and convert them to output frames."""
         await self._pipecat_resampler_event.wait()
         video_iterator = self._simli_client.getVideoStreamIterator(targetFormat="rgb24")
-        async for video_frame in
+        async for video_frame in video_iterator:
             # Process the video frame
             convertedFrame: OutputImageRawFrame = OutputImageRawFrame(
                 image=video_frame.to_rgb().to_image().tobytes(),
@@ -91,9 +124,14 @@ class SimliVideoService(FrameProcessor):
             await self.push_frame(convertedFrame)
 
     async def process_frame(self, frame: Frame, direction: FrameDirection):
+        """Process incoming frames and handle Simli video generation.
+
+        Args:
+            frame: The frame to process.
+            direction: The direction of frame processing.
+        """
         await super().process_frame(frame, direction)
         if isinstance(frame, StartFrame):
-            await self.push_frame(frame, direction)
             await self._start_connection()
         elif isinstance(frame, TTSAudioRawFrame):
             # Send audio frame to Simli
@@ -112,21 +150,44 @@ class SimliVideoService(FrameProcessor):
 
                 resampled_frames = self._simli_resampler.resample(old_frame)
                 for resampled_frame in resampled_frames:
-
-
-
+                    audioBytes = resampled_frame.to_ndarray().astype(np.int16).tobytes()
+                    if self._previously_interrupted:
+                        self._audio_buffer.extend(audioBytes)
+                        if len(self._audio_buffer) >= 128000:
+                            try:
+                                for flushFrame in self._simli_resampler.resample(None):
+                                    self._audio_buffer.extend(
+                                        flushFrame.to_ndarray().astype(np.int16).tobytes()
+                                    )
+                            finally:
+                                await self._simli_client.playImmediate(self._audio_buffer)
+                                self._previously_interrupted = False
+                                self._audio_buffer = bytearray()
+                    else:
+                        await self._simli_client.send(audioBytes)
+                return
             except Exception as e:
                 logger.exception(f"{self} exception: {e}")
+        elif isinstance(frame, TTSStoppedFrame):
+            try:
+                if self._previously_interrupted and len(self._audio_buffer) > 0:
+                    await self._simli_client.playImmediate(self._audio_buffer)
+                    self._previously_interrupted = False
+                    self._audio_buffer = bytearray()
+            except Exception as e:
+                logger.exception(f"{self} exception: {e}")
+            return
         elif isinstance(frame, (EndFrame, CancelFrame)):
             await self._stop()
-
-
-
-
-
-
+        elif isinstance(frame, (StartInterruptionFrame, UserStartedSpeakingFrame)):
+            if not self._previously_interrupted:
+                await self._simli_client.clearBuffer()
+            self._previously_interrupted = self._is_trinity_avatar
+
+        await self.push_frame(frame, direction)
 
     async def _stop(self):
+        """Stop the Simli client and cancel processing tasks."""
         await self._simli_client.stop()
         if self._audio_task:
             await self.cancel_task(self._audio_task)
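For Trinity avatars, the new path above buffers post-interruption audio and flushes it with playImmediate() once 128000 bytes accumulate, which is 4 seconds of 16 kHz, 16-bit mono audio (16000 samples/s x 2 bytes/sample). A construction sketch; the SimliConfig import location and its fields are assumptions based on the simli SDK this module wraps:

    from simli import SimliConfig  # assumed import; the diff only shows the type annotation
    from pipecat.services.simli.video import SimliVideoService

    video = SimliVideoService(
        simli_config=SimliConfig(apiKey="...", faceId="..."),  # placeholder credentials
        simli_url="https://api.simli.ai",  # default; override for enterprise deployments
        is_trinity_avatar=True,  # enables the buffered playImmediate() path
    )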