PyPI - dv-pipecat-ai - Versions diffs - 0.0.74.dev770__py3-none-any.whl → 0.0.82.dev776__py3-none-any.whl - Mend

dv-pipecat-ai 0.0.74.dev770__py3-none-any.whl → 0.0.82.dev776__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of dv-pipecat-ai might be problematic. Click here for more details.

Files changed (244) hide show

{dv_pipecat_ai-0.0.74.dev770.dist-info → dv_pipecat_ai-0.0.82.dev776.dist-info}/METADATA +137 -93
dv_pipecat_ai-0.0.82.dev776.dist-info/RECORD +340 -0
pipecat/__init__.py +17 -0
pipecat/adapters/base_llm_adapter.py +36 -1
pipecat/adapters/schemas/direct_function.py +296 -0
pipecat/adapters/schemas/function_schema.py +15 -6
pipecat/adapters/schemas/tools_schema.py +55 -7
pipecat/adapters/services/anthropic_adapter.py +22 -3
pipecat/adapters/services/aws_nova_sonic_adapter.py +23 -3
pipecat/adapters/services/bedrock_adapter.py +22 -3
pipecat/adapters/services/gemini_adapter.py +16 -3
pipecat/adapters/services/open_ai_adapter.py +17 -2
pipecat/adapters/services/open_ai_realtime_adapter.py +23 -3
pipecat/audio/filters/base_audio_filter.py +30 -6
pipecat/audio/filters/koala_filter.py +37 -2
pipecat/audio/filters/krisp_filter.py +59 -6
pipecat/audio/filters/noisereduce_filter.py +37 -0
pipecat/audio/interruptions/base_interruption_strategy.py +25 -5
pipecat/audio/interruptions/min_words_interruption_strategy.py +21 -4
pipecat/audio/mixers/base_audio_mixer.py +30 -7
pipecat/audio/mixers/soundfile_mixer.py +53 -6
pipecat/audio/resamplers/base_audio_resampler.py +17 -9
pipecat/audio/resamplers/resampy_resampler.py +26 -1
pipecat/audio/resamplers/soxr_resampler.py +32 -1
pipecat/audio/resamplers/soxr_stream_resampler.py +101 -0
pipecat/audio/utils.py +194 -1
pipecat/audio/vad/silero.py +60 -3
pipecat/audio/vad/vad_analyzer.py +114 -30
pipecat/clocks/base_clock.py +19 -0
pipecat/clocks/system_clock.py +25 -0
pipecat/extensions/voicemail/__init__.py +0 -0
pipecat/extensions/voicemail/voicemail_detector.py +707 -0
pipecat/frames/frames.py +590 -156
pipecat/metrics/metrics.py +64 -1
pipecat/observers/base_observer.py +58 -19
pipecat/observers/loggers/debug_log_observer.py +56 -64
pipecat/observers/loggers/llm_log_observer.py +8 -1
pipecat/observers/loggers/transcription_log_observer.py +19 -7
pipecat/observers/loggers/user_bot_latency_log_observer.py +32 -5
pipecat/observers/turn_tracking_observer.py +26 -1
pipecat/pipeline/base_pipeline.py +5 -7
pipecat/pipeline/base_task.py +52 -9
pipecat/pipeline/parallel_pipeline.py +121 -177
pipecat/pipeline/pipeline.py +129 -20
pipecat/pipeline/runner.py +50 -1
pipecat/pipeline/sync_parallel_pipeline.py +132 -32
pipecat/pipeline/task.py +263 -280
pipecat/pipeline/task_observer.py +85 -34
pipecat/pipeline/to_be_updated/merge_pipeline.py +32 -2
pipecat/processors/aggregators/dtmf_aggregator.py +29 -22
pipecat/processors/aggregators/gated.py +25 -24
pipecat/processors/aggregators/gated_openai_llm_context.py +22 -2
pipecat/processors/aggregators/llm_response.py +398 -89
pipecat/processors/aggregators/openai_llm_context.py +161 -13
pipecat/processors/aggregators/sentence.py +25 -14
pipecat/processors/aggregators/user_response.py +28 -3
pipecat/processors/aggregators/vision_image_frame.py +24 -14
pipecat/processors/async_generator.py +28 -0
pipecat/processors/audio/audio_buffer_processor.py +78 -37
pipecat/processors/consumer_processor.py +25 -6
pipecat/processors/filters/frame_filter.py +23 -0
pipecat/processors/filters/function_filter.py +30 -0
pipecat/processors/filters/identity_filter.py +17 -2
pipecat/processors/filters/null_filter.py +24 -1
pipecat/processors/filters/stt_mute_filter.py +56 -21
pipecat/processors/filters/wake_check_filter.py +46 -3
pipecat/processors/filters/wake_notifier_filter.py +21 -3
pipecat/processors/frame_processor.py +488 -131
pipecat/processors/frameworks/langchain.py +38 -3
pipecat/processors/frameworks/rtvi.py +719 -34
pipecat/processors/gstreamer/pipeline_source.py +41 -0
pipecat/processors/idle_frame_processor.py +26 -3
pipecat/processors/logger.py +23 -0
pipecat/processors/metrics/frame_processor_metrics.py +77 -4
pipecat/processors/metrics/sentry.py +42 -4
pipecat/processors/producer_processor.py +34 -14
pipecat/processors/text_transformer.py +22 -10
pipecat/processors/transcript_processor.py +48 -29
pipecat/processors/user_idle_processor.py +31 -21
pipecat/runner/__init__.py +1 -0
pipecat/runner/daily.py +132 -0
pipecat/runner/livekit.py +148 -0
pipecat/runner/run.py +543 -0
pipecat/runner/types.py +67 -0
pipecat/runner/utils.py +515 -0
pipecat/serializers/base_serializer.py +42 -0
pipecat/serializers/exotel.py +17 -6
pipecat/serializers/genesys.py +95 -0
pipecat/serializers/livekit.py +33 -0
pipecat/serializers/plivo.py +16 -15
pipecat/serializers/protobuf.py +37 -1
pipecat/serializers/telnyx.py +18 -17
pipecat/serializers/twilio.py +32 -16
pipecat/services/ai_service.py +5 -3
pipecat/services/anthropic/llm.py +113 -43
pipecat/services/assemblyai/models.py +63 -5
pipecat/services/assemblyai/stt.py +64 -11
pipecat/services/asyncai/__init__.py +0 -0
pipecat/services/asyncai/tts.py +501 -0
pipecat/services/aws/llm.py +185 -111
pipecat/services/aws/stt.py +217 -23
pipecat/services/aws/tts.py +118 -52
pipecat/services/aws/utils.py +101 -5
pipecat/services/aws_nova_sonic/aws.py +82 -64
pipecat/services/aws_nova_sonic/context.py +15 -6
pipecat/services/azure/common.py +10 -2
pipecat/services/azure/image.py +32 -0
pipecat/services/azure/llm.py +9 -7
pipecat/services/azure/stt.py +65 -2
pipecat/services/azure/tts.py +154 -23
pipecat/services/cartesia/stt.py +125 -8
pipecat/services/cartesia/tts.py +102 -38
pipecat/services/cerebras/llm.py +15 -23
pipecat/services/deepgram/stt.py +19 -11
pipecat/services/deepgram/tts.py +36 -0
pipecat/services/deepseek/llm.py +14 -23
pipecat/services/elevenlabs/tts.py +330 -64
pipecat/services/fal/image.py +43 -0
pipecat/services/fal/stt.py +48 -10
pipecat/services/fireworks/llm.py +14 -21
pipecat/services/fish/tts.py +109 -9
pipecat/services/gemini_multimodal_live/__init__.py +1 -0
pipecat/services/gemini_multimodal_live/events.py +83 -2
pipecat/services/gemini_multimodal_live/file_api.py +189 -0
pipecat/services/gemini_multimodal_live/gemini.py +218 -21
pipecat/services/gladia/config.py +17 -10
pipecat/services/gladia/stt.py +82 -36
pipecat/services/google/frames.py +40 -0
pipecat/services/google/google.py +2 -0
pipecat/services/google/image.py +39 -2
pipecat/services/google/llm.py +176 -58
pipecat/services/google/llm_openai.py +26 -4
pipecat/services/google/llm_vertex.py +37 -15
pipecat/services/google/rtvi.py +41 -0
pipecat/services/google/stt.py +65 -17
pipecat/services/google/test-google-chirp.py +45 -0
pipecat/services/google/tts.py +390 -19
pipecat/services/grok/llm.py +8 -6
pipecat/services/groq/llm.py +8 -6
pipecat/services/groq/stt.py +13 -9
pipecat/services/groq/tts.py +40 -0
pipecat/services/hamsa/__init__.py +9 -0
pipecat/services/hamsa/stt.py +241 -0
pipecat/services/heygen/__init__.py +5 -0
pipecat/services/heygen/api.py +281 -0
pipecat/services/heygen/client.py +620 -0
pipecat/services/heygen/video.py +338 -0
pipecat/services/image_service.py +5 -3
pipecat/services/inworld/__init__.py +1 -0
pipecat/services/inworld/tts.py +592 -0
pipecat/services/llm_service.py +127 -45
pipecat/services/lmnt/tts.py +80 -7
pipecat/services/mcp_service.py +85 -44
pipecat/services/mem0/memory.py +42 -13
pipecat/services/minimax/tts.py +74 -15
pipecat/services/mistral/__init__.py +0 -0
pipecat/services/mistral/llm.py +185 -0
pipecat/services/moondream/vision.py +55 -10
pipecat/services/neuphonic/tts.py +275 -48
pipecat/services/nim/llm.py +8 -6
pipecat/services/ollama/llm.py +27 -7
pipecat/services/openai/base_llm.py +54 -16
pipecat/services/openai/image.py +30 -0
pipecat/services/openai/llm.py +7 -5
pipecat/services/openai/stt.py +13 -9
pipecat/services/openai/tts.py +42 -10
pipecat/services/openai_realtime_beta/azure.py +11 -9
pipecat/services/openai_realtime_beta/context.py +7 -5
pipecat/services/openai_realtime_beta/events.py +10 -7
pipecat/services/openai_realtime_beta/openai.py +37 -18
pipecat/services/openpipe/llm.py +30 -24
pipecat/services/openrouter/llm.py +9 -7
pipecat/services/perplexity/llm.py +15 -19
pipecat/services/piper/tts.py +26 -12
pipecat/services/playht/tts.py +227 -65
pipecat/services/qwen/llm.py +8 -6
pipecat/services/rime/tts.py +128 -17
pipecat/services/riva/stt.py +160 -22
pipecat/services/riva/tts.py +67 -2
pipecat/services/sambanova/llm.py +19 -17
pipecat/services/sambanova/stt.py +14 -8
pipecat/services/sarvam/tts.py +60 -13
pipecat/services/simli/video.py +82 -21
pipecat/services/soniox/__init__.py +0 -0
pipecat/services/soniox/stt.py +398 -0
pipecat/services/speechmatics/stt.py +29 -17
pipecat/services/stt_service.py +47 -11
pipecat/services/tavus/video.py +94 -25
pipecat/services/together/llm.py +8 -6
pipecat/services/tts_service.py +77 -53
pipecat/services/ultravox/stt.py +46 -43
pipecat/services/vision_service.py +5 -3
pipecat/services/websocket_service.py +12 -11
pipecat/services/whisper/base_stt.py +58 -12
pipecat/services/whisper/stt.py +69 -58
pipecat/services/xtts/tts.py +59 -2
pipecat/sync/base_notifier.py +19 -0
pipecat/sync/event_notifier.py +24 -0
pipecat/tests/utils.py +73 -5
pipecat/transcriptions/language.py +24 -0
pipecat/transports/base_input.py +112 -8
pipecat/transports/base_output.py +235 -13
pipecat/transports/base_transport.py +119 -0
pipecat/transports/local/audio.py +76 -0
pipecat/transports/local/tk.py +84 -0
pipecat/transports/network/fastapi_websocket.py +174 -15
pipecat/transports/network/small_webrtc.py +383 -39
pipecat/transports/network/webrtc_connection.py +214 -8
pipecat/transports/network/websocket_client.py +171 -1
pipecat/transports/network/websocket_server.py +147 -9
pipecat/transports/services/daily.py +792 -70
pipecat/transports/services/helpers/daily_rest.py +122 -129
pipecat/transports/services/livekit.py +339 -4
pipecat/transports/services/tavus.py +273 -38
pipecat/utils/asyncio/task_manager.py +92 -186
pipecat/utils/base_object.py +83 -1
pipecat/utils/network.py +2 -0
pipecat/utils/string.py +114 -58
pipecat/utils/text/base_text_aggregator.py +44 -13
pipecat/utils/text/base_text_filter.py +46 -0
pipecat/utils/text/markdown_text_filter.py +70 -14
pipecat/utils/text/pattern_pair_aggregator.py +18 -14
pipecat/utils/text/simple_text_aggregator.py +43 -2
pipecat/utils/text/skip_tags_aggregator.py +21 -13
pipecat/utils/time.py +36 -0
pipecat/utils/tracing/class_decorators.py +32 -7
pipecat/utils/tracing/conversation_context_provider.py +12 -2
pipecat/utils/tracing/service_attributes.py +80 -64
pipecat/utils/tracing/service_decorators.py +48 -21
pipecat/utils/tracing/setup.py +13 -7
pipecat/utils/tracing/turn_context_provider.py +12 -2
pipecat/utils/tracing/turn_trace_observer.py +27 -0
pipecat/utils/utils.py +14 -14
dv_pipecat_ai-0.0.74.dev770.dist-info/RECORD +0 -319
pipecat/examples/daily_runner.py +0 -64
pipecat/examples/run.py +0 -265
pipecat/utils/asyncio/watchdog_async_iterator.py +0 -72
pipecat/utils/asyncio/watchdog_event.py +0 -42
pipecat/utils/asyncio/watchdog_priority_queue.py +0 -48
pipecat/utils/asyncio/watchdog_queue.py +0 -48
{dv_pipecat_ai-0.0.74.dev770.dist-info → dv_pipecat_ai-0.0.82.dev776.dist-info}/WHEEL +0 -0
{dv_pipecat_ai-0.0.74.dev770.dist-info → dv_pipecat_ai-0.0.82.dev776.dist-info}/licenses/LICENSE +0 -0
{dv_pipecat_ai-0.0.74.dev770.dist-info → dv_pipecat_ai-0.0.82.dev776.dist-info}/top_level.txt +0 -0
/pipecat/{examples → extensions}/__init__.py +0 -0

pipecat/services/google/tts.py CHANGED Viewed

@@ -4,7 +4,16 @@
 # SPDX-License-Identifier: BSD 2-Clause License
 #
-import asyncio
+"""Google Cloud Text-to-Speech service implementations.
+This module provides integration with Google Cloud Text-to-Speech API,
+offering both HTTP-based synthesis with SSML support and streaming synthesis
+for real-time applications.
+It also includes GeminiTTSService which uses Gemini's TTS-specific models
+for natural voice control and multi-speaker conversations.
+"""
 import json
 import os
@@ -13,7 +22,7 @@ from pipecat.utils.tracing.service_decorators import traced_tts
 # Suppress gRPC fork warnings
 os.environ["GRPC_ENABLE_FORK_SUPPORT"] = "false"
-from typing import AsyncGenerator, Literal, Optional
+from typing import AsyncGenerator, List, Literal, Optional
 from loguru import logger
 from pydantic import BaseModel
@@ -21,6 +30,7 @@ from pydantic import BaseModel
 from pipecat.frames.frames import (
     ErrorFrame,
     Frame,
+    StartFrame,
     TTSAudioRawFrame,
     TTSStartedFrame,
     TTSStoppedFrame,
@@ -41,8 +51,25 @@ except ModuleNotFoundError as e:
     )
     raise Exception(f"Missing module: {e}")
+try:
+    from google import genai
+    from google.genai import types
+except ModuleNotFoundError as e:
+    logger.error(f"Exception: {e}")
+    logger.error("In order to use Gemini TTS, you need to `pip install pipecat-ai[google]`.")
+    raise Exception(f"Missing module: {e}")
 def language_to_google_tts_language(language: Language) -> Optional[str]:
+    """Convert a Language enum to Google TTS language code.
+    Args:
+        language: The Language enum value to convert.
+    Returns:
+        The corresponding Google TTS language code, or None if not supported.
+    """
     language_map = {
         # Afrikaans
         Language.AF: "af-ZA",
@@ -203,7 +230,32 @@ def language_to_google_tts_language(language: Language) -> Optional[str]:
 class GoogleHttpTTSService(TTSService):
+    """Google Cloud Text-to-Speech HTTP service with SSML support.
+    Provides text-to-speech synthesis using Google Cloud's HTTP API with
+    comprehensive SSML support for voice customization, prosody control,
+    and styling options. Ideal for applications requiring fine-grained
+    control over speech output.
+    Note:
+        Requires Google Cloud credentials via service account JSON, credentials file,
+        or default application credentials (GOOGLE_APPLICATION_CREDENTIALS).
+        Chirp and Journey voices don't support SSML and will use plain text input.
+    """
     class InputParams(BaseModel):
+        """Input parameters for Google HTTP TTS voice customization.
+        Parameters:
+            pitch: Voice pitch adjustment (e.g., "+2st", "-50%").
+            rate: Speaking rate adjustment (e.g., "slow", "fast", "125%").
+            volume: Volume adjustment (e.g., "loud", "soft", "+6dB").
+            emphasis: Emphasis level for the text.
+            language: Language for synthesis. Defaults to English.
+            gender: Voice gender preference.
+            google_style: Google-specific voice style.
+        """
         pitch: Optional[str] = None
         rate: Optional[str] = None
         volume: Optional[str] = None
@@ -222,6 +274,16 @@ class GoogleHttpTTSService(TTSService):
         params: Optional[InputParams] = None,
         **kwargs,
     ):
+        """Initializes the Google HTTP TTS service.
+        Args:
+            credentials: JSON string containing Google Cloud service account credentials.
+            credentials_path: Path to Google Cloud service account JSON file.
+            voice_id: Google TTS voice identifier (e.g., "en-US-Standard-A").
+            sample_rate: Audio sample rate in Hz. If None, uses default.
+            params: Voice customization parameters including pitch, rate, volume, etc.
+            **kwargs: Additional arguments passed to parent TTSService.
+        """
         super().__init__(sample_rate=sample_rate, **kwargs)
         params = params or GoogleHttpTTSService.InputParams()
@@ -245,11 +307,20 @@ class GoogleHttpTTSService(TTSService):
     def _create_client(
         self, credentials: Optional[str], credentials_path: Optional[str]
     ) -> texttospeech_v1.TextToSpeechAsyncClient:
+        """Create authenticated Google Text-to-Speech client.
+        Args:
+            credentials: JSON string with service account credentials.
+            credentials_path: Path to service account JSON file.
+        Returns:
+            Authenticated TextToSpeechAsyncClient instance.
+        Raises:
+            ValueError: If no valid credentials are provided.
+        """
         creds: Optional[service_account.Credentials] = None
-        # Create a Google Cloud service account for the Cloud Text-to-Speech API
-        # Using either the provided credentials JSON string or the path to a service account JSON
-        # file, create a Google Cloud service account and use it to authenticate with the API.
         if credentials:
             # Use provided credentials JSON string
             json_account_info = json.loads(credentials)
@@ -271,9 +342,22 @@ class GoogleHttpTTSService(TTSService):
         return texttospeech_v1.TextToSpeechAsyncClient(credentials=creds)
     def can_generate_metrics(self) -> bool:
+        """Check if this service can generate processing metrics.
+        Returns:
+            True, as Google HTTP TTS service supports metrics generation.
+        """
         return True
     def language_to_service_language(self, language: Language) -> Optional[str]:
+        """Convert a Language enum to Google TTS language format.
+        Args:
+            language: The language to convert.
+        Returns:
+            The Google TTS-specific language code, or None if not supported.
+        """
         return language_to_google_tts_language(language)
     def _construct_ssml(self, text: str) -> str:
@@ -324,6 +408,14 @@ class GoogleHttpTTSService(TTSService):
     @traced_tts
     async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
+        """Generate speech from text using Google's HTTP TTS API.
+        Args:
+            text: The text to synthesize into speech.
+        Yields:
+            Frame: Audio frames containing the synthesized speech.
+        """
         logger.debug(f"{self}: Generating TTS [{text}]")
         try:
@@ -381,25 +473,19 @@ class GoogleHttpTTSService(TTSService):
 class GoogleTTSService(TTSService):
-    """Text-to-Speech service using Google Cloud Text-to-Speech API.
+    """Google Cloud Text-to-Speech streaming service.
-    Converts text to speech using Google's TTS models with streaming synthesis
-    for low latency. Supports multiple languages and voices.
+    Provides real-time text-to-speech synthesis using Google Cloud's streaming API
+    for low-latency applications. Optimized for Chirp 3 HD and Journey voices
+    with continuous audio streaming capabilities.
-    Args:
-        credentials: JSON string containing Google Cloud service account credentials.
-        credentials_path: Path to Google Cloud service account JSON file.
-        voice_id: Google TTS voice identifier (e.g., "en-US-Chirp3-HD-Charon").
-        sample_rate: Audio sample rate in Hz.
-        params: Language only.
-    Notes:
+    Note:
         Requires Google Cloud credentials via service account JSON, file path, or
         default application credentials (GOOGLE_APPLICATION_CREDENTIALS env var).
         Only Chirp 3 HD and Journey voices are supported. Use GoogleHttpTTSService for other voices.
-    Example:
-        ```python
+    Example::
         tts = GoogleTTSService(
             credentials_path="/path/to/service-account.json",
             voice_id="en-US-Chirp3-HD-Charon",
@@ -407,10 +493,15 @@ class GoogleTTSService(TTSService):
                 language=Language.EN_US,
             )
         )
-        ```
     """
     class InputParams(BaseModel):
+        """Input parameters for Google streaming TTS configuration.
+        Parameters:
+            language: Language for synthesis. Defaults to English.
+        """
         language: Optional[Language] = Language.EN
         rate: Optional[float] = 1.0
@@ -424,6 +515,16 @@ class GoogleTTSService(TTSService):
         params: InputParams = InputParams(),
         **kwargs,
     ):
+        """Initializes the Google streaming TTS service.
+        Args:
+            credentials: JSON string containing Google Cloud service account credentials.
+            credentials_path: Path to Google Cloud service account JSON file.
+            voice_id: Google TTS voice identifier (e.g., "en-US-Chirp3-HD-Charon").
+            sample_rate: Audio sample rate in Hz. If None, uses default.
+            params: Language configuration parameters.
+            **kwargs: Additional arguments passed to parent TTSService.
+        """
         super().__init__(sample_rate=sample_rate, **kwargs)
         params = params or GoogleTTSService.InputParams()
@@ -482,13 +583,34 @@ class GoogleTTSService(TTSService):
         return texttospeech_v1.TextToSpeechAsyncClient(credentials=creds)
     def can_generate_metrics(self) -> bool:
+        """Check if this service can generate processing metrics.
+        Returns:
+            True, as Google streaming TTS service supports metrics generation.
+        """
         return True
     def language_to_service_language(self, language: Language) -> Optional[str]:
+        """Convert a Language enum to Google TTS language format.
+        Args:
+            language: The language to convert.
+        Returns:
+            The Google TTS-specific language code, or None if not supported.
+        """
         return language_to_google_tts_language(language)
     @traced_tts
     async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
+        """Generate streaming speech from text using Google's streaming API.
+        Args:
+            text: The text to synthesize into speech.
+        Yields:
+            Frame: Audio frames containing the synthesized speech as it's generated.
+        """
         logger.debug(f"{self}: Generating TTS [{text}]")
         try:
@@ -553,3 +675,252 @@ class GoogleTTSService(TTSService):
             logger.exception(f"{self} error generating TTS: {e}")
             error_message = f"TTS generation error: {str(e)}"
             yield ErrorFrame(error=error_message)
+class GeminiTTSService(TTSService):
+    """Gemini Text-to-Speech service using Gemini TTS models.
+    Provides text-to-speech synthesis using Gemini's TTS-specific models
+    (gemini-2.5-flash-preview-tts and gemini-2.5-pro-preview-tts) with
+    support for natural voice control, multiple speakers, and voice styles.
+    Note:
+        Requires Google AI API key. This uses the Gemini API, not Google Cloud TTS.
+        Audio-out is currently a preview feature.
+    Example::
+        tts = GeminiTTSService(
+            api_key="your-google-ai-api-key",
+            model="gemini-2.5-flash-preview-tts",
+            voice_id="Kore",
+            params=GeminiTTSService.InputParams(
+                language=Language.EN_US,
+            )
+        )
+    """
+    GOOGLE_SAMPLE_RATE = 24000  # Google TTS always outputs at 24kHz
+    # List of available Gemini TTS voices
+    AVAILABLE_VOICES = [
+        "Zephyr",
+        "Puck",
+        "Charon",
+        "Kore",
+        "Fenrir",
+        "Leda",
+        "Orus",
+        "Aoede",
+        "Callirhoe",
+        "Autonoe",
+        "Enceladus",
+        "Iapetus",
+        "Umbriel",
+        "Algieba",
+        "Despina",
+        "Erinome",
+        "Algenib",
+        "Rasalgethi",
+        "Laomedeia",
+        "Achernar",
+        "Alnilam",
+        "Schedar",
+        "Gacrux",
+        "Pulcherrima",
+        "Achird",
+        "Zubenelgenubi",
+        "Vindemiatrix",
+        "Sadachbia",
+        "Sadaltager",
+        "Sulafar",
+    ]
+    class InputParams(BaseModel):
+        """Input parameters for Gemini TTS configuration.
+        Parameters:
+            language: Language for synthesis. Defaults to English.
+            multi_speaker: Whether to enable multi-speaker support.
+            speaker_configs: List of speaker configurations for multi-speaker mode.
+        """
+        language: Optional[Language] = Language.EN
+        multi_speaker: bool = False
+        speaker_configs: Optional[List[dict]] = None
+    def __init__(
+        self,
+        *,
+        api_key: str,
+        model: str = "gemini-2.5-flash-preview-tts",
+        voice_id: str = "Kore",
+        sample_rate: Optional[int] = None,
+        params: Optional[InputParams] = None,
+        **kwargs,
+    ):
+        """Initializes the Gemini TTS service.
+        Args:
+            api_key: Google AI API key for authentication.
+            model: Gemini TTS model to use. Must be a TTS model like
+                   "gemini-2.5-flash-preview-tts" or "gemini-2.5-pro-preview-tts".
+            voice_id: Voice name from the available Gemini voices.
+            sample_rate: Audio sample rate in Hz. If None, uses Google's default 24kHz.
+            params: TTS configuration parameters.
+            **kwargs: Additional arguments passed to parent TTSService.
+        """
+        if sample_rate and sample_rate != self.GOOGLE_SAMPLE_RATE:
+            logger.warning(
+                f"Google TTS only supports {self.GOOGLE_SAMPLE_RATE}Hz sample rate. "
+                f"Current rate of {sample_rate}Hz may cause issues."
+            )
+        super().__init__(sample_rate=sample_rate, **kwargs)
+        params = params or GeminiTTSService.InputParams()
+        if voice_id not in self.AVAILABLE_VOICES:
+            logger.warning(f"Voice '{voice_id}' not in known voices list. Using anyway.")
+        self._api_key = api_key
+        self._model = model
+        self._voice_id = voice_id
+        self._settings = {
+            "language": self.language_to_service_language(params.language)
+            if params.language
+            else "en-US",
+            "multi_speaker": params.multi_speaker,
+            "speaker_configs": params.speaker_configs,
+        }
+        self._client = genai.Client(api_key=api_key)
+    def can_generate_metrics(self) -> bool:
+        """Check if this service can generate processing metrics.
+        Returns:
+            True, as Gemini TTS service supports metrics generation.
+        """
+        return True
+    def language_to_service_language(self, language: Language) -> Optional[str]:
+        """Convert a Language enum to Gemini TTS language format.
+        Args:
+            language: The language to convert.
+        Returns:
+            The Gemini TTS-specific language code, or None if not supported.
+        """
+        return language_to_google_tts_language(language)
+    def set_voice(self, voice_id: str):
+        """Set the voice for TTS generation.
+        Args:
+            voice_id: Name of the voice to use from AVAILABLE_VOICES.
+        """
+        if voice_id not in self.AVAILABLE_VOICES:
+            logger.warning(f"Voice '{voice_id}' not in known voices list. Using anyway.")
+        self._voice_id = voice_id
+    async def start(self, frame: StartFrame):
+        """Start the Gemini TTS service.
+        Args:
+            frame: The start frame containing initialization parameters.
+        """
+        await super().start(frame)
+        if self.sample_rate != self.GOOGLE_SAMPLE_RATE:
+            logger.warning(
+                f"Google TTS requires {self.GOOGLE_SAMPLE_RATE}Hz sample rate. "
+                f"Current rate of {self.sample_rate}Hz may cause issues."
+            )
+    @traced_tts
+    async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
+        """Generate speech from text using Gemini TTS models.
+        Args:
+            text: The text to synthesize into speech. Can include natural language
+                  instructions for style, tone, etc.
+        Yields:
+            Frame: Audio frames containing the synthesized speech.
+        """
+        logger.debug(f"{self}: Generating TTS [{text}]")
+        try:
+            await self.start_ttfb_metrics()
+            # Build the speech config
+            if self._settings["multi_speaker"] and self._settings["speaker_configs"]:
+                # Multi-speaker mode
+                speaker_voice_configs = []
+                for speaker_config in self._settings["speaker_configs"]:
+                    speaker_voice_configs.append(
+                        types.SpeakerVoiceConfig(
+                            speaker=speaker_config["speaker"],
+                            voice_config=types.VoiceConfig(
+                                prebuilt_voice_config=types.PrebuiltVoiceConfig(
+                                    voice_name=speaker_config.get("voice_id", self._voice_id)
+                                )
+                            ),
+                        )
+                    )
+                speech_config = types.SpeechConfig(
+                    multi_speaker_voice_config=types.MultiSpeakerVoiceConfig(
+                        speaker_voice_configs=speaker_voice_configs
+                    )
+                )
+            else:
+                # Single speaker mode
+                speech_config = types.SpeechConfig(
+                    voice_config=types.VoiceConfig(
+                        prebuilt_voice_config=types.PrebuiltVoiceConfig(voice_name=self._voice_id)
+                    )
+                )
+            # Create the generation config
+            generation_config = types.GenerateContentConfig(
+                response_modalities=["AUDIO"],
+                speech_config=speech_config,
+            )
+            # Generate the content
+            response = await self._client.aio.models.generate_content(
+                model=self._model,
+                contents=text,
+                config=generation_config,
+            )
+            await self.start_tts_usage_metrics(text)
+            yield TTSStartedFrame()
+            # Extract audio data from response
+            if response.candidates and len(response.candidates) > 0:
+                candidate = response.candidates[0]
+                if candidate.content and candidate.content.parts:
+                    for part in candidate.content.parts:
+                        if part.inline_data and part.inline_data.mime_type.startswith("audio/"):
+                            audio_data = part.inline_data.data
+                            await self.stop_ttfb_metrics()
+                            # Gemini TTS returns PCM audio data, chunk it appropriately
+                            CHUNK_SIZE = self.chunk_size
+                            for i in range(0, len(audio_data), CHUNK_SIZE):
+                                chunk = audio_data[i : i + CHUNK_SIZE]
+                                if not chunk:
+                                    break
+                                frame = TTSAudioRawFrame(chunk, self.sample_rate, 1)
+                                yield frame
+            yield TTSStoppedFrame()
+        except Exception as e:
+            logger.exception(f"{self} error generating TTS: {e}")
+            error_message = f"Gemini TTS generation error: {str(e)}"
+            yield ErrorFrame(error=error_message)

pipecat/services/grok/llm.py CHANGED Viewed

@@ -67,12 +67,6 @@ class GrokLLMService(OpenAILLMService):
     maintaining full compatibility with OpenAI's interface and functionality.
     Includes specialized token usage tracking that accumulates metrics during
     processing and reports final totals.
-    Args:
-        api_key: The API key for accessing Grok's API.
-        base_url: The base URL for Grok API. Defaults to "https://api.x.ai/v1".
-        model: The model identifier to use. Defaults to "grok-3-beta".
-        **kwargs: Additional keyword arguments passed to OpenAILLMService.
     """
     def __init__(
@@ -83,6 +77,14 @@ class GrokLLMService(OpenAILLMService):
         model: str = "grok-3-beta",
         **kwargs,
     ):
+        """Initialize the GrokLLMService with API key and model.
+        Args:
+            api_key: The API key for accessing Grok's API.
+            base_url: The base URL for Grok API. Defaults to "https://api.x.ai/v1".
+            model: The model identifier to use. Defaults to "grok-3-beta".
+            **kwargs: Additional keyword arguments passed to OpenAILLMService.
+        """
         super().__init__(api_key=api_key, base_url=base_url, model=model, **kwargs)
         # Initialize counters for token usage metrics
         self._prompt_tokens = 0

pipecat/services/groq/llm.py CHANGED Viewed

@@ -16,12 +16,6 @@ class GroqLLMService(OpenAILLMService):
     This service extends OpenAILLMService to connect to Groq's API endpoint while
     maintaining full compatibility with OpenAI's interface and functionality.
-    Args:
-        api_key: The API key for accessing Groq's API.
-        base_url: The base URL for Groq API. Defaults to "https://api.groq.com/openai/v1".
-        model: The model identifier to use. Defaults to "llama-3.3-70b-versatile".
-        **kwargs: Additional keyword arguments passed to OpenAILLMService.
     """
     def __init__(
@@ -32,6 +26,14 @@ class GroqLLMService(OpenAILLMService):
         model: str = "llama-3.3-70b-versatile",
         **kwargs,
     ):
+        """Initialize Groq LLM service.
+        Args:
+            api_key: The API key for accessing Groq's API.
+            base_url: The base URL for Groq API. Defaults to "https://api.groq.com/openai/v1".
+            model: The model identifier to use. Defaults to "llama-3.3-70b-versatile".
+            **kwargs: Additional keyword arguments passed to OpenAILLMService.
+        """
         super().__init__(api_key=api_key, base_url=base_url, model=model, **kwargs)
     def create_client(self, api_key=None, base_url=None, **kwargs):

pipecat/services/groq/stt.py CHANGED Viewed

@@ -4,6 +4,8 @@
 # SPDX-License-Identifier: BSD 2-Clause License
 #
+"""Groq speech-to-text service implementation using Whisper models."""
 from typing import Optional
 from pipecat.services.whisper.base_stt import BaseWhisperSTTService, Transcription
@@ -15,15 +17,6 @@ class GroqSTTService(BaseWhisperSTTService):
     Uses Groq's Whisper API to convert audio to text. Requires a Groq API key
     set via the api_key parameter or GROQ_API_KEY environment variable.
-    Args:
-        model: Whisper model to use. Defaults to "whisper-large-v3-turbo".
-        api_key: Groq API key. Defaults to None.
-        base_url: API base URL. Defaults to "https://api.groq.com/openai/v1".
-        language: Language of the audio input. Defaults to English.
-        prompt: Optional text to guide the model's style or continue a previous segment.
-        temperature: Optional sampling temperature between 0 and 1. Defaults to 0.0.
-        **kwargs: Additional arguments passed to BaseWhisperSTTService.
     """
     def __init__(
@@ -37,6 +30,17 @@ class GroqSTTService(BaseWhisperSTTService):
         temperature: Optional[float] = None,
         **kwargs,
     ):
+        """Initialize Groq STT service.
+        Args:
+            model: Whisper model to use. Defaults to "whisper-large-v3-turbo".
+            api_key: Groq API key. Defaults to None.
+            base_url: API base URL. Defaults to "https://api.groq.com/openai/v1".
+            language: Language of the audio input. Defaults to English.
+            prompt: Optional text to guide the model's style or continue a previous segment.
+            temperature: Optional sampling temperature between 0 and 1. Defaults to 0.0.
+            **kwargs: Additional arguments passed to BaseWhisperSTTService.
+        """
         super().__init__(
             model=model,
             api_key=api_key,

pipecat/services/groq/tts.py CHANGED Viewed

@@ -4,6 +4,8 @@
 # SPDX-License-Identifier: BSD 2-Clause License
 #
+"""Groq text-to-speech service implementation."""
 import io
 import wave
 from typing import AsyncGenerator, Optional
@@ -25,7 +27,21 @@ except ModuleNotFoundError as e:
 class GroqTTSService(TTSService):
+    """Groq text-to-speech service implementation.
+    Provides text-to-speech synthesis using Groq's TTS API. The service
+    operates at a fixed 48kHz sample rate and supports various voices
+    and output formats.
+    """
     class InputParams(BaseModel):
+        """Input parameters for Groq TTS configuration.
+        Parameters:
+            language: Language for speech synthesis. Defaults to English.
+            speed: Speech speed multiplier. Defaults to 1.0.
+        """
         language: Optional[Language] = Language.EN
         speed: Optional[float] = 1.0
@@ -42,6 +58,17 @@ class GroqTTSService(TTSService):
         sample_rate: Optional[int] = GROQ_SAMPLE_RATE,
         **kwargs,
     ):
+        """Initialize Groq TTS service.
+        Args:
+            api_key: Groq API key for authentication.
+            output_format: Audio output format. Defaults to "wav".
+            params: Additional input parameters for voice customization.
+            model_name: TTS model to use. Defaults to "playai-tts".
+            voice_id: Voice identifier to use. Defaults to "Celeste-PlayAI".
+            sample_rate: Audio sample rate. Must be 48000 Hz for Groq TTS.
+            **kwargs: Additional arguments passed to parent TTSService class.
+        """
         if sample_rate != self.GROQ_SAMPLE_RATE:
             logger.warning(f"Groq TTS only supports {self.GROQ_SAMPLE_RATE}Hz sample rate. ")
@@ -71,10 +98,23 @@ class GroqTTSService(TTSService):
         self._client = AsyncGroq(api_key=self._api_key)
     def can_generate_metrics(self) -> bool:
+        """Check if this service can generate processing metrics.
+        Returns:
+            True, as Groq TTS service supports metrics generation.
+        """
         return True
     @traced_tts
     async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
+        """Generate speech from text using Groq's TTS API.
+        Args:
+            text: The text to synthesize into speech.
+        Yields:
+            Frame: Audio frames containing the synthesized speech data.
+        """
         logger.debug(f"{self}: Generating TTS [{text}]")
         measuring_ttfb = True
         await self.start_ttfb_metrics()

dv-pipecat-ai 0.0.74.dev770__py3-none-any.whl → 0.0.82.dev776__py3-none-any.whl

Potentially problematic release.

dv-pipecat-ai 0.0.74.dev770__py3-none-any.whl → 0.0.82.dev776__py3-none-any.whl