dv-pipecat-ai 0.0.74.dev770__py3-none-any.whl → 0.0.82.dev776__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of dv-pipecat-ai might be problematic. Click here for more details.
- {dv_pipecat_ai-0.0.74.dev770.dist-info → dv_pipecat_ai-0.0.82.dev776.dist-info}/METADATA +137 -93
- dv_pipecat_ai-0.0.82.dev776.dist-info/RECORD +340 -0
- pipecat/__init__.py +17 -0
- pipecat/adapters/base_llm_adapter.py +36 -1
- pipecat/adapters/schemas/direct_function.py +296 -0
- pipecat/adapters/schemas/function_schema.py +15 -6
- pipecat/adapters/schemas/tools_schema.py +55 -7
- pipecat/adapters/services/anthropic_adapter.py +22 -3
- pipecat/adapters/services/aws_nova_sonic_adapter.py +23 -3
- pipecat/adapters/services/bedrock_adapter.py +22 -3
- pipecat/adapters/services/gemini_adapter.py +16 -3
- pipecat/adapters/services/open_ai_adapter.py +17 -2
- pipecat/adapters/services/open_ai_realtime_adapter.py +23 -3
- pipecat/audio/filters/base_audio_filter.py +30 -6
- pipecat/audio/filters/koala_filter.py +37 -2
- pipecat/audio/filters/krisp_filter.py +59 -6
- pipecat/audio/filters/noisereduce_filter.py +37 -0
- pipecat/audio/interruptions/base_interruption_strategy.py +25 -5
- pipecat/audio/interruptions/min_words_interruption_strategy.py +21 -4
- pipecat/audio/mixers/base_audio_mixer.py +30 -7
- pipecat/audio/mixers/soundfile_mixer.py +53 -6
- pipecat/audio/resamplers/base_audio_resampler.py +17 -9
- pipecat/audio/resamplers/resampy_resampler.py +26 -1
- pipecat/audio/resamplers/soxr_resampler.py +32 -1
- pipecat/audio/resamplers/soxr_stream_resampler.py +101 -0
- pipecat/audio/utils.py +194 -1
- pipecat/audio/vad/silero.py +60 -3
- pipecat/audio/vad/vad_analyzer.py +114 -30
- pipecat/clocks/base_clock.py +19 -0
- pipecat/clocks/system_clock.py +25 -0
- pipecat/extensions/voicemail/__init__.py +0 -0
- pipecat/extensions/voicemail/voicemail_detector.py +707 -0
- pipecat/frames/frames.py +590 -156
- pipecat/metrics/metrics.py +64 -1
- pipecat/observers/base_observer.py +58 -19
- pipecat/observers/loggers/debug_log_observer.py +56 -64
- pipecat/observers/loggers/llm_log_observer.py +8 -1
- pipecat/observers/loggers/transcription_log_observer.py +19 -7
- pipecat/observers/loggers/user_bot_latency_log_observer.py +32 -5
- pipecat/observers/turn_tracking_observer.py +26 -1
- pipecat/pipeline/base_pipeline.py +5 -7
- pipecat/pipeline/base_task.py +52 -9
- pipecat/pipeline/parallel_pipeline.py +121 -177
- pipecat/pipeline/pipeline.py +129 -20
- pipecat/pipeline/runner.py +50 -1
- pipecat/pipeline/sync_parallel_pipeline.py +132 -32
- pipecat/pipeline/task.py +263 -280
- pipecat/pipeline/task_observer.py +85 -34
- pipecat/pipeline/to_be_updated/merge_pipeline.py +32 -2
- pipecat/processors/aggregators/dtmf_aggregator.py +29 -22
- pipecat/processors/aggregators/gated.py +25 -24
- pipecat/processors/aggregators/gated_openai_llm_context.py +22 -2
- pipecat/processors/aggregators/llm_response.py +398 -89
- pipecat/processors/aggregators/openai_llm_context.py +161 -13
- pipecat/processors/aggregators/sentence.py +25 -14
- pipecat/processors/aggregators/user_response.py +28 -3
- pipecat/processors/aggregators/vision_image_frame.py +24 -14
- pipecat/processors/async_generator.py +28 -0
- pipecat/processors/audio/audio_buffer_processor.py +78 -37
- pipecat/processors/consumer_processor.py +25 -6
- pipecat/processors/filters/frame_filter.py +23 -0
- pipecat/processors/filters/function_filter.py +30 -0
- pipecat/processors/filters/identity_filter.py +17 -2
- pipecat/processors/filters/null_filter.py +24 -1
- pipecat/processors/filters/stt_mute_filter.py +56 -21
- pipecat/processors/filters/wake_check_filter.py +46 -3
- pipecat/processors/filters/wake_notifier_filter.py +21 -3
- pipecat/processors/frame_processor.py +488 -131
- pipecat/processors/frameworks/langchain.py +38 -3
- pipecat/processors/frameworks/rtvi.py +719 -34
- pipecat/processors/gstreamer/pipeline_source.py +41 -0
- pipecat/processors/idle_frame_processor.py +26 -3
- pipecat/processors/logger.py +23 -0
- pipecat/processors/metrics/frame_processor_metrics.py +77 -4
- pipecat/processors/metrics/sentry.py +42 -4
- pipecat/processors/producer_processor.py +34 -14
- pipecat/processors/text_transformer.py +22 -10
- pipecat/processors/transcript_processor.py +48 -29
- pipecat/processors/user_idle_processor.py +31 -21
- pipecat/runner/__init__.py +1 -0
- pipecat/runner/daily.py +132 -0
- pipecat/runner/livekit.py +148 -0
- pipecat/runner/run.py +543 -0
- pipecat/runner/types.py +67 -0
- pipecat/runner/utils.py +515 -0
- pipecat/serializers/base_serializer.py +42 -0
- pipecat/serializers/exotel.py +17 -6
- pipecat/serializers/genesys.py +95 -0
- pipecat/serializers/livekit.py +33 -0
- pipecat/serializers/plivo.py +16 -15
- pipecat/serializers/protobuf.py +37 -1
- pipecat/serializers/telnyx.py +18 -17
- pipecat/serializers/twilio.py +32 -16
- pipecat/services/ai_service.py +5 -3
- pipecat/services/anthropic/llm.py +113 -43
- pipecat/services/assemblyai/models.py +63 -5
- pipecat/services/assemblyai/stt.py +64 -11
- pipecat/services/asyncai/__init__.py +0 -0
- pipecat/services/asyncai/tts.py +501 -0
- pipecat/services/aws/llm.py +185 -111
- pipecat/services/aws/stt.py +217 -23
- pipecat/services/aws/tts.py +118 -52
- pipecat/services/aws/utils.py +101 -5
- pipecat/services/aws_nova_sonic/aws.py +82 -64
- pipecat/services/aws_nova_sonic/context.py +15 -6
- pipecat/services/azure/common.py +10 -2
- pipecat/services/azure/image.py +32 -0
- pipecat/services/azure/llm.py +9 -7
- pipecat/services/azure/stt.py +65 -2
- pipecat/services/azure/tts.py +154 -23
- pipecat/services/cartesia/stt.py +125 -8
- pipecat/services/cartesia/tts.py +102 -38
- pipecat/services/cerebras/llm.py +15 -23
- pipecat/services/deepgram/stt.py +19 -11
- pipecat/services/deepgram/tts.py +36 -0
- pipecat/services/deepseek/llm.py +14 -23
- pipecat/services/elevenlabs/tts.py +330 -64
- pipecat/services/fal/image.py +43 -0
- pipecat/services/fal/stt.py +48 -10
- pipecat/services/fireworks/llm.py +14 -21
- pipecat/services/fish/tts.py +109 -9
- pipecat/services/gemini_multimodal_live/__init__.py +1 -0
- pipecat/services/gemini_multimodal_live/events.py +83 -2
- pipecat/services/gemini_multimodal_live/file_api.py +189 -0
- pipecat/services/gemini_multimodal_live/gemini.py +218 -21
- pipecat/services/gladia/config.py +17 -10
- pipecat/services/gladia/stt.py +82 -36
- pipecat/services/google/frames.py +40 -0
- pipecat/services/google/google.py +2 -0
- pipecat/services/google/image.py +39 -2
- pipecat/services/google/llm.py +176 -58
- pipecat/services/google/llm_openai.py +26 -4
- pipecat/services/google/llm_vertex.py +37 -15
- pipecat/services/google/rtvi.py +41 -0
- pipecat/services/google/stt.py +65 -17
- pipecat/services/google/test-google-chirp.py +45 -0
- pipecat/services/google/tts.py +390 -19
- pipecat/services/grok/llm.py +8 -6
- pipecat/services/groq/llm.py +8 -6
- pipecat/services/groq/stt.py +13 -9
- pipecat/services/groq/tts.py +40 -0
- pipecat/services/hamsa/__init__.py +9 -0
- pipecat/services/hamsa/stt.py +241 -0
- pipecat/services/heygen/__init__.py +5 -0
- pipecat/services/heygen/api.py +281 -0
- pipecat/services/heygen/client.py +620 -0
- pipecat/services/heygen/video.py +338 -0
- pipecat/services/image_service.py +5 -3
- pipecat/services/inworld/__init__.py +1 -0
- pipecat/services/inworld/tts.py +592 -0
- pipecat/services/llm_service.py +127 -45
- pipecat/services/lmnt/tts.py +80 -7
- pipecat/services/mcp_service.py +85 -44
- pipecat/services/mem0/memory.py +42 -13
- pipecat/services/minimax/tts.py +74 -15
- pipecat/services/mistral/__init__.py +0 -0
- pipecat/services/mistral/llm.py +185 -0
- pipecat/services/moondream/vision.py +55 -10
- pipecat/services/neuphonic/tts.py +275 -48
- pipecat/services/nim/llm.py +8 -6
- pipecat/services/ollama/llm.py +27 -7
- pipecat/services/openai/base_llm.py +54 -16
- pipecat/services/openai/image.py +30 -0
- pipecat/services/openai/llm.py +7 -5
- pipecat/services/openai/stt.py +13 -9
- pipecat/services/openai/tts.py +42 -10
- pipecat/services/openai_realtime_beta/azure.py +11 -9
- pipecat/services/openai_realtime_beta/context.py +7 -5
- pipecat/services/openai_realtime_beta/events.py +10 -7
- pipecat/services/openai_realtime_beta/openai.py +37 -18
- pipecat/services/openpipe/llm.py +30 -24
- pipecat/services/openrouter/llm.py +9 -7
- pipecat/services/perplexity/llm.py +15 -19
- pipecat/services/piper/tts.py +26 -12
- pipecat/services/playht/tts.py +227 -65
- pipecat/services/qwen/llm.py +8 -6
- pipecat/services/rime/tts.py +128 -17
- pipecat/services/riva/stt.py +160 -22
- pipecat/services/riva/tts.py +67 -2
- pipecat/services/sambanova/llm.py +19 -17
- pipecat/services/sambanova/stt.py +14 -8
- pipecat/services/sarvam/tts.py +60 -13
- pipecat/services/simli/video.py +82 -21
- pipecat/services/soniox/__init__.py +0 -0
- pipecat/services/soniox/stt.py +398 -0
- pipecat/services/speechmatics/stt.py +29 -17
- pipecat/services/stt_service.py +47 -11
- pipecat/services/tavus/video.py +94 -25
- pipecat/services/together/llm.py +8 -6
- pipecat/services/tts_service.py +77 -53
- pipecat/services/ultravox/stt.py +46 -43
- pipecat/services/vision_service.py +5 -3
- pipecat/services/websocket_service.py +12 -11
- pipecat/services/whisper/base_stt.py +58 -12
- pipecat/services/whisper/stt.py +69 -58
- pipecat/services/xtts/tts.py +59 -2
- pipecat/sync/base_notifier.py +19 -0
- pipecat/sync/event_notifier.py +24 -0
- pipecat/tests/utils.py +73 -5
- pipecat/transcriptions/language.py +24 -0
- pipecat/transports/base_input.py +112 -8
- pipecat/transports/base_output.py +235 -13
- pipecat/transports/base_transport.py +119 -0
- pipecat/transports/local/audio.py +76 -0
- pipecat/transports/local/tk.py +84 -0
- pipecat/transports/network/fastapi_websocket.py +174 -15
- pipecat/transports/network/small_webrtc.py +383 -39
- pipecat/transports/network/webrtc_connection.py +214 -8
- pipecat/transports/network/websocket_client.py +171 -1
- pipecat/transports/network/websocket_server.py +147 -9
- pipecat/transports/services/daily.py +792 -70
- pipecat/transports/services/helpers/daily_rest.py +122 -129
- pipecat/transports/services/livekit.py +339 -4
- pipecat/transports/services/tavus.py +273 -38
- pipecat/utils/asyncio/task_manager.py +92 -186
- pipecat/utils/base_object.py +83 -1
- pipecat/utils/network.py +2 -0
- pipecat/utils/string.py +114 -58
- pipecat/utils/text/base_text_aggregator.py +44 -13
- pipecat/utils/text/base_text_filter.py +46 -0
- pipecat/utils/text/markdown_text_filter.py +70 -14
- pipecat/utils/text/pattern_pair_aggregator.py +18 -14
- pipecat/utils/text/simple_text_aggregator.py +43 -2
- pipecat/utils/text/skip_tags_aggregator.py +21 -13
- pipecat/utils/time.py +36 -0
- pipecat/utils/tracing/class_decorators.py +32 -7
- pipecat/utils/tracing/conversation_context_provider.py +12 -2
- pipecat/utils/tracing/service_attributes.py +80 -64
- pipecat/utils/tracing/service_decorators.py +48 -21
- pipecat/utils/tracing/setup.py +13 -7
- pipecat/utils/tracing/turn_context_provider.py +12 -2
- pipecat/utils/tracing/turn_trace_observer.py +27 -0
- pipecat/utils/utils.py +14 -14
- dv_pipecat_ai-0.0.74.dev770.dist-info/RECORD +0 -319
- pipecat/examples/daily_runner.py +0 -64
- pipecat/examples/run.py +0 -265
- pipecat/utils/asyncio/watchdog_async_iterator.py +0 -72
- pipecat/utils/asyncio/watchdog_event.py +0 -42
- pipecat/utils/asyncio/watchdog_priority_queue.py +0 -48
- pipecat/utils/asyncio/watchdog_queue.py +0 -48
- {dv_pipecat_ai-0.0.74.dev770.dist-info → dv_pipecat_ai-0.0.82.dev776.dist-info}/WHEEL +0 -0
- {dv_pipecat_ai-0.0.74.dev770.dist-info → dv_pipecat_ai-0.0.82.dev776.dist-info}/licenses/LICENSE +0 -0
- {dv_pipecat_ai-0.0.74.dev770.dist-info → dv_pipecat_ai-0.0.82.dev776.dist-info}/top_level.txt +0 -0
- /pipecat/{examples → extensions}/__init__.py +0 -0
pipecat/services/google/tts.py
CHANGED
|
@@ -4,7 +4,16 @@
|
|
|
4
4
|
# SPDX-License-Identifier: BSD 2-Clause License
|
|
5
5
|
#
|
|
6
6
|
|
|
7
|
-
|
|
7
|
+
"""Google Cloud Text-to-Speech service implementations.
|
|
8
|
+
|
|
9
|
+
This module provides integration with Google Cloud Text-to-Speech API,
|
|
10
|
+
offering both HTTP-based synthesis with SSML support and streaming synthesis
|
|
11
|
+
for real-time applications.
|
|
12
|
+
|
|
13
|
+
It also includes GeminiTTSService which uses Gemini's TTS-specific models
|
|
14
|
+
for natural voice control and multi-speaker conversations.
|
|
15
|
+
"""
|
|
16
|
+
|
|
8
17
|
import json
|
|
9
18
|
import os
|
|
10
19
|
|
|
@@ -13,7 +22,7 @@ from pipecat.utils.tracing.service_decorators import traced_tts
|
|
|
13
22
|
# Suppress gRPC fork warnings
|
|
14
23
|
os.environ["GRPC_ENABLE_FORK_SUPPORT"] = "false"
|
|
15
24
|
|
|
16
|
-
from typing import AsyncGenerator, Literal, Optional
|
|
25
|
+
from typing import AsyncGenerator, List, Literal, Optional
|
|
17
26
|
|
|
18
27
|
from loguru import logger
|
|
19
28
|
from pydantic import BaseModel
|
|
@@ -21,6 +30,7 @@ from pydantic import BaseModel
|
|
|
21
30
|
from pipecat.frames.frames import (
|
|
22
31
|
ErrorFrame,
|
|
23
32
|
Frame,
|
|
33
|
+
StartFrame,
|
|
24
34
|
TTSAudioRawFrame,
|
|
25
35
|
TTSStartedFrame,
|
|
26
36
|
TTSStoppedFrame,
|
|
@@ -41,8 +51,25 @@ except ModuleNotFoundError as e:
|
|
|
41
51
|
)
|
|
42
52
|
raise Exception(f"Missing module: {e}")
|
|
43
53
|
|
|
54
|
+
try:
|
|
55
|
+
from google import genai
|
|
56
|
+
from google.genai import types
|
|
57
|
+
|
|
58
|
+
except ModuleNotFoundError as e:
|
|
59
|
+
logger.error(f"Exception: {e}")
|
|
60
|
+
logger.error("In order to use Gemini TTS, you need to `pip install pipecat-ai[google]`.")
|
|
61
|
+
raise Exception(f"Missing module: {e}")
|
|
62
|
+
|
|
44
63
|
|
|
45
64
|
def language_to_google_tts_language(language: Language) -> Optional[str]:
|
|
65
|
+
"""Convert a Language enum to Google TTS language code.
|
|
66
|
+
|
|
67
|
+
Args:
|
|
68
|
+
language: The Language enum value to convert.
|
|
69
|
+
|
|
70
|
+
Returns:
|
|
71
|
+
The corresponding Google TTS language code, or None if not supported.
|
|
72
|
+
"""
|
|
46
73
|
language_map = {
|
|
47
74
|
# Afrikaans
|
|
48
75
|
Language.AF: "af-ZA",
|
|
@@ -203,7 +230,32 @@ def language_to_google_tts_language(language: Language) -> Optional[str]:
|
|
|
203
230
|
|
|
204
231
|
|
|
205
232
|
class GoogleHttpTTSService(TTSService):
|
|
233
|
+
"""Google Cloud Text-to-Speech HTTP service with SSML support.
|
|
234
|
+
|
|
235
|
+
Provides text-to-speech synthesis using Google Cloud's HTTP API with
|
|
236
|
+
comprehensive SSML support for voice customization, prosody control,
|
|
237
|
+
and styling options. Ideal for applications requiring fine-grained
|
|
238
|
+
control over speech output.
|
|
239
|
+
|
|
240
|
+
Note:
|
|
241
|
+
Requires Google Cloud credentials via service account JSON, credentials file,
|
|
242
|
+
or default application credentials (GOOGLE_APPLICATION_CREDENTIALS).
|
|
243
|
+
Chirp and Journey voices don't support SSML and will use plain text input.
|
|
244
|
+
"""
|
|
245
|
+
|
|
206
246
|
class InputParams(BaseModel):
|
|
247
|
+
"""Input parameters for Google HTTP TTS voice customization.
|
|
248
|
+
|
|
249
|
+
Parameters:
|
|
250
|
+
pitch: Voice pitch adjustment (e.g., "+2st", "-50%").
|
|
251
|
+
rate: Speaking rate adjustment (e.g., "slow", "fast", "125%").
|
|
252
|
+
volume: Volume adjustment (e.g., "loud", "soft", "+6dB").
|
|
253
|
+
emphasis: Emphasis level for the text.
|
|
254
|
+
language: Language for synthesis. Defaults to English.
|
|
255
|
+
gender: Voice gender preference.
|
|
256
|
+
google_style: Google-specific voice style.
|
|
257
|
+
"""
|
|
258
|
+
|
|
207
259
|
pitch: Optional[str] = None
|
|
208
260
|
rate: Optional[str] = None
|
|
209
261
|
volume: Optional[str] = None
|
|
@@ -222,6 +274,16 @@ class GoogleHttpTTSService(TTSService):
|
|
|
222
274
|
params: Optional[InputParams] = None,
|
|
223
275
|
**kwargs,
|
|
224
276
|
):
|
|
277
|
+
"""Initializes the Google HTTP TTS service.
|
|
278
|
+
|
|
279
|
+
Args:
|
|
280
|
+
credentials: JSON string containing Google Cloud service account credentials.
|
|
281
|
+
credentials_path: Path to Google Cloud service account JSON file.
|
|
282
|
+
voice_id: Google TTS voice identifier (e.g., "en-US-Standard-A").
|
|
283
|
+
sample_rate: Audio sample rate in Hz. If None, uses default.
|
|
284
|
+
params: Voice customization parameters including pitch, rate, volume, etc.
|
|
285
|
+
**kwargs: Additional arguments passed to parent TTSService.
|
|
286
|
+
"""
|
|
225
287
|
super().__init__(sample_rate=sample_rate, **kwargs)
|
|
226
288
|
|
|
227
289
|
params = params or GoogleHttpTTSService.InputParams()
|
|
@@ -245,11 +307,20 @@ class GoogleHttpTTSService(TTSService):
|
|
|
245
307
|
def _create_client(
|
|
246
308
|
self, credentials: Optional[str], credentials_path: Optional[str]
|
|
247
309
|
) -> texttospeech_v1.TextToSpeechAsyncClient:
|
|
310
|
+
"""Create authenticated Google Text-to-Speech client.
|
|
311
|
+
|
|
312
|
+
Args:
|
|
313
|
+
credentials: JSON string with service account credentials.
|
|
314
|
+
credentials_path: Path to service account JSON file.
|
|
315
|
+
|
|
316
|
+
Returns:
|
|
317
|
+
Authenticated TextToSpeechAsyncClient instance.
|
|
318
|
+
|
|
319
|
+
Raises:
|
|
320
|
+
ValueError: If no valid credentials are provided.
|
|
321
|
+
"""
|
|
248
322
|
creds: Optional[service_account.Credentials] = None
|
|
249
323
|
|
|
250
|
-
# Create a Google Cloud service account for the Cloud Text-to-Speech API
|
|
251
|
-
# Using either the provided credentials JSON string or the path to a service account JSON
|
|
252
|
-
# file, create a Google Cloud service account and use it to authenticate with the API.
|
|
253
324
|
if credentials:
|
|
254
325
|
# Use provided credentials JSON string
|
|
255
326
|
json_account_info = json.loads(credentials)
|
|
@@ -271,9 +342,22 @@ class GoogleHttpTTSService(TTSService):
|
|
|
271
342
|
return texttospeech_v1.TextToSpeechAsyncClient(credentials=creds)
|
|
272
343
|
|
|
273
344
|
def can_generate_metrics(self) -> bool:
|
|
345
|
+
"""Check if this service can generate processing metrics.
|
|
346
|
+
|
|
347
|
+
Returns:
|
|
348
|
+
True, as Google HTTP TTS service supports metrics generation.
|
|
349
|
+
"""
|
|
274
350
|
return True
|
|
275
351
|
|
|
276
352
|
def language_to_service_language(self, language: Language) -> Optional[str]:
|
|
353
|
+
"""Convert a Language enum to Google TTS language format.
|
|
354
|
+
|
|
355
|
+
Args:
|
|
356
|
+
language: The language to convert.
|
|
357
|
+
|
|
358
|
+
Returns:
|
|
359
|
+
The Google TTS-specific language code, or None if not supported.
|
|
360
|
+
"""
|
|
277
361
|
return language_to_google_tts_language(language)
|
|
278
362
|
|
|
279
363
|
def _construct_ssml(self, text: str) -> str:
|
|
@@ -324,6 +408,14 @@ class GoogleHttpTTSService(TTSService):
|
|
|
324
408
|
|
|
325
409
|
@traced_tts
|
|
326
410
|
async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
|
|
411
|
+
"""Generate speech from text using Google's HTTP TTS API.
|
|
412
|
+
|
|
413
|
+
Args:
|
|
414
|
+
text: The text to synthesize into speech.
|
|
415
|
+
|
|
416
|
+
Yields:
|
|
417
|
+
Frame: Audio frames containing the synthesized speech.
|
|
418
|
+
"""
|
|
327
419
|
logger.debug(f"{self}: Generating TTS [{text}]")
|
|
328
420
|
|
|
329
421
|
try:
|
|
@@ -381,25 +473,19 @@ class GoogleHttpTTSService(TTSService):
|
|
|
381
473
|
|
|
382
474
|
|
|
383
475
|
class GoogleTTSService(TTSService):
|
|
384
|
-
"""
|
|
476
|
+
"""Google Cloud Text-to-Speech streaming service.
|
|
385
477
|
|
|
386
|
-
|
|
387
|
-
for low
|
|
478
|
+
Provides real-time text-to-speech synthesis using Google Cloud's streaming API
|
|
479
|
+
for low-latency applications. Optimized for Chirp 3 HD and Journey voices
|
|
480
|
+
with continuous audio streaming capabilities.
|
|
388
481
|
|
|
389
|
-
|
|
390
|
-
credentials: JSON string containing Google Cloud service account credentials.
|
|
391
|
-
credentials_path: Path to Google Cloud service account JSON file.
|
|
392
|
-
voice_id: Google TTS voice identifier (e.g., "en-US-Chirp3-HD-Charon").
|
|
393
|
-
sample_rate: Audio sample rate in Hz.
|
|
394
|
-
params: Language only.
|
|
395
|
-
|
|
396
|
-
Notes:
|
|
482
|
+
Note:
|
|
397
483
|
Requires Google Cloud credentials via service account JSON, file path, or
|
|
398
484
|
default application credentials (GOOGLE_APPLICATION_CREDENTIALS env var).
|
|
399
485
|
Only Chirp 3 HD and Journey voices are supported. Use GoogleHttpTTSService for other voices.
|
|
400
486
|
|
|
401
|
-
Example
|
|
402
|
-
|
|
487
|
+
Example::
|
|
488
|
+
|
|
403
489
|
tts = GoogleTTSService(
|
|
404
490
|
credentials_path="/path/to/service-account.json",
|
|
405
491
|
voice_id="en-US-Chirp3-HD-Charon",
|
|
@@ -407,10 +493,15 @@ class GoogleTTSService(TTSService):
|
|
|
407
493
|
language=Language.EN_US,
|
|
408
494
|
)
|
|
409
495
|
)
|
|
410
|
-
```
|
|
411
496
|
"""
|
|
412
497
|
|
|
413
498
|
class InputParams(BaseModel):
|
|
499
|
+
"""Input parameters for Google streaming TTS configuration.
|
|
500
|
+
|
|
501
|
+
Parameters:
|
|
502
|
+
language: Language for synthesis. Defaults to English.
|
|
503
|
+
"""
|
|
504
|
+
|
|
414
505
|
language: Optional[Language] = Language.EN
|
|
415
506
|
rate: Optional[float] = 1.0
|
|
416
507
|
|
|
@@ -424,6 +515,16 @@ class GoogleTTSService(TTSService):
|
|
|
424
515
|
params: InputParams = InputParams(),
|
|
425
516
|
**kwargs,
|
|
426
517
|
):
|
|
518
|
+
"""Initializes the Google streaming TTS service.
|
|
519
|
+
|
|
520
|
+
Args:
|
|
521
|
+
credentials: JSON string containing Google Cloud service account credentials.
|
|
522
|
+
credentials_path: Path to Google Cloud service account JSON file.
|
|
523
|
+
voice_id: Google TTS voice identifier (e.g., "en-US-Chirp3-HD-Charon").
|
|
524
|
+
sample_rate: Audio sample rate in Hz. If None, uses default.
|
|
525
|
+
params: Language configuration parameters.
|
|
526
|
+
**kwargs: Additional arguments passed to parent TTSService.
|
|
527
|
+
"""
|
|
427
528
|
super().__init__(sample_rate=sample_rate, **kwargs)
|
|
428
529
|
|
|
429
530
|
params = params or GoogleTTSService.InputParams()
|
|
@@ -482,13 +583,34 @@ class GoogleTTSService(TTSService):
|
|
|
482
583
|
return texttospeech_v1.TextToSpeechAsyncClient(credentials=creds)
|
|
483
584
|
|
|
484
585
|
def can_generate_metrics(self) -> bool:
|
|
586
|
+
"""Check if this service can generate processing metrics.
|
|
587
|
+
|
|
588
|
+
Returns:
|
|
589
|
+
True, as Google streaming TTS service supports metrics generation.
|
|
590
|
+
"""
|
|
485
591
|
return True
|
|
486
592
|
|
|
487
593
|
def language_to_service_language(self, language: Language) -> Optional[str]:
|
|
594
|
+
"""Convert a Language enum to Google TTS language format.
|
|
595
|
+
|
|
596
|
+
Args:
|
|
597
|
+
language: The language to convert.
|
|
598
|
+
|
|
599
|
+
Returns:
|
|
600
|
+
The Google TTS-specific language code, or None if not supported.
|
|
601
|
+
"""
|
|
488
602
|
return language_to_google_tts_language(language)
|
|
489
603
|
|
|
490
604
|
@traced_tts
|
|
491
605
|
async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
|
|
606
|
+
"""Generate streaming speech from text using Google's streaming API.
|
|
607
|
+
|
|
608
|
+
Args:
|
|
609
|
+
text: The text to synthesize into speech.
|
|
610
|
+
|
|
611
|
+
Yields:
|
|
612
|
+
Frame: Audio frames containing the synthesized speech as it's generated.
|
|
613
|
+
"""
|
|
492
614
|
logger.debug(f"{self}: Generating TTS [{text}]")
|
|
493
615
|
|
|
494
616
|
try:
|
|
@@ -553,3 +675,252 @@ class GoogleTTSService(TTSService):
|
|
|
553
675
|
logger.exception(f"{self} error generating TTS: {e}")
|
|
554
676
|
error_message = f"TTS generation error: {str(e)}"
|
|
555
677
|
yield ErrorFrame(error=error_message)
|
|
678
|
+
|
|
679
|
+
|
|
680
|
+
class GeminiTTSService(TTSService):
    """Gemini Text-to-Speech service using Gemini TTS models.

    Synthesizes speech through the Gemini API's TTS-specific models
    (gemini-2.5-flash-preview-tts and gemini-2.5-pro-preview-tts). A request
    is sent as a single ``generate_content`` call; the returned audio payload
    is then split into ``chunk_size``-byte frames. Supports one prebuilt voice
    or a multi-speaker configuration.

    Note:
        Requires Google AI API key. This uses the Gemini API, not Google Cloud TTS.
        Audio-out is currently a preview feature.

    Example::

        tts = GeminiTTSService(
            api_key="your-google-ai-api-key",
            model="gemini-2.5-flash-preview-tts",
            voice_id="Kore",
            params=GeminiTTSService.InputParams(
                language=Language.EN_US,
            )
        )
    """

    GOOGLE_SAMPLE_RATE = 24000  # Google TTS always outputs at 24kHz

    # Prebuilt voice names accepted by the Gemini TTS models. Used only to warn
    # about unknown names; an unlisted voice is still passed through to the API.
    AVAILABLE_VOICES = [
        "Zephyr",
        "Puck",
        "Charon",
        "Kore",
        "Fenrir",
        "Leda",
        "Orus",
        "Aoede",
        "Callirrhoe",
        "Autonoe",
        "Enceladus",
        "Iapetus",
        "Umbriel",
        "Algieba",
        "Despina",
        "Erinome",
        "Algenib",
        "Rasalgethi",
        "Laomedeia",
        "Achernar",
        "Alnilam",
        "Schedar",
        "Gacrux",
        "Pulcherrima",
        "Achird",
        "Zubenelgenubi",
        "Vindemiatrix",
        "Sadachbia",
        "Sadaltager",
        "Sulafat",
    ]

    class InputParams(BaseModel):
        """Input parameters for Gemini TTS configuration.

        Parameters:
            language: Language for synthesis. Defaults to English.
            multi_speaker: Whether to enable multi-speaker support.
            speaker_configs: List of speaker configurations for multi-speaker mode.
                Each entry is a dict with a required "speaker" key and an
                optional "voice_id" key (falls back to the service voice).
        """

        language: Optional[Language] = Language.EN
        multi_speaker: bool = False
        speaker_configs: Optional[List[dict]] = None

    def __init__(
        self,
        *,
        api_key: str,
        model: str = "gemini-2.5-flash-preview-tts",
        voice_id: str = "Kore",
        sample_rate: Optional[int] = None,
        params: Optional[InputParams] = None,
        **kwargs,
    ):
        """Initializes the Gemini TTS service.

        Args:
            api_key: Google AI API key for authentication.
            model: Gemini TTS model to use. Must be a TTS model like
                "gemini-2.5-flash-preview-tts" or "gemini-2.5-pro-preview-tts".
            voice_id: Voice name from the available Gemini voices.
            sample_rate: Audio sample rate in Hz. If None, uses Google's default 24kHz.
            params: TTS configuration parameters.
            **kwargs: Additional arguments passed to parent TTSService.
        """
        if sample_rate and sample_rate != self.GOOGLE_SAMPLE_RATE:
            logger.warning(
                f"Google TTS only supports {self.GOOGLE_SAMPLE_RATE}Hz sample rate. "
                f"Current rate of {sample_rate}Hz may cause issues."
            )
        super().__init__(sample_rate=sample_rate, **kwargs)

        params = params or GeminiTTSService.InputParams()

        if voice_id not in self.AVAILABLE_VOICES:
            logger.warning(f"Voice '{voice_id}' not in known voices list. Using anyway.")

        self._api_key = api_key
        self._model = model
        self._voice_id = voice_id
        self._settings = {
            "language": self.language_to_service_language(params.language)
            if params.language
            else "en-US",
            "multi_speaker": params.multi_speaker,
            "speaker_configs": params.speaker_configs,
        }

        self._client = genai.Client(api_key=api_key)

    def can_generate_metrics(self) -> bool:
        """Check if this service can generate processing metrics.

        Returns:
            True, as Gemini TTS service supports metrics generation.
        """
        return True

    def language_to_service_language(self, language: Language) -> Optional[str]:
        """Convert a Language enum to Gemini TTS language format.

        Args:
            language: The language to convert.

        Returns:
            The Gemini TTS-specific language code, or None if not supported.
        """
        return language_to_google_tts_language(language)

    def set_voice(self, voice_id: str):
        """Set the voice for TTS generation.

        Args:
            voice_id: Name of the voice to use from AVAILABLE_VOICES. Unknown
                names only produce a warning and are still applied.
        """
        if voice_id not in self.AVAILABLE_VOICES:
            logger.warning(f"Voice '{voice_id}' not in known voices list. Using anyway.")
        self._voice_id = voice_id

    async def start(self, frame: StartFrame):
        """Start the Gemini TTS service.

        Args:
            frame: The start frame containing initialization parameters.
        """
        await super().start(frame)
        if self.sample_rate != self.GOOGLE_SAMPLE_RATE:
            logger.warning(
                f"Google TTS requires {self.GOOGLE_SAMPLE_RATE}Hz sample rate. "
                f"Current rate of {self.sample_rate}Hz may cause issues."
            )

    def _build_speech_config(self):
        """Build the SpeechConfig for the current voice / multi-speaker settings.

        Raises:
            KeyError: If a multi-speaker entry has no "speaker" key.
        """
        speaker_configs = self._settings["speaker_configs"]
        if self._settings["multi_speaker"] and speaker_configs:
            # Multi-speaker mode: one voice per configured speaker, falling
            # back to the service-level voice when an entry names none.
            speaker_voice_configs = [
                types.SpeakerVoiceConfig(
                    speaker=speaker_config["speaker"],
                    voice_config=types.VoiceConfig(
                        prebuilt_voice_config=types.PrebuiltVoiceConfig(
                            voice_name=speaker_config.get("voice_id", self._voice_id)
                        )
                    ),
                )
                for speaker_config in speaker_configs
            ]
            return types.SpeechConfig(
                multi_speaker_voice_config=types.MultiSpeakerVoiceConfig(
                    speaker_voice_configs=speaker_voice_configs
                )
            )

        # Single speaker mode
        return types.SpeechConfig(
            voice_config=types.VoiceConfig(
                prebuilt_voice_config=types.PrebuiltVoiceConfig(voice_name=self._voice_id)
            )
        )

    @staticmethod
    def _iter_audio_payloads(response):
        """Yield the raw audio bytes of each audio part in the first candidate."""
        candidates = response.candidates
        if not candidates:
            return
        content = candidates[0].content
        if not content or not content.parts:
            return
        for part in content.parts:
            blob = part.inline_data
            # mime_type and data may both be unset on a part; treat such a
            # part as "not audio" instead of raising on None.
            if blob and blob.data and (blob.mime_type or "").startswith("audio/"):
                yield blob.data

    @traced_tts
    async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
        """Generate speech from text using Gemini TTS models.

        Args:
            text: The text to synthesize into speech. Can include natural language
                instructions for style, tone, etc.

        Yields:
            Frame: A TTSStartedFrame, the audio split into TTSAudioRawFrame
            chunks, then a TTSStoppedFrame. Any exception raised while building
            the request or reading the response is logged and yielded as an
            ErrorFrame instead.
        """
        logger.debug(f"{self}: Generating TTS [{text}]")

        try:
            await self.start_ttfb_metrics()

            # Create the generation config
            generation_config = types.GenerateContentConfig(
                response_modalities=["AUDIO"],
                speech_config=self._build_speech_config(),
            )

            # Generate the content (the whole utterance comes back in one response)
            response = await self._client.aio.models.generate_content(
                model=self._model,
                contents=text,
                config=generation_config,
            )

            await self.start_tts_usage_metrics(text)

            yield TTSStartedFrame()

            produced_audio = False
            chunk_size = self.chunk_size
            for audio_data in self._iter_audio_payloads(response):
                produced_audio = True
                await self.stop_ttfb_metrics()

                # Gemini TTS returns PCM audio at GOOGLE_SAMPLE_RATE regardless
                # of the pipeline's configured rate, so tag each frame with the
                # rate the samples actually have rather than self.sample_rate.
                # NOTE(review): assumes downstream consumers honor
                # frame.sample_rate (e.g. resample on output) - confirm.
                for offset in range(0, len(audio_data), chunk_size):
                    yield TTSAudioRawFrame(
                        audio_data[offset : offset + chunk_size],
                        self.GOOGLE_SAMPLE_RATE,
                        1,
                    )

            if not produced_audio:
                logger.warning(f"{self}: Gemini TTS response contained no audio data")

            yield TTSStoppedFrame()

        except Exception as e:
            logger.exception(f"{self} error generating TTS: {e}")
            error_message = f"Gemini TTS generation error: {str(e)}"
            yield ErrorFrame(error=error_message)
|
pipecat/services/grok/llm.py
CHANGED
|
@@ -67,12 +67,6 @@ class GrokLLMService(OpenAILLMService):
|
|
|
67
67
|
maintaining full compatibility with OpenAI's interface and functionality.
|
|
68
68
|
Includes specialized token usage tracking that accumulates metrics during
|
|
69
69
|
processing and reports final totals.
|
|
70
|
-
|
|
71
|
-
Args:
|
|
72
|
-
api_key: The API key for accessing Grok's API.
|
|
73
|
-
base_url: The base URL for Grok API. Defaults to "https://api.x.ai/v1".
|
|
74
|
-
model: The model identifier to use. Defaults to "grok-3-beta".
|
|
75
|
-
**kwargs: Additional keyword arguments passed to OpenAILLMService.
|
|
76
70
|
"""
|
|
77
71
|
|
|
78
72
|
def __init__(
|
|
@@ -83,6 +77,14 @@ class GrokLLMService(OpenAILLMService):
|
|
|
83
77
|
model: str = "grok-3-beta",
|
|
84
78
|
**kwargs,
|
|
85
79
|
):
|
|
80
|
+
"""Initialize the GrokLLMService with API key and model.
|
|
81
|
+
|
|
82
|
+
Args:
|
|
83
|
+
api_key: The API key for accessing Grok's API.
|
|
84
|
+
base_url: The base URL for Grok API. Defaults to "https://api.x.ai/v1".
|
|
85
|
+
model: The model identifier to use. Defaults to "grok-3-beta".
|
|
86
|
+
**kwargs: Additional keyword arguments passed to OpenAILLMService.
|
|
87
|
+
"""
|
|
86
88
|
super().__init__(api_key=api_key, base_url=base_url, model=model, **kwargs)
|
|
87
89
|
# Initialize counters for token usage metrics
|
|
88
90
|
self._prompt_tokens = 0
|
pipecat/services/groq/llm.py
CHANGED
|
@@ -16,12 +16,6 @@ class GroqLLMService(OpenAILLMService):
|
|
|
16
16
|
|
|
17
17
|
This service extends OpenAILLMService to connect to Groq's API endpoint while
|
|
18
18
|
maintaining full compatibility with OpenAI's interface and functionality.
|
|
19
|
-
|
|
20
|
-
Args:
|
|
21
|
-
api_key: The API key for accessing Groq's API.
|
|
22
|
-
base_url: The base URL for Groq API. Defaults to "https://api.groq.com/openai/v1".
|
|
23
|
-
model: The model identifier to use. Defaults to "llama-3.3-70b-versatile".
|
|
24
|
-
**kwargs: Additional keyword arguments passed to OpenAILLMService.
|
|
25
19
|
"""
|
|
26
20
|
|
|
27
21
|
def __init__(
|
|
@@ -32,6 +26,14 @@ class GroqLLMService(OpenAILLMService):
|
|
|
32
26
|
model: str = "llama-3.3-70b-versatile",
|
|
33
27
|
**kwargs,
|
|
34
28
|
):
|
|
29
|
+
"""Initialize Groq LLM service.
|
|
30
|
+
|
|
31
|
+
Args:
|
|
32
|
+
api_key: The API key for accessing Groq's API.
|
|
33
|
+
base_url: The base URL for Groq API. Defaults to "https://api.groq.com/openai/v1".
|
|
34
|
+
model: The model identifier to use. Defaults to "llama-3.3-70b-versatile".
|
|
35
|
+
**kwargs: Additional keyword arguments passed to OpenAILLMService.
|
|
36
|
+
"""
|
|
35
37
|
super().__init__(api_key=api_key, base_url=base_url, model=model, **kwargs)
|
|
36
38
|
|
|
37
39
|
def create_client(self, api_key=None, base_url=None, **kwargs):
|
pipecat/services/groq/stt.py
CHANGED
|
@@ -4,6 +4,8 @@
|
|
|
4
4
|
# SPDX-License-Identifier: BSD 2-Clause License
|
|
5
5
|
#
|
|
6
6
|
|
|
7
|
+
"""Groq speech-to-text service implementation using Whisper models."""
|
|
8
|
+
|
|
7
9
|
from typing import Optional
|
|
8
10
|
|
|
9
11
|
from pipecat.services.whisper.base_stt import BaseWhisperSTTService, Transcription
|
|
@@ -15,15 +17,6 @@ class GroqSTTService(BaseWhisperSTTService):
|
|
|
15
17
|
|
|
16
18
|
Uses Groq's Whisper API to convert audio to text. Requires a Groq API key
|
|
17
19
|
set via the api_key parameter or GROQ_API_KEY environment variable.
|
|
18
|
-
|
|
19
|
-
Args:
|
|
20
|
-
model: Whisper model to use. Defaults to "whisper-large-v3-turbo".
|
|
21
|
-
api_key: Groq API key. Defaults to None.
|
|
22
|
-
base_url: API base URL. Defaults to "https://api.groq.com/openai/v1".
|
|
23
|
-
language: Language of the audio input. Defaults to English.
|
|
24
|
-
prompt: Optional text to guide the model's style or continue a previous segment.
|
|
25
|
-
temperature: Optional sampling temperature between 0 and 1. Defaults to 0.0.
|
|
26
|
-
**kwargs: Additional arguments passed to BaseWhisperSTTService.
|
|
27
20
|
"""
|
|
28
21
|
|
|
29
22
|
def __init__(
|
|
@@ -37,6 +30,17 @@ class GroqSTTService(BaseWhisperSTTService):
|
|
|
37
30
|
temperature: Optional[float] = None,
|
|
38
31
|
**kwargs,
|
|
39
32
|
):
|
|
33
|
+
"""Initialize Groq STT service.
|
|
34
|
+
|
|
35
|
+
Args:
|
|
36
|
+
model: Whisper model to use. Defaults to "whisper-large-v3-turbo".
|
|
37
|
+
api_key: Groq API key. Defaults to None.
|
|
38
|
+
base_url: API base URL. Defaults to "https://api.groq.com/openai/v1".
|
|
39
|
+
language: Language of the audio input. Defaults to English.
|
|
40
|
+
prompt: Optional text to guide the model's style or continue a previous segment.
|
|
41
|
+
temperature: Optional sampling temperature between 0 and 1. Defaults to 0.0.
|
|
42
|
+
**kwargs: Additional arguments passed to BaseWhisperSTTService.
|
|
43
|
+
"""
|
|
40
44
|
super().__init__(
|
|
41
45
|
model=model,
|
|
42
46
|
api_key=api_key,
|
pipecat/services/groq/tts.py
CHANGED
|
@@ -4,6 +4,8 @@
|
|
|
4
4
|
# SPDX-License-Identifier: BSD 2-Clause License
|
|
5
5
|
#
|
|
6
6
|
|
|
7
|
+
"""Groq text-to-speech service implementation."""
|
|
8
|
+
|
|
7
9
|
import io
|
|
8
10
|
import wave
|
|
9
11
|
from typing import AsyncGenerator, Optional
|
|
@@ -25,7 +27,21 @@ except ModuleNotFoundError as e:
|
|
|
25
27
|
|
|
26
28
|
|
|
27
29
|
class GroqTTSService(TTSService):
|
|
30
|
+
"""Groq text-to-speech service implementation.
|
|
31
|
+
|
|
32
|
+
Provides text-to-speech synthesis using Groq's TTS API. The service
|
|
33
|
+
operates at a fixed 48kHz sample rate and supports various voices
|
|
34
|
+
and output formats.
|
|
35
|
+
"""
|
|
36
|
+
|
|
28
37
|
class InputParams(BaseModel):
|
|
38
|
+
"""Input parameters for Groq TTS configuration.
|
|
39
|
+
|
|
40
|
+
Parameters:
|
|
41
|
+
language: Language for speech synthesis. Defaults to English.
|
|
42
|
+
speed: Speech speed multiplier. Defaults to 1.0.
|
|
43
|
+
"""
|
|
44
|
+
|
|
29
45
|
language: Optional[Language] = Language.EN
|
|
30
46
|
speed: Optional[float] = 1.0
|
|
31
47
|
|
|
@@ -42,6 +58,17 @@ class GroqTTSService(TTSService):
|
|
|
42
58
|
sample_rate: Optional[int] = GROQ_SAMPLE_RATE,
|
|
43
59
|
**kwargs,
|
|
44
60
|
):
|
|
61
|
+
"""Initialize Groq TTS service.
|
|
62
|
+
|
|
63
|
+
Args:
|
|
64
|
+
api_key: Groq API key for authentication.
|
|
65
|
+
output_format: Audio output format. Defaults to "wav".
|
|
66
|
+
params: Additional input parameters for voice customization.
|
|
67
|
+
model_name: TTS model to use. Defaults to "playai-tts".
|
|
68
|
+
voice_id: Voice identifier to use. Defaults to "Celeste-PlayAI".
|
|
69
|
+
sample_rate: Audio sample rate. Must be 48000 Hz for Groq TTS.
|
|
70
|
+
**kwargs: Additional arguments passed to parent TTSService class.
|
|
71
|
+
"""
|
|
45
72
|
if sample_rate != self.GROQ_SAMPLE_RATE:
|
|
46
73
|
logger.warning(f"Groq TTS only supports {self.GROQ_SAMPLE_RATE}Hz sample rate. ")
|
|
47
74
|
|
|
@@ -71,10 +98,23 @@ class GroqTTSService(TTSService):
|
|
|
71
98
|
self._client = AsyncGroq(api_key=self._api_key)
|
|
72
99
|
|
|
73
100
|
def can_generate_metrics(self) -> bool:
|
|
101
|
+
"""Check if this service can generate processing metrics.
|
|
102
|
+
|
|
103
|
+
Returns:
|
|
104
|
+
True, as Groq TTS service supports metrics generation.
|
|
105
|
+
"""
|
|
74
106
|
return True
|
|
75
107
|
|
|
76
108
|
@traced_tts
|
|
77
109
|
async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
|
|
110
|
+
"""Generate speech from text using Groq's TTS API.
|
|
111
|
+
|
|
112
|
+
Args:
|
|
113
|
+
text: The text to synthesize into speech.
|
|
114
|
+
|
|
115
|
+
Yields:
|
|
116
|
+
Frame: Audio frames containing the synthesized speech data.
|
|
117
|
+
"""
|
|
78
118
|
logger.debug(f"{self}: Generating TTS [{text}]")
|
|
79
119
|
measuring_ttfb = True
|
|
80
120
|
await self.start_ttfb_metrics()
|