dv-pipecat-ai 0.0.74.dev770__py3-none-any.whl → 0.0.82.dev776__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of dv-pipecat-ai might be problematic. Click here for more details.

Files changed (244)
  1. {dv_pipecat_ai-0.0.74.dev770.dist-info → dv_pipecat_ai-0.0.82.dev776.dist-info}/METADATA +137 -93
  2. dv_pipecat_ai-0.0.82.dev776.dist-info/RECORD +340 -0
  3. pipecat/__init__.py +17 -0
  4. pipecat/adapters/base_llm_adapter.py +36 -1
  5. pipecat/adapters/schemas/direct_function.py +296 -0
  6. pipecat/adapters/schemas/function_schema.py +15 -6
  7. pipecat/adapters/schemas/tools_schema.py +55 -7
  8. pipecat/adapters/services/anthropic_adapter.py +22 -3
  9. pipecat/adapters/services/aws_nova_sonic_adapter.py +23 -3
  10. pipecat/adapters/services/bedrock_adapter.py +22 -3
  11. pipecat/adapters/services/gemini_adapter.py +16 -3
  12. pipecat/adapters/services/open_ai_adapter.py +17 -2
  13. pipecat/adapters/services/open_ai_realtime_adapter.py +23 -3
  14. pipecat/audio/filters/base_audio_filter.py +30 -6
  15. pipecat/audio/filters/koala_filter.py +37 -2
  16. pipecat/audio/filters/krisp_filter.py +59 -6
  17. pipecat/audio/filters/noisereduce_filter.py +37 -0
  18. pipecat/audio/interruptions/base_interruption_strategy.py +25 -5
  19. pipecat/audio/interruptions/min_words_interruption_strategy.py +21 -4
  20. pipecat/audio/mixers/base_audio_mixer.py +30 -7
  21. pipecat/audio/mixers/soundfile_mixer.py +53 -6
  22. pipecat/audio/resamplers/base_audio_resampler.py +17 -9
  23. pipecat/audio/resamplers/resampy_resampler.py +26 -1
  24. pipecat/audio/resamplers/soxr_resampler.py +32 -1
  25. pipecat/audio/resamplers/soxr_stream_resampler.py +101 -0
  26. pipecat/audio/utils.py +194 -1
  27. pipecat/audio/vad/silero.py +60 -3
  28. pipecat/audio/vad/vad_analyzer.py +114 -30
  29. pipecat/clocks/base_clock.py +19 -0
  30. pipecat/clocks/system_clock.py +25 -0
  31. pipecat/extensions/voicemail/__init__.py +0 -0
  32. pipecat/extensions/voicemail/voicemail_detector.py +707 -0
  33. pipecat/frames/frames.py +590 -156
  34. pipecat/metrics/metrics.py +64 -1
  35. pipecat/observers/base_observer.py +58 -19
  36. pipecat/observers/loggers/debug_log_observer.py +56 -64
  37. pipecat/observers/loggers/llm_log_observer.py +8 -1
  38. pipecat/observers/loggers/transcription_log_observer.py +19 -7
  39. pipecat/observers/loggers/user_bot_latency_log_observer.py +32 -5
  40. pipecat/observers/turn_tracking_observer.py +26 -1
  41. pipecat/pipeline/base_pipeline.py +5 -7
  42. pipecat/pipeline/base_task.py +52 -9
  43. pipecat/pipeline/parallel_pipeline.py +121 -177
  44. pipecat/pipeline/pipeline.py +129 -20
  45. pipecat/pipeline/runner.py +50 -1
  46. pipecat/pipeline/sync_parallel_pipeline.py +132 -32
  47. pipecat/pipeline/task.py +263 -280
  48. pipecat/pipeline/task_observer.py +85 -34
  49. pipecat/pipeline/to_be_updated/merge_pipeline.py +32 -2
  50. pipecat/processors/aggregators/dtmf_aggregator.py +29 -22
  51. pipecat/processors/aggregators/gated.py +25 -24
  52. pipecat/processors/aggregators/gated_openai_llm_context.py +22 -2
  53. pipecat/processors/aggregators/llm_response.py +398 -89
  54. pipecat/processors/aggregators/openai_llm_context.py +161 -13
  55. pipecat/processors/aggregators/sentence.py +25 -14
  56. pipecat/processors/aggregators/user_response.py +28 -3
  57. pipecat/processors/aggregators/vision_image_frame.py +24 -14
  58. pipecat/processors/async_generator.py +28 -0
  59. pipecat/processors/audio/audio_buffer_processor.py +78 -37
  60. pipecat/processors/consumer_processor.py +25 -6
  61. pipecat/processors/filters/frame_filter.py +23 -0
  62. pipecat/processors/filters/function_filter.py +30 -0
  63. pipecat/processors/filters/identity_filter.py +17 -2
  64. pipecat/processors/filters/null_filter.py +24 -1
  65. pipecat/processors/filters/stt_mute_filter.py +56 -21
  66. pipecat/processors/filters/wake_check_filter.py +46 -3
  67. pipecat/processors/filters/wake_notifier_filter.py +21 -3
  68. pipecat/processors/frame_processor.py +488 -131
  69. pipecat/processors/frameworks/langchain.py +38 -3
  70. pipecat/processors/frameworks/rtvi.py +719 -34
  71. pipecat/processors/gstreamer/pipeline_source.py +41 -0
  72. pipecat/processors/idle_frame_processor.py +26 -3
  73. pipecat/processors/logger.py +23 -0
  74. pipecat/processors/metrics/frame_processor_metrics.py +77 -4
  75. pipecat/processors/metrics/sentry.py +42 -4
  76. pipecat/processors/producer_processor.py +34 -14
  77. pipecat/processors/text_transformer.py +22 -10
  78. pipecat/processors/transcript_processor.py +48 -29
  79. pipecat/processors/user_idle_processor.py +31 -21
  80. pipecat/runner/__init__.py +1 -0
  81. pipecat/runner/daily.py +132 -0
  82. pipecat/runner/livekit.py +148 -0
  83. pipecat/runner/run.py +543 -0
  84. pipecat/runner/types.py +67 -0
  85. pipecat/runner/utils.py +515 -0
  86. pipecat/serializers/base_serializer.py +42 -0
  87. pipecat/serializers/exotel.py +17 -6
  88. pipecat/serializers/genesys.py +95 -0
  89. pipecat/serializers/livekit.py +33 -0
  90. pipecat/serializers/plivo.py +16 -15
  91. pipecat/serializers/protobuf.py +37 -1
  92. pipecat/serializers/telnyx.py +18 -17
  93. pipecat/serializers/twilio.py +32 -16
  94. pipecat/services/ai_service.py +5 -3
  95. pipecat/services/anthropic/llm.py +113 -43
  96. pipecat/services/assemblyai/models.py +63 -5
  97. pipecat/services/assemblyai/stt.py +64 -11
  98. pipecat/services/asyncai/__init__.py +0 -0
  99. pipecat/services/asyncai/tts.py +501 -0
  100. pipecat/services/aws/llm.py +185 -111
  101. pipecat/services/aws/stt.py +217 -23
  102. pipecat/services/aws/tts.py +118 -52
  103. pipecat/services/aws/utils.py +101 -5
  104. pipecat/services/aws_nova_sonic/aws.py +82 -64
  105. pipecat/services/aws_nova_sonic/context.py +15 -6
  106. pipecat/services/azure/common.py +10 -2
  107. pipecat/services/azure/image.py +32 -0
  108. pipecat/services/azure/llm.py +9 -7
  109. pipecat/services/azure/stt.py +65 -2
  110. pipecat/services/azure/tts.py +154 -23
  111. pipecat/services/cartesia/stt.py +125 -8
  112. pipecat/services/cartesia/tts.py +102 -38
  113. pipecat/services/cerebras/llm.py +15 -23
  114. pipecat/services/deepgram/stt.py +19 -11
  115. pipecat/services/deepgram/tts.py +36 -0
  116. pipecat/services/deepseek/llm.py +14 -23
  117. pipecat/services/elevenlabs/tts.py +330 -64
  118. pipecat/services/fal/image.py +43 -0
  119. pipecat/services/fal/stt.py +48 -10
  120. pipecat/services/fireworks/llm.py +14 -21
  121. pipecat/services/fish/tts.py +109 -9
  122. pipecat/services/gemini_multimodal_live/__init__.py +1 -0
  123. pipecat/services/gemini_multimodal_live/events.py +83 -2
  124. pipecat/services/gemini_multimodal_live/file_api.py +189 -0
  125. pipecat/services/gemini_multimodal_live/gemini.py +218 -21
  126. pipecat/services/gladia/config.py +17 -10
  127. pipecat/services/gladia/stt.py +82 -36
  128. pipecat/services/google/frames.py +40 -0
  129. pipecat/services/google/google.py +2 -0
  130. pipecat/services/google/image.py +39 -2
  131. pipecat/services/google/llm.py +176 -58
  132. pipecat/services/google/llm_openai.py +26 -4
  133. pipecat/services/google/llm_vertex.py +37 -15
  134. pipecat/services/google/rtvi.py +41 -0
  135. pipecat/services/google/stt.py +65 -17
  136. pipecat/services/google/test-google-chirp.py +45 -0
  137. pipecat/services/google/tts.py +390 -19
  138. pipecat/services/grok/llm.py +8 -6
  139. pipecat/services/groq/llm.py +8 -6
  140. pipecat/services/groq/stt.py +13 -9
  141. pipecat/services/groq/tts.py +40 -0
  142. pipecat/services/hamsa/__init__.py +9 -0
  143. pipecat/services/hamsa/stt.py +241 -0
  144. pipecat/services/heygen/__init__.py +5 -0
  145. pipecat/services/heygen/api.py +281 -0
  146. pipecat/services/heygen/client.py +620 -0
  147. pipecat/services/heygen/video.py +338 -0
  148. pipecat/services/image_service.py +5 -3
  149. pipecat/services/inworld/__init__.py +1 -0
  150. pipecat/services/inworld/tts.py +592 -0
  151. pipecat/services/llm_service.py +127 -45
  152. pipecat/services/lmnt/tts.py +80 -7
  153. pipecat/services/mcp_service.py +85 -44
  154. pipecat/services/mem0/memory.py +42 -13
  155. pipecat/services/minimax/tts.py +74 -15
  156. pipecat/services/mistral/__init__.py +0 -0
  157. pipecat/services/mistral/llm.py +185 -0
  158. pipecat/services/moondream/vision.py +55 -10
  159. pipecat/services/neuphonic/tts.py +275 -48
  160. pipecat/services/nim/llm.py +8 -6
  161. pipecat/services/ollama/llm.py +27 -7
  162. pipecat/services/openai/base_llm.py +54 -16
  163. pipecat/services/openai/image.py +30 -0
  164. pipecat/services/openai/llm.py +7 -5
  165. pipecat/services/openai/stt.py +13 -9
  166. pipecat/services/openai/tts.py +42 -10
  167. pipecat/services/openai_realtime_beta/azure.py +11 -9
  168. pipecat/services/openai_realtime_beta/context.py +7 -5
  169. pipecat/services/openai_realtime_beta/events.py +10 -7
  170. pipecat/services/openai_realtime_beta/openai.py +37 -18
  171. pipecat/services/openpipe/llm.py +30 -24
  172. pipecat/services/openrouter/llm.py +9 -7
  173. pipecat/services/perplexity/llm.py +15 -19
  174. pipecat/services/piper/tts.py +26 -12
  175. pipecat/services/playht/tts.py +227 -65
  176. pipecat/services/qwen/llm.py +8 -6
  177. pipecat/services/rime/tts.py +128 -17
  178. pipecat/services/riva/stt.py +160 -22
  179. pipecat/services/riva/tts.py +67 -2
  180. pipecat/services/sambanova/llm.py +19 -17
  181. pipecat/services/sambanova/stt.py +14 -8
  182. pipecat/services/sarvam/tts.py +60 -13
  183. pipecat/services/simli/video.py +82 -21
  184. pipecat/services/soniox/__init__.py +0 -0
  185. pipecat/services/soniox/stt.py +398 -0
  186. pipecat/services/speechmatics/stt.py +29 -17
  187. pipecat/services/stt_service.py +47 -11
  188. pipecat/services/tavus/video.py +94 -25
  189. pipecat/services/together/llm.py +8 -6
  190. pipecat/services/tts_service.py +77 -53
  191. pipecat/services/ultravox/stt.py +46 -43
  192. pipecat/services/vision_service.py +5 -3
  193. pipecat/services/websocket_service.py +12 -11
  194. pipecat/services/whisper/base_stt.py +58 -12
  195. pipecat/services/whisper/stt.py +69 -58
  196. pipecat/services/xtts/tts.py +59 -2
  197. pipecat/sync/base_notifier.py +19 -0
  198. pipecat/sync/event_notifier.py +24 -0
  199. pipecat/tests/utils.py +73 -5
  200. pipecat/transcriptions/language.py +24 -0
  201. pipecat/transports/base_input.py +112 -8
  202. pipecat/transports/base_output.py +235 -13
  203. pipecat/transports/base_transport.py +119 -0
  204. pipecat/transports/local/audio.py +76 -0
  205. pipecat/transports/local/tk.py +84 -0
  206. pipecat/transports/network/fastapi_websocket.py +174 -15
  207. pipecat/transports/network/small_webrtc.py +383 -39
  208. pipecat/transports/network/webrtc_connection.py +214 -8
  209. pipecat/transports/network/websocket_client.py +171 -1
  210. pipecat/transports/network/websocket_server.py +147 -9
  211. pipecat/transports/services/daily.py +792 -70
  212. pipecat/transports/services/helpers/daily_rest.py +122 -129
  213. pipecat/transports/services/livekit.py +339 -4
  214. pipecat/transports/services/tavus.py +273 -38
  215. pipecat/utils/asyncio/task_manager.py +92 -186
  216. pipecat/utils/base_object.py +83 -1
  217. pipecat/utils/network.py +2 -0
  218. pipecat/utils/string.py +114 -58
  219. pipecat/utils/text/base_text_aggregator.py +44 -13
  220. pipecat/utils/text/base_text_filter.py +46 -0
  221. pipecat/utils/text/markdown_text_filter.py +70 -14
  222. pipecat/utils/text/pattern_pair_aggregator.py +18 -14
  223. pipecat/utils/text/simple_text_aggregator.py +43 -2
  224. pipecat/utils/text/skip_tags_aggregator.py +21 -13
  225. pipecat/utils/time.py +36 -0
  226. pipecat/utils/tracing/class_decorators.py +32 -7
  227. pipecat/utils/tracing/conversation_context_provider.py +12 -2
  228. pipecat/utils/tracing/service_attributes.py +80 -64
  229. pipecat/utils/tracing/service_decorators.py +48 -21
  230. pipecat/utils/tracing/setup.py +13 -7
  231. pipecat/utils/tracing/turn_context_provider.py +12 -2
  232. pipecat/utils/tracing/turn_trace_observer.py +27 -0
  233. pipecat/utils/utils.py +14 -14
  234. dv_pipecat_ai-0.0.74.dev770.dist-info/RECORD +0 -319
  235. pipecat/examples/daily_runner.py +0 -64
  236. pipecat/examples/run.py +0 -265
  237. pipecat/utils/asyncio/watchdog_async_iterator.py +0 -72
  238. pipecat/utils/asyncio/watchdog_event.py +0 -42
  239. pipecat/utils/asyncio/watchdog_priority_queue.py +0 -48
  240. pipecat/utils/asyncio/watchdog_queue.py +0 -48
  241. {dv_pipecat_ai-0.0.74.dev770.dist-info → dv_pipecat_ai-0.0.82.dev776.dist-info}/WHEEL +0 -0
  242. {dv_pipecat_ai-0.0.74.dev770.dist-info → dv_pipecat_ai-0.0.82.dev776.dist-info}/licenses/LICENSE +0 -0
  243. {dv_pipecat_ai-0.0.74.dev770.dist-info → dv_pipecat_ai-0.0.82.dev776.dist-info}/top_level.txt +0 -0
  244. /pipecat/{examples → extensions}/__init__.py +0 -0
@@ -4,7 +4,16 @@
4
4
  # SPDX-License-Identifier: BSD 2-Clause License
5
5
  #
6
6
 
7
- import asyncio
7
+ """Google Cloud Text-to-Speech service implementations.
8
+
9
+ This module provides integration with Google Cloud Text-to-Speech API,
10
+ offering both HTTP-based synthesis with SSML support and streaming synthesis
11
+ for real-time applications.
12
+
13
+ It also includes GeminiTTSService which uses Gemini's TTS-specific models
14
+ for natural voice control and multi-speaker conversations.
15
+ """
16
+
8
17
  import json
9
18
  import os
10
19
 
@@ -13,7 +22,7 @@ from pipecat.utils.tracing.service_decorators import traced_tts
13
22
  # Suppress gRPC fork warnings
14
23
  os.environ["GRPC_ENABLE_FORK_SUPPORT"] = "false"
15
24
 
16
- from typing import AsyncGenerator, Literal, Optional
25
+ from typing import AsyncGenerator, List, Literal, Optional
17
26
 
18
27
  from loguru import logger
19
28
  from pydantic import BaseModel
@@ -21,6 +30,7 @@ from pydantic import BaseModel
21
30
  from pipecat.frames.frames import (
22
31
  ErrorFrame,
23
32
  Frame,
33
+ StartFrame,
24
34
  TTSAudioRawFrame,
25
35
  TTSStartedFrame,
26
36
  TTSStoppedFrame,
@@ -41,8 +51,25 @@ except ModuleNotFoundError as e:
41
51
  )
42
52
  raise Exception(f"Missing module: {e}")
43
53
 
54
+ try:
55
+ from google import genai
56
+ from google.genai import types
57
+
58
+ except ModuleNotFoundError as e:
59
+ logger.error(f"Exception: {e}")
60
+ logger.error("In order to use Gemini TTS, you need to `pip install pipecat-ai[google]`.")
61
+ raise Exception(f"Missing module: {e}")
62
+
44
63
 
45
64
  def language_to_google_tts_language(language: Language) -> Optional[str]:
65
+ """Convert a Language enum to Google TTS language code.
66
+
67
+ Args:
68
+ language: The Language enum value to convert.
69
+
70
+ Returns:
71
+ The corresponding Google TTS language code, or None if not supported.
72
+ """
46
73
  language_map = {
47
74
  # Afrikaans
48
75
  Language.AF: "af-ZA",
@@ -203,7 +230,32 @@ def language_to_google_tts_language(language: Language) -> Optional[str]:
203
230
 
204
231
 
205
232
  class GoogleHttpTTSService(TTSService):
233
+ """Google Cloud Text-to-Speech HTTP service with SSML support.
234
+
235
+ Provides text-to-speech synthesis using Google Cloud's HTTP API with
236
+ comprehensive SSML support for voice customization, prosody control,
237
+ and styling options. Ideal for applications requiring fine-grained
238
+ control over speech output.
239
+
240
+ Note:
241
+ Requires Google Cloud credentials via service account JSON, credentials file,
242
+ or default application credentials (GOOGLE_APPLICATION_CREDENTIALS).
243
+ Chirp and Journey voices don't support SSML and will use plain text input.
244
+ """
245
+
206
246
  class InputParams(BaseModel):
247
+ """Input parameters for Google HTTP TTS voice customization.
248
+
249
+ Parameters:
250
+ pitch: Voice pitch adjustment (e.g., "+2st", "-50%").
251
+ rate: Speaking rate adjustment (e.g., "slow", "fast", "125%").
252
+ volume: Volume adjustment (e.g., "loud", "soft", "+6dB").
253
+ emphasis: Emphasis level for the text.
254
+ language: Language for synthesis. Defaults to English.
255
+ gender: Voice gender preference.
256
+ google_style: Google-specific voice style.
257
+ """
258
+
207
259
  pitch: Optional[str] = None
208
260
  rate: Optional[str] = None
209
261
  volume: Optional[str] = None
@@ -222,6 +274,16 @@ class GoogleHttpTTSService(TTSService):
222
274
  params: Optional[InputParams] = None,
223
275
  **kwargs,
224
276
  ):
277
+ """Initializes the Google HTTP TTS service.
278
+
279
+ Args:
280
+ credentials: JSON string containing Google Cloud service account credentials.
281
+ credentials_path: Path to Google Cloud service account JSON file.
282
+ voice_id: Google TTS voice identifier (e.g., "en-US-Standard-A").
283
+ sample_rate: Audio sample rate in Hz. If None, uses default.
284
+ params: Voice customization parameters including pitch, rate, volume, etc.
285
+ **kwargs: Additional arguments passed to parent TTSService.
286
+ """
225
287
  super().__init__(sample_rate=sample_rate, **kwargs)
226
288
 
227
289
  params = params or GoogleHttpTTSService.InputParams()
@@ -245,11 +307,20 @@ class GoogleHttpTTSService(TTSService):
245
307
  def _create_client(
246
308
  self, credentials: Optional[str], credentials_path: Optional[str]
247
309
  ) -> texttospeech_v1.TextToSpeechAsyncClient:
310
+ """Create authenticated Google Text-to-Speech client.
311
+
312
+ Args:
313
+ credentials: JSON string with service account credentials.
314
+ credentials_path: Path to service account JSON file.
315
+
316
+ Returns:
317
+ Authenticated TextToSpeechAsyncClient instance.
318
+
319
+ Raises:
320
+ ValueError: If no valid credentials are provided.
321
+ """
248
322
  creds: Optional[service_account.Credentials] = None
249
323
 
250
- # Create a Google Cloud service account for the Cloud Text-to-Speech API
251
- # Using either the provided credentials JSON string or the path to a service account JSON
252
- # file, create a Google Cloud service account and use it to authenticate with the API.
253
324
  if credentials:
254
325
  # Use provided credentials JSON string
255
326
  json_account_info = json.loads(credentials)
@@ -271,9 +342,22 @@ class GoogleHttpTTSService(TTSService):
271
342
  return texttospeech_v1.TextToSpeechAsyncClient(credentials=creds)
272
343
 
273
344
  def can_generate_metrics(self) -> bool:
345
+ """Check if this service can generate processing metrics.
346
+
347
+ Returns:
348
+ True, as Google HTTP TTS service supports metrics generation.
349
+ """
274
350
  return True
275
351
 
276
352
  def language_to_service_language(self, language: Language) -> Optional[str]:
353
+ """Convert a Language enum to Google TTS language format.
354
+
355
+ Args:
356
+ language: The language to convert.
357
+
358
+ Returns:
359
+ The Google TTS-specific language code, or None if not supported.
360
+ """
277
361
  return language_to_google_tts_language(language)
278
362
 
279
363
  def _construct_ssml(self, text: str) -> str:
@@ -324,6 +408,14 @@ class GoogleHttpTTSService(TTSService):
324
408
 
325
409
  @traced_tts
326
410
  async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
411
+ """Generate speech from text using Google's HTTP TTS API.
412
+
413
+ Args:
414
+ text: The text to synthesize into speech.
415
+
416
+ Yields:
417
+ Frame: Audio frames containing the synthesized speech.
418
+ """
327
419
  logger.debug(f"{self}: Generating TTS [{text}]")
328
420
 
329
421
  try:
@@ -381,25 +473,19 @@ class GoogleHttpTTSService(TTSService):
381
473
 
382
474
 
383
475
  class GoogleTTSService(TTSService):
384
- """Text-to-Speech service using Google Cloud Text-to-Speech API.
476
+ """Google Cloud Text-to-Speech streaming service.
385
477
 
386
- Converts text to speech using Google's TTS models with streaming synthesis
387
- for low latency. Supports multiple languages and voices.
478
+ Provides real-time text-to-speech synthesis using Google Cloud's streaming API
479
+ for low-latency applications. Optimized for Chirp 3 HD and Journey voices
480
+ with continuous audio streaming capabilities.
388
481
 
389
- Args:
390
- credentials: JSON string containing Google Cloud service account credentials.
391
- credentials_path: Path to Google Cloud service account JSON file.
392
- voice_id: Google TTS voice identifier (e.g., "en-US-Chirp3-HD-Charon").
393
- sample_rate: Audio sample rate in Hz.
394
- params: Language only.
395
-
396
- Notes:
482
+ Note:
397
483
  Requires Google Cloud credentials via service account JSON, file path, or
398
484
  default application credentials (GOOGLE_APPLICATION_CREDENTIALS env var).
399
485
  Only Chirp 3 HD and Journey voices are supported. Use GoogleHttpTTSService for other voices.
400
486
 
401
- Example:
402
- ```python
487
+ Example::
488
+
403
489
  tts = GoogleTTSService(
404
490
  credentials_path="/path/to/service-account.json",
405
491
  voice_id="en-US-Chirp3-HD-Charon",
@@ -407,10 +493,15 @@ class GoogleTTSService(TTSService):
407
493
  language=Language.EN_US,
408
494
  )
409
495
  )
410
- ```
411
496
  """
412
497
 
413
498
  class InputParams(BaseModel):
499
+ """Input parameters for Google streaming TTS configuration.
500
+
501
+ Parameters:
502
+ language: Language for synthesis. Defaults to English.
503
+ """
504
+
414
505
  language: Optional[Language] = Language.EN
415
506
  rate: Optional[float] = 1.0
416
507
 
@@ -424,6 +515,16 @@ class GoogleTTSService(TTSService):
424
515
  params: InputParams = InputParams(),
425
516
  **kwargs,
426
517
  ):
518
+ """Initializes the Google streaming TTS service.
519
+
520
+ Args:
521
+ credentials: JSON string containing Google Cloud service account credentials.
522
+ credentials_path: Path to Google Cloud service account JSON file.
523
+ voice_id: Google TTS voice identifier (e.g., "en-US-Chirp3-HD-Charon").
524
+ sample_rate: Audio sample rate in Hz. If None, uses default.
525
+ params: Language configuration parameters.
526
+ **kwargs: Additional arguments passed to parent TTSService.
527
+ """
427
528
  super().__init__(sample_rate=sample_rate, **kwargs)
428
529
 
429
530
  params = params or GoogleTTSService.InputParams()
@@ -482,13 +583,34 @@ class GoogleTTSService(TTSService):
482
583
  return texttospeech_v1.TextToSpeechAsyncClient(credentials=creds)
483
584
 
484
585
  def can_generate_metrics(self) -> bool:
586
+ """Check if this service can generate processing metrics.
587
+
588
+ Returns:
589
+ True, as Google streaming TTS service supports metrics generation.
590
+ """
485
591
  return True
486
592
 
487
593
  def language_to_service_language(self, language: Language) -> Optional[str]:
594
+ """Convert a Language enum to Google TTS language format.
595
+
596
+ Args:
597
+ language: The language to convert.
598
+
599
+ Returns:
600
+ The Google TTS-specific language code, or None if not supported.
601
+ """
488
602
  return language_to_google_tts_language(language)
489
603
 
490
604
  @traced_tts
491
605
  async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
606
+ """Generate streaming speech from text using Google's streaming API.
607
+
608
+ Args:
609
+ text: The text to synthesize into speech.
610
+
611
+ Yields:
612
+ Frame: Audio frames containing the synthesized speech as it's generated.
613
+ """
492
614
  logger.debug(f"{self}: Generating TTS [{text}]")
493
615
 
494
616
  try:
@@ -553,3 +675,252 @@ class GoogleTTSService(TTSService):
553
675
  logger.exception(f"{self} error generating TTS: {e}")
554
676
  error_message = f"TTS generation error: {str(e)}"
555
677
  yield ErrorFrame(error=error_message)
678
+
679
+
680
class GeminiTTSService(TTSService):
    """Gemini Text-to-Speech service using Gemini TTS models.

    Provides text-to-speech synthesis using Gemini's TTS-specific models
    (gemini-2.5-flash-preview-tts and gemini-2.5-pro-preview-tts) with
    support for natural voice control, multiple speakers, and voice styles.

    Note:
        Requires Google AI API key. This uses the Gemini API, not Google Cloud TTS.
        Audio-out is currently a preview feature.

    Example::

        tts = GeminiTTSService(
            api_key="your-google-ai-api-key",
            model="gemini-2.5-flash-preview-tts",
            voice_id="Kore",
            params=GeminiTTSService.InputParams(
                language=Language.EN_US,
            )
        )
    """

    GOOGLE_SAMPLE_RATE = 24000  # Google TTS always outputs at 24kHz

    # Known Gemini TTS prebuilt voice names. Unknown names only trigger a
    # warning; they are still forwarded to the API unchanged.
    AVAILABLE_VOICES = [
        "Zephyr",
        "Puck",
        "Charon",
        "Kore",
        "Fenrir",
        "Leda",
        "Orus",
        "Aoede",
        "Callirrhoe",
        "Autonoe",
        "Enceladus",
        "Iapetus",
        "Umbriel",
        "Algieba",
        "Despina",
        "Erinome",
        "Algenib",
        "Rasalgethi",
        "Laomedeia",
        "Achernar",
        "Alnilam",
        "Schedar",
        "Gacrux",
        "Pulcherrima",
        "Achird",
        "Zubenelgenubi",
        "Vindemiatrix",
        "Sadachbia",
        "Sadaltager",
        "Sulafat",
    ]

    class InputParams(BaseModel):
        """Input parameters for Gemini TTS configuration.

        Parameters:
            language: Language for synthesis. Defaults to English.
            multi_speaker: Whether to enable multi-speaker support.
            speaker_configs: List of speaker configurations for multi-speaker mode.
                Each entry is read as a dict with a required "speaker" key and an
                optional "voice_id" key (falls back to the service voice).
        """

        language: Optional[Language] = Language.EN
        multi_speaker: bool = False
        speaker_configs: Optional[List[dict]] = None

    def __init__(
        self,
        *,
        api_key: str,
        model: str = "gemini-2.5-flash-preview-tts",
        voice_id: str = "Kore",
        sample_rate: Optional[int] = None,
        params: Optional[InputParams] = None,
        **kwargs,
    ):
        """Initializes the Gemini TTS service.

        Args:
            api_key: Google AI API key for authentication.
            model: Gemini TTS model to use. Must be a TTS model like
                "gemini-2.5-flash-preview-tts" or "gemini-2.5-pro-preview-tts".
            voice_id: Voice name from the available Gemini voices.
            sample_rate: Audio sample rate in Hz. If None, uses Google's default 24kHz.
            params: TTS configuration parameters.
            **kwargs: Additional arguments passed to parent TTSService.
        """
        if sample_rate and sample_rate != self.GOOGLE_SAMPLE_RATE:
            logger.warning(
                f"Google TTS only supports {self.GOOGLE_SAMPLE_RATE}Hz sample rate. "
                f"Current rate of {sample_rate}Hz may cause issues."
            )
        super().__init__(sample_rate=sample_rate, **kwargs)

        params = params or GeminiTTSService.InputParams()

        self._warn_if_unknown_voice(voice_id)

        self._api_key = api_key
        self._model = model
        self._voice_id = voice_id
        self._settings = {
            "language": self.language_to_service_language(params.language)
            if params.language
            else "en-US",
            "multi_speaker": params.multi_speaker,
            "speaker_configs": params.speaker_configs,
        }

        self._client = genai.Client(api_key=api_key)

    def _warn_if_unknown_voice(self, voice_id: str) -> None:
        """Log a warning when ``voice_id`` is not in AVAILABLE_VOICES.

        The voice is never rejected; the warning is advisory only.
        """
        if voice_id not in self.AVAILABLE_VOICES:
            logger.warning(f"Voice '{voice_id}' not in known voices list. Using anyway.")

    def can_generate_metrics(self) -> bool:
        """Check if this service can generate processing metrics.

        Returns:
            True, as Gemini TTS service supports metrics generation.
        """
        return True

    def language_to_service_language(self, language: Language) -> Optional[str]:
        """Convert a Language enum to Gemini TTS language format.

        Args:
            language: The language to convert.

        Returns:
            The Gemini TTS-specific language code, or None if not supported.
        """
        return language_to_google_tts_language(language)

    def set_voice(self, voice_id: str):
        """Set the voice for TTS generation.

        Args:
            voice_id: Name of the voice to use from AVAILABLE_VOICES.
        """
        self._warn_if_unknown_voice(voice_id)
        self._voice_id = voice_id

    async def start(self, frame: StartFrame):
        """Start the Gemini TTS service.

        Args:
            frame: The start frame containing initialization parameters.
        """
        await super().start(frame)
        if self.sample_rate != self.GOOGLE_SAMPLE_RATE:
            logger.warning(
                f"Google TTS requires {self.GOOGLE_SAMPLE_RATE}Hz sample rate. "
                f"Current rate of {self.sample_rate}Hz may cause issues."
            )

    def _build_speech_config(self):
        """Build the SpeechConfig for the current voice / multi-speaker settings.

        Multi-speaker mode is used only when it is enabled AND at least one
        speaker configuration was supplied; otherwise the single service voice
        is used.
        """
        speaker_configs = self._settings["speaker_configs"]
        if self._settings["multi_speaker"] and speaker_configs:
            speaker_voice_configs = [
                types.SpeakerVoiceConfig(
                    speaker=speaker_config["speaker"],
                    voice_config=types.VoiceConfig(
                        prebuilt_voice_config=types.PrebuiltVoiceConfig(
                            # Per-speaker voice is optional; default to the service voice.
                            voice_name=speaker_config.get("voice_id", self._voice_id)
                        )
                    ),
                )
                for speaker_config in speaker_configs
            ]
            return types.SpeechConfig(
                multi_speaker_voice_config=types.MultiSpeakerVoiceConfig(
                    speaker_voice_configs=speaker_voice_configs
                )
            )

        return types.SpeechConfig(
            voice_config=types.VoiceConfig(
                prebuilt_voice_config=types.PrebuiltVoiceConfig(voice_name=self._voice_id)
            )
        )

    @traced_tts
    async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
        """Generate speech from text using Gemini TTS models.

        Args:
            text: The text to synthesize into speech. Can include natural language
                instructions for style, tone, etc.

        Yields:
            Frame: Audio frames containing the synthesized speech, or an
            ErrorFrame if generation fails.
        """
        logger.debug(f"{self}: Generating TTS [{text}]")

        try:
            await self.start_ttfb_metrics()

            generation_config = types.GenerateContentConfig(
                response_modalities=["AUDIO"],
                speech_config=self._build_speech_config(),
            )

            # Non-streaming call: the full audio payload is returned at once
            # and chunked locally below.
            response = await self._client.aio.models.generate_content(
                model=self._model,
                contents=text,
                config=generation_config,
            )

            await self.start_tts_usage_metrics(text)

            yield TTSStartedFrame()

            # Only the first candidate is considered.
            candidate = response.candidates[0] if response.candidates else None
            parts = candidate.content.parts if candidate and candidate.content else None

            for part in parts or []:
                inline = part.inline_data
                # Skip non-audio parts; tolerate a missing mime type or payload
                # instead of raising AttributeError/TypeError.
                if not inline or not (inline.mime_type or "").startswith("audio/"):
                    continue

                await self.stop_ttfb_metrics()

                audio_data = inline.data
                if not audio_data:
                    continue

                # Gemini TTS returns PCM audio data, chunk it appropriately.
                # NOTE(review): frames are tagged with self.sample_rate although
                # the payload is presumably GOOGLE_SAMPLE_RATE PCM; confirm whether
                # resampling is needed when the pipeline rate differs.
                chunk_size = self.chunk_size
                for offset in range(0, len(audio_data), chunk_size):
                    yield TTSAudioRawFrame(
                        audio_data[offset : offset + chunk_size], self.sample_rate, 1
                    )

            yield TTSStoppedFrame()

        except Exception as e:
            logger.exception(f"{self} error generating TTS: {e}")
            error_message = f"Gemini TTS generation error: {str(e)}"
            yield ErrorFrame(error=error_message)
@@ -67,12 +67,6 @@ class GrokLLMService(OpenAILLMService):
67
67
  maintaining full compatibility with OpenAI's interface and functionality.
68
68
  Includes specialized token usage tracking that accumulates metrics during
69
69
  processing and reports final totals.
70
-
71
- Args:
72
- api_key: The API key for accessing Grok's API.
73
- base_url: The base URL for Grok API. Defaults to "https://api.x.ai/v1".
74
- model: The model identifier to use. Defaults to "grok-3-beta".
75
- **kwargs: Additional keyword arguments passed to OpenAILLMService.
76
70
  """
77
71
 
78
72
  def __init__(
@@ -83,6 +77,14 @@ class GrokLLMService(OpenAILLMService):
83
77
  model: str = "grok-3-beta",
84
78
  **kwargs,
85
79
  ):
80
+ """Initialize the GrokLLMService with API key and model.
81
+
82
+ Args:
83
+ api_key: The API key for accessing Grok's API.
84
+ base_url: The base URL for Grok API. Defaults to "https://api.x.ai/v1".
85
+ model: The model identifier to use. Defaults to "grok-3-beta".
86
+ **kwargs: Additional keyword arguments passed to OpenAILLMService.
87
+ """
86
88
  super().__init__(api_key=api_key, base_url=base_url, model=model, **kwargs)
87
89
  # Initialize counters for token usage metrics
88
90
  self._prompt_tokens = 0
@@ -16,12 +16,6 @@ class GroqLLMService(OpenAILLMService):
16
16
 
17
17
  This service extends OpenAILLMService to connect to Groq's API endpoint while
18
18
  maintaining full compatibility with OpenAI's interface and functionality.
19
-
20
- Args:
21
- api_key: The API key for accessing Groq's API.
22
- base_url: The base URL for Groq API. Defaults to "https://api.groq.com/openai/v1".
23
- model: The model identifier to use. Defaults to "llama-3.3-70b-versatile".
24
- **kwargs: Additional keyword arguments passed to OpenAILLMService.
25
19
  """
26
20
 
27
21
  def __init__(
@@ -32,6 +26,14 @@ class GroqLLMService(OpenAILLMService):
32
26
  model: str = "llama-3.3-70b-versatile",
33
27
  **kwargs,
34
28
  ):
29
+ """Initialize Groq LLM service.
30
+
31
+ Args:
32
+ api_key: The API key for accessing Groq's API.
33
+ base_url: The base URL for Groq API. Defaults to "https://api.groq.com/openai/v1".
34
+ model: The model identifier to use. Defaults to "llama-3.3-70b-versatile".
35
+ **kwargs: Additional keyword arguments passed to OpenAILLMService.
36
+ """
35
37
  super().__init__(api_key=api_key, base_url=base_url, model=model, **kwargs)
36
38
 
37
39
  def create_client(self, api_key=None, base_url=None, **kwargs):
@@ -4,6 +4,8 @@
4
4
  # SPDX-License-Identifier: BSD 2-Clause License
5
5
  #
6
6
 
7
+ """Groq speech-to-text service implementation using Whisper models."""
8
+
7
9
  from typing import Optional
8
10
 
9
11
  from pipecat.services.whisper.base_stt import BaseWhisperSTTService, Transcription
@@ -15,15 +17,6 @@ class GroqSTTService(BaseWhisperSTTService):
15
17
 
16
18
  Uses Groq's Whisper API to convert audio to text. Requires a Groq API key
17
19
  set via the api_key parameter or GROQ_API_KEY environment variable.
18
-
19
- Args:
20
- model: Whisper model to use. Defaults to "whisper-large-v3-turbo".
21
- api_key: Groq API key. Defaults to None.
22
- base_url: API base URL. Defaults to "https://api.groq.com/openai/v1".
23
- language: Language of the audio input. Defaults to English.
24
- prompt: Optional text to guide the model's style or continue a previous segment.
25
- temperature: Optional sampling temperature between 0 and 1. Defaults to 0.0.
26
- **kwargs: Additional arguments passed to BaseWhisperSTTService.
27
20
  """
28
21
 
29
22
  def __init__(
@@ -37,6 +30,17 @@ class GroqSTTService(BaseWhisperSTTService):
37
30
  temperature: Optional[float] = None,
38
31
  **kwargs,
39
32
  ):
33
+ """Initialize Groq STT service.
34
+
35
+ Args:
36
+ model: Whisper model to use. Defaults to "whisper-large-v3-turbo".
37
+ api_key: Groq API key. Defaults to None.
38
+ base_url: API base URL. Defaults to "https://api.groq.com/openai/v1".
39
+ language: Language of the audio input. Defaults to English.
40
+ prompt: Optional text to guide the model's style or continue a previous segment.
41
+ temperature: Optional sampling temperature between 0 and 1. Defaults to None.
42
+ **kwargs: Additional arguments passed to BaseWhisperSTTService.
43
+ """
40
44
  super().__init__(
41
45
  model=model,
42
46
  api_key=api_key,
@@ -4,6 +4,8 @@
4
4
  # SPDX-License-Identifier: BSD 2-Clause License
5
5
  #
6
6
 
7
+ """Groq text-to-speech service implementation."""
8
+
7
9
  import io
8
10
  import wave
9
11
  from typing import AsyncGenerator, Optional
@@ -25,7 +27,21 @@ except ModuleNotFoundError as e:
25
27
 
26
28
 
27
29
  class GroqTTSService(TTSService):
30
+ """Groq text-to-speech service implementation.
31
+
32
+ Provides text-to-speech synthesis using Groq's TTS API. The service
33
+ operates at a fixed 48kHz sample rate and supports various voices
34
+ and output formats.
35
+ """
36
+
28
37
  class InputParams(BaseModel):
38
+ """Input parameters for Groq TTS configuration.
39
+
40
+ Parameters:
41
+ language: Language for speech synthesis. Defaults to English.
42
+ speed: Speech speed multiplier. Defaults to 1.0.
43
+ """
44
+
29
45
  language: Optional[Language] = Language.EN
30
46
  speed: Optional[float] = 1.0
31
47
 
@@ -42,6 +58,17 @@ class GroqTTSService(TTSService):
42
58
  sample_rate: Optional[int] = GROQ_SAMPLE_RATE,
43
59
  **kwargs,
44
60
  ):
61
+ """Initialize Groq TTS service.
62
+
63
+ Args:
64
+ api_key: Groq API key for authentication.
65
+ output_format: Audio output format. Defaults to "wav".
66
+ params: Additional input parameters for voice customization.
67
+ model_name: TTS model to use. Defaults to "playai-tts".
68
+ voice_id: Voice identifier to use. Defaults to "Celeste-PlayAI".
69
+ sample_rate: Audio sample rate. Must be 48000 Hz for Groq TTS.
70
+ **kwargs: Additional arguments passed to parent TTSService class.
71
+ """
45
72
  if sample_rate != self.GROQ_SAMPLE_RATE:
46
73
  logger.warning(f"Groq TTS only supports {self.GROQ_SAMPLE_RATE}Hz sample rate. ")
47
74
 
@@ -71,10 +98,23 @@ class GroqTTSService(TTSService):
71
98
  self._client = AsyncGroq(api_key=self._api_key)
72
99
 
73
100
  def can_generate_metrics(self) -> bool:
101
+ """Check if this service can generate processing metrics.
102
+
103
+ Returns:
104
+ True, as Groq TTS service supports metrics generation.
105
+ """
74
106
  return True
75
107
 
76
108
  @traced_tts
77
109
  async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
110
+ """Generate speech from text using Groq's TTS API.
111
+
112
+ Args:
113
+ text: The text to synthesize into speech.
114
+
115
+ Yields:
116
+ Frame: Audio frames containing the synthesized speech data.
117
+ """
78
118
  logger.debug(f"{self}: Generating TTS [{text}]")
79
119
  measuring_ttfb = True
80
120
  await self.start_ttfb_metrics()