dv-pipecat-ai 0.0.74.dev770__py3-none-any.whl → 0.0.82.dev776__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of dv-pipecat-ai might be problematic; see the registry's advisory page for more details.

Files changed (244):
  1. {dv_pipecat_ai-0.0.74.dev770.dist-info → dv_pipecat_ai-0.0.82.dev776.dist-info}/METADATA +137 -93
  2. dv_pipecat_ai-0.0.82.dev776.dist-info/RECORD +340 -0
  3. pipecat/__init__.py +17 -0
  4. pipecat/adapters/base_llm_adapter.py +36 -1
  5. pipecat/adapters/schemas/direct_function.py +296 -0
  6. pipecat/adapters/schemas/function_schema.py +15 -6
  7. pipecat/adapters/schemas/tools_schema.py +55 -7
  8. pipecat/adapters/services/anthropic_adapter.py +22 -3
  9. pipecat/adapters/services/aws_nova_sonic_adapter.py +23 -3
  10. pipecat/adapters/services/bedrock_adapter.py +22 -3
  11. pipecat/adapters/services/gemini_adapter.py +16 -3
  12. pipecat/adapters/services/open_ai_adapter.py +17 -2
  13. pipecat/adapters/services/open_ai_realtime_adapter.py +23 -3
  14. pipecat/audio/filters/base_audio_filter.py +30 -6
  15. pipecat/audio/filters/koala_filter.py +37 -2
  16. pipecat/audio/filters/krisp_filter.py +59 -6
  17. pipecat/audio/filters/noisereduce_filter.py +37 -0
  18. pipecat/audio/interruptions/base_interruption_strategy.py +25 -5
  19. pipecat/audio/interruptions/min_words_interruption_strategy.py +21 -4
  20. pipecat/audio/mixers/base_audio_mixer.py +30 -7
  21. pipecat/audio/mixers/soundfile_mixer.py +53 -6
  22. pipecat/audio/resamplers/base_audio_resampler.py +17 -9
  23. pipecat/audio/resamplers/resampy_resampler.py +26 -1
  24. pipecat/audio/resamplers/soxr_resampler.py +32 -1
  25. pipecat/audio/resamplers/soxr_stream_resampler.py +101 -0
  26. pipecat/audio/utils.py +194 -1
  27. pipecat/audio/vad/silero.py +60 -3
  28. pipecat/audio/vad/vad_analyzer.py +114 -30
  29. pipecat/clocks/base_clock.py +19 -0
  30. pipecat/clocks/system_clock.py +25 -0
  31. pipecat/extensions/voicemail/__init__.py +0 -0
  32. pipecat/extensions/voicemail/voicemail_detector.py +707 -0
  33. pipecat/frames/frames.py +590 -156
  34. pipecat/metrics/metrics.py +64 -1
  35. pipecat/observers/base_observer.py +58 -19
  36. pipecat/observers/loggers/debug_log_observer.py +56 -64
  37. pipecat/observers/loggers/llm_log_observer.py +8 -1
  38. pipecat/observers/loggers/transcription_log_observer.py +19 -7
  39. pipecat/observers/loggers/user_bot_latency_log_observer.py +32 -5
  40. pipecat/observers/turn_tracking_observer.py +26 -1
  41. pipecat/pipeline/base_pipeline.py +5 -7
  42. pipecat/pipeline/base_task.py +52 -9
  43. pipecat/pipeline/parallel_pipeline.py +121 -177
  44. pipecat/pipeline/pipeline.py +129 -20
  45. pipecat/pipeline/runner.py +50 -1
  46. pipecat/pipeline/sync_parallel_pipeline.py +132 -32
  47. pipecat/pipeline/task.py +263 -280
  48. pipecat/pipeline/task_observer.py +85 -34
  49. pipecat/pipeline/to_be_updated/merge_pipeline.py +32 -2
  50. pipecat/processors/aggregators/dtmf_aggregator.py +29 -22
  51. pipecat/processors/aggregators/gated.py +25 -24
  52. pipecat/processors/aggregators/gated_openai_llm_context.py +22 -2
  53. pipecat/processors/aggregators/llm_response.py +398 -89
  54. pipecat/processors/aggregators/openai_llm_context.py +161 -13
  55. pipecat/processors/aggregators/sentence.py +25 -14
  56. pipecat/processors/aggregators/user_response.py +28 -3
  57. pipecat/processors/aggregators/vision_image_frame.py +24 -14
  58. pipecat/processors/async_generator.py +28 -0
  59. pipecat/processors/audio/audio_buffer_processor.py +78 -37
  60. pipecat/processors/consumer_processor.py +25 -6
  61. pipecat/processors/filters/frame_filter.py +23 -0
  62. pipecat/processors/filters/function_filter.py +30 -0
  63. pipecat/processors/filters/identity_filter.py +17 -2
  64. pipecat/processors/filters/null_filter.py +24 -1
  65. pipecat/processors/filters/stt_mute_filter.py +56 -21
  66. pipecat/processors/filters/wake_check_filter.py +46 -3
  67. pipecat/processors/filters/wake_notifier_filter.py +21 -3
  68. pipecat/processors/frame_processor.py +488 -131
  69. pipecat/processors/frameworks/langchain.py +38 -3
  70. pipecat/processors/frameworks/rtvi.py +719 -34
  71. pipecat/processors/gstreamer/pipeline_source.py +41 -0
  72. pipecat/processors/idle_frame_processor.py +26 -3
  73. pipecat/processors/logger.py +23 -0
  74. pipecat/processors/metrics/frame_processor_metrics.py +77 -4
  75. pipecat/processors/metrics/sentry.py +42 -4
  76. pipecat/processors/producer_processor.py +34 -14
  77. pipecat/processors/text_transformer.py +22 -10
  78. pipecat/processors/transcript_processor.py +48 -29
  79. pipecat/processors/user_idle_processor.py +31 -21
  80. pipecat/runner/__init__.py +1 -0
  81. pipecat/runner/daily.py +132 -0
  82. pipecat/runner/livekit.py +148 -0
  83. pipecat/runner/run.py +543 -0
  84. pipecat/runner/types.py +67 -0
  85. pipecat/runner/utils.py +515 -0
  86. pipecat/serializers/base_serializer.py +42 -0
  87. pipecat/serializers/exotel.py +17 -6
  88. pipecat/serializers/genesys.py +95 -0
  89. pipecat/serializers/livekit.py +33 -0
  90. pipecat/serializers/plivo.py +16 -15
  91. pipecat/serializers/protobuf.py +37 -1
  92. pipecat/serializers/telnyx.py +18 -17
  93. pipecat/serializers/twilio.py +32 -16
  94. pipecat/services/ai_service.py +5 -3
  95. pipecat/services/anthropic/llm.py +113 -43
  96. pipecat/services/assemblyai/models.py +63 -5
  97. pipecat/services/assemblyai/stt.py +64 -11
  98. pipecat/services/asyncai/__init__.py +0 -0
  99. pipecat/services/asyncai/tts.py +501 -0
  100. pipecat/services/aws/llm.py +185 -111
  101. pipecat/services/aws/stt.py +217 -23
  102. pipecat/services/aws/tts.py +118 -52
  103. pipecat/services/aws/utils.py +101 -5
  104. pipecat/services/aws_nova_sonic/aws.py +82 -64
  105. pipecat/services/aws_nova_sonic/context.py +15 -6
  106. pipecat/services/azure/common.py +10 -2
  107. pipecat/services/azure/image.py +32 -0
  108. pipecat/services/azure/llm.py +9 -7
  109. pipecat/services/azure/stt.py +65 -2
  110. pipecat/services/azure/tts.py +154 -23
  111. pipecat/services/cartesia/stt.py +125 -8
  112. pipecat/services/cartesia/tts.py +102 -38
  113. pipecat/services/cerebras/llm.py +15 -23
  114. pipecat/services/deepgram/stt.py +19 -11
  115. pipecat/services/deepgram/tts.py +36 -0
  116. pipecat/services/deepseek/llm.py +14 -23
  117. pipecat/services/elevenlabs/tts.py +330 -64
  118. pipecat/services/fal/image.py +43 -0
  119. pipecat/services/fal/stt.py +48 -10
  120. pipecat/services/fireworks/llm.py +14 -21
  121. pipecat/services/fish/tts.py +109 -9
  122. pipecat/services/gemini_multimodal_live/__init__.py +1 -0
  123. pipecat/services/gemini_multimodal_live/events.py +83 -2
  124. pipecat/services/gemini_multimodal_live/file_api.py +189 -0
  125. pipecat/services/gemini_multimodal_live/gemini.py +218 -21
  126. pipecat/services/gladia/config.py +17 -10
  127. pipecat/services/gladia/stt.py +82 -36
  128. pipecat/services/google/frames.py +40 -0
  129. pipecat/services/google/google.py +2 -0
  130. pipecat/services/google/image.py +39 -2
  131. pipecat/services/google/llm.py +176 -58
  132. pipecat/services/google/llm_openai.py +26 -4
  133. pipecat/services/google/llm_vertex.py +37 -15
  134. pipecat/services/google/rtvi.py +41 -0
  135. pipecat/services/google/stt.py +65 -17
  136. pipecat/services/google/test-google-chirp.py +45 -0
  137. pipecat/services/google/tts.py +390 -19
  138. pipecat/services/grok/llm.py +8 -6
  139. pipecat/services/groq/llm.py +8 -6
  140. pipecat/services/groq/stt.py +13 -9
  141. pipecat/services/groq/tts.py +40 -0
  142. pipecat/services/hamsa/__init__.py +9 -0
  143. pipecat/services/hamsa/stt.py +241 -0
  144. pipecat/services/heygen/__init__.py +5 -0
  145. pipecat/services/heygen/api.py +281 -0
  146. pipecat/services/heygen/client.py +620 -0
  147. pipecat/services/heygen/video.py +338 -0
  148. pipecat/services/image_service.py +5 -3
  149. pipecat/services/inworld/__init__.py +1 -0
  150. pipecat/services/inworld/tts.py +592 -0
  151. pipecat/services/llm_service.py +127 -45
  152. pipecat/services/lmnt/tts.py +80 -7
  153. pipecat/services/mcp_service.py +85 -44
  154. pipecat/services/mem0/memory.py +42 -13
  155. pipecat/services/minimax/tts.py +74 -15
  156. pipecat/services/mistral/__init__.py +0 -0
  157. pipecat/services/mistral/llm.py +185 -0
  158. pipecat/services/moondream/vision.py +55 -10
  159. pipecat/services/neuphonic/tts.py +275 -48
  160. pipecat/services/nim/llm.py +8 -6
  161. pipecat/services/ollama/llm.py +27 -7
  162. pipecat/services/openai/base_llm.py +54 -16
  163. pipecat/services/openai/image.py +30 -0
  164. pipecat/services/openai/llm.py +7 -5
  165. pipecat/services/openai/stt.py +13 -9
  166. pipecat/services/openai/tts.py +42 -10
  167. pipecat/services/openai_realtime_beta/azure.py +11 -9
  168. pipecat/services/openai_realtime_beta/context.py +7 -5
  169. pipecat/services/openai_realtime_beta/events.py +10 -7
  170. pipecat/services/openai_realtime_beta/openai.py +37 -18
  171. pipecat/services/openpipe/llm.py +30 -24
  172. pipecat/services/openrouter/llm.py +9 -7
  173. pipecat/services/perplexity/llm.py +15 -19
  174. pipecat/services/piper/tts.py +26 -12
  175. pipecat/services/playht/tts.py +227 -65
  176. pipecat/services/qwen/llm.py +8 -6
  177. pipecat/services/rime/tts.py +128 -17
  178. pipecat/services/riva/stt.py +160 -22
  179. pipecat/services/riva/tts.py +67 -2
  180. pipecat/services/sambanova/llm.py +19 -17
  181. pipecat/services/sambanova/stt.py +14 -8
  182. pipecat/services/sarvam/tts.py +60 -13
  183. pipecat/services/simli/video.py +82 -21
  184. pipecat/services/soniox/__init__.py +0 -0
  185. pipecat/services/soniox/stt.py +398 -0
  186. pipecat/services/speechmatics/stt.py +29 -17
  187. pipecat/services/stt_service.py +47 -11
  188. pipecat/services/tavus/video.py +94 -25
  189. pipecat/services/together/llm.py +8 -6
  190. pipecat/services/tts_service.py +77 -53
  191. pipecat/services/ultravox/stt.py +46 -43
  192. pipecat/services/vision_service.py +5 -3
  193. pipecat/services/websocket_service.py +12 -11
  194. pipecat/services/whisper/base_stt.py +58 -12
  195. pipecat/services/whisper/stt.py +69 -58
  196. pipecat/services/xtts/tts.py +59 -2
  197. pipecat/sync/base_notifier.py +19 -0
  198. pipecat/sync/event_notifier.py +24 -0
  199. pipecat/tests/utils.py +73 -5
  200. pipecat/transcriptions/language.py +24 -0
  201. pipecat/transports/base_input.py +112 -8
  202. pipecat/transports/base_output.py +235 -13
  203. pipecat/transports/base_transport.py +119 -0
  204. pipecat/transports/local/audio.py +76 -0
  205. pipecat/transports/local/tk.py +84 -0
  206. pipecat/transports/network/fastapi_websocket.py +174 -15
  207. pipecat/transports/network/small_webrtc.py +383 -39
  208. pipecat/transports/network/webrtc_connection.py +214 -8
  209. pipecat/transports/network/websocket_client.py +171 -1
  210. pipecat/transports/network/websocket_server.py +147 -9
  211. pipecat/transports/services/daily.py +792 -70
  212. pipecat/transports/services/helpers/daily_rest.py +122 -129
  213. pipecat/transports/services/livekit.py +339 -4
  214. pipecat/transports/services/tavus.py +273 -38
  215. pipecat/utils/asyncio/task_manager.py +92 -186
  216. pipecat/utils/base_object.py +83 -1
  217. pipecat/utils/network.py +2 -0
  218. pipecat/utils/string.py +114 -58
  219. pipecat/utils/text/base_text_aggregator.py +44 -13
  220. pipecat/utils/text/base_text_filter.py +46 -0
  221. pipecat/utils/text/markdown_text_filter.py +70 -14
  222. pipecat/utils/text/pattern_pair_aggregator.py +18 -14
  223. pipecat/utils/text/simple_text_aggregator.py +43 -2
  224. pipecat/utils/text/skip_tags_aggregator.py +21 -13
  225. pipecat/utils/time.py +36 -0
  226. pipecat/utils/tracing/class_decorators.py +32 -7
  227. pipecat/utils/tracing/conversation_context_provider.py +12 -2
  228. pipecat/utils/tracing/service_attributes.py +80 -64
  229. pipecat/utils/tracing/service_decorators.py +48 -21
  230. pipecat/utils/tracing/setup.py +13 -7
  231. pipecat/utils/tracing/turn_context_provider.py +12 -2
  232. pipecat/utils/tracing/turn_trace_observer.py +27 -0
  233. pipecat/utils/utils.py +14 -14
  234. dv_pipecat_ai-0.0.74.dev770.dist-info/RECORD +0 -319
  235. pipecat/examples/daily_runner.py +0 -64
  236. pipecat/examples/run.py +0 -265
  237. pipecat/utils/asyncio/watchdog_async_iterator.py +0 -72
  238. pipecat/utils/asyncio/watchdog_event.py +0 -42
  239. pipecat/utils/asyncio/watchdog_priority_queue.py +0 -48
  240. pipecat/utils/asyncio/watchdog_queue.py +0 -48
  241. {dv_pipecat_ai-0.0.74.dev770.dist-info → dv_pipecat_ai-0.0.82.dev776.dist-info}/WHEEL +0 -0
  242. {dv_pipecat_ai-0.0.74.dev770.dist-info → dv_pipecat_ai-0.0.82.dev776.dist-info}/licenses/LICENSE +0 -0
  243. {dv_pipecat_ai-0.0.74.dev770.dist-info → dv_pipecat_ai-0.0.82.dev776.dist-info}/top_level.txt +0 -0
  244. /pipecat/{examples → extensions}/__init__.py +0 -0
@@ -4,6 +4,12 @@
4
4
  # SPDX-License-Identifier: BSD 2-Clause License
5
5
  #
6
6
 
7
+ """Fal's image generation service implementation.
8
+
9
+ This module provides integration with Fal's image generation API
10
+ for creating images from text prompts using various AI models.
11
+ """
12
+
7
13
  import asyncio
8
14
  import io
9
15
  import os
@@ -26,7 +32,25 @@ except ModuleNotFoundError as e:
26
32
 
27
33
 
28
34
  class FalImageGenService(ImageGenService):
35
+ """Fal's image generation service.
36
+
37
+ Provides text-to-image generation using Fal.ai's API with configurable
38
+ parameters for image quality, safety, and format options.
39
+ """
40
+
29
41
  class InputParams(BaseModel):
42
+ """Input parameters for Fal.ai image generation.
43
+
44
+ Parameters:
45
+ seed: Random seed for reproducible generation. If None, uses random seed.
46
+ num_inference_steps: Number of inference steps for generation. Defaults to 8.
47
+ num_images: Number of images to generate. Defaults to 1.
48
+ image_size: Image dimensions as string preset or dict with width/height. Defaults to "square_hd".
49
+ expand_prompt: Whether to automatically expand/enhance the prompt. Defaults to False.
50
+ enable_safety_checker: Whether to enable content safety filtering. Defaults to True.
51
+ format: Output image format. Defaults to "png".
52
+ """
53
+
30
54
  seed: Optional[int] = None
31
55
  num_inference_steps: int = 8
32
56
  num_images: int = 1
@@ -44,6 +68,15 @@ class FalImageGenService(ImageGenService):
44
68
  key: Optional[str] = None,
45
69
  **kwargs,
46
70
  ):
71
+ """Initialize the FalImageGenService.
72
+
73
+ Args:
74
+ params: Input parameters for image generation configuration.
75
+ aiohttp_session: HTTP client session for downloading generated images.
76
+ model: The Fal.ai model to use for generation. Defaults to "fal-ai/fast-sdxl".
77
+ key: Optional API key for Fal.ai. If provided, sets FAL_KEY environment variable.
78
+ **kwargs: Additional arguments passed to parent ImageGenService.
79
+ """
47
80
  super().__init__(**kwargs)
48
81
  self.set_model_name(model)
49
82
  self._params = params
@@ -52,6 +85,16 @@ class FalImageGenService(ImageGenService):
52
85
  os.environ["FAL_KEY"] = key
53
86
 
54
87
  async def run_image_gen(self, prompt: str) -> AsyncGenerator[Frame, None]:
88
+ """Generate an image from a text prompt.
89
+
90
+ Args:
91
+ prompt: The text prompt to generate an image from.
92
+
93
+ Yields:
94
+ URLImageRawFrame: Frame containing the generated image data and metadata.
95
+ ErrorFrame: If image generation fails.
96
+ """
97
+
55
98
  def load_image_bytes(encoded_image: bytes):
56
99
  buffer = io.BytesIO(encoded_image)
57
100
  image = Image.open(buffer)
@@ -4,6 +4,12 @@
4
4
  # SPDX-License-Identifier: BSD 2-Clause License
5
5
  #
6
6
 
7
+ """Fal speech-to-text service implementation.
8
+
9
+ This module provides integration with Fal's Wizper API for speech-to-text
10
+ transcription using segmented audio processing.
11
+ """
12
+
7
13
  import os
8
14
  from typing import AsyncGenerator, Optional
9
15
 
@@ -27,7 +33,14 @@ except ModuleNotFoundError as e:
27
33
 
28
34
 
29
35
  def language_to_fal_language(language: Language) -> Optional[str]:
30
- """Language support for Fal's Wizper API."""
36
+ """Convert a Language enum to Fal's Wizper language code.
37
+
38
+ Args:
39
+ language: The Language enum value to convert.
40
+
41
+ Returns:
42
+ The corresponding Fal Wizper language code, or None if not supported.
43
+ """
31
44
  BASE_LANGUAGES = {
32
45
  Language.AF: "af",
33
46
  Language.AM: "am",
@@ -145,18 +158,12 @@ class FalSTTService(SegmentedSTTService):
145
158
 
146
159
  This service uses Fal's Wizper API to perform speech-to-text transcription on audio
147
160
  segments. It inherits from SegmentedSTTService to handle audio buffering and speech detection.
148
-
149
- Args:
150
- api_key: Fal API key. If not provided, will check FAL_KEY environment variable.
151
- sample_rate: Audio sample rate in Hz. If not provided, uses the pipeline's rate.
152
- params: Configuration parameters for the Wizper API.
153
- **kwargs: Additional arguments passed to SegmentedSTTService.
154
161
  """
155
162
 
156
163
  class InputParams(BaseModel):
157
164
  """Configuration parameters for Fal's Wizper API.
158
165
 
159
- Attributes:
166
+ Parameters:
160
167
  language: Language of the audio input. Defaults to English.
161
168
  task: Task to perform ('transcribe' or 'translate'). Defaults to 'transcribe'.
162
169
  chunk_level: Level of chunking ('segment'). Defaults to 'segment'.
@@ -176,6 +183,14 @@ class FalSTTService(SegmentedSTTService):
176
183
  params: Optional[InputParams] = None,
177
184
  **kwargs,
178
185
  ):
186
+ """Initialize the FalSTTService with API key and parameters.
187
+
188
+ Args:
189
+ api_key: Fal API key. If not provided, will check FAL_KEY environment variable.
190
+ sample_rate: Audio sample rate in Hz. If not provided, uses the pipeline's rate.
191
+ params: Configuration parameters for the Wizper API.
192
+ **kwargs: Additional arguments passed to SegmentedSTTService.
193
+ """
179
194
  super().__init__(
180
195
  sample_rate=sample_rate,
181
196
  **kwargs,
@@ -201,16 +216,39 @@ class FalSTTService(SegmentedSTTService):
201
216
  }
202
217
 
203
218
  def can_generate_metrics(self) -> bool:
219
+ """Check if the service can generate processing metrics.
220
+
221
+ Returns:
222
+ True, as Fal STT service supports metrics generation.
223
+ """
204
224
  return True
205
225
 
206
226
  def language_to_service_language(self, language: Language) -> Optional[str]:
227
+ """Convert a Language enum to Fal's service-specific language code.
228
+
229
+ Args:
230
+ language: The language to convert.
231
+
232
+ Returns:
233
+ The Fal-specific language code, or None if not supported.
234
+ """
207
235
  return language_to_fal_language(language)
208
236
 
209
237
  async def set_language(self, language: Language):
238
+ """Set the transcription language.
239
+
240
+ Args:
241
+ language: The language to use for speech-to-text transcription.
242
+ """
210
243
  logger.info(f"Switching STT language to: [{language}]")
211
244
  self._settings["language"] = self.language_to_service_language(language)
212
245
 
213
246
  async def set_model(self, model: str):
247
+ """Set the STT model.
248
+
249
+ Args:
250
+ model: The model name to use for transcription.
251
+ """
214
252
  await super().set_model(model)
215
253
  logger.info(f"Switching STT model to: [{model}]")
216
254
 
@@ -229,7 +267,7 @@ class FalSTTService(SegmentedSTTService):
229
267
  audio: Raw audio bytes in WAV format (already converted by base class).
230
268
 
231
269
  Yields:
232
- Frame: TranscriptionFrame containing the transcribed text.
270
+ Frame: TranscriptionFrame containing the transcribed text, or ErrorFrame on failure.
233
271
 
234
272
  Note:
235
273
  The audio is already in WAV format from the SegmentedSTTService.
@@ -253,7 +291,7 @@ class FalSTTService(SegmentedSTTService):
253
291
  logger.debug(f"Transcription: [{text}]")
254
292
  yield TranscriptionFrame(
255
293
  text,
256
- "",
294
+ self._user_id,
257
295
  time_now_iso8601(),
258
296
  Language(self._settings["language"]),
259
297
  result=response,
@@ -20,12 +20,6 @@ class FireworksLLMService(OpenAILLMService):
20
20
 
21
21
  This service extends OpenAILLMService to connect to Fireworks' API endpoint while
22
22
  maintaining full compatibility with OpenAI's interface and functionality.
23
-
24
- Args:
25
- api_key: The API key for accessing Fireworks AI.
26
- model: The model identifier to use. Defaults to "accounts/fireworks/models/firefunction-v2".
27
- base_url: The base URL for Fireworks API. Defaults to "https://api.fireworks.ai/inference/v1".
28
- **kwargs: Additional keyword arguments passed to OpenAILLMService.
29
23
  """
30
24
 
31
25
  def __init__(
@@ -36,6 +30,14 @@ class FireworksLLMService(OpenAILLMService):
36
30
  base_url: str = "https://api.fireworks.ai/inference/v1",
37
31
  **kwargs,
38
32
  ):
33
+ """Initialize the Fireworks LLM service.
34
+
35
+ Args:
36
+ api_key: The API key for accessing Fireworks AI.
37
+ model: The model identifier to use. Defaults to "accounts/fireworks/models/firefunction-v2".
38
+ base_url: The base URL for Fireworks API. Defaults to "https://api.fireworks.ai/inference/v1".
39
+ **kwargs: Additional keyword arguments passed to OpenAILLMService.
40
+ """
39
41
  super().__init__(api_key=api_key, base_url=base_url, model=model, **kwargs)
40
42
 
41
43
  def create_client(self, api_key=None, base_url=None, **kwargs):
@@ -52,20 +54,13 @@ class FireworksLLMService(OpenAILLMService):
52
54
  logger.debug(f"Creating Fireworks client with api {base_url}")
53
55
  return super().create_client(api_key, base_url, **kwargs)
54
56
 
55
- async def get_chat_completions(
57
+ def build_chat_completion_params(
56
58
  self, context: OpenAILLMContext, messages: List[ChatCompletionMessageParam]
57
- ):
58
- """Get chat completions from Fireworks API.
59
+ ) -> dict:
60
+ """Build parameters for Fireworks chat completion request.
59
61
 
60
- Removes OpenAI-specific parameters not supported by Fireworks and
61
- configures the request with Fireworks-compatible settings.
62
-
63
- Args:
64
- context: The OpenAI LLM context containing tools and settings.
65
- messages: List of chat completion message parameters.
66
-
67
- Returns:
68
- Async generator yielding chat completion chunks from Fireworks API.
62
+ Fireworks doesn't support some OpenAI parameters like seed, max_completion_tokens,
63
+ and stream_options.
69
64
  """
70
65
  params = {
71
66
  "model": self.model_name,
@@ -81,6 +76,4 @@ class FireworksLLMService(OpenAILLMService):
81
76
  }
82
77
 
83
78
  params.update(self._settings["extra"])
84
-
85
- chunks = await self._client.chat.completions.create(**params)
86
- return chunks
79
+ return params
@@ -4,6 +4,12 @@
4
4
  # SPDX-License-Identifier: BSD 2-Clause License
5
5
  #
6
6
 
7
+ """Fish Audio text-to-speech service implementation.
8
+
9
+ This module provides integration with Fish Audio's real-time TTS WebSocket API
10
+ for streaming text-to-speech synthesis with customizable voice parameters.
11
+ """
12
+
7
13
  import uuid
8
14
  from typing import AsyncGenerator, Literal, Optional
9
15
 
@@ -28,7 +34,8 @@ from pipecat.utils.tracing.service_decorators import traced_tts
28
34
 
29
35
  try:
30
36
  import ormsgpack
31
- import websockets
37
+ from websockets.asyncio.client import connect as websocket_connect
38
+ from websockets.protocol import State
32
39
  except ModuleNotFoundError as e:
33
40
  logger.error(f"Exception: {e}")
34
41
  logger.error("In order to use Fish Audio, you need to `pip install pipecat-ai[fish]`.")
@@ -39,9 +46,27 @@ FishAudioOutputFormat = Literal["opus", "mp3", "pcm", "wav"]
39
46
 
40
47
 
41
48
  class FishAudioTTSService(InterruptibleTTSService):
49
+ """Fish Audio text-to-speech service with WebSocket streaming.
50
+
51
+ Provides real-time text-to-speech synthesis using Fish Audio's WebSocket API.
52
+ Supports various audio formats, customizable prosody controls, and streaming
53
+ audio generation with interruption handling.
54
+ """
55
+
42
56
  class InputParams(BaseModel):
57
+ """Input parameters for Fish Audio TTS configuration.
58
+
59
+ Parameters:
60
+ language: Language for synthesis. Defaults to English.
61
+ latency: Latency mode ("normal" or "balanced"). Defaults to "normal".
62
+ normalize: Whether to normalize audio output. Defaults to True.
63
+ prosody_speed: Speech speed multiplier (0.5-2.0). Defaults to 1.0.
64
+ prosody_volume: Volume adjustment in dB. Defaults to 0.
65
+ """
66
+
43
67
  language: Optional[Language] = Language.EN
44
68
  latency: Optional[str] = "normal" # "normal" or "balanced"
69
+ normalize: Optional[bool] = True
45
70
  prosody_speed: Optional[float] = 1.0 # Speech speed (0.5-2.0)
46
71
  prosody_volume: Optional[int] = 0 # Volume adjustment in dB
47
72
 
@@ -49,12 +74,31 @@ class FishAudioTTSService(InterruptibleTTSService):
49
74
  self,
50
75
  *,
51
76
  api_key: str,
52
- model: str, # This is the reference_id
77
+ reference_id: Optional[str] = None, # This is the voice ID
78
+ model: Optional[str] = None, # Deprecated
79
+ model_id: str = "speech-1.5",
53
80
  output_format: FishAudioOutputFormat = "pcm",
54
81
  sample_rate: Optional[int] = None,
55
82
  params: Optional[InputParams] = None,
56
83
  **kwargs,
57
84
  ):
85
+ """Initialize the Fish Audio TTS service.
86
+
87
+ Args:
88
+ api_key: Fish Audio API key for authentication.
89
+ reference_id: Reference ID of the voice model to use for synthesis.
90
+ model: Deprecated. Reference ID of the voice model to use for synthesis.
91
+
92
+ .. deprecated:: 0.0.74
93
+ The `model` parameter is deprecated and will be removed in version 0.1.0.
94
+ Use `reference_id` instead to specify the voice model.
95
+
96
+ model_id: Specify which Fish Audio TTS model to use (e.g. "speech-1.5")
97
+ output_format: Audio output format. Defaults to "pcm".
98
+ sample_rate: Audio sample rate. If None, uses default.
99
+ params: Additional input parameters for voice customization.
100
+ **kwargs: Additional arguments passed to the parent service.
101
+ """
58
102
  super().__init__(
59
103
  push_stop_frames=True,
60
104
  pause_frame_processing=True,
@@ -64,6 +108,26 @@ class FishAudioTTSService(InterruptibleTTSService):
64
108
 
65
109
  params = params or FishAudioTTSService.InputParams()
66
110
 
111
+ # Validation for model and reference_id parameters
112
+ if model and reference_id:
113
+ raise ValueError(
114
+ "Cannot specify both 'model' and 'reference_id'. Use 'reference_id' only."
115
+ )
116
+
117
+ if model is None and reference_id is None:
118
+ raise ValueError("Must specify 'reference_id' (or deprecated 'model') parameter.")
119
+
120
+ if model:
121
+ import warnings
122
+
123
+ warnings.warn(
124
+ "Parameter 'model' is deprecated and will be removed in a future version. "
125
+ "Use 'reference_id' instead.",
126
+ DeprecationWarning,
127
+ stacklevel=2,
128
+ )
129
+ reference_id = model
130
+
67
131
  self._api_key = api_key
68
132
  self._base_url = "wss://api.fish.audio/v1/tts/live"
69
133
  self._websocket = None
@@ -75,33 +139,60 @@ class FishAudioTTSService(InterruptibleTTSService):
75
139
  "sample_rate": 0,
76
140
  "latency": params.latency,
77
141
  "format": output_format,
142
+ "normalize": params.normalize,
78
143
  "prosody": {
79
144
  "speed": params.prosody_speed,
80
145
  "volume": params.prosody_volume,
81
146
  },
82
- "reference_id": model,
147
+ "reference_id": reference_id,
83
148
  }
84
149
 
85
- self.set_model_name(model)
150
+ self.set_model_name(model_id)
86
151
 
87
152
  def can_generate_metrics(self) -> bool:
153
+ """Check if this service can generate processing metrics.
154
+
155
+ Returns:
156
+ True, as Fish Audio service supports metrics generation.
157
+ """
88
158
  return True
89
159
 
90
160
  async def set_model(self, model: str):
91
- self._settings["reference_id"] = model
161
+ """Set the TTS model and reconnect.
162
+
163
+ Args:
164
+ model: The model name to use for synthesis.
165
+ """
92
166
  await super().set_model(model)
93
167
  logger.info(f"Switching TTS model to: [{model}]")
168
+ await self._disconnect()
169
+ await self._connect()
94
170
 
95
171
  async def start(self, frame: StartFrame):
172
+ """Start the Fish Audio TTS service.
173
+
174
+ Args:
175
+ frame: The start frame containing initialization parameters.
176
+ """
96
177
  await super().start(frame)
97
178
  self._settings["sample_rate"] = self.sample_rate
98
179
  await self._connect()
99
180
 
100
181
  async def stop(self, frame: EndFrame):
182
+ """Stop the Fish Audio TTS service.
183
+
184
+ Args:
185
+ frame: The end frame.
186
+ """
101
187
  await super().stop(frame)
102
188
  await self._disconnect()
103
189
 
104
190
  async def cancel(self, frame: CancelFrame):
191
+ """Cancel the Fish Audio TTS service.
192
+
193
+ Args:
194
+ frame: The cancel frame.
195
+ """
105
196
  await super().cancel(frame)
106
197
  await self._disconnect()
107
198
 
@@ -120,12 +211,13 @@ class FishAudioTTSService(InterruptibleTTSService):
120
211
 
121
212
  async def _connect_websocket(self):
122
213
  try:
123
- if self._websocket and self._websocket.open:
214
+ if self._websocket and self._websocket.state is State.OPEN:
124
215
  return
125
216
 
126
217
  logger.debug("Connecting to Fish Audio")
127
218
  headers = {"Authorization": f"Bearer {self._api_key}"}
128
- self._websocket = await websockets.connect(self._base_url, extra_headers=headers)
219
+ headers["model"] = self.model_name
220
+ self._websocket = await websocket_connect(self._base_url, additional_headers=headers)
129
221
 
130
222
  # Send initial start message with ormsgpack
131
223
  start_message = {"event": "start", "request": {"text": "", **self._settings}}
@@ -155,7 +247,7 @@ class FishAudioTTSService(InterruptibleTTSService):
155
247
  async def flush_audio(self):
156
248
  """Flush any buffered audio by sending a flush event to Fish Audio."""
157
249
  logger.trace(f"{self}: Flushing audio buffers")
158
- if not self._websocket or self._websocket.closed:
250
+ if not self._websocket or self._websocket.state is State.CLOSED:
159
251
  return
160
252
  flush_message = {"event": "flush"}
161
253
  await self._get_websocket().send(ormsgpack.packb(flush_message))
@@ -191,9 +283,17 @@ class FishAudioTTSService(InterruptibleTTSService):
191
283
 
192
284
  @traced_tts
193
285
  async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
286
+ """Generate speech from text using Fish Audio's streaming API.
287
+
288
+ Args:
289
+ text: The text to synthesize into speech.
290
+
291
+ Yields:
292
+ Frame: Audio frames and control frames for the synthesized speech.
293
+ """
194
294
  logger.debug(f"{self}: Generating Fish TTS: [{text}]")
195
295
  try:
196
- if not self._websocket or self._websocket.closed:
296
+ if not self._websocket or self._websocket.state is State.CLOSED:
197
297
  await self._connect()
198
298
 
199
299
  if not self._request_id:
@@ -1 +1,2 @@
1
+ from .file_api import GeminiFileAPI
1
2
  from .gemini import GeminiMultimodalLiveLLMService
@@ -44,6 +44,17 @@ class ContentPart(BaseModel):
44
44
 
45
45
  text: Optional[str] = Field(default=None, validate_default=False)
46
46
  inlineData: Optional[MediaChunk] = Field(default=None, validate_default=False)
47
+ fileData: Optional["FileData"] = Field(default=None, validate_default=False)
48
+
49
+
50
+ class FileData(BaseModel):
51
+ """Represents a file reference in the Gemini File API."""
52
+
53
+ mimeType: str
54
+ fileUri: str
55
+
56
+
57
+ ContentPart.model_rebuild() # Rebuild model to resolve forward reference
47
58
 
48
59
 
49
60
  class Turn(BaseModel):
@@ -103,13 +114,15 @@ class RealtimeInputConfig(BaseModel):
103
114
 
104
115
 
105
116
  class RealtimeInput(BaseModel):
106
- """Contains realtime input media chunks.
117
+ """Contains realtime input media chunks and text.
107
118
 
108
119
  Parameters:
109
120
  mediaChunks: List of media chunks for realtime processing.
121
+ text: Text for realtime processing.
110
122
  """
111
123
 
112
- mediaChunks: List[MediaChunk]
124
+ mediaChunks: Optional[List[MediaChunk]] = None
125
+ text: Optional[str] = None
113
126
 
114
127
 
115
128
  class ClientContent(BaseModel):
@@ -179,6 +192,24 @@ class VideoInputMessage(BaseModel):
179
192
  )
180
193
 
181
194
 
195
+ class TextInputMessage(BaseModel):
196
+ """Message containing text input data."""
197
+
198
+ realtimeInput: RealtimeInput
199
+
200
+ @classmethod
201
+ def from_text(cls, text: str) -> "TextInputMessage":
202
+ """Create a text input message from a string.
203
+
204
+ Args:
205
+ text: The text to send.
206
+
207
+ Returns:
208
+ A TextInputMessage instance.
209
+ """
210
+ return cls(realtimeInput=RealtimeInput(text=text))
211
+
212
+
182
213
  class ClientContentMessage(BaseModel):
183
214
  """Message containing client content for the API.
184
215
 
@@ -237,6 +268,55 @@ class Config(BaseModel):
237
268
  setup: Setup
238
269
 
239
270
 
271
+ #
272
+ # Grounding metadata models
273
+ #
274
+
275
+
276
+ class SearchEntryPoint(BaseModel):
277
+ """Represents the search entry point with rendered content for search suggestions."""
278
+
279
+ renderedContent: Optional[str] = None
280
+
281
+
282
+ class WebSource(BaseModel):
283
+ """Represents a web source from grounding chunks."""
284
+
285
+ uri: Optional[str] = None
286
+ title: Optional[str] = None
287
+
288
+
289
+ class GroundingChunk(BaseModel):
290
+ """Represents a grounding chunk containing web source information."""
291
+
292
+ web: Optional[WebSource] = None
293
+
294
+
295
+ class GroundingSegment(BaseModel):
296
+ """Represents a segment of text that is grounded."""
297
+
298
+ startIndex: Optional[int] = None
299
+ endIndex: Optional[int] = None
300
+ text: Optional[str] = None
301
+
302
+
303
+ class GroundingSupport(BaseModel):
304
+ """Represents support information for grounded text segments."""
305
+
306
+ segment: Optional[GroundingSegment] = None
307
+ groundingChunkIndices: Optional[List[int]] = None
308
+ confidenceScores: Optional[List[float]] = None
309
+
310
+
311
+ class GroundingMetadata(BaseModel):
312
+ """Represents grounding metadata from Google Search."""
313
+
314
+ searchEntryPoint: Optional[SearchEntryPoint] = None
315
+ groundingChunks: Optional[List[GroundingChunk]] = None
316
+ groundingSupports: Optional[List[GroundingSupport]] = None
317
+ webSearchQueries: Optional[List[str]] = None
318
+
319
+
240
320
  #
241
321
  # Server events
242
322
  #
@@ -328,6 +408,7 @@ class ServerContent(BaseModel):
328
408
  turnComplete: Optional[bool] = None
329
409
  inputTranscription: Optional[BidiGenerateContentTranscription] = None
330
410
  outputTranscription: Optional[BidiGenerateContentTranscription] = None
411
+ groundingMetadata: Optional[GroundingMetadata] = None
331
412
 
332
413
 
333
414
  class FunctionCall(BaseModel):