dv-pipecat-ai 0.0.74.dev770__py3-none-any.whl → 0.0.82.dev776__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
Potentially problematic release: this version of dv-pipecat-ai might be problematic.
- {dv_pipecat_ai-0.0.74.dev770.dist-info → dv_pipecat_ai-0.0.82.dev776.dist-info}/METADATA +137 -93
- dv_pipecat_ai-0.0.82.dev776.dist-info/RECORD +340 -0
- pipecat/__init__.py +17 -0
- pipecat/adapters/base_llm_adapter.py +36 -1
- pipecat/adapters/schemas/direct_function.py +296 -0
- pipecat/adapters/schemas/function_schema.py +15 -6
- pipecat/adapters/schemas/tools_schema.py +55 -7
- pipecat/adapters/services/anthropic_adapter.py +22 -3
- pipecat/adapters/services/aws_nova_sonic_adapter.py +23 -3
- pipecat/adapters/services/bedrock_adapter.py +22 -3
- pipecat/adapters/services/gemini_adapter.py +16 -3
- pipecat/adapters/services/open_ai_adapter.py +17 -2
- pipecat/adapters/services/open_ai_realtime_adapter.py +23 -3
- pipecat/audio/filters/base_audio_filter.py +30 -6
- pipecat/audio/filters/koala_filter.py +37 -2
- pipecat/audio/filters/krisp_filter.py +59 -6
- pipecat/audio/filters/noisereduce_filter.py +37 -0
- pipecat/audio/interruptions/base_interruption_strategy.py +25 -5
- pipecat/audio/interruptions/min_words_interruption_strategy.py +21 -4
- pipecat/audio/mixers/base_audio_mixer.py +30 -7
- pipecat/audio/mixers/soundfile_mixer.py +53 -6
- pipecat/audio/resamplers/base_audio_resampler.py +17 -9
- pipecat/audio/resamplers/resampy_resampler.py +26 -1
- pipecat/audio/resamplers/soxr_resampler.py +32 -1
- pipecat/audio/resamplers/soxr_stream_resampler.py +101 -0
- pipecat/audio/utils.py +194 -1
- pipecat/audio/vad/silero.py +60 -3
- pipecat/audio/vad/vad_analyzer.py +114 -30
- pipecat/clocks/base_clock.py +19 -0
- pipecat/clocks/system_clock.py +25 -0
- pipecat/extensions/voicemail/__init__.py +0 -0
- pipecat/extensions/voicemail/voicemail_detector.py +707 -0
- pipecat/frames/frames.py +590 -156
- pipecat/metrics/metrics.py +64 -1
- pipecat/observers/base_observer.py +58 -19
- pipecat/observers/loggers/debug_log_observer.py +56 -64
- pipecat/observers/loggers/llm_log_observer.py +8 -1
- pipecat/observers/loggers/transcription_log_observer.py +19 -7
- pipecat/observers/loggers/user_bot_latency_log_observer.py +32 -5
- pipecat/observers/turn_tracking_observer.py +26 -1
- pipecat/pipeline/base_pipeline.py +5 -7
- pipecat/pipeline/base_task.py +52 -9
- pipecat/pipeline/parallel_pipeline.py +121 -177
- pipecat/pipeline/pipeline.py +129 -20
- pipecat/pipeline/runner.py +50 -1
- pipecat/pipeline/sync_parallel_pipeline.py +132 -32
- pipecat/pipeline/task.py +263 -280
- pipecat/pipeline/task_observer.py +85 -34
- pipecat/pipeline/to_be_updated/merge_pipeline.py +32 -2
- pipecat/processors/aggregators/dtmf_aggregator.py +29 -22
- pipecat/processors/aggregators/gated.py +25 -24
- pipecat/processors/aggregators/gated_openai_llm_context.py +22 -2
- pipecat/processors/aggregators/llm_response.py +398 -89
- pipecat/processors/aggregators/openai_llm_context.py +161 -13
- pipecat/processors/aggregators/sentence.py +25 -14
- pipecat/processors/aggregators/user_response.py +28 -3
- pipecat/processors/aggregators/vision_image_frame.py +24 -14
- pipecat/processors/async_generator.py +28 -0
- pipecat/processors/audio/audio_buffer_processor.py +78 -37
- pipecat/processors/consumer_processor.py +25 -6
- pipecat/processors/filters/frame_filter.py +23 -0
- pipecat/processors/filters/function_filter.py +30 -0
- pipecat/processors/filters/identity_filter.py +17 -2
- pipecat/processors/filters/null_filter.py +24 -1
- pipecat/processors/filters/stt_mute_filter.py +56 -21
- pipecat/processors/filters/wake_check_filter.py +46 -3
- pipecat/processors/filters/wake_notifier_filter.py +21 -3
- pipecat/processors/frame_processor.py +488 -131
- pipecat/processors/frameworks/langchain.py +38 -3
- pipecat/processors/frameworks/rtvi.py +719 -34
- pipecat/processors/gstreamer/pipeline_source.py +41 -0
- pipecat/processors/idle_frame_processor.py +26 -3
- pipecat/processors/logger.py +23 -0
- pipecat/processors/metrics/frame_processor_metrics.py +77 -4
- pipecat/processors/metrics/sentry.py +42 -4
- pipecat/processors/producer_processor.py +34 -14
- pipecat/processors/text_transformer.py +22 -10
- pipecat/processors/transcript_processor.py +48 -29
- pipecat/processors/user_idle_processor.py +31 -21
- pipecat/runner/__init__.py +1 -0
- pipecat/runner/daily.py +132 -0
- pipecat/runner/livekit.py +148 -0
- pipecat/runner/run.py +543 -0
- pipecat/runner/types.py +67 -0
- pipecat/runner/utils.py +515 -0
- pipecat/serializers/base_serializer.py +42 -0
- pipecat/serializers/exotel.py +17 -6
- pipecat/serializers/genesys.py +95 -0
- pipecat/serializers/livekit.py +33 -0
- pipecat/serializers/plivo.py +16 -15
- pipecat/serializers/protobuf.py +37 -1
- pipecat/serializers/telnyx.py +18 -17
- pipecat/serializers/twilio.py +32 -16
- pipecat/services/ai_service.py +5 -3
- pipecat/services/anthropic/llm.py +113 -43
- pipecat/services/assemblyai/models.py +63 -5
- pipecat/services/assemblyai/stt.py +64 -11
- pipecat/services/asyncai/__init__.py +0 -0
- pipecat/services/asyncai/tts.py +501 -0
- pipecat/services/aws/llm.py +185 -111
- pipecat/services/aws/stt.py +217 -23
- pipecat/services/aws/tts.py +118 -52
- pipecat/services/aws/utils.py +101 -5
- pipecat/services/aws_nova_sonic/aws.py +82 -64
- pipecat/services/aws_nova_sonic/context.py +15 -6
- pipecat/services/azure/common.py +10 -2
- pipecat/services/azure/image.py +32 -0
- pipecat/services/azure/llm.py +9 -7
- pipecat/services/azure/stt.py +65 -2
- pipecat/services/azure/tts.py +154 -23
- pipecat/services/cartesia/stt.py +125 -8
- pipecat/services/cartesia/tts.py +102 -38
- pipecat/services/cerebras/llm.py +15 -23
- pipecat/services/deepgram/stt.py +19 -11
- pipecat/services/deepgram/tts.py +36 -0
- pipecat/services/deepseek/llm.py +14 -23
- pipecat/services/elevenlabs/tts.py +330 -64
- pipecat/services/fal/image.py +43 -0
- pipecat/services/fal/stt.py +48 -10
- pipecat/services/fireworks/llm.py +14 -21
- pipecat/services/fish/tts.py +109 -9
- pipecat/services/gemini_multimodal_live/__init__.py +1 -0
- pipecat/services/gemini_multimodal_live/events.py +83 -2
- pipecat/services/gemini_multimodal_live/file_api.py +189 -0
- pipecat/services/gemini_multimodal_live/gemini.py +218 -21
- pipecat/services/gladia/config.py +17 -10
- pipecat/services/gladia/stt.py +82 -36
- pipecat/services/google/frames.py +40 -0
- pipecat/services/google/google.py +2 -0
- pipecat/services/google/image.py +39 -2
- pipecat/services/google/llm.py +176 -58
- pipecat/services/google/llm_openai.py +26 -4
- pipecat/services/google/llm_vertex.py +37 -15
- pipecat/services/google/rtvi.py +41 -0
- pipecat/services/google/stt.py +65 -17
- pipecat/services/google/test-google-chirp.py +45 -0
- pipecat/services/google/tts.py +390 -19
- pipecat/services/grok/llm.py +8 -6
- pipecat/services/groq/llm.py +8 -6
- pipecat/services/groq/stt.py +13 -9
- pipecat/services/groq/tts.py +40 -0
- pipecat/services/hamsa/__init__.py +9 -0
- pipecat/services/hamsa/stt.py +241 -0
- pipecat/services/heygen/__init__.py +5 -0
- pipecat/services/heygen/api.py +281 -0
- pipecat/services/heygen/client.py +620 -0
- pipecat/services/heygen/video.py +338 -0
- pipecat/services/image_service.py +5 -3
- pipecat/services/inworld/__init__.py +1 -0
- pipecat/services/inworld/tts.py +592 -0
- pipecat/services/llm_service.py +127 -45
- pipecat/services/lmnt/tts.py +80 -7
- pipecat/services/mcp_service.py +85 -44
- pipecat/services/mem0/memory.py +42 -13
- pipecat/services/minimax/tts.py +74 -15
- pipecat/services/mistral/__init__.py +0 -0
- pipecat/services/mistral/llm.py +185 -0
- pipecat/services/moondream/vision.py +55 -10
- pipecat/services/neuphonic/tts.py +275 -48
- pipecat/services/nim/llm.py +8 -6
- pipecat/services/ollama/llm.py +27 -7
- pipecat/services/openai/base_llm.py +54 -16
- pipecat/services/openai/image.py +30 -0
- pipecat/services/openai/llm.py +7 -5
- pipecat/services/openai/stt.py +13 -9
- pipecat/services/openai/tts.py +42 -10
- pipecat/services/openai_realtime_beta/azure.py +11 -9
- pipecat/services/openai_realtime_beta/context.py +7 -5
- pipecat/services/openai_realtime_beta/events.py +10 -7
- pipecat/services/openai_realtime_beta/openai.py +37 -18
- pipecat/services/openpipe/llm.py +30 -24
- pipecat/services/openrouter/llm.py +9 -7
- pipecat/services/perplexity/llm.py +15 -19
- pipecat/services/piper/tts.py +26 -12
- pipecat/services/playht/tts.py +227 -65
- pipecat/services/qwen/llm.py +8 -6
- pipecat/services/rime/tts.py +128 -17
- pipecat/services/riva/stt.py +160 -22
- pipecat/services/riva/tts.py +67 -2
- pipecat/services/sambanova/llm.py +19 -17
- pipecat/services/sambanova/stt.py +14 -8
- pipecat/services/sarvam/tts.py +60 -13
- pipecat/services/simli/video.py +82 -21
- pipecat/services/soniox/__init__.py +0 -0
- pipecat/services/soniox/stt.py +398 -0
- pipecat/services/speechmatics/stt.py +29 -17
- pipecat/services/stt_service.py +47 -11
- pipecat/services/tavus/video.py +94 -25
- pipecat/services/together/llm.py +8 -6
- pipecat/services/tts_service.py +77 -53
- pipecat/services/ultravox/stt.py +46 -43
- pipecat/services/vision_service.py +5 -3
- pipecat/services/websocket_service.py +12 -11
- pipecat/services/whisper/base_stt.py +58 -12
- pipecat/services/whisper/stt.py +69 -58
- pipecat/services/xtts/tts.py +59 -2
- pipecat/sync/base_notifier.py +19 -0
- pipecat/sync/event_notifier.py +24 -0
- pipecat/tests/utils.py +73 -5
- pipecat/transcriptions/language.py +24 -0
- pipecat/transports/base_input.py +112 -8
- pipecat/transports/base_output.py +235 -13
- pipecat/transports/base_transport.py +119 -0
- pipecat/transports/local/audio.py +76 -0
- pipecat/transports/local/tk.py +84 -0
- pipecat/transports/network/fastapi_websocket.py +174 -15
- pipecat/transports/network/small_webrtc.py +383 -39
- pipecat/transports/network/webrtc_connection.py +214 -8
- pipecat/transports/network/websocket_client.py +171 -1
- pipecat/transports/network/websocket_server.py +147 -9
- pipecat/transports/services/daily.py +792 -70
- pipecat/transports/services/helpers/daily_rest.py +122 -129
- pipecat/transports/services/livekit.py +339 -4
- pipecat/transports/services/tavus.py +273 -38
- pipecat/utils/asyncio/task_manager.py +92 -186
- pipecat/utils/base_object.py +83 -1
- pipecat/utils/network.py +2 -0
- pipecat/utils/string.py +114 -58
- pipecat/utils/text/base_text_aggregator.py +44 -13
- pipecat/utils/text/base_text_filter.py +46 -0
- pipecat/utils/text/markdown_text_filter.py +70 -14
- pipecat/utils/text/pattern_pair_aggregator.py +18 -14
- pipecat/utils/text/simple_text_aggregator.py +43 -2
- pipecat/utils/text/skip_tags_aggregator.py +21 -13
- pipecat/utils/time.py +36 -0
- pipecat/utils/tracing/class_decorators.py +32 -7
- pipecat/utils/tracing/conversation_context_provider.py +12 -2
- pipecat/utils/tracing/service_attributes.py +80 -64
- pipecat/utils/tracing/service_decorators.py +48 -21
- pipecat/utils/tracing/setup.py +13 -7
- pipecat/utils/tracing/turn_context_provider.py +12 -2
- pipecat/utils/tracing/turn_trace_observer.py +27 -0
- pipecat/utils/utils.py +14 -14
- dv_pipecat_ai-0.0.74.dev770.dist-info/RECORD +0 -319
- pipecat/examples/daily_runner.py +0 -64
- pipecat/examples/run.py +0 -265
- pipecat/utils/asyncio/watchdog_async_iterator.py +0 -72
- pipecat/utils/asyncio/watchdog_event.py +0 -42
- pipecat/utils/asyncio/watchdog_priority_queue.py +0 -48
- pipecat/utils/asyncio/watchdog_queue.py +0 -48
- {dv_pipecat_ai-0.0.74.dev770.dist-info → dv_pipecat_ai-0.0.82.dev776.dist-info}/WHEEL +0 -0
- {dv_pipecat_ai-0.0.74.dev770.dist-info → dv_pipecat_ai-0.0.82.dev776.dist-info}/licenses/LICENSE +0 -0
- {dv_pipecat_ai-0.0.74.dev770.dist-info → dv_pipecat_ai-0.0.82.dev776.dist-info}/top_level.txt +0 -0
- /pipecat/{examples → extensions}/__init__.py +0 -0
pipecat/services/fal/image.py
CHANGED
@@ -4,6 +4,12 @@
 # SPDX-License-Identifier: BSD 2-Clause License
 #
 
+"""Fal's image generation service implementation.
+
+This module provides integration with Fal's image generation API
+for creating images from text prompts using various AI models.
+"""
+
 import asyncio
 import io
 import os
@@ -26,7 +32,25 @@ except ModuleNotFoundError as e:
 
 
 class FalImageGenService(ImageGenService):
+    """Fal's image generation service.
+
+    Provides text-to-image generation using Fal.ai's API with configurable
+    parameters for image quality, safety, and format options.
+    """
+
     class InputParams(BaseModel):
+        """Input parameters for Fal.ai image generation.
+
+        Parameters:
+            seed: Random seed for reproducible generation. If None, uses random seed.
+            num_inference_steps: Number of inference steps for generation. Defaults to 8.
+            num_images: Number of images to generate. Defaults to 1.
+            image_size: Image dimensions as string preset or dict with width/height. Defaults to "square_hd".
+            expand_prompt: Whether to automatically expand/enhance the prompt. Defaults to False.
+            enable_safety_checker: Whether to enable content safety filtering. Defaults to True.
+            format: Output image format. Defaults to "png".
+        """
+
         seed: Optional[int] = None
         num_inference_steps: int = 8
         num_images: int = 1
@@ -44,6 +68,15 @@ class FalImageGenService(ImageGenService):
         key: Optional[str] = None,
         **kwargs,
     ):
+        """Initialize the FalImageGenService.
+
+        Args:
+            params: Input parameters for image generation configuration.
+            aiohttp_session: HTTP client session for downloading generated images.
+            model: The Fal.ai model to use for generation. Defaults to "fal-ai/fast-sdxl".
+            key: Optional API key for Fal.ai. If provided, sets FAL_KEY environment variable.
+            **kwargs: Additional arguments passed to parent ImageGenService.
+        """
         super().__init__(**kwargs)
         self.set_model_name(model)
         self._params = params
@@ -52,6 +85,16 @@ class FalImageGenService(ImageGenService):
             os.environ["FAL_KEY"] = key
 
     async def run_image_gen(self, prompt: str) -> AsyncGenerator[Frame, None]:
+        """Generate an image from a text prompt.
+
+        Args:
+            prompt: The text prompt to generate an image from.
+
+        Yields:
+            URLImageRawFrame: Frame containing the generated image data and metadata.
+            ErrorFrame: If image generation fails.
+        """
+
         def load_image_bytes(encoded_image: bytes):
             buffer = io.BytesIO(encoded_image)
             image = Image.open(buffer)
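
For reference, a minimal usage sketch built from the constructor and run_image_gen signatures documented above (the prompt, session handling, and printed output are illustrative, not part of the package):

import asyncio

import aiohttp

from pipecat.services.fal.image import FalImageGenService


async def main():
    async with aiohttp.ClientSession() as session:
        service = FalImageGenService(
            params=FalImageGenService.InputParams(num_images=1, format="png"),
            aiohttp_session=session,
            model="fal-ai/fast-sdxl",  # documented default
            # key=... is optional; the FAL_KEY environment variable is used otherwise
        )
        # Yields URLImageRawFrame on success, ErrorFrame on failure.
        async for frame in service.run_image_gen("a lighthouse at dawn"):
            print(type(frame).__name__)


asyncio.run(main())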
pipecat/services/fal/stt.py
CHANGED
@@ -4,6 +4,12 @@
 # SPDX-License-Identifier: BSD 2-Clause License
 #
 
+"""Fal speech-to-text service implementation.
+
+This module provides integration with Fal's Wizper API for speech-to-text
+transcription using segmented audio processing.
+"""
+
 import os
 from typing import AsyncGenerator, Optional
 
@@ -27,7 +33,14 @@ except ModuleNotFoundError as e:
 
 
 def language_to_fal_language(language: Language) -> Optional[str]:
-    """Language
+    """Convert a Language enum to Fal's Wizper language code.
+
+    Args:
+        language: The Language enum value to convert.
+
+    Returns:
+        The corresponding Fal Wizper language code, or None if not supported.
+    """
     BASE_LANGUAGES = {
         Language.AF: "af",
         Language.AM: "am",
@@ -145,18 +158,12 @@ class FalSTTService(SegmentedSTTService):
 
     This service uses Fal's Wizper API to perform speech-to-text transcription on audio
     segments. It inherits from SegmentedSTTService to handle audio buffering and speech detection.
-
-    Args:
-        api_key: Fal API key. If not provided, will check FAL_KEY environment variable.
-        sample_rate: Audio sample rate in Hz. If not provided, uses the pipeline's rate.
-        params: Configuration parameters for the Wizper API.
-        **kwargs: Additional arguments passed to SegmentedSTTService.
     """
 
     class InputParams(BaseModel):
         """Configuration parameters for Fal's Wizper API.
 
-
+        Parameters:
             language: Language of the audio input. Defaults to English.
             task: Task to perform ('transcribe' or 'translate'). Defaults to 'transcribe'.
             chunk_level: Level of chunking ('segment'). Defaults to 'segment'.
@@ -176,6 +183,14 @@ class FalSTTService(SegmentedSTTService):
         params: Optional[InputParams] = None,
         **kwargs,
     ):
+        """Initialize the FalSTTService with API key and parameters.
+
+        Args:
+            api_key: Fal API key. If not provided, will check FAL_KEY environment variable.
+            sample_rate: Audio sample rate in Hz. If not provided, uses the pipeline's rate.
+            params: Configuration parameters for the Wizper API.
+            **kwargs: Additional arguments passed to SegmentedSTTService.
+        """
         super().__init__(
             sample_rate=sample_rate,
             **kwargs,
@@ -201,16 +216,39 @@ class FalSTTService(SegmentedSTTService):
         }
 
     def can_generate_metrics(self) -> bool:
+        """Check if the service can generate processing metrics.
+
+        Returns:
+            True, as Fal STT service supports metrics generation.
+        """
         return True
 
     def language_to_service_language(self, language: Language) -> Optional[str]:
+        """Convert a Language enum to Fal's service-specific language code.
+
+        Args:
+            language: The language to convert.
+
+        Returns:
+            The Fal-specific language code, or None if not supported.
+        """
         return language_to_fal_language(language)
 
     async def set_language(self, language: Language):
+        """Set the transcription language.
+
+        Args:
+            language: The language to use for speech-to-text transcription.
+        """
        logger.info(f"Switching STT language to: [{language}]")
        self._settings["language"] = self.language_to_service_language(language)
 
     async def set_model(self, model: str):
+        """Set the STT model.
+
+        Args:
+            model: The model name to use for transcription.
+        """
         await super().set_model(model)
         logger.info(f"Switching STT model to: [{model}]")
 
@@ -229,7 +267,7 @@ class FalSTTService(SegmentedSTTService):
             audio: Raw audio bytes in WAV format (already converted by base class).
 
         Yields:
-            Frame: TranscriptionFrame containing the transcribed text.
+            Frame: TranscriptionFrame containing the transcribed text, or ErrorFrame on failure.
 
         Note:
             The audio is already in WAV format from the SegmentedSTTService.
@@ -253,7 +291,7 @@ class FalSTTService(SegmentedSTTService):
             logger.debug(f"Transcription: [{text}]")
             yield TranscriptionFrame(
                 text,
-
+                self._user_id,
                 time_now_iso8601(),
                 Language(self._settings["language"]),
                 result=response,
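
A minimal configuration sketch based on the constructor and InputParams documented above (values are illustrative; per the docstring, api_key falls back to the FAL_KEY environment variable when omitted):

from pipecat.services.fal.stt import FalSTTService
from pipecat.transcriptions.language import Language

stt = FalSTTService(
    api_key=None,  # resolved from FAL_KEY if not provided
    params=FalSTTService.InputParams(
        language=Language.EN,  # default
        task="transcribe",     # or "translate"
    ),
)

# The language can also be switched at runtime (from an async context):
# await stt.set_language(Language.FR)

Note the behavioral change in the last hunk: TranscriptionFrame is now constructed with self._user_id, so transcriptions are attributed to the active user.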
pipecat/services/fireworks/llm.py
CHANGED
@@ -20,12 +20,6 @@ class FireworksLLMService(OpenAILLMService):
 
     This service extends OpenAILLMService to connect to Fireworks' API endpoint while
     maintaining full compatibility with OpenAI's interface and functionality.
-
-    Args:
-        api_key: The API key for accessing Fireworks AI.
-        model: The model identifier to use. Defaults to "accounts/fireworks/models/firefunction-v2".
-        base_url: The base URL for Fireworks API. Defaults to "https://api.fireworks.ai/inference/v1".
-        **kwargs: Additional keyword arguments passed to OpenAILLMService.
     """
 
     def __init__(
@@ -36,6 +30,14 @@ class FireworksLLMService(OpenAILLMService):
         base_url: str = "https://api.fireworks.ai/inference/v1",
         **kwargs,
     ):
+        """Initialize the Fireworks LLM service.
+
+        Args:
+            api_key: The API key for accessing Fireworks AI.
+            model: The model identifier to use. Defaults to "accounts/fireworks/models/firefunction-v2".
+            base_url: The base URL for Fireworks API. Defaults to "https://api.fireworks.ai/inference/v1".
+            **kwargs: Additional keyword arguments passed to OpenAILLMService.
+        """
         super().__init__(api_key=api_key, base_url=base_url, model=model, **kwargs)
 
     def create_client(self, api_key=None, base_url=None, **kwargs):
@@ -52,20 +54,13 @@ class FireworksLLMService(OpenAILLMService):
         logger.debug(f"Creating Fireworks client with api {base_url}")
         return super().create_client(api_key, base_url, **kwargs)
 
-
+    def build_chat_completion_params(
         self, context: OpenAILLMContext, messages: List[ChatCompletionMessageParam]
-    ):
-        """
+    ) -> dict:
+        """Build parameters for Fireworks chat completion request.
 
-
-
-
-        Args:
-            context: The OpenAI LLM context containing tools and settings.
-            messages: List of chat completion message parameters.
-
-        Returns:
-            Async generator yielding chat completion chunks from Fireworks API.
+        Fireworks doesn't support some OpenAI parameters like seed, max_completion_tokens,
+        and stream_options.
         """
         params = {
             "model": self.model_name,
@@ -81,6 +76,4 @@ class FireworksLLMService(OpenAILLMService):
         }
 
         params.update(self._settings["extra"])
-
-        chunks = await self._client.chat.completions.create(**params)
-        return chunks
+        return params
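
The refactor above narrows this subclass's responsibility: instead of issuing the streaming call itself (the removed chat.completions.create call), it now only returns the request parameter dict via build_chat_completion_params, presumably consumed by the base OpenAILLMService. Construction is unchanged; a minimal sketch using the documented defaults (the API key is a placeholder):

from pipecat.services.fireworks.llm import FireworksLLMService

llm = FireworksLLMService(
    api_key="<FIREWORKS_API_KEY>",  # placeholder
    model="accounts/fireworks/models/firefunction-v2",  # documented default
    base_url="https://api.fireworks.ai/inference/v1",  # documented default
)

Because it extends OpenAILLMService, the service drops into a pipeline anywhere an OpenAI-compatible LLM processor is expected.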
pipecat/services/fish/tts.py
CHANGED
@@ -4,6 +4,12 @@
 # SPDX-License-Identifier: BSD 2-Clause License
 #
 
+"""Fish Audio text-to-speech service implementation.
+
+This module provides integration with Fish Audio's real-time TTS WebSocket API
+for streaming text-to-speech synthesis with customizable voice parameters.
+"""
+
 import uuid
 from typing import AsyncGenerator, Literal, Optional
 
@@ -28,7 +34,8 @@ from pipecat.utils.tracing.service_decorators import traced_tts
 
 try:
     import ormsgpack
-    import
+    from websockets.asyncio.client import connect as websocket_connect
+    from websockets.protocol import State
 except ModuleNotFoundError as e:
     logger.error(f"Exception: {e}")
     logger.error("In order to use Fish Audio, you need to `pip install pipecat-ai[fish]`.")
@@ -39,9 +46,27 @@ FishAudioOutputFormat = Literal["opus", "mp3", "pcm", "wav"]
 
 
 class FishAudioTTSService(InterruptibleTTSService):
+    """Fish Audio text-to-speech service with WebSocket streaming.
+
+    Provides real-time text-to-speech synthesis using Fish Audio's WebSocket API.
+    Supports various audio formats, customizable prosody controls, and streaming
+    audio generation with interruption handling.
+    """
+
     class InputParams(BaseModel):
+        """Input parameters for Fish Audio TTS configuration.
+
+        Parameters:
+            language: Language for synthesis. Defaults to English.
+            latency: Latency mode ("normal" or "balanced"). Defaults to "normal".
+            normalize: Whether to normalize audio output. Defaults to True.
+            prosody_speed: Speech speed multiplier (0.5-2.0). Defaults to 1.0.
+            prosody_volume: Volume adjustment in dB. Defaults to 0.
+        """
+
         language: Optional[Language] = Language.EN
         latency: Optional[str] = "normal"  # "normal" or "balanced"
+        normalize: Optional[bool] = True
         prosody_speed: Optional[float] = 1.0  # Speech speed (0.5-2.0)
         prosody_volume: Optional[int] = 0  # Volume adjustment in dB
 
@@ -49,12 +74,31 @@ class FishAudioTTSService(InterruptibleTTSService):
         self,
         *,
         api_key: str,
-
+        reference_id: Optional[str] = None,  # This is the voice ID
+        model: Optional[str] = None,  # Deprecated
+        model_id: str = "speech-1.5",
         output_format: FishAudioOutputFormat = "pcm",
         sample_rate: Optional[int] = None,
         params: Optional[InputParams] = None,
         **kwargs,
     ):
+        """Initialize the Fish Audio TTS service.
+
+        Args:
+            api_key: Fish Audio API key for authentication.
+            reference_id: Reference ID of the voice model to use for synthesis.
+            model: Deprecated. Reference ID of the voice model to use for synthesis.
+
+                .. deprecated:: 0.0.74
+                    The `model` parameter is deprecated and will be removed in version 0.1.0.
+                    Use `reference_id` instead to specify the voice model.
+
+            model_id: Specify which Fish Audio TTS model to use (e.g. "speech-1.5").
+            output_format: Audio output format. Defaults to "pcm".
+            sample_rate: Audio sample rate. If None, uses default.
+            params: Additional input parameters for voice customization.
+            **kwargs: Additional arguments passed to the parent service.
+        """
         super().__init__(
             push_stop_frames=True,
             pause_frame_processing=True,
@@ -64,6 +108,26 @@ class FishAudioTTSService(InterruptibleTTSService):
 
         params = params or FishAudioTTSService.InputParams()
 
+        # Validation for model and reference_id parameters
+        if model and reference_id:
+            raise ValueError(
+                "Cannot specify both 'model' and 'reference_id'. Use 'reference_id' only."
+            )
+
+        if model is None and reference_id is None:
+            raise ValueError("Must specify 'reference_id' (or deprecated 'model') parameter.")
+
+        if model:
+            import warnings
+
+            warnings.warn(
+                "Parameter 'model' is deprecated and will be removed in a future version. "
+                "Use 'reference_id' instead.",
+                DeprecationWarning,
+                stacklevel=2,
+            )
+            reference_id = model
+
         self._api_key = api_key
         self._base_url = "wss://api.fish.audio/v1/tts/live"
         self._websocket = None
@@ -75,33 +139,60 @@ class FishAudioTTSService(InterruptibleTTSService):
             "sample_rate": 0,
             "latency": params.latency,
             "format": output_format,
+            "normalize": params.normalize,
             "prosody": {
                 "speed": params.prosody_speed,
                 "volume": params.prosody_volume,
             },
-            "reference_id":
+            "reference_id": reference_id,
         }
 
-        self.set_model_name(
+        self.set_model_name(model_id)
 
     def can_generate_metrics(self) -> bool:
+        """Check if this service can generate processing metrics.
+
+        Returns:
+            True, as Fish Audio service supports metrics generation.
+        """
         return True
 
     async def set_model(self, model: str):
-
+        """Set the TTS model and reconnect.
+
+        Args:
+            model: The model name to use for synthesis.
+        """
         await super().set_model(model)
         logger.info(f"Switching TTS model to: [{model}]")
+        await self._disconnect()
+        await self._connect()
 
     async def start(self, frame: StartFrame):
+        """Start the Fish Audio TTS service.
+
+        Args:
+            frame: The start frame containing initialization parameters.
+        """
         await super().start(frame)
         self._settings["sample_rate"] = self.sample_rate
         await self._connect()
 
     async def stop(self, frame: EndFrame):
+        """Stop the Fish Audio TTS service.
+
+        Args:
+            frame: The end frame.
+        """
         await super().stop(frame)
         await self._disconnect()
 
     async def cancel(self, frame: CancelFrame):
+        """Cancel the Fish Audio TTS service.
+
+        Args:
+            frame: The cancel frame.
+        """
         await super().cancel(frame)
         await self._disconnect()
 
@@ -120,12 +211,13 @@ class FishAudioTTSService(InterruptibleTTSService):
 
     async def _connect_websocket(self):
         try:
-            if self._websocket and self._websocket.
+            if self._websocket and self._websocket.state is State.OPEN:
                 return
 
             logger.debug("Connecting to Fish Audio")
             headers = {"Authorization": f"Bearer {self._api_key}"}
-
+            headers["model"] = self.model_name
+            self._websocket = await websocket_connect(self._base_url, additional_headers=headers)
 
             # Send initial start message with ormsgpack
             start_message = {"event": "start", "request": {"text": "", **self._settings}}
@@ -155,7 +247,7 @@ class FishAudioTTSService(InterruptibleTTSService):
     async def flush_audio(self):
         """Flush any buffered audio by sending a flush event to Fish Audio."""
         logger.trace(f"{self}: Flushing audio buffers")
-        if not self._websocket or self._websocket.
+        if not self._websocket or self._websocket.state is State.CLOSED:
            return
        flush_message = {"event": "flush"}
        await self._get_websocket().send(ormsgpack.packb(flush_message))
@@ -191,9 +283,17 @@ class FishAudioTTSService(InterruptibleTTSService):
 
     @traced_tts
     async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
+        """Generate speech from text using Fish Audio's streaming API.
+
+        Args:
+            text: The text to synthesize into speech.
+
+        Yields:
+            Frame: Audio frames and control frames for the synthesized speech.
+        """
         logger.debug(f"{self}: Generating Fish TTS: [{text}]")
         try:
-            if not self._websocket or self._websocket.
+            if not self._websocket or self._websocket.state is State.CLOSED:
                 await self._connect()
 
             if not self._request_id:
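
Given the deprecation above, a minimal migration sketch (key and voice ID are placeholders; passing both model and reference_id, or neither, now raises ValueError):

from pipecat.services.fish.tts import FishAudioTTSService

# Before: still works, but emits DeprecationWarning and is remapped to reference_id.
# tts = FishAudioTTSService(api_key="<FISH_API_KEY>", model="<voice-reference-id>")

# After: reference_id selects the voice; model_id selects the TTS model.
tts = FishAudioTTSService(
    api_key="<FISH_API_KEY>",  # placeholder
    reference_id="<voice-reference-id>",
    model_id="speech-1.5",  # documented default
    params=FishAudioTTSService.InputParams(
        latency="balanced",  # "normal" or "balanced"
        normalize=True,      # new in this release; defaults to True
    ),
)

As _connect_websocket above shows, model_id is also sent as a "model" header when the WebSocket connects.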
pipecat/services/gemini_multimodal_live/events.py
CHANGED
@@ -44,6 +44,17 @@ class ContentPart(BaseModel):
 
     text: Optional[str] = Field(default=None, validate_default=False)
     inlineData: Optional[MediaChunk] = Field(default=None, validate_default=False)
+    fileData: Optional["FileData"] = Field(default=None, validate_default=False)
+
+
+class FileData(BaseModel):
+    """Represents a file reference in the Gemini File API."""
+
+    mimeType: str
+    fileUri: str
+
+
+ContentPart.model_rebuild()  # Rebuild model to resolve forward reference
 
 
 class Turn(BaseModel):
@@ -103,13 +114,15 @@ class RealtimeInputConfig(BaseModel):
 
 
 class RealtimeInput(BaseModel):
-    """Contains realtime input media chunks.
+    """Contains realtime input media chunks and text.
 
     Parameters:
         mediaChunks: List of media chunks for realtime processing.
+        text: Text for realtime processing.
     """
 
-    mediaChunks: List[MediaChunk]
+    mediaChunks: Optional[List[MediaChunk]] = None
+    text: Optional[str] = None
 
 
 class ClientContent(BaseModel):
@@ -179,6 +192,24 @@ class VideoInputMessage(BaseModel):
     )
 
 
+class TextInputMessage(BaseModel):
+    """Message containing text input data."""
+
+    realtimeInput: RealtimeInput
+
+    @classmethod
+    def from_text(cls, text: str) -> "TextInputMessage":
+        """Create a text input message from a string.
+
+        Args:
+            text: The text to send.
+
+        Returns:
+            A TextInputMessage instance.
+        """
+        return cls(realtimeInput=RealtimeInput(text=text))
+
+
 class ClientContentMessage(BaseModel):
     """Message containing client content for the API.
 
@@ -237,6 +268,55 @@ class Config(BaseModel):
     setup: Setup
 
 
+#
+# Grounding metadata models
+#
+
+
+class SearchEntryPoint(BaseModel):
+    """Represents the search entry point with rendered content for search suggestions."""
+
+    renderedContent: Optional[str] = None
+
+
+class WebSource(BaseModel):
+    """Represents a web source from grounding chunks."""
+
+    uri: Optional[str] = None
+    title: Optional[str] = None
+
+
+class GroundingChunk(BaseModel):
+    """Represents a grounding chunk containing web source information."""
+
+    web: Optional[WebSource] = None
+
+
+class GroundingSegment(BaseModel):
+    """Represents a segment of text that is grounded."""
+
+    startIndex: Optional[int] = None
+    endIndex: Optional[int] = None
+    text: Optional[str] = None
+
+
+class GroundingSupport(BaseModel):
+    """Represents support information for grounded text segments."""
+
+    segment: Optional[GroundingSegment] = None
+    groundingChunkIndices: Optional[List[int]] = None
+    confidenceScores: Optional[List[float]] = None
+
+
+class GroundingMetadata(BaseModel):
+    """Represents grounding metadata from Google Search."""
+
+    searchEntryPoint: Optional[SearchEntryPoint] = None
+    groundingChunks: Optional[List[GroundingChunk]] = None
+    groundingSupports: Optional[List[GroundingSupport]] = None
+    webSearchQueries: Optional[List[str]] = None
+
+
 #
 # Server events
 #
@@ -328,6 +408,7 @@ class ServerContent(BaseModel):
     turnComplete: Optional[bool] = None
     inputTranscription: Optional[BidiGenerateContentTranscription] = None
    outputTranscription: Optional[BidiGenerateContentTranscription] = None
+    groundingMetadata: Optional[GroundingMetadata] = None
 
 
 class FunctionCall(BaseModel):
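
A short sketch of how the new text-input path serializes, using the models above (model_dump_json is standard Pydantic v2; the transport send is illustrative):

from pipecat.services.gemini_multimodal_live.events import TextInputMessage

msg = TextInputMessage.from_text("What's the weather in Lisbon?")
payload = msg.model_dump_json(exclude_none=True)
# -> {"realtimeInput":{"text":"What's the weather in Lisbon?"}}
# await websocket.send(payload)  # illustrative transport call

On the receiving side, ServerContent can now carry groundingMetadata; for example, serverContent.groundingMetadata.webSearchQueries lists the Google Search queries used to ground a response, and each GroundingChunk.web gives the source uri and title.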