dv-pipecat-ai 0.0.74.dev770__py3-none-any.whl → 0.0.82.dev776__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package as they appear in their public registries. It is provided for informational purposes only.
Potentially problematic release.
This version of dv-pipecat-ai might be problematic.
- {dv_pipecat_ai-0.0.74.dev770.dist-info → dv_pipecat_ai-0.0.82.dev776.dist-info}/METADATA +137 -93
- dv_pipecat_ai-0.0.82.dev776.dist-info/RECORD +340 -0
- pipecat/__init__.py +17 -0
- pipecat/adapters/base_llm_adapter.py +36 -1
- pipecat/adapters/schemas/direct_function.py +296 -0
- pipecat/adapters/schemas/function_schema.py +15 -6
- pipecat/adapters/schemas/tools_schema.py +55 -7
- pipecat/adapters/services/anthropic_adapter.py +22 -3
- pipecat/adapters/services/aws_nova_sonic_adapter.py +23 -3
- pipecat/adapters/services/bedrock_adapter.py +22 -3
- pipecat/adapters/services/gemini_adapter.py +16 -3
- pipecat/adapters/services/open_ai_adapter.py +17 -2
- pipecat/adapters/services/open_ai_realtime_adapter.py +23 -3
- pipecat/audio/filters/base_audio_filter.py +30 -6
- pipecat/audio/filters/koala_filter.py +37 -2
- pipecat/audio/filters/krisp_filter.py +59 -6
- pipecat/audio/filters/noisereduce_filter.py +37 -0
- pipecat/audio/interruptions/base_interruption_strategy.py +25 -5
- pipecat/audio/interruptions/min_words_interruption_strategy.py +21 -4
- pipecat/audio/mixers/base_audio_mixer.py +30 -7
- pipecat/audio/mixers/soundfile_mixer.py +53 -6
- pipecat/audio/resamplers/base_audio_resampler.py +17 -9
- pipecat/audio/resamplers/resampy_resampler.py +26 -1
- pipecat/audio/resamplers/soxr_resampler.py +32 -1
- pipecat/audio/resamplers/soxr_stream_resampler.py +101 -0
- pipecat/audio/utils.py +194 -1
- pipecat/audio/vad/silero.py +60 -3
- pipecat/audio/vad/vad_analyzer.py +114 -30
- pipecat/clocks/base_clock.py +19 -0
- pipecat/clocks/system_clock.py +25 -0
- pipecat/extensions/voicemail/__init__.py +0 -0
- pipecat/extensions/voicemail/voicemail_detector.py +707 -0
- pipecat/frames/frames.py +590 -156
- pipecat/metrics/metrics.py +64 -1
- pipecat/observers/base_observer.py +58 -19
- pipecat/observers/loggers/debug_log_observer.py +56 -64
- pipecat/observers/loggers/llm_log_observer.py +8 -1
- pipecat/observers/loggers/transcription_log_observer.py +19 -7
- pipecat/observers/loggers/user_bot_latency_log_observer.py +32 -5
- pipecat/observers/turn_tracking_observer.py +26 -1
- pipecat/pipeline/base_pipeline.py +5 -7
- pipecat/pipeline/base_task.py +52 -9
- pipecat/pipeline/parallel_pipeline.py +121 -177
- pipecat/pipeline/pipeline.py +129 -20
- pipecat/pipeline/runner.py +50 -1
- pipecat/pipeline/sync_parallel_pipeline.py +132 -32
- pipecat/pipeline/task.py +263 -280
- pipecat/pipeline/task_observer.py +85 -34
- pipecat/pipeline/to_be_updated/merge_pipeline.py +32 -2
- pipecat/processors/aggregators/dtmf_aggregator.py +29 -22
- pipecat/processors/aggregators/gated.py +25 -24
- pipecat/processors/aggregators/gated_openai_llm_context.py +22 -2
- pipecat/processors/aggregators/llm_response.py +398 -89
- pipecat/processors/aggregators/openai_llm_context.py +161 -13
- pipecat/processors/aggregators/sentence.py +25 -14
- pipecat/processors/aggregators/user_response.py +28 -3
- pipecat/processors/aggregators/vision_image_frame.py +24 -14
- pipecat/processors/async_generator.py +28 -0
- pipecat/processors/audio/audio_buffer_processor.py +78 -37
- pipecat/processors/consumer_processor.py +25 -6
- pipecat/processors/filters/frame_filter.py +23 -0
- pipecat/processors/filters/function_filter.py +30 -0
- pipecat/processors/filters/identity_filter.py +17 -2
- pipecat/processors/filters/null_filter.py +24 -1
- pipecat/processors/filters/stt_mute_filter.py +56 -21
- pipecat/processors/filters/wake_check_filter.py +46 -3
- pipecat/processors/filters/wake_notifier_filter.py +21 -3
- pipecat/processors/frame_processor.py +488 -131
- pipecat/processors/frameworks/langchain.py +38 -3
- pipecat/processors/frameworks/rtvi.py +719 -34
- pipecat/processors/gstreamer/pipeline_source.py +41 -0
- pipecat/processors/idle_frame_processor.py +26 -3
- pipecat/processors/logger.py +23 -0
- pipecat/processors/metrics/frame_processor_metrics.py +77 -4
- pipecat/processors/metrics/sentry.py +42 -4
- pipecat/processors/producer_processor.py +34 -14
- pipecat/processors/text_transformer.py +22 -10
- pipecat/processors/transcript_processor.py +48 -29
- pipecat/processors/user_idle_processor.py +31 -21
- pipecat/runner/__init__.py +1 -0
- pipecat/runner/daily.py +132 -0
- pipecat/runner/livekit.py +148 -0
- pipecat/runner/run.py +543 -0
- pipecat/runner/types.py +67 -0
- pipecat/runner/utils.py +515 -0
- pipecat/serializers/base_serializer.py +42 -0
- pipecat/serializers/exotel.py +17 -6
- pipecat/serializers/genesys.py +95 -0
- pipecat/serializers/livekit.py +33 -0
- pipecat/serializers/plivo.py +16 -15
- pipecat/serializers/protobuf.py +37 -1
- pipecat/serializers/telnyx.py +18 -17
- pipecat/serializers/twilio.py +32 -16
- pipecat/services/ai_service.py +5 -3
- pipecat/services/anthropic/llm.py +113 -43
- pipecat/services/assemblyai/models.py +63 -5
- pipecat/services/assemblyai/stt.py +64 -11
- pipecat/services/asyncai/__init__.py +0 -0
- pipecat/services/asyncai/tts.py +501 -0
- pipecat/services/aws/llm.py +185 -111
- pipecat/services/aws/stt.py +217 -23
- pipecat/services/aws/tts.py +118 -52
- pipecat/services/aws/utils.py +101 -5
- pipecat/services/aws_nova_sonic/aws.py +82 -64
- pipecat/services/aws_nova_sonic/context.py +15 -6
- pipecat/services/azure/common.py +10 -2
- pipecat/services/azure/image.py +32 -0
- pipecat/services/azure/llm.py +9 -7
- pipecat/services/azure/stt.py +65 -2
- pipecat/services/azure/tts.py +154 -23
- pipecat/services/cartesia/stt.py +125 -8
- pipecat/services/cartesia/tts.py +102 -38
- pipecat/services/cerebras/llm.py +15 -23
- pipecat/services/deepgram/stt.py +19 -11
- pipecat/services/deepgram/tts.py +36 -0
- pipecat/services/deepseek/llm.py +14 -23
- pipecat/services/elevenlabs/tts.py +330 -64
- pipecat/services/fal/image.py +43 -0
- pipecat/services/fal/stt.py +48 -10
- pipecat/services/fireworks/llm.py +14 -21
- pipecat/services/fish/tts.py +109 -9
- pipecat/services/gemini_multimodal_live/__init__.py +1 -0
- pipecat/services/gemini_multimodal_live/events.py +83 -2
- pipecat/services/gemini_multimodal_live/file_api.py +189 -0
- pipecat/services/gemini_multimodal_live/gemini.py +218 -21
- pipecat/services/gladia/config.py +17 -10
- pipecat/services/gladia/stt.py +82 -36
- pipecat/services/google/frames.py +40 -0
- pipecat/services/google/google.py +2 -0
- pipecat/services/google/image.py +39 -2
- pipecat/services/google/llm.py +176 -58
- pipecat/services/google/llm_openai.py +26 -4
- pipecat/services/google/llm_vertex.py +37 -15
- pipecat/services/google/rtvi.py +41 -0
- pipecat/services/google/stt.py +65 -17
- pipecat/services/google/test-google-chirp.py +45 -0
- pipecat/services/google/tts.py +390 -19
- pipecat/services/grok/llm.py +8 -6
- pipecat/services/groq/llm.py +8 -6
- pipecat/services/groq/stt.py +13 -9
- pipecat/services/groq/tts.py +40 -0
- pipecat/services/hamsa/__init__.py +9 -0
- pipecat/services/hamsa/stt.py +241 -0
- pipecat/services/heygen/__init__.py +5 -0
- pipecat/services/heygen/api.py +281 -0
- pipecat/services/heygen/client.py +620 -0
- pipecat/services/heygen/video.py +338 -0
- pipecat/services/image_service.py +5 -3
- pipecat/services/inworld/__init__.py +1 -0
- pipecat/services/inworld/tts.py +592 -0
- pipecat/services/llm_service.py +127 -45
- pipecat/services/lmnt/tts.py +80 -7
- pipecat/services/mcp_service.py +85 -44
- pipecat/services/mem0/memory.py +42 -13
- pipecat/services/minimax/tts.py +74 -15
- pipecat/services/mistral/__init__.py +0 -0
- pipecat/services/mistral/llm.py +185 -0
- pipecat/services/moondream/vision.py +55 -10
- pipecat/services/neuphonic/tts.py +275 -48
- pipecat/services/nim/llm.py +8 -6
- pipecat/services/ollama/llm.py +27 -7
- pipecat/services/openai/base_llm.py +54 -16
- pipecat/services/openai/image.py +30 -0
- pipecat/services/openai/llm.py +7 -5
- pipecat/services/openai/stt.py +13 -9
- pipecat/services/openai/tts.py +42 -10
- pipecat/services/openai_realtime_beta/azure.py +11 -9
- pipecat/services/openai_realtime_beta/context.py +7 -5
- pipecat/services/openai_realtime_beta/events.py +10 -7
- pipecat/services/openai_realtime_beta/openai.py +37 -18
- pipecat/services/openpipe/llm.py +30 -24
- pipecat/services/openrouter/llm.py +9 -7
- pipecat/services/perplexity/llm.py +15 -19
- pipecat/services/piper/tts.py +26 -12
- pipecat/services/playht/tts.py +227 -65
- pipecat/services/qwen/llm.py +8 -6
- pipecat/services/rime/tts.py +128 -17
- pipecat/services/riva/stt.py +160 -22
- pipecat/services/riva/tts.py +67 -2
- pipecat/services/sambanova/llm.py +19 -17
- pipecat/services/sambanova/stt.py +14 -8
- pipecat/services/sarvam/tts.py +60 -13
- pipecat/services/simli/video.py +82 -21
- pipecat/services/soniox/__init__.py +0 -0
- pipecat/services/soniox/stt.py +398 -0
- pipecat/services/speechmatics/stt.py +29 -17
- pipecat/services/stt_service.py +47 -11
- pipecat/services/tavus/video.py +94 -25
- pipecat/services/together/llm.py +8 -6
- pipecat/services/tts_service.py +77 -53
- pipecat/services/ultravox/stt.py +46 -43
- pipecat/services/vision_service.py +5 -3
- pipecat/services/websocket_service.py +12 -11
- pipecat/services/whisper/base_stt.py +58 -12
- pipecat/services/whisper/stt.py +69 -58
- pipecat/services/xtts/tts.py +59 -2
- pipecat/sync/base_notifier.py +19 -0
- pipecat/sync/event_notifier.py +24 -0
- pipecat/tests/utils.py +73 -5
- pipecat/transcriptions/language.py +24 -0
- pipecat/transports/base_input.py +112 -8
- pipecat/transports/base_output.py +235 -13
- pipecat/transports/base_transport.py +119 -0
- pipecat/transports/local/audio.py +76 -0
- pipecat/transports/local/tk.py +84 -0
- pipecat/transports/network/fastapi_websocket.py +174 -15
- pipecat/transports/network/small_webrtc.py +383 -39
- pipecat/transports/network/webrtc_connection.py +214 -8
- pipecat/transports/network/websocket_client.py +171 -1
- pipecat/transports/network/websocket_server.py +147 -9
- pipecat/transports/services/daily.py +792 -70
- pipecat/transports/services/helpers/daily_rest.py +122 -129
- pipecat/transports/services/livekit.py +339 -4
- pipecat/transports/services/tavus.py +273 -38
- pipecat/utils/asyncio/task_manager.py +92 -186
- pipecat/utils/base_object.py +83 -1
- pipecat/utils/network.py +2 -0
- pipecat/utils/string.py +114 -58
- pipecat/utils/text/base_text_aggregator.py +44 -13
- pipecat/utils/text/base_text_filter.py +46 -0
- pipecat/utils/text/markdown_text_filter.py +70 -14
- pipecat/utils/text/pattern_pair_aggregator.py +18 -14
- pipecat/utils/text/simple_text_aggregator.py +43 -2
- pipecat/utils/text/skip_tags_aggregator.py +21 -13
- pipecat/utils/time.py +36 -0
- pipecat/utils/tracing/class_decorators.py +32 -7
- pipecat/utils/tracing/conversation_context_provider.py +12 -2
- pipecat/utils/tracing/service_attributes.py +80 -64
- pipecat/utils/tracing/service_decorators.py +48 -21
- pipecat/utils/tracing/setup.py +13 -7
- pipecat/utils/tracing/turn_context_provider.py +12 -2
- pipecat/utils/tracing/turn_trace_observer.py +27 -0
- pipecat/utils/utils.py +14 -14
- dv_pipecat_ai-0.0.74.dev770.dist-info/RECORD +0 -319
- pipecat/examples/daily_runner.py +0 -64
- pipecat/examples/run.py +0 -265
- pipecat/utils/asyncio/watchdog_async_iterator.py +0 -72
- pipecat/utils/asyncio/watchdog_event.py +0 -42
- pipecat/utils/asyncio/watchdog_priority_queue.py +0 -48
- pipecat/utils/asyncio/watchdog_queue.py +0 -48
- {dv_pipecat_ai-0.0.74.dev770.dist-info → dv_pipecat_ai-0.0.82.dev776.dist-info}/WHEEL +0 -0
- {dv_pipecat_ai-0.0.74.dev770.dist-info → dv_pipecat_ai-0.0.82.dev776.dist-info}/licenses/LICENSE +0 -0
- {dv_pipecat_ai-0.0.74.dev770.dist-info → dv_pipecat_ai-0.0.82.dev776.dist-info}/top_level.txt +0 -0
- /pipecat/{examples → extensions}/__init__.py +0 -0

pipecat/services/gladia/config.py CHANGED
@@ -4,6 +4,8 @@
 # SPDX-License-Identifier: BSD 2-Clause License
 #

+"""Configuration for the Gladia STT service."""
+
 from typing import Any, Dict, List, Optional, Union

 from pydantic import BaseModel
@@ -14,7 +16,7 @@ from pipecat.transcriptions.language import Language
 class LanguageConfig(BaseModel):
     """Configuration for language detection and handling.

-
+    Parameters:
         languages: List of language codes to use for transcription
         code_switching: Whether to auto-detect language changes during transcription
     """
@@ -26,7 +28,7 @@ class LanguageConfig(BaseModel):
 class PreProcessingConfig(BaseModel):
     """Configuration for audio pre-processing options.

-
+    Parameters:
         speech_threshold: Sensitivity for speech detection (0-1)
     """

@@ -36,7 +38,7 @@ class PreProcessingConfig(BaseModel):
 class CustomVocabularyItem(BaseModel):
     """Represents a custom vocabulary item with an intensity value.

-
+    Parameters:
         value: The vocabulary word or phrase
         intensity: The bias intensity for this vocabulary item (0-1)
     """
@@ -48,7 +50,7 @@ class CustomVocabularyItem(BaseModel):
 class CustomVocabularyConfig(BaseModel):
     """Configuration for custom vocabulary.

-
+    Parameters:
         vocabulary: List of words/phrases or CustomVocabularyItem objects
         default_intensity: Default intensity for simple string vocabulary items
     """
@@ -60,7 +62,7 @@ class CustomVocabularyConfig(BaseModel):
 class CustomSpellingConfig(BaseModel):
     """Configuration for custom spelling rules.

-
+    Parameters:
         spelling_dictionary: Mapping of correct spellings to phonetic variations
     """

@@ -70,7 +72,7 @@ class CustomSpellingConfig(BaseModel):
 class TranslationConfig(BaseModel):
     """Configuration for real-time translation.

-
+    Parameters:
         target_languages: List of target language codes for translation
         model: Translation model to use ("base" or "enhanced")
         match_original_utterances: Whether to align translations with original utterances
@@ -92,7 +94,7 @@ class TranslationConfig(BaseModel):
 class RealtimeProcessingConfig(BaseModel):
     """Configuration for real-time processing features.

-
+    Parameters:
         words_accurate_timestamps: Whether to provide per-word timestamps
         custom_vocabulary: Whether to enable custom vocabulary
         custom_vocabulary_config: Custom vocabulary configuration
@@ -118,7 +120,7 @@ class RealtimeProcessingConfig(BaseModel):
 class MessagesConfig(BaseModel):
     """Configuration for controlling which message types are sent via WebSocket.

-
+    Parameters:
         receive_partial_transcripts: Whether to receive intermediate transcription results
         receive_final_transcripts: Whether to receive final transcription results
         receive_speech_events: Whether to receive speech begin/end events
@@ -144,14 +146,19 @@ class MessagesConfig(BaseModel):
 class GladiaInputParams(BaseModel):
     """Configuration parameters for the Gladia STT service.

-
+    Parameters:
         encoding: Audio encoding format
         bit_depth: Audio bit depth
         channels: Number of audio channels
         custom_metadata: Additional metadata to include with requests
         endpointing: Silence duration in seconds to mark end of speech
         maximum_duration_without_endpointing: Maximum utterance duration without silence
-        language:
+        language: Language code for transcription
+
+            .. deprecated:: 0.0.62
+                The 'language' parameter is deprecated and will be removed in a future version.
+                Use 'language_config' instead.
+
         language_config: Detailed language configuration
         pre_processing: Audio pre-processing options
         realtime_processing: Real-time processing features
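The hunk above deprecates the single `language` field in favor of `language_config`. A minimal sketch of the newer style, assuming only the field names documented in the docstring above (`language_config`, `languages`, `code_switching`):

```python
from pipecat.services.gladia.config import GladiaInputParams, LanguageConfig

# Replace the deprecated `language=...` single code with a LanguageConfig,
# which supports several languages plus automatic code switching.
params = GladiaInputParams(
    language_config=LanguageConfig(
        languages=["en", "fr"],  # language codes to transcribe
        code_switching=True,     # auto-detect language changes during transcription
    )
)
```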
pipecat/services/gladia/stt.py CHANGED

@@ -4,11 +4,17 @@
 # SPDX-License-Identifier: BSD 2-Clause License
 #

+"""Gladia Speech-to-Text (STT) service implementation.
+
+This module provides a Speech-to-Text service using Gladia's real-time WebSocket API,
+supporting multiple languages, custom vocabulary, and various audio processing options.
+"""
+
 import asyncio
 import base64
 import json
 import warnings
-from typing import Any, AsyncGenerator, Dict, List, Optional
+from typing import Any, AsyncGenerator, Dict, Literal, List, Optional

 import aiohttp
 from loguru import logger
@@ -31,12 +37,13 @@ from pipecat.services.gladia.config import (
 )
 from pipecat.services.stt_service import STTService
 from pipecat.transcriptions.language import Language
-from pipecat.utils.asyncio.watchdog_async_iterator import WatchdogAsyncIterator
 from pipecat.utils.time import time_now_iso8601
 from pipecat.utils.tracing.service_decorators import traced_stt

 try:
     import websockets
+    from websockets.asyncio.client import connect as websocket_connect
+    from websockets.protocol import State
 except ModuleNotFoundError as e:
     logger.error(f"Exception: {e}")
     logger.error("In order to use Gladia, you need to `pip install pipecat-ai[gladia]`.")
@@ -47,10 +54,10 @@ def language_to_gladia_language(language: Language) -> Optional[str]:
     """Convert a Language enum to Gladia's language code format.

     Args:
-        language: The Language enum value to convert
+        language: The Language enum value to convert.

     Returns:
-        The Gladia language code string or None if not supported
+        The Gladia language code string or None if not supported.
     """
     BASE_LANGUAGES = {
         Language.AF: "af",
@@ -186,8 +193,12 @@ class GladiaSTTService(STTService):

     This service connects to Gladia's WebSocket API for real-time transcription
     with support for multiple languages, custom vocabulary, and various processing options.
+    Provides automatic reconnection, audio buffering, and comprehensive error handling.

     For complete API documentation, see: https://docs.gladia.io/api-reference/v2/live/init
+
+    .. deprecated:: 0.0.62
+        Use :class:`~pipecat.services.gladia.config.GladiaInputParams` directly instead.
     """

     # Maintain backward compatibility
@@ -197,6 +208,7 @@ class GladiaSTTService(STTService):
         self,
         *,
         api_key: str,
+        region: Literal["us-west", "eu-west"] | None = None,
         url: str = "https://api.gladia.io/v2/live",
         confidence: float = 0.5,
         sample_rate: Optional[int] = None,
@@ -210,16 +222,17 @@ class GladiaSTTService(STTService):
         """Initialize the Gladia STT service.

         Args:
-            api_key: Gladia API key
-
-
-
-
-
-
-
-
-
+            api_key: Gladia API key for authentication.
+            region: Region used to process audio. eu-west or us-west. Defaults to eu-west.
+            url: Gladia API URL. Defaults to "https://api.gladia.io/v2/live".
+            confidence: Minimum confidence threshold for transcriptions (0.0-1.0).
+            sample_rate: Audio sample rate in Hz. If None, uses service default.
+            model: Model to use for transcription. Defaults to "solaria-1".
+            params: Additional configuration parameters for Gladia service.
+            max_reconnection_attempts: Maximum number of reconnection attempts. Defaults to 5.
+            reconnection_delay: Initial delay between reconnection attempts in seconds.
+            max_buffer_size: Maximum size of audio buffer in bytes. Defaults to 20MB.
+            **kwargs: Additional arguments passed to the STTService parent class.
         """
         super().__init__(sample_rate=sample_rate, **kwargs)
         vocab: Optional[List[str]] = kwargs.pop("vocab", None)  # Get vocab from kwargs
@@ -236,6 +249,7 @@ class GladiaSTTService(STTService):
         )

         self._api_key = api_key
+        self._region = region
         self._url = url
         self.set_model_name(model)
         self._confidence = confidence
@@ -280,10 +294,22 @@ class GladiaSTTService(STTService):
         self._should_reconnect = True

     def can_generate_metrics(self) -> bool:
+        """Check if the service can generate performance metrics.
+
+        Returns:
+            True, indicating this service supports metrics generation.
+        """
         return True

     def language_to_service_language(self, language: Language) -> Optional[str]:
-        """Convert pipecat Language enum to Gladia's language code.
+        """Convert pipecat Language enum to Gladia's language code.
+
+        Args:
+            language: The Language enum value to convert.
+
+        Returns:
+            The Gladia language code string or None if not supported.
+        """
         return language_to_gladia_language(language)

     def _prepare_settings(self) -> Dict[str, Any]:
@@ -338,7 +364,11 @@ class GladiaSTTService(STTService):
         return settings

     async def start(self, frame: StartFrame):
-        """Start the Gladia STT websocket connection.
+        """Start the Gladia STT websocket connection.
+
+        Args:
+            frame: The start frame triggering service startup.
+        """
         await super().start(frame)
         if self._connection_task:
             return
@@ -347,7 +377,11 @@ class GladiaSTTService(STTService):
         self._connection_task = self.create_task(self._connection_handler())

     async def stop(self, frame: EndFrame):
-        """Stop the Gladia STT websocket connection.
+        """Stop the Gladia STT websocket connection.
+
+        Args:
+            frame: The end frame triggering service shutdown.
+        """
         await super().stop(frame)
         self._should_reconnect = False
         await self._send_stop_recording()
@@ -359,7 +393,11 @@ class GladiaSTTService(STTService):
         await self._cleanup_connection()

     async def cancel(self, frame: CancelFrame):
-        """Cancel the Gladia STT websocket connection.
+        """Cancel the Gladia STT websocket connection.
+
+        Args:
+            frame: The cancel frame triggering service cancellation.
+        """
         await super().cancel(frame)
         self._should_reconnect = False

@@ -370,7 +408,14 @@ class GladiaSTTService(STTService):
         await self._cleanup_connection()

     async def run_stt(self, audio: bytes) -> AsyncGenerator[Frame, None]:
-        """Run speech-to-text on audio data.
+        """Run speech-to-text on audio data.
+
+        Args:
+            audio: Raw audio bytes to transcribe.
+
+        Yields:
+            None (processing is handled asynchronously via WebSocket).
+        """
         await self.start_ttfb_metrics()
         await self.start_processing_metrics()

@@ -385,7 +430,7 @@ class GladiaSTTService(STTService):
             logger.warning(f"Audio buffer exceeded max size, trimmed {trim_size} bytes")

         # Send audio if connected
-        if self._connection_active and self._websocket and
+        if self._connection_active and self._websocket and self._websocket.state is State.OPEN:
             try:
                 await self._send_audio(audio)
             except websockets.exceptions.ConnectionClosed as e:
@@ -406,11 +451,11 @@ class GladiaSTTService(STTService):
                 self._reconnection_attempts = 0

                 # Connect with automatic reconnection
-                async with
+                async with websocket_connect(self._session_url) as websocket:
                     try:
                         self._websocket = websocket
                         self._connection_active = True
-                        logger.
+                        logger.debug(f"{self} Connected to Gladia WebSocket")

                         # Send buffered audio if any
                         await self._send_buffered_audio()
@@ -465,10 +510,14 @@ class GladiaSTTService(STTService):

     async def _setup_gladia(self, settings: Dict[str, Any]):
         async with aiohttp.ClientSession() as session:
+            params = {}
+            if self._region:
+                params["region"] = self._region
             async with session.post(
                 self._url,
-                headers={"X-Gladia-Key": self._api_key
+                headers={"X-Gladia-Key": self._api_key},
                 json=settings,
+                params=params,
             ) as response:
                 if response.ok:
                     return await response.json()
@@ -490,7 +539,7 @@ class GladiaSTTService(STTService):

     async def _send_audio(self, audio: bytes):
         """Send audio chunk with proper message format."""
-        if self._websocket and
+        if self._websocket and self._websocket.state is State.OPEN:
             data = base64.b64encode(audio).decode("utf-8")
             message = {"type": "audio_chunk", "data": {"chunk": data}}
             await self._websocket.send(json.dumps(message))
@@ -499,22 +548,21 @@ class GladiaSTTService(STTService):
         """Send any buffered audio after reconnection."""
         async with self._buffer_lock:
             if self._audio_buffer:
-                logger.
+                logger.debug(f"{self} Sending {len(self._audio_buffer)} bytes of buffered audio")
                 await self._send_audio(bytes(self._audio_buffer))

     async def _send_stop_recording(self):
-        if self._websocket and
+        if self._websocket and self._websocket.state is State.OPEN:
             await self._websocket.send(json.dumps({"type": "stop_recording"}))

     async def _keepalive_task_handler(self):
         """Send periodic empty audio chunks to keep the connection alive."""
         try:
-            KEEPALIVE_SLEEP = 20
+            KEEPALIVE_SLEEP = 20
             while self._connection_active:
-                self.reset_watchdog()
                 # Send keepalive (Gladia times out after 30 seconds)
                 await asyncio.sleep(KEEPALIVE_SLEEP)
-                if self._websocket and
+                if self._websocket and self._websocket.state is State.OPEN:
                     # Send an empty audio chunk as keepalive
                     empty_audio = b""
                     await self._send_audio(empty_audio)
@@ -528,7 +576,7 @@ class GladiaSTTService(STTService):

     async def _receive_task_handler(self):
         try:
-            async for message in
+            async for message in self._websocket:
                 content = json.loads(message)

                 # Handle audio chunk acknowledgments
@@ -553,7 +601,7 @@ class GladiaSTTService(STTService):
                         await self.push_frame(
                             TranscriptionFrame(
                                 transcript,
-
+                                self._user_id,
                                 time_now_iso8601(),
                                 language,
                                 result=content,
@@ -568,7 +616,7 @@ class GladiaSTTService(STTService):
                         await self.push_frame(
                             InterimTranscriptionFrame(
                                 transcript,
-
+                                self._user_id,
                                 time_now_iso8601(),
                                 language,
                                 result=content,
@@ -586,8 +634,6 @@ class GladiaSTTService(STTService):
                                 translation, "", time_now_iso8601(), translated_language
                             )
                         )
-
-                self.reset_watchdog()
         except websockets.exceptions.ConnectionClosed:
             # Expected when closing the connection
             pass
@@ -604,8 +650,8 @@ class GladiaSTTService(STTService):
             self._should_reconnect = False
             return False
         delay = self._reconnection_delay * (2 ** (self._reconnection_attempts - 1))
-        logger.
-            f"Reconnecting in {delay} seconds (attempt {self._reconnection_attempts}/{self._max_reconnection_attempts})"
+        logger.debug(
+            f"{self} Reconnecting in {delay} seconds (attempt {self._reconnection_attempts}/{self._max_reconnection_attempts})"
         )
         await asyncio.sleep(delay)
         return True
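Taken together, these hunks add a `region` selector and move the WebSocket open-state checks to the newer `websockets` API (`state is State.OPEN`). A minimal construction sketch using the new keyword; the environment variable name and language choices are illustrative:

```python
import os

from pipecat.services.gladia.config import GladiaInputParams, LanguageConfig
from pipecat.services.gladia.stt import GladiaSTTService

stt = GladiaSTTService(
    api_key=os.environ["GLADIA_API_KEY"],  # illustrative env var name
    region="us-west",                      # new in this release; defaults to eu-west when omitted
    params=GladiaInputParams(
        language_config=LanguageConfig(languages=["en"], code_switching=False),
    ),
)
```

When `region` is set, `_setup_gladia()` forwards it as a query parameter on the session-init request, as shown in the hunk above.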
pipecat/services/google/frames.py CHANGED

@@ -4,6 +4,13 @@
 # SPDX-License-Identifier: BSD 2-Clause License
 #

+"""Google AI service frames for search and grounding functionality.
+
+This module defines specialized frame types for handling search results
+and grounding metadata from Google AI models, particularly for Gemini
+models that support web search and fact grounding capabilities.
+"""
+
 from dataclasses import dataclass, field
 from typing import List, Optional

@@ -12,12 +19,27 @@ from pipecat.frames.frames import DataFrame

 @dataclass
 class LLMSearchResult:
+    """Represents a single search result with confidence scores.
+
+    Parameters:
+        text: The search result text content.
+        confidence: List of confidence scores associated with the result.
+    """
+
     text: str
     confidence: List[float] = field(default_factory=list)


 @dataclass
 class LLMSearchOrigin:
+    """Represents the origin source of search results.
+
+    Parameters:
+        site_uri: URI of the source website.
+        site_title: Title of the source website.
+        results: List of search results from this origin.
+    """
+
     site_uri: Optional[str] = None
     site_title: Optional[str] = None
     results: List[LLMSearchResult] = field(default_factory=list)
@@ -25,9 +47,27 @@ class LLMSearchOrigin:

 @dataclass
 class LLMSearchResponseFrame(DataFrame):
+    """Frame containing search results and grounding information from Google AI models.
+
+    This frame is used to convey search results and grounding metadata
+    from Google AI models that support web search capabilities. It includes
+    the search result text, rendered content, and detailed origin information
+    with confidence scores.
+
+    Parameters:
+        search_result: The main search result text.
+        rendered_content: Rendered content from the search entry point.
+        origins: List of search result origins with detailed information.
+    """
+
     search_result: Optional[str] = None
     rendered_content: Optional[str] = None
     origins: List[LLMSearchOrigin] = field(default_factory=list)

     def __str__(self):
+        """Return string representation of the search response frame.
+
+        Returns:
+            String representation showing search result and origins.
+        """
         return f"LLMSearchResponseFrame(search_result={self.search_result}, origins={self.origins})"
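The new docstrings spell out the shape of the grounding data. A small consumer sketch that uses only the dataclass fields shown above, assuming the frame can be constructed directly like the other dataclass frames in this diff:

```python
from pipecat.services.google.frames import LLMSearchOrigin, LLMSearchResponseFrame, LLMSearchResult


def format_citations(frame: LLMSearchResponseFrame) -> str:
    """Flatten grounding origins into human-readable citation lines."""
    lines = []
    for origin in frame.origins:
        source = origin.site_title or origin.site_uri or "unknown source"
        for result in origin.results:
            lines.append(f"- {source}: {result.text} (confidence: {result.confidence})")
    return "\n".join(lines)


frame = LLMSearchResponseFrame(
    search_result="Example answer text",
    origins=[
        LLMSearchOrigin(
            site_title="Example site",
            site_uri="https://example.com",
            results=[LLMSearchResult(text="Supporting snippet", confidence=[0.9])],
        )
    ],
)
print(format_citations(frame))
```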
pipecat/services/google/image.py CHANGED

@@ -4,6 +4,12 @@
 # SPDX-License-Identifier: BSD 2-Clause License
 #

+"""Google AI image generation service implementation.
+
+This module provides integration with Google's Imagen model for generating
+images from text prompts using the Google AI API.
+"""
+
 import io
 import os

@@ -29,7 +35,22 @@ except ModuleNotFoundError as e:


 class GoogleImageGenService(ImageGenService):
+    """Google AI image generation service using Imagen models.
+
+    Provides text-to-image generation capabilities using Google's Imagen models
+    through the Google AI API. Supports multiple image generation and negative
+    prompting for enhanced control over generated content.
+    """
+
     class InputParams(BaseModel):
+        """Configuration parameters for Google image generation.
+
+        Parameters:
+            number_of_images: Number of images to generate (1-8). Defaults to 1.
+            model: Google Imagen model to use. Defaults to "imagen-3.0-generate-002".
+            negative_prompt: Optional negative prompt to guide what not to include.
+        """
+
         number_of_images: int = Field(default=1, ge=1, le=8)
         model: str = Field(default="imagen-3.0-generate-002")
         negative_prompt: Optional[str] = Field(default=None)
@@ -41,22 +62,38 @@ class GoogleImageGenService(ImageGenService):
         params: Optional[InputParams] = None,
         **kwargs,
     ):
+        """Initialize the GoogleImageGenService with API key and parameters.
+
+        Args:
+            api_key: Google AI API key for authentication.
+            params: Configuration parameters for image generation. Defaults to InputParams().
+            **kwargs: Additional arguments passed to the parent ImageGenService.
+        """
         super().__init__(**kwargs)
         self._params = params or GoogleImageGenService.InputParams()
         self._client = genai.Client(api_key=api_key)
         self.set_model_name(self._params.model)

     def can_generate_metrics(self) -> bool:
+        """Check if this service can generate processing metrics.
+
+        Returns:
+            True, as Google image generation service supports metrics.
+        """
         return True

     async def run_image_gen(self, prompt: str) -> AsyncGenerator[Frame, None]:
         """Generate images from a text prompt using Google's Imagen model.

         Args:
-            prompt
+            prompt: The text description to generate images from.

         Yields:
-            Frame: Generated
+            Frame: Generated URLImageRawFrame objects containing the generated
+                images, or ErrorFrame objects if generation fails.
+
+        Raises:
+            Exception: If there are issues with the Google AI API or image processing.
         """
         logger.debug(f"Generating image from prompt: {prompt}")
         await self.start_ttfb_metrics()