dv-pipecat-ai 0.0.82.dev857__py3-none-any.whl → 0.0.85.dev837__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of dv-pipecat-ai might be problematic.
- {dv_pipecat_ai-0.0.82.dev857.dist-info → dv_pipecat_ai-0.0.85.dev837.dist-info}/METADATA +98 -130
- {dv_pipecat_ai-0.0.82.dev857.dist-info → dv_pipecat_ai-0.0.85.dev837.dist-info}/RECORD +192 -140
- pipecat/adapters/base_llm_adapter.py +38 -1
- pipecat/adapters/services/anthropic_adapter.py +9 -14
- pipecat/adapters/services/aws_nova_sonic_adapter.py +120 -5
- pipecat/adapters/services/bedrock_adapter.py +236 -13
- pipecat/adapters/services/gemini_adapter.py +12 -8
- pipecat/adapters/services/open_ai_adapter.py +19 -7
- pipecat/adapters/services/open_ai_realtime_adapter.py +5 -0
- pipecat/audio/dtmf/dtmf-0.wav +0 -0
- pipecat/audio/dtmf/dtmf-1.wav +0 -0
- pipecat/audio/dtmf/dtmf-2.wav +0 -0
- pipecat/audio/dtmf/dtmf-3.wav +0 -0
- pipecat/audio/dtmf/dtmf-4.wav +0 -0
- pipecat/audio/dtmf/dtmf-5.wav +0 -0
- pipecat/audio/dtmf/dtmf-6.wav +0 -0
- pipecat/audio/dtmf/dtmf-7.wav +0 -0
- pipecat/audio/dtmf/dtmf-8.wav +0 -0
- pipecat/audio/dtmf/dtmf-9.wav +0 -0
- pipecat/audio/dtmf/dtmf-pound.wav +0 -0
- pipecat/audio/dtmf/dtmf-star.wav +0 -0
- pipecat/audio/filters/krisp_viva_filter.py +193 -0
- pipecat/audio/filters/noisereduce_filter.py +15 -0
- pipecat/audio/turn/base_turn_analyzer.py +9 -1
- pipecat/audio/turn/smart_turn/base_smart_turn.py +14 -8
- pipecat/audio/turn/smart_turn/data/__init__.py +0 -0
- pipecat/audio/turn/smart_turn/data/smart-turn-v3.0.onnx +0 -0
- pipecat/audio/turn/smart_turn/http_smart_turn.py +6 -2
- pipecat/audio/turn/smart_turn/local_smart_turn.py +1 -1
- pipecat/audio/turn/smart_turn/local_smart_turn_v2.py +1 -1
- pipecat/audio/turn/smart_turn/local_smart_turn_v3.py +124 -0
- pipecat/audio/vad/data/README.md +10 -0
- pipecat/audio/vad/data/silero_vad_v2.onnx +0 -0
- pipecat/audio/vad/silero.py +9 -3
- pipecat/audio/vad/vad_analyzer.py +13 -1
- pipecat/extensions/voicemail/voicemail_detector.py +5 -5
- pipecat/frames/frames.py +277 -86
- pipecat/observers/loggers/debug_log_observer.py +3 -3
- pipecat/observers/loggers/llm_log_observer.py +7 -3
- pipecat/observers/loggers/user_bot_latency_log_observer.py +22 -10
- pipecat/pipeline/runner.py +18 -6
- pipecat/pipeline/service_switcher.py +64 -36
- pipecat/pipeline/task.py +125 -79
- pipecat/pipeline/tts_switcher.py +30 -0
- pipecat/processors/aggregators/dtmf_aggregator.py +2 -3
- pipecat/processors/aggregators/{gated_openai_llm_context.py → gated_llm_context.py} +9 -9
- pipecat/processors/aggregators/gated_open_ai_llm_context.py +12 -0
- pipecat/processors/aggregators/llm_context.py +40 -2
- pipecat/processors/aggregators/llm_response.py +32 -15
- pipecat/processors/aggregators/llm_response_universal.py +19 -15
- pipecat/processors/aggregators/user_response.py +6 -6
- pipecat/processors/aggregators/vision_image_frame.py +24 -2
- pipecat/processors/audio/audio_buffer_processor.py +43 -8
- pipecat/processors/dtmf_aggregator.py +174 -77
- pipecat/processors/filters/stt_mute_filter.py +17 -0
- pipecat/processors/frame_processor.py +110 -24
- pipecat/processors/frameworks/langchain.py +8 -2
- pipecat/processors/frameworks/rtvi.py +210 -68
- pipecat/processors/frameworks/strands_agents.py +170 -0
- pipecat/processors/logger.py +2 -2
- pipecat/processors/transcript_processor.py +26 -5
- pipecat/processors/user_idle_processor.py +35 -11
- pipecat/runner/daily.py +59 -20
- pipecat/runner/run.py +395 -93
- pipecat/runner/types.py +6 -4
- pipecat/runner/utils.py +51 -10
- pipecat/serializers/__init__.py +5 -1
- pipecat/serializers/asterisk.py +16 -2
- pipecat/serializers/convox.py +41 -4
- pipecat/serializers/custom.py +257 -0
- pipecat/serializers/exotel.py +5 -5
- pipecat/serializers/livekit.py +20 -0
- pipecat/serializers/plivo.py +5 -5
- pipecat/serializers/protobuf.py +6 -5
- pipecat/serializers/telnyx.py +2 -2
- pipecat/serializers/twilio.py +43 -23
- pipecat/serializers/vi.py +324 -0
- pipecat/services/ai_service.py +2 -6
- pipecat/services/anthropic/llm.py +2 -25
- pipecat/services/assemblyai/models.py +6 -0
- pipecat/services/assemblyai/stt.py +13 -5
- pipecat/services/asyncai/tts.py +5 -3
- pipecat/services/aws/__init__.py +1 -0
- pipecat/services/aws/llm.py +147 -105
- pipecat/services/aws/nova_sonic/__init__.py +0 -0
- pipecat/services/aws/nova_sonic/context.py +436 -0
- pipecat/services/aws/nova_sonic/frames.py +25 -0
- pipecat/services/aws/nova_sonic/llm.py +1265 -0
- pipecat/services/aws/stt.py +3 -3
- pipecat/services/aws_nova_sonic/__init__.py +19 -1
- pipecat/services/aws_nova_sonic/aws.py +11 -1151
- pipecat/services/aws_nova_sonic/context.py +8 -354
- pipecat/services/aws_nova_sonic/frames.py +13 -17
- pipecat/services/azure/llm.py +51 -1
- pipecat/services/azure/realtime/__init__.py +0 -0
- pipecat/services/azure/realtime/llm.py +65 -0
- pipecat/services/azure/stt.py +15 -0
- pipecat/services/cartesia/stt.py +77 -70
- pipecat/services/cartesia/tts.py +80 -13
- pipecat/services/deepgram/__init__.py +1 -0
- pipecat/services/deepgram/flux/__init__.py +0 -0
- pipecat/services/deepgram/flux/stt.py +640 -0
- pipecat/services/elevenlabs/__init__.py +4 -1
- pipecat/services/elevenlabs/stt.py +339 -0
- pipecat/services/elevenlabs/tts.py +87 -46
- pipecat/services/fish/tts.py +5 -2
- pipecat/services/gemini_multimodal_live/events.py +38 -524
- pipecat/services/gemini_multimodal_live/file_api.py +23 -173
- pipecat/services/gemini_multimodal_live/gemini.py +41 -1403
- pipecat/services/gladia/stt.py +56 -72
- pipecat/services/google/__init__.py +1 -0
- pipecat/services/google/gemini_live/__init__.py +3 -0
- pipecat/services/google/gemini_live/file_api.py +189 -0
- pipecat/services/google/gemini_live/llm.py +1582 -0
- pipecat/services/google/gemini_live/llm_vertex.py +184 -0
- pipecat/services/google/llm.py +15 -11
- pipecat/services/google/llm_openai.py +3 -3
- pipecat/services/google/llm_vertex.py +86 -16
- pipecat/services/google/stt.py +4 -0
- pipecat/services/google/tts.py +7 -3
- pipecat/services/heygen/api.py +2 -0
- pipecat/services/heygen/client.py +8 -4
- pipecat/services/heygen/video.py +2 -0
- pipecat/services/hume/__init__.py +5 -0
- pipecat/services/hume/tts.py +220 -0
- pipecat/services/inworld/tts.py +6 -6
- pipecat/services/llm_service.py +15 -5
- pipecat/services/lmnt/tts.py +4 -2
- pipecat/services/mcp_service.py +4 -2
- pipecat/services/mem0/memory.py +6 -5
- pipecat/services/mistral/llm.py +29 -8
- pipecat/services/moondream/vision.py +42 -16
- pipecat/services/neuphonic/tts.py +5 -2
- pipecat/services/openai/__init__.py +1 -0
- pipecat/services/openai/base_llm.py +27 -20
- pipecat/services/openai/realtime/__init__.py +0 -0
- pipecat/services/openai/realtime/context.py +272 -0
- pipecat/services/openai/realtime/events.py +1106 -0
- pipecat/services/openai/realtime/frames.py +37 -0
- pipecat/services/openai/realtime/llm.py +829 -0
- pipecat/services/openai/tts.py +49 -10
- pipecat/services/openai_realtime/__init__.py +27 -0
- pipecat/services/openai_realtime/azure.py +21 -0
- pipecat/services/openai_realtime/context.py +21 -0
- pipecat/services/openai_realtime/events.py +21 -0
- pipecat/services/openai_realtime/frames.py +21 -0
- pipecat/services/openai_realtime_beta/azure.py +16 -0
- pipecat/services/openai_realtime_beta/openai.py +17 -5
- pipecat/services/piper/tts.py +7 -9
- pipecat/services/playht/tts.py +34 -4
- pipecat/services/rime/tts.py +12 -12
- pipecat/services/riva/stt.py +3 -1
- pipecat/services/salesforce/__init__.py +9 -0
- pipecat/services/salesforce/llm.py +700 -0
- pipecat/services/sarvam/__init__.py +7 -0
- pipecat/services/sarvam/stt.py +540 -0
- pipecat/services/sarvam/tts.py +97 -13
- pipecat/services/simli/video.py +2 -2
- pipecat/services/speechmatics/stt.py +22 -10
- pipecat/services/stt_service.py +47 -0
- pipecat/services/tavus/video.py +2 -2
- pipecat/services/tts_service.py +75 -22
- pipecat/services/vision_service.py +7 -6
- pipecat/services/vistaar/llm.py +51 -9
- pipecat/tests/utils.py +4 -4
- pipecat/transcriptions/language.py +41 -1
- pipecat/transports/base_input.py +13 -34
- pipecat/transports/base_output.py +140 -104
- pipecat/transports/daily/transport.py +199 -26
- pipecat/transports/heygen/__init__.py +0 -0
- pipecat/transports/heygen/transport.py +381 -0
- pipecat/transports/livekit/transport.py +228 -63
- pipecat/transports/local/audio.py +6 -1
- pipecat/transports/local/tk.py +11 -2
- pipecat/transports/network/fastapi_websocket.py +1 -1
- pipecat/transports/smallwebrtc/connection.py +103 -19
- pipecat/transports/smallwebrtc/request_handler.py +246 -0
- pipecat/transports/smallwebrtc/transport.py +65 -23
- pipecat/transports/tavus/transport.py +23 -12
- pipecat/transports/websocket/client.py +41 -5
- pipecat/transports/websocket/fastapi.py +21 -11
- pipecat/transports/websocket/server.py +14 -7
- pipecat/transports/whatsapp/api.py +8 -0
- pipecat/transports/whatsapp/client.py +47 -0
- pipecat/utils/base_object.py +54 -22
- pipecat/utils/redis.py +58 -0
- pipecat/utils/string.py +13 -1
- pipecat/utils/tracing/service_decorators.py +21 -21
- pipecat/serializers/genesys.py +0 -95
- pipecat/services/google/test-google-chirp.py +0 -45
- pipecat/services/openai.py +0 -698
- {dv_pipecat_ai-0.0.82.dev857.dist-info → dv_pipecat_ai-0.0.85.dev837.dist-info}/WHEEL +0 -0
- {dv_pipecat_ai-0.0.82.dev857.dist-info → dv_pipecat_ai-0.0.85.dev837.dist-info}/licenses/LICENSE +0 -0
- {dv_pipecat_ai-0.0.82.dev857.dist-info → dv_pipecat_ai-0.0.85.dev837.dist-info}/top_level.txt +0 -0
- /pipecat/services/{aws_nova_sonic → aws/nova_sonic}/ready.wav +0 -0
pipecat/services/fish/tts.py
CHANGED
@@ -21,8 +21,8 @@ from pipecat.frames.frames import (
     EndFrame,
     ErrorFrame,
     Frame,
+    InterruptionFrame,
     StartFrame,
-    StartInterruptionFrame,
     TTSAudioRawFrame,
     TTSStartedFrame,
     TTSStoppedFrame,
@@ -225,6 +225,8 @@ class FishAudioTTSService(InterruptibleTTSService):
             start_message = {"event": "start", "request": {"text": "", **self._settings}}
             await self._websocket.send(ormsgpack.packb(start_message))
             logger.debug("Sent start message to Fish Audio")
+
+            await self._call_event_handler("on_connected")
         except Exception as e:
             logger.error(f"Fish Audio initialization error: {e}")
             self._websocket = None
@@ -245,6 +247,7 @@ class FishAudioTTSService(InterruptibleTTSService):
         self._request_id = None
         self._started = False
         self._websocket = None
+        await self._call_event_handler("on_disconnected")

    async def flush_audio(self):
        """Flush any buffered audio by sending a flush event to Fish Audio."""
@@ -259,7 +262,7 @@ class FishAudioTTSService(InterruptibleTTSService):
            return self._websocket
        raise Exception("Websocket not connected")

-    async def _handle_interruption(self, frame: StartInterruptionFrame, direction: FrameDirection):
+    async def _handle_interruption(self, frame: InterruptionFrame, direction: FrameDirection):
        await super()._handle_interruption(frame, direction)
        await self.stop_all_metrics()
        self._request_id = None
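The hunks above add on_connected and on_disconnected events to FishAudioTTSService, fired after the start message is sent over the websocket and after the websocket state is torn down. Below is a minimal sketch of subscribing to them using Pipecat's standard event-handler decorator; the constructor arguments shown are illustrative only and may not match the service's actual signature:

import os

from pipecat.services.fish.tts import FishAudioTTSService

# Illustrative construction; check FishAudioTTSService for its actual
# required arguments (API key, voice/model reference, etc.).
tts = FishAudioTTSService(api_key=os.environ["FISH_API_KEY"])

@tts.event_handler("on_connected")
async def on_connected(service):
    # Fired right after the start message is sent to Fish Audio.
    print("Fish Audio websocket connected")

@tts.event_handler("on_disconnected")
async def on_disconnected(service):
    # Fired once the websocket state has been cleared.
    print("Fish Audio websocket disconnected")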
pipecat/services/gemini_multimodal_live/events.py
CHANGED
@@ -4,527 +4,41 @@
 # SPDX-License-Identifier: BSD 2-Clause License
 #

-"""Event models and utilities for Google Gemini Multimodal Live API.
-[... old lines 8-44 (the rest of the module docstring and the import block) are truncated in this diff view ...]
-    text: Optional[str] = Field(default=None, validate_default=False)
-    inlineData: Optional[MediaChunk] = Field(default=None, validate_default=False)
-    fileData: Optional["FileData"] = Field(default=None, validate_default=False)
-
-
-class FileData(BaseModel):
-    """Represents a file reference in the Gemini File API."""
-
-    mimeType: str
-    fileUri: str
-
-
-ContentPart.model_rebuild()  # Rebuild model to resolve forward reference
-
-
-class Turn(BaseModel):
-    """Represents a conversational turn in the dialogue.
-
-    Parameters:
-        role: The role of the speaker, either "user" or "model". Defaults to "user".
-        parts: List of content parts that make up the turn.
-    """
-
-    role: Literal["user", "model"] = "user"
-    parts: List[ContentPart]
-
-
-class StartSensitivity(str, Enum):
-    """Determines how start of speech is detected."""
-
-    UNSPECIFIED = "START_SENSITIVITY_UNSPECIFIED"  # Default is HIGH
-    HIGH = "START_SENSITIVITY_HIGH"  # Detect start of speech more often
-    LOW = "START_SENSITIVITY_LOW"  # Detect start of speech less often
-
-
-class EndSensitivity(str, Enum):
-    """Determines how end of speech is detected."""
-
-    UNSPECIFIED = "END_SENSITIVITY_UNSPECIFIED"  # Default is HIGH
-    HIGH = "END_SENSITIVITY_HIGH"  # End speech more often
-    LOW = "END_SENSITIVITY_LOW"  # End speech less often
-
-
-class AutomaticActivityDetection(BaseModel):
-    """Configures automatic detection of voice activity.
-
-    Parameters:
-        disabled: Whether automatic activity detection is disabled. Defaults to None.
-        start_of_speech_sensitivity: Sensitivity for detecting speech start. Defaults to None.
-        prefix_padding_ms: Padding before speech start in milliseconds. Defaults to None.
-        end_of_speech_sensitivity: Sensitivity for detecting speech end. Defaults to None.
-        silence_duration_ms: Duration of silence to detect speech end. Defaults to None.
-    """
-
-    disabled: Optional[bool] = None
-    start_of_speech_sensitivity: Optional[StartSensitivity] = None
-    prefix_padding_ms: Optional[int] = None
-    end_of_speech_sensitivity: Optional[EndSensitivity] = None
-    silence_duration_ms: Optional[int] = None
-
-
-class RealtimeInputConfig(BaseModel):
-    """Configures the realtime input behavior.
-
-    Parameters:
-        automatic_activity_detection: Voice activity detection configuration. Defaults to None.
-    """
-
-    automatic_activity_detection: Optional[AutomaticActivityDetection] = None
-
-
-class RealtimeInput(BaseModel):
-    """Contains realtime input media chunks and text.
-
-    Parameters:
-        mediaChunks: List of media chunks for realtime processing.
-        text: Text for realtime processing.
-    """
-
-    mediaChunks: Optional[List[MediaChunk]] = None
-    text: Optional[str] = None
-
-
-class ClientContent(BaseModel):
-    """Content sent from client to the Gemini Live API.
-
-    Parameters:
-        turns: List of conversation turns. Defaults to None.
-        turnComplete: Whether the client's turn is complete. Defaults to False.
-    """
-
-    turns: Optional[List[Turn]] = None
-    turnComplete: bool = False
-
-
-class AudioInputMessage(BaseModel):
-    """Message containing audio input data.
-
-    Parameters:
-        realtimeInput: Realtime input containing audio chunks.
-    """
-
-    realtimeInput: RealtimeInput
-
-    @classmethod
-    def from_raw_audio(cls, raw_audio: bytes, sample_rate: int) -> "AudioInputMessage":
-        """Create an audio input message from raw audio data.
-
-        Args:
-            raw_audio: Raw audio bytes.
-            sample_rate: Audio sample rate in Hz.
-
-        Returns:
-            AudioInputMessage instance with encoded audio data.
-        """
-        data = base64.b64encode(raw_audio).decode("utf-8")
-        return cls(
-            realtimeInput=RealtimeInput(
-                mediaChunks=[MediaChunk(mimeType=f"audio/pcm;rate={sample_rate}", data=data)]
-            )
-        )
-
-
-class VideoInputMessage(BaseModel):
-    """Message containing video/image input data.
-
-    Parameters:
-        realtimeInput: Realtime input containing video/image chunks.
-    """
-
-    realtimeInput: RealtimeInput
-
-    @classmethod
-    def from_image_frame(cls, frame: ImageRawFrame) -> "VideoInputMessage":
-        """Create a video input message from an image frame.
-
-        Args:
-            frame: Image frame to encode.
-
-        Returns:
-            VideoInputMessage instance with encoded image data.
-        """
-        buffer = io.BytesIO()
-        Image.frombytes(frame.format, frame.size, frame.image).save(buffer, format="JPEG")
-        data = base64.b64encode(buffer.getvalue()).decode("utf-8")
-        return cls(
-            realtimeInput=RealtimeInput(mediaChunks=[MediaChunk(mimeType=f"image/jpeg", data=data)])
-        )
-
-
-class TextInputMessage(BaseModel):
-    """Message containing text input data."""
-
-    realtimeInput: RealtimeInput
-
-    @classmethod
-    def from_text(cls, text: str) -> "TextInputMessage":
-        """Create a text input message from a string.
-
-        Args:
-            text: The text to send.
-
-        Returns:
-            A TextInputMessage instance.
-        """
-        return cls(realtimeInput=RealtimeInput(text=text))
-
-
-class ClientContentMessage(BaseModel):
-    """Message containing client content for the API.
-
-    Parameters:
-        clientContent: The client content to send.
-    """
-
-    clientContent: ClientContent
-
-
-class SystemInstruction(BaseModel):
-    """System instruction for the model.
-
-    Parameters:
-        parts: List of content parts that make up the system instruction.
-    """
-
-    parts: List[ContentPart]
-
-
-class AudioTranscriptionConfig(BaseModel):
-    """Configuration for audio transcription."""
-
-    pass
-
-
-class Setup(BaseModel):
-    """Setup configuration for the Gemini Live session.
-
-    Parameters:
-        model: Model identifier to use.
-        system_instruction: System instruction for the model. Defaults to None.
-        tools: List of available tools/functions. Defaults to None.
-        generation_config: Generation configuration parameters. Defaults to None.
-        input_audio_transcription: Input audio transcription config. Defaults to None.
-        output_audio_transcription: Output audio transcription config. Defaults to None.
-        realtime_input_config: Realtime input configuration. Defaults to None.
-    """
-
-    model: str
-    system_instruction: Optional[SystemInstruction] = None
-    tools: Optional[List[dict]] = None
-    generation_config: Optional[dict] = None
-    input_audio_transcription: Optional[AudioTranscriptionConfig] = None
-    output_audio_transcription: Optional[AudioTranscriptionConfig] = None
-    realtime_input_config: Optional[RealtimeInputConfig] = None
-
-
-class Config(BaseModel):
-    """Configuration message for session setup.
-
-    Parameters:
-        setup: Setup configuration for the session.
-    """
-
-    setup: Setup
-
-
-#
-# Grounding metadata models
-#
-
-
-class SearchEntryPoint(BaseModel):
-    """Represents the search entry point with rendered content for search suggestions."""
-
-    renderedContent: Optional[str] = None
-
-
-class WebSource(BaseModel):
-    """Represents a web source from grounding chunks."""
-
-    uri: Optional[str] = None
-    title: Optional[str] = None
-
-
-class GroundingChunk(BaseModel):
-    """Represents a grounding chunk containing web source information."""
-
-    web: Optional[WebSource] = None
-
-
-class GroundingSegment(BaseModel):
-    """Represents a segment of text that is grounded."""
-
-    startIndex: Optional[int] = None
-    endIndex: Optional[int] = None
-    text: Optional[str] = None
-
-
-class GroundingSupport(BaseModel):
-    """Represents support information for grounded text segments."""
-
-    segment: Optional[GroundingSegment] = None
-    groundingChunkIndices: Optional[List[int]] = None
-    confidenceScores: Optional[List[float]] = None
-
-
-class GroundingMetadata(BaseModel):
-    """Represents grounding metadata from Google Search."""
-
-    searchEntryPoint: Optional[SearchEntryPoint] = None
-    groundingChunks: Optional[List[GroundingChunk]] = None
-    groundingSupports: Optional[List[GroundingSupport]] = None
-    webSearchQueries: Optional[List[str]] = None
-
-
-#
-# Server events
-#
-
-
-class SetupComplete(BaseModel):
-    """Indicates that session setup is complete."""
-
-    pass
-
-
-class InlineData(BaseModel):
-    """Inline data embedded in server responses.
-
-    Parameters:
-        mimeType: MIME type of the data.
-        data: Base64-encoded data content.
-    """
-
-    mimeType: str
-    data: str
-
-
-class Part(BaseModel):
-    """Part of a server response containing data or text.
-
-    Parameters:
-        inlineData: Inline binary data. Defaults to None.
-        text: Text content. Defaults to None.
-    """
-
-    inlineData: Optional[InlineData] = None
-    text: Optional[str] = None
-
-
-class ModelTurn(BaseModel):
-    """Represents a turn from the model in the conversation.
-
-    Parameters:
-        parts: List of content parts in the model's response.
-    """
-
-    parts: List[Part]
-
-
-class ServerContentInterrupted(BaseModel):
-    """Indicates server content was interrupted.
-
-    Parameters:
-        interrupted: Whether the content was interrupted.
-    """
-
-    interrupted: bool
-
-
-class ServerContentTurnComplete(BaseModel):
-    """Indicates the server's turn is complete.
-
-    Parameters:
-        turnComplete: Whether the turn is complete.
-    """
-
-    turnComplete: bool
-
-
-class BidiGenerateContentTranscription(BaseModel):
-    """Transcription data from bidirectional content generation.
-
-    Parameters:
-        text: The transcribed text content.
-    """
-
-    text: str
-
-
-class ServerContent(BaseModel):
-    """Content sent from server to client.
-
-    Parameters:
-        modelTurn: Model's conversational turn. Defaults to None.
-        interrupted: Whether content was interrupted. Defaults to None.
-        turnComplete: Whether the turn is complete. Defaults to None.
-        inputTranscription: Transcription of input audio. Defaults to None.
-        outputTranscription: Transcription of output audio. Defaults to None.
-    """
-
-    modelTurn: Optional[ModelTurn] = None
-    interrupted: Optional[bool] = None
-    turnComplete: Optional[bool] = None
-    inputTranscription: Optional[BidiGenerateContentTranscription] = None
-    outputTranscription: Optional[BidiGenerateContentTranscription] = None
-    groundingMetadata: Optional[GroundingMetadata] = None
-
-
-class FunctionCall(BaseModel):
-    """Represents a function call from the model.
-
-    Parameters:
-        id: Unique identifier for the function call.
-        name: Name of the function to call.
-        args: Arguments to pass to the function.
-    """
-
-    id: str
-    name: str
-    args: dict
-
-
-class ToolCall(BaseModel):
-    """Contains one or more function calls.
-
-    Parameters:
-        functionCalls: List of function calls to execute.
-    """
-
-    functionCalls: List[FunctionCall]
-
-
-class Modality(str, Enum):
-    """Modality types in token counts."""
-
-    UNSPECIFIED = "MODALITY_UNSPECIFIED"
-    TEXT = "TEXT"
-    IMAGE = "IMAGE"
-    AUDIO = "AUDIO"
-    VIDEO = "VIDEO"
-
-
-class ModalityTokenCount(BaseModel):
-    """Token count for a specific modality.
-
-    Parameters:
-        modality: The modality type.
-        tokenCount: Number of tokens for this modality.
-    """
-
-    modality: Modality
-    tokenCount: int
-
-
-class UsageMetadata(BaseModel):
-    """Usage metadata about the API response.
-
-    Parameters:
-        promptTokenCount: Number of tokens in the prompt. Defaults to None.
-        cachedContentTokenCount: Number of cached content tokens. Defaults to None.
-        responseTokenCount: Number of tokens in the response. Defaults to None.
-        toolUsePromptTokenCount: Number of tokens for tool use prompts. Defaults to None.
-        thoughtsTokenCount: Number of tokens for model thoughts. Defaults to None.
-        totalTokenCount: Total number of tokens used. Defaults to None.
-        promptTokensDetails: Detailed breakdown of prompt tokens by modality. Defaults to None.
-        cacheTokensDetails: Detailed breakdown of cache tokens by modality. Defaults to None.
-        responseTokensDetails: Detailed breakdown of response tokens by modality. Defaults to None.
-        toolUsePromptTokensDetails: Detailed breakdown of tool use tokens by modality. Defaults to None.
-    """
-
-    promptTokenCount: Optional[int] = None
-    cachedContentTokenCount: Optional[int] = None
-    responseTokenCount: Optional[int] = None
-    toolUsePromptTokenCount: Optional[int] = None
-    thoughtsTokenCount: Optional[int] = None
-    totalTokenCount: Optional[int] = None
-    promptTokensDetails: Optional[List[ModalityTokenCount]] = None
-    cacheTokensDetails: Optional[List[ModalityTokenCount]] = None
-    responseTokensDetails: Optional[List[ModalityTokenCount]] = None
-    toolUsePromptTokensDetails: Optional[List[ModalityTokenCount]] = None
-
-
-class ServerEvent(BaseModel):
-    """Server event received from the Gemini Live API.
-
-    Parameters:
-        setupComplete: Setup completion notification. Defaults to None.
-        serverContent: Content from the server. Defaults to None.
-        toolCall: Tool/function call request. Defaults to None.
-        usageMetadata: Token usage metadata. Defaults to None.
-    """
-
-    setupComplete: Optional[SetupComplete] = None
-    serverContent: Optional[ServerContent] = None
-    toolCall: Optional[ToolCall] = None
-    usageMetadata: Optional[UsageMetadata] = None
-
-
-def parse_server_event(str):
-    """Parse a server event from JSON string.
-
-    Args:
-        str: JSON string containing the server event.
-
-    Returns:
-        ServerEvent instance if parsing succeeds, None otherwise.
-    """
-    try:
-        evt = json.loads(str)
-        return ServerEvent.model_validate(evt)
-    except Exception as e:
-        print(f"Error parsing server event: {e}")
-        return None
-
-
-class ContextWindowCompressionConfig(BaseModel):
-    """Configuration for context window compression.
-
-    Parameters:
-        sliding_window: Whether to use sliding window compression. Defaults to True.
-        trigger_tokens: Token count threshold to trigger compression. Defaults to None.
-    """
-
-    sliding_window: Optional[bool] = Field(default=True)
-    trigger_tokens: Optional[int] = Field(default=None)
+"""Event models and utilities for Google Gemini Multimodal Live API.
+
+.. deprecated:: 0.0.90
+    Importing StartSensitivity and EndSensitivity from this module is deprecated.
+    Import them directly from google.genai.types instead.
+"""
+
+import warnings
+
+from loguru import logger
+
+try:
+    from google.genai.types import (
+        EndSensitivity as _EndSensitivity,
+    )
+    from google.genai.types import (
+        StartSensitivity as _StartSensitivity,
+    )
+except ModuleNotFoundError as e:
+    logger.error(f"Exception: {e}")
+    logger.error("In order to use Google AI, you need to `pip install pipecat-ai[google]`.")
+    raise Exception(f"Missing module: {e}")
+
+# These aliases are just here for backward compatibility, since we used to
+# define public-facing StartSensitivity and EndSensitivity enums in this
+# module.
+with warnings.catch_warnings():
+    warnings.simplefilter("always")
+    warnings.warn(
+        "Importing StartSensitivity and EndSensitivity from "
+        "pipecat.services.gemini_multimodal_live.events is deprecated. "
+        "Please import them directly from google.genai.types instead.",
+        DeprecationWarning,
+        stacklevel=2,
+    )
+
+StartSensitivity = _StartSensitivity
+EndSensitivity = _EndSensitivity
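With this change the old module keeps StartSensitivity and EndSensitivity importable as aliases of the google.genai.types enums, but importing the module now emits a DeprecationWarning. A minimal before/after migration sketch, grounded directly in the diff above:

# Deprecated (still works, but triggers a DeprecationWarning at import time):
from pipecat.services.gemini_multimodal_live.events import EndSensitivity, StartSensitivity

# Preferred going forward:
from google.genai.types import EndSensitivity, StartSensitivity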