dv-pipecat-ai 0.0.82.dev857__py3-none-any.whl → 0.0.85.dev837__py3-none-any.whl

This diff compares the contents of two publicly available versions of a package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.

Potentially problematic release.


This version of dv-pipecat-ai might be problematic. Click here for more details.

Files changed (195)
  1. {dv_pipecat_ai-0.0.82.dev857.dist-info → dv_pipecat_ai-0.0.85.dev837.dist-info}/METADATA +98 -130
  2. {dv_pipecat_ai-0.0.82.dev857.dist-info → dv_pipecat_ai-0.0.85.dev837.dist-info}/RECORD +192 -140
  3. pipecat/adapters/base_llm_adapter.py +38 -1
  4. pipecat/adapters/services/anthropic_adapter.py +9 -14
  5. pipecat/adapters/services/aws_nova_sonic_adapter.py +120 -5
  6. pipecat/adapters/services/bedrock_adapter.py +236 -13
  7. pipecat/adapters/services/gemini_adapter.py +12 -8
  8. pipecat/adapters/services/open_ai_adapter.py +19 -7
  9. pipecat/adapters/services/open_ai_realtime_adapter.py +5 -0
  10. pipecat/audio/dtmf/dtmf-0.wav +0 -0
  11. pipecat/audio/dtmf/dtmf-1.wav +0 -0
  12. pipecat/audio/dtmf/dtmf-2.wav +0 -0
  13. pipecat/audio/dtmf/dtmf-3.wav +0 -0
  14. pipecat/audio/dtmf/dtmf-4.wav +0 -0
  15. pipecat/audio/dtmf/dtmf-5.wav +0 -0
  16. pipecat/audio/dtmf/dtmf-6.wav +0 -0
  17. pipecat/audio/dtmf/dtmf-7.wav +0 -0
  18. pipecat/audio/dtmf/dtmf-8.wav +0 -0
  19. pipecat/audio/dtmf/dtmf-9.wav +0 -0
  20. pipecat/audio/dtmf/dtmf-pound.wav +0 -0
  21. pipecat/audio/dtmf/dtmf-star.wav +0 -0
  22. pipecat/audio/filters/krisp_viva_filter.py +193 -0
  23. pipecat/audio/filters/noisereduce_filter.py +15 -0
  24. pipecat/audio/turn/base_turn_analyzer.py +9 -1
  25. pipecat/audio/turn/smart_turn/base_smart_turn.py +14 -8
  26. pipecat/audio/turn/smart_turn/data/__init__.py +0 -0
  27. pipecat/audio/turn/smart_turn/data/smart-turn-v3.0.onnx +0 -0
  28. pipecat/audio/turn/smart_turn/http_smart_turn.py +6 -2
  29. pipecat/audio/turn/smart_turn/local_smart_turn.py +1 -1
  30. pipecat/audio/turn/smart_turn/local_smart_turn_v2.py +1 -1
  31. pipecat/audio/turn/smart_turn/local_smart_turn_v3.py +124 -0
  32. pipecat/audio/vad/data/README.md +10 -0
  33. pipecat/audio/vad/data/silero_vad_v2.onnx +0 -0
  34. pipecat/audio/vad/silero.py +9 -3
  35. pipecat/audio/vad/vad_analyzer.py +13 -1
  36. pipecat/extensions/voicemail/voicemail_detector.py +5 -5
  37. pipecat/frames/frames.py +277 -86
  38. pipecat/observers/loggers/debug_log_observer.py +3 -3
  39. pipecat/observers/loggers/llm_log_observer.py +7 -3
  40. pipecat/observers/loggers/user_bot_latency_log_observer.py +22 -10
  41. pipecat/pipeline/runner.py +18 -6
  42. pipecat/pipeline/service_switcher.py +64 -36
  43. pipecat/pipeline/task.py +125 -79
  44. pipecat/pipeline/tts_switcher.py +30 -0
  45. pipecat/processors/aggregators/dtmf_aggregator.py +2 -3
  46. pipecat/processors/aggregators/{gated_openai_llm_context.py → gated_llm_context.py} +9 -9
  47. pipecat/processors/aggregators/gated_open_ai_llm_context.py +12 -0
  48. pipecat/processors/aggregators/llm_context.py +40 -2
  49. pipecat/processors/aggregators/llm_response.py +32 -15
  50. pipecat/processors/aggregators/llm_response_universal.py +19 -15
  51. pipecat/processors/aggregators/user_response.py +6 -6
  52. pipecat/processors/aggregators/vision_image_frame.py +24 -2
  53. pipecat/processors/audio/audio_buffer_processor.py +43 -8
  54. pipecat/processors/dtmf_aggregator.py +174 -77
  55. pipecat/processors/filters/stt_mute_filter.py +17 -0
  56. pipecat/processors/frame_processor.py +110 -24
  57. pipecat/processors/frameworks/langchain.py +8 -2
  58. pipecat/processors/frameworks/rtvi.py +210 -68
  59. pipecat/processors/frameworks/strands_agents.py +170 -0
  60. pipecat/processors/logger.py +2 -2
  61. pipecat/processors/transcript_processor.py +26 -5
  62. pipecat/processors/user_idle_processor.py +35 -11
  63. pipecat/runner/daily.py +59 -20
  64. pipecat/runner/run.py +395 -93
  65. pipecat/runner/types.py +6 -4
  66. pipecat/runner/utils.py +51 -10
  67. pipecat/serializers/__init__.py +5 -1
  68. pipecat/serializers/asterisk.py +16 -2
  69. pipecat/serializers/convox.py +41 -4
  70. pipecat/serializers/custom.py +257 -0
  71. pipecat/serializers/exotel.py +5 -5
  72. pipecat/serializers/livekit.py +20 -0
  73. pipecat/serializers/plivo.py +5 -5
  74. pipecat/serializers/protobuf.py +6 -5
  75. pipecat/serializers/telnyx.py +2 -2
  76. pipecat/serializers/twilio.py +43 -23
  77. pipecat/serializers/vi.py +324 -0
  78. pipecat/services/ai_service.py +2 -6
  79. pipecat/services/anthropic/llm.py +2 -25
  80. pipecat/services/assemblyai/models.py +6 -0
  81. pipecat/services/assemblyai/stt.py +13 -5
  82. pipecat/services/asyncai/tts.py +5 -3
  83. pipecat/services/aws/__init__.py +1 -0
  84. pipecat/services/aws/llm.py +147 -105
  85. pipecat/services/aws/nova_sonic/__init__.py +0 -0
  86. pipecat/services/aws/nova_sonic/context.py +436 -0
  87. pipecat/services/aws/nova_sonic/frames.py +25 -0
  88. pipecat/services/aws/nova_sonic/llm.py +1265 -0
  89. pipecat/services/aws/stt.py +3 -3
  90. pipecat/services/aws_nova_sonic/__init__.py +19 -1
  91. pipecat/services/aws_nova_sonic/aws.py +11 -1151
  92. pipecat/services/aws_nova_sonic/context.py +8 -354
  93. pipecat/services/aws_nova_sonic/frames.py +13 -17
  94. pipecat/services/azure/llm.py +51 -1
  95. pipecat/services/azure/realtime/__init__.py +0 -0
  96. pipecat/services/azure/realtime/llm.py +65 -0
  97. pipecat/services/azure/stt.py +15 -0
  98. pipecat/services/cartesia/stt.py +77 -70
  99. pipecat/services/cartesia/tts.py +80 -13
  100. pipecat/services/deepgram/__init__.py +1 -0
  101. pipecat/services/deepgram/flux/__init__.py +0 -0
  102. pipecat/services/deepgram/flux/stt.py +640 -0
  103. pipecat/services/elevenlabs/__init__.py +4 -1
  104. pipecat/services/elevenlabs/stt.py +339 -0
  105. pipecat/services/elevenlabs/tts.py +87 -46
  106. pipecat/services/fish/tts.py +5 -2
  107. pipecat/services/gemini_multimodal_live/events.py +38 -524
  108. pipecat/services/gemini_multimodal_live/file_api.py +23 -173
  109. pipecat/services/gemini_multimodal_live/gemini.py +41 -1403
  110. pipecat/services/gladia/stt.py +56 -72
  111. pipecat/services/google/__init__.py +1 -0
  112. pipecat/services/google/gemini_live/__init__.py +3 -0
  113. pipecat/services/google/gemini_live/file_api.py +189 -0
  114. pipecat/services/google/gemini_live/llm.py +1582 -0
  115. pipecat/services/google/gemini_live/llm_vertex.py +184 -0
  116. pipecat/services/google/llm.py +15 -11
  117. pipecat/services/google/llm_openai.py +3 -3
  118. pipecat/services/google/llm_vertex.py +86 -16
  119. pipecat/services/google/stt.py +4 -0
  120. pipecat/services/google/tts.py +7 -3
  121. pipecat/services/heygen/api.py +2 -0
  122. pipecat/services/heygen/client.py +8 -4
  123. pipecat/services/heygen/video.py +2 -0
  124. pipecat/services/hume/__init__.py +5 -0
  125. pipecat/services/hume/tts.py +220 -0
  126. pipecat/services/inworld/tts.py +6 -6
  127. pipecat/services/llm_service.py +15 -5
  128. pipecat/services/lmnt/tts.py +4 -2
  129. pipecat/services/mcp_service.py +4 -2
  130. pipecat/services/mem0/memory.py +6 -5
  131. pipecat/services/mistral/llm.py +29 -8
  132. pipecat/services/moondream/vision.py +42 -16
  133. pipecat/services/neuphonic/tts.py +5 -2
  134. pipecat/services/openai/__init__.py +1 -0
  135. pipecat/services/openai/base_llm.py +27 -20
  136. pipecat/services/openai/realtime/__init__.py +0 -0
  137. pipecat/services/openai/realtime/context.py +272 -0
  138. pipecat/services/openai/realtime/events.py +1106 -0
  139. pipecat/services/openai/realtime/frames.py +37 -0
  140. pipecat/services/openai/realtime/llm.py +829 -0
  141. pipecat/services/openai/tts.py +49 -10
  142. pipecat/services/openai_realtime/__init__.py +27 -0
  143. pipecat/services/openai_realtime/azure.py +21 -0
  144. pipecat/services/openai_realtime/context.py +21 -0
  145. pipecat/services/openai_realtime/events.py +21 -0
  146. pipecat/services/openai_realtime/frames.py +21 -0
  147. pipecat/services/openai_realtime_beta/azure.py +16 -0
  148. pipecat/services/openai_realtime_beta/openai.py +17 -5
  149. pipecat/services/piper/tts.py +7 -9
  150. pipecat/services/playht/tts.py +34 -4
  151. pipecat/services/rime/tts.py +12 -12
  152. pipecat/services/riva/stt.py +3 -1
  153. pipecat/services/salesforce/__init__.py +9 -0
  154. pipecat/services/salesforce/llm.py +700 -0
  155. pipecat/services/sarvam/__init__.py +7 -0
  156. pipecat/services/sarvam/stt.py +540 -0
  157. pipecat/services/sarvam/tts.py +97 -13
  158. pipecat/services/simli/video.py +2 -2
  159. pipecat/services/speechmatics/stt.py +22 -10
  160. pipecat/services/stt_service.py +47 -0
  161. pipecat/services/tavus/video.py +2 -2
  162. pipecat/services/tts_service.py +75 -22
  163. pipecat/services/vision_service.py +7 -6
  164. pipecat/services/vistaar/llm.py +51 -9
  165. pipecat/tests/utils.py +4 -4
  166. pipecat/transcriptions/language.py +41 -1
  167. pipecat/transports/base_input.py +13 -34
  168. pipecat/transports/base_output.py +140 -104
  169. pipecat/transports/daily/transport.py +199 -26
  170. pipecat/transports/heygen/__init__.py +0 -0
  171. pipecat/transports/heygen/transport.py +381 -0
  172. pipecat/transports/livekit/transport.py +228 -63
  173. pipecat/transports/local/audio.py +6 -1
  174. pipecat/transports/local/tk.py +11 -2
  175. pipecat/transports/network/fastapi_websocket.py +1 -1
  176. pipecat/transports/smallwebrtc/connection.py +103 -19
  177. pipecat/transports/smallwebrtc/request_handler.py +246 -0
  178. pipecat/transports/smallwebrtc/transport.py +65 -23
  179. pipecat/transports/tavus/transport.py +23 -12
  180. pipecat/transports/websocket/client.py +41 -5
  181. pipecat/transports/websocket/fastapi.py +21 -11
  182. pipecat/transports/websocket/server.py +14 -7
  183. pipecat/transports/whatsapp/api.py +8 -0
  184. pipecat/transports/whatsapp/client.py +47 -0
  185. pipecat/utils/base_object.py +54 -22
  186. pipecat/utils/redis.py +58 -0
  187. pipecat/utils/string.py +13 -1
  188. pipecat/utils/tracing/service_decorators.py +21 -21
  189. pipecat/serializers/genesys.py +0 -95
  190. pipecat/services/google/test-google-chirp.py +0 -45
  191. pipecat/services/openai.py +0 -698
  192. {dv_pipecat_ai-0.0.82.dev857.dist-info → dv_pipecat_ai-0.0.85.dev837.dist-info}/WHEEL +0 -0
  193. {dv_pipecat_ai-0.0.82.dev857.dist-info → dv_pipecat_ai-0.0.85.dev837.dist-info}/licenses/LICENSE +0 -0
  194. {dv_pipecat_ai-0.0.82.dev857.dist-info → dv_pipecat_ai-0.0.85.dev837.dist-info}/top_level.txt +0 -0
  195. /pipecat/services/{aws_nova_sonic → aws/nova_sonic}/ready.wav +0 -0
pipecat/services/fish/tts.py
@@ -21,8 +21,8 @@ from pipecat.frames.frames import (
21
21
  EndFrame,
22
22
  ErrorFrame,
23
23
  Frame,
24
+ InterruptionFrame,
24
25
  StartFrame,
25
- StartInterruptionFrame,
26
26
  TTSAudioRawFrame,
27
27
  TTSStartedFrame,
28
28
  TTSStoppedFrame,
@@ -225,6 +225,8 @@ class FishAudioTTSService(InterruptibleTTSService):
225
225
  start_message = {"event": "start", "request": {"text": "", **self._settings}}
226
226
  await self._websocket.send(ormsgpack.packb(start_message))
227
227
  logger.debug("Sent start message to Fish Audio")
228
+
229
+ await self._call_event_handler("on_connected")
228
230
  except Exception as e:
229
231
  logger.error(f"Fish Audio initialization error: {e}")
230
232
  self._websocket = None
@@ -245,6 +247,7 @@ class FishAudioTTSService(InterruptibleTTSService):
245
247
  self._request_id = None
246
248
  self._started = False
247
249
  self._websocket = None
250
+ await self._call_event_handler("on_disconnected")
248
251
 
249
252
  async def flush_audio(self):
250
253
  """Flush any buffered audio by sending a flush event to Fish Audio."""
@@ -259,7 +262,7 @@ class FishAudioTTSService(InterruptibleTTSService):
259
262
  return self._websocket
260
263
  raise Exception("Websocket not connected")
261
264
 
262
- async def _handle_interruption(self, frame: StartInterruptionFrame, direction: FrameDirection):
265
+ async def _handle_interruption(self, frame: InterruptionFrame, direction: FrameDirection):
263
266
  await super()._handle_interruption(frame, direction)
264
267
  await self.stop_all_metrics()
265
268
  self._request_id = None
pipecat/services/gemini_multimodal_live/events.py
@@ -4,527 +4,41 @@
4
4
  # SPDX-License-Identifier: BSD 2-Clause License
5
5
  #
6
6
 
7
- """Event models and utilities for Google Gemini Multimodal Live API."""
8
-
9
- import base64
10
- import io
11
- import json
12
- from enum import Enum
13
- from typing import List, Literal, Optional
14
-
15
- from PIL import Image
16
- from pydantic import BaseModel, Field
17
-
18
- from pipecat.frames.frames import ImageRawFrame
19
-
20
- #
21
- # Client events
22
- #
23
-
24
-
25
- class MediaChunk(BaseModel):
26
- """Represents a chunk of media data for transmission.
27
-
28
- Parameters:
29
- mimeType: MIME type of the media content.
30
- data: Base64-encoded media data.
31
- """
32
-
33
- mimeType: str
34
- data: str
35
-
36
-
37
- class ContentPart(BaseModel):
38
- """Represents a part of content that can contain text or media.
39
-
40
- Parameters:
41
- text: Text content. Defaults to None.
42
- inlineData: Inline media data. Defaults to None.
43
- """
44
-
45
- text: Optional[str] = Field(default=None, validate_default=False)
46
- inlineData: Optional[MediaChunk] = Field(default=None, validate_default=False)
47
- fileData: Optional["FileData"] = Field(default=None, validate_default=False)
48
-
49
-
50
- class FileData(BaseModel):
51
- """Represents a file reference in the Gemini File API."""
52
-
53
- mimeType: str
54
- fileUri: str
55
-
56
-
57
- ContentPart.model_rebuild() # Rebuild model to resolve forward reference
58
-
59
-
60
- class Turn(BaseModel):
61
- """Represents a conversational turn in the dialogue.
62
-
63
- Parameters:
64
- role: The role of the speaker, either "user" or "model". Defaults to "user".
65
- parts: List of content parts that make up the turn.
66
- """
67
-
68
- role: Literal["user", "model"] = "user"
69
- parts: List[ContentPart]
70
-
71
-
72
- class StartSensitivity(str, Enum):
73
- """Determines how start of speech is detected."""
74
-
75
- UNSPECIFIED = "START_SENSITIVITY_UNSPECIFIED" # Default is HIGH
76
- HIGH = "START_SENSITIVITY_HIGH" # Detect start of speech more often
77
- LOW = "START_SENSITIVITY_LOW" # Detect start of speech less often
78
-
79
-
80
- class EndSensitivity(str, Enum):
81
- """Determines how end of speech is detected."""
82
-
83
- UNSPECIFIED = "END_SENSITIVITY_UNSPECIFIED" # Default is HIGH
84
- HIGH = "END_SENSITIVITY_HIGH" # End speech more often
85
- LOW = "END_SENSITIVITY_LOW" # End speech less often
86
-
87
-
88
- class AutomaticActivityDetection(BaseModel):
89
- """Configures automatic detection of voice activity.
90
-
91
- Parameters:
92
- disabled: Whether automatic activity detection is disabled. Defaults to None.
93
- start_of_speech_sensitivity: Sensitivity for detecting speech start. Defaults to None.
94
- prefix_padding_ms: Padding before speech start in milliseconds. Defaults to None.
95
- end_of_speech_sensitivity: Sensitivity for detecting speech end. Defaults to None.
96
- silence_duration_ms: Duration of silence to detect speech end. Defaults to None.
97
- """
98
-
99
- disabled: Optional[bool] = None
100
- start_of_speech_sensitivity: Optional[StartSensitivity] = None
101
- prefix_padding_ms: Optional[int] = None
102
- end_of_speech_sensitivity: Optional[EndSensitivity] = None
103
- silence_duration_ms: Optional[int] = None
104
-
105
-
106
- class RealtimeInputConfig(BaseModel):
107
- """Configures the realtime input behavior.
108
-
109
- Parameters:
110
- automatic_activity_detection: Voice activity detection configuration. Defaults to None.
111
- """
112
-
113
- automatic_activity_detection: Optional[AutomaticActivityDetection] = None
114
-
115
-
116
- class RealtimeInput(BaseModel):
117
- """Contains realtime input media chunks and text.
118
-
119
- Parameters:
120
- mediaChunks: List of media chunks for realtime processing.
121
- text: Text for realtime processing.
122
- """
123
-
124
- mediaChunks: Optional[List[MediaChunk]] = None
125
- text: Optional[str] = None
126
-
127
-
128
- class ClientContent(BaseModel):
129
- """Content sent from client to the Gemini Live API.
130
-
131
- Parameters:
132
- turns: List of conversation turns. Defaults to None.
133
- turnComplete: Whether the client's turn is complete. Defaults to False.
134
- """
135
-
136
- turns: Optional[List[Turn]] = None
137
- turnComplete: bool = False
138
-
139
-
140
- class AudioInputMessage(BaseModel):
141
- """Message containing audio input data.
142
-
143
- Parameters:
144
- realtimeInput: Realtime input containing audio chunks.
145
- """
146
-
147
- realtimeInput: RealtimeInput
148
-
149
- @classmethod
150
- def from_raw_audio(cls, raw_audio: bytes, sample_rate: int) -> "AudioInputMessage":
151
- """Create an audio input message from raw audio data.
152
-
153
- Args:
154
- raw_audio: Raw audio bytes.
155
- sample_rate: Audio sample rate in Hz.
156
-
157
- Returns:
158
- AudioInputMessage instance with encoded audio data.
159
- """
160
- data = base64.b64encode(raw_audio).decode("utf-8")
161
- return cls(
162
- realtimeInput=RealtimeInput(
163
- mediaChunks=[MediaChunk(mimeType=f"audio/pcm;rate={sample_rate}", data=data)]
164
- )
165
- )
166
-
167
-
168
- class VideoInputMessage(BaseModel):
169
- """Message containing video/image input data.
170
-
171
- Parameters:
172
- realtimeInput: Realtime input containing video/image chunks.
173
- """
174
-
175
- realtimeInput: RealtimeInput
176
-
177
- @classmethod
178
- def from_image_frame(cls, frame: ImageRawFrame) -> "VideoInputMessage":
179
- """Create a video input message from an image frame.
180
-
181
- Args:
182
- frame: Image frame to encode.
183
-
184
- Returns:
185
- VideoInputMessage instance with encoded image data.
186
- """
187
- buffer = io.BytesIO()
188
- Image.frombytes(frame.format, frame.size, frame.image).save(buffer, format="JPEG")
189
- data = base64.b64encode(buffer.getvalue()).decode("utf-8")
190
- return cls(
191
- realtimeInput=RealtimeInput(mediaChunks=[MediaChunk(mimeType=f"image/jpeg", data=data)])
192
- )
193
-
194
-
195
- class TextInputMessage(BaseModel):
196
- """Message containing text input data."""
197
-
198
- realtimeInput: RealtimeInput
199
-
200
- @classmethod
201
- def from_text(cls, text: str) -> "TextInputMessage":
202
- """Create a text input message from a string.
203
-
204
- Args:
205
- text: The text to send.
206
-
207
- Returns:
208
- A TextInputMessage instance.
209
- """
210
- return cls(realtimeInput=RealtimeInput(text=text))
211
-
212
-
213
- class ClientContentMessage(BaseModel):
214
- """Message containing client content for the API.
215
-
216
- Parameters:
217
- clientContent: The client content to send.
218
- """
219
-
220
- clientContent: ClientContent
221
-
222
-
223
- class SystemInstruction(BaseModel):
224
- """System instruction for the model.
225
-
226
- Parameters:
227
- parts: List of content parts that make up the system instruction.
228
- """
229
-
230
- parts: List[ContentPart]
231
-
232
-
233
- class AudioTranscriptionConfig(BaseModel):
234
- """Configuration for audio transcription."""
235
-
236
- pass
237
-
238
-
239
- class Setup(BaseModel):
240
- """Setup configuration for the Gemini Live session.
241
-
242
- Parameters:
243
- model: Model identifier to use.
244
- system_instruction: System instruction for the model. Defaults to None.
245
- tools: List of available tools/functions. Defaults to None.
246
- generation_config: Generation configuration parameters. Defaults to None.
247
- input_audio_transcription: Input audio transcription config. Defaults to None.
248
- output_audio_transcription: Output audio transcription config. Defaults to None.
249
- realtime_input_config: Realtime input configuration. Defaults to None.
250
- """
251
-
252
- model: str
253
- system_instruction: Optional[SystemInstruction] = None
254
- tools: Optional[List[dict]] = None
255
- generation_config: Optional[dict] = None
256
- input_audio_transcription: Optional[AudioTranscriptionConfig] = None
257
- output_audio_transcription: Optional[AudioTranscriptionConfig] = None
258
- realtime_input_config: Optional[RealtimeInputConfig] = None
259
-
260
-
261
- class Config(BaseModel):
262
- """Configuration message for session setup.
263
-
264
- Parameters:
265
- setup: Setup configuration for the session.
266
- """
267
-
268
- setup: Setup
269
-
270
-
271
- #
272
- # Grounding metadata models
273
- #
274
-
275
-
276
- class SearchEntryPoint(BaseModel):
277
- """Represents the search entry point with rendered content for search suggestions."""
278
-
279
- renderedContent: Optional[str] = None
280
-
281
-
282
- class WebSource(BaseModel):
283
- """Represents a web source from grounding chunks."""
284
-
285
- uri: Optional[str] = None
286
- title: Optional[str] = None
287
-
288
-
289
- class GroundingChunk(BaseModel):
290
- """Represents a grounding chunk containing web source information."""
291
-
292
- web: Optional[WebSource] = None
293
-
294
-
295
- class GroundingSegment(BaseModel):
296
- """Represents a segment of text that is grounded."""
297
-
298
- startIndex: Optional[int] = None
299
- endIndex: Optional[int] = None
300
- text: Optional[str] = None
301
-
302
-
303
- class GroundingSupport(BaseModel):
304
- """Represents support information for grounded text segments."""
305
-
306
- segment: Optional[GroundingSegment] = None
307
- groundingChunkIndices: Optional[List[int]] = None
308
- confidenceScores: Optional[List[float]] = None
309
-
310
-
311
- class GroundingMetadata(BaseModel):
312
- """Represents grounding metadata from Google Search."""
313
-
314
- searchEntryPoint: Optional[SearchEntryPoint] = None
315
- groundingChunks: Optional[List[GroundingChunk]] = None
316
- groundingSupports: Optional[List[GroundingSupport]] = None
317
- webSearchQueries: Optional[List[str]] = None
318
-
319
-
320
- #
321
- # Server events
322
- #
323
-
324
-
325
- class SetupComplete(BaseModel):
326
- """Indicates that session setup is complete."""
327
-
328
- pass
329
-
330
-
331
- class InlineData(BaseModel):
332
- """Inline data embedded in server responses.
333
-
334
- Parameters:
335
- mimeType: MIME type of the data.
336
- data: Base64-encoded data content.
337
- """
338
-
339
- mimeType: str
340
- data: str
341
-
342
-
343
- class Part(BaseModel):
344
- """Part of a server response containing data or text.
345
-
346
- Parameters:
347
- inlineData: Inline binary data. Defaults to None.
348
- text: Text content. Defaults to None.
349
- """
350
-
351
- inlineData: Optional[InlineData] = None
352
- text: Optional[str] = None
353
-
354
-
355
- class ModelTurn(BaseModel):
356
- """Represents a turn from the model in the conversation.
357
-
358
- Parameters:
359
- parts: List of content parts in the model's response.
360
- """
361
-
362
- parts: List[Part]
363
-
364
-
365
- class ServerContentInterrupted(BaseModel):
366
- """Indicates server content was interrupted.
367
-
368
- Parameters:
369
- interrupted: Whether the content was interrupted.
370
- """
371
-
372
- interrupted: bool
373
-
374
-
375
- class ServerContentTurnComplete(BaseModel):
376
- """Indicates the server's turn is complete.
377
-
378
- Parameters:
379
- turnComplete: Whether the turn is complete.
380
- """
381
-
382
- turnComplete: bool
383
-
384
-
385
- class BidiGenerateContentTranscription(BaseModel):
386
- """Transcription data from bidirectional content generation.
387
-
388
- Parameters:
389
- text: The transcribed text content.
390
- """
391
-
392
- text: str
393
-
394
-
395
- class ServerContent(BaseModel):
396
- """Content sent from server to client.
397
-
398
- Parameters:
399
- modelTurn: Model's conversational turn. Defaults to None.
400
- interrupted: Whether content was interrupted. Defaults to None.
401
- turnComplete: Whether the turn is complete. Defaults to None.
402
- inputTranscription: Transcription of input audio. Defaults to None.
403
- outputTranscription: Transcription of output audio. Defaults to None.
404
- """
405
-
406
- modelTurn: Optional[ModelTurn] = None
407
- interrupted: Optional[bool] = None
408
- turnComplete: Optional[bool] = None
409
- inputTranscription: Optional[BidiGenerateContentTranscription] = None
410
- outputTranscription: Optional[BidiGenerateContentTranscription] = None
411
- groundingMetadata: Optional[GroundingMetadata] = None
412
-
413
-
414
- class FunctionCall(BaseModel):
415
- """Represents a function call from the model.
416
-
417
- Parameters:
418
- id: Unique identifier for the function call.
419
- name: Name of the function to call.
420
- args: Arguments to pass to the function.
421
- """
422
-
423
- id: str
424
- name: str
425
- args: dict
426
-
427
-
428
- class ToolCall(BaseModel):
429
- """Contains one or more function calls.
430
-
431
- Parameters:
432
- functionCalls: List of function calls to execute.
433
- """
434
-
435
- functionCalls: List[FunctionCall]
436
-
437
-
438
- class Modality(str, Enum):
439
- """Modality types in token counts."""
440
-
441
- UNSPECIFIED = "MODALITY_UNSPECIFIED"
442
- TEXT = "TEXT"
443
- IMAGE = "IMAGE"
444
- AUDIO = "AUDIO"
445
- VIDEO = "VIDEO"
446
-
447
-
448
- class ModalityTokenCount(BaseModel):
449
- """Token count for a specific modality.
450
-
451
- Parameters:
452
- modality: The modality type.
453
- tokenCount: Number of tokens for this modality.
454
- """
455
-
456
- modality: Modality
457
- tokenCount: int
458
-
459
-
460
- class UsageMetadata(BaseModel):
461
- """Usage metadata about the API response.
462
-
463
- Parameters:
464
- promptTokenCount: Number of tokens in the prompt. Defaults to None.
465
- cachedContentTokenCount: Number of cached content tokens. Defaults to None.
466
- responseTokenCount: Number of tokens in the response. Defaults to None.
467
- toolUsePromptTokenCount: Number of tokens for tool use prompts. Defaults to None.
468
- thoughtsTokenCount: Number of tokens for model thoughts. Defaults to None.
469
- totalTokenCount: Total number of tokens used. Defaults to None.
470
- promptTokensDetails: Detailed breakdown of prompt tokens by modality. Defaults to None.
471
- cacheTokensDetails: Detailed breakdown of cache tokens by modality. Defaults to None.
472
- responseTokensDetails: Detailed breakdown of response tokens by modality. Defaults to None.
473
- toolUsePromptTokensDetails: Detailed breakdown of tool use tokens by modality. Defaults to None.
474
- """
475
-
476
- promptTokenCount: Optional[int] = None
477
- cachedContentTokenCount: Optional[int] = None
478
- responseTokenCount: Optional[int] = None
479
- toolUsePromptTokenCount: Optional[int] = None
480
- thoughtsTokenCount: Optional[int] = None
481
- totalTokenCount: Optional[int] = None
482
- promptTokensDetails: Optional[List[ModalityTokenCount]] = None
483
- cacheTokensDetails: Optional[List[ModalityTokenCount]] = None
484
- responseTokensDetails: Optional[List[ModalityTokenCount]] = None
485
- toolUsePromptTokensDetails: Optional[List[ModalityTokenCount]] = None
486
-
487
-
488
- class ServerEvent(BaseModel):
489
- """Server event received from the Gemini Live API.
490
-
491
- Parameters:
492
- setupComplete: Setup completion notification. Defaults to None.
493
- serverContent: Content from the server. Defaults to None.
494
- toolCall: Tool/function call request. Defaults to None.
495
- usageMetadata: Token usage metadata. Defaults to None.
496
- """
497
-
498
- setupComplete: Optional[SetupComplete] = None
499
- serverContent: Optional[ServerContent] = None
500
- toolCall: Optional[ToolCall] = None
501
- usageMetadata: Optional[UsageMetadata] = None
502
-
503
-
504
- def parse_server_event(str):
505
- """Parse a server event from JSON string.
506
-
507
- Args:
508
- str: JSON string containing the server event.
509
-
510
- Returns:
511
- ServerEvent instance if parsing succeeds, None otherwise.
512
- """
513
- try:
514
- evt = json.loads(str)
515
- return ServerEvent.model_validate(evt)
516
- except Exception as e:
517
- print(f"Error parsing server event: {e}")
518
- return None
519
-
520
-
521
- class ContextWindowCompressionConfig(BaseModel):
522
- """Configuration for context window compression.
523
-
524
- Parameters:
525
- sliding_window: Whether to use sliding window compression. Defaults to True.
526
- trigger_tokens: Token count threshold to trigger compression. Defaults to None.
527
- """
528
-
529
- sliding_window: Optional[bool] = Field(default=True)
530
- trigger_tokens: Optional[int] = Field(default=None)
7
+ """Event models and utilities for Google Gemini Multimodal Live API.
8
+
9
+ .. deprecated:: 0.0.90
10
+ Importing StartSensitivity and EndSensitivity from this module is deprecated.
11
+ Import them directly from google.genai.types instead.
12
+ """
13
+
14
+ import warnings
15
+
16
+ from loguru import logger
17
+
18
+ try:
19
+ from google.genai.types import (
20
+ EndSensitivity as _EndSensitivity,
21
+ )
22
+ from google.genai.types import (
23
+ StartSensitivity as _StartSensitivity,
24
+ )
25
+ except ModuleNotFoundError as e:
26
+ logger.error(f"Exception: {e}")
27
+ logger.error("In order to use Google AI, you need to `pip install pipecat-ai[google]`.")
28
+ raise Exception(f"Missing module: {e}")
29
+
30
+ # These aliases are just here for backward compatibility, since we used to
31
+ # define public-facing StartSensitivity and EndSensitivity enums in this
32
+ # module.
33
+ with warnings.catch_warnings():
34
+ warnings.simplefilter("always")
35
+ warnings.warn(
36
+ "Importing StartSensitivity and EndSensitivity from "
37
+ "pipecat.services.gemini_multimodal_live.events is deprecated. "
38
+ "Please import them directly from google.genai.types instead.",
39
+ DeprecationWarning,
40
+ stacklevel=2,
41
+ )
42
+
43
+ StartSensitivity = _StartSensitivity
44
+ EndSensitivity = _EndSensitivity