dv-pipecat-ai 0.0.74.dev770__py3-none-any.whl → 0.0.82.dev776__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of dv-pipecat-ai might be problematic.

Files changed (244)
  1. {dv_pipecat_ai-0.0.74.dev770.dist-info → dv_pipecat_ai-0.0.82.dev776.dist-info}/METADATA +137 -93
  2. dv_pipecat_ai-0.0.82.dev776.dist-info/RECORD +340 -0
  3. pipecat/__init__.py +17 -0
  4. pipecat/adapters/base_llm_adapter.py +36 -1
  5. pipecat/adapters/schemas/direct_function.py +296 -0
  6. pipecat/adapters/schemas/function_schema.py +15 -6
  7. pipecat/adapters/schemas/tools_schema.py +55 -7
  8. pipecat/adapters/services/anthropic_adapter.py +22 -3
  9. pipecat/adapters/services/aws_nova_sonic_adapter.py +23 -3
  10. pipecat/adapters/services/bedrock_adapter.py +22 -3
  11. pipecat/adapters/services/gemini_adapter.py +16 -3
  12. pipecat/adapters/services/open_ai_adapter.py +17 -2
  13. pipecat/adapters/services/open_ai_realtime_adapter.py +23 -3
  14. pipecat/audio/filters/base_audio_filter.py +30 -6
  15. pipecat/audio/filters/koala_filter.py +37 -2
  16. pipecat/audio/filters/krisp_filter.py +59 -6
  17. pipecat/audio/filters/noisereduce_filter.py +37 -0
  18. pipecat/audio/interruptions/base_interruption_strategy.py +25 -5
  19. pipecat/audio/interruptions/min_words_interruption_strategy.py +21 -4
  20. pipecat/audio/mixers/base_audio_mixer.py +30 -7
  21. pipecat/audio/mixers/soundfile_mixer.py +53 -6
  22. pipecat/audio/resamplers/base_audio_resampler.py +17 -9
  23. pipecat/audio/resamplers/resampy_resampler.py +26 -1
  24. pipecat/audio/resamplers/soxr_resampler.py +32 -1
  25. pipecat/audio/resamplers/soxr_stream_resampler.py +101 -0
  26. pipecat/audio/utils.py +194 -1
  27. pipecat/audio/vad/silero.py +60 -3
  28. pipecat/audio/vad/vad_analyzer.py +114 -30
  29. pipecat/clocks/base_clock.py +19 -0
  30. pipecat/clocks/system_clock.py +25 -0
  31. pipecat/extensions/voicemail/__init__.py +0 -0
  32. pipecat/extensions/voicemail/voicemail_detector.py +707 -0
  33. pipecat/frames/frames.py +590 -156
  34. pipecat/metrics/metrics.py +64 -1
  35. pipecat/observers/base_observer.py +58 -19
  36. pipecat/observers/loggers/debug_log_observer.py +56 -64
  37. pipecat/observers/loggers/llm_log_observer.py +8 -1
  38. pipecat/observers/loggers/transcription_log_observer.py +19 -7
  39. pipecat/observers/loggers/user_bot_latency_log_observer.py +32 -5
  40. pipecat/observers/turn_tracking_observer.py +26 -1
  41. pipecat/pipeline/base_pipeline.py +5 -7
  42. pipecat/pipeline/base_task.py +52 -9
  43. pipecat/pipeline/parallel_pipeline.py +121 -177
  44. pipecat/pipeline/pipeline.py +129 -20
  45. pipecat/pipeline/runner.py +50 -1
  46. pipecat/pipeline/sync_parallel_pipeline.py +132 -32
  47. pipecat/pipeline/task.py +263 -280
  48. pipecat/pipeline/task_observer.py +85 -34
  49. pipecat/pipeline/to_be_updated/merge_pipeline.py +32 -2
  50. pipecat/processors/aggregators/dtmf_aggregator.py +29 -22
  51. pipecat/processors/aggregators/gated.py +25 -24
  52. pipecat/processors/aggregators/gated_openai_llm_context.py +22 -2
  53. pipecat/processors/aggregators/llm_response.py +398 -89
  54. pipecat/processors/aggregators/openai_llm_context.py +161 -13
  55. pipecat/processors/aggregators/sentence.py +25 -14
  56. pipecat/processors/aggregators/user_response.py +28 -3
  57. pipecat/processors/aggregators/vision_image_frame.py +24 -14
  58. pipecat/processors/async_generator.py +28 -0
  59. pipecat/processors/audio/audio_buffer_processor.py +78 -37
  60. pipecat/processors/consumer_processor.py +25 -6
  61. pipecat/processors/filters/frame_filter.py +23 -0
  62. pipecat/processors/filters/function_filter.py +30 -0
  63. pipecat/processors/filters/identity_filter.py +17 -2
  64. pipecat/processors/filters/null_filter.py +24 -1
  65. pipecat/processors/filters/stt_mute_filter.py +56 -21
  66. pipecat/processors/filters/wake_check_filter.py +46 -3
  67. pipecat/processors/filters/wake_notifier_filter.py +21 -3
  68. pipecat/processors/frame_processor.py +488 -131
  69. pipecat/processors/frameworks/langchain.py +38 -3
  70. pipecat/processors/frameworks/rtvi.py +719 -34
  71. pipecat/processors/gstreamer/pipeline_source.py +41 -0
  72. pipecat/processors/idle_frame_processor.py +26 -3
  73. pipecat/processors/logger.py +23 -0
  74. pipecat/processors/metrics/frame_processor_metrics.py +77 -4
  75. pipecat/processors/metrics/sentry.py +42 -4
  76. pipecat/processors/producer_processor.py +34 -14
  77. pipecat/processors/text_transformer.py +22 -10
  78. pipecat/processors/transcript_processor.py +48 -29
  79. pipecat/processors/user_idle_processor.py +31 -21
  80. pipecat/runner/__init__.py +1 -0
  81. pipecat/runner/daily.py +132 -0
  82. pipecat/runner/livekit.py +148 -0
  83. pipecat/runner/run.py +543 -0
  84. pipecat/runner/types.py +67 -0
  85. pipecat/runner/utils.py +515 -0
  86. pipecat/serializers/base_serializer.py +42 -0
  87. pipecat/serializers/exotel.py +17 -6
  88. pipecat/serializers/genesys.py +95 -0
  89. pipecat/serializers/livekit.py +33 -0
  90. pipecat/serializers/plivo.py +16 -15
  91. pipecat/serializers/protobuf.py +37 -1
  92. pipecat/serializers/telnyx.py +18 -17
  93. pipecat/serializers/twilio.py +32 -16
  94. pipecat/services/ai_service.py +5 -3
  95. pipecat/services/anthropic/llm.py +113 -43
  96. pipecat/services/assemblyai/models.py +63 -5
  97. pipecat/services/assemblyai/stt.py +64 -11
  98. pipecat/services/asyncai/__init__.py +0 -0
  99. pipecat/services/asyncai/tts.py +501 -0
  100. pipecat/services/aws/llm.py +185 -111
  101. pipecat/services/aws/stt.py +217 -23
  102. pipecat/services/aws/tts.py +118 -52
  103. pipecat/services/aws/utils.py +101 -5
  104. pipecat/services/aws_nova_sonic/aws.py +82 -64
  105. pipecat/services/aws_nova_sonic/context.py +15 -6
  106. pipecat/services/azure/common.py +10 -2
  107. pipecat/services/azure/image.py +32 -0
  108. pipecat/services/azure/llm.py +9 -7
  109. pipecat/services/azure/stt.py +65 -2
  110. pipecat/services/azure/tts.py +154 -23
  111. pipecat/services/cartesia/stt.py +125 -8
  112. pipecat/services/cartesia/tts.py +102 -38
  113. pipecat/services/cerebras/llm.py +15 -23
  114. pipecat/services/deepgram/stt.py +19 -11
  115. pipecat/services/deepgram/tts.py +36 -0
  116. pipecat/services/deepseek/llm.py +14 -23
  117. pipecat/services/elevenlabs/tts.py +330 -64
  118. pipecat/services/fal/image.py +43 -0
  119. pipecat/services/fal/stt.py +48 -10
  120. pipecat/services/fireworks/llm.py +14 -21
  121. pipecat/services/fish/tts.py +109 -9
  122. pipecat/services/gemini_multimodal_live/__init__.py +1 -0
  123. pipecat/services/gemini_multimodal_live/events.py +83 -2
  124. pipecat/services/gemini_multimodal_live/file_api.py +189 -0
  125. pipecat/services/gemini_multimodal_live/gemini.py +218 -21
  126. pipecat/services/gladia/config.py +17 -10
  127. pipecat/services/gladia/stt.py +82 -36
  128. pipecat/services/google/frames.py +40 -0
  129. pipecat/services/google/google.py +2 -0
  130. pipecat/services/google/image.py +39 -2
  131. pipecat/services/google/llm.py +176 -58
  132. pipecat/services/google/llm_openai.py +26 -4
  133. pipecat/services/google/llm_vertex.py +37 -15
  134. pipecat/services/google/rtvi.py +41 -0
  135. pipecat/services/google/stt.py +65 -17
  136. pipecat/services/google/test-google-chirp.py +45 -0
  137. pipecat/services/google/tts.py +390 -19
  138. pipecat/services/grok/llm.py +8 -6
  139. pipecat/services/groq/llm.py +8 -6
  140. pipecat/services/groq/stt.py +13 -9
  141. pipecat/services/groq/tts.py +40 -0
  142. pipecat/services/hamsa/__init__.py +9 -0
  143. pipecat/services/hamsa/stt.py +241 -0
  144. pipecat/services/heygen/__init__.py +5 -0
  145. pipecat/services/heygen/api.py +281 -0
  146. pipecat/services/heygen/client.py +620 -0
  147. pipecat/services/heygen/video.py +338 -0
  148. pipecat/services/image_service.py +5 -3
  149. pipecat/services/inworld/__init__.py +1 -0
  150. pipecat/services/inworld/tts.py +592 -0
  151. pipecat/services/llm_service.py +127 -45
  152. pipecat/services/lmnt/tts.py +80 -7
  153. pipecat/services/mcp_service.py +85 -44
  154. pipecat/services/mem0/memory.py +42 -13
  155. pipecat/services/minimax/tts.py +74 -15
  156. pipecat/services/mistral/__init__.py +0 -0
  157. pipecat/services/mistral/llm.py +185 -0
  158. pipecat/services/moondream/vision.py +55 -10
  159. pipecat/services/neuphonic/tts.py +275 -48
  160. pipecat/services/nim/llm.py +8 -6
  161. pipecat/services/ollama/llm.py +27 -7
  162. pipecat/services/openai/base_llm.py +54 -16
  163. pipecat/services/openai/image.py +30 -0
  164. pipecat/services/openai/llm.py +7 -5
  165. pipecat/services/openai/stt.py +13 -9
  166. pipecat/services/openai/tts.py +42 -10
  167. pipecat/services/openai_realtime_beta/azure.py +11 -9
  168. pipecat/services/openai_realtime_beta/context.py +7 -5
  169. pipecat/services/openai_realtime_beta/events.py +10 -7
  170. pipecat/services/openai_realtime_beta/openai.py +37 -18
  171. pipecat/services/openpipe/llm.py +30 -24
  172. pipecat/services/openrouter/llm.py +9 -7
  173. pipecat/services/perplexity/llm.py +15 -19
  174. pipecat/services/piper/tts.py +26 -12
  175. pipecat/services/playht/tts.py +227 -65
  176. pipecat/services/qwen/llm.py +8 -6
  177. pipecat/services/rime/tts.py +128 -17
  178. pipecat/services/riva/stt.py +160 -22
  179. pipecat/services/riva/tts.py +67 -2
  180. pipecat/services/sambanova/llm.py +19 -17
  181. pipecat/services/sambanova/stt.py +14 -8
  182. pipecat/services/sarvam/tts.py +60 -13
  183. pipecat/services/simli/video.py +82 -21
  184. pipecat/services/soniox/__init__.py +0 -0
  185. pipecat/services/soniox/stt.py +398 -0
  186. pipecat/services/speechmatics/stt.py +29 -17
  187. pipecat/services/stt_service.py +47 -11
  188. pipecat/services/tavus/video.py +94 -25
  189. pipecat/services/together/llm.py +8 -6
  190. pipecat/services/tts_service.py +77 -53
  191. pipecat/services/ultravox/stt.py +46 -43
  192. pipecat/services/vision_service.py +5 -3
  193. pipecat/services/websocket_service.py +12 -11
  194. pipecat/services/whisper/base_stt.py +58 -12
  195. pipecat/services/whisper/stt.py +69 -58
  196. pipecat/services/xtts/tts.py +59 -2
  197. pipecat/sync/base_notifier.py +19 -0
  198. pipecat/sync/event_notifier.py +24 -0
  199. pipecat/tests/utils.py +73 -5
  200. pipecat/transcriptions/language.py +24 -0
  201. pipecat/transports/base_input.py +112 -8
  202. pipecat/transports/base_output.py +235 -13
  203. pipecat/transports/base_transport.py +119 -0
  204. pipecat/transports/local/audio.py +76 -0
  205. pipecat/transports/local/tk.py +84 -0
  206. pipecat/transports/network/fastapi_websocket.py +174 -15
  207. pipecat/transports/network/small_webrtc.py +383 -39
  208. pipecat/transports/network/webrtc_connection.py +214 -8
  209. pipecat/transports/network/websocket_client.py +171 -1
  210. pipecat/transports/network/websocket_server.py +147 -9
  211. pipecat/transports/services/daily.py +792 -70
  212. pipecat/transports/services/helpers/daily_rest.py +122 -129
  213. pipecat/transports/services/livekit.py +339 -4
  214. pipecat/transports/services/tavus.py +273 -38
  215. pipecat/utils/asyncio/task_manager.py +92 -186
  216. pipecat/utils/base_object.py +83 -1
  217. pipecat/utils/network.py +2 -0
  218. pipecat/utils/string.py +114 -58
  219. pipecat/utils/text/base_text_aggregator.py +44 -13
  220. pipecat/utils/text/base_text_filter.py +46 -0
  221. pipecat/utils/text/markdown_text_filter.py +70 -14
  222. pipecat/utils/text/pattern_pair_aggregator.py +18 -14
  223. pipecat/utils/text/simple_text_aggregator.py +43 -2
  224. pipecat/utils/text/skip_tags_aggregator.py +21 -13
  225. pipecat/utils/time.py +36 -0
  226. pipecat/utils/tracing/class_decorators.py +32 -7
  227. pipecat/utils/tracing/conversation_context_provider.py +12 -2
  228. pipecat/utils/tracing/service_attributes.py +80 -64
  229. pipecat/utils/tracing/service_decorators.py +48 -21
  230. pipecat/utils/tracing/setup.py +13 -7
  231. pipecat/utils/tracing/turn_context_provider.py +12 -2
  232. pipecat/utils/tracing/turn_trace_observer.py +27 -0
  233. pipecat/utils/utils.py +14 -14
  234. dv_pipecat_ai-0.0.74.dev770.dist-info/RECORD +0 -319
  235. pipecat/examples/daily_runner.py +0 -64
  236. pipecat/examples/run.py +0 -265
  237. pipecat/utils/asyncio/watchdog_async_iterator.py +0 -72
  238. pipecat/utils/asyncio/watchdog_event.py +0 -42
  239. pipecat/utils/asyncio/watchdog_priority_queue.py +0 -48
  240. pipecat/utils/asyncio/watchdog_queue.py +0 -48
  241. {dv_pipecat_ai-0.0.74.dev770.dist-info → dv_pipecat_ai-0.0.82.dev776.dist-info}/WHEEL +0 -0
  242. {dv_pipecat_ai-0.0.74.dev770.dist-info → dv_pipecat_ai-0.0.82.dev776.dist-info}/licenses/LICENSE +0 -0
  243. {dv_pipecat_ai-0.0.74.dev770.dist-info → dv_pipecat_ai-0.0.82.dev776.dist-info}/top_level.txt +0 -0
  244. /pipecat/{examples → extensions}/__init__.py +0 -0
pipecat/services/gemini_multimodal_live/file_api.py
@@ -0,0 +1,189 @@
+#
+# Copyright (c) 2024–2025, Daily
+#
+# SPDX-License-Identifier: BSD 2-Clause License
+#
+
+"""Gemini File API client for uploading and managing files.
+
+This module provides a client for Google's Gemini File API, enabling file
+uploads, metadata retrieval, listing, and deletion. Files uploaded through
+this API can be referenced in Gemini generative model calls.
+"""
+
+import mimetypes
+from typing import Any, Dict, Optional
+
+import aiohttp
+from loguru import logger
+
+
+class GeminiFileAPI:
+    """Client for the Gemini File API.
+
+    This class provides methods for uploading, fetching, listing, and deleting files
+    through Google's Gemini File API.
+
+    Files uploaded through this API remain available for 48 hours and can be referenced
+    in calls to the Gemini generative models. Maximum file size is 2GB, with total
+    project storage limited to 20GB.
+    """
+
+    def __init__(
+        self, api_key: str, base_url: str = "https://generativelanguage.googleapis.com/v1beta/files"
+    ):
+        """Initialize the Gemini File API client.
+
+        Args:
+            api_key: Google AI API key
+            base_url: Base URL for the Gemini File API (default is the v1beta endpoint)
+        """
+        self._api_key = api_key
+        self._base_url = base_url
+        # Upload URL uses the /upload/ path
+        self.upload_base_url = "https://generativelanguage.googleapis.com/upload/v1beta/files"
+
+    async def upload_file(
+        self, file_path: str, display_name: Optional[str] = None
+    ) -> Dict[str, Any]:
+        """Upload a file to the Gemini File API using the resumable upload protocol.
+
+        Args:
+            file_path: Path to the file to upload
+            display_name: Optional display name for the file
+
+        Returns:
+            File metadata including uri, name, and display_name
+        """
+        logger.info(f"Uploading file: {file_path}")
+
+        async with aiohttp.ClientSession() as session:
+            # Determine the file's MIME type
+            mime_type, _ = mimetypes.guess_type(file_path)
+            if not mime_type:
+                mime_type = "application/octet-stream"
+
+            # Read the file
+            with open(file_path, "rb") as f:
+                file_data = f.read()
+
+            # Create the metadata payload
+            metadata = {}
+            if display_name:
+                metadata = {"file": {"display_name": display_name}}
+
+            # Step 1: Initial resumable request to get the upload URL
+            headers = {
+                "X-Goog-Upload-Protocol": "resumable",
+                "X-Goog-Upload-Command": "start",
+                "X-Goog-Upload-Header-Content-Length": str(len(file_data)),
+                "X-Goog-Upload-Header-Content-Type": mime_type,
+                "Content-Type": "application/json",
+            }
+
+            logger.debug(f"Step 1: Getting upload URL from {self.upload_base_url}")
+            async with session.post(
+                f"{self.upload_base_url}?key={self._api_key}", headers=headers, json=metadata
+            ) as response:
+                if response.status != 200:
+                    error_text = await response.text()
+                    logger.error(f"Error initiating file upload: {error_text}")
+                    raise Exception(f"Failed to initiate upload: {response.status} - {error_text}")
+
+                # Get the upload URL from the response header
+                upload_url = response.headers.get("X-Goog-Upload-URL")
+                if not upload_url:
+                    logger.error(f"Response headers: {dict(response.headers)}")
+                    raise Exception("No upload URL in response headers")
+
+            logger.debug(f"Got upload URL: {upload_url}")
+
+            # Step 2: Upload the actual file data
+            upload_headers = {
+                "Content-Length": str(len(file_data)),
+                "X-Goog-Upload-Offset": "0",
+                "X-Goog-Upload-Command": "upload, finalize",
+            }
+
+            logger.debug(f"Step 2: Uploading file data to {upload_url}")
+            async with session.post(upload_url, headers=upload_headers, data=file_data) as response:
+                if response.status != 200:
+                    error_text = await response.text()
+                    logger.error(f"Error uploading file data: {error_text}")
+                    raise Exception(f"Failed to upload file: {response.status} - {error_text}")
+
+                file_info = await response.json()
+                logger.info(f"File uploaded successfully: {file_info.get('file', {}).get('name')}")
+                return file_info
+
+    async def get_file(self, name: str) -> Dict[str, Any]:
+        """Get metadata for a file.
+
+        Args:
+            name: File name (or full path)
+
+        Returns:
+            File metadata
+        """
+        # Extract just the name part if a full path is provided
+        if "/" in name:
+            name = name.split("/")[-1]
+
+        async with aiohttp.ClientSession() as session:
+            async with session.get(f"{self._base_url}/{name}?key={self._api_key}") as response:
+                if response.status != 200:
+                    error_text = await response.text()
+                    logger.error(f"Error getting file metadata: {error_text}")
+                    raise Exception(f"Failed to get file metadata: {response.status}")
+
+                file_info = await response.json()
+                return file_info
+
+    async def list_files(
+        self, page_size: int = 10, page_token: Optional[str] = None
+    ) -> Dict[str, Any]:
+        """List uploaded files.
+
+        Args:
+            page_size: Number of files to return per page
+            page_token: Token for pagination
+
+        Returns:
+            List of files and next page token if available
+        """
+        params = {"key": self._api_key, "pageSize": page_size}
+
+        if page_token:
+            params["pageToken"] = page_token
+
+        async with aiohttp.ClientSession() as session:
+            async with session.get(self._base_url, params=params) as response:
+                if response.status != 200:
+                    error_text = await response.text()
+                    logger.error(f"Error listing files: {error_text}")
+                    raise Exception(f"Failed to list files: {response.status}")
+
+                result = await response.json()
+                return result
+
+    async def delete_file(self, name: str) -> bool:
+        """Delete a file.
+
+        Args:
+            name: File name (or full path)
+
+        Returns:
+            True if deleted successfully
+        """
+        # Extract just the name part if a full path is provided
+        if "/" in name:
+            name = name.split("/")[-1]
+
+        async with aiohttp.ClientSession() as session:
+            async with session.delete(f"{self._base_url}/{name}?key={self._api_key}") as response:
+                if response.status != 200:
+                    error_text = await response.text()
+                    logger.error(f"Error deleting file: {error_text}")
+                    raise Exception(f"Failed to delete file: {response.status}")
+
+        return True
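Note: for orientation, a minimal usage sketch of the new GeminiFileAPI client (the file path, display name, and environment variable are illustrative, not part of this release):

    import asyncio
    import os

    from pipecat.services.gemini_multimodal_live.file_api import GeminiFileAPI

    async def main():
        # Upload a local file; the response nests metadata under "file".
        file_api = GeminiFileAPI(api_key=os.environ["GOOGLE_API_KEY"])
        file_info = await file_api.upload_file("meeting_notes.pdf", display_name="Meeting notes")
        print(file_info["file"]["uri"])  # URI to reference in model calls

        # Housekeeping: list and delete uploads (files expire after 48 hours anyway).
        print(await file_api.list_files(page_size=5))
        await file_api.delete_file(file_info["file"]["name"])

    asyncio.run(main())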
pipecat/services/gemini_multimodal_live/gemini.py
@@ -32,6 +32,7 @@ from pipecat.frames.frames import (
     Frame,
     InputAudioRawFrame,
     InputImageRawFrame,
+    InputTextRawFrame,
     LLMFullResponseEndFrame,
     LLMFullResponseStartFrame,
     LLMMessagesAppendFrame,
@@ -59,21 +60,22 @@ from pipecat.processors.aggregators.openai_llm_context import (
     OpenAILLMContextFrame,
 )
 from pipecat.processors.frame_processor import FrameDirection
+from pipecat.services.google.frames import LLMSearchOrigin, LLMSearchResponseFrame, LLMSearchResult
 from pipecat.services.llm_service import FunctionCallFromLLM, LLMService
 from pipecat.services.openai.llm import (
     OpenAIAssistantContextAggregator,
     OpenAIUserContextAggregator,
 )
 from pipecat.transcriptions.language import Language
-from pipecat.utils.asyncio.watchdog_async_iterator import WatchdogAsyncIterator
 from pipecat.utils.string import match_endofsentence
 from pipecat.utils.time import time_now_iso8601
 from pipecat.utils.tracing.service_decorators import traced_gemini_live, traced_stt

 from . import events
+from .file_api import GeminiFileAPI

 try:
-    import websockets
+    from websockets.asyncio.client import connect as websocket_connect
 except ModuleNotFoundError as e:
     logger.error(f"Exception: {e}")
     logger.error("In order to use Google AI, you need to `pip install pipecat-ai[google]`.")
@@ -218,6 +220,31 @@ class GeminiMultimodalLiveContext(OpenAILLMContext):
             system_instruction += str(content)
         return system_instruction

+    def add_file_reference(self, file_uri: str, mime_type: str, text: Optional[str] = None):
+        """Add a file reference to the context.
+
+        This adds a user message with a file reference that will be sent during
+        context initialization.
+
+        Args:
+            file_uri: URI of the uploaded file
+            mime_type: MIME type of the file
+            text: Optional text prompt to accompany the file
+        """
+        # Create parts list with file reference
+        parts = []
+        if text:
+            parts.append({"type": "text", "text": text})
+
+        # Add file reference part
+        parts.append(
+            {"type": "file_data", "file_data": {"mime_type": mime_type, "file_uri": file_uri}}
+        )
+
+        # Add to messages
+        message = {"role": "user", "content": parts}
+        self.messages.append(message)
+        logger.info(f"Added file reference to context: {file_uri}")
+
     def get_messages_for_initializing_history(self):
         """Get messages formatted for Gemini history initialization.

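A companion sketch showing how an uploaded file is wired into the Live context (the upload dict is assumed to come from a prior GeminiFileAPI.upload_file() call; the prompt text is illustrative):

    from pipecat.services.gemini_multimodal_live.gemini import GeminiMultimodalLiveContext

    def build_context(uploaded: dict) -> GeminiMultimodalLiveContext:
        context = GeminiMultimodalLiveContext(messages=[])
        context.add_file_reference(
            file_uri=uploaded["file"]["uri"],
            mime_type=uploaded["file"]["mimeType"],
            text="Summarize this document for the caller.",
        )
        # get_messages_for_initializing_history() later converts this part into
        # Gemini's {"fileData": {"mimeType": ..., "fileUri": ...}} shape.
        return context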
@@ -242,6 +269,17 @@ class GeminiMultimodalLiveContext(OpenAILLMContext):
                 for part in content:
                     if part.get("type") == "text":
                         parts.append({"text": part.get("text")})
+                    elif part.get("type") == "file_data":
+                        file_data = part.get("file_data", {})
+
+                        parts.append(
+                            {
+                                "fileData": {
+                                    "mimeType": file_data.get("mime_type"),
+                                    "fileUri": file_data.get("file_uri"),
+                                }
+                            }
+                        )
                     else:
                         logger.warning(f"Unsupported content type: {str(part)[:80]}")
             else:
@@ -333,14 +371,26 @@ class GeminiMultimodalLiveContextAggregatorPair:


 class GeminiMultimodalModalities(Enum):
-    """Supported modalities for Gemini Multimodal Live."""
+    """Supported modalities for Gemini Multimodal Live.
+
+    Parameters:
+        TEXT: Text responses.
+        AUDIO: Audio responses.
+    """

     TEXT = "TEXT"
     AUDIO = "AUDIO"


 class GeminiMediaResolution(str, Enum):
-    """Media resolution options for Gemini Multimodal Live."""
+    """Media resolution options for Gemini Multimodal Live.
+
+    Parameters:
+        UNSPECIFIED: Use default resolution setting.
+        LOW: Low resolution with 64 tokens.
+        MEDIUM: Medium resolution with 256 tokens.
+        HIGH: High resolution with zoomed reframing and 256 tokens.
+    """

     UNSPECIFIED = "MEDIA_RESOLUTION_UNSPECIFIED"  # Use default
     LOW = "MEDIA_RESOLUTION_LOW"  # 64 tokens
@@ -422,20 +472,6 @@ class GeminiMultimodalLiveLLMService(LLMService):
     This service enables real-time conversations with Gemini, supporting both
     text and audio modalities. It handles voice transcription, streaming audio
     responses, and tool usage.
-
-    Args:
-        api_key: Google AI API key for authentication.
-        base_url: API endpoint base URL. Defaults to the official Gemini Live endpoint.
-        model: Model identifier to use. Defaults to "models/gemini-2.0-flash-live-001".
-        voice_id: TTS voice identifier. Defaults to "Charon".
-        start_audio_paused: Whether to start with audio input paused. Defaults to False.
-        start_video_paused: Whether to start with video input paused. Defaults to False.
-        system_instruction: System prompt for the model. Defaults to None.
-        tools: Tools/functions available to the model. Defaults to None.
-        params: Configuration parameters for the model. Defaults to InputParams().
-        inference_on_context_initialization: Whether to generate a response when context
-            is first set. Defaults to True.
-        **kwargs: Additional arguments passed to parent LLMService.
     """

     # Overriding the default adapter to use the Gemini one.
@@ -454,8 +490,26 @@ class GeminiMultimodalLiveLLMService(LLMService):
         tools: Optional[Union[List[dict], ToolsSchema]] = None,
         params: Optional[InputParams] = None,
         inference_on_context_initialization: bool = True,
+        file_api_base_url: str = "https://generativelanguage.googleapis.com/v1beta/files",
         **kwargs,
     ):
+        """Initialize the Gemini Multimodal Live LLM service.
+
+        Args:
+            api_key: Google AI API key for authentication.
+            base_url: API endpoint base URL. Defaults to the official Gemini Live endpoint.
+            model: Model identifier to use. Defaults to "models/gemini-2.0-flash-live-001".
+            voice_id: TTS voice identifier. Defaults to "Charon".
+            start_audio_paused: Whether to start with audio input paused. Defaults to False.
+            start_video_paused: Whether to start with video input paused. Defaults to False.
+            system_instruction: System prompt for the model. Defaults to None.
+            tools: Tools/functions available to the model. Defaults to None.
+            params: Configuration parameters for the model. Defaults to InputParams().
+            inference_on_context_initialization: Whether to generate a response when context
+                is first set. Defaults to True.
+            file_api_base_url: Base URL for the Gemini File API. Defaults to the official endpoint.
+            **kwargs: Additional arguments passed to parent LLMService.
+        """
         super().__init__(base_url=base_url, **kwargs)

         params = params or InputParams()
@@ -516,6 +570,13 @@ class GeminiMultimodalLiveLLMService(LLMService):
             "extra": params.extra if isinstance(params.extra, dict) else {},
         }

+        # Initialize the File API client
+        self.file_api = GeminiFileAPI(api_key=api_key, base_url=file_api_base_url)
+
+        # Grounding metadata tracking
+        self._search_result_buffer = ""
+        self._accumulated_grounding_metadata = None
+
     def can_generate_metrics(self) -> bool:
         """Check if the service can generate usage metrics.

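For reference, a construction sketch using the new file_api_base_url parameter (the environment variable and file path are placeholders):

    import os

    from pipecat.services.gemini_multimodal_live.gemini import GeminiMultimodalLiveLLMService

    async def make_service() -> GeminiMultimodalLiveLLMService:
        llm = GeminiMultimodalLiveLLMService(
            api_key=os.environ["GOOGLE_API_KEY"],
            voice_id="Charon",
            # Default shown explicitly; override only if you proxy the File API.
            file_api_base_url="https://generativelanguage.googleapis.com/v1beta/files",
        )
        # The File API client is exposed on the service itself.
        await llm.file_api.upload_file("meeting_notes.pdf")
        return llm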
@@ -524,6 +585,17 @@ class GeminiMultimodalLiveLLMService(LLMService):
         """
         return True

+    def needs_mcp_alternate_schema(self) -> bool:
+        """Check if this LLM service requires an alternate MCP schema.
+
+        Google/Gemini has stricter JSON schema validation and requires
+        certain properties to be removed or modified for compatibility.
+
+        Returns:
+            True for Google/Gemini services.
+        """
+        return True
+
     def set_audio_input_paused(self, paused: bool):
         """Set the audio input pause state.

@@ -666,6 +738,9 @@ class GeminiMultimodalLiveLLMService(LLMService):
             # Support just one tool call per context frame for now
             tool_result_message = context.messages[-1]
             await self._tool_result(tool_result_message)
+        elif isinstance(frame, InputTextRawFrame):
+            await self._send_user_text(frame.text)
+            await self.push_frame(frame, direction)
         elif isinstance(frame, InputAudioRawFrame):
             await self._send_user_audio(frame)
             await self.push_frame(frame, direction)
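With the new InputTextRawFrame branch, typed input can be injected into a running Live session; a sketch assuming a pipecat PipelineTask named task whose pipeline contains this service:

    from pipecat.frames.frames import InputTextRawFrame

    async def send_typed_input(task):
        # Forwarded over the realtimeInput stream by _send_user_text(), then
        # pushed downstream unchanged.
        await task.queue_frames([InputTextRawFrame(text="What's on my calendar today?")])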
@@ -709,6 +784,7 @@ class GeminiMultimodalLiveLLMService(LLMService):
         await self._ws_send(event.model_dump(exclude_none=True))

     async def _connect(self):
+        """Establish WebSocket connection to Gemini Live API."""
         if self._websocket:
             # Here we assume that if we have a websocket, we are connected. We
             # handle disconnections in the send/recv code paths.
@@ -718,7 +794,7 @@ class GeminiMultimodalLiveLLMService(LLMService):
         try:
             logger.info(f"Connecting to wss://{self._base_url}")
             uri = f"wss://{self._base_url}?key={self._api_key}"
-            self._websocket = await websockets.connect(uri=uri)
+            self._websocket = await websocket_connect(uri=uri)
             self._receive_task = self.create_task(self._receive_task_handler())

             # Create the basic configuration
@@ -813,6 +889,7 @@ class GeminiMultimodalLiveLLMService(LLMService):
             self._websocket = None

     async def _disconnect(self):
+        """Disconnect from Gemini Live API and clean up resources."""
         logger.info("Disconnecting from Gemini service")
         try:
             self._disconnecting = True
@@ -829,6 +906,7 @@ class GeminiMultimodalLiveLLMService(LLMService):
             logger.error(f"{self} error disconnecting: {e}")

     async def _ws_send(self, message):
+        """Send a message to the WebSocket connection."""
         # logger.debug(f"Sending message to websocket: {message}")
         try:
             if self._websocket:
@@ -849,7 +927,8 @@ class GeminiMultimodalLiveLLMService(LLMService):
     #

     async def _receive_task_handler(self):
-        async for message in WatchdogAsyncIterator(self._websocket, manager=self.task_manager):
+        """Handle incoming messages from the WebSocket connection."""
+        async for message in self._websocket:
             evt = events.parse_server_event(message)
             # logger.debug(f"Received event: {message[:500]}")
             # logger.debug(f"Received event: {evt}")
@@ -865,6 +944,8 @@ class GeminiMultimodalLiveLLMService(LLMService):
                 await self._handle_evt_input_transcription(evt)
             elif evt.serverContent and evt.serverContent.outputTranscription:
                 await self._handle_evt_output_transcription(evt)
+            elif evt.serverContent and evt.serverContent.groundingMetadata:
+                await self._handle_evt_grounding_metadata(evt)
             elif evt.toolCall:
                 await self._handle_evt_tool_call(evt)
             elif False:  # !!! todo: error events?
@@ -877,6 +958,7 @@ class GeminiMultimodalLiveLLMService(LLMService):
     #

     async def _send_user_audio(self, frame):
+        """Send user audio frame to Gemini Live API."""
         if self._audio_input_paused:
             return
         # Send all audio to Gemini
@@ -892,7 +974,25 @@ class GeminiMultimodalLiveLLMService(LLMService):
             length = int((frame.sample_rate * frame.num_channels * 2) * 0.5)
             self._user_audio_buffer = self._user_audio_buffer[-length:]

+    async def _send_user_text(self, text: str):
+        """Send user text via Gemini Live API's realtime input stream.
+
+        This method sends text through the realtimeInput stream (via TextInputMessage)
+        rather than the clientContent stream. This ensures text input is synchronized
+        with audio and video inputs, preventing temporal misalignment that can occur
+        when different modalities are processed through separate API pathways.
+
+        For realtimeInput, turn completion is automatically inferred by the API based
+        on user activity, so no explicit turnComplete signal is needed.
+
+        Args:
+            text: The text to send as user input.
+        """
+        evt = events.TextInputMessage.from_text(text)
+        await self.send_client_event(evt)
+
     async def _send_user_video(self, frame):
+        """Send user video frame to Gemini Live API."""
         if self._video_input_paused:
             return

@@ -906,6 +1006,7 @@ class GeminiMultimodalLiveLLMService(LLMService):
         await self.send_client_event(evt)

     async def _create_initial_response(self):
+        """Create initial response based on context history."""
         if not self._api_session_ready:
             self._run_llm_when_api_session_ready = True
             return
@@ -931,7 +1032,8 @@ class GeminiMultimodalLiveLLMService(LLMService):
             self._needs_turn_complete_message = True

     async def _create_single_response(self, messages_list):
-        # refactor to combine this logic with same logic in GeminiMultimodalLiveContext
+        """Create a single response from a list of messages."""
+        # Refactor to combine this logic with same logic in GeminiMultimodalLiveContext
         messages = []
         for item in messages_list:
             role = item.get("role")
@@ -950,6 +1052,17 @@ class GeminiMultimodalLiveLLMService(LLMService):
                 for part in content:
                     if part.get("type") == "text":
                         parts.append({"text": part.get("text")})
+                    elif part.get("type") == "file_data":
+                        file_data = part.get("file_data", {})
+
+                        parts.append(
+                            {
+                                "fileData": {
+                                    "mimeType": file_data.get("mime_type"),
+                                    "fileUri": file_data.get("file_uri"),
+                                }
+                            }
+                        )
                     else:
                         logger.warning(f"Unsupported content type: {str(part)[:80]}")
             else:
@@ -973,6 +1086,7 @@ class GeminiMultimodalLiveLLMService(LLMService):

     @traced_gemini_live(operation="llm_tool_result")
     async def _tool_result(self, tool_result_message):
+        """Send tool result back to the API."""
         # For now we're shoving the name into the tool_call_id field, so this
         # will work until we revisit that.
         id = tool_result_message.get("tool_call_id")
@@ -998,6 +1112,7 @@ class GeminiMultimodalLiveLLMService(LLMService):

     @traced_gemini_live(operation="llm_setup")
     async def _handle_evt_setup_complete(self, evt):
+        """Handle the setup complete event."""
         # If this is our first context frame, run the LLM
         self._api_session_ready = True
         # Now that we've configured the session, we can run the LLM if we need to.
@@ -1006,6 +1121,7 @@ class GeminiMultimodalLiveLLMService(LLMService):
             await self._create_initial_response()

     async def _handle_evt_model_turn(self, evt):
+        """Handle the model turn event."""
         part = evt.serverContent.modelTurn.parts[0]
         if not part:
             return
@@ -1019,8 +1135,13 @@ class GeminiMultimodalLiveLLMService(LLMService):
             await self.push_frame(LLMFullResponseStartFrame())

         self._bot_text_buffer += text
+        self._search_result_buffer += text  # Also accumulate for grounding
         await self.push_frame(LLMTextFrame(text=text))

+        # Check for grounding metadata in server content
+        if evt.serverContent and evt.serverContent.groundingMetadata:
+            self._accumulated_grounding_metadata = evt.serverContent.groundingMetadata
+
         inline_data = part.inlineData
         if not inline_data:
             return
@@ -1047,6 +1168,7 @@ class GeminiMultimodalLiveLLMService(LLMService):

     @traced_gemini_live(operation="llm_tool_call")
     async def _handle_evt_tool_call(self, evt):
+        """Handle tool call events."""
         function_calls = evt.toolCall.functionCalls
         if not function_calls:
             return
@@ -1067,6 +1189,7 @@ class GeminiMultimodalLiveLLMService(LLMService):

     @traced_gemini_live(operation="llm_response")
     async def _handle_evt_turn_complete(self, evt):
+        """Handle the turn complete event."""
         self._bot_is_speaking = False
         text = self._bot_text_buffer

@@ -1086,6 +1209,16 @@ class GeminiMultimodalLiveLLMService(LLMService):
         self._bot_text_buffer = ""
         self._llm_output_buffer = ""

+        # Process grounding metadata if we have accumulated any
+        if self._accumulated_grounding_metadata:
+            await self._process_grounding_metadata(
+                self._accumulated_grounding_metadata, self._search_result_buffer
+            )
+
+        # Reset grounding tracking for next response
+        self._search_result_buffer = ""
+        self._accumulated_grounding_metadata = None
+
         # Only push the TTSStoppedFrame if the bot is outputting audio
         # when text is found, modalities is set to TEXT and no audio
         # is produced.
@@ -1150,6 +1283,7 @@ class GeminiMultimodalLiveLLMService(LLMService):
         )

     async def _handle_evt_output_transcription(self, evt):
+        """Handle the output transcription event."""
         if not evt.serverContent.outputTranscription:
             return

@@ -1161,13 +1295,76 @@ class GeminiMultimodalLiveLLMService(LLMService):
         if not text:
             return

+        # Accumulate text for grounding as well
+        self._search_result_buffer += text
+
+        # Check for grounding metadata in server content
+        if evt.serverContent and evt.serverContent.groundingMetadata:
+            self._accumulated_grounding_metadata = evt.serverContent.groundingMetadata
         # Collect text for tracing
         self._llm_output_buffer += text

         await self.push_frame(LLMTextFrame(text=text))
         await self.push_frame(TTSTextFrame(text=text))

+    async def _handle_evt_grounding_metadata(self, evt):
+        """Handle dedicated grounding metadata events."""
+        if evt.serverContent and evt.serverContent.groundingMetadata:
+            grounding_metadata = evt.serverContent.groundingMetadata
+            # Process the grounding metadata immediately
+            await self._process_grounding_metadata(grounding_metadata, self._search_result_buffer)
+
+    async def _process_grounding_metadata(
+        self, grounding_metadata: events.GroundingMetadata, search_result: str = ""
+    ):
+        """Process grounding metadata and emit LLMSearchResponseFrame."""
+        if not grounding_metadata:
+            return
+
+        # Extract rendered content for search suggestions
+        rendered_content = None
+        if (
+            grounding_metadata.searchEntryPoint
+            and grounding_metadata.searchEntryPoint.renderedContent
+        ):
+            rendered_content = grounding_metadata.searchEntryPoint.renderedContent
+
+        # Convert grounding chunks and supports to LLMSearchOrigin format
+        origins = []
+
+        if grounding_metadata.groundingChunks and grounding_metadata.groundingSupports:
+            # Create a mapping of chunk indices to origins
+            chunk_to_origin = {}
+
+            for index, chunk in enumerate(grounding_metadata.groundingChunks):
+                if chunk.web:
+                    origin = LLMSearchOrigin(
+                        site_uri=chunk.web.uri, site_title=chunk.web.title, results=[]
+                    )
+                    chunk_to_origin[index] = origin
+                    origins.append(origin)
+
+            # Add grounding support results to the appropriate origins
+            for support in grounding_metadata.groundingSupports:
+                if support.segment and support.groundingChunkIndices:
+                    text = support.segment.text or ""
+                    confidence_scores = support.confidenceScores or []
+
+                    # Add this result to all origins referenced by this support
+                    for chunk_index in support.groundingChunkIndices:
+                        if chunk_index in chunk_to_origin:
+                            result = LLMSearchResult(text=text, confidence=confidence_scores)
+                            chunk_to_origin[chunk_index].results.append(result)
+
+        # Create and push the search response frame
+        search_frame = LLMSearchResponseFrame(
+            search_result=search_result, origins=origins, rendered_content=rendered_content
+        )
+
+        await self.push_frame(search_frame)
+
     async def _handle_evt_usage_metadata(self, evt):
+        """Handle the usage metadata event."""
         if not evt.usageMetadata:
             return
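Downstream processors can consume the new LLMSearchResponseFrame emitted above; a minimal sketch (the processor name and logging are hypothetical, not part of this release):

    from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
    from pipecat.services.google.frames import LLMSearchResponseFrame

    class SearchResultLogger(FrameProcessor):
        """Hypothetical processor that logs grounding origins as they arrive."""

        async def process_frame(self, frame, direction: FrameDirection):
            await super().process_frame(frame, direction)
            if isinstance(frame, LLMSearchResponseFrame):
                for origin in frame.origins:
                    print(f"Grounded by {origin.site_title}: {origin.site_uri}")
            await self.push_frame(frame, direction)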