dv-pipecat-ai 0.0.82.dev857__py3-none-any.whl → 0.0.85.dev837__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Note: this version of dv-pipecat-ai has been flagged as potentially problematic.

Files changed (195)
  1. {dv_pipecat_ai-0.0.82.dev857.dist-info → dv_pipecat_ai-0.0.85.dev837.dist-info}/METADATA +98 -130
  2. {dv_pipecat_ai-0.0.82.dev857.dist-info → dv_pipecat_ai-0.0.85.dev837.dist-info}/RECORD +192 -140
  3. pipecat/adapters/base_llm_adapter.py +38 -1
  4. pipecat/adapters/services/anthropic_adapter.py +9 -14
  5. pipecat/adapters/services/aws_nova_sonic_adapter.py +120 -5
  6. pipecat/adapters/services/bedrock_adapter.py +236 -13
  7. pipecat/adapters/services/gemini_adapter.py +12 -8
  8. pipecat/adapters/services/open_ai_adapter.py +19 -7
  9. pipecat/adapters/services/open_ai_realtime_adapter.py +5 -0
  10. pipecat/audio/dtmf/dtmf-0.wav +0 -0
  11. pipecat/audio/dtmf/dtmf-1.wav +0 -0
  12. pipecat/audio/dtmf/dtmf-2.wav +0 -0
  13. pipecat/audio/dtmf/dtmf-3.wav +0 -0
  14. pipecat/audio/dtmf/dtmf-4.wav +0 -0
  15. pipecat/audio/dtmf/dtmf-5.wav +0 -0
  16. pipecat/audio/dtmf/dtmf-6.wav +0 -0
  17. pipecat/audio/dtmf/dtmf-7.wav +0 -0
  18. pipecat/audio/dtmf/dtmf-8.wav +0 -0
  19. pipecat/audio/dtmf/dtmf-9.wav +0 -0
  20. pipecat/audio/dtmf/dtmf-pound.wav +0 -0
  21. pipecat/audio/dtmf/dtmf-star.wav +0 -0
  22. pipecat/audio/filters/krisp_viva_filter.py +193 -0
  23. pipecat/audio/filters/noisereduce_filter.py +15 -0
  24. pipecat/audio/turn/base_turn_analyzer.py +9 -1
  25. pipecat/audio/turn/smart_turn/base_smart_turn.py +14 -8
  26. pipecat/audio/turn/smart_turn/data/__init__.py +0 -0
  27. pipecat/audio/turn/smart_turn/data/smart-turn-v3.0.onnx +0 -0
  28. pipecat/audio/turn/smart_turn/http_smart_turn.py +6 -2
  29. pipecat/audio/turn/smart_turn/local_smart_turn.py +1 -1
  30. pipecat/audio/turn/smart_turn/local_smart_turn_v2.py +1 -1
  31. pipecat/audio/turn/smart_turn/local_smart_turn_v3.py +124 -0
  32. pipecat/audio/vad/data/README.md +10 -0
  33. pipecat/audio/vad/data/silero_vad_v2.onnx +0 -0
  34. pipecat/audio/vad/silero.py +9 -3
  35. pipecat/audio/vad/vad_analyzer.py +13 -1
  36. pipecat/extensions/voicemail/voicemail_detector.py +5 -5
  37. pipecat/frames/frames.py +277 -86
  38. pipecat/observers/loggers/debug_log_observer.py +3 -3
  39. pipecat/observers/loggers/llm_log_observer.py +7 -3
  40. pipecat/observers/loggers/user_bot_latency_log_observer.py +22 -10
  41. pipecat/pipeline/runner.py +18 -6
  42. pipecat/pipeline/service_switcher.py +64 -36
  43. pipecat/pipeline/task.py +125 -79
  44. pipecat/pipeline/tts_switcher.py +30 -0
  45. pipecat/processors/aggregators/dtmf_aggregator.py +2 -3
  46. pipecat/processors/aggregators/{gated_openai_llm_context.py → gated_llm_context.py} +9 -9
  47. pipecat/processors/aggregators/gated_open_ai_llm_context.py +12 -0
  48. pipecat/processors/aggregators/llm_context.py +40 -2
  49. pipecat/processors/aggregators/llm_response.py +32 -15
  50. pipecat/processors/aggregators/llm_response_universal.py +19 -15
  51. pipecat/processors/aggregators/user_response.py +6 -6
  52. pipecat/processors/aggregators/vision_image_frame.py +24 -2
  53. pipecat/processors/audio/audio_buffer_processor.py +43 -8
  54. pipecat/processors/dtmf_aggregator.py +174 -77
  55. pipecat/processors/filters/stt_mute_filter.py +17 -0
  56. pipecat/processors/frame_processor.py +110 -24
  57. pipecat/processors/frameworks/langchain.py +8 -2
  58. pipecat/processors/frameworks/rtvi.py +210 -68
  59. pipecat/processors/frameworks/strands_agents.py +170 -0
  60. pipecat/processors/logger.py +2 -2
  61. pipecat/processors/transcript_processor.py +26 -5
  62. pipecat/processors/user_idle_processor.py +35 -11
  63. pipecat/runner/daily.py +59 -20
  64. pipecat/runner/run.py +395 -93
  65. pipecat/runner/types.py +6 -4
  66. pipecat/runner/utils.py +51 -10
  67. pipecat/serializers/__init__.py +5 -1
  68. pipecat/serializers/asterisk.py +16 -2
  69. pipecat/serializers/convox.py +41 -4
  70. pipecat/serializers/custom.py +257 -0
  71. pipecat/serializers/exotel.py +5 -5
  72. pipecat/serializers/livekit.py +20 -0
  73. pipecat/serializers/plivo.py +5 -5
  74. pipecat/serializers/protobuf.py +6 -5
  75. pipecat/serializers/telnyx.py +2 -2
  76. pipecat/serializers/twilio.py +43 -23
  77. pipecat/serializers/vi.py +324 -0
  78. pipecat/services/ai_service.py +2 -6
  79. pipecat/services/anthropic/llm.py +2 -25
  80. pipecat/services/assemblyai/models.py +6 -0
  81. pipecat/services/assemblyai/stt.py +13 -5
  82. pipecat/services/asyncai/tts.py +5 -3
  83. pipecat/services/aws/__init__.py +1 -0
  84. pipecat/services/aws/llm.py +147 -105
  85. pipecat/services/aws/nova_sonic/__init__.py +0 -0
  86. pipecat/services/aws/nova_sonic/context.py +436 -0
  87. pipecat/services/aws/nova_sonic/frames.py +25 -0
  88. pipecat/services/aws/nova_sonic/llm.py +1265 -0
  89. pipecat/services/aws/stt.py +3 -3
  90. pipecat/services/aws_nova_sonic/__init__.py +19 -1
  91. pipecat/services/aws_nova_sonic/aws.py +11 -1151
  92. pipecat/services/aws_nova_sonic/context.py +8 -354
  93. pipecat/services/aws_nova_sonic/frames.py +13 -17
  94. pipecat/services/azure/llm.py +51 -1
  95. pipecat/services/azure/realtime/__init__.py +0 -0
  96. pipecat/services/azure/realtime/llm.py +65 -0
  97. pipecat/services/azure/stt.py +15 -0
  98. pipecat/services/cartesia/stt.py +77 -70
  99. pipecat/services/cartesia/tts.py +80 -13
  100. pipecat/services/deepgram/__init__.py +1 -0
  101. pipecat/services/deepgram/flux/__init__.py +0 -0
  102. pipecat/services/deepgram/flux/stt.py +640 -0
  103. pipecat/services/elevenlabs/__init__.py +4 -1
  104. pipecat/services/elevenlabs/stt.py +339 -0
  105. pipecat/services/elevenlabs/tts.py +87 -46
  106. pipecat/services/fish/tts.py +5 -2
  107. pipecat/services/gemini_multimodal_live/events.py +38 -524
  108. pipecat/services/gemini_multimodal_live/file_api.py +23 -173
  109. pipecat/services/gemini_multimodal_live/gemini.py +41 -1403
  110. pipecat/services/gladia/stt.py +56 -72
  111. pipecat/services/google/__init__.py +1 -0
  112. pipecat/services/google/gemini_live/__init__.py +3 -0
  113. pipecat/services/google/gemini_live/file_api.py +189 -0
  114. pipecat/services/google/gemini_live/llm.py +1582 -0
  115. pipecat/services/google/gemini_live/llm_vertex.py +184 -0
  116. pipecat/services/google/llm.py +15 -11
  117. pipecat/services/google/llm_openai.py +3 -3
  118. pipecat/services/google/llm_vertex.py +86 -16
  119. pipecat/services/google/stt.py +4 -0
  120. pipecat/services/google/tts.py +7 -3
  121. pipecat/services/heygen/api.py +2 -0
  122. pipecat/services/heygen/client.py +8 -4
  123. pipecat/services/heygen/video.py +2 -0
  124. pipecat/services/hume/__init__.py +5 -0
  125. pipecat/services/hume/tts.py +220 -0
  126. pipecat/services/inworld/tts.py +6 -6
  127. pipecat/services/llm_service.py +15 -5
  128. pipecat/services/lmnt/tts.py +4 -2
  129. pipecat/services/mcp_service.py +4 -2
  130. pipecat/services/mem0/memory.py +6 -5
  131. pipecat/services/mistral/llm.py +29 -8
  132. pipecat/services/moondream/vision.py +42 -16
  133. pipecat/services/neuphonic/tts.py +5 -2
  134. pipecat/services/openai/__init__.py +1 -0
  135. pipecat/services/openai/base_llm.py +27 -20
  136. pipecat/services/openai/realtime/__init__.py +0 -0
  137. pipecat/services/openai/realtime/context.py +272 -0
  138. pipecat/services/openai/realtime/events.py +1106 -0
  139. pipecat/services/openai/realtime/frames.py +37 -0
  140. pipecat/services/openai/realtime/llm.py +829 -0
  141. pipecat/services/openai/tts.py +49 -10
  142. pipecat/services/openai_realtime/__init__.py +27 -0
  143. pipecat/services/openai_realtime/azure.py +21 -0
  144. pipecat/services/openai_realtime/context.py +21 -0
  145. pipecat/services/openai_realtime/events.py +21 -0
  146. pipecat/services/openai_realtime/frames.py +21 -0
  147. pipecat/services/openai_realtime_beta/azure.py +16 -0
  148. pipecat/services/openai_realtime_beta/openai.py +17 -5
  149. pipecat/services/piper/tts.py +7 -9
  150. pipecat/services/playht/tts.py +34 -4
  151. pipecat/services/rime/tts.py +12 -12
  152. pipecat/services/riva/stt.py +3 -1
  153. pipecat/services/salesforce/__init__.py +9 -0
  154. pipecat/services/salesforce/llm.py +700 -0
  155. pipecat/services/sarvam/__init__.py +7 -0
  156. pipecat/services/sarvam/stt.py +540 -0
  157. pipecat/services/sarvam/tts.py +97 -13
  158. pipecat/services/simli/video.py +2 -2
  159. pipecat/services/speechmatics/stt.py +22 -10
  160. pipecat/services/stt_service.py +47 -0
  161. pipecat/services/tavus/video.py +2 -2
  162. pipecat/services/tts_service.py +75 -22
  163. pipecat/services/vision_service.py +7 -6
  164. pipecat/services/vistaar/llm.py +51 -9
  165. pipecat/tests/utils.py +4 -4
  166. pipecat/transcriptions/language.py +41 -1
  167. pipecat/transports/base_input.py +13 -34
  168. pipecat/transports/base_output.py +140 -104
  169. pipecat/transports/daily/transport.py +199 -26
  170. pipecat/transports/heygen/__init__.py +0 -0
  171. pipecat/transports/heygen/transport.py +381 -0
  172. pipecat/transports/livekit/transport.py +228 -63
  173. pipecat/transports/local/audio.py +6 -1
  174. pipecat/transports/local/tk.py +11 -2
  175. pipecat/transports/network/fastapi_websocket.py +1 -1
  176. pipecat/transports/smallwebrtc/connection.py +103 -19
  177. pipecat/transports/smallwebrtc/request_handler.py +246 -0
  178. pipecat/transports/smallwebrtc/transport.py +65 -23
  179. pipecat/transports/tavus/transport.py +23 -12
  180. pipecat/transports/websocket/client.py +41 -5
  181. pipecat/transports/websocket/fastapi.py +21 -11
  182. pipecat/transports/websocket/server.py +14 -7
  183. pipecat/transports/whatsapp/api.py +8 -0
  184. pipecat/transports/whatsapp/client.py +47 -0
  185. pipecat/utils/base_object.py +54 -22
  186. pipecat/utils/redis.py +58 -0
  187. pipecat/utils/string.py +13 -1
  188. pipecat/utils/tracing/service_decorators.py +21 -21
  189. pipecat/serializers/genesys.py +0 -95
  190. pipecat/services/google/test-google-chirp.py +0 -45
  191. pipecat/services/openai.py +0 -698
  192. {dv_pipecat_ai-0.0.82.dev857.dist-info → dv_pipecat_ai-0.0.85.dev837.dist-info}/WHEEL +0 -0
  193. {dv_pipecat_ai-0.0.82.dev857.dist-info → dv_pipecat_ai-0.0.85.dev837.dist-info}/licenses/LICENSE +0 -0
  194. {dv_pipecat_ai-0.0.82.dev857.dist-info → dv_pipecat_ai-0.0.85.dev837.dist-info}/top_level.txt +0 -0
  195. /pipecat/services/{aws_nova_sonic → aws/nova_sonic}/ready.wav +0 -0
@@ -0,0 +1,184 @@
+ #
+ # Copyright (c) 2024–2025, Daily
+ #
+ # SPDX-License-Identifier: BSD 2-Clause License
+ #
+
+ """Service for accessing Gemini Live via Google Vertex AI.
+
+ This module provides integration with Google's Gemini Live model via
+ Vertex AI, supporting both text and audio modalities with voice transcription,
+ streaming responses, and tool usage.
+ """
+
+ import json
+ from typing import List, Optional, Union
+
+ from loguru import logger
+
+ from pipecat.adapters.schemas.tools_schema import ToolsSchema
+ from pipecat.services.google.gemini_live.llm import (
+     GeminiLiveLLMService,
+     HttpOptions,
+     InputParams,
+ )
+
+ try:
+     from google.auth import default
+     from google.auth.exceptions import GoogleAuthError
+     from google.auth.transport.requests import Request
+     from google.genai import Client
+     from google.oauth2 import service_account
+
+ except ModuleNotFoundError as e:
+     logger.error(f"Exception: {e}")
+     logger.error("In order to use Google Vertex AI, you need to `pip install pipecat-ai[google]`.")
+     raise Exception(f"Missing module: {e}")
+
+
+ class GeminiLiveVertexLLMService(GeminiLiveLLMService):
+     """Provides access to Google's Gemini Live model via Vertex AI.
+
+     This service enables real-time conversations with Gemini, supporting both
+     text and audio modalities. It handles voice transcription, streaming audio
+     responses, and tool usage.
+     """
+
+     def __init__(
+         self,
+         *,
+         credentials: Optional[str] = None,
+         credentials_path: Optional[str] = None,
+         location: str,
+         project_id: str,
+         model="google/gemini-2.0-flash-live-preview-04-09",
+         voice_id: str = "Charon",
+         start_audio_paused: bool = False,
+         start_video_paused: bool = False,
+         system_instruction: Optional[str] = None,
+         tools: Optional[Union[List[dict], ToolsSchema]] = None,
+         params: Optional[InputParams] = None,
+         inference_on_context_initialization: bool = True,
+         file_api_base_url: str = "https://generativelanguage.googleapis.com/v1beta/files",
+         http_options: Optional[HttpOptions] = None,
+         **kwargs,
+     ):
+         """Initialize the service for accessing Gemini Live via Google Vertex AI.
+
+         Args:
+             credentials: JSON string of service account credentials.
+             credentials_path: Path to the service account JSON file.
+             location: GCP region for Vertex AI endpoint (e.g., "us-east4").
+             project_id: Google Cloud project ID.
+             model: Model identifier to use. Defaults to
+                 "google/gemini-2.0-flash-live-preview-04-09".
+             voice_id: TTS voice identifier. Defaults to "Charon".
+             start_audio_paused: Whether to start with audio input paused. Defaults to False.
+             start_video_paused: Whether to start with video input paused. Defaults to False.
+             system_instruction: System prompt for the model. Defaults to None.
+             tools: Tools/functions available to the model. Defaults to None.
+             params: Configuration parameters for the model along with Vertex AI
+                 location and project ID.
+             inference_on_context_initialization: Whether to generate a response when context
+                 is first set. Defaults to True.
+             file_api_base_url: Base URL for the Gemini File API. Defaults to the official endpoint.
+             http_options: HTTP options for the client.
+             **kwargs: Additional arguments passed to parent GeminiLiveLLMService.
+         """
+         # Check if the user incorrectly passed api_key, which is used by the
+         # parent class but not here.
+         if "api_key" in kwargs:
+             logger.error(
+                 "GeminiLiveVertexLLMService does not accept 'api_key' parameter. "
+                 "Use 'credentials' or 'credentials_path' instead for Vertex AI authentication."
+             )
+             raise ValueError(
+                 "Invalid parameter 'api_key'. Use 'credentials' or 'credentials_path' for Vertex AI authentication."
+             )
+
+         # These need to be set before calling super().__init__() because
+         # super().__init__() invokes create_client(), which needs them.
+         self._credentials = self._get_credentials(credentials, credentials_path)
+         self._project_id = project_id
+         self._location = location
+
+         super().__init__(
+             # api_key is required by the parent class, but is not actually
+             # used with Vertex.
+             api_key="dummy",
+             model=model,
+             voice_id=voice_id,
+             start_audio_paused=start_audio_paused,
+             start_video_paused=start_video_paused,
+             system_instruction=system_instruction,
+             tools=tools,
+             params=params,
+             inference_on_context_initialization=inference_on_context_initialization,
+             file_api_base_url=file_api_base_url,
+             http_options=http_options,
+             **kwargs,
+         )
+
+     def create_client(self):
+         """Create the Gemini client instance."""
+         self._client = Client(
+             vertexai=True,
+             credentials=self._credentials,
+             project=self._project_id,
+             location=self._location,
+         )
+
+     @property
+     def file_api(self):
+         """Gemini File API is not supported with Vertex AI."""
+         raise NotImplementedError(
+             "When using Vertex AI, the recommended approach is to use Google Cloud "
+             "Storage for file handling. The Gemini File API is not directly "
+             "supported in this context."
+         )
+
+     @staticmethod
+     def _get_credentials(
+         credentials: Optional[str], credentials_path: Optional[str]
+     ) -> service_account.Credentials:
+         """Build Google credentials from service account JSON or application defaults.
+
+         Supports multiple authentication methods:
+         1. Direct JSON credentials string
+         2. Path to service account JSON file
+         3. Application default credentials (ADC)
+
+         Args:
+             credentials: JSON string of service account credentials.
+             credentials_path: Path to the service account JSON file.
+
+         Returns:
+             Refreshed Google credentials for API authentication.
+
+         Raises:
+             ValueError: If no valid credentials are provided or found.
+         """
+         creds: Optional[service_account.Credentials] = None
+
+         if credentials:
+             # Parse and load credentials from JSON string
+             creds = service_account.Credentials.from_service_account_info(
+                 json.loads(credentials),
+                 scopes=["https://www.googleapis.com/auth/cloud-platform"],
+             )
+         elif credentials_path:
+             # Load credentials from JSON file
+             creds = service_account.Credentials.from_service_account_file(
+                 credentials_path,
+                 scopes=["https://www.googleapis.com/auth/cloud-platform"],
+             )
+         else:
+             try:
+                 creds, _ = default(scopes=["https://www.googleapis.com/auth/cloud-platform"])
+             except GoogleAuthError:
+                 pass
+
+         if not creds:
+             raise ValueError("No valid credentials provided.")
+
+         creds.refresh(Request())  # Ensure the token is up-to-date; token lifetime is 1 hour.
+
+         return creds
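
For orientation, a minimal usage sketch of the new service added above; the project ID, region, and key-file path are placeholders, and the surrounding pipeline wiring is omitted:

from pipecat.services.google.gemini_live.llm_vertex import GeminiLiveVertexLLMService

# Placeholders: substitute your own GCP project, region, and service account key.
# Omit both credentials arguments to fall back to application default credentials.
llm = GeminiLiveVertexLLMService(
    credentials_path="/path/to/service-account.json",
    project_id="my-gcp-project",
    location="us-east4",
    system_instruction="You are a helpful voice assistant.",
)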
@@ -35,8 +35,8 @@ from pipecat.frames.frames import (
      LLMMessagesFrame,
      LLMTextFrame,
      LLMUpdateSettingsFrame,
+     OutputImageRawFrame,
      UserImageRawFrame,
-     VisionImageRawFrame,
  )
  from pipecat.metrics.metrics import LLMTokenUsage
  from pipecat.processors.aggregators.llm_context import LLMContext
@@ -73,6 +73,9 @@ try:
          HttpOptions,
          Part,
      )
+
+     # Temporary hack to be able to process Nano Banana returned images.
+     genai._api_client.READ_BUFFER_SIZE = 5 * 1024 * 1024
  except ModuleNotFoundError as e:
      logger.error(f"Exception: {e}")
      logger.error("In order to use Google AI, you need to `pip install pipecat-ai[google]`.")
@@ -683,7 +686,7 @@ class GoogleLLMService(LLMService):
          self,
          *,
          api_key: str,
-         model: str = "gemini-2.0-flash",
+         model: str = "gemini-2.5-flash",
          params: Optional[InputParams] = None,
          system_instruction: Optional[str] = None,
          tools: Optional[List[Dict[str, Any]]] = None,
@@ -711,6 +714,7 @@ class GoogleLLMService(LLMService):
          self._api_key = api_key
          self._system_instruction = system_instruction
          self._http_options = http_options
+
          self._create_client(api_key, http_options)
          self._settings = {
              "max_tokens": params.max_tokens,
@@ -789,6 +793,9 @@ class GoogleLLMService(LLMService):
          # and can be configured to turn it off.
          if not self._model_name.startswith("gemini-2.5-flash"):
              return
+         # If we have an image model, we don't use a budget either.
+         if "image" in self._model_name:
+             return
          # If thinking_config is already set, don't override it.
          if "thinking_config" in generation_params:
              return
@@ -928,6 +935,12 @@ class GoogleLLMService(LLMService):
                          arguments=function_call.args or {},
                      )
                  )
+             elif part.inline_data and part.inline_data.data:
+                 image = Image.open(io.BytesIO(part.inline_data.data))
+                 frame = OutputImageRawFrame(
+                     image=image.tobytes(), size=image.size, format="RGB"
+                 )
+                 await self.push_frame(frame)

              if (
                  candidate.grounding_metadata
@@ -1013,15 +1026,6 @@ class GoogleLLMService(LLMService):
              # NOTE: LLMMessagesFrame is deprecated, so we don't support the newer universal
              # LLMContext with it
              context = GoogleLLMContext(frame.messages)
-         elif isinstance(frame, VisionImageRawFrame):
-             # This is only useful in very simple pipelines because it creates
-             # a new context. Generally we want a context manager to catch
-             # UserImageRawFrames coming through the pipeline and add them
-             # to the context.
-             context = GoogleLLMContext()
-             context.add_image_frame_message(
-                 format=frame.format, size=frame.size, image=frame.image, text=frame.text
-             )
          elif isinstance(frame, LLMUpdateSettingsFrame):
              await self._update_settings(frame.settings)
          else:
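
The new inline_data branch above pushes model-generated images downstream as OutputImageRawFrame. As an illustrative sketch (this processor is not part of the package), a downstream stage can intercept those frames like any other frame:

from pipecat.frames.frames import Frame, OutputImageRawFrame
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor


class ImageLogger(FrameProcessor):
    """Hypothetical processor that logs generated images and passes all frames on."""

    async def process_frame(self, frame: Frame, direction: FrameDirection):
        await super().process_frame(frame, direction)
        if isinstance(frame, OutputImageRawFrame):
            width, height = frame.size
            print(f"Generated image: {width}x{height}, format={frame.format}")
        await self.push_frame(frame, direction)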
@@ -96,9 +96,9 @@ class GoogleLLMOpenAIBetaService(OpenAILLMService):
          async for chunk in chunk_stream:
              if chunk.usage:
                  tokens = LLMTokenUsage(
-                     prompt_tokens=chunk.usage.prompt_tokens,
-                     completion_tokens=chunk.usage.completion_tokens,
-                     total_tokens=chunk.usage.total_tokens,
+                     prompt_tokens=chunk.usage.prompt_tokens or 0,
+                     completion_tokens=chunk.usage.completion_tokens or 0,
+                     total_tokens=chunk.usage.total_tokens or 0,
                  )
                  await self.start_llm_usage_metrics(tokens)

@@ -53,12 +53,44 @@ class GoogleVertexLLMService(OpenAILLMService):

          Parameters:
              location: GCP region for Vertex AI endpoint (e.g., "us-east4").
+
+                 .. deprecated:: 0.0.90
+                     Use `location` as a direct argument to
+                     `GoogleVertexLLMService.__init__()` instead.
+
              project_id: Google Cloud project ID.
+
+                 .. deprecated:: 0.0.90
+                     Use `project_id` as a direct argument to
+                     `GoogleVertexLLMService.__init__()` instead.
          """

          # https://cloud.google.com/vertex-ai/generative-ai/docs/learn/locations
-         location: str = "us-east4"
-         project_id: str
+         location: Optional[str] = None
+         project_id: Optional[str] = None
+
+         def __init__(self, **kwargs):
+             """Initializes the InputParams."""
+             import warnings
+
+             with warnings.catch_warnings():
+                 warnings.simplefilter("always")
+                 if "location" in kwargs and kwargs["location"] is not None:
+                     warnings.warn(
+                         "GoogleVertexLLMService.InputParams.location is deprecated. "
+                         "Please provide 'location' as a direct argument to GoogleVertexLLMService.__init__() instead.",
+                         DeprecationWarning,
+                         stacklevel=2,
+                     )
+
+                 if "project_id" in kwargs and kwargs["project_id"] is not None:
+                     warnings.warn(
+                         "GoogleVertexLLMService.InputParams.project_id is deprecated. "
+                         "Please provide 'project_id' as a direct argument to GoogleVertexLLMService.__init__() instead.",
+                         DeprecationWarning,
+                         stacklevel=2,
+                     )
+             super().__init__(**kwargs)

      def __init__(
          self,
@@ -66,7 +98,8 @@ class GoogleVertexLLMService(OpenAILLMService):
          credentials: Optional[str] = None,
          credentials_path: Optional[str] = None,
          model: str = "google/gemini-2.0-flash-001",
-         params: Optional[InputParams] = None,
+         location: Optional[str] = None,
+         project_id: Optional[str] = None,
          **kwargs,
      ):
          """Initializes the VertexLLMService.
@@ -75,25 +108,60 @@ class GoogleVertexLLMService(OpenAILLMService):
              credentials: JSON string of service account credentials.
              credentials_path: Path to the service account JSON file.
              model: Model identifier (e.g., "google/gemini-2.0-flash-001").
-             params: Vertex AI input parameters including location and project.
+             location: GCP region for Vertex AI endpoint (e.g., "us-east4").
+             project_id: Google Cloud project ID.
              **kwargs: Additional arguments passed to OpenAILLMService.
          """
-         params = params or OpenAILLMService.InputParams()
-         base_url = self._get_base_url(params)
+         # Handle deprecated InputParams fields
+         if "params" in kwargs and isinstance(kwargs["params"], GoogleVertexLLMService.InputParams):
+             params = kwargs["params"]
+             # Extract location and project_id from params if not provided
+             # directly, for backward compatibility
+             if project_id is None:
+                 project_id = params.project_id
+             if location is None:
+                 location = params.location
+             # Convert to base InputParams
+             params = OpenAILLMService.InputParams(
+                 **params.model_dump(exclude={"location", "project_id"}, exclude_unset=True)
+             )
+             kwargs["params"] = params
+
+         # Validate project_id and location parameters
+         # NOTE: once we remove the Vertex-specific InputParams class, we can
+         # update the __init__() signature as follows:
+         #   - location: str = "us-east4",
+         #   - project_id: str,
+         # But for now, we need them as-is to maintain proper backward
+         # compatibility.
+         if project_id is None:
+             raise ValueError("project_id is required")
+         if location is None:
+             # If location is not provided, default to "us-east4".
+             # Note: this is legacy behavior; ideally location would be
+             # required.
+             logger.warning("location is not provided. Defaulting to 'us-east4'.")
+             location = "us-east4"  # Default location if not provided
+
+         base_url = self._get_base_url(location, project_id)
          self._api_key = self._get_api_token(credentials, credentials_path)

          super().__init__(
-             api_key=self._api_key, base_url=base_url, model=model, params=params, **kwargs
+             api_key=self._api_key,
+             base_url=base_url,
+             model=model,
+             **kwargs,
          )

      @staticmethod
-     def _get_base_url(params: InputParams) -> str:
-         """Constructs the base URL for Vertex AI API."""
-         hostname_prefix = "" if params.location == "global" else f"{params.location}-"
-         return (
-             f"https://{hostname_prefix}aiplatform.googleapis.com/v1/"
-             f"projects/{params.project_id}/locations/{params.location}/endpoints/openapi"
-         )
+     def _get_base_url(location: str, project_id: str) -> str:
+         """Construct the base URL for Vertex AI API."""
+         # Determine the correct API host based on location
+         if location == "global":
+             api_host = "aiplatform.googleapis.com"
+         else:
+             api_host = f"{location}-aiplatform.googleapis.com"
+         return f"https://{api_host}/v1/projects/{project_id}/locations/{location}/endpoints/openapi"

      @staticmethod
      def _get_api_token(credentials: Optional[str], credentials_path: Optional[str]) -> str:
@@ -119,12 +187,14 @@ class GoogleVertexLLMService(OpenAILLMService):
          if credentials:
              # Parse and load credentials from JSON string
              creds = service_account.Credentials.from_service_account_info(
-                 json.loads(credentials), scopes=["https://www.googleapis.com/auth/cloud-platform"]
+                 json.loads(credentials),
+                 scopes=["https://www.googleapis.com/auth/cloud-platform"],
              )
          elif credentials_path:
              # Load credentials from JSON file
              creds = service_account.Credentials.from_service_account_file(
-                 credentials_path, scopes=["https://www.googleapis.com/auth/cloud-platform"]
+                 credentials_path,
+                 scopes=["https://www.googleapis.com/auth/cloud-platform"],
              )
          else:
              try:
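
Taken together, these hunks deprecate the Vertex-specific InputParams fields in favor of direct constructor arguments. A before/after sketch with placeholder values:

from pipecat.services.google.llm_vertex import GoogleVertexLLMService

# Before: still works, but now emits a DeprecationWarning.
llm = GoogleVertexLLMService(
    credentials_path="/path/to/service-account.json",
    params=GoogleVertexLLMService.InputParams(
        location="us-east4", project_id="my-gcp-project"
    ),
)

# After: pass location and project_id directly.
llm = GoogleVertexLLMService(
    credentials_path="/path/to/service-account.json",
    location="us-east4",
    project_id="my-gcp-project",
)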
@@ -730,6 +730,8 @@ class GoogleSTTService(STTService):
          self._request_queue = asyncio.Queue()
          self._streaming_task = self.create_task(self._stream_audio())

+         await self._call_event_handler("on_connected")
+
      async def _disconnect(self):
          """Clean up streaming recognition resources."""
          if self._streaming_task:
@@ -737,6 +739,8 @@ class GoogleSTTService(STTService):
              await self.cancel_task(self._streaming_task)
              self._streaming_task = None

+         await self._call_event_handler("on_disconnected")
+
      async def _request_generator(self):
          """Generates requests for the streaming recognize method."""
          recognizer_path = f"projects/{self._project_id}/locations/{self._location}/recognizers/_"
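
These hooks fire new on_connected and on_disconnected events around the streaming connection. Assuming the events are registered on the service and using pipecat's usual event-handler decorator, a caller could subscribe like this (a sketch; the constructor arguments are placeholders):

from pipecat.services.google.stt import GoogleSTTService

stt = GoogleSTTService(credentials_path="/path/to/service-account.json")


@stt.event_handler("on_connected")
async def on_connected(service):
    print("Google STT streaming connection established")


@stt.event_handler("on_disconnected")
async def on_disconnected(service):
    print("Google STT streaming connection closed")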
@@ -500,10 +500,11 @@ class GoogleTTSService(TTSService):

          Parameters:
              language: Language for synthesis. Defaults to English.
+             speaking_rate: The speaking rate, in the range [0.25, 4.0].
          """

          language: Optional[Language] = Language.EN
-         rate: Optional[float] = 1.0
+         speaking_rate: Optional[float] = None

      def __init__(
          self,
@@ -511,6 +512,7 @@ class GoogleTTSService(TTSService):
          credentials: Optional[str] = None,
          credentials_path: Optional[str] = None,
          voice_id: str = "en-US-Chirp3-HD-Charon",
+         voice_cloning_key: Optional[str] = None,
          sample_rate: Optional[int] = None,
          params: InputParams = InputParams(),
          **kwargs,
@@ -521,6 +523,7 @@ class GoogleTTSService(TTSService):
              credentials: JSON string containing Google Cloud service account credentials.
              credentials_path: Path to Google Cloud service account JSON file.
              voice_id: Google TTS voice identifier (e.g., "en-US-Chirp3-HD-Charon").
+             voice_cloning_key: The voice cloning key for Chirp 3 custom voices.
              sample_rate: Audio sample rate in Hz. If None, uses default.
              params: Language configuration parameters.
              **kwargs: Additional arguments passed to parent TTSService.
@@ -536,7 +539,7 @@ class GoogleTTSService(TTSService):
              "language": self.language_to_service_language(params.language)
              if params.language
              else "en-US",
-             "rate": params.rate,
+             "speaking_rate": params.speaking_rate,
          }
          self._voice_clone_params = None
          if self._voice_config.get("is_clone", False):
@@ -550,6 +553,7 @@ class GoogleTTSService(TTSService):
              language_code=self._settings["language"], voice_clone=self._voice_clone_params
          )
          self.set_voice(voice_id)
+         self._voice_cloning_key = voice_cloning_key
          self._client: texttospeech_v1.TextToSpeechAsyncClient = self._create_client(
              credentials, credentials_path
          )
@@ -628,7 +632,7 @@ class GoogleTTSService(TTSService):
          streaming_audio_config=texttospeech_v1.StreamingAudioConfig(
              audio_encoding=texttospeech_v1.AudioEncoding.PCM,
              sample_rate_hertz=self.sample_rate,
-             speaking_rate=self._settings["rate"],
+             speaking_rate=self._settings["speaking_rate"],
          ),
      )
      config_request = texttospeech_v1.StreamingSynthesizeRequest(
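
The rename from rate to speaking_rate flows from InputParams through the settings dict into StreamingAudioConfig. A usage sketch, assuming InputParams stays nested on the service class as shown above (the voice and rate values are illustrative):

from pipecat.services.google.tts import GoogleTTSService

tts = GoogleTTSService(
    credentials_path="/path/to/service-account.json",
    voice_id="en-US-Chirp3-HD-Charon",
    params=GoogleTTSService.InputParams(speaking_rate=1.25),  # allowed range [0.25, 4.0]
)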
@@ -108,12 +108,14 @@ class HeyGenSession(BaseModel):
      Parameters:
          session_id (str): Unique identifier for the streaming session.
          access_token (str): Token for accessing the session securely.
+         livekit_agent_token (str): Token for HeyGen's audio agents (Pipecat).
          realtime_endpoint (str): Real-time communication endpoint URL.
          url (str): Direct URL for the session.
      """

      session_id: str
      access_token: str
+     livekit_agent_token: str
      realtime_endpoint: str
      url: str

@@ -393,7 +393,9 @@ class HeyGenClient:
              participant_id: Identifier of the participant to capture audio from
              callback: Async function to handle received audio frames
          """
-         logger.debug(f"capture_participant_audio: {participant_id}")
+         logger.debug(
+             f"capture_participant_audio: {participant_id}, sample_rate: {self._in_sample_rate}"
+         )
          self._audio_frame_callback = callback
          if self._audio_task is not None:
              logger.warning(
@@ -407,7 +409,9 @@ class HeyGenClient:
          for track_pub in participant.track_publications.values():
              if track_pub.kind == rtc.TrackKind.KIND_AUDIO and track_pub.track is not None:
                  logger.debug(f"Starting audio capture for existing track: {track_pub.sid}")
-                 audio_stream = rtc.AudioStream(track_pub.track)
+                 audio_stream = rtc.AudioStream(
+                     track=track_pub.track, sample_rate=self._in_sample_rate
+                 )
                  self._audio_task = self._task_manager.create_task(
                      self._process_audio_frames(audio_stream), name="HeyGenClient_Receive_Audio"
                  )
@@ -536,7 +540,7 @@ class HeyGenClient:
              and self._audio_task is None
          ):
              logger.debug(f"Creating audio stream processor for track: {publication.sid}")
-             audio_stream = rtc.AudioStream(track)
+             audio_stream = rtc.AudioStream(track=track, sample_rate=self._in_sample_rate)
              self._audio_task = self._task_manager.create_task(
                  self._process_audio_frames(audio_stream), name="HeyGenClient_Receive_Audio"
              )
@@ -559,7 +563,7 @@ class HeyGenClient:
          )

          await self._livekit_room.connect(
-             self._heyGen_session.url, self._heyGen_session.access_token
+             self._heyGen_session.url, self._heyGen_session.livekit_agent_token
          )
          logger.debug(f"Successfully connected to LiveKit room: {self._livekit_room.name}")
          logger.debug(f"Local participant SID: {self._livekit_room.local_participant.sid}")
@@ -110,6 +110,7 @@ class HeyGenVideoService(AIService):
              api_key=self._api_key,
              session=self._session,
              params=TransportParams(
+                 audio_in_sample_rate=48000,
                  audio_in_enabled=True,
                  video_in_enabled=True,
                  audio_out_enabled=True,
@@ -240,6 +241,7 @@ class HeyGenVideoService(AIService):
              # As soon as we receive actual audio, the base output transport will create a
              # BotStartedSpeakingFrame, which we can use as a signal for the TTFB metrics.
              await self.stop_ttfb_metrics()
+             await self.push_frame(frame, direction)
          else:
              await self.push_frame(frame, direction)

@@ -0,0 +1,5 @@
+ #
+ # Copyright (c) 2024–2025, Daily
+ #
+ # SPDX-License-Identifier: BSD 2-Clause License
+ #