dv-pipecat-ai 0.0.82.dev857__py3-none-any.whl → 0.0.85.dev837__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
- {dv_pipecat_ai-0.0.82.dev857.dist-info → dv_pipecat_ai-0.0.85.dev837.dist-info}/METADATA +98 -130
- {dv_pipecat_ai-0.0.82.dev857.dist-info → dv_pipecat_ai-0.0.85.dev837.dist-info}/RECORD +192 -140
- pipecat/adapters/base_llm_adapter.py +38 -1
- pipecat/adapters/services/anthropic_adapter.py +9 -14
- pipecat/adapters/services/aws_nova_sonic_adapter.py +120 -5
- pipecat/adapters/services/bedrock_adapter.py +236 -13
- pipecat/adapters/services/gemini_adapter.py +12 -8
- pipecat/adapters/services/open_ai_adapter.py +19 -7
- pipecat/adapters/services/open_ai_realtime_adapter.py +5 -0
- pipecat/audio/dtmf/dtmf-0.wav +0 -0
- pipecat/audio/dtmf/dtmf-1.wav +0 -0
- pipecat/audio/dtmf/dtmf-2.wav +0 -0
- pipecat/audio/dtmf/dtmf-3.wav +0 -0
- pipecat/audio/dtmf/dtmf-4.wav +0 -0
- pipecat/audio/dtmf/dtmf-5.wav +0 -0
- pipecat/audio/dtmf/dtmf-6.wav +0 -0
- pipecat/audio/dtmf/dtmf-7.wav +0 -0
- pipecat/audio/dtmf/dtmf-8.wav +0 -0
- pipecat/audio/dtmf/dtmf-9.wav +0 -0
- pipecat/audio/dtmf/dtmf-pound.wav +0 -0
- pipecat/audio/dtmf/dtmf-star.wav +0 -0
- pipecat/audio/filters/krisp_viva_filter.py +193 -0
- pipecat/audio/filters/noisereduce_filter.py +15 -0
- pipecat/audio/turn/base_turn_analyzer.py +9 -1
- pipecat/audio/turn/smart_turn/base_smart_turn.py +14 -8
- pipecat/audio/turn/smart_turn/data/__init__.py +0 -0
- pipecat/audio/turn/smart_turn/data/smart-turn-v3.0.onnx +0 -0
- pipecat/audio/turn/smart_turn/http_smart_turn.py +6 -2
- pipecat/audio/turn/smart_turn/local_smart_turn.py +1 -1
- pipecat/audio/turn/smart_turn/local_smart_turn_v2.py +1 -1
- pipecat/audio/turn/smart_turn/local_smart_turn_v3.py +124 -0
- pipecat/audio/vad/data/README.md +10 -0
- pipecat/audio/vad/data/silero_vad_v2.onnx +0 -0
- pipecat/audio/vad/silero.py +9 -3
- pipecat/audio/vad/vad_analyzer.py +13 -1
- pipecat/extensions/voicemail/voicemail_detector.py +5 -5
- pipecat/frames/frames.py +277 -86
- pipecat/observers/loggers/debug_log_observer.py +3 -3
- pipecat/observers/loggers/llm_log_observer.py +7 -3
- pipecat/observers/loggers/user_bot_latency_log_observer.py +22 -10
- pipecat/pipeline/runner.py +18 -6
- pipecat/pipeline/service_switcher.py +64 -36
- pipecat/pipeline/task.py +125 -79
- pipecat/pipeline/tts_switcher.py +30 -0
- pipecat/processors/aggregators/dtmf_aggregator.py +2 -3
- pipecat/processors/aggregators/{gated_openai_llm_context.py → gated_llm_context.py} +9 -9
- pipecat/processors/aggregators/gated_open_ai_llm_context.py +12 -0
- pipecat/processors/aggregators/llm_context.py +40 -2
- pipecat/processors/aggregators/llm_response.py +32 -15
- pipecat/processors/aggregators/llm_response_universal.py +19 -15
- pipecat/processors/aggregators/user_response.py +6 -6
- pipecat/processors/aggregators/vision_image_frame.py +24 -2
- pipecat/processors/audio/audio_buffer_processor.py +43 -8
- pipecat/processors/dtmf_aggregator.py +174 -77
- pipecat/processors/filters/stt_mute_filter.py +17 -0
- pipecat/processors/frame_processor.py +110 -24
- pipecat/processors/frameworks/langchain.py +8 -2
- pipecat/processors/frameworks/rtvi.py +210 -68
- pipecat/processors/frameworks/strands_agents.py +170 -0
- pipecat/processors/logger.py +2 -2
- pipecat/processors/transcript_processor.py +26 -5
- pipecat/processors/user_idle_processor.py +35 -11
- pipecat/runner/daily.py +59 -20
- pipecat/runner/run.py +395 -93
- pipecat/runner/types.py +6 -4
- pipecat/runner/utils.py +51 -10
- pipecat/serializers/__init__.py +5 -1
- pipecat/serializers/asterisk.py +16 -2
- pipecat/serializers/convox.py +41 -4
- pipecat/serializers/custom.py +257 -0
- pipecat/serializers/exotel.py +5 -5
- pipecat/serializers/livekit.py +20 -0
- pipecat/serializers/plivo.py +5 -5
- pipecat/serializers/protobuf.py +6 -5
- pipecat/serializers/telnyx.py +2 -2
- pipecat/serializers/twilio.py +43 -23
- pipecat/serializers/vi.py +324 -0
- pipecat/services/ai_service.py +2 -6
- pipecat/services/anthropic/llm.py +2 -25
- pipecat/services/assemblyai/models.py +6 -0
- pipecat/services/assemblyai/stt.py +13 -5
- pipecat/services/asyncai/tts.py +5 -3
- pipecat/services/aws/__init__.py +1 -0
- pipecat/services/aws/llm.py +147 -105
- pipecat/services/aws/nova_sonic/__init__.py +0 -0
- pipecat/services/aws/nova_sonic/context.py +436 -0
- pipecat/services/aws/nova_sonic/frames.py +25 -0
- pipecat/services/aws/nova_sonic/llm.py +1265 -0
- pipecat/services/aws/stt.py +3 -3
- pipecat/services/aws_nova_sonic/__init__.py +19 -1
- pipecat/services/aws_nova_sonic/aws.py +11 -1151
- pipecat/services/aws_nova_sonic/context.py +8 -354
- pipecat/services/aws_nova_sonic/frames.py +13 -17
- pipecat/services/azure/llm.py +51 -1
- pipecat/services/azure/realtime/__init__.py +0 -0
- pipecat/services/azure/realtime/llm.py +65 -0
- pipecat/services/azure/stt.py +15 -0
- pipecat/services/cartesia/stt.py +77 -70
- pipecat/services/cartesia/tts.py +80 -13
- pipecat/services/deepgram/__init__.py +1 -0
- pipecat/services/deepgram/flux/__init__.py +0 -0
- pipecat/services/deepgram/flux/stt.py +640 -0
- pipecat/services/elevenlabs/__init__.py +4 -1
- pipecat/services/elevenlabs/stt.py +339 -0
- pipecat/services/elevenlabs/tts.py +87 -46
- pipecat/services/fish/tts.py +5 -2
- pipecat/services/gemini_multimodal_live/events.py +38 -524
- pipecat/services/gemini_multimodal_live/file_api.py +23 -173
- pipecat/services/gemini_multimodal_live/gemini.py +41 -1403
- pipecat/services/gladia/stt.py +56 -72
- pipecat/services/google/__init__.py +1 -0
- pipecat/services/google/gemini_live/__init__.py +3 -0
- pipecat/services/google/gemini_live/file_api.py +189 -0
- pipecat/services/google/gemini_live/llm.py +1582 -0
- pipecat/services/google/gemini_live/llm_vertex.py +184 -0
- pipecat/services/google/llm.py +15 -11
- pipecat/services/google/llm_openai.py +3 -3
- pipecat/services/google/llm_vertex.py +86 -16
- pipecat/services/google/stt.py +4 -0
- pipecat/services/google/tts.py +7 -3
- pipecat/services/heygen/api.py +2 -0
- pipecat/services/heygen/client.py +8 -4
- pipecat/services/heygen/video.py +2 -0
- pipecat/services/hume/__init__.py +5 -0
- pipecat/services/hume/tts.py +220 -0
- pipecat/services/inworld/tts.py +6 -6
- pipecat/services/llm_service.py +15 -5
- pipecat/services/lmnt/tts.py +4 -2
- pipecat/services/mcp_service.py +4 -2
- pipecat/services/mem0/memory.py +6 -5
- pipecat/services/mistral/llm.py +29 -8
- pipecat/services/moondream/vision.py +42 -16
- pipecat/services/neuphonic/tts.py +5 -2
- pipecat/services/openai/__init__.py +1 -0
- pipecat/services/openai/base_llm.py +27 -20
- pipecat/services/openai/realtime/__init__.py +0 -0
- pipecat/services/openai/realtime/context.py +272 -0
- pipecat/services/openai/realtime/events.py +1106 -0
- pipecat/services/openai/realtime/frames.py +37 -0
- pipecat/services/openai/realtime/llm.py +829 -0
- pipecat/services/openai/tts.py +49 -10
- pipecat/services/openai_realtime/__init__.py +27 -0
- pipecat/services/openai_realtime/azure.py +21 -0
- pipecat/services/openai_realtime/context.py +21 -0
- pipecat/services/openai_realtime/events.py +21 -0
- pipecat/services/openai_realtime/frames.py +21 -0
- pipecat/services/openai_realtime_beta/azure.py +16 -0
- pipecat/services/openai_realtime_beta/openai.py +17 -5
- pipecat/services/piper/tts.py +7 -9
- pipecat/services/playht/tts.py +34 -4
- pipecat/services/rime/tts.py +12 -12
- pipecat/services/riva/stt.py +3 -1
- pipecat/services/salesforce/__init__.py +9 -0
- pipecat/services/salesforce/llm.py +700 -0
- pipecat/services/sarvam/__init__.py +7 -0
- pipecat/services/sarvam/stt.py +540 -0
- pipecat/services/sarvam/tts.py +97 -13
- pipecat/services/simli/video.py +2 -2
- pipecat/services/speechmatics/stt.py +22 -10
- pipecat/services/stt_service.py +47 -0
- pipecat/services/tavus/video.py +2 -2
- pipecat/services/tts_service.py +75 -22
- pipecat/services/vision_service.py +7 -6
- pipecat/services/vistaar/llm.py +51 -9
- pipecat/tests/utils.py +4 -4
- pipecat/transcriptions/language.py +41 -1
- pipecat/transports/base_input.py +13 -34
- pipecat/transports/base_output.py +140 -104
- pipecat/transports/daily/transport.py +199 -26
- pipecat/transports/heygen/__init__.py +0 -0
- pipecat/transports/heygen/transport.py +381 -0
- pipecat/transports/livekit/transport.py +228 -63
- pipecat/transports/local/audio.py +6 -1
- pipecat/transports/local/tk.py +11 -2
- pipecat/transports/network/fastapi_websocket.py +1 -1
- pipecat/transports/smallwebrtc/connection.py +103 -19
- pipecat/transports/smallwebrtc/request_handler.py +246 -0
- pipecat/transports/smallwebrtc/transport.py +65 -23
- pipecat/transports/tavus/transport.py +23 -12
- pipecat/transports/websocket/client.py +41 -5
- pipecat/transports/websocket/fastapi.py +21 -11
- pipecat/transports/websocket/server.py +14 -7
- pipecat/transports/whatsapp/api.py +8 -0
- pipecat/transports/whatsapp/client.py +47 -0
- pipecat/utils/base_object.py +54 -22
- pipecat/utils/redis.py +58 -0
- pipecat/utils/string.py +13 -1
- pipecat/utils/tracing/service_decorators.py +21 -21
- pipecat/serializers/genesys.py +0 -95
- pipecat/services/google/test-google-chirp.py +0 -45
- pipecat/services/openai.py +0 -698
- {dv_pipecat_ai-0.0.82.dev857.dist-info → dv_pipecat_ai-0.0.85.dev837.dist-info}/WHEEL +0 -0
- {dv_pipecat_ai-0.0.82.dev857.dist-info → dv_pipecat_ai-0.0.85.dev837.dist-info}/licenses/LICENSE +0 -0
- {dv_pipecat_ai-0.0.82.dev857.dist-info → dv_pipecat_ai-0.0.85.dev837.dist-info}/top_level.txt +0 -0
- /pipecat/services/{aws_nova_sonic → aws/nova_sonic}/ready.wav +0 -0
pipecat/services/google/gemini_live/llm_vertex.py
ADDED
@@ -0,0 +1,184 @@
+#
+# Copyright (c) 2024–2025, Daily
+#
+# SPDX-License-Identifier: BSD 2-Clause License
+#
+
+"""Service for accessing Gemini Live via Google Vertex AI.
+
+This module provides integration with Google's Gemini Live model via
+Vertex AI, supporting both text and audio modalities with voice transcription,
+streaming responses, and tool usage.
+"""
+
+import json
+from typing import List, Optional, Union
+
+from loguru import logger
+
+from pipecat.adapters.schemas.tools_schema import ToolsSchema
+from pipecat.services.google.gemini_live.llm import (
+    GeminiLiveLLMService,
+    HttpOptions,
+    InputParams,
+)
+
+try:
+    from google.auth import default
+    from google.auth.exceptions import GoogleAuthError
+    from google.auth.transport.requests import Request
+    from google.genai import Client
+    from google.oauth2 import service_account
+
+except ModuleNotFoundError as e:
+    logger.error(f"Exception: {e}")
+    logger.error("In order to use Google Vertex AI, you need to `pip install pipecat-ai[google]`.")
+    raise Exception(f"Missing module: {e}")
+
+
+class GeminiLiveVertexLLMService(GeminiLiveLLMService):
+    """Provides access to Google's Gemini Live model via Vertex AI.
+
+    This service enables real-time conversations with Gemini, supporting both
+    text and audio modalities. It handles voice transcription, streaming audio
+    responses, and tool usage.
+    """
+
+    def __init__(
+        self,
+        *,
+        credentials: Optional[str] = None,
+        credentials_path: Optional[str] = None,
+        location: str,
+        project_id: str,
+        model="google/gemini-2.0-flash-live-preview-04-09",
+        voice_id: str = "Charon",
+        start_audio_paused: bool = False,
+        start_video_paused: bool = False,
+        system_instruction: Optional[str] = None,
+        tools: Optional[Union[List[dict], ToolsSchema]] = None,
+        params: Optional[InputParams] = None,
+        inference_on_context_initialization: bool = True,
+        file_api_base_url: str = "https://generativelanguage.googleapis.com/v1beta/files",
+        http_options: Optional[HttpOptions] = None,
+        **kwargs,
+    ):
+        """Initialize the service for accessing Gemini Live via Google Vertex AI.
+
+        Args:
+            credentials: JSON string of service account credentials.
+            credentials_path: Path to the service account JSON file.
+            location: GCP region for Vertex AI endpoint (e.g., "us-east4").
+            project_id: Google Cloud project ID.
+            model: Model identifier to use. Defaults to "models/gemini-2.0-flash-live-preview-04-09".
+            voice_id: TTS voice identifier. Defaults to "Charon".
+            start_audio_paused: Whether to start with audio input paused. Defaults to False.
+            start_video_paused: Whether to start with video input paused. Defaults to False.
+            system_instruction: System prompt for the model. Defaults to None.
+            tools: Tools/functions available to the model. Defaults to None.
+            params: Configuration parameters for the model along with Vertex AI
+                location and project ID.
+            inference_on_context_initialization: Whether to generate a response when context
+                is first set. Defaults to True.
+            file_api_base_url: Base URL for the Gemini File API. Defaults to the official endpoint.
+            http_options: HTTP options for the client.
+            **kwargs: Additional arguments passed to parent GeminiLiveLLMService.
+        """
+        # Check if user incorrectly passed api_key, which is used by parent
+        # class but not here.
+        if "api_key" in kwargs:
+            logger.error(
+                "GeminiLiveVertexLLMService does not accept 'api_key' parameter. "
+                "Use 'credentials' or 'credentials_path' instead for Vertex AI authentication."
+            )
+            raise ValueError(
+                "Invalid parameter 'api_key'. Use 'credentials' or 'credentials_path' for Vertex AI authentication."
+            )
+
+        # These need to be set before calling super().__init__() because
+        # super().__init__() invokes create_client(), which needs these.
+        self._credentials = self._get_credentials(credentials, credentials_path)
+        self._project_id = project_id
+        self._location = location
+
+        # Call parent constructor with the obtained API key
+        super().__init__(
+            # api_key is required by parent class, but actually not used with
+            # Vertex
+            api_key="dummy",
+            model=model,
+            voice_id=voice_id,
+            start_audio_paused=start_audio_paused,
+            start_video_paused=start_video_paused,
+            system_instruction=system_instruction,
+            tools=tools,
+            params=params,
+            inference_on_context_initialization=inference_on_context_initialization,
+            file_api_base_url=file_api_base_url,
+            http_options=http_options,
+            **kwargs,
+        )
+
+    def create_client(self):
+        """Create the Gemini client instance."""
+        self._client = Client(
+            vertexai=True,
+            credentials=self._credentials,
+            project=self._project_id,
+            location=self._location,
+        )
+
+    @property
+    def file_api(self):
+        """Gemini File API is not supported with Vertex AI."""
+        raise NotImplementedError(
+            "When using Vertex AI, the recommended approach is to use Google Cloud Storage for file handling. The Gemini File API is not directly supported in this context."
+        )
+
+    @staticmethod
+    def _get_credentials(credentials: Optional[str], credentials_path: Optional[str]) -> str:
+        """Retrieve Credentials using Google service account credentials JSON.
+
+        Supports multiple authentication methods:
+        1. Direct JSON credentials string
+        2. Path to service account JSON file
+        3. Default application credentials (ADC)
+
+        Args:
+            credentials: JSON string of service account credentials.
+            credentials_path: Path to the service account JSON file.
+
+        Returns:
+            OAuth token for API authentication.
+
+        Raises:
+            ValueError: If no valid credentials are provided or found.
+        """
+        creds: Optional[service_account.Credentials] = None
+
+        if credentials:
+            # Parse and load credentials from JSON string
+            creds = service_account.Credentials.from_service_account_info(
+                json.loads(credentials),
+                scopes=["https://www.googleapis.com/auth/cloud-platform"],
+            )
+        elif credentials_path:
+            # Load credentials from JSON file
+            creds = service_account.Credentials.from_service_account_file(
+                credentials_path,
+                scopes=["https://www.googleapis.com/auth/cloud-platform"],
+            )
+        else:
+            try:
+                creds, project_id = default(
+                    scopes=["https://www.googleapis.com/auth/cloud-platform"]
+                )
+            except GoogleAuthError:
+                pass
+
+        if not creds:
+            raise ValueError("No valid credentials provided.")
+
+        creds.refresh(Request())  # Ensure token is up-to-date, lifetime is 1 hour.
+
+        return creds
pipecat/services/google/llm.py
CHANGED
@@ -35,8 +35,8 @@ from pipecat.frames.frames import (
     LLMMessagesFrame,
     LLMTextFrame,
     LLMUpdateSettingsFrame,
+    OutputImageRawFrame,
     UserImageRawFrame,
-    VisionImageRawFrame,
 )
 from pipecat.metrics.metrics import LLMTokenUsage
 from pipecat.processors.aggregators.llm_context import LLMContext
@@ -73,6 +73,9 @@ try:
         HttpOptions,
         Part,
     )
+
+    # Temporary hack to be able to process Nano Banana returned images.
+    genai._api_client.READ_BUFFER_SIZE = 5 * 1024 * 1024
 except ModuleNotFoundError as e:
     logger.error(f"Exception: {e}")
     logger.error("In order to use Google AI, you need to `pip install pipecat-ai[google]`.")
@@ -683,7 +686,7 @@ class GoogleLLMService(LLMService):
         self,
         *,
         api_key: str,
-        model: str = "gemini-2.
+        model: str = "gemini-2.5-flash",
         params: Optional[InputParams] = None,
         system_instruction: Optional[str] = None,
         tools: Optional[List[Dict[str, Any]]] = None,
@@ -711,6 +714,7 @@ class GoogleLLMService(LLMService):
         self._api_key = api_key
         self._system_instruction = system_instruction
         self._http_options = http_options
+
         self._create_client(api_key, http_options)
         self._settings = {
             "max_tokens": params.max_tokens,
@@ -789,6 +793,9 @@ class GoogleLLMService(LLMService):
         # and can be configured to turn it off.
         if not self._model_name.startswith("gemini-2.5-flash"):
             return
+        # If we have an image model, we don't use a budget either.
+        if "image" in self._model_name:
+            return
         # If thinking_config is already set, don't override it.
         if "thinking_config" in generation_params:
             return
@@ -928,6 +935,12 @@ class GoogleLLMService(LLMService):
                         arguments=function_call.args or {},
                     )
                 )
+            elif part.inline_data and part.inline_data.data:
+                image = Image.open(io.BytesIO(part.inline_data.data))
+                frame = OutputImageRawFrame(
+                    image=image.tobytes(), size=image.size, format="RGB"
+                )
+                await self.push_frame(frame)

         if (
             candidate.grounding_metadata
@@ -1013,15 +1026,6 @@ class GoogleLLMService(LLMService):
             # NOTE: LLMMessagesFrame is deprecated, so we don't support the newer universal
             # LLMContext with it
             context = GoogleLLMContext(frame.messages)
-        elif isinstance(frame, VisionImageRawFrame):
-            # This is only useful in very simple pipelines because it creates
-            # a new context. Generally we want a context manager to catch
-            # UserImageRawFrames coming through the pipeline and add them
-            # to the context.
-            context = GoogleLLMContext()
-            context.add_image_frame_message(
-                format=frame.format, size=frame.size, image=frame.image, text=frame.text
-            )
         elif isinstance(frame, LLMUpdateSettingsFrame):
             await self._update_settings(frame.settings)
         else:
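With the `inline_data` branch above, image-capable Gemini models now push `OutputImageRawFrame`s downstream. A minimal sketch of a pipeline processor that could consume them; the processor class and output path are illustrative, not part of the diff:

    from PIL import Image

    from pipecat.frames.frames import Frame, OutputImageRawFrame
    from pipecat.processors.frame_processor import FrameDirection, FrameProcessor

    class SaveGeneratedImages(FrameProcessor):
        """Illustrative processor: writes each generated image to disk."""

        async def process_frame(self, frame: Frame, direction: FrameDirection):
            await super().process_frame(frame, direction)
            if isinstance(frame, OutputImageRawFrame):
                # The hunk above pushes raw RGB bytes plus the image size.
                Image.frombytes(frame.format, frame.size, frame.image).save("generated.png")
            await self.push_frame(frame, direction)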
pipecat/services/google/llm_openai.py
CHANGED
@@ -96,9 +96,9 @@ class GoogleLLMOpenAIBetaService(OpenAILLMService):
         async for chunk in chunk_stream:
             if chunk.usage:
                 tokens = LLMTokenUsage(
-                    prompt_tokens=chunk.usage.prompt_tokens,
-                    completion_tokens=chunk.usage.completion_tokens,
-                    total_tokens=chunk.usage.total_tokens,
+                    prompt_tokens=chunk.usage.prompt_tokens or 0,
+                    completion_tokens=chunk.usage.completion_tokens or 0,
+                    total_tokens=chunk.usage.total_tokens or 0,
                 )
                 await self.start_llm_usage_metrics(tokens)

pipecat/services/google/llm_vertex.py
CHANGED
@@ -53,12 +53,44 @@ class GoogleVertexLLMService(OpenAILLMService):

         Parameters:
             location: GCP region for Vertex AI endpoint (e.g., "us-east4").
+
+                .. deprecated:: 0.0.90
+                    Use `location` as a direct argument to
+                    `GoogleVertexLLMService.__init__()` instead.
+
             project_id: Google Cloud project ID.
+
+                .. deprecated:: 0.0.90
+                    Use `project_id` as a direct argument to
+                    `GoogleVertexLLMService.__init__()` instead.
         """

         # https://cloud.google.com/vertex-ai/generative-ai/docs/learn/locations
-        location: str =
-        project_id: str
+        location: Optional[str] = None
+        project_id: Optional[str] = None
+
+        def __init__(self, **kwargs):
+            """Initializes the InputParams."""
+            import warnings
+
+            with warnings.catch_warnings():
+                warnings.simplefilter("always")
+                if "location" in kwargs and kwargs["location"] is not None:
+                    warnings.warn(
+                        "GoogleVertexLLMService.InputParams.location is deprecated. "
+                        "Please provide 'location' as a direct argument to GoogleVertexLLMService.__init__() instead.",
+                        DeprecationWarning,
+                        stacklevel=2,
+                    )
+
+                if "project_id" in kwargs and kwargs["project_id"] is not None:
+                    warnings.warn(
+                        "GoogleVertexLLMService.InputParams.project_id is deprecated. "
+                        "Please provide 'project_id' as a direct argument to GoogleVertexLLMService.__init__() instead.",
+                        DeprecationWarning,
+                        stacklevel=2,
+                    )
+            super().__init__(**kwargs)

     def __init__(
         self,
@@ -66,7 +98,8 @@ class GoogleVertexLLMService(OpenAILLMService):
         credentials: Optional[str] = None,
         credentials_path: Optional[str] = None,
         model: str = "google/gemini-2.0-flash-001",
-
+        location: Optional[str] = None,
+        project_id: Optional[str] = None,
         **kwargs,
     ):
         """Initializes the VertexLLMService.
@@ -75,25 +108,60 @@ class GoogleVertexLLMService(OpenAILLMService):
             credentials: JSON string of service account credentials.
             credentials_path: Path to the service account JSON file.
             model: Model identifier (e.g., "google/gemini-2.0-flash-001").
-
+            location: GCP region for Vertex AI endpoint (e.g., "us-east4").
+            project_id: Google Cloud project ID.
             **kwargs: Additional arguments passed to OpenAILLMService.
         """
-
-
+        # Handle deprecated InputParams fields
+        if "params" in kwargs and isinstance(kwargs["params"], GoogleVertexLLMService.InputParams):
+            params = kwargs["params"]
+            # Extract location and project_id from params if not provided
+            # directly, for backward compatibility
+            if project_id is None:
+                project_id = params.project_id
+            if location is None:
+                location = params.location
+            # Convert to base InputParams
+            params = OpenAILLMService.InputParams(
+                **params.model_dump(exclude={"location", "project_id"}, exclude_unset=True)
+            )
+            kwargs["params"] = params
+
+        # Validate project_id and location parameters
+        # NOTE: once we remove Vertex-spcific InputParams class, we can update
+        # __init__() signature as follows:
+        # - location: str = "us-east4",
+        # - project_id: str,
+        # But for now, we need them as-is to maintain proper backward
+        # compatibility.
+        if project_id is None:
+            raise ValueError("project_id is required")
+        if location is None:
+            # If location is not provided, default to "us-east4".
+            # Note: this is legacy behavior; ideally location would be
+            # required.
+            logger.warning("location is not provided. Defaulting to 'us-east4'.")
+            location = "us-east4"  # Default location if not provided
+
+        base_url = self._get_base_url(location, project_id)
         self._api_key = self._get_api_token(credentials, credentials_path)

         super().__init__(
-            api_key=self._api_key,
+            api_key=self._api_key,
+            base_url=base_url,
+            model=model,
+            **kwargs,
         )

     @staticmethod
-    def _get_base_url(
-        """
-
-
-
-
+    def _get_base_url(location: str, project_id: str) -> str:
+        """Construct the base URL for Vertex AI API."""
+        # Determine the correct API host based on location
+        if location == "global":
+            api_host = "aiplatform.googleapis.com"
+        else:
+            api_host = f"{location}-aiplatform.googleapis.com"
+        return f"https://{api_host}/v1/projects/{project_id}/locations/{location}/endpoints/openapi"

     @staticmethod
     def _get_api_token(credentials: Optional[str], credentials_path: Optional[str]) -> str:
@@ -119,12 +187,14 @@ class GoogleVertexLLMService(OpenAILLMService):
         if credentials:
             # Parse and load credentials from JSON string
             creds = service_account.Credentials.from_service_account_info(
-                json.loads(credentials),
+                json.loads(credentials),
+                scopes=["https://www.googleapis.com/auth/cloud-platform"],
             )
         elif credentials_path:
             # Load credentials from JSON file
             creds = service_account.Credentials.from_service_account_file(
-                credentials_path,
+                credentials_path,
+                scopes=["https://www.googleapis.com/auth/cloud-platform"],
             )
         else:
             try:
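These changes deprecate the routing fields on `InputParams` in favor of direct constructor arguments. A before/after sketch, with placeholder credentials path, project, and region values:

    from pipecat.services.google.llm_vertex import GoogleVertexLLMService

    # Deprecated: location/project_id via InputParams (still works for
    # backward compatibility, but emits DeprecationWarnings).
    llm = GoogleVertexLLMService(
        credentials_path="/path/to/service-account.json",
        params=GoogleVertexLLMService.InputParams(
            location="us-east4", project_id="my-gcp-project"
        ),
    )

    # Preferred: pass them directly to __init__().
    llm = GoogleVertexLLMService(
        credentials_path="/path/to/service-account.json",
        location="us-east4",
        project_id="my-gcp-project",
    )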
pipecat/services/google/stt.py
CHANGED
@@ -730,6 +730,8 @@ class GoogleSTTService(STTService):
         self._request_queue = asyncio.Queue()
         self._streaming_task = self.create_task(self._stream_audio())

+        await self._call_event_handler("on_connected")
+
     async def _disconnect(self):
         """Clean up streaming recognition resources."""
         if self._streaming_task:
@@ -737,6 +739,8 @@ class GoogleSTTService(STTService):
             await self.cancel_task(self._streaming_task)
             self._streaming_task = None

+        await self._call_event_handler("on_disconnected")
+
     async def _request_generator(self):
         """Generates requests for the streaming recognize method."""
         recognizer_path = f"projects/{self._project_id}/locations/{self._location}/recognizers/_"
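The new `_call_event_handler` calls surface the streaming connection lifecycle to user code. A sketch of subscribing to these events, assuming the service registers `on_connected`/`on_disconnected` (registration is not visible in these hunks) and using pipecat's usual `event_handler` decorator; the credentials path is a placeholder:

    from pipecat.services.google.stt import GoogleSTTService

    stt = GoogleSTTService(credentials_path="/path/to/service-account.json")  # placeholder

    @stt.event_handler("on_connected")
    async def on_connected(service):
        print("Google STT streaming connection established")

    @stt.event_handler("on_disconnected")
    async def on_disconnected(service):
        print("Google STT streaming connection closed")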
pipecat/services/google/tts.py
CHANGED
@@ -500,10 +500,11 @@ class GoogleTTSService(TTSService):

         Parameters:
             language: Language for synthesis. Defaults to English.
+            speaking_rate: The speaking rate, in the range [0.25, 4.0].
         """

         language: Optional[Language] = Language.EN
-
+        speaking_rate: Optional[float] = None

     def __init__(
         self,
@@ -511,6 +512,7 @@ class GoogleTTSService(TTSService):
         credentials: Optional[str] = None,
         credentials_path: Optional[str] = None,
         voice_id: str = "en-US-Chirp3-HD-Charon",
+        voice_cloning_key: Optional[str] = None,
         sample_rate: Optional[int] = None,
         params: InputParams = InputParams(),
         **kwargs,
@@ -521,6 +523,7 @@ class GoogleTTSService(TTSService):
             credentials: JSON string containing Google Cloud service account credentials.
             credentials_path: Path to Google Cloud service account JSON file.
             voice_id: Google TTS voice identifier (e.g., "en-US-Chirp3-HD-Charon").
+            voice_cloning_key: The voice cloning key for Chirp 3 custom voices.
             sample_rate: Audio sample rate in Hz. If None, uses default.
             params: Language configuration parameters.
             **kwargs: Additional arguments passed to parent TTSService.
@@ -536,7 +539,7 @@ class GoogleTTSService(TTSService):
             "language": self.language_to_service_language(params.language)
             if params.language
             else "en-US",
-            "
+            "speaking_rate": params.speaking_rate,
         }
         self._voice_clone_params = None
         if self._voice_config.get("is_clone", False):
@@ -550,6 +553,7 @@ class GoogleTTSService(TTSService):
             language_code=self._settings["language"], voice_clone=self._voice_clone_params
         )
         self.set_voice(voice_id)
+        self._voice_cloning_key = voice_cloning_key
         self._client: texttospeech_v1.TextToSpeechAsyncClient = self._create_client(
             credentials, credentials_path
         )
@@ -628,7 +632,7 @@ class GoogleTTSService(TTSService):
             streaming_audio_config=texttospeech_v1.StreamingAudioConfig(
                 audio_encoding=texttospeech_v1.AudioEncoding.PCM,
                 sample_rate_hertz=self.sample_rate,
-                speaking_rate=self._settings["
+                speaking_rate=self._settings["speaking_rate"],
             ),
         )
         config_request = texttospeech_v1.StreamingSynthesizeRequest(
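For reference, a construction sketch using the new `speaking_rate` parameter via the nested `InputParams` shown above; the credentials path is a placeholder:

    from pipecat.services.google.tts import GoogleTTSService
    from pipecat.transcriptions.language import Language

    tts = GoogleTTSService(
        credentials_path="/path/to/service-account.json",  # placeholder
        voice_id="en-US-Chirp3-HD-Charon",
        params=GoogleTTSService.InputParams(
            language=Language.EN,
            speaking_rate=1.2,  # valid range is [0.25, 4.0] per the docstring above
        ),
    )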
pipecat/services/heygen/api.py
CHANGED
@@ -108,12 +108,14 @@ class HeyGenSession(BaseModel):
     Parameters:
         session_id (str): Unique identifier for the streaming session.
         access_token (str): Token for accessing the session securely.
+        livekit_agent_token (str): Token for HeyGen's audio agents (Pipecat).
         realtime_endpoint (str): Real-time communication endpoint URL.
         url (str): Direct URL for the session.
     """

     session_id: str
     access_token: str
+    livekit_agent_token: str
     realtime_endpoint: str
     url: str

pipecat/services/heygen/client.py
CHANGED
@@ -393,7 +393,9 @@ class HeyGenClient:
             participant_id: Identifier of the participant to capture audio from
             callback: Async function to handle received audio frames
         """
-        logger.debug(
+        logger.debug(
+            f"capture_participant_audio: {participant_id}, sample_rate: {self._in_sample_rate}"
+        )
         self._audio_frame_callback = callback
         if self._audio_task is not None:
             logger.warning(
@@ -407,7 +409,9 @@ class HeyGenClient:
         for track_pub in participant.track_publications.values():
             if track_pub.kind == rtc.TrackKind.KIND_AUDIO and track_pub.track is not None:
                 logger.debug(f"Starting audio capture for existing track: {track_pub.sid}")
-                audio_stream = rtc.AudioStream(
+                audio_stream = rtc.AudioStream(
+                    track=track_pub.track, sample_rate=self._in_sample_rate
+                )
                 self._audio_task = self._task_manager.create_task(
                     self._process_audio_frames(audio_stream), name="HeyGenClient_Receive_Audio"
                 )
@@ -536,7 +540,7 @@ class HeyGenClient:
             and self._audio_task is None
         ):
             logger.debug(f"Creating audio stream processor for track: {publication.sid}")
-            audio_stream = rtc.AudioStream(track)
+            audio_stream = rtc.AudioStream(track=track, sample_rate=self._in_sample_rate)
             self._audio_task = self._task_manager.create_task(
                 self._process_audio_frames(audio_stream), name="HeyGenClient_Receive_Audio"
             )
@@ -559,7 +563,7 @@ class HeyGenClient:
         )

         await self._livekit_room.connect(
-            self._heyGen_session.url, self._heyGen_session.
+            self._heyGen_session.url, self._heyGen_session.livekit_agent_token
         )
         logger.debug(f"Successfully connected to LiveKit room: {self._livekit_room.name}")
         logger.debug(f"Local participant SID: {self._livekit_room.local_participant.sid}")
pipecat/services/heygen/video.py
CHANGED
@@ -110,6 +110,7 @@ class HeyGenVideoService(AIService):
             api_key=self._api_key,
             session=self._session,
             params=TransportParams(
+                audio_in_sample_rate=48000,
                 audio_in_enabled=True,
                 video_in_enabled=True,
                 audio_out_enabled=True,
@@ -240,6 +241,7 @@ class HeyGenVideoService(AIService):
             # As soon as we receive actual audio, the base output transport will create a
             # BotStartedSpeakingFrame, which we can use as a signal for the TTFB metrics.
             await self.stop_ttfb_metrics()
+            await self.push_frame(frame, direction)
         else:
             await self.push_frame(frame, direction)