livekit-plugins-google 0.3.0__py3-none-any.whl → 1.3.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -12,20 +12,38 @@
12
12
  # See the License for the specific language governing permissions and
13
13
  # limitations under the License.
14
14
 
15
+ """Google AI plugin for LiveKit Agents
16
+
17
+ Supports Gemini, Cloud Speech-to-Text, and Cloud Text-to-Speech.
18
+
19
+ See https://docs.livekit.io/agents/integrations/stt/google/ for more information.
20
+ """
21
+
22
+ from . import beta, realtime
23
+ from .llm import LLM
15
24
  from .stt import STT, SpeechStream
25
+ from .tools import _LLMTool
26
+ from .tts import TTS
16
27
  from .version import __version__
17
28
 
18
- __all__ = ["STT", "SpeechStream", "__version__"]
19
-
29
+ __all__ = ["STT", "TTS", "realtime", "SpeechStream", "__version__", "beta", "LLM", "_LLMTool"]
20
30
  from livekit.agents import Plugin
21
31
 
32
+ from .log import logger
22
33
 
23
- class GooglePlugin(Plugin):
24
- def __init__(self):
25
- super().__init__(__name__, __version__, __package__)
26
34
 
27
- def download_files(self):
28
- pass
35
class GooglePlugin(Plugin):
    """Plugin object for the Google integrations; an instance is registered at import time."""

    def __init__(self) -> None:
        # Hand the base class this module's name, the package version, the
        # package name and the plugin's logger, in that positional order.
        super().__init__(__name__, __version__, __package__, logger)
29
38
 
30
39
 
31
40
  Plugin.register_plugin(GooglePlugin())
41
+
42
# Cleanup docs of unexported modules.
# The namespace is snapshotted first, so the helper names created below
# (``_module``, ``NOT_IN_ALL``, ``__pdoc__``, ``n``) are not part of the listing.
_module = dir()
NOT_IN_ALL = [m for m in _module if m not in __all__]

# Every name that is not re-exported through ``__all__`` is mapped to False.
__pdoc__ = {}
for n in NOT_IN_ALL:
    __pdoc__[n] = False
@@ -0,0 +1,13 @@
1
+ from .. import realtime
2
+ from .gemini_tts import TTS as GeminiTTS
3
+
4
+ __all__ = ["GeminiTTS", "realtime"]
5
+
6
# Cleanup docs of unexported modules.
# The namespace is snapshotted first, so the helper names created below
# (``_module``, ``NOT_IN_ALL``, ``__pdoc__``, ``n``) are not part of the listing.
_module = dir()
NOT_IN_ALL = [m for m in _module if m not in __all__]

# Every name that is not re-exported through ``__all__`` is mapped to False.
__pdoc__ = {}
for n in NOT_IN_ALL:
    __pdoc__[n] = False
@@ -0,0 +1,258 @@
1
+ from __future__ import annotations
2
+
3
+ import os
4
+ from dataclasses import dataclass
5
+ from typing import Literal
6
+
7
+ from google.genai import Client, types
8
+ from google.genai.errors import APIError, ClientError, ServerError
9
+ from livekit.agents import APIConnectionError, APIStatusError, tts, utils
10
+ from livekit.agents.types import (
11
+ DEFAULT_API_CONNECT_OPTIONS,
12
+ NOT_GIVEN,
13
+ APIConnectOptions,
14
+ NotGivenOr,
15
+ )
16
+ from livekit.agents.utils import is_given
17
+
18
# Model identifiers known to this module. Signatures accept ``GEMINI_TTS_MODELS | str``,
# so other model names can still be passed.
GEMINI_TTS_MODELS = Literal["gemini-2.5-flash-preview-tts", "gemini-2.5-pro-preview-tts"]

# Voice names known to this module; the chosen value is forwarded as the
# ``voice_name`` of the prebuilt voice configuration.
GEMINI_VOICES = Literal[
    "Zephyr",
    "Puck",
    "Charon",
    "Kore",
    "Fenrir",
    "Leda",
    "Orus",
    "Aoede",
    "Callirrhoe",
    "Autonoe",
    "Enceladus",
    "Iapetus",
    "Umbriel",
    "Algieba",
    "Despina",
    "Erinome",
    "Algenib",
    "Rasalgethi",
    "Laomedeia",
    "Achernar",
    "Alnilam",
    "Schedar",
    "Gacrux",
    "Pulcherrima",
    "Achird",
    "Zubenelgenubi",
    "Vindemiatrix",
    "Sadachbia",
    "Sadaltager",
    "Sulafat",
]

# Defaults applied when the caller does not choose a model or a voice.
DEFAULT_MODEL = "gemini-2.5-flash-preview-tts"
DEFAULT_VOICE = "Kore"

# Audio format reported to the framework.
DEFAULT_SAMPLE_RATE = 24000  # not configurable
NUM_CHANNELS = 1

# Prompt placed in front of the quoted input text when no instructions are supplied.
DEFAULT_INSTRUCTIONS = "Say the text with a proper tone, don't omit or add any words"
58
+
59
+
60
@dataclass
class _TTSOptions:
    """Settings resolved once by ``TTS.__init__`` and read for every synthesis request."""

    # Model identifier sent as ``model`` to ``generate_content``.
    model: GEMINI_TTS_MODELS | str
    # Voice passed to the prebuilt voice config; the only field ``update_options`` changes.
    voice_name: GEMINI_VOICES | str
    # True when the client was configured for Vertex AI.
    vertexai: bool
    # Project and location are kept only in Vertex AI mode; both are None otherwise.
    project: str | None
    location: str | None
    # Prompt prefix placed before the quoted input text; None sends the text unchanged.
    instructions: str | None
68
+
69
+
70
class TTS(tts.TTS):
    """Non-streaming text-to-speech backed by a Gemini TTS model.

    Requests go either to the Gemini API (API key) or to Vertex AI (project and
    location); the mode is resolved once in ``__init__``.
    """

    def __init__(
        self,
        *,
        model: GEMINI_TTS_MODELS | str = DEFAULT_MODEL,
        voice_name: GEMINI_VOICES | str = DEFAULT_VOICE,
        api_key: NotGivenOr[str] = NOT_GIVEN,
        vertexai: NotGivenOr[bool] = NOT_GIVEN,
        project: NotGivenOr[str] = NOT_GIVEN,
        location: NotGivenOr[str] = NOT_GIVEN,
        instructions: NotGivenOr[str | None] = NOT_GIVEN,
    ) -> None:
        """
        Create a new instance of Gemini TTS.

        Environment Requirements:
        - For VertexAI: Set the `GOOGLE_APPLICATION_CREDENTIALS` environment variable to the path of the service account key file.
        - For Google Gemini API: Set the `api_key` argument or the `GOOGLE_API_KEY` environment variable.

        Args:
            model (str, optional): The Gemini TTS model to use. Defaults to "gemini-2.5-flash-preview-tts".
            voice_name (str, optional): The voice to use for synthesis. Defaults to "Kore".
            api_key (str, optional): The API key for Google Gemini. Falls back to the `GOOGLE_API_KEY` environment variable. Ignored when VertexAI is used.
            vertexai (bool, optional): Whether to use VertexAI. When not given, it is enabled if `GOOGLE_GENAI_USE_VERTEXAI` is "true" or "1" (case-insensitive); otherwise False.
            project (str, optional): The Google Cloud project to use (only for VertexAI). Falls back to `GOOGLE_CLOUD_PROJECT`, then to the project reported by the default Google credentials.
            location (str, optional): The location to use for VertexAI API requests. Falls back to `GOOGLE_CLOUD_LOCATION`, then to "us-central1".
            instructions (str, optional): Control the style, tone, accent, and pace using prompts. Pass None to send the text without a prompt prefix. See https://ai.google.dev/gemini-api/docs/speech-generation#controllable

        Raises:
            ValueError: If VertexAI is not used and no API key is available.
        """  # noqa: E501
        super().__init__(
            capabilities=tts.TTSCapabilities(streaming=False),
            sample_rate=DEFAULT_SAMPLE_RATE,
            num_channels=NUM_CHANNELS,
        )

        # Explicit arguments always win; environment variables are the fallback.
        resolved_project: str | None
        if is_given(project):
            resolved_project = project
        else:
            resolved_project = os.environ.get("GOOGLE_CLOUD_PROJECT")

        resolved_location: str | None
        if is_given(location):
            resolved_location = location
        else:
            resolved_location = os.environ.get("GOOGLE_CLOUD_LOCATION") or "us-central1"

        if is_given(vertexai):
            vertex_enabled = vertexai
        else:
            env_flag = os.environ.get("GOOGLE_GENAI_USE_VERTEXAI", "0")
            vertex_enabled = env_flag.lower() in ["true", "1"]

        resolved_key = api_key if is_given(api_key) else os.environ.get("GOOGLE_API_KEY")

        if not vertex_enabled:
            # Gemini API mode: project/location do not apply, an API key is mandatory.
            resolved_project = None
            resolved_location = None
            if not resolved_key:
                raise ValueError(
                    "API key is required for Google API either via api_key or GOOGLE_API_KEY environment variable"  # noqa: E501
                )
        else:
            if not resolved_project:
                # Imported lazily so google.auth is only touched when it is needed.
                from google.auth._default_async import default_async

                _, resolved_project = default_async(  # type: ignore
                    scopes=["https://www.googleapis.com/auth/cloud-platform"]
                )
            resolved_key = None  # VertexAI does not require an API key

        self._opts = _TTSOptions(
            model=model,
            voice_name=voice_name,
            vertexai=vertex_enabled,
            project=resolved_project,
            location=resolved_location,
            instructions=instructions if is_given(instructions) else DEFAULT_INSTRUCTIONS,
        )

        self._client = Client(
            api_key=resolved_key,
            vertexai=vertex_enabled,
            project=resolved_project,
            location=resolved_location,
        )

    @property
    def model(self) -> str:
        """Model identifier this instance was configured with."""
        return self._opts.model

    @property
    def provider(self) -> str:
        """Return "Vertex AI" when the client runs in Vertex AI mode, "Gemini" otherwise."""
        return "Vertex AI" if self._client.vertexai else "Gemini"

    def synthesize(
        self, text: str, *, conn_options: APIConnectOptions = DEFAULT_API_CONNECT_OPTIONS
    ) -> ChunkedStream:
        """Create a ``ChunkedStream`` that synthesizes ``text`` with this instance's options."""
        return ChunkedStream(tts=self, input_text=text, conn_options=conn_options)

    def update_options(
        self,
        *,
        voice_name: NotGivenOr[str] = NOT_GIVEN,
    ) -> None:
        """
        Update the TTS options.

        Only the voice can be changed; omitting the argument leaves it untouched.

        Args:
            voice_name (str, optional): The voice to use for synthesis.
        """
        if not is_given(voice_name):
            return
        self._opts.voice_name = voice_name
180
+
181
+
182
class ChunkedStream(tts.ChunkedStream):
    """Single-request synthesis against the Gemini ``generate_content`` API.

    The whole input text is sent in one call and every audio part of the first
    candidate is pushed to the output emitter.
    """

    def __init__(self, *, tts: TTS, input_text: str, conn_options: APIConnectOptions) -> None:
        super().__init__(tts=tts, input_text=input_text, conn_options=conn_options)
        # Concretely typed reference so ``_opts`` and ``_client`` are reachable.
        self._tts: TTS = tts

    async def _run(self, output_emitter: tts.AudioEmitter) -> None:
        """Request the audio and forward it to ``output_emitter``.

        Args:
            output_emitter: Initialized here as ``audio/pcm`` with the parent TTS's
                sample rate and channel count, then fed the returned audio bytes.

        Raises:
            APIStatusError: The SDK reported a client, server or generic API error,
                or the response carried no content parts.
            APIConnectionError: Any other failure while generating speech.
        """
        try:
            config = types.GenerateContentConfig(
                response_modalities=["AUDIO"],
                speech_config=types.SpeechConfig(
                    voice_config=types.VoiceConfig(
                        prebuilt_voice_config=types.PrebuiltVoiceConfig(
                            voice_name=self._tts._opts.voice_name,
                        )
                    )
                ),
            )

            # With instructions set, the text is sent quoted behind the prompt prefix.
            input_text = self._input_text
            if self._tts._opts.instructions is not None:
                input_text = f'{self._tts._opts.instructions}:\n"{input_text}"'

            response = await self._tts._client.aio.models.generate_content(
                model=self._tts._opts.model,
                contents=input_text,
                config=config,
            )

            output_emitter.initialize(
                request_id=utils.shortuuid(),
                sample_rate=self._tts.sample_rate,
                num_channels=self._tts.num_channels,
                mime_type="audio/pcm",
            )

            if (
                not response.candidates
                or not (content := response.candidates[0].content)
                or not content.parts
            ):
                raise APIStatusError("No audio content generated")

            for part in content.parts:
                if (
                    (inline_data := part.inline_data)
                    and inline_data.data
                    and inline_data.mime_type
                    and inline_data.mime_type.startswith("audio/")
                ):
                    # mime_type: audio/L16;codec=pcm;rate=24000
                    output_emitter.push(inline_data.data)

        except ClientError as e:
            raise APIStatusError(
                "gemini tts: client error",
                status_code=e.code,
                body=f"{e.message} {e.status}",
                # Among client errors only 429 is marked retryable.
                retryable=e.code == 429,
            ) from e
        except ServerError as e:
            raise APIStatusError(
                "gemini tts: server error",
                status_code=e.code,
                body=f"{e.message} {e.status}",
                retryable=True,
            ) from e
        except APIError as e:
            raise APIStatusError(
                "gemini tts: api error",
                status_code=e.code,
                body=f"{e.message} {e.status}",
                retryable=True,
            ) from e
        except APIStatusError:
            # Raised above for an empty response. Without this clause the generic
            # handler below would re-wrap it as an APIConnectionError.
            raise
        except Exception as e:
            raise APIConnectionError(
                f"gemini tts: error generating speech {e}",
                retryable=True,
            ) from e