livekit-plugins-google 0.3.0__py3-none-any.whl → 1.3.8__py3-none-any.whl

This diff shows the changes between publicly available package versions as they appear in their respective public registries. It is provided for informational purposes only.
@@ -0,0 +1,11 @@
+ from typing import Union
+
+ from google.genai.types import (
+     GoogleMaps,
+     GoogleSearch,
+     GoogleSearchRetrieval,
+     ToolCodeExecution,
+     UrlContext,
+ )
+
+ _LLMTool = Union[GoogleSearchRetrieval, ToolCodeExecution, GoogleSearch, UrlContext, GoogleMaps]
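
The `_LLMTool` union enumerates the built-in Gemini tool configurations from the google-genai SDK that the plugin accepts. As a minimal sketch of how one of these values is typically wrapped for that SDK (the `Tool` and `GenerateContentConfig` wrappers are from `google.genai.types`; how the plugin itself wires tools is not shown in this hunk):

    from google.genai import types

    # pick one _LLMTool variant: built-in Google Search grounding
    search_tool = types.GoogleSearch()

    # the google-genai SDK expects tool configs wrapped in types.Tool
    config = types.GenerateContentConfig(
        tools=[types.Tool(google_search=search_tool)],
    )
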
@@ -0,0 +1,447 @@
+ # Copyright 2023 LiveKit, Inc.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ from __future__ import annotations
+
+ import asyncio
+ import weakref
+ from collections.abc import AsyncGenerator
+ from dataclasses import dataclass, replace
+
+ from google.api_core.client_options import ClientOptions
+ from google.api_core.exceptions import DeadlineExceeded, GoogleAPICallError
+ from google.cloud import texttospeech
+ from google.cloud.texttospeech_v1.types import (
+     CustomPronunciations,
+     SsmlVoiceGender,
+     SynthesizeSpeechResponse,
+ )
+ from livekit.agents import APIConnectOptions, APIStatusError, APITimeoutError, tokenize, tts, utils
+ from livekit.agents.types import DEFAULT_API_CONNECT_OPTIONS, NOT_GIVEN, NotGivenOr
+ from livekit.agents.utils import is_given
+
+ from .log import logger
+ from .models import Gender, SpeechLanguages
+
+ NUM_CHANNELS = 1
+ DEFAULT_VOICE_NAME = "en-US-Chirp3-HD-Charon"
+ DEFAULT_LANGUAGE = "en-US"
+ DEFAULT_GENDER = "neutral"
+
+
+ @dataclass
+ class _TTSOptions:
+     voice: texttospeech.VoiceSelectionParams
+     encoding: texttospeech.AudioEncoding
+     sample_rate: int
+     pitch: float
+     effects_profile_id: str
+     speaking_rate: float
+     tokenizer: tokenize.SentenceTokenizer
+     volume_gain_db: float
+     custom_pronunciations: CustomPronunciations | None
+     enable_ssml: bool
+     use_markup: bool
+     model_name: str | None
+     prompt: str | None
+
+
+ class TTS(tts.TTS):
+     def __init__(
+         self,
+         *,
+         language: NotGivenOr[SpeechLanguages | str] = NOT_GIVEN,
+         gender: NotGivenOr[Gender | str] = NOT_GIVEN,
+         voice_name: NotGivenOr[str] = NOT_GIVEN,
+         voice_cloning_key: NotGivenOr[str] = NOT_GIVEN,
+         model_name: NotGivenOr[str] = NOT_GIVEN,
+         prompt: NotGivenOr[str] = NOT_GIVEN,
+         sample_rate: int = 24000,
+         pitch: float = 0.0,
+         effects_profile_id: str = "",
+         speaking_rate: float = 1.0,
+         volume_gain_db: float = 0.0,
+         location: str = "global",
+         audio_encoding: texttospeech.AudioEncoding = texttospeech.AudioEncoding.OGG_OPUS,  # type: ignore
+         credentials_info: NotGivenOr[dict] = NOT_GIVEN,
+         credentials_file: NotGivenOr[str] = NOT_GIVEN,
+         tokenizer: NotGivenOr[tokenize.SentenceTokenizer] = NOT_GIVEN,
+         custom_pronunciations: NotGivenOr[CustomPronunciations] = NOT_GIVEN,
+         use_streaming: bool = True,
+         enable_ssml: bool = False,
+         use_markup: bool = False,
+     ) -> None:
+         """
+         Create a new instance of Google TTS.
+
+         Credentials must be provided, either by using the ``credentials_info`` dict, or reading
+         from the file specified in ``credentials_file`` or the ``GOOGLE_APPLICATION_CREDENTIALS``
+         environment variable.
+
+         Args:
+             language (SpeechLanguages | str, optional): Language code (e.g., "en-US"). Default is "en-US".
+             gender (Gender | str, optional): Voice gender ("male", "female", "neutral"). Default is "neutral".
+             voice_name (str, optional): Specific voice name. Defaults to "en-US-Chirp3-HD-Charon".
+             voice_cloning_key (str, optional): Voice clone key. Created via https://cloud.google.com/text-to-speech/docs/chirp3-instant-custom-voice
+             model_name (str, optional): Model name for TTS (e.g., "gemini-2.5-flash-tts"). Enables Gemini TTS models with streaming support.
+             prompt (str, optional): Style prompt for Gemini TTS models. Controls tone, style, and speaking characteristics. Only applied to the first input chunk in streaming mode.
+             sample_rate (int, optional): Audio sample rate in Hz. Default is 24000.
+             location (str, optional): Location for the TTS client. Default is "global".
+             pitch (float, optional): Speaking pitch, ranging from -20.0 to 20.0 semitones relative to the original pitch. Default is 0.
+             effects_profile_id (str): Optional identifier for selecting audio effects profiles to apply to the synthesized speech.
+             speaking_rate (float, optional): Speed of speech. Default is 1.0.
+             volume_gain_db (float, optional): Volume gain in decibels, in the range [-96.0, 16.0]. Default is 0.0. It is strongly recommended not to exceed +10 dB.
+             credentials_info (dict, optional): Dictionary containing Google Cloud credentials. Default is None.
+             credentials_file (str, optional): Path to the Google Cloud credentials JSON file. Default is None.
+             tokenizer (tokenize.SentenceTokenizer, optional): Tokenizer for the TTS. Defaults to `livekit.agents.tokenize.blingfire.SentenceTokenizer`.
+             custom_pronunciations (CustomPronunciations, optional): Custom pronunciations for the TTS. Default is None.
+             use_streaming (bool, optional): Whether to use streaming synthesis. Default is True.
+             enable_ssml (bool, optional): Whether to enable SSML support. Default is False.
+             use_markup (bool, optional): Whether to enable markup input for HD voices. Default is False.
+         """  # noqa: E501
+         super().__init__(
+             capabilities=tts.TTSCapabilities(streaming=use_streaming),
+             sample_rate=sample_rate,
+             num_channels=1,
+         )
+
+         if enable_ssml:
+             if use_streaming:
+                 raise ValueError("SSML support is not available for streaming synthesis")
+             if use_markup:
+                 raise ValueError("SSML support is not available for markup input")
+
+         self._client: texttospeech.TextToSpeechAsyncClient | None = None
+         self._credentials_info = credentials_info
+         self._credentials_file = credentials_file
+         self._location = location
+
+         lang = language if is_given(language) else DEFAULT_LANGUAGE
+         ssml_gender = _gender_from_str(DEFAULT_GENDER if not is_given(gender) else gender)
+
+         voice_params = texttospeech.VoiceSelectionParams(
+             language_code=lang,
+             ssml_gender=ssml_gender,
+         )
+         if is_given(model_name):
+             voice_params.model_name = model_name
+         if is_given(voice_cloning_key):
+             voice_params.voice_clone = texttospeech.VoiceCloneParams(
+                 voice_cloning_key=voice_cloning_key,
+             )
+         else:
+             voice_params.name = voice_name if is_given(voice_name) else DEFAULT_VOICE_NAME
+
+         if not is_given(tokenizer):
+             tokenizer = tokenize.blingfire.SentenceTokenizer()
+
+         pronunciations = None if not is_given(custom_pronunciations) else custom_pronunciations
+
+         self._opts = _TTSOptions(
+             voice=voice_params,
+             encoding=audio_encoding,
+             sample_rate=sample_rate,
+             pitch=pitch,
+             effects_profile_id=effects_profile_id,
+             speaking_rate=speaking_rate,
+             tokenizer=tokenizer,
+             volume_gain_db=volume_gain_db,
+             custom_pronunciations=pronunciations,
+             enable_ssml=enable_ssml,
+             use_markup=use_markup,
+             model_name=model_name if is_given(model_name) else None,
+             prompt=prompt if is_given(prompt) else None,
+         )
+         self._streams = weakref.WeakSet[SynthesizeStream]()
+
+     @property
+     def model(self) -> str:
+         return self._opts.model_name or "Chirp3"
+
+     @property
+     def provider(self) -> str:
+         return "Google Cloud Platform"
+
+     def update_options(
+         self,
+         *,
+         language: NotGivenOr[SpeechLanguages | str] = NOT_GIVEN,
+         gender: NotGivenOr[Gender | str] = NOT_GIVEN,
+         voice_name: NotGivenOr[str] = NOT_GIVEN,
+         model_name: NotGivenOr[str] = NOT_GIVEN,
+         prompt: NotGivenOr[str] = NOT_GIVEN,
+         speaking_rate: NotGivenOr[float] = NOT_GIVEN,
+         volume_gain_db: NotGivenOr[float] = NOT_GIVEN,
+     ) -> None:
+         """
+         Update the TTS options.
+
+         Args:
+             language (SpeechLanguages | str, optional): Language code (e.g., "en-US").
+             gender (Gender | str, optional): Voice gender ("male", "female", "neutral").
+             voice_name (str, optional): Specific voice name.
+             model_name (str, optional): Model name for TTS (e.g., "gemini-2.5-flash-tts").
+             prompt (str, optional): Style prompt for Gemini TTS models.
+             speaking_rate (float, optional): Speed of speech.
+             volume_gain_db (float, optional): Volume gain in decibels.
+         """
+         params = {}
+         if is_given(language):
+             params["language_code"] = str(language)
+         if is_given(gender):
+             params["ssml_gender"] = _gender_from_str(str(gender))
+         if is_given(voice_name):
+             params["name"] = voice_name
+         if is_given(model_name):
+             params["model_name"] = model_name
+             self._opts.model_name = model_name
+
+         if params:
+             self._opts.voice = texttospeech.VoiceSelectionParams(**params)
+
+         if is_given(speaking_rate):
+             self._opts.speaking_rate = speaking_rate
+         if is_given(volume_gain_db):
+             self._opts.volume_gain_db = volume_gain_db
+         if is_given(prompt):
+             self._opts.prompt = prompt
+
+     def _ensure_client(self) -> texttospeech.TextToSpeechAsyncClient:
+         api_endpoint = "texttospeech.googleapis.com"
+         if self._location != "global":
+             api_endpoint = f"{self._location}-texttospeech.googleapis.com"
+
+         if self._client is None:
+             if self._credentials_info:
+                 self._client = texttospeech.TextToSpeechAsyncClient.from_service_account_info(
+                     self._credentials_info, client_options=ClientOptions(api_endpoint=api_endpoint)
+                 )
+
+             elif self._credentials_file:
+                 self._client = texttospeech.TextToSpeechAsyncClient.from_service_account_file(
+                     self._credentials_file, client_options=ClientOptions(api_endpoint=api_endpoint)
+                 )
+             else:
+                 self._client = texttospeech.TextToSpeechAsyncClient(
+                     client_options=ClientOptions(api_endpoint=api_endpoint)
+                 )
+
+         assert self._client is not None
+         return self._client
+
+     def stream(
+         self, *, conn_options: APIConnectOptions = DEFAULT_API_CONNECT_OPTIONS
+     ) -> SynthesizeStream:
+         stream = SynthesizeStream(tts=self, conn_options=conn_options)
+         self._streams.add(stream)
+         return stream
+
+     def synthesize(
+         self, text: str, *, conn_options: APIConnectOptions = DEFAULT_API_CONNECT_OPTIONS
+     ) -> ChunkedStream:
+         return ChunkedStream(tts=self, input_text=text, conn_options=conn_options)
+
+     async def aclose(self) -> None:
+         for stream in list(self._streams):
+             await stream.aclose()
+         self._streams.clear()
+
+
+ class ChunkedStream(tts.ChunkedStream):
+     def __init__(self, *, tts: TTS, input_text: str, conn_options: APIConnectOptions) -> None:
+         super().__init__(tts=tts, input_text=input_text, conn_options=conn_options)
+         self._tts: TTS = tts
+         self._opts = replace(tts._opts)
+
+     def _build_ssml(self) -> str:
+         ssml = "<speak>"
+         ssml += self._input_text
+         ssml += "</speak>"
+         return ssml
+
+     async def _run(self, output_emitter: tts.AudioEmitter) -> None:
+         try:
+             if self._opts.use_markup:
+                 tts_input = texttospeech.SynthesisInput(
+                     markup=self._input_text, custom_pronunciations=self._opts.custom_pronunciations
+                 )
+             elif self._opts.enable_ssml:
+                 tts_input = texttospeech.SynthesisInput(
+                     ssml=self._build_ssml(), custom_pronunciations=self._opts.custom_pronunciations
+                 )
+             else:
+                 tts_input = texttospeech.SynthesisInput(
+                     text=self._input_text, custom_pronunciations=self._opts.custom_pronunciations
+                 )
+
+             response: SynthesizeSpeechResponse = await self._tts._ensure_client().synthesize_speech(
+                 input=tts_input,
+                 voice=self._opts.voice,
+                 audio_config=texttospeech.AudioConfig(
+                     audio_encoding=self._opts.encoding,
+                     sample_rate_hertz=self._opts.sample_rate,
+                     pitch=self._opts.pitch,
+                     effects_profile_id=self._opts.effects_profile_id,
+                     speaking_rate=self._opts.speaking_rate,
+                     volume_gain_db=self._opts.volume_gain_db,
+                 ),
+                 timeout=self._conn_options.timeout,
+             )
+
+             output_emitter.initialize(
+                 request_id=utils.shortuuid(),
+                 sample_rate=self._opts.sample_rate,
+                 num_channels=1,
+                 mime_type=_encoding_to_mimetype(self._opts.encoding),
+             )
+
+             output_emitter.push(response.audio_content)
+         except DeadlineExceeded:
+             raise APITimeoutError() from None
+         except GoogleAPICallError as e:
+             raise APIStatusError(e.message, status_code=e.code or -1) from e
+
+
+ class SynthesizeStream(tts.SynthesizeStream):
+     def __init__(self, *, tts: TTS, conn_options: APIConnectOptions):
+         super().__init__(tts=tts, conn_options=conn_options)
+         self._tts: TTS = tts
+         self._opts = replace(tts._opts)
+         self._segments_ch = utils.aio.Chan[tokenize.SentenceStream]()
+
+     async def _run(self, output_emitter: tts.AudioEmitter) -> None:
+         encoding = self._opts.encoding
+         if encoding not in (texttospeech.AudioEncoding.OGG_OPUS, texttospeech.AudioEncoding.PCM):
+             enc_name = texttospeech.AudioEncoding._member_names_[encoding]
+             logger.warning(
+                 f"encoding {enc_name} isn't supported by streaming_synthesize, "
+                 "falling back to PCM"
+             )
+             encoding = texttospeech.AudioEncoding.PCM  # type: ignore
+
+         output_emitter.initialize(
+             request_id=utils.shortuuid(),
+             sample_rate=self._opts.sample_rate,
+             num_channels=1,
+             mime_type=_encoding_to_mimetype(encoding),
+             stream=True,
+         )
+
+         streaming_config = texttospeech.StreamingSynthesizeConfig(
+             voice=self._opts.voice,
+             streaming_audio_config=texttospeech.StreamingAudioConfig(
+                 audio_encoding=encoding,
+                 sample_rate_hertz=self._opts.sample_rate,
+                 speaking_rate=self._opts.speaking_rate,
+             ),
+             custom_pronunciations=self._opts.custom_pronunciations,
+         )
+
+         async def _tokenize_input() -> None:
+             # split incoming text into sentence streams; a flush sentinel closes
+             # the current stream, so each flush becomes its own synthesis segment
+             input_stream = None
+             async for input in self._input_ch:
+                 if isinstance(input, str):
+                     if input_stream is None:
+                         input_stream = self._opts.tokenizer.stream()
+                         self._segments_ch.send_nowait(input_stream)
+                     input_stream.push_text(input)
+                 elif isinstance(input, self._FlushSentinel):
+                     if input_stream:
+                         input_stream.end_input()
+                     input_stream = None
+
+             self._segments_ch.close()
+
+         async def _run_segments() -> None:
+             async for input_stream in self._segments_ch:
+                 await self._run_stream(input_stream, output_emitter, streaming_config)
+
+         tasks = [
+             asyncio.create_task(_tokenize_input()),
+             asyncio.create_task(_run_segments()),
+         ]
+         try:
+             await asyncio.gather(*tasks)
+         finally:
+             await utils.aio.cancel_and_wait(*tasks)
+
+     async def _run_stream(
+         self,
+         input_stream: tokenize.SentenceStream,
+         output_emitter: tts.AudioEmitter,
+         streaming_config: texttospeech.StreamingSynthesizeConfig,
+     ) -> None:
+         @utils.log_exceptions(logger=logger)
+         async def input_generator() -> AsyncGenerator[
+             texttospeech.StreamingSynthesizeRequest, None
+         ]:
+             try:
+                 # the first request of a streaming_synthesize call carries only the
+                 # configuration; every subsequent request carries a piece of input text
+                 yield texttospeech.StreamingSynthesizeRequest(streaming_config=streaming_config)
+
+                 is_first_input = True
+                 async for input in input_stream:
+                     self._mark_started()
+                     # prompt is only supported in the first input chunk (for Gemini TTS)
+                     synthesis_input = texttospeech.StreamingSynthesisInput(
+                         markup=input.token if self._opts.use_markup else None,
+                         text=None if self._opts.use_markup else input.token,
+                         prompt=self._opts.prompt if is_first_input else None,
+                     )
+                     is_first_input = False
+                     yield texttospeech.StreamingSynthesizeRequest(input=synthesis_input)
+
+             except Exception:
+                 logger.exception("an error occurred while streaming input to google TTS")
+
+         input_gen = input_generator()
+         try:
+             stream = await self._tts._ensure_client().streaming_synthesize(
+                 input_gen, timeout=self._conn_options.timeout
+             )
+             output_emitter.start_segment(segment_id=utils.shortuuid())
+
+             async for resp in stream:
+                 output_emitter.push(resp.audio_content)
+
+             output_emitter.end_segment()
+
+         except DeadlineExceeded:
+             raise APITimeoutError() from None
+         except GoogleAPICallError as e:
+             raise APIStatusError(e.message, status_code=e.code or -1) from e
+         finally:
+             await input_gen.aclose()
+
+
+ def _gender_from_str(gender: str) -> SsmlVoiceGender:
+     ssml_gender = SsmlVoiceGender.NEUTRAL
+     if gender == "male":
+         ssml_gender = SsmlVoiceGender.MALE
+     elif gender == "female":
+         ssml_gender = SsmlVoiceGender.FEMALE
+
+     return ssml_gender  # type: ignore
+
+
+ def _encoding_to_mimetype(encoding: texttospeech.AudioEncoding) -> str:
+     if encoding == texttospeech.AudioEncoding.PCM:
+         return "audio/pcm"
+     elif encoding == texttospeech.AudioEncoding.LINEAR16:
+         return "audio/wav"
+     elif encoding == texttospeech.AudioEncoding.MP3:
+         return "audio/mp3"
+     elif encoding == texttospeech.AudioEncoding.OGG_OPUS:
+         return "audio/opus"
+     else:
+         raise RuntimeError(f"encoding {encoding} isn't supported")
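
For orientation, a minimal usage sketch of the TTS class added in this hunk, exercising only the constructor and update_options shown above (hypothetical parameter values; assumes credentials are supplied via the GOOGLE_APPLICATION_CREDENTIALS environment variable, as described in the constructor docstring):

    from livekit.plugins import google

    # credentials resolve from GOOGle_APPLICATION_CREDENTIALS here;
    # credentials_info= or credentials_file= could be passed instead
    tts = google.TTS(
        language="en-US",
        voice_name="en-US-Chirp3-HD-Charon",
        speaking_rate=1.1,
    )

    # options can be adjusted later without recreating the client
    tts.update_options(speaking_rate=0.95, volume_gain_db=-2.0)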