livekit-plugins-google 1.1.5__py3-none-any.whl → 1.1.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of livekit-plugins-google might be problematic. Click here for more details.

@@ -1,6 +1,7 @@
1
1
  from . import realtime
2
+ from .gemini_tts import TTS as GeminiTTS
2
3
 
3
- __all__ = ["realtime"]
4
+ __all__ = ["realtime", "GeminiTTS"]
4
5
 
5
6
  # Cleanup docs of unexported modules
6
7
  _module = dir()
@@ -0,0 +1,247 @@
1
+ from __future__ import annotations
2
+
3
+ import os
4
+ from dataclasses import dataclass
5
+ from typing import Literal
6
+
7
+ from google.genai import Client, types
8
+ from google.genai.errors import APIError, ClientError, ServerError
9
+ from livekit.agents import APIConnectionError, APIStatusError, tts, utils
10
+ from livekit.agents.types import (
11
+ DEFAULT_API_CONNECT_OPTIONS,
12
+ NOT_GIVEN,
13
+ APIConnectOptions,
14
+ NotGivenOr,
15
+ )
16
+ from livekit.agents.utils import is_given
17
+
18
# Known Gemini TTS model identifiers.
GEMINI_TTS_MODELS = Literal[
    "gemini-2.5-flash-preview-tts",
    "gemini-2.5-pro-preview-tts",
]

# Known prebuilt voice names, kept in their original order.
GEMINI_VOICES = Literal[
    "Zephyr", "Puck", "Charon", "Kore", "Fenrir",
    "Leda", "Orus", "Aoede", "Callirrhoe", "Autonoe",
    "Enceladus", "Iapetus", "Umbriel", "Algieba", "Despina",
    "Erinome", "Algenib", "Rasalgethi", "Laomedeia", "Achernar",
    "Alnilam", "Schedar", "Gacrux", "Pulcherrima", "Achird",
    "Zubenelgenubi", "Vindemiatrix", "Sadachbia", "Sadaltager", "Sulafat",
]  # fmt: skip

# Defaults applied when the caller does not override them.
DEFAULT_MODEL = "gemini-2.5-flash-preview-tts"
DEFAULT_VOICE = "Kore"

# Audio format reported to the base TTS class.
DEFAULT_SAMPLE_RATE = 24000  # not configurable
NUM_CHANNELS = 1

# Prompt prefix placed in front of the text to synthesize unless overridden.
DEFAULT_INSTRUCTIONS = "Say the text with a proper tone, don't omit or add any words"
58
+
59
+
60
+ @dataclass
61
+ class _TTSOptions:
62
+ model: GEMINI_TTS_MODELS | str
63
+ voice_name: GEMINI_VOICES | str
64
+ vertexai: bool
65
+ project: str | None
66
+ location: str | None
67
+ instructions: str | None
68
+
69
+
70
class TTS(tts.TTS):
    """Non-streaming text-to-speech that calls Gemini TTS models via ``google.genai``."""

    def __init__(
        self,
        *,
        model: GEMINI_TTS_MODELS | str = DEFAULT_MODEL,
        voice_name: GEMINI_VOICES | str = DEFAULT_VOICE,
        api_key: NotGivenOr[str] = NOT_GIVEN,
        vertexai: NotGivenOr[bool] = NOT_GIVEN,
        project: NotGivenOr[str] = NOT_GIVEN,
        location: NotGivenOr[str] = NOT_GIVEN,
        instructions: NotGivenOr[str | None] = NOT_GIVEN,
    ) -> None:
        """
        Create a new instance of Gemini TTS.

        Environment Requirements:
        - For VertexAI: Set the `GOOGLE_APPLICATION_CREDENTIALS` environment variable to the
          path of the service account key file.
        - For Google Gemini API: Set the `api_key` argument or the `GOOGLE_API_KEY`
          environment variable.

        Args:
            model (str, optional): The Gemini TTS model to use.
                Defaults to "gemini-2.5-flash-preview-tts".
            voice_name (str, optional): The voice to use for synthesis. Defaults to "Kore".
            api_key (str, optional): The API key for Google Gemini. If not provided, it is
                read from the `GOOGLE_API_KEY` environment variable. Ignored for VertexAI.
            vertexai (bool, optional): Whether to use VertexAI. If not provided, it is
                enabled when `GOOGLE_GENAI_USE_VERTEXAI` is "true" or "1" (case-insensitive).
            project (str, optional): The Google Cloud project to use (only for VertexAI).
                Falls back to `GOOGLE_CLOUD_PROJECT`, then to the project reported by the
                default Google credentials.
            location (str, optional): The location to use for VertexAI API requests. Falls
                back to `GOOGLE_CLOUD_LOCATION`, then to "us-central1".
            instructions (str, optional): Control the style, tone, accent, and pace using
                prompts. Defaults to a generic instruction; pass None to send the text
                without a prefix.
                See https://ai.google.dev/gemini-api/docs/speech-generation#controllable

        Raises:
            ValueError: If VertexAI is not used and no API key is available.
        """
        super().__init__(
            capabilities=tts.TTSCapabilities(streaming=False),
            sample_rate=DEFAULT_SAMPLE_RATE,
            num_channels=NUM_CHANNELS,
        )

        # Explicit arguments take precedence; environment variables are the fallback.
        resolved_project: str | None
        if is_given(project):
            resolved_project = project
        else:
            resolved_project = os.environ.get("GOOGLE_CLOUD_PROJECT")

        resolved_location: str | None
        if is_given(location):
            resolved_location = location
        else:
            resolved_location = os.environ.get("GOOGLE_CLOUD_LOCATION") or "us-central1"

        if is_given(vertexai):
            vertex_enabled = vertexai
        else:
            env_flag = os.environ.get("GOOGLE_GENAI_USE_VERTEXAI", "0")
            vertex_enabled = env_flag.lower() in ["true", "1"]

        resolved_key = api_key if is_given(api_key) else os.environ.get("GOOGLE_API_KEY")

        if not vertex_enabled:
            # Gemini API mode: project/location are dropped and a key is mandatory.
            resolved_project = None
            resolved_location = None
            if not resolved_key:
                raise ValueError(
                    "API key is required for Google API either via api_key or GOOGLE_API_KEY environment variable"  # noqa: E501
                )
        else:
            if not resolved_project:
                # Imported lazily so it is only loaded when a project must be looked up.
                from google.auth._default_async import default_async

                _, resolved_project = default_async(  # type: ignore
                    scopes=["https://www.googleapis.com/auth/cloud-platform"]
                )
            resolved_key = None  # VertexAI does not require an API key

        self._opts = _TTSOptions(
            model=model,
            voice_name=voice_name,
            vertexai=vertex_enabled,
            project=resolved_project,
            location=resolved_location,
            instructions=instructions if is_given(instructions) else DEFAULT_INSTRUCTIONS,
        )

        self._client = Client(
            api_key=resolved_key,
            vertexai=vertex_enabled,
            project=resolved_project,
            location=resolved_location,
        )

    def synthesize(
        self, text: str, *, conn_options: APIConnectOptions = DEFAULT_API_CONNECT_OPTIONS
    ) -> ChunkedStream:
        """Return a ``ChunkedStream`` that will synthesize ``text`` with this instance."""
        stream = ChunkedStream(tts=self, input_text=text, conn_options=conn_options)
        return stream

    def update_options(
        self,
        *,
        voice_name: NotGivenOr[str] = NOT_GIVEN,
    ) -> None:
        """
        Update the TTS options.

        Args:
            voice_name (str, optional): The voice to use for synthesis. Left unchanged
                when not given.
        """
        if not is_given(voice_name):
            return
        self._opts.voice_name = voice_name
169
+
170
+
171
class ChunkedStream(tts.ChunkedStream):
    """Single-request synthesis: one generate-content call, audio parts pushed as returned."""

    def __init__(self, *, tts: TTS, input_text: str, conn_options: APIConnectOptions) -> None:
        """Store the owning ``TTS`` so ``_run`` can read its options and client."""
        super().__init__(tts=tts, input_text=input_text, conn_options=conn_options)
        self._tts: TTS = tts

    async def _run(self, output_emitter: tts.AudioEmitter) -> None:
        """
        Request speech for the input text and push the returned audio to ``output_emitter``.

        Raises:
            APIStatusError: For ``google.genai`` client, server and generic API errors.
                Client errors are only marked retryable for status code 429.
            APIConnectionError: For any other exception raised while generating speech.
        """
        try:
            config = types.GenerateContentConfig(
                response_modalities=["AUDIO"],
                speech_config=types.SpeechConfig(
                    voice_config=types.VoiceConfig(
                        prebuilt_voice_config=types.PrebuiltVoiceConfig(
                            voice_name=self._tts._opts.voice_name,
                        )
                    )
                ),
            )

            # Instructions, when set, are prepended and the text itself is quoted.
            input_text = self._input_text
            if self._tts._opts.instructions is not None:
                input_text = f'{self._tts._opts.instructions}:\n"{input_text}"'

            response = await self._tts._client.aio.models.generate_content(
                model=self._tts._opts.model,
                contents=input_text,
                config=config,
            )

            output_emitter.initialize(
                request_id=utils.shortuuid(),
                sample_rate=self._tts.sample_rate,
                num_channels=self._tts.num_channels,
                mime_type="audio/pcm",
            )

            if (
                not response.candidates
                or not (content := response.candidates[0].content)
                or not content.parts
            ):
                # NOTE(review): this is raised inside the ``try`` and does not look like a
                # google.genai error, so it appears to be re-wrapped as APIConnectionError
                # by the generic handler below - confirm that is intended.
                raise APIStatusError("No audio content generated")

            # Only inline parts carrying an audio mime type are forwarded.
            for part in content.parts:
                if (
                    (inline_data := part.inline_data)
                    and inline_data.data
                    and inline_data.mime_type
                    and inline_data.mime_type.startswith("audio/")
                ):
                    # mime_type: audio/L16;codec=pcm;rate=24000
                    output_emitter.push(inline_data.data)

        except ClientError as e:
            raise APIStatusError(
                "gemini tts: client error",
                status_code=e.code,
                body=f"{e.message} {e.status}",
                # 429 is the only client error treated as retryable.
                retryable=e.code == 429,
            ) from e
        except ServerError as e:
            raise APIStatusError(
                "gemini tts: server error",
                status_code=e.code,
                body=f"{e.message} {e.status}",
                retryable=True,
            ) from e
        except APIError as e:
            raise APIStatusError(
                "gemini tts: api error",
                status_code=e.code,
                body=f"{e.message} {e.status}",
                retryable=True,
            ) from e
        except Exception as e:
            raise APIConnectionError(
                f"gemini tts: error generating speech {e}",
                retryable=True,
            ) from e
@@ -13,7 +13,7 @@ LiveAPIModels = Literal[
13
13
  "gemini-2.5-flash-exp-native-audio-thinking-dialog",
14
14
  ]
15
15
 
16
- Voice = Literal["Puck", "Charon", "Kore", "Fenrir", "Aoede", "Leda", "Oru", "Zephyr"]
16
+ Voice = Literal["Puck", "Charon", "Kore", "Fenrir", "Aoede", "Leda", "Orus", "Zephyr"]
17
17
 
18
18
 
19
19
  ClientEvents = Union[
@@ -937,7 +937,6 @@ class RealtimeSession(llm.RealtimeSession):
937
937
  arguments=arguments,
938
938
  )
939
939
  )
940
- self._on_final_input_audio_transcription()
941
940
  self._mark_current_generation_done()
942
941
 
943
942
  def _handle_tool_call_cancellation(
@@ -1018,15 +1017,6 @@ class RealtimeSession(llm.RealtimeSession):
1018
1017
  # TODO(dz): this isn't a seamless reconnection just yet
1019
1018
  self._session_should_close.set()
1020
1019
 
1021
- def _on_final_input_audio_transcription(self) -> None:
1022
- if (gen := self._current_generation) and gen.input_transcription:
1023
- self.emit(
1024
- "input_audio_transcription_completed",
1025
- llm.InputTranscriptionCompleted(
1026
- item_id=gen.response_id, transcript=gen.input_transcription, is_final=True
1027
- ),
1028
- )
1029
-
1030
1020
  def commit_audio(self) -> None:
1031
1021
  pass
1032
1022
 
@@ -22,7 +22,11 @@ from dataclasses import dataclass, replace
22
22
  from google.api_core.client_options import ClientOptions
23
23
  from google.api_core.exceptions import DeadlineExceeded, GoogleAPICallError
24
24
  from google.cloud import texttospeech
25
- from google.cloud.texttospeech_v1.types import SsmlVoiceGender, SynthesizeSpeechResponse
25
+ from google.cloud.texttospeech_v1.types import (
26
+ CustomPronunciations,
27
+ SsmlVoiceGender,
28
+ SynthesizeSpeechResponse,
29
+ )
26
30
  from livekit.agents import APIConnectOptions, APIStatusError, APITimeoutError, tokenize, tts, utils
27
31
  from livekit.agents.types import DEFAULT_API_CONNECT_OPTIONS, NOT_GIVEN, NotGivenOr
28
32
  from livekit.agents.utils import is_given
@@ -47,6 +51,7 @@ class _TTSOptions:
47
51
  speaking_rate: float
48
52
  tokenizer: tokenize.SentenceTokenizer
49
53
  volume_gain_db: float
54
+ custom_pronunciations: CustomPronunciations | None
50
55
  enable_ssml: bool
51
56
 
52
57
 
@@ -67,6 +72,7 @@ class TTS(tts.TTS):
67
72
  credentials_info: NotGivenOr[dict] = NOT_GIVEN,
68
73
  credentials_file: NotGivenOr[str] = NOT_GIVEN,
69
74
  tokenizer: NotGivenOr[tokenize.SentenceTokenizer] = NOT_GIVEN,
75
+ custom_pronunciations: NotGivenOr[CustomPronunciations] = NOT_GIVEN,
70
76
  use_streaming: bool = True,
71
77
  enable_ssml: bool = False,
72
78
  ) -> None:
@@ -90,6 +96,7 @@ class TTS(tts.TTS):
90
96
  credentials_info (dict, optional): Dictionary containing Google Cloud credentials. Default is None.
91
97
  credentials_file (str, optional): Path to the Google Cloud credentials JSON file. Default is None.
92
98
  tokenizer (tokenize.SentenceTokenizer, optional): Tokenizer for the TTS. Default is a basic sentence tokenizer.
99
+ custom_pronunciations (CustomPronunciations, optional): Custom pronunciations for the TTS. Default is None.
93
100
  use_streaming (bool, optional): Whether to use streaming synthesis. Default is True.
94
101
  enable_ssml (bool, optional): Whether to enable SSML support. Default is False.
95
102
  """ # noqa: E501
@@ -119,6 +126,8 @@ class TTS(tts.TTS):
119
126
  if not is_given(tokenizer):
120
127
  tokenizer = tokenize.basic.SentenceTokenizer(min_sentence_len=BUFFERED_WORDS_COUNT)
121
128
 
129
+ pronunciations = None if not is_given(custom_pronunciations) else custom_pronunciations
130
+
122
131
  self._opts = _TTSOptions(
123
132
  voice=voice_params,
124
133
  encoding=audio_encoding,
@@ -128,6 +137,7 @@ class TTS(tts.TTS):
128
137
  speaking_rate=speaking_rate,
129
138
  tokenizer=tokenizer,
130
139
  volume_gain_db=volume_gain_db,
140
+ custom_pronunciations=pronunciations,
131
141
  enable_ssml=enable_ssml,
132
142
  )
133
143
  self._streams = weakref.WeakSet[SynthesizeStream]()
@@ -223,9 +233,15 @@ class ChunkedStream(tts.ChunkedStream):
223
233
  async def _run(self, output_emitter: tts.AudioEmitter) -> None:
224
234
  try:
225
235
  input = (
226
- texttospeech.SynthesisInput(ssml=self._build_ssml())
236
+ texttospeech.SynthesisInput(
237
+ ssml=self._build_ssml(),
238
+ custom_pronunciations=self._opts.custom_pronunciations,
239
+ )
227
240
  if self._opts.enable_ssml
228
- else texttospeech.SynthesisInput(text=self._input_text)
241
+ else texttospeech.SynthesisInput(
242
+ text=self._input_text,
243
+ custom_pronunciations=self._opts.custom_pronunciations,
244
+ )
229
245
  )
230
246
  response: SynthesizeSpeechResponse = await self._tts._ensure_client().synthesize_speech(
231
247
  input=input,
@@ -287,6 +303,7 @@ class SynthesizeStream(tts.SynthesizeStream):
287
303
  sample_rate_hertz=self._opts.sample_rate,
288
304
  speaking_rate=self._opts.speaking_rate,
289
305
  ),
306
+ custom_pronunciations=self._opts.custom_pronunciations,
290
307
  )
291
308
 
292
309
  async def _tokenize_input() -> None:
@@ -12,4 +12,4 @@
12
12
  # See the License for the specific language governing permissions and
13
13
  # limitations under the License.
14
14
 
15
- __version__ = "1.1.5"
15
+ __version__ = "1.1.7"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: livekit-plugins-google
3
- Version: 1.1.5
3
+ Version: 1.1.7
4
4
  Summary: Agent Framework plugin for services from Google Cloud
5
5
  Project-URL: Documentation, https://docs.livekit.io
6
6
  Project-URL: Website, https://livekit.io/
@@ -22,7 +22,7 @@ Requires-Dist: google-auth<3,>=2
22
22
  Requires-Dist: google-cloud-speech<3,>=2
23
23
  Requires-Dist: google-cloud-texttospeech<3,>=2.27
24
24
  Requires-Dist: google-genai>=v1.23.0
25
- Requires-Dist: livekit-agents>=1.1.5
25
+ Requires-Dist: livekit-agents>=1.1.7
26
26
  Description-Content-Type: text/markdown
27
27
 
28
28
  # Google AI plugin for LiveKit Agents
@@ -5,13 +5,14 @@ livekit/plugins/google/models.py,sha256=hOpfbN_qdQ1ZTpCN9m9dvG2eb6WgQ3KE3WRpIeeM
5
5
  livekit/plugins/google/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
6
6
  livekit/plugins/google/stt.py,sha256=ssDMH5U1vQOLA44XMlovYWIR4UqVtZSge3YFN-zZ7Iw,24696
7
7
  livekit/plugins/google/tools.py,sha256=tD5HVDHO5JfUF029Cx3axHMJec0Gxalkl7s1FDgxLzI,259
8
- livekit/plugins/google/tts.py,sha256=YTfce55MWNJyDH4k8U1O2giOcrtccTs8vrkiW9GuBR0,15541
8
+ livekit/plugins/google/tts.py,sha256=QVM4xcF7WHpbQOZDAhRJrz481iMhO9ACjjqPEdTT4Lw,16277
9
9
  livekit/plugins/google/utils.py,sha256=6iihkKx76DDtLiHOoTU2ZXqzupBRY_gN3njpnwdmeqY,8829
10
- livekit/plugins/google/version.py,sha256=OKtayGMVDYKyoKBO2yNM4kfRbH-PODJqECIiYhUzNWg,600
11
- livekit/plugins/google/beta/__init__.py,sha256=5PnoG3Ux24bjzMSzmTeSVljE9EINivGcbWUEV6egGnM,216
10
+ livekit/plugins/google/version.py,sha256=EcBB23XE8aEiF7xHMivcb9wptFeYkGB1WNGSn1bIV3A,600
11
+ livekit/plugins/google/beta/__init__.py,sha256=RvAUdvEiRN-fe4JrgPcN0Jkw1kZR9wPerGMFVjS1Cc0,270
12
+ livekit/plugins/google/beta/gemini_tts.py,sha256=esWjr0Xf95tl0_AB7MXiFZ_VCORWgcWjzvLvRa3t0FQ,8515
12
13
  livekit/plugins/google/beta/realtime/__init__.py,sha256=_fW2NMN22F-hnQ4xAJ_g5lPbR7CvM_xXzSWlUQY-E-U,188
13
- livekit/plugins/google/beta/realtime/api_proto.py,sha256=NfE7xr2N3JOu7gVfWbAmDcEhs8vuZgMRu5vpScPJzsg,776
14
- livekit/plugins/google/beta/realtime/realtime_api.py,sha256=tlAsTFsumqOavC9JT2SuQi_3eGYygZ3bbS-nEM7ea8Q,46293
15
- livekit_plugins_google-1.1.5.dist-info/METADATA,sha256=g6aRR1VIspmPtZ2C6VQ-cqZWx1gIpLtg4OFV1pbD01E,1907
16
- livekit_plugins_google-1.1.5.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
17
- livekit_plugins_google-1.1.5.dist-info/RECORD,,
14
+ livekit/plugins/google/beta/realtime/api_proto.py,sha256=cbKmpX32G4gPjF6cxFNzGEDfYX19SK-vWi4Myxb8Yks,777
15
+ livekit/plugins/google/beta/realtime/realtime_api.py,sha256=nqiDiAtyHYFRd_Or1Y_95syjHyAVFjaEYTka0qPfXdE,45853
16
+ livekit_plugins_google-1.1.7.dist-info/METADATA,sha256=yG5QbYo-vfSQQ4oyHXrima24mYz9K1sFfT8Bkx6Yh2A,1907
17
+ livekit_plugins_google-1.1.7.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
18
+ livekit_plugins_google-1.1.7.dist-info/RECORD,,