livekit-plugins-google 1.1.6__py3-none-any.whl → 1.2.0__py3-none-any.whl

This diff shows the contents of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as published in that registry.

Potentially problematic release.

This version of livekit-plugins-google might be problematic.

@@ -1,6 +1,7 @@
 from . import realtime
+from .gemini_tts import TTS as GeminiTTS

-__all__ = ["realtime"]
+__all__ = ["realtime", "GeminiTTS"]

 # Cleanup docs of unexported modules
 _module = dir()
@@ -0,0 +1,247 @@
+from __future__ import annotations
+
+import os
+from dataclasses import dataclass
+from typing import Literal
+
+from google.genai import Client, types
+from google.genai.errors import APIError, ClientError, ServerError
+from livekit.agents import APIConnectionError, APIStatusError, tts, utils
+from livekit.agents.types import (
+    DEFAULT_API_CONNECT_OPTIONS,
+    NOT_GIVEN,
+    APIConnectOptions,
+    NotGivenOr,
+)
+from livekit.agents.utils import is_given
+
+GEMINI_TTS_MODELS = Literal["gemini-2.5-flash-preview-tts", "gemini-2.5-pro-preview-tts"]
+GEMINI_VOICES = Literal[
+    "Zephyr",
+    "Puck",
+    "Charon",
+    "Kore",
+    "Fenrir",
+    "Leda",
+    "Orus",
+    "Aoede",
+    "Callirrhoe",
+    "Autonoe",
+    "Enceladus",
+    "Iapetus",
+    "Umbriel",
+    "Algieba",
+    "Despina",
+    "Erinome",
+    "Algenib",
+    "Rasalgethi",
+    "Laomedeia",
+    "Achernar",
+    "Alnilam",
+    "Schedar",
+    "Gacrux",
+    "Pulcherrima",
+    "Achird",
+    "Zubenelgenubi",
+    "Vindemiatrix",
+    "Sadachbia",
+    "Sadaltager",
+    "Sulafat",
+]
+
+DEFAULT_MODEL = "gemini-2.5-flash-preview-tts"
+DEFAULT_VOICE = "Kore"
+DEFAULT_SAMPLE_RATE = 24000  # not configurable
+NUM_CHANNELS = 1
+
+DEFAULT_INSTRUCTIONS = "Say the text with a proper tone, don't omit or add any words"
+
+
+@dataclass
+class _TTSOptions:
+    model: GEMINI_TTS_MODELS | str
+    voice_name: GEMINI_VOICES | str
+    vertexai: bool
+    project: str | None
+    location: str | None
+    instructions: str | None
+
+
+class TTS(tts.TTS):
+    def __init__(
+        self,
+        *,
+        model: GEMINI_TTS_MODELS | str = DEFAULT_MODEL,
+        voice_name: GEMINI_VOICES | str = DEFAULT_VOICE,
+        api_key: NotGivenOr[str] = NOT_GIVEN,
+        vertexai: NotGivenOr[bool] = NOT_GIVEN,
+        project: NotGivenOr[str] = NOT_GIVEN,
+        location: NotGivenOr[str] = NOT_GIVEN,
+        instructions: NotGivenOr[str | None] = NOT_GIVEN,
+    ) -> None:
+        """
+        Create a new instance of Gemini TTS.
+
+        Environment Requirements:
+        - For VertexAI: Set the `GOOGLE_APPLICATION_CREDENTIALS` environment variable to the path of the service account key file.
+        - For Google Gemini API: Set the `api_key` argument or the `GOOGLE_API_KEY` environment variable.
+
+        Args:
+            model (str, optional): The Gemini TTS model to use. Defaults to "gemini-2.5-flash-preview-tts".
+            voice_name (str, optional): The voice to use for synthesis. Defaults to "Kore".
+            api_key (str, optional): The API key for Google Gemini. If not provided, it attempts to read from the `GOOGLE_API_KEY` environment variable.
+            vertexai (bool, optional): Whether to use VertexAI. Defaults to False.
+            project (str, optional): The Google Cloud project to use (only for VertexAI).
+            location (str, optional): The location to use for VertexAI API requests. Defaults to "us-central1".
+            instructions (str, optional): Control the style, tone, accent, and pace using prompts. See https://ai.google.dev/gemini-api/docs/speech-generation#controllable
+        """  # noqa: E501
+        super().__init__(
+            capabilities=tts.TTSCapabilities(streaming=False),
+            sample_rate=DEFAULT_SAMPLE_RATE,
+            num_channels=NUM_CHANNELS,
+        )
+
+        gcp_project: str | None = (
+            project if is_given(project) else os.environ.get("GOOGLE_CLOUD_PROJECT")
+        )
+        gcp_location: str | None = (
+            location
+            if is_given(location)
+            else os.environ.get("GOOGLE_CLOUD_LOCATION") or "us-central1"
+        )
+        use_vertexai = (
+            vertexai
+            if is_given(vertexai)
+            else os.environ.get("GOOGLE_GENAI_USE_VERTEXAI", "0").lower() in ["true", "1"]
+        )
+        gemini_api_key = api_key if is_given(api_key) else os.environ.get("GOOGLE_API_KEY")
+
+        if use_vertexai:
+            if not gcp_project:
+                from google.auth._default_async import default_async
+
+                _, gcp_project = default_async(  # type: ignore
+                    scopes=["https://www.googleapis.com/auth/cloud-platform"]
+                )
+            gemini_api_key = None  # VertexAI does not require an API key
+        else:
+            gcp_project = None
+            gcp_location = None
+            if not gemini_api_key:
+                raise ValueError(
+                    "API key is required for Google API either via api_key or GOOGLE_API_KEY environment variable"  # noqa: E501
+                )
+
+        self._opts = _TTSOptions(
+            model=model,
+            voice_name=voice_name,
+            vertexai=use_vertexai,
+            project=gcp_project,
+            location=gcp_location,
+            instructions=instructions if is_given(instructions) else DEFAULT_INSTRUCTIONS,
+        )
+
+        self._client = Client(
+            api_key=gemini_api_key,
+            vertexai=use_vertexai,
+            project=gcp_project,
+            location=gcp_location,
+        )
+
+    def synthesize(
+        self, text: str, *, conn_options: APIConnectOptions = DEFAULT_API_CONNECT_OPTIONS
+    ) -> ChunkedStream:
+        return ChunkedStream(tts=self, input_text=text, conn_options=conn_options)
+
+    def update_options(
+        self,
+        *,
+        voice_name: NotGivenOr[str] = NOT_GIVEN,
+    ) -> None:
+        """
+        Update the TTS options.
+
+        Args:
+            voice_name (str, optional): The voice to use for synthesis.
+        """
+        if is_given(voice_name):
+            self._opts.voice_name = voice_name
+
+
+class ChunkedStream(tts.ChunkedStream):
+    def __init__(self, *, tts: TTS, input_text: str, conn_options: APIConnectOptions) -> None:
+        super().__init__(tts=tts, input_text=input_text, conn_options=conn_options)
+        self._tts: TTS = tts
+
+    async def _run(self, output_emitter: tts.AudioEmitter) -> None:
+        try:
+            config = types.GenerateContentConfig(
+                response_modalities=["AUDIO"],
+                speech_config=types.SpeechConfig(
+                    voice_config=types.VoiceConfig(
+                        prebuilt_voice_config=types.PrebuiltVoiceConfig(
+                            voice_name=self._tts._opts.voice_name,
+                        )
+                    )
+                ),
+            )
+            input_text = self._input_text
+            if self._tts._opts.instructions is not None:
+                input_text = f'{self._tts._opts.instructions}:\n"{input_text}"'
+
+            response = await self._tts._client.aio.models.generate_content(
+                model=self._tts._opts.model,
+                contents=input_text,
+                config=config,
+            )
+
+            output_emitter.initialize(
+                request_id=utils.shortuuid(),
+                sample_rate=self._tts.sample_rate,
+                num_channels=self._tts.num_channels,
+                mime_type="audio/pcm",
+            )
+
+            if (
+                not response.candidates
+                or not (content := response.candidates[0].content)
+                or not content.parts
+            ):
+                raise APIStatusError("No audio content generated")
+
+            for part in content.parts:
+                if (
+                    (inline_data := part.inline_data)
+                    and inline_data.data
+                    and inline_data.mime_type
+                    and inline_data.mime_type.startswith("audio/")
+                ):
+                    # mime_type: audio/L16;codec=pcm;rate=24000
+                    output_emitter.push(inline_data.data)
+
+        except ClientError as e:
+            raise APIStatusError(
+                "gemini tts: client error",
+                status_code=e.code,
+                body=f"{e.message} {e.status}",
+                retryable=False if e.code != 429 else True,
+            ) from e
+        except ServerError as e:
+            raise APIStatusError(
+                "gemini tts: server error",
+                status_code=e.code,
+                body=f"{e.message} {e.status}",
+                retryable=True,
+            ) from e
+        except APIError as e:
+            raise APIStatusError(
+                "gemini tts: api error",
+                status_code=e.code,
+                body=f"{e.message} {e.status}",
+                retryable=True,
+            ) from e
+        except Exception as e:
+            raise APIConnectionError(
+                f"gemini tts: error generating speech {str(e)}",
+                retryable=True,
+            ) from e
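The new beta module above (livekit/plugins/google/beta/gemini_tts.py in the RECORD at the end of this diff) adds a non-streaming Gemini TTS wrapper around google-genai. A minimal usage sketch follows, assuming GOOGLE_API_KEY is set and that the result is consumed with the standard livekit-agents chunked-stream iteration; the iteration and event shape are not part of this diff, only the constructor, update_options, and synthesize signatures are.

    import asyncio

    from livekit.plugins.google.beta import GeminiTTS


    async def main() -> None:
        # Constructor arguments mirror the __init__ signature shown in the diff above;
        # api_key falls back to the GOOGLE_API_KEY environment variable when omitted.
        gemini_tts = GeminiTTS(
            model="gemini-2.5-flash-preview-tts",
            voice_name="Kore",
            instructions="Speak in a calm, measured tone",
        )

        # Voices can be swapped at runtime without rebuilding the plugin.
        gemini_tts.update_options(voice_name="Puck")

        # synthesize() returns a ChunkedStream; the audio is 24 kHz mono PCM
        # (DEFAULT_SAMPLE_RATE is fixed by the plugin).
        async for audio in gemini_tts.synthesize("Hello from the Gemini TTS preview."):
            print("received", audio.frame.samples_per_channel, "samples")


    asyncio.run(main())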
@@ -52,7 +52,7 @@ class _RealtimeOptions:
     api_key: str | None
     voice: Voice | str
     language: NotGivenOr[str]
-    response_modalities: NotGivenOr[list[types.Modality]]
+    response_modalities: list[types.Modality]
     vertexai: bool
     project: str | None
     location: str | None
@@ -192,12 +192,15 @@ class RealtimeModel(llm.RealtimeModel):
         ):
             server_turn_detection = False

+        modalities = modalities if is_given(modalities) else [types.Modality.AUDIO]
+
         super().__init__(
             capabilities=llm.RealtimeCapabilities(
                 message_truncation=False,
                 turn_detection=server_turn_detection,
                 user_transcription=input_audio_transcription is not None,
                 auto_tool_reply_generation=True,
+                audio_output=types.Modality.AUDIO in modalities,
            )
        )

@@ -538,7 +541,9 @@ class RealtimeSession(llm.RealtimeSession):
             return
         self.start_user_activity()

-    def truncate(self, *, message_id: str, audio_end_ms: int) -> None:
+    def truncate(
+        self, *, message_id: str, audio_end_ms: int, audio_transcript: NotGivenOr[str] = NOT_GIVEN
+    ) -> None:
         logger.warning("truncate is not supported by the Google Realtime API.")
         pass

@@ -732,9 +737,7 @@ class RealtimeSession(llm.RealtimeSession):
             gemini_tools=self._opts.gemini_tools if is_given(self._opts.gemini_tools) else None,
         )
         conf = types.LiveConnectConfig(
-            response_modalities=self._opts.response_modalities
-            if is_given(self._opts.response_modalities)
-            else [types.Modality.AUDIO],
+            response_modalities=self._opts.response_modalities,
             generation_config=types.GenerationConfig(
                 candidate_count=self._opts.candidate_count,
                 temperature=temp,
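With the realtime hunks above, response_modalities is resolved once in RealtimeModel.__init__ (defaulting to audio) and LiveConnectConfig simply reuses it; the derived audio_output capability is what a later hunk uses to close a generation's audio channel when audio is disabled. A hedged sketch of the resulting behaviour, assuming the usual RealtimeModel import path and that Google credentials are configured (neither is shown in these hunks):

    from google.genai import types
    from livekit.plugins.google.beta.realtime import RealtimeModel

    # Omitting `modalities` now resolves to [types.Modality.AUDIO].
    audio_model = RealtimeModel()
    assert audio_model.capabilities.audio_output is True

    # A text-only session reports audio_output=False, so each generation's
    # audio channel is closed immediately instead of staying open and empty.
    text_model = RealtimeModel(modalities=[types.Modality.TEXT])
    assert text_model.capabilities.audio_output is False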
@@ -793,6 +796,8 @@ class RealtimeSession(llm.RealtimeSession):
             audio_ch=utils.aio.Chan[rtc.AudioFrame](),
             _created_timestamp=time.time(),
         )
+        if not self._realtime_model.capabilities.audio_output:
+            self._current_generation.audio_ch.close()

         self._current_generation.message_ch.send_nowait(
             llm.MessageGeneration(
@@ -937,7 +942,6 @@ class RealtimeSession(llm.RealtimeSession):
                     arguments=arguments,
                 )
             )
-        self._on_final_input_audio_transcription()
         self._mark_current_generation_done()

     def _handle_tool_call_cancellation(
@@ -1018,15 +1022,6 @@ class RealtimeSession(llm.RealtimeSession):
         # TODO(dz): this isn't a seamless reconnection just yet
         self._session_should_close.set()

-    def _on_final_input_audio_transcription(self) -> None:
-        if (gen := self._current_generation) and gen.input_transcription:
-            self.emit(
-                "input_audio_transcription_completed",
-                llm.InputTranscriptionCompleted(
-                    item_id=gen.response_id, transcript=gen.input_transcription, is_final=True
-                ),
-            )
-
     def commit_audio(self) -> None:
         pass

@@ -187,6 +187,10 @@ class LLM(llm.LLM):
             location=gcp_location,
         )

+    @property
+    def model(self) -> str:
+        return self._opts.model
+
     def chat(
         self,
         *,
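The llm.py hunk above adds a read-only model property that surfaces the configured model name. A short sketch, assuming Google credentials are configured; the constructor defaults are not part of this diff, so the model name here is purely illustrative:

    from livekit.plugins import google

    llm = google.LLM(model="gemini-2.0-flash-001")  # illustrative model name
    assert llm.model == "gemini-2.0-flash-001"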
@@ -34,7 +34,6 @@ from livekit.agents.utils import is_given
 from .log import logger
 from .models import Gender, SpeechLanguages

-BUFFERED_WORDS_COUNT = 8
 NUM_CHANNELS = 1
 DEFAULT_VOICE_NAME = "en-US-Chirp3-HD-Charon"
 DEFAULT_LANGUAGE = "en-US"
@@ -124,7 +123,7 @@ class TTS(tts.TTS):
             ssml_gender=ssml_gender,
         )
         if not is_given(tokenizer):
-            tokenizer = tokenize.basic.SentenceTokenizer(min_sentence_len=BUFFERED_WORDS_COUNT)
+            tokenizer = tokenize.blingfire.SentenceTokenizer()

         pronunciations = None if not is_given(custom_pronunciations) else custom_pronunciations

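The tts.py hunks above change the default sentence tokenizer from the basic tokenizer with an 8-word buffer to blingfire. A hedged sketch of pinning the previous behaviour via the existing tokenizer argument (visible in the context lines above), assuming Google Cloud TTS credentials are configured:

    from livekit.agents import tokenize
    from livekit.plugins import google

    # Restore the pre-1.2.0 default explicitly; otherwise the plugin now uses
    # tokenize.blingfire.SentenceTokenizer() for sentence splitting.
    tts = google.TTS(
        tokenizer=tokenize.basic.SentenceTokenizer(min_sentence_len=8),
    )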
@@ -12,4 +12,4 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-__version__ = "1.1.6"
+__version__ = "1.2.0"
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: livekit-plugins-google
-Version: 1.1.6
+Version: 1.2.0
 Summary: Agent Framework plugin for services from Google Cloud
 Project-URL: Documentation, https://docs.livekit.io
 Project-URL: Website, https://livekit.io/
@@ -22,7 +22,7 @@ Requires-Dist: google-auth<3,>=2
 Requires-Dist: google-cloud-speech<3,>=2
 Requires-Dist: google-cloud-texttospeech<3,>=2.27
 Requires-Dist: google-genai>=v1.23.0
-Requires-Dist: livekit-agents>=1.1.6
+Requires-Dist: livekit-agents>=1.2.0
 Description-Content-Type: text/markdown

 # Google AI plugin for LiveKit Agents
@@ -1,17 +1,18 @@
 livekit/plugins/google/__init__.py,sha256=XIyZ-iFnRBpaLtOJgVwojlB-a8GjdDugVFcjBpMEww8,1412
-livekit/plugins/google/llm.py,sha256=PqRQk6E2XfWkTdDrtEtcHjknGZMGbkQgVLr8uTg7F-s,18960
+livekit/plugins/google/llm.py,sha256=VmM-OEDRplHEYEVHh9rq9rI180yP7xvu_JTI2zFolbY,19035
 livekit/plugins/google/log.py,sha256=GI3YWN5YzrafnUccljzPRS_ZALkMNk1i21IRnTl2vNA,69
 livekit/plugins/google/models.py,sha256=hOpfbN_qdQ1ZTpCN9m9dvG2eb6WgQ3KE3WRpIeeM_T0,1569
 livekit/plugins/google/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 livekit/plugins/google/stt.py,sha256=ssDMH5U1vQOLA44XMlovYWIR4UqVtZSge3YFN-zZ7Iw,24696
 livekit/plugins/google/tools.py,sha256=tD5HVDHO5JfUF029Cx3axHMJec0Gxalkl7s1FDgxLzI,259
-livekit/plugins/google/tts.py,sha256=QVM4xcF7WHpbQOZDAhRJrz481iMhO9ACjjqPEdTT4Lw,16277
+livekit/plugins/google/tts.py,sha256=SODcGwteJIpGmuFArVRwuwy49k8-uQXJAIK5wBNiMC8,16219
 livekit/plugins/google/utils.py,sha256=6iihkKx76DDtLiHOoTU2ZXqzupBRY_gN3njpnwdmeqY,8829
-livekit/plugins/google/version.py,sha256=-bNd31cMcYCdhZCIKJ1-jtY4NgZvppVgKyzXAIzQtqM,600
-livekit/plugins/google/beta/__init__.py,sha256=5PnoG3Ux24bjzMSzmTeSVljE9EINivGcbWUEV6egGnM,216
+livekit/plugins/google/version.py,sha256=K1_77sNd00TXwPoLo9GplK9QYgMVj_EQWf1hyvhbWYI,600
+livekit/plugins/google/beta/__init__.py,sha256=RvAUdvEiRN-fe4JrgPcN0Jkw1kZR9wPerGMFVjS1Cc0,270
+livekit/plugins/google/beta/gemini_tts.py,sha256=esWjr0Xf95tl0_AB7MXiFZ_VCORWgcWjzvLvRa3t0FQ,8515
 livekit/plugins/google/beta/realtime/__init__.py,sha256=_fW2NMN22F-hnQ4xAJ_g5lPbR7CvM_xXzSWlUQY-E-U,188
 livekit/plugins/google/beta/realtime/api_proto.py,sha256=cbKmpX32G4gPjF6cxFNzGEDfYX19SK-vWi4Myxb8Yks,777
-livekit/plugins/google/beta/realtime/realtime_api.py,sha256=tlAsTFsumqOavC9JT2SuQi_3eGYygZ3bbS-nEM7ea8Q,46293
-livekit_plugins_google-1.1.6.dist-info/METADATA,sha256=mxBww_GRBSqdB7Djd5iZYZmik-gDD2wtTRS3i0qRbDs,1907
-livekit_plugins_google-1.1.6.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
-livekit_plugins_google-1.1.6.dist-info/RECORD,,
+livekit/plugins/google/beta/realtime/realtime_api.py,sha256=LMUOhmv4KWTI6mrLAKnnRdZ72ouug69a_oULGKvoMvw,46073
+livekit_plugins_google-1.2.0.dist-info/METADATA,sha256=Vgf6Wp4oeos1kobtO3Juzs2WTPQBgeXBHKcaF8GRR78,1907
+livekit_plugins_google-1.2.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+livekit_plugins_google-1.2.0.dist-info/RECORD,,