livekit-plugins-google 1.1.0__py3-none-any.whl → 1.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -68,6 +68,7 @@ class _RealtimeOptions:
68
68
  output_audio_transcription: types.AudioTranscriptionConfig | None
69
69
  image_encode_options: NotGivenOr[images.EncodeOptions]
70
70
  conn_options: APIConnectOptions
71
+ http_options: NotGivenOr[types.HttpOptions]
71
72
  enable_affective_dialog: NotGivenOr[bool] = NOT_GIVEN
72
73
  proactivity: NotGivenOr[bool] = NOT_GIVEN
73
74
  realtime_input_config: NotGivenOr[types.RealtimeInputConfig] = NOT_GIVEN
@@ -136,6 +137,7 @@ class RealtimeModel(llm.RealtimeModel):
136
137
  context_window_compression: NotGivenOr[types.ContextWindowCompressionConfig] = NOT_GIVEN,
137
138
  api_version: NotGivenOr[str] = NOT_GIVEN,
138
139
  conn_options: APIConnectOptions = DEFAULT_API_CONNECT_OPTIONS,
140
+ http_options: NotGivenOr[types.HttpOptions] = NOT_GIVEN,
139
141
  _gemini_tools: NotGivenOr[list[_LLMTool]] = NOT_GIVEN,
140
142
  ) -> None:
141
143
  """
@@ -259,6 +261,7 @@ class RealtimeModel(llm.RealtimeModel):
259
261
  api_version=api_version,
260
262
  gemini_tools=_gemini_tools,
261
263
  conn_options=conn_options,
264
+ http_options=http_options,
262
265
  )
263
266
 
264
267
  self._sessions = weakref.WeakSet[RealtimeSession]()
@@ -319,7 +322,9 @@ class RealtimeSession(llm.RealtimeSession):
319
322
  if not api_version and (self._opts.enable_affective_dialog or self._opts.proactivity):
320
323
  api_version = "v1alpha"
321
324
 
322
- http_options = types.HttpOptions(timeout=int(self._opts.conn_options.timeout * 1000))
325
+ http_options = self._opts.http_options or types.HttpOptions(
326
+ timeout=int(self._opts.conn_options.timeout * 1000)
327
+ )
323
328
  if api_version:
324
329
  http_options.api_version = api_version
325
330
 
@@ -902,6 +907,9 @@ class RealtimeSession(llm.RealtimeSession):
902
907
  )
903
908
 
904
909
  if not gen.text_ch.closed:
910
+ if self._opts.output_audio_transcription is None:
911
+ # close the text data of transcription synchronizer
912
+ gen.text_ch.send_nowait("")
905
913
  gen.text_ch.close()
906
914
  if not gen.audio_ch.closed:
907
915
  gen.audio_ch.close()
@@ -60,6 +60,7 @@ class _LLMOptions:
60
60
  frequency_penalty: NotGivenOr[float]
61
61
  thinking_config: NotGivenOr[types.ThinkingConfigOrDict]
62
62
  gemini_tools: NotGivenOr[list[_LLMTool]]
63
+ http_options: NotGivenOr[types.HttpOptions]
63
64
 
64
65
 
65
66
  class LLM(llm.LLM):
@@ -80,6 +81,7 @@ class LLM(llm.LLM):
80
81
  tool_choice: NotGivenOr[ToolChoice] = NOT_GIVEN,
81
82
  thinking_config: NotGivenOr[types.ThinkingConfigOrDict] = NOT_GIVEN,
82
83
  gemini_tools: NotGivenOr[list[_LLMTool]] = NOT_GIVEN,
84
+ http_options: NotGivenOr[types.HttpOptions] = NOT_GIVEN,
83
85
  ) -> None:
84
86
  """
85
87
  Create a new instance of Google GenAI LLM.
@@ -106,6 +108,7 @@ class LLM(llm.LLM):
106
108
  tool_choice (ToolChoice, optional): Specifies whether to use tools during response generation. Defaults to "auto".
107
109
  thinking_config (ThinkingConfigOrDict, optional): The thinking configuration for response generation. Defaults to None.
108
110
  gemini_tools (list[LLMTool], optional): The Gemini-specific tools to use for the session.
111
+ http_options (HttpOptions, optional): The HTTP options to use for the session.
109
112
  """ # noqa: E501
110
113
  super().__init__()
111
114
  gcp_project = project if is_given(project) else os.environ.get("GOOGLE_CLOUD_PROJECT")
@@ -166,6 +169,7 @@ class LLM(llm.LLM):
166
169
  frequency_penalty=frequency_penalty,
167
170
  thinking_config=thinking_config,
168
171
  gemini_tools=gemini_tools,
172
+ http_options=http_options,
169
173
  )
170
174
  self._client = Client(
171
175
  api_key=gemini_api_key,
@@ -312,8 +316,9 @@ class LLMStream(llm.LLMStream):
312
316
  if extra_data.system_messages
313
317
  else None
314
318
  ),
315
- http_options=types.HttpOptions(
316
- timeout=int(self._conn_options.timeout * 1000),
319
+ http_options=(
320
+ self._llm._opts.http_options
321
+ or types.HttpOptions(timeout=int(self._conn_options.timeout * 1000))
317
322
  ),
318
323
  **self._extra_kwargs,
319
324
  )
@@ -381,11 +381,14 @@ class SpeechStream(stt.SpeechStream):
381
381
  self._reconnect_event.set()
382
382
 
383
383
  async def _run(self) -> None:
384
+ audio_pushed = False
385
+
384
386
  # google requires a async generator when calling streaming_recognize
385
387
  # this function basically convert the queue into a async generator
386
388
  async def input_generator(
387
389
  client: SpeechAsyncClient, should_stop: asyncio.Event
388
390
  ) -> AsyncGenerator[cloud_speech.StreamingRecognizeRequest, None]:
391
+ nonlocal audio_pushed
389
392
  try:
390
393
  # first request should contain the config
391
394
  yield cloud_speech.StreamingRecognizeRequest(
@@ -402,6 +405,8 @@ class SpeechStream(stt.SpeechStream):
402
405
 
403
406
  if isinstance(frame, rtc.AudioFrame):
404
407
  yield cloud_speech.StreamingRecognizeRequest(audio=frame.data.tobytes())
408
+ if not audio_pushed:
409
+ audio_pushed = True
405
410
 
406
411
  except Exception:
407
412
  logger.exception("an error occurred while streaming input to google STT")
@@ -470,6 +475,7 @@ class SpeechStream(stt.SpeechStream):
470
475
  has_started = False
471
476
 
472
477
  while True:
478
+ audio_pushed = False
473
479
  try:
474
480
  async with self._pool.connection(timeout=self._conn_options.timeout) as client:
475
481
  self._streaming_config = cloud_speech.StreamingRecognitionConfig(
@@ -514,13 +520,21 @@ class SpeechStream(stt.SpeechStream):
514
520
  break
515
521
  self._reconnect_event.clear()
516
522
  finally:
517
- await utils.aio.gracefully_cancel(process_stream_task, wait_reconnect_task)
518
523
  should_stop.set()
524
+ if not process_stream_task.done() and not wait_reconnect_task.done():
525
+ # try to gracefully stop the process_stream_task
526
+ try:
527
+ await asyncio.wait_for(process_stream_task, timeout=1.0)
528
+ except asyncio.TimeoutError:
529
+ pass
530
+
531
+ await utils.aio.gracefully_cancel(process_stream_task, wait_reconnect_task)
519
532
  except DeadlineExceeded:
520
533
  raise APITimeoutError() from None
521
534
  except GoogleAPICallError as e:
522
535
  if e.code == 409:
523
- logger.debug("stream timed out, restarting.")
536
+ if audio_pushed:
537
+ logger.debug("stream timed out, restarting.")
524
538
  else:
525
539
  raise APIStatusError(
526
540
  f"{e.message} {e.details}", status_code=e.code or -1
@@ -46,6 +46,8 @@ class _TTSOptions:
46
46
  effects_profile_id: str
47
47
  speaking_rate: float
48
48
  tokenizer: tokenize.SentenceTokenizer
49
+ volume_gain_db: float
50
+ enable_ssml: bool
49
51
 
50
52
 
51
53
  class TTS(tts.TTS):
@@ -59,12 +61,14 @@ class TTS(tts.TTS):
59
61
  pitch: int = 0,
60
62
  effects_profile_id: str = "",
61
63
  speaking_rate: float = 1.0,
64
+ volume_gain_db: float = 0.0,
62
65
  location: str = "global",
63
66
  audio_encoding: texttospeech.AudioEncoding = texttospeech.AudioEncoding.OGG_OPUS, # type: ignore
64
67
  credentials_info: NotGivenOr[dict] = NOT_GIVEN,
65
68
  credentials_file: NotGivenOr[str] = NOT_GIVEN,
66
69
  tokenizer: NotGivenOr[tokenize.SentenceTokenizer] = NOT_GIVEN,
67
70
  use_streaming: bool = True,
71
+ enable_ssml: bool = False,
68
72
  ) -> None:
69
73
  """
70
74
  Create a new instance of Google TTS.
@@ -82,10 +86,12 @@ class TTS(tts.TTS):
82
86
  pitch (float, optional): Speaking pitch, ranging from -20.0 to 20.0 semitones relative to the original pitch. Default is 0.
83
87
  effects_profile_id (str): Optional identifier for selecting audio effects profiles to apply to the synthesized speech.
84
88
  speaking_rate (float, optional): Speed of speech. Default is 1.0.
89
+ volume_gain_db (float, optional): Volume gain in decibels. Default is 0.0. In the range [-96.0, 16.0]. Strongly recommended not to exceed +10 (dB).
85
90
  credentials_info (dict, optional): Dictionary containing Google Cloud credentials. Default is None.
86
91
  credentials_file (str, optional): Path to the Google Cloud credentials JSON file. Default is None.
87
92
  tokenizer (tokenize.SentenceTokenizer, optional): Tokenizer for the TTS. Default is a basic sentence tokenizer.
88
93
  use_streaming (bool, optional): Whether to use streaming synthesis. Default is True.
94
+ enable_ssml (bool, optional): Whether to enable SSML support. Default is False.
89
95
  """ # noqa: E501
90
96
  super().__init__(
91
97
  capabilities=tts.TTSCapabilities(streaming=use_streaming),
@@ -93,6 +99,9 @@ class TTS(tts.TTS):
93
99
  num_channels=1,
94
100
  )
95
101
 
102
+ if enable_ssml and use_streaming:
103
+ raise ValueError("SSML support is not available for streaming synthesis")
104
+
96
105
  self._client: texttospeech.TextToSpeechAsyncClient | None = None
97
106
  self._credentials_info = credentials_info
98
107
  self._credentials_file = credentials_file
@@ -118,6 +127,8 @@ class TTS(tts.TTS):
118
127
  effects_profile_id=effects_profile_id,
119
128
  speaking_rate=speaking_rate,
120
129
  tokenizer=tokenizer,
130
+ volume_gain_db=volume_gain_db,
131
+ enable_ssml=enable_ssml,
121
132
  )
122
133
  self._streams = weakref.WeakSet[SynthesizeStream]()
123
134
 
@@ -128,6 +139,7 @@ class TTS(tts.TTS):
128
139
  gender: NotGivenOr[Gender | str] = NOT_GIVEN,
129
140
  voice_name: NotGivenOr[str] = NOT_GIVEN,
130
141
  speaking_rate: NotGivenOr[float] = NOT_GIVEN,
142
+ volume_gain_db: NotGivenOr[float] = NOT_GIVEN,
131
143
  ) -> None:
132
144
  """
133
145
  Update the TTS options.
@@ -137,6 +149,7 @@ class TTS(tts.TTS):
137
149
  gender (Gender | str, optional): Voice gender ("male", "female", "neutral").
138
150
  voice_name (str, optional): Specific voice name.
139
151
  speaking_rate (float, optional): Speed of speech.
152
+ volume_gain_db (float, optional): Volume gain in decibels.
140
153
  """
141
154
  params = {}
142
155
  if is_given(language):
@@ -151,6 +164,8 @@ class TTS(tts.TTS):
151
164
 
152
165
  if is_given(speaking_rate):
153
166
  self._opts.speaking_rate = speaking_rate
167
+ if is_given(volume_gain_db):
168
+ self._opts.volume_gain_db = volume_gain_db
154
169
 
155
170
  def _ensure_client(self) -> texttospeech.TextToSpeechAsyncClient:
156
171
  api_endpoint = "texttospeech.googleapis.com"
@@ -199,10 +214,21 @@ class ChunkedStream(tts.ChunkedStream):
199
214
  self._tts: TTS = tts
200
215
  self._opts = replace(tts._opts)
201
216
 
217
+ def _build_ssml(self) -> str:
218
+ ssml = "<speak>"
219
+ ssml += self._input_text
220
+ ssml += "</speak>"
221
+ return ssml
222
+
202
223
  async def _run(self, output_emitter: tts.AudioEmitter) -> None:
203
224
  try:
225
+ input = (
226
+ texttospeech.SynthesisInput(ssml=self._build_ssml())
227
+ if self._opts.enable_ssml
228
+ else texttospeech.SynthesisInput(text=self._input_text)
229
+ )
204
230
  response: SynthesizeSpeechResponse = await self._tts._ensure_client().synthesize_speech(
205
- input=texttospeech.SynthesisInput(text=self._input_text),
231
+ input=input,
206
232
  voice=self._opts.voice,
207
233
  audio_config=texttospeech.AudioConfig(
208
234
  audio_encoding=self._opts.encoding,
@@ -210,6 +236,7 @@ class ChunkedStream(tts.ChunkedStream):
210
236
  pitch=self._opts.pitch,
211
237
  effects_profile_id=self._opts.effects_profile_id,
212
238
  speaking_rate=self._opts.speaking_rate,
239
+ volume_gain_db=self._opts.volume_gain_db,
213
240
  ),
214
241
  timeout=self._conn_options.timeout,
215
242
  )
@@ -256,7 +283,9 @@ class SynthesizeStream(tts.SynthesizeStream):
256
283
  streaming_config = texttospeech.StreamingSynthesizeConfig(
257
284
  voice=self._opts.voice,
258
285
  streaming_audio_config=texttospeech.StreamingAudioConfig(
259
- audio_encoding=encoding, sample_rate_hertz=self._opts.sample_rate
286
+ audio_encoding=encoding,
287
+ sample_rate_hertz=self._opts.sample_rate,
288
+ speaking_rate=self._opts.speaking_rate,
260
289
  ),
261
290
  )
262
291
 
@@ -12,4 +12,4 @@
12
12
  # See the License for the specific language governing permissions and
13
13
  # limitations under the License.
14
14
 
15
- __version__ = "1.1.0"
15
+ __version__ = "1.1.2"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: livekit-plugins-google
3
- Version: 1.1.0
3
+ Version: 1.1.2
4
4
  Summary: Agent Framework plugin for services from Google Cloud
5
5
  Project-URL: Documentation, https://docs.livekit.io
6
6
  Project-URL: Website, https://livekit.io/
@@ -20,9 +20,9 @@ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
20
20
  Requires-Python: >=3.9.0
21
21
  Requires-Dist: google-auth<3,>=2
22
22
  Requires-Dist: google-cloud-speech<3,>=2
23
- Requires-Dist: google-cloud-texttospeech<3,>=2.24
24
- Requires-Dist: google-genai>=v1.16.1
25
- Requires-Dist: livekit-agents>=1.1.0
23
+ Requires-Dist: google-cloud-texttospeech<3,>=2.27
24
+ Requires-Dist: google-genai>=v1.21.1
25
+ Requires-Dist: livekit-agents>=1.1.2
26
26
  Description-Content-Type: text/markdown
27
27
 
28
28
  # Google AI plugin for LiveKit Agents
@@ -1,17 +1,17 @@
1
1
  livekit/plugins/google/__init__.py,sha256=XIyZ-iFnRBpaLtOJgVwojlB-a8GjdDugVFcjBpMEww8,1412
2
- livekit/plugins/google/llm.py,sha256=MIi-6kk8AZQxcf5y4zB3HwwEQHAJSCIdX79yf9QMAvI,17835
2
+ livekit/plugins/google/llm.py,sha256=Feb2ixNN9YoDt3aPXkQNeVx2c-wkmrf-mv4r3vggY1s,18131
3
3
  livekit/plugins/google/log.py,sha256=GI3YWN5YzrafnUccljzPRS_ZALkMNk1i21IRnTl2vNA,69
4
4
  livekit/plugins/google/models.py,sha256=hOpfbN_qdQ1ZTpCN9m9dvG2eb6WgQ3KE3WRpIeeM_T0,1569
5
5
  livekit/plugins/google/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
6
- livekit/plugins/google/stt.py,sha256=SddM50w6g2rNkjaF5OtrPwEH-qqq36sa-v_6ogKoBYg,24077
6
+ livekit/plugins/google/stt.py,sha256=ssDMH5U1vQOLA44XMlovYWIR4UqVtZSge3YFN-zZ7Iw,24696
7
7
  livekit/plugins/google/tools.py,sha256=tD5HVDHO5JfUF029Cx3axHMJec0Gxalkl7s1FDgxLzI,259
8
- livekit/plugins/google/tts.py,sha256=PzDfEfvQfj-uSHYOUelFnwYK0Wu2-5Mp8PID0b4I5kc,14293
8
+ livekit/plugins/google/tts.py,sha256=YTfce55MWNJyDH4k8U1O2giOcrtccTs8vrkiW9GuBR0,15541
9
9
  livekit/plugins/google/utils.py,sha256=-4z6wrjVaZPtFRowkpwaA2acBRfqtzTk4r2xrPDUdCk,8609
10
- livekit/plugins/google/version.py,sha256=7SjyflIFTjH0djSotKGIRoRykPCqMpVYetIlvHMFuh0,600
10
+ livekit/plugins/google/version.py,sha256=gqaIRup9hxsq6YNsBlKPmS5PL-B8yqSRTd8wRfj8zoQ,600
11
11
  livekit/plugins/google/beta/__init__.py,sha256=5PnoG3Ux24bjzMSzmTeSVljE9EINivGcbWUEV6egGnM,216
12
12
  livekit/plugins/google/beta/realtime/__init__.py,sha256=_fW2NMN22F-hnQ4xAJ_g5lPbR7CvM_xXzSWlUQY-E-U,188
13
13
  livekit/plugins/google/beta/realtime/api_proto.py,sha256=NfE7xr2N3JOu7gVfWbAmDcEhs8vuZgMRu5vpScPJzsg,776
14
- livekit/plugins/google/beta/realtime/realtime_api.py,sha256=Mt-f7mkwVd7Aq84HPh_AdIOaB4ye8d6TTllcEjKO5TY,45918
15
- livekit_plugins_google-1.1.0.dist-info/METADATA,sha256=HeQoxgYu0-hOIOawXsvtwHeESXj1U2Oo5GpwEUEx-W8,1907
16
- livekit_plugins_google-1.1.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
17
- livekit_plugins_google-1.1.0.dist-info/RECORD,,
14
+ livekit/plugins/google/beta/realtime/realtime_api.py,sha256=tlAsTFsumqOavC9JT2SuQi_3eGYygZ3bbS-nEM7ea8Q,46293
15
+ livekit_plugins_google-1.1.2.dist-info/METADATA,sha256=cAk_E0o73mOJ1wFsuUFzmzW4vZ2B_lbM2O3aZeHoHq4,1907
16
+ livekit_plugins_google-1.1.2.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
17
+ livekit_plugins_google-1.1.2.dist-info/RECORD,,