livekit-plugins-google 1.0.18 → 1.0.20 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
livekit/plugins/google/__init__.py
@@ -12,6 +12,13 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+"""Google AI plugin for LiveKit Agents
+
+Supports Gemini, Cloud Speech-to-Text, and Cloud Text-to-Speech.
+
+See https://docs.livekit.io/agents/integrations/stt/google/ for more information.
+"""
+
 from . import beta
 from .llm import LLM
 from .stt import STT, SpeechStream
livekit/plugins/google/beta/__init__.py
@@ -1,3 +1,12 @@
 from . import realtime
 
 __all__ = ["realtime"]
+
+# Cleanup docs of unexported modules
+_module = dir()
+NOT_IN_ALL = [m for m in _module if m not in __all__]
+
+__pdoc__ = {}
+
+for n in NOT_IN_ALL:
+    __pdoc__[n] = False
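
`__pdoc__` is the module-level mapping that pdoc consults when rendering documentation; any name mapped to `False` is hidden. As a rough sketch (not part of the package), the loop above collapses to a single comprehension:

```python
__all__ = ["realtime"]

# Hide every module-level name that isn't explicitly exported.
__pdoc__ = {name: False for name in dir() if name not in __all__}
```
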
livekit/plugins/google/beta/realtime/realtime_api.py
@@ -18,6 +18,7 @@ from google.genai.types import (
     GenerationConfig,
     LiveClientContent,
     LiveClientRealtimeInput,
+    LiveClientToolResponse,
     LiveConnectConfig,
     LiveServerContent,
     LiveServerGoAway,
@@ -101,7 +102,7 @@ class RealtimeModel(llm.RealtimeModel):
         self,
         *,
         instructions: NotGivenOr[str] = NOT_GIVEN,
-        model: LiveAPIModels | str = "gemini-2.0-flash-live-001",
+        model: NotGivenOr[LiveAPIModels | str] = NOT_GIVEN,
         api_key: NotGivenOr[str] = NOT_GIVEN,
         voice: Voice | str = "Puck",
         language: NotGivenOr[str] = NOT_GIVEN,
@@ -133,7 +134,7 @@ class RealtimeModel(llm.RealtimeModel):
             instructions (str, optional): Initial system instructions for the model. Defaults to "".
             api_key (str, optional): Google Gemini API key. If None, will attempt to read from the environment variable GOOGLE_API_KEY.
             modalities (list[Modality], optional): Modalities to use, such as ["TEXT", "AUDIO"]. Defaults to ["AUDIO"].
-            model (str, optional): The name of the model to use. Defaults to "gemini-2.0-flash-live-001".
+            model (str, optional): The name of the model to use. Defaults to "gemini-2.0-flash-live-001" or "gemini-2.0-flash-exp" (vertexai).
             voice (api_proto.Voice, optional): Voice setting for audio outputs. Defaults to "Puck".
             language (str, optional): The language(BCP-47 Code) to use for the API. supported languages - https://ai.google.dev/gemini-api/docs/live#supported-languages
             temperature (float, optional): Sampling temperature for response generation. Defaults to 0.8.
@@ -159,14 +160,24 @@ class RealtimeModel(llm.RealtimeModel):
             )
         )
 
+        if not is_given(model):
+            if vertexai:
+                model = "gemini-2.0-flash-exp"
+            else:
+                model = "gemini-2.0-flash-live-001"
+
         gemini_api_key = api_key if is_given(api_key) else os.environ.get("GOOGLE_API_KEY")
         gcp_project = project if is_given(project) else os.environ.get("GOOGLE_CLOUD_PROJECT")
-        gcp_location = location if is_given(location) else os.environ.get("GOOGLE_CLOUD_LOCATION")
+        gcp_location = (
+            location
+            if is_given(location)
+            else os.environ.get("GOOGLE_CLOUD_LOCATION") or "us-central1"
+        )
 
         if vertexai:
             if not gcp_project or not gcp_location:
                 raise ValueError(
-                    "Project and location are required for VertexAI either via project and location or GOOGLE_CLOUD_PROJECT and GOOGLE_CLOUD_LOCATION environment variables"  # noqa: E501
+                    "Project is required for VertexAI via project kwarg or GOOGLE_CLOUD_PROJECT environment variable"  # noqa: E501
                 )
             gemini_api_key = None  # VertexAI does not require an API key
         else:
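
Taken together, the new defaults mean both backends can now be constructed without naming a model. A hedged usage sketch (construction only; `my-gcp-project` is a placeholder):

```python
from livekit.plugins import google

# Gemini API: model now defaults to "gemini-2.0-flash-live-001";
# the key is read from GOOGLE_API_KEY when api_key is not passed.
model = google.beta.realtime.RealtimeModel()

# Vertex AI: model defaults to "gemini-2.0-flash-exp" and the location
# falls back to "us-central1"; only the project remains required.
vertex_model = google.beta.realtime.RealtimeModel(
    vertexai=True,
    project="my-gcp-project",  # placeholder; or set GOOGLE_CLOUD_PROJECT
)
```
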
livekit/plugins/google/beta/realtime/realtime_api.py (continued)
@@ -310,7 +321,9 @@ class RealtimeSession(llm.RealtimeSession):
         async with self._update_lock:
             self._chat_ctx = chat_ctx.copy()
             turns, _ = to_chat_ctx(self._chat_ctx, id(self), ignore_functions=True)
-            tool_results = get_tool_results_for_realtime(self._chat_ctx)
+            tool_results = get_tool_results_for_realtime(
+                self._chat_ctx, vertexai=self._opts.vertexai
+            )
             # TODO(dz): need to compute delta and then either append or recreate session
             if turns:
                 self._send_client_event(LiveClientContent(turns=turns, turn_complete=False))
@@ -481,11 +494,18 @@ class RealtimeSession(llm.RealtimeSession):
                     not self._active_session or self._active_session != session
                 ):
                     break
-
                 if isinstance(msg, LiveClientContent):
-                    await session.send(input=msg)
+                    await session.send_client_content(
+                        turns=msg.turns, turn_complete=msg.turn_complete
+                    )
+                elif isinstance(msg, LiveClientToolResponse):
+                    await session.send_tool_response(function_responses=msg.function_responses)
+                elif isinstance(msg, LiveClientRealtimeInput):
+                    for media_chunk in msg.media_chunks:
+                        await session.send_realtime_input(media=media_chunk)
                 else:
-                    await session.send(input=msg)
+                    logger.warning(f"Warning: Received unhandled message type: {type(msg)}")
+
         except Exception as e:
             if not self._session_should_close.is_set():
                 logger.error(f"error in send task: {e}", exc_info=e)
livekit/plugins/google/llm.py
@@ -241,7 +241,7 @@ class LLM(llm.LLM):
             client=self._client,
             model=self._opts.model,
             chat_ctx=chat_ctx,
-            tools=tools,
+            tools=tools or [],
             conn_options=conn_options,
             extra_kwargs=extra,
         )
@@ -256,7 +256,7 @@ class LLMStream(llm.LLMStream):
         model: str | ChatModels,
         chat_ctx: llm.ChatContext,
         conn_options: APIConnectOptions,
-        tools: list[FunctionTool] | None,
+        tools: list[FunctionTool],
         extra_kwargs: dict[str, Any],
     ) -> None:
         super().__init__(llm, chat_ctx=chat_ctx, tools=tools, conn_options=conn_options)
@@ -270,7 +270,7 @@ class LLMStream(llm.LLMStream):
         request_id = utils.shortuuid()
 
         try:
-            turns, system_instruction = to_chat_ctx(self._chat_ctx, id(self._llm))
+            turns, system_instruction = to_chat_ctx(self._chat_ctx, id(self._llm), generate=True)
             function_declarations = to_fnc_ctx(self._tools)
             if function_declarations:
                 self._extra_kwargs["tools"] = [
livekit/plugins/google/stt.py
@@ -54,7 +54,7 @@ LanguageCode = Union[LgType, list[LgType]]
 _max_session_duration = 240
 
 # Google is very sensitive to background noise, so we'll ignore results with low confidence
-_min_confidence = 0.65
+_default_min_confidence = 0.65
 
 
 # This class is only be used internally to encapsulate the options
@@ -67,6 +67,7 @@ class STTOptions:
     spoken_punctuation: bool
     model: SpeechModels | str
     sample_rate: int
+    min_confidence_threshold: float
     keywords: NotGivenOr[list[tuple[str, float]]] = NOT_GIVEN
 
     def build_adaptation(self) -> cloud_speech.SpeechAdaptation | None:
@@ -98,9 +99,11 @@ class STT(stt.STT):
         model: SpeechModels | str = "latest_long",
         location: str = "global",
         sample_rate: int = 16000,
+        min_confidence_threshold: float = _default_min_confidence,
         credentials_info: NotGivenOr[dict] = NOT_GIVEN,
         credentials_file: NotGivenOr[str] = NOT_GIVEN,
         keywords: NotGivenOr[list[tuple[str, float]]] = NOT_GIVEN,
+        use_streaming: NotGivenOr[bool] = NOT_GIVEN,
     ):
         """
         Create a new instance of Google STT.
@@ -118,11 +121,18 @@ class STT(stt.STT):
             model(SpeechModels): the model to use for recognition default: "latest_long"
             location(str): the location to use for recognition default: "global"
             sample_rate(int): the sample rate of the audio default: 16000
+            min_confidence_threshold(float): minimum confidence threshold for recognition
+                (default: 0.65)
             credentials_info(dict): the credentials info to use for recognition (default: None)
             credentials_file(str): the credentials file to use for recognition (default: None)
             keywords(List[tuple[str, float]]): list of keywords to recognize (default: None)
+            use_streaming(bool): whether to use streaming for recognition (default: True)
         """
-        super().__init__(capabilities=stt.STTCapabilities(streaming=True, interim_results=True))
+        if not is_given(use_streaming):
+            use_streaming = True
+        super().__init__(
+            capabilities=stt.STTCapabilities(streaming=use_streaming, interim_results=True)
+        )
 
         self._location = location
         self._credentials_info = credentials_info
@@ -149,6 +159,7 @@ class STT(stt.STT):
             spoken_punctuation=spoken_punctuation,
             model=model,
             sample_rate=sample_rate,
+            min_confidence_threshold=min_confidence_threshold,
             keywords=keywords,
         )
         self._streams = weakref.WeakSet[SpeechStream]()
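
A hedged usage sketch of the two new constructor options (assumes ambient Google Cloud credentials; the effect of `use_streaming=False` is inferred from the capability flag, under which the agents framework would treat this STT as non-streaming):

```python
from livekit.plugins import google

# Default behavior: streaming recognition with the 0.65 confidence floor.
stt = google.STT()

# Accept lower-confidence results and advertise non-streaming capability.
batch_stt = google.STT(
    min_confidence_threshold=0.5,
    use_streaming=False,
)
```
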
livekit/plugins/google/stt.py (continued)
@@ -246,7 +257,7 @@ class STT(stt.STT):
         except DeadlineExceeded:
             raise APITimeoutError() from None
         except GoogleAPICallError as e:
-            raise APIStatusError(e.message, status_code=e.code or -1) from None
+            raise APIStatusError(f"{e.message} {e.details}", status_code=e.code or -1) from e
         except Exception as e:
             raise APIConnectionError() from e
 
@@ -343,6 +354,7 @@ class SpeechStream(stt.SpeechStream):
         punctuate: NotGivenOr[bool] = NOT_GIVEN,
         spoken_punctuation: NotGivenOr[bool] = NOT_GIVEN,
         model: NotGivenOr[SpeechModels] = NOT_GIVEN,
+        min_confidence_threshold: NotGivenOr[float] = NOT_GIVEN,
         keywords: NotGivenOr[list[tuple[str, float]]] = NOT_GIVEN,
     ):
         if is_given(languages):
@@ -359,6 +371,8 @@ class SpeechStream(stt.SpeechStream):
             self._config.spoken_punctuation = spoken_punctuation
         if is_given(model):
             self._config.model = model
+        if is_given(min_confidence_threshold):
+            self._config.min_confidence_threshold = min_confidence_threshold
         if is_given(keywords):
             self._config.keywords = keywords
 
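The same threshold can now be adjusted on an open stream. A minimal sketch, assuming credentials are configured and `stream()` is called with its defaults:

```python
from livekit.plugins import google

stt = google.STT(min_confidence_threshold=0.5)
stream = stt.stream()
# Tighten the confidence floor mid-session without recreating the stream.
stream.update_options(min_confidence_threshold=0.8)
```
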
livekit/plugins/google/stt.py (continued)
@@ -405,7 +419,10 @@ class SpeechStream(stt.SpeechStream):
                     == cloud_speech.StreamingRecognizeResponse.SpeechEventType.SPEECH_EVENT_TYPE_UNSPECIFIED  # noqa: E501
                 ):
                     result = resp.results[0]
-                    speech_data = _streaming_recognize_response_to_speech_data(resp)
+                    speech_data = _streaming_recognize_response_to_speech_data(
+                        resp,
+                        min_confidence_threshold=self._config.min_confidence_threshold,
+                    )
                     if speech_data is None:
                         continue
 
@@ -461,6 +478,7 @@ class SpeechStream(stt.SpeechStream):
                 features=cloud_speech.RecognitionFeatures(
                     enable_automatic_punctuation=self._config.punctuate,
                     enable_word_time_offsets=True,
+                    enable_spoken_punctuation=self._config.spoken_punctuation,
                 ),
             ),
             streaming_features=cloud_speech.StreamingRecognitionFeatures(
@@ -494,7 +512,12 @@ class SpeechStream(stt.SpeechStream):
         except DeadlineExceeded:
             raise APITimeoutError() from None
         except GoogleAPICallError as e:
-            raise APIStatusError(e.message, status_code=e.code or -1) from None
+            if e.code == 409:
+                logger.debug("stream timed out, restarting.")
+            else:
+                raise APIStatusError(
+                    f"{e.message} {e.details}", status_code=e.code or -1
+                ) from e
         except Exception as e:
             raise APIConnectionError() from e
 
@@ -530,6 +553,8 @@ def _recognize_response_to_speech_event(
 
 def _streaming_recognize_response_to_speech_data(
     resp: cloud_speech.StreamingRecognizeResponse,
+    *,
+    min_confidence_threshold: float,
 ) -> stt.SpeechData | None:
     text = ""
     confidence = 0.0
@@ -542,7 +567,7 @@ def _streaming_recognize_response_to_speech_data(
     confidence /= len(resp.results)
     lg = resp.results[0].language_code
 
-    if confidence < _min_confidence:
+    if confidence < min_confidence_threshold:
         return None
     if text == "":
         return None
livekit/plugins/google/tts.py
@@ -56,6 +56,7 @@ class TTS(tts.TTS):
         effects_profile_id: str = "",
         speaking_rate: float = 1.0,
         location: str = "global",
+        audio_encoding: texttospeech.AudioEncoding = texttospeech.AudioEncoding.PCM,
         credentials_info: NotGivenOr[dict] = NOT_GIVEN,
         credentials_file: NotGivenOr[str] = NOT_GIVEN,
     ) -> None:
@@ -105,7 +106,7 @@ class TTS(tts.TTS):
         self._opts = _TTSOptions(
             voice=voice_params,
             audio_config=texttospeech.AudioConfig(
-                audio_encoding=texttospeech.AudioEncoding.PCM,
+                audio_encoding=audio_encoding,
                 sample_rate_hertz=sample_rate,
                 pitch=pitch,
                 effects_profile_id=effects_profile_id,
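
A hedged construction sketch for the new `audio_encoding` parameter: the value is forwarded unchanged into `texttospeech.AudioConfig`, and `MP3` is one alternative member of the same enum.

```python
from google.cloud import texttospeech

from livekit.plugins import google

# Default stays AudioEncoding.PCM; any other enum member is passed through.
tts = google.TTS(audio_encoding=texttospeech.AudioEncoding.MP3)
```
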
livekit/plugins/google/utils.py
@@ -20,17 +20,21 @@ def to_fnc_ctx(fncs: list[FunctionTool]) -> list[types.FunctionDeclaration]:
     return [_build_gemini_fnc(fnc) for fnc in fncs]
 
 
-def get_tool_results_for_realtime(chat_ctx: llm.ChatContext) -> types.LiveClientToolResponse | None:
+def get_tool_results_for_realtime(
+    chat_ctx: llm.ChatContext, *, vertexai: bool = False
+) -> types.LiveClientToolResponse | None:
     function_responses: list[types.FunctionResponse] = []
     for msg in chat_ctx.items:
         if msg.type == "function_call_output":
-            function_responses.append(
-                types.FunctionResponse(
-                    id=msg.call_id,
-                    name=msg.name,
-                    response={"output": msg.output},
-                )
+            res = types.FunctionResponse(
+                name=msg.name,
+                response={"output": msg.output},
             )
+            if not vertexai:
+                # vertexai does not support id in FunctionResponse
+                # see: https://github.com/googleapis/python-genai/blob/85e00bc/google/genai/_live_converters.py#L1435
+                res.id = msg.call_id
+            function_responses.append(res)
     return (
         types.LiveClientToolResponse(function_responses=function_responses)
         if function_responses
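
The payload difference between the two backends, sketched with hypothetical values (`call_123` and `get_weather` are illustrative only):

```python
from google.genai import types

# Gemini API: the tool result carries the originating call id.
gemini_res = types.FunctionResponse(
    id="call_123",  # hypothetical call id
    name="get_weather",
    response={"output": "sunny"},
)

# Vertex AI: id is omitted, since the live converters reject it.
vertex_res = types.FunctionResponse(
    name="get_weather",
    response={"output": "sunny"},
)
```
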
livekit/plugins/google/utils.py (continued)
@@ -39,7 +43,10 @@ def get_tool_results_for_realtime(chat_ctx: llm.ChatContext) -> types.LiveClient
 
 
 def to_chat_ctx(
-    chat_ctx: llm.ChatContext, cache_key: Any, ignore_functions: bool = False
+    chat_ctx: llm.ChatContext,
+    cache_key: Any,
+    ignore_functions: bool = False,
+    generate: bool = False,
 ) -> tuple[list[types.Content], types.Content | None]:
     turns: list[types.Content] = []
     system_instruction: types.Content | None = None
@@ -99,10 +106,9 @@ def to_chat_ctx(
     if current_role is not None and parts:
         turns.append(types.Content(role=current_role, parts=parts))
 
-    # # Gemini requires the last message to end with user's turn before they can generate
-    # # currently not used because to_chat_ctx should not be used to force a new generation
-    # if current_role != "user":
-    #     turns.append(types.Content(role="user", parts=[types.Part(text=".")]))
+    # Gemini requires the last message to end with user's turn before they can generate
+    if generate and current_role != "user":
+        turns.append(types.Content(role="user", parts=[types.Part(text=".")]))
 
     return turns, system_instruction
 
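An illustrative sketch of what `generate=True` changes: if the converted context ends on a model turn, a minimal placeholder user turn is appended so Gemini will start a new generation.

```python
from google.genai import types

turns = [
    types.Content(role="user", parts=[types.Part(text="hi")]),
    types.Content(role="model", parts=[types.Part(text="hello!")]),
]

# With generate=True, to_chat_ctx effectively performs this last step:
if turns[-1].role != "user":
    turns.append(types.Content(role="user", parts=[types.Part(text=".")]))
```
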
livekit/plugins/google/utils.py (continued)
@@ -173,6 +179,15 @@ class _GeminiJsonSchema:
         schema.pop("title", None)
         schema.pop("default", None)
         schema.pop("additionalProperties", None)
+        schema.pop("$schema", None)
+
+        if (const := schema.pop("const", None)) is not None:
+            # Gemini doesn't support const, but it does support enum with a single value
+            schema["enum"] = [const]
+
+        schema.pop("discriminator", None)
+        schema.pop("examples", None)
+
         if ref := schema.pop("$ref", None):
             key = re.sub(r"^#/\$defs/", "", ref)
             if key in refs_stack:
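
A before/after sketch of the schema cleanup (illustrative dicts, not the private API): a `Literal`-typed field serializes to JSON Schema `const`, which Gemini rejects, so it is rewritten as a one-value `enum`, while unsupported keys such as `$schema`, `discriminator`, and `examples` are dropped.

```python
before = {
    "type": "string",
    "const": "celsius",
    "examples": ["celsius", "fahrenheit"],
}

# After the cleanup pass (equivalent transformation):
after = {
    "type": "string",
    "enum": ["celsius"],
}
```
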
livekit/plugins/google/version.py
@@ -12,4 +12,4 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-__version__ = "1.0.18"
+__version__ = "1.0.20"
livekit_plugins_google-1.0.20.dist-info/METADATA (new file)
@@ -0,0 +1,47 @@
+Metadata-Version: 2.4
+Name: livekit-plugins-google
+Version: 1.0.20
+Summary: Agent Framework plugin for services from Google Cloud
+Project-URL: Documentation, https://docs.livekit.io
+Project-URL: Website, https://livekit.io/
+Project-URL: Source, https://github.com/livekit/agents
+Author: LiveKit
+License-Expression: Apache-2.0
+Keywords: audio,livekit,realtime,video,webrtc
+Classifier: Intended Audience :: Developers
+Classifier: License :: OSI Approved :: Apache Software License
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3 :: Only
+Classifier: Programming Language :: Python :: 3.9
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Topic :: Multimedia :: Sound/Audio
+Classifier: Topic :: Multimedia :: Video
+Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
+Requires-Python: >=3.9.0
+Requires-Dist: google-auth<3,>=2
+Requires-Dist: google-cloud-speech<3,>=2
+Requires-Dist: google-cloud-texttospeech<3,>=2
+Requires-Dist: google-genai>=1.12.1
+Requires-Dist: livekit-agents>=1.0.20
+Description-Content-Type: text/markdown
+
+# Google AI plugin for LiveKit Agents
+
+Support for Gemini, Gemini Live, Cloud Speech-to-Text, and Cloud Text-to-Speech.
+
+See [https://docs.livekit.io/agents/integrations/google/](https://docs.livekit.io/agents/integrations/google/) for more information.
+
+## Installation
+
+```bash
+pip install livekit-plugins-google
+```
+
+## Pre-requisites
+
+For credentials, you'll need a Google Cloud account and obtain the correct credentials. Credentials can be passed directly or via Application Default Credentials as specified in [How Application Default Credentials works](https://cloud.google.com/docs/authentication/application-default-credentials).
+
+To use the STT and TTS API, you'll need to enable the respective services for your Google Cloud project.
+
+- Cloud Speech-to-Text API
+- Cloud Text-to-Speech API
livekit_plugins_google-1.0.20.dist-info/RECORD (new file)
@@ -0,0 +1,16 @@
+livekit/plugins/google/__init__.py,sha256=xain2qUzU-YWhYWsLBkW8Q-szV-htpnzHTqymMPo-j0,1364
+livekit/plugins/google/llm.py,sha256=m_lRoUw4RIO1d-LtNYugl99LUNcA1y4NQ17wX7Vv5j0,16189
+livekit/plugins/google/log.py,sha256=GI3YWN5YzrafnUccljzPRS_ZALkMNk1i21IRnTl2vNA,69
+livekit/plugins/google/models.py,sha256=SGjAumdDK97NNLwMFcqZdKR68f1NoGB2Rk1UP2-imG0,1457
+livekit/plugins/google/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+livekit/plugins/google/stt.py,sha256=2jk-1fHiBT8UW_n3CZsIEdMp2iBnUAlTnmefdUd8rAM,23620
+livekit/plugins/google/tts.py,sha256=29R0ieV5sRPBf5Yi0SPFQk7ZZMbELF30bIL9K_j_Wcg,9100
+livekit/plugins/google/utils.py,sha256=zPzmnR-Rs2I87mT_k5S-PVbbuJMH8S-Hp5QcM4wv8vA,10067
+livekit/plugins/google/version.py,sha256=t4KmPVTpEy1pOJ2GRCA-GNJfCQq_-zHNDBxGj4GKfVk,601
+livekit/plugins/google/beta/__init__.py,sha256=5PnoG3Ux24bjzMSzmTeSVljE9EINivGcbWUEV6egGnM,216
+livekit/plugins/google/beta/realtime/__init__.py,sha256=_fW2NMN22F-hnQ4xAJ_g5lPbR7CvM_xXzSWlUQY-E-U,188
+livekit/plugins/google/beta/realtime/api_proto.py,sha256=Fyrejs3SG0EjOPCCFLEnWXKEUxCff47PMWk2VsKJm5E,594
+livekit/plugins/google/beta/realtime/realtime_api.py,sha256=K_YD2CND3PMGV7c3gJY2UdReeLfsOPtIWDys5EU2T_A,31699
+livekit_plugins_google-1.0.20.dist-info/METADATA,sha256=govmSaj6few3t11vreVNKlH9Ki2YzbRGnN3b3il2f20,1905
+livekit_plugins_google-1.0.20.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+livekit_plugins_google-1.0.20.dist-info/RECORD,,
livekit_plugins_google-1.0.18.dist-info/METADATA (removed)
@@ -1,99 +0,0 @@
-Metadata-Version: 2.4
-Name: livekit-plugins-google
-Version: 1.0.18
-Summary: Agent Framework plugin for services from Google Cloud
-Project-URL: Documentation, https://docs.livekit.io
-Project-URL: Website, https://livekit.io/
-Project-URL: Source, https://github.com/livekit/agents
-Author: LiveKit
-License-Expression: Apache-2.0
-Keywords: audio,livekit,realtime,video,webrtc
-Classifier: Intended Audience :: Developers
-Classifier: License :: OSI Approved :: Apache Software License
-Classifier: Programming Language :: Python :: 3
-Classifier: Programming Language :: Python :: 3 :: Only
-Classifier: Programming Language :: Python :: 3.9
-Classifier: Programming Language :: Python :: 3.10
-Classifier: Topic :: Multimedia :: Sound/Audio
-Classifier: Topic :: Multimedia :: Video
-Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
-Requires-Python: >=3.9.0
-Requires-Dist: google-auth<3,>=2
-Requires-Dist: google-cloud-speech<3,>=2
-Requires-Dist: google-cloud-texttospeech<3,>=2
-Requires-Dist: google-genai>=1.12.1
-Requires-Dist: livekit-agents>=1.0.18
-Description-Content-Type: text/markdown
-
-# LiveKit Plugins Google
-
-Agent Framework plugin for services from Google Cloud. Currently supporting Google's [Speech-to-Text](https://cloud.google.com/speech-to-text) API.
-
-## Installation
-
-```bash
-pip install livekit-plugins-google
-```
-
-## Pre-requisites
-
-For credentials, you'll need a Google Cloud account and obtain the correct credentials. Credentials can be passed directly or via Application Default Credentials as specified in [How Application Default Credentials works](https://cloud.google.com/docs/authentication/application-default-credentials).
-
-To use the STT and TTS API, you'll need to enable the respective services for your Google Cloud project.
-
-- Cloud Speech-to-Text API
-- Cloud Text-to-Speech API
-
-
-## Gemini Multimodal Live
-
-Gemini Multimodal Live can be used with the `MultimodalAgent` class. See examples/multimodal_agent/gemini_agent.py for an example.
-
-### Live Video Input (experimental)
-
-You can push video frames to your Gemini Multimodal Live session alongside the audio automatically handled by the `MultimodalAgent`. The basic approach is to subscribe to the video track, create a video stream, sample frames at a suitable frame rate, and push them into the RealtimeSession:
-
-```
-# Make sure you subscribe to audio and video tracks
-await ctx.connect(auto_subscribe=AutoSubscribe.SUBSCRIBE_ALL)
-
-# Create your RealtimeModel and store a reference
-model = google.beta.realtime.RealtimeModel(
-    # ...
-)
-
-# Create your MultimodalAgent as usual
-agent = MultimodalAgent(
-    model=model,
-    # ...
-)
-
-# Async method to process the video track and push frames to Gemini
-async def _process_video_track(self, track: Track):
-    video_stream = VideoStream(track)
-    last_frame_time = 0
-
-    async for event in video_stream:
-        current_time = asyncio.get_event_loop().time()
-
-        # Sample at 1 FPS
-        if current_time - last_frame_time < 1.0:
-            continue
-
-        last_frame_time = current_time
-        frame = event.frame
-
-        # Push the frame into the RealtimeSession
-        model.sessions[0].push_video(frame)
-
-    await video_stream.aclose()
-
-# Subscribe to new tracks and process them
-@ctx.room.on("track_subscribed")
-def _on_track_subscribed(track: Track, pub, participant):
-    if track.kind == TrackKind.KIND_VIDEO:
-        asyncio.create_task(self._process_video_track(track))
-```
-
-
-
livekit_plugins_google-1.0.18.dist-info/RECORD (removed)
@@ -1,16 +0,0 @@
-livekit/plugins/google/__init__.py,sha256=e_kSlFNmKhyyeliz7f4WOKc_Y0-y39QjO5nCWuguhss,1171
-livekit/plugins/google/llm.py,sha256=SqNGg6-wlrIUo9uaismP7QW5XztkXyDivJXLVgOIZMI,16175
-livekit/plugins/google/log.py,sha256=GI3YWN5YzrafnUccljzPRS_ZALkMNk1i21IRnTl2vNA,69
-livekit/plugins/google/models.py,sha256=SGjAumdDK97NNLwMFcqZdKR68f1NoGB2Rk1UP2-imG0,1457
-livekit/plugins/google/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-livekit/plugins/google/stt.py,sha256=AG_lh2fuuduJi0jFbA_QKFXLJ6NUdF1W_FfkLUJML_Q,22413
-livekit/plugins/google/tts.py,sha256=fmQwW9a1kPsEsrTvIo8fqw479RxWEx0SIc3oTVaj41U,9031
-livekit/plugins/google/utils.py,sha256=TjjTwMbdJdxr3bZjUXxs-J_fipTTM00goW2-d9KWX6w,9582
-livekit/plugins/google/version.py,sha256=cnPu9FVKZV9tFmmz7lEvftrO3B_nWJVFghi3j6UcJLs,601
-livekit/plugins/google/beta/__init__.py,sha256=AxRYc7NGG62Tv1MmcZVCDHNvlhbC86hM-_yP01Qb28k,47
-livekit/plugins/google/beta/realtime/__init__.py,sha256=_fW2NMN22F-hnQ4xAJ_g5lPbR7CvM_xXzSWlUQY-E-U,188
-livekit/plugins/google/beta/realtime/api_proto.py,sha256=Fyrejs3SG0EjOPCCFLEnWXKEUxCff47PMWk2VsKJm5E,594
-livekit/plugins/google/beta/realtime/realtime_api.py,sha256=sXp2oHnTlHrAp5wFmcXj0bRtQKixBYedfbufcbjVHxk,30897
-livekit_plugins_google-1.0.18.dist-info/METADATA,sha256=Vqt0FoqibcKzX_jFXlyFkn-mT7iPC16JlH61VS0fbuw,3492
-livekit_plugins_google-1.0.18.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
-livekit_plugins_google-1.0.18.dist-info/RECORD,,