livekit-plugins-google 1.0.18__py3-none-any.whl → 1.0.20__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- livekit/plugins/google/__init__.py +7 -0
- livekit/plugins/google/beta/__init__.py +9 -0
- livekit/plugins/google/beta/realtime/realtime_api.py +28 -8
- livekit/plugins/google/llm.py +3 -3
- livekit/plugins/google/stt.py +31 -6
- livekit/plugins/google/tts.py +2 -1
- livekit/plugins/google/utils.py +27 -12
- livekit/plugins/google/version.py +1 -1
- livekit_plugins_google-1.0.20.dist-info/METADATA +47 -0
- livekit_plugins_google-1.0.20.dist-info/RECORD +16 -0
- livekit_plugins_google-1.0.18.dist-info/METADATA +0 -99
- livekit_plugins_google-1.0.18.dist-info/RECORD +0 -16
- {livekit_plugins_google-1.0.18.dist-info → livekit_plugins_google-1.0.20.dist-info}/WHEEL +0 -0
livekit/plugins/google/__init__.py
CHANGED
@@ -12,6 +12,13 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+"""Google AI plugin for LiveKit Agents
+
+Supports Gemini, Cloud Speech-to-Text, and Cloud Text-to-Speech.
+
+See https://docs.livekit.io/agents/integrations/stt/google/ for more information.
+"""
+
 from . import beta
 from .llm import LLM
 from .stt import STT, SpeechStream
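The names re-exported here are the plugin's primary entry points. A minimal usage sketch (assuming `livekit-plugins-google` is installed and Google credentials are configured; constructor defaults are covered by the diffs below):

```python
from livekit.plugins import google

stt = google.STT()  # Cloud Speech-to-Text, see the stt.py diff below
tts = google.TTS()  # Cloud Text-to-Speech, see the tts.py diff below
llm = google.LLM()  # Gemini, see the llm.py diff below
```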
livekit/plugins/google/beta/realtime/realtime_api.py
CHANGED
@@ -18,6 +18,7 @@ from google.genai.types import (
     GenerationConfig,
     LiveClientContent,
     LiveClientRealtimeInput,
+    LiveClientToolResponse,
     LiveConnectConfig,
     LiveServerContent,
     LiveServerGoAway,
@@ -101,7 +102,7 @@ class RealtimeModel(llm.RealtimeModel):
         self,
         *,
         instructions: NotGivenOr[str] = NOT_GIVEN,
-        model: LiveAPIModels | str = "gemini-2.0-flash-live-001",
+        model: NotGivenOr[LiveAPIModels | str] = NOT_GIVEN,
         api_key: NotGivenOr[str] = NOT_GIVEN,
         voice: Voice | str = "Puck",
         language: NotGivenOr[str] = NOT_GIVEN,
@@ -133,7 +134,7 @@ class RealtimeModel(llm.RealtimeModel):
             instructions (str, optional): Initial system instructions for the model. Defaults to "".
             api_key (str, optional): Google Gemini API key. If None, will attempt to read from the environment variable GOOGLE_API_KEY.
             modalities (list[Modality], optional): Modalities to use, such as ["TEXT", "AUDIO"]. Defaults to ["AUDIO"].
-            model (str, optional): The name of the model to use. Defaults to "gemini-2.0-flash-live-001".
+            model (str, optional): The name of the model to use. Defaults to "gemini-2.0-flash-live-001" or "gemini-2.0-flash-exp" (vertexai).
             voice (api_proto.Voice, optional): Voice setting for audio outputs. Defaults to "Puck".
             language (str, optional): The language(BCP-47 Code) to use for the API. supported languages - https://ai.google.dev/gemini-api/docs/live#supported-languages
             temperature (float, optional): Sampling temperature for response generation. Defaults to 0.8.
@@ -159,14 +160,24 @@ class RealtimeModel(llm.RealtimeModel):
             )
         )
 
+        if not is_given(model):
+            if vertexai:
+                model = "gemini-2.0-flash-exp"
+            else:
+                model = "gemini-2.0-flash-live-001"
+
         gemini_api_key = api_key if is_given(api_key) else os.environ.get("GOOGLE_API_KEY")
         gcp_project = project if is_given(project) else os.environ.get("GOOGLE_CLOUD_PROJECT")
-        gcp_location = location if is_given(location) else os.environ.get("GOOGLE_CLOUD_LOCATION")
+        gcp_location = (
+            location
+            if is_given(location)
+            else os.environ.get("GOOGLE_CLOUD_LOCATION") or "us-central1"
+        )
 
         if vertexai:
             if not gcp_project or not gcp_location:
                 raise ValueError(
-                    "Project
+                    "Project is required for VertexAI via project kwarg or GOOGLE_CLOUD_PROJECT environment variable"  # noqa: E501
                 )
             gemini_api_key = None  # VertexAI does not require an API key
         else:
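What the new defaults mean for callers, as a hedged sketch (assumes credentials come from the environment, per the lookup logic above):

```python
from livekit.plugins import google

# Gemini API (GOOGLE_API_KEY set): the model now defaults to
# "gemini-2.0-flash-live-001" when not given explicitly.
model = google.beta.realtime.RealtimeModel()

# Vertex AI (GOOGLE_CLOUD_PROJECT set): the model defaults to
# "gemini-2.0-flash-exp", and the location falls back to
# GOOGLE_CLOUD_LOCATION or "us-central1".
vertex_model = google.beta.realtime.RealtimeModel(vertexai=True)
```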
@@ -310,7 +321,9 @@ class RealtimeSession(llm.RealtimeSession):
         async with self._update_lock:
             self._chat_ctx = chat_ctx.copy()
             turns, _ = to_chat_ctx(self._chat_ctx, id(self), ignore_functions=True)
-            tool_results = get_tool_results_for_realtime(self._chat_ctx)
+            tool_results = get_tool_results_for_realtime(
+                self._chat_ctx, vertexai=self._opts.vertexai
+            )
             # TODO(dz): need to compute delta and then either append or recreate session
             if turns:
                 self._send_client_event(LiveClientContent(turns=turns, turn_complete=False))
@@ -481,11 +494,18 @@ class RealtimeSession(llm.RealtimeSession):
                     not self._active_session or self._active_session != session
                 ):
                     break
-
                 if isinstance(msg, LiveClientContent):
-                    await session.
+                    await session.send_client_content(
+                        turns=msg.turns, turn_complete=msg.turn_complete
+                    )
+                elif isinstance(msg, LiveClientToolResponse):
+                    await session.send_tool_response(function_responses=msg.function_responses)
+                elif isinstance(msg, LiveClientRealtimeInput):
+                    for media_chunk in msg.media_chunks:
+                        await session.send_realtime_input(media=media_chunk)
                 else:
-
+                    logger.warning(f"Warning: Received unhandled message type: {type(msg)}")
+
             except Exception as e:
                 if not self._session_should_close.is_set():
                     logger.error(f"error in send task: {e}", exc_info=e)
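The send task now uses google-genai's typed send methods instead of a generic `session.send(...)`. A self-contained sketch of the same calls against a live session (assuming google-genai >= 1.12, per the package metadata below; the model name and prompt are placeholders):

```python
from google import genai
from google.genai import types

async def live_demo() -> None:
    client = genai.Client()  # reads GOOGLE_API_KEY from the environment
    async with client.aio.live.connect(
        model="gemini-2.0-flash-live-001",
        config=types.LiveConnectConfig(response_modalities=["TEXT"]),
    ) as session:
        # Typed equivalent of queuing a LiveClientContent message:
        await session.send_client_content(
            turns=[types.Content(role="user", parts=[types.Part(text="Hello")])],
            turn_complete=True,
        )
        async for msg in session.receive():
            if msg.text:
                print(msg.text, end="")
```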
livekit/plugins/google/llm.py
CHANGED
@@ -241,7 +241,7 @@ class LLM(llm.LLM):
             client=self._client,
             model=self._opts.model,
             chat_ctx=chat_ctx,
-            tools=tools,
+            tools=tools or [],
             conn_options=conn_options,
             extra_kwargs=extra,
         )
@@ -256,7 +256,7 @@ class LLMStream(llm.LLMStream):
         model: str | ChatModels,
         chat_ctx: llm.ChatContext,
         conn_options: APIConnectOptions,
-        tools: list[FunctionTool] | None,
+        tools: list[FunctionTool],
         extra_kwargs: dict[str, Any],
     ) -> None:
         super().__init__(llm, chat_ctx=chat_ctx, tools=tools, conn_options=conn_options)
@@ -270,7 +270,7 @@ class LLMStream(llm.LLMStream):
         request_id = utils.shortuuid()
 
         try:
-            turns, system_instruction = to_chat_ctx(self._chat_ctx, id(self._llm))
+            turns, system_instruction = to_chat_ctx(self._chat_ctx, id(self._llm), generate=True)
             function_declarations = to_fnc_ctx(self._tools)
             if function_declarations:
                 self._extra_kwargs["tools"] = [
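The `tools or []` normalization pairs with the tightened `LLMStream` signature: `tools` may arrive as `None` from callers, but `to_fnc_ctx` (see the utils.py diff below) iterates the list unconditionally. A one-line illustration of the contract:

```python
tools = None
safe_tools = tools or []  # LLMStream now always receives a list; iterating None would raise TypeError
```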
livekit/plugins/google/stt.py
CHANGED
@@ -54,7 +54,7 @@ LanguageCode = Union[LgType, list[LgType]]
 _max_session_duration = 240
 
 # Google is very sensitive to background noise, so we'll ignore results with low confidence
-_min_confidence = 0.65
+_default_min_confidence = 0.65
 
 
 # This class is only be used internally to encapsulate the options
@@ -67,6 +67,7 @@ class STTOptions:
     spoken_punctuation: bool
     model: SpeechModels | str
     sample_rate: int
+    min_confidence_threshold: float
     keywords: NotGivenOr[list[tuple[str, float]]] = NOT_GIVEN
 
     def build_adaptation(self) -> cloud_speech.SpeechAdaptation | None:
@@ -98,9 +99,11 @@ class STT(stt.STT):
         model: SpeechModels | str = "latest_long",
         location: str = "global",
         sample_rate: int = 16000,
+        min_confidence_threshold: float = _default_min_confidence,
         credentials_info: NotGivenOr[dict] = NOT_GIVEN,
         credentials_file: NotGivenOr[str] = NOT_GIVEN,
         keywords: NotGivenOr[list[tuple[str, float]]] = NOT_GIVEN,
+        use_streaming: NotGivenOr[bool] = NOT_GIVEN,
     ):
         """
         Create a new instance of Google STT.
@@ -118,11 +121,18 @@ class STT(stt.STT):
             model(SpeechModels): the model to use for recognition default: "latest_long"
             location(str): the location to use for recognition default: "global"
             sample_rate(int): the sample rate of the audio default: 16000
+            min_confidence_threshold(float): minimum confidence threshold for recognition
+                (default: 0.65)
             credentials_info(dict): the credentials info to use for recognition (default: None)
             credentials_file(str): the credentials file to use for recognition (default: None)
             keywords(List[tuple[str, float]]): list of keywords to recognize (default: None)
+            use_streaming(bool): whether to use streaming for recognition (default: True)
         """
-        super().__init__(capabilities=stt.STTCapabilities(streaming=True, interim_results=True))
+        if not is_given(use_streaming):
+            use_streaming = True
+        super().__init__(
+            capabilities=stt.STTCapabilities(streaming=use_streaming, interim_results=True)
+        )
 
         self._location = location
         self._credentials_info = credentials_info
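A hedged usage sketch of the two new constructor options (the values are illustrative):

```python
from livekit.plugins import google

stt = google.STT(
    model="latest_long",
    # New in 1.0.20: responses whose averaged confidence falls below this are dropped.
    min_confidence_threshold=0.5,
    # New in 1.0.20: advertise a non-streaming capability instead of streaming recognize.
    use_streaming=False,
)
```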
@@ -149,6 +159,7 @@ class STT(stt.STT):
             spoken_punctuation=spoken_punctuation,
             model=model,
             sample_rate=sample_rate,
+            min_confidence_threshold=min_confidence_threshold,
             keywords=keywords,
         )
         self._streams = weakref.WeakSet[SpeechStream]()
@@ -246,7 +257,7 @@ class STT(stt.STT):
         except DeadlineExceeded:
             raise APITimeoutError() from None
         except GoogleAPICallError as e:
-            raise APIStatusError(e.message, status_code=e.code or -1) from e
+            raise APIStatusError(f"{e.message} {e.details}", status_code=e.code or -1) from e
         except Exception as e:
             raise APIConnectionError() from e
 
@@ -343,6 +354,7 @@ class SpeechStream(stt.SpeechStream):
         punctuate: NotGivenOr[bool] = NOT_GIVEN,
         spoken_punctuation: NotGivenOr[bool] = NOT_GIVEN,
         model: NotGivenOr[SpeechModels] = NOT_GIVEN,
+        min_confidence_threshold: NotGivenOr[float] = NOT_GIVEN,
         keywords: NotGivenOr[list[tuple[str, float]]] = NOT_GIVEN,
     ):
         if is_given(languages):
@@ -359,6 +371,8 @@ class SpeechStream(stt.SpeechStream):
             self._config.spoken_punctuation = spoken_punctuation
         if is_given(model):
             self._config.model = model
+        if is_given(min_confidence_threshold):
+            self._config.min_confidence_threshold = min_confidence_threshold
         if is_given(keywords):
             self._config.keywords = keywords
 
@@ -405,7 +419,10 @@ class SpeechStream(stt.SpeechStream):
                     == cloud_speech.StreamingRecognizeResponse.SpeechEventType.SPEECH_EVENT_TYPE_UNSPECIFIED  # noqa: E501
                 ):
                     result = resp.results[0]
-                    speech_data = _streaming_recognize_response_to_speech_data(resp)
+                    speech_data = _streaming_recognize_response_to_speech_data(
+                        resp,
+                        min_confidence_threshold=self._config.min_confidence_threshold,
+                    )
                     if speech_data is None:
                         continue
 
@@ -461,6 +478,7 @@ class SpeechStream(stt.SpeechStream):
                     features=cloud_speech.RecognitionFeatures(
                         enable_automatic_punctuation=self._config.punctuate,
                         enable_word_time_offsets=True,
+                        enable_spoken_punctuation=self._config.spoken_punctuation,
                     ),
                 ),
                 streaming_features=cloud_speech.StreamingRecognitionFeatures(
@@ -494,7 +512,12 @@ class SpeechStream(stt.SpeechStream):
         except DeadlineExceeded:
             raise APITimeoutError() from None
         except GoogleAPICallError as e:
-            raise APIStatusError(e.message, status_code=e.code or -1) from e
+            if e.code == 409:
+                logger.debug("stream timed out, restarting.")
+            else:
+                raise APIStatusError(
+                    f"{e.message} {e.details}", status_code=e.code or -1
+                ) from e
         except Exception as e:
             raise APIConnectionError() from e
 
@@ -530,6 +553,8 @@ def _recognize_response_to_speech_event(
 
 def _streaming_recognize_response_to_speech_data(
     resp: cloud_speech.StreamingRecognizeResponse,
+    *,
+    min_confidence_threshold: float,
 ) -> stt.SpeechData | None:
     text = ""
     confidence = 0.0
@@ -542,7 +567,7 @@ def _streaming_recognize_response_to_speech_data(
     confidence /= len(resp.results)
     lg = resp.results[0].language_code
 
-    if confidence < _min_confidence:
+    if confidence < min_confidence_threshold:
         return None
     if text == "":
         return None
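The gate that `min_confidence_threshold` parameterizes averages per-result confidence and drops the whole response below the threshold; a standalone sketch of that logic:

```python
def passes_confidence_gate(confidences: list[float], min_confidence_threshold: float) -> bool:
    # Mirrors _streaming_recognize_response_to_speech_data: confidence is
    # averaged across resp.results, and the response is discarded when the
    # mean falls below the (now configurable) threshold.
    if not confidences:
        return False
    return sum(confidences) / len(confidences) >= min_confidence_threshold
```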
livekit/plugins/google/tts.py
CHANGED
@@ -56,6 +56,7 @@ class TTS(tts.TTS):
         effects_profile_id: str = "",
         speaking_rate: float = 1.0,
         location: str = "global",
+        audio_encoding: texttospeech.AudioEncoding = texttospeech.AudioEncoding.PCM,
         credentials_info: NotGivenOr[dict] = NOT_GIVEN,
         credentials_file: NotGivenOr[str] = NOT_GIVEN,
     ) -> None:
@@ -105,7 +106,7 @@ class TTS(tts.TTS):
         self._opts = _TTSOptions(
             voice=voice_params,
             audio_config=texttospeech.AudioConfig(
-                audio_encoding=
+                audio_encoding=audio_encoding,
                 sample_rate_hertz=sample_rate,
                 pitch=pitch,
                 effects_profile_id=effects_profile_id,
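With `audio_encoding` exposed, callers can select any Cloud TTS encoding rather than the previously hardwired one. A hedged example (MP3 is a standard `texttospeech.AudioEncoding` member; whether a given encoding suits the downstream audio pipeline is up to the caller):

```python
from google.cloud import texttospeech
from livekit.plugins import google

tts = google.TTS(audio_encoding=texttospeech.AudioEncoding.MP3)
```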
livekit/plugins/google/utils.py
CHANGED
@@ -20,17 +20,21 @@ def to_fnc_ctx(fncs: list[FunctionTool]) -> list[types.FunctionDeclaration]:
     return [_build_gemini_fnc(fnc) for fnc in fncs]
 
 
-def get_tool_results_for_realtime(chat_ctx: llm.ChatContext) -> types.LiveClientToolResponse | None:
+def get_tool_results_for_realtime(
+    chat_ctx: llm.ChatContext, *, vertexai: bool = False
+) -> types.LiveClientToolResponse | None:
     function_responses: list[types.FunctionResponse] = []
     for msg in chat_ctx.items:
         if msg.type == "function_call_output":
-            function_responses.append(
-                types.FunctionResponse(
-                    id=msg.call_id,
-                    name=msg.name,
-                    response={"output": msg.output},
-                )
+            res = types.FunctionResponse(
+                name=msg.name,
+                response={"output": msg.output},
             )
+            if not vertexai:
+                # vertexai does not support id in FunctionResponse
+                # see: https://github.com/googleapis/python-genai/blob/85e00bc/google/genai/_live_converters.py#L1435
+                res.id = msg.call_id
+            function_responses.append(res)
     return (
         types.LiveClientToolResponse(function_responses=function_responses)
         if function_responses
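The only effect of the new `vertexai` flag is whether `FunctionResponse.id` is populated. A sketch with placeholder values (`get_weather` and `call_123` are made up):

```python
from google.genai import types

# Gemini API path: the tool call id is attached.
res = types.FunctionResponse(name="get_weather", response={"output": "sunny"})
res.id = "call_123"

# Vertex AI path: the same response, but `id` stays unset, since the live
# converter linked in the comment above rejects it.
vertex_res = types.FunctionResponse(name="get_weather", response={"output": "sunny"})
```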
@@ -39,7 +43,10 @@ def get_tool_results_for_realtime(chat_ctx: llm.ChatContext) -> types.LiveClientToolResponse | None:
 
 
 def to_chat_ctx(
-    chat_ctx: llm.ChatContext, cache_key: Any, ignore_functions: bool = False
+    chat_ctx: llm.ChatContext,
+    cache_key: Any,
+    ignore_functions: bool = False,
+    generate: bool = False,
 ) -> tuple[list[types.Content], types.Content | None]:
     turns: list[types.Content] = []
     system_instruction: types.Content | None = None
@@ -99,10 +106,9 @@ def to_chat_ctx(
     if current_role is not None and parts:
         turns.append(types.Content(role=current_role, parts=parts))
 
-    #
-
-
-    # turns.append(types.Content(role="user", parts=[types.Part(text=".")]))
+    # Gemini requires the last message to end with user's turn before they can generate
+    if generate and current_role != "user":
+        turns.append(types.Content(role="user", parts=[types.Part(text=".")]))
 
     return turns, system_instruction
 
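The `generate` flag matters when the converted history ends on a model turn: Gemini requires the final turn to be the user's before it will generate, so a placeholder "." user turn is appended. A sketch of the observable effect (hypothetical one-message history):

```python
from google.genai import types

turns = [types.Content(role="model", parts=[types.Part(text="Hi! How can I help?")])]

# With generate=True, to_chat_ctx now appends a placeholder user turn,
# making the request valid for generation:
turns.append(types.Content(role="user", parts=[types.Part(text=".")]))
```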
@@ -173,6 +179,15 @@ class _GeminiJsonSchema:
         schema.pop("title", None)
         schema.pop("default", None)
         schema.pop("additionalProperties", None)
+        schema.pop("$schema", None)
+
+        if (const := schema.pop("const", None)) is not None:
+            # Gemini doesn't support const, but it does support enum with a single value
+            schema["enum"] = [const]
+
+        schema.pop("discriminator", None)
+        schema.pop("examples", None)
+
         if ref := schema.pop("$ref", None):
             key = re.sub(r"^#/\$defs/", "", ref)
             if key in refs_stack:
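A before/after sketch of the extended scrubbing, using a hypothetical JSON Schema fragment such as one produced for a `Literal["celsius"]` field:

```python
schema = {
    "$schema": "http://json-schema.org/draft-07/schema#",
    "type": "string",
    "const": "celsius",
    "examples": ["celsius", "fahrenheit"],
}

# After the added scrubbing, "$schema"/"examples"/"discriminator" are dropped
# and `const` is rewritten as a single-value enum, which Gemini does accept:
# {"type": "string", "enum": ["celsius"]}
```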
livekit_plugins_google-1.0.20.dist-info/METADATA
ADDED
@@ -0,0 +1,47 @@
+Metadata-Version: 2.4
+Name: livekit-plugins-google
+Version: 1.0.20
+Summary: Agent Framework plugin for services from Google Cloud
+Project-URL: Documentation, https://docs.livekit.io
+Project-URL: Website, https://livekit.io/
+Project-URL: Source, https://github.com/livekit/agents
+Author: LiveKit
+License-Expression: Apache-2.0
+Keywords: audio,livekit,realtime,video,webrtc
+Classifier: Intended Audience :: Developers
+Classifier: License :: OSI Approved :: Apache Software License
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3 :: Only
+Classifier: Programming Language :: Python :: 3.9
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Topic :: Multimedia :: Sound/Audio
+Classifier: Topic :: Multimedia :: Video
+Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
+Requires-Python: >=3.9.0
+Requires-Dist: google-auth<3,>=2
+Requires-Dist: google-cloud-speech<3,>=2
+Requires-Dist: google-cloud-texttospeech<3,>=2
+Requires-Dist: google-genai>=1.12.1
+Requires-Dist: livekit-agents>=1.0.20
+Description-Content-Type: text/markdown
+
+# Google AI plugin for LiveKit Agents
+
+Support for Gemini, Gemini Live, Cloud Speech-to-Text, and Cloud Text-to-Speech.
+
+See [https://docs.livekit.io/agents/integrations/google/](https://docs.livekit.io/agents/integrations/google/) for more information.
+
+## Installation
+
+```bash
+pip install livekit-plugins-google
+```
+
+## Pre-requisites
+
+For credentials, you'll need a Google Cloud account and obtain the correct credentials. Credentials can be passed directly or via Application Default Credentials as specified in [How Application Default Credentials works](https://cloud.google.com/docs/authentication/application-default-credentials).
+
+To use the STT and TTS API, you'll need to enable the respective services for your Google Cloud project.
+
+- Cloud Speech-to-Text API
+- Cloud Text-to-Speech API
livekit_plugins_google-1.0.20.dist-info/RECORD
ADDED
@@ -0,0 +1,16 @@
+livekit/plugins/google/__init__.py,sha256=xain2qUzU-YWhYWsLBkW8Q-szV-htpnzHTqymMPo-j0,1364
+livekit/plugins/google/llm.py,sha256=m_lRoUw4RIO1d-LtNYugl99LUNcA1y4NQ17wX7Vv5j0,16189
+livekit/plugins/google/log.py,sha256=GI3YWN5YzrafnUccljzPRS_ZALkMNk1i21IRnTl2vNA,69
+livekit/plugins/google/models.py,sha256=SGjAumdDK97NNLwMFcqZdKR68f1NoGB2Rk1UP2-imG0,1457
+livekit/plugins/google/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+livekit/plugins/google/stt.py,sha256=2jk-1fHiBT8UW_n3CZsIEdMp2iBnUAlTnmefdUd8rAM,23620
+livekit/plugins/google/tts.py,sha256=29R0ieV5sRPBf5Yi0SPFQk7ZZMbELF30bIL9K_j_Wcg,9100
+livekit/plugins/google/utils.py,sha256=zPzmnR-Rs2I87mT_k5S-PVbbuJMH8S-Hp5QcM4wv8vA,10067
+livekit/plugins/google/version.py,sha256=t4KmPVTpEy1pOJ2GRCA-GNJfCQq_-zHNDBxGj4GKfVk,601
+livekit/plugins/google/beta/__init__.py,sha256=5PnoG3Ux24bjzMSzmTeSVljE9EINivGcbWUEV6egGnM,216
+livekit/plugins/google/beta/realtime/__init__.py,sha256=_fW2NMN22F-hnQ4xAJ_g5lPbR7CvM_xXzSWlUQY-E-U,188
+livekit/plugins/google/beta/realtime/api_proto.py,sha256=Fyrejs3SG0EjOPCCFLEnWXKEUxCff47PMWk2VsKJm5E,594
+livekit/plugins/google/beta/realtime/realtime_api.py,sha256=K_YD2CND3PMGV7c3gJY2UdReeLfsOPtIWDys5EU2T_A,31699
+livekit_plugins_google-1.0.20.dist-info/METADATA,sha256=govmSaj6few3t11vreVNKlH9Ki2YzbRGnN3b3il2f20,1905
+livekit_plugins_google-1.0.20.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+livekit_plugins_google-1.0.20.dist-info/RECORD,,
livekit_plugins_google-1.0.18.dist-info/METADATA
REMOVED
@@ -1,99 +0,0 @@
-Metadata-Version: 2.4
-Name: livekit-plugins-google
-Version: 1.0.18
-Summary: Agent Framework plugin for services from Google Cloud
-Project-URL: Documentation, https://docs.livekit.io
-Project-URL: Website, https://livekit.io/
-Project-URL: Source, https://github.com/livekit/agents
-Author: LiveKit
-License-Expression: Apache-2.0
-Keywords: audio,livekit,realtime,video,webrtc
-Classifier: Intended Audience :: Developers
-Classifier: License :: OSI Approved :: Apache Software License
-Classifier: Programming Language :: Python :: 3
-Classifier: Programming Language :: Python :: 3 :: Only
-Classifier: Programming Language :: Python :: 3.9
-Classifier: Programming Language :: Python :: 3.10
-Classifier: Topic :: Multimedia :: Sound/Audio
-Classifier: Topic :: Multimedia :: Video
-Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
-Requires-Python: >=3.9.0
-Requires-Dist: google-auth<3,>=2
-Requires-Dist: google-cloud-speech<3,>=2
-Requires-Dist: google-cloud-texttospeech<3,>=2
-Requires-Dist: google-genai>=1.12.1
-Requires-Dist: livekit-agents>=1.0.18
-Description-Content-Type: text/markdown
-
-# LiveKit Plugins Google
-
-Agent Framework plugin for services from Google Cloud. Currently supporting Google's [Speech-to-Text](https://cloud.google.com/speech-to-text) API.
-
-## Installation
-
-```bash
-pip install livekit-plugins-google
-```
-
-## Pre-requisites
-
-For credentials, you'll need a Google Cloud account and obtain the correct credentials. Credentials can be passed directly or via Application Default Credentials as specified in [How Application Default Credentials works](https://cloud.google.com/docs/authentication/application-default-credentials).
-
-To use the STT and TTS API, you'll need to enable the respective services for your Google Cloud project.
-
-- Cloud Speech-to-Text API
-- Cloud Text-to-Speech API
-
-
-## Gemini Multimodal Live
-
-Gemini Multimodal Live can be used with the `MultimodalAgent` class. See examples/multimodal_agent/gemini_agent.py for an example.
-
-### Live Video Input (experimental)
-
-You can push video frames to your Gemini Multimodal Live session alongside the audio automatically handled by the `MultimodalAgent`. The basic approach is to subscribe to the video track, create a video stream, sample frames at a suitable frame rate, and push them into the RealtimeSession:
-
-```
-# Make sure you subscribe to audio and video tracks
-await ctx.connect(auto_subscribe=AutoSubscribe.SUBSCRIBE_ALL)
-
-# Create your RealtimeModel and store a reference
-model = google.beta.realtime.RealtimeModel(
-    # ...
-)
-
-# Create your MultimodalAgent as usual
-agent = MultimodalAgent(
-    model=model,
-    # ...
-)
-
-# Async method to process the video track and push frames to Gemini
-async def _process_video_track(self, track: Track):
-    video_stream = VideoStream(track)
-    last_frame_time = 0
-
-    async for event in video_stream:
-        current_time = asyncio.get_event_loop().time()
-
-        # Sample at 1 FPS
-        if current_time - last_frame_time < 1.0:
-            continue
-
-        last_frame_time = current_time
-        frame = event.frame
-
-        # Push the frame into the RealtimeSession
-        model.sessions[0].push_video(frame)
-
-    await video_stream.aclose()
-
-# Subscribe to new tracks and process them
-@ctx.room.on("track_subscribed")
-def _on_track_subscribed(track: Track, pub, participant):
-    if track.kind == TrackKind.KIND_VIDEO:
-        asyncio.create_task(self._process_video_track(track))
-```
-
-
-
livekit_plugins_google-1.0.18.dist-info/RECORD
REMOVED
@@ -1,16 +0,0 @@
-livekit/plugins/google/__init__.py,sha256=e_kSlFNmKhyyeliz7f4WOKc_Y0-y39QjO5nCWuguhss,1171
-livekit/plugins/google/llm.py,sha256=SqNGg6-wlrIUo9uaismP7QW5XztkXyDivJXLVgOIZMI,16175
-livekit/plugins/google/log.py,sha256=GI3YWN5YzrafnUccljzPRS_ZALkMNk1i21IRnTl2vNA,69
-livekit/plugins/google/models.py,sha256=SGjAumdDK97NNLwMFcqZdKR68f1NoGB2Rk1UP2-imG0,1457
-livekit/plugins/google/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-livekit/plugins/google/stt.py,sha256=AG_lh2fuuduJi0jFbA_QKFXLJ6NUdF1W_FfkLUJML_Q,22413
-livekit/plugins/google/tts.py,sha256=fmQwW9a1kPsEsrTvIo8fqw479RxWEx0SIc3oTVaj41U,9031
-livekit/plugins/google/utils.py,sha256=TjjTwMbdJdxr3bZjUXxs-J_fipTTM00goW2-d9KWX6w,9582
-livekit/plugins/google/version.py,sha256=cnPu9FVKZV9tFmmz7lEvftrO3B_nWJVFghi3j6UcJLs,601
-livekit/plugins/google/beta/__init__.py,sha256=AxRYc7NGG62Tv1MmcZVCDHNvlhbC86hM-_yP01Qb28k,47
-livekit/plugins/google/beta/realtime/__init__.py,sha256=_fW2NMN22F-hnQ4xAJ_g5lPbR7CvM_xXzSWlUQY-E-U,188
-livekit/plugins/google/beta/realtime/api_proto.py,sha256=Fyrejs3SG0EjOPCCFLEnWXKEUxCff47PMWk2VsKJm5E,594
-livekit/plugins/google/beta/realtime/realtime_api.py,sha256=sXp2oHnTlHrAp5wFmcXj0bRtQKixBYedfbufcbjVHxk,30897
-livekit_plugins_google-1.0.18.dist-info/METADATA,sha256=Vqt0FoqibcKzX_jFXlyFkn-mT7iPC16JlH61VS0fbuw,3492
-livekit_plugins_google-1.0.18.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
-livekit_plugins_google-1.0.18.dist-info/RECORD,,
{livekit_plugins_google-1.0.18.dist-info → livekit_plugins_google-1.0.20.dist-info}/WHEEL
RENAMED
File without changes