livekit-plugins-google 0.8.1__py3-none-any.whl → 0.9.1__py3-none-any.whl

This diff shows the contents of two publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.

livekit/plugins/google/__init__.py
@@ -12,12 +12,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from . import beta
 from .stt import STT, SpeechStream
 from .tts import TTS
 from .version import __version__
 
-__all__ = ["STT", "TTS", "SpeechStream", "__version__"]
-
+__all__ = ["STT", "TTS", "SpeechStream", "__version__", "beta"]
 from livekit.agents import Plugin
 
 from .log import logger

livekit/plugins/google/beta/__init__.py (new file)
@@ -0,0 +1,3 @@
+from . import realtime
+
+__all__ = ["realtime"]

livekit/plugins/google/beta/realtime/__init__.py (new file)
@@ -0,0 +1,15 @@
+from .api_proto import (
+    ClientEvents,
+    LiveAPIModels,
+    ResponseModality,
+    Voice,
+)
+from .realtime_api import RealtimeModel
+
+__all__ = [
+    "RealtimeModel",
+    "ClientEvents",
+    "LiveAPIModels",
+    "ResponseModality",
+    "Voice",
+]

livekit/plugins/google/beta/realtime/api_proto.py (new file)
@@ -0,0 +1,79 @@
+from __future__ import annotations
+
+import inspect
+from typing import Any, Dict, List, Literal, Sequence, Union
+
+from google.genai import types  # type: ignore
+
+LiveAPIModels = Literal["gemini-2.0-flash-exp"]
+
+Voice = Literal["Puck", "Charon", "Kore", "Fenrir", "Aoede"]
+ResponseModality = Literal["AUDIO", "TEXT"]
+
+
+ClientEvents = Union[
+    types.ContentListUnion,
+    types.ContentListUnionDict,
+    types.LiveClientContentOrDict,
+    types.LiveClientRealtimeInput,
+    types.LiveClientRealtimeInputOrDict,
+    types.LiveClientToolResponseOrDict,
+    types.FunctionResponseOrDict,
+    Sequence[types.FunctionResponseOrDict],
+]
+
+
+JSON_SCHEMA_TYPE_MAP = {
+    str: "string",
+    int: "integer",
+    float: "number",
+    bool: "boolean",
+    dict: "object",
+    list: "array",
+}
+
+
+def _build_parameters(arguments: Dict[str, Any]) -> types.SchemaDict:
+    properties: Dict[str, types.SchemaDict] = {}
+    required: List[str] = []
+
+    for arg_name, arg_info in arguments.items():
+        py_type = arg_info.type
+        if py_type not in JSON_SCHEMA_TYPE_MAP:
+            raise ValueError(f"Unsupported type: {py_type}")
+
+        prop: types.SchemaDict = {
+            "type": JSON_SCHEMA_TYPE_MAP[py_type],
+            "description": arg_info.description,
+        }
+
+        if arg_info.choices:
+            prop["enum"] = arg_info.choices
+
+        properties[arg_name] = prop
+
+        if arg_info.default is inspect.Parameter.empty:
+            required.append(arg_name)
+
+    parameters: types.SchemaDict = {"type": "object", "properties": properties}
+
+    if required:
+        parameters["required"] = required
+
+    return parameters
+
+
+def _build_tools(fnc_ctx: Any) -> List[types.FunctionDeclarationDict]:
+    function_declarations: List[types.FunctionDeclarationDict] = []
+    for fnc_info in fnc_ctx.ai_functions.values():
+        parameters = _build_parameters(fnc_info.arguments)
+
+        func_decl: types.FunctionDeclarationDict = {
+            "name": fnc_info.name,
+            "description": fnc_info.description,
+            "parameters": parameters,
+        }
+
+        function_declarations.append(func_decl)
+
+    return function_declarations
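
For orientation, a small sketch of the JSON-schema shape `_build_parameters` produces. The `_ArgInfo` stand-in below is hypothetical and only mimics the attributes the function reads (`type`, `description`, `choices`, `default`); the real objects come from livekit-agents' function context:

    import inspect
    from dataclasses import dataclass
    from typing import Any

    @dataclass
    class _ArgInfo:  # hypothetical stand-in for livekit-agents argument metadata
        type: type
        description: str
        choices: list | None = None
        default: Any = inspect.Parameter.empty

    arguments = {
        "location": _ArgInfo(str, "City to look up"),
        "unit": _ArgInfo(str, "Temperature unit", choices=["C", "F"], default="C"),
    }

    # Per the logic above, _build_parameters(arguments) would return:
    # {
    #     "type": "object",
    #     "properties": {
    #         "location": {"type": "string", "description": "City to look up"},
    #         "unit": {"type": "string", "description": "Temperature unit",
    #                  "enum": ["C", "F"]},
    #     },
    #     "required": ["location"],  # only "location" lacks a default
    # }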

livekit/plugins/google/beta/realtime/realtime_api.py (new file)
@@ -0,0 +1,424 @@
+from __future__ import annotations
+
+import asyncio
+import base64
+import json
+import os
+from dataclasses import dataclass
+from typing import AsyncIterable, Literal
+
+from livekit import rtc
+from livekit.agents import llm, utils
+from livekit.agents.llm.function_context import _create_ai_function_info
+
+from google import genai  # type: ignore
+from google.genai.types import (  # type: ignore
+    FunctionResponse,
+    GenerationConfigDict,
+    LiveClientToolResponse,
+    LiveConnectConfigDict,
+    PrebuiltVoiceConfig,
+    SpeechConfig,
+    VoiceConfig,
+)
+
+from ...log import logger
+from .api_proto import (
+    ClientEvents,
+    LiveAPIModels,
+    ResponseModality,
+    Voice,
+    _build_tools,
+)
+
+EventTypes = Literal[
+    "start_session",
+    "input_speech_started",
+    "response_content_added",
+    "response_content_done",
+    "function_calls_collected",
+    "function_calls_finished",
+    "function_calls_cancelled",
+]
+
+
+@dataclass
+class GeminiContent:
+    response_id: str
+    item_id: str
+    output_index: int
+    content_index: int
+    text: str
+    audio: list[rtc.AudioFrame]
+    text_stream: AsyncIterable[str]
+    audio_stream: AsyncIterable[rtc.AudioFrame]
+    content_type: Literal["text", "audio"]
+
+
+@dataclass
+class Capabilities:
+    supports_truncate: bool
+
+
+@dataclass
+class ModelOptions:
+    model: LiveAPIModels | str
+    api_key: str | None
+    voice: Voice | str
+    response_modalities: ResponseModality
+    vertexai: bool
+    project: str | None
+    location: str | None
+    candidate_count: int
+    temperature: float | None
+    max_output_tokens: int | None
+    top_p: float | None
+    top_k: int | None
+    presence_penalty: float | None
+    frequency_penalty: float | None
+    instructions: str
+
+
+class RealtimeModel:
+    def __init__(
+        self,
+        *,
+        instructions: str = "",
+        model: LiveAPIModels | str = "gemini-2.0-flash-exp",
+        api_key: str | None = None,
+        voice: Voice | str = "Puck",
+        modalities: ResponseModality = "AUDIO",
+        vertexai: bool = False,
+        project: str | None = None,
+        location: str | None = None,
+        candidate_count: int = 1,
+        temperature: float | None = None,
+        max_output_tokens: int | None = None,
+        top_p: float | None = None,
+        top_k: int | None = None,
+        presence_penalty: float | None = None,
+        frequency_penalty: float | None = None,
+        loop: asyncio.AbstractEventLoop | None = None,
+    ):
+        """
+        Initializes a RealtimeModel instance for interacting with Google's Realtime API.
+
+        Args:
+            instructions (str, optional): Initial system instructions for the model. Defaults to "".
+            api_key (str or None, optional): Google Gemini API key. If None, will attempt to read from the environment variable GOOGLE_API_KEY.
+            modalities (ResponseModality, optional): The response modality to use, either "AUDIO" or "TEXT". Defaults to "AUDIO".
+            model (str or None, optional): The name of the model to use. Defaults to "gemini-2.0-flash-exp".
+            voice (api_proto.Voice, optional): Voice setting for audio outputs. Defaults to "Puck".
+            temperature (float or None, optional): Sampling temperature for response generation. Defaults to None.
+            vertexai (bool, optional): Whether to use VertexAI for the API. Defaults to False.
+            project (str or None, optional): The project to use for the API. Defaults to None. (for vertexai)
+            location (str or None, optional): The location to use for the API. Defaults to None. (for vertexai)
+            candidate_count (int, optional): The number of candidate responses to generate. Defaults to 1.
+            top_p (float or None, optional): The top-p value for response generation.
+            top_k (int or None, optional): The top-k value for response generation.
+            presence_penalty (float or None, optional): The presence penalty for response generation.
+            frequency_penalty (float or None, optional): The frequency penalty for response generation.
+            loop (asyncio.AbstractEventLoop or None, optional): Event loop to use for async operations. If None, the current event loop is used.
+
+        Raises:
+            ValueError: If the API key is not provided and cannot be found in environment variables.
+        """
+        super().__init__()
+        self._capabilities = Capabilities(
+            supports_truncate=False,
+        )
+        self._model = model
+        self._loop = loop or asyncio.get_event_loop()
+        self._api_key = api_key or os.environ.get("GOOGLE_API_KEY")
+        self._vertexai = vertexai
+        self._project_id = project or os.environ.get("GOOGLE_PROJECT")
+        self._location = location or os.environ.get("GOOGLE_LOCATION")
+        if self._api_key is None and not self._vertexai:
+            raise ValueError("GOOGLE_API_KEY is not set")
+
+        self._rt_sessions: list[GeminiRealtimeSession] = []
+        self._opts = ModelOptions(
+            model=model,
+            api_key=api_key,
+            voice=voice,
+            response_modalities=modalities,
+            vertexai=vertexai,
+            project=project,
+            location=location,
+            candidate_count=candidate_count,
+            temperature=temperature,
+            max_output_tokens=max_output_tokens,
+            top_p=top_p,
+            top_k=top_k,
+            presence_penalty=presence_penalty,
+            frequency_penalty=frequency_penalty,
+            instructions=instructions,
+        )
+
+    @property
+    def sessions(self) -> list[GeminiRealtimeSession]:
+        return self._rt_sessions
+
+    @property
+    def capabilities(self) -> Capabilities:
+        return self._capabilities
+
+    def session(
+        self,
+        *,
+        chat_ctx: llm.ChatContext | None = None,
+        fnc_ctx: llm.FunctionContext | None = None,
+    ) -> GeminiRealtimeSession:
+        session = GeminiRealtimeSession(
+            opts=self._opts,
+            chat_ctx=chat_ctx or llm.ChatContext(),
+            fnc_ctx=fnc_ctx,
+            loop=self._loop,
+        )
+        self._rt_sessions.append(session)
+
+        return session
+
+    async def aclose(self) -> None:
+        for session in self._rt_sessions:
+            await session.aclose()
+
+
+class GeminiRealtimeSession(utils.EventEmitter[EventTypes]):
+    def __init__(
+        self,
+        *,
+        opts: ModelOptions,
+        chat_ctx: llm.ChatContext,
+        fnc_ctx: llm.FunctionContext | None,
+        loop: asyncio.AbstractEventLoop,
+    ):
+        """
+        Initializes a GeminiRealtimeSession instance for interacting with Google's Realtime API.
+
+        Args:
+            opts (ModelOptions): The model options for the session.
+            chat_ctx (llm.ChatContext): The chat context for the session.
+            fnc_ctx (llm.FunctionContext or None): The function context for the session.
+            loop (asyncio.AbstractEventLoop): The event loop for the session.
+        """
+        super().__init__()
+        self._loop = loop
+        self._opts = opts
+        self._chat_ctx = chat_ctx
+        self._fnc_ctx = fnc_ctx
+        self._fnc_tasks = utils.aio.TaskSet()
+
+        tools = []
+        if self._fnc_ctx is not None:
+            functions = _build_tools(self._fnc_ctx)
+            tools.append({"function_declarations": functions})
+
+        self._config = LiveConnectConfigDict(
+            model=self._opts.model,
+            response_modalities=self._opts.response_modalities,
+            generation_config=GenerationConfigDict(
+                candidate_count=self._opts.candidate_count,
+                temperature=self._opts.temperature,
+                max_output_tokens=self._opts.max_output_tokens,
+                top_p=self._opts.top_p,
+                top_k=self._opts.top_k,
+                presence_penalty=self._opts.presence_penalty,
+                frequency_penalty=self._opts.frequency_penalty,
+            ),
+            system_instruction=self._opts.instructions,
+            speech_config=SpeechConfig(
+                voice_config=VoiceConfig(
+                    prebuilt_voice_config=PrebuiltVoiceConfig(
+                        voice_name=self._opts.voice
+                    )
+                )
+            ),
+            tools=tools,
+        )
+        self._client = genai.Client(
+            http_options={"api_version": "v1alpha"},
+            api_key=self._opts.api_key,
+            vertexai=self._opts.vertexai,
+            project=self._opts.project,
+            location=self._opts.location,
+        )
+        self._main_atask = asyncio.create_task(
+            self._main_task(), name="gemini-realtime-session"
+        )
+        # dummy task to wait for the session to be initialized # TODO: sync chat ctx
+        self._init_sync_task = asyncio.create_task(
+            asyncio.sleep(0), name="gemini-realtime-session-init"
+        )
+        self._send_ch = utils.aio.Chan[ClientEvents]()
+        self._active_response_id = None
+
+    async def aclose(self) -> None:
+        if self._send_ch.closed:
+            return
+
+        self._send_ch.close()
+        await self._main_atask
+
+    @property
+    def fnc_ctx(self) -> llm.FunctionContext | None:
+        return self._fnc_ctx
+
+    @fnc_ctx.setter
+    def fnc_ctx(self, value: llm.FunctionContext | None) -> None:
+        self._fnc_ctx = value
+
+    def _push_audio(self, frame: rtc.AudioFrame) -> None:
+        data = base64.b64encode(frame.data).decode("utf-8")
+        self._queue_msg({"mime_type": "audio/pcm", "data": data})
+
+    def _queue_msg(self, msg: dict) -> None:
+        self._send_ch.send_nowait(msg)
+
+    def chat_ctx_copy(self) -> llm.ChatContext:
+        return self._chat_ctx.copy()
+
+    async def set_chat_ctx(self, ctx: llm.ChatContext) -> None:
+        self._chat_ctx = ctx.copy()
+
+    @utils.log_exceptions(logger=logger)
+    async def _main_task(self):
+        @utils.log_exceptions(logger=logger)
+        async def _send_task():
+            async for msg in self._send_ch:
+                await self._session.send(msg)
+
+            await self._session.send(".", end_of_turn=True)
+
+        @utils.log_exceptions(logger=logger)
+        async def _recv_task():
+            while True:
+                async for response in self._session.receive():
+                    if self._active_response_id is None:
+                        self._active_response_id = utils.shortuuid()
+                        text_stream = utils.aio.Chan[str]()
+                        audio_stream = utils.aio.Chan[rtc.AudioFrame]()
+                        content = GeminiContent(
+                            response_id=self._active_response_id,
+                            item_id=self._active_response_id,
+                            output_index=0,
+                            content_index=0,
+                            text="",
+                            audio=[],
+                            text_stream=text_stream,
+                            audio_stream=audio_stream,
+                            content_type=self._opts.response_modalities,
+                        )
+                        self.emit("response_content_added", content)
+
+                    server_content = response.server_content
+                    if server_content:
+                        model_turn = server_content.model_turn
+                        if model_turn:
+                            for part in model_turn.parts:
+                                if part.text:
+                                    content.text_stream.send_nowait(part.text)
+                                if part.inline_data:
+                                    frame = rtc.AudioFrame(
+                                        data=part.inline_data.data,
+                                        sample_rate=24000,
+                                        num_channels=1,
+                                        samples_per_channel=len(part.inline_data.data)
+                                        // 2,
+                                    )
+                                    content.audio_stream.send_nowait(frame)
+
+                        if server_content.interrupted or server_content.turn_complete:
+                            for stream in (content.text_stream, content.audio_stream):
+                                if isinstance(stream, utils.aio.Chan):
+                                    stream.close()
+
+                            if server_content.interrupted:
+                                self.emit("input_speech_started")
+                            elif server_content.turn_complete:
+                                self.emit("response_content_done", content)
+
+                            self._active_response_id = None
+
+                    if response.tool_call:
+                        if self._fnc_ctx is None:
+                            raise ValueError("Function context is not set")
+                        fnc_calls = []
+                        for fnc_call in response.tool_call.function_calls:
+                            fnc_call_info = _create_ai_function_info(
+                                self._fnc_ctx,
+                                fnc_call.id,
+                                fnc_call.name,
+                                json.dumps(fnc_call.args),
+                            )
+                            fnc_calls.append(fnc_call_info)
+
+                        self.emit("function_calls_collected", fnc_calls)
+
+                        for fnc_call_info in fnc_calls:
+                            self._fnc_tasks.create_task(
+                                self._run_fnc_task(fnc_call_info, content.item_id)
+                            )
+
+                    # Handle function call cancellations
+                    if response.tool_call_cancellation:
+                        logger.warning(
+                            "function call cancelled",
+                            extra={
+                                "function_call_ids": response.tool_call_cancellation.function_call_ids,
+                            },
+                        )
+                        self.emit(
+                            "function_calls_cancelled",
+                            response.tool_call_cancellation.function_call_ids,
+                        )
+
+        async with self._client.aio.live.connect(
+            model=self._opts.model, config=self._config
+        ) as session:
+            self._session = session
+            tasks = [
+                asyncio.create_task(_send_task(), name="gemini-realtime-send"),
+                asyncio.create_task(_recv_task(), name="gemini-realtime-recv"),
+            ]
+
+            try:
+                await asyncio.gather(*tasks)
+            finally:
+                await utils.aio.gracefully_cancel(*tasks)
+                await self._session.close()
+
+    @utils.log_exceptions(logger=logger)
+    async def _run_fnc_task(self, fnc_call_info: llm.FunctionCallInfo, item_id: str):
+        logger.debug(
+            "executing ai function",
+            extra={
+                "function": fnc_call_info.function_info.name,
+            },
+        )
+
+        called_fnc = fnc_call_info.execute()
+        try:
+            await called_fnc.task
+        except Exception as e:
+            logger.exception(
+                "error executing ai function",
+                extra={
+                    "function": fnc_call_info.function_info.name,
+                },
+                exc_info=e,
+            )
+        tool_call = llm.ChatMessage.create_tool_from_called_function(called_fnc)
+        if tool_call.content is not None:
+            tool_response = LiveClientToolResponse(
+                function_responses=[
+                    FunctionResponse(
+                        name=tool_call.name,
+                        id=tool_call.tool_call_id,
+                        response={"result": tool_call.content},
+                    )
+                ]
+            )
+            await self._session.send(tool_response)
+
+        self.emit("function_calls_finished", [called_fnc])

livekit/plugins/google/stt.py
@@ -16,6 +16,7 @@ from __future__ import annotations
 
 import asyncio
 import dataclasses
+import time
 import weakref
 from dataclasses import dataclass
 from typing import List, Union
@@ -44,6 +45,10 @@ from .models import SpeechLanguages, SpeechModels
 LgType = Union[SpeechLanguages, str]
 LanguageCode = Union[LgType, List[LgType]]
 
+# Google STT has a timeout of 5 mins, we'll attempt to restart the session
+# before that timeout is reached
+_max_session_duration = 240
+
 
 # This class is only be used internally to encapsulate the options
 @dataclass
@@ -229,8 +234,6 @@ class STT(stt.STT):
             raise APIStatusError(
                 e.message,
                 status_code=e.code or -1,
-                request_id=None,
-                body=None,
             )
         except Exception as e:
             raise APIConnectionError() from e
@@ -278,6 +281,13 @@ class STT(stt.STT):
             self._config.spoken_punctuation = spoken_punctuation
         if model is not None:
             self._config.model = model
+        client = None
+        recognizer = None
+        if location is not None:
+            self._location = location
+            # if location is changed, fetch a new client and recognizer as per the new location
+            client = self._ensure_client()
+            recognizer = self._recognizer
         if keywords is not None:
             self._config.keywords = keywords
 
@@ -289,8 +299,9 @@ class STT(stt.STT):
                 punctuate=punctuate,
                 spoken_punctuation=spoken_punctuation,
                 model=model,
-                location=location,
                 keywords=keywords,
+                client=client,
+                recognizer=recognizer,
             )
 
 
@@ -312,6 +323,7 @@ class SpeechStream(stt.SpeechStream):
         self._recognizer = recognizer
         self._config = config
         self._reconnect_event = asyncio.Event()
+        self._session_connected_at: float = 0
 
     def update_options(
         self,
@@ -322,8 +334,9 @@ class SpeechStream(stt.SpeechStream):
         punctuate: bool | None = None,
         spoken_punctuation: bool | None = None,
         model: SpeechModels | None = None,
-        location: str | None = None,
         keywords: List[tuple[str, float]] | None = None,
+        client: SpeechAsyncClient | None = None,
+        recognizer: str | None = None,
     ):
         if languages is not None:
             if isinstance(languages, str):
@@ -341,13 +354,17 @@ class SpeechStream(stt.SpeechStream):
             self._config.model = model
         if keywords is not None:
             self._config.keywords = keywords
+        if client is not None:
+            self._client = client
+        if recognizer is not None:
+            self._recognizer = recognizer
 
         self._reconnect_event.set()
 
     async def _run(self) -> None:
         # google requires a async generator when calling streaming_recognize
         # this function basically convert the queue into a async generator
-        async def input_generator():
+        async def input_generator(should_stop: asyncio.Event):
             try:
                 # first request should contain the config
                 yield cloud_speech.StreamingRecognizeRequest(
@@ -356,6 +373,12 @@
                 )
 
                 async for frame in self._input_ch:
+                    # when the stream is aborted due to reconnect, this input_generator
+                    # needs to stop consuming frames
+                    # when the generator stops, the previous gRPC stream will close
+                    if should_stop.is_set():
+                        return
+
                     if isinstance(frame, rtc.AudioFrame):
                         yield cloud_speech.StreamingRecognizeRequest(
                             audio=frame.data.tobytes()
@@ -367,6 +390,7 @@
                )
 
         async def process_stream(stream):
+            has_started = False
             async for resp in stream:
                 if (
                     resp.speech_event_type
@@ -375,6 +399,7 @@
                     self._event_ch.send_nowait(
                         stt.SpeechEvent(type=stt.SpeechEventType.START_OF_SPEECH)
                     )
+                    has_started = True
 
                 if (
                     resp.speech_event_type
@@ -399,6 +424,22 @@
                             alternatives=[speech_data],
                         )
                    )
+                    if (
+                        time.time() - self._session_connected_at
+                        > _max_session_duration
+                    ):
+                        logger.debug(
+                            "Google STT maximum connection time reached. Reconnecting..."
+                        )
+                        if has_started:
+                            self._event_ch.send_nowait(
+                                stt.SpeechEvent(
+                                    type=stt.SpeechEventType.END_OF_SPEECH
+                                )
+                            )
+                            has_started = False
+                        self._reconnect_event.set()
+                        return
 
                 if (
                     resp.speech_event_type
@@ -407,6 +448,7 @@
                     self._event_ch.send_nowait(
                         stt.SpeechEvent(type=stt.SpeechEventType.END_OF_SPEECH)
                    )
+                    has_started = False
 
         while True:
             try:
@@ -431,25 +473,40 @@
                     ),
                 )
 
+                should_stop = asyncio.Event()
                 stream = await self._client.streaming_recognize(
-                    requests=input_generator(),
+                    requests=input_generator(should_stop),
                 )
+                self._session_connected_at = time.time()
 
                 process_stream_task = asyncio.create_task(process_stream(stream))
                 wait_reconnect_task = asyncio.create_task(self._reconnect_event.wait())
+
                 try:
-                    await asyncio.wait(
+                    done, _ = await asyncio.wait(
                         [process_stream_task, wait_reconnect_task],
                         return_when=asyncio.FIRST_COMPLETED,
                     )
+                    for task in done:
+                        if task != wait_reconnect_task:
+                            task.result()
+                    if wait_reconnect_task not in done:
+                        break
+                    self._reconnect_event.clear()
                 finally:
                     await utils.aio.gracefully_cancel(
                         process_stream_task, wait_reconnect_task
                     )
-            finally:
-                if not self._reconnect_event.is_set():
-                    break
-                self._reconnect_event.clear()
+                    should_stop.set()
+            except DeadlineExceeded:
+                raise APITimeoutError()
+            except GoogleAPICallError as e:
+                raise APIStatusError(
+                    e.message,
+                    status_code=e.code or -1,
+                )
+            except Exception as e:
+                raise APIConnectionError() from e
 
 
 def _recognize_response_to_speech_event(
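
The restart logic above reduces to a reusable pattern: race the stream processor against a reconnect event, surface processor errors, and rebuild the stream when the event fires. A standalone sketch of that pattern (illustrative names, not livekit code):

    import asyncio
    import time

    _MAX_SESSION_DURATION = 240  # reconnect well before the ~5 min server cutoff

    async def run_with_reconnect(connect, process):
        """connect() opens a stream; process(stream, deadline, reconnect_event)
        sets reconnect_event and returns when the stream should be rebuilt."""
        reconnect_event = asyncio.Event()
        while True:
            stream = await connect()
            deadline = time.time() + _MAX_SESSION_DURATION
            process_task = asyncio.create_task(
                process(stream, deadline, reconnect_event)
            )
            wait_task = asyncio.create_task(reconnect_event.wait())
            try:
                done, _ = await asyncio.wait(
                    [process_task, wait_task],
                    return_when=asyncio.FIRST_COMPLETED,
                )
                for task in done:
                    if task is not wait_task:
                        task.result()  # re-raise errors from the processor
                if wait_task not in done:
                    break  # processor finished without requesting a reconnect
                reconnect_event.clear()
            finally:
                process_task.cancel()
                wait_task.cancel()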

livekit/plugins/google/version.py
@@ -12,4 +12,4 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-__version__ = "0.8.1"
+__version__ = "0.9.1"

livekit_plugins_google-0.9.1.dist-info/METADATA
@@ -1,6 +1,6 @@
-Metadata-Version: 2.1
+Metadata-Version: 2.2
 Name: livekit-plugins-google
-Version: 0.8.1
+Version: 0.9.1
 Summary: Agent Framework plugin for services from Google Cloud
 Home-page: https://github.com/livekit/agents
 License: Apache-2.0
@@ -22,7 +22,18 @@ Description-Content-Type: text/markdown
 Requires-Dist: google-auth<3,>=2
 Requires-Dist: google-cloud-speech<3,>=2
 Requires-Dist: google-cloud-texttospeech<3,>=2
-Requires-Dist: livekit-agents>=0.11
+Requires-Dist: google-genai>=0.3.0
+Requires-Dist: livekit-agents>=0.12.3
+Dynamic: classifier
+Dynamic: description
+Dynamic: description-content-type
+Dynamic: home-page
+Dynamic: keywords
+Dynamic: license
+Dynamic: project-url
+Dynamic: requires-dist
+Dynamic: requires-python
+Dynamic: summary
 
 # LiveKit Plugins Google
 
@@ -37,3 +48,8 @@ pip install livekit-plugins-google
 ## Pre-requisites
 
 For credentials, you'll need a Google Cloud account and obtain the correct credentials. Credentials can be passed directly or via Application Default Credentials as specified in [How Application Default Credentials works](https://cloud.google.com/docs/authentication/application-default-credentials).
+
+To use the STT and TTS APIs, you'll need to enable the respective services for your Google Cloud project.
+
+- Cloud Speech-to-Text API
+- Cloud Text-to-Speech API
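
As a quick illustration of the result, once both APIs are enabled and Application Default Credentials are configured, the plugin's clients can be constructed directly (a sketch, assuming the constructors' defaults suffice):

    from livekit.plugins import google

    # both clients pick up Application Default Credentials automatically;
    # a service-account key file can also be supplied explicitly
    stt = google.STT()
    tts = google.TTS()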

livekit_plugins_google-0.9.1.dist-info/RECORD (new file)
@@ -0,0 +1,15 @@
+livekit/plugins/google/__init__.py,sha256=TY-5FwEX4Vs7GLO1wSegIxC5W4UPkHBthlr-__yuE4w,1143
+livekit/plugins/google/log.py,sha256=GI3YWN5YzrafnUccljzPRS_ZALkMNk1i21IRnTl2vNA,69
+livekit/plugins/google/models.py,sha256=cBXhZGY9bFaSCyL9VeSng9wsxhf3peJi3AUYBKV-8GQ,1343
+livekit/plugins/google/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+livekit/plugins/google/stt.py,sha256=E5kXPbicH4FEXBjyBzfqQWA-nPhKkojzcc-cbtWdmNs,21088
+livekit/plugins/google/tts.py,sha256=95qXCigVQYWNbcN3pIKBpIah4b31U_MWtXv5Ji0AMc4,9229
+livekit/plugins/google/version.py,sha256=4GcbYy7J7gvPMEA4wlPB0BJqg8CjF7HRVjQ-i1EH7M8,600
+livekit/plugins/google/beta/__init__.py,sha256=AxRYc7NGG62Tv1MmcZVCDHNvlhbC86hM-_yP01Qb28k,47
+livekit/plugins/google/beta/realtime/__init__.py,sha256=XnJpNIN6NRm7Y4hH2RNA8Xt-tTmkZEKCs_zzU3_koBI,251
+livekit/plugins/google/beta/realtime/api_proto.py,sha256=IHYBryuzpfGQD86Twlfq6qxrBhFHptf_IvOk36Wxo1M,2156
+livekit/plugins/google/beta/realtime/realtime_api.py,sha256=YUEf3iR9dIctnXRqev_qKSBM_plqcYKudodFO8nADJY,15966
+livekit_plugins_google-0.9.1.dist-info/METADATA,sha256=y5d0OEdbkoGk0IPGURiDZbt6e6sWhsxOU2cioNrPu7w,2056
+livekit_plugins_google-0.9.1.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
+livekit_plugins_google-0.9.1.dist-info/top_level.txt,sha256=OoDok3xUmXbZRvOrfvvXB-Juu4DX79dlq188E19YHoo,8
+livekit_plugins_google-0.9.1.dist-info/RECORD,,

livekit_plugins_google-0.9.1.dist-info/WHEEL
@@ -1,5 +1,5 @@
 Wheel-Version: 1.0
-Generator: setuptools (75.6.0)
+Generator: setuptools (75.8.0)
 Root-Is-Purelib: true
 Tag: py3-none-any
 
@@ -1,11 +0,0 @@
1
- livekit/plugins/google/__init__.py,sha256=rqV6C5mFNDFlrA2IcGJrsebr2VxQwMzoDUjY1JhMBZM,1117
2
- livekit/plugins/google/log.py,sha256=GI3YWN5YzrafnUccljzPRS_ZALkMNk1i21IRnTl2vNA,69
3
- livekit/plugins/google/models.py,sha256=cBXhZGY9bFaSCyL9VeSng9wsxhf3peJi3AUYBKV-8GQ,1343
4
- livekit/plugins/google/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
5
- livekit/plugins/google/stt.py,sha256=tmjktdO6C2AuJWHSKl20ae3cfy_DqfN_oNYYcE552pQ,18566
6
- livekit/plugins/google/tts.py,sha256=95qXCigVQYWNbcN3pIKBpIah4b31U_MWtXv5Ji0AMc4,9229
7
- livekit/plugins/google/version.py,sha256=PoHw-_DNE2B5SpeoQ-r6HSfVmbDgYuGamg0dN2jhayQ,600
8
- livekit_plugins_google-0.8.1.dist-info/METADATA,sha256=RHRMpfHxvaWjwWStByUPghWBLY5tIuC5Lm8r9C3hEhc,1643
9
- livekit_plugins_google-0.8.1.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
10
- livekit_plugins_google-0.8.1.dist-info/top_level.txt,sha256=OoDok3xUmXbZRvOrfvvXB-Juu4DX79dlq188E19YHoo,8
11
- livekit_plugins_google-0.8.1.dist-info/RECORD,,