livekit-plugins-google 0.10.2__py3-none-any.whl → 0.10.3__py3-none-any.whl

This diff compares the contents of two publicly released versions of this package, as published to their public registry. It is provided for informational purposes only.
--- livekit/plugins/google/beta/realtime/api_proto.py
+++ livekit/plugins/google/beta/realtime/api_proto.py
@@ -6,7 +6,7 @@ from google.genai import types
 
 from ..._utils import _build_gemini_ctx, _build_tools
 
-LiveAPIModels = Literal["gemini-2.0-flash-exp"]
+LiveAPIModels = Literal["gemini-2.0-flash-001",]
 
 Voice = Literal["Puck", "Charon", "Kore", "Fenrir", "Aoede"]
 
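The realtime model alias narrows to the stable release. Since the downstream signatures are typed `LiveAPIModels | str` (see the `RealtimeModel` hunk below), unlisted model names remain usable; a minimal, self-contained sketch of that typing pattern:

```python
# Sketch of how the LiveAPIModels alias is consumed: Literal pins the known
# model names, while `LiveAPIModels | str` keeps arbitrary names valid.
from typing import Literal

LiveAPIModels = Literal["gemini-2.0-flash-001",]  # trailing comma is valid syntax

def connect(model: LiveAPIModels | str = "gemini-2.0-flash-001") -> str:
    return f"connecting with {model}"

print(connect())                        # type-checks: literal member
print(connect("gemini-2.0-flash-exp"))  # also fine: plain str
```
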

--- livekit/plugins/google/beta/realtime/realtime_api.py
+++ livekit/plugins/google/beta/realtime/realtime_api.py
@@ -37,7 +37,7 @@ from .api_proto import (
     _build_gemini_ctx,
     _build_tools,
 )
-from .transcriber import TranscriberSession, TranscriptionContent
+from .transcriber import ModelTranscriber, TranscriberSession, TranscriptionContent
 
 EventTypes = Literal[
     "start_session",
@@ -104,7 +104,7 @@ class RealtimeModel:
         self,
         *,
         instructions: str | None = None,
-        model: LiveAPIModels | str = "gemini-2.0-flash-exp",
+        model: LiveAPIModels | str = "gemini-2.0-flash-001",
         api_key: str | None = None,
         voice: Voice | str = "Puck",
         modalities: list[Modality] = ["AUDIO"],
@@ -136,7 +136,7 @@ class RealtimeModel:
             instructions (str, optional): Initial system instructions for the model. Defaults to "".
             api_key (str or None, optional): Google Gemini API key. If None, will attempt to read from the environment variable GOOGLE_API_KEY.
             modalities (list[Modality], optional): Modalities to use, such as ["TEXT", "AUDIO"]. Defaults to ["AUDIO"].
-            model (str or None, optional): The name of the model to use. Defaults to "gemini-2.0-flash-exp".
+            model (str or None, optional): The name of the model to use. Defaults to "gemini-2.0-flash-001".
             voice (api_proto.Voice, optional): Voice setting for audio outputs. Defaults to "Puck".
             enable_user_audio_transcription (bool, optional): Whether to enable user audio transcription. Defaults to True
             enable_agent_audio_transcription (bool, optional): Whether to enable agent audio transcription. Defaults to True
@@ -301,7 +301,7 @@ class GeminiRealtimeSession(utils.EventEmitter[EventTypes]):
             )
             self._transcriber.on("input_speech_done", self._on_input_speech_done)
         if self._opts.enable_agent_audio_transcription:
-            self._agent_transcriber = TranscriberSession(
+            self._agent_transcriber = ModelTranscriber(
                 client=self._client, model=self._opts.model
             )
             self._agent_transcriber.on("input_speech_done", self._on_agent_speech_done)
@@ -382,7 +382,7 @@ class GeminiRealtimeSession(utils.EventEmitter[EventTypes]):
         # TODO: implement sync mechanism to make sure the transcribed user speech is inside the chat_ctx and always before the generated agent speech
 
     def _on_agent_speech_done(self, content: TranscriptionContent) -> None:
-        if not self._is_interrupted and content.response_id and content.text:
+        if content.response_id and content.text:
             self.emit(
                 "agent_speech_transcription_completed",
                 InputTranscription(
@@ -439,10 +439,12 @@ class GeminiRealtimeSession(utils.EventEmitter[EventTypes]):
                         // 2,
                     )
                     if self._opts.enable_agent_audio_transcription:
-                        self._agent_transcriber._push_audio(frame)
+                        content.audio.append(frame)
                     content.audio_stream.send_nowait(frame)
 
             if server_content.interrupted or server_content.turn_complete:
+                if self._opts.enable_agent_audio_transcription:
+                    self._agent_transcriber._push_audio(content.audio)
                 for stream in (content.text_stream, content.audio_stream):
                     if isinstance(stream, utils.aio.Chan):
                         stream.close()
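
This file's changes swap the agent-audio path from a persistent `TranscriberSession` to the new `ModelTranscriber`: each `rtc.AudioFrame` is buffered on `content.audio` and the whole turn is flushed when the server reports `interrupted` or `turn_complete`. A minimal sketch of the pattern, with stand-in types:

```python
# Accumulate-then-flush in miniature (stand-in types; the plugin buffers
# rtc.AudioFrame objects on content.audio rather than strings).
from typing import Callable

class TurnAudioBuffer:
    def __init__(self) -> None:
        self.audio: list[str] = []

    def on_frame(self, frame: str) -> None:
        # called per audio chunk while the agent is speaking
        self.audio.append(frame)

    def on_turn_end(self, push_audio: Callable[[list[str]], None]) -> None:
        # on interruption or turn completion, hand over the whole turn at once
        push_audio(self.audio)
        self.audio = []

buf = TurnAudioBuffer()
for chunk in ("chunk-1", "chunk-2", "chunk-3"):
    buf.on_frame(chunk)
buf.on_turn_end(lambda frames: print(f"transcribing {len(frames)} chunks"))
```

Flushing interrupted turns as well lines up with dropping the `_is_interrupted` guard in `_on_agent_speech_done`.
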

--- livekit/plugins/google/beta/realtime/transcriber.py
+++ livekit/plugins/google/beta/realtime/transcriber.py
@@ -7,24 +7,21 @@ from typing import Literal
 
 import websockets
 from livekit import rtc
-from livekit.agents import utils
+from livekit.agents import APIConnectionError, APIStatusError, utils
 
 from google import genai
 from google.genai import types
+from google.genai.errors import APIError, ClientError, ServerError
 
 from ...log import logger
 from .api_proto import ClientEvents, LiveAPIModels
 
-EventTypes = Literal[
-    "input_speech_started",
-    "input_speech_done",
-]
+EventTypes = Literal["input_speech_started", "input_speech_done"]
 
 DEFAULT_LANGUAGE = "English"
 
 SYSTEM_INSTRUCTIONS = f"""
 You are an **Audio Transcriber**. Your task is to convert audio content into accurate and precise text.
-
 - Transcribe verbatim; exclude non-speech sounds.
 - Provide only transcription; no extra text or explanations.
 - If audio is unclear, respond with: `...`
@@ -32,7 +29,6 @@ You are an **Audio Transcriber**. Your task is to convert audio content into acc
 - Use proper punctuation and formatting.
 - Do not add explanations, comments, or extra information.
 - Do not include timestamps, speaker labels, or annotations unless specified.
-
 - Audio Language: {DEFAULT_LANGUAGE}
 """
 
@@ -44,30 +40,24 @@ class TranscriptionContent:
 
 
 class TranscriberSession(utils.EventEmitter[EventTypes]):
-    def __init__(
-        self,
-        *,
-        client: genai.Client,
-        model: LiveAPIModels | str,
-    ):
-        """
-        Initializes a TranscriberSession instance for interacting with Google's Realtime API.
-        """
+    """
+    Handles live audio transcription using the realtime API.
+    """
+
+    def __init__(self, *, client: genai.Client, model: LiveAPIModels | str):
         super().__init__()
         self._client = client
         self._model = model
         self._needed_sr = 16000
         self._closed = False
+
         system_instructions = types.Content(
             parts=[types.Part(text=SYSTEM_INSTRUCTIONS)]
         )
-
         self._config = types.LiveConnectConfig(
             response_modalities=["TEXT"],
             system_instruction=system_instructions,
-            generation_config=types.GenerationConfig(
-                temperature=0.0,
-            ),
+            generation_config=types.GenerationConfig(temperature=0.0),
         )
         self._main_atask = asyncio.create_task(
             self._main_task(), name="gemini-realtime-transcriber"
@@ -187,6 +177,93 @@ class TranscriberSession(utils.EventEmitter[EventTypes]):
         await self._session.close()
 
 
+class ModelTranscriber(utils.EventEmitter[EventTypes]):
+    """
+    Transcribes agent audio using model generation.
+    """
+
+    def __init__(self, *, client: genai.Client, model: LiveAPIModels | str):
+        super().__init__()
+        self._client = client
+        self._model = model
+        self._needed_sr = 16000
+        self._system_instructions = types.Content(
+            parts=[types.Part(text=SYSTEM_INSTRUCTIONS)]
+        )
+        self._config = types.GenerateContentConfig(
+            temperature=0.0,
+            system_instruction=self._system_instructions,
+            # TODO: add response_schem
+        )
+        self._resampler: rtc.AudioResampler | None = None
+        self._buffer: rtc.AudioFrame | None = None
+        self._audio_ch = utils.aio.Chan[rtc.AudioFrame]()
+        self._main_atask = asyncio.create_task(
+            self._main_task(), name="gemini-model-transcriber"
+        )
+
+    async def aclose(self) -> None:
+        if self._audio_ch.closed:
+            return
+        self._audio_ch.close()
+        await self._main_atask
+
+    def _push_audio(self, frames: list[rtc.AudioFrame]) -> None:
+        if not frames:
+            return
+
+        buffer = utils.merge_frames(frames)
+
+        if buffer.sample_rate != self._needed_sr:
+            if self._resampler is None:
+                self._resampler = rtc.AudioResampler(
+                    input_rate=buffer.sample_rate,
+                    output_rate=self._needed_sr,
+                    quality=rtc.AudioResamplerQuality.HIGH,
+                )
+
+            buffer = utils.merge_frames(self._resampler.push(buffer))
+
+        self._audio_ch.send_nowait(buffer)
+
+    @utils.log_exceptions(logger=logger)
+    async def _main_task(self):
+        request_id = utils.shortuuid()
+        try:
+            async for buffer in self._audio_ch:
+                # TODO: stream content for better latency
+                response = await self._client.aio.models.generate_content(
+                    model=self._model,
+                    contents=[
+                        types.Content(
+                            parts=[
+                                types.Part(text=SYSTEM_INSTRUCTIONS),
+                                types.Part.from_bytes(
+                                    data=buffer.to_wav_bytes(),
+                                    mime_type="audio/wav",
+                                ),
+                            ],
+                            role="user",
+                        )
+                    ],
+                    config=self._config,
+                )
+                content = TranscriptionContent(
+                    response_id=request_id, text=clean_transcription(response.text)
+                )
+                self.emit("input_speech_done", content)
+
+        except (ClientError, ServerError, APIError) as e:
+            raise APIStatusError(
+                f"model transcriber error: {e}",
+                status_code=e.code,
+                body=e.message,
+                request_id=request_id,
+            ) from e
+        except Exception as e:
+            raise APIConnectionError("Error generating transcription") from e
+
+
 def clean_transcription(text: str) -> str:
     text = text.replace("\n", " ")
     text = re.sub(r"\s+", " ", text)
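
The new `ModelTranscriber` replaces the websocket session with one `generate_content` request per turn: buffered frames are merged, resampled to 16 kHz when needed, and sent as WAV bytes alongside the transcription instructions. A hypothetical usage sketch (`_push_audio` is internal plugin API, the sleep is a crude stand-in for real lifecycle handling, and `genai.Client()` is assumed to pick up `GOOGLE_API_KEY` from the environment):

```python
import asyncio

from google import genai
from livekit import rtc
from livekit.plugins.google.beta.realtime.transcriber import ModelTranscriber

async def main() -> None:
    client = genai.Client()  # assumes GOOGLE_API_KEY is set in the environment
    transcriber = ModelTranscriber(client=client, model="gemini-2.0-flash-001")
    transcriber.on("input_speech_done", lambda c: print(c.response_id, c.text))

    # One second of 16 kHz mono silence as a stand-in for buffered agent audio
    silence = rtc.AudioFrame(
        data=bytes(2 * 16000),  # 16-bit PCM, 2 bytes per sample
        sample_rate=16000,
        num_channels=1,
        samples_per_channel=16000,
    )
    transcriber._push_audio([silence])  # internal API, called by the session
    await asyncio.sleep(5)  # crude wait for the generate_content round trip
    await transcriber.aclose()

asyncio.run(main())
```

Note that `request_id` is generated once per `_main_task`, so every turn transcribed by one instance reports the same `response_id`.
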

--- livekit/plugins/google/llm.py
+++ livekit/plugins/google/llm.py
@@ -27,7 +27,7 @@ from livekit.agents import (
     llm,
     utils,
 )
-from livekit.agents.llm import ToolChoice, _create_ai_function_info
+from livekit.agents.llm import LLMCapabilities, ToolChoice, _create_ai_function_info
 from livekit.agents.types import DEFAULT_API_CONNECT_OPTIONS, APIConnectOptions
 
 from google import genai
@@ -60,7 +60,7 @@ class LLM(llm.LLM):
     def __init__(
         self,
         *,
-        model: ChatModels | str = "gemini-2.0-flash-exp",
+        model: ChatModels | str = "gemini-2.0-flash-001",
         api_key: str | None = None,
         vertexai: bool = False,
         project: str | None = None,
@@ -85,7 +85,7 @@ class LLM(llm.LLM):
         - For Google Gemini API: Set the `api_key` argument or the `GOOGLE_API_KEY` environment variable.
 
         Args:
-            model (ChatModels | str, optional): The model name to use. Defaults to "gemini-2.0-flash-exp".
+            model (ChatModels | str, optional): The model name to use. Defaults to "gemini-2.0-flash-001".
             api_key (str, optional): The API key for Google Gemini. If not provided, it attempts to read from the `GOOGLE_API_KEY` environment variable.
             vertexai (bool, optional): Whether to use VertexAI. Defaults to False.
             project (str, optional): The Google Cloud project to use (only for VertexAI). Defaults to None.
@@ -99,8 +99,12 @@ class LLM(llm.LLM):
             frequency_penalty (float, optional): Penalizes the model for repeating words. Defaults to None.
             tool_choice (ToolChoice or Literal["auto", "required", "none"], optional): Specifies whether to use tools during response generation. Defaults to "auto".
         """
-        super().__init__()
-        self._capabilities = llm.LLMCapabilities(supports_choices_on_int=False)
+        super().__init__(
+            capabilities=LLMCapabilities(
+                supports_choices_on_int=False,
+                requires_persistent_functions=False,
+            )
+        )
         self._project_id = project or os.environ.get("GOOGLE_CLOUD_PROJECT", None)
         self._location = location or os.environ.get(
             "GOOGLE_CLOUD_LOCATION", "us-central1"

--- livekit/plugins/google/models.py
+++ livekit/plugins/google/models.py
@@ -94,4 +94,9 @@ Gender = Literal["male", "female", "neutral"]
 
 AudioEncoding = Literal["wav", "mp3", "ogg", "mulaw", "alaw", "linear16"]
 
-ChatModels = Literal["gemini-2.0-flash-exp", "gemini-1.5-pro"]
+ChatModels = Literal[
+    "gemini-2.0-flash-001",
+    "gemini-2.0-flash-lite-preview-02-05",
+    "gemini-2.0-pro-exp-02-05",
+    "gemini-1.5-pro",
+]
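
The chat-model literal drops the retired experimental name and adds the stable flash release plus two February previews. As with the realtime alias, `LLM(model=...)` is typed `ChatModels | str`, so unlisted names still pass; a hypothetical construction sketch (assuming the plugin re-exports `LLM` at its top level and credentials are configured):

```python
# Hypothetical sketch: any of the new literal members, or a raw string,
# can be passed to the plugin's LLM wrapper. Both constructions assume
# GOOGLE_API_KEY (or Vertex credentials) are available.
from livekit.plugins.google import LLM

llm = LLM(model="gemini-2.0-pro-exp-02-05")  # new literal member
legacy = LLM(model="gemini-2.0-flash-exp")   # plain str still accepted
```
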

--- livekit/plugins/google/stt.py
+++ livekit/plugins/google/stt.py
@@ -139,23 +139,26 @@ class STT(stt.STT):
         self._streams = weakref.WeakSet[SpeechStream]()
 
     def _ensure_client(self) -> SpeechAsyncClient:
+        # Add support for passing a specific location that matches recognizer
+        # see: https://cloud.google.com/speech-to-text/v2/docs/speech-to-text-supported-languages
+        client_options = None
+        if self._location != "global":
+            client_options = ClientOptions(
+                api_endpoint=f"{self._location}-speech.googleapis.com"
+            )
         if self._credentials_info:
             self._client = SpeechAsyncClient.from_service_account_info(
-                self._credentials_info
+                self._credentials_info,
+                client_options=client_options,
             )
         elif self._credentials_file:
             self._client = SpeechAsyncClient.from_service_account_file(
-                self._credentials_file
+                self._credentials_file,
+                client_options=client_options,
             )
-        elif self._location == "global":
-            self._client = SpeechAsyncClient()
         else:
-            # Add support for passing a specific location that matches recognizer
-            # see: https://cloud.google.com/speech-to-text/v2/docs/speech-to-text-supported-languages
             self._client = SpeechAsyncClient(
-                client_options=ClientOptions(
-                    api_endpoint=f"{self._location}-speech.googleapis.com"
-                )
+                client_options=client_options,
             )
         assert self._client is not None
         return self._client
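
The regional-endpoint logic is hoisted so that every construction path, including the service-account ones that previously ignored it, honors a non-`"global"` location. A sketch of the rule this hunk centralizes:

```python
# Sketch of the endpoint rule now shared by all three construction paths:
# "global" keeps the default endpoint, anything else gets a regional host.
from google.api_core.client_options import ClientOptions

def speech_client_options(location: str) -> ClientOptions | None:
    if location == "global":
        return None  # default: speech.googleapis.com
    return ClientOptions(api_endpoint=f"{location}-speech.googleapis.com")

opts = speech_client_options("us-central1")
assert opts is not None and opts.api_endpoint == "us-central1-speech.googleapis.com"
assert speech_client_options("global") is None
```
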

--- livekit/plugins/google/version.py
+++ livekit/plugins/google/version.py
@@ -12,4 +12,4 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-__version__ = "0.10.2"
+__version__ = "0.10.3"

--- livekit_plugins_google-0.10.2.dist-info/METADATA
+++ livekit_plugins_google-0.10.3.dist-info/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: livekit-plugins-google
-Version: 0.10.2
+Version: 0.10.3
 Summary: Agent Framework plugin for services from Google Cloud
 Home-page: https://github.com/livekit/agents
 License: Apache-2.0

--- /dev/null
+++ livekit_plugins_google-0.10.3.dist-info/RECORD
@@ -0,0 +1,18 @@
+livekit/plugins/google/__init__.py,sha256=e_kSlFNmKhyyeliz7f4WOKc_Y0-y39QjO5nCWuguhss,1171
+livekit/plugins/google/_utils.py,sha256=mjsqblhGMgAZ2MNPisAVkNsqq4gfO6vvprEKzAGoVwE,7248
+livekit/plugins/google/llm.py,sha256=TVTerAabIf10AKVZr-Kn13eajhQ9RV7K4xaVD771yHU,16547
+livekit/plugins/google/log.py,sha256=GI3YWN5YzrafnUccljzPRS_ZALkMNk1i21IRnTl2vNA,69
+livekit/plugins/google/models.py,sha256=Q47z_tIwLCufxhJyJHH7_1bo4xdBYZBSkkvMeycuItg,1493
+livekit/plugins/google/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+livekit/plugins/google/stt.py,sha256=zl5B8MroarvoBbOmSK5YzC1d3GJeltkpv4Y0n2XLoVE,21203
+livekit/plugins/google/tts.py,sha256=95qXCigVQYWNbcN3pIKBpIah4b31U_MWtXv5Ji0AMc4,9229
+livekit/plugins/google/version.py,sha256=k8ij2VzlolcsqiNUU1AriNVHljCjUQz0tYetVwc1gH0,601
+livekit/plugins/google/beta/__init__.py,sha256=AxRYc7NGG62Tv1MmcZVCDHNvlhbC86hM-_yP01Qb28k,47
+livekit/plugins/google/beta/realtime/__init__.py,sha256=sGTn6JFNyA30QUXBZ_BV3l2eHpGAzR35ByXxg77vWNU,205
+livekit/plugins/google/beta/realtime/api_proto.py,sha256=ralrRZqIbE71oyuLKRYaXHvm6tcHMwBJueKvSO8Xfus,658
+livekit/plugins/google/beta/realtime/realtime_api.py,sha256=3k2yJ-avbkyDBH3MKlCuBi0xiho003LHxCiYCsCXpg4,21281
+livekit/plugins/google/beta/realtime/transcriber.py,sha256=ZpKA3F8dqOtJPDlPiAgjw0AUDBIuhQiBVnvSYL4cdBg,9796
+livekit_plugins_google-0.10.3.dist-info/METADATA,sha256=kWXttBYbuIpMxR3KwJMchDcNn7OASsguQ_Sctm0t0Lw,2058
+livekit_plugins_google-0.10.3.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
+livekit_plugins_google-0.10.3.dist-info/top_level.txt,sha256=OoDok3xUmXbZRvOrfvvXB-Juu4DX79dlq188E19YHoo,8
+livekit_plugins_google-0.10.3.dist-info/RECORD,,

--- livekit_plugins_google-0.10.2.dist-info/RECORD
+++ /dev/null
@@ -1,18 +0,0 @@
-livekit/plugins/google/__init__.py,sha256=e_kSlFNmKhyyeliz7f4WOKc_Y0-y39QjO5nCWuguhss,1171
-livekit/plugins/google/_utils.py,sha256=mjsqblhGMgAZ2MNPisAVkNsqq4gfO6vvprEKzAGoVwE,7248
-livekit/plugins/google/llm.py,sha256=o9EJBv3rS5vKRq7m5YjSSqOxtH6pPekxRS_lra35hzk,16445
-livekit/plugins/google/log.py,sha256=GI3YWN5YzrafnUccljzPRS_ZALkMNk1i21IRnTl2vNA,69
-livekit/plugins/google/models.py,sha256=w_qmOk5y86vjtszDiGpP9p0ctjQeaB8-UzqprxgpvCY,1407
-livekit/plugins/google/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-livekit/plugins/google/stt.py,sha256=FA6Lpeb8QvRXLzkQ7cjsoMxHdtEGwHWkpN_TKqAdKAQ,21097
-livekit/plugins/google/tts.py,sha256=95qXCigVQYWNbcN3pIKBpIah4b31U_MWtXv5Ji0AMc4,9229
-livekit/plugins/google/version.py,sha256=jklx55q_NtxoIUiYD5AFOO11S_Jij8P491Y8nkw-VZk,601
-livekit/plugins/google/beta/__init__.py,sha256=AxRYc7NGG62Tv1MmcZVCDHNvlhbC86hM-_yP01Qb28k,47
-livekit/plugins/google/beta/realtime/__init__.py,sha256=sGTn6JFNyA30QUXBZ_BV3l2eHpGAzR35ByXxg77vWNU,205
-livekit/plugins/google/beta/realtime/api_proto.py,sha256=9EhmwgeIgKDqdSijv5Q9pgx7UhAakK02ZDwbnUsra_o,657
-livekit/plugins/google/beta/realtime/realtime_api.py,sha256=OwNoPmmomMtRkmYw-g2u7hIYpeIrSSNky7FlcHBVyFQ,21150
-livekit/plugins/google/beta/realtime/transcriber.py,sha256=JnZ75NyiOLkpvQ5N2nDniumDKcrjiq_tlryiLbuBoDM,6658
-livekit_plugins_google-0.10.2.dist-info/METADATA,sha256=dTBdAuYpGyCFVJNw0c8upUEdaFgdodWwrm1bB3a4Xp4,2058
-livekit_plugins_google-0.10.2.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
-livekit_plugins_google-0.10.2.dist-info/top_level.txt,sha256=OoDok3xUmXbZRvOrfvvXB-Juu4DX79dlq188E19YHoo,8
-livekit_plugins_google-0.10.2.dist-info/RECORD,,