livekit-plugins-google 0.10.2__py3-none-any.whl → 0.10.4__py3-none-any.whl
This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
- livekit/plugins/google/beta/realtime/api_proto.py +1 -1
- livekit/plugins/google/beta/realtime/realtime_api.py +6 -4
- livekit/plugins/google/beta/realtime/transcriber.py +97 -20
- livekit/plugins/google/llm.py +9 -5
- livekit/plugins/google/models.py +6 -1
- livekit/plugins/google/stt.py +12 -9
- livekit/plugins/google/version.py +1 -1
- {livekit_plugins_google-0.10.2.dist-info → livekit_plugins_google-0.10.4.dist-info}/METADATA +1 -1
- livekit_plugins_google-0.10.4.dist-info/RECORD +18 -0
- livekit_plugins_google-0.10.2.dist-info/RECORD +0 -18
- {livekit_plugins_google-0.10.2.dist-info → livekit_plugins_google-0.10.4.dist-info}/WHEEL +0 -0
- {livekit_plugins_google-0.10.2.dist-info → livekit_plugins_google-0.10.4.dist-info}/top_level.txt +0 -0
@@ -37,7 +37,7 @@ from .api_proto import (
|
|
37
37
|
_build_gemini_ctx,
|
38
38
|
_build_tools,
|
39
39
|
)
|
40
|
-
from .transcriber import TranscriberSession, TranscriptionContent
|
40
|
+
from .transcriber import ModelTranscriber, TranscriberSession, TranscriptionContent
|
41
41
|
|
42
42
|
EventTypes = Literal[
|
43
43
|
"start_session",
|
@@ -301,7 +301,7 @@ class GeminiRealtimeSession(utils.EventEmitter[EventTypes]):
|
|
301
301
|
)
|
302
302
|
self._transcriber.on("input_speech_done", self._on_input_speech_done)
|
303
303
|
if self._opts.enable_agent_audio_transcription:
|
304
|
-
self._agent_transcriber =
|
304
|
+
self._agent_transcriber = ModelTranscriber(
|
305
305
|
client=self._client, model=self._opts.model
|
306
306
|
)
|
307
307
|
self._agent_transcriber.on("input_speech_done", self._on_agent_speech_done)
|
@@ -382,7 +382,7 @@ class GeminiRealtimeSession(utils.EventEmitter[EventTypes]):
|
|
382
382
|
# TODO: implement sync mechanism to make sure the transcribed user speech is inside the chat_ctx and always before the generated agent speech
|
383
383
|
|
384
384
|
def _on_agent_speech_done(self, content: TranscriptionContent) -> None:
|
385
|
-
if
|
385
|
+
if content.response_id and content.text:
|
386
386
|
self.emit(
|
387
387
|
"agent_speech_transcription_completed",
|
388
388
|
InputTranscription(
|
@@ -439,10 +439,12 @@ class GeminiRealtimeSession(utils.EventEmitter[EventTypes]):
|
|
439
439
|
// 2,
|
440
440
|
)
|
441
441
|
if self._opts.enable_agent_audio_transcription:
|
442
|
-
|
442
|
+
content.audio.append(frame)
|
443
443
|
content.audio_stream.send_nowait(frame)
|
444
444
|
|
445
445
|
if server_content.interrupted or server_content.turn_complete:
|
446
|
+
if self._opts.enable_agent_audio_transcription:
|
447
|
+
self._agent_transcriber._push_audio(content.audio)
|
446
448
|
for stream in (content.text_stream, content.audio_stream):
|
447
449
|
if isinstance(stream, utils.aio.Chan):
|
448
450
|
stream.close()
|
@@ -7,24 +7,21 @@ from typing import Literal
|
|
7
7
|
|
8
8
|
import websockets
|
9
9
|
from livekit import rtc
|
10
|
-
from livekit.agents import utils
|
10
|
+
from livekit.agents import APIConnectionError, APIStatusError, utils
|
11
11
|
|
12
12
|
from google import genai
|
13
13
|
from google.genai import types
|
14
|
+
from google.genai.errors import APIError, ClientError, ServerError
|
14
15
|
|
15
16
|
from ...log import logger
|
16
17
|
from .api_proto import ClientEvents, LiveAPIModels
|
17
18
|
|
18
|
-
EventTypes = Literal[
|
19
|
-
"input_speech_started",
|
20
|
-
"input_speech_done",
|
21
|
-
]
|
19
|
+
EventTypes = Literal["input_speech_started", "input_speech_done"]
|
22
20
|
|
23
21
|
DEFAULT_LANGUAGE = "English"
|
24
22
|
|
25
23
|
SYSTEM_INSTRUCTIONS = f"""
|
26
24
|
You are an **Audio Transcriber**. Your task is to convert audio content into accurate and precise text.
|
27
|
-
|
28
25
|
- Transcribe verbatim; exclude non-speech sounds.
|
29
26
|
- Provide only transcription; no extra text or explanations.
|
30
27
|
- If audio is unclear, respond with: `...`
|
@@ -32,7 +29,6 @@ You are an **Audio Transcriber**. Your task is to convert audio content into acc
|
|
32
29
|
- Use proper punctuation and formatting.
|
33
30
|
- Do not add explanations, comments, or extra information.
|
34
31
|
- Do not include timestamps, speaker labels, or annotations unless specified.
|
35
|
-
|
36
32
|
- Audio Language: {DEFAULT_LANGUAGE}
|
37
33
|
"""
|
38
34
|
|
@@ -44,30 +40,24 @@ class TranscriptionContent:
|
|
44
40
|
|
45
41
|
|
46
42
|
class TranscriberSession(utils.EventEmitter[EventTypes]):
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
):
|
53
|
-
"""
|
54
|
-
Initializes a TranscriberSession instance for interacting with Google's Realtime API.
|
55
|
-
"""
|
43
|
+
"""
|
44
|
+
Handles live audio transcription using the realtime API.
|
45
|
+
"""
|
46
|
+
|
47
|
+
def __init__(self, *, client: genai.Client, model: LiveAPIModels | str):
|
56
48
|
super().__init__()
|
57
49
|
self._client = client
|
58
50
|
self._model = model
|
59
51
|
self._needed_sr = 16000
|
60
52
|
self._closed = False
|
53
|
+
|
61
54
|
system_instructions = types.Content(
|
62
55
|
parts=[types.Part(text=SYSTEM_INSTRUCTIONS)]
|
63
56
|
)
|
64
|
-
|
65
57
|
self._config = types.LiveConnectConfig(
|
66
58
|
response_modalities=["TEXT"],
|
67
59
|
system_instruction=system_instructions,
|
68
|
-
generation_config=types.GenerationConfig(
|
69
|
-
temperature=0.0,
|
70
|
-
),
|
60
|
+
generation_config=types.GenerationConfig(temperature=0.0),
|
71
61
|
)
|
72
62
|
self._main_atask = asyncio.create_task(
|
73
63
|
self._main_task(), name="gemini-realtime-transcriber"
|
@@ -187,6 +177,93 @@ class TranscriberSession(utils.EventEmitter[EventTypes]):
|
|
187
177
|
await self._session.close()
|
188
178
|
|
189
179
|
|
180
|
+
class ModelTranscriber(utils.EventEmitter[EventTypes]):
|
181
|
+
"""
|
182
|
+
Transcribes agent audio using model generation.
|
183
|
+
"""
|
184
|
+
|
185
|
+
def __init__(self, *, client: genai.Client, model: LiveAPIModels | str):
|
186
|
+
super().__init__()
|
187
|
+
self._client = client
|
188
|
+
self._model = model
|
189
|
+
self._needed_sr = 16000
|
190
|
+
self._system_instructions = types.Content(
|
191
|
+
parts=[types.Part(text=SYSTEM_INSTRUCTIONS)]
|
192
|
+
)
|
193
|
+
self._config = types.GenerateContentConfig(
|
194
|
+
temperature=0.0,
|
195
|
+
system_instruction=self._system_instructions,
|
196
|
+
# TODO: add response_schema
|
197
|
+
)
|
198
|
+
self._resampler: rtc.AudioResampler | None = None
|
199
|
+
self._buffer: rtc.AudioFrame | None = None
|
200
|
+
self._audio_ch = utils.aio.Chan[rtc.AudioFrame]()
|
201
|
+
self._main_atask = asyncio.create_task(
|
202
|
+
self._main_task(), name="gemini-model-transcriber"
|
203
|
+
)
|
204
|
+
|
205
|
+
async def aclose(self) -> None:
|
206
|
+
if self._audio_ch.closed:
|
207
|
+
return
|
208
|
+
self._audio_ch.close()
|
209
|
+
await self._main_atask
|
210
|
+
|
211
|
+
def _push_audio(self, frames: list[rtc.AudioFrame]) -> None:
|
212
|
+
if not frames:
|
213
|
+
return
|
214
|
+
|
215
|
+
buffer = utils.merge_frames(frames)
|
216
|
+
|
217
|
+
if buffer.sample_rate != self._needed_sr:
|
218
|
+
if self._resampler is None:
|
219
|
+
self._resampler = rtc.AudioResampler(
|
220
|
+
input_rate=buffer.sample_rate,
|
221
|
+
output_rate=self._needed_sr,
|
222
|
+
quality=rtc.AudioResamplerQuality.HIGH,
|
223
|
+
)
|
224
|
+
|
225
|
+
buffer = utils.merge_frames(self._resampler.push(buffer))
|
226
|
+
|
227
|
+
self._audio_ch.send_nowait(buffer)
|
228
|
+
|
229
|
+
@utils.log_exceptions(logger=logger)
|
230
|
+
async def _main_task(self):
|
231
|
+
request_id = utils.shortuuid()
|
232
|
+
try:
|
233
|
+
async for buffer in self._audio_ch:
|
234
|
+
# TODO: stream content for better latency
|
235
|
+
response = await self._client.aio.models.generate_content(
|
236
|
+
model=self._model,
|
237
|
+
contents=[
|
238
|
+
types.Content(
|
239
|
+
parts=[
|
240
|
+
types.Part(text=SYSTEM_INSTRUCTIONS),
|
241
|
+
types.Part.from_bytes(
|
242
|
+
data=buffer.to_wav_bytes(),
|
243
|
+
mime_type="audio/wav",
|
244
|
+
),
|
245
|
+
],
|
246
|
+
role="user",
|
247
|
+
)
|
248
|
+
],
|
249
|
+
config=self._config,
|
250
|
+
)
|
251
|
+
content = TranscriptionContent(
|
252
|
+
response_id=request_id, text=clean_transcription(response.text)
|
253
|
+
)
|
254
|
+
self.emit("input_speech_done", content)
|
255
|
+
|
256
|
+
except (ClientError, ServerError, APIError) as e:
|
257
|
+
raise APIStatusError(
|
258
|
+
f"model transcriber error: {e}",
|
259
|
+
status_code=e.code,
|
260
|
+
body=e.message,
|
261
|
+
request_id=request_id,
|
262
|
+
) from e
|
263
|
+
except Exception as e:
|
264
|
+
raise APIConnectionError("Error generating transcription") from e
|
265
|
+
|
266
|
+
|
190
267
|
def clean_transcription(text: str) -> str:
|
191
268
|
text = text.replace("\n", " ")
|
192
269
|
text = re.sub(r"\s+", " ", text)
|
livekit/plugins/google/llm.py
CHANGED
@@ -27,7 +27,7 @@ from livekit.agents import (
|
|
27
27
|
llm,
|
28
28
|
utils,
|
29
29
|
)
|
30
|
-
from livekit.agents.llm import ToolChoice, _create_ai_function_info
|
30
|
+
from livekit.agents.llm import LLMCapabilities, ToolChoice, _create_ai_function_info
|
31
31
|
from livekit.agents.types import DEFAULT_API_CONNECT_OPTIONS, APIConnectOptions
|
32
32
|
|
33
33
|
from google import genai
|
@@ -60,7 +60,7 @@ class LLM(llm.LLM):
|
|
60
60
|
def __init__(
|
61
61
|
self,
|
62
62
|
*,
|
63
|
-
model: ChatModels | str = "gemini-2.0-flash-
|
63
|
+
model: ChatModels | str = "gemini-2.0-flash-001",
|
64
64
|
api_key: str | None = None,
|
65
65
|
vertexai: bool = False,
|
66
66
|
project: str | None = None,
|
@@ -85,7 +85,7 @@ class LLM(llm.LLM):
|
|
85
85
|
- For Google Gemini API: Set the `api_key` argument or the `GOOGLE_API_KEY` environment variable.
|
86
86
|
|
87
87
|
Args:
|
88
|
-
model (ChatModels | str, optional): The model name to use. Defaults to "gemini-2.0-flash-
|
88
|
+
model (ChatModels | str, optional): The model name to use. Defaults to "gemini-2.0-flash-001".
|
89
89
|
api_key (str, optional): The API key for Google Gemini. If not provided, it attempts to read from the `GOOGLE_API_KEY` environment variable.
|
90
90
|
vertexai (bool, optional): Whether to use VertexAI. Defaults to False.
|
91
91
|
project (str, optional): The Google Cloud project to use (only for VertexAI). Defaults to None.
|
@@ -99,8 +99,12 @@ class LLM(llm.LLM):
|
|
99
99
|
frequency_penalty (float, optional): Penalizes the model for repeating words. Defaults to None.
|
100
100
|
tool_choice (ToolChoice or Literal["auto", "required", "none"], optional): Specifies whether to use tools during response generation. Defaults to "auto".
|
101
101
|
"""
|
102
|
-
super().__init__(
|
103
|
-
|
102
|
+
super().__init__(
|
103
|
+
capabilities=LLMCapabilities(
|
104
|
+
supports_choices_on_int=False,
|
105
|
+
requires_persistent_functions=False,
|
106
|
+
)
|
107
|
+
)
|
104
108
|
self._project_id = project or os.environ.get("GOOGLE_CLOUD_PROJECT", None)
|
105
109
|
self._location = location or os.environ.get(
|
106
110
|
"GOOGLE_CLOUD_LOCATION", "us-central1"
|
livekit/plugins/google/models.py
CHANGED
@@ -94,4 +94,9 @@ Gender = Literal["male", "female", "neutral"]
|
|
94
94
|
|
95
95
|
AudioEncoding = Literal["wav", "mp3", "ogg", "mulaw", "alaw", "linear16"]
|
96
96
|
|
97
|
-
ChatModels = Literal[
|
97
|
+
ChatModels = Literal[
|
98
|
+
"gemini-2.0-flash-001",
|
99
|
+
"gemini-2.0-flash-lite-preview-02-05",
|
100
|
+
"gemini-2.0-pro-exp-02-05",
|
101
|
+
"gemini-1.5-pro",
|
102
|
+
]
|
livekit/plugins/google/stt.py
CHANGED
@@ -139,23 +139,26 @@ class STT(stt.STT):
|
|
139
139
|
self._streams = weakref.WeakSet[SpeechStream]()
|
140
140
|
|
141
141
|
def _ensure_client(self) -> SpeechAsyncClient:
|
142
|
+
# Add support for passing a specific location that matches recognizer
|
143
|
+
# see: https://cloud.google.com/speech-to-text/v2/docs/speech-to-text-supported-languages
|
144
|
+
client_options = None
|
145
|
+
if self._location != "global":
|
146
|
+
client_options = ClientOptions(
|
147
|
+
api_endpoint=f"{self._location}-speech.googleapis.com"
|
148
|
+
)
|
142
149
|
if self._credentials_info:
|
143
150
|
self._client = SpeechAsyncClient.from_service_account_info(
|
144
|
-
self._credentials_info
|
151
|
+
self._credentials_info,
|
152
|
+
client_options=client_options,
|
145
153
|
)
|
146
154
|
elif self._credentials_file:
|
147
155
|
self._client = SpeechAsyncClient.from_service_account_file(
|
148
|
-
self._credentials_file
|
156
|
+
self._credentials_file,
|
157
|
+
client_options=client_options,
|
149
158
|
)
|
150
|
-
elif self._location == "global":
|
151
|
-
self._client = SpeechAsyncClient()
|
152
159
|
else:
|
153
|
-
# Add support for passing a specific location that matches recognizer
|
154
|
-
# see: https://cloud.google.com/speech-to-text/v2/docs/speech-to-text-supported-languages
|
155
160
|
self._client = SpeechAsyncClient(
|
156
|
-
client_options=
|
157
|
-
api_endpoint=f"{self._location}-speech.googleapis.com"
|
158
|
-
)
|
161
|
+
client_options=client_options,
|
159
162
|
)
|
160
163
|
assert self._client is not None
|
161
164
|
return self._client
|
@@ -0,0 +1,18 @@
|
|
1
|
+
livekit/plugins/google/__init__.py,sha256=e_kSlFNmKhyyeliz7f4WOKc_Y0-y39QjO5nCWuguhss,1171
|
2
|
+
livekit/plugins/google/_utils.py,sha256=mjsqblhGMgAZ2MNPisAVkNsqq4gfO6vvprEKzAGoVwE,7248
|
3
|
+
livekit/plugins/google/llm.py,sha256=TVTerAabIf10AKVZr-Kn13eajhQ9RV7K4xaVD771yHU,16547
|
4
|
+
livekit/plugins/google/log.py,sha256=GI3YWN5YzrafnUccljzPRS_ZALkMNk1i21IRnTl2vNA,69
|
5
|
+
livekit/plugins/google/models.py,sha256=Q47z_tIwLCufxhJyJHH7_1bo4xdBYZBSkkvMeycuItg,1493
|
6
|
+
livekit/plugins/google/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
7
|
+
livekit/plugins/google/stt.py,sha256=zl5B8MroarvoBbOmSK5YzC1d3GJeltkpv4Y0n2XLoVE,21203
|
8
|
+
livekit/plugins/google/tts.py,sha256=95qXCigVQYWNbcN3pIKBpIah4b31U_MWtXv5Ji0AMc4,9229
|
9
|
+
livekit/plugins/google/version.py,sha256=4H1pRTakUdztFHr_mZA7ybSGAF2BVH1xhvAHHQwGqwA,601
|
10
|
+
livekit/plugins/google/beta/__init__.py,sha256=AxRYc7NGG62Tv1MmcZVCDHNvlhbC86hM-_yP01Qb28k,47
|
11
|
+
livekit/plugins/google/beta/realtime/__init__.py,sha256=sGTn6JFNyA30QUXBZ_BV3l2eHpGAzR35ByXxg77vWNU,205
|
12
|
+
livekit/plugins/google/beta/realtime/api_proto.py,sha256=ralrRZqIbE71oyuLKRYaXHvm6tcHMwBJueKvSO8Xfus,658
|
13
|
+
livekit/plugins/google/beta/realtime/realtime_api.py,sha256=RPGYAJXelYPo16YyR2qccjUjxUJCkJBU2N5rNTpKxyo,21281
|
14
|
+
livekit/plugins/google/beta/realtime/transcriber.py,sha256=ZpKA3F8dqOtJPDlPiAgjw0AUDBIuhQiBVnvSYL4cdBg,9796
|
15
|
+
livekit_plugins_google-0.10.4.dist-info/METADATA,sha256=UkYK-aE8XAbV0BIinD9e_xGJXi-Oq-oQuc_ZASS8d_c,2058
|
16
|
+
livekit_plugins_google-0.10.4.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
|
17
|
+
livekit_plugins_google-0.10.4.dist-info/top_level.txt,sha256=OoDok3xUmXbZRvOrfvvXB-Juu4DX79dlq188E19YHoo,8
|
18
|
+
livekit_plugins_google-0.10.4.dist-info/RECORD,,
|
@@ -1,18 +0,0 @@
|
|
1
|
-
livekit/plugins/google/__init__.py,sha256=e_kSlFNmKhyyeliz7f4WOKc_Y0-y39QjO5nCWuguhss,1171
|
2
|
-
livekit/plugins/google/_utils.py,sha256=mjsqblhGMgAZ2MNPisAVkNsqq4gfO6vvprEKzAGoVwE,7248
|
3
|
-
livekit/plugins/google/llm.py,sha256=o9EJBv3rS5vKRq7m5YjSSqOxtH6pPekxRS_lra35hzk,16445
|
4
|
-
livekit/plugins/google/log.py,sha256=GI3YWN5YzrafnUccljzPRS_ZALkMNk1i21IRnTl2vNA,69
|
5
|
-
livekit/plugins/google/models.py,sha256=w_qmOk5y86vjtszDiGpP9p0ctjQeaB8-UzqprxgpvCY,1407
|
6
|
-
livekit/plugins/google/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
7
|
-
livekit/plugins/google/stt.py,sha256=FA6Lpeb8QvRXLzkQ7cjsoMxHdtEGwHWkpN_TKqAdKAQ,21097
|
8
|
-
livekit/plugins/google/tts.py,sha256=95qXCigVQYWNbcN3pIKBpIah4b31U_MWtXv5Ji0AMc4,9229
|
9
|
-
livekit/plugins/google/version.py,sha256=jklx55q_NtxoIUiYD5AFOO11S_Jij8P491Y8nkw-VZk,601
|
10
|
-
livekit/plugins/google/beta/__init__.py,sha256=AxRYc7NGG62Tv1MmcZVCDHNvlhbC86hM-_yP01Qb28k,47
|
11
|
-
livekit/plugins/google/beta/realtime/__init__.py,sha256=sGTn6JFNyA30QUXBZ_BV3l2eHpGAzR35ByXxg77vWNU,205
|
12
|
-
livekit/plugins/google/beta/realtime/api_proto.py,sha256=9EhmwgeIgKDqdSijv5Q9pgx7UhAakK02ZDwbnUsra_o,657
|
13
|
-
livekit/plugins/google/beta/realtime/realtime_api.py,sha256=OwNoPmmomMtRkmYw-g2u7hIYpeIrSSNky7FlcHBVyFQ,21150
|
14
|
-
livekit/plugins/google/beta/realtime/transcriber.py,sha256=JnZ75NyiOLkpvQ5N2nDniumDKcrjiq_tlryiLbuBoDM,6658
|
15
|
-
livekit_plugins_google-0.10.2.dist-info/METADATA,sha256=dTBdAuYpGyCFVJNw0c8upUEdaFgdodWwrm1bB3a4Xp4,2058
|
16
|
-
livekit_plugins_google-0.10.2.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
|
17
|
-
livekit_plugins_google-0.10.2.dist-info/top_level.txt,sha256=OoDok3xUmXbZRvOrfvvXB-Juu4DX79dlq188E19YHoo,8
|
18
|
-
livekit_plugins_google-0.10.2.dist-info/RECORD,,
|
File without changes
|
{livekit_plugins_google-0.10.2.dist-info → livekit_plugins_google-0.10.4.dist-info}/top_level.txt
RENAMED
File without changes
|