livekit-plugins-google 0.11.2__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- livekit/plugins/google/beta/realtime/__init__.py +1 -5
- livekit/plugins/google/beta/realtime/api_proto.py +2 -4
- livekit/plugins/google/beta/realtime/realtime_api.py +407 -449
- livekit/plugins/google/llm.py +158 -220
- livekit/plugins/google/stt.py +80 -115
- livekit/plugins/google/tts.py +50 -55
- livekit/plugins/google/utils.py +251 -0
- livekit/plugins/google/version.py +1 -1
- {livekit_plugins_google-0.11.2.dist-info → livekit_plugins_google-1.0.0.dist-info}/METADATA +11 -21
- livekit_plugins_google-1.0.0.dist-info/RECORD +16 -0
- {livekit_plugins_google-0.11.2.dist-info → livekit_plugins_google-1.0.0.dist-info}/WHEEL +1 -2
- livekit/plugins/google/_utils.py +0 -199
- livekit/plugins/google/beta/realtime/transcriber.py +0 -270
- livekit_plugins_google-0.11.2.dist-info/RECORD +0 -18
- livekit_plugins_google-0.11.2.dist-info/top_level.txt +0 -1
@@ -1,270 +0,0 @@
|
|
1
|
-
from __future__ import annotations
|
2
|
-
|
3
|
-
import asyncio
|
4
|
-
import re
|
5
|
-
from dataclasses import dataclass
|
6
|
-
from typing import Literal
|
7
|
-
|
8
|
-
import websockets
|
9
|
-
from livekit import rtc
|
10
|
-
from livekit.agents import APIConnectionError, APIStatusError, utils
|
11
|
-
|
12
|
-
from google import genai
|
13
|
-
from google.genai import types
|
14
|
-
from google.genai.errors import APIError, ClientError, ServerError
|
15
|
-
|
16
|
-
from ...log import logger
|
17
|
-
from .api_proto import ClientEvents, LiveAPIModels
|
18
|
-
|
19
|
-
# Names of the events emitted by the transcriber classes in this module.
EventTypes = Literal["input_speech_started", "input_speech_done"]

# Language name interpolated into SYSTEM_INSTRUCTIONS below.
DEFAULT_LANGUAGE = "English"

# Prompt handed to the model as its transcription instructions.
# This is an f-string, so DEFAULT_LANGUAGE is substituted once, when the
# module is imported; changing DEFAULT_LANGUAGE afterwards has no effect.
SYSTEM_INSTRUCTIONS = f"""
You are an **Audio Transcriber**. Your task is to convert audio content into accurate and precise text.
- Transcribe verbatim; exclude non-speech sounds.
- Provide only transcription; no extra text or explanations.
- If audio is unclear, respond with: `...`
- Ensure error-free transcription, preserving meaning and context.
- Use proper punctuation and formatting.
- Do not add explanations, comments, or extra information.
- Do not include timestamps, speaker labels, or annotations unless specified.
- Audio Language: {DEFAULT_LANGUAGE}
"""
|
34
|
-
|
35
|
-
|
36
|
-
@dataclass
class TranscriptionContent:
    """Payload passed to listeners of the transcription events."""

    # Identifier of the response this text belongs to; the transcribers in
    # this module generate it with utils.shortuuid().
    response_id: str
    # Transcribed text. It is empty when the object is first emitted with
    # "input_speech_started" and is filled in as text parts arrive.
    text: str
|
40
|
-
|
41
|
-
|
42
|
-
class TranscriberSession(utils.EventEmitter[EventTypes]):
    """
    Handles live audio transcription using the realtime API.

    A background task opens a live session through
    ``client.aio.live.connect`` (text-only responses, temperature 0) and runs
    two workers: one forwards queued audio to the session, the other reads
    responses and emits events:

    - ``"input_speech_started"`` with an empty ``TranscriptionContent`` when
      the first response of a turn arrives;
    - ``"input_speech_done"`` with the accumulated, whitespace-normalised
      text when the server marks the turn complete.

    The constructor calls ``asyncio.create_task`` and therefore has to run
    inside a running event loop.
    """

    def __init__(self, *, client: genai.Client, model: LiveAPIModels | str):
        """
        Args:
            client: google-genai client used to open the live session.
            model: name of the live model to connect to.
        """
        super().__init__()
        self._client = client
        self._model = model
        # Sample rate (Hz) every frame is converted to before being sent.
        self._needed_sr = 16000
        self._closed = False

        system_instructions = types.Content(
            parts=[types.Part(text=SYSTEM_INSTRUCTIONS)]
        )
        self._config = types.LiveConnectConfig(
            response_modalities=[types.Modality.TEXT],
            system_instruction=system_instructions,
            generation_config=types.GenerationConfig(temperature=0.0),
        )
        self._send_ch = utils.aio.Chan[ClientEvents]()
        self._resampler: rtc.AudioResampler | None = None
        self._active_response_id = None

        # Start the worker last so it can never observe a partially
        # initialised instance, whatever gets added to __init__ later.
        self._main_atask = asyncio.create_task(
            self._main_task(), name="gemini-realtime-transcriber"
        )

    def _push_audio(self, frame: rtc.AudioFrame) -> None:
        """Queue one audio frame for transcription.

        Frames are ignored once the session is closed. A resampler is created
        the first time a frame with a sample rate other than ``_needed_sr``
        is seen; from then on every frame goes through it.
        """
        if self._closed:
            return

        if frame.sample_rate != self._needed_sr:
            if not self._resampler:
                self._resampler = rtc.AudioResampler(
                    frame.sample_rate,
                    self._needed_sr,
                    quality=rtc.AudioResamplerQuality.HIGH,
                )

        # Single send path: either the resampler output or the frame itself.
        frames = self._resampler.push(frame) if self._resampler else [frame]
        for f in frames:
            self._queue_msg(
                types.LiveClientRealtimeInput(
                    media_chunks=[
                        types.Blob(data=f.data.tobytes(), mime_type="audio/pcm")
                    ]
                )
            )

    def _queue_msg(self, msg: ClientEvents) -> None:
        """Hand ``msg`` to the send worker unless the session is closed."""
        if not self._closed:
            self._send_ch.send_nowait(msg)

    async def aclose(self) -> None:
        """Stop the session and wait for the background task to finish.

        Calling it again after the send channel is closed is a no-op.
        """
        if self._send_ch.closed:
            return
        self._closed = True
        self._send_ch.close()
        await self._main_atask

    @utils.log_exceptions(logger=logger)
    async def _main_task(self):
        """Own the live session and run the send/receive workers."""

        @utils.log_exceptions(logger=logger)
        async def _send_task():
            # Forward queued client events until the channel is closed.
            try:
                async for msg in self._send_ch:
                    if self._closed:
                        break
                    await self._session.send(input=msg)
            except websockets.exceptions.ConnectionClosedError as e:
                logger.exception(f"Transcriber session closed in _send_task: {e}")
                self._closed = True
            except Exception as e:
                logger.exception(f"Uncaught error in transcriber _send_task: {e}")
                self._closed = True

        @utils.log_exceptions(logger=logger)
        async def _recv_task():
            # Turn server responses into transcription events.
            try:
                while not self._closed:
                    async for response in self._session.receive():
                        if self._closed:
                            break
                        if self._active_response_id is None:
                            # First response of a new turn.
                            self._active_response_id = utils.shortuuid()
                            content = TranscriptionContent(
                                response_id=self._active_response_id,
                                text="",
                            )
                            self.emit("input_speech_started", content)

                        server_content = response.server_content
                        if server_content:
                            model_turn = server_content.model_turn
                            if model_turn:
                                for part in model_turn.parts:
                                    if part.text:
                                        content.text += part.text

                            if server_content.turn_complete:
                                content.text = clean_transcription(content.text)
                                self.emit("input_speech_done", content)
                                self._active_response_id = None

            except websockets.exceptions.ConnectionClosedError as e:
                logger.exception(f"Transcriber session closed in _recv_task: {e}")
                self._closed = True
            except Exception as e:
                logger.exception(f"Uncaught error in transcriber _recv_task: {e}")
                self._closed = True

        async with self._client.aio.live.connect(
            model=self._model, config=self._config
        ) as session:
            self._session = session
            tasks = [
                asyncio.create_task(
                    _send_task(), name="gemini-realtime-transcriber-send"
                ),
                asyncio.create_task(
                    _recv_task(), name="gemini-realtime-transcriber-recv"
                ),
            ]

            try:
                # Stop as soon as either worker ends. Waiting for both (as
                # asyncio.gather does) lets aclose() hang: after the send
                # channel is closed the receive worker can stay blocked in
                # session.receive() until the server happens to send
                # something. Nothing is lost by cancelling it, because the
                # receive worker discards responses once _closed is set.
                await asyncio.wait(tasks, return_when=asyncio.FIRST_COMPLETED)
            finally:
                await utils.aio.gracefully_cancel(*tasks)
                await self._session.close()
|
178
|
-
|
179
|
-
|
180
|
-
class ModelTranscriber(utils.EventEmitter[EventTypes]):
    """
    Transcribes agent audio using model generation.

    Audio buffers pushed with ``_push_audio`` are transcribed one at a time
    with ``client.aio.models.generate_content``; each result is emitted as an
    ``"input_speech_done"`` event carrying a ``TranscriptionContent``.

    The constructor calls ``asyncio.create_task`` and therefore has to run
    inside a running event loop.
    """

    def __init__(self, *, client: genai.Client, model: LiveAPIModels | str):
        """
        Args:
            client: google-genai client used for the generation requests.
            model: name of the model that performs the transcription.
        """
        super().__init__()
        self._client = client
        self._model = model
        # Sample rate (Hz) the audio is converted to before being uploaded.
        self._needed_sr = 16000
        self._system_instructions = types.Content(
            parts=[types.Part(text=SYSTEM_INSTRUCTIONS)]
        )
        self._config = types.GenerateContentConfig(
            temperature=0.0,
            system_instruction=self._system_instructions,
            # TODO: add response_schema
        )
        self._resampler: rtc.AudioResampler | None = None
        self._buffer: rtc.AudioFrame | None = None
        self._audio_ch = utils.aio.Chan[rtc.AudioFrame]()
        self._main_atask = asyncio.create_task(
            self._main_task(), name="gemini-model-transcriber"
        )

    async def aclose(self) -> None:
        """Close the audio channel and wait for the worker to finish.

        Calling it again after the channel is closed is a no-op.
        """
        if self._audio_ch.closed:
            return
        self._audio_ch.close()
        await self._main_atask

    def _push_audio(self, frames: list[rtc.AudioFrame]) -> None:
        """Merge ``frames`` into one buffer and queue it for transcription.

        Nothing is queued when ``frames`` is empty, when the transcriber has
        been closed, or when the resampler has not produced any output yet.
        """
        # Pushing after aclose() would otherwise fail on the closed channel;
        # TranscriberSession ignores late audio the same way.
        if not frames or self._audio_ch.closed:
            return

        buffer = utils.merge_frames(frames)

        if buffer.sample_rate != self._needed_sr:
            if self._resampler is None:
                self._resampler = rtc.AudioResampler(
                    input_rate=buffer.sample_rate,
                    output_rate=self._needed_sr,
                    quality=rtc.AudioResamplerQuality.HIGH,
                )

            resampled = self._resampler.push(buffer)
            if not resampled:
                # NOTE(review): the resampler can presumably return no frames
                # for a short input; merging an empty list is not valid, so
                # skip this round — confirm against AudioResampler.push.
                return
            buffer = utils.merge_frames(resampled)

        self._audio_ch.send_nowait(buffer)

    @utils.log_exceptions(logger=logger)
    async def _main_task(self):
        """Transcribe queued buffers until the audio channel is closed.

        Raises:
            APIStatusError: the API reported an error for a request.
            APIConnectionError: any other failure while transcribing.
        """
        request_id = utils.shortuuid()
        try:
            async for buffer in self._audio_ch:
                # TODO: stream content for better latency
                response = await self._client.aio.models.generate_content(
                    model=self._model,
                    contents=[
                        types.Content(
                            parts=[
                                types.Part(text=SYSTEM_INSTRUCTIONS),
                                types.Part.from_bytes(
                                    data=buffer.to_wav_bytes(),
                                    mime_type="audio/wav",
                                ),
                            ],
                            role="user",
                        )
                    ],
                    config=self._config,
                )
                # response.text may be None when the reply has no text parts;
                # treat that as an empty transcription instead of letting the
                # cleanup crash and end the worker with a misleading
                # APIConnectionError.
                content = TranscriptionContent(
                    response_id=request_id,
                    text=clean_transcription(response.text or ""),
                )
                self.emit("input_speech_done", content)

        except (ClientError, ServerError, APIError) as e:
            raise APIStatusError(
                f"model transcriber error: {e}",
                status_code=e.code,
                body=e.message,
                request_id=request_id,
            ) from e
        except Exception as e:
            raise APIConnectionError("Error generating transcription") from e
|
265
|
-
|
266
|
-
|
267
|
-
def clean_transcription(text: str) -> str:
|
268
|
-
text = text.replace("\n", " ")
|
269
|
-
text = re.sub(r"\s+", " ", text)
|
270
|
-
return text.strip()
|
@@ -1,18 +0,0 @@
|
|
1
|
-
livekit/plugins/google/__init__.py,sha256=e_kSlFNmKhyyeliz7f4WOKc_Y0-y39QjO5nCWuguhss,1171
|
2
|
-
livekit/plugins/google/_utils.py,sha256=FG1_26nlWGcI6onPleQQcmGBMfb4QNYgis1B5BMJxWA,7131
|
3
|
-
livekit/plugins/google/llm.py,sha256=LZaHsrkjfboRZLWm7L2G0mw62q2sXBNj4YeeV2Sk2uU,16717
|
4
|
-
livekit/plugins/google/log.py,sha256=GI3YWN5YzrafnUccljzPRS_ZALkMNk1i21IRnTl2vNA,69
|
5
|
-
livekit/plugins/google/models.py,sha256=SGjAumdDK97NNLwMFcqZdKR68f1NoGB2Rk1UP2-imG0,1457
|
6
|
-
livekit/plugins/google/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
7
|
-
livekit/plugins/google/stt.py,sha256=l6UB9oaM7eFInnI_0t7Ub-edXLVRgvaiyHj-e_gEuwE,22781
|
8
|
-
livekit/plugins/google/tts.py,sha256=pG9_pibO3NDGEMa4huU5S9lbeyI3daQyrS17SuTKfZI,8008
|
9
|
-
livekit/plugins/google/version.py,sha256=_06ctkD1XWTWec2BVgcsxun2sFLxqnvJJJs7ZxIBuHA,601
|
10
|
-
livekit/plugins/google/beta/__init__.py,sha256=AxRYc7NGG62Tv1MmcZVCDHNvlhbC86hM-_yP01Qb28k,47
|
11
|
-
livekit/plugins/google/beta/realtime/__init__.py,sha256=sGTn6JFNyA30QUXBZ_BV3l2eHpGAzR35ByXxg77vWNU,205
|
12
|
-
livekit/plugins/google/beta/realtime/api_proto.py,sha256=9EhmwgeIgKDqdSijv5Q9pgx7UhAakK02ZDwbnUsra_o,657
|
13
|
-
livekit/plugins/google/beta/realtime/realtime_api.py,sha256=8JdWUMUheGhy1ia6JbN3_U2_cL7CNs8-1fTOAgW4I38,22999
|
14
|
-
livekit/plugins/google/beta/realtime/transcriber.py,sha256=rjXO0cSPr3HATxrSfv1MX7IbrjmiTvnLPF280BfRBL8,9809
|
15
|
-
livekit_plugins_google-0.11.2.dist-info/METADATA,sha256=MQF9voerbBB1t5fGRw94z7jyfgJOnsM-DmWxtCT10V8,3732
|
16
|
-
livekit_plugins_google-0.11.2.dist-info/WHEEL,sha256=1tXe9gY0PYatrMPMDd6jXqjfpz_B-Wqm32CPfRC58XU,91
|
17
|
-
livekit_plugins_google-0.11.2.dist-info/top_level.txt,sha256=OoDok3xUmXbZRvOrfvvXB-Juu4DX79dlq188E19YHoo,8
|
18
|
-
livekit_plugins_google-0.11.2.dist-info/RECORD,,
|
@@ -1 +0,0 @@
|
|
1
|
-
livekit
|