livekit-plugins-google 0.2.0__py3-none-any.whl → 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- livekit/plugins/google/__init__.py +1 -1
- livekit/plugins/google/py.typed +0 -0
- livekit/plugins/google/stt.py +188 -104
- livekit/plugins/google/version.py +1 -1
- {livekit_plugins_google-0.2.0.dist-info → livekit_plugins_google-0.3.0.dist-info}/METADATA +3 -3
- livekit_plugins_google-0.3.0.dist-info/RECORD +9 -0
- {livekit_plugins_google-0.2.0.dist-info → livekit_plugins_google-0.3.0.dist-info}/WHEEL +1 -1
- livekit_plugins_google-0.2.0.dist-info/RECORD +0 -8
- {livekit_plugins_google-0.2.0.dist-info → livekit_plugins_google-0.3.0.dist-info}/top_level.txt +0 -0
livekit/plugins/google/stt.py
CHANGED
@@ -12,23 +12,27 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import annotations
+
+import asyncio
 import contextlib
-from typing import Optional, Union, List
-from google.auth import credentials
-from google.cloud.speech_v2 import SpeechAsyncClient
-from google.cloud.speech_v2.types import cloud_speech
-from livekit import rtc, agents
-from livekit.agents.utils import AudioBuffer
-from livekit.agents import stt
-from .models import SpeechModels, SpeechLanguages
-from dataclasses import dataclass
 import dataclasses
-import asyncio
 import logging
+from dataclasses import dataclass
+from typing import Any, AsyncIterable, Dict, List
 
+from livekit import agents, rtc
+from livekit.agents import stt
+from livekit.agents.utils import AudioBuffer
+
+from google.auth import credentials  # type: ignore
+from google.cloud.speech_v2 import SpeechAsyncClient
+from google.cloud.speech_v2.types import cloud_speech
 
-
-
+from .models import SpeechLanguages, SpeechModels
+
+LgType = SpeechLanguages | str
+LanguageCode = LgType | List[LgType]
 
 
 # This class is only be used internally to encapsulate the options
@@ -52,8 +56,8 @@ class STT(stt.STT):
         punctuate: bool = True,
         spoken_punctuation: bool = True,
         model: SpeechModels = "long",
-        credentials_info:
-        credentials_file:
+        credentials_info: Dict[str, Any] | None = None,
+        credentials_file: str | None = None,
     ):
         """
         if no credentials is provided, it will use the credentials on the environment
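For reference, a minimal construction sketch using the new 0.3.0 credential parameters shown in the hunk above. It assumes the package's usual livekit.plugins.google.STT export; the service-account path is illustrative, and with neither credentials_info nor credentials_file supplied the client falls back to the ambient Google credentials, as the docstring notes.

from livekit.plugins import google

# Pass a parsed service-account dict via credentials_info, point
# credentials_file at a JSON key file, or omit both to use the
# credentials available in the environment (e.g. GOOGLE_APPLICATION_CREDENTIALS).
stt_instance = google.STT(
    credentials_file="/path/to/service_account.json",  # illustrative path
)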
@@ -90,7 +94,7 @@ class STT(stt.STT):
     def _sanitize_options(
         self,
         *,
-        language:
+        language: str | None = None,
     ) -> STTOptions:
         config = dataclasses.replace(self._config)
 
@@ -112,7 +116,7 @@ class STT(stt.STT):
         self,
         *,
         buffer: AudioBuffer,
-        language:
+        language: SpeechLanguages | str | None = None,
     ) -> stt.SpeechEvent:
         config = self._sanitize_options(language=language)
         buffer = agents.utils.merge_frames(buffer)
@@ -144,7 +148,7 @@ class STT(stt.STT):
     def stream(
         self,
         *,
-        language:
+        language: SpeechLanguages | str | None = None,
     ) -> "SpeechStream":
         config = self._sanitize_options(language=language)
         return SpeechStream(
@@ -164,6 +168,7 @@ class SpeechStream(stt.SpeechStream):
         config: STTOptions,
         sample_rate: int = 24000,
         num_channels: int = 1,
+        max_retry: int = 32,
     ) -> None:
         super().__init__()
 
@@ -174,33 +179,15 @@ class SpeechStream(stt.SpeechStream):
         self._sample_rate = sample_rate
         self._num_channels = num_channels
 
-        self._queue = asyncio.Queue[rtc.AudioFrame]()
-        self._event_queue = asyncio.Queue[stt.SpeechEvent]()
+        self._queue = asyncio.Queue[rtc.AudioFrame | None]()
+        self._event_queue = asyncio.Queue[stt.SpeechEvent | None]()
         self._closed = False
-        self._main_task = asyncio.create_task(self._run(max_retry=
-
-        def log_exception(task: asyncio.Task) -> None:
-            if not task.cancelled() and task.exception():
-                logging.error(f"google speech task failed: {task.exception()}")
+        self._main_task = asyncio.create_task(self._run(max_retry=max_retry))
 
-        self.
-
-    def push_frame(self, frame: rtc.AudioFrame) -> None:
-        if self._closed:
-            raise ValueError("cannot push frame to closed stream")
-
-        self._queue.put_nowait(frame)
-
-    async def flush(self) -> None:
-        await self._queue.join()
-
-    async def aclose(self) -> None:
-        self._main_task.cancel()
-        with contextlib.suppress(asyncio.CancelledError):
-            await self._main_task
+        self._final_events: List[stt.SpeechEvent] = []
+        self._speaking = False
 
-
-        return cloud_speech.StreamingRecognitionConfig(
+        self._streaming_config = cloud_speech.StreamingRecognitionConfig(
             config=cloud_speech.RecognitionConfig(
                 explicit_decoding_config=cloud_speech.ExplicitDecodingConfig(
                     encoding=cloud_speech.ExplicitDecodingConfig.AudioEncoding.LINEAR16,
@@ -214,64 +201,168 @@ class SpeechStream(stt.SpeechStream):
                 ),
             ),
             streaming_features=cloud_speech.StreamingRecognitionFeatures(
+                enable_voice_activity_events=True,
                 interim_results=self._config.interim_results,
             ),
         )
 
-
-
-
-        while True:
-            try:
-                input_gen = self._input_gen(self._streaming_config())
-                stream = await self._client.streaming_recognize(requests=input_gen)
-                retry_count = 0
-
-                async for resp in stream:
-                    self._event_queue.put_nowait(
-                        streaming_recognize_response_to_speech_event(resp)
-                    )
+        def log_exception(task: asyncio.Task) -> None:
+            if not task.cancelled() and task.exception():
+                logging.error(f"google stt task failed: {task.exception()}")
 
-
-                break
-            except Exception as e:
-                if retry_count > max_retry and max_retry > 0:
-                    logging.error(f"failed to connect to Google Speech: {e}")
-                    break
+        self._main_task.add_done_callback(log_exception)
 
-
-
-
-                    f"failed to connect to Google Speech: {e} - retrying in {retry_delay}s"
-                )
-                await asyncio.sleep(retry_delay)
+    def push_frame(self, frame: rtc.AudioFrame) -> None:
+        if self._closed:
+            raise ValueError("cannot push frame to closed stream")
 
+        self._queue.put_nowait(frame)
+
+    async def aclose(self, wait: bool = True) -> None:
         self._closed = True
+        if not wait:
+            self._main_task.cancel()
 
-
-
-
-
+        self._queue.put_nowait(None)
+        with contextlib.suppress(asyncio.CancelledError):
+            await self._main_task
+
+    async def _run(self, max_retry: int) -> None:
+        retry_count = 0
         try:
-
-
-
-
-
-
-
-
-
+            while not self._closed:
+                try:
+                    # google requires a async generator when calling streaming_recognize
+                    # this function basically convert the queue into a async generator
+                    async def input_generator():
+                        try:
+                            # first request should contain the config
+                            yield cloud_speech.StreamingRecognizeRequest(
+                                recognizer=self._recognizer,
+                                streaming_config=self._streaming_config,
+                            )
+                            while True:
+                                frame = (
+                                    await self._queue.get()
+                                )  # wait for a new rtc.AudioFrame
+                                if frame is None:
+                                    break  # None is sent inside aclose
+
+                                self._queue.task_done()
+                                frame = frame.remix_and_resample(
+                                    self._sample_rate, self._num_channels
+                                )
+                                yield cloud_speech.StreamingRecognizeRequest(
+                                    audio=frame.data.tobytes(),
+                                )
+                        except Exception as e:
+                            logging.error(
+                                f"an error occurred while streaming inputs: {e}"
+                            )
+
+                    # try to connect
+                    stream = await self._client.streaming_recognize(
+                        requests=input_generator()
+                    )
+                    retry_count = 0  # connection successful, reset retry count
+
+                    await self._run_stream(stream)
+                except Exception as e:
+                    if retry_count >= max_retry:
+                        logging.error(
+                            f"failed to connect to google stt after {max_retry} tries",
+                            exc_info=e,
+                        )
+                        break
+
+                    retry_delay = min(retry_count * 2, 10)  # max 10s
+                    retry_count += 1
+                    logging.warning(
+                        f"google stt connection failed, retrying in {retry_delay}s",
+                        exc_info=e,
+                    )
+                    await asyncio.sleep(retry_delay)
+        finally:
+            self._event_queue.put_nowait(None)
+
+    async def _run_stream(
+        self, stream: AsyncIterable[cloud_speech.StreamingRecognizeResponse]
+    ):
+        async for resp in stream:
+            if (
+                resp.speech_event_type
+                == cloud_speech.StreamingRecognizeResponse.SpeechEventType.SPEECH_ACTIVITY_BEGIN
+            ):
+                self._speaking = True
+                start_event = stt.SpeechEvent(
+                    type=stt.SpeechEventType.START_OF_SPEECH,
                 )
-                self.
-
-
+                self._event_queue.put_nowait(start_event)
+
+            if (
+                resp.speech_event_type
+                == cloud_speech.StreamingRecognizeResponse.SpeechEventType.SPEECH_EVENT_TYPE_UNSPECIFIED
+            ):
+                result = resp.results[0]
+                if not result.is_final:
+                    # interim results
+                    iterim_event = stt.SpeechEvent(
+                        type=stt.SpeechEventType.INTERIM_TRANSCRIPT,
+                        alternatives=streaming_recognize_response_to_speech_data(resp),
+                    )
+                    self._event_queue.put_nowait(iterim_event)
+
+                else:
+                    final_event = stt.SpeechEvent(
+                        type=stt.SpeechEventType.FINAL_TRANSCRIPT,
+                        alternatives=streaming_recognize_response_to_speech_data(resp),
+                    )
+                    self._final_events.append(final_event)
+                    self._event_queue.put_nowait(final_event)
+
+                    if not self._speaking:
+                        # With Google STT, we receive the final event after the END_OF_SPEECH event
+                        sentence = ""
+                        confidence = 0.0
+                        for alt in self._final_events:
+                            sentence += f"{alt.alternatives[0].text.strip()} "
+                            confidence += alt.alternatives[0].confidence
+
+                        sentence = sentence.rstrip()
+                        confidence /= len(self._final_events)  # avg. of confidence
+
+                        end_event = stt.SpeechEvent(
+                            type=stt.SpeechEventType.END_OF_SPEECH,
+                            alternatives=[
+                                stt.SpeechData(
+                                    language=result.language_code,
+                                    start_time=self._final_events[0]
+                                    .alternatives[0]
+                                    .start_time,
+                                    end_time=self._final_events[-1]
+                                    .alternatives[0]
+                                    .end_time,
+                                    confidence=confidence,
+                                    text=sentence,
+                                )
+                            ],
+                        )
+
+                        self._final_events = []
+                        self._event_queue.put_nowait(end_event)
+
+            if (
+                resp.speech_event_type
+                == cloud_speech.StreamingRecognizeResponse.SpeechEventType.SPEECH_ACTIVITY_END
+            ):
+                self._speaking = False
 
     async def __anext__(self) -> stt.SpeechEvent:
-
+        evt = await self._event_queue.get()
+        if evt is None:
             raise StopAsyncIteration
 
-        return
+        return evt
 
 
 def recognize_response_to_speech_event(
@@ -280,8 +371,7 @@ def recognize_response_to_speech_event(
     result = resp.results[0]
     gg_alts = result.alternatives
     return stt.SpeechEvent(
-
-        end_of_speech=True,
+        type=stt.SpeechEventType.FINAL_TRANSCRIPT,
         alternatives=[
             stt.SpeechData(
                 language=result.language_code,
@@ -295,24 +385,18 @@ def recognize_response_to_speech_event(
     )
 
 
-def
+def streaming_recognize_response_to_speech_data(
     resp: cloud_speech.StreamingRecognizeResponse,
-) -> stt.
+) -> List[stt.SpeechData]:
     result = resp.results[0]
     gg_alts = result.alternatives
-    return
-
-
-
-
-
-
-
-
-
-                confidence=alt.confidence,
-                text=alt.transcript,
-            )
-            for alt in gg_alts
-        ],
-    )
+    return [
+        stt.SpeechData(
+            language=result.language_code,
+            start_time=alt.words[0].start_offset.seconds if alt.words else 0,
+            end_time=alt.words[-1].end_offset.seconds if alt.words else 0,
+            confidence=alt.confidence,
+            text=alt.transcript,
+        )
+        for alt in gg_alts
+    ]
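Taken together, the stt.py changes in 0.3.0 give SpeechStream a push_frame/aclose surface backed by an internal _run task with retry, and emit START_OF_SPEECH, INTERIM_TRANSCRIPT, FINAL_TRANSCRIPT and an aggregated END_OF_SPEECH event. A minimal consumption sketch of that surface follows; the frame source is left abstract as an AsyncIterable of rtc.AudioFrame, and the helper name transcribe_frames, the "en-US" language and the print handling are illustrative, not part of the package.

import asyncio
from typing import AsyncIterable

from livekit import rtc
from livekit.agents import stt
from livekit.plugins import google


async def transcribe_frames(frames: AsyncIterable[rtc.AudioFrame]) -> None:
    # SpeechStream as of 0.3.0: push rtc.AudioFrames in, iterate stt.SpeechEvents out.
    stream = google.STT().stream(language="en-US")

    async def feed() -> None:
        async for frame in frames:
            stream.push_frame(frame)
        # aclose() enqueues the None sentinel so the recognizer task drains and exits.
        await stream.aclose()

    feeder = asyncio.create_task(feed())

    async for event in stream:
        if event.type == stt.SpeechEventType.FINAL_TRANSCRIPT:
            print("final:", event.alternatives[0].text)
        elif event.type == stt.SpeechEventType.END_OF_SPEECH:
            # Aggregated sentence with averaged confidence, per the diff above.
            print("utterance:", event.alternatives[0].text)

    await feeder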
{livekit_plugins_google-0.2.0.dist-info → livekit_plugins_google-0.3.0.dist-info}/METADATA
RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: livekit-plugins-google
-Version: 0.2.0
+Version: 0.3.0
 Summary: Agent Framework plugin for services from Google Cloud
 Home-page: https://github.com/livekit/agents
 License: Apache-2.0
@@ -29,8 +29,8 @@ Requires-Dist: google-cloud-speech <3,>=2
 Requires-Dist: google-cloud-texttospeech <3,>=2
 Requires-Dist: google-cloud-translate <4,>=3
 Requires-Dist: googleapis-common-protos <2,>=1
-Requires-Dist: livekit >=0.9.
-Requires-Dist: livekit-agents
+Requires-Dist: livekit >=0.9.2
+Requires-Dist: livekit-agents ~=0.5.dev0
 
 # LiveKit Plugins Google
 
livekit_plugins_google-0.3.0.dist-info/RECORD
ADDED
@@ -0,0 +1,9 @@
+livekit/plugins/google/__init__.py,sha256=snPMHNLrurYbLWQOkV_o6qG1CEWsOCZ8ZfPMvmh5ejY,931
+livekit/plugins/google/models.py,sha256=DgiXOvGDO8D9rfCKHJL28lbyQR8mXXB2kpku-szXLRs,1185
+livekit/plugins/google/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+livekit/plugins/google/stt.py,sha256=lYA8hlkxG3YSw1Q34j8hgs4us5Ij-TLBQTRwtGPN9MY,15025
+livekit/plugins/google/version.py,sha256=G5iYozum4q7UpHwW43F7QfhzUfwcncPxBZ0gmUGsd5I,600
+livekit_plugins_google-0.3.0.dist-info/METADATA,sha256=sPd3OZxViD0Aq1uF1qJpbsYeqLAlq8tB720JXk-_RKw,1945
+livekit_plugins_google-0.3.0.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
+livekit_plugins_google-0.3.0.dist-info/top_level.txt,sha256=OoDok3xUmXbZRvOrfvvXB-Juu4DX79dlq188E19YHoo,8
+livekit_plugins_google-0.3.0.dist-info/RECORD,,
livekit_plugins_google-0.2.0.dist-info/RECORD
DELETED
@@ -1,8 +0,0 @@
-livekit/plugins/google/__init__.py,sha256=uDkfCsfqWmuPDrDolu-nJrZxpTD53pTCaRVWmyA8a6w,918
-livekit/plugins/google/models.py,sha256=DgiXOvGDO8D9rfCKHJL28lbyQR8mXXB2kpku-szXLRs,1185
-livekit/plugins/google/stt.py,sha256=efyE7vjxWuO99dR9-nSLF9LkmoX0khOwXpayh7-5saY,11149
-livekit/plugins/google/version.py,sha256=cLFCdnm5S21CiJ5UJBcqfRvvFkCQ8p6M5fFUJVJkEiM,600
-livekit_plugins_google-0.2.0.dist-info/METADATA,sha256=8tnZ8TW_UHy87ADQvAJSGFqm42Yi-E30bvV2x1LzzBg,1942
-livekit_plugins_google-0.2.0.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
-livekit_plugins_google-0.2.0.dist-info/top_level.txt,sha256=OoDok3xUmXbZRvOrfvvXB-Juu4DX79dlq188E19YHoo,8
-livekit_plugins_google-0.2.0.dist-info/RECORD,,
{livekit_plugins_google-0.2.0.dist-info → livekit_plugins_google-0.3.0.dist-info}/top_level.txt
RENAMED
File without changes