livekit-plugins-google 0.2.0__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -22,7 +22,7 @@ from livekit.agents import Plugin
22
22
 
23
23
  class GooglePlugin(Plugin):
24
24
  def __init__(self):
25
- super().__init__(__name__, __version__)
25
+ super().__init__(__name__, __version__, __package__)
26
26
 
27
27
  def download_files(self):
28
28
  pass
File without changes
@@ -12,23 +12,27 @@
12
12
  # See the License for the specific language governing permissions and
13
13
  # limitations under the License.
14
14
 
15
+ from __future__ import annotations
16
+
17
+ import asyncio
15
18
  import contextlib
16
- from typing import Optional, Union, List
17
- from google.auth import credentials
18
- from google.cloud.speech_v2 import SpeechAsyncClient
19
- from google.cloud.speech_v2.types import cloud_speech
20
- from livekit import rtc, agents
21
- from livekit.agents.utils import AudioBuffer
22
- from livekit.agents import stt
23
- from .models import SpeechModels, SpeechLanguages
24
- from dataclasses import dataclass
25
19
  import dataclasses
26
- import asyncio
27
20
  import logging
21
+ from dataclasses import dataclass
22
+ from typing import Any, AsyncIterable, Dict, List
28
23
 
24
+ from livekit import agents, rtc
25
+ from livekit.agents import stt
26
+ from livekit.agents.utils import AudioBuffer
27
+
28
+ from google.auth import credentials # type: ignore
29
+ from google.cloud.speech_v2 import SpeechAsyncClient
30
+ from google.cloud.speech_v2.types import cloud_speech
29
31
 
30
- LgType = Union[SpeechLanguages, str]
31
- LanguageCode = Union[LgType, List[LgType]]
32
+ from .models import SpeechLanguages, SpeechModels
33
+
34
+ LgType = SpeechLanguages | str
35
+ LanguageCode = LgType | List[LgType]
32
36
 
33
37
 
34
38
  # This class is only used internally to encapsulate the options
@@ -52,8 +56,8 @@ class STT(stt.STT):
52
56
  punctuate: bool = True,
53
57
  spoken_punctuation: bool = True,
54
58
  model: SpeechModels = "long",
55
- credentials_info: Optional[dict] = None,
56
- credentials_file: Optional[str] = None,
59
+ credentials_info: Dict[str, Any] | None = None,
60
+ credentials_file: str | None = None,
57
61
  ):
58
62
  """
59
63
  if no credentials are provided, it will use the credentials on the environment
@@ -90,7 +94,7 @@ class STT(stt.STT):
90
94
  def _sanitize_options(
91
95
  self,
92
96
  *,
93
- language: Optional[str] = None,
97
+ language: str | None = None,
94
98
  ) -> STTOptions:
95
99
  config = dataclasses.replace(self._config)
96
100
 
@@ -112,7 +116,7 @@ class STT(stt.STT):
112
116
  self,
113
117
  *,
114
118
  buffer: AudioBuffer,
115
- language: Optional[Union[SpeechLanguages, str]] = None,
119
+ language: SpeechLanguages | str | None = None,
116
120
  ) -> stt.SpeechEvent:
117
121
  config = self._sanitize_options(language=language)
118
122
  buffer = agents.utils.merge_frames(buffer)
@@ -144,7 +148,7 @@ class STT(stt.STT):
144
148
  def stream(
145
149
  self,
146
150
  *,
147
- language: Optional[Union[SpeechLanguages, str]] = None,
151
+ language: SpeechLanguages | str | None = None,
148
152
  ) -> "SpeechStream":
149
153
  config = self._sanitize_options(language=language)
150
154
  return SpeechStream(
@@ -164,6 +168,7 @@ class SpeechStream(stt.SpeechStream):
164
168
  config: STTOptions,
165
169
  sample_rate: int = 24000,
166
170
  num_channels: int = 1,
171
+ max_retry: int = 32,
167
172
  ) -> None:
168
173
  super().__init__()
169
174
 
@@ -174,33 +179,15 @@ class SpeechStream(stt.SpeechStream):
174
179
  self._sample_rate = sample_rate
175
180
  self._num_channels = num_channels
176
181
 
177
- self._queue = asyncio.Queue[rtc.AudioFrame]()
178
- self._event_queue = asyncio.Queue[stt.SpeechEvent]()
182
+ self._queue = asyncio.Queue[rtc.AudioFrame | None]()
183
+ self._event_queue = asyncio.Queue[stt.SpeechEvent | None]()
179
184
  self._closed = False
180
- self._main_task = asyncio.create_task(self._run(max_retry=32))
181
-
182
- def log_exception(task: asyncio.Task) -> None:
183
- if not task.cancelled() and task.exception():
184
- logging.error(f"google speech task failed: {task.exception()}")
185
+ self._main_task = asyncio.create_task(self._run(max_retry=max_retry))
185
186
 
186
- self._main_task.add_done_callback(log_exception)
187
-
188
- def push_frame(self, frame: rtc.AudioFrame) -> None:
189
- if self._closed:
190
- raise ValueError("cannot push frame to closed stream")
191
-
192
- self._queue.put_nowait(frame)
193
-
194
- async def flush(self) -> None:
195
- await self._queue.join()
196
-
197
- async def aclose(self) -> None:
198
- self._main_task.cancel()
199
- with contextlib.suppress(asyncio.CancelledError):
200
- await self._main_task
187
+ self._final_events: List[stt.SpeechEvent] = []
188
+ self._speaking = False
201
189
 
202
- def _streaming_config(self) -> cloud_speech.StreamingRecognitionConfig:
203
- return cloud_speech.StreamingRecognitionConfig(
190
+ self._streaming_config = cloud_speech.StreamingRecognitionConfig(
204
191
  config=cloud_speech.RecognitionConfig(
205
192
  explicit_decoding_config=cloud_speech.ExplicitDecodingConfig(
206
193
  encoding=cloud_speech.ExplicitDecodingConfig.AudioEncoding.LINEAR16,
@@ -214,64 +201,168 @@ class SpeechStream(stt.SpeechStream):
214
201
  ),
215
202
  ),
216
203
  streaming_features=cloud_speech.StreamingRecognitionFeatures(
204
+ enable_voice_activity_events=True,
217
205
  interim_results=self._config.interim_results,
218
206
  ),
219
207
  )
220
208
 
221
- async def _run(self, max_retry: int) -> None:
222
- """Try to connect to Google Speech API and forward frames"""
223
- retry_count = 0
224
- while True:
225
- try:
226
- input_gen = self._input_gen(self._streaming_config())
227
- stream = await self._client.streaming_recognize(requests=input_gen)
228
- retry_count = 0
229
-
230
- async for resp in stream:
231
- self._event_queue.put_nowait(
232
- streaming_recognize_response_to_speech_event(resp)
233
- )
209
+ def log_exception(task: asyncio.Task) -> None:
210
+ if not task.cancelled() and task.exception():
211
+ logging.error(f"google stt task failed: {task.exception()}")
234
212
 
235
- except asyncio.CancelledError:
236
- break
237
- except Exception as e:
238
- if retry_count > max_retry and max_retry > 0:
239
- logging.error(f"failed to connect to Google Speech: {e}")
240
- break
213
+ self._main_task.add_done_callback(log_exception)
241
214
 
242
- retry_delay = min(retry_count * 5, 5) # max 5s
243
- retry_count += 1
244
- logging.warning(
245
- f"failed to connect to Google Speech: {e} - retrying in {retry_delay}s"
246
- )
247
- await asyncio.sleep(retry_delay)
215
+ def push_frame(self, frame: rtc.AudioFrame) -> None:
216
+ if self._closed:
217
+ raise ValueError("cannot push frame to closed stream")
248
218
 
219
+ self._queue.put_nowait(frame)
220
+
221
+ async def aclose(self, wait: bool = True) -> None:
249
222
  self._closed = True
223
+ if not wait:
224
+ self._main_task.cancel()
250
225
 
251
- async def _input_gen(self, config):
252
- """
253
- Convert our input queue to a generator (needed by the Google Speech client in Python)
254
- """
226
+ self._queue.put_nowait(None)
227
+ with contextlib.suppress(asyncio.CancelledError):
228
+ await self._main_task
229
+
230
+ async def _run(self, max_retry: int) -> None:
231
+ retry_count = 0
255
232
  try:
256
- yield cloud_speech.StreamingRecognizeRequest(
257
- recognizer=self._recognizer,
258
- streaming_config=config,
259
- )
260
- while True:
261
- frame = await self._queue.get() # wait for a new rtc.AudioFrame
262
- frame = frame.remix_and_resample(self._sample_rate, self._num_channels)
263
- yield cloud_speech.StreamingRecognizeRequest(
264
- audio=frame.data.tobytes(),
233
+ while not self._closed:
234
+ try:
235
+ # google requires an async generator when calling streaming_recognize
236
+ # this function basically converts the queue into an async generator
237
+ async def input_generator():
238
+ try:
239
+ # first request should contain the config
240
+ yield cloud_speech.StreamingRecognizeRequest(
241
+ recognizer=self._recognizer,
242
+ streaming_config=self._streaming_config,
243
+ )
244
+ while True:
245
+ frame = (
246
+ await self._queue.get()
247
+ ) # wait for a new rtc.AudioFrame
248
+ if frame is None:
249
+ break # None is sent inside aclose
250
+
251
+ self._queue.task_done()
252
+ frame = frame.remix_and_resample(
253
+ self._sample_rate, self._num_channels
254
+ )
255
+ yield cloud_speech.StreamingRecognizeRequest(
256
+ audio=frame.data.tobytes(),
257
+ )
258
+ except Exception as e:
259
+ logging.error(
260
+ f"an error occurred while streaming inputs: {e}"
261
+ )
262
+
263
+ # try to connect
264
+ stream = await self._client.streaming_recognize(
265
+ requests=input_generator()
266
+ )
267
+ retry_count = 0 # connection successful, reset retry count
268
+
269
+ await self._run_stream(stream)
270
+ except Exception as e:
271
+ if retry_count >= max_retry:
272
+ logging.error(
273
+ f"failed to connect to google stt after {max_retry} tries",
274
+ exc_info=e,
275
+ )
276
+ break
277
+
278
+ retry_delay = min(retry_count * 2, 10) # max 10s
279
+ retry_count += 1
280
+ logging.warning(
281
+ f"google stt connection failed, retrying in {retry_delay}s",
282
+ exc_info=e,
283
+ )
284
+ await asyncio.sleep(retry_delay)
285
+ finally:
286
+ self._event_queue.put_nowait(None)
287
+
288
+ async def _run_stream(
289
+ self, stream: AsyncIterable[cloud_speech.StreamingRecognizeResponse]
290
+ ):
291
+ async for resp in stream:
292
+ if (
293
+ resp.speech_event_type
294
+ == cloud_speech.StreamingRecognizeResponse.SpeechEventType.SPEECH_ACTIVITY_BEGIN
295
+ ):
296
+ self._speaking = True
297
+ start_event = stt.SpeechEvent(
298
+ type=stt.SpeechEventType.START_OF_SPEECH,
265
299
  )
266
- self._queue.task_done()
267
- except Exception as e:
268
- logging.error(f"an error occurred while streaming inputs: {e}")
300
+ self._event_queue.put_nowait(start_event)
301
+
302
+ if (
303
+ resp.speech_event_type
304
+ == cloud_speech.StreamingRecognizeResponse.SpeechEventType.SPEECH_EVENT_TYPE_UNSPECIFIED
305
+ ):
306
+ result = resp.results[0]
307
+ if not result.is_final:
308
+ # interim results
309
+ iterim_event = stt.SpeechEvent(
310
+ type=stt.SpeechEventType.INTERIM_TRANSCRIPT,
311
+ alternatives=streaming_recognize_response_to_speech_data(resp),
312
+ )
313
+ self._event_queue.put_nowait(iterim_event)
314
+
315
+ else:
316
+ final_event = stt.SpeechEvent(
317
+ type=stt.SpeechEventType.FINAL_TRANSCRIPT,
318
+ alternatives=streaming_recognize_response_to_speech_data(resp),
319
+ )
320
+ self._final_events.append(final_event)
321
+ self._event_queue.put_nowait(final_event)
322
+
323
+ if not self._speaking:
324
+ # With Google STT, we receive the final event after the END_OF_SPEECH event
325
+ sentence = ""
326
+ confidence = 0.0
327
+ for alt in self._final_events:
328
+ sentence += f"{alt.alternatives[0].text.strip()} "
329
+ confidence += alt.alternatives[0].confidence
330
+
331
+ sentence = sentence.rstrip()
332
+ confidence /= len(self._final_events) # avg. of confidence
333
+
334
+ end_event = stt.SpeechEvent(
335
+ type=stt.SpeechEventType.END_OF_SPEECH,
336
+ alternatives=[
337
+ stt.SpeechData(
338
+ language=result.language_code,
339
+ start_time=self._final_events[0]
340
+ .alternatives[0]
341
+ .start_time,
342
+ end_time=self._final_events[-1]
343
+ .alternatives[0]
344
+ .end_time,
345
+ confidence=confidence,
346
+ text=sentence,
347
+ )
348
+ ],
349
+ )
350
+
351
+ self._final_events = []
352
+ self._event_queue.put_nowait(end_event)
353
+
354
+ if (
355
+ resp.speech_event_type
356
+ == cloud_speech.StreamingRecognizeResponse.SpeechEventType.SPEECH_ACTIVITY_END
357
+ ):
358
+ self._speaking = False
269
359
 
270
360
  async def __anext__(self) -> stt.SpeechEvent:
271
- if self._closed and self._event_queue.empty():
361
+ evt = await self._event_queue.get()
362
+ if evt is None:
272
363
  raise StopAsyncIteration
273
364
 
274
- return await self._event_queue.get()
365
+ return evt
275
366
 
276
367
 
277
368
  def recognize_response_to_speech_event(
@@ -280,8 +371,7 @@ def recognize_response_to_speech_event(
280
371
  result = resp.results[0]
281
372
  gg_alts = result.alternatives
282
373
  return stt.SpeechEvent(
283
- is_final=True,
284
- end_of_speech=True,
374
+ type=stt.SpeechEventType.FINAL_TRANSCRIPT,
285
375
  alternatives=[
286
376
  stt.SpeechData(
287
377
  language=result.language_code,
@@ -295,24 +385,18 @@ def recognize_response_to_speech_event(
295
385
  )
296
386
 
297
387
 
298
- def streaming_recognize_response_to_speech_event(
388
+ def streaming_recognize_response_to_speech_data(
299
389
  resp: cloud_speech.StreamingRecognizeResponse,
300
- ) -> stt.SpeechEvent:
390
+ ) -> List[stt.SpeechData]:
301
391
  result = resp.results[0]
302
392
  gg_alts = result.alternatives
303
- return stt.SpeechEvent(
304
- is_final=result.is_final,
305
- # Google STT does not have a separate end_of_speech indicator
306
- # so we'll use is_final
307
- end_of_speech=result.is_final,
308
- alternatives=[
309
- stt.SpeechData(
310
- language=result.language_code,
311
- start_time=alt.words[0].start_offset.seconds if alt.words else 0,
312
- end_time=alt.words[-1].end_offset.seconds if alt.words else 0,
313
- confidence=alt.confidence,
314
- text=alt.transcript,
315
- )
316
- for alt in gg_alts
317
- ],
318
- )
393
+ return [
394
+ stt.SpeechData(
395
+ language=result.language_code,
396
+ start_time=alt.words[0].start_offset.seconds if alt.words else 0,
397
+ end_time=alt.words[-1].end_offset.seconds if alt.words else 0,
398
+ confidence=alt.confidence,
399
+ text=alt.transcript,
400
+ )
401
+ for alt in gg_alts
402
+ ]
@@ -12,4 +12,4 @@
12
12
  # See the License for the specific language governing permissions and
13
13
  # limitations under the License.
14
14
 
15
- __version__ = "0.2.0"
15
+ __version__ = "0.3.0"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: livekit-plugins-google
3
- Version: 0.2.0
3
+ Version: 0.3.0
4
4
  Summary: Agent Framework plugin for services from Google Cloud
5
5
  Home-page: https://github.com/livekit/agents
6
6
  License: Apache-2.0
@@ -29,8 +29,8 @@ Requires-Dist: google-cloud-speech <3,>=2
29
29
  Requires-Dist: google-cloud-texttospeech <3,>=2
30
30
  Requires-Dist: google-cloud-translate <4,>=3
31
31
  Requires-Dist: googleapis-common-protos <2,>=1
32
- Requires-Dist: livekit >=0.9.0
33
- Requires-Dist: livekit-agents >=0.3.0
32
+ Requires-Dist: livekit >=0.9.2
33
+ Requires-Dist: livekit-agents ~=0.5.dev0
34
34
 
35
35
  # LiveKit Plugins Google
36
36
 
@@ -0,0 +1,9 @@
1
+ livekit/plugins/google/__init__.py,sha256=snPMHNLrurYbLWQOkV_o6qG1CEWsOCZ8ZfPMvmh5ejY,931
2
+ livekit/plugins/google/models.py,sha256=DgiXOvGDO8D9rfCKHJL28lbyQR8mXXB2kpku-szXLRs,1185
3
+ livekit/plugins/google/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
4
+ livekit/plugins/google/stt.py,sha256=lYA8hlkxG3YSw1Q34j8hgs4us5Ij-TLBQTRwtGPN9MY,15025
5
+ livekit/plugins/google/version.py,sha256=G5iYozum4q7UpHwW43F7QfhzUfwcncPxBZ0gmUGsd5I,600
6
+ livekit_plugins_google-0.3.0.dist-info/METADATA,sha256=sPd3OZxViD0Aq1uF1qJpbsYeqLAlq8tB720JXk-_RKw,1945
7
+ livekit_plugins_google-0.3.0.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
8
+ livekit_plugins_google-0.3.0.dist-info/top_level.txt,sha256=OoDok3xUmXbZRvOrfvvXB-Juu4DX79dlq188E19YHoo,8
9
+ livekit_plugins_google-0.3.0.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: bdist_wheel (0.42.0)
2
+ Generator: bdist_wheel (0.43.0)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5
 
@@ -1,8 +0,0 @@
1
- livekit/plugins/google/__init__.py,sha256=uDkfCsfqWmuPDrDolu-nJrZxpTD53pTCaRVWmyA8a6w,918
2
- livekit/plugins/google/models.py,sha256=DgiXOvGDO8D9rfCKHJL28lbyQR8mXXB2kpku-szXLRs,1185
3
- livekit/plugins/google/stt.py,sha256=efyE7vjxWuO99dR9-nSLF9LkmoX0khOwXpayh7-5saY,11149
4
- livekit/plugins/google/version.py,sha256=cLFCdnm5S21CiJ5UJBcqfRvvFkCQ8p6M5fFUJVJkEiM,600
5
- livekit_plugins_google-0.2.0.dist-info/METADATA,sha256=8tnZ8TW_UHy87ADQvAJSGFqm42Yi-E30bvV2x1LzzBg,1942
6
- livekit_plugins_google-0.2.0.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
7
- livekit_plugins_google-0.2.0.dist-info/top_level.txt,sha256=OoDok3xUmXbZRvOrfvvXB-Juu4DX79dlq188E19YHoo,8
8
- livekit_plugins_google-0.2.0.dist-info/RECORD,,