livekit-plugins-google 0.6.0__py3-none-any.whl → 0.6.dev0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -15,13 +15,15 @@
15
15
  from __future__ import annotations
16
16
 
17
17
  import asyncio
18
+ import contextlib
18
19
  import dataclasses
19
20
  import os
20
21
  from dataclasses import dataclass
21
- from typing import AsyncIterable, List, Union
22
+ from typing import AsyncIterable, List, Optional, Union
22
23
 
23
- from livekit import agents
24
- from livekit.agents import stt, utils
24
+ from livekit import agents, rtc
25
+ from livekit.agents import stt
26
+ from livekit.agents.utils import AudioBuffer
25
27
 
26
28
  from google.cloud.speech_v2 import SpeechAsyncClient
27
29
  from google.cloud.speech_v2.types import cloud_speech
@@ -61,9 +63,7 @@ class STT(stt.STT):
61
63
  if no credentials is provided, it will use the credentials on the environment
62
64
  GOOGLE_APPLICATION_CREDENTIALS (default behavior of Google SpeechAsyncClient)
63
65
  """
64
- super().__init__(
65
- capabilities=stt.STTCapabilities(streaming=True, interim_results=True)
66
- )
66
+ super().__init__(streaming_supported=True)
67
67
 
68
68
  self._client: SpeechAsyncClient | None = None
69
69
  self._credentials_info = credentials_info
@@ -112,7 +112,11 @@ class STT(stt.STT):
112
112
  project_id = self._ensure_client().transport._credentials.project_id # type: ignore
113
113
  return f"projects/{project_id}/locations/global/recognizers/_"
114
114
 
115
- def _sanitize_options(self, *, language: str | None = None) -> STTOptions:
115
+ def _sanitize_options(
116
+ self,
117
+ *,
118
+ language: str | None = None,
119
+ ) -> STTOptions:
116
120
  config = dataclasses.replace(self._config)
117
121
 
118
122
  if language:
@@ -131,8 +135,8 @@ class STT(stt.STT):
131
135
 
132
136
  async def recognize(
133
137
  self,
134
- buffer: utils.AudioBuffer,
135
138
  *,
139
+ buffer: AudioBuffer,
136
140
  language: SpeechLanguages | str | None = None,
137
141
  ) -> stt.SpeechEvent:
138
142
  config = self._sanitize_options(language=language)
@@ -155,16 +159,24 @@ class STT(stt.STT):
155
159
 
156
160
  raw = await self._ensure_client().recognize(
157
161
  cloud_speech.RecognizeRequest(
158
- recognizer=self._recognizer, config=config, content=frame.data.tobytes()
162
+ recognizer=self._recognizer,
163
+ config=config,
164
+ content=frame.data.tobytes(),
159
165
  )
160
166
  )
161
167
  return _recognize_response_to_speech_event(raw)
162
168
 
163
169
  def stream(
164
- self, *, language: SpeechLanguages | str | None = None
170
+ self,
171
+ *,
172
+ language: SpeechLanguages | str | None = None,
165
173
  ) -> "SpeechStream":
166
174
  config = self._sanitize_options(language=language)
167
- return SpeechStream(self._ensure_client(), self._recognizer, config)
175
+ return SpeechStream(
176
+ self._ensure_client(),
177
+ self._recognizer,
178
+ config,
179
+ )
168
180
 
169
181
 
170
182
  class SpeechStream(stt.SpeechStream):
@@ -184,7 +196,15 @@ class SpeechStream(stt.SpeechStream):
184
196
  self._config = config
185
197
  self._sample_rate = sample_rate
186
198
  self._num_channels = num_channels
187
- self._max_retry = max_retry
199
+
200
+ self._queue = asyncio.Queue[Optional[rtc.AudioFrame]]()
201
+ self._event_queue = asyncio.Queue[Optional[stt.SpeechEvent]]()
202
+ self._closed = False
203
+ self._main_task = asyncio.create_task(self._run(max_retry=max_retry))
204
+
205
+ self._final_events: List[stt.SpeechEvent] = []
206
+ self._need_bos = True
207
+ self._need_eos = False
188
208
 
189
209
  self._streaming_config = cloud_speech.StreamingRecognitionConfig(
190
210
  config=cloud_speech.RecognitionConfig(
@@ -206,13 +226,30 @@ class SpeechStream(stt.SpeechStream):
206
226
  ),
207
227
  )
208
228
 
209
- @utils.log_exceptions(logger=logger)
210
- async def _main_task(self) -> None:
211
- await self._run(self._max_retry)
229
+ def log_exception(task: asyncio.Task) -> None:
230
+ if not task.cancelled() and task.exception():
231
+ logger.error(f"google stt task failed: {task.exception()}")
232
+
233
+ self._main_task.add_done_callback(log_exception)
234
+
235
+ def push_frame(self, frame: rtc.AudioFrame) -> None:
236
+ if self._closed:
237
+ raise ValueError("cannot push frame to closed stream")
238
+
239
+ self._queue.put_nowait(frame)
240
+
241
+ async def aclose(self, *, wait: bool = True) -> None:
242
+ self._closed = True
243
+ if not wait:
244
+ self._main_task.cancel()
245
+
246
+ self._queue.put_nowait(None)
247
+ with contextlib.suppress(asyncio.CancelledError):
248
+ await self._main_task
212
249
 
213
250
  async def _run(self, max_retry: int) -> None:
214
251
  retry_count = 0
215
- while not self._input_ch.closed:
252
+ while not self._closed:
216
253
  try:
217
254
  # google requires a async generator when calling streaming_recognize
218
255
  # this function basically convert the queue into a async generator
@@ -223,19 +260,19 @@ class SpeechStream(stt.SpeechStream):
223
260
  recognizer=self._recognizer,
224
261
  streaming_config=self._streaming_config,
225
262
  )
263
+ while True:
264
+ frame = await self._queue.get()
265
+ if frame is None:
266
+ break
226
267
 
227
- async for frame in self._input_ch:
228
268
  frame = frame.remix_and_resample(
229
269
  self._sample_rate, self._num_channels
230
270
  )
231
271
  yield cloud_speech.StreamingRecognizeRequest(
232
- audio=frame.data.tobytes()
272
+ audio=frame.data.tobytes(),
233
273
  )
234
-
235
- except Exception:
236
- logger.exception(
237
- "an error occurred while streaming input to google STT"
238
- )
274
+ except Exception as e:
275
+ logger.error(f"an error occurred while streaming inputs: {e}")
239
276
 
240
277
  # try to connect
241
278
  stream = await self._client.streaming_recognize(
@@ -260,6 +297,8 @@ class SpeechStream(stt.SpeechStream):
260
297
  )
261
298
  await asyncio.sleep(retry_delay)
262
299
 
300
+ self._event_queue.put_nowait(None)
301
+
263
302
  async def _run_stream(
264
303
  self, stream: AsyncIterable[cloud_speech.StreamingRecognizeResponse]
265
304
  ):
@@ -268,9 +307,11 @@ class SpeechStream(stt.SpeechStream):
268
307
  resp.speech_event_type
269
308
  == cloud_speech.StreamingRecognizeResponse.SpeechEventType.SPEECH_ACTIVITY_BEGIN
270
309
  ):
271
- self._event_ch.send_nowait(
272
- stt.SpeechEvent(type=stt.SpeechEventType.START_OF_SPEECH)
273
- )
310
+ if self._need_eos:
311
+ self._send_eos()
312
+
313
+ if self._need_bos:
314
+ self._send_bos()
274
315
 
275
316
  if (
276
317
  resp.speech_event_type
@@ -278,31 +319,96 @@ class SpeechStream(stt.SpeechStream):
278
319
  ):
279
320
  result = resp.results[0]
280
321
  if not result.is_final:
281
- self._event_ch.send_nowait(
282
- stt.SpeechEvent(
283
- type=stt.SpeechEventType.INTERIM_TRANSCRIPT,
284
- alternatives=[
285
- _streaming_recognize_response_to_speech_data(resp)
286
- ],
287
- )
322
+ iterim_event = stt.SpeechEvent(
323
+ type=stt.SpeechEventType.INTERIM_TRANSCRIPT,
324
+ alternatives=[
325
+ _streaming_recognize_response_to_speech_data(resp)
326
+ ],
288
327
  )
328
+ self._event_queue.put_nowait(iterim_event)
329
+
289
330
  else:
290
- self._event_ch.send_nowait(
291
- stt.SpeechEvent(
292
- type=stt.SpeechEventType.FINAL_TRANSCRIPT,
293
- alternatives=[
294
- _streaming_recognize_response_to_speech_data(resp)
295
- ],
296
- )
331
+ final_event = stt.SpeechEvent(
332
+ type=stt.SpeechEventType.FINAL_TRANSCRIPT,
333
+ alternatives=[
334
+ _streaming_recognize_response_to_speech_data(resp)
335
+ ],
297
336
  )
337
+ self._final_events.append(final_event)
338
+ self._event_queue.put_nowait(final_event)
339
+
340
+ if self._need_eos:
341
+ self._send_eos()
298
342
 
299
343
  if (
300
344
  resp.speech_event_type
301
345
  == cloud_speech.StreamingRecognizeResponse.SpeechEventType.SPEECH_ACTIVITY_END
302
346
  ):
303
- self._event_ch.send_nowait(
304
- stt.SpeechEvent(type=stt.SpeechEventType.END_OF_SPEECH)
305
- )
347
+ self._need_eos = True
348
+
349
+ if not self._need_bos:
350
+ self._send_eos()
351
+
352
+ def _send_bos(self) -> None:
353
+ self._need_bos = False
354
+ start_event = stt.SpeechEvent(
355
+ type=stt.SpeechEventType.START_OF_SPEECH,
356
+ )
357
+ self._event_queue.put_nowait(start_event)
358
+
359
+ def _send_eos(self) -> None:
360
+ self._need_eos = False
361
+ self._need_bos = True
362
+
363
+ if self._final_events:
364
+ lg = self._final_events[0].alternatives[0].language
365
+
366
+ sentence = ""
367
+ confidence = 0.0
368
+ for alt in self._final_events:
369
+ sentence += f"{alt.alternatives[0].text.strip()} "
370
+ confidence += alt.alternatives[0].confidence
371
+
372
+ sentence = sentence.rstrip()
373
+ confidence /= len(self._final_events) # avg. of confidence
374
+
375
+ end_event = stt.SpeechEvent(
376
+ type=stt.SpeechEventType.END_OF_SPEECH,
377
+ alternatives=[
378
+ stt.SpeechData(
379
+ language=lg,
380
+ start_time=self._final_events[0].alternatives[0].start_time,
381
+ end_time=self._final_events[-1].alternatives[0].end_time,
382
+ confidence=confidence,
383
+ text=sentence,
384
+ )
385
+ ],
386
+ )
387
+
388
+ self._final_events = []
389
+ self._event_queue.put_nowait(end_event)
390
+ else:
391
+ end_event = stt.SpeechEvent(
392
+ type=stt.SpeechEventType.END_OF_SPEECH,
393
+ alternatives=[
394
+ stt.SpeechData(
395
+ language="",
396
+ start_time=0,
397
+ end_time=0,
398
+ confidence=0,
399
+ text="",
400
+ )
401
+ ],
402
+ )
403
+
404
+ self._event_queue.put_nowait(end_event)
405
+
406
+ async def __anext__(self) -> stt.SpeechEvent:
407
+ evt = await self._event_queue.get()
408
+ if evt is None:
409
+ raise StopAsyncIteration
410
+
411
+ return evt
306
412
 
307
413
 
308
414
  def _recognize_response_to_speech_event(
@@ -347,7 +453,11 @@ def _streaming_recognize_response_to_speech_data(
347
453
  lg = resp.results[0].language_code
348
454
 
349
455
  data = stt.SpeechData(
350
- language=lg, start_time=0, end_time=0, confidence=confidence, text=text
456
+ language=lg,
457
+ start_time=0,
458
+ end_time=0,
459
+ confidence=confidence,
460
+ text=text,
351
461
  )
352
462
 
353
463
  return data
@@ -14,14 +14,19 @@
14
14
 
15
15
  from __future__ import annotations
16
16
 
17
+ import asyncio
18
+ import contextlib
17
19
  from dataclasses import dataclass
18
- from typing import Union
20
+ from typing import Optional, Union
19
21
 
20
22
  from livekit import rtc
21
- from livekit.agents import tts, utils
23
+ from livekit.agents import codecs, tts
22
24
 
23
25
  from google.cloud import texttospeech
24
- from google.cloud.texttospeech_v1.types import SsmlVoiceGender, SynthesizeSpeechResponse
26
+ from google.cloud.texttospeech_v1.types import (
27
+ SsmlVoiceGender,
28
+ SynthesizeSpeechResponse,
29
+ )
25
30
 
26
31
  from .log import logger
27
32
  from .models import AudioEncoding, Gender, SpeechLanguages
@@ -55,11 +60,7 @@ class TTS(tts.TTS):
55
60
  GOOGLE_APPLICATION_CREDENTIALS (default behavior of Google TextToSpeechAsyncClient)
56
61
  """
57
62
  super().__init__(
58
- capabilities=tts.TTSCapabilities(
59
- streaming=True,
60
- ),
61
- sample_rate=sample_rate,
62
- num_channels=1,
63
+ streaming_supported=False, sample_rate=sample_rate, num_channels=1
63
64
  )
64
65
 
65
66
  self._client: texttospeech.TextToSpeechAsyncClient | None = None
@@ -73,7 +74,9 @@ class TTS(tts.TTS):
73
74
  ssml_gender = SsmlVoiceGender.FEMALE
74
75
 
75
76
  voice = texttospeech.VoiceSelectionParams(
76
- name=voice_name, language_code=language, ssml_gender=ssml_gender
77
+ name=voice_name,
78
+ language_code=language,
79
+ ssml_gender=ssml_gender,
77
80
  )
78
81
 
79
82
  if encoding == "linear16" or encoding == "wav":
@@ -113,7 +116,10 @@ class TTS(tts.TTS):
113
116
  assert self._client is not None
114
117
  return self._client
115
118
 
116
- def synthesize(self, text: str) -> "ChunkedStream":
119
+ def synthesize(
120
+ self,
121
+ text: str,
122
+ ) -> "ChunkedStream":
117
123
  return ChunkedStream(text, self._opts, self._ensure_client())
118
124
 
119
125
 
@@ -121,38 +127,60 @@ class ChunkedStream(tts.ChunkedStream):
121
127
  def __init__(
122
128
  self, text: str, opts: _TTSOptions, client: texttospeech.TextToSpeechAsyncClient
123
129
  ) -> None:
124
- super().__init__()
125
- self._text, self._opts, self._client = text, opts, client
126
-
127
- @utils.log_exceptions(logger=logger)
128
- async def _main_task(self) -> None:
129
- request_id = utils.shortuuid()
130
- segment_id = utils.shortuuid()
131
- response: SynthesizeSpeechResponse = await self._client.synthesize_speech(
132
- input=texttospeech.SynthesisInput(text=self._text),
133
- voice=self._opts.voice,
134
- audio_config=self._opts.audio_config,
135
- )
130
+ self._text = text
131
+ self._opts = opts
132
+ self._client = client
133
+ self._main_task: asyncio.Task | None = None
134
+ self._queue = asyncio.Queue[Optional[tts.SynthesizedAudio]]()
135
+
136
+ async def _run(self) -> None:
137
+ try:
138
+ response: SynthesizeSpeechResponse = await self._client.synthesize_speech(
139
+ input=texttospeech.SynthesisInput(text=self._text),
140
+ voice=self._opts.voice,
141
+ audio_config=self._opts.audio_config,
142
+ )
136
143
 
137
- data = response.audio_content
138
- if self._opts.audio_config.audio_encoding == "mp3":
139
- decoder = utils.codecs.Mp3StreamDecoder()
140
- for frame in decoder.decode_chunk(data):
141
- self._event_ch.send_nowait(
144
+ data = response.audio_content
145
+ if self._opts.audio_config.audio_encoding == "mp3":
146
+ decoder = codecs.Mp3StreamDecoder()
147
+ frames = decoder.decode_chunk(data)
148
+ for frame in frames:
149
+ self._queue.put_nowait(
150
+ tts.SynthesizedAudio(text=self._text, data=frame)
151
+ )
152
+ else:
153
+ self._queue.put_nowait(
142
154
  tts.SynthesizedAudio(
143
- request_id=request_id, segment_id=segment_id, frame=frame
155
+ text="",
156
+ data=rtc.AudioFrame(
157
+ data=data,
158
+ sample_rate=self._opts.audio_config.sample_rate_hertz,
159
+ num_channels=1,
160
+ samples_per_channel=len(data) // 2, # 16-bit
161
+ ),
144
162
  )
145
163
  )
146
- else:
147
- self._event_ch.send_nowait(
148
- tts.SynthesizedAudio(
149
- request_id=request_id,
150
- segment_id=segment_id,
151
- frame=rtc.AudioFrame(
152
- data=data,
153
- sample_rate=self._opts.audio_config.sample_rate_hertz,
154
- num_channels=1,
155
- samples_per_channel=len(data) // 2, # 16-bit
156
- ),
157
- )
158
- )
164
+
165
+ except Exception:
166
+ logger.exception("failed to synthesize")
167
+ finally:
168
+ self._queue.put_nowait(None)
169
+
170
+ async def __anext__(self) -> tts.SynthesizedAudio:
171
+ if not self._main_task:
172
+ self._main_task = asyncio.create_task(self._run())
173
+
174
+ frame = await self._queue.get()
175
+ if frame is None:
176
+ raise StopAsyncIteration
177
+
178
+ return frame
179
+
180
+ async def aclose(self) -> None:
181
+ if not self._main_task:
182
+ return
183
+
184
+ self._main_task.cancel()
185
+ with contextlib.suppress(asyncio.CancelledError):
186
+ await self._main_task
@@ -12,4 +12,4 @@
12
12
  # See the License for the specific language governing permissions and
13
13
  # limitations under the License.
14
14
 
15
- __version__ = "0.6.0"
15
+ __version__ = "0.6.dev0"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: livekit-plugins-google
3
- Version: 0.6.0
3
+ Version: 0.6.dev0
4
4
  Summary: Agent Framework plugin for services from Google Cloud
5
5
  Home-page: https://github.com/livekit/agents
6
6
  License: Apache-2.0
@@ -14,14 +14,23 @@ Classifier: Topic :: Multimedia :: Sound/Audio
14
14
  Classifier: Topic :: Multimedia :: Video
15
15
  Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
16
16
  Classifier: Programming Language :: Python :: 3
17
+ Classifier: Programming Language :: Python :: 3.7
18
+ Classifier: Programming Language :: Python :: 3.8
17
19
  Classifier: Programming Language :: Python :: 3.9
18
20
  Classifier: Programming Language :: Python :: 3.10
19
21
  Classifier: Programming Language :: Python :: 3 :: Only
20
- Requires-Python: >=3.9.0
22
+ Requires-Python: >=3.7.0
21
23
  Description-Content-Type: text/markdown
24
+ Requires-Dist: numpy <2,>=1
25
+ Requires-Dist: google-api-core <3,>=2
26
+ Requires-Dist: google-auth <3,>=2
27
+ Requires-Dist: google-cloud-core <3,>=2
22
28
  Requires-Dist: google-cloud-speech <3,>=2
23
29
  Requires-Dist: google-cloud-texttospeech <3,>=2
24
- Requires-Dist: livekit-agents >=0.8.0.dev0
30
+ Requires-Dist: google-cloud-translate <4,>=3
31
+ Requires-Dist: googleapis-common-protos <2,>=1
32
+ Requires-Dist: livekit ~=0.11
33
+ Requires-Dist: livekit-agents ~=0.8.dev0
25
34
 
26
35
  # LiveKit Plugins Google
27
36
 
@@ -0,0 +1,11 @@
1
+ livekit/plugins/google/__init__.py,sha256=DlQC5cosMFyQlM8_vFvJGoZiziFkd0Sa4mutnsxXyZM,959
2
+ livekit/plugins/google/log.py,sha256=GI3YWN5YzrafnUccljzPRS_ZALkMNk1i21IRnTl2vNA,69
3
+ livekit/plugins/google/models.py,sha256=n8pgTJ7xyJpPCZJ_y0GzaQq6LqYknL6K6trpi07-AxM,1307
4
+ livekit/plugins/google/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
5
+ livekit/plugins/google/stt.py,sha256=GfWita3mgLZG2KpS9WYMCL8jwCNN5qukicpI58zPCcY,16058
6
+ livekit/plugins/google/tts.py,sha256=J3V5aDUz0V2_Dfs16pobDVx7XwQqU1AEM8TWXdaDn9w,6182
7
+ livekit/plugins/google/version.py,sha256=yB6WnbnD5MFhQDT5ItJ02XWVsNanlDYiOezzwv0IdcM,603
8
+ livekit_plugins_google-0.6.dev0.dist-info/METADATA,sha256=azeNkX6imQv83LarBM4dZedsNBmaeDG0ESFS8-Q-S0E,1947
9
+ livekit_plugins_google-0.6.dev0.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
10
+ livekit_plugins_google-0.6.dev0.dist-info/top_level.txt,sha256=OoDok3xUmXbZRvOrfvvXB-Juu4DX79dlq188E19YHoo,8
11
+ livekit_plugins_google-0.6.dev0.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: setuptools (71.1.0)
2
+ Generator: bdist_wheel (0.43.0)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5
 
@@ -1,11 +0,0 @@
1
- livekit/plugins/google/__init__.py,sha256=DlQC5cosMFyQlM8_vFvJGoZiziFkd0Sa4mutnsxXyZM,959
2
- livekit/plugins/google/log.py,sha256=GI3YWN5YzrafnUccljzPRS_ZALkMNk1i21IRnTl2vNA,69
3
- livekit/plugins/google/models.py,sha256=n8pgTJ7xyJpPCZJ_y0GzaQq6LqYknL6K6trpi07-AxM,1307
4
- livekit/plugins/google/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
5
- livekit/plugins/google/stt.py,sha256=bqXaoi5trER7PE45axfEpHwReElmf7yl38RpK1iJsdc,12849
6
- livekit/plugins/google/tts.py,sha256=KUw826CK3yt5meGVj0TKkueQ8o_gaXbc1Rtvdv2yF5M,5548
7
- livekit/plugins/google/version.py,sha256=Z62pORgDetwUvtfZOgPeIzXJugcrpDAOzC876rjCR0o,600
8
- livekit_plugins_google-0.6.0.dist-info/METADATA,sha256=Gb5O82GO4CpSvNHeYs4kD2K-neRklRGXaEQwOSQ8SpM,1584
9
- livekit_plugins_google-0.6.0.dist-info/WHEEL,sha256=Wyh-_nZ0DJYolHNn1_hMa4lM7uDedD_RGVwbmTjyItk,91
10
- livekit_plugins_google-0.6.0.dist-info/top_level.txt,sha256=OoDok3xUmXbZRvOrfvvXB-Juu4DX79dlq188E19YHoo,8
11
- livekit_plugins_google-0.6.0.dist-info/RECORD,,