livekit-plugins-google 0.6.dev0__py3-none-any.whl → 0.6.0.dev1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -15,15 +15,13 @@
15
15
  from __future__ import annotations
16
16
 
17
17
  import asyncio
18
- import contextlib
19
18
  import dataclasses
20
19
  import os
21
20
  from dataclasses import dataclass
22
- from typing import AsyncIterable, List, Optional, Union
21
+ from typing import AsyncIterable, List, Union
23
22
 
24
- from livekit import agents, rtc
25
- from livekit.agents import stt
26
- from livekit.agents.utils import AudioBuffer
23
+ from livekit import agents
24
+ from livekit.agents import stt, utils
27
25
 
28
26
  from google.cloud.speech_v2 import SpeechAsyncClient
29
27
  from google.cloud.speech_v2.types import cloud_speech
@@ -63,7 +61,9 @@ class STT(stt.STT):
63
61
  if no credentials is provided, it will use the credentials on the environment
64
62
  GOOGLE_APPLICATION_CREDENTIALS (default behavior of Google SpeechAsyncClient)
65
63
  """
66
- super().__init__(streaming_supported=True)
64
+ super().__init__(
65
+ capabilities=stt.STTCapabilities(streaming=True, interim_results=True)
66
+ )
67
67
 
68
68
  self._client: SpeechAsyncClient | None = None
69
69
  self._credentials_info = credentials_info
@@ -112,11 +112,7 @@ class STT(stt.STT):
112
112
  project_id = self._ensure_client().transport._credentials.project_id # type: ignore
113
113
  return f"projects/{project_id}/locations/global/recognizers/_"
114
114
 
115
- def _sanitize_options(
116
- self,
117
- *,
118
- language: str | None = None,
119
- ) -> STTOptions:
115
+ def _sanitize_options(self, *, language: str | None = None) -> STTOptions:
120
116
  config = dataclasses.replace(self._config)
121
117
 
122
118
  if language:
@@ -136,7 +132,7 @@ class STT(stt.STT):
136
132
  async def recognize(
137
133
  self,
138
134
  *,
139
- buffer: AudioBuffer,
135
+ buffer: utils.AudioBuffer,
140
136
  language: SpeechLanguages | str | None = None,
141
137
  ) -> stt.SpeechEvent:
142
138
  config = self._sanitize_options(language=language)
@@ -159,24 +155,16 @@ class STT(stt.STT):
159
155
 
160
156
  raw = await self._ensure_client().recognize(
161
157
  cloud_speech.RecognizeRequest(
162
- recognizer=self._recognizer,
163
- config=config,
164
- content=frame.data.tobytes(),
158
+ recognizer=self._recognizer, config=config, content=frame.data.tobytes()
165
159
  )
166
160
  )
167
161
  return _recognize_response_to_speech_event(raw)
168
162
 
169
163
  def stream(
170
- self,
171
- *,
172
- language: SpeechLanguages | str | None = None,
164
+ self, *, language: SpeechLanguages | str | None = None
173
165
  ) -> "SpeechStream":
174
166
  config = self._sanitize_options(language=language)
175
- return SpeechStream(
176
- self._ensure_client(),
177
- self._recognizer,
178
- config,
179
- )
167
+ return SpeechStream(self._ensure_client(), self._recognizer, config)
180
168
 
181
169
 
182
170
  class SpeechStream(stt.SpeechStream):
@@ -196,15 +184,7 @@ class SpeechStream(stt.SpeechStream):
196
184
  self._config = config
197
185
  self._sample_rate = sample_rate
198
186
  self._num_channels = num_channels
199
-
200
- self._queue = asyncio.Queue[Optional[rtc.AudioFrame]]()
201
- self._event_queue = asyncio.Queue[Optional[stt.SpeechEvent]]()
202
- self._closed = False
203
- self._main_task = asyncio.create_task(self._run(max_retry=max_retry))
204
-
205
- self._final_events: List[stt.SpeechEvent] = []
206
- self._need_bos = True
207
- self._need_eos = False
187
+ self._max_retry = max_retry
208
188
 
209
189
  self._streaming_config = cloud_speech.StreamingRecognitionConfig(
210
190
  config=cloud_speech.RecognitionConfig(
@@ -226,30 +206,13 @@ class SpeechStream(stt.SpeechStream):
226
206
  ),
227
207
  )
228
208
 
229
- def log_exception(task: asyncio.Task) -> None:
230
- if not task.cancelled() and task.exception():
231
- logger.error(f"google stt task failed: {task.exception()}")
232
-
233
- self._main_task.add_done_callback(log_exception)
234
-
235
- def push_frame(self, frame: rtc.AudioFrame) -> None:
236
- if self._closed:
237
- raise ValueError("cannot push frame to closed stream")
238
-
239
- self._queue.put_nowait(frame)
240
-
241
- async def aclose(self, *, wait: bool = True) -> None:
242
- self._closed = True
243
- if not wait:
244
- self._main_task.cancel()
245
-
246
- self._queue.put_nowait(None)
247
- with contextlib.suppress(asyncio.CancelledError):
248
- await self._main_task
209
+ @utils.log_exceptions(logger=logger)
210
+ async def _main_task(self) -> None:
211
+ await self._run(self._max_retry)
249
212
 
250
213
  async def _run(self, max_retry: int) -> None:
251
214
  retry_count = 0
252
- while not self._closed:
215
+ while not self._input_ch.closed:
253
216
  try:
254
217
  # google requires a async generator when calling streaming_recognize
255
218
  # this function basically convert the queue into a async generator
@@ -260,19 +223,19 @@ class SpeechStream(stt.SpeechStream):
260
223
  recognizer=self._recognizer,
261
224
  streaming_config=self._streaming_config,
262
225
  )
263
- while True:
264
- frame = await self._queue.get()
265
- if frame is None:
266
- break
267
226
 
227
+ async for frame in self._input_ch:
268
228
  frame = frame.remix_and_resample(
269
229
  self._sample_rate, self._num_channels
270
230
  )
271
231
  yield cloud_speech.StreamingRecognizeRequest(
272
- audio=frame.data.tobytes(),
232
+ audio=frame.data.tobytes()
273
233
  )
274
- except Exception as e:
275
- logger.error(f"an error occurred while streaming inputs: {e}")
234
+
235
+ except Exception:
236
+ logger.exception(
237
+ "an error occurred while streaming input to google STT"
238
+ )
276
239
 
277
240
  # try to connect
278
241
  stream = await self._client.streaming_recognize(
@@ -297,8 +260,6 @@ class SpeechStream(stt.SpeechStream):
297
260
  )
298
261
  await asyncio.sleep(retry_delay)
299
262
 
300
- self._event_queue.put_nowait(None)
301
-
302
263
  async def _run_stream(
303
264
  self, stream: AsyncIterable[cloud_speech.StreamingRecognizeResponse]
304
265
  ):
@@ -307,11 +268,9 @@ class SpeechStream(stt.SpeechStream):
307
268
  resp.speech_event_type
308
269
  == cloud_speech.StreamingRecognizeResponse.SpeechEventType.SPEECH_ACTIVITY_BEGIN
309
270
  ):
310
- if self._need_eos:
311
- self._send_eos()
312
-
313
- if self._need_bos:
314
- self._send_bos()
271
+ self._event_ch.send_nowait(
272
+ stt.SpeechEvent(type=stt.SpeechEventType.START_OF_SPEECH)
273
+ )
315
274
 
316
275
  if (
317
276
  resp.speech_event_type
@@ -319,96 +278,31 @@ class SpeechStream(stt.SpeechStream):
319
278
  ):
320
279
  result = resp.results[0]
321
280
  if not result.is_final:
322
- iterim_event = stt.SpeechEvent(
323
- type=stt.SpeechEventType.INTERIM_TRANSCRIPT,
324
- alternatives=[
325
- _streaming_recognize_response_to_speech_data(resp)
326
- ],
281
+ self._event_ch.send_nowait(
282
+ stt.SpeechEvent(
283
+ type=stt.SpeechEventType.INTERIM_TRANSCRIPT,
284
+ alternatives=[
285
+ _streaming_recognize_response_to_speech_data(resp)
286
+ ],
287
+ )
327
288
  )
328
- self._event_queue.put_nowait(iterim_event)
329
-
330
289
  else:
331
- final_event = stt.SpeechEvent(
332
- type=stt.SpeechEventType.FINAL_TRANSCRIPT,
333
- alternatives=[
334
- _streaming_recognize_response_to_speech_data(resp)
335
- ],
290
+ self._event_ch.send_nowait(
291
+ stt.SpeechEvent(
292
+ type=stt.SpeechEventType.FINAL_TRANSCRIPT,
293
+ alternatives=[
294
+ _streaming_recognize_response_to_speech_data(resp)
295
+ ],
296
+ )
336
297
  )
337
- self._final_events.append(final_event)
338
- self._event_queue.put_nowait(final_event)
339
-
340
- if self._need_eos:
341
- self._send_eos()
342
298
 
343
299
  if (
344
300
  resp.speech_event_type
345
301
  == cloud_speech.StreamingRecognizeResponse.SpeechEventType.SPEECH_ACTIVITY_END
346
302
  ):
347
- self._need_eos = True
348
-
349
- if not self._need_bos:
350
- self._send_eos()
351
-
352
- def _send_bos(self) -> None:
353
- self._need_bos = False
354
- start_event = stt.SpeechEvent(
355
- type=stt.SpeechEventType.START_OF_SPEECH,
356
- )
357
- self._event_queue.put_nowait(start_event)
358
-
359
- def _send_eos(self) -> None:
360
- self._need_eos = False
361
- self._need_bos = True
362
-
363
- if self._final_events:
364
- lg = self._final_events[0].alternatives[0].language
365
-
366
- sentence = ""
367
- confidence = 0.0
368
- for alt in self._final_events:
369
- sentence += f"{alt.alternatives[0].text.strip()} "
370
- confidence += alt.alternatives[0].confidence
371
-
372
- sentence = sentence.rstrip()
373
- confidence /= len(self._final_events) # avg. of confidence
374
-
375
- end_event = stt.SpeechEvent(
376
- type=stt.SpeechEventType.END_OF_SPEECH,
377
- alternatives=[
378
- stt.SpeechData(
379
- language=lg,
380
- start_time=self._final_events[0].alternatives[0].start_time,
381
- end_time=self._final_events[-1].alternatives[0].end_time,
382
- confidence=confidence,
383
- text=sentence,
384
- )
385
- ],
386
- )
387
-
388
- self._final_events = []
389
- self._event_queue.put_nowait(end_event)
390
- else:
391
- end_event = stt.SpeechEvent(
392
- type=stt.SpeechEventType.END_OF_SPEECH,
393
- alternatives=[
394
- stt.SpeechData(
395
- language="",
396
- start_time=0,
397
- end_time=0,
398
- confidence=0,
399
- text="",
400
- )
401
- ],
402
- )
403
-
404
- self._event_queue.put_nowait(end_event)
405
-
406
- async def __anext__(self) -> stt.SpeechEvent:
407
- evt = await self._event_queue.get()
408
- if evt is None:
409
- raise StopAsyncIteration
410
-
411
- return evt
303
+ self._event_ch.send_nowait(
304
+ stt.SpeechEvent(type=stt.SpeechEventType.END_OF_SPEECH)
305
+ )
412
306
 
413
307
 
414
308
  def _recognize_response_to_speech_event(
@@ -453,11 +347,7 @@ def _streaming_recognize_response_to_speech_data(
453
347
  lg = resp.results[0].language_code
454
348
 
455
349
  data = stt.SpeechData(
456
- language=lg,
457
- start_time=0,
458
- end_time=0,
459
- confidence=confidence,
460
- text=text,
350
+ language=lg, start_time=0, end_time=0, confidence=confidence, text=text
461
351
  )
462
352
 
463
353
  return data
@@ -14,19 +14,14 @@
14
14
 
15
15
  from __future__ import annotations
16
16
 
17
- import asyncio
18
- import contextlib
19
17
  from dataclasses import dataclass
20
- from typing import Optional, Union
18
+ from typing import Union
21
19
 
22
20
  from livekit import rtc
23
- from livekit.agents import codecs, tts
21
+ from livekit.agents import tts, utils
24
22
 
25
23
  from google.cloud import texttospeech
26
- from google.cloud.texttospeech_v1.types import (
27
- SsmlVoiceGender,
28
- SynthesizeSpeechResponse,
29
- )
24
+ from google.cloud.texttospeech_v1.types import SsmlVoiceGender, SynthesizeSpeechResponse
30
25
 
31
26
  from .log import logger
32
27
  from .models import AudioEncoding, Gender, SpeechLanguages
@@ -60,7 +55,11 @@ class TTS(tts.TTS):
60
55
  GOOGLE_APPLICATION_CREDENTIALS (default behavior of Google TextToSpeechAsyncClient)
61
56
  """
62
57
  super().__init__(
63
- streaming_supported=False, sample_rate=sample_rate, num_channels=1
58
+ capabilities=tts.TTSCapabilities(
59
+ streaming=True,
60
+ ),
61
+ sample_rate=sample_rate,
62
+ num_channels=1,
64
63
  )
65
64
 
66
65
  self._client: texttospeech.TextToSpeechAsyncClient | None = None
@@ -74,9 +73,7 @@ class TTS(tts.TTS):
74
73
  ssml_gender = SsmlVoiceGender.FEMALE
75
74
 
76
75
  voice = texttospeech.VoiceSelectionParams(
77
- name=voice_name,
78
- language_code=language,
79
- ssml_gender=ssml_gender,
76
+ name=voice_name, language_code=language, ssml_gender=ssml_gender
80
77
  )
81
78
 
82
79
  if encoding == "linear16" or encoding == "wav":
@@ -116,10 +113,7 @@ class TTS(tts.TTS):
116
113
  assert self._client is not None
117
114
  return self._client
118
115
 
119
- def synthesize(
120
- self,
121
- text: str,
122
- ) -> "ChunkedStream":
116
+ def synthesize(self, text: str) -> "ChunkedStream":
123
117
  return ChunkedStream(text, self._opts, self._ensure_client())
124
118
 
125
119
 
@@ -127,60 +121,38 @@ class ChunkedStream(tts.ChunkedStream):
127
121
  def __init__(
128
122
  self, text: str, opts: _TTSOptions, client: texttospeech.TextToSpeechAsyncClient
129
123
  ) -> None:
130
- self._text = text
131
- self._opts = opts
132
- self._client = client
133
- self._main_task: asyncio.Task | None = None
134
- self._queue = asyncio.Queue[Optional[tts.SynthesizedAudio]]()
135
-
136
- async def _run(self) -> None:
137
- try:
138
- response: SynthesizeSpeechResponse = await self._client.synthesize_speech(
139
- input=texttospeech.SynthesisInput(text=self._text),
140
- voice=self._opts.voice,
141
- audio_config=self._opts.audio_config,
142
- )
124
+ super().__init__()
125
+ self._text, self._opts, self._client = text, opts, client
126
+
127
+ @utils.log_exceptions(logger=logger)
128
+ async def _main_task(self) -> None:
129
+ request_id = utils.shortuuid()
130
+ segment_id = utils.shortuuid()
131
+ response: SynthesizeSpeechResponse = await self._client.synthesize_speech(
132
+ input=texttospeech.SynthesisInput(text=self._text),
133
+ voice=self._opts.voice,
134
+ audio_config=self._opts.audio_config,
135
+ )
143
136
 
144
- data = response.audio_content
145
- if self._opts.audio_config.audio_encoding == "mp3":
146
- decoder = codecs.Mp3StreamDecoder()
147
- frames = decoder.decode_chunk(data)
148
- for frame in frames:
149
- self._queue.put_nowait(
150
- tts.SynthesizedAudio(text=self._text, data=frame)
151
- )
152
- else:
153
- self._queue.put_nowait(
137
+ data = response.audio_content
138
+ if self._opts.audio_config.audio_encoding == "mp3":
139
+ decoder = utils.codecs.Mp3StreamDecoder()
140
+ for frame in decoder.decode_chunk(data):
141
+ self._event_ch.send_nowait(
154
142
  tts.SynthesizedAudio(
155
- text="",
156
- data=rtc.AudioFrame(
157
- data=data,
158
- sample_rate=self._opts.audio_config.sample_rate_hertz,
159
- num_channels=1,
160
- samples_per_channel=len(data) // 2, # 16-bit
161
- ),
143
+ request_id=request_id, segment_id=segment_id, frame=frame
162
144
  )
163
145
  )
164
-
165
- except Exception:
166
- logger.exception("failed to synthesize")
167
- finally:
168
- self._queue.put_nowait(None)
169
-
170
- async def __anext__(self) -> tts.SynthesizedAudio:
171
- if not self._main_task:
172
- self._main_task = asyncio.create_task(self._run())
173
-
174
- frame = await self._queue.get()
175
- if frame is None:
176
- raise StopAsyncIteration
177
-
178
- return frame
179
-
180
- async def aclose(self) -> None:
181
- if not self._main_task:
182
- return
183
-
184
- self._main_task.cancel()
185
- with contextlib.suppress(asyncio.CancelledError):
186
- await self._main_task
146
+ else:
147
+ self._event_ch.send_nowait(
148
+ tts.SynthesizedAudio(
149
+ request_id=request_id,
150
+ segment_id=segment_id,
151
+ frame=rtc.AudioFrame(
152
+ data=data,
153
+ sample_rate=self._opts.audio_config.sample_rate_hertz,
154
+ num_channels=1,
155
+ samples_per_channel=len(data) // 2, # 16-bit
156
+ ),
157
+ )
158
+ )
@@ -12,4 +12,4 @@
12
12
  # See the License for the specific language governing permissions and
13
13
  # limitations under the License.
14
14
 
15
- __version__ = "0.6.dev0"
15
+ __version__ = "0.6.0-dev.1"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: livekit-plugins-google
3
- Version: 0.6.dev0
3
+ Version: 0.6.0.dev1
4
4
  Summary: Agent Framework plugin for services from Google Cloud
5
5
  Home-page: https://github.com/livekit/agents
6
6
  License: Apache-2.0
@@ -14,23 +14,14 @@ Classifier: Topic :: Multimedia :: Sound/Audio
14
14
  Classifier: Topic :: Multimedia :: Video
15
15
  Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
16
16
  Classifier: Programming Language :: Python :: 3
17
- Classifier: Programming Language :: Python :: 3.7
18
- Classifier: Programming Language :: Python :: 3.8
19
17
  Classifier: Programming Language :: Python :: 3.9
20
18
  Classifier: Programming Language :: Python :: 3.10
21
19
  Classifier: Programming Language :: Python :: 3 :: Only
22
- Requires-Python: >=3.7.0
20
+ Requires-Python: >=3.9.0
23
21
  Description-Content-Type: text/markdown
24
- Requires-Dist: numpy <2,>=1
25
- Requires-Dist: google-api-core <3,>=2
26
- Requires-Dist: google-auth <3,>=2
27
- Requires-Dist: google-cloud-core <3,>=2
28
22
  Requires-Dist: google-cloud-speech <3,>=2
29
23
  Requires-Dist: google-cloud-texttospeech <3,>=2
30
- Requires-Dist: google-cloud-translate <4,>=3
31
- Requires-Dist: googleapis-common-protos <2,>=1
32
- Requires-Dist: livekit ~=0.11
33
- Requires-Dist: livekit-agents ~=0.8.dev0
24
+ Requires-Dist: livekit-agents ~=0.7
34
25
 
35
26
  # LiveKit Plugins Google
36
27
 
@@ -0,0 +1,11 @@
1
+ livekit/plugins/google/__init__.py,sha256=DlQC5cosMFyQlM8_vFvJGoZiziFkd0Sa4mutnsxXyZM,959
2
+ livekit/plugins/google/log.py,sha256=GI3YWN5YzrafnUccljzPRS_ZALkMNk1i21IRnTl2vNA,69
3
+ livekit/plugins/google/models.py,sha256=n8pgTJ7xyJpPCZJ_y0GzaQq6LqYknL6K6trpi07-AxM,1307
4
+ livekit/plugins/google/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
5
+ livekit/plugins/google/stt.py,sha256=WW6pOduUCS6pfA3smgA1R6AhxbJalZfTjBr8YY3bihU,12849
6
+ livekit/plugins/google/tts.py,sha256=KUw826CK3yt5meGVj0TKkueQ8o_gaXbc1Rtvdv2yF5M,5548
7
+ livekit/plugins/google/version.py,sha256=j3miHUi9rEKsns_jxSp6UXRQsGfGFJHftBOqTdsFPZc,606
8
+ livekit_plugins_google-0.6.0.dev1.dist-info/METADATA,sha256=sklyeVK32JzAYgz4OhncrDva5p7rwDKyWS059YQQxaE,1582
9
+ livekit_plugins_google-0.6.0.dev1.dist-info/WHEEL,sha256=Wyh-_nZ0DJYolHNn1_hMa4lM7uDedD_RGVwbmTjyItk,91
10
+ livekit_plugins_google-0.6.0.dev1.dist-info/top_level.txt,sha256=OoDok3xUmXbZRvOrfvvXB-Juu4DX79dlq188E19YHoo,8
11
+ livekit_plugins_google-0.6.0.dev1.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: bdist_wheel (0.43.0)
2
+ Generator: setuptools (71.1.0)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5
 
@@ -1,11 +0,0 @@
1
- livekit/plugins/google/__init__.py,sha256=DlQC5cosMFyQlM8_vFvJGoZiziFkd0Sa4mutnsxXyZM,959
2
- livekit/plugins/google/log.py,sha256=GI3YWN5YzrafnUccljzPRS_ZALkMNk1i21IRnTl2vNA,69
3
- livekit/plugins/google/models.py,sha256=n8pgTJ7xyJpPCZJ_y0GzaQq6LqYknL6K6trpi07-AxM,1307
4
- livekit/plugins/google/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
5
- livekit/plugins/google/stt.py,sha256=GfWita3mgLZG2KpS9WYMCL8jwCNN5qukicpI58zPCcY,16058
6
- livekit/plugins/google/tts.py,sha256=J3V5aDUz0V2_Dfs16pobDVx7XwQqU1AEM8TWXdaDn9w,6182
7
- livekit/plugins/google/version.py,sha256=yB6WnbnD5MFhQDT5ItJ02XWVsNanlDYiOezzwv0IdcM,603
8
- livekit_plugins_google-0.6.dev0.dist-info/METADATA,sha256=azeNkX6imQv83LarBM4dZedsNBmaeDG0ESFS8-Q-S0E,1947
9
- livekit_plugins_google-0.6.dev0.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
10
- livekit_plugins_google-0.6.dev0.dist-info/top_level.txt,sha256=OoDok3xUmXbZRvOrfvvXB-Juu4DX79dlq188E19YHoo,8
11
- livekit_plugins_google-0.6.dev0.dist-info/RECORD,,