livekit-plugins-google 0.4.dev0__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -13,9 +13,10 @@
13
13
  # limitations under the License.
14
14
 
15
15
  from .stt import STT, SpeechStream
16
+ from .tts import TTS
16
17
  from .version import __version__
17
18
 
18
- __all__ = ["STT", "SpeechStream", "__version__"]
19
+ __all__ = ["STT", "TTS", "SpeechStream", "__version__"]
19
20
 
20
21
  from livekit.agents import Plugin
21
22
 
@@ -83,3 +83,7 @@ SpeechLanguages = Literal[
83
83
  "vi-VN",
84
84
  "da-DK",
85
85
  ]
86
+
87
+ Gender = Literal["male", "female", "neutral"]
88
+
89
+ AudioEncoding = Literal["wav", "mp3", "ogg", "mulaw", "alaw", "linear16"]
@@ -17,22 +17,22 @@ from __future__ import annotations
17
17
  import asyncio
18
18
  import contextlib
19
19
  import dataclasses
20
+ import os
20
21
  from dataclasses import dataclass
21
- from typing import Any, AsyncIterable, Dict, List
22
+ from typing import AsyncIterable, List, Optional, Union
22
23
 
23
24
  from livekit import agents, rtc
24
25
  from livekit.agents import stt
25
26
  from livekit.agents.utils import AudioBuffer
26
27
 
27
- from google.auth import credentials # type: ignore
28
28
  from google.cloud.speech_v2 import SpeechAsyncClient
29
29
  from google.cloud.speech_v2.types import cloud_speech
30
30
 
31
31
  from .log import logger
32
32
  from .models import SpeechLanguages, SpeechModels
33
33
 
34
- LgType = SpeechLanguages | str
35
- LanguageCode = LgType | List[LgType]
34
+ LgType = Union[SpeechLanguages, str]
35
+ LanguageCode = Union[LgType, List[LgType]]
36
36
 
37
37
 
38
38
  # This class is used only internally to encapsulate the options
@@ -56,21 +56,25 @@ class STT(stt.STT):
56
56
  punctuate: bool = True,
57
57
  spoken_punctuation: bool = True,
58
58
  model: SpeechModels = "long",
59
- credentials_info: Dict[str, Any] | None = None,
59
+ credentials_info: dict | None = None,
60
60
  credentials_file: str | None = None,
61
61
  ):
62
62
  """
63
63
  if no credentials is provided, it will use the credentials on the environment
64
- GOOGLE_APPLICATION_CREDENTIALS (Default behavior of Google SpeechAsyncClient)
64
+ GOOGLE_APPLICATION_CREDENTIALS (default behavior of Google SpeechAsyncClient)
65
65
  """
66
66
  super().__init__(streaming_supported=True)
67
67
 
68
- if credentials_info:
69
- self._client = SpeechAsyncClient.from_service_account_info(credentials_info)
70
- elif credentials_file:
71
- self._client = SpeechAsyncClient.from_service_account_file(credentials_file)
72
- else:
73
- self._client = SpeechAsyncClient()
68
+ self._client: SpeechAsyncClient | None = None
69
+ self._credentials_info = credentials_info
70
+ self._credentials_file = credentials_file
71
+
72
+ if credentials_file is None and credentials_info is None:
73
+ creds = os.environ.get("GOOGLE_APPLICATION_CREDENTIALS")
74
+ if not creds:
75
+ raise ValueError(
76
+ "GOOGLE_APPLICATION_CREDENTIALS must be set if no credentials is provided"
77
+ )
74
78
 
75
79
  if isinstance(languages, str):
76
80
  languages = [languages]
@@ -83,13 +87,30 @@ class STT(stt.STT):
83
87
  spoken_punctuation=spoken_punctuation,
84
88
  model=model,
85
89
  )
86
- self._creds = self._client.transport._credentials
90
+
91
+ def _ensure_client(self) -> SpeechAsyncClient:
92
+ if self._credentials_info:
93
+ self._client = SpeechAsyncClient.from_service_account_info(
94
+ self._credentials_info
95
+ )
96
+ elif self._credentials_file:
97
+ self._client = SpeechAsyncClient.from_service_account_file(
98
+ self._credentials_file
99
+ )
100
+ else:
101
+ self._client = SpeechAsyncClient()
102
+
103
+ assert self._client is not None
104
+ return self._client
87
105
 
88
106
  @property
89
107
  def _recognizer(self) -> str:
90
108
  # TODO(theomonnom): should we use recognizers?
91
- # Recognizers may improve latency https://cloud.google.com/speech-to-text/v2/docs/recognizers#understand_recognizers
92
- return f"projects/{self._creds.project_id}/locations/global/recognizers/_" # type: ignore
109
+ # recognizers may improve latency https://cloud.google.com/speech-to-text/v2/docs/recognizers#understand_recognizers
110
+
111
+ # TODO(theomonnom): find a better way to access the project_id
112
+ project_id = self._ensure_client().transport._credentials.project_id # type: ignore
113
+ return f"projects/{project_id}/locations/global/recognizers/_"
93
114
 
94
115
  def _sanitize_options(
95
116
  self,
@@ -119,31 +140,31 @@ class STT(stt.STT):
119
140
  language: SpeechLanguages | str | None = None,
120
141
  ) -> stt.SpeechEvent:
121
142
  config = self._sanitize_options(language=language)
122
- buffer = agents.utils.merge_frames(buffer)
143
+ frame = agents.utils.merge_frames(buffer)
123
144
 
124
145
  config = cloud_speech.RecognitionConfig(
125
146
  explicit_decoding_config=cloud_speech.ExplicitDecodingConfig(
126
147
  encoding=cloud_speech.ExplicitDecodingConfig.AudioEncoding.LINEAR16,
127
- sample_rate_hertz=buffer.sample_rate,
128
- audio_channel_count=buffer.num_channels,
148
+ sample_rate_hertz=frame.sample_rate,
149
+ audio_channel_count=frame.num_channels,
129
150
  ),
130
151
  features=cloud_speech.RecognitionFeatures(
131
152
  enable_automatic_punctuation=config.punctuate,
132
153
  enable_spoken_punctuation=config.spoken_punctuation,
154
+ enable_word_time_offsets=True,
133
155
  ),
134
156
  model=config.model,
135
157
  language_codes=config.languages,
136
158
  )
137
159
 
138
- return recognize_response_to_speech_event(
139
- await self._client.recognize(
140
- cloud_speech.RecognizeRequest(
141
- recognizer=self._recognizer,
142
- config=config,
143
- content=buffer.data.tobytes(),
144
- )
160
+ raw = await self._ensure_client().recognize(
161
+ cloud_speech.RecognizeRequest(
162
+ recognizer=self._recognizer,
163
+ config=config,
164
+ content=frame.data.tobytes(),
145
165
  )
146
166
  )
167
+ return _recognize_response_to_speech_event(raw)
147
168
 
148
169
  def stream(
149
170
  self,
@@ -152,8 +173,7 @@ class STT(stt.STT):
152
173
  ) -> "SpeechStream":
153
174
  config = self._sanitize_options(language=language)
154
175
  return SpeechStream(
155
- self._client,
156
- self._creds,
176
+ self._ensure_client(),
157
177
  self._recognizer,
158
178
  config,
159
179
  )
@@ -163,29 +183,28 @@ class SpeechStream(stt.SpeechStream):
163
183
  def __init__(
164
184
  self,
165
185
  client: SpeechAsyncClient,
166
- creds: credentials.Credentials,
167
186
  recognizer: str,
168
187
  config: STTOptions,
169
- sample_rate: int = 24000,
188
+ sample_rate: int = 48000,
170
189
  num_channels: int = 1,
171
190
  max_retry: int = 32,
172
191
  ) -> None:
173
192
  super().__init__()
174
193
 
175
194
  self._client = client
176
- self._creds = creds
177
195
  self._recognizer = recognizer
178
196
  self._config = config
179
197
  self._sample_rate = sample_rate
180
198
  self._num_channels = num_channels
181
199
 
182
- self._queue = asyncio.Queue[rtc.AudioFrame | None]()
183
- self._event_queue = asyncio.Queue[stt.SpeechEvent | None]()
200
+ self._queue = asyncio.Queue[Optional[rtc.AudioFrame]]()
201
+ self._event_queue = asyncio.Queue[Optional[stt.SpeechEvent]]()
184
202
  self._closed = False
185
203
  self._main_task = asyncio.create_task(self._run(max_retry=max_retry))
186
204
 
187
205
  self._final_events: List[stt.SpeechEvent] = []
188
- self._speaking = False
206
+ self._need_bos = True
207
+ self._need_eos = False
189
208
 
190
209
  self._streaming_config = cloud_speech.StreamingRecognitionConfig(
191
210
  config=cloud_speech.RecognitionConfig(
@@ -198,6 +217,7 @@ class SpeechStream(stt.SpeechStream):
198
217
  model=self._config.model,
199
218
  features=cloud_speech.RecognitionFeatures(
200
219
  enable_automatic_punctuation=self._config.punctuate,
220
+ enable_word_time_offsets=True,
201
221
  ),
202
222
  ),
203
223
  streaming_features=cloud_speech.StreamingRecognitionFeatures(
@@ -218,7 +238,7 @@ class SpeechStream(stt.SpeechStream):
218
238
 
219
239
  self._queue.put_nowait(frame)
220
240
 
221
- async def aclose(self, wait: bool = True) -> None:
241
+ async def aclose(self, *, wait: bool = True) -> None:
222
242
  self._closed = True
223
243
  if not wait:
224
244
  self._main_task.cancel()
@@ -229,61 +249,55 @@ class SpeechStream(stt.SpeechStream):
229
249
 
230
250
  async def _run(self, max_retry: int) -> None:
231
251
  retry_count = 0
232
- try:
233
- while not self._closed:
234
- try:
235
- # google requires a async generator when calling streaming_recognize
236
- # this function basically convert the queue into a async generator
237
- async def input_generator():
238
- try:
239
- # first request should contain the config
240
- yield cloud_speech.StreamingRecognizeRequest(
241
- recognizer=self._recognizer,
242
- streaming_config=self._streaming_config,
252
+ while not self._closed:
253
+ try:
254
+ # google requires a async generator when calling streaming_recognize
255
+ # this function basically convert the queue into a async generator
256
+ async def input_generator():
257
+ try:
258
+ # first request should contain the config
259
+ yield cloud_speech.StreamingRecognizeRequest(
260
+ recognizer=self._recognizer,
261
+ streaming_config=self._streaming_config,
262
+ )
263
+ while True:
264
+ frame = await self._queue.get()
265
+ if frame is None:
266
+ break
267
+
268
+ frame = frame.remix_and_resample(
269
+ self._sample_rate, self._num_channels
243
270
  )
244
- while True:
245
- frame = (
246
- await self._queue.get()
247
- ) # wait for a new rtc.AudioFrame
248
- if frame is None:
249
- break # None is sent inside aclose
250
-
251
- self._queue.task_done()
252
- frame = frame.remix_and_resample(
253
- self._sample_rate, self._num_channels
254
- )
255
- yield cloud_speech.StreamingRecognizeRequest(
256
- audio=frame.data.tobytes(),
257
- )
258
- except Exception as e:
259
- logger.error(
260
- f"an error occurred while streaming inputs: {e}"
271
+ yield cloud_speech.StreamingRecognizeRequest(
272
+ audio=frame.data.tobytes(),
261
273
  )
274
+ except Exception as e:
275
+ logger.error(f"an error occurred while streaming inputs: {e}")
262
276
 
263
- # try to connect
264
- stream = await self._client.streaming_recognize(
265
- requests=input_generator()
266
- )
267
- retry_count = 0 # connection successful, reset retry count
268
-
269
- await self._run_stream(stream)
270
- except Exception as e:
271
- if retry_count >= max_retry:
272
- logger.error(
273
- f"failed to connect to google stt after {max_retry} tries",
274
- exc_info=e,
275
- )
276
- break
277
+ # try to connect
278
+ stream = await self._client.streaming_recognize(
279
+ requests=input_generator()
280
+ )
281
+ retry_count = 0 # connection successful, reset retry count
277
282
 
278
- retry_delay = min(retry_count * 2, 10) # max 10s
279
- retry_count += 1
280
- logger.warning(
281
- f"google stt connection failed, retrying in {retry_delay}s",
283
+ await self._run_stream(stream)
284
+ except Exception as e:
285
+ if retry_count >= max_retry:
286
+ logger.error(
287
+ f"failed to connect to google stt after {max_retry} tries",
282
288
  exc_info=e,
283
289
  )
284
- await asyncio.sleep(retry_delay)
285
- finally:
286
- self._event_queue.put_nowait(None)
290
+ break
291
+
292
+ retry_delay = min(retry_count * 2, 5) # max 5s
293
+ retry_count += 1
294
+ logger.warning(
295
+ f"google stt connection failed, retrying in {retry_delay}s",
296
+ exc_info=e,
297
+ )
298
+ await asyncio.sleep(retry_delay)
299
+
300
+ self._event_queue.put_nowait(None)
287
301
 
288
302
  async def _run_stream(
289
303
  self, stream: AsyncIterable[cloud_speech.StreamingRecognizeResponse]
@@ -293,11 +307,11 @@ class SpeechStream(stt.SpeechStream):
293
307
  resp.speech_event_type
294
308
  == cloud_speech.StreamingRecognizeResponse.SpeechEventType.SPEECH_ACTIVITY_BEGIN
295
309
  ):
296
- self._speaking = True
297
- start_event = stt.SpeechEvent(
298
- type=stt.SpeechEventType.START_OF_SPEECH,
299
- )
300
- self._event_queue.put_nowait(start_event)
310
+ if self._need_eos:
311
+ self._send_eos()
312
+
313
+ if self._need_bos:
314
+ self._send_bos()
301
315
 
302
316
  if (
303
317
  resp.speech_event_type
@@ -305,57 +319,89 @@ class SpeechStream(stt.SpeechStream):
305
319
  ):
306
320
  result = resp.results[0]
307
321
  if not result.is_final:
308
- # interim results
309
322
  iterim_event = stt.SpeechEvent(
310
323
  type=stt.SpeechEventType.INTERIM_TRANSCRIPT,
311
- alternatives=streaming_recognize_response_to_speech_data(resp),
324
+ alternatives=[
325
+ _streaming_recognize_response_to_speech_data(resp)
326
+ ],
312
327
  )
313
328
  self._event_queue.put_nowait(iterim_event)
314
329
 
315
330
  else:
316
331
  final_event = stt.SpeechEvent(
317
332
  type=stt.SpeechEventType.FINAL_TRANSCRIPT,
318
- alternatives=streaming_recognize_response_to_speech_data(resp),
333
+ alternatives=[
334
+ _streaming_recognize_response_to_speech_data(resp)
335
+ ],
319
336
  )
320
337
  self._final_events.append(final_event)
321
338
  self._event_queue.put_nowait(final_event)
322
339
 
323
- if not self._speaking:
324
- # With Google STT, we receive the final event after the END_OF_SPEECH event
325
- sentence = ""
326
- confidence = 0.0
327
- for alt in self._final_events:
328
- sentence += f"{alt.alternatives[0].text.strip()} "
329
- confidence += alt.alternatives[0].confidence
330
-
331
- sentence = sentence.rstrip()
332
- confidence /= len(self._final_events) # avg. of confidence
333
-
334
- end_event = stt.SpeechEvent(
335
- type=stt.SpeechEventType.END_OF_SPEECH,
336
- alternatives=[
337
- stt.SpeechData(
338
- language=result.language_code,
339
- start_time=self._final_events[0]
340
- .alternatives[0]
341
- .start_time,
342
- end_time=self._final_events[-1]
343
- .alternatives[0]
344
- .end_time,
345
- confidence=confidence,
346
- text=sentence,
347
- )
348
- ],
349
- )
350
-
351
- self._final_events = []
352
- self._event_queue.put_nowait(end_event)
340
+ if self._need_eos:
341
+ self._send_eos()
353
342
 
354
343
  if (
355
344
  resp.speech_event_type
356
345
  == cloud_speech.StreamingRecognizeResponse.SpeechEventType.SPEECH_ACTIVITY_END
357
346
  ):
358
- self._speaking = False
347
+ self._need_eos = True
348
+
349
+ if not self._need_bos:
350
+ self._send_eos()
351
+
352
+ def _send_bos(self) -> None:
353
+ self._need_bos = False
354
+ start_event = stt.SpeechEvent(
355
+ type=stt.SpeechEventType.START_OF_SPEECH,
356
+ )
357
+ self._event_queue.put_nowait(start_event)
358
+
359
+ def _send_eos(self) -> None:
360
+ self._need_eos = False
361
+ self._need_bos = True
362
+
363
+ if self._final_events:
364
+ lg = self._final_events[0].alternatives[0].language
365
+
366
+ sentence = ""
367
+ confidence = 0.0
368
+ for alt in self._final_events:
369
+ sentence += f"{alt.alternatives[0].text.strip()} "
370
+ confidence += alt.alternatives[0].confidence
371
+
372
+ sentence = sentence.rstrip()
373
+ confidence /= len(self._final_events) # avg. of confidence
374
+
375
+ end_event = stt.SpeechEvent(
376
+ type=stt.SpeechEventType.END_OF_SPEECH,
377
+ alternatives=[
378
+ stt.SpeechData(
379
+ language=lg,
380
+ start_time=self._final_events[0].alternatives[0].start_time,
381
+ end_time=self._final_events[-1].alternatives[0].end_time,
382
+ confidence=confidence,
383
+ text=sentence,
384
+ )
385
+ ],
386
+ )
387
+
388
+ self._final_events = []
389
+ self._event_queue.put_nowait(end_event)
390
+ else:
391
+ end_event = stt.SpeechEvent(
392
+ type=stt.SpeechEventType.END_OF_SPEECH,
393
+ alternatives=[
394
+ stt.SpeechData(
395
+ language="",
396
+ start_time=0,
397
+ end_time=0,
398
+ confidence=0,
399
+ text="",
400
+ )
401
+ ],
402
+ )
403
+
404
+ self._event_queue.put_nowait(end_event)
359
405
 
360
406
  async def __anext__(self) -> stt.SpeechEvent:
361
407
  evt = await self._event_queue.get()
@@ -365,38 +411,53 @@ class SpeechStream(stt.SpeechStream):
365
411
  return evt
366
412
 
367
413
 
368
- def recognize_response_to_speech_event(
414
+ def _recognize_response_to_speech_event(
369
415
  resp: cloud_speech.RecognizeResponse,
370
416
  ) -> stt.SpeechEvent:
371
- result = resp.results[0]
372
- gg_alts = result.alternatives
417
+ text = ""
418
+ confidence = 0.0
419
+ for result in resp.results:
420
+ text += result.alternatives[0].transcript
421
+ confidence += result.alternatives[0].confidence
422
+
423
+ # not sure why start_offset and end_offset returns a timedelta
424
+ start_offset = resp.results[0].alternatives[0].words[0].start_offset
425
+ end_offset = resp.results[-1].alternatives[0].words[-1].end_offset
426
+
427
+ confidence /= len(resp.results)
428
+ lg = resp.results[0].language_code
373
429
  return stt.SpeechEvent(
374
430
  type=stt.SpeechEventType.FINAL_TRANSCRIPT,
375
431
  alternatives=[
376
432
  stt.SpeechData(
377
- language=result.language_code,
378
- start_time=alt.words[0].start_offset.seconds if alt.words else 0,
379
- end_time=alt.words[-1].end_offset.seconds if alt.words else 0,
380
- confidence=alt.confidence,
381
- text=alt.transcript,
433
+ language=lg,
434
+ start_time=start_offset.total_seconds(), # type: ignore
435
+ end_time=end_offset.total_seconds(), # type: ignore
436
+ confidence=confidence,
437
+ text=text,
382
438
  )
383
- for alt in gg_alts
384
439
  ],
385
440
  )
386
441
 
387
442
 
388
- def streaming_recognize_response_to_speech_data(
443
+ def _streaming_recognize_response_to_speech_data(
389
444
  resp: cloud_speech.StreamingRecognizeResponse,
390
- ) -> List[stt.SpeechData]:
391
- result = resp.results[0]
392
- gg_alts = result.alternatives
393
- return [
394
- stt.SpeechData(
395
- language=result.language_code,
396
- start_time=alt.words[0].start_offset.seconds if alt.words else 0,
397
- end_time=alt.words[-1].end_offset.seconds if alt.words else 0,
398
- confidence=alt.confidence,
399
- text=alt.transcript,
400
- )
401
- for alt in gg_alts
402
- ]
445
+ ) -> stt.SpeechData:
446
+ text = ""
447
+ confidence = 0.0
448
+ for result in resp.results:
449
+ text += result.alternatives[0].transcript
450
+ confidence += result.alternatives[0].confidence
451
+
452
+ confidence /= len(resp.results)
453
+ lg = resp.results[0].language_code
454
+
455
+ data = stt.SpeechData(
456
+ language=lg,
457
+ start_time=0,
458
+ end_time=0,
459
+ confidence=confidence,
460
+ text=text,
461
+ )
462
+
463
+ return data
@@ -0,0 +1,186 @@
1
+ # Copyright 2023 LiveKit, Inc.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from __future__ import annotations
16
+
17
+ import asyncio
18
+ import contextlib
19
+ from dataclasses import dataclass
20
+ from typing import Optional, Union
21
+
22
+ from livekit import rtc
23
+ from livekit.agents import codecs, tts
24
+
25
+ from google.cloud import texttospeech
26
+ from google.cloud.texttospeech_v1.types import (
27
+ SsmlVoiceGender,
28
+ SynthesizeSpeechResponse,
29
+ )
30
+
31
+ from .log import logger
32
+ from .models import AudioEncoding, Gender, SpeechLanguages
33
+
34
+ LgType = Union[SpeechLanguages, str]
35
+ GenderType = Union[Gender, str]
36
+ AudioEncodingType = Union[AudioEncoding, str]
37
+
38
+
39
+ @dataclass
40
+ class _TTSOptions:
41
+ voice: texttospeech.VoiceSelectionParams
42
+ audio_config: texttospeech.AudioConfig
43
+
44
+
45
+ class TTS(tts.TTS):
46
+ def __init__(
47
+ self,
48
+ *,
49
+ language: LgType = "en-US",
50
+ gender: GenderType = "neutral",
51
+ voice_name: str = "", # Not required
52
+ encoding: AudioEncodingType = "linear16",
53
+ sample_rate: int = 24000,
54
+ speaking_rate: float = 1.0,
55
+ credentials_info: dict | None = None,
56
+ credentials_file: str | None = None,
57
+ ) -> None:
58
+ """
59
+ if no credentials is provided, it will use the credentials on the environment
60
+ GOOGLE_APPLICATION_CREDENTIALS (default behavior of Google TextToSpeechAsyncClient)
61
+ """
62
+ super().__init__(
63
+ streaming_supported=False, sample_rate=sample_rate, num_channels=1
64
+ )
65
+
66
+ self._client: texttospeech.TextToSpeechAsyncClient | None = None
67
+ self._credentials_info = credentials_info
68
+ self._credentials_file = credentials_file
69
+
70
+ ssml_gender = SsmlVoiceGender.NEUTRAL
71
+ if gender == "male":
72
+ ssml_gender = SsmlVoiceGender.MALE
73
+ elif gender == "female":
74
+ ssml_gender = SsmlVoiceGender.FEMALE
75
+
76
+ voice = texttospeech.VoiceSelectionParams(
77
+ name=voice_name,
78
+ language_code=language,
79
+ ssml_gender=ssml_gender,
80
+ )
81
+
82
+ if encoding == "linear16" or encoding == "wav":
83
+ _audio_encoding = texttospeech.AudioEncoding.LINEAR16
84
+ elif encoding == "mp3":
85
+ _audio_encoding = texttospeech.AudioEncoding.MP3
86
+ else:
87
+ raise NotImplementedError(f"audio encoding {encoding} is not supported")
88
+
89
+ self._opts = _TTSOptions(
90
+ voice=voice,
91
+ audio_config=texttospeech.AudioConfig(
92
+ audio_encoding=_audio_encoding,
93
+ sample_rate_hertz=sample_rate,
94
+ speaking_rate=speaking_rate,
95
+ ),
96
+ )
97
+
98
+ def _ensure_client(self) -> texttospeech.TextToSpeechAsyncClient:
99
+ if not self._client:
100
+ if self._credentials_info:
101
+ self._client = (
102
+ texttospeech.TextToSpeechAsyncClient.from_service_account_info(
103
+ self._credentials_info
104
+ )
105
+ )
106
+
107
+ elif self._credentials_file:
108
+ self._client = (
109
+ texttospeech.TextToSpeechAsyncClient.from_service_account_file(
110
+ self._credentials_file
111
+ )
112
+ )
113
+ else:
114
+ self._client = texttospeech.TextToSpeechAsyncClient()
115
+
116
+ assert self._client is not None
117
+ return self._client
118
+
119
+ def synthesize(
120
+ self,
121
+ text: str,
122
+ ) -> "ChunkedStream":
123
+ return ChunkedStream(text, self._opts, self._ensure_client())
124
+
125
+
126
+ class ChunkedStream(tts.ChunkedStream):
127
+ def __init__(
128
+ self, text: str, opts: _TTSOptions, client: texttospeech.TextToSpeechAsyncClient
129
+ ) -> None:
130
+ self._text = text
131
+ self._opts = opts
132
+ self._client = client
133
+ self._main_task: asyncio.Task | None = None
134
+ self._queue = asyncio.Queue[Optional[tts.SynthesizedAudio]]()
135
+
136
+ async def _run(self) -> None:
137
+ try:
138
+ response: SynthesizeSpeechResponse = await self._client.synthesize_speech(
139
+ input=texttospeech.SynthesisInput(text=self._text),
140
+ voice=self._opts.voice,
141
+ audio_config=self._opts.audio_config,
142
+ )
143
+
144
+ data = response.audio_content
145
+ if self._opts.audio_config.audio_encoding == "mp3":
146
+ decoder = codecs.Mp3StreamDecoder()
147
+ frames = decoder.decode_chunk(data)
148
+ for frame in frames:
149
+ self._queue.put_nowait(
150
+ tts.SynthesizedAudio(text=self._text, data=frame)
151
+ )
152
+ else:
153
+ self._queue.put_nowait(
154
+ tts.SynthesizedAudio(
155
+ text="",
156
+ data=rtc.AudioFrame(
157
+ data=data,
158
+ sample_rate=self._opts.audio_config.sample_rate_hertz,
159
+ num_channels=1,
160
+ samples_per_channel=len(data) // 2, # 16-bit
161
+ ),
162
+ )
163
+ )
164
+
165
+ except Exception:
166
+ logger.exception("failed to synthesize")
167
+ finally:
168
+ self._queue.put_nowait(None)
169
+
170
+ async def __anext__(self) -> tts.SynthesizedAudio:
171
+ if not self._main_task:
172
+ self._main_task = asyncio.create_task(self._run())
173
+
174
+ frame = await self._queue.get()
175
+ if frame is None:
176
+ raise StopAsyncIteration
177
+
178
+ return frame
179
+
180
+ async def aclose(self) -> None:
181
+ if not self._main_task:
182
+ return
183
+
184
+ self._main_task.cancel()
185
+ with contextlib.suppress(asyncio.CancelledError):
186
+ await self._main_task
@@ -12,4 +12,4 @@
12
12
  # See the License for the specific language governing permissions and
13
13
  # limitations under the License.
14
14
 
15
- __version__ = "0.4.dev0"
15
+ __version__ = "0.5.0"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: livekit-plugins-google
3
- Version: 0.4.dev0
3
+ Version: 0.5.0
4
4
  Summary: Agent Framework plugin for services from Google Cloud
5
5
  Home-page: https://github.com/livekit/agents
6
6
  License: Apache-2.0
@@ -30,7 +30,7 @@ Requires-Dist: google-cloud-texttospeech <3,>=2
30
30
  Requires-Dist: google-cloud-translate <4,>=3
31
31
  Requires-Dist: googleapis-common-protos <2,>=1
32
32
  Requires-Dist: livekit ~=0.11
33
- Requires-Dist: livekit-agents ~=0.6.dev0
33
+ Requires-Dist: livekit-agents ~=0.7.0
34
34
 
35
35
  # LiveKit Plugins Google
36
36
 
@@ -0,0 +1,11 @@
1
+ livekit/plugins/google/__init__.py,sha256=DlQC5cosMFyQlM8_vFvJGoZiziFkd0Sa4mutnsxXyZM,959
2
+ livekit/plugins/google/log.py,sha256=GI3YWN5YzrafnUccljzPRS_ZALkMNk1i21IRnTl2vNA,69
3
+ livekit/plugins/google/models.py,sha256=n8pgTJ7xyJpPCZJ_y0GzaQq6LqYknL6K6trpi07-AxM,1307
4
+ livekit/plugins/google/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
5
+ livekit/plugins/google/stt.py,sha256=GfWita3mgLZG2KpS9WYMCL8jwCNN5qukicpI58zPCcY,16058
6
+ livekit/plugins/google/tts.py,sha256=J3V5aDUz0V2_Dfs16pobDVx7XwQqU1AEM8TWXdaDn9w,6182
7
+ livekit/plugins/google/version.py,sha256=pZ7bgeWLjw4VCWymU1ntHaHorKRusUkm56y6tZe5gmQ,600
8
+ livekit_plugins_google-0.5.0.dist-info/METADATA,sha256=Hf7P77h8fLEnGsNj4rUdSA_mSL4sCv5pMktzPoTsCbk,1941
9
+ livekit_plugins_google-0.5.0.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
10
+ livekit_plugins_google-0.5.0.dist-info/top_level.txt,sha256=OoDok3xUmXbZRvOrfvvXB-Juu4DX79dlq188E19YHoo,8
11
+ livekit_plugins_google-0.5.0.dist-info/RECORD,,
@@ -1,10 +0,0 @@
1
- livekit/plugins/google/__init__.py,sha256=snPMHNLrurYbLWQOkV_o6qG1CEWsOCZ8ZfPMvmh5ejY,931
2
- livekit/plugins/google/log.py,sha256=GI3YWN5YzrafnUccljzPRS_ZALkMNk1i21IRnTl2vNA,69
3
- livekit/plugins/google/models.py,sha256=DgiXOvGDO8D9rfCKHJL28lbyQR8mXXB2kpku-szXLRs,1185
4
- livekit/plugins/google/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
5
- livekit/plugins/google/stt.py,sha256=sPV4ByAxfeGBNvAGIuwZvheEA0k7NYjXR_UiYWjd39Y,15029
6
- livekit/plugins/google/version.py,sha256=OwSbVTqWUJKy9w2Jbh1MIrp5cHPvEYsLXDhRGwdZKso,603
7
- livekit_plugins_google-0.4.dev0.dist-info/METADATA,sha256=i4l7y8p0Y57kK5oPQqF-8lp9SFvPL0nDVsuDkktHJN4,1947
8
- livekit_plugins_google-0.4.dev0.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
9
- livekit_plugins_google-0.4.dev0.dist-info/top_level.txt,sha256=OoDok3xUmXbZRvOrfvvXB-Juu4DX79dlq188E19YHoo,8
10
- livekit_plugins_google-0.4.dev0.dist-info/RECORD,,