livekit-plugins-google 0.4.dev0__py3-none-any.whl → 0.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- livekit/plugins/google/__init__.py +2 -1
- livekit/plugins/google/models.py +4 -0
- livekit/plugins/google/stt.py +206 -145
- livekit/plugins/google/tts.py +186 -0
- livekit/plugins/google/version.py +1 -1
- {livekit_plugins_google-0.4.dev0.dist-info → livekit_plugins_google-0.5.0.dist-info}/METADATA +2 -2
- livekit_plugins_google-0.5.0.dist-info/RECORD +11 -0
- livekit_plugins_google-0.4.dev0.dist-info/RECORD +0 -10
- {livekit_plugins_google-0.4.dev0.dist-info → livekit_plugins_google-0.5.0.dist-info}/WHEEL +0 -0
- {livekit_plugins_google-0.4.dev0.dist-info → livekit_plugins_google-0.5.0.dist-info}/top_level.txt +0 -0
@@ -13,9 +13,10 @@
|
|
13
13
|
# limitations under the License.
|
14
14
|
|
15
15
|
from .stt import STT, SpeechStream
|
16
|
+
from .tts import TTS
|
16
17
|
from .version import __version__
|
17
18
|
|
18
|
-
__all__ = ["STT", "SpeechStream", "__version__"]
|
19
|
+
__all__ = ["STT", "TTS", "SpeechStream", "__version__"]
|
19
20
|
|
20
21
|
from livekit.agents import Plugin
|
21
22
|
|
livekit/plugins/google/models.py
CHANGED
livekit/plugins/google/stt.py
CHANGED
@@ -17,22 +17,22 @@ from __future__ import annotations
|
|
17
17
|
import asyncio
|
18
18
|
import contextlib
|
19
19
|
import dataclasses
|
20
|
+
import os
|
20
21
|
from dataclasses import dataclass
|
21
|
-
from typing import
|
22
|
+
from typing import AsyncIterable, List, Optional, Union
|
22
23
|
|
23
24
|
from livekit import agents, rtc
|
24
25
|
from livekit.agents import stt
|
25
26
|
from livekit.agents.utils import AudioBuffer
|
26
27
|
|
27
|
-
from google.auth import credentials # type: ignore
|
28
28
|
from google.cloud.speech_v2 import SpeechAsyncClient
|
29
29
|
from google.cloud.speech_v2.types import cloud_speech
|
30
30
|
|
31
31
|
from .log import logger
|
32
32
|
from .models import SpeechLanguages, SpeechModels
|
33
33
|
|
34
|
-
LgType = SpeechLanguages
|
35
|
-
LanguageCode = LgType
|
34
|
+
LgType = Union[SpeechLanguages, str]
|
35
|
+
LanguageCode = Union[LgType, List[LgType]]
|
36
36
|
|
37
37
|
|
38
38
|
# This class is only be used internally to encapsulate the options
|
@@ -56,21 +56,25 @@ class STT(stt.STT):
|
|
56
56
|
punctuate: bool = True,
|
57
57
|
spoken_punctuation: bool = True,
|
58
58
|
model: SpeechModels = "long",
|
59
|
-
credentials_info:
|
59
|
+
credentials_info: dict | None = None,
|
60
60
|
credentials_file: str | None = None,
|
61
61
|
):
|
62
62
|
"""
|
63
63
|
if no credentials is provided, it will use the credentials on the environment
|
64
|
-
GOOGLE_APPLICATION_CREDENTIALS (
|
64
|
+
GOOGLE_APPLICATION_CREDENTIALS (default behavior of Google SpeechAsyncClient)
|
65
65
|
"""
|
66
66
|
super().__init__(streaming_supported=True)
|
67
67
|
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
68
|
+
self._client: SpeechAsyncClient | None = None
|
69
|
+
self._credentials_info = credentials_info
|
70
|
+
self._credentials_file = credentials_file
|
71
|
+
|
72
|
+
if credentials_file is None and credentials_info is None:
|
73
|
+
creds = os.environ.get("GOOGLE_APPLICATION_CREDENTIALS")
|
74
|
+
if not creds:
|
75
|
+
raise ValueError(
|
76
|
+
"GOOGLE_APPLICATION_CREDENTIALS must be set if no credentials is provided"
|
77
|
+
)
|
74
78
|
|
75
79
|
if isinstance(languages, str):
|
76
80
|
languages = [languages]
|
@@ -83,13 +87,30 @@ class STT(stt.STT):
|
|
83
87
|
spoken_punctuation=spoken_punctuation,
|
84
88
|
model=model,
|
85
89
|
)
|
86
|
-
|
90
|
+
|
91
|
+
def _ensure_client(self) -> SpeechAsyncClient:
|
92
|
+
if self._credentials_info:
|
93
|
+
self._client = SpeechAsyncClient.from_service_account_info(
|
94
|
+
self._credentials_info
|
95
|
+
)
|
96
|
+
elif self._credentials_file:
|
97
|
+
self._client = SpeechAsyncClient.from_service_account_file(
|
98
|
+
self._credentials_file
|
99
|
+
)
|
100
|
+
else:
|
101
|
+
self._client = SpeechAsyncClient()
|
102
|
+
|
103
|
+
assert self._client is not None
|
104
|
+
return self._client
|
87
105
|
|
88
106
|
@property
|
89
107
|
def _recognizer(self) -> str:
|
90
108
|
# TODO(theomonnom): should we use recognizers?
|
91
|
-
#
|
92
|
-
|
109
|
+
# recognizers may improve latency https://cloud.google.com/speech-to-text/v2/docs/recognizers#understand_recognizers
|
110
|
+
|
111
|
+
# TODO(theomonnom): find a better way to access the project_id
|
112
|
+
project_id = self._ensure_client().transport._credentials.project_id # type: ignore
|
113
|
+
return f"projects/{project_id}/locations/global/recognizers/_"
|
93
114
|
|
94
115
|
def _sanitize_options(
|
95
116
|
self,
|
@@ -119,31 +140,31 @@ class STT(stt.STT):
|
|
119
140
|
language: SpeechLanguages | str | None = None,
|
120
141
|
) -> stt.SpeechEvent:
|
121
142
|
config = self._sanitize_options(language=language)
|
122
|
-
|
143
|
+
frame = agents.utils.merge_frames(buffer)
|
123
144
|
|
124
145
|
config = cloud_speech.RecognitionConfig(
|
125
146
|
explicit_decoding_config=cloud_speech.ExplicitDecodingConfig(
|
126
147
|
encoding=cloud_speech.ExplicitDecodingConfig.AudioEncoding.LINEAR16,
|
127
|
-
sample_rate_hertz=
|
128
|
-
audio_channel_count=
|
148
|
+
sample_rate_hertz=frame.sample_rate,
|
149
|
+
audio_channel_count=frame.num_channels,
|
129
150
|
),
|
130
151
|
features=cloud_speech.RecognitionFeatures(
|
131
152
|
enable_automatic_punctuation=config.punctuate,
|
132
153
|
enable_spoken_punctuation=config.spoken_punctuation,
|
154
|
+
enable_word_time_offsets=True,
|
133
155
|
),
|
134
156
|
model=config.model,
|
135
157
|
language_codes=config.languages,
|
136
158
|
)
|
137
159
|
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
content=buffer.data.tobytes(),
|
144
|
-
)
|
160
|
+
raw = await self._ensure_client().recognize(
|
161
|
+
cloud_speech.RecognizeRequest(
|
162
|
+
recognizer=self._recognizer,
|
163
|
+
config=config,
|
164
|
+
content=frame.data.tobytes(),
|
145
165
|
)
|
146
166
|
)
|
167
|
+
return _recognize_response_to_speech_event(raw)
|
147
168
|
|
148
169
|
def stream(
|
149
170
|
self,
|
@@ -152,8 +173,7 @@ class STT(stt.STT):
|
|
152
173
|
) -> "SpeechStream":
|
153
174
|
config = self._sanitize_options(language=language)
|
154
175
|
return SpeechStream(
|
155
|
-
self.
|
156
|
-
self._creds,
|
176
|
+
self._ensure_client(),
|
157
177
|
self._recognizer,
|
158
178
|
config,
|
159
179
|
)
|
@@ -163,29 +183,28 @@ class SpeechStream(stt.SpeechStream):
|
|
163
183
|
def __init__(
|
164
184
|
self,
|
165
185
|
client: SpeechAsyncClient,
|
166
|
-
creds: credentials.Credentials,
|
167
186
|
recognizer: str,
|
168
187
|
config: STTOptions,
|
169
|
-
sample_rate: int =
|
188
|
+
sample_rate: int = 48000,
|
170
189
|
num_channels: int = 1,
|
171
190
|
max_retry: int = 32,
|
172
191
|
) -> None:
|
173
192
|
super().__init__()
|
174
193
|
|
175
194
|
self._client = client
|
176
|
-
self._creds = creds
|
177
195
|
self._recognizer = recognizer
|
178
196
|
self._config = config
|
179
197
|
self._sample_rate = sample_rate
|
180
198
|
self._num_channels = num_channels
|
181
199
|
|
182
|
-
self._queue = asyncio.Queue[rtc.AudioFrame
|
183
|
-
self._event_queue = asyncio.Queue[stt.SpeechEvent
|
200
|
+
self._queue = asyncio.Queue[Optional[rtc.AudioFrame]]()
|
201
|
+
self._event_queue = asyncio.Queue[Optional[stt.SpeechEvent]]()
|
184
202
|
self._closed = False
|
185
203
|
self._main_task = asyncio.create_task(self._run(max_retry=max_retry))
|
186
204
|
|
187
205
|
self._final_events: List[stt.SpeechEvent] = []
|
188
|
-
self.
|
206
|
+
self._need_bos = True
|
207
|
+
self._need_eos = False
|
189
208
|
|
190
209
|
self._streaming_config = cloud_speech.StreamingRecognitionConfig(
|
191
210
|
config=cloud_speech.RecognitionConfig(
|
@@ -198,6 +217,7 @@ class SpeechStream(stt.SpeechStream):
|
|
198
217
|
model=self._config.model,
|
199
218
|
features=cloud_speech.RecognitionFeatures(
|
200
219
|
enable_automatic_punctuation=self._config.punctuate,
|
220
|
+
enable_word_time_offsets=True,
|
201
221
|
),
|
202
222
|
),
|
203
223
|
streaming_features=cloud_speech.StreamingRecognitionFeatures(
|
@@ -218,7 +238,7 @@ class SpeechStream(stt.SpeechStream):
|
|
218
238
|
|
219
239
|
self._queue.put_nowait(frame)
|
220
240
|
|
221
|
-
async def aclose(self, wait: bool = True) -> None:
|
241
|
+
async def aclose(self, *, wait: bool = True) -> None:
|
222
242
|
self._closed = True
|
223
243
|
if not wait:
|
224
244
|
self._main_task.cancel()
|
@@ -229,61 +249,55 @@ class SpeechStream(stt.SpeechStream):
|
|
229
249
|
|
230
250
|
async def _run(self, max_retry: int) -> None:
|
231
251
|
retry_count = 0
|
232
|
-
|
233
|
-
|
234
|
-
|
235
|
-
|
236
|
-
|
237
|
-
|
238
|
-
|
239
|
-
|
240
|
-
|
241
|
-
|
242
|
-
|
252
|
+
while not self._closed:
|
253
|
+
try:
|
254
|
+
# google requires a async generator when calling streaming_recognize
|
255
|
+
# this function basically convert the queue into a async generator
|
256
|
+
async def input_generator():
|
257
|
+
try:
|
258
|
+
# first request should contain the config
|
259
|
+
yield cloud_speech.StreamingRecognizeRequest(
|
260
|
+
recognizer=self._recognizer,
|
261
|
+
streaming_config=self._streaming_config,
|
262
|
+
)
|
263
|
+
while True:
|
264
|
+
frame = await self._queue.get()
|
265
|
+
if frame is None:
|
266
|
+
break
|
267
|
+
|
268
|
+
frame = frame.remix_and_resample(
|
269
|
+
self._sample_rate, self._num_channels
|
243
270
|
)
|
244
|
-
|
245
|
-
frame
|
246
|
-
await self._queue.get()
|
247
|
-
) # wait for a new rtc.AudioFrame
|
248
|
-
if frame is None:
|
249
|
-
break # None is sent inside aclose
|
250
|
-
|
251
|
-
self._queue.task_done()
|
252
|
-
frame = frame.remix_and_resample(
|
253
|
-
self._sample_rate, self._num_channels
|
254
|
-
)
|
255
|
-
yield cloud_speech.StreamingRecognizeRequest(
|
256
|
-
audio=frame.data.tobytes(),
|
257
|
-
)
|
258
|
-
except Exception as e:
|
259
|
-
logger.error(
|
260
|
-
f"an error occurred while streaming inputs: {e}"
|
271
|
+
yield cloud_speech.StreamingRecognizeRequest(
|
272
|
+
audio=frame.data.tobytes(),
|
261
273
|
)
|
274
|
+
except Exception as e:
|
275
|
+
logger.error(f"an error occurred while streaming inputs: {e}")
|
262
276
|
|
263
|
-
|
264
|
-
|
265
|
-
|
266
|
-
|
267
|
-
|
268
|
-
|
269
|
-
await self._run_stream(stream)
|
270
|
-
except Exception as e:
|
271
|
-
if retry_count >= max_retry:
|
272
|
-
logger.error(
|
273
|
-
f"failed to connect to google stt after {max_retry} tries",
|
274
|
-
exc_info=e,
|
275
|
-
)
|
276
|
-
break
|
277
|
+
# try to connect
|
278
|
+
stream = await self._client.streaming_recognize(
|
279
|
+
requests=input_generator()
|
280
|
+
)
|
281
|
+
retry_count = 0 # connection successful, reset retry count
|
277
282
|
|
278
|
-
|
279
|
-
|
280
|
-
|
281
|
-
|
283
|
+
await self._run_stream(stream)
|
284
|
+
except Exception as e:
|
285
|
+
if retry_count >= max_retry:
|
286
|
+
logger.error(
|
287
|
+
f"failed to connect to google stt after {max_retry} tries",
|
282
288
|
exc_info=e,
|
283
289
|
)
|
284
|
-
|
285
|
-
|
286
|
-
|
290
|
+
break
|
291
|
+
|
292
|
+
retry_delay = min(retry_count * 2, 5) # max 5s
|
293
|
+
retry_count += 1
|
294
|
+
logger.warning(
|
295
|
+
f"google stt connection failed, retrying in {retry_delay}s",
|
296
|
+
exc_info=e,
|
297
|
+
)
|
298
|
+
await asyncio.sleep(retry_delay)
|
299
|
+
|
300
|
+
self._event_queue.put_nowait(None)
|
287
301
|
|
288
302
|
async def _run_stream(
|
289
303
|
self, stream: AsyncIterable[cloud_speech.StreamingRecognizeResponse]
|
@@ -293,11 +307,11 @@ class SpeechStream(stt.SpeechStream):
|
|
293
307
|
resp.speech_event_type
|
294
308
|
== cloud_speech.StreamingRecognizeResponse.SpeechEventType.SPEECH_ACTIVITY_BEGIN
|
295
309
|
):
|
296
|
-
self.
|
297
|
-
|
298
|
-
|
299
|
-
|
300
|
-
self.
|
310
|
+
if self._need_eos:
|
311
|
+
self._send_eos()
|
312
|
+
|
313
|
+
if self._need_bos:
|
314
|
+
self._send_bos()
|
301
315
|
|
302
316
|
if (
|
303
317
|
resp.speech_event_type
|
@@ -305,57 +319,89 @@ class SpeechStream(stt.SpeechStream):
|
|
305
319
|
):
|
306
320
|
result = resp.results[0]
|
307
321
|
if not result.is_final:
|
308
|
-
# interim results
|
309
322
|
iterim_event = stt.SpeechEvent(
|
310
323
|
type=stt.SpeechEventType.INTERIM_TRANSCRIPT,
|
311
|
-
alternatives=
|
324
|
+
alternatives=[
|
325
|
+
_streaming_recognize_response_to_speech_data(resp)
|
326
|
+
],
|
312
327
|
)
|
313
328
|
self._event_queue.put_nowait(iterim_event)
|
314
329
|
|
315
330
|
else:
|
316
331
|
final_event = stt.SpeechEvent(
|
317
332
|
type=stt.SpeechEventType.FINAL_TRANSCRIPT,
|
318
|
-
alternatives=
|
333
|
+
alternatives=[
|
334
|
+
_streaming_recognize_response_to_speech_data(resp)
|
335
|
+
],
|
319
336
|
)
|
320
337
|
self._final_events.append(final_event)
|
321
338
|
self._event_queue.put_nowait(final_event)
|
322
339
|
|
323
|
-
|
324
|
-
|
325
|
-
sentence = ""
|
326
|
-
confidence = 0.0
|
327
|
-
for alt in self._final_events:
|
328
|
-
sentence += f"{alt.alternatives[0].text.strip()} "
|
329
|
-
confidence += alt.alternatives[0].confidence
|
330
|
-
|
331
|
-
sentence = sentence.rstrip()
|
332
|
-
confidence /= len(self._final_events) # avg. of confidence
|
333
|
-
|
334
|
-
end_event = stt.SpeechEvent(
|
335
|
-
type=stt.SpeechEventType.END_OF_SPEECH,
|
336
|
-
alternatives=[
|
337
|
-
stt.SpeechData(
|
338
|
-
language=result.language_code,
|
339
|
-
start_time=self._final_events[0]
|
340
|
-
.alternatives[0]
|
341
|
-
.start_time,
|
342
|
-
end_time=self._final_events[-1]
|
343
|
-
.alternatives[0]
|
344
|
-
.end_time,
|
345
|
-
confidence=confidence,
|
346
|
-
text=sentence,
|
347
|
-
)
|
348
|
-
],
|
349
|
-
)
|
350
|
-
|
351
|
-
self._final_events = []
|
352
|
-
self._event_queue.put_nowait(end_event)
|
340
|
+
if self._need_eos:
|
341
|
+
self._send_eos()
|
353
342
|
|
354
343
|
if (
|
355
344
|
resp.speech_event_type
|
356
345
|
== cloud_speech.StreamingRecognizeResponse.SpeechEventType.SPEECH_ACTIVITY_END
|
357
346
|
):
|
358
|
-
self.
|
347
|
+
self._need_eos = True
|
348
|
+
|
349
|
+
if not self._need_bos:
|
350
|
+
self._send_eos()
|
351
|
+
|
352
|
+
def _send_bos(self) -> None:
|
353
|
+
self._need_bos = False
|
354
|
+
start_event = stt.SpeechEvent(
|
355
|
+
type=stt.SpeechEventType.START_OF_SPEECH,
|
356
|
+
)
|
357
|
+
self._event_queue.put_nowait(start_event)
|
358
|
+
|
359
|
+
def _send_eos(self) -> None:
|
360
|
+
self._need_eos = False
|
361
|
+
self._need_bos = True
|
362
|
+
|
363
|
+
if self._final_events:
|
364
|
+
lg = self._final_events[0].alternatives[0].language
|
365
|
+
|
366
|
+
sentence = ""
|
367
|
+
confidence = 0.0
|
368
|
+
for alt in self._final_events:
|
369
|
+
sentence += f"{alt.alternatives[0].text.strip()} "
|
370
|
+
confidence += alt.alternatives[0].confidence
|
371
|
+
|
372
|
+
sentence = sentence.rstrip()
|
373
|
+
confidence /= len(self._final_events) # avg. of confidence
|
374
|
+
|
375
|
+
end_event = stt.SpeechEvent(
|
376
|
+
type=stt.SpeechEventType.END_OF_SPEECH,
|
377
|
+
alternatives=[
|
378
|
+
stt.SpeechData(
|
379
|
+
language=lg,
|
380
|
+
start_time=self._final_events[0].alternatives[0].start_time,
|
381
|
+
end_time=self._final_events[-1].alternatives[0].end_time,
|
382
|
+
confidence=confidence,
|
383
|
+
text=sentence,
|
384
|
+
)
|
385
|
+
],
|
386
|
+
)
|
387
|
+
|
388
|
+
self._final_events = []
|
389
|
+
self._event_queue.put_nowait(end_event)
|
390
|
+
else:
|
391
|
+
end_event = stt.SpeechEvent(
|
392
|
+
type=stt.SpeechEventType.END_OF_SPEECH,
|
393
|
+
alternatives=[
|
394
|
+
stt.SpeechData(
|
395
|
+
language="",
|
396
|
+
start_time=0,
|
397
|
+
end_time=0,
|
398
|
+
confidence=0,
|
399
|
+
text="",
|
400
|
+
)
|
401
|
+
],
|
402
|
+
)
|
403
|
+
|
404
|
+
self._event_queue.put_nowait(end_event)
|
359
405
|
|
360
406
|
async def __anext__(self) -> stt.SpeechEvent:
|
361
407
|
evt = await self._event_queue.get()
|
@@ -365,38 +411,53 @@ class SpeechStream(stt.SpeechStream):
|
|
365
411
|
return evt
|
366
412
|
|
367
413
|
|
368
|
-
def
|
414
|
+
def _recognize_response_to_speech_event(
|
369
415
|
resp: cloud_speech.RecognizeResponse,
|
370
416
|
) -> stt.SpeechEvent:
|
371
|
-
|
372
|
-
|
417
|
+
text = ""
|
418
|
+
confidence = 0.0
|
419
|
+
for result in resp.results:
|
420
|
+
text += result.alternatives[0].transcript
|
421
|
+
confidence += result.alternatives[0].confidence
|
422
|
+
|
423
|
+
# not sure why start_offset and end_offset returns a timedelta
|
424
|
+
start_offset = resp.results[0].alternatives[0].words[0].start_offset
|
425
|
+
end_offset = resp.results[-1].alternatives[0].words[-1].end_offset
|
426
|
+
|
427
|
+
confidence /= len(resp.results)
|
428
|
+
lg = resp.results[0].language_code
|
373
429
|
return stt.SpeechEvent(
|
374
430
|
type=stt.SpeechEventType.FINAL_TRANSCRIPT,
|
375
431
|
alternatives=[
|
376
432
|
stt.SpeechData(
|
377
|
-
language=
|
378
|
-
start_time=
|
379
|
-
end_time=
|
380
|
-
confidence=
|
381
|
-
text=
|
433
|
+
language=lg,
|
434
|
+
start_time=start_offset.total_seconds(), # type: ignore
|
435
|
+
end_time=end_offset.total_seconds(), # type: ignore
|
436
|
+
confidence=confidence,
|
437
|
+
text=text,
|
382
438
|
)
|
383
|
-
for alt in gg_alts
|
384
439
|
],
|
385
440
|
)
|
386
441
|
|
387
442
|
|
388
|
-
def
|
443
|
+
def _streaming_recognize_response_to_speech_data(
|
389
444
|
resp: cloud_speech.StreamingRecognizeResponse,
|
390
|
-
) ->
|
391
|
-
|
392
|
-
|
393
|
-
|
394
|
-
|
395
|
-
|
396
|
-
|
397
|
-
|
398
|
-
|
399
|
-
|
400
|
-
|
401
|
-
|
402
|
-
|
445
|
+
) -> stt.SpeechData:
|
446
|
+
text = ""
|
447
|
+
confidence = 0.0
|
448
|
+
for result in resp.results:
|
449
|
+
text += result.alternatives[0].transcript
|
450
|
+
confidence += result.alternatives[0].confidence
|
451
|
+
|
452
|
+
confidence /= len(resp.results)
|
453
|
+
lg = resp.results[0].language_code
|
454
|
+
|
455
|
+
data = stt.SpeechData(
|
456
|
+
language=lg,
|
457
|
+
start_time=0,
|
458
|
+
end_time=0,
|
459
|
+
confidence=confidence,
|
460
|
+
text=text,
|
461
|
+
)
|
462
|
+
|
463
|
+
return data
|
@@ -0,0 +1,186 @@
|
|
1
|
+
# Copyright 2023 LiveKit, Inc.
|
2
|
+
#
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4
|
+
# you may not use this file except in compliance with the License.
|
5
|
+
# You may obtain a copy of the License at
|
6
|
+
#
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8
|
+
#
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12
|
+
# See the License for the specific language governing permissions and
|
13
|
+
# limitations under the License.
|
14
|
+
|
15
|
+
from __future__ import annotations
|
16
|
+
|
17
|
+
import asyncio
|
18
|
+
import contextlib
|
19
|
+
from dataclasses import dataclass
|
20
|
+
from typing import Optional, Union
|
21
|
+
|
22
|
+
from livekit import rtc
|
23
|
+
from livekit.agents import codecs, tts
|
24
|
+
|
25
|
+
from google.cloud import texttospeech
|
26
|
+
from google.cloud.texttospeech_v1.types import (
|
27
|
+
SsmlVoiceGender,
|
28
|
+
SynthesizeSpeechResponse,
|
29
|
+
)
|
30
|
+
|
31
|
+
from .log import logger
|
32
|
+
from .models import AudioEncoding, Gender, SpeechLanguages
|
33
|
+
|
34
|
+
LgType = Union[SpeechLanguages, str]
|
35
|
+
GenderType = Union[Gender, str]
|
36
|
+
AudioEncodingType = Union[AudioEncoding, str]
|
37
|
+
|
38
|
+
|
39
|
+
@dataclass
|
40
|
+
class _TTSOptions:
|
41
|
+
voice: texttospeech.VoiceSelectionParams
|
42
|
+
audio_config: texttospeech.AudioConfig
|
43
|
+
|
44
|
+
|
45
|
+
class TTS(tts.TTS):
|
46
|
+
def __init__(
|
47
|
+
self,
|
48
|
+
*,
|
49
|
+
language: LgType = "en-US",
|
50
|
+
gender: GenderType = "neutral",
|
51
|
+
voice_name: str = "", # Not required
|
52
|
+
encoding: AudioEncodingType = "linear16",
|
53
|
+
sample_rate: int = 24000,
|
54
|
+
speaking_rate: float = 1.0,
|
55
|
+
credentials_info: dict | None = None,
|
56
|
+
credentials_file: str | None = None,
|
57
|
+
) -> None:
|
58
|
+
"""
|
59
|
+
if no credentials is provided, it will use the credentials on the environment
|
60
|
+
GOOGLE_APPLICATION_CREDENTIALS (default behavior of Google TextToSpeechAsyncClient)
|
61
|
+
"""
|
62
|
+
super().__init__(
|
63
|
+
streaming_supported=False, sample_rate=sample_rate, num_channels=1
|
64
|
+
)
|
65
|
+
|
66
|
+
self._client: texttospeech.TextToSpeechAsyncClient | None = None
|
67
|
+
self._credentials_info = credentials_info
|
68
|
+
self._credentials_file = credentials_file
|
69
|
+
|
70
|
+
ssml_gender = SsmlVoiceGender.NEUTRAL
|
71
|
+
if gender == "male":
|
72
|
+
ssml_gender = SsmlVoiceGender.MALE
|
73
|
+
elif gender == "female":
|
74
|
+
ssml_gender = SsmlVoiceGender.FEMALE
|
75
|
+
|
76
|
+
voice = texttospeech.VoiceSelectionParams(
|
77
|
+
name=voice_name,
|
78
|
+
language_code=language,
|
79
|
+
ssml_gender=ssml_gender,
|
80
|
+
)
|
81
|
+
|
82
|
+
if encoding == "linear16" or encoding == "wav":
|
83
|
+
_audio_encoding = texttospeech.AudioEncoding.LINEAR16
|
84
|
+
elif encoding == "mp3":
|
85
|
+
_audio_encoding = texttospeech.AudioEncoding.MP3
|
86
|
+
else:
|
87
|
+
raise NotImplementedError(f"audio encoding {encoding} is not supported")
|
88
|
+
|
89
|
+
self._opts = _TTSOptions(
|
90
|
+
voice=voice,
|
91
|
+
audio_config=texttospeech.AudioConfig(
|
92
|
+
audio_encoding=_audio_encoding,
|
93
|
+
sample_rate_hertz=sample_rate,
|
94
|
+
speaking_rate=speaking_rate,
|
95
|
+
),
|
96
|
+
)
|
97
|
+
|
98
|
+
def _ensure_client(self) -> texttospeech.TextToSpeechAsyncClient:
|
99
|
+
if not self._client:
|
100
|
+
if self._credentials_info:
|
101
|
+
self._client = (
|
102
|
+
texttospeech.TextToSpeechAsyncClient.from_service_account_info(
|
103
|
+
self._credentials_info
|
104
|
+
)
|
105
|
+
)
|
106
|
+
|
107
|
+
elif self._credentials_file:
|
108
|
+
self._client = (
|
109
|
+
texttospeech.TextToSpeechAsyncClient.from_service_account_file(
|
110
|
+
self._credentials_file
|
111
|
+
)
|
112
|
+
)
|
113
|
+
else:
|
114
|
+
self._client = texttospeech.TextToSpeechAsyncClient()
|
115
|
+
|
116
|
+
assert self._client is not None
|
117
|
+
return self._client
|
118
|
+
|
119
|
+
def synthesize(
|
120
|
+
self,
|
121
|
+
text: str,
|
122
|
+
) -> "ChunkedStream":
|
123
|
+
return ChunkedStream(text, self._opts, self._ensure_client())
|
124
|
+
|
125
|
+
|
126
|
+
class ChunkedStream(tts.ChunkedStream):
|
127
|
+
def __init__(
|
128
|
+
self, text: str, opts: _TTSOptions, client: texttospeech.TextToSpeechAsyncClient
|
129
|
+
) -> None:
|
130
|
+
self._text = text
|
131
|
+
self._opts = opts
|
132
|
+
self._client = client
|
133
|
+
self._main_task: asyncio.Task | None = None
|
134
|
+
self._queue = asyncio.Queue[Optional[tts.SynthesizedAudio]]()
|
135
|
+
|
136
|
+
async def _run(self) -> None:
|
137
|
+
try:
|
138
|
+
response: SynthesizeSpeechResponse = await self._client.synthesize_speech(
|
139
|
+
input=texttospeech.SynthesisInput(text=self._text),
|
140
|
+
voice=self._opts.voice,
|
141
|
+
audio_config=self._opts.audio_config,
|
142
|
+
)
|
143
|
+
|
144
|
+
data = response.audio_content
|
145
|
+
if self._opts.audio_config.audio_encoding == "mp3":
|
146
|
+
decoder = codecs.Mp3StreamDecoder()
|
147
|
+
frames = decoder.decode_chunk(data)
|
148
|
+
for frame in frames:
|
149
|
+
self._queue.put_nowait(
|
150
|
+
tts.SynthesizedAudio(text=self._text, data=frame)
|
151
|
+
)
|
152
|
+
else:
|
153
|
+
self._queue.put_nowait(
|
154
|
+
tts.SynthesizedAudio(
|
155
|
+
text="",
|
156
|
+
data=rtc.AudioFrame(
|
157
|
+
data=data,
|
158
|
+
sample_rate=self._opts.audio_config.sample_rate_hertz,
|
159
|
+
num_channels=1,
|
160
|
+
samples_per_channel=len(data) // 2, # 16-bit
|
161
|
+
),
|
162
|
+
)
|
163
|
+
)
|
164
|
+
|
165
|
+
except Exception:
|
166
|
+
logger.exception("failed to synthesize")
|
167
|
+
finally:
|
168
|
+
self._queue.put_nowait(None)
|
169
|
+
|
170
|
+
async def __anext__(self) -> tts.SynthesizedAudio:
|
171
|
+
if not self._main_task:
|
172
|
+
self._main_task = asyncio.create_task(self._run())
|
173
|
+
|
174
|
+
frame = await self._queue.get()
|
175
|
+
if frame is None:
|
176
|
+
raise StopAsyncIteration
|
177
|
+
|
178
|
+
return frame
|
179
|
+
|
180
|
+
async def aclose(self) -> None:
|
181
|
+
if not self._main_task:
|
182
|
+
return
|
183
|
+
|
184
|
+
self._main_task.cancel()
|
185
|
+
with contextlib.suppress(asyncio.CancelledError):
|
186
|
+
await self._main_task
|
{livekit_plugins_google-0.4.dev0.dist-info → livekit_plugins_google-0.5.0.dist-info}/METADATA
RENAMED
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: livekit-plugins-google
|
3
|
-
Version: 0.
|
3
|
+
Version: 0.5.0
|
4
4
|
Summary: Agent Framework plugin for services from Google Cloud
|
5
5
|
Home-page: https://github.com/livekit/agents
|
6
6
|
License: Apache-2.0
|
@@ -30,7 +30,7 @@ Requires-Dist: google-cloud-texttospeech <3,>=2
|
|
30
30
|
Requires-Dist: google-cloud-translate <4,>=3
|
31
31
|
Requires-Dist: googleapis-common-protos <2,>=1
|
32
32
|
Requires-Dist: livekit ~=0.11
|
33
|
-
Requires-Dist: livekit-agents ~=0.
|
33
|
+
Requires-Dist: livekit-agents ~=0.7.0
|
34
34
|
|
35
35
|
# LiveKit Plugins Google
|
36
36
|
|
@@ -0,0 +1,11 @@
|
|
1
|
+
livekit/plugins/google/__init__.py,sha256=DlQC5cosMFyQlM8_vFvJGoZiziFkd0Sa4mutnsxXyZM,959
|
2
|
+
livekit/plugins/google/log.py,sha256=GI3YWN5YzrafnUccljzPRS_ZALkMNk1i21IRnTl2vNA,69
|
3
|
+
livekit/plugins/google/models.py,sha256=n8pgTJ7xyJpPCZJ_y0GzaQq6LqYknL6K6trpi07-AxM,1307
|
4
|
+
livekit/plugins/google/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
5
|
+
livekit/plugins/google/stt.py,sha256=GfWita3mgLZG2KpS9WYMCL8jwCNN5qukicpI58zPCcY,16058
|
6
|
+
livekit/plugins/google/tts.py,sha256=J3V5aDUz0V2_Dfs16pobDVx7XwQqU1AEM8TWXdaDn9w,6182
|
7
|
+
livekit/plugins/google/version.py,sha256=pZ7bgeWLjw4VCWymU1ntHaHorKRusUkm56y6tZe5gmQ,600
|
8
|
+
livekit_plugins_google-0.5.0.dist-info/METADATA,sha256=Hf7P77h8fLEnGsNj4rUdSA_mSL4sCv5pMktzPoTsCbk,1941
|
9
|
+
livekit_plugins_google-0.5.0.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
|
10
|
+
livekit_plugins_google-0.5.0.dist-info/top_level.txt,sha256=OoDok3xUmXbZRvOrfvvXB-Juu4DX79dlq188E19YHoo,8
|
11
|
+
livekit_plugins_google-0.5.0.dist-info/RECORD,,
|
@@ -1,10 +0,0 @@
|
|
1
|
-
livekit/plugins/google/__init__.py,sha256=snPMHNLrurYbLWQOkV_o6qG1CEWsOCZ8ZfPMvmh5ejY,931
|
2
|
-
livekit/plugins/google/log.py,sha256=GI3YWN5YzrafnUccljzPRS_ZALkMNk1i21IRnTl2vNA,69
|
3
|
-
livekit/plugins/google/models.py,sha256=DgiXOvGDO8D9rfCKHJL28lbyQR8mXXB2kpku-szXLRs,1185
|
4
|
-
livekit/plugins/google/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
5
|
-
livekit/plugins/google/stt.py,sha256=sPV4ByAxfeGBNvAGIuwZvheEA0k7NYjXR_UiYWjd39Y,15029
|
6
|
-
livekit/plugins/google/version.py,sha256=OwSbVTqWUJKy9w2Jbh1MIrp5cHPvEYsLXDhRGwdZKso,603
|
7
|
-
livekit_plugins_google-0.4.dev0.dist-info/METADATA,sha256=i4l7y8p0Y57kK5oPQqF-8lp9SFvPL0nDVsuDkktHJN4,1947
|
8
|
-
livekit_plugins_google-0.4.dev0.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
|
9
|
-
livekit_plugins_google-0.4.dev0.dist-info/top_level.txt,sha256=OoDok3xUmXbZRvOrfvvXB-Juu4DX79dlq188E19YHoo,8
|
10
|
-
livekit_plugins_google-0.4.dev0.dist-info/RECORD,,
|
File without changes
|
{livekit_plugins_google-0.4.dev0.dist-info → livekit_plugins_google-0.5.0.dist-info}/top_level.txt
RENAMED
File without changes
|