livekit-plugins-google 0.7.2__py3-none-any.whl → 0.8.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- livekit/plugins/google/models.py +7 -1
- livekit/plugins/google/stt.py +231 -120
- livekit/plugins/google/tts.py +24 -9
- livekit/plugins/google/version.py +1 -1
- {livekit_plugins_google-0.7.2.dist-info → livekit_plugins_google-0.8.1.dist-info}/METADATA +5 -5
- livekit_plugins_google-0.8.1.dist-info/RECORD +11 -0
- {livekit_plugins_google-0.7.2.dist-info → livekit_plugins_google-0.8.1.dist-info}/WHEEL +1 -1
- livekit_plugins_google-0.7.2.dist-info/RECORD +0 -11
- {livekit_plugins_google-0.7.2.dist-info → livekit_plugins_google-0.8.1.dist-info}/top_level.txt +0 -0
livekit/plugins/google/models.py
CHANGED
@@ -3,7 +3,13 @@ from typing import Literal
 
 # Speech to Text v2
 
 SpeechModels = Literal[
-    "long",
+    "long",
+    "short",
+    "telephony",
+    "medical_dictation",
+    "medical_conversation",
+    "chirp",
+    "chirp_2",
 ]
 
 SpeechLanguages = Literal[
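`SpeechModels` now admits the full set of Speech-to-Text v2 recognizer models instead of only `"long"`. A minimal sketch of selecting one of the new values (assuming the plugin exposes `STT` at the package root, as the stt.py diff below suggests):

    from livekit.plugins import google

    # "chirp_2" is one of the newly allowed literals; a typo such as
    # "chirp2" would now be flagged by a type checker, since
    # SpeechModels is a Literal type.
    google_stt = google.STT(model="chirp_2")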
livekit/plugins/google/stt.py
CHANGED
@@ -16,18 +16,22 @@ from __future__ import annotations
 
 import asyncio
 import dataclasses
+import weakref
 from dataclasses import dataclass
-from typing import
+from typing import List, Union
 
-from livekit import
+from livekit import rtc
 from livekit.agents import (
+    DEFAULT_API_CONNECT_OPTIONS,
     APIConnectionError,
+    APIConnectOptions,
     APIStatusError,
     APITimeoutError,
     stt,
     utils,
 )
 
+from google.api_core.client_options import ClientOptions
 from google.api_core.exceptions import DeadlineExceeded, GoogleAPICallError
 from google.auth import default as gauth_default
 from google.auth.exceptions import DefaultCredentialsError
@@ -50,6 +54,26 @@ class STTOptions:
     punctuate: bool
     spoken_punctuation: bool
     model: SpeechModels
+    sample_rate: int
+    keywords: List[tuple[str, float]] | None
+
+    def build_adaptation(self) -> cloud_speech.SpeechAdaptation | None:
+        if self.keywords:
+            return cloud_speech.SpeechAdaptation(
+                phrase_sets=[
+                    cloud_speech.SpeechAdaptation.AdaptationPhraseSet(
+                        inline_phrase_set=cloud_speech.PhraseSet(
+                            phrases=[
+                                cloud_speech.PhraseSet.Phrase(
+                                    value=keyword, boost=boost
+                                )
+                                for keyword, boost in self.keywords
+                            ]
+                        )
+                    )
+                ]
+            )
+        return None
 
 
 class STT(stt.STT):
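The new `keywords` field feeds Cloud Speech-to-Text v2 speech adaptation: each `(phrase, boost)` pair becomes an inline `PhraseSet.Phrase`, biasing recognition toward that vocabulary. A usage sketch (Google documents boost values in the 0-20 range; the `google.STT` import path is assumed from the package layout):

    from livekit.plugins import google

    # Bias the recognizer toward product names it would otherwise mishear.
    google_stt = google.STT(
        keywords=[("LiveKit", 15.0), ("WebRTC", 10.0)],
    )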
@@ -62,8 +86,11 @@ class STT(stt.STT):
         punctuate: bool = True,
         spoken_punctuation: bool = True,
         model: SpeechModels = "long",
+        location: str = "global",
+        sample_rate: int = 16000,
         credentials_info: dict | None = None,
         credentials_file: str | None = None,
+        keywords: List[tuple[str, float]] | None = None,
     ):
         """
         Create a new instance of Google STT.
@@ -77,6 +104,7 @@ class STT(stt.STT):
         )
 
         self._client: SpeechAsyncClient | None = None
+        self._location = location
         self._credentials_info = credentials_info
         self._credentials_file = credentials_file
 
@@ -100,7 +128,10 @@ class STT(stt.STT):
             punctuate=punctuate,
             spoken_punctuation=spoken_punctuation,
             model=model,
+            sample_rate=sample_rate,
+            keywords=keywords,
         )
+        self._streams = weakref.WeakSet[SpeechStream]()
 
     def _ensure_client(self) -> SpeechAsyncClient:
         if self._credentials_info:
@@ -111,9 +142,16 @@ class STT(stt.STT):
             self._client = SpeechAsyncClient.from_service_account_file(
                 self._credentials_file
             )
-        else:
+        elif self._location == "global":
             self._client = SpeechAsyncClient()
-
+        else:
+            # Add support for passing a specific location that matches recognizer
+            # see: https://cloud.google.com/speech-to-text/v2/docs/speech-to-text-supported-languages
+            self._client = SpeechAsyncClient(
+                client_options=ClientOptions(
+                    api_endpoint=f"{self._location}-speech.googleapis.com"
+                )
+            )
         assert self._client is not None
         return self._client
 
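Regional recognizers need both a matching endpoint and a matching resource path: `_ensure_client` handles the endpoint by pointing the client at `{location}-speech.googleapis.com`, and `_recognizer` (next hunk) builds the path. A sketch of opting into a regional deployment:

    from livekit.plugins import google

    # Requests now target us-central1-speech.googleapis.com and the
    # recognizer path projects/<project_id>/locations/us-central1/recognizers/_
    google_stt = google.STT(location="us-central1")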
@@ -129,7 +167,7 @@ class STT(stt.STT):
         from google.auth import default as ga_default
 
         _, project_id = ga_default()
-        return f"projects/{project_id}/locations/
+        return f"projects/{project_id}/locations/{self._location}/recognizers/_"
 
     def _sanitize_options(self, *, language: str | None = None) -> STTOptions:
         config = dataclasses.replace(self._config)
@@ -152,10 +190,11 @@ class STT(stt.STT):
         self,
         buffer: utils.AudioBuffer,
         *,
-        language: SpeechLanguages | str | None
+        language: SpeechLanguages | str | None,
+        conn_options: APIConnectOptions,
     ) -> stt.SpeechEvent:
         config = self._sanitize_options(language=language)
-        frame =
+        frame = rtc.combine_audio_frames(buffer)
 
         config = cloud_speech.RecognitionConfig(
             explicit_decoding_config=cloud_speech.ExplicitDecodingConfig(
@@ -163,6 +202,7 @@ class STT(stt.STT):
                 sample_rate_hertz=frame.sample_rate,
                 audio_channel_count=frame.num_channels,
             ),
+            adaptation=config.build_adaptation(),
             features=cloud_speech.RecognitionFeatures(
                 enable_automatic_punctuation=config.punctuate,
                 enable_spoken_punctuation=config.spoken_punctuation,
@@ -178,7 +218,8 @@ class STT(stt.STT):
                 recognizer=self._recognizer,
                 config=config,
                 content=frame.data.tobytes(),
-            )
+            ),
+            timeout=conn_options.timeout,
         )
 
         return _recognize_response_to_speech_event(raw)
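Batch recognition now threads the framework's `APIConnectOptions` down to the RPC, so the request deadline is controlled per call via `conn_options.timeout`. A hedged sketch (the public `recognize` wrapper and the `timeout` field are assumed from livekit-agents' base `stt.STT` and `APIConnectOptions`; `buffer` is any `utils.AudioBuffer`):

    from livekit.agents import APIConnectOptions
    from livekit.plugins import google

    async def transcribe(buffer) -> None:
        google_stt = google.STT()
        # conn_options.timeout becomes the gRPC deadline for this request
        event = await google_stt.recognize(
            buffer, conn_options=APIConnectOptions(timeout=30.0)
        )
        print(event.alternatives[0].text)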
@@ -195,150 +236,220 @@ class STT(stt.STT):
             raise APIConnectionError() from e
 
     def stream(
-        self,
+        self,
+        *,
+        language: SpeechLanguages | str | None = None,
+        conn_options: APIConnectOptions = DEFAULT_API_CONNECT_OPTIONS,
     ) -> "SpeechStream":
         config = self._sanitize_options(language=language)
-
+        stream = SpeechStream(
+            stt=self,
+            client=self._ensure_client(),
+            recognizer=self._recognizer,
+            config=config,
+            conn_options=conn_options,
+        )
+        self._streams.add(stream)
+        return stream
+
+    def update_options(
+        self,
+        *,
+        languages: LanguageCode | None = None,
+        detect_language: bool | None = None,
+        interim_results: bool | None = None,
+        punctuate: bool | None = None,
+        spoken_punctuation: bool | None = None,
+        model: SpeechModels | None = None,
+        location: str | None = None,
+        keywords: List[tuple[str, float]] | None = None,
+    ):
+        if languages is not None:
+            if isinstance(languages, str):
+                languages = [languages]
+            self._config.languages = languages
+        if detect_language is not None:
+            self._config.detect_language = detect_language
+        if interim_results is not None:
+            self._config.interim_results = interim_results
+        if punctuate is not None:
+            self._config.punctuate = punctuate
+        if spoken_punctuation is not None:
+            self._config.spoken_punctuation = spoken_punctuation
+        if model is not None:
+            self._config.model = model
+        if keywords is not None:
+            self._config.keywords = keywords
+
+        for stream in self._streams:
+            stream.update_options(
+                languages=languages,
+                detect_language=detect_language,
+                interim_results=interim_results,
+                punctuate=punctuate,
+                spoken_punctuation=spoken_punctuation,
+                model=model,
+                location=location,
+                keywords=keywords,
+            )
 
 
 class SpeechStream(stt.SpeechStream):
     def __init__(
         self,
+        *,
         stt: STT,
+        conn_options: APIConnectOptions,
         client: SpeechAsyncClient,
         recognizer: str,
         config: STTOptions,
-        sample_rate: int = 48000,
-        num_channels: int = 1,
-        max_retry: int = 32,
     ) -> None:
-        super().__init__(
+        super().__init__(
+            stt=stt, conn_options=conn_options, sample_rate=config.sample_rate
+        )
 
         self._client = client
         self._recognizer = recognizer
         self._config = config
-        self.
-        self._num_channels = num_channels
-        self._max_retry = max_retry
-
-        self._streaming_config = cloud_speech.StreamingRecognitionConfig(
-            config=cloud_speech.RecognitionConfig(
-                explicit_decoding_config=cloud_speech.ExplicitDecodingConfig(
-                    encoding=cloud_speech.ExplicitDecodingConfig.AudioEncoding.LINEAR16,
-                    sample_rate_hertz=self._sample_rate,
-                    audio_channel_count=self._num_channels,
-                ),
-                language_codes=self._config.languages,
-                model=self._config.model,
-                features=cloud_speech.RecognitionFeatures(
-                    enable_automatic_punctuation=self._config.punctuate,
-                    enable_word_time_offsets=True,
-                ),
-            ),
-            streaming_features=cloud_speech.StreamingRecognitionFeatures(
-                enable_voice_activity_events=True,
-                interim_results=self._config.interim_results,
-            ),
-        )
+        self._reconnect_event = asyncio.Event()
 
-
-
-
-
-
-
-
+    def update_options(
+        self,
+        *,
+        languages: LanguageCode | None = None,
+        detect_language: bool | None = None,
+        interim_results: bool | None = None,
+        punctuate: bool | None = None,
+        spoken_punctuation: bool | None = None,
+        model: SpeechModels | None = None,
+        location: str | None = None,
+        keywords: List[tuple[str, float]] | None = None,
+    ):
+        if languages is not None:
+            if isinstance(languages, str):
+                languages = [languages]
+            self._config.languages = languages
+        if detect_language is not None:
+            self._config.detect_language = detect_language
+        if interim_results is not None:
+            self._config.interim_results = interim_results
+        if punctuate is not None:
+            self._config.punctuate = punctuate
+        if spoken_punctuation is not None:
+            self._config.spoken_punctuation = spoken_punctuation
+        if model is not None:
+            self._config.model = model
+        if keywords is not None:
+            self._config.keywords = keywords
+
+        self._reconnect_event.set()
+
+    async def _run(self) -> None:
+        # google requires a async generator when calling streaming_recognize
+        # this function basically convert the queue into a async generator
+        async def input_generator():
             try:
-                #
-
-
-
-
+                # first request should contain the config
+                yield cloud_speech.StreamingRecognizeRequest(
+                    recognizer=self._recognizer,
+                    streaming_config=self._streaming_config,
+                )
+
+                async for frame in self._input_ch:
+                    if isinstance(frame, rtc.AudioFrame):
                         yield cloud_speech.StreamingRecognizeRequest(
-
-                    streaming_config=self._streaming_config,
+                            audio=frame.data.tobytes()
                         )
 
-
-
-
-
-            )
-            yield cloud_speech.StreamingRecognizeRequest(
-                audio=frame.data.tobytes()
-            )
+            except Exception:
+                logger.exception(
+                    "an error occurred while streaming input to google STT"
+                )
 
-
-
-
+        async def process_stream(stream):
+            async for resp in stream:
+                if (
+                    resp.speech_event_type
+                    == cloud_speech.StreamingRecognizeResponse.SpeechEventType.SPEECH_ACTIVITY_BEGIN
+                ):
+                    self._event_ch.send_nowait(
+                        stt.SpeechEvent(type=stt.SpeechEventType.START_OF_SPEECH)
+                    )
+
+                if (
+                    resp.speech_event_type
+                    == cloud_speech.StreamingRecognizeResponse.SpeechEventType.SPEECH_EVENT_TYPE_UNSPECIFIED
+                ):
+                    result = resp.results[0]
+                    speech_data = _streaming_recognize_response_to_speech_data(resp)
+                    if speech_data is None:
+                        continue
+
+                    if not result.is_final:
+                        self._event_ch.send_nowait(
+                            stt.SpeechEvent(
+                                type=stt.SpeechEventType.INTERIM_TRANSCRIPT,
+                                alternatives=[speech_data],
+                            )
+                        )
+                    else:
+                        self._event_ch.send_nowait(
+                            stt.SpeechEvent(
+                                type=stt.SpeechEventType.FINAL_TRANSCRIPT,
+                                alternatives=[speech_data],
+                            )
                         )
 
-
-
-
-            )
-
-
-                await self._run_stream(stream)
-            except Exception as e:
-                if retry_count >= max_retry:
-                    logger.error(
-                        f"failed to connect to google stt after {max_retry} tries",
-                        exc_info=e,
+                if (
+                    resp.speech_event_type
+                    == cloud_speech.StreamingRecognizeResponse.SpeechEventType.SPEECH_ACTIVITY_END
+                ):
+                    self._event_ch.send_nowait(
+                        stt.SpeechEvent(type=stt.SpeechEventType.END_OF_SPEECH)
                     )
-                    break
 
-
-
-
-
-
+        while True:
+            try:
+                self._streaming_config = cloud_speech.StreamingRecognitionConfig(
+                    config=cloud_speech.RecognitionConfig(
+                        explicit_decoding_config=cloud_speech.ExplicitDecodingConfig(
+                            encoding=cloud_speech.ExplicitDecodingConfig.AudioEncoding.LINEAR16,
+                            sample_rate_hertz=self._config.sample_rate,
+                            audio_channel_count=1,
+                        ),
+                        adaptation=self._config.build_adaptation(),
+                        language_codes=self._config.languages,
+                        model=self._config.model,
+                        features=cloud_speech.RecognitionFeatures(
+                            enable_automatic_punctuation=self._config.punctuate,
+                            enable_word_time_offsets=True,
+                        ),
+                    ),
+                    streaming_features=cloud_speech.StreamingRecognitionFeatures(
+                        enable_voice_activity_events=True,
+                        interim_results=self._config.interim_results,
+                    ),
                 )
-                await asyncio.sleep(retry_delay)
 
-
-        ):
-            async for resp in stream:
-                if (
-                    resp.speech_event_type
-                    == cloud_speech.StreamingRecognizeResponse.SpeechEventType.SPEECH_ACTIVITY_BEGIN
-                ):
-                    self._event_ch.send_nowait(
-                        stt.SpeechEvent(type=stt.SpeechEventType.START_OF_SPEECH)
+                stream = await self._client.streaming_recognize(
+                    requests=input_generator(),
                 )
 
-
-
-
-
-
-
-                    if speech_data is None:
-                        continue
-
-                    if not result.is_final:
-                        self._event_ch.send_nowait(
-                            stt.SpeechEvent(
-                                type=stt.SpeechEventType.INTERIM_TRANSCRIPT,
-                                alternatives=[speech_data],
-                            )
+                process_stream_task = asyncio.create_task(process_stream(stream))
+                wait_reconnect_task = asyncio.create_task(self._reconnect_event.wait())
+                try:
+                    await asyncio.wait(
+                        [process_stream_task, wait_reconnect_task],
+                        return_when=asyncio.FIRST_COMPLETED,
                     )
-
-
-
-                                type=stt.SpeechEventType.FINAL_TRANSCRIPT,
-                                alternatives=[speech_data],
-                            )
+                finally:
+                    await utils.aio.gracefully_cancel(
+                        process_stream_task, wait_reconnect_task
                     )
-
-
-
-
-                ):
-                    self._event_ch.send_nowait(
-                        stt.SpeechEvent(type=stt.SpeechEventType.END_OF_SPEECH)
-                    )
+            finally:
+                if not self._reconnect_event.is_set():
+                    break
+                self._reconnect_event.clear()
 
 
 def _recognize_response_to_speech_event(
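The streaming rewrite replaces the old retry counter with a reconnect loop: `update_options` mutates the shared config and sets `_reconnect_event`, and `_run` cancels the in-flight `process_stream` task, rebuilds `StreamingRecognitionConfig`, and reopens the gRPC stream. Because `STT` keeps live streams in a `weakref.WeakSet`, one call fans out to every open session. A usage sketch:

    from livekit.plugins import google

    google_stt = google.STT(model="long")
    stream = google_stt.stream()

    # Later, e.g. after detecting a language switch: every live
    # SpeechStream tracked in the WeakSet picks up the new config
    # and transparently reconnects.
    google_stt.update_options(languages="fr-FR", model="chirp_2")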
livekit/plugins/google/tts.py
CHANGED
@@ -18,7 +18,9 @@ from dataclasses import dataclass
 
 from livekit import rtc
 from livekit.agents import (
+    DEFAULT_API_CONNECT_OPTIONS,
     APIConnectionError,
+    APIConnectOptions,
     APIStatusError,
     APITimeoutError,
     tts,
@@ -134,7 +136,7 @@ class TTS(tts.TTS):
         self._opts.audio_config.speaking_rate = speaking_rate
 
     def _ensure_client(self) -> texttospeech.TextToSpeechAsyncClient:
-        if
+        if self._client is None:
             if self._credentials_info:
                 self._client = (
                     texttospeech.TextToSpeechAsyncClient.from_service_account_info(
@@ -154,22 +156,35 @@ class TTS(tts.TTS):
         assert self._client is not None
         return self._client
 
-    def synthesize(
-
+    def synthesize(
+        self,
+        text: str,
+        *,
+        conn_options: APIConnectOptions = DEFAULT_API_CONNECT_OPTIONS,
+    ) -> "ChunkedStream":
+        return ChunkedStream(
+            tts=self,
+            input_text=text,
+            conn_options=conn_options,
+            opts=self._opts,
+            client=self._ensure_client(),
+        )
 
 
 class ChunkedStream(tts.ChunkedStream):
     def __init__(
         self,
+        *,
         tts: TTS,
-
+        input_text: str,
+        conn_options: APIConnectOptions,
         opts: _TTSOptions,
         client: texttospeech.TextToSpeechAsyncClient,
     ) -> None:
-        super().__init__(tts,
+        super().__init__(tts=tts, input_text=input_text, conn_options=conn_options)
         self._opts, self._client = opts, client
 
-    async def
+    async def _run(self) -> None:
         request_id = utils.shortuuid()
 
         try:
@@ -177,16 +192,16 @@ class ChunkedStream(tts.ChunkedStream):
                 input=texttospeech.SynthesisInput(text=self._input_text),
                 voice=self._opts.voice,
                 audio_config=self._opts.audio_config,
+                timeout=self._conn_options.timeout,
             )
 
-            data = response.audio_content
             if self._opts.audio_config.audio_encoding == "mp3":
                 decoder = utils.codecs.Mp3StreamDecoder()
                 bstream = utils.audio.AudioByteStream(
                     sample_rate=self._opts.audio_config.sample_rate_hertz,
                     num_channels=1,
                 )
-                for frame in decoder.decode_chunk(
+                for frame in decoder.decode_chunk(response.audio_content):
                     for frame in bstream.write(frame.data.tobytes()):
                         self._event_ch.send_nowait(
                             tts.SynthesizedAudio(request_id=request_id, frame=frame)
@@ -197,7 +212,7 @@ class ChunkedStream(tts.ChunkedStream):
                             tts.SynthesizedAudio(request_id=request_id, frame=frame)
                         )
             else:
-                data =
+                data = response.audio_content[44:]  # skip WAV header
                 self._event_ch.send_nowait(
                     tts.SynthesizedAudio(
                         request_id=request_id,
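`synthesize` now returns a `ChunkedStream` carrying `APIConnectOptions`, with the timeout forwarded to `synthesize_speech`. The `[44:]` slice exists because linear-PCM responses arrive as WAV files, and dropping the fixed 44-byte RIFF header leaves raw samples. A usage sketch (treating `ChunkedStream` as an async iterator of `tts.SynthesizedAudio`, as assumed from the livekit-agents base class; `play_frame` is a hypothetical sink):

    from livekit.agents import APIConnectOptions
    from livekit.plugins import google

    async def speak(play_frame) -> None:
        tts_engine = google.TTS()
        stream = tts_engine.synthesize(
            "Hello from LiveKit",
            conn_options=APIConnectOptions(timeout=15.0),
        )
        async for audio in stream:
            play_frame(audio.frame)  # audio.frame is an rtc.AudioFrame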
{livekit_plugins_google-0.7.2.dist-info → livekit_plugins_google-0.8.1.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: livekit-plugins-google
-Version: 0.7.2
+Version: 0.8.1
 Summary: Agent Framework plugin for services from Google Cloud
 Home-page: https://github.com/livekit/agents
 License: Apache-2.0
@@ -19,10 +19,10 @@ Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3 :: Only
 Requires-Python: >=3.9.0
 Description-Content-Type: text/markdown
-Requires-Dist: google-auth
-Requires-Dist: google-cloud-speech
-Requires-Dist: google-cloud-texttospeech
-Requires-Dist: livekit-agents
+Requires-Dist: google-auth<3,>=2
+Requires-Dist: google-cloud-speech<3,>=2
+Requires-Dist: google-cloud-texttospeech<3,>=2
+Requires-Dist: livekit-agents>=0.11
 
 # LiveKit Plugins Google
 
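The previously unpinned dependencies are now constrained: the three Google client libraries are held to their 2.x majors, and livekit-agents must be at least 0.11, which matches the new `APIConnectOptions` / `DEFAULT_API_CONNECT_OPTIONS` imports in stt.py and tts.py above. Upgrading with `pip install --upgrade "livekit-plugins-google==0.8.1"` pulls compatible versions of all four.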
livekit_plugins_google-0.8.1.dist-info/RECORD
ADDED
@@ -0,0 +1,11 @@
+livekit/plugins/google/__init__.py,sha256=rqV6C5mFNDFlrA2IcGJrsebr2VxQwMzoDUjY1JhMBZM,1117
+livekit/plugins/google/log.py,sha256=GI3YWN5YzrafnUccljzPRS_ZALkMNk1i21IRnTl2vNA,69
+livekit/plugins/google/models.py,sha256=cBXhZGY9bFaSCyL9VeSng9wsxhf3peJi3AUYBKV-8GQ,1343
+livekit/plugins/google/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+livekit/plugins/google/stt.py,sha256=tmjktdO6C2AuJWHSKl20ae3cfy_DqfN_oNYYcE552pQ,18566
+livekit/plugins/google/tts.py,sha256=95qXCigVQYWNbcN3pIKBpIah4b31U_MWtXv5Ji0AMc4,9229
+livekit/plugins/google/version.py,sha256=PoHw-_DNE2B5SpeoQ-r6HSfVmbDgYuGamg0dN2jhayQ,600
+livekit_plugins_google-0.8.1.dist-info/METADATA,sha256=RHRMpfHxvaWjwWStByUPghWBLY5tIuC5Lm8r9C3hEhc,1643
+livekit_plugins_google-0.8.1.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
+livekit_plugins_google-0.8.1.dist-info/top_level.txt,sha256=OoDok3xUmXbZRvOrfvvXB-Juu4DX79dlq188E19YHoo,8
+livekit_plugins_google-0.8.1.dist-info/RECORD,,
livekit_plugins_google-0.7.2.dist-info/RECORD
REMOVED
@@ -1,11 +0,0 @@
-livekit/plugins/google/__init__.py,sha256=rqV6C5mFNDFlrA2IcGJrsebr2VxQwMzoDUjY1JhMBZM,1117
-livekit/plugins/google/log.py,sha256=GI3YWN5YzrafnUccljzPRS_ZALkMNk1i21IRnTl2vNA,69
-livekit/plugins/google/models.py,sha256=n8pgTJ7xyJpPCZJ_y0GzaQq6LqYknL6K6trpi07-AxM,1307
-livekit/plugins/google/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-livekit/plugins/google/stt.py,sha256=XNU9G0DKc-joOMdhgfJJ2u6IZ3JJ33Wi-XmdqX426fg,14198
-livekit/plugins/google/tts.py,sha256=hRN8ul1lDXU8LPVEfbTszgBiRYsifZXCPMwk-Pv2KeA,8793
-livekit/plugins/google/version.py,sha256=wNTnO8L3jrMdUjS-xAEFoMTKPaPYiFY9Kxnvzm4hTBc,600
-livekit_plugins_google-0.7.2.dist-info/METADATA,sha256=ohgXDVPUSOXfZ8AA7PQhC5RU5huOaZF9dq9GDDRO0-E,1647
-livekit_plugins_google-0.7.2.dist-info/WHEEL,sha256=P9jw-gEje8ByB7_hXoICnHtVCrEwMQh-630tKvQWehc,91
-livekit_plugins_google-0.7.2.dist-info/top_level.txt,sha256=OoDok3xUmXbZRvOrfvvXB-Juu4DX79dlq188E19YHoo,8
-livekit_plugins_google-0.7.2.dist-info/RECORD,,
{livekit_plugins_google-0.7.2.dist-info → livekit_plugins_google-0.8.1.dist-info}/top_level.txt
RENAMED
File without changes