livekit-plugins-google 1.3.8__py3-none-any.whl → 1.3.11__py3-none-any.whl
This diff shows the contents of two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the packages exactly as they appear in the public registry.
- livekit/plugins/google/__init__.py +11 -3
- livekit/plugins/google/llm.py +142 -81
- livekit/plugins/google/models.py +15 -1
- livekit/plugins/google/realtime/api_proto.py +12 -10
- livekit/plugins/google/realtime/realtime_api.py +25 -28
- livekit/plugins/google/stt.py +281 -93
- livekit/plugins/google/tools.py +69 -9
- livekit/plugins/google/tts.py +17 -9
- livekit/plugins/google/utils.py +21 -87
- livekit/plugins/google/version.py +1 -1
- {livekit_plugins_google-1.3.8.dist-info → livekit_plugins_google-1.3.11.dist-info}/METADATA +1 -1
- livekit_plugins_google-1.3.11.dist-info/RECORD +18 -0
- livekit_plugins_google-1.3.8.dist-info/RECORD +0 -18
- {livekit_plugins_google-1.3.8.dist-info → livekit_plugins_google-1.3.11.dist-info}/WHEEL +0 -0
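Most of the change is in stt.py, which now selects between Google's Speech-to-Text v1 and v2 clients based on the configured model and surfaces word-level timestamps. A minimal usage sketch of the constructor options this release touches, assuming the usual livekit.plugins.google entry point (the exact model-to-API mapping lives in models.py and is not reproduced in this diff):

    from livekit.plugins import google

    # enable_word_time_offsets now defaults to NOT_GIVEN: it is turned on
    # automatically where supported and forced off (with a warning) for chirp_3,
    # which has no word-level timing.
    stt = google.STT(
        model="latest_long",            # mapped to the v1 or v2 client via models.py
        enable_word_time_offsets=True,  # ignored for model="chirp_3"
        interim_results=True,
    )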
livekit/plugins/google/stt.py
CHANGED
@@ -21,14 +21,16 @@ import weakref
 from collections.abc import AsyncGenerator, AsyncIterable
 from dataclasses import dataclass
 from datetime import timedelta
-from typing import Callable, Union, cast
+from typing import Callable, Union, cast, get_args

 from google.api_core.client_options import ClientOptions
 from google.api_core.exceptions import DeadlineExceeded, GoogleAPICallError
 from google.auth import default as gauth_default
 from google.auth.exceptions import DefaultCredentialsError
-from google.cloud.speech_v2 import SpeechAsyncClient
-from google.cloud.speech_v2.types import cloud_speech
+from google.cloud.speech_v1 import SpeechAsyncClient as SpeechAsyncClientV1
+from google.cloud.speech_v1.types import cloud_speech as cloud_speech_v1, resource as resource_v1
+from google.cloud.speech_v2 import SpeechAsyncClient as SpeechAsyncClientV2
+from google.cloud.speech_v2.types import cloud_speech as cloud_speech_v2
 from google.protobuf.duration_pb2 import Duration
 from livekit import rtc
 from livekit.agents import (
@@ -45,9 +47,10 @@ from livekit.agents.types import (
     NotGivenOr,
 )
 from livekit.agents.utils import is_given
+from livekit.agents.voice.io import TimedString

 from .log import logger
-from .models import SpeechLanguages, SpeechModels
+from .models import SpeechLanguages, SpeechModels, SpeechModelsV2

 LgType = Union[SpeechLanguages, str]
 LanguageCode = Union[LgType, list[LgType]]
@@ -76,17 +79,35 @@ class STTOptions:
     min_confidence_threshold: float
     keywords: NotGivenOr[list[tuple[str, float]]] = NOT_GIVEN

-    def build_adaptation(self) -> cloud_speech.SpeechAdaptation | None:
+    @property
+    def version(self) -> int:
+        return 2 if self.model in get_args(SpeechModelsV2) else 1
+
+    def build_adaptation(
+        self,
+    ) -> cloud_speech_v2.SpeechAdaptation | resource_v1.SpeechAdaptation | None:
         if is_given(self.keywords):
-            return cloud_speech.SpeechAdaptation(
-                phrase_sets=[
-                    cloud_speech.SpeechAdaptation.AdaptationPhraseSet(
-                        inline_phrase_set=cloud_speech.PhraseSet(
-                            phrases=[
-                                cloud_speech.PhraseSet.Phrase(value=keyword, boost=boost)
-                                for keyword, boost in self.keywords
-                            ]
+            if self.version == 2:
+                return cloud_speech_v2.SpeechAdaptation(
+                    phrase_sets=[
+                        cloud_speech_v2.SpeechAdaptation.AdaptationPhraseSet(
+                            inline_phrase_set=cloud_speech_v2.PhraseSet(
+                                phrases=[
+                                    cloud_speech_v2.PhraseSet.Phrase(value=keyword, boost=boost)
+                                    for keyword, boost in self.keywords
+                                ]
+                            )
                         )
+                    ]
+                )
+            return resource_v1.SpeechAdaptation(
+                phrase_sets=[
+                    resource_v1.PhraseSet(
+                        name="keywords",
+                        phrases=[
+                            resource_v1.PhraseSet.Phrase(value=keyword, boost=boost)
+                            for keyword, boost in self.keywords
+                        ],
                     )
                 ]
             )
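The STTOptions.version property introduced above drives every v1/v2 branch in this file: it checks the configured model against the SpeechModelsV2 Literal alias via typing.get_args. A self-contained sketch of that dispatch, using a hypothetical stand-in for SpeechModelsV2 (the real alias is defined in models.py and may list different models):

    from typing import Literal, get_args

    # hypothetical stand-in; the actual values come from livekit/plugins/google/models.py
    SpeechModelsV2 = Literal["chirp", "chirp_2", "chirp_3"]

    def api_version(model: str) -> int:
        # get_args(Literal[...]) returns the tuple of allowed values,
        # so simple membership decides which SpeechAsyncClient to build
        return 2 if model in get_args(SpeechModelsV2) else 1

    print(api_version("chirp_3"))     # 2 under this stand-in alias
    print(api_version("phone_call"))  # 1, i.e. served by the v1 client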
@@ -102,7 +123,7 @@ class STT(stt.STT):
         interim_results: bool = True,
         punctuate: bool = True,
         spoken_punctuation: bool = False,
-        enable_word_time_offsets: bool = True,
+        enable_word_time_offsets: NotGivenOr[bool] = NOT_GIVEN,
         enable_word_confidence: bool = False,
         enable_voice_activity_events: bool = False,
         model: SpeechModels | str = "latest_long",
@@ -127,7 +148,7 @@
             interim_results(bool): whether to return interim results (default: True)
             punctuate(bool): whether to punctuate the audio (default: True)
             spoken_punctuation(bool): whether to use spoken punctuation (default: False)
-            enable_word_time_offsets(bool): whether to enable word time offsets (default: True)
+            enable_word_time_offsets(bool): whether to enable word time offsets (default: None)
             enable_word_confidence(bool): whether to enable word confidence (default: False)
             enable_voice_activity_events(bool): whether to enable voice activity events (default: False)
             model(SpeechModels): the model to use for recognition default: "latest_long"
@@ -142,8 +163,24 @@
         """
         if not is_given(use_streaming):
             use_streaming = True
+
+        if model == "chirp_3":
+            if is_given(enable_word_time_offsets) and enable_word_time_offsets:
+                logger.warning(
+                    "Chirp 3 does not support word timestamps, setting 'enable_word_time_offsets' to False."
+                )
+            enable_word_time_offsets = False
+        elif is_given(enable_word_time_offsets):
+            enable_word_time_offsets = enable_word_time_offsets
+        else:
+            enable_word_time_offsets = True
+
         super().__init__(
-            capabilities=stt.STTCapabilities(streaming=use_streaming, interim_results=True)
+            capabilities=stt.STTCapabilities(
+                streaming=use_streaming,
+                interim_results=True,
+                aligned_transcript="word" if enable_word_time_offsets and use_streaming else False,
+            )
         )

         self._location = location
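Also visible in the constructor hunk above: enable_word_time_offsets is now tri-state, and the resolved value (together with use_streaming) decides whether the STT advertises word-aligned transcripts. A standalone sketch of that defaulting rule, with NOT_GIVEN modeled as None for brevity:

    def resolve_word_time_offsets(model: str, requested: bool | None) -> bool:
        # chirp_3 has no word timestamps; an explicit True is downgraded with a warning
        if model == "chirp_3":
            return False
        # an explicit choice wins; otherwise word offsets default to on
        return requested if requested is not None else True

    def aligned_transcript(word_offsets: bool, use_streaming: bool) -> str | bool:
        # the capability becomes "word" only for streaming sessions with offsets enabled
        return "word" if word_offsets and use_streaming else False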
@@ -178,7 +215,7 @@
             keywords=keywords,
         )
         self._streams = weakref.WeakSet[SpeechStream]()
-        self._pool = utils.ConnectionPool[SpeechAsyncClient](
+        self._pool = utils.ConnectionPool[SpeechAsyncClientV2 | SpeechAsyncClientV1](
             max_session_duration=_max_session_duration,
             connect_cb=self._create_client,
         )
@@ -191,28 +228,29 @@
     def provider(self) -> str:
         return "Google Cloud Platform"

-    async def _create_client(self, timeout: float) -> SpeechAsyncClient:
+    async def _create_client(self, timeout: float) -> SpeechAsyncClientV2 | SpeechAsyncClientV1:
         # Add support for passing a specific location that matches recognizer
         # see: https://cloud.google.com/speech-to-text/v2/docs/speech-to-text-supported-languages
         # TODO(long): how to set timeout?
         client_options = None
-        client: SpeechAsyncClient | None = None
+        client: SpeechAsyncClientV2 | SpeechAsyncClientV1 | None = None
+        client_cls = SpeechAsyncClientV2 if self._config.version == 2 else SpeechAsyncClientV1
         if self._location != "global":
             client_options = ClientOptions(api_endpoint=f"{self._location}-speech.googleapis.com")
         if is_given(self._credentials_info):
-            client = SpeechAsyncClient.from_service_account_info(
+            client = client_cls.from_service_account_info(
                 self._credentials_info, client_options=client_options
             )
         elif is_given(self._credentials_file):
-            client = SpeechAsyncClient.from_service_account_file(
+            client = client_cls.from_service_account_file(
                 self._credentials_file, client_options=client_options
             )
         else:
-            client = SpeechAsyncClient(client_options=client_options)
+            client = client_cls(client_options=client_options)
         assert client is not None
         return client

-    def _get_recognizer(self, client: SpeechAsyncClient) -> str:
+    def _get_recognizer(self, client: SpeechAsyncClientV2) -> str:
         # TODO(theomonnom): should we use recognizers?
         # recognizers may improve latency https://cloud.google.com/speech-to-text/v2/docs/recognizers#understand_recognizers

@@ -240,6 +278,62 @@

         return config

+    def _build_recognition_config(
+        self,
+        sample_rate: int,
+        num_channels: int,
+        language: NotGivenOr[SpeechLanguages | str] = NOT_GIVEN,
+    ) -> cloud_speech_v2.RecognitionConfig | cloud_speech_v1.RecognitionConfig:
+        config = self._sanitize_options(language=language)
+        if self._config.version == 2:
+            return cloud_speech_v2.RecognitionConfig(
+                explicit_decoding_config=cloud_speech_v2.ExplicitDecodingConfig(
+                    encoding=cloud_speech_v2.ExplicitDecodingConfig.AudioEncoding.LINEAR16,
+                    sample_rate_hertz=sample_rate,
+                    audio_channel_count=num_channels,
+                ),
+                adaptation=config.build_adaptation(),
+                features=cloud_speech_v2.RecognitionFeatures(
+                    enable_automatic_punctuation=config.punctuate,
+                    enable_spoken_punctuation=config.spoken_punctuation,
+                    enable_word_time_offsets=config.enable_word_time_offsets,
+                    enable_word_confidence=config.enable_word_confidence,
+                ),
+                model=config.model,
+                language_codes=config.languages,
+            )
+        return cloud_speech_v1.RecognitionConfig(
+            encoding=cloud_speech_v1.RecognitionConfig.AudioEncoding.LINEAR16,
+            sample_rate_hertz=sample_rate,
+            audio_channel_count=num_channels,
+            adaptation=config.build_adaptation(),
+            language_code=config.languages[0],
+            alternative_language_codes=config.languages[1:],
+            enable_word_time_offsets=config.enable_word_time_offsets,
+            enable_word_confidence=config.enable_word_confidence,
+            enable_automatic_punctuation=config.punctuate,
+            enable_spoken_punctuation=config.spoken_punctuation,
+            model=config.model,
+        )
+
+    def _build_recognition_request(
+        self,
+        client: SpeechAsyncClientV2 | SpeechAsyncClientV1,
+        config: cloud_speech_v2.RecognitionConfig | cloud_speech_v1.RecognitionConfig,
+        content: bytes,
+    ) -> cloud_speech_v2.RecognizeRequest | cloud_speech_v1.RecognizeRequest:
+        if self._config.version == 2:
+            return cloud_speech_v2.RecognizeRequest(
+                recognizer=self._get_recognizer(cast(SpeechAsyncClientV2, client)),
+                config=config,
+                content=content,
+            )
+
+        return cloud_speech_v1.RecognizeRequest(
+            config=config,
+            audio=cloud_speech_v1.RecognitionAudio(content=content),
+        )
+
     async def _recognize_impl(
         self,
         buffer: utils.AudioBuffer,
@@ -247,37 +341,20 @@ class STT(stt.STT):
         language: NotGivenOr[SpeechLanguages | str] = NOT_GIVEN,
         conn_options: APIConnectOptions,
     ) -> stt.SpeechEvent:
-        config = self._sanitize_options(language=language)
         frame = rtc.combine_audio_frames(buffer)

-        config = cloud_speech.RecognitionConfig(
-            explicit_decoding_config=cloud_speech.ExplicitDecodingConfig(
-                encoding=cloud_speech.ExplicitDecodingConfig.AudioEncoding.LINEAR16,
-                sample_rate_hertz=frame.sample_rate,
-                audio_channel_count=frame.num_channels,
-            ),
-            adaptation=config.build_adaptation(),
-            features=cloud_speech.RecognitionFeatures(
-                enable_automatic_punctuation=config.punctuate,
-                enable_spoken_punctuation=config.spoken_punctuation,
-                enable_word_time_offsets=config.enable_word_time_offsets,
-                enable_word_confidence=config.enable_word_confidence,
-            ),
-            model=config.model,
-            language_codes=config.languages,
+        config = self._build_recognition_config(
+            sample_rate=frame.sample_rate,
+            num_channels=frame.num_channels,
+            language=language,
         )

         try:
             async with self._pool.connection(timeout=conn_options.timeout) as client:
                 raw = await client.recognize(
-                    cloud_speech.RecognizeRequest(
-                        recognizer=self._get_recognizer(client),
-                        config=config,
-                        content=frame.data.tobytes(),
-                    ),
+                    self._build_recognition_request(client, config, frame.data.tobytes()),
                     timeout=conn_options.timeout,
                 )
-
                 return _recognize_response_to_speech_event(raw)
         except DeadlineExceeded:
             raise APITimeoutError() from None
@@ -328,7 +405,11 @@
         if is_given(spoken_punctuation):
             self._config.spoken_punctuation = spoken_punctuation
         if is_given(model):
+            old_version = self._config.version
             self._config.model = model
+            if self._config.version != old_version:
+                self._pool.invalidate()
+
         if is_given(location):
             self._location = location
             # if location is changed, fetch a new client and recognizer as per the new location
@@ -358,8 +439,8 @@ class SpeechStream(stt.SpeechStream):
         *,
         stt: STT,
         conn_options: APIConnectOptions,
-        pool: utils.ConnectionPool[SpeechAsyncClient],
-        recognizer_cb: Callable[[SpeechAsyncClient], str],
+        pool: utils.ConnectionPool[SpeechAsyncClientV2 | SpeechAsyncClientV1],
+        recognizer_cb: Callable[[SpeechAsyncClientV2], str],
         config: STTOptions,
     ) -> None:
         super().__init__(stt=stt, conn_options=conn_options, sample_rate=config.sample_rate)
@@ -395,7 +476,10 @@
         if is_given(spoken_punctuation):
             self._config.spoken_punctuation = spoken_punctuation
         if is_given(model):
+            old_version = self._config.version
             self._config.model = model
+            if self._config.version != old_version:
+                self._pool.invalidate()
         if is_given(min_confidence_threshold):
             self._config.min_confidence_threshold = min_confidence_threshold
         if is_given(keywords):
@@ -403,21 +487,86 @@ class SpeechStream(stt.SpeechStream):

         self._reconnect_event.set()

+    def _build_streaming_config(
+        self,
+    ) -> cloud_speech_v2.StreamingRecognitionConfig | cloud_speech_v1.StreamingRecognitionConfig:
+        if self._config.version == 2:
+            return cloud_speech_v2.StreamingRecognitionConfig(
+                config=cloud_speech_v2.RecognitionConfig(
+                    explicit_decoding_config=cloud_speech_v2.ExplicitDecodingConfig(
+                        encoding=cloud_speech_v2.ExplicitDecodingConfig.AudioEncoding.LINEAR16,
+                        sample_rate_hertz=self._config.sample_rate,
+                        audio_channel_count=1,
+                    ),
+                    adaptation=self._config.build_adaptation(),
+                    language_codes=self._config.languages,
+                    model=self._config.model,
+                    features=cloud_speech_v2.RecognitionFeatures(
+                        enable_automatic_punctuation=self._config.punctuate,
+                        enable_word_time_offsets=self._config.enable_word_time_offsets,
+                        enable_spoken_punctuation=self._config.spoken_punctuation,
+                        enable_word_confidence=self._config.enable_word_confidence,
+                    ),
+                ),
+                streaming_features=cloud_speech_v2.StreamingRecognitionFeatures(
+                    interim_results=self._config.interim_results,
+                    enable_voice_activity_events=self._config.enable_voice_activity_events,
+                ),
+            )
+
+        return cloud_speech_v1.StreamingRecognitionConfig(
+            config=cloud_speech_v1.RecognitionConfig(
+                encoding=cloud_speech_v1.RecognitionConfig.AudioEncoding.LINEAR16,
+                sample_rate_hertz=self._config.sample_rate,
+                audio_channel_count=1,
+                adaptation=self._config.build_adaptation(),
+                language_code=self._config.languages[0],
+                alternative_language_codes=self._config.languages[1:],
+                enable_word_time_offsets=self._config.enable_word_time_offsets,
+                enable_word_confidence=self._config.enable_word_confidence,
+                enable_automatic_punctuation=self._config.punctuate,
+                enable_spoken_punctuation=self._config.spoken_punctuation,
+                model=self._config.model,
+            ),
+            interim_results=self._config.interim_results,
+            enable_voice_activity_events=self._config.enable_voice_activity_events,
+        )
+
+    def _build_init_request(
+        self,
+        client: SpeechAsyncClientV2 | SpeechAsyncClientV1,
+    ) -> cloud_speech_v2.StreamingRecognizeRequest | cloud_speech_v1.StreamingRecognizeRequest:
+        if self._config.version == 2:
+            return cloud_speech_v2.StreamingRecognizeRequest(
+                recognizer=self._recognizer_cb(cast(SpeechAsyncClientV2, client)),
+                streaming_config=self._streaming_config,
+            )
+        return cloud_speech_v1.StreamingRecognizeRequest(
+            streaming_config=self._streaming_config,
+        )
+
+    def _build_audio_request(
+        self,
+        frame: rtc.AudioFrame,
+    ) -> cloud_speech_v2.StreamingRecognizeRequest | cloud_speech_v1.StreamingRecognizeRequest:
+        if self._config.version == 2:
+            return cloud_speech_v2.StreamingRecognizeRequest(audio=frame.data.tobytes())
+        return cloud_speech_v1.StreamingRecognizeRequest(audio_content=frame.data.tobytes())
+
     async def _run(self) -> None:
         audio_pushed = False

         # google requires a async generator when calling streaming_recognize
         # this function basically convert the queue into a async generator
         async def input_generator(
-            client: SpeechAsyncClient, should_stop: asyncio.Event
-        ) -> AsyncGenerator[cloud_speech.StreamingRecognizeRequest, None]:
+            client: SpeechAsyncClientV2 | SpeechAsyncClientV1, should_stop: asyncio.Event
+        ) -> AsyncGenerator[
+            cloud_speech_v2.StreamingRecognizeRequest | cloud_speech_v1.StreamingRecognizeRequest,
+            None,
+        ]:
             nonlocal audio_pushed
             try:
-                # first request should contain the config
-                yield cloud_speech.StreamingRecognizeRequest(
-                    recognizer=self._recognizer_cb(client),
-                    streaming_config=self._streaming_config,
-                )
+                yield self._build_init_request(client)

                 async for frame in self._input_ch:
                     # when the stream is aborted due to reconnect, this input_generator
@@ -427,7 +576,7 @@ class SpeechStream(stt.SpeechStream):
                         return

                     if isinstance(frame, rtc.AudioFrame):
-                        yield cloud_speech.StreamingRecognizeRequest(audio=frame.data.tobytes())
+                        yield self._build_audio_request(frame)
                         if not audio_pushed:
                             audio_pushed = True

@@ -435,28 +584,34 @@ class SpeechStream(stt.SpeechStream):
                 logger.exception("an error occurred while streaming input to google STT")

         async def process_stream(
-            client: SpeechAsyncClient,
-            stream: AsyncIterable[cloud_speech.StreamingRecognizeResponse],
+            client: SpeechAsyncClientV2 | SpeechAsyncClientV1,
+            stream: AsyncIterable[
+                cloud_speech_v2.StreamingRecognizeResponse
+                | cloud_speech_v1.StreamingRecognizeResponse
+            ],
         ) -> None:
             has_started = False
             async for resp in stream:
-                if (
-                    resp.speech_event_type
-                    == cloud_speech.StreamingRecognizeResponse.SpeechEventType.SPEECH_ACTIVITY_BEGIN
+                if resp.speech_event_type == (
+                    cloud_speech_v2.StreamingRecognizeResponse.SpeechEventType.SPEECH_ACTIVITY_BEGIN
+                    if self._config.version == 2
+                    else cloud_speech_v1.StreamingRecognizeResponse.SpeechEventType.SPEECH_ACTIVITY_BEGIN
                 ):
                     self._event_ch.send_nowait(
                         stt.SpeechEvent(type=stt.SpeechEventType.START_OF_SPEECH)
                     )
                     has_started = True

-                if (
-                    resp.speech_event_type
-                    == cloud_speech.StreamingRecognizeResponse.SpeechEventType.SPEECH_EVENT_TYPE_UNSPECIFIED
+                if resp.speech_event_type == (
+                    cloud_speech_v2.StreamingRecognizeResponse.SpeechEventType.SPEECH_EVENT_TYPE_UNSPECIFIED
+                    if self._config.version == 2
+                    else cloud_speech_v1.StreamingRecognizeResponse.SpeechEventType.SPEECH_EVENT_UNSPECIFIED
                 ):
                     result = resp.results[0]
                     speech_data = _streaming_recognize_response_to_speech_data(
                         resp,
                         min_confidence_threshold=self._config.min_confidence_threshold,
+                        start_time_offset=self.start_time_offset,
                     )
                     if speech_data is None:
                         continue
@@ -488,9 +643,10 @@ class SpeechStream(stt.SpeechStream):
                         self._reconnect_event.set()
                         return

-                if (
-                    resp.speech_event_type
-                    == cloud_speech.StreamingRecognizeResponse.SpeechEventType.SPEECH_ACTIVITY_END
+                if resp.speech_event_type == (
+                    cloud_speech_v2.StreamingRecognizeResponse.SpeechEventType.SPEECH_ACTIVITY_END
+                    if self._config.version == 2
+                    else cloud_speech_v1.StreamingRecognizeResponse.SpeechEventType.SPEECH_ACTIVITY_END
                 ):
                     self._event_ch.send_nowait(
                         stt.SpeechEvent(type=stt.SpeechEventType.END_OF_SPEECH)
@@ -501,27 +657,7 @@ class SpeechStream(stt.SpeechStream):
             audio_pushed = False
             try:
                 async with self._pool.connection(timeout=self._conn_options.timeout) as client:
-                    self._streaming_config = cloud_speech.StreamingRecognitionConfig(
-                        config=cloud_speech.RecognitionConfig(
-                            explicit_decoding_config=cloud_speech.ExplicitDecodingConfig(
-                                encoding=cloud_speech.ExplicitDecodingConfig.AudioEncoding.LINEAR16,
-                                sample_rate_hertz=self._config.sample_rate,
-                                audio_channel_count=1,
-                            ),
-                            adaptation=self._config.build_adaptation(),
-                            language_codes=self._config.languages,
-                            model=self._config.model,
-                            features=cloud_speech.RecognitionFeatures(
-                                enable_automatic_punctuation=self._config.punctuate,
-                                enable_word_time_offsets=self._config.enable_word_time_offsets,
-                                enable_spoken_punctuation=self._config.spoken_punctuation,
-                            ),
-                        ),
-                        streaming_features=cloud_speech.StreamingRecognitionFeatures(
-                            interim_results=self._config.interim_results,
-                            enable_voice_activity_events=self._config.enable_voice_activity_events,
-                        ),
-                    )
+                    self._streaming_config = self._build_streaming_config()

                     should_stop = asyncio.Event()
                     stream = await client.streaming_recognize(
@@ -575,8 +711,20 @@ def _duration_to_seconds(duration: Duration | timedelta) -> float:
     return duration.seconds + duration.nanos / 1e9


+def _get_start_time(word: cloud_speech_v2.WordInfo | cloud_speech_v1.WordInfo) -> float:
+    if hasattr(word, "start_offset"):
+        return _duration_to_seconds(word.start_offset)
+    return _duration_to_seconds(word.start_time)
+
+
+def _get_end_time(word: cloud_speech_v2.WordInfo | cloud_speech_v1.WordInfo) -> float:
+    if hasattr(word, "end_offset"):
+        return _duration_to_seconds(word.end_offset)
+    return _duration_to_seconds(word.end_time)
+
+
 def _recognize_response_to_speech_event(
-    resp: cloud_speech.RecognizeResponse,
+    resp: cloud_speech_v2.RecognizeResponse | cloud_speech_v1.RecognizeResponse,
 ) -> stt.SpeechEvent:
     text = ""
     confidence = 0.0
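The two helpers added above paper over the differing WordInfo field names: the v2 API reports start_offset/end_offset while v1 reports start_time/end_time, and both funnel through the pre-existing _duration_to_seconds. A quick check of that conversion on a protobuf Duration:

    from google.protobuf.duration_pb2 import Duration

    # mirrors the plugin's _duration_to_seconds for the protobuf Duration case
    def duration_to_seconds(d: Duration) -> float:
        return d.seconds + d.nanos / 1e9

    print(duration_to_seconds(Duration(seconds=1, nanos=250_000_000)))  # 1.25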
@@ -589,8 +737,8 @@ def _recognize_response_to_speech_event(
     # Google STT may return empty results when spoken_lang != stt_lang
     if resp.results:
         try:
-            start_time = _duration_to_seconds(resp.results[0].alternatives[0].words[0].start_offset)
-            end_time = _duration_to_seconds(resp.results[-1].alternatives[0].words[-1].end_offset)
+            start_time = _get_start_time(resp.results[0].alternatives[0].words[0])
+            end_time = _get_end_time(resp.results[-1].alternatives[0].words[-1])
         except IndexError:
             # When enable_word_time_offsets=False, there are no "words" to access
             start_time = end_time = 0
@@ -605,20 +753,33 @@ def _recognize_response_to_speech_event(
             end_time=end_time,
             confidence=confidence,
             text=text,
+            words=[
+                TimedString(
+                    text=word.word,
+                    start_time=_get_start_time(word),
+                    end_time=_get_end_time(word),
+                )
+                for word in resp.results[0].alternatives[0].words
+            ]
+            if resp.results[0].alternatives[0].words
+            else None,
         )
     ]

     return stt.SpeechEvent(type=stt.SpeechEventType.FINAL_TRANSCRIPT, alternatives=alternatives)


+@utils.log_exceptions(logger=logger)
 def _streaming_recognize_response_to_speech_data(
-    resp: cloud_speech.StreamingRecognizeResponse,
+    resp: cloud_speech_v2.StreamingRecognizeResponse | cloud_speech_v1.StreamingRecognizeResponse,
     *,
     min_confidence_threshold: float,
+    start_time_offset: float,
 ) -> stt.SpeechData | None:
     text = ""
     confidence = 0.0
     final_result = None
+    words: list[cloud_speech_v2.WordInfo | cloud_speech_v1.WordInfo] = []
     for result in resp.results:
         if len(result.alternatives) == 0:
             continue
@@ -629,10 +790,12 @@ def _streaming_recognize_response_to_speech_data(
         else:
             text += result.alternatives[0].transcript
             confidence += result.alternatives[0].confidence
+            words.extend(result.alternatives[0].words)

     if final_result is not None:
         text = final_result.alternatives[0].transcript
         confidence = final_result.alternatives[0].confidence
+        words = list(final_result.alternatives[0].words)
         lg = final_result.language_code
     else:
         confidence /= len(resp.results)
@@ -640,9 +803,34 @@ def _streaming_recognize_response_to_speech_data(
             return None
     lg = resp.results[0].language_code

-    if text == "":
+    if text == "" or not words:
+        if text and not words:
+            data = stt.SpeechData(
+                language=lg,
+                start_time=start_time_offset,
+                end_time=start_time_offset,
+                confidence=confidence,
+                text=text,
+            )
+            return data
         return None

-    data = stt.SpeechData(language=lg, start_time=0, end_time=0, confidence=confidence, text=text)
+    data = stt.SpeechData(
+        language=lg,
+        start_time=_get_start_time(words[0]) + start_time_offset,
+        end_time=_get_end_time(words[-1]) + start_time_offset,
+        confidence=confidence,
+        text=text,
+        words=[
+            TimedString(
+                text=word.word,
+                start_time=_get_start_time(word) + start_time_offset,
+                end_time=_get_end_time(word) + start_time_offset,
+                start_time_offset=start_time_offset,
+                confidence=word.confidence,
+            )
+            for word in words
+        ],
+    )

     return data
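Taken together, the streaming path now attaches per-word TimedString entries, shifted by the stream's start_time_offset, to both interim and final SpeechData. A hedged sketch of how a consumer might read them off a final transcript event, assuming the livekit.agents stt.SpeechEvent/SpeechData shapes referenced in this diff:

    def log_word_timings(event) -> None:
        # event: a livekit.agents stt.SpeechEvent of type FINAL_TRANSCRIPT
        alt = event.alternatives[0]
        for word in alt.words or []:
            # each TimedString carries absolute start/end times in seconds
            print(f"{word.start_time:.2f}s-{word.end_time:.2f}s {word}")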