livekit-plugins-google 1.3.8__py3-none-any.whl → 1.3.11__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
@@ -21,14 +21,16 @@ import weakref
 from collections.abc import AsyncGenerator, AsyncIterable
 from dataclasses import dataclass
 from datetime import timedelta
-from typing import Callable, Union, cast
+from typing import Callable, Union, cast, get_args
 
 from google.api_core.client_options import ClientOptions
 from google.api_core.exceptions import DeadlineExceeded, GoogleAPICallError
 from google.auth import default as gauth_default
 from google.auth.exceptions import DefaultCredentialsError
-from google.cloud.speech_v2 import SpeechAsyncClient
-from google.cloud.speech_v2.types import cloud_speech
+from google.cloud.speech_v1 import SpeechAsyncClient as SpeechAsyncClientV1
+from google.cloud.speech_v1.types import cloud_speech as cloud_speech_v1, resource as resource_v1
+from google.cloud.speech_v2 import SpeechAsyncClient as SpeechAsyncClientV2
+from google.cloud.speech_v2.types import cloud_speech as cloud_speech_v2
 from google.protobuf.duration_pb2 import Duration
 from livekit import rtc
 from livekit.agents import (
@@ -45,9 +47,10 @@ from livekit.agents.types import (
     NotGivenOr,
 )
 from livekit.agents.utils import is_given
+from livekit.agents.voice.io import TimedString
 
 from .log import logger
-from .models import SpeechLanguages, SpeechModels
+from .models import SpeechLanguages, SpeechModels, SpeechModelsV2
 
 LgType = Union[SpeechLanguages, str]
 LanguageCode = Union[LgType, list[LgType]]
@@ -76,17 +79,35 @@ class STTOptions:
     min_confidence_threshold: float
     keywords: NotGivenOr[list[tuple[str, float]]] = NOT_GIVEN
 
-    def build_adaptation(self) -> cloud_speech.SpeechAdaptation | None:
+    @property
+    def version(self) -> int:
+        return 2 if self.model in get_args(SpeechModelsV2) else 1
+
+    def build_adaptation(
+        self,
+    ) -> cloud_speech_v2.SpeechAdaptation | resource_v1.SpeechAdaptation | None:
         if is_given(self.keywords):
-            return cloud_speech.SpeechAdaptation(
-                phrase_sets=[
-                    cloud_speech.SpeechAdaptation.AdaptationPhraseSet(
-                        inline_phrase_set=cloud_speech.PhraseSet(
-                            phrases=[
-                                cloud_speech.PhraseSet.Phrase(value=keyword, boost=boost)
-                                for keyword, boost in self.keywords
-                            ]
+            if self.version == 2:
+                return cloud_speech_v2.SpeechAdaptation(
+                    phrase_sets=[
+                        cloud_speech_v2.SpeechAdaptation.AdaptationPhraseSet(
+                            inline_phrase_set=cloud_speech_v2.PhraseSet(
+                                phrases=[
+                                    cloud_speech_v2.PhraseSet.Phrase(value=keyword, boost=boost)
+                                    for keyword, boost in self.keywords
+                                ]
+                            )
                         )
+                    ]
+                )
+            return resource_v1.SpeechAdaptation(
+                phrase_sets=[
+                    resource_v1.PhraseSet(
+                        name="keywords",
+                        phrases=[
+                            resource_v1.PhraseSet.Phrase(value=keyword, boost=boost)
+                            for keyword, boost in self.keywords
+                        ],
                     )
                 ]
             )
@@ -102,7 +123,7 @@ class STT(stt.STT):
         interim_results: bool = True,
         punctuate: bool = True,
         spoken_punctuation: bool = False,
-        enable_word_time_offsets: bool = True,
+        enable_word_time_offsets: NotGivenOr[bool] = NOT_GIVEN,
         enable_word_confidence: bool = False,
         enable_voice_activity_events: bool = False,
         model: SpeechModels | str = "latest_long",
@@ -127,7 +148,7 @@ class STT(stt.STT):
             interim_results(bool): whether to return interim results (default: True)
             punctuate(bool): whether to punctuate the audio (default: True)
             spoken_punctuation(bool): whether to use spoken punctuation (default: False)
-            enable_word_time_offsets(bool): whether to enable word time offsets (default: True)
+            enable_word_time_offsets(bool): whether to enable word time offsets (default: None)
             enable_word_confidence(bool): whether to enable word confidence (default: False)
             enable_voice_activity_events(bool): whether to enable voice activity events (default: False)
             model(SpeechModels): the model to use for recognition default: "latest_long"
@@ -142,8 +163,24 @@ class STT(stt.STT):
         """
         if not is_given(use_streaming):
             use_streaming = True
+
+        if model == "chirp_3":
+            if is_given(enable_word_time_offsets) and enable_word_time_offsets:
+                logger.warning(
+                    "Chirp 3 does not support word timestamps, setting 'enable_word_time_offsets' to False."
+                )
+            enable_word_time_offsets = False
+        elif is_given(enable_word_time_offsets):
+            enable_word_time_offsets = enable_word_time_offsets
+        else:
+            enable_word_time_offsets = True
+
         super().__init__(
-            capabilities=stt.STTCapabilities(streaming=use_streaming, interim_results=True)
+            capabilities=stt.STTCapabilities(
+                streaming=use_streaming,
+                interim_results=True,
+                aligned_transcript="word" if enable_word_time_offsets and use_streaming else False,
+            )
         )
 
         self._location = location
@@ -178,7 +215,7 @@ class STT(stt.STT):
             keywords=keywords,
         )
         self._streams = weakref.WeakSet[SpeechStream]()
-        self._pool = utils.ConnectionPool[SpeechAsyncClient](
+        self._pool = utils.ConnectionPool[SpeechAsyncClientV2 | SpeechAsyncClientV1](
             max_session_duration=_max_session_duration,
             connect_cb=self._create_client,
         )
@@ -191,28 +228,29 @@ class STT(stt.STT):
     def provider(self) -> str:
         return "Google Cloud Platform"
 
-    async def _create_client(self, timeout: float) -> SpeechAsyncClient:
+    async def _create_client(self, timeout: float) -> SpeechAsyncClientV2 | SpeechAsyncClientV1:
         # Add support for passing a specific location that matches recognizer
         # see: https://cloud.google.com/speech-to-text/v2/docs/speech-to-text-supported-languages
         # TODO(long): how to set timeout?
         client_options = None
-        client: SpeechAsyncClient | None = None
+        client: SpeechAsyncClientV2 | SpeechAsyncClientV1 | None = None
+        client_cls = SpeechAsyncClientV2 if self._config.version == 2 else SpeechAsyncClientV1
         if self._location != "global":
             client_options = ClientOptions(api_endpoint=f"{self._location}-speech.googleapis.com")
         if is_given(self._credentials_info):
-            client = SpeechAsyncClient.from_service_account_info(
+            client = client_cls.from_service_account_info(
                 self._credentials_info, client_options=client_options
             )
         elif is_given(self._credentials_file):
-            client = SpeechAsyncClient.from_service_account_file(
+            client = client_cls.from_service_account_file(
                 self._credentials_file, client_options=client_options
             )
         else:
-            client = SpeechAsyncClient(client_options=client_options)
+            client = client_cls(client_options=client_options)
         assert client is not None
         return client
 
-    def _get_recognizer(self, client: SpeechAsyncClient) -> str:
+    def _get_recognizer(self, client: SpeechAsyncClientV2) -> str:
         # TODO(theomonnom): should we use recognizers?
         # recognizers may improve latency https://cloud.google.com/speech-to-text/v2/docs/recognizers#understand_recognizers
 
@@ -240,6 +278,62 @@ class STT(stt.STT):
 
         return config
 
+    def _build_recognition_config(
+        self,
+        sample_rate: int,
+        num_channels: int,
+        language: NotGivenOr[SpeechLanguages | str] = NOT_GIVEN,
+    ) -> cloud_speech_v2.RecognitionConfig | cloud_speech_v1.RecognitionConfig:
+        config = self._sanitize_options(language=language)
+        if self._config.version == 2:
+            return cloud_speech_v2.RecognitionConfig(
+                explicit_decoding_config=cloud_speech_v2.ExplicitDecodingConfig(
+                    encoding=cloud_speech_v2.ExplicitDecodingConfig.AudioEncoding.LINEAR16,
+                    sample_rate_hertz=sample_rate,
+                    audio_channel_count=num_channels,
+                ),
+                adaptation=config.build_adaptation(),
+                features=cloud_speech_v2.RecognitionFeatures(
+                    enable_automatic_punctuation=config.punctuate,
+                    enable_spoken_punctuation=config.spoken_punctuation,
+                    enable_word_time_offsets=config.enable_word_time_offsets,
+                    enable_word_confidence=config.enable_word_confidence,
+                ),
+                model=config.model,
+                language_codes=config.languages,
+            )
+        return cloud_speech_v1.RecognitionConfig(
+            encoding=cloud_speech_v1.RecognitionConfig.AudioEncoding.LINEAR16,
+            sample_rate_hertz=sample_rate,
+            audio_channel_count=num_channels,
+            adaptation=config.build_adaptation(),
+            language_code=config.languages[0],
+            alternative_language_codes=config.languages[1:],
+            enable_word_time_offsets=config.enable_word_time_offsets,
+            enable_word_confidence=config.enable_word_confidence,
+            enable_automatic_punctuation=config.punctuate,
+            enable_spoken_punctuation=config.spoken_punctuation,
+            model=config.model,
+        )
+
+    def _build_recognition_request(
+        self,
+        client: SpeechAsyncClientV2 | SpeechAsyncClientV1,
+        config: cloud_speech_v2.RecognitionConfig | cloud_speech_v1.RecognitionConfig,
+        content: bytes,
+    ) -> cloud_speech_v2.RecognizeRequest | cloud_speech_v1.RecognizeRequest:
+        if self._config.version == 2:
+            return cloud_speech_v2.RecognizeRequest(
+                recognizer=self._get_recognizer(cast(SpeechAsyncClientV2, client)),
+                config=config,
+                content=content,
+            )
+
+        return cloud_speech_v1.RecognizeRequest(
+            config=config,
+            audio=cloud_speech_v1.RecognitionAudio(content=content),
+        )
+
     async def _recognize_impl(
         self,
         buffer: utils.AudioBuffer,
@@ -247,37 +341,20 @@ class STT(stt.STT):
         language: NotGivenOr[SpeechLanguages | str] = NOT_GIVEN,
         conn_options: APIConnectOptions,
     ) -> stt.SpeechEvent:
-        config = self._sanitize_options(language=language)
         frame = rtc.combine_audio_frames(buffer)
 
-        config = cloud_speech.RecognitionConfig(
-            explicit_decoding_config=cloud_speech.ExplicitDecodingConfig(
-                encoding=cloud_speech.ExplicitDecodingConfig.AudioEncoding.LINEAR16,
-                sample_rate_hertz=frame.sample_rate,
-                audio_channel_count=frame.num_channels,
-            ),
-            adaptation=config.build_adaptation(),
-            features=cloud_speech.RecognitionFeatures(
-                enable_automatic_punctuation=config.punctuate,
-                enable_spoken_punctuation=config.spoken_punctuation,
-                enable_word_time_offsets=config.enable_word_time_offsets,
-                enable_word_confidence=config.enable_word_confidence,
-            ),
-            model=config.model,
-            language_codes=config.languages,
+        config = self._build_recognition_config(
+            sample_rate=frame.sample_rate,
+            num_channels=frame.num_channels,
+            language=language,
         )
 
         try:
             async with self._pool.connection(timeout=conn_options.timeout) as client:
                 raw = await client.recognize(
-                    cloud_speech.RecognizeRequest(
-                        recognizer=self._get_recognizer(client),
-                        config=config,
-                        content=frame.data.tobytes(),
-                    ),
+                    self._build_recognition_request(client, config, frame.data.tobytes()),
                     timeout=conn_options.timeout,
                 )
-
                 return _recognize_response_to_speech_event(raw)
         except DeadlineExceeded:
             raise APITimeoutError() from None
@@ -328,7 +405,11 @@ class STT(stt.STT):
         if is_given(spoken_punctuation):
             self._config.spoken_punctuation = spoken_punctuation
         if is_given(model):
+            old_version = self._config.version
             self._config.model = model
+            if self._config.version != old_version:
+                self._pool.invalidate()
+
         if is_given(location):
             self._location = location
             # if location is changed, fetch a new client and recognizer as per the new location
@@ -358,8 +439,8 @@ class SpeechStream(stt.SpeechStream):
         *,
         stt: STT,
         conn_options: APIConnectOptions,
-        pool: utils.ConnectionPool[SpeechAsyncClient],
-        recognizer_cb: Callable[[SpeechAsyncClient], str],
+        pool: utils.ConnectionPool[SpeechAsyncClientV2 | SpeechAsyncClientV1],
+        recognizer_cb: Callable[[SpeechAsyncClientV2], str],
         config: STTOptions,
     ) -> None:
         super().__init__(stt=stt, conn_options=conn_options, sample_rate=config.sample_rate)
@@ -395,7 +476,10 @@ class SpeechStream(stt.SpeechStream):
         if is_given(spoken_punctuation):
             self._config.spoken_punctuation = spoken_punctuation
         if is_given(model):
+            old_version = self._config.version
             self._config.model = model
+            if self._config.version != old_version:
+                self._pool.invalidate()
         if is_given(min_confidence_threshold):
             self._config.min_confidence_threshold = min_confidence_threshold
         if is_given(keywords):
@@ -403,21 +487,86 @@ class SpeechStream(stt.SpeechStream):
 
         self._reconnect_event.set()
 
+    def _build_streaming_config(
+        self,
+    ) -> cloud_speech_v2.StreamingRecognitionConfig | cloud_speech_v1.StreamingRecognitionConfig:
+        if self._config.version == 2:
+            return cloud_speech_v2.StreamingRecognitionConfig(
+                config=cloud_speech_v2.RecognitionConfig(
+                    explicit_decoding_config=cloud_speech_v2.ExplicitDecodingConfig(
+                        encoding=cloud_speech_v2.ExplicitDecodingConfig.AudioEncoding.LINEAR16,
+                        sample_rate_hertz=self._config.sample_rate,
+                        audio_channel_count=1,
+                    ),
+                    adaptation=self._config.build_adaptation(),
+                    language_codes=self._config.languages,
+                    model=self._config.model,
+                    features=cloud_speech_v2.RecognitionFeatures(
+                        enable_automatic_punctuation=self._config.punctuate,
+                        enable_word_time_offsets=self._config.enable_word_time_offsets,
+                        enable_spoken_punctuation=self._config.spoken_punctuation,
+                        enable_word_confidence=self._config.enable_word_confidence,
+                    ),
+                ),
+                streaming_features=cloud_speech_v2.StreamingRecognitionFeatures(
+                    interim_results=self._config.interim_results,
+                    enable_voice_activity_events=self._config.enable_voice_activity_events,
+                ),
+            )
+
+        return cloud_speech_v1.StreamingRecognitionConfig(
+            config=cloud_speech_v1.RecognitionConfig(
+                encoding=cloud_speech_v1.RecognitionConfig.AudioEncoding.LINEAR16,
+                sample_rate_hertz=self._config.sample_rate,
+                audio_channel_count=1,
+                adaptation=self._config.build_adaptation(),
+                language_code=self._config.languages[0],
+                alternative_language_codes=self._config.languages[1:],
+                enable_word_time_offsets=self._config.enable_word_time_offsets,
+                enable_word_confidence=self._config.enable_word_confidence,
+                enable_automatic_punctuation=self._config.punctuate,
+                enable_spoken_punctuation=self._config.spoken_punctuation,
+                model=self._config.model,
+            ),
+            interim_results=self._config.interim_results,
+            enable_voice_activity_events=self._config.enable_voice_activity_events,
+        )
+
+    def _build_init_request(
+        self,
+        client: SpeechAsyncClientV2 | SpeechAsyncClientV1,
+    ) -> cloud_speech_v2.StreamingRecognizeRequest | cloud_speech_v1.StreamingRecognizeRequest:
+        if self._config.version == 2:
+            return cloud_speech_v2.StreamingRecognizeRequest(
+                recognizer=self._recognizer_cb(cast(SpeechAsyncClientV2, client)),
+                streaming_config=self._streaming_config,
+            )
+        return cloud_speech_v1.StreamingRecognizeRequest(
+            streaming_config=self._streaming_config,
+        )
+
+    def _build_audio_request(
+        self,
+        frame: rtc.AudioFrame,
+    ) -> cloud_speech_v2.StreamingRecognizeRequest | cloud_speech_v1.StreamingRecognizeRequest:
+        if self._config.version == 2:
+            return cloud_speech_v2.StreamingRecognizeRequest(audio=frame.data.tobytes())
+        return cloud_speech_v1.StreamingRecognizeRequest(audio_content=frame.data.tobytes())
+
     async def _run(self) -> None:
         audio_pushed = False
 
         # google requires a async generator when calling streaming_recognize
         # this function basically convert the queue into a async generator
         async def input_generator(
-            client: SpeechAsyncClient, should_stop: asyncio.Event
-        ) -> AsyncGenerator[cloud_speech.StreamingRecognizeRequest, None]:
+            client: SpeechAsyncClientV2 | SpeechAsyncClientV1, should_stop: asyncio.Event
+        ) -> AsyncGenerator[
+            cloud_speech_v2.StreamingRecognizeRequest | cloud_speech_v1.StreamingRecognizeRequest,
+            None,
+        ]:
             nonlocal audio_pushed
             try:
-                # first request should contain the config
-                yield cloud_speech.StreamingRecognizeRequest(
-                    recognizer=self._recognizer_cb(client),
-                    streaming_config=self._streaming_config,
-                )
+                yield self._build_init_request(client)
 
                 async for frame in self._input_ch:
                     # when the stream is aborted due to reconnect, this input_generator
@@ -427,7 +576,7 @@ class SpeechStream(stt.SpeechStream):
                         return
 
                     if isinstance(frame, rtc.AudioFrame):
-                        yield cloud_speech.StreamingRecognizeRequest(audio=frame.data.tobytes())
+                        yield self._build_audio_request(frame)
                         if not audio_pushed:
                             audio_pushed = True
 
@@ -435,28 +584,34 @@ class SpeechStream(stt.SpeechStream):
                 logger.exception("an error occurred while streaming input to google STT")
 
         async def process_stream(
-            client: SpeechAsyncClient,
-            stream: AsyncIterable[cloud_speech.StreamingRecognizeResponse],
+            client: SpeechAsyncClientV2 | SpeechAsyncClientV1,
+            stream: AsyncIterable[
+                cloud_speech_v2.StreamingRecognizeResponse
+                | cloud_speech_v1.StreamingRecognizeResponse
+            ],
         ) -> None:
             has_started = False
             async for resp in stream:
-                if (
-                    resp.speech_event_type
-                    == cloud_speech.StreamingRecognizeResponse.SpeechEventType.SPEECH_ACTIVITY_BEGIN
+                if resp.speech_event_type == (
+                    cloud_speech_v2.StreamingRecognizeResponse.SpeechEventType.SPEECH_ACTIVITY_BEGIN
+                    if self._config.version == 2
+                    else cloud_speech_v1.StreamingRecognizeResponse.SpeechEventType.SPEECH_ACTIVITY_BEGIN
                 ):
                     self._event_ch.send_nowait(
                         stt.SpeechEvent(type=stt.SpeechEventType.START_OF_SPEECH)
                     )
                     has_started = True
 
-                if (
-                    resp.speech_event_type
-                    == cloud_speech.StreamingRecognizeResponse.SpeechEventType.SPEECH_EVENT_TYPE_UNSPECIFIED  # noqa: E501
+                if resp.speech_event_type == (
+                    cloud_speech_v2.StreamingRecognizeResponse.SpeechEventType.SPEECH_EVENT_TYPE_UNSPECIFIED
+                    if self._config.version == 2
+                    else cloud_speech_v1.StreamingRecognizeResponse.SpeechEventType.SPEECH_EVENT_UNSPECIFIED
                 ):
                     result = resp.results[0]
                     speech_data = _streaming_recognize_response_to_speech_data(
                         resp,
                         min_confidence_threshold=self._config.min_confidence_threshold,
+                        start_time_offset=self.start_time_offset,
                     )
                     if speech_data is None:
                         continue
@@ -488,9 +643,10 @@ class SpeechStream(stt.SpeechStream):
                             self._reconnect_event.set()
                             return
 
-                if (
-                    resp.speech_event_type
-                    == cloud_speech.StreamingRecognizeResponse.SpeechEventType.SPEECH_ACTIVITY_END
+                if resp.speech_event_type == (
+                    cloud_speech_v2.StreamingRecognizeResponse.SpeechEventType.SPEECH_ACTIVITY_END
+                    if self._config.version == 2
+                    else cloud_speech_v1.StreamingRecognizeResponse.SpeechEventType.SPEECH_ACTIVITY_END
                 ):
                     self._event_ch.send_nowait(
                         stt.SpeechEvent(type=stt.SpeechEventType.END_OF_SPEECH)
@@ -501,27 +657,7 @@ class SpeechStream(stt.SpeechStream):
             audio_pushed = False
             try:
                 async with self._pool.connection(timeout=self._conn_options.timeout) as client:
-                    self._streaming_config = cloud_speech.StreamingRecognitionConfig(
-                        config=cloud_speech.RecognitionConfig(
-                            explicit_decoding_config=cloud_speech.ExplicitDecodingConfig(
-                                encoding=cloud_speech.ExplicitDecodingConfig.AudioEncoding.LINEAR16,
-                                sample_rate_hertz=self._config.sample_rate,
-                                audio_channel_count=1,
-                            ),
-                            adaptation=self._config.build_adaptation(),
-                            language_codes=self._config.languages,
-                            model=self._config.model,
-                            features=cloud_speech.RecognitionFeatures(
-                                enable_automatic_punctuation=self._config.punctuate,
-                                enable_word_time_offsets=self._config.enable_word_time_offsets,
-                                enable_spoken_punctuation=self._config.spoken_punctuation,
-                            ),
-                        ),
-                        streaming_features=cloud_speech.StreamingRecognitionFeatures(
-                            interim_results=self._config.interim_results,
-                            enable_voice_activity_events=self._config.enable_voice_activity_events,
-                        ),
-                    )
+                    self._streaming_config = self._build_streaming_config()
 
                     should_stop = asyncio.Event()
                     stream = await client.streaming_recognize(
@@ -575,8 +711,20 @@ def _duration_to_seconds(duration: Duration | timedelta) -> float:
     return duration.seconds + duration.nanos / 1e9
 
 
+def _get_start_time(word: cloud_speech_v2.WordInfo | cloud_speech_v1.WordInfo) -> float:
+    if hasattr(word, "start_offset"):
+        return _duration_to_seconds(word.start_offset)
+    return _duration_to_seconds(word.start_time)
+
+
+def _get_end_time(word: cloud_speech_v2.WordInfo | cloud_speech_v1.WordInfo) -> float:
+    if hasattr(word, "end_offset"):
+        return _duration_to_seconds(word.end_offset)
+    return _duration_to_seconds(word.end_time)
+
+
 def _recognize_response_to_speech_event(
-    resp: cloud_speech.RecognizeResponse,
+    resp: cloud_speech_v2.RecognizeResponse | cloud_speech_v1.RecognizeResponse,
 ) -> stt.SpeechEvent:
     text = ""
     confidence = 0.0
@@ -589,8 +737,8 @@ def _recognize_response_to_speech_event(
     # Google STT may return empty results when spoken_lang != stt_lang
     if resp.results:
         try:
-            start_time = _duration_to_seconds(resp.results[0].alternatives[0].words[0].start_offset)
-            end_time = _duration_to_seconds(resp.results[-1].alternatives[0].words[-1].end_offset)
+            start_time = _get_start_time(resp.results[0].alternatives[0].words[0])
+            end_time = _get_end_time(resp.results[-1].alternatives[0].words[-1])
         except IndexError:
             # When enable_word_time_offsets=False, there are no "words" to access
             start_time = end_time = 0
@@ -605,20 +753,33 @@ def _recognize_response_to_speech_event(
             end_time=end_time,
             confidence=confidence,
             text=text,
+            words=[
+                TimedString(
+                    text=word.word,
+                    start_time=_get_start_time(word),
+                    end_time=_get_end_time(word),
+                )
+                for word in resp.results[0].alternatives[0].words
+            ]
+            if resp.results[0].alternatives[0].words
+            else None,
         )
     ]
 
     return stt.SpeechEvent(type=stt.SpeechEventType.FINAL_TRANSCRIPT, alternatives=alternatives)
 
 
+@utils.log_exceptions(logger=logger)
 def _streaming_recognize_response_to_speech_data(
-    resp: cloud_speech.StreamingRecognizeResponse,
+    resp: cloud_speech_v2.StreamingRecognizeResponse | cloud_speech_v1.StreamingRecognizeResponse,
     *,
     min_confidence_threshold: float,
+    start_time_offset: float,
 ) -> stt.SpeechData | None:
     text = ""
     confidence = 0.0
     final_result = None
+    words: list[cloud_speech_v2.WordInfo | cloud_speech_v1.WordInfo] = []
     for result in resp.results:
         if len(result.alternatives) == 0:
             continue
@@ -629,10 +790,12 @@ def _streaming_recognize_response_to_speech_data(
         else:
             text += result.alternatives[0].transcript
             confidence += result.alternatives[0].confidence
+            words.extend(result.alternatives[0].words)
 
     if final_result is not None:
         text = final_result.alternatives[0].transcript
         confidence = final_result.alternatives[0].confidence
+        words = list(final_result.alternatives[0].words)
         lg = final_result.language_code
     else:
         confidence /= len(resp.results)
@@ -640,9 +803,34 @@ def _streaming_recognize_response_to_speech_data(
             return None
         lg = resp.results[0].language_code
 
-    if text == "":
+    if text == "" or not words:
+        if text and not words:
+            data = stt.SpeechData(
+                language=lg,
+                start_time=start_time_offset,
+                end_time=start_time_offset,
+                confidence=confidence,
+                text=text,
+            )
+            return data
         return None
 
-    data = stt.SpeechData(language=lg, start_time=0, end_time=0, confidence=confidence, text=text)
+    data = stt.SpeechData(
+        language=lg,
+        start_time=_get_start_time(words[0]) + start_time_offset,
+        end_time=_get_end_time(words[-1]) + start_time_offset,
+        confidence=confidence,
+        text=text,
+        words=[
+            TimedString(
+                text=word.word,
+                start_time=_get_start_time(word) + start_time_offset,
+                end_time=_get_end_time(word) + start_time_offset,
+                start_time_offset=start_time_offset,
+                confidence=word.confidence,
+            )
+            for word in words
+        ],
+    )
 
     return data
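
Taken together, the changes above make the plugin pick between the Speech-to-Text v1 and v2 clients from the configured model: STTOptions.version returns 2 when the model appears in SpeechModelsV2 and 1 otherwise, and every config, request, and response handler branches on that value. The sketch below shows how this surfaces to callers; it assumes the public livekit.plugins.google entry point, and the model names are illustrative only, since the members of SpeechModelsV2 are defined in models.py, which this diff does not include.

from livekit.plugins import google

# Version selection happens inside STTOptions.version: a model listed in
# SpeechModelsV2 is served by SpeechAsyncClientV2, anything else falls back
# to SpeechAsyncClientV1.
stt = google.STT(model="latest_long")

# "chirp_3" is special-cased in __init__: word time offsets are forced to
# False with a warning, so word-aligned transcripts are unavailable for it.
stt_chirp = google.STT(model="chirp_3")

# Changing the model at runtime can cross API versions; when it does, the
# connection pool is invalidated so new clients of the matching class are
# created on the next request.
stt.update_options(model="chirp_3")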