livekit-plugins-google 0.7.1__py3-none-any.whl → 0.7.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -29,3 +29,12 @@ class GooglePlugin(Plugin):
29
29
 
30
30
 
31
31
  Plugin.register_plugin(GooglePlugin())
32
+
33
+ # Cleanup docs of unexported modules
34
+ _module = dir()
35
+ NOT_IN_ALL = [m for m in _module if m not in __all__]
36
+
37
+ __pdoc__ = {}
38
+
39
+ for n in NOT_IN_ALL:
40
+ __pdoc__[n] = False
@@ -20,8 +20,15 @@ from dataclasses import dataclass
20
20
  from typing import AsyncIterable, List, Union
21
21
 
22
22
  from livekit import agents, rtc
23
- from livekit.agents import stt, utils
24
-
23
+ from livekit.agents import (
24
+ APIConnectionError,
25
+ APIStatusError,
26
+ APITimeoutError,
27
+ stt,
28
+ utils,
29
+ )
30
+
31
+ from google.api_core.exceptions import Aborted, DeadlineExceeded, GoogleAPICallError
25
32
  from google.auth import default as gauth_default
26
33
  from google.auth.exceptions import DefaultCredentialsError
27
34
  from google.cloud.speech_v2 import SpeechAsyncClient
@@ -43,6 +50,25 @@ class STTOptions:
43
50
  punctuate: bool
44
51
  spoken_punctuation: bool
45
52
  model: SpeechModels
53
+ keywords: List[tuple[str, float]] | None
54
+
55
+ def build_adaptation(self) -> cloud_speech.SpeechAdaptation | None:
56
+ if self.keywords:
57
+ return cloud_speech.SpeechAdaptation(
58
+ phrase_sets=[
59
+ cloud_speech.SpeechAdaptation.AdaptationPhraseSet(
60
+ inline_phrase_set=cloud_speech.PhraseSet(
61
+ phrases=[
62
+ cloud_speech.PhraseSet.Phrase(
63
+ value=keyword, boost=boost
64
+ )
65
+ for keyword, boost in self.keywords
66
+ ]
67
+ )
68
+ )
69
+ ]
70
+ )
71
+ return None
46
72
 
47
73
 
48
74
  class STT(stt.STT):
@@ -57,6 +83,7 @@ class STT(stt.STT):
57
83
  model: SpeechModels = "long",
58
84
  credentials_info: dict | None = None,
59
85
  credentials_file: str | None = None,
86
+ keywords: List[tuple[str, float]] | None = None,
60
87
  ):
61
88
  """
62
89
  Create a new instance of Google STT.
@@ -93,6 +120,7 @@ class STT(stt.STT):
93
120
  punctuate=punctuate,
94
121
  spoken_punctuation=spoken_punctuation,
95
122
  model=model,
123
+ keywords=keywords,
96
124
  )
97
125
 
98
126
  def _ensure_client(self) -> SpeechAsyncClient:
@@ -141,7 +169,7 @@ class STT(stt.STT):
141
169
 
142
170
  return config
143
171
 
144
- async def recognize(
172
+ async def _recognize_impl(
145
173
  self,
146
174
  buffer: utils.AudioBuffer,
147
175
  *,
@@ -156,6 +184,7 @@ class STT(stt.STT):
156
184
  sample_rate_hertz=frame.sample_rate,
157
185
  audio_channel_count=frame.num_channels,
158
186
  ),
187
+ adaptation=config.build_adaptation(),
159
188
  features=cloud_speech.RecognitionFeatures(
160
189
  enable_automatic_punctuation=config.punctuate,
161
190
  enable_spoken_punctuation=config.spoken_punctuation,
@@ -165,23 +194,39 @@ class STT(stt.STT):
165
194
  language_codes=config.languages,
166
195
  )
167
196
 
168
- raw = await self._ensure_client().recognize(
169
- cloud_speech.RecognizeRequest(
170
- recognizer=self._recognizer, config=config, content=frame.data.tobytes()
197
+ try:
198
+ raw = await self._ensure_client().recognize(
199
+ cloud_speech.RecognizeRequest(
200
+ recognizer=self._recognizer,
201
+ config=config,
202
+ content=frame.data.tobytes(),
203
+ )
171
204
  )
172
- )
173
- return _recognize_response_to_speech_event(raw)
205
+
206
+ return _recognize_response_to_speech_event(raw)
207
+ except DeadlineExceeded:
208
+ raise APITimeoutError()
209
+ except GoogleAPICallError as e:
210
+ raise APIStatusError(
211
+ e.message,
212
+ status_code=e.code or -1,
213
+ request_id=None,
214
+ body=None,
215
+ )
216
+ except Exception as e:
217
+ raise APIConnectionError() from e
174
218
 
175
219
  def stream(
176
220
  self, *, language: SpeechLanguages | str | None = None
177
221
  ) -> "SpeechStream":
178
222
  config = self._sanitize_options(language=language)
179
- return SpeechStream(self._ensure_client(), self._recognizer, config)
223
+ return SpeechStream(self, self._ensure_client(), self._recognizer, config)
180
224
 
181
225
 
182
226
  class SpeechStream(stt.SpeechStream):
183
227
  def __init__(
184
228
  self,
229
+ stt: STT,
185
230
  client: SpeechAsyncClient,
186
231
  recognizer: str,
187
232
  config: STTOptions,
@@ -189,7 +234,7 @@ class SpeechStream(stt.SpeechStream):
189
234
  num_channels: int = 1,
190
235
  max_retry: int = 32,
191
236
  ) -> None:
192
- super().__init__()
237
+ super().__init__(stt)
193
238
 
194
239
  self._client = client
195
240
  self._recognizer = recognizer
@@ -205,6 +250,7 @@ class SpeechStream(stt.SpeechStream):
205
250
  sample_rate_hertz=self._sample_rate,
206
251
  audio_channel_count=self._num_channels,
207
252
  ),
253
+ adaptation=config.build_adaptation(),
208
254
  language_codes=self._config.languages,
209
255
  model=self._config.model,
210
256
  features=cloud_speech.RecognitionFeatures(
@@ -257,6 +303,9 @@ class SpeechStream(stt.SpeechStream):
257
303
  retry_count = 0 # connection successful, reset retry count
258
304
 
259
305
  await self._run_stream(stream)
306
+ except Aborted:
307
+ logger.error("google stt connection aborted")
308
+ break
260
309
  except Exception as e:
261
310
  if retry_count >= max_retry:
262
311
  logger.error(
@@ -15,21 +15,22 @@
15
15
  from __future__ import annotations
16
16
 
17
17
  from dataclasses import dataclass
18
- from typing import Union
19
18
 
20
19
  from livekit import rtc
21
- from livekit.agents import tts, utils
22
-
20
+ from livekit.agents import (
21
+ APIConnectionError,
22
+ APIStatusError,
23
+ APITimeoutError,
24
+ tts,
25
+ utils,
26
+ )
27
+
28
+ from google.api_core.exceptions import DeadlineExceeded, GoogleAPICallError
23
29
  from google.cloud import texttospeech
24
30
  from google.cloud.texttospeech_v1.types import SsmlVoiceGender, SynthesizeSpeechResponse
25
31
 
26
- from .log import logger
27
32
  from .models import AudioEncoding, Gender, SpeechLanguages
28
33
 
29
- LgType = Union[SpeechLanguages, str]
30
- GenderType = Union[Gender, str]
31
- AudioEncodingType = Union[AudioEncoding, str]
32
-
33
34
 
34
35
  @dataclass
35
36
  class _TTSOptions:
@@ -41,11 +42,13 @@ class TTS(tts.TTS):
41
42
  def __init__(
42
43
  self,
43
44
  *,
44
- language: LgType = "en-US",
45
- gender: GenderType = "neutral",
45
+ language: SpeechLanguages | str = "en-US",
46
+ gender: Gender | str = "neutral",
46
47
  voice_name: str = "", # Not required
47
- encoding: AudioEncodingType = "linear16",
48
+ encoding: AudioEncoding | str = "linear16",
48
49
  sample_rate: int = 24000,
50
+ pitch: int = 0,
51
+ effects_profile_id: str = "",
49
52
  speaking_rate: float = 1.0,
50
53
  credentials_info: dict | None = None,
51
54
  credentials_file: str | None = None,
@@ -56,6 +59,18 @@ class TTS(tts.TTS):
56
59
  Credentials must be provided, either by using the ``credentials_info`` dict, or reading
57
60
  from the file specified in ``credentials_file`` or the ``GOOGLE_APPLICATION_CREDENTIALS``
58
61
  environmental variable.
62
+
63
+ Args:
64
+ language (SpeechLanguages | str, optional): Language code (e.g., "en-US"). Default is "en-US".
65
+ gender (Gender | str, optional): Voice gender ("male", "female", "neutral"). Default is "neutral".
66
+ voice_name (str, optional): Specific voice name. Default is an empty string.
67
+ encoding (AudioEncoding | str, optional): Audio encoding format (e.g., "linear16"). Default is "linear16".
68
+ sample_rate (int, optional): Audio sample rate in Hz. Default is 24000.
69
+ pitch (float, optional): Speaking pitch, ranging from -20.0 to 20.0 semitones relative to the original pitch. Default is 0.
70
+ effects_profile_id (str): Optional identifier for selecting audio effects profiles to apply to the synthesized speech.
71
+ speaking_rate (float, optional): Speed of speech. Default is 1.0.
72
+ credentials_info (dict, optional): Dictionary containing Google Cloud credentials. Default is None.
73
+ credentials_file (str, optional): Path to the Google Cloud credentials JSON file. Default is None.
59
74
  """
60
75
 
61
76
  super().__init__(
@@ -70,14 +85,10 @@ class TTS(tts.TTS):
70
85
  self._credentials_info = credentials_info
71
86
  self._credentials_file = credentials_file
72
87
 
73
- ssml_gender = SsmlVoiceGender.NEUTRAL
74
- if gender == "male":
75
- ssml_gender = SsmlVoiceGender.MALE
76
- elif gender == "female":
77
- ssml_gender = SsmlVoiceGender.FEMALE
78
-
79
88
  voice = texttospeech.VoiceSelectionParams(
80
- name=voice_name, language_code=language, ssml_gender=ssml_gender
89
+ name=voice_name,
90
+ language_code=language,
91
+ ssml_gender=_gender_from_str(gender),
81
92
  )
82
93
 
83
94
  if encoding == "linear16" or encoding == "wav":
@@ -92,10 +103,36 @@ class TTS(tts.TTS):
92
103
  audio_config=texttospeech.AudioConfig(
93
104
  audio_encoding=_audio_encoding,
94
105
  sample_rate_hertz=sample_rate,
106
+ pitch=pitch,
107
+ effects_profile_id=effects_profile_id,
95
108
  speaking_rate=speaking_rate,
96
109
  ),
97
110
  )
98
111
 
112
+ def update_options(
113
+ self,
114
+ *,
115
+ language: SpeechLanguages | str = "en-US",
116
+ gender: Gender | str = "neutral",
117
+ voice_name: str = "", # Not required
118
+ speaking_rate: float = 1.0,
119
+ ) -> None:
120
+ """
121
+ Update the TTS options.
122
+
123
+ Args:
124
+ language (SpeechLanguages | str, optional): Language code (e.g., "en-US"). Default is "en-US".
125
+ gender (Gender | str, optional): Voice gender ("male", "female", "neutral"). Default is "neutral".
126
+ voice_name (str, optional): Specific voice name. Default is an empty string.
127
+ speaking_rate (float, optional): Speed of speech. Default is 1.0.
128
+ """
129
+ self._opts.voice = texttospeech.VoiceSelectionParams(
130
+ name=voice_name,
131
+ language_code=language,
132
+ ssml_gender=_gender_from_str(gender),
133
+ )
134
+ self._opts.audio_config.speaking_rate = speaking_rate
135
+
99
136
  def _ensure_client(self) -> texttospeech.TextToSpeechAsyncClient:
100
137
  if not self._client:
101
138
  if self._credentials_info:
@@ -118,57 +155,79 @@ class TTS(tts.TTS):
118
155
  return self._client
119
156
 
120
157
  def synthesize(self, text: str) -> "ChunkedStream":
121
- return ChunkedStream(text, self._opts, self._ensure_client())
158
+ return ChunkedStream(self, text, self._opts, self._ensure_client())
122
159
 
123
160
 
124
161
  class ChunkedStream(tts.ChunkedStream):
125
162
  def __init__(
126
- self, text: str, opts: _TTSOptions, client: texttospeech.TextToSpeechAsyncClient
163
+ self,
164
+ tts: TTS,
165
+ text: str,
166
+ opts: _TTSOptions,
167
+ client: texttospeech.TextToSpeechAsyncClient,
127
168
  ) -> None:
128
- super().__init__()
129
- self._text, self._opts, self._client = text, opts, client
169
+ super().__init__(tts, text)
170
+ self._opts, self._client = opts, client
130
171
 
131
- @utils.log_exceptions(logger=logger)
132
172
  async def _main_task(self) -> None:
133
173
  request_id = utils.shortuuid()
134
- segment_id = utils.shortuuid()
135
- response: SynthesizeSpeechResponse = await self._client.synthesize_speech(
136
- input=texttospeech.SynthesisInput(text=self._text),
137
- voice=self._opts.voice,
138
- audio_config=self._opts.audio_config,
139
- )
140
174
 
141
- data = response.audio_content
142
- if self._opts.audio_config.audio_encoding == "mp3":
143
- decoder = utils.codecs.Mp3StreamDecoder()
144
- bstream = utils.audio.AudioByteStream(
145
- sample_rate=self._opts.audio_config.sample_rate_hertz, num_channels=1
175
+ try:
176
+ response: SynthesizeSpeechResponse = await self._client.synthesize_speech(
177
+ input=texttospeech.SynthesisInput(text=self._input_text),
178
+ voice=self._opts.voice,
179
+ audio_config=self._opts.audio_config,
146
180
  )
147
- for frame in decoder.decode_chunk(data):
148
- for frame in bstream.write(frame.data):
149
- self._event_ch.send_nowait(
150
- tts.SynthesizedAudio(
151
- request_id=request_id, segment_id=segment_id, frame=frame
181
+
182
+ data = response.audio_content
183
+ if self._opts.audio_config.audio_encoding == "mp3":
184
+ decoder = utils.codecs.Mp3StreamDecoder()
185
+ bstream = utils.audio.AudioByteStream(
186
+ sample_rate=self._opts.audio_config.sample_rate_hertz,
187
+ num_channels=1,
188
+ )
189
+ for frame in decoder.decode_chunk(data):
190
+ for frame in bstream.write(frame.data.tobytes()):
191
+ self._event_ch.send_nowait(
192
+ tts.SynthesizedAudio(request_id=request_id, frame=frame)
152
193
  )
153
- )
154
194
 
155
- for frame in bstream.flush():
195
+ for frame in bstream.flush():
196
+ self._event_ch.send_nowait(
197
+ tts.SynthesizedAudio(request_id=request_id, frame=frame)
198
+ )
199
+ else:
200
+ data = data[44:] # skip WAV header
156
201
  self._event_ch.send_nowait(
157
202
  tts.SynthesizedAudio(
158
- request_id=request_id, segment_id=segment_id, frame=frame
203
+ request_id=request_id,
204
+ frame=rtc.AudioFrame(
205
+ data=data,
206
+ sample_rate=self._opts.audio_config.sample_rate_hertz,
207
+ num_channels=1,
208
+ samples_per_channel=len(data) // 2, # 16-bit
209
+ ),
159
210
  )
160
211
  )
161
- else:
162
- data = data[44:] # skip WAV header
163
- self._event_ch.send_nowait(
164
- tts.SynthesizedAudio(
165
- request_id=request_id,
166
- segment_id=segment_id,
167
- frame=rtc.AudioFrame(
168
- data=data,
169
- sample_rate=self._opts.audio_config.sample_rate_hertz,
170
- num_channels=1,
171
- samples_per_channel=len(data) // 2, # 16-bit
172
- ),
173
- )
212
+
213
+ except DeadlineExceeded:
214
+ raise APITimeoutError()
215
+ except GoogleAPICallError as e:
216
+ raise APIStatusError(
217
+ e.message,
218
+ status_code=e.code or -1,
219
+ request_id=None,
220
+ body=None,
174
221
  )
222
+ except Exception as e:
223
+ raise APIConnectionError() from e
224
+
225
+
226
+ def _gender_from_str(gender: str) -> SsmlVoiceGender:
227
+ ssml_gender = SsmlVoiceGender.NEUTRAL
228
+ if gender == "male":
229
+ ssml_gender = SsmlVoiceGender.MALE
230
+ elif gender == "female":
231
+ ssml_gender = SsmlVoiceGender.FEMALE
232
+
233
+ return ssml_gender # type: ignore
@@ -12,4 +12,4 @@
12
12
  # See the License for the specific language governing permissions and
13
13
  # limitations under the License.
14
14
 
15
- __version__ = "0.7.1"
15
+ __version__ = "0.7.3"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: livekit-plugins-google
3
- Version: 0.7.1
3
+ Version: 0.7.3
4
4
  Summary: Agent Framework plugin for services from Google Cloud
5
5
  Home-page: https://github.com/livekit/agents
6
6
  License: Apache-2.0
@@ -22,7 +22,7 @@ Description-Content-Type: text/markdown
22
22
  Requires-Dist: google-auth <3,>=2
23
23
  Requires-Dist: google-cloud-speech <3,>=2
24
24
  Requires-Dist: google-cloud-texttospeech <3,>=2
25
- Requires-Dist: livekit-agents >=0.8.0.dev0
25
+ Requires-Dist: livekit-agents >=0.11
26
26
 
27
27
  # LiveKit Plugins Google
28
28
 
@@ -0,0 +1,11 @@
1
+ livekit/plugins/google/__init__.py,sha256=rqV6C5mFNDFlrA2IcGJrsebr2VxQwMzoDUjY1JhMBZM,1117
2
+ livekit/plugins/google/log.py,sha256=GI3YWN5YzrafnUccljzPRS_ZALkMNk1i21IRnTl2vNA,69
3
+ livekit/plugins/google/models.py,sha256=n8pgTJ7xyJpPCZJ_y0GzaQq6LqYknL6K6trpi07-AxM,1307
4
+ livekit/plugins/google/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
5
+ livekit/plugins/google/stt.py,sha256=WjeqYsunW8jY-WHlnNeks7gR-TiojMRR7LYdAVdCxqY,15268
6
+ livekit/plugins/google/tts.py,sha256=hRN8ul1lDXU8LPVEfbTszgBiRYsifZXCPMwk-Pv2KeA,8793
7
+ livekit/plugins/google/version.py,sha256=yJeG0VwiekDJAk7GHcIAe43ebagJgloe-ZsqEGZnqzE,600
8
+ livekit_plugins_google-0.7.3.dist-info/METADATA,sha256=8UvORpoVunOTq0xKxHEk8M3sexKFnBnu66DkEJCnrRY,1647
9
+ livekit_plugins_google-0.7.3.dist-info/WHEEL,sha256=R06PA3UVYHThwHvxuRWMqaGcr-PuniXahwjmQRFMEkY,91
10
+ livekit_plugins_google-0.7.3.dist-info/top_level.txt,sha256=OoDok3xUmXbZRvOrfvvXB-Juu4DX79dlq188E19YHoo,8
11
+ livekit_plugins_google-0.7.3.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: setuptools (75.1.0)
2
+ Generator: setuptools (75.5.0)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5
 
@@ -1,11 +0,0 @@
1
- livekit/plugins/google/__init__.py,sha256=CYbSmm5fEw71F_r_4pEApGaWQ_r15Y3ZEocH88a4yc8,948
2
- livekit/plugins/google/log.py,sha256=GI3YWN5YzrafnUccljzPRS_ZALkMNk1i21IRnTl2vNA,69
3
- livekit/plugins/google/models.py,sha256=n8pgTJ7xyJpPCZJ_y0GzaQq6LqYknL6K6trpi07-AxM,1307
4
- livekit/plugins/google/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
5
- livekit/plugins/google/stt.py,sha256=XXDOISg-8U1MzVu543xLEB3-mr_NFKJp9qo1-ya2-Hc,13569
6
- livekit/plugins/google/tts.py,sha256=T9AHsxofwo3XaMciJPWh9O7lTZqDVYdQQlnFPiGWVbQ,6170
7
- livekit/plugins/google/version.py,sha256=JOBYrlKcxbTTRXkUKH0921GsmV-i71_KHczg2cgQiLc,600
8
- livekit_plugins_google-0.7.1.dist-info/METADATA,sha256=MyDLqZp1DC52KWx_Re3Hj0kO75l-Dg9z9IfiihtH4KY,1653
9
- livekit_plugins_google-0.7.1.dist-info/WHEEL,sha256=GV9aMThwP_4oNCtvEC2ec3qUYutgWeAzklro_0m4WJQ,91
10
- livekit_plugins_google-0.7.1.dist-info/top_level.txt,sha256=OoDok3xUmXbZRvOrfvvXB-Juu4DX79dlq188E19YHoo,8
11
- livekit_plugins_google-0.7.1.dist-info/RECORD,,