livekit-plugins-google 0.7.1__tar.gz → 0.7.3__tar.gz

This diff compares the contents of publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the changes between the versions exactly as they appear in their public registries.
Files changed (18)
  1. {livekit_plugins_google-0.7.1 → livekit_plugins_google-0.7.3}/PKG-INFO +2 -2
  2. {livekit_plugins_google-0.7.1 → livekit_plugins_google-0.7.3}/livekit/plugins/google/__init__.py +9 -0
  3. {livekit_plugins_google-0.7.1 → livekit_plugins_google-0.7.3}/livekit/plugins/google/stt.py +59 -10
  4. livekit_plugins_google-0.7.3/livekit/plugins/google/tts.py +233 -0
  5. {livekit_plugins_google-0.7.1 → livekit_plugins_google-0.7.3}/livekit/plugins/google/version.py +1 -1
  6. {livekit_plugins_google-0.7.1 → livekit_plugins_google-0.7.3}/livekit_plugins_google.egg-info/PKG-INFO +2 -2
  7. {livekit_plugins_google-0.7.1 → livekit_plugins_google-0.7.3}/livekit_plugins_google.egg-info/requires.txt +1 -1
  8. {livekit_plugins_google-0.7.1 → livekit_plugins_google-0.7.3}/setup.py +1 -1
  9. livekit_plugins_google-0.7.1/livekit/plugins/google/tts.py +0 -174
  10. {livekit_plugins_google-0.7.1 → livekit_plugins_google-0.7.3}/README.md +0 -0
  11. {livekit_plugins_google-0.7.1 → livekit_plugins_google-0.7.3}/livekit/plugins/google/log.py +0 -0
  12. {livekit_plugins_google-0.7.1 → livekit_plugins_google-0.7.3}/livekit/plugins/google/models.py +0 -0
  13. {livekit_plugins_google-0.7.1 → livekit_plugins_google-0.7.3}/livekit/plugins/google/py.typed +0 -0
  14. {livekit_plugins_google-0.7.1 → livekit_plugins_google-0.7.3}/livekit_plugins_google.egg-info/SOURCES.txt +0 -0
  15. {livekit_plugins_google-0.7.1 → livekit_plugins_google-0.7.3}/livekit_plugins_google.egg-info/dependency_links.txt +0 -0
  16. {livekit_plugins_google-0.7.1 → livekit_plugins_google-0.7.3}/livekit_plugins_google.egg-info/top_level.txt +0 -0
  17. {livekit_plugins_google-0.7.1 → livekit_plugins_google-0.7.3}/pyproject.toml +0 -0
  18. {livekit_plugins_google-0.7.1 → livekit_plugins_google-0.7.3}/setup.cfg +0 -0
{livekit_plugins_google-0.7.1 → livekit_plugins_google-0.7.3}/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: livekit-plugins-google
- Version: 0.7.1
+ Version: 0.7.3
  Summary: Agent Framework plugin for services from Google Cloud
  Home-page: https://github.com/livekit/agents
  License: Apache-2.0
@@ -22,7 +22,7 @@ Description-Content-Type: text/markdown
  Requires-Dist: google-auth<3,>=2
  Requires-Dist: google-cloud-speech<3,>=2
  Requires-Dist: google-cloud-texttospeech<3,>=2
- Requires-Dist: livekit-agents>=0.8.0.dev0
+ Requires-Dist: livekit-agents>=0.11
 
  # LiveKit Plugins Google
 
{livekit_plugins_google-0.7.1 → livekit_plugins_google-0.7.3}/livekit/plugins/google/__init__.py
@@ -29,3 +29,12 @@ class GooglePlugin(Plugin):
 
 
  Plugin.register_plugin(GooglePlugin())
+
+ # Cleanup docs of unexported modules
+ _module = dir()
+ NOT_IN_ALL = [m for m in _module if m not in __all__]
+
+ __pdoc__ = {}
+
+ for n in NOT_IN_ALL:
+     __pdoc__[n] = False
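
The block added to __init__.py is the standard pdoc hook: any name mapped to False in a module-level __pdoc__ dict is omitted from the generated documentation, so everything outside __all__ stays out of the docs. A minimal sketch of the same pattern in an unrelated, hypothetical module:

    # hypothetical module; only STT and TTS should appear in pdoc output
    __all__ = ["STT", "TTS"]

    # pdoc skips every name whose __pdoc__ entry is False
    __pdoc__ = {name: False for name in dir() if name not in __all__}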
{livekit_plugins_google-0.7.1 → livekit_plugins_google-0.7.3}/livekit/plugins/google/stt.py
@@ -20,8 +20,15 @@ from dataclasses import dataclass
  from typing import AsyncIterable, List, Union
 
  from livekit import agents, rtc
- from livekit.agents import stt, utils
-
+ from livekit.agents import (
+     APIConnectionError,
+     APIStatusError,
+     APITimeoutError,
+     stt,
+     utils,
+ )
+
+ from google.api_core.exceptions import Aborted, DeadlineExceeded, GoogleAPICallError
  from google.auth import default as gauth_default
  from google.auth.exceptions import DefaultCredentialsError
  from google.cloud.speech_v2 import SpeechAsyncClient
@@ -43,6 +50,25 @@ class STTOptions:
      punctuate: bool
      spoken_punctuation: bool
      model: SpeechModels
+     keywords: List[tuple[str, float]] | None
+
+     def build_adaptation(self) -> cloud_speech.SpeechAdaptation | None:
+         if self.keywords:
+             return cloud_speech.SpeechAdaptation(
+                 phrase_sets=[
+                     cloud_speech.SpeechAdaptation.AdaptationPhraseSet(
+                         inline_phrase_set=cloud_speech.PhraseSet(
+                             phrases=[
+                                 cloud_speech.PhraseSet.Phrase(
+                                     value=keyword, boost=boost
+                                 )
+                                 for keyword, boost in self.keywords
+                             ]
+                         )
+                     )
+                 ]
+             )
+         return None
 
 
  class STT(stt.STT):
@@ -57,6 +83,7 @@ class STT(stt.STT):
          model: SpeechModels = "long",
          credentials_info: dict | None = None,
          credentials_file: str | None = None,
+         keywords: List[tuple[str, float]] | None = None,
      ):
          """
          Create a new instance of Google STT.
@@ -93,6 +120,7 @@ class STT(stt.STT):
              punctuate=punctuate,
              spoken_punctuation=spoken_punctuation,
              model=model,
+             keywords=keywords,
          )
 
      def _ensure_client(self) -> SpeechAsyncClient:
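
The new keywords argument threads through STTOptions and the build_adaptation() method above: each (phrase, boost) pair becomes an inline PhraseSet attached to both the batch and the streaming RecognitionConfig. A rough usage sketch — the surrounding setup is illustrative, and boost values follow Google Cloud's phrase-boost scale (roughly 0 to 20):

    from livekit.plugins import google

    # Bias recognition toward domain terms the model tends to miss.
    stt = google.STT(
        model="long",
        keywords=[("LiveKit", 15.0), ("WebRTC", 10.0)],
    )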
@@ -141,7 +169,7 @@ class STT(stt.STT):
 
          return config
 
-     async def recognize(
+     async def _recognize_impl(
          self,
          buffer: utils.AudioBuffer,
          *,
@@ -156,6 +184,7 @@ class STT(stt.STT):
                  sample_rate_hertz=frame.sample_rate,
                  audio_channel_count=frame.num_channels,
              ),
+             adaptation=config.build_adaptation(),
              features=cloud_speech.RecognitionFeatures(
                  enable_automatic_punctuation=config.punctuate,
                  enable_spoken_punctuation=config.spoken_punctuation,
@@ -165,23 +194,39 @@ class STT(stt.STT):
              language_codes=config.languages,
          )
 
-         raw = await self._ensure_client().recognize(
-             cloud_speech.RecognizeRequest(
-                 recognizer=self._recognizer, config=config, content=frame.data.tobytes()
+         try:
+             raw = await self._ensure_client().recognize(
+                 cloud_speech.RecognizeRequest(
+                     recognizer=self._recognizer,
+                     config=config,
+                     content=frame.data.tobytes(),
+                 )
              )
-         )
-         return _recognize_response_to_speech_event(raw)
+
+             return _recognize_response_to_speech_event(raw)
+         except DeadlineExceeded:
+             raise APITimeoutError()
+         except GoogleAPICallError as e:
+             raise APIStatusError(
+                 e.message,
+                 status_code=e.code or -1,
+                 request_id=None,
+                 body=None,
+             )
+         except Exception as e:
+             raise APIConnectionError() from e
 
      def stream(
          self, *, language: SpeechLanguages | str | None = None
      ) -> "SpeechStream":
          config = self._sanitize_options(language=language)
-         return SpeechStream(self._ensure_client(), self._recognizer, config)
+         return SpeechStream(self, self._ensure_client(), self._recognizer, config)
 
 
  class SpeechStream(stt.SpeechStream):
      def __init__(
          self,
+         stt: STT,
          client: SpeechAsyncClient,
          recognizer: str,
          config: STTOptions,
@@ -189,7 +234,7 @@ class SpeechStream(stt.SpeechStream):
          num_channels: int = 1,
          max_retry: int = 32,
      ) -> None:
-         super().__init__()
+         super().__init__(stt)
 
          self._client = client
          self._recognizer = recognizer
@@ -205,6 +250,7 @@ class SpeechStream(stt.SpeechStream):
                  sample_rate_hertz=self._sample_rate,
                  audio_channel_count=self._num_channels,
              ),
+             adaptation=config.build_adaptation(),
              language_codes=self._config.languages,
              model=self._config.model,
              features=cloud_speech.RecognitionFeatures(
@@ -257,6 +303,9 @@ class SpeechStream(stt.SpeechStream):
                  retry_count = 0  # connection successful, reset retry count
 
                  await self._run_stream(stream)
+             except Aborted:
+                 logger.error("google stt connection aborted")
+                 break
              except Exception as e:
                  if retry_count >= max_retry:
                      logger.error(
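
With the rename to _recognize_impl and the new except clauses, Google errors are normalized to the framework's exception family: DeadlineExceeded maps to APITimeoutError, any other GoogleAPICallError to APIStatusError (carrying the status code), Aborted ends the streaming reconnect loop, and everything else surfaces as APIConnectionError. Callers can therefore treat Google STT like any other provider; a hedged caller-side sketch (the wrapper function is illustrative, not part of the plugin):

    from livekit.agents import APIConnectionError, APIStatusError, APITimeoutError

    async def transcribe_once(stt, buffer):
        # `stt` is a google.STT instance, `buffer` a utils.AudioBuffer
        try:
            return await stt.recognize(buffer)
        except APITimeoutError:
            ...  # safe to retry with backoff
        except APIStatusError as e:
            ...  # inspect e.status_code reported by Google
        except APIConnectionError:
            ...  # transport-level failure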
livekit_plugins_google-0.7.3/livekit/plugins/google/tts.py (new file)
@@ -0,0 +1,233 @@
+ # Copyright 2023 LiveKit, Inc.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ from __future__ import annotations
+
+ from dataclasses import dataclass
+
+ from livekit import rtc
+ from livekit.agents import (
+     APIConnectionError,
+     APIStatusError,
+     APITimeoutError,
+     tts,
+     utils,
+ )
+
+ from google.api_core.exceptions import DeadlineExceeded, GoogleAPICallError
+ from google.cloud import texttospeech
+ from google.cloud.texttospeech_v1.types import SsmlVoiceGender, SynthesizeSpeechResponse
+
+ from .models import AudioEncoding, Gender, SpeechLanguages
+
+
+ @dataclass
+ class _TTSOptions:
+     voice: texttospeech.VoiceSelectionParams
+     audio_config: texttospeech.AudioConfig
+
+
+ class TTS(tts.TTS):
+     def __init__(
+         self,
+         *,
+         language: SpeechLanguages | str = "en-US",
+         gender: Gender | str = "neutral",
+         voice_name: str = "",  # Not required
+         encoding: AudioEncoding | str = "linear16",
+         sample_rate: int = 24000,
+         pitch: int = 0,
+         effects_profile_id: str = "",
+         speaking_rate: float = 1.0,
+         credentials_info: dict | None = None,
+         credentials_file: str | None = None,
+     ) -> None:
+         """
+         Create a new instance of Google TTS.
+
+         Credentials must be provided, either by using the ``credentials_info`` dict, or reading
+         from the file specified in ``credentials_file`` or the ``GOOGLE_APPLICATION_CREDENTIALS``
+         environmental variable.
+
+         Args:
+             language (SpeechLanguages | str, optional): Language code (e.g., "en-US"). Default is "en-US".
+             gender (Gender | str, optional): Voice gender ("male", "female", "neutral"). Default is "neutral".
+             voice_name (str, optional): Specific voice name. Default is an empty string.
+             encoding (AudioEncoding | str, optional): Audio encoding format (e.g., "linear16"). Default is "linear16".
+             sample_rate (int, optional): Audio sample rate in Hz. Default is 24000.
+             pitch (float, optional): Speaking pitch, ranging from -20.0 to 20.0 semitones relative to the original pitch. Default is 0.
+             effects_profile_id (str): Optional identifier for selecting audio effects profiles to apply to the synthesized speech.
+             speaking_rate (float, optional): Speed of speech. Default is 1.0.
+             credentials_info (dict, optional): Dictionary containing Google Cloud credentials. Default is None.
+             credentials_file (str, optional): Path to the Google Cloud credentials JSON file. Default is None.
+         """
+
+         super().__init__(
+             capabilities=tts.TTSCapabilities(
+                 streaming=False,
+             ),
+             sample_rate=sample_rate,
+             num_channels=1,
+         )
+
+         self._client: texttospeech.TextToSpeechAsyncClient | None = None
+         self._credentials_info = credentials_info
+         self._credentials_file = credentials_file
+
+         voice = texttospeech.VoiceSelectionParams(
+             name=voice_name,
+             language_code=language,
+             ssml_gender=_gender_from_str(gender),
+         )
+
+         if encoding == "linear16" or encoding == "wav":
+             _audio_encoding = texttospeech.AudioEncoding.LINEAR16
+         elif encoding == "mp3":
+             _audio_encoding = texttospeech.AudioEncoding.MP3
+         else:
+             raise NotImplementedError(f"audio encoding {encoding} is not supported")
+
+         self._opts = _TTSOptions(
+             voice=voice,
+             audio_config=texttospeech.AudioConfig(
+                 audio_encoding=_audio_encoding,
+                 sample_rate_hertz=sample_rate,
+                 pitch=pitch,
+                 effects_profile_id=effects_profile_id,
+                 speaking_rate=speaking_rate,
+             ),
+         )
+
+     def update_options(
+         self,
+         *,
+         language: SpeechLanguages | str = "en-US",
+         gender: Gender | str = "neutral",
+         voice_name: str = "",  # Not required
+         speaking_rate: float = 1.0,
+     ) -> None:
+         """
+         Update the TTS options.
+
+         Args:
+             language (SpeechLanguages | str, optional): Language code (e.g., "en-US"). Default is "en-US".
+             gender (Gender | str, optional): Voice gender ("male", "female", "neutral"). Default is "neutral".
+             voice_name (str, optional): Specific voice name. Default is an empty string.
+             speaking_rate (float, optional): Speed of speech. Default is 1.0.
+         """
+         self._opts.voice = texttospeech.VoiceSelectionParams(
+             name=voice_name,
+             language_code=language,
+             ssml_gender=_gender_from_str(gender),
+         )
+         self._opts.audio_config.speaking_rate = speaking_rate
+
+     def _ensure_client(self) -> texttospeech.TextToSpeechAsyncClient:
+         if not self._client:
+             if self._credentials_info:
+                 self._client = (
+                     texttospeech.TextToSpeechAsyncClient.from_service_account_info(
+                         self._credentials_info
+                     )
+                 )
+
+             elif self._credentials_file:
+                 self._client = (
+                     texttospeech.TextToSpeechAsyncClient.from_service_account_file(
+                         self._credentials_file
+                     )
+                 )
+             else:
+                 self._client = texttospeech.TextToSpeechAsyncClient()
+
+         assert self._client is not None
+         return self._client
+
+     def synthesize(self, text: str) -> "ChunkedStream":
+         return ChunkedStream(self, text, self._opts, self._ensure_client())
+
+
+ class ChunkedStream(tts.ChunkedStream):
+     def __init__(
+         self,
+         tts: TTS,
+         text: str,
+         opts: _TTSOptions,
+         client: texttospeech.TextToSpeechAsyncClient,
+     ) -> None:
+         super().__init__(tts, text)
+         self._opts, self._client = opts, client
+
+     async def _main_task(self) -> None:
+         request_id = utils.shortuuid()
+
+         try:
+             response: SynthesizeSpeechResponse = await self._client.synthesize_speech(
+                 input=texttospeech.SynthesisInput(text=self._input_text),
+                 voice=self._opts.voice,
+                 audio_config=self._opts.audio_config,
+             )
+
+             data = response.audio_content
+             if self._opts.audio_config.audio_encoding == "mp3":
+                 decoder = utils.codecs.Mp3StreamDecoder()
+                 bstream = utils.audio.AudioByteStream(
+                     sample_rate=self._opts.audio_config.sample_rate_hertz,
+                     num_channels=1,
+                 )
+                 for frame in decoder.decode_chunk(data):
+                     for frame in bstream.write(frame.data.tobytes()):
+                         self._event_ch.send_nowait(
+                             tts.SynthesizedAudio(request_id=request_id, frame=frame)
+                         )
+
+                 for frame in bstream.flush():
+                     self._event_ch.send_nowait(
+                         tts.SynthesizedAudio(request_id=request_id, frame=frame)
+                     )
+             else:
+                 data = data[44:]  # skip WAV header
+                 self._event_ch.send_nowait(
+                     tts.SynthesizedAudio(
+                         request_id=request_id,
+                         frame=rtc.AudioFrame(
+                             data=data,
+                             sample_rate=self._opts.audio_config.sample_rate_hertz,
+                             num_channels=1,
+                             samples_per_channel=len(data) // 2,  # 16-bit
+                         ),
+                     )
+                 )
+
+         except DeadlineExceeded:
+             raise APITimeoutError()
+         except GoogleAPICallError as e:
+             raise APIStatusError(
+                 e.message,
+                 status_code=e.code or -1,
+                 request_id=None,
+                 body=None,
+             )
+         except Exception as e:
+             raise APIConnectionError() from e
+
+
+ def _gender_from_str(gender: str) -> SsmlVoiceGender:
+     ssml_gender = SsmlVoiceGender.NEUTRAL
+     if gender == "male":
+         ssml_gender = SsmlVoiceGender.MALE
+     elif gender == "female":
+         ssml_gender = SsmlVoiceGender.FEMALE
+
+     return ssml_gender  # type: ignore
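
Compared with 0.7.1, the rewritten tts.py adds pitch and effects_profile_id to the constructor, an update_options() hook for changing voice and rate at runtime, and the same APITimeoutError / APIStatusError / APIConnectionError mapping used in stt.py. A rough usage sketch (parameter values are illustrative):

    from livekit.plugins import google

    tts = google.TTS(
        language="en-US",
        gender="female",
        sample_rate=24000,
        pitch=2,  # semitones relative to the default voice pitch
        effects_profile_id="telephony-class-application",
        speaking_rate=1.1,
    )

    # Voice and rate can later be swapped without rebuilding the instance.
    tts.update_options(language="en-GB", gender="male", speaking_rate=0.95)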
{livekit_plugins_google-0.7.1 → livekit_plugins_google-0.7.3}/livekit/plugins/google/version.py
@@ -12,4 +12,4 @@
  # See the License for the specific language governing permissions and
  # limitations under the License.
 
- __version__ = "0.7.1"
+ __version__ = "0.7.3"
{livekit_plugins_google-0.7.1 → livekit_plugins_google-0.7.3}/livekit_plugins_google.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: livekit-plugins-google
- Version: 0.7.1
+ Version: 0.7.3
  Summary: Agent Framework plugin for services from Google Cloud
  Home-page: https://github.com/livekit/agents
  License: Apache-2.0
@@ -22,7 +22,7 @@ Description-Content-Type: text/markdown
  Requires-Dist: google-auth<3,>=2
  Requires-Dist: google-cloud-speech<3,>=2
  Requires-Dist: google-cloud-texttospeech<3,>=2
- Requires-Dist: livekit-agents>=0.8.0.dev0
+ Requires-Dist: livekit-agents>=0.11
 
  # LiveKit Plugins Google
 
{livekit_plugins_google-0.7.1 → livekit_plugins_google-0.7.3}/livekit_plugins_google.egg-info/requires.txt
@@ -1,4 +1,4 @@
  google-auth<3,>=2
  google-cloud-speech<3,>=2
  google-cloud-texttospeech<3,>=2
- livekit-agents>=0.8.0.dev0
+ livekit-agents>=0.11
{livekit_plugins_google-0.7.1 → livekit_plugins_google-0.7.3}/setup.py
@@ -51,7 +51,7 @@ setuptools.setup(
          "google-auth >= 2, < 3",
          "google-cloud-speech >= 2, < 3",
          "google-cloud-texttospeech >= 2, < 3",
-         "livekit-agents>=0.8.0.dev0",
+         "livekit-agents>=0.11",
      ],
      package_data={"livekit.plugins.google": ["py.typed"]},
      project_urls={
livekit_plugins_google-0.7.1/livekit/plugins/google/tts.py (removed; replaced by the 0.7.3 file above)
@@ -1,174 +0,0 @@
- # Copyright 2023 LiveKit, Inc.
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- #     http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
-
- from __future__ import annotations
-
- from dataclasses import dataclass
- from typing import Union
-
- from livekit import rtc
- from livekit.agents import tts, utils
-
- from google.cloud import texttospeech
- from google.cloud.texttospeech_v1.types import SsmlVoiceGender, SynthesizeSpeechResponse
-
- from .log import logger
- from .models import AudioEncoding, Gender, SpeechLanguages
-
- LgType = Union[SpeechLanguages, str]
- GenderType = Union[Gender, str]
- AudioEncodingType = Union[AudioEncoding, str]
-
-
- @dataclass
- class _TTSOptions:
-     voice: texttospeech.VoiceSelectionParams
-     audio_config: texttospeech.AudioConfig
-
-
- class TTS(tts.TTS):
-     def __init__(
-         self,
-         *,
-         language: LgType = "en-US",
-         gender: GenderType = "neutral",
-         voice_name: str = "",  # Not required
-         encoding: AudioEncodingType = "linear16",
-         sample_rate: int = 24000,
-         speaking_rate: float = 1.0,
-         credentials_info: dict | None = None,
-         credentials_file: str | None = None,
-     ) -> None:
-         """
-         Create a new instance of Google TTS.
-
-         Credentials must be provided, either by using the ``credentials_info`` dict, or reading
-         from the file specified in ``credentials_file`` or the ``GOOGLE_APPLICATION_CREDENTIALS``
-         environmental variable.
-         """
-
-         super().__init__(
-             capabilities=tts.TTSCapabilities(
-                 streaming=False,
-             ),
-             sample_rate=sample_rate,
-             num_channels=1,
-         )
-
-         self._client: texttospeech.TextToSpeechAsyncClient | None = None
-         self._credentials_info = credentials_info
-         self._credentials_file = credentials_file
-
-         ssml_gender = SsmlVoiceGender.NEUTRAL
-         if gender == "male":
-             ssml_gender = SsmlVoiceGender.MALE
-         elif gender == "female":
-             ssml_gender = SsmlVoiceGender.FEMALE
-
-         voice = texttospeech.VoiceSelectionParams(
-             name=voice_name, language_code=language, ssml_gender=ssml_gender
-         )
-
-         if encoding == "linear16" or encoding == "wav":
-             _audio_encoding = texttospeech.AudioEncoding.LINEAR16
-         elif encoding == "mp3":
-             _audio_encoding = texttospeech.AudioEncoding.MP3
-         else:
-             raise NotImplementedError(f"audio encoding {encoding} is not supported")
-
-         self._opts = _TTSOptions(
-             voice=voice,
-             audio_config=texttospeech.AudioConfig(
-                 audio_encoding=_audio_encoding,
-                 sample_rate_hertz=sample_rate,
-                 speaking_rate=speaking_rate,
-             ),
-         )
-
-     def _ensure_client(self) -> texttospeech.TextToSpeechAsyncClient:
-         if not self._client:
-             if self._credentials_info:
-                 self._client = (
-                     texttospeech.TextToSpeechAsyncClient.from_service_account_info(
-                         self._credentials_info
-                     )
-                 )
-
-             elif self._credentials_file:
-                 self._client = (
-                     texttospeech.TextToSpeechAsyncClient.from_service_account_file(
-                         self._credentials_file
-                     )
-                 )
-             else:
-                 self._client = texttospeech.TextToSpeechAsyncClient()
-
-         assert self._client is not None
-         return self._client
-
-     def synthesize(self, text: str) -> "ChunkedStream":
-         return ChunkedStream(text, self._opts, self._ensure_client())
-
-
- class ChunkedStream(tts.ChunkedStream):
-     def __init__(
-         self, text: str, opts: _TTSOptions, client: texttospeech.TextToSpeechAsyncClient
-     ) -> None:
-         super().__init__()
-         self._text, self._opts, self._client = text, opts, client
-
-     @utils.log_exceptions(logger=logger)
-     async def _main_task(self) -> None:
-         request_id = utils.shortuuid()
-         segment_id = utils.shortuuid()
-         response: SynthesizeSpeechResponse = await self._client.synthesize_speech(
-             input=texttospeech.SynthesisInput(text=self._text),
-             voice=self._opts.voice,
-             audio_config=self._opts.audio_config,
-         )
-
-         data = response.audio_content
-         if self._opts.audio_config.audio_encoding == "mp3":
-             decoder = utils.codecs.Mp3StreamDecoder()
-             bstream = utils.audio.AudioByteStream(
-                 sample_rate=self._opts.audio_config.sample_rate_hertz, num_channels=1
-             )
-             for frame in decoder.decode_chunk(data):
-                 for frame in bstream.write(frame.data):
-                     self._event_ch.send_nowait(
-                         tts.SynthesizedAudio(
-                             request_id=request_id, segment_id=segment_id, frame=frame
-                         )
-                     )
-
-             for frame in bstream.flush():
-                 self._event_ch.send_nowait(
-                     tts.SynthesizedAudio(
-                         request_id=request_id, segment_id=segment_id, frame=frame
-                     )
-                 )
-         else:
-             data = data[44:]  # skip WAV header
-             self._event_ch.send_nowait(
-                 tts.SynthesizedAudio(
-                     request_id=request_id,
-                     segment_id=segment_id,
-                     frame=rtc.AudioFrame(
-                         data=data,
-                         sample_rate=self._opts.audio_config.sample_rate_hertz,
-                         num_channels=1,
-                         samples_per_channel=len(data) // 2,  # 16-bit
-                     ),
-                 )
-             )
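
The removed 0.7.1 file differs from its replacement in two externally visible ways: ChunkedStream is now constructed with the owning TTS instance (super().__init__(tts, text)), and SynthesizedAudio events no longer carry a segment_id, only request_id and frame. A hedged sketch of consuming the 0.7.3 stream, assuming `tts` is a configured google.TTS instance:

    async def speak(tts, text: str):
        # ChunkedStream is an async iterable; each event carries request_id and frame
        async for ev in tts.synthesize(text):
            frame = ev.frame  # rtc.AudioFrame of 16-bit PCM
            ...  # push the frame to an audio source / published track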