livekit-plugins-google 0.7.1__tar.gz → 0.7.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {livekit_plugins_google-0.7.1 → livekit_plugins_google-0.7.3}/PKG-INFO +2 -2
- {livekit_plugins_google-0.7.1 → livekit_plugins_google-0.7.3}/livekit/plugins/google/__init__.py +9 -0
- {livekit_plugins_google-0.7.1 → livekit_plugins_google-0.7.3}/livekit/plugins/google/stt.py +59 -10
- livekit_plugins_google-0.7.3/livekit/plugins/google/tts.py +233 -0
- {livekit_plugins_google-0.7.1 → livekit_plugins_google-0.7.3}/livekit/plugins/google/version.py +1 -1
- {livekit_plugins_google-0.7.1 → livekit_plugins_google-0.7.3}/livekit_plugins_google.egg-info/PKG-INFO +2 -2
- {livekit_plugins_google-0.7.1 → livekit_plugins_google-0.7.3}/livekit_plugins_google.egg-info/requires.txt +1 -1
- {livekit_plugins_google-0.7.1 → livekit_plugins_google-0.7.3}/setup.py +1 -1
- livekit_plugins_google-0.7.1/livekit/plugins/google/tts.py +0 -174
- {livekit_plugins_google-0.7.1 → livekit_plugins_google-0.7.3}/README.md +0 -0
- {livekit_plugins_google-0.7.1 → livekit_plugins_google-0.7.3}/livekit/plugins/google/log.py +0 -0
- {livekit_plugins_google-0.7.1 → livekit_plugins_google-0.7.3}/livekit/plugins/google/models.py +0 -0
- {livekit_plugins_google-0.7.1 → livekit_plugins_google-0.7.3}/livekit/plugins/google/py.typed +0 -0
- {livekit_plugins_google-0.7.1 → livekit_plugins_google-0.7.3}/livekit_plugins_google.egg-info/SOURCES.txt +0 -0
- {livekit_plugins_google-0.7.1 → livekit_plugins_google-0.7.3}/livekit_plugins_google.egg-info/dependency_links.txt +0 -0
- {livekit_plugins_google-0.7.1 → livekit_plugins_google-0.7.3}/livekit_plugins_google.egg-info/top_level.txt +0 -0
- {livekit_plugins_google-0.7.1 → livekit_plugins_google-0.7.3}/pyproject.toml +0 -0
- {livekit_plugins_google-0.7.1 → livekit_plugins_google-0.7.3}/setup.cfg +0 -0
{livekit_plugins_google-0.7.1 → livekit_plugins_google-0.7.3}/PKG-INFO RENAMED

```diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: livekit-plugins-google
-Version: 0.7.1
+Version: 0.7.3
 Summary: Agent Framework plugin for services from Google Cloud
 Home-page: https://github.com/livekit/agents
 License: Apache-2.0
@@ -22,7 +22,7 @@ Description-Content-Type: text/markdown
 Requires-Dist: google-auth<3,>=2
 Requires-Dist: google-cloud-speech<3,>=2
 Requires-Dist: google-cloud-texttospeech<3,>=2
-Requires-Dist: livekit-agents>=0.
+Requires-Dist: livekit-agents>=0.11
 
 # LiveKit Plugins Google
 
```
{livekit_plugins_google-0.7.1 → livekit_plugins_google-0.7.3}/livekit/plugins/google/__init__.py RENAMED

```diff
@@ -29,3 +29,12 @@ class GooglePlugin(Plugin):
 
 
 Plugin.register_plugin(GooglePlugin())
+
+# Cleanup docs of unexported modules
+_module = dir()
+NOT_IN_ALL = [m for m in _module if m not in __all__]
+
+__pdoc__ = {}
+
+for n in NOT_IN_ALL:
+    __pdoc__[n] = False
```
{livekit_plugins_google-0.7.1 → livekit_plugins_google-0.7.3}/livekit/plugins/google/stt.py RENAMED

```diff
@@ -20,8 +20,15 @@ from dataclasses import dataclass
 from typing import AsyncIterable, List, Union
 
 from livekit import agents, rtc
-from livekit.agents import
-
+from livekit.agents import (
+    APIConnectionError,
+    APIStatusError,
+    APITimeoutError,
+    stt,
+    utils,
+)
+
+from google.api_core.exceptions import Aborted, DeadlineExceeded, GoogleAPICallError
 from google.auth import default as gauth_default
 from google.auth.exceptions import DefaultCredentialsError
 from google.cloud.speech_v2 import SpeechAsyncClient
@@ -43,6 +50,25 @@ class STTOptions:
     punctuate: bool
     spoken_punctuation: bool
     model: SpeechModels
+    keywords: List[tuple[str, float]] | None
+
+    def build_adaptation(self) -> cloud_speech.SpeechAdaptation | None:
+        if self.keywords:
+            return cloud_speech.SpeechAdaptation(
+                phrase_sets=[
+                    cloud_speech.SpeechAdaptation.AdaptationPhraseSet(
+                        inline_phrase_set=cloud_speech.PhraseSet(
+                            phrases=[
+                                cloud_speech.PhraseSet.Phrase(
+                                    value=keyword, boost=boost
+                                )
+                                for keyword, boost in self.keywords
+                            ]
+                        )
+                    )
+                ]
+            )
+        return None
 
 
 class STT(stt.STT):
@@ -57,6 +83,7 @@ class STT(stt.STT):
         model: SpeechModels = "long",
         credentials_info: dict | None = None,
         credentials_file: str | None = None,
+        keywords: List[tuple[str, float]] | None = None,
     ):
         """
         Create a new instance of Google STT.
@@ -93,6 +120,7 @@ class STT(stt.STT):
             punctuate=punctuate,
             spoken_punctuation=spoken_punctuation,
             model=model,
+            keywords=keywords,
         )
 
     def _ensure_client(self) -> SpeechAsyncClient:
@@ -141,7 +169,7 @@ class STT(stt.STT):
 
         return config
 
-    async def
+    async def _recognize_impl(
         self,
         buffer: utils.AudioBuffer,
         *,
@@ -156,6 +184,7 @@ class STT(stt.STT):
                 sample_rate_hertz=frame.sample_rate,
                 audio_channel_count=frame.num_channels,
             ),
+            adaptation=config.build_adaptation(),
             features=cloud_speech.RecognitionFeatures(
                 enable_automatic_punctuation=config.punctuate,
                 enable_spoken_punctuation=config.spoken_punctuation,
@@ -165,23 +194,39 @@ class STT(stt.STT):
             language_codes=config.languages,
         )
 
-
-
-
+        try:
+            raw = await self._ensure_client().recognize(
+                cloud_speech.RecognizeRequest(
+                    recognizer=self._recognizer,
+                    config=config,
+                    content=frame.data.tobytes(),
+                )
             )
-
-
+
+            return _recognize_response_to_speech_event(raw)
+        except DeadlineExceeded:
+            raise APITimeoutError()
+        except GoogleAPICallError as e:
+            raise APIStatusError(
+                e.message,
+                status_code=e.code or -1,
+                request_id=None,
+                body=None,
+            )
+        except Exception as e:
+            raise APIConnectionError() from e
 
     def stream(
         self, *, language: SpeechLanguages | str | None = None
     ) -> "SpeechStream":
         config = self._sanitize_options(language=language)
-        return SpeechStream(self._ensure_client(), self._recognizer, config)
+        return SpeechStream(self, self._ensure_client(), self._recognizer, config)
 
 
 class SpeechStream(stt.SpeechStream):
     def __init__(
         self,
+        stt: STT,
         client: SpeechAsyncClient,
         recognizer: str,
         config: STTOptions,
@@ -189,7 +234,7 @@ class SpeechStream(stt.SpeechStream):
         num_channels: int = 1,
         max_retry: int = 32,
     ) -> None:
-        super().__init__()
+        super().__init__(stt)
 
         self._client = client
         self._recognizer = recognizer
@@ -205,6 +250,7 @@ class SpeechStream(stt.SpeechStream):
                     sample_rate_hertz=self._sample_rate,
                     audio_channel_count=self._num_channels,
                 ),
+                adaptation=config.build_adaptation(),
                 language_codes=self._config.languages,
                 model=self._config.model,
                 features=cloud_speech.RecognitionFeatures(
@@ -257,6 +303,9 @@ class SpeechStream(stt.SpeechStream):
                 retry_count = 0  # connection successful, reset retry count
 
                 await self._run_stream(stream)
+            except Aborted:
+                logger.error("google stt connection aborted")
+                break
             except Exception as e:
                 if retry_count >= max_retry:
                     logger.error(
```
livekit_plugins_google-0.7.3/livekit/plugins/google/tts.py ADDED

```diff
@@ -0,0 +1,233 @@
+# Copyright 2023 LiveKit, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+
+from livekit import rtc
+from livekit.agents import (
+    APIConnectionError,
+    APIStatusError,
+    APITimeoutError,
+    tts,
+    utils,
+)
+
+from google.api_core.exceptions import DeadlineExceeded, GoogleAPICallError
+from google.cloud import texttospeech
+from google.cloud.texttospeech_v1.types import SsmlVoiceGender, SynthesizeSpeechResponse
+
+from .models import AudioEncoding, Gender, SpeechLanguages
+
+
+@dataclass
+class _TTSOptions:
+    voice: texttospeech.VoiceSelectionParams
+    audio_config: texttospeech.AudioConfig
+
+
+class TTS(tts.TTS):
+    def __init__(
+        self,
+        *,
+        language: SpeechLanguages | str = "en-US",
+        gender: Gender | str = "neutral",
+        voice_name: str = "",  # Not required
+        encoding: AudioEncoding | str = "linear16",
+        sample_rate: int = 24000,
+        pitch: int = 0,
+        effects_profile_id: str = "",
+        speaking_rate: float = 1.0,
+        credentials_info: dict | None = None,
+        credentials_file: str | None = None,
+    ) -> None:
+        """
+        Create a new instance of Google TTS.
+
+        Credentials must be provided, either by using the ``credentials_info`` dict, or reading
+        from the file specified in ``credentials_file`` or the ``GOOGLE_APPLICATION_CREDENTIALS``
+        environmental variable.
+
+        Args:
+            language (SpeechLanguages | str, optional): Language code (e.g., "en-US"). Default is "en-US".
+            gender (Gender | str, optional): Voice gender ("male", "female", "neutral"). Default is "neutral".
+            voice_name (str, optional): Specific voice name. Default is an empty string.
+            encoding (AudioEncoding | str, optional): Audio encoding format (e.g., "linear16"). Default is "linear16".
+            sample_rate (int, optional): Audio sample rate in Hz. Default is 24000.
+            pitch (float, optional): Speaking pitch, ranging from -20.0 to 20.0 semitones relative to the original pitch. Default is 0.
+            effects_profile_id (str): Optional identifier for selecting audio effects profiles to apply to the synthesized speech.
+            speaking_rate (float, optional): Speed of speech. Default is 1.0.
+            credentials_info (dict, optional): Dictionary containing Google Cloud credentials. Default is None.
+            credentials_file (str, optional): Path to the Google Cloud credentials JSON file. Default is None.
+        """
+
+        super().__init__(
+            capabilities=tts.TTSCapabilities(
+                streaming=False,
+            ),
+            sample_rate=sample_rate,
+            num_channels=1,
+        )
+
+        self._client: texttospeech.TextToSpeechAsyncClient | None = None
+        self._credentials_info = credentials_info
+        self._credentials_file = credentials_file
+
+        voice = texttospeech.VoiceSelectionParams(
+            name=voice_name,
+            language_code=language,
+            ssml_gender=_gender_from_str(gender),
+        )
+
+        if encoding == "linear16" or encoding == "wav":
+            _audio_encoding = texttospeech.AudioEncoding.LINEAR16
+        elif encoding == "mp3":
+            _audio_encoding = texttospeech.AudioEncoding.MP3
+        else:
+            raise NotImplementedError(f"audio encoding {encoding} is not supported")
+
+        self._opts = _TTSOptions(
+            voice=voice,
+            audio_config=texttospeech.AudioConfig(
+                audio_encoding=_audio_encoding,
+                sample_rate_hertz=sample_rate,
+                pitch=pitch,
+                effects_profile_id=effects_profile_id,
+                speaking_rate=speaking_rate,
+            ),
+        )
+
+    def update_options(
+        self,
+        *,
+        language: SpeechLanguages | str = "en-US",
+        gender: Gender | str = "neutral",
+        voice_name: str = "",  # Not required
+        speaking_rate: float = 1.0,
+    ) -> None:
+        """
+        Update the TTS options.
+
+        Args:
+            language (SpeechLanguages | str, optional): Language code (e.g., "en-US"). Default is "en-US".
+            gender (Gender | str, optional): Voice gender ("male", "female", "neutral"). Default is "neutral".
+            voice_name (str, optional): Specific voice name. Default is an empty string.
+            speaking_rate (float, optional): Speed of speech. Default is 1.0.
+        """
+        self._opts.voice = texttospeech.VoiceSelectionParams(
+            name=voice_name,
+            language_code=language,
+            ssml_gender=_gender_from_str(gender),
+        )
+        self._opts.audio_config.speaking_rate = speaking_rate
+
+    def _ensure_client(self) -> texttospeech.TextToSpeechAsyncClient:
+        if not self._client:
+            if self._credentials_info:
+                self._client = (
+                    texttospeech.TextToSpeechAsyncClient.from_service_account_info(
+                        self._credentials_info
+                    )
+                )
+
+            elif self._credentials_file:
+                self._client = (
+                    texttospeech.TextToSpeechAsyncClient.from_service_account_file(
+                        self._credentials_file
+                    )
+                )
+            else:
+                self._client = texttospeech.TextToSpeechAsyncClient()
+
+        assert self._client is not None
+        return self._client
+
+    def synthesize(self, text: str) -> "ChunkedStream":
+        return ChunkedStream(self, text, self._opts, self._ensure_client())
+
+
+class ChunkedStream(tts.ChunkedStream):
+    def __init__(
+        self,
+        tts: TTS,
+        text: str,
+        opts: _TTSOptions,
+        client: texttospeech.TextToSpeechAsyncClient,
+    ) -> None:
+        super().__init__(tts, text)
+        self._opts, self._client = opts, client
+
+    async def _main_task(self) -> None:
+        request_id = utils.shortuuid()
+
+        try:
+            response: SynthesizeSpeechResponse = await self._client.synthesize_speech(
+                input=texttospeech.SynthesisInput(text=self._input_text),
+                voice=self._opts.voice,
+                audio_config=self._opts.audio_config,
+            )
+
+            data = response.audio_content
+            if self._opts.audio_config.audio_encoding == "mp3":
+                decoder = utils.codecs.Mp3StreamDecoder()
+                bstream = utils.audio.AudioByteStream(
+                    sample_rate=self._opts.audio_config.sample_rate_hertz,
+                    num_channels=1,
+                )
+                for frame in decoder.decode_chunk(data):
+                    for frame in bstream.write(frame.data.tobytes()):
+                        self._event_ch.send_nowait(
+                            tts.SynthesizedAudio(request_id=request_id, frame=frame)
+                        )
+
+                for frame in bstream.flush():
+                    self._event_ch.send_nowait(
+                        tts.SynthesizedAudio(request_id=request_id, frame=frame)
+                    )
+            else:
+                data = data[44:]  # skip WAV header
+                self._event_ch.send_nowait(
+                    tts.SynthesizedAudio(
+                        request_id=request_id,
+                        frame=rtc.AudioFrame(
+                            data=data,
+                            sample_rate=self._opts.audio_config.sample_rate_hertz,
+                            num_channels=1,
+                            samples_per_channel=len(data) // 2,  # 16-bit
+                        ),
+                    )
+                )
+
+        except DeadlineExceeded:
+            raise APITimeoutError()
+        except GoogleAPICallError as e:
+            raise APIStatusError(
+                e.message,
+                status_code=e.code or -1,
+                request_id=None,
+                body=None,
+            )
+        except Exception as e:
+            raise APIConnectionError() from e
+
+
+def _gender_from_str(gender: str) -> SsmlVoiceGender:
+    ssml_gender = SsmlVoiceGender.NEUTRAL
+    if gender == "male":
+        ssml_gender = SsmlVoiceGender.MALE
+    elif gender == "female":
+        ssml_gender = SsmlVoiceGender.FEMALE
+
+    return ssml_gender  # type: ignore
```
{livekit_plugins_google-0.7.1 → livekit_plugins_google-0.7.3}/livekit_plugins_google.egg-info/PKG-INFO RENAMED

```diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: livekit-plugins-google
-Version: 0.7.1
+Version: 0.7.3
 Summary: Agent Framework plugin for services from Google Cloud
 Home-page: https://github.com/livekit/agents
 License: Apache-2.0
@@ -22,7 +22,7 @@ Description-Content-Type: text/markdown
 Requires-Dist: google-auth<3,>=2
 Requires-Dist: google-cloud-speech<3,>=2
 Requires-Dist: google-cloud-texttospeech<3,>=2
-Requires-Dist: livekit-agents>=0.
+Requires-Dist: livekit-agents>=0.11
 
 # LiveKit Plugins Google
 
```
{livekit_plugins_google-0.7.1 → livekit_plugins_google-0.7.3}/setup.py RENAMED

```diff
@@ -51,7 +51,7 @@ setuptools.setup(
         "google-auth >= 2, < 3",
         "google-cloud-speech >= 2, < 3",
         "google-cloud-texttospeech >= 2, < 3",
-        "livekit-agents>=0.
+        "livekit-agents>=0.11",
     ],
     package_data={"livekit.plugins.google": ["py.typed"]},
     project_urls={
```
livekit_plugins_google-0.7.1/livekit/plugins/google/tts.py DELETED

```diff
@@ -1,174 +0,0 @@
-# Copyright 2023 LiveKit, Inc.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import annotations
-
-from dataclasses import dataclass
-from typing import Union
-
-from livekit import rtc
-from livekit.agents import tts, utils
-
-from google.cloud import texttospeech
-from google.cloud.texttospeech_v1.types import SsmlVoiceGender, SynthesizeSpeechResponse
-
-from .log import logger
-from .models import AudioEncoding, Gender, SpeechLanguages
-
-LgType = Union[SpeechLanguages, str]
-GenderType = Union[Gender, str]
-AudioEncodingType = Union[AudioEncoding, str]
-
-
-@dataclass
-class _TTSOptions:
-    voice: texttospeech.VoiceSelectionParams
-    audio_config: texttospeech.AudioConfig
-
-
-class TTS(tts.TTS):
-    def __init__(
-        self,
-        *,
-        language: LgType = "en-US",
-        gender: GenderType = "neutral",
-        voice_name: str = "",  # Not required
-        encoding: AudioEncodingType = "linear16",
-        sample_rate: int = 24000,
-        speaking_rate: float = 1.0,
-        credentials_info: dict | None = None,
-        credentials_file: str | None = None,
-    ) -> None:
-        """
-        Create a new instance of Google TTS.
-
-        Credentials must be provided, either by using the ``credentials_info`` dict, or reading
-        from the file specified in ``credentials_file`` or the ``GOOGLE_APPLICATION_CREDENTIALS``
-        environmental variable.
-        """
-
-        super().__init__(
-            capabilities=tts.TTSCapabilities(
-                streaming=False,
-            ),
-            sample_rate=sample_rate,
-            num_channels=1,
-        )
-
-        self._client: texttospeech.TextToSpeechAsyncClient | None = None
-        self._credentials_info = credentials_info
-        self._credentials_file = credentials_file
-
-        ssml_gender = SsmlVoiceGender.NEUTRAL
-        if gender == "male":
-            ssml_gender = SsmlVoiceGender.MALE
-        elif gender == "female":
-            ssml_gender = SsmlVoiceGender.FEMALE
-
-        voice = texttospeech.VoiceSelectionParams(
-            name=voice_name, language_code=language, ssml_gender=ssml_gender
-        )
-
-        if encoding == "linear16" or encoding == "wav":
-            _audio_encoding = texttospeech.AudioEncoding.LINEAR16
-        elif encoding == "mp3":
-            _audio_encoding = texttospeech.AudioEncoding.MP3
-        else:
-            raise NotImplementedError(f"audio encoding {encoding} is not supported")
-
-        self._opts = _TTSOptions(
-            voice=voice,
-            audio_config=texttospeech.AudioConfig(
-                audio_encoding=_audio_encoding,
-                sample_rate_hertz=sample_rate,
-                speaking_rate=speaking_rate,
-            ),
-        )
-
-    def _ensure_client(self) -> texttospeech.TextToSpeechAsyncClient:
-        if not self._client:
-            if self._credentials_info:
-                self._client = (
-                    texttospeech.TextToSpeechAsyncClient.from_service_account_info(
-                        self._credentials_info
-                    )
-                )
-
-            elif self._credentials_file:
-                self._client = (
-                    texttospeech.TextToSpeechAsyncClient.from_service_account_file(
-                        self._credentials_file
-                    )
-                )
-            else:
-                self._client = texttospeech.TextToSpeechAsyncClient()
-
-        assert self._client is not None
-        return self._client
-
-    def synthesize(self, text: str) -> "ChunkedStream":
-        return ChunkedStream(text, self._opts, self._ensure_client())
-
-
-class ChunkedStream(tts.ChunkedStream):
-    def __init__(
-        self, text: str, opts: _TTSOptions, client: texttospeech.TextToSpeechAsyncClient
-    ) -> None:
-        super().__init__()
-        self._text, self._opts, self._client = text, opts, client
-
-    @utils.log_exceptions(logger=logger)
-    async def _main_task(self) -> None:
-        request_id = utils.shortuuid()
-        segment_id = utils.shortuuid()
-        response: SynthesizeSpeechResponse = await self._client.synthesize_speech(
-            input=texttospeech.SynthesisInput(text=self._text),
-            voice=self._opts.voice,
-            audio_config=self._opts.audio_config,
-        )
-
-        data = response.audio_content
-        if self._opts.audio_config.audio_encoding == "mp3":
-            decoder = utils.codecs.Mp3StreamDecoder()
-            bstream = utils.audio.AudioByteStream(
-                sample_rate=self._opts.audio_config.sample_rate_hertz, num_channels=1
-            )
-            for frame in decoder.decode_chunk(data):
-                for frame in bstream.write(frame.data):
-                    self._event_ch.send_nowait(
-                        tts.SynthesizedAudio(
-                            request_id=request_id, segment_id=segment_id, frame=frame
-                        )
-                    )
-
-            for frame in bstream.flush():
-                self._event_ch.send_nowait(
-                    tts.SynthesizedAudio(
-                        request_id=request_id, segment_id=segment_id, frame=frame
-                    )
-                )
-        else:
-            data = data[44:]  # skip WAV header
-            self._event_ch.send_nowait(
-                tts.SynthesizedAudio(
-                    request_id=request_id,
-                    segment_id=segment_id,
-                    frame=rtc.AudioFrame(
-                        data=data,
-                        sample_rate=self._opts.audio_config.sample_rate_hertz,
-                        num_channels=1,
-                        samples_per_channel=len(data) // 2,  # 16-bit
-                    ),
-                )
-            )
```
The remaining files are unchanged apart from the version-directory rename: README.md, livekit/plugins/google/log.py, livekit/plugins/google/models.py, livekit/plugins/google/py.typed, the livekit_plugins_google.egg-info metadata (SOURCES.txt, dependency_links.txt, top_level.txt), pyproject.toml, and setup.cfg.