livekit-plugins-neuphonic 1.2.15__tar.gz → 1.2.16__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of livekit-plugins-neuphonic might be problematic. Click here for more details.

@@ -1,13 +1,13 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: livekit-plugins-neuphonic
3
- Version: 1.2.15
3
+ Version: 1.2.16
4
4
  Summary: Neuphonic inference plugin for LiveKit Agents
5
5
  Project-URL: Documentation, https://docs.livekit.io
6
6
  Project-URL: Website, https://livekit.io/
7
7
  Project-URL: Source, https://github.com/livekit/agents
8
8
  Author-email: LiveKit <hello@livekit.io>
9
9
  License-Expression: Apache-2.0
10
- Keywords: audio,livekit,neuphonic,realtime,webrtc
10
+ Keywords: ai,audio,livekit,neuphonic,realtime,video,voice
11
11
  Classifier: Intended Audience :: Developers
12
12
  Classifier: Programming Language :: Python :: 3
13
13
  Classifier: Programming Language :: Python :: 3 :: Only
@@ -16,7 +16,7 @@ Classifier: Programming Language :: Python :: 3.12
16
16
  Classifier: Topic :: Multimedia :: Sound/Audio
17
17
  Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
18
18
  Requires-Python: >=3.9.0
19
- Requires-Dist: livekit-agents>=1.2.15
19
+ Requires-Dist: livekit-agents>=1.2.16
20
20
  Description-Content-Type: text/markdown
21
21
 
22
22
  # Neuphonic plugin for LiveKit Agents
@@ -0,0 +1,419 @@
1
+ # Copyright 2023 LiveKit, Inc.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from __future__ import annotations # noqa: I001
16
+
17
+ import asyncio
18
+ import base64
19
+ import json
20
+ import os
21
+ import weakref
22
+ from dataclasses import dataclass, replace
23
+
24
+ import aiohttp
25
+ from livekit.agents import (
26
+ APIConnectionError,
27
+ APIConnectOptions,
28
+ APIError,
29
+ APIStatusError,
30
+ APITimeoutError,
31
+ tokenize,
32
+ tts,
33
+ utils,
34
+ )
35
+ from livekit.agents.types import DEFAULT_API_CONNECT_OPTIONS, NOT_GIVEN, NotGivenOr
36
+ from livekit.agents.utils import is_given
37
+
38
+ from .log import logger # noqa: I001
39
+ from .models import TTSLangCodes # noqa: I001
40
+
41
+ API_AUTH_HEADER = "x-api-key"
42
+
43
+
44
+ @dataclass
45
+ class _TTSOptions:
46
+ lang_code: TTSLangCodes | str
47
+ encoding: str
48
+ sample_rate: int
49
+ voice_id: str
50
+ speed: float | None
51
+ api_key: str
52
+ base_url: str
53
+ word_tokenizer: tokenize.WordTokenizer
54
+
55
+ def get_http_url(self, path: str) -> str:
56
+ return f"{self.base_url}{path}"
57
+
58
+ def get_ws_url(self, path: str) -> str:
59
+ return f"{self.base_url.replace('http', 'ws', 1)}{path}"
60
+
61
+
62
+ class TTS(tts.TTS):
63
+ def __init__(
64
+ self,
65
+ *,
66
+ api_key: str | None = None,
67
+ lang_code: TTSLangCodes | str = "en",
68
+ encoding: str = "pcm_linear",
69
+ voice_id: str = "8e9c4bc8-3979-48ab-8626-df53befc2090",
70
+ speed: float | None = 1.0,
71
+ sample_rate: int = 22050,
72
+ http_session: aiohttp.ClientSession | None = None,
73
+ word_tokenizer: NotGivenOr[tokenize.WordTokenizer] = NOT_GIVEN,
74
+ tokenizer: NotGivenOr[tokenize.SentenceTokenizer] = NOT_GIVEN,
75
+ base_url: str = "https://api.neuphonic.com",
76
+ ) -> None:
77
+ """
78
+ Create a new instance of NeuPhonic TTS.
79
+
80
+ See https://docs.neuphonic.com for more details on the NeuPhonic API.
81
+
82
+ Args:
83
+ lang_code (TTSLangCodes | str, optional): The language code for synthesis. Defaults to "en".
84
+ encoding (str, optional): The audio encoding format. Defaults to "pcm_linear".
85
+ voice_id (str, optional): The voice ID for the desired voice.
86
+ speed (float, optional): The audio playback speed. Defaults to 1.0.
87
+ sample_rate (int, optional): The audio sample rate in Hz. Defaults to 22050.
88
+ api_key (str, optional): The NeuPhonic API key. If not provided, it will be read from the NEUPHONIC_API_KEY environment variable.
89
+ http_session (aiohttp.ClientSession | None, optional): An existing aiohttp ClientSession to use. If not provided, a new session will be created.
90
+ word_tokenizer (tokenize.WordTokenizer, optional): The word tokenizer to use. Defaults to tokenize.basic.WordTokenizer().
91
+ tokenizer (tokenize.SentenceTokenizer, optional): The sentence tokenizer to use. Defaults to tokenize.blingfire.SentenceTokenizer().
92
+ base_url (str, optional): The base URL for the NeuPhonic API. Defaults to "https://api.neuphonic.com".
93
+ """ # noqa: E501
94
+
95
+ super().__init__(
96
+ capabilities=tts.TTSCapabilities(streaming=True),
97
+ sample_rate=sample_rate,
98
+ num_channels=1,
99
+ )
100
+ neuphonic_api_key = api_key or os.environ.get("NEUPHONIC_API_KEY")
101
+ if not neuphonic_api_key:
102
+ raise ValueError("NEUPHONIC_API_KEY must be set")
103
+
104
+ if not is_given(word_tokenizer):
105
+ word_tokenizer = tokenize.basic.WordTokenizer(ignore_punctuation=False)
106
+
107
+ self._opts = _TTSOptions(
108
+ lang_code=lang_code,
109
+ encoding=encoding,
110
+ sample_rate=sample_rate,
111
+ voice_id=voice_id,
112
+ speed=speed,
113
+ api_key=neuphonic_api_key,
114
+ base_url=base_url,
115
+ word_tokenizer=word_tokenizer,
116
+ )
117
+ self._session = http_session
118
+ self._pool = utils.ConnectionPool[aiohttp.ClientWebSocketResponse](
119
+ connect_cb=self._connect_ws,
120
+ close_cb=self._close_ws,
121
+ max_session_duration=300,
122
+ mark_refreshed_on_get=True,
123
+ )
124
+ self._streams = weakref.WeakSet[SynthesizeStream]()
125
+ self._sentence_tokenizer = (
126
+ tokenizer if is_given(tokenizer) else tokenize.blingfire.SentenceTokenizer()
127
+ )
128
+
129
+ async def _connect_ws(self, timeout: float) -> aiohttp.ClientWebSocketResponse:
130
+ session = self._ensure_session()
131
+ url = self._opts.get_ws_url(
132
+ f"/speak/en?api_key={self._opts.api_key}&speed={self._opts.speed}&lang_code={self._opts.lang_code}&sampling_rate={self._opts.sample_rate}&voice_id={self._opts.voice_id}"
133
+ )
134
+
135
+ headers = {API_AUTH_HEADER: self._opts.api_key}
136
+ return await asyncio.wait_for(session.ws_connect(url, headers=headers), timeout)
137
+
138
+ async def _close_ws(self, ws: aiohttp.ClientWebSocketResponse) -> None:
139
+ await ws.close()
140
+
141
+ @property
142
+ def model(self) -> str:
143
+ return "Octave"
144
+
145
+ @property
146
+ def provider(self) -> str:
147
+ return "Neuphonic"
148
+
149
+ def _ensure_session(self) -> aiohttp.ClientSession:
150
+ if not self._session:
151
+ self._session = utils.http_context.http_session()
152
+
153
+ return self._session
154
+
155
+ def prewarm(self) -> None:
156
+ self._pool.prewarm()
157
+
158
+ def update_options(
159
+ self,
160
+ *,
161
+ lang_code: NotGivenOr[TTSLangCodes | str] = NOT_GIVEN,
162
+ voice_id: NotGivenOr[str] = NOT_GIVEN,
163
+ speed: NotGivenOr[float | None] = NOT_GIVEN,
164
+ ) -> None:
165
+ """
166
+ Update the Text-to-Speech (TTS) configuration options.
167
+
168
+ This allows updating the TTS settings, including lang_code, voice_id, and speed.
169
+ If any parameter is not provided, the existing value will be retained.
170
+
171
+ Args:
172
+ lang_code (TTSLangCodes | str, optional): The language code for synthesis.
173
+ voice_id (str, optional): The voice ID for the desired voice.
174
+ speed (float, optional): The audio playback speed.
175
+ """
176
+ if is_given(lang_code):
177
+ self._opts.lang_code = lang_code
178
+ if is_given(voice_id):
179
+ self._opts.voice_id = voice_id
180
+ if is_given(speed):
181
+ self._opts.speed = speed
182
+
183
+ def synthesize(
184
+ self,
185
+ text: str,
186
+ *,
187
+ conn_options: APIConnectOptions = DEFAULT_API_CONNECT_OPTIONS,
188
+ ) -> ChunkedStream:
189
+ return ChunkedStream(tts=self, input_text=text, conn_options=conn_options)
190
+
191
+ def stream(
192
+ self, *, conn_options: APIConnectOptions = DEFAULT_API_CONNECT_OPTIONS
193
+ ) -> SynthesizeStream:
194
+ stream = SynthesizeStream(tts=self, conn_options=conn_options)
195
+ self._streams.add(stream)
196
+ return stream
197
+
198
+ async def aclose(self) -> None:
199
+ for stream in list(self._streams):
200
+ await stream.aclose()
201
+
202
+ self._streams.clear()
203
+ await self._pool.aclose()
204
+
205
+
206
+ class ChunkedStream(tts.ChunkedStream):
207
+ """Synthesize chunked text using the SSE endpoint"""
208
+
209
+ def __init__(
210
+ self,
211
+ *,
212
+ tts: TTS,
213
+ input_text: str,
214
+ conn_options: APIConnectOptions,
215
+ ) -> None:
216
+ super().__init__(tts=tts, input_text=input_text, conn_options=conn_options)
217
+ self._tts: TTS = tts
218
+ self._opts = replace(tts._opts)
219
+
220
+ async def _run(self, output_emitter: tts.AudioEmitter) -> None:
221
+ try:
222
+ async with self._tts._ensure_session().post(
223
+ f"{self._opts.base_url}/sse/speak/{self._opts.lang_code}",
224
+ headers={API_AUTH_HEADER: self._opts.api_key},
225
+ json={
226
+ "text": self._input_text,
227
+ "voice_id": self._opts.voice_id,
228
+ "lang_code": self._opts.lang_code,
229
+ "encoding": "pcm_linear",
230
+ "sampling_rate": self._opts.sample_rate,
231
+ "speed": self._opts.speed,
232
+ },
233
+ timeout=aiohttp.ClientTimeout(
234
+ total=30,
235
+ sock_connect=self._conn_options.timeout,
236
+ ),
237
+ # large read_bufsize to avoid `ValueError: Chunk too big`
238
+ read_bufsize=10 * 1024 * 1024,
239
+ ) as resp:
240
+ resp.raise_for_status()
241
+
242
+ output_emitter.initialize(
243
+ request_id=utils.shortuuid(),
244
+ sample_rate=self._opts.sample_rate,
245
+ num_channels=1,
246
+ mime_type="audio/pcm",
247
+ )
248
+
249
+ async for line in resp.content:
250
+ message = line.decode("utf-8")
251
+ if not message:
252
+ continue
253
+
254
+ parsed_message = _parse_sse_message(message)
255
+
256
+ if (
257
+ parsed_message is not None
258
+ and parsed_message.get("data", {}).get("audio") is not None
259
+ ):
260
+ audio_bytes = base64.b64decode(parsed_message["data"]["audio"])
261
+ output_emitter.push(audio_bytes)
262
+
263
+ output_emitter.flush()
264
+ except asyncio.TimeoutError:
265
+ raise APITimeoutError() from None
266
+ except aiohttp.ClientResponseError as e:
267
+ raise APIStatusError(
268
+ message=e.message, status_code=e.status, request_id=None, body=None
269
+ ) from None
270
+ except Exception as e:
271
+ raise APIConnectionError() from e
272
+
273
+
274
+ def _parse_sse_message(message: str) -> dict | None:
275
+ """
276
+ Parse each response from the SSE endpoint.
277
+
278
+ The message will either be a string reading:
279
+ - `event: error`
280
+ - `event: message`
281
+ - `data: { "status_code": 200, "data": {"audio": ... } }`
282
+ """
283
+ message = message.strip()
284
+
285
+ if not message or "data" not in message:
286
+ return None
287
+
288
+ _, value = message.split(": ", 1)
289
+ message_dict: dict = json.loads(value)
290
+
291
+ if message_dict.get("errors") is not None:
292
+ raise Exception(
293
+ f"received error status {message_dict['status_code']}:{message_dict['errors']}"
294
+ )
295
+
296
+ return message_dict
297
+
298
+
299
+ class SynthesizeStream(tts.SynthesizeStream):
300
+ def __init__(self, *, tts: TTS, conn_options: APIConnectOptions):
301
+ super().__init__(tts=tts, conn_options=conn_options)
302
+ self._tts: TTS = tts
303
+ self._opts = replace(tts._opts)
304
+ self._segments_ch = utils.aio.Chan[tokenize.WordStream]()
305
+
306
+ async def _run(self, output_emitter: tts.AudioEmitter) -> None:
307
+ request_id = utils.shortuuid()
308
+ output_emitter.initialize(
309
+ request_id=request_id,
310
+ sample_rate=self._opts.sample_rate,
311
+ num_channels=1,
312
+ mime_type="audio/pcm",
313
+ stream=True,
314
+ )
315
+
316
+ async def _tokenize_input() -> None:
317
+ word_stream = None
318
+ async for input in self._input_ch:
319
+ if isinstance(input, str):
320
+ if word_stream is None:
321
+ word_stream = self._opts.word_tokenizer.stream()
322
+ self._segments_ch.send_nowait(word_stream)
323
+ word_stream.push_text(input)
324
+ elif isinstance(input, self._FlushSentinel):
325
+ if word_stream:
326
+ word_stream.end_input()
327
+ word_stream = None
328
+
329
+ self._segments_ch.close()
330
+
331
+ async def _run_segments() -> None:
332
+ async for word_stream in self._segments_ch:
333
+ await self._run_ws(word_stream, output_emitter)
334
+
335
+ tasks = [
336
+ asyncio.create_task(_tokenize_input()),
337
+ asyncio.create_task(_run_segments()),
338
+ ]
339
+ try:
340
+ await asyncio.gather(*tasks)
341
+ except asyncio.TimeoutError:
342
+ raise APITimeoutError() from None
343
+ except aiohttp.ClientResponseError as e:
344
+ raise APIStatusError(
345
+ message=e.message,
346
+ status_code=e.status,
347
+ request_id=request_id,
348
+ body=None,
349
+ ) from None
350
+ except Exception as e:
351
+ raise APIConnectionError() from e
352
+ finally:
353
+ await utils.aio.gracefully_cancel(*tasks)
354
+
355
+ async def _run_ws(
356
+ self, word_stream: tokenize.WordStream, output_emitter: tts.AudioEmitter
357
+ ) -> None:
358
+ segment_id = utils.shortuuid()
359
+ output_emitter.start_segment(segment_id=segment_id)
360
+
361
+ async def send_task(ws: aiohttp.ClientWebSocketResponse) -> None:
362
+ async for word in word_stream:
363
+ text_msg = {"text": f"{word.token} "}
364
+ self._mark_started()
365
+ await ws.send_str(json.dumps(text_msg))
366
+
367
+ stop_msg = {"text": "<STOP>"}
368
+ await ws.send_str(json.dumps(stop_msg))
369
+
370
+ async def recv_task(ws: aiohttp.ClientWebSocketResponse) -> None:
371
+ while True:
372
+ msg = await ws.receive()
373
+
374
+ if msg.type in (
375
+ aiohttp.WSMsgType.CLOSE,
376
+ aiohttp.WSMsgType.CLOSED,
377
+ aiohttp.WSMsgType.CLOSING,
378
+ ):
379
+ raise APIStatusError("NeuPhonic websocket connection closed unexpectedly")
380
+
381
+ if msg.type == aiohttp.WSMsgType.TEXT:
382
+ try:
383
+ resp = json.loads(msg.data)
384
+ except json.JSONDecodeError:
385
+ logger.warning("Invalid JSON from NeuPhonic")
386
+ continue
387
+
388
+ if resp.get("type") == "error":
389
+ raise APIError(f"NeuPhonic returned error: {resp}")
390
+
391
+ data = resp.get("data", {})
392
+ audio_data = data.get("audio")
393
+ if audio_data and audio_data != "":
394
+ try:
395
+ b64data = base64.b64decode(audio_data)
396
+ if b64data:
397
+ output_emitter.push(b64data)
398
+ except Exception as e:
399
+ logger.warning("Failed to decode NeuPhonic audio data: %s", e)
400
+
401
+ if data.get("stop"):
402
+ output_emitter.end_segment()
403
+ break
404
+
405
+ elif msg.type == aiohttp.WSMsgType.BINARY:
406
+ pass
407
+ else:
408
+ logger.warning("Unexpected NeuPhonic message type: %s", msg.type)
409
+
410
+ async with self._tts._pool.connection(timeout=self._conn_options.timeout) as ws:
411
+ tasks = [
412
+ asyncio.create_task(send_task(ws)),
413
+ asyncio.create_task(recv_task(ws)),
414
+ ]
415
+
416
+ try:
417
+ await asyncio.gather(*tasks)
418
+ finally:
419
+ await utils.aio.gracefully_cancel(*tasks)
@@ -12,4 +12,4 @@
12
12
  # See the License for the specific language governing permissions and
13
13
  # limitations under the License.
14
14
 
15
- __version__ = "1.2.15"
15
+ __version__ = "1.2.16"
@@ -10,7 +10,7 @@ readme = "README.md"
10
10
  license = "Apache-2.0"
11
11
  requires-python = ">=3.9.0"
12
12
  authors = [{ name = "LiveKit", email = "hello@livekit.io" }]
13
- keywords = ["webrtc", "realtime", "audio", "livekit", "neuphonic"]
13
+ keywords = ["voice", "ai", "realtime", "audio", "video", "livekit", "neuphonic"]
14
14
  classifiers = [
15
15
  "Intended Audience :: Developers",
16
16
  "Topic :: Multimedia :: Sound/Audio",
@@ -21,7 +21,7 @@ classifiers = [
21
21
  "Programming Language :: Python :: 3 :: Only",
22
22
  ]
23
23
  dependencies = [
24
- "livekit-agents>=1.2.15",
24
+ "livekit-agents>=1.2.16",
25
25
  ]
26
26
 
27
27
  [project.urls]
@@ -1,241 +0,0 @@
1
- # Copyright 2023 LiveKit, Inc.
2
- #
3
- # Licensed under the Apache License, Version 2.0 (the "License");
4
- # you may not use this file except in compliance with the License.
5
- # You may obtain a copy of the License at
6
- #
7
- # http://www.apache.org/licenses/LICENSE-2.0
8
- #
9
- # Unless required by applicable law or agreed to in writing, software
10
- # distributed under the License is distributed on an "AS IS" BASIS,
11
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
- # See the License for the specific language governing permissions and
13
- # limitations under the License.
14
-
15
- from __future__ import annotations
16
-
17
- import asyncio
18
- import base64
19
- import json
20
- import os
21
- from dataclasses import dataclass, replace
22
-
23
- import aiohttp
24
-
25
- from livekit.agents import (
26
- APIConnectionError,
27
- APIConnectOptions,
28
- APIStatusError,
29
- APITimeoutError,
30
- tts,
31
- utils,
32
- )
33
- from livekit.agents.types import DEFAULT_API_CONNECT_OPTIONS, NOT_GIVEN, NotGivenOr
34
- from livekit.agents.utils import is_given
35
-
36
- from .models import TTSLangCodes
37
-
38
- API_BASE_URL = "api.neuphonic.com"
39
- AUTHORIZATION_HEADER = "X-API-KEY"
40
-
41
-
42
- @dataclass
43
- class _TTSOptions:
44
- base_url: str
45
- lang_code: TTSLangCodes | str
46
- api_key: str
47
- sample_rate: int
48
- speed: float
49
- voice_id: str | None
50
-
51
-
52
- class TTS(tts.TTS):
53
- def __init__(
54
- self,
55
- *,
56
- voice_id: str = "8e9c4bc8-3979-48ab-8626-df53befc2090",
57
- api_key: str | None = None,
58
- lang_code: TTSLangCodes | str = "en",
59
- speed: float = 1.0,
60
- sample_rate: int = 22050,
61
- http_session: aiohttp.ClientSession | None = None,
62
- base_url: str = API_BASE_URL,
63
- ) -> None:
64
- """
65
- Create a new instance of the Neuphonic TTS.
66
-
67
- See https://docs.neuphonic.com for more documentation on all of these options, or go to https://app.neuphonic.com/ to test out different options.
68
-
69
- Args:
70
- voice_id (str, optional): The voice ID for the desired voice. Defaults to None.
71
- lang_code (TTSLanguages | str, optional): The language code for synthesis. Defaults to "en".
72
- encoding (TTSEncodings | str, optional): The audio encoding format. Defaults to "pcm_mulaw".
73
- speed (float, optional): The audio playback speed. Defaults to 1.0.
74
- sample_rate (int, optional): The audio sample rate in Hz. Defaults to 22050.
75
- api_key (str | None, optional): The Neuphonic API key. If not provided, it will be read from the NEUPHONIC_API_KEY environment variable.
76
- http_session (aiohttp.ClientSession | None, optional): An existing aiohttp ClientSession to use. If not provided, a new session will be created.
77
- base_url (str, optional): The base URL for the Neuphonic API. Defaults to "api.neuphonic.com".
78
- """ # noqa: E501
79
- super().__init__(
80
- capabilities=tts.TTSCapabilities(streaming=False),
81
- sample_rate=sample_rate,
82
- num_channels=1,
83
- )
84
-
85
- api_key = api_key or os.environ.get("NEUPHONIC_API_KEY")
86
- if not api_key:
87
- raise ValueError("API key must be provided or set in NEUPHONIC_API_KEY")
88
-
89
- self._opts = _TTSOptions(
90
- voice_id=voice_id,
91
- lang_code=lang_code,
92
- api_key=api_key,
93
- speed=speed,
94
- sample_rate=sample_rate,
95
- base_url=base_url,
96
- )
97
- self._session = http_session
98
-
99
- @property
100
- def model(self) -> str:
101
- return "Octave"
102
-
103
- @property
104
- def provider(self) -> str:
105
- return "Neuphonic"
106
-
107
- def _ensure_session(self) -> aiohttp.ClientSession:
108
- if not self._session:
109
- self._session = utils.http_context.http_session()
110
-
111
- return self._session
112
-
113
- def update_options(
114
- self,
115
- *,
116
- voice_id: NotGivenOr[str] = NOT_GIVEN,
117
- lang_code: NotGivenOr[TTSLangCodes] = NOT_GIVEN,
118
- speed: NotGivenOr[float] = NOT_GIVEN,
119
- sample_rate: NotGivenOr[int] = NOT_GIVEN,
120
- ) -> None:
121
- """
122
- Update the Text-to-Speech (TTS) configuration options.
123
-
124
- This method allows updating the TTS settings, including model type, voice_id, lang_code,
125
- encoding, speed and sample_rate. If any parameter is not provided, the existing value will be
126
- retained.
127
-
128
- Args:
129
- model (TTSModels | str, optional): The Neuphonic model to use.
130
- voice_id (str, optional): The voice ID for the desired voice.
131
- lang_code (TTSLanguages | str, optional): The language code for synthesis..
132
- encoding (TTSEncodings | str, optional): The audio encoding format.
133
- speed (float, optional): The audio playback speed.
134
- sample_rate (int, optional): The audio sample rate in Hz.
135
- """ # noqa: E501
136
- if is_given(voice_id):
137
- self._opts.voice_id = voice_id
138
- if is_given(lang_code):
139
- self._opts.lang_code = lang_code
140
- if is_given(speed):
141
- self._opts.speed = speed
142
- if is_given(sample_rate):
143
- self._opts.sample_rate = sample_rate
144
-
145
- def synthesize(
146
- self, text: str, *, conn_options: APIConnectOptions = DEFAULT_API_CONNECT_OPTIONS
147
- ) -> ChunkedStream:
148
- return ChunkedStream(tts=self, input_text=text, conn_options=conn_options)
149
-
150
-
151
- class ChunkedStream(tts.ChunkedStream):
152
- """Synthesize chunked text using the SSE endpoint"""
153
-
154
- def __init__(
155
- self,
156
- *,
157
- tts: TTS,
158
- input_text: str,
159
- conn_options: APIConnectOptions,
160
- ) -> None:
161
- super().__init__(tts=tts, input_text=input_text, conn_options=conn_options)
162
- self._tts: TTS = tts
163
- self._opts = replace(tts._opts)
164
-
165
- async def _run(self, output_emitter: tts.AudioEmitter) -> None:
166
- try:
167
- async with self._tts._ensure_session().post(
168
- f"https://{self._opts.base_url}/sse/speak/{self._opts.lang_code}",
169
- headers={AUTHORIZATION_HEADER: self._opts.api_key},
170
- json={
171
- "text": self._input_text,
172
- "voice_id": self._opts.voice_id,
173
- "lang_code": self._opts.lang_code,
174
- "encoding": "pcm_linear",
175
- "sampling_rate": self._opts.sample_rate,
176
- "speed": self._opts.speed,
177
- },
178
- timeout=aiohttp.ClientTimeout(
179
- total=30,
180
- sock_connect=self._conn_options.timeout,
181
- ),
182
- # large read_bufsize to avoid `ValueError: Chunk too big`
183
- read_bufsize=10 * 1024 * 1024,
184
- ) as resp:
185
- resp.raise_for_status()
186
-
187
- output_emitter.initialize(
188
- request_id=utils.shortuuid(),
189
- sample_rate=self._opts.sample_rate,
190
- num_channels=1,
191
- mime_type="audio/pcm",
192
- )
193
-
194
- async for line in resp.content:
195
- message = line.decode("utf-8")
196
- if not message:
197
- continue
198
-
199
- parsed_message = _parse_sse_message(message)
200
-
201
- if (
202
- parsed_message is not None
203
- and parsed_message.get("data", {}).get("audio") is not None
204
- ):
205
- audio_bytes = base64.b64decode(parsed_message["data"]["audio"])
206
- output_emitter.push(audio_bytes)
207
-
208
- output_emitter.flush()
209
- except asyncio.TimeoutError:
210
- raise APITimeoutError() from None
211
- except aiohttp.ClientResponseError as e:
212
- raise APIStatusError(
213
- message=e.message, status_code=e.status, request_id=None, body=None
214
- ) from None
215
- except Exception as e:
216
- raise APIConnectionError() from e
217
-
218
-
219
- def _parse_sse_message(message: str) -> dict | None:
220
- """
221
- Parse each response from the SSE endpoint.
222
-
223
- The message will either be a string reading:
224
- - `event: error`
225
- - `event: message`
226
- - `data: { "status_code": 200, "data": {"audio": ... } }`
227
- """
228
- message = message.strip()
229
-
230
- if not message or "data" not in message:
231
- return None
232
-
233
- _, value = message.split(": ", 1)
234
- message_dict: dict = json.loads(value)
235
-
236
- if message_dict.get("errors") is not None:
237
- raise Exception(
238
- f"received error status {message_dict['status_code']}: {message_dict['errors']}"
239
- )
240
-
241
- return message_dict