livekit-plugins-neuphonic 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,39 @@
1
+ # Copyright 2023 LiveKit, Inc.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from .tts import TTS, ChunkedStream
16
+ from .version import __version__
17
+
18
+ __all__ = ["TTS", "ChunkedStream", "__version__"]
19
+
20
+ from livekit.agents import Plugin
21
+
22
+ from .log import logger
23
+
24
+
25
class NeuphonicPlugin(Plugin):
    """Plugin descriptor for the Neuphonic integration."""

    def __init__(self) -> None:
        # Pass the module name, package version, package name and the shared
        # logger on to the Plugin base class.
        super().__init__(
            __name__,
            __version__,
            __package__,
            logger,
        )
28
+
29
+
30
# Register a plugin instance as a side effect of importing this package.
Plugin.register_plugin(NeuphonicPlugin())

# Cleanup docs of unexported modules: every module-level name that is not
# listed in __all__ is mapped to False in __pdoc__.
_module = dir()
NOT_IN_ALL = [name for name in _module if name not in __all__]

__pdoc__ = {name: False for name in NOT_IN_ALL}
@@ -0,0 +1,3 @@
1
+ import logging
2
+
3
+ logger = logging.getLogger("livekit.plugins.neuphonic")
@@ -0,0 +1,10 @@
1
+ from typing import Literal
2
+
3
# Audio encodings that can be requested from the TTS endpoints.
TTSEncodings = Literal[
    "pcm_linear",
    "pcm_mulaw",
]

# Model identifiers.
# The underscore spellings match the "neu_hq" default that TTS.__init__ uses
# and documents; the hyphenated spellings are kept so existing annotations
# stay valid.
# NOTE(review): confirm against the Neuphonic API which spelling the service
# actually accepts and drop the other one.
TTSModels = Literal["neu_fast", "neu_hq", "neu-fast", "neu-hq"]

# Language codes; the selected code is also used as a URL path segment by
# the TTS endpoints.
TTSLangCodes = Literal["en", "nl", "es", "de", "hi", "en-hi", "ar"]
File without changes
@@ -0,0 +1,435 @@
1
+ # Copyright 2023 LiveKit, Inc.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from __future__ import annotations
16
+
17
+ import asyncio
18
+ import base64
19
+ import json
20
+ import os
21
+ import weakref
22
+ from dataclasses import dataclass
23
+ from typing import Optional
24
+
25
+ import aiohttp
26
+ from livekit.agents import (
27
+ APIConnectionError,
28
+ APIConnectOptions,
29
+ APIStatusError,
30
+ APITimeoutError,
31
+ tts,
32
+ utils,
33
+ )
34
+
35
+ from .log import logger
36
+ from .models import TTSEncodings, TTSLangCodes, TTSModels
37
+
38
+ API_BASE_URL = "api.neuphonic.com"
39
+ AUTHORIZATION_HEADER = "X-API-KEY"
40
+ NUM_CHANNELS = 1
41
+
42
+
43
+ @dataclass
44
+ class _TTSOptions:
45
+ base_url: str
46
+ api_key: str
47
+ model: TTSModels | str
48
+ lang_code: TTSLangCodes | str
49
+ encoding: TTSEncodings | str
50
+ sampling_rate: int
51
+ speed: float
52
+ voice_id: str | None = None
53
+
54
+ @property
55
+ def model_params(self) -> dict:
56
+ """Returns a dict of all model parameters and their values."""
57
+ params = [
58
+ "voice_id",
59
+ "model",
60
+ "lang_code",
61
+ "encoding",
62
+ "sampling_rate",
63
+ "speed",
64
+ ]
65
+ values = {}
66
+
67
+ for param in params:
68
+ if hasattr(self, param) and getattr(self, param) is not None:
69
+ values[param] = getattr(self, param)
70
+
71
+ return values
72
+
73
+ def get_query_param_string(self):
74
+ """Forms the query parameter string from all model parameters."""
75
+ queries = []
76
+ for key, value in self.model_params.items():
77
+ queries.append(f"{key}={value}")
78
+
79
+ return "?" + "&".join(queries)
80
+
81
+
82
+ def _parse_sse_message(message: str) -> dict:
83
+ """
84
+ Parse each response from the SSE endpoint.
85
+
86
+ The message will either be a string reading:
87
+ - `event: error`
88
+ - `event: message`
89
+ - `data: { "status_code": 200, "data": {"audio": ... } }`
90
+ """
91
+ message = message.strip()
92
+
93
+ if not message or "data" not in message:
94
+ return None
95
+
96
+ _, value = message.split(": ", 1)
97
+ message = json.loads(value)
98
+
99
+ if message.get("errors") is not None:
100
+ raise Exception(
101
+ f"Status {message.status_code} error received: {message.errors}."
102
+ )
103
+
104
+ return message
105
+
106
+
107
class TTS(tts.TTS):
    """Neuphonic text-to-speech engine.

    ``synthesize()`` returns a ``ChunkedStream`` that posts to the SSE
    endpoint, while ``stream()`` returns a ``SynthesizeStream`` that runs on
    a websocket taken from a connection pool.
    """

    def __init__(
        self,
        *,
        model: TTSModels | str = "neu_hq",
        voice_id: str | None = None,
        lang_code: TTSLangCodes | str = "en",
        encoding: TTSEncodings | str = "pcm_linear",
        speed: float = 1.0,
        sample_rate: int = 22050,
        api_key: str | None = None,
        http_session: aiohttp.ClientSession | None = None,
        base_url: str = API_BASE_URL,
    ) -> None:
        """
        Create a new instance of the Neuphonic TTS.

        See https://docs.neuphonic.com for more documentation on all of these options, or go to https://app.neuphonic.com/ to test out different options.

        Args:
            model (TTSModels | str, optional): The Neuphonic model to use. Defaults to "neu_hq".
            voice_id (str, optional): The voice ID for the desired voice. Defaults to None.
            lang_code (TTSLangCodes | str, optional): The language code for synthesis. Defaults to "en".
            encoding (TTSEncodings | str, optional): The audio encoding format. Defaults to "pcm_linear".
            speed (float, optional): The audio playback speed. Defaults to 1.0.
            sample_rate (int, optional): The audio sample rate in Hz. Defaults to 22050.
            api_key (str | None, optional): The Neuphonic API key. If not provided, it will be read from the NEUPHONIC_API_TOKEN environment variable.
            http_session (aiohttp.ClientSession | None, optional): An existing aiohttp ClientSession to use. If not provided, one is obtained from utils.http_context.http_session() on first use.
            base_url (str, optional): The base URL for the Neuphonic API. Defaults to "api.neuphonic.com".

        Raises:
            ValueError: If no API key is passed and NEUPHONIC_API_TOKEN is not set.
        """
        super().__init__(
            capabilities=tts.TTSCapabilities(streaming=True),
            sample_rate=sample_rate,
            num_channels=NUM_CHANNELS,
        )

        api_key = api_key or os.environ.get("NEUPHONIC_API_TOKEN")

        if not api_key:
            raise ValueError("NEUPHONIC_API_TOKEN must be set")

        self._opts = _TTSOptions(
            model=model,
            voice_id=voice_id,
            lang_code=lang_code,
            encoding=encoding,
            speed=speed,
            sampling_rate=sample_rate,
            api_key=api_key,
            base_url=base_url,
        )

        self._session = http_session
        self._pool = utils.ConnectionPool[aiohttp.ClientWebSocketResponse](
            connect_cb=self._connect_ws,
            close_cb=self._close_ws,
            max_session_duration=90,
            mark_refreshed_on_get=True,
        )
        # Weak references: a stream that is no longer referenced elsewhere
        # drops out of the set on its own.
        self._streams = weakref.WeakSet[SynthesizeStream]()

    async def _connect_ws(self) -> aiohttp.ClientWebSocketResponse:
        """Open a websocket to the speak endpoint using the current options."""
        session = self._ensure_session()
        # The model parameters travel in the query string of the URL.
        url = f"wss://{self._opts.base_url}/speak/{self._opts.lang_code}{self._opts.get_query_param_string()}"

        return await asyncio.wait_for(
            session.ws_connect(url, headers={AUTHORIZATION_HEADER: self._opts.api_key}),
            self._conn_options.timeout,
        )

    async def _close_ws(self, ws: aiohttp.ClientWebSocketResponse):
        """Close a websocket handed back by the connection pool."""
        await ws.close()

    def _ensure_session(self) -> aiohttp.ClientSession:
        """Return the HTTP session, fetching one lazily if none was supplied."""
        if not self._session:
            self._session = utils.http_context.http_session()

        return self._session

    def prewarm(self) -> None:
        """Ask the connection pool to prewarm."""
        self._pool.prewarm()

    def update_options(
        self,
        *,
        model: TTSModels | str | None = None,
        voice_id: str | None = None,
        lang_code: TTSLangCodes | str | None = None,
        encoding: TTSEncodings | str | None = None,
        speed: float | None = None,
        sample_rate: int | None = None,
    ) -> None:
        """
        Update the Text-to-Speech (TTS) configuration options.

        This method allows updating the TTS settings, including model type, voice_id, lang_code,
        encoding, speed and sample_rate. A parameter that is None (the default) leaves the
        existing value untouched; any other value, including a falsy one, replaces it.

        Args:
            model (TTSModels | str, optional): The Neuphonic model to use.
            voice_id (str, optional): The voice ID for the desired voice.
            lang_code (TTSLangCodes | str, optional): The language code for synthesis.
            encoding (TTSEncodings | str, optional): The audio encoding format.
            speed (float, optional): The audio playback speed.
            sample_rate (int, optional): The audio sample rate in Hz.
        """
        if model is not None:
            self._opts.model = model
        if voice_id is not None:
            self._opts.voice_id = voice_id
        if lang_code is not None:
            self._opts.lang_code = lang_code
        if encoding is not None:
            self._opts.encoding = encoding
        if speed is not None:
            self._opts.speed = speed
        if sample_rate is not None:
            self._opts.sampling_rate = sample_rate
        # Pooled websockets were opened with the old options baked into their
        # URL, so they must not be reused.
        self._pool.invalidate()

    def synthesize(
        self,
        text: str,
        *,
        conn_options: Optional[APIConnectOptions] = None,
    ) -> ChunkedStream:
        """Return a ChunkedStream that synthesizes ``text`` in one request."""
        return ChunkedStream(
            tts=self,
            input_text=text,
            conn_options=conn_options,
            opts=self._opts,
            session=self._ensure_session(),
        )

    def stream(
        self, *, conn_options: Optional[APIConnectOptions] = None
    ) -> SynthesizeStream:
        """Return a new SynthesizeStream and track it so aclose() can close it."""
        # NOTE(review): conn_options is accepted but not forwarded, because
        # SynthesizeStream's constructor takes no such argument.
        stream = SynthesizeStream(
            tts=self,
            pool=self._pool,
            opts=self._opts,
        )
        self._streams.add(stream)
        return stream

    async def aclose(self) -> None:
        """Close every tracked stream, then the connection pool, then the base."""
        # Iterate over a snapshot because the weak set can change while awaiting.
        for stream in list(self._streams):
            await stream.aclose()
        self._streams.clear()
        await self._pool.aclose()
        await super().aclose()
253
+
254
+
255
class ChunkedStream(tts.ChunkedStream):
    """Synthesize chunked text using the SSE endpoint"""

    def __init__(
        self,
        *,
        tts: TTS,
        input_text: str,
        opts: _TTSOptions,
        session: aiohttp.ClientSession,
        conn_options: Optional[APIConnectOptions] = None,
    ) -> None:
        super().__init__(tts=tts, input_text=input_text, conn_options=conn_options)
        self._opts = opts
        self._session = session

    async def _run(self) -> None:
        """Post the input text to the SSE endpoint and emit the returned audio.

        Raises:
            APITimeoutError: On asyncio.TimeoutError.
            APIStatusError: On aiohttp.ClientResponseError.
            APIConnectionError: On any other exception.
        """
        request_id = utils.shortuuid()
        bstream = utils.audio.AudioByteStream(
            sample_rate=self._opts.sampling_rate, num_channels=NUM_CHANNELS
        )

        # Request body: the text plus every model parameter that is set.
        body = {"text": self._input_text}
        body.update(self._opts.model_params)

        auth_headers = {AUTHORIZATION_HEADER: self._opts.api_key}

        try:
            endpoint = f"https://{self._opts.base_url}/sse/speak/{self._opts.lang_code}"
            request_timeout = aiohttp.ClientTimeout(
                total=30,
                sock_connect=self._conn_options.timeout,
            )
            # large read_bufsize to avoid `ValueError: Chunk too big`
            buffer_size = 10 * 1024 * 1024

            async with self._session.post(
                endpoint,
                headers=auth_headers,
                json=body,
                timeout=request_timeout,
                read_bufsize=buffer_size,
            ) as response:
                response.raise_for_status()
                emitter = tts.SynthesizedAudioEmitter(
                    event_ch=self._event_ch,
                    request_id=request_id,
                )

                async for raw_line in response.content:
                    text_line = raw_line.decode("utf-8").strip()
                    if not text_line:
                        continue

                    event = _parse_sse_message(text_line)
                    if event is None:
                        continue

                    encoded_audio = event.get("data", {}).get("audio")
                    if encoded_audio is None:
                        continue

                    for frame in bstream.write(base64.b64decode(encoded_audio)):
                        emitter.push(frame)

                # Emit whatever is still buffered once the response ends.
                for frame in bstream.flush():
                    emitter.push(frame)
                emitter.flush()
        except asyncio.TimeoutError as e:
            raise APITimeoutError() from e
        except aiohttp.ClientResponseError as e:
            raise APIStatusError(
                message=e.message,
                status_code=e.status,
                request_id=None,
                body=None,
            ) from e
        except Exception as e:
            raise APIConnectionError() from e
334
+
335
+
336
class SynthesizeStream(tts.SynthesizeStream):
    """Synthesize streamed text over a websocket taken from the pool."""

    def __init__(
        self,
        *,
        tts: TTS,
        opts: _TTSOptions,
        pool: utils.ConnectionPool[aiohttp.ClientWebSocketResponse],
    ):
        super().__init__(tts=tts)
        self._opts = opts
        self._pool = pool

    async def _run(self) -> None:
        """Send the input text over a websocket and emit the audio that comes back."""
        request_id = utils.shortuuid()
        # Text sent to the server and text the server reported back so far.
        transcript = {"sent": "", "recv": ""}

        def _normalized(text: str) -> str:
            """Lower-case the text and drop spaces, newlines and stop markers."""
            return (
                text.lower()
                .replace(" ", "")
                .replace("\n", "")
                .replace("<stop>", "")
            )

        def _is_all_audio_recv():
            """Check whether all audio has been received."""
            # NOTE(review): this compares against the text sent *so far*, so it
            # can also hold between two input chunks - confirm that is intended.
            return _normalized(transcript["sent"]) == _normalized(transcript["recv"])

        async def _send_task(ws: aiohttp.ClientWebSocketResponse):
            """Stream text to the websocket."""
            async for data in self._input_ch:
                self._mark_started()

                if isinstance(data, self._FlushSentinel):
                    # A flush is forwarded as an explicit stop marker.
                    await ws.send_str(json.dumps({"text": "<STOP>"}))
                else:
                    transcript["sent"] += data
                    await ws.send_str(json.dumps({"text": data}))

        async def _recv_task(ws: aiohttp.ClientWebSocketResponse):
            """Read websocket messages and push the decoded audio frames."""
            audio_bstream = utils.audio.AudioByteStream(
                sample_rate=self._opts.sampling_rate,
                num_channels=NUM_CHANNELS,
            )
            emitter = tts.SynthesizedAudioEmitter(
                event_ch=self._event_ch,
                request_id=request_id,
            )
            closed_types = (
                aiohttp.WSMsgType.CLOSED,
                aiohttp.WSMsgType.CLOSE,
                aiohttp.WSMsgType.CLOSING,
            )

            all_received = False
            while not all_received:
                msg = await ws.receive()
                if msg.type in closed_types:
                    raise APIStatusError(
                        "Neuphonic connection closed unexpectedly",
                        request_id=request_id,
                    )

                if msg.type != aiohttp.WSMsgType.TEXT:
                    logger.warning("Unexpected Neuphonic message type %s", msg.type)
                    continue

                data = json.loads(msg.data)

                if data.get("data"):
                    audio_bytes = base64.b64decode(data["data"]["audio"])
                    recv_text = data["data"]["text"]
                    for frame in audio_bstream.write(audio_bytes):
                        emitter.push(frame)

                    transcript["recv"] += recv_text
                else:
                    logger.error("Unexpected Neuphonic message %s", data)

                all_received = _is_all_audio_recv()

            # we are not going to receive any more audio
            for frame in audio_bstream.flush():
                emitter.push(frame)
            emitter.flush()

        async with self._pool.connection() as ws:
            tasks = [
                asyncio.create_task(_send_task(ws)),
                asyncio.create_task(_recv_task(ws)),
            ]

            try:
                await asyncio.gather(*tasks)
            finally:
                await utils.aio.gracefully_cancel(*tasks)
@@ -0,0 +1,15 @@
1
+ # Copyright 2023 LiveKit, Inc.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ __version__ = "0.1.0"
@@ -0,0 +1,46 @@
1
+ Metadata-Version: 2.2
2
+ Name: livekit-plugins-neuphonic
3
+ Version: 0.1.0
4
+ Summary: LiveKit Agents Plugin for Neuphonic
5
+ Home-page: https://github.com/livekit/agents
6
+ License: Apache-2.0
7
+ Project-URL: Documentation, https://docs.livekit.io
8
+ Project-URL: Website, https://livekit.io/
9
+ Project-URL: Source, https://github.com/livekit/agents
10
+ Keywords: webrtc,realtime,audio,video,livekit
11
+ Classifier: Intended Audience :: Developers
12
+ Classifier: License :: OSI Approved :: Apache Software License
13
+ Classifier: Topic :: Multimedia :: Sound/Audio
14
+ Classifier: Topic :: Multimedia :: Video
15
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
16
+ Classifier: Programming Language :: Python :: 3
17
+ Classifier: Programming Language :: Python :: 3.9
18
+ Classifier: Programming Language :: Python :: 3.10
19
+ Classifier: Programming Language :: Python :: 3 :: Only
20
+ Requires-Python: >=3.9.0
21
+ Description-Content-Type: text/markdown
22
+ Requires-Dist: livekit-agents<1.0.0,>=0.12.16
23
+ Dynamic: classifier
24
+ Dynamic: description
25
+ Dynamic: description-content-type
26
+ Dynamic: home-page
27
+ Dynamic: keywords
28
+ Dynamic: license
29
+ Dynamic: project-url
30
+ Dynamic: requires-dist
31
+ Dynamic: requires-python
32
+ Dynamic: summary
33
+
34
+ # LiveKit Plugins Neuphonic
35
+
36
+ Agent Framework plugin for voice synthesis with the [Neuphonic](https://neuphonic.com) API.
37
+
38
+ ## Installation
39
+
40
+ ```bash
41
+ pip install livekit-plugins-neuphonic
42
+ ```
43
+
44
+ ## Pre-requisites
45
+
46
+ You'll need an API key from Neuphonic. It can be set as an environment variable: `NEUPHONIC_API_TOKEN`
@@ -0,0 +1,10 @@
1
+ livekit/plugins/neuphonic/__init__.py,sha256=mJnPVLsKAdUkdWuHWd16A0n2vsVBi3GjgNmB8gv9jjI,1097
2
+ livekit/plugins/neuphonic/log.py,sha256=rAHz71IcbvPkixndXBVffPQsmWUKTLqRaYRuPIxO29w,72
3
+ livekit/plugins/neuphonic/models.py,sha256=Svfn_sWA3Q2ZXsPBXY-K5hslq5FE62hvyXBES2C3aSc,201
4
+ livekit/plugins/neuphonic/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
5
+ livekit/plugins/neuphonic/tts.py,sha256=FFLDghnve_Mrx-0qfEYb4o_Bs8VNnZqAsQNI-M6Zxkw,14736
6
+ livekit/plugins/neuphonic/version.py,sha256=vQH9cItKAVYAmrLbOntkbLqmxrUZrPiKb1TjkZ8jRKQ,600
7
+ livekit_plugins_neuphonic-0.1.0.dist-info/METADATA,sha256=Q7Skn-28cnC318qr28oepZyaWnQ9etIO-H2c7D-M9jo,1480
8
+ livekit_plugins_neuphonic-0.1.0.dist-info/WHEEL,sha256=52BFRY2Up02UkjOa29eZOS2VxUrpPORXg1pkohGGUS8,91
9
+ livekit_plugins_neuphonic-0.1.0.dist-info/top_level.txt,sha256=OoDok3xUmXbZRvOrfvvXB-Juu4DX79dlq188E19YHoo,8
10
+ livekit_plugins_neuphonic-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (76.0.0)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+