livekit-plugins-resemble 1.0.14__tar.gz → 1.3.1__tar.gz

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
@@ -146,6 +146,9 @@ venv.bak/
146
146
  .dmypy.json
147
147
  dmypy.json
148
148
 
149
+ # trunk
150
+ .trunk/
151
+
149
152
  # Pyre type checker
150
153
  .pyre/
151
154
 
@@ -166,3 +169,11 @@ node_modules
166
169
 
167
170
  credentials.json
168
171
  pyrightconfig.json
172
+ docs/
173
+
174
+ # Database files
175
+ *.db
176
+
177
+
178
+ # Examples for development
179
+ examples/dev/*
@@ -1,13 +1,13 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: livekit-plugins-resemble
3
- Version: 1.0.14
3
+ Version: 1.3.1
4
4
  Summary: LiveKit Agents Plugin for Resemble AI
5
5
  Project-URL: Documentation, https://docs.livekit.io
6
6
  Project-URL: Website, https://livekit.io/
7
7
  Project-URL: Source, https://github.com/livekit/agents
8
8
  Author-email: LiveKit <hello@livekit.io>
9
9
  License-Expression: Apache-2.0
10
- Keywords: audio,livekit,realtime,video,webrtc
10
+ Keywords: ai,audio,livekit,realtime,video,voice,webrtc
11
11
  Classifier: Intended Audience :: Developers
12
12
  Classifier: License :: OSI Approved :: Apache Software License
13
13
  Classifier: Programming Language :: Python :: 3
@@ -18,12 +18,14 @@ Classifier: Topic :: Multimedia :: Sound/Audio
18
18
  Classifier: Topic :: Multimedia :: Video
19
19
  Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
20
20
  Requires-Python: >=3.9.0
21
- Requires-Dist: livekit-agents>=1.0.14
21
+ Requires-Dist: livekit-agents>=1.3.1
22
22
  Description-Content-Type: text/markdown
23
23
 
24
- # LiveKit Plugins Resemble
24
+ # Resemble plugin for LiveKit Agents
25
25
 
26
- Agent Framework plugin for voice synthesis with the [Resemble AI](https://www.resemble.ai/) API, using both their REST API and WebSocket streaming interface.
26
+ Support for voice synthesis with the [Resemble AI](https://www.resemble.ai/) API, using both their REST API and WebSocket streaming interface.
27
+
28
+ See [https://docs.livekit.io/agents/integrations/tts/resemble/](https://docs.livekit.io/agents/integrations/tts/resemble/) for more information.
27
29
 
28
30
  ## Installation
29
31
 
@@ -1,6 +1,8 @@
1
- # LiveKit Plugins Resemble
1
+ # Resemble plugin for LiveKit Agents
2
2
 
3
- Agent Framework plugin for voice synthesis with the [Resemble AI](https://www.resemble.ai/) API, using both their REST API and WebSocket streaming interface.
3
+ Support for voice synthesis with the [Resemble AI](https://www.resemble.ai/) API, using both their REST API and WebSocket streaming interface.
4
+
5
+ See [https://docs.livekit.io/agents/integrations/tts/resemble/](https://docs.livekit.io/agents/integrations/tts/resemble/) for more information.
4
6
 
5
7
  ## Installation
6
8
 
@@ -12,6 +12,11 @@
12
12
  # See the License for the specific language governing permissions and
13
13
  # limitations under the License.
14
14
 
15
+ """Resemble plugin for LiveKit Agents
16
+
17
+ See https://docs.livekit.io/agents/integrations/tts/resemble/ for more information.
18
+ """
19
+
15
20
  from .tts import TTS, ChunkedStream, SynthesizeStream
16
21
  from .version import __version__
17
22
 
@@ -0,0 +1,360 @@
1
+ # Copyright 2025 LiveKit, Inc.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from __future__ import annotations
16
+
17
+ import asyncio
18
+ import base64
19
+ import json
20
+ import os
21
+ import weakref
22
+ from dataclasses import dataclass, replace
23
+
24
+ import aiohttp
25
+
26
+ from livekit.agents import (
27
+ APIConnectionError,
28
+ APIConnectOptions,
29
+ APIError,
30
+ APIStatusError,
31
+ APITimeoutError,
32
+ tokenize,
33
+ tts,
34
+ utils,
35
+ )
36
+ from livekit.agents.types import DEFAULT_API_CONNECT_OPTIONS
37
+
38
+ from .log import logger
39
+
40
+ RESEMBLE_WEBSOCKET_URL = "wss://websocket.cluster.resemble.ai/stream"
41
+ RESEMBLE_REST_API_URL = "https://f.cluster.resemble.ai/synthesize"
42
+ DEFAULT_VOICE_UUID = "55592656"
43
+
44
+
45
+ @dataclass
46
+ class _TTSOptions:
47
+ voice_uuid: str
48
+ sample_rate: int
49
+ tokenizer: tokenize.SentenceTokenizer
50
+
51
+
52
+ class TTS(tts.TTS):
53
+ def __init__(
54
+ self,
55
+ *,
56
+ api_key: str | None = None,
57
+ voice_uuid: str | None = None,
58
+ tokenizer: tokenize.SentenceTokenizer | None = None,
59
+ sample_rate: int = 44100,
60
+ http_session: aiohttp.ClientSession | None = None,
61
+ use_streaming: bool = True,
62
+ ) -> None:
63
+ """
64
+ Create a new instance of the Resemble TTS.
65
+
66
+ See https://docs.app.resemble.ai/docs/text_to_speech/ for more documentation on all of these options.
67
+
68
+ Args:
69
+ voice_uuid (str, optional): The voice UUID for the desired voice. Defaults to None, in which case DEFAULT_VOICE_UUID is used.
70
+ sample_rate (int, optional): The audio sample rate in Hz. Defaults to 44100.
71
+ api_key (str | None, optional): The Resemble API key. If not provided, it will be read from the RESEMBLE_API_KEY environment variable.
72
+ http_session (aiohttp.ClientSession | None, optional): An existing aiohttp ClientSession to use. If not provided, a new session will be created.
73
+ tokenizer (tokenize.SentenceTokenizer, optional): The tokenizer to use. Defaults to tokenize.blingfire.SentenceTokenizer().
74
+ use_streaming (bool, optional): Whether to use streaming or not. Defaults to True.
75
+ """ # noqa: E501
76
+ super().__init__(
77
+ capabilities=tts.TTSCapabilities(streaming=use_streaming),
78
+ sample_rate=sample_rate,
79
+ num_channels=1,
80
+ )
81
+
82
+ api_key = api_key or os.environ.get("RESEMBLE_API_KEY")
83
+ if not api_key:
84
+ raise ValueError(
85
+ "Resemble API key is required, either as argument or set RESEMBLE_API_KEY"
86
+ " environment variable"
87
+ )
88
+ self._api_key = api_key
89
+
90
+ if tokenizer is None:
91
+ tokenizer = tokenize.blingfire.SentenceTokenizer()
92
+
93
+ if voice_uuid is None:
94
+ voice_uuid = DEFAULT_VOICE_UUID
95
+
96
+ self._opts = _TTSOptions(
97
+ voice_uuid=voice_uuid,
98
+ sample_rate=sample_rate,
99
+ tokenizer=tokenizer,
100
+ )
101
+
102
+ self._session = http_session
103
+ self._streams = weakref.WeakSet[SynthesizeStream]()
104
+ self._pool = utils.ConnectionPool[aiohttp.ClientWebSocketResponse](
105
+ connect_cb=self._connect_ws,
106
+ close_cb=self._close_ws,
107
+ )
108
+
109
+ @property
110
+ def model(self) -> str:
111
+ return "unknown"
112
+
113
+ @property
114
+ def provider(self) -> str:
115
+ return "Resemble"
116
+
117
+ async def _connect_ws(self, timeout: float) -> aiohttp.ClientWebSocketResponse:
118
+ return await asyncio.wait_for(
119
+ self._ensure_session().ws_connect(
120
+ RESEMBLE_WEBSOCKET_URL,
121
+ headers={"Authorization": f"Bearer {self._api_key}"},
122
+ ),
123
+ timeout,
124
+ )
125
+
126
+ async def _close_ws(self, ws: aiohttp.ClientWebSocketResponse) -> None:
127
+ await ws.close()
128
+
129
+ def _ensure_session(self) -> aiohttp.ClientSession:
130
+ if not self._session:
131
+ self._session = utils.http_context.http_session()
132
+
133
+ return self._session
134
+
135
+ def prewarm(self) -> None:
136
+ self._pool.prewarm()
137
+
138
+ def update_options(
139
+ self,
140
+ *,
141
+ voice_uuid: str | None = None,
142
+ ) -> None:
143
+ """
144
+ Update the Text-to-Speech (TTS) configuration options.
145
+
146
+ Args:
147
+ voice_uuid (str, optional): The voice UUID for the desired voice.
148
+ """ # noqa: E501
149
+ self._opts.voice_uuid = voice_uuid or self._opts.voice_uuid
150
+
151
+ def synthesize(
152
+ self, text: str, *, conn_options: APIConnectOptions = DEFAULT_API_CONNECT_OPTIONS
153
+ ) -> ChunkedStream:
154
+ return ChunkedStream(tts=self, input_text=text, conn_options=conn_options)
155
+
156
+ def stream(
157
+ self, *, conn_options: APIConnectOptions = DEFAULT_API_CONNECT_OPTIONS
158
+ ) -> SynthesizeStream:
159
+ stream = SynthesizeStream(tts=self, conn_options=conn_options)
160
+ self._streams.add(stream)
161
+ return stream
162
+
163
+ async def aclose(self) -> None:
164
+ for stream in list(self._streams):
165
+ await stream.aclose()
166
+
167
+ self._streams.clear()
168
+ await self._pool.aclose()
169
+
170
+
171
+ class ChunkedStream(tts.ChunkedStream):
172
+ """Synthesize text into speech in one go using Resemble AI's REST API."""
173
+
174
+ def __init__(self, *, tts: TTS, input_text: str, conn_options: APIConnectOptions) -> None:
175
+ super().__init__(tts=tts, input_text=input_text, conn_options=conn_options)
176
+ self._tts: TTS = tts
177
+ self._opts = replace(tts._opts)
178
+
179
+ async def _run(self, output_emitter: tts.AudioEmitter) -> None:
180
+ try:
181
+ async with self._tts._ensure_session().post(
182
+ RESEMBLE_REST_API_URL,
183
+ headers={
184
+ "Authorization": f"Bearer {self._tts._api_key}",
185
+ "Content-Type": "application/json",
186
+ "Accept": "application/json",
187
+ },
188
+ json={
189
+ "voice_uuid": self._opts.voice_uuid,
190
+ "data": self._input_text,
191
+ "sample_rate": self._opts.sample_rate,
192
+ "precision": "PCM_16",
193
+ },
194
+ timeout=aiohttp.ClientTimeout(
195
+ total=30,
196
+ sock_connect=self._conn_options.timeout,
197
+ ),
198
+ ) as resp:
199
+ resp.raise_for_status()
200
+ response_json = await resp.json()
201
+
202
+ if not response_json.get("success", False):
203
+ issues = response_json.get("issues", ["Unknown error"])
204
+ error_msg = "; ".join(issues)
205
+ raise APIError(
206
+ message=f"Resemble API returned failure: {error_msg}",
207
+ body=json.dumps(response_json),
208
+ )
209
+
210
+ output_emitter.initialize(
211
+ request_id=utils.shortuuid(),
212
+ sample_rate=self._opts.sample_rate,
213
+ num_channels=1,
214
+ mime_type="audio/wav",
215
+ )
216
+
217
+ audio_b64 = response_json["audio_content"]
218
+ audio_bytes = base64.b64decode(audio_b64)
219
+
220
+ output_emitter.push(audio_bytes)
221
+ output_emitter.flush()
222
+
223
+ except asyncio.TimeoutError:
224
+ raise APITimeoutError() from None
225
+ except aiohttp.ClientResponseError as e:
226
+ raise APIStatusError(
227
+ message=e.message, status_code=e.status, request_id=None, body=None
228
+ ) from None
229
+ except Exception as e:
230
+ raise APIConnectionError() from e
231
+
232
+
233
+ class SynthesizeStream(tts.SynthesizeStream):
234
+ """Stream-based text-to-speech synthesis using Resemble AI WebSocket API.
235
+
236
+
237
+ This implementation connects to Resemble's WebSocket API for real-time streaming
238
+ synthesis. Note that this requires a Business plan subscription with Resemble AI.
239
+ """
240
+
241
+ def __init__(self, *, tts: TTS, conn_options: APIConnectOptions):
242
+ super().__init__(tts=tts, conn_options=conn_options)
243
+ self._tts: TTS = tts
244
+ self._opts = replace(tts._opts)
245
+ self._segments_ch = utils.aio.Chan[tokenize.SentenceStream]()
246
+
247
+ async def _run(self, output_emitter: tts.AudioEmitter) -> None:
248
+ request_id = utils.shortuuid()
249
+ output_emitter.initialize(
250
+ request_id=request_id,
251
+ sample_rate=self._opts.sample_rate,
252
+ num_channels=1,
253
+ stream=True,
254
+ mime_type="audio/mp3",
255
+ )
256
+
257
+ async def _tokenize_input() -> None:
258
+ """tokenize text from the input_ch into sentences, one sentence stream per segment"""
259
+ input_stream = None
260
+ async for text in self._input_ch:
261
+ if isinstance(text, str):
262
+ if input_stream is None:
263
+ # new segment (e.g. after a flush)
264
+ input_stream = self._opts.tokenizer.stream()
265
+ self._segments_ch.send_nowait(input_stream)
266
+ input_stream.push_text(text)
267
+ elif isinstance(text, self._FlushSentinel):
268
+ if input_stream is not None:
269
+ input_stream.end_input()
270
+ input_stream = None
271
+
272
+ if input_stream is not None:
273
+ input_stream.end_input()
274
+
275
+ self._segments_ch.close()
276
+
277
+ async def _process_segments() -> None:
278
+ async for input_stream in self._segments_ch:
279
+ await self._run_ws(input_stream, output_emitter)
280
+
281
+ tasks = [
282
+ asyncio.create_task(_tokenize_input()),
283
+ asyncio.create_task(_process_segments()),
284
+ ]
285
+ try:
286
+ await asyncio.gather(*tasks)
287
+ except asyncio.TimeoutError:
288
+ raise APITimeoutError() from None
289
+ except aiohttp.ClientResponseError as e:
290
+ raise APIStatusError(
291
+ message=e.message, status_code=e.status, request_id=request_id, body=None
292
+ ) from None
293
+ except Exception as e:
294
+ raise APIConnectionError() from e
295
+ finally:
296
+ await utils.aio.gracefully_cancel(*tasks)
297
+
298
+ async def _run_ws(
299
+ self, input_stream: tokenize.SentenceStream, output_emitter: tts.AudioEmitter
300
+ ) -> None:
301
+ segment_id = utils.shortuuid()
302
+ output_emitter.start_segment(segment_id=segment_id)
303
+
304
+ last_index = 0
305
+ input_ended = False
306
+
307
+ async def _send_task(ws: aiohttp.ClientWebSocketResponse) -> None:
308
+ nonlocal input_ended, last_index
309
+ async for data in input_stream:
310
+ last_index += 1
311
+ payload = {
312
+ "voice_uuid": self._opts.voice_uuid,
313
+ "data": data.token,
314
+ "request_id": last_index,
315
+ "sample_rate": self._opts.sample_rate,
316
+ "precision": "PCM_16",
317
+ "output_format": "mp3",
318
+ }
319
+ self._mark_started()
320
+ await ws.send_str(json.dumps(payload))
321
+
322
+ input_ended = True
323
+
324
+ async def _recv_task(ws: aiohttp.ClientWebSocketResponse) -> None:
325
+ while True:
326
+ msg = await ws.receive()
327
+ if msg.type in (
328
+ aiohttp.WSMsgType.CLOSED,
329
+ aiohttp.WSMsgType.CLOSE,
330
+ aiohttp.WSMsgType.CLOSING,
331
+ ):
332
+ raise APIStatusError("Resemble connection closed unexpectedly")
333
+
334
+ if msg.type != aiohttp.WSMsgType.TEXT:
335
+ logger.warning("Unexpected Resemble message type %s", msg.type)
336
+ continue
337
+
338
+ data = json.loads(msg.data)
339
+ if data.get("type") == "audio":
340
+ if data.get("audio_content", None):
341
+ b64data = base64.b64decode(data["audio_content"])
342
+ output_emitter.push(b64data)
343
+
344
+ elif data.get("type") == "audio_end":
345
+ index = data["request_id"]
346
+ if index == last_index and input_ended:
347
+ output_emitter.end_segment()
348
+ break
349
+ else:
350
+ logger.error("Unexpected Resemble message %s", data)
351
+
352
+ async with self._tts._pool.connection(timeout=self._conn_options.timeout) as ws:
353
+ tasks = [
354
+ asyncio.create_task(_send_task(ws)),
355
+ asyncio.create_task(_recv_task(ws)),
356
+ ]
357
+ try:
358
+ await asyncio.gather(*tasks)
359
+ finally:
360
+ await utils.aio.gracefully_cancel(*tasks)
@@ -12,4 +12,4 @@
12
12
  # See the License for the specific language governing permissions and
13
13
  # limitations under the License.
14
14
 
15
- __version__ = "1.0.14"
15
+ __version__ = "1.3.1"
@@ -10,7 +10,7 @@ readme = "README.md"
10
10
  license = "Apache-2.0"
11
11
  requires-python = ">=3.9.0"
12
12
  authors = [{ name = "LiveKit", email = "hello@livekit.io" }]
13
- keywords = ["webrtc", "realtime", "audio", "video", "livekit"]
13
+ keywords = ["voice", "ai", "realtime", "audio", "video", "livekit", "webrtc"]
14
14
  classifiers = [
15
15
  "Intended Audience :: Developers",
16
16
  "License :: OSI Approved :: Apache Software License",
@@ -23,7 +23,7 @@ classifiers = [
23
23
  "Programming Language :: Python :: 3 :: Only",
24
24
  ]
25
25
  dependencies = [
26
- "livekit-agents>=1.0.14",
26
+ "livekit-agents>=1.3.1",
27
27
  ]
28
28
 
29
29
  [project.urls]
@@ -1,452 +0,0 @@
1
- # Copyright 2025 LiveKit, Inc.
2
- #
3
- # Licensed under the Apache License, Version 2.0 (the "License");
4
- # you may not use this file except in compliance with the License.
5
- # You may obtain a copy of the License at
6
- #
7
- # http://www.apache.org/licenses/LICENSE-2.0
8
- #
9
- # Unless required by applicable law or agreed to in writing, software
10
- # distributed under the License is distributed on an "AS IS" BASIS,
11
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
- # See the License for the specific language governing permissions and
13
- # limitations under the License.
14
-
15
- from __future__ import annotations
16
-
17
- import asyncio
18
- import base64
19
- import json
20
- import os
21
- import weakref
22
- from dataclasses import dataclass
23
-
24
- import aiohttp
25
-
26
- from livekit.agents import (
27
- APIConnectionError,
28
- APIConnectOptions,
29
- APIStatusError,
30
- APITimeoutError,
31
- tokenize,
32
- tts,
33
- utils,
34
- )
35
- from livekit.agents.types import DEFAULT_API_CONNECT_OPTIONS
36
-
37
- from .log import logger
38
-
39
- RESEMBLE_WEBSOCKET_URL = "wss://websocket.cluster.resemble.ai/stream"
40
- RESEMBLE_REST_API_URL = "https://f.cluster.resemble.ai/synthesize"
41
- NUM_CHANNELS = 1
42
- DEFAULT_VOICE_UUID = "55592656"
43
- BUFFERED_WORDS_COUNT = 3
44
-
45
-
46
- @dataclass
47
- class _TTSOptions:
48
- voice_uuid: str
49
- sample_rate: int
50
- tokenizer: tokenize.SentenceTokenizer
51
-
52
-
53
- class TTS(tts.TTS):
54
- def __init__(
55
- self,
56
- *,
57
- api_key: str | None = None,
58
- voice_uuid: str | None = None,
59
- tokenizer: tokenize.SentenceTokenizer | None = None,
60
- sample_rate: int = 44100,
61
- http_session: aiohttp.ClientSession | None = None,
62
- use_streaming: bool = True,
63
- ) -> None:
64
- """
65
- Create a new instance of the Resemble TTS.
66
-
67
- See https://docs.app.resemble.ai/docs/text_to_speech/ for more documentation on all of these options.
68
-
69
- Args:
70
- voice_uuid (str, optional): The voice UUID for the desired voice. Defaults to None.
71
- sample_rate (int, optional): The audio sample rate in Hz. Defaults to 44100.
72
- api_key (str | None, optional): The Resemble API key. If not provided, it will be read from the RESEMBLE_API_KEY environment variable.
73
- http_session (aiohttp.ClientSession | None, optional): An existing aiohttp ClientSession to use. If not provided, a new session will be created.
74
- tokenizer (tokenize.SentenceTokenizer, optional): The tokenizer to use. Defaults to tokenize.SentenceTokenizer().
75
- use_streaming (bool, optional): Whether to use streaming or not. Defaults to True.
76
- """ # noqa: E501
77
- super().__init__(
78
- capabilities=tts.TTSCapabilities(streaming=use_streaming),
79
- sample_rate=sample_rate,
80
- num_channels=NUM_CHANNELS,
81
- )
82
-
83
- api_key = api_key or os.environ.get("RESEMBLE_API_KEY")
84
- if not api_key:
85
- raise ValueError(
86
- "Resemble API key is required, either as argument or set RESEMBLE_API_KEY"
87
- " environment variable"
88
- )
89
- self._api_key = api_key
90
-
91
- if tokenizer is None:
92
- tokenizer = tokenize.basic.SentenceTokenizer(min_sentence_len=BUFFERED_WORDS_COUNT)
93
-
94
- if voice_uuid is None:
95
- voice_uuid = DEFAULT_VOICE_UUID
96
-
97
- self._opts = _TTSOptions(
98
- voice_uuid=voice_uuid,
99
- sample_rate=sample_rate,
100
- tokenizer=tokenizer,
101
- )
102
-
103
- self._session = http_session
104
- self._streams = weakref.WeakSet[SynthesizeStream]()
105
- self._pool = utils.ConnectionPool[aiohttp.ClientWebSocketResponse](
106
- connect_cb=self._connect_ws,
107
- close_cb=self._close_ws,
108
- )
109
-
110
- async def _connect_ws(self) -> aiohttp.ClientWebSocketResponse:
111
- session = self._ensure_session()
112
-
113
- return await asyncio.wait_for(
114
- session.ws_connect(
115
- RESEMBLE_WEBSOCKET_URL,
116
- headers={"Authorization": f"Bearer {self._api_key}"},
117
- ),
118
- self._conn_options.timeout,
119
- )
120
-
121
- async def _close_ws(self, ws: aiohttp.ClientWebSocketResponse):
122
- await ws.close()
123
-
124
- def _ensure_session(self) -> aiohttp.ClientSession:
125
- if not self._session:
126
- self._session = utils.http_context.http_session()
127
-
128
- return self._session
129
-
130
- def prewarm(self) -> None:
131
- self._pool.prewarm()
132
-
133
- def update_options(
134
- self,
135
- *,
136
- voice_uuid: str | None = None,
137
- sample_rate: int | None = None,
138
- ) -> None:
139
- """
140
- Update the Text-to-Speech (TTS) configuration options.
141
-
142
- Args:
143
- voice_uuid (str, optional): The voice UUID for the desired voice.
144
- sample_rate (int, optional): The audio sample rate in Hz.
145
- """ # noqa: E501
146
- self._opts.voice_uuid = voice_uuid or self._opts.voice_uuid
147
- self._opts.sample_rate = sample_rate or self._opts.sample_rate
148
-
149
- def synthesize(
150
- self,
151
- text: str,
152
- *,
153
- conn_options: APIConnectOptions | None = None,
154
- ) -> ChunkedStream:
155
- return ChunkedStream(
156
- tts=self,
157
- input_text=text,
158
- conn_options=conn_options or DEFAULT_API_CONNECT_OPTIONS,
159
- opts=self._opts,
160
- api_key=self._api_key,
161
- session=self._ensure_session(),
162
- )
163
-
164
- def stream(self, *, conn_options: APIConnectOptions | None = None) -> SynthesizeStream:
165
- stream = SynthesizeStream(
166
- tts=self,
167
- pool=self._pool,
168
- opts=self._opts,
169
- api_key=self._api_key,
170
- )
171
- self._streams.add(stream)
172
- return stream
173
-
174
- async def aclose(self) -> None:
175
- for stream in list(self._streams):
176
- await stream.aclose()
177
- self._streams.clear()
178
- await self._pool.aclose()
179
- await super().aclose()
180
-
181
-
182
- class ChunkedStream(tts.ChunkedStream):
183
- """Synthesize text into speech in one go using Resemble AI's REST API."""
184
-
185
- def __init__(
186
- self,
187
- *,
188
- tts: TTS,
189
- input_text: str,
190
- opts: _TTSOptions,
191
- conn_options: APIConnectOptions,
192
- api_key: str,
193
- session: aiohttp.ClientSession,
194
- ) -> None:
195
- super().__init__(tts=tts, input_text=input_text, conn_options=conn_options)
196
- self._opts, self._session, self._api_key = opts, session, api_key
197
-
198
- async def _run(self) -> None:
199
- request_id = utils.shortuuid()
200
-
201
- # Create request headers
202
- headers = {
203
- "Authorization": f"Bearer {self._api_key}",
204
- "Content-Type": "application/json",
205
- "Accept": "application/json", # Expect JSON response
206
- }
207
-
208
- # Create request payload
209
- payload = {
210
- "voice_uuid": self._opts.voice_uuid,
211
- "data": self._input_text,
212
- "sample_rate": self._opts.sample_rate,
213
- "precision": "PCM_16",
214
- }
215
- decoder = utils.codecs.AudioStreamDecoder(
216
- sample_rate=self._opts.sample_rate,
217
- num_channels=NUM_CHANNELS,
218
- )
219
-
220
- try:
221
- async with self._session.post(
222
- RESEMBLE_REST_API_URL,
223
- headers=headers,
224
- json=payload,
225
- timeout=aiohttp.ClientTimeout(
226
- total=30,
227
- sock_connect=self._conn_options.timeout,
228
- ),
229
- ) as response:
230
- response.raise_for_status()
231
- response_json = await response.json()
232
-
233
- # Check for success
234
- if not response_json.get("success", False):
235
- issues = response_json.get("issues", ["Unknown error"])
236
- error_msg = "; ".join(issues)
237
- raise APIStatusError(
238
- message=f"Resemble API returned failure: {error_msg}",
239
- status_code=response.status,
240
- request_id=request_id,
241
- body=json.dumps(response_json),
242
- )
243
-
244
- # Extract base64-encoded audio content
245
- audio_content_b64 = response_json.get("audio_content")
246
- if not audio_content_b64:
247
- raise APIStatusError(
248
- message="No audio content in response",
249
- status_code=response.status,
250
- request_id=request_id,
251
- body=json.dumps(response_json),
252
- )
253
-
254
- # Decode base64 to get raw audio bytes
255
- audio_bytes = base64.b64decode(audio_content_b64)
256
- decoder.push(audio_bytes)
257
- decoder.end_input()
258
-
259
- emitter = tts.SynthesizedAudioEmitter(
260
- event_ch=self._event_ch,
261
- request_id=request_id,
262
- )
263
- async for frame in decoder:
264
- emitter.push(frame)
265
- emitter.flush()
266
-
267
- except aiohttp.ClientResponseError as e:
268
- raise APIStatusError(
269
- message=e.message,
270
- status_code=e.status,
271
- request_id=request_id,
272
- body=f"resemble api error: {str(e)}",
273
- ) from e
274
- except asyncio.TimeoutError as e:
275
- raise APITimeoutError() from e
276
- except aiohttp.ClientError as e:
277
- raise APIConnectionError(
278
- message=f"Resemble API connection error: {str(e)}",
279
- ) from e
280
- except Exception as e:
281
- raise APIConnectionError(f"Error during synthesis: {str(e)}") from e
282
- finally:
283
- await decoder.aclose()
284
-
285
-
286
- class SynthesizeStream(tts.SynthesizeStream):
287
- """Stream-based text-to-speech synthesis using Resemble AI WebSocket API.
288
-
289
-
290
- This implementation connects to Resemble's WebSocket API for real-time streaming
291
- synthesis. Note that this requires a Business plan subscription with Resemble AI.
292
- """
293
-
294
- def __init__(
295
- self,
296
- *,
297
- tts: TTS,
298
- opts: _TTSOptions,
299
- pool: utils.ConnectionPool[aiohttp.ClientWebSocketResponse],
300
- api_key: str,
301
- ):
302
- super().__init__(tts=tts)
303
- self._opts, self._pool, self._api_key = opts, pool, api_key
304
-
305
- async def _run(self) -> None:
306
- request_id = utils.shortuuid()
307
- self._segments_ch = utils.aio.Chan[tokenize.SentenceStream]()
308
-
309
- @utils.log_exceptions(logger=logger)
310
- async def _tokenize_input():
311
- """tokenize text from the input_ch to words"""
312
- input_stream = None
313
- async for input in self._input_ch:
314
- if isinstance(input, str):
315
- if input_stream is None:
316
- # new segment (after flush for e.g)
317
- input_stream = self._opts.tokenizer.stream()
318
- self._segments_ch.send_nowait(input_stream)
319
- input_stream.push_text(input)
320
- elif isinstance(input, self._FlushSentinel):
321
- if input_stream is not None:
322
- input_stream.end_input()
323
- input_stream = None
324
- if input_stream is not None:
325
- input_stream.end_input()
326
- self._segments_ch.close()
327
-
328
- @utils.log_exceptions(logger=logger)
329
- async def _process_segments():
330
- async for input_stream in self._segments_ch:
331
- await self._run_ws(input_stream)
332
-
333
- tasks = [
334
- asyncio.create_task(_tokenize_input()),
335
- asyncio.create_task(_process_segments()),
336
- ]
337
- try:
338
- await asyncio.gather(*tasks)
339
- except asyncio.TimeoutError as e:
340
- raise APITimeoutError() from e
341
- except aiohttp.ClientResponseError as e:
342
- raise APIStatusError(
343
- message=e.message,
344
- status_code=e.status,
345
- request_id=request_id,
346
- body=None,
347
- ) from e
348
- except Exception as e:
349
- raise APIConnectionError() from e
350
- finally:
351
- await utils.aio.gracefully_cancel(*tasks)
352
-
353
- async def _run_ws(
354
- self,
355
- input_stream: tokenize.SentenceStream,
356
- ) -> None:
357
- async with self._pool.connection() as ws:
358
- segment_id = utils.shortuuid()
359
- decoder = utils.codecs.AudioStreamDecoder(
360
- sample_rate=self._opts.sample_rate,
361
- num_channels=NUM_CHANNELS,
362
- )
363
- index_lock = asyncio.Lock()
364
- current_index = 0
365
- pending_requests = set()
366
-
367
- @utils.log_exceptions(logger=logger)
368
- async def _send_task(ws: aiohttp.ClientWebSocketResponse):
369
- nonlocal current_index
370
- index = 0
371
- async for data in input_stream:
372
- payload = {
373
- "voice_uuid": self._opts.voice_uuid,
374
- "data": data.token,
375
- "request_id": index,
376
- "sample_rate": self._opts.sample_rate,
377
- "precision": "PCM_16",
378
- "output_format": "mp3",
379
- }
380
- async with index_lock:
381
- pending_requests.add(index)
382
- index += 1
383
- current_index = index
384
- await ws.send_str(json.dumps(payload))
385
-
386
- @utils.log_exceptions(logger=logger)
387
- async def _emit_task():
388
- emitter = tts.SynthesizedAudioEmitter(
389
- event_ch=self._event_ch,
390
- request_id=str(current_index),
391
- segment_id=segment_id,
392
- )
393
- async for frame in decoder:
394
- emitter.push(frame)
395
- emitter.flush()
396
-
397
- @utils.log_exceptions(logger=logger)
398
- async def _recv_task(ws: aiohttp.ClientWebSocketResponse):
399
- while True:
400
- msg = await ws.receive()
401
- if msg.type in (
402
- aiohttp.WSMsgType.CLOSED,
403
- aiohttp.WSMsgType.CLOSE,
404
- aiohttp.WSMsgType.CLOSING,
405
- ):
406
- raise APIStatusError(
407
- "Resemble connection closed unexpectedly",
408
- request_id=str(current_index),
409
- )
410
-
411
- if msg.type != aiohttp.WSMsgType.TEXT:
412
- logger.warning("Unexpected Resemble message type %s", msg.type)
413
- continue
414
-
415
- data = json.loads(msg.data)
416
-
417
- if data.get("type") == "audio":
418
- if data.get("audio_content", None):
419
- b64data = base64.b64decode(data["audio_content"])
420
- decoder.push(b64data)
421
-
422
- elif data.get("type") == "audio_end":
423
- async with index_lock:
424
- index = data["request_id"]
425
- pending_requests.remove(index)
426
- if not pending_requests:
427
- decoder.end_input()
428
- break # we are not going to receive any more audio
429
- else:
430
- logger.error("Unexpected Resemble message %s", data)
431
-
432
- tasks = [
433
- asyncio.create_task(_send_task(ws)),
434
- asyncio.create_task(_recv_task(ws)),
435
- asyncio.create_task(_emit_task()),
436
- ]
437
-
438
- try:
439
- await asyncio.gather(*tasks)
440
- except asyncio.TimeoutError as e:
441
- raise APITimeoutError() from e
442
- except aiohttp.ClientResponseError as e:
443
- raise APIStatusError(
444
- message=e.message,
445
- status_code=e.status,
446
- request_id=str(current_index),
447
- body=None,
448
- ) from e
449
- except Exception as e:
450
- raise APIConnectionError() from e
451
- finally:
452
- await utils.aio.gracefully_cancel(*tasks)