livekit-plugins-resemble 0.1.0__py3-none-any.whl → 0.1.0rc1__py3-none-any.whl
This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of livekit-plugins-resemble might be problematic. Click here for more details.
- livekit/plugins/resemble/models.py +0 -5
- livekit/plugins/resemble/tts.py +233 -401
- livekit/plugins/resemble/version.py +1 -1
- {livekit_plugins_resemble-0.1.0.dist-info → livekit_plugins_resemble-0.1.0rc1.dist-info}/METADATA +12 -21
- livekit_plugins_resemble-0.1.0rc1.dist-info/RECORD +9 -0
- {livekit_plugins_resemble-0.1.0.dist-info → livekit_plugins_resemble-0.1.0rc1.dist-info}/WHEEL +1 -2
- livekit_plugins_resemble-0.1.0.dist-info/RECORD +0 -10
- livekit_plugins_resemble-0.1.0.dist-info/top_level.txt +0 -1
livekit/plugins/resemble/tts.py
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
# Copyright
|
|
1
|
+
# Copyright 2025 LiveKit, Inc.
|
|
2
2
|
#
|
|
3
3
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
4
|
# you may not use this file except in compliance with the License.
|
|
@@ -18,22 +18,21 @@ import asyncio
|
|
|
18
18
|
import base64
|
|
19
19
|
import json
|
|
20
20
|
import os
|
|
21
|
-
import time
|
|
22
21
|
import weakref
|
|
23
22
|
from dataclasses import dataclass
|
|
24
|
-
from typing import Optional
|
|
25
23
|
|
|
26
24
|
import aiohttp
|
|
27
|
-
|
|
28
|
-
from livekit import rtc
|
|
25
|
+
|
|
29
26
|
from livekit.agents import (
|
|
30
27
|
APIConnectionError,
|
|
31
28
|
APIConnectOptions,
|
|
32
29
|
APIStatusError,
|
|
33
30
|
APITimeoutError,
|
|
31
|
+
tokenize,
|
|
34
32
|
tts,
|
|
35
33
|
utils,
|
|
36
34
|
)
|
|
35
|
+
from livekit.agents.types import DEFAULT_API_CONNECT_OPTIONS
|
|
37
36
|
|
|
38
37
|
from .log import logger
|
|
39
38
|
|
|
@@ -41,12 +40,14 @@ RESEMBLE_WEBSOCKET_URL = "wss://websocket.cluster.resemble.ai/stream"
|
|
|
41
40
|
RESEMBLE_REST_API_URL = "https://f.cluster.resemble.ai/synthesize"
|
|
42
41
|
NUM_CHANNELS = 1
|
|
43
42
|
DEFAULT_VOICE_UUID = "55592656"
|
|
43
|
+
BUFFERED_WORDS_COUNT = 3
|
|
44
44
|
|
|
45
45
|
|
|
46
46
|
@dataclass
|
|
47
|
-
class
|
|
47
|
+
class _TTSOptions:
|
|
48
48
|
voice_uuid: str
|
|
49
49
|
sample_rate: int
|
|
50
|
+
tokenizer: tokenize.SentenceTokenizer
|
|
50
51
|
|
|
51
52
|
|
|
52
53
|
class TTS(tts.TTS):
|
|
@@ -54,111 +55,127 @@ class TTS(tts.TTS):
|
|
|
54
55
|
self,
|
|
55
56
|
*,
|
|
56
57
|
api_key: str | None = None,
|
|
57
|
-
voice_uuid: str | None =
|
|
58
|
+
voice_uuid: str | None = None,
|
|
59
|
+
tokenizer: tokenize.SentenceTokenizer | None = None,
|
|
58
60
|
sample_rate: int = 44100,
|
|
59
61
|
http_session: aiohttp.ClientSession | None = None,
|
|
62
|
+
use_streaming: bool = True,
|
|
60
63
|
) -> None:
|
|
64
|
+
"""
|
|
65
|
+
Create a new instance of the Resemble TTS.
|
|
66
|
+
|
|
67
|
+
See https://docs.app.resemble.ai/docs/text_to_speech/ for more documentation on all of these options.
|
|
68
|
+
|
|
69
|
+
Args:
|
|
70
|
+
voice_uuid (str, optional): The voice UUID for the desired voice. Defaults to None.
|
|
71
|
+
sample_rate (int, optional): The audio sample rate in Hz. Defaults to 44100.
|
|
72
|
+
api_key (str | None, optional): The Resemble API key. If not provided, it will be read from the RESEMBLE_API_KEY environment variable.
|
|
73
|
+
http_session (aiohttp.ClientSession | None, optional): An existing aiohttp ClientSession to use. If not provided, a new session will be created.
|
|
74
|
+
tokenizer (tokenize.SentenceTokenizer, optional): The tokenizer to use. Defaults to tokenize.SentenceTokenizer().
|
|
75
|
+
use_streaming (bool, optional): Whether to use streaming or not. Defaults to True.
|
|
76
|
+
""" # noqa: E501
|
|
61
77
|
super().__init__(
|
|
62
|
-
capabilities=tts.TTSCapabilities(
|
|
63
|
-
streaming=True,
|
|
64
|
-
),
|
|
78
|
+
capabilities=tts.TTSCapabilities(streaming=use_streaming),
|
|
65
79
|
sample_rate=sample_rate,
|
|
66
80
|
num_channels=NUM_CHANNELS,
|
|
67
81
|
)
|
|
68
82
|
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
if not self._api_key:
|
|
83
|
+
api_key = api_key or os.environ.get("RESEMBLE_API_KEY")
|
|
84
|
+
if not api_key:
|
|
72
85
|
raise ValueError(
|
|
73
|
-
"Resemble API key is required, either as argument or set RESEMBLE_API_KEY
|
|
86
|
+
"Resemble API key is required, either as argument or set RESEMBLE_API_KEY"
|
|
87
|
+
" environment variable"
|
|
74
88
|
)
|
|
89
|
+
self._api_key = api_key
|
|
90
|
+
|
|
91
|
+
if tokenizer is None:
|
|
92
|
+
tokenizer = tokenize.basic.SentenceTokenizer(min_sentence_len=BUFFERED_WORDS_COUNT)
|
|
93
|
+
|
|
94
|
+
if voice_uuid is None:
|
|
95
|
+
voice_uuid = DEFAULT_VOICE_UUID
|
|
75
96
|
|
|
76
|
-
|
|
77
|
-
self._opts = _Options(
|
|
97
|
+
self._opts = _TTSOptions(
|
|
78
98
|
voice_uuid=voice_uuid,
|
|
79
99
|
sample_rate=sample_rate,
|
|
100
|
+
tokenizer=tokenizer,
|
|
80
101
|
)
|
|
81
102
|
|
|
82
103
|
self._session = http_session
|
|
83
104
|
self._streams = weakref.WeakSet[SynthesizeStream]()
|
|
84
|
-
|
|
85
|
-
# Create a connection pool for WebSockets
|
|
86
|
-
self._pool = utils.ConnectionPool[websockets.WebSocketClientProtocol](
|
|
105
|
+
self._pool = utils.ConnectionPool[aiohttp.ClientWebSocketResponse](
|
|
87
106
|
connect_cb=self._connect_ws,
|
|
88
107
|
close_cb=self._close_ws,
|
|
89
108
|
)
|
|
90
109
|
|
|
91
|
-
async def _connect_ws(self) ->
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
110
|
+
async def _connect_ws(self) -> aiohttp.ClientWebSocketResponse:
|
|
111
|
+
session = self._ensure_session()
|
|
112
|
+
|
|
113
|
+
return await asyncio.wait_for(
|
|
114
|
+
session.ws_connect(
|
|
115
|
+
RESEMBLE_WEBSOCKET_URL,
|
|
116
|
+
headers={"Authorization": f"Bearer {self._api_key}"},
|
|
117
|
+
),
|
|
118
|
+
self._conn_options.timeout,
|
|
98
119
|
)
|
|
99
120
|
|
|
100
|
-
async def _close_ws(self, ws:
|
|
101
|
-
"""Close the WebSocket connection."""
|
|
121
|
+
async def _close_ws(self, ws: aiohttp.ClientWebSocketResponse):
|
|
102
122
|
await ws.close()
|
|
103
123
|
|
|
124
|
+
def _ensure_session(self) -> aiohttp.ClientSession:
|
|
125
|
+
if not self._session:
|
|
126
|
+
self._session = utils.http_context.http_session()
|
|
127
|
+
|
|
128
|
+
return self._session
|
|
129
|
+
|
|
130
|
+
def prewarm(self) -> None:
|
|
131
|
+
self._pool.prewarm()
|
|
132
|
+
|
|
104
133
|
def update_options(
|
|
105
134
|
self,
|
|
106
135
|
*,
|
|
107
136
|
voice_uuid: str | None = None,
|
|
108
|
-
|
|
137
|
+
sample_rate: int | None = None,
|
|
109
138
|
) -> None:
|
|
110
|
-
"""
|
|
111
|
-
|
|
112
|
-
|
|
139
|
+
"""
|
|
140
|
+
Update the Text-to-Speech (TTS) configuration options.
|
|
141
|
+
|
|
142
|
+
Args:
|
|
143
|
+
voice_uuid (str, optional): The voice UUID for the desired voice.
|
|
144
|
+
sample_rate (int, optional): The audio sample rate in Hz.
|
|
145
|
+
""" # noqa: E501
|
|
146
|
+
self._opts.voice_uuid = voice_uuid or self._opts.voice_uuid
|
|
147
|
+
self._opts.sample_rate = sample_rate or self._opts.sample_rate
|
|
113
148
|
|
|
114
149
|
def synthesize(
|
|
115
150
|
self,
|
|
116
151
|
text: str,
|
|
117
152
|
*,
|
|
118
|
-
conn_options:
|
|
119
|
-
) ->
|
|
120
|
-
"""Synthesize text into speech using Resemble AI."""
|
|
153
|
+
conn_options: APIConnectOptions | None = None,
|
|
154
|
+
) -> ChunkedStream:
|
|
121
155
|
return ChunkedStream(
|
|
122
156
|
tts=self,
|
|
123
157
|
input_text=text,
|
|
158
|
+
conn_options=conn_options or DEFAULT_API_CONNECT_OPTIONS,
|
|
124
159
|
opts=self._opts,
|
|
125
|
-
conn_options=conn_options,
|
|
126
160
|
api_key=self._api_key,
|
|
127
|
-
session=self.
|
|
161
|
+
session=self._ensure_session(),
|
|
128
162
|
)
|
|
129
163
|
|
|
130
|
-
def stream(
|
|
131
|
-
self, *, conn_options: Optional[APIConnectOptions] = None
|
|
132
|
-
) -> "SynthesizeStream":
|
|
133
|
-
"""Create a streaming synthesis connection to Resemble AI."""
|
|
164
|
+
def stream(self, *, conn_options: APIConnectOptions | None = None) -> SynthesizeStream:
|
|
134
165
|
stream = SynthesizeStream(
|
|
135
166
|
tts=self,
|
|
167
|
+
pool=self._pool,
|
|
136
168
|
opts=self._opts,
|
|
137
|
-
conn_options=conn_options,
|
|
138
169
|
api_key=self._api_key,
|
|
139
|
-
pool=self._pool,
|
|
140
170
|
)
|
|
141
171
|
self._streams.add(stream)
|
|
142
172
|
return stream
|
|
143
173
|
|
|
144
|
-
async def __aenter__(self) -> "TTS":
|
|
145
|
-
"""Enter async context manager."""
|
|
146
|
-
return self
|
|
147
|
-
|
|
148
|
-
async def __aexit__(self, exc_type, exc_val, exc_tb) -> None:
|
|
149
|
-
"""Exit async context manager and clean up resources."""
|
|
150
|
-
await self.aclose()
|
|
151
|
-
|
|
152
174
|
async def aclose(self) -> None:
|
|
153
|
-
"""Clean up resources."""
|
|
154
|
-
# Close all active streams
|
|
155
175
|
for stream in list(self._streams):
|
|
156
176
|
await stream.aclose()
|
|
157
177
|
self._streams.clear()
|
|
158
|
-
|
|
159
|
-
# Close the WebSocket connection pool
|
|
160
178
|
await self._pool.aclose()
|
|
161
|
-
|
|
162
179
|
await super().aclose()
|
|
163
180
|
|
|
164
181
|
|
|
@@ -170,19 +187,15 @@ class ChunkedStream(tts.ChunkedStream):
|
|
|
170
187
|
*,
|
|
171
188
|
tts: TTS,
|
|
172
189
|
input_text: str,
|
|
173
|
-
opts:
|
|
174
|
-
conn_options:
|
|
175
|
-
api_key: str
|
|
190
|
+
opts: _TTSOptions,
|
|
191
|
+
conn_options: APIConnectOptions,
|
|
192
|
+
api_key: str,
|
|
176
193
|
session: aiohttp.ClientSession,
|
|
177
194
|
) -> None:
|
|
178
195
|
super().__init__(tts=tts, input_text=input_text, conn_options=conn_options)
|
|
179
|
-
self._opts = opts
|
|
180
|
-
self._api_key = api_key
|
|
181
|
-
self._session = session
|
|
182
|
-
self._segment_id = utils.shortuuid()
|
|
196
|
+
self._opts, self._session, self._api_key = opts, session, api_key
|
|
183
197
|
|
|
184
198
|
async def _run(self) -> None:
|
|
185
|
-
"""Run the synthesis process using REST API."""
|
|
186
199
|
request_id = utils.shortuuid()
|
|
187
200
|
|
|
188
201
|
# Create request headers
|
|
@@ -197,35 +210,24 @@ class ChunkedStream(tts.ChunkedStream):
|
|
|
197
210
|
"voice_uuid": self._opts.voice_uuid,
|
|
198
211
|
"data": self._input_text,
|
|
199
212
|
"sample_rate": self._opts.sample_rate,
|
|
213
|
+
"precision": "PCM_16",
|
|
200
214
|
}
|
|
201
|
-
|
|
202
|
-
# Create decoder for audio processing
|
|
203
215
|
decoder = utils.codecs.AudioStreamDecoder(
|
|
204
216
|
sample_rate=self._opts.sample_rate,
|
|
205
217
|
num_channels=NUM_CHANNELS,
|
|
206
218
|
)
|
|
207
219
|
|
|
208
220
|
try:
|
|
209
|
-
# Make the HTTP request with explicit timeout
|
|
210
221
|
async with self._session.post(
|
|
211
222
|
RESEMBLE_REST_API_URL,
|
|
212
223
|
headers=headers,
|
|
213
224
|
json=payload,
|
|
214
225
|
timeout=aiohttp.ClientTimeout(
|
|
215
|
-
total=30,
|
|
226
|
+
total=30,
|
|
216
227
|
sock_connect=self._conn_options.timeout,
|
|
217
228
|
),
|
|
218
229
|
) as response:
|
|
219
|
-
|
|
220
|
-
error_text = await response.text()
|
|
221
|
-
raise APIStatusError(
|
|
222
|
-
message=f"Resemble API error: {error_text}",
|
|
223
|
-
status_code=response.status,
|
|
224
|
-
request_id=request_id,
|
|
225
|
-
body=error_text,
|
|
226
|
-
)
|
|
227
|
-
|
|
228
|
-
# Parse the JSON response
|
|
230
|
+
response.raise_for_status()
|
|
229
231
|
response_json = await response.json()
|
|
230
232
|
|
|
231
233
|
# Check for success
|
|
@@ -251,42 +253,32 @@ class ChunkedStream(tts.ChunkedStream):
|
|
|
251
253
|
|
|
252
254
|
# Decode base64 to get raw audio bytes
|
|
253
255
|
audio_bytes = base64.b64decode(audio_content_b64)
|
|
256
|
+
decoder.push(audio_bytes)
|
|
257
|
+
decoder.end_input()
|
|
254
258
|
|
|
255
|
-
# Create audio emitter
|
|
256
259
|
emitter = tts.SynthesizedAudioEmitter(
|
|
257
260
|
event_ch=self._event_ch,
|
|
258
261
|
request_id=request_id,
|
|
259
|
-
segment_id=self._segment_id,
|
|
260
262
|
)
|
|
261
|
-
|
|
262
|
-
# Push audio data to decoder
|
|
263
|
-
decoder.push(audio_bytes)
|
|
264
|
-
decoder.end_input()
|
|
265
|
-
|
|
266
|
-
# Emit audio frames
|
|
267
263
|
async for frame in decoder:
|
|
268
264
|
emitter.push(frame)
|
|
269
|
-
|
|
270
|
-
# Final flush of the emitter
|
|
271
265
|
emitter.flush()
|
|
272
266
|
|
|
273
267
|
except aiohttp.ClientResponseError as e:
|
|
274
|
-
# Handle HTTP errors (4xx, 5xx)
|
|
275
268
|
raise APIStatusError(
|
|
276
|
-
message=
|
|
269
|
+
message=e.message,
|
|
277
270
|
status_code=e.status,
|
|
278
271
|
request_id=request_id,
|
|
279
|
-
body=
|
|
272
|
+
body=f"resemble api error: {str(e)}",
|
|
280
273
|
) from e
|
|
281
274
|
except asyncio.TimeoutError as e:
|
|
282
|
-
logger.error("Timeout while connecting to Resemble API")
|
|
283
275
|
raise APITimeoutError() from e
|
|
284
276
|
except aiohttp.ClientError as e:
|
|
285
|
-
|
|
286
|
-
|
|
277
|
+
raise APIConnectionError(
|
|
278
|
+
message=f"Resemble API connection error: {str(e)}",
|
|
279
|
+
) from e
|
|
287
280
|
except Exception as e:
|
|
288
|
-
|
|
289
|
-
raise APIConnectionError(f"Error during synthesis: {e}") from e
|
|
281
|
+
raise APIConnectionError(f"Error during synthesis: {str(e)}") from e
|
|
290
282
|
finally:
|
|
291
283
|
await decoder.aclose()
|
|
292
284
|
|
|
@@ -294,6 +286,7 @@ class ChunkedStream(tts.ChunkedStream):
|
|
|
294
286
|
class SynthesizeStream(tts.SynthesizeStream):
|
|
295
287
|
"""Stream-based text-to-speech synthesis using Resemble AI WebSocket API.
|
|
296
288
|
|
|
289
|
+
|
|
297
290
|
This implementation connects to Resemble's WebSocket API for real-time streaming
|
|
298
291
|
synthesis. Note that this requires a Business plan subscription with Resemble AI.
|
|
299
292
|
"""
|
|
@@ -302,319 +295,158 @@ class SynthesizeStream(tts.SynthesizeStream):
|
|
|
302
295
|
self,
|
|
303
296
|
*,
|
|
304
297
|
tts: TTS,
|
|
305
|
-
opts:
|
|
306
|
-
|
|
307
|
-
api_key: str
|
|
308
|
-
pool: utils.ConnectionPool[websockets.WebSocketClientProtocol],
|
|
298
|
+
opts: _TTSOptions,
|
|
299
|
+
pool: utils.ConnectionPool[aiohttp.ClientWebSocketResponse],
|
|
300
|
+
api_key: str,
|
|
309
301
|
):
|
|
310
|
-
super().__init__(tts=tts
|
|
311
|
-
self._opts = opts
|
|
312
|
-
self._api_key = api_key
|
|
313
|
-
self._request_id = 0
|
|
314
|
-
self._running = False
|
|
315
|
-
self._websocket = None
|
|
316
|
-
self._pool = pool
|
|
317
|
-
|
|
318
|
-
# Channels for communication between components
|
|
319
|
-
self._text_ch = asyncio.Queue()
|
|
320
|
-
self._audio_ch = asyncio.Queue()
|
|
321
|
-
|
|
322
|
-
# Tasks for processing
|
|
323
|
-
self._websocket_task = None
|
|
324
|
-
self._processing_task = None
|
|
325
|
-
self._closed = False
|
|
326
|
-
|
|
327
|
-
# Create a task to monitor the base class's input channel
|
|
328
|
-
self._input_monitor_task = asyncio.create_task(self._monitor_input_channel())
|
|
302
|
+
super().__init__(tts=tts)
|
|
303
|
+
self._opts, self._pool, self._api_key = opts, pool, api_key
|
|
329
304
|
|
|
330
|
-
async def
|
|
331
|
-
|
|
305
|
+
async def _run(self) -> None:
|
|
306
|
+
request_id = utils.shortuuid()
|
|
307
|
+
self._segments_ch = utils.aio.Chan[tokenize.SentenceStream]()
|
|
308
|
+
|
|
309
|
+
@utils.log_exceptions(logger=logger)
|
|
310
|
+
async def _tokenize_input():
|
|
311
|
+
"""tokenize text from the input_ch to words"""
|
|
312
|
+
input_stream = None
|
|
313
|
+
async for input in self._input_ch:
|
|
314
|
+
if isinstance(input, str):
|
|
315
|
+
if input_stream is None:
|
|
316
|
+
# new segment (after flush for e.g)
|
|
317
|
+
input_stream = self._opts.tokenizer.stream()
|
|
318
|
+
self._segments_ch.send_nowait(input_stream)
|
|
319
|
+
input_stream.push_text(input)
|
|
320
|
+
elif isinstance(input, self._FlushSentinel):
|
|
321
|
+
if input_stream is not None:
|
|
322
|
+
input_stream.end_input()
|
|
323
|
+
input_stream = None
|
|
324
|
+
if input_stream is not None:
|
|
325
|
+
input_stream.end_input()
|
|
326
|
+
self._segments_ch.close()
|
|
327
|
+
|
|
328
|
+
@utils.log_exceptions(logger=logger)
|
|
329
|
+
async def _process_segments():
|
|
330
|
+
async for input_stream in self._segments_ch:
|
|
331
|
+
await self._run_ws(input_stream)
|
|
332
|
+
|
|
333
|
+
tasks = [
|
|
334
|
+
asyncio.create_task(_tokenize_input()),
|
|
335
|
+
asyncio.create_task(_process_segments()),
|
|
336
|
+
]
|
|
332
337
|
try:
|
|
333
|
-
|
|
334
|
-
|
|
335
|
-
|
|
336
|
-
|
|
337
|
-
|
|
338
|
-
|
|
339
|
-
|
|
340
|
-
|
|
341
|
-
|
|
342
|
-
|
|
343
|
-
word_count = 0
|
|
344
|
-
# Signal end of input
|
|
345
|
-
await self._text_ch.put(None)
|
|
346
|
-
continue
|
|
347
|
-
else:
|
|
348
|
-
# It's a text token, add to buffer
|
|
349
|
-
buffer += item
|
|
350
|
-
|
|
351
|
-
# Count words in the buffer
|
|
352
|
-
if item.strip() and (item.endswith(" ") or item.endswith("\n")):
|
|
353
|
-
word_count += 1
|
|
354
|
-
|
|
355
|
-
# Send buffer when we have enough words or hit sentence-ending punctuation
|
|
356
|
-
if word_count >= MIN_WORDS_TO_BUFFER or any(
|
|
357
|
-
buffer.rstrip().endswith(p) for p in [".", "!", "?", ":", ";"]
|
|
358
|
-
):
|
|
359
|
-
await self._text_ch.put(buffer)
|
|
360
|
-
buffer = ""
|
|
361
|
-
word_count = 0
|
|
362
|
-
|
|
363
|
-
# End of input - send any remaining text in buffer
|
|
364
|
-
if buffer:
|
|
365
|
-
await self._text_ch.put(buffer)
|
|
338
|
+
await asyncio.gather(*tasks)
|
|
339
|
+
except asyncio.TimeoutError as e:
|
|
340
|
+
raise APITimeoutError() from e
|
|
341
|
+
except aiohttp.ClientResponseError as e:
|
|
342
|
+
raise APIStatusError(
|
|
343
|
+
message=e.message,
|
|
344
|
+
status_code=e.status,
|
|
345
|
+
request_id=request_id,
|
|
346
|
+
body=None,
|
|
347
|
+
) from e
|
|
366
348
|
except Exception as e:
|
|
367
|
-
|
|
349
|
+
raise APIConnectionError() from e
|
|
368
350
|
finally:
|
|
369
|
-
|
|
370
|
-
# Signal end of input if our monitor is shutting down unexpectedly
|
|
371
|
-
await self._text_ch.put(None)
|
|
372
|
-
|
|
373
|
-
def _preprocess_text(self, text: str) -> str:
|
|
374
|
-
"""Preprocess text before sending to Resemble API.
|
|
375
|
-
|
|
376
|
-
This ensures punctuation is properly handled by combining it with adjacent words.
|
|
377
|
-
"""
|
|
378
|
-
# Skip if text is empty or None
|
|
379
|
-
if not text or not text.strip():
|
|
380
|
-
return text
|
|
381
|
-
|
|
382
|
-
# If text is just punctuation, add a space before it to avoid errors
|
|
383
|
-
if text.strip() in ",.!?;:":
|
|
384
|
-
return " " + text
|
|
385
|
-
|
|
386
|
-
return text
|
|
387
|
-
|
|
388
|
-
async def synthesize_text(self, text: str) -> None:
|
|
389
|
-
"""Queue text for synthesis."""
|
|
390
|
-
if self._closed:
|
|
391
|
-
raise RuntimeError("Stream is closed")
|
|
392
|
-
|
|
393
|
-
# Preprocess text before sending
|
|
394
|
-
processed_text = self._preprocess_text(text)
|
|
395
|
-
await self._text_ch.put(processed_text)
|
|
396
|
-
|
|
397
|
-
if not self._running:
|
|
398
|
-
# Start processing if not already running
|
|
399
|
-
self._running = True
|
|
400
|
-
self._processing_task = asyncio.create_task(self._run())
|
|
401
|
-
|
|
402
|
-
# Wait for the text to be processed
|
|
403
|
-
await self._text_ch.join()
|
|
404
|
-
|
|
405
|
-
# Signal end of input - this will close the channel
|
|
406
|
-
# Note: We don't call flush() here because it's already done in end_input()
|
|
407
|
-
self.end_input()
|
|
408
|
-
|
|
409
|
-
async def aclose(self) -> None:
|
|
410
|
-
"""Close the stream and clean up resources."""
|
|
411
|
-
self._closed = True
|
|
412
|
-
|
|
413
|
-
# Close the text channel to signal the end
|
|
414
|
-
if self._running:
|
|
415
|
-
await self._text_ch.put(None) # Signal end of input
|
|
416
|
-
|
|
417
|
-
# Cancel the input monitor task
|
|
418
|
-
if self._input_monitor_task and not self._input_monitor_task.done():
|
|
419
|
-
self._input_monitor_task.cancel()
|
|
420
|
-
try:
|
|
421
|
-
await self._input_monitor_task
|
|
422
|
-
except asyncio.CancelledError:
|
|
423
|
-
pass
|
|
424
|
-
|
|
425
|
-
# Cancel any running tasks
|
|
426
|
-
if self._processing_task and not self._processing_task.done():
|
|
427
|
-
self._processing_task.cancel()
|
|
428
|
-
try:
|
|
429
|
-
await self._processing_task
|
|
430
|
-
except asyncio.CancelledError:
|
|
431
|
-
pass
|
|
432
|
-
|
|
433
|
-
await super().aclose()
|
|
351
|
+
await utils.aio.gracefully_cancel(*tasks)
|
|
434
352
|
|
|
435
|
-
async def
|
|
436
|
-
|
|
437
|
-
|
|
438
|
-
|
|
439
|
-
|
|
440
|
-
sample_rate=self._opts.sample_rate,
|
|
441
|
-
num_channels=NUM_CHANNELS,
|
|
442
|
-
)
|
|
443
|
-
|
|
444
|
-
try:
|
|
445
|
-
request_id = utils.shortuuid()
|
|
353
|
+
async def _run_ws(
|
|
354
|
+
self,
|
|
355
|
+
input_stream: tokenize.SentenceStream,
|
|
356
|
+
) -> None:
|
|
357
|
+
async with self._pool.connection() as ws:
|
|
446
358
|
segment_id = utils.shortuuid()
|
|
447
|
-
|
|
448
|
-
|
|
449
|
-
|
|
450
|
-
event_ch=self._event_ch,
|
|
451
|
-
request_id=request_id,
|
|
452
|
-
segment_id=segment_id,
|
|
359
|
+
decoder = utils.codecs.AudioStreamDecoder(
|
|
360
|
+
sample_rate=self._opts.sample_rate,
|
|
361
|
+
num_channels=NUM_CHANNELS,
|
|
453
362
|
)
|
|
454
|
-
|
|
455
|
-
|
|
363
|
+
index_lock = asyncio.Lock()
|
|
364
|
+
current_index = 0
|
|
456
365
|
pending_requests = set()
|
|
457
366
|
|
|
458
|
-
|
|
459
|
-
|
|
460
|
-
|
|
461
|
-
|
|
462
|
-
|
|
463
|
-
|
|
464
|
-
|
|
465
|
-
|
|
466
|
-
|
|
467
|
-
|
|
468
|
-
|
|
469
|
-
|
|
470
|
-
|
|
471
|
-
|
|
472
|
-
|
|
473
|
-
|
|
474
|
-
|
|
475
|
-
|
|
476
|
-
|
|
477
|
-
|
|
478
|
-
|
|
479
|
-
|
|
480
|
-
|
|
481
|
-
|
|
482
|
-
|
|
483
|
-
|
|
484
|
-
|
|
485
|
-
|
|
486
|
-
|
|
487
|
-
)
|
|
488
|
-
|
|
489
|
-
emitter.push(frame)
|
|
490
|
-
|
|
491
|
-
emitter.flush()
|
|
492
|
-
|
|
493
|
-
except Exception as e:
|
|
494
|
-
logger.error(
|
|
495
|
-
f"Error processing audio data: {e}",
|
|
496
|
-
exc_info=True,
|
|
497
|
-
)
|
|
498
|
-
|
|
499
|
-
# Handle end of audio
|
|
500
|
-
elif data.get("type") == "audio_end":
|
|
501
|
-
# Complete current segment
|
|
502
|
-
emitter.flush()
|
|
503
|
-
|
|
504
|
-
# Mark request as completed if request_id is present
|
|
505
|
-
if "request_id" in data:
|
|
506
|
-
req_id = data["request_id"]
|
|
507
|
-
if req_id in pending_requests:
|
|
508
|
-
pending_requests.remove(req_id)
|
|
509
|
-
|
|
510
|
-
# Handle errors
|
|
511
|
-
elif data.get("type") == "error":
|
|
512
|
-
error_msg = data.get("message", "Unknown error")
|
|
513
|
-
logger.error(
|
|
514
|
-
f"Resemble WebSocket API error: {error_msg}"
|
|
515
|
-
)
|
|
516
|
-
|
|
517
|
-
# Don't raise an error for punctuation-only inputs
|
|
518
|
-
if (
|
|
519
|
-
"would not generate any audio" in error_msg
|
|
520
|
-
and data.get("request_id") in pending_requests
|
|
521
|
-
):
|
|
522
|
-
req_id = data.get("request_id")
|
|
523
|
-
pending_requests.remove(req_id)
|
|
524
|
-
else:
|
|
525
|
-
raise APIStatusError(
|
|
526
|
-
message=f"Resemble API error: {error_msg}",
|
|
527
|
-
status_code=data.get("status_code", 500),
|
|
528
|
-
request_id=str(request_id),
|
|
529
|
-
body=None,
|
|
530
|
-
)
|
|
531
|
-
except json.JSONDecodeError:
|
|
532
|
-
logger.error(
|
|
533
|
-
f"Failed to decode JSON response: {message}"
|
|
534
|
-
)
|
|
535
|
-
except websockets.exceptions.ConnectionClosed as e:
|
|
536
|
-
logger.error(f"WebSocket connection closed: {e}")
|
|
537
|
-
if not self._closed:
|
|
538
|
-
raise APIConnectionError(
|
|
539
|
-
f"WebSocket connection closed unexpectedly: {e}"
|
|
540
|
-
)
|
|
541
|
-
except Exception as e:
|
|
542
|
-
logger.error(f"Error in WebSocket receive task: {e}")
|
|
543
|
-
if not self._closed:
|
|
544
|
-
raise
|
|
545
|
-
|
|
546
|
-
# Start WebSocket receive task
|
|
547
|
-
ws_task = asyncio.create_task(_ws_recv_task())
|
|
548
|
-
|
|
549
|
-
# Process text input
|
|
550
|
-
try:
|
|
551
|
-
while not self._closed:
|
|
552
|
-
# Wait for text to synthesize
|
|
553
|
-
text = await self._text_ch.get()
|
|
554
|
-
|
|
555
|
-
# None signals end of input
|
|
556
|
-
if text is None:
|
|
557
|
-
break
|
|
558
|
-
|
|
559
|
-
if not text.strip():
|
|
560
|
-
self._text_ch.task_done()
|
|
561
|
-
continue
|
|
562
|
-
|
|
563
|
-
# Preprocess text before sending
|
|
564
|
-
text = self._preprocess_text(text)
|
|
565
|
-
|
|
566
|
-
self._mark_started()
|
|
567
|
-
|
|
568
|
-
payload = {
|
|
569
|
-
"voice_uuid": self._opts.voice_uuid,
|
|
570
|
-
"data": text,
|
|
571
|
-
"request_id": self._request_id,
|
|
572
|
-
"sample_rate": self._opts.sample_rate,
|
|
573
|
-
"precision": "PCM_16",
|
|
574
|
-
"no_audio_header": True,
|
|
575
|
-
}
|
|
576
|
-
|
|
577
|
-
# Add request to pending set
|
|
578
|
-
pending_requests.add(self._request_id)
|
|
579
|
-
|
|
580
|
-
# Send synthesis request
|
|
581
|
-
await websocket.send(json.dumps(payload))
|
|
582
|
-
self._request_id += 1
|
|
583
|
-
|
|
584
|
-
# Mark the text as processed
|
|
585
|
-
self._text_ch.task_done()
|
|
586
|
-
|
|
587
|
-
# Wait for all pending requests to complete
|
|
588
|
-
if pending_requests:
|
|
589
|
-
# Wait with a timeout to avoid hanging indefinitely
|
|
590
|
-
wait_start = time.time()
|
|
591
|
-
while pending_requests and (time.time() - wait_start) < 5.0:
|
|
592
|
-
await asyncio.sleep(0.1)
|
|
593
|
-
|
|
594
|
-
if pending_requests:
|
|
595
|
-
logger.warning(
|
|
596
|
-
f"Timed out waiting for {len(pending_requests)} audio responses"
|
|
597
|
-
)
|
|
598
|
-
|
|
599
|
-
finally:
|
|
600
|
-
# Cancel WebSocket task
|
|
601
|
-
if not ws_task.done():
|
|
602
|
-
ws_task.cancel()
|
|
603
|
-
try:
|
|
604
|
-
await ws_task
|
|
605
|
-
except asyncio.CancelledError:
|
|
606
|
-
pass
|
|
607
|
-
|
|
608
|
-
except asyncio.CancelledError:
|
|
609
|
-
raise
|
|
610
|
-
except websockets.exceptions.ConnectionClosed as e:
|
|
611
|
-
logger.error(f"WebSocket connection closed: {e}")
|
|
612
|
-
raise APIConnectionError(f"WebSocket connection closed: {e}") from e
|
|
613
|
-
except Exception as e:
|
|
614
|
-
logger.error(f"Error during streaming synthesis: {e}")
|
|
615
|
-
raise APIConnectionError(f"Error during streaming synthesis: {e}") from e
|
|
616
|
-
finally:
|
|
617
|
-
# Clean up resources
|
|
618
|
-
await decoder.aclose()
|
|
367
|
+
@utils.log_exceptions(logger=logger)
|
|
368
|
+
async def _send_task(ws: aiohttp.ClientWebSocketResponse):
|
|
369
|
+
nonlocal current_index
|
|
370
|
+
index = 0
|
|
371
|
+
async for data in input_stream:
|
|
372
|
+
payload = {
|
|
373
|
+
"voice_uuid": self._opts.voice_uuid,
|
|
374
|
+
"data": data.token,
|
|
375
|
+
"request_id": index,
|
|
376
|
+
"sample_rate": self._opts.sample_rate,
|
|
377
|
+
"precision": "PCM_16",
|
|
378
|
+
"output_format": "mp3",
|
|
379
|
+
}
|
|
380
|
+
async with index_lock:
|
|
381
|
+
pending_requests.add(index)
|
|
382
|
+
index += 1
|
|
383
|
+
current_index = index
|
|
384
|
+
await ws.send_str(json.dumps(payload))
|
|
385
|
+
|
|
386
|
+
@utils.log_exceptions(logger=logger)
|
|
387
|
+
async def _emit_task():
|
|
388
|
+
emitter = tts.SynthesizedAudioEmitter(
|
|
389
|
+
event_ch=self._event_ch,
|
|
390
|
+
request_id=str(current_index),
|
|
391
|
+
segment_id=segment_id,
|
|
392
|
+
)
|
|
393
|
+
async for frame in decoder:
|
|
394
|
+
emitter.push(frame)
|
|
395
|
+
emitter.flush()
|
|
619
396
|
|
|
620
|
-
|
|
397
|
+
@utils.log_exceptions(logger=logger)
|
|
398
|
+
async def _recv_task(ws: aiohttp.ClientWebSocketResponse):
|
|
399
|
+
while True:
|
|
400
|
+
msg = await ws.receive()
|
|
401
|
+
if msg.type in (
|
|
402
|
+
aiohttp.WSMsgType.CLOSED,
|
|
403
|
+
aiohttp.WSMsgType.CLOSE,
|
|
404
|
+
aiohttp.WSMsgType.CLOSING,
|
|
405
|
+
):
|
|
406
|
+
raise APIStatusError(
|
|
407
|
+
"Resemble connection closed unexpectedly",
|
|
408
|
+
request_id=str(current_index),
|
|
409
|
+
)
|
|
410
|
+
|
|
411
|
+
if msg.type != aiohttp.WSMsgType.TEXT:
|
|
412
|
+
logger.warning("Unexpected Resemble message type %s", msg.type)
|
|
413
|
+
continue
|
|
414
|
+
|
|
415
|
+
data = json.loads(msg.data)
|
|
416
|
+
|
|
417
|
+
if data.get("type") == "audio":
|
|
418
|
+
if data.get("audio_content", None):
|
|
419
|
+
b64data = base64.b64decode(data["audio_content"])
|
|
420
|
+
decoder.push(b64data)
|
|
421
|
+
|
|
422
|
+
elif data.get("type") == "audio_end":
|
|
423
|
+
async with index_lock:
|
|
424
|
+
index = data["request_id"]
|
|
425
|
+
pending_requests.remove(index)
|
|
426
|
+
if not pending_requests:
|
|
427
|
+
decoder.end_input()
|
|
428
|
+
break # we are not going to receive any more audio
|
|
429
|
+
else:
|
|
430
|
+
logger.error("Unexpected Resemble message %s", data)
|
|
431
|
+
|
|
432
|
+
tasks = [
|
|
433
|
+
asyncio.create_task(_send_task(ws)),
|
|
434
|
+
asyncio.create_task(_recv_task(ws)),
|
|
435
|
+
asyncio.create_task(_emit_task()),
|
|
436
|
+
]
|
|
437
|
+
|
|
438
|
+
try:
|
|
439
|
+
await asyncio.gather(*tasks)
|
|
440
|
+
except asyncio.TimeoutError as e:
|
|
441
|
+
raise APITimeoutError() from e
|
|
442
|
+
except aiohttp.ClientResponseError as e:
|
|
443
|
+
raise APIStatusError(
|
|
444
|
+
message=e.message,
|
|
445
|
+
status_code=e.status,
|
|
446
|
+
request_id=str(current_index),
|
|
447
|
+
body=None,
|
|
448
|
+
) from e
|
|
449
|
+
except Exception as e:
|
|
450
|
+
raise APIConnectionError() from e
|
|
451
|
+
finally:
|
|
452
|
+
await utils.aio.gracefully_cancel(*tasks)
|
{livekit_plugins_resemble-0.1.0.dist-info → livekit_plugins_resemble-0.1.0rc1.dist-info}/METADATA
RENAMED
|
@@ -1,34 +1,25 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: livekit-plugins-resemble
|
|
3
|
-
Version: 0.1.
|
|
3
|
+
Version: 0.1.0rc1
|
|
4
4
|
Summary: LiveKit Agents Plugin for Resemble AI
|
|
5
|
-
Home-page: https://github.com/livekit/agents
|
|
6
|
-
License: Apache-2.0
|
|
7
5
|
Project-URL: Documentation, https://docs.livekit.io
|
|
8
6
|
Project-URL: Website, https://livekit.io/
|
|
9
7
|
Project-URL: Source, https://github.com/livekit/agents
|
|
10
|
-
|
|
8
|
+
Author-email: LiveKit <hello@livekit.io>
|
|
9
|
+
License-Expression: Apache-2.0
|
|
10
|
+
Keywords: audio,livekit,realtime,video,webrtc
|
|
11
11
|
Classifier: Intended Audience :: Developers
|
|
12
|
-
Classifier:
|
|
13
|
-
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
12
|
+
Classifier: License :: OSI Approved :: Apache Software License
|
|
14
13
|
Classifier: Programming Language :: Python :: 3
|
|
15
|
-
Classifier: Programming Language :: Python :: 3.11
|
|
16
|
-
Classifier: Programming Language :: Python :: 3.12
|
|
17
14
|
Classifier: Programming Language :: Python :: 3 :: Only
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
17
|
+
Classifier: Topic :: Multimedia :: Sound/Audio
|
|
18
|
+
Classifier: Topic :: Multimedia :: Video
|
|
19
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
18
20
|
Requires-Python: >=3.9.0
|
|
21
|
+
Requires-Dist: livekit-agents>=1.0.0.rc7
|
|
19
22
|
Description-Content-Type: text/markdown
|
|
20
|
-
Requires-Dist: livekit-agents[codecs]>=0.12.3
|
|
21
|
-
Requires-Dist: websockets==12.0
|
|
22
|
-
Dynamic: classifier
|
|
23
|
-
Dynamic: description
|
|
24
|
-
Dynamic: description-content-type
|
|
25
|
-
Dynamic: home-page
|
|
26
|
-
Dynamic: keywords
|
|
27
|
-
Dynamic: license
|
|
28
|
-
Dynamic: project-url
|
|
29
|
-
Dynamic: requires-dist
|
|
30
|
-
Dynamic: requires-python
|
|
31
|
-
Dynamic: summary
|
|
32
23
|
|
|
33
24
|
# LiveKit Plugins Resemble
|
|
34
25
|
|
|
@@ -147,4 +138,4 @@ This plugin uses two different approaches to generate speech:
|
|
|
147
138
|
1. **One-off Synthesis** - Uses Resemble's REST API for simple text-to-speech conversion
|
|
148
139
|
2. **Streaming Synthesis** - Uses Resemble's WebSocket API for real-time streaming synthesis
|
|
149
140
|
|
|
150
|
-
The WebSocket streaming API is only available for Resemble AI Business plan users.
|
|
141
|
+
The WebSocket streaming API is only available for Resemble AI Business plan users.
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
livekit/plugins/resemble/__init__.py,sha256=9xuQxGYo_lKpEOLN_o7FbWVFiyeP-_v-PJmy_zOs5Y8,1108
|
|
2
|
+
livekit/plugins/resemble/log.py,sha256=Pgg3yqt4OUcjrnnF8SKfH7G-Dk7jFI0yIhDa5hjTW5k,71
|
|
3
|
+
livekit/plugins/resemble/models.py,sha256=nK29wOCKkS29KjbiDaTpb7mlmUQSad9U_0bTD8yRcwk,74
|
|
4
|
+
livekit/plugins/resemble/py.typed,sha256=ajz1GSNU9xYVrFEDSz6Xwg7amWQ_yvW75tQa1ZvRIWc,3
|
|
5
|
+
livekit/plugins/resemble/tts.py,sha256=N8T0NrYh_nW77DN9yzI4OiNmSxzY7h9fISD2xHYfI8A,16169
|
|
6
|
+
livekit/plugins/resemble/version.py,sha256=xALnp90Zq1RJWmgPi_DHhHh2uCgvunb6LEWBCu5gQ20,604
|
|
7
|
+
livekit_plugins_resemble-0.1.0rc1.dist-info/METADATA,sha256=Yw_yuKL1RPJ_8smSjz24cXblMDc3MamQSRfukffzlx0,4801
|
|
8
|
+
livekit_plugins_resemble-0.1.0rc1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
|
9
|
+
livekit_plugins_resemble-0.1.0rc1.dist-info/RECORD,,
|
|
@@ -1,10 +0,0 @@
|
|
|
1
|
-
livekit/plugins/resemble/__init__.py,sha256=9xuQxGYo_lKpEOLN_o7FbWVFiyeP-_v-PJmy_zOs5Y8,1108
|
|
2
|
-
livekit/plugins/resemble/log.py,sha256=Pgg3yqt4OUcjrnnF8SKfH7G-Dk7jFI0yIhDa5hjTW5k,71
|
|
3
|
-
livekit/plugins/resemble/models.py,sha256=C96f5YDrhc3UyG-P90QiGai6pIRr1W7mQgwXUlN9-ts,139
|
|
4
|
-
livekit/plugins/resemble/py.typed,sha256=ajz1GSNU9xYVrFEDSz6Xwg7amWQ_yvW75tQa1ZvRIWc,3
|
|
5
|
-
livekit/plugins/resemble/tts.py,sha256=x8ee1dhn983mXs9ia0U6ITa8CK6sqWF-LvlC2V41iZo,23016
|
|
6
|
-
livekit/plugins/resemble/version.py,sha256=vQH9cItKAVYAmrLbOntkbLqmxrUZrPiKb1TjkZ8jRKQ,600
|
|
7
|
-
livekit_plugins_resemble-0.1.0.dist-info/METADATA,sha256=XDyrThCqG_bDTIPIv2fIAodMp6hOp-4aKFi771W9uW0,4954
|
|
8
|
-
livekit_plugins_resemble-0.1.0.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
|
|
9
|
-
livekit_plugins_resemble-0.1.0.dist-info/top_level.txt,sha256=OoDok3xUmXbZRvOrfvvXB-Juu4DX79dlq188E19YHoo,8
|
|
10
|
-
livekit_plugins_resemble-0.1.0.dist-info/RECORD,,
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
livekit
|