livekit-plugins-resemble 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of livekit-plugins-resemble might be problematic. Click here for more details.

@@ -0,0 +1,37 @@
1
+ # Copyright 2023 LiveKit, Inc.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from .tts import TTS, ChunkedStream, SynthesizeStream
16
+ from .version import __version__
17
+
18
+ __all__ = ["TTS", "ChunkedStream", "SynthesizeStream", "__version__"]
19
+
20
+ from livekit.agents import Plugin
21
+
22
+
23
class ResemblePlugin(Plugin):
    """Plugin descriptor that announces the Resemble AI integration to LiveKit Agents."""

    def __init__(self) -> None:
        # The plugin is identified by this module's name, the package version
        # and the package it lives in.
        super().__init__(__name__, __version__, __package__)
26
+
27
+
28
# Register a single plugin instance as an import-time side effect.
Plugin.register_plugin(ResemblePlugin())

# Cleanup docs of unexported modules: every module-level name that is not
# listed in ``__all__`` is mapped to ``False`` in ``__pdoc__``.
_module = dir()
NOT_IN_ALL = [name for name in _module if name not in __all__]

__pdoc__ = {}

for n in NOT_IN_ALL:
    # Names from ``dir()`` are unique, so this is a plain first-time insert.
    __pdoc__.setdefault(n, False)
@@ -0,0 +1,3 @@
1
+ import logging
2
+
3
+ logger = logging.getLogger("livekit.plugins.resemble")
@@ -0,0 +1,10 @@
1
+ from enum import Enum
2
+
3
+
4
class OutputFormat(str, Enum):
    """Audio container formats, usable wherever a plain string is expected."""

    # Members are ``str`` instances, so ``OutputFormat.WAV == "wav"`` holds.
    WAV = "wav"
    MP3 = "mp3"
7
+
8
+
9
class Precision(str, Enum):
    """Sample precision identifiers, usable wherever a plain string is expected."""

    # Only 16-bit PCM is declared here.
    PCM_16 = "PCM_16"
@@ -0,0 +1,3 @@
1
+
2
+
3
+
@@ -0,0 +1,620 @@
1
+ # Copyright 2023 LiveKit, Inc.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from __future__ import annotations
16
+
17
+ import asyncio
18
+ import base64
19
+ import json
20
+ import os
21
+ import time
22
+ import weakref
23
+ from dataclasses import dataclass
24
+ from typing import Optional
25
+
26
+ import aiohttp
27
+ import websockets
28
+ from livekit import rtc
29
+ from livekit.agents import (
30
+ APIConnectionError,
31
+ APIConnectOptions,
32
+ APIStatusError,
33
+ APITimeoutError,
34
+ tts,
35
+ utils,
36
+ )
37
+
38
+ from .log import logger
39
+
40
+ RESEMBLE_WEBSOCKET_URL = "wss://websocket.cluster.resemble.ai/stream"
41
+ RESEMBLE_REST_API_URL = "https://f.cluster.resemble.ai/synthesize"
42
+ NUM_CHANNELS = 1
43
+ DEFAULT_VOICE_UUID = "55592656"
44
+
45
+
46
+ @dataclass
47
+ class _Options:
48
+ voice_uuid: str
49
+ sample_rate: int
50
+
51
+
52
class TTS(tts.TTS):
    """Resemble AI text-to-speech engine.

    :meth:`synthesize` returns a :class:`ChunkedStream` (one request over the
    REST endpoint) and :meth:`stream` returns a :class:`SynthesizeStream`
    that borrows WebSocket connections from an internal connection pool.
    The instance can be used as an async context manager; leaving the
    context calls :meth:`aclose`.
    """

    def __init__(
        self,
        *,
        api_key: str | None = None,
        voice_uuid: str | None = DEFAULT_VOICE_UUID,
        sample_rate: int = 44100,
        http_session: aiohttp.ClientSession | None = None,
    ) -> None:
        """Create a Resemble TTS instance.

        Args:
            api_key: Resemble API key. Falls back to the ``RESEMBLE_API_KEY``
                environment variable when omitted.
            voice_uuid: Voice to synthesize with. ``None`` selects
                ``DEFAULT_VOICE_UUID``.
            sample_rate: Sample rate requested from the API and reported to
                the base class.
            http_session: Optional ``aiohttp`` session for REST requests. When
                omitted, a session is obtained lazily on first use.

        Raises:
            ValueError: If no API key is given and ``RESEMBLE_API_KEY`` is
                unset or empty.
        """
        super().__init__(
            capabilities=tts.TTSCapabilities(
                streaming=True,
            ),
            sample_rate=sample_rate,
            num_channels=NUM_CHANNELS,
        )

        # Validate and set API key
        self._api_key = api_key or os.environ.get("RESEMBLE_API_KEY")
        if not self._api_key:
            raise ValueError(
                "Resemble API key is required, either as argument or set RESEMBLE_API_KEY environment variable"
            )

        # The signature allows ``voice_uuid=None``; never forward ``None`` to
        # the API, use the default voice instead.
        self._opts = _Options(
            voice_uuid=DEFAULT_VOICE_UUID if voice_uuid is None else voice_uuid,
            sample_rate=sample_rate,
        )

        self._session = http_session
        self._streams = weakref.WeakSet[SynthesizeStream]()

        # Create a connection pool for WebSockets
        self._pool = utils.ConnectionPool[websockets.WebSocketClientProtocol](
            connect_cb=self._connect_ws,
            close_cb=self._close_ws,
        )

    def _ensure_session(self) -> aiohttp.ClientSession:
        """Return the HTTP session, obtaining one from the agents HTTP context if needed."""
        if self._session is None:
            self._session = utils.http_context.http_session()
        return self._session

    async def _connect_ws(self) -> websockets.WebSocketClientProtocol:
        """Connect to the Resemble WebSocket API."""
        return await websockets.connect(
            RESEMBLE_WEBSOCKET_URL,
            extra_headers={"Authorization": f"Bearer {self._api_key}"},
            ping_interval=5,
            ping_timeout=10,
        )

    async def _close_ws(self, ws: websockets.WebSocketClientProtocol):
        """Close the WebSocket connection."""
        await ws.close()

    def update_options(
        self,
        *,
        voice_uuid: str | None = None,
        **kwargs,
    ) -> None:
        """Update TTS options.

        Only ``voice_uuid`` is applied, and only when it is truthy. Any other
        keyword arguments are accepted and ignored.
        """
        if voice_uuid:
            self._opts.voice_uuid = voice_uuid

    def synthesize(
        self,
        text: str,
        *,
        conn_options: Optional[APIConnectOptions] = None,
    ) -> "ChunkedStream":
        """Synthesize text into speech using Resemble AI."""
        return ChunkedStream(
            tts=self,
            input_text=text,
            opts=self._opts,
            conn_options=conn_options,
            api_key=self._api_key,
            # Never hand ``None`` to the stream: it calls ``session.post``.
            session=self._ensure_session(),
        )

    def stream(
        self, *, conn_options: Optional[APIConnectOptions] = None
    ) -> "SynthesizeStream":
        """Create a streaming synthesis connection to Resemble AI."""
        stream = SynthesizeStream(
            tts=self,
            opts=self._opts,
            conn_options=conn_options,
            api_key=self._api_key,
            pool=self._pool,
        )
        # Tracked weakly so that aclose() can close streams still alive.
        self._streams.add(stream)
        return stream

    async def __aenter__(self) -> "TTS":
        """Enter async context manager."""
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb) -> None:
        """Exit async context manager and clean up resources."""
        await self.aclose()

    async def aclose(self) -> None:
        """Close tracked streams, then the WebSocket pool, then the base class."""
        # Iterate over a snapshot: the weak set may change while awaiting.
        for stream in list(self._streams):
            await stream.aclose()
        self._streams.clear()

        # Close the WebSocket connection pool
        await self._pool.aclose()

        await super().aclose()
163
+
164
+
165
class ChunkedStream(tts.ChunkedStream):
    """Synthesize text into speech in one go using Resemble AI's REST API."""

    def __init__(
        self,
        *,
        tts: TTS,
        input_text: str,
        opts: _Options,
        conn_options: Optional[APIConnectOptions] = None,
        api_key: str | None = None,
        session: aiohttp.ClientSession,
    ) -> None:
        """Store the request parameters; the HTTP call itself happens in ``_run``.

        Args:
            tts: The owning TTS instance.
            input_text: Text to synthesize.
            opts: Voice and sample-rate settings.
            conn_options: Connection options forwarded to the base class.
            api_key: Resemble API key sent as a bearer token.
            session: ``aiohttp`` session used for the POST request.
        """
        super().__init__(tts=tts, input_text=input_text, conn_options=conn_options)
        self._opts = opts
        self._api_key = api_key
        self._session = session
        self._segment_id = utils.shortuuid()

    async def _run(self) -> None:
        """POST the text to the REST endpoint and emit the decoded audio frames.

        Raises:
            APIStatusError: On a non-2xx response, a response whose ``success``
                flag is false, or a response without ``audio_content``.
            APITimeoutError: When the request times out.
            APIConnectionError: On any other client or unexpected error.
        """
        request_id = utils.shortuuid()

        # Create request headers
        headers = {
            "Authorization": f"Bearer {self._api_key}",
            "Content-Type": "application/json",
            "Accept": "application/json",  # Expect JSON response
        }

        # Create request payload
        payload = {
            "voice_uuid": self._opts.voice_uuid,
            "data": self._input_text,
            "sample_rate": self._opts.sample_rate,
        }

        # Create decoder for audio processing
        decoder = utils.codecs.AudioStreamDecoder(
            sample_rate=self._opts.sample_rate,
            num_channels=NUM_CHANNELS,
        )

        try:
            # Make the HTTP request with explicit timeout
            async with self._session.post(
                RESEMBLE_REST_API_URL,
                headers=headers,
                json=payload,
                timeout=aiohttp.ClientTimeout(
                    total=30,  # 30 seconds total timeout
                    sock_connect=self._conn_options.timeout,
                ),
            ) as response:
                if not response.ok:
                    error_text = await response.text()
                    raise APIStatusError(
                        message=f"Resemble API error: {error_text}",
                        status_code=response.status,
                        request_id=request_id,
                        body=error_text,
                    )

                # Parse the JSON response
                response_json = await response.json()

                # Check for success
                if not response_json.get("success", False):
                    # Normalise ``issues`` so a null, a bare string or
                    # non-string entries cannot break the join below.
                    issues = response_json.get("issues") or ["Unknown error"]
                    if isinstance(issues, str):
                        issues = [issues]
                    error_msg = "; ".join(str(issue) for issue in issues)
                    raise APIStatusError(
                        message=f"Resemble API returned failure: {error_msg}",
                        status_code=response.status,
                        request_id=request_id,
                        body=json.dumps(response_json),
                    )

                # Extract base64-encoded audio content
                audio_content_b64 = response_json.get("audio_content")
                if not audio_content_b64:
                    raise APIStatusError(
                        message="No audio content in response",
                        status_code=response.status,
                        request_id=request_id,
                        body=json.dumps(response_json),
                    )

                # Decode base64 to get raw audio bytes
                audio_bytes = base64.b64decode(audio_content_b64)

                # Create audio emitter
                emitter = tts.SynthesizedAudioEmitter(
                    event_ch=self._event_ch,
                    request_id=request_id,
                    segment_id=self._segment_id,
                )

                # The whole payload is available at once: push it and close
                # the decoder's input so iteration below terminates.
                decoder.push(audio_bytes)
                decoder.end_input()

                # Emit audio frames
                async for frame in decoder:
                    emitter.push(frame)

                # Final flush of the emitter
                emitter.flush()

        except APIStatusError:
            # Raised deliberately above; keep status code and body intact
            # instead of letting the generic handler re-wrap it.
            raise
        except aiohttp.ClientResponseError as e:
            # Handle HTTP errors (4xx, 5xx)
            raise APIStatusError(
                message=f"Resemble API error: {e.message}",
                status_code=e.status,
                request_id=request_id,
                body=None,
            ) from e
        except asyncio.TimeoutError as e:
            logger.error("Timeout while connecting to Resemble API")
            raise APITimeoutError() from e
        except aiohttp.ClientError as e:
            logger.error(f"Connection error to Resemble API: {e}")
            raise APIConnectionError(f"Connection error: {e}") from e
        except Exception as e:
            logger.error(f"Unexpected error during synthesis: {e}")
            raise APIConnectionError(f"Error during synthesis: {e}") from e
        finally:
            await decoder.aclose()
292
+
293
+
294
class SynthesizeStream(tts.SynthesizeStream):
    """Stream-based text-to-speech synthesis using Resemble AI WebSocket API.

    This implementation connects to Resemble's WebSocket API for real-time streaming
    synthesis. Note that this requires a Business plan subscription with Resemble AI.

    Text pushed into the base-class input channel is re-chunked by
    ``_monitor_input_channel`` into ``_text_ch``; ``_run`` sends each chunk as
    one request over a pooled WebSocket while a companion task turns the
    responses into audio frames.
    """

    # Characters that make up a "punctuation-only" chunk.
    _PUNCTUATION = ",.!?;:"
    # How long _run waits for outstanding responses after the last request.
    _PENDING_TIMEOUT = 5.0

    def __init__(
        self,
        *,
        tts: TTS,
        opts: _Options,
        conn_options: Optional[APIConnectOptions] = None,
        api_key: str | None = None,
        pool: utils.ConnectionPool[websockets.WebSocketClientProtocol],
    ):
        """Set up the queues and start the input-monitor task.

        Must be called with a running event loop because a task is created.

        Args:
            tts: The owning TTS instance.
            opts: Voice and sample-rate settings.
            conn_options: Connection options forwarded to the base class.
            api_key: Resemble API key.
            pool: Connection pool that supplies the WebSocket used by ``_run``.
        """
        super().__init__(tts=tts, conn_options=conn_options)
        self._opts = opts
        self._api_key = api_key
        self._request_id = 0  # numeric id attached to each WebSocket request
        self._running = False
        self._websocket = None  # not referenced elsewhere in this class
        self._pool = pool

        # Channels for communication between components
        self._text_ch = asyncio.Queue()
        self._audio_ch = asyncio.Queue()  # not referenced elsewhere in this class

        # Tasks for processing
        self._websocket_task = None  # not referenced elsewhere in this class
        self._processing_task = None
        self._closed = False

        # Create a task to monitor the base class's input channel
        self._input_monitor_task = asyncio.create_task(self._monitor_input_channel())

    async def _monitor_input_channel(self) -> None:
        """Forward base-class input to ``_text_ch`` in sentence-sized chunks.

        Tokens are buffered until at least five whitespace-terminated tokens
        were seen or the buffer ends with sentence punctuation. A flush
        sentinel sends the buffer followed by ``None`` (end of input).
        """
        min_words_to_buffer = 5  # Buffer at least this many words before sending
        sentence_endings = (".", "!", "?", ":", ";")

        try:
            buffer = ""
            word_count = 0

            async for item in self._input_ch:
                if isinstance(item, self._FlushSentinel):
                    # When we get a flush sentinel, send any buffered text
                    if buffer:
                        await self._text_ch.put(buffer)
                        buffer = ""
                        word_count = 0
                    # Signal end of input
                    await self._text_ch.put(None)
                    continue

                # It's a text token, add to buffer
                buffer += item

                # A non-blank token ending in whitespace counts as one word
                if item.strip() and item.endswith((" ", "\n")):
                    word_count += 1

                # Send buffer when we have enough words or hit sentence-ending punctuation
                if word_count >= min_words_to_buffer or buffer.rstrip().endswith(
                    sentence_endings
                ):
                    await self._text_ch.put(buffer)
                    buffer = ""
                    word_count = 0

            # End of input - send any remaining text in buffer
            if buffer:
                await self._text_ch.put(buffer)
        except Exception as e:
            logger.error(f"Error in input channel monitor: {e}")
        finally:
            if not self._closed:
                # Signal end of input if our monitor is shutting down unexpectedly
                await self._text_ch.put(None)

    def _preprocess_text(self, text: str) -> str:
        """Preprocess text before sending to Resemble API.

        Empty or whitespace-only text is returned unchanged. Text that
        consists solely of punctuation characters (for example ``"."``,
        ``"..."`` or ``"?!"``) gets a leading space to avoid API errors.
        Everything else is returned unchanged.
        """
        # Skip if text is empty or None
        if not text or not text.strip():
            return text

        # Check every character: a substring test against the punctuation
        # list would miss inputs such as "..." or "?!".
        if all(ch in self._PUNCTUATION for ch in text.strip()):
            return " " + text

        return text

    async def synthesize_text(self, text: str) -> None:
        """Queue text for synthesis, wait until it was sent, then end the input.

        Raises:
            RuntimeError: If the stream has already been closed.
        """
        if self._closed:
            raise RuntimeError("Stream is closed")

        # Preprocess text before sending
        processed_text = self._preprocess_text(text)
        await self._text_ch.put(processed_text)

        if not self._running:
            # Start processing if not already running
            self._running = True
            self._processing_task = asyncio.create_task(self._run())

        # Wait for the text to be processed
        await self._text_ch.join()

        # Signal end of input - this will close the channel
        # Note: We don't call flush() here because it's already done in end_input()
        self.end_input()

    async def aclose(self) -> None:
        """Close the stream and clean up resources."""
        self._closed = True

        # Close the text channel to signal the end
        if self._running:
            await self._text_ch.put(None)  # Signal end of input

        # Cancel the input monitor task
        if self._input_monitor_task and not self._input_monitor_task.done():
            self._input_monitor_task.cancel()
            try:
                await self._input_monitor_task
            except asyncio.CancelledError:
                pass

        # Cancel any running tasks
        if self._processing_task and not self._processing_task.done():
            self._processing_task.cancel()
            try:
                await self._processing_task
            except asyncio.CancelledError:
                pass

        await super().aclose()

    def _handle_ws_message(self, data, emitter, pending_requests, request_id) -> None:
        """Apply one decoded WebSocket message to the emitter and pending set."""
        msg_type = data.get("type")

        if msg_type == "audio":
            # Decode base64 audio content
            audio_data = base64.b64decode(data["audio_content"])

            try:
                # For PCM_16, each sample is 2 bytes (16 bits)
                bytes_per_sample = 2
                frame = rtc.AudioFrame(
                    data=audio_data,
                    samples_per_channel=len(audio_data) // bytes_per_sample,
                    sample_rate=self._opts.sample_rate,
                    num_channels=NUM_CHANNELS,
                )
                emitter.push(frame)
                emitter.flush()
            except Exception as e:
                # A bad frame is logged and skipped; the stream continues.
                logger.error(
                    f"Error processing audio data: {e}",
                    exc_info=True,
                )

        elif msg_type == "audio_end":
            # Complete current segment
            emitter.flush()

            # Mark request as completed if request_id is present
            if "request_id" in data:
                pending_requests.discard(data["request_id"])

        elif msg_type == "error":
            error_msg = data.get("message", "Unknown error")
            logger.error(f"Resemble WebSocket API error: {error_msg}")

            # Don't raise an error for punctuation-only inputs
            req_id = data.get("request_id")
            if "would not generate any audio" in error_msg and req_id in pending_requests:
                pending_requests.remove(req_id)
            else:
                raise APIStatusError(
                    message=f"Resemble API error: {error_msg}",
                    status_code=data.get("status_code", 500),
                    request_id=str(request_id),
                    body=None,
                )

    async def _recv_loop(self, websocket, emitter, pending_requests, request_id) -> None:
        """Receive WebSocket messages until the stream is closed or an error occurs."""
        try:
            while not self._closed:
                message = await websocket.recv()

                try:
                    data = json.loads(message)
                except json.JSONDecodeError:
                    logger.error(f"Failed to decode JSON response: {message}")
                    continue

                self._handle_ws_message(data, emitter, pending_requests, request_id)
        except websockets.exceptions.ConnectionClosed as e:
            logger.error(f"WebSocket connection closed: {e}")
            if not self._closed:
                raise APIConnectionError(
                    f"WebSocket connection closed unexpectedly: {e}"
                ) from e
        except Exception as e:
            logger.error(f"Error in WebSocket receive task: {e}")
            if not self._closed:
                raise

    async def _send_loop(self, websocket, pending_requests) -> None:
        """Send queued text chunks as synthesis requests until ``None`` arrives."""
        while not self._closed:
            # Wait for text to synthesize
            text = await self._text_ch.get()
            try:
                # None signals end of input
                if text is None:
                    break

                if not text.strip():
                    continue

                # Preprocess text before sending
                text = self._preprocess_text(text)

                self._mark_started()

                payload = {
                    "voice_uuid": self._opts.voice_uuid,
                    "data": text,
                    "request_id": self._request_id,
                    "sample_rate": self._opts.sample_rate,
                    "precision": "PCM_16",
                    "no_audio_header": True,
                }

                # Add request to pending set
                pending_requests.add(self._request_id)

                # Send synthesis request
                await websocket.send(json.dumps(payload))
                self._request_id += 1
            finally:
                # Every dequeued item, including the None sentinel and items
                # whose send failed, must be acknowledged; otherwise
                # synthesize_text() would wait on join() forever.
                self._text_ch.task_done()

    async def _wait_for_pending(self, pending_requests, ws_task) -> None:
        """Wait (bounded) until all sent requests were answered or receiving stopped."""
        if not pending_requests:
            return

        # Monotonic clock: immune to wall-clock adjustments.
        deadline = time.monotonic() + self._PENDING_TIMEOUT
        while pending_requests and not ws_task.done() and time.monotonic() < deadline:
            await asyncio.sleep(0.1)

        if pending_requests and not ws_task.done():
            logger.warning(
                f"Timed out waiting for {len(pending_requests)} audio responses"
            )

    async def _run(self) -> None:
        """Main processing loop for the streaming synthesis.

        Raises:
            APIStatusError: When the API reports an error over the WebSocket.
            APIConnectionError: When the connection closes or any other
                unexpected error occurs.
        """
        try:
            request_id = utils.shortuuid()
            segment_id = utils.shortuuid()

            # Create audio emitter
            emitter = tts.SynthesizedAudioEmitter(
                event_ch=self._event_ch,
                request_id=request_id,
                segment_id=segment_id,
            )

            # Track pending requests to ensure all responses are received
            pending_requests = set()

            async with self._pool.connection() as websocket:
                # Responses are handled by a separate task while this
                # coroutine keeps sending requests.
                ws_task = asyncio.create_task(
                    self._recv_loop(websocket, emitter, pending_requests, request_id)
                )

                try:
                    await self._send_loop(websocket, pending_requests)
                    await self._wait_for_pending(pending_requests, ws_task)
                finally:
                    # Cancel WebSocket task
                    if not ws_task.done():
                        ws_task.cancel()
                        try:
                            await ws_task
                        except asyncio.CancelledError:
                            pass

                # Surface a failure of the receive task instead of leaving
                # its exception unretrieved.
                if not ws_task.cancelled():
                    recv_error = ws_task.exception()
                    if recv_error is not None:
                        raise recv_error

        except asyncio.CancelledError:
            raise
        except (APIStatusError, APIConnectionError):
            # Already the right error type (raised by the receive task).
            raise
        except websockets.exceptions.ConnectionClosed as e:
            logger.error(f"WebSocket connection closed: {e}")
            raise APIConnectionError(f"WebSocket connection closed: {e}") from e
        except Exception as e:
            logger.error(f"Error during streaming synthesis: {e}")
            raise APIConnectionError(f"Error during streaming synthesis: {e}") from e
        finally:
            self._running = False
@@ -0,0 +1,15 @@
1
+ # Copyright 2023 LiveKit, Inc.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ __version__ = "0.1.0"
@@ -0,0 +1,150 @@
1
+ Metadata-Version: 2.4
2
+ Name: livekit-plugins-resemble
3
+ Version: 0.1.0
4
+ Summary: LiveKit Agents Plugin for Resemble AI
5
+ Home-page: https://github.com/livekit/agents
6
+ License: Apache-2.0
7
+ Project-URL: Documentation, https://docs.livekit.io
8
+ Project-URL: Website, https://livekit.io/
9
+ Project-URL: Source, https://github.com/livekit/agents
10
+ Keywords: webrtc,realtime,audio,video,livekit,resemble,tts
11
+ Classifier: Intended Audience :: Developers
12
+ Classifier: Topic :: Multimedia :: Sound/Audio
13
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
14
+ Classifier: Programming Language :: Python :: 3
15
+ Classifier: Programming Language :: Python :: 3.11
16
+ Classifier: Programming Language :: Python :: 3.12
17
+ Classifier: Programming Language :: Python :: 3 :: Only
18
+ Requires-Python: >=3.9.0
19
+ Description-Content-Type: text/markdown
20
+ Requires-Dist: livekit-agents[codecs]>=0.12.3
21
+ Requires-Dist: websockets==12.0
22
+ Dynamic: classifier
23
+ Dynamic: description
24
+ Dynamic: description-content-type
25
+ Dynamic: home-page
26
+ Dynamic: keywords
27
+ Dynamic: license
28
+ Dynamic: project-url
29
+ Dynamic: requires-dist
30
+ Dynamic: requires-python
31
+ Dynamic: summary
32
+
33
+ # LiveKit Plugins Resemble
34
+
35
+ Agent Framework plugin for voice synthesis with the [Resemble AI](https://www.resemble.ai/) API, using both their REST API and WebSocket streaming interface.
36
+
37
+ ## Installation
38
+
39
+ ```bash
40
+ pip install livekit-plugins-resemble
41
+ ```
42
+
43
+ ## Pre-requisites
44
+
45
+ You'll need an API key from Resemble AI. It can be set as an environment variable: `RESEMBLE_API_KEY`
46
+
47
+ Additionally, you'll need the voice UUID from your Resemble AI account.
48
+
49
+ ## Examples
50
+
51
+ ### Recommended
52
+
53
+ ```python
54
+ import asyncio
55
+ from livekit.plugins.resemble import TTS
56
+
57
+ async def run_tts_example():
58
+ # Use TTS with async context manager for automatic resource cleanup
59
+ async with TTS(
60
+ api_key="your_api_key", # or set RESEMBLE_API_KEY environment variable
61
+ voice_uuid="your_voice_uuid",
62
+ # Optional parameters
63
+ sample_rate=44100, # Sample rate in Hz (default: 44100)
64
+ # Note: precision= and output_format= are not accepted by TTS() in this release;
65
+ # the constructor takes only api_key, voice_uuid, sample_rate and http_session.
66
+ ) as tts:
67
+ # One-off synthesis (uses REST API)
68
+ audio_stream = tts.synthesize("Hello, world!")
69
+
70
+ # Process chunks as they arrive
71
+ async for chunk in audio_stream:
72
+ # Audio data is in the 'frame.data' attribute of SynthesizedAudio objects
73
+ audio_data = chunk.frame.data
74
+ print(f"Received chunk: {len(audio_data)} bytes")
75
+
76
+ # Alternative: collect all audio at once into a single AudioFrame
77
+ audio_stream = tts.synthesize("Another example sentence.")
78
+ audio_frame = await audio_stream.collect()
79
+ print(f"Collected complete audio: {len(audio_frame.data)} bytes")
80
+
81
+ # Real-time streaming synthesis (uses WebSocket API)
82
+ # Only available for Business plan users in Resemble AI
83
+ stream = tts.stream()
84
+ await stream.synthesize_text("Hello, world!")
85
+
86
+
87
+
88
+ # Run the example
89
+ asyncio.run(run_tts_example())
90
+ ```
91
+
92
+ ### Alternative: Manual Resource Management
93
+
94
+ If you prefer to manage resources manually, make sure to properly clean up:
95
+
96
+ ```python
97
+ import asyncio
98
+ from livekit.plugins.resemble import TTS
99
+
100
+ async def run_tts_example():
101
+ # Initialize TTS with your credentials
102
+ tts = TTS(
103
+ api_key="your_api_key",
104
+ voice_uuid="your_voice_uuid",
105
+ )
106
+
107
+ try:
108
+ # TTS operations
109
+ audio_stream = tts.synthesize("Hello, world!")
110
+ async for chunk in audio_stream:
111
+ # Access audio data correctly
112
+ process_audio(chunk.frame.data)
113
+ finally:
114
+ # Always clean up resources when done
115
+ await tts.aclose()
116
+
117
+ # Run the example
118
+ asyncio.run(run_tts_example())
119
+ ```
120
+
121
+ ### Resource Management
122
+
123
+ When using this plugin outside of the LiveKit agent framework, it's important to properly manage the TTS instance lifecycle:
124
+
125
+ 1. **Preferred method**: Use the async context manager pattern (`async with TTS(...) as tts:`)
126
+ 2. If managing manually, always call `await tts.aclose()` in a finally block
127
+ 3. If you prefer to provide your own HTTP session, you can pass it using the `http_session` parameter:
128
+
129
+ ```python
130
+ import aiohttp
131
+
132
+ async def with_custom_session():
133
+ async with aiohttp.ClientSession() as session:
134
+ async with TTS(
135
+ api_key="your_api_key",
136
+ voice_uuid="your_voice_uuid",
137
+ http_session=session
138
+ ) as tts:
139
+ audio_frame = await tts.synthesize("Hello, world!").collect()  # Use TTS...
140
+ # No need to manually close anything - context managers handle it all
141
+ ```
142
+
143
+ ## Implementation Details
144
+
145
+ This plugin uses two different approaches to generate speech:
146
+
147
+ 1. **One-off Synthesis** - Uses Resemble's REST API for simple text-to-speech conversion
148
+ 2. **Streaming Synthesis** - Uses Resemble's WebSocket API for real-time streaming synthesis
149
+
150
+ The WebSocket streaming API is only available for Resemble AI Business plan users.
@@ -0,0 +1,10 @@
1
+ livekit/plugins/resemble/__init__.py,sha256=9xuQxGYo_lKpEOLN_o7FbWVFiyeP-_v-PJmy_zOs5Y8,1108
2
+ livekit/plugins/resemble/log.py,sha256=Pgg3yqt4OUcjrnnF8SKfH7G-Dk7jFI0yIhDa5hjTW5k,71
3
+ livekit/plugins/resemble/models.py,sha256=C96f5YDrhc3UyG-P90QiGai6pIRr1W7mQgwXUlN9-ts,139
4
+ livekit/plugins/resemble/py.typed,sha256=ajz1GSNU9xYVrFEDSz6Xwg7amWQ_yvW75tQa1ZvRIWc,3
5
+ livekit/plugins/resemble/tts.py,sha256=x8ee1dhn983mXs9ia0U6ITa8CK6sqWF-LvlC2V41iZo,23016
6
+ livekit/plugins/resemble/version.py,sha256=vQH9cItKAVYAmrLbOntkbLqmxrUZrPiKb1TjkZ8jRKQ,600
7
+ livekit_plugins_resemble-0.1.0.dist-info/METADATA,sha256=XDyrThCqG_bDTIPIv2fIAodMp6hOp-4aKFi771W9uW0,4954
8
+ livekit_plugins_resemble-0.1.0.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
9
+ livekit_plugins_resemble-0.1.0.dist-info/top_level.txt,sha256=OoDok3xUmXbZRvOrfvvXB-Juu4DX79dlq188E19YHoo,8
10
+ livekit_plugins_resemble-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (78.1.0)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+
@@ -0,0 +1 @@
1
+ livekit