livekit-plugins-resemble 0.1.0__py3-none-any.whl → 0.1.0rc1__py3-none-any.whl

This diff shows the differences between the contents of two publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the package versions as they appear in their respective public registries.

Potentially problematic release.


This version of livekit-plugins-resemble might be problematic. More details are available on the original page for this release (the link is not preserved in this text).

@@ -1,10 +1,5 @@
1
1
  from enum import Enum
2
2
 
3
3
 
4
- class OutputFormat(str, Enum):
5
- WAV = "wav"
6
- MP3 = "mp3"
7
-
8
-
9
4
  class Precision(str, Enum):
10
5
  PCM_16 = "PCM_16"
@@ -1,4 +1,4 @@
1
- # Copyright 2023 LiveKit, Inc.
1
+ # Copyright 2025 LiveKit, Inc.
2
2
  #
3
3
  # Licensed under the Apache License, Version 2.0 (the "License");
4
4
  # you may not use this file except in compliance with the License.
@@ -18,22 +18,21 @@ import asyncio
18
18
  import base64
19
19
  import json
20
20
  import os
21
- import time
22
21
  import weakref
23
22
  from dataclasses import dataclass
24
- from typing import Optional
25
23
 
26
24
  import aiohttp
27
- import websockets
28
- from livekit import rtc
25
+
29
26
  from livekit.agents import (
30
27
  APIConnectionError,
31
28
  APIConnectOptions,
32
29
  APIStatusError,
33
30
  APITimeoutError,
31
+ tokenize,
34
32
  tts,
35
33
  utils,
36
34
  )
35
+ from livekit.agents.types import DEFAULT_API_CONNECT_OPTIONS
37
36
 
38
37
  from .log import logger
39
38
 
@@ -41,12 +40,14 @@ RESEMBLE_WEBSOCKET_URL = "wss://websocket.cluster.resemble.ai/stream"
41
40
  RESEMBLE_REST_API_URL = "https://f.cluster.resemble.ai/synthesize"
42
41
  NUM_CHANNELS = 1
43
42
  DEFAULT_VOICE_UUID = "55592656"
43
+ BUFFERED_WORDS_COUNT = 3
44
44
 
45
45
 
46
46
  @dataclass
47
- class _Options:
47
+ class _TTSOptions:
48
48
  voice_uuid: str
49
49
  sample_rate: int
50
+ tokenizer: tokenize.SentenceTokenizer
50
51
 
51
52
 
52
53
  class TTS(tts.TTS):
@@ -54,111 +55,127 @@ class TTS(tts.TTS):
54
55
  self,
55
56
  *,
56
57
  api_key: str | None = None,
57
- voice_uuid: str | None = DEFAULT_VOICE_UUID,
58
+ voice_uuid: str | None = None,
59
+ tokenizer: tokenize.SentenceTokenizer | None = None,
58
60
  sample_rate: int = 44100,
59
61
  http_session: aiohttp.ClientSession | None = None,
62
+ use_streaming: bool = True,
60
63
  ) -> None:
64
+ """
65
+ Create a new instance of the Resemble TTS.
66
+
67
+ See https://docs.app.resemble.ai/docs/text_to_speech/ for more documentation on all of these options.
68
+
69
+ Args:
70
+ voice_uuid (str, optional): The voice UUID for the desired voice. Defaults to None.
71
+ sample_rate (int, optional): The audio sample rate in Hz. Defaults to 44100.
72
+ api_key (str | None, optional): The Resemble API key. If not provided, it will be read from the RESEMBLE_API_KEY environment variable.
73
+ http_session (aiohttp.ClientSession | None, optional): An existing aiohttp ClientSession to use. If not provided, a new session will be created.
74
+ tokenizer (tokenize.SentenceTokenizer, optional): The tokenizer to use. Defaults to tokenize.SentenceTokenizer().
75
+ use_streaming (bool, optional): Whether to use streaming or not. Defaults to True.
76
+ """ # noqa: E501
61
77
  super().__init__(
62
- capabilities=tts.TTSCapabilities(
63
- streaming=True,
64
- ),
78
+ capabilities=tts.TTSCapabilities(streaming=use_streaming),
65
79
  sample_rate=sample_rate,
66
80
  num_channels=NUM_CHANNELS,
67
81
  )
68
82
 
69
- # Validate and set API key
70
- self._api_key = api_key or os.environ.get("RESEMBLE_API_KEY")
71
- if not self._api_key:
83
+ api_key = api_key or os.environ.get("RESEMBLE_API_KEY")
84
+ if not api_key:
72
85
  raise ValueError(
73
- "Resemble API key is required, either as argument or set RESEMBLE_API_KEY environment variable"
86
+ "Resemble API key is required, either as argument or set RESEMBLE_API_KEY"
87
+ " environment variable"
74
88
  )
89
+ self._api_key = api_key
90
+
91
+ if tokenizer is None:
92
+ tokenizer = tokenize.basic.SentenceTokenizer(min_sentence_len=BUFFERED_WORDS_COUNT)
93
+
94
+ if voice_uuid is None:
95
+ voice_uuid = DEFAULT_VOICE_UUID
75
96
 
76
- # Set options
77
- self._opts = _Options(
97
+ self._opts = _TTSOptions(
78
98
  voice_uuid=voice_uuid,
79
99
  sample_rate=sample_rate,
100
+ tokenizer=tokenizer,
80
101
  )
81
102
 
82
103
  self._session = http_session
83
104
  self._streams = weakref.WeakSet[SynthesizeStream]()
84
-
85
- # Create a connection pool for WebSockets
86
- self._pool = utils.ConnectionPool[websockets.WebSocketClientProtocol](
105
+ self._pool = utils.ConnectionPool[aiohttp.ClientWebSocketResponse](
87
106
  connect_cb=self._connect_ws,
88
107
  close_cb=self._close_ws,
89
108
  )
90
109
 
91
- async def _connect_ws(self) -> websockets.WebSocketClientProtocol:
92
- """Connect to the Resemble WebSocket API."""
93
- return await websockets.connect(
94
- RESEMBLE_WEBSOCKET_URL,
95
- extra_headers={"Authorization": f"Bearer {self._api_key}"},
96
- ping_interval=5,
97
- ping_timeout=10,
110
+ async def _connect_ws(self) -> aiohttp.ClientWebSocketResponse:
111
+ session = self._ensure_session()
112
+
113
+ return await asyncio.wait_for(
114
+ session.ws_connect(
115
+ RESEMBLE_WEBSOCKET_URL,
116
+ headers={"Authorization": f"Bearer {self._api_key}"},
117
+ ),
118
+ self._conn_options.timeout,
98
119
  )
99
120
 
100
- async def _close_ws(self, ws: websockets.WebSocketClientProtocol):
101
- """Close the WebSocket connection."""
121
+ async def _close_ws(self, ws: aiohttp.ClientWebSocketResponse):
102
122
  await ws.close()
103
123
 
124
+ def _ensure_session(self) -> aiohttp.ClientSession:
125
+ if not self._session:
126
+ self._session = utils.http_context.http_session()
127
+
128
+ return self._session
129
+
130
+ def prewarm(self) -> None:
131
+ self._pool.prewarm()
132
+
104
133
  def update_options(
105
134
  self,
106
135
  *,
107
136
  voice_uuid: str | None = None,
108
- **kwargs,
137
+ sample_rate: int | None = None,
109
138
  ) -> None:
110
- """Update TTS options."""
111
- if voice_uuid:
112
- self._opts.voice_uuid = voice_uuid
139
+ """
140
+ Update the Text-to-Speech (TTS) configuration options.
141
+
142
+ Args:
143
+ voice_uuid (str, optional): The voice UUID for the desired voice.
144
+ sample_rate (int, optional): The audio sample rate in Hz.
145
+ """ # noqa: E501
146
+ self._opts.voice_uuid = voice_uuid or self._opts.voice_uuid
147
+ self._opts.sample_rate = sample_rate or self._opts.sample_rate
113
148
 
114
149
  def synthesize(
115
150
  self,
116
151
  text: str,
117
152
  *,
118
- conn_options: Optional[APIConnectOptions] = None,
119
- ) -> "ChunkedStream":
120
- """Synthesize text into speech using Resemble AI."""
153
+ conn_options: APIConnectOptions | None = None,
154
+ ) -> ChunkedStream:
121
155
  return ChunkedStream(
122
156
  tts=self,
123
157
  input_text=text,
158
+ conn_options=conn_options or DEFAULT_API_CONNECT_OPTIONS,
124
159
  opts=self._opts,
125
- conn_options=conn_options,
126
160
  api_key=self._api_key,
127
- session=self._session,
161
+ session=self._ensure_session(),
128
162
  )
129
163
 
130
- def stream(
131
- self, *, conn_options: Optional[APIConnectOptions] = None
132
- ) -> "SynthesizeStream":
133
- """Create a streaming synthesis connection to Resemble AI."""
164
+ def stream(self, *, conn_options: APIConnectOptions | None = None) -> SynthesizeStream:
134
165
  stream = SynthesizeStream(
135
166
  tts=self,
167
+ pool=self._pool,
136
168
  opts=self._opts,
137
- conn_options=conn_options,
138
169
  api_key=self._api_key,
139
- pool=self._pool,
140
170
  )
141
171
  self._streams.add(stream)
142
172
  return stream
143
173
 
144
- async def __aenter__(self) -> "TTS":
145
- """Enter async context manager."""
146
- return self
147
-
148
- async def __aexit__(self, exc_type, exc_val, exc_tb) -> None:
149
- """Exit async context manager and clean up resources."""
150
- await self.aclose()
151
-
152
174
  async def aclose(self) -> None:
153
- """Clean up resources."""
154
- # Close all active streams
155
175
  for stream in list(self._streams):
156
176
  await stream.aclose()
157
177
  self._streams.clear()
158
-
159
- # Close the WebSocket connection pool
160
178
  await self._pool.aclose()
161
-
162
179
  await super().aclose()
163
180
 
164
181
 
@@ -170,19 +187,15 @@ class ChunkedStream(tts.ChunkedStream):
170
187
  *,
171
188
  tts: TTS,
172
189
  input_text: str,
173
- opts: _Options,
174
- conn_options: Optional[APIConnectOptions] = None,
175
- api_key: str | None = None,
190
+ opts: _TTSOptions,
191
+ conn_options: APIConnectOptions,
192
+ api_key: str,
176
193
  session: aiohttp.ClientSession,
177
194
  ) -> None:
178
195
  super().__init__(tts=tts, input_text=input_text, conn_options=conn_options)
179
- self._opts = opts
180
- self._api_key = api_key
181
- self._session = session
182
- self._segment_id = utils.shortuuid()
196
+ self._opts, self._session, self._api_key = opts, session, api_key
183
197
 
184
198
  async def _run(self) -> None:
185
- """Run the synthesis process using REST API."""
186
199
  request_id = utils.shortuuid()
187
200
 
188
201
  # Create request headers
@@ -197,35 +210,24 @@ class ChunkedStream(tts.ChunkedStream):
197
210
  "voice_uuid": self._opts.voice_uuid,
198
211
  "data": self._input_text,
199
212
  "sample_rate": self._opts.sample_rate,
213
+ "precision": "PCM_16",
200
214
  }
201
-
202
- # Create decoder for audio processing
203
215
  decoder = utils.codecs.AudioStreamDecoder(
204
216
  sample_rate=self._opts.sample_rate,
205
217
  num_channels=NUM_CHANNELS,
206
218
  )
207
219
 
208
220
  try:
209
- # Make the HTTP request with explicit timeout
210
221
  async with self._session.post(
211
222
  RESEMBLE_REST_API_URL,
212
223
  headers=headers,
213
224
  json=payload,
214
225
  timeout=aiohttp.ClientTimeout(
215
- total=30, # 30 seconds total timeout
226
+ total=30,
216
227
  sock_connect=self._conn_options.timeout,
217
228
  ),
218
229
  ) as response:
219
- if not response.ok:
220
- error_text = await response.text()
221
- raise APIStatusError(
222
- message=f"Resemble API error: {error_text}",
223
- status_code=response.status,
224
- request_id=request_id,
225
- body=error_text,
226
- )
227
-
228
- # Parse the JSON response
230
+ response.raise_for_status()
229
231
  response_json = await response.json()
230
232
 
231
233
  # Check for success
@@ -251,42 +253,32 @@ class ChunkedStream(tts.ChunkedStream):
251
253
 
252
254
  # Decode base64 to get raw audio bytes
253
255
  audio_bytes = base64.b64decode(audio_content_b64)
256
+ decoder.push(audio_bytes)
257
+ decoder.end_input()
254
258
 
255
- # Create audio emitter
256
259
  emitter = tts.SynthesizedAudioEmitter(
257
260
  event_ch=self._event_ch,
258
261
  request_id=request_id,
259
- segment_id=self._segment_id,
260
262
  )
261
-
262
- # Push audio data to decoder
263
- decoder.push(audio_bytes)
264
- decoder.end_input()
265
-
266
- # Emit audio frames
267
263
  async for frame in decoder:
268
264
  emitter.push(frame)
269
-
270
- # Final flush of the emitter
271
265
  emitter.flush()
272
266
 
273
267
  except aiohttp.ClientResponseError as e:
274
- # Handle HTTP errors (4xx, 5xx)
275
268
  raise APIStatusError(
276
- message=f"Resemble API error: {e.message}",
269
+ message=e.message,
277
270
  status_code=e.status,
278
271
  request_id=request_id,
279
- body=None,
272
+ body=f"resemble api error: {str(e)}",
280
273
  ) from e
281
274
  except asyncio.TimeoutError as e:
282
- logger.error("Timeout while connecting to Resemble API")
283
275
  raise APITimeoutError() from e
284
276
  except aiohttp.ClientError as e:
285
- logger.error(f"Connection error to Resemble API: {e}")
286
- raise APIConnectionError(f"Connection error: {e}") from e
277
+ raise APIConnectionError(
278
+ message=f"Resemble API connection error: {str(e)}",
279
+ ) from e
287
280
  except Exception as e:
288
- logger.error(f"Unexpected error during synthesis: {e}")
289
- raise APIConnectionError(f"Error during synthesis: {e}") from e
281
+ raise APIConnectionError(f"Error during synthesis: {str(e)}") from e
290
282
  finally:
291
283
  await decoder.aclose()
292
284
 
@@ -294,6 +286,7 @@ class ChunkedStream(tts.ChunkedStream):
294
286
  class SynthesizeStream(tts.SynthesizeStream):
295
287
  """Stream-based text-to-speech synthesis using Resemble AI WebSocket API.
296
288
 
289
+
297
290
  This implementation connects to Resemble's WebSocket API for real-time streaming
298
291
  synthesis. Note that this requires a Business plan subscription with Resemble AI.
299
292
  """
@@ -302,319 +295,158 @@ class SynthesizeStream(tts.SynthesizeStream):
302
295
  self,
303
296
  *,
304
297
  tts: TTS,
305
- opts: _Options,
306
- conn_options: Optional[APIConnectOptions] = None,
307
- api_key: str | None = None,
308
- pool: utils.ConnectionPool[websockets.WebSocketClientProtocol],
298
+ opts: _TTSOptions,
299
+ pool: utils.ConnectionPool[aiohttp.ClientWebSocketResponse],
300
+ api_key: str,
309
301
  ):
310
- super().__init__(tts=tts, conn_options=conn_options)
311
- self._opts = opts
312
- self._api_key = api_key
313
- self._request_id = 0
314
- self._running = False
315
- self._websocket = None
316
- self._pool = pool
317
-
318
- # Channels for communication between components
319
- self._text_ch = asyncio.Queue()
320
- self._audio_ch = asyncio.Queue()
321
-
322
- # Tasks for processing
323
- self._websocket_task = None
324
- self._processing_task = None
325
- self._closed = False
326
-
327
- # Create a task to monitor the base class's input channel
328
- self._input_monitor_task = asyncio.create_task(self._monitor_input_channel())
302
+ super().__init__(tts=tts)
303
+ self._opts, self._pool, self._api_key = opts, pool, api_key
329
304
 
330
- async def _monitor_input_channel(self) -> None:
331
- """Monitor the input channel from the base class and forward to our text channel."""
305
+ async def _run(self) -> None:
306
+ request_id = utils.shortuuid()
307
+ self._segments_ch = utils.aio.Chan[tokenize.SentenceStream]()
308
+
309
+ @utils.log_exceptions(logger=logger)
310
+ async def _tokenize_input():
311
+ """tokenize text from the input_ch to words"""
312
+ input_stream = None
313
+ async for input in self._input_ch:
314
+ if isinstance(input, str):
315
+ if input_stream is None:
316
+ # new segment (after flush for e.g)
317
+ input_stream = self._opts.tokenizer.stream()
318
+ self._segments_ch.send_nowait(input_stream)
319
+ input_stream.push_text(input)
320
+ elif isinstance(input, self._FlushSentinel):
321
+ if input_stream is not None:
322
+ input_stream.end_input()
323
+ input_stream = None
324
+ if input_stream is not None:
325
+ input_stream.end_input()
326
+ self._segments_ch.close()
327
+
328
+ @utils.log_exceptions(logger=logger)
329
+ async def _process_segments():
330
+ async for input_stream in self._segments_ch:
331
+ await self._run_ws(input_stream)
332
+
333
+ tasks = [
334
+ asyncio.create_task(_tokenize_input()),
335
+ asyncio.create_task(_process_segments()),
336
+ ]
332
337
  try:
333
- buffer = ""
334
- word_count = 0
335
- MIN_WORDS_TO_BUFFER = 5 # Buffer at least this many words before sending
336
-
337
- async for item in self._input_ch:
338
- if isinstance(item, self._FlushSentinel):
339
- # When we get a flush sentinel, send any buffered text
340
- if buffer:
341
- await self._text_ch.put(buffer)
342
- buffer = ""
343
- word_count = 0
344
- # Signal end of input
345
- await self._text_ch.put(None)
346
- continue
347
- else:
348
- # It's a text token, add to buffer
349
- buffer += item
350
-
351
- # Count words in the buffer
352
- if item.strip() and (item.endswith(" ") or item.endswith("\n")):
353
- word_count += 1
354
-
355
- # Send buffer when we have enough words or hit sentence-ending punctuation
356
- if word_count >= MIN_WORDS_TO_BUFFER or any(
357
- buffer.rstrip().endswith(p) for p in [".", "!", "?", ":", ";"]
358
- ):
359
- await self._text_ch.put(buffer)
360
- buffer = ""
361
- word_count = 0
362
-
363
- # End of input - send any remaining text in buffer
364
- if buffer:
365
- await self._text_ch.put(buffer)
338
+ await asyncio.gather(*tasks)
339
+ except asyncio.TimeoutError as e:
340
+ raise APITimeoutError() from e
341
+ except aiohttp.ClientResponseError as e:
342
+ raise APIStatusError(
343
+ message=e.message,
344
+ status_code=e.status,
345
+ request_id=request_id,
346
+ body=None,
347
+ ) from e
366
348
  except Exception as e:
367
- logger.error(f"Error in input channel monitor: {e}")
349
+ raise APIConnectionError() from e
368
350
  finally:
369
- if not self._closed:
370
- # Signal end of input if our monitor is shutting down unexpectedly
371
- await self._text_ch.put(None)
372
-
373
- def _preprocess_text(self, text: str) -> str:
374
- """Preprocess text before sending to Resemble API.
375
-
376
- This ensures punctuation is properly handled by combining it with adjacent words.
377
- """
378
- # Skip if text is empty or None
379
- if not text or not text.strip():
380
- return text
381
-
382
- # If text is just punctuation, add a space before it to avoid errors
383
- if text.strip() in ",.!?;:":
384
- return " " + text
385
-
386
- return text
387
-
388
- async def synthesize_text(self, text: str) -> None:
389
- """Queue text for synthesis."""
390
- if self._closed:
391
- raise RuntimeError("Stream is closed")
392
-
393
- # Preprocess text before sending
394
- processed_text = self._preprocess_text(text)
395
- await self._text_ch.put(processed_text)
396
-
397
- if not self._running:
398
- # Start processing if not already running
399
- self._running = True
400
- self._processing_task = asyncio.create_task(self._run())
401
-
402
- # Wait for the text to be processed
403
- await self._text_ch.join()
404
-
405
- # Signal end of input - this will close the channel
406
- # Note: We don't call flush() here because it's already done in end_input()
407
- self.end_input()
408
-
409
- async def aclose(self) -> None:
410
- """Close the stream and clean up resources."""
411
- self._closed = True
412
-
413
- # Close the text channel to signal the end
414
- if self._running:
415
- await self._text_ch.put(None) # Signal end of input
416
-
417
- # Cancel the input monitor task
418
- if self._input_monitor_task and not self._input_monitor_task.done():
419
- self._input_monitor_task.cancel()
420
- try:
421
- await self._input_monitor_task
422
- except asyncio.CancelledError:
423
- pass
424
-
425
- # Cancel any running tasks
426
- if self._processing_task and not self._processing_task.done():
427
- self._processing_task.cancel()
428
- try:
429
- await self._processing_task
430
- except asyncio.CancelledError:
431
- pass
432
-
433
- await super().aclose()
351
+ await utils.aio.gracefully_cancel(*tasks)
434
352
 
435
- async def _run(self) -> None:
436
- """Main processing loop for the streaming synthesis."""
437
-
438
- # Initialize decoder for audio processing
439
- decoder = utils.codecs.AudioStreamDecoder(
440
- sample_rate=self._opts.sample_rate,
441
- num_channels=NUM_CHANNELS,
442
- )
443
-
444
- try:
445
- request_id = utils.shortuuid()
353
+ async def _run_ws(
354
+ self,
355
+ input_stream: tokenize.SentenceStream,
356
+ ) -> None:
357
+ async with self._pool.connection() as ws:
446
358
  segment_id = utils.shortuuid()
447
-
448
- # Create audio emitter
449
- emitter = tts.SynthesizedAudioEmitter(
450
- event_ch=self._event_ch,
451
- request_id=request_id,
452
- segment_id=segment_id,
359
+ decoder = utils.codecs.AudioStreamDecoder(
360
+ sample_rate=self._opts.sample_rate,
361
+ num_channels=NUM_CHANNELS,
453
362
  )
454
-
455
- # Track pending requests to ensure all responses are received
363
+ index_lock = asyncio.Lock()
364
+ current_index = 0
456
365
  pending_requests = set()
457
366
 
458
- async with self._pool.connection() as websocket:
459
- # Start a separate task to handle WebSocket messages
460
- async def _ws_recv_task():
461
- try:
462
- while not self._closed:
463
- message = await websocket.recv()
464
-
465
- # Handle JSON response
466
- try:
467
- data = json.loads(message)
468
-
469
- # Handle audio data
470
- if data.get("type") == "audio":
471
- # Decode base64 audio content
472
- audio_data = base64.b64decode(data["audio_content"])
473
-
474
- try:
475
- # For PCM_16, each sample is 2 bytes (16 bits)
476
- bytes_per_sample = 2
477
- samples_per_channel = (
478
- len(audio_data) // bytes_per_sample
479
- )
480
-
481
- # Create audio frame directly from the PCM data
482
- frame = rtc.AudioFrame(
483
- data=audio_data,
484
- samples_per_channel=samples_per_channel,
485
- sample_rate=self._opts.sample_rate,
486
- num_channels=NUM_CHANNELS,
487
- )
488
-
489
- emitter.push(frame)
490
-
491
- emitter.flush()
492
-
493
- except Exception as e:
494
- logger.error(
495
- f"Error processing audio data: {e}",
496
- exc_info=True,
497
- )
498
-
499
- # Handle end of audio
500
- elif data.get("type") == "audio_end":
501
- # Complete current segment
502
- emitter.flush()
503
-
504
- # Mark request as completed if request_id is present
505
- if "request_id" in data:
506
- req_id = data["request_id"]
507
- if req_id in pending_requests:
508
- pending_requests.remove(req_id)
509
-
510
- # Handle errors
511
- elif data.get("type") == "error":
512
- error_msg = data.get("message", "Unknown error")
513
- logger.error(
514
- f"Resemble WebSocket API error: {error_msg}"
515
- )
516
-
517
- # Don't raise an error for punctuation-only inputs
518
- if (
519
- "would not generate any audio" in error_msg
520
- and data.get("request_id") in pending_requests
521
- ):
522
- req_id = data.get("request_id")
523
- pending_requests.remove(req_id)
524
- else:
525
- raise APIStatusError(
526
- message=f"Resemble API error: {error_msg}",
527
- status_code=data.get("status_code", 500),
528
- request_id=str(request_id),
529
- body=None,
530
- )
531
- except json.JSONDecodeError:
532
- logger.error(
533
- f"Failed to decode JSON response: {message}"
534
- )
535
- except websockets.exceptions.ConnectionClosed as e:
536
- logger.error(f"WebSocket connection closed: {e}")
537
- if not self._closed:
538
- raise APIConnectionError(
539
- f"WebSocket connection closed unexpectedly: {e}"
540
- )
541
- except Exception as e:
542
- logger.error(f"Error in WebSocket receive task: {e}")
543
- if not self._closed:
544
- raise
545
-
546
- # Start WebSocket receive task
547
- ws_task = asyncio.create_task(_ws_recv_task())
548
-
549
- # Process text input
550
- try:
551
- while not self._closed:
552
- # Wait for text to synthesize
553
- text = await self._text_ch.get()
554
-
555
- # None signals end of input
556
- if text is None:
557
- break
558
-
559
- if not text.strip():
560
- self._text_ch.task_done()
561
- continue
562
-
563
- # Preprocess text before sending
564
- text = self._preprocess_text(text)
565
-
566
- self._mark_started()
567
-
568
- payload = {
569
- "voice_uuid": self._opts.voice_uuid,
570
- "data": text,
571
- "request_id": self._request_id,
572
- "sample_rate": self._opts.sample_rate,
573
- "precision": "PCM_16",
574
- "no_audio_header": True,
575
- }
576
-
577
- # Add request to pending set
578
- pending_requests.add(self._request_id)
579
-
580
- # Send synthesis request
581
- await websocket.send(json.dumps(payload))
582
- self._request_id += 1
583
-
584
- # Mark the text as processed
585
- self._text_ch.task_done()
586
-
587
- # Wait for all pending requests to complete
588
- if pending_requests:
589
- # Wait with a timeout to avoid hanging indefinitely
590
- wait_start = time.time()
591
- while pending_requests and (time.time() - wait_start) < 5.0:
592
- await asyncio.sleep(0.1)
593
-
594
- if pending_requests:
595
- logger.warning(
596
- f"Timed out waiting for {len(pending_requests)} audio responses"
597
- )
598
-
599
- finally:
600
- # Cancel WebSocket task
601
- if not ws_task.done():
602
- ws_task.cancel()
603
- try:
604
- await ws_task
605
- except asyncio.CancelledError:
606
- pass
607
-
608
- except asyncio.CancelledError:
609
- raise
610
- except websockets.exceptions.ConnectionClosed as e:
611
- logger.error(f"WebSocket connection closed: {e}")
612
- raise APIConnectionError(f"WebSocket connection closed: {e}") from e
613
- except Exception as e:
614
- logger.error(f"Error during streaming synthesis: {e}")
615
- raise APIConnectionError(f"Error during streaming synthesis: {e}") from e
616
- finally:
617
- # Clean up resources
618
- await decoder.aclose()
367
+ @utils.log_exceptions(logger=logger)
368
+ async def _send_task(ws: aiohttp.ClientWebSocketResponse):
369
+ nonlocal current_index
370
+ index = 0
371
+ async for data in input_stream:
372
+ payload = {
373
+ "voice_uuid": self._opts.voice_uuid,
374
+ "data": data.token,
375
+ "request_id": index,
376
+ "sample_rate": self._opts.sample_rate,
377
+ "precision": "PCM_16",
378
+ "output_format": "mp3",
379
+ }
380
+ async with index_lock:
381
+ pending_requests.add(index)
382
+ index += 1
383
+ current_index = index
384
+ await ws.send_str(json.dumps(payload))
385
+
386
+ @utils.log_exceptions(logger=logger)
387
+ async def _emit_task():
388
+ emitter = tts.SynthesizedAudioEmitter(
389
+ event_ch=self._event_ch,
390
+ request_id=str(current_index),
391
+ segment_id=segment_id,
392
+ )
393
+ async for frame in decoder:
394
+ emitter.push(frame)
395
+ emitter.flush()
619
396
 
620
- self._running = False
397
+ @utils.log_exceptions(logger=logger)
398
+ async def _recv_task(ws: aiohttp.ClientWebSocketResponse):
399
+ while True:
400
+ msg = await ws.receive()
401
+ if msg.type in (
402
+ aiohttp.WSMsgType.CLOSED,
403
+ aiohttp.WSMsgType.CLOSE,
404
+ aiohttp.WSMsgType.CLOSING,
405
+ ):
406
+ raise APIStatusError(
407
+ "Resemble connection closed unexpectedly",
408
+ request_id=str(current_index),
409
+ )
410
+
411
+ if msg.type != aiohttp.WSMsgType.TEXT:
412
+ logger.warning("Unexpected Resemble message type %s", msg.type)
413
+ continue
414
+
415
+ data = json.loads(msg.data)
416
+
417
+ if data.get("type") == "audio":
418
+ if data.get("audio_content", None):
419
+ b64data = base64.b64decode(data["audio_content"])
420
+ decoder.push(b64data)
421
+
422
+ elif data.get("type") == "audio_end":
423
+ async with index_lock:
424
+ index = data["request_id"]
425
+ pending_requests.remove(index)
426
+ if not pending_requests:
427
+ decoder.end_input()
428
+ break # we are not going to receive any more audio
429
+ else:
430
+ logger.error("Unexpected Resemble message %s", data)
431
+
432
+ tasks = [
433
+ asyncio.create_task(_send_task(ws)),
434
+ asyncio.create_task(_recv_task(ws)),
435
+ asyncio.create_task(_emit_task()),
436
+ ]
437
+
438
+ try:
439
+ await asyncio.gather(*tasks)
440
+ except asyncio.TimeoutError as e:
441
+ raise APITimeoutError() from e
442
+ except aiohttp.ClientResponseError as e:
443
+ raise APIStatusError(
444
+ message=e.message,
445
+ status_code=e.status,
446
+ request_id=str(current_index),
447
+ body=None,
448
+ ) from e
449
+ except Exception as e:
450
+ raise APIConnectionError() from e
451
+ finally:
452
+ await utils.aio.gracefully_cancel(*tasks)
@@ -12,4 +12,4 @@
12
12
  # See the License for the specific language governing permissions and
13
13
  # limitations under the License.
14
14
 
15
- __version__ = "0.1.0"
15
+ __version__ = '0.1.0.rc1'
@@ -1,34 +1,25 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: livekit-plugins-resemble
3
- Version: 0.1.0
3
+ Version: 0.1.0rc1
4
4
  Summary: LiveKit Agents Plugin for Resemble AI
5
- Home-page: https://github.com/livekit/agents
6
- License: Apache-2.0
7
5
  Project-URL: Documentation, https://docs.livekit.io
8
6
  Project-URL: Website, https://livekit.io/
9
7
  Project-URL: Source, https://github.com/livekit/agents
10
- Keywords: webrtc,realtime,audio,video,livekit,resemble,tts
8
+ Author-email: LiveKit <hello@livekit.io>
9
+ License-Expression: Apache-2.0
10
+ Keywords: audio,livekit,realtime,video,webrtc
11
11
  Classifier: Intended Audience :: Developers
12
- Classifier: Topic :: Multimedia :: Sound/Audio
13
- Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
12
+ Classifier: License :: OSI Approved :: Apache Software License
14
13
  Classifier: Programming Language :: Python :: 3
15
- Classifier: Programming Language :: Python :: 3.11
16
- Classifier: Programming Language :: Python :: 3.12
17
14
  Classifier: Programming Language :: Python :: 3 :: Only
15
+ Classifier: Programming Language :: Python :: 3.9
16
+ Classifier: Programming Language :: Python :: 3.10
17
+ Classifier: Topic :: Multimedia :: Sound/Audio
18
+ Classifier: Topic :: Multimedia :: Video
19
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
18
20
  Requires-Python: >=3.9.0
21
+ Requires-Dist: livekit-agents>=1.0.0.rc7
19
22
  Description-Content-Type: text/markdown
20
- Requires-Dist: livekit-agents[codecs]>=0.12.3
21
- Requires-Dist: websockets==12.0
22
- Dynamic: classifier
23
- Dynamic: description
24
- Dynamic: description-content-type
25
- Dynamic: home-page
26
- Dynamic: keywords
27
- Dynamic: license
28
- Dynamic: project-url
29
- Dynamic: requires-dist
30
- Dynamic: requires-python
31
- Dynamic: summary
32
23
 
33
24
  # LiveKit Plugins Resemble
34
25
 
@@ -147,4 +138,4 @@ This plugin uses two different approaches to generate speech:
147
138
  1. **One-off Synthesis** - Uses Resemble's REST API for simple text-to-speech conversion
148
139
  2. **Streaming Synthesis** - Uses Resemble's WebSocket API for real-time streaming synthesis
149
140
 
150
- The WebSocket streaming API is only available for Resemble AI Business plan users.
141
+ The WebSocket streaming API is only available for Resemble AI Business plan users.
@@ -0,0 +1,9 @@
1
+ livekit/plugins/resemble/__init__.py,sha256=9xuQxGYo_lKpEOLN_o7FbWVFiyeP-_v-PJmy_zOs5Y8,1108
2
+ livekit/plugins/resemble/log.py,sha256=Pgg3yqt4OUcjrnnF8SKfH7G-Dk7jFI0yIhDa5hjTW5k,71
3
+ livekit/plugins/resemble/models.py,sha256=nK29wOCKkS29KjbiDaTpb7mlmUQSad9U_0bTD8yRcwk,74
4
+ livekit/plugins/resemble/py.typed,sha256=ajz1GSNU9xYVrFEDSz6Xwg7amWQ_yvW75tQa1ZvRIWc,3
5
+ livekit/plugins/resemble/tts.py,sha256=N8T0NrYh_nW77DN9yzI4OiNmSxzY7h9fISD2xHYfI8A,16169
6
+ livekit/plugins/resemble/version.py,sha256=xALnp90Zq1RJWmgPi_DHhHh2uCgvunb6LEWBCu5gQ20,604
7
+ livekit_plugins_resemble-0.1.0rc1.dist-info/METADATA,sha256=Yw_yuKL1RPJ_8smSjz24cXblMDc3MamQSRfukffzlx0,4801
8
+ livekit_plugins_resemble-0.1.0rc1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
9
+ livekit_plugins_resemble-0.1.0rc1.dist-info/RECORD,,
@@ -1,5 +1,4 @@
1
1
  Wheel-Version: 1.0
2
- Generator: setuptools (78.1.0)
2
+ Generator: hatchling 1.27.0
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
-
@@ -1,10 +0,0 @@
1
- livekit/plugins/resemble/__init__.py,sha256=9xuQxGYo_lKpEOLN_o7FbWVFiyeP-_v-PJmy_zOs5Y8,1108
2
- livekit/plugins/resemble/log.py,sha256=Pgg3yqt4OUcjrnnF8SKfH7G-Dk7jFI0yIhDa5hjTW5k,71
3
- livekit/plugins/resemble/models.py,sha256=C96f5YDrhc3UyG-P90QiGai6pIRr1W7mQgwXUlN9-ts,139
4
- livekit/plugins/resemble/py.typed,sha256=ajz1GSNU9xYVrFEDSz6Xwg7amWQ_yvW75tQa1ZvRIWc,3
5
- livekit/plugins/resemble/tts.py,sha256=x8ee1dhn983mXs9ia0U6ITa8CK6sqWF-LvlC2V41iZo,23016
6
- livekit/plugins/resemble/version.py,sha256=vQH9cItKAVYAmrLbOntkbLqmxrUZrPiKb1TjkZ8jRKQ,600
7
- livekit_plugins_resemble-0.1.0.dist-info/METADATA,sha256=XDyrThCqG_bDTIPIv2fIAodMp6hOp-4aKFi771W9uW0,4954
8
- livekit_plugins_resemble-0.1.0.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
9
- livekit_plugins_resemble-0.1.0.dist-info/top_level.txt,sha256=OoDok3xUmXbZRvOrfvvXB-Juu4DX79dlq188E19YHoo,8
10
- livekit_plugins_resemble-0.1.0.dist-info/RECORD,,
@@ -1 +0,0 @@
1
- livekit