livekit-plugins-resemble 0.1.0 (tar.gz)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of livekit-plugins-resemble might be problematic.
- livekit_plugins_resemble-0.1.0/PKG-INFO +150 -0
- livekit_plugins_resemble-0.1.0/README.md +118 -0
- livekit_plugins_resemble-0.1.0/livekit/plugins/resemble/__init__.py +37 -0
- livekit_plugins_resemble-0.1.0/livekit/plugins/resemble/log.py +3 -0
- livekit_plugins_resemble-0.1.0/livekit/plugins/resemble/models.py +10 -0
- livekit_plugins_resemble-0.1.0/livekit/plugins/resemble/py.typed +3 -0
- livekit_plugins_resemble-0.1.0/livekit/plugins/resemble/tts.py +620 -0
- livekit_plugins_resemble-0.1.0/livekit/plugins/resemble/version.py +15 -0
- livekit_plugins_resemble-0.1.0/livekit_plugins_resemble.egg-info/PKG-INFO +150 -0
- livekit_plugins_resemble-0.1.0/livekit_plugins_resemble.egg-info/SOURCES.txt +14 -0
- livekit_plugins_resemble-0.1.0/livekit_plugins_resemble.egg-info/dependency_links.txt +1 -0
- livekit_plugins_resemble-0.1.0/livekit_plugins_resemble.egg-info/requires.txt +2 -0
- livekit_plugins_resemble-0.1.0/livekit_plugins_resemble.egg-info/top_level.txt +1 -0
- livekit_plugins_resemble-0.1.0/pyproject.toml +3 -0
- livekit_plugins_resemble-0.1.0/setup.cfg +4 -0
- livekit_plugins_resemble-0.1.0/setup.py +55 -0
+++ livekit_plugins_resemble-0.1.0/PKG-INFO
@@ -0,0 +1,150 @@
+Metadata-Version: 2.4
+Name: livekit-plugins-resemble
+Version: 0.1.0
+Summary: LiveKit Agents Plugin for Resemble AI
+Home-page: https://github.com/livekit/agents
+License: Apache-2.0
+Project-URL: Documentation, https://docs.livekit.io
+Project-URL: Website, https://livekit.io/
+Project-URL: Source, https://github.com/livekit/agents
+Keywords: webrtc,realtime,audio,video,livekit,resemble,tts
+Classifier: Intended Audience :: Developers
+Classifier: Topic :: Multimedia :: Sound/Audio
+Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Classifier: Programming Language :: Python :: 3 :: Only
+Requires-Python: >=3.9.0
+Description-Content-Type: text/markdown
+Requires-Dist: livekit-agents[codecs]>=0.12.3
+Requires-Dist: websockets==12.0
+Dynamic: classifier
+Dynamic: description
+Dynamic: description-content-type
+Dynamic: home-page
+Dynamic: keywords
+Dynamic: license
+Dynamic: project-url
+Dynamic: requires-dist
+Dynamic: requires-python
+Dynamic: summary
+
[Lines 33-150 of PKG-INFO are the package long description, identical to the README.md shown in the next diff.]

+++ livekit_plugins_resemble-0.1.0/README.md
@@ -0,0 +1,118 @@
+# LiveKit Plugins Resemble
+
+Agent Framework plugin for voice synthesis with the [Resemble AI](https://www.resemble.ai/) API, using both their REST API and WebSocket streaming interface.
+
+## Installation
+
+```bash
+pip install livekit-plugins-resemble
+```
+
+## Pre-requisites
+
+You'll need an API key from Resemble AI. It can be set as an environment variable: `RESEMBLE_API_KEY`
+
+Additionally, you'll need the voice UUID from your Resemble AI account.
+
+## Examples
+
+### Recommended
+
+```python
+import asyncio
+from livekit.plugins.resemble import TTS
+
+async def run_tts_example():
+    # Use TTS with async context manager for automatic resource cleanup
+    async with TTS(
+        api_key="your_api_key",  # or set RESEMBLE_API_KEY environment variable
+        voice_uuid="your_voice_uuid",
+        # Optional parameters
+        sample_rate=44100,  # Sample rate in Hz (default: 44100)
+        precision="PCM_16",  # Audio precision (PCM_32, PCM_24, PCM_16, MULAW)
+        output_format="wav",  # Output format (wav or mp3)
+    ) as tts:
+        # One-off synthesis (uses REST API)
+        audio_stream = tts.synthesize("Hello, world!")
+
+        # Process chunks as they arrive
+        async for chunk in audio_stream:
+            # Audio data is in the 'frame.data' attribute of SynthesizedAudio objects
+            audio_data = chunk.frame.data
+            print(f"Received chunk: {len(audio_data)} bytes")
+
+        # Alternative: collect all audio at once into a single AudioFrame
+        audio_stream = tts.synthesize("Another example sentence.")
+        audio_frame = await audio_stream.collect()
+        print(f"Collected complete audio: {len(audio_frame.data)} bytes")
+
+        # Real-time streaming synthesis (uses WebSocket API)
+        # Only available for Business plan users in Resemble AI
+        stream = tts.stream()
+        await stream.synthesize_text("Hello, world!")
+
+
+
+# Run the example
+asyncio.run(run_tts_example())
+```
+
+### Alternative: Manual Resource Management
+
+If you prefer to manage resources manually, make sure to properly clean up:
+
+```python
+import asyncio
+from livekit.plugins.resemble import TTS
+
+async def run_tts_example():
+    # Initialize TTS with your credentials
+    tts = TTS(
+        api_key="your_api_key",
+        voice_uuid="your_voice_uuid",
+    )
+
+    try:
+        # TTS operations
+        audio_stream = tts.synthesize("Hello, world!")
+        async for chunk in audio_stream:
+            # Access audio data correctly
+            process_audio(chunk.frame.data)
+    finally:
+        # Always clean up resources when done
+        await tts.aclose()
+
+# Run the example
+asyncio.run(run_tts_example())
+```
+
+### Resource Management
+
+When using this plugin outside of the LiveKit agent framework, it's important to properly manage the TTS instance lifecycle:
+
+1. **Preferred method**: Use the async context manager pattern (`async with TTS(...) as tts:`)
+2. If managing manually, always call `await tts.aclose()` in a finally block
+3. If you prefer to provide your own HTTP session, you can pass it using the `http_session` parameter:
+
+```python
+import aiohttp
+
+async def with_custom_session():
+    async with aiohttp.ClientSession() as session:
+        async with TTS(
+            api_key="your_api_key",
+            voice_uuid="your_voice_uuid",
+            http_session=session
+        ) as tts:
+            # Use TTS...
+            # No need to manually close anything - context managers handle it all
+```
+
+## Implementation Details
+
+This plugin uses two different approaches to generate speech:
+
+1. **One-off Synthesis** - Uses Resemble's REST API for simple text-to-speech conversion
+2. **Streaming Synthesis** - Uses Resemble's WebSocket API for real-time streaming synthesis
+
+The WebSocket streaming API is only available for Resemble AI Business plan users.

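The Recommended example above passes `precision` and `output_format` to `TTS(...)`, but the `TTS.__init__` shipped in tts.py (shown further below) only accepts `api_key`, `voice_uuid`, `sample_rate`, and `http_session`, and it uses the supplied `http_session` directly for REST calls. The sketch below is not part of the release: it sticks to those parameters and writes a one-off synthesis to a playable WAV file, assuming `frame.data` holds 16-bit mono PCM as the PCM_16 handling in tts.py suggests.

```python
import asyncio
import wave

import aiohttp

from livekit.plugins.resemble import TTS


async def save_to_wav(text: str, path: str) -> None:
    async with aiohttp.ClientSession() as session:
        # api_key falls back to the RESEMBLE_API_KEY environment variable;
        # tts.py uses http_session directly, so one is passed in explicitly here.
        async with TTS(voice_uuid="your_voice_uuid", http_session=session) as tts:
            # collect() gathers the REST response into a single AudioFrame
            frame = await tts.synthesize(text).collect()
            with wave.open(path, "wb") as wav:
                wav.setnchannels(frame.num_channels)  # 1 channel for this plugin
                wav.setsampwidth(2)                   # assumption: 16-bit samples
                wav.setframerate(frame.sample_rate)
                wav.writeframes(frame.data)


asyncio.run(save_to_wav("Hello from Resemble AI!", "hello.wav"))
```
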
+++ livekit_plugins_resemble-0.1.0/livekit/plugins/resemble/__init__.py
@@ -0,0 +1,37 @@
+# Copyright 2023 LiveKit, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .tts import TTS, ChunkedStream, SynthesizeStream
+from .version import __version__
+
+__all__ = ["TTS", "ChunkedStream", "SynthesizeStream", "__version__"]
+
+from livekit.agents import Plugin
+
+
+class ResemblePlugin(Plugin):
+    def __init__(self) -> None:
+        super().__init__(__name__, __version__, __package__)
+
+
+Plugin.register_plugin(ResemblePlugin())
+
+# Cleanup docs of unexported modules
+_module = dir()
+NOT_IN_ALL = [m for m in _module if m not in __all__]
+
+__pdoc__ = {}
+
+for n in NOT_IN_ALL:
+    __pdoc__[n] = False

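Since `__init__.py` calls `Plugin.register_plugin(ResemblePlugin())` at import time, simply importing the package is enough to make the plugin known to the livekit-agents framework. A minimal sketch (not from the release):

```python
# Importing the package runs __init__.py above, which instantiates ResemblePlugin
# and registers it with livekit.agents via Plugin.register_plugin.
from livekit.plugins import resemble

# TTS, ChunkedStream and SynthesizeStream are re-exported through __all__;
# RESEMBLE_API_KEY must be set (or api_key passed) before constructing TTS.
tts = resemble.TTS(voice_uuid="your_voice_uuid")
```
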
+++ livekit_plugins_resemble-0.1.0/livekit/plugins/resemble/tts.py
@@ -0,0 +1,620 @@
+# Copyright 2023 LiveKit, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import annotations
+
+import asyncio
+import base64
+import json
+import os
+import time
+import weakref
+from dataclasses import dataclass
+from typing import Optional
+
+import aiohttp
+import websockets
+from livekit import rtc
+from livekit.agents import (
+    APIConnectionError,
+    APIConnectOptions,
+    APIStatusError,
+    APITimeoutError,
+    tts,
+    utils,
+)
+
+from .log import logger
+
+RESEMBLE_WEBSOCKET_URL = "wss://websocket.cluster.resemble.ai/stream"
+RESEMBLE_REST_API_URL = "https://f.cluster.resemble.ai/synthesize"
+NUM_CHANNELS = 1
+DEFAULT_VOICE_UUID = "55592656"
+
+
+@dataclass
+class _Options:
+    voice_uuid: str
+    sample_rate: int
+
+
+class TTS(tts.TTS):
+    def __init__(
+        self,
+        *,
+        api_key: str | None = None,
+        voice_uuid: str | None = DEFAULT_VOICE_UUID,
+        sample_rate: int = 44100,
+        http_session: aiohttp.ClientSession | None = None,
+    ) -> None:
+        super().__init__(
+            capabilities=tts.TTSCapabilities(
+                streaming=True,
+            ),
+            sample_rate=sample_rate,
+            num_channels=NUM_CHANNELS,
+        )
+
+        # Validate and set API key
+        self._api_key = api_key or os.environ.get("RESEMBLE_API_KEY")
+        if not self._api_key:
+            raise ValueError(
+                "Resemble API key is required, either as argument or set RESEMBLE_API_KEY environment variable"
+            )
+
+        # Set options
+        self._opts = _Options(
+            voice_uuid=voice_uuid,
+            sample_rate=sample_rate,
+        )
+
+        self._session = http_session
+        self._streams = weakref.WeakSet[SynthesizeStream]()
+
+        # Create a connection pool for WebSockets
+        self._pool = utils.ConnectionPool[websockets.WebSocketClientProtocol](
+            connect_cb=self._connect_ws,
+            close_cb=self._close_ws,
+        )
+
+    async def _connect_ws(self) -> websockets.WebSocketClientProtocol:
+        """Connect to the Resemble WebSocket API."""
+        return await websockets.connect(
+            RESEMBLE_WEBSOCKET_URL,
+            extra_headers={"Authorization": f"Bearer {self._api_key}"},
+            ping_interval=5,
+            ping_timeout=10,
+        )
+
+    async def _close_ws(self, ws: websockets.WebSocketClientProtocol):
+        """Close the WebSocket connection."""
+        await ws.close()
+
+    def update_options(
+        self,
+        *,
+        voice_uuid: str | None = None,
+        **kwargs,
+    ) -> None:
+        """Update TTS options."""
+        if voice_uuid:
+            self._opts.voice_uuid = voice_uuid
+
+    def synthesize(
+        self,
+        text: str,
+        *,
+        conn_options: Optional[APIConnectOptions] = None,
+    ) -> "ChunkedStream":
+        """Synthesize text into speech using Resemble AI."""
+        return ChunkedStream(
+            tts=self,
+            input_text=text,
+            opts=self._opts,
+            conn_options=conn_options,
+            api_key=self._api_key,
+            session=self._session,
+        )
+
+    def stream(
+        self, *, conn_options: Optional[APIConnectOptions] = None
+    ) -> "SynthesizeStream":
+        """Create a streaming synthesis connection to Resemble AI."""
+        stream = SynthesizeStream(
+            tts=self,
+            opts=self._opts,
+            conn_options=conn_options,
+            api_key=self._api_key,
+            pool=self._pool,
+        )
+        self._streams.add(stream)
+        return stream
+
+    async def __aenter__(self) -> "TTS":
+        """Enter async context manager."""
+        return self
+
+    async def __aexit__(self, exc_type, exc_val, exc_tb) -> None:
+        """Exit async context manager and clean up resources."""
+        await self.aclose()
+
+    async def aclose(self) -> None:
+        """Clean up resources."""
+        # Close all active streams
+        for stream in list(self._streams):
+            await stream.aclose()
+        self._streams.clear()
+
+        # Close the WebSocket connection pool
+        await self._pool.aclose()
+
+        await super().aclose()
+
+
+class ChunkedStream(tts.ChunkedStream):
+    """Synthesize text into speech in one go using Resemble AI's REST API."""
+
+    def __init__(
+        self,
+        *,
+        tts: TTS,
+        input_text: str,
+        opts: _Options,
+        conn_options: Optional[APIConnectOptions] = None,
+        api_key: str | None = None,
+        session: aiohttp.ClientSession,
+    ) -> None:
+        super().__init__(tts=tts, input_text=input_text, conn_options=conn_options)
+        self._opts = opts
+        self._api_key = api_key
+        self._session = session
+        self._segment_id = utils.shortuuid()
+
+    async def _run(self) -> None:
+        """Run the synthesis process using REST API."""
+        request_id = utils.shortuuid()
+
+        # Create request headers
+        headers = {
+            "Authorization": f"Bearer {self._api_key}",
+            "Content-Type": "application/json",
+            "Accept": "application/json",  # Expect JSON response
+        }
+
+        # Create request payload
+        payload = {
+            "voice_uuid": self._opts.voice_uuid,
+            "data": self._input_text,
+            "sample_rate": self._opts.sample_rate,
+        }
+
+        # Create decoder for audio processing
+        decoder = utils.codecs.AudioStreamDecoder(
+            sample_rate=self._opts.sample_rate,
+            num_channels=NUM_CHANNELS,
+        )
+
+        try:
+            # Make the HTTP request with explicit timeout
+            async with self._session.post(
+                RESEMBLE_REST_API_URL,
+                headers=headers,
+                json=payload,
+                timeout=aiohttp.ClientTimeout(
+                    total=30,  # 30 seconds total timeout
+                    sock_connect=self._conn_options.timeout,
+                ),
+            ) as response:
+                if not response.ok:
+                    error_text = await response.text()
+                    raise APIStatusError(
+                        message=f"Resemble API error: {error_text}",
+                        status_code=response.status,
+                        request_id=request_id,
+                        body=error_text,
+                    )
+
+                # Parse the JSON response
+                response_json = await response.json()
+
+                # Check for success
+                if not response_json.get("success", False):
+                    issues = response_json.get("issues", ["Unknown error"])
+                    error_msg = "; ".join(issues)
+                    raise APIStatusError(
+                        message=f"Resemble API returned failure: {error_msg}",
+                        status_code=response.status,
+                        request_id=request_id,
+                        body=json.dumps(response_json),
+                    )
+
+                # Extract base64-encoded audio content
+                audio_content_b64 = response_json.get("audio_content")
+                if not audio_content_b64:
+                    raise APIStatusError(
+                        message="No audio content in response",
+                        status_code=response.status,
+                        request_id=request_id,
+                        body=json.dumps(response_json),
+                    )
+
+                # Decode base64 to get raw audio bytes
+                audio_bytes = base64.b64decode(audio_content_b64)
+
+                # Create audio emitter
+                emitter = tts.SynthesizedAudioEmitter(
+                    event_ch=self._event_ch,
+                    request_id=request_id,
+                    segment_id=self._segment_id,
+                )
+
+                # Push audio data to decoder
+                decoder.push(audio_bytes)
+                decoder.end_input()
+
+                # Emit audio frames
+                async for frame in decoder:
+                    emitter.push(frame)
+
+                # Final flush of the emitter
+                emitter.flush()
+
+        except aiohttp.ClientResponseError as e:
+            # Handle HTTP errors (4xx, 5xx)
+            raise APIStatusError(
+                message=f"Resemble API error: {e.message}",
+                status_code=e.status,
+                request_id=request_id,
+                body=None,
+            ) from e
+        except asyncio.TimeoutError as e:
+            logger.error("Timeout while connecting to Resemble API")
+            raise APITimeoutError() from e
+        except aiohttp.ClientError as e:
+            logger.error(f"Connection error to Resemble API: {e}")
+            raise APIConnectionError(f"Connection error: {e}") from e
+        except Exception as e:
+            logger.error(f"Unexpected error during synthesis: {e}")
+            raise APIConnectionError(f"Error during synthesis: {e}") from e
+        finally:
+            await decoder.aclose()
+
+
+class SynthesizeStream(tts.SynthesizeStream):
+    """Stream-based text-to-speech synthesis using Resemble AI WebSocket API.
+
+    This implementation connects to Resemble's WebSocket API for real-time streaming
+    synthesis. Note that this requires a Business plan subscription with Resemble AI.
+    """
+
+    def __init__(
+        self,
+        *,
+        tts: TTS,
+        opts: _Options,
+        conn_options: Optional[APIConnectOptions] = None,
+        api_key: str | None = None,
+        pool: utils.ConnectionPool[websockets.WebSocketClientProtocol],
+    ):
+        super().__init__(tts=tts, conn_options=conn_options)
+        self._opts = opts
+        self._api_key = api_key
+        self._request_id = 0
+        self._running = False
+        self._websocket = None
+        self._pool = pool
+
+        # Channels for communication between components
+        self._text_ch = asyncio.Queue()
+        self._audio_ch = asyncio.Queue()
+
+        # Tasks for processing
+        self._websocket_task = None
+        self._processing_task = None
+        self._closed = False
+
+        # Create a task to monitor the base class's input channel
+        self._input_monitor_task = asyncio.create_task(self._monitor_input_channel())
+
+    async def _monitor_input_channel(self) -> None:
+        """Monitor the input channel from the base class and forward to our text channel."""
+        try:
+            buffer = ""
+            word_count = 0
+            MIN_WORDS_TO_BUFFER = 5  # Buffer at least this many words before sending
+
+            async for item in self._input_ch:
+                if isinstance(item, self._FlushSentinel):
+                    # When we get a flush sentinel, send any buffered text
+                    if buffer:
+                        await self._text_ch.put(buffer)
+                        buffer = ""
+                        word_count = 0
+                    # Signal end of input
+                    await self._text_ch.put(None)
+                    continue
+                else:
+                    # It's a text token, add to buffer
+                    buffer += item
+
+                    # Count words in the buffer
+                    if item.strip() and (item.endswith(" ") or item.endswith("\n")):
+                        word_count += 1
+
+                    # Send buffer when we have enough words or hit sentence-ending punctuation
+                    if word_count >= MIN_WORDS_TO_BUFFER or any(
+                        buffer.rstrip().endswith(p) for p in [".", "!", "?", ":", ";"]
+                    ):
+                        await self._text_ch.put(buffer)
+                        buffer = ""
+                        word_count = 0
+
+            # End of input - send any remaining text in buffer
+            if buffer:
+                await self._text_ch.put(buffer)
+        except Exception as e:
+            logger.error(f"Error in input channel monitor: {e}")
+        finally:
+            if not self._closed:
+                # Signal end of input if our monitor is shutting down unexpectedly
+                await self._text_ch.put(None)
+
+    def _preprocess_text(self, text: str) -> str:
+        """Preprocess text before sending to Resemble API.
+
+        This ensures punctuation is properly handled by combining it with adjacent words.
+        """
+        # Skip if text is empty or None
+        if not text or not text.strip():
+            return text
+
+        # If text is just punctuation, add a space before it to avoid errors
+        if text.strip() in ",.!?;:":
+            return " " + text
+
+        return text
+
+    async def synthesize_text(self, text: str) -> None:
+        """Queue text for synthesis."""
+        if self._closed:
+            raise RuntimeError("Stream is closed")
+
+        # Preprocess text before sending
+        processed_text = self._preprocess_text(text)
+        await self._text_ch.put(processed_text)
+
+        if not self._running:
+            # Start processing if not already running
+            self._running = True
+            self._processing_task = asyncio.create_task(self._run())
+
+        # Wait for the text to be processed
+        await self._text_ch.join()
+
+        # Signal end of input - this will close the channel
+        # Note: We don't call flush() here because it's already done in end_input()
+        self.end_input()
+
+    async def aclose(self) -> None:
+        """Close the stream and clean up resources."""
+        self._closed = True
+
+        # Close the text channel to signal the end
+        if self._running:
+            await self._text_ch.put(None)  # Signal end of input
+
+        # Cancel the input monitor task
+        if self._input_monitor_task and not self._input_monitor_task.done():
+            self._input_monitor_task.cancel()
+            try:
+                await self._input_monitor_task
+            except asyncio.CancelledError:
+                pass
+
+        # Cancel any running tasks
+        if self._processing_task and not self._processing_task.done():
+            self._processing_task.cancel()
+            try:
+                await self._processing_task
+            except asyncio.CancelledError:
+                pass
+
+        await super().aclose()
+
+    async def _run(self) -> None:
+        """Main processing loop for the streaming synthesis."""
+
+        # Initialize decoder for audio processing
+        decoder = utils.codecs.AudioStreamDecoder(
+            sample_rate=self._opts.sample_rate,
+            num_channels=NUM_CHANNELS,
+        )
+
+        try:
+            request_id = utils.shortuuid()
+            segment_id = utils.shortuuid()
+
+            # Create audio emitter
+            emitter = tts.SynthesizedAudioEmitter(
+                event_ch=self._event_ch,
+                request_id=request_id,
+                segment_id=segment_id,
+            )
+
+            # Track pending requests to ensure all responses are received
+            pending_requests = set()
+
+            async with self._pool.connection() as websocket:
+                # Start a separate task to handle WebSocket messages
+                async def _ws_recv_task():
+                    try:
+                        while not self._closed:
+                            message = await websocket.recv()
+
+                            # Handle JSON response
+                            try:
+                                data = json.loads(message)
+
+                                # Handle audio data
+                                if data.get("type") == "audio":
+                                    # Decode base64 audio content
+                                    audio_data = base64.b64decode(data["audio_content"])
+
+                                    try:
+                                        # For PCM_16, each sample is 2 bytes (16 bits)
+                                        bytes_per_sample = 2
+                                        samples_per_channel = (
+                                            len(audio_data) // bytes_per_sample
+                                        )
+
+                                        # Create audio frame directly from the PCM data
+                                        frame = rtc.AudioFrame(
+                                            data=audio_data,
+                                            samples_per_channel=samples_per_channel,
+                                            sample_rate=self._opts.sample_rate,
+                                            num_channels=NUM_CHANNELS,
+                                        )
+
+                                        emitter.push(frame)
+
+                                        emitter.flush()
+
+                                    except Exception as e:
+                                        logger.error(
+                                            f"Error processing audio data: {e}",
+                                            exc_info=True,
+                                        )
+
+                                # Handle end of audio
+                                elif data.get("type") == "audio_end":
+                                    # Complete current segment
+                                    emitter.flush()
+
+                                    # Mark request as completed if request_id is present
+                                    if "request_id" in data:
+                                        req_id = data["request_id"]
+                                        if req_id in pending_requests:
+                                            pending_requests.remove(req_id)
+
+                                # Handle errors
+                                elif data.get("type") == "error":
+                                    error_msg = data.get("message", "Unknown error")
+                                    logger.error(
+                                        f"Resemble WebSocket API error: {error_msg}"
+                                    )
+
+                                    # Don't raise an error for punctuation-only inputs
+                                    if (
+                                        "would not generate any audio" in error_msg
+                                        and data.get("request_id") in pending_requests
+                                    ):
+                                        req_id = data.get("request_id")
+                                        pending_requests.remove(req_id)
+                                    else:
+                                        raise APIStatusError(
+                                            message=f"Resemble API error: {error_msg}",
+                                            status_code=data.get("status_code", 500),
+                                            request_id=str(request_id),
+                                            body=None,
+                                        )
+                            except json.JSONDecodeError:
+                                logger.error(
+                                    f"Failed to decode JSON response: {message}"
+                                )
+                    except websockets.exceptions.ConnectionClosed as e:
+                        logger.error(f"WebSocket connection closed: {e}")
+                        if not self._closed:
+                            raise APIConnectionError(
+                                f"WebSocket connection closed unexpectedly: {e}"
+                            )
+                    except Exception as e:
+                        logger.error(f"Error in WebSocket receive task: {e}")
+                        if not self._closed:
+                            raise
+
+                # Start WebSocket receive task
+                ws_task = asyncio.create_task(_ws_recv_task())
+
+                # Process text input
+                try:
+                    while not self._closed:
+                        # Wait for text to synthesize
+                        text = await self._text_ch.get()
+
+                        # None signals end of input
+                        if text is None:
+                            break
+
+                        if not text.strip():
+                            self._text_ch.task_done()
+                            continue
+
+                        # Preprocess text before sending
+                        text = self._preprocess_text(text)
+
+                        self._mark_started()
+
+                        payload = {
+                            "voice_uuid": self._opts.voice_uuid,
+                            "data": text,
+                            "request_id": self._request_id,
+                            "sample_rate": self._opts.sample_rate,
+                            "precision": "PCM_16",
+                            "no_audio_header": True,
+                        }
+
+                        # Add request to pending set
+                        pending_requests.add(self._request_id)
+
+                        # Send synthesis request
+                        await websocket.send(json.dumps(payload))
+                        self._request_id += 1
+
+                        # Mark the text as processed
+                        self._text_ch.task_done()
+
+                    # Wait for all pending requests to complete
+                    if pending_requests:
+                        # Wait with a timeout to avoid hanging indefinitely
+                        wait_start = time.time()
+                        while pending_requests and (time.time() - wait_start) < 5.0:
+                            await asyncio.sleep(0.1)
+
+                        if pending_requests:
+                            logger.warning(
+                                f"Timed out waiting for {len(pending_requests)} audio responses"
+                            )
+
+                finally:
+                    # Cancel WebSocket task
+                    if not ws_task.done():
+                        ws_task.cancel()
+                        try:
+                            await ws_task
+                        except asyncio.CancelledError:
+                            pass
+
+        except asyncio.CancelledError:
+            raise
+        except websockets.exceptions.ConnectionClosed as e:
+            logger.error(f"WebSocket connection closed: {e}")
+            raise APIConnectionError(f"WebSocket connection closed: {e}") from e
+        except Exception as e:
+            logger.error(f"Error during streaming synthesis: {e}")
+            raise APIConnectionError(f"Error during streaming synthesis: {e}") from e
+        finally:
+            # Clean up resources
+            await decoder.aclose()
+
+            self._running = False

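For reference, these are the JSON message shapes that `SynthesizeStream._run` above exchanges over the WebSocket: requests carry the fields built into `payload`, and responses are dispatched on their `type` field (`audio`, `audio_end`, or `error`), with audio delivered as base64-encoded PCM_16. The values in this sketch are illustrative, not captured from a live session:

```python
import base64
import json

# Request, as sent with websocket.send(json.dumps(payload)) in _run above
payload = {
    "voice_uuid": "your_voice_uuid",
    "data": "Text to synthesize.",
    "request_id": 0,
    "sample_rate": 44100,
    "precision": "PCM_16",
    "no_audio_header": True,
}
wire = json.dumps(payload)

# An "audio" response carries base64-encoded 16-bit mono PCM samples
message = {
    "type": "audio",
    "audio_content": base64.b64encode(b"\x00\x00\x01\x00").decode(),
}
if message.get("type") == "audio":
    pcm = base64.b64decode(message["audio_content"])
    samples_per_channel = len(pcm) // 2  # 2 bytes per PCM_16 sample

# An "audio_end" response marks the request in its "request_id" field as complete;
# an "error" response carries a human-readable "message" and optional "status_code".
```
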
+++ livekit_plugins_resemble-0.1.0/livekit/plugins/resemble/version.py
@@ -0,0 +1,15 @@
+# Copyright 2023 LiveKit, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+__version__ = "0.1.0"

+++ livekit_plugins_resemble-0.1.0/livekit_plugins_resemble.egg-info/PKG-INFO
@@ -0,0 +1,150 @@
[Identical to livekit_plugins_resemble-0.1.0/PKG-INFO above.]

+++ livekit_plugins_resemble-0.1.0/livekit_plugins_resemble.egg-info/SOURCES.txt
@@ -0,0 +1,14 @@
+README.md
+pyproject.toml
+setup.py
+livekit/plugins/resemble/__init__.py
+livekit/plugins/resemble/log.py
+livekit/plugins/resemble/models.py
+livekit/plugins/resemble/py.typed
+livekit/plugins/resemble/tts.py
+livekit/plugins/resemble/version.py
+livekit_plugins_resemble.egg-info/PKG-INFO
+livekit_plugins_resemble.egg-info/SOURCES.txt
+livekit_plugins_resemble.egg-info/dependency_links.txt
+livekit_plugins_resemble.egg-info/requires.txt
+livekit_plugins_resemble.egg-info/top_level.txt

+++ livekit_plugins_resemble-0.1.0/livekit_plugins_resemble.egg-info/dependency_links.txt
@@ -0,0 +1 @@
+

+++ livekit_plugins_resemble-0.1.0/livekit_plugins_resemble.egg-info/top_level.txt
@@ -0,0 +1 @@
+livekit

+++ livekit_plugins_resemble-0.1.0/setup.py
@@ -0,0 +1,55 @@
+# Copyright 2023 LiveKit, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import pathlib
+
+import setuptools
+import setuptools.command.build_py
+
+here = pathlib.Path(__file__).parent.resolve()
+about = {}
+with open(os.path.join(here, "livekit", "plugins", "resemble", "version.py"), "r") as f:
+    exec(f.read(), about)
+
+
+setuptools.setup(
+    name="livekit-plugins-resemble",
+    version=about["__version__"],
+    description="LiveKit Agents Plugin for Resemble AI",
+    long_description=(here / "README.md").read_text(encoding="utf-8"),
+    long_description_content_type="text/markdown",
+    url="https://github.com/livekit/agents",
+    cmdclass={},
+    classifiers=[
+        "Intended Audience :: Developers",
+        "Topic :: Multimedia :: Sound/Audio",
+        "Topic :: Scientific/Engineering :: Artificial Intelligence",
+        "Programming Language :: Python :: 3",
+        "Programming Language :: Python :: 3.11",
+        "Programming Language :: Python :: 3.12",
+        "Programming Language :: Python :: 3 :: Only",
+    ],
+    keywords=["webrtc", "realtime", "audio", "video", "livekit", "resemble", "tts"],
+    license="Apache-2.0",
+    packages=setuptools.find_namespace_packages(include=["livekit.*"]),
+    python_requires=">=3.9.0",
+    install_requires=["livekit-agents[codecs]>=0.12.3", "websockets==12.0"],
+    package_data={"livekit.plugins.resemble": ["py.typed"]},
+    project_urls={
+        "Documentation": "https://docs.livekit.io",
+        "Website": "https://livekit.io/",
+        "Source": "https://github.com/livekit/agents",
+    },
+)