videosdk-plugins-resemble 0.0.47__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- videosdk_plugins_resemble-0.0.47/.gitignore +19 -0
- videosdk_plugins_resemble-0.0.47/PKG-INFO +26 -0
- videosdk_plugins_resemble-0.0.47/README.md +9 -0
- videosdk_plugins_resemble-0.0.47/pyproject.toml +32 -0
- videosdk_plugins_resemble-0.0.47/videosdk/plugins/resemble/__init__.py +3 -0
- videosdk_plugins_resemble-0.0.47/videosdk/plugins/resemble/tts.py +179 -0
- videosdk_plugins_resemble-0.0.47/videosdk/plugins/resemble/version.py +1 -0
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: videosdk-plugins-resemble
|
|
3
|
+
Version: 0.0.47
|
|
4
|
+
Summary: VideoSDK Agent Framework plugin for Resemble
|
|
5
|
+
Author: videosdk
|
|
6
|
+
License-Expression: Apache-2.0
|
|
7
|
+
Keywords: ai,audio,resemble,video,videosdk
|
|
8
|
+
Classifier: Development Status :: 4 - Beta
|
|
9
|
+
Classifier: Intended Audience :: Developers
|
|
10
|
+
Classifier: Topic :: Communications :: Conferencing
|
|
11
|
+
Classifier: Topic :: Multimedia :: Sound/Audio
|
|
12
|
+
Classifier: Topic :: Multimedia :: Video
|
|
13
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
14
|
+
Requires-Python: >=3.11
|
|
15
|
+
Requires-Dist: videosdk-agents>=0.0.47
|
|
16
|
+
Description-Content-Type: text/markdown
|
|
17
|
+
|
|
18
|
+
# VideoSDK Resemble Plugin
|
|
19
|
+
|
|
20
|
+
Agent Framework plugin for TTS services from Resemble.
|
|
21
|
+
|
|
22
|
+
## Installation
|
|
23
|
+
|
|
24
|
+
```bash
|
|
25
|
+
pip install videosdk-plugins-resemble
|
|
26
|
+
```
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["hatchling"]
|
|
3
|
+
build-backend = "hatchling.build"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "videosdk-plugins-resemble"
|
|
7
|
+
dynamic = ["version"]
|
|
8
|
+
description = "VideoSDK Agent Framework plugin for Resemble"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
license = "Apache-2.0"
|
|
11
|
+
requires-python = ">=3.11"
|
|
12
|
+
authors = [{ name = "videosdk" }]
|
|
13
|
+
keywords = ["video", "audio", "ai", "videosdk", "resemble"]
|
|
14
|
+
classifiers = [
|
|
15
|
+
"Intended Audience :: Developers",
|
|
16
|
+
"Development Status :: 4 - Beta",
|
|
17
|
+
"Intended Audience :: Developers",
|
|
18
|
+
"Topic :: Communications :: Conferencing",
|
|
19
|
+
"Topic :: Multimedia :: Sound/Audio",
|
|
20
|
+
"Topic :: Multimedia :: Video",
|
|
21
|
+
"Topic :: Scientific/Engineering :: Artificial Intelligence",
|
|
22
|
+
]
|
|
23
|
+
dependencies = ["videosdk-agents>=0.0.47"]
|
|
24
|
+
|
|
25
|
+
[tool.hatch.version]
|
|
26
|
+
path = "videosdk/plugins/resemble/version.py"
|
|
27
|
+
|
|
28
|
+
[tool.hatch.build.targets.wheel]
|
|
29
|
+
packages = ["videosdk"]
|
|
30
|
+
|
|
31
|
+
[tool.hatch.build.targets.sdist]
|
|
32
|
+
include = ["/videosdk"]
|
|
@@ -0,0 +1,179 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import Any, AsyncIterator, Optional
|
|
4
|
+
import os
|
|
5
|
+
import asyncio
|
|
6
|
+
import httpx
|
|
7
|
+
from dataclasses import dataclass
|
|
8
|
+
|
|
9
|
+
from videosdk.agents import TTS
|
|
10
|
+
from videosdk.agents.utils import segment_text
|
|
11
|
+
|
|
12
|
+
# Endpoint that ResembleTTS POSTs synthesis requests to and streams audio back from.
RESEMBLE_HTTP_STREAMING_URL = "https://f.cluster.resemble.ai/stream"
|
|
13
|
+
# Voice used when the caller does not pass `voice_uuid` to ResembleTTS.
DEFAULT_VOICE_UUID = "55592656"
|
|
14
|
+
# Default `sample_rate` (samples per second); sent in the request payload and used to size playback chunks.
DEFAULT_SAMPLE_RATE = 22050
|
|
15
|
+
# Default `precision` sent in the request payload. NOTE(review): the chunker assumes 2 bytes per sample,
# which matches a 16-bit format -- confirm before passing any other precision.
DEFAULT_PRECISION = "PCM_16"
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class ResembleTTS(TTS):
    """TTS plugin that synthesizes speech through Resemble's HTTP streaming endpoint.

    Each text segment is POSTed to ``RESEMBLE_HTTP_STREAMING_URL``. Everything
    up to and including the 8-byte ``data`` chunk header of the response is
    discarded (presumably a RIFF/WAV container -- confirm against the Resemble
    API docs), and the remaining bytes are pushed to ``self.audio_track`` in
    20 ms chunks.

    ``audio_track`` and ``loop`` start as ``None`` and must be assigned by the
    caller before ``synthesize`` is used; otherwise an ``"error"`` event is
    emitted and nothing is synthesized.
    """

    def __init__(
        self,
        *,
        api_key: str | None = None,
        voice_uuid: str = DEFAULT_VOICE_UUID,
        sample_rate: int = DEFAULT_SAMPLE_RATE,
        precision: str = DEFAULT_PRECISION,
    ) -> None:
        """Initialize the Resemble TTS plugin.

        Args:
            api_key: Resemble API key. When omitted (or empty), the
                ``RESEMBLE_API_KEY`` environment variable is used instead.
            voice_uuid: The voice UUID to synthesize with. Defaults to "55592656".
            sample_rate: The sample rate requested from Resemble and reported
                to the base class. Defaults to 22050.
            precision: The sample precision requested from Resemble.
                Defaults to "PCM_16".

        Raises:
            ValueError: If no API key is given and ``RESEMBLE_API_KEY`` is unset
                or empty.
        """
        super().__init__(sample_rate=sample_rate, num_channels=1)

        self.api_key = api_key or os.getenv("RESEMBLE_API_KEY")
        if not self.api_key:
            raise ValueError(
                "Resemble API key is required. Provide either `api_key` or set `RESEMBLE_API_KEY` environment variable.")

        self.voice_uuid = voice_uuid
        self.precision = precision

        self.audio_track = None
        self.loop = None
        self._first_chunk_sent = False
        self._interrupted = False
        self._current_synthesis_task: asyncio.Task | None = None
        # Strong references to in-flight add_new_bytes() tasks. The event loop
        # only keeps weak references, so an unreferenced task may be collected
        # before it finishes.
        self._pending_audio_tasks: set[asyncio.Task] = set()
        self._http_client = httpx.AsyncClient(
            timeout=httpx.Timeout(connect=15.0, read=30.0,
                                  write=5.0, pool=5.0),
            follow_redirects=True,
        )

    def reset_first_audio_tracking(self) -> None:
        """Reset the first audio tracking state for next TTS task"""
        self._first_chunk_sent = False

    async def synthesize(
        self,
        text: AsyncIterator[str] | str,
        **kwargs: Any,
    ) -> None:
        """Synthesize ``text`` and push the resulting audio to ``self.audio_track``.

        Args:
            text: Either a complete string, or an async iterator of text pieces
                that is split into segments with ``segment_text`` and
                synthesized one segment at a time.
            **kwargs: Forwarded to ``_synthesize_segment`` (currently unused there).

        Failures are reported through the ``"error"`` event rather than raised.
        """
        try:
            if not self.audio_track or not self.loop:
                self.emit("error", "Audio track or event loop not set")
                return

            # A new request clears any interrupt left over from the previous one.
            self._interrupted = False

            if isinstance(text, AsyncIterator):
                async for segment in segment_text(text):
                    if self._interrupted:
                        break
                    await self._synthesize_segment(segment, **kwargs)
            else:
                if not self._interrupted:
                    await self._synthesize_segment(text, **kwargs)

        except Exception as e:
            self.emit("error", f"Resemble TTS synthesis failed: {str(e)}")

    async def _synthesize_segment(self, text: str, **kwargs: Any) -> None:
        """Synthesize a single text segment; blank text and interrupted state are no-ops."""
        if not text.strip() or self._interrupted:
            return

        try:
            await self._http_stream_synthesis(text)
        except Exception as e:
            # Errors caused by an interrupt are expected and not reported.
            if not self._interrupted:
                self.emit("error", f"Segment synthesis failed: {str(e)}")

    async def _http_stream_synthesis(self, text: str) -> None:
        """POST ``text`` to Resemble, strip the container header, and play the audio.

        The response is consumed as a byte stream. Bytes are buffered until the
        ``data`` marker plus its 4-byte size field have fully arrived; everything
        after that is treated as raw audio. If no ``data`` marker is ever seen,
        the whole response is played as-is.

        HTTP and transport failures are reported through the ``"error"`` event
        unless the synthesis was interrupted.
        """
        headers = {
            "Authorization": f"Token {self.api_key}",
            "Content-Type": "application/json",
        }

        payload = {
            "voice_uuid": self.voice_uuid,
            "data": text,
            "precision": self.precision,
            "sample_rate": self.sample_rate,
        }

        try:
            # bytearray gives amortized O(1) appends; `bytes +=` would copy the
            # whole buffer on every network chunk.
            header_buf = bytearray()
            pcm = bytearray()
            header_done = False

            async with self._http_client.stream(
                "POST",
                RESEMBLE_HTTP_STREAMING_URL,
                headers=headers,
                json=payload
            ) as response:
                try:
                    response.raise_for_status()
                except httpx.HTTPStatusError:
                    # A streamed body is not loaded automatically. Read it while
                    # the stream is still open so the handler below can include
                    # `e.response.text` instead of failing with ResponseNotRead.
                    await response.aread()
                    raise

                async for chunk in response.aiter_bytes():
                    if self._interrupted:
                        break
                    if not chunk:
                        continue
                    if header_done:
                        pcm += chunk
                        continue

                    header_buf += chunk
                    data_pos = header_buf.find(b"data")
                    # Wait for the full 8-byte chunk header ("data" + 4-byte
                    # size). Cutting earlier would leak leftover size bytes
                    # into the audio and misalign every following sample.
                    if data_pos != -1 and len(header_buf) >= data_pos + 8:
                        pcm += header_buf[data_pos + 8:]
                        header_done = True

            if not header_done:
                # Stream ended before a complete header was seen: with no
                # marker at all, play everything received; with a truncated
                # header there is no audio to play.
                data_pos = header_buf.find(b"data")
                pcm = header_buf if data_pos == -1 else header_buf[data_pos + 8:]

            if pcm and not self._interrupted:
                await self._stream_audio_chunks(bytes(pcm))

        except httpx.HTTPStatusError as e:
            if not self._interrupted:
                self.emit(
                    "error", f"HTTP error {e.response.status_code}: {e.response.text}")
        except Exception as e:
            if not self._interrupted:
                self.emit(
                    "error", f"HTTP streaming synthesis failed: {str(e)}")

    async def _stream_audio_chunks(self, audio_bytes: bytes) -> None:
        """Feed ``audio_bytes`` to the audio track in fixed-size 20 ms chunks.

        The final chunk is zero-padded to the full chunk size. The first chunk
        ever sent triggers ``self._first_audio_callback`` once, if one is set.
        Stops early when the synthesis is interrupted.
        """
        bytes_per_sample = 2  # assumes 16-bit samples, single channel
        frame_ms = 20
        chunk_size = int(self.sample_rate * 1 * bytes_per_sample * frame_ms / 1000)

        for start in range(0, len(audio_bytes), chunk_size):
            if self._interrupted:
                break

            # Slices are never empty here; ljust pads only the short last one.
            chunk = audio_bytes[start:start + chunk_size].ljust(chunk_size, b'\x00')

            if not self._first_chunk_sent and self._first_audio_callback:
                self._first_chunk_sent = True
                await self._first_audio_callback()

            task = asyncio.create_task(self.audio_track.add_new_bytes(chunk))
            # Keep the task referenced until it completes, then drop it.
            self._pending_audio_tasks.add(task)
            task.add_done_callback(self._pending_audio_tasks.discard)
            # Yield briefly so the scheduled task and other coroutines can run.
            await asyncio.sleep(0.001)

    async def aclose(self) -> None:
        """Close the HTTP client, then run the base class cleanup."""
        if self._http_client:
            await self._http_client.aclose()
        await super().aclose()

    async def interrupt(self) -> None:
        """Interrupt TTS synthesis"""
        # The flag is polled by the synthesis, download, and playback loops.
        self._interrupted = True
        if self._current_synthesis_task and not self._current_synthesis_task.done():
            self._current_synthesis_task.cancel()
        if self.audio_track:
            self.audio_track.interrupt()
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__ = "0.0.47"
|