cartesia 1.3.1.tar.gz → 1.4.0.tar.gz
This diff shows the changes between two publicly released package versions as they appear in their public registry, and is provided for informational purposes only.
- {cartesia-1.3.1 → cartesia-1.4.0}/PKG-INFO +4 -2
- {cartesia-1.3.1 → cartesia-1.4.0}/cartesia/_types.py +1 -1
- {cartesia-1.3.1 → cartesia-1.4.0}/cartesia/_websocket.py +2 -2
- cartesia-1.4.0/cartesia/async_tts.py +176 -0
- cartesia-1.4.0/cartesia/tts.py +292 -0
- cartesia-1.4.0/cartesia/version.py +1 -0
- {cartesia-1.3.1 → cartesia-1.4.0}/cartesia/voices.py +1 -5
- {cartesia-1.3.1 → cartesia-1.4.0}/pyproject.toml +2 -1
- {cartesia-1.3.1 → cartesia-1.4.0}/tests/test_tts.py +0 -18
- {cartesia-1.3.1 → cartesia-1.4.0}/uv.lock +12 -1
- cartesia-1.3.1/cartesia/async_tts.py +0 -63
- cartesia-1.3.1/cartesia/tts.py +0 -137
- cartesia-1.3.1/cartesia/version.py +0 -1
- {cartesia-1.3.1 → cartesia-1.4.0}/.github/workflows/ci.yaml +0 -0
- {cartesia-1.3.1 → cartesia-1.4.0}/.github/workflows/publish.yaml +0 -0
- {cartesia-1.3.1 → cartesia-1.4.0}/.gitignore +0 -0
- {cartesia-1.3.1 → cartesia-1.4.0}/LICENSE.md +0 -0
- {cartesia-1.3.1 → cartesia-1.4.0}/Makefile +0 -0
- {cartesia-1.3.1 → cartesia-1.4.0}/README.md +0 -0
- {cartesia-1.3.1 → cartesia-1.4.0}/bumpversion.py +0 -0
- {cartesia-1.3.1 → cartesia-1.4.0}/cartesia/__init__.py +0 -0
- {cartesia-1.3.1 → cartesia-1.4.0}/cartesia/_async_sse.py +0 -0
- {cartesia-1.3.1 → cartesia-1.4.0}/cartesia/_async_websocket.py +0 -0
- {cartesia-1.3.1 → cartesia-1.4.0}/cartesia/_constants.py +0 -0
- {cartesia-1.3.1 → cartesia-1.4.0}/cartesia/_logger.py +0 -0
- {cartesia-1.3.1 → cartesia-1.4.0}/cartesia/_sse.py +0 -0
- {cartesia-1.3.1 → cartesia-1.4.0}/cartesia/async_client.py +0 -0
- {cartesia-1.3.1 → cartesia-1.4.0}/cartesia/client.py +0 -0
- {cartesia-1.3.1 → cartesia-1.4.0}/cartesia/resource.py +0 -0
- {cartesia-1.3.1 → cartesia-1.4.0}/cartesia/utils/__init__.py +0 -0
- {cartesia-1.3.1 → cartesia-1.4.0}/cartesia/utils/deprecated.py +0 -0
- {cartesia-1.3.1 → cartesia-1.4.0}/cartesia/utils/retry.py +0 -0
- {cartesia-1.3.1 → cartesia-1.4.0}/cartesia/utils/tts.py +0 -0
- {cartesia-1.3.1 → cartesia-1.4.0}/tests/__init__.py +0 -0
- {cartesia-1.3.1 → cartesia-1.4.0}/tests/resources/sample-speech-4s.wav +0 -0
- {cartesia-1.3.1 → cartesia-1.4.0}/tests/test_deprecated.py +0 -0
{cartesia-1.3.1 → cartesia-1.4.0}/PKG-INFO
@@ -1,11 +1,13 @@
-Metadata-Version: 2.
+Metadata-Version: 2.4
 Name: cartesia
-Version: 1.3.1
+Version: 1.4.0
 Summary: The official Python library for the Cartesia API.
+License-File: LICENSE.md
 Requires-Python: >=3.9
 Requires-Dist: aiohttp>=3.10.10
 Requires-Dist: httpx>=0.27.0
 Requires-Dist: iterators>=0.2.0
+Requires-Dist: pydub>=0.25.1
 Requires-Dist: requests>=2.31.0
 Requires-Dist: websockets>=10.4
 Description-Content-Type: text/markdown

{cartesia-1.3.1 → cartesia-1.4.0}/cartesia/_types.py
@@ -36,7 +36,6 @@ class VoiceMetadata(TypedDict):
     user_id: str
     created_at: str
     language: str
-    base_voice_id: Optional[str] = None
 
 
 class VoiceControls(TypedDict):
@@ -62,6 +61,7 @@ class OutputFormat(TypedDict):
     container: str
     encoding: str
     sample_rate: int
+    bit_rate: Optional[int] = None
 
 
 class EventType:

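The new bit_rate field only matters for compressed containers such as mp3. As a minimal sketch of an output-format dict after this change (the field names come from the diff above; the concrete values are hypothetical, not taken from the package):

from cartesia._types import OutputFormat

# Hypothetical mp3 output format. "bit_rate" is the optional field added in
# 1.4.0; it can be omitted entirely for raw or wav output.
mp3_format: OutputFormat = {
    "container": "mp3",
    "encoding": "mp3",
    "sample_rate": 44100,
    "bit_rate": 128000,
}
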
{cartesia-1.3.1 → cartesia-1.4.0}/cartesia/_websocket.py
@@ -121,7 +121,7 @@ class _TTSContext:
                 raise RuntimeError(f"Error generating audio:\n{response['error']}")
             if response["done"]:
                 break
-            if response["data"]:
+            if "data" in response and response["data"]:
                 yield self._websocket._convert_response(
                     response=response, include_context_id=True
                 )
@@ -138,7 +138,7 @@ class _TTSContext:
                 raise RuntimeError(f"Error generating audio:\n{response['error']}")
             if response["done"]:
                 break
-            if response["data"]:
+            if "data" in response and response["data"]:
                 yield self._websocket._convert_response(
                     response=response, include_context_id=True
                 )

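Both hunks replace a direct response["data"] lookup with a membership test, so the stream loop now tolerates frames that omit the "data" key entirely instead of raising KeyError. A self-contained sketch of the difference (the frame shapes here are illustrative, not captured from the API):

# Hypothetical websocket frames: the second one carries no "data" key at all.
frames = [
    {"context_id": "ctx-1", "data": "…base64 audio…", "done": False},
    {"context_id": "ctx-1", "done": False},
    {"context_id": "ctx-1", "done": True},
]

for response in frames:
    if response["done"]:
        break
    # 1.3.1 did `if response["data"]:`, which raises KeyError on the second frame.
    if "data" in response and response["data"]:
        print("audio chunk:", response["data"][:10])
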
cartesia-1.4.0/cartesia/async_tts.py ADDED
@@ -0,0 +1,176 @@
+from typing import Iterator, List, Optional, Tuple
+
+import httpx
+from cartesia._async_sse import _AsyncSSE
+from cartesia._async_websocket import _AsyncWebSocket
+from cartesia._types import OutputFormat, VoiceControls
+from cartesia.tts import TTS
+from cartesia.utils.tts import _construct_tts_request
+
+
+class AsyncTTS(TTS):
+    def __init__(self, api_key, base_url, timeout, get_session):
+        super().__init__(api_key, base_url, timeout)
+        self._get_session = get_session
+        self._sse_class = _AsyncSSE(self._http_url(), self.headers, self.timeout, get_session)
+        self.sse = self._sse_class.send
+
+    async def websocket(self) -> _AsyncWebSocket:
+        ws = _AsyncWebSocket(
+            self._ws_url(),
+            self.api_key,
+            self.cartesia_version,
+            self.timeout,
+            self._get_session,
+        )
+        await ws.connect()
+        return ws
+
+    async def bytes(
+        self,
+        *,
+        model_id: str,
+        transcript: str,
+        output_format: OutputFormat,
+        voice_id: Optional[str] = None,
+        voice_embedding: Optional[List[float]] = None,
+        duration: Optional[int] = None,
+        language: Optional[str] = None,
+        _experimental_voice_controls: Optional[VoiceControls] = None,
+    ) -> bytes:
+        request_body = _construct_tts_request(
+            model_id=model_id,
+            transcript=transcript,
+            output_format=output_format,
+            voice_id=voice_id,
+            voice_embedding=voice_embedding,
+            duration=duration,
+            language=language,
+            _experimental_voice_controls=_experimental_voice_controls,
+        )
+
+        async with httpx.AsyncClient() as client:
+            response = await client.post(
+                f"{self._http_url()}/tts/bytes",
+                headers=self.headers,
+                timeout=self.timeout,
+                json=request_body,
+            )
+
+        if not response.is_success:
+            raise ValueError(f"Failed to generate audio. Error: {response.text}")
+
+        return response.content
+
+    async def infill(
+        self,
+        *,
+        model_id: str,
+        language: str,
+        transcript: str,
+        voice_id: str,
+        output_format: OutputFormat,
+        left_audio_path: Optional[str] = None,
+        right_audio_path: Optional[str] = None,
+        experimental_voice_controls: Optional[VoiceControls] = None,
+    ) -> Tuple[bytes, bytes]:
+        """Generate infill audio between two existing audio segments.
+
+        Args:
+            model_id: The ID of the model to use for generating audio
+            language: The language of the transcript
+            transcript: The text to synthesize
+            voice_id: The ID of the voice to use for generating audio
+            output_format: The desired audio output format
+            left_audio_path: Path to the audio file that comes before the infill
+            right_audio_path: Path to the audio file that comes after the infill
+            experimental_voice_controls: Optional voice control parameters
+
+        Returns:
+            A tuple containing:
+            - The generated infill audio (bytes)
+            - The complete concatenated audio (bytes)
+        """
+        if not left_audio_path and not right_audio_path:
+            raise ValueError("Must specify at least one of left_audio_path or right_audio_path")
+
+        headers = self.headers.copy()
+        headers.pop("Content-Type", None)
+
+        left_audio_file = None
+        right_audio_file = None
+        try:
+            files = {}
+            if left_audio_path:
+                left_audio_file = open(left_audio_path, "rb")
+                files["left_audio"] = left_audio_file
+            if right_audio_path:
+                right_audio_file = open(right_audio_path, "rb")
+                files["right_audio"] = right_audio_file
+
+            # Construct form data with output_format fields directly
+            data = {
+                "model_id": model_id,
+                "language": language,
+                "transcript": transcript,
+                "voice_id": voice_id,
+                "output_format[container]": output_format["container"],
+                "output_format[encoding]": output_format["encoding"],
+                "output_format[sample_rate]": output_format["sample_rate"],
+            }
+
+            # Add bit_rate for mp3 container
+            if "bit_rate" in output_format:
+                data["output_format[bit_rate]"] = output_format["bit_rate"]
+
+            # Add voice controls if specified
+            if experimental_voice_controls:
+                if "speed" in experimental_voice_controls:
+                    data["voice[__experimental_controls][speed]"] = experimental_voice_controls[
+                        "speed"
+                    ]
+                if "emotion" in experimental_voice_controls:
+                    # Pass emotions as a list instead of individual values
+                    data["voice[__experimental_controls][emotion][]"] = experimental_voice_controls[
+                        "emotion"
+                    ]
+
+            async with httpx.AsyncClient() as client:
+                response = await client.post(
+                    f"{self._http_url()}/infill/bytes",
+                    headers=headers,
+                    timeout=self.timeout,
+                    files=files,
+                    data=data,
+                )
+
+            if not response.is_success:
+                raise ValueError(
+                    f"Failed to infill audio. Status Code: {response.status_code}\n"
+                    f"Error: {response.text}"
+                )
+
+            if left_audio_file:
+                left_audio_file.seek(0)
+                left_audio = left_audio_file.read()
+            else:
+                left_audio = None
+
+            if right_audio_file:
+                right_audio_file.seek(0)
+                right_audio = right_audio_file.read()
+            else:
+                right_audio = None
+
+            infill_audio = response.content
+            format = output_format["container"].lower()
+            total_audio = self._concat_audio_segments(
+                left_audio, infill_audio, right_audio, format=format
+            )
+            return infill_audio, total_audio
+
+        finally:
+            if left_audio_file:
+                left_audio_file.close()
+            if right_audio_file:
+                right_audio_file.close()

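For orientation, a hedged sketch of how the new async infill endpoint might be called through the package's AsyncCartesia client (the API key, model ID, voice ID, file paths, and output-format values below are placeholders, not taken from the diff):

import asyncio

from cartesia import AsyncCartesia

async def main():
    client = AsyncCartesia(api_key="YOUR_API_KEY")  # placeholder key
    try:
        # Generates audio bridging left.wav and right.wav; returns both the
        # infill segment alone and the full left + infill + right concatenation.
        infill, combined = await client.tts.infill(
            model_id="sonic-english",  # hypothetical model ID
            language="en",
            transcript="the sentence to synthesize in between",
            voice_id="YOUR_VOICE_ID",  # placeholder voice ID
            output_format={"container": "wav", "encoding": "pcm_f32le", "sample_rate": 44100},
            left_audio_path="left.wav",
            right_audio_path="right.wav",
        )
        with open("combined.wav", "wb") as f:
            f.write(combined)
    finally:
        await client.close()

asyncio.run(main())
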
cartesia-1.4.0/cartesia/tts.py ADDED
@@ -0,0 +1,292 @@
+import json
+from typing import Iterator, List, Optional, Tuple
+
+import httpx
+import io
+from pydub import AudioSegment
+
+from cartesia._sse import _SSE
+from cartesia._types import (
+    OutputFormat,
+    OutputFormatMapping,
+    VoiceControls,
+)
+from cartesia._websocket import _WebSocket
+from cartesia.resource import Resource
+from cartesia.utils.tts import _construct_tts_request, _validate_and_construct_voice
+
+
+class TTS(Resource):
+    """This resource contains methods to generate audio using Cartesia's text-to-speech API."""
+
+    def __init__(self, api_key: str, base_url: str, timeout: float):
+        super().__init__(
+            api_key=api_key,
+            base_url=base_url,
+            timeout=timeout,
+        )
+        self._sse_class = _SSE(self._http_url(), self.headers, self.timeout)
+        self.sse = self._sse_class.send
+
+    def websocket(self) -> _WebSocket:
+        """This method returns a WebSocket object that can be used to generate audio using WebSocket.
+
+        Returns:
+            _WebSocket: A WebSocket object that can be used to generate audio using WebSocket.
+        """
+        ws = _WebSocket(self._ws_url(), self.api_key, self.cartesia_version)
+        ws.connect()
+        return ws
+
+    def bytes(
+        self,
+        *,
+        model_id: str,
+        transcript: str,
+        output_format: OutputFormat,
+        voice_id: Optional[str] = None,
+        voice_embedding: Optional[List[float]] = None,
+        duration: Optional[int] = None,
+        language: Optional[str] = None,
+        _experimental_voice_controls: Optional[VoiceControls] = None,
+    ) -> bytes:
+        request_body = _construct_tts_request(
+            model_id=model_id,
+            transcript=transcript,
+            output_format=output_format,
+            voice_id=voice_id,
+            voice_embedding=voice_embedding,
+            duration=duration,
+            language=language,
+            _experimental_voice_controls=_experimental_voice_controls,
+        )
+
+        response = httpx.post(
+            f"{self._http_url()}/tts/bytes",
+            headers=self.headers,
+            timeout=self.timeout,
+            json=request_body,
+        )
+
+        if not response.is_success:
+            raise ValueError(f"Failed to generate audio. Error: {response.text}")
+
+        return response.content
+
+    @staticmethod
+    def get_output_format(output_format_name: str) -> OutputFormat:
+        """Convenience method to get the output_format dictionary from a given output format name.
+
+        Args:
+            output_format_name (str): The name of the output format.
+
+        Returns:
+            OutputFormat: A dictionary containing the details of the output format to be passed into tts.sse() or tts.websocket().send()
+
+        Raises:
+            ValueError: If the output_format name is not supported
+        """
+        if output_format_name in OutputFormatMapping._format_mapping:
+            output_format_obj = OutputFormatMapping.get_format(output_format_name)
+        else:
+            raise ValueError(f"Unsupported format: {output_format_name}")
+
+        return OutputFormat(
+            container=output_format_obj["container"],
+            encoding=output_format_obj["encoding"],
+            sample_rate=output_format_obj["sample_rate"],
+        )
+
+    @staticmethod
+    def get_sample_rate(output_format_name: str) -> int:
+        """Convenience method to get the sample rate for a given output format.
+
+        Args:
+            output_format_name (str): The name of the output format.
+
+        Returns:
+            int: The sample rate for the output format.
+
+        Raises:
+            ValueError: If the output_format name is not supported
+        """
+        if output_format_name in OutputFormatMapping._format_mapping:
+            output_format_obj = OutputFormatMapping.get_format(output_format_name)
+        else:
+            raise ValueError(f"Unsupported format: {output_format_name}")
+
+        return output_format_obj["sample_rate"]
+
+    @staticmethod
+    def _validate_and_construct_voice(
+        voice_id: Optional[str] = None,
+        voice_embedding: Optional[List[float]] = None,
+        experimental_voice_controls: Optional[VoiceControls] = None,
+    ) -> dict:
+        """Validate and construct the voice dictionary for the request.
+
+        Args:
+            voice_id: The ID of the voice to use for generating audio.
+            voice_embedding: The embedding of the voice to use for generating audio.
+            experimental_voice_controls: Voice controls for emotion and speed.
+                Note: This is an experimental feature and may rapidly change in the future.
+
+        Returns:
+            A dictionary representing the voice configuration.
+
+        Raises:
+            ValueError: If neither or both voice_id and voice_embedding are specified.
+        """
+        return _validate_and_construct_voice(voice_id, voice_embedding, experimental_voice_controls)
+
+    def infill(
+        self,
+        *,
+        model_id: str,
+        language: str,
+        transcript: str,
+        voice_id: str,
+        output_format: OutputFormat,
+        left_audio_path: Optional[str] = None,
+        right_audio_path: Optional[str] = None,
+        experimental_voice_controls: Optional[VoiceControls] = None,
+    ) -> Tuple[bytes, bytes]:
+        """Generate infill audio between two existing audio segments.
+
+        Args:
+            model_id: The ID of the model to use for generating audio
+            language: The language of the transcript
+            transcript: The text to synthesize
+            voice_id: The ID of the voice to use for generating audio
+            output_format: The desired audio output format
+            left_audio_path: Path to the audio file that comes before the infill
+            right_audio_path: Path to the audio file that comes after the infill
+            experimental_voice_controls: Optional voice control parameters
+
+        Returns:
+            A tuple containing:
+            - The generated infill audio (bytes)
+            - The complete concatenated audio (bytes)
+        """
+        if not left_audio_path and not right_audio_path:
+            raise ValueError("Must specify at least one of left_audio_path or right_audio_path")
+
+        headers = self.headers.copy()
+        headers.pop("Content-Type", None)
+
+        left_audio_file = None
+        right_audio_file = None
+        try:
+            files = {}
+            if left_audio_path:
+                left_audio_file = open(left_audio_path, "rb")
+                files["left_audio"] = left_audio_file
+            if right_audio_path:
+                right_audio_file = open(right_audio_path, "rb")
+                files["right_audio"] = right_audio_file
+
+            # Construct form data with output_format fields directly
+            data = {
+                "model_id": model_id,
+                "language": language,
+                "transcript": transcript,
+                "voice_id": voice_id,
+                "output_format[container]": output_format["container"],
+                "output_format[encoding]": output_format["encoding"],
+                "output_format[sample_rate]": output_format["sample_rate"],
+            }
+
+            # Add bit_rate for mp3 container
+            if "bit_rate" in output_format:
+                data["output_format[bit_rate]"] = output_format["bit_rate"]
+
+            # Add voice controls if specified
+            if experimental_voice_controls:
+                if "speed" in experimental_voice_controls:
+                    data["voice[__experimental_controls][speed]"] = experimental_voice_controls[
+                        "speed"
+                    ]
+                if "emotion" in experimental_voice_controls:
+                    # Pass emotions as a list instead of individual values
+                    data["voice[__experimental_controls][emotion][]"] = experimental_voice_controls[
+                        "emotion"
+                    ]
+
+            response = httpx.post(
+                f"{self._http_url()}/infill/bytes",
+                headers=headers,
+                timeout=self.timeout,
+                files=files,
+                data=data,
+            )
+
+            if not response.is_success:
+                raise ValueError(
+                    f"Failed to infill audio. Status Code: {response.status_code}\n"
+                    f"Error: {response.text}"
+                )
+
+            if left_audio_file:
+                left_audio_file.seek(0)
+                left_audio = left_audio_file.read()
+            else:
+                left_audio = None
+
+            if right_audio_file:
+                right_audio_file.seek(0)
+                right_audio = right_audio_file.read()
+            else:
+                right_audio = None
+
+            infill_audio = response.content
+            format = output_format["container"].lower()
+            total_audio = self._concat_audio_segments(
+                left_audio, infill_audio, right_audio, format=format
+            )
+            return infill_audio, total_audio
+
+        finally:
+            if left_audio_file:
+                left_audio_file.close()
+            if right_audio_file:
+                right_audio_file.close()
+
+    @staticmethod
+    def _concat_audio_segments(
+        left_audio: Optional[bytes],
+        infill_audio: bytes,
+        right_audio: Optional[bytes],
+        format: str = "wav",
+    ) -> bytes:
+        """Helper method to concatenate three audio segments while preserving audio format and headers.
+
+        Args:
+            left_audio: The audio segment that comes before the infill
+            infill_audio: The generated infill audio segment
+            right_audio: The audio segment that comes after the infill
+            format: The audio format (e.g., 'wav', 'mp3'). Defaults to 'wav'
+
+        Returns:
+            bytes: The concatenated audio as bytes
+
+        Raises:
+            ValueError: If the audio segments cannot be loaded or concatenated
+        """
+        try:
+            # Convert bytes to AudioSegment objects
+            combined = AudioSegment.empty()
+            if left_audio:
+                combined += AudioSegment.from_file(io.BytesIO(left_audio), format=format)
+
+            combined += AudioSegment.from_file(io.BytesIO(infill_audio), format=format)
+
+            if right_audio:
+                combined += AudioSegment.from_file(io.BytesIO(right_audio), format=format)
+
+            # Export to bytes
+            output = io.BytesIO()
+            combined.export(output, format=format)
+            return output.getvalue()
+
+        except Exception as e:
+            raise ValueError(f"Failed to concatenate audio segments: {str(e)}")

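The synchronous version mirrors the async one; the concatenation step at the end relies on the new pydub dependency, which in turn shells out to ffmpeg for non-wav formats. A hedged usage sketch under the same placeholder assumptions as above:

from cartesia import Cartesia

client = Cartesia(api_key="YOUR_API_KEY")  # placeholder key
infill, combined = client.tts.infill(
    model_id="sonic-english",  # hypothetical model ID
    language="en",
    transcript="the sentence to synthesize in between",
    voice_id="YOUR_VOICE_ID",  # placeholder voice ID
    output_format={"container": "wav", "encoding": "pcm_f32le", "sample_rate": 44100},
    left_audio_path="left.wav",
    right_audio_path="right.wav",
)

# infill is just the generated middle segment; combined is left + infill + right,
# stitched together by TTS._concat_audio_segments via pydub.
with open("combined.wav", "wb") as f:
    f.write(combined)
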
cartesia-1.4.0/cartesia/version.py ADDED
@@ -0,0 +1 @@
+__version__ = "1.4.0"

{cartesia-1.3.1 → cartesia-1.4.0}/cartesia/voices.py
@@ -52,8 +52,7 @@ class Voices(Resource):
 
         if not response.is_success:
             raise ValueError(
-                f"Failed to get voice. Status Code: {response.status_code}\n"
-                f"Error: {response.text}"
+                f"Failed to get voice. Status Code: {response.status_code}\nError: {response.text}"
             )
 
         return response.json()
@@ -123,7 +122,6 @@ class Voices(Resource):
         name: str,
         description: str,
         embedding: List[float],
-        base_voice_id: Optional[str] = None,
         language: str = "en",
     ) -> VoiceMetadata:
         """Create a new voice.
@@ -132,7 +130,6 @@ class Voices(Resource):
             name: The name of the voice.
            description: The description of the voice.
             embedding: The embedding of the voice. This should be generated with :meth:`clone`.
-            base_voice_id: The ID of the base voice. This should be a valid voice ID if specified.
 
         Returns:
             A dictionary containing the voice metadata.
@@ -144,7 +141,6 @@ class Voices(Resource):
                 "name": name,
                 "description": description,
                 "embedding": embedding,
-                "base_voice_id": base_voice_id,
                 "language": language,
             },
             timeout=self.timeout,

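After this change, voices.create no longer accepts base_voice_id. A hedged sketch of a 1.4.0-compatible call (the key, names, and embedding values are placeholders):

from cartesia import Cartesia

client = Cartesia(api_key="YOUR_API_KEY")  # placeholder key
voice = client.voices.create(
    name="My cloned voice",                 # placeholder name
    description="Cloned from a 5s sample",  # placeholder description
    embedding=[0.0] * 192,                  # placeholder 192-dim embedding
    language="en",
    # base_voice_id=...  <- removed in 1.4.0; passing it now raises TypeError
)
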
{cartesia-1.3.1 → cartesia-1.4.0}/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "cartesia"
-version = "1.3.1"
+version = "1.4.0"
 description = "The official Python library for the Cartesia API."
 readme = "README.md"
 requires-python = ">=3.9"
@@ -10,6 +10,7 @@ dependencies = [
     "iterators>=0.2.0",
     "requests>=2.31.0",
    "websockets>=10.4",
+    "pydub>=0.25.1",
 ]
 
 [build-system]

{cartesia-1.3.1 → cartesia-1.4.0}/tests/test_tts.py
@@ -126,24 +126,6 @@ def test_create_voice(client: Cartesia):
     client.voices.delete(voice["id"])
 
 
-@pytest.mark.skip(reason="Enable after https://github.com/cartesia-ai/bifrost/pull/847 is deployed")
-def test_create_voice_with_parent(client: Cartesia):
-    logger.info("Testing voices.create with parent")
-    voice = client.voices.create(
-        name="Test Base voice",
-        description="Test base voice description",
-        embedding=np.ones(192).tolist(),
-        base_voice_id=SAMPLE_VOICE_ID,
-    )
-    assert isinstance(voice, dict)
-    assert voice["base_voice_id"] == SAMPLE_VOICE_ID
-
-    get_voice = client.voices.get(voice["id"])
-    assert get_voice["base_voice_id"] == SAMPLE_VOICE_ID
-
-    client.voices.delete(voice["id"])
-
-
 def test_mix_voice(client: Cartesia):
     logger.info("Testing voices.mix")
     output = client.voices.mix(

{cartesia-1.3.1 → cartesia-1.4.0}/uv.lock
@@ -162,12 +162,13 @@ wheels = [
 
 [[package]]
 name = "cartesia"
-version = "1.3.1"
+version = "1.4.0"
 source = { editable = "." }
 dependencies = [
     { name = "aiohttp" },
     { name = "httpx" },
     { name = "iterators" },
+    { name = "pydub" },
     { name = "requests" },
     { name = "websockets" },
 ]
@@ -191,6 +192,7 @@ requires-dist = [
     { name = "aiohttp", specifier = ">=3.10.10" },
     { name = "httpx", specifier = ">=0.27.0" },
     { name = "iterators", specifier = ">=0.2.0" },
+    { name = "pydub", specifier = ">=0.25.1" },
     { name = "requests", specifier = ">=2.31.0" },
     { name = "websockets", specifier = ">=10.4" },
 ]
@@ -1029,6 +1031,15 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/13/a3/a812df4e2dd5696d1f351d58b8fe16a405b234ad2886a0dab9183fb78109/pycparser-2.22-py3-none-any.whl", hash = "sha256:c3702b6d3dd8c7abc1afa565d7e63d53a1d0bd86cdc24edd75470f4de499cfcc", size = 117552 },
 ]
 
+[[package]]
+name = "pydub"
+version = "0.25.1"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/fe/9a/e6bca0eed82db26562c73b5076539a4a08d3cffd19c3cc5913a3e61145fd/pydub-0.25.1.tar.gz", hash = "sha256:980a33ce9949cab2a569606b65674d748ecbca4f0796887fd6f46173a7b0d30f", size = 38326 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/a6/53/d78dc063216e62fc55f6b2eebb447f6a4b0a59f55c8406376f76bf959b08/pydub-0.25.1-py2.py3-none-any.whl", hash = "sha256:65617e33033874b59d87db603aa1ed450633288aefead953b30bded59cb599a6", size = 32327 },
+]
+
 [[package]]
 name = "pygments"
 version = "2.18.0"

cartesia-1.3.1/cartesia/async_tts.py DELETED
@@ -1,63 +0,0 @@
-from typing import Iterator, List, Optional
-
-import httpx
-from cartesia._async_sse import _AsyncSSE
-from cartesia._async_websocket import _AsyncWebSocket
-from cartesia._types import OutputFormat, VoiceControls
-from cartesia.tts import TTS
-from cartesia.utils.tts import _construct_tts_request
-
-
-class AsyncTTS(TTS):
-    def __init__(self, api_key, base_url, timeout, get_session):
-        super().__init__(api_key, base_url, timeout)
-        self._get_session = get_session
-        self._sse_class = _AsyncSSE(self._http_url(), self.headers, self.timeout, get_session)
-        self.sse = self._sse_class.send
-
-    async def websocket(self) -> _AsyncWebSocket:
-        ws = _AsyncWebSocket(
-            self._ws_url(),
-            self.api_key,
-            self.cartesia_version,
-            self.timeout,
-            self._get_session,
-        )
-        await ws.connect()
-        return ws
-
-    async def bytes(
-        self,
-        *,
-        model_id: str,
-        transcript: str,
-        output_format: OutputFormat,
-        voice_id: Optional[str] = None,
-        voice_embedding: Optional[List[float]] = None,
-        duration: Optional[int] = None,
-        language: Optional[str] = None,
-        _experimental_voice_controls: Optional[VoiceControls] = None,
-    ) -> bytes:
-        request_body = _construct_tts_request(
-            model_id=model_id,
-            transcript=transcript,
-            output_format=output_format,
-            voice_id=voice_id,
-            voice_embedding=voice_embedding,
-            duration=duration,
-            language=language,
-            _experimental_voice_controls=_experimental_voice_controls,
-        )
-
-        async with httpx.AsyncClient() as client:
-            response = await client.post(
-                f"{self._http_url()}/tts/bytes",
-                headers=self.headers,
-                timeout=self.timeout,
-                json=request_body,
-            )
-
-        if not response.is_success:
-            raise ValueError(f"Failed to generate audio. Error: {response.text}")
-
-        return response.content

cartesia-1.3.1/cartesia/tts.py DELETED
@@ -1,137 +0,0 @@
-from typing import Iterator, List, Optional
-
-import httpx
-
-from cartesia._sse import _SSE
-from cartesia._types import (
-    OutputFormat,
-    OutputFormatMapping,
-    VoiceControls,
-)
-from cartesia._websocket import _WebSocket
-from cartesia.resource import Resource
-from cartesia.utils.tts import _construct_tts_request, _validate_and_construct_voice
-
-
-class TTS(Resource):
-    """This resource contains methods to generate audio using Cartesia's text-to-speech API."""
-
-    def __init__(self, api_key: str, base_url: str, timeout: float):
-        super().__init__(
-            api_key=api_key,
-            base_url=base_url,
-            timeout=timeout,
-        )
-        self._sse_class = _SSE(self._http_url(), self.headers, self.timeout)
-        self.sse = self._sse_class.send
-
-    def websocket(self) -> _WebSocket:
-        """This method returns a WebSocket object that can be used to generate audio using WebSocket.
-
-        Returns:
-            _WebSocket: A WebSocket object that can be used to generate audio using WebSocket.
-        """
-        ws = _WebSocket(self._ws_url(), self.api_key, self.cartesia_version)
-        ws.connect()
-        return ws
-
-    def bytes(
-        self,
-        *,
-        model_id: str,
-        transcript: str,
-        output_format: OutputFormat,
-        voice_id: Optional[str] = None,
-        voice_embedding: Optional[List[float]] = None,
-        duration: Optional[int] = None,
-        language: Optional[str] = None,
-        _experimental_voice_controls: Optional[VoiceControls] = None,
-    ) -> bytes:
-        request_body = _construct_tts_request(
-            model_id=model_id,
-            transcript=transcript,
-            output_format=output_format,
-            voice_id=voice_id,
-            voice_embedding=voice_embedding,
-            duration=duration,
-            language=language,
-            _experimental_voice_controls=_experimental_voice_controls,
-        )
-
-        response = httpx.post(
-            f"{self._http_url()}/tts/bytes",
-            headers=self.headers,
-            timeout=self.timeout,
-            json=request_body,
-        )
-
-        if not response.is_success:
-            raise ValueError(f"Failed to generate audio. Error: {response.text}")
-
-        return response.content
-
-    @staticmethod
-    def get_output_format(output_format_name: str) -> OutputFormat:
-        """Convenience method to get the output_format dictionary from a given output format name.
-
-        Args:
-            output_format_name (str): The name of the output format.
-
-        Returns:
-            OutputFormat: A dictionary containing the details of the output format to be passed into tts.sse() or tts.websocket().send()
-
-        Raises:
-            ValueError: If the output_format name is not supported
-        """
-        if output_format_name in OutputFormatMapping._format_mapping:
-            output_format_obj = OutputFormatMapping.get_format(output_format_name)
-        else:
-            raise ValueError(f"Unsupported format: {output_format_name}")
-
-        return OutputFormat(
-            container=output_format_obj["container"],
-            encoding=output_format_obj["encoding"],
-            sample_rate=output_format_obj["sample_rate"],
-        )
-
-    @staticmethod
-    def get_sample_rate(output_format_name: str) -> int:
-        """Convenience method to get the sample rate for a given output format.
-
-        Args:
-            output_format_name (str): The name of the output format.
-
-        Returns:
-            int: The sample rate for the output format.
-
-        Raises:
-            ValueError: If the output_format name is not supported
-        """
-        if output_format_name in OutputFormatMapping._format_mapping:
-            output_format_obj = OutputFormatMapping.get_format(output_format_name)
-        else:
-            raise ValueError(f"Unsupported format: {output_format_name}")
-
-        return output_format_obj["sample_rate"]
-
-    @staticmethod
-    def _validate_and_construct_voice(
-        voice_id: Optional[str] = None,
-        voice_embedding: Optional[List[float]] = None,
-        experimental_voice_controls: Optional[VoiceControls] = None,
-    ) -> dict:
-        """Validate and construct the voice dictionary for the request.
-
-        Args:
-            voice_id: The ID of the voice to use for generating audio.
-            voice_embedding: The embedding of the voice to use for generating audio.
-            experimental_voice_controls: Voice controls for emotion and speed.
-                Note: This is an experimental feature and may rapidly change in the future.
-
-        Returns:
-            A dictionary representing the voice configuration.
-
-        Raises:
-            ValueError: If neither or both voice_id and voice_embedding are specified.
-        """
-        return _validate_and_construct_voice(voice_id, voice_embedding, experimental_voice_controls)

cartesia-1.3.1/cartesia/version.py DELETED
@@ -1 +0,0 @@
-__version__ = "1.3.1"