cartesia 1.0.14__tar.gz → 1.1.0.dev0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {cartesia-1.0.14/cartesia.egg-info → cartesia-1.1.0.dev0}/PKG-INFO +32 -15
- cartesia-1.0.14/PKG-INFO → cartesia-1.1.0.dev0/README.md +25 -21
- {cartesia-1.0.14 → cartesia-1.1.0.dev0}/cartesia/_async_sse.py +9 -19
- {cartesia-1.0.14 → cartesia-1.1.0.dev0}/cartesia/_async_websocket.py +16 -26
- {cartesia-1.0.14 → cartesia-1.1.0.dev0}/cartesia/_sse.py +9 -18
- {cartesia-1.0.14 → cartesia-1.1.0.dev0}/cartesia/_websocket.py +22 -41
- cartesia-1.1.0.dev0/cartesia/async_tts.py +63 -0
- {cartesia-1.0.14 → cartesia-1.1.0.dev0}/cartesia/tts.py +39 -2
- cartesia-1.1.0.dev0/cartesia/utils/tts.py +74 -0
- cartesia-1.1.0.dev0/cartesia/version.py +1 -0
- cartesia-1.0.14/README.md → cartesia-1.1.0.dev0/cartesia.egg-info/PKG-INFO +38 -4
- {cartesia-1.0.14 → cartesia-1.1.0.dev0}/cartesia.egg-info/SOURCES.txt +0 -1
- cartesia-1.1.0.dev0/cartesia.egg-info/requires.txt +5 -0
- cartesia-1.1.0.dev0/pyproject.toml +84 -0
- {cartesia-1.0.14 → cartesia-1.1.0.dev0}/tests/test_tts.py +60 -22
- cartesia-1.0.14/cartesia/async_tts.py +0 -22
- cartesia-1.0.14/cartesia/utils/tts.py +0 -25
- cartesia-1.0.14/cartesia/version.py +0 -1
- cartesia-1.0.14/cartesia.egg-info/requires.txt +0 -26
- cartesia-1.0.14/pyproject.toml +0 -56
- cartesia-1.0.14/setup.py +0 -292
- {cartesia-1.0.14 → cartesia-1.1.0.dev0}/LICENSE.md +0 -0
- {cartesia-1.0.14 → cartesia-1.1.0.dev0}/cartesia/__init__.py +0 -0
- {cartesia-1.0.14 → cartesia-1.1.0.dev0}/cartesia/_constants.py +0 -0
- {cartesia-1.0.14 → cartesia-1.1.0.dev0}/cartesia/_logger.py +0 -0
- {cartesia-1.0.14 → cartesia-1.1.0.dev0}/cartesia/_types.py +0 -0
- {cartesia-1.0.14 → cartesia-1.1.0.dev0}/cartesia/async_client.py +0 -0
- {cartesia-1.0.14 → cartesia-1.1.0.dev0}/cartesia/client.py +0 -0
- {cartesia-1.0.14 → cartesia-1.1.0.dev0}/cartesia/resource.py +0 -0
- {cartesia-1.0.14 → cartesia-1.1.0.dev0}/cartesia/utils/__init__.py +0 -0
- {cartesia-1.0.14 → cartesia-1.1.0.dev0}/cartesia/utils/deprecated.py +0 -0
- {cartesia-1.0.14 → cartesia-1.1.0.dev0}/cartesia/utils/retry.py +0 -0
- {cartesia-1.0.14 → cartesia-1.1.0.dev0}/cartesia/voices.py +0 -0
- {cartesia-1.0.14 → cartesia-1.1.0.dev0}/cartesia.egg-info/dependency_links.txt +0 -0
- {cartesia-1.0.14 → cartesia-1.1.0.dev0}/cartesia.egg-info/top_level.txt +0 -0
- {cartesia-1.0.14 → cartesia-1.1.0.dev0}/setup.cfg +0 -0
- {cartesia-1.0.14 → cartesia-1.1.0.dev0}/tests/test_deprecated.py +0 -0
@@ -1,19 +1,15 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: cartesia
|
3
|
-
Version: 1.0.14
|
3
|
+
Version: 1.1.0.dev0
|
4
4
|
Summary: The official Python library for the Cartesia API.
|
5
|
-
|
6
|
-
Author: Cartesia, Inc.
|
7
|
-
Author-email: support@cartesia.ai
|
8
|
-
Classifier: Programming Language :: Python
|
9
|
-
Classifier: Programming Language :: Python :: 3
|
10
|
-
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
11
|
-
Requires-Python: >=3.8.0
|
5
|
+
Requires-Python: >=3.9
|
12
6
|
Description-Content-Type: text/markdown
|
13
|
-
Provides-Extra: dev
|
14
|
-
Provides-Extra: all
|
15
7
|
License-File: LICENSE.md
|
16
|
-
|
8
|
+
Requires-Dist: aiohttp>=3.10.10
|
9
|
+
Requires-Dist: httpx>=0.27.2
|
10
|
+
Requires-Dist: iterators>=0.2.0
|
11
|
+
Requires-Dist: requests>=2.32.3
|
12
|
+
Requires-Dist: websockets>=13.1
|
17
13
|
|
18
14
|
# Cartesia Python API Library
|
19
15
|
|
@@ -30,6 +26,7 @@ The official Cartesia Python library which provides convenient access to the Car
|
|
30
26
|
- [Installation](#installation)
|
31
27
|
- [Voices](#voices)
|
32
28
|
- [Text-to-Speech](#text-to-speech)
|
29
|
+
- [Bytes](#bytes)
|
33
30
|
- [Server-Sent Events (SSE)](#server-sent-events-sse)
|
34
31
|
- [WebSocket](#websocket)
|
35
32
|
- [Conditioning speech on previous generations using WebSocket](#conditioning-speech-on-previous-generations-using-websocket)
|
@@ -88,6 +85,30 @@ new_voice = client.voices.create(
|
|
88
85
|
|
89
86
|
## Text-to-Speech
|
90
87
|
|
88
|
+
### Bytes
|
89
|
+
|
90
|
+
```python
|
91
|
+
from cartesia import Cartesia
|
92
|
+
import os
|
93
|
+
|
94
|
+
client = Cartesia(api_key=os.environ.get("CARTESIA_API_KEY"))
|
95
|
+
|
96
|
+
data = client.tts.bytes(
|
97
|
+
model_id="sonic-english",
|
98
|
+
transcript="Hello, world! I'm generating audio on Cartesia.",
|
99
|
+
voice_id="a0e99841-438c-4a64-b679-ae501e7d6091", # Barbershop Man
|
100
|
+
# You can find the supported `output_format`s at https://docs.cartesia.ai/api-reference/tts/bytes
|
101
|
+
output_format={
|
102
|
+
"container": "wav",
|
103
|
+
"encoding": "pcm_f32le",
|
104
|
+
"sample_rate": 44100,
|
105
|
+
},
|
106
|
+
)
|
107
|
+
|
108
|
+
with open("output.wav", "wb") as f:
|
109
|
+
f.write(data)
|
110
|
+
```
|
111
|
+
|
91
112
|
### Server-Sent Events (SSE)
|
92
113
|
|
93
114
|
```python
|
@@ -96,7 +117,6 @@ import pyaudio
|
|
96
117
|
import os
|
97
118
|
|
98
119
|
client = Cartesia(api_key=os.environ.get("CARTESIA_API_KEY"))
|
99
|
-
voice_name = "Barbershop Man"
|
100
120
|
voice_id = "a0e99841-438c-4a64-b679-ae501e7d6091"
|
101
121
|
voice = client.voices.get(id=voice_id)
|
102
122
|
|
@@ -149,7 +169,6 @@ import os
|
|
149
169
|
|
150
170
|
async def write_stream():
|
151
171
|
client = AsyncCartesia(api_key=os.environ.get("CARTESIA_API_KEY"))
|
152
|
-
voice_name = "Barbershop Man"
|
153
172
|
voice_id = "a0e99841-438c-4a64-b679-ae501e7d6091"
|
154
173
|
voice = client.voices.get(id=voice_id)
|
155
174
|
transcript = "Hello! Welcome to Cartesia"
|
@@ -203,7 +222,6 @@ import pyaudio
|
|
203
222
|
import os
|
204
223
|
|
205
224
|
client = Cartesia(api_key=os.environ.get("CARTESIA_API_KEY"))
|
206
|
-
voice_name = "Barbershop Man"
|
207
225
|
voice_id = "a0e99841-438c-4a64-b679-ae501e7d6091"
|
208
226
|
voice = client.voices.get(id=voice_id)
|
209
227
|
transcript = "Hello! Welcome to Cartesia"
|
@@ -460,7 +478,6 @@ import pyaudio
|
|
460
478
|
import os
|
461
479
|
|
462
480
|
client = Cartesia(api_key=os.environ.get("CARTESIA_API_KEY"))
|
463
|
-
voice_name = "Barbershop Man"
|
464
481
|
voice_id = "a0e99841-438c-4a64-b679-ae501e7d6091"
|
465
482
|
voice = client.voices.get(id=voice_id)
|
466
483
|
|
@@ -1,20 +1,3 @@
|
|
1
|
-
Metadata-Version: 2.1
|
2
|
-
Name: cartesia
|
3
|
-
Version: 1.0.14
|
4
|
-
Summary: The official Python library for the Cartesia API.
|
5
|
-
Home-page:
|
6
|
-
Author: Cartesia, Inc.
|
7
|
-
Author-email: support@cartesia.ai
|
8
|
-
Classifier: Programming Language :: Python
|
9
|
-
Classifier: Programming Language :: Python :: 3
|
10
|
-
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
11
|
-
Requires-Python: >=3.8.0
|
12
|
-
Description-Content-Type: text/markdown
|
13
|
-
Provides-Extra: dev
|
14
|
-
Provides-Extra: all
|
15
|
-
License-File: LICENSE.md
|
16
|
-
|
17
|
-
|
18
1
|
# Cartesia Python API Library
|
19
2
|
|
20
3
|

|
@@ -30,6 +13,7 @@ The official Cartesia Python library which provides convenient access to the Car
|
|
30
13
|
- [Installation](#installation)
|
31
14
|
- [Voices](#voices)
|
32
15
|
- [Text-to-Speech](#text-to-speech)
|
16
|
+
- [Bytes](#bytes)
|
33
17
|
- [Server-Sent Events (SSE)](#server-sent-events-sse)
|
34
18
|
- [WebSocket](#websocket)
|
35
19
|
- [Conditioning speech on previous generations using WebSocket](#conditioning-speech-on-previous-generations-using-websocket)
|
@@ -88,6 +72,30 @@ new_voice = client.voices.create(
|
|
88
72
|
|
89
73
|
## Text-to-Speech
|
90
74
|
|
75
|
+
### Bytes
|
76
|
+
|
77
|
+
```python
|
78
|
+
from cartesia import Cartesia
|
79
|
+
import os
|
80
|
+
|
81
|
+
client = Cartesia(api_key=os.environ.get("CARTESIA_API_KEY"))
|
82
|
+
|
83
|
+
data = client.tts.bytes(
|
84
|
+
model_id="sonic-english",
|
85
|
+
transcript="Hello, world! I'm generating audio on Cartesia.",
|
86
|
+
voice_id="a0e99841-438c-4a64-b679-ae501e7d6091", # Barbershop Man
|
87
|
+
# You can find the supported `output_format`s at https://docs.cartesia.ai/api-reference/tts/bytes
|
88
|
+
output_format={
|
89
|
+
"container": "wav",
|
90
|
+
"encoding": "pcm_f32le",
|
91
|
+
"sample_rate": 44100,
|
92
|
+
},
|
93
|
+
)
|
94
|
+
|
95
|
+
with open("output.wav", "wb") as f:
|
96
|
+
f.write(data)
|
97
|
+
```
|
98
|
+
|
91
99
|
### Server-Sent Events (SSE)
|
92
100
|
|
93
101
|
```python
|
@@ -96,7 +104,6 @@ import pyaudio
|
|
96
104
|
import os
|
97
105
|
|
98
106
|
client = Cartesia(api_key=os.environ.get("CARTESIA_API_KEY"))
|
99
|
-
voice_name = "Barbershop Man"
|
100
107
|
voice_id = "a0e99841-438c-4a64-b679-ae501e7d6091"
|
101
108
|
voice = client.voices.get(id=voice_id)
|
102
109
|
|
@@ -149,7 +156,6 @@ import os
|
|
149
156
|
|
150
157
|
async def write_stream():
|
151
158
|
client = AsyncCartesia(api_key=os.environ.get("CARTESIA_API_KEY"))
|
152
|
-
voice_name = "Barbershop Man"
|
153
159
|
voice_id = "a0e99841-438c-4a64-b679-ae501e7d6091"
|
154
160
|
voice = client.voices.get(id=voice_id)
|
155
161
|
transcript = "Hello! Welcome to Cartesia"
|
@@ -203,7 +209,6 @@ import pyaudio
|
|
203
209
|
import os
|
204
210
|
|
205
211
|
client = Cartesia(api_key=os.environ.get("CARTESIA_API_KEY"))
|
206
|
-
voice_name = "Barbershop Man"
|
207
212
|
voice_id = "a0e99841-438c-4a64-b679-ae501e7d6091"
|
208
213
|
voice = client.voices.get(id=voice_id)
|
209
214
|
transcript = "Hello! Welcome to Cartesia"
|
@@ -460,7 +465,6 @@ import pyaudio
|
|
460
465
|
import os
|
461
466
|
|
462
467
|
client = Cartesia(api_key=os.environ.get("CARTESIA_API_KEY"))
|
463
|
-
voice_name = "Barbershop Man"
|
464
468
|
voice_id = "a0e99841-438c-4a64-b679-ae501e7d6091"
|
465
469
|
voice = client.voices.get(id=voice_id)
|
466
470
|
|
@@ -8,8 +8,8 @@ from cartesia._constants import BACKOFF_FACTOR, MAX_RETRIES
|
|
8
8
|
from cartesia._logger import logger
|
9
9
|
from cartesia._sse import _SSE
|
10
10
|
from cartesia._types import OutputFormat, VoiceControls
|
11
|
-
from cartesia.tts import TTS
|
12
11
|
from cartesia.utils.retry import retry_on_connection_error_async
|
12
|
+
from cartesia.utils.tts import _construct_tts_request
|
13
13
|
|
14
14
|
|
15
15
|
class _AsyncSSE(_SSE):
|
@@ -37,27 +37,17 @@ class _AsyncSSE(_SSE):
|
|
37
37
|
stream: bool = True,
|
38
38
|
_experimental_voice_controls: Optional[VoiceControls] = None,
|
39
39
|
) -> Union[bytes, AsyncGenerator[bytes, None]]:
|
40
|
-
|
41
|
-
|
40
|
+
request_body = _construct_tts_request(
|
41
|
+
model_id=model_id,
|
42
|
+
transcript=transcript,
|
43
|
+
output_format=output_format,
|
44
|
+
voice_id=voice_id,
|
42
45
|
voice_embedding=voice_embedding,
|
43
|
-
|
46
|
+
duration=duration,
|
47
|
+
language=language,
|
48
|
+
_experimental_voice_controls=_experimental_voice_controls,
|
44
49
|
)
|
45
50
|
|
46
|
-
request_body = {
|
47
|
-
"model_id": model_id,
|
48
|
-
"transcript": transcript,
|
49
|
-
"voice": voice,
|
50
|
-
"output_format": {
|
51
|
-
"container": output_format["container"],
|
52
|
-
"encoding": output_format["encoding"],
|
53
|
-
"sample_rate": output_format["sample_rate"],
|
54
|
-
},
|
55
|
-
"language": language,
|
56
|
-
}
|
57
|
-
|
58
|
-
if duration is not None:
|
59
|
-
request_body["duration"] = duration
|
60
|
-
|
61
51
|
generator = self._sse_generator_wrapper(request_body)
|
62
52
|
|
63
53
|
if stream:
|
@@ -10,6 +10,7 @@ from cartesia._constants import DEFAULT_MODEL_ID, DEFAULT_VOICE_EMBEDDING
|
|
10
10
|
from cartesia._types import OutputFormat, VoiceControls
|
11
11
|
from cartesia._websocket import _WebSocket
|
12
12
|
from cartesia.tts import TTS
|
13
|
+
from cartesia.utils.tts import _construct_tts_request
|
13
14
|
|
14
15
|
|
15
16
|
class _AsyncTTSContext:
|
@@ -75,30 +76,20 @@ class _AsyncTTSContext:
|
|
75
76
|
|
76
77
|
await self._websocket.connect()
|
77
78
|
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
79
|
+
request_body = _construct_tts_request(
|
80
|
+
model_id=model_id,
|
81
|
+
transcript=transcript,
|
82
|
+
output_format=output_format,
|
83
|
+
voice_id=voice_id,
|
84
|
+
voice_embedding=voice_embedding,
|
85
|
+
duration=duration,
|
86
|
+
language=language,
|
87
|
+
context_id=self._context_id,
|
88
|
+
add_timestamps=add_timestamps,
|
89
|
+
continue_=continue_,
|
90
|
+
_experimental_voice_controls=_experimental_voice_controls,
|
82
91
|
)
|
83
92
|
|
84
|
-
request_body = {
|
85
|
-
"model_id": model_id,
|
86
|
-
"transcript": transcript,
|
87
|
-
"voice": voice,
|
88
|
-
"output_format": {
|
89
|
-
"container": output_format["container"],
|
90
|
-
"encoding": output_format["encoding"],
|
91
|
-
"sample_rate": output_format["sample_rate"],
|
92
|
-
},
|
93
|
-
"context_id": self._context_id,
|
94
|
-
"continue": continue_,
|
95
|
-
"language": language,
|
96
|
-
"add_timestamps": add_timestamps,
|
97
|
-
}
|
98
|
-
|
99
|
-
if duration is not None:
|
100
|
-
request_body["duration"] = duration
|
101
|
-
|
102
93
|
await self._websocket.websocket.send_json(request_body)
|
103
94
|
|
104
95
|
# Start listening for responses on the WebSocket
|
@@ -202,12 +193,11 @@ class _AsyncWebSocket(_WebSocket):
|
|
202
193
|
if self.websocket is None or self._is_websocket_closed():
|
203
194
|
route = "tts/websocket"
|
204
195
|
session = await self._get_session()
|
196
|
+
url = f"{self.ws_url}/{route}?api_key={self.api_key}&cartesia_version={self.cartesia_version}"
|
205
197
|
try:
|
206
|
-
self.websocket = await session.ws_connect(
|
207
|
-
f"{self.ws_url}/{route}?api_key={self.api_key}&cartesia_version={self.cartesia_version}"
|
208
|
-
)
|
198
|
+
self.websocket = await session.ws_connect(url)
|
209
199
|
except Exception as e:
|
210
|
-
raise RuntimeError(f"Failed to connect to WebSocket. {e}")
|
200
|
+
raise RuntimeError(f"Failed to connect to WebSocket at {url}. {e}")
|
211
201
|
|
212
202
|
def _is_websocket_closed(self):
|
213
203
|
return self.websocket.closed
|
@@ -8,7 +8,7 @@ from cartesia._constants import BACKOFF_FACTOR, MAX_RETRIES
|
|
8
8
|
from cartesia._logger import logger
|
9
9
|
from cartesia._types import OutputFormat, VoiceControls
|
10
10
|
from cartesia.utils.retry import retry_on_connection_error
|
11
|
-
from cartesia.utils.tts import _validate_and_construct_voice
|
11
|
+
from cartesia.utils.tts import _construct_tts_request, _validate_and_construct_voice
|
12
12
|
|
13
13
|
|
14
14
|
class _SSE:
|
@@ -84,25 +84,16 @@ class _SSE:
|
|
84
84
|
Both the generator and the dictionary contain the following key(s):
|
85
85
|
- audio: The audio as bytes.
|
86
86
|
"""
|
87
|
-
|
88
|
-
|
87
|
+
request_body = _construct_tts_request(
|
88
|
+
model_id=model_id,
|
89
|
+
transcript=transcript,
|
90
|
+
output_format=output_format,
|
91
|
+
voice_id=voice_id,
|
89
92
|
voice_embedding=voice_embedding,
|
90
|
-
|
93
|
+
duration=duration,
|
94
|
+
language=language,
|
95
|
+
_experimental_voice_controls=_experimental_voice_controls,
|
91
96
|
)
|
92
|
-
request_body = {
|
93
|
-
"model_id": model_id,
|
94
|
-
"transcript": transcript,
|
95
|
-
"voice": voice,
|
96
|
-
"output_format": {
|
97
|
-
"container": output_format["container"],
|
98
|
-
"encoding": output_format["encoding"],
|
99
|
-
"sample_rate": output_format["sample_rate"],
|
100
|
-
},
|
101
|
-
"language": language,
|
102
|
-
}
|
103
|
-
|
104
|
-
if duration is not None:
|
105
|
-
request_body["duration"] = duration
|
106
97
|
|
107
98
|
generator = self._sse_generator_wrapper(request_body)
|
108
99
|
|
@@ -14,7 +14,7 @@ except ImportError:
|
|
14
14
|
from iterators import TimeoutIterator
|
15
15
|
|
16
16
|
from cartesia._types import EventType, OutputFormat, VoiceControls
|
17
|
-
from cartesia.utils.tts import _validate_and_construct_voice
|
17
|
+
from cartesia.utils.tts import _construct_tts_request
|
18
18
|
|
19
19
|
|
20
20
|
class _TTSContext:
|
@@ -81,29 +81,20 @@ class _TTSContext:
|
|
81
81
|
|
82
82
|
self._websocket.connect()
|
83
83
|
|
84
|
-
|
85
|
-
|
84
|
+
# Create the initial request body
|
85
|
+
request_body = _construct_tts_request(
|
86
|
+
model_id=model_id,
|
87
|
+
transcript=transcript,
|
88
|
+
output_format=output_format,
|
89
|
+
voice_id=voice_id,
|
86
90
|
voice_embedding=voice_embedding,
|
87
|
-
|
91
|
+
duration=duration,
|
92
|
+
language=language,
|
93
|
+
context_id=self._context_id,
|
94
|
+
add_timestamps=add_timestamps,
|
95
|
+
_experimental_voice_controls=_experimental_voice_controls,
|
88
96
|
)
|
89
97
|
|
90
|
-
# Create the initial request body
|
91
|
-
request_body = {
|
92
|
-
"model_id": model_id,
|
93
|
-
"voice": voice,
|
94
|
-
"output_format": {
|
95
|
-
"container": output_format["container"],
|
96
|
-
"encoding": output_format["encoding"],
|
97
|
-
"sample_rate": output_format["sample_rate"],
|
98
|
-
},
|
99
|
-
"context_id": self._context_id,
|
100
|
-
"language": language,
|
101
|
-
"add_timestamps": add_timestamps,
|
102
|
-
}
|
103
|
-
|
104
|
-
if duration is not None:
|
105
|
-
request_body["duration"] = duration
|
106
|
-
|
107
98
|
try:
|
108
99
|
# Create an iterator with a timeout to get text chunks
|
109
100
|
text_iterator = TimeoutIterator(
|
@@ -303,29 +294,19 @@ class _WebSocket:
|
|
303
294
|
if context_id is None:
|
304
295
|
context_id = str(uuid.uuid4())
|
305
296
|
|
306
|
-
|
307
|
-
|
297
|
+
request_body = _construct_tts_request(
|
298
|
+
model_id=model_id,
|
299
|
+
transcript=transcript,
|
300
|
+
output_format=output_format,
|
301
|
+
voice_id=voice_id,
|
308
302
|
voice_embedding=voice_embedding,
|
309
|
-
|
303
|
+
context_id=context_id,
|
304
|
+
duration=duration,
|
305
|
+
language=language,
|
306
|
+
add_timestamps=add_timestamps,
|
307
|
+
_experimental_voice_controls=_experimental_voice_controls,
|
310
308
|
)
|
311
309
|
|
312
|
-
request_body = {
|
313
|
-
"model_id": model_id,
|
314
|
-
"transcript": transcript,
|
315
|
-
"voice": voice,
|
316
|
-
"output_format": {
|
317
|
-
"container": output_format["container"],
|
318
|
-
"encoding": output_format["encoding"],
|
319
|
-
"sample_rate": output_format["sample_rate"],
|
320
|
-
},
|
321
|
-
"context_id": context_id,
|
322
|
-
"language": language,
|
323
|
-
"add_timestamps": add_timestamps,
|
324
|
-
}
|
325
|
-
|
326
|
-
if duration is not None:
|
327
|
-
request_body["duration"] = duration
|
328
|
-
|
329
310
|
generator = self._websocket_generator(request_body)
|
330
311
|
|
331
312
|
if stream:
|
@@ -0,0 +1,63 @@
|
|
1
|
+
from typing import Iterator, List, Optional
|
2
|
+
|
3
|
+
import httpx
|
4
|
+
from cartesia._async_sse import _AsyncSSE
|
5
|
+
from cartesia._async_websocket import _AsyncWebSocket
|
6
|
+
from cartesia._types import OutputFormat, VoiceControls
|
7
|
+
from cartesia.tts import TTS
|
8
|
+
from cartesia.utils.tts import _construct_tts_request
|
9
|
+
|
10
|
+
|
11
|
+
class AsyncTTS(TTS):
|
12
|
+
def __init__(self, api_key, base_url, timeout, get_session):
|
13
|
+
super().__init__(api_key, base_url, timeout)
|
14
|
+
self._get_session = get_session
|
15
|
+
self._sse_class = _AsyncSSE(self._http_url(), self.headers, self.timeout, get_session)
|
16
|
+
self.sse = self._sse_class.send
|
17
|
+
|
18
|
+
async def websocket(self) -> _AsyncWebSocket:
|
19
|
+
ws = _AsyncWebSocket(
|
20
|
+
self._ws_url(),
|
21
|
+
self.api_key,
|
22
|
+
self.cartesia_version,
|
23
|
+
self.timeout,
|
24
|
+
self._get_session,
|
25
|
+
)
|
26
|
+
await ws.connect()
|
27
|
+
return ws
|
28
|
+
|
29
|
+
async def bytes(
|
30
|
+
self,
|
31
|
+
*,
|
32
|
+
model_id: str,
|
33
|
+
transcript: str,
|
34
|
+
output_format: OutputFormat,
|
35
|
+
voice_id: Optional[str] = None,
|
36
|
+
voice_embedding: Optional[List[float]] = None,
|
37
|
+
duration: Optional[int] = None,
|
38
|
+
language: Optional[str] = None,
|
39
|
+
_experimental_voice_controls: Optional[VoiceControls] = None,
|
40
|
+
) -> bytes:
|
41
|
+
request_body = _construct_tts_request(
|
42
|
+
model_id=model_id,
|
43
|
+
transcript=transcript,
|
44
|
+
output_format=output_format,
|
45
|
+
voice_id=voice_id,
|
46
|
+
voice_embedding=voice_embedding,
|
47
|
+
duration=duration,
|
48
|
+
language=language,
|
49
|
+
_experimental_voice_controls=_experimental_voice_controls,
|
50
|
+
)
|
51
|
+
|
52
|
+
async with httpx.AsyncClient() as client:
|
53
|
+
response = await client.post(
|
54
|
+
f"{self._http_url()}/tts/bytes",
|
55
|
+
headers=self.headers,
|
56
|
+
timeout=self.timeout,
|
57
|
+
json=request_body,
|
58
|
+
)
|
59
|
+
|
60
|
+
if not response.is_success:
|
61
|
+
raise ValueError(f"Failed to generate audio. Error: {response.text}")
|
62
|
+
|
63
|
+
return response.content
|
@@ -1,4 +1,6 @@
|
|
1
|
-
from typing import List, Optional
|
1
|
+
from typing import Iterator, List, Optional
|
2
|
+
|
3
|
+
import httpx
|
2
4
|
|
3
5
|
from cartesia._sse import _SSE
|
4
6
|
from cartesia._types import (
|
@@ -9,7 +11,7 @@ from cartesia._types import (
|
|
9
11
|
)
|
10
12
|
from cartesia._websocket import _WebSocket
|
11
13
|
from cartesia.resource import Resource
|
12
|
-
from cartesia.utils.tts import _validate_and_construct_voice
|
14
|
+
from cartesia.utils.tts import _construct_tts_request, _validate_and_construct_voice
|
13
15
|
|
14
16
|
|
15
17
|
class TTS(Resource):
|
@@ -34,6 +36,41 @@ class TTS(Resource):
|
|
34
36
|
ws.connect()
|
35
37
|
return ws
|
36
38
|
|
39
|
+
def bytes(
|
40
|
+
self,
|
41
|
+
*,
|
42
|
+
model_id: str,
|
43
|
+
transcript: str,
|
44
|
+
output_format: OutputFormat,
|
45
|
+
voice_id: Optional[str] = None,
|
46
|
+
voice_embedding: Optional[List[float]] = None,
|
47
|
+
duration: Optional[int] = None,
|
48
|
+
language: Optional[str] = None,
|
49
|
+
_experimental_voice_controls: Optional[VoiceControls] = None,
|
50
|
+
) -> bytes:
|
51
|
+
request_body = _construct_tts_request(
|
52
|
+
model_id=model_id,
|
53
|
+
transcript=transcript,
|
54
|
+
output_format=output_format,
|
55
|
+
voice_id=voice_id,
|
56
|
+
voice_embedding=voice_embedding,
|
57
|
+
duration=duration,
|
58
|
+
language=language,
|
59
|
+
_experimental_voice_controls=_experimental_voice_controls,
|
60
|
+
)
|
61
|
+
|
62
|
+
response = httpx.post(
|
63
|
+
f"{self._http_url()}/tts/bytes",
|
64
|
+
headers=self.headers,
|
65
|
+
timeout=self.timeout,
|
66
|
+
json=request_body,
|
67
|
+
)
|
68
|
+
|
69
|
+
if not response.is_success:
|
70
|
+
raise ValueError(f"Failed to generate audio. Error: {response.text}")
|
71
|
+
|
72
|
+
return response.content
|
73
|
+
|
37
74
|
@staticmethod
|
38
75
|
def get_output_format(output_format_name: str) -> OutputFormat:
|
39
76
|
"""Convenience method to get the output_format dictionary from a given output format name.
|
@@ -0,0 +1,74 @@
|
|
1
|
+
from typing import List, Optional
|
2
|
+
|
3
|
+
from cartesia._types import OutputFormat, VoiceControls
|
4
|
+
|
5
|
+
|
6
|
+
def _validate_and_construct_voice(
|
7
|
+
voice_id: Optional[str] = None,
|
8
|
+
voice_embedding: Optional[List[float]] = None,
|
9
|
+
experimental_voice_controls: Optional[VoiceControls] = None,
|
10
|
+
) -> dict:
|
11
|
+
if voice_id is None and voice_embedding is None:
|
12
|
+
raise ValueError("Either voice_id or voice_embedding must be specified.")
|
13
|
+
|
14
|
+
voice = {}
|
15
|
+
|
16
|
+
if voice_id is not None:
|
17
|
+
voice["id"] = voice_id
|
18
|
+
|
19
|
+
if voice_embedding is not None:
|
20
|
+
voice["embedding"] = voice_embedding
|
21
|
+
|
22
|
+
if experimental_voice_controls is not None:
|
23
|
+
voice["__experimental_controls"] = experimental_voice_controls
|
24
|
+
|
25
|
+
return voice
|
26
|
+
|
27
|
+
|
28
|
+
def _construct_tts_request(
|
29
|
+
*,
|
30
|
+
model_id: str,
|
31
|
+
output_format: OutputFormat,
|
32
|
+
transcript: Optional[str] = None,
|
33
|
+
voice_id: Optional[str] = None,
|
34
|
+
voice_embedding: Optional[List[float]] = None,
|
35
|
+
duration: Optional[int] = None,
|
36
|
+
language: Optional[str] = None,
|
37
|
+
add_timestamps: bool = False,
|
38
|
+
context_id: Optional[str] = None,
|
39
|
+
continue_: bool = False,
|
40
|
+
_experimental_voice_controls: Optional[VoiceControls] = None,
|
41
|
+
):
|
42
|
+
tts_request = {
|
43
|
+
"model_id": model_id,
|
44
|
+
"voice": _validate_and_construct_voice(
|
45
|
+
voice_id,
|
46
|
+
voice_embedding=voice_embedding,
|
47
|
+
experimental_voice_controls=_experimental_voice_controls,
|
48
|
+
),
|
49
|
+
"output_format": {
|
50
|
+
"container": output_format["container"],
|
51
|
+
"encoding": output_format["encoding"],
|
52
|
+
"sample_rate": output_format["sample_rate"],
|
53
|
+
},
|
54
|
+
}
|
55
|
+
|
56
|
+
if language is not None:
|
57
|
+
tts_request["language"] = language
|
58
|
+
|
59
|
+
if transcript is not None:
|
60
|
+
tts_request["transcript"] = transcript
|
61
|
+
|
62
|
+
if duration is not None:
|
63
|
+
tts_request["duration"] = duration
|
64
|
+
|
65
|
+
if add_timestamps:
|
66
|
+
tts_request["add_timestamps"] = add_timestamps
|
67
|
+
|
68
|
+
if context_id is not None:
|
69
|
+
tts_request["context_id"] = context_id
|
70
|
+
|
71
|
+
if continue_:
|
72
|
+
tts_request["continue"] = continue_
|
73
|
+
|
74
|
+
return tts_request
|
@@ -0,0 +1 @@
|
|
1
|
+
__version__ = "1.1.0-dev0"
|