cartesia 1.0.14__tar.gz → 1.1.0.dev0__tar.gz

This diff shows the differences between two publicly available versions of the package, as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the package versions as they appear in their respective public registries.
Files changed (37)
  1. {cartesia-1.0.14/cartesia.egg-info → cartesia-1.1.0.dev0}/PKG-INFO +32 -15
  2. cartesia-1.0.14/PKG-INFO → cartesia-1.1.0.dev0/README.md +25 -21
  3. {cartesia-1.0.14 → cartesia-1.1.0.dev0}/cartesia/_async_sse.py +9 -19
  4. {cartesia-1.0.14 → cartesia-1.1.0.dev0}/cartesia/_async_websocket.py +16 -26
  5. {cartesia-1.0.14 → cartesia-1.1.0.dev0}/cartesia/_sse.py +9 -18
  6. {cartesia-1.0.14 → cartesia-1.1.0.dev0}/cartesia/_websocket.py +22 -41
  7. cartesia-1.1.0.dev0/cartesia/async_tts.py +63 -0
  8. {cartesia-1.0.14 → cartesia-1.1.0.dev0}/cartesia/tts.py +39 -2
  9. cartesia-1.1.0.dev0/cartesia/utils/tts.py +74 -0
  10. cartesia-1.1.0.dev0/cartesia/version.py +1 -0
  11. cartesia-1.0.14/README.md → cartesia-1.1.0.dev0/cartesia.egg-info/PKG-INFO +38 -4
  12. {cartesia-1.0.14 → cartesia-1.1.0.dev0}/cartesia.egg-info/SOURCES.txt +0 -1
  13. cartesia-1.1.0.dev0/cartesia.egg-info/requires.txt +5 -0
  14. cartesia-1.1.0.dev0/pyproject.toml +84 -0
  15. {cartesia-1.0.14 → cartesia-1.1.0.dev0}/tests/test_tts.py +60 -22
  16. cartesia-1.0.14/cartesia/async_tts.py +0 -22
  17. cartesia-1.0.14/cartesia/utils/tts.py +0 -25
  18. cartesia-1.0.14/cartesia/version.py +0 -1
  19. cartesia-1.0.14/cartesia.egg-info/requires.txt +0 -26
  20. cartesia-1.0.14/pyproject.toml +0 -56
  21. cartesia-1.0.14/setup.py +0 -292
  22. {cartesia-1.0.14 → cartesia-1.1.0.dev0}/LICENSE.md +0 -0
  23. {cartesia-1.0.14 → cartesia-1.1.0.dev0}/cartesia/__init__.py +0 -0
  24. {cartesia-1.0.14 → cartesia-1.1.0.dev0}/cartesia/_constants.py +0 -0
  25. {cartesia-1.0.14 → cartesia-1.1.0.dev0}/cartesia/_logger.py +0 -0
  26. {cartesia-1.0.14 → cartesia-1.1.0.dev0}/cartesia/_types.py +0 -0
  27. {cartesia-1.0.14 → cartesia-1.1.0.dev0}/cartesia/async_client.py +0 -0
  28. {cartesia-1.0.14 → cartesia-1.1.0.dev0}/cartesia/client.py +0 -0
  29. {cartesia-1.0.14 → cartesia-1.1.0.dev0}/cartesia/resource.py +0 -0
  30. {cartesia-1.0.14 → cartesia-1.1.0.dev0}/cartesia/utils/__init__.py +0 -0
  31. {cartesia-1.0.14 → cartesia-1.1.0.dev0}/cartesia/utils/deprecated.py +0 -0
  32. {cartesia-1.0.14 → cartesia-1.1.0.dev0}/cartesia/utils/retry.py +0 -0
  33. {cartesia-1.0.14 → cartesia-1.1.0.dev0}/cartesia/voices.py +0 -0
  34. {cartesia-1.0.14 → cartesia-1.1.0.dev0}/cartesia.egg-info/dependency_links.txt +0 -0
  35. {cartesia-1.0.14 → cartesia-1.1.0.dev0}/cartesia.egg-info/top_level.txt +0 -0
  36. {cartesia-1.0.14 → cartesia-1.1.0.dev0}/setup.cfg +0 -0
  37. {cartesia-1.0.14 → cartesia-1.1.0.dev0}/tests/test_deprecated.py +0 -0
@@ -1,19 +1,15 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: cartesia
3
- Version: 1.0.14
3
+ Version: 1.1.0.dev0
4
4
  Summary: The official Python library for the Cartesia API.
5
- Home-page:
6
- Author: Cartesia, Inc.
7
- Author-email: support@cartesia.ai
8
- Classifier: Programming Language :: Python
9
- Classifier: Programming Language :: Python :: 3
10
- Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
11
- Requires-Python: >=3.8.0
5
+ Requires-Python: >=3.9
12
6
  Description-Content-Type: text/markdown
13
- Provides-Extra: dev
14
- Provides-Extra: all
15
7
  License-File: LICENSE.md
16
-
8
+ Requires-Dist: aiohttp>=3.10.10
9
+ Requires-Dist: httpx>=0.27.2
10
+ Requires-Dist: iterators>=0.2.0
11
+ Requires-Dist: requests>=2.32.3
12
+ Requires-Dist: websockets>=13.1
17
13
 
18
14
  # Cartesia Python API Library
19
15
 
@@ -30,6 +26,7 @@ The official Cartesia Python library which provides convenient access to the Car
30
26
  - [Installation](#installation)
31
27
  - [Voices](#voices)
32
28
  - [Text-to-Speech](#text-to-speech)
29
+ - [Bytes](#bytes)
33
30
  - [Server-Sent Events (SSE)](#server-sent-events-sse)
34
31
  - [WebSocket](#websocket)
35
32
  - [Conditioning speech on previous generations using WebSocket](#conditioning-speech-on-previous-generations-using-websocket)
@@ -88,6 +85,30 @@ new_voice = client.voices.create(
88
85
 
89
86
  ## Text-to-Speech
90
87
 
88
+ ### Bytes
89
+
90
+ ```python
91
+ from cartesia import Cartesia
92
+ import os
93
+
94
+ client = Cartesia(api_key=os.environ.get("CARTESIA_API_KEY"))
95
+
96
+ data = client.tts.bytes(
97
+ model_id="sonic-english",
98
+ transcript="Hello, world! I'm generating audio on Cartesia.",
99
+ voice_id="a0e99841-438c-4a64-b679-ae501e7d6091", # Barbershop Man
100
+ # You can find the supported `output_format`s at https://docs.cartesia.ai/api-reference/tts/bytes
101
+ output_format={
102
+ "container": "wav",
103
+ "encoding": "pcm_f32le",
104
+ "sample_rate": 44100,
105
+ },
106
+ )
107
+
108
+ with open("output.wav", "wb") as f:
109
+ f.write(data)
110
+ ```
111
+
91
112
  ### Server-Sent Events (SSE)
92
113
 
93
114
  ```python
@@ -96,7 +117,6 @@ import pyaudio
96
117
  import os
97
118
 
98
119
  client = Cartesia(api_key=os.environ.get("CARTESIA_API_KEY"))
99
- voice_name = "Barbershop Man"
100
120
  voice_id = "a0e99841-438c-4a64-b679-ae501e7d6091"
101
121
  voice = client.voices.get(id=voice_id)
102
122
 
@@ -149,7 +169,6 @@ import os
149
169
 
150
170
  async def write_stream():
151
171
  client = AsyncCartesia(api_key=os.environ.get("CARTESIA_API_KEY"))
152
- voice_name = "Barbershop Man"
153
172
  voice_id = "a0e99841-438c-4a64-b679-ae501e7d6091"
154
173
  voice = client.voices.get(id=voice_id)
155
174
  transcript = "Hello! Welcome to Cartesia"
@@ -203,7 +222,6 @@ import pyaudio
203
222
  import os
204
223
 
205
224
  client = Cartesia(api_key=os.environ.get("CARTESIA_API_KEY"))
206
- voice_name = "Barbershop Man"
207
225
  voice_id = "a0e99841-438c-4a64-b679-ae501e7d6091"
208
226
  voice = client.voices.get(id=voice_id)
209
227
  transcript = "Hello! Welcome to Cartesia"
@@ -460,7 +478,6 @@ import pyaudio
460
478
  import os
461
479
 
462
480
  client = Cartesia(api_key=os.environ.get("CARTESIA_API_KEY"))
463
- voice_name = "Barbershop Man"
464
481
  voice_id = "a0e99841-438c-4a64-b679-ae501e7d6091"
465
482
  voice = client.voices.get(id=voice_id)
466
483
 
@@ -1,20 +1,3 @@
1
- Metadata-Version: 2.1
2
- Name: cartesia
3
- Version: 1.0.14
4
- Summary: The official Python library for the Cartesia API.
5
- Home-page:
6
- Author: Cartesia, Inc.
7
- Author-email: support@cartesia.ai
8
- Classifier: Programming Language :: Python
9
- Classifier: Programming Language :: Python :: 3
10
- Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
11
- Requires-Python: >=3.8.0
12
- Description-Content-Type: text/markdown
13
- Provides-Extra: dev
14
- Provides-Extra: all
15
- License-File: LICENSE.md
16
-
17
-
18
1
  # Cartesia Python API Library
19
2
 
20
3
  ![PyPI - Version](https://img.shields.io/pypi/v/cartesia)
@@ -30,6 +13,7 @@ The official Cartesia Python library which provides convenient access to the Car
30
13
  - [Installation](#installation)
31
14
  - [Voices](#voices)
32
15
  - [Text-to-Speech](#text-to-speech)
16
+ - [Bytes](#bytes)
33
17
  - [Server-Sent Events (SSE)](#server-sent-events-sse)
34
18
  - [WebSocket](#websocket)
35
19
  - [Conditioning speech on previous generations using WebSocket](#conditioning-speech-on-previous-generations-using-websocket)
@@ -88,6 +72,30 @@ new_voice = client.voices.create(
88
72
 
89
73
  ## Text-to-Speech
90
74
 
75
+ ### Bytes
76
+
77
+ ```python
78
+ from cartesia import Cartesia
79
+ import os
80
+
81
+ client = Cartesia(api_key=os.environ.get("CARTESIA_API_KEY"))
82
+
83
+ data = client.tts.bytes(
84
+ model_id="sonic-english",
85
+ transcript="Hello, world! I'm generating audio on Cartesia.",
86
+ voice_id="a0e99841-438c-4a64-b679-ae501e7d6091", # Barbershop Man
87
+ # You can find the supported `output_format`s at https://docs.cartesia.ai/api-reference/tts/bytes
88
+ output_format={
89
+ "container": "wav",
90
+ "encoding": "pcm_f32le",
91
+ "sample_rate": 44100,
92
+ },
93
+ )
94
+
95
+ with open("output.wav", "wb") as f:
96
+ f.write(data)
97
+ ```
98
+
91
99
  ### Server-Sent Events (SSE)
92
100
 
93
101
  ```python
@@ -96,7 +104,6 @@ import pyaudio
96
104
  import os
97
105
 
98
106
  client = Cartesia(api_key=os.environ.get("CARTESIA_API_KEY"))
99
- voice_name = "Barbershop Man"
100
107
  voice_id = "a0e99841-438c-4a64-b679-ae501e7d6091"
101
108
  voice = client.voices.get(id=voice_id)
102
109
 
@@ -149,7 +156,6 @@ import os
149
156
 
150
157
  async def write_stream():
151
158
  client = AsyncCartesia(api_key=os.environ.get("CARTESIA_API_KEY"))
152
- voice_name = "Barbershop Man"
153
159
  voice_id = "a0e99841-438c-4a64-b679-ae501e7d6091"
154
160
  voice = client.voices.get(id=voice_id)
155
161
  transcript = "Hello! Welcome to Cartesia"
@@ -203,7 +209,6 @@ import pyaudio
203
209
  import os
204
210
 
205
211
  client = Cartesia(api_key=os.environ.get("CARTESIA_API_KEY"))
206
- voice_name = "Barbershop Man"
207
212
  voice_id = "a0e99841-438c-4a64-b679-ae501e7d6091"
208
213
  voice = client.voices.get(id=voice_id)
209
214
  transcript = "Hello! Welcome to Cartesia"
@@ -460,7 +465,6 @@ import pyaudio
460
465
  import os
461
466
 
462
467
  client = Cartesia(api_key=os.environ.get("CARTESIA_API_KEY"))
463
- voice_name = "Barbershop Man"
464
468
  voice_id = "a0e99841-438c-4a64-b679-ae501e7d6091"
465
469
  voice = client.voices.get(id=voice_id)
466
470
 
@@ -8,8 +8,8 @@ from cartesia._constants import BACKOFF_FACTOR, MAX_RETRIES
8
8
  from cartesia._logger import logger
9
9
  from cartesia._sse import _SSE
10
10
  from cartesia._types import OutputFormat, VoiceControls
11
- from cartesia.tts import TTS
12
11
  from cartesia.utils.retry import retry_on_connection_error_async
12
+ from cartesia.utils.tts import _construct_tts_request
13
13
 
14
14
 
15
15
  class _AsyncSSE(_SSE):
@@ -37,27 +37,17 @@ class _AsyncSSE(_SSE):
37
37
  stream: bool = True,
38
38
  _experimental_voice_controls: Optional[VoiceControls] = None,
39
39
  ) -> Union[bytes, AsyncGenerator[bytes, None]]:
40
- voice = TTS._validate_and_construct_voice(
41
- voice_id,
40
+ request_body = _construct_tts_request(
41
+ model_id=model_id,
42
+ transcript=transcript,
43
+ output_format=output_format,
44
+ voice_id=voice_id,
42
45
  voice_embedding=voice_embedding,
43
- experimental_voice_controls=_experimental_voice_controls,
46
+ duration=duration,
47
+ language=language,
48
+ _experimental_voice_controls=_experimental_voice_controls,
44
49
  )
45
50
 
46
- request_body = {
47
- "model_id": model_id,
48
- "transcript": transcript,
49
- "voice": voice,
50
- "output_format": {
51
- "container": output_format["container"],
52
- "encoding": output_format["encoding"],
53
- "sample_rate": output_format["sample_rate"],
54
- },
55
- "language": language,
56
- }
57
-
58
- if duration is not None:
59
- request_body["duration"] = duration
60
-
61
51
  generator = self._sse_generator_wrapper(request_body)
62
52
 
63
53
  if stream:
@@ -10,6 +10,7 @@ from cartesia._constants import DEFAULT_MODEL_ID, DEFAULT_VOICE_EMBEDDING
10
10
  from cartesia._types import OutputFormat, VoiceControls
11
11
  from cartesia._websocket import _WebSocket
12
12
  from cartesia.tts import TTS
13
+ from cartesia.utils.tts import _construct_tts_request
13
14
 
14
15
 
15
16
  class _AsyncTTSContext:
@@ -75,30 +76,20 @@ class _AsyncTTSContext:
75
76
 
76
77
  await self._websocket.connect()
77
78
 
78
- voice = TTS._validate_and_construct_voice(
79
- voice_id,
80
- voice_embedding,
81
- experimental_voice_controls=_experimental_voice_controls,
79
+ request_body = _construct_tts_request(
80
+ model_id=model_id,
81
+ transcript=transcript,
82
+ output_format=output_format,
83
+ voice_id=voice_id,
84
+ voice_embedding=voice_embedding,
85
+ duration=duration,
86
+ language=language,
87
+ context_id=self._context_id,
88
+ add_timestamps=add_timestamps,
89
+ continue_=continue_,
90
+ _experimental_voice_controls=_experimental_voice_controls,
82
91
  )
83
92
 
84
- request_body = {
85
- "model_id": model_id,
86
- "transcript": transcript,
87
- "voice": voice,
88
- "output_format": {
89
- "container": output_format["container"],
90
- "encoding": output_format["encoding"],
91
- "sample_rate": output_format["sample_rate"],
92
- },
93
- "context_id": self._context_id,
94
- "continue": continue_,
95
- "language": language,
96
- "add_timestamps": add_timestamps,
97
- }
98
-
99
- if duration is not None:
100
- request_body["duration"] = duration
101
-
102
93
  await self._websocket.websocket.send_json(request_body)
103
94
 
104
95
  # Start listening for responses on the WebSocket
@@ -202,12 +193,11 @@ class _AsyncWebSocket(_WebSocket):
202
193
  if self.websocket is None or self._is_websocket_closed():
203
194
  route = "tts/websocket"
204
195
  session = await self._get_session()
196
+ url = f"{self.ws_url}/{route}?api_key={self.api_key}&cartesia_version={self.cartesia_version}"
205
197
  try:
206
- self.websocket = await session.ws_connect(
207
- f"{self.ws_url}/{route}?api_key={self.api_key}&cartesia_version={self.cartesia_version}"
208
- )
198
+ self.websocket = await session.ws_connect(url)
209
199
  except Exception as e:
210
- raise RuntimeError(f"Failed to connect to WebSocket. {e}")
200
+ raise RuntimeError(f"Failed to connect to WebSocket at {url}. {e}")
211
201
 
212
202
  def _is_websocket_closed(self):
213
203
  return self.websocket.closed
@@ -8,7 +8,7 @@ from cartesia._constants import BACKOFF_FACTOR, MAX_RETRIES
8
8
  from cartesia._logger import logger
9
9
  from cartesia._types import OutputFormat, VoiceControls
10
10
  from cartesia.utils.retry import retry_on_connection_error
11
- from cartesia.utils.tts import _validate_and_construct_voice
11
+ from cartesia.utils.tts import _construct_tts_request, _validate_and_construct_voice
12
12
 
13
13
 
14
14
  class _SSE:
@@ -84,25 +84,16 @@ class _SSE:
84
84
  Both the generator and the dictionary contain the following key(s):
85
85
  - audio: The audio as bytes.
86
86
  """
87
- voice = _validate_and_construct_voice(
88
- voice_id,
87
+ request_body = _construct_tts_request(
88
+ model_id=model_id,
89
+ transcript=transcript,
90
+ output_format=output_format,
91
+ voice_id=voice_id,
89
92
  voice_embedding=voice_embedding,
90
- experimental_voice_controls=_experimental_voice_controls,
93
+ duration=duration,
94
+ language=language,
95
+ _experimental_voice_controls=_experimental_voice_controls,
91
96
  )
92
- request_body = {
93
- "model_id": model_id,
94
- "transcript": transcript,
95
- "voice": voice,
96
- "output_format": {
97
- "container": output_format["container"],
98
- "encoding": output_format["encoding"],
99
- "sample_rate": output_format["sample_rate"],
100
- },
101
- "language": language,
102
- }
103
-
104
- if duration is not None:
105
- request_body["duration"] = duration
106
97
 
107
98
  generator = self._sse_generator_wrapper(request_body)
108
99
 
@@ -14,7 +14,7 @@ except ImportError:
14
14
  from iterators import TimeoutIterator
15
15
 
16
16
  from cartesia._types import EventType, OutputFormat, VoiceControls
17
- from cartesia.utils.tts import _validate_and_construct_voice
17
+ from cartesia.utils.tts import _construct_tts_request
18
18
 
19
19
 
20
20
  class _TTSContext:
@@ -81,29 +81,20 @@ class _TTSContext:
81
81
 
82
82
  self._websocket.connect()
83
83
 
84
- voice = _validate_and_construct_voice(
85
- voice_id,
84
+ # Create the initial request body
85
+ request_body = _construct_tts_request(
86
+ model_id=model_id,
87
+ transcript=transcript,
88
+ output_format=output_format,
89
+ voice_id=voice_id,
86
90
  voice_embedding=voice_embedding,
87
- experimental_voice_controls=_experimental_voice_controls,
91
+ duration=duration,
92
+ language=language,
93
+ context_id=self._context_id,
94
+ add_timestamps=add_timestamps,
95
+ _experimental_voice_controls=_experimental_voice_controls,
88
96
  )
89
97
 
90
- # Create the initial request body
91
- request_body = {
92
- "model_id": model_id,
93
- "voice": voice,
94
- "output_format": {
95
- "container": output_format["container"],
96
- "encoding": output_format["encoding"],
97
- "sample_rate": output_format["sample_rate"],
98
- },
99
- "context_id": self._context_id,
100
- "language": language,
101
- "add_timestamps": add_timestamps,
102
- }
103
-
104
- if duration is not None:
105
- request_body["duration"] = duration
106
-
107
98
  try:
108
99
  # Create an iterator with a timeout to get text chunks
109
100
  text_iterator = TimeoutIterator(
@@ -303,29 +294,19 @@ class _WebSocket:
303
294
  if context_id is None:
304
295
  context_id = str(uuid.uuid4())
305
296
 
306
- voice = _validate_and_construct_voice(
307
- voice_id,
297
+ request_body = _construct_tts_request(
298
+ model_id=model_id,
299
+ transcript=transcript,
300
+ output_format=output_format,
301
+ voice_id=voice_id,
308
302
  voice_embedding=voice_embedding,
309
- experimental_voice_controls=_experimental_voice_controls,
303
+ context_id=context_id,
304
+ duration=duration,
305
+ language=language,
306
+ add_timestamps=add_timestamps,
307
+ _experimental_voice_controls=_experimental_voice_controls,
310
308
  )
311
309
 
312
- request_body = {
313
- "model_id": model_id,
314
- "transcript": transcript,
315
- "voice": voice,
316
- "output_format": {
317
- "container": output_format["container"],
318
- "encoding": output_format["encoding"],
319
- "sample_rate": output_format["sample_rate"],
320
- },
321
- "context_id": context_id,
322
- "language": language,
323
- "add_timestamps": add_timestamps,
324
- }
325
-
326
- if duration is not None:
327
- request_body["duration"] = duration
328
-
329
310
  generator = self._websocket_generator(request_body)
330
311
 
331
312
  if stream:
@@ -0,0 +1,63 @@
1
+ from typing import Iterator, List, Optional
2
+
3
+ import httpx
4
+ from cartesia._async_sse import _AsyncSSE
5
+ from cartesia._async_websocket import _AsyncWebSocket
6
+ from cartesia._types import OutputFormat, VoiceControls
7
+ from cartesia.tts import TTS
8
+ from cartesia.utils.tts import _construct_tts_request
9
+
10
+
11
+ class AsyncTTS(TTS):
12
+ def __init__(self, api_key, base_url, timeout, get_session):
13
+ super().__init__(api_key, base_url, timeout)
14
+ self._get_session = get_session
15
+ self._sse_class = _AsyncSSE(self._http_url(), self.headers, self.timeout, get_session)
16
+ self.sse = self._sse_class.send
17
+
18
+ async def websocket(self) -> _AsyncWebSocket:
19
+ ws = _AsyncWebSocket(
20
+ self._ws_url(),
21
+ self.api_key,
22
+ self.cartesia_version,
23
+ self.timeout,
24
+ self._get_session,
25
+ )
26
+ await ws.connect()
27
+ return ws
28
+
29
+ async def bytes(
30
+ self,
31
+ *,
32
+ model_id: str,
33
+ transcript: str,
34
+ output_format: OutputFormat,
35
+ voice_id: Optional[str] = None,
36
+ voice_embedding: Optional[List[float]] = None,
37
+ duration: Optional[int] = None,
38
+ language: Optional[str] = None,
39
+ _experimental_voice_controls: Optional[VoiceControls] = None,
40
+ ) -> bytes:
41
+ request_body = _construct_tts_request(
42
+ model_id=model_id,
43
+ transcript=transcript,
44
+ output_format=output_format,
45
+ voice_id=voice_id,
46
+ voice_embedding=voice_embedding,
47
+ duration=duration,
48
+ language=language,
49
+ _experimental_voice_controls=_experimental_voice_controls,
50
+ )
51
+
52
+ async with httpx.AsyncClient() as client:
53
+ response = await client.post(
54
+ f"{self._http_url()}/tts/bytes",
55
+ headers=self.headers,
56
+ timeout=self.timeout,
57
+ json=request_body,
58
+ )
59
+
60
+ if not response.is_success:
61
+ raise ValueError(f"Failed to generate audio. Error: {response.text}")
62
+
63
+ return response.content
@@ -1,4 +1,6 @@
1
- from typing import List, Optional
1
+ from typing import Iterator, List, Optional
2
+
3
+ import httpx
2
4
 
3
5
  from cartesia._sse import _SSE
4
6
  from cartesia._types import (
@@ -9,7 +11,7 @@ from cartesia._types import (
9
11
  )
10
12
  from cartesia._websocket import _WebSocket
11
13
  from cartesia.resource import Resource
12
- from cartesia.utils.tts import _validate_and_construct_voice
14
+ from cartesia.utils.tts import _construct_tts_request, _validate_and_construct_voice
13
15
 
14
16
 
15
17
  class TTS(Resource):
@@ -34,6 +36,41 @@ class TTS(Resource):
34
36
  ws.connect()
35
37
  return ws
36
38
 
39
+ def bytes(
40
+ self,
41
+ *,
42
+ model_id: str,
43
+ transcript: str,
44
+ output_format: OutputFormat,
45
+ voice_id: Optional[str] = None,
46
+ voice_embedding: Optional[List[float]] = None,
47
+ duration: Optional[int] = None,
48
+ language: Optional[str] = None,
49
+ _experimental_voice_controls: Optional[VoiceControls] = None,
50
+ ) -> bytes:
51
+ request_body = _construct_tts_request(
52
+ model_id=model_id,
53
+ transcript=transcript,
54
+ output_format=output_format,
55
+ voice_id=voice_id,
56
+ voice_embedding=voice_embedding,
57
+ duration=duration,
58
+ language=language,
59
+ _experimental_voice_controls=_experimental_voice_controls,
60
+ )
61
+
62
+ response = httpx.post(
63
+ f"{self._http_url()}/tts/bytes",
64
+ headers=self.headers,
65
+ timeout=self.timeout,
66
+ json=request_body,
67
+ )
68
+
69
+ if not response.is_success:
70
+ raise ValueError(f"Failed to generate audio. Error: {response.text}")
71
+
72
+ return response.content
73
+
37
74
  @staticmethod
38
75
  def get_output_format(output_format_name: str) -> OutputFormat:
39
76
  """Convenience method to get the output_format dictionary from a given output format name.
@@ -0,0 +1,74 @@
1
+ from typing import List, Optional
2
+
3
+ from cartesia._types import OutputFormat, VoiceControls
4
+
5
+
6
+ def _validate_and_construct_voice(
7
+ voice_id: Optional[str] = None,
8
+ voice_embedding: Optional[List[float]] = None,
9
+ experimental_voice_controls: Optional[VoiceControls] = None,
10
+ ) -> dict:
11
+ if voice_id is None and voice_embedding is None:
12
+ raise ValueError("Either voice_id or voice_embedding must be specified.")
13
+
14
+ voice = {}
15
+
16
+ if voice_id is not None:
17
+ voice["id"] = voice_id
18
+
19
+ if voice_embedding is not None:
20
+ voice["embedding"] = voice_embedding
21
+
22
+ if experimental_voice_controls is not None:
23
+ voice["__experimental_controls"] = experimental_voice_controls
24
+
25
+ return voice
26
+
27
+
28
+ def _construct_tts_request(
29
+ *,
30
+ model_id: str,
31
+ output_format: OutputFormat,
32
+ transcript: Optional[str] = None,
33
+ voice_id: Optional[str] = None,
34
+ voice_embedding: Optional[List[float]] = None,
35
+ duration: Optional[int] = None,
36
+ language: Optional[str] = None,
37
+ add_timestamps: bool = False,
38
+ context_id: Optional[str] = None,
39
+ continue_: bool = False,
40
+ _experimental_voice_controls: Optional[VoiceControls] = None,
41
+ ):
42
+ tts_request = {
43
+ "model_id": model_id,
44
+ "voice": _validate_and_construct_voice(
45
+ voice_id,
46
+ voice_embedding=voice_embedding,
47
+ experimental_voice_controls=_experimental_voice_controls,
48
+ ),
49
+ "output_format": {
50
+ "container": output_format["container"],
51
+ "encoding": output_format["encoding"],
52
+ "sample_rate": output_format["sample_rate"],
53
+ },
54
+ }
55
+
56
+ if language is not None:
57
+ tts_request["language"] = language
58
+
59
+ if transcript is not None:
60
+ tts_request["transcript"] = transcript
61
+
62
+ if duration is not None:
63
+ tts_request["duration"] = duration
64
+
65
+ if add_timestamps:
66
+ tts_request["add_timestamps"] = add_timestamps
67
+
68
+ if context_id is not None:
69
+ tts_request["context_id"] = context_id
70
+
71
+ if continue_:
72
+ tts_request["continue"] = continue_
73
+
74
+ return tts_request
@@ -0,0 +1 @@
1
+ __version__ = "1.1.0-dev0"