cartesia 1.3.0__tar.gz → 1.4.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36)
  1. {cartesia-1.3.0 → cartesia-1.4.0}/PKG-INFO +6 -4
  2. {cartesia-1.3.0 → cartesia-1.4.0}/cartesia/_types.py +1 -1
  3. {cartesia-1.3.0 → cartesia-1.4.0}/cartesia/_websocket.py +2 -2
  4. cartesia-1.4.0/cartesia/async_tts.py +176 -0
  5. cartesia-1.4.0/cartesia/tts.py +292 -0
  6. cartesia-1.4.0/cartesia/version.py +1 -0
  7. {cartesia-1.3.0 → cartesia-1.4.0}/cartesia/voices.py +1 -5
  8. {cartesia-1.3.0 → cartesia-1.4.0}/pyproject.toml +4 -3
  9. {cartesia-1.3.0 → cartesia-1.4.0}/tests/test_tts.py +0 -18
  10. {cartesia-1.3.0 → cartesia-1.4.0}/uv.lock +14 -3
  11. cartesia-1.3.0/cartesia/async_tts.py +0 -63
  12. cartesia-1.3.0/cartesia/tts.py +0 -137
  13. cartesia-1.3.0/cartesia/version.py +0 -1
  14. {cartesia-1.3.0 → cartesia-1.4.0}/.github/workflows/ci.yaml +0 -0
  15. {cartesia-1.3.0 → cartesia-1.4.0}/.github/workflows/publish.yaml +0 -0
  16. {cartesia-1.3.0 → cartesia-1.4.0}/.gitignore +0 -0
  17. {cartesia-1.3.0 → cartesia-1.4.0}/LICENSE.md +0 -0
  18. {cartesia-1.3.0 → cartesia-1.4.0}/Makefile +0 -0
  19. {cartesia-1.3.0 → cartesia-1.4.0}/README.md +0 -0
  20. {cartesia-1.3.0 → cartesia-1.4.0}/bumpversion.py +0 -0
  21. {cartesia-1.3.0 → cartesia-1.4.0}/cartesia/__init__.py +0 -0
  22. {cartesia-1.3.0 → cartesia-1.4.0}/cartesia/_async_sse.py +0 -0
  23. {cartesia-1.3.0 → cartesia-1.4.0}/cartesia/_async_websocket.py +0 -0
  24. {cartesia-1.3.0 → cartesia-1.4.0}/cartesia/_constants.py +0 -0
  25. {cartesia-1.3.0 → cartesia-1.4.0}/cartesia/_logger.py +0 -0
  26. {cartesia-1.3.0 → cartesia-1.4.0}/cartesia/_sse.py +0 -0
  27. {cartesia-1.3.0 → cartesia-1.4.0}/cartesia/async_client.py +0 -0
  28. {cartesia-1.3.0 → cartesia-1.4.0}/cartesia/client.py +0 -0
  29. {cartesia-1.3.0 → cartesia-1.4.0}/cartesia/resource.py +0 -0
  30. {cartesia-1.3.0 → cartesia-1.4.0}/cartesia/utils/__init__.py +0 -0
  31. {cartesia-1.3.0 → cartesia-1.4.0}/cartesia/utils/deprecated.py +0 -0
  32. {cartesia-1.3.0 → cartesia-1.4.0}/cartesia/utils/retry.py +0 -0
  33. {cartesia-1.3.0 → cartesia-1.4.0}/cartesia/utils/tts.py +0 -0
  34. {cartesia-1.3.0 → cartesia-1.4.0}/tests/__init__.py +0 -0
  35. {cartesia-1.3.0 → cartesia-1.4.0}/tests/resources/sample-speech-4s.wav +0 -0
  36. {cartesia-1.3.0 → cartesia-1.4.0}/tests/test_deprecated.py +0 -0
@@ -1,12 +1,14 @@
1
- Metadata-Version: 2.3
1
+ Metadata-Version: 2.4
2
2
  Name: cartesia
3
- Version: 1.3.0
3
+ Version: 1.4.0
4
4
  Summary: The official Python library for the Cartesia API.
5
+ License-File: LICENSE.md
5
6
  Requires-Python: >=3.9
6
7
  Requires-Dist: aiohttp>=3.10.10
7
- Requires-Dist: httpx>=0.27.2
8
+ Requires-Dist: httpx>=0.27.0
8
9
  Requires-Dist: iterators>=0.2.0
9
- Requires-Dist: requests>=2.32.3
10
+ Requires-Dist: pydub>=0.25.1
11
+ Requires-Dist: requests>=2.31.0
10
12
  Requires-Dist: websockets>=10.4
11
13
  Description-Content-Type: text/markdown
12
14
 
@@ -36,7 +36,6 @@ class VoiceMetadata(TypedDict):
36
36
  user_id: str
37
37
  created_at: str
38
38
  language: str
39
- base_voice_id: Optional[str] = None
40
39
 
41
40
 
42
41
  class VoiceControls(TypedDict):
@@ -62,6 +61,7 @@ class OutputFormat(TypedDict):
62
61
  container: str
63
62
  encoding: str
64
63
  sample_rate: int
64
+ bit_rate: Optional[int] = None
65
65
 
66
66
 
67
67
  class EventType:
@@ -121,7 +121,7 @@ class _TTSContext:
121
121
  raise RuntimeError(f"Error generating audio:\n{response['error']}")
122
122
  if response["done"]:
123
123
  break
124
- if response["data"]:
124
+ if "data" in response and response["data"]:
125
125
  yield self._websocket._convert_response(
126
126
  response=response, include_context_id=True
127
127
  )
@@ -138,7 +138,7 @@ class _TTSContext:
138
138
  raise RuntimeError(f"Error generating audio:\n{response['error']}")
139
139
  if response["done"]:
140
140
  break
141
- if response["data"]:
141
+ if "data" in response and response["data"]:
142
142
  yield self._websocket._convert_response(
143
143
  response=response, include_context_id=True
144
144
  )
@@ -0,0 +1,176 @@
1
+ from typing import Iterator, List, Optional, Tuple
2
+
3
+ import httpx
4
+ from cartesia._async_sse import _AsyncSSE
5
+ from cartesia._async_websocket import _AsyncWebSocket
6
+ from cartesia._types import OutputFormat, VoiceControls
7
+ from cartesia.tts import TTS
8
+ from cartesia.utils.tts import _construct_tts_request
9
+
10
+
11
class AsyncTTS(TTS):
    """Asynchronous variant of :class:`TTS`.

    Inherits the static helpers of ``TTS`` (including ``_concat_audio_segments``)
    and provides ``async`` versions of ``websocket``, ``bytes`` and ``infill``.
    ``sse`` is rebound to the ``send`` method of an ``_AsyncSSE`` instance.
    """

    def __init__(self, api_key, base_url, timeout, get_session):
        """Initialise the base resource and install the async SSE sender.

        Args:
            api_key: Forwarded to ``TTS.__init__``.
            base_url: Forwarded to ``TTS.__init__``.
            timeout: Forwarded to ``TTS.__init__``.
            get_session: Passed unchanged to ``_AsyncSSE`` and, later, to
                ``_AsyncWebSocket``.
        """
        super().__init__(api_key, base_url, timeout)
        self._get_session = get_session
        # Overwrite the ``_SSE``-based sender that ``TTS.__init__`` just set up.
        self._sse_class = _AsyncSSE(self._http_url(), self.headers, self.timeout, get_session)
        self.sse = self._sse_class.send

    async def websocket(self) -> _AsyncWebSocket:
        """Create an ``_AsyncWebSocket`` and connect it.

        Returns:
            _AsyncWebSocket: The websocket object, returned after its
            ``connect()`` coroutine has been awaited.
        """
        ws = _AsyncWebSocket(
            self._ws_url(),
            self.api_key,
            self.cartesia_version,
            self.timeout,
            self._get_session,
        )
        await ws.connect()
        return ws

    async def bytes(
        self,
        *,
        model_id: str,
        transcript: str,
        output_format: OutputFormat,
        voice_id: Optional[str] = None,
        voice_embedding: Optional[List[float]] = None,
        duration: Optional[int] = None,
        language: Optional[str] = None,
        _experimental_voice_controls: Optional[VoiceControls] = None,
    ) -> bytes:
        """POST a synthesis request to ``/tts/bytes`` and return the raw response body.

        All keyword arguments are forwarded unchanged to
        ``_construct_tts_request``, which builds the JSON request body.

        Returns:
            bytes: ``response.content`` of the successful HTTP response.

        Raises:
            ValueError: If the HTTP response is not successful.
        """
        request_body = _construct_tts_request(
            model_id=model_id,
            transcript=transcript,
            output_format=output_format,
            voice_id=voice_id,
            voice_embedding=voice_embedding,
            duration=duration,
            language=language,
            _experimental_voice_controls=_experimental_voice_controls,
        )

        async with httpx.AsyncClient() as client:
            response = await client.post(
                f"{self._http_url()}/tts/bytes",
                headers=self.headers,
                timeout=self.timeout,
                json=request_body,
            )

        if not response.is_success:
            raise ValueError(f"Failed to generate audio. Error: {response.text}")

        return response.content

    async def infill(
        self,
        *,
        model_id: str,
        language: str,
        transcript: str,
        voice_id: str,
        output_format: OutputFormat,
        left_audio_path: Optional[str] = None,
        right_audio_path: Optional[str] = None,
        experimental_voice_controls: Optional[VoiceControls] = None,
    ) -> Tuple[bytes, bytes]:
        """Generate infill audio between two existing audio segments.

        Args:
            model_id: The ID of the model to use for generating audio
            language: The language of the transcript
            transcript: The text to synthesize
            voice_id: The ID of the voice to use for generating audio
            output_format: The desired audio output format. ``bit_rate`` is only
                sent when it is present and not ``None``.
            left_audio_path: Path to the audio file that comes before the infill
            right_audio_path: Path to the audio file that comes after the infill
            experimental_voice_controls: Optional voice control parameters

        Returns:
            A tuple containing:
            - The generated infill audio (bytes)
            - The complete concatenated audio (bytes)

        Raises:
            ValueError: If neither audio path is given, if the HTTP response is
                not successful, or if the segments cannot be concatenated.
        """
        if not left_audio_path and not right_audio_path:
            raise ValueError("Must specify at least one of left_audio_path or right_audio_path")

        # Work on a copy so the shared headers keep their Content-Type; it is
        # dropped here presumably so httpx can emit its own multipart header.
        headers = self.headers.copy()
        headers.pop("Content-Type", None)

        # Construct form data with output_format fields directly
        data = {
            "model_id": model_id,
            "language": language,
            "transcript": transcript,
            "voice_id": voice_id,
            "output_format[container]": output_format["container"],
            "output_format[encoding]": output_format["encoding"],
            "output_format[sample_rate]": output_format["sample_rate"],
        }

        # Only forward bit_rate when it carries a value; a None would otherwise
        # be serialised as an empty form field.
        bit_rate = output_format.get("bit_rate")
        if bit_rate is not None:
            data["output_format[bit_rate]"] = bit_rate

        # Add voice controls if specified
        if experimental_voice_controls:
            if "speed" in experimental_voice_controls:
                data["voice[__experimental_controls][speed]"] = experimental_voice_controls[
                    "speed"
                ]
            if "emotion" in experimental_voice_controls:
                # Pass emotions as a list instead of individual values
                data["voice[__experimental_controls][emotion][]"] = experimental_voice_controls[
                    "emotion"
                ]

        left_audio_file = None
        right_audio_file = None
        try:
            files = {}
            if left_audio_path:
                left_audio_file = open(left_audio_path, "rb")
                files["left_audio"] = left_audio_file
            if right_audio_path:
                right_audio_file = open(right_audio_path, "rb")
                files["right_audio"] = right_audio_file

            async with httpx.AsyncClient() as client:
                response = await client.post(
                    f"{self._http_url()}/infill/bytes",
                    headers=headers,
                    timeout=self.timeout,
                    files=files,
                    data=data,
                )

            if not response.is_success:
                raise ValueError(
                    f"Failed to infill audio. Status Code: {response.status_code}\n"
                    f"Error: {response.text}"
                )

            # The upload consumed the file objects, so rewind before re-reading
            # them for local concatenation.
            left_audio = None
            if left_audio_file:
                left_audio_file.seek(0)
                left_audio = left_audio_file.read()

            right_audio = None
            if right_audio_file:
                right_audio_file.seek(0)
                right_audio = right_audio_file.read()

            infill_audio = response.content
            container = output_format["container"].lower()
            total_audio = self._concat_audio_segments(
                left_audio, infill_audio, right_audio, format=container
            )
            return infill_audio, total_audio

        finally:
            if left_audio_file:
                left_audio_file.close()
            if right_audio_file:
                right_audio_file.close()
@@ -0,0 +1,292 @@
1
+ import json
2
+ from typing import Iterator, List, Optional, Tuple
3
+
4
+ import httpx
5
+ import io
6
+ from pydub import AudioSegment
7
+
8
+ from cartesia._sse import _SSE
9
+ from cartesia._types import (
10
+ OutputFormat,
11
+ OutputFormatMapping,
12
+ VoiceControls,
13
+ )
14
+ from cartesia._websocket import _WebSocket
15
+ from cartesia.resource import Resource
16
+ from cartesia.utils.tts import _construct_tts_request, _validate_and_construct_voice
17
+
18
+
19
class TTS(Resource):
    """This resource contains methods to generate audio using Cartesia's text-to-speech API."""

    def __init__(self, api_key: str, base_url: str, timeout: float):
        """Initialise the resource and bind ``sse`` to an ``_SSE`` sender.

        Args:
            api_key: Forwarded to ``Resource.__init__``.
            base_url: Forwarded to ``Resource.__init__``.
            timeout: Forwarded to ``Resource.__init__``.
        """
        super().__init__(
            api_key=api_key,
            base_url=base_url,
            timeout=timeout,
        )
        self._sse_class = _SSE(self._http_url(), self.headers, self.timeout)
        self.sse = self._sse_class.send

    def websocket(self) -> _WebSocket:
        """This method returns a WebSocket object that can be used to generate audio using WebSocket.

        Returns:
            _WebSocket: A WebSocket object that can be used to generate audio using WebSocket.
        """
        ws = _WebSocket(self._ws_url(), self.api_key, self.cartesia_version)
        ws.connect()
        return ws

    def bytes(
        self,
        *,
        model_id: str,
        transcript: str,
        output_format: OutputFormat,
        voice_id: Optional[str] = None,
        voice_embedding: Optional[List[float]] = None,
        duration: Optional[int] = None,
        language: Optional[str] = None,
        _experimental_voice_controls: Optional[VoiceControls] = None,
    ) -> bytes:
        """POST a synthesis request to ``/tts/bytes`` and return the raw response body.

        All keyword arguments are forwarded unchanged to
        ``_construct_tts_request``, which builds the JSON request body.

        Returns:
            bytes: ``response.content`` of the successful HTTP response.

        Raises:
            ValueError: If the HTTP response is not successful.
        """
        request_body = _construct_tts_request(
            model_id=model_id,
            transcript=transcript,
            output_format=output_format,
            voice_id=voice_id,
            voice_embedding=voice_embedding,
            duration=duration,
            language=language,
            _experimental_voice_controls=_experimental_voice_controls,
        )

        response = httpx.post(
            f"{self._http_url()}/tts/bytes",
            headers=self.headers,
            timeout=self.timeout,
            json=request_body,
        )

        if not response.is_success:
            raise ValueError(f"Failed to generate audio. Error: {response.text}")

        return response.content

    @staticmethod
    def get_output_format(output_format_name: str) -> OutputFormat:
        """Convenience method to get the output_format dictionary from a given output format name.

        Args:
            output_format_name (str): The name of the output format.

        Returns:
            OutputFormat: A dictionary containing the details of the output format to be passed into tts.sse() or tts.websocket().send()

        Raises:
            ValueError: If the output_format name is not supported
        """
        if output_format_name not in OutputFormatMapping._format_mapping:
            raise ValueError(f"Unsupported format: {output_format_name}")

        output_format_obj = OutputFormatMapping.get_format(output_format_name)
        return OutputFormat(
            container=output_format_obj["container"],
            encoding=output_format_obj["encoding"],
            sample_rate=output_format_obj["sample_rate"],
        )

    @staticmethod
    def get_sample_rate(output_format_name: str) -> int:
        """Convenience method to get the sample rate for a given output format.

        Args:
            output_format_name (str): The name of the output format.

        Returns:
            int: The sample rate for the output format.

        Raises:
            ValueError: If the output_format name is not supported
        """
        if output_format_name not in OutputFormatMapping._format_mapping:
            raise ValueError(f"Unsupported format: {output_format_name}")

        return OutputFormatMapping.get_format(output_format_name)["sample_rate"]

    @staticmethod
    def _validate_and_construct_voice(
        voice_id: Optional[str] = None,
        voice_embedding: Optional[List[float]] = None,
        experimental_voice_controls: Optional[VoiceControls] = None,
    ) -> dict:
        """Validate and construct the voice dictionary for the request.

        Thin wrapper that delegates to the module-level
        ``_validate_and_construct_voice`` imported from ``cartesia.utils.tts``.

        Args:
            voice_id: The ID of the voice to use for generating audio.
            voice_embedding: The embedding of the voice to use for generating audio.
            experimental_voice_controls: Voice controls for emotion and speed.
                Note: This is an experimental feature and may rapidly change in the future.

        Returns:
            A dictionary representing the voice configuration.

        Raises:
            ValueError: If neither or both voice_id and voice_embedding are specified.
        """
        return _validate_and_construct_voice(voice_id, voice_embedding, experimental_voice_controls)

    def infill(
        self,
        *,
        model_id: str,
        language: str,
        transcript: str,
        voice_id: str,
        output_format: OutputFormat,
        left_audio_path: Optional[str] = None,
        right_audio_path: Optional[str] = None,
        experimental_voice_controls: Optional[VoiceControls] = None,
    ) -> Tuple[bytes, bytes]:
        """Generate infill audio between two existing audio segments.

        Args:
            model_id: The ID of the model to use for generating audio
            language: The language of the transcript
            transcript: The text to synthesize
            voice_id: The ID of the voice to use for generating audio
            output_format: The desired audio output format. ``bit_rate`` is only
                sent when it is present and not ``None``.
            left_audio_path: Path to the audio file that comes before the infill
            right_audio_path: Path to the audio file that comes after the infill
            experimental_voice_controls: Optional voice control parameters

        Returns:
            A tuple containing:
            - The generated infill audio (bytes)
            - The complete concatenated audio (bytes)

        Raises:
            ValueError: If neither audio path is given, if the HTTP response is
                not successful, or if the segments cannot be concatenated.
        """
        if not left_audio_path and not right_audio_path:
            raise ValueError("Must specify at least one of left_audio_path or right_audio_path")

        # Work on a copy so the shared headers keep their Content-Type; it is
        # dropped here presumably so httpx can emit its own multipart header.
        headers = self.headers.copy()
        headers.pop("Content-Type", None)

        # Construct form data with output_format fields directly
        data = {
            "model_id": model_id,
            "language": language,
            "transcript": transcript,
            "voice_id": voice_id,
            "output_format[container]": output_format["container"],
            "output_format[encoding]": output_format["encoding"],
            "output_format[sample_rate]": output_format["sample_rate"],
        }

        # Only forward bit_rate when it carries a value; a None would otherwise
        # be serialised as an empty form field.
        bit_rate = output_format.get("bit_rate")
        if bit_rate is not None:
            data["output_format[bit_rate]"] = bit_rate

        # Add voice controls if specified
        if experimental_voice_controls:
            if "speed" in experimental_voice_controls:
                data["voice[__experimental_controls][speed]"] = experimental_voice_controls[
                    "speed"
                ]
            if "emotion" in experimental_voice_controls:
                # Pass emotions as a list instead of individual values
                data["voice[__experimental_controls][emotion][]"] = experimental_voice_controls[
                    "emotion"
                ]

        left_audio_file = None
        right_audio_file = None
        try:
            files = {}
            if left_audio_path:
                left_audio_file = open(left_audio_path, "rb")
                files["left_audio"] = left_audio_file
            if right_audio_path:
                right_audio_file = open(right_audio_path, "rb")
                files["right_audio"] = right_audio_file

            response = httpx.post(
                f"{self._http_url()}/infill/bytes",
                headers=headers,
                timeout=self.timeout,
                files=files,
                data=data,
            )

            if not response.is_success:
                raise ValueError(
                    f"Failed to infill audio. Status Code: {response.status_code}\n"
                    f"Error: {response.text}"
                )

            # The upload consumed the file objects, so rewind before re-reading
            # them for local concatenation.
            left_audio = None
            if left_audio_file:
                left_audio_file.seek(0)
                left_audio = left_audio_file.read()

            right_audio = None
            if right_audio_file:
                right_audio_file.seek(0)
                right_audio = right_audio_file.read()

            infill_audio = response.content
            container = output_format["container"].lower()
            total_audio = self._concat_audio_segments(
                left_audio, infill_audio, right_audio, format=container
            )
            return infill_audio, total_audio

        finally:
            if left_audio_file:
                left_audio_file.close()
            if right_audio_file:
                right_audio_file.close()

    @staticmethod
    def _concat_audio_segments(
        left_audio: Optional[bytes],
        infill_audio: bytes,
        right_audio: Optional[bytes],
        format: str = "wav",
    ) -> bytes:
        """Helper method to concatenate three audio segments while preserving audio format and headers.

        Each segment is decoded with ``AudioSegment.from_file`` using ``format``,
        appended in left / infill / right order, and the result is exported
        again in the same ``format``.

        Args:
            left_audio: The audio segment that comes before the infill
            infill_audio: The generated infill audio segment
            right_audio: The audio segment that comes after the infill
            format: The audio format (e.g., 'wav', 'mp3'). Defaults to 'wav'

        Returns:
            bytes: The concatenated audio as bytes

        Raises:
            ValueError: If the audio segments cannot be loaded or concatenated
        """
        try:
            # Convert bytes to AudioSegment objects
            combined = AudioSegment.empty()
            if left_audio:
                combined += AudioSegment.from_file(io.BytesIO(left_audio), format=format)

            # The infill segment is always decoded, even if it is empty.
            combined += AudioSegment.from_file(io.BytesIO(infill_audio), format=format)

            if right_audio:
                combined += AudioSegment.from_file(io.BytesIO(right_audio), format=format)

            # Export to bytes
            output = io.BytesIO()
            combined.export(output, format=format)
            return output.getvalue()

        except Exception as e:
            # Chain explicitly so the decoder/encoder failure stays attached as the cause.
            raise ValueError(f"Failed to concatenate audio segments: {e}") from e
@@ -0,0 +1 @@
1
+ __version__ = "1.4.0"
@@ -52,8 +52,7 @@ class Voices(Resource):
52
52
 
53
53
  if not response.is_success:
54
54
  raise ValueError(
55
- f"Failed to get voice. Status Code: {response.status_code}\n"
56
- f"Error: {response.text}"
55
+ f"Failed to get voice. Status Code: {response.status_code}\nError: {response.text}"
57
56
  )
58
57
 
59
58
  return response.json()
@@ -123,7 +122,6 @@ class Voices(Resource):
123
122
  name: str,
124
123
  description: str,
125
124
  embedding: List[float],
126
- base_voice_id: Optional[str] = None,
127
125
  language: str = "en",
128
126
  ) -> VoiceMetadata:
129
127
  """Create a new voice.
@@ -132,7 +130,6 @@ class Voices(Resource):
132
130
  name: The name of the voice.
133
131
  description: The description of the voice.
134
132
  embedding: The embedding of the voice. This should be generated with :meth:`clone`.
135
- base_voice_id: The ID of the base voice. This should be a valid voice ID if specified.
136
133
 
137
134
  Returns:
138
135
  A dictionary containing the voice metadata.
@@ -144,7 +141,6 @@ class Voices(Resource):
144
141
  "name": name,
145
142
  "description": description,
146
143
  "embedding": embedding,
147
- "base_voice_id": base_voice_id,
148
144
  "language": language,
149
145
  },
150
146
  timeout=self.timeout,
@@ -1,15 +1,16 @@
1
1
  [project]
2
2
  name = "cartesia"
3
- version = "1.3.0"
3
+ version = "1.4.0"
4
4
  description = "The official Python library for the Cartesia API."
5
5
  readme = "README.md"
6
6
  requires-python = ">=3.9"
7
7
  dependencies = [
8
8
  "aiohttp>=3.10.10",
9
- "httpx>=0.27.2",
9
+ "httpx>=0.27.0",
10
10
  "iterators>=0.2.0",
11
- "requests>=2.32.3",
11
+ "requests>=2.31.0",
12
12
  "websockets>=10.4",
13
+ "pydub>=0.25.1",
13
14
  ]
14
15
 
15
16
  [build-system]
@@ -126,24 +126,6 @@ def test_create_voice(client: Cartesia):
126
126
  client.voices.delete(voice["id"])
127
127
 
128
128
 
129
- @pytest.mark.skip(reason="Enable after https://github.com/cartesia-ai/bifrost/pull/847 is deployed")
130
- def test_create_voice_with_parent(client: Cartesia):
131
- logger.info("Testing voices.create with parent")
132
- voice = client.voices.create(
133
- name="Test Base voice",
134
- description="Test base voice description",
135
- embedding=np.ones(192).tolist(),
136
- base_voice_id=SAMPLE_VOICE_ID,
137
- )
138
- assert isinstance(voice, dict)
139
- assert voice["base_voice_id"] == SAMPLE_VOICE_ID
140
-
141
- get_voice = client.voices.get(voice["id"])
142
- assert get_voice["base_voice_id"] == SAMPLE_VOICE_ID
143
-
144
- client.voices.delete(voice["id"])
145
-
146
-
147
129
  def test_mix_voice(client: Cartesia):
148
130
  logger.info("Testing voices.mix")
149
131
  output = client.voices.mix(
@@ -162,12 +162,13 @@ wheels = [
162
162
 
163
163
  [[package]]
164
164
  name = "cartesia"
165
- version = "1.3.0"
165
+ version = "1.4.0"
166
166
  source = { editable = "." }
167
167
  dependencies = [
168
168
  { name = "aiohttp" },
169
169
  { name = "httpx" },
170
170
  { name = "iterators" },
171
+ { name = "pydub" },
171
172
  { name = "requests" },
172
173
  { name = "websockets" },
173
174
  ]
@@ -189,9 +190,10 @@ dev = [
189
190
  [package.metadata]
190
191
  requires-dist = [
191
192
  { name = "aiohttp", specifier = ">=3.10.10" },
192
- { name = "httpx", specifier = ">=0.27.2" },
193
+ { name = "httpx", specifier = ">=0.27.0" },
193
194
  { name = "iterators", specifier = ">=0.2.0" },
194
- { name = "requests", specifier = ">=2.32.3" },
195
+ { name = "pydub", specifier = ">=0.25.1" },
196
+ { name = "requests", specifier = ">=2.31.0" },
195
197
  { name = "websockets", specifier = ">=10.4" },
196
198
  ]
197
199
 
@@ -1029,6 +1031,15 @@ wheels = [
1029
1031
  { url = "https://files.pythonhosted.org/packages/13/a3/a812df4e2dd5696d1f351d58b8fe16a405b234ad2886a0dab9183fb78109/pycparser-2.22-py3-none-any.whl", hash = "sha256:c3702b6d3dd8c7abc1afa565d7e63d53a1d0bd86cdc24edd75470f4de499cfcc", size = 117552 },
1030
1032
  ]
1031
1033
 
1034
+ [[package]]
1035
+ name = "pydub"
1036
+ version = "0.25.1"
1037
+ source = { registry = "https://pypi.org/simple" }
1038
+ sdist = { url = "https://files.pythonhosted.org/packages/fe/9a/e6bca0eed82db26562c73b5076539a4a08d3cffd19c3cc5913a3e61145fd/pydub-0.25.1.tar.gz", hash = "sha256:980a33ce9949cab2a569606b65674d748ecbca4f0796887fd6f46173a7b0d30f", size = 38326 }
1039
+ wheels = [
1040
+ { url = "https://files.pythonhosted.org/packages/a6/53/d78dc063216e62fc55f6b2eebb447f6a4b0a59f55c8406376f76bf959b08/pydub-0.25.1-py2.py3-none-any.whl", hash = "sha256:65617e33033874b59d87db603aa1ed450633288aefead953b30bded59cb599a6", size = 32327 },
1041
+ ]
1042
+
1032
1043
  [[package]]
1033
1044
  name = "pygments"
1034
1045
  version = "2.18.0"
@@ -1,63 +0,0 @@
1
- from typing import Iterator, List, Optional
2
-
3
- import httpx
4
- from cartesia._async_sse import _AsyncSSE
5
- from cartesia._async_websocket import _AsyncWebSocket
6
- from cartesia._types import OutputFormat, VoiceControls
7
- from cartesia.tts import TTS
8
- from cartesia.utils.tts import _construct_tts_request
9
-
10
-
11
- class AsyncTTS(TTS):
12
- def __init__(self, api_key, base_url, timeout, get_session):
13
- super().__init__(api_key, base_url, timeout)
14
- self._get_session = get_session
15
- self._sse_class = _AsyncSSE(self._http_url(), self.headers, self.timeout, get_session)
16
- self.sse = self._sse_class.send
17
-
18
- async def websocket(self) -> _AsyncWebSocket:
19
- ws = _AsyncWebSocket(
20
- self._ws_url(),
21
- self.api_key,
22
- self.cartesia_version,
23
- self.timeout,
24
- self._get_session,
25
- )
26
- await ws.connect()
27
- return ws
28
-
29
- async def bytes(
30
- self,
31
- *,
32
- model_id: str,
33
- transcript: str,
34
- output_format: OutputFormat,
35
- voice_id: Optional[str] = None,
36
- voice_embedding: Optional[List[float]] = None,
37
- duration: Optional[int] = None,
38
- language: Optional[str] = None,
39
- _experimental_voice_controls: Optional[VoiceControls] = None,
40
- ) -> bytes:
41
- request_body = _construct_tts_request(
42
- model_id=model_id,
43
- transcript=transcript,
44
- output_format=output_format,
45
- voice_id=voice_id,
46
- voice_embedding=voice_embedding,
47
- duration=duration,
48
- language=language,
49
- _experimental_voice_controls=_experimental_voice_controls,
50
- )
51
-
52
- async with httpx.AsyncClient() as client:
53
- response = await client.post(
54
- f"{self._http_url()}/tts/bytes",
55
- headers=self.headers,
56
- timeout=self.timeout,
57
- json=request_body,
58
- )
59
-
60
- if not response.is_success:
61
- raise ValueError(f"Failed to generate audio. Error: {response.text}")
62
-
63
- return response.content
@@ -1,137 +0,0 @@
1
- from typing import Iterator, List, Optional
2
-
3
- import httpx
4
-
5
- from cartesia._sse import _SSE
6
- from cartesia._types import (
7
- OutputFormat,
8
- OutputFormatMapping,
9
- VoiceControls,
10
- )
11
- from cartesia._websocket import _WebSocket
12
- from cartesia.resource import Resource
13
- from cartesia.utils.tts import _construct_tts_request, _validate_and_construct_voice
14
-
15
-
16
- class TTS(Resource):
17
- """This resource contains methods to generate audio using Cartesia's text-to-speech API."""
18
-
19
- def __init__(self, api_key: str, base_url: str, timeout: float):
20
- super().__init__(
21
- api_key=api_key,
22
- base_url=base_url,
23
- timeout=timeout,
24
- )
25
- self._sse_class = _SSE(self._http_url(), self.headers, self.timeout)
26
- self.sse = self._sse_class.send
27
-
28
- def websocket(self) -> _WebSocket:
29
- """This method returns a WebSocket object that can be used to generate audio using WebSocket.
30
-
31
- Returns:
32
- _WebSocket: A WebSocket object that can be used to generate audio using WebSocket.
33
- """
34
- ws = _WebSocket(self._ws_url(), self.api_key, self.cartesia_version)
35
- ws.connect()
36
- return ws
37
-
38
- def bytes(
39
- self,
40
- *,
41
- model_id: str,
42
- transcript: str,
43
- output_format: OutputFormat,
44
- voice_id: Optional[str] = None,
45
- voice_embedding: Optional[List[float]] = None,
46
- duration: Optional[int] = None,
47
- language: Optional[str] = None,
48
- _experimental_voice_controls: Optional[VoiceControls] = None,
49
- ) -> bytes:
50
- request_body = _construct_tts_request(
51
- model_id=model_id,
52
- transcript=transcript,
53
- output_format=output_format,
54
- voice_id=voice_id,
55
- voice_embedding=voice_embedding,
56
- duration=duration,
57
- language=language,
58
- _experimental_voice_controls=_experimental_voice_controls,
59
- )
60
-
61
- response = httpx.post(
62
- f"{self._http_url()}/tts/bytes",
63
- headers=self.headers,
64
- timeout=self.timeout,
65
- json=request_body,
66
- )
67
-
68
- if not response.is_success:
69
- raise ValueError(f"Failed to generate audio. Error: {response.text}")
70
-
71
- return response.content
72
-
73
- @staticmethod
74
- def get_output_format(output_format_name: str) -> OutputFormat:
75
- """Convenience method to get the output_format dictionary from a given output format name.
76
-
77
- Args:
78
- output_format_name (str): The name of the output format.
79
-
80
- Returns:
81
- OutputFormat: A dictionary containing the details of the output format to be passed into tts.sse() or tts.websocket().send()
82
-
83
- Raises:
84
- ValueError: If the output_format name is not supported
85
- """
86
- if output_format_name in OutputFormatMapping._format_mapping:
87
- output_format_obj = OutputFormatMapping.get_format(output_format_name)
88
- else:
89
- raise ValueError(f"Unsupported format: {output_format_name}")
90
-
91
- return OutputFormat(
92
- container=output_format_obj["container"],
93
- encoding=output_format_obj["encoding"],
94
- sample_rate=output_format_obj["sample_rate"],
95
- )
96
-
97
- @staticmethod
98
- def get_sample_rate(output_format_name: str) -> int:
99
- """Convenience method to get the sample rate for a given output format.
100
-
101
- Args:
102
- output_format_name (str): The name of the output format.
103
-
104
- Returns:
105
- int: The sample rate for the output format.
106
-
107
- Raises:
108
- ValueError: If the output_format name is not supported
109
- """
110
- if output_format_name in OutputFormatMapping._format_mapping:
111
- output_format_obj = OutputFormatMapping.get_format(output_format_name)
112
- else:
113
- raise ValueError(f"Unsupported format: {output_format_name}")
114
-
115
- return output_format_obj["sample_rate"]
116
-
117
- @staticmethod
118
- def _validate_and_construct_voice(
119
- voice_id: Optional[str] = None,
120
- voice_embedding: Optional[List[float]] = None,
121
- experimental_voice_controls: Optional[VoiceControls] = None,
122
- ) -> dict:
123
- """Validate and construct the voice dictionary for the request.
124
-
125
- Args:
126
- voice_id: The ID of the voice to use for generating audio.
127
- voice_embedding: The embedding of the voice to use for generating audio.
128
- experimental_voice_controls: Voice controls for emotion and speed.
129
- Note: This is an experimental feature and may rapidly change in the future.
130
-
131
- Returns:
132
- A dictionary representing the voice configuration.
133
-
134
- Raises:
135
- ValueError: If neither or both voice_id and voice_embedding are specified.
136
- """
137
- return _validate_and_construct_voice(voice_id, voice_embedding, experimental_voice_controls)
@@ -1 +0,0 @@
1
- __version__ = "1.3.0"
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes