cartesia 1.3.1__py3-none-any.whl → 2.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cartesia/__init__.py +302 -3
- cartesia/api_status/__init__.py +6 -0
- cartesia/api_status/client.py +104 -0
- cartesia/api_status/requests/__init__.py +5 -0
- cartesia/api_status/requests/api_info.py +8 -0
- cartesia/api_status/types/__init__.py +5 -0
- cartesia/api_status/types/api_info.py +20 -0
- cartesia/base_client.py +156 -0
- cartesia/client.py +163 -40
- cartesia/core/__init__.py +50 -0
- cartesia/core/api_error.py +15 -0
- cartesia/core/client_wrapper.py +55 -0
- cartesia/core/datetime_utils.py +28 -0
- cartesia/core/file.py +67 -0
- cartesia/core/http_client.py +499 -0
- cartesia/core/jsonable_encoder.py +101 -0
- cartesia/core/pagination.py +88 -0
- cartesia/core/pydantic_utilities.py +296 -0
- cartesia/core/query_encoder.py +58 -0
- cartesia/core/remove_none_from_dict.py +11 -0
- cartesia/core/request_options.py +35 -0
- cartesia/core/serialization.py +272 -0
- cartesia/datasets/__init__.py +24 -0
- cartesia/datasets/requests/__init__.py +15 -0
- cartesia/datasets/requests/create_dataset_request.py +7 -0
- cartesia/datasets/requests/dataset.py +9 -0
- cartesia/datasets/requests/dataset_file.py +9 -0
- cartesia/datasets/requests/paginated_dataset_files.py +10 -0
- cartesia/datasets/requests/paginated_datasets.py +10 -0
- cartesia/datasets/types/__init__.py +17 -0
- cartesia/datasets/types/create_dataset_request.py +19 -0
- cartesia/datasets/types/dataset.py +21 -0
- cartesia/datasets/types/dataset_file.py +21 -0
- cartesia/datasets/types/file_purpose.py +5 -0
- cartesia/datasets/types/paginated_dataset_files.py +21 -0
- cartesia/datasets/types/paginated_datasets.py +21 -0
- cartesia/embedding/__init__.py +5 -0
- cartesia/embedding/types/__init__.py +5 -0
- cartesia/embedding/types/embedding.py +201 -0
- cartesia/environment.py +7 -0
- cartesia/infill/__init__.py +2 -0
- cartesia/infill/client.py +318 -0
- cartesia/tts/__init__.py +167 -0
- cartesia/{_async_websocket.py → tts/_async_websocket.py} +212 -85
- cartesia/tts/_websocket.py +479 -0
- cartesia/tts/client.py +407 -0
- cartesia/tts/requests/__init__.py +76 -0
- cartesia/tts/requests/cancel_context_request.py +17 -0
- cartesia/tts/requests/controls.py +11 -0
- cartesia/tts/requests/generation_request.py +58 -0
- cartesia/tts/requests/mp_3_output_format.py +11 -0
- cartesia/tts/requests/output_format.py +30 -0
- cartesia/tts/requests/phoneme_timestamps.py +10 -0
- cartesia/tts/requests/raw_output_format.py +11 -0
- cartesia/tts/requests/speed.py +7 -0
- cartesia/tts/requests/tts_request.py +24 -0
- cartesia/tts/requests/tts_request_embedding_specifier.py +16 -0
- cartesia/tts/requests/tts_request_id_specifier.py +16 -0
- cartesia/tts/requests/tts_request_voice_specifier.py +7 -0
- cartesia/tts/requests/wav_output_format.py +7 -0
- cartesia/tts/requests/web_socket_base_response.py +11 -0
- cartesia/tts/requests/web_socket_chunk_response.py +11 -0
- cartesia/tts/requests/web_socket_done_response.py +7 -0
- cartesia/tts/requests/web_socket_error_response.py +7 -0
- cartesia/tts/requests/web_socket_flush_done_response.py +9 -0
- cartesia/tts/requests/web_socket_phoneme_timestamps_response.py +9 -0
- cartesia/tts/requests/web_socket_raw_output_format.py +11 -0
- cartesia/tts/requests/web_socket_request.py +7 -0
- cartesia/tts/requests/web_socket_response.py +70 -0
- cartesia/tts/requests/web_socket_stream_options.py +8 -0
- cartesia/tts/requests/web_socket_timestamps_response.py +9 -0
- cartesia/tts/requests/web_socket_tts_output.py +18 -0
- cartesia/tts/requests/web_socket_tts_request.py +25 -0
- cartesia/tts/requests/word_timestamps.py +10 -0
- cartesia/tts/socket_client.py +302 -0
- cartesia/tts/types/__init__.py +90 -0
- cartesia/tts/types/cancel_context_request.py +28 -0
- cartesia/tts/types/context_id.py +3 -0
- cartesia/tts/types/controls.py +22 -0
- cartesia/tts/types/emotion.py +34 -0
- cartesia/tts/types/flush_id.py +3 -0
- cartesia/tts/types/generation_request.py +71 -0
- cartesia/tts/types/mp_3_output_format.py +23 -0
- cartesia/tts/types/natural_specifier.py +5 -0
- cartesia/tts/types/numerical_specifier.py +3 -0
- cartesia/tts/types/output_format.py +58 -0
- cartesia/tts/types/phoneme_timestamps.py +21 -0
- cartesia/tts/types/raw_encoding.py +5 -0
- cartesia/tts/types/raw_output_format.py +22 -0
- cartesia/tts/types/speed.py +7 -0
- cartesia/tts/types/supported_language.py +7 -0
- cartesia/tts/types/tts_request.py +35 -0
- cartesia/tts/types/tts_request_embedding_specifier.py +27 -0
- cartesia/tts/types/tts_request_id_specifier.py +27 -0
- cartesia/tts/types/tts_request_voice_specifier.py +7 -0
- cartesia/tts/types/wav_output_format.py +17 -0
- cartesia/tts/types/web_socket_base_response.py +22 -0
- cartesia/tts/types/web_socket_chunk_response.py +22 -0
- cartesia/tts/types/web_socket_done_response.py +17 -0
- cartesia/tts/types/web_socket_error_response.py +19 -0
- cartesia/tts/types/web_socket_flush_done_response.py +21 -0
- cartesia/tts/types/web_socket_phoneme_timestamps_response.py +20 -0
- cartesia/tts/types/web_socket_raw_output_format.py +22 -0
- cartesia/tts/types/web_socket_request.py +7 -0
- cartesia/tts/types/web_socket_response.py +125 -0
- cartesia/tts/types/web_socket_stream_options.py +19 -0
- cartesia/tts/types/web_socket_timestamps_response.py +20 -0
- cartesia/tts/types/web_socket_tts_output.py +29 -0
- cartesia/tts/types/web_socket_tts_request.py +37 -0
- cartesia/tts/types/word_timestamps.py +21 -0
- cartesia/{_constants.py → tts/utils/constants.py} +2 -2
- cartesia/tts/utils/tts.py +64 -0
- cartesia/tts/utils/types.py +70 -0
- cartesia/version.py +3 -1
- cartesia/voice_changer/__init__.py +27 -0
- cartesia/voice_changer/client.py +395 -0
- cartesia/voice_changer/requests/__init__.py +15 -0
- cartesia/voice_changer/requests/streaming_response.py +38 -0
- cartesia/voice_changer/types/__init__.py +17 -0
- cartesia/voice_changer/types/output_format_container.py +5 -0
- cartesia/voice_changer/types/streaming_response.py +64 -0
- cartesia/voices/__init__.py +81 -0
- cartesia/voices/client.py +1218 -0
- cartesia/voices/requests/__init__.py +29 -0
- cartesia/voices/requests/create_voice_request.py +23 -0
- cartesia/voices/requests/embedding_response.py +8 -0
- cartesia/voices/requests/embedding_specifier.py +10 -0
- cartesia/voices/requests/get_voices_response.py +24 -0
- cartesia/voices/requests/id_specifier.py +10 -0
- cartesia/voices/requests/localize_dialect.py +11 -0
- cartesia/voices/requests/localize_voice_request.py +28 -0
- cartesia/voices/requests/mix_voice_specifier.py +7 -0
- cartesia/voices/requests/mix_voices_request.py +9 -0
- cartesia/voices/requests/update_voice_request.py +15 -0
- cartesia/voices/requests/voice.py +43 -0
- cartesia/voices/requests/voice_metadata.py +36 -0
- cartesia/voices/types/__init__.py +53 -0
- cartesia/voices/types/base_voice_id.py +5 -0
- cartesia/voices/types/clone_mode.py +5 -0
- cartesia/voices/types/create_voice_request.py +34 -0
- cartesia/voices/types/embedding_response.py +20 -0
- cartesia/voices/types/embedding_specifier.py +22 -0
- cartesia/voices/types/gender.py +5 -0
- cartesia/voices/types/gender_presentation.py +5 -0
- cartesia/voices/types/get_voices_response.py +34 -0
- cartesia/voices/types/id_specifier.py +22 -0
- cartesia/voices/types/localize_dialect.py +11 -0
- cartesia/voices/types/localize_english_dialect.py +5 -0
- cartesia/voices/types/localize_french_dialect.py +5 -0
- cartesia/voices/types/localize_portuguese_dialect.py +5 -0
- cartesia/voices/types/localize_spanish_dialect.py +5 -0
- cartesia/voices/types/localize_target_language.py +7 -0
- cartesia/voices/types/localize_voice_request.py +39 -0
- cartesia/voices/types/mix_voice_specifier.py +7 -0
- cartesia/voices/types/mix_voices_request.py +20 -0
- cartesia/voices/types/update_voice_request.py +27 -0
- cartesia/voices/types/voice.py +54 -0
- cartesia/voices/types/voice_expand_options.py +5 -0
- cartesia/voices/types/voice_id.py +3 -0
- cartesia/voices/types/voice_metadata.py +48 -0
- cartesia/voices/types/weight.py +3 -0
- cartesia-2.0.0.dist-info/METADATA +414 -0
- cartesia-2.0.0.dist-info/RECORD +165 -0
- {cartesia-1.3.1.dist-info → cartesia-2.0.0.dist-info}/WHEEL +1 -1
- cartesia/_async_sse.py +0 -95
- cartesia/_logger.py +0 -3
- cartesia/_sse.py +0 -143
- cartesia/_types.py +0 -70
- cartesia/_websocket.py +0 -358
- cartesia/async_client.py +0 -82
- cartesia/async_tts.py +0 -63
- cartesia/resource.py +0 -44
- cartesia/tts.py +0 -137
- cartesia/utils/deprecated.py +0 -55
- cartesia/utils/retry.py +0 -87
- cartesia/utils/tts.py +0 -78
- cartesia/voices.py +0 -208
- cartesia-1.3.1.dist-info/METADATA +0 -661
- cartesia-1.3.1.dist-info/RECORD +0 -23
- cartesia-1.3.1.dist-info/licenses/LICENSE.md +0 -21
- /cartesia/{utils/__init__.py → py.typed} +0 -0
@@ -0,0 +1,318 @@
|
|
1
|
+
# This file was auto-generated by Fern from our API Definition.
|
2
|
+
|
3
|
+
import typing
|
4
|
+
from ..core.client_wrapper import SyncClientWrapper
|
5
|
+
from .. import core
|
6
|
+
from ..voice_changer.types.output_format_container import OutputFormatContainer
|
7
|
+
from ..tts.types.raw_encoding import RawEncoding
|
8
|
+
from ..tts.types.speed import Speed
|
9
|
+
from ..tts.types.emotion import Emotion
|
10
|
+
from ..core.request_options import RequestOptions
|
11
|
+
from json.decoder import JSONDecodeError
|
12
|
+
from ..core.api_error import ApiError
|
13
|
+
from ..core.client_wrapper import AsyncClientWrapper
|
14
|
+
|
15
|
+
# this is used as the default value for optional parameters
|
16
|
+
# Sentinel default: `...` (Ellipsis) cast to `typing.Any` so it can stand in as the
# default value of any annotated optional parameter. The same object is handed to
# the HTTP client as `omit=OMIT` -- presumably so that fields still equal to the
# sentinel are left out of the outgoing request (TODO confirm against the http client).
OMIT = typing.cast(typing.Any, ...)
|
17
|
+
|
18
|
+
|
19
|
+
class InfillClient:
    """Synchronous client for the ``infill/bytes`` endpoint."""

    def __init__(self, *, client_wrapper: SyncClientWrapper):
        # The wrapper exposes the `httpx_client` that every request goes through.
        self._client_wrapper = client_wrapper

    def bytes(
        self,
        *,
        left_audio: core.File,
        right_audio: core.File,
        model_id: str,
        language: str,
        transcript: str,
        voice_id: str,
        output_format_container: OutputFormatContainer,
        output_format_sample_rate: int,
        output_format_encoding: typing.Optional[RawEncoding] = OMIT,
        output_format_bit_rate: typing.Optional[int] = OMIT,
        voice_experimental_controls_speed: typing.Optional[Speed] = OMIT,
        voice_experimental_controls_emotion: typing.Optional[typing.List[Emotion]] = OMIT,
        request_options: typing.Optional[RequestOptions] = None,
    ) -> typing.Iterator[bytes]:
        """
        Generate audio that smoothly connects two existing audio segments, streaming the result as raw byte chunks.

        This is useful for inserting new speech between existing speech segments while keeping the transitions
        natural.

        **The cost is 1 credit per character of the infill text plus a fixed cost of 300 credits.**

        Infilling is only available on `sonic-2` at this time.

        The API requires at least one of `left_audio` or `right_audio`; note that this method's signature makes
        both keyword arguments mandatory.

        This method is a generator: the request is only sent once iteration over the returned iterator starts.

        As with all generative models there is some inherent variability. Tips for getting the best results:
        - Use longer infill transcripts
            - This gives the model more flexibility to adapt to the rest of the audio
        - Target natural pauses in the audio when deciding where to clip
            - This means you don't need word-level timestamps to be as precise
        - Clip right up to the start and end of the audio segment you want infilled, keeping as much silence in
          the left/right audio segments as possible
            - This helps the model generate more natural transitions

        Parameters
        ----------
        left_audio : core.File
            Audio that precedes the infill. See core.File for more documentation

        right_audio : core.File
            Audio that follows the infill. See core.File for more documentation

        model_id : str
            The ID of the model to use for generating audio

        language : str
            The language of the transcript

        transcript : str
            The infill text to generate

        voice_id : str
            The ID of the voice to use for generating audio

        output_format_container : OutputFormatContainer
            The format of the output audio

        output_format_sample_rate : int
            The sample rate of the output audio

        output_format_encoding : typing.Optional[RawEncoding]
            Required for `raw` and `wav` containers.

        output_format_bit_rate : typing.Optional[int]
            Required for `mp3` containers.

        voice_experimental_controls_speed : typing.Optional[Speed]
            Either a number between -1.0 and 1.0 or a natural language description of speed.

            If you specify a number, 0.0 is the default speed, -1.0 is the slowest speed, and 1.0 is the
            fastest speed.

        voice_experimental_controls_emotion : typing.Optional[typing.List[Emotion]]
            An array of emotion:level tags.

            Supported emotions are: anger, positivity, surprise, sadness, and curiosity.

            Supported levels are: lowest, low, (omit), high, highest.

        request_options : typing.Optional[RequestOptions]
            Request-specific configuration. The `chunk_size` entry, when present, is forwarded to the response's
            byte iterator to control the size of the yielded chunks.

        Yields
        ------
        typing.Iterator[bytes]
            Successive chunks of the generated audio.

        Raises
        ------
        ApiError
            If the response status is not 2xx. The error body is the decoded JSON payload, or the raw response
            text when the payload is not valid JSON.

        Examples
        --------
        from cartesia import Cartesia

        client = Cartesia(
            api_key="YOUR_API_KEY",
        )

        # NOTE: assumes core.File accepts an open binary file object -- confirm against core.File.
        with open("left.wav", "rb") as left, open("right.wav", "rb") as right:
            audio = b"".join(
                client.infill.bytes(
                    left_audio=left,
                    right_audio=right,
                    model_id="sonic-2",
                    language="en",
                    transcript="middle segment",
                    voice_id="694f9389-aac1-45b6-b726-9d9369183238",
                    output_format_container="mp3",
                    output_format_sample_rate=44100,
                    output_format_bit_rate=128000,
                    voice_experimental_controls_speed="slowest",
                    voice_experimental_controls_emotion=["surprise:high", "curiosity:high"],
                )
            )
        """
        # Multipart form fields; nested settings use bracketed keys.
        form_fields = {
            "model_id": model_id,
            "language": language,
            "transcript": transcript,
            "voice_id": voice_id,
            "output_format[container]": output_format_container,
            "output_format[sample_rate]": output_format_sample_rate,
            "output_format[encoding]": output_format_encoding,
            "output_format[bit_rate]": output_format_bit_rate,
            "voice[__experimental_controls][speed]": voice_experimental_controls_speed,
            "voice[__experimental_controls][emotion][]": voice_experimental_controls_emotion,
        }
        audio_parts = {
            "left_audio": left_audio,
            "right_audio": right_audio,
        }
        with self._client_wrapper.httpx_client.stream(
            "infill/bytes",
            method="POST",
            data=form_fields,
            files=audio_parts,
            request_options=request_options,
            omit=OMIT,
        ) as response:
            # The try block deliberately spans the streaming loop as well, so a
            # JSONDecodeError surfacing anywhere inside it is reported as an ApiError.
            try:
                if not (200 <= response.status_code < 300):
                    # Error path: the body has not been loaded yet on a streamed
                    # response, so read it before decoding.
                    response.read()
                    error_payload = response.json()
                else:
                    chunk_size = None if request_options is None else request_options.get("chunk_size", None)
                    for piece in response.iter_bytes(chunk_size=chunk_size):
                        yield piece
                    return
            except JSONDecodeError:
                raise ApiError(status_code=response.status_code, body=response.text)
            raise ApiError(status_code=response.status_code, body=error_payload)
|
164
|
+
|
165
|
+
|
166
|
+
class AsyncInfillClient:
    """Asynchronous client for the ``infill/bytes`` endpoint."""

    def __init__(self, *, client_wrapper: AsyncClientWrapper):
        # The wrapper exposes the `httpx_client` that every request goes through.
        self._client_wrapper = client_wrapper

    async def bytes(
        self,
        *,
        left_audio: core.File,
        right_audio: core.File,
        model_id: str,
        language: str,
        transcript: str,
        voice_id: str,
        output_format_container: OutputFormatContainer,
        output_format_sample_rate: int,
        output_format_encoding: typing.Optional[RawEncoding] = OMIT,
        output_format_bit_rate: typing.Optional[int] = OMIT,
        voice_experimental_controls_speed: typing.Optional[Speed] = OMIT,
        voice_experimental_controls_emotion: typing.Optional[typing.List[Emotion]] = OMIT,
        request_options: typing.Optional[RequestOptions] = None,
    ) -> typing.AsyncIterator[bytes]:
        """
        Generate audio that smoothly connects two existing audio segments, streaming the result as raw byte chunks.

        This is useful for inserting new speech between existing speech segments while keeping the transitions
        natural.

        **The cost is 1 credit per character of the infill text plus a fixed cost of 300 credits.**

        Infilling is only available on `sonic-2` at this time.

        The API requires at least one of `left_audio` or `right_audio`; note that this method's signature makes
        both keyword arguments mandatory.

        This method is an async generator: consume it with `async for` (it cannot be awaited directly), and the
        request is only sent once iteration starts.

        As with all generative models there is some inherent variability. Tips for getting the best results:
        - Use longer infill transcripts
            - This gives the model more flexibility to adapt to the rest of the audio
        - Target natural pauses in the audio when deciding where to clip
            - This means you don't need word-level timestamps to be as precise
        - Clip right up to the start and end of the audio segment you want infilled, keeping as much silence in
          the left/right audio segments as possible
            - This helps the model generate more natural transitions

        Parameters
        ----------
        left_audio : core.File
            Audio that precedes the infill. See core.File for more documentation

        right_audio : core.File
            Audio that follows the infill. See core.File for more documentation

        model_id : str
            The ID of the model to use for generating audio

        language : str
            The language of the transcript

        transcript : str
            The infill text to generate

        voice_id : str
            The ID of the voice to use for generating audio

        output_format_container : OutputFormatContainer
            The format of the output audio

        output_format_sample_rate : int
            The sample rate of the output audio

        output_format_encoding : typing.Optional[RawEncoding]
            Required for `raw` and `wav` containers.

        output_format_bit_rate : typing.Optional[int]
            Required for `mp3` containers.

        voice_experimental_controls_speed : typing.Optional[Speed]
            Either a number between -1.0 and 1.0 or a natural language description of speed.

            If you specify a number, 0.0 is the default speed, -1.0 is the slowest speed, and 1.0 is the
            fastest speed.

        voice_experimental_controls_emotion : typing.Optional[typing.List[Emotion]]
            An array of emotion:level tags.

            Supported emotions are: anger, positivity, surprise, sadness, and curiosity.

            Supported levels are: lowest, low, (omit), high, highest.

        request_options : typing.Optional[RequestOptions]
            Request-specific configuration. The `chunk_size` entry, when present, is forwarded to the response's
            byte iterator to control the size of the yielded chunks.

        Yields
        ------
        typing.AsyncIterator[bytes]
            Successive chunks of the generated audio.

        Raises
        ------
        ApiError
            If the response status is not 2xx. The error body is the decoded JSON payload, or the raw response
            text when the payload is not valid JSON.

        Examples
        --------
        import asyncio

        from cartesia import AsyncCartesia

        client = AsyncCartesia(
            api_key="YOUR_API_KEY",
        )


        async def main() -> None:
            chunks = []
            # NOTE: assumes core.File accepts an open binary file object -- confirm against core.File.
            with open("left.wav", "rb") as left, open("right.wav", "rb") as right:
                async for chunk in client.infill.bytes(
                    left_audio=left,
                    right_audio=right,
                    model_id="sonic-2",
                    language="en",
                    transcript="middle segment",
                    voice_id="694f9389-aac1-45b6-b726-9d9369183238",
                    output_format_container="mp3",
                    output_format_sample_rate=44100,
                    output_format_bit_rate=128000,
                    voice_experimental_controls_speed="slowest",
                    voice_experimental_controls_emotion=["surprise:high", "curiosity:high"],
                ):
                    chunks.append(chunk)
            audio = b"".join(chunks)


        asyncio.run(main())
        """
        # Multipart form fields; nested settings use bracketed keys.
        form_fields = {
            "model_id": model_id,
            "language": language,
            "transcript": transcript,
            "voice_id": voice_id,
            "output_format[container]": output_format_container,
            "output_format[sample_rate]": output_format_sample_rate,
            "output_format[encoding]": output_format_encoding,
            "output_format[bit_rate]": output_format_bit_rate,
            "voice[__experimental_controls][speed]": voice_experimental_controls_speed,
            "voice[__experimental_controls][emotion][]": voice_experimental_controls_emotion,
        }
        audio_parts = {
            "left_audio": left_audio,
            "right_audio": right_audio,
        }
        async with self._client_wrapper.httpx_client.stream(
            "infill/bytes",
            method="POST",
            data=form_fields,
            files=audio_parts,
            request_options=request_options,
            omit=OMIT,
        ) as response:
            # The try block deliberately spans the streaming loop as well, so a
            # JSONDecodeError surfacing anywhere inside it is reported as an ApiError.
            try:
                if not (200 <= response.status_code < 300):
                    # Error path: the body has not been loaded yet on a streamed
                    # response, so read it before decoding.
                    await response.aread()
                    error_payload = response.json()
                else:
                    chunk_size = None if request_options is None else request_options.get("chunk_size", None)
                    async for piece in response.aiter_bytes(chunk_size=chunk_size):
                        yield piece
                    return
            except JSONDecodeError:
                raise ApiError(status_code=response.status_code, body=response.text)
            raise ApiError(status_code=response.status_code, body=error_payload)
|
cartesia/tts/__init__.py
ADDED
@@ -0,0 +1,167 @@
|
|
1
|
+
# This file was auto-generated by Fern from our API Definition.
|
2
|
+
|
3
|
+
from .types import (
|
4
|
+
CancelContextRequest,
|
5
|
+
ContextId,
|
6
|
+
Controls,
|
7
|
+
Emotion,
|
8
|
+
FlushId,
|
9
|
+
GenerationRequest,
|
10
|
+
Mp3OutputFormat,
|
11
|
+
NaturalSpecifier,
|
12
|
+
NumericalSpecifier,
|
13
|
+
OutputFormat,
|
14
|
+
OutputFormat_Mp3,
|
15
|
+
OutputFormat_Raw,
|
16
|
+
OutputFormat_Wav,
|
17
|
+
PhonemeTimestamps,
|
18
|
+
RawEncoding,
|
19
|
+
RawOutputFormat,
|
20
|
+
Speed,
|
21
|
+
SupportedLanguage,
|
22
|
+
TtsRequest,
|
23
|
+
TtsRequestEmbeddingSpecifier,
|
24
|
+
TtsRequestIdSpecifier,
|
25
|
+
TtsRequestVoiceSpecifier,
|
26
|
+
WavOutputFormat,
|
27
|
+
WebSocketBaseResponse,
|
28
|
+
WebSocketChunkResponse,
|
29
|
+
WebSocketDoneResponse,
|
30
|
+
WebSocketErrorResponse,
|
31
|
+
WebSocketFlushDoneResponse,
|
32
|
+
WebSocketPhonemeTimestampsResponse,
|
33
|
+
WebSocketRawOutputFormat,
|
34
|
+
WebSocketRequest,
|
35
|
+
WebSocketResponse,
|
36
|
+
WebSocketResponse_Chunk,
|
37
|
+
WebSocketResponse_Done,
|
38
|
+
WebSocketResponse_Error,
|
39
|
+
WebSocketResponse_FlushDone,
|
40
|
+
WebSocketResponse_PhonemeTimestamps,
|
41
|
+
WebSocketResponse_Timestamps,
|
42
|
+
WebSocketStreamOptions,
|
43
|
+
WebSocketTimestampsResponse,
|
44
|
+
WebSocketTtsOutput,
|
45
|
+
WebSocketTtsRequest,
|
46
|
+
WordTimestamps,
|
47
|
+
)
|
48
|
+
from .requests import (
|
49
|
+
CancelContextRequestParams,
|
50
|
+
ControlsParams,
|
51
|
+
GenerationRequestParams,
|
52
|
+
Mp3OutputFormatParams,
|
53
|
+
OutputFormatParams,
|
54
|
+
OutputFormat_Mp3Params,
|
55
|
+
OutputFormat_RawParams,
|
56
|
+
OutputFormat_WavParams,
|
57
|
+
PhonemeTimestampsParams,
|
58
|
+
RawOutputFormatParams,
|
59
|
+
SpeedParams,
|
60
|
+
TtsRequestEmbeddingSpecifierParams,
|
61
|
+
TtsRequestIdSpecifierParams,
|
62
|
+
TtsRequestParams,
|
63
|
+
TtsRequestVoiceSpecifierParams,
|
64
|
+
WavOutputFormatParams,
|
65
|
+
WebSocketBaseResponseParams,
|
66
|
+
WebSocketChunkResponseParams,
|
67
|
+
WebSocketDoneResponseParams,
|
68
|
+
WebSocketErrorResponseParams,
|
69
|
+
WebSocketFlushDoneResponseParams,
|
70
|
+
WebSocketPhonemeTimestampsResponseParams,
|
71
|
+
WebSocketRawOutputFormatParams,
|
72
|
+
WebSocketRequestParams,
|
73
|
+
WebSocketResponseParams,
|
74
|
+
WebSocketResponse_ChunkParams,
|
75
|
+
WebSocketResponse_DoneParams,
|
76
|
+
WebSocketResponse_ErrorParams,
|
77
|
+
WebSocketResponse_FlushDoneParams,
|
78
|
+
WebSocketResponse_PhonemeTimestampsParams,
|
79
|
+
WebSocketResponse_TimestampsParams,
|
80
|
+
WebSocketStreamOptionsParams,
|
81
|
+
WebSocketTimestampsResponseParams,
|
82
|
+
WebSocketTtsOutputParams,
|
83
|
+
WebSocketTtsRequestParams,
|
84
|
+
WordTimestampsParams,
|
85
|
+
)
|
86
|
+
|
87
|
+
__all__ = [
|
88
|
+
"CancelContextRequest",
|
89
|
+
"CancelContextRequestParams",
|
90
|
+
"ContextId",
|
91
|
+
"Controls",
|
92
|
+
"ControlsParams",
|
93
|
+
"Emotion",
|
94
|
+
"FlushId",
|
95
|
+
"GenerationRequest",
|
96
|
+
"GenerationRequestParams",
|
97
|
+
"Mp3OutputFormat",
|
98
|
+
"Mp3OutputFormatParams",
|
99
|
+
"NaturalSpecifier",
|
100
|
+
"NumericalSpecifier",
|
101
|
+
"OutputFormat",
|
102
|
+
"OutputFormatParams",
|
103
|
+
"OutputFormat_Mp3",
|
104
|
+
"OutputFormat_Mp3Params",
|
105
|
+
"OutputFormat_Raw",
|
106
|
+
"OutputFormat_RawParams",
|
107
|
+
"OutputFormat_Wav",
|
108
|
+
"OutputFormat_WavParams",
|
109
|
+
"PhonemeTimestamps",
|
110
|
+
"PhonemeTimestampsParams",
|
111
|
+
"RawEncoding",
|
112
|
+
"RawOutputFormat",
|
113
|
+
"RawOutputFormatParams",
|
114
|
+
"Speed",
|
115
|
+
"SpeedParams",
|
116
|
+
"SupportedLanguage",
|
117
|
+
"TtsRequest",
|
118
|
+
"TtsRequestEmbeddingSpecifier",
|
119
|
+
"TtsRequestEmbeddingSpecifierParams",
|
120
|
+
"TtsRequestIdSpecifier",
|
121
|
+
"TtsRequestIdSpecifierParams",
|
122
|
+
"TtsRequestParams",
|
123
|
+
"TtsRequestVoiceSpecifier",
|
124
|
+
"TtsRequestVoiceSpecifierParams",
|
125
|
+
"WavOutputFormat",
|
126
|
+
"WavOutputFormatParams",
|
127
|
+
"WebSocketBaseResponse",
|
128
|
+
"WebSocketBaseResponseParams",
|
129
|
+
"WebSocketChunkResponse",
|
130
|
+
"WebSocketChunkResponseParams",
|
131
|
+
"WebSocketDoneResponse",
|
132
|
+
"WebSocketDoneResponseParams",
|
133
|
+
"WebSocketErrorResponse",
|
134
|
+
"WebSocketErrorResponseParams",
|
135
|
+
"WebSocketFlushDoneResponse",
|
136
|
+
"WebSocketFlushDoneResponseParams",
|
137
|
+
"WebSocketPhonemeTimestampsResponse",
|
138
|
+
"WebSocketPhonemeTimestampsResponseParams",
|
139
|
+
"WebSocketRawOutputFormat",
|
140
|
+
"WebSocketRawOutputFormatParams",
|
141
|
+
"WebSocketRequest",
|
142
|
+
"WebSocketRequestParams",
|
143
|
+
"WebSocketResponse",
|
144
|
+
"WebSocketResponseParams",
|
145
|
+
"WebSocketResponse_Chunk",
|
146
|
+
"WebSocketResponse_ChunkParams",
|
147
|
+
"WebSocketResponse_Done",
|
148
|
+
"WebSocketResponse_DoneParams",
|
149
|
+
"WebSocketResponse_Error",
|
150
|
+
"WebSocketResponse_ErrorParams",
|
151
|
+
"WebSocketResponse_FlushDone",
|
152
|
+
"WebSocketResponse_FlushDoneParams",
|
153
|
+
"WebSocketResponse_PhonemeTimestamps",
|
154
|
+
"WebSocketResponse_PhonemeTimestampsParams",
|
155
|
+
"WebSocketResponse_Timestamps",
|
156
|
+
"WebSocketResponse_TimestampsParams",
|
157
|
+
"WebSocketStreamOptions",
|
158
|
+
"WebSocketStreamOptionsParams",
|
159
|
+
"WebSocketTimestampsResponse",
|
160
|
+
"WebSocketTimestampsResponseParams",
|
161
|
+
"WebSocketTtsOutput",
|
162
|
+
"WebSocketTtsOutputParams",
|
163
|
+
"WebSocketTtsRequest",
|
164
|
+
"WebSocketTtsRequestParams",
|
165
|
+
"WordTimestamps",
|
166
|
+
"WordTimestampsParams",
|
167
|
+
]
|