cartesia 1.4.0__py3-none-any.whl → 2.0.0a2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cartesia/__init__.py +292 -3
- cartesia/api_status/__init__.py +6 -0
- cartesia/api_status/client.py +104 -0
- cartesia/api_status/requests/__init__.py +5 -0
- cartesia/api_status/requests/api_info.py +8 -0
- cartesia/api_status/types/__init__.py +5 -0
- cartesia/api_status/types/api_info.py +20 -0
- cartesia/base_client.py +160 -0
- cartesia/client.py +163 -40
- cartesia/core/__init__.py +47 -0
- cartesia/core/api_error.py +15 -0
- cartesia/core/client_wrapper.py +55 -0
- cartesia/core/datetime_utils.py +28 -0
- cartesia/core/file.py +67 -0
- cartesia/core/http_client.py +499 -0
- cartesia/core/jsonable_encoder.py +101 -0
- cartesia/core/pydantic_utilities.py +296 -0
- cartesia/core/query_encoder.py +58 -0
- cartesia/core/remove_none_from_dict.py +11 -0
- cartesia/core/request_options.py +35 -0
- cartesia/core/serialization.py +272 -0
- cartesia/datasets/__init__.py +24 -0
- cartesia/datasets/client.py +392 -0
- cartesia/datasets/requests/__init__.py +15 -0
- cartesia/datasets/requests/create_dataset_request.py +7 -0
- cartesia/datasets/requests/dataset.py +9 -0
- cartesia/datasets/requests/dataset_file.py +9 -0
- cartesia/datasets/requests/paginated_dataset_files.py +10 -0
- cartesia/datasets/requests/paginated_datasets.py +10 -0
- cartesia/datasets/types/__init__.py +17 -0
- cartesia/datasets/types/create_dataset_request.py +19 -0
- cartesia/datasets/types/dataset.py +21 -0
- cartesia/datasets/types/dataset_file.py +21 -0
- cartesia/datasets/types/file_purpose.py +5 -0
- cartesia/datasets/types/paginated_dataset_files.py +21 -0
- cartesia/datasets/types/paginated_datasets.py +21 -0
- cartesia/embedding/__init__.py +5 -0
- cartesia/embedding/types/__init__.py +5 -0
- cartesia/embedding/types/embedding.py +201 -0
- cartesia/environment.py +7 -0
- cartesia/infill/__init__.py +2 -0
- cartesia/infill/client.py +318 -0
- cartesia/tts/__init__.py +167 -0
- cartesia/{_async_websocket.py → tts/_async_websocket.py} +159 -84
- cartesia/tts/_websocket.py +430 -0
- cartesia/tts/client.py +407 -0
- cartesia/tts/requests/__init__.py +76 -0
- cartesia/tts/requests/cancel_context_request.py +17 -0
- cartesia/tts/requests/controls.py +11 -0
- cartesia/tts/requests/generation_request.py +53 -0
- cartesia/tts/requests/mp_3_output_format.py +11 -0
- cartesia/tts/requests/output_format.py +30 -0
- cartesia/tts/requests/phoneme_timestamps.py +10 -0
- cartesia/tts/requests/raw_output_format.py +11 -0
- cartesia/tts/requests/speed.py +7 -0
- cartesia/tts/requests/tts_request.py +24 -0
- cartesia/tts/requests/tts_request_embedding_specifier.py +16 -0
- cartesia/tts/requests/tts_request_id_specifier.py +16 -0
- cartesia/tts/requests/tts_request_voice_specifier.py +7 -0
- cartesia/tts/requests/wav_output_format.py +7 -0
- cartesia/tts/requests/web_socket_base_response.py +11 -0
- cartesia/tts/requests/web_socket_chunk_response.py +8 -0
- cartesia/tts/requests/web_socket_done_response.py +7 -0
- cartesia/tts/requests/web_socket_error_response.py +7 -0
- cartesia/tts/requests/web_socket_flush_done_response.py +9 -0
- cartesia/tts/requests/web_socket_phoneme_timestamps_response.py +9 -0
- cartesia/tts/requests/web_socket_raw_output_format.py +11 -0
- cartesia/tts/requests/web_socket_request.py +7 -0
- cartesia/tts/requests/web_socket_response.py +69 -0
- cartesia/tts/requests/web_socket_stream_options.py +8 -0
- cartesia/tts/requests/web_socket_timestamps_response.py +9 -0
- cartesia/tts/requests/web_socket_tts_output.py +18 -0
- cartesia/tts/requests/web_socket_tts_request.py +24 -0
- cartesia/tts/requests/word_timestamps.py +10 -0
- cartesia/tts/socket_client.py +302 -0
- cartesia/tts/types/__init__.py +90 -0
- cartesia/tts/types/cancel_context_request.py +28 -0
- cartesia/tts/types/context_id.py +3 -0
- cartesia/tts/types/controls.py +22 -0
- cartesia/tts/types/emotion.py +29 -0
- cartesia/tts/types/flush_id.py +3 -0
- cartesia/tts/types/generation_request.py +66 -0
- cartesia/tts/types/mp_3_output_format.py +23 -0
- cartesia/tts/types/natural_specifier.py +5 -0
- cartesia/tts/types/numerical_specifier.py +3 -0
- cartesia/tts/types/output_format.py +58 -0
- cartesia/tts/types/phoneme_timestamps.py +21 -0
- cartesia/tts/types/raw_encoding.py +5 -0
- cartesia/tts/types/raw_output_format.py +22 -0
- cartesia/tts/types/speed.py +7 -0
- cartesia/tts/types/supported_language.py +7 -0
- cartesia/tts/types/tts_request.py +35 -0
- cartesia/tts/types/tts_request_embedding_specifier.py +27 -0
- cartesia/tts/types/tts_request_id_specifier.py +27 -0
- cartesia/tts/types/tts_request_voice_specifier.py +7 -0
- cartesia/tts/types/wav_output_format.py +17 -0
- cartesia/tts/types/web_socket_base_response.py +22 -0
- cartesia/tts/types/web_socket_chunk_response.py +20 -0
- cartesia/tts/types/web_socket_done_response.py +17 -0
- cartesia/tts/types/web_socket_error_response.py +19 -0
- cartesia/tts/types/web_socket_flush_done_response.py +21 -0
- cartesia/tts/types/web_socket_phoneme_timestamps_response.py +20 -0
- cartesia/tts/types/web_socket_raw_output_format.py +22 -0
- cartesia/tts/types/web_socket_request.py +7 -0
- cartesia/tts/types/web_socket_response.py +124 -0
- cartesia/tts/types/web_socket_stream_options.py +19 -0
- cartesia/tts/types/web_socket_timestamps_response.py +20 -0
- cartesia/tts/types/web_socket_tts_output.py +27 -0
- cartesia/tts/types/web_socket_tts_request.py +36 -0
- cartesia/tts/types/word_timestamps.py +21 -0
- cartesia/tts/utils/tts.py +64 -0
- cartesia/tts/utils/types.py +70 -0
- cartesia/version.py +3 -1
- cartesia/voice_changer/__init__.py +27 -0
- cartesia/voice_changer/client.py +395 -0
- cartesia/voice_changer/requests/__init__.py +15 -0
- cartesia/voice_changer/requests/streaming_response.py +36 -0
- cartesia/voice_changer/types/__init__.py +17 -0
- cartesia/voice_changer/types/output_format_container.py +5 -0
- cartesia/voice_changer/types/streaming_response.py +62 -0
- cartesia/voices/__init__.py +71 -0
- cartesia/voices/client.py +1053 -0
- cartesia/voices/requests/__init__.py +27 -0
- cartesia/voices/requests/create_voice_request.py +23 -0
- cartesia/voices/requests/embedding_response.py +8 -0
- cartesia/voices/requests/embedding_specifier.py +10 -0
- cartesia/voices/requests/id_specifier.py +10 -0
- cartesia/voices/requests/localize_dialect.py +8 -0
- cartesia/voices/requests/localize_voice_request.py +15 -0
- cartesia/voices/requests/mix_voice_specifier.py +7 -0
- cartesia/voices/requests/mix_voices_request.py +9 -0
- cartesia/voices/requests/update_voice_request.py +15 -0
- cartesia/voices/requests/voice.py +39 -0
- cartesia/voices/requests/voice_metadata.py +36 -0
- cartesia/voices/types/__init__.py +45 -0
- cartesia/voices/types/base_voice_id.py +5 -0
- cartesia/voices/types/clone_mode.py +5 -0
- cartesia/voices/types/create_voice_request.py +34 -0
- cartesia/voices/types/embedding_response.py +20 -0
- cartesia/voices/types/embedding_specifier.py +22 -0
- cartesia/voices/types/gender.py +5 -0
- cartesia/voices/types/id_specifier.py +22 -0
- cartesia/voices/types/localize_dialect.py +8 -0
- cartesia/voices/types/localize_english_dialect.py +5 -0
- cartesia/voices/types/localize_portuguese_dialect.py +5 -0
- cartesia/voices/types/localize_spanish_dialect.py +5 -0
- cartesia/voices/types/localize_target_language.py +7 -0
- cartesia/voices/types/localize_voice_request.py +26 -0
- cartesia/voices/types/mix_voice_specifier.py +7 -0
- cartesia/voices/types/mix_voices_request.py +20 -0
- cartesia/voices/types/update_voice_request.py +27 -0
- cartesia/voices/types/voice.py +50 -0
- cartesia/voices/types/voice_id.py +3 -0
- cartesia/voices/types/voice_metadata.py +48 -0
- cartesia/voices/types/weight.py +3 -0
- cartesia-2.0.0a2.dist-info/METADATA +307 -0
- cartesia-2.0.0a2.dist-info/RECORD +160 -0
- {cartesia-1.4.0.dist-info → cartesia-2.0.0a2.dist-info}/WHEEL +1 -1
- cartesia/_async_sse.py +0 -95
- cartesia/_logger.py +0 -3
- cartesia/_sse.py +0 -143
- cartesia/_types.py +0 -70
- cartesia/_websocket.py +0 -358
- cartesia/async_client.py +0 -82
- cartesia/async_tts.py +0 -176
- cartesia/resource.py +0 -44
- cartesia/tts.py +0 -292
- cartesia/utils/deprecated.py +0 -55
- cartesia/utils/retry.py +0 -87
- cartesia/utils/tts.py +0 -78
- cartesia/voices.py +0 -204
- cartesia-1.4.0.dist-info/METADATA +0 -663
- cartesia-1.4.0.dist-info/RECORD +0 -23
- cartesia-1.4.0.dist-info/licenses/LICENSE.md +0 -21
- /cartesia/{utils/__init__.py → py.typed} +0 -0
- /cartesia/{_constants.py → tts/utils/constants.py} +0 -0
cartesia/tts/client.py
ADDED
@@ -0,0 +1,407 @@
|
|
1
|
+
# This file was auto-generated by Fern from our API Definition.
|
2
|
+
|
3
|
+
import typing
|
4
|
+
from ..core.client_wrapper import SyncClientWrapper
|
5
|
+
from .requests.tts_request_voice_specifier import TtsRequestVoiceSpecifierParams
|
6
|
+
from .requests.output_format import OutputFormatParams
|
7
|
+
from .types.supported_language import SupportedLanguage
|
8
|
+
from ..core.request_options import RequestOptions
|
9
|
+
from ..core.serialization import convert_and_respect_annotation_metadata
|
10
|
+
from json.decoder import JSONDecodeError
|
11
|
+
from ..core.api_error import ApiError
|
12
|
+
from .types.web_socket_response import WebSocketResponse
|
13
|
+
import httpx_sse
|
14
|
+
from ..core.pydantic_utilities import parse_obj_as
|
15
|
+
import json
|
16
|
+
from ..core.client_wrapper import AsyncClientWrapper
|
17
|
+
|
18
|
+
# this is used as the default value for optional parameters
|
19
|
+
OMIT = typing.cast(typing.Any, ...)
|
20
|
+
|
21
|
+
|
22
|
+
class TtsClient:
    """Synchronous client for the Cartesia TTS HTTP endpoints.

    Wraps a ``SyncClientWrapper`` and exposes streaming generation via
    ``POST tts/bytes`` (raw audio bytes) and ``POST tts/sse``
    (server-sent events carrying ``WebSocketResponse`` payloads).
    """

    def __init__(self, *, client_wrapper: SyncClientWrapper):
        self._client_wrapper = client_wrapper

    def bytes(
        self,
        *,
        model_id: str,
        transcript: str,
        voice: TtsRequestVoiceSpecifierParams,
        output_format: OutputFormatParams,
        language: typing.Optional[SupportedLanguage] = OMIT,
        duration: typing.Optional[float] = OMIT,
        request_options: typing.Optional[RequestOptions] = None,
    ) -> typing.Iterator[bytes]:
        """Stream generated audio as raw bytes from ``POST tts/bytes``.

        Parameters
        ----------
        model_id : str
            The ID of the model to use for the generation. See [Models](/build-with-sonic/models) for available models.

        transcript : str

        voice : TtsRequestVoiceSpecifierParams

        output_format : OutputFormatParams

        language : typing.Optional[SupportedLanguage]

        duration : typing.Optional[float]
            The maximum duration of the audio in seconds. You do not usually need to specify this.
            If the duration is not appropriate for the length of the transcript, the output audio may be truncated.

        request_options : typing.Optional[RequestOptions]
            Request-specific configuration. You can pass in configuration such as `chunk_size`, and more to customize the request and response.

        Yields
        ------
        typing.Iterator[bytes]
            Chunks of encoded audio.

        Raises
        ------
        ApiError
            If the server responds with a non-2xx status code.

        Examples
        --------
        from cartesia import Cartesia

        client = Cartesia(
            api_key="YOUR_API_KEY",
        )
        client.tts.bytes(
            model_id="sonic",
            transcript="Hello, world!",
            voice={"mode": "id", "id": "694f9389-aac1-45b6-b726-9d9369183238"},
            language="en",
            output_format={
                "sample_rate": 44100,
                "bit_rate": 128000,
                "container": "mp3",
            },
        )
        """
        with self._client_wrapper.httpx_client.stream(
            "tts/bytes",
            method="POST",
            json={
                "model_id": model_id,
                "transcript": transcript,
                "voice": convert_and_respect_annotation_metadata(
                    object_=voice, annotation=TtsRequestVoiceSpecifierParams, direction="write"
                ),
                "language": language,
                "output_format": convert_and_respect_annotation_metadata(
                    object_=output_format, annotation=OutputFormatParams, direction="write"
                ),
                "duration": duration,
            },
            request_options=request_options,
            omit=OMIT,
        ) as _response:
            try:
                if 200 <= _response.status_code < 300:
                    # Honor a caller-supplied chunk_size when iterating the body.
                    _chunk_size = request_options.get("chunk_size", None) if request_options is not None else None
                    for _chunk in _response.iter_bytes(chunk_size=_chunk_size):
                        yield _chunk
                    return
                # Error path: materialize the body so it can be reported.
                _response.read()
                _response_json = _response.json()
            except JSONDecodeError:
                raise ApiError(status_code=_response.status_code, body=_response.text)
            raise ApiError(status_code=_response.status_code, body=_response_json)

    def sse(
        self,
        *,
        model_id: str,
        transcript: str,
        voice: TtsRequestVoiceSpecifierParams,
        output_format: OutputFormatParams,
        language: typing.Optional[SupportedLanguage] = OMIT,
        duration: typing.Optional[float] = OMIT,
        request_options: typing.Optional[RequestOptions] = None,
    ) -> typing.Iterator[WebSocketResponse]:
        """Stream generation events from ``POST tts/sse``.

        Parameters
        ----------
        model_id : str
            The ID of the model to use for the generation. See [Models](/build-with-sonic/models) for available models.

        transcript : str

        voice : TtsRequestVoiceSpecifierParams

        output_format : OutputFormatParams

        language : typing.Optional[SupportedLanguage]

        duration : typing.Optional[float]
            The maximum duration of the audio in seconds. You do not usually need to specify this.
            If the duration is not appropriate for the length of the transcript, the output audio may be truncated.

        request_options : typing.Optional[RequestOptions]
            Request-specific configuration.

        Yields
        ------
        typing.Iterator[WebSocketResponse]
            Parsed server-sent events; events that fail to parse are skipped.

        Raises
        ------
        ApiError
            If the server responds with a non-2xx status code.
        """
        with self._client_wrapper.httpx_client.stream(
            "tts/sse",
            method="POST",
            json={
                "model_id": model_id,
                "transcript": transcript,
                "voice": convert_and_respect_annotation_metadata(
                    object_=voice, annotation=TtsRequestVoiceSpecifierParams, direction="write"
                ),
                "language": language,
                "output_format": convert_and_respect_annotation_metadata(
                    object_=output_format, annotation=OutputFormatParams, direction="write"
                ),
                "duration": duration,
            },
            request_options=request_options,
            omit=OMIT,
        ) as _response:
            try:
                if 200 <= _response.status_code < 300:
                    _event_source = httpx_sse.EventSource(_response)
                    for _sse in _event_source.iter_sse():
                        # Parse before yielding so that only parse failures are
                        # skipped. The previous bare `except:` wrapped the yield
                        # itself, which also swallowed GeneratorExit (raised when
                        # the consumer closes the generator mid-stream) and then
                        # re-yielded, triggering a RuntimeError.
                        try:
                            _parsed = typing.cast(
                                WebSocketResponse,
                                parse_obj_as(
                                    type_=WebSocketResponse,  # type: ignore
                                    object_=json.loads(_sse.data),
                                ),
                            )
                        except Exception:
                            # Best-effort streaming: drop unparseable events.
                            continue
                        yield _parsed
                    return
                _response.read()
                _response_json = _response.json()
            except JSONDecodeError:
                raise ApiError(status_code=_response.status_code, body=_response.text)
            raise ApiError(status_code=_response.status_code, body=_response_json)
206
|
+
|
207
|
+
|
208
|
+
class AsyncTtsClient:
    """Asynchronous client for the Cartesia TTS HTTP endpoints.

    Async counterpart of ``TtsClient``: wraps an ``AsyncClientWrapper`` and
    exposes ``POST tts/bytes`` and ``POST tts/sse`` as async generators.
    """

    def __init__(self, *, client_wrapper: AsyncClientWrapper):
        self._client_wrapper = client_wrapper

    async def bytes(
        self,
        *,
        model_id: str,
        transcript: str,
        voice: TtsRequestVoiceSpecifierParams,
        output_format: OutputFormatParams,
        language: typing.Optional[SupportedLanguage] = OMIT,
        duration: typing.Optional[float] = OMIT,
        request_options: typing.Optional[RequestOptions] = None,
    ) -> typing.AsyncIterator[bytes]:
        """Stream generated audio as raw bytes from ``POST tts/bytes``.

        Parameters
        ----------
        model_id : str
            The ID of the model to use for the generation. See [Models](/build-with-sonic/models) for available models.

        transcript : str

        voice : TtsRequestVoiceSpecifierParams

        output_format : OutputFormatParams

        language : typing.Optional[SupportedLanguage]

        duration : typing.Optional[float]
            The maximum duration of the audio in seconds. You do not usually need to specify this.
            If the duration is not appropriate for the length of the transcript, the output audio may be truncated.

        request_options : typing.Optional[RequestOptions]
            Request-specific configuration. You can pass in configuration such as `chunk_size`, and more to customize the request and response.

        Yields
        ------
        typing.AsyncIterator[bytes]
            Chunks of encoded audio.

        Raises
        ------
        ApiError
            If the server responds with a non-2xx status code.

        Examples
        --------
        import asyncio

        from cartesia import AsyncCartesia

        client = AsyncCartesia(
            api_key="YOUR_API_KEY",
        )


        async def main() -> None:
            await client.tts.bytes(
                model_id="sonic",
                transcript="Hello, world!",
                voice={"mode": "id", "id": "694f9389-aac1-45b6-b726-9d9369183238"},
                language="en",
                output_format={
                    "sample_rate": 44100,
                    "bit_rate": 128000,
                    "container": "mp3",
                },
            )


        asyncio.run(main())
        """
        async with self._client_wrapper.httpx_client.stream(
            "tts/bytes",
            method="POST",
            json={
                "model_id": model_id,
                "transcript": transcript,
                "voice": convert_and_respect_annotation_metadata(
                    object_=voice, annotation=TtsRequestVoiceSpecifierParams, direction="write"
                ),
                "language": language,
                "output_format": convert_and_respect_annotation_metadata(
                    object_=output_format, annotation=OutputFormatParams, direction="write"
                ),
                "duration": duration,
            },
            request_options=request_options,
            omit=OMIT,
        ) as _response:
            try:
                if 200 <= _response.status_code < 300:
                    # Honor a caller-supplied chunk_size when iterating the body.
                    _chunk_size = request_options.get("chunk_size", None) if request_options is not None else None
                    async for _chunk in _response.aiter_bytes(chunk_size=_chunk_size):
                        yield _chunk
                    return
                # Error path: materialize the body so it can be reported.
                await _response.aread()
                _response_json = _response.json()
            except JSONDecodeError:
                raise ApiError(status_code=_response.status_code, body=_response.text)
            raise ApiError(status_code=_response.status_code, body=_response_json)

    async def sse(
        self,
        *,
        model_id: str,
        transcript: str,
        voice: TtsRequestVoiceSpecifierParams,
        output_format: OutputFormatParams,
        language: typing.Optional[SupportedLanguage] = OMIT,
        duration: typing.Optional[float] = OMIT,
        request_options: typing.Optional[RequestOptions] = None,
    ) -> typing.AsyncIterator[WebSocketResponse]:
        """Stream generation events from ``POST tts/sse``.

        Parameters
        ----------
        model_id : str
            The ID of the model to use for the generation. See [Models](/build-with-sonic/models) for available models.

        transcript : str

        voice : TtsRequestVoiceSpecifierParams

        output_format : OutputFormatParams

        language : typing.Optional[SupportedLanguage]

        duration : typing.Optional[float]
            The maximum duration of the audio in seconds. You do not usually need to specify this.
            If the duration is not appropriate for the length of the transcript, the output audio may be truncated.

        request_options : typing.Optional[RequestOptions]
            Request-specific configuration.

        Yields
        ------
        typing.AsyncIterator[WebSocketResponse]
            Parsed server-sent events; events that fail to parse are skipped.

        Raises
        ------
        ApiError
            If the server responds with a non-2xx status code.
        """
        async with self._client_wrapper.httpx_client.stream(
            "tts/sse",
            method="POST",
            json={
                "model_id": model_id,
                "transcript": transcript,
                "voice": convert_and_respect_annotation_metadata(
                    object_=voice, annotation=TtsRequestVoiceSpecifierParams, direction="write"
                ),
                "language": language,
                "output_format": convert_and_respect_annotation_metadata(
                    object_=output_format, annotation=OutputFormatParams, direction="write"
                ),
                "duration": duration,
            },
            request_options=request_options,
            omit=OMIT,
        ) as _response:
            try:
                if 200 <= _response.status_code < 300:
                    _event_source = httpx_sse.EventSource(_response)
                    async for _sse in _event_source.aiter_sse():
                        # Parse before yielding so that only parse failures are
                        # skipped. The previous bare `except:` wrapped the yield
                        # itself, which also swallowed GeneratorExit (raised when
                        # the consumer closes the generator mid-stream) and then
                        # re-yielded, triggering a RuntimeError.
                        try:
                            _parsed = typing.cast(
                                WebSocketResponse,
                                parse_obj_as(
                                    type_=WebSocketResponse,  # type: ignore
                                    object_=json.loads(_sse.data),
                                ),
                            )
                        except Exception:
                            # Best-effort streaming: drop unparseable events.
                            continue
                        yield _parsed
                    return
                await _response.aread()
                _response_json = _response.json()
            except JSONDecodeError:
                raise ApiError(status_code=_response.status_code, body=_response.text)
            raise ApiError(status_code=_response.status_code, body=_response_json)
@@ -0,0 +1,76 @@
|
|
1
|
+
# This file was auto-generated by Fern from our API Definition.
|
2
|
+
|
3
|
+
from .cancel_context_request import CancelContextRequestParams
|
4
|
+
from .controls import ControlsParams
|
5
|
+
from .generation_request import GenerationRequestParams
|
6
|
+
from .mp_3_output_format import Mp3OutputFormatParams
|
7
|
+
from .output_format import OutputFormatParams, OutputFormat_Mp3Params, OutputFormat_RawParams, OutputFormat_WavParams
|
8
|
+
from .phoneme_timestamps import PhonemeTimestampsParams
|
9
|
+
from .raw_output_format import RawOutputFormatParams
|
10
|
+
from .speed import SpeedParams
|
11
|
+
from .tts_request import TtsRequestParams
|
12
|
+
from .tts_request_embedding_specifier import TtsRequestEmbeddingSpecifierParams
|
13
|
+
from .tts_request_id_specifier import TtsRequestIdSpecifierParams
|
14
|
+
from .tts_request_voice_specifier import TtsRequestVoiceSpecifierParams
|
15
|
+
from .wav_output_format import WavOutputFormatParams
|
16
|
+
from .web_socket_base_response import WebSocketBaseResponseParams
|
17
|
+
from .web_socket_chunk_response import WebSocketChunkResponseParams
|
18
|
+
from .web_socket_done_response import WebSocketDoneResponseParams
|
19
|
+
from .web_socket_error_response import WebSocketErrorResponseParams
|
20
|
+
from .web_socket_flush_done_response import WebSocketFlushDoneResponseParams
|
21
|
+
from .web_socket_phoneme_timestamps_response import WebSocketPhonemeTimestampsResponseParams
|
22
|
+
from .web_socket_raw_output_format import WebSocketRawOutputFormatParams
|
23
|
+
from .web_socket_request import WebSocketRequestParams
|
24
|
+
from .web_socket_response import (
|
25
|
+
WebSocketResponseParams,
|
26
|
+
WebSocketResponse_ChunkParams,
|
27
|
+
WebSocketResponse_DoneParams,
|
28
|
+
WebSocketResponse_ErrorParams,
|
29
|
+
WebSocketResponse_FlushDoneParams,
|
30
|
+
WebSocketResponse_PhonemeTimestampsParams,
|
31
|
+
WebSocketResponse_TimestampsParams,
|
32
|
+
)
|
33
|
+
from .web_socket_stream_options import WebSocketStreamOptionsParams
|
34
|
+
from .web_socket_timestamps_response import WebSocketTimestampsResponseParams
|
35
|
+
from .web_socket_tts_output import WebSocketTtsOutputParams
|
36
|
+
from .web_socket_tts_request import WebSocketTtsRequestParams
|
37
|
+
from .word_timestamps import WordTimestampsParams
|
38
|
+
|
39
|
+
__all__ = [
|
40
|
+
"CancelContextRequestParams",
|
41
|
+
"ControlsParams",
|
42
|
+
"GenerationRequestParams",
|
43
|
+
"Mp3OutputFormatParams",
|
44
|
+
"OutputFormatParams",
|
45
|
+
"OutputFormat_Mp3Params",
|
46
|
+
"OutputFormat_RawParams",
|
47
|
+
"OutputFormat_WavParams",
|
48
|
+
"PhonemeTimestampsParams",
|
49
|
+
"RawOutputFormatParams",
|
50
|
+
"SpeedParams",
|
51
|
+
"TtsRequestEmbeddingSpecifierParams",
|
52
|
+
"TtsRequestIdSpecifierParams",
|
53
|
+
"TtsRequestParams",
|
54
|
+
"TtsRequestVoiceSpecifierParams",
|
55
|
+
"WavOutputFormatParams",
|
56
|
+
"WebSocketBaseResponseParams",
|
57
|
+
"WebSocketChunkResponseParams",
|
58
|
+
"WebSocketDoneResponseParams",
|
59
|
+
"WebSocketErrorResponseParams",
|
60
|
+
"WebSocketFlushDoneResponseParams",
|
61
|
+
"WebSocketPhonemeTimestampsResponseParams",
|
62
|
+
"WebSocketRawOutputFormatParams",
|
63
|
+
"WebSocketRequestParams",
|
64
|
+
"WebSocketResponseParams",
|
65
|
+
"WebSocketResponse_ChunkParams",
|
66
|
+
"WebSocketResponse_DoneParams",
|
67
|
+
"WebSocketResponse_ErrorParams",
|
68
|
+
"WebSocketResponse_FlushDoneParams",
|
69
|
+
"WebSocketResponse_PhonemeTimestampsParams",
|
70
|
+
"WebSocketResponse_TimestampsParams",
|
71
|
+
"WebSocketStreamOptionsParams",
|
72
|
+
"WebSocketTimestampsResponseParams",
|
73
|
+
"WebSocketTtsOutputParams",
|
74
|
+
"WebSocketTtsRequestParams",
|
75
|
+
"WordTimestampsParams",
|
76
|
+
]
|
@@ -0,0 +1,17 @@
|
|
1
|
+
# This file was auto-generated by Fern from our API Definition.
|
2
|
+
|
3
|
+
import typing_extensions
|
4
|
+
from ..types.context_id import ContextId
|
5
|
+
import typing
|
6
|
+
|
7
|
+
|
8
|
+
class CancelContextRequestParams(typing_extensions.TypedDict):
    """Request payload that cancels an in-flight generation context."""

    context_id: ContextId
    """
    The ID of the context to cancel.
    """

    cancel: typing.Literal[True]
    """
    Whether to cancel the context, so that no more messages are generated for that context.
    """
|
@@ -0,0 +1,11 @@
|
|
1
|
+
# This file was auto-generated by Fern from our API Definition.
|
2
|
+
|
3
|
+
import typing_extensions
|
4
|
+
from .speed import SpeedParams
|
5
|
+
import typing
|
6
|
+
from ..types.emotion import Emotion
|
7
|
+
|
8
|
+
|
9
|
+
class ControlsParams(typing_extensions.TypedDict):
    """Voice controls attached to a voice specifier.

    NOTE(review): referenced elsewhere under the alias
    ``__experimental_controls``, so this appears to be an experimental API
    surface — confirm before relying on it.
    """

    # Either a numeric or a named-preset speed value (see SpeedParams).
    speed: SpeedParams
    # One or more emotion tags to apply to the generated speech.
    emotion: typing.Sequence[Emotion]
|
@@ -0,0 +1,53 @@
|
|
1
|
+
# This file was auto-generated by Fern from our API Definition.
|
2
|
+
|
3
|
+
import typing_extensions
|
4
|
+
import typing
|
5
|
+
from .tts_request_voice_specifier import TtsRequestVoiceSpecifierParams
|
6
|
+
import typing_extensions
|
7
|
+
from ..types.supported_language import SupportedLanguage
|
8
|
+
from .web_socket_raw_output_format import WebSocketRawOutputFormatParams
|
9
|
+
from ..types.context_id import ContextId
|
10
|
+
from ...core.serialization import FieldMetadata
|
11
|
+
|
12
|
+
|
13
|
+
class GenerationRequestParams(typing_extensions.TypedDict):
    """Payload for a single websocket generation request.

    Serialized with ``FieldMetadata`` aliases (``continue_`` is sent on the
    wire as ``continue``).
    """

    model_id: str
    """
    The ID of the model to use for the generation. See [Models](/build-with-sonic/models) for available models.
    """

    transcript: typing.Optional[typing.Any]
    """
    The transcript to generate speech for. This can be a string or an iterator over strings.
    """

    voice: TtsRequestVoiceSpecifierParams
    # Optional language hint; omitted from the payload when not provided.
    language: typing_extensions.NotRequired[SupportedLanguage]
    # Websocket generation only supports raw output formats.
    output_format: WebSocketRawOutputFormatParams
    duration: typing_extensions.NotRequired[float]
    """
    The maximum duration of the audio in seconds. You do not usually need to specify this.
    If the duration is not appropriate for the length of the transcript, the output audio may be truncated.
    """

    # Groups multiple inputs into one continuous generation stream.
    context_id: typing_extensions.NotRequired[ContextId]
    # Aliased to the reserved word "continue" on the wire via FieldMetadata.
    continue_: typing_extensions.NotRequired[typing_extensions.Annotated[bool, FieldMetadata(alias="continue")]]
    """
    Whether this input may be followed by more inputs.
    If not specified, this defaults to `false`.
    """

    flush: typing_extensions.NotRequired[bool]
    """
    Whether to flush the context.
    """

    add_timestamps: typing_extensions.NotRequired[bool]
    """
    Whether to return word-level timestamps.
    """

    add_phoneme_timestamps: typing_extensions.NotRequired[bool]
    """
    Whether to return phoneme-level timestamps.
    """
|
@@ -0,0 +1,11 @@
|
|
1
|
+
# This file was auto-generated by Fern from our API Definition.
|
2
|
+
|
3
|
+
import typing_extensions
|
4
|
+
|
5
|
+
|
6
|
+
class Mp3OutputFormatParams(typing_extensions.TypedDict):
    """Parameters for MP3-encoded audio output."""

    # Output sample rate in Hz.
    sample_rate: int
    bit_rate: int
    """
    The bit rate of the audio in bits per second. Supported bit rates are 32000, 64000, 96000, 128000, 192000.
    """
|
@@ -0,0 +1,30 @@
|
|
1
|
+
# This file was auto-generated by Fern from our API Definition.
|
2
|
+
|
3
|
+
from __future__ import annotations
|
4
|
+
import typing_extensions
|
5
|
+
import typing
|
6
|
+
from ..types.raw_encoding import RawEncoding
|
7
|
+
import typing_extensions
|
8
|
+
|
9
|
+
|
10
|
+
class OutputFormat_RawParams(typing_extensions.TypedDict):
    """Output format variant for raw (headerless) audio (`container == "raw"`)."""

    container: typing.Literal["raw"]
    encoding: RawEncoding
    sample_rate: int
    bit_rate: typing_extensions.NotRequired[int]


class OutputFormat_WavParams(typing_extensions.TypedDict):
    """Output format variant for WAV-container audio (`container == "wav"`)."""

    container: typing.Literal["wav"]
    encoding: RawEncoding
    sample_rate: int
    bit_rate: typing_extensions.NotRequired[int]


class OutputFormat_Mp3Params(typing_extensions.TypedDict):
    """Output format variant for MP3-container audio (`container == "mp3"`).

    Unlike the raw/wav variants, `bit_rate` is required and there is no
    `encoding` field.
    """

    container: typing.Literal["mp3"]
    sample_rate: int
    bit_rate: int


# Union discriminated by the literal `container` field of each variant.
OutputFormatParams = typing.Union[OutputFormat_RawParams, OutputFormat_WavParams, OutputFormat_Mp3Params]
|
@@ -0,0 +1,10 @@
|
|
1
|
+
# This file was auto-generated by Fern from our API Definition.
|
2
|
+
|
3
|
+
import typing_extensions
|
4
|
+
import typing
|
5
|
+
|
6
|
+
|
7
|
+
class PhonemeTimestampsParams(typing_extensions.TypedDict):
    """Phoneme-level timing data for generated speech.

    NOTE(review): the three sequences are presumably index-aligned (one
    start/end pair per phoneme) — confirm against the API definition.
    """

    phonemes: typing.Sequence[str]
    start: typing.Sequence[float]
    end: typing.Sequence[float]
|
@@ -0,0 +1,11 @@
|
|
1
|
+
# This file was auto-generated by Fern from our API Definition.
|
2
|
+
|
3
|
+
import typing_extensions
|
4
|
+
from ..types.raw_encoding import RawEncoding
|
5
|
+
import typing_extensions
|
6
|
+
|
7
|
+
|
8
|
+
class RawOutputFormatParams(typing_extensions.TypedDict):
    """Parameters for raw (headerless) audio output."""

    # Sample encoding (see RawEncoding for the allowed values).
    encoding: RawEncoding
    # Output sample rate in Hz.
    sample_rate: int
    bit_rate: typing_extensions.NotRequired[int]
|
@@ -0,0 +1,7 @@
|
|
1
|
+
# This file was auto-generated by Fern from our API Definition.
|
2
|
+
|
3
|
+
import typing
|
4
|
+
from ..types.numerical_specifier import NumericalSpecifier
|
5
|
+
from ..types.natural_specifier import NaturalSpecifier
|
6
|
+
|
7
|
+
# Speed may be supplied either as a number (NumericalSpecifier) or as a
# named preset string (NaturalSpecifier).
SpeedParams = typing.Union[NumericalSpecifier, NaturalSpecifier]
|
@@ -0,0 +1,24 @@
|
|
1
|
+
# This file was auto-generated by Fern from our API Definition.
|
2
|
+
|
3
|
+
import typing_extensions
|
4
|
+
from .tts_request_voice_specifier import TtsRequestVoiceSpecifierParams
|
5
|
+
import typing_extensions
|
6
|
+
from ..types.supported_language import SupportedLanguage
|
7
|
+
from .output_format import OutputFormatParams
|
8
|
+
|
9
|
+
|
10
|
+
class TtsRequestParams(typing_extensions.TypedDict):
    """Request body for the HTTP TTS endpoints (`tts/bytes`, `tts/sse`)."""

    model_id: str
    """
    The ID of the model to use for the generation. See [Models](/build-with-sonic/models) for available models.
    """

    transcript: str
    voice: TtsRequestVoiceSpecifierParams
    # Optional language hint; omitted from the payload when not provided.
    language: typing_extensions.NotRequired[SupportedLanguage]
    output_format: OutputFormatParams
    duration: typing_extensions.NotRequired[float]
    """
    The maximum duration of the audio in seconds. You do not usually need to specify this.
    If the duration is not appropriate for the length of the transcript, the output audio may be truncated.
    """
|
@@ -0,0 +1,16 @@
|
|
1
|
+
# This file was auto-generated by Fern from our API Definition.
|
2
|
+
|
3
|
+
import typing_extensions
|
4
|
+
import typing
|
5
|
+
from ...embedding.types.embedding import Embedding
|
6
|
+
import typing_extensions
|
7
|
+
from .controls import ControlsParams
|
8
|
+
from ...core.serialization import FieldMetadata
|
9
|
+
|
10
|
+
|
11
|
+
class TtsRequestEmbeddingSpecifierParams(typing_extensions.TypedDict):
    """Selects a voice by raw embedding (`mode == "embedding"`)."""

    mode: typing.Literal["embedding"]
    embedding: Embedding
    # Serialized on the wire as "__experimental_controls" via FieldMetadata.
    experimental_controls: typing_extensions.NotRequired[
        typing_extensions.Annotated[ControlsParams, FieldMetadata(alias="__experimental_controls")]
    ]
|