cartesia 2.0.0b7__py3-none-any.whl → 2.0.0b8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cartesia/__init__.py +6 -0
- cartesia/core/client_wrapper.py +1 -1
- cartesia/tts/_async_websocket.py +5 -0
- cartesia/tts/_websocket.py +8 -0
- cartesia/voices/__init__.py +6 -0
- cartesia/voices/client.py +208 -159
- cartesia/voices/requests/create_voice_request.py +2 -0
- cartesia/voices/requests/localize_dialect.py +6 -1
- cartesia/voices/requests/localize_voice_request.py +15 -2
- cartesia/voices/types/__init__.py +6 -0
- cartesia/voices/types/create_voice_request.py +2 -0
- cartesia/voices/types/localize_dialect.py +6 -1
- cartesia/voices/types/localize_french_dialect.py +5 -0
- cartesia/voices/types/localize_portuguese_dialect.py +5 -0
- cartesia/voices/types/localize_spanish_dialect.py +5 -0
- cartesia/voices/types/localize_voice_request.py +16 -3
- {cartesia-2.0.0b7.dist-info → cartesia-2.0.0b8.dist-info}/METADATA +68 -63
- {cartesia-2.0.0b7.dist-info → cartesia-2.0.0b8.dist-info}/RECORD +19 -16
- {cartesia-2.0.0b7.dist-info → cartesia-2.0.0b8.dist-info}/WHEEL +0 -0
cartesia/__init__.py
CHANGED
@@ -129,6 +129,9 @@ from .voices import (
|
|
129
129
|
LocalizeDialect,
|
130
130
|
LocalizeDialectParams,
|
131
131
|
LocalizeEnglishDialect,
|
132
|
+
LocalizeFrenchDialect,
|
133
|
+
LocalizePortugueseDialect,
|
134
|
+
LocalizeSpanishDialect,
|
132
135
|
LocalizeTargetLanguage,
|
133
136
|
LocalizeVoiceRequest,
|
134
137
|
LocalizeVoiceRequestParams,
|
@@ -187,6 +190,9 @@ __all__ = [
|
|
187
190
|
"LocalizeDialect",
|
188
191
|
"LocalizeDialectParams",
|
189
192
|
"LocalizeEnglishDialect",
|
193
|
+
"LocalizeFrenchDialect",
|
194
|
+
"LocalizePortugueseDialect",
|
195
|
+
"LocalizeSpanishDialect",
|
190
196
|
"LocalizeTargetLanguage",
|
191
197
|
"LocalizeVoiceRequest",
|
192
198
|
"LocalizeVoiceRequestParams",
|
cartesia/core/client_wrapper.py
CHANGED
@@ -16,7 +16,7 @@ class BaseClientWrapper:
|
|
16
16
|
headers: typing.Dict[str, str] = {
|
17
17
|
"X-Fern-Language": "Python",
|
18
18
|
"X-Fern-SDK-Name": "cartesia",
|
19
|
-
"X-Fern-SDK-Version": "2.0.0b7",
|
19
|
+
"X-Fern-SDK-Version": "2.0.0b8",
|
20
20
|
}
|
21
21
|
headers["X-API-Key"] = self.api_key
|
22
22
|
headers["Cartesia-Version"] = "2024-11-13"
|
cartesia/tts/_async_websocket.py
CHANGED
@@ -69,6 +69,7 @@ class _AsyncTTSContext:
|
|
69
69
|
stream: bool = True,
|
70
70
|
add_timestamps: bool = False,
|
71
71
|
add_phoneme_timestamps: bool = False,
|
72
|
+
use_original_timestamps: bool = False,
|
72
73
|
continue_: bool = False,
|
73
74
|
flush: bool = False,
|
74
75
|
) -> None:
|
@@ -106,6 +107,8 @@ class _AsyncTTSContext:
|
|
106
107
|
request_body["add_timestamps"] = add_timestamps
|
107
108
|
if add_phoneme_timestamps:
|
108
109
|
request_body["add_phoneme_timestamps"] = add_phoneme_timestamps
|
110
|
+
if use_original_timestamps:
|
111
|
+
request_body["use_original_timestamps"] = use_original_timestamps
|
109
112
|
if continue_:
|
110
113
|
request_body["continue"] = continue_
|
111
114
|
if flush:
|
@@ -367,6 +370,7 @@ class AsyncTtsWebsocket(TtsWebsocket):
|
|
367
370
|
stream: bool = True,
|
368
371
|
add_timestamps: bool = False,
|
369
372
|
add_phoneme_timestamps: bool = False,
|
373
|
+
use_original_timestamps: bool = False,
|
370
374
|
):
|
371
375
|
"""See :meth:`_WebSocket.send` for details."""
|
372
376
|
if context_id is None:
|
@@ -385,6 +389,7 @@ class AsyncTtsWebsocket(TtsWebsocket):
|
|
385
389
|
continue_=False,
|
386
390
|
add_timestamps=add_timestamps,
|
387
391
|
add_phoneme_timestamps=add_phoneme_timestamps,
|
392
|
+
use_original_timestamps=use_original_timestamps,
|
388
393
|
)
|
389
394
|
|
390
395
|
generator = ctx.receive()
|
cartesia/tts/_websocket.py
CHANGED
@@ -67,6 +67,8 @@ class _TTSContext:
|
|
67
67
|
language: Optional[str] = None,
|
68
68
|
stream: bool = True,
|
69
69
|
add_timestamps: bool = False,
|
70
|
+
add_phoneme_timestamps: bool = False,
|
71
|
+
use_original_timestamps: bool = False,
|
70
72
|
) -> Generator[bytes, None, None]:
|
71
73
|
"""Send audio generation requests to the WebSocket and yield responses.
|
72
74
|
|
@@ -102,6 +104,10 @@ class _TTSContext:
|
|
102
104
|
request_body["stream"] = stream
|
103
105
|
if add_timestamps:
|
104
106
|
request_body["add_timestamps"] = add_timestamps
|
107
|
+
if add_phoneme_timestamps:
|
108
|
+
request_body["add_phoneme_timestamps"] = add_phoneme_timestamps
|
109
|
+
if use_original_timestamps:
|
110
|
+
request_body["use_original_timestamps"] = use_original_timestamps
|
105
111
|
|
106
112
|
if (
|
107
113
|
"context_id" in request_body
|
@@ -354,6 +360,7 @@ class TtsWebsocket:
|
|
354
360
|
stream: bool = True,
|
355
361
|
add_timestamps: bool = False,
|
356
362
|
add_phoneme_timestamps: bool = False,
|
363
|
+
use_original_timestamps: bool = False,
|
357
364
|
):
|
358
365
|
"""Send a request to the WebSocket to generate audio.
|
359
366
|
|
@@ -384,6 +391,7 @@ class TtsWebsocket:
|
|
384
391
|
"stream": stream,
|
385
392
|
"add_timestamps": add_timestamps,
|
386
393
|
"add_phoneme_timestamps": add_phoneme_timestamps,
|
394
|
+
"use_original_timestamps": use_original_timestamps,
|
387
395
|
}
|
388
396
|
generator = self._websocket_generator(request_body)
|
389
397
|
|
cartesia/voices/__init__.py
CHANGED
@@ -12,6 +12,9 @@ from .types import (
|
|
12
12
|
IdSpecifier,
|
13
13
|
LocalizeDialect,
|
14
14
|
LocalizeEnglishDialect,
|
15
|
+
LocalizeFrenchDialect,
|
16
|
+
LocalizePortugueseDialect,
|
17
|
+
LocalizeSpanishDialect,
|
15
18
|
LocalizeTargetLanguage,
|
16
19
|
LocalizeVoiceRequest,
|
17
20
|
MixVoiceSpecifier,
|
@@ -56,6 +59,9 @@ __all__ = [
|
|
56
59
|
"LocalizeDialect",
|
57
60
|
"LocalizeDialectParams",
|
58
61
|
"LocalizeEnglishDialect",
|
62
|
+
"LocalizeFrenchDialect",
|
63
|
+
"LocalizePortugueseDialect",
|
64
|
+
"LocalizeSpanishDialect",
|
59
65
|
"LocalizeTargetLanguage",
|
60
66
|
"LocalizeVoiceRequest",
|
61
67
|
"LocalizeVoiceRequestParams",
|
cartesia/voices/client.py
CHANGED
@@ -11,19 +11,20 @@ from .types.get_voices_response import GetVoicesResponse
|
|
11
11
|
from ..core.pydantic_utilities import parse_obj_as
|
12
12
|
from json.decoder import JSONDecodeError
|
13
13
|
from ..core.api_error import ApiError
|
14
|
-
from ..
|
14
|
+
from .. import core
|
15
15
|
from ..tts.types.supported_language import SupportedLanguage
|
16
|
+
from .types.clone_mode import CloneMode
|
17
|
+
from .types.voice_metadata import VoiceMetadata
|
16
18
|
from .types.voice_id import VoiceId
|
17
19
|
from ..core.jsonable_encoder import jsonable_encoder
|
18
20
|
from .types.localize_target_language import LocalizeTargetLanguage
|
19
21
|
from .types.gender import Gender
|
20
22
|
from .requests.localize_dialect import LocalizeDialectParams
|
21
|
-
from .types.embedding_response import EmbeddingResponse
|
22
23
|
from ..core.serialization import convert_and_respect_annotation_metadata
|
23
24
|
from .requests.mix_voice_specifier import MixVoiceSpecifierParams
|
24
|
-
from
|
25
|
-
from .types.
|
26
|
-
from .types.
|
25
|
+
from .types.embedding_response import EmbeddingResponse
|
26
|
+
from ..embedding.types.embedding import Embedding
|
27
|
+
from .types.base_voice_id import BaseVoiceId
|
27
28
|
from ..core.client_wrapper import AsyncClientWrapper
|
28
29
|
from ..core.pagination import AsyncPager
|
29
30
|
|
@@ -140,34 +141,60 @@ class VoicesClient:
|
|
140
141
|
raise ApiError(status_code=_response.status_code, body=_response.text)
|
141
142
|
raise ApiError(status_code=_response.status_code, body=_response_json)
|
142
143
|
|
143
|
-
def create(
|
144
|
+
def clone(
|
144
145
|
self,
|
145
146
|
*,
|
147
|
+
clip: core.File,
|
146
148
|
name: str,
|
147
|
-
|
148
|
-
|
149
|
-
|
149
|
+
language: SupportedLanguage,
|
150
|
+
mode: CloneMode,
|
151
|
+
enhance: bool,
|
152
|
+
description: typing.Optional[str] = OMIT,
|
153
|
+
transcript: typing.Optional[str] = OMIT,
|
150
154
|
request_options: typing.Optional[RequestOptions] = None,
|
151
|
-
) ->
|
155
|
+
) -> VoiceMetadata:
|
152
156
|
"""
|
157
|
+
Clone a voice from an audio clip. This endpoint has two modes, stability and similarity.
|
158
|
+
|
159
|
+
Similarity mode clones are more similar to the source clip, but may reproduce background noise. For these, use an audio clip about 5 seconds long.
|
160
|
+
|
161
|
+
Stability mode clones are more stable, but may not sound as similar to the source clip. For these, use an audio clip 10-20 seconds long.
|
162
|
+
|
153
163
|
Parameters
|
154
164
|
----------
|
165
|
+
clip : core.File
|
166
|
+
See core.File for more documentation
|
167
|
+
|
155
168
|
name : str
|
156
169
|
The name of the voice.
|
157
170
|
|
158
|
-
description : str
|
159
|
-
The description of the voice.
|
160
171
|
|
161
|
-
|
172
|
+
language : SupportedLanguage
|
173
|
+
The language of the voice.
|
174
|
+
|
175
|
+
|
176
|
+
mode : CloneMode
|
177
|
+
Tradeoff between similarity and stability. Similarity clones sound more like the source clip, but may reproduce background noise. Stability clones always sound like a studio recording, but may not sound as similar to the source clip.
|
178
|
+
|
179
|
+
|
180
|
+
enhance : bool
|
181
|
+
Whether to enhance the clip to improve its quality before cloning. Useful if the clip has background noise.
|
182
|
+
|
183
|
+
|
184
|
+
description : typing.Optional[str]
|
185
|
+
A description for the voice.
|
186
|
+
|
187
|
+
|
188
|
+
transcript : typing.Optional[str]
|
189
|
+
Optional transcript of the words spoken in the audio clip. Only used for similarity mode.
|
162
190
|
|
163
|
-
language : typing.Optional[SupportedLanguage]
|
164
191
|
|
165
192
|
request_options : typing.Optional[RequestOptions]
|
166
193
|
Request-specific configuration.
|
167
194
|
|
168
195
|
Returns
|
169
196
|
-------
|
170
|
-
|
197
|
+
VoiceMetadata
|
171
198
|
|
172
199
|
Examples
|
173
200
|
--------
|
@@ -176,20 +203,27 @@ class VoicesClient:
|
|
176
203
|
client = Cartesia(
|
177
204
|
api_key="YOUR_API_KEY",
|
178
205
|
)
|
179
|
-
client.voices.
|
180
|
-
name="
|
181
|
-
description="
|
182
|
-
|
206
|
+
client.voices.clone(
|
207
|
+
name="A high-stability cloned voice",
|
208
|
+
description="Copied from Cartesia docs",
|
209
|
+
mode="stability",
|
210
|
+
language="en",
|
211
|
+
enhance=True,
|
183
212
|
)
|
184
213
|
"""
|
185
214
|
_response = self._client_wrapper.httpx_client.request(
|
186
|
-
"voices/",
|
215
|
+
"voices/clone",
|
187
216
|
method="POST",
|
188
|
-
|
217
|
+
data={
|
189
218
|
"name": name,
|
190
219
|
"description": description,
|
191
|
-
"embedding": embedding,
|
192
220
|
"language": language,
|
221
|
+
"mode": mode,
|
222
|
+
"enhance": enhance,
|
223
|
+
"transcript": transcript,
|
224
|
+
},
|
225
|
+
files={
|
226
|
+
"clip": clip,
|
193
227
|
},
|
194
228
|
request_options=request_options,
|
195
229
|
omit=OMIT,
|
@@ -197,9 +231,9 @@ class VoicesClient:
|
|
197
231
|
try:
|
198
232
|
if 200 <= _response.status_code < 300:
|
199
233
|
return typing.cast(
|
200
|
-
|
234
|
+
VoiceMetadata,
|
201
235
|
parse_obj_as(
|
202
|
-
type_=
|
236
|
+
type_=VoiceMetadata, # type: ignore
|
203
237
|
object_=_response.json(),
|
204
238
|
),
|
205
239
|
)
|
@@ -349,16 +383,27 @@ class VoicesClient:
|
|
349
383
|
def localize(
|
350
384
|
self,
|
351
385
|
*,
|
352
|
-
|
386
|
+
voice_id: str,
|
387
|
+
name: str,
|
388
|
+
description: str,
|
353
389
|
language: LocalizeTargetLanguage,
|
354
390
|
original_speaker_gender: Gender,
|
355
391
|
dialect: typing.Optional[LocalizeDialectParams] = OMIT,
|
356
392
|
request_options: typing.Optional[RequestOptions] = None,
|
357
|
-
) ->
|
393
|
+
) -> VoiceMetadata:
|
358
394
|
"""
|
395
|
+
Create a new voice from an existing voice localized to a new language and dialect.
|
396
|
+
|
359
397
|
Parameters
|
360
398
|
----------
|
361
|
-
|
399
|
+
voice_id : str
|
400
|
+
The ID of the voice to localize.
|
401
|
+
|
402
|
+
name : str
|
403
|
+
The name of the new localized voice.
|
404
|
+
|
405
|
+
description : str
|
406
|
+
The description of the new localized voice.
|
362
407
|
|
363
408
|
language : LocalizeTargetLanguage
|
364
409
|
|
@@ -371,7 +416,7 @@ class VoicesClient:
|
|
371
416
|
|
372
417
|
Returns
|
373
418
|
-------
|
374
|
-
|
419
|
+
VoiceMetadata
|
375
420
|
|
376
421
|
Examples
|
377
422
|
--------
|
@@ -381,16 +426,21 @@ class VoicesClient:
|
|
381
426
|
api_key="YOUR_API_KEY",
|
382
427
|
)
|
383
428
|
client.voices.localize(
|
384
|
-
|
385
|
-
|
386
|
-
|
429
|
+
voice_id="694f9389-aac1-45b6-b726-9d9369183238",
|
430
|
+
name="Sarah Peninsular Spanish",
|
431
|
+
description="Sarah Voice in Peninsular Spanish",
|
432
|
+
language="es",
|
433
|
+
original_speaker_gender="female",
|
434
|
+
dialect="pe",
|
387
435
|
)
|
388
436
|
"""
|
389
437
|
_response = self._client_wrapper.httpx_client.request(
|
390
438
|
"voices/localize",
|
391
439
|
method="POST",
|
392
440
|
json={
|
393
|
-
"
|
441
|
+
"voice_id": voice_id,
|
442
|
+
"name": name,
|
443
|
+
"description": description,
|
394
444
|
"language": language,
|
395
445
|
"original_speaker_gender": original_speaker_gender,
|
396
446
|
"dialect": convert_and_respect_annotation_metadata(
|
@@ -403,9 +453,9 @@ class VoicesClient:
|
|
403
453
|
try:
|
404
454
|
if 200 <= _response.status_code < 300:
|
405
455
|
return typing.cast(
|
406
|
-
|
456
|
+
VoiceMetadata,
|
407
457
|
parse_obj_as(
|
408
|
-
type_=
|
458
|
+
type_=VoiceMetadata, # type: ignore
|
409
459
|
object_=_response.json(),
|
410
460
|
),
|
411
461
|
)
|
@@ -468,58 +518,39 @@ class VoicesClient:
|
|
468
518
|
raise ApiError(status_code=_response.status_code, body=_response.text)
|
469
519
|
raise ApiError(status_code=_response.status_code, body=_response_json)
|
470
520
|
|
471
|
-
def clone(
|
521
|
+
def create(
|
472
522
|
self,
|
473
523
|
*,
|
474
|
-
clip: core.File,
|
475
524
|
name: str,
|
476
|
-
|
477
|
-
|
478
|
-
|
479
|
-
|
480
|
-
transcript: typing.Optional[str] = OMIT,
|
525
|
+
description: str,
|
526
|
+
embedding: Embedding,
|
527
|
+
language: typing.Optional[SupportedLanguage] = OMIT,
|
528
|
+
base_voice_id: typing.Optional[BaseVoiceId] = OMIT,
|
481
529
|
request_options: typing.Optional[RequestOptions] = None,
|
482
|
-
) ->
|
530
|
+
) -> Voice:
|
483
531
|
"""
|
484
|
-
|
485
|
-
Similarity mode clones are more similar to the source clip, but may reproduce background noise. For these, use an audio clip about 5 seconds long.
|
486
|
-
Stability mode clones are more stable, but may not sound as similar to the source clip. For these, use an audio clip 10-20 seconds long.
|
532
|
+
Create voice from raw features. If you'd like to clone a voice from an audio file, please use Clone Voice instead.
|
487
533
|
|
488
534
|
Parameters
|
489
535
|
----------
|
490
|
-
clip : core.File
|
491
|
-
See core.File for more documentation
|
492
|
-
|
493
536
|
name : str
|
494
537
|
The name of the voice.
|
495
538
|
|
539
|
+
description : str
|
540
|
+
The description of the voice.
|
496
541
|
|
497
|
-
|
498
|
-
The language of the voice.
|
499
|
-
|
500
|
-
|
501
|
-
mode : CloneMode
|
502
|
-
Tradeoff between similarity and stability. Similarity clones sound more like the source clip, but may reproduce background noise. Stability clones always sound like a studio recording, but may not sound as similar to the source clip.
|
503
|
-
|
504
|
-
|
505
|
-
enhance : bool
|
506
|
-
Whether to enhance the clip to improve its quality before cloning. Useful if the clip has background noise.
|
507
|
-
|
508
|
-
|
509
|
-
description : typing.Optional[str]
|
510
|
-
A description for the voice.
|
511
|
-
|
542
|
+
embedding : Embedding
|
512
543
|
|
513
|
-
|
514
|
-
Optional transcript of the words spoken in the audio clip. Only used for similarity mode.
|
544
|
+
language : typing.Optional[SupportedLanguage]
|
515
545
|
|
546
|
+
base_voice_id : typing.Optional[BaseVoiceId]
|
516
547
|
|
517
548
|
request_options : typing.Optional[RequestOptions]
|
518
549
|
Request-specific configuration.
|
519
550
|
|
520
551
|
Returns
|
521
552
|
-------
|
522
|
-
|
553
|
+
Voice
|
523
554
|
|
524
555
|
Examples
|
525
556
|
--------
|
@@ -528,27 +559,21 @@ class VoicesClient:
|
|
528
559
|
client = Cartesia(
|
529
560
|
api_key="YOUR_API_KEY",
|
530
561
|
)
|
531
|
-
client.voices.
|
532
|
-
name="
|
533
|
-
description="
|
534
|
-
|
535
|
-
language="en",
|
536
|
-
enhance=True,
|
562
|
+
client.voices.create(
|
563
|
+
name="name",
|
564
|
+
description="description",
|
565
|
+
embedding=[1.1, 1.1],
|
537
566
|
)
|
538
567
|
"""
|
539
568
|
_response = self._client_wrapper.httpx_client.request(
|
540
|
-
"voices/clone",
|
569
|
+
"voices/",
|
541
570
|
method="POST",
|
542
|
-
|
571
|
+
json={
|
543
572
|
"name": name,
|
544
573
|
"description": description,
|
574
|
+
"embedding": embedding,
|
545
575
|
"language": language,
|
546
|
-
"
|
547
|
-
"enhance": enhance,
|
548
|
-
"transcript": transcript,
|
549
|
-
},
|
550
|
-
files={
|
551
|
-
"clip": clip,
|
576
|
+
"base_voice_id": base_voice_id,
|
552
577
|
},
|
553
578
|
request_options=request_options,
|
554
579
|
omit=OMIT,
|
@@ -556,9 +581,9 @@ class VoicesClient:
|
|
556
581
|
try:
|
557
582
|
if 200 <= _response.status_code < 300:
|
558
583
|
return typing.cast(
|
559
|
-
|
584
|
+
Voice,
|
560
585
|
parse_obj_as(
|
561
|
-
type_=
|
586
|
+
type_=Voice, # type: ignore
|
562
587
|
object_=_response.json(),
|
563
588
|
),
|
564
589
|
)
|
@@ -685,34 +710,60 @@ class AsyncVoicesClient:
|
|
685
710
|
raise ApiError(status_code=_response.status_code, body=_response.text)
|
686
711
|
raise ApiError(status_code=_response.status_code, body=_response_json)
|
687
712
|
|
688
|
-
async def create(
|
713
|
+
async def clone(
|
689
714
|
self,
|
690
715
|
*,
|
716
|
+
clip: core.File,
|
691
717
|
name: str,
|
692
|
-
|
693
|
-
|
694
|
-
|
718
|
+
language: SupportedLanguage,
|
719
|
+
mode: CloneMode,
|
720
|
+
enhance: bool,
|
721
|
+
description: typing.Optional[str] = OMIT,
|
722
|
+
transcript: typing.Optional[str] = OMIT,
|
695
723
|
request_options: typing.Optional[RequestOptions] = None,
|
696
|
-
) ->
|
724
|
+
) -> VoiceMetadata:
|
697
725
|
"""
|
726
|
+
Clone a voice from an audio clip. This endpoint has two modes, stability and similarity.
|
727
|
+
|
728
|
+
Similarity mode clones are more similar to the source clip, but may reproduce background noise. For these, use an audio clip about 5 seconds long.
|
729
|
+
|
730
|
+
Stability mode clones are more stable, but may not sound as similar to the source clip. For these, use an audio clip 10-20 seconds long.
|
731
|
+
|
698
732
|
Parameters
|
699
733
|
----------
|
734
|
+
clip : core.File
|
735
|
+
See core.File for more documentation
|
736
|
+
|
700
737
|
name : str
|
701
738
|
The name of the voice.
|
702
739
|
|
703
|
-
description : str
|
704
|
-
The description of the voice.
|
705
740
|
|
706
|
-
|
741
|
+
language : SupportedLanguage
|
742
|
+
The language of the voice.
|
743
|
+
|
744
|
+
|
745
|
+
mode : CloneMode
|
746
|
+
Tradeoff between similarity and stability. Similarity clones sound more like the source clip, but may reproduce background noise. Stability clones always sound like a studio recording, but may not sound as similar to the source clip.
|
747
|
+
|
748
|
+
|
749
|
+
enhance : bool
|
750
|
+
Whether to enhance the clip to improve its quality before cloning. Useful if the clip has background noise.
|
751
|
+
|
752
|
+
|
753
|
+
description : typing.Optional[str]
|
754
|
+
A description for the voice.
|
755
|
+
|
756
|
+
|
757
|
+
transcript : typing.Optional[str]
|
758
|
+
Optional transcript of the words spoken in the audio clip. Only used for similarity mode.
|
707
759
|
|
708
|
-
language : typing.Optional[SupportedLanguage]
|
709
760
|
|
710
761
|
request_options : typing.Optional[RequestOptions]
|
711
762
|
Request-specific configuration.
|
712
763
|
|
713
764
|
Returns
|
714
765
|
-------
|
715
|
-
|
766
|
+
VoiceMetadata
|
716
767
|
|
717
768
|
Examples
|
718
769
|
--------
|
@@ -726,23 +777,30 @@ class AsyncVoicesClient:
|
|
726
777
|
|
727
778
|
|
728
779
|
async def main() -> None:
|
729
|
-
await client.voices.
|
730
|
-
name="
|
731
|
-
description="
|
732
|
-
|
780
|
+
await client.voices.clone(
|
781
|
+
name="A high-stability cloned voice",
|
782
|
+
description="Copied from Cartesia docs",
|
783
|
+
mode="stability",
|
784
|
+
language="en",
|
785
|
+
enhance=True,
|
733
786
|
)
|
734
787
|
|
735
788
|
|
736
789
|
asyncio.run(main())
|
737
790
|
"""
|
738
791
|
_response = await self._client_wrapper.httpx_client.request(
|
739
|
-
"voices/",
|
792
|
+
"voices/clone",
|
740
793
|
method="POST",
|
741
|
-
|
794
|
+
data={
|
742
795
|
"name": name,
|
743
796
|
"description": description,
|
744
|
-
"embedding": embedding,
|
745
797
|
"language": language,
|
798
|
+
"mode": mode,
|
799
|
+
"enhance": enhance,
|
800
|
+
"transcript": transcript,
|
801
|
+
},
|
802
|
+
files={
|
803
|
+
"clip": clip,
|
746
804
|
},
|
747
805
|
request_options=request_options,
|
748
806
|
omit=OMIT,
|
@@ -750,9 +808,9 @@ class AsyncVoicesClient:
|
|
750
808
|
try:
|
751
809
|
if 200 <= _response.status_code < 300:
|
752
810
|
return typing.cast(
|
753
|
-
|
811
|
+
VoiceMetadata,
|
754
812
|
parse_obj_as(
|
755
|
-
type_=
|
813
|
+
type_=VoiceMetadata, # type: ignore
|
756
814
|
object_=_response.json(),
|
757
815
|
),
|
758
816
|
)
|
@@ -926,16 +984,27 @@ class AsyncVoicesClient:
|
|
926
984
|
async def localize(
|
927
985
|
self,
|
928
986
|
*,
|
929
|
-
|
987
|
+
voice_id: str,
|
988
|
+
name: str,
|
989
|
+
description: str,
|
930
990
|
language: LocalizeTargetLanguage,
|
931
991
|
original_speaker_gender: Gender,
|
932
992
|
dialect: typing.Optional[LocalizeDialectParams] = OMIT,
|
933
993
|
request_options: typing.Optional[RequestOptions] = None,
|
934
|
-
) ->
|
994
|
+
) -> VoiceMetadata:
|
935
995
|
"""
|
996
|
+
Create a new voice from an existing voice localized to a new language and dialect.
|
997
|
+
|
936
998
|
Parameters
|
937
999
|
----------
|
938
|
-
|
1000
|
+
voice_id : str
|
1001
|
+
The ID of the voice to localize.
|
1002
|
+
|
1003
|
+
name : str
|
1004
|
+
The name of the new localized voice.
|
1005
|
+
|
1006
|
+
description : str
|
1007
|
+
The description of the new localized voice.
|
939
1008
|
|
940
1009
|
language : LocalizeTargetLanguage
|
941
1010
|
|
@@ -948,7 +1017,7 @@ class AsyncVoicesClient:
|
|
948
1017
|
|
949
1018
|
Returns
|
950
1019
|
-------
|
951
|
-
|
1020
|
+
VoiceMetadata
|
952
1021
|
|
953
1022
|
Examples
|
954
1023
|
--------
|
@@ -963,9 +1032,12 @@ class AsyncVoicesClient:
|
|
963
1032
|
|
964
1033
|
async def main() -> None:
|
965
1034
|
await client.voices.localize(
|
966
|
-
|
967
|
-
|
968
|
-
|
1035
|
+
voice_id="694f9389-aac1-45b6-b726-9d9369183238",
|
1036
|
+
name="Sarah Peninsular Spanish",
|
1037
|
+
description="Sarah Voice in Peninsular Spanish",
|
1038
|
+
language="es",
|
1039
|
+
original_speaker_gender="female",
|
1040
|
+
dialect="pe",
|
969
1041
|
)
|
970
1042
|
|
971
1043
|
|
@@ -975,7 +1047,9 @@ class AsyncVoicesClient:
|
|
975
1047
|
"voices/localize",
|
976
1048
|
method="POST",
|
977
1049
|
json={
|
978
|
-
"
|
1050
|
+
"voice_id": voice_id,
|
1051
|
+
"name": name,
|
1052
|
+
"description": description,
|
979
1053
|
"language": language,
|
980
1054
|
"original_speaker_gender": original_speaker_gender,
|
981
1055
|
"dialect": convert_and_respect_annotation_metadata(
|
@@ -988,9 +1062,9 @@ class AsyncVoicesClient:
|
|
988
1062
|
try:
|
989
1063
|
if 200 <= _response.status_code < 300:
|
990
1064
|
return typing.cast(
|
991
|
-
|
1065
|
+
VoiceMetadata,
|
992
1066
|
parse_obj_as(
|
993
|
-
type_=
|
1067
|
+
type_=VoiceMetadata, # type: ignore
|
994
1068
|
object_=_response.json(),
|
995
1069
|
),
|
996
1070
|
)
|
@@ -1061,58 +1135,39 @@ class AsyncVoicesClient:
|
|
1061
1135
|
raise ApiError(status_code=_response.status_code, body=_response.text)
|
1062
1136
|
raise ApiError(status_code=_response.status_code, body=_response_json)
|
1063
1137
|
|
1064
|
-
async def clone(
|
1138
|
+
async def create(
|
1065
1139
|
self,
|
1066
1140
|
*,
|
1067
|
-
clip: core.File,
|
1068
1141
|
name: str,
|
1069
|
-
|
1070
|
-
|
1071
|
-
|
1072
|
-
|
1073
|
-
transcript: typing.Optional[str] = OMIT,
|
1142
|
+
description: str,
|
1143
|
+
embedding: Embedding,
|
1144
|
+
language: typing.Optional[SupportedLanguage] = OMIT,
|
1145
|
+
base_voice_id: typing.Optional[BaseVoiceId] = OMIT,
|
1074
1146
|
request_options: typing.Optional[RequestOptions] = None,
|
1075
|
-
) ->
|
1147
|
+
) -> Voice:
|
1076
1148
|
"""
|
1077
|
-
|
1078
|
-
Similarity mode clones are more similar to the source clip, but may reproduce background noise. For these, use an audio clip about 5 seconds long.
|
1079
|
-
Stability mode clones are more stable, but may not sound as similar to the source clip. For these, use an audio clip 10-20 seconds long.
|
1149
|
+
Create voice from raw features. If you'd like to clone a voice from an audio file, please use Clone Voice instead.
|
1080
1150
|
|
1081
1151
|
Parameters
|
1082
1152
|
----------
|
1083
|
-
clip : core.File
|
1084
|
-
See core.File for more documentation
|
1085
|
-
|
1086
1153
|
name : str
|
1087
1154
|
The name of the voice.
|
1088
1155
|
|
1156
|
+
description : str
|
1157
|
+
The description of the voice.
|
1089
1158
|
|
1090
|
-
|
1091
|
-
The language of the voice.
|
1092
|
-
|
1093
|
-
|
1094
|
-
mode : CloneMode
|
1095
|
-
Tradeoff between similarity and stability. Similarity clones sound more like the source clip, but may reproduce background noise. Stability clones always sound like a studio recording, but may not sound as similar to the source clip.
|
1096
|
-
|
1097
|
-
|
1098
|
-
enhance : bool
|
1099
|
-
Whether to enhance the clip to improve its quality before cloning. Useful if the clip has background noise.
|
1100
|
-
|
1101
|
-
|
1102
|
-
description : typing.Optional[str]
|
1103
|
-
A description for the voice.
|
1104
|
-
|
1159
|
+
embedding : Embedding
|
1105
1160
|
|
1106
|
-
|
1107
|
-
Optional transcript of the words spoken in the audio clip. Only used for similarity mode.
|
1161
|
+
language : typing.Optional[SupportedLanguage]
|
1108
1162
|
|
1163
|
+
base_voice_id : typing.Optional[BaseVoiceId]
|
1109
1164
|
|
1110
1165
|
request_options : typing.Optional[RequestOptions]
|
1111
1166
|
Request-specific configuration.
|
1112
1167
|
|
1113
1168
|
Returns
|
1114
1169
|
-------
|
1115
|
-
|
1170
|
+
Voice
|
1116
1171
|
|
1117
1172
|
Examples
|
1118
1173
|
--------
|
@@ -1126,30 +1181,24 @@ class AsyncVoicesClient:
|
|
1126
1181
|
|
1127
1182
|
|
1128
1183
|
async def main() -> None:
|
1129
|
-
await client.voices.
|
1130
|
-
name="
|
1131
|
-
description="
|
1132
|
-
|
1133
|
-
language="en",
|
1134
|
-
enhance=True,
|
1184
|
+
await client.voices.create(
|
1185
|
+
name="name",
|
1186
|
+
description="description",
|
1187
|
+
embedding=[1.1, 1.1],
|
1135
1188
|
)
|
1136
1189
|
|
1137
1190
|
|
1138
1191
|
asyncio.run(main())
|
1139
1192
|
"""
|
1140
1193
|
_response = await self._client_wrapper.httpx_client.request(
|
1141
|
-
"voices/clone",
|
1194
|
+
"voices/",
|
1142
1195
|
method="POST",
|
1143
|
-
|
1196
|
+
json={
|
1144
1197
|
"name": name,
|
1145
1198
|
"description": description,
|
1199
|
+
"embedding": embedding,
|
1146
1200
|
"language": language,
|
1147
|
-
"
|
1148
|
-
"enhance": enhance,
|
1149
|
-
"transcript": transcript,
|
1150
|
-
},
|
1151
|
-
files={
|
1152
|
-
"clip": clip,
|
1201
|
+
"base_voice_id": base_voice_id,
|
1153
1202
|
},
|
1154
1203
|
request_options=request_options,
|
1155
1204
|
omit=OMIT,
|
@@ -1157,9 +1206,9 @@ class AsyncVoicesClient:
|
|
1157
1206
|
try:
|
1158
1207
|
if 200 <= _response.status_code < 300:
|
1159
1208
|
return typing.cast(
|
1160
|
-
|
1209
|
+
Voice,
|
1161
1210
|
parse_obj_as(
|
1162
|
-
type_=
|
1211
|
+
type_=Voice, # type: ignore
|
1163
1212
|
object_=_response.json(),
|
1164
1213
|
),
|
1165
1214
|
)
|
@@ -4,6 +4,7 @@ import typing_extensions
|
|
4
4
|
from ...embedding.types.embedding import Embedding
|
5
5
|
import typing_extensions
|
6
6
|
from ...tts.types.supported_language import SupportedLanguage
|
7
|
+
from ..types.base_voice_id import BaseVoiceId
|
7
8
|
|
8
9
|
|
9
10
|
class CreateVoiceRequestParams(typing_extensions.TypedDict):
|
@@ -19,3 +20,4 @@ class CreateVoiceRequestParams(typing_extensions.TypedDict):
|
|
19
20
|
|
20
21
|
embedding: Embedding
|
21
22
|
language: typing_extensions.NotRequired[SupportedLanguage]
|
23
|
+
base_voice_id: typing_extensions.NotRequired[BaseVoiceId]
|
@@ -2,5 +2,10 @@
|
|
2
2
|
|
3
3
|
import typing
|
4
4
|
from ..types.localize_english_dialect import LocalizeEnglishDialect
|
5
|
+
from ..types.localize_spanish_dialect import LocalizeSpanishDialect
|
6
|
+
from ..types.localize_portuguese_dialect import LocalizePortugueseDialect
|
7
|
+
from ..types.localize_french_dialect import LocalizeFrenchDialect
|
5
8
|
|
6
|
-
LocalizeDialectParams = typing.Union[
|
9
|
+
LocalizeDialectParams = typing.Union[
|
10
|
+
LocalizeEnglishDialect, LocalizeSpanishDialect, LocalizePortugueseDialect, LocalizeFrenchDialect
|
11
|
+
]
|
@@ -1,7 +1,6 @@
|
|
1
1
|
# This file was auto-generated by Fern from our API Definition.
|
2
2
|
|
3
3
|
import typing_extensions
|
4
|
-
from ...embedding.types.embedding import Embedding
|
5
4
|
from ..types.localize_target_language import LocalizeTargetLanguage
|
6
5
|
from ..types.gender import Gender
|
7
6
|
import typing_extensions
|
@@ -9,7 +8,21 @@ from .localize_dialect import LocalizeDialectParams
|
|
9
8
|
|
10
9
|
|
11
10
|
class LocalizeVoiceRequestParams(typing_extensions.TypedDict):
|
12
|
-
|
11
|
+
voice_id: str
|
12
|
+
"""
|
13
|
+
The ID of the voice to localize.
|
14
|
+
"""
|
15
|
+
|
16
|
+
name: str
|
17
|
+
"""
|
18
|
+
The name of the new localized voice.
|
19
|
+
"""
|
20
|
+
|
21
|
+
description: str
|
22
|
+
"""
|
23
|
+
The description of the new localized voice.
|
24
|
+
"""
|
25
|
+
|
13
26
|
language: LocalizeTargetLanguage
|
14
27
|
original_speaker_gender: Gender
|
15
28
|
dialect: typing_extensions.NotRequired[LocalizeDialectParams]
|
@@ -11,6 +11,9 @@ from .get_voices_response import GetVoicesResponse
|
|
11
11
|
from .id_specifier import IdSpecifier
|
12
12
|
from .localize_dialect import LocalizeDialect
|
13
13
|
from .localize_english_dialect import LocalizeEnglishDialect
|
14
|
+
from .localize_french_dialect import LocalizeFrenchDialect
|
15
|
+
from .localize_portuguese_dialect import LocalizePortugueseDialect
|
16
|
+
from .localize_spanish_dialect import LocalizeSpanishDialect
|
14
17
|
from .localize_target_language import LocalizeTargetLanguage
|
15
18
|
from .localize_voice_request import LocalizeVoiceRequest
|
16
19
|
from .mix_voice_specifier import MixVoiceSpecifier
|
@@ -34,6 +37,9 @@ __all__ = [
|
|
34
37
|
"IdSpecifier",
|
35
38
|
"LocalizeDialect",
|
36
39
|
"LocalizeEnglishDialect",
|
40
|
+
"LocalizeFrenchDialect",
|
41
|
+
"LocalizePortugueseDialect",
|
42
|
+
"LocalizeSpanishDialect",
|
37
43
|
"LocalizeTargetLanguage",
|
38
44
|
"LocalizeVoiceRequest",
|
39
45
|
"MixVoiceSpecifier",
|
@@ -5,6 +5,7 @@ import pydantic
|
|
5
5
|
from ...embedding.types.embedding import Embedding
|
6
6
|
import typing
|
7
7
|
from ...tts.types.supported_language import SupportedLanguage
|
8
|
+
from .base_voice_id import BaseVoiceId
|
8
9
|
from ...core.pydantic_utilities import IS_PYDANTIC_V2
|
9
10
|
|
10
11
|
|
@@ -21,6 +22,7 @@ class CreateVoiceRequest(UniversalBaseModel):
|
|
21
22
|
|
22
23
|
embedding: Embedding
|
23
24
|
language: typing.Optional[SupportedLanguage] = None
|
25
|
+
base_voice_id: typing.Optional[BaseVoiceId] = None
|
24
26
|
|
25
27
|
if IS_PYDANTIC_V2:
|
26
28
|
model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2
|
@@ -2,5 +2,10 @@
|
|
2
2
|
|
3
3
|
import typing
|
4
4
|
from .localize_english_dialect import LocalizeEnglishDialect
|
5
|
+
from .localize_spanish_dialect import LocalizeSpanishDialect
|
6
|
+
from .localize_portuguese_dialect import LocalizePortugueseDialect
|
7
|
+
from .localize_french_dialect import LocalizeFrenchDialect
|
5
8
|
|
6
|
-
LocalizeDialect = typing.Union[
|
9
|
+
LocalizeDialect = typing.Union[
|
10
|
+
LocalizeEnglishDialect, LocalizeSpanishDialect, LocalizePortugueseDialect, LocalizeFrenchDialect
|
11
|
+
]
|
@@ -1,17 +1,30 @@
|
|
1
1
|
# This file was auto-generated by Fern from our API Definition.
|
2
2
|
|
3
3
|
from ...core.pydantic_utilities import UniversalBaseModel
|
4
|
-
|
4
|
+
import pydantic
|
5
5
|
from .localize_target_language import LocalizeTargetLanguage
|
6
6
|
from .gender import Gender
|
7
7
|
import typing
|
8
8
|
from .localize_dialect import LocalizeDialect
|
9
9
|
from ...core.pydantic_utilities import IS_PYDANTIC_V2
|
10
|
-
import pydantic
|
11
10
|
|
12
11
|
|
13
12
|
class LocalizeVoiceRequest(UniversalBaseModel):
|
14
|
-
|
13
|
+
voice_id: str = pydantic.Field()
|
14
|
+
"""
|
15
|
+
The ID of the voice to localize.
|
16
|
+
"""
|
17
|
+
|
18
|
+
name: str = pydantic.Field()
|
19
|
+
"""
|
20
|
+
The name of the new localized voice.
|
21
|
+
"""
|
22
|
+
|
23
|
+
description: str = pydantic.Field()
|
24
|
+
"""
|
25
|
+
The description of the new localized voice.
|
26
|
+
"""
|
27
|
+
|
15
28
|
language: LocalizeTargetLanguage
|
16
29
|
original_speaker_gender: Gender
|
17
30
|
dialect: typing.Optional[LocalizeDialect] = None
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: cartesia
|
3
|
-
Version: 2.0.0b7
|
3
|
+
Version: 2.0.0b8
|
4
4
|
Summary:
|
5
5
|
Requires-Python: >=3.8,<4.0
|
6
6
|
Classifier: Intended Audience :: Developers
|
@@ -47,53 +47,6 @@ Our complete API documentation can be found [on docs.cartesia.ai](https://docs.c
|
|
47
47
|
pip install cartesia
|
48
48
|
```
|
49
49
|
|
50
|
-
## Reference
|
51
|
-
|
52
|
-
A full reference for this library is available [here](./reference.md).
|
53
|
-
|
54
|
-
## Voices
|
55
|
-
|
56
|
-
```python
|
57
|
-
from cartesia import Cartesia
|
58
|
-
import os
|
59
|
-
|
60
|
-
client = Cartesia(api_key=os.getenv("CARTESIA_API_KEY"))
|
61
|
-
|
62
|
-
# Get all available voices
|
63
|
-
voices = client.voices.list()
|
64
|
-
print(voices)
|
65
|
-
|
66
|
-
# Get a specific voice
|
67
|
-
voice = client.voices.get(id="a0e99841-438c-4a64-b679-ae501e7d6091")
|
68
|
-
print("The embedding for", voice.name, "is", voice.embedding)
|
69
|
-
|
70
|
-
# Clone a voice using file data
|
71
|
-
cloned_voice = client.voices.clone(
|
72
|
-
clip=open("path/to/voice.wav", "rb"),
|
73
|
-
name="Test cloned voice",
|
74
|
-
language="en",
|
75
|
-
mode="similarity", # or "stability"
|
76
|
-
enhance=False, # use enhance=True to clean and denoise the cloning audio
|
77
|
-
description="Test voice description"
|
78
|
-
)
|
79
|
-
|
80
|
-
# Mix voices together
|
81
|
-
mixed_voice = client.voices.mix(
|
82
|
-
voices=[
|
83
|
-
{"id": "voice_id_1", "weight": 0.25},
|
84
|
-
{"id": "voice_id_2", "weight": 0.75}
|
85
|
-
]
|
86
|
-
)
|
87
|
-
|
88
|
-
# Create a new voice from embedding
|
89
|
-
new_voice = client.voices.create(
|
90
|
-
name="Test Voice",
|
91
|
-
description="Test voice description",
|
92
|
-
embedding=[...], # List[float] with 192 dimensions
|
93
|
-
language="en"
|
94
|
-
)
|
95
|
-
```
|
96
|
-
|
97
50
|
## Usage
|
98
51
|
|
99
52
|
Instantiate and use the client with the following:
|
@@ -112,10 +65,6 @@ client.tts.bytes(
|
|
112
65
|
voice={
|
113
66
|
"mode": "id",
|
114
67
|
"id": "694f9389-aac1-45b6-b726-9d9369183238",
|
115
|
-
"experimental_controls": {
|
116
|
-
"speed": 0.5, # range between [-1.0, 1.0], or "slow", "fastest", etc.
|
117
|
-
"emotion": ["positivity", "curiosity:low"] # list of emotions with optional intensity
|
118
|
-
}
|
119
68
|
},
|
120
69
|
language="en",
|
121
70
|
output_format={
|
@@ -176,7 +125,7 @@ except ApiError as e:
|
|
176
125
|
|
177
126
|
## Streaming
|
178
127
|
|
179
|
-
The SDK supports streaming responses
|
128
|
+
The SDK supports streaming responses as well, returning a generator that you can iterate over with a `for ... in ...` loop:
|
180
129
|
|
181
130
|
```python
|
182
131
|
from cartesia import Cartesia
|
@@ -215,7 +164,9 @@ for chunk in chunks:
|
|
215
164
|
print(f"Received chunk of size: {len(chunk.data)}")
|
216
165
|
```
|
217
166
|
|
218
|
-
##
|
167
|
+
## WebSockets
|
168
|
+
|
169
|
+
For the lowest latency in advanced usecases (such as streaming in an LLM-generated transcript and streaming out audio), you should use our websockets client:
|
219
170
|
|
220
171
|
```python
|
221
172
|
from cartesia import Cartesia
|
@@ -223,15 +174,10 @@ from cartesia.tts import TtsRequestEmbeddingSpecifierParams, OutputFormat_RawPar
|
|
223
174
|
import pyaudio
|
224
175
|
import os
|
225
176
|
|
226
|
-
client = Cartesia(
|
227
|
-
api_key=os.getenv("CARTESIA_API_KEY"),
|
228
|
-
)
|
177
|
+
client = Cartesia(api_key=os.getenv("CARTESIA_API_KEY"))
|
229
178
|
voice_id = "a0e99841-438c-4a64-b679-ae501e7d6091"
|
230
179
|
transcript = "Hello! Welcome to Cartesia"
|
231
180
|
|
232
|
-
# You can check out our models at https://docs.cartesia.ai/getting-started/available-models
|
233
|
-
model_id = "sonic-2"
|
234
|
-
|
235
181
|
p = pyaudio.PyAudio()
|
236
182
|
rate = 22050
|
237
183
|
|
@@ -242,14 +188,14 @@ ws = client.tts.websocket()
|
|
242
188
|
|
243
189
|
# Generate and stream audio using the websocket
|
244
190
|
for output in ws.send(
|
245
|
-
model_id=model_id,
|
191
|
+
model_id="sonic-2", # see: https://docs.cartesia.ai/getting-started/available-models
|
246
192
|
transcript=transcript,
|
247
193
|
voice={"id": voice_id},
|
248
194
|
stream=True,
|
249
195
|
output_format={
|
250
196
|
"container": "raw",
|
251
197
|
"encoding": "pcm_f32le",
|
252
|
-
"sample_rate":
|
198
|
+
"sample_rate": rate
|
253
199
|
},
|
254
200
|
):
|
255
201
|
buffer = output.audio
|
@@ -267,6 +213,40 @@ p.terminate()
|
|
267
213
|
ws.close() # Close the websocket connection
|
268
214
|
```
|
269
215
|
|
216
|
+
## Voices
|
217
|
+
|
218
|
+
List all available Voices with `client.voices.list`, which returns an iterable that automatically handles pagination:
|
219
|
+
|
220
|
+
```python
|
221
|
+
from cartesia import Cartesia
|
222
|
+
import os
|
223
|
+
|
224
|
+
client = Cartesia(api_key=os.getenv("CARTESIA_API_KEY"))
|
225
|
+
|
226
|
+
# Get all available Voices
|
227
|
+
voices = client.voices.list()
|
228
|
+
for voice in voices:
|
229
|
+
print(voice)
|
230
|
+
```
|
231
|
+
|
232
|
+
You can also get the complete metadata for a specific Voice, or make a new Voice by cloning from an audio sample:
|
233
|
+
|
234
|
+
```python
|
235
|
+
# Get a specific Voice
|
236
|
+
voice = client.voices.get(id="a0e99841-438c-4a64-b679-ae501e7d6091")
|
237
|
+
print("The embedding for", voice.name, "is", voice.embedding)
|
238
|
+
|
239
|
+
# Clone a Voice using file data
|
240
|
+
cloned_voice = client.voices.clone(
|
241
|
+
clip=open("path/to/voice.wav", "rb"),
|
242
|
+
name="Test cloned voice",
|
243
|
+
language="en",
|
244
|
+
mode="similarity", # or "stability"
|
245
|
+
enhance=False, # use enhance=True to clean and denoise the cloning audio
|
246
|
+
description="Test voice description"
|
247
|
+
)
|
248
|
+
```
|
249
|
+
|
270
250
|
## Requesting Timestamps
|
271
251
|
|
272
252
|
```python
|
@@ -290,7 +270,8 @@ async def main():
|
|
290
270
|
"encoding": "pcm_f32le",
|
291
271
|
"sample_rate": 44100
|
292
272
|
},
|
293
|
-
add_timestamps=True,
|
273
|
+
add_timestamps=True, # Enable word-level timestamps
|
274
|
+
add_phoneme_timestamps=True, # Enable phonemized timestamps
|
294
275
|
stream=True
|
295
276
|
)
|
296
277
|
|
@@ -358,6 +339,26 @@ client.tts.bytes(..., request_options={
|
|
358
339
|
})
|
359
340
|
```
|
360
341
|
|
342
|
+
### Mixing voices and creating from embeddings
|
343
|
+
|
344
|
+
```python
|
345
|
+
# Mix voices together
|
346
|
+
mixed_voice = client.voices.mix(
|
347
|
+
voices=[
|
348
|
+
{"id": "voice_id_1", "weight": 0.25},
|
349
|
+
{"id": "voice_id_2", "weight": 0.75}
|
350
|
+
]
|
351
|
+
)
|
352
|
+
|
353
|
+
# Create a new voice from embedding
|
354
|
+
new_voice = client.voices.create(
|
355
|
+
name="Test Voice",
|
356
|
+
description="Test voice description",
|
357
|
+
embedding=[...], # List[float] with 192 dimensions
|
358
|
+
language="en"
|
359
|
+
)
|
360
|
+
```
|
361
|
+
|
361
362
|
### Custom Client
|
362
363
|
|
363
364
|
You can override the `httpx` client to customize it for your use-case. Some common use-cases include support for proxies
|
@@ -375,6 +376,10 @@ client = Cartesia(
|
|
375
376
|
)
|
376
377
|
```
|
377
378
|
|
379
|
+
## Reference
|
380
|
+
|
381
|
+
A full reference for this library is available [here](./reference.md).
|
382
|
+
|
378
383
|
## Contributing
|
379
384
|
|
380
385
|
Note that most of this library is generated programmatically from
|
@@ -1,4 +1,4 @@
|
|
1
|
-
cartesia/__init__.py,sha256=
|
1
|
+
cartesia/__init__.py,sha256=k-YMKYUtzKObkF9Zn0TuHTC2_Z07mH6CTnZmn1my7po,8143
|
2
2
|
cartesia/api_status/__init__.py,sha256=_dHNLdknrBjxHtU2PvLumttJM-JTQhJQqhhAQkLqt_U,168
|
3
3
|
cartesia/api_status/client.py,sha256=GJ9Dq8iCn3hn8vCIqc6k1fCGEhSz0T0kaPGcdFnbMDY,3146
|
4
4
|
cartesia/api_status/requests/__init__.py,sha256=ilEMzEy1JEw484CuL92bX5lHGOznc62pjiDMgiZ0tKM,130
|
@@ -9,7 +9,7 @@ cartesia/base_client.py,sha256=EIfMrSkJgMCgzYWJ5GN2RxsWikxcH0kMmcb3WYqfQ_g,6321
|
|
9
9
|
cartesia/client.py,sha256=sPAYQLt9W2E_2F17ooocvvJImuNyLrL8xUypgf6dZeI,6238
|
10
10
|
cartesia/core/__init__.py,sha256=-t9txgeQZL_1FDw_08GEoj4ft1Cn9Dti6X0Drsadlr0,1519
|
11
11
|
cartesia/core/api_error.py,sha256=RE8LELok2QCjABadECTvtDp7qejA1VmINCh6TbqPwSE,426
|
12
|
-
cartesia/core/client_wrapper.py,sha256=
|
12
|
+
cartesia/core/client_wrapper.py,sha256=BEIOireABuSTdCAcsHeQKtZ1D3sIi-CVQv5YFHmfi3Y,1856
|
13
13
|
cartesia/core/datetime_utils.py,sha256=nBys2IsYrhPdszxGKCNRPSOCwa-5DWOHG95FB8G9PKo,1047
|
14
14
|
cartesia/core/file.py,sha256=d4NNbX8XvXP32z8KpK2Xovv33nFfruIrpz0QWxlgpZk,2663
|
15
15
|
cartesia/core/http_client.py,sha256=KL5RGa0y4n8nX0-07WRg4ZQUTq30sc-XJbWcP5vjBDg,19552
|
@@ -42,8 +42,8 @@ cartesia/infill/__init__.py,sha256=FTtvy8EDg9nNNg9WCatVgKTRYV8-_v1roeGPAKoa_pw,6
|
|
42
42
|
cartesia/infill/client.py,sha256=PWE5Ak-wsaBM_8g52oDl9PYx76PkW6f900mnxvZf4Bk,12571
|
43
43
|
cartesia/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
44
44
|
cartesia/tts/__init__.py,sha256=G0wcYlPrr7hmu5DQgCG7bDTQq36fpP3iBM5164Z0-Js,4701
|
45
|
-
cartesia/tts/_async_websocket.py,sha256=
|
46
|
-
cartesia/tts/_websocket.py,sha256=
|
45
|
+
cartesia/tts/_async_websocket.py,sha256=U7ySTJqb3V0RDSKPcFfzpBa0pqui05k5BTqiIpSBth0,18652
|
46
|
+
cartesia/tts/_websocket.py,sha256=roMJ7oDSjr5U5sTHM8EcGu-EtzbIVUH4HmOY1yI2JL4,19118
|
47
47
|
cartesia/tts/client.py,sha256=KMhDaW0gG_uwkSq1EzoC-bCx1G0TLB4K4Gm57L4xDSs,14832
|
48
48
|
cartesia/tts/requests/__init__.py,sha256=0rcfMLHNbUhkRI1xS09UE4p-WT1BCqrcblFtPxcATOI,3261
|
49
49
|
cartesia/tts/requests/cancel_context_request.py,sha256=Wl8g-o5vwl9ENm-H1wsLx441FkIR_4Wt5UYtuWce2Yw,431
|
@@ -120,35 +120,38 @@ cartesia/voice_changer/requests/streaming_response.py,sha256=cV6L9mMY0w2JpJ0xKoF
|
|
120
120
|
cartesia/voice_changer/types/__init__.py,sha256=qAiHsdRpnFeS0lBkYp_NRrhSJiRXCg5-uFibqDWzYVU,430
|
121
121
|
cartesia/voice_changer/types/output_format_container.py,sha256=RqLDELdgeOjYqNTJX1Le62qjiFiJGxf0cYnol88-LLM,166
|
122
122
|
cartesia/voice_changer/types/streaming_response.py,sha256=rQ4ZehtOHsCBKijyULz_ahGQYNj1yus6AM6u2wgcBsI,1963
|
123
|
-
cartesia/voices/__init__.py,sha256=
|
124
|
-
cartesia/voices/client.py,sha256=
|
123
|
+
cartesia/voices/__init__.py,sha256=2D58Bir45LvcvP08QMnPlFE8DD8BONTjPLkIDdKs7vg,1891
|
124
|
+
cartesia/voices/client.py,sha256=8zQZAtaCAJi79puMxVhzR5OWCDjows53k4oTvSgcdJM,38867
|
125
125
|
cartesia/voices/requests/__init__.py,sha256=XiBJbSYeQCgFMtwywKvQ0Nmp7Zf_0WskzRhgr9c8h38,1072
|
126
|
-
cartesia/voices/requests/create_voice_request.py,sha256=
|
126
|
+
cartesia/voices/requests/create_voice_request.py,sha256=r6dKb9ga0ZsAi_6PXuE43u2lLgfQg2DIYjk2Neng7pI,617
|
127
127
|
cartesia/voices/requests/embedding_response.py,sha256=PGZkBD8UBcv2MYQbBXyD4T6lzaE9oSGGwXx-MoXCp0M,228
|
128
128
|
cartesia/voices/requests/embedding_specifier.py,sha256=PAHdGsVmLLeJC2b1fWHWI_OlhogO1WnJdzoX9pj5N8c,282
|
129
129
|
cartesia/voices/requests/get_voices_response.py,sha256=g-ZCaCaLOlZSitcKVhdCtfdKQQz8N3W6E7_wZUNOi5M,747
|
130
130
|
cartesia/voices/requests/id_specifier.py,sha256=UTtoXBEEYaGvg-Dn2QxUDACNB3Vm1O1XbrPtBA3rGzU,252
|
131
|
-
cartesia/voices/requests/localize_dialect.py,sha256=
|
132
|
-
cartesia/voices/requests/localize_voice_request.py,sha256=
|
131
|
+
cartesia/voices/requests/localize_dialect.py,sha256=OHAInU6IP0LBzIY3VYSiU9bRLjXfr1pGXunsLgv1QHs,497
|
132
|
+
cartesia/voices/requests/localize_voice_request.py,sha256=oh828eqYkiticD_lerc8WemN3bW13mLZpfRDiKbG75g,703
|
133
133
|
cartesia/voices/requests/mix_voice_specifier.py,sha256=YjOJ2Qt3nqMQzHsYbF1DnZgmZS9zZepLXpji6V9mfgs,266
|
134
134
|
cartesia/voices/requests/mix_voices_request.py,sha256=6JCzFmWKIS1_t-uSoO1m-FQbLWB1zaykTcGV-1s-RqM,275
|
135
135
|
cartesia/voices/requests/update_voice_request.py,sha256=XxJ6TKO4M2s1kXQAZRj8uA4okIABvmWiFhAHJv4BS0Q,282
|
136
136
|
cartesia/voices/requests/voice.py,sha256=M-4lf4W57fx84_JFOy55b9mWcqO4LfzpY-G_Ekv-2Bo,1031
|
137
137
|
cartesia/voices/requests/voice_metadata.py,sha256=S0jPQtBpEb2WSnYDLQTS7pcbNJpc0d01uWravHaqzso,697
|
138
|
-
cartesia/voices/types/__init__.py,sha256=
|
138
|
+
cartesia/voices/types/__init__.py,sha256=yjxMWjoBpwAZ5UJ2iRSC_kKgZvGmqVd09kQxgcTnMac,1782
|
139
139
|
cartesia/voices/types/base_voice_id.py,sha256=nWRC0rvLpjeMpRbLSmUTPziWo1ZrbPxw22l4gEBWp8Q,118
|
140
140
|
cartesia/voices/types/clone_mode.py,sha256=3sR6wdxym4xDVsoHppp3-V9mpDwP9F9fDfMUQKG24xw,160
|
141
|
-
cartesia/voices/types/create_voice_request.py,sha256=
|
141
|
+
cartesia/voices/types/create_voice_request.py,sha256=_q0d8QojmQrpU-Puzd_YvWmiC7cBp_lrbKmTLuknYqQ,1005
|
142
142
|
cartesia/voices/types/embedding_response.py,sha256=B7MJ79HIAnxtiP6OT0tt27KBDYTZ3VU0MLuQfb5qVOg,624
|
143
143
|
cartesia/voices/types/embedding_specifier.py,sha256=cf6JfVnISyrvjWup3oAg-RFdMVRxytem6HLwZgKl3gA,671
|
144
144
|
cartesia/voices/types/gender.py,sha256=OrbTO__3HVNculvkcb5Pz-Yoa-Xv8N_rNMrFoy2DoaA,148
|
145
145
|
cartesia/voices/types/gender_presentation.py,sha256=rM8pSurYCSH0AGgLsVpVAPp7uz7TQMM1nPa7-Vus7gw,185
|
146
146
|
cartesia/voices/types/get_voices_response.py,sha256=c6KMkmJepTUmT7I6tAVOGrPst2kkXxDCXLIf1AnR9NE,1136
|
147
147
|
cartesia/voices/types/id_specifier.py,sha256=yAY-uc9hRJkHXdsSfRZWkE8ga2Sb-KVipOTSXa8Wmp0,634
|
148
|
-
cartesia/voices/types/localize_dialect.py,sha256=
|
148
|
+
cartesia/voices/types/localize_dialect.py,sha256=6JpJKeQvtDjCT2n-5yaGOe3D-4nYqUoYrvcCSE2Zxik,463
|
149
149
|
cartesia/voices/types/localize_english_dialect.py,sha256=0PjZNjQv5ll2wWZxGveQIYCUGLtGDVELK9FBWFe7SNc,176
|
150
|
+
cartesia/voices/types/localize_french_dialect.py,sha256=aMhqLi_5goAaSGZguZIFOwQ9Yqh5ApL6gS3cDI315lQ,157
|
151
|
+
cartesia/voices/types/localize_portuguese_dialect.py,sha256=6dcThK1qWyS3c-W--3Zz7HK5ixS0qslEWrVQmKSrl9E,161
|
152
|
+
cartesia/voices/types/localize_spanish_dialect.py,sha256=h-H52vk0MBOvJqlzPVPgajfQU6oxpTzHoQAKmSDyaC4,158
|
150
153
|
cartesia/voices/types/localize_target_language.py,sha256=ttngtFVpMvuWAKQztJu_pCaf7V62DzmNq9zthPCb2LI,242
|
151
|
-
cartesia/voices/types/localize_voice_request.py,sha256=
|
154
|
+
cartesia/voices/types/localize_voice_request.py,sha256=gvjg292kMgji0L9TNO3VqDS0pHO1vGJUcf0l_vEW_5Y,1098
|
152
155
|
cartesia/voices/types/mix_voice_specifier.py,sha256=B0FE6UREGk1TxlN0GOPwyCuqJbMkWVUs0EFqiJuQfZ8,236
|
153
156
|
cartesia/voices/types/mix_voices_request.py,sha256=R_8bmUmE1br4wmfH1Qu6EnL9uC-V1z5BV3_B7u51EOw,641
|
154
157
|
cartesia/voices/types/update_voice_request.py,sha256=_CEH8nuSZn2qZa9xZlANZXOhJd49XLel3dRy2dfOvr8,716
|
@@ -157,6 +160,6 @@ cartesia/voices/types/voice_expand_options.py,sha256=e4FroWdlxEE-LXQfT1RWlGHtswl
|
|
157
160
|
cartesia/voices/types/voice_id.py,sha256=GDoXcRVeIm-V21R4suxG2zqLD3DLYkXE9kgizadzFKo,79
|
158
161
|
cartesia/voices/types/voice_metadata.py,sha256=4KNGjXMUKm3niv-NvKIFVGtiilpH13heuzKcZYNQxk4,1181
|
159
162
|
cartesia/voices/types/weight.py,sha256=XqDU7_JItNUb5QykIDqTbELlRYQdbt2SviRgW0w2LKo,80
|
160
|
-
cartesia-2.0.
|
161
|
-
cartesia-2.0.
|
162
|
-
cartesia-2.0.
|
163
|
+
cartesia-2.0.0b8.dist-info/METADATA,sha256=ynQsxGb1v5ZHMnXkeqYceRFrC-bxwuRaopOPyuBbCsk,11208
|
164
|
+
cartesia-2.0.0b8.dist-info/WHEEL,sha256=Zb28QaM1gQi8f4VCBhsUklF61CTlNYfs9YAZn-TOGFk,88
|
165
|
+
cartesia-2.0.0b8.dist-info/RECORD,,
|
File without changes
|