cartesia 2.0.0b7__py3-none-any.whl → 2.0.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cartesia/__init__.py +15 -1
- cartesia/auth/__init__.py +13 -0
- cartesia/auth/client.py +159 -0
- cartesia/auth/requests/__init__.py +7 -0
- cartesia/auth/requests/token_grant.py +10 -0
- cartesia/auth/requests/token_request.py +17 -0
- cartesia/auth/requests/token_response.py +10 -0
- cartesia/auth/types/__init__.py +7 -0
- cartesia/auth/types/token_grant.py +22 -0
- cartesia/auth/types/token_request.py +28 -0
- cartesia/auth/types/token_response.py +22 -0
- cartesia/base_client.py +4 -0
- cartesia/core/client_wrapper.py +1 -1
- cartesia/tts/_async_websocket.py +8 -0
- cartesia/tts/_websocket.py +11 -0
- cartesia/tts/client.py +40 -4
- cartesia/tts/requests/generation_request.py +19 -1
- cartesia/tts/requests/tts_request.py +10 -1
- cartesia/tts/requests/web_socket_tts_request.py +3 -1
- cartesia/tts/types/generation_request.py +19 -1
- cartesia/tts/types/tts_request.py +10 -1
- cartesia/tts/types/web_socket_tts_request.py +3 -1
- cartesia/voices/__init__.py +6 -0
- cartesia/voices/client.py +208 -159
- cartesia/voices/requests/create_voice_request.py +2 -0
- cartesia/voices/requests/localize_dialect.py +6 -1
- cartesia/voices/requests/localize_voice_request.py +15 -2
- cartesia/voices/types/__init__.py +6 -0
- cartesia/voices/types/create_voice_request.py +2 -0
- cartesia/voices/types/localize_dialect.py +6 -1
- cartesia/voices/types/localize_french_dialect.py +5 -0
- cartesia/voices/types/localize_portuguese_dialect.py +5 -0
- cartesia/voices/types/localize_spanish_dialect.py +5 -0
- cartesia/voices/types/localize_voice_request.py +16 -3
- {cartesia-2.0.0b7.dist-info → cartesia-2.0.2.dist-info}/METADATA +68 -63
- {cartesia-2.0.0b7.dist-info → cartesia-2.0.2.dist-info}/RECORD +37 -24
- {cartesia-2.0.0b7.dist-info → cartesia-2.0.2.dist-info}/WHEEL +0 -0
cartesia/voices/client.py
CHANGED
@@ -11,19 +11,20 @@ from .types.get_voices_response import GetVoicesResponse
|
|
11
11
|
from ..core.pydantic_utilities import parse_obj_as
|
12
12
|
from json.decoder import JSONDecodeError
|
13
13
|
from ..core.api_error import ApiError
|
14
|
-
from ..
|
14
|
+
from .. import core
|
15
15
|
from ..tts.types.supported_language import SupportedLanguage
|
16
|
+
from .types.clone_mode import CloneMode
|
17
|
+
from .types.voice_metadata import VoiceMetadata
|
16
18
|
from .types.voice_id import VoiceId
|
17
19
|
from ..core.jsonable_encoder import jsonable_encoder
|
18
20
|
from .types.localize_target_language import LocalizeTargetLanguage
|
19
21
|
from .types.gender import Gender
|
20
22
|
from .requests.localize_dialect import LocalizeDialectParams
|
21
|
-
from .types.embedding_response import EmbeddingResponse
|
22
23
|
from ..core.serialization import convert_and_respect_annotation_metadata
|
23
24
|
from .requests.mix_voice_specifier import MixVoiceSpecifierParams
|
24
|
-
from
|
25
|
-
from .types.
|
26
|
-
from .types.
|
25
|
+
from .types.embedding_response import EmbeddingResponse
|
26
|
+
from ..embedding.types.embedding import Embedding
|
27
|
+
from .types.base_voice_id import BaseVoiceId
|
27
28
|
from ..core.client_wrapper import AsyncClientWrapper
|
28
29
|
from ..core.pagination import AsyncPager
|
29
30
|
|
@@ -140,34 +141,60 @@ class VoicesClient:
|
|
140
141
|
raise ApiError(status_code=_response.status_code, body=_response.text)
|
141
142
|
raise ApiError(status_code=_response.status_code, body=_response_json)
|
142
143
|
|
143
|
-
def
|
144
|
+
def clone(
|
144
145
|
self,
|
145
146
|
*,
|
147
|
+
clip: core.File,
|
146
148
|
name: str,
|
147
|
-
|
148
|
-
|
149
|
-
|
149
|
+
language: SupportedLanguage,
|
150
|
+
mode: CloneMode,
|
151
|
+
description: typing.Optional[str] = OMIT,
|
152
|
+
enhance: typing.Optional[bool] = OMIT,
|
153
|
+
transcript: typing.Optional[str] = OMIT,
|
150
154
|
request_options: typing.Optional[RequestOptions] = None,
|
151
|
-
) ->
|
155
|
+
) -> VoiceMetadata:
|
152
156
|
"""
|
157
|
+
Clone a voice from an audio clip. This endpoint has two modes, stability and similarity.
|
158
|
+
|
159
|
+
Similarity mode clones are more similar to the source clip, but may reproduce background noise. For these, use an audio clip about 5 seconds long.
|
160
|
+
|
161
|
+
Stability mode clones are more stable, but may not sound as similar to the source clip. For these, use an audio clip 10-20 seconds long.
|
162
|
+
|
153
163
|
Parameters
|
154
164
|
----------
|
165
|
+
clip : core.File
|
166
|
+
See core.File for more documentation
|
167
|
+
|
155
168
|
name : str
|
156
169
|
The name of the voice.
|
157
170
|
|
158
|
-
description : str
|
159
|
-
The description of the voice.
|
160
171
|
|
161
|
-
|
172
|
+
language : SupportedLanguage
|
173
|
+
The language of the voice.
|
174
|
+
|
175
|
+
|
176
|
+
mode : CloneMode
|
177
|
+
Tradeoff between similarity and stability. Similarity clones sound more like the source clip, but may reproduce background noise. Stability clones always sound like a studio recording, but may not sound as similar to the source clip.
|
178
|
+
|
179
|
+
|
180
|
+
description : typing.Optional[str]
|
181
|
+
A description for the voice.
|
182
|
+
|
183
|
+
|
184
|
+
enhance : typing.Optional[bool]
|
185
|
+
Whether to enhance the clip to improve its quality before cloning. Useful if the clip has background noise.
|
186
|
+
|
187
|
+
|
188
|
+
transcript : typing.Optional[str]
|
189
|
+
Optional transcript of the words spoken in the audio clip. Only used for similarity mode.
|
162
190
|
|
163
|
-
language : typing.Optional[SupportedLanguage]
|
164
191
|
|
165
192
|
request_options : typing.Optional[RequestOptions]
|
166
193
|
Request-specific configuration.
|
167
194
|
|
168
195
|
Returns
|
169
196
|
-------
|
170
|
-
|
197
|
+
VoiceMetadata
|
171
198
|
|
172
199
|
Examples
|
173
200
|
--------
|
@@ -176,20 +203,27 @@ class VoicesClient:
|
|
176
203
|
client = Cartesia(
|
177
204
|
api_key="YOUR_API_KEY",
|
178
205
|
)
|
179
|
-
client.voices.
|
180
|
-
name="
|
181
|
-
description="
|
182
|
-
|
206
|
+
client.voices.clone(
|
207
|
+
name="A high-stability cloned voice",
|
208
|
+
description="Copied from Cartesia docs",
|
209
|
+
mode="stability",
|
210
|
+
language="en",
|
211
|
+
enhance=True,
|
183
212
|
)
|
184
213
|
"""
|
185
214
|
_response = self._client_wrapper.httpx_client.request(
|
186
|
-
"voices/",
|
215
|
+
"voices/clone",
|
187
216
|
method="POST",
|
188
|
-
|
217
|
+
data={
|
189
218
|
"name": name,
|
190
219
|
"description": description,
|
191
|
-
"embedding": embedding,
|
192
220
|
"language": language,
|
221
|
+
"mode": mode,
|
222
|
+
"enhance": enhance,
|
223
|
+
"transcript": transcript,
|
224
|
+
},
|
225
|
+
files={
|
226
|
+
"clip": clip,
|
193
227
|
},
|
194
228
|
request_options=request_options,
|
195
229
|
omit=OMIT,
|
@@ -197,9 +231,9 @@ class VoicesClient:
|
|
197
231
|
try:
|
198
232
|
if 200 <= _response.status_code < 300:
|
199
233
|
return typing.cast(
|
200
|
-
|
234
|
+
VoiceMetadata,
|
201
235
|
parse_obj_as(
|
202
|
-
type_=
|
236
|
+
type_=VoiceMetadata, # type: ignore
|
203
237
|
object_=_response.json(),
|
204
238
|
),
|
205
239
|
)
|
@@ -349,16 +383,27 @@ class VoicesClient:
|
|
349
383
|
def localize(
|
350
384
|
self,
|
351
385
|
*,
|
352
|
-
|
386
|
+
voice_id: str,
|
387
|
+
name: str,
|
388
|
+
description: str,
|
353
389
|
language: LocalizeTargetLanguage,
|
354
390
|
original_speaker_gender: Gender,
|
355
391
|
dialect: typing.Optional[LocalizeDialectParams] = OMIT,
|
356
392
|
request_options: typing.Optional[RequestOptions] = None,
|
357
|
-
) ->
|
393
|
+
) -> VoiceMetadata:
|
358
394
|
"""
|
395
|
+
Create a new voice from an existing voice localized to a new language and dialect.
|
396
|
+
|
359
397
|
Parameters
|
360
398
|
----------
|
361
|
-
|
399
|
+
voice_id : str
|
400
|
+
The ID of the voice to localize.
|
401
|
+
|
402
|
+
name : str
|
403
|
+
The name of the new localized voice.
|
404
|
+
|
405
|
+
description : str
|
406
|
+
The description of the new localized voice.
|
362
407
|
|
363
408
|
language : LocalizeTargetLanguage
|
364
409
|
|
@@ -371,7 +416,7 @@ class VoicesClient:
|
|
371
416
|
|
372
417
|
Returns
|
373
418
|
-------
|
374
|
-
|
419
|
+
VoiceMetadata
|
375
420
|
|
376
421
|
Examples
|
377
422
|
--------
|
@@ -381,16 +426,21 @@ class VoicesClient:
|
|
381
426
|
api_key="YOUR_API_KEY",
|
382
427
|
)
|
383
428
|
client.voices.localize(
|
384
|
-
|
385
|
-
|
386
|
-
|
429
|
+
voice_id="694f9389-aac1-45b6-b726-9d9369183238",
|
430
|
+
name="Sarah Peninsular Spanish",
|
431
|
+
description="Sarah Voice in Peninsular Spanish",
|
432
|
+
language="es",
|
433
|
+
original_speaker_gender="female",
|
434
|
+
dialect="pe",
|
387
435
|
)
|
388
436
|
"""
|
389
437
|
_response = self._client_wrapper.httpx_client.request(
|
390
438
|
"voices/localize",
|
391
439
|
method="POST",
|
392
440
|
json={
|
393
|
-
"
|
441
|
+
"voice_id": voice_id,
|
442
|
+
"name": name,
|
443
|
+
"description": description,
|
394
444
|
"language": language,
|
395
445
|
"original_speaker_gender": original_speaker_gender,
|
396
446
|
"dialect": convert_and_respect_annotation_metadata(
|
@@ -403,9 +453,9 @@ class VoicesClient:
|
|
403
453
|
try:
|
404
454
|
if 200 <= _response.status_code < 300:
|
405
455
|
return typing.cast(
|
406
|
-
|
456
|
+
VoiceMetadata,
|
407
457
|
parse_obj_as(
|
408
|
-
type_=
|
458
|
+
type_=VoiceMetadata, # type: ignore
|
409
459
|
object_=_response.json(),
|
410
460
|
),
|
411
461
|
)
|
@@ -468,58 +518,39 @@ class VoicesClient:
|
|
468
518
|
raise ApiError(status_code=_response.status_code, body=_response.text)
|
469
519
|
raise ApiError(status_code=_response.status_code, body=_response_json)
|
470
520
|
|
471
|
-
def
|
521
|
+
def create(
|
472
522
|
self,
|
473
523
|
*,
|
474
|
-
clip: core.File,
|
475
524
|
name: str,
|
476
|
-
|
477
|
-
|
478
|
-
|
479
|
-
|
480
|
-
transcript: typing.Optional[str] = OMIT,
|
525
|
+
description: str,
|
526
|
+
embedding: Embedding,
|
527
|
+
language: typing.Optional[SupportedLanguage] = OMIT,
|
528
|
+
base_voice_id: typing.Optional[BaseVoiceId] = OMIT,
|
481
529
|
request_options: typing.Optional[RequestOptions] = None,
|
482
|
-
) ->
|
530
|
+
) -> Voice:
|
483
531
|
"""
|
484
|
-
|
485
|
-
Similarity mode clones are more similar to the source clip, but may reproduce background noise. For these, use an audio clip about 5 seconds long.
|
486
|
-
Stability mode clones are more stable, but may not sound as similar to the source clip. For these, use an audio clip 10-20 seconds long.
|
532
|
+
Create voice from raw features. If you'd like to clone a voice from an audio file, please use Clone Voice instead.
|
487
533
|
|
488
534
|
Parameters
|
489
535
|
----------
|
490
|
-
clip : core.File
|
491
|
-
See core.File for more documentation
|
492
|
-
|
493
536
|
name : str
|
494
537
|
The name of the voice.
|
495
538
|
|
539
|
+
description : str
|
540
|
+
The description of the voice.
|
496
541
|
|
497
|
-
|
498
|
-
The language of the voice.
|
499
|
-
|
500
|
-
|
501
|
-
mode : CloneMode
|
502
|
-
Tradeoff between similarity and stability. Similarity clones sound more like the source clip, but may reproduce background noise. Stability clones always sound like a studio recording, but may not sound as similar to the source clip.
|
503
|
-
|
504
|
-
|
505
|
-
enhance : bool
|
506
|
-
Whether to enhance the clip to improve its quality before cloning. Useful if the clip has background noise.
|
507
|
-
|
508
|
-
|
509
|
-
description : typing.Optional[str]
|
510
|
-
A description for the voice.
|
511
|
-
|
542
|
+
embedding : Embedding
|
512
543
|
|
513
|
-
|
514
|
-
Optional transcript of the words spoken in the audio clip. Only used for similarity mode.
|
544
|
+
language : typing.Optional[SupportedLanguage]
|
515
545
|
|
546
|
+
base_voice_id : typing.Optional[BaseVoiceId]
|
516
547
|
|
517
548
|
request_options : typing.Optional[RequestOptions]
|
518
549
|
Request-specific configuration.
|
519
550
|
|
520
551
|
Returns
|
521
552
|
-------
|
522
|
-
|
553
|
+
Voice
|
523
554
|
|
524
555
|
Examples
|
525
556
|
--------
|
@@ -528,27 +559,21 @@ class VoicesClient:
|
|
528
559
|
client = Cartesia(
|
529
560
|
api_key="YOUR_API_KEY",
|
530
561
|
)
|
531
|
-
client.voices.
|
532
|
-
name="
|
533
|
-
description="
|
534
|
-
|
535
|
-
language="en",
|
536
|
-
enhance=True,
|
562
|
+
client.voices.create(
|
563
|
+
name="name",
|
564
|
+
description="description",
|
565
|
+
embedding=[1.1, 1.1],
|
537
566
|
)
|
538
567
|
"""
|
539
568
|
_response = self._client_wrapper.httpx_client.request(
|
540
|
-
"voices/
|
569
|
+
"voices/",
|
541
570
|
method="POST",
|
542
|
-
|
571
|
+
json={
|
543
572
|
"name": name,
|
544
573
|
"description": description,
|
574
|
+
"embedding": embedding,
|
545
575
|
"language": language,
|
546
|
-
"
|
547
|
-
"enhance": enhance,
|
548
|
-
"transcript": transcript,
|
549
|
-
},
|
550
|
-
files={
|
551
|
-
"clip": clip,
|
576
|
+
"base_voice_id": base_voice_id,
|
552
577
|
},
|
553
578
|
request_options=request_options,
|
554
579
|
omit=OMIT,
|
@@ -556,9 +581,9 @@ class VoicesClient:
|
|
556
581
|
try:
|
557
582
|
if 200 <= _response.status_code < 300:
|
558
583
|
return typing.cast(
|
559
|
-
|
584
|
+
Voice,
|
560
585
|
parse_obj_as(
|
561
|
-
type_=
|
586
|
+
type_=Voice, # type: ignore
|
562
587
|
object_=_response.json(),
|
563
588
|
),
|
564
589
|
)
|
@@ -685,34 +710,60 @@ class AsyncVoicesClient:
|
|
685
710
|
raise ApiError(status_code=_response.status_code, body=_response.text)
|
686
711
|
raise ApiError(status_code=_response.status_code, body=_response_json)
|
687
712
|
|
688
|
-
async def
|
713
|
+
async def clone(
|
689
714
|
self,
|
690
715
|
*,
|
716
|
+
clip: core.File,
|
691
717
|
name: str,
|
692
|
-
|
693
|
-
|
694
|
-
|
718
|
+
language: SupportedLanguage,
|
719
|
+
mode: CloneMode,
|
720
|
+
description: typing.Optional[str] = OMIT,
|
721
|
+
enhance: typing.Optional[bool] = OMIT,
|
722
|
+
transcript: typing.Optional[str] = OMIT,
|
695
723
|
request_options: typing.Optional[RequestOptions] = None,
|
696
|
-
) ->
|
724
|
+
) -> VoiceMetadata:
|
697
725
|
"""
|
726
|
+
Clone a voice from an audio clip. This endpoint has two modes, stability and similarity.
|
727
|
+
|
728
|
+
Similarity mode clones are more similar to the source clip, but may reproduce background noise. For these, use an audio clip about 5 seconds long.
|
729
|
+
|
730
|
+
Stability mode clones are more stable, but may not sound as similar to the source clip. For these, use an audio clip 10-20 seconds long.
|
731
|
+
|
698
732
|
Parameters
|
699
733
|
----------
|
734
|
+
clip : core.File
|
735
|
+
See core.File for more documentation
|
736
|
+
|
700
737
|
name : str
|
701
738
|
The name of the voice.
|
702
739
|
|
703
|
-
description : str
|
704
|
-
The description of the voice.
|
705
740
|
|
706
|
-
|
741
|
+
language : SupportedLanguage
|
742
|
+
The language of the voice.
|
743
|
+
|
744
|
+
|
745
|
+
mode : CloneMode
|
746
|
+
Tradeoff between similarity and stability. Similarity clones sound more like the source clip, but may reproduce background noise. Stability clones always sound like a studio recording, but may not sound as similar to the source clip.
|
747
|
+
|
748
|
+
|
749
|
+
description : typing.Optional[str]
|
750
|
+
A description for the voice.
|
751
|
+
|
752
|
+
|
753
|
+
enhance : typing.Optional[bool]
|
754
|
+
Whether to enhance the clip to improve its quality before cloning. Useful if the clip has background noise.
|
755
|
+
|
756
|
+
|
757
|
+
transcript : typing.Optional[str]
|
758
|
+
Optional transcript of the words spoken in the audio clip. Only used for similarity mode.
|
707
759
|
|
708
|
-
language : typing.Optional[SupportedLanguage]
|
709
760
|
|
710
761
|
request_options : typing.Optional[RequestOptions]
|
711
762
|
Request-specific configuration.
|
712
763
|
|
713
764
|
Returns
|
714
765
|
-------
|
715
|
-
|
766
|
+
VoiceMetadata
|
716
767
|
|
717
768
|
Examples
|
718
769
|
--------
|
@@ -726,23 +777,30 @@ class AsyncVoicesClient:
|
|
726
777
|
|
727
778
|
|
728
779
|
async def main() -> None:
|
729
|
-
await client.voices.
|
730
|
-
name="
|
731
|
-
description="
|
732
|
-
|
780
|
+
await client.voices.clone(
|
781
|
+
name="A high-stability cloned voice",
|
782
|
+
description="Copied from Cartesia docs",
|
783
|
+
mode="stability",
|
784
|
+
language="en",
|
785
|
+
enhance=True,
|
733
786
|
)
|
734
787
|
|
735
788
|
|
736
789
|
asyncio.run(main())
|
737
790
|
"""
|
738
791
|
_response = await self._client_wrapper.httpx_client.request(
|
739
|
-
"voices/",
|
792
|
+
"voices/clone",
|
740
793
|
method="POST",
|
741
|
-
|
794
|
+
data={
|
742
795
|
"name": name,
|
743
796
|
"description": description,
|
744
|
-
"embedding": embedding,
|
745
797
|
"language": language,
|
798
|
+
"mode": mode,
|
799
|
+
"enhance": enhance,
|
800
|
+
"transcript": transcript,
|
801
|
+
},
|
802
|
+
files={
|
803
|
+
"clip": clip,
|
746
804
|
},
|
747
805
|
request_options=request_options,
|
748
806
|
omit=OMIT,
|
@@ -750,9 +808,9 @@ class AsyncVoicesClient:
|
|
750
808
|
try:
|
751
809
|
if 200 <= _response.status_code < 300:
|
752
810
|
return typing.cast(
|
753
|
-
|
811
|
+
VoiceMetadata,
|
754
812
|
parse_obj_as(
|
755
|
-
type_=
|
813
|
+
type_=VoiceMetadata, # type: ignore
|
756
814
|
object_=_response.json(),
|
757
815
|
),
|
758
816
|
)
|
@@ -926,16 +984,27 @@ class AsyncVoicesClient:
|
|
926
984
|
async def localize(
|
927
985
|
self,
|
928
986
|
*,
|
929
|
-
|
987
|
+
voice_id: str,
|
988
|
+
name: str,
|
989
|
+
description: str,
|
930
990
|
language: LocalizeTargetLanguage,
|
931
991
|
original_speaker_gender: Gender,
|
932
992
|
dialect: typing.Optional[LocalizeDialectParams] = OMIT,
|
933
993
|
request_options: typing.Optional[RequestOptions] = None,
|
934
|
-
) ->
|
994
|
+
) -> VoiceMetadata:
|
935
995
|
"""
|
996
|
+
Create a new voice from an existing voice localized to a new language and dialect.
|
997
|
+
|
936
998
|
Parameters
|
937
999
|
----------
|
938
|
-
|
1000
|
+
voice_id : str
|
1001
|
+
The ID of the voice to localize.
|
1002
|
+
|
1003
|
+
name : str
|
1004
|
+
The name of the new localized voice.
|
1005
|
+
|
1006
|
+
description : str
|
1007
|
+
The description of the new localized voice.
|
939
1008
|
|
940
1009
|
language : LocalizeTargetLanguage
|
941
1010
|
|
@@ -948,7 +1017,7 @@ class AsyncVoicesClient:
|
|
948
1017
|
|
949
1018
|
Returns
|
950
1019
|
-------
|
951
|
-
|
1020
|
+
VoiceMetadata
|
952
1021
|
|
953
1022
|
Examples
|
954
1023
|
--------
|
@@ -963,9 +1032,12 @@ class AsyncVoicesClient:
|
|
963
1032
|
|
964
1033
|
async def main() -> None:
|
965
1034
|
await client.voices.localize(
|
966
|
-
|
967
|
-
|
968
|
-
|
1035
|
+
voice_id="694f9389-aac1-45b6-b726-9d9369183238",
|
1036
|
+
name="Sarah Peninsular Spanish",
|
1037
|
+
description="Sarah Voice in Peninsular Spanish",
|
1038
|
+
language="es",
|
1039
|
+
original_speaker_gender="female",
|
1040
|
+
dialect="pe",
|
969
1041
|
)
|
970
1042
|
|
971
1043
|
|
@@ -975,7 +1047,9 @@ class AsyncVoicesClient:
|
|
975
1047
|
"voices/localize",
|
976
1048
|
method="POST",
|
977
1049
|
json={
|
978
|
-
"
|
1050
|
+
"voice_id": voice_id,
|
1051
|
+
"name": name,
|
1052
|
+
"description": description,
|
979
1053
|
"language": language,
|
980
1054
|
"original_speaker_gender": original_speaker_gender,
|
981
1055
|
"dialect": convert_and_respect_annotation_metadata(
|
@@ -988,9 +1062,9 @@ class AsyncVoicesClient:
|
|
988
1062
|
try:
|
989
1063
|
if 200 <= _response.status_code < 300:
|
990
1064
|
return typing.cast(
|
991
|
-
|
1065
|
+
VoiceMetadata,
|
992
1066
|
parse_obj_as(
|
993
|
-
type_=
|
1067
|
+
type_=VoiceMetadata, # type: ignore
|
994
1068
|
object_=_response.json(),
|
995
1069
|
),
|
996
1070
|
)
|
@@ -1061,58 +1135,39 @@ class AsyncVoicesClient:
|
|
1061
1135
|
raise ApiError(status_code=_response.status_code, body=_response.text)
|
1062
1136
|
raise ApiError(status_code=_response.status_code, body=_response_json)
|
1063
1137
|
|
1064
|
-
async def
|
1138
|
+
async def create(
|
1065
1139
|
self,
|
1066
1140
|
*,
|
1067
|
-
clip: core.File,
|
1068
1141
|
name: str,
|
1069
|
-
|
1070
|
-
|
1071
|
-
|
1072
|
-
|
1073
|
-
transcript: typing.Optional[str] = OMIT,
|
1142
|
+
description: str,
|
1143
|
+
embedding: Embedding,
|
1144
|
+
language: typing.Optional[SupportedLanguage] = OMIT,
|
1145
|
+
base_voice_id: typing.Optional[BaseVoiceId] = OMIT,
|
1074
1146
|
request_options: typing.Optional[RequestOptions] = None,
|
1075
|
-
) ->
|
1147
|
+
) -> Voice:
|
1076
1148
|
"""
|
1077
|
-
|
1078
|
-
Similarity mode clones are more similar to the source clip, but may reproduce background noise. For these, use an audio clip about 5 seconds long.
|
1079
|
-
Stability mode clones are more stable, but may not sound as similar to the source clip. For these, use an audio clip 10-20 seconds long.
|
1149
|
+
Create voice from raw features. If you'd like to clone a voice from an audio file, please use Clone Voice instead.
|
1080
1150
|
|
1081
1151
|
Parameters
|
1082
1152
|
----------
|
1083
|
-
clip : core.File
|
1084
|
-
See core.File for more documentation
|
1085
|
-
|
1086
1153
|
name : str
|
1087
1154
|
The name of the voice.
|
1088
1155
|
|
1156
|
+
description : str
|
1157
|
+
The description of the voice.
|
1089
1158
|
|
1090
|
-
|
1091
|
-
The language of the voice.
|
1092
|
-
|
1093
|
-
|
1094
|
-
mode : CloneMode
|
1095
|
-
Tradeoff between similarity and stability. Similarity clones sound more like the source clip, but may reproduce background noise. Stability clones always sound like a studio recording, but may not sound as similar to the source clip.
|
1096
|
-
|
1097
|
-
|
1098
|
-
enhance : bool
|
1099
|
-
Whether to enhance the clip to improve its quality before cloning. Useful if the clip has background noise.
|
1100
|
-
|
1101
|
-
|
1102
|
-
description : typing.Optional[str]
|
1103
|
-
A description for the voice.
|
1104
|
-
|
1159
|
+
embedding : Embedding
|
1105
1160
|
|
1106
|
-
|
1107
|
-
Optional transcript of the words spoken in the audio clip. Only used for similarity mode.
|
1161
|
+
language : typing.Optional[SupportedLanguage]
|
1108
1162
|
|
1163
|
+
base_voice_id : typing.Optional[BaseVoiceId]
|
1109
1164
|
|
1110
1165
|
request_options : typing.Optional[RequestOptions]
|
1111
1166
|
Request-specific configuration.
|
1112
1167
|
|
1113
1168
|
Returns
|
1114
1169
|
-------
|
1115
|
-
|
1170
|
+
Voice
|
1116
1171
|
|
1117
1172
|
Examples
|
1118
1173
|
--------
|
@@ -1126,30 +1181,24 @@ class AsyncVoicesClient:
|
|
1126
1181
|
|
1127
1182
|
|
1128
1183
|
async def main() -> None:
|
1129
|
-
await client.voices.
|
1130
|
-
name="
|
1131
|
-
description="
|
1132
|
-
|
1133
|
-
language="en",
|
1134
|
-
enhance=True,
|
1184
|
+
await client.voices.create(
|
1185
|
+
name="name",
|
1186
|
+
description="description",
|
1187
|
+
embedding=[1.1, 1.1],
|
1135
1188
|
)
|
1136
1189
|
|
1137
1190
|
|
1138
1191
|
asyncio.run(main())
|
1139
1192
|
"""
|
1140
1193
|
_response = await self._client_wrapper.httpx_client.request(
|
1141
|
-
"voices/
|
1194
|
+
"voices/",
|
1142
1195
|
method="POST",
|
1143
|
-
|
1196
|
+
json={
|
1144
1197
|
"name": name,
|
1145
1198
|
"description": description,
|
1199
|
+
"embedding": embedding,
|
1146
1200
|
"language": language,
|
1147
|
-
"
|
1148
|
-
"enhance": enhance,
|
1149
|
-
"transcript": transcript,
|
1150
|
-
},
|
1151
|
-
files={
|
1152
|
-
"clip": clip,
|
1201
|
+
"base_voice_id": base_voice_id,
|
1153
1202
|
},
|
1154
1203
|
request_options=request_options,
|
1155
1204
|
omit=OMIT,
|
@@ -1157,9 +1206,9 @@ class AsyncVoicesClient:
|
|
1157
1206
|
try:
|
1158
1207
|
if 200 <= _response.status_code < 300:
|
1159
1208
|
return typing.cast(
|
1160
|
-
|
1209
|
+
Voice,
|
1161
1210
|
parse_obj_as(
|
1162
|
-
type_=
|
1211
|
+
type_=Voice, # type: ignore
|
1163
1212
|
object_=_response.json(),
|
1164
1213
|
),
|
1165
1214
|
)
|
@@ -4,6 +4,7 @@ import typing_extensions
|
|
4
4
|
from ...embedding.types.embedding import Embedding
|
5
5
|
import typing_extensions
|
6
6
|
from ...tts.types.supported_language import SupportedLanguage
|
7
|
+
from ..types.base_voice_id import BaseVoiceId
|
7
8
|
|
8
9
|
|
9
10
|
class CreateVoiceRequestParams(typing_extensions.TypedDict):
|
@@ -19,3 +20,4 @@ class CreateVoiceRequestParams(typing_extensions.TypedDict):
|
|
19
20
|
|
20
21
|
embedding: Embedding
|
21
22
|
language: typing_extensions.NotRequired[SupportedLanguage]
|
23
|
+
base_voice_id: typing_extensions.NotRequired[BaseVoiceId]
|
@@ -2,5 +2,10 @@
|
|
2
2
|
|
3
3
|
import typing
|
4
4
|
from ..types.localize_english_dialect import LocalizeEnglishDialect
|
5
|
+
from ..types.localize_spanish_dialect import LocalizeSpanishDialect
|
6
|
+
from ..types.localize_portuguese_dialect import LocalizePortugueseDialect
|
7
|
+
from ..types.localize_french_dialect import LocalizeFrenchDialect
|
5
8
|
|
6
|
-
LocalizeDialectParams = typing.Union[
|
9
|
+
LocalizeDialectParams = typing.Union[
|
10
|
+
LocalizeEnglishDialect, LocalizeSpanishDialect, LocalizePortugueseDialect, LocalizeFrenchDialect
|
11
|
+
]
|