cartesia 2.0.0b2__py3-none-any.whl → 2.0.0b7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cartesia/__init__.py +8 -4
- cartesia/base_client.py +0 -4
- cartesia/core/__init__.py +3 -0
- cartesia/core/client_wrapper.py +2 -2
- cartesia/core/pagination.py +88 -0
- cartesia/infill/client.py +4 -4
- cartesia/tts/_async_websocket.py +48 -1
- cartesia/tts/_websocket.py +44 -3
- cartesia/tts/client.py +4 -4
- cartesia/tts/requests/generation_request.py +5 -0
- cartesia/tts/requests/web_socket_chunk_response.py +3 -0
- cartesia/tts/requests/web_socket_response.py +2 -1
- cartesia/tts/requests/web_socket_tts_request.py +1 -0
- cartesia/tts/types/emotion.py +5 -0
- cartesia/tts/types/generation_request.py +5 -0
- cartesia/tts/types/web_socket_chunk_response.py +3 -1
- cartesia/tts/types/web_socket_response.py +2 -1
- cartesia/tts/types/web_socket_tts_output.py +2 -0
- cartesia/tts/types/web_socket_tts_request.py +1 -0
- cartesia/tts/utils/constants.py +2 -2
- cartesia/voice_changer/requests/streaming_response.py +2 -0
- cartesia/voice_changer/types/streaming_response.py +2 -0
- cartesia/voices/__init__.py +8 -4
- cartesia/voices/client.py +285 -169
- cartesia/voices/requests/__init__.py +2 -0
- cartesia/voices/requests/create_voice_request.py +0 -2
- cartesia/voices/requests/get_voices_response.py +24 -0
- cartesia/voices/requests/localize_dialect.py +1 -3
- cartesia/voices/requests/voice.py +13 -9
- cartesia/voices/types/__init__.py +6 -4
- cartesia/voices/types/create_voice_request.py +0 -2
- cartesia/voices/types/gender_presentation.py +5 -0
- cartesia/voices/types/get_voices_response.py +34 -0
- cartesia/voices/types/localize_dialect.py +1 -3
- cartesia/voices/types/voice.py +13 -9
- cartesia/voices/types/voice_expand_options.py +5 -0
- {cartesia-2.0.0b2.dist-info → cartesia-2.0.0b7.dist-info}/METADATA +85 -14
- {cartesia-2.0.0b2.dist-info → cartesia-2.0.0b7.dist-info}/RECORD +39 -37
- cartesia/datasets/client.py +0 -392
- cartesia/voices/types/localize_portuguese_dialect.py +0 -5
- cartesia/voices/types/localize_spanish_dialect.py +0 -5
- {cartesia-2.0.0b2.dist-info → cartesia-2.0.0b7.dist-info}/WHEEL +0 -0
cartesia/voices/client.py
CHANGED
@@ -2,26 +2,30 @@
|
|
2
2
|
|
3
3
|
import typing
|
4
4
|
from ..core.client_wrapper import SyncClientWrapper
|
5
|
+
from .types.gender_presentation import GenderPresentation
|
6
|
+
from .types.voice_expand_options import VoiceExpandOptions
|
5
7
|
from ..core.request_options import RequestOptions
|
8
|
+
from ..core.pagination import SyncPager
|
6
9
|
from .types.voice import Voice
|
10
|
+
from .types.get_voices_response import GetVoicesResponse
|
7
11
|
from ..core.pydantic_utilities import parse_obj_as
|
8
12
|
from json.decoder import JSONDecodeError
|
9
13
|
from ..core.api_error import ApiError
|
10
|
-
from .. import
|
14
|
+
from ..embedding.types.embedding import Embedding
|
11
15
|
from ..tts.types.supported_language import SupportedLanguage
|
12
|
-
from .types.clone_mode import CloneMode
|
13
|
-
from .types.voice_metadata import VoiceMetadata
|
14
16
|
from .types.voice_id import VoiceId
|
15
17
|
from ..core.jsonable_encoder import jsonable_encoder
|
16
|
-
from ..embedding.types.embedding import Embedding
|
17
18
|
from .types.localize_target_language import LocalizeTargetLanguage
|
18
19
|
from .types.gender import Gender
|
19
20
|
from .requests.localize_dialect import LocalizeDialectParams
|
20
21
|
from .types.embedding_response import EmbeddingResponse
|
21
22
|
from ..core.serialization import convert_and_respect_annotation_metadata
|
22
23
|
from .requests.mix_voice_specifier import MixVoiceSpecifierParams
|
23
|
-
from
|
24
|
+
from .. import core
|
25
|
+
from .types.clone_mode import CloneMode
|
26
|
+
from .types.voice_metadata import VoiceMetadata
|
24
27
|
from ..core.client_wrapper import AsyncClientWrapper
|
28
|
+
from ..core.pagination import AsyncPager
|
25
29
|
|
26
30
|
# this is used as the default value for optional parameters
|
27
31
|
OMIT = typing.cast(typing.Any, ...)
|
@@ -31,16 +35,54 @@ class VoicesClient:
|
|
31
35
|
def __init__(self, *, client_wrapper: SyncClientWrapper):
|
32
36
|
self._client_wrapper = client_wrapper
|
33
37
|
|
34
|
-
def list(
|
38
|
+
def list(
|
39
|
+
self,
|
40
|
+
*,
|
41
|
+
limit: typing.Optional[int] = None,
|
42
|
+
starting_after: typing.Optional[str] = None,
|
43
|
+
ending_before: typing.Optional[str] = None,
|
44
|
+
is_owner: typing.Optional[bool] = None,
|
45
|
+
is_starred: typing.Optional[bool] = None,
|
46
|
+
gender: typing.Optional[GenderPresentation] = None,
|
47
|
+
expand: typing.Optional[typing.Sequence[VoiceExpandOptions]] = None,
|
48
|
+
request_options: typing.Optional[RequestOptions] = None,
|
49
|
+
) -> SyncPager[Voice]:
|
35
50
|
"""
|
36
51
|
Parameters
|
37
52
|
----------
|
53
|
+
limit : typing.Optional[int]
|
54
|
+
The number of Voices to return per page, ranging between 1 and 100.
|
55
|
+
|
56
|
+
starting_after : typing.Optional[str]
|
57
|
+
A cursor to use in pagination. `starting_after` is a Voice ID that defines your
|
58
|
+
place in the list. For example, if you make a /voices request and receive 100
|
59
|
+
objects, ending with `voice_abc123`, your subsequent call can include
|
60
|
+
`starting_after=voice_abc123` to fetch the next page of the list.
|
61
|
+
|
62
|
+
ending_before : typing.Optional[str]
|
63
|
+
A cursor to use in pagination. `ending_before` is a Voice ID that defines your
|
64
|
+
place in the list. For example, if you make a /voices request and receive 100
|
65
|
+
objects, starting with `voice_abc123`, your subsequent call can include
|
66
|
+
`ending_before=voice_abc123` to fetch the previous page of the list.
|
67
|
+
|
68
|
+
is_owner : typing.Optional[bool]
|
69
|
+
Whether to only return voices owned by the current user.
|
70
|
+
|
71
|
+
is_starred : typing.Optional[bool]
|
72
|
+
Whether to only return starred voices.
|
73
|
+
|
74
|
+
gender : typing.Optional[GenderPresentation]
|
75
|
+
The gender presentation of the voices to return.
|
76
|
+
|
77
|
+
expand : typing.Optional[typing.Sequence[VoiceExpandOptions]]
|
78
|
+
Additional fields to include in the response.
|
79
|
+
|
38
80
|
request_options : typing.Optional[RequestOptions]
|
39
81
|
Request-specific configuration.
|
40
82
|
|
41
83
|
Returns
|
42
84
|
-------
|
43
|
-
|
85
|
+
SyncPager[Voice]
|
44
86
|
|
45
87
|
Examples
|
46
88
|
--------
|
@@ -49,81 +91,83 @@ class VoicesClient:
|
|
49
91
|
client = Cartesia(
|
50
92
|
api_key="YOUR_API_KEY",
|
51
93
|
)
|
52
|
-
client.voices.list()
|
94
|
+
response = client.voices.list()
|
95
|
+
for item in response:
|
96
|
+
yield item
|
97
|
+
# alternatively, you can paginate page-by-page
|
98
|
+
for page in response.iter_pages():
|
99
|
+
yield page
|
53
100
|
"""
|
54
101
|
_response = self._client_wrapper.httpx_client.request(
|
55
102
|
"voices/",
|
56
103
|
method="GET",
|
104
|
+
params={
|
105
|
+
"limit": limit,
|
106
|
+
"starting_after": starting_after,
|
107
|
+
"ending_before": ending_before,
|
108
|
+
"is_owner": is_owner,
|
109
|
+
"is_starred": is_starred,
|
110
|
+
"gender": gender,
|
111
|
+
"expand[]": expand,
|
112
|
+
},
|
57
113
|
request_options=request_options,
|
58
114
|
)
|
59
115
|
try:
|
60
116
|
if 200 <= _response.status_code < 300:
|
61
|
-
|
62
|
-
|
117
|
+
_parsed_response = typing.cast(
|
118
|
+
GetVoicesResponse,
|
63
119
|
parse_obj_as(
|
64
|
-
type_=
|
120
|
+
type_=GetVoicesResponse, # type: ignore
|
65
121
|
object_=_response.json(),
|
66
122
|
),
|
67
123
|
)
|
124
|
+
_parsed_next = _parsed_response.next_page
|
125
|
+
_has_next = _parsed_next is not None and _parsed_next != ""
|
126
|
+
_get_next = lambda: self.list(
|
127
|
+
limit=limit,
|
128
|
+
starting_after=_parsed_next,
|
129
|
+
ending_before=ending_before,
|
130
|
+
is_owner=is_owner,
|
131
|
+
is_starred=is_starred,
|
132
|
+
gender=gender,
|
133
|
+
expand=expand,
|
134
|
+
request_options=request_options,
|
135
|
+
)
|
136
|
+
_items = _parsed_response.data
|
137
|
+
return SyncPager(has_next=_has_next, items=_items, get_next=_get_next)
|
68
138
|
_response_json = _response.json()
|
69
139
|
except JSONDecodeError:
|
70
140
|
raise ApiError(status_code=_response.status_code, body=_response.text)
|
71
141
|
raise ApiError(status_code=_response.status_code, body=_response_json)
|
72
142
|
|
73
|
-
def
|
143
|
+
def create(
|
74
144
|
self,
|
75
145
|
*,
|
76
|
-
clip: core.File,
|
77
146
|
name: str,
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
description: typing.Optional[str] = OMIT,
|
82
|
-
transcript: typing.Optional[str] = OMIT,
|
147
|
+
description: str,
|
148
|
+
embedding: Embedding,
|
149
|
+
language: typing.Optional[SupportedLanguage] = OMIT,
|
83
150
|
request_options: typing.Optional[RequestOptions] = None,
|
84
|
-
) ->
|
151
|
+
) -> Voice:
|
85
152
|
"""
|
86
|
-
Clone a voice from an audio clip. This endpoint has two modes, stability and similarity.
|
87
|
-
|
88
|
-
Similarity mode clones are more similar to the source clip, but may reproduce background noise. For these, use an audio clip about 5 seconds long.
|
89
|
-
|
90
|
-
Stability mode clones are more stable, but may not sound as similar to the source clip. For these, use an audio clip 10-20 seconds long.
|
91
|
-
|
92
153
|
Parameters
|
93
154
|
----------
|
94
|
-
clip : core.File
|
95
|
-
See core.File for more documentation
|
96
|
-
|
97
155
|
name : str
|
98
156
|
The name of the voice.
|
99
157
|
|
158
|
+
description : str
|
159
|
+
The description of the voice.
|
100
160
|
|
101
|
-
|
102
|
-
The language of the voice.
|
103
|
-
|
104
|
-
|
105
|
-
mode : CloneMode
|
106
|
-
Tradeoff between similarity and stability. Similarity clones sound more like the source clip, but may reproduce background noise. Stability clones always sound like a studio recording, but may not sound as similar to the source clip.
|
107
|
-
|
108
|
-
|
109
|
-
enhance : bool
|
110
|
-
Whether to enhance the clip to improve its quality before cloning. Useful if the clip has background noise.
|
111
|
-
|
112
|
-
|
113
|
-
description : typing.Optional[str]
|
114
|
-
A description for the voice.
|
115
|
-
|
116
|
-
|
117
|
-
transcript : typing.Optional[str]
|
118
|
-
Optional transcript of the words spoken in the audio clip. Only used for similarity mode.
|
161
|
+
embedding : Embedding
|
119
162
|
|
163
|
+
language : typing.Optional[SupportedLanguage]
|
120
164
|
|
121
165
|
request_options : typing.Optional[RequestOptions]
|
122
166
|
Request-specific configuration.
|
123
167
|
|
124
168
|
Returns
|
125
169
|
-------
|
126
|
-
|
170
|
+
Voice
|
127
171
|
|
128
172
|
Examples
|
129
173
|
--------
|
@@ -132,27 +176,20 @@ class VoicesClient:
|
|
132
176
|
client = Cartesia(
|
133
177
|
api_key="YOUR_API_KEY",
|
134
178
|
)
|
135
|
-
client.voices.
|
136
|
-
name="
|
137
|
-
description="
|
138
|
-
|
139
|
-
language="en",
|
140
|
-
enhance=True,
|
179
|
+
client.voices.create(
|
180
|
+
name="name",
|
181
|
+
description="description",
|
182
|
+
embedding=[1.1, 1.1],
|
141
183
|
)
|
142
184
|
"""
|
143
185
|
_response = self._client_wrapper.httpx_client.request(
|
144
|
-
"voices/
|
186
|
+
"voices/",
|
145
187
|
method="POST",
|
146
|
-
|
188
|
+
json={
|
147
189
|
"name": name,
|
148
190
|
"description": description,
|
191
|
+
"embedding": embedding,
|
149
192
|
"language": language,
|
150
|
-
"mode": mode,
|
151
|
-
"enhance": enhance,
|
152
|
-
"transcript": transcript,
|
153
|
-
},
|
154
|
-
files={
|
155
|
-
"clip": clip,
|
156
193
|
},
|
157
194
|
request_options=request_options,
|
158
195
|
omit=OMIT,
|
@@ -160,9 +197,9 @@ class VoicesClient:
|
|
160
197
|
try:
|
161
198
|
if 200 <= _response.status_code < 300:
|
162
199
|
return typing.cast(
|
163
|
-
|
200
|
+
Voice,
|
164
201
|
parse_obj_as(
|
165
|
-
type_=
|
202
|
+
type_=Voice, # type: ignore
|
166
203
|
object_=_response.json(),
|
167
204
|
),
|
168
205
|
)
|
@@ -431,39 +468,58 @@ class VoicesClient:
|
|
431
468
|
raise ApiError(status_code=_response.status_code, body=_response.text)
|
432
469
|
raise ApiError(status_code=_response.status_code, body=_response_json)
|
433
470
|
|
434
|
-
def
|
471
|
+
def clone(
|
435
472
|
self,
|
436
473
|
*,
|
474
|
+
clip: core.File,
|
437
475
|
name: str,
|
438
|
-
|
439
|
-
|
440
|
-
|
441
|
-
|
476
|
+
language: SupportedLanguage,
|
477
|
+
mode: CloneMode,
|
478
|
+
enhance: bool,
|
479
|
+
description: typing.Optional[str] = OMIT,
|
480
|
+
transcript: typing.Optional[str] = OMIT,
|
442
481
|
request_options: typing.Optional[RequestOptions] = None,
|
443
|
-
) ->
|
482
|
+
) -> VoiceMetadata:
|
444
483
|
"""
|
445
|
-
|
484
|
+
Clone a voice from an audio clip. This endpoint has two modes, stability and similarity.
|
485
|
+
Similarity mode clones are more similar to the source clip, but may reproduce background noise. For these, use an audio clip about 5 seconds long.
|
486
|
+
Stability mode clones are more stable, but may not sound as similar to the source clip. For these, use an audio clip 10-20 seconds long.
|
446
487
|
|
447
488
|
Parameters
|
448
489
|
----------
|
490
|
+
clip : core.File
|
491
|
+
See core.File for more documentation
|
492
|
+
|
449
493
|
name : str
|
450
494
|
The name of the voice.
|
451
495
|
|
452
|
-
description : str
|
453
|
-
The description of the voice.
|
454
496
|
|
455
|
-
|
497
|
+
language : SupportedLanguage
|
498
|
+
The language of the voice.
|
456
499
|
|
457
|
-
language : typing.Optional[SupportedLanguage]
|
458
500
|
|
459
|
-
|
501
|
+
mode : CloneMode
|
502
|
+
Tradeoff between similarity and stability. Similarity clones sound more like the source clip, but may reproduce background noise. Stability clones always sound like a studio recording, but may not sound as similar to the source clip.
|
503
|
+
|
504
|
+
|
505
|
+
enhance : bool
|
506
|
+
Whether to enhance the clip to improve its quality before cloning. Useful if the clip has background noise.
|
507
|
+
|
508
|
+
|
509
|
+
description : typing.Optional[str]
|
510
|
+
A description for the voice.
|
511
|
+
|
512
|
+
|
513
|
+
transcript : typing.Optional[str]
|
514
|
+
Optional transcript of the words spoken in the audio clip. Only used for similarity mode.
|
515
|
+
|
460
516
|
|
461
517
|
request_options : typing.Optional[RequestOptions]
|
462
518
|
Request-specific configuration.
|
463
519
|
|
464
520
|
Returns
|
465
521
|
-------
|
466
|
-
|
522
|
+
VoiceMetadata
|
467
523
|
|
468
524
|
Examples
|
469
525
|
--------
|
@@ -472,23 +528,27 @@ class VoicesClient:
|
|
472
528
|
client = Cartesia(
|
473
529
|
api_key="YOUR_API_KEY",
|
474
530
|
)
|
475
|
-
client.voices.
|
476
|
-
name="
|
477
|
-
description="
|
478
|
-
|
531
|
+
client.voices.clone(
|
532
|
+
name="A high-stability cloned voice",
|
533
|
+
description="Copied from Cartesia docs",
|
534
|
+
mode="stability",
|
479
535
|
language="en",
|
480
|
-
|
536
|
+
enhance=True,
|
481
537
|
)
|
482
538
|
"""
|
483
539
|
_response = self._client_wrapper.httpx_client.request(
|
484
|
-
"voices/",
|
540
|
+
"voices/clone",
|
485
541
|
method="POST",
|
486
|
-
|
542
|
+
data={
|
487
543
|
"name": name,
|
488
544
|
"description": description,
|
489
|
-
"embedding": embedding,
|
490
545
|
"language": language,
|
491
|
-
"
|
546
|
+
"mode": mode,
|
547
|
+
"enhance": enhance,
|
548
|
+
"transcript": transcript,
|
549
|
+
},
|
550
|
+
files={
|
551
|
+
"clip": clip,
|
492
552
|
},
|
493
553
|
request_options=request_options,
|
494
554
|
omit=OMIT,
|
@@ -496,9 +556,9 @@ class VoicesClient:
|
|
496
556
|
try:
|
497
557
|
if 200 <= _response.status_code < 300:
|
498
558
|
return typing.cast(
|
499
|
-
|
559
|
+
VoiceMetadata,
|
500
560
|
parse_obj_as(
|
501
|
-
type_=
|
561
|
+
type_=VoiceMetadata, # type: ignore
|
502
562
|
object_=_response.json(),
|
503
563
|
),
|
504
564
|
)
|
@@ -512,16 +572,54 @@ class AsyncVoicesClient:
|
|
512
572
|
def __init__(self, *, client_wrapper: AsyncClientWrapper):
|
513
573
|
self._client_wrapper = client_wrapper
|
514
574
|
|
515
|
-
async def list(
|
575
|
+
async def list(
|
576
|
+
self,
|
577
|
+
*,
|
578
|
+
limit: typing.Optional[int] = None,
|
579
|
+
starting_after: typing.Optional[str] = None,
|
580
|
+
ending_before: typing.Optional[str] = None,
|
581
|
+
is_owner: typing.Optional[bool] = None,
|
582
|
+
is_starred: typing.Optional[bool] = None,
|
583
|
+
gender: typing.Optional[GenderPresentation] = None,
|
584
|
+
expand: typing.Optional[typing.Sequence[VoiceExpandOptions]] = None,
|
585
|
+
request_options: typing.Optional[RequestOptions] = None,
|
586
|
+
) -> AsyncPager[Voice]:
|
516
587
|
"""
|
517
588
|
Parameters
|
518
589
|
----------
|
590
|
+
limit : typing.Optional[int]
|
591
|
+
The number of Voices to return per page, ranging between 1 and 100.
|
592
|
+
|
593
|
+
starting_after : typing.Optional[str]
|
594
|
+
A cursor to use in pagination. `starting_after` is a Voice ID that defines your
|
595
|
+
place in the list. For example, if you make a /voices request and receive 100
|
596
|
+
objects, ending with `voice_abc123`, your subsequent call can include
|
597
|
+
`starting_after=voice_abc123` to fetch the next page of the list.
|
598
|
+
|
599
|
+
ending_before : typing.Optional[str]
|
600
|
+
A cursor to use in pagination. `ending_before` is a Voice ID that defines your
|
601
|
+
place in the list. For example, if you make a /voices request and receive 100
|
602
|
+
objects, starting with `voice_abc123`, your subsequent call can include
|
603
|
+
`ending_before=voice_abc123` to fetch the previous page of the list.
|
604
|
+
|
605
|
+
is_owner : typing.Optional[bool]
|
606
|
+
Whether to only return voices owned by the current user.
|
607
|
+
|
608
|
+
is_starred : typing.Optional[bool]
|
609
|
+
Whether to only return starred voices.
|
610
|
+
|
611
|
+
gender : typing.Optional[GenderPresentation]
|
612
|
+
The gender presentation of the voices to return.
|
613
|
+
|
614
|
+
expand : typing.Optional[typing.Sequence[VoiceExpandOptions]]
|
615
|
+
Additional fields to include in the response.
|
616
|
+
|
519
617
|
request_options : typing.Optional[RequestOptions]
|
520
618
|
Request-specific configuration.
|
521
619
|
|
522
620
|
Returns
|
523
621
|
-------
|
524
|
-
|
622
|
+
AsyncPager[Voice]
|
525
623
|
|
526
624
|
Examples
|
527
625
|
--------
|
@@ -535,7 +633,12 @@ class AsyncVoicesClient:
|
|
535
633
|
|
536
634
|
|
537
635
|
async def main() -> None:
|
538
|
-
await client.voices.list()
|
636
|
+
response = await client.voices.list()
|
637
|
+
async for item in response:
|
638
|
+
yield item
|
639
|
+
# alternatively, you can paginate page-by-page
|
640
|
+
async for page in response.iter_pages():
|
641
|
+
yield page
|
539
642
|
|
540
643
|
|
541
644
|
asyncio.run(main())
|
@@ -543,76 +646,73 @@ class AsyncVoicesClient:
|
|
543
646
|
_response = await self._client_wrapper.httpx_client.request(
|
544
647
|
"voices/",
|
545
648
|
method="GET",
|
649
|
+
params={
|
650
|
+
"limit": limit,
|
651
|
+
"starting_after": starting_after,
|
652
|
+
"ending_before": ending_before,
|
653
|
+
"is_owner": is_owner,
|
654
|
+
"is_starred": is_starred,
|
655
|
+
"gender": gender,
|
656
|
+
"expand[]": expand,
|
657
|
+
},
|
546
658
|
request_options=request_options,
|
547
659
|
)
|
548
660
|
try:
|
549
661
|
if 200 <= _response.status_code < 300:
|
550
|
-
|
551
|
-
|
662
|
+
_parsed_response = typing.cast(
|
663
|
+
GetVoicesResponse,
|
552
664
|
parse_obj_as(
|
553
|
-
type_=
|
665
|
+
type_=GetVoicesResponse, # type: ignore
|
554
666
|
object_=_response.json(),
|
555
667
|
),
|
556
668
|
)
|
669
|
+
_parsed_next = _parsed_response.next_page
|
670
|
+
_has_next = _parsed_next is not None and _parsed_next != ""
|
671
|
+
_get_next = lambda: self.list(
|
672
|
+
limit=limit,
|
673
|
+
starting_after=_parsed_next,
|
674
|
+
ending_before=ending_before,
|
675
|
+
is_owner=is_owner,
|
676
|
+
is_starred=is_starred,
|
677
|
+
gender=gender,
|
678
|
+
expand=expand,
|
679
|
+
request_options=request_options,
|
680
|
+
)
|
681
|
+
_items = _parsed_response.data
|
682
|
+
return AsyncPager(has_next=_has_next, items=_items, get_next=_get_next)
|
557
683
|
_response_json = _response.json()
|
558
684
|
except JSONDecodeError:
|
559
685
|
raise ApiError(status_code=_response.status_code, body=_response.text)
|
560
686
|
raise ApiError(status_code=_response.status_code, body=_response_json)
|
561
687
|
|
562
|
-
async def
|
688
|
+
async def create(
|
563
689
|
self,
|
564
690
|
*,
|
565
|
-
clip: core.File,
|
566
691
|
name: str,
|
567
|
-
|
568
|
-
|
569
|
-
|
570
|
-
description: typing.Optional[str] = OMIT,
|
571
|
-
transcript: typing.Optional[str] = OMIT,
|
692
|
+
description: str,
|
693
|
+
embedding: Embedding,
|
694
|
+
language: typing.Optional[SupportedLanguage] = OMIT,
|
572
695
|
request_options: typing.Optional[RequestOptions] = None,
|
573
|
-
) ->
|
696
|
+
) -> Voice:
|
574
697
|
"""
|
575
|
-
Clone a voice from an audio clip. This endpoint has two modes, stability and similarity.
|
576
|
-
|
577
|
-
Similarity mode clones are more similar to the source clip, but may reproduce background noise. For these, use an audio clip about 5 seconds long.
|
578
|
-
|
579
|
-
Stability mode clones are more stable, but may not sound as similar to the source clip. For these, use an audio clip 10-20 seconds long.
|
580
|
-
|
581
698
|
Parameters
|
582
699
|
----------
|
583
|
-
clip : core.File
|
584
|
-
See core.File for more documentation
|
585
|
-
|
586
700
|
name : str
|
587
701
|
The name of the voice.
|
588
702
|
|
703
|
+
description : str
|
704
|
+
The description of the voice.
|
589
705
|
|
590
|
-
|
591
|
-
The language of the voice.
|
592
|
-
|
593
|
-
|
594
|
-
mode : CloneMode
|
595
|
-
Tradeoff between similarity and stability. Similarity clones sound more like the source clip, but may reproduce background noise. Stability clones always sound like a studio recording, but may not sound as similar to the source clip.
|
596
|
-
|
597
|
-
|
598
|
-
enhance : bool
|
599
|
-
Whether to enhance the clip to improve its quality before cloning. Useful if the clip has background noise.
|
600
|
-
|
601
|
-
|
602
|
-
description : typing.Optional[str]
|
603
|
-
A description for the voice.
|
604
|
-
|
605
|
-
|
606
|
-
transcript : typing.Optional[str]
|
607
|
-
Optional transcript of the words spoken in the audio clip. Only used for similarity mode.
|
706
|
+
embedding : Embedding
|
608
707
|
|
708
|
+
language : typing.Optional[SupportedLanguage]
|
609
709
|
|
610
710
|
request_options : typing.Optional[RequestOptions]
|
611
711
|
Request-specific configuration.
|
612
712
|
|
613
713
|
Returns
|
614
714
|
-------
|
615
|
-
|
715
|
+
Voice
|
616
716
|
|
617
717
|
Examples
|
618
718
|
--------
|
@@ -626,30 +726,23 @@ class AsyncVoicesClient:
|
|
626
726
|
|
627
727
|
|
628
728
|
async def main() -> None:
|
629
|
-
await client.voices.
|
630
|
-
name="
|
631
|
-
description="
|
632
|
-
|
633
|
-
language="en",
|
634
|
-
enhance=True,
|
729
|
+
await client.voices.create(
|
730
|
+
name="name",
|
731
|
+
description="description",
|
732
|
+
embedding=[1.1, 1.1],
|
635
733
|
)
|
636
734
|
|
637
735
|
|
638
736
|
asyncio.run(main())
|
639
737
|
"""
|
640
738
|
_response = await self._client_wrapper.httpx_client.request(
|
641
|
-
"voices/
|
739
|
+
"voices/",
|
642
740
|
method="POST",
|
643
|
-
|
741
|
+
json={
|
644
742
|
"name": name,
|
645
743
|
"description": description,
|
744
|
+
"embedding": embedding,
|
646
745
|
"language": language,
|
647
|
-
"mode": mode,
|
648
|
-
"enhance": enhance,
|
649
|
-
"transcript": transcript,
|
650
|
-
},
|
651
|
-
files={
|
652
|
-
"clip": clip,
|
653
746
|
},
|
654
747
|
request_options=request_options,
|
655
748
|
omit=OMIT,
|
@@ -657,9 +750,9 @@ class AsyncVoicesClient:
|
|
657
750
|
try:
|
658
751
|
if 200 <= _response.status_code < 300:
|
659
752
|
return typing.cast(
|
660
|
-
|
753
|
+
Voice,
|
661
754
|
parse_obj_as(
|
662
|
-
type_=
|
755
|
+
type_=Voice, # type: ignore
|
663
756
|
object_=_response.json(),
|
664
757
|
),
|
665
758
|
)
|
@@ -968,39 +1061,58 @@ class AsyncVoicesClient:
|
|
968
1061
|
raise ApiError(status_code=_response.status_code, body=_response.text)
|
969
1062
|
raise ApiError(status_code=_response.status_code, body=_response_json)
|
970
1063
|
|
971
|
-
async def
|
1064
|
+
async def clone(
|
972
1065
|
self,
|
973
1066
|
*,
|
1067
|
+
clip: core.File,
|
974
1068
|
name: str,
|
975
|
-
|
976
|
-
|
977
|
-
|
978
|
-
|
1069
|
+
language: SupportedLanguage,
|
1070
|
+
mode: CloneMode,
|
1071
|
+
enhance: bool,
|
1072
|
+
description: typing.Optional[str] = OMIT,
|
1073
|
+
transcript: typing.Optional[str] = OMIT,
|
979
1074
|
request_options: typing.Optional[RequestOptions] = None,
|
980
|
-
) ->
|
1075
|
+
) -> VoiceMetadata:
|
981
1076
|
"""
|
982
|
-
|
1077
|
+
Clone a voice from an audio clip. This endpoint has two modes, stability and similarity.
|
1078
|
+
Similarity mode clones are more similar to the source clip, but may reproduce background noise. For these, use an audio clip about 5 seconds long.
|
1079
|
+
Stability mode clones are more stable, but may not sound as similar to the source clip. For these, use an audio clip 10-20 seconds long.
|
983
1080
|
|
984
1081
|
Parameters
|
985
1082
|
----------
|
1083
|
+
clip : core.File
|
1084
|
+
See core.File for more documentation
|
1085
|
+
|
986
1086
|
name : str
|
987
1087
|
The name of the voice.
|
988
1088
|
|
989
|
-
description : str
|
990
|
-
The description of the voice.
|
991
1089
|
|
992
|
-
|
1090
|
+
language : SupportedLanguage
|
1091
|
+
The language of the voice.
|
993
1092
|
|
994
|
-
language : typing.Optional[SupportedLanguage]
|
995
1093
|
|
996
|
-
|
1094
|
+
mode : CloneMode
|
1095
|
+
Tradeoff between similarity and stability. Similarity clones sound more like the source clip, but may reproduce background noise. Stability clones always sound like a studio recording, but may not sound as similar to the source clip.
|
1096
|
+
|
1097
|
+
|
1098
|
+
enhance : bool
|
1099
|
+
Whether to enhance the clip to improve its quality before cloning. Useful if the clip has background noise.
|
1100
|
+
|
1101
|
+
|
1102
|
+
description : typing.Optional[str]
|
1103
|
+
A description for the voice.
|
1104
|
+
|
1105
|
+
|
1106
|
+
transcript : typing.Optional[str]
|
1107
|
+
Optional transcript of the words spoken in the audio clip. Only used for similarity mode.
|
1108
|
+
|
997
1109
|
|
998
1110
|
request_options : typing.Optional[RequestOptions]
|
999
1111
|
Request-specific configuration.
|
1000
1112
|
|
1001
1113
|
Returns
|
1002
1114
|
-------
|
1003
|
-
|
1115
|
+
VoiceMetadata
|
1004
1116
|
|
1005
1117
|
Examples
|
1006
1118
|
--------
|
@@ -1014,26 +1126,30 @@ class AsyncVoicesClient:
|
|
1014
1126
|
|
1015
1127
|
|
1016
1128
|
async def main() -> None:
|
1017
|
-
await client.voices.
|
1018
|
-
name="
|
1019
|
-
description="
|
1020
|
-
|
1129
|
+
await client.voices.clone(
|
1130
|
+
name="A high-stability cloned voice",
|
1131
|
+
description="Copied from Cartesia docs",
|
1132
|
+
mode="stability",
|
1021
1133
|
language="en",
|
1022
|
-
|
1134
|
+
enhance=True,
|
1023
1135
|
)
|
1024
1136
|
|
1025
1137
|
|
1026
1138
|
asyncio.run(main())
|
1027
1139
|
"""
|
1028
1140
|
_response = await self._client_wrapper.httpx_client.request(
|
1029
|
-
"voices/",
|
1141
|
+
"voices/clone",
|
1030
1142
|
method="POST",
|
1031
|
-
|
1143
|
+
data={
|
1032
1144
|
"name": name,
|
1033
1145
|
"description": description,
|
1034
|
-
"embedding": embedding,
|
1035
1146
|
"language": language,
|
1036
|
-
"
|
1147
|
+
"mode": mode,
|
1148
|
+
"enhance": enhance,
|
1149
|
+
"transcript": transcript,
|
1150
|
+
},
|
1151
|
+
files={
|
1152
|
+
"clip": clip,
|
1037
1153
|
},
|
1038
1154
|
request_options=request_options,
|
1039
1155
|
omit=OMIT,
|
@@ -1041,9 +1157,9 @@ class AsyncVoicesClient:
|
|
1041
1157
|
try:
|
1042
1158
|
if 200 <= _response.status_code < 300:
|
1043
1159
|
return typing.cast(
|
1044
|
-
|
1160
|
+
VoiceMetadata,
|
1045
1161
|
parse_obj_as(
|
1046
|
-
type_=
|
1162
|
+
type_=VoiceMetadata, # type: ignore
|
1047
1163
|
object_=_response.json(),
|
1048
1164
|
),
|
1049
1165
|
)
|