cartesia 2.0.0a0__py3-none-any.whl → 2.0.0b1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
cartesia/voices/client.py CHANGED
@@ -7,19 +7,20 @@ from .types.voice import Voice
7
7
  from ..core.pydantic_utilities import parse_obj_as
8
8
  from json.decoder import JSONDecodeError
9
9
  from ..core.api_error import ApiError
10
- from ..embedding.types.embedding import Embedding
10
+ from .. import core
11
11
  from ..tts.types.supported_language import SupportedLanguage
12
+ from .types.clone_mode import CloneMode
13
+ from .types.voice_metadata import VoiceMetadata
12
14
  from .types.voice_id import VoiceId
13
15
  from ..core.jsonable_encoder import jsonable_encoder
16
+ from ..embedding.types.embedding import Embedding
14
17
  from .types.localize_target_language import LocalizeTargetLanguage
15
18
  from .types.gender import Gender
16
19
  from .requests.localize_dialect import LocalizeDialectParams
17
20
  from .types.embedding_response import EmbeddingResponse
18
21
  from ..core.serialization import convert_and_respect_annotation_metadata
19
22
  from .requests.mix_voice_specifier import MixVoiceSpecifierParams
20
- from .. import core
21
- from .types.clone_mode import CloneMode
22
- from .types.voice_metadata import VoiceMetadata
23
+ from .types.base_voice_id import BaseVoiceId
23
24
  from ..core.client_wrapper import AsyncClientWrapper
24
25
 
25
26
  # this is used as the default value for optional parameters
@@ -69,34 +70,60 @@ class VoicesClient:
69
70
  raise ApiError(status_code=_response.status_code, body=_response.text)
70
71
  raise ApiError(status_code=_response.status_code, body=_response_json)
71
72
 
72
- def create(
73
+ def clone(
73
74
  self,
74
75
  *,
76
+ clip: core.File,
75
77
  name: str,
76
- description: str,
77
- embedding: Embedding,
78
- language: typing.Optional[SupportedLanguage] = OMIT,
78
+ language: SupportedLanguage,
79
+ mode: CloneMode,
80
+ enhance: bool,
81
+ description: typing.Optional[str] = OMIT,
82
+ transcript: typing.Optional[str] = OMIT,
79
83
  request_options: typing.Optional[RequestOptions] = None,
80
- ) -> Voice:
84
+ ) -> VoiceMetadata:
81
85
  """
86
+ Clone a voice from an audio clip. This endpoint has two modes, stability and similarity.
87
+
88
+ Similarity mode clones are more similar to the source clip, but may reproduce background noise. For these, use an audio clip about 5 seconds long.
89
+
90
+ Stability mode clones are more stable, but may not sound as similar to the source clip. For these, use an audio clip 10-20 seconds long.
91
+
82
92
  Parameters
83
93
  ----------
94
+ clip : core.File
95
+ See core.File for more documentation
96
+
84
97
  name : str
85
98
  The name of the voice.
86
99
 
87
- description : str
88
- The description of the voice.
89
100
 
90
- embedding : Embedding
101
+ language : SupportedLanguage
102
+ The language of the voice.
103
+
104
+
105
+ mode : CloneMode
106
+ Tradeoff between similarity and stability. Similarity clones sound more like the source clip, but may reproduce background noise. Stability clones always sound like a studio recording, but may not sound as similar to the source clip.
107
+
108
+
109
+ enhance : bool
110
+ Whether to enhance the clip to improve its quality before cloning. Useful if the clip has background noise.
111
+
112
+
113
+ description : typing.Optional[str]
114
+ A description for the voice.
115
+
116
+
117
+ transcript : typing.Optional[str]
118
+ Optional transcript of the words spoken in the audio clip. Only used for similarity mode.
91
119
 
92
- language : typing.Optional[SupportedLanguage]
93
120
 
94
121
  request_options : typing.Optional[RequestOptions]
95
122
  Request-specific configuration.
96
123
 
97
124
  Returns
98
125
  -------
99
- Voice
126
+ VoiceMetadata
100
127
 
101
128
  Examples
102
129
  --------
@@ -105,214 +132,27 @@ class VoicesClient:
105
132
  client = Cartesia(
106
133
  api_key="YOUR_API_KEY",
107
134
  )
108
- client.voices.create(
109
- name="string",
110
- description="string",
111
- embedding=[
112
- 1.0,
113
- 1.0,
114
- 1.0,
115
- 1.0,
116
- 1.0,
117
- 1.0,
118
- 1.0,
119
- 1.0,
120
- 1.0,
121
- 1.0,
122
- 1.0,
123
- 1.0,
124
- 1.0,
125
- 1.0,
126
- 1.0,
127
- 1.0,
128
- 1.0,
129
- 1.0,
130
- 1.0,
131
- 1.0,
132
- 1.0,
133
- 1.0,
134
- 1.0,
135
- 1.0,
136
- 1.0,
137
- 1.0,
138
- 1.0,
139
- 1.0,
140
- 1.0,
141
- 1.0,
142
- 1.0,
143
- 1.0,
144
- 1.0,
145
- 1.0,
146
- 1.0,
147
- 1.0,
148
- 1.0,
149
- 1.0,
150
- 1.0,
151
- 1.0,
152
- 1.0,
153
- 1.0,
154
- 1.0,
155
- 1.0,
156
- 1.0,
157
- 1.0,
158
- 1.0,
159
- 1.0,
160
- 1.0,
161
- 1.0,
162
- 1.0,
163
- 1.0,
164
- 1.0,
165
- 1.0,
166
- 1.0,
167
- 1.0,
168
- 1.0,
169
- 1.0,
170
- 1.0,
171
- 1.0,
172
- 1.0,
173
- 1.0,
174
- 1.0,
175
- 1.0,
176
- 1.0,
177
- 1.0,
178
- 1.0,
179
- 1.0,
180
- 1.0,
181
- 1.0,
182
- 1.0,
183
- 1.0,
184
- 1.0,
185
- 1.0,
186
- 1.0,
187
- 1.0,
188
- 1.0,
189
- 1.0,
190
- 1.0,
191
- 1.0,
192
- 1.0,
193
- 1.0,
194
- 1.0,
195
- 1.0,
196
- 1.0,
197
- 1.0,
198
- 1.0,
199
- 1.0,
200
- 1.0,
201
- 1.0,
202
- 1.0,
203
- 1.0,
204
- 1.0,
205
- 1.0,
206
- 1.0,
207
- 1.0,
208
- 1.0,
209
- 1.0,
210
- 1.0,
211
- 1.0,
212
- 1.0,
213
- 1.0,
214
- 1.0,
215
- 1.0,
216
- 1.0,
217
- 1.0,
218
- 1.0,
219
- 1.0,
220
- 1.0,
221
- 1.0,
222
- 1.0,
223
- 1.0,
224
- 1.0,
225
- 1.0,
226
- 1.0,
227
- 1.0,
228
- 1.0,
229
- 1.0,
230
- 1.0,
231
- 1.0,
232
- 1.0,
233
- 1.0,
234
- 1.0,
235
- 1.0,
236
- 1.0,
237
- 1.0,
238
- 1.0,
239
- 1.0,
240
- 1.0,
241
- 1.0,
242
- 1.0,
243
- 1.0,
244
- 1.0,
245
- 1.0,
246
- 1.0,
247
- 1.0,
248
- 1.0,
249
- 1.0,
250
- 1.0,
251
- 1.0,
252
- 1.0,
253
- 1.0,
254
- 1.0,
255
- 1.0,
256
- 1.0,
257
- 1.0,
258
- 1.0,
259
- 1.0,
260
- 1.0,
261
- 1.0,
262
- 1.0,
263
- 1.0,
264
- 1.0,
265
- 1.0,
266
- 1.0,
267
- 1.0,
268
- 1.0,
269
- 1.0,
270
- 1.0,
271
- 1.0,
272
- 1.0,
273
- 1.0,
274
- 1.0,
275
- 1.0,
276
- 1.0,
277
- 1.0,
278
- 1.0,
279
- 1.0,
280
- 1.0,
281
- 1.0,
282
- 1.0,
283
- 1.0,
284
- 1.0,
285
- 1.0,
286
- 1.0,
287
- 1.0,
288
- 1.0,
289
- 1.0,
290
- 1.0,
291
- 1.0,
292
- 1.0,
293
- 1.0,
294
- 1.0,
295
- 1.0,
296
- 1.0,
297
- 1.0,
298
- 1.0,
299
- 1.0,
300
- 1.0,
301
- 1.0,
302
- 1.0,
303
- 1.0,
304
- ],
135
+ client.voices.clone(
136
+ name="A high-stability cloned voice",
137
+ description="Copied from Cartesia docs",
138
+ mode="stability",
305
139
  language="en",
140
+ enhance=True,
306
141
  )
307
142
  """
308
143
  _response = self._client_wrapper.httpx_client.request(
309
- "voices/",
144
+ "voices/clone",
310
145
  method="POST",
311
- json={
146
+ data={
312
147
  "name": name,
313
148
  "description": description,
314
- "embedding": embedding,
315
149
  "language": language,
150
+ "mode": mode,
151
+ "enhance": enhance,
152
+ "transcript": transcript,
153
+ },
154
+ files={
155
+ "clip": clip,
316
156
  },
317
157
  request_options=request_options,
318
158
  omit=OMIT,
@@ -320,9 +160,9 @@ class VoicesClient:
320
160
  try:
321
161
  if 200 <= _response.status_code < 300:
322
162
  return typing.cast(
323
- Voice,
163
+ VoiceMetadata,
324
164
  parse_obj_as(
325
- type_=Voice, # type: ignore
165
+ type_=VoiceMetadata, # type: ignore
326
166
  object_=_response.json(),
327
167
  ),
328
168
  )
@@ -352,7 +192,7 @@ class VoicesClient:
352
192
  api_key="YOUR_API_KEY",
353
193
  )
354
194
  client.voices.delete(
355
- id="string",
195
+ id="id",
356
196
  )
357
197
  """
358
198
  _response = self._client_wrapper.httpx_client.request(
@@ -397,9 +237,9 @@ class VoicesClient:
397
237
  api_key="YOUR_API_KEY",
398
238
  )
399
239
  client.voices.update(
400
- id="string",
401
- name="string",
402
- description="string",
240
+ id="id",
241
+ name="name",
242
+ description="description",
403
243
  )
404
244
  """
405
245
  _response = self._client_wrapper.httpx_client.request(
@@ -447,7 +287,7 @@ class VoicesClient:
447
287
  api_key="YOUR_API_KEY",
448
288
  )
449
289
  client.voices.get(
450
- id="string",
290
+ id="id",
451
291
  )
452
292
  """
453
293
  _response = self._client_wrapper.httpx_client.request(
@@ -504,203 +344,9 @@ class VoicesClient:
504
344
  api_key="YOUR_API_KEY",
505
345
  )
506
346
  client.voices.localize(
507
- embedding=[
508
- 1.0,
509
- 1.0,
510
- 1.0,
511
- 1.0,
512
- 1.0,
513
- 1.0,
514
- 1.0,
515
- 1.0,
516
- 1.0,
517
- 1.0,
518
- 1.0,
519
- 1.0,
520
- 1.0,
521
- 1.0,
522
- 1.0,
523
- 1.0,
524
- 1.0,
525
- 1.0,
526
- 1.0,
527
- 1.0,
528
- 1.0,
529
- 1.0,
530
- 1.0,
531
- 1.0,
532
- 1.0,
533
- 1.0,
534
- 1.0,
535
- 1.0,
536
- 1.0,
537
- 1.0,
538
- 1.0,
539
- 1.0,
540
- 1.0,
541
- 1.0,
542
- 1.0,
543
- 1.0,
544
- 1.0,
545
- 1.0,
546
- 1.0,
547
- 1.0,
548
- 1.0,
549
- 1.0,
550
- 1.0,
551
- 1.0,
552
- 1.0,
553
- 1.0,
554
- 1.0,
555
- 1.0,
556
- 1.0,
557
- 1.0,
558
- 1.0,
559
- 1.0,
560
- 1.0,
561
- 1.0,
562
- 1.0,
563
- 1.0,
564
- 1.0,
565
- 1.0,
566
- 1.0,
567
- 1.0,
568
- 1.0,
569
- 1.0,
570
- 1.0,
571
- 1.0,
572
- 1.0,
573
- 1.0,
574
- 1.0,
575
- 1.0,
576
- 1.0,
577
- 1.0,
578
- 1.0,
579
- 1.0,
580
- 1.0,
581
- 1.0,
582
- 1.0,
583
- 1.0,
584
- 1.0,
585
- 1.0,
586
- 1.0,
587
- 1.0,
588
- 1.0,
589
- 1.0,
590
- 1.0,
591
- 1.0,
592
- 1.0,
593
- 1.0,
594
- 1.0,
595
- 1.0,
596
- 1.0,
597
- 1.0,
598
- 1.0,
599
- 1.0,
600
- 1.0,
601
- 1.0,
602
- 1.0,
603
- 1.0,
604
- 1.0,
605
- 1.0,
606
- 1.0,
607
- 1.0,
608
- 1.0,
609
- 1.0,
610
- 1.0,
611
- 1.0,
612
- 1.0,
613
- 1.0,
614
- 1.0,
615
- 1.0,
616
- 1.0,
617
- 1.0,
618
- 1.0,
619
- 1.0,
620
- 1.0,
621
- 1.0,
622
- 1.0,
623
- 1.0,
624
- 1.0,
625
- 1.0,
626
- 1.0,
627
- 1.0,
628
- 1.0,
629
- 1.0,
630
- 1.0,
631
- 1.0,
632
- 1.0,
633
- 1.0,
634
- 1.0,
635
- 1.0,
636
- 1.0,
637
- 1.0,
638
- 1.0,
639
- 1.0,
640
- 1.0,
641
- 1.0,
642
- 1.0,
643
- 1.0,
644
- 1.0,
645
- 1.0,
646
- 1.0,
647
- 1.0,
648
- 1.0,
649
- 1.0,
650
- 1.0,
651
- 1.0,
652
- 1.0,
653
- 1.0,
654
- 1.0,
655
- 1.0,
656
- 1.0,
657
- 1.0,
658
- 1.0,
659
- 1.0,
660
- 1.0,
661
- 1.0,
662
- 1.0,
663
- 1.0,
664
- 1.0,
665
- 1.0,
666
- 1.0,
667
- 1.0,
668
- 1.0,
669
- 1.0,
670
- 1.0,
671
- 1.0,
672
- 1.0,
673
- 1.0,
674
- 1.0,
675
- 1.0,
676
- 1.0,
677
- 1.0,
678
- 1.0,
679
- 1.0,
680
- 1.0,
681
- 1.0,
682
- 1.0,
683
- 1.0,
684
- 1.0,
685
- 1.0,
686
- 1.0,
687
- 1.0,
688
- 1.0,
689
- 1.0,
690
- 1.0,
691
- 1.0,
692
- 1.0,
693
- 1.0,
694
- 1.0,
695
- 1.0,
696
- 1.0,
697
- 1.0,
698
- 1.0,
699
- 1.0,
700
- ],
347
+ embedding=[1.1, 1.1],
701
348
  language="en",
702
349
  original_speaker_gender="male",
703
- dialect="au",
704
350
  )
705
351
  """
706
352
  _response = self._client_wrapper.httpx_client.request(
@@ -757,7 +403,7 @@ class VoicesClient:
757
403
  api_key="YOUR_API_KEY",
758
404
  )
759
405
  client.voices.mix(
760
- voices=[{"id": "string", "weight": 1.1}],
406
+ voices=[{"id": "id", "weight": 1.1}, {"id": "id", "weight": 1.1}],
761
407
  )
762
408
  """
763
409
  _response = self._client_wrapper.httpx_client.request(
@@ -785,60 +431,39 @@ class VoicesClient:
785
431
  raise ApiError(status_code=_response.status_code, body=_response.text)
786
432
  raise ApiError(status_code=_response.status_code, body=_response_json)
787
433
 
788
- def clone(
434
+ def create(
789
435
  self,
790
436
  *,
791
- clip: core.File,
792
437
  name: str,
793
- language: SupportedLanguage,
794
- mode: CloneMode,
795
- enhance: bool,
796
- description: typing.Optional[str] = OMIT,
797
- transcript: typing.Optional[str] = OMIT,
438
+ description: str,
439
+ embedding: Embedding,
440
+ language: typing.Optional[SupportedLanguage] = OMIT,
441
+ base_voice_id: typing.Optional[BaseVoiceId] = OMIT,
798
442
  request_options: typing.Optional[RequestOptions] = None,
799
- ) -> VoiceMetadata:
443
+ ) -> Voice:
800
444
  """
801
- Clone a voice from an audio clip. This endpoint has two modes, stability and similarity.
802
-
803
- Similarity mode clones are more similar to the source clip, but may reproduce background noise. For these, use an audio clip about 5 seconds long.
804
-
805
- Stability mode clones are more stable, but may not sound as similar to the source clip. For these, use an audio clip 10-20 seconds long.
445
+ Create voice from raw features. If you'd like to clone a voice from an audio file, please use Clone Voice instead.
806
446
 
807
447
  Parameters
808
448
  ----------
809
- clip : core.File
810
- See core.File for more documentation
811
-
812
449
  name : str
813
450
  The name of the voice.
814
451
 
452
+ description : str
453
+ The description of the voice.
815
454
 
816
- language : SupportedLanguage
817
- The language of the voice.
818
-
819
-
820
- mode : CloneMode
821
- Tradeoff between similarity and stability. Similarity clones sound more like the source clip, but may reproduce background noise. Stability clones always sound like a studio recording, but may not sound as similar to the source clip.
822
-
823
-
824
- enhance : bool
825
- Whether to enhance the clip to improve its quality before cloning. Useful if the clip has background noise.
826
-
827
-
828
- description : typing.Optional[str]
829
- A description for the voice.
830
-
455
+ embedding : Embedding
831
456
 
832
- transcript : typing.Optional[str]
833
- Optional transcript of the words spoken in the audio clip. Only used for similarity mode.
457
+ language : typing.Optional[SupportedLanguage]
834
458
 
459
+ base_voice_id : typing.Optional[BaseVoiceId]
835
460
 
836
461
  request_options : typing.Optional[RequestOptions]
837
462
  Request-specific configuration.
838
463
 
839
464
  Returns
840
465
  -------
841
- VoiceMetadata
466
+ Voice
842
467
 
843
468
  Examples
844
469
  --------
@@ -847,27 +472,23 @@ class VoicesClient:
847
472
  client = Cartesia(
848
473
  api_key="YOUR_API_KEY",
849
474
  )
850
- client.voices.clone(
851
- name="A high-stability cloned voice",
852
- description="Copied from Cartesia docs",
853
- mode="stability",
475
+ client.voices.create(
476
+ name="My Custom Voice",
477
+ description="A custom voice created through the API",
478
+ embedding=[],
854
479
  language="en",
855
- enhance=True,
480
+ base_voice_id="123e4567-e89b-12d3-a456-426614174000",
856
481
  )
857
482
  """
858
483
  _response = self._client_wrapper.httpx_client.request(
859
- "voices/clone",
484
+ "voices/",
860
485
  method="POST",
861
- data={
486
+ json={
862
487
  "name": name,
863
488
  "description": description,
489
+ "embedding": embedding,
864
490
  "language": language,
865
- "mode": mode,
866
- "enhance": enhance,
867
- "transcript": transcript,
868
- },
869
- files={
870
- "clip": clip,
491
+ "base_voice_id": base_voice_id,
871
492
  },
872
493
  request_options=request_options,
873
494
  omit=OMIT,
@@ -875,9 +496,9 @@ class VoicesClient:
875
496
  try:
876
497
  if 200 <= _response.status_code < 300:
877
498
  return typing.cast(
878
- VoiceMetadata,
499
+ Voice,
879
500
  parse_obj_as(
880
- type_=VoiceMetadata, # type: ignore
501
+ type_=Voice, # type: ignore
881
502
  object_=_response.json(),
882
503
  ),
883
504
  )
@@ -938,34 +559,60 @@ class AsyncVoicesClient:
938
559
  raise ApiError(status_code=_response.status_code, body=_response.text)
939
560
  raise ApiError(status_code=_response.status_code, body=_response_json)
940
561
 
941
- async def create(
562
+ async def clone(
942
563
  self,
943
564
  *,
565
+ clip: core.File,
944
566
  name: str,
945
- description: str,
946
- embedding: Embedding,
947
- language: typing.Optional[SupportedLanguage] = OMIT,
567
+ language: SupportedLanguage,
568
+ mode: CloneMode,
569
+ enhance: bool,
570
+ description: typing.Optional[str] = OMIT,
571
+ transcript: typing.Optional[str] = OMIT,
948
572
  request_options: typing.Optional[RequestOptions] = None,
949
- ) -> Voice:
573
+ ) -> VoiceMetadata:
950
574
  """
575
+ Clone a voice from an audio clip. This endpoint has two modes, stability and similarity.
576
+
577
+ Similarity mode clones are more similar to the source clip, but may reproduce background noise. For these, use an audio clip about 5 seconds long.
578
+
579
+ Stability mode clones are more stable, but may not sound as similar to the source clip. For these, use an audio clip 10-20 seconds long.
580
+
951
581
  Parameters
952
582
  ----------
583
+ clip : core.File
584
+ See core.File for more documentation
585
+
953
586
  name : str
954
587
  The name of the voice.
955
588
 
956
- description : str
957
- The description of the voice.
958
589
 
959
- embedding : Embedding
590
+ language : SupportedLanguage
591
+ The language of the voice.
592
+
593
+
594
+ mode : CloneMode
595
+ Tradeoff between similarity and stability. Similarity clones sound more like the source clip, but may reproduce background noise. Stability clones always sound like a studio recording, but may not sound as similar to the source clip.
596
+
597
+
598
+ enhance : bool
599
+ Whether to enhance the clip to improve its quality before cloning. Useful if the clip has background noise.
600
+
601
+
602
+ description : typing.Optional[str]
603
+ A description for the voice.
604
+
605
+
606
+ transcript : typing.Optional[str]
607
+ Optional transcript of the words spoken in the audio clip. Only used for similarity mode.
960
608
 
961
- language : typing.Optional[SupportedLanguage]
962
609
 
963
610
  request_options : typing.Optional[RequestOptions]
964
611
  Request-specific configuration.
965
612
 
966
613
  Returns
967
614
  -------
968
- Voice
615
+ VoiceMetadata
969
616
 
970
617
  Examples
971
618
  --------
@@ -979,217 +626,30 @@ class AsyncVoicesClient:
979
626
 
980
627
 
981
628
  async def main() -> None:
982
- await client.voices.create(
983
- name="string",
984
- description="string",
985
- embedding=[
986
- 1.0,
987
- 1.0,
988
- 1.0,
989
- 1.0,
990
- 1.0,
991
- 1.0,
992
- 1.0,
993
- 1.0,
994
- 1.0,
995
- 1.0,
996
- 1.0,
997
- 1.0,
998
- 1.0,
999
- 1.0,
1000
- 1.0,
1001
- 1.0,
1002
- 1.0,
1003
- 1.0,
1004
- 1.0,
1005
- 1.0,
1006
- 1.0,
1007
- 1.0,
1008
- 1.0,
1009
- 1.0,
1010
- 1.0,
1011
- 1.0,
1012
- 1.0,
1013
- 1.0,
1014
- 1.0,
1015
- 1.0,
1016
- 1.0,
1017
- 1.0,
1018
- 1.0,
1019
- 1.0,
1020
- 1.0,
1021
- 1.0,
1022
- 1.0,
1023
- 1.0,
1024
- 1.0,
1025
- 1.0,
1026
- 1.0,
1027
- 1.0,
1028
- 1.0,
1029
- 1.0,
1030
- 1.0,
1031
- 1.0,
1032
- 1.0,
1033
- 1.0,
1034
- 1.0,
1035
- 1.0,
1036
- 1.0,
1037
- 1.0,
1038
- 1.0,
1039
- 1.0,
1040
- 1.0,
1041
- 1.0,
1042
- 1.0,
1043
- 1.0,
1044
- 1.0,
1045
- 1.0,
1046
- 1.0,
1047
- 1.0,
1048
- 1.0,
1049
- 1.0,
1050
- 1.0,
1051
- 1.0,
1052
- 1.0,
1053
- 1.0,
1054
- 1.0,
1055
- 1.0,
1056
- 1.0,
1057
- 1.0,
1058
- 1.0,
1059
- 1.0,
1060
- 1.0,
1061
- 1.0,
1062
- 1.0,
1063
- 1.0,
1064
- 1.0,
1065
- 1.0,
1066
- 1.0,
1067
- 1.0,
1068
- 1.0,
1069
- 1.0,
1070
- 1.0,
1071
- 1.0,
1072
- 1.0,
1073
- 1.0,
1074
- 1.0,
1075
- 1.0,
1076
- 1.0,
1077
- 1.0,
1078
- 1.0,
1079
- 1.0,
1080
- 1.0,
1081
- 1.0,
1082
- 1.0,
1083
- 1.0,
1084
- 1.0,
1085
- 1.0,
1086
- 1.0,
1087
- 1.0,
1088
- 1.0,
1089
- 1.0,
1090
- 1.0,
1091
- 1.0,
1092
- 1.0,
1093
- 1.0,
1094
- 1.0,
1095
- 1.0,
1096
- 1.0,
1097
- 1.0,
1098
- 1.0,
1099
- 1.0,
1100
- 1.0,
1101
- 1.0,
1102
- 1.0,
1103
- 1.0,
1104
- 1.0,
1105
- 1.0,
1106
- 1.0,
1107
- 1.0,
1108
- 1.0,
1109
- 1.0,
1110
- 1.0,
1111
- 1.0,
1112
- 1.0,
1113
- 1.0,
1114
- 1.0,
1115
- 1.0,
1116
- 1.0,
1117
- 1.0,
1118
- 1.0,
1119
- 1.0,
1120
- 1.0,
1121
- 1.0,
1122
- 1.0,
1123
- 1.0,
1124
- 1.0,
1125
- 1.0,
1126
- 1.0,
1127
- 1.0,
1128
- 1.0,
1129
- 1.0,
1130
- 1.0,
1131
- 1.0,
1132
- 1.0,
1133
- 1.0,
1134
- 1.0,
1135
- 1.0,
1136
- 1.0,
1137
- 1.0,
1138
- 1.0,
1139
- 1.0,
1140
- 1.0,
1141
- 1.0,
1142
- 1.0,
1143
- 1.0,
1144
- 1.0,
1145
- 1.0,
1146
- 1.0,
1147
- 1.0,
1148
- 1.0,
1149
- 1.0,
1150
- 1.0,
1151
- 1.0,
1152
- 1.0,
1153
- 1.0,
1154
- 1.0,
1155
- 1.0,
1156
- 1.0,
1157
- 1.0,
1158
- 1.0,
1159
- 1.0,
1160
- 1.0,
1161
- 1.0,
1162
- 1.0,
1163
- 1.0,
1164
- 1.0,
1165
- 1.0,
1166
- 1.0,
1167
- 1.0,
1168
- 1.0,
1169
- 1.0,
1170
- 1.0,
1171
- 1.0,
1172
- 1.0,
1173
- 1.0,
1174
- 1.0,
1175
- 1.0,
1176
- 1.0,
1177
- 1.0,
1178
- ],
629
+ await client.voices.clone(
630
+ name="A high-stability cloned voice",
631
+ description="Copied from Cartesia docs",
632
+ mode="stability",
1179
633
  language="en",
634
+ enhance=True,
1180
635
  )
1181
636
 
1182
637
 
1183
638
  asyncio.run(main())
1184
639
  """
1185
640
  _response = await self._client_wrapper.httpx_client.request(
1186
- "voices/",
641
+ "voices/clone",
1187
642
  method="POST",
1188
- json={
643
+ data={
1189
644
  "name": name,
1190
645
  "description": description,
1191
- "embedding": embedding,
1192
646
  "language": language,
647
+ "mode": mode,
648
+ "enhance": enhance,
649
+ "transcript": transcript,
650
+ },
651
+ files={
652
+ "clip": clip,
1193
653
  },
1194
654
  request_options=request_options,
1195
655
  omit=OMIT,
@@ -1197,9 +657,9 @@ class AsyncVoicesClient:
1197
657
  try:
1198
658
  if 200 <= _response.status_code < 300:
1199
659
  return typing.cast(
1200
- Voice,
660
+ VoiceMetadata,
1201
661
  parse_obj_as(
1202
- type_=Voice, # type: ignore
662
+ type_=VoiceMetadata, # type: ignore
1203
663
  object_=_response.json(),
1204
664
  ),
1205
665
  )
@@ -1234,7 +694,7 @@ class AsyncVoicesClient:
1234
694
 
1235
695
  async def main() -> None:
1236
696
  await client.voices.delete(
1237
- id="string",
697
+ id="id",
1238
698
  )
1239
699
 
1240
700
 
@@ -1287,9 +747,9 @@ class AsyncVoicesClient:
1287
747
 
1288
748
  async def main() -> None:
1289
749
  await client.voices.update(
1290
- id="string",
1291
- name="string",
1292
- description="string",
750
+ id="id",
751
+ name="name",
752
+ description="description",
1293
753
  )
1294
754
 
1295
755
 
@@ -1345,7 +805,7 @@ class AsyncVoicesClient:
1345
805
 
1346
806
  async def main() -> None:
1347
807
  await client.voices.get(
1348
- id="string",
808
+ id="id",
1349
809
  )
1350
810
 
1351
811
 
@@ -1410,203 +870,9 @@ class AsyncVoicesClient:
1410
870
 
1411
871
  async def main() -> None:
1412
872
  await client.voices.localize(
1413
- embedding=[
1414
- 1.0,
1415
- 1.0,
1416
- 1.0,
1417
- 1.0,
1418
- 1.0,
1419
- 1.0,
1420
- 1.0,
1421
- 1.0,
1422
- 1.0,
1423
- 1.0,
1424
- 1.0,
1425
- 1.0,
1426
- 1.0,
1427
- 1.0,
1428
- 1.0,
1429
- 1.0,
1430
- 1.0,
1431
- 1.0,
1432
- 1.0,
1433
- 1.0,
1434
- 1.0,
1435
- 1.0,
1436
- 1.0,
1437
- 1.0,
1438
- 1.0,
1439
- 1.0,
1440
- 1.0,
1441
- 1.0,
1442
- 1.0,
1443
- 1.0,
1444
- 1.0,
1445
- 1.0,
1446
- 1.0,
1447
- 1.0,
1448
- 1.0,
1449
- 1.0,
1450
- 1.0,
1451
- 1.0,
1452
- 1.0,
1453
- 1.0,
1454
- 1.0,
1455
- 1.0,
1456
- 1.0,
1457
- 1.0,
1458
- 1.0,
1459
- 1.0,
1460
- 1.0,
1461
- 1.0,
1462
- 1.0,
1463
- 1.0,
1464
- 1.0,
1465
- 1.0,
1466
- 1.0,
1467
- 1.0,
1468
- 1.0,
1469
- 1.0,
1470
- 1.0,
1471
- 1.0,
1472
- 1.0,
1473
- 1.0,
1474
- 1.0,
1475
- 1.0,
1476
- 1.0,
1477
- 1.0,
1478
- 1.0,
1479
- 1.0,
1480
- 1.0,
1481
- 1.0,
1482
- 1.0,
1483
- 1.0,
1484
- 1.0,
1485
- 1.0,
1486
- 1.0,
1487
- 1.0,
1488
- 1.0,
1489
- 1.0,
1490
- 1.0,
1491
- 1.0,
1492
- 1.0,
1493
- 1.0,
1494
- 1.0,
1495
- 1.0,
1496
- 1.0,
1497
- 1.0,
1498
- 1.0,
1499
- 1.0,
1500
- 1.0,
1501
- 1.0,
1502
- 1.0,
1503
- 1.0,
1504
- 1.0,
1505
- 1.0,
1506
- 1.0,
1507
- 1.0,
1508
- 1.0,
1509
- 1.0,
1510
- 1.0,
1511
- 1.0,
1512
- 1.0,
1513
- 1.0,
1514
- 1.0,
1515
- 1.0,
1516
- 1.0,
1517
- 1.0,
1518
- 1.0,
1519
- 1.0,
1520
- 1.0,
1521
- 1.0,
1522
- 1.0,
1523
- 1.0,
1524
- 1.0,
1525
- 1.0,
1526
- 1.0,
1527
- 1.0,
1528
- 1.0,
1529
- 1.0,
1530
- 1.0,
1531
- 1.0,
1532
- 1.0,
1533
- 1.0,
1534
- 1.0,
1535
- 1.0,
1536
- 1.0,
1537
- 1.0,
1538
- 1.0,
1539
- 1.0,
1540
- 1.0,
1541
- 1.0,
1542
- 1.0,
1543
- 1.0,
1544
- 1.0,
1545
- 1.0,
1546
- 1.0,
1547
- 1.0,
1548
- 1.0,
1549
- 1.0,
1550
- 1.0,
1551
- 1.0,
1552
- 1.0,
1553
- 1.0,
1554
- 1.0,
1555
- 1.0,
1556
- 1.0,
1557
- 1.0,
1558
- 1.0,
1559
- 1.0,
1560
- 1.0,
1561
- 1.0,
1562
- 1.0,
1563
- 1.0,
1564
- 1.0,
1565
- 1.0,
1566
- 1.0,
1567
- 1.0,
1568
- 1.0,
1569
- 1.0,
1570
- 1.0,
1571
- 1.0,
1572
- 1.0,
1573
- 1.0,
1574
- 1.0,
1575
- 1.0,
1576
- 1.0,
1577
- 1.0,
1578
- 1.0,
1579
- 1.0,
1580
- 1.0,
1581
- 1.0,
1582
- 1.0,
1583
- 1.0,
1584
- 1.0,
1585
- 1.0,
1586
- 1.0,
1587
- 1.0,
1588
- 1.0,
1589
- 1.0,
1590
- 1.0,
1591
- 1.0,
1592
- 1.0,
1593
- 1.0,
1594
- 1.0,
1595
- 1.0,
1596
- 1.0,
1597
- 1.0,
1598
- 1.0,
1599
- 1.0,
1600
- 1.0,
1601
- 1.0,
1602
- 1.0,
1603
- 1.0,
1604
- 1.0,
1605
- 1.0,
1606
- ],
873
+ embedding=[1.1, 1.1],
1607
874
  language="en",
1608
875
  original_speaker_gender="male",
1609
- dialect="au",
1610
876
  )
1611
877
 
1612
878
 
@@ -1671,7 +937,7 @@ class AsyncVoicesClient:
1671
937
 
1672
938
  async def main() -> None:
1673
939
  await client.voices.mix(
1674
- voices=[{"id": "string", "weight": 1.1}],
940
+ voices=[{"id": "id", "weight": 1.1}, {"id": "id", "weight": 1.1}],
1675
941
  )
1676
942
 
1677
943
 
@@ -1702,60 +968,39 @@ class AsyncVoicesClient:
1702
968
  raise ApiError(status_code=_response.status_code, body=_response.text)
1703
969
  raise ApiError(status_code=_response.status_code, body=_response_json)
1704
970
 
1705
- async def clone(
971
+ async def create(
1706
972
  self,
1707
973
  *,
1708
- clip: core.File,
1709
974
  name: str,
1710
- language: SupportedLanguage,
1711
- mode: CloneMode,
1712
- enhance: bool,
1713
- description: typing.Optional[str] = OMIT,
1714
- transcript: typing.Optional[str] = OMIT,
975
+ description: str,
976
+ embedding: Embedding,
977
+ language: typing.Optional[SupportedLanguage] = OMIT,
978
+ base_voice_id: typing.Optional[BaseVoiceId] = OMIT,
1715
979
  request_options: typing.Optional[RequestOptions] = None,
1716
- ) -> VoiceMetadata:
980
+ ) -> Voice:
1717
981
  """
1718
- Clone a voice from an audio clip. This endpoint has two modes, stability and similarity.
1719
-
1720
- Similarity mode clones are more similar to the source clip, but may reproduce background noise. For these, use an audio clip about 5 seconds long.
1721
-
1722
- Stability mode clones are more stable, but may not sound as similar to the source clip. For these, use an audio clip 10-20 seconds long.
982
+ Create voice from raw features. If you'd like to clone a voice from an audio file, please use Clone Voice instead.
1723
983
 
1724
984
  Parameters
1725
985
  ----------
1726
- clip : core.File
1727
- See core.File for more documentation
1728
-
1729
986
  name : str
1730
987
  The name of the voice.
1731
988
 
989
+ description : str
990
+ The description of the voice.
1732
991
 
1733
- language : SupportedLanguage
1734
- The language of the voice.
1735
-
1736
-
1737
- mode : CloneMode
1738
- Tradeoff between similarity and stability. Similarity clones sound more like the source clip, but may reproduce background noise. Stability clones always sound like a studio recording, but may not sound as similar to the source clip.
1739
-
1740
-
1741
- enhance : bool
1742
- Whether to enhance the clip to improve its quality before cloning. Useful if the clip has background noise.
1743
-
1744
-
1745
- description : typing.Optional[str]
1746
- A description for the voice.
1747
-
992
+ embedding : Embedding
1748
993
 
1749
- transcript : typing.Optional[str]
1750
- Optional transcript of the words spoken in the audio clip. Only used for similarity mode.
994
+ language : typing.Optional[SupportedLanguage]
1751
995
 
996
+ base_voice_id : typing.Optional[BaseVoiceId]
1752
997
 
1753
998
  request_options : typing.Optional[RequestOptions]
1754
999
  Request-specific configuration.
1755
1000
 
1756
1001
  Returns
1757
1002
  -------
1758
- VoiceMetadata
1003
+ Voice
1759
1004
 
1760
1005
  Examples
1761
1006
  --------
@@ -1769,30 +1014,26 @@ class AsyncVoicesClient:
1769
1014
 
1770
1015
 
1771
1016
  async def main() -> None:
1772
- await client.voices.clone(
1773
- name="A high-stability cloned voice",
1774
- description="Copied from Cartesia docs",
1775
- mode="stability",
1017
+ await client.voices.create(
1018
+ name="My Custom Voice",
1019
+ description="A custom voice created through the API",
1020
+ embedding=[],
1776
1021
  language="en",
1777
- enhance=True,
1022
+ base_voice_id="123e4567-e89b-12d3-a456-426614174000",
1778
1023
  )
1779
1024
 
1780
1025
 
1781
1026
  asyncio.run(main())
1782
1027
  """
1783
1028
  _response = await self._client_wrapper.httpx_client.request(
1784
- "voices/clone",
1029
+ "voices/",
1785
1030
  method="POST",
1786
- data={
1031
+ json={
1787
1032
  "name": name,
1788
1033
  "description": description,
1034
+ "embedding": embedding,
1789
1035
  "language": language,
1790
- "mode": mode,
1791
- "enhance": enhance,
1792
- "transcript": transcript,
1793
- },
1794
- files={
1795
- "clip": clip,
1036
+ "base_voice_id": base_voice_id,
1796
1037
  },
1797
1038
  request_options=request_options,
1798
1039
  omit=OMIT,
@@ -1800,9 +1041,9 @@ class AsyncVoicesClient:
1800
1041
  try:
1801
1042
  if 200 <= _response.status_code < 300:
1802
1043
  return typing.cast(
1803
- VoiceMetadata,
1044
+ Voice,
1804
1045
  parse_obj_as(
1805
- type_=VoiceMetadata, # type: ignore
1046
+ type_=Voice, # type: ignore
1806
1047
  object_=_response.json(),
1807
1048
  ),
1808
1049
  )