cartesia 2.0.0b2__py3-none-any.whl → 2.0.0b7__py3-none-any.whl

This diff compares two publicly released versions of the package as they appear in their public registry, and is provided for informational purposes only.
Files changed (42)
  1. cartesia/__init__.py +8 -4
  2. cartesia/base_client.py +0 -4
  3. cartesia/core/__init__.py +3 -0
  4. cartesia/core/client_wrapper.py +2 -2
  5. cartesia/core/pagination.py +88 -0
  6. cartesia/infill/client.py +4 -4
  7. cartesia/tts/_async_websocket.py +48 -1
  8. cartesia/tts/_websocket.py +44 -3
  9. cartesia/tts/client.py +4 -4
  10. cartesia/tts/requests/generation_request.py +5 -0
  11. cartesia/tts/requests/web_socket_chunk_response.py +3 -0
  12. cartesia/tts/requests/web_socket_response.py +2 -1
  13. cartesia/tts/requests/web_socket_tts_request.py +1 -0
  14. cartesia/tts/types/emotion.py +5 -0
  15. cartesia/tts/types/generation_request.py +5 -0
  16. cartesia/tts/types/web_socket_chunk_response.py +3 -1
  17. cartesia/tts/types/web_socket_response.py +2 -1
  18. cartesia/tts/types/web_socket_tts_output.py +2 -0
  19. cartesia/tts/types/web_socket_tts_request.py +1 -0
  20. cartesia/tts/utils/constants.py +2 -2
  21. cartesia/voice_changer/requests/streaming_response.py +2 -0
  22. cartesia/voice_changer/types/streaming_response.py +2 -0
  23. cartesia/voices/__init__.py +8 -4
  24. cartesia/voices/client.py +285 -169
  25. cartesia/voices/requests/__init__.py +2 -0
  26. cartesia/voices/requests/create_voice_request.py +0 -2
  27. cartesia/voices/requests/get_voices_response.py +24 -0
  28. cartesia/voices/requests/localize_dialect.py +1 -3
  29. cartesia/voices/requests/voice.py +13 -9
  30. cartesia/voices/types/__init__.py +6 -4
  31. cartesia/voices/types/create_voice_request.py +0 -2
  32. cartesia/voices/types/gender_presentation.py +5 -0
  33. cartesia/voices/types/get_voices_response.py +34 -0
  34. cartesia/voices/types/localize_dialect.py +1 -3
  35. cartesia/voices/types/voice.py +13 -9
  36. cartesia/voices/types/voice_expand_options.py +5 -0
  37. {cartesia-2.0.0b2.dist-info → cartesia-2.0.0b7.dist-info}/METADATA +85 -14
  38. {cartesia-2.0.0b2.dist-info → cartesia-2.0.0b7.dist-info}/RECORD +39 -37
  39. cartesia/datasets/client.py +0 -392
  40. cartesia/voices/types/localize_portuguese_dialect.py +0 -5
  41. cartesia/voices/types/localize_spanish_dialect.py +0 -5
  42. {cartesia-2.0.0b2.dist-info → cartesia-2.0.0b7.dist-info}/WHEEL +0 -0
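The most visible API change in this release is in the `voices/client.py` diff below: `client.voices.list()` now accepts pagination and filter parameters and returns a `SyncPager[Voice]` (or `AsyncPager[Voice]` on the async client) instead of a plain `typing.List[Voice]`. The following is a minimal migration sketch, not the SDK's official documentation: the client construction and iteration patterns are taken from the docstring examples in the diff, while the printed attributes and the `or []` guard on `page.items` are illustrative assumptions.

```python
from cartesia import Cartesia

client = Cartesia(api_key="YOUR_API_KEY")

# 2.0.0b2: list() returned a plain list of Voice objects
# voices = client.voices.list()

# 2.0.0b7: list() returns a SyncPager[Voice] and accepts pagination/filter params
pager = client.voices.list(limit=25, is_owner=True)

# item-by-item iteration; the pager fetches the next page as needed
for voice in pager:
    print(voice.id, voice.name)  # attribute names assumed from the Voice model

# or paginate page-by-page; each page carries the fields passed to the
# SyncPager constructor in the diff (items, has_next, get_next)
for page in pager.iter_pages():
    print(len(page.items or []))
```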
cartesia/voices/client.py CHANGED
@@ -2,26 +2,30 @@
 
 import typing
 from ..core.client_wrapper import SyncClientWrapper
+from .types.gender_presentation import GenderPresentation
+from .types.voice_expand_options import VoiceExpandOptions
 from ..core.request_options import RequestOptions
+from ..core.pagination import SyncPager
 from .types.voice import Voice
+from .types.get_voices_response import GetVoicesResponse
 from ..core.pydantic_utilities import parse_obj_as
 from json.decoder import JSONDecodeError
 from ..core.api_error import ApiError
-from .. import core
+from ..embedding.types.embedding import Embedding
 from ..tts.types.supported_language import SupportedLanguage
-from .types.clone_mode import CloneMode
-from .types.voice_metadata import VoiceMetadata
 from .types.voice_id import VoiceId
 from ..core.jsonable_encoder import jsonable_encoder
-from ..embedding.types.embedding import Embedding
 from .types.localize_target_language import LocalizeTargetLanguage
 from .types.gender import Gender
 from .requests.localize_dialect import LocalizeDialectParams
 from .types.embedding_response import EmbeddingResponse
 from ..core.serialization import convert_and_respect_annotation_metadata
 from .requests.mix_voice_specifier import MixVoiceSpecifierParams
-from .types.base_voice_id import BaseVoiceId
+from .. import core
+from .types.clone_mode import CloneMode
+from .types.voice_metadata import VoiceMetadata
 from ..core.client_wrapper import AsyncClientWrapper
+from ..core.pagination import AsyncPager
 
 # this is used as the default value for optional parameters
 OMIT = typing.cast(typing.Any, ...)
@@ -31,16 +35,54 @@ class VoicesClient:
     def __init__(self, *, client_wrapper: SyncClientWrapper):
         self._client_wrapper = client_wrapper
 
-    def list(self, *, request_options: typing.Optional[RequestOptions] = None) -> typing.List[Voice]:
+    def list(
+        self,
+        *,
+        limit: typing.Optional[int] = None,
+        starting_after: typing.Optional[str] = None,
+        ending_before: typing.Optional[str] = None,
+        is_owner: typing.Optional[bool] = None,
+        is_starred: typing.Optional[bool] = None,
+        gender: typing.Optional[GenderPresentation] = None,
+        expand: typing.Optional[typing.Sequence[VoiceExpandOptions]] = None,
+        request_options: typing.Optional[RequestOptions] = None,
+    ) -> SyncPager[Voice]:
         """
         Parameters
         ----------
+        limit : typing.Optional[int]
+            The number of Voices to return per page, ranging between 1 and 100.
+
+        starting_after : typing.Optional[str]
+            A cursor to use in pagination. `starting_after` is a Voice ID that defines your
+            place in the list. For example, if you make a /voices request and receive 100
+            objects, ending with `voice_abc123`, your subsequent call can include
+            `starting_after=voice_abc123` to fetch the next page of the list.
+
+        ending_before : typing.Optional[str]
+            A cursor to use in pagination. `ending_before` is a Voice ID that defines your
+            place in the list. For example, if you make a /voices request and receive 100
+            objects, starting with `voice_abc123`, your subsequent call can include
+            `ending_before=voice_abc123` to fetch the previous page of the list.
+
+        is_owner : typing.Optional[bool]
+            Whether to only return voices owned by the current user.
+
+        is_starred : typing.Optional[bool]
+            Whether to only return starred voices.
+
+        gender : typing.Optional[GenderPresentation]
+            The gender presentation of the voices to return.
+
+        expand : typing.Optional[typing.Sequence[VoiceExpandOptions]]
+            Additional fields to include in the response.
+
         request_options : typing.Optional[RequestOptions]
             Request-specific configuration.
 
         Returns
         -------
-        typing.List[Voice]
+        SyncPager[Voice]
 
         Examples
         --------
@@ -49,81 +91,83 @@ class VoicesClient:
         client = Cartesia(
             api_key="YOUR_API_KEY",
         )
-        client.voices.list()
+        response = client.voices.list()
+        for item in response:
+            yield item
+        # alternatively, you can paginate page-by-page
+        for page in response.iter_pages():
+            yield page
         """
         _response = self._client_wrapper.httpx_client.request(
             "voices/",
             method="GET",
+            params={
+                "limit": limit,
+                "starting_after": starting_after,
+                "ending_before": ending_before,
+                "is_owner": is_owner,
+                "is_starred": is_starred,
+                "gender": gender,
+                "expand[]": expand,
+            },
             request_options=request_options,
         )
         try:
             if 200 <= _response.status_code < 300:
-                return typing.cast(
-                    typing.List[Voice],
+                _parsed_response = typing.cast(
+                    GetVoicesResponse,
                     parse_obj_as(
-                        type_=typing.List[Voice],  # type: ignore
+                        type_=GetVoicesResponse,  # type: ignore
                         object_=_response.json(),
                     ),
                 )
+                _parsed_next = _parsed_response.next_page
+                _has_next = _parsed_next is not None and _parsed_next != ""
+                _get_next = lambda: self.list(
+                    limit=limit,
+                    starting_after=_parsed_next,
+                    ending_before=ending_before,
+                    is_owner=is_owner,
+                    is_starred=is_starred,
+                    gender=gender,
+                    expand=expand,
+                    request_options=request_options,
+                )
+                _items = _parsed_response.data
+                return SyncPager(has_next=_has_next, items=_items, get_next=_get_next)
             _response_json = _response.json()
         except JSONDecodeError:
             raise ApiError(status_code=_response.status_code, body=_response.text)
         raise ApiError(status_code=_response.status_code, body=_response_json)
 
-    def clone(
+    def create(
         self,
         *,
-        clip: core.File,
         name: str,
-        language: SupportedLanguage,
-        mode: CloneMode,
-        enhance: bool,
-        description: typing.Optional[str] = OMIT,
-        transcript: typing.Optional[str] = OMIT,
+        description: str,
+        embedding: Embedding,
+        language: typing.Optional[SupportedLanguage] = OMIT,
         request_options: typing.Optional[RequestOptions] = None,
-    ) -> VoiceMetadata:
+    ) -> Voice:
         """
-        Clone a voice from an audio clip. This endpoint has two modes, stability and similarity.
-
-        Similarity mode clones are more similar to the source clip, but may reproduce background noise. For these, use an audio clip about 5 seconds long.
-
-        Stability mode clones are more stable, but may not sound as similar to the source clip. For these, use an audio clip 10-20 seconds long.
-
         Parameters
         ----------
-        clip : core.File
-            See core.File for more documentation
-
         name : str
             The name of the voice.
 
+        description : str
+            The description of the voice.
 
-        language : SupportedLanguage
-            The language of the voice.
-
-
-        mode : CloneMode
-            Tradeoff between similarity and stability. Similarity clones sound more like the source clip, but may reproduce background noise. Stability clones always sound like a studio recording, but may not sound as similar to the source clip.
-
-
-        enhance : bool
-            Whether to enhance the clip to improve its quality before cloning. Useful if the clip has background noise.
-
-
-        description : typing.Optional[str]
-            A description for the voice.
-
-
-        transcript : typing.Optional[str]
-            Optional transcript of the words spoken in the audio clip. Only used for similarity mode.
+        embedding : Embedding
 
+        language : typing.Optional[SupportedLanguage]
 
         request_options : typing.Optional[RequestOptions]
             Request-specific configuration.
 
         Returns
         -------
-        VoiceMetadata
+        Voice
 
         Examples
         --------
@@ -132,27 +176,20 @@ class VoicesClient:
         client = Cartesia(
             api_key="YOUR_API_KEY",
         )
-        client.voices.clone(
-            name="A high-stability cloned voice",
-            description="Copied from Cartesia docs",
-            mode="stability",
-            language="en",
-            enhance=True,
+        client.voices.create(
+            name="name",
+            description="description",
+            embedding=[1.1, 1.1],
         )
         """
         _response = self._client_wrapper.httpx_client.request(
-            "voices/clone",
+            "voices/",
             method="POST",
-            data={
+            json={
                 "name": name,
                 "description": description,
+                "embedding": embedding,
                 "language": language,
-                "mode": mode,
-                "enhance": enhance,
-                "transcript": transcript,
-            },
-            files={
-                "clip": clip,
             },
             request_options=request_options,
             omit=OMIT,
@@ -160,9 +197,9 @@ class VoicesClient:
         try:
             if 200 <= _response.status_code < 300:
                 return typing.cast(
-                    VoiceMetadata,
+                    Voice,
                     parse_obj_as(
-                        type_=VoiceMetadata,  # type: ignore
+                        type_=Voice,  # type: ignore
                         object_=_response.json(),
                     ),
                 )
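As the hunks above show, `create()` in 2.0.0b7 requires an `embedding`, posts JSON to `voices/`, and returns a `Voice`; the `base_voice_id` parameter from 2.0.0b2 is gone (its import and parameter are removed in the hunks below). The following before/after sketch is illustrative only, reusing the `client` from the earlier sketch and only arguments that appear in this diff; the embedding values are placeholders.

```python
# 2.0.0b2 call shape (no longer valid in b7)
# client.voices.create(
#     name="My Custom Voice",
#     description="A custom voice created through the API",
#     embedding=[],
#     language="en",
#     base_voice_id="123e4567-e89b-12d3-a456-426614174000",  # dropped in b7
# )

# 2.0.0b7 call shape: embedding is required, base_voice_id is removed
voice = client.voices.create(
    name="My Custom Voice",
    description="A custom voice created through the API",
    embedding=[1.1, 1.1],  # placeholder values, as in the generated docstring example
    language="en",
)
print(voice)
```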
@@ -431,39 +468,58 @@ class VoicesClient:
             raise ApiError(status_code=_response.status_code, body=_response.text)
         raise ApiError(status_code=_response.status_code, body=_response_json)
 
-    def create(
+    def clone(
         self,
         *,
+        clip: core.File,
         name: str,
-        description: str,
-        embedding: Embedding,
-        language: typing.Optional[SupportedLanguage] = OMIT,
-        base_voice_id: typing.Optional[BaseVoiceId] = OMIT,
+        language: SupportedLanguage,
+        mode: CloneMode,
+        enhance: bool,
+        description: typing.Optional[str] = OMIT,
+        transcript: typing.Optional[str] = OMIT,
         request_options: typing.Optional[RequestOptions] = None,
-    ) -> Voice:
+    ) -> VoiceMetadata:
         """
-        Create voice from raw features. If you'd like to clone a voice from an audio file, please use Clone Voice instead.
+        Clone a voice from an audio clip. This endpoint has two modes, stability and similarity.
+        Similarity mode clones are more similar to the source clip, but may reproduce background noise. For these, use an audio clip about 5 seconds long.
+        Stability mode clones are more stable, but may not sound as similar to the source clip. For these, use an audio clip 10-20 seconds long.
 
         Parameters
         ----------
+        clip : core.File
+            See core.File for more documentation
+
         name : str
             The name of the voice.
 
-        description : str
-            The description of the voice.
 
-        embedding : Embedding
+        language : SupportedLanguage
+            The language of the voice.
 
-        language : typing.Optional[SupportedLanguage]
 
-        base_voice_id : typing.Optional[BaseVoiceId]
+        mode : CloneMode
+            Tradeoff between similarity and stability. Similarity clones sound more like the source clip, but may reproduce background noise. Stability clones always sound like a studio recording, but may not sound as similar to the source clip.
+
+
+        enhance : bool
+            Whether to enhance the clip to improve its quality before cloning. Useful if the clip has background noise.
+
+
+        description : typing.Optional[str]
+            A description for the voice.
+
+
+        transcript : typing.Optional[str]
+            Optional transcript of the words spoken in the audio clip. Only used for similarity mode.
+
 
         request_options : typing.Optional[RequestOptions]
             Request-specific configuration.
 
         Returns
         -------
-        Voice
+        VoiceMetadata
 
         Examples
         --------
@@ -472,23 +528,27 @@ class VoicesClient:
         client = Cartesia(
             api_key="YOUR_API_KEY",
         )
-        client.voices.create(
-            name="My Custom Voice",
-            description="A custom voice created through the API",
-            embedding=[],
+        client.voices.clone(
+            name="A high-stability cloned voice",
+            description="Copied from Cartesia docs",
+            mode="stability",
             language="en",
-            base_voice_id="123e4567-e89b-12d3-a456-426614174000",
+            enhance=True,
         )
         """
         _response = self._client_wrapper.httpx_client.request(
-            "voices/",
+            "voices/clone",
             method="POST",
-            json={
+            data={
                 "name": name,
                 "description": description,
-                "embedding": embedding,
                 "language": language,
-                "base_voice_id": base_voice_id,
+                "mode": mode,
+                "enhance": enhance,
+                "transcript": transcript,
+            },
+            files={
+                "clip": clip,
             },
             request_options=request_options,
             omit=OMIT,
@@ -496,9 +556,9 @@ class VoicesClient:
         try:
             if 200 <= _response.status_code < 300:
                 return typing.cast(
-                    Voice,
+                    VoiceMetadata,
                     parse_obj_as(
-                        type_=Voice,  # type: ignore
+                        type_=VoiceMetadata,  # type: ignore
                         object_=_response.json(),
                     ),
                 )
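The relocated `clone()` method still uploads the clip as multipart form data to `voices/clone` and returns `VoiceMetadata`. A minimal sketch of the 2.0.0b7 call, assuming `core.File` accepts an open binary file handle (an assumption; only the keyword arguments are confirmed by the diff) and reusing the `client` from the first sketch:

```python
with open("sample.wav", "rb") as clip_file:
    cloned = client.voices.clone(
        clip=clip_file,    # sent via files={"clip": clip}, per the request body in the diff
        name="A high-stability cloned voice",
        description="Copied from Cartesia docs",
        mode="stability",  # the docstring also describes a "similarity" mode
        language="en",
        enhance=True,
    )
print(cloned)
```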
@@ -512,16 +572,54 @@ class AsyncVoicesClient:
     def __init__(self, *, client_wrapper: AsyncClientWrapper):
         self._client_wrapper = client_wrapper
 
-    async def list(self, *, request_options: typing.Optional[RequestOptions] = None) -> typing.List[Voice]:
+    async def list(
+        self,
+        *,
+        limit: typing.Optional[int] = None,
+        starting_after: typing.Optional[str] = None,
+        ending_before: typing.Optional[str] = None,
+        is_owner: typing.Optional[bool] = None,
+        is_starred: typing.Optional[bool] = None,
+        gender: typing.Optional[GenderPresentation] = None,
+        expand: typing.Optional[typing.Sequence[VoiceExpandOptions]] = None,
+        request_options: typing.Optional[RequestOptions] = None,
+    ) -> AsyncPager[Voice]:
         """
         Parameters
         ----------
+        limit : typing.Optional[int]
+            The number of Voices to return per page, ranging between 1 and 100.
+
+        starting_after : typing.Optional[str]
+            A cursor to use in pagination. `starting_after` is a Voice ID that defines your
+            place in the list. For example, if you make a /voices request and receive 100
+            objects, ending with `voice_abc123`, your subsequent call can include
+            `starting_after=voice_abc123` to fetch the next page of the list.
+
+        ending_before : typing.Optional[str]
+            A cursor to use in pagination. `ending_before` is a Voice ID that defines your
+            place in the list. For example, if you make a /voices request and receive 100
+            objects, starting with `voice_abc123`, your subsequent call can include
+            `ending_before=voice_abc123` to fetch the previous page of the list.
+
+        is_owner : typing.Optional[bool]
+            Whether to only return voices owned by the current user.
+
+        is_starred : typing.Optional[bool]
+            Whether to only return starred voices.
+
+        gender : typing.Optional[GenderPresentation]
+            The gender presentation of the voices to return.
+
+        expand : typing.Optional[typing.Sequence[VoiceExpandOptions]]
+            Additional fields to include in the response.
+
         request_options : typing.Optional[RequestOptions]
             Request-specific configuration.
 
         Returns
         -------
-        typing.List[Voice]
+        AsyncPager[Voice]
 
         Examples
         --------
@@ -535,7 +633,12 @@ class AsyncVoicesClient:
 
 
         async def main() -> None:
-            await client.voices.list()
+            response = await client.voices.list()
+            async for item in response:
+                yield item
+            # alternatively, you can paginate page-by-page
+            async for page in response.iter_pages():
+                yield page
 
 
         asyncio.run(main())
@@ -543,76 +646,73 @@ class AsyncVoicesClient:
         _response = await self._client_wrapper.httpx_client.request(
             "voices/",
             method="GET",
+            params={
+                "limit": limit,
+                "starting_after": starting_after,
+                "ending_before": ending_before,
+                "is_owner": is_owner,
+                "is_starred": is_starred,
+                "gender": gender,
+                "expand[]": expand,
+            },
             request_options=request_options,
         )
         try:
             if 200 <= _response.status_code < 300:
-                return typing.cast(
-                    typing.List[Voice],
+                _parsed_response = typing.cast(
+                    GetVoicesResponse,
                     parse_obj_as(
-                        type_=typing.List[Voice],  # type: ignore
+                        type_=GetVoicesResponse,  # type: ignore
                         object_=_response.json(),
                     ),
                 )
+                _parsed_next = _parsed_response.next_page
+                _has_next = _parsed_next is not None and _parsed_next != ""
+                _get_next = lambda: self.list(
+                    limit=limit,
+                    starting_after=_parsed_next,
+                    ending_before=ending_before,
+                    is_owner=is_owner,
+                    is_starred=is_starred,
+                    gender=gender,
+                    expand=expand,
+                    request_options=request_options,
+                )
+                _items = _parsed_response.data
+                return AsyncPager(has_next=_has_next, items=_items, get_next=_get_next)
             _response_json = _response.json()
         except JSONDecodeError:
             raise ApiError(status_code=_response.status_code, body=_response.text)
         raise ApiError(status_code=_response.status_code, body=_response_json)
 
-    async def clone(
+    async def create(
         self,
         *,
-        clip: core.File,
         name: str,
-        language: SupportedLanguage,
-        mode: CloneMode,
-        enhance: bool,
-        description: typing.Optional[str] = OMIT,
-        transcript: typing.Optional[str] = OMIT,
+        description: str,
+        embedding: Embedding,
+        language: typing.Optional[SupportedLanguage] = OMIT,
         request_options: typing.Optional[RequestOptions] = None,
-    ) -> VoiceMetadata:
+    ) -> Voice:
         """
-        Clone a voice from an audio clip. This endpoint has two modes, stability and similarity.
-
-        Similarity mode clones are more similar to the source clip, but may reproduce background noise. For these, use an audio clip about 5 seconds long.
-
-        Stability mode clones are more stable, but may not sound as similar to the source clip. For these, use an audio clip 10-20 seconds long.
-
         Parameters
         ----------
-        clip : core.File
-            See core.File for more documentation
-
         name : str
             The name of the voice.
 
+        description : str
+            The description of the voice.
 
-        language : SupportedLanguage
-            The language of the voice.
-
-
-        mode : CloneMode
-            Tradeoff between similarity and stability. Similarity clones sound more like the source clip, but may reproduce background noise. Stability clones always sound like a studio recording, but may not sound as similar to the source clip.
-
-
-        enhance : bool
-            Whether to enhance the clip to improve its quality before cloning. Useful if the clip has background noise.
-
-
-        description : typing.Optional[str]
-            A description for the voice.
-
-
-        transcript : typing.Optional[str]
-            Optional transcript of the words spoken in the audio clip. Only used for similarity mode.
+        embedding : Embedding
 
+        language : typing.Optional[SupportedLanguage]
 
         request_options : typing.Optional[RequestOptions]
             Request-specific configuration.
 
         Returns
         -------
-        VoiceMetadata
+        Voice
 
         Examples
         --------
@@ -626,30 +726,23 @@ class AsyncVoicesClient:
 
 
         async def main() -> None:
-            await client.voices.clone(
-                name="A high-stability cloned voice",
-                description="Copied from Cartesia docs",
-                mode="stability",
-                language="en",
-                enhance=True,
+            await client.voices.create(
+                name="name",
+                description="description",
+                embedding=[1.1, 1.1],
             )
 
 
         asyncio.run(main())
         """
         _response = await self._client_wrapper.httpx_client.request(
-            "voices/clone",
+            "voices/",
             method="POST",
-            data={
+            json={
                 "name": name,
                 "description": description,
+                "embedding": embedding,
                 "language": language,
-                "mode": mode,
-                "enhance": enhance,
-                "transcript": transcript,
-            },
-            files={
-                "clip": clip,
             },
             request_options=request_options,
             omit=OMIT,
@@ -657,9 +750,9 @@ class AsyncVoicesClient:
         try:
             if 200 <= _response.status_code < 300:
                 return typing.cast(
-                    VoiceMetadata,
+                    Voice,
                     parse_obj_as(
-                        type_=VoiceMetadata,  # type: ignore
+                        type_=Voice,  # type: ignore
                         object_=_response.json(),
                     ),
                 )
@@ -968,39 +1061,58 @@ class AsyncVoicesClient:
             raise ApiError(status_code=_response.status_code, body=_response.text)
         raise ApiError(status_code=_response.status_code, body=_response_json)
 
-    async def create(
+    async def clone(
         self,
         *,
+        clip: core.File,
         name: str,
-        description: str,
-        embedding: Embedding,
-        language: typing.Optional[SupportedLanguage] = OMIT,
-        base_voice_id: typing.Optional[BaseVoiceId] = OMIT,
+        language: SupportedLanguage,
+        mode: CloneMode,
+        enhance: bool,
+        description: typing.Optional[str] = OMIT,
+        transcript: typing.Optional[str] = OMIT,
         request_options: typing.Optional[RequestOptions] = None,
-    ) -> Voice:
+    ) -> VoiceMetadata:
         """
-        Create voice from raw features. If you'd like to clone a voice from an audio file, please use Clone Voice instead.
+        Clone a voice from an audio clip. This endpoint has two modes, stability and similarity.
+        Similarity mode clones are more similar to the source clip, but may reproduce background noise. For these, use an audio clip about 5 seconds long.
+        Stability mode clones are more stable, but may not sound as similar to the source clip. For these, use an audio clip 10-20 seconds long.
 
         Parameters
         ----------
+        clip : core.File
+            See core.File for more documentation
+
         name : str
             The name of the voice.
 
-        description : str
-            The description of the voice.
 
-        embedding : Embedding
+        language : SupportedLanguage
+            The language of the voice.
 
-        language : typing.Optional[SupportedLanguage]
 
-        base_voice_id : typing.Optional[BaseVoiceId]
+        mode : CloneMode
+            Tradeoff between similarity and stability. Similarity clones sound more like the source clip, but may reproduce background noise. Stability clones always sound like a studio recording, but may not sound as similar to the source clip.
+
+
+        enhance : bool
+            Whether to enhance the clip to improve its quality before cloning. Useful if the clip has background noise.
+
+
+        description : typing.Optional[str]
+            A description for the voice.
+
+
+        transcript : typing.Optional[str]
+            Optional transcript of the words spoken in the audio clip. Only used for similarity mode.
+
 
         request_options : typing.Optional[RequestOptions]
             Request-specific configuration.
 
         Returns
         -------
-        Voice
+        VoiceMetadata
 
         Examples
         --------
@@ -1014,26 +1126,30 @@ class AsyncVoicesClient:
 
 
         async def main() -> None:
-            await client.voices.create(
-                name="My Custom Voice",
-                description="A custom voice created through the API",
-                embedding=[],
+            await client.voices.clone(
+                name="A high-stability cloned voice",
+                description="Copied from Cartesia docs",
+                mode="stability",
                 language="en",
-                base_voice_id="123e4567-e89b-12d3-a456-426614174000",
+                enhance=True,
             )
 
 
         asyncio.run(main())
         """
         _response = await self._client_wrapper.httpx_client.request(
-            "voices/",
+            "voices/clone",
             method="POST",
-            json={
+            data={
                 "name": name,
                 "description": description,
-                "embedding": embedding,
                 "language": language,
-                "base_voice_id": base_voice_id,
+                "mode": mode,
+                "enhance": enhance,
+                "transcript": transcript,
+            },
+            files={
+                "clip": clip,
             },
             request_options=request_options,
             omit=OMIT,
@@ -1041,9 +1157,9 @@ class AsyncVoicesClient:
         try:
             if 200 <= _response.status_code < 300:
                 return typing.cast(
-                    Voice,
+                    VoiceMetadata,
                     parse_obj_as(
-                        type_=Voice,  # type: ignore
+                        type_=VoiceMetadata,  # type: ignore
                         object_=_response.json(),
                     ),
                 )
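The async client mirrors the sync changes: its `list()` returns an `AsyncPager[Voice]` that is consumed with `async for`. The sketch below is based on the docstring example in the diff; the `AsyncCartesia` client name and construction are assumptions taken from the SDK's other async examples rather than from this diff, and the printed attributes are illustrative.

```python
import asyncio

from cartesia import AsyncCartesia  # async client name assumed, not shown in this diff


async def main() -> None:
    client = AsyncCartesia(api_key="YOUR_API_KEY")

    # list() now accepts pagination/filter params and returns an AsyncPager[Voice]
    pager = await client.voices.list(limit=10, is_starred=True)

    # item-by-item iteration; pages are fetched on demand
    async for voice in pager:
        print(voice.name)

    # or page-by-page, as in the docstring example
    async for page in pager.iter_pages():
        print(len(page.items or []))


asyncio.run(main())
```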