cartesia 2.0.9__py3-none-any.whl → 2.0.13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of cartesia has been flagged as possibly problematic by the registry scanner.
Files changed (34)
  1. cartesia/__init__.py +6 -0
  2. cartesia/core/client_wrapper.py +1 -1
  3. cartesia/infill/client.py +2 -2
  4. cartesia/tts/__init__.py +6 -0
  5. cartesia/tts/client.py +25 -0
  6. cartesia/tts/requests/__init__.py +2 -0
  7. cartesia/tts/requests/controls.py +2 -2
  8. cartesia/tts/requests/generation_config.py +26 -0
  9. cartesia/tts/requests/generation_request.py +2 -0
  10. cartesia/tts/requests/mp_3_output_format.py +4 -0
  11. cartesia/tts/requests/raw_output_format.py +4 -0
  12. cartesia/tts/requests/sse_output_format.py +3 -0
  13. cartesia/tts/requests/tts_request.py +2 -0
  14. cartesia/tts/requests/ttssse_request.py +2 -0
  15. cartesia/tts/requests/web_socket_raw_output_format.py +3 -0
  16. cartesia/tts/requests/web_socket_tts_request.py +2 -0
  17. cartesia/tts/types/__init__.py +4 -0
  18. cartesia/tts/types/controls.py +2 -2
  19. cartesia/tts/types/emotion.py +1 -32
  20. cartesia/tts/types/emotion_deprecated.py +34 -0
  21. cartesia/tts/types/generation_config.py +37 -0
  22. cartesia/tts/types/generation_request.py +2 -0
  23. cartesia/tts/types/mp_3_output_format.py +5 -1
  24. cartesia/tts/types/raw_output_format.py +6 -2
  25. cartesia/tts/types/sse_output_format.py +5 -2
  26. cartesia/tts/types/tts_request.py +2 -0
  27. cartesia/tts/types/ttssse_request.py +2 -0
  28. cartesia/tts/types/web_socket_raw_output_format.py +5 -2
  29. cartesia/tts/types/web_socket_tts_request.py +2 -0
  30. cartesia/voice_changer/client.py +2 -0
  31. {cartesia-2.0.9.dist-info → cartesia-2.0.13.dist-info}/METADATA +82 -72
  32. {cartesia-2.0.9.dist-info → cartesia-2.0.13.dist-info}/RECORD +34 -31
  33. {cartesia-2.0.9.dist-info → cartesia-2.0.13.dist-info}/LICENSE +0 -0
  34. {cartesia-2.0.9.dist-info → cartesia-2.0.13.dist-info}/WHEEL +0 -0
cartesia/__init__.py CHANGED
@@ -52,7 +52,10 @@ from .tts import (
     Controls,
     ControlsParams,
     Emotion,
+    EmotionDeprecated,
     FlushId,
+    GenerationConfig,
+    GenerationConfigParams,
     GenerationRequest,
     GenerationRequestParams,
     ModelSpeed,
@@ -211,6 +214,7 @@ __all__ = [
     "EmbeddingSpecifier",
     "EmbeddingSpecifierParams",
     "Emotion",
+    "EmotionDeprecated",
     "ErrorMessage",
     "ErrorMessageParams",
     "FilePurpose",
@@ -219,6 +223,8 @@ __all__ = [
     "FlushId",
     "Gender",
     "GenderPresentation",
+    "GenerationConfig",
+    "GenerationConfigParams",
     "GenerationRequest",
     "GenerationRequestParams",
     "GetVoicesResponse",
cartesia/core/client_wrapper.py CHANGED
@@ -16,7 +16,7 @@ class BaseClientWrapper:
         headers: typing.Dict[str, str] = {
             "X-Fern-Language": "Python",
             "X-Fern-SDK-Name": "cartesia",
-            "X-Fern-SDK-Version": "2.0.9",
+            "X-Fern-SDK-Version": "2.0.13",
         }
         headers["X-API-Key"] = self.api_key
         headers["Cartesia-Version"] = "2024-11-13"
cartesia/infill/client.py CHANGED
@@ -78,7 +78,7 @@ class InfillClient:
            The format of the output audio

        output_format_sample_rate : int
-           The sample rate of the output audio
+           The sample rate of the output audio in Hz. Supported sample rates are 8000, 16000, 22050, 24000, 44100, 48000.

        output_format_encoding : typing.Optional[RawEncoding]
            Required for `raw` and `wav` containers.
@@ -221,7 +221,7 @@ class AsyncInfillClient:
            The format of the output audio

        output_format_sample_rate : int
-           The sample rate of the output audio
+           The sample rate of the output audio in Hz. Supported sample rates are 8000, 16000, 22050, 24000, 44100, 48000.

        output_format_encoding : typing.Optional[RawEncoding]
            Required for `raw` and `wav` containers.
cartesia/tts/__init__.py CHANGED
@@ -5,7 +5,9 @@ from .types import (
     ContextId,
     Controls,
     Emotion,
+    EmotionDeprecated,
     FlushId,
+    GenerationConfig,
     GenerationRequest,
     ModelSpeed,
     Mp3OutputFormat,
@@ -51,6 +53,7 @@ from .types import (
 from .requests import (
     CancelContextRequestParams,
     ControlsParams,
+    GenerationConfigParams,
     GenerationRequestParams,
     Mp3OutputFormatParams,
     OutputFormatParams,
@@ -96,7 +99,10 @@ __all__ = [
     "Controls",
     "ControlsParams",
     "Emotion",
+    "EmotionDeprecated",
     "FlushId",
+    "GenerationConfig",
+    "GenerationConfigParams",
     "GenerationRequest",
     "GenerationRequestParams",
     "ModelSpeed",
cartesia/tts/client.py CHANGED
@@ -5,6 +5,7 @@ from ..core.client_wrapper import SyncClientWrapper
 from .requests.tts_request_voice_specifier import TtsRequestVoiceSpecifierParams
 from .requests.output_format import OutputFormatParams
 from .types.supported_language import SupportedLanguage
+from .requests.generation_config import GenerationConfigParams
 from .types.model_speed import ModelSpeed
 from ..core.request_options import RequestOptions
 from ..core.serialization import convert_and_respect_annotation_metadata
@@ -34,6 +35,7 @@ class TtsClient:
         voice: TtsRequestVoiceSpecifierParams,
         output_format: OutputFormatParams,
         language: typing.Optional[SupportedLanguage] = OMIT,
+        generation_config: typing.Optional[GenerationConfigParams] = OMIT,
         duration: typing.Optional[float] = OMIT,
         speed: typing.Optional[ModelSpeed] = OMIT,
         request_options: typing.Optional[RequestOptions] = None,
@@ -52,6 +54,8 @@ class TtsClient:

         language : typing.Optional[SupportedLanguage]

+        generation_config : typing.Optional[GenerationConfigParams]
+
         duration : typing.Optional[float]
             The maximum duration of the audio in seconds. You do not usually need to specify this.
             If the duration is not appropriate for the length of the transcript, the output audio may be truncated.
@@ -97,6 +101,9 @@ class TtsClient:
                 "output_format": convert_and_respect_annotation_metadata(
                     object_=output_format, annotation=OutputFormatParams, direction="write"
                 ),
+                "generation_config": convert_and_respect_annotation_metadata(
+                    object_=generation_config, annotation=GenerationConfigParams, direction="write"
+                ),
                 "duration": duration,
                 "speed": speed,
             },
@@ -123,6 +130,7 @@ class TtsClient:
         voice: TtsRequestVoiceSpecifierParams,
         output_format: SseOutputFormatParams,
         language: typing.Optional[SupportedLanguage] = OMIT,
+        generation_config: typing.Optional[GenerationConfigParams] = OMIT,
         duration: typing.Optional[float] = OMIT,
         speed: typing.Optional[ModelSpeed] = OMIT,
         add_timestamps: typing.Optional[bool] = OMIT,
@@ -145,6 +153,8 @@ class TtsClient:

         language : typing.Optional[SupportedLanguage]

+        generation_config : typing.Optional[GenerationConfigParams]
+
         duration : typing.Optional[float]
             The maximum duration of the audio in seconds. You do not usually need to specify this.
             If the duration is not appropriate for the length of the transcript, the output audio may be truncated.
@@ -204,6 +214,9 @@ class TtsClient:
                 "output_format": convert_and_respect_annotation_metadata(
                     object_=output_format, annotation=SseOutputFormatParams, direction="write"
                 ),
+                "generation_config": convert_and_respect_annotation_metadata(
+                    object_=generation_config, annotation=GenerationConfigParams, direction="write"
+                ),
                 "duration": duration,
                 "speed": speed,
                 "add_timestamps": add_timestamps,
@@ -248,6 +261,7 @@ class AsyncTtsClient:
         voice: TtsRequestVoiceSpecifierParams,
         output_format: OutputFormatParams,
         language: typing.Optional[SupportedLanguage] = OMIT,
+        generation_config: typing.Optional[GenerationConfigParams] = OMIT,
         duration: typing.Optional[float] = OMIT,
         speed: typing.Optional[ModelSpeed] = OMIT,
         request_options: typing.Optional[RequestOptions] = None,
@@ -266,6 +280,8 @@ class AsyncTtsClient:

         language : typing.Optional[SupportedLanguage]

+        generation_config : typing.Optional[GenerationConfigParams]
+
         duration : typing.Optional[float]
             The maximum duration of the audio in seconds. You do not usually need to specify this.
             If the duration is not appropriate for the length of the transcript, the output audio may be truncated.
@@ -319,6 +335,9 @@ class AsyncTtsClient:
                 "output_format": convert_and_respect_annotation_metadata(
                     object_=output_format, annotation=OutputFormatParams, direction="write"
                 ),
+                "generation_config": convert_and_respect_annotation_metadata(
+                    object_=generation_config, annotation=GenerationConfigParams, direction="write"
+                ),
                 "duration": duration,
                 "speed": speed,
             },
@@ -345,6 +364,7 @@ class AsyncTtsClient:
         voice: TtsRequestVoiceSpecifierParams,
         output_format: SseOutputFormatParams,
         language: typing.Optional[SupportedLanguage] = OMIT,
+        generation_config: typing.Optional[GenerationConfigParams] = OMIT,
         duration: typing.Optional[float] = OMIT,
         speed: typing.Optional[ModelSpeed] = OMIT,
         add_timestamps: typing.Optional[bool] = OMIT,
@@ -367,6 +387,8 @@ class AsyncTtsClient:

         language : typing.Optional[SupportedLanguage]

+        generation_config : typing.Optional[GenerationConfigParams]
+
         duration : typing.Optional[float]
             The maximum duration of the audio in seconds. You do not usually need to specify this.
             If the duration is not appropriate for the length of the transcript, the output audio may be truncated.
@@ -434,6 +456,9 @@ class AsyncTtsClient:
                 "output_format": convert_and_respect_annotation_metadata(
                     object_=output_format, annotation=SseOutputFormatParams, direction="write"
                 ),
+                "generation_config": convert_and_respect_annotation_metadata(
+                    object_=generation_config, annotation=GenerationConfigParams, direction="write"
+                ),
                 "duration": duration,
                 "speed": speed,
                 "add_timestamps": add_timestamps,
cartesia/tts/requests/__init__.py CHANGED
@@ -2,6 +2,7 @@

 from .cancel_context_request import CancelContextRequestParams
 from .controls import ControlsParams
+from .generation_config import GenerationConfigParams
 from .generation_request import GenerationRequestParams
 from .mp_3_output_format import Mp3OutputFormatParams
 from .output_format import OutputFormatParams, OutputFormat_Mp3Params, OutputFormat_RawParams, OutputFormat_WavParams
@@ -41,6 +42,7 @@ from .word_timestamps import WordTimestampsParams
 __all__ = [
     "CancelContextRequestParams",
     "ControlsParams",
+    "GenerationConfigParams",
     "GenerationRequestParams",
     "Mp3OutputFormatParams",
     "OutputFormatParams",
cartesia/tts/requests/controls.py CHANGED
@@ -3,9 +3,9 @@
 import typing_extensions
 from .speed import SpeedParams
 import typing
-from ..types.emotion import Emotion
+from ..types.emotion_deprecated import EmotionDeprecated


 class ControlsParams(typing_extensions.TypedDict):
     speed: SpeedParams
-    emotion: typing.Sequence[Emotion]
+    emotion: typing.Sequence[EmotionDeprecated]
cartesia/tts/requests/generation_config.py CHANGED
@@ -0,0 +1,26 @@
+# This file was auto-generated by Fern from our API Definition.
+
+import typing_extensions
+import typing_extensions
+from ..types.emotion import Emotion
+
+
+class GenerationConfigParams(typing_extensions.TypedDict):
+    """
+    Configure the various attributes of the generated speech. These controls only for `sonic-3` and have no effect on earlier models.
+    """
+
+    volume: typing_extensions.NotRequired[float]
+    """
+    Adjust the volume of the generated speech between 0.5x and 2.0x the original volume (default is 1.0x). Valid values are between 0.5 and 2.0 inclusive.
+    """
+
+    speed: typing_extensions.NotRequired[float]
+    """
+    Adjust the speed of the generated speech between 0.6x and 1.5x the original speed (default is 1.0x). Valid values are between 0.6 and 1.5 inclusive.
+    """
+
+    emotion: typing_extensions.NotRequired[Emotion]
+    """
+    Guide the emotion of the generated speech.
+    """
cartesia/tts/requests/generation_request.py CHANGED
@@ -6,6 +6,7 @@ from .tts_request_voice_specifier import TtsRequestVoiceSpecifierParams
 import typing_extensions
 from ..types.supported_language import SupportedLanguage
 from .web_socket_raw_output_format import WebSocketRawOutputFormatParams
+from .generation_config import GenerationConfigParams
 from ..types.model_speed import ModelSpeed
 from ..types.context_id import ContextId
 from ...core.serialization import FieldMetadata
@@ -25,6 +26,7 @@ class GenerationRequestParams(typing_extensions.TypedDict):
     voice: TtsRequestVoiceSpecifierParams
     language: typing_extensions.NotRequired[SupportedLanguage]
     output_format: WebSocketRawOutputFormatParams
+    generation_config: typing_extensions.NotRequired[GenerationConfigParams]
     duration: typing_extensions.NotRequired[float]
     """
     The maximum duration of the audio in seconds. You do not usually need to specify this.
cartesia/tts/requests/mp_3_output_format.py CHANGED
@@ -5,6 +5,10 @@ import typing_extensions

 class Mp3OutputFormatParams(typing_extensions.TypedDict):
     sample_rate: int
+    """
+    The sample rate of the audio in Hz. Supported sample rates are 8000, 16000, 22050, 24000, 44100, 48000.
+    """
+
     bit_rate: int
     """
     The bit rate of the audio in bits per second. Supported bit rates are 32000, 64000, 96000, 128000, 192000.
cartesia/tts/requests/raw_output_format.py CHANGED
@@ -8,4 +8,8 @@ import typing_extensions
 class RawOutputFormatParams(typing_extensions.TypedDict):
     encoding: RawEncoding
     sample_rate: int
+    """
+    The sample rate of the audio in Hz. Supported sample rates are 8000, 16000, 22050, 24000, 44100, 48000.
+    """
+
     bit_rate: typing_extensions.NotRequired[int]
cartesia/tts/requests/sse_output_format.py CHANGED
@@ -9,3 +9,6 @@ class SseOutputFormatParams(typing_extensions.TypedDict):
     container: typing.Literal["raw"]
     encoding: RawEncoding
     sample_rate: int
+    """
+    The sample rate of the audio in Hz. Supported sample rates are 8000, 16000, 22050, 24000, 44100, 48000.
+    """
cartesia/tts/requests/tts_request.py CHANGED
@@ -5,6 +5,7 @@ from .tts_request_voice_specifier import TtsRequestVoiceSpecifierParams
 import typing_extensions
 from ..types.supported_language import SupportedLanguage
 from .output_format import OutputFormatParams
+from .generation_config import GenerationConfigParams
 from ..types.model_speed import ModelSpeed


@@ -18,6 +19,7 @@ class TtsRequestParams(typing_extensions.TypedDict):
     voice: TtsRequestVoiceSpecifierParams
     language: typing_extensions.NotRequired[SupportedLanguage]
     output_format: OutputFormatParams
+    generation_config: typing_extensions.NotRequired[GenerationConfigParams]
     duration: typing_extensions.NotRequired[float]
     """
     The maximum duration of the audio in seconds. You do not usually need to specify this.
cartesia/tts/requests/ttssse_request.py CHANGED
@@ -5,6 +5,7 @@ from .tts_request_voice_specifier import TtsRequestVoiceSpecifierParams
 import typing_extensions
 from ..types.supported_language import SupportedLanguage
 from .sse_output_format import SseOutputFormatParams
+from .generation_config import GenerationConfigParams
 from ..types.model_speed import ModelSpeed
 from ..types.context_id import ContextId

@@ -19,6 +20,7 @@ class TtssseRequestParams(typing_extensions.TypedDict):
     voice: TtsRequestVoiceSpecifierParams
     language: typing_extensions.NotRequired[SupportedLanguage]
     output_format: SseOutputFormatParams
+    generation_config: typing_extensions.NotRequired[GenerationConfigParams]
     duration: typing_extensions.NotRequired[float]
     """
     The maximum duration of the audio in seconds. You do not usually need to specify this.
cartesia/tts/requests/web_socket_raw_output_format.py CHANGED
@@ -9,3 +9,6 @@ class WebSocketRawOutputFormatParams(typing_extensions.TypedDict):
     container: typing.Literal["raw"]
     encoding: RawEncoding
     sample_rate: int
+    """
+    The sample rate of the audio in Hz. Supported sample rates are 8000, 16000, 22050, 24000, 44100, 48000.
+    """
cartesia/tts/requests/web_socket_tts_request.py CHANGED
@@ -3,6 +3,7 @@
 import typing_extensions
 import typing_extensions
 from .output_format import OutputFormatParams
+from .generation_config import GenerationConfigParams
 from .tts_request_voice_specifier import TtsRequestVoiceSpecifierParams
 from ...core.serialization import FieldMetadata
 from ..types.model_speed import ModelSpeed
@@ -15,6 +16,7 @@ class WebSocketTtsRequestParams(typing_extensions.TypedDict):
     """

     output_format: typing_extensions.NotRequired[OutputFormatParams]
+    generation_config: typing_extensions.NotRequired[GenerationConfigParams]
     transcript: typing_extensions.NotRequired[str]
     voice: TtsRequestVoiceSpecifierParams
     duration: typing_extensions.NotRequired[int]
cartesia/tts/types/__init__.py CHANGED
@@ -4,7 +4,9 @@ from .cancel_context_request import CancelContextRequest
 from .context_id import ContextId
 from .controls import Controls
 from .emotion import Emotion
+from .emotion_deprecated import EmotionDeprecated
 from .flush_id import FlushId
+from .generation_config import GenerationConfig
 from .generation_request import GenerationRequest
 from .model_speed import ModelSpeed
 from .mp_3_output_format import Mp3OutputFormat
@@ -51,7 +53,9 @@ __all__ = [
     "ContextId",
     "Controls",
     "Emotion",
+    "EmotionDeprecated",
     "FlushId",
+    "GenerationConfig",
     "GenerationRequest",
     "ModelSpeed",
     "Mp3OutputFormat",
cartesia/tts/types/controls.py CHANGED
@@ -3,14 +3,14 @@
 from ...core.pydantic_utilities import UniversalBaseModel
 from .speed import Speed
 import typing
-from .emotion import Emotion
+from .emotion_deprecated import EmotionDeprecated
 from ...core.pydantic_utilities import IS_PYDANTIC_V2
 import pydantic


 class Controls(UniversalBaseModel):
     speed: Speed
-    emotion: typing.List[Emotion]
+    emotion: typing.List[EmotionDeprecated]

     if IS_PYDANTIC_V2:
         model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True)  # type: ignore # Pydantic v2
cartesia/tts/types/emotion.py CHANGED
@@ -1,34 +1,3 @@
 # This file was auto-generated by Fern from our API Definition.

-import typing
-
-Emotion = typing.Union[
-    typing.Literal[
-        "anger:lowest",
-        "anger:low",
-        "anger",
-        "anger:high",
-        "anger:highest",
-        "positivity:lowest",
-        "positivity:low",
-        "positivity",
-        "positivity:high",
-        "positivity:highest",
-        "surprise:lowest",
-        "surprise:low",
-        "surprise",
-        "surprise:high",
-        "surprise:highest",
-        "sadness:lowest",
-        "sadness:low",
-        "sadness",
-        "sadness:high",
-        "sadness:highest",
-        "curiosity:lowest",
-        "curiosity:low",
-        "curiosity",
-        "curiosity:high",
-        "curiosity:highest",
-    ],
-    typing.Any,
-]
+Emotion = str
cartesia/tts/types/emotion_deprecated.py CHANGED
@@ -0,0 +1,34 @@
+# This file was auto-generated by Fern from our API Definition.
+
+import typing
+
+EmotionDeprecated = typing.Union[
+    typing.Literal[
+        "anger:lowest",
+        "anger:low",
+        "anger",
+        "anger:high",
+        "anger:highest",
+        "positivity:lowest",
+        "positivity:low",
+        "positivity",
+        "positivity:high",
+        "positivity:highest",
+        "surprise:lowest",
+        "surprise:low",
+        "surprise",
+        "surprise:high",
+        "surprise:highest",
+        "sadness:lowest",
+        "sadness:low",
+        "sadness",
+        "sadness:high",
+        "sadness:highest",
+        "curiosity:lowest",
+        "curiosity:low",
+        "curiosity",
+        "curiosity:high",
+        "curiosity:highest",
+    ],
+    typing.Any,
+]
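The net effect of the two files above: `Emotion` becomes a plain string alias, while the old tag vocabulary survives under the new name `EmotionDeprecated` (still consumed by `Controls.emotion`). A small sketch of what that means for type annotations, using made-up example values:

```python
from cartesia.tts.types import Emotion, EmotionDeprecated

# Emotion is now just `str`, so free-form descriptions type-check.
mood: Emotion = "warm and slightly amused"  # illustrative value

# The old tag-style vocabulary is still expressible via EmotionDeprecated,
# which Controls.emotion continues to accept as a list.
legacy_tag: EmotionDeprecated = "positivity:high"
```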
cartesia/tts/types/generation_config.py CHANGED
@@ -0,0 +1,37 @@
+# This file was auto-generated by Fern from our API Definition.
+
+from ...core.pydantic_utilities import UniversalBaseModel
+import typing
+import pydantic
+from .emotion import Emotion
+from ...core.pydantic_utilities import IS_PYDANTIC_V2
+
+
+class GenerationConfig(UniversalBaseModel):
+    """
+    Configure the various attributes of the generated speech. These controls only for `sonic-3` and have no effect on earlier models.
+    """
+
+    volume: typing.Optional[float] = pydantic.Field(default=None)
+    """
+    Adjust the volume of the generated speech between 0.5x and 2.0x the original volume (default is 1.0x). Valid values are between 0.5 and 2.0 inclusive.
+    """
+
+    speed: typing.Optional[float] = pydantic.Field(default=None)
+    """
+    Adjust the speed of the generated speech between 0.6x and 1.5x the original speed (default is 1.0x). Valid values are between 0.6 and 1.5 inclusive.
+    """
+
+    emotion: typing.Optional[Emotion] = pydantic.Field(default=None)
+    """
+    Guide the emotion of the generated speech.
+    """
+
+    if IS_PYDANTIC_V2:
+        model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True)  # type: ignore # Pydantic v2
+    else:
+
+        class Config:
+            frozen = True
+            smart_union = True
+            extra = pydantic.Extra.allow
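`GenerationConfig` is the pydantic counterpart of the `GenerationConfigParams` TypedDict, with the same three optional fields. A minimal construction sketch; the values are illustrative only, and the ranges come from the field docstrings above:

```python
from cartesia.tts.types import GenerationConfig

# All fields default to None; the documented server-side defaults
# (1.0x volume, 1.0x speed) apply when a field is omitted.
config = GenerationConfig(
    volume=1.2,                      # must stay within 0.5-2.0
    speed=0.9,                       # must stay within 0.6-1.5
    emotion="calm and reassuring",   # free-form string; sonic-3 only
)
print(config.volume, config.speed, config.emotion)
```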
cartesia/tts/types/generation_request.py CHANGED
@@ -6,6 +6,7 @@ import typing
 from .tts_request_voice_specifier import TtsRequestVoiceSpecifier
 from .supported_language import SupportedLanguage
 from .web_socket_raw_output_format import WebSocketRawOutputFormat
+from .generation_config import GenerationConfig
 from .model_speed import ModelSpeed
 from .context_id import ContextId
 import typing_extensions
@@ -27,6 +28,7 @@ class GenerationRequest(UniversalBaseModel):
     voice: TtsRequestVoiceSpecifier
     language: typing.Optional[SupportedLanguage] = None
     output_format: WebSocketRawOutputFormat
+    generation_config: typing.Optional[GenerationConfig] = None
     duration: typing.Optional[float] = pydantic.Field(default=None)
     """
     The maximum duration of the audio in seconds. You do not usually need to specify this.
cartesia/tts/types/mp_3_output_format.py CHANGED
@@ -7,7 +7,11 @@ import typing


 class Mp3OutputFormat(UniversalBaseModel):
-    sample_rate: int
+    sample_rate: int = pydantic.Field()
+    """
+    The sample rate of the audio in Hz. Supported sample rates are 8000, 16000, 22050, 24000, 44100, 48000.
+    """
+
     bit_rate: int = pydantic.Field()
     """
     The bit rate of the audio in bits per second. Supported bit rates are 32000, 64000, 96000, 128000, 192000.
cartesia/tts/types/raw_output_format.py CHANGED
@@ -2,14 +2,18 @@

 from ...core.pydantic_utilities import UniversalBaseModel
 from .raw_encoding import RawEncoding
+import pydantic
 import typing
 from ...core.pydantic_utilities import IS_PYDANTIC_V2
-import pydantic


 class RawOutputFormat(UniversalBaseModel):
     encoding: RawEncoding
-    sample_rate: int
+    sample_rate: int = pydantic.Field()
+    """
+    The sample rate of the audio in Hz. Supported sample rates are 8000, 16000, 22050, 24000, 44100, 48000.
+    """
+
     bit_rate: typing.Optional[int] = None

     if IS_PYDANTIC_V2:
cartesia/tts/types/sse_output_format.py CHANGED
@@ -3,14 +3,17 @@
 from ...core.pydantic_utilities import UniversalBaseModel
 import typing
 from .raw_encoding import RawEncoding
-from ...core.pydantic_utilities import IS_PYDANTIC_V2
 import pydantic
+from ...core.pydantic_utilities import IS_PYDANTIC_V2


 class SseOutputFormat(UniversalBaseModel):
     container: typing.Literal["raw"] = "raw"
     encoding: RawEncoding
-    sample_rate: int
+    sample_rate: int = pydantic.Field()
+    """
+    The sample rate of the audio in Hz. Supported sample rates are 8000, 16000, 22050, 24000, 44100, 48000.
+    """

     if IS_PYDANTIC_V2:
         model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True)  # type: ignore # Pydantic v2
cartesia/tts/types/tts_request.py CHANGED
@@ -6,6 +6,7 @@ from .tts_request_voice_specifier import TtsRequestVoiceSpecifier
 import typing
 from .supported_language import SupportedLanguage
 from .output_format import OutputFormat
+from .generation_config import GenerationConfig
 from .model_speed import ModelSpeed
 from ...core.pydantic_utilities import IS_PYDANTIC_V2

@@ -20,6 +21,7 @@ class TtsRequest(UniversalBaseModel):
     voice: TtsRequestVoiceSpecifier
     language: typing.Optional[SupportedLanguage] = None
     output_format: OutputFormat
+    generation_config: typing.Optional[GenerationConfig] = None
     duration: typing.Optional[float] = pydantic.Field(default=None)
     """
     The maximum duration of the audio in seconds. You do not usually need to specify this.
cartesia/tts/types/ttssse_request.py CHANGED
@@ -6,6 +6,7 @@ from .tts_request_voice_specifier import TtsRequestVoiceSpecifier
 import typing
 from .supported_language import SupportedLanguage
 from .sse_output_format import SseOutputFormat
+from .generation_config import GenerationConfig
 from .model_speed import ModelSpeed
 from .context_id import ContextId
 from ...core.pydantic_utilities import IS_PYDANTIC_V2
@@ -21,6 +22,7 @@ class TtssseRequest(UniversalBaseModel):
     voice: TtsRequestVoiceSpecifier
     language: typing.Optional[SupportedLanguage] = None
     output_format: SseOutputFormat
+    generation_config: typing.Optional[GenerationConfig] = None
     duration: typing.Optional[float] = pydantic.Field(default=None)
     """
     The maximum duration of the audio in seconds. You do not usually need to specify this.
cartesia/tts/types/web_socket_raw_output_format.py CHANGED
@@ -3,14 +3,17 @@
 from ...core.pydantic_utilities import UniversalBaseModel
 import typing
 from .raw_encoding import RawEncoding
-from ...core.pydantic_utilities import IS_PYDANTIC_V2
 import pydantic
+from ...core.pydantic_utilities import IS_PYDANTIC_V2


 class WebSocketRawOutputFormat(UniversalBaseModel):
     container: typing.Literal["raw"] = "raw"
     encoding: RawEncoding
-    sample_rate: int
+    sample_rate: int = pydantic.Field()
+    """
+    The sample rate of the audio in Hz. Supported sample rates are 8000, 16000, 22050, 24000, 44100, 48000.
+    """

     if IS_PYDANTIC_V2:
         model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True)  # type: ignore # Pydantic v2
cartesia/tts/types/web_socket_tts_request.py CHANGED
@@ -4,6 +4,7 @@ from ...core.pydantic_utilities import UniversalBaseModel
 import pydantic
 import typing
 from .output_format import OutputFormat
+from .generation_config import GenerationConfig
 from .tts_request_voice_specifier import TtsRequestVoiceSpecifier
 import typing_extensions
 from ...core.serialization import FieldMetadata
@@ -18,6 +19,7 @@ class WebSocketTtsRequest(UniversalBaseModel):
     """

     output_format: typing.Optional[OutputFormat] = None
+    generation_config: typing.Optional[GenerationConfig] = None
     transcript: typing.Optional[str] = None
     voice: TtsRequestVoiceSpecifier
     duration: typing.Optional[int] = None
cartesia/voice_changer/client.py CHANGED
@@ -47,6 +47,7 @@ class VoiceChangerClient:
        output_format_container : OutputFormatContainer

        output_format_sample_rate : int
+           The sample rate of the output audio in Hz. Supported sample rates are 8000, 16000, 22050, 24000, 44100, 48000.

        output_format_encoding : typing.Optional[RawEncoding]
            Required for `raw` and `wav` containers.
@@ -224,6 +225,7 @@ class AsyncVoiceChangerClient:
        output_format_container : OutputFormatContainer

        output_format_sample_rate : int
+           The sample rate of the output audio in Hz. Supported sample rates are 8000, 16000, 22050, 24000, 44100, 48000.

        output_format_encoding : typing.Optional[RawEncoding]
            Required for `raw` and `wav` containers.
{cartesia-2.0.9.dist-info → cartesia-2.0.13.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: cartesia
-Version: 2.0.9
+Version: 2.0.13
 Summary:
 Requires-Python: >=3.8,<4.0
 Classifier: Intended Audience :: Developers
@@ -53,26 +53,36 @@ Instantiate and use the client with the following:

 ```python
 from cartesia import Cartesia
-from cartesia.tts import OutputFormat_Raw, TtsRequestIdSpecifier
 import os

 client = Cartesia(
-    api_key=os.getenv("CARTESIA_API_KEY"),
-)
-client.tts.bytes(
-    model_id="sonic-2",
-    transcript="Hello, world!",
-    voice={
-        "mode": "id",
-        "id": "694f9389-aac1-45b6-b726-9d9369183238",
-    },
-    language="en",
-    output_format={
-        "container": "raw",
-        "sample_rate": 44100,
-        "encoding": "pcm_f32le",
-    },
+    api_key=os.environ["CARTESIA_API_KEY"],
 )
+
+
+def main():
+    with open("sonic.wav", "wb") as f:
+        bytes_iter = client.tts.bytes(
+            model_id="sonic-3",
+            transcript="Hello, world!",
+            voice={
+                "mode": "id",
+                "id": "6ccbfb76-1fc6-48f7-b71d-91ac6298247b",
+            },
+            language="en",
+            output_format={
+                "container": "wav",
+                "sample_rate": 44100,
+                "encoding": "pcm_f32le",
+            },
+        )
+
+        for chunk in bytes_iter:
+            f.write(chunk)
+
+
+if __name__ == "__main__":
+    main()
 ```

 ## Async Client
@@ -81,31 +91,37 @@ The SDK also exports an `async` client so that you can make non-blocking calls t

 ```python
 import asyncio
-import os
-
 from cartesia import AsyncCartesia
-from cartesia.tts import OutputFormat_Raw, TtsRequestIdSpecifier
+import os

 client = AsyncCartesia(
-    api_key=os.getenv("CARTESIA_API_KEY"),
+    api_key=os.environ["CARTESIA_API_KEY"],
 )

-async def main() -> None:
-    async for output in client.tts.bytes(
-        model_id="sonic-2",
-        transcript="Hello, world!",
-        voice={"id": "694f9389-aac1-45b6-b726-9d9369183238"},
-        language="en",
-        output_format={
-            "container": "raw",
-            "sample_rate": 44100,
-            "encoding": "pcm_f32le",
-        },
-    ):
-        print(f"Received chunk of size: {len(output)}")

+async def main():
+    with open("sonic.wav", "wb") as f:
+        bytes_iter = client.tts.bytes(
+            model_id="sonic-3",
+            transcript="Hello, world!",
+            voice={
+                "mode": "id",
+                "id": "6ccbfb76-1fc6-48f7-b71d-91ac6298247b",
+            },
+            language="en",
+            output_format={
+                "container": "wav",
+                "sample_rate": 44100,
+                "encoding": "pcm_f32le",
+            },
+        )
+
+        async for chunk in bytes_iter:
+            f.write(chunk)

-asyncio.run(main())
+
+if __name__ == "__main__":
+    asyncio.run(main())
 ```

 ## Exception Handling
@@ -129,7 +145,6 @@ The SDK supports streaming responses as well, returning a generator that you can

 ```python
 from cartesia import Cartesia
-from cartesia.tts import Controls, OutputFormat_RawParams, TtsRequestIdSpecifierParams
 import os

 def get_tts_chunks():
@@ -137,14 +152,11 @@ def get_tts_chunks():
         api_key=os.getenv("CARTESIA_API_KEY"),
     )
     response = client.tts.sse(
-        model_id="sonic-2",
+        model_id="sonic-3",
         transcript="Hello world!",
         voice={
+            "mode": "id",
             "id": "f9836c6e-a0bd-460e-9d3c-f7299fa60f94",
-            "experimental_controls": {
-                "speed": "normal",
-                "emotion": [],
-            },
         },
         language="en",
         output_format={
@@ -188,9 +200,9 @@ ws = client.tts.websocket()

 # Generate and stream audio using the websocket
 for output in ws.send(
-    model_id="sonic-2", # see: https://docs.cartesia.ai/getting-started/available-models
+    model_id="sonic-3", # see: https://docs.cartesia.ai/build-with-cartesia/tts-models
     transcript=transcript,
-    voice={"id": voice_id},
+    voice={"mode": "id", "id": voice_id},
     stream=True,
     output_format={
         "container": "raw",
@@ -252,7 +264,7 @@ ws.send("done")
 for result in ws.receive():
     if result['type'] == 'transcript':
         print(f"Transcription: {result['text']}")
-
+
         # Handle word-level timestamps if available
         if 'words' in result and result['words']:
             print("Word-level timestamps:")
@@ -261,7 +273,7 @@ for result in ws.receive():
             start = word_info['start']
             end = word_info['end']
             print(f" '{word}': {start:.2f}s - {end:.2f}s")
-
+
         if result['is_final']:
             print("Final result received")
     elif result['type'] == 'done':
@@ -286,7 +298,7 @@ async def streaming_stt_example():
    and demonstrates the new endpointing and word timestamp features.
    """
    client = AsyncCartesia(api_key=os.getenv("CARTESIA_API_KEY"))
-
+
    try:
        # Create websocket connection with voice activity detection
        ws = await client.stt.websocket(
@@ -297,24 +309,24 @@ async def streaming_stt_example():
            min_volume=0.15,  # Volume threshold for voice activity detection
            max_silence_duration_secs=0.3,  # Maximum silence duration before endpointing
        )
-
+
        # Simulate streaming audio data (replace with your audio source)
        async def audio_stream():
            """Simulate real-time audio streaming - replace with actual audio capture"""
            # Load audio file for simulation
            with open("path/to/audio.wav", "rb") as f:
                audio_data = f.read()
-
+
            # Stream in 100ms chunks (realistic for real-time processing)
            chunk_size = int(16000 * 0.1 * 2)  # 100ms at 16kHz, 16-bit
-
+
            for i in range(0, len(audio_data), chunk_size):
                chunk = audio_data[i:i + chunk_size]
                if chunk:
                    yield chunk
                    # Simulate real-time streaming delay
                    await asyncio.sleep(0.1)
-
+
        # Send audio and receive results concurrently
        async def send_audio():
            """Send audio chunks to the STT websocket"""
@@ -324,31 +336,31 @@ async def streaming_stt_example():
                    print(f"Sent audio chunk of {len(chunk)} bytes")
                    # Small delay to simulate realtime applications
                    await asyncio.sleep(0.02)
-
+
                # Signal end of audio stream
                await ws.send("finalize")
                await ws.send("done")
                print("Audio streaming completed")
-
+
            except Exception as e:
                print(f"Error sending audio: {e}")
-
+
        async def receive_transcripts():
            """Receive and process transcription results with word timestamps"""
            full_transcript = ""
            all_word_timestamps = []
-
+
            try:
                async for result in ws.receive():
                    if result['type'] == 'transcript':
                        text = result['text']
                        is_final = result['is_final']
-
+
                        # Handle word-level timestamps
                        if 'words' in result and result['words']:
                            word_timestamps = result['words']
                            all_word_timestamps.extend(word_timestamps)
-
+
                            if is_final:
                                print("Word-level timestamps:")
                                for word_info in word_timestamps:
@@ -356,7 +368,7 @@ async def streaming_stt_example():
                                    start = word_info['start']
                                    end = word_info['end']
                                    print(f" '{word}': {start:.2f}s - {end:.2f}s")
-
+
                        if is_final:
                            # Final result - this text won't change
                            full_transcript += text + " "
@@ -364,30 +376,30 @@ async def streaming_stt_example():
                        else:
                            # Partial result - may change as more audio is processed
                            print(f"PARTIAL: {text}")
-
+
                    elif result['type'] == 'done':
                        print("Transcription completed")
                        break
-
+
            except Exception as e:
                print(f"Error receiving transcripts: {e}")
-
+
            return full_transcript.strip(), all_word_timestamps
-
+
        print("Starting streaming STT...")
-
+
        # Use asyncio.gather to run audio sending and transcript receiving concurrently
        _, (final_transcript, word_timestamps) = await asyncio.gather(
            send_audio(),
            receive_transcripts()
        )
-
+
        print(f"\nComplete transcript: {final_transcript}")
        print(f"Total words with timestamps: {len(word_timestamps)}")
-
+
        # Clean up
        await ws.close()
-
+
    except Exception as e:
        print(f"STT streaming error: {e}")
    finally:
@@ -442,7 +454,7 @@ import os

 async def transcribe_file():
     client = AsyncCartesia(api_key=os.getenv("CARTESIA_API_KEY"))
-
+
     with open("path/to/audio.wav", "rb") as audio_file:
         response = await client.stt.transcribe(
             file=audio_file,
@@ -450,14 +462,14 @@ async def transcribe_file():
             language="en",
             timestamp_granularities=["word"],
         )
-
+
     print(f"Transcribed text: {response.text}")
-
+
     # Process word timestamps
     if response.words:
         for word_info in response.words:
             print(f"'{word_info.word}': {word_info.start:.2f}s - {word_info.end:.2f}s")
-
+
     await client.close()

 asyncio.run(transcribe_file())
@@ -545,6 +557,7 @@ async def main():
        all_ends.extend(out.word_timestamps.end)  # End time for each word (seconds)

    await ws.close()
+   await client.close()

 asyncio.run(main())
 ```
@@ -663,6 +676,3 @@ $ git commit --amend -m "manually regenerate from docs" # optional

 From https://github.com/cartesia-ai/docs click `Actions` then `Release Python SDK`. (Requires permissions.)

-
-
-
{cartesia-2.0.9.dist-info → cartesia-2.0.13.dist-info}/RECORD CHANGED
@@ -1,4 +1,4 @@
-cartesia/__init__.py,sha256=P8YXd1NsmEHQOF4p0MpPMGLOSy_0cIPHOnFe-iV94oU,10311
+cartesia/__init__.py,sha256=bANRu5PeAnbf6O7MXltmngXPJa_G-xo7mck3sZy9B_Y,10463
 cartesia/api_status/__init__.py,sha256=_dHNLdknrBjxHtU2PvLumttJM-JTQhJQqhhAQkLqt_U,168
 cartesia/api_status/client.py,sha256=GJ9Dq8iCn3hn8vCIqc6k1fCGEhSz0T0kaPGcdFnbMDY,3146
 cartesia/api_status/requests/__init__.py,sha256=ilEMzEy1JEw484CuL92bX5lHGOznc62pjiDMgiZ0tKM,130
@@ -19,7 +19,7 @@ cartesia/base_client.py,sha256=igAZOMDXz2Nv58oXHa7I9UfgxVN48drqhEmfsCCQlg8,6701
 cartesia/client.py,sha256=LoJjlJW2kJA-hyDt-Wu7QuKQsiTiLQfLYZjsjtewPJM,6537
 cartesia/core/__init__.py,sha256=-t9txgeQZL_1FDw_08GEoj4ft1Cn9Dti6X0Drsadlr0,1519
 cartesia/core/api_error.py,sha256=RE8LELok2QCjABadECTvtDp7qejA1VmINCh6TbqPwSE,426
-cartesia/core/client_wrapper.py,sha256=ZeQ4DT3ZKd07u6qrZFu4Z1p2uw71eeQOlfgXpmz6b0A,1854
+cartesia/core/client_wrapper.py,sha256=AaBv3QuyR9s29eT23DbkL6YWcr6LjLQJTT0POZnzuTU,1855
 cartesia/core/datetime_utils.py,sha256=nBys2IsYrhPdszxGKCNRPSOCwa-5DWOHG95FB8G9PKo,1047
 cartesia/core/file.py,sha256=d4NNbX8XvXP32z8KpK2Xovv33nFfruIrpz0QWxlgpZk,2663
 cartesia/core/http_client.py,sha256=KL5RGa0y4n8nX0-07WRg4ZQUTq30sc-XJbWcP5vjBDg,19552
@@ -49,7 +49,7 @@ cartesia/embedding/types/__init__.py,sha256=aOrEOGuiO6dlSGu7pckqVMTYEMVAR5I7qqca
 cartesia/embedding/types/embedding.py,sha256=C1OJg8M4T1Apfcv4qx79ndftg0SgH4Lfqe_iU3UF-bA,1851
 cartesia/environment.py,sha256=Qnp91BGLic7hXmKsiYub2m3nPfvDWm59aB1wWta1J6A,160
 cartesia/infill/__init__.py,sha256=FTtvy8EDg9nNNg9WCatVgKTRYV8-_v1roeGPAKoa_pw,65
-cartesia/infill/client.py,sha256=uEDhE3Cx47ZyG7ofR-GOR0LhHiHeTLkUcjkLSsyU2ug,12563
+cartesia/infill/client.py,sha256=_S7DG_697mU9LanMuWePJthq1vFFt1DoIvmgyMXGzCY,12713
 cartesia/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 cartesia/stt/__init__.py,sha256=UHT5OM-5phGwLCckL8BXGdC3QepJoboScW5eSXUE2S4,1763
 cartesia/stt/_async_websocket.py,sha256=6MVYvSz3d9sI5-zzT_aIPEFKxXeCQU00RYFpYSF0dio,12385
@@ -74,25 +74,26 @@ cartesia/stt/types/timestamp_granularity.py,sha256=Oe39JvLeMgR2BIJnx32abhvs05dJe
 cartesia/stt/types/transcript_message.py,sha256=J-MchlahI96nVBiMSLJrEOXFw2pBShbMXVocysQRnrY,1693
 cartesia/stt/types/transcription_response.py,sha256=QMcD6eLmp_Z2uaRLVyxYYIdoiRiVSGhBoxN3kjRTK2I,1190
 cartesia/stt/types/transcription_word.py,sha256=yxTndKXNmToPOM6_F_QfF-B0dE6Kx8-UwBpHLj2_zWk,803
-cartesia/tts/__init__.py,sha256=DwNzIilOcdNUbeIHIknngnW8WyZ6K5xZremSQQoo5VM,4927
+cartesia/tts/__init__.py,sha256=KmlvJPusv7yRI3OmkEn3GlxqITbfewLVO5S0SJkqV5M,5079
 cartesia/tts/_async_websocket.py,sha256=YG0NJpfQU4j48Gy2riWu1ItelPFX-IUvSFD6eMBvfGM,19454
 cartesia/tts/_websocket.py,sha256=K93vHOdxhF4-Duk8xunNnIpvkAT_ztfAtaomD5im8c0,19247
-cartesia/tts/client.py,sha256=Oot_ctyaqBgRMpyBUaMwh3z1M62oPKVMXNvMkmo1fRw,18180
-cartesia/tts/requests/__init__.py,sha256=SeITRF5QSAjOE5pNxbD6VffwwttMnQwuv0Z5n9h7BKs,3418
+cartesia/tts/client.py,sha256=mJXJG9JliWHw7UYCCd9evIW3gaSr3JYORW606E8lHzU,19607
+cartesia/tts/requests/__init__.py,sha256=zS0ny3c6HXr2l6D9TiBmMuyp-tbVuszhBiOQ5RdcQyw,3502
 cartesia/tts/requests/cancel_context_request.py,sha256=Wl8g-o5vwl9ENm-H1wsLx441FkIR_4Wt5UYtuWce2Yw,431
-cartesia/tts/requests/controls.py,sha256=xzUJlfgqhaJ1A-JD0LTpoHYk4iEpCuGpSD7qE4YYsRg,285
-cartesia/tts/requests/generation_request.py,sha256=JQPumk0UMCHDQrcUvuqeDsdc8LCJAEolSs10LpJzK00,3083
-cartesia/tts/requests/mp_3_output_format.py,sha256=PGDVzC1d7-Jce12rFxtF8G1pTHmlUdiGAhykFTABg0w,316
+cartesia/tts/requests/controls.py,sha256=TkywdstN4X9odGF_HfN25zYXcCxaJS8Q0H1HR0nv_rg,316
+cartesia/tts/requests/generation_config.py,sha256=ZFed-oneBwyxkkI1DHmmvtYso7FjTYM01ApS1omr1ms,917
+cartesia/tts/requests/generation_request.py,sha256=rZVpfwUzSea72b5gqPY47Fgunu_IJQM2PiVNHqCR9Jk,3214
+cartesia/tts/requests/mp_3_output_format.py,sha256=HBM6452KdWD9tGa9QXNyUZcH1OlJrXt_PIwo2Jt3l2Q,441
 cartesia/tts/requests/output_format.py,sha256=8TKu9AAeHCR5L4edzYch8FIYIldn4bM7ySrsCl8W_g8,842
 cartesia/tts/requests/phoneme_timestamps.py,sha256=ft81nmqElZAnvTBT27lY6YWfF18ZGsCx3Y1XHv9J7cM,267
-cartesia/tts/requests/raw_output_format.py,sha256=S60Vp7DeAATCMLF3bXgxhw0zILJBWJ9GhI9irAg_UkI,316
+cartesia/tts/requests/raw_output_format.py,sha256=WigDQlM_YkLk_-GK1_pNseGq8g-_POO84Su7jqSLsHQ,441
 cartesia/tts/requests/speed.py,sha256=-YGBWwh7_VtCBnYlT5EVsnrmcHFMEBTxy9LathZhkMA,259
-cartesia/tts/requests/sse_output_format.py,sha256=z_f7dlDYNvpheYOSnf3lOslHF40vS852pYkxHTpqAcc,293
-cartesia/tts/requests/tts_request.py,sha256=KBoahYfPbDENlEWsqnR4z1ZIhGIJwhLrzQIzkbtqtzE,1021
+cartesia/tts/requests/sse_output_format.py,sha256=dsRyxFCD3Qt3hTppxV7HJhphPx3jTkZhryMXUP-Soc8,417
+cartesia/tts/requests/tts_request.py,sha256=CUFMg_U2BhJQAxrqLAv4tfxAN326ItiCi0fQfJFi4lU,1152
 cartesia/tts/requests/tts_request_embedding_specifier.py,sha256=-M54ZjV0H5LPwcKtz0bOVqlkvO1pPiMbqMbVBMko3Ns,565
 cartesia/tts/requests/tts_request_id_specifier.py,sha256=-0ClfyJnnaH0uAcF5r84s3cM_cw2wT39dp6T4JYzOQ8,536
 cartesia/tts/requests/tts_request_voice_specifier.py,sha256=eGzL4aVGq4gKPxeglsV7-wuhxg8x33Qth3uFTTytgeI,337
-cartesia/tts/requests/ttssse_request.py,sha256=S8EkuEtveOetkcydinfLr5lS66PYpLQTNesyRIf_LwI,2007
+cartesia/tts/requests/ttssse_request.py,sha256=IZ4Urm23VQBhuJmA8CqZegZnTVIBqfZWQ9ve2vy2gXc,2138
 cartesia/tts/requests/wav_output_format.py,sha256=qiipmT5hWsa8J-fwW1EH_rnUAX_zOUpGJUNzuLc65r4,181
 cartesia/tts/requests/web_socket_base_response.py,sha256=zCjHw-FaNJMOcHiAb2NQWrBBfrzU5rc95vqDp7y9RmA,315
 cartesia/tts/requests/web_socket_chunk_response.py,sha256=4fVPJH-ZZb8lJKwqyYGx5wyeYWzfuThGxMRXC6ku4bA,233
@@ -100,38 +101,40 @@ cartesia/tts/requests/web_socket_done_response.py,sha256=YLHrT6NkmDntBSxF-JGlXSa
 cartesia/tts/requests/web_socket_error_response.py,sha256=ek2O5Whlzn5Ma40NhYviVl3aJBVeCA8BBvbJPUYxEiQ,213
 cartesia/tts/requests/web_socket_flush_done_response.py,sha256=gP3fSWhEFVzdzBweUmVKo7JvdREW3TM9R6o9-u6V6FQ,282
 cartesia/tts/requests/web_socket_phoneme_timestamps_response.py,sha256=nDRK7wo4s6R7ayJrw-LJX9WCaW4mti0HAV4X5j7cxjI,370
-cartesia/tts/requests/web_socket_raw_output_format.py,sha256=9BJHE5l5bzmYCYuUoACRhbZdJBijnSiwkbR8K4EzPDY,302
+cartesia/tts/requests/web_socket_raw_output_format.py,sha256=dcpXwOrCkB369pJ1AFOCwa5XgAFPUh9xEojrerH52bM,426
 cartesia/tts/requests/web_socket_request.py,sha256=5xfE0NgkBEZdus_vC-3RVQkuqhNmXHxLMX4DW3ezcKc,290
 cartesia/tts/requests/web_socket_response.py,sha256=kS46YN94ilUn4qjpt1TpauZApe0N8PpAefT87jFiusY,2079
 cartesia/tts/requests/web_socket_stream_options.py,sha256=VIvblFw9hGZvDzFpOnC11G0NvrFSVt-1-0sY5rpcZPI,232
 cartesia/tts/requests/web_socket_timestamps_response.py,sha256=MK3zN2Q_PVWJtX5DidNB0uXoF2o33rv6qCYPVaourxY,351
 cartesia/tts/requests/web_socket_tts_output.py,sha256=pX2uf0XVdziFhXCydwLlVOWb-LvBiuq-cBI6R1INiMg,760
-cartesia/tts/requests/web_socket_tts_request.py,sha256=1jdRjRAO7z-KLOyp8FcDoQh933RGt-ZPR3E8Vz3XPnQ,1795
+cartesia/tts/requests/web_socket_tts_request.py,sha256=9IqZKwM8YSDoDqYNPQ6DrcRGfuaAExD0KIPC0Ptaq1U,1926
 cartesia/tts/requests/word_timestamps.py,sha256=WMfBJtETi6wTpES0pYZCFfFRfEbzWE-RtosDJ5seUWg,261
 cartesia/tts/socket_client.py,sha256=zTPayHbgy-yQQ50AE1HXN4GMyanisZcLXf7Ds1paYks,11621
-cartesia/tts/types/__init__.py,sha256=rXphJ9b9nSYYrepr2ssG6ghtQAOQBQcLegxbl-XG3tw,3438
+cartesia/tts/types/__init__.py,sha256=VsVhynuJM_G3zHAzkAtB8M6eK_tq0Pa76FOAiulbRBc,3585
 cartesia/tts/types/cancel_context_request.py,sha256=zInhk3qRZsSc0F1aYJ-Q5BHJsosTrb22IJWhzue-eKE,856
 cartesia/tts/types/context_id.py,sha256=UCEtq5xFGOeBCECcY6Y-gYVe_Peg1hFhH9YYOkpApQg,81
-cartesia/tts/types/controls.py,sha256=H4CSu79mM1Ld4NZx_5uXw3EwRzTEMQRxKBRvFpcFb8Y,644
-cartesia/tts/types/emotion.py,sha256=zocyDcHTiFFnNRgo2YLMi70iGyffa080B4mkg9lcqVc,764
+cartesia/tts/types/controls.py,sha256=SxeSPZ4KgvRiUawOUI9mycASv6ekQ11vZYKOMtZz5TU,675
+cartesia/tts/types/emotion.py,sha256=N5E5Tf7L9tHcH-MB5fDPEFusotygu85ybEc-YeslVjc,79
+cartesia/tts/types/emotion_deprecated.py,sha256=WQuI5pXbzgpNq4kT14NMfukCJPN58GbmTtPScMMLy4I,774
 cartesia/tts/types/flush_id.py,sha256=HCIKo9o8d7YWKtaSNU3TEvfUVBju93ckGQy01Z9wLcE,79
-cartesia/tts/types/generation_request.py,sha256=ZGVXmHZLaZg7kEg1cVGXLpr8uB3btr2eZt0NEJRZnSU,3582
+cartesia/tts/types/generation_config.py,sha256=lIb52e8Ua777uvFnFTYn1NghxpzSTMC4QmDlV1cturU,1332
+cartesia/tts/types/generation_request.py,sha256=qO7XKzvwIp8Foglv5_1DJL1pCZLVyea0fQ0oKJw0fGw,3694
 cartesia/tts/types/model_speed.py,sha256=iiTj8V0piFCX2FZh5B8EkgRhZDlj4z3VFcQhp66e7y8,160
-cartesia/tts/types/mp_3_output_format.py,sha256=0WGblkuDUL7pZO1aRuQ_mU2Z5gN9xIabRfRKkjtzms8,731
+cartesia/tts/types/mp_3_output_format.py,sha256=LQ1-rEYjkK6XXWoj_Z7bezsguPpNI_SmprlIipsyNMI,875
 cartesia/tts/types/natural_specifier.py,sha256=K526P1RRuBGy80hyd_tX8tohPrE8DR9EgTCxS5wce0o,188
 cartesia/tts/types/numerical_specifier.py,sha256=tJpIskWO545luCKMFM9JlVc7VVhBhSvqL1qurhzL9cI,92
 cartesia/tts/types/output_format.py,sha256=bi9iZVQKmddTw6RjNKG9XAVrgEB7JVNsBS_emFLlGLs,1736
 cartesia/tts/types/phoneme_timestamps.py,sha256=SrhPmE7-1-bCVi4qCgMU7QR9ezkwUfqsWfZ2PchzwN0,637
 cartesia/tts/types/raw_encoding.py,sha256=eyc2goiYOTxWcuKHAgYZ2SrnfePW22Fbmc-5fGPlV2Y,186
-cartesia/tts/types/raw_output_format.py,sha256=jZGVaS0KIi9mU6trfskgA3HbMKJolhrwICnuDhF01ic,673
+cartesia/tts/types/raw_output_format.py,sha256=ir5QxW986P8qB14pMD5PVsAgc0bdC37i7O8JipS1svA,817
 cartesia/tts/types/speed.py,sha256=4c5WdxocBw6WSMnundSaNnceUeooU0vikhy00FW6M-w,239
-cartesia/tts/types/sse_output_format.py,sha256=tRb4VcYqoPJMDyjfTZMCRTblT2NjwIsQhy1oMjxQWW0,676
+cartesia/tts/types/sse_output_format.py,sha256=1_GB3rftQYAsXO6WrgQmzr-tsjCntHCVgKeTjay7M9g,819
 cartesia/tts/types/supported_language.py,sha256=riDRduThMbMWAq9i2uCfxhwVTpgaFwNDZ9LhEIl4zHY,237
-cartesia/tts/types/tts_request.py,sha256=FGcxW-siiQpEzJZSHMET3nDSYHSzRt3WSTO-cCEz9u4,1376
+cartesia/tts/types/tts_request.py,sha256=TkngMxyGKnjQvIG5u4qFx9TKcohuLyjI1UeXv8xbj2U,1488
 cartesia/tts/types/tts_request_embedding_specifier.py,sha256=eL_qCEr4pvWfy4qp9hZBuVdCincX5DBVqfv1vLt2_Vk,942
 cartesia/tts/types/tts_request_id_specifier.py,sha256=ktGdkkTRQ9scA-lt8qJ2jn_E5WzoOK8AXMrVqi71gf0,906
 cartesia/tts/types/tts_request_voice_specifier.py,sha256=p-3UQ62uFL1SgbX73Ex1D_V73Ef0wmT1ApOt1iLZmwE,307
-cartesia/tts/types/ttssse_request.py,sha256=6KvDQYzetwbFOVvkMWDj94Biz08EZaiX6V1lChsy49U,2423
+cartesia/tts/types/ttssse_request.py,sha256=QZa0LOwhtsxLFaTxCGA0EzMOYqp7tVu-ezmu-ibcmiA,2535
 cartesia/tts/types/wav_output_format.py,sha256=OTAgVn_gBMk252XO12kiNI9lKrbw3n38aBAiqlG5mdU,531
 cartesia/tts/types/web_socket_base_response.py,sha256=MWoTt1rGRqUQ8BOad1Zk2SA-i0E8a3JwPLSiehIbFj4,672
 cartesia/tts/types/web_socket_chunk_response.py,sha256=VOPXAlyGFdnfC69KxqDWDo1PPMydvQKmAypoWfbW8_s,593
@@ -139,20 +142,20 @@ cartesia/tts/types/web_socket_done_response.py,sha256=zZ6V-_pKNifdyuuRHGlZe6Zbc-
 cartesia/tts/types/web_socket_error_response.py,sha256=Jm26GnK0axyLQI3-JLHC0buYVIU8gKWxLAJlzo-cJFQ,573
 cartesia/tts/types/web_socket_flush_done_response.py,sha256=JLiVPDftr1arl_Kvj6038yj0mnjq6x0ooihsbdXajfw,635
 cartesia/tts/types/web_socket_phoneme_timestamps_response.py,sha256=R1-Z_W3XF7L7rrPwEOK_EfXHT4FWRpSAX3g71WebM90,686
-cartesia/tts/types/web_socket_raw_output_format.py,sha256=9PiOVmPDfT32IDIsmU7UY_rTLOShMMEw1pNv2yZ9Kyg,685
+cartesia/tts/types/web_socket_raw_output_format.py,sha256=O9ay_TwnMs4r_D_Cml6lBJ_2BwnHHo18boIXYI4wTr0,828
 cartesia/tts/types/web_socket_request.py,sha256=_xoAShkCCNTVAWKCvHw5k0Wisq60y4fOWYjG7SA8edM,260
 cartesia/tts/types/web_socket_response.py,sha256=fUQbJ6yFzZbzUZPuQWgkFdzP8-FMiKTcya-DIPWjimY,3777
 cartesia/tts/types/web_socket_stream_options.py,sha256=MhDSxBFqMuQeWjoyPqXVnTEzLjF8g6aojeigb5dQUgU,596
 cartesia/tts/types/web_socket_timestamps_response.py,sha256=kuWXI82ncF1QapnaHEjwrL84qWob7ByQU-yh1e0IEmk,667
 cartesia/tts/types/web_socket_tts_output.py,sha256=uvkv0smTBhdm18Rl17C0Ml4Inh79YBHNzAcKnZBs14Y,979
-cartesia/tts/types/web_socket_tts_request.py,sha256=Gx8kSINX__VhQ3In3R1-4fq0bfjaMe7iL-M8nDNt0fQ,2150
+cartesia/tts/types/web_socket_tts_request.py,sha256=mBVFoOdZDlxm2cQbmPTHgQjENfM4xhm_DywlTm5OtGI,2262
 cartesia/tts/types/word_timestamps.py,sha256=XZ2Q0prdb3F9c3AiOKXu4s3A3jBxE-qIt1npHOf16R0,631
 cartesia/tts/utils/constants.py,sha256=1CHa5flJf8--L_eYyOyOiWJNZ-Q81ufHZxDbJs8xYSk,418
 cartesia/tts/utils/tts.py,sha256=u7PgPxlJs6fcQTfr-jqAvBCAaK3JWLhF5QF4s-PwoMo,2093
 cartesia/tts/utils/types.py,sha256=DtsiRwrYypXScLu71gNyprUiELuR1l_-ikVaj47gpg4,2047
 cartesia/version.py,sha256=xk5z2FYkgnvzyjqzmRg67rYl8fnCeHEjPpVmD08bjyE,75
 cartesia/voice_changer/__init__.py,sha256=UKA8CSAwUb41OL-dcWWUhIsKLLsyY_NQtrklPAVWf9E,685
-cartesia/voice_changer/client.py,sha256=w3Z3A-92Fu5k9NRrfdn7Gu2nqmOONL-xLCHknZhkANY,13509
+cartesia/voice_changer/client.py,sha256=CjTuptyKNpviBB21fsobBqQSn08WuYCgC6gXAxNPCUI,13755
 cartesia/voice_changer/requests/__init__.py,sha256=MRwSUqio3mg_tvfcpAS0wIZ69HvJsc2kYJ0tUBaJ53U,390
 cartesia/voice_changer/requests/streaming_response.py,sha256=lbo7CJeuh0f5hXT4lKG_sDUZDLJWaLqxcwCuSf1IbMQ,982
 cartesia/voice_changer/types/__init__.py,sha256=qAiHsdRpnFeS0lBkYp_NRrhSJiRXCg5-uFibqDWzYVU,430
@@ -198,7 +201,7 @@ cartesia/voices/types/voice_expand_options.py,sha256=e4FroWdlxEE-LXQfT1RWlGHtswl
 cartesia/voices/types/voice_id.py,sha256=GDoXcRVeIm-V21R4suxG2zqLD3DLYkXE9kgizadzFKo,79
 cartesia/voices/types/voice_metadata.py,sha256=4KNGjXMUKm3niv-NvKIFVGtiilpH13heuzKcZYNQxk4,1181
 cartesia/voices/types/weight.py,sha256=XqDU7_JItNUb5QykIDqTbELlRYQdbt2SviRgW0w2LKo,80
-cartesia-2.0.9.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
-cartesia-2.0.9.dist-info/METADATA,sha256=GG86uKWZW4iX4S3p-QOsgPVv7yGStCHj6qHq97e9V6Q,20804
-cartesia-2.0.9.dist-info/WHEEL,sha256=Zb28QaM1gQi8f4VCBhsUklF61CTlNYfs9YAZn-TOGFk,88
-cartesia-2.0.9.dist-info/RECORD,,
+cartesia-2.0.13.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+cartesia-2.0.13.dist-info/METADATA,sha256=NB9AzRg9IZKIC_ePwsUwBlaKG3X6kFQkObYJ8McqjMQ,20671
+cartesia-2.0.13.dist-info/WHEEL,sha256=Zb28QaM1gQi8f4VCBhsUklF61CTlNYfs9YAZn-TOGFk,88
+cartesia-2.0.13.dist-info/RECORD,,