cartesia 2.0.0b7__py3-none-any.whl → 2.0.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cartesia/__init__.py +15 -1
- cartesia/auth/__init__.py +13 -0
- cartesia/auth/client.py +159 -0
- cartesia/auth/requests/__init__.py +7 -0
- cartesia/auth/requests/token_grant.py +10 -0
- cartesia/auth/requests/token_request.py +17 -0
- cartesia/auth/requests/token_response.py +10 -0
- cartesia/auth/types/__init__.py +7 -0
- cartesia/auth/types/token_grant.py +22 -0
- cartesia/auth/types/token_request.py +28 -0
- cartesia/auth/types/token_response.py +22 -0
- cartesia/base_client.py +4 -0
- cartesia/core/client_wrapper.py +1 -1
- cartesia/tts/_async_websocket.py +8 -0
- cartesia/tts/_websocket.py +11 -0
- cartesia/tts/client.py +40 -4
- cartesia/tts/requests/generation_request.py +19 -1
- cartesia/tts/requests/tts_request.py +10 -1
- cartesia/tts/requests/web_socket_tts_request.py +3 -1
- cartesia/tts/types/generation_request.py +19 -1
- cartesia/tts/types/tts_request.py +10 -1
- cartesia/tts/types/web_socket_tts_request.py +3 -1
- cartesia/voices/__init__.py +6 -0
- cartesia/voices/client.py +208 -159
- cartesia/voices/requests/create_voice_request.py +2 -0
- cartesia/voices/requests/localize_dialect.py +6 -1
- cartesia/voices/requests/localize_voice_request.py +15 -2
- cartesia/voices/types/__init__.py +6 -0
- cartesia/voices/types/create_voice_request.py +2 -0
- cartesia/voices/types/localize_dialect.py +6 -1
- cartesia/voices/types/localize_french_dialect.py +5 -0
- cartesia/voices/types/localize_portuguese_dialect.py +5 -0
- cartesia/voices/types/localize_spanish_dialect.py +5 -0
- cartesia/voices/types/localize_voice_request.py +16 -3
- {cartesia-2.0.0b7.dist-info → cartesia-2.0.2.dist-info}/METADATA +68 -63
- {cartesia-2.0.0b7.dist-info → cartesia-2.0.2.dist-info}/RECORD +37 -24
- {cartesia-2.0.0b7.dist-info → cartesia-2.0.2.dist-info}/WHEEL +0 -0
@@ -10,7 +10,7 @@ from .output_format import OutputFormatParams
|
|
10
10
|
class TtsRequestParams(typing_extensions.TypedDict):
|
11
11
|
model_id: str
|
12
12
|
"""
|
13
|
-
The ID of the model to use for the generation. See [Models](/build-with-
|
13
|
+
The ID of the model to use for the generation. See [Models](/build-with-cartesia/models) for available models.
|
14
14
|
"""
|
15
15
|
|
16
16
|
transcript: str
|
@@ -22,3 +22,12 @@ class TtsRequestParams(typing_extensions.TypedDict):
|
|
22
22
|
The maximum duration of the audio in seconds. You do not usually need to specify this.
|
23
23
|
If the duration is not appropriate for the length of the transcript, the output audio may be truncated.
|
24
24
|
"""
|
25
|
+
|
26
|
+
text_cfg: typing_extensions.NotRequired[float]
|
27
|
+
"""
|
28
|
+
The text [classifier-free guidance](https://arxiv.org/abs/2207.12598) value for the request.
|
29
|
+
|
30
|
+
Higher values cause the model to attend more to the text but speed up the generation. Lower values reduce the speaking rate but can increase the risk of hallucinations. The default value is `3.0`. For a slower speaking rate, we recommend values between `2.0` and `3.0`. Values are supported between `1.5` and `3.0`.
|
31
|
+
|
32
|
+
This parameter is only supported for `sonic-2` models.
|
33
|
+
"""
|
@@ -10,7 +10,7 @@ from ...core.serialization import FieldMetadata
|
|
10
10
|
class WebSocketTtsRequestParams(typing_extensions.TypedDict):
|
11
11
|
model_id: str
|
12
12
|
"""
|
13
|
-
The ID of the model to use for the generation. See [Models](/build-with-
|
13
|
+
The ID of the model to use for the generation. See [Models](/build-with-cartesia/models) for available models.
|
14
14
|
"""
|
15
15
|
|
16
16
|
output_format: typing_extensions.NotRequired[OutputFormatParams]
|
@@ -23,3 +23,5 @@ class WebSocketTtsRequestParams(typing_extensions.TypedDict):
|
|
23
23
|
add_phoneme_timestamps: typing_extensions.NotRequired[bool]
|
24
24
|
continue_: typing_extensions.NotRequired[typing_extensions.Annotated[bool, FieldMetadata(alias="continue")]]
|
25
25
|
context_id: typing_extensions.NotRequired[str]
|
26
|
+
max_buffer_delay_ms: typing_extensions.NotRequired[int]
|
27
|
+
text_cfg: typing_extensions.NotRequired[float]
|
@@ -15,7 +15,7 @@ from ...core.pydantic_utilities import IS_PYDANTIC_V2
|
|
15
15
|
class GenerationRequest(UniversalBaseModel):
|
16
16
|
model_id: str = pydantic.Field()
|
17
17
|
"""
|
18
|
-
The ID of the model to use for the generation. See [Models](/build-with-
|
18
|
+
The ID of the model to use for the generation. See [Models](/build-with-cartesia/models) for available models.
|
19
19
|
"""
|
20
20
|
|
21
21
|
transcript: typing.Optional[typing.Any] = pydantic.Field(default=None)
|
@@ -32,6 +32,15 @@ class GenerationRequest(UniversalBaseModel):
|
|
32
32
|
If the duration is not appropriate for the length of the transcript, the output audio may be truncated.
|
33
33
|
"""
|
34
34
|
|
35
|
+
text_cfg: typing.Optional[float] = pydantic.Field(default=None)
|
36
|
+
"""
|
37
|
+
The text [classifier-free guidance](https://arxiv.org/abs/2207.12598) value for the request.
|
38
|
+
|
39
|
+
Higher values cause the model to attend more to the text but speed up the generation. Lower values reduce the speaking rate but can increase the risk of hallucinations. The default value is `3.0`. For a slower speaking rate, we recommend values between `2.0` and `3.0`. Values are supported between `1.5` and `3.0`.
|
40
|
+
|
41
|
+
This parameter is only supported for `sonic-2` models.
|
42
|
+
"""
|
43
|
+
|
35
44
|
context_id: typing.Optional[ContextId] = None
|
36
45
|
continue_: typing_extensions.Annotated[typing.Optional[bool], FieldMetadata(alias="continue")] = pydantic.Field(
|
37
46
|
default=None
|
@@ -41,6 +50,15 @@ class GenerationRequest(UniversalBaseModel):
|
|
41
50
|
If not specified, this defaults to `false`.
|
42
51
|
"""
|
43
52
|
|
53
|
+
max_buffer_delay_ms: typing.Optional[int] = pydantic.Field(default=None)
|
54
|
+
"""
|
55
|
+
The maximum time in milliseconds to buffer text before starting generation. Values between [0, 1000]ms are supported. Defaults to 0 (no buffering).
|
56
|
+
|
57
|
+
When set, the model will buffer incoming text chunks until it's confident it has enough context to generate high-quality speech, or the buffer delay elapses, whichever comes first. Without this option set, the model will kick off generations immediately, ceding control of buffering to the user.
|
58
|
+
|
59
|
+
Use this to balance responsiveness with higher quality speech generation, which often benefits from having more context.
|
60
|
+
"""
|
61
|
+
|
44
62
|
flush: typing.Optional[bool] = pydantic.Field(default=None)
|
45
63
|
"""
|
46
64
|
Whether to flush the context.
|
@@ -12,7 +12,7 @@ from ...core.pydantic_utilities import IS_PYDANTIC_V2
|
|
12
12
|
class TtsRequest(UniversalBaseModel):
|
13
13
|
model_id: str = pydantic.Field()
|
14
14
|
"""
|
15
|
-
The ID of the model to use for the generation. See [Models](/build-with-
|
15
|
+
The ID of the model to use for the generation. See [Models](/build-with-cartesia/models) for available models.
|
16
16
|
"""
|
17
17
|
|
18
18
|
transcript: str
|
@@ -25,6 +25,15 @@ class TtsRequest(UniversalBaseModel):
|
|
25
25
|
If the duration is not appropriate for the length of the transcript, the output audio may be truncated.
|
26
26
|
"""
|
27
27
|
|
28
|
+
text_cfg: typing.Optional[float] = pydantic.Field(default=None)
|
29
|
+
"""
|
30
|
+
The text [classifier-free guidance](https://arxiv.org/abs/2207.12598) value for the request.
|
31
|
+
|
32
|
+
Higher values cause the model to attend more to the text but speed up the generation. Lower values reduce the speaking rate but can increase the risk of hallucinations. The default value is `3.0`. For a slower speaking rate, we recommend values between `2.0` and `3.0`. Values are supported between `1.5` and `3.0`.
|
33
|
+
|
34
|
+
This parameter is only supported for `sonic-2` models.
|
35
|
+
"""
|
36
|
+
|
28
37
|
if IS_PYDANTIC_V2:
|
29
38
|
model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2
|
30
39
|
else:
|
@@ -13,7 +13,7 @@ from ...core.pydantic_utilities import IS_PYDANTIC_V2
|
|
13
13
|
class WebSocketTtsRequest(UniversalBaseModel):
|
14
14
|
model_id: str = pydantic.Field()
|
15
15
|
"""
|
16
|
-
The ID of the model to use for the generation. See [Models](/build-with-
|
16
|
+
The ID of the model to use for the generation. See [Models](/build-with-cartesia/models) for available models.
|
17
17
|
"""
|
18
18
|
|
19
19
|
output_format: typing.Optional[OutputFormat] = None
|
@@ -26,6 +26,8 @@ class WebSocketTtsRequest(UniversalBaseModel):
|
|
26
26
|
add_phoneme_timestamps: typing.Optional[bool] = None
|
27
27
|
continue_: typing_extensions.Annotated[typing.Optional[bool], FieldMetadata(alias="continue")] = None
|
28
28
|
context_id: typing.Optional[str] = None
|
29
|
+
max_buffer_delay_ms: typing.Optional[int] = None
|
30
|
+
text_cfg: typing.Optional[float] = None
|
29
31
|
|
30
32
|
if IS_PYDANTIC_V2:
|
31
33
|
model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2
|
cartesia/voices/__init__.py
CHANGED
@@ -12,6 +12,9 @@ from .types import (
|
|
12
12
|
IdSpecifier,
|
13
13
|
LocalizeDialect,
|
14
14
|
LocalizeEnglishDialect,
|
15
|
+
LocalizeFrenchDialect,
|
16
|
+
LocalizePortugueseDialect,
|
17
|
+
LocalizeSpanishDialect,
|
15
18
|
LocalizeTargetLanguage,
|
16
19
|
LocalizeVoiceRequest,
|
17
20
|
MixVoiceSpecifier,
|
@@ -56,6 +59,9 @@ __all__ = [
|
|
56
59
|
"LocalizeDialect",
|
57
60
|
"LocalizeDialectParams",
|
58
61
|
"LocalizeEnglishDialect",
|
62
|
+
"LocalizeFrenchDialect",
|
63
|
+
"LocalizePortugueseDialect",
|
64
|
+
"LocalizeSpanishDialect",
|
59
65
|
"LocalizeTargetLanguage",
|
60
66
|
"LocalizeVoiceRequest",
|
61
67
|
"LocalizeVoiceRequestParams",
|