PyPI - cartesia - Versions diffs - 2.0.0b7__py3-none-any.whl → 2.0.2__py3-none-any.whl - Mend

cartesia 2.0.0b7__py3-none-any.whl → 2.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (37) hide show

cartesia/__init__.py +15 -1
cartesia/auth/__init__.py +13 -0
cartesia/auth/client.py +159 -0
cartesia/auth/requests/__init__.py +7 -0
cartesia/auth/requests/token_grant.py +10 -0
cartesia/auth/requests/token_request.py +17 -0
cartesia/auth/requests/token_response.py +10 -0
cartesia/auth/types/__init__.py +7 -0
cartesia/auth/types/token_grant.py +22 -0
cartesia/auth/types/token_request.py +28 -0
cartesia/auth/types/token_response.py +22 -0
cartesia/base_client.py +4 -0
cartesia/core/client_wrapper.py +1 -1
cartesia/tts/_async_websocket.py +8 -0
cartesia/tts/_websocket.py +11 -0
cartesia/tts/client.py +40 -4
cartesia/tts/requests/generation_request.py +19 -1
cartesia/tts/requests/tts_request.py +10 -1
cartesia/tts/requests/web_socket_tts_request.py +3 -1
cartesia/tts/types/generation_request.py +19 -1
cartesia/tts/types/tts_request.py +10 -1
cartesia/tts/types/web_socket_tts_request.py +3 -1
cartesia/voices/__init__.py +6 -0
cartesia/voices/client.py +208 -159
cartesia/voices/requests/create_voice_request.py +2 -0
cartesia/voices/requests/localize_dialect.py +6 -1
cartesia/voices/requests/localize_voice_request.py +15 -2
cartesia/voices/types/__init__.py +6 -0
cartesia/voices/types/create_voice_request.py +2 -0
cartesia/voices/types/localize_dialect.py +6 -1
cartesia/voices/types/localize_french_dialect.py +5 -0
cartesia/voices/types/localize_portuguese_dialect.py +5 -0
cartesia/voices/types/localize_spanish_dialect.py +5 -0
cartesia/voices/types/localize_voice_request.py +16 -3
{cartesia-2.0.0b7.dist-info → cartesia-2.0.2.dist-info}/METADATA +68 -63
{cartesia-2.0.0b7.dist-info → cartesia-2.0.2.dist-info}/RECORD +37 -24
{cartesia-2.0.0b7.dist-info → cartesia-2.0.2.dist-info}/WHEEL +0 -0

cartesia/tts/requests/tts_request.py CHANGED Viewed

@@ -10,7 +10,7 @@ from .output_format import OutputFormatParams
 class TtsRequestParams(typing_extensions.TypedDict):
     model_id: str
     """
-    The ID of the model to use for the generation. See [Models](/build-with-sonic/models) for available models.
+    The ID of the model to use for the generation. See [Models](/build-with-cartesia/models) for available models.
     """
     transcript: str
@@ -22,3 +22,12 @@ class TtsRequestParams(typing_extensions.TypedDict):
     The maximum duration of the audio in seconds. You do not usually need to specify this.
     If the duration is not appropriate for the length of the transcript, the output audio may be truncated.
     """
+    text_cfg: typing_extensions.NotRequired[float]
+    """
+    The text [classifier-free guidance](https://arxiv.org/abs/2207.12598) value for the request.
+    Higher values cause the model to attend more to the text but speed up the generation. Lower values reduce the speaking rate but can increase the risk of hallucinations. The default value is `3.0`. For a slower speaking rate, we recommend values between `2.0` and `3.0`. Values are supported between `1.5` and `3.0`.
+    This parameter is only supported for `sonic-2` models.
+    """

cartesia/tts/requests/web_socket_tts_request.py CHANGED Viewed

@@ -10,7 +10,7 @@ from ...core.serialization import FieldMetadata
 class WebSocketTtsRequestParams(typing_extensions.TypedDict):
     model_id: str
     """
-    The ID of the model to use for the generation. See [Models](/build-with-sonic/models) for available models.
+    The ID of the model to use for the generation. See [Models](/build-with-cartesia/models) for available models.
     """
     output_format: typing_extensions.NotRequired[OutputFormatParams]
@@ -23,3 +23,5 @@ class WebSocketTtsRequestParams(typing_extensions.TypedDict):
     add_phoneme_timestamps: typing_extensions.NotRequired[bool]
     continue_: typing_extensions.NotRequired[typing_extensions.Annotated[bool, FieldMetadata(alias="continue")]]
     context_id: typing_extensions.NotRequired[str]
+    max_buffer_delay_ms: typing_extensions.NotRequired[int]
+    text_cfg: typing_extensions.NotRequired[float]

cartesia/tts/types/generation_request.py CHANGED Viewed

@@ -15,7 +15,7 @@ from ...core.pydantic_utilities import IS_PYDANTIC_V2
 class GenerationRequest(UniversalBaseModel):
     model_id: str = pydantic.Field()
     """
-    The ID of the model to use for the generation. See [Models](/build-with-sonic/models) for available models.
+    The ID of the model to use for the generation. See [Models](/build-with-cartesia/models) for available models.
     """
     transcript: typing.Optional[typing.Any] = pydantic.Field(default=None)
@@ -32,6 +32,15 @@ class GenerationRequest(UniversalBaseModel):
     If the duration is not appropriate for the length of the transcript, the output audio may be truncated.
     """
+    text_cfg: typing.Optional[float] = pydantic.Field(default=None)
+    """
+    The text [classifier-free guidance](https://arxiv.org/abs/2207.12598) value for the request.
+    Higher values cause the model to attend more to the text but speed up the generation. Lower values reduce the speaking rate but can increase the risk of hallucinations. The default value is `3.0`. For a slower speaking rate, we recommend values between `2.0` and `3.0`. Values are supported between `1.5` and `3.0`.
+    This parameter is only supported for `sonic-2` models.
+    """
     context_id: typing.Optional[ContextId] = None
     continue_: typing_extensions.Annotated[typing.Optional[bool], FieldMetadata(alias="continue")] = pydantic.Field(
         default=None
@@ -41,6 +50,15 @@ class GenerationRequest(UniversalBaseModel):
     If not specified, this defaults to `false`.
     """
+    max_buffer_delay_ms: typing.Optional[int] = pydantic.Field(default=None)
+    """
+    The maximum time in milliseconds to buffer text before starting generation. Values between [0, 1000]ms are supported. Defaults to 0 (no buffering).
+    When set, the model will buffer incoming text chunks until it's confident it has enough context to generate high-quality speech, or the buffer delay elapses, whichever comes first. Without this option set, the model will kick off generations immediately, ceding control of buffering to the user.
+    Use this to balance responsiveness with higher quality speech generation, which often benefits from having more context.
+    """
     flush: typing.Optional[bool] = pydantic.Field(default=None)
     """
     Whether to flush the context.

cartesia/tts/types/tts_request.py CHANGED Viewed

@@ -12,7 +12,7 @@ from ...core.pydantic_utilities import IS_PYDANTIC_V2
 class TtsRequest(UniversalBaseModel):
     model_id: str = pydantic.Field()
     """
-    The ID of the model to use for the generation. See [Models](/build-with-sonic/models) for available models.
+    The ID of the model to use for the generation. See [Models](/build-with-cartesia/models) for available models.
     """
     transcript: str
@@ -25,6 +25,15 @@ class TtsRequest(UniversalBaseModel):
     If the duration is not appropriate for the length of the transcript, the output audio may be truncated.
     """
+    text_cfg: typing.Optional[float] = pydantic.Field(default=None)
+    """
+    The text [classifier-free guidance](https://arxiv.org/abs/2207.12598) value for the request.
+    Higher values cause the model to attend more to the text but speed up the generation. Lower values reduce the speaking rate but can increase the risk of hallucinations. The default value is `3.0`. For a slower speaking rate, we recommend values between `2.0` and `3.0`. Values are supported between `1.5` and `3.0`.
+    This parameter is only supported for `sonic-2` models.
+    """
     if IS_PYDANTIC_V2:
         model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True)  # type: ignore # Pydantic v2
     else:

cartesia/tts/types/web_socket_tts_request.py CHANGED Viewed

@@ -13,7 +13,7 @@ from ...core.pydantic_utilities import IS_PYDANTIC_V2
 class WebSocketTtsRequest(UniversalBaseModel):
     model_id: str = pydantic.Field()
     """
-    The ID of the model to use for the generation. See [Models](/build-with-sonic/models) for available models.
+    The ID of the model to use for the generation. See [Models](/build-with-cartesia/models) for available models.
     """
     output_format: typing.Optional[OutputFormat] = None
@@ -26,6 +26,8 @@ class WebSocketTtsRequest(UniversalBaseModel):
     add_phoneme_timestamps: typing.Optional[bool] = None
     continue_: typing_extensions.Annotated[typing.Optional[bool], FieldMetadata(alias="continue")] = None
     context_id: typing.Optional[str] = None
+    max_buffer_delay_ms: typing.Optional[int] = None
+    text_cfg: typing.Optional[float] = None
     if IS_PYDANTIC_V2:
         model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True)  # type: ignore # Pydantic v2

cartesia/voices/__init__.py CHANGED Viewed

@@ -12,6 +12,9 @@ from .types import (
     IdSpecifier,
     LocalizeDialect,
     LocalizeEnglishDialect,
+    LocalizeFrenchDialect,
+    LocalizePortugueseDialect,
+    LocalizeSpanishDialect,
     LocalizeTargetLanguage,
     LocalizeVoiceRequest,
     MixVoiceSpecifier,
@@ -56,6 +59,9 @@ __all__ = [
     "LocalizeDialect",
     "LocalizeDialectParams",
     "LocalizeEnglishDialect",
+    "LocalizeFrenchDialect",
+    "LocalizePortugueseDialect",
+    "LocalizeSpanishDialect",
     "LocalizeTargetLanguage",
     "LocalizeVoiceRequest",
     "LocalizeVoiceRequestParams",

cartesia 2.0.0b7__py3-none-any.whl → 2.0.2__py3-none-any.whl

cartesia 2.0.0b7__py3-none-any.whl → 2.0.2__py3-none-any.whl