cartesia-2.0.5-py3-none-any.whl → cartesia-2.0.7-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cartesia/__init__.py +22 -0
- cartesia/auth/client.py +8 -8
- cartesia/auth/requests/token_grant.py +7 -1
- cartesia/auth/requests/token_request.py +3 -3
- cartesia/auth/types/token_grant.py +7 -2
- cartesia/auth/types/token_request.py +3 -3
- cartesia/core/client_wrapper.py +1 -1
- cartesia/infill/client.py +0 -8
- cartesia/stt/__init__.py +6 -0
- cartesia/stt/_async_websocket.py +81 -72
- cartesia/stt/_websocket.py +42 -20
- cartesia/stt/client.py +450 -0
- cartesia/stt/requests/__init__.py +2 -0
- cartesia/stt/requests/streaming_transcription_response.py +2 -0
- cartesia/stt/requests/transcript_message.py +8 -1
- cartesia/stt/requests/transcription_response.py +8 -1
- cartesia/stt/requests/transcription_word.py +20 -0
- cartesia/stt/socket_client.py +52 -109
- cartesia/stt/types/__init__.py +4 -0
- cartesia/stt/types/streaming_transcription_response.py +2 -0
- cartesia/stt/types/stt_encoding.py +3 -1
- cartesia/stt/types/timestamp_granularity.py +5 -0
- cartesia/stt/types/transcript_message.py +7 -1
- cartesia/stt/types/transcription_response.py +7 -1
- cartesia/stt/types/transcription_word.py +32 -0
- cartesia/tts/__init__.py +16 -0
- cartesia/tts/client.py +63 -8
- cartesia/tts/requests/__init__.py +8 -0
- cartesia/tts/requests/experimental_model_controls.py +17 -0
- cartesia/tts/requests/generation_config.py +23 -0
- cartesia/tts/requests/generation_request.py +4 -4
- cartesia/tts/requests/sse_output_format.py +11 -0
- cartesia/tts/requests/tts_request.py +2 -0
- cartesia/tts/requests/ttssse_request.py +47 -0
- cartesia/tts/requests/web_socket_chunk_response.py +0 -3
- cartesia/tts/requests/web_socket_response.py +1 -2
- cartesia/tts/requests/web_socket_tts_request.py +9 -1
- cartesia/tts/types/__init__.py +8 -0
- cartesia/tts/types/experimental_model_controls.py +28 -0
- cartesia/tts/types/generation_config.py +34 -0
- cartesia/tts/types/generation_request.py +4 -4
- cartesia/tts/types/sse_output_format.py +22 -0
- cartesia/tts/types/tts_request.py +2 -0
- cartesia/tts/types/ttssse_request.py +58 -0
- cartesia/tts/types/web_socket_chunk_response.py +1 -3
- cartesia/tts/types/web_socket_response.py +1 -2
- cartesia/tts/types/web_socket_tts_request.py +11 -3
- cartesia/voice_changer/client.py +0 -8
- cartesia/voice_changer/requests/streaming_response.py +0 -2
- cartesia/voice_changer/types/streaming_response.py +0 -2
- cartesia/voices/client.py +0 -12
- cartesia-2.0.7.dist-info/LICENSE +201 -0
- {cartesia-2.0.5.dist-info → cartesia-2.0.7.dist-info}/METADATA +116 -17
- {cartesia-2.0.5.dist-info → cartesia-2.0.7.dist-info}/RECORD +55 -42
- {cartesia-2.0.5.dist-info → cartesia-2.0.7.dist-info}/WHEEL +1 -1
cartesia/tts/requests/__init__.py
CHANGED
@@ -2,16 +2,20 @@
 
 from .cancel_context_request import CancelContextRequestParams
 from .controls import ControlsParams
+from .experimental_model_controls import ExperimentalModelControlsParams
+from .generation_config import GenerationConfigParams
 from .generation_request import GenerationRequestParams
 from .mp_3_output_format import Mp3OutputFormatParams
 from .output_format import OutputFormatParams, OutputFormat_Mp3Params, OutputFormat_RawParams, OutputFormat_WavParams
 from .phoneme_timestamps import PhonemeTimestampsParams
 from .raw_output_format import RawOutputFormatParams
 from .speed import SpeedParams
+from .sse_output_format import SseOutputFormatParams
 from .tts_request import TtsRequestParams
 from .tts_request_embedding_specifier import TtsRequestEmbeddingSpecifierParams
 from .tts_request_id_specifier import TtsRequestIdSpecifierParams
 from .tts_request_voice_specifier import TtsRequestVoiceSpecifierParams
+from .ttssse_request import TtssseRequestParams
 from .wav_output_format import WavOutputFormatParams
 from .web_socket_base_response import WebSocketBaseResponseParams
 from .web_socket_chunk_response import WebSocketChunkResponseParams
@@ -39,6 +43,8 @@ from .word_timestamps import WordTimestampsParams
 __all__ = [
     "CancelContextRequestParams",
     "ControlsParams",
+    "ExperimentalModelControlsParams",
+    "GenerationConfigParams",
     "GenerationRequestParams",
     "Mp3OutputFormatParams",
     "OutputFormatParams",
@@ -48,10 +54,12 @@ __all__ = [
     "PhonemeTimestampsParams",
     "RawOutputFormatParams",
     "SpeedParams",
+    "SseOutputFormatParams",
     "TtsRequestEmbeddingSpecifierParams",
     "TtsRequestIdSpecifierParams",
     "TtsRequestParams",
     "TtsRequestVoiceSpecifierParams",
+    "TtssseRequestParams",
     "WavOutputFormatParams",
     "WebSocketBaseResponseParams",
     "WebSocketChunkResponseParams",
cartesia/tts/requests/experimental_model_controls.py
ADDED
@@ -0,0 +1,17 @@
+# This file was auto-generated by Fern from our API Definition.
+
+import typing_extensions
+import typing_extensions
+
+
+class ExperimentalModelControlsParams(typing_extensions.TypedDict):
+    """
+    These controls are **experimental** and subject to breaking changes.
+    """
+
+    accent_localization: typing_extensions.NotRequired[int]
+    """
+    Toggle accent localization: 0 (disabled, default) or 1 (enabled).
+    When enabled, the voice adapts to match the transcript language's accent while preserving vocal characteristics. When disabled, maintains the original voice accent.
+    For more information, see [Localize Voices](/build-with-sonic/capabilities/localize-voices).
+    """
cartesia/tts/requests/generation_config.py
ADDED
@@ -0,0 +1,23 @@
+# This file was auto-generated by Fern from our API Definition.
+
+import typing_extensions
+import typing_extensions
+from .experimental_model_controls import ExperimentalModelControlsParams
+
+
+class GenerationConfigParams(typing_extensions.TypedDict):
+    """
+    Configure the various attributes of the generated speech. These controls are only available for `sonic-3-preview` and will have no effect on earlier models.
+    """
+
+    volume: typing_extensions.NotRequired[float]
+    """
+    Adjust the volume of the generated speech between -1.0 (softer) and 1.0 (louder). 0.0 is the default volume.
+    """
+
+    speed: typing_extensions.NotRequired[float]
+    """
+    Adjust the speed of the generated speech between -1.0 (slower) and 1.0 (faster). 0.0 is the default speed.
+    """
+
+    experimental: typing_extensions.NotRequired[ExperimentalModelControlsParams]
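The request-side counterpart of the new config is a plain `TypedDict`, so it can be written inline. A minimal sketch using only keys shown in this diff; the numeric values are illustrative (per the docstrings above, volume and speed are floats in [-1.0, 1.0] with 0.0 as the default, and the controls only affect `sonic-3-preview`):

```python
from cartesia.tts.requests import ExperimentalModelControlsParams, GenerationConfigParams

# Illustrative values only; TypedDict construction produces an ordinary dict.
generation_config: GenerationConfigParams = {
    "volume": 0.2,
    "speed": -0.1,
    "experimental": ExperimentalModelControlsParams(accent_localization=1),
}
```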
cartesia/tts/requests/generation_request.py
CHANGED
@@ -55,15 +55,15 @@ class GenerationRequestParams(typing_extensions.TypedDict):
 
     add_timestamps: typing_extensions.NotRequired[bool]
     """
-    Whether to return word-level timestamps.
+    Whether to return word-level timestamps. If `false` (default), no word timestamps will be produced at all. If `true`, the server will return timestamp events containing word-level timing information.
     """
 
     add_phoneme_timestamps: typing_extensions.NotRequired[bool]
     """
-    Whether to return phoneme-level timestamps.
+    Whether to return phoneme-level timestamps. If `false` (default), no phoneme timestamps will be produced. If `true`, the server will return timestamp events containing phoneme-level timing information.
     """
 
-
+    use_normalized_timestamps: typing_extensions.NotRequired[bool]
     """
-    Whether to use
+    Whether to use normalized timestamps (True) or original timestamps (False).
     """
cartesia/tts/requests/sse_output_format.py
ADDED
@@ -0,0 +1,11 @@
+# This file was auto-generated by Fern from our API Definition.
+
+import typing_extensions
+import typing
+from ..types.raw_encoding import RawEncoding
+
+
+class SseOutputFormatParams(typing_extensions.TypedDict):
+    container: typing.Literal["raw"]
+    encoding: RawEncoding
+    sample_rate: int
cartesia/tts/requests/tts_request.py
CHANGED
@@ -6,6 +6,7 @@ import typing_extensions
 from ..types.supported_language import SupportedLanguage
 from .output_format import OutputFormatParams
 from ..types.model_speed import ModelSpeed
+from .generation_config import GenerationConfigParams
 
 
 class TtsRequestParams(typing_extensions.TypedDict):
@@ -25,3 +26,4 @@ class TtsRequestParams(typing_extensions.TypedDict):
     """
 
     speed: typing_extensions.NotRequired[ModelSpeed]
+    generation_config: typing_extensions.NotRequired[GenerationConfigParams]
cartesia/tts/requests/ttssse_request.py
ADDED
@@ -0,0 +1,47 @@
+# This file was auto-generated by Fern from our API Definition.
+
+import typing_extensions
+from .tts_request_voice_specifier import TtsRequestVoiceSpecifierParams
+import typing_extensions
+from ..types.supported_language import SupportedLanguage
+from .sse_output_format import SseOutputFormatParams
+from ..types.model_speed import ModelSpeed
+from ..types.context_id import ContextId
+
+
+class TtssseRequestParams(typing_extensions.TypedDict):
+    model_id: str
+    """
+    The ID of the model to use for the generation. See [Models](/build-with-cartesia/models) for available models.
+    """
+
+    transcript: str
+    voice: TtsRequestVoiceSpecifierParams
+    language: typing_extensions.NotRequired[SupportedLanguage]
+    output_format: SseOutputFormatParams
+    duration: typing_extensions.NotRequired[float]
+    """
+    The maximum duration of the audio in seconds. You do not usually need to specify this.
+    If the duration is not appropriate for the length of the transcript, the output audio may be truncated.
+    """
+
+    speed: typing_extensions.NotRequired[ModelSpeed]
+    add_timestamps: typing_extensions.NotRequired[bool]
+    """
+    Whether to return word-level timestamps. If `false` (default), no word timestamps will be produced at all. If `true`, the server will return timestamp events containing word-level timing information.
+    """
+
+    add_phoneme_timestamps: typing_extensions.NotRequired[bool]
+    """
+    Whether to return phoneme-level timestamps. If `false` (default), no phoneme timestamps will be produced - if `add_timestamps` is `true`, the produced timestamps will be word timestamps instead. If `true`, the server will return timestamp events containing phoneme-level timing information.
+    """
+
+    use_normalized_timestamps: typing_extensions.NotRequired[bool]
+    """
+    Whether to use normalized timestamps (True) or original timestamps (False).
+    """
+
+    context_id: typing_extensions.NotRequired[ContextId]
+    """
+    Optional context ID for this request.
+    """
cartesia/tts/requests/web_socket_chunk_response.py
CHANGED
@@ -1,11 +1,8 @@
 # This file was auto-generated by Fern from our API Definition.
 
 from .web_socket_base_response import WebSocketBaseResponseParams
-import typing_extensions
-from ..types.flush_id import FlushId
 
 
 class WebSocketChunkResponseParams(WebSocketBaseResponseParams):
     data: str
     step_time: float
-    flush_id: typing_extensions.NotRequired[FlushId]
cartesia/tts/requests/web_socket_response.py
CHANGED
@@ -4,8 +4,8 @@ from __future__ import annotations
 import typing_extensions
 import typing
 import typing_extensions
-from ..types.flush_id import FlushId
 from ..types.context_id import ContextId
+from ..types.flush_id import FlushId
 from .word_timestamps import WordTimestampsParams
 from .phoneme_timestamps import PhonemeTimestampsParams
 
@@ -14,7 +14,6 @@ class WebSocketResponse_ChunkParams(typing_extensions.TypedDict):
     type: typing.Literal["chunk"]
     data: str
     step_time: float
-    flush_id: typing_extensions.NotRequired[FlushId]
     context_id: typing_extensions.NotRequired[ContextId]
     status_code: int
     done: bool
cartesia/tts/requests/web_socket_tts_request.py
CHANGED
@@ -20,8 +20,16 @@ class WebSocketTtsRequestParams(typing_extensions.TypedDict):
     duration: typing_extensions.NotRequired[int]
     language: typing_extensions.NotRequired[str]
     add_timestamps: typing_extensions.NotRequired[bool]
-
+    """
+    Whether to return word-level timestamps. If `false` (default), no word timestamps will be produced at all. If `true`, the server will return timestamp events containing word-level timing information.
+    """
+
     add_phoneme_timestamps: typing_extensions.NotRequired[bool]
+    """
+    Whether to return phoneme-level timestamps. If `false` (default), no phoneme timestamps will be produced - if `add_timestamps` is `true`, the produced timestamps will be word timestamps instead. If `true`, the server will return timestamp events containing phoneme-level timing information.
+    """
+
+    use_normalized_timestamps: typing_extensions.NotRequired[bool]
     continue_: typing_extensions.NotRequired[typing_extensions.Annotated[bool, FieldMetadata(alias="continue")]]
     context_id: typing_extensions.NotRequired[str]
     max_buffer_delay_ms: typing_extensions.NotRequired[int]
cartesia/tts/types/__init__.py
CHANGED
@@ -4,7 +4,9 @@ from .cancel_context_request import CancelContextRequest
 from .context_id import ContextId
 from .controls import Controls
 from .emotion import Emotion
+from .experimental_model_controls import ExperimentalModelControls
 from .flush_id import FlushId
+from .generation_config import GenerationConfig
 from .generation_request import GenerationRequest
 from .model_speed import ModelSpeed
 from .mp_3_output_format import Mp3OutputFormat
@@ -15,11 +17,13 @@ from .phoneme_timestamps import PhonemeTimestamps
 from .raw_encoding import RawEncoding
 from .raw_output_format import RawOutputFormat
 from .speed import Speed
+from .sse_output_format import SseOutputFormat
 from .supported_language import SupportedLanguage
 from .tts_request import TtsRequest
 from .tts_request_embedding_specifier import TtsRequestEmbeddingSpecifier
 from .tts_request_id_specifier import TtsRequestIdSpecifier
 from .tts_request_voice_specifier import TtsRequestVoiceSpecifier
+from .ttssse_request import TtssseRequest
 from .wav_output_format import WavOutputFormat
 from .web_socket_base_response import WebSocketBaseResponse
 from .web_socket_chunk_response import WebSocketChunkResponse
@@ -49,7 +53,9 @@ __all__ = [
     "ContextId",
     "Controls",
     "Emotion",
+    "ExperimentalModelControls",
     "FlushId",
+    "GenerationConfig",
     "GenerationRequest",
     "ModelSpeed",
     "Mp3OutputFormat",
@@ -63,11 +69,13 @@ __all__ = [
     "RawEncoding",
     "RawOutputFormat",
     "Speed",
+    "SseOutputFormat",
     "SupportedLanguage",
     "TtsRequest",
     "TtsRequestEmbeddingSpecifier",
     "TtsRequestIdSpecifier",
     "TtsRequestVoiceSpecifier",
+    "TtssseRequest",
     "WavOutputFormat",
     "WebSocketBaseResponse",
     "WebSocketChunkResponse",
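With these re-exports in place, the new models become part of the public `cartesia.tts.types` surface. A minimal import sketch, based only on the `__all__` entries added above:

```python
from cartesia.tts.types import (
    ExperimentalModelControls,
    GenerationConfig,
    SseOutputFormat,
    TtssseRequest,
)
```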
cartesia/tts/types/experimental_model_controls.py
ADDED
@@ -0,0 +1,28 @@
+# This file was auto-generated by Fern from our API Definition.
+
+from ...core.pydantic_utilities import UniversalBaseModel
+import typing
+import pydantic
+from ...core.pydantic_utilities import IS_PYDANTIC_V2
+
+
+class ExperimentalModelControls(UniversalBaseModel):
+    """
+    These controls are **experimental** and subject to breaking changes.
+    """
+
+    accent_localization: typing.Optional[int] = pydantic.Field(default=None)
+    """
+    Toggle accent localization: 0 (disabled, default) or 1 (enabled).
+    When enabled, the voice adapts to match the transcript language's accent while preserving vocal characteristics. When disabled, maintains the original voice accent.
+    For more information, see [Localize Voices](/build-with-sonic/capabilities/localize-voices).
+    """
+
+    if IS_PYDANTIC_V2:
+        model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True)  # type: ignore # Pydantic v2
+    else:
+
+        class Config:
+            frozen = True
+            smart_union = True
+            extra = pydantic.Extra.allow
cartesia/tts/types/generation_config.py
ADDED
@@ -0,0 +1,34 @@
+# This file was auto-generated by Fern from our API Definition.
+
+from ...core.pydantic_utilities import UniversalBaseModel
+import typing
+import pydantic
+from .experimental_model_controls import ExperimentalModelControls
+from ...core.pydantic_utilities import IS_PYDANTIC_V2
+
+
+class GenerationConfig(UniversalBaseModel):
+    """
+    Configure the various attributes of the generated speech. These controls are only available for `sonic-3-preview` and will have no effect on earlier models.
+    """
+
+    volume: typing.Optional[float] = pydantic.Field(default=None)
+    """
+    Adjust the volume of the generated speech between -1.0 (softer) and 1.0 (louder). 0.0 is the default volume.
+    """
+
+    speed: typing.Optional[float] = pydantic.Field(default=None)
+    """
+    Adjust the speed of the generated speech between -1.0 (slower) and 1.0 (faster). 0.0 is the default speed.
+    """
+
+    experimental: typing.Optional[ExperimentalModelControls] = None
+
+    if IS_PYDANTIC_V2:
+        model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True)  # type: ignore # Pydantic v2
+    else:
+
+        class Config:
+            frozen = True
+            smart_union = True
+            extra = pydantic.Extra.allow
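Since `GenerationConfig` feeds the `generation_config` field that `tts_request.py` gains later in this diff, construction would look roughly like the sketch below. The numbers are illustrative, and the `sonic-3-preview` restriction comes from the docstring above:

```python
from cartesia.tts.types import ExperimentalModelControls, GenerationConfig

# Illustrative values: volume and speed are floats in [-1.0, 1.0]; 0.0 is the default.
config = GenerationConfig(
    volume=0.25,
    speed=-0.1,
    experimental=ExperimentalModelControls(accent_localization=1),  # 1 enables accent localization
)
```

The resulting object is what `TtsRequest(generation_config=...)` accepts once the `tts_request.py` change shown below is applied.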
cartesia/tts/types/generation_request.py
CHANGED
@@ -59,17 +59,17 @@ class GenerationRequest(UniversalBaseModel):
 
     add_timestamps: typing.Optional[bool] = pydantic.Field(default=None)
     """
-    Whether to return word-level timestamps.
+    Whether to return word-level timestamps. If `false` (default), no word timestamps will be produced at all. If `true`, the server will return timestamp events containing word-level timing information.
     """
 
     add_phoneme_timestamps: typing.Optional[bool] = pydantic.Field(default=None)
     """
-    Whether to return phoneme-level timestamps.
+    Whether to return phoneme-level timestamps. If `false` (default), no phoneme timestamps will be produced. If `true`, the server will return timestamp events containing phoneme-level timing information.
     """
 
-
+    use_normalized_timestamps: typing.Optional[bool] = pydantic.Field(default=None)
     """
-    Whether to use
+    Whether to use normalized timestamps (True) or original timestamps (False).
     """
 
     if IS_PYDANTIC_V2:
cartesia/tts/types/sse_output_format.py
ADDED
@@ -0,0 +1,22 @@
+# This file was auto-generated by Fern from our API Definition.
+
+from ...core.pydantic_utilities import UniversalBaseModel
+import typing
+from .raw_encoding import RawEncoding
+from ...core.pydantic_utilities import IS_PYDANTIC_V2
+import pydantic
+
+
+class SseOutputFormat(UniversalBaseModel):
+    container: typing.Literal["raw"] = "raw"
+    encoding: RawEncoding
+    sample_rate: int
+
+    if IS_PYDANTIC_V2:
+        model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True)  # type: ignore # Pydantic v2
+    else:
+
+        class Config:
+            frozen = True
+            smart_union = True
+            extra = pydantic.Extra.allow
cartesia/tts/types/tts_request.py
CHANGED
@@ -7,6 +7,7 @@ import typing
 from .supported_language import SupportedLanguage
 from .output_format import OutputFormat
 from .model_speed import ModelSpeed
+from .generation_config import GenerationConfig
 from ...core.pydantic_utilities import IS_PYDANTIC_V2
 
 
@@ -27,6 +28,7 @@ class TtsRequest(UniversalBaseModel):
     """
 
     speed: typing.Optional[ModelSpeed] = None
+    generation_config: typing.Optional[GenerationConfig] = None
 
     if IS_PYDANTIC_V2:
         model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True)  # type: ignore # Pydantic v2
cartesia/tts/types/ttssse_request.py
ADDED
@@ -0,0 +1,58 @@
+# This file was auto-generated by Fern from our API Definition.
+
+from ...core.pydantic_utilities import UniversalBaseModel
+import pydantic
+from .tts_request_voice_specifier import TtsRequestVoiceSpecifier
+import typing
+from .supported_language import SupportedLanguage
+from .sse_output_format import SseOutputFormat
+from .model_speed import ModelSpeed
+from .context_id import ContextId
+from ...core.pydantic_utilities import IS_PYDANTIC_V2
+
+
+class TtssseRequest(UniversalBaseModel):
+    model_id: str = pydantic.Field()
+    """
+    The ID of the model to use for the generation. See [Models](/build-with-cartesia/models) for available models.
+    """
+
+    transcript: str
+    voice: TtsRequestVoiceSpecifier
+    language: typing.Optional[SupportedLanguage] = None
+    output_format: SseOutputFormat
+    duration: typing.Optional[float] = pydantic.Field(default=None)
+    """
+    The maximum duration of the audio in seconds. You do not usually need to specify this.
+    If the duration is not appropriate for the length of the transcript, the output audio may be truncated.
+    """
+
+    speed: typing.Optional[ModelSpeed] = None
+    add_timestamps: typing.Optional[bool] = pydantic.Field(default=None)
+    """
+    Whether to return word-level timestamps. If `false` (default), no word timestamps will be produced at all. If `true`, the server will return timestamp events containing word-level timing information.
+    """
+
+    add_phoneme_timestamps: typing.Optional[bool] = pydantic.Field(default=None)
+    """
+    Whether to return phoneme-level timestamps. If `false` (default), no phoneme timestamps will be produced - if `add_timestamps` is `true`, the produced timestamps will be word timestamps instead. If `true`, the server will return timestamp events containing phoneme-level timing information.
+    """
+
+    use_normalized_timestamps: typing.Optional[bool] = pydantic.Field(default=None)
+    """
+    Whether to use normalized timestamps (True) or original timestamps (False).
+    """
+
+    context_id: typing.Optional[ContextId] = pydantic.Field(default=None)
+    """
+    Optional context ID for this request.
+    """
+
+    if IS_PYDANTIC_V2:
+        model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True)  # type: ignore # Pydantic v2
+    else:
+
+        class Config:
+            frozen = True
+            smart_union = True
+            extra = pydantic.Extra.allow
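A rough sketch of building the new SSE request model. Only the field names and the `SseOutputFormat` shape come from this diff; the id-based voice-specifier dict, the model ID, and the `pcm_s16le` encoding value are assumptions for illustration:

```python
from cartesia.tts.types import SseOutputFormat, TtssseRequest

request = TtssseRequest(
    model_id="sonic-2",  # assumption: any valid Cartesia model ID
    transcript="Hello from the SSE endpoint.",
    voice={"mode": "id", "id": "your-voice-id"},  # assumed id-based TtsRequestVoiceSpecifier shape
    output_format=SseOutputFormat(container="raw", encoding="pcm_s16le", sample_rate=44100),
    add_timestamps=True,
    use_normalized_timestamps=True,
)
```

Judging by the `cartesia/tts/client.py` additions in the file list, this request shape presumably backs a new server-sent-events TTS method, but that client code is not shown in this section.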
cartesia/tts/types/web_socket_chunk_response.py
CHANGED
@@ -1,16 +1,14 @@
 # This file was auto-generated by Fern from our API Definition.
 
 from .web_socket_base_response import WebSocketBaseResponse
-import typing
-from .flush_id import FlushId
 from ...core.pydantic_utilities import IS_PYDANTIC_V2
+import typing
 import pydantic
 
 
 class WebSocketChunkResponse(WebSocketBaseResponse):
     data: str
     step_time: float
-    flush_id: typing.Optional[FlushId] = None
 
     if IS_PYDANTIC_V2:
         model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True)  # type: ignore # Pydantic v2
cartesia/tts/types/web_socket_response.py
CHANGED
@@ -3,10 +3,10 @@
 from __future__ import annotations
 from ...core.pydantic_utilities import UniversalBaseModel
 import typing
-from .flush_id import FlushId
 from .context_id import ContextId
 from ...core.pydantic_utilities import IS_PYDANTIC_V2
 import pydantic
+from .flush_id import FlushId
 from .word_timestamps import WordTimestamps
 from .phoneme_timestamps import PhonemeTimestamps
 
@@ -15,7 +15,6 @@ class WebSocketResponse_Chunk(UniversalBaseModel):
     type: typing.Literal["chunk"] = "chunk"
     data: str
     step_time: float
-    flush_id: typing.Optional[FlushId] = None
     context_id: typing.Optional[ContextId] = None
     status_code: int
     done: bool
cartesia/tts/types/web_socket_tts_request.py
CHANGED
@@ -22,9 +22,17 @@ class WebSocketTtsRequest(UniversalBaseModel):
     voice: TtsRequestVoiceSpecifier
     duration: typing.Optional[int] = None
     language: typing.Optional[str] = None
-    add_timestamps: typing.Optional[bool] = None
-
-
+    add_timestamps: typing.Optional[bool] = pydantic.Field(default=None)
+    """
+    Whether to return word-level timestamps. If `false` (default), no word timestamps will be produced at all. If `true`, the server will return timestamp events containing word-level timing information.
+    """
+
+    add_phoneme_timestamps: typing.Optional[bool] = pydantic.Field(default=None)
+    """
+    Whether to return phoneme-level timestamps. If `false` (default), no phoneme timestamps will be produced - if `add_timestamps` is `true`, the produced timestamps will be word timestamps instead. If `true`, the server will return timestamp events containing phoneme-level timing information.
+    """
+
+    use_normalized_timestamps: typing.Optional[bool] = None
     continue_: typing_extensions.Annotated[typing.Optional[bool], FieldMetadata(alias="continue")] = None
     context_id: typing.Optional[str] = None
     max_buffer_delay_ms: typing.Optional[int] = None
cartesia/voice_changer/client.py
CHANGED
@@ -51,11 +51,9 @@ class VoiceChangerClient:
         output_format_encoding : typing.Optional[RawEncoding]
             Required for `raw` and `wav` containers.
 
-
         output_format_bit_rate : typing.Optional[int]
             Required for `mp3` containers.
 
-
         request_options : typing.Optional[RequestOptions]
             Request-specific configuration. You can pass in configuration such as `chunk_size`, and more to customize the request and response.
 
@@ -131,11 +129,9 @@ class VoiceChangerClient:
         output_format_encoding : typing.Optional[RawEncoding]
             Required for `raw` and `wav` containers.
 
-
         output_format_bit_rate : typing.Optional[int]
             Required for `mp3` containers.
 
-
         request_options : typing.Optional[RequestOptions]
             Request-specific configuration.
 
@@ -232,11 +228,9 @@ class AsyncVoiceChangerClient:
         output_format_encoding : typing.Optional[RawEncoding]
             Required for `raw` and `wav` containers.
 
-
         output_format_bit_rate : typing.Optional[int]
             Required for `mp3` containers.
 
-
         request_options : typing.Optional[RequestOptions]
             Request-specific configuration. You can pass in configuration such as `chunk_size`, and more to customize the request and response.
 
@@ -320,11 +314,9 @@ class AsyncVoiceChangerClient:
         output_format_encoding : typing.Optional[RawEncoding]
             Required for `raw` and `wav` containers.
 
-
         output_format_bit_rate : typing.Optional[int]
             Required for `mp3` containers.
 
-
         request_options : typing.Optional[RequestOptions]
             Request-specific configuration.
 
cartesia/voice_changer/requests/streaming_response.py
CHANGED
@@ -4,7 +4,6 @@ from __future__ import annotations
 import typing_extensions
 import typing
 import typing_extensions
-from ...tts.types.flush_id import FlushId
 from ...tts.types.context_id import ContextId
 
 
@@ -12,7 +11,6 @@ class StreamingResponse_ChunkParams(typing_extensions.TypedDict):
     type: typing.Literal["chunk"]
     data: str
     step_time: float
-    flush_id: typing_extensions.NotRequired[FlushId]
     context_id: typing_extensions.NotRequired[ContextId]
     status_code: int
     done: bool
cartesia/voice_changer/types/streaming_response.py
CHANGED
@@ -3,7 +3,6 @@
 from __future__ import annotations
 from ...core.pydantic_utilities import UniversalBaseModel
 import typing
-from ...tts.types.flush_id import FlushId
 from ...tts.types.context_id import ContextId
 from ...core.pydantic_utilities import IS_PYDANTIC_V2
 import pydantic
@@ -13,7 +12,6 @@ class StreamingResponse_Chunk(UniversalBaseModel):
     type: typing.Literal["chunk"] = "chunk"
     data: str
     step_time: float
-    flush_id: typing.Optional[FlushId] = None
     context_id: typing.Optional[ContextId] = None
     status_code: int
     done: bool
cartesia/voices/client.py
CHANGED
@@ -168,27 +168,21 @@ class VoicesClient:
         name : str
             The name of the voice.
 
-
         language : SupportedLanguage
             The language of the voice.
 
-
         mode : CloneMode
             Tradeoff between similarity and stability. Similarity clones sound more like the source clip, but may reproduce background noise. Stability clones always sound like a studio recording, but may not sound as similar to the source clip.
 
-
         description : typing.Optional[str]
             A description for the voice.
 
-
         enhance : typing.Optional[bool]
             Whether to apply AI enhancements to the clip to reduce background noise. This leads to cleaner generated speech at the cost of reduced similarity to the source clip.
 
-
         base_voice_id : typing.Optional[VoiceId]
             Optional base voice ID that the cloned voice is derived from.
 
-
         request_options : typing.Optional[RequestOptions]
             Request-specific configuration.
 
@@ -736,27 +730,21 @@ class AsyncVoicesClient:
         name : str
             The name of the voice.
 
-
         language : SupportedLanguage
             The language of the voice.
 
-
         mode : CloneMode
             Tradeoff between similarity and stability. Similarity clones sound more like the source clip, but may reproduce background noise. Stability clones always sound like a studio recording, but may not sound as similar to the source clip.
 
-
         description : typing.Optional[str]
             A description for the voice.
 
-
         enhance : typing.Optional[bool]
             Whether to apply AI enhancements to the clip to reduce background noise. This leads to cleaner generated speech at the cost of reduced similarity to the source clip.
 
-
         base_voice_id : typing.Optional[VoiceId]
             Optional base voice ID that the cloned voice is derived from.
 
-
         request_options : typing.Optional[RequestOptions]
             Request-specific configuration.
 