PyPI - cartesia - Versions diffs - 2.0.4__py3-none-any.whl → 2.0.6__py3-none-any.whl - Mend

cartesia 2.0.4__py3-none-any.whl → 2.0.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (53)

cartesia/__init__.py +60 -1
cartesia/auth/client.py +8 -8
cartesia/auth/requests/token_grant.py +7 -1
cartesia/auth/requests/token_request.py +3 -3
cartesia/auth/types/token_grant.py +7 -2
cartesia/auth/types/token_request.py +3 -3
cartesia/base_client.py +2 -0
cartesia/client.py +5 -0
cartesia/core/client_wrapper.py +1 -1
cartesia/stt/__init__.py +57 -0
cartesia/stt/_async_websocket.py +293 -0
cartesia/stt/_websocket.py +294 -0
cartesia/stt/client.py +456 -0
cartesia/stt/requests/__init__.py +29 -0
cartesia/stt/requests/done_message.py +14 -0
cartesia/stt/requests/error_message.py +16 -0
cartesia/stt/requests/flush_done_message.py +14 -0
cartesia/stt/requests/streaming_transcription_response.py +41 -0
cartesia/stt/requests/transcript_message.py +40 -0
cartesia/stt/requests/transcription_response.py +28 -0
cartesia/stt/requests/transcription_word.py +20 -0
cartesia/stt/socket_client.py +138 -0
cartesia/stt/types/__init__.py +33 -0
cartesia/stt/types/done_message.py +26 -0
cartesia/stt/types/error_message.py +27 -0
cartesia/stt/types/flush_done_message.py +26 -0
cartesia/stt/types/streaming_transcription_response.py +94 -0
cartesia/stt/types/stt_encoding.py +7 -0
cartesia/stt/types/timestamp_granularity.py +5 -0
cartesia/stt/types/transcript_message.py +50 -0
cartesia/stt/types/transcription_response.py +38 -0
cartesia/stt/types/transcription_word.py +32 -0
cartesia/tts/__init__.py +8 -0
cartesia/tts/client.py +50 -8
cartesia/tts/requests/__init__.py +4 -0
cartesia/tts/requests/generation_request.py +4 -4
cartesia/tts/requests/sse_output_format.py +11 -0
cartesia/tts/requests/ttssse_request.py +47 -0
cartesia/tts/requests/web_socket_chunk_response.py +0 -3
cartesia/tts/requests/web_socket_response.py +1 -2
cartesia/tts/requests/web_socket_tts_request.py +9 -1
cartesia/tts/types/__init__.py +4 -0
cartesia/tts/types/generation_request.py +4 -4
cartesia/tts/types/sse_output_format.py +22 -0
cartesia/tts/types/ttssse_request.py +58 -0
cartesia/tts/types/web_socket_chunk_response.py +1 -3
cartesia/tts/types/web_socket_response.py +1 -2
cartesia/tts/types/web_socket_tts_request.py +11 -3
cartesia/voice_changer/requests/streaming_response.py +0 -2
cartesia/voice_changer/types/streaming_response.py +0 -2
{cartesia-2.0.4.dist-info → cartesia-2.0.6.dist-info}/METADATA +256 -2
{cartesia-2.0.4.dist-info → cartesia-2.0.6.dist-info}/RECORD +53 -26
{cartesia-2.0.4.dist-info → cartesia-2.0.6.dist-info}/WHEEL +0 -0

cartesia/stt/socket_client.py ADDED Viewed

@@ -0,0 +1,138 @@
+import typing
+from typing import Any, Dict, Generator, Optional
+from ..core.client_wrapper import AsyncClientWrapper, SyncClientWrapper
+from ._async_websocket import AsyncSttWebsocket
+from ._websocket import SttWebsocket
+from .client import AsyncSttClient, SttClient
+from .types.stt_encoding import SttEncoding
+class SttClientWithWebsocket(SttClient):
+    """
+    Extension of STT functionality that supports a synchronous WebSocket STT connection.
+    """
+    def __init__(self, *, client_wrapper: SyncClientWrapper):
+        super().__init__(client_wrapper=client_wrapper)
+    def _ws_url(self):
+        base_url = self._client_wrapper.get_base_url()
+        if base_url.startswith("ws://") or base_url.startswith("wss://"):
+            return base_url
+        else:
+            prefix = "ws" if "localhost" in base_url else "wss"
+            base_url_without_protocol = base_url.split("://")[-1]
+            return f"{prefix}://{base_url_without_protocol}"
+    def websocket(
+        self,
+        *,
+        model: str = "ink-whisper",
+        language: Optional[str] = "en",
+        encoding: SttEncoding = "pcm_s16le",
+        sample_rate: int = 16000,
+        min_volume: Optional[float] = None,
+        max_silence_duration_secs: Optional[float] = None,
+    ):
+        """Create a WebSocket connection for real-time speech transcription.
+        Args:
+            model: ID of the model to use for transcription
+            language: The language of the input audio in ISO-639-1 format
+            encoding: The encoding format of the audio data (required)
+            sample_rate: The sample rate of the audio in Hz (required)
+            min_volume: Volume threshold for voice activity detection (0.0-1.0)
+            max_silence_duration_secs: Maximum duration of silence before endpointing
+        Returns:
+            SttWebsocket: A connected WebSocket client for STT operations.
+        Example:
+            >>> client = Cartesia(api_key="your-api-key")
+            >>> ws = client.stt.websocket()
+            >>> for result in ws.transcribe(audio_chunks):
+            ...     print(result["text"])
+        """
+        client_headers = self._client_wrapper.get_headers()
+        ws = SttWebsocket(
+            ws_url=self._ws_url(),
+            cartesia_version=client_headers["Cartesia-Version"],
+            api_key=client_headers["X-API-Key"],
+        )
+        # Auto-connect like TTS does for consistency
+        ws.connect(
+            model=model,
+            language=language,
+            encoding=encoding,
+            sample_rate=sample_rate,
+            min_volume=min_volume,
+            max_silence_duration_secs=max_silence_duration_secs,
+        )
+        return ws
+class AsyncSttClientWithWebsocket(AsyncSttClient):
+    """
+    Extension of STT functionality that supports an asynchronous WebSocket STT connection.
+    """
+    def __init__(self, *, client_wrapper: AsyncClientWrapper, get_session):
+        super().__init__(client_wrapper=client_wrapper)
+        self._get_session = get_session
+    def _ws_url(self) -> str:
+        base_url = self._client_wrapper.get_base_url()
+        if base_url.startswith("ws://") or base_url.startswith("wss://"):
+            return base_url
+        else:
+            prefix = "ws" if "localhost" in base_url else "wss"
+            base_url_without_protocol = base_url.split("://")[-1]
+            return f"{prefix}://{base_url_without_protocol}"
+    async def websocket(
+        self,
+        *,
+        model: str = "ink-whisper",
+        language: Optional[str] = "en",
+        encoding: SttEncoding = "pcm_s16le",
+        sample_rate: int = 16000,
+        min_volume: Optional[float] = None,
+        max_silence_duration_secs: Optional[float] = None,
+    ):
+        """Create an async WebSocket connection for real-time speech transcription.
+        Args:
+            model: ID of the model to use for transcription
+            language: The language of the input audio in ISO-639-1 format
+            encoding: The encoding format of the audio data (required)
+            sample_rate: The sample rate of the audio in Hz (required)
+            min_volume: Volume threshold for voice activity detection (0.0-1.0)
+            max_silence_duration_secs: Maximum duration of silence before endpointing
+        Returns:
+            AsyncSttWebsocket: A connected async WebSocket client for STT operations.
+        Example:
+            >>> client = AsyncCartesia(api_key="your-api-key")
+            >>> ws = await client.stt.websocket()
+            >>> async for result in ws.transcribe(audio_chunks):
+            ...     print(result["text"])
+        """
+        client_headers = self._client_wrapper.get_headers()
+        ws = AsyncSttWebsocket(
+            ws_url=self._ws_url(),
+            cartesia_version=client_headers["Cartesia-Version"],
+            api_key=client_headers["X-API-Key"],
+            get_session=self._get_session,
+        )
+        # Auto-connect like TTS does for consistency
+        await ws.connect(
+            model=model,
+            language=language,
+            encoding=encoding,
+            sample_rate=sample_rate,
+            min_volume=min_volume,
+            max_silence_duration_secs=max_silence_duration_secs,
+        )
+        return ws

cartesia/stt/types/__init__.py ADDED Viewed

@@ -0,0 +1,33 @@
+# This file was auto-generated by Fern from our API Definition.
+from .done_message import DoneMessage
+from .error_message import ErrorMessage
+from .flush_done_message import FlushDoneMessage
+from .streaming_transcription_response import (
+    StreamingTranscriptionResponse,
+    StreamingTranscriptionResponse_Done,
+    StreamingTranscriptionResponse_Error,
+    StreamingTranscriptionResponse_FlushDone,
+    StreamingTranscriptionResponse_Transcript,
+)
+from .stt_encoding import SttEncoding
+from .timestamp_granularity import TimestampGranularity
+from .transcript_message import TranscriptMessage
+from .transcription_response import TranscriptionResponse
+from .transcription_word import TranscriptionWord
+__all__ = [
+    "DoneMessage",
+    "ErrorMessage",
+    "FlushDoneMessage",
+    "StreamingTranscriptionResponse",
+    "StreamingTranscriptionResponse_Done",
+    "StreamingTranscriptionResponse_Error",
+    "StreamingTranscriptionResponse_FlushDone",
+    "StreamingTranscriptionResponse_Transcript",
+    "SttEncoding",
+    "TimestampGranularity",
+    "TranscriptMessage",
+    "TranscriptionResponse",
+    "TranscriptionWord",
+]

cartesia/stt/types/done_message.py ADDED Viewed

@@ -0,0 +1,26 @@
+# This file was auto-generated by Fern from our API Definition.
+from ...core.pydantic_utilities import UniversalBaseModel
+import pydantic
+from ...core.pydantic_utilities import IS_PYDANTIC_V2
+import typing
+class DoneMessage(UniversalBaseModel):
+    """
+    Acknowledgment message sent in response to a `done` command, indicating that the session is complete and the WebSocket will close.
+    """
+    request_id: str = pydantic.Field()
+    """
+    Unique identifier for this transcription session.
+    """
+    if IS_PYDANTIC_V2:
+        model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True)  # type: ignore # Pydantic v2
+    else:
+        class Config:
+            frozen = True
+            smart_union = True
+            extra = pydantic.Extra.allow

cartesia/stt/types/error_message.py ADDED Viewed

@@ -0,0 +1,27 @@
+# This file was auto-generated by Fern from our API Definition.
+from ...core.pydantic_utilities import UniversalBaseModel
+import typing
+import pydantic
+from ...core.pydantic_utilities import IS_PYDANTIC_V2
+class ErrorMessage(UniversalBaseModel):
+    request_id: typing.Optional[str] = pydantic.Field(default=None)
+    """
+    The request ID associated with the error, if applicable.
+    """
+    message: str = pydantic.Field()
+    """
+    Human-readable error message describing what went wrong.
+    """
+    if IS_PYDANTIC_V2:
+        model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True)  # type: ignore # Pydantic v2
+    else:
+        class Config:
+            frozen = True
+            smart_union = True
+            extra = pydantic.Extra.allow

cartesia/stt/types/flush_done_message.py ADDED Viewed

@@ -0,0 +1,26 @@
+# This file was auto-generated by Fern from our API Definition.
+from ...core.pydantic_utilities import UniversalBaseModel
+import pydantic
+from ...core.pydantic_utilities import IS_PYDANTIC_V2
+import typing
+class FlushDoneMessage(UniversalBaseModel):
+    """
+    Acknowledgment message sent in response to a `finalize` command, indicating that all buffered audio has been flushed and processed.
+    """
+    request_id: str = pydantic.Field()
+    """
+    Unique identifier for this transcription session.
+    """
+    if IS_PYDANTIC_V2:
+        model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True)  # type: ignore # Pydantic v2
+    else:
+        class Config:
+            frozen = True
+            smart_union = True
+            extra = pydantic.Extra.allow

cartesia/stt/types/streaming_transcription_response.py ADDED Viewed

@@ -0,0 +1,94 @@
+# This file was auto-generated by Fern from our API Definition.
+from __future__ import annotations
+from ...core.pydantic_utilities import UniversalBaseModel
+import typing
+from .transcription_word import TranscriptionWord
+from ...core.pydantic_utilities import IS_PYDANTIC_V2
+import pydantic
+class StreamingTranscriptionResponse_Transcript(UniversalBaseModel):
+    """
+    The server sends transcription results, control messages, or error messages. Each message has a `type` field to distinguish between different message types.
+    """
+    type: typing.Literal["transcript"] = "transcript"
+    request_id: str
+    text: str
+    is_final: bool
+    duration: typing.Optional[float] = None
+    language: typing.Optional[str] = None
+    words: typing.Optional[typing.List[TranscriptionWord]] = None
+    if IS_PYDANTIC_V2:
+        model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True)  # type: ignore # Pydantic v2
+    else:
+        class Config:
+            frozen = True
+            smart_union = True
+            extra = pydantic.Extra.allow
+class StreamingTranscriptionResponse_FlushDone(UniversalBaseModel):
+    """
+    The server sends transcription results, control messages, or error messages. Each message has a `type` field to distinguish between different message types.
+    """
+    type: typing.Literal["flush_done"] = "flush_done"
+    request_id: str
+    if IS_PYDANTIC_V2:
+        model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True)  # type: ignore # Pydantic v2
+    else:
+        class Config:
+            frozen = True
+            smart_union = True
+            extra = pydantic.Extra.allow
+class StreamingTranscriptionResponse_Done(UniversalBaseModel):
+    """
+    The server sends transcription results, control messages, or error messages. Each message has a `type` field to distinguish between different message types.
+    """
+    type: typing.Literal["done"] = "done"
+    request_id: str
+    if IS_PYDANTIC_V2:
+        model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True)  # type: ignore # Pydantic v2
+    else:
+        class Config:
+            frozen = True
+            smart_union = True
+            extra = pydantic.Extra.allow
+class StreamingTranscriptionResponse_Error(UniversalBaseModel):
+    """
+    The server sends transcription results, control messages, or error messages. Each message has a `type` field to distinguish between different message types.
+    """
+    type: typing.Literal["error"] = "error"
+    request_id: typing.Optional[str] = None
+    message: str
+    if IS_PYDANTIC_V2:
+        model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True)  # type: ignore # Pydantic v2
+    else:
+        class Config:
+            frozen = True
+            smart_union = True
+            extra = pydantic.Extra.allow
+StreamingTranscriptionResponse = typing.Union[
+    StreamingTranscriptionResponse_Transcript,
+    StreamingTranscriptionResponse_FlushDone,
+    StreamingTranscriptionResponse_Done,
+    StreamingTranscriptionResponse_Error,
+]

cartesia/stt/types/stt_encoding.py ADDED Viewed

@@ -0,0 +1,7 @@
+# This file was auto-generated by Fern from our API Definition.
+import typing
+SttEncoding = typing.Union[
+    typing.Literal["pcm_s16le", "pcm_s32le", "pcm_f16le", "pcm_f32le", "pcm_mulaw", "pcm_alaw"], typing.Any
+]

cartesia/stt/types/timestamp_granularity.py ADDED Viewed

@@ -0,0 +1,5 @@
+# This file was auto-generated by Fern from our API Definition.
+import typing
+TimestampGranularity = typing.Union[typing.Literal["word"], typing.Any]

cartesia/stt/types/transcript_message.py ADDED Viewed

@@ -0,0 +1,50 @@
+# This file was auto-generated by Fern from our API Definition.
+from ...core.pydantic_utilities import UniversalBaseModel
+import pydantic
+import typing
+from .transcription_word import TranscriptionWord
+from ...core.pydantic_utilities import IS_PYDANTIC_V2
+class TranscriptMessage(UniversalBaseModel):
+    request_id: str = pydantic.Field()
+    """
+    Unique identifier for this transcription session.
+    """
+    text: str = pydantic.Field()
+    """
+    The transcribed text. May be partial or final depending on is_final.
+    **Note**: Text may be empty in initial responses while the system accumulates sufficient audio for transcription. This is normal behavior - wait for responses with non-empty text or monitor is_final for completion status.
+    """
+    is_final: bool = pydantic.Field()
+    """
+    Whether this is a final transcription result or an interim result.
+    """
+    duration: typing.Optional[float] = pydantic.Field(default=None)
+    """
+    The duration of the audio transcribed so far, in seconds.
+    """
+    language: typing.Optional[str] = pydantic.Field(default=None)
+    """
+    The specified language of the input audio.
+    """
+    words: typing.Optional[typing.List[TranscriptionWord]] = pydantic.Field(default=None)
+    """
+    Word-level timestamps showing the start and end time of each word in seconds. Always included in streaming responses.
+    """
+    if IS_PYDANTIC_V2:
+        model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True)  # type: ignore # Pydantic v2
+    else:
+        class Config:
+            frozen = True
+            smart_union = True
+            extra = pydantic.Extra.allow

cartesia/stt/types/transcription_response.py ADDED Viewed

@@ -0,0 +1,38 @@
+# This file was auto-generated by Fern from our API Definition.
+from ...core.pydantic_utilities import UniversalBaseModel
+import pydantic
+import typing
+from .transcription_word import TranscriptionWord
+from ...core.pydantic_utilities import IS_PYDANTIC_V2
+class TranscriptionResponse(UniversalBaseModel):
+    text: str = pydantic.Field()
+    """
+    The transcribed text.
+    """
+    language: typing.Optional[str] = pydantic.Field(default=None)
+    """
+    The specified language of the input audio.
+    """
+    duration: typing.Optional[float] = pydantic.Field(default=None)
+    """
+    The duration of the input audio in seconds.
+    """
+    words: typing.Optional[typing.List[TranscriptionWord]] = pydantic.Field(default=None)
+    """
+    Word-level timestamps showing the start and end time of each word. Only included when `[word]` is passed into `timestamp_granularities[]`.
+    """
+    if IS_PYDANTIC_V2:
+        model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True)  # type: ignore # Pydantic v2
+    else:
+        class Config:
+            frozen = True
+            smart_union = True
+            extra = pydantic.Extra.allow

cartesia/stt/types/transcription_word.py ADDED Viewed

@@ -0,0 +1,32 @@
+# This file was auto-generated by Fern from our API Definition.
+from ...core.pydantic_utilities import UniversalBaseModel
+import pydantic
+from ...core.pydantic_utilities import IS_PYDANTIC_V2
+import typing
+class TranscriptionWord(UniversalBaseModel):
+    word: str = pydantic.Field()
+    """
+    The transcribed word.
+    """
+    start: float = pydantic.Field()
+    """
+    Start time of the word in seconds.
+    """
+    end: float = pydantic.Field()
+    """
+    End time of the word in seconds.
+    """
+    if IS_PYDANTIC_V2:
+        model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True)  # type: ignore # Pydantic v2
+    else:
+        class Config:
+            frozen = True
+            smart_union = True
+            extra = pydantic.Extra.allow

cartesia/tts/__init__.py CHANGED Viewed

@@ -19,11 +19,13 @@ from .types import (
     RawEncoding,
     RawOutputFormat,
     Speed,
+    SseOutputFormat,
     SupportedLanguage,
     TtsRequest,
     TtsRequestEmbeddingSpecifier,
     TtsRequestIdSpecifier,
     TtsRequestVoiceSpecifier,
+    TtssseRequest,
     WavOutputFormat,
     WebSocketBaseResponse,
     WebSocketChunkResponse,
@@ -58,10 +60,12 @@ from .requests import (
     PhonemeTimestampsParams,
     RawOutputFormatParams,
     SpeedParams,
+    SseOutputFormatParams,
     TtsRequestEmbeddingSpecifierParams,
     TtsRequestIdSpecifierParams,
     TtsRequestParams,
     TtsRequestVoiceSpecifierParams,
+    TtssseRequestParams,
     WavOutputFormatParams,
     WebSocketBaseResponseParams,
     WebSocketChunkResponseParams,
@@ -115,6 +119,8 @@ __all__ = [
     "RawOutputFormatParams",
     "Speed",
     "SpeedParams",
+    "SseOutputFormat",
+    "SseOutputFormatParams",
     "SupportedLanguage",
     "TtsRequest",
     "TtsRequestEmbeddingSpecifier",
@@ -124,6 +130,8 @@ __all__ = [
     "TtsRequestParams",
     "TtsRequestVoiceSpecifier",
     "TtsRequestVoiceSpecifierParams",
+    "TtssseRequest",
+    "TtssseRequestParams",
     "WavOutputFormat",
     "WavOutputFormatParams",
     "WebSocketBaseResponse",

cartesia/tts/client.py CHANGED Viewed

@@ -10,6 +10,8 @@ from ..core.request_options import RequestOptions
 from ..core.serialization import convert_and_respect_annotation_metadata
 from json.decoder import JSONDecodeError
 from ..core.api_error import ApiError
+from .requests.sse_output_format import SseOutputFormatParams
+from .types.context_id import ContextId
 from .types.web_socket_response import WebSocketResponse
 import httpx_sse
 from ..core.pydantic_utilities import parse_obj_as
@@ -119,10 +121,14 @@ class TtsClient:
         model_id: str,
         transcript: str,
         voice: TtsRequestVoiceSpecifierParams,
-        output_format: OutputFormatParams,
+        output_format: SseOutputFormatParams,
         language: typing.Optional[SupportedLanguage] = OMIT,
         duration: typing.Optional[float] = OMIT,
         speed: typing.Optional[ModelSpeed] = OMIT,
+        add_timestamps: typing.Optional[bool] = OMIT,
+        add_phoneme_timestamps: typing.Optional[bool] = OMIT,
+        use_normalized_timestamps: typing.Optional[bool] = OMIT,
+        context_id: typing.Optional[ContextId] = OMIT,
         request_options: typing.Optional[RequestOptions] = None,
     ) -> typing.Iterator[WebSocketResponse]:
         """
@@ -135,7 +141,7 @@ class TtsClient:
         voice : TtsRequestVoiceSpecifierParams
-        output_format : OutputFormatParams
+        output_format : SseOutputFormatParams
         language : typing.Optional[SupportedLanguage]
@@ -145,6 +151,18 @@ class TtsClient:
         speed : typing.Optional[ModelSpeed]
+        add_timestamps : typing.Optional[bool]
+            Whether to return word-level timestamps. If `false` (default), no word timestamps will be produced at all. If `true`, the server will return timestamp events containing word-level timing information.
+        add_phoneme_timestamps : typing.Optional[bool]
+            Whether to return phoneme-level timestamps. If `false` (default), no phoneme timestamps will be produced - if `add_timestamps` is `true`, the produced timestamps will be word timestamps instead. If `true`, the server will return timestamp events containing phoneme-level timing information.
+        use_normalized_timestamps : typing.Optional[bool]
+            Whether to use normalized timestamps (True) or original timestamps (False).
+        context_id : typing.Optional[ContextId]
+            Optional context ID for this request.
         request_options : typing.Optional[RequestOptions]
             Request-specific configuration.
@@ -165,9 +183,9 @@ class TtsClient:
             voice={"mode": "id", "id": "694f9389-aac1-45b6-b726-9d9369183238"},
             language="en",
             output_format={
+                "container": "raw",
                 "sample_rate": 44100,
                 "encoding": "pcm_f32le",
-                "container": "raw",
             },
         )
         for chunk in response:
@@ -184,10 +202,14 @@ class TtsClient:
                 ),
                 "language": language,
                 "output_format": convert_and_respect_annotation_metadata(
-                    object_=output_format, annotation=OutputFormatParams, direction="write"
+                    object_=output_format, annotation=SseOutputFormatParams, direction="write"
                 ),
                 "duration": duration,
                 "speed": speed,
+                "add_timestamps": add_timestamps,
+                "add_phoneme_timestamps": add_phoneme_timestamps,
+                "use_normalized_timestamps": use_normalized_timestamps,
+                "context_id": context_id,
             },
             request_options=request_options,
             omit=OMIT,
@@ -321,10 +343,14 @@ class AsyncTtsClient:
         model_id: str,
         transcript: str,
         voice: TtsRequestVoiceSpecifierParams,
-        output_format: OutputFormatParams,
+        output_format: SseOutputFormatParams,
         language: typing.Optional[SupportedLanguage] = OMIT,
         duration: typing.Optional[float] = OMIT,
         speed: typing.Optional[ModelSpeed] = OMIT,
+        add_timestamps: typing.Optional[bool] = OMIT,
+        add_phoneme_timestamps: typing.Optional[bool] = OMIT,
+        use_normalized_timestamps: typing.Optional[bool] = OMIT,
+        context_id: typing.Optional[ContextId] = OMIT,
         request_options: typing.Optional[RequestOptions] = None,
     ) -> typing.AsyncIterator[WebSocketResponse]:
         """
@@ -337,7 +363,7 @@ class AsyncTtsClient:
         voice : TtsRequestVoiceSpecifierParams
-        output_format : OutputFormatParams
+        output_format : SseOutputFormatParams
         language : typing.Optional[SupportedLanguage]
@@ -347,6 +373,18 @@ class AsyncTtsClient:
         speed : typing.Optional[ModelSpeed]
+        add_timestamps : typing.Optional[bool]
+            Whether to return word-level timestamps. If `false` (default), no word timestamps will be produced at all. If `true`, the server will return timestamp events containing word-level timing information.
+        add_phoneme_timestamps : typing.Optional[bool]
+            Whether to return phoneme-level timestamps. If `false` (default), no phoneme timestamps will be produced - if `add_timestamps` is `true`, the produced timestamps will be word timestamps instead. If `true`, the server will return timestamp events containing phoneme-level timing information.
+        use_normalized_timestamps : typing.Optional[bool]
+            Whether to use normalized timestamps (True) or original timestamps (False).
+        context_id : typing.Optional[ContextId]
+            Optional context ID for this request.
         request_options : typing.Optional[RequestOptions]
             Request-specific configuration.
@@ -372,9 +410,9 @@ class AsyncTtsClient:
                 voice={"mode": "id", "id": "694f9389-aac1-45b6-b726-9d9369183238"},
                 language="en",
                 output_format={
+                    "container": "raw",
                     "sample_rate": 44100,
                     "encoding": "pcm_f32le",
-                    "container": "raw",
                 },
             )
             async for chunk in response:
@@ -394,10 +432,14 @@ class AsyncTtsClient:
                 ),
                 "language": language,
                 "output_format": convert_and_respect_annotation_metadata(
-                    object_=output_format, annotation=OutputFormatParams, direction="write"
+                    object_=output_format, annotation=SseOutputFormatParams, direction="write"
                 ),
                 "duration": duration,
                 "speed": speed,
+                "add_timestamps": add_timestamps,
+                "add_phoneme_timestamps": add_phoneme_timestamps,
+                "use_normalized_timestamps": use_normalized_timestamps,
+                "context_id": context_id,
             },
             request_options=request_options,
             omit=OMIT,

cartesia 2.0.4__py3-none-any.whl → 2.0.6__py3-none-any.whl

cartesia 2.0.4__py3-none-any.whl → 2.0.6__py3-none-any.whl