cartesia 2.0.5__py3-none-any.whl → 2.0.7__py3-none-any.whl
This diff compares two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the package contents exactly as they appear in the public registry.
- cartesia/__init__.py +22 -0
- cartesia/auth/client.py +8 -8
- cartesia/auth/requests/token_grant.py +7 -1
- cartesia/auth/requests/token_request.py +3 -3
- cartesia/auth/types/token_grant.py +7 -2
- cartesia/auth/types/token_request.py +3 -3
- cartesia/core/client_wrapper.py +1 -1
- cartesia/infill/client.py +0 -8
- cartesia/stt/__init__.py +6 -0
- cartesia/stt/_async_websocket.py +81 -72
- cartesia/stt/_websocket.py +42 -20
- cartesia/stt/client.py +450 -0
- cartesia/stt/requests/__init__.py +2 -0
- cartesia/stt/requests/streaming_transcription_response.py +2 -0
- cartesia/stt/requests/transcript_message.py +8 -1
- cartesia/stt/requests/transcription_response.py +8 -1
- cartesia/stt/requests/transcription_word.py +20 -0
- cartesia/stt/socket_client.py +52 -109
- cartesia/stt/types/__init__.py +4 -0
- cartesia/stt/types/streaming_transcription_response.py +2 -0
- cartesia/stt/types/stt_encoding.py +3 -1
- cartesia/stt/types/timestamp_granularity.py +5 -0
- cartesia/stt/types/transcript_message.py +7 -1
- cartesia/stt/types/transcription_response.py +7 -1
- cartesia/stt/types/transcription_word.py +32 -0
- cartesia/tts/__init__.py +16 -0
- cartesia/tts/client.py +63 -8
- cartesia/tts/requests/__init__.py +8 -0
- cartesia/tts/requests/experimental_model_controls.py +17 -0
- cartesia/tts/requests/generation_config.py +23 -0
- cartesia/tts/requests/generation_request.py +4 -4
- cartesia/tts/requests/sse_output_format.py +11 -0
- cartesia/tts/requests/tts_request.py +2 -0
- cartesia/tts/requests/ttssse_request.py +47 -0
- cartesia/tts/requests/web_socket_chunk_response.py +0 -3
- cartesia/tts/requests/web_socket_response.py +1 -2
- cartesia/tts/requests/web_socket_tts_request.py +9 -1
- cartesia/tts/types/__init__.py +8 -0
- cartesia/tts/types/experimental_model_controls.py +28 -0
- cartesia/tts/types/generation_config.py +34 -0
- cartesia/tts/types/generation_request.py +4 -4
- cartesia/tts/types/sse_output_format.py +22 -0
- cartesia/tts/types/tts_request.py +2 -0
- cartesia/tts/types/ttssse_request.py +58 -0
- cartesia/tts/types/web_socket_chunk_response.py +1 -3
- cartesia/tts/types/web_socket_response.py +1 -2
- cartesia/tts/types/web_socket_tts_request.py +11 -3
- cartesia/voice_changer/client.py +0 -8
- cartesia/voice_changer/requests/streaming_response.py +0 -2
- cartesia/voice_changer/types/streaming_response.py +0 -2
- cartesia/voices/client.py +0 -12
- cartesia-2.0.7.dist-info/LICENSE +201 -0
- {cartesia-2.0.5.dist-info → cartesia-2.0.7.dist-info}/METADATA +116 -17
- {cartesia-2.0.5.dist-info → cartesia-2.0.7.dist-info}/RECORD +55 -42
- {cartesia-2.0.5.dist-info → cartesia-2.0.7.dist-info}/WHEEL +1 -1
@@ -0,0 +1,20 @@
+# This file was auto-generated by Fern from our API Definition.
+
+import typing_extensions
+
+
+class TranscriptionWordParams(typing_extensions.TypedDict):
+    word: str
+    """
+    The transcribed word.
+    """
+
+    start: float
+    """
+    Start time of the word in seconds.
+    """
+
+    end: float
+    """
+    End time of the word in seconds.
+    """
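The request-side type is a plain `typing_extensions.TypedDict`, so word-timestamp entries can be written as ordinary dict literals. A minimal sketch (the sample values are illustrative, and the import path assumes the accompanying `cartesia/stt/requests/__init__.py` re-export):

from cartesia.stt.requests import TranscriptionWordParams

# A word timestamp is just a dict matching the TypedDict shape.
word: TranscriptionWordParams = {
    "word": "hello",  # the transcribed word
    "start": 0.12,    # start time in seconds
    "end": 0.48,      # end time in seconds
}
print(word["end"] - word["start"])  # per-word duration in seconds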
cartesia/stt/socket_client.py
CHANGED
@@ -1,18 +1,20 @@
 import typing
-from typing import Any, Dict, Generator, Optional
+from typing import Any, Dict, Generator, Optional

 from ..core.client_wrapper import AsyncClientWrapper, SyncClientWrapper
 from ._async_websocket import AsyncSttWebsocket
 from ._websocket import SttWebsocket
+from .client import AsyncSttClient, SttClient
+from .types.stt_encoding import SttEncoding


-class SttClientWithWebsocket:
+class SttClientWithWebsocket(SttClient):
     """
     Extension of STT functionality that supports a synchronous WebSocket STT connection.
     """

     def __init__(self, *, client_wrapper: SyncClientWrapper):
-
+        super().__init__(client_wrapper=client_wrapper)

     def _ws_url(self):
         base_url = self._client_wrapper.get_base_url()
@@ -23,21 +25,34 @@ class SttClientWithWebsocket:
         base_url_without_protocol = base_url.split("://")[-1]
         return f"{prefix}://{base_url_without_protocol}"

-    def websocket(
-
-
-
-
+    def websocket(
+        self,
+        *,
+        model: str = "ink-whisper",
+        language: Optional[str] = "en",
+        encoding: SttEncoding = "pcm_s16le",
+        sample_rate: int = 16000,
+        min_volume: Optional[float] = None,
+        max_silence_duration_secs: Optional[float] = None,
+    ):
         """Create a WebSocket connection for real-time speech transcription.

         Args:
             model: ID of the model to use for transcription
             language: The language of the input audio in ISO-639-1 format
-            encoding: The encoding format of the audio data
-            sample_rate: The sample rate of the audio in Hz
+            encoding: The encoding format of the audio data (required)
+            sample_rate: The sample rate of the audio in Hz (required)
+            min_volume: Volume threshold for voice activity detection (0.0-1.0)
+            max_silence_duration_secs: Maximum duration of silence before endpointing

         Returns:
             SttWebsocket: A connected WebSocket client for STT operations.
+
+        Example:
+            >>> client = Cartesia(api_key="your-api-key")
+            >>> ws = client.stt.websocket()
+            >>> for result in ws.transcribe(audio_chunks):
+            ...     print(result["text"])
         """
         client_headers = self._client_wrapper.get_headers()
         ws = SttWebsocket(
@@ -51,61 +66,19 @@ class SttClientWithWebsocket:
             language=language,
             encoding=encoding,
             sample_rate=sample_rate,
+            min_volume=min_volume,
+            max_silence_duration_secs=max_silence_duration_secs,
         )
         return ws

-    def transcribe(
-        self,
-        audio_chunks: typing.Iterator[bytes],
-        *,
-        model: str = "ink-whisper",
-        language: Optional[str] = "en",
-        encoding: Optional[str] = "pcm_s16le",
-        sample_rate: int = 16000,
-    ) -> Generator[Dict[str, Any], None, None]:
-        """Transcribe audio chunks using WebSocket.

-        Args:
-            audio_chunks: Iterator of audio chunks as bytes
-            model: ID of the model to use for transcription
-            language: The language of the input audio in ISO-639-1 format
-            encoding: The encoding format of the audio data
-            sample_rate: The sample rate of the audio in Hz
-
-        Yields:
-            Dictionary containing transcription results, flush_done, done, or error messages
-
-        Example:
-            >>> client = Cartesia(api_key="your-api-key")
-            >>> ws_client = client.stt.websocket()
-            >>> for result in ws_client.transcribe(audio_chunks):
-            ...     print(result["text"])
-        """
-        ws = self.websocket(
-            model=model,
-            language=language,
-            encoding=encoding,
-            sample_rate=sample_rate,
-        )
-        try:
-            yield from ws.transcribe(
-                audio_chunks,
-                model=model,
-                language=language,
-                encoding=encoding,
-                sample_rate=sample_rate,
-            )
-        finally:
-            ws.close()
-
-
-class AsyncSttClientWithWebsocket:
+class AsyncSttClientWithWebsocket(AsyncSttClient):
     """
     Extension of STT functionality that supports an asynchronous WebSocket STT connection.
     """

     def __init__(self, *, client_wrapper: AsyncClientWrapper, get_session):
-
+        super().__init__(client_wrapper=client_wrapper)
         self._get_session = get_session

     def _ws_url(self) -> str:
@@ -117,21 +90,34 @@ class AsyncSttClientWithWebsocket:
         base_url_without_protocol = base_url.split("://")[-1]
         return f"{prefix}://{base_url_without_protocol}"

-    async def websocket(
-
-
-
-
+    async def websocket(
+        self,
+        *,
+        model: str = "ink-whisper",
+        language: Optional[str] = "en",
+        encoding: SttEncoding = "pcm_s16le",
+        sample_rate: int = 16000,
+        min_volume: Optional[float] = None,
+        max_silence_duration_secs: Optional[float] = None,
+    ):
         """Create an async WebSocket connection for real-time speech transcription.

         Args:
             model: ID of the model to use for transcription
             language: The language of the input audio in ISO-639-1 format
-            encoding: The encoding format of the audio data
-            sample_rate: The sample rate of the audio in Hz
+            encoding: The encoding format of the audio data (required)
+            sample_rate: The sample rate of the audio in Hz (required)
+            min_volume: Volume threshold for voice activity detection (0.0-1.0)
+            max_silence_duration_secs: Maximum duration of silence before endpointing

         Returns:
             AsyncSttWebsocket: A connected async WebSocket client for STT operations.
+
+        Example:
+            >>> client = AsyncCartesia(api_key="your-api-key")
+            >>> ws = await client.stt.websocket()
+            >>> async for result in ws.transcribe(audio_chunks):
+            ...     print(result["text"])
         """
         client_headers = self._client_wrapper.get_headers()
         ws = AsyncSttWebsocket(
@@ -146,50 +132,7 @@ class AsyncSttClientWithWebsocket:
             language=language,
             encoding=encoding,
             sample_rate=sample_rate,
+            min_volume=min_volume,
+            max_silence_duration_secs=max_silence_duration_secs,
         )
-        return ws
-
-    async def transcribe(
-        self,
-        audio_chunks: typing.AsyncIterator[bytes],
-        *,
-        model: str = "ink-whisper",
-        language: Optional[str] = "en",
-        encoding: Optional[str] = "pcm_s16le",
-        sample_rate: int = 16000,
-    ) -> typing.AsyncGenerator[Dict[str, Any], None]:
-        """Transcribe audio chunks using async WebSocket.
-
-        Args:
-            audio_chunks: Async iterator of audio chunks as bytes
-            model: ID of the model to use for transcription
-            language: The language of the input audio in ISO-639-1 format
-            encoding: The encoding format of the audio data
-            sample_rate: The sample rate of the audio in Hz
-
-        Yields:
-            Dictionary containing transcription results, flush_done, done, or error messages
-
-        Example:
-            >>> client = AsyncCartesia(api_key="your-api-key")
-            >>> ws_client = await client.stt.websocket()
-            >>> async for result in ws_client.transcribe(audio_chunks):
-            ...     print(result["text"])
-        """
-        ws = await self.websocket(
-            model=model,
-            language=language,
-            encoding=encoding,
-            sample_rate=sample_rate,
-        )
-        try:
-            async for result in ws.transcribe(
-                audio_chunks,
-                model=model,
-                language=language,
-                encoding=encoding,
-                sample_rate=sample_rate,
-            ):
-                yield result
-        finally:
-            await ws.close()
+        return ws
cartesia/stt/types/__init__.py
CHANGED
@@ -11,8 +11,10 @@ from .streaming_transcription_response import (
     StreamingTranscriptionResponse_Transcript,
 )
 from .stt_encoding import SttEncoding
+from .timestamp_granularity import TimestampGranularity
 from .transcript_message import TranscriptMessage
 from .transcription_response import TranscriptionResponse
+from .transcription_word import TranscriptionWord

 __all__ = [
     "DoneMessage",
@@ -24,6 +26,8 @@ __all__ = [
     "StreamingTranscriptionResponse_FlushDone",
     "StreamingTranscriptionResponse_Transcript",
     "SttEncoding",
+    "TimestampGranularity",
     "TranscriptMessage",
     "TranscriptionResponse",
+    "TranscriptionWord",
 ]
cartesia/stt/types/streaming_transcription_response.py
CHANGED
@@ -3,6 +3,7 @@
 from __future__ import annotations
 from ...core.pydantic_utilities import UniversalBaseModel
 import typing
+from .transcription_word import TranscriptionWord
 from ...core.pydantic_utilities import IS_PYDANTIC_V2
 import pydantic

@@ -18,6 +19,7 @@ class StreamingTranscriptionResponse_Transcript(UniversalBaseModel):
     is_final: bool
     duration: typing.Optional[float] = None
     language: typing.Optional[str] = None
+    words: typing.Optional[typing.List[TranscriptionWord]] = None

     if IS_PYDANTIC_V2:
         model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True)  # type: ignore # Pydantic v2
cartesia/stt/types/transcript_message.py
CHANGED
@@ -3,6 +3,7 @@
 from ...core.pydantic_utilities import UniversalBaseModel
 import pydantic
 import typing
+from .transcription_word import TranscriptionWord
 from ...core.pydantic_utilities import IS_PYDANTIC_V2


@@ -31,7 +32,12 @@ class TranscriptMessage(UniversalBaseModel):

     language: typing.Optional[str] = pydantic.Field(default=None)
     """
-    The
+    The specified language of the input audio.
+    """
+
+    words: typing.Optional[typing.List[TranscriptionWord]] = pydantic.Field(default=None)
+    """
+    Word-level timestamps showing the start and end time of each word in seconds. Always included in streaming responses.
     """

     if IS_PYDANTIC_V2:
cartesia/stt/types/transcription_response.py
CHANGED
@@ -3,6 +3,7 @@
 from ...core.pydantic_utilities import UniversalBaseModel
 import pydantic
 import typing
+from .transcription_word import TranscriptionWord
 from ...core.pydantic_utilities import IS_PYDANTIC_V2


@@ -14,7 +15,7 @@ class TranscriptionResponse(UniversalBaseModel):

     language: typing.Optional[str] = pydantic.Field(default=None)
     """
-    The
+    The specified language of the input audio.
     """

     duration: typing.Optional[float] = pydantic.Field(default=None)
@@ -22,6 +23,11 @@
     The duration of the input audio in seconds.
     """

+    words: typing.Optional[typing.List[TranscriptionWord]] = pydantic.Field(default=None)
+    """
+    Word-level timestamps showing the start and end time of each word. Only included when `[word]` is passed into `timestamp_granularities[]`.
+    """
+
     if IS_PYDANTIC_V2:
         model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True)  # type: ignore # Pydantic v2
     else:
cartesia/stt/types/transcription_word.py
ADDED
@@ -0,0 +1,32 @@
+# This file was auto-generated by Fern from our API Definition.
+
+from ...core.pydantic_utilities import UniversalBaseModel
+import pydantic
+from ...core.pydantic_utilities import IS_PYDANTIC_V2
+import typing
+
+
+class TranscriptionWord(UniversalBaseModel):
+    word: str = pydantic.Field()
+    """
+    The transcribed word.
+    """
+
+    start: float = pydantic.Field()
+    """
+    Start time of the word in seconds.
+    """
+
+    end: float = pydantic.Field()
+    """
+    End time of the word in seconds.
+    """
+
+    if IS_PYDANTIC_V2:
+        model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True)  # type: ignore # Pydantic v2
+    else:
+
+        class Config:
+            frozen = True
+            smart_union = True
+            extra = pydantic.Extra.allow
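`TranscriptionWord` is the response-side counterpart of the `TranscriptionWordParams` TypedDict: a frozen pydantic model with required `word`, `start`, and `end` fields. A small sketch of consuming the new `words` lists (the sample values are illustrative):

from cartesia.stt.types import TranscriptionWord

words = [
    TranscriptionWord(word="hello", start=0.12, end=0.48),
    TranscriptionWord(word="world", start=0.55, end=0.98),
]

# Instances are frozen; fields are plain attributes.
for w in words:
    print(f"{w.word}: {w.end - w.start:.2f}s")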
cartesia/tts/__init__.py
CHANGED
@@ -5,7 +5,9 @@ from .types import (
     ContextId,
     Controls,
     Emotion,
+    ExperimentalModelControls,
     FlushId,
+    GenerationConfig,
     GenerationRequest,
     ModelSpeed,
     Mp3OutputFormat,
@@ -19,11 +21,13 @@ from .types import (
     RawEncoding,
     RawOutputFormat,
     Speed,
+    SseOutputFormat,
     SupportedLanguage,
     TtsRequest,
     TtsRequestEmbeddingSpecifier,
     TtsRequestIdSpecifier,
     TtsRequestVoiceSpecifier,
+    TtssseRequest,
     WavOutputFormat,
     WebSocketBaseResponse,
     WebSocketChunkResponse,
@@ -49,6 +53,8 @@ from .types import (
 from .requests import (
     CancelContextRequestParams,
     ControlsParams,
+    ExperimentalModelControlsParams,
+    GenerationConfigParams,
     GenerationRequestParams,
     Mp3OutputFormatParams,
     OutputFormatParams,
@@ -58,10 +64,12 @@ from .requests import (
     PhonemeTimestampsParams,
     RawOutputFormatParams,
     SpeedParams,
+    SseOutputFormatParams,
     TtsRequestEmbeddingSpecifierParams,
     TtsRequestIdSpecifierParams,
     TtsRequestParams,
     TtsRequestVoiceSpecifierParams,
+    TtssseRequestParams,
     WavOutputFormatParams,
     WebSocketBaseResponseParams,
     WebSocketChunkResponseParams,
@@ -92,7 +100,11 @@ __all__ = [
     "Controls",
     "ControlsParams",
     "Emotion",
+    "ExperimentalModelControls",
+    "ExperimentalModelControlsParams",
     "FlushId",
+    "GenerationConfig",
+    "GenerationConfigParams",
     "GenerationRequest",
     "GenerationRequestParams",
     "ModelSpeed",
@@ -115,6 +127,8 @@ __all__ = [
     "RawOutputFormatParams",
     "Speed",
     "SpeedParams",
+    "SseOutputFormat",
+    "SseOutputFormatParams",
     "SupportedLanguage",
     "TtsRequest",
     "TtsRequestEmbeddingSpecifier",
@@ -124,6 +138,8 @@ __all__ = [
     "TtsRequestParams",
     "TtsRequestVoiceSpecifier",
     "TtsRequestVoiceSpecifierParams",
+    "TtssseRequest",
+    "TtssseRequestParams",
     "WavOutputFormat",
     "WavOutputFormatParams",
     "WebSocketBaseResponse",
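The net effect on the public surface is a set of new names re-exported from `cartesia.tts`; per the `__all__` additions above, all of the following imports resolve in 2.0.7:

from cartesia.tts import (
    ExperimentalModelControls,
    ExperimentalModelControlsParams,
    GenerationConfig,
    GenerationConfigParams,
    SseOutputFormat,
    SseOutputFormatParams,
    TtssseRequest,
    TtssseRequestParams,
)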
cartesia/tts/client.py
CHANGED
@@ -6,10 +6,13 @@ from .requests.tts_request_voice_specifier import TtsRequestVoiceSpecifierParams
 from .requests.output_format import OutputFormatParams
 from .types.supported_language import SupportedLanguage
 from .types.model_speed import ModelSpeed
+from .requests.generation_config import GenerationConfigParams
 from ..core.request_options import RequestOptions
 from ..core.serialization import convert_and_respect_annotation_metadata
 from json.decoder import JSONDecodeError
 from ..core.api_error import ApiError
+from .requests.sse_output_format import SseOutputFormatParams
+from .types.context_id import ContextId
 from .types.web_socket_response import WebSocketResponse
 import httpx_sse
 from ..core.pydantic_utilities import parse_obj_as
@@ -34,6 +37,7 @@ class TtsClient:
         language: typing.Optional[SupportedLanguage] = OMIT,
         duration: typing.Optional[float] = OMIT,
         speed: typing.Optional[ModelSpeed] = OMIT,
+        generation_config: typing.Optional[GenerationConfigParams] = OMIT,
         request_options: typing.Optional[RequestOptions] = None,
     ) -> typing.Iterator[bytes]:
         """
@@ -56,6 +60,8 @@ class TtsClient:

         speed : typing.Optional[ModelSpeed]

+        generation_config : typing.Optional[GenerationConfigParams]
+
         request_options : typing.Optional[RequestOptions]
             Request-specific configuration. You can pass in configuration such as `chunk_size`, and more to customize the request and response.

@@ -97,6 +103,9 @@ class TtsClient:
                 ),
                 "duration": duration,
                 "speed": speed,
+                "generation_config": convert_and_respect_annotation_metadata(
+                    object_=generation_config, annotation=GenerationConfigParams, direction="write"
+                ),
             },
             request_options=request_options,
             omit=OMIT,
@@ -119,10 +128,14 @@ class TtsClient:
         model_id: str,
         transcript: str,
         voice: TtsRequestVoiceSpecifierParams,
-        output_format:
+        output_format: SseOutputFormatParams,
         language: typing.Optional[SupportedLanguage] = OMIT,
         duration: typing.Optional[float] = OMIT,
         speed: typing.Optional[ModelSpeed] = OMIT,
+        add_timestamps: typing.Optional[bool] = OMIT,
+        add_phoneme_timestamps: typing.Optional[bool] = OMIT,
+        use_normalized_timestamps: typing.Optional[bool] = OMIT,
+        context_id: typing.Optional[ContextId] = OMIT,
         request_options: typing.Optional[RequestOptions] = None,
     ) -> typing.Iterator[WebSocketResponse]:
         """
@@ -135,7 +148,7 @@ class TtsClient:

         voice : TtsRequestVoiceSpecifierParams

-        output_format :
+        output_format : SseOutputFormatParams

         language : typing.Optional[SupportedLanguage]

@@ -145,6 +158,18 @@ class TtsClient:

         speed : typing.Optional[ModelSpeed]

+        add_timestamps : typing.Optional[bool]
+            Whether to return word-level timestamps. If `false` (default), no word timestamps will be produced at all. If `true`, the server will return timestamp events containing word-level timing information.
+
+        add_phoneme_timestamps : typing.Optional[bool]
+            Whether to return phoneme-level timestamps. If `false` (default), no phoneme timestamps will be produced - if `add_timestamps` is `true`, the produced timestamps will be word timestamps instead. If `true`, the server will return timestamp events containing phoneme-level timing information.
+
+        use_normalized_timestamps : typing.Optional[bool]
+            Whether to use normalized timestamps (True) or original timestamps (False).
+
+        context_id : typing.Optional[ContextId]
+            Optional context ID for this request.
+
         request_options : typing.Optional[RequestOptions]
             Request-specific configuration.

@@ -165,9 +190,9 @@ class TtsClient:
             voice={"mode": "id", "id": "694f9389-aac1-45b6-b726-9d9369183238"},
             language="en",
             output_format={
+                "container": "raw",
                 "sample_rate": 44100,
                 "encoding": "pcm_f32le",
-                "container": "raw",
             },
         )
         for chunk in response:
@@ -184,10 +209,14 @@ class TtsClient:
                 ),
                 "language": language,
                 "output_format": convert_and_respect_annotation_metadata(
-                    object_=output_format, annotation=
+                    object_=output_format, annotation=SseOutputFormatParams, direction="write"
                 ),
                 "duration": duration,
                 "speed": speed,
+                "add_timestamps": add_timestamps,
+                "add_phoneme_timestamps": add_phoneme_timestamps,
+                "use_normalized_timestamps": use_normalized_timestamps,
+                "context_id": context_id,
             },
             request_options=request_options,
             omit=OMIT,
@@ -228,6 +257,7 @@ class AsyncTtsClient:
         language: typing.Optional[SupportedLanguage] = OMIT,
         duration: typing.Optional[float] = OMIT,
         speed: typing.Optional[ModelSpeed] = OMIT,
+        generation_config: typing.Optional[GenerationConfigParams] = OMIT,
         request_options: typing.Optional[RequestOptions] = None,
     ) -> typing.AsyncIterator[bytes]:
         """
@@ -250,6 +280,8 @@ class AsyncTtsClient:

         speed : typing.Optional[ModelSpeed]

+        generation_config : typing.Optional[GenerationConfigParams]
+
         request_options : typing.Optional[RequestOptions]
             Request-specific configuration. You can pass in configuration such as `chunk_size`, and more to customize the request and response.

@@ -299,6 +331,9 @@ class AsyncTtsClient:
                 ),
                 "duration": duration,
                 "speed": speed,
+                "generation_config": convert_and_respect_annotation_metadata(
+                    object_=generation_config, annotation=GenerationConfigParams, direction="write"
+                ),
             },
             request_options=request_options,
             omit=OMIT,
@@ -321,10 +356,14 @@ class AsyncTtsClient:
         model_id: str,
         transcript: str,
         voice: TtsRequestVoiceSpecifierParams,
-        output_format:
+        output_format: SseOutputFormatParams,
         language: typing.Optional[SupportedLanguage] = OMIT,
         duration: typing.Optional[float] = OMIT,
         speed: typing.Optional[ModelSpeed] = OMIT,
+        add_timestamps: typing.Optional[bool] = OMIT,
+        add_phoneme_timestamps: typing.Optional[bool] = OMIT,
+        use_normalized_timestamps: typing.Optional[bool] = OMIT,
+        context_id: typing.Optional[ContextId] = OMIT,
         request_options: typing.Optional[RequestOptions] = None,
     ) -> typing.AsyncIterator[WebSocketResponse]:
         """
@@ -337,7 +376,7 @@ class AsyncTtsClient:

         voice : TtsRequestVoiceSpecifierParams

-        output_format :
+        output_format : SseOutputFormatParams

         language : typing.Optional[SupportedLanguage]

@@ -347,6 +386,18 @@ class AsyncTtsClient:

         speed : typing.Optional[ModelSpeed]

+        add_timestamps : typing.Optional[bool]
+            Whether to return word-level timestamps. If `false` (default), no word timestamps will be produced at all. If `true`, the server will return timestamp events containing word-level timing information.
+
+        add_phoneme_timestamps : typing.Optional[bool]
+            Whether to return phoneme-level timestamps. If `false` (default), no phoneme timestamps will be produced - if `add_timestamps` is `true`, the produced timestamps will be word timestamps instead. If `true`, the server will return timestamp events containing phoneme-level timing information.
+
+        use_normalized_timestamps : typing.Optional[bool]
+            Whether to use normalized timestamps (True) or original timestamps (False).
+
+        context_id : typing.Optional[ContextId]
+            Optional context ID for this request.
+
         request_options : typing.Optional[RequestOptions]
             Request-specific configuration.

@@ -372,9 +423,9 @@ class AsyncTtsClient:
             voice={"mode": "id", "id": "694f9389-aac1-45b6-b726-9d9369183238"},
             language="en",
             output_format={
+                "container": "raw",
                 "sample_rate": 44100,
                 "encoding": "pcm_f32le",
-                "container": "raw",
             },
         )
         async for chunk in response:
@@ -394,10 +445,14 @@ class AsyncTtsClient:
                 ),
                 "language": language,
                 "output_format": convert_and_respect_annotation_metadata(
-                    object_=output_format, annotation=
+                    object_=output_format, annotation=SseOutputFormatParams, direction="write"
                 ),
                 "duration": duration,
                 "speed": speed,
+                "add_timestamps": add_timestamps,
+                "add_phoneme_timestamps": add_phoneme_timestamps,
+                "use_normalized_timestamps": use_normalized_timestamps,
+                "context_id": context_id,
             },
             request_options=request_options,
             omit=OMIT,