cartesia 2.0.5__py3-none-any.whl → 2.0.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cartesia/__init__.py +14 -0
- cartesia/auth/client.py +8 -8
- cartesia/auth/requests/token_grant.py +7 -1
- cartesia/auth/requests/token_request.py +3 -3
- cartesia/auth/types/token_grant.py +7 -2
- cartesia/auth/types/token_request.py +3 -3
- cartesia/core/client_wrapper.py +1 -1
- cartesia/stt/__init__.py +6 -0
- cartesia/stt/_async_websocket.py +81 -72
- cartesia/stt/_websocket.py +42 -20
- cartesia/stt/client.py +456 -0
- cartesia/stt/requests/__init__.py +2 -0
- cartesia/stt/requests/streaming_transcription_response.py +2 -0
- cartesia/stt/requests/transcript_message.py +8 -1
- cartesia/stt/requests/transcription_response.py +8 -1
- cartesia/stt/requests/transcription_word.py +20 -0
- cartesia/stt/socket_client.py +52 -109
- cartesia/stt/types/__init__.py +4 -0
- cartesia/stt/types/streaming_transcription_response.py +2 -0
- cartesia/stt/types/stt_encoding.py +3 -1
- cartesia/stt/types/timestamp_granularity.py +5 -0
- cartesia/stt/types/transcript_message.py +7 -1
- cartesia/stt/types/transcription_response.py +7 -1
- cartesia/stt/types/transcription_word.py +32 -0
- cartesia/tts/__init__.py +8 -0
- cartesia/tts/client.py +50 -8
- cartesia/tts/requests/__init__.py +4 -0
- cartesia/tts/requests/generation_request.py +4 -4
- cartesia/tts/requests/sse_output_format.py +11 -0
- cartesia/tts/requests/ttssse_request.py +47 -0
- cartesia/tts/requests/web_socket_chunk_response.py +0 -3
- cartesia/tts/requests/web_socket_response.py +1 -2
- cartesia/tts/requests/web_socket_tts_request.py +9 -1
- cartesia/tts/types/__init__.py +4 -0
- cartesia/tts/types/generation_request.py +4 -4
- cartesia/tts/types/sse_output_format.py +22 -0
- cartesia/tts/types/ttssse_request.py +58 -0
- cartesia/tts/types/web_socket_chunk_response.py +1 -3
- cartesia/tts/types/web_socket_response.py +1 -2
- cartesia/tts/types/web_socket_tts_request.py +11 -3
- cartesia/voice_changer/requests/streaming_response.py +0 -2
- cartesia/voice_changer/types/streaming_response.py +0 -2
- {cartesia-2.0.5.dist-info → cartesia-2.0.6.dist-info}/METADATA +113 -16
- {cartesia-2.0.5.dist-info → cartesia-2.0.6.dist-info}/RECORD +45 -37
- {cartesia-2.0.5.dist-info → cartesia-2.0.6.dist-info}/WHEEL +0 -0
cartesia/stt/socket_client.py
CHANGED
```diff
@@ -1,18 +1,20 @@
 import typing
-from typing import Any, Dict, Generator, Optional
+from typing import Any, Dict, Generator, Optional
 
 from ..core.client_wrapper import AsyncClientWrapper, SyncClientWrapper
 from ._async_websocket import AsyncSttWebsocket
 from ._websocket import SttWebsocket
+from .client import AsyncSttClient, SttClient
+from .types.stt_encoding import SttEncoding
 
 
-class SttClientWithWebsocket:
+class SttClientWithWebsocket(SttClient):
     """
     Extension of STT functionality that supports a synchronous WebSocket STT connection.
     """
 
     def __init__(self, *, client_wrapper: SyncClientWrapper):
-        self._client_wrapper = client_wrapper
+        super().__init__(client_wrapper=client_wrapper)
 
     def _ws_url(self):
        base_url = self._client_wrapper.get_base_url()
@@ -23,21 +25,34 @@ class SttClientWithWebsocket:
         base_url_without_protocol = base_url.split("://")[-1]
         return f"{prefix}://{base_url_without_protocol}"
 
-    def websocket(
-
-
-
-
+    def websocket(
+        self,
+        *,
+        model: str = "ink-whisper",
+        language: Optional[str] = "en",
+        encoding: SttEncoding = "pcm_s16le",
+        sample_rate: int = 16000,
+        min_volume: Optional[float] = None,
+        max_silence_duration_secs: Optional[float] = None,
+    ):
         """Create a WebSocket connection for real-time speech transcription.
 
         Args:
             model: ID of the model to use for transcription
             language: The language of the input audio in ISO-639-1 format
-            encoding: The encoding format of the audio data
-            sample_rate: The sample rate of the audio in Hz
+            encoding: The encoding format of the audio data (required)
+            sample_rate: The sample rate of the audio in Hz (required)
+            min_volume: Volume threshold for voice activity detection (0.0-1.0)
+            max_silence_duration_secs: Maximum duration of silence before endpointing
 
         Returns:
             SttWebsocket: A connected WebSocket client for STT operations.
+
+        Example:
+            >>> client = Cartesia(api_key="your-api-key")
+            >>> ws = client.stt.websocket()
+            >>> for result in ws.transcribe(audio_chunks):
+            ...     print(result["text"])
         """
         client_headers = self._client_wrapper.get_headers()
         ws = SttWebsocket(
@@ -51,61 +66,19 @@ class SttClientWithWebsocket:
             language=language,
             encoding=encoding,
             sample_rate=sample_rate,
+            min_volume=min_volume,
+            max_silence_duration_secs=max_silence_duration_secs,
         )
         return ws
 
-    def transcribe(
-        self,
-        audio_chunks: typing.Iterator[bytes],
-        *,
-        model: str = "ink-whisper",
-        language: Optional[str] = "en",
-        encoding: Optional[str] = "pcm_s16le",
-        sample_rate: int = 16000,
-    ) -> Generator[Dict[str, Any], None, None]:
-        """Transcribe audio chunks using WebSocket.
 
-        Args:
-            audio_chunks: Iterator of audio chunks as bytes
-            model: ID of the model to use for transcription
-            language: The language of the input audio in ISO-639-1 format
-            encoding: The encoding format of the audio data
-            sample_rate: The sample rate of the audio in Hz
-
-        Yields:
-            Dictionary containing transcription results, flush_done, done, or error messages
-
-        Example:
-            >>> client = Cartesia(api_key="your-api-key")
-            >>> ws_client = client.stt.websocket()
-            >>> for result in ws_client.transcribe(audio_chunks):
-            ...     print(result["text"])
-        """
-        ws = self.websocket(
-            model=model,
-            language=language,
-            encoding=encoding,
-            sample_rate=sample_rate,
-        )
-        try:
-            yield from ws.transcribe(
-                audio_chunks,
-                model=model,
-                language=language,
-                encoding=encoding,
-                sample_rate=sample_rate,
-            )
-        finally:
-            ws.close()
-
-
-class AsyncSttClientWithWebsocket:
+class AsyncSttClientWithWebsocket(AsyncSttClient):
     """
     Extension of STT functionality that supports an asynchronous WebSocket STT connection.
     """
 
     def __init__(self, *, client_wrapper: AsyncClientWrapper, get_session):
-        self._client_wrapper = client_wrapper
+        super().__init__(client_wrapper=client_wrapper)
         self._get_session = get_session
 
     def _ws_url(self) -> str:
@@ -117,21 +90,34 @@ class AsyncSttClientWithWebsocket:
         base_url_without_protocol = base_url.split("://")[-1]
         return f"{prefix}://{base_url_without_protocol}"
 
-    async def websocket(
-
-
-
-
+    async def websocket(
+        self,
+        *,
+        model: str = "ink-whisper",
+        language: Optional[str] = "en",
+        encoding: SttEncoding = "pcm_s16le",
+        sample_rate: int = 16000,
+        min_volume: Optional[float] = None,
+        max_silence_duration_secs: Optional[float] = None,
+    ):
         """Create an async WebSocket connection for real-time speech transcription.
 
         Args:
             model: ID of the model to use for transcription
             language: The language of the input audio in ISO-639-1 format
-            encoding: The encoding format of the audio data
-            sample_rate: The sample rate of the audio in Hz
+            encoding: The encoding format of the audio data (required)
+            sample_rate: The sample rate of the audio in Hz (required)
+            min_volume: Volume threshold for voice activity detection (0.0-1.0)
+            max_silence_duration_secs: Maximum duration of silence before endpointing
 
         Returns:
             AsyncSttWebsocket: A connected async WebSocket client for STT operations.
+
+        Example:
+            >>> client = AsyncCartesia(api_key="your-api-key")
+            >>> ws = await client.stt.websocket()
+            >>> async for result in ws.transcribe(audio_chunks):
+            ...     print(result["text"])
         """
         client_headers = self._client_wrapper.get_headers()
         ws = AsyncSttWebsocket(
@@ -146,50 +132,7 @@ class AsyncSttClientWithWebsocket:
             language=language,
             encoding=encoding,
             sample_rate=sample_rate,
+            min_volume=min_volume,
+            max_silence_duration_secs=max_silence_duration_secs,
         )
-        return ws
-
-    async def transcribe(
-        self,
-        audio_chunks: typing.AsyncIterator[bytes],
-        *,
-        model: str = "ink-whisper",
-        language: Optional[str] = "en",
-        encoding: Optional[str] = "pcm_s16le",
-        sample_rate: int = 16000,
-    ) -> typing.AsyncGenerator[Dict[str, Any], None]:
-        """Transcribe audio chunks using async WebSocket.
-
-        Args:
-            audio_chunks: Async iterator of audio chunks as bytes
-            model: ID of the model to use for transcription
-            language: The language of the input audio in ISO-639-1 format
-            encoding: The encoding format of the audio data
-            sample_rate: The sample rate of the audio in Hz
-
-        Yields:
-            Dictionary containing transcription results, flush_done, done, or error messages
-
-        Example:
-            >>> client = AsyncCartesia(api_key="your-api-key")
-            >>> ws_client = await client.stt.websocket()
-            >>> async for result in ws_client.transcribe(audio_chunks):
-            ...     print(result["text"])
-        """
-        ws = await self.websocket(
-            model=model,
-            language=language,
-            encoding=encoding,
-            sample_rate=sample_rate,
-        )
-        try:
-            async for result in ws.transcribe(
-                audio_chunks,
-                model=model,
-                language=language,
-                encoding=encoding,
-                sample_rate=sample_rate,
-            ):
-                yield result
-        finally:
-            await ws.close()
+        return ws
```
cartesia/stt/types/__init__.py
CHANGED
```diff
@@ -11,8 +11,10 @@ from .streaming_transcription_response import (
     StreamingTranscriptionResponse_Transcript,
 )
 from .stt_encoding import SttEncoding
+from .timestamp_granularity import TimestampGranularity
 from .transcript_message import TranscriptMessage
 from .transcription_response import TranscriptionResponse
+from .transcription_word import TranscriptionWord
 
 __all__ = [
     "DoneMessage",
@@ -24,6 +26,8 @@ __all__ = [
     "StreamingTranscriptionResponse_FlushDone",
     "StreamingTranscriptionResponse_Transcript",
     "SttEncoding",
+    "TimestampGranularity",
     "TranscriptMessage",
     "TranscriptionResponse",
+    "TranscriptionWord",
 ]
```
cartesia/stt/types/streaming_transcription_response.py
CHANGED
```diff
@@ -3,6 +3,7 @@
 from __future__ import annotations
 from ...core.pydantic_utilities import UniversalBaseModel
 import typing
+from .transcription_word import TranscriptionWord
 from ...core.pydantic_utilities import IS_PYDANTIC_V2
 import pydantic
 
@@ -18,6 +19,7 @@ class StreamingTranscriptionResponse_Transcript(UniversalBaseModel):
     is_final: bool
     duration: typing.Optional[float] = None
     language: typing.Optional[str] = None
+    words: typing.Optional[typing.List[TranscriptionWord]] = None
 
     if IS_PYDANTIC_V2:
         model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True)  # type: ignore # Pydantic v2
```
cartesia/stt/types/transcript_message.py
CHANGED
```diff
@@ -3,6 +3,7 @@
 from ...core.pydantic_utilities import UniversalBaseModel
 import pydantic
 import typing
+from .transcription_word import TranscriptionWord
 from ...core.pydantic_utilities import IS_PYDANTIC_V2
 
 
@@ -31,7 +32,12 @@ class TranscriptMessage(UniversalBaseModel):
 
     language: typing.Optional[str] = pydantic.Field(default=None)
     """
-    The
+    The specified language of the input audio.
+    """
+
+    words: typing.Optional[typing.List[TranscriptionWord]] = pydantic.Field(default=None)
+    """
+    Word-level timestamps showing the start and end time of each word in seconds. Always included in streaming responses.
     """
 
     if IS_PYDANTIC_V2:
```
cartesia/stt/types/transcription_response.py
CHANGED
```diff
@@ -3,6 +3,7 @@
 from ...core.pydantic_utilities import UniversalBaseModel
 import pydantic
 import typing
+from .transcription_word import TranscriptionWord
 from ...core.pydantic_utilities import IS_PYDANTIC_V2
 
 
@@ -14,7 +15,7 @@ class TranscriptionResponse(UniversalBaseModel):
 
     language: typing.Optional[str] = pydantic.Field(default=None)
     """
-    The
+    The specified language of the input audio.
     """
 
     duration: typing.Optional[float] = pydantic.Field(default=None)
@@ -22,6 +23,11 @@ class TranscriptionResponse(UniversalBaseModel):
     The duration of the input audio in seconds.
     """
 
+    words: typing.Optional[typing.List[TranscriptionWord]] = pydantic.Field(default=None)
+    """
+    Word-level timestamps showing the start and end time of each word. Only included when `[word]` is passed into `timestamp_granularities[]`.
+    """
+
     if IS_PYDANTIC_V2:
         model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True)  # type: ignore # Pydantic v2
     else:
```
cartesia/stt/types/transcription_word.py
ADDED
```diff
@@ -0,0 +1,32 @@
+# This file was auto-generated by Fern from our API Definition.
+
+from ...core.pydantic_utilities import UniversalBaseModel
+import pydantic
+from ...core.pydantic_utilities import IS_PYDANTIC_V2
+import typing
+
+
+class TranscriptionWord(UniversalBaseModel):
+    word: str = pydantic.Field()
+    """
+    The transcribed word.
+    """
+
+    start: float = pydantic.Field()
+    """
+    Start time of the word in seconds.
+    """
+
+    end: float = pydantic.Field()
+    """
+    End time of the word in seconds.
+    """
+
+    if IS_PYDANTIC_V2:
+        model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True)  # type: ignore # Pydantic v2
+    else:
+
+        class Config:
+            frozen = True
+            smart_union = True
+            extra = pydantic.Extra.allow
```
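To make the new model concrete, here is a small sketch (not from the package) that builds and reads `TranscriptionWord` values the way a parsed response would carry them; the `text` field on `TranscriptionResponse` is assumed from the part of the file this diff does not show:

```python
from cartesia.stt.types import TranscriptionResponse, TranscriptionWord

# Batch responses carry `words` only when "word" is requested via
# timestamp_granularities[]; streaming transcripts always include them.
response = TranscriptionResponse(
    text="hello world",  # assumed field, defined outside the shown hunks
    language="en",
    duration=1.2,
    words=[
        TranscriptionWord(word="hello", start=0.0, end=0.48),
        TranscriptionWord(word="world", start=0.52, end=1.10),
    ],
)

# `words` is Optional, so guard against None before iterating.
for w in response.words or []:
    print(f"{w.word}: {w.start:.2f}s -> {w.end:.2f}s")
```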
cartesia/tts/__init__.py
CHANGED
```diff
@@ -19,11 +19,13 @@ from .types import (
     RawEncoding,
     RawOutputFormat,
     Speed,
+    SseOutputFormat,
     SupportedLanguage,
     TtsRequest,
     TtsRequestEmbeddingSpecifier,
     TtsRequestIdSpecifier,
     TtsRequestVoiceSpecifier,
+    TtssseRequest,
     WavOutputFormat,
     WebSocketBaseResponse,
     WebSocketChunkResponse,
@@ -58,10 +60,12 @@ from .requests import (
     PhonemeTimestampsParams,
     RawOutputFormatParams,
     SpeedParams,
+    SseOutputFormatParams,
     TtsRequestEmbeddingSpecifierParams,
     TtsRequestIdSpecifierParams,
     TtsRequestParams,
     TtsRequestVoiceSpecifierParams,
+    TtssseRequestParams,
     WavOutputFormatParams,
     WebSocketBaseResponseParams,
     WebSocketChunkResponseParams,
@@ -115,6 +119,8 @@ __all__ = [
     "RawOutputFormatParams",
     "Speed",
     "SpeedParams",
+    "SseOutputFormat",
+    "SseOutputFormatParams",
     "SupportedLanguage",
     "TtsRequest",
     "TtsRequestEmbeddingSpecifier",
@@ -124,6 +130,8 @@ __all__ = [
     "TtsRequestParams",
     "TtsRequestVoiceSpecifier",
     "TtsRequestVoiceSpecifierParams",
+    "TtssseRequest",
+    "TtssseRequestParams",
     "WavOutputFormat",
     "WavOutputFormatParams",
     "WebSocketBaseResponse",
```
cartesia/tts/client.py
CHANGED
```diff
@@ -10,6 +10,8 @@ from ..core.request_options import RequestOptions
 from ..core.serialization import convert_and_respect_annotation_metadata
 from json.decoder import JSONDecodeError
 from ..core.api_error import ApiError
+from .requests.sse_output_format import SseOutputFormatParams
+from .types.context_id import ContextId
 from .types.web_socket_response import WebSocketResponse
 import httpx_sse
 from ..core.pydantic_utilities import parse_obj_as
@@ -119,10 +121,14 @@ class TtsClient:
         model_id: str,
         transcript: str,
         voice: TtsRequestVoiceSpecifierParams,
-        output_format:
+        output_format: SseOutputFormatParams,
         language: typing.Optional[SupportedLanguage] = OMIT,
         duration: typing.Optional[float] = OMIT,
         speed: typing.Optional[ModelSpeed] = OMIT,
+        add_timestamps: typing.Optional[bool] = OMIT,
+        add_phoneme_timestamps: typing.Optional[bool] = OMIT,
+        use_normalized_timestamps: typing.Optional[bool] = OMIT,
+        context_id: typing.Optional[ContextId] = OMIT,
         request_options: typing.Optional[RequestOptions] = None,
     ) -> typing.Iterator[WebSocketResponse]:
         """
@@ -135,7 +141,7 @@ class TtsClient:
 
         voice : TtsRequestVoiceSpecifierParams
 
-        output_format :
+        output_format : SseOutputFormatParams
 
         language : typing.Optional[SupportedLanguage]
 
@@ -145,6 +151,18 @@ class TtsClient:
 
         speed : typing.Optional[ModelSpeed]
 
+        add_timestamps : typing.Optional[bool]
+            Whether to return word-level timestamps. If `false` (default), no word timestamps will be produced at all. If `true`, the server will return timestamp events containing word-level timing information.
+
+        add_phoneme_timestamps : typing.Optional[bool]
+            Whether to return phoneme-level timestamps. If `false` (default), no phoneme timestamps will be produced - if `add_timestamps` is `true`, the produced timestamps will be word timestamps instead. If `true`, the server will return timestamp events containing phoneme-level timing information.
+
+        use_normalized_timestamps : typing.Optional[bool]
+            Whether to use normalized timestamps (True) or original timestamps (False).
+
+        context_id : typing.Optional[ContextId]
+            Optional context ID for this request.
+
         request_options : typing.Optional[RequestOptions]
             Request-specific configuration.
 
@@ -165,9 +183,9 @@ class TtsClient:
             voice={"mode": "id", "id": "694f9389-aac1-45b6-b726-9d9369183238"},
             language="en",
             output_format={
+                "container": "raw",
                 "sample_rate": 44100,
                 "encoding": "pcm_f32le",
-                "container": "raw",
             },
         )
         for chunk in response:
@@ -184,10 +202,14 @@ class TtsClient:
             ),
             "language": language,
             "output_format": convert_and_respect_annotation_metadata(
-                object_=output_format, annotation=
+                object_=output_format, annotation=SseOutputFormatParams, direction="write"
             ),
             "duration": duration,
             "speed": speed,
+            "add_timestamps": add_timestamps,
+            "add_phoneme_timestamps": add_phoneme_timestamps,
+            "use_normalized_timestamps": use_normalized_timestamps,
+            "context_id": context_id,
         },
         request_options=request_options,
         omit=OMIT,
@@ -321,10 +343,14 @@ class AsyncTtsClient:
         model_id: str,
         transcript: str,
         voice: TtsRequestVoiceSpecifierParams,
-        output_format:
+        output_format: SseOutputFormatParams,
         language: typing.Optional[SupportedLanguage] = OMIT,
         duration: typing.Optional[float] = OMIT,
         speed: typing.Optional[ModelSpeed] = OMIT,
+        add_timestamps: typing.Optional[bool] = OMIT,
+        add_phoneme_timestamps: typing.Optional[bool] = OMIT,
+        use_normalized_timestamps: typing.Optional[bool] = OMIT,
+        context_id: typing.Optional[ContextId] = OMIT,
         request_options: typing.Optional[RequestOptions] = None,
     ) -> typing.AsyncIterator[WebSocketResponse]:
         """
@@ -337,7 +363,7 @@ class AsyncTtsClient:
 
         voice : TtsRequestVoiceSpecifierParams
 
-        output_format :
+        output_format : SseOutputFormatParams
 
         language : typing.Optional[SupportedLanguage]
 
@@ -347,6 +373,18 @@ class AsyncTtsClient:
 
         speed : typing.Optional[ModelSpeed]
 
+        add_timestamps : typing.Optional[bool]
+            Whether to return word-level timestamps. If `false` (default), no word timestamps will be produced at all. If `true`, the server will return timestamp events containing word-level timing information.
+
+        add_phoneme_timestamps : typing.Optional[bool]
+            Whether to return phoneme-level timestamps. If `false` (default), no phoneme timestamps will be produced - if `add_timestamps` is `true`, the produced timestamps will be word timestamps instead. If `true`, the server will return timestamp events containing phoneme-level timing information.
+
+        use_normalized_timestamps : typing.Optional[bool]
+            Whether to use normalized timestamps (True) or original timestamps (False).
+
+        context_id : typing.Optional[ContextId]
+            Optional context ID for this request.
+
         request_options : typing.Optional[RequestOptions]
             Request-specific configuration.
 
@@ -372,9 +410,9 @@ class AsyncTtsClient:
             voice={"mode": "id", "id": "694f9389-aac1-45b6-b726-9d9369183238"},
             language="en",
             output_format={
+                "container": "raw",
                 "sample_rate": 44100,
                 "encoding": "pcm_f32le",
-                "container": "raw",
             },
         )
         async for chunk in response:
@@ -394,10 +432,14 @@ class AsyncTtsClient:
             ),
             "language": language,
             "output_format": convert_and_respect_annotation_metadata(
-                object_=output_format, annotation=
+                object_=output_format, annotation=SseOutputFormatParams, direction="write"
             ),
             "duration": duration,
             "speed": speed,
+            "add_timestamps": add_timestamps,
+            "add_phoneme_timestamps": add_phoneme_timestamps,
+            "use_normalized_timestamps": use_normalized_timestamps,
+            "context_id": context_id,
         },
         request_options=request_options,
         omit=OMIT,
```
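Putting the new parameters together, a 2.0.6 SSE call could look like the sketch below. The voice ID is the placeholder from the docstring example above; the model ID, context ID, and the way events are inspected are assumptions rather than documented behavior:

```python
from cartesia import Cartesia

client = Cartesia(api_key="your-api-key")

response = client.tts.sse(
    model_id="sonic-2",  # assumed model ID
    transcript="Hello, world!",
    voice={"mode": "id", "id": "694f9389-aac1-45b6-b726-9d9369183238"},
    language="en",
    output_format={
        "container": "raw",  # SseOutputFormatParams pins the container to "raw"
        "sample_rate": 44100,
        "encoding": "pcm_f32le",
    },
    add_timestamps=True,             # new in 2.0.6: emit word timestamp events
    use_normalized_timestamps=True,  # new in 2.0.6
    context_id="my-context-123",     # new in 2.0.6: assumed context ID value
)

for chunk in response:
    # Each item is a WebSocketResponse union member: audio chunks plus,
    # with add_timestamps=True, timestamp events and a final done event.
    print(chunk)
```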
cartesia/tts/requests/__init__.py
CHANGED
```diff
@@ -8,10 +8,12 @@ from .output_format import OutputFormatParams, OutputFormat_Mp3Params, OutputFor
 from .phoneme_timestamps import PhonemeTimestampsParams
 from .raw_output_format import RawOutputFormatParams
 from .speed import SpeedParams
+from .sse_output_format import SseOutputFormatParams
 from .tts_request import TtsRequestParams
 from .tts_request_embedding_specifier import TtsRequestEmbeddingSpecifierParams
 from .tts_request_id_specifier import TtsRequestIdSpecifierParams
 from .tts_request_voice_specifier import TtsRequestVoiceSpecifierParams
+from .ttssse_request import TtssseRequestParams
 from .wav_output_format import WavOutputFormatParams
 from .web_socket_base_response import WebSocketBaseResponseParams
 from .web_socket_chunk_response import WebSocketChunkResponseParams
@@ -48,10 +50,12 @@ __all__ = [
     "PhonemeTimestampsParams",
     "RawOutputFormatParams",
     "SpeedParams",
+    "SseOutputFormatParams",
     "TtsRequestEmbeddingSpecifierParams",
     "TtsRequestIdSpecifierParams",
     "TtsRequestParams",
     "TtsRequestVoiceSpecifierParams",
+    "TtssseRequestParams",
     "WavOutputFormatParams",
     "WebSocketBaseResponseParams",
     "WebSocketChunkResponseParams",
```
cartesia/tts/requests/generation_request.py
CHANGED
```diff
@@ -55,15 +55,15 @@ class GenerationRequestParams(typing_extensions.TypedDict):
 
     add_timestamps: typing_extensions.NotRequired[bool]
     """
-    Whether to return word-level timestamps.
+    Whether to return word-level timestamps. If `false` (default), no word timestamps will be produced at all. If `true`, the server will return timestamp events containing word-level timing information.
     """
 
     add_phoneme_timestamps: typing_extensions.NotRequired[bool]
     """
-    Whether to return phoneme-level timestamps.
+    Whether to return phoneme-level timestamps. If `false` (default), no phoneme timestamps will be produced. If `true`, the server will return timestamp events containing phoneme-level timing information.
     """
 
-
+    use_normalized_timestamps: typing_extensions.NotRequired[bool]
     """
-    Whether to use
+    Whether to use normalized timestamps (True) or original timestamps (False).
     """
```
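Because `GenerationRequestParams` is a TypedDict, the clarified flags are plain dictionary keys at runtime. A sketch of a WebSocket generation payload using them; the keys other than the three flags are assumed from the part of the file this hunk does not show:

```python
from cartesia.tts.requests.generation_request import GenerationRequestParams

request: GenerationRequestParams = {
    "model_id": "sonic-2",  # assumed model ID
    "transcript": "Hello from Cartesia.",
    "voice": {"mode": "id", "id": "694f9389-aac1-45b6-b726-9d9369183238"},
    "output_format": {
        "container": "raw",
        "sample_rate": 44100,
        "encoding": "pcm_f32le",
    },
    # Word-level timestamp events; off by default.
    "add_timestamps": True,
    # Leave False to keep word rather than phoneme granularity.
    "add_phoneme_timestamps": False,
    # Normalized (True) vs. original (False) timestamps.
    "use_normalized_timestamps": True,
}
```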
cartesia/tts/requests/sse_output_format.py
ADDED
```diff
@@ -0,0 +1,11 @@
+# This file was auto-generated by Fern from our API Definition.
+
+import typing_extensions
+import typing
+from ..types.raw_encoding import RawEncoding
+
+
+class SseOutputFormatParams(typing_extensions.TypedDict):
+    container: typing.Literal["raw"]
+    encoding: RawEncoding
+    sample_rate: int
```
cartesia/tts/requests/ttssse_request.py
ADDED
```diff
@@ -0,0 +1,47 @@
+# This file was auto-generated by Fern from our API Definition.
+
+import typing_extensions
+from .tts_request_voice_specifier import TtsRequestVoiceSpecifierParams
+import typing_extensions
+from ..types.supported_language import SupportedLanguage
+from .sse_output_format import SseOutputFormatParams
+from ..types.model_speed import ModelSpeed
+from ..types.context_id import ContextId
+
+
+class TtssseRequestParams(typing_extensions.TypedDict):
+    model_id: str
+    """
+    The ID of the model to use for the generation. See [Models](/build-with-cartesia/models) for available models.
+    """
+
+    transcript: str
+    voice: TtsRequestVoiceSpecifierParams
+    language: typing_extensions.NotRequired[SupportedLanguage]
+    output_format: SseOutputFormatParams
+    duration: typing_extensions.NotRequired[float]
+    """
+    The maximum duration of the audio in seconds. You do not usually need to specify this.
+    If the duration is not appropriate for the length of the transcript, the output audio may be truncated.
+    """
+
+    speed: typing_extensions.NotRequired[ModelSpeed]
+    add_timestamps: typing_extensions.NotRequired[bool]
+    """
+    Whether to return word-level timestamps. If `false` (default), no word timestamps will be produced at all. If `true`, the server will return timestamp events containing word-level timing information.
+    """
+
+    add_phoneme_timestamps: typing_extensions.NotRequired[bool]
+    """
+    Whether to return phoneme-level timestamps. If `false` (default), no phoneme timestamps will be produced - if `add_timestamps` is `true`, the produced timestamps will be word timestamps instead. If `true`, the server will return timestamp events containing phoneme-level timing information.
+    """
+
+    use_normalized_timestamps: typing_extensions.NotRequired[bool]
+    """
+    Whether to use normalized timestamps (True) or original timestamps (False).
+    """
+
+    context_id: typing_extensions.NotRequired[ContextId]
+    """
+    Optional context ID for this request.
+    """
```
cartesia/tts/requests/web_socket_chunk_response.py
CHANGED
```diff
@@ -1,11 +1,8 @@
 # This file was auto-generated by Fern from our API Definition.
 
 from .web_socket_base_response import WebSocketBaseResponseParams
-import typing_extensions
-from ..types.flush_id import FlushId
 
 
 class WebSocketChunkResponseParams(WebSocketBaseResponseParams):
     data: str
     step_time: float
-    flush_id: typing_extensions.NotRequired[FlushId]
```