PyPI - cartesia - Versions diffs - 2.0.5__py3-none-any.whl → 2.0.7__py3-none-any.whl - Mend

cartesia 2.0.5__py3-none-any.whl → 2.0.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (55) hide show

cartesia/__init__.py +22 -0
cartesia/auth/client.py +8 -8
cartesia/auth/requests/token_grant.py +7 -1
cartesia/auth/requests/token_request.py +3 -3
cartesia/auth/types/token_grant.py +7 -2
cartesia/auth/types/token_request.py +3 -3
cartesia/core/client_wrapper.py +1 -1
cartesia/infill/client.py +0 -8
cartesia/stt/__init__.py +6 -0
cartesia/stt/_async_websocket.py +81 -72
cartesia/stt/_websocket.py +42 -20
cartesia/stt/client.py +450 -0
cartesia/stt/requests/__init__.py +2 -0
cartesia/stt/requests/streaming_transcription_response.py +2 -0
cartesia/stt/requests/transcript_message.py +8 -1
cartesia/stt/requests/transcription_response.py +8 -1
cartesia/stt/requests/transcription_word.py +20 -0
cartesia/stt/socket_client.py +52 -109
cartesia/stt/types/__init__.py +4 -0
cartesia/stt/types/streaming_transcription_response.py +2 -0
cartesia/stt/types/stt_encoding.py +3 -1
cartesia/stt/types/timestamp_granularity.py +5 -0
cartesia/stt/types/transcript_message.py +7 -1
cartesia/stt/types/transcription_response.py +7 -1
cartesia/stt/types/transcription_word.py +32 -0
cartesia/tts/__init__.py +16 -0
cartesia/tts/client.py +63 -8
cartesia/tts/requests/__init__.py +8 -0
cartesia/tts/requests/experimental_model_controls.py +17 -0
cartesia/tts/requests/generation_config.py +23 -0
cartesia/tts/requests/generation_request.py +4 -4
cartesia/tts/requests/sse_output_format.py +11 -0
cartesia/tts/requests/tts_request.py +2 -0
cartesia/tts/requests/ttssse_request.py +47 -0
cartesia/tts/requests/web_socket_chunk_response.py +0 -3
cartesia/tts/requests/web_socket_response.py +1 -2
cartesia/tts/requests/web_socket_tts_request.py +9 -1
cartesia/tts/types/__init__.py +8 -0
cartesia/tts/types/experimental_model_controls.py +28 -0
cartesia/tts/types/generation_config.py +34 -0
cartesia/tts/types/generation_request.py +4 -4
cartesia/tts/types/sse_output_format.py +22 -0
cartesia/tts/types/tts_request.py +2 -0
cartesia/tts/types/ttssse_request.py +58 -0
cartesia/tts/types/web_socket_chunk_response.py +1 -3
cartesia/tts/types/web_socket_response.py +1 -2
cartesia/tts/types/web_socket_tts_request.py +11 -3
cartesia/voice_changer/client.py +0 -8
cartesia/voice_changer/requests/streaming_response.py +0 -2
cartesia/voice_changer/types/streaming_response.py +0 -2
cartesia/voices/client.py +0 -12
cartesia-2.0.7.dist-info/LICENSE +201 -0
{cartesia-2.0.5.dist-info → cartesia-2.0.7.dist-info}/METADATA +116 -17
{cartesia-2.0.5.dist-info → cartesia-2.0.7.dist-info}/RECORD +55 -42
{cartesia-2.0.5.dist-info → cartesia-2.0.7.dist-info}/WHEEL +1 -1

cartesia/stt/_websocket.py CHANGED Viewed

@@ -14,6 +14,7 @@ from cartesia.stt.types import (
     StreamingTranscriptionResponse_Error,
     StreamingTranscriptionResponse_Transcript,
 )
+from cartesia.stt.types.stt_encoding import SttEncoding
 from ..core.pydantic_utilities import parse_obj_as
@@ -45,8 +46,10 @@ class SttWebsocket:
         # Store default connection parameters for auto-connect with proper typing
         self._default_model: str = "ink-whisper"
         self._default_language: Optional[str] = "en"
-        self._default_encoding: Optional[str] = "pcm_s16le"
+        self._default_encoding: SttEncoding = "pcm_s16le"
         self._default_sample_rate: int = 16000
+        self._default_min_volume: Optional[float] = None
+        self._default_max_silence_duration_secs: Optional[float] = None
     def __del__(self):
         try:
@@ -59,16 +62,20 @@ class SttWebsocket:
         *,
         model: str = "ink-whisper",
         language: Optional[str] = "en",
-        encoding: Optional[str] = "pcm_s16le",
+        encoding: SttEncoding = "pcm_s16le",
         sample_rate: int = 16000,
+        min_volume: Optional[float] = None,
+        max_silence_duration_secs: Optional[float] = None,
     ):
         """Connect to the STT WebSocket with the specified parameters.
         Args:
             model: ID of the model to use for transcription
             language: The language of the input audio in ISO-639-1 format
-            encoding: The encoding format of the audio data
-            sample_rate: The sample rate of the audio in Hz
+            encoding: The encoding format of the audio data (required)
+            sample_rate: The sample rate of the audio in Hz (required)
+            min_volume: Volume threshold for voice activity detection (0.0-1.0)
+            max_silence_duration_secs: Maximum duration of silence before endpointing
         Raises:
             RuntimeError: If the connection to the WebSocket fails.
@@ -78,6 +85,8 @@ class SttWebsocket:
         self._default_language = language
         self._default_encoding = encoding
         self._default_sample_rate = sample_rate
+        self._default_min_volume = min_volume
+        self._default_max_silence_duration_secs = max_silence_duration_secs
         if not IS_WEBSOCKET_SYNC_AVAILABLE:
             raise ImportError(
@@ -89,13 +98,15 @@ class SttWebsocket:
                 "model": model,
                 "api_key": self.api_key,
                 "cartesia_version": self.cartesia_version,
+                "encoding": encoding,
+                "sample_rate": str(sample_rate),
             }
             if language is not None:
                 params["language"] = language
-            if encoding is not None:
-                params["encoding"] = encoding
-            if sample_rate is not None:
-                params["sample_rate"] = str(sample_rate)
+            if min_volume is not None:
+                params["min_volume"] = str(min_volume)
+            if max_silence_duration_secs is not None:
+                params["max_silence_duration_secs"] = str(max_silence_duration_secs)
             query_string = "&".join([f"{k}={v}" for k, v in params.items()])
             url = f"{self.ws_url}/{route}?{query_string}"
@@ -143,6 +154,8 @@ class SttWebsocket:
                 language=self._default_language,
                 encoding=self._default_encoding,
                 sample_rate=self._default_sample_rate,
+                min_volume=self._default_min_volume,
+                max_silence_duration_secs=self._default_max_silence_duration_secs,
             )
         assert self.websocket is not None, "WebSocket should be connected after connect() call"
@@ -167,6 +180,8 @@ class SttWebsocket:
                 language=self._default_language,
                 encoding=self._default_encoding,
                 sample_rate=self._default_sample_rate,
+                min_volume=self._default_min_volume,
+                max_silence_duration_secs=self._default_max_silence_duration_secs,
             )
         assert self.websocket is not None, "WebSocket should be connected after connect() call"
@@ -197,6 +212,8 @@ class SttWebsocket:
                                 result["duration"] = raw_data["duration"]
                             if "language" in raw_data:
                                 result["language"] = raw_data["language"]
+                            if "words" in raw_data:
+                                result["words"] = raw_data["words"]
                             yield result
@@ -208,23 +225,22 @@ class SttWebsocket:
                             }
                             yield result
-                        # Handle done acknowledgment - session complete
+                        # Handle done acknowledgment
                         elif raw_data.get("type") == "done":
                             result = {
                                 "type": raw_data["type"],
                                 "request_id": raw_data.get("request_id", ""),
                             }
                             yield result
-                            # Session is complete, break out of loop
-                            break
-                except Exception as inner_e:
-                    self.close()
-                    raise RuntimeError(f"Error receiving transcription: {inner_e}")
+                            break  # Exit the loop when done
-        except Exception as e:
+                except Exception as e:
+                    if "Connection closed" in str(e) or "no active connection" in str(e):
+                        break  # WebSocket was closed
+                    raise e  # Re-raise other exceptions
+        except KeyboardInterrupt:
             self.close()
-            raise RuntimeError(f"Failed to receive transcription. {e}")
+            raise
     def transcribe(
         self,
@@ -232,8 +248,10 @@ class SttWebsocket:
         *,
         model: str = "ink-whisper",
         language: Optional[str] = "en",
-        encoding: Optional[str] = "pcm_s16le",
+        encoding: SttEncoding = "pcm_s16le",
         sample_rate: int = 16000,
+        min_volume: Optional[float] = None,
+        max_silence_duration_secs: Optional[float] = None,
     ) -> Generator[Dict[str, Any], None, None]:
         """Transcribe audio chunks using the WebSocket.
@@ -241,8 +259,10 @@ class SttWebsocket:
             audio_chunks: Iterator of audio chunks as bytes
             model: ID of the model to use for transcription
             language: The language of the input audio in ISO-639-1 format
-            encoding: The encoding format of the audio data
-            sample_rate: The sample rate of the audio in Hz
+            encoding: The encoding format of the audio data (required)
+            sample_rate: The sample rate of the audio in Hz (required)
+            min_volume: Volume threshold for voice activity detection (0.0-1.0)
+            max_silence_duration_secs: Maximum duration of silence before endpointing
         Yields:
             Dictionary containing transcription results, flush_done, done, or error messages
@@ -252,6 +272,8 @@ class SttWebsocket:
             language=language,
             encoding=encoding,
             sample_rate=sample_rate,
+            min_volume=min_volume,
+            max_silence_duration_secs=max_silence_duration_secs,
         )
         try:

cartesia/stt/client.py ADDED Viewed

@@ -0,0 +1,450 @@
+# This file was auto-generated by Fern from our API Definition.
+import typing
+from ..core.client_wrapper import SyncClientWrapper
+from .. import core
+from .types.stt_encoding import SttEncoding
+from .types.timestamp_granularity import TimestampGranularity
+from ..core.request_options import RequestOptions
+from .types.transcription_response import TranscriptionResponse
+from ..core.pydantic_utilities import parse_obj_as
+from json.decoder import JSONDecodeError
+from ..core.api_error import ApiError
+from ..core.client_wrapper import AsyncClientWrapper
+# this is used as the default value for optional parameters
+OMIT = typing.cast(typing.Any, ...)
+class SttClient:
+    def __init__(self, *, client_wrapper: SyncClientWrapper):
+        self._client_wrapper = client_wrapper
+    def transcribe(
+        self,
+        *,
+        file: core.File,
+        model: str,
+        encoding: typing.Optional[SttEncoding] = None,
+        sample_rate: typing.Optional[int] = None,
+        language: typing.Optional[str] = OMIT,
+        timestamp_granularities: typing.Optional[typing.List[TimestampGranularity]] = OMIT,
+        request_options: typing.Optional[RequestOptions] = None,
+    ) -> TranscriptionResponse:
+        """
+        Transcribes audio files into text using Cartesia's Speech-to-Text API.
+        Upload an audio file and receive a complete transcription response. Supports arbitrarily long audio files with automatic intelligent chunking for longer audio.
+        **Supported audio formats:** flac, m4a, mp3, mp4, mpeg, mpga, oga, ogg, wav, webm
+        **Response format:** Returns JSON with transcribed text, duration, and language. Include `timestamp_granularities: ["word"]` to get word-level timestamps.
+        **Pricing:** Batch transcription is priced at **1 credit per 2 seconds** of audio processed.
+        <Note>
+        For migrating from the OpenAI SDK, see our [OpenAI Whisper to Cartesia Ink Migration Guide](/api-reference/stt/migrate-from-open-ai).
+        </Note>
+        Parameters
+        ----------
+        file : core.File
+            See core.File for more documentation
+        model : str
+            ID of the model to use for transcription. Use `ink-whisper` for the latest Cartesia Whisper model.
+        encoding : typing.Optional[SttEncoding]
+            The encoding format to process the audio as. If not specified, the audio file will be decoded automatically.
+            **Supported formats:**
+            - `pcm_s16le` - 16-bit signed integer PCM, little-endian (recommended for best performance)
+            - `pcm_s32le` - 32-bit signed integer PCM, little-endian
+            - `pcm_f16le` - 16-bit floating point PCM, little-endian
+            - `pcm_f32le` - 32-bit floating point PCM, little-endian
+            - `pcm_mulaw` - 8-bit μ-law encoded PCM
+            - `pcm_alaw` - 8-bit A-law encoded PCM
+        sample_rate : typing.Optional[int]
+            The sample rate of the audio in Hz.
+        language : typing.Optional[str]
+            The language of the input audio in ISO-639-1 format. Defaults to `en`.
+            <Accordion title="Supported languages">
+              - `en` (English)
+              - `zh` (Chinese)
+              - `de` (German)
+              - `es` (Spanish)
+              - `ru` (Russian)
+              - `ko` (Korean)
+              - `fr` (French)
+              - `ja` (Japanese)
+              - `pt` (Portuguese)
+              - `tr` (Turkish)
+              - `pl` (Polish)
+              - `ca` (Catalan)
+              - `nl` (Dutch)
+              - `ar` (Arabic)
+              - `sv` (Swedish)
+              - `it` (Italian)
+              - `id` (Indonesian)
+              - `hi` (Hindi)
+              - `fi` (Finnish)
+              - `vi` (Vietnamese)
+              - `he` (Hebrew)
+              - `uk` (Ukrainian)
+              - `el` (Greek)
+              - `ms` (Malay)
+              - `cs` (Czech)
+              - `ro` (Romanian)
+              - `da` (Danish)
+              - `hu` (Hungarian)
+              - `ta` (Tamil)
+              - `no` (Norwegian)
+              - `th` (Thai)
+              - `ur` (Urdu)
+              - `hr` (Croatian)
+              - `bg` (Bulgarian)
+              - `lt` (Lithuanian)
+              - `la` (Latin)
+              - `mi` (Maori)
+              - `ml` (Malayalam)
+              - `cy` (Welsh)
+              - `sk` (Slovak)
+              - `te` (Telugu)
+              - `fa` (Persian)
+              - `lv` (Latvian)
+              - `bn` (Bengali)
+              - `sr` (Serbian)
+              - `az` (Azerbaijani)
+              - `sl` (Slovenian)
+              - `kn` (Kannada)
+              - `et` (Estonian)
+              - `mk` (Macedonian)
+              - `br` (Breton)
+              - `eu` (Basque)
+              - `is` (Icelandic)
+              - `hy` (Armenian)
+              - `ne` (Nepali)
+              - `mn` (Mongolian)
+              - `bs` (Bosnian)
+              - `kk` (Kazakh)
+              - `sq` (Albanian)
+              - `sw` (Swahili)
+              - `gl` (Galician)
+              - `mr` (Marathi)
+              - `pa` (Punjabi)
+              - `si` (Sinhala)
+              - `km` (Khmer)
+              - `sn` (Shona)
+              - `yo` (Yoruba)
+              - `so` (Somali)
+              - `af` (Afrikaans)
+              - `oc` (Occitan)
+              - `ka` (Georgian)
+              - `be` (Belarusian)
+              - `tg` (Tajik)
+              - `sd` (Sindhi)
+              - `gu` (Gujarati)
+              - `am` (Amharic)
+              - `yi` (Yiddish)
+              - `lo` (Lao)
+              - `uz` (Uzbek)
+              - `fo` (Faroese)
+              - `ht` (Haitian Creole)
+              - `ps` (Pashto)
+              - `tk` (Turkmen)
+              - `nn` (Nynorsk)
+              - `mt` (Maltese)
+              - `sa` (Sanskrit)
+              - `lb` (Luxembourgish)
+              - `my` (Myanmar)
+              - `bo` (Tibetan)
+              - `tl` (Tagalog)
+              - `mg` (Malagasy)
+              - `as` (Assamese)
+              - `tt` (Tatar)
+              - `haw` (Hawaiian)
+              - `ln` (Lingala)
+              - `ha` (Hausa)
+              - `ba` (Bashkir)
+              - `jw` (Javanese)
+              - `su` (Sundanese)
+              - `yue` (Cantonese)
+            </Accordion>
+        timestamp_granularities : typing.Optional[typing.List[TimestampGranularity]]
+            The timestamp granularities to populate for this transcription. Currently only `word` level timestamps are supported.
+        request_options : typing.Optional[RequestOptions]
+            Request-specific configuration.
+        Returns
+        -------
+        TranscriptionResponse
+        Examples
+        --------
+        from cartesia import Cartesia
+        client = Cartesia(
+            api_key="YOUR_API_KEY",
+        )
+        client.stt.transcribe(
+            model="ink-whisper",
+            language="en",
+        )
+        """
+        _response = self._client_wrapper.httpx_client.request(
+            "stt",
+            method="POST",
+            params={
+                "encoding": encoding,
+                "sample_rate": sample_rate,
+            },
+            data={
+                "model": model,
+                "language": language,
+                "timestamp_granularities[]": timestamp_granularities,
+            },
+            files={
+                "file": file,
+            },
+            request_options=request_options,
+            omit=OMIT,
+        )
+        try:
+            if 200 <= _response.status_code < 300:
+                return typing.cast(
+                    TranscriptionResponse,
+                    parse_obj_as(
+                        type_=TranscriptionResponse,  # type: ignore
+                        object_=_response.json(),
+                    ),
+                )
+            _response_json = _response.json()
+        except JSONDecodeError:
+            raise ApiError(status_code=_response.status_code, body=_response.text)
+        raise ApiError(status_code=_response.status_code, body=_response_json)
+class AsyncSttClient:
+    def __init__(self, *, client_wrapper: AsyncClientWrapper):
+        self._client_wrapper = client_wrapper
+    async def transcribe(
+        self,
+        *,
+        file: core.File,
+        model: str,
+        encoding: typing.Optional[SttEncoding] = None,
+        sample_rate: typing.Optional[int] = None,
+        language: typing.Optional[str] = OMIT,
+        timestamp_granularities: typing.Optional[typing.List[TimestampGranularity]] = OMIT,
+        request_options: typing.Optional[RequestOptions] = None,
+    ) -> TranscriptionResponse:
+        """
+        Transcribes audio files into text using Cartesia's Speech-to-Text API.
+        Upload an audio file and receive a complete transcription response. Supports arbitrarily long audio files with automatic intelligent chunking for longer audio.
+        **Supported audio formats:** flac, m4a, mp3, mp4, mpeg, mpga, oga, ogg, wav, webm
+        **Response format:** Returns JSON with transcribed text, duration, and language. Include `timestamp_granularities: ["word"]` to get word-level timestamps.
+        **Pricing:** Batch transcription is priced at **1 credit per 2 seconds** of audio processed.
+        <Note>
+        For migrating from the OpenAI SDK, see our [OpenAI Whisper to Cartesia Ink Migration Guide](/api-reference/stt/migrate-from-open-ai).
+        </Note>
+        Parameters
+        ----------
+        file : core.File
+            See core.File for more documentation
+        model : str
+            ID of the model to use for transcription. Use `ink-whisper` for the latest Cartesia Whisper model.
+        encoding : typing.Optional[SttEncoding]
+            The encoding format to process the audio as. If not specified, the audio file will be decoded automatically.
+            **Supported formats:**
+            - `pcm_s16le` - 16-bit signed integer PCM, little-endian (recommended for best performance)
+            - `pcm_s32le` - 32-bit signed integer PCM, little-endian
+            - `pcm_f16le` - 16-bit floating point PCM, little-endian
+            - `pcm_f32le` - 32-bit floating point PCM, little-endian
+            - `pcm_mulaw` - 8-bit μ-law encoded PCM
+            - `pcm_alaw` - 8-bit A-law encoded PCM
+        sample_rate : typing.Optional[int]
+            The sample rate of the audio in Hz.
+        language : typing.Optional[str]
+            The language of the input audio in ISO-639-1 format. Defaults to `en`.
+            <Accordion title="Supported languages">
+              - `en` (English)
+              - `zh` (Chinese)
+              - `de` (German)
+              - `es` (Spanish)
+              - `ru` (Russian)
+              - `ko` (Korean)
+              - `fr` (French)
+              - `ja` (Japanese)
+              - `pt` (Portuguese)
+              - `tr` (Turkish)
+              - `pl` (Polish)
+              - `ca` (Catalan)
+              - `nl` (Dutch)
+              - `ar` (Arabic)
+              - `sv` (Swedish)
+              - `it` (Italian)
+              - `id` (Indonesian)
+              - `hi` (Hindi)
+              - `fi` (Finnish)
+              - `vi` (Vietnamese)
+              - `he` (Hebrew)
+              - `uk` (Ukrainian)
+              - `el` (Greek)
+              - `ms` (Malay)
+              - `cs` (Czech)
+              - `ro` (Romanian)
+              - `da` (Danish)
+              - `hu` (Hungarian)
+              - `ta` (Tamil)
+              - `no` (Norwegian)
+              - `th` (Thai)
+              - `ur` (Urdu)
+              - `hr` (Croatian)
+              - `bg` (Bulgarian)
+              - `lt` (Lithuanian)
+              - `la` (Latin)
+              - `mi` (Maori)
+              - `ml` (Malayalam)
+              - `cy` (Welsh)
+              - `sk` (Slovak)
+              - `te` (Telugu)
+              - `fa` (Persian)
+              - `lv` (Latvian)
+              - `bn` (Bengali)
+              - `sr` (Serbian)
+              - `az` (Azerbaijani)
+              - `sl` (Slovenian)
+              - `kn` (Kannada)
+              - `et` (Estonian)
+              - `mk` (Macedonian)
+              - `br` (Breton)
+              - `eu` (Basque)
+              - `is` (Icelandic)
+              - `hy` (Armenian)
+              - `ne` (Nepali)
+              - `mn` (Mongolian)
+              - `bs` (Bosnian)
+              - `kk` (Kazakh)
+              - `sq` (Albanian)
+              - `sw` (Swahili)
+              - `gl` (Galician)
+              - `mr` (Marathi)
+              - `pa` (Punjabi)
+              - `si` (Sinhala)
+              - `km` (Khmer)
+              - `sn` (Shona)
+              - `yo` (Yoruba)
+              - `so` (Somali)
+              - `af` (Afrikaans)
+              - `oc` (Occitan)
+              - `ka` (Georgian)
+              - `be` (Belarusian)
+              - `tg` (Tajik)
+              - `sd` (Sindhi)
+              - `gu` (Gujarati)
+              - `am` (Amharic)
+              - `yi` (Yiddish)
+              - `lo` (Lao)
+              - `uz` (Uzbek)
+              - `fo` (Faroese)
+              - `ht` (Haitian Creole)
+              - `ps` (Pashto)
+              - `tk` (Turkmen)
+              - `nn` (Nynorsk)
+              - `mt` (Maltese)
+              - `sa` (Sanskrit)
+              - `lb` (Luxembourgish)
+              - `my` (Myanmar)
+              - `bo` (Tibetan)
+              - `tl` (Tagalog)
+              - `mg` (Malagasy)
+              - `as` (Assamese)
+              - `tt` (Tatar)
+              - `haw` (Hawaiian)
+              - `ln` (Lingala)
+              - `ha` (Hausa)
+              - `ba` (Bashkir)
+              - `jw` (Javanese)
+              - `su` (Sundanese)
+              - `yue` (Cantonese)
+            </Accordion>
+        timestamp_granularities : typing.Optional[typing.List[TimestampGranularity]]
+            The timestamp granularities to populate for this transcription. Currently only `word` level timestamps are supported.
+        request_options : typing.Optional[RequestOptions]
+            Request-specific configuration.
+        Returns
+        -------
+        TranscriptionResponse
+        Examples
+        --------
+        import asyncio
+        from cartesia import AsyncCartesia
+        client = AsyncCartesia(
+            api_key="YOUR_API_KEY",
+        )
+        async def main() -> None:
+            await client.stt.transcribe(
+                model="ink-whisper",
+                language="en",
+            )
+        asyncio.run(main())
+        """
+        _response = await self._client_wrapper.httpx_client.request(
+            "stt",
+            method="POST",
+            params={
+                "encoding": encoding,
+                "sample_rate": sample_rate,
+            },
+            data={
+                "model": model,
+                "language": language,
+                "timestamp_granularities[]": timestamp_granularities,
+            },
+            files={
+                "file": file,
+            },
+            request_options=request_options,
+            omit=OMIT,
+        )
+        try:
+            if 200 <= _response.status_code < 300:
+                return typing.cast(
+                    TranscriptionResponse,
+                    parse_obj_as(
+                        type_=TranscriptionResponse,  # type: ignore
+                        object_=_response.json(),
+                    ),
+                )
+            _response_json = _response.json()
+        except JSONDecodeError:
+            raise ApiError(status_code=_response.status_code, body=_response.text)
+        raise ApiError(status_code=_response.status_code, body=_response_json)

cartesia/stt/requests/__init__.py CHANGED Viewed

@@ -12,6 +12,7 @@ from .streaming_transcription_response import (
 )
 from .transcript_message import TranscriptMessageParams
 from .transcription_response import TranscriptionResponseParams
+from .transcription_word import TranscriptionWordParams
 __all__ = [
     "DoneMessageParams",
@@ -24,4 +25,5 @@ __all__ = [
     "StreamingTranscriptionResponse_TranscriptParams",
     "TranscriptMessageParams",
     "TranscriptionResponseParams",
+    "TranscriptionWordParams",
 ]

cartesia/stt/requests/streaming_transcription_response.py CHANGED Viewed

@@ -4,6 +4,7 @@ from __future__ import annotations
 import typing_extensions
 import typing
 import typing_extensions
+from .transcription_word import TranscriptionWordParams
 class StreamingTranscriptionResponse_TranscriptParams(typing_extensions.TypedDict):
@@ -13,6 +14,7 @@ class StreamingTranscriptionResponse_TranscriptParams(typing_extensions.TypedDic
     is_final: bool
     duration: typing_extensions.NotRequired[float]
     language: typing_extensions.NotRequired[str]
+    words: typing_extensions.NotRequired[typing.Sequence[TranscriptionWordParams]]
 class StreamingTranscriptionResponse_FlushDoneParams(typing_extensions.TypedDict):

cartesia/stt/requests/transcript_message.py CHANGED Viewed

@@ -2,6 +2,8 @@
 import typing_extensions
 import typing_extensions
+import typing
+from .transcription_word import TranscriptionWordParams
 class TranscriptMessageParams(typing_extensions.TypedDict):
@@ -29,5 +31,10 @@ class TranscriptMessageParams(typing_extensions.TypedDict):
     language: typing_extensions.NotRequired[str]
     """
-    The detected or specified language of the input audio.
+    The specified language of the input audio.
+    """
+    words: typing_extensions.NotRequired[typing.Sequence[TranscriptionWordParams]]
+    """
+    Word-level timestamps showing the start and end time of each word in seconds. Always included in streaming responses.
     """

cartesia/stt/requests/transcription_response.py CHANGED Viewed

@@ -2,6 +2,8 @@
 import typing_extensions
 import typing_extensions
+import typing
+from .transcription_word import TranscriptionWordParams
 class TranscriptionResponseParams(typing_extensions.TypedDict):
@@ -12,10 +14,15 @@ class TranscriptionResponseParams(typing_extensions.TypedDict):
     language: typing_extensions.NotRequired[str]
     """
-    The detected or specified language of the input audio.
+    The specified language of the input audio.
     """
     duration: typing_extensions.NotRequired[float]
     """
     The duration of the input audio in seconds.
     """
+    words: typing_extensions.NotRequired[typing.Sequence[TranscriptionWordParams]]
+    """
+    Word-level timestamps showing the start and end time of each word. Only included when `[word]` is passed into `timestamp_granularities[]`.
+    """

cartesia 2.0.5__py3-none-any.whl → 2.0.7__py3-none-any.whl

cartesia 2.0.5__py3-none-any.whl → 2.0.7__py3-none-any.whl