PyPI - cartesia - Versions diffs - 2.0.5__py3-none-any.whl → 2.0.6__py3-none-any.whl - Mend

cartesia 2.0.5__py3-none-any.whl → 2.0.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (45) hide show

cartesia/__init__.py +14 -0
cartesia/auth/client.py +8 -8
cartesia/auth/requests/token_grant.py +7 -1
cartesia/auth/requests/token_request.py +3 -3
cartesia/auth/types/token_grant.py +7 -2
cartesia/auth/types/token_request.py +3 -3
cartesia/core/client_wrapper.py +1 -1
cartesia/stt/__init__.py +6 -0
cartesia/stt/_async_websocket.py +81 -72
cartesia/stt/_websocket.py +42 -20
cartesia/stt/client.py +456 -0
cartesia/stt/requests/__init__.py +2 -0
cartesia/stt/requests/streaming_transcription_response.py +2 -0
cartesia/stt/requests/transcript_message.py +8 -1
cartesia/stt/requests/transcription_response.py +8 -1
cartesia/stt/requests/transcription_word.py +20 -0
cartesia/stt/socket_client.py +52 -109
cartesia/stt/types/__init__.py +4 -0
cartesia/stt/types/streaming_transcription_response.py +2 -0
cartesia/stt/types/stt_encoding.py +3 -1
cartesia/stt/types/timestamp_granularity.py +5 -0
cartesia/stt/types/transcript_message.py +7 -1
cartesia/stt/types/transcription_response.py +7 -1
cartesia/stt/types/transcription_word.py +32 -0
cartesia/tts/__init__.py +8 -0
cartesia/tts/client.py +50 -8
cartesia/tts/requests/__init__.py +4 -0
cartesia/tts/requests/generation_request.py +4 -4
cartesia/tts/requests/sse_output_format.py +11 -0
cartesia/tts/requests/ttssse_request.py +47 -0
cartesia/tts/requests/web_socket_chunk_response.py +0 -3
cartesia/tts/requests/web_socket_response.py +1 -2
cartesia/tts/requests/web_socket_tts_request.py +9 -1
cartesia/tts/types/__init__.py +4 -0
cartesia/tts/types/generation_request.py +4 -4
cartesia/tts/types/sse_output_format.py +22 -0
cartesia/tts/types/ttssse_request.py +58 -0
cartesia/tts/types/web_socket_chunk_response.py +1 -3
cartesia/tts/types/web_socket_response.py +1 -2
cartesia/tts/types/web_socket_tts_request.py +11 -3
cartesia/voice_changer/requests/streaming_response.py +0 -2
cartesia/voice_changer/types/streaming_response.py +0 -2
{cartesia-2.0.5.dist-info → cartesia-2.0.6.dist-info}/METADATA +113 -16
{cartesia-2.0.5.dist-info → cartesia-2.0.6.dist-info}/RECORD +45 -37
{cartesia-2.0.5.dist-info → cartesia-2.0.6.dist-info}/WHEEL +0 -0

cartesia/stt/client.py ADDED Viewed

@@ -0,0 +1,456 @@
+# This file was auto-generated by Fern from our API Definition.
+import typing
+from ..core.client_wrapper import SyncClientWrapper
+from .. import core
+from .types.stt_encoding import SttEncoding
+from .types.timestamp_granularity import TimestampGranularity
+from ..core.request_options import RequestOptions
+from .types.transcription_response import TranscriptionResponse
+from ..core.pydantic_utilities import parse_obj_as
+from json.decoder import JSONDecodeError
+from ..core.api_error import ApiError
+from ..core.client_wrapper import AsyncClientWrapper
+# this is used as the default value for optional parameters
+OMIT = typing.cast(typing.Any, ...)
+class SttClient:  # Synchronous client for the batch speech-to-text ("stt") endpoint.
+    def __init__(self, *, client_wrapper: SyncClientWrapper):
+        self._client_wrapper = client_wrapper  # its httpx_client is what transcribe() sends requests through
+    def transcribe(
+        self,
+        *,
+        file: core.File,
+        model: str,
+        encoding: typing.Optional[SttEncoding] = None,
+        sample_rate: typing.Optional[int] = None,
+        language: typing.Optional[str] = OMIT,
+        timestamp_granularities: typing.Optional[typing.List[TimestampGranularity]] = OMIT,
+        request_options: typing.Optional[RequestOptions] = None,
+    ) -> TranscriptionResponse:
+        """
+        Transcribes audio files into text using Cartesia's Speech-to-Text API.
+        Upload an audio file and receive a complete transcription response. Supports arbitrarily long audio files with automatic intelligent chunking for longer audio.
+        **Supported audio formats:** flac, m4a, mp3, mp4, mpeg, mpga, oga, ogg, wav, webm
+        **Response format:** Returns JSON with transcribed text, duration, and language. Include `timestamp_granularities: ["word"]` to get word-level timestamps.
+        **Pricing:** Batch transcription is priced at **1 credit per 2 seconds** of audio processed.
+        <Note>
+        For migrating from the OpenAI SDK, see our [OpenAI Whisper to Cartesia Ink Migration Guide](/api-reference/stt/migrate-from-open-ai).
+        </Note>
+        This method issues a single POST to `stt`: `encoding` and `sample_rate` are passed as `params`, `model`, `language` and `timestamp_granularities[]` as `data`, and `file` as the `file` entry of `files`.
+        Parameters
+        ----------
+        file : core.File
+            Required. The audio to transcribe. See core.File for more documentation
+        model : str
+            ID of the model to use for transcription. Use `ink-whisper` for the latest Cartesia Whisper model.
+        encoding : typing.Optional[SttEncoding]
+            The encoding format to process the audio as. If not specified, the audio file will be decoded automatically.
+            **Supported formats:**
+            - `pcm_s16le` - 16-bit signed integer PCM, little-endian (recommended for best performance)
+            - `pcm_s32le` - 32-bit signed integer PCM, little-endian
+            - `pcm_f16le` - 16-bit floating point PCM, little-endian
+            - `pcm_f32le` - 32-bit floating point PCM, little-endian
+            - `pcm_mulaw` - 8-bit μ-law encoded PCM
+            - `pcm_alaw` - 8-bit A-law encoded PCM
+        sample_rate : typing.Optional[int]
+            The sample rate of the audio in Hz.
+        language : typing.Optional[str]
+            The language code of the input audio, normally in ISO-639-1 format (the list below also contains the three-letter codes `haw` and `yue`). Defaults to `en`.
+            <Accordion title="Supported languages">
+              - `en` (English)
+              - `zh` (Chinese)
+              - `de` (German)
+              - `es` (Spanish)
+              - `ru` (Russian)
+              - `ko` (Korean)
+              - `fr` (French)
+              - `ja` (Japanese)
+              - `pt` (Portuguese)
+              - `tr` (Turkish)
+              - `pl` (Polish)
+              - `ca` (Catalan)
+              - `nl` (Dutch)
+              - `ar` (Arabic)
+              - `sv` (Swedish)
+              - `it` (Italian)
+              - `id` (Indonesian)
+              - `hi` (Hindi)
+              - `fi` (Finnish)
+              - `vi` (Vietnamese)
+              - `he` (Hebrew)
+              - `uk` (Ukrainian)
+              - `el` (Greek)
+              - `ms` (Malay)
+              - `cs` (Czech)
+              - `ro` (Romanian)
+              - `da` (Danish)
+              - `hu` (Hungarian)
+              - `ta` (Tamil)
+              - `no` (Norwegian)
+              - `th` (Thai)
+              - `ur` (Urdu)
+              - `hr` (Croatian)
+              - `bg` (Bulgarian)
+              - `lt` (Lithuanian)
+              - `la` (Latin)
+              - `mi` (Maori)
+              - `ml` (Malayalam)
+              - `cy` (Welsh)
+              - `sk` (Slovak)
+              - `te` (Telugu)
+              - `fa` (Persian)
+              - `lv` (Latvian)
+              - `bn` (Bengali)
+              - `sr` (Serbian)
+              - `az` (Azerbaijani)
+              - `sl` (Slovenian)
+              - `kn` (Kannada)
+              - `et` (Estonian)
+              - `mk` (Macedonian)
+              - `br` (Breton)
+              - `eu` (Basque)
+              - `is` (Icelandic)
+              - `hy` (Armenian)
+              - `ne` (Nepali)
+              - `mn` (Mongolian)
+              - `bs` (Bosnian)
+              - `kk` (Kazakh)
+              - `sq` (Albanian)
+              - `sw` (Swahili)
+              - `gl` (Galician)
+              - `mr` (Marathi)
+              - `pa` (Punjabi)
+              - `si` (Sinhala)
+              - `km` (Khmer)
+              - `sn` (Shona)
+              - `yo` (Yoruba)
+              - `so` (Somali)
+              - `af` (Afrikaans)
+              - `oc` (Occitan)
+              - `ka` (Georgian)
+              - `be` (Belarusian)
+              - `tg` (Tajik)
+              - `sd` (Sindhi)
+              - `gu` (Gujarati)
+              - `am` (Amharic)
+              - `yi` (Yiddish)
+              - `lo` (Lao)
+              - `uz` (Uzbek)
+              - `fo` (Faroese)
+              - `ht` (Haitian Creole)
+              - `ps` (Pashto)
+              - `tk` (Turkmen)
+              - `nn` (Nynorsk)
+              - `mt` (Maltese)
+              - `sa` (Sanskrit)
+              - `lb` (Luxembourgish)
+              - `my` (Myanmar)
+              - `bo` (Tibetan)
+              - `tl` (Tagalog)
+              - `mg` (Malagasy)
+              - `as` (Assamese)
+              - `tt` (Tatar)
+              - `haw` (Hawaiian)
+              - `ln` (Lingala)
+              - `ha` (Hausa)
+              - `ba` (Bashkir)
+              - `jw` (Javanese)
+              - `su` (Sundanese)
+              - `yue` (Cantonese)
+            </Accordion>
+        timestamp_granularities : typing.Optional[typing.List[TimestampGranularity]]
+            The timestamp granularities to populate for this transcription. Currently only `word` level timestamps are supported.
+        request_options : typing.Optional[RequestOptions]
+            Request-specific configuration.
+        Returns
+        -------
+        TranscriptionResponse
+            The JSON body of a 2xx response, parsed into a `TranscriptionResponse`.
+        Raises
+        ------
+        ApiError
+            If the response status is not 2xx, or if the response body cannot be decoded as JSON. The error carries the status code and the parsed JSON body, or the raw response text when the body is not JSON.
+        Examples
+        --------
+        from cartesia import Cartesia
+        client = Cartesia(
+            api_key="YOUR_API_KEY",
+        )
+        client.stt.transcribe(
+            file=audio_file,  # required; `audio_file` is a placeholder for any value accepted by core.File
+            model="ink-whisper",
+            language="en",
+        )
+        """
+        _response = self._client_wrapper.httpx_client.request(
+            "stt",
+            method="POST",
+            params={  # encoding/sample_rate are sent via params, not with the form fields below
+                "encoding": encoding,
+                "sample_rate": sample_rate,
+            },
+            data={
+                "model": model,
+                "language": language,
+                "timestamp_granularities[]": timestamp_granularities,
+            },
+            files={
+                "file": file,
+            },
+            request_options=request_options,
+            omit=OMIT,  # NOTE(review): presumably lets the wrapper drop fields left at OMIT; confirm
+        )
+        try:
+            if 200 <= _response.status_code < 300:  # any 2xx status counts as success
+                return typing.cast(
+                    TranscriptionResponse,
+                    parse_obj_as(
+                        type_=TranscriptionResponse,  # type: ignore
+                        object_=_response.json(),
+                    ),
+                )
+            _response_json = _response.json()  # non-2xx: keep the parsed error body for ApiError
+        except JSONDecodeError:  # body was not JSON (success or error path): report the raw text
+            raise ApiError(status_code=_response.status_code, body=_response.text)
+        raise ApiError(status_code=_response.status_code, body=_response_json)
+class AsyncSttClient:  # Asynchronous client for the batch speech-to-text ("stt") endpoint.
+    def __init__(self, *, client_wrapper: AsyncClientWrapper):
+        self._client_wrapper = client_wrapper  # its httpx_client is what transcribe() awaits requests on
+    async def transcribe(
+        self,
+        *,
+        file: core.File,
+        model: str,
+        encoding: typing.Optional[SttEncoding] = None,
+        sample_rate: typing.Optional[int] = None,
+        language: typing.Optional[str] = OMIT,
+        timestamp_granularities: typing.Optional[typing.List[TimestampGranularity]] = OMIT,
+        request_options: typing.Optional[RequestOptions] = None,
+    ) -> TranscriptionResponse:
+        """
+        Transcribes audio files into text using Cartesia's Speech-to-Text API.
+        Upload an audio file and receive a complete transcription response. Supports arbitrarily long audio files with automatic intelligent chunking for longer audio.
+        **Supported audio formats:** flac, m4a, mp3, mp4, mpeg, mpga, oga, ogg, wav, webm
+        **Response format:** Returns JSON with transcribed text, duration, and language. Include `timestamp_granularities: ["word"]` to get word-level timestamps.
+        **Pricing:** Batch transcription is priced at **1 credit per 2 seconds** of audio processed.
+        <Note>
+        For migrating from the OpenAI SDK, see our [OpenAI Whisper to Cartesia Ink Migration Guide](/api-reference/stt/migrate-from-open-ai).
+        </Note>
+        This coroutine awaits a single POST to `stt`: `encoding` and `sample_rate` are passed as `params`, `model`, `language` and `timestamp_granularities[]` as `data`, and `file` as the `file` entry of `files`.
+        Parameters
+        ----------
+        file : core.File
+            Required. The audio to transcribe. See core.File for more documentation
+        model : str
+            ID of the model to use for transcription. Use `ink-whisper` for the latest Cartesia Whisper model.
+        encoding : typing.Optional[SttEncoding]
+            The encoding format to process the audio as. If not specified, the audio file will be decoded automatically.
+            **Supported formats:**
+            - `pcm_s16le` - 16-bit signed integer PCM, little-endian (recommended for best performance)
+            - `pcm_s32le` - 32-bit signed integer PCM, little-endian
+            - `pcm_f16le` - 16-bit floating point PCM, little-endian
+            - `pcm_f32le` - 32-bit floating point PCM, little-endian
+            - `pcm_mulaw` - 8-bit μ-law encoded PCM
+            - `pcm_alaw` - 8-bit A-law encoded PCM
+        sample_rate : typing.Optional[int]
+            The sample rate of the audio in Hz.
+        language : typing.Optional[str]
+            The language code of the input audio, normally in ISO-639-1 format (the list below also contains the three-letter codes `haw` and `yue`). Defaults to `en`.
+            <Accordion title="Supported languages">
+              - `en` (English)
+              - `zh` (Chinese)
+              - `de` (German)
+              - `es` (Spanish)
+              - `ru` (Russian)
+              - `ko` (Korean)
+              - `fr` (French)
+              - `ja` (Japanese)
+              - `pt` (Portuguese)
+              - `tr` (Turkish)
+              - `pl` (Polish)
+              - `ca` (Catalan)
+              - `nl` (Dutch)
+              - `ar` (Arabic)
+              - `sv` (Swedish)
+              - `it` (Italian)
+              - `id` (Indonesian)
+              - `hi` (Hindi)
+              - `fi` (Finnish)
+              - `vi` (Vietnamese)
+              - `he` (Hebrew)
+              - `uk` (Ukrainian)
+              - `el` (Greek)
+              - `ms` (Malay)
+              - `cs` (Czech)
+              - `ro` (Romanian)
+              - `da` (Danish)
+              - `hu` (Hungarian)
+              - `ta` (Tamil)
+              - `no` (Norwegian)
+              - `th` (Thai)
+              - `ur` (Urdu)
+              - `hr` (Croatian)
+              - `bg` (Bulgarian)
+              - `lt` (Lithuanian)
+              - `la` (Latin)
+              - `mi` (Maori)
+              - `ml` (Malayalam)
+              - `cy` (Welsh)
+              - `sk` (Slovak)
+              - `te` (Telugu)
+              - `fa` (Persian)
+              - `lv` (Latvian)
+              - `bn` (Bengali)
+              - `sr` (Serbian)
+              - `az` (Azerbaijani)
+              - `sl` (Slovenian)
+              - `kn` (Kannada)
+              - `et` (Estonian)
+              - `mk` (Macedonian)
+              - `br` (Breton)
+              - `eu` (Basque)
+              - `is` (Icelandic)
+              - `hy` (Armenian)
+              - `ne` (Nepali)
+              - `mn` (Mongolian)
+              - `bs` (Bosnian)
+              - `kk` (Kazakh)
+              - `sq` (Albanian)
+              - `sw` (Swahili)
+              - `gl` (Galician)
+              - `mr` (Marathi)
+              - `pa` (Punjabi)
+              - `si` (Sinhala)
+              - `km` (Khmer)
+              - `sn` (Shona)
+              - `yo` (Yoruba)
+              - `so` (Somali)
+              - `af` (Afrikaans)
+              - `oc` (Occitan)
+              - `ka` (Georgian)
+              - `be` (Belarusian)
+              - `tg` (Tajik)
+              - `sd` (Sindhi)
+              - `gu` (Gujarati)
+              - `am` (Amharic)
+              - `yi` (Yiddish)
+              - `lo` (Lao)
+              - `uz` (Uzbek)
+              - `fo` (Faroese)
+              - `ht` (Haitian Creole)
+              - `ps` (Pashto)
+              - `tk` (Turkmen)
+              - `nn` (Nynorsk)
+              - `mt` (Maltese)
+              - `sa` (Sanskrit)
+              - `lb` (Luxembourgish)
+              - `my` (Myanmar)
+              - `bo` (Tibetan)
+              - `tl` (Tagalog)
+              - `mg` (Malagasy)
+              - `as` (Assamese)
+              - `tt` (Tatar)
+              - `haw` (Hawaiian)
+              - `ln` (Lingala)
+              - `ha` (Hausa)
+              - `ba` (Bashkir)
+              - `jw` (Javanese)
+              - `su` (Sundanese)
+              - `yue` (Cantonese)
+            </Accordion>
+        timestamp_granularities : typing.Optional[typing.List[TimestampGranularity]]
+            The timestamp granularities to populate for this transcription. Currently only `word` level timestamps are supported.
+        request_options : typing.Optional[RequestOptions]
+            Request-specific configuration.
+        Returns
+        -------
+        TranscriptionResponse
+            The JSON body of a 2xx response, parsed into a `TranscriptionResponse`.
+        Raises
+        ------
+        ApiError
+            If the response status is not 2xx, or if the response body cannot be decoded as JSON. The error carries the status code and the parsed JSON body, or the raw response text when the body is not JSON.
+        Examples
+        --------
+        import asyncio
+        from cartesia import AsyncCartesia
+        client = AsyncCartesia(
+            api_key="YOUR_API_KEY",
+        )
+        async def main() -> None:
+            await client.stt.transcribe(
+                file=audio_file,  # required; `audio_file` is a placeholder for any value accepted by core.File
+                model="ink-whisper",
+                language="en",
+            )
+        asyncio.run(main())
+        """
+        _response = await self._client_wrapper.httpx_client.request(
+            "stt",
+            method="POST",
+            params={  # encoding/sample_rate are sent via params, not with the form fields below
+                "encoding": encoding,
+                "sample_rate": sample_rate,
+            },
+            data={
+                "model": model,
+                "language": language,
+                "timestamp_granularities[]": timestamp_granularities,
+            },
+            files={
+                "file": file,
+            },
+            request_options=request_options,
+            omit=OMIT,  # NOTE(review): presumably lets the wrapper drop fields left at OMIT; confirm
+        )
+        try:
+            if 200 <= _response.status_code < 300:  # any 2xx status counts as success
+                return typing.cast(
+                    TranscriptionResponse,
+                    parse_obj_as(
+                        type_=TranscriptionResponse,  # type: ignore
+                        object_=_response.json(),
+                    ),
+                )
+            _response_json = _response.json()  # non-2xx: keep the parsed error body for ApiError
+        except JSONDecodeError:  # body was not JSON (success or error path): report the raw text
+            raise ApiError(status_code=_response.status_code, body=_response.text)
+        raise ApiError(status_code=_response.status_code, body=_response_json)

cartesia/stt/requests/__init__.py CHANGED Viewed

@@ -12,6 +12,7 @@ from .streaming_transcription_response import (
 )
 from .transcript_message import TranscriptMessageParams
 from .transcription_response import TranscriptionResponseParams
+from .transcription_word import TranscriptionWordParams
 __all__ = [
     "DoneMessageParams",
@@ -24,4 +25,5 @@ __all__ = [
     "StreamingTranscriptionResponse_TranscriptParams",
     "TranscriptMessageParams",
     "TranscriptionResponseParams",
+    "TranscriptionWordParams",
 ]

cartesia/stt/requests/streaming_transcription_response.py CHANGED Viewed

@@ -4,6 +4,7 @@ from __future__ import annotations
 import typing_extensions
 import typing
 import typing_extensions
+from .transcription_word import TranscriptionWordParams
 class StreamingTranscriptionResponse_TranscriptParams(typing_extensions.TypedDict):
@@ -13,6 +14,7 @@ class StreamingTranscriptionResponse_TranscriptParams(typing_extensions.TypedDic
     is_final: bool
     duration: typing_extensions.NotRequired[float]
     language: typing_extensions.NotRequired[str]
+    words: typing_extensions.NotRequired[typing.Sequence[TranscriptionWordParams]]
 class StreamingTranscriptionResponse_FlushDoneParams(typing_extensions.TypedDict):

cartesia/stt/requests/transcript_message.py CHANGED Viewed

@@ -2,6 +2,8 @@
 import typing_extensions
 import typing_extensions
+import typing
+from .transcription_word import TranscriptionWordParams
 class TranscriptMessageParams(typing_extensions.TypedDict):
@@ -29,5 +31,10 @@ class TranscriptMessageParams(typing_extensions.TypedDict):
     language: typing_extensions.NotRequired[str]
     """
-    The detected or specified language of the input audio.
+    The specified language of the input audio.
+    """
+    words: typing_extensions.NotRequired[typing.Sequence[TranscriptionWordParams]]
+    """
+    Word-level timestamps showing the start and end time of each word in seconds. Always included in streaming responses.
     """

cartesia/stt/requests/transcription_response.py CHANGED Viewed

@@ -2,6 +2,8 @@
 import typing_extensions
 import typing_extensions
+import typing
+from .transcription_word import TranscriptionWordParams
 class TranscriptionResponseParams(typing_extensions.TypedDict):
@@ -12,10 +14,15 @@ class TranscriptionResponseParams(typing_extensions.TypedDict):
     language: typing_extensions.NotRequired[str]
     """
-    The detected or specified language of the input audio.
+    The specified language of the input audio.
     """
     duration: typing_extensions.NotRequired[float]
     """
     The duration of the input audio in seconds.
     """
+    words: typing_extensions.NotRequired[typing.Sequence[TranscriptionWordParams]]
+    """
+    Word-level timestamps showing the start and end time of each word. Only included when `[word]` is passed into `timestamp_granularities[]`.
+    """

cartesia/stt/requests/transcription_word.py ADDED Viewed

@@ -0,0 +1,20 @@
+# This file was auto-generated by Fern from our API Definition.
+import typing_extensions
+class TranscriptionWordParams(typing_extensions.TypedDict):  # One transcribed word with its start/end times in seconds.
+    word: str
+    """
+    The transcribed word.
+    """
+    start: float
+    """
+    Start time of the word in seconds.
+    """
+    end: float
+    """
+    End time of the word in seconds.
+    """

cartesia 2.0.5__py3-none-any.whl → 2.0.6__py3-none-any.whl

cartesia 2.0.5__py3-none-any.whl → 2.0.6__py3-none-any.whl