PyPI - cartesia - Versions diffs - 2.0.4__py3-none-any.whl → 2.0.6__py3-none-any.whl - Mend

cartesia 2.0.4__py3-none-any.whl → 2.0.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (53)

cartesia/__init__.py +60 -1
cartesia/auth/client.py +8 -8
cartesia/auth/requests/token_grant.py +7 -1
cartesia/auth/requests/token_request.py +3 -3
cartesia/auth/types/token_grant.py +7 -2
cartesia/auth/types/token_request.py +3 -3
cartesia/base_client.py +2 -0
cartesia/client.py +5 -0
cartesia/core/client_wrapper.py +1 -1
cartesia/stt/__init__.py +57 -0
cartesia/stt/_async_websocket.py +293 -0
cartesia/stt/_websocket.py +294 -0
cartesia/stt/client.py +456 -0
cartesia/stt/requests/__init__.py +29 -0
cartesia/stt/requests/done_message.py +14 -0
cartesia/stt/requests/error_message.py +16 -0
cartesia/stt/requests/flush_done_message.py +14 -0
cartesia/stt/requests/streaming_transcription_response.py +41 -0
cartesia/stt/requests/transcript_message.py +40 -0
cartesia/stt/requests/transcription_response.py +28 -0
cartesia/stt/requests/transcription_word.py +20 -0
cartesia/stt/socket_client.py +138 -0
cartesia/stt/types/__init__.py +33 -0
cartesia/stt/types/done_message.py +26 -0
cartesia/stt/types/error_message.py +27 -0
cartesia/stt/types/flush_done_message.py +26 -0
cartesia/stt/types/streaming_transcription_response.py +94 -0
cartesia/stt/types/stt_encoding.py +7 -0
cartesia/stt/types/timestamp_granularity.py +5 -0
cartesia/stt/types/transcript_message.py +50 -0
cartesia/stt/types/transcription_response.py +38 -0
cartesia/stt/types/transcription_word.py +32 -0
cartesia/tts/__init__.py +8 -0
cartesia/tts/client.py +50 -8
cartesia/tts/requests/__init__.py +4 -0
cartesia/tts/requests/generation_request.py +4 -4
cartesia/tts/requests/sse_output_format.py +11 -0
cartesia/tts/requests/ttssse_request.py +47 -0
cartesia/tts/requests/web_socket_chunk_response.py +0 -3
cartesia/tts/requests/web_socket_response.py +1 -2
cartesia/tts/requests/web_socket_tts_request.py +9 -1
cartesia/tts/types/__init__.py +4 -0
cartesia/tts/types/generation_request.py +4 -4
cartesia/tts/types/sse_output_format.py +22 -0
cartesia/tts/types/ttssse_request.py +58 -0
cartesia/tts/types/web_socket_chunk_response.py +1 -3
cartesia/tts/types/web_socket_response.py +1 -2
cartesia/tts/types/web_socket_tts_request.py +11 -3
cartesia/voice_changer/requests/streaming_response.py +0 -2
cartesia/voice_changer/types/streaming_response.py +0 -2
{cartesia-2.0.4.dist-info → cartesia-2.0.6.dist-info}/METADATA +256 -2
{cartesia-2.0.4.dist-info → cartesia-2.0.6.dist-info}/RECORD +53 -26
{cartesia-2.0.4.dist-info → cartesia-2.0.6.dist-info}/WHEEL +0 -0

cartesia/__init__.py CHANGED Viewed

@@ -1,6 +1,6 @@
 # This file was auto-generated by Fern from our API Definition.
-from . import api_status, auth, datasets, embedding, infill, tts, voice_changer, voices
+from . import api_status, auth, datasets, embedding, infill, stt, tts, voice_changer, voices
 from .api_status import ApiInfo, ApiInfoParams
 from .auth import TokenGrant, TokenGrantParams, TokenRequest, TokenRequestParams, TokenResponse, TokenResponseParams
 from .client import AsyncCartesia, Cartesia
@@ -19,6 +19,32 @@ from .datasets import (
 )
 from .embedding import Embedding
 from .environment import CartesiaEnvironment
+from .stt import (
+    DoneMessage,
+    DoneMessageParams,
+    ErrorMessage,
+    ErrorMessageParams,
+    FlushDoneMessage,
+    FlushDoneMessageParams,
+    StreamingTranscriptionResponse,
+    StreamingTranscriptionResponseParams,
+    StreamingTranscriptionResponse_Done,
+    StreamingTranscriptionResponse_DoneParams,
+    StreamingTranscriptionResponse_Error,
+    StreamingTranscriptionResponse_ErrorParams,
+    StreamingTranscriptionResponse_FlushDone,
+    StreamingTranscriptionResponse_FlushDoneParams,
+    StreamingTranscriptionResponse_Transcript,
+    StreamingTranscriptionResponse_TranscriptParams,
+    SttEncoding,
+    TimestampGranularity,
+    TranscriptMessage,
+    TranscriptMessageParams,
+    TranscriptionResponse,
+    TranscriptionResponseParams,
+    TranscriptionWord,
+    TranscriptionWordParams,
+)
 from .tts import (
     CancelContextRequest,
     CancelContextRequestParams,
@@ -49,6 +75,8 @@ from .tts import (
     RawOutputFormatParams,
     Speed,
     SpeedParams,
+    SseOutputFormat,
+    SseOutputFormatParams,
     SupportedLanguage,
     TtsRequest,
     TtsRequestEmbeddingSpecifier,
@@ -58,6 +86,8 @@ from .tts import (
     TtsRequestParams,
     TtsRequestVoiceSpecifier,
     TtsRequestVoiceSpecifierParams,
+    TtssseRequest,
+    TtssseRequestParams,
     WavOutputFormat,
     WavOutputFormatParams,
     WebSocketBaseResponse,
@@ -173,13 +203,19 @@ __all__ = [
     "DatasetFile",
     "DatasetFileParams",
     "DatasetParams",
+    "DoneMessage",
+    "DoneMessageParams",
     "Embedding",
     "EmbeddingResponse",
     "EmbeddingResponseParams",
     "EmbeddingSpecifier",
     "EmbeddingSpecifierParams",
     "Emotion",
+    "ErrorMessage",
+    "ErrorMessageParams",
     "FilePurpose",
+    "FlushDoneMessage",
+    "FlushDoneMessageParams",
     "FlushId",
     "Gender",
     "GenderPresentation",
@@ -227,6 +263,8 @@ __all__ = [
     "RawOutputFormatParams",
     "Speed",
     "SpeedParams",
+    "SseOutputFormat",
+    "SseOutputFormatParams",
     "StreamingResponse",
     "StreamingResponseParams",
     "StreamingResponse_Chunk",
@@ -235,13 +273,31 @@ __all__ = [
     "StreamingResponse_DoneParams",
     "StreamingResponse_Error",
     "StreamingResponse_ErrorParams",
+    "StreamingTranscriptionResponse",
+    "StreamingTranscriptionResponseParams",
+    "StreamingTranscriptionResponse_Done",
+    "StreamingTranscriptionResponse_DoneParams",
+    "StreamingTranscriptionResponse_Error",
+    "StreamingTranscriptionResponse_ErrorParams",
+    "StreamingTranscriptionResponse_FlushDone",
+    "StreamingTranscriptionResponse_FlushDoneParams",
+    "StreamingTranscriptionResponse_Transcript",
+    "StreamingTranscriptionResponse_TranscriptParams",
+    "SttEncoding",
     "SupportedLanguage",
+    "TimestampGranularity",
     "TokenGrant",
     "TokenGrantParams",
     "TokenRequest",
     "TokenRequestParams",
     "TokenResponse",
     "TokenResponseParams",
+    "TranscriptMessage",
+    "TranscriptMessageParams",
+    "TranscriptionResponse",
+    "TranscriptionResponseParams",
+    "TranscriptionWord",
+    "TranscriptionWordParams",
     "TtsRequest",
     "TtsRequestEmbeddingSpecifier",
     "TtsRequestEmbeddingSpecifierParams",
@@ -250,6 +306,8 @@ __all__ = [
     "TtsRequestParams",
     "TtsRequestVoiceSpecifier",
     "TtsRequestVoiceSpecifierParams",
+    "TtssseRequest",
+    "TtssseRequestParams",
     "UpdateVoiceRequest",
     "UpdateVoiceRequestParams",
     "Voice",
@@ -307,6 +365,7 @@ __all__ = [
     "datasets",
     "embedding",
     "infill",
+    "stt",
     "tts",
     "voice_changer",
     "voices",

cartesia/auth/client.py CHANGED Viewed

@@ -22,7 +22,7 @@ class AuthClient:
     def access_token(
         self,
         *,
-        grants: TokenGrantParams,
+        grants: typing.Optional[TokenGrantParams] = OMIT,
         expires_in: typing.Optional[int] = OMIT,
         request_options: typing.Optional[RequestOptions] = None,
     ) -> TokenResponse:
@@ -31,8 +31,8 @@ class AuthClient:
         Parameters
         ----------
-        grants : TokenGrantParams
-            The permissions to be granted via the token.
+        grants : typing.Optional[TokenGrantParams]
+            The permissions to be granted via the token. Both TTS and STT grants are optional - specify only the capabilities you need.
         expires_in : typing.Optional[int]
             The number of seconds the token will be valid for since the time of generation. The maximum is 1 hour (3600 seconds).
@@ -52,7 +52,7 @@ class AuthClient:
             api_key="YOUR_API_KEY",
         )
         client.auth.access_token(
-            grants={"tts": True},
+            grants={"tts": True, "stt": True},
             expires_in=60,
         )
         """
@@ -90,7 +90,7 @@ class AsyncAuthClient:
     async def access_token(
         self,
         *,
-        grants: TokenGrantParams,
+        grants: typing.Optional[TokenGrantParams] = OMIT,
         expires_in: typing.Optional[int] = OMIT,
         request_options: typing.Optional[RequestOptions] = None,
     ) -> TokenResponse:
@@ -99,8 +99,8 @@ class AsyncAuthClient:
         Parameters
         ----------
-        grants : TokenGrantParams
-            The permissions to be granted via the token.
+        grants : typing.Optional[TokenGrantParams]
+            The permissions to be granted via the token. Both TTS and STT grants are optional - specify only the capabilities you need.
         expires_in : typing.Optional[int]
             The number of seconds the token will be valid for since the time of generation. The maximum is 1 hour (3600 seconds).
@@ -125,7 +125,7 @@ class AsyncAuthClient:
         async def main() -> None:
             await client.auth.access_token(
-                grants={"tts": True},
+                grants={"tts": True, "stt": True},
                 expires_in=60,
             )

cartesia/auth/requests/token_grant.py CHANGED Viewed

@@ -1,10 +1,16 @@
 # This file was auto-generated by Fern from our API Definition.
 import typing_extensions
+import typing_extensions
 class TokenGrantParams(typing_extensions.TypedDict):
-    tts: bool
+    tts: typing_extensions.NotRequired[bool]
     """
     The `tts` grant allows the token to be used to access any TTS endpoint.
     """
+    stt: typing_extensions.NotRequired[bool]
+    """
+    The `stt` grant allows the token to be used to access any STT endpoint.
+    """

cartesia/auth/requests/token_request.py CHANGED Viewed

@@ -1,14 +1,14 @@
 # This file was auto-generated by Fern from our API Definition.
 import typing_extensions
-from .token_grant import TokenGrantParams
 import typing_extensions
+from .token_grant import TokenGrantParams
 class TokenRequestParams(typing_extensions.TypedDict):
-    grants: TokenGrantParams
+    grants: typing_extensions.NotRequired[TokenGrantParams]
     """
-    The permissions to be granted via the token.
+    The permissions to be granted via the token. Both TTS and STT grants are optional - specify only the capabilities you need.
     """
     expires_in: typing_extensions.NotRequired[int]

cartesia/auth/types/token_grant.py CHANGED Viewed

@@ -1,17 +1,22 @@
 # This file was auto-generated by Fern from our API Definition.
 from ...core.pydantic_utilities import UniversalBaseModel
+import typing
 import pydantic
 from ...core.pydantic_utilities import IS_PYDANTIC_V2
-import typing
 class TokenGrant(UniversalBaseModel):
-    tts: bool = pydantic.Field()
+    tts: typing.Optional[bool] = pydantic.Field(default=None)
     """
     The `tts` grant allows the token to be used to access any TTS endpoint.
     """
+    stt: typing.Optional[bool] = pydantic.Field(default=None)
+    """
+    The `stt` grant allows the token to be used to access any STT endpoint.
+    """
     if IS_PYDANTIC_V2:
         model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True)  # type: ignore # Pydantic v2
     else:

cartesia/auth/types/token_request.py CHANGED Viewed

@@ -1,16 +1,16 @@
 # This file was auto-generated by Fern from our API Definition.
 from ...core.pydantic_utilities import UniversalBaseModel
+import typing
 from .token_grant import TokenGrant
 import pydantic
-import typing
 from ...core.pydantic_utilities import IS_PYDANTIC_V2
 class TokenRequest(UniversalBaseModel):
-    grants: TokenGrant = pydantic.Field()
+    grants: typing.Optional[TokenGrant] = pydantic.Field(default=None)
     """
-    The permissions to be granted via the token.
+    The permissions to be granted via the token. Both TTS and STT grants are optional - specify only the capabilities you need.
     """
     expires_in: typing.Optional[int] = pydantic.Field(default=None)

cartesia/base_client.py CHANGED Viewed

@@ -7,6 +7,7 @@ from .core.client_wrapper import SyncClientWrapper
 from .api_status.client import ApiStatusClient
 from .auth.client import AuthClient
 from .infill.client import InfillClient
+from .stt.socket_client import AsyncSttClientWithWebsocket, SttClientWithWebsocket
 from .tts.client import TtsClient
 from .voice_changer.client import VoiceChangerClient
 from .voices.client import VoicesClient
@@ -80,6 +81,7 @@ class BaseCartesia:
         self.api_status = ApiStatusClient(client_wrapper=self._client_wrapper)
         self.auth = AuthClient(client_wrapper=self._client_wrapper)
         self.infill = InfillClient(client_wrapper=self._client_wrapper)
+        self.stt = SttClientWithWebsocket(client_wrapper=self._client_wrapper)
         self.tts = TtsClient(client_wrapper=self._client_wrapper)
         self.voice_changer = VoiceChangerClient(client_wrapper=self._client_wrapper)
         self.voices = VoicesClient(client_wrapper=self._client_wrapper)

cartesia/client.py CHANGED Viewed

@@ -8,6 +8,7 @@ import httpx
 from .base_client import AsyncBaseCartesia, BaseCartesia
 from .environment import CartesiaEnvironment
+from .stt.socket_client import AsyncSttClientWithWebsocket, SttClientWithWebsocket
 from .tts.socket_client import AsyncTtsClientWithWebsocket, TtsClientWithWebsocket
@@ -66,6 +67,7 @@ class Cartesia(BaseCartesia):
             follow_redirects=follow_redirects,
             httpx_client=httpx_client,
         )
+        self.stt = SttClientWithWebsocket(client_wrapper=self._client_wrapper)
         self.tts = TtsClientWithWebsocket(client_wrapper=self._client_wrapper)
     def __enter__(self):
@@ -143,6 +145,9 @@ class AsyncCartesia(AsyncBaseCartesia):
         self._session = None
         self._loop = None
         self.max_num_connections = max_num_connections
+        self.stt = AsyncSttClientWithWebsocket(
+            client_wrapper=self._client_wrapper, get_session=self._get_session
+        )
         self.tts = AsyncTtsClientWithWebsocket(
             client_wrapper=self._client_wrapper, get_session=self._get_session
         )

cartesia/core/client_wrapper.py CHANGED Viewed

@@ -16,7 +16,7 @@ class BaseClientWrapper:
         headers: typing.Dict[str, str] = {
             "X-Fern-Language": "Python",
             "X-Fern-SDK-Name": "cartesia",
-            "X-Fern-SDK-Version": "2.0.4",
+            "X-Fern-SDK-Version": "2.0.6",
         }
         headers["X-API-Key"] = self.api_key
         headers["Cartesia-Version"] = "2024-11-13"

cartesia/stt/__init__.py ADDED Viewed

@@ -0,0 +1,57 @@
+# This file was auto-generated by Fern from our API Definition.
+from .types import (
+    DoneMessage,
+    ErrorMessage,
+    FlushDoneMessage,
+    StreamingTranscriptionResponse,
+    StreamingTranscriptionResponse_Done,
+    StreamingTranscriptionResponse_Error,
+    StreamingTranscriptionResponse_FlushDone,
+    StreamingTranscriptionResponse_Transcript,
+    SttEncoding,
+    TimestampGranularity,
+    TranscriptMessage,
+    TranscriptionResponse,
+    TranscriptionWord,
+)
+from .requests import (
+    DoneMessageParams,
+    ErrorMessageParams,
+    FlushDoneMessageParams,
+    StreamingTranscriptionResponseParams,
+    StreamingTranscriptionResponse_DoneParams,
+    StreamingTranscriptionResponse_ErrorParams,
+    StreamingTranscriptionResponse_FlushDoneParams,
+    StreamingTranscriptionResponse_TranscriptParams,
+    TranscriptMessageParams,
+    TranscriptionResponseParams,
+    TranscriptionWordParams,
+)
+__all__ = [
+    "DoneMessage",
+    "DoneMessageParams",
+    "ErrorMessage",
+    "ErrorMessageParams",
+    "FlushDoneMessage",
+    "FlushDoneMessageParams",
+    "StreamingTranscriptionResponse",
+    "StreamingTranscriptionResponseParams",
+    "StreamingTranscriptionResponse_Done",
+    "StreamingTranscriptionResponse_DoneParams",
+    "StreamingTranscriptionResponse_Error",
+    "StreamingTranscriptionResponse_ErrorParams",
+    "StreamingTranscriptionResponse_FlushDone",
+    "StreamingTranscriptionResponse_FlushDoneParams",
+    "StreamingTranscriptionResponse_Transcript",
+    "StreamingTranscriptionResponse_TranscriptParams",
+    "SttEncoding",
+    "TimestampGranularity",
+    "TranscriptMessage",
+    "TranscriptMessageParams",
+    "TranscriptionResponse",
+    "TranscriptionResponseParams",
+    "TranscriptionWord",
+    "TranscriptionWordParams",
+]

cartesia/stt/_async_websocket.py ADDED Viewed

@@ -0,0 +1,293 @@
+import asyncio
+import json
+import typing
+import uuid
+from typing import Any, Awaitable, AsyncGenerator, Callable, Dict, Optional, Union
+import aiohttp
+from cartesia.stt.types import (
+    StreamingTranscriptionResponse,
+    StreamingTranscriptionResponse_Error,
+    StreamingTranscriptionResponse_Transcript,
+)
+from cartesia.stt.types.stt_encoding import SttEncoding
+from ..core.pydantic_utilities import parse_obj_as
+from ._websocket import SttWebsocket
+class AsyncSttWebsocket(SttWebsocket):
+    """This class contains methods to transcribe audio using WebSocket asynchronously."""
+    def __init__(
+        self,
+        ws_url: str,
+        api_key: str,
+        cartesia_version: str,
+        get_session: Callable[[], Awaitable[Optional[aiohttp.ClientSession]]],
+        timeout: float = 30,
+    ):
+        """
+        Args:
+            ws_url: The WebSocket URL for the Cartesia API.
+            api_key: The API key to use for authorization.
+            cartesia_version: The version of the Cartesia API to use.
+            timeout: The timeout for responses on the WebSocket in seconds.
+            get_session: A function that returns an awaitable of aiohttp.ClientSession object.
+        """
+        super().__init__(ws_url, api_key, cartesia_version)
+        self.timeout = timeout
+        self._get_session = get_session
+        self.websocket: Optional[aiohttp.ClientWebSocketResponse] = None
+        self._default_model: str = "ink-whisper"
+        self._default_language: Optional[str] = "en"
+        self._default_encoding: SttEncoding = "pcm_s16le"
+        self._default_sample_rate: int = 16000
+        self._default_min_volume: Optional[float] = None
+        self._default_max_silence_duration_secs: Optional[float] = None
+    def __del__(self):
+        try:
+            loop = asyncio.get_running_loop()
+        except RuntimeError:
+            loop = None
+        if loop is None:
+            asyncio.run(self.close())
+        elif loop.is_running():
+            loop.create_task(self.close())
+    async def connect(
+        self,
+        *,
+        model: str = "ink-whisper",
+        language: Optional[str] = "en",
+        encoding: SttEncoding = "pcm_s16le",
+        sample_rate: int = 16000,
+        min_volume: Optional[float] = None,
+        max_silence_duration_secs: Optional[float] = None,
+    ):
+        """Connect to the STT WebSocket with the specified parameters.
+        Args:
+            model: ID of the model to use for transcription (required)
+            language: The language of the input audio in ISO-639-1 format (defaults to "en")
+            encoding: The encoding format of the audio data (required)
+            sample_rate: The sample rate of the audio in Hz (required)
+            min_volume: Volume threshold for voice activity detection (0.0-1.0)
+            max_silence_duration_secs: Maximum duration of silence before endpointing
+        Raises:
+            RuntimeError: If the connection to the WebSocket fails.
+        """
+        self._default_model = model
+        self._default_language = language
+        self._default_encoding = encoding
+        self._default_sample_rate = sample_rate
+        self._default_min_volume = min_volume
+        self._default_max_silence_duration_secs = max_silence_duration_secs
+        if self.websocket is None or self._is_websocket_closed():
+            route = "stt/websocket"
+            session = await self._get_session()
+            params = {
+                "model": model,
+                "api_key": self.api_key,
+                "cartesia_version": self.cartesia_version,
+                "encoding": encoding,
+                "sample_rate": str(sample_rate),
+            }
+            if language is not None:
+                params["language"] = language
+            if min_volume is not None:
+                params["min_volume"] = str(min_volume)
+            if max_silence_duration_secs is not None:
+                params["max_silence_duration_secs"] = str(max_silence_duration_secs)
+            query_string = "&".join([f"{k}={v}" for k, v in params.items()])
+            url = f"{self.ws_url}/{route}?{query_string}"
+            try:
+                if session is None:
+                    raise RuntimeError("Session is not available")
+                self.websocket = await session.ws_connect(url)
+            except Exception as e:
+                status_code = None
+                error_message = str(e)
+                if hasattr(e, 'status') and e.status is not None:
+                    status_code = e.status
+                    if status_code == 402:
+                        error_message = "Payment required. Your API key may have insufficient credits or permissions."
+                    elif status_code == 401:
+                        error_message = "Unauthorized. Please check your API key."
+                    elif status_code == 403:
+                        error_message = "Forbidden. You don't have permission to access this resource."
+                    elif status_code == 404:
+                        error_message = "Not found. The requested resource doesn't exist."
+                    raise RuntimeError(f"Failed to connect to WebSocket.\nStatus: {status_code}. Error message: {error_message}")
+                else:
+                    raise RuntimeError(f"Failed to connect to WebSocket at {url}. {e}")
+    def _is_websocket_closed(self):
+        return self.websocket is None or self.websocket.closed
+    async def close(self):
+        """This method closes the websocket connection. Highly recommended to call this method when done."""
+        if self.websocket is not None and not self._is_websocket_closed():
+            await self.websocket.close()
+        self.websocket = None
+    async def send(self, data: Union[bytes, str]):
+        """Send audio data or control commands to the WebSocket.
+        Args:
+            data: Binary audio data or text command ("finalize" or "done")
+        """
+        if self.websocket is None or self._is_websocket_closed():
+            await self.connect(
+                model=self._default_model,
+                language=self._default_language,
+                encoding=self._default_encoding,
+                sample_rate=self._default_sample_rate,
+                min_volume=self._default_min_volume,
+                max_silence_duration_secs=self._default_max_silence_duration_secs,
+            )
+        assert self.websocket is not None, "WebSocket should be connected after connect() call"
+        if isinstance(data, bytes):
+            await self.websocket.send_bytes(data)
+        elif isinstance(data, str):
+            await self.websocket.send_str(data)
+        else:
+            raise TypeError("Data must be bytes (audio) or str (command)")
+    async def receive(self) -> AsyncGenerator[Dict[str, Any], None]:  # type: ignore[override]
+        """Receive transcription results from the WebSocket.
+        Yields:
+            Dictionary containing transcription results, flush_done, done, or error messages
+        """
+        if self.websocket is None or self._is_websocket_closed():
+            await self.connect(
+                model=self._default_model,
+                language=self._default_language,
+                encoding=self._default_encoding,
+                sample_rate=self._default_sample_rate,
+                min_volume=self._default_min_volume,
+                max_silence_duration_secs=self._default_max_silence_duration_secs,
+            )
+        assert self.websocket is not None, "WebSocket should be connected after connect() call"
+        try:
+            async for message in self.websocket:
+                if message.type == aiohttp.WSMsgType.TEXT:
+                    raw_data = json.loads(message.data)
+                    # Handle error responses
+                    if raw_data.get("type") == "error":
+                        raise RuntimeError(f"Error transcribing audio: {raw_data.get('message', 'Unknown error')}")
+                    # Handle transcript responses with flexible parsing
+                    if raw_data.get("type") == "transcript":
+                        # Provide defaults for missing required fields
+                        result = {
+                            "type": raw_data["type"],
+                            "request_id": raw_data.get("request_id", ""),
+                            "text": raw_data.get("text", ""),  # Default to empty string if missing
+                            "is_final": raw_data.get("is_final", False),  # Default to False if missing
+                        }
+                        # Add optional fields if present
+                        if "duration" in raw_data:
+                            result["duration"] = raw_data["duration"]
+                        if "language" in raw_data:
+                            result["language"] = raw_data["language"]
+                        if "words" in raw_data:
+                            result["words"] = raw_data["words"]
+                        yield result
+                    # Handle flush_done acknowledgment
+                    elif raw_data.get("type") == "flush_done":
+                        result = {
+                            "type": raw_data["type"],
+                            "request_id": raw_data.get("request_id", ""),
+                        }
+                        yield result
+                    # Handle done acknowledgment
+                    elif raw_data.get("type") == "done":
+                        result = {
+                            "type": raw_data["type"],
+                            "request_id": raw_data.get("request_id", ""),
+                        }
+                        yield result
+                        break  # Exit the loop when done
+                elif message.type == aiohttp.WSMsgType.ERROR:
+                    error_message = f"WebSocket error: {self.websocket.exception()}"
+                    raise RuntimeError(error_message)
+                elif message.type == aiohttp.WSMsgType.CLOSE:
+                    break  # WebSocket was closed
+        except Exception as e:
+            await self.close()
+            raise e
+    async def transcribe(  # type: ignore[override]
+        self,
+        audio_chunks: typing.AsyncIterator[bytes],
+        *,
+        model: str = "ink-whisper",
+        language: Optional[str] = "en",
+        encoding: SttEncoding = "pcm_s16le",
+        sample_rate: int = 16000,
+        min_volume: Optional[float] = None,
+        max_silence_duration_secs: Optional[float] = None,
+    ) -> AsyncGenerator[Dict[str, Any], None]:
+        """Transcribe audio chunks using the WebSocket.
+        Args:
+            audio_chunks: Async iterator of audio chunks as bytes
+            model: ID of the model to use for transcription (required)
+            language: The language of the input audio in ISO-639-1 format (defaults to "en")
+            encoding: The encoding format of the audio data (required)
+            sample_rate: The sample rate of the audio in Hz (required)
+            min_volume: Volume threshold for voice activity detection (0.0-1.0)
+            max_silence_duration_secs: Maximum duration of silence before endpointing
+        Yields:
+            Dictionary containing transcription results, flush_done, done, or error messages
+        """
+        await self.connect(
+            model=model,
+            language=language,
+            encoding=encoding,
+            sample_rate=sample_rate,
+            min_volume=min_volume,
+            max_silence_duration_secs=max_silence_duration_secs,
+        )
+        try:
+            # Send all audio chunks
+            async for chunk in audio_chunks:
+                await self.send(chunk)
+            # Send finalize command to flush remaining audio
+            await self.send("finalize")
+            # Send done command to close session cleanly
+            await self.send("done")
+            # Receive all responses until done
+            async for result in self.receive():
+                yield result
+        finally:
+            await self.close()

cartesia 2.0.4__py3-none-any.whl → 2.0.6__py3-none-any.whl

cartesia 2.0.4__py3-none-any.whl → 2.0.6__py3-none-any.whl