PyPI - cartesia - Versions diffs - 2.0.5__py3-none-any.whl → 2.0.6__py3-none-any.whl - Mend

cartesia 2.0.5__py3-none-any.whl → 2.0.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (45)

cartesia/__init__.py +14 -0
cartesia/auth/client.py +8 -8
cartesia/auth/requests/token_grant.py +7 -1
cartesia/auth/requests/token_request.py +3 -3
cartesia/auth/types/token_grant.py +7 -2
cartesia/auth/types/token_request.py +3 -3
cartesia/core/client_wrapper.py +1 -1
cartesia/stt/__init__.py +6 -0
cartesia/stt/_async_websocket.py +81 -72
cartesia/stt/_websocket.py +42 -20
cartesia/stt/client.py +456 -0
cartesia/stt/requests/__init__.py +2 -0
cartesia/stt/requests/streaming_transcription_response.py +2 -0
cartesia/stt/requests/transcript_message.py +8 -1
cartesia/stt/requests/transcription_response.py +8 -1
cartesia/stt/requests/transcription_word.py +20 -0
cartesia/stt/socket_client.py +52 -109
cartesia/stt/types/__init__.py +4 -0
cartesia/stt/types/streaming_transcription_response.py +2 -0
cartesia/stt/types/stt_encoding.py +3 -1
cartesia/stt/types/timestamp_granularity.py +5 -0
cartesia/stt/types/transcript_message.py +7 -1
cartesia/stt/types/transcription_response.py +7 -1
cartesia/stt/types/transcription_word.py +32 -0
cartesia/tts/__init__.py +8 -0
cartesia/tts/client.py +50 -8
cartesia/tts/requests/__init__.py +4 -0
cartesia/tts/requests/generation_request.py +4 -4
cartesia/tts/requests/sse_output_format.py +11 -0
cartesia/tts/requests/ttssse_request.py +47 -0
cartesia/tts/requests/web_socket_chunk_response.py +0 -3
cartesia/tts/requests/web_socket_response.py +1 -2
cartesia/tts/requests/web_socket_tts_request.py +9 -1
cartesia/tts/types/__init__.py +4 -0
cartesia/tts/types/generation_request.py +4 -4
cartesia/tts/types/sse_output_format.py +22 -0
cartesia/tts/types/ttssse_request.py +58 -0
cartesia/tts/types/web_socket_chunk_response.py +1 -3
cartesia/tts/types/web_socket_response.py +1 -2
cartesia/tts/types/web_socket_tts_request.py +11 -3
cartesia/voice_changer/requests/streaming_response.py +0 -2
cartesia/voice_changer/types/streaming_response.py +0 -2
{cartesia-2.0.5.dist-info → cartesia-2.0.6.dist-info}/METADATA +113 -16
{cartesia-2.0.5.dist-info → cartesia-2.0.6.dist-info}/RECORD +45 -37
{cartesia-2.0.5.dist-info → cartesia-2.0.6.dist-info}/WHEEL +0 -0

cartesia/__init__.py CHANGED Viewed

@@ -37,10 +37,13 @@ from .stt import (
     StreamingTranscriptionResponse_Transcript,
     StreamingTranscriptionResponse_TranscriptParams,
     SttEncoding,
+    TimestampGranularity,
     TranscriptMessage,
     TranscriptMessageParams,
     TranscriptionResponse,
     TranscriptionResponseParams,
+    TranscriptionWord,
+    TranscriptionWordParams,
 )
 from .tts import (
     CancelContextRequest,
@@ -72,6 +75,8 @@ from .tts import (
     RawOutputFormatParams,
     Speed,
     SpeedParams,
+    SseOutputFormat,
+    SseOutputFormatParams,
     SupportedLanguage,
     TtsRequest,
     TtsRequestEmbeddingSpecifier,
@@ -81,6 +86,8 @@ from .tts import (
     TtsRequestParams,
     TtsRequestVoiceSpecifier,
     TtsRequestVoiceSpecifierParams,
+    TtssseRequest,
+    TtssseRequestParams,
     WavOutputFormat,
     WavOutputFormatParams,
     WebSocketBaseResponse,
@@ -256,6 +263,8 @@ __all__ = [
     "RawOutputFormatParams",
     "Speed",
     "SpeedParams",
+    "SseOutputFormat",
+    "SseOutputFormatParams",
     "StreamingResponse",
     "StreamingResponseParams",
     "StreamingResponse_Chunk",
@@ -276,6 +285,7 @@ __all__ = [
     "StreamingTranscriptionResponse_TranscriptParams",
     "SttEncoding",
     "SupportedLanguage",
+    "TimestampGranularity",
     "TokenGrant",
     "TokenGrantParams",
     "TokenRequest",
@@ -286,6 +296,8 @@ __all__ = [
     "TranscriptMessageParams",
     "TranscriptionResponse",
     "TranscriptionResponseParams",
+    "TranscriptionWord",
+    "TranscriptionWordParams",
     "TtsRequest",
     "TtsRequestEmbeddingSpecifier",
     "TtsRequestEmbeddingSpecifierParams",
@@ -294,6 +306,8 @@ __all__ = [
     "TtsRequestParams",
     "TtsRequestVoiceSpecifier",
     "TtsRequestVoiceSpecifierParams",
+    "TtssseRequest",
+    "TtssseRequestParams",
     "UpdateVoiceRequest",
     "UpdateVoiceRequestParams",
     "Voice",

cartesia/auth/client.py CHANGED Viewed

@@ -22,7 +22,7 @@ class AuthClient:
     def access_token(
         self,
         *,
-        grants: TokenGrantParams,
+        grants: typing.Optional[TokenGrantParams] = OMIT,
         expires_in: typing.Optional[int] = OMIT,
         request_options: typing.Optional[RequestOptions] = None,
     ) -> TokenResponse:
@@ -31,8 +31,8 @@ class AuthClient:
         Parameters
         ----------
-        grants : TokenGrantParams
-            The permissions to be granted via the token.
+        grants : typing.Optional[TokenGrantParams]
+            The permissions to be granted via the token. Both TTS and STT grants are optional - specify only the capabilities you need.
         expires_in : typing.Optional[int]
             The number of seconds the token will be valid for since the time of generation. The maximum is 1 hour (3600 seconds).
@@ -52,7 +52,7 @@ class AuthClient:
             api_key="YOUR_API_KEY",
         )
         client.auth.access_token(
-            grants={"tts": True},
+            grants={"tts": True, "stt": True},
             expires_in=60,
         )
         """
@@ -90,7 +90,7 @@ class AsyncAuthClient:
     async def access_token(
         self,
         *,
-        grants: TokenGrantParams,
+        grants: typing.Optional[TokenGrantParams] = OMIT,
         expires_in: typing.Optional[int] = OMIT,
         request_options: typing.Optional[RequestOptions] = None,
     ) -> TokenResponse:
@@ -99,8 +99,8 @@ class AsyncAuthClient:
         Parameters
         ----------
-        grants : TokenGrantParams
-            The permissions to be granted via the token.
+        grants : typing.Optional[TokenGrantParams]
+            The permissions to be granted via the token. Both TTS and STT grants are optional - specify only the capabilities you need.
         expires_in : typing.Optional[int]
             The number of seconds the token will be valid for since the time of generation. The maximum is 1 hour (3600 seconds).
@@ -125,7 +125,7 @@ class AsyncAuthClient:
         async def main() -> None:
             await client.auth.access_token(
-                grants={"tts": True},
+                grants={"tts": True, "stt": True},
                 expires_in=60,
             )

cartesia/auth/requests/token_grant.py CHANGED Viewed

@@ -1,10 +1,16 @@
 # This file was auto-generated by Fern from our API Definition.
 import typing_extensions
+import typing_extensions
 class TokenGrantParams(typing_extensions.TypedDict):
-    tts: bool
+    tts: typing_extensions.NotRequired[bool]
     """
     The `tts` grant allows the token to be used to access any TTS endpoint.
     """
+    stt: typing_extensions.NotRequired[bool]
+    """
+    The `stt` grant allows the token to be used to access any STT endpoint.
+    """

cartesia/auth/requests/token_request.py CHANGED Viewed

@@ -1,14 +1,14 @@
 # This file was auto-generated by Fern from our API Definition.
 import typing_extensions
-from .token_grant import TokenGrantParams
 import typing_extensions
+from .token_grant import TokenGrantParams
 class TokenRequestParams(typing_extensions.TypedDict):
-    grants: TokenGrantParams
+    grants: typing_extensions.NotRequired[TokenGrantParams]
     """
-    The permissions to be granted via the token.
+    The permissions to be granted via the token. Both TTS and STT grants are optional - specify only the capabilities you need.
     """
     expires_in: typing_extensions.NotRequired[int]

cartesia/auth/types/token_grant.py CHANGED Viewed

@@ -1,17 +1,22 @@
 # This file was auto-generated by Fern from our API Definition.
 from ...core.pydantic_utilities import UniversalBaseModel
+import typing
 import pydantic
 from ...core.pydantic_utilities import IS_PYDANTIC_V2
-import typing
 class TokenGrant(UniversalBaseModel):
-    tts: bool = pydantic.Field()
+    tts: typing.Optional[bool] = pydantic.Field(default=None)
     """
     The `tts` grant allows the token to be used to access any TTS endpoint.
     """
+    stt: typing.Optional[bool] = pydantic.Field(default=None)
+    """
+    The `stt` grant allows the token to be used to access any STT endpoint.
+    """
     if IS_PYDANTIC_V2:
         model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True)  # type: ignore # Pydantic v2
     else:

cartesia/auth/types/token_request.py CHANGED Viewed

@@ -1,16 +1,16 @@
 # This file was auto-generated by Fern from our API Definition.
 from ...core.pydantic_utilities import UniversalBaseModel
+import typing
 from .token_grant import TokenGrant
 import pydantic
-import typing
 from ...core.pydantic_utilities import IS_PYDANTIC_V2
 class TokenRequest(UniversalBaseModel):
-    grants: TokenGrant = pydantic.Field()
+    grants: typing.Optional[TokenGrant] = pydantic.Field(default=None)
     """
-    The permissions to be granted via the token.
+    The permissions to be granted via the token. Both TTS and STT grants are optional - specify only the capabilities you need.
     """
     expires_in: typing.Optional[int] = pydantic.Field(default=None)

cartesia/core/client_wrapper.py CHANGED Viewed

@@ -16,7 +16,7 @@ class BaseClientWrapper:
         headers: typing.Dict[str, str] = {
             "X-Fern-Language": "Python",
             "X-Fern-SDK-Name": "cartesia",
-            "X-Fern-SDK-Version": "2.0.5",
+            "X-Fern-SDK-Version": "2.0.6",
         }
         headers["X-API-Key"] = self.api_key
         headers["Cartesia-Version"] = "2024-11-13"

cartesia/stt/__init__.py CHANGED Viewed

@@ -10,8 +10,10 @@ from .types import (
     StreamingTranscriptionResponse_FlushDone,
     StreamingTranscriptionResponse_Transcript,
     SttEncoding,
+    TimestampGranularity,
     TranscriptMessage,
     TranscriptionResponse,
+    TranscriptionWord,
 )
 from .requests import (
     DoneMessageParams,
@@ -24,6 +26,7 @@ from .requests import (
     StreamingTranscriptionResponse_TranscriptParams,
     TranscriptMessageParams,
     TranscriptionResponseParams,
+    TranscriptionWordParams,
 )
 __all__ = [
@@ -44,8 +47,11 @@ __all__ = [
     "StreamingTranscriptionResponse_Transcript",
     "StreamingTranscriptionResponse_TranscriptParams",
     "SttEncoding",
+    "TimestampGranularity",
     "TranscriptMessage",
     "TranscriptMessageParams",
     "TranscriptionResponse",
     "TranscriptionResponseParams",
+    "TranscriptionWord",
+    "TranscriptionWordParams",
 ]

cartesia/stt/_async_websocket.py CHANGED Viewed

@@ -11,6 +11,7 @@ from cartesia.stt.types import (
     StreamingTranscriptionResponse_Error,
     StreamingTranscriptionResponse_Transcript,
 )
+from cartesia.stt.types.stt_encoding import SttEncoding
 from ..core.pydantic_utilities import parse_obj_as
 from ._websocket import SttWebsocket
@@ -41,8 +42,10 @@ class AsyncSttWebsocket(SttWebsocket):
         self.websocket: Optional[aiohttp.ClientWebSocketResponse] = None
         self._default_model: str = "ink-whisper"
         self._default_language: Optional[str] = "en"
-        self._default_encoding: Optional[str] = "pcm_s16le"
+        self._default_encoding: SttEncoding = "pcm_s16le"
         self._default_sample_rate: int = 16000
+        self._default_min_volume: Optional[float] = None
+        self._default_max_silence_duration_secs: Optional[float] = None
     def __del__(self):
         try:
@@ -60,16 +63,20 @@ class AsyncSttWebsocket(SttWebsocket):
         *,
         model: str = "ink-whisper",
         language: Optional[str] = "en",
-        encoding: Optional[str] = "pcm_s16le",
+        encoding: SttEncoding = "pcm_s16le",
         sample_rate: int = 16000,
+        min_volume: Optional[float] = None,
+        max_silence_duration_secs: Optional[float] = None,
     ):
         """Connect to the STT WebSocket with the specified parameters.
         Args:
-            model: ID of the model to use for transcription
-            language: The language of the input audio in ISO-639-1 format
-            encoding: The encoding format of the audio data
-            sample_rate: The sample rate of the audio in Hz
+            model: ID of the model to use for transcription (required)
+            language: The language of the input audio in ISO-639-1 format (defaults to "en")
+            encoding: The encoding format of the audio data (required)
+            sample_rate: The sample rate of the audio in Hz (required)
+            min_volume: Volume threshold for voice activity detection (0.0-1.0)
+            max_silence_duration_secs: Maximum duration of silence before endpointing
         Raises:
             RuntimeError: If the connection to the WebSocket fails.
@@ -78,6 +85,8 @@ class AsyncSttWebsocket(SttWebsocket):
         self._default_language = language
         self._default_encoding = encoding
         self._default_sample_rate = sample_rate
+        self._default_min_volume = min_volume
+        self._default_max_silence_duration_secs = max_silence_duration_secs
         if self.websocket is None or self._is_websocket_closed():
             route = "stt/websocket"
@@ -87,13 +96,15 @@ class AsyncSttWebsocket(SttWebsocket):
                 "model": model,
                 "api_key": self.api_key,
                 "cartesia_version": self.cartesia_version,
+                "encoding": encoding,
+                "sample_rate": str(sample_rate),
             }
             if language is not None:
                 params["language"] = language
-            if encoding is not None:
-                params["encoding"] = encoding
-            if sample_rate is not None:
-                params["sample_rate"] = str(sample_rate)
+            if min_volume is not None:
+                params["min_volume"] = str(min_volume)
+            if max_silence_duration_secs is not None:
+                params["max_silence_duration_secs"] = str(max_silence_duration_secs)
             query_string = "&".join([f"{k}={v}" for k, v in params.items()])
             url = f"{self.ws_url}/{route}?{query_string}"
@@ -143,6 +154,8 @@ class AsyncSttWebsocket(SttWebsocket):
                 language=self._default_language,
                 encoding=self._default_encoding,
                 sample_rate=self._default_sample_rate,
+                min_volume=self._default_min_volume,
+                max_silence_duration_secs=self._default_max_silence_duration_secs,
             )
         assert self.websocket is not None, "WebSocket should be connected after connect() call"
@@ -166,76 +179,66 @@ class AsyncSttWebsocket(SttWebsocket):
                 language=self._default_language,
                 encoding=self._default_encoding,
                 sample_rate=self._default_sample_rate,
+                min_volume=self._default_min_volume,
+                max_silence_duration_secs=self._default_max_silence_duration_secs,
             )
         assert self.websocket is not None, "WebSocket should be connected after connect() call"
         try:
-            while True:
-                try:
-                    msg = await asyncio.wait_for(self.websocket.receive(), timeout=self.timeout)
+            async for message in self.websocket:
+                if message.type == aiohttp.WSMsgType.TEXT:
+                    raw_data = json.loads(message.data)
-                    if msg.type == aiohttp.WSMsgType.TEXT:
-                        raw_data = json.loads(msg.data)
-                        # Handle error responses
-                        if raw_data.get("type") == "error":
-                            raise RuntimeError(f"Error transcribing audio: {raw_data.get('message', 'Unknown error')}")
-                        # Handle transcript responses with flexible parsing
-                        if raw_data.get("type") == "transcript":
-                            # Provide defaults for missing required fields
-                            result = {
-                                "type": raw_data["type"],
-                                "request_id": raw_data.get("request_id", ""),
-                                "text": raw_data.get("text", ""),  # Default to empty string if missing
-                                "is_final": raw_data.get("is_final", False),  # Default to False if missing
-                            }
-                            # Add optional fields if present
-                            if "duration" in raw_data:
-                                result["duration"] = raw_data["duration"]
-                            if "language" in raw_data:
-                                result["language"] = raw_data["language"]
-                            yield result
+                    # Handle error responses
+                    if raw_data.get("type") == "error":
+                        raise RuntimeError(f"Error transcribing audio: {raw_data.get('message', 'Unknown error')}")
+                    # Handle transcript responses with flexible parsing
+                    if raw_data.get("type") == "transcript":
+                        # Provide defaults for missing required fields
+                        result = {
+                            "type": raw_data["type"],
+                            "request_id": raw_data.get("request_id", ""),
+                            "text": raw_data.get("text", ""),  # Default to empty string if missing
+                            "is_final": raw_data.get("is_final", False),  # Default to False if missing
+                        }
-                        # Handle flush_done acknowledgment
-                        elif raw_data.get("type") == "flush_done":
-                            result = {
-                                "type": raw_data["type"],
-                                "request_id": raw_data.get("request_id", ""),
-                            }
-                            yield result
+                        # Add optional fields if present
+                        if "duration" in raw_data:
+                            result["duration"] = raw_data["duration"]
+                        if "language" in raw_data:
+                            result["language"] = raw_data["language"]
+                        if "words" in raw_data:
+                            result["words"] = raw_data["words"]
-                        # Handle done acknowledgment - session complete
-                        elif raw_data.get("type") == "done":
-                            result = {
-                                "type": raw_data["type"],
-                                "request_id": raw_data.get("request_id", ""),
-                            }
-                            yield result
-                            # Session is complete, break out of loop
-                            break
+                        yield result
-                    elif msg.type == aiohttp.WSMsgType.ERROR:
-                        websocket_exception = self.websocket.exception() if self.websocket else None
-                        await self.close()
-                        raise RuntimeError(f"WebSocket error: {websocket_exception}")
+                    # Handle flush_done acknowledgment
+                    elif raw_data.get("type") == "flush_done":
+                        result = {
+                            "type": raw_data["type"],
+                            "request_id": raw_data.get("request_id", ""),
+                        }
+                        yield result
-                    elif msg.type == aiohttp.WSMsgType.CLOSE:
-                        break
+                    # Handle done acknowledgment
+                    elif raw_data.get("type") == "done":
+                        result = {
+                            "type": raw_data["type"],
+                            "request_id": raw_data.get("request_id", ""),
+                        }
+                        yield result
+                        break  # Exit the loop when done
-                except asyncio.TimeoutError:
-                    await self.close()
-                    raise RuntimeError("Timeout while waiting for transcription")
-                except Exception as inner_e:
-                    await self.close()
-                    raise RuntimeError(f"Error receiving transcription: {inner_e}")
+                elif message.type == aiohttp.WSMsgType.ERROR:
+                    error_message = f"WebSocket error: {self.websocket.exception()}"
+                    raise RuntimeError(error_message)
+                elif message.type == aiohttp.WSMsgType.CLOSE:
+                    break  # WebSocket was closed
         except Exception as e:
             await self.close()
-            raise RuntimeError(f"Failed to receive transcription. {e}")
+            raise e
     async def transcribe(  # type: ignore[override]
         self,
@@ -243,17 +246,21 @@ class AsyncSttWebsocket(SttWebsocket):
         *,
         model: str = "ink-whisper",
         language: Optional[str] = "en",
-        encoding: Optional[str] = "pcm_s16le",
+        encoding: SttEncoding = "pcm_s16le",
         sample_rate: int = 16000,
+        min_volume: Optional[float] = None,
+        max_silence_duration_secs: Optional[float] = None,
     ) -> AsyncGenerator[Dict[str, Any], None]:
         """Transcribe audio chunks using the WebSocket.
         Args:
             audio_chunks: Async iterator of audio chunks as bytes
-            model: ID of the model to use for transcription
-            language: The language of the input audio in ISO-639-1 format
-            encoding: The encoding format of the audio data
-            sample_rate: The sample rate of the audio in Hz
+            model: ID of the model to use for transcription (required)
+            language: The language of the input audio in ISO-639-1 format (defaults to "en")
+            encoding: The encoding format of the audio data (required)
+            sample_rate: The sample rate of the audio in Hz (required)
+            min_volume: Volume threshold for voice activity detection (0.0-1.0)
+            max_silence_duration_secs: Maximum duration of silence before endpointing
         Yields:
             Dictionary containing transcription results, flush_done, done, or error messages
@@ -263,6 +270,8 @@ class AsyncSttWebsocket(SttWebsocket):
             language=language,
             encoding=encoding,
             sample_rate=sample_rate,
+            min_volume=min_volume,
+            max_silence_duration_secs=max_silence_duration_secs,
         )
         try:

cartesia/stt/_websocket.py CHANGED Viewed

@@ -14,6 +14,7 @@ from cartesia.stt.types import (
     StreamingTranscriptionResponse_Error,
     StreamingTranscriptionResponse_Transcript,
 )
+from cartesia.stt.types.stt_encoding import SttEncoding
 from ..core.pydantic_utilities import parse_obj_as
@@ -45,8 +46,10 @@ class SttWebsocket:
         # Store default connection parameters for auto-connect with proper typing
         self._default_model: str = "ink-whisper"
         self._default_language: Optional[str] = "en"
-        self._default_encoding: Optional[str] = "pcm_s16le"
+        self._default_encoding: SttEncoding = "pcm_s16le"
         self._default_sample_rate: int = 16000
+        self._default_min_volume: Optional[float] = None
+        self._default_max_silence_duration_secs: Optional[float] = None
     def __del__(self):
         try:
@@ -59,16 +62,20 @@ class SttWebsocket:
         *,
         model: str = "ink-whisper",
         language: Optional[str] = "en",
-        encoding: Optional[str] = "pcm_s16le",
+        encoding: SttEncoding = "pcm_s16le",
         sample_rate: int = 16000,
+        min_volume: Optional[float] = None,
+        max_silence_duration_secs: Optional[float] = None,
     ):
         """Connect to the STT WebSocket with the specified parameters.
         Args:
             model: ID of the model to use for transcription
             language: The language of the input audio in ISO-639-1 format
-            encoding: The encoding format of the audio data
-            sample_rate: The sample rate of the audio in Hz
+            encoding: The encoding format of the audio data (required)
+            sample_rate: The sample rate of the audio in Hz (required)
+            min_volume: Volume threshold for voice activity detection (0.0-1.0)
+            max_silence_duration_secs: Maximum duration of silence before endpointing
         Raises:
             RuntimeError: If the connection to the WebSocket fails.
@@ -78,6 +85,8 @@ class SttWebsocket:
         self._default_language = language
         self._default_encoding = encoding
         self._default_sample_rate = sample_rate
+        self._default_min_volume = min_volume
+        self._default_max_silence_duration_secs = max_silence_duration_secs
         if not IS_WEBSOCKET_SYNC_AVAILABLE:
             raise ImportError(
@@ -89,13 +98,15 @@ class SttWebsocket:
                 "model": model,
                 "api_key": self.api_key,
                 "cartesia_version": self.cartesia_version,
+                "encoding": encoding,
+                "sample_rate": str(sample_rate),
             }
             if language is not None:
                 params["language"] = language
-            if encoding is not None:
-                params["encoding"] = encoding
-            if sample_rate is not None:
-                params["sample_rate"] = str(sample_rate)
+            if min_volume is not None:
+                params["min_volume"] = str(min_volume)
+            if max_silence_duration_secs is not None:
+                params["max_silence_duration_secs"] = str(max_silence_duration_secs)
             query_string = "&".join([f"{k}={v}" for k, v in params.items()])
             url = f"{self.ws_url}/{route}?{query_string}"
@@ -143,6 +154,8 @@ class SttWebsocket:
                 language=self._default_language,
                 encoding=self._default_encoding,
                 sample_rate=self._default_sample_rate,
+                min_volume=self._default_min_volume,
+                max_silence_duration_secs=self._default_max_silence_duration_secs,
             )
         assert self.websocket is not None, "WebSocket should be connected after connect() call"
@@ -167,6 +180,8 @@ class SttWebsocket:
                 language=self._default_language,
                 encoding=self._default_encoding,
                 sample_rate=self._default_sample_rate,
+                min_volume=self._default_min_volume,
+                max_silence_duration_secs=self._default_max_silence_duration_secs,
             )
         assert self.websocket is not None, "WebSocket should be connected after connect() call"
@@ -197,6 +212,8 @@ class SttWebsocket:
                                 result["duration"] = raw_data["duration"]
                             if "language" in raw_data:
                                 result["language"] = raw_data["language"]
+                            if "words" in raw_data:
+                                result["words"] = raw_data["words"]
                             yield result
@@ -208,23 +225,22 @@ class SttWebsocket:
                             }
                             yield result
-                        # Handle done acknowledgment - session complete
+                        # Handle done acknowledgment
                         elif raw_data.get("type") == "done":
                             result = {
                                 "type": raw_data["type"],
                                 "request_id": raw_data.get("request_id", ""),
                             }
                             yield result
-                            # Session is complete, break out of loop
-                            break
-                except Exception as inner_e:
-                    self.close()
-                    raise RuntimeError(f"Error receiving transcription: {inner_e}")
+                            break  # Exit the loop when done
-        except Exception as e:
+                except Exception as e:
+                    if "Connection closed" in str(e) or "no active connection" in str(e):
+                        break  # WebSocket was closed
+                    raise e  # Re-raise other exceptions
+        except KeyboardInterrupt:
             self.close()
-            raise RuntimeError(f"Failed to receive transcription. {e}")
+            raise
     def transcribe(
         self,
@@ -232,8 +248,10 @@ class SttWebsocket:
         *,
         model: str = "ink-whisper",
         language: Optional[str] = "en",
-        encoding: Optional[str] = "pcm_s16le",
+        encoding: SttEncoding = "pcm_s16le",
         sample_rate: int = 16000,
+        min_volume: Optional[float] = None,
+        max_silence_duration_secs: Optional[float] = None,
     ) -> Generator[Dict[str, Any], None, None]:
         """Transcribe audio chunks using the WebSocket.
@@ -241,8 +259,10 @@ class SttWebsocket:
             audio_chunks: Iterator of audio chunks as bytes
             model: ID of the model to use for transcription
             language: The language of the input audio in ISO-639-1 format
-            encoding: The encoding format of the audio data
-            sample_rate: The sample rate of the audio in Hz
+            encoding: The encoding format of the audio data (required)
+            sample_rate: The sample rate of the audio in Hz (required)
+            min_volume: Volume threshold for voice activity detection (0.0-1.0)
+            max_silence_duration_secs: Maximum duration of silence before endpointing
         Yields:
             Dictionary containing transcription results, flush_done, done, or error messages
@@ -252,6 +272,8 @@ class SttWebsocket:
             language=language,
             encoding=encoding,
             sample_rate=sample_rate,
+            min_volume=min_volume,
+            max_silence_duration_secs=max_silence_duration_secs,
         )
         try:

cartesia 2.0.5__py3-none-any.whl → 2.0.6__py3-none-any.whl

cartesia 2.0.5__py3-none-any.whl → 2.0.6__py3-none-any.whl