cartesia 2.0.4__py3-none-any.whl → 2.0.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53)
  1. cartesia/__init__.py +60 -1
  2. cartesia/auth/client.py +8 -8
  3. cartesia/auth/requests/token_grant.py +7 -1
  4. cartesia/auth/requests/token_request.py +3 -3
  5. cartesia/auth/types/token_grant.py +7 -2
  6. cartesia/auth/types/token_request.py +3 -3
  7. cartesia/base_client.py +2 -0
  8. cartesia/client.py +5 -0
  9. cartesia/core/client_wrapper.py +1 -1
  10. cartesia/stt/__init__.py +57 -0
  11. cartesia/stt/_async_websocket.py +293 -0
  12. cartesia/stt/_websocket.py +294 -0
  13. cartesia/stt/client.py +456 -0
  14. cartesia/stt/requests/__init__.py +29 -0
  15. cartesia/stt/requests/done_message.py +14 -0
  16. cartesia/stt/requests/error_message.py +16 -0
  17. cartesia/stt/requests/flush_done_message.py +14 -0
  18. cartesia/stt/requests/streaming_transcription_response.py +41 -0
  19. cartesia/stt/requests/transcript_message.py +40 -0
  20. cartesia/stt/requests/transcription_response.py +28 -0
  21. cartesia/stt/requests/transcription_word.py +20 -0
  22. cartesia/stt/socket_client.py +138 -0
  23. cartesia/stt/types/__init__.py +33 -0
  24. cartesia/stt/types/done_message.py +26 -0
  25. cartesia/stt/types/error_message.py +27 -0
  26. cartesia/stt/types/flush_done_message.py +26 -0
  27. cartesia/stt/types/streaming_transcription_response.py +94 -0
  28. cartesia/stt/types/stt_encoding.py +7 -0
  29. cartesia/stt/types/timestamp_granularity.py +5 -0
  30. cartesia/stt/types/transcript_message.py +50 -0
  31. cartesia/stt/types/transcription_response.py +38 -0
  32. cartesia/stt/types/transcription_word.py +32 -0
  33. cartesia/tts/__init__.py +8 -0
  34. cartesia/tts/client.py +50 -8
  35. cartesia/tts/requests/__init__.py +4 -0
  36. cartesia/tts/requests/generation_request.py +4 -4
  37. cartesia/tts/requests/sse_output_format.py +11 -0
  38. cartesia/tts/requests/ttssse_request.py +47 -0
  39. cartesia/tts/requests/web_socket_chunk_response.py +0 -3
  40. cartesia/tts/requests/web_socket_response.py +1 -2
  41. cartesia/tts/requests/web_socket_tts_request.py +9 -1
  42. cartesia/tts/types/__init__.py +4 -0
  43. cartesia/tts/types/generation_request.py +4 -4
  44. cartesia/tts/types/sse_output_format.py +22 -0
  45. cartesia/tts/types/ttssse_request.py +58 -0
  46. cartesia/tts/types/web_socket_chunk_response.py +1 -3
  47. cartesia/tts/types/web_socket_response.py +1 -2
  48. cartesia/tts/types/web_socket_tts_request.py +11 -3
  49. cartesia/voice_changer/requests/streaming_response.py +0 -2
  50. cartesia/voice_changer/types/streaming_response.py +0 -2
  51. {cartesia-2.0.4.dist-info → cartesia-2.0.6.dist-info}/METADATA +256 -2
  52. {cartesia-2.0.4.dist-info → cartesia-2.0.6.dist-info}/RECORD +53 -26
  53. {cartesia-2.0.4.dist-info → cartesia-2.0.6.dist-info}/WHEEL +0 -0
@@ -8,10 +8,12 @@ from .output_format import OutputFormatParams, OutputFormat_Mp3Params, OutputFor
  from .phoneme_timestamps import PhonemeTimestampsParams
  from .raw_output_format import RawOutputFormatParams
  from .speed import SpeedParams
+ from .sse_output_format import SseOutputFormatParams
  from .tts_request import TtsRequestParams
  from .tts_request_embedding_specifier import TtsRequestEmbeddingSpecifierParams
  from .tts_request_id_specifier import TtsRequestIdSpecifierParams
  from .tts_request_voice_specifier import TtsRequestVoiceSpecifierParams
+ from .ttssse_request import TtssseRequestParams
  from .wav_output_format import WavOutputFormatParams
  from .web_socket_base_response import WebSocketBaseResponseParams
  from .web_socket_chunk_response import WebSocketChunkResponseParams
@@ -48,10 +50,12 @@ __all__ = [
      "PhonemeTimestampsParams",
      "RawOutputFormatParams",
      "SpeedParams",
+     "SseOutputFormatParams",
      "TtsRequestEmbeddingSpecifierParams",
      "TtsRequestIdSpecifierParams",
      "TtsRequestParams",
      "TtsRequestVoiceSpecifierParams",
+     "TtssseRequestParams",
      "WavOutputFormatParams",
      "WebSocketBaseResponseParams",
      "WebSocketChunkResponseParams",
@@ -55,15 +55,15 @@ class GenerationRequestParams(typing_extensions.TypedDict):

      add_timestamps: typing_extensions.NotRequired[bool]
      """
-     Whether to return word-level timestamps.
+     Whether to return word-level timestamps. If `false` (default), no word timestamps will be produced at all. If `true`, the server will return timestamp events containing word-level timing information.
      """

      add_phoneme_timestamps: typing_extensions.NotRequired[bool]
      """
-     Whether to return phoneme-level timestamps.
+     Whether to return phoneme-level timestamps. If `false` (default), no phoneme timestamps will be produced. If `true`, the server will return timestamp events containing phoneme-level timing information.
      """

-     use_original_timestamps: typing_extensions.NotRequired[bool]
+     use_normalized_timestamps: typing_extensions.NotRequired[bool]
      """
-     Whether to use the original transcript for timestamps.
+     Whether to use normalized timestamps (True) or original timestamps (False).
      """
@@ -0,0 +1,11 @@
+ # This file was auto-generated by Fern from our API Definition.
+
+ import typing_extensions
+ import typing
+ from ..types.raw_encoding import RawEncoding
+
+
+ class SseOutputFormatParams(typing_extensions.TypedDict):
+     container: typing.Literal["raw"]
+     encoding: RawEncoding
+     sample_rate: int
@@ -0,0 +1,47 @@
+ # This file was auto-generated by Fern from our API Definition.
+
+ import typing_extensions
+ from .tts_request_voice_specifier import TtsRequestVoiceSpecifierParams
+ import typing_extensions
+ from ..types.supported_language import SupportedLanguage
+ from .sse_output_format import SseOutputFormatParams
+ from ..types.model_speed import ModelSpeed
+ from ..types.context_id import ContextId
+
+
+ class TtssseRequestParams(typing_extensions.TypedDict):
+     model_id: str
+     """
+     The ID of the model to use for the generation. See [Models](/build-with-cartesia/models) for available models.
+     """
+
+     transcript: str
+     voice: TtsRequestVoiceSpecifierParams
+     language: typing_extensions.NotRequired[SupportedLanguage]
+     output_format: SseOutputFormatParams
+     duration: typing_extensions.NotRequired[float]
+     """
+     The maximum duration of the audio in seconds. You do not usually need to specify this.
+     If the duration is not appropriate for the length of the transcript, the output audio may be truncated.
+     """
+
+     speed: typing_extensions.NotRequired[ModelSpeed]
+     add_timestamps: typing_extensions.NotRequired[bool]
+     """
+     Whether to return word-level timestamps. If `false` (default), no word timestamps will be produced at all. If `true`, the server will return timestamp events containing word-level timing information.
+     """
+
+     add_phoneme_timestamps: typing_extensions.NotRequired[bool]
+     """
+     Whether to return phoneme-level timestamps. If `false` (default), no phoneme timestamps will be produced - if `add_timestamps` is `true`, the produced timestamps will be word timestamps instead. If `true`, the server will return timestamp events containing phoneme-level timing information.
+     """
+
+     use_normalized_timestamps: typing_extensions.NotRequired[bool]
+     """
+     Whether to use normalized timestamps (True) or original timestamps (False).
+     """
+
+     context_id: typing_extensions.NotRequired[ContextId]
+     """
+     Optional context ID for this request.
+     """
@@ -1,11 +1,8 @@
  # This file was auto-generated by Fern from our API Definition.

  from .web_socket_base_response import WebSocketBaseResponseParams
- import typing_extensions
- from ..types.flush_id import FlushId


  class WebSocketChunkResponseParams(WebSocketBaseResponseParams):
      data: str
      step_time: float
-     flush_id: typing_extensions.NotRequired[FlushId]
@@ -4,8 +4,8 @@ from __future__ import annotations
  import typing_extensions
  import typing
  import typing_extensions
- from ..types.flush_id import FlushId
  from ..types.context_id import ContextId
+ from ..types.flush_id import FlushId
  from .word_timestamps import WordTimestampsParams
  from .phoneme_timestamps import PhonemeTimestampsParams

@@ -14,7 +14,6 @@ class WebSocketResponse_ChunkParams(typing_extensions.TypedDict):
      type: typing.Literal["chunk"]
      data: str
      step_time: float
-     flush_id: typing_extensions.NotRequired[FlushId]
      context_id: typing_extensions.NotRequired[ContextId]
      status_code: int
      done: bool
@@ -20,8 +20,16 @@ class WebSocketTtsRequestParams(typing_extensions.TypedDict):
      duration: typing_extensions.NotRequired[int]
      language: typing_extensions.NotRequired[str]
      add_timestamps: typing_extensions.NotRequired[bool]
-     use_original_timestamps: typing_extensions.NotRequired[bool]
+     """
+     Whether to return word-level timestamps. If `false` (default), no word timestamps will be produced at all. If `true`, the server will return timestamp events containing word-level timing information.
+     """
+
      add_phoneme_timestamps: typing_extensions.NotRequired[bool]
+     """
+     Whether to return phoneme-level timestamps. If `false` (default), no phoneme timestamps will be produced - if `add_timestamps` is `true`, the produced timestamps will be word timestamps instead. If `true`, the server will return timestamp events containing phoneme-level timing information.
+     """
+
+     use_normalized_timestamps: typing_extensions.NotRequired[bool]
      continue_: typing_extensions.NotRequired[typing_extensions.Annotated[bool, FieldMetadata(alias="continue")]]
      context_id: typing_extensions.NotRequired[str]
      max_buffer_delay_ms: typing_extensions.NotRequired[int]
@@ -15,11 +15,13 @@ from .phoneme_timestamps import PhonemeTimestamps
  from .raw_encoding import RawEncoding
  from .raw_output_format import RawOutputFormat
  from .speed import Speed
+ from .sse_output_format import SseOutputFormat
  from .supported_language import SupportedLanguage
  from .tts_request import TtsRequest
  from .tts_request_embedding_specifier import TtsRequestEmbeddingSpecifier
  from .tts_request_id_specifier import TtsRequestIdSpecifier
  from .tts_request_voice_specifier import TtsRequestVoiceSpecifier
+ from .ttssse_request import TtssseRequest
  from .wav_output_format import WavOutputFormat
  from .web_socket_base_response import WebSocketBaseResponse
  from .web_socket_chunk_response import WebSocketChunkResponse
@@ -63,11 +65,13 @@ __all__ = [
      "RawEncoding",
      "RawOutputFormat",
      "Speed",
+     "SseOutputFormat",
      "SupportedLanguage",
      "TtsRequest",
      "TtsRequestEmbeddingSpecifier",
      "TtsRequestIdSpecifier",
      "TtsRequestVoiceSpecifier",
+     "TtssseRequest",
      "WavOutputFormat",
      "WebSocketBaseResponse",
      "WebSocketChunkResponse",
@@ -59,17 +59,17 @@ class GenerationRequest(UniversalBaseModel):

      add_timestamps: typing.Optional[bool] = pydantic.Field(default=None)
      """
-     Whether to return word-level timestamps.
+     Whether to return word-level timestamps. If `false` (default), no word timestamps will be produced at all. If `true`, the server will return timestamp events containing word-level timing information.
      """

      add_phoneme_timestamps: typing.Optional[bool] = pydantic.Field(default=None)
      """
-     Whether to return phoneme-level timestamps.
+     Whether to return phoneme-level timestamps. If `false` (default), no phoneme timestamps will be produced. If `true`, the server will return timestamp events containing phoneme-level timing information.
      """

-     use_original_timestamps: typing.Optional[bool] = pydantic.Field(default=None)
+     use_normalized_timestamps: typing.Optional[bool] = pydantic.Field(default=None)
      """
-     Whether to use the original transcript for timestamps.
+     Whether to use normalized timestamps (True) or original timestamps (False).
      """

      if IS_PYDANTIC_V2:
@@ -0,0 +1,22 @@
+ # This file was auto-generated by Fern from our API Definition.
+
+ from ...core.pydantic_utilities import UniversalBaseModel
+ import typing
+ from .raw_encoding import RawEncoding
+ from ...core.pydantic_utilities import IS_PYDANTIC_V2
+ import pydantic
+
+
+ class SseOutputFormat(UniversalBaseModel):
+     container: typing.Literal["raw"] = "raw"
+     encoding: RawEncoding
+     sample_rate: int
+
+     if IS_PYDANTIC_V2:
+         model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2
+     else:
+
+         class Config:
+             frozen = True
+             smart_union = True
+             extra = pydantic.Extra.allow
@@ -0,0 +1,58 @@
+ # This file was auto-generated by Fern from our API Definition.
+
+ from ...core.pydantic_utilities import UniversalBaseModel
+ import pydantic
+ from .tts_request_voice_specifier import TtsRequestVoiceSpecifier
+ import typing
+ from .supported_language import SupportedLanguage
+ from .sse_output_format import SseOutputFormat
+ from .model_speed import ModelSpeed
+ from .context_id import ContextId
+ from ...core.pydantic_utilities import IS_PYDANTIC_V2
+
+
+ class TtssseRequest(UniversalBaseModel):
+     model_id: str = pydantic.Field()
+     """
+     The ID of the model to use for the generation. See [Models](/build-with-cartesia/models) for available models.
+     """
+
+     transcript: str
+     voice: TtsRequestVoiceSpecifier
+     language: typing.Optional[SupportedLanguage] = None
+     output_format: SseOutputFormat
+     duration: typing.Optional[float] = pydantic.Field(default=None)
+     """
+     The maximum duration of the audio in seconds. You do not usually need to specify this.
+     If the duration is not appropriate for the length of the transcript, the output audio may be truncated.
+     """
+
+     speed: typing.Optional[ModelSpeed] = None
+     add_timestamps: typing.Optional[bool] = pydantic.Field(default=None)
+     """
+     Whether to return word-level timestamps. If `false` (default), no word timestamps will be produced at all. If `true`, the server will return timestamp events containing word-level timing information.
+     """
+
+     add_phoneme_timestamps: typing.Optional[bool] = pydantic.Field(default=None)
+     """
+     Whether to return phoneme-level timestamps. If `false` (default), no phoneme timestamps will be produced - if `add_timestamps` is `true`, the produced timestamps will be word timestamps instead. If `true`, the server will return timestamp events containing phoneme-level timing information.
+     """
+
+     use_normalized_timestamps: typing.Optional[bool] = pydantic.Field(default=None)
+     """
+     Whether to use normalized timestamps (True) or original timestamps (False).
+     """
+
+     context_id: typing.Optional[ContextId] = pydantic.Field(default=None)
+     """
+     Optional context ID for this request.
+     """
+
+     if IS_PYDANTIC_V2:
+         model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2
+     else:
+
+         class Config:
+             frozen = True
+             smart_union = True
+             extra = pydantic.Extra.allow
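Taken together, `SseOutputFormat` and `TtssseRequest` above define the request body for the new SSE text-to-speech route in this release (the enlarged `cartesia/tts/client.py` in the file list carries the corresponding method, which is not shown in this diff). A construction sketch, assuming the models are exported from `cartesia.tts.types` as the updated `__init__.py` hunks indicate; the model ID, voice ID, and encoding values are placeholders, and the voice dict shape follows the usual id-based `TtsRequestVoiceSpecifier` form:

```python
from cartesia.tts.types import SseOutputFormat, TtssseRequest  # exports added in 2.0.6

# Build an SSE TTS request body; field names come from the generated models above.
request = TtssseRequest(
    model_id="sonic-2",                           # placeholder model ID
    transcript="Hello from the 2.0.6 SDK.",
    voice={"mode": "id", "id": "YOUR_VOICE_ID"},  # assumed id-specifier shape
    output_format=SseOutputFormat(encoding="pcm_f32le", sample_rate=44100),
    add_timestamps=True,             # request word-level timestamp events
    use_normalized_timestamps=True,  # align them to the normalized transcript
)
```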
@@ -1,16 +1,14 @@
  # This file was auto-generated by Fern from our API Definition.

  from .web_socket_base_response import WebSocketBaseResponse
- import typing
- from .flush_id import FlushId
  from ...core.pydantic_utilities import IS_PYDANTIC_V2
+ import typing
  import pydantic


  class WebSocketChunkResponse(WebSocketBaseResponse):
      data: str
      step_time: float
-     flush_id: typing.Optional[FlushId] = None

      if IS_PYDANTIC_V2:
          model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2
@@ -3,10 +3,10 @@
  from __future__ import annotations
  from ...core.pydantic_utilities import UniversalBaseModel
  import typing
- from .flush_id import FlushId
  from .context_id import ContextId
  from ...core.pydantic_utilities import IS_PYDANTIC_V2
  import pydantic
+ from .flush_id import FlushId
  from .word_timestamps import WordTimestamps
  from .phoneme_timestamps import PhonemeTimestamps

@@ -15,7 +15,6 @@ class WebSocketResponse_Chunk(UniversalBaseModel):
      type: typing.Literal["chunk"] = "chunk"
      data: str
      step_time: float
-     flush_id: typing.Optional[FlushId] = None
      context_id: typing.Optional[ContextId] = None
      status_code: int
      done: bool
@@ -22,9 +22,17 @@ class WebSocketTtsRequest(UniversalBaseModel):
      voice: TtsRequestVoiceSpecifier
      duration: typing.Optional[int] = None
      language: typing.Optional[str] = None
-     add_timestamps: typing.Optional[bool] = None
-     use_original_timestamps: typing.Optional[bool] = None
-     add_phoneme_timestamps: typing.Optional[bool] = None
+     add_timestamps: typing.Optional[bool] = pydantic.Field(default=None)
+     """
+     Whether to return word-level timestamps. If `false` (default), no word timestamps will be produced at all. If `true`, the server will return timestamp events containing word-level timing information.
+     """
+
+     add_phoneme_timestamps: typing.Optional[bool] = pydantic.Field(default=None)
+     """
+     Whether to return phoneme-level timestamps. If `false` (default), no phoneme timestamps will be produced - if `add_timestamps` is `true`, the produced timestamps will be word timestamps instead. If `true`, the server will return timestamp events containing phoneme-level timing information.
+     """
+
+     use_normalized_timestamps: typing.Optional[bool] = None
      continue_: typing_extensions.Annotated[typing.Optional[bool], FieldMetadata(alias="continue")] = None
      context_id: typing.Optional[str] = None
      max_buffer_delay_ms: typing.Optional[int] = None
@@ -4,7 +4,6 @@ from __future__ import annotations
  import typing_extensions
  import typing
  import typing_extensions
- from ...tts.types.flush_id import FlushId
  from ...tts.types.context_id import ContextId


@@ -12,7 +11,6 @@ class StreamingResponse_ChunkParams(typing_extensions.TypedDict):
      type: typing.Literal["chunk"]
      data: str
      step_time: float
-     flush_id: typing_extensions.NotRequired[FlushId]
      context_id: typing_extensions.NotRequired[ContextId]
      status_code: int
      done: bool
@@ -3,7 +3,6 @@
  from __future__ import annotations
  from ...core.pydantic_utilities import UniversalBaseModel
  import typing
- from ...tts.types.flush_id import FlushId
  from ...tts.types.context_id import ContextId
  from ...core.pydantic_utilities import IS_PYDANTIC_V2
  import pydantic
@@ -13,7 +12,6 @@ class StreamingResponse_Chunk(UniversalBaseModel):
      type: typing.Literal["chunk"] = "chunk"
      data: str
      step_time: float
-     flush_id: typing.Optional[FlushId] = None
      context_id: typing.Optional[ContextId] = None
      status_code: int
      done: bool
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: cartesia
- Version: 2.0.4
+ Version: 2.0.6
  Summary:
  Requires-Python: >=3.8,<4.0
  Classifier: Intended Audience :: Developers
@@ -213,6 +213,258 @@ p.terminate()
  ws.close() # Close the websocket connection
  ```

+ ## Speech-to-Text (STT) with Websockets
+
+ ```python
+ from cartesia import Cartesia
+ import os
+
+ client = Cartesia(api_key=os.getenv("CARTESIA_API_KEY"))
+
+ # Load your audio file as bytes
+ with open("path/to/audio.wav", "rb") as f:
+     audio_data = f.read()
+
+ # Convert to audio chunks (20ms chunks used here for a streaming example)
+ # This chunk size is calculated for 16kHz, 16-bit audio: 16000 * 0.02 * 2 = 640 bytes
+ chunk_size = 640
+ audio_chunks = [audio_data[i:i+chunk_size] for i in range(0, len(audio_data), chunk_size)]
+
+ # Create websocket connection with endpointing parameters
+ ws = client.stt.websocket(
+     model="ink-whisper", # Model (required)
+     language="en", # Language of your audio (required)
+     encoding="pcm_s16le", # Audio encoding format (required)
+     sample_rate=16000, # Audio sample rate (required)
+     min_volume=0.1, # Volume threshold for voice activity detection
+     max_silence_duration_secs=0.4, # Maximum silence duration before endpointing
+ )
+
+ # Send audio chunks (streaming approach)
+ for chunk in audio_chunks:
+     ws.send(chunk)
+
+ # Finalize and close
+ ws.send("finalize")
+ ws.send("done")
+
+ # Receive transcription results with word-level timestamps
+ for result in ws.receive():
+     if result['type'] == 'transcript':
+         print(f"Transcription: {result['text']}")
+
+         # Handle word-level timestamps if available
+         if 'words' in result and result['words']:
+             print("Word-level timestamps:")
+             for word_info in result['words']:
+                 word = word_info['word']
+                 start = word_info['start']
+                 end = word_info['end']
+                 print(f" '{word}': {start:.2f}s - {end:.2f}s")
+
+         if result['is_final']:
+             print("Final result received")
+     elif result['type'] == 'done':
+         break
+
+ ws.close()
+ ```
+
+ ### Async Streaming Speech-to-Text (STT) with Websockets
+
+ For real-time streaming applications, here's a more practical async example that demonstrates concurrent audio processing and result handling:
+
+ ```python
+ import asyncio
+ import os
+ from cartesia import AsyncCartesia
+
+ async def streaming_stt_example():
+     """
+     Advanced async STT example for real-time streaming applications.
+     This example simulates streaming audio processing with proper error handling
+     and demonstrates the new endpointing and word timestamp features.
+     """
+     client = AsyncCartesia(api_key=os.getenv("CARTESIA_API_KEY"))
+
+     try:
+         # Create websocket connection with voice activity detection
+         ws = await client.stt.websocket(
+             model="ink-whisper", # Model (required)
+             language="en", # Language of your audio (required)
+             encoding="pcm_s16le", # Audio encoding format (required)
+             sample_rate=16000, # Audio sample rate (required)
+             min_volume=0.15, # Volume threshold for voice activity detection
+             max_silence_duration_secs=0.3, # Maximum silence duration before endpointing
+         )
+
+         # Simulate streaming audio data (replace with your audio source)
+         async def audio_stream():
+             """Simulate real-time audio streaming - replace with actual audio capture"""
+             # Load audio file for simulation
+             with open("path/to/audio.wav", "rb") as f:
+                 audio_data = f.read()
+
+             # Stream in 100ms chunks (realistic for real-time processing)
+             chunk_size = int(16000 * 0.1 * 2) # 100ms at 16kHz, 16-bit
+
+             for i in range(0, len(audio_data), chunk_size):
+                 chunk = audio_data[i:i + chunk_size]
+                 if chunk:
+                     yield chunk
+                     # Simulate real-time streaming delay
+                     await asyncio.sleep(0.1)
+
+         # Send audio and receive results concurrently
+         async def send_audio():
+             """Send audio chunks to the STT websocket"""
+             try:
+                 async for chunk in audio_stream():
+                     await ws.send(chunk)
+                     print(f"Sent audio chunk of {len(chunk)} bytes")
+                     # Small delay to simulate realtime applications
+                     await asyncio.sleep(0.02)
+
+                 # Signal end of audio stream
+                 await ws.send("finalize")
+                 await ws.send("done")
+                 print("Audio streaming completed")
+
+             except Exception as e:
+                 print(f"Error sending audio: {e}")
+
+         async def receive_transcripts():
+             """Receive and process transcription results with word timestamps"""
+             full_transcript = ""
+             all_word_timestamps = []
+
+             try:
+                 async for result in ws.receive():
+                     if result['type'] == 'transcript':
+                         text = result['text']
+                         is_final = result['is_final']
+
+                         # Handle word-level timestamps
+                         if 'words' in result and result['words']:
+                             word_timestamps = result['words']
+                             all_word_timestamps.extend(word_timestamps)
+
+                             if is_final:
+                                 print("Word-level timestamps:")
+                                 for word_info in word_timestamps:
+                                     word = word_info['word']
+                                     start = word_info['start']
+                                     end = word_info['end']
+                                     print(f" '{word}': {start:.2f}s - {end:.2f}s")
+
+                         if is_final:
+                             # Final result - this text won't change
+                             full_transcript += text + " "
+                             print(f"FINAL: {text}")
+                         else:
+                             # Partial result - may change as more audio is processed
+                             print(f"PARTIAL: {text}")
+
+                     elif result['type'] == 'done':
+                         print("Transcription completed")
+                         break
+
+             except Exception as e:
+                 print(f"Error receiving transcripts: {e}")
+
+             return full_transcript.strip(), all_word_timestamps
+
+         print("Starting streaming STT...")
+
+         # Use asyncio.gather to run audio sending and transcript receiving concurrently
+         _, (final_transcript, word_timestamps) = await asyncio.gather(
+             send_audio(),
+             receive_transcripts()
+         )
+
+         print(f"\nComplete transcript: {final_transcript}")
+         print(f"Total words with timestamps: {len(word_timestamps)}")
+
+         # Clean up
+         await ws.close()
+
+     except Exception as e:
+         print(f"STT streaming error: {e}")
+     finally:
+         await client.close()
+
+ # Run the example
+ if __name__ == "__main__":
+     asyncio.run(streaming_stt_example())
+ ```
+
+ ## Batch Speech-to-Text (STT)
+
+ For processing pre-recorded audio files, use the batch STT API which supports uploading complete audio files for transcription:
+
+ ```python
+ from cartesia import Cartesia
+ import os
+
+ client = Cartesia(api_key=os.getenv("CARTESIA_API_KEY"))
+
+ # Transcribe an audio file with word-level timestamps
+ with open("path/to/audio.wav", "rb") as audio_file:
+     response = client.stt.transcribe(
+         file=audio_file, # Audio file to transcribe
+         model="ink-whisper", # STT model (required)
+         language="en", # Language of the audio (optional)
+         timestamp_granularities=["word"], # Include word-level timestamps (optional)
+         encoding="pcm_s16le", # Audio encoding (optional)
+         sample_rate=16000, # Audio sample rate (optional)
+     )
+
+ # Access transcription results
+ print(f"Transcribed text: {response.text}")
+ print(f"Audio duration: {response.duration:.2f} seconds")
+
+ # Process word-level timestamps if requested
+ if response.words:
+     print("\nWord-level timestamps:")
+     for word_info in response.words:
+         word = word_info.word
+         start = word_info.start
+         end = word_info.end
+         print(f" '{word}': {start:.2f}s - {end:.2f}s")
+ ```
+
+ ### Async Batch STT
+
+ ```python
+ import asyncio
+ from cartesia import AsyncCartesia
+ import os
+
+ async def transcribe_file():
+     client = AsyncCartesia(api_key=os.getenv("CARTESIA_API_KEY"))
+
+     with open("path/to/audio.wav", "rb") as audio_file:
+         response = await client.stt.transcribe(
+             file=audio_file,
+             model="ink-whisper",
+             language="en",
+             timestamp_granularities=["word"],
+         )
+
+     print(f"Transcribed text: {response.text}")
+
+     # Process word timestamps
+     if response.words:
+         for word_info in response.words:
+             print(f"'{word_info.word}': {word_info.start:.2f}s - {word_info.end:.2f}s")
+
+     await client.close()
+
+ asyncio.run(transcribe_file())
+ ```
+
+ > **Note:** Batch STT also supports OpenAI's audio transcriptions format for easy migration from OpenAI Whisper. See our [migration guide](https://docs.cartesia.ai/api-reference/stt/migrate-from-open-ai) for details.
+
  ## Voices

  List all available Voices with `client.voices.list`, which returns an iterable that automatically handles pagination:
@@ -358,7 +610,6 @@ new_voice = client.voices.create(
      language="en"
  )
  ```
-
  ### Custom Client

  You can override the `httpx` client to customize it for your use-case. Some common use-cases include support for proxies
@@ -412,3 +663,6 @@ $ git commit --amend -m "manually regenerate from docs" # optional

  From https://github.com/cartesia-ai/docs click `Actions` then `Release Python SDK`. (Requires permissions.)

+
+
+