cartesia 2.0.4.tar.gz → 2.0.6.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {cartesia-2.0.4 → cartesia-2.0.6}/PKG-INFO +256 -2
- {cartesia-2.0.4 → cartesia-2.0.6}/README.md +255 -1
- {cartesia-2.0.4 → cartesia-2.0.6}/pyproject.toml +1 -1
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/__init__.py +60 -1
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/auth/client.py +8 -8
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/auth/requests/token_grant.py +7 -1
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/auth/requests/token_request.py +3 -3
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/auth/types/token_grant.py +7 -2
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/auth/types/token_request.py +3 -3
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/base_client.py +2 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/client.py +5 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/core/client_wrapper.py +1 -1
- cartesia-2.0.6/src/cartesia/stt/__init__.py +57 -0
- cartesia-2.0.6/src/cartesia/stt/_async_websocket.py +293 -0
- cartesia-2.0.6/src/cartesia/stt/_websocket.py +294 -0
- cartesia-2.0.6/src/cartesia/stt/client.py +456 -0
- cartesia-2.0.6/src/cartesia/stt/requests/__init__.py +29 -0
- cartesia-2.0.6/src/cartesia/stt/requests/done_message.py +14 -0
- cartesia-2.0.6/src/cartesia/stt/requests/error_message.py +16 -0
- cartesia-2.0.6/src/cartesia/stt/requests/flush_done_message.py +14 -0
- cartesia-2.0.6/src/cartesia/stt/requests/streaming_transcription_response.py +41 -0
- cartesia-2.0.6/src/cartesia/stt/requests/transcript_message.py +40 -0
- cartesia-2.0.6/src/cartesia/stt/requests/transcription_response.py +28 -0
- cartesia-2.0.6/src/cartesia/stt/requests/transcription_word.py +20 -0
- cartesia-2.0.6/src/cartesia/stt/socket_client.py +138 -0
- cartesia-2.0.6/src/cartesia/stt/types/__init__.py +33 -0
- cartesia-2.0.6/src/cartesia/stt/types/done_message.py +26 -0
- cartesia-2.0.6/src/cartesia/stt/types/error_message.py +27 -0
- cartesia-2.0.6/src/cartesia/stt/types/flush_done_message.py +26 -0
- cartesia-2.0.6/src/cartesia/stt/types/streaming_transcription_response.py +94 -0
- cartesia-2.0.6/src/cartesia/stt/types/stt_encoding.py +7 -0
- cartesia-2.0.6/src/cartesia/stt/types/timestamp_granularity.py +5 -0
- cartesia-2.0.6/src/cartesia/stt/types/transcript_message.py +50 -0
- cartesia-2.0.6/src/cartesia/stt/types/transcription_response.py +38 -0
- cartesia-2.0.6/src/cartesia/stt/types/transcription_word.py +32 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/tts/__init__.py +8 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/tts/client.py +50 -8
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/tts/requests/__init__.py +4 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/tts/requests/generation_request.py +4 -4
- cartesia-2.0.6/src/cartesia/tts/requests/sse_output_format.py +11 -0
- cartesia-2.0.6/src/cartesia/tts/requests/ttssse_request.py +47 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/tts/requests/web_socket_chunk_response.py +0 -3
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/tts/requests/web_socket_response.py +1 -2
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/tts/requests/web_socket_tts_request.py +9 -1
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/tts/types/__init__.py +4 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/tts/types/generation_request.py +4 -4
- cartesia-2.0.6/src/cartesia/tts/types/sse_output_format.py +22 -0
- cartesia-2.0.6/src/cartesia/tts/types/ttssse_request.py +58 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/tts/types/web_socket_chunk_response.py +1 -3
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/tts/types/web_socket_response.py +1 -2
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/tts/types/web_socket_tts_request.py +11 -3
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/voice_changer/requests/streaming_response.py +0 -2
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/voice_changer/types/streaming_response.py +0 -2
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/api_status/__init__.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/api_status/client.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/api_status/requests/__init__.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/api_status/requests/api_info.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/api_status/types/__init__.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/api_status/types/api_info.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/auth/__init__.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/auth/requests/__init__.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/auth/requests/token_response.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/auth/types/__init__.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/auth/types/token_response.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/core/__init__.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/core/api_error.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/core/datetime_utils.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/core/file.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/core/http_client.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/core/jsonable_encoder.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/core/pagination.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/core/pydantic_utilities.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/core/query_encoder.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/core/remove_none_from_dict.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/core/request_options.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/core/serialization.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/datasets/__init__.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/datasets/requests/__init__.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/datasets/requests/create_dataset_request.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/datasets/requests/dataset.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/datasets/requests/dataset_file.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/datasets/requests/paginated_dataset_files.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/datasets/requests/paginated_datasets.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/datasets/types/__init__.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/datasets/types/create_dataset_request.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/datasets/types/dataset.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/datasets/types/dataset_file.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/datasets/types/file_purpose.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/datasets/types/paginated_dataset_files.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/datasets/types/paginated_datasets.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/embedding/__init__.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/embedding/types/__init__.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/embedding/types/embedding.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/environment.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/infill/__init__.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/infill/client.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/py.typed +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/tts/_async_websocket.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/tts/_websocket.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/tts/requests/cancel_context_request.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/tts/requests/controls.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/tts/requests/mp_3_output_format.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/tts/requests/output_format.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/tts/requests/phoneme_timestamps.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/tts/requests/raw_output_format.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/tts/requests/speed.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/tts/requests/tts_request.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/tts/requests/tts_request_embedding_specifier.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/tts/requests/tts_request_id_specifier.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/tts/requests/tts_request_voice_specifier.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/tts/requests/wav_output_format.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/tts/requests/web_socket_base_response.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/tts/requests/web_socket_done_response.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/tts/requests/web_socket_error_response.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/tts/requests/web_socket_flush_done_response.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/tts/requests/web_socket_phoneme_timestamps_response.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/tts/requests/web_socket_raw_output_format.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/tts/requests/web_socket_request.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/tts/requests/web_socket_stream_options.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/tts/requests/web_socket_timestamps_response.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/tts/requests/web_socket_tts_output.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/tts/requests/word_timestamps.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/tts/socket_client.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/tts/types/cancel_context_request.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/tts/types/context_id.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/tts/types/controls.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/tts/types/emotion.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/tts/types/flush_id.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/tts/types/model_speed.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/tts/types/mp_3_output_format.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/tts/types/natural_specifier.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/tts/types/numerical_specifier.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/tts/types/output_format.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/tts/types/phoneme_timestamps.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/tts/types/raw_encoding.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/tts/types/raw_output_format.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/tts/types/speed.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/tts/types/supported_language.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/tts/types/tts_request.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/tts/types/tts_request_embedding_specifier.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/tts/types/tts_request_id_specifier.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/tts/types/tts_request_voice_specifier.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/tts/types/wav_output_format.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/tts/types/web_socket_base_response.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/tts/types/web_socket_done_response.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/tts/types/web_socket_error_response.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/tts/types/web_socket_flush_done_response.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/tts/types/web_socket_phoneme_timestamps_response.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/tts/types/web_socket_raw_output_format.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/tts/types/web_socket_request.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/tts/types/web_socket_stream_options.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/tts/types/web_socket_timestamps_response.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/tts/types/web_socket_tts_output.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/tts/types/word_timestamps.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/tts/utils/constants.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/tts/utils/tts.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/tts/utils/types.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/version.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/voice_changer/__init__.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/voice_changer/client.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/voice_changer/requests/__init__.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/voice_changer/types/__init__.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/voice_changer/types/output_format_container.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/voices/__init__.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/voices/client.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/voices/requests/__init__.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/voices/requests/create_voice_request.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/voices/requests/embedding_response.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/voices/requests/embedding_specifier.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/voices/requests/get_voices_response.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/voices/requests/id_specifier.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/voices/requests/localize_dialect.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/voices/requests/localize_voice_request.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/voices/requests/mix_voice_specifier.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/voices/requests/mix_voices_request.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/voices/requests/update_voice_request.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/voices/requests/voice.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/voices/requests/voice_metadata.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/voices/types/__init__.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/voices/types/base_voice_id.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/voices/types/clone_mode.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/voices/types/create_voice_request.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/voices/types/embedding_response.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/voices/types/embedding_specifier.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/voices/types/gender.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/voices/types/gender_presentation.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/voices/types/get_voices_response.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/voices/types/id_specifier.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/voices/types/localize_dialect.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/voices/types/localize_english_dialect.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/voices/types/localize_french_dialect.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/voices/types/localize_portuguese_dialect.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/voices/types/localize_spanish_dialect.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/voices/types/localize_target_language.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/voices/types/localize_voice_request.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/voices/types/mix_voice_specifier.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/voices/types/mix_voices_request.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/voices/types/update_voice_request.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/voices/types/voice.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/voices/types/voice_expand_options.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/voices/types/voice_id.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/voices/types/voice_metadata.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/voices/types/weight.py +0 -0
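The headline change in 2.0.6 is the new `stt` package (a REST client, sync/async websocket clients, and request/response types). As an orientation, here is a minimal sketch that uses only names appearing in this file list and in the README changes reproduced below; the complete, annotated examples follow in the PKG-INFO/README diffs:

```python
import os

from cartesia import Cartesia

client = Cartesia(api_key=os.getenv("CARTESIA_API_KEY"))

# Batch transcription (added in src/cartesia/stt/client.py)
with open("path/to/audio.wav", "rb") as f:
    response = client.stt.transcribe(file=f, model="ink-whisper")
print(response.text)

# Streaming transcription (added in src/cartesia/stt/socket_client.py / _websocket.py)
ws = client.stt.websocket(
    model="ink-whisper", language="en", encoding="pcm_s16le", sample_rate=16000
)
```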
--- cartesia-2.0.4/PKG-INFO
+++ cartesia-2.0.6/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: cartesia
-Version: 2.0.4
+Version: 2.0.6
 Summary:
 Requires-Python: >=3.8,<4.0
 Classifier: Intended Audience :: Developers
@@ -213,6 +213,258 @@ p.terminate()
 ws.close() # Close the websocket connection
 ```

+## Speech-to-Text (STT) with Websockets
+
+```python
+from cartesia import Cartesia
+import os
+
+client = Cartesia(api_key=os.getenv("CARTESIA_API_KEY"))
+
+# Load your audio file as bytes
+with open("path/to/audio.wav", "rb") as f:
+    audio_data = f.read()
+
+# Convert to audio chunks (20ms chunks used here for a streaming example)
+# This chunk size is calculated for 16kHz, 16-bit audio: 16000 * 0.02 * 2 = 640 bytes
+chunk_size = 640
+audio_chunks = [audio_data[i:i+chunk_size] for i in range(0, len(audio_data), chunk_size)]
+
+# Create websocket connection with endpointing parameters
+ws = client.stt.websocket(
+    model="ink-whisper",            # Model (required)
+    language="en",                  # Language of your audio (required)
+    encoding="pcm_s16le",           # Audio encoding format (required)
+    sample_rate=16000,              # Audio sample rate (required)
+    min_volume=0.1,                 # Volume threshold for voice activity detection
+    max_silence_duration_secs=0.4,  # Maximum silence duration before endpointing
+)
+
+# Send audio chunks (streaming approach)
+for chunk in audio_chunks:
+    ws.send(chunk)
+
+# Finalize and close
+ws.send("finalize")
+ws.send("done")
+
+# Receive transcription results with word-level timestamps
+for result in ws.receive():
+    if result['type'] == 'transcript':
+        print(f"Transcription: {result['text']}")
+
+        # Handle word-level timestamps if available
+        if 'words' in result and result['words']:
+            print("Word-level timestamps:")
+            for word_info in result['words']:
+                word = word_info['word']
+                start = word_info['start']
+                end = word_info['end']
+                print(f"  '{word}': {start:.2f}s - {end:.2f}s")
+
+        if result['is_final']:
+            print("Final result received")
+    elif result['type'] == 'done':
+        break
+
+ws.close()
+```
+
+### Async Streaming Speech-to-Text (STT) with Websockets
+
+For real-time streaming applications, here's a more practical async example that demonstrates concurrent audio processing and result handling:
+
+```python
+import asyncio
+import os
+from cartesia import AsyncCartesia
+
+async def streaming_stt_example():
+    """
+    Advanced async STT example for real-time streaming applications.
+    This example simulates streaming audio processing with proper error handling
+    and demonstrates the new endpointing and word timestamp features.
+    """
+    client = AsyncCartesia(api_key=os.getenv("CARTESIA_API_KEY"))
+
+    try:
+        # Create websocket connection with voice activity detection
+        ws = await client.stt.websocket(
+            model="ink-whisper",            # Model (required)
+            language="en",                  # Language of your audio (required)
+            encoding="pcm_s16le",           # Audio encoding format (required)
+            sample_rate=16000,              # Audio sample rate (required)
+            min_volume=0.15,                # Volume threshold for voice activity detection
+            max_silence_duration_secs=0.3,  # Maximum silence duration before endpointing
+        )
+
+        # Simulate streaming audio data (replace with your audio source)
+        async def audio_stream():
+            """Simulate real-time audio streaming - replace with actual audio capture"""
+            # Load audio file for simulation
+            with open("path/to/audio.wav", "rb") as f:
+                audio_data = f.read()
+
+            # Stream in 100ms chunks (realistic for real-time processing)
+            chunk_size = int(16000 * 0.1 * 2)  # 100ms at 16kHz, 16-bit
+
+            for i in range(0, len(audio_data), chunk_size):
+                chunk = audio_data[i:i + chunk_size]
+                if chunk:
+                    yield chunk
+                    # Simulate real-time streaming delay
+                    await asyncio.sleep(0.1)
+
+        # Send audio and receive results concurrently
+        async def send_audio():
+            """Send audio chunks to the STT websocket"""
+            try:
+                async for chunk in audio_stream():
+                    await ws.send(chunk)
+                    print(f"Sent audio chunk of {len(chunk)} bytes")
+                    # Small delay to simulate realtime applications
+                    await asyncio.sleep(0.02)
+
+                # Signal end of audio stream
+                await ws.send("finalize")
+                await ws.send("done")
+                print("Audio streaming completed")
+
+            except Exception as e:
+                print(f"Error sending audio: {e}")
+
+        async def receive_transcripts():
+            """Receive and process transcription results with word timestamps"""
+            full_transcript = ""
+            all_word_timestamps = []
+
+            try:
+                async for result in ws.receive():
+                    if result['type'] == 'transcript':
+                        text = result['text']
+                        is_final = result['is_final']
+
+                        # Handle word-level timestamps
+                        if 'words' in result and result['words']:
+                            word_timestamps = result['words']
+                            all_word_timestamps.extend(word_timestamps)
+
+                            if is_final:
+                                print("Word-level timestamps:")
+                                for word_info in word_timestamps:
+                                    word = word_info['word']
+                                    start = word_info['start']
+                                    end = word_info['end']
+                                    print(f"  '{word}': {start:.2f}s - {end:.2f}s")
+
+                        if is_final:
+                            # Final result - this text won't change
+                            full_transcript += text + " "
+                            print(f"FINAL: {text}")
+                        else:
+                            # Partial result - may change as more audio is processed
+                            print(f"PARTIAL: {text}")
+
+                    elif result['type'] == 'done':
+                        print("Transcription completed")
+                        break
+
+            except Exception as e:
+                print(f"Error receiving transcripts: {e}")
+
+            return full_transcript.strip(), all_word_timestamps
+
+        print("Starting streaming STT...")
+
+        # Use asyncio.gather to run audio sending and transcript receiving concurrently
+        _, (final_transcript, word_timestamps) = await asyncio.gather(
+            send_audio(),
+            receive_transcripts()
+        )
+
+        print(f"\nComplete transcript: {final_transcript}")
+        print(f"Total words with timestamps: {len(word_timestamps)}")
+
+        # Clean up
+        await ws.close()
+
+    except Exception as e:
+        print(f"STT streaming error: {e}")
+    finally:
+        await client.close()
+
+# Run the example
+if __name__ == "__main__":
+    asyncio.run(streaming_stt_example())
+```
+
+## Batch Speech-to-Text (STT)
+
+For processing pre-recorded audio files, use the batch STT API which supports uploading complete audio files for transcription:
+
+```python
+from cartesia import Cartesia
+import os
+
+client = Cartesia(api_key=os.getenv("CARTESIA_API_KEY"))
+
+# Transcribe an audio file with word-level timestamps
+with open("path/to/audio.wav", "rb") as audio_file:
+    response = client.stt.transcribe(
+        file=audio_file,                   # Audio file to transcribe
+        model="ink-whisper",               # STT model (required)
+        language="en",                     # Language of the audio (optional)
+        timestamp_granularities=["word"],  # Include word-level timestamps (optional)
+        encoding="pcm_s16le",              # Audio encoding (optional)
+        sample_rate=16000,                 # Audio sample rate (optional)
+    )
+
+# Access transcription results
+print(f"Transcribed text: {response.text}")
+print(f"Audio duration: {response.duration:.2f} seconds")
+
+# Process word-level timestamps if requested
+if response.words:
+    print("\nWord-level timestamps:")
+    for word_info in response.words:
+        word = word_info.word
+        start = word_info.start
+        end = word_info.end
+        print(f"  '{word}': {start:.2f}s - {end:.2f}s")
+```
+
+### Async Batch STT
+
+```python
+import asyncio
+from cartesia import AsyncCartesia
+import os
+
+async def transcribe_file():
+    client = AsyncCartesia(api_key=os.getenv("CARTESIA_API_KEY"))
+
+    with open("path/to/audio.wav", "rb") as audio_file:
+        response = await client.stt.transcribe(
+            file=audio_file,
+            model="ink-whisper",
+            language="en",
+            timestamp_granularities=["word"],
+        )
+
+    print(f"Transcribed text: {response.text}")
+
+    # Process word timestamps
+    if response.words:
+        for word_info in response.words:
+            print(f"'{word_info.word}': {word_info.start:.2f}s - {word_info.end:.2f}s")
+
+    await client.close()
+
+asyncio.run(transcribe_file())
+```
+
+> **Note:** Batch STT also supports OpenAI's audio transcriptions format for easy migration from OpenAI Whisper. See our [migration guide](https://docs.cartesia.ai/api-reference/stt/migrate-from-open-ai) for details.
+
 ## Voices

 List all available Voices with `client.voices.list`, which returns an iterable that automatically handles pagination:
@@ -358,7 +610,6 @@ new_voice = client.voices.create(
     language="en"
 )
 ```
-
 ### Custom Client

 You can override the `httpx` client to customize it for your use-case. Some common use-cases include support for proxies
@@ -412,3 +663,6 @@ $ git commit --amend -m "manually regenerate from docs" # optional

 From https://github.com/cartesia-ai/docs click `Actions` then `Release Python SDK`. (Requires permissions.)

+
+
+
--- cartesia-2.0.4/README.md
+++ cartesia-2.0.6/README.md
@@ -181,6 +181,258 @@ p.terminate()
@@ -326,7 +578,6 @@ new_voice = client.voices.create(
@@ -379,3 +630,6 @@ $ git commit --amend -m "manually regenerate from docs" # optional

The README.md hunks are identical to the README body embedded in the PKG-INFO diff above: the Speech-to-Text sections are inserted after the TTS websocket example, one stray blank line after the voice-creation example is removed, and three trailing blank lines are appended.
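The note in the diff above states that batch STT accepts OpenAI's audio transcriptions format. Below is a hedged sketch of what migration might look like with the official `openai` client; the `base_url` value is an assumption, not something this diff confirms, so treat the linked migration guide as authoritative:

```python
import os

from openai import OpenAI

# Assumption: Cartesia exposes an OpenAI-compatible transcriptions endpoint.
# The base_url here is hypothetical; see the migration guide for the real value.
client = OpenAI(
    api_key=os.getenv("CARTESIA_API_KEY"),
    base_url="https://api.cartesia.ai",  # hypothetical
)

with open("path/to/audio.wav", "rb") as f:
    result = client.audio.transcriptions.create(model="ink-whisper", file=f)

print(result.text)
```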
--- cartesia-2.0.4/src/cartesia/__init__.py
+++ cartesia-2.0.6/src/cartesia/__init__.py
@@ -1,6 +1,6 @@
 # This file was auto-generated by Fern from our API Definition.

-from . import api_status, auth, datasets, embedding, infill, tts, voice_changer, voices
+from . import api_status, auth, datasets, embedding, infill, stt, tts, voice_changer, voices
 from .api_status import ApiInfo, ApiInfoParams
 from .auth import TokenGrant, TokenGrantParams, TokenRequest, TokenRequestParams, TokenResponse, TokenResponseParams
 from .client import AsyncCartesia, Cartesia
@@ -19,6 +19,32 @@ from .datasets import (
 )
 from .embedding import Embedding
 from .environment import CartesiaEnvironment
+from .stt import (
+    DoneMessage,
+    DoneMessageParams,
+    ErrorMessage,
+    ErrorMessageParams,
+    FlushDoneMessage,
+    FlushDoneMessageParams,
+    StreamingTranscriptionResponse,
+    StreamingTranscriptionResponseParams,
+    StreamingTranscriptionResponse_Done,
+    StreamingTranscriptionResponse_DoneParams,
+    StreamingTranscriptionResponse_Error,
+    StreamingTranscriptionResponse_ErrorParams,
+    StreamingTranscriptionResponse_FlushDone,
+    StreamingTranscriptionResponse_FlushDoneParams,
+    StreamingTranscriptionResponse_Transcript,
+    StreamingTranscriptionResponse_TranscriptParams,
+    SttEncoding,
+    TimestampGranularity,
+    TranscriptMessage,
+    TranscriptMessageParams,
+    TranscriptionResponse,
+    TranscriptionResponseParams,
+    TranscriptionWord,
+    TranscriptionWordParams,
+)
 from .tts import (
     CancelContextRequest,
     CancelContextRequestParams,
@@ -49,6 +75,8 @@ from .tts import (
     RawOutputFormatParams,
     Speed,
     SpeedParams,
+    SseOutputFormat,
+    SseOutputFormatParams,
     SupportedLanguage,
     TtsRequest,
     TtsRequestEmbeddingSpecifier,
@@ -58,6 +86,8 @@ from .tts import (
     TtsRequestParams,
     TtsRequestVoiceSpecifier,
     TtsRequestVoiceSpecifierParams,
+    TtssseRequest,
+    TtssseRequestParams,
     WavOutputFormat,
     WavOutputFormatParams,
     WebSocketBaseResponse,
@@ -173,13 +203,19 @@ __all__ = [
     "DatasetFile",
     "DatasetFileParams",
     "DatasetParams",
+    "DoneMessage",
+    "DoneMessageParams",
     "Embedding",
     "EmbeddingResponse",
     "EmbeddingResponseParams",
     "EmbeddingSpecifier",
     "EmbeddingSpecifierParams",
     "Emotion",
+    "ErrorMessage",
+    "ErrorMessageParams",
     "FilePurpose",
+    "FlushDoneMessage",
+    "FlushDoneMessageParams",
     "FlushId",
     "Gender",
     "GenderPresentation",
@@ -227,6 +263,8 @@ __all__ = [
     "RawOutputFormatParams",
     "Speed",
     "SpeedParams",
+    "SseOutputFormat",
+    "SseOutputFormatParams",
     "StreamingResponse",
     "StreamingResponseParams",
     "StreamingResponse_Chunk",
@@ -235,13 +273,31 @@ __all__ = [
     "StreamingResponse_DoneParams",
     "StreamingResponse_Error",
     "StreamingResponse_ErrorParams",
+    "StreamingTranscriptionResponse",
+    "StreamingTranscriptionResponseParams",
+    "StreamingTranscriptionResponse_Done",
+    "StreamingTranscriptionResponse_DoneParams",
+    "StreamingTranscriptionResponse_Error",
+    "StreamingTranscriptionResponse_ErrorParams",
+    "StreamingTranscriptionResponse_FlushDone",
+    "StreamingTranscriptionResponse_FlushDoneParams",
+    "StreamingTranscriptionResponse_Transcript",
+    "StreamingTranscriptionResponse_TranscriptParams",
+    "SttEncoding",
     "SupportedLanguage",
+    "TimestampGranularity",
     "TokenGrant",
     "TokenGrantParams",
     "TokenRequest",
     "TokenRequestParams",
     "TokenResponse",
     "TokenResponseParams",
+    "TranscriptMessage",
+    "TranscriptMessageParams",
+    "TranscriptionResponse",
+    "TranscriptionResponseParams",
+    "TranscriptionWord",
+    "TranscriptionWordParams",
     "TtsRequest",
     "TtsRequestEmbeddingSpecifier",
     "TtsRequestEmbeddingSpecifierParams",
@@ -250,6 +306,8 @@ __all__ = [
     "TtsRequestParams",
     "TtsRequestVoiceSpecifier",
     "TtsRequestVoiceSpecifierParams",
+    "TtssseRequest",
+    "TtssseRequestParams",
     "UpdateVoiceRequest",
     "UpdateVoiceRequestParams",
     "Voice",
@@ -307,6 +365,7 @@ __all__ = [
     "datasets",
     "embedding",
     "infill",
+    "stt",
     "tts",
     "voice_changer",
     "voices",