PyPI - cartesia - Versions diffs - 2.0.3__tar.gz → 2.0.5__tar.gz - Mend

cartesia 2.0.3.tar.gz → 2.0.5.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (195)

{cartesia-2.0.3 → cartesia-2.0.5}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: cartesia
-Version: 2.0.3
+Version: 2.0.5
 Summary:
 Requires-Python: >=3.8,<4.0
 Classifier: Intended Audience :: Developers
@@ -213,6 +213,161 @@ p.terminate()
 ws.close()  # Close the websocket connection
 ```
+## Speech-to-Text (STT) with Websockets
+```python
+from cartesia import Cartesia
+import os
+client = Cartesia(api_key=os.getenv("CARTESIA_API_KEY"))
+# Load your audio file as bytes
+with open("path/to/audio.wav", "rb") as f:
+    audio_data = f.read()
+# Convert to audio chunks (20ms chunks used here for a streaming example)
+# This chunk size is calculated for 16kHz, 16-bit audio: 16000 * 0.02 * 2 = 640 bytes
+chunk_size = 640
+audio_chunks = [audio_data[i:i+chunk_size] for i in range(0, len(audio_data), chunk_size)]
+# Create websocket connection
+ws = client.stt.websocket(
+    model="ink-whisper",
+    language="en",           # Must match the language of your audio
+    encoding="pcm_s16le",    # Must match your audio's encoding format
+    sample_rate=16000,       # Must match your audio's sample rate
+)
+# Send audio chunks (streaming approach)
+for chunk in audio_chunks:
+    ws.send(chunk)
+# Finalize and close
+ws.send("finalize")
+ws.send("done")
+# Receive transcription results
+for result in ws.receive():
+    if result['type'] == 'transcript':
+        print(f"Transcription: {result['text']}")
+        if result['is_final']:
+            print("Final result received")
+    elif result['type'] == 'done':
+        break
+ws.close()
+```
+### Async Streaming Speech-to-Text (STT) with Websockets
+For real-time streaming applications, here's a more practical async example that demonstrates concurrent audio processing and result handling:
+```python
+import asyncio
+import os
+from cartesia import AsyncCartesia
+async def streaming_stt_example():
+    """
+    Advanced async STT example for real-time streaming applications.
+    This example simulates streaming audio processing with proper error handling.
+    """
+    client = AsyncCartesia(api_key=os.getenv("CARTESIA_API_KEY"))
+    try:
+        # Create websocket connection
+        ws = await client.stt.websocket(
+            model="ink-whisper",
+            language="en",           # Must match the language of your audio
+            encoding="pcm_s16le",    # Must match your audio's encoding format
+            sample_rate=16000,       # Must match your audio's sample rate
+        )
+        # Simulate streaming audio data (replace with your audio source)
+        async def audio_stream():
+            """Simulate real-time audio streaming - replace with actual audio capture"""
+            # Load audio file for simulation
+            with open("path/to/audio.wav", "rb") as f:
+                audio_data = f.read()
+            # Stream in 100ms chunks (realistic for real-time processing)
+            chunk_size = int(16000 * 0.1 * 2)  # 100ms at 16kHz, 16-bit
+            for i in range(0, len(audio_data), chunk_size):
+                chunk = audio_data[i:i + chunk_size]
+                if chunk:
+                    yield chunk
+                    # Simulate real-time streaming delay
+                    await asyncio.sleep(0.1)
+        # Send audio and receive results concurrently
+        async def send_audio():
+            """Send audio chunks to the STT websocket"""
+            try:
+                async for chunk in audio_stream():
+                    await ws.send(chunk)
+                    print(f"Sent audio chunk of {len(chunk)} bytes")
+                    # Small delay to simulate realtime applications
+                    await asyncio.sleep(0.02)
+                # Signal end of audio stream
+                await ws.send("finalize")
+                await ws.send("done")
+                print("Audio streaming completed")
+            except Exception as e:
+                print(f"Error sending audio: {e}")
+        async def receive_transcripts():
+            """Receive and process transcription results"""
+            full_transcript = ""
+            try:
+                async for result in ws.receive():
+                    if result['type'] == 'transcript':
+                        text = result['text']
+                        is_final = result['is_final']
+                        if is_final:
+                            # Final result - this text won't change
+                            full_transcript += text + " "
+                            print(f"FINAL: {text}")
+                        else:
+                            # Partial result - may change as more audio is processed
+                            print(f"PARTIAL: {text}")
+                    elif result['type'] == 'done':
+                        print("Transcription completed")
+                        break
+            except Exception as e:
+                print(f"Error receiving transcripts: {e}")
+            return full_transcript.strip()
+        print("Starting streaming STT...")
+        # Use asyncio.gather to run audio sending and transcript receiving concurrently
+        _, final_transcript = await asyncio.gather(
+            send_audio(),
+            receive_transcripts()
+        )
+        print(f"\nComplete transcript: {final_transcript}")
+        # Clean up
+        await ws.close()
+    except Exception as e:
+        print(f"STT streaming error: {e}")
+    finally:
+        await client.close()
+# Run the example
+if __name__ == "__main__":
+    asyncio.run(streaming_stt_example())
+```
 ## Voices
 List all available Voices with `client.voices.list`, which returns an iterable that automatically handles pagination:
@@ -358,7 +513,6 @@ new_voice = client.voices.create(
     language="en"
 )
 ```
 ### Custom Client
 You can override the `httpx` client to customize it for your use-case. Some common use-cases include support for proxies
@@ -412,3 +566,6 @@ $ git commit --amend -m "manually regenerate from docs" # optional
 From https://github.com/cartesia-ai/docs click `Actions` then `Release Python SDK`. (Requires permissions.)

{cartesia-2.0.3 → cartesia-2.0.5}/README.md RENAMED Viewed

@@ -181,6 +181,161 @@ p.terminate()
 ws.close()  # Close the websocket connection
 ```
+## Speech-to-Text (STT) with Websockets
+```python
+from cartesia import Cartesia
+import os
+client = Cartesia(api_key=os.getenv("CARTESIA_API_KEY"))
+# Load your audio file as bytes
+with open("path/to/audio.wav", "rb") as f:
+    audio_data = f.read()
+# Convert to audio chunks (20ms chunks used here for a streaming example)
+# This chunk size is calculated for 16kHz, 16-bit audio: 16000 * 0.02 * 2 = 640 bytes
+chunk_size = 640
+audio_chunks = [audio_data[i:i+chunk_size] for i in range(0, len(audio_data), chunk_size)]
+# Create websocket connection
+ws = client.stt.websocket(
+    model="ink-whisper",
+    language="en",           # Must match the language of your audio
+    encoding="pcm_s16le",    # Must match your audio's encoding format
+    sample_rate=16000,       # Must match your audio's sample rate
+)
+# Send audio chunks (streaming approach)
+for chunk in audio_chunks:
+    ws.send(chunk)
+# Finalize and close
+ws.send("finalize")
+ws.send("done")
+# Receive transcription results
+for result in ws.receive():
+    if result['type'] == 'transcript':
+        print(f"Transcription: {result['text']}")
+        if result['is_final']:
+            print("Final result received")
+    elif result['type'] == 'done':
+        break
+ws.close()
+```
+### Async Streaming Speech-to-Text (STT) with Websockets
+For real-time streaming applications, here's a more practical async example that demonstrates concurrent audio processing and result handling:
+```python
+import asyncio
+import os
+from cartesia import AsyncCartesia
+async def streaming_stt_example():
+    """
+    Advanced async STT example for real-time streaming applications.
+    This example simulates streaming audio processing with proper error handling.
+    """
+    client = AsyncCartesia(api_key=os.getenv("CARTESIA_API_KEY"))
+    try:
+        # Create websocket connection
+        ws = await client.stt.websocket(
+            model="ink-whisper",
+            language="en",           # Must match the language of your audio
+            encoding="pcm_s16le",    # Must match your audio's encoding format
+            sample_rate=16000,       # Must match your audio's sample rate
+        )
+        # Simulate streaming audio data (replace with your audio source)
+        async def audio_stream():
+            """Simulate real-time audio streaming - replace with actual audio capture"""
+            # Load audio file for simulation
+            with open("path/to/audio.wav", "rb") as f:
+                audio_data = f.read()
+            # Stream in 100ms chunks (realistic for real-time processing)
+            chunk_size = int(16000 * 0.1 * 2)  # 100ms at 16kHz, 16-bit
+            for i in range(0, len(audio_data), chunk_size):
+                chunk = audio_data[i:i + chunk_size]
+                if chunk:
+                    yield chunk
+                    # Simulate real-time streaming delay
+                    await asyncio.sleep(0.1)
+        # Send audio and receive results concurrently
+        async def send_audio():
+            """Send audio chunks to the STT websocket"""
+            try:
+                async for chunk in audio_stream():
+                    await ws.send(chunk)
+                    print(f"Sent audio chunk of {len(chunk)} bytes")
+                    # Small delay to simulate realtime applications
+                    await asyncio.sleep(0.02)
+                # Signal end of audio stream
+                await ws.send("finalize")
+                await ws.send("done")
+                print("Audio streaming completed")
+            except Exception as e:
+                print(f"Error sending audio: {e}")
+        async def receive_transcripts():
+            """Receive and process transcription results"""
+            full_transcript = ""
+            try:
+                async for result in ws.receive():
+                    if result['type'] == 'transcript':
+                        text = result['text']
+                        is_final = result['is_final']
+                        if is_final:
+                            # Final result - this text won't change
+                            full_transcript += text + " "
+                            print(f"FINAL: {text}")
+                        else:
+                            # Partial result - may change as more audio is processed
+                            print(f"PARTIAL: {text}")
+                    elif result['type'] == 'done':
+                        print("Transcription completed")
+                        break
+            except Exception as e:
+                print(f"Error receiving transcripts: {e}")
+            return full_transcript.strip()
+        print("Starting streaming STT...")
+        # Use asyncio.gather to run audio sending and transcript receiving concurrently
+        _, final_transcript = await asyncio.gather(
+            send_audio(),
+            receive_transcripts()
+        )
+        print(f"\nComplete transcript: {final_transcript}")
+        # Clean up
+        await ws.close()
+    except Exception as e:
+        print(f"STT streaming error: {e}")
+    finally:
+        await client.close()
+# Run the example
+if __name__ == "__main__":
+    asyncio.run(streaming_stt_example())
+```
 ## Voices
 List all available Voices with `client.voices.list`, which returns an iterable that automatically handles pagination:
@@ -326,7 +481,6 @@ new_voice = client.voices.create(
     language="en"
 )
 ```
 ### Custom Client
 You can override the `httpx` client to customize it for your use-case. Some common use-cases include support for proxies
@@ -379,3 +533,6 @@ $ git commit --amend -m "manually regenerate from docs" # optional
 ### Automatically generating new SDK releases
 From https://github.com/cartesia-ai/docs click `Actions` then `Release Python SDK`. (Requires permissions.)

{cartesia-2.0.3 → cartesia-2.0.5}/pyproject.toml RENAMED Viewed

@@ -3,7 +3,7 @@ name = "cartesia"
 [tool.poetry]
 name = "cartesia"
-version = "2.0.3"
+version = "2.0.5"
 description = ""
 readme = "README.md"
 authors = []

{cartesia-2.0.3 → cartesia-2.0.5}/src/cartesia/__init__.py RENAMED Viewed

@@ -1,6 +1,6 @@
 # This file was auto-generated by Fern from our API Definition.
-from . import api_status, auth, datasets, embedding, infill, tts, voice_changer, voices
+from . import api_status, auth, datasets, embedding, infill, stt, tts, voice_changer, voices
 from .api_status import ApiInfo, ApiInfoParams
 from .auth import TokenGrant, TokenGrantParams, TokenRequest, TokenRequestParams, TokenResponse, TokenResponseParams
 from .client import AsyncCartesia, Cartesia
@@ -19,6 +19,29 @@ from .datasets import (
 )
 from .embedding import Embedding
 from .environment import CartesiaEnvironment
+from .stt import (
+    DoneMessage,
+    DoneMessageParams,
+    ErrorMessage,
+    ErrorMessageParams,
+    FlushDoneMessage,
+    FlushDoneMessageParams,
+    StreamingTranscriptionResponse,
+    StreamingTranscriptionResponseParams,
+    StreamingTranscriptionResponse_Done,
+    StreamingTranscriptionResponse_DoneParams,
+    StreamingTranscriptionResponse_Error,
+    StreamingTranscriptionResponse_ErrorParams,
+    StreamingTranscriptionResponse_FlushDone,
+    StreamingTranscriptionResponse_FlushDoneParams,
+    StreamingTranscriptionResponse_Transcript,
+    StreamingTranscriptionResponse_TranscriptParams,
+    SttEncoding,
+    TranscriptMessage,
+    TranscriptMessageParams,
+    TranscriptionResponse,
+    TranscriptionResponseParams,
+)
 from .tts import (
     CancelContextRequest,
     CancelContextRequestParams,
@@ -173,13 +196,19 @@ __all__ = [
     "DatasetFile",
     "DatasetFileParams",
     "DatasetParams",
+    "DoneMessage",
+    "DoneMessageParams",
     "Embedding",
     "EmbeddingResponse",
     "EmbeddingResponseParams",
     "EmbeddingSpecifier",
     "EmbeddingSpecifierParams",
     "Emotion",
+    "ErrorMessage",
+    "ErrorMessageParams",
     "FilePurpose",
+    "FlushDoneMessage",
+    "FlushDoneMessageParams",
     "FlushId",
     "Gender",
     "GenderPresentation",
@@ -235,6 +264,17 @@ __all__ = [
     "StreamingResponse_DoneParams",
     "StreamingResponse_Error",
     "StreamingResponse_ErrorParams",
+    "StreamingTranscriptionResponse",
+    "StreamingTranscriptionResponseParams",
+    "StreamingTranscriptionResponse_Done",
+    "StreamingTranscriptionResponse_DoneParams",
+    "StreamingTranscriptionResponse_Error",
+    "StreamingTranscriptionResponse_ErrorParams",
+    "StreamingTranscriptionResponse_FlushDone",
+    "StreamingTranscriptionResponse_FlushDoneParams",
+    "StreamingTranscriptionResponse_Transcript",
+    "StreamingTranscriptionResponse_TranscriptParams",
+    "SttEncoding",
     "SupportedLanguage",
     "TokenGrant",
     "TokenGrantParams",
@@ -242,6 +282,10 @@ __all__ = [
     "TokenRequestParams",
     "TokenResponse",
     "TokenResponseParams",
+    "TranscriptMessage",
+    "TranscriptMessageParams",
+    "TranscriptionResponse",
+    "TranscriptionResponseParams",
     "TtsRequest",
     "TtsRequestEmbeddingSpecifier",
     "TtsRequestEmbeddingSpecifierParams",
@@ -307,6 +351,7 @@ __all__ = [
     "datasets",
     "embedding",
     "infill",
+    "stt",
     "tts",
     "voice_changer",
     "voices",

{cartesia-2.0.3 → cartesia-2.0.5}/src/cartesia/base_client.py RENAMED Viewed

@@ -7,6 +7,7 @@ from .core.client_wrapper import SyncClientWrapper
 from .api_status.client import ApiStatusClient
 from .auth.client import AuthClient
 from .infill.client import InfillClient
+from .stt.socket_client import AsyncSttClientWithWebsocket, SttClientWithWebsocket
 from .tts.client import TtsClient
 from .voice_changer.client import VoiceChangerClient
 from .voices.client import VoicesClient
@@ -80,6 +81,7 @@ class BaseCartesia:
         self.api_status = ApiStatusClient(client_wrapper=self._client_wrapper)
         self.auth = AuthClient(client_wrapper=self._client_wrapper)
         self.infill = InfillClient(client_wrapper=self._client_wrapper)
+        self.stt = SttClientWithWebsocket(client_wrapper=self._client_wrapper)
         self.tts = TtsClient(client_wrapper=self._client_wrapper)
         self.voice_changer = VoiceChangerClient(client_wrapper=self._client_wrapper)
         self.voices = VoicesClient(client_wrapper=self._client_wrapper)

{cartesia-2.0.3 → cartesia-2.0.5}/src/cartesia/client.py RENAMED Viewed

@@ -8,6 +8,7 @@ import httpx
 from .base_client import AsyncBaseCartesia, BaseCartesia
 from .environment import CartesiaEnvironment
+from .stt.socket_client import AsyncSttClientWithWebsocket, SttClientWithWebsocket
 from .tts.socket_client import AsyncTtsClientWithWebsocket, TtsClientWithWebsocket
@@ -66,6 +67,7 @@ class Cartesia(BaseCartesia):
             follow_redirects=follow_redirects,
             httpx_client=httpx_client,
         )
+        self.stt = SttClientWithWebsocket(client_wrapper=self._client_wrapper)
         self.tts = TtsClientWithWebsocket(client_wrapper=self._client_wrapper)
     def __enter__(self):
@@ -143,6 +145,9 @@ class AsyncCartesia(AsyncBaseCartesia):
         self._session = None
         self._loop = None
         self.max_num_connections = max_num_connections
+        self.stt = AsyncSttClientWithWebsocket(
+            client_wrapper=self._client_wrapper, get_session=self._get_session
+        )
         self.tts = AsyncTtsClientWithWebsocket(
             client_wrapper=self._client_wrapper, get_session=self._get_session
         )

{cartesia-2.0.3 → cartesia-2.0.5}/src/cartesia/core/client_wrapper.py RENAMED Viewed

@@ -16,7 +16,7 @@ class BaseClientWrapper:
         headers: typing.Dict[str, str] = {
             "X-Fern-Language": "Python",
             "X-Fern-SDK-Name": "cartesia",
-            "X-Fern-SDK-Version": "2.0.3",
+            "X-Fern-SDK-Version": "2.0.5",
         }
         headers["X-API-Key"] = self.api_key
         headers["Cartesia-Version"] = "2024-11-13"

cartesia-2.0.5/src/cartesia/stt/__init__.py ADDED Viewed

@@ -0,0 +1,51 @@
+# This file was auto-generated by Fern from our API Definition.
+from .types import (
+    DoneMessage,
+    ErrorMessage,
+    FlushDoneMessage,
+    StreamingTranscriptionResponse,
+    StreamingTranscriptionResponse_Done,
+    StreamingTranscriptionResponse_Error,
+    StreamingTranscriptionResponse_FlushDone,
+    StreamingTranscriptionResponse_Transcript,
+    SttEncoding,
+    TranscriptMessage,
+    TranscriptionResponse,
+)
+from .requests import (
+    DoneMessageParams,
+    ErrorMessageParams,
+    FlushDoneMessageParams,
+    StreamingTranscriptionResponseParams,
+    StreamingTranscriptionResponse_DoneParams,
+    StreamingTranscriptionResponse_ErrorParams,
+    StreamingTranscriptionResponse_FlushDoneParams,
+    StreamingTranscriptionResponse_TranscriptParams,
+    TranscriptMessageParams,
+    TranscriptionResponseParams,
+)
+__all__ = [
+    "DoneMessage",
+    "DoneMessageParams",
+    "ErrorMessage",
+    "ErrorMessageParams",
+    "FlushDoneMessage",
+    "FlushDoneMessageParams",
+    "StreamingTranscriptionResponse",
+    "StreamingTranscriptionResponseParams",
+    "StreamingTranscriptionResponse_Done",
+    "StreamingTranscriptionResponse_DoneParams",
+    "StreamingTranscriptionResponse_Error",
+    "StreamingTranscriptionResponse_ErrorParams",
+    "StreamingTranscriptionResponse_FlushDone",
+    "StreamingTranscriptionResponse_FlushDoneParams",
+    "StreamingTranscriptionResponse_Transcript",
+    "StreamingTranscriptionResponse_TranscriptParams",
+    "SttEncoding",
+    "TranscriptMessage",
+    "TranscriptMessageParams",
+    "TranscriptionResponse",
+    "TranscriptionResponseParams",
+]

cartesia 2.0.3__tar.gz → 2.0.5__tar.gz

cartesia 2.0.3.tar.gz → 2.0.5.tar.gz