PyPI - cartesia - Versions diffs - 2.0.5__tar.gz → 2.0.6__tar.gz - Mend

cartesia 2.0.5__tar.gz → 2.0.6__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (204) hide show

{cartesia-2.0.5 → cartesia-2.0.6}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: cartesia
-Version: 2.0.5
+Version: 2.0.6
 Summary:
 Requires-Python: >=3.8,<4.0
 Classifier: Intended Audience :: Developers
@@ -230,12 +230,14 @@ with open("path/to/audio.wav", "rb") as f:
 chunk_size = 640
 audio_chunks = [audio_data[i:i+chunk_size] for i in range(0, len(audio_data), chunk_size)]
-# Create websocket connection
+# Create websocket connection with endpointing parameters
 ws = client.stt.websocket(
-    model="ink-whisper",
-    language="en",           # Must match the language of your audio
-    encoding="pcm_s16le",    # Must match your audio's encoding format
-    sample_rate=16000,       # Must match your audio's sample rate
+    model="ink-whisper",                 # Model (required)
+    language="en",                       # Language of your audio (required)
+    encoding="pcm_s16le",                # Audio encoding format (required)
+    sample_rate=16000,                   # Audio sample rate (required)
+    min_volume=0.1,                      # Volume threshold for voice activity detection
+    max_silence_duration_secs=0.4,       # Maximum silence duration before endpointing
 )
 # Send audio chunks (streaming approach)
@@ -246,10 +248,20 @@ for chunk in audio_chunks:
 ws.send("finalize")
 ws.send("done")
-# Receive transcription results
+# Receive transcription results with word-level timestamps
 for result in ws.receive():
     if result['type'] == 'transcript':
         print(f"Transcription: {result['text']}")
+        # Handle word-level timestamps if available
+        if 'words' in result and result['words']:
+            print("Word-level timestamps:")
+            for word_info in result['words']:
+                word = word_info['word']
+                start = word_info['start']
+                end = word_info['end']
+                print(f"  '{word}': {start:.2f}s - {end:.2f}s")
         if result['is_final']:
             print("Final result received")
     elif result['type'] == 'done':
@@ -270,17 +282,20 @@ from cartesia import AsyncCartesia
 async def streaming_stt_example():
     """
     Advanced async STT example for real-time streaming applications.
-    This example simulates streaming audio processing with proper error handling.
+    This example simulates streaming audio processing with proper error handling
+    and demonstrates the new endpointing and word timestamp features.
     """
     client = AsyncCartesia(api_key=os.getenv("CARTESIA_API_KEY"))
     try:
-        # Create websocket connection
+        # Create websocket connection with voice activity detection
         ws = await client.stt.websocket(
-            model="ink-whisper",
-            language="en",           # Must match the language of your audio
-            encoding="pcm_s16le",    # Must match your audio's encoding format
-            sample_rate=16000,       # Must match your audio's sample rate
+            model="ink-whisper",             # Model (required)
+            language="en",                   # Language of your audio (required)
+            encoding="pcm_s16le",            # Audio encoding format (required)
+            sample_rate=16000,               # Audio sample rate (required)
+            min_volume=0.15,                 # Volume threshold for voice activity detection
+            max_silence_duration_secs=0.3,   # Maximum silence duration before endpointing
         )
         # Simulate streaming audio data (replace with your audio source)
@@ -319,8 +334,9 @@ async def streaming_stt_example():
                 print(f"Error sending audio: {e}")
         async def receive_transcripts():
-            """Receive and process transcription results"""
+            """Receive and process transcription results with word timestamps"""
             full_transcript = ""
+            all_word_timestamps = []
             try:
                 async for result in ws.receive():
@@ -328,6 +344,19 @@ async def streaming_stt_example():
                         text = result['text']
                         is_final = result['is_final']
+                        # Handle word-level timestamps
+                        if 'words' in result and result['words']:
+                            word_timestamps = result['words']
+                            all_word_timestamps.extend(word_timestamps)
+                            if is_final:
+                                print("Word-level timestamps:")
+                                for word_info in word_timestamps:
+                                    word = word_info['word']
+                                    start = word_info['start']
+                                    end = word_info['end']
+                                    print(f"  '{word}': {start:.2f}s - {end:.2f}s")
                         if is_final:
                             # Final result - this text won't change
                             full_transcript += text + " "
@@ -343,17 +372,18 @@ async def streaming_stt_example():
             except Exception as e:
                 print(f"Error receiving transcripts: {e}")
-            return full_transcript.strip()
+            return full_transcript.strip(), all_word_timestamps
         print("Starting streaming STT...")
         # Use asyncio.gather to run audio sending and transcript receiving concurrently
-        _, final_transcript = await asyncio.gather(
+        _, (final_transcript, word_timestamps) = await asyncio.gather(
             send_audio(),
             receive_transcripts()
         )
         print(f"\nComplete transcript: {final_transcript}")
+        print(f"Total words with timestamps: {len(word_timestamps)}")
         # Clean up
         await ws.close()
@@ -368,6 +398,73 @@ if __name__ == "__main__":
     asyncio.run(streaming_stt_example())
 ```
+## Batch Speech-to-Text (STT)
+For processing pre-recorded audio files, use the batch STT API which supports uploading complete audio files for transcription:
+```python
+from cartesia import Cartesia
+import os
+client = Cartesia(api_key=os.getenv("CARTESIA_API_KEY"))
+# Transcribe an audio file with word-level timestamps
+with open("path/to/audio.wav", "rb") as audio_file:
+    response = client.stt.transcribe(
+        file=audio_file,                    # Audio file to transcribe
+        model="ink-whisper",                # STT model (required)
+        language="en",                      # Language of the audio (optional)
+        timestamp_granularities=["word"],   # Include word-level timestamps (optional)
+        encoding="pcm_s16le",               # Audio encoding (optional)
+        sample_rate=16000,                  # Audio sample rate (optional)
+    )
+# Access transcription results
+print(f"Transcribed text: {response.text}")
+print(f"Audio duration: {response.duration:.2f} seconds")
+# Process word-level timestamps if requested
+if response.words:
+    print("\nWord-level timestamps:")
+    for word_info in response.words:
+        word = word_info.word
+        start = word_info.start
+        end = word_info.end
+        print(f"  '{word}': {start:.2f}s - {end:.2f}s")
+```
+### Async Batch STT
+```python
+import asyncio
+from cartesia import AsyncCartesia
+import os
+async def transcribe_file():
+    client = AsyncCartesia(api_key=os.getenv("CARTESIA_API_KEY"))
+    with open("path/to/audio.wav", "rb") as audio_file:
+        response = await client.stt.transcribe(
+            file=audio_file,
+            model="ink-whisper",
+            language="en",
+            timestamp_granularities=["word"],
+        )
+    print(f"Transcribed text: {response.text}")
+    # Process word timestamps
+    if response.words:
+        for word_info in response.words:
+            print(f"'{word_info.word}': {word_info.start:.2f}s - {word_info.end:.2f}s")
+    await client.close()
+asyncio.run(transcribe_file())
+```
+> **Note:** Batch STT also supports OpenAI's audio transcriptions format for easy migration from OpenAI Whisper. See our [migration guide](https://docs.cartesia.ai/api-reference/stt/migrate-from-open-ai) for details.
 ## Voices
 List all available Voices with `client.voices.list`, which returns an iterable that automatically handles pagination:

{cartesia-2.0.5 → cartesia-2.0.6}/README.md RENAMED Viewed

@@ -198,12 +198,14 @@ with open("path/to/audio.wav", "rb") as f:
 chunk_size = 640
 audio_chunks = [audio_data[i:i+chunk_size] for i in range(0, len(audio_data), chunk_size)]
-# Create websocket connection
+# Create websocket connection with endpointing parameters
 ws = client.stt.websocket(
-    model="ink-whisper",
-    language="en",           # Must match the language of your audio
-    encoding="pcm_s16le",    # Must match your audio's encoding format
-    sample_rate=16000,       # Must match your audio's sample rate
+    model="ink-whisper",                 # Model (required)
+    language="en",                       # Language of your audio (required)
+    encoding="pcm_s16le",                # Audio encoding format (required)
+    sample_rate=16000,                   # Audio sample rate (required)
+    min_volume=0.1,                      # Volume threshold for voice activity detection
+    max_silence_duration_secs=0.4,       # Maximum silence duration before endpointing
 )
 # Send audio chunks (streaming approach)
@@ -214,10 +216,20 @@ for chunk in audio_chunks:
 ws.send("finalize")
 ws.send("done")
-# Receive transcription results
+# Receive transcription results with word-level timestamps
 for result in ws.receive():
     if result['type'] == 'transcript':
         print(f"Transcription: {result['text']}")
+        # Handle word-level timestamps if available
+        if 'words' in result and result['words']:
+            print("Word-level timestamps:")
+            for word_info in result['words']:
+                word = word_info['word']
+                start = word_info['start']
+                end = word_info['end']
+                print(f"  '{word}': {start:.2f}s - {end:.2f}s")
         if result['is_final']:
             print("Final result received")
     elif result['type'] == 'done':
@@ -238,17 +250,20 @@ from cartesia import AsyncCartesia
 async def streaming_stt_example():
     """
     Advanced async STT example for real-time streaming applications.
-    This example simulates streaming audio processing with proper error handling.
+    This example simulates streaming audio processing with proper error handling
+    and demonstrates the new endpointing and word timestamp features.
     """
     client = AsyncCartesia(api_key=os.getenv("CARTESIA_API_KEY"))
     try:
-        # Create websocket connection
+        # Create websocket connection with voice activity detection
         ws = await client.stt.websocket(
-            model="ink-whisper",
-            language="en",           # Must match the language of your audio
-            encoding="pcm_s16le",    # Must match your audio's encoding format
-            sample_rate=16000,       # Must match your audio's sample rate
+            model="ink-whisper",             # Model (required)
+            language="en",                   # Language of your audio (required)
+            encoding="pcm_s16le",            # Audio encoding format (required)
+            sample_rate=16000,               # Audio sample rate (required)
+            min_volume=0.15,                 # Volume threshold for voice activity detection
+            max_silence_duration_secs=0.3,   # Maximum silence duration before endpointing
         )
         # Simulate streaming audio data (replace with your audio source)
@@ -287,8 +302,9 @@ async def streaming_stt_example():
                 print(f"Error sending audio: {e}")
         async def receive_transcripts():
-            """Receive and process transcription results"""
+            """Receive and process transcription results with word timestamps"""
             full_transcript = ""
+            all_word_timestamps = []
             try:
                 async for result in ws.receive():
@@ -296,6 +312,19 @@ async def streaming_stt_example():
                         text = result['text']
                         is_final = result['is_final']
+                        # Handle word-level timestamps
+                        if 'words' in result and result['words']:
+                            word_timestamps = result['words']
+                            all_word_timestamps.extend(word_timestamps)
+                            if is_final:
+                                print("Word-level timestamps:")
+                                for word_info in word_timestamps:
+                                    word = word_info['word']
+                                    start = word_info['start']
+                                    end = word_info['end']
+                                    print(f"  '{word}': {start:.2f}s - {end:.2f}s")
                         if is_final:
                             # Final result - this text won't change
                             full_transcript += text + " "
@@ -311,17 +340,18 @@ async def streaming_stt_example():
             except Exception as e:
                 print(f"Error receiving transcripts: {e}")
-            return full_transcript.strip()
+            return full_transcript.strip(), all_word_timestamps
         print("Starting streaming STT...")
         # Use asyncio.gather to run audio sending and transcript receiving concurrently
-        _, final_transcript = await asyncio.gather(
+        _, (final_transcript, word_timestamps) = await asyncio.gather(
             send_audio(),
             receive_transcripts()
         )
         print(f"\nComplete transcript: {final_transcript}")
+        print(f"Total words with timestamps: {len(word_timestamps)}")
         # Clean up
         await ws.close()
@@ -336,6 +366,73 @@ if __name__ == "__main__":
     asyncio.run(streaming_stt_example())
 ```
+## Batch Speech-to-Text (STT)
+For processing pre-recorded audio files, use the batch STT API which supports uploading complete audio files for transcription:
+```python
+from cartesia import Cartesia
+import os
+client = Cartesia(api_key=os.getenv("CARTESIA_API_KEY"))
+# Transcribe an audio file with word-level timestamps
+with open("path/to/audio.wav", "rb") as audio_file:
+    response = client.stt.transcribe(
+        file=audio_file,                    # Audio file to transcribe
+        model="ink-whisper",                # STT model (required)
+        language="en",                      # Language of the audio (optional)
+        timestamp_granularities=["word"],   # Include word-level timestamps (optional)
+        encoding="pcm_s16le",               # Audio encoding (optional)
+        sample_rate=16000,                  # Audio sample rate (optional)
+    )
+# Access transcription results
+print(f"Transcribed text: {response.text}")
+print(f"Audio duration: {response.duration:.2f} seconds")
+# Process word-level timestamps if requested
+if response.words:
+    print("\nWord-level timestamps:")
+    for word_info in response.words:
+        word = word_info.word
+        start = word_info.start
+        end = word_info.end
+        print(f"  '{word}': {start:.2f}s - {end:.2f}s")
+```
+### Async Batch STT
+```python
+import asyncio
+from cartesia import AsyncCartesia
+import os
+async def transcribe_file():
+    client = AsyncCartesia(api_key=os.getenv("CARTESIA_API_KEY"))
+    with open("path/to/audio.wav", "rb") as audio_file:
+        response = await client.stt.transcribe(
+            file=audio_file,
+            model="ink-whisper",
+            language="en",
+            timestamp_granularities=["word"],
+        )
+    print(f"Transcribed text: {response.text}")
+    # Process word timestamps
+    if response.words:
+        for word_info in response.words:
+            print(f"'{word_info.word}': {word_info.start:.2f}s - {word_info.end:.2f}s")
+    await client.close()
+asyncio.run(transcribe_file())
+```
+> **Note:** Batch STT also supports OpenAI's audio transcriptions format for easy migration from OpenAI Whisper. See our [migration guide](https://docs.cartesia.ai/api-reference/stt/migrate-from-open-ai) for details.
 ## Voices
 List all available Voices with `client.voices.list`, which returns an iterable that automatically handles pagination:

{cartesia-2.0.5 → cartesia-2.0.6}/pyproject.toml RENAMED Viewed

@@ -3,7 +3,7 @@ name = "cartesia"
 [tool.poetry]
 name = "cartesia"
-version = "2.0.5"
+version = "2.0.6"
 description = ""
 readme = "README.md"
 authors = []

{cartesia-2.0.5 → cartesia-2.0.6}/src/cartesia/__init__.py RENAMED Viewed

@@ -37,10 +37,13 @@ from .stt import (
     StreamingTranscriptionResponse_Transcript,
     StreamingTranscriptionResponse_TranscriptParams,
     SttEncoding,
+    TimestampGranularity,
     TranscriptMessage,
     TranscriptMessageParams,
     TranscriptionResponse,
     TranscriptionResponseParams,
+    TranscriptionWord,
+    TranscriptionWordParams,
 )
 from .tts import (
     CancelContextRequest,
@@ -72,6 +75,8 @@ from .tts import (
     RawOutputFormatParams,
     Speed,
     SpeedParams,
+    SseOutputFormat,
+    SseOutputFormatParams,
     SupportedLanguage,
     TtsRequest,
     TtsRequestEmbeddingSpecifier,
@@ -81,6 +86,8 @@ from .tts import (
     TtsRequestParams,
     TtsRequestVoiceSpecifier,
     TtsRequestVoiceSpecifierParams,
+    TtssseRequest,
+    TtssseRequestParams,
     WavOutputFormat,
     WavOutputFormatParams,
     WebSocketBaseResponse,
@@ -256,6 +263,8 @@ __all__ = [
     "RawOutputFormatParams",
     "Speed",
     "SpeedParams",
+    "SseOutputFormat",
+    "SseOutputFormatParams",
     "StreamingResponse",
     "StreamingResponseParams",
     "StreamingResponse_Chunk",
@@ -276,6 +285,7 @@ __all__ = [
     "StreamingTranscriptionResponse_TranscriptParams",
     "SttEncoding",
     "SupportedLanguage",
+    "TimestampGranularity",
     "TokenGrant",
     "TokenGrantParams",
     "TokenRequest",
@@ -286,6 +296,8 @@ __all__ = [
     "TranscriptMessageParams",
     "TranscriptionResponse",
     "TranscriptionResponseParams",
+    "TranscriptionWord",
+    "TranscriptionWordParams",
     "TtsRequest",
     "TtsRequestEmbeddingSpecifier",
     "TtsRequestEmbeddingSpecifierParams",
@@ -294,6 +306,8 @@ __all__ = [
     "TtsRequestParams",
     "TtsRequestVoiceSpecifier",
     "TtsRequestVoiceSpecifierParams",
+    "TtssseRequest",
+    "TtssseRequestParams",
     "UpdateVoiceRequest",
     "UpdateVoiceRequestParams",
     "Voice",

{cartesia-2.0.5 → cartesia-2.0.6}/src/cartesia/auth/client.py RENAMED Viewed

@@ -22,7 +22,7 @@ class AuthClient:
     def access_token(
         self,
         *,
-        grants: TokenGrantParams,
+        grants: typing.Optional[TokenGrantParams] = OMIT,
         expires_in: typing.Optional[int] = OMIT,
         request_options: typing.Optional[RequestOptions] = None,
     ) -> TokenResponse:
@@ -31,8 +31,8 @@ class AuthClient:
         Parameters
         ----------
-        grants : TokenGrantParams
-            The permissions to be granted via the token.
+        grants : typing.Optional[TokenGrantParams]
+            The permissions to be granted via the token. Both TTS and STT grants are optional - specify only the capabilities you need.
         expires_in : typing.Optional[int]
             The number of seconds the token will be valid for since the time of generation. The maximum is 1 hour (3600 seconds).
@@ -52,7 +52,7 @@ class AuthClient:
             api_key="YOUR_API_KEY",
         )
         client.auth.access_token(
-            grants={"tts": True},
+            grants={"tts": True, "stt": True},
             expires_in=60,
         )
         """
@@ -90,7 +90,7 @@ class AsyncAuthClient:
     async def access_token(
         self,
         *,
-        grants: TokenGrantParams,
+        grants: typing.Optional[TokenGrantParams] = OMIT,
         expires_in: typing.Optional[int] = OMIT,
         request_options: typing.Optional[RequestOptions] = None,
     ) -> TokenResponse:
@@ -99,8 +99,8 @@ class AsyncAuthClient:
         Parameters
         ----------
-        grants : TokenGrantParams
-            The permissions to be granted via the token.
+        grants : typing.Optional[TokenGrantParams]
+            The permissions to be granted via the token. Both TTS and STT grants are optional - specify only the capabilities you need.
         expires_in : typing.Optional[int]
             The number of seconds the token will be valid for since the time of generation. The maximum is 1 hour (3600 seconds).
@@ -125,7 +125,7 @@ class AsyncAuthClient:
         async def main() -> None:
             await client.auth.access_token(
-                grants={"tts": True},
+                grants={"tts": True, "stt": True},
                 expires_in=60,
             )

{cartesia-2.0.5 → cartesia-2.0.6}/src/cartesia/auth/requests/token_grant.py RENAMED Viewed

@@ -1,10 +1,16 @@
 # This file was auto-generated by Fern from our API Definition.
 import typing_extensions
+import typing_extensions
 class TokenGrantParams(typing_extensions.TypedDict):
-    tts: bool
+    tts: typing_extensions.NotRequired[bool]
     """
     The `tts` grant allows the token to be used to access any TTS endpoint.
     """
+    stt: typing_extensions.NotRequired[bool]
+    """
+    The `stt` grant allows the token to be used to access any STT endpoint.
+    """

{cartesia-2.0.5 → cartesia-2.0.6}/src/cartesia/auth/requests/token_request.py RENAMED Viewed

@@ -1,14 +1,14 @@
 # This file was auto-generated by Fern from our API Definition.
 import typing_extensions
-from .token_grant import TokenGrantParams
 import typing_extensions
+from .token_grant import TokenGrantParams
 class TokenRequestParams(typing_extensions.TypedDict):
-    grants: TokenGrantParams
+    grants: typing_extensions.NotRequired[TokenGrantParams]
     """
-    The permissions to be granted via the token.
+    The permissions to be granted via the token. Both TTS and STT grants are optional - specify only the capabilities you need.
     """
     expires_in: typing_extensions.NotRequired[int]

{cartesia-2.0.5 → cartesia-2.0.6}/src/cartesia/auth/types/token_grant.py RENAMED Viewed

@@ -1,17 +1,22 @@
 # This file was auto-generated by Fern from our API Definition.
 from ...core.pydantic_utilities import UniversalBaseModel
+import typing
 import pydantic
 from ...core.pydantic_utilities import IS_PYDANTIC_V2
-import typing
 class TokenGrant(UniversalBaseModel):
-    tts: bool = pydantic.Field()
+    tts: typing.Optional[bool] = pydantic.Field(default=None)
     """
     The `tts` grant allows the token to be used to access any TTS endpoint.
     """
+    stt: typing.Optional[bool] = pydantic.Field(default=None)
+    """
+    The `stt` grant allows the token to be used to access any STT endpoint.
+    """
     if IS_PYDANTIC_V2:
         model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True)  # type: ignore # Pydantic v2
     else:

{cartesia-2.0.5 → cartesia-2.0.6}/src/cartesia/auth/types/token_request.py RENAMED Viewed

@@ -1,16 +1,16 @@
 # This file was auto-generated by Fern from our API Definition.
 from ...core.pydantic_utilities import UniversalBaseModel
+import typing
 from .token_grant import TokenGrant
 import pydantic
-import typing
 from ...core.pydantic_utilities import IS_PYDANTIC_V2
 class TokenRequest(UniversalBaseModel):
-    grants: TokenGrant = pydantic.Field()
+    grants: typing.Optional[TokenGrant] = pydantic.Field(default=None)
     """
-    The permissions to be granted via the token.
+    The permissions to be granted via the token. Both TTS and STT grants are optional - specify only the capabilities you need.
     """
     expires_in: typing.Optional[int] = pydantic.Field(default=None)

{cartesia-2.0.5 → cartesia-2.0.6}/src/cartesia/core/client_wrapper.py RENAMED Viewed

@@ -16,7 +16,7 @@ class BaseClientWrapper:
         headers: typing.Dict[str, str] = {
             "X-Fern-Language": "Python",
             "X-Fern-SDK-Name": "cartesia",
-            "X-Fern-SDK-Version": "2.0.5",
+            "X-Fern-SDK-Version": "2.0.6",
         }
         headers["X-API-Key"] = self.api_key
         headers["Cartesia-Version"] = "2024-11-13"

{cartesia-2.0.5 → cartesia-2.0.6}/src/cartesia/stt/__init__.py RENAMED Viewed

@@ -10,8 +10,10 @@ from .types import (
     StreamingTranscriptionResponse_FlushDone,
     StreamingTranscriptionResponse_Transcript,
     SttEncoding,
+    TimestampGranularity,
     TranscriptMessage,
     TranscriptionResponse,
+    TranscriptionWord,
 )
 from .requests import (
     DoneMessageParams,
@@ -24,6 +26,7 @@ from .requests import (
     StreamingTranscriptionResponse_TranscriptParams,
     TranscriptMessageParams,
     TranscriptionResponseParams,
+    TranscriptionWordParams,
 )
 __all__ = [
@@ -44,8 +47,11 @@ __all__ = [
     "StreamingTranscriptionResponse_Transcript",
     "StreamingTranscriptionResponse_TranscriptParams",
     "SttEncoding",
+    "TimestampGranularity",
     "TranscriptMessage",
     "TranscriptMessageParams",
     "TranscriptionResponse",
     "TranscriptionResponseParams",
+    "TranscriptionWord",
+    "TranscriptionWordParams",
 ]

cartesia 2.0.5__tar.gz → 2.0.6__tar.gz

cartesia 2.0.5__tar.gz → 2.0.6__tar.gz