cartesia 2.0.5__py3-none-any.whl → 2.0.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45)
  1. cartesia/__init__.py +14 -0
  2. cartesia/auth/client.py +8 -8
  3. cartesia/auth/requests/token_grant.py +7 -1
  4. cartesia/auth/requests/token_request.py +3 -3
  5. cartesia/auth/types/token_grant.py +7 -2
  6. cartesia/auth/types/token_request.py +3 -3
  7. cartesia/core/client_wrapper.py +1 -1
  8. cartesia/stt/__init__.py +6 -0
  9. cartesia/stt/_async_websocket.py +81 -72
  10. cartesia/stt/_websocket.py +42 -20
  11. cartesia/stt/client.py +456 -0
  12. cartesia/stt/requests/__init__.py +2 -0
  13. cartesia/stt/requests/streaming_transcription_response.py +2 -0
  14. cartesia/stt/requests/transcript_message.py +8 -1
  15. cartesia/stt/requests/transcription_response.py +8 -1
  16. cartesia/stt/requests/transcription_word.py +20 -0
  17. cartesia/stt/socket_client.py +52 -109
  18. cartesia/stt/types/__init__.py +4 -0
  19. cartesia/stt/types/streaming_transcription_response.py +2 -0
  20. cartesia/stt/types/stt_encoding.py +3 -1
  21. cartesia/stt/types/timestamp_granularity.py +5 -0
  22. cartesia/stt/types/transcript_message.py +7 -1
  23. cartesia/stt/types/transcription_response.py +7 -1
  24. cartesia/stt/types/transcription_word.py +32 -0
  25. cartesia/tts/__init__.py +8 -0
  26. cartesia/tts/client.py +50 -8
  27. cartesia/tts/requests/__init__.py +4 -0
  28. cartesia/tts/requests/generation_request.py +4 -4
  29. cartesia/tts/requests/sse_output_format.py +11 -0
  30. cartesia/tts/requests/ttssse_request.py +47 -0
  31. cartesia/tts/requests/web_socket_chunk_response.py +0 -3
  32. cartesia/tts/requests/web_socket_response.py +1 -2
  33. cartesia/tts/requests/web_socket_tts_request.py +9 -1
  34. cartesia/tts/types/__init__.py +4 -0
  35. cartesia/tts/types/generation_request.py +4 -4
  36. cartesia/tts/types/sse_output_format.py +22 -0
  37. cartesia/tts/types/ttssse_request.py +58 -0
  38. cartesia/tts/types/web_socket_chunk_response.py +1 -3
  39. cartesia/tts/types/web_socket_response.py +1 -2
  40. cartesia/tts/types/web_socket_tts_request.py +11 -3
  41. cartesia/voice_changer/requests/streaming_response.py +0 -2
  42. cartesia/voice_changer/types/streaming_response.py +0 -2
  43. {cartesia-2.0.5.dist-info → cartesia-2.0.6.dist-info}/METADATA +113 -16
  44. {cartesia-2.0.5.dist-info → cartesia-2.0.6.dist-info}/RECORD +45 -37
  45. {cartesia-2.0.5.dist-info → cartesia-2.0.6.dist-info}/WHEEL +0 -0
cartesia/tts/requests/web_socket_response.py
@@ -4,8 +4,8 @@ from __future__ import annotations
  import typing_extensions
  import typing
  import typing_extensions
- from ..types.flush_id import FlushId
  from ..types.context_id import ContextId
+ from ..types.flush_id import FlushId
  from .word_timestamps import WordTimestampsParams
  from .phoneme_timestamps import PhonemeTimestampsParams
@@ -14,7 +14,6 @@ class WebSocketResponse_ChunkParams(typing_extensions.TypedDict):
      type: typing.Literal["chunk"]
      data: str
      step_time: float
-     flush_id: typing_extensions.NotRequired[FlushId]
      context_id: typing_extensions.NotRequired[ContextId]
      status_code: int
      done: bool
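
This request-side TypedDict, the pydantic `WebSocketResponse_Chunk` model later in the diff, and the voice-changer chunk types all drop `flush_id` from chunk payloads in 2.0.6. A minimal compatibility sketch for downstream code that used to read the field directly; the `handle_chunk` helper and the loose typing are hypothetical, not part of the SDK:

```python
from typing import Any, Dict, Optional

def handle_chunk(chunk: Dict[str, Any]) -> Optional[Any]:
    """Hypothetical consumer that tolerates both 2.0.5 and 2.0.6 chunk payloads."""
    # 2.0.5 chunks could carry flush_id; 2.0.6 chunks no longer do.
    # dict.get() returns None instead of raising KeyError when the key is absent.
    flush_id = chunk.get("flush_id")
    print(f"step_time={chunk['step_time']}, flush_id={flush_id}")
    return flush_id
```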
cartesia/tts/requests/web_socket_tts_request.py
@@ -20,8 +20,16 @@ class WebSocketTtsRequestParams(typing_extensions.TypedDict):
      duration: typing_extensions.NotRequired[int]
      language: typing_extensions.NotRequired[str]
      add_timestamps: typing_extensions.NotRequired[bool]
-     use_original_timestamps: typing_extensions.NotRequired[bool]
+     """
+     Whether to return word-level timestamps. If `false` (default), no word timestamps will be produced at all. If `true`, the server will return timestamp events containing word-level timing information.
+     """
+
      add_phoneme_timestamps: typing_extensions.NotRequired[bool]
+     """
+     Whether to return phoneme-level timestamps. If `false` (default), no phoneme timestamps will be produced - if `add_timestamps` is `true`, the produced timestamps will be word timestamps instead. If `true`, the server will return timestamp events containing phoneme-level timing information.
+     """
+
+     use_normalized_timestamps: typing_extensions.NotRequired[bool]
      continue_: typing_extensions.NotRequired[typing_extensions.Annotated[bool, FieldMetadata(alias="continue")]]
      context_id: typing_extensions.NotRequired[str]
      max_buffer_delay_ms: typing_extensions.NotRequired[int]
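
A short sketch of the updated params in use. TypedDicts are ordinary dicts at runtime, so this is just a literal; the `model_id` and `transcript` keys are assumed from the untouched top of the class (not shown in this hunk), and the IDs are placeholders:

```python
from cartesia.tts.requests.web_socket_tts_request import WebSocketTtsRequestParams

request: WebSocketTtsRequestParams = {
    "model_id": "sonic-2",                           # placeholder model ID
    "transcript": "Hello from 2.0.6!",
    "voice": {"mode": "id", "id": "YOUR_VOICE_ID"},  # placeholder voice specifier
    "add_timestamps": True,              # emit word-level timestamp events
    "add_phoneme_timestamps": False,     # skip phoneme-level timing
    "use_normalized_timestamps": False,  # replaces 2.0.5's use_original_timestamps
}
```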
cartesia/tts/types/__init__.py
@@ -15,11 +15,13 @@ from .phoneme_timestamps import PhonemeTimestamps
  from .raw_encoding import RawEncoding
  from .raw_output_format import RawOutputFormat
  from .speed import Speed
+ from .sse_output_format import SseOutputFormat
  from .supported_language import SupportedLanguage
  from .tts_request import TtsRequest
  from .tts_request_embedding_specifier import TtsRequestEmbeddingSpecifier
  from .tts_request_id_specifier import TtsRequestIdSpecifier
  from .tts_request_voice_specifier import TtsRequestVoiceSpecifier
+ from .ttssse_request import TtssseRequest
  from .wav_output_format import WavOutputFormat
  from .web_socket_base_response import WebSocketBaseResponse
  from .web_socket_chunk_response import WebSocketChunkResponse
@@ -63,11 +65,13 @@ __all__ = [
      "RawEncoding",
      "RawOutputFormat",
      "Speed",
+     "SseOutputFormat",
      "SupportedLanguage",
      "TtsRequest",
      "TtsRequestEmbeddingSpecifier",
      "TtsRequestIdSpecifier",
      "TtsRequestVoiceSpecifier",
+     "TtssseRequest",
      "WavOutputFormat",
      "WebSocketBaseResponse",
      "WebSocketChunkResponse",
cartesia/tts/types/generation_request.py
@@ -59,17 +59,17 @@ class GenerationRequest(UniversalBaseModel):

      add_timestamps: typing.Optional[bool] = pydantic.Field(default=None)
      """
-     Whether to return word-level timestamps.
+     Whether to return word-level timestamps. If `false` (default), no word timestamps will be produced at all. If `true`, the server will return timestamp events containing word-level timing information.
      """

      add_phoneme_timestamps: typing.Optional[bool] = pydantic.Field(default=None)
      """
-     Whether to return phoneme-level timestamps.
+     Whether to return phoneme-level timestamps. If `false` (default), no phoneme timestamps will be produced. If `true`, the server will return timestamp events containing phoneme-level timing information.
      """

-     use_original_timestamps: typing.Optional[bool] = pydantic.Field(default=None)
+     use_normalized_timestamps: typing.Optional[bool] = pydantic.Field(default=None)
      """
-     Whether to use the original transcript for timestamps.
+     Whether to use normalized timestamps (True) or original timestamps (False).
      """

      if IS_PYDANTIC_V2:
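
Note the polarity flip hiding in this rename: `use_original_timestamps=True` opted into original-transcript timestamps, whereas original timestamps are now the `False` case of `use_normalized_timestamps`. A hedged migration sketch; the equivalence is inferred from the two docstrings, not stated anywhere in this diff:

```python
def migrate_timestamp_flag(use_original_timestamps: bool) -> bool:
    """Map the 2.0.5 flag onto the 2.0.6 flag (inferred equivalence).

    2.0.5: use_original_timestamps=True    -> original timestamps
    2.0.6: use_normalized_timestamps=False -> original timestamps
    """
    return not use_original_timestamps

# A caller that passed use_original_timestamps=True would now pass
# use_normalized_timestamps=migrate_timestamp_flag(True), i.e. False.
```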
cartesia/tts/types/sse_output_format.py (new file)
@@ -0,0 +1,22 @@
+ # This file was auto-generated by Fern from our API Definition.
+
+ from ...core.pydantic_utilities import UniversalBaseModel
+ import typing
+ from .raw_encoding import RawEncoding
+ from ...core.pydantic_utilities import IS_PYDANTIC_V2
+ import pydantic
+
+
+ class SseOutputFormat(UniversalBaseModel):
+     container: typing.Literal["raw"] = "raw"
+     encoding: RawEncoding
+     sample_rate: int
+
+     if IS_PYDANTIC_V2:
+         model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True)  # type: ignore # Pydantic v2
+     else:
+
+         class Config:
+             frozen = True
+             smart_union = True
+             extra = pydantic.Extra.allow
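
A quick sketch of the new model in use. `SseOutputFormat` is exported from `cartesia.tts.types` per the `__init__.py` hunk above; the field values below are placeholder choices, with `pcm_s16le` assumed to be one of the existing `RawEncoding` literals:

```python
from cartesia.tts.types import SseOutputFormat

# SSE output is pinned to a raw container; only encoding and sample_rate vary.
fmt = SseOutputFormat(
    encoding="pcm_s16le",  # assumed RawEncoding literal: 16-bit little-endian PCM
    sample_rate=16000,     # placeholder; match your playback pipeline
)
print(fmt.container)  # "raw", fixed by the Literal[...] default
```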
cartesia/tts/types/ttssse_request.py (new file)
@@ -0,0 +1,58 @@
+ # This file was auto-generated by Fern from our API Definition.
+
+ from ...core.pydantic_utilities import UniversalBaseModel
+ import pydantic
+ from .tts_request_voice_specifier import TtsRequestVoiceSpecifier
+ import typing
+ from .supported_language import SupportedLanguage
+ from .sse_output_format import SseOutputFormat
+ from .model_speed import ModelSpeed
+ from .context_id import ContextId
+ from ...core.pydantic_utilities import IS_PYDANTIC_V2
+
+
+ class TtssseRequest(UniversalBaseModel):
+     model_id: str = pydantic.Field()
+     """
+     The ID of the model to use for the generation. See [Models](/build-with-cartesia/models) for available models.
+     """
+
+     transcript: str
+     voice: TtsRequestVoiceSpecifier
+     language: typing.Optional[SupportedLanguage] = None
+     output_format: SseOutputFormat
+     duration: typing.Optional[float] = pydantic.Field(default=None)
+     """
+     The maximum duration of the audio in seconds. You do not usually need to specify this.
+     If the duration is not appropriate for the length of the transcript, the output audio may be truncated.
+     """
+
+     speed: typing.Optional[ModelSpeed] = None
+     add_timestamps: typing.Optional[bool] = pydantic.Field(default=None)
+     """
+     Whether to return word-level timestamps. If `false` (default), no word timestamps will be produced at all. If `true`, the server will return timestamp events containing word-level timing information.
+     """
+
+     add_phoneme_timestamps: typing.Optional[bool] = pydantic.Field(default=None)
+     """
+     Whether to return phoneme-level timestamps. If `false` (default), no phoneme timestamps will be produced - if `add_timestamps` is `true`, the produced timestamps will be word timestamps instead. If `true`, the server will return timestamp events containing phoneme-level timing information.
+     """
+
+     use_normalized_timestamps: typing.Optional[bool] = pydantic.Field(default=None)
+     """
+     Whether to use normalized timestamps (True) or original timestamps (False).
+     """
+
+     context_id: typing.Optional[ContextId] = pydantic.Field(default=None)
+     """
+     Optional context ID for this request.
+     """
+
+     if IS_PYDANTIC_V2:
+         model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True)  # type: ignore # Pydantic v2
+     else:
+
+         class Config:
+             frozen = True
+             smart_union = True
+             extra = pydantic.Extra.allow
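
Putting the two new models together, a sketch of building the SSE request body by hand. The model and voice IDs are placeholders, and letting pydantic coerce the plain `voice` dict into a `TtsRequestVoiceSpecifier` is an assumption; in practice the SDK's `client.tts.sse(...)` call is the usual entry point rather than constructing this model directly:

```python
from cartesia.tts.types import SseOutputFormat, TtssseRequest

request = TtssseRequest(
    model_id="sonic-2",                           # placeholder model ID
    transcript="Streaming speech over SSE.",
    voice={"mode": "id", "id": "YOUR_VOICE_ID"},  # placeholder; assumed dict-to-union coercion
    output_format=SseOutputFormat(encoding="pcm_s16le", sample_rate=16000),
    add_timestamps=True,              # word-level timestamp events
    use_normalized_timestamps=False,  # keep original-transcript timing
)
```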
cartesia/tts/types/web_socket_chunk_response.py
@@ -1,16 +1,14 @@
  # This file was auto-generated by Fern from our API Definition.

  from .web_socket_base_response import WebSocketBaseResponse
- import typing
- from .flush_id import FlushId
  from ...core.pydantic_utilities import IS_PYDANTIC_V2
+ import typing
  import pydantic


  class WebSocketChunkResponse(WebSocketBaseResponse):
      data: str
      step_time: float
-     flush_id: typing.Optional[FlushId] = None

      if IS_PYDANTIC_V2:
          model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True)  # type: ignore # Pydantic v2
cartesia/tts/types/web_socket_response.py
@@ -3,10 +3,10 @@
  from __future__ import annotations
  from ...core.pydantic_utilities import UniversalBaseModel
  import typing
- from .flush_id import FlushId
  from .context_id import ContextId
  from ...core.pydantic_utilities import IS_PYDANTIC_V2
  import pydantic
+ from .flush_id import FlushId
  from .word_timestamps import WordTimestamps
  from .phoneme_timestamps import PhonemeTimestamps
@@ -15,7 +15,6 @@ class WebSocketResponse_Chunk(UniversalBaseModel):
      type: typing.Literal["chunk"] = "chunk"
      data: str
      step_time: float
-     flush_id: typing.Optional[FlushId] = None
      context_id: typing.Optional[ContextId] = None
      status_code: int
      done: bool
cartesia/tts/types/web_socket_tts_request.py
@@ -22,9 +22,17 @@ class WebSocketTtsRequest(UniversalBaseModel):
      voice: TtsRequestVoiceSpecifier
      duration: typing.Optional[int] = None
      language: typing.Optional[str] = None
-     add_timestamps: typing.Optional[bool] = None
-     use_original_timestamps: typing.Optional[bool] = None
-     add_phoneme_timestamps: typing.Optional[bool] = None
+     add_timestamps: typing.Optional[bool] = pydantic.Field(default=None)
+     """
+     Whether to return word-level timestamps. If `false` (default), no word timestamps will be produced at all. If `true`, the server will return timestamp events containing word-level timing information.
+     """
+
+     add_phoneme_timestamps: typing.Optional[bool] = pydantic.Field(default=None)
+     """
+     Whether to return phoneme-level timestamps. If `false` (default), no phoneme timestamps will be produced - if `add_timestamps` is `true`, the produced timestamps will be word timestamps instead. If `true`, the server will return timestamp events containing phoneme-level timing information.
+     """
+
+     use_normalized_timestamps: typing.Optional[bool] = None
      continue_: typing_extensions.Annotated[typing.Optional[bool], FieldMetadata(alias="continue")] = None
      context_id: typing.Optional[str] = None
      max_buffer_delay_ms: typing.Optional[int] = None
cartesia/voice_changer/requests/streaming_response.py
@@ -4,7 +4,6 @@ from __future__ import annotations
  import typing_extensions
  import typing
  import typing_extensions
- from ...tts.types.flush_id import FlushId
  from ...tts.types.context_id import ContextId
@@ -12,7 +11,6 @@ class StreamingResponse_ChunkParams(typing_extensions.TypedDict):
      type: typing.Literal["chunk"]
      data: str
      step_time: float
-     flush_id: typing_extensions.NotRequired[FlushId]
      context_id: typing_extensions.NotRequired[ContextId]
      status_code: int
      done: bool
cartesia/voice_changer/types/streaming_response.py
@@ -3,7 +3,6 @@
  from __future__ import annotations
  from ...core.pydantic_utilities import UniversalBaseModel
  import typing
- from ...tts.types.flush_id import FlushId
  from ...tts.types.context_id import ContextId
  from ...core.pydantic_utilities import IS_PYDANTIC_V2
  import pydantic
@@ -13,7 +12,6 @@ class StreamingResponse_Chunk(UniversalBaseModel):
      type: typing.Literal["chunk"] = "chunk"
      data: str
      step_time: float
-     flush_id: typing.Optional[FlushId] = None
      context_id: typing.Optional[ContextId] = None
      status_code: int
      done: bool
{cartesia-2.0.5.dist-info → cartesia-2.0.6.dist-info}/METADATA
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: cartesia
- Version: 2.0.5
+ Version: 2.0.6
  Summary:
  Requires-Python: >=3.8,<4.0
  Classifier: Intended Audience :: Developers
@@ -230,12 +230,14 @@ with open("path/to/audio.wav", "rb") as f:
  chunk_size = 640
  audio_chunks = [audio_data[i:i+chunk_size] for i in range(0, len(audio_data), chunk_size)]

- # Create websocket connection
+ # Create websocket connection with endpointing parameters
  ws = client.stt.websocket(
-     model="ink-whisper",
-     language="en",  # Must match the language of your audio
-     encoding="pcm_s16le",  # Must match your audio's encoding format
-     sample_rate=16000,  # Must match your audio's sample rate
+     model="ink-whisper",  # Model (required)
+     language="en",  # Language of your audio (required)
+     encoding="pcm_s16le",  # Audio encoding format (required)
+     sample_rate=16000,  # Audio sample rate (required)
+     min_volume=0.1,  # Volume threshold for voice activity detection
+     max_silence_duration_secs=0.4,  # Maximum silence duration before endpointing
  )

  # Send audio chunks (streaming approach)
@@ -246,10 +248,20 @@ for chunk in audio_chunks:
  ws.send("finalize")
  ws.send("done")

- # Receive transcription results
+ # Receive transcription results with word-level timestamps
  for result in ws.receive():
      if result['type'] == 'transcript':
          print(f"Transcription: {result['text']}")
+
+         # Handle word-level timestamps if available
+         if 'words' in result and result['words']:
+             print("Word-level timestamps:")
+             for word_info in result['words']:
+                 word = word_info['word']
+                 start = word_info['start']
+                 end = word_info['end']
+                 print(f"  '{word}': {start:.2f}s - {end:.2f}s")
+
          if result['is_final']:
              print("Final result received")
      elif result['type'] == 'done':
@@ -270,17 +282,20 @@ from cartesia import AsyncCartesia
  async def streaming_stt_example():
      """
      Advanced async STT example for real-time streaming applications.
-     This example simulates streaming audio processing with proper error handling.
+     This example simulates streaming audio processing with proper error handling
+     and demonstrates the new endpointing and word timestamp features.
      """
      client = AsyncCartesia(api_key=os.getenv("CARTESIA_API_KEY"))

      try:
-         # Create websocket connection
+         # Create websocket connection with voice activity detection
          ws = await client.stt.websocket(
-             model="ink-whisper",
-             language="en",  # Must match the language of your audio
-             encoding="pcm_s16le",  # Must match your audio's encoding format
-             sample_rate=16000,  # Must match your audio's sample rate
+             model="ink-whisper",  # Model (required)
+             language="en",  # Language of your audio (required)
+             encoding="pcm_s16le",  # Audio encoding format (required)
+             sample_rate=16000,  # Audio sample rate (required)
+             min_volume=0.15,  # Volume threshold for voice activity detection
+             max_silence_duration_secs=0.3,  # Maximum silence duration before endpointing
          )

          # Simulate streaming audio data (replace with your audio source)
@@ -319,8 +334,9 @@ async def streaming_stt_example():
                  print(f"Error sending audio: {e}")

          async def receive_transcripts():
-             """Receive and process transcription results"""
+             """Receive and process transcription results with word timestamps"""
              full_transcript = ""
+             all_word_timestamps = []

              try:
                  async for result in ws.receive():
@@ -328,6 +344,19 @@ async def streaming_stt_example():
                          text = result['text']
                          is_final = result['is_final']

+                         # Handle word-level timestamps
+                         if 'words' in result and result['words']:
+                             word_timestamps = result['words']
+                             all_word_timestamps.extend(word_timestamps)
+
+                             if is_final:
+                                 print("Word-level timestamps:")
+                                 for word_info in word_timestamps:
+                                     word = word_info['word']
+                                     start = word_info['start']
+                                     end = word_info['end']
+                                     print(f"  '{word}': {start:.2f}s - {end:.2f}s")
+
                          if is_final:
                              # Final result - this text won't change
                              full_transcript += text + " "
@@ -343,17 +372,18 @@ async def streaming_stt_example():
              except Exception as e:
                  print(f"Error receiving transcripts: {e}")

-             return full_transcript.strip()
+             return full_transcript.strip(), all_word_timestamps

          print("Starting streaming STT...")

          # Use asyncio.gather to run audio sending and transcript receiving concurrently
-         _, final_transcript = await asyncio.gather(
+         _, (final_transcript, word_timestamps) = await asyncio.gather(
              send_audio(),
              receive_transcripts()
          )

          print(f"\nComplete transcript: {final_transcript}")
+         print(f"Total words with timestamps: {len(word_timestamps)}")

          # Clean up
          await ws.close()
@@ -368,6 +398,73 @@ if __name__ == "__main__":
      asyncio.run(streaming_stt_example())
  ```

+ ## Batch Speech-to-Text (STT)
+
+ For processing pre-recorded audio files, use the batch STT API which supports uploading complete audio files for transcription:
+
+ ```python
+ from cartesia import Cartesia
+ import os
+
+ client = Cartesia(api_key=os.getenv("CARTESIA_API_KEY"))
+
+ # Transcribe an audio file with word-level timestamps
+ with open("path/to/audio.wav", "rb") as audio_file:
+     response = client.stt.transcribe(
+         file=audio_file,  # Audio file to transcribe
+         model="ink-whisper",  # STT model (required)
+         language="en",  # Language of the audio (optional)
+         timestamp_granularities=["word"],  # Include word-level timestamps (optional)
+         encoding="pcm_s16le",  # Audio encoding (optional)
+         sample_rate=16000,  # Audio sample rate (optional)
+     )
+
+ # Access transcription results
+ print(f"Transcribed text: {response.text}")
+ print(f"Audio duration: {response.duration:.2f} seconds")
+
+ # Process word-level timestamps if requested
+ if response.words:
+     print("\nWord-level timestamps:")
+     for word_info in response.words:
+         word = word_info.word
+         start = word_info.start
+         end = word_info.end
+         print(f"  '{word}': {start:.2f}s - {end:.2f}s")
+ ```
+
+ ### Async Batch STT
+
+ ```python
+ import asyncio
+ from cartesia import AsyncCartesia
+ import os
+
+ async def transcribe_file():
+     client = AsyncCartesia(api_key=os.getenv("CARTESIA_API_KEY"))
+
+     with open("path/to/audio.wav", "rb") as audio_file:
+         response = await client.stt.transcribe(
+             file=audio_file,
+             model="ink-whisper",
+             language="en",
+             timestamp_granularities=["word"],
+         )
+
+     print(f"Transcribed text: {response.text}")
+
+     # Process word timestamps
+     if response.words:
+         for word_info in response.words:
+             print(f"'{word_info.word}': {word_info.start:.2f}s - {word_info.end:.2f}s")
+
+     await client.close()
+
+ asyncio.run(transcribe_file())
+ ```
+
+ > **Note:** Batch STT also supports OpenAI's audio transcriptions format for easy migration from OpenAI Whisper. See our [migration guide](https://docs.cartesia.ai/api-reference/stt/migrate-from-open-ai) for details.
+
  ## Voices

  List all available Voices with `client.voices.list`, which returns an iterable that automatically handles pagination:
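
The diff context cuts off here, before the README's own snippet. A minimal sketch of the iteration that sentence describes, assuming the pager yields voice objects with `id` and `name` attributes:

```python
from cartesia import Cartesia
import os

client = Cartesia(api_key=os.getenv("CARTESIA_API_KEY"))

# The returned pager fetches successive pages lazily as you iterate.
for voice in client.voices.list():
    print(voice.id, voice.name)
```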