PyPI - cartesia - Versions diffs - 2.0.11__tar.gz → 2.0.14__tar.gz - Mend

cartesia 2.0.11__tar.gz → 2.0.14__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of cartesia might be problematic. Click here for more details.

Files changed (207)

{cartesia-2.0.11 → cartesia-2.0.14}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: cartesia
-Version: 2.0.11
+Version: 2.0.14
 Summary:
 Requires-Python: >=3.8,<4.0
 Classifier: Intended Audience :: Developers
@@ -53,26 +53,36 @@ Instantiate and use the client with the following:
 ```python
 from cartesia import Cartesia
-from cartesia.tts import OutputFormat_Raw, TtsRequestIdSpecifier
 import os
 client = Cartesia(
-    api_key=os.getenv("CARTESIA_API_KEY"),
-)
-client.tts.bytes(
-    model_id="sonic-2",
-    transcript="Hello, world!",
-    voice={
-        "mode": "id",
-        "id": "694f9389-aac1-45b6-b726-9d9369183238",
-    },
-    language="en",
-    output_format={
-        "container": "raw",
-        "sample_rate": 44100,
-        "encoding": "pcm_f32le",
-    },
+    api_key=os.environ["CARTESIA_API_KEY"],
 )
+def main():
+    with open("sonic.wav", "wb") as f:
+        bytes_iter = client.tts.bytes(
+            model_id="sonic-3",
+            transcript="Hello, world!",
+            voice={
+                "mode": "id",
+                "id": "6ccbfb76-1fc6-48f7-b71d-91ac6298247b",
+            },
+            language="en",
+            output_format={
+                "container": "wav",
+                "sample_rate": 44100,
+                "encoding": "pcm_f32le",
+            },
+        )
+        for chunk in bytes_iter:
+            f.write(chunk)
+if __name__ == "__main__":
+    main()
 ```
 ## Async Client
@@ -81,31 +91,37 @@ The SDK also exports an `async` client so that you can make non-blocking calls t
 ```python
 import asyncio
-import os
 from cartesia import AsyncCartesia
-from cartesia.tts import OutputFormat_Raw, TtsRequestIdSpecifier
+import os
 client = AsyncCartesia(
-    api_key=os.getenv("CARTESIA_API_KEY"),
+    api_key=os.environ["CARTESIA_API_KEY"],
 )
-async def main() -> None:
-    async for output in client.tts.bytes(
-        model_id="sonic-2",
-        transcript="Hello, world!",
-        voice={"id": "694f9389-aac1-45b6-b726-9d9369183238"},
-        language="en",
-        output_format={
-            "container": "raw",
-            "sample_rate": 44100,
-            "encoding": "pcm_f32le",
-        },
-    ):
-        print(f"Received chunk of size: {len(output)}")
+async def main():
+    with open("sonic.wav", "wb") as f:
+        bytes_iter = client.tts.bytes(
+            model_id="sonic-3",
+            transcript="Hello, world!",
+            voice={
+                "mode": "id",
+                "id": "6ccbfb76-1fc6-48f7-b71d-91ac6298247b",
+            },
+            language="en",
+            output_format={
+                "container": "wav",
+                "sample_rate": 44100,
+                "encoding": "pcm_f32le",
+            },
+        )
+        async for chunk in bytes_iter:
+            f.write(chunk)
-asyncio.run(main())
+if __name__ == "__main__":
+    asyncio.run(main())
 ```
 ## Exception Handling
@@ -129,7 +145,6 @@ The SDK supports streaming responses as well, returning a generator that you can
 ```python
 from cartesia import Cartesia
-from cartesia.tts import Controls, OutputFormat_RawParams, TtsRequestIdSpecifierParams
 import os
 def get_tts_chunks():
@@ -137,14 +152,11 @@ def get_tts_chunks():
         api_key=os.getenv("CARTESIA_API_KEY"),
     )
     response = client.tts.sse(
-        model_id="sonic-2",
+        model_id="sonic-3",
         transcript="Hello world!",
         voice={
+            "mode": "id",
             "id": "f9836c6e-a0bd-460e-9d3c-f7299fa60f94",
-            "experimental_controls": {
-                "speed": "normal",
-                "emotion": [],
-            },
         },
         language="en",
         output_format={
@@ -188,9 +200,9 @@ ws = client.tts.websocket()
 # Generate and stream audio using the websocket
 for output in ws.send(
-    model_id="sonic-2", # see: https://docs.cartesia.ai/getting-started/available-models
+    model_id="sonic-3", # see: https://docs.cartesia.ai/build-with-cartesia/tts-models
     transcript=transcript,
-    voice={"id": voice_id},
+    voice={"mode": "id", "id": voice_id},
     stream=True,
     output_format={
         "container": "raw",
@@ -252,7 +264,7 @@ ws.send("done")
 for result in ws.receive():
     if result['type'] == 'transcript':
         print(f"Transcription: {result['text']}")
         # Handle word-level timestamps if available
         if 'words' in result and result['words']:
             print("Word-level timestamps:")
@@ -261,7 +273,7 @@ for result in ws.receive():
                 start = word_info['start']
                 end = word_info['end']
                 print(f"  '{word}': {start:.2f}s - {end:.2f}s")
         if result['is_final']:
             print("Final result received")
     elif result['type'] == 'done':
@@ -286,7 +298,7 @@ async def streaming_stt_example():
     and demonstrates the new endpointing and word timestamp features.
     """
     client = AsyncCartesia(api_key=os.getenv("CARTESIA_API_KEY"))
     try:
         # Create websocket connection with voice activity detection
         ws = await client.stt.websocket(
@@ -297,24 +309,24 @@ async def streaming_stt_example():
             min_volume=0.15,                 # Volume threshold for voice activity detection
             max_silence_duration_secs=0.3,   # Maximum silence duration before endpointing
         )
         # Simulate streaming audio data (replace with your audio source)
         async def audio_stream():
             """Simulate real-time audio streaming - replace with actual audio capture"""
             # Load audio file for simulation
             with open("path/to/audio.wav", "rb") as f:
                 audio_data = f.read()
             # Stream in 100ms chunks (realistic for real-time processing)
             chunk_size = int(16000 * 0.1 * 2)  # 100ms at 16kHz, 16-bit
             for i in range(0, len(audio_data), chunk_size):
                 chunk = audio_data[i:i + chunk_size]
                 if chunk:
                     yield chunk
                     # Simulate real-time streaming delay
                     await asyncio.sleep(0.1)
         # Send audio and receive results concurrently
         async def send_audio():
             """Send audio chunks to the STT websocket"""
@@ -324,31 +336,31 @@ async def streaming_stt_example():
                     print(f"Sent audio chunk of {len(chunk)} bytes")
                     # Small delay to simulate realtime applications
                     await asyncio.sleep(0.02)
                 # Signal end of audio stream
                 await ws.send("finalize")
                 await ws.send("done")
                 print("Audio streaming completed")
             except Exception as e:
                 print(f"Error sending audio: {e}")
         async def receive_transcripts():
             """Receive and process transcription results with word timestamps"""
             full_transcript = ""
             all_word_timestamps = []
             try:
                 async for result in ws.receive():
                     if result['type'] == 'transcript':
                         text = result['text']
                         is_final = result['is_final']
                         # Handle word-level timestamps
                         if 'words' in result and result['words']:
                             word_timestamps = result['words']
                             all_word_timestamps.extend(word_timestamps)
                             if is_final:
                                 print("Word-level timestamps:")
                                 for word_info in word_timestamps:
@@ -356,7 +368,7 @@ async def streaming_stt_example():
                                     start = word_info['start']
                                     end = word_info['end']
                                     print(f"  '{word}': {start:.2f}s - {end:.2f}s")
                         if is_final:
                             # Final result - this text won't change
                             full_transcript += text + " "
@@ -364,30 +376,30 @@ async def streaming_stt_example():
                         else:
                             # Partial result - may change as more audio is processed
                             print(f"PARTIAL: {text}")
                     elif result['type'] == 'done':
                         print("Transcription completed")
                         break
             except Exception as e:
                 print(f"Error receiving transcripts: {e}")
             return full_transcript.strip(), all_word_timestamps
         print("Starting streaming STT...")
         # Use asyncio.gather to run audio sending and transcript receiving concurrently
         _, (final_transcript, word_timestamps) = await asyncio.gather(
             send_audio(),
             receive_transcripts()
         )
         print(f"\nComplete transcript: {final_transcript}")
         print(f"Total words with timestamps: {len(word_timestamps)}")
         # Clean up
         await ws.close()
     except Exception as e:
         print(f"STT streaming error: {e}")
     finally:
@@ -442,7 +454,7 @@ import os
 async def transcribe_file():
     client = AsyncCartesia(api_key=os.getenv("CARTESIA_API_KEY"))
     with open("path/to/audio.wav", "rb") as audio_file:
         response = await client.stt.transcribe(
             file=audio_file,
@@ -450,14 +462,14 @@ async def transcribe_file():
             language="en",
             timestamp_granularities=["word"],
         )
     print(f"Transcribed text: {response.text}")
     # Process word timestamps
     if response.words:
         for word_info in response.words:
             print(f"'{word_info.word}': {word_info.start:.2f}s - {word_info.end:.2f}s")
     await client.close()
 asyncio.run(transcribe_file())
@@ -664,6 +676,3 @@ $ git commit --amend -m "manually regenerate from docs" # optional
 From https://github.com/cartesia-ai/docs click `Actions` then `Release Python SDK`. (Requires permissions.)

{cartesia-2.0.11 → cartesia-2.0.14}/README.md RENAMED Viewed

@@ -21,26 +21,36 @@ Instantiate and use the client with the following:
 ```python
 from cartesia import Cartesia
-from cartesia.tts import OutputFormat_Raw, TtsRequestIdSpecifier
 import os
 client = Cartesia(
-    api_key=os.getenv("CARTESIA_API_KEY"),
-)
-client.tts.bytes(
-    model_id="sonic-2",
-    transcript="Hello, world!",
-    voice={
-        "mode": "id",
-        "id": "694f9389-aac1-45b6-b726-9d9369183238",
-    },
-    language="en",
-    output_format={
-        "container": "raw",
-        "sample_rate": 44100,
-        "encoding": "pcm_f32le",
-    },
+    api_key=os.environ["CARTESIA_API_KEY"],
 )
+def main():
+    with open("sonic.wav", "wb") as f:
+        bytes_iter = client.tts.bytes(
+            model_id="sonic-3",
+            transcript="Hello, world!",
+            voice={
+                "mode": "id",
+                "id": "6ccbfb76-1fc6-48f7-b71d-91ac6298247b",
+            },
+            language="en",
+            output_format={
+                "container": "wav",
+                "sample_rate": 44100,
+                "encoding": "pcm_f32le",
+            },
+        )
+        for chunk in bytes_iter:
+            f.write(chunk)
+if __name__ == "__main__":
+    main()
 ```
 ## Async Client
@@ -49,31 +59,37 @@ The SDK also exports an `async` client so that you can make non-blocking calls t
 ```python
 import asyncio
-import os
 from cartesia import AsyncCartesia
-from cartesia.tts import OutputFormat_Raw, TtsRequestIdSpecifier
+import os
 client = AsyncCartesia(
-    api_key=os.getenv("CARTESIA_API_KEY"),
+    api_key=os.environ["CARTESIA_API_KEY"],
 )
-async def main() -> None:
-    async for output in client.tts.bytes(
-        model_id="sonic-2",
-        transcript="Hello, world!",
-        voice={"id": "694f9389-aac1-45b6-b726-9d9369183238"},
-        language="en",
-        output_format={
-            "container": "raw",
-            "sample_rate": 44100,
-            "encoding": "pcm_f32le",
-        },
-    ):
-        print(f"Received chunk of size: {len(output)}")
+async def main():
+    with open("sonic.wav", "wb") as f:
+        bytes_iter = client.tts.bytes(
+            model_id="sonic-3",
+            transcript="Hello, world!",
+            voice={
+                "mode": "id",
+                "id": "6ccbfb76-1fc6-48f7-b71d-91ac6298247b",
+            },
+            language="en",
+            output_format={
+                "container": "wav",
+                "sample_rate": 44100,
+                "encoding": "pcm_f32le",
+            },
+        )
+        async for chunk in bytes_iter:
+            f.write(chunk)
-asyncio.run(main())
+if __name__ == "__main__":
+    asyncio.run(main())
 ```
 ## Exception Handling
@@ -97,7 +113,6 @@ The SDK supports streaming responses as well, returning a generator that you can
 ```python
 from cartesia import Cartesia
-from cartesia.tts import Controls, OutputFormat_RawParams, TtsRequestIdSpecifierParams
 import os
 def get_tts_chunks():
@@ -105,14 +120,11 @@ def get_tts_chunks():
         api_key=os.getenv("CARTESIA_API_KEY"),
     )
     response = client.tts.sse(
-        model_id="sonic-2",
+        model_id="sonic-3",
         transcript="Hello world!",
         voice={
+            "mode": "id",
             "id": "f9836c6e-a0bd-460e-9d3c-f7299fa60f94",
-            "experimental_controls": {
-                "speed": "normal",
-                "emotion": [],
-            },
         },
         language="en",
         output_format={
@@ -156,9 +168,9 @@ ws = client.tts.websocket()
 # Generate and stream audio using the websocket
 for output in ws.send(
-    model_id="sonic-2", # see: https://docs.cartesia.ai/getting-started/available-models
+    model_id="sonic-3", # see: https://docs.cartesia.ai/build-with-cartesia/tts-models
     transcript=transcript,
-    voice={"id": voice_id},
+    voice={"mode": "id", "id": voice_id},
     stream=True,
     output_format={
         "container": "raw",
@@ -220,7 +232,7 @@ ws.send("done")
 for result in ws.receive():
     if result['type'] == 'transcript':
         print(f"Transcription: {result['text']}")
         # Handle word-level timestamps if available
         if 'words' in result and result['words']:
             print("Word-level timestamps:")
@@ -229,7 +241,7 @@ for result in ws.receive():
                 start = word_info['start']
                 end = word_info['end']
                 print(f"  '{word}': {start:.2f}s - {end:.2f}s")
         if result['is_final']:
             print("Final result received")
     elif result['type'] == 'done':
@@ -254,7 +266,7 @@ async def streaming_stt_example():
     and demonstrates the new endpointing and word timestamp features.
     """
     client = AsyncCartesia(api_key=os.getenv("CARTESIA_API_KEY"))
     try:
         # Create websocket connection with voice activity detection
         ws = await client.stt.websocket(
@@ -265,24 +277,24 @@ async def streaming_stt_example():
             min_volume=0.15,                 # Volume threshold for voice activity detection
             max_silence_duration_secs=0.3,   # Maximum silence duration before endpointing
         )
         # Simulate streaming audio data (replace with your audio source)
         async def audio_stream():
             """Simulate real-time audio streaming - replace with actual audio capture"""
             # Load audio file for simulation
             with open("path/to/audio.wav", "rb") as f:
                 audio_data = f.read()
             # Stream in 100ms chunks (realistic for real-time processing)
             chunk_size = int(16000 * 0.1 * 2)  # 100ms at 16kHz, 16-bit
             for i in range(0, len(audio_data), chunk_size):
                 chunk = audio_data[i:i + chunk_size]
                 if chunk:
                     yield chunk
                     # Simulate real-time streaming delay
                     await asyncio.sleep(0.1)
         # Send audio and receive results concurrently
         async def send_audio():
             """Send audio chunks to the STT websocket"""
@@ -292,31 +304,31 @@ async def streaming_stt_example():
                     print(f"Sent audio chunk of {len(chunk)} bytes")
                     # Small delay to simulate realtime applications
                     await asyncio.sleep(0.02)
                 # Signal end of audio stream
                 await ws.send("finalize")
                 await ws.send("done")
                 print("Audio streaming completed")
             except Exception as e:
                 print(f"Error sending audio: {e}")
         async def receive_transcripts():
             """Receive and process transcription results with word timestamps"""
             full_transcript = ""
             all_word_timestamps = []
             try:
                 async for result in ws.receive():
                     if result['type'] == 'transcript':
                         text = result['text']
                         is_final = result['is_final']
                         # Handle word-level timestamps
                         if 'words' in result and result['words']:
                             word_timestamps = result['words']
                             all_word_timestamps.extend(word_timestamps)
                             if is_final:
                                 print("Word-level timestamps:")
                                 for word_info in word_timestamps:
@@ -324,7 +336,7 @@ async def streaming_stt_example():
                                     start = word_info['start']
                                     end = word_info['end']
                                     print(f"  '{word}': {start:.2f}s - {end:.2f}s")
                         if is_final:
                             # Final result - this text won't change
                             full_transcript += text + " "
@@ -332,30 +344,30 @@ async def streaming_stt_example():
                         else:
                             # Partial result - may change as more audio is processed
                             print(f"PARTIAL: {text}")
                     elif result['type'] == 'done':
                         print("Transcription completed")
                         break
             except Exception as e:
                 print(f"Error receiving transcripts: {e}")
             return full_transcript.strip(), all_word_timestamps
         print("Starting streaming STT...")
         # Use asyncio.gather to run audio sending and transcript receiving concurrently
         _, (final_transcript, word_timestamps) = await asyncio.gather(
             send_audio(),
             receive_transcripts()
         )
         print(f"\nComplete transcript: {final_transcript}")
         print(f"Total words with timestamps: {len(word_timestamps)}")
         # Clean up
         await ws.close()
     except Exception as e:
         print(f"STT streaming error: {e}")
     finally:
@@ -410,7 +422,7 @@ import os
 async def transcribe_file():
     client = AsyncCartesia(api_key=os.getenv("CARTESIA_API_KEY"))
     with open("path/to/audio.wav", "rb") as audio_file:
         response = await client.stt.transcribe(
             file=audio_file,
@@ -418,14 +430,14 @@ async def transcribe_file():
             language="en",
             timestamp_granularities=["word"],
         )
     print(f"Transcribed text: {response.text}")
     # Process word timestamps
     if response.words:
         for word_info in response.words:
             print(f"'{word_info.word}': {word_info.start:.2f}s - {word_info.end:.2f}s")
     await client.close()
 asyncio.run(transcribe_file())
@@ -631,6 +643,3 @@ $ git commit --amend -m "manually regenerate from docs" # optional
 ### Automatically generating new SDK releases
 From https://github.com/cartesia-ai/docs click `Actions` then `Release Python SDK`. (Requires permissions.)

{cartesia-2.0.11 → cartesia-2.0.14}/pyproject.toml RENAMED Viewed

@@ -3,7 +3,7 @@ name = "cartesia"
 [tool.poetry]
 name = "cartesia"
-version = "2.0.11"
+version = "2.0.14"
 description = ""
 readme = "README.md"
 authors = []

{cartesia-2.0.11 → cartesia-2.0.14}/src/cartesia/__init__.py RENAMED Viewed

@@ -52,7 +52,10 @@ from .tts import (
     Controls,
     ControlsParams,
     Emotion,
+    EmotionDeprecated,
     FlushId,
+    GenerationConfig,
+    GenerationConfigParams,
     GenerationRequest,
     GenerationRequestParams,
     ModelSpeed,
@@ -211,6 +214,7 @@ __all__ = [
     "EmbeddingSpecifier",
     "EmbeddingSpecifierParams",
     "Emotion",
+    "EmotionDeprecated",
     "ErrorMessage",
     "ErrorMessageParams",
     "FilePurpose",
@@ -219,6 +223,8 @@ __all__ = [
     "FlushId",
     "Gender",
     "GenderPresentation",
+    "GenerationConfig",
+    "GenerationConfigParams",
     "GenerationRequest",
     "GenerationRequestParams",
     "GetVoicesResponse",

{cartesia-2.0.11 → cartesia-2.0.14}/src/cartesia/core/client_wrapper.py RENAMED Viewed

@@ -16,7 +16,7 @@ class BaseClientWrapper:
         headers: typing.Dict[str, str] = {
             "X-Fern-Language": "Python",
             "X-Fern-SDK-Name": "cartesia",
-            "X-Fern-SDK-Version": "2.0.11",
+            "X-Fern-SDK-Version": "2.0.14",
         }
         headers["X-API-Key"] = self.api_key
         headers["Cartesia-Version"] = "2024-11-13"

cartesia 2.0.11__tar.gz → 2.0.14__tar.gz

Potentially problematic release.

cartesia 2.0.11__tar.gz → 2.0.14__tar.gz