PyPI - cartesia - Versions diffs - 2.0.0b2__tar.gz → 2.0.0b8__tar.gz - Mend

cartesia 2.0.0b2__tar.gz → 2.0.0b8__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (166) hide show

{cartesia-2.0.0b2 → cartesia-2.0.0b8}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: cartesia
-Version: 2.0.0b2
+Version: 2.0.0b8
 Summary:
 Requires-Python: >=3.8,<4.0
 Classifier: Intended Audience :: Developers
@@ -47,53 +47,6 @@ Our complete API documentation can be found [on docs.cartesia.ai](https://docs.c
 pip install cartesia
 ```
-## Reference
-A full reference for this library is available [here](./reference.md).
-## Voices
-```python
-from cartesia import Cartesia
-import os
-client = Cartesia(api_key=os.getenv("CARTESIA_API_KEY"))
-# Get all available voices
-voices = client.voices.list()
-print(voices)
-# Get a specific voice
-voice = client.voices.get(id="a0e99841-438c-4a64-b679-ae501e7d6091")
-print("The embedding for", voice.name, "is", voice.embedding)
-# Clone a voice using file data
-cloned_voice = client.voices.clone(
-    clip=open("path/to/voice.wav", "rb"),
-    name="Test cloned voice",
-    language="en",
-    mode="similarity",  # or "stability"
-    enhance=False, # use enhance=True to clean and denoise the cloning audio
-    description="Test voice description"
-)
-# Mix voices together
-mixed_voice = client.voices.mix(
-    voices=[
-        {"id": "voice_id_1", "weight": 0.25},
-        {"id": "voice_id_2", "weight": 0.75}
-    ]
-)
-# Create a new voice from embedding
-new_voice = client.voices.create(
-    name="Test Voice",
-    description="Test voice description",
-    embedding=[...],  # List[float] with 192 dimensions
-    language="en"
-)
-```
 ## Usage
 Instantiate and use the client with the following:
@@ -107,15 +60,11 @@ client = Cartesia(
     api_key=os.getenv("CARTESIA_API_KEY"),
 )
 client.tts.bytes(
-    model_id="sonic-english",
+    model_id="sonic-2",
     transcript="Hello, world!",
     voice={
         "mode": "id",
         "id": "694f9389-aac1-45b6-b726-9d9369183238",
-        "experimental_controls": {
-            "speed": 0.5,  # range between [-1.0, 1.0], or "slow", "fastest", etc.
-            "emotion": ["positivity", "curiosity:low"] # list of emotions with optional intensity
-        }
     },
     language="en",
     output_format={
@@ -143,7 +92,7 @@ client = AsyncCartesia(
 async def main() -> None:
     async for output in client.tts.bytes(
-        model_id="sonic-english",
+        model_id="sonic-2",
         transcript="Hello, world!",
         voice={"id": "694f9389-aac1-45b6-b726-9d9369183238"},
         language="en",
@@ -176,7 +125,7 @@ except ApiError as e:
 ## Streaming
-The SDK supports streaming responses, as well, the response will be a generator that you can loop over.
+The SDK supports streaming responses as well, returning a generator that you can iterate over with a `for ... in ...` loop:
 ```python
 from cartesia import Cartesia
@@ -188,7 +137,7 @@ def get_tts_chunks():
         api_key=os.getenv("CARTESIA_API_KEY"),
     )
     response = client.tts.sse(
-        model_id="sonic",
+        model_id="sonic-2",
         transcript="Hello world!",
         voice={
             "id": "f9836c6e-a0bd-460e-9d3c-f7299fa60f94",
@@ -204,7 +153,7 @@ def get_tts_chunks():
             "sample_rate": 44100,
         },
     )
     audio_chunks = []
     for chunk in response:
         audio_chunks.append(chunk)
@@ -215,7 +164,9 @@ for chunk in chunks:
     print(f"Received chunk of size: {len(chunk.data)}")
 ```
-## WebSocket
+## WebSockets
+For the lowest latency in advanced use cases (such as streaming in an LLM-generated transcript and streaming out audio), you should use our websockets client:
 ```python
 from cartesia import Cartesia
@@ -223,15 +174,10 @@ from cartesia.tts import TtsRequestEmbeddingSpecifierParams, OutputFormat_RawPar
 import pyaudio
 import os
-client = Cartesia(
-    api_key=os.getenv("CARTESIA_API_KEY"),
-)
+client = Cartesia(api_key=os.getenv("CARTESIA_API_KEY"))
 voice_id = "a0e99841-438c-4a64-b679-ae501e7d6091"
 transcript = "Hello! Welcome to Cartesia"
-# You can check out our models at https://docs.cartesia.ai/getting-started/available-models
-model_id = "sonic"
 p = pyaudio.PyAudio()
 rate = 22050
@@ -242,14 +188,14 @@ ws = client.tts.websocket()
 # Generate and stream audio using the websocket
 for output in ws.send(
-    model_id=model_id,
+    model_id="sonic-2", # see: https://docs.cartesia.ai/getting-started/available-models
     transcript=transcript,
     voice={"id": voice_id},
     stream=True,
     output_format={
         "container": "raw",
-        "encoding": "pcm_f32le",
-        "sample_rate": 22050
+        "encoding": "pcm_f32le",
+        "sample_rate": rate
     },
 ):
     buffer = output.audio
@@ -267,6 +213,90 @@ p.terminate()
 ws.close()  # Close the websocket connection
 ```
+## Voices
+List all available Voices with `client.voices.list`, which returns an iterable that automatically handles pagination:
+```python
+from cartesia import Cartesia
+import os
+client = Cartesia(api_key=os.getenv("CARTESIA_API_KEY"))
+# Get all available Voices
+voices = client.voices.list()
+for voice in voices:
+    print(voice)
+```
+You can also get the complete metadata for a specific Voice, or make a new Voice by cloning from an audio sample:
+```python
+# Get a specific Voice
+voice = client.voices.get(id="a0e99841-438c-4a64-b679-ae501e7d6091")
+print("The embedding for", voice.name, "is", voice.embedding)
+# Clone a Voice using file data
+cloned_voice = client.voices.clone(
+    clip=open("path/to/voice.wav", "rb"),
+    name="Test cloned voice",
+    language="en",
+    mode="similarity",  # or "stability"
+    enhance=False, # use enhance=True to clean and denoise the cloning audio
+    description="Test voice description"
+)
+```
+## Requesting Timestamps
+```python
+import asyncio
+from cartesia import AsyncCartesia
+import os
+async def main():
+    client = AsyncCartesia(api_key=os.getenv("CARTESIA_API_KEY"))
+    # Connect to the websocket
+    ws = await client.tts.websocket()
+    # Generate speech with timestamps
+    output_generate = await ws.send(
+        model_id="sonic-2",
+        transcript="Hello! Welcome to Cartesia's text-to-speech.",
+        voice={"id": "f9836c6e-a0bd-460e-9d3c-f7299fa60f94"},
+        output_format={
+            "container": "raw",
+            "encoding": "pcm_f32le",
+            "sample_rate": 44100
+        },
+        add_timestamps=True,            # Enable word-level timestamps
+        add_phoneme_timestamps=True,    # Enable phonemized timestamps
+        stream=True
+    )
+    # Process the streaming response with timestamps
+    all_words = []
+    all_starts = []
+    all_ends = []
+    audio_chunks = []
+    async for out in output_generate:
+        # Collect audio data
+        if out.audio is not None:
+            audio_chunks.append(out.audio)
+        # Process timestamp data
+        if out.word_timestamps is not None:
+            all_words.extend(out.word_timestamps.words)    # List of words
+            all_starts.extend(out.word_timestamps.start)   # Start time for each word (seconds)
+            all_ends.extend(out.word_timestamps.end)       # End time for each word (seconds)
+    await ws.close()
+asyncio.run(main())
+```
 ## Advanced
 ### Retries
@@ -309,6 +339,26 @@ client.tts.bytes(..., request_options={
 })
 ```
+### Mixing voices and creating from embeddings
+```python
+# Mix voices together
+mixed_voice = client.voices.mix(
+    voices=[
+        {"id": "voice_id_1", "weight": 0.25},
+        {"id": "voice_id_2", "weight": 0.75}
+    ]
+)
+# Create a new voice from embedding
+new_voice = client.voices.create(
+    name="Test Voice",
+    description="Test voice description",
+    embedding=[...],  # List[float] with 192 dimensions
+    language="en"
+)
+```
 ### Custom Client
 You can override the `httpx` client to customize it for your use-case. Some common use-cases include support for proxies
@@ -326,13 +376,39 @@ client = Cartesia(
 )
 ```
+## Reference
+A full reference for this library is available [here](./reference.md).
 ## Contributing
-While we value open-source contributions to this SDK, this library is generated programmatically.
-Additions made directly to this library would have to be moved over to our generation code,
-otherwise they would be overwritten upon the next generated release. Feel free to open a PR as
-a proof of concept, but know that we will not be able to merge it as-is. We suggest opening
-an issue first to discuss with us!
+Note that most of this library is generated programmatically from
+<https://github.com/cartesia-ai/docs> — before making edits to a file, verify it's not autogenerated
+by checking for this comment at the top of the file:
+```
+# This file was auto-generated by Fern from our API Definition.
+```
+### Running tests
+```sh
+uv pip install -r requirements.txt
+uv run pytest -rP -vv tests/custom/test_client.py::test_get_voices
+```
+### Manually generating SDK code from docs
+Assuming all your repos are cloned into your home directory:
+```sh
+$ cd ~/docs
+$ fern generate --group python-sdk --log-level debug --api version-2024-11-13 --preview
+$ cd ~/cartesia-python
+$ git pull ~/docs/fern/apis/version-2024-11-13/.preview/fern-python-sdk
+$ git commit --amend -m "manually regenerate from docs" # optional
+```
+### Automatically generating new SDK releases
-On the other hand, contributions to the README are always very welcome!
+From <https://github.com/cartesia-ai/docs>, click `Actions`, then `Release Python SDK`. (Requires permissions.)

{cartesia-2.0.0b2 → cartesia-2.0.0b8}/README.md RENAMED Viewed

@@ -15,53 +15,6 @@ Our complete API documentation can be found [on docs.cartesia.ai](https://docs.c
 pip install cartesia
 ```
-## Reference
-A full reference for this library is available [here](./reference.md).
-## Voices
-```python
-from cartesia import Cartesia
-import os
-client = Cartesia(api_key=os.getenv("CARTESIA_API_KEY"))
-# Get all available voices
-voices = client.voices.list()
-print(voices)
-# Get a specific voice
-voice = client.voices.get(id="a0e99841-438c-4a64-b679-ae501e7d6091")
-print("The embedding for", voice.name, "is", voice.embedding)
-# Clone a voice using file data
-cloned_voice = client.voices.clone(
-    clip=open("path/to/voice.wav", "rb"),
-    name="Test cloned voice",
-    language="en",
-    mode="similarity",  # or "stability"
-    enhance=False, # use enhance=True to clean and denoise the cloning audio
-    description="Test voice description"
-)
-# Mix voices together
-mixed_voice = client.voices.mix(
-    voices=[
-        {"id": "voice_id_1", "weight": 0.25},
-        {"id": "voice_id_2", "weight": 0.75}
-    ]
-)
-# Create a new voice from embedding
-new_voice = client.voices.create(
-    name="Test Voice",
-    description="Test voice description",
-    embedding=[...],  # List[float] with 192 dimensions
-    language="en"
-)
-```
 ## Usage
 Instantiate and use the client with the following:
@@ -75,15 +28,11 @@ client = Cartesia(
     api_key=os.getenv("CARTESIA_API_KEY"),
 )
 client.tts.bytes(
-    model_id="sonic-english",
+    model_id="sonic-2",
     transcript="Hello, world!",
     voice={
         "mode": "id",
         "id": "694f9389-aac1-45b6-b726-9d9369183238",
-        "experimental_controls": {
-            "speed": 0.5,  # range between [-1.0, 1.0], or "slow", "fastest", etc.
-            "emotion": ["positivity", "curiosity:low"] # list of emotions with optional intensity
-        }
     },
     language="en",
     output_format={
@@ -111,7 +60,7 @@ client = AsyncCartesia(
 async def main() -> None:
     async for output in client.tts.bytes(
-        model_id="sonic-english",
+        model_id="sonic-2",
         transcript="Hello, world!",
         voice={"id": "694f9389-aac1-45b6-b726-9d9369183238"},
         language="en",
@@ -144,7 +93,7 @@ except ApiError as e:
 ## Streaming
-The SDK supports streaming responses, as well, the response will be a generator that you can loop over.
+The SDK supports streaming responses as well, returning a generator that you can iterate over with a `for ... in ...` loop:
 ```python
 from cartesia import Cartesia
@@ -156,7 +105,7 @@ def get_tts_chunks():
         api_key=os.getenv("CARTESIA_API_KEY"),
     )
     response = client.tts.sse(
-        model_id="sonic",
+        model_id="sonic-2",
         transcript="Hello world!",
         voice={
             "id": "f9836c6e-a0bd-460e-9d3c-f7299fa60f94",
@@ -172,7 +121,7 @@ def get_tts_chunks():
             "sample_rate": 44100,
         },
     )
     audio_chunks = []
     for chunk in response:
         audio_chunks.append(chunk)
@@ -183,7 +132,9 @@ for chunk in chunks:
     print(f"Received chunk of size: {len(chunk.data)}")
 ```
-## WebSocket
+## WebSockets
+For the lowest latency in advanced use cases (such as streaming in an LLM-generated transcript and streaming out audio), you should use our websockets client:
 ```python
 from cartesia import Cartesia
@@ -191,15 +142,10 @@ from cartesia.tts import TtsRequestEmbeddingSpecifierParams, OutputFormat_RawPar
 import pyaudio
 import os
-client = Cartesia(
-    api_key=os.getenv("CARTESIA_API_KEY"),
-)
+client = Cartesia(api_key=os.getenv("CARTESIA_API_KEY"))
 voice_id = "a0e99841-438c-4a64-b679-ae501e7d6091"
 transcript = "Hello! Welcome to Cartesia"
-# You can check out our models at https://docs.cartesia.ai/getting-started/available-models
-model_id = "sonic"
 p = pyaudio.PyAudio()
 rate = 22050
@@ -210,14 +156,14 @@ ws = client.tts.websocket()
 # Generate and stream audio using the websocket
 for output in ws.send(
-    model_id=model_id,
+    model_id="sonic-2", # see: https://docs.cartesia.ai/getting-started/available-models
     transcript=transcript,
     voice={"id": voice_id},
     stream=True,
     output_format={
         "container": "raw",
-        "encoding": "pcm_f32le",
-        "sample_rate": 22050
+        "encoding": "pcm_f32le",
+        "sample_rate": rate
     },
 ):
     buffer = output.audio
@@ -235,6 +181,90 @@ p.terminate()
 ws.close()  # Close the websocket connection
 ```
+## Voices
+List all available Voices with `client.voices.list`, which returns an iterable that automatically handles pagination:
+```python
+from cartesia import Cartesia
+import os
+client = Cartesia(api_key=os.getenv("CARTESIA_API_KEY"))
+# Get all available Voices
+voices = client.voices.list()
+for voice in voices:
+    print(voice)
+```
+You can also get the complete metadata for a specific Voice, or make a new Voice by cloning from an audio sample:
+```python
+# Get a specific Voice
+voice = client.voices.get(id="a0e99841-438c-4a64-b679-ae501e7d6091")
+print("The embedding for", voice.name, "is", voice.embedding)
+# Clone a Voice using file data
+cloned_voice = client.voices.clone(
+    clip=open("path/to/voice.wav", "rb"),
+    name="Test cloned voice",
+    language="en",
+    mode="similarity",  # or "stability"
+    enhance=False, # use enhance=True to clean and denoise the cloning audio
+    description="Test voice description"
+)
+```
+## Requesting Timestamps
+```python
+import asyncio
+from cartesia import AsyncCartesia
+import os
+async def main():
+    client = AsyncCartesia(api_key=os.getenv("CARTESIA_API_KEY"))
+    # Connect to the websocket
+    ws = await client.tts.websocket()
+    # Generate speech with timestamps
+    output_generate = await ws.send(
+        model_id="sonic-2",
+        transcript="Hello! Welcome to Cartesia's text-to-speech.",
+        voice={"id": "f9836c6e-a0bd-460e-9d3c-f7299fa60f94"},
+        output_format={
+            "container": "raw",
+            "encoding": "pcm_f32le",
+            "sample_rate": 44100
+        },
+        add_timestamps=True,            # Enable word-level timestamps
+        add_phoneme_timestamps=True,    # Enable phonemized timestamps
+        stream=True
+    )
+    # Process the streaming response with timestamps
+    all_words = []
+    all_starts = []
+    all_ends = []
+    audio_chunks = []
+    async for out in output_generate:
+        # Collect audio data
+        if out.audio is not None:
+            audio_chunks.append(out.audio)
+        # Process timestamp data
+        if out.word_timestamps is not None:
+            all_words.extend(out.word_timestamps.words)    # List of words
+            all_starts.extend(out.word_timestamps.start)   # Start time for each word (seconds)
+            all_ends.extend(out.word_timestamps.end)       # End time for each word (seconds)
+    await ws.close()
+asyncio.run(main())
+```
 ## Advanced
 ### Retries
@@ -277,6 +307,26 @@ client.tts.bytes(..., request_options={
 })
 ```
+### Mixing voices and creating from embeddings
+```python
+# Mix voices together
+mixed_voice = client.voices.mix(
+    voices=[
+        {"id": "voice_id_1", "weight": 0.25},
+        {"id": "voice_id_2", "weight": 0.75}
+    ]
+)
+# Create a new voice from embedding
+new_voice = client.voices.create(
+    name="Test Voice",
+    description="Test voice description",
+    embedding=[...],  # List[float] with 192 dimensions
+    language="en"
+)
+```
 ### Custom Client
 You can override the `httpx` client to customize it for your use-case. Some common use-cases include support for proxies
@@ -294,12 +344,38 @@ client = Cartesia(
 )
 ```
+## Reference
+A full reference for this library is available [here](./reference.md).
 ## Contributing
-While we value open-source contributions to this SDK, this library is generated programmatically.
-Additions made directly to this library would have to be moved over to our generation code,
-otherwise they would be overwritten upon the next generated release. Feel free to open a PR as
-a proof of concept, but know that we will not be able to merge it as-is. We suggest opening
-an issue first to discuss with us!
+Note that most of this library is generated programmatically from
+<https://github.com/cartesia-ai/docs> — before making edits to a file, verify it's not autogenerated
+by checking for this comment at the top of the file:
+```
+# This file was auto-generated by Fern from our API Definition.
+```
+### Running tests
+```sh
+uv pip install -r requirements.txt
+uv run pytest -rP -vv tests/custom/test_client.py::test_get_voices
+```
+### Manually generating SDK code from docs
+Assuming all your repos are cloned into your home directory:
+```sh
+$ cd ~/docs
+$ fern generate --group python-sdk --log-level debug --api version-2024-11-13 --preview
+$ cd ~/cartesia-python
+$ git pull ~/docs/fern/apis/version-2024-11-13/.preview/fern-python-sdk
+$ git commit --amend -m "manually regenerate from docs" # optional
+```
+### Automatically generating new SDK releases
-On the other hand, contributions to the README are always very welcome!
+From <https://github.com/cartesia-ai/docs>, click `Actions`, then `Release Python SDK`. (Requires permissions.)

{cartesia-2.0.0b2 → cartesia-2.0.0b8}/pyproject.toml RENAMED Viewed

@@ -3,7 +3,7 @@ name = "cartesia"
 [tool.poetry]
 name = "cartesia"
-version = "2.0.0b2"
+version = "2.0.0b8"
 description = ""
 readme = "README.md"
 authors = []

{cartesia-2.0.0b2 → cartesia-2.0.0b8}/src/cartesia/__init__.py RENAMED Viewed

@@ -121,11 +121,15 @@ from .voices import (
     EmbeddingSpecifier,
     EmbeddingSpecifierParams,
     Gender,
+    GenderPresentation,
+    GetVoicesResponse,
+    GetVoicesResponseParams,
     IdSpecifier,
     IdSpecifierParams,
     LocalizeDialect,
     LocalizeDialectParams,
     LocalizeEnglishDialect,
+    LocalizeFrenchDialect,
     LocalizePortugueseDialect,
     LocalizeSpanishDialect,
     LocalizeTargetLanguage,
@@ -138,6 +142,7 @@ from .voices import (
     UpdateVoiceRequest,
     UpdateVoiceRequestParams,
     Voice,
+    VoiceExpandOptions,
     VoiceId,
     VoiceMetadata,
     VoiceMetadataParams,
@@ -175,13 +180,17 @@ __all__ = [
     "FilePurpose",
     "FlushId",
     "Gender",
+    "GenderPresentation",
     "GenerationRequest",
     "GenerationRequestParams",
+    "GetVoicesResponse",
+    "GetVoicesResponseParams",
     "IdSpecifier",
     "IdSpecifierParams",
     "LocalizeDialect",
     "LocalizeDialectParams",
     "LocalizeEnglishDialect",
+    "LocalizeFrenchDialect",
     "LocalizePortugueseDialect",
     "LocalizeSpanishDialect",
     "LocalizeTargetLanguage",
@@ -235,6 +244,7 @@ __all__ = [
     "UpdateVoiceRequest",
     "UpdateVoiceRequestParams",
     "Voice",
+    "VoiceExpandOptions",
     "VoiceId",
     "VoiceMetadata",
     "VoiceMetadataParams",

cartesia 2.0.0b2__tar.gz → 2.0.0b8__tar.gz

cartesia 2.0.0b2__tar.gz → 2.0.0b8__tar.gz