PyPI - cartesia - Versions diffs - 2.0.0b7__tar.gz → 2.0.0b8__tar.gz - Mend

cartesia 2.0.0b7__tar.gz → 2.0.0b8__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (167) hide show

{cartesia-2.0.0b7 → cartesia-2.0.0b8}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: cartesia
-Version: 2.0.0b7
+Version: 2.0.0b8
 Summary:
 Requires-Python: >=3.8,<4.0
 Classifier: Intended Audience :: Developers
@@ -47,53 +47,6 @@ Our complete API documentation can be found [on docs.cartesia.ai](https://docs.c
 pip install cartesia
 ```
-## Reference
-A full reference for this library is available [here](./reference.md).
-## Voices
-```python
-from cartesia import Cartesia
-import os
-client = Cartesia(api_key=os.getenv("CARTESIA_API_KEY"))
-# Get all available voices
-voices = client.voices.list()
-print(voices)
-# Get a specific voice
-voice = client.voices.get(id="a0e99841-438c-4a64-b679-ae501e7d6091")
-print("The embedding for", voice.name, "is", voice.embedding)
-# Clone a voice using file data
-cloned_voice = client.voices.clone(
-    clip=open("path/to/voice.wav", "rb"),
-    name="Test cloned voice",
-    language="en",
-    mode="similarity",  # or "stability"
-    enhance=False, # use enhance=True to clean and denoise the cloning audio
-    description="Test voice description"
-)
-# Mix voices together
-mixed_voice = client.voices.mix(
-    voices=[
-        {"id": "voice_id_1", "weight": 0.25},
-        {"id": "voice_id_2", "weight": 0.75}
-    ]
-)
-# Create a new voice from embedding
-new_voice = client.voices.create(
-    name="Test Voice",
-    description="Test voice description",
-    embedding=[...],  # List[float] with 192 dimensions
-    language="en"
-)
-```
 ## Usage
 Instantiate and use the client with the following:
@@ -112,10 +65,6 @@ client.tts.bytes(
     voice={
         "mode": "id",
         "id": "694f9389-aac1-45b6-b726-9d9369183238",
-        "experimental_controls": {
-            "speed": 0.5,  # range between [-1.0, 1.0], or "slow", "fastest", etc.
-            "emotion": ["positivity", "curiosity:low"] # list of emotions with optional intensity
-        }
     },
     language="en",
     output_format={
@@ -176,7 +125,7 @@ except ApiError as e:
 ## Streaming
-The SDK supports streaming responses, as well, the response will be a generator that you can loop over.
+The SDK supports streaming responses as well, returning a generator that you can iterate over with a `for ... in ...` loop:
 ```python
 from cartesia import Cartesia
@@ -215,7 +164,9 @@ for chunk in chunks:
     print(f"Received chunk of size: {len(chunk.data)}")
 ```
-## WebSocket
+## WebSockets
+For the lowest latency in advanced usecases (such as streaming in an LLM-generated transcript and streaming out audio), you should use our websockets client:
 ```python
 from cartesia import Cartesia
@@ -223,15 +174,10 @@ from cartesia.tts import TtsRequestEmbeddingSpecifierParams, OutputFormat_RawPar
 import pyaudio
 import os
-client = Cartesia(
-    api_key=os.getenv("CARTESIA_API_KEY"),
-)
+client = Cartesia(api_key=os.getenv("CARTESIA_API_KEY"))
 voice_id = "a0e99841-438c-4a64-b679-ae501e7d6091"
 transcript = "Hello! Welcome to Cartesia"
-# You can check out our models at https://docs.cartesia.ai/getting-started/available-models
-model_id = "sonic-2"
 p = pyaudio.PyAudio()
 rate = 22050
@@ -242,14 +188,14 @@ ws = client.tts.websocket()
 # Generate and stream audio using the websocket
 for output in ws.send(
-    model_id=model_id,
+    model_id="sonic-2", # see: https://docs.cartesia.ai/getting-started/available-models
     transcript=transcript,
     voice={"id": voice_id},
     stream=True,
     output_format={
         "container": "raw",
         "encoding": "pcm_f32le",
-        "sample_rate": 22050
+        "sample_rate": rate
     },
 ):
     buffer = output.audio
@@ -267,6 +213,40 @@ p.terminate()
 ws.close()  # Close the websocket connection
 ```
+## Voices
+List all available Voices with `client.voices.list`, which returns an iterable that automatically handles pagination:
+```python
+from cartesia import Cartesia
+import os
+client = Cartesia(api_key=os.getenv("CARTESIA_API_KEY"))
+# Get all available Voices
+voices = client.voices.list()
+for voice in voices:
+    print(voice)
+```
+You can also get the complete metadata for a specific Voice, or make a new Voice by cloning from an audio sample:
+```python
+# Get a specific Voice
+voice = client.voices.get(id="a0e99841-438c-4a64-b679-ae501e7d6091")
+print("The embedding for", voice.name, "is", voice.embedding)
+# Clone a Voice using file data
+cloned_voice = client.voices.clone(
+    clip=open("path/to/voice.wav", "rb"),
+    name="Test cloned voice",
+    language="en",
+    mode="similarity",  # or "stability"
+    enhance=False, # use enhance=True to clean and denoise the cloning audio
+    description="Test voice description"
+)
+```
 ## Requesting Timestamps
 ```python
@@ -290,7 +270,8 @@ async def main():
             "encoding": "pcm_f32le",
             "sample_rate": 44100
         },
-        add_timestamps=True,  # Enable word-level timestamps
+        add_timestamps=True,            # Enable word-level timestamps
+        add_phoneme_timestamps=True,    # Enable phonemized timestamps
         stream=True
     )
@@ -358,6 +339,26 @@ client.tts.bytes(..., request_options={
 })
 ```
+### Mixing voices and creating from embeddings
+```python
+# Mix voices together
+mixed_voice = client.voices.mix(
+    voices=[
+        {"id": "voice_id_1", "weight": 0.25},
+        {"id": "voice_id_2", "weight": 0.75}
+    ]
+)
+# Create a new voice from embedding
+new_voice = client.voices.create(
+    name="Test Voice",
+    description="Test voice description",
+    embedding=[...],  # List[float] with 192 dimensions
+    language="en"
+)
+```
 ### Custom Client
 You can override the `httpx` client to customize it for your use-case. Some common use-cases include support for proxies
@@ -375,6 +376,10 @@ client = Cartesia(
 )
 ```
+## Reference
+A full reference for this library is available [here](./reference.md).
 ## Contributing
 Note that most of this library is generated programmatically from

{cartesia-2.0.0b7 → cartesia-2.0.0b8}/README.md RENAMED Viewed

@@ -15,53 +15,6 @@ Our complete API documentation can be found [on docs.cartesia.ai](https://docs.c
 pip install cartesia
 ```
-## Reference
-A full reference for this library is available [here](./reference.md).
-## Voices
-```python
-from cartesia import Cartesia
-import os
-client = Cartesia(api_key=os.getenv("CARTESIA_API_KEY"))
-# Get all available voices
-voices = client.voices.list()
-print(voices)
-# Get a specific voice
-voice = client.voices.get(id="a0e99841-438c-4a64-b679-ae501e7d6091")
-print("The embedding for", voice.name, "is", voice.embedding)
-# Clone a voice using file data
-cloned_voice = client.voices.clone(
-    clip=open("path/to/voice.wav", "rb"),
-    name="Test cloned voice",
-    language="en",
-    mode="similarity",  # or "stability"
-    enhance=False, # use enhance=True to clean and denoise the cloning audio
-    description="Test voice description"
-)
-# Mix voices together
-mixed_voice = client.voices.mix(
-    voices=[
-        {"id": "voice_id_1", "weight": 0.25},
-        {"id": "voice_id_2", "weight": 0.75}
-    ]
-)
-# Create a new voice from embedding
-new_voice = client.voices.create(
-    name="Test Voice",
-    description="Test voice description",
-    embedding=[...],  # List[float] with 192 dimensions
-    language="en"
-)
-```
 ## Usage
 Instantiate and use the client with the following:
@@ -80,10 +33,6 @@ client.tts.bytes(
     voice={
         "mode": "id",
         "id": "694f9389-aac1-45b6-b726-9d9369183238",
-        "experimental_controls": {
-            "speed": 0.5,  # range between [-1.0, 1.0], or "slow", "fastest", etc.
-            "emotion": ["positivity", "curiosity:low"] # list of emotions with optional intensity
-        }
     },
     language="en",
     output_format={
@@ -144,7 +93,7 @@ except ApiError as e:
 ## Streaming
-The SDK supports streaming responses, as well, the response will be a generator that you can loop over.
+The SDK supports streaming responses as well, returning a generator that you can iterate over with a `for ... in ...` loop:
 ```python
 from cartesia import Cartesia
@@ -183,7 +132,9 @@ for chunk in chunks:
     print(f"Received chunk of size: {len(chunk.data)}")
 ```
-## WebSocket
+## WebSockets
+For the lowest latency in advanced usecases (such as streaming in an LLM-generated transcript and streaming out audio), you should use our websockets client:
 ```python
 from cartesia import Cartesia
@@ -191,15 +142,10 @@ from cartesia.tts import TtsRequestEmbeddingSpecifierParams, OutputFormat_RawPar
 import pyaudio
 import os
-client = Cartesia(
-    api_key=os.getenv("CARTESIA_API_KEY"),
-)
+client = Cartesia(api_key=os.getenv("CARTESIA_API_KEY"))
 voice_id = "a0e99841-438c-4a64-b679-ae501e7d6091"
 transcript = "Hello! Welcome to Cartesia"
-# You can check out our models at https://docs.cartesia.ai/getting-started/available-models
-model_id = "sonic-2"
 p = pyaudio.PyAudio()
 rate = 22050
@@ -210,14 +156,14 @@ ws = client.tts.websocket()
 # Generate and stream audio using the websocket
 for output in ws.send(
-    model_id=model_id,
+    model_id="sonic-2", # see: https://docs.cartesia.ai/getting-started/available-models
     transcript=transcript,
     voice={"id": voice_id},
     stream=True,
     output_format={
         "container": "raw",
         "encoding": "pcm_f32le",
-        "sample_rate": 22050
+        "sample_rate": rate
     },
 ):
     buffer = output.audio
@@ -235,6 +181,40 @@ p.terminate()
 ws.close()  # Close the websocket connection
 ```
+## Voices
+List all available Voices with `client.voices.list`, which returns an iterable that automatically handles pagination:
+```python
+from cartesia import Cartesia
+import os
+client = Cartesia(api_key=os.getenv("CARTESIA_API_KEY"))
+# Get all available Voices
+voices = client.voices.list()
+for voice in voices:
+    print(voice)
+```
+You can also get the complete metadata for a specific Voice, or make a new Voice by cloning from an audio sample:
+```python
+# Get a specific Voice
+voice = client.voices.get(id="a0e99841-438c-4a64-b679-ae501e7d6091")
+print("The embedding for", voice.name, "is", voice.embedding)
+# Clone a Voice using file data
+cloned_voice = client.voices.clone(
+    clip=open("path/to/voice.wav", "rb"),
+    name="Test cloned voice",
+    language="en",
+    mode="similarity",  # or "stability"
+    enhance=False, # use enhance=True to clean and denoise the cloning audio
+    description="Test voice description"
+)
+```
 ## Requesting Timestamps
 ```python
@@ -258,7 +238,8 @@ async def main():
             "encoding": "pcm_f32le",
             "sample_rate": 44100
         },
-        add_timestamps=True,  # Enable word-level timestamps
+        add_timestamps=True,            # Enable word-level timestamps
+        add_phoneme_timestamps=True,    # Enable phonemized timestamps
         stream=True
     )
@@ -326,6 +307,26 @@ client.tts.bytes(..., request_options={
 })
 ```
+### Mixing voices and creating from embeddings
+```python
+# Mix voices together
+mixed_voice = client.voices.mix(
+    voices=[
+        {"id": "voice_id_1", "weight": 0.25},
+        {"id": "voice_id_2", "weight": 0.75}
+    ]
+)
+# Create a new voice from embedding
+new_voice = client.voices.create(
+    name="Test Voice",
+    description="Test voice description",
+    embedding=[...],  # List[float] with 192 dimensions
+    language="en"
+)
+```
 ### Custom Client
 You can override the `httpx` client to customize it for your use-case. Some common use-cases include support for proxies
@@ -343,6 +344,10 @@ client = Cartesia(
 )
 ```
+## Reference
+A full reference for this library is available [here](./reference.md).
 ## Contributing
 Note that most of this library is generated programmatically from

{cartesia-2.0.0b7 → cartesia-2.0.0b8}/pyproject.toml RENAMED Viewed

@@ -3,7 +3,7 @@ name = "cartesia"
 [tool.poetry]
 name = "cartesia"
-version = "2.0.0b7"
+version = "2.0.0b8"
 description = ""
 readme = "README.md"
 authors = []

{cartesia-2.0.0b7 → cartesia-2.0.0b8}/src/cartesia/__init__.py RENAMED Viewed

@@ -129,6 +129,9 @@ from .voices import (
     LocalizeDialect,
     LocalizeDialectParams,
     LocalizeEnglishDialect,
+    LocalizeFrenchDialect,
+    LocalizePortugueseDialect,
+    LocalizeSpanishDialect,
     LocalizeTargetLanguage,
     LocalizeVoiceRequest,
     LocalizeVoiceRequestParams,
@@ -187,6 +190,9 @@ __all__ = [
     "LocalizeDialect",
     "LocalizeDialectParams",
     "LocalizeEnglishDialect",
+    "LocalizeFrenchDialect",
+    "LocalizePortugueseDialect",
+    "LocalizeSpanishDialect",
     "LocalizeTargetLanguage",
     "LocalizeVoiceRequest",
     "LocalizeVoiceRequestParams",

{cartesia-2.0.0b7 → cartesia-2.0.0b8}/src/cartesia/core/client_wrapper.py RENAMED Viewed

@@ -16,7 +16,7 @@ class BaseClientWrapper:
         headers: typing.Dict[str, str] = {
             "X-Fern-Language": "Python",
             "X-Fern-SDK-Name": "cartesia",
-            "X-Fern-SDK-Version": "2.0.0b7",
+            "X-Fern-SDK-Version": "2.0.0b8",
         }
         headers["X-API-Key"] = self.api_key
         headers["Cartesia-Version"] = "2024-11-13"

{cartesia-2.0.0b7 → cartesia-2.0.0b8}/src/cartesia/tts/_async_websocket.py RENAMED Viewed

@@ -69,6 +69,7 @@ class _AsyncTTSContext:
         stream: bool = True,
         add_timestamps: bool = False,
         add_phoneme_timestamps: bool = False,
+        use_original_timestamps: bool = False,
         continue_: bool = False,
         flush: bool = False,
     ) -> None:
@@ -106,6 +107,8 @@ class _AsyncTTSContext:
             request_body["add_timestamps"] = add_timestamps
         if add_phoneme_timestamps:
             request_body["add_phoneme_timestamps"] = add_phoneme_timestamps
+        if use_original_timestamps:
+            request_body["use_original_timestamps"] = use_original_timestamps
         if continue_:
             request_body["continue"] = continue_
         if flush:
@@ -367,6 +370,7 @@ class AsyncTtsWebsocket(TtsWebsocket):
         stream: bool = True,
         add_timestamps: bool = False,
         add_phoneme_timestamps: bool = False,
+        use_original_timestamps: bool = False,
     ):
         """See :meth:`_WebSocket.send` for details."""
         if context_id is None:
@@ -385,6 +389,7 @@ class AsyncTtsWebsocket(TtsWebsocket):
             continue_=False,
             add_timestamps=add_timestamps,
             add_phoneme_timestamps=add_phoneme_timestamps,
+            use_original_timestamps=use_original_timestamps,
         )
         generator = ctx.receive()

{cartesia-2.0.0b7 → cartesia-2.0.0b8}/src/cartesia/tts/_websocket.py RENAMED Viewed

@@ -67,6 +67,8 @@ class _TTSContext:
         language: Optional[str] = None,
         stream: bool = True,
         add_timestamps: bool = False,
+        add_phoneme_timestamps: bool = False,
+        use_original_timestamps: bool = False,
     ) -> Generator[bytes, None, None]:
         """Send audio generation requests to the WebSocket and yield responses.
@@ -102,6 +104,10 @@ class _TTSContext:
             request_body["stream"] = stream
         if add_timestamps:
             request_body["add_timestamps"] = add_timestamps
+        if add_phoneme_timestamps:
+            request_body["add_phoneme_timestamps"] = add_phoneme_timestamps
+        if use_original_timestamps:
+            request_body["use_original_timestamps"] = use_original_timestamps
         if (
             "context_id" in request_body
@@ -354,6 +360,7 @@ class TtsWebsocket:
         stream: bool = True,
         add_timestamps: bool = False,
         add_phoneme_timestamps: bool = False,
+        use_original_timestamps: bool = False,
     ):
         """Send a request to the WebSocket to generate audio.
@@ -384,6 +391,7 @@ class TtsWebsocket:
             "stream": stream,
             "add_timestamps": add_timestamps,
             "add_phoneme_timestamps": add_phoneme_timestamps,
+            "use_original_timestamps": use_original_timestamps,
         }
         generator = self._websocket_generator(request_body)

{cartesia-2.0.0b7 → cartesia-2.0.0b8}/src/cartesia/voices/__init__.py RENAMED Viewed

@@ -12,6 +12,9 @@ from .types import (
     IdSpecifier,
     LocalizeDialect,
     LocalizeEnglishDialect,
+    LocalizeFrenchDialect,
+    LocalizePortugueseDialect,
+    LocalizeSpanishDialect,
     LocalizeTargetLanguage,
     LocalizeVoiceRequest,
     MixVoiceSpecifier,
@@ -56,6 +59,9 @@ __all__ = [
     "LocalizeDialect",
     "LocalizeDialectParams",
     "LocalizeEnglishDialect",
+    "LocalizeFrenchDialect",
+    "LocalizePortugueseDialect",
+    "LocalizeSpanishDialect",
     "LocalizeTargetLanguage",
     "LocalizeVoiceRequest",
     "LocalizeVoiceRequestParams",

cartesia 2.0.0b7__tar.gz → 2.0.0b8__tar.gz

cartesia 2.0.0b7__tar.gz → 2.0.0b8__tar.gz