cartesia 2.0.0b2__py3-none-any.whl → 2.0.0b8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41)
  1. cartesia/__init__.py +10 -0
  2. cartesia/base_client.py +0 -4
  3. cartesia/core/__init__.py +3 -0
  4. cartesia/core/client_wrapper.py +2 -2
  5. cartesia/core/pagination.py +88 -0
  6. cartesia/infill/client.py +4 -4
  7. cartesia/tts/_async_websocket.py +53 -1
  8. cartesia/tts/_websocket.py +52 -3
  9. cartesia/tts/client.py +4 -4
  10. cartesia/tts/requests/generation_request.py +5 -0
  11. cartesia/tts/requests/web_socket_chunk_response.py +3 -0
  12. cartesia/tts/requests/web_socket_response.py +2 -1
  13. cartesia/tts/requests/web_socket_tts_request.py +1 -0
  14. cartesia/tts/types/emotion.py +5 -0
  15. cartesia/tts/types/generation_request.py +5 -0
  16. cartesia/tts/types/web_socket_chunk_response.py +3 -1
  17. cartesia/tts/types/web_socket_response.py +2 -1
  18. cartesia/tts/types/web_socket_tts_output.py +2 -0
  19. cartesia/tts/types/web_socket_tts_request.py +1 -0
  20. cartesia/tts/utils/constants.py +2 -2
  21. cartesia/voice_changer/requests/streaming_response.py +2 -0
  22. cartesia/voice_changer/types/streaming_response.py +2 -0
  23. cartesia/voices/__init__.py +10 -0
  24. cartesia/voices/client.py +209 -44
  25. cartesia/voices/requests/__init__.py +2 -0
  26. cartesia/voices/requests/get_voices_response.py +24 -0
  27. cartesia/voices/requests/localize_dialect.py +4 -1
  28. cartesia/voices/requests/localize_voice_request.py +15 -2
  29. cartesia/voices/requests/voice.py +13 -9
  30. cartesia/voices/types/__init__.py +8 -0
  31. cartesia/voices/types/gender_presentation.py +5 -0
  32. cartesia/voices/types/get_voices_response.py +34 -0
  33. cartesia/voices/types/localize_dialect.py +4 -1
  34. cartesia/voices/types/localize_french_dialect.py +5 -0
  35. cartesia/voices/types/localize_voice_request.py +16 -3
  36. cartesia/voices/types/voice.py +13 -9
  37. cartesia/voices/types/voice_expand_options.py +5 -0
  38. {cartesia-2.0.0b2.dist-info → cartesia-2.0.0b8.dist-info}/METADATA +149 -73
  39. {cartesia-2.0.0b2.dist-info → cartesia-2.0.0b8.dist-info}/RECORD +40 -35
  40. cartesia/datasets/client.py +0 -392
  41. {cartesia-2.0.0b2.dist-info → cartesia-2.0.0b8.dist-info}/WHEEL +0 -0
@@ -6,9 +6,12 @@ from .create_voice_request import CreateVoiceRequest
 from .embedding_response import EmbeddingResponse
 from .embedding_specifier import EmbeddingSpecifier
 from .gender import Gender
+from .gender_presentation import GenderPresentation
+from .get_voices_response import GetVoicesResponse
 from .id_specifier import IdSpecifier
 from .localize_dialect import LocalizeDialect
 from .localize_english_dialect import LocalizeEnglishDialect
+from .localize_french_dialect import LocalizeFrenchDialect
 from .localize_portuguese_dialect import LocalizePortugueseDialect
 from .localize_spanish_dialect import LocalizeSpanishDialect
 from .localize_target_language import LocalizeTargetLanguage
@@ -17,6 +20,7 @@ from .mix_voice_specifier import MixVoiceSpecifier
 from .mix_voices_request import MixVoicesRequest
 from .update_voice_request import UpdateVoiceRequest
 from .voice import Voice
+from .voice_expand_options import VoiceExpandOptions
 from .voice_id import VoiceId
 from .voice_metadata import VoiceMetadata
 from .weight import Weight
@@ -28,9 +32,12 @@ __all__ = [
     "EmbeddingResponse",
     "EmbeddingSpecifier",
     "Gender",
+    "GenderPresentation",
+    "GetVoicesResponse",
     "IdSpecifier",
     "LocalizeDialect",
     "LocalizeEnglishDialect",
+    "LocalizeFrenchDialect",
     "LocalizePortugueseDialect",
     "LocalizeSpanishDialect",
     "LocalizeTargetLanguage",
@@ -39,6 +46,7 @@ __all__ = [
     "MixVoicesRequest",
     "UpdateVoiceRequest",
     "Voice",
+    "VoiceExpandOptions",
     "VoiceId",
     "VoiceMetadata",
     "Weight",
@@ -0,0 +1,5 @@
+# This file was auto-generated by Fern from our API Definition.
+
+import typing
+
+GenderPresentation = typing.Union[typing.Literal["masculine", "feminine", "gender_neutral"], typing.Any]
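The `typing.Union[typing.Literal[...], typing.Any]` shape here (and in the other new aliases below) is Fern's forward-compatible enum pattern: known values are type-checked, while a value added server-side later still validates instead of raising. A minimal sketch of what that buys calling code; the `describe` helper is hypothetical, not part of the SDK:

```python
import typing

# Same alias as in the generated file above.
GenderPresentation = typing.Union[typing.Literal["masculine", "feminine", "gender_neutral"], typing.Any]

KNOWN = ("masculine", "feminine", "gender_neutral")

def describe(g: GenderPresentation) -> str:
    # Handle the values this SDK version knows about...
    if g in KNOWN:
        return str(g)
    # ...and degrade gracefully for values a newer API may return.
    return f"unrecognized presentation: {g!r}"

print(describe("feminine"))     # -> feminine
print(describe("androgynous"))  # -> unrecognized presentation: 'androgynous'
```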
@@ -0,0 +1,34 @@
+# This file was auto-generated by Fern from our API Definition.
+
+from ...core.pydantic_utilities import UniversalBaseModel
+import typing
+from .voice import Voice
+import pydantic
+from .voice_id import VoiceId
+from ...core.pydantic_utilities import IS_PYDANTIC_V2
+
+
+class GetVoicesResponse(UniversalBaseModel):
+    data: typing.List[Voice] = pydantic.Field()
+    """
+    The paginated list of Voices.
+    """
+
+    has_more: bool = pydantic.Field()
+    """
+    Whether there are more Voices to fetch (using `starting_after=id`, where id is the ID of the last Voice in the current response).
+    """
+
+    next_page: typing.Optional[VoiceId] = pydantic.Field(default=None)
+    """
+    (Deprecated - use the id of the last Voice in the current response instead.) An ID that can be passed as `starting_after` to get the next page of Voices.
+    """
+
+    if IS_PYDANTIC_V2:
+        model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True)  # type: ignore # Pydantic v2
+    else:
+
+        class Config:
+            frozen = True
+            smart_union = True
+            extra = pydantic.Extra.allow
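`GetVoicesResponse` is the page shape behind the new `cartesia/core/pagination.py` helper (see the file list above); the README section further down confirms that `client.voices.list` iterates pages for you. For illustration, a minimal sketch of the cursor loop that the `has_more`/`starting_after` docstrings describe, written against the raw HTTP API; the `/voices/` path, `limit` parameter, and header name are assumptions, not taken from this diff:

```python
import os
import httpx

def iter_all_voices():
    """Yield every Voice by following the has_more / starting_after cursor."""
    starting_after = None
    while True:
        params = {"limit": 100}  # assumed query parameter
        if starting_after is not None:
            params["starting_after"] = starting_after
        page = httpx.get(
            "https://api.cartesia.ai/voices/",  # assumed endpoint
            params=params,
            headers={"X-API-Key": os.environ["CARTESIA_API_KEY"]},  # assumed header
        ).json()
        yield from page["data"]
        if not page["has_more"]:
            return
        # The cursor is the ID of the last Voice in the current response;
        # next_page is deprecated per the docstring above.
        starting_after = page["data"][-1]["id"]

for voice in iter_all_voices():
    print(voice["id"], voice["name"])
```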
@@ -4,5 +4,8 @@ import typing
 from .localize_english_dialect import LocalizeEnglishDialect
 from .localize_spanish_dialect import LocalizeSpanishDialect
 from .localize_portuguese_dialect import LocalizePortugueseDialect
+from .localize_french_dialect import LocalizeFrenchDialect
 
-LocalizeDialect = typing.Union[LocalizeEnglishDialect, LocalizeSpanishDialect, LocalizePortugueseDialect]
+LocalizeDialect = typing.Union[
+    LocalizeEnglishDialect, LocalizeSpanishDialect, LocalizePortugueseDialect, LocalizeFrenchDialect
+]
@@ -0,0 +1,5 @@
+# This file was auto-generated by Fern from our API Definition.
+
+import typing
+
+LocalizeFrenchDialect = typing.Union[typing.Literal["eu", "ca"], typing.Any]
@@ -1,17 +1,30 @@
 # This file was auto-generated by Fern from our API Definition.
 
 from ...core.pydantic_utilities import UniversalBaseModel
-from ...embedding.types.embedding import Embedding
+import pydantic
 from .localize_target_language import LocalizeTargetLanguage
 from .gender import Gender
 import typing
 from .localize_dialect import LocalizeDialect
 from ...core.pydantic_utilities import IS_PYDANTIC_V2
-import pydantic
 
 
 class LocalizeVoiceRequest(UniversalBaseModel):
-    embedding: Embedding
+    voice_id: str = pydantic.Field()
+    """
+    The ID of the voice to localize.
+    """
+
+    name: str = pydantic.Field()
+    """
+    The name of the new localized voice.
+    """
+
+    description: str = pydantic.Field()
+    """
+    The description of the new localized voice.
+    """
+
     language: LocalizeTargetLanguage
     original_speaker_gender: Gender
     dialect: typing.Optional[LocalizeDialect] = None
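Together with the new `LocalizeFrenchDialect` alias above (`"eu"`/`"ca"`, presumably European and Canadian French), this rework means localization now starts from a `voice_id` plus a `name` and `description` for the new voice, rather than from a raw embedding. A minimal sketch, assuming `client.voices.localize` takes the model's fields as keyword arguments and that `"fr"` is a valid `LocalizeTargetLanguage`:

```python
import os
from cartesia import Cartesia

client = Cartesia(api_key=os.getenv("CARTESIA_API_KEY"))

# Localize an existing voice by ID (earlier betas took an embedding instead).
localized = client.voices.localize(
    voice_id="a0e99841-438c-4a64-b679-ae501e7d6091",
    name="French localized voice",
    description="A French variant of my base voice.",
    language="fr",                     # assumed LocalizeTargetLanguage value
    original_speaker_gender="female",
    dialect="eu",                      # new LocalizeFrenchDialect option
)
print(localized)
```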
@@ -2,9 +2,9 @@
 
 from ...core.pydantic_utilities import UniversalBaseModel
 from .voice_id import VoiceId
-import typing
 import pydantic
 import datetime as dt
+import typing
 from ...embedding.types.embedding import Embedding
 from ...tts.types.supported_language import SupportedLanguage
 from ...core.pydantic_utilities import IS_PYDANTIC_V2
@@ -12,14 +12,9 @@ from ...core.pydantic_utilities import IS_PYDANTIC_V2
 
 class Voice(UniversalBaseModel):
     id: VoiceId
-    user_id: typing.Optional[str] = pydantic.Field(default=None)
-    """
-    The ID of the user who owns the voice.
+    is_owner: bool = pydantic.Field()
     """
-
-    is_public: bool = pydantic.Field()
-    """
-    Whether the voice is publicly accessible.
+    Whether the current user is the owner of the voice.
     """
 
     name: str = pydantic.Field()
37
32
  The date and time the voice was created.
38
33
  """
39
34
 
40
- embedding: Embedding
35
+ embedding: typing.Optional[Embedding] = pydantic.Field(default=None)
36
+ """
37
+ The vector embedding of the voice. Only included when `expand` includes `embedding`.
38
+ """
39
+
40
+ is_starred: typing.Optional[bool] = pydantic.Field(default=None)
41
+ """
42
+ Whether the current user has starred the voice. Only included when `expand` includes `is_starred`.
43
+ """
44
+
41
45
  language: SupportedLanguage
42
46
 
43
47
  if IS_PYDANTIC_V2:
@@ -0,0 +1,5 @@
+# This file was auto-generated by Fern from our API Definition.
+
+import typing
+
+VoiceExpandOptions = typing.Union[typing.Literal["embedding", "is_starred"], typing.Any]
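`Voice.embedding` and `Voice.is_starred` are now optional and only populated on request, which is what `VoiceExpandOptions` enumerates. A minimal sketch, assuming the list endpoint forwards an `expand` parameter of these values:

```python
import os
from cartesia import Cartesia

client = Cartesia(api_key=os.getenv("CARTESIA_API_KEY"))

# Without expand, voice.embedding and voice.is_starred come back as None.
for voice in client.voices.list(expand=["embedding", "is_starred"]):  # `expand` is assumed here
    if voice.embedding is not None:
        print(voice.name, "dims:", len(voice.embedding), "starred:", voice.is_starred)
```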
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: cartesia
-Version: 2.0.0b2
+Version: 2.0.0b8
 Summary:
 Requires-Python: >=3.8,<4.0
 Classifier: Intended Audience :: Developers
@@ -47,53 +47,6 @@ Our complete API documentation can be found [on docs.cartesia.ai](https://docs.c
 pip install cartesia
 ```
 
-## Reference
-
-A full reference for this library is available [here](./reference.md).
-
-## Voices
-
-```python
-from cartesia import Cartesia
-import os
-
-client = Cartesia(api_key=os.getenv("CARTESIA_API_KEY"))
-
-# Get all available voices
-voices = client.voices.list()
-print(voices)
-
-# Get a specific voice
-voice = client.voices.get(id="a0e99841-438c-4a64-b679-ae501e7d6091")
-print("The embedding for", voice.name, "is", voice.embedding)
-
-# Clone a voice using file data
-cloned_voice = client.voices.clone(
-    clip=open("path/to/voice.wav", "rb"),
-    name="Test cloned voice",
-    language="en",
-    mode="similarity", # or "stability"
-    enhance=False, # use enhance=True to clean and denoise the cloning audio
-    description="Test voice description"
-)
-
-# Mix voices together
-mixed_voice = client.voices.mix(
-    voices=[
-        {"id": "voice_id_1", "weight": 0.25},
-        {"id": "voice_id_2", "weight": 0.75}
-    ]
-)
-
-# Create a new voice from embedding
-new_voice = client.voices.create(
-    name="Test Voice",
-    description="Test voice description",
-    embedding=[...], # List[float] with 192 dimensions
-    language="en"
-)
-```
-
 ## Usage
 
 Instantiate and use the client with the following:
@@ -107,15 +60,11 @@ client = Cartesia(
     api_key=os.getenv("CARTESIA_API_KEY"),
 )
 client.tts.bytes(
-    model_id="sonic-english",
+    model_id="sonic-2",
     transcript="Hello, world!",
     voice={
         "mode": "id",
         "id": "694f9389-aac1-45b6-b726-9d9369183238",
-        "experimental_controls": {
-            "speed": 0.5, # range between [-1.0, 1.0], or "slow", "fastest", etc.
-            "emotion": ["positivity", "curiosity:low"] # list of emotions with optional intensity
-        }
     },
     language="en",
     output_format={
@@ -143,7 +92,7 @@ client = AsyncCartesia(
 
 async def main() -> None:
     async for output in client.tts.bytes(
-        model_id="sonic-english",
+        model_id="sonic-2",
         transcript="Hello, world!",
         voice={"id": "694f9389-aac1-45b6-b726-9d9369183238"},
         language="en",
@@ -176,7 +125,7 @@ except ApiError as e:
 
 ## Streaming
 
-The SDK supports streaming responses, as well, the response will be a generator that you can loop over.
+The SDK supports streaming responses as well, returning a generator that you can iterate over with a `for ... in ...` loop:
 
 ```python
 from cartesia import Cartesia
@@ -188,7 +137,7 @@ def get_tts_chunks():
         api_key=os.getenv("CARTESIA_API_KEY"),
     )
     response = client.tts.sse(
-        model_id="sonic",
+        model_id="sonic-2",
         transcript="Hello world!",
         voice={
             "id": "f9836c6e-a0bd-460e-9d3c-f7299fa60f94",
@@ -204,7 +153,7 @@ def get_tts_chunks():
             "sample_rate": 44100,
         },
     )
-
+    
     audio_chunks = []
     for chunk in response:
         audio_chunks.append(chunk)
@@ -215,7 +164,9 @@ for chunk in chunks:
     print(f"Received chunk of size: {len(chunk.data)}")
 ```
 
-## WebSocket
+## WebSockets
+
+For the lowest latency in advanced use cases (such as streaming in an LLM-generated transcript and streaming out audio), you should use our WebSocket client:
 
 ```python
 from cartesia import Cartesia
@@ -223,15 +174,10 @@ from cartesia.tts import TtsRequestEmbeddingSpecifierParams, OutputFormat_RawPar
 import pyaudio
 import os
 
-client = Cartesia(
-    api_key=os.getenv("CARTESIA_API_KEY"),
-)
+client = Cartesia(api_key=os.getenv("CARTESIA_API_KEY"))
 voice_id = "a0e99841-438c-4a64-b679-ae501e7d6091"
 transcript = "Hello! Welcome to Cartesia"
 
-# You can check out our models at https://docs.cartesia.ai/getting-started/available-models
-model_id = "sonic"
-
 p = pyaudio.PyAudio()
 rate = 22050
@@ -242,14 +188,14 @@ ws = client.tts.websocket()
 
 # Generate and stream audio using the websocket
 for output in ws.send(
-    model_id=model_id,
+    model_id="sonic-2", # see: https://docs.cartesia.ai/getting-started/available-models
    transcript=transcript,
     voice={"id": voice_id},
     stream=True,
     output_format={
         "container": "raw",
-        "encoding": "pcm_f32le",
-        "sample_rate": 22050
+        "encoding": "pcm_f32le",
+        "sample_rate": rate
     },
 ):
     buffer = output.audio
@@ -267,6 +213,90 @@ p.terminate()
 ws.close() # Close the websocket connection
 ```
 
+## Voices
+
+List all available Voices with `client.voices.list`, which returns an iterable that automatically handles pagination:
+
+```python
+from cartesia import Cartesia
+import os
+
+client = Cartesia(api_key=os.getenv("CARTESIA_API_KEY"))
+
+# Get all available Voices
+voices = client.voices.list()
+for voice in voices:
+    print(voice)
+```
+
+You can also get the complete metadata for a specific Voice, or make a new Voice by cloning from an audio sample:
+
+```python
+# Get a specific Voice
+voice = client.voices.get(id="a0e99841-438c-4a64-b679-ae501e7d6091")
+print("The embedding for", voice.name, "is", voice.embedding)
+
+# Clone a Voice using file data
+cloned_voice = client.voices.clone(
+    clip=open("path/to/voice.wav", "rb"),
+    name="Test cloned voice",
+    language="en",
+    mode="similarity", # or "stability"
+    enhance=False, # use enhance=True to clean and denoise the cloning audio
+    description="Test voice description"
+)
+```
+
+## Requesting Timestamps
+
+```python
+import asyncio
+from cartesia import AsyncCartesia
+import os
+
+async def main():
+    client = AsyncCartesia(api_key=os.getenv("CARTESIA_API_KEY"))
+
+    # Connect to the websocket
+    ws = await client.tts.websocket()
+
+    # Generate speech with timestamps
+    output_generate = await ws.send(
+        model_id="sonic-2",
+        transcript="Hello! Welcome to Cartesia's text-to-speech.",
+        voice={"id": "f9836c6e-a0bd-460e-9d3c-f7299fa60f94"},
+        output_format={
+            "container": "raw",
+            "encoding": "pcm_f32le",
+            "sample_rate": 44100
+        },
+        add_timestamps=True, # Enable word-level timestamps
+        add_phoneme_timestamps=True, # Enable phonemized timestamps
+        stream=True
+    )
+
+    # Process the streaming response with timestamps
+    all_words = []
+    all_starts = []
+    all_ends = []
+    audio_chunks = []
+
+    async for out in output_generate:
+        # Collect audio data
+        if out.audio is not None:
+            audio_chunks.append(out.audio)
+
+        # Process timestamp data
+        if out.word_timestamps is not None:
+            all_words.extend(out.word_timestamps.words) # List of words
+            all_starts.extend(out.word_timestamps.start) # Start time for each word (seconds)
+            all_ends.extend(out.word_timestamps.end) # End time for each word (seconds)
+
+    await ws.close()
+
+asyncio.run(main())
+```
+
 
 ## Advanced
 ### Retries
@@ -309,6 +339,26 @@ client.tts.bytes(..., request_options={
 })
 ```
 
+### Mixing voices and creating from embeddings
+
+```python
+# Mix voices together
+mixed_voice = client.voices.mix(
+    voices=[
+        {"id": "voice_id_1", "weight": 0.25},
+        {"id": "voice_id_2", "weight": 0.75}
+    ]
+)
+
+# Create a new voice from embedding
+new_voice = client.voices.create(
+    name="Test Voice",
+    description="Test voice description",
+    embedding=[...], # List[float] with 192 dimensions
+    language="en"
+)
+```
+
 ### Custom Client
 
 You can override the `httpx` client to customize it for your use-case. Some common use-cases include support for proxies
@@ -326,13 +376,39 @@ client = Cartesia(
 )
 ```
 
+## Reference
+
+A full reference for this library is available [here](./reference.md).
+
 ## Contributing
 
-While we value open-source contributions to this SDK, this library is generated programmatically.
-Additions made directly to this library would have to be moved over to our generation code,
-otherwise they would be overwritten upon the next generated release. Feel free to open a PR as
-a proof of concept, but know that we will not be able to merge it as-is. We suggest opening
-an issue first to discuss with us!
+Note that most of this library is generated programmatically from
+<https://github.com/cartesia-ai/docs>. Before making edits to a file, verify it's not autogenerated
+by checking for this comment at the top of the file:
+
+```
+# This file was auto-generated by Fern from our API Definition.
+```
+
+### Running tests
+
+```sh
+uv pip install -r requirements.txt
+uv run pytest -rP -vv tests/custom/test_client.py::test_get_voices
+```
+### Manually generating SDK code from docs
+
+Assuming all your repos are cloned into your home directory:
+
+```sh
+$ cd ~/docs
+$ fern generate --group python-sdk --log-level debug --api version-2024-11-13 --preview
+$ cd ~/cartesia-python
+$ git pull ~/docs/fern/apis/version-2024-11-13/.preview/fern-python-sdk
+$ git commit --amend -m "manually regenerate from docs" # optional
+```
+
+### Automatically generating new SDK releases
 
-On the other hand, contributions to the README are always very welcome!
+From https://github.com/cartesia-ai/docs click `Actions` then `Release Python SDK`. (Requires permissions.)
 