cartesia 2.0.0b1__py3-none-any.whl → 2.0.0b7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cartesia/__init__.py +8 -4
- cartesia/base_client.py +0 -4
- cartesia/core/__init__.py +3 -0
- cartesia/core/client_wrapper.py +2 -2
- cartesia/core/pagination.py +88 -0
- cartesia/infill/client.py +4 -4
- cartesia/tts/_async_websocket.py +48 -1
- cartesia/tts/_websocket.py +44 -3
- cartesia/tts/client.py +4 -4
- cartesia/tts/requests/generation_request.py +5 -0
- cartesia/tts/requests/web_socket_chunk_response.py +3 -0
- cartesia/tts/requests/web_socket_response.py +2 -1
- cartesia/tts/requests/web_socket_tts_request.py +1 -0
- cartesia/tts/types/emotion.py +5 -0
- cartesia/tts/types/generation_request.py +5 -0
- cartesia/tts/types/web_socket_chunk_response.py +3 -1
- cartesia/tts/types/web_socket_response.py +2 -1
- cartesia/tts/types/web_socket_tts_output.py +2 -0
- cartesia/tts/types/web_socket_tts_request.py +1 -0
- cartesia/tts/utils/constants.py +2 -2
- cartesia/voice_changer/requests/streaming_response.py +2 -0
- cartesia/voice_changer/types/streaming_response.py +2 -0
- cartesia/voices/__init__.py +8 -4
- cartesia/voices/client.py +285 -169
- cartesia/voices/requests/__init__.py +2 -0
- cartesia/voices/requests/create_voice_request.py +0 -2
- cartesia/voices/requests/get_voices_response.py +24 -0
- cartesia/voices/requests/localize_dialect.py +1 -3
- cartesia/voices/requests/voice.py +13 -9
- cartesia/voices/types/__init__.py +6 -4
- cartesia/voices/types/create_voice_request.py +0 -2
- cartesia/voices/types/gender_presentation.py +5 -0
- cartesia/voices/types/get_voices_response.py +34 -0
- cartesia/voices/types/localize_dialect.py +1 -3
- cartesia/voices/types/voice.py +13 -9
- cartesia/voices/types/voice_expand_options.py +5 -0
- {cartesia-2.0.0b1.dist-info → cartesia-2.0.0b7.dist-info}/METADATA +151 -49
- {cartesia-2.0.0b1.dist-info → cartesia-2.0.0b7.dist-info}/RECORD +39 -37
- cartesia/datasets/client.py +0 -392
- cartesia/voices/types/localize_portuguese_dialect.py +0 -5
- cartesia/voices/types/localize_spanish_dialect.py +0 -5
- {cartesia-2.0.0b1.dist-info → cartesia-2.0.0b7.dist-info}/WHEEL +0 -0

cartesia/voices/requests/__init__.py
CHANGED
@@ -3,6 +3,7 @@
 from .create_voice_request import CreateVoiceRequestParams
 from .embedding_response import EmbeddingResponseParams
 from .embedding_specifier import EmbeddingSpecifierParams
+from .get_voices_response import GetVoicesResponseParams
 from .id_specifier import IdSpecifierParams
 from .localize_dialect import LocalizeDialectParams
 from .localize_voice_request import LocalizeVoiceRequestParams
@@ -16,6 +17,7 @@ __all__ = [
     "CreateVoiceRequestParams",
     "EmbeddingResponseParams",
     "EmbeddingSpecifierParams",
+    "GetVoicesResponseParams",
     "IdSpecifierParams",
     "LocalizeDialectParams",
     "LocalizeVoiceRequestParams",

cartesia/voices/requests/create_voice_request.py
CHANGED
@@ -4,7 +4,6 @@ import typing_extensions
 from ...embedding.types.embedding import Embedding
 import typing_extensions
 from ...tts.types.supported_language import SupportedLanguage
-from ..types.base_voice_id import BaseVoiceId


 class CreateVoiceRequestParams(typing_extensions.TypedDict):
@@ -20,4 +19,3 @@ class CreateVoiceRequestParams(typing_extensions.TypedDict):

     embedding: Embedding
     language: typing_extensions.NotRequired[SupportedLanguage]
-    base_voice_id: typing_extensions.NotRequired[BaseVoiceId]

cartesia/voices/requests/get_voices_response.py
ADDED
@@ -0,0 +1,24 @@
+# This file was auto-generated by Fern from our API Definition.
+
+import typing_extensions
+import typing
+from .voice import VoiceParams
+import typing_extensions
+from ..types.voice_id import VoiceId
+
+
+class GetVoicesResponseParams(typing_extensions.TypedDict):
+    data: typing.Sequence[VoiceParams]
+    """
+    The paginated list of Voices.
+    """
+
+    has_more: bool
+    """
+    Whether there are more Voices to fetch (using `starting_after=id`, where id is the ID of the last Voice in the current response).
+    """
+
+    next_page: typing_extensions.NotRequired[VoiceId]
+    """
+    (Deprecated - use the id of the last Voice in the current response instead.) An ID that can be passed as `starting_after` to get the next page of Voices.
+    """
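
The `has_more` and `starting_after` docstrings above describe the cursor protocol for paging through voices. A minimal pagination sketch built on them, assuming `client.voices.list()` returns this response shape and accepts the `starting_after` keyword named in the docstrings:

```python
import os
from cartesia import Cartesia

client = Cartesia(api_key=os.getenv("CARTESIA_API_KEY"))

# Cursor-driven pagination: keep fetching while has_more is set, passing the
# id of the last Voice in each page as starting_after (per the docstrings).
all_voices = []
cursor = None
while True:
    page = client.voices.list(starting_after=cursor) if cursor else client.voices.list()
    all_voices.extend(page.data)
    if not page.has_more or not page.data:
        break
    cursor = page.data[-1].id  # preferred over the deprecated next_page field
```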

cartesia/voices/requests/localize_dialect.py
CHANGED
@@ -2,7 +2,5 @@

 import typing
 from ..types.localize_english_dialect import LocalizeEnglishDialect
-from ..types.localize_spanish_dialect import LocalizeSpanishDialect
-from ..types.localize_portuguese_dialect import LocalizePortugueseDialect

-LocalizeDialectParams = typing.Union[LocalizeEnglishDialect, LocalizeSpanishDialect, LocalizePortugueseDialect]
+LocalizeDialectParams = typing.Union[LocalizeEnglishDialect]

cartesia/voices/requests/voice.py
CHANGED
@@ -2,22 +2,17 @@

 import typing_extensions
 from ..types.voice_id import VoiceId
-import typing_extensions
 import datetime as dt
+import typing_extensions
 from ...embedding.types.embedding import Embedding
 from ...tts.types.supported_language import SupportedLanguage


 class VoiceParams(typing_extensions.TypedDict):
     id: VoiceId
-
-    """
-    The ID of the user who owns the voice.
+    is_owner: bool
     """
-
-    is_public: bool
-    """
-    Whether the voice is publicly accessible.
+    Whether the current user is the owner of the voice.
     """

     name: str
@@ -35,5 +30,14 @@ class VoiceParams(typing_extensions.TypedDict):
     The date and time the voice was created.
     """

-    embedding: Embedding
+    embedding: typing_extensions.NotRequired[Embedding]
+    """
+    The vector embedding of the voice. Only included when `expand` includes `embedding`.
+    """
+
+    is_starred: typing_extensions.NotRequired[bool]
+    """
+    Whether the current user has starred the voice. Only included when `expand` includes `is_starred`.
+    """
+
     language: SupportedLanguage
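
Because `VoiceParams` is a TypedDict, the required/optional split above shows up at construction time. An illustrative literal; the values are made up, the `created_at` key name is inferred from its docstring, and fields outside this hunk (such as a description) are omitted, so a strict type checker may flag missing required keys:

```python
import datetime as dt
from cartesia.voices.requests.voice import VoiceParams

voice: VoiceParams = {
    "id": "a0e99841-438c-4a64-b679-ae501e7d6091",  # hypothetical VoiceId
    "is_owner": True,
    "name": "My Voice",
    "created_at": dt.datetime.now(dt.timezone.utc),  # key name assumed from the docstring
    "language": "en",
    # NotRequired fields are present only when requested via `expand`:
    "embedding": [0.1, 0.2],  # truncated for illustration
    "is_starred": False,
}
```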

cartesia/voices/types/__init__.py
CHANGED
@@ -6,17 +6,18 @@ from .create_voice_request import CreateVoiceRequest
 from .embedding_response import EmbeddingResponse
 from .embedding_specifier import EmbeddingSpecifier
 from .gender import Gender
+from .gender_presentation import GenderPresentation
+from .get_voices_response import GetVoicesResponse
 from .id_specifier import IdSpecifier
 from .localize_dialect import LocalizeDialect
 from .localize_english_dialect import LocalizeEnglishDialect
-from .localize_portuguese_dialect import LocalizePortugueseDialect
-from .localize_spanish_dialect import LocalizeSpanishDialect
 from .localize_target_language import LocalizeTargetLanguage
 from .localize_voice_request import LocalizeVoiceRequest
 from .mix_voice_specifier import MixVoiceSpecifier
 from .mix_voices_request import MixVoicesRequest
 from .update_voice_request import UpdateVoiceRequest
 from .voice import Voice
+from .voice_expand_options import VoiceExpandOptions
 from .voice_id import VoiceId
 from .voice_metadata import VoiceMetadata
 from .weight import Weight
@@ -28,17 +29,18 @@ __all__ = [
     "EmbeddingResponse",
     "EmbeddingSpecifier",
     "Gender",
+    "GenderPresentation",
+    "GetVoicesResponse",
     "IdSpecifier",
     "LocalizeDialect",
     "LocalizeEnglishDialect",
-    "LocalizePortugueseDialect",
-    "LocalizeSpanishDialect",
     "LocalizeTargetLanguage",
     "LocalizeVoiceRequest",
     "MixVoiceSpecifier",
     "MixVoicesRequest",
     "UpdateVoiceRequest",
     "Voice",
+    "VoiceExpandOptions",
     "VoiceId",
     "VoiceMetadata",
     "Weight",

cartesia/voices/types/create_voice_request.py
CHANGED
@@ -5,7 +5,6 @@ import pydantic
 from ...embedding.types.embedding import Embedding
 import typing
 from ...tts.types.supported_language import SupportedLanguage
-from .base_voice_id import BaseVoiceId
 from ...core.pydantic_utilities import IS_PYDANTIC_V2


@@ -22,7 +21,6 @@ class CreateVoiceRequest(UniversalBaseModel):

     embedding: Embedding
     language: typing.Optional[SupportedLanguage] = None
-    base_voice_id: typing.Optional[BaseVoiceId] = None

     if IS_PYDANTIC_V2:
         model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True)  # type: ignore # Pydantic v2

cartesia/voices/types/get_voices_response.py
ADDED
@@ -0,0 +1,34 @@
+# This file was auto-generated by Fern from our API Definition.
+
+from ...core.pydantic_utilities import UniversalBaseModel
+import typing
+from .voice import Voice
+import pydantic
+from .voice_id import VoiceId
+from ...core.pydantic_utilities import IS_PYDANTIC_V2
+
+
+class GetVoicesResponse(UniversalBaseModel):
+    data: typing.List[Voice] = pydantic.Field()
+    """
+    The paginated list of Voices.
+    """
+
+    has_more: bool = pydantic.Field()
+    """
+    Whether there are more Voices to fetch (using `starting_after=id`, where id is the ID of the last Voice in the current response).
+    """
+
+    next_page: typing.Optional[VoiceId] = pydantic.Field(default=None)
+    """
+    (Deprecated - use the id of the last Voice in the current response instead.) An ID that can be passed as `starting_after` to get the next page of Voices.
+    """
+
+    if IS_PYDANTIC_V2:
+        model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True)  # type: ignore # Pydantic v2
+    else:
+
+        class Config:
+            frozen = True
+            smart_union = True
+            extra = pydantic.Extra.allow
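
The docstrings spell out the cursor semantics, including the deprecation of `next_page`. A small helper sketch that follows that guidance for a response produced by the voices client:

```python
import typing
from cartesia.voices.types import GetVoicesResponse, VoiceId

def next_cursor(resp: GetVoicesResponse) -> typing.Optional[VoiceId]:
    """Return the starting_after cursor for the next page, or None when done."""
    if not resp.has_more or not resp.data:
        return None
    # Per the docstrings, prefer the last Voice's id over the deprecated next_page.
    return resp.data[-1].id
```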

cartesia/voices/types/localize_dialect.py
CHANGED
@@ -2,7 +2,5 @@

 import typing
 from .localize_english_dialect import LocalizeEnglishDialect
-from .localize_spanish_dialect import LocalizeSpanishDialect
-from .localize_portuguese_dialect import LocalizePortugueseDialect

-LocalizeDialect = typing.Union[LocalizeEnglishDialect, LocalizeSpanishDialect, LocalizePortugueseDialect]
+LocalizeDialect = typing.Union[LocalizeEnglishDialect]
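
Since `typing.Union` with a single member collapses to that member, `LocalizeDialect` (and `LocalizeDialectParams` above) is now effectively an alias for `LocalizeEnglishDialect`:

```python
from cartesia.voices.types import LocalizeDialect, LocalizeEnglishDialect

# typing.Union[X] with a single member normalizes to X itself, so the two
# aliases compare equal after this change.
assert LocalizeDialect == LocalizeEnglishDialect
```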

cartesia/voices/types/voice.py
CHANGED
@@ -2,9 +2,9 @@

 from ...core.pydantic_utilities import UniversalBaseModel
 from .voice_id import VoiceId
-import typing
 import pydantic
 import datetime as dt
+import typing
 from ...embedding.types.embedding import Embedding
 from ...tts.types.supported_language import SupportedLanguage
 from ...core.pydantic_utilities import IS_PYDANTIC_V2
@@ -12,14 +12,9 @@ from ...core.pydantic_utilities import IS_PYDANTIC_V2

 class Voice(UniversalBaseModel):
     id: VoiceId
-
-    """
-    The ID of the user who owns the voice.
+    is_owner: bool = pydantic.Field()
     """
-
-    is_public: bool = pydantic.Field()
-    """
-    Whether the voice is publicly accessible.
+    Whether the current user is the owner of the voice.
     """

     name: str = pydantic.Field()
@@ -37,7 +32,16 @@ class Voice(UniversalBaseModel):
     The date and time the voice was created.
     """

-    embedding: Embedding
+    embedding: typing.Optional[Embedding] = pydantic.Field(default=None)
+    """
+    The vector embedding of the voice. Only included when `expand` includes `embedding`.
+    """
+
+    is_starred: typing.Optional[bool] = pydantic.Field(default=None)
+    """
+    Whether the current user has starred the voice. Only included when `expand` includes `is_starred`.
+    """
+
     language: SupportedLanguage

     if IS_PYDANTIC_V2:
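
With `embedding` and `is_starred` optional and populated only on request, callers should ask for them explicitly and guard for `None`. A minimal sketch, assuming `voices.list()` returns the `GetVoicesResponse` above and forwards an `expand` keyword matching the new `VoiceExpandOptions` type (the keyword name is an assumption, not a confirmed signature):

```python
import os
from cartesia import Cartesia

client = Cartesia(api_key=os.getenv("CARTESIA_API_KEY"))

# Assumed: list() accepts an `expand` argument mirroring VoiceExpandOptions
# ("embedding", "is_starred"); without it these fields deserialize as None.
voices = client.voices.list(expand=["embedding", "is_starred"])
for voice in voices.data:
    if voice.embedding is not None:
        print(voice.name, "embedding dimensions:", len(voice.embedding))
    if voice.is_starred:
        print(voice.name, "is starred")
```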

{cartesia-2.0.0b1.dist-info → cartesia-2.0.0b7.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: cartesia
-Version: 2.0.0b1
+Version: 2.0.0b7
 Summary:
 Requires-Python: >=3.8,<4.0
 Classifier: Intended Audience :: Developers
@@ -57,7 +57,7 @@ A full reference for this library is available [here](./reference.md).
 from cartesia import Cartesia
 import os

-client = Cartesia(api_key=os.
+client = Cartesia(api_key=os.getenv("CARTESIA_API_KEY"))

 # Get all available voices
 voices = client.voices.list()

@@ -65,21 +65,32 @@ print(voices)

 # Get a specific voice
 voice = client.voices.get(id="a0e99841-438c-4a64-b679-ae501e7d6091")
-print("The embedding for", voice
+print("The embedding for", voice.name, "is", voice.embedding)

-# Clone a voice using
-
+# Clone a voice using file data
+cloned_voice = client.voices.clone(
+    clip=open("path/to/voice.wav", "rb"),
+    name="Test cloned voice",
+    language="en",
+    mode="similarity",  # or "stability"
+    enhance=False,  # use enhance=True to clean and denoise the cloning audio
+    description="Test voice description"
+)

 # Mix voices together
-
-[
+mixed_voice = client.voices.mix(
+    voices=[
+        {"id": "voice_id_1", "weight": 0.25},
+        {"id": "voice_id_2", "weight": 0.75}
+    ]
 )

-# Create a new voice
+# Create a new voice from embedding
 new_voice = client.voices.create(
-    name="
-    description="
-    embedding=
+    name="Test Voice",
+    description="Test voice description",
+    embedding=[...],  # List[float] with 192 dimensions
+    language="en"
 )
 ```


@@ -90,15 +101,22 @@ Instantiate and use the client with the following:
 ```python
 from cartesia import Cartesia
 from cartesia.tts import OutputFormat_Raw, TtsRequestIdSpecifier
+import os

 client = Cartesia(
-    api_key="
+    api_key=os.getenv("CARTESIA_API_KEY"),
 )
 client.tts.bytes(
-    model_id="sonic-
+    model_id="sonic-2",
     transcript="Hello, world!",
-    voice={
-
+    voice={
+        "mode": "id",
+        "id": "694f9389-aac1-45b6-b726-9d9369183238",
+        "experimental_controls": {
+            "speed": 0.5,  # range between [-1.0, 1.0], or "slow", "fastest", etc.
+            "emotion": ["positivity", "curiosity:low"]  # list of emotions with optional intensity
+        }
+    },
     language="en",
     output_format={
         "container": "raw",

@@ -114,18 +132,18 @@ The SDK also exports an `async` client so that you can make non-blocking calls t

 ```python
 import asyncio
+import os

 from cartesia import AsyncCartesia
 from cartesia.tts import OutputFormat_Raw, TtsRequestIdSpecifier

 client = AsyncCartesia(
-    api_key="
+    api_key=os.getenv("CARTESIA_API_KEY"),
 )

-
 async def main() -> None:
-
-    model_id="sonic-
+    async for output in client.tts.bytes(
+        model_id="sonic-2",
         transcript="Hello, world!",
         voice={"id": "694f9389-aac1-45b6-b726-9d9369183238"},
         language="en",
@@ -134,7 +152,8 @@ async def main() -> None:
             "sample_rate": 44100,
             "encoding": "pcm_f32le",
         },
-)
+    ):
+        print(f"Received chunk of size: {len(output)}")


 asyncio.run(main())

@@ -162,26 +181,38 @@ The SDK supports streaming responses, as well, the response will be a generator
 ```python
 from cartesia import Cartesia
 from cartesia.tts import Controls, OutputFormat_RawParams, TtsRequestIdSpecifierParams
+import os

-
-
-)
-
-
-
-
-
-
-
-
+def get_tts_chunks():
+    client = Cartesia(
+        api_key=os.getenv("CARTESIA_API_KEY"),
+    )
+    response = client.tts.sse(
+        model_id="sonic-2",
+        transcript="Hello world!",
+        voice={
+            "id": "f9836c6e-a0bd-460e-9d3c-f7299fa60f94",
+            "experimental_controls": {
+                "speed": "normal",
+                "emotion": [],
+            },
         },
-
-
-
-
-
-
-
+        language="en",
+        output_format={
+            "container": "raw",
+            "encoding": "pcm_f32le",
+            "sample_rate": 44100,
+        },
+    )
+
+    audio_chunks = []
+    for chunk in response:
+        audio_chunks.append(chunk)
+    return audio_chunks
+
+chunks = get_tts_chunks()
+for chunk in chunks:
+    print(f"Received chunk of size: {len(chunk.data)}")
 ```

 ## WebSocket

@@ -190,16 +221,16 @@
 from cartesia import Cartesia
 from cartesia.tts import TtsRequestEmbeddingSpecifierParams, OutputFormat_RawParams
 import pyaudio
+import os

 client = Cartesia(
-    api_key="
+    api_key=os.getenv("CARTESIA_API_KEY"),
 )
 voice_id = "a0e99841-438c-4a64-b679-ae501e7d6091"
-voice = client.voices.get(id=voice_id)
 transcript = "Hello! Welcome to Cartesia"

 # You can check out our models at https://docs.cartesia.ai/getting-started/available-models
-model_id = "sonic-
+model_id = "sonic-2"

 p = pyaudio.PyAudio()
 rate = 22050

@@ -213,11 +244,11 @@ ws = client.tts.websocket()
 for output in ws.send(
     model_id=model_id,
     transcript=transcript,
-    voice={"
+    voice={"id": voice_id},
     stream=True,
     output_format={
         "container": "raw",
-        "encoding": "pcm_f32le",
+        "encoding": "pcm_f32le",
         "sample_rate": 22050
     },
 ):

@@ -236,6 +267,55 @@ p.terminate()
 ws.close() # Close the websocket connection
 ```

+## Requesting Timestamps
+
+```python
+import asyncio
+from cartesia import AsyncCartesia
+import os
+
+async def main():
+    client = AsyncCartesia(api_key=os.getenv("CARTESIA_API_KEY"))
+
+    # Connect to the websocket
+    ws = await client.tts.websocket()
+
+    # Generate speech with timestamps
+    output_generate = await ws.send(
+        model_id="sonic-2",
+        transcript="Hello! Welcome to Cartesia's text-to-speech.",
+        voice={"id": "f9836c6e-a0bd-460e-9d3c-f7299fa60f94"},
+        output_format={
+            "container": "raw",
+            "encoding": "pcm_f32le",
+            "sample_rate": 44100
+        },
+        add_timestamps=True,  # Enable word-level timestamps
+        stream=True
+    )
+
+    # Process the streaming response with timestamps
+    all_words = []
+    all_starts = []
+    all_ends = []
+    audio_chunks = []
+
+    async for out in output_generate:
+        # Collect audio data
+        if out.audio is not None:
+            audio_chunks.append(out.audio)
+
+        # Process timestamp data
+        if out.word_timestamps is not None:
+            all_words.extend(out.word_timestamps.words)  # List of words
+            all_starts.extend(out.word_timestamps.start)  # Start time for each word (seconds)
+            all_ends.extend(out.word_timestamps.end)  # End time for each word (seconds)
+
+    await ws.close()
+
+asyncio.run(main())
+```
+
 ## Advanced

 ### Retries
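
The timestamps example accumulates three parallel lists (`all_words`, `all_starts`, `all_ends`); zipping them recovers per-word timing:

```python
# Continuing the example above: pair each word with its start/end times (seconds).
for word, start, end in zip(all_words, all_starts, all_ends):
    print(f"{word}: {start:.2f}s -> {end:.2f}s")
```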

@@ -297,11 +377,33 @@ client = Cartesia(

 ## Contributing

-
-
-
-
-
+Note that most of this library is generated programmatically from
+<https://github.com/cartesia-ai/docs> — before making edits to a file, verify it's not autogenerated
+by checking for this comment at the top of the file:
+
+```
+# This file was auto-generated by Fern from our API Definition.
+```
+
+### Running tests
+
+```sh
+uv pip install -r requirements.txt
+uv run pytest -rP -vv tests/custom/test_client.py::test_get_voices
+```
+### Manually generating SDK code from docs
+
+Assuming all your repos are cloned into your home directory:
+
+```sh
+$ cd ~/docs
+$ fern generate --group python-sdk --log-level debug --api version-2024-11-13 --preview
+$ cd ~/cartesia-python
+$ git pull ~/docs/fern/apis/version-2024-11-13/.preview/fern-python-sdk
+$ git commit --amend -m "manually regenerate from docs" # optional
+```
+
+### Automatically generating new SDK releases

-
+From https://github.com/cartesia-ai/docs click `Actions` then `Release Python SDK`. (Requires permissions.)
