PyPI - cartesia - Versions diffs - 2.0.0a0__py3-none-any.whl → 2.0.0b1__py3-none-any.whl - Mend

cartesia 2.0.0a0__py3-none-any.whl → 2.0.0b1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (19) hide show

cartesia/__init__.py +4 -0
cartesia/core/client_wrapper.py +1 -1
cartesia/core/http_client.py +2 -2
cartesia/core/pydantic_utilities.py +2 -2
cartesia/datasets/client.py +4 -34
cartesia/infill/client.py +36 -12
cartesia/tts/client.py +4 -4
cartesia/voices/__init__.py +4 -0
cartesia/voices/client.py +169 -928
cartesia/voices/requests/create_voice_request.py +2 -0
cartesia/voices/requests/localize_dialect.py +3 -1
cartesia/voices/types/__init__.py +4 -0
cartesia/voices/types/create_voice_request.py +2 -0
cartesia/voices/types/localize_dialect.py +3 -1
cartesia/voices/types/localize_portuguese_dialect.py +5 -0
cartesia/voices/types/localize_spanish_dialect.py +5 -0
{cartesia-2.0.0a0.dist-info → cartesia-2.0.0b1.dist-info}/METADATA +2 -1
{cartesia-2.0.0a0.dist-info → cartesia-2.0.0b1.dist-info}/RECORD +19 -17
{cartesia-2.0.0a0.dist-info → cartesia-2.0.0b1.dist-info}/WHEEL +0 -0

cartesia/__init__.py CHANGED Viewed

@@ -126,6 +126,8 @@ from .voices import (
     LocalizeDialect,
     LocalizeDialectParams,
     LocalizeEnglishDialect,
+    LocalizePortugueseDialect,
+    LocalizeSpanishDialect,
     LocalizeTargetLanguage,
     LocalizeVoiceRequest,
     LocalizeVoiceRequestParams,
@@ -180,6 +182,8 @@ __all__ = [
     "LocalizeDialect",
     "LocalizeDialectParams",
     "LocalizeEnglishDialect",
+    "LocalizePortugueseDialect",
+    "LocalizeSpanishDialect",
     "LocalizeTargetLanguage",
     "LocalizeVoiceRequest",
     "LocalizeVoiceRequestParams",

cartesia/core/client_wrapper.py CHANGED Viewed

@@ -16,7 +16,7 @@ class BaseClientWrapper:
         headers: typing.Dict[str, str] = {
             "X-Fern-Language": "Python",
             "X-Fern-SDK-Name": "cartesia",
-            "X-Fern-SDK-Version": "2.0.0a0",
+            "X-Fern-SDK-Version": "2.0.0b1",
         }
         headers["X-API-Key"] = self.api_key
         headers["Cartesia-Version"] = "2024-06-10"

cartesia/core/http_client.py CHANGED Viewed

@@ -85,8 +85,8 @@ def _retry_timeout(response: httpx.Response, retries: int) -> float:
 def _should_retry(response: httpx.Response) -> bool:
-    retriable_400s = [429, 408, 409]
-    return response.status_code >= 500 or response.status_code in retriable_400s
+    retryable_400s = [429, 408, 409]
+    return response.status_code >= 500 or response.status_code in retryable_400s
 def remove_omit_from_dict(

cartesia/core/pydantic_utilities.py CHANGED Viewed

@@ -79,7 +79,7 @@ def to_jsonable_with_fallback(
 class UniversalBaseModel(pydantic.BaseModel):
     if IS_PYDANTIC_V2:
         model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(
-            # Allow fields begining with `model_` to be used in the model
+            # Allow fields beginning with `model_` to be used in the model
             protected_namespaces=(),
         )  # type: ignore # Pydantic v2
@@ -128,7 +128,7 @@ class UniversalBaseModel(pydantic.BaseModel):
         Override the default dict method to `exclude_unset` by default. This function patches
         `exclude_unset` to work include fields within non-None default values.
         """
-        # Note: the logic here is multi-plexed given the levers exposed in Pydantic V1 vs V2
+        # Note: the logic here is multiplexed given the levers exposed in Pydantic V1 vs V2
         # Pydantic V1's .dict can be extremely slow, so we do not want to call it twice.
         #
         # We'd ideally do the same for Pydantic V2, but it shells out to a library to serialize models

cartesia/datasets/client.py CHANGED Viewed

@@ -82,7 +82,7 @@ class DatasetsClient:
             api_key="YOUR_API_KEY",
         )
         client.datasets.create(
-            name="string",
+            name="name",
         )
         """
         _response = self._client_wrapper.httpx_client.request(
@@ -129,7 +129,7 @@ class DatasetsClient:
             api_key="YOUR_API_KEY",
         )
         client.datasets.list_files(
-            id="string",
+            id="id",
         )
         """
         _response = self._client_wrapper.httpx_client.request(
@@ -170,17 +170,6 @@ class DatasetsClient:
         Returns
         -------
         None
-        Examples
-        --------
-        from cartesia import Cartesia
-        client = Cartesia(
-            api_key="YOUR_API_KEY",
-        )
-        client.datasets.upload_file(
-            id="string",
-        )
         """
         _response = self._client_wrapper.httpx_client.request(
             f"datasets/{jsonable_encoder(id)}/files",
@@ -280,7 +269,7 @@ class AsyncDatasetsClient:
         async def main() -> None:
             await client.datasets.create(
-                name="string",
+                name="name",
             )
@@ -337,7 +326,7 @@ class AsyncDatasetsClient:
         async def main() -> None:
             await client.datasets.list_files(
-                id="string",
+                id="id",
             )
@@ -381,25 +370,6 @@ class AsyncDatasetsClient:
         Returns
         -------
         None
-        Examples
-        --------
-        import asyncio
-        from cartesia import AsyncCartesia
-        client = AsyncCartesia(
-            api_key="YOUR_API_KEY",
-        )
-        async def main() -> None:
-            await client.datasets.upload_file(
-                id="string",
-            )
-        asyncio.run(main())
         """
         _response = await self._client_wrapper.httpx_client.request(
             f"datasets/{jsonable_encoder(id)}/files",

cartesia/infill/client.py CHANGED Viewed

@@ -34,16 +34,26 @@ class InfillClient:
         output_format_encoding: typing.Optional[RawEncoding] = OMIT,
         output_format_bit_rate: typing.Optional[int] = OMIT,
         voice_experimental_controls_speed: typing.Optional[Speed] = OMIT,
-        voice_experimental_controls_emotion: typing.Optional[Emotion] = OMIT,
+        voice_experimental_controls_emotion: typing.Optional[typing.List[Emotion]] = OMIT,
         request_options: typing.Optional[RequestOptions] = None,
     ) -> typing.Iterator[bytes]:
         """
         Generate audio that smoothly connects two existing audio segments. This is useful for inserting new speech between existing speech segments while maintaining natural transitions.
+        **The cost is 1 credit per character of the infill text plus a fixed cost of 300 credits.**
         Only the `sonic-preview` model is supported for infill at this time.
         At least one of `left_audio` or `right_audio` must be provided.
+        As with all generative models, there's some inherent variability, but here's some tips we recommend to get the best results from infill:
+        - Use longer infill transcripts
+          - This gives the model more flexibility to adapt to the rest of the audio
+        - Target natural pauses in the audio when deciding where to clip
+          - This means you don't need word-level timestamps to be as precise
+        - Clip right up to the start and end of the audio segment you want infilled, keeping as much silence in the left/right audio segments as possible
+          - This helps the model generate more natural transitions
         Parameters
         ----------
         left_audio : core.File
@@ -84,7 +94,7 @@ class InfillClient:
             If you specify a number, 0.0 is the default speed, -1.0 is the slowest speed, and 1.0 is the fastest speed.
-        voice_experimental_controls_emotion : typing.Optional[Emotion]
+        voice_experimental_controls_emotion : typing.Optional[typing.List[Emotion]]
             An array of emotion:level tags.
             Supported emotions are: anger, positivity, surprise, sadness, and curiosity.
@@ -114,16 +124,18 @@ class InfillClient:
             output_format_container="mp3",
             output_format_sample_rate=44100,
             output_format_bit_rate=128000,
+            voice_experimental_controls_speed="slowest",
+            voice_experimental_controls_emotion=["surprise:high", "curiosity:high"],
         )
         """
         with self._client_wrapper.httpx_client.stream(
             "infill/bytes",
             method="POST",
             data={
-                "model_id[]": model_id,
-                "language[]": language,
-                "transcript[]": transcript,
-                "voice[id]": voice_id,
+                "model_id": model_id,
+                "language": language,
+                "transcript": transcript,
+                "voice_id": voice_id,
                 "output_format[container]": output_format_container,
                 "output_format[sample_rate]": output_format_sample_rate,
                 "output_format[encoding]": output_format_encoding,
@@ -169,16 +181,26 @@ class AsyncInfillClient:
         output_format_encoding: typing.Optional[RawEncoding] = OMIT,
         output_format_bit_rate: typing.Optional[int] = OMIT,
         voice_experimental_controls_speed: typing.Optional[Speed] = OMIT,
-        voice_experimental_controls_emotion: typing.Optional[Emotion] = OMIT,
+        voice_experimental_controls_emotion: typing.Optional[typing.List[Emotion]] = OMIT,
         request_options: typing.Optional[RequestOptions] = None,
     ) -> typing.AsyncIterator[bytes]:
         """
         Generate audio that smoothly connects two existing audio segments. This is useful for inserting new speech between existing speech segments while maintaining natural transitions.
+        **The cost is 1 credit per character of the infill text plus a fixed cost of 300 credits.**
         Only the `sonic-preview` model is supported for infill at this time.
         At least one of `left_audio` or `right_audio` must be provided.
+        As with all generative models, there's some inherent variability, but here's some tips we recommend to get the best results from infill:
+        - Use longer infill transcripts
+          - This gives the model more flexibility to adapt to the rest of the audio
+        - Target natural pauses in the audio when deciding where to clip
+          - This means you don't need word-level timestamps to be as precise
+        - Clip right up to the start and end of the audio segment you want infilled, keeping as much silence in the left/right audio segments as possible
+          - This helps the model generate more natural transitions
         Parameters
         ----------
         left_audio : core.File
@@ -219,7 +241,7 @@ class AsyncInfillClient:
             If you specify a number, 0.0 is the default speed, -1.0 is the slowest speed, and 1.0 is the fastest speed.
-        voice_experimental_controls_emotion : typing.Optional[Emotion]
+        voice_experimental_controls_emotion : typing.Optional[typing.List[Emotion]]
             An array of emotion:level tags.
             Supported emotions are: anger, positivity, surprise, sadness, and curiosity.
@@ -254,6 +276,8 @@ class AsyncInfillClient:
                 output_format_container="mp3",
                 output_format_sample_rate=44100,
                 output_format_bit_rate=128000,
+                voice_experimental_controls_speed="slowest",
+                voice_experimental_controls_emotion=["surprise:high", "curiosity:high"],
             )
@@ -263,10 +287,10 @@ class AsyncInfillClient:
             "infill/bytes",
             method="POST",
             data={
-                "model_id[]": model_id,
-                "language[]": language,
-                "transcript[]": transcript,
-                "voice[id]": voice_id,
+                "model_id": model_id,
+                "language": language,
+                "transcript": transcript,
+                "voice_id": voice_id,
                 "output_format[container]": output_format_container,
                 "output_format[sample_rate]": output_format_sample_rate,
                 "output_format[encoding]": output_format_encoding,

cartesia/tts/client.py CHANGED Viewed

@@ -67,7 +67,7 @@ class TtsClient:
             api_key="YOUR_API_KEY",
         )
         client.tts.bytes(
-            model_id="sonic-english",
+            model_id="sonic",
             transcript="Hello, world!",
             voice={"mode": "id", "id": "694f9389-aac1-45b6-b726-9d9369183238"},
             language="en",
@@ -152,7 +152,7 @@ class TtsClient:
             api_key="YOUR_API_KEY",
         )
         response = client.tts.sse(
-            model_id="sonic-english",
+            model_id="sonic",
             transcript="Hello, world!",
             voice={"mode": "id", "id": "694f9389-aac1-45b6-b726-9d9369183238"},
             language="en",
@@ -258,7 +258,7 @@ class AsyncTtsClient:
         async def main() -> None:
             await client.tts.bytes(
-                model_id="sonic-english",
+                model_id="sonic",
                 transcript="Hello, world!",
                 voice={"mode": "id", "id": "694f9389-aac1-45b6-b726-9d9369183238"},
                 language="en",
@@ -351,7 +351,7 @@ class AsyncTtsClient:
         async def main() -> None:
             response = await client.tts.sse(
-                model_id="sonic-english",
+                model_id="sonic",
                 transcript="Hello, world!",
                 voice={"mode": "id", "id": "694f9389-aac1-45b6-b726-9d9369183238"},
                 language="en",

cartesia/voices/__init__.py CHANGED Viewed

@@ -10,6 +10,8 @@ from .types import (
     IdSpecifier,
     LocalizeDialect,
     LocalizeEnglishDialect,
+    LocalizePortugueseDialect,
+    LocalizeSpanishDialect,
     LocalizeTargetLanguage,
     LocalizeVoiceRequest,
     MixVoiceSpecifier,
@@ -49,6 +51,8 @@ __all__ = [
     "LocalizeDialect",
     "LocalizeDialectParams",
     "LocalizeEnglishDialect",
+    "LocalizePortugueseDialect",
+    "LocalizeSpanishDialect",
     "LocalizeTargetLanguage",
     "LocalizeVoiceRequest",
     "LocalizeVoiceRequestParams",

cartesia 2.0.0a0__py3-none-any.whl → 2.0.0b1__py3-none-any.whl

cartesia 2.0.0a0__py3-none-any.whl → 2.0.0b1__py3-none-any.whl