PyPI - cartesia - Versions diffs - 1.3.0__py3-none-any.whl → 1.4.0__py3-none-any.whl - Mend

cartesia 1.3.0__py3-none-any.whl → 1.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (10)

cartesia/_types.py +1 -1
cartesia/_websocket.py +2 -2
cartesia/async_tts.py +114 -1
cartesia/tts.py +156 -1
cartesia/version.py +1 -1
cartesia/voices.py +1 -5
{cartesia-1.3.0.dist-info → cartesia-1.4.0.dist-info}/METADATA +6 -4
{cartesia-1.3.0.dist-info → cartesia-1.4.0.dist-info}/RECORD +10 -10
{cartesia-1.3.0.dist-info → cartesia-1.4.0.dist-info}/WHEEL +1 -1
{cartesia-1.3.0.dist-info → cartesia-1.4.0.dist-info}/licenses/LICENSE.md +0 -0

cartesia/_types.py CHANGED Viewed

@@ -36,7 +36,6 @@ class VoiceMetadata(TypedDict):
     user_id: str
     created_at: str
     language: str
-    base_voice_id: Optional[str] = None
 class VoiceControls(TypedDict):
@@ -62,6 +61,7 @@ class OutputFormat(TypedDict):
     container: str
     encoding: str
     sample_rate: int
+    bit_rate: Optional[int] = None
 class EventType:

cartesia/_websocket.py CHANGED Viewed

@@ -121,7 +121,7 @@ class _TTSContext:
                         raise RuntimeError(f"Error generating audio:\n{response['error']}")
                     if response["done"]:
                         break
-                    if response["data"]:
+                    if "data" in response and response["data"]:
                         yield self._websocket._convert_response(
                             response=response, include_context_id=True
                         )
@@ -138,7 +138,7 @@ class _TTSContext:
                             raise RuntimeError(f"Error generating audio:\n{response['error']}")
                         if response["done"]:
                             break
-                        if response["data"]:
+                        if "data" in response and response["data"]:
                             yield self._websocket._convert_response(
                                 response=response, include_context_id=True
                             )

cartesia/async_tts.py CHANGED Viewed

@@ -1,4 +1,4 @@
-from typing import Iterator, List, Optional
+from typing import Iterator, List, Optional, Tuple
 import httpx
 from cartesia._async_sse import _AsyncSSE
@@ -61,3 +61,116 @@ class AsyncTTS(TTS):
             raise ValueError(f"Failed to generate audio. Error: {response.text}")
         return response.content
+    async def infill(
+        self,
+        *,
+        model_id: str,
+        language: str,
+        transcript: str,
+        voice_id: str,
+        output_format: OutputFormat,
+        left_audio_path: Optional[str] = None,
+        right_audio_path: Optional[str] = None,
+        experimental_voice_controls: Optional[VoiceControls] = None,
+    ) -> Tuple[bytes, bytes]:
+        """Generate infill audio between two existing audio segments.
+        Args:
+            model_id: The ID of the model to use for generating audio
+            language: The language of the transcript
+            transcript: The text to synthesize
+            voice_id: The ID of the voice to use for generating audio
+            output_format: The desired audio output format
+            left_audio_path: Path to the audio file that comes before the infill
+            right_audio_path: Path to the audio file that comes after the infill
+            experimental_voice_controls: Optional voice control parameters
+        Returns:
+            A tuple containing:
+            - The generated infill audio (bytes)
+            - The complete concatenated audio (bytes)
+        """
+        if not left_audio_path and not right_audio_path:
+            raise ValueError("Must specify at least one of left_audio_path or right_audio_path")
+        headers = self.headers.copy()
+        headers.pop("Content-Type", None)
+        left_audio_file = None
+        right_audio_file = None
+        try:
+            files = {}
+            if left_audio_path:
+                left_audio_file = open(left_audio_path, "rb")
+                files["left_audio"] = left_audio_file
+            if right_audio_path:
+                right_audio_file = open(right_audio_path, "rb")
+                files["right_audio"] = right_audio_file
+            # Construct form data with output_format fields directly
+            data = {
+                "model_id": model_id,
+                "language": language,
+                "transcript": transcript,
+                "voice_id": voice_id,
+                "output_format[container]": output_format["container"],
+                "output_format[encoding]": output_format["encoding"],
+                "output_format[sample_rate]": output_format["sample_rate"],
+            }
+            # Add bit_rate for mp3 container
+            if "bit_rate" in output_format:
+                data["output_format[bit_rate]"] = output_format["bit_rate"]
+            # Add voice controls if specified
+            if experimental_voice_controls:
+                if "speed" in experimental_voice_controls:
+                    data["voice[__experimental_controls][speed]"] = experimental_voice_controls[
+                        "speed"
+                    ]
+                if "emotion" in experimental_voice_controls:
+                    # Pass emotions as a list instead of individual values
+                    data["voice[__experimental_controls][emotion][]"] = experimental_voice_controls[
+                        "emotion"
+                    ]
+            async with httpx.AsyncClient() as client:
+                response = await client.post(
+                    f"{self._http_url()}/infill/bytes",
+                    headers=headers,
+                    timeout=self.timeout,
+                    files=files,
+                    data=data,
+                )
+            if not response.is_success:
+                raise ValueError(
+                    f"Failed to infill audio. Status Code: {response.status_code}\n"
+                    f"Error: {response.text}"
+                )
+            if left_audio_file:
+                left_audio_file.seek(0)
+                left_audio = left_audio_file.read()
+            else:
+                left_audio = None
+            if right_audio_file:
+                right_audio_file.seek(0)
+                right_audio = right_audio_file.read()
+            else:
+                right_audio = None
+            infill_audio = response.content
+            format = output_format["container"].lower()
+            total_audio = self._concat_audio_segments(
+                left_audio, infill_audio, right_audio, format=format
+            )
+            return infill_audio, total_audio
+        finally:
+            if left_audio_file:
+                left_audio_file.close()
+            if right_audio_file:
+                right_audio_file.close()

cartesia/tts.py CHANGED Viewed

@@ -1,6 +1,9 @@
-from typing import Iterator, List, Optional
+import json
+from typing import Iterator, List, Optional, Tuple
 import httpx
+import io
+from pydub import AudioSegment
 from cartesia._sse import _SSE
 from cartesia._types import (
@@ -135,3 +138,155 @@ class TTS(Resource):
             ValueError: If neither or both voice_id and voice_embedding are specified.
         """
         return _validate_and_construct_voice(voice_id, voice_embedding, experimental_voice_controls)
+    def infill(
+        self,
+        *,
+        model_id: str,
+        language: str,
+        transcript: str,
+        voice_id: str,
+        output_format: OutputFormat,
+        left_audio_path: Optional[str] = None,
+        right_audio_path: Optional[str] = None,
+        experimental_voice_controls: Optional[VoiceControls] = None,
+    ) -> Tuple[bytes, bytes]:
+        """Generate infill audio between two existing audio segments.
+        Args:
+            model_id: The ID of the model to use for generating audio
+            language: The language of the transcript
+            transcript: The text to synthesize
+            voice_id: The ID of the voice to use for generating audio
+            output_format: The desired audio output format
+            left_audio_path: Path to the audio file that comes before the infill
+            right_audio_path: Path to the audio file that comes after the infill
+            experimental_voice_controls: Optional voice control parameters
+        Returns:
+            A tuple containing:
+            - The generated infill audio (bytes)
+            - The complete concatenated audio (bytes)
+        """
+        if not left_audio_path and not right_audio_path:
+            raise ValueError("Must specify at least one of left_audio_path or right_audio_path")
+        headers = self.headers.copy()
+        headers.pop("Content-Type", None)
+        left_audio_file = None
+        right_audio_file = None
+        try:
+            files = {}
+            if left_audio_path:
+                left_audio_file = open(left_audio_path, "rb")
+                files["left_audio"] = left_audio_file
+            if right_audio_path:
+                right_audio_file = open(right_audio_path, "rb")
+                files["right_audio"] = right_audio_file
+            # Construct form data with output_format fields directly
+            data = {
+                "model_id": model_id,
+                "language": language,
+                "transcript": transcript,
+                "voice_id": voice_id,
+                "output_format[container]": output_format["container"],
+                "output_format[encoding]": output_format["encoding"],
+                "output_format[sample_rate]": output_format["sample_rate"],
+            }
+            # Add bit_rate for mp3 container
+            if "bit_rate" in output_format:
+                data["output_format[bit_rate]"] = output_format["bit_rate"]
+            # Add voice controls if specified
+            if experimental_voice_controls:
+                if "speed" in experimental_voice_controls:
+                    data["voice[__experimental_controls][speed]"] = experimental_voice_controls[
+                        "speed"
+                    ]
+                if "emotion" in experimental_voice_controls:
+                    # Pass emotions as a list instead of individual values
+                    data["voice[__experimental_controls][emotion][]"] = experimental_voice_controls[
+                        "emotion"
+                    ]
+            response = httpx.post(
+                f"{self._http_url()}/infill/bytes",
+                headers=headers,
+                timeout=self.timeout,
+                files=files,
+                data=data,
+            )
+            if not response.is_success:
+                raise ValueError(
+                    f"Failed to infill audio. Status Code: {response.status_code}\n"
+                    f"Error: {response.text}"
+                )
+            if left_audio_file:
+                left_audio_file.seek(0)
+                left_audio = left_audio_file.read()
+            else:
+                left_audio = None
+            if right_audio_file:
+                right_audio_file.seek(0)
+                right_audio = right_audio_file.read()
+            else:
+                right_audio = None
+            infill_audio = response.content
+            format = output_format["container"].lower()
+            total_audio = self._concat_audio_segments(
+                left_audio, infill_audio, right_audio, format=format
+            )
+            return infill_audio, total_audio
+        finally:
+            if left_audio_file:
+                left_audio_file.close()
+            if right_audio_file:
+                right_audio_file.close()
+    @staticmethod
+    def _concat_audio_segments(
+        left_audio: Optional[bytes],
+        infill_audio: bytes,
+        right_audio: Optional[bytes],
+        format: str = "wav",
+    ) -> bytes:
+        """Helper method to concatenate three audio segments while preserving audio format and headers.
+        Args:
+            left_audio: The audio segment that comes before the infill
+            infill_audio: The generated infill audio segment
+            right_audio: The audio segment that comes after the infill
+            format: The audio format (e.g., 'wav', 'mp3'). Defaults to 'wav'
+        Returns:
+            bytes: The concatenated audio as bytes
+        Raises:
+            ValueError: If the audio segments cannot be loaded or concatenated
+        """
+        try:
+            # Convert bytes to AudioSegment objects
+            combined = AudioSegment.empty()
+            if left_audio:
+                combined += AudioSegment.from_file(io.BytesIO(left_audio), format=format)
+            combined += AudioSegment.from_file(io.BytesIO(infill_audio), format=format)
+            if right_audio:
+                combined += AudioSegment.from_file(io.BytesIO(right_audio), format=format)
+            # Export to bytes
+            output = io.BytesIO()
+            combined.export(output, format=format)
+            return output.getvalue()
+        except Exception as e:
+            raise ValueError(f"Failed to concatenate audio segments: {str(e)}")

cartesia/version.py CHANGED Viewed

@@ -1 +1 @@
-__version__ = "1.3.0"
+__version__ = "1.4.0"

cartesia/voices.py CHANGED Viewed

@@ -52,8 +52,7 @@ class Voices(Resource):
         if not response.is_success:
             raise ValueError(
-                f"Failed to get voice. Status Code: {response.status_code}\n"
-                f"Error: {response.text}"
+                f"Failed to get voice. Status Code: {response.status_code}\nError: {response.text}"
             )
         return response.json()
@@ -123,7 +122,6 @@ class Voices(Resource):
         name: str,
         description: str,
         embedding: List[float],
-        base_voice_id: Optional[str] = None,
         language: str = "en",
     ) -> VoiceMetadata:
         """Create a new voice.
@@ -132,7 +130,6 @@ class Voices(Resource):
             name: The name of the voice.
             description: The description of the voice.
             embedding: The embedding of the voice. This should be generated with :meth:`clone`.
-            base_voice_id: The ID of the base voice. This should be a valid voice ID if specified.
         Returns:
             A dictionary containing the voice metadata.
@@ -144,7 +141,6 @@ class Voices(Resource):
                 "name": name,
                 "description": description,
                 "embedding": embedding,
-                "base_voice_id": base_voice_id,
                 "language": language,
             },
             timeout=self.timeout,

{cartesia-1.3.0.dist-info → cartesia-1.4.0.dist-info}/METADATA RENAMED Viewed

@@ -1,12 +1,14 @@
-Metadata-Version: 2.3
+Metadata-Version: 2.4
 Name: cartesia
-Version: 1.3.0
+Version: 1.4.0
 Summary: The official Python library for the Cartesia API.
+License-File: LICENSE.md
 Requires-Python: >=3.9
 Requires-Dist: aiohttp>=3.10.10
-Requires-Dist: httpx>=0.27.2
+Requires-Dist: httpx>=0.27.0
 Requires-Dist: iterators>=0.2.0
-Requires-Dist: requests>=2.32.3
+Requires-Dist: pydub>=0.25.1
+Requires-Dist: requests>=2.31.0
 Requires-Dist: websockets>=10.4
 Description-Content-Type: text/markdown

{cartesia-1.3.0.dist-info → cartesia-1.4.0.dist-info}/RECORD RENAMED Viewed

@@ -4,20 +4,20 @@ cartesia/_async_websocket.py,sha256=y9YL9fU8eLENZZECJUwRBVTfEx4ZMl96Y5zHaRY2BiI,
 cartesia/_constants.py,sha256=khGNVpiQVDmv1oZU7pKTd9C1AHjiaM8zQ2He9d5zI_c,435
 cartesia/_logger.py,sha256=vU7QiGSy_AJuJFmClUocqIJ-Ltku_8C24ZU8L6fLJR0,53
 cartesia/_sse.py,sha256=CugabGUAUM-N2BruxNFxDB20HyxDlRdbN-J_yAzvBMY,5667
-cartesia/_types.py,sha256=gixQbKbX-H8xbD7jxHmc02KXLyjEaup19lh_57_YBl8,2570
-cartesia/_websocket.py,sha256=nRCq9xB0T9yYHoLqtn0GsJmcap-OAlJdSIrzTl40qMI,14875
+cartesia/_types.py,sha256=p0OzSzH174WrG8LRyu_MvNXZPhhTfLArSSDpcYY4xa0,2565
+cartesia/_websocket.py,sha256=7gDLcfMoIwmKj07iLk5UZ4ypxlv-3UmMd3VFjVn1QaE,14921
 cartesia/async_client.py,sha256=y_K_Yuv0weA4k9ZYD0M9bNM3x3frsq07tqkg7R9h0-o,2714
-cartesia/async_tts.py,sha256=IbWVRKklNClXASR6ylHaukcMRR304LUguqc4yMopbDU,2076
+cartesia/async_tts.py,sha256=CgbrLk7tc0NKSBC8zZH5I4CpWHpOgkypo0D2hyg5LLE,6466
 cartesia/client.py,sha256=OS1ORUSlR8Jg-em1imeTAFfwkC85AQFnw8PYtTdUuC8,2364
 cartesia/resource.py,sha256=wpnB3IPcTdxYSp0vxSkpntp4NSvqvnwUWF-0ZpgWV9o,1585
-cartesia/tts.py,sha256=kWvqce9K3gZ4QrWD-ciYdK29n49SNkxhd2A7ueTOwMY,4878
-cartesia/version.py,sha256=F5mW07pSyGrqDNY2Ehr-UpDzpBtN-FsYU0QGZWf6PJE,22
-cartesia/voices.py,sha256=bDYbs0KoikAROJlmbnLdo4TrW0YwzjMvp70uKG6Alp0,7180
+cartesia/tts.py,sha256=WV8OduM87ciM1ht60Fi9Fh4gunX2Xew3K96ELCzpP-8,10702
+cartesia/version.py,sha256=8UhoYEXHs1Oai7BW_ExBmuwWnRI-yMG_u1fQAXMizHQ,22
+cartesia/voices.py,sha256=DLO_GJYDRhzFbqVIqzGOP1m1Ylzq7tVm6VHrknekFCk,6968
 cartesia/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 cartesia/utils/deprecated.py,sha256=2cXvGtrxhPeUZA5LWy2n_U5OFLDv7SHeFtzqhjSJGyk,1674
 cartesia/utils/retry.py,sha256=O6fyVWpH9Su8c0Fwupl57xMt6JrwJ52txBwP3faUL7k,3339
 cartesia/utils/tts.py,sha256=TbvBZqHR6LxPim6s5RyGiURi4hIfqWt3KUk5QYOOhfc,2177
-cartesia-1.3.0.dist-info/METADATA,sha256=eedG5B4V6MxvDuPUMdYwp6UHrX6yQ6dJTMRRZxq1-UA,20976
-cartesia-1.3.0.dist-info/WHEEL,sha256=C2FUgwZgiLbznR-k0b_5k3Ai_1aASOXDss3lzCUsUug,87
-cartesia-1.3.0.dist-info/licenses/LICENSE.md,sha256=PT2YG5wEtEX1TNDn5sXkUXqbn-neyr7cZenTxd40ql4,1074
-cartesia-1.3.0.dist-info/RECORD,,
+cartesia-1.4.0.dist-info/METADATA,sha256=LLv3iE6dKcAeZuSb6bDAc0GSVICUOS8szaM4n1F-Mww,21030
+cartesia-1.4.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+cartesia-1.4.0.dist-info/licenses/LICENSE.md,sha256=PT2YG5wEtEX1TNDn5sXkUXqbn-neyr7cZenTxd40ql4,1074
+cartesia-1.4.0.dist-info/RECORD,,

{cartesia-1.3.0.dist-info → cartesia-1.4.0.dist-info}/WHEEL RENAMED Viewed

@@ -1,4 +1,4 @@
 Wheel-Version: 1.0
-Generator: hatchling 1.26.3
+Generator: hatchling 1.27.0
 Root-Is-Purelib: true
 Tag: py3-none-any

{cartesia-1.3.0.dist-info → cartesia-1.4.0.dist-info}/licenses/LICENSE.md RENAMED Viewed

File without changes

cartesia 1.3.0__py3-none-any.whl → 1.4.0__py3-none-any.whl

cartesia 1.3.0__py3-none-any.whl → 1.4.0__py3-none-any.whl