cartesia 1.4.0__py3-none-any.whl → 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (181)
  1. cartesia/__init__.py +302 -3
  2. cartesia/api_status/__init__.py +6 -0
  3. cartesia/api_status/client.py +104 -0
  4. cartesia/api_status/requests/__init__.py +5 -0
  5. cartesia/api_status/requests/api_info.py +8 -0
  6. cartesia/api_status/types/__init__.py +5 -0
  7. cartesia/api_status/types/api_info.py +20 -0
  8. cartesia/base_client.py +156 -0
  9. cartesia/client.py +163 -40
  10. cartesia/core/__init__.py +50 -0
  11. cartesia/core/api_error.py +15 -0
  12. cartesia/core/client_wrapper.py +55 -0
  13. cartesia/core/datetime_utils.py +28 -0
  14. cartesia/core/file.py +67 -0
  15. cartesia/core/http_client.py +499 -0
  16. cartesia/core/jsonable_encoder.py +101 -0
  17. cartesia/core/pagination.py +88 -0
  18. cartesia/core/pydantic_utilities.py +296 -0
  19. cartesia/core/query_encoder.py +58 -0
  20. cartesia/core/remove_none_from_dict.py +11 -0
  21. cartesia/core/request_options.py +35 -0
  22. cartesia/core/serialization.py +272 -0
  23. cartesia/datasets/__init__.py +24 -0
  24. cartesia/datasets/requests/__init__.py +15 -0
  25. cartesia/datasets/requests/create_dataset_request.py +7 -0
  26. cartesia/datasets/requests/dataset.py +9 -0
  27. cartesia/datasets/requests/dataset_file.py +9 -0
  28. cartesia/datasets/requests/paginated_dataset_files.py +10 -0
  29. cartesia/datasets/requests/paginated_datasets.py +10 -0
  30. cartesia/datasets/types/__init__.py +17 -0
  31. cartesia/datasets/types/create_dataset_request.py +19 -0
  32. cartesia/datasets/types/dataset.py +21 -0
  33. cartesia/datasets/types/dataset_file.py +21 -0
  34. cartesia/datasets/types/file_purpose.py +5 -0
  35. cartesia/datasets/types/paginated_dataset_files.py +21 -0
  36. cartesia/datasets/types/paginated_datasets.py +21 -0
  37. cartesia/embedding/__init__.py +5 -0
  38. cartesia/embedding/types/__init__.py +5 -0
  39. cartesia/embedding/types/embedding.py +201 -0
  40. cartesia/environment.py +7 -0
  41. cartesia/infill/__init__.py +2 -0
  42. cartesia/infill/client.py +318 -0
  43. cartesia/tts/__init__.py +167 -0
  44. cartesia/{_async_websocket.py → tts/_async_websocket.py} +212 -85
  45. cartesia/tts/_websocket.py +479 -0
  46. cartesia/tts/client.py +407 -0
  47. cartesia/tts/requests/__init__.py +76 -0
  48. cartesia/tts/requests/cancel_context_request.py +17 -0
  49. cartesia/tts/requests/controls.py +11 -0
  50. cartesia/tts/requests/generation_request.py +58 -0
  51. cartesia/tts/requests/mp_3_output_format.py +11 -0
  52. cartesia/tts/requests/output_format.py +30 -0
  53. cartesia/tts/requests/phoneme_timestamps.py +10 -0
  54. cartesia/tts/requests/raw_output_format.py +11 -0
  55. cartesia/tts/requests/speed.py +7 -0
  56. cartesia/tts/requests/tts_request.py +24 -0
  57. cartesia/tts/requests/tts_request_embedding_specifier.py +16 -0
  58. cartesia/tts/requests/tts_request_id_specifier.py +16 -0
  59. cartesia/tts/requests/tts_request_voice_specifier.py +7 -0
  60. cartesia/tts/requests/wav_output_format.py +7 -0
  61. cartesia/tts/requests/web_socket_base_response.py +11 -0
  62. cartesia/tts/requests/web_socket_chunk_response.py +11 -0
  63. cartesia/tts/requests/web_socket_done_response.py +7 -0
  64. cartesia/tts/requests/web_socket_error_response.py +7 -0
  65. cartesia/tts/requests/web_socket_flush_done_response.py +9 -0
  66. cartesia/tts/requests/web_socket_phoneme_timestamps_response.py +9 -0
  67. cartesia/tts/requests/web_socket_raw_output_format.py +11 -0
  68. cartesia/tts/requests/web_socket_request.py +7 -0
  69. cartesia/tts/requests/web_socket_response.py +70 -0
  70. cartesia/tts/requests/web_socket_stream_options.py +8 -0
  71. cartesia/tts/requests/web_socket_timestamps_response.py +9 -0
  72. cartesia/tts/requests/web_socket_tts_output.py +18 -0
  73. cartesia/tts/requests/web_socket_tts_request.py +25 -0
  74. cartesia/tts/requests/word_timestamps.py +10 -0
  75. cartesia/tts/socket_client.py +302 -0
  76. cartesia/tts/types/__init__.py +90 -0
  77. cartesia/tts/types/cancel_context_request.py +28 -0
  78. cartesia/tts/types/context_id.py +3 -0
  79. cartesia/tts/types/controls.py +22 -0
  80. cartesia/tts/types/emotion.py +34 -0
  81. cartesia/tts/types/flush_id.py +3 -0
  82. cartesia/tts/types/generation_request.py +71 -0
  83. cartesia/tts/types/mp_3_output_format.py +23 -0
  84. cartesia/tts/types/natural_specifier.py +5 -0
  85. cartesia/tts/types/numerical_specifier.py +3 -0
  86. cartesia/tts/types/output_format.py +58 -0
  87. cartesia/tts/types/phoneme_timestamps.py +21 -0
  88. cartesia/tts/types/raw_encoding.py +5 -0
  89. cartesia/tts/types/raw_output_format.py +22 -0
  90. cartesia/tts/types/speed.py +7 -0
  91. cartesia/tts/types/supported_language.py +7 -0
  92. cartesia/tts/types/tts_request.py +35 -0
  93. cartesia/tts/types/tts_request_embedding_specifier.py +27 -0
  94. cartesia/tts/types/tts_request_id_specifier.py +27 -0
  95. cartesia/tts/types/tts_request_voice_specifier.py +7 -0
  96. cartesia/tts/types/wav_output_format.py +17 -0
  97. cartesia/tts/types/web_socket_base_response.py +22 -0
  98. cartesia/tts/types/web_socket_chunk_response.py +22 -0
  99. cartesia/tts/types/web_socket_done_response.py +17 -0
  100. cartesia/tts/types/web_socket_error_response.py +19 -0
  101. cartesia/tts/types/web_socket_flush_done_response.py +21 -0
  102. cartesia/tts/types/web_socket_phoneme_timestamps_response.py +20 -0
  103. cartesia/tts/types/web_socket_raw_output_format.py +22 -0
  104. cartesia/tts/types/web_socket_request.py +7 -0
  105. cartesia/tts/types/web_socket_response.py +125 -0
  106. cartesia/tts/types/web_socket_stream_options.py +19 -0
  107. cartesia/tts/types/web_socket_timestamps_response.py +20 -0
  108. cartesia/tts/types/web_socket_tts_output.py +29 -0
  109. cartesia/tts/types/web_socket_tts_request.py +37 -0
  110. cartesia/tts/types/word_timestamps.py +21 -0
  111. cartesia/{_constants.py → tts/utils/constants.py} +2 -2
  112. cartesia/tts/utils/tts.py +64 -0
  113. cartesia/tts/utils/types.py +70 -0
  114. cartesia/version.py +3 -1
  115. cartesia/voice_changer/__init__.py +27 -0
  116. cartesia/voice_changer/client.py +395 -0
  117. cartesia/voice_changer/requests/__init__.py +15 -0
  118. cartesia/voice_changer/requests/streaming_response.py +38 -0
  119. cartesia/voice_changer/types/__init__.py +17 -0
  120. cartesia/voice_changer/types/output_format_container.py +5 -0
  121. cartesia/voice_changer/types/streaming_response.py +64 -0
  122. cartesia/voices/__init__.py +81 -0
  123. cartesia/voices/client.py +1218 -0
  124. cartesia/voices/requests/__init__.py +29 -0
  125. cartesia/voices/requests/create_voice_request.py +23 -0
  126. cartesia/voices/requests/embedding_response.py +8 -0
  127. cartesia/voices/requests/embedding_specifier.py +10 -0
  128. cartesia/voices/requests/get_voices_response.py +24 -0
  129. cartesia/voices/requests/id_specifier.py +10 -0
  130. cartesia/voices/requests/localize_dialect.py +11 -0
  131. cartesia/voices/requests/localize_voice_request.py +28 -0
  132. cartesia/voices/requests/mix_voice_specifier.py +7 -0
  133. cartesia/voices/requests/mix_voices_request.py +9 -0
  134. cartesia/voices/requests/update_voice_request.py +15 -0
  135. cartesia/voices/requests/voice.py +43 -0
  136. cartesia/voices/requests/voice_metadata.py +36 -0
  137. cartesia/voices/types/__init__.py +53 -0
  138. cartesia/voices/types/base_voice_id.py +5 -0
  139. cartesia/voices/types/clone_mode.py +5 -0
  140. cartesia/voices/types/create_voice_request.py +34 -0
  141. cartesia/voices/types/embedding_response.py +20 -0
  142. cartesia/voices/types/embedding_specifier.py +22 -0
  143. cartesia/voices/types/gender.py +5 -0
  144. cartesia/voices/types/gender_presentation.py +5 -0
  145. cartesia/voices/types/get_voices_response.py +34 -0
  146. cartesia/voices/types/id_specifier.py +22 -0
  147. cartesia/voices/types/localize_dialect.py +11 -0
  148. cartesia/voices/types/localize_english_dialect.py +5 -0
  149. cartesia/voices/types/localize_french_dialect.py +5 -0
  150. cartesia/voices/types/localize_portuguese_dialect.py +5 -0
  151. cartesia/voices/types/localize_spanish_dialect.py +5 -0
  152. cartesia/voices/types/localize_target_language.py +7 -0
  153. cartesia/voices/types/localize_voice_request.py +39 -0
  154. cartesia/voices/types/mix_voice_specifier.py +7 -0
  155. cartesia/voices/types/mix_voices_request.py +20 -0
  156. cartesia/voices/types/update_voice_request.py +27 -0
  157. cartesia/voices/types/voice.py +54 -0
  158. cartesia/voices/types/voice_expand_options.py +5 -0
  159. cartesia/voices/types/voice_id.py +3 -0
  160. cartesia/voices/types/voice_metadata.py +48 -0
  161. cartesia/voices/types/weight.py +3 -0
  162. cartesia-2.0.0.dist-info/METADATA +414 -0
  163. cartesia-2.0.0.dist-info/RECORD +165 -0
  164. {cartesia-1.4.0.dist-info → cartesia-2.0.0.dist-info}/WHEEL +1 -1
  165. cartesia/_async_sse.py +0 -95
  166. cartesia/_logger.py +0 -3
  167. cartesia/_sse.py +0 -143
  168. cartesia/_types.py +0 -70
  169. cartesia/_websocket.py +0 -358
  170. cartesia/async_client.py +0 -82
  171. cartesia/async_tts.py +0 -176
  172. cartesia/resource.py +0 -44
  173. cartesia/tts.py +0 -292
  174. cartesia/utils/deprecated.py +0 -55
  175. cartesia/utils/retry.py +0 -87
  176. cartesia/utils/tts.py +0 -78
  177. cartesia/voices.py +0 -204
  178. cartesia-1.4.0.dist-info/METADATA +0 -663
  179. cartesia-1.4.0.dist-info/RECORD +0 -23
  180. cartesia-1.4.0.dist-info/licenses/LICENSE.md +0 -21
  181. /cartesia/{utils/__init__.py → py.typed} +0 -0
@@ -0,0 +1,16 @@
1
+ # This file was auto-generated by Fern from our API Definition.
2
+
3
+ import typing_extensions
4
+ import typing
5
+ from ...voices.types.voice_id import VoiceId
6
+ import typing_extensions
7
+ from .controls import ControlsParams
8
+ from ...core.serialization import FieldMetadata
9
+
10
+
11
class TtsRequestIdSpecifierParams(typing_extensions.TypedDict):
    """Voice specifier dict whose ``mode`` is fixed to ``"id"`` and whose ``id`` is a VoiceId."""

    # Fixed literal; the only accepted value is "id".
    mode: typing.Literal["id"]
    id: VoiceId
    # Optional key; its FieldMetadata records "__experimental_controls" as the alias.
    experimental_controls: typing_extensions.NotRequired[
        typing_extensions.Annotated[
            ControlsParams,
            FieldMetadata(alias="__experimental_controls"),
        ]
    ]
@@ -0,0 +1,7 @@
1
+ # This file was auto-generated by Fern from our API Definition.
2
+
3
+ import typing
4
+ from .tts_request_id_specifier import TtsRequestIdSpecifierParams
5
+ from .tts_request_embedding_specifier import TtsRequestEmbeddingSpecifierParams
6
+
7
# A voice may be specified either by ID or by embedding.
TtsRequestVoiceSpecifierParams = typing.Union[
    TtsRequestIdSpecifierParams,
    TtsRequestEmbeddingSpecifierParams,
]
@@ -0,0 +1,7 @@
1
+ # This file was auto-generated by Fern from our API Definition.
2
+
3
+ from .raw_output_format import RawOutputFormatParams
4
+
5
+
6
class WavOutputFormatParams(RawOutputFormatParams):
    """WAV output format dict; declares no keys beyond those of RawOutputFormatParams."""
@@ -0,0 +1,11 @@
1
+ # This file was auto-generated by Fern from our API Definition.
2
+
3
+ import typing_extensions
4
+ import typing_extensions
5
+ from ..types.context_id import ContextId
6
+
7
+
8
class WebSocketBaseResponseParams(typing_extensions.TypedDict):
    """Keys shared by the WebSocket response dicts that subclass this type."""

    # Optional key; may be absent from the dict.
    context_id: typing_extensions.NotRequired[ContextId]
    status_code: int
    done: bool
@@ -0,0 +1,11 @@
1
+ # This file was auto-generated by Fern from our API Definition.
2
+
3
+ from .web_socket_base_response import WebSocketBaseResponseParams
4
+ import typing_extensions
5
+ from ..types.flush_id import FlushId
6
+
7
+
8
class WebSocketChunkResponseParams(WebSocketBaseResponseParams):
    """Base response keys plus ``data``, ``step_time`` and an optional ``flush_id``."""

    data: str
    step_time: float
    # Optional key; may be absent from the dict.
    flush_id: typing_extensions.NotRequired[FlushId]
@@ -0,0 +1,7 @@
1
+ # This file was auto-generated by Fern from our API Definition.
2
+
3
+ from .web_socket_base_response import WebSocketBaseResponseParams
4
+
5
+
6
class WebSocketDoneResponseParams(WebSocketBaseResponseParams):
    """Done response dict; declares no keys beyond those of WebSocketBaseResponseParams."""
@@ -0,0 +1,7 @@
1
+ # This file was auto-generated by Fern from our API Definition.
2
+
3
+ from .web_socket_base_response import WebSocketBaseResponseParams
4
+
5
+
6
class WebSocketErrorResponseParams(WebSocketBaseResponseParams):
    """Base response keys plus a required ``error`` string."""

    error: str
@@ -0,0 +1,9 @@
1
+ # This file was auto-generated by Fern from our API Definition.
2
+
3
+ from .web_socket_base_response import WebSocketBaseResponseParams
4
+ from ..types.flush_id import FlushId
5
+
6
+
7
class WebSocketFlushDoneResponseParams(WebSocketBaseResponseParams):
    """Base response keys plus the required ``flush_id`` and ``flush_done`` keys."""

    flush_id: FlushId
    flush_done: bool
@@ -0,0 +1,9 @@
1
+ # This file was auto-generated by Fern from our API Definition.
2
+
3
+ from .web_socket_base_response import WebSocketBaseResponseParams
4
+ import typing_extensions
5
+ from .phoneme_timestamps import PhonemeTimestampsParams
6
+
7
+
8
class WebSocketPhonemeTimestampsResponseParams(WebSocketBaseResponseParams):
    """Base response keys plus an optional ``phoneme_timestamps`` payload."""

    # Optional key; may be absent from the dict.
    phoneme_timestamps: typing_extensions.NotRequired[PhonemeTimestampsParams]
@@ -0,0 +1,11 @@
1
+ # This file was auto-generated by Fern from our API Definition.
2
+
3
+ import typing_extensions
4
+ import typing
5
+ from ..types.raw_encoding import RawEncoding
6
+
7
+
8
class WebSocketRawOutputFormatParams(typing_extensions.TypedDict):
    """Output format dict whose ``container`` is fixed to ``"raw"``."""

    # Fixed literal; the only accepted value is "raw".
    container: typing.Literal["raw"]
    encoding: RawEncoding
    sample_rate: int
@@ -0,0 +1,7 @@
1
+ # This file was auto-generated by Fern from our API Definition.
2
+
3
+ import typing
4
+ from .generation_request import GenerationRequestParams
5
+ from .cancel_context_request import CancelContextRequestParams
6
+
7
# A WebSocket request is either a generation request or a cancel-context request.
WebSocketRequestParams = typing.Union[
    GenerationRequestParams,
    CancelContextRequestParams,
]
@@ -0,0 +1,70 @@
1
+ # This file was auto-generated by Fern from our API Definition.
2
+
3
+ from __future__ import annotations
4
+ import typing_extensions
5
+ import typing
6
+ import typing_extensions
7
+ from ..types.flush_id import FlushId
8
+ from ..types.context_id import ContextId
9
+ from .word_timestamps import WordTimestampsParams
10
+ from .phoneme_timestamps import PhonemeTimestampsParams
11
+
12
+
13
class WebSocketResponse_ChunkParams(typing_extensions.TypedDict):
    """WebSocketResponseParams member tagged with ``type == "chunk"``."""

    # Tag value that identifies this member of the union.
    type: typing.Literal["chunk"]
    data: str
    step_time: float
    flush_id: typing_extensions.NotRequired[FlushId]
    context_id: typing_extensions.NotRequired[ContextId]
    status_code: int
    done: bool
21
+
22
+
23
class WebSocketResponse_FlushDoneParams(typing_extensions.TypedDict):
    """WebSocketResponseParams member tagged with ``type == "flush_done"``."""

    # Tag value that identifies this member of the union.
    type: typing.Literal["flush_done"]
    flush_id: FlushId
    flush_done: bool
    context_id: typing_extensions.NotRequired[ContextId]
    status_code: int
    done: bool
30
+
31
+
32
class WebSocketResponse_DoneParams(typing_extensions.TypedDict):
    """WebSocketResponseParams member tagged with ``type == "done"``."""

    # Tag value that identifies this member of the union.
    type: typing.Literal["done"]
    context_id: typing_extensions.NotRequired[ContextId]
    status_code: int
    done: bool
37
+
38
+
39
class WebSocketResponse_TimestampsParams(typing_extensions.TypedDict):
    """WebSocketResponseParams member tagged with ``type == "timestamps"``."""

    # Tag value that identifies this member of the union.
    type: typing.Literal["timestamps"]
    word_timestamps: typing_extensions.NotRequired[WordTimestampsParams]
    context_id: typing_extensions.NotRequired[ContextId]
    status_code: int
    done: bool
45
+
46
+
47
class WebSocketResponse_ErrorParams(typing_extensions.TypedDict):
    """WebSocketResponseParams member tagged with ``type == "error"``."""

    # Tag value that identifies this member of the union.
    type: typing.Literal["error"]
    error: str
    context_id: typing_extensions.NotRequired[ContextId]
    status_code: int
    done: bool
53
+
54
+
55
class WebSocketResponse_PhonemeTimestampsParams(typing_extensions.TypedDict):
    """WebSocketResponseParams member tagged with ``type == "phoneme_timestamps"``."""

    # Tag value that identifies this member of the union.
    type: typing.Literal["phoneme_timestamps"]
    phoneme_timestamps: typing_extensions.NotRequired[PhonemeTimestampsParams]
    context_id: typing_extensions.NotRequired[ContextId]
    status_code: int
    done: bool
61
+
62
+
63
# Union of every WebSocket response dict; each member carries a distinct
# literal under its "type" key.
WebSocketResponseParams = typing.Union[
    WebSocketResponse_ChunkParams,
    WebSocketResponse_FlushDoneParams,
    WebSocketResponse_DoneParams,
    WebSocketResponse_TimestampsParams,
    WebSocketResponse_ErrorParams,
    WebSocketResponse_PhonemeTimestampsParams,
]
@@ -0,0 +1,8 @@
1
+ # This file was auto-generated by Fern from our API Definition.
2
+
3
+ import typing_extensions
4
+ import typing_extensions
5
+
6
+
7
class WebSocketStreamOptionsParams(typing_extensions.TypedDict):
    """Stream options dict with a single optional ``timeout`` key."""

    # Optional float; units are not stated in this definition.
    timeout: typing_extensions.NotRequired[float]
@@ -0,0 +1,9 @@
1
+ # This file was auto-generated by Fern from our API Definition.
2
+
3
+ from .web_socket_base_response import WebSocketBaseResponseParams
4
+ import typing_extensions
5
+ from .word_timestamps import WordTimestampsParams
6
+
7
+
8
class WebSocketTimestampsResponseParams(WebSocketBaseResponseParams):
    """Base response keys plus an optional ``word_timestamps`` payload."""

    # Optional key; may be absent from the dict.
    word_timestamps: typing_extensions.NotRequired[WordTimestampsParams]
@@ -0,0 +1,18 @@
1
+ # This file was auto-generated by Fern from our API Definition.
2
+
3
+ import typing_extensions
4
+ import typing_extensions
5
+ from .word_timestamps import WordTimestampsParams
6
+ from .phoneme_timestamps import PhonemeTimestampsParams
7
+ import typing
8
+ from ..types.context_id import ContextId
9
+ from ..types.flush_id import FlushId
10
+
11
+
12
class WebSocketTtsOutputParams(typing_extensions.TypedDict):
    """WebSocket TTS output dict; every key is optional."""

    word_timestamps: typing_extensions.NotRequired[WordTimestampsParams]
    phoneme_timestamps: typing_extensions.NotRequired[PhonemeTimestampsParams]
    # Typed as Optional[Any]: the key may be absent, or present with any value including None.
    audio: typing_extensions.NotRequired[typing.Optional[typing.Any]]
    context_id: typing_extensions.NotRequired[ContextId]
    flush_id: typing_extensions.NotRequired[FlushId]
    flush_done: typing_extensions.NotRequired[bool]
@@ -0,0 +1,25 @@
1
+ # This file was auto-generated by Fern from our API Definition.
2
+
3
+ import typing_extensions
4
+ import typing_extensions
5
+ from .output_format import OutputFormatParams
6
+ from .tts_request_voice_specifier import TtsRequestVoiceSpecifierParams
7
+ from ...core.serialization import FieldMetadata
8
+
9
+
10
class WebSocketTtsRequestParams(typing_extensions.TypedDict):
    # Only model_id and voice are required; every other key is NotRequired.
    model_id: str
    """
    The ID of the model to use for the generation. See [Models](/build-with-sonic/models) for available models.
    """

    output_format: typing_extensions.NotRequired[OutputFormatParams]
    transcript: typing_extensions.NotRequired[str]
    voice: TtsRequestVoiceSpecifierParams
    duration: typing_extensions.NotRequired[int]
    language: typing_extensions.NotRequired[str]
    add_timestamps: typing_extensions.NotRequired[bool]
    use_original_timestamps: typing_extensions.NotRequired[bool]
    add_phoneme_timestamps: typing_extensions.NotRequired[bool]
    # Named with a trailing underscore because "continue" is a Python keyword;
    # the FieldMetadata alias records the un-suffixed name "continue".
    continue_: typing_extensions.NotRequired[
        typing_extensions.Annotated[bool, FieldMetadata(alias="continue")]
    ]
    context_id: typing_extensions.NotRequired[str]
@@ -0,0 +1,10 @@
1
+ # This file was auto-generated by Fern from our API Definition.
2
+
3
+ import typing_extensions
4
+ import typing
5
+
6
+
7
class WordTimestampsParams(typing_extensions.TypedDict):
    """Word timestamps dict holding three sequences: ``words``, ``start`` and ``end``."""

    words: typing.Sequence[str]
    # NOTE(review): start/end presumably align index-for-index with words — confirm.
    start: typing.Sequence[float]
    end: typing.Sequence[float]
@@ -0,0 +1,302 @@
1
+ import io
2
+ import typing
3
+ from json.decoder import JSONDecodeError
4
+
5
+ from pydub import AudioSegment # type: ignore
6
+
7
+ from ..core.api_error import ApiError
8
+ from ._async_websocket import AsyncTtsWebsocket
9
+ from ._websocket import TtsWebsocket
10
+ from .client import AsyncTtsClient, TtsClient
11
+ from .requests import TtsRequestVoiceSpecifierParams
12
+ from .requests.output_format import OutputFormatParams
13
+ from .utils.tts import concat_audio_segments, get_output_format
14
+
15
+
16
class TtsClientWithWebsocket(TtsClient):
    """
    Extension of TtsClient that supports a synchronous WebSocket TTS connection.

    On top of the inherited client it offers ``infill`` (a multipart POST to
    ``infill/bytes``) and ``websocket`` (returns a connected ``TtsWebsocket``).
    """

    def __init__(self, *, client_wrapper):
        super().__init__(client_wrapper=client_wrapper)

    def get_output_format(self, output_format_name: str) -> OutputFormatParams:
        """Look up an output format by name via the module-level ``get_output_format``."""
        return get_output_format(output_format_name)

    def _ws_url(self) -> str:
        """Return the client's base URL rewritten to a WebSocket scheme.

        A base URL that already starts with ``ws://`` or ``wss://`` is returned
        unchanged. Otherwise the existing scheme is dropped and replaced with
        ``ws`` when the URL contains ``"localhost"`` and ``wss`` in every other
        case.
        """
        base_url = self._client_wrapper.get_base_url()
        if base_url.startswith(("ws://", "wss://")):
            return base_url
        # Substring test: any URL that mentions "localhost" gets the plain ws scheme.
        scheme = "ws" if "localhost" in base_url else "wss"
        host_and_path = base_url.split("://")[-1]
        return f"{scheme}://{host_and_path}"

    @staticmethod
    def _validate_infill_args(
        voice: TtsRequestVoiceSpecifierParams,
        output_format: OutputFormatParams,
        left_audio_path: typing.Optional[str],
        right_audio_path: typing.Optional[str],
    ) -> None:
        """Raise ValueError for argument combinations that infill does not accept."""
        if not left_audio_path and not right_audio_path:
            raise ValueError(
                "Must specify at least one of left_audio_path or right_audio_path"
            )
        if voice["mode"] != "id":
            raise ValueError("Infill is only supported for id-based voice specifiers")
        if output_format["container"] == "raw":
            raise ValueError(
                "Raw format is not supported for infill. Use wav or mp3 format instead."
            )

    @staticmethod
    def _infill_form_data(
        model_id: str,
        language: str,
        transcript: str,
        voice: TtsRequestVoiceSpecifierParams,
        output_format: OutputFormatParams,
    ) -> typing.Dict[str, typing.Any]:
        """Build the multipart form fields, flattening output_format into bracketed keys."""
        data: typing.Dict[str, typing.Any] = {
            "model_id": model_id,
            "language": language,
            "transcript": transcript,
            "voice_id": voice["id"],
            "output_format[container]": output_format["container"],
            "output_format[sample_rate]": output_format["sample_rate"],
        }
        bit_rate = output_format.get("bit_rate")
        if bit_rate is not None:
            data["output_format[bit_rate]"] = bit_rate
        # encoding is forwarded for every container except mp3.
        encoding = output_format.get("encoding")
        if output_format["container"] != "mp3" and encoding is not None:
            data["output_format[encoding]"] = encoding
        return data

    @staticmethod
    def _read_from_start(
        audio_file: typing.Optional[typing.IO[bytes]],
    ) -> typing.Optional[bytes]:
        """Rewind an open file and return all of its bytes; None when no file was given."""
        if audio_file is None:
            return None
        # The upload may have advanced the file position, so rewind before reading.
        audio_file.seek(0)
        return audio_file.read()

    def infill(
        self,
        *,
        model_id: str,
        language: str,
        transcript: str,
        voice: TtsRequestVoiceSpecifierParams,
        output_format: OutputFormatParams,
        left_audio_path: typing.Optional[str] = None,
        right_audio_path: typing.Optional[str] = None,
    ) -> typing.Tuple[bytes, bytes]:
        """Generate infill audio between two existing audio segments.

        Args:
            model_id: The ID of the model to use for generating audio
            language: The language of the transcript
            transcript: The text to synthesize
            voice: The voice to use for generating audio (must have mode "id")
            output_format: The desired audio output format (container must not be "raw")
            left_audio_path: Path to the audio file that comes before the infill
            right_audio_path: Path to the audio file that comes after the infill

        Returns:
            A tuple containing:
                - The generated infill audio (bytes)
                - The complete concatenated audio (bytes)

        Raises:
            ValueError: If neither audio path is given, the voice is not
                id-based, or the output container is "raw".
            ApiError: If the server answers with a non-2xx status code.
        """
        self._validate_infill_args(
            voice, output_format, left_audio_path, right_audio_path
        )

        headers = self._client_wrapper.get_headers()
        headers.pop("Content-Type", None)

        left_audio_file: typing.Optional[typing.IO[bytes]] = None
        right_audio_file: typing.Optional[typing.IO[bytes]] = None
        try:
            files: typing.Dict[str, typing.IO[bytes]] = {}
            if left_audio_path:
                left_audio_file = open(left_audio_path, "rb")
                files["left_audio"] = left_audio_file
            if right_audio_path:
                right_audio_file = open(right_audio_path, "rb")
                files["right_audio"] = right_audio_file

            data = self._infill_form_data(
                model_id, language, transcript, voice, output_format
            )

            _response = self._client_wrapper.httpx_client.request(
                "infill/bytes",
                method="POST",
                files=files,  # type: ignore
                data=data,
                headers=headers,
            )

            if 200 <= _response.status_code < 300:
                left_audio = self._read_from_start(left_audio_file)
                right_audio = self._read_from_start(right_audio_file)
                infill_audio = _response.content
                total_audio = concat_audio_segments(
                    left_audio,
                    infill_audio,
                    right_audio,
                    format=output_format["container"].lower(),
                )
                return infill_audio, total_audio

            # Error path: prefer the parsed JSON body, fall back to the raw text.
            try:
                _response_json = _response.json()
            except JSONDecodeError:
                raise ApiError(status_code=_response.status_code, body=_response.text)
            raise ApiError(status_code=_response.status_code, body=_response_json)
        finally:
            if left_audio_file is not None:
                left_audio_file.close()
            if right_audio_file is not None:
                right_audio_file.close()

    def websocket(self) -> TtsWebsocket:
        """Create a TtsWebsocket from this client's URL and headers, connect it, and return it.

        The ``Cartesia-Version`` and ``X-API-Key`` values are read from the
        client wrapper's headers.
        """
        client_headers = self._client_wrapper.get_headers()
        ws = TtsWebsocket(
            ws_url=self._ws_url(),
            cartesia_version=client_headers["Cartesia-Version"],
            api_key=client_headers["X-API-Key"],
        )
        ws.connect()
        return ws
+
159
+
160
class AsyncTtsClientWithWebsocket(AsyncTtsClient):
    """
    Extension of AsyncTtsClient that supports an asynchronous WebSocket TTS connection.

    On top of the inherited client it offers ``infill`` (a multipart POST to
    ``infill/bytes``) and ``websocket`` (returns a connected ``AsyncTtsWebsocket``).
    """

    def __init__(self, *, client_wrapper, get_session):
        super().__init__(client_wrapper=client_wrapper)
        # Passed through unchanged to AsyncTtsWebsocket in websocket().
        self._get_session = get_session

    def get_output_format(self, output_format_name: str) -> OutputFormatParams:
        """Look up an output format by name via the module-level ``get_output_format``."""
        return get_output_format(output_format_name)

    def _ws_url(self) -> str:
        """Return the client's base URL rewritten to a WebSocket scheme.

        A base URL that already starts with ``ws://`` or ``wss://`` is returned
        unchanged. Otherwise the existing scheme is dropped and replaced with
        ``ws`` when the URL contains ``"localhost"`` and ``wss`` in every other
        case.
        """
        base_url = self._client_wrapper.get_base_url()
        if base_url.startswith(("ws://", "wss://")):
            return base_url
        # Substring test: any URL that mentions "localhost" gets the plain ws scheme.
        scheme = "ws" if "localhost" in base_url else "wss"
        host_and_path = base_url.split("://")[-1]
        return f"{scheme}://{host_and_path}"

    @staticmethod
    def _validate_infill_args(
        voice: TtsRequestVoiceSpecifierParams,
        output_format: OutputFormatParams,
        left_audio_path: typing.Optional[str],
        right_audio_path: typing.Optional[str],
    ) -> None:
        """Raise ValueError for argument combinations that infill does not accept."""
        if not left_audio_path and not right_audio_path:
            raise ValueError(
                "Must specify at least one of left_audio_path or right_audio_path"
            )
        if voice["mode"] != "id":
            raise ValueError("Infill is only supported for id-based voice specifiers")
        if output_format["container"] == "raw":
            raise ValueError(
                "Raw format is not supported for infill. Use wav or mp3 format instead."
            )

    @staticmethod
    def _infill_form_data(
        model_id: str,
        language: str,
        transcript: str,
        voice: TtsRequestVoiceSpecifierParams,
        output_format: OutputFormatParams,
    ) -> typing.Dict[str, typing.Any]:
        """Build the multipart form fields, flattening output_format into bracketed keys."""
        data: typing.Dict[str, typing.Any] = {
            "model_id": model_id,
            "language": language,
            "transcript": transcript,
            "voice_id": voice["id"],
            "output_format[container]": output_format["container"],
            "output_format[sample_rate]": output_format["sample_rate"],
        }
        bit_rate = output_format.get("bit_rate")
        if bit_rate is not None:
            data["output_format[bit_rate]"] = bit_rate
        # encoding is forwarded for every container except mp3.
        encoding = output_format.get("encoding")
        if output_format["container"] != "mp3" and encoding is not None:
            data["output_format[encoding]"] = encoding
        return data

    @staticmethod
    def _read_from_start(
        audio_file: typing.Optional[typing.IO[bytes]],
    ) -> typing.Optional[bytes]:
        """Rewind an open file and return all of its bytes; None when no file was given."""
        if audio_file is None:
            return None
        # The upload may have advanced the file position, so rewind before reading.
        audio_file.seek(0)
        return audio_file.read()

    async def infill(
        self,
        *,
        model_id: str,
        language: str,
        transcript: str,
        voice: TtsRequestVoiceSpecifierParams,
        output_format: OutputFormatParams,
        left_audio_path: typing.Optional[str] = None,
        right_audio_path: typing.Optional[str] = None,
    ) -> typing.Tuple[bytes, bytes]:
        """Generate infill audio between two existing audio segments.

        Args:
            model_id: The ID of the model to use for generating audio
            language: The language of the transcript
            transcript: The text to synthesize
            voice: The voice to use for generating audio (must have mode "id")
            output_format: The desired audio output format (container must not be "raw")
            left_audio_path: Path to the audio file that comes before the infill
            right_audio_path: Path to the audio file that comes after the infill

        Returns:
            A tuple containing:
                - The generated infill audio (bytes)
                - The complete concatenated audio (bytes)

        Raises:
            ValueError: If neither audio path is given, the voice is not
                id-based, or the output container is "raw".
            ApiError: If the server answers with a non-2xx status code.
        """
        self._validate_infill_args(
            voice, output_format, left_audio_path, right_audio_path
        )

        headers = self._client_wrapper.get_headers()
        headers.pop("Content-Type", None)

        left_audio_file: typing.Optional[typing.IO[bytes]] = None
        right_audio_file: typing.Optional[typing.IO[bytes]] = None
        try:
            # NOTE(review): open()/read() below are synchronous calls made from a
            # coroutine; they run on the calling thread.
            files: typing.Dict[str, typing.IO[bytes]] = {}
            if left_audio_path:
                left_audio_file = open(left_audio_path, "rb")
                files["left_audio"] = left_audio_file
            if right_audio_path:
                right_audio_file = open(right_audio_path, "rb")
                files["right_audio"] = right_audio_file

            data = self._infill_form_data(
                model_id, language, transcript, voice, output_format
            )

            _response = await self._client_wrapper.httpx_client.request(
                "infill/bytes",
                method="POST",
                files=files,  # type: ignore
                headers=headers,
                data=data,
                request_options=None,
            )

            if 200 <= _response.status_code < 300:
                left_audio = self._read_from_start(left_audio_file)
                right_audio = self._read_from_start(right_audio_file)
                infill_audio = _response.content
                total_audio = concat_audio_segments(
                    left_audio,
                    infill_audio,
                    right_audio,
                    format=output_format["container"].lower(),
                )
                return infill_audio, total_audio

            # Error path: prefer the parsed JSON body, fall back to the raw text.
            try:
                _response_json = _response.json()
            except JSONDecodeError:
                raise ApiError(status_code=_response.status_code, body=_response.text)
            raise ApiError(status_code=_response.status_code, body=_response_json)
        finally:
            if left_audio_file is not None:
                left_audio_file.close()
            if right_audio_file is not None:
                right_audio_file.close()

    async def websocket(self) -> AsyncTtsWebsocket:
        """Create an AsyncTtsWebsocket, await its connection, and return it.

        The ``Cartesia-Version`` and ``X-API-Key`` values are read from the
        client wrapper's headers; ``get_session`` is the callable given to
        ``__init__``.
        """
        client_headers = self._client_wrapper.get_headers()
        ws = AsyncTtsWebsocket(
            ws_url=self._ws_url(),
            cartesia_version=client_headers["Cartesia-Version"],
            api_key=client_headers["X-API-Key"],
            get_session=self._get_session,
        )
        await ws.connect()
        return ws
1
+ # This file was auto-generated by Fern from our API Definition.
2
+
3
+ from .cancel_context_request import CancelContextRequest
4
+ from .context_id import ContextId
5
+ from .controls import Controls
6
+ from .emotion import Emotion
7
+ from .flush_id import FlushId
8
+ from .generation_request import GenerationRequest
9
+ from .mp_3_output_format import Mp3OutputFormat
10
+ from .natural_specifier import NaturalSpecifier
11
+ from .numerical_specifier import NumericalSpecifier
12
+ from .output_format import OutputFormat, OutputFormat_Mp3, OutputFormat_Raw, OutputFormat_Wav
13
+ from .phoneme_timestamps import PhonemeTimestamps
14
+ from .raw_encoding import RawEncoding
15
+ from .raw_output_format import RawOutputFormat
16
+ from .speed import Speed
17
+ from .supported_language import SupportedLanguage
18
+ from .tts_request import TtsRequest
19
+ from .tts_request_embedding_specifier import TtsRequestEmbeddingSpecifier
20
+ from .tts_request_id_specifier import TtsRequestIdSpecifier
21
+ from .tts_request_voice_specifier import TtsRequestVoiceSpecifier
22
+ from .wav_output_format import WavOutputFormat
23
+ from .web_socket_base_response import WebSocketBaseResponse
24
+ from .web_socket_chunk_response import WebSocketChunkResponse
25
+ from .web_socket_done_response import WebSocketDoneResponse
26
+ from .web_socket_error_response import WebSocketErrorResponse
27
+ from .web_socket_flush_done_response import WebSocketFlushDoneResponse
28
+ from .web_socket_phoneme_timestamps_response import WebSocketPhonemeTimestampsResponse
29
+ from .web_socket_raw_output_format import WebSocketRawOutputFormat
30
+ from .web_socket_request import WebSocketRequest
31
+ from .web_socket_response import (
32
+ WebSocketResponse,
33
+ WebSocketResponse_Chunk,
34
+ WebSocketResponse_Done,
35
+ WebSocketResponse_Error,
36
+ WebSocketResponse_FlushDone,
37
+ WebSocketResponse_PhonemeTimestamps,
38
+ WebSocketResponse_Timestamps,
39
+ )
40
+ from .web_socket_stream_options import WebSocketStreamOptions
41
+ from .web_socket_timestamps_response import WebSocketTimestampsResponse
42
+ from .web_socket_tts_output import WebSocketTtsOutput
43
+ from .web_socket_tts_request import WebSocketTtsRequest
44
+ from .word_timestamps import WordTimestamps
45
+
46
# Public names re-exported by this package; one entry per name imported above.
__all__ = [
    "CancelContextRequest",
    "ContextId",
    "Controls",
    "Emotion",
    "FlushId",
    "GenerationRequest",
    "Mp3OutputFormat",
    "NaturalSpecifier",
    "NumericalSpecifier",
    "OutputFormat",
    "OutputFormat_Mp3",
    "OutputFormat_Raw",
    "OutputFormat_Wav",
    "PhonemeTimestamps",
    "RawEncoding",
    "RawOutputFormat",
    "Speed",
    "SupportedLanguage",
    "TtsRequest",
    "TtsRequestEmbeddingSpecifier",
    "TtsRequestIdSpecifier",
    "TtsRequestVoiceSpecifier",
    "WavOutputFormat",
    "WebSocketBaseResponse",
    "WebSocketChunkResponse",
    "WebSocketDoneResponse",
    "WebSocketErrorResponse",
    "WebSocketFlushDoneResponse",
    "WebSocketPhonemeTimestampsResponse",
    "WebSocketRawOutputFormat",
    "WebSocketRequest",
    "WebSocketResponse",
    "WebSocketResponse_Chunk",
    "WebSocketResponse_Done",
    "WebSocketResponse_Error",
    "WebSocketResponse_FlushDone",
    "WebSocketResponse_PhonemeTimestamps",
    "WebSocketResponse_Timestamps",
    "WebSocketStreamOptions",
    "WebSocketTimestampsResponse",
    "WebSocketTtsOutput",
    "WebSocketTtsRequest",
    "WordTimestamps",
]