cartesia 1.4.0__py3-none-any.whl → 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (181) hide show
  1. cartesia/__init__.py +302 -3
  2. cartesia/api_status/__init__.py +6 -0
  3. cartesia/api_status/client.py +104 -0
  4. cartesia/api_status/requests/__init__.py +5 -0
  5. cartesia/api_status/requests/api_info.py +8 -0
  6. cartesia/api_status/types/__init__.py +5 -0
  7. cartesia/api_status/types/api_info.py +20 -0
  8. cartesia/base_client.py +156 -0
  9. cartesia/client.py +163 -40
  10. cartesia/core/__init__.py +50 -0
  11. cartesia/core/api_error.py +15 -0
  12. cartesia/core/client_wrapper.py +55 -0
  13. cartesia/core/datetime_utils.py +28 -0
  14. cartesia/core/file.py +67 -0
  15. cartesia/core/http_client.py +499 -0
  16. cartesia/core/jsonable_encoder.py +101 -0
  17. cartesia/core/pagination.py +88 -0
  18. cartesia/core/pydantic_utilities.py +296 -0
  19. cartesia/core/query_encoder.py +58 -0
  20. cartesia/core/remove_none_from_dict.py +11 -0
  21. cartesia/core/request_options.py +35 -0
  22. cartesia/core/serialization.py +272 -0
  23. cartesia/datasets/__init__.py +24 -0
  24. cartesia/datasets/requests/__init__.py +15 -0
  25. cartesia/datasets/requests/create_dataset_request.py +7 -0
  26. cartesia/datasets/requests/dataset.py +9 -0
  27. cartesia/datasets/requests/dataset_file.py +9 -0
  28. cartesia/datasets/requests/paginated_dataset_files.py +10 -0
  29. cartesia/datasets/requests/paginated_datasets.py +10 -0
  30. cartesia/datasets/types/__init__.py +17 -0
  31. cartesia/datasets/types/create_dataset_request.py +19 -0
  32. cartesia/datasets/types/dataset.py +21 -0
  33. cartesia/datasets/types/dataset_file.py +21 -0
  34. cartesia/datasets/types/file_purpose.py +5 -0
  35. cartesia/datasets/types/paginated_dataset_files.py +21 -0
  36. cartesia/datasets/types/paginated_datasets.py +21 -0
  37. cartesia/embedding/__init__.py +5 -0
  38. cartesia/embedding/types/__init__.py +5 -0
  39. cartesia/embedding/types/embedding.py +201 -0
  40. cartesia/environment.py +7 -0
  41. cartesia/infill/__init__.py +2 -0
  42. cartesia/infill/client.py +318 -0
  43. cartesia/tts/__init__.py +167 -0
  44. cartesia/{_async_websocket.py → tts/_async_websocket.py} +212 -85
  45. cartesia/tts/_websocket.py +479 -0
  46. cartesia/tts/client.py +407 -0
  47. cartesia/tts/requests/__init__.py +76 -0
  48. cartesia/tts/requests/cancel_context_request.py +17 -0
  49. cartesia/tts/requests/controls.py +11 -0
  50. cartesia/tts/requests/generation_request.py +58 -0
  51. cartesia/tts/requests/mp_3_output_format.py +11 -0
  52. cartesia/tts/requests/output_format.py +30 -0
  53. cartesia/tts/requests/phoneme_timestamps.py +10 -0
  54. cartesia/tts/requests/raw_output_format.py +11 -0
  55. cartesia/tts/requests/speed.py +7 -0
  56. cartesia/tts/requests/tts_request.py +24 -0
  57. cartesia/tts/requests/tts_request_embedding_specifier.py +16 -0
  58. cartesia/tts/requests/tts_request_id_specifier.py +16 -0
  59. cartesia/tts/requests/tts_request_voice_specifier.py +7 -0
  60. cartesia/tts/requests/wav_output_format.py +7 -0
  61. cartesia/tts/requests/web_socket_base_response.py +11 -0
  62. cartesia/tts/requests/web_socket_chunk_response.py +11 -0
  63. cartesia/tts/requests/web_socket_done_response.py +7 -0
  64. cartesia/tts/requests/web_socket_error_response.py +7 -0
  65. cartesia/tts/requests/web_socket_flush_done_response.py +9 -0
  66. cartesia/tts/requests/web_socket_phoneme_timestamps_response.py +9 -0
  67. cartesia/tts/requests/web_socket_raw_output_format.py +11 -0
  68. cartesia/tts/requests/web_socket_request.py +7 -0
  69. cartesia/tts/requests/web_socket_response.py +70 -0
  70. cartesia/tts/requests/web_socket_stream_options.py +8 -0
  71. cartesia/tts/requests/web_socket_timestamps_response.py +9 -0
  72. cartesia/tts/requests/web_socket_tts_output.py +18 -0
  73. cartesia/tts/requests/web_socket_tts_request.py +25 -0
  74. cartesia/tts/requests/word_timestamps.py +10 -0
  75. cartesia/tts/socket_client.py +302 -0
  76. cartesia/tts/types/__init__.py +90 -0
  77. cartesia/tts/types/cancel_context_request.py +28 -0
  78. cartesia/tts/types/context_id.py +3 -0
  79. cartesia/tts/types/controls.py +22 -0
  80. cartesia/tts/types/emotion.py +34 -0
  81. cartesia/tts/types/flush_id.py +3 -0
  82. cartesia/tts/types/generation_request.py +71 -0
  83. cartesia/tts/types/mp_3_output_format.py +23 -0
  84. cartesia/tts/types/natural_specifier.py +5 -0
  85. cartesia/tts/types/numerical_specifier.py +3 -0
  86. cartesia/tts/types/output_format.py +58 -0
  87. cartesia/tts/types/phoneme_timestamps.py +21 -0
  88. cartesia/tts/types/raw_encoding.py +5 -0
  89. cartesia/tts/types/raw_output_format.py +22 -0
  90. cartesia/tts/types/speed.py +7 -0
  91. cartesia/tts/types/supported_language.py +7 -0
  92. cartesia/tts/types/tts_request.py +35 -0
  93. cartesia/tts/types/tts_request_embedding_specifier.py +27 -0
  94. cartesia/tts/types/tts_request_id_specifier.py +27 -0
  95. cartesia/tts/types/tts_request_voice_specifier.py +7 -0
  96. cartesia/tts/types/wav_output_format.py +17 -0
  97. cartesia/tts/types/web_socket_base_response.py +22 -0
  98. cartesia/tts/types/web_socket_chunk_response.py +22 -0
  99. cartesia/tts/types/web_socket_done_response.py +17 -0
  100. cartesia/tts/types/web_socket_error_response.py +19 -0
  101. cartesia/tts/types/web_socket_flush_done_response.py +21 -0
  102. cartesia/tts/types/web_socket_phoneme_timestamps_response.py +20 -0
  103. cartesia/tts/types/web_socket_raw_output_format.py +22 -0
  104. cartesia/tts/types/web_socket_request.py +7 -0
  105. cartesia/tts/types/web_socket_response.py +125 -0
  106. cartesia/tts/types/web_socket_stream_options.py +19 -0
  107. cartesia/tts/types/web_socket_timestamps_response.py +20 -0
  108. cartesia/tts/types/web_socket_tts_output.py +29 -0
  109. cartesia/tts/types/web_socket_tts_request.py +37 -0
  110. cartesia/tts/types/word_timestamps.py +21 -0
  111. cartesia/{_constants.py → tts/utils/constants.py} +2 -2
  112. cartesia/tts/utils/tts.py +64 -0
  113. cartesia/tts/utils/types.py +70 -0
  114. cartesia/version.py +3 -1
  115. cartesia/voice_changer/__init__.py +27 -0
  116. cartesia/voice_changer/client.py +395 -0
  117. cartesia/voice_changer/requests/__init__.py +15 -0
  118. cartesia/voice_changer/requests/streaming_response.py +38 -0
  119. cartesia/voice_changer/types/__init__.py +17 -0
  120. cartesia/voice_changer/types/output_format_container.py +5 -0
  121. cartesia/voice_changer/types/streaming_response.py +64 -0
  122. cartesia/voices/__init__.py +81 -0
  123. cartesia/voices/client.py +1218 -0
  124. cartesia/voices/requests/__init__.py +29 -0
  125. cartesia/voices/requests/create_voice_request.py +23 -0
  126. cartesia/voices/requests/embedding_response.py +8 -0
  127. cartesia/voices/requests/embedding_specifier.py +10 -0
  128. cartesia/voices/requests/get_voices_response.py +24 -0
  129. cartesia/voices/requests/id_specifier.py +10 -0
  130. cartesia/voices/requests/localize_dialect.py +11 -0
  131. cartesia/voices/requests/localize_voice_request.py +28 -0
  132. cartesia/voices/requests/mix_voice_specifier.py +7 -0
  133. cartesia/voices/requests/mix_voices_request.py +9 -0
  134. cartesia/voices/requests/update_voice_request.py +15 -0
  135. cartesia/voices/requests/voice.py +43 -0
  136. cartesia/voices/requests/voice_metadata.py +36 -0
  137. cartesia/voices/types/__init__.py +53 -0
  138. cartesia/voices/types/base_voice_id.py +5 -0
  139. cartesia/voices/types/clone_mode.py +5 -0
  140. cartesia/voices/types/create_voice_request.py +34 -0
  141. cartesia/voices/types/embedding_response.py +20 -0
  142. cartesia/voices/types/embedding_specifier.py +22 -0
  143. cartesia/voices/types/gender.py +5 -0
  144. cartesia/voices/types/gender_presentation.py +5 -0
  145. cartesia/voices/types/get_voices_response.py +34 -0
  146. cartesia/voices/types/id_specifier.py +22 -0
  147. cartesia/voices/types/localize_dialect.py +11 -0
  148. cartesia/voices/types/localize_english_dialect.py +5 -0
  149. cartesia/voices/types/localize_french_dialect.py +5 -0
  150. cartesia/voices/types/localize_portuguese_dialect.py +5 -0
  151. cartesia/voices/types/localize_spanish_dialect.py +5 -0
  152. cartesia/voices/types/localize_target_language.py +7 -0
  153. cartesia/voices/types/localize_voice_request.py +39 -0
  154. cartesia/voices/types/mix_voice_specifier.py +7 -0
  155. cartesia/voices/types/mix_voices_request.py +20 -0
  156. cartesia/voices/types/update_voice_request.py +27 -0
  157. cartesia/voices/types/voice.py +54 -0
  158. cartesia/voices/types/voice_expand_options.py +5 -0
  159. cartesia/voices/types/voice_id.py +3 -0
  160. cartesia/voices/types/voice_metadata.py +48 -0
  161. cartesia/voices/types/weight.py +3 -0
  162. cartesia-2.0.0.dist-info/METADATA +414 -0
  163. cartesia-2.0.0.dist-info/RECORD +165 -0
  164. {cartesia-1.4.0.dist-info → cartesia-2.0.0.dist-info}/WHEEL +1 -1
  165. cartesia/_async_sse.py +0 -95
  166. cartesia/_logger.py +0 -3
  167. cartesia/_sse.py +0 -143
  168. cartesia/_types.py +0 -70
  169. cartesia/_websocket.py +0 -358
  170. cartesia/async_client.py +0 -82
  171. cartesia/async_tts.py +0 -176
  172. cartesia/resource.py +0 -44
  173. cartesia/tts.py +0 -292
  174. cartesia/utils/deprecated.py +0 -55
  175. cartesia/utils/retry.py +0 -87
  176. cartesia/utils/tts.py +0 -78
  177. cartesia/voices.py +0 -204
  178. cartesia-1.4.0.dist-info/METADATA +0 -663
  179. cartesia-1.4.0.dist-info/RECORD +0 -23
  180. cartesia-1.4.0.dist-info/licenses/LICENSE.md +0 -21
  181. /cartesia/{utils/__init__.py → py.typed} +0 -0
@@ -1,663 +0,0 @@
1
- Metadata-Version: 2.4
2
- Name: cartesia
3
- Version: 1.4.0
4
- Summary: The official Python library for the Cartesia API.
5
- License-File: LICENSE.md
6
- Requires-Python: >=3.9
7
- Requires-Dist: aiohttp>=3.10.10
8
- Requires-Dist: httpx>=0.27.0
9
- Requires-Dist: iterators>=0.2.0
10
- Requires-Dist: pydub>=0.25.1
11
- Requires-Dist: requests>=2.31.0
12
- Requires-Dist: websockets>=10.4
13
- Description-Content-Type: text/markdown
14
-
15
- # Cartesia Python API Library
16
-
17
- ![PyPI - Version](https://img.shields.io/pypi/v/cartesia)
18
- [![Discord](https://badgen.net/badge/black/Cartesia/icon?icon=discord&label)](https://discord.gg/cartesia)
19
-
20
- The official Cartesia Python library provides convenient access to the Cartesia REST and WebSocket APIs from any Python 3.9+ application.
21
-
22
- > [!IMPORTANT]
23
- > The client library introduces breaking changes in v1.0.0, which was released on June 24th 2024. See the [release notes](https://github.com/cartesia-ai/cartesia-python/releases/tag/v1.0.0) and [migration guide](https://github.com/cartesia-ai/cartesia-python/discussions/44). Reach out to us on [Discord](https://discord.gg/cartesia) for any support requests!
24
-
25
- - [Cartesia Python API Library](#cartesia-python-api-library)
26
- - [Documentation](#documentation)
27
- - [Installation](#installation)
28
- - [Voices](#voices)
29
- - [Text-to-Speech](#text-to-speech)
30
- - [Bytes](#bytes)
31
- - [Server-Sent Events (SSE)](#server-sent-events-sse)
32
- - [WebSocket](#websocket)
33
- - [Conditioning speech on previous generations using WebSocket](#conditioning-speech-on-previous-generations-using-websocket)
34
- - [Generating timestamps using WebSocket](#generating-timestamps-using-websocket)
35
- - [Multilingual Text-to-Speech \[Alpha\]](#multilingual-text-to-speech-alpha)
36
- - [Speed and Emotion Control \[Experimental\]](#speed-and-emotion-control-experimental)
37
- - [Jupyter Notebook Usage](#jupyter-notebook-usage)
38
- - [Utility methods](#utility-methods)
39
- - [Output Formats](#output-formats)
40
-
41
-
42
- ## Documentation
43
-
44
- Our complete API documentation can be found [on docs.cartesia.ai](https://docs.cartesia.ai).
45
-
46
- ## Installation
47
-
48
- ```bash
49
- pip install cartesia
50
-
51
- # pip install in editable mode w/ dev dependencies
52
- pip install -e '.[dev]'
53
- ```
54
-
55
- ## Voices
56
-
57
- ```python
58
- from cartesia import Cartesia
59
- import os
60
-
61
- client = Cartesia(api_key=os.environ.get("CARTESIA_API_KEY"))
62
-
63
- # Get all available voices
64
- voices = client.voices.list()
65
- print(voices)
66
-
67
- # Get a specific voice
68
- voice = client.voices.get(id="a0e99841-438c-4a64-b679-ae501e7d6091")
69
- print("The embedding for", voice["name"], "is", voice["embedding"])
70
-
71
- # Clone a voice using filepath
72
- cloned_voice_embedding = client.voices.clone(filepath="path/to/voice")
73
-
74
- # Mix voices together
75
- mixed_voice_embedding = client.voices.mix(
76
- [{ "id": "voice_id_1", "weight": 0.5 }, { "id": "voice_id_2", "weight": 0.25 }, { "id": "voice_id_3", "weight": 0.25 }]
77
- )
78
-
79
- # Create a new voice
80
- new_voice = client.voices.create(
81
- name="New Voice",
82
- description="A clone of my own voice",
83
- embedding=cloned_voice_embedding,
84
- )
85
- ```
86
-
87
- ## Text-to-Speech
88
-
89
- ### Bytes
90
-
91
- ```python
92
- from cartesia import Cartesia
93
- import os
94
-
95
- client = Cartesia(api_key=os.environ.get("CARTESIA_API_KEY"))
96
-
97
- data = client.tts.bytes(
98
- model_id="sonic-english",
99
- transcript="Hello, world! I'm generating audio on Cartesia.",
100
- voice_id="a0e99841-438c-4a64-b679-ae501e7d6091", # Barbershop Man
101
- # You can find the supported `output_format`s at https://docs.cartesia.ai/api-reference/tts/bytes
102
- output_format={
103
- "container": "wav",
104
- "encoding": "pcm_f32le",
105
- "sample_rate": 44100,
106
- },
107
- )
108
-
109
- with open("output.wav", "wb") as f:
110
- f.write(data)
111
- ```
112
-
113
- ### Server-Sent Events (SSE)
114
-
115
- ```python
116
- from cartesia import Cartesia
117
- import pyaudio
118
- import os
119
-
120
- client = Cartesia(api_key=os.environ.get("CARTESIA_API_KEY"))
121
- voice_id = "a0e99841-438c-4a64-b679-ae501e7d6091"
122
- voice = client.voices.get(id=voice_id)
123
-
124
- transcript = "Hello! Welcome to Cartesia"
125
-
126
- # You can check out our models at https://docs.cartesia.ai/getting-started/available-models
127
- model_id = "sonic-english"
128
-
129
- # You can find the supported `output_format`s at https://docs.cartesia.ai/reference/api-reference/rest/stream-speech-server-sent-events
130
- output_format = {
131
- "container": "raw",
132
- "encoding": "pcm_f32le",
133
- "sample_rate": 44100,
134
- }
135
-
136
- p = pyaudio.PyAudio()
137
- rate = 44100
138
-
139
- stream = None
140
-
141
- # Generate and stream audio
142
- for output in client.tts.sse(
143
- model_id=model_id,
144
- transcript=transcript,
145
- voice_embedding=voice["embedding"],
146
- stream=True,
147
- output_format=output_format,
148
- ):
149
- buffer = output["audio"]
150
-
151
- if not stream:
152
- stream = p.open(format=pyaudio.paFloat32, channels=1, rate=rate, output=True)
153
-
154
- # Write the audio data to the stream
155
- stream.write(buffer)
156
-
157
- stream.stop_stream()
158
- stream.close()
159
- p.terminate()
160
- ```
161
-
162
- You can also use the async client if you want to make asynchronous API calls. Simply import `AsyncCartesia` instead of `Cartesia` and use `await` with each API call:
163
-
164
- ```python
165
- from cartesia import AsyncCartesia
166
- import asyncio
167
- import pyaudio
168
- import os
169
-
170
-
171
- async def write_stream():
172
- client = AsyncCartesia(api_key=os.environ.get("CARTESIA_API_KEY"))
173
- voice_id = "a0e99841-438c-4a64-b679-ae501e7d6091"
174
- voice = client.voices.get(id=voice_id)
175
- transcript = "Hello! Welcome to Cartesia"
176
- # You can check out our models at https://docs.cartesia.ai/getting-started/available-models
177
- model_id = "sonic-english"
178
-
179
- # You can find the supported `output_format`s at https://docs.cartesia.ai/reference/api-reference/rest/stream-speech-server-sent-events
180
- output_format = {
181
- "container": "raw",
182
- "encoding": "pcm_f32le",
183
- "sample_rate": 44100,
184
- }
185
-
186
- p = pyaudio.PyAudio()
187
- rate = 44100
188
-
189
- stream = None
190
-
191
- # Generate and stream audio
192
- async for output in await client.tts.sse(
193
- model_id=model_id,
194
- transcript=transcript,
195
- voice_embedding=voice["embedding"],
196
- stream=True,
197
- output_format=output_format,
198
- ):
199
- buffer = output["audio"]
200
-
201
- if not stream:
202
- stream = p.open(
203
- format=pyaudio.paFloat32, channels=1, rate=rate, output=True
204
- )
205
-
206
- # Write the audio data to the stream
207
- stream.write(buffer)
208
-
209
- stream.stop_stream()
210
- stream.close()
211
- p.terminate()
212
- await client.close()
213
-
214
-
215
- asyncio.run(write_stream())
216
- ```
217
-
218
- ### WebSocket
219
-
220
- ```python
221
- from cartesia import Cartesia
222
- import pyaudio
223
- import os
224
-
225
- client = Cartesia(api_key=os.environ.get("CARTESIA_API_KEY"))
226
- voice_id = "a0e99841-438c-4a64-b679-ae501e7d6091"
227
- voice = client.voices.get(id=voice_id)
228
- transcript = "Hello! Welcome to Cartesia"
229
-
230
- # You can check out our models at https://docs.cartesia.ai/getting-started/available-models
231
- model_id = "sonic-english"
232
-
233
- # You can find the supported `output_format`s at https://docs.cartesia.ai/reference/api-reference/rest/stream-speech-server-sent-events
234
- output_format = {
235
- "container": "raw",
236
- "encoding": "pcm_f32le",
237
- "sample_rate": 22050,
238
- }
239
-
240
- p = pyaudio.PyAudio()
241
- rate = 22050
242
-
243
- stream = None
244
-
245
- # Set up the websocket connection
246
- ws = client.tts.websocket()
247
-
248
- # Generate and stream audio using the websocket
249
- for output in ws.send(
250
- model_id=model_id,
251
- transcript=transcript,
252
- voice_embedding=voice["embedding"],
253
- stream=True,
254
- output_format=output_format,
255
- ):
256
- buffer = output["audio"]
257
-
258
- if not stream:
259
- stream = p.open(format=pyaudio.paFloat32, channels=1, rate=rate, output=True)
260
-
261
- # Write the audio data to the stream
262
- stream.write(buffer)
263
-
264
- stream.stop_stream()
265
- stream.close()
266
- p.terminate()
267
-
268
- ws.close() # Close the websocket connection
269
- ```
270
-
271
- #### Conditioning speech on previous generations using WebSocket
272
-
273
- In some cases, input text may need to be streamed in. In these cases, it would be slow to wait for all the text to buffer before sending it to Cartesia's TTS service.
274
-
275
- To mitigate this, Cartesia offers audio continuations. In this setting, users can send input text, as it becomes available, over a websocket connection.
276
-
277
- To do this, we will create a `context` and send multiple requests without awaiting the response. Then you can listen to the responses in the order they were sent.
278
-
279
- Each `context` will be closed automatically after 5 seconds of inactivity or when the `no_more_inputs` method is called. `no_more_inputs` sends a request with `continue_=False`, which indicates that no more inputs will be sent over this context.
280
-
281
- ```python
282
- import asyncio
283
- import os
284
- import pyaudio
285
- from cartesia import AsyncCartesia
286
-
287
- async def send_transcripts(ctx):
288
- # Check out voice IDs by calling `client.voices.list()` or on https://play.cartesia.ai/
289
- voice_id = "87748186-23bb-4158-a1eb-332911b0b708"
290
-
291
- # You can check out our models at https://docs.cartesia.ai/getting-started/available-models
292
- model_id = "sonic-english"
293
-
294
- # You can find the supported `output_format`s at https://docs.cartesia.ai/reference/api-reference/rest/stream-speech-server-sent-events
295
- output_format = {
296
- "container": "raw",
297
- "encoding": "pcm_f32le",
298
- "sample_rate": 44100,
299
- }
300
-
301
- transcripts = [
302
- "Sonic and Yoshi team up in a dimension-hopping adventure! ",
303
- "Racing through twisting zones, they dodge Eggman's badniks and solve ancient puzzles. ",
304
- "In the Echoing Caverns, they find the Harmonic Crystal, unlocking new powers. ",
305
- "Sonic's speed creates sound waves, while Yoshi's eggs become sonic bolts. ",
306
- "As they near Eggman's lair, our heroes charge their abilities for an epic boss battle. ",
307
- "Get ready to spin, jump, and sound-blast your way to victory in this high-octane crossover!"
308
- ]
309
-
310
- for transcript in transcripts:
311
- # Send text inputs as they become available
312
- await ctx.send(
313
- model_id=model_id,
314
- transcript=transcript,
315
- voice_id=voice_id,
316
- continue_=True,
317
- output_format=output_format,
318
- )
319
-
320
- # Indicate that no more inputs will be sent. Otherwise, the context will close after 5 seconds of inactivity.
321
- await ctx.no_more_inputs()
322
-
323
- async def receive_and_play_audio(ctx):
324
- p = pyaudio.PyAudio()
325
- stream = None
326
- rate = 44100
327
-
328
- async for output in ctx.receive():
329
- buffer = output["audio"]
330
-
331
- if not stream:
332
- stream = p.open(
333
- format=pyaudio.paFloat32,
334
- channels=1,
335
- rate=rate,
336
- output=True
337
- )
338
-
339
- stream.write(buffer)
340
-
341
- stream.stop_stream()
342
- stream.close()
343
- p.terminate()
344
-
345
- async def stream_and_listen():
346
- client = AsyncCartesia(api_key=os.environ.get("CARTESIA_API_KEY"))
347
-
348
- # Set up the websocket connection
349
- ws = await client.tts.websocket()
350
-
351
- # Create a context to send and receive audio
352
- ctx = ws.context() # Generates a random context ID if not provided
353
-
354
- send_task = asyncio.create_task(send_transcripts(ctx))
355
- listen_task = asyncio.create_task(receive_and_play_audio(ctx))
356
-
357
- # Call the two coroutine tasks concurrently
358
- await asyncio.gather(send_task, listen_task)
359
-
360
- await ws.close()
361
- await client.close()
362
-
363
- asyncio.run(stream_and_listen())
364
- ```
365
-
366
- You can also use continuations on the synchronous Cartesia client to stream in text as it becomes available. To do this, pass in a text generator that produces text chunks at intervals of less than 1 second, as shown below. This ensures smooth audio playback.
367
-
368
- Note: the sync client has a different API for continuations compared to the async client.
369
-
370
- ```python
371
- from cartesia import Cartesia
372
- import pyaudio
373
- import os
374
-
375
- client = Cartesia(api_key=os.environ.get("CARTESIA_API_KEY"))
376
-
377
- transcripts = [
378
- "The crew engaged in a range of activities designed to mirror those "
379
- "they might perform on a real Mars mission. ",
380
- "Aside from growing vegetables and maintaining their habitat, they faced "
381
- "additional stressors like communication delays with Earth, ",
382
- "up to twenty-two minutes each way, to simulate the distance from Mars to our planet. ",
383
- "These exercises were critical for understanding how astronauts can "
384
- "maintain not just physical health but also mental well-being under such challenging conditions. ",
385
- ]
386
-
387
- # Ending each transcript with a space makes the audio smoother
388
- def chunk_generator(transcripts):
389
- for transcript in transcripts:
390
- if transcript.endswith(" "):
391
- yield transcript
392
- else:
393
- yield transcript + " "
394
-
395
-
396
- # You can check out voice IDs by calling `client.voices.list()` or on https://play.cartesia.ai/
397
- voice_id = "87748186-23bb-4158-a1eb-332911b0b708"
398
-
399
- # You can check out our models at https://docs.cartesia.ai/getting-started/available-models
400
- model_id = "sonic-english"
401
-
402
- # You can find the supported `output_format`s at https://docs.cartesia.ai/reference/api-reference/rest/stream-speech-server-sent-events
403
- output_format = {
404
- "container": "raw",
405
- "encoding": "pcm_f32le",
406
- "sample_rate": 44100,
407
- }
408
-
409
- p = pyaudio.PyAudio()
410
- rate = 44100
411
-
412
- stream = None
413
-
414
- # Set up the websocket connection
415
- ws = client.tts.websocket()
416
-
417
- # Create a context to send and receive audio
418
- ctx = ws.context() # Generates a random context ID if not provided
419
-
420
- # Pass in a text generator to generate & stream the audio
421
- output_stream = ctx.send(
422
- model_id=model_id,
423
- transcript=chunk_generator(transcripts),
424
- voice_id=voice_id,
425
- output_format=output_format,
426
- )
427
-
428
- for output in output_stream:
429
- buffer = output["audio"]
430
-
431
- if not stream:
432
- stream = p.open(format=pyaudio.paFloat32, channels=1, rate=rate, output=True)
433
-
434
- # Write the audio data to the stream
435
- stream.write(buffer)
436
-
437
- stream.stop_stream()
438
- stream.close()
439
- p.terminate()
440
-
441
- ws.close() # Close the websocket connection
442
- ```
443
-
444
- ### Generating timestamps using WebSocket
445
-
446
- The WebSocket endpoint supports timestamps, allowing you to get detailed timing information for each word in the transcript. To enable this feature, pass an `add_timestamps` boolean flag to the `send` method. The results are returned in the `word_timestamps` object, which contains three keys:
447
- - words (list): The individual words in the transcript.
448
- - start (list): The starting timestamp for each word (in seconds).
449
- - end (list): The ending timestamp for each word (in seconds).
450
-
451
- ```python
452
- response = ws.send(
453
- model_id=model_id,
454
- transcript=transcript,
455
- voice_id=voice_id,
456
- output_format=output_format,
457
- stream=False,
458
- add_timestamps=True
459
- )
460
-
461
- # Accessing the word_timestamps object
462
- word_timestamps = response['word_timestamps']
463
-
464
- words = word_timestamps['words']
465
- start_times = word_timestamps['start']
466
- end_times = word_timestamps['end']
467
-
468
- for word, start, end in zip(words, start_times, end_times):
469
- print(f"Word: {word}, Start: {start}, End: {end}")
470
- ```
471
-
472
- ### Multilingual Text-to-Speech [Alpha]
473
-
474
- You can use our `sonic-multilingual` model to generate audio in multiple languages. The languages supported are available at [docs.cartesia.ai](https://docs.cartesia.ai/getting-started/available-models).
475
-
476
- ```python
477
- from cartesia import Cartesia
478
- import pyaudio
479
- import os
480
-
481
- client = Cartesia(api_key=os.environ.get("CARTESIA_API_KEY"))
482
- voice_id = "a0e99841-438c-4a64-b679-ae501e7d6091"
483
- voice = client.voices.get(id=voice_id)
484
-
485
- transcript = "Hola! Bienvenido a Cartesia"
486
- language = "es" # Language code corresponding to the language of the transcript
487
-
488
- # Make sure you use the multilingual model! You can check out all models at https://docs.cartesia.ai/getting-started/available-models
489
- model_id = "sonic-multilingual"
490
-
491
- # You can find the supported `output_format`s at https://docs.cartesia.ai/reference/api-reference/rest/stream-speech-server-sent-events
492
- output_format = {
493
- "container": "raw",
494
- "encoding": "pcm_f32le",
495
- "sample_rate": 44100,
496
- }
497
-
498
- p = pyaudio.PyAudio()
499
- rate = 44100
500
-
501
- stream = None
502
-
503
- # Pass in the corresponding language code to the `language` parameter to generate and stream audio.
504
- for output in client.tts.sse(
505
- model_id=model_id,
506
- transcript=transcript,
507
- voice_embedding=voice["embedding"],
508
- stream=True,
509
- output_format=output_format,
510
- language=language,
511
- ):
512
- buffer = output["audio"]
513
-
514
- if not stream:
515
- stream = p.open(format=pyaudio.paFloat32, channels=1, rate=rate, output=True)
516
-
517
- stream.write(buffer)
518
-
519
- stream.stop_stream()
520
- stream.close()
521
- p.terminate()
522
- ```
523
-
524
- ### Speed and Emotion Control [Experimental]
525
-
526
- You can enhance the voice output by adjusting the `speed` and `emotion` parameters. To do this, pass a `_experimental_voice_controls` dictionary with the desired `speed` and `emotion` values to any `send` method.
527
-
528
- Speed Options:
529
- - `slowest`, `slow`, `normal`, `fast`, `fastest`
530
- - Float values between -1.0 and 1.0, where -1.0 is the slowest speed and 1.0 is the fastest speed.
531
-
532
- Emotion Options:
533
- Use a list of tags in the format `emotion_name:level` where:
534
- - Emotion Names: `anger`, `positivity`, `surprise`, `sadness`, `curiosity`
535
- - Levels: `lowest`, `low`, (omit for medium level), `high`, `highest`
536
- The emotion tag levels add the specified emotion to the voice at the indicated intensity, with the omission of a level tag resulting in a medium intensity.
537
-
538
- ```python
539
- ws.send(
540
- model_id=model_id,
541
- transcript=transcript,
542
- voice_id=voice_id,
543
- output_format=output_format,
544
- _experimental_voice_controls={"speed": "fast", "emotion": ["positivity:high"]},
545
- )
546
- ```
547
-
548
- ### Jupyter Notebook Usage
549
-
550
- If you are using Jupyter Notebook or JupyterLab, you can use IPython.display.Audio to play the generated audio directly in the notebook.
551
- Additionally, in these notebook examples we show how to use the client as a context manager (though this is not required).
552
-
553
- ```python
554
- from IPython.display import Audio
555
- import io
556
- import os
557
- import numpy as np
558
-
559
- from cartesia import Cartesia
560
-
561
- with Cartesia(api_key=os.environ.get("CARTESIA_API_KEY")) as client:
562
- output_format = {
563
- "container": "raw",
564
- "encoding": "pcm_f32le",
565
- "sample_rate": 8000,
566
- }
567
- rate = 8000
568
- voice_id = "a0e99841-438c-4a64-b679-ae501e7d6091"
569
- voice = client.voices.get(id=voice_id)
570
- transcript = "Hey there! Welcome to Cartesia"
571
-
572
- # Create a BytesIO object to store the audio data
573
- audio_data = io.BytesIO()
574
-
575
- # Generate and stream audio
576
- for output in client.tts.sse(
577
- model_id="sonic-english",
578
- transcript=transcript,
579
- voice_embedding=voice["embedding"],
580
- stream=True,
581
- output_format=output_format,
582
- ):
583
- buffer = output["audio"]
584
- audio_data.write(buffer)
585
-
586
- # Set the cursor position to the beginning of the BytesIO object
587
- audio_data.seek(0)
588
-
589
- # Create an Audio object from the BytesIO data
590
- audio = Audio(np.frombuffer(audio_data.read(), dtype=np.float32), rate=rate)
591
-
592
- # Display the Audio object
593
- display(audio)
594
- ```
595
-
596
- Below is the same example using the async client:
597
-
598
- ```python
599
- from IPython.display import Audio
600
- import io
601
- import os
602
- import numpy as np
603
-
604
- from cartesia import AsyncCartesia
605
-
606
- async with AsyncCartesia(api_key=os.environ.get("CARTESIA_API_KEY")) as client:
607
- output_format = {
608
- "container": "raw",
609
- "encoding": "pcm_f32le",
610
- "sample_rate": 8000,
611
- }
612
- rate = 8000
613
- voice_id = "248be419-c632-4f23-adf1-5324ed7dbf1d"
614
- transcript = "Hey there! Welcome to Cartesia"
615
-
616
- # Create a BytesIO object to store the audio data
617
- audio_data = io.BytesIO()
618
-
619
- # Generate and stream audio
620
- async for output in client.tts.sse(
621
- model_id="sonic-english",
622
- transcript=transcript,
623
- voice_id=voice_id,
624
- stream=True,
625
- output_format=output_format,
626
- ):
627
- buffer = output["audio"]
628
- audio_data.write(buffer)
629
-
630
- # Set the cursor position to the beginning of the BytesIO object
631
- audio_data.seek(0)
632
-
633
- # Create an Audio object from the BytesIO data
634
- audio = Audio(np.frombuffer(audio_data.read(), dtype=np.float32), rate=rate)
635
-
636
- # Display the Audio object
637
- display(audio)
638
- ```
639
-
640
- ### Utility methods
641
-
642
- #### Output Formats
643
-
644
- You can use the `client.tts.get_output_format` method to convert string-based output format names into the `output_format` dictionary which is expected by the `output_format` parameter. You can see the `OutputFormatMapping` class in `cartesia._types` for the currently supported output format names. You can also view the currently supported `output_format`s in our [API Reference](https://docs.cartesia.ai/reference/api-reference/rest/stream-speech-server-sent-events).
645
-
646
- ```python
647
- # Get the output format dictionary from string name
648
- output_format = client.tts.get_output_format("raw_pcm_f32le_44100")
649
-
650
- # Pass in the output format dictionary to generate and stream audio
651
- generator = client.tts.sse(
652
- model_id=model,
653
- transcript=transcript,
654
- voice_id=SAMPLE_VOICE_ID,
655
- stream=True,
656
- output_format=output_format,
657
- )
658
- ```
659
-
660
- To avoid storing your API key in the source code, we recommend doing one of the following:
661
-
662
- 1. Use [`python-dotenv`](https://pypi.org/project/python-dotenv/) to add `CARTESIA_API_KEY="my-api-key"` to your .env file.
663
- 1. Set the `CARTESIA_API_KEY` environment variable, preferably in a secure shell init file (e.g. `~/.zshrc`, `~/.bashrc`).
@@ -1,23 +0,0 @@
1
- cartesia/__init__.py,sha256=rS7jIg4iqT0VgnwjzYK25JXxnF5hjZGE_-PGynAqHFo,126
2
- cartesia/_async_sse.py,sha256=76oIvstzVcWZCbcD8Ps419k1FEHF6lOB5qoHwawvj9k,3327
3
- cartesia/_async_websocket.py,sha256=y9YL9fU8eLENZZECJUwRBVTfEx4ZMl96Y5zHaRY2BiI,14787
4
- cartesia/_constants.py,sha256=khGNVpiQVDmv1oZU7pKTd9C1AHjiaM8zQ2He9d5zI_c,435
5
- cartesia/_logger.py,sha256=vU7QiGSy_AJuJFmClUocqIJ-Ltku_8C24ZU8L6fLJR0,53
6
- cartesia/_sse.py,sha256=CugabGUAUM-N2BruxNFxDB20HyxDlRdbN-J_yAzvBMY,5667
7
- cartesia/_types.py,sha256=p0OzSzH174WrG8LRyu_MvNXZPhhTfLArSSDpcYY4xa0,2565
8
- cartesia/_websocket.py,sha256=7gDLcfMoIwmKj07iLk5UZ4ypxlv-3UmMd3VFjVn1QaE,14921
9
- cartesia/async_client.py,sha256=y_K_Yuv0weA4k9ZYD0M9bNM3x3frsq07tqkg7R9h0-o,2714
10
- cartesia/async_tts.py,sha256=CgbrLk7tc0NKSBC8zZH5I4CpWHpOgkypo0D2hyg5LLE,6466
11
- cartesia/client.py,sha256=OS1ORUSlR8Jg-em1imeTAFfwkC85AQFnw8PYtTdUuC8,2364
12
- cartesia/resource.py,sha256=wpnB3IPcTdxYSp0vxSkpntp4NSvqvnwUWF-0ZpgWV9o,1585
13
- cartesia/tts.py,sha256=WV8OduM87ciM1ht60Fi9Fh4gunX2Xew3K96ELCzpP-8,10702
14
- cartesia/version.py,sha256=8UhoYEXHs1Oai7BW_ExBmuwWnRI-yMG_u1fQAXMizHQ,22
15
- cartesia/voices.py,sha256=DLO_GJYDRhzFbqVIqzGOP1m1Ylzq7tVm6VHrknekFCk,6968
16
- cartesia/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
17
- cartesia/utils/deprecated.py,sha256=2cXvGtrxhPeUZA5LWy2n_U5OFLDv7SHeFtzqhjSJGyk,1674
18
- cartesia/utils/retry.py,sha256=O6fyVWpH9Su8c0Fwupl57xMt6JrwJ52txBwP3faUL7k,3339
19
- cartesia/utils/tts.py,sha256=TbvBZqHR6LxPim6s5RyGiURi4hIfqWt3KUk5QYOOhfc,2177
20
- cartesia-1.4.0.dist-info/METADATA,sha256=LLv3iE6dKcAeZuSb6bDAc0GSVICUOS8szaM4n1F-Mww,21030
21
- cartesia-1.4.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
22
- cartesia-1.4.0.dist-info/licenses/LICENSE.md,sha256=PT2YG5wEtEX1TNDn5sXkUXqbn-neyr7cZenTxd40ql4,1074
23
- cartesia-1.4.0.dist-info/RECORD,,
@@ -1,21 +0,0 @@
1
- MIT License
2
-
3
- Copyright (c) 2024 Cartesia AI, Inc.
4
-
5
- Permission is hereby granted, free of charge, to any person obtaining a copy
6
- of this software and associated documentation files (the "Software"), to deal
7
- in the Software without restriction, including without limitation the rights
8
- to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
- copies of the Software, and to permit persons to whom the Software is
10
- furnished to do so, subject to the following conditions:
11
-
12
- The above copyright notice and this permission notice shall be included in all
13
- copies or substantial portions of the Software.
14
-
15
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
- IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
- FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
- AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
- LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
- OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
- SOFTWARE.