cartesia 1.0.6__py2.py3-none-any.whl → 1.0.8__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
cartesia/_types.py CHANGED
@@ -45,7 +45,7 @@ class DeprecatedOutputFormatMapping:
45
45
  "mulaw_8000": {"container": "raw", "encoding": "pcm_mulaw", "sample_rate": 8000},
46
46
  "alaw_8000": {"container": "raw", "encoding": "pcm_alaw", "sample_rate": 8000},
47
47
  }
48
-
48
+
49
49
  @classmethod
50
50
  @deprecated(
51
51
  vdeprecated="1.0.1",
@@ -74,18 +74,19 @@ class VoiceControls(TypedDict):
74
74
  """Defines different voice control parameters for voice synthesis.
75
75
 
76
76
  For a complete list of supported parameters, refer to the Cartesia API documentation.
77
- https://docs.cartesia.ai/getting-started/welcome
77
+ https://docs.cartesia.ai/api-reference
78
78
 
79
79
  Examples:
80
80
  >>> {"speed": "fastest"}
81
- >>> {"speed": "slow", "emotion": "anger:high, positivity:low"}
82
- >>> {"emotion": "surprise:high, positivity:high"}
81
+ >>> {"speed": "slow", "emotion": ["sadness:high"]}
82
+ >>> {"emotion": ["surprise:highest", "curiosity"]}
83
83
 
84
84
  Note:
85
85
  This is an experimental class and is subject to rapid change in future versions.
86
86
  """
87
+
87
88
  speed: str = ""
88
- emotion: str = ""
89
+ emotion: List[str] = []
89
90
 
90
91
 
91
92
  class OutputFormat(TypedDict):
cartesia/client.py CHANGED
@@ -23,7 +23,12 @@ import aiohttp
23
23
  import httpx
24
24
  import logging
25
25
  import requests
26
- from websockets.sync.client import connect
26
+ try:
27
+ from websockets.sync.client import connect
28
+ IS_WEBSOCKET_SYNC_AVAILABLE = True
29
+ except ImportError:
30
+ IS_WEBSOCKET_SYNC_AVAILABLE = False
31
+
27
32
  from iterators import TimeoutIterator
28
33
 
29
34
  from cartesia.utils.retry import retry_on_connection_error, retry_on_connection_error_async
@@ -208,37 +213,25 @@ class Voices(Resource):
208
213
  return response.json()
209
214
 
210
215
  def clone(self, filepath: Optional[str] = None, link: Optional[str] = None) -> List[float]:
211
- """Clone a voice from a clip or a URL.
216
+ """Clone a voice from a clip.
212
217
 
213
218
  Args:
214
219
  filepath: The path to the clip file.
215
- link: The URL to the clip
216
220
 
217
221
  Returns:
218
222
  The embedding of the cloned voice as a list of floats.
219
223
  """
220
224
  # TODO: Python has a bytes object, use that instead of a filepath
221
- if not filepath and not link:
222
- raise ValueError("At least one of 'filepath' or 'link' must be specified.")
223
- if filepath and link:
224
- raise ValueError("Only one of 'filepath' or 'link' should be specified.")
225
- if filepath:
226
- url = f"{self._http_url()}/voices/clone/clip"
227
- with open(filepath, "rb") as file:
228
- files = {"clip": file}
229
- headers = self.headers.copy()
230
- headers.pop("Content-Type", None)
231
- response = httpx.post(url, headers=headers, files=files, timeout=self.timeout)
232
- if not response.is_success:
233
- raise ValueError(f"Failed to clone voice from clip. Error: {response.text}")
234
- elif link:
235
- url = f"{self._http_url()}/voices/clone/url"
236
- params = {"link": link}
225
+ if not filepath:
226
+ raise ValueError("Filepath must be specified.")
227
+ url = f"{self._http_url()}/voices/clone/clip"
228
+ with open(filepath, "rb") as file:
229
+ files = {"clip": file}
237
230
  headers = self.headers.copy()
238
- headers.pop("Content-Type") # The content type header is not required for URLs
239
- response = httpx.post(url, headers=self.headers, params=params, timeout=self.timeout)
231
+ headers.pop("Content-Type", None)
232
+ response = httpx.post(url, headers=headers, files=files, timeout=self.timeout)
240
233
  if not response.is_success:
241
- raise ValueError(f"Failed to clone voice from URL. Error: {response.text}")
234
+ raise ValueError(f"Failed to clone voice from clip. Error: {response.text}")
242
235
 
243
236
  return response.json()["embedding"]
244
237
 
@@ -328,7 +321,11 @@ class _TTSContext:
328
321
 
329
322
  self._websocket.connect()
330
323
 
331
- voice = _validate_and_construct_voice(voice_id, voice_embedding=voice_embedding, experimental_voice_controls = _experimental_voice_controls)
324
+ voice = TTS._validate_and_construct_voice(
325
+ voice_id,
326
+ voice_embedding=voice_embedding,
327
+ experimental_voice_controls=_experimental_voice_controls,
328
+ )
332
329
 
333
330
  # Create the initial request body
334
331
  request_body = {
@@ -465,6 +462,10 @@ class _WebSocket:
465
462
  Raises:
466
463
  RuntimeError: If the connection to the WebSocket fails.
467
464
  """
465
+ if not IS_WEBSOCKET_SYNC_AVAILABLE:
466
+ raise ImportError(
467
+ "The synchronous WebSocket client is not available. Please ensure that you have 'websockets>=12.0' or compatible version installed."
468
+ )
468
469
  if self.websocket is None or self._is_websocket_closed():
469
470
  route = "tts/websocket"
470
471
  try:
@@ -493,7 +494,7 @@ class _WebSocket:
493
494
  out["audio"] = base64.b64decode(response["data"])
494
495
  elif response["type"] == EventType.TIMESTAMPS:
495
496
  out["word_timestamps"] = response["word_timestamps"]
496
-
497
+
497
498
  if include_context_id:
498
499
  out["context_id"] = response["context_id"]
499
500
 
@@ -541,7 +542,11 @@ class _WebSocket:
541
542
  if context_id is None:
542
543
  context_id = str(uuid.uuid4())
543
544
 
544
- voice = _validate_and_construct_voice(voice_id, voice_embedding=voice_embedding, experimental_voice_controls = _experimental_voice_controls)
545
+ voice = TTS._validate_and_construct_voice(
546
+ voice_id,
547
+ voice_embedding=voice_embedding,
548
+ experimental_voice_controls=_experimental_voice_controls,
549
+ )
545
550
 
546
551
  request_body = {
547
552
  "model_id": model_id,
@@ -681,7 +686,11 @@ class _SSE:
681
686
  Both the generator and the dictionary contain the following key(s):
682
687
  - audio: The audio as bytes.
683
688
  """
684
- voice = _validate_and_construct_voice(voice_id, voice_embedding=voice_embedding, experimental_voice_controls=_experimental_voice_controls)
689
+ voice = TTS._validate_and_construct_voice(
690
+ voice_id,
691
+ voice_embedding=voice_embedding,
692
+ experimental_voice_controls=_experimental_voice_controls,
693
+ )
685
694
  request_body = {
686
695
  "model_id": model_id,
687
696
  "transcript": transcript,
@@ -795,6 +804,7 @@ class TTS(Resource):
795
804
  sample_rate=output_format_obj["sample_rate"],
796
805
  )
797
806
 
807
+ @staticmethod
798
808
  def get_sample_rate(self, output_format_name: str) -> int:
799
809
  """Convenience method to get the sample rate for a given output format.
800
810
 
@@ -818,6 +828,40 @@ class TTS(Resource):
818
828
 
819
829
  return output_format_obj["sample_rate"]
820
830
 
831
+ @staticmethod
832
+ def _validate_and_construct_voice(
833
+ voice_id: Optional[str] = None,
834
+ voice_embedding: Optional[List[float]] = None,
835
+ experimental_voice_controls: Optional[VoiceControls] = None,
836
+ ) -> dict:
837
+ """Validate and construct the voice dictionary for the request.
838
+
839
+ Args:
840
+ voice_id: The ID of the voice to use for generating audio.
841
+ voice_embedding: The embedding of the voice to use for generating audio.
842
+ experimental_voice_controls: Voice controls for emotion and speed.
843
+ Note: This is an experimental feature and may rapidly change in the future.
844
+
845
+ Returns:
846
+ A dictionary representing the voice configuration.
847
+
848
+ Raises:
849
+ ValueError: If neither or both voice_id and voice_embedding are specified.
850
+ """
851
+ if voice_id is None and voice_embedding is None:
852
+ raise ValueError("Either voice_id or voice_embedding must be specified.")
853
+
854
+ if voice_id is not None and voice_embedding is not None:
855
+ raise ValueError("Only one of voice_id or voice_embedding should be specified.")
856
+
857
+ if voice_id:
858
+ voice = {"mode": "id", "id": voice_id}
859
+ else:
860
+ voice = {"mode": "embedding", "embedding": voice_embedding}
861
+ if experimental_voice_controls is not None:
862
+ voice["__experimental_controls"] = experimental_voice_controls
863
+ return voice
864
+
821
865
 
822
866
  class AsyncCartesia(Cartesia):
823
867
  """The asynchronous version of the Cartesia client."""
@@ -917,7 +961,11 @@ class _AsyncSSE(_SSE):
917
961
  stream: bool = True,
918
962
  _experimental_voice_controls: Optional[VoiceControls] = None,
919
963
  ) -> Union[bytes, AsyncGenerator[bytes, None]]:
920
- voice = _validate_and_construct_voice(voice_id, voice_embedding=voice_embedding,experimental_voice_controls=_experimental_voice_controls)
964
+ voice = TTS._validate_and_construct_voice(
965
+ voice_id,
966
+ voice_embedding=voice_embedding,
967
+ experimental_voice_controls=_experimental_voice_controls,
968
+ )
921
969
 
922
970
  request_body = {
923
971
  "model_id": model_id,
@@ -1042,7 +1090,9 @@ class _AsyncTTSContext:
1042
1090
 
1043
1091
  await self._websocket.connect()
1044
1092
 
1045
- voice = _validate_and_construct_voice(voice_id, voice_embedding, experimental_voice_controls=_experimental_voice_controls)
1093
+ voice = TTS._validate_and_construct_voice(
1094
+ voice_id, voice_embedding, experimental_voice_controls=_experimental_voice_controls
1095
+ )
1046
1096
 
1047
1097
  request_body = {
1048
1098
  "model_id": model_id,
@@ -1229,7 +1279,7 @@ class _AsyncWebSocket(_WebSocket):
1229
1279
  duration=duration,
1230
1280
  language=language,
1231
1281
  continue_=False,
1232
- add_timestamps = add_timestamps,
1282
+ add_timestamps=add_timestamps,
1233
1283
  _experimental_voice_controls=_experimental_voice_controls,
1234
1284
  )
1235
1285
 
@@ -1299,35 +1349,3 @@ class AsyncTTS(TTS):
1299
1349
  )
1300
1350
  await ws.connect()
1301
1351
  return ws
1302
-
1303
-
1304
- def _validate_and_construct_voice(
1305
- voice_id: Optional[str] = None, voice_embedding: Optional[List[float]] = None, experimental_voice_controls: Optional[VoiceControls] = None
1306
- ) -> dict:
1307
- """Validate and construct the voice dictionary for the request.
1308
-
1309
- Args:
1310
- voice_id: The ID of the voice to use for generating audio.
1311
- voice_embedding: The embedding of the voice to use for generating audio.
1312
- experimental_voice_controls: Voice controls for emotion and speed.
1313
- Note: This is an experimental feature and may rapidly change in the future.
1314
-
1315
- Returns:
1316
- A dictionary representing the voice configuration.
1317
-
1318
- Raises:
1319
- ValueError: If neither or both voice_id and voice_embedding are specified.
1320
- """
1321
- if voice_id is None and voice_embedding is None:
1322
- raise ValueError("Either voice_id or voice_embedding must be specified.")
1323
-
1324
- if voice_id is not None and voice_embedding is not None:
1325
- raise ValueError("Only one of voice_id or voice_embedding should be specified.")
1326
-
1327
- if voice_id:
1328
- voice = {"mode": "id", "id": voice_id}
1329
- else:
1330
- voice = {"mode": "embedding", "embedding": voice_embedding}
1331
- if experimental_voice_controls is not None:
1332
- voice["__experimental_controls"] = experimental_voice_controls
1333
- return voice
cartesia/version.py CHANGED
@@ -1 +1 @@
1
- __version__ = "1.0.6"
1
+ __version__ = "1.0.8"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: cartesia
3
- Version: 1.0.6
3
+ Version: 1.0.8
4
4
  Summary: The official Python library for the Cartesia API.
5
5
  Home-page:
6
6
  Author: Cartesia, Inc.
@@ -43,6 +43,22 @@ The official Cartesia Python library which provides convenient access to the Car
43
43
  > [!IMPORTANT]
44
44
  > The client library introduces breaking changes in v1.0.0, which was released on June 24th 2024. See the [release notes](https://github.com/cartesia-ai/cartesia-python/releases/tag/v1.0.0) and [migration guide](https://github.com/cartesia-ai/cartesia-python/discussions/44). Reach out to us on [Discord](https://discord.gg/ZVxavqHB9X) for any support requests!
45
45
 
46
+ - [Cartesia Python API Library](#cartesia-python-api-library)
47
+ - [Documentation](#documentation)
48
+ - [Installation](#installation)
49
+ - [Voices](#voices)
50
+ - [Text-to-Speech](#text-to-speech)
51
+ - [Server-Sent Events (SSE)](#server-sent-events-sse)
52
+ - [WebSocket](#websocket)
53
+ - [Conditioning speech on previous generations using WebSocket](#conditioning-speech-on-previous-generations-using-websocket)
54
+ - [Generating timestamps using WebSocket](#generating-timestamps-using-websocket)
55
+ - [Multilingual Text-to-Speech \[Alpha\]](#multilingual-text-to-speech-alpha)
56
+ - [Speed and Emotion Control \[Experimental\]](#speed-and-emotion-control-experimental)
57
+ - [Jupyter Notebook Usage](#jupyter-notebook-usage)
58
+ - [Utility methods](#utility-methods)
59
+ - [Output Formats](#output-formats)
60
+
61
+
46
62
  ## Documentation
47
63
 
48
64
  Our complete API documentation can be found [on docs.cartesia.ai](https://docs.cartesia.ai).
@@ -268,7 +284,7 @@ async def send_transcripts(ctx):
268
284
 
269
285
  # You can check out our models at https://docs.cartesia.ai/getting-started/available-models
270
286
  model_id = "sonic-english"
271
-
287
+
272
288
  # You can find the supported `output_format`s at https://docs.cartesia.ai/api-reference/endpoints/stream-speech-server-sent-events
273
289
  output_format = {
274
290
  "container": "raw",
@@ -284,7 +300,7 @@ async def send_transcripts(ctx):
284
300
  "As they near Eggman's lair, our heroes charge their abilities for an epic boss battle. ",
285
301
  "Get ready to spin, jump, and sound-blast your way to victory in this high-octane crossover!"
286
302
  ]
287
-
303
+
288
304
  for transcript in transcripts:
289
305
  # Send text inputs as they become available
290
306
  await ctx.send(
@@ -296,7 +312,7 @@ async def send_transcripts(ctx):
296
312
  )
297
313
 
298
314
  # Indicate that no more inputs will be sent. Otherwise, the context will close after 5 seconds of inactivity.
299
- await ctx.no_more_inputs()
315
+ await ctx.no_more_inputs()
300
316
 
301
317
  async def receive_and_play_audio(ctx):
302
318
  p = pyaudio.PyAudio()
@@ -402,7 +418,7 @@ output_stream = ctx.send(
402
418
  voice_id=voice_id,
403
419
  output_format=output_format,
404
420
  )
405
-
421
+
406
422
  for output in output_stream:
407
423
  buffer = output["audio"]
408
424
 
@@ -419,6 +435,34 @@ p.terminate()
419
435
  ws.close() # Close the websocket connection
420
436
  ```
421
437
 
438
+ ### Generating timestamps using WebSocket
439
+
440
+ The WebSocket endpoint supports timestamps, which give you detailed timing information for each word in the transcript. To enable this feature, pass `add_timestamps=True` to the `send` method. The results are returned in the `word_timestamps` object, which contains three keys:
441
+ - words (list): The individual words in the transcript.
442
+ - start (list): The starting timestamp for each word (in seconds).
443
+ - end (list): The ending timestamp for each word (in seconds).
444
+
445
+ ```python
446
+ response = ws.send(
447
+ model_id=model_id,
448
+ transcript=transcript,
449
+ voice_id=voice_id,
450
+ output_format=output_format,
451
+ stream=False,
452
+ add_timestamps=True
453
+ )
454
+
455
+ # Accessing the word_timestamps object
456
+ word_timestamps = response['word_timestamps']
457
+
458
+ words = word_timestamps['words']
459
+ start_times = word_timestamps['start']
460
+ end_times = word_timestamps['end']
461
+
462
+ for word, start, end in zip(words, start_times, end_times):
463
+ print(f"Word: {word}, Start: {start}, End: {end}")
464
+ ```
465
+
422
466
  ### Multilingual Text-to-Speech [Alpha]
423
467
 
424
468
  You can use our `sonic-multilingual` model to generate audio in multiple languages. The languages supported are available at [docs.cartesia.ai](https://docs.cartesia.ai/getting-started/available-models).
@@ -472,6 +516,31 @@ stream.close()
472
516
  p.terminate()
473
517
  ```
474
518
 
519
+ ### Speed and Emotion Control [Experimental]
520
+
521
+ You can enhance the voice output by adjusting the `speed` and `emotion` parameters. To do this, pass a `_experimental_voice_controls` dictionary with the desired `speed` and `emotion` values to any `send` method.
522
+
523
+ Speed Options:
524
+ - `slowest`, `slow`, `normal`, `fast`, `fastest`
525
+
526
+ Emotion Options:
527
+ Use a list of tags in the format `emotion_name:level` where:
528
+ - Emotion Names: `anger`, `positivity`, `surprise`, `sadness`, `curiosity`
529
+ - Levels: `lowest`, `low`, (omit for medium level), `high`, `highest`
530
+ Each emotion tag adds the specified emotion to the voice at the indicated intensity; omitting the level (for example, `curiosity`) results in medium intensity.
531
+
532
+ ```python
533
+ ws.send(
534
+ model_id=model_id,
535
+ transcript=transcript,
536
+ voice_id=voice_id,
537
+ output_format=output_format,
538
+ _experimental_voice_controls={"speed": "fast", "emotion": ["positivity:high"]},
539
+ )
540
+ ```
541
+
542
+ ### Jupyter Notebook Usage
543
+
475
544
  If you are using Jupyter Notebook or JupyterLab, you can use IPython.display.Audio to play the generated audio directly in the notebook.
476
545
  Additionally, in these notebook examples we show how to use the client as a context manager (though this is not required).
477
546
 
@@ -0,0 +1,12 @@
1
+ cartesia/__init__.py,sha256=jMIf2O7dTGxvTA5AfXtmh1H_EGfMtQseR5wXrjNRbLs,93
2
+ cartesia/_types.py,sha256=Lcp4GOot5UfI0EveDi2QdNALMo1rK4PwUrtMvW5P6vY,4406
3
+ cartesia/client.py,sha256=gOH3ddVI-epHbPGijM3jExSmMOZm8TiyOfZlLKWt89w,51485
4
+ cartesia/version.py,sha256=NND_6JDYnYnGzN3-RTpS5F7wzv62vDf7hAxiyTSBJfE,22
5
+ cartesia/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
6
+ cartesia/utils/deprecated.py,sha256=2cXvGtrxhPeUZA5LWy2n_U5OFLDv7SHeFtzqhjSJGyk,1674
7
+ cartesia/utils/retry.py,sha256=nuwWRfu3MOVTxIQMLjYf6WLaxSlnu_GdE3QjTV0zisQ,3339
8
+ cartesia-1.0.8.dist-info/LICENSE.md,sha256=PT2YG5wEtEX1TNDn5sXkUXqbn-neyr7cZenTxd40ql4,1074
9
+ cartesia-1.0.8.dist-info/METADATA,sha256=83WFXYyycaZfvqX_bdyctzx270x1PMYlC-5lUUcMVDs,21137
10
+ cartesia-1.0.8.dist-info/WHEEL,sha256=DZajD4pwLWue70CAfc7YaxT1wLUciNBvN_TTcvXpltE,110
11
+ cartesia-1.0.8.dist-info/top_level.txt,sha256=rTX4HnnCegMxl1FK9czpVC7GAvf3SwDzPG65qP-BS4w,9
12
+ cartesia-1.0.8.dist-info/RECORD,,
@@ -1,12 +0,0 @@
1
- cartesia/__init__.py,sha256=jMIf2O7dTGxvTA5AfXtmh1H_EGfMtQseR5wXrjNRbLs,93
2
- cartesia/_types.py,sha256=l3tKFnyUInn5_OJOSB63Mp1g16p9R23VNAuJ5qykOzY,4424
3
- cartesia/client.py,sha256=zLyxaDkX0et6lY_hthSgDA-eoP6NXEN5ysDsxxseyZQ,51502
4
- cartesia/version.py,sha256=mqMuQB3aqJVPrHHqJMLjqiMKUiJjozc7EPLcX5DpKHg,22
5
- cartesia/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
6
- cartesia/utils/deprecated.py,sha256=2cXvGtrxhPeUZA5LWy2n_U5OFLDv7SHeFtzqhjSJGyk,1674
7
- cartesia/utils/retry.py,sha256=nuwWRfu3MOVTxIQMLjYf6WLaxSlnu_GdE3QjTV0zisQ,3339
8
- cartesia-1.0.6.dist-info/LICENSE.md,sha256=PT2YG5wEtEX1TNDn5sXkUXqbn-neyr7cZenTxd40ql4,1074
9
- cartesia-1.0.6.dist-info/METADATA,sha256=JcNWr0UHSp_GK3X05YD92zbLZonV0BkeyuzT90HuGSs,18368
10
- cartesia-1.0.6.dist-info/WHEEL,sha256=DZajD4pwLWue70CAfc7YaxT1wLUciNBvN_TTcvXpltE,110
11
- cartesia-1.0.6.dist-info/top_level.txt,sha256=rTX4HnnCegMxl1FK9czpVC7GAvf3SwDzPG65qP-BS4w,9
12
- cartesia-1.0.6.dist-info/RECORD,,