cartesia 1.0.4__py2.py3-none-any.whl → 1.0.6__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
cartesia/_types.py CHANGED
@@ -70,7 +70,31 @@ class VoiceMetadata(TypedDict):
     language: str
 
 
+class VoiceControls(TypedDict):
+    """Defines different voice control parameters for voice synthesis.
+
+    For a complete list of supported parameters, refer to the Cartesia API documentation.
+    https://docs.cartesia.ai/getting-started/welcome
+
+    Examples:
+        >>> {"speed": "fastest"}
+        >>> {"speed": "slow", "emotion": "anger:high, positivity:low"}
+        >>> {"emotion": "surprise:high, positivity:high"}
+
+    Note:
+        This is an experimental class and is subject to rapid change in future versions.
+    """
+    speed: str = ""
+    emotion: str = ""
+
+
 class OutputFormat(TypedDict):
     container: str
     encoding: str
     sample_rate: int
+
+
+class EventType:
+    NULL = ""
+    AUDIO = "chunk"
+    TIMESTAMPS = "timestamps"
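To make the new types concrete, here is a minimal sketch (not part of the package) of how a `VoiceControls` dict and the `EventType` constants fit together; the response handling mirrors the `_convert_response` logic in `client.py` below.

```python
import base64

# A VoiceControls dict, following the docstring examples above.
controls = {"speed": "slow", "emotion": "anger:high, positivity:low"}

def route_message(response: dict) -> dict:
    """Dispatch a raw WebSocket message by its event type."""
    out = {}
    if response["type"] == "chunk":  # EventType.AUDIO
        out["audio"] = base64.b64decode(response["data"])
    elif response["type"] == "timestamps":  # EventType.TIMESTAMPS
        out["word_timestamps"] = response["word_timestamps"]
    return out

print(route_message({"type": "chunk", "data": base64.b64encode(b"\x00\x01").decode()}))
```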
cartesia/client.py CHANGED
@@ -1,5 +1,6 @@
 import asyncio
 import base64
+from collections import defaultdict
 import json
 import os
 import uuid
@@ -7,6 +8,7 @@ from types import TracebackType
 from typing import (
     Any,
     AsyncGenerator,
+    Iterator,
     Dict,
     Generator,
     List,
@@ -14,6 +16,7 @@ from typing import (
     Tuple,
     Union,
     Callable,
+    Set,
 )
 
 import aiohttp
@@ -21,12 +24,15 @@ import httpx
 import logging
 import requests
 from websockets.sync.client import connect
+from iterators import TimeoutIterator
 
 from cartesia.utils.retry import retry_on_connection_error, retry_on_connection_error_async
 from cartesia._types import (
+    EventType,
     OutputFormat,
     OutputFormatMapping,
     DeprecatedOutputFormatMapping,
+    VoiceControls,
     VoiceMetadata,
 )
 
@@ -260,6 +266,168 @@ class Voices(Resource):
         return response.json()
 
 
+class _TTSContext:
+    """Manage a single context over a WebSocket.
+
+    This class can be used to stream inputs, as they become available, to a specific `context_id`. See the README for usage.
+
+    See :class:`_AsyncTTSContext` for asynchronous use cases.
+
+    Each _TTSContext closes automatically when a done message is received for that context, or when an error occurs.
+    """
+
+    def __init__(self, context_id: str, websocket: "_WebSocket"):
+        self._context_id = context_id
+        self._websocket = websocket
+        self._error = None
+
+    def __del__(self):
+        self._close()
+
+    @property
+    def context_id(self) -> str:
+        return self._context_id
+
+    def send(
+        self,
+        model_id: str,
+        transcript: Iterator[str],
+        output_format: OutputFormat,
+        voice_id: Optional[str] = None,
+        voice_embedding: Optional[List[float]] = None,
+        context_id: Optional[str] = None,
+        duration: Optional[int] = None,
+        language: Optional[str] = None,
+        _experimental_voice_controls: Optional[VoiceControls] = None,
+    ) -> Generator[bytes, None, None]:
+        """Send audio generation requests to the WebSocket and yield responses.
+
+        Args:
+            model_id: The ID of the model to use for generating audio.
+            transcript: An iterator that yields text chunks at intervals of under one second.
+            output_format: A dictionary containing the details of the output format.
+            voice_id: The ID of the voice to use for generating audio.
+            voice_embedding: The embedding of the voice to use for generating audio.
+            context_id: The context ID to use for the request. If not specified, a random context ID will be generated.
+            duration: The duration of the audio in seconds.
+            language: The language code for the audio request. This can only be used with `model_id = sonic-multilingual`.
+            _experimental_voice_controls: Experimental voice controls for controlling speed and emotion.
+                Note: This is an experimental feature and may change rapidly in future releases.
+
+        Yields:
+            Dictionaries containing the following key(s):
+                - audio: The audio as bytes.
+                - context_id: The context ID for the request.
+
+        Raises:
+            ValueError: If the provided context_id doesn't match the current context.
+            RuntimeError: If there's an error generating audio.
+        """
+        if context_id is not None and context_id != self._context_id:
+            raise ValueError("Context ID does not match the context ID of the current context.")
+
+        self._websocket.connect()
+
+        voice = _validate_and_construct_voice(voice_id, voice_embedding=voice_embedding, experimental_voice_controls=_experimental_voice_controls)
+
+        # Create the initial request body
+        request_body = {
+            "model_id": model_id,
+            "voice": voice,
+            "output_format": {
+                "container": output_format["container"],
+                "encoding": output_format["encoding"],
+                "sample_rate": output_format["sample_rate"],
+            },
+            "context_id": self._context_id,
+            "language": language,
+        }
+
+        if duration is not None:
+            request_body["duration"] = duration
+
+        try:
+            # Create an iterator with a timeout to get text chunks
+            text_iterator = TimeoutIterator(
+                transcript, timeout=0.001
+            )  # 1ms timeout for nearly non-blocking receive
+            next_chunk = next(text_iterator, None)
+
+            while True:
+                # Send the next text chunk to the WebSocket if available
+                if next_chunk is not None and next_chunk != text_iterator.get_sentinel():
+                    request_body["transcript"] = next_chunk
+                    request_body["continue"] = True
+                    self._websocket.websocket.send(json.dumps(request_body))
+                    next_chunk = next(text_iterator, None)
+
+                try:
+                    # Receive responses from the WebSocket with a small timeout
+                    response = json.loads(
+                        self._websocket.websocket.recv(timeout=0.001)
+                    )  # 1ms timeout for nearly non-blocking receive
+                    if response["context_id"] != self._context_id:
+                        pass
+                    if "error" in response:
+                        raise RuntimeError(f"Error generating audio:\n{response['error']}")
+                    if response["done"]:
+                        break
+                    if response["data"]:
+                        yield self._websocket._convert_response(
+                            response=response, include_context_id=True
+                        )
+                except TimeoutError:
+                    pass
+
+                # Continuously receive from the WebSocket until the next text chunk is available
+                while next_chunk == text_iterator.get_sentinel():
+                    try:
+                        response = json.loads(self._websocket.websocket.recv(timeout=0.001))
+                        if response["context_id"] != self._context_id:
+                            continue
+                        if "error" in response:
+                            raise RuntimeError(f"Error generating audio:\n{response['error']}")
+                        if response["done"]:
+                            break
+                        if response["data"]:
+                            yield self._websocket._convert_response(
+                                response=response, include_context_id=True
+                            )
+                    except TimeoutError:
+                        pass
+                    next_chunk = next(text_iterator, None)
+
+                # Send the final message once all input text chunks are exhausted
+                if next_chunk is None:
+                    request_body["transcript"] = ""
+                    request_body["continue"] = False
+                    self._websocket.websocket.send(json.dumps(request_body))
+                    break
+
+            # Receive the remaining messages from the WebSocket until "done" is received
+            while True:
+                response = json.loads(self._websocket.websocket.recv())
+                if response["context_id"] != self._context_id:
+                    continue
+                if "error" in response:
+                    raise RuntimeError(f"Error generating audio:\n{response['error']}")
+                if response["done"]:
+                    break
+                yield self._websocket._convert_response(response=response, include_context_id=True)
+
+        except Exception as e:
+            self._websocket.close()
+            raise RuntimeError(f"Failed to generate audio. {e}")
+
+    def _close(self):
+        """Closes the context. Automatically called when a done message is received for this context."""
+        self._websocket._remove_context(self._context_id)
+
+    def is_closed(self):
+        """Check whether the context is closed. Returns True if closed."""
+        return self._context_id not in self._websocket._contexts
+
+
 class _WebSocket:
     """This class contains methods to generate audio using WebSocket. Ideal for low-latency audio generation.
 
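Note the latency contract in `_TTSContext.send`: the transcript is polled through a 1 ms `TimeoutIterator`, so the iterator should produce chunks well under a second apart. A minimal sketch of a conforming generator (the `time.sleep` stands in for real upstream latency; the `ctx` setup is elided):

```python
import time

def transcript_chunks():
    """Yield text chunks as they become available (simulated here)."""
    for piece in ["Hello! ", "Welcome to Cartesia. "]:
        yield piece
        time.sleep(0.2)  # keep gaps well under one second

# Hypothetical usage with a context created via ws.context():
# for out in ctx.send(model_id="sonic-english", transcript=transcript_chunks(),
#                     output_format=output_format, voice_id=voice_id):
#     handle(out["audio"])
```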
@@ -283,6 +451,13 @@ class _WebSocket:
         self.api_key = api_key
         self.cartesia_version = cartesia_version
         self.websocket = None
+        self._contexts: Set[str] = set()
+
+    def __del__(self):
+        try:
+            self.close()
+        except Exception as e:
+            raise RuntimeError("Failed to close WebSocket: ", e)
 
     def connect(self):
         """This method connects to the WebSocket if it is not already connected.
@@ -304,48 +479,25 @@
 
     def close(self):
         """This method closes the WebSocket connection. *Highly* recommended to call this method when done using the WebSocket."""
-        if self.websocket is not None and not self._is_websocket_closed():
+        if self.websocket and not self._is_websocket_closed():
             self.websocket.close()
 
+        if self._contexts:
+            self._contexts.clear()
+
     def _convert_response(
         self, response: Dict[str, any], include_context_id: bool
     ) -> Dict[str, Any]:
-        audio = base64.b64decode(response["data"])
-
-        optional_kwargs = {}
+        out = {}
+        if response["type"] == EventType.AUDIO:
+            out["audio"] = base64.b64decode(response["data"])
+        elif response["type"] == EventType.TIMESTAMPS:
+            out["word_timestamps"] = response["word_timestamps"]
+
         if include_context_id:
-            optional_kwargs["context_id"] = response["context_id"]
-
-        return {
-            "audio": audio,
-            **optional_kwargs,
-        }
-
-    def _validate_and_construct_voice(
-        self, voice_id: Optional[str] = None, voice_embedding: Optional[List[float]] = None
-    ) -> dict:
-        """Validate and construct the voice dictionary for the request.
-
-        Args:
-            voice_id: The ID of the voice to use for generating audio.
-            voice_embedding: The embedding of the voice to use for generating audio.
-
-        Returns:
-            A dictionary representing the voice configuration.
-
-        Raises:
-            ValueError: If neither or both voice_id and voice_embedding are specified.
-        """
-        if voice_id is None and voice_embedding is None:
-            raise ValueError("Either voice_id or voice_embedding must be specified.")
+            out["context_id"] = response["context_id"]
 
-        if voice_id is not None and voice_embedding is not None:
-            raise ValueError("Only one of voice_id or voice_embedding should be specified.")
-
-        if voice_id:
-            return {"mode": "id", "id": voice_id}
-
-        return {"mode": "embedding", "embedding": voice_embedding}
+        return out
 
     def send(
         self,
@@ -358,6 +510,8 @@ class _WebSocket:
         duration: Optional[int] = None,
         language: Optional[str] = None,
         stream: bool = True,
+        add_timestamps: bool = False,
+        _experimental_voice_controls: Optional[VoiceControls] = None,
     ) -> Union[bytes, Generator[bytes, None, None]]:
         """Send a request to the WebSocket to generate audio.
 
@@ -371,6 +525,9 @@
             duration: The duration of the audio in seconds.
             language: The language code for the audio request. This can only be used with `model_id = sonic-multilingual`
             stream: Whether to stream the audio or not.
+            add_timestamps: Whether to return word-level timestamps.
+            _experimental_voice_controls: Experimental voice controls for controlling speed and emotion.
+                Note: This is an experimental feature and may change rapidly in future releases.
 
         Returns:
             If `stream` is True, the method returns a generator that yields chunks. Each chunk is a dictionary.
@@ -384,7 +541,7 @@
         if context_id is None:
             context_id = str(uuid.uuid4())
 
-        voice = self._validate_and_construct_voice(voice_id, voice_embedding)
+        voice = _validate_and_construct_voice(voice_id, voice_embedding=voice_embedding, experimental_voice_controls=_experimental_voice_controls)
 
         request_body = {
             "model_id": model_id,
@@ -397,6 +554,7 @@
             },
             "context_id": context_id,
             "language": language,
+            "add_timestamps": add_timestamps,
         }
 
         if duration is not None:
@@ -408,10 +566,17 @@
             return generator
 
         chunks = []
+        word_timestamps = defaultdict(list)
         for chunk in generator:
-            chunks.append(chunk["audio"])
-
-        return {"audio": b"".join(chunks), "context_id": context_id}
+            if "audio" in chunk:
+                chunks.append(chunk["audio"])
+            if add_timestamps and "word_timestamps" in chunk:
+                for k, v in chunk["word_timestamps"].items():
+                    word_timestamps[k].extend(v)
+        out = {"audio": b"".join(chunks), "context_id": context_id}
+        if add_timestamps:
+            out["word_timestamps"] = word_timestamps
+        return out
 
     def _websocket_generator(self, request_body: Dict[str, Any]):
         self.websocket.send(json.dumps(request_body))
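With `stream=False` and `add_timestamps=True`, the loop above merges the per-chunk `word_timestamps` dicts into one mapping. A sketch of the result shape a caller can expect (the timestamp keys are an assumption based on the merge logic, not a documented schema):

```python
# result = ws.send(..., stream=False, add_timestamps=True)  # hypothetical call
result = {
    "audio": b"...",  # all audio chunks joined together
    "context_id": "8a2c...",  # placeholder ID
    "word_timestamps": {  # lists merged across chunks via defaultdict(list)
        "words": ["Hello", "world"],
        "start": [0.0, 0.42],
        "end": [0.38, 0.81],
    },
}
```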
@@ -426,10 +591,22 @@ class _WebSocket:
                 yield self._convert_response(response=response, include_context_id=True)
         except Exception as e:
             # Close the websocket connection if an error occurs.
-            if self.websocket and not self._is_websocket_closed():
-                self.websocket.close()
+            self.close()
             raise RuntimeError(f"Failed to generate audio. {response}") from e
 
+    def _remove_context(self, context_id: str):
+        if context_id in self._contexts:
+            self._contexts.remove(context_id)
+
+    def context(self, context_id: Optional[str] = None) -> _TTSContext:
+        if context_id in self._contexts:
+            raise ValueError(f"Context for context ID {context_id} already exists.")
+        if context_id is None:
+            context_id = str(uuid.uuid4())
+        if context_id not in self._contexts:
+            self._contexts.add(context_id)
+        return _TTSContext(context_id, self)
+
 
 class _SSE:
     """This class contains methods to generate audio using Server-Sent Events.
@@ -472,32 +649,6 @@
                 break
         return buffer, outputs
 
-    def _validate_and_construct_voice(
-        self, voice_id: Optional[str] = None, voice_embedding: Optional[List[float]] = None
-    ) -> dict:
-        """Validate and construct the voice dictionary for the request.
-
-        Args:
-            voice_id: The ID of the voice to use for generating audio.
-            voice_embedding: The embedding of the voice to use for generating audio.
-
-        Returns:
-            A dictionary representing the voice configuration.
-
-        Raises:
-            ValueError: If neither or both voice_id and voice_embedding are specified.
-        """
-        if voice_id is None and voice_embedding is None:
-            raise ValueError("Either voice_id or voice_embedding must be specified.")
-
-        if voice_id is not None and voice_embedding is not None:
-            raise ValueError("Only one of voice_id or voice_embedding should be specified.")
-
-        if voice_id:
-            return {"mode": "id", "id": voice_id}
-
-        return {"mode": "embedding", "embedding": voice_embedding}
-
     def send(
         self,
         model_id: str,
@@ -508,6 +659,7 @@ class _SSE:
         duration: Optional[int] = None,
         language: Optional[str] = None,
         stream: bool = True,
+        _experimental_voice_controls: Optional[VoiceControls] = None,
     ) -> Union[bytes, Generator[bytes, None, None]]:
         """Send a request to the server to generate audio using Server-Sent Events.
 
@@ -520,6 +672,8 @@
             duration: The duration of the audio in seconds.
             language: The language code for the audio request. This can only be used with `model_id = sonic-multilingual`
             stream: Whether to stream the audio or not.
+            _experimental_voice_controls: Experimental voice controls for controlling speed and emotion.
+                Note: This is an experimental feature and may change rapidly in future releases.
 
         Returns:
             If `stream` is True, the method returns a generator that yields chunks. Each chunk is a dictionary.
@@ -527,8 +681,7 @@
             Both the generator and the dictionary contain the following key(s):
             - audio: The audio as bytes.
         """
-        voice = self._validate_and_construct_voice(voice_id, voice_embedding)
-
+        voice = _validate_and_construct_voice(voice_id, voice_embedding=voice_embedding, experimental_voice_controls=_experimental_voice_controls)
         request_body = {
             "model_id": model_id,
             "transcript": transcript,
@@ -762,8 +915,9 @@ class _AsyncSSE(_SSE):
         duration: Optional[int] = None,
         language: Optional[str] = None,
         stream: bool = True,
+        _experimental_voice_controls: Optional[VoiceControls] = None,
     ) -> Union[bytes, AsyncGenerator[bytes, None]]:
-        voice = self._validate_and_construct_voice(voice_id, voice_embedding)
+        voice = _validate_and_construct_voice(voice_id, voice_embedding=voice_embedding, experimental_voice_controls=_experimental_voice_controls)
 
         request_body = {
             "model_id": model_id,
@@ -826,7 +980,7 @@
 
 
 class _AsyncTTSContext:
-    """Manage a single context over a WebSocket.
+    """Manage a single context over an AsyncWebSocket.
 
     This class separates sending requests and receiving responses into two separate methods.
     This can be used for sending multiple requests without awaiting the response.
@@ -859,6 +1013,8 @@ class _AsyncTTSContext:
         continue_: bool = False,
         duration: Optional[int] = None,
         language: Optional[str] = None,
+        add_timestamps: bool = False,
+        _experimental_voice_controls: Optional[VoiceControls] = None,
     ) -> None:
         """Send audio generation requests to the WebSocket. The response can be received using the `receive` method.
 
@@ -871,7 +1027,10 @@
             context_id: The context ID to use for the request. If not specified, a random context ID will be generated.
             continue_: Whether to continue the audio generation from the previous transcript or not.
             duration: The duration of the audio in seconds.
-            language: The language code for the audio request. This can only be used with `model_id = sonic-multilingual`
+            language: The language code for the audio request. This can only be used with `model_id = sonic-multilingual`.
+            add_timestamps: Whether to return word-level timestamps.
+            _experimental_voice_controls: Experimental voice controls for controlling speed and emotion.
+                Note: This is an experimental feature and may change rapidly in future releases.
 
         Returns:
             None.
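Because `send` returns `None` here, requests and responses are decoupled: a caller can fire several sends on one context and then drain `receive()`. A hedged async sketch (only `context`, `send`, `no_more_inputs`, and `receive` come from the package; everything else is illustrative):

```python
async def speak(ws, lines, model_id, voice_id, output_format):
    ctx = ws.context()
    for line in lines:
        await ctx.send(
            model_id=model_id,
            transcript=line,
            voice_id=voice_id,
            output_format=output_format,
            continue_=True,  # more transcript chunks will follow
        )
    await ctx.no_more_inputs()  # sends continue_=False to close out the context
    async for out in ctx.receive():
        yield out["audio"]  # consume with: async for audio in speak(...)
```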
@@ -883,7 +1042,7 @@ class _AsyncTTSContext:
 
         await self._websocket.connect()
 
-        voice = self._websocket._validate_and_construct_voice(voice_id, voice_embedding)
+        voice = _validate_and_construct_voice(voice_id, voice_embedding, experimental_voice_controls=_experimental_voice_controls)
 
         request_body = {
             "model_id": model_id,
@@ -897,6 +1056,7 @@
             "context_id": self._context_id,
             "continue": continue_,
             "language": language,
+            "add_timestamps": add_timestamps,
         }
 
         if duration is not None:
@@ -945,6 +1105,10 @@
         """Closes the context. Automatically called when a done message is received for this context."""
         self._websocket._remove_context(self._context_id)
 
+    def is_closed(self):
+        """Check whether the context is closed. Returns True if closed."""
+        return self._context_id not in self._websocket._context_queues
+
     async def __aenter__(self):
         return self
 
@@ -1046,7 +1210,10 @@ class _AsyncWebSocket(_WebSocket):
         duration: Optional[int] = None,
         language: Optional[str] = None,
         stream: bool = True,
+        add_timestamps: bool = False,
+        _experimental_voice_controls: Optional[VoiceControls] = None,
     ) -> Union[bytes, AsyncGenerator[bytes, None]]:
+        """See :meth:`_WebSocket.send` for details."""
         if context_id is None:
             context_id = str(uuid.uuid4())
 
@@ -1062,6 +1229,8 @@
             duration=duration,
             language=language,
             continue_=False,
+            add_timestamps=add_timestamps,
+            _experimental_voice_controls=_experimental_voice_controls,
         )
 
         generator = ctx.receive()
@@ -1070,10 +1239,17 @@
             return generator
 
         chunks = []
+        word_timestamps = defaultdict(list)
         async for chunk in generator:
-            chunks.append(chunk["audio"])
-
-        return {"audio": b"".join(chunks), "context_id": context_id}
+            if "audio" in chunk:
+                chunks.append(chunk["audio"])
+            if add_timestamps and "word_timestamps" in chunk:
+                for k, v in chunk["word_timestamps"].items():
+                    word_timestamps[k].extend(v)
+        out = {"audio": b"".join(chunks), "context_id": context_id}
+        if add_timestamps:
+            out["word_timestamps"] = word_timestamps
+        return out
 
     async def _process_responses(self):
         try:
@@ -1123,3 +1299,35 @@ class AsyncTTS(TTS):
         )
         await ws.connect()
         return ws
+
+
+def _validate_and_construct_voice(
+    voice_id: Optional[str] = None,
+    voice_embedding: Optional[List[float]] = None,
+    experimental_voice_controls: Optional[VoiceControls] = None,
+) -> dict:
+    """Validate and construct the voice dictionary for the request.
+
+    Args:
+        voice_id: The ID of the voice to use for generating audio.
+        voice_embedding: The embedding of the voice to use for generating audio.
+        experimental_voice_controls: Voice controls for emotion and speed.
+            Note: This is an experimental feature and may change rapidly in future releases.
+
+    Returns:
+        A dictionary representing the voice configuration.
+
+    Raises:
+        ValueError: If neither or both of voice_id and voice_embedding are specified.
+    """
+    if voice_id is None and voice_embedding is None:
+        raise ValueError("Either voice_id or voice_embedding must be specified.")
+
+    if voice_id is not None and voice_embedding is not None:
+        raise ValueError("Only one of voice_id or voice_embedding should be specified.")
+
+    if voice_id:
+        voice = {"mode": "id", "id": voice_id}
+    else:
+        voice = {"mode": "embedding", "embedding": voice_embedding}
+    if experimental_voice_controls is not None:
+        voice["__experimental_controls"] = experimental_voice_controls
+    return voice
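For reference, the three voice shapes this helper can produce (values are placeholders):

```python
# voice_id only:
{"mode": "id", "id": "a0e99841-438c-4a64-b679-ae501e7d6091"}

# voice_embedding only:
{"mode": "embedding", "embedding": [0.1, -0.2, 0.3]}

# either mode, with experimental controls attached:
{
    "mode": "id",
    "id": "a0e99841-438c-4a64-b679-ae501e7d6091",
    "__experimental_controls": {"speed": "fastest", "emotion": "positivity:high"},
}
```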
cartesia/version.py CHANGED
@@ -1 +1 @@
-__version__ = "1.0.4"
+__version__ = "1.0.6"
cartesia-1.0.6.dist-info/LICENSE.md ADDED
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2024 Cartesia AI, Inc.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
cartesia-1.0.6.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: cartesia
-Version: 1.0.4
+Version: 1.0.6
 Summary: The official Python library for the Cartesia API.
 Home-page:
 Author: Cartesia, Inc.
@@ -10,11 +10,13 @@ Classifier: Programming Language :: Python :: 3
 Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
 Requires-Python: >=3.8.0
 Description-Content-Type: text/markdown
+License-File: LICENSE.md
 Requires-Dist: aiohttp
 Requires-Dist: httpx
 Requires-Dist: pytest-asyncio
 Requires-Dist: requests
 Requires-Dist: websockets
+Requires-Dist: iterators
 Provides-Extra: all
 Requires-Dist: pytest >=8.0.2 ; extra == 'all'
 Requires-Dist: pytest-cov >=4.1.0 ; extra == 'all'
@@ -97,10 +99,10 @@ voice = client.voices.get(id=voice_id)
 
 transcript = "Hello! Welcome to Cartesia"
 
-# You can check out our models at [docs.cartesia.ai](https://docs.cartesia.ai/getting-started/available-models).
+# You can check out our models at https://docs.cartesia.ai/getting-started/available-models
 model_id = "sonic-english"
 
-# You can find the supported `output_format`s in our [API Reference](https://docs.cartesia.ai/api-reference/endpoints/stream-speech-server-sent-events).
+# You can find the supported `output_format`s at https://docs.cartesia.ai/api-reference/endpoints/stream-speech-server-sent-events
 output_format = {
     "container": "raw",
     "encoding": "pcm_f32le",
@@ -148,10 +150,10 @@ async def write_stream():
     voice_id = "a0e99841-438c-4a64-b679-ae501e7d6091"
     voice = client.voices.get(id=voice_id)
     transcript = "Hello! Welcome to Cartesia"
-    # You can check out our models at [docs.cartesia.ai](https://docs.cartesia.ai/getting-started/available-models).
+    # You can check out our models at https://docs.cartesia.ai/getting-started/available-models
    model_id = "sonic-english"
 
-    # You can find the supported `output_format`s in our [API Reference](https://docs.cartesia.ai/api-reference/endpoints/stream-speech-server-sent-events).
+    # You can find the supported `output_format`s at https://docs.cartesia.ai/api-reference/endpoints/stream-speech-server-sent-events
     output_format = {
         "container": "raw",
         "encoding": "pcm_f32le",
@@ -203,10 +205,10 @@ voice_id = "a0e99841-438c-4a64-b679-ae501e7d6091"
 voice = client.voices.get(id=voice_id)
 transcript = "Hello! Welcome to Cartesia"
 
-# You can check out our models at [docs.cartesia.ai](https://docs.cartesia.ai/getting-started/available-models).
+# You can check out our models at https://docs.cartesia.ai/getting-started/available-models
 model_id = "sonic-english"
 
-# You can find the supported `output_format`s in our [API Reference](https://docs.cartesia.ai/api-reference/endpoints/stream-speech-server-sent-events).
+# You can find the supported `output_format`s at https://docs.cartesia.ai/api-reference/endpoints/stream-speech-server-sent-events
 output_format = {
     "container": "raw",
     "encoding": "pcm_f32le",
@@ -250,7 +252,7 @@ In some cases, input text may need to be streamed in. In these cases, it would b
 
 To mitigate this, Cartesia offers audio continuations. In this setting, users can send input text, as it becomes available, over a websocket connection.
 
-To do this, we will create a `context` and sending multiple requests without awaiting the response. Then you can listen to the responses in the order they were sent.
+To do this, we will create a `context` and send multiple requests without awaiting the response. Then you can listen to the responses in the order they were sent.
 
 Each `context` will be closed automatically after 5 seconds of inactivity or when the `no_more_inputs` method is called. `no_more_inputs` sends a request with `continue_=False`, which indicates that no more inputs will be sent over this context.
 
@@ -261,13 +263,13 @@
 import pyaudio
 from cartesia import AsyncCartesia
 
 async def send_transcripts(ctx):
-    # Check out voice IDs by calling `client.voices.list()` or on [play.cartesia.ai](https://play.cartesia.ai/)
+    # Check out voice IDs by calling `client.voices.list()` or on https://play.cartesia.ai/
     voice_id = "87748186-23bb-4158-a1eb-332911b0b708"
 
-    # You can check out our models at [docs.cartesia.ai](https://docs.cartesia.ai/getting-started/available-models).
+    # You can check out our models at https://docs.cartesia.ai/getting-started/available-models
     model_id = "sonic-english"
 
-    # You can find the supported `output_format`s in our [API Reference](https://docs.cartesia.ai/api-reference/endpoints/stream-speech-server-sent-events).
+    # You can find the supported `output_format`s at https://docs.cartesia.ai/api-reference/endpoints/stream-speech-server-sent-events
     output_format = {
         "container": "raw",
         "encoding": "pcm_f32le",
@@ -339,6 +341,84 @@ async def stream_and_listen():
     asyncio.run(stream_and_listen())
 ```
 
+You can also use continuations on the synchronous Cartesia client to stream in text as it becomes available. To do this, pass in a text generator that produces text chunks at intervals of less than 1 second, as shown below. This ensures smooth audio playback.
+
+Note: the sync client has a different API for continuations compared to the async client.
+
+```python
+from cartesia import Cartesia
+import pyaudio
+import os
+
+client = Cartesia(api_key=os.environ.get("CARTESIA_API_KEY"))
+
+transcripts = [
+    "The crew engaged in a range of activities designed to mirror those "
+    "they might perform on a real Mars mission. ",
+    "Aside from growing vegetables and maintaining their habitat, they faced "
+    "additional stressors like communication delays with Earth, ",
+    "up to twenty-two minutes each way, to simulate the distance from Mars to our planet. ",
+    "These exercises were critical for understanding how astronauts can "
+    "maintain not just physical health but also mental well-being under such challenging conditions. ",
+]
+
+# Ending each transcript with a space makes the audio smoother
+def chunk_generator(transcripts):
+    for transcript in transcripts:
+        if transcript.endswith(" "):
+            yield transcript
+        else:
+            yield transcript + " "
+
+
+# You can check out voice IDs by calling `client.voices.list()` or on https://play.cartesia.ai/
+voice_id = "87748186-23bb-4158-a1eb-332911b0b708"
+
+# You can check out our models at https://docs.cartesia.ai/getting-started/available-models
+model_id = "sonic-english"
+
+# You can find the supported `output_format`s at https://docs.cartesia.ai/api-reference/endpoints/stream-speech-server-sent-events
+output_format = {
+    "container": "raw",
+    "encoding": "pcm_f32le",
+    "sample_rate": 44100,
+}
+
+p = pyaudio.PyAudio()
+rate = 44100
+
+stream = None
+
+# Set up the websocket connection
+ws = client.tts.websocket()
+
+# Create a context to send and receive audio
+ctx = ws.context()  # Generates a random context ID if not provided
+
+# Pass in a text generator to generate & stream the audio
+output_stream = ctx.send(
+    model_id=model_id,
+    transcript=chunk_generator(transcripts),
+    voice_id=voice_id,
+    output_format=output_format,
+)
+
+for output in output_stream:
+    buffer = output["audio"]
+
+    if not stream:
+        stream = p.open(format=pyaudio.paFloat32, channels=1, rate=rate, output=True)
+
+    # Write the audio data to the stream
+    stream.write(buffer)
+
+stream.stop_stream()
+stream.close()
+p.terminate()
+
+ws.close()  # Close the websocket connection
+```
+
 ### Multilingual Text-to-Speech [Alpha]
 
 You can use our `sonic-multilingual` model to generate audio in multiple languages. The languages supported are available at [docs.cartesia.ai](https://docs.cartesia.ai/getting-started/available-models).
@@ -356,10 +436,10 @@ voice = client.voices.get(id=voice_id)
 transcript = "Hola! Bienvenido a Cartesia"
 language = "es"  # Language code corresponding to the language of the transcript
 
-# Make sure you use the multilingual model! You can check out all models at [docs.cartesia.ai](https://docs.cartesia.ai/getting-started/available-models).
+# Make sure you use the multilingual model! You can check out all models at https://docs.cartesia.ai/getting-started/available-models
 model_id = "sonic-multilingual"
 
-# You can find the supported `output_format`s in our [API Reference](https://docs.cartesia.ai/api-reference/endpoints/stream-speech-server-sent-events).
+# You can find the supported `output_format`s at https://docs.cartesia.ai/api-reference/endpoints/stream-speech-server-sent-events
 output_format = {
     "container": "raw",
     "encoding": "pcm_f32le",
cartesia-1.0.6.dist-info/RECORD ADDED
@@ -0,0 +1,12 @@
+cartesia/__init__.py,sha256=jMIf2O7dTGxvTA5AfXtmh1H_EGfMtQseR5wXrjNRbLs,93
+cartesia/_types.py,sha256=l3tKFnyUInn5_OJOSB63Mp1g16p9R23VNAuJ5qykOzY,4424
+cartesia/client.py,sha256=zLyxaDkX0et6lY_hthSgDA-eoP6NXEN5ysDsxxseyZQ,51502
+cartesia/version.py,sha256=mqMuQB3aqJVPrHHqJMLjqiMKUiJjozc7EPLcX5DpKHg,22
+cartesia/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+cartesia/utils/deprecated.py,sha256=2cXvGtrxhPeUZA5LWy2n_U5OFLDv7SHeFtzqhjSJGyk,1674
+cartesia/utils/retry.py,sha256=nuwWRfu3MOVTxIQMLjYf6WLaxSlnu_GdE3QjTV0zisQ,3339
+cartesia-1.0.6.dist-info/LICENSE.md,sha256=PT2YG5wEtEX1TNDn5sXkUXqbn-neyr7cZenTxd40ql4,1074
+cartesia-1.0.6.dist-info/METADATA,sha256=JcNWr0UHSp_GK3X05YD92zbLZonV0BkeyuzT90HuGSs,18368
+cartesia-1.0.6.dist-info/WHEEL,sha256=DZajD4pwLWue70CAfc7YaxT1wLUciNBvN_TTcvXpltE,110
+cartesia-1.0.6.dist-info/top_level.txt,sha256=rTX4HnnCegMxl1FK9czpVC7GAvf3SwDzPG65qP-BS4w,9
+cartesia-1.0.6.dist-info/RECORD,,
cartesia-1.0.4.dist-info/RECORD DELETED
@@ -1,11 +0,0 @@
-cartesia/__init__.py,sha256=jMIf2O7dTGxvTA5AfXtmh1H_EGfMtQseR5wXrjNRbLs,93
-cartesia/_types.py,sha256=tO3Nef_V78TDMKDuIv_wsQLkxoSvYG4bdzFkMGXUFho,3765
-cartesia/client.py,sha256=UCNTAU8eVzb-o-bygxfQQXWTDov_FX8dbAQdn7a8Hr0,41458
-cartesia/version.py,sha256=acuR_XSJzp4OrQ5T8-Ac5gYe48mUwObuwjRmisFmZ7k,22
-cartesia/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-cartesia/utils/deprecated.py,sha256=2cXvGtrxhPeUZA5LWy2n_U5OFLDv7SHeFtzqhjSJGyk,1674
-cartesia/utils/retry.py,sha256=nuwWRfu3MOVTxIQMLjYf6WLaxSlnu_GdE3QjTV0zisQ,3339
-cartesia-1.0.4.dist-info/METADATA,sha256=N7NoGr6XBtmLI6EHsG3efw0QNJ7uhV_E9HV8uqTYfQM,15991
-cartesia-1.0.4.dist-info/WHEEL,sha256=DZajD4pwLWue70CAfc7YaxT1wLUciNBvN_TTcvXpltE,110
-cartesia-1.0.4.dist-info/top_level.txt,sha256=rTX4HnnCegMxl1FK9czpVC7GAvf3SwDzPG65qP-BS4w,9
-cartesia-1.0.4.dist-info/RECORD,,