cartesia 1.0.5__tar.gz → 1.0.7__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {cartesia-1.0.5 → cartesia-1.0.7}/PKG-INFO +54 -1
- {cartesia-1.0.5 → cartesia-1.0.7}/README.md +53 -0
- {cartesia-1.0.5 → cartesia-1.0.7}/cartesia/_types.py +26 -1
- {cartesia-1.0.5 → cartesia-1.0.7}/cartesia/client.py +113 -72
- cartesia-1.0.7/cartesia/version.py +1 -0
- {cartesia-1.0.5 → cartesia-1.0.7}/cartesia.egg-info/PKG-INFO +54 -1
- {cartesia-1.0.5 → cartesia-1.0.7}/tests/test_tts.py +97 -14
- cartesia-1.0.5/cartesia/version.py +0 -1
- {cartesia-1.0.5 → cartesia-1.0.7}/LICENSE.md +0 -0
- {cartesia-1.0.5 → cartesia-1.0.7}/cartesia/__init__.py +0 -0
- {cartesia-1.0.5 → cartesia-1.0.7}/cartesia/utils/__init__.py +0 -0
- {cartesia-1.0.5 → cartesia-1.0.7}/cartesia/utils/deprecated.py +0 -0
- {cartesia-1.0.5 → cartesia-1.0.7}/cartesia/utils/retry.py +0 -0
- {cartesia-1.0.5 → cartesia-1.0.7}/cartesia.egg-info/SOURCES.txt +0 -0
- {cartesia-1.0.5 → cartesia-1.0.7}/cartesia.egg-info/dependency_links.txt +0 -0
- {cartesia-1.0.5 → cartesia-1.0.7}/cartesia.egg-info/requires.txt +0 -0
- {cartesia-1.0.5 → cartesia-1.0.7}/cartesia.egg-info/top_level.txt +0 -0
- {cartesia-1.0.5 → cartesia-1.0.7}/pyproject.toml +0 -0
- {cartesia-1.0.5 → cartesia-1.0.7}/setup.cfg +0 -0
- {cartesia-1.0.5 → cartesia-1.0.7}/setup.py +0 -0
- {cartesia-1.0.5 → cartesia-1.0.7}/tests/test_deprecated.py +0 -0
{cartesia-1.0.5 → cartesia-1.0.7}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: cartesia
-Version: 1.0.5
+Version: 1.0.7
 Summary: The official Python library for the Cartesia API.
 Home-page:
 Author: Cartesia, Inc.
@@ -401,6 +401,34 @@ p.terminate()
 ws.close()  # Close the websocket connection
 ```
 
+### Generating timestamps using WebSocket
+
+The WebSocket endpoint supports timestamps, allowing you to get detailed timing information for each word in the transcript. To enable this feature, pass an `add_timestamps` boolean flag to the `send` method. The results are returned in the `word_timestamps` object, which contains three keys:
+- words (list): The individual words in the transcript.
+- start (list): The starting timestamp for each word (in seconds).
+- end (list): The ending timestamp for each word (in seconds).
+
+```python
+response = ws.send(
+    model_id=model_id,
+    transcript=transcript,
+    voice_id=voice_id,
+    output_format=output_format,
+    stream=False,
+    add_timestamps=True
+)
+
+# Accessing the word_timestamps object
+word_timestamps = response['word_timestamps']
+
+words = word_timestamps['words']
+start_times = word_timestamps['start']
+end_times = word_timestamps['end']
+
+for word, start, end in zip(words, start_times, end_times):
+    print(f"Word: {word}, Start: {start}, End: {end}")
+```
+
 ### Multilingual Text-to-Speech [Alpha]
 
 You can use our `sonic-multilingual` model to generate audio in multiple languages. The languages supported are available at [docs.cartesia.ai](https://docs.cartesia.ai/getting-started/available-models).
@@ -454,6 +482,31 @@ stream.close()
 p.terminate()
 ```
 
+### Speed and Emotion Control [Experimental]
+
+You can enhance the voice output by adjusting the `speed` and `emotion` parameters. To do this, pass a `_experimental_voice_controls` dictionary with the desired `speed` and `emotion` values to any `send` method.
+
+Speed Options:
+- `slowest`, `slow`, `normal`, `fast`, `fastest`
+
+Emotion Options:
+Use a list of tags in the format `emotion_name:level` where:
+- Emotion Names: `anger`, `positivity`, `surprise`, `sadness`, `curiosity`
+- Levels: `lowest`, `low`, (omit for medium level), `high`, `highest`
+The emotion tag levels add the specified emotion to the voice at the indicated intensity, with the omission of a level tag resulting in a medium intensity.
+
+```python
+ws.send(
+    model_id=model_id,
+    transcript=transcript,
+    voice_id=voice_id,
+    output_format=output_format,
+    _experimental_voice_controls={"speed": "fast", "emotion": ["positivity:high"]},
+)
+```
+
+### Jupyter Notebook Usage
+
 If you are using Jupyter Notebook or JupyterLab, you can use IPython.display.Audio to play the generated audio directly in the notebook.
 Additionally, in these notebook examples we show how to use the client as a context manager (though this is not required).
{cartesia-1.0.5 → cartesia-1.0.7}/README.md

@@ -384,6 +384,34 @@ p.terminate()
 ws.close()  # Close the websocket connection
 ```
 
+### Generating timestamps using WebSocket
+
+The WebSocket endpoint supports timestamps, allowing you to get detailed timing information for each word in the transcript. To enable this feature, pass an `add_timestamps` boolean flag to the `send` method. The results are returned in the `word_timestamps` object, which contains three keys:
+- words (list): The individual words in the transcript.
+- start (list): The starting timestamp for each word (in seconds).
+- end (list): The ending timestamp for each word (in seconds).
+
+```python
+response = ws.send(
+    model_id=model_id,
+    transcript=transcript,
+    voice_id=voice_id,
+    output_format=output_format,
+    stream=False,
+    add_timestamps=True
+)
+
+# Accessing the word_timestamps object
+word_timestamps = response['word_timestamps']
+
+words = word_timestamps['words']
+start_times = word_timestamps['start']
+end_times = word_timestamps['end']
+
+for word, start, end in zip(words, start_times, end_times):
+    print(f"Word: {word}, Start: {start}, End: {end}")
+```
+
 ### Multilingual Text-to-Speech [Alpha]
 
 You can use our `sonic-multilingual` model to generate audio in multiple languages. The languages supported are available at [docs.cartesia.ai](https://docs.cartesia.ai/getting-started/available-models).
@@ -437,6 +465,31 @@ stream.close()
 p.terminate()
 ```
 
+### Speed and Emotion Control [Experimental]
+
+You can enhance the voice output by adjusting the `speed` and `emotion` parameters. To do this, pass a `_experimental_voice_controls` dictionary with the desired `speed` and `emotion` values to any `send` method.
+
+Speed Options:
+- `slowest`, `slow`, `normal`, `fast`, `fastest`
+
+Emotion Options:
+Use a list of tags in the format `emotion_name:level` where:
+- Emotion Names: `anger`, `positivity`, `surprise`, `sadness`, `curiosity`
+- Levels: `lowest`, `low`, (omit for medium level), `high`, `highest`
+The emotion tag levels add the specified emotion to the voice at the indicated intensity, with the omission of a level tag resulting in a medium intensity.
+
+```python
+ws.send(
+    model_id=model_id,
+    transcript=transcript,
+    voice_id=voice_id,
+    output_format=output_format,
+    _experimental_voice_controls={"speed": "fast", "emotion": ["positivity:high"]},
+)
+```
+
+### Jupyter Notebook Usage
+
 If you are using Jupyter Notebook or JupyterLab, you can use IPython.display.Audio to play the generated audio directly in the notebook.
 Additionally, in these notebook examples we show how to use the client as a context manager (though this is not required).
{cartesia-1.0.5 → cartesia-1.0.7}/cartesia/_types.py

@@ -45,7 +45,7 @@ class DeprecatedOutputFormatMapping:
         "mulaw_8000": {"container": "raw", "encoding": "pcm_mulaw", "sample_rate": 8000},
         "alaw_8000": {"container": "raw", "encoding": "pcm_alaw", "sample_rate": 8000},
     }
-
+
     @classmethod
     @deprecated(
         vdeprecated="1.0.1",
@@ -70,7 +70,32 @@ class VoiceMetadata(TypedDict):
     language: str
 
 
+class VoiceControls(TypedDict):
+    """Defines different voice control parameters for voice synthesis.
+
+    For a complete list of supported parameters, refer to the Cartesia API documentation.
+    https://docs.cartesia.ai/api-reference
+
+    Examples:
+        >>> {"speed": "fastest"}
+        >>> {"speed": "slow", "emotion": ["sadness:high"]}
+        >>> {"emotion": ["surprise:highest", "curiosity"]}
+
+    Note:
+        This is an experimental class and is subject to rapid change in future versions.
+    """
+
+    speed: str = ""
+    emotion: List[str] = []
+
+
 class OutputFormat(TypedDict):
     container: str
     encoding: str
     sample_rate: int
+
+
+class EventType:
+    NULL = ""
+    AUDIO = "chunk"
+    TIMESTAMPS = "timestamps"
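As an aside, here is a minimal, self-contained sketch (not part of the package diff) of how the new `VoiceControls` and `EventType` additions in `_types.py` might be exercised by calling code. The `describe_message` helper is hypothetical, and `VoiceControls` is declared with `total=False` here so both keys are optional; the shipped class assigns empty defaults instead.

```python
from typing import List, TypedDict


class VoiceControls(TypedDict, total=False):
    # Mirrors the fields added in _types.py; total=False makes both keys optional here.
    speed: str
    emotion: List[str]


class EventType:
    # Mirrors the constants added in _types.py.
    NULL = ""
    AUDIO = "chunk"
    TIMESTAMPS = "timestamps"


def describe_message(message: dict) -> str:
    # Hypothetical helper: branch on the event type the way the reworked
    # _convert_response in client.py does after this change.
    if message.get("type") == EventType.AUDIO:
        return "audio chunk"
    if message.get("type") == EventType.TIMESTAMPS:
        return "word timestamps"
    return "unknown event"


controls: VoiceControls = {"speed": "fast", "emotion": ["positivity:high"]}
print(describe_message({"type": "timestamps"}))  # -> word timestamps
```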
{cartesia-1.0.5 → cartesia-1.0.7}/cartesia/client.py

@@ -1,5 +1,6 @@
 import asyncio
 import base64
+from collections import defaultdict
 import json
 import os
 import uuid
@@ -27,9 +28,11 @@ from iterators import TimeoutIterator
 
 from cartesia.utils.retry import retry_on_connection_error, retry_on_connection_error_async
 from cartesia._types import (
+    EventType,
     OutputFormat,
     OutputFormatMapping,
     DeprecatedOutputFormatMapping,
+    VoiceControls,
     VoiceMetadata,
 )
 
@@ -295,6 +298,7 @@ class _TTSContext:
         context_id: Optional[str] = None,
         duration: Optional[int] = None,
         language: Optional[str] = None,
+        _experimental_voice_controls: Optional[VoiceControls] = None,
     ) -> Generator[bytes, None, None]:
         """Send audio generation requests to the WebSocket and yield responses.
 
@@ -307,6 +311,8 @@ class _TTSContext:
             context_id: The context ID to use for the request. If not specified, a random context ID will be generated.
             duration: The duration of the audio in seconds.
             language: The language code for the audio request. This can only be used with `model_id = sonic-multilingual`
+            _experimental_voice_controls: Experimental voice controls for controlling speed and emotion.
+                Note: This is an experimental feature and may change rapidly in future releases.
 
         Yields:
             Dictionary containing the following key(s):
@@ -322,7 +328,11 @@ class _TTSContext:
 
         self._websocket.connect()
 
-        voice =
+        voice = TTS._validate_and_construct_voice(
+            voice_id,
+            voice_embedding=voice_embedding,
+            experimental_voice_controls=_experimental_voice_controls,
+        )
 
         # Create the initial request body
         request_body = {
@@ -482,42 +492,16 @@ class _WebSocket:
     def _convert_response(
         self, response: Dict[str, any], include_context_id: bool
     ) -> Dict[str, Any]:
-        audio = base64.b64decode(response["data"])
+        out = {}
+        if response["type"] == EventType.AUDIO:
+            out["audio"] = base64.b64decode(response["data"])
+        elif response["type"] == EventType.TIMESTAMPS:
+            out["word_timestamps"] = response["word_timestamps"]
 
-        optional_kwargs = {}
         if include_context_id:
-            optional_kwargs["context_id"] = response["context_id"]
-
-        return {
-            "audio": audio,
-            **optional_kwargs,
-        }
-
-    def _validate_and_construct_voice(
-        self, voice_id: Optional[str] = None, voice_embedding: Optional[List[float]] = None
-    ) -> dict:
-        """Validate and construct the voice dictionary for the request.
-
-        Args:
-            voice_id: The ID of the voice to use for generating audio.
-            voice_embedding: The embedding of the voice to use for generating audio.
+            out["context_id"] = response["context_id"]
 
-        Returns:
-            A dictionary representing the voice configuration.
-
-        Raises:
-            ValueError: If neither or both voice_id and voice_embedding are specified.
-        """
-        if voice_id is None and voice_embedding is None:
-            raise ValueError("Either voice_id or voice_embedding must be specified.")
-
-        if voice_id is not None and voice_embedding is not None:
-            raise ValueError("Only one of voice_id or voice_embedding should be specified.")
-
-        if voice_id:
-            return {"mode": "id", "id": voice_id}
-
-        return {"mode": "embedding", "embedding": voice_embedding}
+        return out
 
     def send(
         self,
@@ -530,6 +514,8 @@ class _WebSocket:
         duration: Optional[int] = None,
         language: Optional[str] = None,
         stream: bool = True,
+        add_timestamps: bool = False,
+        _experimental_voice_controls: Optional[VoiceControls] = None,
     ) -> Union[bytes, Generator[bytes, None, None]]:
         """Send a request to the WebSocket to generate audio.
 
@@ -543,6 +529,9 @@ class _WebSocket:
            duration: The duration of the audio in seconds.
            language: The language code for the audio request. This can only be used with `model_id = sonic-multilingual`
            stream: Whether to stream the audio or not.
+            add_timestamps: Whether to return word-level timestamps.
+            _experimental_voice_controls: Experimental voice controls for controlling speed and emotion.
+                Note: This is an experimental feature and may change rapidly in future releases.
 
         Returns:
             If `stream` is True, the method returns a generator that yields chunks. Each chunk is a dictionary.
@@ -556,7 +545,11 @@ class _WebSocket:
         if context_id is None:
             context_id = str(uuid.uuid4())
 
-        voice =
+        voice = TTS._validate_and_construct_voice(
+            voice_id,
+            voice_embedding=voice_embedding,
+            experimental_voice_controls=_experimental_voice_controls,
+        )
 
         request_body = {
             "model_id": model_id,
@@ -569,6 +562,7 @@ class _WebSocket:
             },
             "context_id": context_id,
             "language": language,
+            "add_timestamps": add_timestamps,
         }
 
         if duration is not None:
@@ -580,10 +574,17 @@ class _WebSocket:
             return generator
 
         chunks = []
+        word_timestamps = defaultdict(list)
         for chunk in generator:
-            chunks.append(chunk["audio"])
-
-        return {"audio": b"".join(chunks), "context_id": context_id}
+            if "audio" in chunk:
+                chunks.append(chunk["audio"])
+            if add_timestamps and "word_timestamps" in chunk:
+                for k, v in chunk["word_timestamps"].items():
+                    word_timestamps[k].extend(v)
+        out = {"audio": b"".join(chunks), "context_id": context_id}
+        if add_timestamps:
+            out["word_timestamps"] = word_timestamps
+        return out
 
     def _websocket_generator(self, request_body: Dict[str, Any]):
         self.websocket.send(json.dumps(request_body))
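The non-streaming branch above now merges the per-chunk `word_timestamps` dictionaries into a single result. A standalone sketch of that accumulation pattern (not part of the diff), using made-up chunk data in place of the real WebSocket generator:

```python
from collections import defaultdict

# Hypothetical chunks shaped like the dictionaries the generator yields after this
# change: some carry audio bytes, others carry word-level timestamps.
chunks = [
    {"audio": b"\x00\x01", "context_id": "ctx-1"},
    {"word_timestamps": {"words": ["Hello,"], "start": [0.0], "end": [0.3]}, "context_id": "ctx-1"},
    {"audio": b"\x02\x03", "context_id": "ctx-1"},
    {"word_timestamps": {"words": ["world!"], "start": [0.3], "end": [0.6]}, "context_id": "ctx-1"},
]

audio_parts = []
word_timestamps = defaultdict(list)
for chunk in chunks:
    if "audio" in chunk:
        audio_parts.append(chunk["audio"])
    if "word_timestamps" in chunk:
        # Extend each of the parallel lists (words / start / end).
        for key, values in chunk["word_timestamps"].items():
            word_timestamps[key].extend(values)

out = {"audio": b"".join(audio_parts), "context_id": "ctx-1", "word_timestamps": dict(word_timestamps)}
print(out["word_timestamps"]["words"])  # ['Hello,', 'world!']
```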
@@ -656,32 +657,6 @@ class _SSE:
                 break
         return buffer, outputs
 
-    def _validate_and_construct_voice(
-        self, voice_id: Optional[str] = None, voice_embedding: Optional[List[float]] = None
-    ) -> dict:
-        """Validate and construct the voice dictionary for the request.
-
-        Args:
-            voice_id: The ID of the voice to use for generating audio.
-            voice_embedding: The embedding of the voice to use for generating audio.
-
-        Returns:
-            A dictionary representing the voice configuration.
-
-        Raises:
-            ValueError: If neither or both voice_id and voice_embedding are specified.
-        """
-        if voice_id is None and voice_embedding is None:
-            raise ValueError("Either voice_id or voice_embedding must be specified.")
-
-        if voice_id is not None and voice_embedding is not None:
-            raise ValueError("Only one of voice_id or voice_embedding should be specified.")
-
-        if voice_id:
-            return {"mode": "id", "id": voice_id}
-
-        return {"mode": "embedding", "embedding": voice_embedding}
-
     def send(
         self,
         model_id: str,
@@ -692,6 +667,7 @@ class _SSE:
         duration: Optional[int] = None,
         language: Optional[str] = None,
         stream: bool = True,
+        _experimental_voice_controls: Optional[VoiceControls] = None,
    ) -> Union[bytes, Generator[bytes, None, None]]:
         """Send a request to the server to generate audio using Server-Sent Events.
 
@@ -704,6 +680,8 @@ class _SSE:
            duration: The duration of the audio in seconds.
            language: The language code for the audio request. This can only be used with `model_id = sonic-multilingual`
            stream: Whether to stream the audio or not.
+            _experimental_voice_controls: Experimental voice controls for controlling speed and emotion.
+                Note: This is an experimental feature and may change rapidly in future releases.
 
         Returns:
             If `stream` is True, the method returns a generator that yields chunks. Each chunk is a dictionary.
@@ -711,8 +689,11 @@ class _SSE:
         Both the generator and the dictionary contain the following key(s):
         - audio: The audio as bytes.
         """
-        voice =
-
+        voice = TTS._validate_and_construct_voice(
+            voice_id,
+            voice_embedding=voice_embedding,
+            experimental_voice_controls=_experimental_voice_controls,
+        )
         request_body = {
             "model_id": model_id,
             "transcript": transcript,
@@ -826,6 +807,7 @@ class TTS(Resource):
             sample_rate=output_format_obj["sample_rate"],
         )
 
+    @staticmethod
     def get_sample_rate(self, output_format_name: str) -> int:
         """Convenience method to get the sample rate for a given output format.
 
@@ -849,6 +831,40 @@ class TTS(Resource):
 
         return output_format_obj["sample_rate"]
 
+    @staticmethod
+    def _validate_and_construct_voice(
+        voice_id: Optional[str] = None,
+        voice_embedding: Optional[List[float]] = None,
+        experimental_voice_controls: Optional[VoiceControls] = None,
+    ) -> dict:
+        """Validate and construct the voice dictionary for the request.
+
+        Args:
+            voice_id: The ID of the voice to use for generating audio.
+            voice_embedding: The embedding of the voice to use for generating audio.
+            experimental_voice_controls: Voice controls for emotion and speed.
+                Note: This is an experimental feature and may rapidly change in the future.
+
+        Returns:
+            A dictionary representing the voice configuration.
+
+        Raises:
+            ValueError: If neither or both voice_id and voice_embedding are specified.
+        """
+        if voice_id is None and voice_embedding is None:
+            raise ValueError("Either voice_id or voice_embedding must be specified.")
+
+        if voice_id is not None and voice_embedding is not None:
+            raise ValueError("Only one of voice_id or voice_embedding should be specified.")
+
+        if voice_id:
+            voice = {"mode": "id", "id": voice_id}
+        else:
+            voice = {"mode": "embedding", "embedding": voice_embedding}
+        if experimental_voice_controls is not None:
+            voice["__experimental_controls"] = experimental_voice_controls
+        return voice
+
 
 class AsyncCartesia(Cartesia):
     """The asynchronous version of the Cartesia client."""
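For reference, a small illustration (not from the diff) of the voice dictionaries that `TTS._validate_and_construct_voice` produces under the rules shown above; the sample ID and the 192-dimensional embedding are borrowed from the test suite.

```python
# With a voice_id plus experimental controls, the constructed payload looks like:
voice_by_id = {
    "mode": "id",
    "id": "d46abd1d-2d02-43e8-819f-51fb652c1c61",
    "__experimental_controls": {"speed": "fast", "emotion": ["positivity:high"]},
}

# With an embedding and no controls, only the embedding mode is present:
voice_by_embedding = {
    "mode": "embedding",
    "embedding": [1.0] * 192,  # the tests use np.ones(192).tolist()
}

# Passing both voice_id and voice_embedding, or neither, raises ValueError.
```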
@@ -946,8 +962,13 @@ class _AsyncSSE(_SSE):
         duration: Optional[int] = None,
         language: Optional[str] = None,
         stream: bool = True,
+        _experimental_voice_controls: Optional[VoiceControls] = None,
     ) -> Union[bytes, AsyncGenerator[bytes, None]]:
-        voice =
+        voice = TTS._validate_and_construct_voice(
+            voice_id,
+            voice_embedding=voice_embedding,
+            experimental_voice_controls=_experimental_voice_controls,
+        )
 
         request_body = {
             "model_id": model_id,
@@ -1043,6 +1064,8 @@ class _AsyncTTSContext:
         continue_: bool = False,
         duration: Optional[int] = None,
         language: Optional[str] = None,
+        add_timestamps: bool = False,
+        _experimental_voice_controls: Optional[VoiceControls] = None,
     ) -> None:
         """Send audio generation requests to the WebSocket. The response can be received using the `receive` method.
 
@@ -1055,7 +1078,10 @@ class _AsyncTTSContext:
             context_id: The context ID to use for the request. If not specified, a random context ID will be generated.
             continue_: Whether to continue the audio generation from the previous transcript or not.
             duration: The duration of the audio in seconds.
-            language: The language code for the audio request. This can only be used with `model_id = sonic-multilingual
+            language: The language code for the audio request. This can only be used with `model_id = sonic-multilingual`.
+            add_timestamps: Whether to return word-level timestamps.
+            _experimental_voice_controls: Experimental voice controls for controlling speed and emotion.
+                Note: This is an experimental feature and may change rapidly in future releases.
 
         Returns:
             None.
@@ -1067,7 +1093,9 @@ class _AsyncTTSContext:
 
         await self._websocket.connect()
 
-        voice =
+        voice = TTS._validate_and_construct_voice(
+            voice_id, voice_embedding, experimental_voice_controls=_experimental_voice_controls
+        )
 
         request_body = {
             "model_id": model_id,
@@ -1081,6 +1109,7 @@ class _AsyncTTSContext:
             "context_id": self._context_id,
             "continue": continue_,
             "language": language,
+            "add_timestamps": add_timestamps,
         }
 
         if duration is not None:
@@ -1234,7 +1263,10 @@ class _AsyncWebSocket(_WebSocket):
         duration: Optional[int] = None,
         language: Optional[str] = None,
         stream: bool = True,
+        add_timestamps: bool = False,
+        _experimental_voice_controls: Optional[VoiceControls] = None,
     ) -> Union[bytes, AsyncGenerator[bytes, None]]:
+        """See :meth:`_WebSocket.send` for details."""
         if context_id is None:
             context_id = str(uuid.uuid4())
 
@@ -1250,6 +1282,8 @@ class _AsyncWebSocket(_WebSocket):
             duration=duration,
             language=language,
             continue_=False,
+            add_timestamps=add_timestamps,
+            _experimental_voice_controls=_experimental_voice_controls,
         )
 
         generator = ctx.receive()
@@ -1258,10 +1292,17 @@ class _AsyncWebSocket(_WebSocket):
             return generator
 
         chunks = []
+        word_timestamps = defaultdict(list)
         async for chunk in generator:
-            chunks.append(chunk["audio"])
-
-        return {"audio": b"".join(chunks), "context_id": context_id}
+            if "audio" in chunk:
+                chunks.append(chunk["audio"])
+            if add_timestamps and "word_timestamps" in chunk:
+                for k, v in chunk["word_timestamps"].items():
+                    word_timestamps[k].extend(v)
+        out = {"audio": b"".join(chunks), "context_id": context_id}
+        if add_timestamps:
+            out["word_timestamps"] = word_timestamps
+        return out
 
     async def _process_responses(self):
         try:
cartesia-1.0.7/cartesia/version.py

@@ -0,0 +1 @@
+__version__ = "1.0.7"
{cartesia-1.0.5 → cartesia-1.0.7}/cartesia.egg-info/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: cartesia
-Version: 1.0.5
+Version: 1.0.7
 Summary: The official Python library for the Cartesia API.
 Home-page:
 Author: Cartesia, Inc.
@@ -401,6 +401,34 @@ p.terminate()
 ws.close()  # Close the websocket connection
 ```
 
+### Generating timestamps using WebSocket
+
+The WebSocket endpoint supports timestamps, allowing you to get detailed timing information for each word in the transcript. To enable this feature, pass an `add_timestamps` boolean flag to the `send` method. The results are returned in the `word_timestamps` object, which contains three keys:
+- words (list): The individual words in the transcript.
+- start (list): The starting timestamp for each word (in seconds).
+- end (list): The ending timestamp for each word (in seconds).
+
+```python
+response = ws.send(
+    model_id=model_id,
+    transcript=transcript,
+    voice_id=voice_id,
+    output_format=output_format,
+    stream=False,
+    add_timestamps=True
+)
+
+# Accessing the word_timestamps object
+word_timestamps = response['word_timestamps']
+
+words = word_timestamps['words']
+start_times = word_timestamps['start']
+end_times = word_timestamps['end']
+
+for word, start, end in zip(words, start_times, end_times):
+    print(f"Word: {word}, Start: {start}, End: {end}")
+```
+
 ### Multilingual Text-to-Speech [Alpha]
 
 You can use our `sonic-multilingual` model to generate audio in multiple languages. The languages supported are available at [docs.cartesia.ai](https://docs.cartesia.ai/getting-started/available-models).
@@ -454,6 +482,31 @@ stream.close()
 p.terminate()
 ```
 
+### Speed and Emotion Control [Experimental]
+
+You can enhance the voice output by adjusting the `speed` and `emotion` parameters. To do this, pass a `_experimental_voice_controls` dictionary with the desired `speed` and `emotion` values to any `send` method.
+
+Speed Options:
+- `slowest`, `slow`, `normal`, `fast`, `fastest`
+
+Emotion Options:
+Use a list of tags in the format `emotion_name:level` where:
+- Emotion Names: `anger`, `positivity`, `surprise`, `sadness`, `curiosity`
+- Levels: `lowest`, `low`, (omit for medium level), `high`, `highest`
+The emotion tag levels add the specified emotion to the voice at the indicated intensity, with the omission of a level tag resulting in a medium intensity.
+
+```python
+ws.send(
+    model_id=model_id,
+    transcript=transcript,
+    voice_id=voice_id,
+    output_format=output_format,
+    _experimental_voice_controls={"speed": "fast", "emotion": ["positivity:high"]},
+)
+```
+
+### Jupyter Notebook Usage
+
 If you are using Jupyter Notebook or JupyterLab, you can use IPython.display.Audio to play the generated audio directly in the notebook.
 Additionally, in these notebook examples we show how to use the client as a context manager (though this is not required).
{cartesia-1.0.5 → cartesia-1.0.7}/tests/test_tts.py

@@ -10,7 +10,7 @@ import os
 import sys
 from cartesia import AsyncCartesia, Cartesia
 from cartesia.client import DEFAULT_MODEL_ID, MULTILINGUAL_MODEL_ID
-from cartesia._types import VoiceMetadata
+from cartesia._types import VoiceControls, VoiceMetadata
 from typing import AsyncGenerator, Generator, List
 import numpy as np
 import pytest
@@ -19,9 +19,11 @@ import asyncio
 
 THISDIR = os.path.dirname(__file__)
 sys.path.insert(0, os.path.dirname(THISDIR))
+RESOURCES_DIR = os.path.join(THISDIR, "resources")
 
 SAMPLE_VOICE = "Newsman"
 SAMPLE_VOICE_ID = "d46abd1d-2d02-43e8-819f-51fb652c1c61"
+EXPERIMENTAL_VOICE_CONTROLS = {"emotion": ["anger:high", "positivity:low"], "speed": "fastest"}
 
 logger = logging.getLogger(__name__)
 
@@ -84,7 +86,12 @@ def test_get_voice_from_id(client: Cartesia):
     # cloned_voice_embedding = client.voices.clone(link=url)
     # assert isinstance(cloned_voice_embedding, list)
     # assert len(cloned_voice_embedding) == 192
-
+
+
+def test_clone_voice_with_file(client: Cartesia):
+    logger.info("Testing voices.clone with file")
+    output = client.voices.clone(filepath=os.path.join(RESOURCES_DIR, "sample-speech-4s.wav"))
+    assert isinstance(output, list)
+
 def test_create_voice(client: Cartesia):
     logger.info("Testing voices.create")
     embedding = np.ones(192).tolist()
@@ -96,7 +103,8 @@ def test_create_voice(client: Cartesia):
     assert voice in voices
 
 @pytest.mark.parametrize("stream", [True, False])
-def test_sse_send(resources: _Resources, stream: bool):
+@pytest.mark.parametrize("_experimental_voice_controls", [None, EXPERIMENTAL_VOICE_CONTROLS])
+def test_sse_send(resources: _Resources, stream: bool, _experimental_voice_controls: VoiceControls):
     logger.info("Testing SSE send")
     client = resources.client
     transcript = "Hello, world! I'm generating audio on Cartesia."
@@ -105,7 +113,7 @@ def test_sse_send(resources: _Resources, stream: bool):
         "container": "raw",
         "encoding": "pcm_f32le",
         "sample_rate": 44100
-    }, stream=stream, model_id=DEFAULT_MODEL_ID)
+    }, stream=stream, model_id=DEFAULT_MODEL_ID, _experimental_voice_controls=_experimental_voice_controls)
 
     if not stream:
         output_generate = [output_generate]
@@ -132,7 +140,8 @@ def test_sse_send_with_model_id(resources: _Resources, stream: bool):
     assert isinstance(out["audio"], bytes)
 
 @pytest.mark.parametrize("stream", [True, False])
-def test_websocket_send(resources: _Resources, stream: bool):
+@pytest.mark.parametrize("_experimental_voice_controls", [None, EXPERIMENTAL_VOICE_CONTROLS])
+def test_websocket_send(resources: _Resources, stream: bool, _experimental_voice_controls: VoiceControls):
     logger.info("Testing WebSocket send")
     client = resources.client
     transcript = "Hello, world! I'm generating audio on Cartesia."
@@ -143,7 +152,7 @@ def test_websocket_send(resources: _Resources, stream: bool):
         "container": "raw",
         "encoding": "pcm_f32le",
         "sample_rate": 44100
-    }, stream=stream, model_id=DEFAULT_MODEL_ID, context_id=context_id)
+    }, stream=stream, model_id=DEFAULT_MODEL_ID, context_id=context_id, _experimental_voice_controls=_experimental_voice_controls)
 
     if not stream:
         output_generate = [output_generate]
@@ -152,8 +161,37 @@ def test_websocket_send(resources: _Resources, stream: bool):
     assert isinstance(out["audio"], bytes)
 
     ws.close()
+
+
+@pytest.mark.parametrize("stream", [True, False])
+def test_websocket_send_timestamps(resources: _Resources, stream: bool):
+    logger.info("Testing WebSocket send")
+    client = resources.client
+    transcript = "Hello, world! I'm generating audio on Cartesia."
+
+    ws = client.tts.websocket()
+    context_id = str(uuid.uuid4())
+    output_generate = ws.send(transcript=transcript, voice_id=SAMPLE_VOICE_ID, output_format={
+        "container": "raw",
+        "encoding": "pcm_f32le",
+        "sample_rate": 44100
+    }, stream=stream, model_id=DEFAULT_MODEL_ID, context_id=context_id, add_timestamps=True)
+
+    if not stream:
+        output_generate = [output_generate]
+
+    has_wordtimestamps = False
+    for out in output_generate:
+        has_wordtimestamps |= "word_timestamps" in out
+        _validate_schema(out)
+
+    assert has_wordtimestamps, "No word timestamps found"
+
+    ws.close()
+
 
-def test_sse_send_context_manager(resources: _Resources):
+@pytest.mark.parametrize("_experimental_voice_controls", [None, EXPERIMENTAL_VOICE_CONTROLS])
+def test_sse_send_context_manager(resources: _Resources, _experimental_voice_controls: VoiceControls):
     logger.info("Testing SSE send context manager")
     transcript = "Hello, world! I'm generating audio on Cartesia."
 
@@ -162,7 +200,7 @@ def test_sse_send_context_manager(resources: _Resources):
         "container": "raw",
         "encoding": "pcm_f32le",
         "sample_rate": 44100
-    }, stream=True, model_id=DEFAULT_MODEL_ID)
+    }, stream=True, model_id=DEFAULT_MODEL_ID, _experimental_voice_controls=_experimental_voice_controls)
     assert isinstance(output_generate, Generator)
 
     for out in output_generate:
@@ -183,7 +221,7 @@ def test_sse_send_context_manager_with_err():
         raise RuntimeError("Expected error to be thrown")
     except Exception:
         pass
-
+
 def test_websocket_send_context_manager(resources: _Resources):
     logger.info("Testing WebSocket send context manager")
     transcript = "Hello, world! I'm generating audio on Cartesia."
@@ -216,9 +254,10 @@ def test_websocket_send_context_manage_err(resources: _Resources):
         raise RuntimeError("Expected error to be thrown")
     except Exception:
         pass
-
+
 @pytest.mark.asyncio
-async def test_async_sse_send(resources: _Resources):
+@pytest.mark.parametrize("_experimental_voice_controls", [None, EXPERIMENTAL_VOICE_CONTROLS])
+async def test_async_sse_send( resources: _Resources, _experimental_voice_controls: VoiceControls):
     logger.info("Testing async SSE send")
     transcript = "Hello, world! I'm generating audio on Cartesia."
 
@@ -228,7 +267,7 @@ async def test_async_sse_send(resources: _Resources):
         "container": "raw",
         "encoding": "pcm_f32le",
         "sample_rate": 44100
-    }, stream=True, model_id=DEFAULT_MODEL_ID)
+    }, stream=True, model_id=DEFAULT_MODEL_ID, _experimental_voice_controls=_experimental_voice_controls)
 
     async for out in output:
         assert out.keys() == {"audio"}
@@ -238,7 +277,8 @@ async def test_async_sse_send(resources: _Resources):
     await async_client.close()
 
 @pytest.mark.asyncio
-async def test_async_websocket_send(resources: _Resources):
+@pytest.mark.parametrize("_experimental_voice_controls", [None, EXPERIMENTAL_VOICE_CONTROLS])
+async def test_async_websocket_send(resources: _Resources, _experimental_voice_controls: VoiceControls):
     logger.info("Testing async WebSocket send")
     transcript = "Hello, world! I'm generating audio on Cartesia."
 
@@ -250,7 +290,7 @@ async def test_async_websocket_send(resources: _Resources):
         "container": "raw",
         "encoding": "pcm_f32le",
         "sample_rate": 44100,
-    }, stream=True, model_id=DEFAULT_MODEL_ID, context_id=context_id)
+    }, stream=True, model_id=DEFAULT_MODEL_ID, context_id=context_id, _experimental_voice_controls=_experimental_voice_controls)
 
     async for out in output:
         assert out.keys() == {"audio", "context_id"}
@@ -259,7 +299,37 @@ async def test_async_websocket_send(resources: _Resources):
     # Close the websocket
     await ws.close()
     await async_client.close()
+
+
+@pytest.mark.asyncio
+async def test_async_websocket_send_timestamps(resources: _Resources):
+    logger.info("Testing async WebSocket send with timestamps")
+    transcript = "Hello, world! I'm generating audio on Cartesia."
+
+    async_client = create_async_client()
+    ws = await async_client.tts.websocket()
+    context_id = str(uuid.uuid4())
+    try:
+        output = await ws.send(transcript=transcript, voice_id=SAMPLE_VOICE_ID, output_format={
+            "container": "raw",
+            "encoding": "pcm_f32le",
+            "sample_rate": 44100,
+        }, stream=True, model_id=DEFAULT_MODEL_ID, context_id=context_id, add_timestamps=True)
+
+        has_wordtimestamps = False
+        async for out in output:
+            assert "context_id" in out
+            has_wordtimestamps |= "word_timestamps" in out
+            _validate_schema(out)
 
+        assert has_wordtimestamps, "No word timestamps found"
+
+    finally:
+        # Close the websocket
+        await ws.close()
+        await async_client.close()
+
+
 @pytest.mark.asyncio
 async def test_async_sse_send_context_manager(resources: _Resources):
     logger.info("Testing async SSE send context manager")
@@ -766,3 +836,16 @@ def test_websocket_send_with_incorrect_url():
         ws.close()
     except Exception as e:
         logger.info("Unexpected error occured: ", e)
+
+
+def _validate_schema(out):
+    if "audio" in out:
+        assert isinstance(out["audio"], bytes)
+    if "word_timestamps" in out:
+        assert isinstance(out["word_timestamps"], dict)
+        word_timestamps = out["word_timestamps"]
+
+        assert word_timestamps.keys() == {"words", "start", "end"}
+        assert isinstance(word_timestamps["words"], list) and all(isinstance(word, str) for word in word_timestamps["words"])
+        assert isinstance(word_timestamps["start"], list) and all(isinstance(start, (int, float)) for start in word_timestamps["start"])
+        assert isinstance(word_timestamps["end"], list) and all(isinstance(end, (int, float)) for end in word_timestamps["end"])
cartesia-1.0.5/cartesia/version.py

@@ -1 +0,0 @@
-__version__ = "1.0.5"