cartesia 1.0.10__tar.gz → 1.0.12__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {cartesia-1.0.10 → cartesia-1.0.12}/PKG-INFO +7 -1
- {cartesia-1.0.10 → cartesia-1.0.12}/README.md +6 -0
- {cartesia-1.0.10 → cartesia-1.0.12}/cartesia/client.py +39 -3
- cartesia-1.0.12/cartesia/version.py +1 -0
- {cartesia-1.0.10 → cartesia-1.0.12}/cartesia.egg-info/PKG-INFO +7 -1
- {cartesia-1.0.10 → cartesia-1.0.12}/tests/test_tts.py +27 -1
- cartesia-1.0.10/cartesia/version.py +0 -1
- {cartesia-1.0.10 → cartesia-1.0.12}/LICENSE.md +0 -0
- {cartesia-1.0.10 → cartesia-1.0.12}/cartesia/__init__.py +0 -0
- {cartesia-1.0.10 → cartesia-1.0.12}/cartesia/_types.py +0 -0
- {cartesia-1.0.10 → cartesia-1.0.12}/cartesia/utils/__init__.py +0 -0
- {cartesia-1.0.10 → cartesia-1.0.12}/cartesia/utils/deprecated.py +0 -0
- {cartesia-1.0.10 → cartesia-1.0.12}/cartesia/utils/retry.py +0 -0
- {cartesia-1.0.10 → cartesia-1.0.12}/cartesia.egg-info/SOURCES.txt +0 -0
- {cartesia-1.0.10 → cartesia-1.0.12}/cartesia.egg-info/dependency_links.txt +0 -0
- {cartesia-1.0.10 → cartesia-1.0.12}/cartesia.egg-info/requires.txt +0 -0
- {cartesia-1.0.10 → cartesia-1.0.12}/cartesia.egg-info/top_level.txt +0 -0
- {cartesia-1.0.10 → cartesia-1.0.12}/pyproject.toml +0 -0
- {cartesia-1.0.10 → cartesia-1.0.12}/setup.cfg +0 -0
- {cartesia-1.0.10 → cartesia-1.0.12}/setup.py +0 -0
- {cartesia-1.0.10 → cartesia-1.0.12}/tests/test_deprecated.py +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: cartesia
|
3
|
-
Version: 1.0.10
|
3
|
+
Version: 1.0.12
|
4
4
|
Summary: The official Python library for the Cartesia API.
|
5
5
|
Home-page:
|
6
6
|
Author: Cartesia, Inc.
|
@@ -73,6 +73,11 @@ print("The embedding for", voice["name"], "is", voice["embedding"])
|
|
73
73
|
# Clone a voice using filepath
|
74
74
|
cloned_voice_embedding = client.voices.clone(filepath="path/to/voice")
|
75
75
|
|
76
|
+
# Mix voices together
|
77
|
+
mixed_voice_embedding = client.voices.mix(
|
78
|
+
[{ "id": "voice_id_1", "weight": 0.5 }, { "id": "voice_id_2", "weight": 0.25 }, { "id": "voice_id_3", "weight": 0.25 }]
|
79
|
+
)
|
80
|
+
|
76
81
|
# Create a new voice
|
77
82
|
new_voice = client.voices.create(
|
78
83
|
name="New Voice",
|
@@ -504,6 +509,7 @@ You can enhance the voice output by adjusting the `speed` and `emotion` paramete
|
|
504
509
|
|
505
510
|
Speed Options:
|
506
511
|
- `slowest`, `slow`, `normal`, `fast`, `fastest`
|
512
|
+
- Float values between -1.0 and 1.0, where -1.0 is the slowest speed and 1.0 is the fastest speed.
|
507
513
|
|
508
514
|
Emotion Options:
|
509
515
|
Use a list of tags in the format `emotion_name:level` where:
|
@@ -56,6 +56,11 @@ print("The embedding for", voice["name"], "is", voice["embedding"])
|
|
56
56
|
# Clone a voice using filepath
|
57
57
|
cloned_voice_embedding = client.voices.clone(filepath="path/to/voice")
|
58
58
|
|
59
|
+
# Mix voices together
|
60
|
+
mixed_voice_embedding = client.voices.mix(
|
61
|
+
[{ "id": "voice_id_1", "weight": 0.5 }, { "id": "voice_id_2", "weight": 0.25 }, { "id": "voice_id_3", "weight": 0.25 }]
|
62
|
+
)
|
63
|
+
|
59
64
|
# Create a new voice
|
60
65
|
new_voice = client.voices.create(
|
61
66
|
name="New Voice",
|
@@ -487,6 +492,7 @@ You can enhance the voice output by adjusting the `speed` and `emotion` paramete
|
|
487
492
|
|
488
493
|
Speed Options:
|
489
494
|
- `slowest`, `slow`, `normal`, `fast`, `fastest`
|
495
|
+
- Float values between -1.0 and 1.0, where -1.0 is the slowest speed and 1.0 is the fastest speed.
|
490
496
|
|
491
497
|
Emotion Options:
|
492
498
|
Use a list of tags in the format `emotion_name:level` where:
|
@@ -32,7 +32,6 @@ except ImportError:
|
|
32
32
|
IS_WEBSOCKET_SYNC_AVAILABLE = False
|
33
33
|
|
34
34
|
from iterators import TimeoutIterator
|
35
|
-
from websockets.sync.client import connect
|
36
35
|
|
37
36
|
from cartesia._types import (
|
38
37
|
DeprecatedOutputFormatMapping,
|
@@ -261,6 +260,40 @@ class Voices(Resource):
|
|
261
260
|
|
262
261
|
return response.json()
|
263
262
|
|
263
|
+
def mix(self, voices: List[Dict[str, Union[str, float]]]) -> List[float]:
|
264
|
+
"""Mix multiple voices together.
|
265
|
+
|
266
|
+
Args:
|
267
|
+
voices: A list of dictionaries, each containing either:
|
268
|
+
- 'id': The ID of an existing voice
|
269
|
+
- 'embedding': A voice embedding
|
270
|
+
AND
|
271
|
+
- 'weight': The weight of the voice in the mix (0.0 to 1.0)
|
272
|
+
|
273
|
+
Returns:
|
274
|
+
The embedding of the mixed voice as a list of floats.
|
275
|
+
|
276
|
+
Raises:
|
277
|
+
ValueError: If the request fails or if the input is invalid.
|
278
|
+
"""
|
279
|
+
url = f"{self._http_url()}/voices/mix"
|
280
|
+
|
281
|
+
if not voices or not isinstance(voices, list):
|
282
|
+
raise ValueError("voices must be a non-empty list")
|
283
|
+
|
284
|
+
response = httpx.post(
|
285
|
+
url,
|
286
|
+
headers=self.headers,
|
287
|
+
json={"voices": voices},
|
288
|
+
timeout=self.timeout,
|
289
|
+
)
|
290
|
+
|
291
|
+
if not response.is_success:
|
292
|
+
raise ValueError(f"Failed to mix voices. Error: {response.text}")
|
293
|
+
|
294
|
+
result = response.json()
|
295
|
+
return result["embedding"]
|
296
|
+
|
264
297
|
|
265
298
|
class _TTSContext:
|
266
299
|
"""Manage a single context over a WebSocket.
|
@@ -294,6 +327,7 @@ class _TTSContext:
|
|
294
327
|
context_id: Optional[str] = None,
|
295
328
|
duration: Optional[int] = None,
|
296
329
|
language: Optional[str] = None,
|
330
|
+
add_timestamps: bool = False,
|
297
331
|
_experimental_voice_controls: Optional[VoiceControls] = None,
|
298
332
|
) -> Generator[bytes, None, None]:
|
299
333
|
"""Send audio generation requests to the WebSocket and yield responses.
|
@@ -307,6 +341,7 @@ class _TTSContext:
|
|
307
341
|
context_id: The context ID to use for the request. If not specified, a random context ID will be generated.
|
308
342
|
duration: The duration of the audio in seconds.
|
309
343
|
language: The language code for the audio request. This can only be used with `model_id = sonic-multilingual`
|
344
|
+
add_timestamps: Whether to return word-level timestamps.
|
310
345
|
_experimental_voice_controls: Experimental voice controls for controlling speed and emotion.
|
311
346
|
Note: This is an experimental feature and may change rapidly in future releases.
|
312
347
|
|
@@ -341,6 +376,7 @@ class _TTSContext:
|
|
341
376
|
},
|
342
377
|
"context_id": self._context_id,
|
343
378
|
"language": language,
|
379
|
+
"add_timestamps": add_timestamps,
|
344
380
|
}
|
345
381
|
|
346
382
|
if duration is not None:
|
@@ -858,9 +894,9 @@ class TTS(Resource):
|
|
858
894
|
raise ValueError("Only one of voice_id or voice_embedding should be specified.")
|
859
895
|
|
860
896
|
if voice_id:
|
861
|
-
voice = {"
|
897
|
+
voice = {"id": voice_id}
|
862
898
|
else:
|
863
|
-
voice = {"
|
899
|
+
voice = {"embedding": voice_embedding}
|
864
900
|
if experimental_voice_controls is not None:
|
865
901
|
voice["__experimental_controls"] = experimental_voice_controls
|
866
902
|
return voice
|
@@ -0,0 +1 @@
|
|
1
|
+
__version__ = "1.0.12"
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: cartesia
|
3
|
-
Version: 1.0.10
|
3
|
+
Version: 1.0.12
|
4
4
|
Summary: The official Python library for the Cartesia API.
|
5
5
|
Home-page:
|
6
6
|
Author: Cartesia, Inc.
|
@@ -73,6 +73,11 @@ print("The embedding for", voice["name"], "is", voice["embedding"])
|
|
73
73
|
# Clone a voice using filepath
|
74
74
|
cloned_voice_embedding = client.voices.clone(filepath="path/to/voice")
|
75
75
|
|
76
|
+
# Mix voices together
|
77
|
+
mixed_voice_embedding = client.voices.mix(
|
78
|
+
[{ "id": "voice_id_1", "weight": 0.5 }, { "id": "voice_id_2", "weight": 0.25 }, { "id": "voice_id_3", "weight": 0.25 }]
|
79
|
+
)
|
80
|
+
|
76
81
|
# Create a new voice
|
77
82
|
new_voice = client.voices.create(
|
78
83
|
name="New Voice",
|
@@ -504,6 +509,7 @@ You can enhance the voice output by adjusting the `speed` and `emotion` paramete
|
|
504
509
|
|
505
510
|
Speed Options:
|
506
511
|
- `slowest`, `slow`, `normal`, `fast`, `fastest`
|
512
|
+
- Float values between -1.0 and 1.0, where -1.0 is the slowest speed and 1.0 is the fastest speed.
|
507
513
|
|
508
514
|
Emotion Options:
|
509
515
|
Use a list of tags in the format `emotion_name:level` where:
|
@@ -102,6 +102,11 @@ def test_create_voice(client: Cartesia):
|
|
102
102
|
assert voice["is_public"] is False
|
103
103
|
voices = client.voices.list()
|
104
104
|
assert voice in voices
|
105
|
+
|
106
|
+
def test_mix_voice(client: Cartesia):
|
107
|
+
logger.info("Testing voices.mix")
|
108
|
+
output = client.voices.mix(voices = [{"id": SAMPLE_VOICE_ID, "weight": 0.1}, {"id": SAMPLE_VOICE_ID, "weight": 0.9}])
|
109
|
+
assert isinstance(output, list)
|
105
110
|
|
106
111
|
@pytest.mark.parametrize("stream", [True, False])
|
107
112
|
@pytest.mark.parametrize("_experimental_voice_controls", [None, EXPERIMENTAL_VOICE_CONTROLS, EXPERIMENTAL_VOICE_CONTROLS_2])
|
@@ -190,7 +195,6 @@ def test_websocket_send_timestamps(resources: _Resources, stream: bool):
|
|
190
195
|
|
191
196
|
ws.close()
|
192
197
|
|
193
|
-
|
194
198
|
@pytest.mark.parametrize("_experimental_voice_controls", [None, EXPERIMENTAL_VOICE_CONTROLS, EXPERIMENTAL_VOICE_CONTROLS_2])
|
195
199
|
def test_sse_send_context_manager(resources: _Resources, _experimental_voice_controls: VoiceControls):
|
196
200
|
logger.info("Testing SSE send context manager")
|
@@ -456,6 +460,28 @@ def test_sync_continuation_websocket_context_send():
|
|
456
460
|
assert isinstance(out["audio"], bytes)
|
457
461
|
finally:
|
458
462
|
ws.close()
|
463
|
+
|
464
|
+
def test_sync_context_send_timestamps(resources: _Resources):
|
465
|
+
logger.info("Testing WebSocket send")
|
466
|
+
client = resources.client
|
467
|
+
transcripts = ["Hello, world!", "I'\''m generating audio on Cartesia."]
|
468
|
+
|
469
|
+
ws = client.tts.websocket()
|
470
|
+
ctx = ws.context()
|
471
|
+
output_generate = ctx.send(transcript=chunk_generator(transcripts), voice_id=SAMPLE_VOICE_ID, output_format={
|
472
|
+
"container": "raw",
|
473
|
+
"encoding": "pcm_f32le",
|
474
|
+
"sample_rate": 44100
|
475
|
+
}, model_id=DEFAULT_MODEL_ID, add_timestamps=True)
|
476
|
+
|
477
|
+
has_wordtimestamps = False
|
478
|
+
for out in output_generate:
|
479
|
+
has_wordtimestamps |= "word_timestamps" in out
|
480
|
+
_validate_schema(out)
|
481
|
+
|
482
|
+
assert has_wordtimestamps, "No word timestamps found"
|
483
|
+
|
484
|
+
ws.close()
|
459
485
|
|
460
486
|
@pytest.mark.asyncio
|
461
487
|
async def test_continuation_websocket_context_send():
|
@@ -1 +0,0 @@
|
|
1
|
-
__version__ = "1.0.10"
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|