cartesia 1.0.10__tar.gz → 1.0.12__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: cartesia
-Version: 1.0.10
+Version: 1.0.12
 Summary: The official Python library for the Cartesia API.
 Home-page:
 Author: Cartesia, Inc.
@@ -73,6 +73,11 @@ print("The embedding for", voice["name"], "is", voice["embedding"])
 # Clone a voice using filepath
 cloned_voice_embedding = client.voices.clone(filepath="path/to/voice")
 
+# Mix voices together
+mixed_voice_embedding = client.voices.mix(
+    [{ "id": "voice_id_1", "weight": 0.5 }, { "id": "voice_id_2", "weight": 0.25 }, { "id": "voice_id_3", "weight": 0.25 }]
+)
+
 # Create a new voice
 new_voice = client.voices.create(
     name="New Voice",
@@ -504,6 +509,7 @@ You can enhance the voice output by adjusting the `speed` and `emotion` paramete
 
 Speed Options:
 - `slowest`, `slow`, `normal`, `fast`, `fastest`
+- Float values between -1.0 and 1.0, where -1.0 is the slowest speed and 1.0 is the fastest speed.
 
 Emotion Options:
 Use a list of tags in the format `emotion_name:level` where:
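
The float speeds extend the named presets above. Below is a minimal sketch of passing one through the experimental voice-controls parameter that appears in the client code later in this diff; the model ID, voice ID, and API key are placeholders, and the exact call signature should be checked against the package docs.

    # Hypothetical sketch: request slightly faster speech with a float speed.
    from cartesia import Cartesia

    client = Cartesia(api_key="your-api-key")  # placeholder key
    audio_chunks = client.tts.sse(
        model_id="sonic-english",      # placeholder model ID
        transcript="Hello, world!",
        voice_id="voice_id_1",         # placeholder voice ID
        output_format={"container": "raw", "encoding": "pcm_f32le", "sample_rate": 44100},
        _experimental_voice_controls={"speed": 0.5},  # float in [-1.0, 1.0]
    )
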
@@ -56,6 +56,11 @@ print("The embedding for", voice["name"], "is", voice["embedding"])
 # Clone a voice using filepath
 cloned_voice_embedding = client.voices.clone(filepath="path/to/voice")
 
+# Mix voices together
+mixed_voice_embedding = client.voices.mix(
+    [{ "id": "voice_id_1", "weight": 0.5 }, { "id": "voice_id_2", "weight": 0.25 }, { "id": "voice_id_3", "weight": 0.25 }]
+)
+
 # Create a new voice
 new_voice = client.voices.create(
     name="New Voice",
@@ -487,6 +492,7 @@ You can enhance the voice output by adjusting the `speed` and `emotion` paramete
 
 Speed Options:
 - `slowest`, `slow`, `normal`, `fast`, `fastest`
+- Float values between -1.0 and 1.0, where -1.0 is the slowest speed and 1.0 is the fastest speed.
 
 Emotion Options:
 Use a list of tags in the format `emotion_name:level` where:
@@ -32,7 +32,6 @@ except ImportError:
     IS_WEBSOCKET_SYNC_AVAILABLE = False
 
 from iterators import TimeoutIterator
-from websockets.sync.client import connect
 
 from cartesia._types import (
     DeprecatedOutputFormatMapping,
@@ -261,6 +260,40 @@ class Voices(Resource):
 
         return response.json()
 
+    def mix(self, voices: List[Dict[str, Union[str, float]]]) -> List[float]:
+        """Mix multiple voices together.
+
+        Args:
+            voices: A list of dictionaries, each containing either:
+                - 'id': The ID of an existing voice
+                - 'embedding': A voice embedding
+                AND
+                - 'weight': The weight of the voice in the mix (0.0 to 1.0)
+
+        Returns:
+            The embedding of the mixed voice as a list of floats.
+
+        Raises:
+            ValueError: If the request fails or if the input is invalid.
+        """
+        url = f"{self._http_url()}/voices/mix"
+
+        if not voices or not isinstance(voices, list):
+            raise ValueError("voices must be a non-empty list")
+
+        response = httpx.post(
+            url,
+            headers=self.headers,
+            json={"voices": voices},
+            timeout=self.timeout,
+        )
+
+        if not response.is_success:
+            raise ValueError(f"Failed to mix voices. Error: {response.text}")
+
+        result = response.json()
+        return result["embedding"]
+
 
 class _TTSContext:
     """Manage a single context over a WebSocket.
@@ -294,6 +327,7 @@ class _TTSContext:
        context_id: Optional[str] = None,
        duration: Optional[int] = None,
        language: Optional[str] = None,
+        add_timestamps: bool = False,
        _experimental_voice_controls: Optional[VoiceControls] = None,
    ) -> Generator[bytes, None, None]:
        """Send audio generation requests to the WebSocket and yield responses.
@@ -307,6 +341,7 @@ class _TTSContext:
            context_id: The context ID to use for the request. If not specified, a random context ID will be generated.
            duration: The duration of the audio in seconds.
            language: The language code for the audio request. This can only be used with `model_id = sonic-multilingual`
+            add_timestamps: Whether to return word-level timestamps.
            _experimental_voice_controls: Experimental voice controls for controlling speed and emotion.
                Note: This is an experimental feature and may change rapidly in future releases.
 
@@ -341,6 +376,7 @@ class _TTSContext:
            },
            "context_id": self._context_id,
            "language": language,
+            "add_timestamps": add_timestamps,
        }
 
        if duration is not None:
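
Taken together, the three add_timestamps changes above let callers request word-level timing. Here is a hedged sketch modeled on test_sync_context_send_timestamps near the end of this diff; the word_timestamps output key comes from that test, and the model ID and voice ID are placeholders.

    # Sketch: stream a context with add_timestamps=True and pick out the
    # word_timestamps entries that arrive alongside the audio chunks.
    def text_chunks():
        yield "Hello, world! "
        yield "I'm generating audio on Cartesia."

    ws = client.tts.websocket()
    ctx = ws.context()
    outputs = ctx.send(
        model_id="sonic-english",   # placeholder model ID
        transcript=text_chunks(),
        voice_id="voice_id_1",      # placeholder voice ID
        output_format={"container": "raw", "encoding": "pcm_f32le", "sample_rate": 44100},
        add_timestamps=True,
    )
    for out in outputs:
        if "word_timestamps" in out:
            print(out["word_timestamps"])  # word-level timing metadata
    ws.close()
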
@@ -858,9 +894,9 @@ class TTS(Resource):
            raise ValueError("Only one of voice_id or voice_embedding should be specified.")
 
        if voice_id:
-            voice = {"mode": "id", "id": voice_id}
+            voice = {"id": voice_id}
        else:
-            voice = {"mode": "embedding", "embedding": voice_embedding}
+            voice = {"embedding": voice_embedding}
        if experimental_voice_controls is not None:
            voice["__experimental_controls"] = experimental_voice_controls
        return voice
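
For clarity, the effect of this change on the request payload, with an illustrative voice ID: the explicit mode discriminator is gone, and the variant is implied by which key is present.

    # Voice payload shape before (1.0.10) and after (1.0.12); values illustrative.
    voice_in_1_0_10 = {"mode": "id", "id": "voice_id_1"}
    voice_in_1_0_12 = {"id": "voice_id_1"}  # "mode" key dropped
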
@@ -0,0 +1 @@
+__version__ = "1.0.12"
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: cartesia
-Version: 1.0.10
+Version: 1.0.12
 Summary: The official Python library for the Cartesia API.
 Home-page:
 Author: Cartesia, Inc.
@@ -73,6 +73,11 @@ print("The embedding for", voice["name"], "is", voice["embedding"])
 # Clone a voice using filepath
 cloned_voice_embedding = client.voices.clone(filepath="path/to/voice")
 
+# Mix voices together
+mixed_voice_embedding = client.voices.mix(
+    [{ "id": "voice_id_1", "weight": 0.5 }, { "id": "voice_id_2", "weight": 0.25 }, { "id": "voice_id_3", "weight": 0.25 }]
+)
+
 # Create a new voice
 new_voice = client.voices.create(
     name="New Voice",
@@ -504,6 +509,7 @@ You can enhance the voice output by adjusting the `speed` and `emotion` paramete
 
 Speed Options:
 - `slowest`, `slow`, `normal`, `fast`, `fastest`
+- Float values between -1.0 and 1.0, where -1.0 is the slowest speed and 1.0 is the fastest speed.
 
 Emotion Options:
 Use a list of tags in the format `emotion_name:level` where:
@@ -102,6 +102,11 @@ def test_create_voice(client: Cartesia):
     assert voice["is_public"] is False
     voices = client.voices.list()
     assert voice in voices
+
+def test_mix_voice(client: Cartesia):
+    logger.info("Testing voices.mix")
+    output = client.voices.mix(voices = [{"id": SAMPLE_VOICE_ID, "weight": 0.1}, {"id": SAMPLE_VOICE_ID, "weight": 0.9}])
+    assert isinstance(output, list)
 
 @pytest.mark.parametrize("stream", [True, False])
 @pytest.mark.parametrize("_experimental_voice_controls", [None, EXPERIMENTAL_VOICE_CONTROLS, EXPERIMENTAL_VOICE_CONTROLS_2])
@@ -190,7 +195,6 @@ def test_websocket_send_timestamps(resources: _Resources, stream: bool):
 
     ws.close()
 
-
 @pytest.mark.parametrize("_experimental_voice_controls", [None, EXPERIMENTAL_VOICE_CONTROLS, EXPERIMENTAL_VOICE_CONTROLS_2])
 def test_sse_send_context_manager(resources: _Resources, _experimental_voice_controls: VoiceControls):
     logger.info("Testing SSE send context manager")
@@ -456,6 +460,28 @@ def test_sync_continuation_websocket_context_send():
         assert isinstance(out["audio"], bytes)
     finally:
         ws.close()
+
+def test_sync_context_send_timestamps(resources: _Resources):
+    logger.info("Testing WebSocket send")
+    client = resources.client
+    transcripts = ["Hello, world!", "I'm generating audio on Cartesia."]
+
+    ws = client.tts.websocket()
+    ctx = ws.context()
+    output_generate = ctx.send(transcript=chunk_generator(transcripts), voice_id=SAMPLE_VOICE_ID, output_format={
+        "container": "raw",
+        "encoding": "pcm_f32le",
+        "sample_rate": 44100
+    }, model_id=DEFAULT_MODEL_ID, add_timestamps=True)
+
+    has_wordtimestamps = False
+    for out in output_generate:
+        has_wordtimestamps |= "word_timestamps" in out
+        _validate_schema(out)
+
+    assert has_wordtimestamps, "No word timestamps found"
+
+    ws.close()
 
 @pytest.mark.asyncio
 async def test_continuation_websocket_context_send():
@@ -1 +0,0 @@
-__version__ = "1.0.10"