cartesia 1.0.3__py2.py3-none-any.whl → 1.0.5__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
cartesia/_types.py CHANGED
@@ -45,15 +45,16 @@ class DeprecatedOutputFormatMapping:
45
45
  "mulaw_8000": {"container": "raw", "encoding": "pcm_mulaw", "sample_rate": 8000},
46
46
  "alaw_8000": {"container": "raw", "encoding": "pcm_alaw", "sample_rate": 8000},
47
47
  }
48
-
48
+
49
+ @classmethod
49
50
  @deprecated(
50
51
  vdeprecated="1.0.1",
51
52
  vremove="1.2.0",
52
53
  reason="Old output format names are being deprecated in favor of names aligned with the Cartesia API. Use names from `OutputFormatMapping` instead.",
53
54
  )
54
- def get_format_deprecated(self, format_name):
55
- if format_name in self._format_mapping:
56
- return self._format_mapping[format_name]
55
+ def get_format_deprecated(cls, format_name):
56
+ if format_name in cls._format_mapping:
57
+ return cls._format_mapping[format_name]
57
58
  else:
58
59
  raise ValueError(f"Unsupported format: {format_name}")
59
60
 
cartesia/client.py CHANGED
@@ -4,16 +4,28 @@ import json
4
4
  import os
5
5
  import uuid
6
6
  from types import TracebackType
7
- from typing import Any, AsyncGenerator, Dict, Generator, List, Optional, Tuple, Union, Callable
7
+ from typing import (
8
+ Any,
9
+ AsyncGenerator,
10
+ Iterator,
11
+ Dict,
12
+ Generator,
13
+ List,
14
+ Optional,
15
+ Tuple,
16
+ Union,
17
+ Callable,
18
+ Set,
19
+ )
8
20
 
9
21
  import aiohttp
10
22
  import httpx
11
23
  import logging
12
24
  import requests
13
25
  from websockets.sync.client import connect
26
+ from iterators import TimeoutIterator
14
27
 
15
28
  from cartesia.utils.retry import retry_on_connection_error, retry_on_connection_error_async
16
- from cartesia.utils.deprecated import deprecated
17
29
  from cartesia._types import (
18
30
  OutputFormat,
19
31
  OutputFormatMapping,
@@ -36,22 +48,34 @@ logger = logging.getLogger(__name__)
36
48
 
37
49
 
38
50
  class BaseClient:
39
- def __init__(self, *, api_key: Optional[str] = None, timeout: float = DEFAULT_TIMEOUT):
51
+ def __init__(
52
+ self,
53
+ *,
54
+ api_key: Optional[str] = None,
55
+ base_url: Optional[str] = None,
56
+ timeout: float = DEFAULT_TIMEOUT,
57
+ ):
40
58
  """Constructor for the BaseClient. Used by the Cartesia and AsyncCartesia clients."""
41
59
  self.api_key = api_key or os.environ.get("CARTESIA_API_KEY")
60
+ self._base_url = base_url or os.environ.get("CARTESIA_BASE_URL", DEFAULT_BASE_URL)
42
61
  self.timeout = timeout
43
62
 
63
+ @property
64
+ def base_url(self):
65
+ return self._base_url
66
+
44
67
 
45
68
  class Resource:
46
69
  def __init__(
47
70
  self,
48
71
  api_key: str,
72
+ base_url: str,
49
73
  timeout: float,
50
74
  ):
51
75
  """Constructor for the Resource class. Used by the Voices and TTS classes."""
52
76
  self.api_key = api_key
53
77
  self.timeout = timeout
54
- self.base_url = os.environ.get("CARTESIA_BASE_URL", DEFAULT_BASE_URL)
78
+ self._base_url = base_url
55
79
  self.cartesia_version = DEFAULT_CARTESIA_VERSION
56
80
  self.headers = {
57
81
  "X-API-Key": self.api_key,
@@ -59,25 +83,29 @@ class Resource:
59
83
  "Content-Type": "application/json",
60
84
  }
61
85
 
86
+ @property
87
+ def base_url(self):
88
+ return self._base_url
89
+
62
90
  def _http_url(self):
63
91
  """Returns the HTTP URL for the Cartesia API.
64
92
  If the base URL is localhost, the URL will start with 'http'. Otherwise, it will start with 'https'.
65
93
  """
66
- if self.base_url.startswith("http://") or self.base_url.startswith("https://"):
67
- return self.base_url
94
+ if self._base_url.startswith("http://") or self._base_url.startswith("https://"):
95
+ return self._base_url
68
96
  else:
69
- prefix = "http" if "localhost" in self.base_url else "https"
70
- return f"{prefix}://{self.base_url}"
97
+ prefix = "http" if "localhost" in self._base_url else "https"
98
+ return f"{prefix}://{self._base_url}"
71
99
 
72
100
  def _ws_url(self):
73
101
  """Returns the WebSocket URL for the Cartesia API.
74
102
  If the base URL is localhost, the URL will start with 'ws'. Otherwise, it will start with 'wss'.
75
103
  """
76
- if self.base_url.startswith("ws://") or self.base_url.startswith("wss://"):
77
- return self.base_url
104
+ if self._base_url.startswith("ws://") or self._base_url.startswith("wss://"):
105
+ return self._base_url
78
106
  else:
79
- prefix = "ws" if "localhost" in self.base_url else "wss"
80
- return f"{prefix}://{self.base_url}"
107
+ prefix = "ws" if "localhost" in self._base_url else "wss"
108
+ return f"{prefix}://{self._base_url}"
81
109
 
82
110
 
83
111
  class Cartesia(BaseClient):
@@ -90,18 +118,27 @@ class Cartesia(BaseClient):
90
118
  The client supports generating audio using both Server-Sent Events and WebSocket for lower latency.
91
119
  """
92
120
 
93
- def __init__(self, *, api_key: Optional[str] = None, timeout: float = DEFAULT_TIMEOUT):
121
+ def __init__(
122
+ self,
123
+ *,
124
+ api_key: Optional[str] = None,
125
+ base_url: Optional[str] = None,
126
+ timeout: float = DEFAULT_TIMEOUT,
127
+ ):
94
128
  """Constructor for the Cartesia client.
95
129
 
96
130
  Args:
97
131
  api_key: The API key to use for authorization.
98
132
  If not specified, the API key will be read from the environment variable
99
133
  `CARTESIA_API_KEY`.
100
- timeout: The timeout for the HTTP requests in seconds. Defaults to 30 seconds.
134
+ base_url: The base URL for the Cartesia API.
135
+ If not specified, the base URL will be read from the environment variable
136
+ `CARTESIA_BASE_URL`. Defaults to `api.cartesia.ai`.
137
+ timeout: The timeout for HTTP and WebSocket requests in seconds. Defaults to 30 seconds.
101
138
  """
102
- super().__init__(api_key=api_key, timeout=timeout)
103
- self.voices = Voices(api_key=self.api_key, timeout=self.timeout)
104
- self.tts = TTS(api_key=self.api_key, timeout=self.timeout)
139
+ super().__init__(api_key=api_key, base_url=base_url, timeout=timeout)
140
+ self.voices = Voices(api_key=self.api_key, base_url=self._base_url, timeout=self.timeout)
141
+ self.tts = TTS(api_key=self.api_key, base_url=self._base_url, timeout=self.timeout)
105
142
 
106
143
  def __enter__(self):
107
144
  return self
@@ -188,7 +225,6 @@ class Voices(Resource):
188
225
  files = {"clip": file}
189
226
  headers = self.headers.copy()
190
227
  headers.pop("Content-Type", None)
191
- headers["Content-Type"] = "multipart/form-data"
192
228
  response = httpx.post(url, headers=headers, files=files, timeout=self.timeout)
193
229
  if not response.is_success:
194
230
  raise ValueError(f"Failed to clone voice from clip. Error: {response.text}")
@@ -227,14 +263,174 @@ class Voices(Resource):
227
263
  return response.json()
228
264
 
229
265
 
266
+ class _TTSContext:
267
+ """Manage a single context over a WebSocket.
268
+
269
+ This class can be used to stream inputs, as they become available, to a specific `context_id`. See README for usage.
270
+
271
+ See :class:`_AsyncTTSContext` for asynchronous use cases.
272
+
273
+ Each TTSContext will close automatically when a done message is received for that context. It also closes if there is an error.
274
+ """
275
+
276
+ def __init__(self, context_id: str, websocket: "_WebSocket"):
277
+ self._context_id = context_id
278
+ self._websocket = websocket
279
+ self._error = None
280
+
281
+ def __del__(self):
282
+ self._close()
283
+
284
+ @property
285
+ def context_id(self) -> str:
286
+ return self._context_id
287
+
288
+ def send(
289
+ self,
290
+ model_id: str,
291
+ transcript: Iterator[str],
292
+ output_format: OutputFormat,
293
+ voice_id: Optional[str] = None,
294
+ voice_embedding: Optional[List[float]] = None,
295
+ context_id: Optional[str] = None,
296
+ duration: Optional[int] = None,
297
+ language: Optional[str] = None,
298
+ ) -> Generator[bytes, None, None]:
299
+ """Send audio generation requests to the WebSocket and yield responses.
300
+
301
+ Args:
302
+ model_id: The ID of the model to use for generating audio.
303
+ transcript: Iterator over text chunks with <1s latency.
304
+ output_format: A dictionary containing the details of the output format.
305
+ voice_id: The ID of the voice to use for generating audio.
306
+ voice_embedding: The embedding of the voice to use for generating audio.
307
+ context_id: The context ID to use for the request. If not specified, a random context ID will be generated.
308
+ duration: The duration of the audio in seconds.
309
+ language: The language code for the audio request. This can only be used with `model_id = sonic-multilingual`
310
+
311
+ Yields:
312
+ Dictionary containing the following key(s):
313
+ - audio: The audio as bytes.
314
+ - context_id: The context ID for the request.
315
+
316
+ Raises:
317
+ ValueError: If provided context_id doesn't match the current context.
318
+ RuntimeError: If there's an error generating audio.
319
+ """
320
+ if context_id is not None and context_id != self._context_id:
321
+ raise ValueError("Context ID does not match the context ID of the current context.")
322
+
323
+ self._websocket.connect()
324
+
325
+ voice = self._websocket._validate_and_construct_voice(voice_id, voice_embedding)
326
+
327
+ # Create the initial request body
328
+ request_body = {
329
+ "model_id": model_id,
330
+ "voice": voice,
331
+ "output_format": {
332
+ "container": output_format["container"],
333
+ "encoding": output_format["encoding"],
334
+ "sample_rate": output_format["sample_rate"],
335
+ },
336
+ "context_id": self._context_id,
337
+ "language": language,
338
+ }
339
+
340
+ if duration is not None:
341
+ request_body["duration"] = duration
342
+
343
+ try:
344
+ # Create an iterator with a timeout to get text chunks
345
+ text_iterator = TimeoutIterator(
346
+ transcript, timeout=0.001
347
+ ) # 1ms timeout for nearly non-blocking receive
348
+ next_chunk = next(text_iterator, None)
349
+
350
+ while True:
351
+ # Send the next text chunk to the WebSocket if available
352
+ if next_chunk is not None and next_chunk != text_iterator.get_sentinel():
353
+ request_body["transcript"] = next_chunk
354
+ request_body["continue"] = True
355
+ self._websocket.websocket.send(json.dumps(request_body))
356
+ next_chunk = next(text_iterator, None)
357
+
358
+ try:
359
+ # Receive responses from the WebSocket with a small timeout
360
+ response = json.loads(
361
+ self._websocket.websocket.recv(timeout=0.001)
362
+ ) # 1ms timeout for nearly non-blocking receive
363
+ if response["context_id"] != self._context_id:
364
+ pass
365
+ if "error" in response:
366
+ raise RuntimeError(f"Error generating audio:\n{response['error']}")
367
+ if response["done"]:
368
+ break
369
+ if response["data"]:
370
+ yield self._websocket._convert_response(
371
+ response=response, include_context_id=True
372
+ )
373
+ except TimeoutError:
374
+ pass
375
+
376
+ # Continuously receive from WebSocket until the next text chunk is available
377
+ while next_chunk == text_iterator.get_sentinel():
378
+ try:
379
+ response = json.loads(self._websocket.websocket.recv(timeout=0.001))
380
+ if response["context_id"] != self._context_id:
381
+ continue
382
+ if "error" in response:
383
+ raise RuntimeError(f"Error generating audio:\n{response['error']}")
384
+ if response["done"]:
385
+ break
386
+ if response["data"]:
387
+ yield self._websocket._convert_response(
388
+ response=response, include_context_id=True
389
+ )
390
+ except TimeoutError:
391
+ pass
392
+ next_chunk = next(text_iterator, None)
393
+
394
+ # Send final message if all input text chunks are exhausted
395
+ if next_chunk is None:
396
+ request_body["transcript"] = ""
397
+ request_body["continue"] = False
398
+ self._websocket.websocket.send(json.dumps(request_body))
399
+ break
400
+
401
+ # Receive remaining messages from the WebSocket until "done" is received
402
+ while True:
403
+ response = json.loads(self._websocket.websocket.recv())
404
+ if response["context_id"] != self._context_id:
405
+ continue
406
+ if "error" in response:
407
+ raise RuntimeError(f"Error generating audio:\n{response['error']}")
408
+ if response["done"]:
409
+ break
410
+ yield self._websocket._convert_response(response=response, include_context_id=True)
411
+
412
+ except Exception as e:
413
+ self._websocket.close()
414
+ raise RuntimeError(f"Failed to generate audio. {e}")
415
+
416
+ def _close(self):
417
+ """Closes the context. Automatically called when a done message is received for this context."""
418
+ self._websocket._remove_context(self._context_id)
419
+
420
+ def is_closed(self):
421
+ """Check if the context is closed or not. Returns True if closed."""
422
+ return self._context_id not in self._websocket._contexts
423
+
424
+
230
425
  class _WebSocket:
231
426
  """This class contains methods to generate audio using WebSocket. Ideal for low-latency audio generation.
232
427
 
233
428
  Usage:
234
429
  >>> ws = client.tts.websocket()
235
430
  >>> for audio_chunk in ws.send(
236
- ... model_id="upbeat-moon", transcript="Hello world!", voice_embedding=embedding,
237
- ... output_format={"container": "raw", "encoding": "pcm_f32le", "sample_rate": 44100}, stream=True
431
+ ... model_id="sonic-english", transcript="Hello world!", voice_embedding=embedding,
432
+ ... output_format={"container": "raw", "encoding": "pcm_f32le", "sample_rate": 44100},
433
+ ... context_id=context_id, stream=True
238
434
  ... ):
239
435
  ... audio = audio_chunk["audio"]
240
436
  """
@@ -249,23 +445,40 @@ class _WebSocket:
249
445
  self.api_key = api_key
250
446
  self.cartesia_version = cartesia_version
251
447
  self.websocket = None
448
+ self._contexts: Set[str] = set()
449
+
450
+ def __del__(self):
451
+ try:
452
+ self.close()
453
+ except Exception as e:
454
+ raise RuntimeError("Failed to close WebSocket: ", e)
252
455
 
253
456
  def connect(self):
254
- """This method connects to the WebSocket if it is not already connected."""
457
+ """This method connects to the WebSocket if it is not already connected.
458
+
459
+ Raises:
460
+ RuntimeError: If the connection to the WebSocket fails.
461
+ """
255
462
  if self.websocket is None or self._is_websocket_closed():
256
463
  route = "tts/websocket"
257
- self.websocket = connect(
258
- f"{self.ws_url}/{route}?api_key={self.api_key}&cartesia_version={self.cartesia_version}"
259
- )
464
+ try:
465
+ self.websocket = connect(
466
+ f"{self.ws_url}/{route}?api_key={self.api_key}&cartesia_version={self.cartesia_version}"
467
+ )
468
+ except Exception as e:
469
+ raise RuntimeError(f"Failed to connect to WebSocket. {e}")
260
470
 
261
471
  def _is_websocket_closed(self):
262
472
  return self.websocket.socket.fileno() == -1
263
473
 
264
474
  def close(self):
265
475
  """This method closes the WebSocket connection. *Highly* recommended to call this method when done using the WebSocket."""
266
- if self.websocket is not None and not self._is_websocket_closed():
476
+ if self.websocket and not self._is_websocket_closed():
267
477
  self.websocket.close()
268
478
 
479
+ if self._contexts:
480
+ self._contexts.clear()
481
+
269
482
  def _convert_response(
270
483
  self, response: Dict[str, any], include_context_id: bool
271
484
  ) -> Dict[str, Any]:
@@ -329,7 +542,7 @@ class _WebSocket:
329
542
  context_id: The context ID to use for the request. If not specified, a random context ID will be generated.
330
543
  duration: The duration of the audio in seconds.
331
544
  language: The language code for the audio request. This can only be used with `model_id = sonic-multilingual`
332
- stream: Whether to stream the audio or not. (Default is True)
545
+ stream: Whether to stream the audio or not.
333
546
 
334
547
  Returns:
335
548
  If `stream` is True, the method returns a generator that yields chunks. Each chunk is a dictionary.
@@ -341,7 +554,7 @@ class _WebSocket:
341
554
  self.connect()
342
555
 
343
556
  if context_id is None:
344
- context_id = uuid.uuid4().hex
557
+ context_id = str(uuid.uuid4())
345
558
 
346
559
  voice = self._validate_and_construct_voice(voice_id, voice_embedding)
347
560
 
@@ -385,17 +598,29 @@ class _WebSocket:
385
598
  yield self._convert_response(response=response, include_context_id=True)
386
599
  except Exception as e:
387
600
  # Close the websocket connection if an error occurs.
388
- if self.websocket and not self._is_websocket_closed():
389
- self.websocket.close()
601
+ self.close()
390
602
  raise RuntimeError(f"Failed to generate audio. {response}") from e
391
603
 
604
+ def _remove_context(self, context_id: str):
605
+ if context_id in self._contexts:
606
+ self._contexts.remove(context_id)
607
+
608
+ def context(self, context_id: Optional[str] = None) -> _TTSContext:
609
+ if context_id in self._contexts:
610
+ raise ValueError(f"Context for context ID {context_id} already exists.")
611
+ if context_id is None:
612
+ context_id = str(uuid.uuid4())
613
+ if context_id not in self._contexts:
614
+ self._contexts.add(context_id)
615
+ return _TTSContext(context_id, self)
616
+
392
617
 
393
618
  class _SSE:
394
619
  """This class contains methods to generate audio using Server-Sent Events.
395
620
 
396
621
  Usage:
397
622
  >>> for audio_chunk in client.tts.sse(
398
- ... model_id="upbeat-moon", transcript="Hello world!", voice_embedding=embedding,
623
+ ... model_id="sonic-english", transcript="Hello world!", voice_embedding=embedding,
399
624
  ... output_format={"container": "raw", "encoding": "pcm_f32le", "sample_rate": 44100}, stream=True
400
625
  ... ):
401
626
  ... audio = audio_chunk["audio"]
@@ -523,8 +748,7 @@ class _SSE:
523
748
  for chunk in self._sse_generator(request_body):
524
749
  yield chunk
525
750
  except Exception as e:
526
- logger.error(f"Failed to generate audio. {e}")
527
- raise e
751
+ raise RuntimeError(f"Error generating audio. {e}")
528
752
 
529
753
  def _sse_generator(self, request_body: Dict[str, Any]):
530
754
  response = requests.post(
@@ -555,9 +779,10 @@ class _SSE:
555
779
  class TTS(Resource):
556
780
  """This resource contains methods to generate audio using Cartesia's text-to-speech API."""
557
781
 
558
- def __init__(self, api_key, timeout):
782
+ def __init__(self, api_key: str, base_url: str, timeout: float):
559
783
  super().__init__(
560
784
  api_key=api_key,
785
+ base_url=base_url,
561
786
  timeout=timeout,
562
787
  )
563
788
  self._sse_class = _SSE(self._http_url(), self.headers, self.timeout)
@@ -573,7 +798,8 @@ class TTS(Resource):
573
798
  ws.connect()
574
799
  return ws
575
800
 
576
- def get_output_format(self, output_format_name: str) -> OutputFormat:
801
+ @staticmethod
802
+ def get_output_format(output_format_name: str) -> OutputFormat:
577
803
  """Convenience method to get the output_format dictionary from a given output format name.
578
804
 
579
805
  Args:
@@ -631,22 +857,27 @@ class AsyncCartesia(Cartesia):
631
857
  self,
632
858
  *,
633
859
  api_key: Optional[str] = None,
860
+ base_url: Optional[str] = None,
634
861
  timeout: float = DEFAULT_TIMEOUT,
635
862
  max_num_connections: int = DEFAULT_NUM_CONNECTIONS,
636
863
  ):
637
864
  """
638
865
  Args:
639
866
  api_key: See :class:`Cartesia`.
867
+ base_url: See :class:`Cartesia`.
640
868
  timeout: See :class:`Cartesia`.
641
869
  max_num_connections: The maximum number of concurrent connections to use for the client.
642
870
  This is used to limit the number of connections that can be made to the server.
643
871
  """
644
872
  self._session = None
645
873
  self._loop = None
646
- super().__init__(api_key=api_key, timeout=timeout)
874
+ super().__init__(api_key=api_key, base_url=base_url, timeout=timeout)
647
875
  self.max_num_connections = max_num_connections
648
876
  self.tts = AsyncTTS(
649
- api_key=self.api_key, timeout=self.timeout, get_session=self._get_session
877
+ api_key=self.api_key,
878
+ base_url=self._base_url,
879
+ timeout=self.timeout,
880
+ get_session=self._get_session,
650
881
  )
651
882
 
652
883
  async def _get_session(self):
@@ -677,7 +908,7 @@ class AsyncCartesia(Cartesia):
677
908
 
678
909
  if loop is None:
679
910
  asyncio.run(self.close())
680
- else:
911
+ elif loop.is_running():
681
912
  loop.create_task(self.close())
682
913
 
683
914
  async def __aenter__(self):
@@ -753,8 +984,7 @@ class _AsyncSSE(_SSE):
753
984
  async for chunk in self._sse_generator(request_body):
754
985
  yield chunk
755
986
  except Exception as e:
756
- logger.error(f"Failed to generate audio. {e}")
757
- raise e
987
+ raise RuntimeError(f"Error generating audio. {e}")
758
988
 
759
989
  async def _sse_generator(self, request_body: Dict[str, Any]):
760
990
  session = await self._get_session()
@@ -779,6 +1009,145 @@ class _AsyncSSE(_SSE):
779
1009
  pass
780
1010
 
781
1011
 
1012
+ class _AsyncTTSContext:
1013
+ """Manage a single context over an AsyncWebSocket.
1014
+
1015
+ This class separates sending requests and receiving responses into two separate methods.
1016
+ This can be used for sending multiple requests without awaiting the response.
1017
+ Then you can listen to the responses in the order they were sent. See README for usage.
1018
+
1019
+ Each AsyncTTSContext will close automatically when a done message is received for that context.
1020
+ This happens when the no_more_inputs method is called (equivalent to sending a request with `continue_ = False`),
1021
+ or if no requests have been sent for 5 seconds on the same context. It also closes if there is an error.
1022
+
1023
+ """
1024
+
1025
+ def __init__(self, context_id: str, websocket: "_AsyncWebSocket", timeout: float):
1026
+ self._context_id = context_id
1027
+ self._websocket = websocket
1028
+ self.timeout = timeout
1029
+ self._error = None
1030
+
1031
+ @property
1032
+ def context_id(self) -> str:
1033
+ return self._context_id
1034
+
1035
+ async def send(
1036
+ self,
1037
+ model_id: str,
1038
+ transcript: str,
1039
+ output_format: OutputFormat,
1040
+ voice_id: Optional[str] = None,
1041
+ voice_embedding: Optional[List[float]] = None,
1042
+ context_id: Optional[str] = None,
1043
+ continue_: bool = False,
1044
+ duration: Optional[int] = None,
1045
+ language: Optional[str] = None,
1046
+ ) -> None:
1047
+ """Send audio generation requests to the WebSocket. The response can be received using the `receive` method.
1048
+
1049
+ Args:
1050
+ model_id: The ID of the model to use for generating audio.
1051
+ transcript: The text to convert to speech.
1052
+ output_format: A dictionary containing the details of the output format.
1053
+ voice_id: The ID of the voice to use for generating audio.
1054
+ voice_embedding: The embedding of the voice to use for generating audio.
1055
+ context_id: The context ID to use for the request. If not specified, a random context ID will be generated.
1056
+ continue_: Whether to continue the audio generation from the previous transcript or not.
1057
+ duration: The duration of the audio in seconds.
1058
+ language: The language code for the audio request. This can only be used with `model_id = sonic-multilingual`
1059
+
1060
+ Returns:
1061
+ None.
1062
+ """
1063
+ if context_id is not None and context_id != self._context_id:
1064
+ raise ValueError("Context ID does not match the context ID of the current context.")
1065
+ if continue_ and transcript == "":
1066
+ raise ValueError("Transcript cannot be empty when continue_ is True.")
1067
+
1068
+ await self._websocket.connect()
1069
+
1070
+ voice = self._websocket._validate_and_construct_voice(voice_id, voice_embedding)
1071
+
1072
+ request_body = {
1073
+ "model_id": model_id,
1074
+ "transcript": transcript,
1075
+ "voice": voice,
1076
+ "output_format": {
1077
+ "container": output_format["container"],
1078
+ "encoding": output_format["encoding"],
1079
+ "sample_rate": output_format["sample_rate"],
1080
+ },
1081
+ "context_id": self._context_id,
1082
+ "continue": continue_,
1083
+ "language": language,
1084
+ }
1085
+
1086
+ if duration is not None:
1087
+ request_body["duration"] = duration
1088
+
1089
+ await self._websocket.websocket.send_json(request_body)
1090
+
1091
+ # Start listening for responses on the WebSocket
1092
+ self._websocket._dispatch_listener()
1093
+
1094
+ async def no_more_inputs(self) -> None:
1095
+ """Send a request to the WebSocket to indicate that no more requests will be sent."""
1096
+ await self.send(
1097
+ model_id=DEFAULT_MODEL_ID,
1098
+ transcript="",
1099
+ output_format=TTS.get_output_format("raw_pcm_f32le_44100"),
1100
+ voice_id="a0e99841-438c-4a64-b679-ae501e7d6091", # Default voice ID since it's a required input for now
1101
+ context_id=self._context_id,
1102
+ continue_=False,
1103
+ )
1104
+
1105
+ async def receive(self) -> AsyncGenerator[Dict[str, Any], None]:
1106
+ """Receive the audio chunks from the WebSocket. This method is a generator that yields audio chunks.
1107
+
1108
+ Returns:
1109
+ An async generator that yields audio chunks. Each chunk is a dictionary containing the audio as bytes.
1110
+ """
1111
+ try:
1112
+ while True:
1113
+ response = await self._websocket._get_message(
1114
+ self._context_id, timeout=self.timeout
1115
+ )
1116
+ if "error" in response:
1117
+ raise RuntimeError(f"Error generating audio:\n{response['error']}")
1118
+ if response["done"]:
1119
+ break
1120
+ yield self._websocket._convert_response(response, include_context_id=True)
1121
+ except Exception as e:
1122
+ if isinstance(e, asyncio.TimeoutError):
1123
+ raise RuntimeError("Timeout while waiting for audio chunk")
1124
+ raise RuntimeError(f"Failed to generate audio:\n{e}")
1125
+ finally:
1126
+ self._close()
1127
+
1128
+ def _close(self) -> None:
1129
+ """Closes the context. Automatically called when a done message is received for this context."""
1130
+ self._websocket._remove_context(self._context_id)
1131
+
1132
+ def is_closed(self):
1133
+ """Check if the context is closed or not. Returns True if closed."""
1134
+ return self._context_id not in self._websocket._context_queues
1135
+
1136
+ async def __aenter__(self):
1137
+ return self
1138
+
1139
+ async def __aexit__(
1140
+ self,
1141
+ exc_type: Union[type, None],
1142
+ exc: Union[BaseException, None],
1143
+ exc_tb: Union[TracebackType, None],
1144
+ ):
1145
+ self._close()
1146
+
1147
+ def __del__(self):
1148
+ self._close()
1149
+
1150
+
782
1151
  class _AsyncWebSocket(_WebSocket):
783
1152
  """This class contains methods to generate audio using WebSocket asynchronously."""
784
1153
 
@@ -787,19 +1156,45 @@ class _AsyncWebSocket(_WebSocket):
787
1156
  ws_url: str,
788
1157
  api_key: str,
789
1158
  cartesia_version: str,
1159
+ timeout: float,
790
1160
  get_session: Callable[[], Optional[aiohttp.ClientSession]],
791
1161
  ):
1162
+ """
1163
+ Args:
1164
+ ws_url: The WebSocket URL for the Cartesia API.
1165
+ api_key: The API key to use for authorization.
1166
+ cartesia_version: The version of the Cartesia API to use.
1167
+ timeout: The timeout for responses on the WebSocket in seconds.
1168
+ get_session: A function that returns an aiohttp.ClientSession object.
1169
+ """
792
1170
  super().__init__(ws_url, api_key, cartesia_version)
1171
+ self.timeout = timeout
793
1172
  self._get_session = get_session
794
1173
  self.websocket = None
1174
+ self._context_queues: Dict[str, asyncio.Queue] = {}
1175
+ self._processing_task: asyncio.Task = None
1176
+
1177
+ def __del__(self):
1178
+ try:
1179
+ loop = asyncio.get_running_loop()
1180
+ except RuntimeError:
1181
+ loop = None
1182
+
1183
+ if loop is None:
1184
+ asyncio.run(self.close())
1185
+ elif loop.is_running():
1186
+ loop.create_task(self.close())
795
1187
 
796
1188
  async def connect(self):
797
1189
  if self.websocket is None or self._is_websocket_closed():
798
1190
  route = "tts/websocket"
799
1191
  session = await self._get_session()
800
- self.websocket = await session.ws_connect(
801
- f"{self.ws_url}/{route}?api_key={self.api_key}&cartesia_version={self.cartesia_version}"
802
- )
1192
+ try:
1193
+ self.websocket = await session.ws_connect(
1194
+ f"{self.ws_url}/{route}?api_key={self.api_key}&cartesia_version={self.cartesia_version}"
1195
+ )
1196
+ except Exception as e:
1197
+ raise RuntimeError(f"Failed to connect to WebSocket. {e}")
803
1198
 
804
1199
  def _is_websocket_closed(self):
805
1200
  return self.websocket.closed
@@ -808,6 +1203,25 @@ class _AsyncWebSocket(_WebSocket):
808
1203
  """This method closes the websocket connection. *Highly* recommended to call this method when done."""
809
1204
  if self.websocket is not None and not self._is_websocket_closed():
810
1205
  await self.websocket.close()
1206
+ if self._processing_task:
1207
+ self._processing_task.cancel()
1208
+ try:
1209
+ self._processing_task = None
1210
+ except asyncio.CancelledError:
1211
+ pass
1212
+ except TypeError as e:
1213
+ # Ignore the error if the task is already cancelled
1214
+ # For some reason we are getting None responses
1215
+ # TODO: This needs to be fixed - we need to think about why we are getting None responses.
1216
+ if "Received message 256:None" not in str(e):
1217
+ raise e
1218
+
1219
+ for context_id in list(self._context_queues.keys()):
1220
+ self._remove_context(context_id)
1221
+
1222
+ self._context_queues.clear()
1223
+ self._processing_task = None
1224
+ self.websocket = None
811
1225
 
812
1226
  async def send(
813
1227
  self,
@@ -819,32 +1233,26 @@ class _AsyncWebSocket(_WebSocket):
819
1233
  context_id: Optional[str] = None,
820
1234
  duration: Optional[int] = None,
821
1235
  language: Optional[str] = None,
822
- stream: Optional[bool] = True,
1236
+ stream: bool = True,
823
1237
  ) -> Union[bytes, AsyncGenerator[bytes, None]]:
824
- await self.connect()
825
-
826
1238
  if context_id is None:
827
- context_id = uuid.uuid4().hex
828
-
829
- voice = self._validate_and_construct_voice(voice_id, voice_embedding)
830
-
831
- request_body = {
832
- "model_id": model_id,
833
- "transcript": transcript,
834
- "voice": voice,
835
- "output_format": {
836
- "container": output_format["container"],
837
- "encoding": output_format["encoding"],
838
- "sample_rate": output_format["sample_rate"],
839
- },
840
- "context_id": context_id,
841
- "language": language,
842
- }
843
-
844
- if duration is not None:
845
- request_body["duration"] = duration
1239
+ context_id = str(uuid.uuid4())
1240
+
1241
+ ctx = self.context(context_id)
1242
+
1243
+ await ctx.send(
1244
+ model_id=model_id,
1245
+ transcript=transcript,
1246
+ output_format=output_format,
1247
+ voice_id=voice_id,
1248
+ voice_embedding=voice_embedding,
1249
+ context_id=context_id,
1250
+ duration=duration,
1251
+ language=language,
1252
+ continue_=False,
1253
+ )
846
1254
 
847
- generator = self._websocket_generator(request_body)
1255
+ generator = ctx.receive()
848
1256
 
849
1257
  if stream:
850
1258
  return generator
@@ -855,35 +1263,51 @@ class _AsyncWebSocket(_WebSocket):
855
1263
 
856
1264
  return {"audio": b"".join(chunks), "context_id": context_id}
857
1265
 
858
- async def _websocket_generator(self, request_body: Dict[str, Any]):
859
- await self.websocket.send_json(request_body)
860
-
1266
+ async def _process_responses(self):
861
1267
  try:
862
- response = None
863
1268
  while True:
864
1269
  response = await self.websocket.receive_json()
865
- if "error" in response:
866
- raise RuntimeError(f"Error generating audio:\n{response['error']}")
867
- if response["done"]:
868
- break
869
-
870
- yield self._convert_response(response=response, include_context_id=True)
1270
+ if response["context_id"]:
1271
+ context_id = response["context_id"]
1272
+ if context_id in self._context_queues:
1273
+ await self._context_queues[context_id].put(response)
871
1274
  except Exception as e:
872
- # Close the websocket connection if an error occurs.
873
- if self.websocket and not self._is_websocket_closed():
874
- await self.websocket.close()
875
- error_msg_end = "" if response is None else f": {await response.text()}"
876
- raise RuntimeError(f"Failed to generate audio. {error_msg_end}") from e
1275
+ self._error = e
1276
+ raise e
1277
+
1278
+ async def _get_message(self, context_id: str, timeout: float) -> Dict[str, Any]:
1279
+ if context_id not in self._context_queues:
1280
+ raise ValueError(f"Context ID {context_id} not found.")
1281
+ return await asyncio.wait_for(self._context_queues[context_id].get(), timeout=timeout)
1282
+
1283
+ def _remove_context(self, context_id: str):
1284
+ if context_id in self._context_queues:
1285
+ del self._context_queues[context_id]
1286
+
1287
+ def _dispatch_listener(self):
1288
+ if self._processing_task is None or self._processing_task.done():
1289
+ self._processing_task = asyncio.create_task(self._process_responses())
1290
+
1291
+ def context(self, context_id: Optional[str] = None) -> _AsyncTTSContext:
1292
+ if context_id in self._context_queues:
1293
+ raise ValueError(f"AsyncContext for context ID {context_id} already exists.")
1294
+ if context_id is None:
1295
+ context_id = str(uuid.uuid4())
1296
+ if context_id not in self._context_queues:
1297
+ self._context_queues[context_id] = asyncio.Queue()
1298
+ return _AsyncTTSContext(context_id, self, self.timeout)
877
1299
 
878
1300
 
879
1301
  class AsyncTTS(TTS):
880
- def __init__(self, api_key, timeout, get_session):
881
- super().__init__(api_key, timeout)
1302
+ def __init__(self, api_key, base_url, timeout, get_session):
1303
+ super().__init__(api_key, base_url, timeout)
882
1304
  self._get_session = get_session
883
1305
  self._sse_class = _AsyncSSE(self._http_url(), self.headers, self.timeout, get_session)
884
1306
  self.sse = self._sse_class.send
885
1307
 
886
1308
  async def websocket(self) -> _AsyncWebSocket:
887
- ws = _AsyncWebSocket(self._ws_url(), self.api_key, self.cartesia_version, self._get_session)
1309
+ ws = _AsyncWebSocket(
1310
+ self._ws_url(), self.api_key, self.cartesia_version, self.timeout, self._get_session
1311
+ )
888
1312
  await ws.connect()
889
1313
  return ws
@@ -17,6 +17,8 @@ def deprecated(
17
17
  local_vars = locals()
18
18
 
19
19
  def fn(func: TCallable) -> TCallable:
20
+ if isinstance(func, classmethod):
21
+ func = func.__func__
20
22
  msg = _get_deprecated_msg(func, reason, vdeprecated, vremove, replacement)
21
23
  warnings.warn(msg, DeprecationWarning)
22
24
  return func
cartesia/version.py CHANGED
@@ -1 +1 @@
1
- __version__ = "1.0.3"
1
+ __version__ = "1.0.5"
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2024 Cartesia AI, Inc.
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: cartesia
3
- Version: 1.0.3
3
+ Version: 1.0.5
4
4
  Summary: The official Python library for the Cartesia API.
5
5
  Home-page:
6
6
  Author: Cartesia, Inc.
@@ -10,11 +10,13 @@ Classifier: Programming Language :: Python :: 3
10
10
  Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
11
11
  Requires-Python: >=3.8.0
12
12
  Description-Content-Type: text/markdown
13
+ License-File: LICENSE.md
13
14
  Requires-Dist: aiohttp
14
15
  Requires-Dist: httpx
15
16
  Requires-Dist: pytest-asyncio
16
17
  Requires-Dist: requests
17
18
  Requires-Dist: websockets
19
+ Requires-Dist: iterators
18
20
  Provides-Extra: all
19
21
  Requires-Dist: pytest >=8.0.2 ; extra == 'all'
20
22
  Requires-Dist: pytest-cov >=4.1.0 ; extra == 'all'
@@ -97,10 +99,10 @@ voice = client.voices.get(id=voice_id)
97
99
 
98
100
  transcript = "Hello! Welcome to Cartesia"
99
101
 
100
- # You can check out our models at [docs.cartesia.ai](https://docs.cartesia.ai/getting-started/available-models).
102
+ # You can check out our models at https://docs.cartesia.ai/getting-started/available-models
101
103
  model_id = "sonic-english"
102
104
 
103
- # You can find the supported `output_format`s in our [API Reference](https://docs.cartesia.ai/api-reference/endpoints/stream-speech-server-sent-events).
105
+ # You can find the supported `output_format`s at https://docs.cartesia.ai/api-reference/endpoints/stream-speech-server-sent-events
104
106
  output_format = {
105
107
  "container": "raw",
106
108
  "encoding": "pcm_f32le",
@@ -148,10 +150,10 @@ async def write_stream():
148
150
  voice_id = "a0e99841-438c-4a64-b679-ae501e7d6091"
149
151
  voice = client.voices.get(id=voice_id)
150
152
  transcript = "Hello! Welcome to Cartesia"
151
- # You can check out our models at [docs.cartesia.ai](https://docs.cartesia.ai/getting-started/available-models).
153
+ # You can check out our models at https://docs.cartesia.ai/getting-started/available-models
152
154
  model_id = "sonic-english"
153
155
 
154
- # You can find the supported `output_format`s in our [API Reference](https://docs.cartesia.ai/api-reference/endpoints/stream-speech-server-sent-events).
156
+ # You can find the supported `output_format`s at https://docs.cartesia.ai/api-reference/endpoints/stream-speech-server-sent-events
155
157
  output_format = {
156
158
  "container": "raw",
157
159
  "encoding": "pcm_f32le",
@@ -203,10 +205,10 @@ voice_id = "a0e99841-438c-4a64-b679-ae501e7d6091"
203
205
  voice = client.voices.get(id=voice_id)
204
206
  transcript = "Hello! Welcome to Cartesia"
205
207
 
206
- # You can check out our models at [docs.cartesia.ai](https://docs.cartesia.ai/getting-started/available-models).
208
+ # You can check out our models at https://docs.cartesia.ai/getting-started/available-models
207
209
  model_id = "sonic-english"
208
210
 
209
- # You can find the supported `output_format`s in our [API Reference](https://docs.cartesia.ai/api-reference/endpoints/stream-speech-server-sent-events).
211
+ # You can find the supported `output_format`s at https://docs.cartesia.ai/api-reference/endpoints/stream-speech-server-sent-events
210
212
  output_format = {
211
213
  "container": "raw",
212
214
  "encoding": "pcm_f32le",
@@ -244,6 +246,179 @@ p.terminate()
244
246
  ws.close() # Close the websocket connection
245
247
  ```
246
248
 
249
+ #### Conditioning speech on previous generations using WebSocket
250
+
251
+ In some cases, input text may need to be streamed in. In these cases, it would be slow to wait for all the text to buffer before sending it to Cartesia's TTS service.
252
+
253
+ To mitigate this, Cartesia offers audio continuations. In this setting, users can send input text, as it becomes available, over a websocket connection.
254
+
255
+ To do this, we will create a `context` and send multiple requests without awaiting the response. Then you can listen to the responses in the order they were sent.
256
+
257
+ Each `context` will be closed automatically after 5 seconds of inactivity or when the `no_more_inputs` method is called. `no_more_inputs` sends a request with `continue_=False`, which indicates that no more inputs will be sent over this context.
258
+
259
+ ```python
260
+ import asyncio
261
+ import os
262
+ import pyaudio
263
+ from cartesia import AsyncCartesia
264
+
265
+ async def send_transcripts(ctx):
266
+ # Check out voice IDs by calling `client.voices.list()` or on https://play.cartesia.ai/
267
+ voice_id = "87748186-23bb-4158-a1eb-332911b0b708"
268
+
269
+ # You can check out our models at https://docs.cartesia.ai/getting-started/available-models
270
+ model_id = "sonic-english"
271
+
272
+ # You can find the supported `output_format`s at https://docs.cartesia.ai/api-reference/endpoints/stream-speech-server-sent-events
273
+ output_format = {
274
+ "container": "raw",
275
+ "encoding": "pcm_f32le",
276
+ "sample_rate": 44100,
277
+ }
278
+
279
+ transcripts = [
280
+ "Sonic and Yoshi team up in a dimension-hopping adventure! ",
281
+ "Racing through twisting zones, they dodge Eggman's badniks and solve ancient puzzles. ",
282
+ "In the Echoing Caverns, they find the Harmonic Crystal, unlocking new powers. ",
283
+ "Sonic's speed creates sound waves, while Yoshi's eggs become sonic bolts. ",
284
+ "As they near Eggman's lair, our heroes charge their abilities for an epic boss battle. ",
285
+ "Get ready to spin, jump, and sound-blast your way to victory in this high-octane crossover!"
286
+ ]
287
+
288
+ for transcript in transcripts:
289
+ # Send text inputs as they become available
290
+ await ctx.send(
291
+ model_id=model_id,
292
+ transcript=transcript,
293
+ voice_id=voice_id,
294
+ continue_=True,
295
+ output_format=output_format,
296
+ )
297
+
298
+ # Indicate that no more inputs will be sent. Otherwise, the context will close after 5 seconds of inactivity.
299
+ await ctx.no_more_inputs()
300
+
301
+ async def receive_and_play_audio(ctx):
302
+ p = pyaudio.PyAudio()
303
+ stream = None
304
+ rate = 44100
305
+
306
+ async for output in ctx.receive():
307
+ buffer = output["audio"]
308
+
309
+ if not stream:
310
+ stream = p.open(
311
+ format=pyaudio.paFloat32,
312
+ channels=1,
313
+ rate=rate,
314
+ output=True
315
+ )
316
+
317
+ stream.write(buffer)
318
+
319
+ stream.stop_stream()
320
+ stream.close()
321
+ p.terminate()
322
+
323
+ async def stream_and_listen():
324
+ client = AsyncCartesia(api_key=os.environ.get("CARTESIA_API_KEY"))
325
+
326
+ # Set up the websocket connection
327
+ ws = await client.tts.websocket()
328
+
329
+ # Create a context to send and receive audio
330
+ ctx = ws.context() # Generates a random context ID if not provided
331
+
332
+ send_task = asyncio.create_task(send_transcripts(ctx))
333
+ listen_task = asyncio.create_task(receive_and_play_audio(ctx))
334
+
335
+ # Call the two coroutine tasks concurrently
336
+ await asyncio.gather(send_task, listen_task)
337
+
338
+ await ws.close()
339
+ await client.close()
340
+
341
+ asyncio.run(stream_and_listen())
342
+ ```
343
+
344
+ You can also use continuations on the synchronous Cartesia client to stream in text as it becomes available. To do this, pass in a text generator that produces text chunks at intervals of less than 1 second, as shown below. This ensures smooth audio playback.
345
+
346
+ Note: the sync client has a different API for continuations compared to the async client.
347
+
348
+ ```python
349
+ from cartesia import Cartesia
350
+ import pyaudio
351
+ import os
352
+
353
+ client = Cartesia(api_key=os.environ.get("CARTESIA_API_KEY"))
354
+
355
+ transcripts = [
356
+ "The crew engaged in a range of activities designed to mirror those "
357
+ "they might perform on a real Mars mission. ",
358
+ "Aside from growing vegetables and maintaining their habitat, they faced "
359
+ "additional stressors like communication delays with Earth, ",
360
+ "up to twenty-two minutes each way, to simulate the distance from Mars to our planet. ",
361
+ "These exercises were critical for understanding how astronauts can "
362
+ "maintain not just physical health but also mental well-being under such challenging conditions. ",
363
+ ]
364
+
365
+ # Ending each transcript with a space makes the audio smoother
366
+ def chunk_generator(transcripts):
367
+ for transcript in transcripts:
368
+ if transcript.endswith(" "):
369
+ yield transcript
370
+ else:
371
+ yield transcript + " "
372
+
373
+
374
+ # You can check out voice IDs by calling `client.voices.list()` or on https://play.cartesia.ai/
375
+ voice_id = "87748186-23bb-4158-a1eb-332911b0b708"
376
+
377
+ # You can check out our models at https://docs.cartesia.ai/getting-started/available-models
378
+ model_id = "sonic-english"
379
+
380
+ # You can find the supported `output_format`s at https://docs.cartesia.ai/api-reference/endpoints/stream-speech-server-sent-events
381
+ output_format = {
382
+ "container": "raw",
383
+ "encoding": "pcm_f32le",
384
+ "sample_rate": 44100,
385
+ }
386
+
387
+ p = pyaudio.PyAudio()
388
+ rate = 44100
389
+
390
+ stream = None
391
+
392
+ # Set up the websocket connection
393
+ ws = client.tts.websocket()
394
+
395
+ # Create a context to send and receive audio
396
+ ctx = ws.context() # Generates a random context ID if not provided
397
+
398
+ # Pass in a text generator to generate & stream the audio
399
+ output_stream = ctx.send(
400
+ model_id=model_id,
401
+ transcript=chunk_generator(transcripts),
402
+ voice_id=voice_id,
403
+ output_format=output_format,
404
+ )
405
+
406
+ for output in output_stream:
407
+ buffer = output["audio"]
408
+
409
+ if not stream:
410
+ stream = p.open(format=pyaudio.paFloat32, channels=1, rate=rate, output=True)
411
+
412
+ # Write the audio data to the stream
413
+ stream.write(buffer)
414
+
415
+ stream.stop_stream()
416
+ stream.close()
417
+ p.terminate()
418
+
419
+ ws.close() # Close the websocket connection
420
+ ```
421
+
247
422
  ### Multilingual Text-to-Speech [Alpha]
248
423
 
249
424
  You can use our `sonic-multilingual` model to generate audio in multiple languages. The languages supported are available at [docs.cartesia.ai](https://docs.cartesia.ai/getting-started/available-models).
@@ -261,10 +436,10 @@ voice = client.voices.get(id=voice_id)
261
436
  transcript = "Hola! Bienvenido a Cartesia"
262
437
  language = "es" # Language code corresponding to the language of the transcript
263
438
 
264
- # Make sure you use the multilingual model! You can check out all models at [docs.cartesia.ai](https://docs.cartesia.ai/getting-started/available-models).
439
+ # Make sure you use the multilingual model! You can check out all models at https://docs.cartesia.ai/getting-started/available-models
265
440
  model_id = "sonic-multilingual"
266
441
 
267
- # You can find the supported `output_format`s in our [API Reference](https://docs.cartesia.ai/api-reference/endpoints/stream-speech-server-sent-events).
442
+ # You can find the supported `output_format`s at https://docs.cartesia.ai/api-reference/endpoints/stream-speech-server-sent-events
268
443
  output_format = {
269
444
  "container": "raw",
270
445
  "encoding": "pcm_f32le",
@@ -0,0 +1,12 @@
1
+ cartesia/__init__.py,sha256=jMIf2O7dTGxvTA5AfXtmh1H_EGfMtQseR5wXrjNRbLs,93
2
+ cartesia/_types.py,sha256=tO3Nef_V78TDMKDuIv_wsQLkxoSvYG4bdzFkMGXUFho,3765
3
+ cartesia/client.py,sha256=46XiKTXa0gBXJ_GftMtLHAzBoX0GmWz_aWYuG68jaNQ,49316
4
+ cartesia/version.py,sha256=B9kKWJLln1i8LjtkcYecvNWGLTrez4gCUOHtnPlInFo,22
5
+ cartesia/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
6
+ cartesia/utils/deprecated.py,sha256=2cXvGtrxhPeUZA5LWy2n_U5OFLDv7SHeFtzqhjSJGyk,1674
7
+ cartesia/utils/retry.py,sha256=nuwWRfu3MOVTxIQMLjYf6WLaxSlnu_GdE3QjTV0zisQ,3339
8
+ cartesia-1.0.5.dist-info/LICENSE.md,sha256=PT2YG5wEtEX1TNDn5sXkUXqbn-neyr7cZenTxd40ql4,1074
9
+ cartesia-1.0.5.dist-info/METADATA,sha256=PImHYCNoo7iSnm3Br6PuRdqvli92c7AyXR4iagdv-d8,18368
10
+ cartesia-1.0.5.dist-info/WHEEL,sha256=DZajD4pwLWue70CAfc7YaxT1wLUciNBvN_TTcvXpltE,110
11
+ cartesia-1.0.5.dist-info/top_level.txt,sha256=rTX4HnnCegMxl1FK9czpVC7GAvf3SwDzPG65qP-BS4w,9
12
+ cartesia-1.0.5.dist-info/RECORD,,
@@ -1,11 +0,0 @@
1
- cartesia/__init__.py,sha256=jMIf2O7dTGxvTA5AfXtmh1H_EGfMtQseR5wXrjNRbLs,93
2
- cartesia/_types.py,sha256=msXRqNwVx_mbcLIQgRJYEl8U-hO9LRPWmscnX89cBCY,3747
3
- cartesia/client.py,sha256=jMlFDPRtKVDelqevHlv7YZJgOES3ws9BFN_6uUyN0W8,32720
4
- cartesia/version.py,sha256=2plzdEEb24FLjE2I2XyBBcJEPYWHccNL4SgtLC_6erg,22
5
- cartesia/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
6
- cartesia/utils/deprecated.py,sha256=QSM-ld_g1r-JlT571SSt6-w650jVm9mJmmI2MSScLPw,1599
7
- cartesia/utils/retry.py,sha256=nuwWRfu3MOVTxIQMLjYf6WLaxSlnu_GdE3QjTV0zisQ,3339
8
- cartesia-1.0.3.dist-info/METADATA,sha256=y5_HREGB417qL69qMFYtKrIRQAQJ1WDxqObaAg6-V6U,12394
9
- cartesia-1.0.3.dist-info/WHEEL,sha256=DZajD4pwLWue70CAfc7YaxT1wLUciNBvN_TTcvXpltE,110
10
- cartesia-1.0.3.dist-info/top_level.txt,sha256=rTX4HnnCegMxl1FK9czpVC7GAvf3SwDzPG65qP-BS4w,9
11
- cartesia-1.0.3.dist-info/RECORD,,