cartesia 1.0.12__tar.gz → 1.0.14__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35) hide show
  1. {cartesia-1.0.12 → cartesia-1.0.14}/PKG-INFO +10 -10
  2. {cartesia-1.0.12 → cartesia-1.0.14}/README.md +9 -9
  3. cartesia-1.0.14/cartesia/__init__.py +4 -0
  4. cartesia-1.0.14/cartesia/_async_sse.py +105 -0
  5. cartesia-1.0.14/cartesia/_async_websocket.py +323 -0
  6. cartesia-1.0.14/cartesia/_constants.py +10 -0
  7. cartesia-1.0.14/cartesia/_logger.py +3 -0
  8. cartesia-1.0.14/cartesia/_sse.py +152 -0
  9. {cartesia-1.0.12 → cartesia-1.0.14}/cartesia/_types.py +3 -2
  10. cartesia-1.0.14/cartesia/_websocket.py +374 -0
  11. cartesia-1.0.14/cartesia/async_client.py +82 -0
  12. cartesia-1.0.14/cartesia/async_tts.py +22 -0
  13. cartesia-1.0.14/cartesia/client.py +69 -0
  14. cartesia-1.0.14/cartesia/resource.py +44 -0
  15. cartesia-1.0.14/cartesia/tts.py +109 -0
  16. cartesia-1.0.14/cartesia/utils/tts.py +25 -0
  17. cartesia-1.0.14/cartesia/version.py +1 -0
  18. cartesia-1.0.14/cartesia/voices.py +170 -0
  19. {cartesia-1.0.12 → cartesia-1.0.14}/cartesia.egg-info/PKG-INFO +10 -10
  20. {cartesia-1.0.12 → cartesia-1.0.14}/cartesia.egg-info/SOURCES.txt +12 -0
  21. {cartesia-1.0.12 → cartesia-1.0.14}/cartesia.egg-info/requires.txt +4 -0
  22. {cartesia-1.0.12 → cartesia-1.0.14}/tests/test_tts.py +635 -285
  23. cartesia-1.0.12/cartesia/__init__.py +0 -3
  24. cartesia-1.0.12/cartesia/client.py +0 -1390
  25. cartesia-1.0.12/cartesia/version.py +0 -1
  26. {cartesia-1.0.12 → cartesia-1.0.14}/LICENSE.md +0 -0
  27. {cartesia-1.0.12 → cartesia-1.0.14}/cartesia/utils/__init__.py +0 -0
  28. {cartesia-1.0.12 → cartesia-1.0.14}/cartesia/utils/deprecated.py +0 -0
  29. {cartesia-1.0.12 → cartesia-1.0.14}/cartesia/utils/retry.py +0 -0
  30. {cartesia-1.0.12 → cartesia-1.0.14}/cartesia.egg-info/dependency_links.txt +0 -0
  31. {cartesia-1.0.12 → cartesia-1.0.14}/cartesia.egg-info/top_level.txt +0 -0
  32. {cartesia-1.0.12 → cartesia-1.0.14}/pyproject.toml +0 -0
  33. {cartesia-1.0.12 → cartesia-1.0.14}/setup.cfg +0 -0
  34. {cartesia-1.0.12 → cartesia-1.0.14}/setup.py +0 -0
  35. {cartesia-1.0.12 → cartesia-1.0.14}/tests/test_deprecated.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: cartesia
3
- Version: 1.0.12
3
+ Version: 1.0.14
4
4
  Summary: The official Python library for the Cartesia API.
5
5
  Home-page:
6
6
  Author: Cartesia, Inc.
@@ -18,12 +18,12 @@ License-File: LICENSE.md
18
18
  # Cartesia Python API Library
19
19
 
20
20
  ![PyPI - Version](https://img.shields.io/pypi/v/cartesia)
21
- [![Discord](https://badgen.net/badge/black/Cartesia/icon?icon=discord&label)](https://discord.gg/ZVxavqHB9X)
21
+ [![Discord](https://badgen.net/badge/black/Cartesia/icon?icon=discord&label)](https://discord.gg/cartesia)
22
22
 
23
23
  The official Cartesia Python library which provides convenient access to the Cartesia REST and Websocket API from any Python 3.8+ application.
24
24
 
25
25
  > [!IMPORTANT]
26
- > The client library introduces breaking changes in v1.0.0, which was released on June 24th 2024. See the [release notes](https://github.com/cartesia-ai/cartesia-python/releases/tag/v1.0.0) and [migration guide](https://github.com/cartesia-ai/cartesia-python/discussions/44). Reach out to us on [Discord](https://discord.gg/ZVxavqHB9X) for any support requests!
26
+ > The client library introduces breaking changes in v1.0.0, which was released on June 24th 2024. See the [release notes](https://github.com/cartesia-ai/cartesia-python/releases/tag/v1.0.0) and [migration guide](https://github.com/cartesia-ai/cartesia-python/discussions/44). Reach out to us on [Discord](https://discord.gg/cartesia) for any support requests!
27
27
 
28
28
  - [Cartesia Python API Library](#cartesia-python-api-library)
29
29
  - [Documentation](#documentation)
@@ -105,7 +105,7 @@ transcript = "Hello! Welcome to Cartesia"
105
105
  # You can check out our models at https://docs.cartesia.ai/getting-started/available-models
106
106
  model_id = "sonic-english"
107
107
 
108
- # You can find the supported `output_format`s at https://docs.cartesia.ai/api-reference/endpoints/stream-speech-server-sent-events
108
+ # You can find the supported `output_format`s at https://docs.cartesia.ai/reference/api-reference/rest/stream-speech-server-sent-events
109
109
  output_format = {
110
110
  "container": "raw",
111
111
  "encoding": "pcm_f32le",
@@ -156,7 +156,7 @@ async def write_stream():
156
156
  # You can check out our models at https://docs.cartesia.ai/getting-started/available-models
157
157
  model_id = "sonic-english"
158
158
 
159
- # You can find the supported `output_format`s at https://docs.cartesia.ai/api-reference/endpoints/stream-speech-server-sent-events
159
+ # You can find the supported `output_format`s at https://docs.cartesia.ai/reference/api-reference/rest/stream-speech-server-sent-events
160
160
  output_format = {
161
161
  "container": "raw",
162
162
  "encoding": "pcm_f32le",
@@ -211,7 +211,7 @@ transcript = "Hello! Welcome to Cartesia"
211
211
  # You can check out our models at https://docs.cartesia.ai/getting-started/available-models
212
212
  model_id = "sonic-english"
213
213
 
214
- # You can find the supported `output_format`s at https://docs.cartesia.ai/api-reference/endpoints/stream-speech-server-sent-events
214
+ # You can find the supported `output_format`s at https://docs.cartesia.ai/reference/api-reference/rest/stream-speech-server-sent-events
215
215
  output_format = {
216
216
  "container": "raw",
217
217
  "encoding": "pcm_f32le",
@@ -272,7 +272,7 @@ async def send_transcripts(ctx):
272
272
  # You can check out our models at https://docs.cartesia.ai/getting-started/available-models
273
273
  model_id = "sonic-english"
274
274
 
275
- # You can find the supported `output_format`s at https://docs.cartesia.ai/api-reference/endpoints/stream-speech-server-sent-events
275
+ # You can find the supported `output_format`s at https://docs.cartesia.ai/reference/api-reference/rest/stream-speech-server-sent-events
276
276
  output_format = {
277
277
  "container": "raw",
278
278
  "encoding": "pcm_f32le",
@@ -380,7 +380,7 @@ voice_id = "87748186-23bb-4158-a1eb-332911b0b708"
380
380
  # You can check out our models at https://docs.cartesia.ai/getting-started/available-models
381
381
  model_id = "sonic-english"
382
382
 
383
- # You can find the supported `output_format`s at https://docs.cartesia.ai/api-reference/endpoints/stream-speech-server-sent-events
383
+ # You can find the supported `output_format`s at https://docs.cartesia.ai/reference/api-reference/rest/stream-speech-server-sent-events
384
384
  output_format = {
385
385
  "container": "raw",
386
386
  "encoding": "pcm_f32le",
@@ -470,7 +470,7 @@ language = "es" # Language code corresponding to the language of the transcript
470
470
  # Make sure you use the multilingual model! You can check out all models at https://docs.cartesia.ai/getting-started/available-models
471
471
  model_id = "sonic-multilingual"
472
472
 
473
- # You can find the supported `output_format`s at https://docs.cartesia.ai/api-reference/endpoints/stream-speech-server-sent-events
473
+ # You can find the supported `output_format`s at https://docs.cartesia.ai/reference/api-reference/rest/stream-speech-server-sent-events
474
474
  output_format = {
475
475
  "container": "raw",
476
476
  "encoding": "pcm_f32le",
@@ -623,7 +623,7 @@ display(audio)
623
623
 
624
624
  #### Output Formats
625
625
 
626
- You can use the `client.tts.get_output_format` method to convert string-based output format names into the `output_format` dictionary which is expected by the `output_format` parameter. You can see the `OutputFormatMapping` class in `cartesia._types` for the currently supported output format names. You can also view the currently supported `output_format`s in our [API Reference](https://docs.cartesia.ai/api-reference/endpoints/stream-speech-server-sent-events).
626
+ You can use the `client.tts.get_output_format` method to convert string-based output format names into the `output_format` dictionary which is expected by the `output_format` parameter. You can see the `OutputFormatMapping` class in `cartesia._types` for the currently supported output format names. You can also view the currently supported `output_format`s in our [API Reference](https://docs.cartesia.ai/reference/api-reference/rest/stream-speech-server-sent-events).
627
627
 
628
628
  The previously used `output_format` strings are now deprecated and will be removed in v1.2.0. These are listed in the `DeprecatedOutputFormatMapping` class in `cartesia._types`.
629
629
 
@@ -1,12 +1,12 @@
1
1
  # Cartesia Python API Library
2
2
 
3
3
  ![PyPI - Version](https://img.shields.io/pypi/v/cartesia)
4
- [![Discord](https://badgen.net/badge/black/Cartesia/icon?icon=discord&label)](https://discord.gg/ZVxavqHB9X)
4
+ [![Discord](https://badgen.net/badge/black/Cartesia/icon?icon=discord&label)](https://discord.gg/cartesia)
5
5
 
6
6
  The official Cartesia Python library which provides convenient access to the Cartesia REST and Websocket API from any Python 3.8+ application.
7
7
 
8
8
  > [!IMPORTANT]
9
- > The client library introduces breaking changes in v1.0.0, which was released on June 24th 2024. See the [release notes](https://github.com/cartesia-ai/cartesia-python/releases/tag/v1.0.0) and [migration guide](https://github.com/cartesia-ai/cartesia-python/discussions/44). Reach out to us on [Discord](https://discord.gg/ZVxavqHB9X) for any support requests!
9
+ > The client library introduces breaking changes in v1.0.0, which was released on June 24th 2024. See the [release notes](https://github.com/cartesia-ai/cartesia-python/releases/tag/v1.0.0) and [migration guide](https://github.com/cartesia-ai/cartesia-python/discussions/44). Reach out to us on [Discord](https://discord.gg/cartesia) for any support requests!
10
10
 
11
11
  - [Cartesia Python API Library](#cartesia-python-api-library)
12
12
  - [Documentation](#documentation)
@@ -88,7 +88,7 @@ transcript = "Hello! Welcome to Cartesia"
88
88
  # You can check out our models at https://docs.cartesia.ai/getting-started/available-models
89
89
  model_id = "sonic-english"
90
90
 
91
- # You can find the supported `output_format`s at https://docs.cartesia.ai/api-reference/endpoints/stream-speech-server-sent-events
91
+ # You can find the supported `output_format`s at https://docs.cartesia.ai/reference/api-reference/rest/stream-speech-server-sent-events
92
92
  output_format = {
93
93
  "container": "raw",
94
94
  "encoding": "pcm_f32le",
@@ -139,7 +139,7 @@ async def write_stream():
139
139
  # You can check out our models at https://docs.cartesia.ai/getting-started/available-models
140
140
  model_id = "sonic-english"
141
141
 
142
- # You can find the supported `output_format`s at https://docs.cartesia.ai/api-reference/endpoints/stream-speech-server-sent-events
142
+ # You can find the supported `output_format`s at https://docs.cartesia.ai/reference/api-reference/rest/stream-speech-server-sent-events
143
143
  output_format = {
144
144
  "container": "raw",
145
145
  "encoding": "pcm_f32le",
@@ -194,7 +194,7 @@ transcript = "Hello! Welcome to Cartesia"
194
194
  # You can check out our models at https://docs.cartesia.ai/getting-started/available-models
195
195
  model_id = "sonic-english"
196
196
 
197
- # You can find the supported `output_format`s at https://docs.cartesia.ai/api-reference/endpoints/stream-speech-server-sent-events
197
+ # You can find the supported `output_format`s at https://docs.cartesia.ai/reference/api-reference/rest/stream-speech-server-sent-events
198
198
  output_format = {
199
199
  "container": "raw",
200
200
  "encoding": "pcm_f32le",
@@ -255,7 +255,7 @@ async def send_transcripts(ctx):
255
255
  # You can check out our models at https://docs.cartesia.ai/getting-started/available-models
256
256
  model_id = "sonic-english"
257
257
 
258
- # You can find the supported `output_format`s at https://docs.cartesia.ai/api-reference/endpoints/stream-speech-server-sent-events
258
+ # You can find the supported `output_format`s at https://docs.cartesia.ai/reference/api-reference/rest/stream-speech-server-sent-events
259
259
  output_format = {
260
260
  "container": "raw",
261
261
  "encoding": "pcm_f32le",
@@ -363,7 +363,7 @@ voice_id = "87748186-23bb-4158-a1eb-332911b0b708"
363
363
  # You can check out our models at https://docs.cartesia.ai/getting-started/available-models
364
364
  model_id = "sonic-english"
365
365
 
366
- # You can find the supported `output_format`s at https://docs.cartesia.ai/api-reference/endpoints/stream-speech-server-sent-events
366
+ # You can find the supported `output_format`s at https://docs.cartesia.ai/reference/api-reference/rest/stream-speech-server-sent-events
367
367
  output_format = {
368
368
  "container": "raw",
369
369
  "encoding": "pcm_f32le",
@@ -453,7 +453,7 @@ language = "es" # Language code corresponding to the language of the transcript
453
453
  # Make sure you use the multilingual model! You can check out all models at https://docs.cartesia.ai/getting-started/available-models
454
454
  model_id = "sonic-multilingual"
455
455
 
456
- # You can find the supported `output_format`s at https://docs.cartesia.ai/api-reference/endpoints/stream-speech-server-sent-events
456
+ # You can find the supported `output_format`s at https://docs.cartesia.ai/reference/api-reference/rest/stream-speech-server-sent-events
457
457
  output_format = {
458
458
  "container": "raw",
459
459
  "encoding": "pcm_f32le",
@@ -606,7 +606,7 @@ display(audio)
606
606
 
607
607
  #### Output Formats
608
608
 
609
- You can use the `client.tts.get_output_format` method to convert string-based output format names into the `output_format` dictionary which is expected by the `output_format` parameter. You can see the `OutputFormatMapping` class in `cartesia._types` for the currently supported output format names. You can also view the currently supported `output_format`s in our [API Reference](https://docs.cartesia.ai/api-reference/endpoints/stream-speech-server-sent-events).
609
+ You can use the `client.tts.get_output_format` method to convert string-based output format names into the `output_format` dictionary which is expected by the `output_format` parameter. You can see the `OutputFormatMapping` class in `cartesia._types` for the currently supported output format names. You can also view the currently supported `output_format`s in our [API Reference](https://docs.cartesia.ai/reference/api-reference/rest/stream-speech-server-sent-events).
610
610
 
611
611
  The previously used `output_format` strings are now deprecated and will be removed in v1.2.0. These are listed in the `DeprecatedOutputFormatMapping` class in `cartesia._types`.
612
612
 
@@ -0,0 +1,4 @@
1
+ from cartesia.async_client import AsyncCartesia
2
+ from cartesia.client import Cartesia
3
+
4
+ __all__ = ["Cartesia", "AsyncCartesia"]
@@ -0,0 +1,105 @@
1
+ import base64
2
+ import json
3
+ from typing import Any, AsyncGenerator, Callable, Dict, List, Optional, Union
4
+
5
+ import aiohttp
6
+
7
+ from cartesia._constants import BACKOFF_FACTOR, MAX_RETRIES
8
+ from cartesia._logger import logger
9
+ from cartesia._sse import _SSE
10
+ from cartesia._types import OutputFormat, VoiceControls
11
+ from cartesia.tts import TTS
12
+ from cartesia.utils.retry import retry_on_connection_error_async
13
+
14
+
15
class _AsyncSSE(_SSE):
    """Generate audio via Server-Sent Events (SSE) asynchronously.

    Mirrors the synchronous :class:`_SSE` interface but performs the HTTP
    request and chunk parsing with ``aiohttp`` on a shared client session.
    """

    def __init__(
        self,
        http_url: str,
        headers: Dict[str, str],
        timeout: float,
        get_session: Callable[[], Optional[aiohttp.ClientSession]],
    ):
        """
        Args:
            http_url: Base HTTP URL for the Cartesia API.
            headers: Headers sent with every request (authorization etc.).
            timeout: Request timeout in seconds.
            get_session: Awaitable factory returning the shared aiohttp session.
        """
        super().__init__(http_url, headers, timeout)
        self._get_session = get_session

    async def send(
        self,
        model_id: str,
        transcript: str,
        output_format: OutputFormat,
        voice_id: Optional[str] = None,
        voice_embedding: Optional[List[float]] = None,
        duration: Optional[int] = None,
        language: Optional[str] = None,
        stream: bool = True,
        _experimental_voice_controls: Optional[VoiceControls] = None,
    ) -> Union[Dict[str, Any], AsyncGenerator[Dict[str, Any], None]]:
        """Send a TTS request over SSE.

        Args:
            model_id: ID of the model used to generate audio.
            transcript: Text to synthesize.
            output_format: Dict with ``container``, ``encoding`` and ``sample_rate``.
            voice_id: ID of the voice to use.
            voice_embedding: Raw voice embedding used instead of a voice ID.
            duration: Optional target duration of the audio in seconds.
            language: Optional language code (multilingual models only).
            stream: If True, return an async generator of chunks; otherwise
                collect everything and return one ``{"audio": bytes}`` dict.
            _experimental_voice_controls: Experimental speed/emotion controls.
                Note: may change rapidly in future releases.

        Returns:
            An async generator yielding ``{"audio": bytes}`` chunks when
            ``stream`` is True, else a single dict with the concatenated audio.
        """
        voice = TTS._validate_and_construct_voice(
            voice_id,
            voice_embedding=voice_embedding,
            experimental_voice_controls=_experimental_voice_controls,
        )

        request_body = {
            "model_id": model_id,
            "transcript": transcript,
            "voice": voice,
            "output_format": {
                "container": output_format["container"],
                "encoding": output_format["encoding"],
                "sample_rate": output_format["sample_rate"],
            },
            # Sent even when None; the server treats null as "use default".
            "language": language,
        }

        if duration is not None:
            request_body["duration"] = duration

        generator = self._sse_generator_wrapper(request_body)

        if stream:
            return generator

        chunks = []
        async for chunk in generator:
            chunks.append(chunk["audio"])

        return {"audio": b"".join(chunks)}

    @retry_on_connection_error_async(
        max_retries=MAX_RETRIES, backoff_factor=BACKOFF_FACTOR, logger=logger
    )
    async def _sse_generator_wrapper(self, request_body: Dict[str, Any]):
        """Wrap the SSE generator in a function so the retry decorator can re-invoke it."""
        try:
            async for chunk in self._sse_generator(request_body):
                yield chunk
        except Exception as e:
            # Chain the original exception so the root cause is preserved
            # for debugging (previously it was discarded).
            raise RuntimeError(f"Error generating audio. {e}") from e

    async def _sse_generator(self, request_body: Dict[str, Any]):
        """POST the request and yield decoded audio chunks as they arrive."""
        session = await self._get_session()
        async with session.post(
            f"{self.http_url}/tts/sse",
            data=json.dumps(request_body),
            headers=self.headers,
        ) as response:
            if not response.ok:
                raise ValueError(f"Failed to generate audio. {await response.text()}")

            buffer = ""
            async for chunk_bytes in response.content.iter_any():
                # _update_buffer (inherited from _SSE) splits the raw byte
                # stream into complete SSE events and returns the leftover tail.
                buffer, outputs = self._update_buffer(buffer=buffer, chunk_bytes=chunk_bytes)
                for output in outputs:
                    yield output

            # Flush a trailing event left in the buffer, if it parses cleanly.
            if buffer:
                try:
                    chunk_json = json.loads(buffer)
                    audio = base64.b64decode(chunk_json["data"])
                    yield {"audio": audio}
                except json.JSONDecodeError:
                    pass
@@ -0,0 +1,323 @@
1
+ import asyncio
2
+ import uuid
3
+ from collections import defaultdict
4
+ from types import TracebackType
5
+ from typing import Any, AsyncGenerator, Callable, Dict, List, Optional, Union
6
+
7
+ import aiohttp
8
+
9
+ from cartesia._constants import DEFAULT_MODEL_ID, DEFAULT_VOICE_EMBEDDING
10
+ from cartesia._types import OutputFormat, VoiceControls
11
+ from cartesia._websocket import _WebSocket
12
+ from cartesia.tts import TTS
13
+
14
+
15
+ class _AsyncTTSContext:
16
+ """Manage a single context over an AsyncWebSocket.
17
+
18
+ This class separates sending requests and receiving responses into two separate methods.
19
+ This can be used for sending multiple requests without awaiting the response.
20
+ Then you can listen to the responses in the order they were sent. See README for usage.
21
+
22
+ Each AsyncTTSContext will close automatically when a done message is received for that context.
23
+ This happens when the no_more_inputs method is called (equivalent to sending a request with `continue_ = False`),
24
+ or if no requests have been sent for 5 seconds on the same context. It also closes if there is an error.
25
+
26
+ """
27
+
28
+ def __init__(self, context_id: str, websocket: "_AsyncWebSocket", timeout: float):
29
+ self._context_id = context_id
30
+ self._websocket = websocket
31
+ self.timeout = timeout
32
+ self._error = None
33
+
34
+ @property
35
+ def context_id(self) -> str:
36
+ return self._context_id
37
+
38
+ async def send(
39
+ self,
40
+ model_id: str,
41
+ transcript: str,
42
+ output_format: OutputFormat,
43
+ voice_id: Optional[str] = None,
44
+ voice_embedding: Optional[List[float]] = None,
45
+ context_id: Optional[str] = None,
46
+ continue_: bool = False,
47
+ duration: Optional[int] = None,
48
+ language: Optional[str] = None,
49
+ add_timestamps: bool = False,
50
+ _experimental_voice_controls: Optional[VoiceControls] = None,
51
+ ) -> None:
52
+ """Send audio generation requests to the WebSocket. The response can be received using the `receive` method.
53
+
54
+ Args:
55
+ model_id: The ID of the model to use for generating audio.
56
+ transcript: The text to convert to speech.
57
+ output_format: A dictionary containing the details of the output format.
58
+ voice_id: The ID of the voice to use for generating audio.
59
+ voice_embedding: The embedding of the voice to use for generating audio.
60
+ context_id: The context ID to use for the request. If not specified, a random context ID will be generated.
61
+ continue_: Whether to continue the audio generation from the previous transcript or not.
62
+ duration: The duration of the audio in seconds.
63
+ language: The language code for the audio request. This can only be used with `model_id = sonic-multilingual`.
64
+ add_timestamps: Whether to return word-level timestamps.
65
+ _experimental_voice_controls: Experimental voice controls for controlling speed and emotion.
66
+ Note: This is an experimental feature and may change rapidly in future releases.
67
+
68
+ Returns:
69
+ None.
70
+ """
71
+ if context_id is not None and context_id != self._context_id:
72
+ raise ValueError("Context ID does not match the context ID of the current context.")
73
+ if continue_ and transcript == "":
74
+ raise ValueError("Transcript cannot be empty when continue_ is True.")
75
+
76
+ await self._websocket.connect()
77
+
78
+ voice = TTS._validate_and_construct_voice(
79
+ voice_id,
80
+ voice_embedding,
81
+ experimental_voice_controls=_experimental_voice_controls,
82
+ )
83
+
84
+ request_body = {
85
+ "model_id": model_id,
86
+ "transcript": transcript,
87
+ "voice": voice,
88
+ "output_format": {
89
+ "container": output_format["container"],
90
+ "encoding": output_format["encoding"],
91
+ "sample_rate": output_format["sample_rate"],
92
+ },
93
+ "context_id": self._context_id,
94
+ "continue": continue_,
95
+ "language": language,
96
+ "add_timestamps": add_timestamps,
97
+ }
98
+
99
+ if duration is not None:
100
+ request_body["duration"] = duration
101
+
102
+ await self._websocket.websocket.send_json(request_body)
103
+
104
+ # Start listening for responses on the WebSocket
105
+ self._websocket._dispatch_listener()
106
+
107
+ async def no_more_inputs(self) -> None:
108
+ """Send a request to the WebSocket to indicate that no more requests will be sent."""
109
+ await self.send(
110
+ model_id=DEFAULT_MODEL_ID,
111
+ transcript="",
112
+ output_format=TTS.get_output_format("raw_pcm_f32le_44100"),
113
+ voice_embedding=DEFAULT_VOICE_EMBEDDING, # Default voice embedding since it's a required input for now.
114
+ context_id=self._context_id,
115
+ continue_=False,
116
+ )
117
+
118
+ async def receive(self) -> AsyncGenerator[Dict[str, Any], None]:
119
+ """Receive the audio chunks from the WebSocket. This method is a generator that yields audio chunks.
120
+
121
+ Returns:
122
+ An async generator that yields audio chunks. Each chunk is a dictionary containing the audio as bytes.
123
+ """
124
+ try:
125
+ while True:
126
+ response = await self._websocket._get_message(
127
+ self._context_id, timeout=self.timeout
128
+ )
129
+ if "error" in response:
130
+ raise RuntimeError(f"Error generating audio:\n{response['error']}")
131
+ if response["done"]:
132
+ break
133
+ yield self._websocket._convert_response(response, include_context_id=True)
134
+ except Exception as e:
135
+ if isinstance(e, asyncio.TimeoutError):
136
+ raise RuntimeError("Timeout while waiting for audio chunk")
137
+ raise RuntimeError(f"Failed to generate audio:\n{e}")
138
+ finally:
139
+ self._close()
140
+
141
+ def _close(self) -> None:
142
+ """Closes the context. Automatically called when a done message is received for this context."""
143
+ self._websocket._remove_context(self._context_id)
144
+
145
+ def is_closed(self):
146
+ """Check if the context is closed or not. Returns True if closed."""
147
+ return self._context_id not in self._websocket._context_queues
148
+
149
+ async def __aenter__(self):
150
+ return self
151
+
152
+ async def __aexit__(
153
+ self,
154
+ exc_type: Union[type, None],
155
+ exc: Union[BaseException, None],
156
+ exc_tb: Union[TracebackType, None],
157
+ ):
158
+ self._close()
159
+
160
+ def __del__(self):
161
+ self._close()
162
+
163
+
164
class _AsyncWebSocket(_WebSocket):
    """This class contains methods to generate audio using WebSocket asynchronously."""

    def __init__(
        self,
        ws_url: str,
        api_key: str,
        cartesia_version: str,
        timeout: float,
        get_session: Callable[[], Optional[aiohttp.ClientSession]],
    ):
        """
        Args:
            ws_url: The WebSocket URL for the Cartesia API.
            api_key: The API key to use for authorization.
            cartesia_version: The version of the Cartesia API to use.
            timeout: The timeout for responses on the WebSocket in seconds.
            get_session: A function that returns an aiohttp.ClientSession object.
        """
        super().__init__(ws_url, api_key, cartesia_version)
        self.timeout = timeout
        self._get_session = get_session
        self.websocket = None
        self._context_queues: Dict[str, asyncio.Queue] = {}
        self._processing_task: Optional[asyncio.Task] = None
        # Fix: _process_responses assigns self._error on failure, but the
        # attribute was never initialized, so reading it before any error
        # raised AttributeError.
        self._error: Optional[Exception] = None

    def __del__(self):
        # Best-effort cleanup: close on the running loop if there is one,
        # otherwise spin up a temporary loop just to close.
        try:
            loop = asyncio.get_running_loop()
        except RuntimeError:
            loop = None

        if loop is None:
            asyncio.run(self.close())
        elif loop.is_running():
            loop.create_task(self.close())

    async def connect(self):
        """Open the WebSocket connection if it is not already open."""
        if self.websocket is None or self._is_websocket_closed():
            route = "tts/websocket"
            session = await self._get_session()
            try:
                self.websocket = await session.ws_connect(
                    f"{self.ws_url}/{route}?api_key={self.api_key}&cartesia_version={self.cartesia_version}"
                )
            except Exception as e:
                # Chain the original exception so the root cause is preserved.
                raise RuntimeError(f"Failed to connect to WebSocket. {e}") from e

    def _is_websocket_closed(self):
        return self.websocket.closed

    async def close(self):
        """This method closes the websocket connection. *Highly* recommended to call this method when done."""
        if self.websocket is not None and not self._is_websocket_closed():
            await self.websocket.close()
        if self._processing_task:
            self._processing_task.cancel()
            try:
                # Fix: await the cancelled task so cancellation is actually
                # processed. The previous code wrapped a plain attribute
                # assignment in this try, leaving both handlers unreachable.
                await self._processing_task
            except asyncio.CancelledError:
                pass
            except TypeError as e:
                # Ignore the error if the task is already cancelled.
                # For some reason we are getting None responses
                # TODO: This needs to be fixed - we need to think about why we are getting None responses.
                if "Received message 256:None" not in str(e):
                    raise e
            self._processing_task = None

        for context_id in list(self._context_queues.keys()):
            self._remove_context(context_id)

        self._context_queues.clear()
        self._processing_task = None
        self.websocket = None

    async def send(
        self,
        model_id: str,
        transcript: str,
        output_format: OutputFormat,
        voice_id: Optional[str] = None,
        voice_embedding: Optional[List[float]] = None,
        context_id: Optional[str] = None,
        duration: Optional[int] = None,
        language: Optional[str] = None,
        stream: bool = True,
        add_timestamps: bool = False,
        _experimental_voice_controls: Optional[VoiceControls] = None,
    ) -> Union[Dict[str, Any], AsyncGenerator[Dict[str, Any], None]]:
        """See :meth:`_WebSocket.send` for details.

        Returns:
            An async generator of response dicts when ``stream`` is True,
            otherwise one dict with the concatenated audio, the context ID,
            and (optionally) merged word timestamps.
        """
        if context_id is None:
            context_id = str(uuid.uuid4())

        ctx = self.context(context_id)

        await ctx.send(
            model_id=model_id,
            transcript=transcript,
            output_format=output_format,
            voice_id=voice_id,
            voice_embedding=voice_embedding,
            context_id=context_id,
            duration=duration,
            language=language,
            continue_=False,
            add_timestamps=add_timestamps,
            _experimental_voice_controls=_experimental_voice_controls,
        )

        generator = ctx.receive()

        if stream:
            return generator

        # Non-streaming: drain the generator, concatenating audio and merging
        # per-chunk word timestamps into one mapping of lists.
        chunks = []
        word_timestamps = defaultdict(list)
        async for chunk in generator:
            if "audio" in chunk:
                chunks.append(chunk["audio"])
            if add_timestamps and "word_timestamps" in chunk:
                for k, v in chunk["word_timestamps"].items():
                    word_timestamps[k].extend(v)
        out = {"audio": b"".join(chunks), "context_id": context_id}
        if add_timestamps:
            out["word_timestamps"] = word_timestamps
        return out

    async def _process_responses(self):
        """Listener task: route every incoming message to its context's queue."""
        try:
            while True:
                response = await self.websocket.receive_json()
                if response["context_id"]:
                    context_id = response["context_id"]
                if context_id in self._context_queues:
                    await self._context_queues[context_id].put(response)
        except Exception as e:
            # Record the failure so it can be surfaced to waiting readers.
            self._error = e
            raise e

    async def _get_message(self, context_id: str, timeout: float) -> Dict[str, Any]:
        """Wait up to ``timeout`` seconds for the next message on ``context_id``."""
        if context_id not in self._context_queues:
            raise ValueError(f"Context ID {context_id} not found.")
        return await asyncio.wait_for(self._context_queues[context_id].get(), timeout=timeout)

    def _remove_context(self, context_id: str):
        """Drop the queue for ``context_id``; no-op if already removed."""
        if context_id in self._context_queues:
            del self._context_queues[context_id]

    def _dispatch_listener(self):
        """Start the response-listener task if none is running."""
        if self._processing_task is None or self._processing_task.done():
            self._processing_task = asyncio.create_task(self._process_responses())

    def context(self, context_id: Optional[str] = None) -> _AsyncTTSContext:
        """Create a new context (with a fresh queue) on this WebSocket.

        Raises:
            ValueError: If a context with ``context_id`` already exists.
        """
        if context_id in self._context_queues:
            raise ValueError(f"AsyncContext for context ID {context_id} already exists.")
        if context_id is None:
            context_id = str(uuid.uuid4())
        if context_id not in self._context_queues:
            self._context_queues[context_id] = asyncio.Queue()
        return _AsyncTTSContext(context_id, self, self.timeout)
@@ -0,0 +1,10 @@
1
# Model identifiers.
DEFAULT_MODEL_ID = "sonic-english"  # latest default model
MULTILINGUAL_MODEL_ID = "sonic-multilingual"  # latest multilingual model

# API endpoint and versioning.
DEFAULT_BASE_URL = "api.cartesia.ai"
DEFAULT_CARTESIA_VERSION = "2024-06-10"  # latest version

# Client behavior defaults.
DEFAULT_TIMEOUT = 30  # seconds
DEFAULT_NUM_CONNECTIONS = 10  # connections per client
DEFAULT_VOICE_EMBEDDING = [1.0] * 192  # placeholder embedding (required input)

# Retry policy for transient connection errors.
BACKOFF_FACTOR = 1
MAX_RETRIES = 3
@@ -0,0 +1,3 @@
1
import logging

# Package-wide logger; configuration (handlers, level) is left to the
# consuming application, per standard library convention.
logger = logging.getLogger(__name__)