cartesia 1.0.13__py2.py3-none-any.whl → 1.0.14__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
cartesia/client.py CHANGED
@@ -1,60 +1,10 @@
1
- import asyncio
2
- import base64
3
- import json
4
- import logging
5
1
  import os
6
- import uuid
7
- from collections import defaultdict
8
2
  from types import TracebackType
9
- from typing import (
10
- Any,
11
- AsyncGenerator,
12
- Callable,
13
- Dict,
14
- Generator,
15
- Iterator,
16
- List,
17
- Optional,
18
- Set,
19
- Tuple,
20
- Union,
21
- )
3
+ from typing import Optional, Union
22
4
 
23
- import aiohttp
24
- import httpx
25
- import requests
26
-
27
- try:
28
- from websockets.sync.client import connect
29
-
30
- IS_WEBSOCKET_SYNC_AVAILABLE = True
31
- except ImportError:
32
- IS_WEBSOCKET_SYNC_AVAILABLE = False
33
-
34
- from iterators import TimeoutIterator
35
-
36
- from cartesia._types import (
37
- DeprecatedOutputFormatMapping,
38
- EventType,
39
- OutputFormat,
40
- OutputFormatMapping,
41
- VoiceControls,
42
- VoiceMetadata,
43
- )
44
- from cartesia.utils.retry import retry_on_connection_error, retry_on_connection_error_async
45
-
46
- DEFAULT_MODEL_ID = "sonic-english" # latest default model
47
- MULTILINGUAL_MODEL_ID = "sonic-multilingual" # latest multilingual model
48
- DEFAULT_BASE_URL = "api.cartesia.ai"
49
- DEFAULT_CARTESIA_VERSION = "2024-06-10" # latest version
50
- DEFAULT_TIMEOUT = 30 # seconds
51
- DEFAULT_NUM_CONNECTIONS = 10 # connections per client
52
- DEFAULT_VOICE_EMBEDDING = [1.0] * 192 # Default voice embedding is a 192 sized float array
53
-
54
- BACKOFF_FACTOR = 1
55
- MAX_RETRIES = 3
56
-
57
- logger = logging.getLogger(__name__)
5
+ from cartesia._constants import DEFAULT_BASE_URL, DEFAULT_TIMEOUT
6
+ from cartesia.tts import TTS
7
+ from cartesia.voices import Voices
58
8
 
59
9
 
60
10
  class BaseClient:
@@ -75,49 +25,6 @@ class BaseClient:
75
25
  return self._base_url
76
26
 
77
27
 
78
- class Resource:
79
- def __init__(
80
- self,
81
- api_key: str,
82
- base_url: str,
83
- timeout: float,
84
- ):
85
- """Constructor for the Resource class. Used by the Voices and TTS classes."""
86
- self.api_key = api_key
87
- self.timeout = timeout
88
- self._base_url = base_url
89
- self.cartesia_version = DEFAULT_CARTESIA_VERSION
90
- self.headers = {
91
- "X-API-Key": self.api_key,
92
- "Cartesia-Version": self.cartesia_version,
93
- "Content-Type": "application/json",
94
- }
95
-
96
- @property
97
- def base_url(self):
98
- return self._base_url
99
-
100
- def _http_url(self):
101
- """Returns the HTTP URL for the Cartesia API.
102
- If the base URL is localhost, the URL will start with 'http'. Otherwise, it will start with 'https'.
103
- """
104
- if self._base_url.startswith("http://") or self._base_url.startswith("https://"):
105
- return self._base_url
106
- else:
107
- prefix = "http" if "localhost" in self._base_url else "https"
108
- return f"{prefix}://{self._base_url}"
109
-
110
- def _ws_url(self):
111
- """Returns the WebSocket URL for the Cartesia API.
112
- If the base URL is localhost, the URL will start with 'ws'. Otherwise, it will start with 'wss'.
113
- """
114
- if self._base_url.startswith("ws://") or self._base_url.startswith("wss://"):
115
- return self._base_url
116
- else:
117
- prefix = "ws" if "localhost" in self._base_url else "wss"
118
- return f"{prefix}://{self._base_url}"
119
-
120
-
121
28
  class Cartesia(BaseClient):
122
29
  """
123
30
  The client for Cartesia's text-to-speech library.
@@ -160,1234 +67,3 @@ class Cartesia(BaseClient):
160
67
  exc_tb: Union[TracebackType, None],
161
68
  ):
162
69
  pass
163
-
164
-
165
- class Voices(Resource):
166
- """This resource contains methods to list, get, clone, and create voices in your Cartesia voice library.
167
-
168
- Usage:
169
- >>> client = Cartesia(api_key="your_api_key")
170
- >>> voices = client.voices.list()
171
- >>> voice = client.voices.get(id="a0e99841-438c-4a64-b679-ae501e7d6091")
172
- >>> print("Voice Name:", voice["name"], "Voice Description:", voice["description"])
173
- >>> embedding = client.voices.clone(filepath="path/to/clip.wav")
174
- >>> new_voice = client.voices.create(
175
- ... name="My Voice", description="A new voice", embedding=embedding
176
- ... )
177
- """
178
-
179
- def list(self) -> List[VoiceMetadata]:
180
- """List all voices in your voice library.
181
-
182
- Returns:
183
- This method returns a list of VoiceMetadata objects.
184
- """
185
- response = httpx.get(
186
- f"{self._http_url()}/voices",
187
- headers=self.headers,
188
- timeout=self.timeout,
189
- )
190
-
191
- if not response.is_success:
192
- raise ValueError(f"Failed to get voices. Error: {response.text}")
193
-
194
- voices = response.json()
195
- return voices
196
-
197
- def get(self, id: str) -> VoiceMetadata:
198
- """Get a voice by its ID.
199
-
200
- Args:
201
- id: The ID of the voice.
202
-
203
- Returns:
204
- A VoiceMetadata object containing the voice metadata.
205
- """
206
- url = f"{self._http_url()}/voices/{id}"
207
- response = httpx.get(url, headers=self.headers, timeout=self.timeout)
208
-
209
- if not response.is_success:
210
- raise ValueError(
211
- f"Failed to get voice. Status Code: {response.status_code}\n"
212
- f"Error: {response.text}"
213
- )
214
-
215
- return response.json()
216
-
217
- def clone(self, filepath: Optional[str] = None, enhance: str = True) -> List[float]:
218
- """Clone a voice from a clip.
219
-
220
- Args:
221
- filepath: The path to the clip file.
222
- enhance: Whether to enhance the clip before cloning the voice (highly recommended). Defaults to True.
223
-
224
- Returns:
225
- The embedding of the cloned voice as a list of floats.
226
- """
227
- if not filepath:
228
- raise ValueError("Filepath must be specified.")
229
- url = f"{self._http_url()}/voices/clone/clip"
230
- with open(filepath, "rb") as file:
231
- files = {"clip": file}
232
- files["enhance"] = str(enhance).lower()
233
- headers = self.headers.copy()
234
- headers.pop("Content-Type", None)
235
- response = httpx.post(url, headers=headers, files=files, timeout=self.timeout)
236
- if not response.is_success:
237
- raise ValueError(f"Failed to clone voice from clip. Error: {response.text}")
238
-
239
- return response.json()["embedding"]
240
-
241
- def create(self, name: str, description: str, embedding: List[float]) -> VoiceMetadata:
242
- """Create a new voice.
243
-
244
- Args:
245
- name: The name of the voice.
246
- description: The description of the voice.
247
- embedding: The embedding of the voice. This should be generated with :meth:`clone`.
248
-
249
- Returns:
250
- A dictionary containing the voice metadata.
251
- """
252
- response = httpx.post(
253
- f"{self._http_url()}/voices",
254
- headers=self.headers,
255
- json={"name": name, "description": description, "embedding": embedding},
256
- timeout=self.timeout,
257
- )
258
-
259
- if not response.is_success:
260
- raise ValueError(f"Failed to create voice. Error: {response.text}")
261
-
262
- return response.json()
263
-
264
- def mix(self, voices: List[Dict[str, Union[str, float]]]) -> List[float]:
265
- """Mix multiple voices together.
266
-
267
- Args:
268
- voices: A list of dictionaries, each containing either:
269
- - 'id': The ID of an existing voice
270
- - 'embedding': A voice embedding
271
- AND
272
- - 'weight': The weight of the voice in the mix (0.0 to 1.0)
273
-
274
- Returns:
275
- The embedding of the mixed voice as a list of floats.
276
-
277
- Raises:
278
- ValueError: If the request fails or if the input is invalid.
279
- """
280
- url = f"{self._http_url()}/voices/mix"
281
-
282
- if not voices or not isinstance(voices, list):
283
- raise ValueError("voices must be a non-empty list")
284
-
285
- response = httpx.post(
286
- url,
287
- headers=self.headers,
288
- json={"voices": voices},
289
- timeout=self.timeout,
290
- )
291
-
292
- if not response.is_success:
293
- raise ValueError(f"Failed to mix voices. Error: {response.text}")
294
-
295
- result = response.json()
296
- return result["embedding"]
297
-
298
-
299
- class _TTSContext:
300
- """Manage a single context over a WebSocket.
301
-
302
- This class can be used to stream inputs, as they become available, to a specific `context_id`. See README for usage.
303
-
304
- See :class:`_AsyncTTSContext` for asynchronous use cases.
305
-
306
- Each TTSContext will close automatically when a done message is received for that context. It also closes if there is an error.
307
- """
308
-
309
- def __init__(self, context_id: str, websocket: "_WebSocket"):
310
- self._context_id = context_id
311
- self._websocket = websocket
312
- self._error = None
313
-
314
- def __del__(self):
315
- self._close()
316
-
317
- @property
318
- def context_id(self) -> str:
319
- return self._context_id
320
-
321
- def send(
322
- self,
323
- model_id: str,
324
- transcript: Iterator[str],
325
- output_format: OutputFormat,
326
- voice_id: Optional[str] = None,
327
- voice_embedding: Optional[List[float]] = None,
328
- context_id: Optional[str] = None,
329
- duration: Optional[int] = None,
330
- language: Optional[str] = None,
331
- add_timestamps: bool = False,
332
- _experimental_voice_controls: Optional[VoiceControls] = None,
333
- ) -> Generator[bytes, None, None]:
334
- """Send audio generation requests to the WebSocket and yield responses.
335
-
336
- Args:
337
- model_id: The ID of the model to use for generating audio.
338
- transcript: Iterator over text chunks with <1s latency.
339
- output_format: A dictionary containing the details of the output format.
340
- voice_id: The ID of the voice to use for generating audio.
341
- voice_embedding: The embedding of the voice to use for generating audio.
342
- context_id: The context ID to use for the request. If not specified, a random context ID will be generated.
343
- duration: The duration of the audio in seconds.
344
- language: The language code for the audio request. This can only be used with `model_id = sonic-multilingual`
345
- add_timestamps: Whether to return word-level timestamps.
346
- _experimental_voice_controls: Experimental voice controls for controlling speed and emotion.
347
- Note: This is an experimental feature and may change rapidly in future releases.
348
-
349
- Yields:
350
- Dictionary containing the following key(s):
351
- - audio: The audio as bytes.
352
- - context_id: The context ID for the request.
353
-
354
- Raises:
355
- ValueError: If provided context_id doesn't match the current context.
356
- RuntimeError: If there's an error generating audio.
357
- """
358
- if context_id is not None and context_id != self._context_id:
359
- raise ValueError("Context ID does not match the context ID of the current context.")
360
-
361
- self._websocket.connect()
362
-
363
- voice = TTS._validate_and_construct_voice(
364
- voice_id,
365
- voice_embedding=voice_embedding,
366
- experimental_voice_controls=_experimental_voice_controls,
367
- )
368
-
369
- # Create the initial request body
370
- request_body = {
371
- "model_id": model_id,
372
- "voice": voice,
373
- "output_format": {
374
- "container": output_format["container"],
375
- "encoding": output_format["encoding"],
376
- "sample_rate": output_format["sample_rate"],
377
- },
378
- "context_id": self._context_id,
379
- "language": language,
380
- "add_timestamps": add_timestamps,
381
- }
382
-
383
- if duration is not None:
384
- request_body["duration"] = duration
385
-
386
- try:
387
- # Create an iterator with a timeout to get text chunks
388
- text_iterator = TimeoutIterator(
389
- transcript, timeout=0.001
390
- ) # 1ms timeout for nearly non-blocking receive
391
- next_chunk = next(text_iterator, None)
392
-
393
- while True:
394
- # Send the next text chunk to the WebSocket if available
395
- if next_chunk is not None and next_chunk != text_iterator.get_sentinel():
396
- request_body["transcript"] = next_chunk
397
- request_body["continue"] = True
398
- self._websocket.websocket.send(json.dumps(request_body))
399
- next_chunk = next(text_iterator, None)
400
-
401
- try:
402
- # Receive responses from the WebSocket with a small timeout
403
- response = json.loads(
404
- self._websocket.websocket.recv(timeout=0.001)
405
- ) # 1ms timeout for nearly non-blocking receive
406
- if response["context_id"] != self._context_id:
407
- pass
408
- if "error" in response:
409
- raise RuntimeError(f"Error generating audio:\n{response['error']}")
410
- if response["done"]:
411
- break
412
- if response["data"]:
413
- yield self._websocket._convert_response(
414
- response=response, include_context_id=True
415
- )
416
- except TimeoutError:
417
- pass
418
-
419
- # Continuously receive from WebSocket until the next text chunk is available
420
- while next_chunk == text_iterator.get_sentinel():
421
- try:
422
- response = json.loads(self._websocket.websocket.recv(timeout=0.001))
423
- if response["context_id"] != self._context_id:
424
- continue
425
- if "error" in response:
426
- raise RuntimeError(f"Error generating audio:\n{response['error']}")
427
- if response["done"]:
428
- break
429
- if response["data"]:
430
- yield self._websocket._convert_response(
431
- response=response, include_context_id=True
432
- )
433
- except TimeoutError:
434
- pass
435
- next_chunk = next(text_iterator, None)
436
-
437
- # Send final message if all input text chunks are exhausted
438
- if next_chunk is None:
439
- request_body["transcript"] = ""
440
- request_body["continue"] = False
441
- self._websocket.websocket.send(json.dumps(request_body))
442
- break
443
-
444
- # Receive remaining messages from the WebSocket until "done" is received
445
- while True:
446
- response = json.loads(self._websocket.websocket.recv())
447
- if response["context_id"] != self._context_id:
448
- continue
449
- if "error" in response:
450
- raise RuntimeError(f"Error generating audio:\n{response['error']}")
451
- if response["done"]:
452
- break
453
- yield self._websocket._convert_response(response=response, include_context_id=True)
454
-
455
- except Exception as e:
456
- self._websocket.close()
457
- raise RuntimeError(f"Failed to generate audio. {e}")
458
-
459
- def _close(self):
460
- """Closes the context. Automatically called when a done message is received for this context."""
461
- self._websocket._remove_context(self._context_id)
462
-
463
- def is_closed(self):
464
- """Check if the context is closed or not. Returns True if closed."""
465
- return self._context_id not in self._websocket._contexts
466
-
467
-
468
- class _WebSocket:
469
- """This class contains methods to generate audio using WebSocket. Ideal for low-latency audio generation.
470
-
471
- Usage:
472
- >>> ws = client.tts.websocket()
473
- >>> for audio_chunk in ws.send(
474
- ... model_id="sonic-english", transcript="Hello world!", voice_embedding=embedding,
475
- ... output_format={"container": "raw", "encoding": "pcm_f32le", "sample_rate": 44100},
476
- ... context_id=context_id, stream=True
477
- ... ):
478
- ... audio = audio_chunk["audio"]
479
- """
480
-
481
- def __init__(
482
- self,
483
- ws_url: str,
484
- api_key: str,
485
- cartesia_version: str,
486
- ):
487
- self.ws_url = ws_url
488
- self.api_key = api_key
489
- self.cartesia_version = cartesia_version
490
- self.websocket = None
491
- self._contexts: Set[str] = set()
492
-
493
- def __del__(self):
494
- try:
495
- self.close()
496
- except Exception as e:
497
- raise RuntimeError("Failed to close WebSocket: ", e)
498
-
499
- def connect(self):
500
- """This method connects to the WebSocket if it is not already connected.
501
-
502
- Raises:
503
- RuntimeError: If the connection to the WebSocket fails.
504
- """
505
- if not IS_WEBSOCKET_SYNC_AVAILABLE:
506
- raise ImportError(
507
- "The synchronous WebSocket client is not available. Please ensure that you have 'websockets>=12.0' or compatible version installed."
508
- )
509
- if self.websocket is None or self._is_websocket_closed():
510
- route = "tts/websocket"
511
- try:
512
- self.websocket = connect(
513
- f"{self.ws_url}/{route}?api_key={self.api_key}&cartesia_version={self.cartesia_version}"
514
- )
515
- except Exception as e:
516
- raise RuntimeError(f"Failed to connect to WebSocket. {e}")
517
-
518
- def _is_websocket_closed(self):
519
- return self.websocket.socket.fileno() == -1
520
-
521
- def close(self):
522
- """This method closes the WebSocket connection. *Highly* recommended to call this method when done using the WebSocket."""
523
- if self.websocket and not self._is_websocket_closed():
524
- self.websocket.close()
525
-
526
- if self._contexts:
527
- self._contexts.clear()
528
-
529
- def _convert_response(
530
- self, response: Dict[str, any], include_context_id: bool
531
- ) -> Dict[str, Any]:
532
- out = {}
533
- if response["type"] == EventType.AUDIO:
534
- out["audio"] = base64.b64decode(response["data"])
535
- elif response["type"] == EventType.TIMESTAMPS:
536
- out["word_timestamps"] = response["word_timestamps"]
537
-
538
- if include_context_id:
539
- out["context_id"] = response["context_id"]
540
-
541
- return out
542
-
543
- def send(
544
- self,
545
- model_id: str,
546
- transcript: str,
547
- output_format: dict,
548
- voice_id: Optional[str] = None,
549
- voice_embedding: Optional[List[float]] = None,
550
- context_id: Optional[str] = None,
551
- duration: Optional[int] = None,
552
- language: Optional[str] = None,
553
- stream: bool = True,
554
- add_timestamps: bool = False,
555
- _experimental_voice_controls: Optional[VoiceControls] = None,
556
- ) -> Union[bytes, Generator[bytes, None, None]]:
557
- """Send a request to the WebSocket to generate audio.
558
-
559
- Args:
560
- model_id: The ID of the model to use for generating audio.
561
- transcript: The text to convert to speech.
562
- output_format: A dictionary containing the details of the output format.
563
- voice_id: The ID of the voice to use for generating audio.
564
- voice_embedding: The embedding of the voice to use for generating audio.
565
- context_id: The context ID to use for the request. If not specified, a random context ID will be generated.
566
- duration: The duration of the audio in seconds.
567
- language: The language code for the audio request. This can only be used with `model_id = sonic-multilingual`
568
- stream: Whether to stream the audio or not.
569
- add_timestamps: Whether to return word-level timestamps.
570
- _experimental_voice_controls: Experimental voice controls for controlling speed and emotion.
571
- Note: This is an experimental feature and may change rapidly in future releases.
572
-
573
- Returns:
574
- If `stream` is True, the method returns a generator that yields chunks. Each chunk is a dictionary.
575
- If `stream` is False, the method returns a dictionary.
576
- Both the generator and the dictionary contain the following key(s):
577
- - audio: The audio as bytes.
578
- - context_id: The context ID for the request.
579
- """
580
- self.connect()
581
-
582
- if context_id is None:
583
- context_id = str(uuid.uuid4())
584
-
585
- voice = TTS._validate_and_construct_voice(
586
- voice_id,
587
- voice_embedding=voice_embedding,
588
- experimental_voice_controls=_experimental_voice_controls,
589
- )
590
-
591
- request_body = {
592
- "model_id": model_id,
593
- "transcript": transcript,
594
- "voice": voice,
595
- "output_format": {
596
- "container": output_format["container"],
597
- "encoding": output_format["encoding"],
598
- "sample_rate": output_format["sample_rate"],
599
- },
600
- "context_id": context_id,
601
- "language": language,
602
- "add_timestamps": add_timestamps,
603
- }
604
-
605
- if duration is not None:
606
- request_body["duration"] = duration
607
-
608
- generator = self._websocket_generator(request_body)
609
-
610
- if stream:
611
- return generator
612
-
613
- chunks = []
614
- word_timestamps = defaultdict(list)
615
- for chunk in generator:
616
- if "audio" in chunk:
617
- chunks.append(chunk["audio"])
618
- if add_timestamps and "word_timestamps" in chunk:
619
- for k, v in chunk["word_timestamps"].items():
620
- word_timestamps[k].extend(v)
621
- out = {"audio": b"".join(chunks), "context_id": context_id}
622
- if add_timestamps:
623
- out["word_timestamps"] = word_timestamps
624
- return out
625
-
626
- def _websocket_generator(self, request_body: Dict[str, Any]):
627
- self.websocket.send(json.dumps(request_body))
628
-
629
- try:
630
- while True:
631
- response = json.loads(self.websocket.recv())
632
- if "error" in response:
633
- raise RuntimeError(f"Error generating audio:\n{response['error']}")
634
- if response["done"]:
635
- break
636
- yield self._convert_response(response=response, include_context_id=True)
637
- except Exception as e:
638
- # Close the websocket connection if an error occurs.
639
- self.close()
640
- raise RuntimeError(f"Failed to generate audio. {response}") from e
641
-
642
- def _remove_context(self, context_id: str):
643
- if context_id in self._contexts:
644
- self._contexts.remove(context_id)
645
-
646
- def context(self, context_id: Optional[str] = None) -> _TTSContext:
647
- if context_id in self._contexts:
648
- raise ValueError(f"Context for context ID {context_id} already exists.")
649
- if context_id is None:
650
- context_id = str(uuid.uuid4())
651
- if context_id not in self._contexts:
652
- self._contexts.add(context_id)
653
- return _TTSContext(context_id, self)
654
-
655
-
656
- class _SSE:
657
- """This class contains methods to generate audio using Server-Sent Events.
658
-
659
- Usage:
660
- >>> for audio_chunk in client.tts.sse(
661
- ... model_id="sonic-english", transcript="Hello world!", voice_embedding=embedding,
662
- ... output_format={"container": "raw", "encoding": "pcm_f32le", "sample_rate": 44100}, stream=True
663
- ... ):
664
- ... audio = audio_chunk["audio"]
665
- """
666
-
667
- def __init__(
668
- self,
669
- http_url: str,
670
- headers: Dict[str, str],
671
- timeout: float,
672
- ):
673
- self.http_url = http_url
674
- self.headers = headers
675
- self.timeout = timeout
676
-
677
- def _update_buffer(self, buffer: str, chunk_bytes: bytes) -> Tuple[str, List[Dict[str, Any]]]:
678
- buffer += chunk_bytes.decode("utf-8")
679
- outputs = []
680
- while "{" in buffer and "}" in buffer:
681
- start_index = buffer.find("{")
682
- end_index = buffer.find("}", start_index)
683
- if start_index != -1 and end_index != -1:
684
- try:
685
- chunk_json = json.loads(buffer[start_index : end_index + 1])
686
- if "error" in chunk_json:
687
- raise RuntimeError(f"Error generating audio:\n{chunk_json['error']}")
688
- if chunk_json["done"]:
689
- break
690
- audio = base64.b64decode(chunk_json["data"])
691
- outputs.append({"audio": audio})
692
- buffer = buffer[end_index + 1 :]
693
- except json.JSONDecodeError:
694
- break
695
- return buffer, outputs
696
-
697
- def send(
698
- self,
699
- model_id: str,
700
- transcript: str,
701
- output_format: OutputFormat,
702
- voice_id: Optional[str] = None,
703
- voice_embedding: Optional[List[float]] = None,
704
- duration: Optional[int] = None,
705
- language: Optional[str] = None,
706
- stream: bool = True,
707
- _experimental_voice_controls: Optional[VoiceControls] = None,
708
- ) -> Union[bytes, Generator[bytes, None, None]]:
709
- """Send a request to the server to generate audio using Server-Sent Events.
710
-
711
- Args:
712
- model_id: The ID of the model to use for generating audio.
713
- transcript: The text to convert to speech.
714
- voice_id: The ID of the voice to use for generating audio.
715
- voice_embedding: The embedding of the voice to use for generating audio.
716
- output_format: A dictionary containing the details of the output format.
717
- duration: The duration of the audio in seconds.
718
- language: The language code for the audio request. This can only be used with `model_id = sonic-multilingual`
719
- stream: Whether to stream the audio or not.
720
- _experimental_voice_controls: Experimental voice controls for controlling speed and emotion.
721
- Note: This is an experimental feature and may change rapidly in future releases.
722
-
723
- Returns:
724
- If `stream` is True, the method returns a generator that yields chunks. Each chunk is a dictionary.
725
- If `stream` is False, the method returns a dictionary.
726
- Both the generator and the dictionary contain the following key(s):
727
- - audio: The audio as bytes.
728
- """
729
- voice = TTS._validate_and_construct_voice(
730
- voice_id,
731
- voice_embedding=voice_embedding,
732
- experimental_voice_controls=_experimental_voice_controls,
733
- )
734
- request_body = {
735
- "model_id": model_id,
736
- "transcript": transcript,
737
- "voice": voice,
738
- "output_format": {
739
- "container": output_format["container"],
740
- "encoding": output_format["encoding"],
741
- "sample_rate": output_format["sample_rate"],
742
- },
743
- "language": language,
744
- }
745
-
746
- if duration is not None:
747
- request_body["duration"] = duration
748
-
749
- generator = self._sse_generator_wrapper(request_body)
750
-
751
- if stream:
752
- return generator
753
-
754
- chunks = []
755
- for chunk in generator:
756
- chunks.append(chunk["audio"])
757
-
758
- return {"audio": b"".join(chunks)}
759
-
760
- @retry_on_connection_error(
761
- max_retries=MAX_RETRIES, backoff_factor=BACKOFF_FACTOR, logger=logger
762
- )
763
- def _sse_generator_wrapper(self, request_body: Dict[str, Any]):
764
- """Need to wrap the sse generator in a function for the retry decorator to work."""
765
- try:
766
- for chunk in self._sse_generator(request_body):
767
- yield chunk
768
- except Exception as e:
769
- raise RuntimeError(f"Error generating audio. {e}")
770
-
771
- def _sse_generator(self, request_body: Dict[str, Any]):
772
- response = requests.post(
773
- f"{self.http_url}/tts/sse",
774
- stream=True,
775
- data=json.dumps(request_body),
776
- headers=self.headers,
777
- timeout=(self.timeout, self.timeout),
778
- )
779
- if not response.ok:
780
- raise ValueError(f"Failed to generate audio. {response.text}")
781
-
782
- buffer = ""
783
- for chunk_bytes in response.iter_content(chunk_size=None):
784
- buffer, outputs = self._update_buffer(buffer=buffer, chunk_bytes=chunk_bytes)
785
- for output in outputs:
786
- yield output
787
-
788
- if buffer:
789
- try:
790
- chunk_json = json.loads(buffer)
791
- audio = base64.b64decode(chunk_json["data"])
792
- yield {"audio": audio}
793
- except json.JSONDecodeError:
794
- pass
795
-
796
-
797
- class TTS(Resource):
798
- """This resource contains methods to generate audio using Cartesia's text-to-speech API."""
799
-
800
- def __init__(self, api_key: str, base_url: str, timeout: float):
801
- super().__init__(
802
- api_key=api_key,
803
- base_url=base_url,
804
- timeout=timeout,
805
- )
806
- self._sse_class = _SSE(self._http_url(), self.headers, self.timeout)
807
- self.sse = self._sse_class.send
808
-
809
- def websocket(self) -> _WebSocket:
810
- """This method returns a WebSocket object that can be used to generate audio using WebSocket.
811
-
812
- Returns:
813
- _WebSocket: A WebSocket object that can be used to generate audio using WebSocket.
814
- """
815
- ws = _WebSocket(self._ws_url(), self.api_key, self.cartesia_version)
816
- ws.connect()
817
- return ws
818
-
819
- @staticmethod
820
- def get_output_format(output_format_name: str) -> OutputFormat:
821
- """Convenience method to get the output_format dictionary from a given output format name.
822
-
823
- Args:
824
- output_format_name (str): The name of the output format.
825
-
826
- Returns:
827
- OutputFormat: A dictionary containing the details of the output format to be passed into tts.sse() or tts.websocket().send()
828
-
829
- Raises:
830
- ValueError: If the output_format name is not supported
831
- """
832
- if output_format_name in OutputFormatMapping._format_mapping:
833
- output_format_obj = OutputFormatMapping.get_format(output_format_name)
834
- elif output_format_name in DeprecatedOutputFormatMapping._format_mapping:
835
- output_format_obj = DeprecatedOutputFormatMapping.get_format_deprecated(
836
- output_format_name
837
- )
838
- else:
839
- raise ValueError(f"Unsupported format: {output_format_name}")
840
-
841
- return OutputFormat(
842
- container=output_format_obj["container"],
843
- encoding=output_format_obj["encoding"],
844
- sample_rate=output_format_obj["sample_rate"],
845
- )
846
-
847
- @staticmethod
848
- def get_sample_rate(self, output_format_name: str) -> int:
849
- """Convenience method to get the sample rate for a given output format.
850
-
851
- Args:
852
- output_format_name (str): The name of the output format.
853
-
854
- Returns:
855
- int: The sample rate for the output format.
856
-
857
- Raises:
858
- ValueError: If the output_format name is not supported
859
- """
860
- if output_format_name in OutputFormatMapping._format_mapping:
861
- output_format_obj = OutputFormatMapping.get_format(output_format_name)
862
- elif output_format_name in DeprecatedOutputFormatMapping._format_mapping:
863
- output_format_obj = DeprecatedOutputFormatMapping.get_format_deprecated(
864
- output_format_name
865
- )
866
- else:
867
- raise ValueError(f"Unsupported format: {output_format_name}")
868
-
869
- return output_format_obj["sample_rate"]
870
-
871
- @staticmethod
872
- def _validate_and_construct_voice(
873
- voice_id: Optional[str] = None,
874
- voice_embedding: Optional[List[float]] = None,
875
- experimental_voice_controls: Optional[VoiceControls] = None,
876
- ) -> dict:
877
- """Validate and construct the voice dictionary for the request.
878
-
879
- Args:
880
- voice_id: The ID of the voice to use for generating audio.
881
- voice_embedding: The embedding of the voice to use for generating audio.
882
- experimental_voice_controls: Voice controls for emotion and speed.
883
- Note: This is an experimental feature and may rapidly change in the future.
884
-
885
- Returns:
886
- A dictionary representing the voice configuration.
887
-
888
- Raises:
889
- ValueError: If neither or both voice_id and voice_embedding are specified.
890
- """
891
- if voice_id is None and voice_embedding is None:
892
- raise ValueError("Either voice_id or voice_embedding must be specified.")
893
-
894
- voice = {}
895
-
896
- if voice_id is not None:
897
- voice["id"] = voice_id
898
-
899
- if voice_embedding is not None:
900
- voice["embedding"] = voice_embedding
901
-
902
- if experimental_voice_controls is not None:
903
- voice["__experimental_controls"] = experimental_voice_controls
904
-
905
- return voice
906
-
907
-
908
class AsyncCartesia(Cartesia):
    """The asynchronous version of the Cartesia client."""

    def __init__(
        self,
        *,
        api_key: Optional[str] = None,
        base_url: Optional[str] = None,
        timeout: float = DEFAULT_TIMEOUT,
        max_num_connections: int = DEFAULT_NUM_CONNECTIONS,
    ):
        """
        Args:
            api_key: See :class:`Cartesia`.
            base_url: See :class:`Cartesia`.
            timeout: See :class:`Cartesia`.
            max_num_connections: The maximum number of concurrent connections to use for the client.
                This is used to limit the number of connections that can be made to the server.
        """
        # The aiohttp session and its owning event loop are created lazily in
        # _get_session() so the client can be constructed outside a running loop.
        self._session = None
        self._loop = None
        super().__init__(api_key=api_key, base_url=base_url, timeout=timeout)
        self.max_num_connections = max_num_connections
        self.tts = AsyncTTS(
            api_key=self.api_key,
            base_url=self._base_url,
            timeout=self.timeout,
            get_session=self._get_session,
        )

    async def _get_session(self):
        # Return the shared aiohttp.ClientSession, recreating it when the
        # current event loop changed or the session was closed.
        current_loop = asyncio.get_event_loop()
        if self._loop is not current_loop:
            # If the loop has changed, close the session and create a new one.
            await self.close()
        if self._session is None or self._session.closed:
            timeout = aiohttp.ClientTimeout(total=self.timeout)
            # Cap concurrent connections per the client configuration.
            connector = aiohttp.TCPConnector(limit=self.max_num_connections)
            self._session = aiohttp.ClientSession(timeout=timeout, connector=connector)
            self._loop = current_loop
        return self._session

    async def close(self):
        """This method closes the session.

        It is *strongly* recommended to call this method when you are done using the client.
        """
        if self._session is not None and not self._session.closed:
            await self._session.close()

    def __del__(self):
        # Best-effort cleanup on garbage collection: close on the running loop
        # if one exists, otherwise run close() in a temporary loop.
        # NOTE(review): if a loop exists but is NOT running, close() is never
        # awaited here — confirm whether that case matters for callers.
        try:
            loop = asyncio.get_running_loop()
        except RuntimeError:
            loop = None

        if loop is None:
            asyncio.run(self.close())
        elif loop.is_running():
            loop.create_task(self.close())

    async def __aenter__(self):
        return self

    async def __aexit__(
        self,
        exc_type: Union[type, None],
        exc: Union[BaseException, None],
        exc_tb: Union[TracebackType, None],
    ):
        # Context-manager exit mirrors close(); exceptions are not suppressed.
        await self.close()
979
-
980
-
981
class _AsyncSSE(_SSE):
    """This class contains methods to generate audio using Server-Sent Events asynchronously."""

    def __init__(
        self,
        http_url: str,
        headers: Dict[str, str],
        timeout: float,
        get_session: Callable[[], Optional[aiohttp.ClientSession]],
    ):
        """
        Args:
            http_url: The base HTTP URL for the Cartesia API.
            headers: Headers (including authorization) sent with every request.
            timeout: Request timeout in seconds.
            get_session: A coroutine function returning the shared aiohttp.ClientSession.
        """
        super().__init__(http_url, headers, timeout)
        self._get_session = get_session

    async def send(
        self,
        model_id: str,
        transcript: str,
        output_format: OutputFormat,
        voice_id: Optional[str] = None,
        voice_embedding: Optional[List[float]] = None,
        duration: Optional[int] = None,
        language: Optional[str] = None,
        stream: bool = True,
        _experimental_voice_controls: Optional[VoiceControls] = None,
    ) -> Union[bytes, AsyncGenerator[bytes, None]]:
        """Generate audio for a transcript over Server-Sent Events.

        Args:
            model_id: The ID of the model to use for generating audio.
            transcript: The text to convert to speech.
            output_format: A dictionary containing the details of the output format.
            voice_id: The ID of the voice to use for generating audio.
            voice_embedding: The embedding of the voice to use for generating audio.
            duration: The duration of the audio in seconds.
            language: The language code for the audio request.
            stream: If True, return an async generator yielding audio chunks;
                otherwise collect all chunks and return them as one payload.
            _experimental_voice_controls: Experimental voice controls for emotion and speed.
                Note: This is an experimental feature and may change rapidly.

        Returns:
            An async generator of chunk dicts when ``stream`` is True, else a
            dict with the concatenated audio bytes under the "audio" key.
        """
        voice = TTS._validate_and_construct_voice(
            voice_id,
            voice_embedding=voice_embedding,
            experimental_voice_controls=_experimental_voice_controls,
        )

        request_body = {
            "model_id": model_id,
            "transcript": transcript,
            "voice": voice,
            "output_format": {
                "container": output_format["container"],
                "encoding": output_format["encoding"],
                "sample_rate": output_format["sample_rate"],
            },
            "language": language,
        }

        if duration is not None:
            request_body["duration"] = duration

        generator = self._sse_generator_wrapper(request_body)

        if stream:
            return generator

        # Non-streaming: drain the generator and concatenate the audio bytes.
        chunks = []
        async for chunk in generator:
            chunks.append(chunk["audio"])

        return {"audio": b"".join(chunks)}

    @retry_on_connection_error_async(
        max_retries=MAX_RETRIES, backoff_factor=BACKOFF_FACTOR, logger=logger
    )
    async def _sse_generator_wrapper(self, request_body: Dict[str, Any]):
        """Need to wrap the sse generator in a function for the retry decorator to work."""
        try:
            async for chunk in self._sse_generator(request_body):
                yield chunk
        except Exception as e:
            # Fix: chain the original exception (``from e``) so the root cause
            # (HTTP error, decode failure, ...) is preserved in the traceback
            # instead of being flattened into the message string.
            raise RuntimeError(f"Error generating audio. {e}") from e

    async def _sse_generator(self, request_body: Dict[str, Any]):
        """POST the request and stream SSE events, yielding {"audio": bytes} dicts."""
        session = await self._get_session()
        async with session.post(
            f"{self.http_url}/tts/sse", data=json.dumps(request_body), headers=self.headers
        ) as response:
            if not response.ok:
                raise ValueError(f"Failed to generate audio. {await response.text()}")

            buffer = ""
            async for chunk_bytes in response.content.iter_any():
                # Network chunks may split an SSE event; _update_buffer keeps
                # the partial tail and returns only complete events.
                buffer, outputs = self._update_buffer(buffer=buffer, chunk_bytes=chunk_bytes)
                for output in outputs:
                    yield output

            # Flush a trailing complete event left in the buffer, ignoring
            # any partial (undecodable) remainder.
            if buffer:
                try:
                    chunk_json = json.loads(buffer)
                    audio = base64.b64decode(chunk_json["data"])
                    yield {"audio": audio}
                except json.JSONDecodeError:
                    pass
1070
-
1071
-
1072
class _AsyncTTSContext:
    """Manage a single context over an AsyncWebSocket.

    This class separates sending requests and receiving responses into two separate methods.
    This can be used for sending multiple requests without awaiting the response.
    Then you can listen to the responses in the order they were sent. See README for usage.

    Each AsyncTTSContext will close automatically when a done message is received for that context.
    This happens when the no_more_inputs method is called (equivalent to sending a request with `continue_ = False`),
    or if no requests have been sent for 5 seconds on the same context. It also closes if there is an error.

    """

    def __init__(self, context_id: str, websocket: "_AsyncWebSocket", timeout: float):
        self._context_id = context_id
        self._websocket = websocket
        self.timeout = timeout
        # NOTE(review): initialized here but never read in this class; errors
        # from the listener are surfaced through the per-context queue instead.
        self._error = None

    @property
    def context_id(self) -> str:
        # Read-only view of this context's identifier.
        return self._context_id

    async def send(
        self,
        model_id: str,
        transcript: str,
        output_format: OutputFormat,
        voice_id: Optional[str] = None,
        voice_embedding: Optional[List[float]] = None,
        context_id: Optional[str] = None,
        continue_: bool = False,
        duration: Optional[int] = None,
        language: Optional[str] = None,
        add_timestamps: bool = False,
        _experimental_voice_controls: Optional[VoiceControls] = None,
    ) -> None:
        """Send audio generation requests to the WebSocket. The response can be received using the `receive` method.

        Args:
            model_id: The ID of the model to use for generating audio.
            transcript: The text to convert to speech.
            output_format: A dictionary containing the details of the output format.
            voice_id: The ID of the voice to use for generating audio.
            voice_embedding: The embedding of the voice to use for generating audio.
            context_id: The context ID to use for the request. If not specified, a random context ID will be generated.
            continue_: Whether to continue the audio generation from the previous transcript or not.
            duration: The duration of the audio in seconds.
            language: The language code for the audio request. This can only be used with `model_id = sonic-multilingual`.
            add_timestamps: Whether to return word-level timestamps.
            _experimental_voice_controls: Experimental voice controls for controlling speed and emotion.
                Note: This is an experimental feature and may change rapidly in future releases.

        Returns:
            None.
        """
        if context_id is not None and context_id != self._context_id:
            raise ValueError("Context ID does not match the context ID of the current context.")
        if continue_ and transcript == "":
            raise ValueError("Transcript cannot be empty when continue_ is True.")

        # Lazily (re)establish the WebSocket connection before sending.
        await self._websocket.connect()

        voice = TTS._validate_and_construct_voice(
            voice_id, voice_embedding, experimental_voice_controls=_experimental_voice_controls
        )

        request_body = {
            "model_id": model_id,
            "transcript": transcript,
            "voice": voice,
            "output_format": {
                "container": output_format["container"],
                "encoding": output_format["encoding"],
                "sample_rate": output_format["sample_rate"],
            },
            # Always send this context's own ID, regardless of the argument.
            "context_id": self._context_id,
            "continue": continue_,
            "language": language,
            "add_timestamps": add_timestamps,
        }

        if duration is not None:
            request_body["duration"] = duration

        await self._websocket.websocket.send_json(request_body)

        # Start listening for responses on the WebSocket
        self._websocket._dispatch_listener()

    async def no_more_inputs(self) -> None:
        """Send a request to the WebSocket to indicate that no more requests will be sent."""
        await self.send(
            model_id=DEFAULT_MODEL_ID,
            transcript="",
            output_format=TTS.get_output_format("raw_pcm_f32le_44100"),
            voice_embedding=DEFAULT_VOICE_EMBEDDING,  # Default voice embedding since it's a required input for now.
            context_id=self._context_id,
            continue_=False,
        )

    async def receive(self) -> AsyncGenerator[Dict[str, Any], None]:
        """Receive the audio chunks from the WebSocket. This method is a generator that yields audio chunks.

        Returns:
            An async generator that yields audio chunks. Each chunk is a dictionary containing the audio as bytes.
        """
        try:
            while True:
                # Blocks until the websocket's listener task enqueues the next
                # message for this context, or the per-context timeout expires.
                response = await self._websocket._get_message(
                    self._context_id, timeout=self.timeout
                )
                if "error" in response:
                    raise RuntimeError(f"Error generating audio:\n{response['error']}")
                if response["done"]:
                    break
                yield self._websocket._convert_response(response, include_context_id=True)
        except Exception as e:
            if isinstance(e, asyncio.TimeoutError):
                raise RuntimeError("Timeout while waiting for audio chunk")
            raise RuntimeError(f"Failed to generate audio:\n{e}")
        finally:
            # Always release this context's queue, even on error or timeout.
            self._close()

    def _close(self) -> None:
        """Closes the context. Automatically called when a done message is received for this context."""
        self._websocket._remove_context(self._context_id)

    def is_closed(self):
        """Check if the context is closed or not. Returns True if closed."""
        return self._context_id not in self._websocket._context_queues

    async def __aenter__(self):
        return self

    async def __aexit__(
        self,
        exc_type: Union[type, None],
        exc: Union[BaseException, None],
        exc_tb: Union[TracebackType, None],
    ):
        self._close()

    def __del__(self):
        # Idempotent: _remove_context is a no-op if the context is already gone.
        self._close()
1217
-
1218
-
1219
class _AsyncWebSocket(_WebSocket):
    """This class contains methods to generate audio using WebSocket asynchronously."""

    def __init__(
        self,
        ws_url: str,
        api_key: str,
        cartesia_version: str,
        timeout: float,
        get_session: Callable[[], Optional[aiohttp.ClientSession]],
    ):
        """
        Args:
            ws_url: The WebSocket URL for the Cartesia API.
            api_key: The API key to use for authorization.
            cartesia_version: The version of the Cartesia API to use.
            timeout: The timeout for responses on the WebSocket in seconds.
            get_session: A function that returns an aiohttp.ClientSession object.
        """
        super().__init__(ws_url, api_key, cartesia_version)
        self.timeout = timeout
        self._get_session = get_session
        self.websocket = None
        # One queue per context so concurrent contexts can be demultiplexed.
        self._context_queues: Dict[str, asyncio.Queue] = {}
        self._processing_task: Optional[asyncio.Task] = None
        # Fix: declare _error up front (it was previously only ever created
        # dynamically inside the listener's except path).
        self._error: Optional[Exception] = None

    def __del__(self):
        # Best-effort cleanup: schedule close() on a running loop, or run it
        # in a temporary loop when none exists.
        try:
            loop = asyncio.get_running_loop()
        except RuntimeError:
            loop = None

        if loop is None:
            asyncio.run(self.close())
        elif loop.is_running():
            loop.create_task(self.close())

    async def connect(self):
        """Open the WebSocket connection if it is not already open.

        Raises:
            RuntimeError: If the connection attempt fails.
        """
        if self.websocket is None or self._is_websocket_closed():
            route = "tts/websocket"
            session = await self._get_session()
            try:
                self.websocket = await session.ws_connect(
                    f"{self.ws_url}/{route}?api_key={self.api_key}&cartesia_version={self.cartesia_version}"
                )
            except Exception as e:
                raise RuntimeError(f"Failed to connect to WebSocket. {e}")

    def _is_websocket_closed(self):
        # Assumes connect() has run at least once (self.websocket is not None).
        return self.websocket.closed

    async def close(self):
        """This method closes the websocket connection. *Highly* recommended to call this method when done."""
        if self.websocket is not None and not self._is_websocket_closed():
            await self.websocket.close()
        if self._processing_task:
            self._processing_task.cancel()
            try:
                # Fix: await the cancelled task so cancellation actually
                # propagates. The old code put the no-op assignment
                # `self._processing_task = None` inside this try, so the
                # CancelledError handler below was dead code.
                await self._processing_task
            except asyncio.CancelledError:
                pass
            except TypeError as e:
                # Ignore the error if the task is already cancelled
                # For some reason we are getting None responses
                # TODO: This needs to be fixed - we need to think about why we are getting None responses.
                if "Received message 256:None" not in str(e):
                    raise e
            except RuntimeError:
                # The task may belong to a different (possibly closed) event
                # loop when close() runs during interpreter teardown.
                pass
            finally:
                self._processing_task = None

        for context_id in list(self._context_queues.keys()):
            self._remove_context(context_id)

        self._context_queues.clear()
        self.websocket = None

    async def send(
        self,
        model_id: str,
        transcript: str,
        output_format: OutputFormat,
        voice_id: Optional[str] = None,
        voice_embedding: Optional[List[float]] = None,
        context_id: Optional[str] = None,
        duration: Optional[int] = None,
        language: Optional[str] = None,
        stream: bool = True,
        add_timestamps: bool = False,
        _experimental_voice_controls: Optional[VoiceControls] = None,
    ) -> Union[bytes, AsyncGenerator[bytes, None]]:
        """See :meth:`_WebSocket.send` for details."""
        if context_id is None:
            context_id = str(uuid.uuid4())

        ctx = self.context(context_id)

        await ctx.send(
            model_id=model_id,
            transcript=transcript,
            output_format=output_format,
            voice_id=voice_id,
            voice_embedding=voice_embedding,
            context_id=context_id,
            duration=duration,
            language=language,
            continue_=False,
            add_timestamps=add_timestamps,
            _experimental_voice_controls=_experimental_voice_controls,
        )

        generator = ctx.receive()

        if stream:
            return generator

        # Non-streaming: drain the generator, merging audio chunks and
        # (optionally) word-level timestamps into a single payload.
        chunks = []
        word_timestamps = defaultdict(list)
        async for chunk in generator:
            if "audio" in chunk:
                chunks.append(chunk["audio"])
            if add_timestamps and "word_timestamps" in chunk:
                for k, v in chunk["word_timestamps"].items():
                    word_timestamps[k].extend(v)
        out = {"audio": b"".join(chunks), "context_id": context_id}
        if add_timestamps:
            out["word_timestamps"] = word_timestamps
        return out

    async def _process_responses(self):
        """Background listener: route each incoming message to its context queue."""
        try:
            while True:
                response = await self.websocket.receive_json()
                # Fix: use .get() so a message without a context_id does not
                # kill the listener with a KeyError.
                context_id = response.get("context_id")
                if context_id and context_id in self._context_queues:
                    await self._context_queues[context_id].put(response)
        except Exception as e:
            # Record the failure for diagnostics before propagating.
            self._error = e
            raise e

    async def _get_message(self, context_id: str, timeout: float) -> Dict[str, Any]:
        """Dequeue the next message for `context_id`, waiting up to `timeout` seconds."""
        if context_id not in self._context_queues:
            raise ValueError(f"Context ID {context_id} not found.")
        return await asyncio.wait_for(self._context_queues[context_id].get(), timeout=timeout)

    def _remove_context(self, context_id: str):
        # Safe to call repeatedly; only removes the queue if it still exists.
        if context_id in self._context_queues:
            del self._context_queues[context_id]

    def _dispatch_listener(self):
        # Ensure exactly one listener task is running at a time.
        if self._processing_task is None or self._processing_task.done():
            self._processing_task = asyncio.create_task(self._process_responses())

    def context(self, context_id: Optional[str] = None) -> _AsyncTTSContext:
        """Create a new context (with a fresh queue) for multiplexed requests.

        Raises:
            ValueError: If a context with the given ID already exists.
        """
        if context_id in self._context_queues:
            raise ValueError(f"AsyncContext for context ID {context_id} already exists.")
        if context_id is None:
            context_id = str(uuid.uuid4())
        if context_id not in self._context_queues:
            self._context_queues[context_id] = asyncio.Queue()
        return _AsyncTTSContext(context_id, self, self.timeout)
1379
-
1380
-
1381
class AsyncTTS(TTS):
    """Asynchronous counterpart of :class:`TTS`.

    Endpoints share a single aiohttp session obtained from ``get_session``.
    """

    def __init__(self, api_key, base_url, timeout, get_session):
        super().__init__(api_key, base_url, timeout)
        self._get_session = get_session
        # Expose the SSE endpoint's bound `send` coroutine directly as `sse`.
        sse = _AsyncSSE(self._http_url(), self.headers, self.timeout, get_session)
        self._sse_class = sse
        self.sse = sse.send

    async def websocket(self) -> _AsyncWebSocket:
        """Create, connect, and return a new asynchronous TTS WebSocket."""
        socket = _AsyncWebSocket(
            self._ws_url(),
            self.api_key,
            self.cartesia_version,
            self.timeout,
            self._get_session,
        )
        await socket.connect()
        return socket