cartesia 2.0.0b2__py3-none-any.whl → 2.0.0b8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41)
  1. cartesia/__init__.py +10 -0
  2. cartesia/base_client.py +0 -4
  3. cartesia/core/__init__.py +3 -0
  4. cartesia/core/client_wrapper.py +2 -2
  5. cartesia/core/pagination.py +88 -0
  6. cartesia/infill/client.py +4 -4
  7. cartesia/tts/_async_websocket.py +53 -1
  8. cartesia/tts/_websocket.py +52 -3
  9. cartesia/tts/client.py +4 -4
  10. cartesia/tts/requests/generation_request.py +5 -0
  11. cartesia/tts/requests/web_socket_chunk_response.py +3 -0
  12. cartesia/tts/requests/web_socket_response.py +2 -1
  13. cartesia/tts/requests/web_socket_tts_request.py +1 -0
  14. cartesia/tts/types/emotion.py +5 -0
  15. cartesia/tts/types/generation_request.py +5 -0
  16. cartesia/tts/types/web_socket_chunk_response.py +3 -1
  17. cartesia/tts/types/web_socket_response.py +2 -1
  18. cartesia/tts/types/web_socket_tts_output.py +2 -0
  19. cartesia/tts/types/web_socket_tts_request.py +1 -0
  20. cartesia/tts/utils/constants.py +2 -2
  21. cartesia/voice_changer/requests/streaming_response.py +2 -0
  22. cartesia/voice_changer/types/streaming_response.py +2 -0
  23. cartesia/voices/__init__.py +10 -0
  24. cartesia/voices/client.py +209 -44
  25. cartesia/voices/requests/__init__.py +2 -0
  26. cartesia/voices/requests/get_voices_response.py +24 -0
  27. cartesia/voices/requests/localize_dialect.py +4 -1
  28. cartesia/voices/requests/localize_voice_request.py +15 -2
  29. cartesia/voices/requests/voice.py +13 -9
  30. cartesia/voices/types/__init__.py +8 -0
  31. cartesia/voices/types/gender_presentation.py +5 -0
  32. cartesia/voices/types/get_voices_response.py +34 -0
  33. cartesia/voices/types/localize_dialect.py +4 -1
  34. cartesia/voices/types/localize_french_dialect.py +5 -0
  35. cartesia/voices/types/localize_voice_request.py +16 -3
  36. cartesia/voices/types/voice.py +13 -9
  37. cartesia/voices/types/voice_expand_options.py +5 -0
  38. {cartesia-2.0.0b2.dist-info → cartesia-2.0.0b8.dist-info}/METADATA +149 -73
  39. {cartesia-2.0.0b2.dist-info → cartesia-2.0.0b8.dist-info}/RECORD +40 -35
  40. cartesia/datasets/client.py +0 -392
  41. {cartesia-2.0.0b2.dist-info → cartesia-2.0.0b8.dist-info}/WHEEL +0 -0
@@ -6,9 +6,12 @@ from .create_voice_request import CreateVoiceRequest
 from .embedding_response import EmbeddingResponse
 from .embedding_specifier import EmbeddingSpecifier
 from .gender import Gender
+from .gender_presentation import GenderPresentation
+from .get_voices_response import GetVoicesResponse
 from .id_specifier import IdSpecifier
 from .localize_dialect import LocalizeDialect
 from .localize_english_dialect import LocalizeEnglishDialect
+from .localize_french_dialect import LocalizeFrenchDialect
 from .localize_portuguese_dialect import LocalizePortugueseDialect
 from .localize_spanish_dialect import LocalizeSpanishDialect
 from .localize_target_language import LocalizeTargetLanguage
@@ -17,6 +20,7 @@ from .mix_voice_specifier import MixVoiceSpecifier
 from .mix_voices_request import MixVoicesRequest
 from .update_voice_request import UpdateVoiceRequest
 from .voice import Voice
+from .voice_expand_options import VoiceExpandOptions
 from .voice_id import VoiceId
 from .voice_metadata import VoiceMetadata
 from .weight import Weight
@@ -28,9 +32,12 @@ __all__ = [
     "EmbeddingResponse",
     "EmbeddingSpecifier",
     "Gender",
+    "GenderPresentation",
+    "GetVoicesResponse",
     "IdSpecifier",
     "LocalizeDialect",
     "LocalizeEnglishDialect",
+    "LocalizeFrenchDialect",
     "LocalizePortugueseDialect",
     "LocalizeSpanishDialect",
     "LocalizeTargetLanguage",
@@ -39,6 +46,7 @@ __all__ = [
     "MixVoicesRequest",
     "UpdateVoiceRequest",
     "Voice",
+    "VoiceExpandOptions",
     "VoiceId",
     "VoiceMetadata",
     "Weight",
@@ -0,0 +1,5 @@
+# This file was auto-generated by Fern from our API Definition.
+
+import typing
+
+GenderPresentation = typing.Union[typing.Literal["masculine", "feminine", "gender_neutral"], typing.Any]
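The `typing.Union[typing.Literal[...], typing.Any]` shape here (and in the other new aliases below) is Fern's forward-compatible enum pattern: known values are type-checked, while a value added server-side later still validates instead of raising. A minimal sketch of what that buys calling code; the `describe` helper is hypothetical, not part of the SDK:

```python
import typing

# Same alias as in the generated file above.
GenderPresentation = typing.Union[typing.Literal["masculine", "feminine", "gender_neutral"], typing.Any]

KNOWN = ("masculine", "feminine", "gender_neutral")

def describe(g: GenderPresentation) -> str:
    # Handle the values this SDK version knows about...
    if g in KNOWN:
        return str(g)
    # ...and degrade gracefully for values a newer API may return.
    return f"unrecognized presentation: {g!r}"

print(describe("feminine"))     # -> feminine
print(describe("androgynous"))  # -> unrecognized presentation: 'androgynous'
```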
@@ -0,0 +1,34 @@
+# This file was auto-generated by Fern from our API Definition.
+
+from ...core.pydantic_utilities import UniversalBaseModel
+import typing
+from .voice import Voice
+import pydantic
+from .voice_id import VoiceId
+from ...core.pydantic_utilities import IS_PYDANTIC_V2
+
+
+class GetVoicesResponse(UniversalBaseModel):
+    data: typing.List[Voice] = pydantic.Field()
+    """
+    The paginated list of Voices.
+    """
+
+    has_more: bool = pydantic.Field()
+    """
+    Whether there are more Voices to fetch (using `starting_after=id`, where id is the ID of the last Voice in the current response).
+    """
+
+    next_page: typing.Optional[VoiceId] = pydantic.Field(default=None)
+    """
+    (Deprecated - use the id of the last Voice in the current response instead.) An ID that can be passed as `starting_after` to get the next page of Voices.
+    """
+
+    if IS_PYDANTIC_V2:
+        model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True)  # type: ignore # Pydantic v2
+    else:
+
+        class Config:
+            frozen = True
+            smart_union = True
+            extra = pydantic.Extra.allow
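`GetVoicesResponse` is the page shape behind the new `cartesia/core/pagination.py` helper (see the file list above); the README section further down confirms that `client.voices.list` iterates pages for you. For illustration, a minimal sketch of the cursor loop that the `has_more`/`starting_after` docstrings describe, written against the raw HTTP API; the `/voices/` path, `limit` parameter, and header name are assumptions, not taken from this diff:

```python
import os
import httpx

def iter_all_voices():
    """Yield every Voice by following the has_more / starting_after cursor."""
    starting_after = None
    while True:
        params = {"limit": 100}  # assumed query parameter
        if starting_after is not None:
            params["starting_after"] = starting_after
        page = httpx.get(
            "https://api.cartesia.ai/voices/",  # assumed endpoint
            params=params,
            headers={"X-API-Key": os.environ["CARTESIA_API_KEY"]},  # assumed header
        ).json()
        yield from page["data"]
        if not page["has_more"]:
            return
        # The cursor is the ID of the last Voice in the current response;
        # next_page is deprecated per the docstring above.
        starting_after = page["data"][-1]["id"]

for voice in iter_all_voices():
    print(voice["id"], voice["name"])
```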
@@ -4,5 +4,8 @@ import typing
 from .localize_english_dialect import LocalizeEnglishDialect
 from .localize_spanish_dialect import LocalizeSpanishDialect
 from .localize_portuguese_dialect import LocalizePortugueseDialect
+from .localize_french_dialect import LocalizeFrenchDialect
 
-LocalizeDialect = typing.Union[LocalizeEnglishDialect, LocalizeSpanishDialect, LocalizePortugueseDialect]
+LocalizeDialect = typing.Union[
+    LocalizeEnglishDialect, LocalizeSpanishDialect, LocalizePortugueseDialect, LocalizeFrenchDialect
+]
@@ -0,0 +1,5 @@
+# This file was auto-generated by Fern from our API Definition.
+
+import typing
+
+LocalizeFrenchDialect = typing.Union[typing.Literal["eu", "ca"], typing.Any]
@@ -1,17 +1,30 @@
 # This file was auto-generated by Fern from our API Definition.
 
 from ...core.pydantic_utilities import UniversalBaseModel
-from ...embedding.types.embedding import Embedding
+import pydantic
 from .localize_target_language import LocalizeTargetLanguage
 from .gender import Gender
 import typing
 from .localize_dialect import LocalizeDialect
 from ...core.pydantic_utilities import IS_PYDANTIC_V2
-import pydantic
 
 
 class LocalizeVoiceRequest(UniversalBaseModel):
-    embedding: Embedding
+    voice_id: str = pydantic.Field()
+    """
+    The ID of the voice to localize.
+    """
+
+    name: str = pydantic.Field()
+    """
+    The name of the new localized voice.
+    """
+
+    description: str = pydantic.Field()
+    """
+    The description of the new localized voice.
+    """
+
     language: LocalizeTargetLanguage
     original_speaker_gender: Gender
     dialect: typing.Optional[LocalizeDialect] = None
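Together with the new `LocalizeFrenchDialect` alias above (`"eu"`/`"ca"`, presumably European and Canadian French), this rework means localization now starts from a `voice_id` plus a `name` and `description` for the new voice, rather than from a raw embedding. A minimal sketch, assuming `client.voices.localize` takes the model's fields as keyword arguments and that `"fr"` is a valid `LocalizeTargetLanguage`:

```python
import os
from cartesia import Cartesia

client = Cartesia(api_key=os.getenv("CARTESIA_API_KEY"))

# Localize an existing voice by ID (earlier betas took an embedding instead).
localized = client.voices.localize(
    voice_id="a0e99841-438c-4a64-b679-ae501e7d6091",
    name="French localized voice",
    description="A French variant of my base voice.",
    language="fr",                     # assumed LocalizeTargetLanguage value
    original_speaker_gender="female",
    dialect="eu",                      # new LocalizeFrenchDialect option
)
print(localized)
```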
@@ -2,9 +2,9 @@
 
 from ...core.pydantic_utilities import UniversalBaseModel
 from .voice_id import VoiceId
-import typing
 import pydantic
 import datetime as dt
+import typing
 from ...embedding.types.embedding import Embedding
 from ...tts.types.supported_language import SupportedLanguage
 from ...core.pydantic_utilities import IS_PYDANTIC_V2
@@ -12,14 +12,9 @@ from ...core.pydantic_utilities import IS_PYDANTIC_V2
 
 class Voice(UniversalBaseModel):
     id: VoiceId
-    user_id: typing.Optional[str] = pydantic.Field(default=None)
-    """
-    The ID of the user who owns the voice.
+    is_owner: bool = pydantic.Field()
     """
-
-    is_public: bool = pydantic.Field()
-    """
-    Whether the voice is publicly accessible.
+    Whether the current user is the owner of the voice.
     """
 
     name: str = pydantic.Field()
37
32
  The date and time the voice was created.
38
33
  """
39
34
 
40
- embedding: Embedding
35
+ embedding: typing.Optional[Embedding] = pydantic.Field(default=None)
36
+ """
37
+ The vector embedding of the voice. Only included when `expand` includes `embedding`.
38
+ """
39
+
40
+ is_starred: typing.Optional[bool] = pydantic.Field(default=None)
41
+ """
42
+ Whether the current user has starred the voice. Only included when `expand` includes `is_starred`.
43
+ """
44
+
41
45
  language: SupportedLanguage
42
46
 
43
47
  if IS_PYDANTIC_V2:
@@ -0,0 +1,5 @@
+# This file was auto-generated by Fern from our API Definition.
+
+import typing
+
+VoiceExpandOptions = typing.Union[typing.Literal["embedding", "is_starred"], typing.Any]
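`Voice.embedding` and `Voice.is_starred` are now optional and only populated on request, which is what `VoiceExpandOptions` enumerates. A minimal sketch, assuming the list endpoint forwards an `expand` parameter of these values:

```python
import os
from cartesia import Cartesia

client = Cartesia(api_key=os.getenv("CARTESIA_API_KEY"))

# Without expand, voice.embedding and voice.is_starred come back as None.
for voice in client.voices.list(expand=["embedding", "is_starred"]):  # `expand` is assumed here
    if voice.embedding is not None:
        print(voice.name, "dims:", len(voice.embedding), "starred:", voice.is_starred)
```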
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: cartesia
-Version: 2.0.0b2
+Version: 2.0.0b8
 Summary:
 Requires-Python: >=3.8,<4.0
 Classifier: Intended Audience :: Developers
@@ -47,53 +47,6 @@ Our complete API documentation can be found [on docs.cartesia.ai](https://docs.c
 pip install cartesia
 ```
 
-## Reference
-
-A full reference for this library is available [here](./reference.md).
-
-## Voices
-
-```python
-from cartesia import Cartesia
-import os
-
-client = Cartesia(api_key=os.getenv("CARTESIA_API_KEY"))
-
-# Get all available voices
-voices = client.voices.list()
-print(voices)
-
-# Get a specific voice
-voice = client.voices.get(id="a0e99841-438c-4a64-b679-ae501e7d6091")
-print("The embedding for", voice.name, "is", voice.embedding)
-
-# Clone a voice using file data
-cloned_voice = client.voices.clone(
-    clip=open("path/to/voice.wav", "rb"),
-    name="Test cloned voice",
-    language="en",
-    mode="similarity", # or "stability"
-    enhance=False, # use enhance=True to clean and denoise the cloning audio
-    description="Test voice description"
-)
-
-# Mix voices together
-mixed_voice = client.voices.mix(
-    voices=[
-        {"id": "voice_id_1", "weight": 0.25},
-        {"id": "voice_id_2", "weight": 0.75}
-    ]
-)
-
-# Create a new voice from embedding
-new_voice = client.voices.create(
-    name="Test Voice",
-    description="Test voice description",
-    embedding=[...], # List[float] with 192 dimensions
-    language="en"
-)
-```
-
 ## Usage
 
 Instantiate and use the client with the following:
@@ -107,15 +60,11 @@ client = Cartesia(
     api_key=os.getenv("CARTESIA_API_KEY"),
 )
 client.tts.bytes(
-    model_id="sonic-english",
+    model_id="sonic-2",
     transcript="Hello, world!",
     voice={
         "mode": "id",
         "id": "694f9389-aac1-45b6-b726-9d9369183238",
-        "experimental_controls": {
-            "speed": 0.5, # range between [-1.0, 1.0], or "slow", "fastest", etc.
-            "emotion": ["positivity", "curiosity:low"] # list of emotions with optional intensity
-        }
     },
     language="en",
     output_format={
@@ -143,7 +92,7 @@ client = AsyncCartesia(
 
 async def main() -> None:
     async for output in client.tts.bytes(
-        model_id="sonic-english",
+        model_id="sonic-2",
         transcript="Hello, world!",
         voice={"id": "694f9389-aac1-45b6-b726-9d9369183238"},
         language="en",
@@ -176,7 +125,7 @@ except ApiError as e:
 
 ## Streaming
 
-The SDK supports streaming responses, as well, the response will be a generator that you can loop over.
+The SDK supports streaming responses as well, returning a generator that you can iterate over with a `for ... in ...` loop:
 
 ```python
 from cartesia import Cartesia
@@ -188,7 +137,7 @@ def get_tts_chunks():
         api_key=os.getenv("CARTESIA_API_KEY"),
     )
     response = client.tts.sse(
-        model_id="sonic",
+        model_id="sonic-2",
         transcript="Hello world!",
         voice={
             "id": "f9836c6e-a0bd-460e-9d3c-f7299fa60f94",
@@ -204,7 +153,7 @@ def get_tts_chunks():
             "sample_rate": 44100,
         },
     )
-
+    
     audio_chunks = []
     for chunk in response:
         audio_chunks.append(chunk)
@@ -215,7 +164,9 @@ for chunk in chunks:
     print(f"Received chunk of size: {len(chunk.data)}")
 ```
 
-## WebSocket
+## WebSockets
+
+For the lowest latency in advanced use cases (such as streaming in an LLM-generated transcript and streaming out audio), you should use our WebSocket client:
 
 ```python
 from cartesia import Cartesia
@@ -223,15 +174,10 @@ from cartesia.tts import TtsRequestEmbeddingSpecifierParams, OutputFormat_RawPar
 import pyaudio
 import os
 
-client = Cartesia(
-    api_key=os.getenv("CARTESIA_API_KEY"),
-)
+client = Cartesia(api_key=os.getenv("CARTESIA_API_KEY"))
 voice_id = "a0e99841-438c-4a64-b679-ae501e7d6091"
 transcript = "Hello! Welcome to Cartesia"
 
-# You can check out our models at https://docs.cartesia.ai/getting-started/available-models
-model_id = "sonic"
-
 p = pyaudio.PyAudio()
 rate = 22050
@@ -242,14 +188,14 @@ ws = client.tts.websocket()
 
 # Generate and stream audio using the websocket
 for output in ws.send(
-    model_id=model_id,
+    model_id="sonic-2", # see: https://docs.cartesia.ai/getting-started/available-models
    transcript=transcript,
     voice={"id": voice_id},
     stream=True,
     output_format={
         "container": "raw",
-        "encoding": "pcm_f32le",
-        "sample_rate": 22050
+        "encoding": "pcm_f32le",
+        "sample_rate": rate
     },
 ):
     buffer = output.audio
@@ -267,6 +213,90 @@ p.terminate()
 ws.close() # Close the websocket connection
 ```
 
+## Voices
+
+List all available Voices with `client.voices.list`, which returns an iterable that automatically handles pagination:
+
+```python
+from cartesia import Cartesia
+import os
+
+client = Cartesia(api_key=os.getenv("CARTESIA_API_KEY"))
+
+# Get all available Voices
+voices = client.voices.list()
+for voice in voices:
+    print(voice)
+```
+
+You can also get the complete metadata for a specific Voice, or make a new Voice by cloning from an audio sample:
+
+```python
+# Get a specific Voice
+voice = client.voices.get(id="a0e99841-438c-4a64-b679-ae501e7d6091")
+print("The embedding for", voice.name, "is", voice.embedding)
+
+# Clone a Voice using file data
+cloned_voice = client.voices.clone(
+    clip=open("path/to/voice.wav", "rb"),
+    name="Test cloned voice",
+    language="en",
+    mode="similarity", # or "stability"
+    enhance=False, # use enhance=True to clean and denoise the cloning audio
+    description="Test voice description"
+)
+```
+
+## Requesting Timestamps
+
+```python
+import asyncio
+from cartesia import AsyncCartesia
+import os
+
+async def main():
+    client = AsyncCartesia(api_key=os.getenv("CARTESIA_API_KEY"))
+
+    # Connect to the websocket
+    ws = await client.tts.websocket()
+
+    # Generate speech with timestamps
+    output_generate = await ws.send(
+        model_id="sonic-2",
+        transcript="Hello! Welcome to Cartesia's text-to-speech.",
+        voice={"id": "f9836c6e-a0bd-460e-9d3c-f7299fa60f94"},
+        output_format={
+            "container": "raw",
+            "encoding": "pcm_f32le",
+            "sample_rate": 44100
+        },
+        add_timestamps=True, # Enable word-level timestamps
+        add_phoneme_timestamps=True, # Enable phonemized timestamps
+        stream=True
+    )
+
+    # Process the streaming response with timestamps
+    all_words = []
+    all_starts = []
+    all_ends = []
+    audio_chunks = []
+
+    async for out in output_generate:
+        # Collect audio data
+        if out.audio is not None:
+            audio_chunks.append(out.audio)
+
+        # Process timestamp data
+        if out.word_timestamps is not None:
+            all_words.extend(out.word_timestamps.words) # List of words
+            all_starts.extend(out.word_timestamps.start) # Start time for each word (seconds)
+            all_ends.extend(out.word_timestamps.end) # End time for each word (seconds)
+
+    await ws.close()
+
+asyncio.run(main())
+```
+
 
 ## Advanced
 ### Retries
@@ -309,6 +339,26 @@ client.tts.bytes(..., request_options={
 })
 ```
 
+### Mixing voices and creating from embeddings
+
+```python
+# Mix voices together
+mixed_voice = client.voices.mix(
+    voices=[
+        {"id": "voice_id_1", "weight": 0.25},
+        {"id": "voice_id_2", "weight": 0.75}
+    ]
+)
+
+# Create a new voice from embedding
+new_voice = client.voices.create(
+    name="Test Voice",
+    description="Test voice description",
+    embedding=[...], # List[float] with 192 dimensions
+    language="en"
+)
+```
+
 ### Custom Client
 
 You can override the `httpx` client to customize it for your use-case. Some common use-cases include support for proxies
@@ -326,13 +376,39 @@ client = Cartesia(
 )
 ```
 
+## Reference
+
+A full reference for this library is available [here](./reference.md).
+
 ## Contributing
 
-While we value open-source contributions to this SDK, this library is generated programmatically.
-Additions made directly to this library would have to be moved over to our generation code,
-otherwise they would be overwritten upon the next generated release. Feel free to open a PR as
-a proof of concept, but know that we will not be able to merge it as-is. We suggest opening
-an issue first to discuss with us!
+Note that most of this library is generated programmatically from
+<https://github.com/cartesia-ai/docs>. Before making edits to a file, verify it's not autogenerated
+by checking for this comment at the top of the file:
+
+```
+# This file was auto-generated by Fern from our API Definition.
+```
+
+### Running tests
+
+```sh
+uv pip install -r requirements.txt
+uv run pytest -rP -vv tests/custom/test_client.py::test_get_voices
+```
+### Manually generating SDK code from docs
+
+Assuming all your repos are cloned into your home directory:
+
+```sh
+$ cd ~/docs
+$ fern generate --group python-sdk --log-level debug --api version-2024-11-13 --preview
+$ cd ~/cartesia-python
+$ git pull ~/docs/fern/apis/version-2024-11-13/.preview/fern-python-sdk
+$ git commit --amend -m "manually regenerate from docs" # optional
+```
+
+### Automatically generating new SDK releases
 
-On the other hand, contributions to the README are always very welcome!
+From https://github.com/cartesia-ai/docs click `Actions` then `Release Python SDK`. (Requires permissions.)
 