cartesia 2.0.0a0__py3-none-any.whl → 2.0.0b1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
cartesia/__init__.py CHANGED
@@ -126,6 +126,8 @@ from .voices import (
126
126
  LocalizeDialect,
127
127
  LocalizeDialectParams,
128
128
  LocalizeEnglishDialect,
129
+ LocalizePortugueseDialect,
130
+ LocalizeSpanishDialect,
129
131
  LocalizeTargetLanguage,
130
132
  LocalizeVoiceRequest,
131
133
  LocalizeVoiceRequestParams,
@@ -180,6 +182,8 @@ __all__ = [
180
182
  "LocalizeDialect",
181
183
  "LocalizeDialectParams",
182
184
  "LocalizeEnglishDialect",
185
+ "LocalizePortugueseDialect",
186
+ "LocalizeSpanishDialect",
183
187
  "LocalizeTargetLanguage",
184
188
  "LocalizeVoiceRequest",
185
189
  "LocalizeVoiceRequestParams",
@@ -16,7 +16,7 @@ class BaseClientWrapper:
16
16
  headers: typing.Dict[str, str] = {
17
17
  "X-Fern-Language": "Python",
18
18
  "X-Fern-SDK-Name": "cartesia",
19
- "X-Fern-SDK-Version": "2.0.0a0",
19
+ "X-Fern-SDK-Version": "2.0.0b1",
20
20
  }
21
21
  headers["X-API-Key"] = self.api_key
22
22
  headers["Cartesia-Version"] = "2024-06-10"
@@ -85,8 +85,8 @@ def _retry_timeout(response: httpx.Response, retries: int) -> float:
85
85
 
86
86
 
87
87
  def _should_retry(response: httpx.Response) -> bool:
88
- retriable_400s = [429, 408, 409]
89
- return response.status_code >= 500 or response.status_code in retriable_400s
88
+ retryable_400s = [429, 408, 409]
89
+ return response.status_code >= 500 or response.status_code in retryable_400s
90
90
 
91
91
 
92
92
  def remove_omit_from_dict(
@@ -79,7 +79,7 @@ def to_jsonable_with_fallback(
79
79
  class UniversalBaseModel(pydantic.BaseModel):
80
80
  if IS_PYDANTIC_V2:
81
81
  model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(
82
- # Allow fields begining with `model_` to be used in the model
82
+ # Allow fields beginning with `model_` to be used in the model
83
83
  protected_namespaces=(),
84
84
  ) # type: ignore # Pydantic v2
85
85
 
@@ -128,7 +128,7 @@ class UniversalBaseModel(pydantic.BaseModel):
128
128
  Override the default dict method to `exclude_unset` by default. This function patches
129
129
  `exclude_unset` to include fields with non-None default values.
130
130
  """
131
- # Note: the logic here is multi-plexed given the levers exposed in Pydantic V1 vs V2
131
+ # Note: the logic here is multiplexed given the levers exposed in Pydantic V1 vs V2
132
132
  # Pydantic V1's .dict can be extremely slow, so we do not want to call it twice.
133
133
  #
134
134
  # We'd ideally do the same for Pydantic V2, but it shells out to a library to serialize models
@@ -82,7 +82,7 @@ class DatasetsClient:
82
82
  api_key="YOUR_API_KEY",
83
83
  )
84
84
  client.datasets.create(
85
- name="string",
85
+ name="name",
86
86
  )
87
87
  """
88
88
  _response = self._client_wrapper.httpx_client.request(
@@ -129,7 +129,7 @@ class DatasetsClient:
129
129
  api_key="YOUR_API_KEY",
130
130
  )
131
131
  client.datasets.list_files(
132
- id="string",
132
+ id="id",
133
133
  )
134
134
  """
135
135
  _response = self._client_wrapper.httpx_client.request(
@@ -170,17 +170,6 @@ class DatasetsClient:
170
170
  Returns
171
171
  -------
172
172
  None
173
-
174
- Examples
175
- --------
176
- from cartesia import Cartesia
177
-
178
- client = Cartesia(
179
- api_key="YOUR_API_KEY",
180
- )
181
- client.datasets.upload_file(
182
- id="string",
183
- )
184
173
  """
185
174
  _response = self._client_wrapper.httpx_client.request(
186
175
  f"datasets/{jsonable_encoder(id)}/files",
@@ -280,7 +269,7 @@ class AsyncDatasetsClient:
280
269
 
281
270
  async def main() -> None:
282
271
  await client.datasets.create(
283
- name="string",
272
+ name="name",
284
273
  )
285
274
 
286
275
 
@@ -337,7 +326,7 @@ class AsyncDatasetsClient:
337
326
 
338
327
  async def main() -> None:
339
328
  await client.datasets.list_files(
340
- id="string",
329
+ id="id",
341
330
  )
342
331
 
343
332
 
@@ -381,25 +370,6 @@ class AsyncDatasetsClient:
381
370
  Returns
382
371
  -------
383
372
  None
384
-
385
- Examples
386
- --------
387
- import asyncio
388
-
389
- from cartesia import AsyncCartesia
390
-
391
- client = AsyncCartesia(
392
- api_key="YOUR_API_KEY",
393
- )
394
-
395
-
396
- async def main() -> None:
397
- await client.datasets.upload_file(
398
- id="string",
399
- )
400
-
401
-
402
- asyncio.run(main())
403
373
  """
404
374
  _response = await self._client_wrapper.httpx_client.request(
405
375
  f"datasets/{jsonable_encoder(id)}/files",
cartesia/infill/client.py CHANGED
@@ -34,16 +34,26 @@ class InfillClient:
34
34
  output_format_encoding: typing.Optional[RawEncoding] = OMIT,
35
35
  output_format_bit_rate: typing.Optional[int] = OMIT,
36
36
  voice_experimental_controls_speed: typing.Optional[Speed] = OMIT,
37
- voice_experimental_controls_emotion: typing.Optional[Emotion] = OMIT,
37
+ voice_experimental_controls_emotion: typing.Optional[typing.List[Emotion]] = OMIT,
38
38
  request_options: typing.Optional[RequestOptions] = None,
39
39
  ) -> typing.Iterator[bytes]:
40
40
  """
41
41
  Generate audio that smoothly connects two existing audio segments. This is useful for inserting new speech between existing speech segments while maintaining natural transitions.
42
42
 
43
+ **The cost is 1 credit per character of the infill text plus a fixed cost of 300 credits.**
44
+
43
45
  Only the `sonic-preview` model is supported for infill at this time.
44
46
 
45
47
  At least one of `left_audio` or `right_audio` must be provided.
46
48
 
49
+ As with all generative models, there's some inherent variability, but here are some tips we recommend to get the best results from infill:
50
+ - Use longer infill transcripts
51
+ - This gives the model more flexibility to adapt to the rest of the audio
52
+ - Target natural pauses in the audio when deciding where to clip
53
+ - This means you don't need word-level timestamps to be as precise
54
+ - Clip right up to the start and end of the audio segment you want infilled, keeping as much silence in the left/right audio segments as possible
55
+ - This helps the model generate more natural transitions
56
+
47
57
  Parameters
48
58
  ----------
49
59
  left_audio : core.File
@@ -84,7 +94,7 @@ class InfillClient:
84
94
  If you specify a number, 0.0 is the default speed, -1.0 is the slowest speed, and 1.0 is the fastest speed.
85
95
 
86
96
 
87
- voice_experimental_controls_emotion : typing.Optional[Emotion]
97
+ voice_experimental_controls_emotion : typing.Optional[typing.List[Emotion]]
88
98
  An array of emotion:level tags.
89
99
 
90
100
  Supported emotions are: anger, positivity, surprise, sadness, and curiosity.
@@ -114,16 +124,18 @@ class InfillClient:
114
124
  output_format_container="mp3",
115
125
  output_format_sample_rate=44100,
116
126
  output_format_bit_rate=128000,
127
+ voice_experimental_controls_speed="slowest",
128
+ voice_experimental_controls_emotion=["surprise:high", "curiosity:high"],
117
129
  )
118
130
  """
119
131
  with self._client_wrapper.httpx_client.stream(
120
132
  "infill/bytes",
121
133
  method="POST",
122
134
  data={
123
- "model_id[]": model_id,
124
- "language[]": language,
125
- "transcript[]": transcript,
126
- "voice[id]": voice_id,
135
+ "model_id": model_id,
136
+ "language": language,
137
+ "transcript": transcript,
138
+ "voice_id": voice_id,
127
139
  "output_format[container]": output_format_container,
128
140
  "output_format[sample_rate]": output_format_sample_rate,
129
141
  "output_format[encoding]": output_format_encoding,
@@ -169,16 +181,26 @@ class AsyncInfillClient:
169
181
  output_format_encoding: typing.Optional[RawEncoding] = OMIT,
170
182
  output_format_bit_rate: typing.Optional[int] = OMIT,
171
183
  voice_experimental_controls_speed: typing.Optional[Speed] = OMIT,
172
- voice_experimental_controls_emotion: typing.Optional[Emotion] = OMIT,
184
+ voice_experimental_controls_emotion: typing.Optional[typing.List[Emotion]] = OMIT,
173
185
  request_options: typing.Optional[RequestOptions] = None,
174
186
  ) -> typing.AsyncIterator[bytes]:
175
187
  """
176
188
  Generate audio that smoothly connects two existing audio segments. This is useful for inserting new speech between existing speech segments while maintaining natural transitions.
177
189
 
190
+ **The cost is 1 credit per character of the infill text plus a fixed cost of 300 credits.**
191
+
178
192
  Only the `sonic-preview` model is supported for infill at this time.
179
193
 
180
194
  At least one of `left_audio` or `right_audio` must be provided.
181
195
 
196
+ As with all generative models, there's some inherent variability, but here are some tips we recommend to get the best results from infill:
197
+ - Use longer infill transcripts
198
+ - This gives the model more flexibility to adapt to the rest of the audio
199
+ - Target natural pauses in the audio when deciding where to clip
200
+ - This means you don't need word-level timestamps to be as precise
201
+ - Clip right up to the start and end of the audio segment you want infilled, keeping as much silence in the left/right audio segments as possible
202
+ - This helps the model generate more natural transitions
203
+
182
204
  Parameters
183
205
  ----------
184
206
  left_audio : core.File
@@ -219,7 +241,7 @@ class AsyncInfillClient:
219
241
  If you specify a number, 0.0 is the default speed, -1.0 is the slowest speed, and 1.0 is the fastest speed.
220
242
 
221
243
 
222
- voice_experimental_controls_emotion : typing.Optional[Emotion]
244
+ voice_experimental_controls_emotion : typing.Optional[typing.List[Emotion]]
223
245
  An array of emotion:level tags.
224
246
 
225
247
  Supported emotions are: anger, positivity, surprise, sadness, and curiosity.
@@ -254,6 +276,8 @@ class AsyncInfillClient:
254
276
  output_format_container="mp3",
255
277
  output_format_sample_rate=44100,
256
278
  output_format_bit_rate=128000,
279
+ voice_experimental_controls_speed="slowest",
280
+ voice_experimental_controls_emotion=["surprise:high", "curiosity:high"],
257
281
  )
258
282
 
259
283
 
@@ -263,10 +287,10 @@ class AsyncInfillClient:
263
287
  "infill/bytes",
264
288
  method="POST",
265
289
  data={
266
- "model_id[]": model_id,
267
- "language[]": language,
268
- "transcript[]": transcript,
269
- "voice[id]": voice_id,
290
+ "model_id": model_id,
291
+ "language": language,
292
+ "transcript": transcript,
293
+ "voice_id": voice_id,
270
294
  "output_format[container]": output_format_container,
271
295
  "output_format[sample_rate]": output_format_sample_rate,
272
296
  "output_format[encoding]": output_format_encoding,
cartesia/tts/client.py CHANGED
@@ -67,7 +67,7 @@ class TtsClient:
67
67
  api_key="YOUR_API_KEY",
68
68
  )
69
69
  client.tts.bytes(
70
- model_id="sonic-english",
70
+ model_id="sonic",
71
71
  transcript="Hello, world!",
72
72
  voice={"mode": "id", "id": "694f9389-aac1-45b6-b726-9d9369183238"},
73
73
  language="en",
@@ -152,7 +152,7 @@ class TtsClient:
152
152
  api_key="YOUR_API_KEY",
153
153
  )
154
154
  response = client.tts.sse(
155
- model_id="sonic-english",
155
+ model_id="sonic",
156
156
  transcript="Hello, world!",
157
157
  voice={"mode": "id", "id": "694f9389-aac1-45b6-b726-9d9369183238"},
158
158
  language="en",
@@ -258,7 +258,7 @@ class AsyncTtsClient:
258
258
 
259
259
  async def main() -> None:
260
260
  await client.tts.bytes(
261
- model_id="sonic-english",
261
+ model_id="sonic",
262
262
  transcript="Hello, world!",
263
263
  voice={"mode": "id", "id": "694f9389-aac1-45b6-b726-9d9369183238"},
264
264
  language="en",
@@ -351,7 +351,7 @@ class AsyncTtsClient:
351
351
 
352
352
  async def main() -> None:
353
353
  response = await client.tts.sse(
354
- model_id="sonic-english",
354
+ model_id="sonic",
355
355
  transcript="Hello, world!",
356
356
  voice={"mode": "id", "id": "694f9389-aac1-45b6-b726-9d9369183238"},
357
357
  language="en",
@@ -10,6 +10,8 @@ from .types import (
10
10
  IdSpecifier,
11
11
  LocalizeDialect,
12
12
  LocalizeEnglishDialect,
13
+ LocalizePortugueseDialect,
14
+ LocalizeSpanishDialect,
13
15
  LocalizeTargetLanguage,
14
16
  LocalizeVoiceRequest,
15
17
  MixVoiceSpecifier,
@@ -49,6 +51,8 @@ __all__ = [
49
51
  "LocalizeDialect",
50
52
  "LocalizeDialectParams",
51
53
  "LocalizeEnglishDialect",
54
+ "LocalizePortugueseDialect",
55
+ "LocalizeSpanishDialect",
52
56
  "LocalizeTargetLanguage",
53
57
  "LocalizeVoiceRequest",
54
58
  "LocalizeVoiceRequestParams",