cartesia 2.0.0b2__py3-none-any.whl → 2.0.0b7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42)
  1. cartesia/__init__.py +8 -4
  2. cartesia/base_client.py +0 -4
  3. cartesia/core/__init__.py +3 -0
  4. cartesia/core/client_wrapper.py +2 -2
  5. cartesia/core/pagination.py +88 -0
  6. cartesia/infill/client.py +4 -4
  7. cartesia/tts/_async_websocket.py +48 -1
  8. cartesia/tts/_websocket.py +44 -3
  9. cartesia/tts/client.py +4 -4
  10. cartesia/tts/requests/generation_request.py +5 -0
  11. cartesia/tts/requests/web_socket_chunk_response.py +3 -0
  12. cartesia/tts/requests/web_socket_response.py +2 -1
  13. cartesia/tts/requests/web_socket_tts_request.py +1 -0
  14. cartesia/tts/types/emotion.py +5 -0
  15. cartesia/tts/types/generation_request.py +5 -0
  16. cartesia/tts/types/web_socket_chunk_response.py +3 -1
  17. cartesia/tts/types/web_socket_response.py +2 -1
  18. cartesia/tts/types/web_socket_tts_output.py +2 -0
  19. cartesia/tts/types/web_socket_tts_request.py +1 -0
  20. cartesia/tts/utils/constants.py +2 -2
  21. cartesia/voice_changer/requests/streaming_response.py +2 -0
  22. cartesia/voice_changer/types/streaming_response.py +2 -0
  23. cartesia/voices/__init__.py +8 -4
  24. cartesia/voices/client.py +285 -169
  25. cartesia/voices/requests/__init__.py +2 -0
  26. cartesia/voices/requests/create_voice_request.py +0 -2
  27. cartesia/voices/requests/get_voices_response.py +24 -0
  28. cartesia/voices/requests/localize_dialect.py +1 -3
  29. cartesia/voices/requests/voice.py +13 -9
  30. cartesia/voices/types/__init__.py +6 -4
  31. cartesia/voices/types/create_voice_request.py +0 -2
  32. cartesia/voices/types/gender_presentation.py +5 -0
  33. cartesia/voices/types/get_voices_response.py +34 -0
  34. cartesia/voices/types/localize_dialect.py +1 -3
  35. cartesia/voices/types/voice.py +13 -9
  36. cartesia/voices/types/voice_expand_options.py +5 -0
  37. {cartesia-2.0.0b2.dist-info → cartesia-2.0.0b7.dist-info}/METADATA +85 -14
  38. {cartesia-2.0.0b2.dist-info → cartesia-2.0.0b7.dist-info}/RECORD +39 -37
  39. cartesia/datasets/client.py +0 -392
  40. cartesia/voices/types/localize_portuguese_dialect.py +0 -5
  41. cartesia/voices/types/localize_spanish_dialect.py +0 -5
  42. {cartesia-2.0.0b2.dist-info → cartesia-2.0.0b7.dist-info}/WHEEL +0 -0
cartesia/__init__.py CHANGED
@@ -121,13 +121,14 @@ from .voices import (
     EmbeddingSpecifier,
     EmbeddingSpecifierParams,
     Gender,
+    GenderPresentation,
+    GetVoicesResponse,
+    GetVoicesResponseParams,
     IdSpecifier,
     IdSpecifierParams,
     LocalizeDialect,
     LocalizeDialectParams,
     LocalizeEnglishDialect,
-    LocalizePortugueseDialect,
-    LocalizeSpanishDialect,
     LocalizeTargetLanguage,
     LocalizeVoiceRequest,
     LocalizeVoiceRequestParams,
@@ -138,6 +139,7 @@ from .voices import (
     UpdateVoiceRequest,
     UpdateVoiceRequestParams,
     Voice,
+    VoiceExpandOptions,
     VoiceId,
     VoiceMetadata,
     VoiceMetadataParams,
@@ -175,15 +177,16 @@ __all__ = [
     "FilePurpose",
     "FlushId",
     "Gender",
+    "GenderPresentation",
     "GenerationRequest",
     "GenerationRequestParams",
+    "GetVoicesResponse",
+    "GetVoicesResponseParams",
     "IdSpecifier",
     "IdSpecifierParams",
     "LocalizeDialect",
     "LocalizeDialectParams",
     "LocalizeEnglishDialect",
-    "LocalizePortugueseDialect",
-    "LocalizeSpanishDialect",
     "LocalizeTargetLanguage",
     "LocalizeVoiceRequest",
     "LocalizeVoiceRequestParams",
@@ -235,6 +238,7 @@ __all__ = [
     "UpdateVoiceRequest",
     "UpdateVoiceRequestParams",
     "Voice",
+    "VoiceExpandOptions",
     "VoiceId",
     "VoiceMetadata",
     "VoiceMetadataParams",
cartesia/base_client.py CHANGED
@@ -5,14 +5,12 @@ from .environment import CartesiaEnvironment
 import httpx
 from .core.client_wrapper import SyncClientWrapper
 from .api_status.client import ApiStatusClient
-from .datasets.client import DatasetsClient
 from .infill.client import InfillClient
 from .tts.client import TtsClient
 from .voice_changer.client import VoiceChangerClient
 from .voices.client import VoicesClient
 from .core.client_wrapper import AsyncClientWrapper
 from .api_status.client import AsyncApiStatusClient
-from .datasets.client import AsyncDatasetsClient
 from .infill.client import AsyncInfillClient
 from .tts.client import AsyncTtsClient
 from .voice_changer.client import AsyncVoiceChangerClient
@@ -78,7 +76,6 @@ class BaseCartesia:
             timeout=_defaulted_timeout,
         )
         self.api_status = ApiStatusClient(client_wrapper=self._client_wrapper)
-        self.datasets = DatasetsClient(client_wrapper=self._client_wrapper)
         self.infill = InfillClient(client_wrapper=self._client_wrapper)
         self.tts = TtsClient(client_wrapper=self._client_wrapper)
         self.voice_changer = VoiceChangerClient(client_wrapper=self._client_wrapper)
@@ -144,7 +141,6 @@ class AsyncBaseCartesia:
             timeout=_defaulted_timeout,
         )
         self.api_status = AsyncApiStatusClient(client_wrapper=self._client_wrapper)
-        self.datasets = AsyncDatasetsClient(client_wrapper=self._client_wrapper)
         self.infill = AsyncInfillClient(client_wrapper=self._client_wrapper)
         self.tts = AsyncTtsClient(client_wrapper=self._client_wrapper)
         self.voice_changer = AsyncVoiceChangerClient(client_wrapper=self._client_wrapper)
cartesia/core/__init__.py CHANGED
@@ -6,6 +6,7 @@ from .datetime_utils import serialize_datetime
 from .file import File, convert_file_dict_to_httpx_tuples, with_content_type
 from .http_client import AsyncHttpClient, HttpClient
 from .jsonable_encoder import jsonable_encoder
+from .pagination import AsyncPager, SyncPager
 from .pydantic_utilities import (
     IS_PYDANTIC_V2,
     UniversalBaseModel,
@@ -24,6 +25,7 @@ __all__ = [
     "ApiError",
     "AsyncClientWrapper",
     "AsyncHttpClient",
+    "AsyncPager",
     "BaseClientWrapper",
     "FieldMetadata",
     "File",
@@ -31,6 +33,7 @@ __all__ = [
     "IS_PYDANTIC_V2",
     "RequestOptions",
     "SyncClientWrapper",
+    "SyncPager",
     "UniversalBaseModel",
     "UniversalRootModel",
     "convert_and_respect_annotation_metadata",
cartesia/core/client_wrapper.py CHANGED
@@ -16,10 +16,10 @@ class BaseClientWrapper:
         headers: typing.Dict[str, str] = {
             "X-Fern-Language": "Python",
             "X-Fern-SDK-Name": "cartesia",
-            "X-Fern-SDK-Version": "2.0.0b2",
+            "X-Fern-SDK-Version": "2.0.0b7",
         }
         headers["X-API-Key"] = self.api_key
-        headers["Cartesia-Version"] = "2024-06-10"
+        headers["Cartesia-Version"] = "2024-11-13"
         return headers

     def get_base_url(self) -> str:
cartesia/core/pagination.py ADDED
@@ -0,0 +1,88 @@
+# This file was auto-generated by Fern from our API Definition.
+
+import typing
+
+from typing_extensions import Self
+
+import pydantic
+
+# Generic to represent the underlying type of the results within a page
+T = typing.TypeVar("T")
+
+
+# SDKs implement a Page ABC per-pagination request, the endpoint then returns a pager that wraps this type
+# for example, an endpoint will return SyncPager[UserPage] where UserPage implements the Page ABC. ex:
+#
+# SyncPager<InnerListType>(
+#     has_next=response.list_metadata.after is not None,
+#     items=response.data,
+#     # This should be the outer function that returns the SyncPager again
+#     get_next=lambda: list(..., cursor: response.cursor) (or list(..., offset: offset + 1))
+# )
+class BasePage(pydantic.BaseModel, typing.Generic[T]):
+    has_next: bool
+    items: typing.Optional[typing.List[T]]
+
+
+class SyncPage(BasePage[T], typing.Generic[T]):
+    get_next: typing.Optional[typing.Callable[[], typing.Optional[Self]]]
+
+
+class AsyncPage(BasePage[T], typing.Generic[T]):
+    get_next: typing.Optional[typing.Callable[[], typing.Awaitable[typing.Optional[Self]]]]
+
+
+# ----------------------------
+
+
+class SyncPager(SyncPage[T], typing.Generic[T]):
+    # Here we type ignore the iterator to avoid a mypy error
+    # caused by the type conflict with Pydantic's __iter__ method
+    # brought in by extending the base model
+    def __iter__(self) -> typing.Iterator[T]:  # type: ignore
+        for page in self.iter_pages():
+            if page.items is not None:
+                for item in page.items:
+                    yield item
+
+    def iter_pages(self) -> typing.Iterator[SyncPage[T]]:
+        page: typing.Union[SyncPager[T], None] = self
+        while True:
+            if page is not None:
+                yield page
+                if page.has_next and page.get_next is not None:
+                    page = page.get_next()
+                    if page is None or page.items is None or len(page.items) == 0:
+                        return
+                else:
+                    return
+            else:
+                return
+
+    def next_page(self) -> typing.Optional[SyncPage[T]]:
+        return self.get_next() if self.get_next is not None else None
+
+
+class AsyncPager(AsyncPage[T], typing.Generic[T]):
+    async def __aiter__(self) -> typing.AsyncIterator[T]:  # type: ignore
+        async for page in self.iter_pages():
+            if page.items is not None:
+                for item in page.items:
+                    yield item
+
+    async def iter_pages(self) -> typing.AsyncIterator[AsyncPage[T]]:
+        page: typing.Union[AsyncPager[T], None] = self
+        while True:
+            if page is not None:
+                yield page
+                if page is not None and page.has_next and page.get_next is not None:
+                    page = await page.get_next()
+                    if page is None or page.items is None or len(page.items) == 0:
+                        return
+                else:
+                    return
+            else:
+                return
+
+    async def next_page(self) -> typing.Optional[AsyncPage[T]]:
+        return await self.get_next() if self.get_next is not None else None
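The two pager classes above back cursor-paginated list endpoints. A minimal usage sketch, assuming client.voices.list() returns a SyncPager[Voice] in this release (that signature is not shown in this excerpt and should be confirmed against cartesia/voices/client.py):

    # Hedged sketch: iterate every voice through the new SyncPager.
    from cartesia import Cartesia

    client = Cartesia(api_key="YOUR_API_KEY")

    # Iterating the pager walks all pages lazily and yields individual items.
    for voice in client.voices.list():
        print(voice.id, voice.name)

    # Pages can also be stepped through explicitly.
    for page in client.voices.list().iter_pages():
        if page.items:
            print(f"page of {len(page.items)} voices")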
cartesia/infill/client.py CHANGED
@@ -42,7 +42,7 @@ class InfillClient:
 
         **The cost is 1 credit per character of the infill text plus a fixed cost of 300 credits.**
 
-        Only the `sonic-preview` model is supported for infill at this time.
+        Infilling is only available on `sonic-2` at this time.
 
         At least one of `left_audio` or `right_audio` must be provided.
 
@@ -117,7 +117,7 @@ class InfillClient:
            api_key="YOUR_API_KEY",
        )
        client.infill.bytes(
-           model_id="sonic-preview",
+           model_id="sonic-2",
            language="en",
            transcript="middle segment",
            voice_id="694f9389-aac1-45b6-b726-9d9369183238",
@@ -189,7 +189,7 @@ class AsyncInfillClient:
 
         **The cost is 1 credit per character of the infill text plus a fixed cost of 300 credits.**
 
-        Only the `sonic-preview` model is supported for infill at this time.
+        Infilling is only available on `sonic-2` at this time.
 
         At least one of `left_audio` or `right_audio` must be provided.
 
@@ -269,7 +269,7 @@ class AsyncInfillClient:
 
        async def main() -> None:
            await client.infill.bytes(
-               model_id="sonic-preview",
+               model_id="sonic-2",
                language="en",
                transcript="middle segment",
                voice_id="694f9389-aac1-45b6-b726-9d9369183238",
cartesia/tts/_async_websocket.py CHANGED
@@ -17,6 +17,7 @@ from cartesia.tts.types import (
     WebSocketResponse_FlushDone,
     WebSocketTtsOutput,
     WordTimestamps,
+    PhonemeTimestamps,
 )
 
 from ..core.pydantic_utilities import parse_obj_as
@@ -67,6 +68,7 @@ class _AsyncTTSContext:
         language: Optional[str] = None,
         stream: bool = True,
         add_timestamps: bool = False,
+        add_phoneme_timestamps: bool = False,
         continue_: bool = False,
         flush: bool = False,
     ) -> None:
@@ -102,6 +104,8 @@
             request_body["stream"] = stream
         if add_timestamps:
             request_body["add_timestamps"] = add_timestamps
+        if add_phoneme_timestamps:
+            request_body["add_phoneme_timestamps"] = add_phoneme_timestamps
         if continue_:
             request_body["continue"] = continue_
         if flush:
@@ -229,6 +233,11 @@
         finally:
             self._close()
 
+    async def cancel(self):
+        """Cancel the context. This will stop the generation of audio for this context."""
+        await self._websocket.websocket.send_json({"context_id": self._context_id, "cancel": True})
+        self._close()
+
     def _close(self) -> None:
         """Closes the context. Automatically called when a done message is received for this context."""
         self._websocket._remove_context(self._context_id)
@@ -297,7 +306,26 @@ class AsyncTtsWebsocket(TtsWebsocket):
         try:
             self.websocket = await session.ws_connect(url)
         except Exception as e:
-            raise RuntimeError(f"Failed to connect to WebSocket at {url}. {e}")
+            # Extract status code if available
+            status_code = None
+            error_message = str(e)
+
+            if hasattr(e, 'status') and e.status is not None:
+                status_code = e.status
+
+                # Create a meaningful error message based on status code
+                if status_code == 402:
+                    error_message = "Payment required. Your API key may have insufficient credits or permissions."
+                elif status_code == 401:
+                    error_message = "Unauthorized. Please check your API key."
+                elif status_code == 403:
+                    error_message = "Forbidden. You don't have permission to access this resource."
+                elif status_code == 404:
+                    error_message = "Not found. The requested resource doesn't exist."
+
+                raise RuntimeError(f"Failed to connect to WebSocket.\nStatus: {status_code}. Error message: {error_message}")
+            else:
+                raise RuntimeError(f"Failed to connect to WebSocket at {url}. {e}")
 
     def _is_websocket_closed(self):
         return self.websocket.closed
@@ -338,6 +366,7 @@
         language: Optional[str] = None,
         stream: bool = True,
         add_timestamps: bool = False,
+        add_phoneme_timestamps: bool = False,
     ):
         """See :meth:`_WebSocket.send` for details."""
         if context_id is None:
@@ -355,6 +384,7 @@
             language=language,
             continue_=False,
             add_timestamps=add_timestamps,
+            add_phoneme_timestamps=add_phoneme_timestamps,
         )
 
         generator = ctx.receive()
@@ -366,6 +396,9 @@
         words: typing.List[str] = []
         start: typing.List[float] = []
         end: typing.List[float] = []
+        phonemes: typing.List[str] = []
+        phoneme_start: typing.List[float] = []
+        phoneme_end: typing.List[float] = []
         async for chunk in generator:
             if chunk.audio is not None:
                 chunks.append(chunk.audio)
@@ -374,6 +407,11 @@
                 words.extend(chunk.word_timestamps.words)
                 start.extend(chunk.word_timestamps.start)
                 end.extend(chunk.word_timestamps.end)
+            if add_phoneme_timestamps and chunk.phoneme_timestamps is not None:
+                if chunk.phoneme_timestamps is not None:
+                    phonemes.extend(chunk.phoneme_timestamps.phonemes)
+                    phoneme_start.extend(chunk.phoneme_timestamps.start)
+                    phoneme_end.extend(chunk.phoneme_timestamps.end)
 
         return WebSocketTtsOutput(
             audio=b"".join(chunks),  # type: ignore
@@ -387,6 +425,15 @@
                 if add_timestamps
                 else None
             ),
+            phoneme_timestamps=(
+                PhonemeTimestamps(
+                    phonemes=phonemes,
+                    start=phoneme_start,
+                    end=phoneme_end,
+                )
+                if add_phoneme_timestamps
+                else None
+            ),
         )
 
     async def _process_responses(self):
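Taken together, the _async_websocket.py changes add per-context cancellation and optional phoneme-level timestamps. A hedged usage sketch; ws.context(), the awaitable client.tts.websocket(), and the voice/output_format shapes follow the docstring patterns elsewhere in this diff and are not independently confirmed here:

    import asyncio

    from cartesia import AsyncCartesia


    async def main() -> None:
        client = AsyncCartesia(api_key="YOUR_API_KEY")
        ws = await client.tts.websocket()
        ctx = ws.context()  # assumed helper for opening an _AsyncTTSContext

        # Ask for phoneme-level timestamps alongside the audio chunks.
        await ctx.send(
            model_id="sonic-2",
            transcript="Hello, world!",
            voice={"mode": "id", "id": "694f9389-aac1-45b6-b726-9d9369183238"},
            output_format={"container": "raw", "encoding": "pcm_f32le", "sample_rate": 44100},
            add_phoneme_timestamps=True,
        )

        async for out in ctx.receive():
            if out.audio is not None:
                ...  # buffer or play the chunk
            if out.phoneme_timestamps is not None:
                print(out.phoneme_timestamps.phonemes)

        # To stop generation early instead of draining the context:
        # await ctx.cancel()


    asyncio.run(main())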
cartesia/tts/_websocket.py CHANGED
@@ -26,6 +26,7 @@ from cartesia.tts.types import (
     WebSocketResponse_Timestamps,
     WebSocketTtsOutput,
     WordTimestamps,
+    PhonemeTimestamps,
 )
 
 from ..core.pydantic_utilities import parse_obj_as
@@ -58,7 +59,7 @@ class _TTSContext:
         self,
         *,
         model_id: str,
-        transcript: str,
+        transcript: typing.Generator[str, None, None],
         output_format: OutputFormatParams,
         voice: TtsRequestVoiceSpecifierParams,
         context_id: Optional[str] = None,
@@ -235,7 +236,7 @@ class TtsWebsocket:
        Usage:
            >>> ws = client.tts.websocket()
            >>> generation_request = GenerationRequest(
-           ...     model_id="sonic-english",
+           ...     model_id="sonic-2",
            ...     transcript="Hello world!",
            ...     voice_embedding=embedding
            ...     output_format={"container": "raw", "encoding": "pcm_f32le", "sample_rate": 44100}
@@ -281,7 +282,26 @@
                 f"{self.ws_url}/{route}?api_key={self.api_key}&cartesia_version={self.cartesia_version}"
             )
         except Exception as e:
-            raise RuntimeError(f"Failed to connect to WebSocket. {e}")
+            # Extract status code if available
+            status_code = None
+            error_message = str(e)
+
+            if hasattr(e, 'status') and e.status is not None:
+                status_code = e.status
+
+                # Create a meaningful error message based on status code
+                if status_code == 402:
+                    error_message = "Payment required. Your API key may have insufficient credits or permissions."
+                elif status_code == 401:
+                    error_message = "Unauthorized. Please check your API key."
+                elif status_code == 403:
+                    error_message = "Forbidden. You don't have permission to access this resource."
+                elif status_code == 404:
+                    error_message = "Not found. The requested resource doesn't exist."
+
+                raise RuntimeError(f"Failed to connect to WebSocket.\nStatus: {status_code}. Error message: {error_message}")
+            else:
+                raise RuntimeError(f"Failed to connect to WebSocket. {e}")
 
     def _is_websocket_closed(self):
         return self.websocket.socket.fileno() == -1
@@ -310,6 +330,8 @@
                 out["audio"] = base64.b64decode(response.data)
             elif isinstance(response, WebSocketResponse_Timestamps):
                 out["word_timestamps"] = response.word_timestamps  # type: ignore
+            elif isinstance(response, WebSocketResponse_PhonemeTimestamps):
+                out["phoneme_timestamps"] = response.phoneme_timestamps  # type: ignore
             elif include_flush_id and isinstance(response, WebSocketResponse_FlushDone):
                 out["flush_done"] = response.flush_done  # type: ignore
                 out["flush_id"] = response.flush_id  # type: ignore
@@ -331,6 +353,7 @@
         language: Optional[str] = None,
         stream: bool = True,
         add_timestamps: bool = False,
+        add_phoneme_timestamps: bool = False,
     ):
         """Send a request to the WebSocket to generate audio.
 
@@ -360,6 +383,7 @@
             "language": language,
             "stream": stream,
             "add_timestamps": add_timestamps,
+            "add_phoneme_timestamps": add_phoneme_timestamps,
         }
         generator = self._websocket_generator(request_body)
 
@@ -370,6 +394,9 @@
         words: typing.List[str] = []
         start: typing.List[float] = []
         end: typing.List[float] = []
+        phonemes: typing.List[str] = []
+        phoneme_start: typing.List[float] = []
+        phoneme_end: typing.List[float] = []
         for chunk in generator:
             if chunk.audio is not None:
                 chunks.append(chunk.audio)
@@ -378,6 +405,11 @@
                 words.extend(chunk.word_timestamps.words)
                 start.extend(chunk.word_timestamps.start)
                 end.extend(chunk.word_timestamps.end)
+            if add_phoneme_timestamps and chunk.phoneme_timestamps is not None:
+                if chunk.phoneme_timestamps is not None:
+                    phonemes.extend(chunk.phoneme_timestamps.phonemes)
+                    phoneme_start.extend(chunk.phoneme_timestamps.start)
+                    phoneme_end.extend(chunk.phoneme_timestamps.end)
 
         return WebSocketTtsOutput(
             audio=b"".join(chunks),  # type: ignore
@@ -391,6 +423,15 @@
                 if add_timestamps
                 else None
             ),
+            phoneme_timestamps=(
+                PhonemeTimestamps(
+                    phonemes=phonemes,
+                    start=phoneme_start,
+                    end=phoneme_end,
+                )
+                if add_phoneme_timestamps
+                else None
+            ),
         )
 
     def _websocket_generator(self, request_body: Dict[str, Any]):
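The synchronous TtsWebsocket mirrors those changes. A hedged sketch of a non-streaming send() that collects the new phoneme timestamps (parameter names come from the send() signature above; the voice and output_format shapes follow the docstring examples in this diff and are not independently confirmed):

    from cartesia import Cartesia

    client = Cartesia(api_key="YOUR_API_KEY")
    ws = client.tts.websocket()

    # With stream=False, send() drains the generator internally and returns a single
    # WebSocketTtsOutput, which now carries phoneme_timestamps as well.
    out = ws.send(
        model_id="sonic-2",
        transcript="Hello, world!",
        voice={"mode": "id", "id": "694f9389-aac1-45b6-b726-9d9369183238"},
        output_format={"container": "raw", "encoding": "pcm_f32le", "sample_rate": 44100},
        stream=False,
        add_timestamps=True,
        add_phoneme_timestamps=True,
    )

    print(out.word_timestamps)
    print(out.phoneme_timestamps)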
cartesia/tts/client.py CHANGED
@@ -67,7 +67,7 @@ class TtsClient:
            api_key="YOUR_API_KEY",
        )
        client.tts.bytes(
-           model_id="sonic",
+           model_id="sonic-2",
            transcript="Hello, world!",
            voice={"mode": "id", "id": "694f9389-aac1-45b6-b726-9d9369183238"},
            language="en",
@@ -152,7 +152,7 @@ class TtsClient:
            api_key="YOUR_API_KEY",
        )
        response = client.tts.sse(
-           model_id="sonic",
+           model_id="sonic-2",
            transcript="Hello, world!",
            voice={"mode": "id", "id": "694f9389-aac1-45b6-b726-9d9369183238"},
            language="en",
@@ -258,7 +258,7 @@ class AsyncTtsClient:
 
        async def main() -> None:
            await client.tts.bytes(
-               model_id="sonic",
+               model_id="sonic-2",
                transcript="Hello, world!",
                voice={"mode": "id", "id": "694f9389-aac1-45b6-b726-9d9369183238"},
                language="en",
@@ -351,7 +351,7 @@ class AsyncTtsClient:
 
        async def main() -> None:
            response = await client.tts.sse(
-               model_id="sonic",
+               model_id="sonic-2",
                transcript="Hello, world!",
                voice={"mode": "id", "id": "694f9389-aac1-45b6-b726-9d9369183238"},
                language="en",
cartesia/tts/requests/generation_request.py CHANGED
@@ -51,3 +51,8 @@ class GenerationRequestParams(typing_extensions.TypedDict):
     """
     Whether to return phoneme-level timestamps.
     """
+
+    use_original_timestamps: typing_extensions.NotRequired[bool]
+    """
+    Whether to use the original transcript for timestamps.
+    """
cartesia/tts/requests/web_socket_chunk_response.py CHANGED
@@ -1,8 +1,11 @@
 # This file was auto-generated by Fern from our API Definition.
 
 from .web_socket_base_response import WebSocketBaseResponseParams
+import typing_extensions
+from ..types.flush_id import FlushId
 
 
 class WebSocketChunkResponseParams(WebSocketBaseResponseParams):
     data: str
     step_time: float
+    flush_id: typing_extensions.NotRequired[FlushId]
cartesia/tts/requests/web_socket_response.py CHANGED
@@ -4,8 +4,8 @@
 import typing_extensions
 import typing
 import typing_extensions
-from ..types.context_id import ContextId
 from ..types.flush_id import FlushId
+from ..types.context_id import ContextId
 from .word_timestamps import WordTimestampsParams
 from .phoneme_timestamps import PhonemeTimestampsParams
 
@@ -14,6 +14,7 @@ class WebSocketResponse_ChunkParams(typing_extensions.TypedDict):
     type: typing.Literal["chunk"]
     data: str
     step_time: float
+    flush_id: typing_extensions.NotRequired[FlushId]
     context_id: typing_extensions.NotRequired[ContextId]
     status_code: int
     done: bool
cartesia/tts/requests/web_socket_tts_request.py CHANGED
@@ -19,6 +19,7 @@ class WebSocketTtsRequestParams(typing_extensions.TypedDict):
     duration: typing_extensions.NotRequired[int]
     language: typing_extensions.NotRequired[str]
     add_timestamps: typing_extensions.NotRequired[bool]
+    use_original_timestamps: typing_extensions.NotRequired[bool]
     add_phoneme_timestamps: typing_extensions.NotRequired[bool]
     continue_: typing_extensions.NotRequired[typing_extensions.Annotated[bool, FieldMetadata(alias="continue")]]
     context_id: typing_extensions.NotRequired[str]
cartesia/tts/types/emotion.py CHANGED
@@ -6,22 +6,27 @@ Emotion = typing.Union[
     typing.Literal[
         "anger:lowest",
         "anger:low",
+        "anger",
         "anger:high",
         "anger:highest",
         "positivity:lowest",
         "positivity:low",
+        "positivity",
         "positivity:high",
         "positivity:highest",
         "surprise:lowest",
         "surprise:low",
+        "surprise",
         "surprise:high",
         "surprise:highest",
         "sadness:lowest",
         "sadness:low",
+        "sadness",
         "sadness:high",
         "sadness:highest",
         "curiosity:lowest",
         "curiosity:low",
+        "curiosity",
         "curiosity:high",
         "curiosity:highest",
     ],
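Each emotion tag can now also be used without an intensity suffix. A small sketch of the widened literal type (module path taken from the file list above):

    from cartesia.tts.types.emotion import Emotion

    # Both spellings are valid members of the Emotion union as of 2.0.0b7.
    plain: Emotion = "curiosity"
    scaled: Emotion = "curiosity:highest"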
cartesia/tts/types/generation_request.py CHANGED
@@ -56,6 +56,11 @@ class GenerationRequest(UniversalBaseModel):
     Whether to return phoneme-level timestamps.
     """
 
+    use_original_timestamps: typing.Optional[bool] = pydantic.Field(default=None)
+    """
+    Whether to use the original transcript for timestamps.
+    """
+
     if IS_PYDANTIC_V2:
         model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True)  # type: ignore # Pydantic v2
     else:
cartesia/tts/types/web_socket_chunk_response.py CHANGED
@@ -1,14 +1,16 @@
 # This file was auto-generated by Fern from our API Definition.
 
 from .web_socket_base_response import WebSocketBaseResponse
-from ...core.pydantic_utilities import IS_PYDANTIC_V2
 import typing
+from .flush_id import FlushId
+from ...core.pydantic_utilities import IS_PYDANTIC_V2
 import pydantic
 
 
 class WebSocketChunkResponse(WebSocketBaseResponse):
     data: str
     step_time: float
+    flush_id: typing.Optional[FlushId] = None
 
     if IS_PYDANTIC_V2:
         model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True)  # type: ignore # Pydantic v2
cartesia/tts/types/web_socket_response.py CHANGED
@@ -3,10 +3,10 @@
 from __future__ import annotations
 from ...core.pydantic_utilities import UniversalBaseModel
 import typing
+from .flush_id import FlushId
 from .context_id import ContextId
 from ...core.pydantic_utilities import IS_PYDANTIC_V2
 import pydantic
-from .flush_id import FlushId
 from .word_timestamps import WordTimestamps
 from .phoneme_timestamps import PhonemeTimestamps
 
@@ -15,6 +15,7 @@ class WebSocketResponse_Chunk(UniversalBaseModel):
     type: typing.Literal["chunk"] = "chunk"
     data: str
     step_time: float
+    flush_id: typing.Optional[FlushId] = None
     context_id: typing.Optional[ContextId] = None
     status_code: int
     done: bool
cartesia/tts/types/web_socket_tts_output.py CHANGED
@@ -7,11 +7,13 @@ import pydantic
 from ...core.pydantic_utilities import IS_PYDANTIC_V2, UniversalBaseModel
 from .context_id import ContextId
 from .flush_id import FlushId
+from .phoneme_timestamps import PhonemeTimestamps
 from .word_timestamps import WordTimestamps
 
 
 class WebSocketTtsOutput(UniversalBaseModel):
     word_timestamps: typing.Optional[WordTimestamps] = None
+    phoneme_timestamps: typing.Optional[PhonemeTimestamps] = None
     audio: typing.Optional[bytes] = None
     context_id: typing.Optional[ContextId] = None
     flush_id: typing.Optional[FlushId] = None
cartesia/tts/types/web_socket_tts_request.py CHANGED
@@ -22,6 +22,7 @@ class WebSocketTtsRequest(UniversalBaseModel):
     duration: typing.Optional[int] = None
     language: typing.Optional[str] = None
     add_timestamps: typing.Optional[bool] = None
+    use_original_timestamps: typing.Optional[bool] = None
     add_phoneme_timestamps: typing.Optional[bool] = None
     continue_: typing_extensions.Annotated[typing.Optional[bool], FieldMetadata(alias="continue")] = None
     context_id: typing.Optional[str] = None
  context_id: typing.Optional[str] = None