cartesia 2.0.0b2__py3-none-any.whl → 2.0.0b8__py3-none-any.whl

This diff represents the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in the public registry.
Files changed (41)
  1. cartesia/__init__.py +10 -0
  2. cartesia/base_client.py +0 -4
  3. cartesia/core/__init__.py +3 -0
  4. cartesia/core/client_wrapper.py +2 -2
  5. cartesia/core/pagination.py +88 -0
  6. cartesia/infill/client.py +4 -4
  7. cartesia/tts/_async_websocket.py +53 -1
  8. cartesia/tts/_websocket.py +52 -3
  9. cartesia/tts/client.py +4 -4
  10. cartesia/tts/requests/generation_request.py +5 -0
  11. cartesia/tts/requests/web_socket_chunk_response.py +3 -0
  12. cartesia/tts/requests/web_socket_response.py +2 -1
  13. cartesia/tts/requests/web_socket_tts_request.py +1 -0
  14. cartesia/tts/types/emotion.py +5 -0
  15. cartesia/tts/types/generation_request.py +5 -0
  16. cartesia/tts/types/web_socket_chunk_response.py +3 -1
  17. cartesia/tts/types/web_socket_response.py +2 -1
  18. cartesia/tts/types/web_socket_tts_output.py +2 -0
  19. cartesia/tts/types/web_socket_tts_request.py +1 -0
  20. cartesia/tts/utils/constants.py +2 -2
  21. cartesia/voice_changer/requests/streaming_response.py +2 -0
  22. cartesia/voice_changer/types/streaming_response.py +2 -0
  23. cartesia/voices/__init__.py +10 -0
  24. cartesia/voices/client.py +209 -44
  25. cartesia/voices/requests/__init__.py +2 -0
  26. cartesia/voices/requests/get_voices_response.py +24 -0
  27. cartesia/voices/requests/localize_dialect.py +4 -1
  28. cartesia/voices/requests/localize_voice_request.py +15 -2
  29. cartesia/voices/requests/voice.py +13 -9
  30. cartesia/voices/types/__init__.py +8 -0
  31. cartesia/voices/types/gender_presentation.py +5 -0
  32. cartesia/voices/types/get_voices_response.py +34 -0
  33. cartesia/voices/types/localize_dialect.py +4 -1
  34. cartesia/voices/types/localize_french_dialect.py +5 -0
  35. cartesia/voices/types/localize_voice_request.py +16 -3
  36. cartesia/voices/types/voice.py +13 -9
  37. cartesia/voices/types/voice_expand_options.py +5 -0
  38. {cartesia-2.0.0b2.dist-info → cartesia-2.0.0b8.dist-info}/METADATA +149 -73
  39. {cartesia-2.0.0b2.dist-info → cartesia-2.0.0b8.dist-info}/RECORD +40 -35
  40. cartesia/datasets/client.py +0 -392
  41. {cartesia-2.0.0b2.dist-info → cartesia-2.0.0b8.dist-info}/WHEEL +0 -0
cartesia/__init__.py CHANGED
@@ -121,11 +121,15 @@ from .voices import (
  EmbeddingSpecifier,
  EmbeddingSpecifierParams,
  Gender,
+ GenderPresentation,
+ GetVoicesResponse,
+ GetVoicesResponseParams,
  IdSpecifier,
  IdSpecifierParams,
  LocalizeDialect,
  LocalizeDialectParams,
  LocalizeEnglishDialect,
+ LocalizeFrenchDialect,
  LocalizePortugueseDialect,
  LocalizeSpanishDialect,
  LocalizeTargetLanguage,
@@ -138,6 +142,7 @@ from .voices import (
  UpdateVoiceRequest,
  UpdateVoiceRequestParams,
  Voice,
+ VoiceExpandOptions,
  VoiceId,
  VoiceMetadata,
  VoiceMetadataParams,
@@ -175,13 +180,17 @@ __all__ = [
  "FilePurpose",
  "FlushId",
  "Gender",
+ "GenderPresentation",
  "GenerationRequest",
  "GenerationRequestParams",
+ "GetVoicesResponse",
+ "GetVoicesResponseParams",
  "IdSpecifier",
  "IdSpecifierParams",
  "LocalizeDialect",
  "LocalizeDialectParams",
  "LocalizeEnglishDialect",
+ "LocalizeFrenchDialect",
  "LocalizePortugueseDialect",
  "LocalizeSpanishDialect",
  "LocalizeTargetLanguage",
@@ -235,6 +244,7 @@ __all__ = [
  "UpdateVoiceRequest",
  "UpdateVoiceRequestParams",
  "Voice",
+ "VoiceExpandOptions",
  "VoiceId",
  "VoiceMetadata",
  "VoiceMetadataParams",
cartesia/base_client.py CHANGED
@@ -5,14 +5,12 @@ from .environment import CartesiaEnvironment
  import httpx
  from .core.client_wrapper import SyncClientWrapper
  from .api_status.client import ApiStatusClient
- from .datasets.client import DatasetsClient
  from .infill.client import InfillClient
  from .tts.client import TtsClient
  from .voice_changer.client import VoiceChangerClient
  from .voices.client import VoicesClient
  from .core.client_wrapper import AsyncClientWrapper
  from .api_status.client import AsyncApiStatusClient
- from .datasets.client import AsyncDatasetsClient
  from .infill.client import AsyncInfillClient
  from .tts.client import AsyncTtsClient
  from .voice_changer.client import AsyncVoiceChangerClient
@@ -78,7 +76,6 @@ class BaseCartesia:
  timeout=_defaulted_timeout,
  )
  self.api_status = ApiStatusClient(client_wrapper=self._client_wrapper)
- self.datasets = DatasetsClient(client_wrapper=self._client_wrapper)
  self.infill = InfillClient(client_wrapper=self._client_wrapper)
  self.tts = TtsClient(client_wrapper=self._client_wrapper)
  self.voice_changer = VoiceChangerClient(client_wrapper=self._client_wrapper)
@@ -144,7 +141,6 @@ class AsyncBaseCartesia:
  timeout=_defaulted_timeout,
  )
  self.api_status = AsyncApiStatusClient(client_wrapper=self._client_wrapper)
- self.datasets = AsyncDatasetsClient(client_wrapper=self._client_wrapper)
  self.infill = AsyncInfillClient(client_wrapper=self._client_wrapper)
  self.tts = AsyncTtsClient(client_wrapper=self._client_wrapper)
  self.voice_changer = AsyncVoiceChangerClient(client_wrapper=self._client_wrapper)
cartesia/core/__init__.py CHANGED
@@ -6,6 +6,7 @@ from .datetime_utils import serialize_datetime
  from .file import File, convert_file_dict_to_httpx_tuples, with_content_type
  from .http_client import AsyncHttpClient, HttpClient
  from .jsonable_encoder import jsonable_encoder
+ from .pagination import AsyncPager, SyncPager
  from .pydantic_utilities import (
  IS_PYDANTIC_V2,
  UniversalBaseModel,
@@ -24,6 +25,7 @@ __all__ = [
  "ApiError",
  "AsyncClientWrapper",
  "AsyncHttpClient",
+ "AsyncPager",
  "BaseClientWrapper",
  "FieldMetadata",
  "File",
@@ -31,6 +33,7 @@ __all__ = [
  "IS_PYDANTIC_V2",
  "RequestOptions",
  "SyncClientWrapper",
+ "SyncPager",
  "UniversalBaseModel",
  "UniversalRootModel",
  "convert_and_respect_annotation_metadata",
cartesia/core/client_wrapper.py CHANGED
@@ -16,10 +16,10 @@ class BaseClientWrapper:
  headers: typing.Dict[str, str] = {
  "X-Fern-Language": "Python",
  "X-Fern-SDK-Name": "cartesia",
- "X-Fern-SDK-Version": "2.0.0b2",
+ "X-Fern-SDK-Version": "2.0.0b8",
  }
  headers["X-API-Key"] = self.api_key
- headers["Cartesia-Version"] = "2024-06-10"
+ headers["Cartesia-Version"] = "2024-11-13"
  return headers

  def get_base_url(self) -> str:
cartesia/core/pagination.py ADDED
@@ -0,0 +1,88 @@
+ # This file was auto-generated by Fern from our API Definition.
+
+ import typing
+
+ from typing_extensions import Self
+
+ import pydantic
+
+ # Generic to represent the underlying type of the results within a page
+ T = typing.TypeVar("T")
+
+
+ # SDKs implement a Page ABC per-pagination request, the endpoint then returns a pager that wraps this type
+ # for example, an endpoint will return SyncPager[UserPage] where UserPage implements the Page ABC. ex:
+ #
+ # SyncPager<InnerListType>(
+ # has_next=response.list_metadata.after is not None,
+ # items=response.data,
+ # # This should be the outer function that returns the SyncPager again
+ # get_next=lambda: list(..., cursor: response.cursor) (or list(..., offset: offset + 1))
+ # )
+ class BasePage(pydantic.BaseModel, typing.Generic[T]):
+ has_next: bool
+ items: typing.Optional[typing.List[T]]
+
+
+ class SyncPage(BasePage[T], typing.Generic[T]):
+ get_next: typing.Optional[typing.Callable[[], typing.Optional[Self]]]
+
+
+ class AsyncPage(BasePage[T], typing.Generic[T]):
+ get_next: typing.Optional[typing.Callable[[], typing.Awaitable[typing.Optional[Self]]]]
+
+
+ # ----------------------------
+
+
+ class SyncPager(SyncPage[T], typing.Generic[T]):
+ # Here we type ignore the iterator to avoid a mypy error
+ # caused by the type conflict with Pydanitc's __iter__ method
+ # brought in by extending the base model
+ def __iter__(self) -> typing.Iterator[T]: # type: ignore
+ for page in self.iter_pages():
+ if page.items is not None:
+ for item in page.items:
+ yield item
+
+ def iter_pages(self) -> typing.Iterator[SyncPage[T]]:
+ page: typing.Union[SyncPager[T], None] = self
+ while True:
+ if page is not None:
+ yield page
+ if page.has_next and page.get_next is not None:
+ page = page.get_next()
+ if page is None or page.items is None or len(page.items) == 0:
+ return
+ else:
+ return
+ else:
+ return
+
+ def next_page(self) -> typing.Optional[SyncPage[T]]:
+ return self.get_next() if self.get_next is not None else None
+
+
+ class AsyncPager(AsyncPage[T], typing.Generic[T]):
+ async def __aiter__(self) -> typing.AsyncIterator[T]: # type: ignore
+ async for page in self.iter_pages():
+ if page.items is not None:
+ for item in page.items:
+ yield item
+
+ async def iter_pages(self) -> typing.AsyncIterator[AsyncPage[T]]:
+ page: typing.Union[AsyncPager[T], None] = self
+ while True:
+ if page is not None:
+ yield page
+ if page is not None and page.has_next and page.get_next is not None:
+ page = await page.get_next()
+ if page is None or page.items is None or len(page.items) == 0:
+ return
+ else:
+ return
+ else:
+ return
+
+ async def next_page(self) -> typing.Optional[AsyncPage[T]]:
+ return await self.get_next() if self.get_next is not None else None
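
The comment block at the top of the new module sketches how generated endpoints construct pagers. From the caller's side, a minimal usage sketch, assuming `client.voices.list()` is the endpoint that now returns a `SyncPager` of `Voice` objects (suggested by the `voices/client.py` and `GetVoicesResponse` entries above, but not shown in this diff):

from cartesia import Cartesia

client = Cartesia(api_key="YOUR_API_KEY")

# Assumed paginated endpoint returning SyncPager[Voice].
pager = client.voices.list()

# Item-level iteration transparently walks page boundaries via get_next.
for voice in pager:
    print(voice.id, voice.name)

# Page-level access is also available through pager.iter_pages() and pager.next_page().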
cartesia/infill/client.py CHANGED
@@ -42,7 +42,7 @@ class InfillClient:

  **The cost is 1 credit per character of the infill text plus a fixed cost of 300 credits.**

- Only the `sonic-preview` model is supported for infill at this time.
+ Infilling is only available on `sonic-2` at this time.

  At least one of `left_audio` or `right_audio` must be provided.

@@ -117,7 +117,7 @@
  api_key="YOUR_API_KEY",
  )
  client.infill.bytes(
- model_id="sonic-preview",
+ model_id="sonic-2",
  language="en",
  transcript="middle segment",
  voice_id="694f9389-aac1-45b6-b726-9d9369183238",
@@ -189,7 +189,7 @@ class AsyncInfillClient:

  **The cost is 1 credit per character of the infill text plus a fixed cost of 300 credits.**

- Only the `sonic-preview` model is supported for infill at this time.
+ Infilling is only available on `sonic-2` at this time.

  At least one of `left_audio` or `right_audio` must be provided.

@@ -269,7 +269,7 @@

  async def main() -> None:
  await client.infill.bytes(
- model_id="sonic-preview",
+ model_id="sonic-2",
  language="en",
  transcript="middle segment",
  voice_id="694f9389-aac1-45b6-b726-9d9369183238",
cartesia/tts/_async_websocket.py CHANGED
@@ -17,6 +17,7 @@ from cartesia.tts.types (
  WebSocketResponse_FlushDone,
  WebSocketTtsOutput,
  WordTimestamps,
+ PhonemeTimestamps,
  )

  from ..core.pydantic_utilities import parse_obj_as
@@ -67,6 +68,8 @@ class _AsyncTTSContext:
  language: Optional[str] = None,
  stream: bool = True,
  add_timestamps: bool = False,
+ add_phoneme_timestamps: bool = False,
+ use_original_timestamps: bool = False,
  continue_: bool = False,
  flush: bool = False,
  ) -> None:
@@ -102,6 +105,10 @@ class _AsyncTTSContext:
  request_body["stream"] = stream
  if add_timestamps:
  request_body["add_timestamps"] = add_timestamps
+ if add_phoneme_timestamps:
+ request_body["add_phoneme_timestamps"] = add_phoneme_timestamps
+ if use_original_timestamps:
+ request_body["use_original_timestamps"] = use_original_timestamps
  if continue_:
  request_body["continue"] = continue_
  if flush:
@@ -229,6 +236,11 @@ class _AsyncTTSContext:
  finally:
  self._close()

+ async def cancel(self):
+ """Cancel the context. This will stop the generation of audio for this context."""
+ await self._websocket.websocket.send_json({"context_id": self._context_id, "cancel": True})
+ self._close()
+
  def _close(self) -> None:
  """Closes the context. Automatically called when a done message is received for this context."""
  self._websocket._remove_context(self._context_id)
@@ -297,7 +309,26 @@ class AsyncTtsWebsocket(TtsWebsocket):
  try:
  self.websocket = await session.ws_connect(url)
  except Exception as e:
- raise RuntimeError(f"Failed to connect to WebSocket at {url}. {e}")
+ # Extract status code if available
+ status_code = None
+ error_message = str(e)
+
+ if hasattr(e, 'status') and e.status is not None:
+ status_code = e.status
+
+ # Create a meaningful error message based on status code
+ if status_code == 402:
+ error_message = "Payment required. Your API key may have insufficient credits or permissions."
+ elif status_code == 401:
+ error_message = "Unauthorized. Please check your API key."
+ elif status_code == 403:
+ error_message = "Forbidden. You don't have permission to access this resource."
+ elif status_code == 404:
+ error_message = "Not found. The requested resource doesn't exist."
+
+ raise RuntimeError(f"Failed to connect to WebSocket.\nStatus: {status_code}. Error message: {error_message}")
+ else:
+ raise RuntimeError(f"Failed to connect to WebSocket at {url}. {e}")

  def _is_websocket_closed(self):
  return self.websocket.closed
@@ -338,6 +369,8 @@ class AsyncTtsWebsocket(TtsWebsocket):
  language: Optional[str] = None,
  stream: bool = True,
  add_timestamps: bool = False,
+ add_phoneme_timestamps: bool = False,
+ use_original_timestamps: bool = False,
  ):
  """See :meth:`_WebSocket.send` for details."""
  if context_id is None:
@@ -355,6 +388,8 @@ class AsyncTtsWebsocket(TtsWebsocket):
  language=language,
  continue_=False,
  add_timestamps=add_timestamps,
+ add_phoneme_timestamps=add_phoneme_timestamps,
+ use_original_timestamps=use_original_timestamps,
  )

  generator = ctx.receive()
@@ -366,6 +401,9 @@ class AsyncTtsWebsocket(TtsWebsocket):
  words: typing.List[str] = []
  start: typing.List[float] = []
  end: typing.List[float] = []
+ phonemes: typing.List[str] = []
+ phoneme_start: typing.List[float] = []
+ phoneme_end: typing.List[float] = []
  async for chunk in generator:
  if chunk.audio is not None:
  chunks.append(chunk.audio)
@@ -374,6 +412,11 @@
  words.extend(chunk.word_timestamps.words)
  start.extend(chunk.word_timestamps.start)
  end.extend(chunk.word_timestamps.end)
+ if add_phoneme_timestamps and chunk.phoneme_timestamps is not None:
+ if chunk.phoneme_timestamps is not None:
+ phonemes.extend(chunk.phoneme_timestamps.phonemes)
+ phoneme_start.extend(chunk.phoneme_timestamps.start)
+ phoneme_end.extend(chunk.phoneme_timestamps.end)

  return WebSocketTtsOutput(
  audio=b"".join(chunks), # type: ignore
@@ -387,6 +430,15 @@
  if add_timestamps
  else None
  ),
+ phoneme_timestamps=(
+ PhonemeTimestamps(
+ phonemes=phonemes,
+ start=phoneme_start,
+ end=phoneme_end,
+ )
+ if add_phoneme_timestamps
+ else None
+ ),
  )

  async def _process_responses(self):
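
Taken together, these `_async_websocket.py` changes thread phoneme-level timestamp options through context sends and add a `cancel()` escape hatch. A hedged usage sketch (the voice ID and output format are reused from examples elsewhere in this diff; the non-flag `send` parameters are assumed to mirror the context signature shown above):

import asyncio

from cartesia import AsyncCartesia


async def main() -> None:
    client = AsyncCartesia(api_key="YOUR_API_KEY")
    ws = await client.tts.websocket()
    ctx = ws.context()

    # New keyword arguments surfaced in this release.
    await ctx.send(
        model_id="sonic-2",
        transcript="Hello, world!",
        voice={"mode": "id", "id": "694f9389-aac1-45b6-b726-9d9369183238"},
        output_format={"container": "raw", "encoding": "pcm_f32le", "sample_rate": 44100},
        add_timestamps=True,
        add_phoneme_timestamps=True,
        use_original_timestamps=True,
    )

    received = 0
    async for chunk in ctx.receive():
        if chunk.audio is not None:
            received += len(chunk.audio)
        if received > 1_000_000:
            # New in 2.0.0b8: stop generation for this context early.
            await ctx.cancel()
            break

    await ws.close()


asyncio.run(main())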
cartesia/tts/_websocket.py CHANGED
@@ -26,6 +26,7 @@ from cartesia.tts.types (
  WebSocketResponse_Timestamps,
  WebSocketTtsOutput,
  WordTimestamps,
+ PhonemeTimestamps,
  )

  from ..core.pydantic_utilities import parse_obj_as
@@ -58,7 +59,7 @@ class _TTSContext:
  self,
  *,
  model_id: str,
- transcript: str,
+ transcript: typing.Generator[str, None, None],
  output_format: OutputFormatParams,
  voice: TtsRequestVoiceSpecifierParams,
  context_id: Optional[str] = None,
@@ -66,6 +67,8 @@ class _TTSContext:
  language: Optional[str] = None,
  stream: bool = True,
  add_timestamps: bool = False,
+ add_phoneme_timestamps: bool = False,
+ use_original_timestamps: bool = False,
  ) -> Generator[bytes, None, None]:
  """Send audio generation requests to the WebSocket and yield responses.

@@ -101,6 +104,10 @@
  request_body["stream"] = stream
  if add_timestamps:
  request_body["add_timestamps"] = add_timestamps
+ if add_phoneme_timestamps:
+ request_body["add_phoneme_timestamps"] = add_phoneme_timestamps
+ if use_original_timestamps:
+ request_body["use_original_timestamps"] = use_original_timestamps

  if (
  "context_id" in request_body
@@ -235,7 +242,7 @@ class TtsWebsocket:
  Usage:
  >>> ws = client.tts.websocket()
  >>> generation_request = GenerationRequest(
- ... model_id="sonic-english",
+ ... model_id="sonic-2",
  ... transcript="Hello world!",
  ... voice_embedding=embedding
  ... output_format={"container": "raw", "encoding": "pcm_f32le", "sample_rate": 44100}
@@ -281,7 +288,26 @@ class TtsWebsocket:
  f"{self.ws_url}/{route}?api_key={self.api_key}&cartesia_version={self.cartesia_version}"
  )
  except Exception as e:
- raise RuntimeError(f"Failed to connect to WebSocket. {e}")
+ # Extract status code if available
+ status_code = None
+ error_message = str(e)
+
+ if hasattr(e, 'status') and e.status is not None:
+ status_code = e.status
+
+ # Create a meaningful error message based on status code
+ if status_code == 402:
+ error_message = "Payment required. Your API key may have insufficient credits or permissions."
+ elif status_code == 401:
+ error_message = "Unauthorized. Please check your API key."
+ elif status_code == 403:
+ error_message = "Forbidden. You don't have permission to access this resource."
+ elif status_code == 404:
+ error_message = "Not found. The requested resource doesn't exist."
+
+ raise RuntimeError(f"Failed to connect to WebSocket.\nStatus: {status_code}. Error message: {error_message}")
+ else:
+ raise RuntimeError(f"Failed to connect to WebSocket. {e}")

  def _is_websocket_closed(self):
  return self.websocket.socket.fileno() == -1
@@ -310,6 +336,8 @@ class TtsWebsocket:
  out["audio"] = base64.b64decode(response.data)
  elif isinstance(response, WebSocketResponse_Timestamps):
  out["word_timestamps"] = response.word_timestamps # type: ignore
+ elif isinstance(response, WebSocketResponse_PhonemeTimestamps):
+ out["phoneme_timestamps"] = response.phoneme_timestamps # type: ignore
  elif include_flush_id and isinstance(response, WebSocketResponse_FlushDone):
  out["flush_done"] = response.flush_done # type: ignore
  out["flush_id"] = response.flush_id # type: ignore
@@ -331,6 +359,8 @@ class TtsWebsocket:
  language: Optional[str] = None,
  stream: bool = True,
  add_timestamps: bool = False,
+ add_phoneme_timestamps: bool = False,
+ use_original_timestamps: bool = False,
  ):
  """Send a request to the WebSocket to generate audio.

@@ -360,6 +390,8 @@
  "language": language,
  "stream": stream,
  "add_timestamps": add_timestamps,
+ "add_phoneme_timestamps": add_phoneme_timestamps,
+ "use_original_timestamps": use_original_timestamps,
  }
  generator = self._websocket_generator(request_body)

@@ -370,6 +402,9 @@
  words: typing.List[str] = []
  start: typing.List[float] = []
  end: typing.List[float] = []
+ phonemes: typing.List[str] = []
+ phoneme_start: typing.List[float] = []
+ phoneme_end: typing.List[float] = []
  for chunk in generator:
  if chunk.audio is not None:
  chunks.append(chunk.audio)
@@ -378,6 +413,11 @@
  words.extend(chunk.word_timestamps.words)
  start.extend(chunk.word_timestamps.start)
  end.extend(chunk.word_timestamps.end)
+ if add_phoneme_timestamps and chunk.phoneme_timestamps is not None:
+ if chunk.phoneme_timestamps is not None:
+ phonemes.extend(chunk.phoneme_timestamps.phonemes)
+ phoneme_start.extend(chunk.phoneme_timestamps.start)
+ phoneme_end.extend(chunk.phoneme_timestamps.end)

  return WebSocketTtsOutput(
  audio=b"".join(chunks), # type: ignore
@@ -391,6 +431,15 @@
  if add_timestamps
  else None
  ),
+ phoneme_timestamps=(
+ PhonemeTimestamps(
+ phonemes=phonemes,
+ start=phoneme_start,
+ end=phoneme_end,
+ )
+ if add_phoneme_timestamps
+ else None
+ ),
  )

  def _websocket_generator(self, request_body: Dict[str, Any]):
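
The synchronous `_websocket.py` gains the same flags on `TtsWebsocket.send`, which folds per-chunk timestamps into the aggregated `WebSocketTtsOutput` when `stream=False`. A minimal sketch, assuming the leading `send` parameters mirror the context-level signature shown above:

from cartesia import Cartesia

client = Cartesia(api_key="YOUR_API_KEY")
ws = client.tts.websocket()

output = ws.send(
    model_id="sonic-2",
    transcript="Hello, world!",
    voice={"mode": "id", "id": "694f9389-aac1-45b6-b726-9d9369183238"},
    output_format={"container": "raw", "encoding": "pcm_f32le", "sample_rate": 44100},
    stream=False,  # aggregate chunks into a single WebSocketTtsOutput
    add_timestamps=True,
    add_phoneme_timestamps=True,
)

# phoneme_timestamps is only populated when add_phoneme_timestamps was requested.
if output.phoneme_timestamps is not None:
    print(output.phoneme_timestamps.phonemes[:10])

ws.close()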
cartesia/tts/client.py CHANGED
@@ -67,7 +67,7 @@ class TtsClient:
  api_key="YOUR_API_KEY",
  )
  client.tts.bytes(
- model_id="sonic",
+ model_id="sonic-2",
  transcript="Hello, world!",
  voice={"mode": "id", "id": "694f9389-aac1-45b6-b726-9d9369183238"},
  language="en",
@@ -152,7 +152,7 @@ class TtsClient:
  api_key="YOUR_API_KEY",
  )
  response = client.tts.sse(
- model_id="sonic",
+ model_id="sonic-2",
  transcript="Hello, world!",
  voice={"mode": "id", "id": "694f9389-aac1-45b6-b726-9d9369183238"},
  language="en",
@@ -258,7 +258,7 @@ class AsyncTtsClient:

  async def main() -> None:
  await client.tts.bytes(
- model_id="sonic",
+ model_id="sonic-2",
  transcript="Hello, world!",
  voice={"mode": "id", "id": "694f9389-aac1-45b6-b726-9d9369183238"},
  language="en",
@@ -351,7 +351,7 @@ class AsyncTtsClient:

  async def main() -> None:
  response = await client.tts.sse(
- model_id="sonic",
+ model_id="sonic-2",
  transcript="Hello, world!",
  voice={"mode": "id", "id": "694f9389-aac1-45b6-b726-9d9369183238"},
  language="en",
cartesia/tts/requests/generation_request.py CHANGED
@@ -51,3 +51,8 @@ class GenerationRequestParams(typing_extensions.TypedDict):
  """
  Whether to return phoneme-level timestamps.
  """
+
+ use_original_timestamps: typing_extensions.NotRequired[bool]
+ """
+ Whether to use the original transcript for timestamps.
+ """
cartesia/tts/requests/web_socket_chunk_response.py CHANGED
@@ -1,8 +1,11 @@
  # This file was auto-generated by Fern from our API Definition.

  from .web_socket_base_response import WebSocketBaseResponseParams
+ import typing_extensions
+ from ..types.flush_id import FlushId


  class WebSocketChunkResponseParams(WebSocketBaseResponseParams):
  data: str
  step_time: float
+ flush_id: typing_extensions.NotRequired[FlushId]
cartesia/tts/requests/web_socket_response.py CHANGED
@@ -4,8 +4,8 @@ from __future__ import annotations
  import typing_extensions
  import typing
  import typing_extensions
- from ..types.context_id import ContextId
  from ..types.flush_id import FlushId
+ from ..types.context_id import ContextId
  from .word_timestamps import WordTimestampsParams
  from .phoneme_timestamps import PhonemeTimestampsParams

@@ -14,6 +14,7 @@ class WebSocketResponse_ChunkParams(typing_extensions.TypedDict):
  type: typing.Literal["chunk"]
  data: str
  step_time: float
+ flush_id: typing_extensions.NotRequired[FlushId]
  context_id: typing_extensions.NotRequired[ContextId]
  status_code: int
  done: bool
cartesia/tts/requests/web_socket_tts_request.py CHANGED
@@ -19,6 +19,7 @@ class WebSocketTtsRequestParams(typing_extensions.TypedDict):
  duration: typing_extensions.NotRequired[int]
  language: typing_extensions.NotRequired[str]
  add_timestamps: typing_extensions.NotRequired[bool]
+ use_original_timestamps: typing_extensions.NotRequired[bool]
  add_phoneme_timestamps: typing_extensions.NotRequired[bool]
  continue_: typing_extensions.NotRequired[typing_extensions.Annotated[bool, FieldMetadata(alias="continue")]]
  context_id: typing_extensions.NotRequired[str]
cartesia/tts/types/emotion.py CHANGED
@@ -6,22 +6,27 @@ Emotion = typing.Union[
  typing.Literal[
  "anger:lowest",
  "anger:low",
+ "anger",
  "anger:high",
  "anger:highest",
  "positivity:lowest",
  "positivity:low",
+ "positivity",
  "positivity:high",
  "positivity:highest",
  "surprise:lowest",
  "surprise:low",
+ "surprise",
  "surprise:high",
  "surprise:highest",
  "sadness:lowest",
  "sadness:low",
+ "sadness",
  "sadness:high",
  "sadness:highest",
  "curiosity:lowest",
  "curiosity:low",
+ "curiosity",
  "curiosity:high",
  "curiosity:highest",
  ],
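
`Emotion` now accepts the bare emotion names ("anger", "positivity", "surprise", "sadness", "curiosity") in addition to the existing `:lowest`/`:low`/`:high`/`:highest` variants. A small type-level sketch, assuming `Emotion` is importable from `cartesia.tts.types` like the other TTS types referenced in this diff (these values are typically supplied through a voice's experimental controls on a TTS request):

import typing

from cartesia.tts.types import Emotion

# Bare names and intensity-suffixed names both type-check as Emotion now.
emotions: typing.List[Emotion] = ["positivity", "sadness:low", "curiosity:highest"]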
cartesia/tts/types/generation_request.py CHANGED
@@ -56,6 +56,11 @@ class GenerationRequest(UniversalBaseModel):
  Whether to return phoneme-level timestamps.
  """

+ use_original_timestamps: typing.Optional[bool] = pydantic.Field(default=None)
+ """
+ Whether to use the original transcript for timestamps.
+ """
+
  if IS_PYDANTIC_V2:
  model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2
  else:
cartesia/tts/types/web_socket_chunk_response.py CHANGED
@@ -1,14 +1,16 @@
  # This file was auto-generated by Fern from our API Definition.

  from .web_socket_base_response import WebSocketBaseResponse
- from ...core.pydantic_utilities import IS_PYDANTIC_V2
  import typing
+ from .flush_id import FlushId
+ from ...core.pydantic_utilities import IS_PYDANTIC_V2
  import pydantic


  class WebSocketChunkResponse(WebSocketBaseResponse):
  data: str
  step_time: float
+ flush_id: typing.Optional[FlushId] = None

  if IS_PYDANTIC_V2:
  model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2
cartesia/tts/types/web_socket_response.py CHANGED
@@ -3,10 +3,10 @@
  from __future__ import annotations
  from ...core.pydantic_utilities import UniversalBaseModel
  import typing
+ from .flush_id import FlushId
  from .context_id import ContextId
  from ...core.pydantic_utilities import IS_PYDANTIC_V2
  import pydantic
- from .flush_id import FlushId
  from .word_timestamps import WordTimestamps
  from .phoneme_timestamps import PhonemeTimestamps

@@ -15,6 +15,7 @@ class WebSocketResponse_Chunk(UniversalBaseModel):
  type: typing.Literal["chunk"] = "chunk"
  data: str
  step_time: float
+ flush_id: typing.Optional[FlushId] = None
  context_id: typing.Optional[ContextId] = None
  status_code: int
  done: bool
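
Chunk responses, both the request-side TypedDicts and the Pydantic models, now carry an optional `flush_id`, so audio chunks can be attributed to the flush that produced them. A hedged parsing sketch, assuming the `WebSocketResponse` union and `parse_obj_as` are importable the same way the SDK modules in this diff import them:

import typing

from cartesia.core.pydantic_utilities import parse_obj_as
from cartesia.tts.types import WebSocketResponse, WebSocketResponse_Chunk


def handle_message(raw: typing.Dict[str, typing.Any]) -> None:
    # Parse a raw WebSocket payload into the discriminated response union.
    response = typing.cast(
        WebSocketResponse,
        parse_obj_as(type_=WebSocketResponse, object_=raw),  # type: ignore
    )
    if isinstance(response, WebSocketResponse_Chunk):
        # flush_id is Optional; presumably set only for chunks produced after a
        # flush on a continuation context.
        if response.flush_id is not None:
            print(f"audio chunk for flush {response.flush_id}")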