cartesia 2.0.0b7__py3-none-any.whl → 2.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37)
  1. cartesia/__init__.py +15 -1
  2. cartesia/auth/__init__.py +13 -0
  3. cartesia/auth/client.py +159 -0
  4. cartesia/auth/requests/__init__.py +7 -0
  5. cartesia/auth/requests/token_grant.py +10 -0
  6. cartesia/auth/requests/token_request.py +17 -0
  7. cartesia/auth/requests/token_response.py +10 -0
  8. cartesia/auth/types/__init__.py +7 -0
  9. cartesia/auth/types/token_grant.py +22 -0
  10. cartesia/auth/types/token_request.py +28 -0
  11. cartesia/auth/types/token_response.py +22 -0
  12. cartesia/base_client.py +4 -0
  13. cartesia/core/client_wrapper.py +1 -1
  14. cartesia/tts/_async_websocket.py +8 -0
  15. cartesia/tts/_websocket.py +11 -0
  16. cartesia/tts/client.py +40 -4
  17. cartesia/tts/requests/generation_request.py +19 -1
  18. cartesia/tts/requests/tts_request.py +10 -1
  19. cartesia/tts/requests/web_socket_tts_request.py +3 -1
  20. cartesia/tts/types/generation_request.py +19 -1
  21. cartesia/tts/types/tts_request.py +10 -1
  22. cartesia/tts/types/web_socket_tts_request.py +3 -1
  23. cartesia/voices/__init__.py +6 -0
  24. cartesia/voices/client.py +208 -159
  25. cartesia/voices/requests/create_voice_request.py +2 -0
  26. cartesia/voices/requests/localize_dialect.py +6 -1
  27. cartesia/voices/requests/localize_voice_request.py +15 -2
  28. cartesia/voices/types/__init__.py +6 -0
  29. cartesia/voices/types/create_voice_request.py +2 -0
  30. cartesia/voices/types/localize_dialect.py +6 -1
  31. cartesia/voices/types/localize_french_dialect.py +5 -0
  32. cartesia/voices/types/localize_portuguese_dialect.py +5 -0
  33. cartesia/voices/types/localize_spanish_dialect.py +5 -0
  34. cartesia/voices/types/localize_voice_request.py +16 -3
  35. {cartesia-2.0.0b7.dist-info → cartesia-2.0.2.dist-info}/METADATA +68 -63
  36. {cartesia-2.0.0b7.dist-info → cartesia-2.0.2.dist-info}/RECORD +37 -24
  37. {cartesia-2.0.0b7.dist-info → cartesia-2.0.2.dist-info}/WHEEL +0 -0
@@ -10,7 +10,7 @@ from .output_format import OutputFormatParams
10
10
  class TtsRequestParams(typing_extensions.TypedDict):
11
11
  model_id: str
12
12
  """
13
- The ID of the model to use for the generation. See [Models](/build-with-sonic/models) for available models.
13
+ The ID of the model to use for the generation. See [Models](/build-with-cartesia/models) for available models.
14
14
  """
15
15
 
16
16
  transcript: str
@@ -22,3 +22,12 @@ class TtsRequestParams(typing_extensions.TypedDict):
22
22
  The maximum duration of the audio in seconds. You do not usually need to specify this.
23
23
  If the duration is not appropriate for the length of the transcript, the output audio may be truncated.
24
24
  """
25
+
26
+ text_cfg: typing_extensions.NotRequired[float]
27
+ """
28
+ The text [classifier-free guidance](https://arxiv.org/abs/2207.12598) value for the request.
29
+
30
+ Higher values cause the model to attend more to the text but speed up the generation. Lower values reduce the speaking rate but can increase the risk of hallucinations. The default value is `3.0`. For a slower speaking rate, we recommend values between `2.0` and `3.0`. Values are supported between `1.5` and `3.0`.
31
+
32
+ This parameter is only supported for `sonic-2` models.
33
+ """
@@ -10,7 +10,7 @@ from ...core.serialization import FieldMetadata
10
10
  class WebSocketTtsRequestParams(typing_extensions.TypedDict):
11
11
  model_id: str
12
12
  """
13
- The ID of the model to use for the generation. See [Models](/build-with-sonic/models) for available models.
13
+ The ID of the model to use for the generation. See [Models](/build-with-cartesia/models) for available models.
14
14
  """
15
15
 
16
16
  output_format: typing_extensions.NotRequired[OutputFormatParams]
@@ -23,3 +23,5 @@ class WebSocketTtsRequestParams(typing_extensions.TypedDict):
23
23
  add_phoneme_timestamps: typing_extensions.NotRequired[bool]
24
24
  continue_: typing_extensions.NotRequired[typing_extensions.Annotated[bool, FieldMetadata(alias="continue")]]
25
25
  context_id: typing_extensions.NotRequired[str]
26
+ max_buffer_delay_ms: typing_extensions.NotRequired[int]
27
+ text_cfg: typing_extensions.NotRequired[float]
@@ -15,7 +15,7 @@ from ...core.pydantic_utilities import IS_PYDANTIC_V2
15
15
  class GenerationRequest(UniversalBaseModel):
16
16
  model_id: str = pydantic.Field()
17
17
  """
18
- The ID of the model to use for the generation. See [Models](/build-with-sonic/models) for available models.
18
+ The ID of the model to use for the generation. See [Models](/build-with-cartesia/models) for available models.
19
19
  """
20
20
 
21
21
  transcript: typing.Optional[typing.Any] = pydantic.Field(default=None)
@@ -32,6 +32,15 @@ class GenerationRequest(UniversalBaseModel):
32
32
  If the duration is not appropriate for the length of the transcript, the output audio may be truncated.
33
33
  """
34
34
 
35
+ text_cfg: typing.Optional[float] = pydantic.Field(default=None)
36
+ """
37
+ The text [classifier-free guidance](https://arxiv.org/abs/2207.12598) value for the request.
38
+
39
+ Higher values cause the model to attend more to the text but speed up the generation. Lower values reduce the speaking rate but can increase the risk of hallucinations. The default value is `3.0`. For a slower speaking rate, we recommend values between `2.0` and `3.0`. Values are supported between `1.5` and `3.0`.
40
+
41
+ This parameter is only supported for `sonic-2` models.
42
+ """
43
+
35
44
  context_id: typing.Optional[ContextId] = None
36
45
  continue_: typing_extensions.Annotated[typing.Optional[bool], FieldMetadata(alias="continue")] = pydantic.Field(
37
46
  default=None
@@ -41,6 +50,15 @@ class GenerationRequest(UniversalBaseModel):
41
50
  If not specified, this defaults to `false`.
42
51
  """
43
52
 
53
+ max_buffer_delay_ms: typing.Optional[int] = pydantic.Field(default=None)
54
+ """
55
+ The maximum time in milliseconds to buffer text before starting generation. Values between [0, 1000]ms are supported. Defaults to 0 (no buffering).
56
+
57
+ When set, the model will buffer incoming text chunks until it's confident it has enough context to generate high-quality speech, or the buffer delay elapses, whichever comes first. Without this option set, the model will kick off generations immediately, ceding control of buffering to the user.
58
+
59
+ Use this to balance responsiveness with higher quality speech generation, which often benefits from having more context.
60
+ """
61
+
44
62
  flush: typing.Optional[bool] = pydantic.Field(default=None)
45
63
  """
46
64
  Whether to flush the context.
@@ -12,7 +12,7 @@ from ...core.pydantic_utilities import IS_PYDANTIC_V2
12
12
  class TtsRequest(UniversalBaseModel):
13
13
  model_id: str = pydantic.Field()
14
14
  """
15
- The ID of the model to use for the generation. See [Models](/build-with-sonic/models) for available models.
15
+ The ID of the model to use for the generation. See [Models](/build-with-cartesia/models) for available models.
16
16
  """
17
17
 
18
18
  transcript: str
@@ -25,6 +25,15 @@ class TtsRequest(UniversalBaseModel):
25
25
  If the duration is not appropriate for the length of the transcript, the output audio may be truncated.
26
26
  """
27
27
 
28
+ text_cfg: typing.Optional[float] = pydantic.Field(default=None)
29
+ """
30
+ The text [classifier-free guidance](https://arxiv.org/abs/2207.12598) value for the request.
31
+
32
+ Higher values cause the model to attend more to the text but speed up the generation. Lower values reduce the speaking rate but can increase the risk of hallucinations. The default value is `3.0`. For a slower speaking rate, we recommend values between `2.0` and `3.0`. Values are supported between `1.5` and `3.0`.
33
+
34
+ This parameter is only supported for `sonic-2` models.
35
+ """
36
+
28
37
  if IS_PYDANTIC_V2:
29
38
  model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2
30
39
  else:
@@ -13,7 +13,7 @@ from ...core.pydantic_utilities import IS_PYDANTIC_V2
13
13
  class WebSocketTtsRequest(UniversalBaseModel):
14
14
  model_id: str = pydantic.Field()
15
15
  """
16
- The ID of the model to use for the generation. See [Models](/build-with-sonic/models) for available models.
16
+ The ID of the model to use for the generation. See [Models](/build-with-cartesia/models) for available models.
17
17
  """
18
18
 
19
19
  output_format: typing.Optional[OutputFormat] = None
@@ -26,6 +26,8 @@ class WebSocketTtsRequest(UniversalBaseModel):
26
26
  add_phoneme_timestamps: typing.Optional[bool] = None
27
27
  continue_: typing_extensions.Annotated[typing.Optional[bool], FieldMetadata(alias="continue")] = None
28
28
  context_id: typing.Optional[str] = None
29
+ max_buffer_delay_ms: typing.Optional[int] = None
30
+ text_cfg: typing.Optional[float] = None
29
31
 
30
32
  if IS_PYDANTIC_V2:
31
33
  model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2
@@ -12,6 +12,9 @@ from .types import (
12
12
  IdSpecifier,
13
13
  LocalizeDialect,
14
14
  LocalizeEnglishDialect,
15
+ LocalizeFrenchDialect,
16
+ LocalizePortugueseDialect,
17
+ LocalizeSpanishDialect,
15
18
  LocalizeTargetLanguage,
16
19
  LocalizeVoiceRequest,
17
20
  MixVoiceSpecifier,
@@ -56,6 +59,9 @@ __all__ = [
56
59
  "LocalizeDialect",
57
60
  "LocalizeDialectParams",
58
61
  "LocalizeEnglishDialect",
62
+ "LocalizeFrenchDialect",
63
+ "LocalizePortugueseDialect",
64
+ "LocalizeSpanishDialect",
59
65
  "LocalizeTargetLanguage",
60
66
  "LocalizeVoiceRequest",
61
67
  "LocalizeVoiceRequestParams",