cartesia 2.0.5__py3-none-any.whl → 2.0.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55) hide show
  1. cartesia/__init__.py +22 -0
  2. cartesia/auth/client.py +8 -8
  3. cartesia/auth/requests/token_grant.py +7 -1
  4. cartesia/auth/requests/token_request.py +3 -3
  5. cartesia/auth/types/token_grant.py +7 -2
  6. cartesia/auth/types/token_request.py +3 -3
  7. cartesia/core/client_wrapper.py +1 -1
  8. cartesia/infill/client.py +0 -8
  9. cartesia/stt/__init__.py +6 -0
  10. cartesia/stt/_async_websocket.py +81 -72
  11. cartesia/stt/_websocket.py +42 -20
  12. cartesia/stt/client.py +450 -0
  13. cartesia/stt/requests/__init__.py +2 -0
  14. cartesia/stt/requests/streaming_transcription_response.py +2 -0
  15. cartesia/stt/requests/transcript_message.py +8 -1
  16. cartesia/stt/requests/transcription_response.py +8 -1
  17. cartesia/stt/requests/transcription_word.py +20 -0
  18. cartesia/stt/socket_client.py +52 -109
  19. cartesia/stt/types/__init__.py +4 -0
  20. cartesia/stt/types/streaming_transcription_response.py +2 -0
  21. cartesia/stt/types/stt_encoding.py +3 -1
  22. cartesia/stt/types/timestamp_granularity.py +5 -0
  23. cartesia/stt/types/transcript_message.py +7 -1
  24. cartesia/stt/types/transcription_response.py +7 -1
  25. cartesia/stt/types/transcription_word.py +32 -0
  26. cartesia/tts/__init__.py +16 -0
  27. cartesia/tts/client.py +63 -8
  28. cartesia/tts/requests/__init__.py +8 -0
  29. cartesia/tts/requests/experimental_model_controls.py +17 -0
  30. cartesia/tts/requests/generation_config.py +23 -0
  31. cartesia/tts/requests/generation_request.py +4 -4
  32. cartesia/tts/requests/sse_output_format.py +11 -0
  33. cartesia/tts/requests/tts_request.py +2 -0
  34. cartesia/tts/requests/ttssse_request.py +47 -0
  35. cartesia/tts/requests/web_socket_chunk_response.py +0 -3
  36. cartesia/tts/requests/web_socket_response.py +1 -2
  37. cartesia/tts/requests/web_socket_tts_request.py +9 -1
  38. cartesia/tts/types/__init__.py +8 -0
  39. cartesia/tts/types/experimental_model_controls.py +28 -0
  40. cartesia/tts/types/generation_config.py +34 -0
  41. cartesia/tts/types/generation_request.py +4 -4
  42. cartesia/tts/types/sse_output_format.py +22 -0
  43. cartesia/tts/types/tts_request.py +2 -0
  44. cartesia/tts/types/ttssse_request.py +58 -0
  45. cartesia/tts/types/web_socket_chunk_response.py +1 -3
  46. cartesia/tts/types/web_socket_response.py +1 -2
  47. cartesia/tts/types/web_socket_tts_request.py +11 -3
  48. cartesia/voice_changer/client.py +0 -8
  49. cartesia/voice_changer/requests/streaming_response.py +0 -2
  50. cartesia/voice_changer/types/streaming_response.py +0 -2
  51. cartesia/voices/client.py +0 -12
  52. cartesia-2.0.7.dist-info/LICENSE +201 -0
  53. {cartesia-2.0.5.dist-info → cartesia-2.0.7.dist-info}/METADATA +116 -17
  54. {cartesia-2.0.5.dist-info → cartesia-2.0.7.dist-info}/RECORD +55 -42
  55. {cartesia-2.0.5.dist-info → cartesia-2.0.7.dist-info}/WHEEL +1 -1
@@ -2,16 +2,20 @@
2
2
 
3
3
  from .cancel_context_request import CancelContextRequestParams
4
4
  from .controls import ControlsParams
5
+ from .experimental_model_controls import ExperimentalModelControlsParams
6
+ from .generation_config import GenerationConfigParams
5
7
  from .generation_request import GenerationRequestParams
6
8
  from .mp_3_output_format import Mp3OutputFormatParams
7
9
  from .output_format import OutputFormatParams, OutputFormat_Mp3Params, OutputFormat_RawParams, OutputFormat_WavParams
8
10
  from .phoneme_timestamps import PhonemeTimestampsParams
9
11
  from .raw_output_format import RawOutputFormatParams
10
12
  from .speed import SpeedParams
13
+ from .sse_output_format import SseOutputFormatParams
11
14
  from .tts_request import TtsRequestParams
12
15
  from .tts_request_embedding_specifier import TtsRequestEmbeddingSpecifierParams
13
16
  from .tts_request_id_specifier import TtsRequestIdSpecifierParams
14
17
  from .tts_request_voice_specifier import TtsRequestVoiceSpecifierParams
18
+ from .ttssse_request import TtssseRequestParams
15
19
  from .wav_output_format import WavOutputFormatParams
16
20
  from .web_socket_base_response import WebSocketBaseResponseParams
17
21
  from .web_socket_chunk_response import WebSocketChunkResponseParams
@@ -39,6 +43,8 @@ from .word_timestamps import WordTimestampsParams
39
43
  __all__ = [
40
44
  "CancelContextRequestParams",
41
45
  "ControlsParams",
46
+ "ExperimentalModelControlsParams",
47
+ "GenerationConfigParams",
42
48
  "GenerationRequestParams",
43
49
  "Mp3OutputFormatParams",
44
50
  "OutputFormatParams",
@@ -48,10 +54,12 @@ __all__ = [
48
54
  "PhonemeTimestampsParams",
49
55
  "RawOutputFormatParams",
50
56
  "SpeedParams",
57
+ "SseOutputFormatParams",
51
58
  "TtsRequestEmbeddingSpecifierParams",
52
59
  "TtsRequestIdSpecifierParams",
53
60
  "TtsRequestParams",
54
61
  "TtsRequestVoiceSpecifierParams",
62
+ "TtssseRequestParams",
55
63
  "WavOutputFormatParams",
56
64
  "WebSocketBaseResponseParams",
57
65
  "WebSocketChunkResponseParams",
@@ -0,0 +1,17 @@
1
+ # This file was auto-generated by Fern from our API Definition.
2
+
3
+ import typing_extensions
4
+ import typing_extensions
5
+
6
+
7
+ class ExperimentalModelControlsParams(typing_extensions.TypedDict):
8
+ """
9
+ These controls are **experimental** and subject to breaking changes.
10
+ """
11
+
12
+ accent_localization: typing_extensions.NotRequired[int]
13
+ """
14
+ Toggle accent localization: 0 (disabled, default) or 1 (enabled).
15
+ When enabled, the voice adapts to match the transcript language's accent while preserving vocal characteristics. When disabled, maintains the original voice accent.
16
+ For more information, see [Localize Voices](/build-with-sonic/capabilities/localize-voices).
17
+ """
@@ -0,0 +1,23 @@
1
+ # This file was auto-generated by Fern from our API Definition.
2
+
3
+ import typing_extensions
4
+ import typing_extensions
5
+ from .experimental_model_controls import ExperimentalModelControlsParams
6
+
7
+
8
+ class GenerationConfigParams(typing_extensions.TypedDict):
9
+ """
10
+ Configure the various attributes of the generated speech. These controls are only available for `sonic-3-preview` and will have no effect on earlier models.
11
+ """
12
+
13
+ volume: typing_extensions.NotRequired[float]
14
+ """
15
+ Adjust the volume of the generated speech between -1.0 (softer) and 1.0 (louder). 0.0 is the default volume.
16
+ """
17
+
18
+ speed: typing_extensions.NotRequired[float]
19
+ """
20
+ Adjust the speed of the generated speech between -1.0 (slower) and 1.0 (faster). 0.0 is the default speed.
21
+ """
22
+
23
+ experimental: typing_extensions.NotRequired[ExperimentalModelControlsParams]
@@ -55,15 +55,15 @@ class GenerationRequestParams(typing_extensions.TypedDict):
55
55
 
56
56
  add_timestamps: typing_extensions.NotRequired[bool]
57
57
  """
58
- Whether to return word-level timestamps.
58
+ Whether to return word-level timestamps. If `false` (default), no word timestamps will be produced at all. If `true`, the server will return timestamp events containing word-level timing information.
59
59
  """
60
60
 
61
61
  add_phoneme_timestamps: typing_extensions.NotRequired[bool]
62
62
  """
63
- Whether to return phoneme-level timestamps.
63
+ Whether to return phoneme-level timestamps. If `false` (default), no phoneme timestamps will be produced. If `true`, the server will return timestamp events containing phoneme-level timing information.
64
64
  """
65
65
 
66
- use_original_timestamps: typing_extensions.NotRequired[bool]
66
+ use_normalized_timestamps: typing_extensions.NotRequired[bool]
67
67
  """
68
- Whether to use the original transcript for timestamps.
68
+ Whether to use normalized timestamps (True) or original timestamps (False).
69
69
  """
@@ -0,0 +1,11 @@
1
+ # This file was auto-generated by Fern from our API Definition.
2
+
3
+ import typing_extensions
4
+ import typing
5
+ from ..types.raw_encoding import RawEncoding
6
+
7
+
8
+ class SseOutputFormatParams(typing_extensions.TypedDict):
9
+ container: typing.Literal["raw"]
10
+ encoding: RawEncoding
11
+ sample_rate: int
@@ -6,6 +6,7 @@ import typing_extensions
6
6
  from ..types.supported_language import SupportedLanguage
7
7
  from .output_format import OutputFormatParams
8
8
  from ..types.model_speed import ModelSpeed
9
+ from .generation_config import GenerationConfigParams
9
10
 
10
11
 
11
12
  class TtsRequestParams(typing_extensions.TypedDict):
@@ -25,3 +26,4 @@ class TtsRequestParams(typing_extensions.TypedDict):
25
26
  """
26
27
 
27
28
  speed: typing_extensions.NotRequired[ModelSpeed]
29
+ generation_config: typing_extensions.NotRequired[GenerationConfigParams]
@@ -0,0 +1,47 @@
1
+ # This file was auto-generated by Fern from our API Definition.
2
+
3
+ import typing_extensions
4
+ from .tts_request_voice_specifier import TtsRequestVoiceSpecifierParams
5
+ import typing_extensions
6
+ from ..types.supported_language import SupportedLanguage
7
+ from .sse_output_format import SseOutputFormatParams
8
+ from ..types.model_speed import ModelSpeed
9
+ from ..types.context_id import ContextId
10
+
11
+
12
+ class TtssseRequestParams(typing_extensions.TypedDict):
13
+ model_id: str
14
+ """
15
+ The ID of the model to use for the generation. See [Models](/build-with-cartesia/models) for available models.
16
+ """
17
+
18
+ transcript: str
19
+ voice: TtsRequestVoiceSpecifierParams
20
+ language: typing_extensions.NotRequired[SupportedLanguage]
21
+ output_format: SseOutputFormatParams
22
+ duration: typing_extensions.NotRequired[float]
23
+ """
24
+ The maximum duration of the audio in seconds. You do not usually need to specify this.
25
+ If the duration is not appropriate for the length of the transcript, the output audio may be truncated.
26
+ """
27
+
28
+ speed: typing_extensions.NotRequired[ModelSpeed]
29
+ add_timestamps: typing_extensions.NotRequired[bool]
30
+ """
31
+ Whether to return word-level timestamps. If `false` (default), no word timestamps will be produced at all. If `true`, the server will return timestamp events containing word-level timing information.
32
+ """
33
+
34
+ add_phoneme_timestamps: typing_extensions.NotRequired[bool]
35
+ """
36
+ Whether to return phoneme-level timestamps. If `false` (default), no phoneme timestamps will be produced - if `add_timestamps` is `true`, the produced timestamps will be word timestamps instead. If `true`, the server will return timestamp events containing phoneme-level timing information.
37
+ """
38
+
39
+ use_normalized_timestamps: typing_extensions.NotRequired[bool]
40
+ """
41
+ Whether to use normalized timestamps (True) or original timestamps (False).
42
+ """
43
+
44
+ context_id: typing_extensions.NotRequired[ContextId]
45
+ """
46
+ Optional context ID for this request.
47
+ """
@@ -1,11 +1,8 @@
1
1
  # This file was auto-generated by Fern from our API Definition.
2
2
 
3
3
  from .web_socket_base_response import WebSocketBaseResponseParams
4
- import typing_extensions
5
- from ..types.flush_id import FlushId
6
4
 
7
5
 
8
6
  class WebSocketChunkResponseParams(WebSocketBaseResponseParams):
9
7
  data: str
10
8
  step_time: float
11
- flush_id: typing_extensions.NotRequired[FlushId]
@@ -4,8 +4,8 @@ from __future__ import annotations
4
4
  import typing_extensions
5
5
  import typing
6
6
  import typing_extensions
7
- from ..types.flush_id import FlushId
8
7
  from ..types.context_id import ContextId
8
+ from ..types.flush_id import FlushId
9
9
  from .word_timestamps import WordTimestampsParams
10
10
  from .phoneme_timestamps import PhonemeTimestampsParams
11
11
 
@@ -14,7 +14,6 @@ class WebSocketResponse_ChunkParams(typing_extensions.TypedDict):
14
14
  type: typing.Literal["chunk"]
15
15
  data: str
16
16
  step_time: float
17
- flush_id: typing_extensions.NotRequired[FlushId]
18
17
  context_id: typing_extensions.NotRequired[ContextId]
19
18
  status_code: int
20
19
  done: bool
@@ -20,8 +20,16 @@ class WebSocketTtsRequestParams(typing_extensions.TypedDict):
20
20
  duration: typing_extensions.NotRequired[int]
21
21
  language: typing_extensions.NotRequired[str]
22
22
  add_timestamps: typing_extensions.NotRequired[bool]
23
- use_original_timestamps: typing_extensions.NotRequired[bool]
23
+ """
24
+ Whether to return word-level timestamps. If `false` (default), no word timestamps will be produced at all. If `true`, the server will return timestamp events containing word-level timing information.
25
+ """
26
+
24
27
  add_phoneme_timestamps: typing_extensions.NotRequired[bool]
28
+ """
29
+ Whether to return phoneme-level timestamps. If `false` (default), no phoneme timestamps will be produced - if `add_timestamps` is `true`, the produced timestamps will be word timestamps instead. If `true`, the server will return timestamp events containing phoneme-level timing information.
30
+ """
31
+
32
+ use_normalized_timestamps: typing_extensions.NotRequired[bool]
25
33
  continue_: typing_extensions.NotRequired[typing_extensions.Annotated[bool, FieldMetadata(alias="continue")]]
26
34
  context_id: typing_extensions.NotRequired[str]
27
35
  max_buffer_delay_ms: typing_extensions.NotRequired[int]
@@ -4,7 +4,9 @@ from .cancel_context_request import CancelContextRequest
4
4
  from .context_id import ContextId
5
5
  from .controls import Controls
6
6
  from .emotion import Emotion
7
+ from .experimental_model_controls import ExperimentalModelControls
7
8
  from .flush_id import FlushId
9
+ from .generation_config import GenerationConfig
8
10
  from .generation_request import GenerationRequest
9
11
  from .model_speed import ModelSpeed
10
12
  from .mp_3_output_format import Mp3OutputFormat
@@ -15,11 +17,13 @@ from .phoneme_timestamps import PhonemeTimestamps
15
17
  from .raw_encoding import RawEncoding
16
18
  from .raw_output_format import RawOutputFormat
17
19
  from .speed import Speed
20
+ from .sse_output_format import SseOutputFormat
18
21
  from .supported_language import SupportedLanguage
19
22
  from .tts_request import TtsRequest
20
23
  from .tts_request_embedding_specifier import TtsRequestEmbeddingSpecifier
21
24
  from .tts_request_id_specifier import TtsRequestIdSpecifier
22
25
  from .tts_request_voice_specifier import TtsRequestVoiceSpecifier
26
+ from .ttssse_request import TtssseRequest
23
27
  from .wav_output_format import WavOutputFormat
24
28
  from .web_socket_base_response import WebSocketBaseResponse
25
29
  from .web_socket_chunk_response import WebSocketChunkResponse
@@ -49,7 +53,9 @@ __all__ = [
49
53
  "ContextId",
50
54
  "Controls",
51
55
  "Emotion",
56
+ "ExperimentalModelControls",
52
57
  "FlushId",
58
+ "GenerationConfig",
53
59
  "GenerationRequest",
54
60
  "ModelSpeed",
55
61
  "Mp3OutputFormat",
@@ -63,11 +69,13 @@ __all__ = [
63
69
  "RawEncoding",
64
70
  "RawOutputFormat",
65
71
  "Speed",
72
+ "SseOutputFormat",
66
73
  "SupportedLanguage",
67
74
  "TtsRequest",
68
75
  "TtsRequestEmbeddingSpecifier",
69
76
  "TtsRequestIdSpecifier",
70
77
  "TtsRequestVoiceSpecifier",
78
+ "TtssseRequest",
71
79
  "WavOutputFormat",
72
80
  "WebSocketBaseResponse",
73
81
  "WebSocketChunkResponse",
@@ -0,0 +1,28 @@
1
+ # This file was auto-generated by Fern from our API Definition.
2
+
3
+ from ...core.pydantic_utilities import UniversalBaseModel
4
+ import typing
5
+ import pydantic
6
+ from ...core.pydantic_utilities import IS_PYDANTIC_V2
7
+
8
+
9
+ class ExperimentalModelControls(UniversalBaseModel):
10
+ """
11
+ These controls are **experimental** and subject to breaking changes.
12
+ """
13
+
14
+ accent_localization: typing.Optional[int] = pydantic.Field(default=None)
15
+ """
16
+ Toggle accent localization: 0 (disabled, default) or 1 (enabled).
17
+ When enabled, the voice adapts to match the transcript language's accent while preserving vocal characteristics. When disabled, maintains the original voice accent.
18
+ For more information, see [Localize Voices](/build-with-sonic/capabilities/localize-voices).
19
+ """
20
+
21
+ if IS_PYDANTIC_V2:
22
+ model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2
23
+ else:
24
+
25
+ class Config:
26
+ frozen = True
27
+ smart_union = True
28
+ extra = pydantic.Extra.allow
@@ -0,0 +1,34 @@
1
+ # This file was auto-generated by Fern from our API Definition.
2
+
3
+ from ...core.pydantic_utilities import UniversalBaseModel
4
+ import typing
5
+ import pydantic
6
+ from .experimental_model_controls import ExperimentalModelControls
7
+ from ...core.pydantic_utilities import IS_PYDANTIC_V2
8
+
9
+
10
+ class GenerationConfig(UniversalBaseModel):
11
+ """
12
+ Configure the various attributes of the generated speech. These controls are only available for `sonic-3-preview` and will have no effect on earlier models.
13
+ """
14
+
15
+ volume: typing.Optional[float] = pydantic.Field(default=None)
16
+ """
17
+ Adjust the volume of the generated speech between -1.0 (softer) and 1.0 (louder). 0.0 is the default volume.
18
+ """
19
+
20
+ speed: typing.Optional[float] = pydantic.Field(default=None)
21
+ """
22
+ Adjust the speed of the generated speech between -1.0 (slower) and 1.0 (faster). 0.0 is the default speed.
23
+ """
24
+
25
+ experimental: typing.Optional[ExperimentalModelControls] = None
26
+
27
+ if IS_PYDANTIC_V2:
28
+ model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2
29
+ else:
30
+
31
+ class Config:
32
+ frozen = True
33
+ smart_union = True
34
+ extra = pydantic.Extra.allow
@@ -59,17 +59,17 @@ class GenerationRequest(UniversalBaseModel):
59
59
 
60
60
  add_timestamps: typing.Optional[bool] = pydantic.Field(default=None)
61
61
  """
62
- Whether to return word-level timestamps.
62
+ Whether to return word-level timestamps. If `false` (default), no word timestamps will be produced at all. If `true`, the server will return timestamp events containing word-level timing information.
63
63
  """
64
64
 
65
65
  add_phoneme_timestamps: typing.Optional[bool] = pydantic.Field(default=None)
66
66
  """
67
- Whether to return phoneme-level timestamps.
67
+ Whether to return phoneme-level timestamps. If `false` (default), no phoneme timestamps will be produced. If `true`, the server will return timestamp events containing phoneme-level timing information.
68
68
  """
69
69
 
70
- use_original_timestamps: typing.Optional[bool] = pydantic.Field(default=None)
70
+ use_normalized_timestamps: typing.Optional[bool] = pydantic.Field(default=None)
71
71
  """
72
- Whether to use the original transcript for timestamps.
72
+ Whether to use normalized timestamps (True) or original timestamps (False).
73
73
  """
74
74
 
75
75
  if IS_PYDANTIC_V2:
@@ -0,0 +1,22 @@
1
+ # This file was auto-generated by Fern from our API Definition.
2
+
3
+ from ...core.pydantic_utilities import UniversalBaseModel
4
+ import typing
5
+ from .raw_encoding import RawEncoding
6
+ from ...core.pydantic_utilities import IS_PYDANTIC_V2
7
+ import pydantic
8
+
9
+
10
+ class SseOutputFormat(UniversalBaseModel):
11
+ container: typing.Literal["raw"] = "raw"
12
+ encoding: RawEncoding
13
+ sample_rate: int
14
+
15
+ if IS_PYDANTIC_V2:
16
+ model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2
17
+ else:
18
+
19
+ class Config:
20
+ frozen = True
21
+ smart_union = True
22
+ extra = pydantic.Extra.allow
@@ -7,6 +7,7 @@ import typing
7
7
  from .supported_language import SupportedLanguage
8
8
  from .output_format import OutputFormat
9
9
  from .model_speed import ModelSpeed
10
+ from .generation_config import GenerationConfig
10
11
  from ...core.pydantic_utilities import IS_PYDANTIC_V2
11
12
 
12
13
 
@@ -27,6 +28,7 @@ class TtsRequest(UniversalBaseModel):
27
28
  """
28
29
 
29
30
  speed: typing.Optional[ModelSpeed] = None
31
+ generation_config: typing.Optional[GenerationConfig] = None
30
32
 
31
33
  if IS_PYDANTIC_V2:
32
34
  model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2
@@ -0,0 +1,58 @@
1
+ # This file was auto-generated by Fern from our API Definition.
2
+
3
+ from ...core.pydantic_utilities import UniversalBaseModel
4
+ import pydantic
5
+ from .tts_request_voice_specifier import TtsRequestVoiceSpecifier
6
+ import typing
7
+ from .supported_language import SupportedLanguage
8
+ from .sse_output_format import SseOutputFormat
9
+ from .model_speed import ModelSpeed
10
+ from .context_id import ContextId
11
+ from ...core.pydantic_utilities import IS_PYDANTIC_V2
12
+
13
+
14
+ class TtssseRequest(UniversalBaseModel):
15
+ model_id: str = pydantic.Field()
16
+ """
17
+ The ID of the model to use for the generation. See [Models](/build-with-cartesia/models) for available models.
18
+ """
19
+
20
+ transcript: str
21
+ voice: TtsRequestVoiceSpecifier
22
+ language: typing.Optional[SupportedLanguage] = None
23
+ output_format: SseOutputFormat
24
+ duration: typing.Optional[float] = pydantic.Field(default=None)
25
+ """
26
+ The maximum duration of the audio in seconds. You do not usually need to specify this.
27
+ If the duration is not appropriate for the length of the transcript, the output audio may be truncated.
28
+ """
29
+
30
+ speed: typing.Optional[ModelSpeed] = None
31
+ add_timestamps: typing.Optional[bool] = pydantic.Field(default=None)
32
+ """
33
+ Whether to return word-level timestamps. If `false` (default), no word timestamps will be produced at all. If `true`, the server will return timestamp events containing word-level timing information.
34
+ """
35
+
36
+ add_phoneme_timestamps: typing.Optional[bool] = pydantic.Field(default=None)
37
+ """
38
+ Whether to return phoneme-level timestamps. If `false` (default), no phoneme timestamps will be produced - if `add_timestamps` is `true`, the produced timestamps will be word timestamps instead. If `true`, the server will return timestamp events containing phoneme-level timing information.
39
+ """
40
+
41
+ use_normalized_timestamps: typing.Optional[bool] = pydantic.Field(default=None)
42
+ """
43
+ Whether to use normalized timestamps (True) or original timestamps (False).
44
+ """
45
+
46
+ context_id: typing.Optional[ContextId] = pydantic.Field(default=None)
47
+ """
48
+ Optional context ID for this request.
49
+ """
50
+
51
+ if IS_PYDANTIC_V2:
52
+ model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2
53
+ else:
54
+
55
+ class Config:
56
+ frozen = True
57
+ smart_union = True
58
+ extra = pydantic.Extra.allow
@@ -1,16 +1,14 @@
1
1
  # This file was auto-generated by Fern from our API Definition.
2
2
 
3
3
  from .web_socket_base_response import WebSocketBaseResponse
4
- import typing
5
- from .flush_id import FlushId
6
4
  from ...core.pydantic_utilities import IS_PYDANTIC_V2
5
+ import typing
7
6
  import pydantic
8
7
 
9
8
 
10
9
  class WebSocketChunkResponse(WebSocketBaseResponse):
11
10
  data: str
12
11
  step_time: float
13
- flush_id: typing.Optional[FlushId] = None
14
12
 
15
13
  if IS_PYDANTIC_V2:
16
14
  model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2
@@ -3,10 +3,10 @@
3
3
  from __future__ import annotations
4
4
  from ...core.pydantic_utilities import UniversalBaseModel
5
5
  import typing
6
- from .flush_id import FlushId
7
6
  from .context_id import ContextId
8
7
  from ...core.pydantic_utilities import IS_PYDANTIC_V2
9
8
  import pydantic
9
+ from .flush_id import FlushId
10
10
  from .word_timestamps import WordTimestamps
11
11
  from .phoneme_timestamps import PhonemeTimestamps
12
12
 
@@ -15,7 +15,6 @@ class WebSocketResponse_Chunk(UniversalBaseModel):
15
15
  type: typing.Literal["chunk"] = "chunk"
16
16
  data: str
17
17
  step_time: float
18
- flush_id: typing.Optional[FlushId] = None
19
18
  context_id: typing.Optional[ContextId] = None
20
19
  status_code: int
21
20
  done: bool
@@ -22,9 +22,17 @@ class WebSocketTtsRequest(UniversalBaseModel):
22
22
  voice: TtsRequestVoiceSpecifier
23
23
  duration: typing.Optional[int] = None
24
24
  language: typing.Optional[str] = None
25
- add_timestamps: typing.Optional[bool] = None
26
- use_original_timestamps: typing.Optional[bool] = None
27
- add_phoneme_timestamps: typing.Optional[bool] = None
25
+ add_timestamps: typing.Optional[bool] = pydantic.Field(default=None)
26
+ """
27
+ Whether to return word-level timestamps. If `false` (default), no word timestamps will be produced at all. If `true`, the server will return timestamp events containing word-level timing information.
28
+ """
29
+
30
+ add_phoneme_timestamps: typing.Optional[bool] = pydantic.Field(default=None)
31
+ """
32
+ Whether to return phoneme-level timestamps. If `false` (default), no phoneme timestamps will be produced - if `add_timestamps` is `true`, the produced timestamps will be word timestamps instead. If `true`, the server will return timestamp events containing phoneme-level timing information.
33
+ """
34
+
35
+ use_normalized_timestamps: typing.Optional[bool] = None
28
36
  continue_: typing_extensions.Annotated[typing.Optional[bool], FieldMetadata(alias="continue")] = None
29
37
  context_id: typing.Optional[str] = None
30
38
  max_buffer_delay_ms: typing.Optional[int] = None
@@ -51,11 +51,9 @@ class VoiceChangerClient:
51
51
  output_format_encoding : typing.Optional[RawEncoding]
52
52
  Required for `raw` and `wav` containers.
53
53
 
54
-
55
54
  output_format_bit_rate : typing.Optional[int]
56
55
  Required for `mp3` containers.
57
56
 
58
-
59
57
  request_options : typing.Optional[RequestOptions]
60
58
  Request-specific configuration. You can pass in configuration such as `chunk_size`, and more to customize the request and response.
61
59
 
@@ -131,11 +129,9 @@ class VoiceChangerClient:
131
129
  output_format_encoding : typing.Optional[RawEncoding]
132
130
  Required for `raw` and `wav` containers.
133
131
 
134
-
135
132
  output_format_bit_rate : typing.Optional[int]
136
133
  Required for `mp3` containers.
137
134
 
138
-
139
135
  request_options : typing.Optional[RequestOptions]
140
136
  Request-specific configuration.
141
137
 
@@ -232,11 +228,9 @@ class AsyncVoiceChangerClient:
232
228
  output_format_encoding : typing.Optional[RawEncoding]
233
229
  Required for `raw` and `wav` containers.
234
230
 
235
-
236
231
  output_format_bit_rate : typing.Optional[int]
237
232
  Required for `mp3` containers.
238
233
 
239
-
240
234
  request_options : typing.Optional[RequestOptions]
241
235
  Request-specific configuration. You can pass in configuration such as `chunk_size`, and more to customize the request and response.
242
236
 
@@ -320,11 +314,9 @@ class AsyncVoiceChangerClient:
320
314
  output_format_encoding : typing.Optional[RawEncoding]
321
315
  Required for `raw` and `wav` containers.
322
316
 
323
-
324
317
  output_format_bit_rate : typing.Optional[int]
325
318
  Required for `mp3` containers.
326
319
 
327
-
328
320
  request_options : typing.Optional[RequestOptions]
329
321
  Request-specific configuration.
330
322
 
@@ -4,7 +4,6 @@ from __future__ import annotations
4
4
  import typing_extensions
5
5
  import typing
6
6
  import typing_extensions
7
- from ...tts.types.flush_id import FlushId
8
7
  from ...tts.types.context_id import ContextId
9
8
 
10
9
 
@@ -12,7 +11,6 @@ class StreamingResponse_ChunkParams(typing_extensions.TypedDict):
12
11
  type: typing.Literal["chunk"]
13
12
  data: str
14
13
  step_time: float
15
- flush_id: typing_extensions.NotRequired[FlushId]
16
14
  context_id: typing_extensions.NotRequired[ContextId]
17
15
  status_code: int
18
16
  done: bool
@@ -3,7 +3,6 @@
3
3
  from __future__ import annotations
4
4
  from ...core.pydantic_utilities import UniversalBaseModel
5
5
  import typing
6
- from ...tts.types.flush_id import FlushId
7
6
  from ...tts.types.context_id import ContextId
8
7
  from ...core.pydantic_utilities import IS_PYDANTIC_V2
9
8
  import pydantic
@@ -13,7 +12,6 @@ class StreamingResponse_Chunk(UniversalBaseModel):
13
12
  type: typing.Literal["chunk"] = "chunk"
14
13
  data: str
15
14
  step_time: float
16
- flush_id: typing.Optional[FlushId] = None
17
15
  context_id: typing.Optional[ContextId] = None
18
16
  status_code: int
19
17
  done: bool
cartesia/voices/client.py CHANGED
@@ -168,27 +168,21 @@ class VoicesClient:
168
168
  name : str
169
169
  The name of the voice.
170
170
 
171
-
172
171
  language : SupportedLanguage
173
172
  The language of the voice.
174
173
 
175
-
176
174
  mode : CloneMode
177
175
  Tradeoff between similarity and stability. Similarity clones sound more like the source clip, but may reproduce background noise. Stability clones always sound like a studio recording, but may not sound as similar to the source clip.
178
176
 
179
-
180
177
  description : typing.Optional[str]
181
178
  A description for the voice.
182
179
 
183
-
184
180
  enhance : typing.Optional[bool]
185
181
  Whether to apply AI enhancements to the clip to reduce background noise. This leads to cleaner generated speech at the cost of reduced similarity to the source clip.
186
182
 
187
-
188
183
  base_voice_id : typing.Optional[VoiceId]
189
184
  Optional base voice ID that the cloned voice is derived from.
190
185
 
191
-
192
186
  request_options : typing.Optional[RequestOptions]
193
187
  Request-specific configuration.
194
188
 
@@ -736,27 +730,21 @@ class AsyncVoicesClient:
736
730
  name : str
737
731
  The name of the voice.
738
732
 
739
-
740
733
  language : SupportedLanguage
741
734
  The language of the voice.
742
735
 
743
-
744
736
  mode : CloneMode
745
737
  Tradeoff between similarity and stability. Similarity clones sound more like the source clip, but may reproduce background noise. Stability clones always sound like a studio recording, but may not sound as similar to the source clip.
746
738
 
747
-
748
739
  description : typing.Optional[str]
749
740
  A description for the voice.
750
741
 
751
-
752
742
  enhance : typing.Optional[bool]
753
743
  Whether to apply AI enhancements to the clip to reduce background noise. This leads to cleaner generated speech at the cost of reduced similarity to the source clip.
754
744
 
755
-
756
745
  base_voice_id : typing.Optional[VoiceId]
757
746
  Optional base voice ID that the cloned voice is derived from.
758
747
 
759
-
760
748
  request_options : typing.Optional[RequestOptions]
761
749
  Request-specific configuration.
762
750