cartesia-2.0.5-py3-none-any.whl → cartesia-2.0.7-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cartesia/__init__.py +22 -0
- cartesia/auth/client.py +8 -8
- cartesia/auth/requests/token_grant.py +7 -1
- cartesia/auth/requests/token_request.py +3 -3
- cartesia/auth/types/token_grant.py +7 -2
- cartesia/auth/types/token_request.py +3 -3
- cartesia/core/client_wrapper.py +1 -1
- cartesia/infill/client.py +0 -8
- cartesia/stt/__init__.py +6 -0
- cartesia/stt/_async_websocket.py +81 -72
- cartesia/stt/_websocket.py +42 -20
- cartesia/stt/client.py +450 -0
- cartesia/stt/requests/__init__.py +2 -0
- cartesia/stt/requests/streaming_transcription_response.py +2 -0
- cartesia/stt/requests/transcript_message.py +8 -1
- cartesia/stt/requests/transcription_response.py +8 -1
- cartesia/stt/requests/transcription_word.py +20 -0
- cartesia/stt/socket_client.py +52 -109
- cartesia/stt/types/__init__.py +4 -0
- cartesia/stt/types/streaming_transcription_response.py +2 -0
- cartesia/stt/types/stt_encoding.py +3 -1
- cartesia/stt/types/timestamp_granularity.py +5 -0
- cartesia/stt/types/transcript_message.py +7 -1
- cartesia/stt/types/transcription_response.py +7 -1
- cartesia/stt/types/transcription_word.py +32 -0
- cartesia/tts/__init__.py +16 -0
- cartesia/tts/client.py +63 -8
- cartesia/tts/requests/__init__.py +8 -0
- cartesia/tts/requests/experimental_model_controls.py +17 -0
- cartesia/tts/requests/generation_config.py +23 -0
- cartesia/tts/requests/generation_request.py +4 -4
- cartesia/tts/requests/sse_output_format.py +11 -0
- cartesia/tts/requests/tts_request.py +2 -0
- cartesia/tts/requests/ttssse_request.py +47 -0
- cartesia/tts/requests/web_socket_chunk_response.py +0 -3
- cartesia/tts/requests/web_socket_response.py +1 -2
- cartesia/tts/requests/web_socket_tts_request.py +9 -1
- cartesia/tts/types/__init__.py +8 -0
- cartesia/tts/types/experimental_model_controls.py +28 -0
- cartesia/tts/types/generation_config.py +34 -0
- cartesia/tts/types/generation_request.py +4 -4
- cartesia/tts/types/sse_output_format.py +22 -0
- cartesia/tts/types/tts_request.py +2 -0
- cartesia/tts/types/ttssse_request.py +58 -0
- cartesia/tts/types/web_socket_chunk_response.py +1 -3
- cartesia/tts/types/web_socket_response.py +1 -2
- cartesia/tts/types/web_socket_tts_request.py +11 -3
- cartesia/voice_changer/client.py +0 -8
- cartesia/voice_changer/requests/streaming_response.py +0 -2
- cartesia/voice_changer/types/streaming_response.py +0 -2
- cartesia/voices/client.py +0 -12
- cartesia-2.0.7.dist-info/LICENSE +201 -0
- {cartesia-2.0.5.dist-info → cartesia-2.0.7.dist-info}/METADATA +116 -17
- {cartesia-2.0.5.dist-info → cartesia-2.0.7.dist-info}/RECORD +55 -42
- {cartesia-2.0.5.dist-info → cartesia-2.0.7.dist-info}/WHEEL +1 -1
cartesia/tts/requests/__init__.py
CHANGED
@@ -2,16 +2,20 @@
 
 from .cancel_context_request import CancelContextRequestParams
 from .controls import ControlsParams
+from .experimental_model_controls import ExperimentalModelControlsParams
+from .generation_config import GenerationConfigParams
 from .generation_request import GenerationRequestParams
 from .mp_3_output_format import Mp3OutputFormatParams
 from .output_format import OutputFormatParams, OutputFormat_Mp3Params, OutputFormat_RawParams, OutputFormat_WavParams
 from .phoneme_timestamps import PhonemeTimestampsParams
 from .raw_output_format import RawOutputFormatParams
 from .speed import SpeedParams
+from .sse_output_format import SseOutputFormatParams
 from .tts_request import TtsRequestParams
 from .tts_request_embedding_specifier import TtsRequestEmbeddingSpecifierParams
 from .tts_request_id_specifier import TtsRequestIdSpecifierParams
 from .tts_request_voice_specifier import TtsRequestVoiceSpecifierParams
+from .ttssse_request import TtssseRequestParams
 from .wav_output_format import WavOutputFormatParams
 from .web_socket_base_response import WebSocketBaseResponseParams
 from .web_socket_chunk_response import WebSocketChunkResponseParams
@@ -39,6 +43,8 @@ from .word_timestamps import WordTimestampsParams
 __all__ = [
     "CancelContextRequestParams",
     "ControlsParams",
+    "ExperimentalModelControlsParams",
+    "GenerationConfigParams",
     "GenerationRequestParams",
     "Mp3OutputFormatParams",
     "OutputFormatParams",
@@ -48,10 +54,12 @@ __all__ = [
     "PhonemeTimestampsParams",
     "RawOutputFormatParams",
     "SpeedParams",
+    "SseOutputFormatParams",
     "TtsRequestEmbeddingSpecifierParams",
     "TtsRequestIdSpecifierParams",
     "TtsRequestParams",
     "TtsRequestVoiceSpecifierParams",
+    "TtssseRequestParams",
     "WavOutputFormatParams",
     "WebSocketBaseResponseParams",
     "WebSocketChunkResponseParams",
cartesia/tts/requests/experimental_model_controls.py
ADDED
@@ -0,0 +1,17 @@
+# This file was auto-generated by Fern from our API Definition.
+
+import typing_extensions
+import typing_extensions
+
+
+class ExperimentalModelControlsParams(typing_extensions.TypedDict):
+    """
+    These controls are **experimental** and subject to breaking changes.
+    """
+
+    accent_localization: typing_extensions.NotRequired[int]
+    """
+    Toggle accent localization: 0 (disabled, default) or 1 (enabled).
+    When enabled, the voice adapts to match the transcript language's accent while preserving vocal characteristics. When disabled, maintains the original voice accent.
+    For more information, see [Localize Voices](/build-with-sonic/capabilities/localize-voices).
+    """
cartesia/tts/requests/generation_config.py
ADDED
@@ -0,0 +1,23 @@
+# This file was auto-generated by Fern from our API Definition.
+
+import typing_extensions
+import typing_extensions
+from .experimental_model_controls import ExperimentalModelControlsParams
+
+
+class GenerationConfigParams(typing_extensions.TypedDict):
+    """
+    Configure the various attributes of the generated speech. These controls are only available for `sonic-3-preview` and will have no effect on earlier models.
+    """
+
+    volume: typing_extensions.NotRequired[float]
+    """
+    Adjust the volume of the generated speech between -1.0 (softer) and 1.0 (louder). 0.0 is the default volume.
+    """
+
+    speed: typing_extensions.NotRequired[float]
+    """
+    Adjust the speed of the generated speech between -1.0 (slower) and 1.0 (faster). 0.0 is the default speed.
+    """
+
+    experimental: typing_extensions.NotRequired[ExperimentalModelControlsParams]
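The request-side counterpart of the new config is a plain `TypedDict`, so it can be written inline. A minimal sketch using only keys shown in this diff; the numeric values are illustrative (per the docstrings above, volume and speed are floats in [-1.0, 1.0] with 0.0 as the default, and the controls only affect `sonic-3-preview`):

```python
from cartesia.tts.requests import ExperimentalModelControlsParams, GenerationConfigParams

# Illustrative values only; TypedDict construction produces an ordinary dict.
generation_config: GenerationConfigParams = {
    "volume": 0.2,
    "speed": -0.1,
    "experimental": ExperimentalModelControlsParams(accent_localization=1),
}
```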
cartesia/tts/requests/generation_request.py
CHANGED
@@ -55,15 +55,15 @@ class GenerationRequestParams(typing_extensions.TypedDict):
 
     add_timestamps: typing_extensions.NotRequired[bool]
     """
-    Whether to return word-level timestamps.
+    Whether to return word-level timestamps. If `false` (default), no word timestamps will be produced at all. If `true`, the server will return timestamp events containing word-level timing information.
     """
 
     add_phoneme_timestamps: typing_extensions.NotRequired[bool]
     """
-    Whether to return phoneme-level timestamps.
+    Whether to return phoneme-level timestamps. If `false` (default), no phoneme timestamps will be produced. If `true`, the server will return timestamp events containing phoneme-level timing information.
     """
 
-
+    use_normalized_timestamps: typing_extensions.NotRequired[bool]
     """
-    Whether to use
+    Whether to use normalized timestamps (True) or original timestamps (False).
     """
cartesia/tts/requests/sse_output_format.py
ADDED
@@ -0,0 +1,11 @@
+# This file was auto-generated by Fern from our API Definition.
+
+import typing_extensions
+import typing
+from ..types.raw_encoding import RawEncoding
+
+
+class SseOutputFormatParams(typing_extensions.TypedDict):
+    container: typing.Literal["raw"]
+    encoding: RawEncoding
+    sample_rate: int
cartesia/tts/requests/tts_request.py
CHANGED
@@ -6,6 +6,7 @@ import typing_extensions
 from ..types.supported_language import SupportedLanguage
 from .output_format import OutputFormatParams
 from ..types.model_speed import ModelSpeed
+from .generation_config import GenerationConfigParams
 
 
 class TtsRequestParams(typing_extensions.TypedDict):
@@ -25,3 +26,4 @@ class TtsRequestParams(typing_extensions.TypedDict):
     """
 
     speed: typing_extensions.NotRequired[ModelSpeed]
+    generation_config: typing_extensions.NotRequired[GenerationConfigParams]
cartesia/tts/requests/ttssse_request.py
ADDED
@@ -0,0 +1,47 @@
+# This file was auto-generated by Fern from our API Definition.
+
+import typing_extensions
+from .tts_request_voice_specifier import TtsRequestVoiceSpecifierParams
+import typing_extensions
+from ..types.supported_language import SupportedLanguage
+from .sse_output_format import SseOutputFormatParams
+from ..types.model_speed import ModelSpeed
+from ..types.context_id import ContextId
+
+
+class TtssseRequestParams(typing_extensions.TypedDict):
+    model_id: str
+    """
+    The ID of the model to use for the generation. See [Models](/build-with-cartesia/models) for available models.
+    """
+
+    transcript: str
+    voice: TtsRequestVoiceSpecifierParams
+    language: typing_extensions.NotRequired[SupportedLanguage]
+    output_format: SseOutputFormatParams
+    duration: typing_extensions.NotRequired[float]
+    """
+    The maximum duration of the audio in seconds. You do not usually need to specify this.
+    If the duration is not appropriate for the length of the transcript, the output audio may be truncated.
+    """
+
+    speed: typing_extensions.NotRequired[ModelSpeed]
+    add_timestamps: typing_extensions.NotRequired[bool]
+    """
+    Whether to return word-level timestamps. If `false` (default), no word timestamps will be produced at all. If `true`, the server will return timestamp events containing word-level timing information.
+    """
+
+    add_phoneme_timestamps: typing_extensions.NotRequired[bool]
+    """
+    Whether to return phoneme-level timestamps. If `false` (default), no phoneme timestamps will be produced - if `add_timestamps` is `true`, the produced timestamps will be word timestamps instead. If `true`, the server will return timestamp events containing phoneme-level timing information.
+    """
+
+    use_normalized_timestamps: typing_extensions.NotRequired[bool]
+    """
+    Whether to use normalized timestamps (True) or original timestamps (False).
+    """
+
+    context_id: typing_extensions.NotRequired[ContextId]
+    """
+    Optional context ID for this request.
+    """
cartesia/tts/requests/web_socket_chunk_response.py
CHANGED
@@ -1,11 +1,8 @@
 # This file was auto-generated by Fern from our API Definition.
 
 from .web_socket_base_response import WebSocketBaseResponseParams
-import typing_extensions
-from ..types.flush_id import FlushId
 
 
 class WebSocketChunkResponseParams(WebSocketBaseResponseParams):
     data: str
     step_time: float
-    flush_id: typing_extensions.NotRequired[FlushId]
cartesia/tts/requests/web_socket_response.py
CHANGED
@@ -4,8 +4,8 @@ from __future__ import annotations
 import typing_extensions
 import typing
 import typing_extensions
-from ..types.flush_id import FlushId
 from ..types.context_id import ContextId
+from ..types.flush_id import FlushId
 from .word_timestamps import WordTimestampsParams
 from .phoneme_timestamps import PhonemeTimestampsParams
 
@@ -14,7 +14,6 @@ class WebSocketResponse_ChunkParams(typing_extensions.TypedDict):
     type: typing.Literal["chunk"]
     data: str
     step_time: float
-    flush_id: typing_extensions.NotRequired[FlushId]
     context_id: typing_extensions.NotRequired[ContextId]
     status_code: int
     done: bool
cartesia/tts/requests/web_socket_tts_request.py
CHANGED
@@ -20,8 +20,16 @@ class WebSocketTtsRequestParams(typing_extensions.TypedDict):
     duration: typing_extensions.NotRequired[int]
     language: typing_extensions.NotRequired[str]
     add_timestamps: typing_extensions.NotRequired[bool]
-
+    """
+    Whether to return word-level timestamps. If `false` (default), no word timestamps will be produced at all. If `true`, the server will return timestamp events containing word-level timing information.
+    """
+
     add_phoneme_timestamps: typing_extensions.NotRequired[bool]
+    """
+    Whether to return phoneme-level timestamps. If `false` (default), no phoneme timestamps will be produced - if `add_timestamps` is `true`, the produced timestamps will be word timestamps instead. If `true`, the server will return timestamp events containing phoneme-level timing information.
+    """
+
+    use_normalized_timestamps: typing_extensions.NotRequired[bool]
     continue_: typing_extensions.NotRequired[typing_extensions.Annotated[bool, FieldMetadata(alias="continue")]]
     context_id: typing_extensions.NotRequired[str]
     max_buffer_delay_ms: typing_extensions.NotRequired[int]
cartesia/tts/types/__init__.py
CHANGED
@@ -4,7 +4,9 @@ from .cancel_context_request import CancelContextRequest
 from .context_id import ContextId
 from .controls import Controls
 from .emotion import Emotion
+from .experimental_model_controls import ExperimentalModelControls
 from .flush_id import FlushId
+from .generation_config import GenerationConfig
 from .generation_request import GenerationRequest
 from .model_speed import ModelSpeed
 from .mp_3_output_format import Mp3OutputFormat
@@ -15,11 +17,13 @@ from .phoneme_timestamps import PhonemeTimestamps
 from .raw_encoding import RawEncoding
 from .raw_output_format import RawOutputFormat
 from .speed import Speed
+from .sse_output_format import SseOutputFormat
 from .supported_language import SupportedLanguage
 from .tts_request import TtsRequest
 from .tts_request_embedding_specifier import TtsRequestEmbeddingSpecifier
 from .tts_request_id_specifier import TtsRequestIdSpecifier
 from .tts_request_voice_specifier import TtsRequestVoiceSpecifier
+from .ttssse_request import TtssseRequest
 from .wav_output_format import WavOutputFormat
 from .web_socket_base_response import WebSocketBaseResponse
 from .web_socket_chunk_response import WebSocketChunkResponse
@@ -49,7 +53,9 @@ __all__ = [
     "ContextId",
     "Controls",
     "Emotion",
+    "ExperimentalModelControls",
     "FlushId",
+    "GenerationConfig",
     "GenerationRequest",
     "ModelSpeed",
     "Mp3OutputFormat",
@@ -63,11 +69,13 @@ __all__ = [
     "RawEncoding",
     "RawOutputFormat",
     "Speed",
+    "SseOutputFormat",
     "SupportedLanguage",
     "TtsRequest",
     "TtsRequestEmbeddingSpecifier",
     "TtsRequestIdSpecifier",
     "TtsRequestVoiceSpecifier",
+    "TtssseRequest",
     "WavOutputFormat",
     "WebSocketBaseResponse",
     "WebSocketChunkResponse",
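With these re-exports in place, the new models become part of the public `cartesia.tts.types` surface. A minimal import sketch, based only on the `__all__` entries added above:

```python
from cartesia.tts.types import (
    ExperimentalModelControls,
    GenerationConfig,
    SseOutputFormat,
    TtssseRequest,
)
```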
cartesia/tts/types/experimental_model_controls.py
ADDED
@@ -0,0 +1,28 @@
+# This file was auto-generated by Fern from our API Definition.
+
+from ...core.pydantic_utilities import UniversalBaseModel
+import typing
+import pydantic
+from ...core.pydantic_utilities import IS_PYDANTIC_V2
+
+
+class ExperimentalModelControls(UniversalBaseModel):
+    """
+    These controls are **experimental** and subject to breaking changes.
+    """
+
+    accent_localization: typing.Optional[int] = pydantic.Field(default=None)
+    """
+    Toggle accent localization: 0 (disabled, default) or 1 (enabled).
+    When enabled, the voice adapts to match the transcript language's accent while preserving vocal characteristics. When disabled, maintains the original voice accent.
+    For more information, see [Localize Voices](/build-with-sonic/capabilities/localize-voices).
+    """
+
+    if IS_PYDANTIC_V2:
+        model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True)  # type: ignore # Pydantic v2
+    else:
+
+        class Config:
+            frozen = True
+            smart_union = True
+            extra = pydantic.Extra.allow
cartesia/tts/types/generation_config.py
ADDED
@@ -0,0 +1,34 @@
+# This file was auto-generated by Fern from our API Definition.
+
+from ...core.pydantic_utilities import UniversalBaseModel
+import typing
+import pydantic
+from .experimental_model_controls import ExperimentalModelControls
+from ...core.pydantic_utilities import IS_PYDANTIC_V2
+
+
+class GenerationConfig(UniversalBaseModel):
+    """
+    Configure the various attributes of the generated speech. These controls are only available for `sonic-3-preview` and will have no effect on earlier models.
+    """
+
+    volume: typing.Optional[float] = pydantic.Field(default=None)
+    """
+    Adjust the volume of the generated speech between -1.0 (softer) and 1.0 (louder). 0.0 is the default volume.
+    """
+
+    speed: typing.Optional[float] = pydantic.Field(default=None)
+    """
+    Adjust the speed of the generated speech between -1.0 (slower) and 1.0 (faster). 0.0 is the default speed.
+    """
+
+    experimental: typing.Optional[ExperimentalModelControls] = None
+
+    if IS_PYDANTIC_V2:
+        model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True)  # type: ignore # Pydantic v2
+    else:
+
+        class Config:
+            frozen = True
+            smart_union = True
+            extra = pydantic.Extra.allow
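Since `GenerationConfig` feeds the `generation_config` field that `tts_request.py` gains later in this diff, construction would look roughly like the sketch below. The numbers are illustrative, and the `sonic-3-preview` restriction comes from the docstring above:

```python
from cartesia.tts.types import ExperimentalModelControls, GenerationConfig

# Illustrative values: volume and speed are floats in [-1.0, 1.0]; 0.0 is the default.
config = GenerationConfig(
    volume=0.25,
    speed=-0.1,
    experimental=ExperimentalModelControls(accent_localization=1),  # 1 enables accent localization
)
```

The resulting object is what `TtsRequest(generation_config=...)` accepts once the `tts_request.py` change shown below is applied.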
cartesia/tts/types/generation_request.py
CHANGED
@@ -59,17 +59,17 @@ class GenerationRequest(UniversalBaseModel):
 
     add_timestamps: typing.Optional[bool] = pydantic.Field(default=None)
     """
-    Whether to return word-level timestamps.
+    Whether to return word-level timestamps. If `false` (default), no word timestamps will be produced at all. If `true`, the server will return timestamp events containing word-level timing information.
     """
 
     add_phoneme_timestamps: typing.Optional[bool] = pydantic.Field(default=None)
     """
-    Whether to return phoneme-level timestamps.
+    Whether to return phoneme-level timestamps. If `false` (default), no phoneme timestamps will be produced. If `true`, the server will return timestamp events containing phoneme-level timing information.
     """
 
-
+    use_normalized_timestamps: typing.Optional[bool] = pydantic.Field(default=None)
     """
-    Whether to use
+    Whether to use normalized timestamps (True) or original timestamps (False).
     """
 
     if IS_PYDANTIC_V2:
cartesia/tts/types/sse_output_format.py
ADDED
@@ -0,0 +1,22 @@
+# This file was auto-generated by Fern from our API Definition.
+
+from ...core.pydantic_utilities import UniversalBaseModel
+import typing
+from .raw_encoding import RawEncoding
+from ...core.pydantic_utilities import IS_PYDANTIC_V2
+import pydantic
+
+
+class SseOutputFormat(UniversalBaseModel):
+    container: typing.Literal["raw"] = "raw"
+    encoding: RawEncoding
+    sample_rate: int
+
+    if IS_PYDANTIC_V2:
+        model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True)  # type: ignore # Pydantic v2
+    else:
+
+        class Config:
+            frozen = True
+            smart_union = True
+            extra = pydantic.Extra.allow
cartesia/tts/types/tts_request.py
CHANGED
@@ -7,6 +7,7 @@ import typing
 from .supported_language import SupportedLanguage
 from .output_format import OutputFormat
 from .model_speed import ModelSpeed
+from .generation_config import GenerationConfig
 from ...core.pydantic_utilities import IS_PYDANTIC_V2
 
 
@@ -27,6 +28,7 @@ class TtsRequest(UniversalBaseModel):
     """
 
     speed: typing.Optional[ModelSpeed] = None
+    generation_config: typing.Optional[GenerationConfig] = None
 
     if IS_PYDANTIC_V2:
         model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True)  # type: ignore # Pydantic v2
cartesia/tts/types/ttssse_request.py
ADDED
@@ -0,0 +1,58 @@
+# This file was auto-generated by Fern from our API Definition.
+
+from ...core.pydantic_utilities import UniversalBaseModel
+import pydantic
+from .tts_request_voice_specifier import TtsRequestVoiceSpecifier
+import typing
+from .supported_language import SupportedLanguage
+from .sse_output_format import SseOutputFormat
+from .model_speed import ModelSpeed
+from .context_id import ContextId
+from ...core.pydantic_utilities import IS_PYDANTIC_V2
+
+
+class TtssseRequest(UniversalBaseModel):
+    model_id: str = pydantic.Field()
+    """
+    The ID of the model to use for the generation. See [Models](/build-with-cartesia/models) for available models.
+    """
+
+    transcript: str
+    voice: TtsRequestVoiceSpecifier
+    language: typing.Optional[SupportedLanguage] = None
+    output_format: SseOutputFormat
+    duration: typing.Optional[float] = pydantic.Field(default=None)
+    """
+    The maximum duration of the audio in seconds. You do not usually need to specify this.
+    If the duration is not appropriate for the length of the transcript, the output audio may be truncated.
+    """
+
+    speed: typing.Optional[ModelSpeed] = None
+    add_timestamps: typing.Optional[bool] = pydantic.Field(default=None)
+    """
+    Whether to return word-level timestamps. If `false` (default), no word timestamps will be produced at all. If `true`, the server will return timestamp events containing word-level timing information.
+    """
+
+    add_phoneme_timestamps: typing.Optional[bool] = pydantic.Field(default=None)
+    """
+    Whether to return phoneme-level timestamps. If `false` (default), no phoneme timestamps will be produced - if `add_timestamps` is `true`, the produced timestamps will be word timestamps instead. If `true`, the server will return timestamp events containing phoneme-level timing information.
+    """
+
+    use_normalized_timestamps: typing.Optional[bool] = pydantic.Field(default=None)
+    """
+    Whether to use normalized timestamps (True) or original timestamps (False).
+    """
+
+    context_id: typing.Optional[ContextId] = pydantic.Field(default=None)
+    """
+    Optional context ID for this request.
+    """
+
+    if IS_PYDANTIC_V2:
+        model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True)  # type: ignore # Pydantic v2
+    else:
+
+        class Config:
+            frozen = True
+            smart_union = True
+            extra = pydantic.Extra.allow
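A rough sketch of building the new SSE request model. Only the field names and the `SseOutputFormat` shape come from this diff; the id-based voice-specifier dict, the model ID, and the `pcm_s16le` encoding value are assumptions for illustration:

```python
from cartesia.tts.types import SseOutputFormat, TtssseRequest

request = TtssseRequest(
    model_id="sonic-2",  # assumption: any valid Cartesia model ID
    transcript="Hello from the SSE endpoint.",
    voice={"mode": "id", "id": "your-voice-id"},  # assumed id-based TtsRequestVoiceSpecifier shape
    output_format=SseOutputFormat(container="raw", encoding="pcm_s16le", sample_rate=44100),
    add_timestamps=True,
    use_normalized_timestamps=True,
)
```

Judging by the `cartesia/tts/client.py` additions in the file list, this request shape presumably backs a new server-sent-events TTS method, but that client code is not shown in this section.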
cartesia/tts/types/web_socket_chunk_response.py
CHANGED
@@ -1,16 +1,14 @@
 # This file was auto-generated by Fern from our API Definition.
 
 from .web_socket_base_response import WebSocketBaseResponse
-import typing
-from .flush_id import FlushId
 from ...core.pydantic_utilities import IS_PYDANTIC_V2
+import typing
 import pydantic
 
 
 class WebSocketChunkResponse(WebSocketBaseResponse):
     data: str
     step_time: float
-    flush_id: typing.Optional[FlushId] = None
 
     if IS_PYDANTIC_V2:
         model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True)  # type: ignore # Pydantic v2
cartesia/tts/types/web_socket_response.py
CHANGED
@@ -3,10 +3,10 @@
 from __future__ import annotations
 from ...core.pydantic_utilities import UniversalBaseModel
 import typing
-from .flush_id import FlushId
 from .context_id import ContextId
 from ...core.pydantic_utilities import IS_PYDANTIC_V2
 import pydantic
+from .flush_id import FlushId
 from .word_timestamps import WordTimestamps
 from .phoneme_timestamps import PhonemeTimestamps
 
@@ -15,7 +15,6 @@ class WebSocketResponse_Chunk(UniversalBaseModel):
     type: typing.Literal["chunk"] = "chunk"
     data: str
     step_time: float
-    flush_id: typing.Optional[FlushId] = None
     context_id: typing.Optional[ContextId] = None
     status_code: int
     done: bool
cartesia/tts/types/web_socket_tts_request.py
CHANGED
@@ -22,9 +22,17 @@ class WebSocketTtsRequest(UniversalBaseModel):
     voice: TtsRequestVoiceSpecifier
     duration: typing.Optional[int] = None
     language: typing.Optional[str] = None
-    add_timestamps: typing.Optional[bool] = None
-
-
+    add_timestamps: typing.Optional[bool] = pydantic.Field(default=None)
+    """
+    Whether to return word-level timestamps. If `false` (default), no word timestamps will be produced at all. If `true`, the server will return timestamp events containing word-level timing information.
+    """
+
+    add_phoneme_timestamps: typing.Optional[bool] = pydantic.Field(default=None)
+    """
+    Whether to return phoneme-level timestamps. If `false` (default), no phoneme timestamps will be produced - if `add_timestamps` is `true`, the produced timestamps will be word timestamps instead. If `true`, the server will return timestamp events containing phoneme-level timing information.
+    """
+
+    use_normalized_timestamps: typing.Optional[bool] = None
     continue_: typing_extensions.Annotated[typing.Optional[bool], FieldMetadata(alias="continue")] = None
     context_id: typing.Optional[str] = None
     max_buffer_delay_ms: typing.Optional[int] = None
cartesia/voice_changer/client.py
CHANGED
@@ -51,11 +51,9 @@ class VoiceChangerClient:
         output_format_encoding : typing.Optional[RawEncoding]
             Required for `raw` and `wav` containers.
 
-
         output_format_bit_rate : typing.Optional[int]
             Required for `mp3` containers.
 
-
         request_options : typing.Optional[RequestOptions]
             Request-specific configuration. You can pass in configuration such as `chunk_size`, and more to customize the request and response.
 
@@ -131,11 +129,9 @@ class VoiceChangerClient:
         output_format_encoding : typing.Optional[RawEncoding]
             Required for `raw` and `wav` containers.
 
-
         output_format_bit_rate : typing.Optional[int]
             Required for `mp3` containers.
 
-
         request_options : typing.Optional[RequestOptions]
             Request-specific configuration.
 
@@ -232,11 +228,9 @@ class AsyncVoiceChangerClient:
         output_format_encoding : typing.Optional[RawEncoding]
             Required for `raw` and `wav` containers.
 
-
         output_format_bit_rate : typing.Optional[int]
             Required for `mp3` containers.
 
-
         request_options : typing.Optional[RequestOptions]
             Request-specific configuration. You can pass in configuration such as `chunk_size`, and more to customize the request and response.
 
@@ -320,11 +314,9 @@ class AsyncVoiceChangerClient:
         output_format_encoding : typing.Optional[RawEncoding]
             Required for `raw` and `wav` containers.
 
-
         output_format_bit_rate : typing.Optional[int]
             Required for `mp3` containers.
 
-
         request_options : typing.Optional[RequestOptions]
             Request-specific configuration.
 
cartesia/voice_changer/requests/streaming_response.py
CHANGED
@@ -4,7 +4,6 @@ from __future__ import annotations
 import typing_extensions
 import typing
 import typing_extensions
-from ...tts.types.flush_id import FlushId
 from ...tts.types.context_id import ContextId
 
 
@@ -12,7 +11,6 @@ class StreamingResponse_ChunkParams(typing_extensions.TypedDict):
     type: typing.Literal["chunk"]
     data: str
     step_time: float
-    flush_id: typing_extensions.NotRequired[FlushId]
     context_id: typing_extensions.NotRequired[ContextId]
     status_code: int
     done: bool
cartesia/voice_changer/types/streaming_response.py
CHANGED
@@ -3,7 +3,6 @@
 from __future__ import annotations
 from ...core.pydantic_utilities import UniversalBaseModel
 import typing
-from ...tts.types.flush_id import FlushId
 from ...tts.types.context_id import ContextId
 from ...core.pydantic_utilities import IS_PYDANTIC_V2
 import pydantic
@@ -13,7 +12,6 @@ class StreamingResponse_Chunk(UniversalBaseModel):
     type: typing.Literal["chunk"] = "chunk"
     data: str
     step_time: float
-    flush_id: typing.Optional[FlushId] = None
     context_id: typing.Optional[ContextId] = None
     status_code: int
     done: bool
cartesia/voices/client.py
CHANGED
@@ -168,27 +168,21 @@ class VoicesClient:
         name : str
             The name of the voice.
 
-
         language : SupportedLanguage
             The language of the voice.
 
-
         mode : CloneMode
             Tradeoff between similarity and stability. Similarity clones sound more like the source clip, but may reproduce background noise. Stability clones always sound like a studio recording, but may not sound as similar to the source clip.
 
-
         description : typing.Optional[str]
             A description for the voice.
 
-
         enhance : typing.Optional[bool]
             Whether to apply AI enhancements to the clip to reduce background noise. This leads to cleaner generated speech at the cost of reduced similarity to the source clip.
 
-
         base_voice_id : typing.Optional[VoiceId]
             Optional base voice ID that the cloned voice is derived from.
 
-
         request_options : typing.Optional[RequestOptions]
             Request-specific configuration.
 
@@ -736,27 +730,21 @@ class AsyncVoicesClient:
         name : str
             The name of the voice.
 
-
         language : SupportedLanguage
             The language of the voice.
 
-
         mode : CloneMode
             Tradeoff between similarity and stability. Similarity clones sound more like the source clip, but may reproduce background noise. Stability clones always sound like a studio recording, but may not sound as similar to the source clip.
 
-
         description : typing.Optional[str]
             A description for the voice.
 
-
         enhance : typing.Optional[bool]
             Whether to apply AI enhancements to the clip to reduce background noise. This leads to cleaner generated speech at the cost of reduced similarity to the source clip.
 
-
         base_voice_id : typing.Optional[VoiceId]
             Optional base voice ID that the cloned voice is derived from.
 
-
         request_options : typing.Optional[RequestOptions]
             Request-specific configuration.
 