cartesia 2.0.0b7__py3-none-any.whl → 2.0.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cartesia/__init__.py +15 -1
- cartesia/auth/__init__.py +13 -0
- cartesia/auth/client.py +159 -0
- cartesia/auth/requests/__init__.py +7 -0
- cartesia/auth/requests/token_grant.py +10 -0
- cartesia/auth/requests/token_request.py +17 -0
- cartesia/auth/requests/token_response.py +10 -0
- cartesia/auth/types/__init__.py +7 -0
- cartesia/auth/types/token_grant.py +22 -0
- cartesia/auth/types/token_request.py +28 -0
- cartesia/auth/types/token_response.py +22 -0
- cartesia/base_client.py +4 -0
- cartesia/core/client_wrapper.py +1 -1
- cartesia/tts/_async_websocket.py +8 -0
- cartesia/tts/_websocket.py +11 -0
- cartesia/tts/client.py +40 -4
- cartesia/tts/requests/generation_request.py +19 -1
- cartesia/tts/requests/tts_request.py +10 -1
- cartesia/tts/requests/web_socket_tts_request.py +3 -1
- cartesia/tts/types/generation_request.py +19 -1
- cartesia/tts/types/tts_request.py +10 -1
- cartesia/tts/types/web_socket_tts_request.py +3 -1
- cartesia/voices/__init__.py +6 -0
- cartesia/voices/client.py +208 -159
- cartesia/voices/requests/create_voice_request.py +2 -0
- cartesia/voices/requests/localize_dialect.py +6 -1
- cartesia/voices/requests/localize_voice_request.py +15 -2
- cartesia/voices/types/__init__.py +6 -0
- cartesia/voices/types/create_voice_request.py +2 -0
- cartesia/voices/types/localize_dialect.py +6 -1
- cartesia/voices/types/localize_french_dialect.py +5 -0
- cartesia/voices/types/localize_portuguese_dialect.py +5 -0
- cartesia/voices/types/localize_spanish_dialect.py +5 -0
- cartesia/voices/types/localize_voice_request.py +16 -3
- {cartesia-2.0.0b7.dist-info → cartesia-2.0.2.dist-info}/METADATA +68 -63
- {cartesia-2.0.0b7.dist-info → cartesia-2.0.2.dist-info}/RECORD +37 -24
- {cartesia-2.0.0b7.dist-info → cartesia-2.0.2.dist-info}/WHEEL +0 -0
cartesia/__init__.py
CHANGED
@@ -1,7 +1,8 @@
|
|
1
1
|
# This file was auto-generated by Fern from our API Definition.
|
2
2
|
|
3
|
-
from . import api_status, datasets, embedding, infill, tts, voice_changer, voices
|
3
|
+
from . import api_status, auth, datasets, embedding, infill, tts, voice_changer, voices
|
4
4
|
from .api_status import ApiInfo, ApiInfoParams
|
5
|
+
from .auth import TokenGrant, TokenGrantParams, TokenRequest, TokenRequestParams, TokenResponse, TokenResponseParams
|
5
6
|
from .client import AsyncCartesia, Cartesia
|
6
7
|
from .datasets import (
|
7
8
|
CreateDatasetRequest,
|
@@ -129,6 +130,9 @@ from .voices import (
|
|
129
130
|
LocalizeDialect,
|
130
131
|
LocalizeDialectParams,
|
131
132
|
LocalizeEnglishDialect,
|
133
|
+
LocalizeFrenchDialect,
|
134
|
+
LocalizePortugueseDialect,
|
135
|
+
LocalizeSpanishDialect,
|
132
136
|
LocalizeTargetLanguage,
|
133
137
|
LocalizeVoiceRequest,
|
134
138
|
LocalizeVoiceRequestParams,
|
@@ -187,6 +191,9 @@ __all__ = [
|
|
187
191
|
"LocalizeDialect",
|
188
192
|
"LocalizeDialectParams",
|
189
193
|
"LocalizeEnglishDialect",
|
194
|
+
"LocalizeFrenchDialect",
|
195
|
+
"LocalizePortugueseDialect",
|
196
|
+
"LocalizeSpanishDialect",
|
190
197
|
"LocalizeTargetLanguage",
|
191
198
|
"LocalizeVoiceRequest",
|
192
199
|
"LocalizeVoiceRequestParams",
|
@@ -227,6 +234,12 @@ __all__ = [
|
|
227
234
|
"StreamingResponse_Error",
|
228
235
|
"StreamingResponse_ErrorParams",
|
229
236
|
"SupportedLanguage",
|
237
|
+
"TokenGrant",
|
238
|
+
"TokenGrantParams",
|
239
|
+
"TokenRequest",
|
240
|
+
"TokenRequestParams",
|
241
|
+
"TokenResponse",
|
242
|
+
"TokenResponseParams",
|
230
243
|
"TtsRequest",
|
231
244
|
"TtsRequestEmbeddingSpecifier",
|
232
245
|
"TtsRequestEmbeddingSpecifierParams",
|
@@ -288,6 +301,7 @@ __all__ = [
|
|
288
301
|
"WordTimestampsParams",
|
289
302
|
"__version__",
|
290
303
|
"api_status",
|
304
|
+
"auth",
|
291
305
|
"datasets",
|
292
306
|
"embedding",
|
293
307
|
"infill",
|
@@ -0,0 +1,13 @@
|
|
1
|
+
# This file was auto-generated by Fern from our API Definition.
|
2
|
+
|
3
|
+
from .types import TokenGrant, TokenRequest, TokenResponse
|
4
|
+
from .requests import TokenGrantParams, TokenRequestParams, TokenResponseParams
|
5
|
+
|
6
|
+
__all__ = [
|
7
|
+
"TokenGrant",
|
8
|
+
"TokenGrantParams",
|
9
|
+
"TokenRequest",
|
10
|
+
"TokenRequestParams",
|
11
|
+
"TokenResponse",
|
12
|
+
"TokenResponseParams",
|
13
|
+
]
|
cartesia/auth/client.py
ADDED
@@ -0,0 +1,159 @@
|
|
1
|
+
# This file was auto-generated by Fern from our API Definition.
|
2
|
+
|
3
|
+
import typing
|
4
|
+
from ..core.client_wrapper import SyncClientWrapper
|
5
|
+
from .requests.token_grant import TokenGrantParams
|
6
|
+
from ..core.request_options import RequestOptions
|
7
|
+
from .types.token_response import TokenResponse
|
8
|
+
from ..core.serialization import convert_and_respect_annotation_metadata
|
9
|
+
from ..core.pydantic_utilities import parse_obj_as
|
10
|
+
from json.decoder import JSONDecodeError
|
11
|
+
from ..core.api_error import ApiError
|
12
|
+
from ..core.client_wrapper import AsyncClientWrapper
|
13
|
+
|
14
|
+
# this is used as the default value for optional parameters
|
15
|
+
OMIT = typing.cast(typing.Any, ...)
|
16
|
+
|
17
|
+
|
18
|
+
class AuthClient:
|
19
|
+
def __init__(self, *, client_wrapper: SyncClientWrapper):
|
20
|
+
self._client_wrapper = client_wrapper
|
21
|
+
|
22
|
+
def access_token(
|
23
|
+
self,
|
24
|
+
*,
|
25
|
+
grants: TokenGrantParams,
|
26
|
+
expires_in: typing.Optional[int] = OMIT,
|
27
|
+
request_options: typing.Optional[RequestOptions] = None,
|
28
|
+
) -> TokenResponse:
|
29
|
+
"""
|
30
|
+
Generates a new Access Token for the client. These tokens are short-lived and should be used to make requests to the API from authenticated clients.
|
31
|
+
|
32
|
+
Parameters
|
33
|
+
----------
|
34
|
+
grants : TokenGrantParams
|
35
|
+
The permissions to be granted via the token.
|
36
|
+
|
37
|
+
expires_in : typing.Optional[int]
|
38
|
+
The number of seconds the token will be valid for since the time of generation. The maximum is 1 hour (3600 seconds).
|
39
|
+
|
40
|
+
request_options : typing.Optional[RequestOptions]
|
41
|
+
Request-specific configuration.
|
42
|
+
|
43
|
+
Returns
|
44
|
+
-------
|
45
|
+
TokenResponse
|
46
|
+
|
47
|
+
Examples
|
48
|
+
--------
|
49
|
+
from cartesia import Cartesia
|
50
|
+
|
51
|
+
client = Cartesia(
|
52
|
+
api_key="YOUR_API_KEY",
|
53
|
+
)
|
54
|
+
client.auth.access_token(
|
55
|
+
grants={"tts": True},
|
56
|
+
expires_in=60,
|
57
|
+
)
|
58
|
+
"""
|
59
|
+
_response = self._client_wrapper.httpx_client.request(
|
60
|
+
"access-token",
|
61
|
+
method="POST",
|
62
|
+
json={
|
63
|
+
"grants": convert_and_respect_annotation_metadata(
|
64
|
+
object_=grants, annotation=TokenGrantParams, direction="write"
|
65
|
+
),
|
66
|
+
"expires_in": expires_in,
|
67
|
+
},
|
68
|
+
request_options=request_options,
|
69
|
+
omit=OMIT,
|
70
|
+
)
|
71
|
+
try:
|
72
|
+
if 200 <= _response.status_code < 300:
|
73
|
+
return typing.cast(
|
74
|
+
TokenResponse,
|
75
|
+
parse_obj_as(
|
76
|
+
type_=TokenResponse, # type: ignore
|
77
|
+
object_=_response.json(),
|
78
|
+
),
|
79
|
+
)
|
80
|
+
_response_json = _response.json()
|
81
|
+
except JSONDecodeError:
|
82
|
+
raise ApiError(status_code=_response.status_code, body=_response.text)
|
83
|
+
raise ApiError(status_code=_response.status_code, body=_response_json)
|
84
|
+
|
85
|
+
|
86
|
+
class AsyncAuthClient:
|
87
|
+
def __init__(self, *, client_wrapper: AsyncClientWrapper):
|
88
|
+
self._client_wrapper = client_wrapper
|
89
|
+
|
90
|
+
async def access_token(
|
91
|
+
self,
|
92
|
+
*,
|
93
|
+
grants: TokenGrantParams,
|
94
|
+
expires_in: typing.Optional[int] = OMIT,
|
95
|
+
request_options: typing.Optional[RequestOptions] = None,
|
96
|
+
) -> TokenResponse:
|
97
|
+
"""
|
98
|
+
Generates a new Access Token for the client. These tokens are short-lived and should be used to make requests to the API from authenticated clients.
|
99
|
+
|
100
|
+
Parameters
|
101
|
+
----------
|
102
|
+
grants : TokenGrantParams
|
103
|
+
The permissions to be granted via the token.
|
104
|
+
|
105
|
+
expires_in : typing.Optional[int]
|
106
|
+
The number of seconds the token will be valid for since the time of generation. The maximum is 1 hour (3600 seconds).
|
107
|
+
|
108
|
+
request_options : typing.Optional[RequestOptions]
|
109
|
+
Request-specific configuration.
|
110
|
+
|
111
|
+
Returns
|
112
|
+
-------
|
113
|
+
TokenResponse
|
114
|
+
|
115
|
+
Examples
|
116
|
+
--------
|
117
|
+
import asyncio
|
118
|
+
|
119
|
+
from cartesia import AsyncCartesia
|
120
|
+
|
121
|
+
client = AsyncCartesia(
|
122
|
+
api_key="YOUR_API_KEY",
|
123
|
+
)
|
124
|
+
|
125
|
+
|
126
|
+
async def main() -> None:
|
127
|
+
await client.auth.access_token(
|
128
|
+
grants={"tts": True},
|
129
|
+
expires_in=60,
|
130
|
+
)
|
131
|
+
|
132
|
+
|
133
|
+
asyncio.run(main())
|
134
|
+
"""
|
135
|
+
_response = await self._client_wrapper.httpx_client.request(
|
136
|
+
"access-token",
|
137
|
+
method="POST",
|
138
|
+
json={
|
139
|
+
"grants": convert_and_respect_annotation_metadata(
|
140
|
+
object_=grants, annotation=TokenGrantParams, direction="write"
|
141
|
+
),
|
142
|
+
"expires_in": expires_in,
|
143
|
+
},
|
144
|
+
request_options=request_options,
|
145
|
+
omit=OMIT,
|
146
|
+
)
|
147
|
+
try:
|
148
|
+
if 200 <= _response.status_code < 300:
|
149
|
+
return typing.cast(
|
150
|
+
TokenResponse,
|
151
|
+
parse_obj_as(
|
152
|
+
type_=TokenResponse, # type: ignore
|
153
|
+
object_=_response.json(),
|
154
|
+
),
|
155
|
+
)
|
156
|
+
_response_json = _response.json()
|
157
|
+
except JSONDecodeError:
|
158
|
+
raise ApiError(status_code=_response.status_code, body=_response.text)
|
159
|
+
raise ApiError(status_code=_response.status_code, body=_response_json)
|
@@ -0,0 +1,7 @@
|
|
1
|
+
# This file was auto-generated by Fern from our API Definition.
|
2
|
+
|
3
|
+
from .token_grant import TokenGrantParams
|
4
|
+
from .token_request import TokenRequestParams
|
5
|
+
from .token_response import TokenResponseParams
|
6
|
+
|
7
|
+
__all__ = ["TokenGrantParams", "TokenRequestParams", "TokenResponseParams"]
|
@@ -0,0 +1,17 @@
|
|
1
|
+
# This file was auto-generated by Fern from our API Definition.
|
2
|
+
|
3
|
+
import typing_extensions
|
4
|
+
from .token_grant import TokenGrantParams
|
5
|
+
import typing_extensions
|
6
|
+
|
7
|
+
|
8
|
+
class TokenRequestParams(typing_extensions.TypedDict):
|
9
|
+
grants: TokenGrantParams
|
10
|
+
"""
|
11
|
+
The permissions to be granted via the token.
|
12
|
+
"""
|
13
|
+
|
14
|
+
expires_in: typing_extensions.NotRequired[int]
|
15
|
+
"""
|
16
|
+
The number of seconds the token will be valid for since the time of generation. The maximum is 1 hour (3600 seconds).
|
17
|
+
"""
|
@@ -0,0 +1,22 @@
|
|
1
|
+
# This file was auto-generated by Fern from our API Definition.
|
2
|
+
|
3
|
+
from ...core.pydantic_utilities import UniversalBaseModel
|
4
|
+
import pydantic
|
5
|
+
from ...core.pydantic_utilities import IS_PYDANTIC_V2
|
6
|
+
import typing
|
7
|
+
|
8
|
+
|
9
|
+
class TokenGrant(UniversalBaseModel):
|
10
|
+
tts: bool = pydantic.Field()
|
11
|
+
"""
|
12
|
+
The `tts` grant allows the token to be used to access any TTS endpoint.
|
13
|
+
"""
|
14
|
+
|
15
|
+
if IS_PYDANTIC_V2:
|
16
|
+
model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2
|
17
|
+
else:
|
18
|
+
|
19
|
+
class Config:
|
20
|
+
frozen = True
|
21
|
+
smart_union = True
|
22
|
+
extra = pydantic.Extra.allow
|
@@ -0,0 +1,28 @@
|
|
1
|
+
# This file was auto-generated by Fern from our API Definition.
|
2
|
+
|
3
|
+
from ...core.pydantic_utilities import UniversalBaseModel
|
4
|
+
from .token_grant import TokenGrant
|
5
|
+
import pydantic
|
6
|
+
import typing
|
7
|
+
from ...core.pydantic_utilities import IS_PYDANTIC_V2
|
8
|
+
|
9
|
+
|
10
|
+
class TokenRequest(UniversalBaseModel):
|
11
|
+
grants: TokenGrant = pydantic.Field()
|
12
|
+
"""
|
13
|
+
The permissions to be granted via the token.
|
14
|
+
"""
|
15
|
+
|
16
|
+
expires_in: typing.Optional[int] = pydantic.Field(default=None)
|
17
|
+
"""
|
18
|
+
The number of seconds the token will be valid for since the time of generation. The maximum is 1 hour (3600 seconds).
|
19
|
+
"""
|
20
|
+
|
21
|
+
if IS_PYDANTIC_V2:
|
22
|
+
model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2
|
23
|
+
else:
|
24
|
+
|
25
|
+
class Config:
|
26
|
+
frozen = True
|
27
|
+
smart_union = True
|
28
|
+
extra = pydantic.Extra.allow
|
@@ -0,0 +1,22 @@
|
|
1
|
+
# This file was auto-generated by Fern from our API Definition.
|
2
|
+
|
3
|
+
from ...core.pydantic_utilities import UniversalBaseModel
|
4
|
+
import pydantic
|
5
|
+
from ...core.pydantic_utilities import IS_PYDANTIC_V2
|
6
|
+
import typing
|
7
|
+
|
8
|
+
|
9
|
+
class TokenResponse(UniversalBaseModel):
|
10
|
+
token: str = pydantic.Field()
|
11
|
+
"""
|
12
|
+
The generated Access Token.
|
13
|
+
"""
|
14
|
+
|
15
|
+
if IS_PYDANTIC_V2:
|
16
|
+
model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2
|
17
|
+
else:
|
18
|
+
|
19
|
+
class Config:
|
20
|
+
frozen = True
|
21
|
+
smart_union = True
|
22
|
+
extra = pydantic.Extra.allow
|
cartesia/base_client.py
CHANGED
@@ -5,12 +5,14 @@ from .environment import CartesiaEnvironment
|
|
5
5
|
import httpx
|
6
6
|
from .core.client_wrapper import SyncClientWrapper
|
7
7
|
from .api_status.client import ApiStatusClient
|
8
|
+
from .auth.client import AuthClient
|
8
9
|
from .infill.client import InfillClient
|
9
10
|
from .tts.client import TtsClient
|
10
11
|
from .voice_changer.client import VoiceChangerClient
|
11
12
|
from .voices.client import VoicesClient
|
12
13
|
from .core.client_wrapper import AsyncClientWrapper
|
13
14
|
from .api_status.client import AsyncApiStatusClient
|
15
|
+
from .auth.client import AsyncAuthClient
|
14
16
|
from .infill.client import AsyncInfillClient
|
15
17
|
from .tts.client import AsyncTtsClient
|
16
18
|
from .voice_changer.client import AsyncVoiceChangerClient
|
@@ -76,6 +78,7 @@ class BaseCartesia:
|
|
76
78
|
timeout=_defaulted_timeout,
|
77
79
|
)
|
78
80
|
self.api_status = ApiStatusClient(client_wrapper=self._client_wrapper)
|
81
|
+
self.auth = AuthClient(client_wrapper=self._client_wrapper)
|
79
82
|
self.infill = InfillClient(client_wrapper=self._client_wrapper)
|
80
83
|
self.tts = TtsClient(client_wrapper=self._client_wrapper)
|
81
84
|
self.voice_changer = VoiceChangerClient(client_wrapper=self._client_wrapper)
|
@@ -141,6 +144,7 @@ class AsyncBaseCartesia:
|
|
141
144
|
timeout=_defaulted_timeout,
|
142
145
|
)
|
143
146
|
self.api_status = AsyncApiStatusClient(client_wrapper=self._client_wrapper)
|
147
|
+
self.auth = AsyncAuthClient(client_wrapper=self._client_wrapper)
|
144
148
|
self.infill = AsyncInfillClient(client_wrapper=self._client_wrapper)
|
145
149
|
self.tts = AsyncTtsClient(client_wrapper=self._client_wrapper)
|
146
150
|
self.voice_changer = AsyncVoiceChangerClient(client_wrapper=self._client_wrapper)
|
cartesia/core/client_wrapper.py
CHANGED
@@ -16,7 +16,7 @@ class BaseClientWrapper:
|
|
16
16
|
headers: typing.Dict[str, str] = {
|
17
17
|
"X-Fern-Language": "Python",
|
18
18
|
"X-Fern-SDK-Name": "cartesia",
|
19
|
-
"X-Fern-SDK-Version": "2.0.0b7",
|
19
|
+
"X-Fern-SDK-Version": "2.0.2",
|
20
20
|
}
|
21
21
|
headers["X-API-Key"] = self.api_key
|
22
22
|
headers["Cartesia-Version"] = "2024-11-13"
|
cartesia/tts/_async_websocket.py
CHANGED
@@ -69,7 +69,9 @@ class _AsyncTTSContext:
|
|
69
69
|
stream: bool = True,
|
70
70
|
add_timestamps: bool = False,
|
71
71
|
add_phoneme_timestamps: bool = False,
|
72
|
+
use_original_timestamps: bool = False,
|
72
73
|
continue_: bool = False,
|
74
|
+
max_buffer_delay_ms: Optional[int] = None,
|
73
75
|
flush: bool = False,
|
74
76
|
) -> None:
|
75
77
|
"""Send audio generation requests to the WebSocket. The response can be received using the `receive` method.
|
@@ -106,8 +108,12 @@ class _AsyncTTSContext:
|
|
106
108
|
request_body["add_timestamps"] = add_timestamps
|
107
109
|
if add_phoneme_timestamps:
|
108
110
|
request_body["add_phoneme_timestamps"] = add_phoneme_timestamps
|
111
|
+
if use_original_timestamps:
|
112
|
+
request_body["use_original_timestamps"] = use_original_timestamps
|
109
113
|
if continue_:
|
110
114
|
request_body["continue"] = continue_
|
115
|
+
if max_buffer_delay_ms:
|
116
|
+
request_body["max_buffer_delay_ms"] = max_buffer_delay_ms
|
111
117
|
if flush:
|
112
118
|
request_body["flush"] = flush
|
113
119
|
|
@@ -367,6 +373,7 @@ class AsyncTtsWebsocket(TtsWebsocket):
|
|
367
373
|
stream: bool = True,
|
368
374
|
add_timestamps: bool = False,
|
369
375
|
add_phoneme_timestamps: bool = False,
|
376
|
+
use_original_timestamps: bool = False,
|
370
377
|
):
|
371
378
|
"""See :meth:`_WebSocket.send` for details."""
|
372
379
|
if context_id is None:
|
@@ -385,6 +392,7 @@ class AsyncTtsWebsocket(TtsWebsocket):
|
|
385
392
|
continue_=False,
|
386
393
|
add_timestamps=add_timestamps,
|
387
394
|
add_phoneme_timestamps=add_phoneme_timestamps,
|
395
|
+
use_original_timestamps=use_original_timestamps,
|
388
396
|
)
|
389
397
|
|
390
398
|
generator = ctx.receive()
|
cartesia/tts/_websocket.py
CHANGED
@@ -63,10 +63,13 @@ class _TTSContext:
|
|
63
63
|
output_format: OutputFormatParams,
|
64
64
|
voice: TtsRequestVoiceSpecifierParams,
|
65
65
|
context_id: Optional[str] = None,
|
66
|
+
max_buffer_delay_ms: Optional[int] = None,
|
66
67
|
duration: Optional[int] = None,
|
67
68
|
language: Optional[str] = None,
|
68
69
|
stream: bool = True,
|
69
70
|
add_timestamps: bool = False,
|
71
|
+
add_phoneme_timestamps: bool = False,
|
72
|
+
use_original_timestamps: bool = False,
|
70
73
|
) -> Generator[bytes, None, None]:
|
71
74
|
"""Send audio generation requests to the WebSocket and yield responses.
|
72
75
|
|
@@ -102,6 +105,12 @@ class _TTSContext:
|
|
102
105
|
request_body["stream"] = stream
|
103
106
|
if add_timestamps:
|
104
107
|
request_body["add_timestamps"] = add_timestamps
|
108
|
+
if add_phoneme_timestamps:
|
109
|
+
request_body["add_phoneme_timestamps"] = add_phoneme_timestamps
|
110
|
+
if use_original_timestamps:
|
111
|
+
request_body["use_original_timestamps"] = use_original_timestamps
|
112
|
+
if max_buffer_delay_ms:
|
113
|
+
request_body["max_buffer_delay_ms"] = max_buffer_delay_ms
|
105
114
|
|
106
115
|
if (
|
107
116
|
"context_id" in request_body
|
@@ -354,6 +363,7 @@ class TtsWebsocket:
|
|
354
363
|
stream: bool = True,
|
355
364
|
add_timestamps: bool = False,
|
356
365
|
add_phoneme_timestamps: bool = False,
|
366
|
+
use_original_timestamps: bool = False,
|
357
367
|
):
|
358
368
|
"""Send a request to the WebSocket to generate audio.
|
359
369
|
|
@@ -384,6 +394,7 @@ class TtsWebsocket:
|
|
384
394
|
"stream": stream,
|
385
395
|
"add_timestamps": add_timestamps,
|
386
396
|
"add_phoneme_timestamps": add_phoneme_timestamps,
|
397
|
+
"use_original_timestamps": use_original_timestamps,
|
387
398
|
}
|
388
399
|
generator = self._websocket_generator(request_body)
|
389
400
|
|
cartesia/tts/client.py
CHANGED
@@ -32,13 +32,14 @@ class TtsClient:
|
|
32
32
|
output_format: OutputFormatParams,
|
33
33
|
language: typing.Optional[SupportedLanguage] = OMIT,
|
34
34
|
duration: typing.Optional[float] = OMIT,
|
35
|
+
text_cfg: typing.Optional[float] = OMIT,
|
35
36
|
request_options: typing.Optional[RequestOptions] = None,
|
36
37
|
) -> typing.Iterator[bytes]:
|
37
38
|
"""
|
38
39
|
Parameters
|
39
40
|
----------
|
40
41
|
model_id : str
|
41
|
-
The ID of the model to use for the generation. See [Models](/build-with-
|
42
|
+
The ID of the model to use for the generation. See [Models](/build-with-cartesia/models) for available models.
|
42
43
|
|
43
44
|
transcript : str
|
44
45
|
|
@@ -52,6 +53,13 @@ class TtsClient:
|
|
52
53
|
The maximum duration of the audio in seconds. You do not usually need to specify this.
|
53
54
|
If the duration is not appropriate for the length of the transcript, the output audio may be truncated.
|
54
55
|
|
56
|
+
text_cfg : typing.Optional[float]
|
57
|
+
The text [classifier-free guidance](https://arxiv.org/abs/2207.12598) value for the request.
|
58
|
+
|
59
|
+
Higher values causes the model to attend more to the text but speed up the generation. Lower values reduce the speaking rate but can increase the risk of hallucinations. The default value is `3.0`. For a slower speaking rate, we recommend values between `2.0` and `3.0`. Values are supported between `1.5` and `3.0`.
|
60
|
+
|
61
|
+
This parameter is only supported for `sonic-2` models.
|
62
|
+
|
55
63
|
request_options : typing.Optional[RequestOptions]
|
56
64
|
Request-specific configuration. You can pass in configuration such as `chunk_size`, and more to customize the request and response.
|
57
65
|
|
@@ -92,6 +100,7 @@ class TtsClient:
|
|
92
100
|
object_=output_format, annotation=OutputFormatParams, direction="write"
|
93
101
|
),
|
94
102
|
"duration": duration,
|
103
|
+
"text_cfg": text_cfg,
|
95
104
|
},
|
96
105
|
request_options=request_options,
|
97
106
|
omit=OMIT,
|
@@ -117,13 +126,14 @@ class TtsClient:
|
|
117
126
|
output_format: OutputFormatParams,
|
118
127
|
language: typing.Optional[SupportedLanguage] = OMIT,
|
119
128
|
duration: typing.Optional[float] = OMIT,
|
129
|
+
text_cfg: typing.Optional[float] = OMIT,
|
120
130
|
request_options: typing.Optional[RequestOptions] = None,
|
121
131
|
) -> typing.Iterator[WebSocketResponse]:
|
122
132
|
"""
|
123
133
|
Parameters
|
124
134
|
----------
|
125
135
|
model_id : str
|
126
|
-
The ID of the model to use for the generation. See [Models](/build-with-
|
136
|
+
The ID of the model to use for the generation. See [Models](/build-with-cartesia/models) for available models.
|
127
137
|
|
128
138
|
transcript : str
|
129
139
|
|
@@ -137,6 +147,13 @@ class TtsClient:
|
|
137
147
|
The maximum duration of the audio in seconds. You do not usually need to specify this.
|
138
148
|
If the duration is not appropriate for the length of the transcript, the output audio may be truncated.
|
139
149
|
|
150
|
+
text_cfg : typing.Optional[float]
|
151
|
+
The text [classifier-free guidance](https://arxiv.org/abs/2207.12598) value for the request.
|
152
|
+
|
153
|
+
Higher values causes the model to attend more to the text but speed up the generation. Lower values reduce the speaking rate but can increase the risk of hallucinations. The default value is `3.0`. For a slower speaking rate, we recommend values between `2.0` and `3.0`. Values are supported between `1.5` and `3.0`.
|
154
|
+
|
155
|
+
This parameter is only supported for `sonic-2` models.
|
156
|
+
|
140
157
|
request_options : typing.Optional[RequestOptions]
|
141
158
|
Request-specific configuration.
|
142
159
|
|
@@ -179,6 +196,7 @@ class TtsClient:
|
|
179
196
|
object_=output_format, annotation=OutputFormatParams, direction="write"
|
180
197
|
),
|
181
198
|
"duration": duration,
|
199
|
+
"text_cfg": text_cfg,
|
182
200
|
},
|
183
201
|
request_options=request_options,
|
184
202
|
omit=OMIT,
|
@@ -218,13 +236,14 @@ class AsyncTtsClient:
|
|
218
236
|
output_format: OutputFormatParams,
|
219
237
|
language: typing.Optional[SupportedLanguage] = OMIT,
|
220
238
|
duration: typing.Optional[float] = OMIT,
|
239
|
+
text_cfg: typing.Optional[float] = OMIT,
|
221
240
|
request_options: typing.Optional[RequestOptions] = None,
|
222
241
|
) -> typing.AsyncIterator[bytes]:
|
223
242
|
"""
|
224
243
|
Parameters
|
225
244
|
----------
|
226
245
|
model_id : str
|
227
|
-
The ID of the model to use for the generation. See [Models](/build-with-
|
246
|
+
The ID of the model to use for the generation. See [Models](/build-with-cartesia/models) for available models.
|
228
247
|
|
229
248
|
transcript : str
|
230
249
|
|
@@ -238,6 +257,13 @@ class AsyncTtsClient:
|
|
238
257
|
The maximum duration of the audio in seconds. You do not usually need to specify this.
|
239
258
|
If the duration is not appropriate for the length of the transcript, the output audio may be truncated.
|
240
259
|
|
260
|
+
text_cfg : typing.Optional[float]
|
261
|
+
The text [classifier-free guidance](https://arxiv.org/abs/2207.12598) value for the request.
|
262
|
+
|
263
|
+
Higher values causes the model to attend more to the text but speed up the generation. Lower values reduce the speaking rate but can increase the risk of hallucinations. The default value is `3.0`. For a slower speaking rate, we recommend values between `2.0` and `3.0`. Values are supported between `1.5` and `3.0`.
|
264
|
+
|
265
|
+
This parameter is only supported for `sonic-2` models.
|
266
|
+
|
241
267
|
request_options : typing.Optional[RequestOptions]
|
242
268
|
Request-specific configuration. You can pass in configuration such as `chunk_size`, and more to customize the request and response.
|
243
269
|
|
@@ -286,6 +312,7 @@ class AsyncTtsClient:
|
|
286
312
|
object_=output_format, annotation=OutputFormatParams, direction="write"
|
287
313
|
),
|
288
314
|
"duration": duration,
|
315
|
+
"text_cfg": text_cfg,
|
289
316
|
},
|
290
317
|
request_options=request_options,
|
291
318
|
omit=OMIT,
|
@@ -311,13 +338,14 @@ class AsyncTtsClient:
|
|
311
338
|
output_format: OutputFormatParams,
|
312
339
|
language: typing.Optional[SupportedLanguage] = OMIT,
|
313
340
|
duration: typing.Optional[float] = OMIT,
|
341
|
+
text_cfg: typing.Optional[float] = OMIT,
|
314
342
|
request_options: typing.Optional[RequestOptions] = None,
|
315
343
|
) -> typing.AsyncIterator[WebSocketResponse]:
|
316
344
|
"""
|
317
345
|
Parameters
|
318
346
|
----------
|
319
347
|
model_id : str
|
320
|
-
The ID of the model to use for the generation. See [Models](/build-with-
|
348
|
+
The ID of the model to use for the generation. See [Models](/build-with-cartesia/models) for available models.
|
321
349
|
|
322
350
|
transcript : str
|
323
351
|
|
@@ -331,6 +359,13 @@ class AsyncTtsClient:
|
|
331
359
|
The maximum duration of the audio in seconds. You do not usually need to specify this.
|
332
360
|
If the duration is not appropriate for the length of the transcript, the output audio may be truncated.
|
333
361
|
|
362
|
+
text_cfg : typing.Optional[float]
|
363
|
+
The text [classifier-free guidance](https://arxiv.org/abs/2207.12598) value for the request.
|
364
|
+
|
365
|
+
Higher values causes the model to attend more to the text but speed up the generation. Lower values reduce the speaking rate but can increase the risk of hallucinations. The default value is `3.0`. For a slower speaking rate, we recommend values between `2.0` and `3.0`. Values are supported between `1.5` and `3.0`.
|
366
|
+
|
367
|
+
This parameter is only supported for `sonic-2` models.
|
368
|
+
|
334
369
|
request_options : typing.Optional[RequestOptions]
|
335
370
|
Request-specific configuration.
|
336
371
|
|
@@ -381,6 +416,7 @@ class AsyncTtsClient:
|
|
381
416
|
object_=output_format, annotation=OutputFormatParams, direction="write"
|
382
417
|
),
|
383
418
|
"duration": duration,
|
419
|
+
"text_cfg": text_cfg,
|
384
420
|
},
|
385
421
|
request_options=request_options,
|
386
422
|
omit=OMIT,
|
@@ -13,7 +13,7 @@ from ...core.serialization import FieldMetadata
|
|
13
13
|
class GenerationRequestParams(typing_extensions.TypedDict):
|
14
14
|
model_id: str
|
15
15
|
"""
|
16
|
-
The ID of the model to use for the generation. See [Models](/build-with-
|
16
|
+
The ID of the model to use for the generation. See [Models](/build-with-cartesia/models) for available models.
|
17
17
|
"""
|
18
18
|
|
19
19
|
transcript: typing.Optional[typing.Any]
|
@@ -30,6 +30,15 @@ class GenerationRequestParams(typing_extensions.TypedDict):
|
|
30
30
|
If the duration is not appropriate for the length of the transcript, the output audio may be truncated.
|
31
31
|
"""
|
32
32
|
|
33
|
+
text_cfg: typing_extensions.NotRequired[float]
|
34
|
+
"""
|
35
|
+
The text [classifier-free guidance](https://arxiv.org/abs/2207.12598) value for the request.
|
36
|
+
|
37
|
+
Higher values causes the model to attend more to the text but speed up the generation. Lower values reduce the speaking rate but can increase the risk of hallucinations. The default value is `3.0`. For a slower speaking rate, we recommend values between `2.0` and `3.0`. Values are supported between `1.5` and `3.0`.
|
38
|
+
|
39
|
+
This parameter is only supported for `sonic-2` models.
|
40
|
+
"""
|
41
|
+
|
33
42
|
context_id: typing_extensions.NotRequired[ContextId]
|
34
43
|
continue_: typing_extensions.NotRequired[typing_extensions.Annotated[bool, FieldMetadata(alias="continue")]]
|
35
44
|
"""
|
@@ -37,6 +46,15 @@ class GenerationRequestParams(typing_extensions.TypedDict):
|
|
37
46
|
If not specified, this defaults to `false`.
|
38
47
|
"""
|
39
48
|
|
49
|
+
max_buffer_delay_ms: typing_extensions.NotRequired[int]
|
50
|
+
"""
|
51
|
+
The maximum time in milliseconds to buffer text before starting generation. Values between [0, 1000]ms are supported. Defaults to 0 (no buffering).
|
52
|
+
|
53
|
+
When set, the model will buffer incoming text chunks until it's confident it has enough context to generate high-quality speech, or the buffer delay elapses, whichever comes first. Without this option set, the model will kick off generations immediately, ceding control of buffering to the user.
|
54
|
+
|
55
|
+
Use this to balance responsiveness with higher quality speech generation, which often benefits from having more context.
|
56
|
+
"""
|
57
|
+
|
40
58
|
flush: typing_extensions.NotRequired[bool]
|
41
59
|
"""
|
42
60
|
Whether to flush the context.
|