cartesia 2.0.0b1__py3-none-any.whl → 2.0.0b7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cartesia/__init__.py +8 -4
- cartesia/base_client.py +0 -4
- cartesia/core/__init__.py +3 -0
- cartesia/core/client_wrapper.py +2 -2
- cartesia/core/pagination.py +88 -0
- cartesia/infill/client.py +4 -4
- cartesia/tts/_async_websocket.py +48 -1
- cartesia/tts/_websocket.py +44 -3
- cartesia/tts/client.py +4 -4
- cartesia/tts/requests/generation_request.py +5 -0
- cartesia/tts/requests/web_socket_chunk_response.py +3 -0
- cartesia/tts/requests/web_socket_response.py +2 -1
- cartesia/tts/requests/web_socket_tts_request.py +1 -0
- cartesia/tts/types/emotion.py +5 -0
- cartesia/tts/types/generation_request.py +5 -0
- cartesia/tts/types/web_socket_chunk_response.py +3 -1
- cartesia/tts/types/web_socket_response.py +2 -1
- cartesia/tts/types/web_socket_tts_output.py +2 -0
- cartesia/tts/types/web_socket_tts_request.py +1 -0
- cartesia/tts/utils/constants.py +2 -2
- cartesia/voice_changer/requests/streaming_response.py +2 -0
- cartesia/voice_changer/types/streaming_response.py +2 -0
- cartesia/voices/__init__.py +8 -4
- cartesia/voices/client.py +285 -169
- cartesia/voices/requests/__init__.py +2 -0
- cartesia/voices/requests/create_voice_request.py +0 -2
- cartesia/voices/requests/get_voices_response.py +24 -0
- cartesia/voices/requests/localize_dialect.py +1 -3
- cartesia/voices/requests/voice.py +13 -9
- cartesia/voices/types/__init__.py +6 -4
- cartesia/voices/types/create_voice_request.py +0 -2
- cartesia/voices/types/gender_presentation.py +5 -0
- cartesia/voices/types/get_voices_response.py +34 -0
- cartesia/voices/types/localize_dialect.py +1 -3
- cartesia/voices/types/voice.py +13 -9
- cartesia/voices/types/voice_expand_options.py +5 -0
- {cartesia-2.0.0b1.dist-info → cartesia-2.0.0b7.dist-info}/METADATA +151 -49
- {cartesia-2.0.0b1.dist-info → cartesia-2.0.0b7.dist-info}/RECORD +39 -37
- cartesia/datasets/client.py +0 -392
- cartesia/voices/types/localize_portuguese_dialect.py +0 -5
- cartesia/voices/types/localize_spanish_dialect.py +0 -5
- {cartesia-2.0.0b1.dist-info → cartesia-2.0.0b7.dist-info}/WHEEL +0 -0

cartesia/voices/requests/__init__.py
CHANGED
@@ -3,6 +3,7 @@
 from .create_voice_request import CreateVoiceRequestParams
 from .embedding_response import EmbeddingResponseParams
 from .embedding_specifier import EmbeddingSpecifierParams
+from .get_voices_response import GetVoicesResponseParams
 from .id_specifier import IdSpecifierParams
 from .localize_dialect import LocalizeDialectParams
 from .localize_voice_request import LocalizeVoiceRequestParams
@@ -16,6 +17,7 @@ __all__ = [
     "CreateVoiceRequestParams",
     "EmbeddingResponseParams",
     "EmbeddingSpecifierParams",
+    "GetVoicesResponseParams",
     "IdSpecifierParams",
     "LocalizeDialectParams",
     "LocalizeVoiceRequestParams",

cartesia/voices/requests/create_voice_request.py
CHANGED
@@ -4,7 +4,6 @@ import typing_extensions
 from ...embedding.types.embedding import Embedding
 import typing_extensions
 from ...tts.types.supported_language import SupportedLanguage
-from ..types.base_voice_id import BaseVoiceId


 class CreateVoiceRequestParams(typing_extensions.TypedDict):
@@ -20,4 +19,3 @@ class CreateVoiceRequestParams(typing_extensions.TypedDict):

     embedding: Embedding
     language: typing_extensions.NotRequired[SupportedLanguage]
-    base_voice_id: typing_extensions.NotRequired[BaseVoiceId]

cartesia/voices/requests/get_voices_response.py
ADDED
@@ -0,0 +1,24 @@
+# This file was auto-generated by Fern from our API Definition.
+
+import typing_extensions
+import typing
+from .voice import VoiceParams
+import typing_extensions
+from ..types.voice_id import VoiceId
+
+
+class GetVoicesResponseParams(typing_extensions.TypedDict):
+    data: typing.Sequence[VoiceParams]
+    """
+    The paginated list of Voices.
+    """
+
+    has_more: bool
+    """
+    Whether there are more Voices to fetch (using `starting_after=id`, where id is the ID of the last Voice in the current response).
+    """
+
+    next_page: typing_extensions.NotRequired[VoiceId]
+    """
+    (Deprecated - use the id of the last Voice in the current response instead.) An ID that can be passed as `starting_after` to get the next page of Voices.
+    """
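
The `has_more` and `starting_after` docstrings above describe the cursor protocol for paging through voices. A minimal pagination sketch built on them, assuming `client.voices.list()` returns this response shape and accepts the `starting_after` keyword named in the docstrings:

```python
import os
from cartesia import Cartesia

client = Cartesia(api_key=os.getenv("CARTESIA_API_KEY"))

# Cursor-driven pagination: keep fetching while has_more is set, passing the
# id of the last Voice in each page as starting_after (per the docstrings).
all_voices = []
cursor = None
while True:
    page = client.voices.list(starting_after=cursor) if cursor else client.voices.list()
    all_voices.extend(page.data)
    if not page.has_more or not page.data:
        break
    cursor = page.data[-1].id  # preferred over the deprecated next_page field
```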

cartesia/voices/requests/localize_dialect.py
CHANGED
@@ -2,7 +2,5 @@

 import typing
 from ..types.localize_english_dialect import LocalizeEnglishDialect
-from ..types.localize_spanish_dialect import LocalizeSpanishDialect
-from ..types.localize_portuguese_dialect import LocalizePortugueseDialect

-LocalizeDialectParams = typing.Union[LocalizeEnglishDialect, LocalizeSpanishDialect, LocalizePortugueseDialect]
+LocalizeDialectParams = typing.Union[LocalizeEnglishDialect]

cartesia/voices/requests/voice.py
CHANGED
@@ -2,22 +2,17 @@

 import typing_extensions
 from ..types.voice_id import VoiceId
-import typing_extensions
 import datetime as dt
+import typing_extensions
 from ...embedding.types.embedding import Embedding
 from ...tts.types.supported_language import SupportedLanguage


 class VoiceParams(typing_extensions.TypedDict):
     id: VoiceId
-
-    """
-    The ID of the user who owns the voice.
+    is_owner: bool
     """
-
-    is_public: bool
-    """
-    Whether the voice is publicly accessible.
+    Whether the current user is the owner of the voice.
     """

     name: str
@@ -35,5 +30,14 @@ class VoiceParams(typing_extensions.TypedDict):
     The date and time the voice was created.
     """

-    embedding: Embedding
+    embedding: typing_extensions.NotRequired[Embedding]
+    """
+    The vector embedding of the voice. Only included when `expand` includes `embedding`.
+    """
+
+    is_starred: typing_extensions.NotRequired[bool]
+    """
+    Whether the current user has starred the voice. Only included when `expand` includes `is_starred`.
+    """
+
     language: SupportedLanguage
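
Because `VoiceParams` is a TypedDict, the required/optional split above shows up at construction time. An illustrative literal; the values are made up, the `created_at` key name is inferred from its docstring, and fields outside this hunk (such as a description) are omitted, so a strict type checker may flag missing required keys:

```python
import datetime as dt
from cartesia.voices.requests.voice import VoiceParams

voice: VoiceParams = {
    "id": "a0e99841-438c-4a64-b679-ae501e7d6091",  # hypothetical VoiceId
    "is_owner": True,
    "name": "My Voice",
    "created_at": dt.datetime.now(dt.timezone.utc),  # key name assumed from the docstring
    "language": "en",
    # NotRequired fields are present only when requested via `expand`:
    "embedding": [0.1, 0.2],  # truncated for illustration
    "is_starred": False,
}
```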

cartesia/voices/types/__init__.py
CHANGED
@@ -6,17 +6,18 @@ from .create_voice_request import CreateVoiceRequest
 from .embedding_response import EmbeddingResponse
 from .embedding_specifier import EmbeddingSpecifier
 from .gender import Gender
+from .gender_presentation import GenderPresentation
+from .get_voices_response import GetVoicesResponse
 from .id_specifier import IdSpecifier
 from .localize_dialect import LocalizeDialect
 from .localize_english_dialect import LocalizeEnglishDialect
-from .localize_portuguese_dialect import LocalizePortugueseDialect
-from .localize_spanish_dialect import LocalizeSpanishDialect
 from .localize_target_language import LocalizeTargetLanguage
 from .localize_voice_request import LocalizeVoiceRequest
 from .mix_voice_specifier import MixVoiceSpecifier
 from .mix_voices_request import MixVoicesRequest
 from .update_voice_request import UpdateVoiceRequest
 from .voice import Voice
+from .voice_expand_options import VoiceExpandOptions
 from .voice_id import VoiceId
 from .voice_metadata import VoiceMetadata
 from .weight import Weight
@@ -28,17 +29,18 @@ __all__ = [
     "EmbeddingResponse",
     "EmbeddingSpecifier",
     "Gender",
+    "GenderPresentation",
+    "GetVoicesResponse",
     "IdSpecifier",
     "LocalizeDialect",
     "LocalizeEnglishDialect",
-    "LocalizePortugueseDialect",
-    "LocalizeSpanishDialect",
     "LocalizeTargetLanguage",
     "LocalizeVoiceRequest",
     "MixVoiceSpecifier",
     "MixVoicesRequest",
     "UpdateVoiceRequest",
     "Voice",
+    "VoiceExpandOptions",
     "VoiceId",
     "VoiceMetadata",
     "Weight",

cartesia/voices/types/create_voice_request.py
CHANGED
@@ -5,7 +5,6 @@ import pydantic
 from ...embedding.types.embedding import Embedding
 import typing
 from ...tts.types.supported_language import SupportedLanguage
-from .base_voice_id import BaseVoiceId
 from ...core.pydantic_utilities import IS_PYDANTIC_V2


@@ -22,7 +21,6 @@ class CreateVoiceRequest(UniversalBaseModel):

     embedding: Embedding
     language: typing.Optional[SupportedLanguage] = None
-    base_voice_id: typing.Optional[BaseVoiceId] = None

     if IS_PYDANTIC_V2:
         model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True)  # type: ignore # Pydantic v2

cartesia/voices/types/get_voices_response.py
ADDED
@@ -0,0 +1,34 @@
+# This file was auto-generated by Fern from our API Definition.
+
+from ...core.pydantic_utilities import UniversalBaseModel
+import typing
+from .voice import Voice
+import pydantic
+from .voice_id import VoiceId
+from ...core.pydantic_utilities import IS_PYDANTIC_V2
+
+
+class GetVoicesResponse(UniversalBaseModel):
+    data: typing.List[Voice] = pydantic.Field()
+    """
+    The paginated list of Voices.
+    """
+
+    has_more: bool = pydantic.Field()
+    """
+    Whether there are more Voices to fetch (using `starting_after=id`, where id is the ID of the last Voice in the current response).
+    """
+
+    next_page: typing.Optional[VoiceId] = pydantic.Field(default=None)
+    """
+    (Deprecated - use the id of the last Voice in the current response instead.) An ID that can be passed as `starting_after` to get the next page of Voices.
+    """
+
+    if IS_PYDANTIC_V2:
+        model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True)  # type: ignore # Pydantic v2
+    else:
+
+        class Config:
+            frozen = True
+            smart_union = True
+            extra = pydantic.Extra.allow
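
The docstrings spell out the cursor semantics, including the deprecation of `next_page`. A small helper sketch that follows that guidance for a response produced by the voices client:

```python
import typing
from cartesia.voices.types import GetVoicesResponse, VoiceId

def next_cursor(resp: GetVoicesResponse) -> typing.Optional[VoiceId]:
    """Return the starting_after cursor for the next page, or None when done."""
    if not resp.has_more or not resp.data:
        return None
    # Per the docstrings, prefer the last Voice's id over the deprecated next_page.
    return resp.data[-1].id
```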

cartesia/voices/types/localize_dialect.py
CHANGED
@@ -2,7 +2,5 @@

 import typing
 from .localize_english_dialect import LocalizeEnglishDialect
-from .localize_spanish_dialect import LocalizeSpanishDialect
-from .localize_portuguese_dialect import LocalizePortugueseDialect

-LocalizeDialect = typing.Union[LocalizeEnglishDialect, LocalizeSpanishDialect, LocalizePortugueseDialect]
+LocalizeDialect = typing.Union[LocalizeEnglishDialect]
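
Since `typing.Union` with a single member collapses to that member, `LocalizeDialect` (and `LocalizeDialectParams` above) is now effectively an alias for `LocalizeEnglishDialect`:

```python
from cartesia.voices.types import LocalizeDialect, LocalizeEnglishDialect

# typing.Union[X] with a single member normalizes to X itself, so the two
# aliases compare equal after this change.
assert LocalizeDialect == LocalizeEnglishDialect
```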

cartesia/voices/types/voice.py
CHANGED
@@ -2,9 +2,9 @@

 from ...core.pydantic_utilities import UniversalBaseModel
 from .voice_id import VoiceId
-import typing
 import pydantic
 import datetime as dt
+import typing
 from ...embedding.types.embedding import Embedding
 from ...tts.types.supported_language import SupportedLanguage
 from ...core.pydantic_utilities import IS_PYDANTIC_V2
@@ -12,14 +12,9 @@ from ...core.pydantic_utilities import IS_PYDANTIC_V2

 class Voice(UniversalBaseModel):
     id: VoiceId
-
-    """
-    The ID of the user who owns the voice.
+    is_owner: bool = pydantic.Field()
     """
-
-    is_public: bool = pydantic.Field()
-    """
-    Whether the voice is publicly accessible.
+    Whether the current user is the owner of the voice.
     """

     name: str = pydantic.Field()
@@ -37,7 +32,16 @@ class Voice(UniversalBaseModel):
     The date and time the voice was created.
     """

-    embedding: Embedding
+    embedding: typing.Optional[Embedding] = pydantic.Field(default=None)
+    """
+    The vector embedding of the voice. Only included when `expand` includes `embedding`.
+    """
+
+    is_starred: typing.Optional[bool] = pydantic.Field(default=None)
+    """
+    Whether the current user has starred the voice. Only included when `expand` includes `is_starred`.
+    """
+
     language: SupportedLanguage

     if IS_PYDANTIC_V2:
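
With `embedding` and `is_starred` optional and populated only on request, callers should ask for them explicitly and guard for `None`. A minimal sketch, assuming `voices.list()` returns the `GetVoicesResponse` above and forwards an `expand` keyword matching the new `VoiceExpandOptions` type (the keyword name is an assumption, not a confirmed signature):

```python
import os
from cartesia import Cartesia

client = Cartesia(api_key=os.getenv("CARTESIA_API_KEY"))

# Assumed: list() accepts an `expand` argument mirroring VoiceExpandOptions
# ("embedding", "is_starred"); without it these fields deserialize as None.
voices = client.voices.list(expand=["embedding", "is_starred"])
for voice in voices.data:
    if voice.embedding is not None:
        print(voice.name, "embedding dimensions:", len(voice.embedding))
    if voice.is_starred:
        print(voice.name, "is starred")
```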

{cartesia-2.0.0b1.dist-info → cartesia-2.0.0b7.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: cartesia
-Version: 2.0.0b1
+Version: 2.0.0b7
 Summary:
 Requires-Python: >=3.8,<4.0
 Classifier: Intended Audience :: Developers
@@ -57,7 +57,7 @@ A full reference for this library is available [here](./reference.md).
 from cartesia import Cartesia
 import os

-client = Cartesia(api_key=os.
+client = Cartesia(api_key=os.getenv("CARTESIA_API_KEY"))

 # Get all available voices
 voices = client.voices.list()

@@ -65,21 +65,32 @@ print(voices)

 # Get a specific voice
 voice = client.voices.get(id="a0e99841-438c-4a64-b679-ae501e7d6091")
-print("The embedding for", voice
+print("The embedding for", voice.name, "is", voice.embedding)

-# Clone a voice using
-
+# Clone a voice using file data
+cloned_voice = client.voices.clone(
+    clip=open("path/to/voice.wav", "rb"),
+    name="Test cloned voice",
+    language="en",
+    mode="similarity",  # or "stability"
+    enhance=False,  # use enhance=True to clean and denoise the cloning audio
+    description="Test voice description"
+)

 # Mix voices together
-
-[
+mixed_voice = client.voices.mix(
+    voices=[
+        {"id": "voice_id_1", "weight": 0.25},
+        {"id": "voice_id_2", "weight": 0.75}
+    ]
 )

-# Create a new voice
+# Create a new voice from embedding
 new_voice = client.voices.create(
-    name="
-    description="
-    embedding=
+    name="Test Voice",
+    description="Test voice description",
+    embedding=[...],  # List[float] with 192 dimensions
+    language="en"
 )
 ```


@@ -90,15 +101,22 @@ Instantiate and use the client with the following:
 ```python
 from cartesia import Cartesia
 from cartesia.tts import OutputFormat_Raw, TtsRequestIdSpecifier
+import os

 client = Cartesia(
-    api_key="
+    api_key=os.getenv("CARTESIA_API_KEY"),
 )
 client.tts.bytes(
-    model_id="sonic-
+    model_id="sonic-2",
     transcript="Hello, world!",
-    voice={
-
+    voice={
+        "mode": "id",
+        "id": "694f9389-aac1-45b6-b726-9d9369183238",
+        "experimental_controls": {
+            "speed": 0.5,  # range between [-1.0, 1.0], or "slow", "fastest", etc.
+            "emotion": ["positivity", "curiosity:low"]  # list of emotions with optional intensity
+        }
+    },
     language="en",
     output_format={
         "container": "raw",

@@ -114,18 +132,18 @@ The SDK also exports an `async` client so that you can make non-blocking calls t

 ```python
 import asyncio
+import os

 from cartesia import AsyncCartesia
 from cartesia.tts import OutputFormat_Raw, TtsRequestIdSpecifier

 client = AsyncCartesia(
-    api_key="
+    api_key=os.getenv("CARTESIA_API_KEY"),
 )

-
 async def main() -> None:
-
-    model_id="sonic-
+    async for output in client.tts.bytes(
+        model_id="sonic-2",
         transcript="Hello, world!",
         voice={"id": "694f9389-aac1-45b6-b726-9d9369183238"},
         language="en",
@@ -134,7 +152,8 @@ async def main() -> None:
             "sample_rate": 44100,
             "encoding": "pcm_f32le",
         },
-)
+    ):
+        print(f"Received chunk of size: {len(output)}")


 asyncio.run(main())

@@ -162,26 +181,38 @@ The SDK supports streaming responses, as well, the response will be a generator
 ```python
 from cartesia import Cartesia
 from cartesia.tts import Controls, OutputFormat_RawParams, TtsRequestIdSpecifierParams
+import os

-
-
-)
-
-
-
-
-
-
-
-
+def get_tts_chunks():
+    client = Cartesia(
+        api_key=os.getenv("CARTESIA_API_KEY"),
+    )
+    response = client.tts.sse(
+        model_id="sonic-2",
+        transcript="Hello world!",
+        voice={
+            "id": "f9836c6e-a0bd-460e-9d3c-f7299fa60f94",
+            "experimental_controls": {
+                "speed": "normal",
+                "emotion": [],
+            },
         },
-
-
-
-
-
-
-
+        language="en",
+        output_format={
+            "container": "raw",
+            "encoding": "pcm_f32le",
+            "sample_rate": 44100,
+        },
+    )
+
+    audio_chunks = []
+    for chunk in response:
+        audio_chunks.append(chunk)
+    return audio_chunks
+
+chunks = get_tts_chunks()
+for chunk in chunks:
+    print(f"Received chunk of size: {len(chunk.data)}")
 ```

 ## WebSocket

@@ -190,16 +221,16 @@
 from cartesia import Cartesia
 from cartesia.tts import TtsRequestEmbeddingSpecifierParams, OutputFormat_RawParams
 import pyaudio
+import os

 client = Cartesia(
-    api_key="
+    api_key=os.getenv("CARTESIA_API_KEY"),
 )
 voice_id = "a0e99841-438c-4a64-b679-ae501e7d6091"
-voice = client.voices.get(id=voice_id)
 transcript = "Hello! Welcome to Cartesia"

 # You can check out our models at https://docs.cartesia.ai/getting-started/available-models
-model_id = "sonic-
+model_id = "sonic-2"

 p = pyaudio.PyAudio()
 rate = 22050

@@ -213,11 +244,11 @@ ws = client.tts.websocket()
 for output in ws.send(
     model_id=model_id,
     transcript=transcript,
-    voice={"
+    voice={"id": voice_id},
     stream=True,
     output_format={
         "container": "raw",
-        "encoding": "pcm_f32le",
+        "encoding": "pcm_f32le",
         "sample_rate": 22050
     },
 ):

@@ -236,6 +267,55 @@ p.terminate()
 ws.close() # Close the websocket connection
 ```

+## Requesting Timestamps
+
+```python
+import asyncio
+from cartesia import AsyncCartesia
+import os
+
+async def main():
+    client = AsyncCartesia(api_key=os.getenv("CARTESIA_API_KEY"))
+
+    # Connect to the websocket
+    ws = await client.tts.websocket()
+
+    # Generate speech with timestamps
+    output_generate = await ws.send(
+        model_id="sonic-2",
+        transcript="Hello! Welcome to Cartesia's text-to-speech.",
+        voice={"id": "f9836c6e-a0bd-460e-9d3c-f7299fa60f94"},
+        output_format={
+            "container": "raw",
+            "encoding": "pcm_f32le",
+            "sample_rate": 44100
+        },
+        add_timestamps=True,  # Enable word-level timestamps
+        stream=True
+    )
+
+    # Process the streaming response with timestamps
+    all_words = []
+    all_starts = []
+    all_ends = []
+    audio_chunks = []
+
+    async for out in output_generate:
+        # Collect audio data
+        if out.audio is not None:
+            audio_chunks.append(out.audio)
+
+        # Process timestamp data
+        if out.word_timestamps is not None:
+            all_words.extend(out.word_timestamps.words)  # List of words
+            all_starts.extend(out.word_timestamps.start)  # Start time for each word (seconds)
+            all_ends.extend(out.word_timestamps.end)  # End time for each word (seconds)
+
+    await ws.close()
+
+asyncio.run(main())
+```
+
 ## Advanced

 ### Retries
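
The timestamps example accumulates three parallel lists (`all_words`, `all_starts`, `all_ends`); zipping them recovers per-word timing:

```python
# Continuing the example above: pair each word with its start/end times (seconds).
for word, start, end in zip(all_words, all_starts, all_ends):
    print(f"{word}: {start:.2f}s -> {end:.2f}s")
```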

@@ -297,11 +377,33 @@ client = Cartesia(

 ## Contributing

-
-
-
-
-
+Note that most of this library is generated programmatically from
+<https://github.com/cartesia-ai/docs> — before making edits to a file, verify it's not autogenerated
+by checking for this comment at the top of the file:
+
+```
+# This file was auto-generated by Fern from our API Definition.
+```
+
+### Running tests
+
+```sh
+uv pip install -r requirements.txt
+uv run pytest -rP -vv tests/custom/test_client.py::test_get_voices
+```
+### Manually generating SDK code from docs
+
+Assuming all your repos are cloned into your home directory:
+
+```sh
+$ cd ~/docs
+$ fern generate --group python-sdk --log-level debug --api version-2024-11-13 --preview
+$ cd ~/cartesia-python
+$ git pull ~/docs/fern/apis/version-2024-11-13/.preview/fern-python-sdk
+$ git commit --amend -m "manually regenerate from docs" # optional
+```
+
+### Automatically generating new SDK releases

-
+From https://github.com/cartesia-ai/docs click `Actions` then `Release Python SDK`. (Requires permissions.)
