cartesia 2.0.11__py3-none-any.whl → 2.0.14__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of cartesia might be problematic. Click here for more details.
- cartesia/__init__.py +6 -0
- cartesia/core/client_wrapper.py +1 -1
- cartesia/tts/__init__.py +6 -0
- cartesia/tts/_async_websocket.py +11 -4
- cartesia/tts/_websocket.py +10 -4
- cartesia/tts/client.py +25 -0
- cartesia/tts/requests/__init__.py +2 -0
- cartesia/tts/requests/controls.py +2 -2
- cartesia/tts/requests/generation_config.py +26 -0
- cartesia/tts/requests/generation_request.py +2 -0
- cartesia/tts/requests/tts_request.py +2 -0
- cartesia/tts/requests/ttssse_request.py +2 -0
- cartesia/tts/requests/web_socket_tts_request.py +2 -0
- cartesia/tts/types/__init__.py +4 -0
- cartesia/tts/types/controls.py +2 -2
- cartesia/tts/types/emotion.py +1 -32
- cartesia/tts/types/emotion_deprecated.py +34 -0
- cartesia/tts/types/generation_config.py +37 -0
- cartesia/tts/types/generation_request.py +2 -0
- cartesia/tts/types/tts_request.py +2 -0
- cartesia/tts/types/ttssse_request.py +2 -0
- cartesia/tts/types/web_socket_tts_request.py +2 -0
- {cartesia-2.0.11.dist-info → cartesia-2.0.14.dist-info}/METADATA +81 -72
- {cartesia-2.0.11.dist-info → cartesia-2.0.14.dist-info}/RECORD +26 -23
- {cartesia-2.0.11.dist-info → cartesia-2.0.14.dist-info}/LICENSE +0 -0
- {cartesia-2.0.11.dist-info → cartesia-2.0.14.dist-info}/WHEEL +0 -0
cartesia/__init__.py
CHANGED
|
@@ -52,7 +52,10 @@ from .tts import (
|
|
|
52
52
|
Controls,
|
|
53
53
|
ControlsParams,
|
|
54
54
|
Emotion,
|
|
55
|
+
EmotionDeprecated,
|
|
55
56
|
FlushId,
|
|
57
|
+
GenerationConfig,
|
|
58
|
+
GenerationConfigParams,
|
|
56
59
|
GenerationRequest,
|
|
57
60
|
GenerationRequestParams,
|
|
58
61
|
ModelSpeed,
|
|
@@ -211,6 +214,7 @@ __all__ = [
|
|
|
211
214
|
"EmbeddingSpecifier",
|
|
212
215
|
"EmbeddingSpecifierParams",
|
|
213
216
|
"Emotion",
|
|
217
|
+
"EmotionDeprecated",
|
|
214
218
|
"ErrorMessage",
|
|
215
219
|
"ErrorMessageParams",
|
|
216
220
|
"FilePurpose",
|
|
@@ -219,6 +223,8 @@ __all__ = [
|
|
|
219
223
|
"FlushId",
|
|
220
224
|
"Gender",
|
|
221
225
|
"GenderPresentation",
|
|
226
|
+
"GenerationConfig",
|
|
227
|
+
"GenerationConfigParams",
|
|
222
228
|
"GenerationRequest",
|
|
223
229
|
"GenerationRequestParams",
|
|
224
230
|
"GetVoicesResponse",
|
cartesia/core/client_wrapper.py
CHANGED
|
@@ -16,7 +16,7 @@ class BaseClientWrapper:
|
|
|
16
16
|
headers: typing.Dict[str, str] = {
|
|
17
17
|
"X-Fern-Language": "Python",
|
|
18
18
|
"X-Fern-SDK-Name": "cartesia",
|
|
19
|
-
"X-Fern-SDK-Version": "2.0.11",
|
|
19
|
+
"X-Fern-SDK-Version": "2.0.14",
|
|
20
20
|
}
|
|
21
21
|
headers["X-API-Key"] = self.api_key
|
|
22
22
|
headers["Cartesia-Version"] = "2024-11-13"
|
cartesia/tts/__init__.py
CHANGED
|
@@ -5,7 +5,9 @@ from .types import (
|
|
|
5
5
|
ContextId,
|
|
6
6
|
Controls,
|
|
7
7
|
Emotion,
|
|
8
|
+
EmotionDeprecated,
|
|
8
9
|
FlushId,
|
|
10
|
+
GenerationConfig,
|
|
9
11
|
GenerationRequest,
|
|
10
12
|
ModelSpeed,
|
|
11
13
|
Mp3OutputFormat,
|
|
@@ -51,6 +53,7 @@ from .types import (
|
|
|
51
53
|
from .requests import (
|
|
52
54
|
CancelContextRequestParams,
|
|
53
55
|
ControlsParams,
|
|
56
|
+
GenerationConfigParams,
|
|
54
57
|
GenerationRequestParams,
|
|
55
58
|
Mp3OutputFormatParams,
|
|
56
59
|
OutputFormatParams,
|
|
@@ -96,7 +99,10 @@ __all__ = [
|
|
|
96
99
|
"Controls",
|
|
97
100
|
"ControlsParams",
|
|
98
101
|
"Emotion",
|
|
102
|
+
"EmotionDeprecated",
|
|
99
103
|
"FlushId",
|
|
104
|
+
"GenerationConfig",
|
|
105
|
+
"GenerationConfigParams",
|
|
100
106
|
"GenerationRequest",
|
|
101
107
|
"GenerationRequestParams",
|
|
102
108
|
"ModelSpeed",
|
cartesia/tts/_async_websocket.py
CHANGED
|
@@ -8,7 +8,7 @@ from typing import Any, AsyncGenerator, Callable, Dict, List, Optional, Union
|
|
|
8
8
|
|
|
9
9
|
import aiohttp
|
|
10
10
|
|
|
11
|
-
from cartesia.tts.requests import TtsRequestVoiceSpecifierParams
|
|
11
|
+
from cartesia.tts.requests import GenerationConfigParams, TtsRequestVoiceSpecifierParams
|
|
12
12
|
from cartesia.tts.requests.output_format import OutputFormatParams
|
|
13
13
|
from cartesia.tts.types import (
|
|
14
14
|
WebSocketResponse,
|
|
@@ -61,6 +61,7 @@ class _AsyncTTSContext:
|
|
|
61
61
|
model_id: str,
|
|
62
62
|
transcript: str,
|
|
63
63
|
output_format: OutputFormatParams,
|
|
64
|
+
generation_config: Optional[GenerationConfigParams] = None,
|
|
64
65
|
voice: TtsRequestVoiceSpecifierParams,
|
|
65
66
|
context_id: Optional[str] = None,
|
|
66
67
|
duration: Optional[int] = None,
|
|
@@ -116,6 +117,12 @@ class _AsyncTTSContext:
|
|
|
116
117
|
if flush:
|
|
117
118
|
request_body["flush"] = flush
|
|
118
119
|
|
|
120
|
+
if generation_config is not None:
|
|
121
|
+
if isinstance(generation_config, dict):
|
|
122
|
+
request_body["generation_config"] = generation_config
|
|
123
|
+
else:
|
|
124
|
+
request_body["generation_config"] = generation_config.dict()
|
|
125
|
+
|
|
119
126
|
if (
|
|
120
127
|
"context_id" in request_body
|
|
121
128
|
and request_body["context_id"] is not None
|
|
@@ -315,10 +322,10 @@ class AsyncTtsWebsocket(TtsWebsocket):
|
|
|
315
322
|
# Extract status code if available
|
|
316
323
|
status_code = None
|
|
317
324
|
error_message = str(e)
|
|
318
|
-
|
|
325
|
+
|
|
319
326
|
if hasattr(e, 'status') and e.status is not None:
|
|
320
327
|
status_code = e.status
|
|
321
|
-
|
|
328
|
+
|
|
322
329
|
# Create a meaningful error message based on status code
|
|
323
330
|
if status_code == 402:
|
|
324
331
|
error_message = "Payment required. Your API key may have insufficient credits or permissions."
|
|
@@ -328,7 +335,7 @@ class AsyncTtsWebsocket(TtsWebsocket):
|
|
|
328
335
|
error_message = "Forbidden. You don't have permission to access this resource."
|
|
329
336
|
elif status_code == 404:
|
|
330
337
|
error_message = "Not found. The requested resource doesn't exist."
|
|
331
|
-
|
|
338
|
+
|
|
332
339
|
raise RuntimeError(f"Failed to connect to WebSocket.\nStatus: {status_code}. Error message: {error_message}")
|
|
333
340
|
else:
|
|
334
341
|
raise RuntimeError(f"Failed to connect to WebSocket at {url}. {e}")
|
cartesia/tts/_websocket.py
CHANGED
|
@@ -14,7 +14,7 @@ except ImportError:
|
|
|
14
14
|
|
|
15
15
|
from iterators import TimeoutIterator # type: ignore
|
|
16
16
|
|
|
17
|
-
from cartesia.tts.requests import TtsRequestVoiceSpecifierParams
|
|
17
|
+
from cartesia.tts.requests import GenerationConfigParams, TtsRequestVoiceSpecifierParams
|
|
18
18
|
from cartesia.tts.requests.output_format import OutputFormatParams
|
|
19
19
|
from cartesia.tts.types import (
|
|
20
20
|
WebSocketResponse,
|
|
@@ -60,6 +60,7 @@ class _TTSContext:
|
|
|
60
60
|
model_id: str,
|
|
61
61
|
transcript: typing.Generator[str, None, None],
|
|
62
62
|
output_format: OutputFormatParams,
|
|
63
|
+
generation_config: Optional[GenerationConfigParams] = None,
|
|
63
64
|
voice: TtsRequestVoiceSpecifierParams,
|
|
64
65
|
context_id: Optional[str] = None,
|
|
65
66
|
max_buffer_delay_ms: Optional[int] = None,
|
|
@@ -111,6 +112,11 @@ class _TTSContext:
|
|
|
111
112
|
if max_buffer_delay_ms:
|
|
112
113
|
request_body["max_buffer_delay_ms"] = max_buffer_delay_ms
|
|
113
114
|
|
|
115
|
+
if generation_config is not None:
|
|
116
|
+
if isinstance(generation_config, dict):
|
|
117
|
+
request_body["generation_config"] = generation_config
|
|
118
|
+
else:
|
|
119
|
+
request_body["generation_config"] = generation_config.dict()
|
|
114
120
|
if (
|
|
115
121
|
"context_id" in request_body
|
|
116
122
|
and request_body["context_id"] is not None
|
|
@@ -293,10 +299,10 @@ class TtsWebsocket:
|
|
|
293
299
|
# Extract status code if available
|
|
294
300
|
status_code = None
|
|
295
301
|
error_message = str(e)
|
|
296
|
-
|
|
302
|
+
|
|
297
303
|
if hasattr(e, 'status') and e.status is not None:
|
|
298
304
|
status_code = e.status
|
|
299
|
-
|
|
305
|
+
|
|
300
306
|
# Create a meaningful error message based on status code
|
|
301
307
|
if status_code == 402:
|
|
302
308
|
error_message = "Payment required. Your API key may have insufficient credits or permissions."
|
|
@@ -306,7 +312,7 @@ class TtsWebsocket:
|
|
|
306
312
|
error_message = "Forbidden. You don't have permission to access this resource."
|
|
307
313
|
elif status_code == 404:
|
|
308
314
|
error_message = "Not found. The requested resource doesn't exist."
|
|
309
|
-
|
|
315
|
+
|
|
310
316
|
raise RuntimeError(f"Failed to connect to WebSocket.\nStatus: {status_code}. Error message: {error_message}")
|
|
311
317
|
else:
|
|
312
318
|
raise RuntimeError(f"Failed to connect to WebSocket. {e}")
|
cartesia/tts/client.py
CHANGED
|
@@ -5,6 +5,7 @@ from ..core.client_wrapper import SyncClientWrapper
|
|
|
5
5
|
from .requests.tts_request_voice_specifier import TtsRequestVoiceSpecifierParams
|
|
6
6
|
from .requests.output_format import OutputFormatParams
|
|
7
7
|
from .types.supported_language import SupportedLanguage
|
|
8
|
+
from .requests.generation_config import GenerationConfigParams
|
|
8
9
|
from .types.model_speed import ModelSpeed
|
|
9
10
|
from ..core.request_options import RequestOptions
|
|
10
11
|
from ..core.serialization import convert_and_respect_annotation_metadata
|
|
@@ -34,6 +35,7 @@ class TtsClient:
|
|
|
34
35
|
voice: TtsRequestVoiceSpecifierParams,
|
|
35
36
|
output_format: OutputFormatParams,
|
|
36
37
|
language: typing.Optional[SupportedLanguage] = OMIT,
|
|
38
|
+
generation_config: typing.Optional[GenerationConfigParams] = OMIT,
|
|
37
39
|
duration: typing.Optional[float] = OMIT,
|
|
38
40
|
speed: typing.Optional[ModelSpeed] = OMIT,
|
|
39
41
|
request_options: typing.Optional[RequestOptions] = None,
|
|
@@ -52,6 +54,8 @@ class TtsClient:
|
|
|
52
54
|
|
|
53
55
|
language : typing.Optional[SupportedLanguage]
|
|
54
56
|
|
|
57
|
+
generation_config : typing.Optional[GenerationConfigParams]
|
|
58
|
+
|
|
55
59
|
duration : typing.Optional[float]
|
|
56
60
|
The maximum duration of the audio in seconds. You do not usually need to specify this.
|
|
57
61
|
If the duration is not appropriate for the length of the transcript, the output audio may be truncated.
|
|
@@ -97,6 +101,9 @@ class TtsClient:
|
|
|
97
101
|
"output_format": convert_and_respect_annotation_metadata(
|
|
98
102
|
object_=output_format, annotation=OutputFormatParams, direction="write"
|
|
99
103
|
),
|
|
104
|
+
"generation_config": convert_and_respect_annotation_metadata(
|
|
105
|
+
object_=generation_config, annotation=GenerationConfigParams, direction="write"
|
|
106
|
+
),
|
|
100
107
|
"duration": duration,
|
|
101
108
|
"speed": speed,
|
|
102
109
|
},
|
|
@@ -123,6 +130,7 @@ class TtsClient:
|
|
|
123
130
|
voice: TtsRequestVoiceSpecifierParams,
|
|
124
131
|
output_format: SseOutputFormatParams,
|
|
125
132
|
language: typing.Optional[SupportedLanguage] = OMIT,
|
|
133
|
+
generation_config: typing.Optional[GenerationConfigParams] = OMIT,
|
|
126
134
|
duration: typing.Optional[float] = OMIT,
|
|
127
135
|
speed: typing.Optional[ModelSpeed] = OMIT,
|
|
128
136
|
add_timestamps: typing.Optional[bool] = OMIT,
|
|
@@ -145,6 +153,8 @@ class TtsClient:
|
|
|
145
153
|
|
|
146
154
|
language : typing.Optional[SupportedLanguage]
|
|
147
155
|
|
|
156
|
+
generation_config : typing.Optional[GenerationConfigParams]
|
|
157
|
+
|
|
148
158
|
duration : typing.Optional[float]
|
|
149
159
|
The maximum duration of the audio in seconds. You do not usually need to specify this.
|
|
150
160
|
If the duration is not appropriate for the length of the transcript, the output audio may be truncated.
|
|
@@ -204,6 +214,9 @@ class TtsClient:
|
|
|
204
214
|
"output_format": convert_and_respect_annotation_metadata(
|
|
205
215
|
object_=output_format, annotation=SseOutputFormatParams, direction="write"
|
|
206
216
|
),
|
|
217
|
+
"generation_config": convert_and_respect_annotation_metadata(
|
|
218
|
+
object_=generation_config, annotation=GenerationConfigParams, direction="write"
|
|
219
|
+
),
|
|
207
220
|
"duration": duration,
|
|
208
221
|
"speed": speed,
|
|
209
222
|
"add_timestamps": add_timestamps,
|
|
@@ -248,6 +261,7 @@ class AsyncTtsClient:
|
|
|
248
261
|
voice: TtsRequestVoiceSpecifierParams,
|
|
249
262
|
output_format: OutputFormatParams,
|
|
250
263
|
language: typing.Optional[SupportedLanguage] = OMIT,
|
|
264
|
+
generation_config: typing.Optional[GenerationConfigParams] = OMIT,
|
|
251
265
|
duration: typing.Optional[float] = OMIT,
|
|
252
266
|
speed: typing.Optional[ModelSpeed] = OMIT,
|
|
253
267
|
request_options: typing.Optional[RequestOptions] = None,
|
|
@@ -266,6 +280,8 @@ class AsyncTtsClient:
|
|
|
266
280
|
|
|
267
281
|
language : typing.Optional[SupportedLanguage]
|
|
268
282
|
|
|
283
|
+
generation_config : typing.Optional[GenerationConfigParams]
|
|
284
|
+
|
|
269
285
|
duration : typing.Optional[float]
|
|
270
286
|
The maximum duration of the audio in seconds. You do not usually need to specify this.
|
|
271
287
|
If the duration is not appropriate for the length of the transcript, the output audio may be truncated.
|
|
@@ -319,6 +335,9 @@ class AsyncTtsClient:
|
|
|
319
335
|
"output_format": convert_and_respect_annotation_metadata(
|
|
320
336
|
object_=output_format, annotation=OutputFormatParams, direction="write"
|
|
321
337
|
),
|
|
338
|
+
"generation_config": convert_and_respect_annotation_metadata(
|
|
339
|
+
object_=generation_config, annotation=GenerationConfigParams, direction="write"
|
|
340
|
+
),
|
|
322
341
|
"duration": duration,
|
|
323
342
|
"speed": speed,
|
|
324
343
|
},
|
|
@@ -345,6 +364,7 @@ class AsyncTtsClient:
|
|
|
345
364
|
voice: TtsRequestVoiceSpecifierParams,
|
|
346
365
|
output_format: SseOutputFormatParams,
|
|
347
366
|
language: typing.Optional[SupportedLanguage] = OMIT,
|
|
367
|
+
generation_config: typing.Optional[GenerationConfigParams] = OMIT,
|
|
348
368
|
duration: typing.Optional[float] = OMIT,
|
|
349
369
|
speed: typing.Optional[ModelSpeed] = OMIT,
|
|
350
370
|
add_timestamps: typing.Optional[bool] = OMIT,
|
|
@@ -367,6 +387,8 @@ class AsyncTtsClient:
|
|
|
367
387
|
|
|
368
388
|
language : typing.Optional[SupportedLanguage]
|
|
369
389
|
|
|
390
|
+
generation_config : typing.Optional[GenerationConfigParams]
|
|
391
|
+
|
|
370
392
|
duration : typing.Optional[float]
|
|
371
393
|
The maximum duration of the audio in seconds. You do not usually need to specify this.
|
|
372
394
|
If the duration is not appropriate for the length of the transcript, the output audio may be truncated.
|
|
@@ -434,6 +456,9 @@ class AsyncTtsClient:
|
|
|
434
456
|
"output_format": convert_and_respect_annotation_metadata(
|
|
435
457
|
object_=output_format, annotation=SseOutputFormatParams, direction="write"
|
|
436
458
|
),
|
|
459
|
+
"generation_config": convert_and_respect_annotation_metadata(
|
|
460
|
+
object_=generation_config, annotation=GenerationConfigParams, direction="write"
|
|
461
|
+
),
|
|
437
462
|
"duration": duration,
|
|
438
463
|
"speed": speed,
|
|
439
464
|
"add_timestamps": add_timestamps,
|
|
@@ -2,6 +2,7 @@
|
|
|
2
2
|
|
|
3
3
|
from .cancel_context_request import CancelContextRequestParams
|
|
4
4
|
from .controls import ControlsParams
|
|
5
|
+
from .generation_config import GenerationConfigParams
|
|
5
6
|
from .generation_request import GenerationRequestParams
|
|
6
7
|
from .mp_3_output_format import Mp3OutputFormatParams
|
|
7
8
|
from .output_format import OutputFormatParams, OutputFormat_Mp3Params, OutputFormat_RawParams, OutputFormat_WavParams
|
|
@@ -41,6 +42,7 @@ from .word_timestamps import WordTimestampsParams
|
|
|
41
42
|
__all__ = [
|
|
42
43
|
"CancelContextRequestParams",
|
|
43
44
|
"ControlsParams",
|
|
45
|
+
"GenerationConfigParams",
|
|
44
46
|
"GenerationRequestParams",
|
|
45
47
|
"Mp3OutputFormatParams",
|
|
46
48
|
"OutputFormatParams",
|
|
@@ -3,9 +3,9 @@
|
|
|
3
3
|
import typing_extensions
|
|
4
4
|
from .speed import SpeedParams
|
|
5
5
|
import typing
|
|
6
|
-
from ..types.emotion import Emotion
|
|
6
|
+
from ..types.emotion_deprecated import EmotionDeprecated
|
|
7
7
|
|
|
8
8
|
|
|
9
9
|
class ControlsParams(typing_extensions.TypedDict):
|
|
10
10
|
speed: SpeedParams
|
|
11
|
-
emotion: typing.Sequence[Emotion]
|
|
11
|
+
emotion: typing.Sequence[EmotionDeprecated]
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
# This file was auto-generated by Fern from our API Definition.
|
|
2
|
+
|
|
3
|
+
import typing_extensions
|
|
4
|
+
import typing_extensions
|
|
5
|
+
from ..types.emotion import Emotion
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class GenerationConfigParams(typing_extensions.TypedDict):
|
|
9
|
+
"""
|
|
10
|
+
Configure the various attributes of the generated speech. These controls are only available for `sonic-3` and have no effect on earlier models.
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
volume: typing_extensions.NotRequired[float]
|
|
14
|
+
"""
|
|
15
|
+
Adjust the volume of the generated speech between 0.5x and 2.0x the original volume (default is 1.0x). Valid values are between 0.5 and 2.0 inclusive.
|
|
16
|
+
"""
|
|
17
|
+
|
|
18
|
+
speed: typing_extensions.NotRequired[float]
|
|
19
|
+
"""
|
|
20
|
+
Adjust the speed of the generated speech between 0.6x and 1.5x the original speed (default is 1.0x). Valid values are between 0.6 and 1.5 inclusive.
|
|
21
|
+
"""
|
|
22
|
+
|
|
23
|
+
emotion: typing_extensions.NotRequired[Emotion]
|
|
24
|
+
"""
|
|
25
|
+
Guide the emotion of the generated speech.
|
|
26
|
+
"""
|
|
@@ -6,6 +6,7 @@ from .tts_request_voice_specifier import TtsRequestVoiceSpecifierParams
|
|
|
6
6
|
import typing_extensions
|
|
7
7
|
from ..types.supported_language import SupportedLanguage
|
|
8
8
|
from .web_socket_raw_output_format import WebSocketRawOutputFormatParams
|
|
9
|
+
from .generation_config import GenerationConfigParams
|
|
9
10
|
from ..types.model_speed import ModelSpeed
|
|
10
11
|
from ..types.context_id import ContextId
|
|
11
12
|
from ...core.serialization import FieldMetadata
|
|
@@ -25,6 +26,7 @@ class GenerationRequestParams(typing_extensions.TypedDict):
|
|
|
25
26
|
voice: TtsRequestVoiceSpecifierParams
|
|
26
27
|
language: typing_extensions.NotRequired[SupportedLanguage]
|
|
27
28
|
output_format: WebSocketRawOutputFormatParams
|
|
29
|
+
generation_config: typing_extensions.NotRequired[GenerationConfigParams]
|
|
28
30
|
duration: typing_extensions.NotRequired[float]
|
|
29
31
|
"""
|
|
30
32
|
The maximum duration of the audio in seconds. You do not usually need to specify this.
|
|
@@ -5,6 +5,7 @@ from .tts_request_voice_specifier import TtsRequestVoiceSpecifierParams
|
|
|
5
5
|
import typing_extensions
|
|
6
6
|
from ..types.supported_language import SupportedLanguage
|
|
7
7
|
from .output_format import OutputFormatParams
|
|
8
|
+
from .generation_config import GenerationConfigParams
|
|
8
9
|
from ..types.model_speed import ModelSpeed
|
|
9
10
|
|
|
10
11
|
|
|
@@ -18,6 +19,7 @@ class TtsRequestParams(typing_extensions.TypedDict):
|
|
|
18
19
|
voice: TtsRequestVoiceSpecifierParams
|
|
19
20
|
language: typing_extensions.NotRequired[SupportedLanguage]
|
|
20
21
|
output_format: OutputFormatParams
|
|
22
|
+
generation_config: typing_extensions.NotRequired[GenerationConfigParams]
|
|
21
23
|
duration: typing_extensions.NotRequired[float]
|
|
22
24
|
"""
|
|
23
25
|
The maximum duration of the audio in seconds. You do not usually need to specify this.
|
|
@@ -5,6 +5,7 @@ from .tts_request_voice_specifier import TtsRequestVoiceSpecifierParams
|
|
|
5
5
|
import typing_extensions
|
|
6
6
|
from ..types.supported_language import SupportedLanguage
|
|
7
7
|
from .sse_output_format import SseOutputFormatParams
|
|
8
|
+
from .generation_config import GenerationConfigParams
|
|
8
9
|
from ..types.model_speed import ModelSpeed
|
|
9
10
|
from ..types.context_id import ContextId
|
|
10
11
|
|
|
@@ -19,6 +20,7 @@ class TtssseRequestParams(typing_extensions.TypedDict):
|
|
|
19
20
|
voice: TtsRequestVoiceSpecifierParams
|
|
20
21
|
language: typing_extensions.NotRequired[SupportedLanguage]
|
|
21
22
|
output_format: SseOutputFormatParams
|
|
23
|
+
generation_config: typing_extensions.NotRequired[GenerationConfigParams]
|
|
22
24
|
duration: typing_extensions.NotRequired[float]
|
|
23
25
|
"""
|
|
24
26
|
The maximum duration of the audio in seconds. You do not usually need to specify this.
|
|
@@ -3,6 +3,7 @@
|
|
|
3
3
|
import typing_extensions
|
|
4
4
|
import typing_extensions
|
|
5
5
|
from .output_format import OutputFormatParams
|
|
6
|
+
from .generation_config import GenerationConfigParams
|
|
6
7
|
from .tts_request_voice_specifier import TtsRequestVoiceSpecifierParams
|
|
7
8
|
from ...core.serialization import FieldMetadata
|
|
8
9
|
from ..types.model_speed import ModelSpeed
|
|
@@ -15,6 +16,7 @@ class WebSocketTtsRequestParams(typing_extensions.TypedDict):
|
|
|
15
16
|
"""
|
|
16
17
|
|
|
17
18
|
output_format: typing_extensions.NotRequired[OutputFormatParams]
|
|
19
|
+
generation_config: typing_extensions.NotRequired[GenerationConfigParams]
|
|
18
20
|
transcript: typing_extensions.NotRequired[str]
|
|
19
21
|
voice: TtsRequestVoiceSpecifierParams
|
|
20
22
|
duration: typing_extensions.NotRequired[int]
|
cartesia/tts/types/__init__.py
CHANGED
|
@@ -4,7 +4,9 @@ from .cancel_context_request import CancelContextRequest
|
|
|
4
4
|
from .context_id import ContextId
|
|
5
5
|
from .controls import Controls
|
|
6
6
|
from .emotion import Emotion
|
|
7
|
+
from .emotion_deprecated import EmotionDeprecated
|
|
7
8
|
from .flush_id import FlushId
|
|
9
|
+
from .generation_config import GenerationConfig
|
|
8
10
|
from .generation_request import GenerationRequest
|
|
9
11
|
from .model_speed import ModelSpeed
|
|
10
12
|
from .mp_3_output_format import Mp3OutputFormat
|
|
@@ -51,7 +53,9 @@ __all__ = [
|
|
|
51
53
|
"ContextId",
|
|
52
54
|
"Controls",
|
|
53
55
|
"Emotion",
|
|
56
|
+
"EmotionDeprecated",
|
|
54
57
|
"FlushId",
|
|
58
|
+
"GenerationConfig",
|
|
55
59
|
"GenerationRequest",
|
|
56
60
|
"ModelSpeed",
|
|
57
61
|
"Mp3OutputFormat",
|
cartesia/tts/types/controls.py
CHANGED
|
@@ -3,14 +3,14 @@
|
|
|
3
3
|
from ...core.pydantic_utilities import UniversalBaseModel
|
|
4
4
|
from .speed import Speed
|
|
5
5
|
import typing
|
|
6
|
-
from .emotion import Emotion
|
|
6
|
+
from .emotion_deprecated import EmotionDeprecated
|
|
7
7
|
from ...core.pydantic_utilities import IS_PYDANTIC_V2
|
|
8
8
|
import pydantic
|
|
9
9
|
|
|
10
10
|
|
|
11
11
|
class Controls(UniversalBaseModel):
|
|
12
12
|
speed: Speed
|
|
13
|
-
emotion: typing.List[Emotion]
|
|
13
|
+
emotion: typing.List[EmotionDeprecated]
|
|
14
14
|
|
|
15
15
|
if IS_PYDANTIC_V2:
|
|
16
16
|
model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2
|
cartesia/tts/types/emotion.py
CHANGED
|
@@ -1,34 +1,3 @@
|
|
|
1
1
|
# This file was auto-generated by Fern from our API Definition.
|
|
2
2
|
|
|
3
|
-
import typing
|
4
|
-
|
|
5
|
-
Emotion = typing.Union[
|
|
6
|
-
typing.Literal[
|
|
7
|
-
"anger:lowest",
|
|
8
|
-
"anger:low",
|
|
9
|
-
"anger",
|
|
10
|
-
"anger:high",
|
|
11
|
-
"anger:highest",
|
|
12
|
-
"positivity:lowest",
|
|
13
|
-
"positivity:low",
|
|
14
|
-
"positivity",
|
|
15
|
-
"positivity:high",
|
|
16
|
-
"positivity:highest",
|
|
17
|
-
"surprise:lowest",
|
|
18
|
-
"surprise:low",
|
|
19
|
-
"surprise",
|
|
20
|
-
"surprise:high",
|
|
21
|
-
"surprise:highest",
|
|
22
|
-
"sadness:lowest",
|
|
23
|
-
"sadness:low",
|
|
24
|
-
"sadness",
|
|
25
|
-
"sadness:high",
|
|
26
|
-
"sadness:highest",
|
|
27
|
-
"curiosity:lowest",
|
|
28
|
-
"curiosity:low",
|
|
29
|
-
"curiosity",
|
|
30
|
-
"curiosity:high",
|
|
31
|
-
"curiosity:highest",
|
|
32
|
-
],
|
|
33
|
-
typing.Any,
|
|
34
|
-
]
|
|
3
|
+
Emotion = str
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
# This file was auto-generated by Fern from our API Definition.
|
|
2
|
+
|
|
3
|
+
import typing
|
|
4
|
+
|
|
5
|
+
EmotionDeprecated = typing.Union[
|
|
6
|
+
typing.Literal[
|
|
7
|
+
"anger:lowest",
|
|
8
|
+
"anger:low",
|
|
9
|
+
"anger",
|
|
10
|
+
"anger:high",
|
|
11
|
+
"anger:highest",
|
|
12
|
+
"positivity:lowest",
|
|
13
|
+
"positivity:low",
|
|
14
|
+
"positivity",
|
|
15
|
+
"positivity:high",
|
|
16
|
+
"positivity:highest",
|
|
17
|
+
"surprise:lowest",
|
|
18
|
+
"surprise:low",
|
|
19
|
+
"surprise",
|
|
20
|
+
"surprise:high",
|
|
21
|
+
"surprise:highest",
|
|
22
|
+
"sadness:lowest",
|
|
23
|
+
"sadness:low",
|
|
24
|
+
"sadness",
|
|
25
|
+
"sadness:high",
|
|
26
|
+
"sadness:highest",
|
|
27
|
+
"curiosity:lowest",
|
|
28
|
+
"curiosity:low",
|
|
29
|
+
"curiosity",
|
|
30
|
+
"curiosity:high",
|
|
31
|
+
"curiosity:highest",
|
|
32
|
+
],
|
|
33
|
+
typing.Any,
|
|
34
|
+
]
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
# This file was auto-generated by Fern from our API Definition.
|
|
2
|
+
|
|
3
|
+
from ...core.pydantic_utilities import UniversalBaseModel
|
|
4
|
+
import typing
|
|
5
|
+
import pydantic
|
|
6
|
+
from .emotion import Emotion
|
|
7
|
+
from ...core.pydantic_utilities import IS_PYDANTIC_V2
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class GenerationConfig(UniversalBaseModel):
|
|
11
|
+
"""
|
|
12
|
+
Configure the various attributes of the generated speech. These controls are only available for `sonic-3` and have no effect on earlier models.
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
volume: typing.Optional[float] = pydantic.Field(default=None)
|
|
16
|
+
"""
|
|
17
|
+
Adjust the volume of the generated speech between 0.5x and 2.0x the original volume (default is 1.0x). Valid values are between 0.5 and 2.0 inclusive.
|
|
18
|
+
"""
|
|
19
|
+
|
|
20
|
+
speed: typing.Optional[float] = pydantic.Field(default=None)
|
|
21
|
+
"""
|
|
22
|
+
Adjust the speed of the generated speech between 0.6x and 1.5x the original speed (default is 1.0x). Valid values are between 0.6 and 1.5 inclusive.
|
|
23
|
+
"""
|
|
24
|
+
|
|
25
|
+
emotion: typing.Optional[Emotion] = pydantic.Field(default=None)
|
|
26
|
+
"""
|
|
27
|
+
Guide the emotion of the generated speech.
|
|
28
|
+
"""
|
|
29
|
+
|
|
30
|
+
if IS_PYDANTIC_V2:
|
|
31
|
+
model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2
|
|
32
|
+
else:
|
|
33
|
+
|
|
34
|
+
class Config:
|
|
35
|
+
frozen = True
|
|
36
|
+
smart_union = True
|
|
37
|
+
extra = pydantic.Extra.allow
|
|
@@ -6,6 +6,7 @@ import typing
|
|
|
6
6
|
from .tts_request_voice_specifier import TtsRequestVoiceSpecifier
|
|
7
7
|
from .supported_language import SupportedLanguage
|
|
8
8
|
from .web_socket_raw_output_format import WebSocketRawOutputFormat
|
|
9
|
+
from .generation_config import GenerationConfig
|
|
9
10
|
from .model_speed import ModelSpeed
|
|
10
11
|
from .context_id import ContextId
|
|
11
12
|
import typing_extensions
|
|
@@ -27,6 +28,7 @@ class GenerationRequest(UniversalBaseModel):
|
|
|
27
28
|
voice: TtsRequestVoiceSpecifier
|
|
28
29
|
language: typing.Optional[SupportedLanguage] = None
|
|
29
30
|
output_format: WebSocketRawOutputFormat
|
|
31
|
+
generation_config: typing.Optional[GenerationConfig] = None
|
|
30
32
|
duration: typing.Optional[float] = pydantic.Field(default=None)
|
|
31
33
|
"""
|
|
32
34
|
The maximum duration of the audio in seconds. You do not usually need to specify this.
|
|
@@ -6,6 +6,7 @@ from .tts_request_voice_specifier import TtsRequestVoiceSpecifier
|
|
|
6
6
|
import typing
|
|
7
7
|
from .supported_language import SupportedLanguage
|
|
8
8
|
from .output_format import OutputFormat
|
|
9
|
+
from .generation_config import GenerationConfig
|
|
9
10
|
from .model_speed import ModelSpeed
|
|
10
11
|
from ...core.pydantic_utilities import IS_PYDANTIC_V2
|
|
11
12
|
|
|
@@ -20,6 +21,7 @@ class TtsRequest(UniversalBaseModel):
|
|
|
20
21
|
voice: TtsRequestVoiceSpecifier
|
|
21
22
|
language: typing.Optional[SupportedLanguage] = None
|
|
22
23
|
output_format: OutputFormat
|
|
24
|
+
generation_config: typing.Optional[GenerationConfig] = None
|
|
23
25
|
duration: typing.Optional[float] = pydantic.Field(default=None)
|
|
24
26
|
"""
|
|
25
27
|
The maximum duration of the audio in seconds. You do not usually need to specify this.
|
|
@@ -6,6 +6,7 @@ from .tts_request_voice_specifier import TtsRequestVoiceSpecifier
|
|
|
6
6
|
import typing
|
|
7
7
|
from .supported_language import SupportedLanguage
|
|
8
8
|
from .sse_output_format import SseOutputFormat
|
|
9
|
+
from .generation_config import GenerationConfig
|
|
9
10
|
from .model_speed import ModelSpeed
|
|
10
11
|
from .context_id import ContextId
|
|
11
12
|
from ...core.pydantic_utilities import IS_PYDANTIC_V2
|
|
@@ -21,6 +22,7 @@ class TtssseRequest(UniversalBaseModel):
|
|
|
21
22
|
voice: TtsRequestVoiceSpecifier
|
|
22
23
|
language: typing.Optional[SupportedLanguage] = None
|
|
23
24
|
output_format: SseOutputFormat
|
|
25
|
+
generation_config: typing.Optional[GenerationConfig] = None
|
|
24
26
|
duration: typing.Optional[float] = pydantic.Field(default=None)
|
|
25
27
|
"""
|
|
26
28
|
The maximum duration of the audio in seconds. You do not usually need to specify this.
|
|
@@ -4,6 +4,7 @@ from ...core.pydantic_utilities import UniversalBaseModel
|
|
|
4
4
|
import pydantic
|
|
5
5
|
import typing
|
|
6
6
|
from .output_format import OutputFormat
|
|
7
|
+
from .generation_config import GenerationConfig
|
|
7
8
|
from .tts_request_voice_specifier import TtsRequestVoiceSpecifier
|
|
8
9
|
import typing_extensions
|
|
9
10
|
from ...core.serialization import FieldMetadata
|
|
@@ -18,6 +19,7 @@ class WebSocketTtsRequest(UniversalBaseModel):
|
|
|
18
19
|
"""
|
|
19
20
|
|
|
20
21
|
output_format: typing.Optional[OutputFormat] = None
|
|
22
|
+
generation_config: typing.Optional[GenerationConfig] = None
|
|
21
23
|
transcript: typing.Optional[str] = None
|
|
22
24
|
voice: TtsRequestVoiceSpecifier
|
|
23
25
|
duration: typing.Optional[int] = None
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: cartesia
|
|
3
|
-
Version: 2.0.11
|
|
3
|
+
Version: 2.0.14
|
|
4
4
|
Summary:
|
|
5
5
|
Requires-Python: >=3.8,<4.0
|
|
6
6
|
Classifier: Intended Audience :: Developers
|
|
@@ -53,26 +53,36 @@ Instantiate and use the client with the following:
|
|
|
53
53
|
|
|
54
54
|
```python
|
|
55
55
|
from cartesia import Cartesia
|
|
56
|
-
from cartesia.tts import OutputFormat_Raw, TtsRequestIdSpecifier
|
|
57
56
|
import os
|
|
58
57
|
|
|
59
58
|
client = Cartesia(
|
|
60
|
-
api_key=os.getenv("CARTESIA_API_KEY"),
|
|
61
|
-
)
|
|
62
|
-
client.tts.bytes(
|
|
63
|
-
model_id="sonic-2",
|
|
64
|
-
transcript="Hello, world!",
|
|
65
|
-
voice={
|
|
66
|
-
"mode": "id",
|
|
67
|
-
"id": "694f9389-aac1-45b6-b726-9d9369183238",
|
|
68
|
-
},
|
|
69
|
-
language="en",
|
|
70
|
-
output_format={
|
|
71
|
-
"container": "raw",
|
|
72
|
-
"sample_rate": 44100,
|
|
73
|
-
"encoding": "pcm_f32le",
|
|
74
|
-
},
|
|
59
|
+
api_key=os.environ["CARTESIA_API_KEY"],
|
|
75
60
|
)
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def main():
|
|
64
|
+
with open("sonic.wav", "wb") as f:
|
|
65
|
+
bytes_iter = client.tts.bytes(
|
|
66
|
+
model_id="sonic-3",
|
|
67
|
+
transcript="Hello, world!",
|
|
68
|
+
voice={
|
|
69
|
+
"mode": "id",
|
|
70
|
+
"id": "6ccbfb76-1fc6-48f7-b71d-91ac6298247b",
|
|
71
|
+
},
|
|
72
|
+
language="en",
|
|
73
|
+
output_format={
|
|
74
|
+
"container": "wav",
|
|
75
|
+
"sample_rate": 44100,
|
|
76
|
+
"encoding": "pcm_f32le",
|
|
77
|
+
},
|
|
78
|
+
)
|
|
79
|
+
|
|
80
|
+
for chunk in bytes_iter:
|
|
81
|
+
f.write(chunk)
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
if __name__ == "__main__":
|
|
85
|
+
main()
|
|
76
86
|
```
|
|
77
87
|
|
|
78
88
|
## Async Client
|
|
@@ -81,31 +91,37 @@ The SDK also exports an `async` client so that you can make non-blocking calls t
|
|
|
81
91
|
|
|
82
92
|
```python
|
|
83
93
|
import asyncio
|
|
84
|
-
import os
|
|
85
|
-
|
|
86
94
|
from cartesia import AsyncCartesia
|
|
87
|
-
|
|
95
|
+
import os
|
|
88
96
|
|
|
89
97
|
client = AsyncCartesia(
|
|
90
|
-
api_key=os.
|
|
98
|
+
api_key=os.environ["CARTESIA_API_KEY"],
|
|
91
99
|
)
|
|
92
100
|
|
|
93
|
-
async def main() -> None:
|
|
94
|
-
async for output in client.tts.bytes(
|
|
95
|
-
model_id="sonic-2",
|
|
96
|
-
transcript="Hello, world!",
|
|
97
|
-
voice={"id": "694f9389-aac1-45b6-b726-9d9369183238"},
|
|
98
|
-
language="en",
|
|
99
|
-
output_format={
|
|
100
|
-
"container": "raw",
|
|
101
|
-
"sample_rate": 44100,
|
|
102
|
-
"encoding": "pcm_f32le",
|
|
103
|
-
},
|
|
104
|
-
):
|
|
105
|
-
print(f"Received chunk of size: {len(output)}")
|
|
106
101
|
|
|
102
|
+
async def main():
|
|
103
|
+
with open("sonic.wav", "wb") as f:
|
|
104
|
+
bytes_iter = client.tts.bytes(
|
|
105
|
+
model_id="sonic-3",
|
|
106
|
+
transcript="Hello, world!",
|
|
107
|
+
voice={
|
|
108
|
+
"mode": "id",
|
|
109
|
+
"id": "6ccbfb76-1fc6-48f7-b71d-91ac6298247b",
|
|
110
|
+
},
|
|
111
|
+
language="en",
|
|
112
|
+
output_format={
|
|
113
|
+
"container": "wav",
|
|
114
|
+
"sample_rate": 44100,
|
|
115
|
+
"encoding": "pcm_f32le",
|
|
116
|
+
},
|
|
117
|
+
)
|
|
118
|
+
|
|
119
|
+
async for chunk in bytes_iter:
|
|
120
|
+
f.write(chunk)
|
|
107
121
|
|
|
108
|
-
|
|
122
|
+
|
|
123
|
+
if __name__ == "__main__":
|
|
124
|
+
asyncio.run(main())
|
|
109
125
|
```
|
|
110
126
|
|
|
111
127
|
## Exception Handling
|
|
@@ -129,7 +145,6 @@ The SDK supports streaming responses as well, returning a generator that you can
|
|
|
129
145
|
|
|
130
146
|
```python
|
|
131
147
|
from cartesia import Cartesia
|
|
132
|
-
from cartesia.tts import Controls, OutputFormat_RawParams, TtsRequestIdSpecifierParams
|
|
133
148
|
import os
|
|
134
149
|
|
|
135
150
|
def get_tts_chunks():
|
|
@@ -137,14 +152,11 @@ def get_tts_chunks():
|
|
|
137
152
|
api_key=os.getenv("CARTESIA_API_KEY"),
|
|
138
153
|
)
|
|
139
154
|
response = client.tts.sse(
|
|
140
|
-
model_id="sonic-
|
|
155
|
+
model_id="sonic-3",
|
|
141
156
|
transcript="Hello world!",
|
|
142
157
|
voice={
|
|
158
|
+
"mode": "id",
|
|
143
159
|
"id": "f9836c6e-a0bd-460e-9d3c-f7299fa60f94",
|
|
144
|
-
"experimental_controls": {
|
|
145
|
-
"speed": "normal",
|
|
146
|
-
"emotion": [],
|
|
147
|
-
},
|
|
148
160
|
},
|
|
149
161
|
language="en",
|
|
150
162
|
output_format={
|
|
@@ -188,9 +200,9 @@ ws = client.tts.websocket()
|
|
|
188
200
|
|
|
189
201
|
# Generate and stream audio using the websocket
|
|
190
202
|
for output in ws.send(
|
|
191
|
-
model_id="sonic-
|
|
203
|
+
model_id="sonic-3", # see: https://docs.cartesia.ai/build-with-cartesia/tts-models
|
|
192
204
|
transcript=transcript,
|
|
193
|
-
voice={"id": voice_id},
|
|
205
|
+
voice={"mode": "id", "id": voice_id},
|
|
194
206
|
stream=True,
|
|
195
207
|
output_format={
|
|
196
208
|
"container": "raw",
|
|
@@ -252,7 +264,7 @@ ws.send("done")
|
|
|
252
264
|
for result in ws.receive():
|
|
253
265
|
if result['type'] == 'transcript':
|
|
254
266
|
print(f"Transcription: {result['text']}")
|
|
255
|
-
|
|
267
|
+
|
|
256
268
|
# Handle word-level timestamps if available
|
|
257
269
|
if 'words' in result and result['words']:
|
|
258
270
|
print("Word-level timestamps:")
|
|
@@ -261,7 +273,7 @@ for result in ws.receive():
|
|
|
261
273
|
start = word_info['start']
|
|
262
274
|
end = word_info['end']
|
|
263
275
|
print(f" '{word}': {start:.2f}s - {end:.2f}s")
|
|
264
|
-
|
|
276
|
+
|
|
265
277
|
if result['is_final']:
|
|
266
278
|
print("Final result received")
|
|
267
279
|
elif result['type'] == 'done':
|
|
@@ -286,7 +298,7 @@ async def streaming_stt_example():
|
|
|
286
298
|
and demonstrates the new endpointing and word timestamp features.
|
|
287
299
|
"""
|
|
288
300
|
client = AsyncCartesia(api_key=os.getenv("CARTESIA_API_KEY"))
|
|
289
|
-
|
|
301
|
+
|
|
290
302
|
try:
|
|
291
303
|
# Create websocket connection with voice activity detection
|
|
292
304
|
ws = await client.stt.websocket(
|
|
@@ -297,24 +309,24 @@ async def streaming_stt_example():
|
|
|
297
309
|
min_volume=0.15, # Volume threshold for voice activity detection
|
|
298
310
|
max_silence_duration_secs=0.3, # Maximum silence duration before endpointing
|
|
299
311
|
)
|
|
300
|
-
|
|
312
|
+
|
|
301
313
|
# Simulate streaming audio data (replace with your audio source)
|
|
302
314
|
async def audio_stream():
|
|
303
315
|
"""Simulate real-time audio streaming - replace with actual audio capture"""
|
|
304
316
|
# Load audio file for simulation
|
|
305
317
|
with open("path/to/audio.wav", "rb") as f:
|
|
306
318
|
audio_data = f.read()
|
|
307
|
-
|
|
319
|
+
|
|
308
320
|
# Stream in 100ms chunks (realistic for real-time processing)
|
|
309
321
|
chunk_size = int(16000 * 0.1 * 2) # 100ms at 16kHz, 16-bit
|
|
310
|
-
|
|
322
|
+
|
|
311
323
|
for i in range(0, len(audio_data), chunk_size):
|
|
312
324
|
chunk = audio_data[i:i + chunk_size]
|
|
313
325
|
if chunk:
|
|
314
326
|
yield chunk
|
|
315
327
|
# Simulate real-time streaming delay
|
|
316
328
|
await asyncio.sleep(0.1)
|
|
317
|
-
|
|
329
|
+
|
|
318
330
|
# Send audio and receive results concurrently
|
|
319
331
|
async def send_audio():
|
|
320
332
|
"""Send audio chunks to the STT websocket"""
|
|
@@ -324,31 +336,31 @@ async def streaming_stt_example():
|
|
|
324
336
|
print(f"Sent audio chunk of {len(chunk)} bytes")
|
|
325
337
|
# Small delay to simulate realtime applications
|
|
326
338
|
await asyncio.sleep(0.02)
|
|
327
|
-
|
|
339
|
+
|
|
328
340
|
# Signal end of audio stream
|
|
329
341
|
await ws.send("finalize")
|
|
330
342
|
await ws.send("done")
|
|
331
343
|
print("Audio streaming completed")
|
|
332
|
-
|
|
344
|
+
|
|
333
345
|
except Exception as e:
|
|
334
346
|
print(f"Error sending audio: {e}")
|
|
335
|
-
|
|
347
|
+
|
|
336
348
|
async def receive_transcripts():
|
|
337
349
|
"""Receive and process transcription results with word timestamps"""
|
|
338
350
|
full_transcript = ""
|
|
339
351
|
all_word_timestamps = []
|
|
340
|
-
|
|
352
|
+
|
|
341
353
|
try:
|
|
342
354
|
async for result in ws.receive():
|
|
343
355
|
if result['type'] == 'transcript':
|
|
344
356
|
text = result['text']
|
|
345
357
|
is_final = result['is_final']
|
|
346
|
-
|
|
358
|
+
|
|
347
359
|
# Handle word-level timestamps
|
|
348
360
|
if 'words' in result and result['words']:
|
|
349
361
|
word_timestamps = result['words']
|
|
350
362
|
all_word_timestamps.extend(word_timestamps)
|
|
351
|
-
|
|
363
|
+
|
|
352
364
|
if is_final:
|
|
353
365
|
print("Word-level timestamps:")
|
|
354
366
|
for word_info in word_timestamps:
|
|
@@ -356,7 +368,7 @@ async def streaming_stt_example():
|
|
|
356
368
|
start = word_info['start']
|
|
357
369
|
end = word_info['end']
|
|
358
370
|
print(f" '{word}': {start:.2f}s - {end:.2f}s")
|
|
359
|
-
|
|
371
|
+
|
|
360
372
|
if is_final:
|
|
361
373
|
# Final result - this text won't change
|
|
362
374
|
full_transcript += text + " "
|
|
@@ -364,30 +376,30 @@ async def streaming_stt_example():
|
|
|
364
376
|
else:
|
|
365
377
|
# Partial result - may change as more audio is processed
|
|
366
378
|
print(f"PARTIAL: {text}")
|
|
367
|
-
|
|
379
|
+
|
|
368
380
|
elif result['type'] == 'done':
|
|
369
381
|
print("Transcription completed")
|
|
370
382
|
break
|
|
371
|
-
|
|
383
|
+
|
|
372
384
|
except Exception as e:
|
|
373
385
|
print(f"Error receiving transcripts: {e}")
|
|
374
|
-
|
|
386
|
+
|
|
375
387
|
return full_transcript.strip(), all_word_timestamps
|
|
376
|
-
|
|
388
|
+
|
|
377
389
|
print("Starting streaming STT...")
|
|
378
|
-
|
|
390
|
+
|
|
379
391
|
# Use asyncio.gather to run audio sending and transcript receiving concurrently
|
|
380
392
|
_, (final_transcript, word_timestamps) = await asyncio.gather(
|
|
381
393
|
send_audio(),
|
|
382
394
|
receive_transcripts()
|
|
383
395
|
)
|
|
384
|
-
|
|
396
|
+
|
|
385
397
|
print(f"\nComplete transcript: {final_transcript}")
|
|
386
398
|
print(f"Total words with timestamps: {len(word_timestamps)}")
|
|
387
|
-
|
|
399
|
+
|
|
388
400
|
# Clean up
|
|
389
401
|
await ws.close()
|
|
390
|
-
|
|
402
|
+
|
|
391
403
|
except Exception as e:
|
|
392
404
|
print(f"STT streaming error: {e}")
|
|
393
405
|
finally:
|
|
@@ -442,7 +454,7 @@ import os
|
|
|
442
454
|
|
|
443
455
|
async def transcribe_file():
|
|
444
456
|
client = AsyncCartesia(api_key=os.getenv("CARTESIA_API_KEY"))
|
|
445
|
-
|
|
457
|
+
|
|
446
458
|
with open("path/to/audio.wav", "rb") as audio_file:
|
|
447
459
|
response = await client.stt.transcribe(
|
|
448
460
|
file=audio_file,
|
|
@@ -450,14 +462,14 @@ async def transcribe_file():
|
|
|
450
462
|
language="en",
|
|
451
463
|
timestamp_granularities=["word"],
|
|
452
464
|
)
|
|
453
|
-
|
|
465
|
+
|
|
454
466
|
print(f"Transcribed text: {response.text}")
|
|
455
|
-
|
|
467
|
+
|
|
456
468
|
# Process word timestamps
|
|
457
469
|
if response.words:
|
|
458
470
|
for word_info in response.words:
|
|
459
471
|
print(f"'{word_info.word}': {word_info.start:.2f}s - {word_info.end:.2f}s")
|
|
460
|
-
|
|
472
|
+
|
|
461
473
|
await client.close()
|
|
462
474
|
|
|
463
475
|
asyncio.run(transcribe_file())
|
|
@@ -664,6 +676,3 @@ $ git commit --amend -m "manually regenerate from docs" # optional
|
|
|
664
676
|
|
|
665
677
|
From https://github.com/cartesia-ai/docs click `Actions` then `Release Python SDK`. (Requires permissions.)
|
|
666
678
|
|
|
667
|
-
|
|
668
|
-
|
|
669
|
-
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
cartesia/__init__.py,sha256=
|
|
1
|
+
cartesia/__init__.py,sha256=bANRu5PeAnbf6O7MXltmngXPJa_G-xo7mck3sZy9B_Y,10463
|
|
2
2
|
cartesia/api_status/__init__.py,sha256=_dHNLdknrBjxHtU2PvLumttJM-JTQhJQqhhAQkLqt_U,168
|
|
3
3
|
cartesia/api_status/client.py,sha256=GJ9Dq8iCn3hn8vCIqc6k1fCGEhSz0T0kaPGcdFnbMDY,3146
|
|
4
4
|
cartesia/api_status/requests/__init__.py,sha256=ilEMzEy1JEw484CuL92bX5lHGOznc62pjiDMgiZ0tKM,130
|
|
@@ -19,7 +19,7 @@ cartesia/base_client.py,sha256=igAZOMDXz2Nv58oXHa7I9UfgxVN48drqhEmfsCCQlg8,6701
|
|
|
19
19
|
cartesia/client.py,sha256=LoJjlJW2kJA-hyDt-Wu7QuKQsiTiLQfLYZjsjtewPJM,6537
|
|
20
20
|
cartesia/core/__init__.py,sha256=-t9txgeQZL_1FDw_08GEoj4ft1Cn9Dti6X0Drsadlr0,1519
|
|
21
21
|
cartesia/core/api_error.py,sha256=RE8LELok2QCjABadECTvtDp7qejA1VmINCh6TbqPwSE,426
|
|
22
|
-
cartesia/core/client_wrapper.py,sha256=
|
|
22
|
+
cartesia/core/client_wrapper.py,sha256=0KSb2fMEKPh62KA7qLmVkQjJ20sJzkJEeU8QQqwh2Sk,1855
|
|
23
23
|
cartesia/core/datetime_utils.py,sha256=nBys2IsYrhPdszxGKCNRPSOCwa-5DWOHG95FB8G9PKo,1047
|
|
24
24
|
cartesia/core/file.py,sha256=d4NNbX8XvXP32z8KpK2Xovv33nFfruIrpz0QWxlgpZk,2663
|
|
25
25
|
cartesia/core/http_client.py,sha256=KL5RGa0y4n8nX0-07WRg4ZQUTq30sc-XJbWcP5vjBDg,19552
|
|
@@ -74,25 +74,26 @@ cartesia/stt/types/timestamp_granularity.py,sha256=Oe39JvLeMgR2BIJnx32abhvs05dJe
|
|
|
74
74
|
cartesia/stt/types/transcript_message.py,sha256=J-MchlahI96nVBiMSLJrEOXFw2pBShbMXVocysQRnrY,1693
|
|
75
75
|
cartesia/stt/types/transcription_response.py,sha256=QMcD6eLmp_Z2uaRLVyxYYIdoiRiVSGhBoxN3kjRTK2I,1190
|
|
76
76
|
cartesia/stt/types/transcription_word.py,sha256=yxTndKXNmToPOM6_F_QfF-B0dE6Kx8-UwBpHLj2_zWk,803
|
|
77
|
-
cartesia/tts/__init__.py,sha256=
|
|
78
|
-
cartesia/tts/_async_websocket.py,sha256=
|
|
79
|
-
cartesia/tts/_websocket.py,sha256=
|
|
80
|
-
cartesia/tts/client.py,sha256=
|
|
81
|
-
cartesia/tts/requests/__init__.py,sha256=
|
|
77
|
+
cartesia/tts/__init__.py,sha256=KmlvJPusv7yRI3OmkEn3GlxqITbfewLVO5S0SJkqV5M,5079
|
|
78
|
+
cartesia/tts/_async_websocket.py,sha256=zcLNLWT7ndf1izMxKLkcyMBNM_nCcW5Nt14e0Z3DOEM,19754
|
|
79
|
+
cartesia/tts/_websocket.py,sha256=A9jTgP_PJhr9JMFk91ZlBQSrh9JSnoDzzwlhnuAuOTE,19546
|
|
80
|
+
cartesia/tts/client.py,sha256=mJXJG9JliWHw7UYCCd9evIW3gaSr3JYORW606E8lHzU,19607
|
|
81
|
+
cartesia/tts/requests/__init__.py,sha256=zS0ny3c6HXr2l6D9TiBmMuyp-tbVuszhBiOQ5RdcQyw,3502
|
|
82
82
|
cartesia/tts/requests/cancel_context_request.py,sha256=Wl8g-o5vwl9ENm-H1wsLx441FkIR_4Wt5UYtuWce2Yw,431
|
|
83
|
-
cartesia/tts/requests/controls.py,sha256=
|
|
84
|
-
cartesia/tts/requests/
|
|
83
|
+
cartesia/tts/requests/controls.py,sha256=TkywdstN4X9odGF_HfN25zYXcCxaJS8Q0H1HR0nv_rg,316
|
|
84
|
+
cartesia/tts/requests/generation_config.py,sha256=ZFed-oneBwyxkkI1DHmmvtYso7FjTYM01ApS1omr1ms,917
|
|
85
|
+
cartesia/tts/requests/generation_request.py,sha256=rZVpfwUzSea72b5gqPY47Fgunu_IJQM2PiVNHqCR9Jk,3214
|
|
85
86
|
cartesia/tts/requests/mp_3_output_format.py,sha256=HBM6452KdWD9tGa9QXNyUZcH1OlJrXt_PIwo2Jt3l2Q,441
|
|
86
87
|
cartesia/tts/requests/output_format.py,sha256=8TKu9AAeHCR5L4edzYch8FIYIldn4bM7ySrsCl8W_g8,842
|
|
87
88
|
cartesia/tts/requests/phoneme_timestamps.py,sha256=ft81nmqElZAnvTBT27lY6YWfF18ZGsCx3Y1XHv9J7cM,267
|
|
88
89
|
cartesia/tts/requests/raw_output_format.py,sha256=WigDQlM_YkLk_-GK1_pNseGq8g-_POO84Su7jqSLsHQ,441
|
|
89
90
|
cartesia/tts/requests/speed.py,sha256=-YGBWwh7_VtCBnYlT5EVsnrmcHFMEBTxy9LathZhkMA,259
|
|
90
91
|
cartesia/tts/requests/sse_output_format.py,sha256=dsRyxFCD3Qt3hTppxV7HJhphPx3jTkZhryMXUP-Soc8,417
|
|
91
|
-
cartesia/tts/requests/tts_request.py,sha256=
|
|
92
|
+
cartesia/tts/requests/tts_request.py,sha256=CUFMg_U2BhJQAxrqLAv4tfxAN326ItiCi0fQfJFi4lU,1152
|
|
92
93
|
cartesia/tts/requests/tts_request_embedding_specifier.py,sha256=-M54ZjV0H5LPwcKtz0bOVqlkvO1pPiMbqMbVBMko3Ns,565
|
|
93
94
|
cartesia/tts/requests/tts_request_id_specifier.py,sha256=-0ClfyJnnaH0uAcF5r84s3cM_cw2wT39dp6T4JYzOQ8,536
|
|
94
95
|
cartesia/tts/requests/tts_request_voice_specifier.py,sha256=eGzL4aVGq4gKPxeglsV7-wuhxg8x33Qth3uFTTytgeI,337
|
|
95
|
-
cartesia/tts/requests/ttssse_request.py,sha256=
|
|
96
|
+
cartesia/tts/requests/ttssse_request.py,sha256=IZ4Urm23VQBhuJmA8CqZegZnTVIBqfZWQ9ve2vy2gXc,2138
|
|
96
97
|
cartesia/tts/requests/wav_output_format.py,sha256=qiipmT5hWsa8J-fwW1EH_rnUAX_zOUpGJUNzuLc65r4,181
|
|
97
98
|
cartesia/tts/requests/web_socket_base_response.py,sha256=zCjHw-FaNJMOcHiAb2NQWrBBfrzU5rc95vqDp7y9RmA,315
|
|
98
99
|
cartesia/tts/requests/web_socket_chunk_response.py,sha256=4fVPJH-ZZb8lJKwqyYGx5wyeYWzfuThGxMRXC6ku4bA,233
|
|
@@ -106,16 +107,18 @@ cartesia/tts/requests/web_socket_response.py,sha256=kS46YN94ilUn4qjpt1TpauZApe0N
|
|
|
106
107
|
cartesia/tts/requests/web_socket_stream_options.py,sha256=VIvblFw9hGZvDzFpOnC11G0NvrFSVt-1-0sY5rpcZPI,232
|
|
107
108
|
cartesia/tts/requests/web_socket_timestamps_response.py,sha256=MK3zN2Q_PVWJtX5DidNB0uXoF2o33rv6qCYPVaourxY,351
|
|
108
109
|
cartesia/tts/requests/web_socket_tts_output.py,sha256=pX2uf0XVdziFhXCydwLlVOWb-LvBiuq-cBI6R1INiMg,760
|
|
109
|
-
cartesia/tts/requests/web_socket_tts_request.py,sha256=
|
|
110
|
+
cartesia/tts/requests/web_socket_tts_request.py,sha256=9IqZKwM8YSDoDqYNPQ6DrcRGfuaAExD0KIPC0Ptaq1U,1926
|
|
110
111
|
cartesia/tts/requests/word_timestamps.py,sha256=WMfBJtETi6wTpES0pYZCFfFRfEbzWE-RtosDJ5seUWg,261
|
|
111
112
|
cartesia/tts/socket_client.py,sha256=zTPayHbgy-yQQ50AE1HXN4GMyanisZcLXf7Ds1paYks,11621
|
|
112
|
-
cartesia/tts/types/__init__.py,sha256=
|
|
113
|
+
cartesia/tts/types/__init__.py,sha256=VsVhynuJM_G3zHAzkAtB8M6eK_tq0Pa76FOAiulbRBc,3585
|
|
113
114
|
cartesia/tts/types/cancel_context_request.py,sha256=zInhk3qRZsSc0F1aYJ-Q5BHJsosTrb22IJWhzue-eKE,856
|
|
114
115
|
cartesia/tts/types/context_id.py,sha256=UCEtq5xFGOeBCECcY6Y-gYVe_Peg1hFhH9YYOkpApQg,81
|
|
115
|
-
cartesia/tts/types/controls.py,sha256=
|
|
116
|
-
cartesia/tts/types/emotion.py,sha256=
|
|
116
|
+
cartesia/tts/types/controls.py,sha256=SxeSPZ4KgvRiUawOUI9mycASv6ekQ11vZYKOMtZz5TU,675
|
|
117
|
+
cartesia/tts/types/emotion.py,sha256=N5E5Tf7L9tHcH-MB5fDPEFusotygu85ybEc-YeslVjc,79
|
|
118
|
+
cartesia/tts/types/emotion_deprecated.py,sha256=WQuI5pXbzgpNq4kT14NMfukCJPN58GbmTtPScMMLy4I,774
|
|
117
119
|
cartesia/tts/types/flush_id.py,sha256=HCIKo9o8d7YWKtaSNU3TEvfUVBju93ckGQy01Z9wLcE,79
|
|
118
|
-
cartesia/tts/types/
|
|
120
|
+
cartesia/tts/types/generation_config.py,sha256=lIb52e8Ua777uvFnFTYn1NghxpzSTMC4QmDlV1cturU,1332
|
|
121
|
+
cartesia/tts/types/generation_request.py,sha256=qO7XKzvwIp8Foglv5_1DJL1pCZLVyea0fQ0oKJw0fGw,3694
|
|
119
122
|
cartesia/tts/types/model_speed.py,sha256=iiTj8V0piFCX2FZh5B8EkgRhZDlj4z3VFcQhp66e7y8,160
|
|
120
123
|
cartesia/tts/types/mp_3_output_format.py,sha256=LQ1-rEYjkK6XXWoj_Z7bezsguPpNI_SmprlIipsyNMI,875
|
|
121
124
|
cartesia/tts/types/natural_specifier.py,sha256=K526P1RRuBGy80hyd_tX8tohPrE8DR9EgTCxS5wce0o,188
|
|
@@ -127,11 +130,11 @@ cartesia/tts/types/raw_output_format.py,sha256=ir5QxW986P8qB14pMD5PVsAgc0bdC37i7
|
|
|
127
130
|
cartesia/tts/types/speed.py,sha256=4c5WdxocBw6WSMnundSaNnceUeooU0vikhy00FW6M-w,239
|
|
128
131
|
cartesia/tts/types/sse_output_format.py,sha256=1_GB3rftQYAsXO6WrgQmzr-tsjCntHCVgKeTjay7M9g,819
|
|
129
132
|
cartesia/tts/types/supported_language.py,sha256=riDRduThMbMWAq9i2uCfxhwVTpgaFwNDZ9LhEIl4zHY,237
|
|
130
|
-
cartesia/tts/types/tts_request.py,sha256=
|
|
133
|
+
cartesia/tts/types/tts_request.py,sha256=TkngMxyGKnjQvIG5u4qFx9TKcohuLyjI1UeXv8xbj2U,1488
|
|
131
134
|
cartesia/tts/types/tts_request_embedding_specifier.py,sha256=eL_qCEr4pvWfy4qp9hZBuVdCincX5DBVqfv1vLt2_Vk,942
|
|
132
135
|
cartesia/tts/types/tts_request_id_specifier.py,sha256=ktGdkkTRQ9scA-lt8qJ2jn_E5WzoOK8AXMrVqi71gf0,906
|
|
133
136
|
cartesia/tts/types/tts_request_voice_specifier.py,sha256=p-3UQ62uFL1SgbX73Ex1D_V73Ef0wmT1ApOt1iLZmwE,307
|
|
134
|
-
cartesia/tts/types/ttssse_request.py,sha256=
|
|
137
|
+
cartesia/tts/types/ttssse_request.py,sha256=QZa0LOwhtsxLFaTxCGA0EzMOYqp7tVu-ezmu-ibcmiA,2535
|
|
135
138
|
cartesia/tts/types/wav_output_format.py,sha256=OTAgVn_gBMk252XO12kiNI9lKrbw3n38aBAiqlG5mdU,531
|
|
136
139
|
cartesia/tts/types/web_socket_base_response.py,sha256=MWoTt1rGRqUQ8BOad1Zk2SA-i0E8a3JwPLSiehIbFj4,672
|
|
137
140
|
cartesia/tts/types/web_socket_chunk_response.py,sha256=VOPXAlyGFdnfC69KxqDWDo1PPMydvQKmAypoWfbW8_s,593
|
|
@@ -145,7 +148,7 @@ cartesia/tts/types/web_socket_response.py,sha256=fUQbJ6yFzZbzUZPuQWgkFdzP8-FMiKT
|
|
|
145
148
|
cartesia/tts/types/web_socket_stream_options.py,sha256=MhDSxBFqMuQeWjoyPqXVnTEzLjF8g6aojeigb5dQUgU,596
|
|
146
149
|
cartesia/tts/types/web_socket_timestamps_response.py,sha256=kuWXI82ncF1QapnaHEjwrL84qWob7ByQU-yh1e0IEmk,667
|
|
147
150
|
cartesia/tts/types/web_socket_tts_output.py,sha256=uvkv0smTBhdm18Rl17C0Ml4Inh79YBHNzAcKnZBs14Y,979
|
|
148
|
-
cartesia/tts/types/web_socket_tts_request.py,sha256=
|
|
151
|
+
cartesia/tts/types/web_socket_tts_request.py,sha256=mBVFoOdZDlxm2cQbmPTHgQjENfM4xhm_DywlTm5OtGI,2262
|
|
149
152
|
cartesia/tts/types/word_timestamps.py,sha256=XZ2Q0prdb3F9c3AiOKXu4s3A3jBxE-qIt1npHOf16R0,631
|
|
150
153
|
cartesia/tts/utils/constants.py,sha256=1CHa5flJf8--L_eYyOyOiWJNZ-Q81ufHZxDbJs8xYSk,418
|
|
151
154
|
cartesia/tts/utils/tts.py,sha256=u7PgPxlJs6fcQTfr-jqAvBCAaK3JWLhF5QF4s-PwoMo,2093
|
|
@@ -198,7 +201,7 @@ cartesia/voices/types/voice_expand_options.py,sha256=e4FroWdlxEE-LXQfT1RWlGHtswl
|
|
|
198
201
|
cartesia/voices/types/voice_id.py,sha256=GDoXcRVeIm-V21R4suxG2zqLD3DLYkXE9kgizadzFKo,79
|
|
199
202
|
cartesia/voices/types/voice_metadata.py,sha256=4KNGjXMUKm3niv-NvKIFVGtiilpH13heuzKcZYNQxk4,1181
|
|
200
203
|
cartesia/voices/types/weight.py,sha256=XqDU7_JItNUb5QykIDqTbELlRYQdbt2SviRgW0w2LKo,80
|
|
201
|
-
cartesia-2.0.
|
|
202
|
-
cartesia-2.0.
|
|
203
|
-
cartesia-2.0.
|
|
204
|
-
cartesia-2.0.
|
|
204
|
+
cartesia-2.0.14.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
|
205
|
+
cartesia-2.0.14.dist-info/METADATA,sha256=PVHHNLFx-PEOKbszJkawRJ6xxooIpJpxGNRM8wtBa8k,20671
|
|
206
|
+
cartesia-2.0.14.dist-info/WHEEL,sha256=Zb28QaM1gQi8f4VCBhsUklF61CTlNYfs9YAZn-TOGFk,88
|
|
207
|
+
cartesia-2.0.14.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|