cartesia 2.0.4__py3-none-any.whl → 2.0.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cartesia/__init__.py +60 -1
- cartesia/auth/client.py +8 -8
- cartesia/auth/requests/token_grant.py +7 -1
- cartesia/auth/requests/token_request.py +3 -3
- cartesia/auth/types/token_grant.py +7 -2
- cartesia/auth/types/token_request.py +3 -3
- cartesia/base_client.py +2 -0
- cartesia/client.py +5 -0
- cartesia/core/client_wrapper.py +1 -1
- cartesia/stt/__init__.py +57 -0
- cartesia/stt/_async_websocket.py +293 -0
- cartesia/stt/_websocket.py +294 -0
- cartesia/stt/client.py +456 -0
- cartesia/stt/requests/__init__.py +29 -0
- cartesia/stt/requests/done_message.py +14 -0
- cartesia/stt/requests/error_message.py +16 -0
- cartesia/stt/requests/flush_done_message.py +14 -0
- cartesia/stt/requests/streaming_transcription_response.py +41 -0
- cartesia/stt/requests/transcript_message.py +40 -0
- cartesia/stt/requests/transcription_response.py +28 -0
- cartesia/stt/requests/transcription_word.py +20 -0
- cartesia/stt/socket_client.py +138 -0
- cartesia/stt/types/__init__.py +33 -0
- cartesia/stt/types/done_message.py +26 -0
- cartesia/stt/types/error_message.py +27 -0
- cartesia/stt/types/flush_done_message.py +26 -0
- cartesia/stt/types/streaming_transcription_response.py +94 -0
- cartesia/stt/types/stt_encoding.py +7 -0
- cartesia/stt/types/timestamp_granularity.py +5 -0
- cartesia/stt/types/transcript_message.py +50 -0
- cartesia/stt/types/transcription_response.py +38 -0
- cartesia/stt/types/transcription_word.py +32 -0
- cartesia/tts/__init__.py +8 -0
- cartesia/tts/client.py +50 -8
- cartesia/tts/requests/__init__.py +4 -0
- cartesia/tts/requests/generation_request.py +4 -4
- cartesia/tts/requests/sse_output_format.py +11 -0
- cartesia/tts/requests/ttssse_request.py +47 -0
- cartesia/tts/requests/web_socket_chunk_response.py +0 -3
- cartesia/tts/requests/web_socket_response.py +1 -2
- cartesia/tts/requests/web_socket_tts_request.py +9 -1
- cartesia/tts/types/__init__.py +4 -0
- cartesia/tts/types/generation_request.py +4 -4
- cartesia/tts/types/sse_output_format.py +22 -0
- cartesia/tts/types/ttssse_request.py +58 -0
- cartesia/tts/types/web_socket_chunk_response.py +1 -3
- cartesia/tts/types/web_socket_response.py +1 -2
- cartesia/tts/types/web_socket_tts_request.py +11 -3
- cartesia/voice_changer/requests/streaming_response.py +0 -2
- cartesia/voice_changer/types/streaming_response.py +0 -2
- {cartesia-2.0.4.dist-info → cartesia-2.0.6.dist-info}/METADATA +256 -2
- {cartesia-2.0.4.dist-info → cartesia-2.0.6.dist-info}/RECORD +53 -26
- {cartesia-2.0.4.dist-info → cartesia-2.0.6.dist-info}/WHEEL +0 -0
@@ -8,10 +8,12 @@ from .output_format import OutputFormatParams, OutputFormat_Mp3Params, OutputFor
 from .phoneme_timestamps import PhonemeTimestampsParams
 from .raw_output_format import RawOutputFormatParams
 from .speed import SpeedParams
+from .sse_output_format import SseOutputFormatParams
 from .tts_request import TtsRequestParams
 from .tts_request_embedding_specifier import TtsRequestEmbeddingSpecifierParams
 from .tts_request_id_specifier import TtsRequestIdSpecifierParams
 from .tts_request_voice_specifier import TtsRequestVoiceSpecifierParams
+from .ttssse_request import TtssseRequestParams
 from .wav_output_format import WavOutputFormatParams
 from .web_socket_base_response import WebSocketBaseResponseParams
 from .web_socket_chunk_response import WebSocketChunkResponseParams
@@ -48,10 +50,12 @@ __all__ = [
     "PhonemeTimestampsParams",
     "RawOutputFormatParams",
     "SpeedParams",
+    "SseOutputFormatParams",
     "TtsRequestEmbeddingSpecifierParams",
     "TtsRequestIdSpecifierParams",
     "TtsRequestParams",
     "TtsRequestVoiceSpecifierParams",
+    "TtssseRequestParams",
     "WavOutputFormatParams",
     "WebSocketBaseResponseParams",
     "WebSocketChunkResponseParams",
@@ -55,15 +55,15 @@ class GenerationRequestParams(typing_extensions.TypedDict):
 
     add_timestamps: typing_extensions.NotRequired[bool]
     """
-    Whether to return word-level timestamps.
+    Whether to return word-level timestamps. If `false` (default), no word timestamps will be produced at all. If `true`, the server will return timestamp events containing word-level timing information.
     """
 
     add_phoneme_timestamps: typing_extensions.NotRequired[bool]
     """
-    Whether to return phoneme-level timestamps.
+    Whether to return phoneme-level timestamps. If `false` (default), no phoneme timestamps will be produced. If `true`, the server will return timestamp events containing phoneme-level timing information.
     """
 
-
+    use_normalized_timestamps: typing_extensions.NotRequired[bool]
     """
-    Whether to use
+    Whether to use normalized timestamps (True) or original timestamps (False).
     """
@@ -0,0 +1,11 @@
+# This file was auto-generated by Fern from our API Definition.
+
+import typing_extensions
+import typing
+from ..types.raw_encoding import RawEncoding
+
+
+class SseOutputFormatParams(typing_extensions.TypedDict):
+    container: typing.Literal["raw"]
+    encoding: RawEncoding
+    sample_rate: int
@@ -0,0 +1,47 @@
+# This file was auto-generated by Fern from our API Definition.
+
+import typing_extensions
+from .tts_request_voice_specifier import TtsRequestVoiceSpecifierParams
+import typing_extensions
+from ..types.supported_language import SupportedLanguage
+from .sse_output_format import SseOutputFormatParams
+from ..types.model_speed import ModelSpeed
+from ..types.context_id import ContextId
+
+
+class TtssseRequestParams(typing_extensions.TypedDict):
+    model_id: str
+    """
+    The ID of the model to use for the generation. See [Models](/build-with-cartesia/models) for available models.
+    """
+
+    transcript: str
+    voice: TtsRequestVoiceSpecifierParams
+    language: typing_extensions.NotRequired[SupportedLanguage]
+    output_format: SseOutputFormatParams
+    duration: typing_extensions.NotRequired[float]
+    """
+    The maximum duration of the audio in seconds. You do not usually need to specify this.
+    If the duration is not appropriate for the length of the transcript, the output audio may be truncated.
+    """
+
+    speed: typing_extensions.NotRequired[ModelSpeed]
+    add_timestamps: typing_extensions.NotRequired[bool]
+    """
+    Whether to return word-level timestamps. If `false` (default), no word timestamps will be produced at all. If `true`, the server will return timestamp events containing word-level timing information.
+    """
+
+    add_phoneme_timestamps: typing_extensions.NotRequired[bool]
+    """
+    Whether to return phoneme-level timestamps. If `false` (default), no phoneme timestamps will be produced - if `add_timestamps` is `true`, the produced timestamps will be word timestamps instead. If `true`, the server will return timestamp events containing phoneme-level timing information.
+    """
+
+    use_normalized_timestamps: typing_extensions.NotRequired[bool]
+    """
+    Whether to use normalized timestamps (True) or original timestamps (False).
+    """
+
+    context_id: typing_extensions.NotRequired[ContextId]
+    """
+    Optional context ID for this request.
+    """
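The two additions above, `SseOutputFormatParams` and `TtssseRequestParams`, are plain TypedDicts, so a request body for the new SSE TTS route is just a dictionary with these keys. A minimal sketch using only the fields introduced in this diff; the model ID, voice-specifier shape, and encoding value are illustrative assumptions, and the client method that consumes the payload is not shown in this diff:

```python
# Hypothetical TtssseRequestParams payload built from the fields added above.
# The model ID, voice-specifier shape, and encoding value are assumptions for illustration.
request_params = {
    "model_id": "<model-id>",
    "transcript": "Hello from the new SSE endpoint.",
    "voice": {"mode": "id", "id": "<voice-id>"},  # TtsRequestVoiceSpecifierParams (shape assumed)
    "output_format": {  # SseOutputFormatParams
        "container": "raw",
        "encoding": "pcm_f32le",  # assumed RawEncoding value
        "sample_rate": 44100,
    },
    "add_timestamps": True,  # request word-level timestamp events
    "use_normalized_timestamps": False,
}
```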
@@ -1,11 +1,8 @@
 # This file was auto-generated by Fern from our API Definition.
 
 from .web_socket_base_response import WebSocketBaseResponseParams
-import typing_extensions
-from ..types.flush_id import FlushId
 
 
 class WebSocketChunkResponseParams(WebSocketBaseResponseParams):
     data: str
     step_time: float
-    flush_id: typing_extensions.NotRequired[FlushId]
@@ -4,8 +4,8 @@ from __future__ import annotations
 import typing_extensions
 import typing
 import typing_extensions
-from ..types.flush_id import FlushId
 from ..types.context_id import ContextId
+from ..types.flush_id import FlushId
 from .word_timestamps import WordTimestampsParams
 from .phoneme_timestamps import PhonemeTimestampsParams
 
@@ -14,7 +14,6 @@ class WebSocketResponse_ChunkParams(typing_extensions.TypedDict):
     type: typing.Literal["chunk"]
     data: str
     step_time: float
-    flush_id: typing_extensions.NotRequired[FlushId]
     context_id: typing_extensions.NotRequired[ContextId]
     status_code: int
     done: bool
@@ -20,8 +20,16 @@ class WebSocketTtsRequestParams(typing_extensions.TypedDict):
     duration: typing_extensions.NotRequired[int]
     language: typing_extensions.NotRequired[str]
     add_timestamps: typing_extensions.NotRequired[bool]
-
+    """
+    Whether to return word-level timestamps. If `false` (default), no word timestamps will be produced at all. If `true`, the server will return timestamp events containing word-level timing information.
+    """
+
     add_phoneme_timestamps: typing_extensions.NotRequired[bool]
+    """
+    Whether to return phoneme-level timestamps. If `false` (default), no phoneme timestamps will be produced - if `add_timestamps` is `true`, the produced timestamps will be word timestamps instead. If `true`, the server will return timestamp events containing phoneme-level timing information.
+    """
+
+    use_normalized_timestamps: typing_extensions.NotRequired[bool]
     continue_: typing_extensions.NotRequired[typing_extensions.Annotated[bool, FieldMetadata(alias="continue")]]
     context_id: typing_extensions.NotRequired[str]
     max_buffer_delay_ms: typing_extensions.NotRequired[int]
cartesia/tts/types/__init__.py
CHANGED
@@ -15,11 +15,13 @@ from .phoneme_timestamps import PhonemeTimestamps
 from .raw_encoding import RawEncoding
 from .raw_output_format import RawOutputFormat
 from .speed import Speed
+from .sse_output_format import SseOutputFormat
 from .supported_language import SupportedLanguage
 from .tts_request import TtsRequest
 from .tts_request_embedding_specifier import TtsRequestEmbeddingSpecifier
 from .tts_request_id_specifier import TtsRequestIdSpecifier
 from .tts_request_voice_specifier import TtsRequestVoiceSpecifier
+from .ttssse_request import TtssseRequest
 from .wav_output_format import WavOutputFormat
 from .web_socket_base_response import WebSocketBaseResponse
 from .web_socket_chunk_response import WebSocketChunkResponse
@@ -63,11 +65,13 @@ __all__ = [
     "RawEncoding",
     "RawOutputFormat",
     "Speed",
+    "SseOutputFormat",
     "SupportedLanguage",
     "TtsRequest",
     "TtsRequestEmbeddingSpecifier",
     "TtsRequestIdSpecifier",
     "TtsRequestVoiceSpecifier",
+    "TtssseRequest",
     "WavOutputFormat",
     "WebSocketBaseResponse",
     "WebSocketChunkResponse",
@@ -59,17 +59,17 @@ class GenerationRequest(UniversalBaseModel):
 
     add_timestamps: typing.Optional[bool] = pydantic.Field(default=None)
     """
-    Whether to return word-level timestamps.
+    Whether to return word-level timestamps. If `false` (default), no word timestamps will be produced at all. If `true`, the server will return timestamp events containing word-level timing information.
     """
 
     add_phoneme_timestamps: typing.Optional[bool] = pydantic.Field(default=None)
     """
-    Whether to return phoneme-level timestamps.
+    Whether to return phoneme-level timestamps. If `false` (default), no phoneme timestamps will be produced. If `true`, the server will return timestamp events containing phoneme-level timing information.
     """
 
-
+    use_normalized_timestamps: typing.Optional[bool] = pydantic.Field(default=None)
     """
-    Whether to use
+    Whether to use normalized timestamps (True) or original timestamps (False).
     """
 
     if IS_PYDANTIC_V2:
@@ -0,0 +1,22 @@
+# This file was auto-generated by Fern from our API Definition.
+
+from ...core.pydantic_utilities import UniversalBaseModel
+import typing
+from .raw_encoding import RawEncoding
+from ...core.pydantic_utilities import IS_PYDANTIC_V2
+import pydantic
+
+
+class SseOutputFormat(UniversalBaseModel):
+    container: typing.Literal["raw"] = "raw"
+    encoding: RawEncoding
+    sample_rate: int
+
+    if IS_PYDANTIC_V2:
+        model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2
+    else:
+
+        class Config:
+            frozen = True
+            smart_union = True
+            extra = pydantic.Extra.allow
@@ -0,0 +1,58 @@
+# This file was auto-generated by Fern from our API Definition.
+
+from ...core.pydantic_utilities import UniversalBaseModel
+import pydantic
+from .tts_request_voice_specifier import TtsRequestVoiceSpecifier
+import typing
+from .supported_language import SupportedLanguage
+from .sse_output_format import SseOutputFormat
+from .model_speed import ModelSpeed
+from .context_id import ContextId
+from ...core.pydantic_utilities import IS_PYDANTIC_V2
+
+
+class TtssseRequest(UniversalBaseModel):
+    model_id: str = pydantic.Field()
+    """
+    The ID of the model to use for the generation. See [Models](/build-with-cartesia/models) for available models.
+    """
+
+    transcript: str
+    voice: TtsRequestVoiceSpecifier
+    language: typing.Optional[SupportedLanguage] = None
+    output_format: SseOutputFormat
+    duration: typing.Optional[float] = pydantic.Field(default=None)
+    """
+    The maximum duration of the audio in seconds. You do not usually need to specify this.
+    If the duration is not appropriate for the length of the transcript, the output audio may be truncated.
+    """
+
+    speed: typing.Optional[ModelSpeed] = None
+    add_timestamps: typing.Optional[bool] = pydantic.Field(default=None)
+    """
+    Whether to return word-level timestamps. If `false` (default), no word timestamps will be produced at all. If `true`, the server will return timestamp events containing word-level timing information.
+    """
+
+    add_phoneme_timestamps: typing.Optional[bool] = pydantic.Field(default=None)
+    """
+    Whether to return phoneme-level timestamps. If `false` (default), no phoneme timestamps will be produced - if `add_timestamps` is `true`, the produced timestamps will be word timestamps instead. If `true`, the server will return timestamp events containing phoneme-level timing information.
+    """
+
+    use_normalized_timestamps: typing.Optional[bool] = pydantic.Field(default=None)
+    """
+    Whether to use normalized timestamps (True) or original timestamps (False).
+    """
+
+    context_id: typing.Optional[ContextId] = pydantic.Field(default=None)
+    """
+    Optional context ID for this request.
+    """
+
+    if IS_PYDANTIC_V2:
+        model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2
+    else:
+
+        class Config:
+            frozen = True
+            smart_union = True
+            extra = pydantic.Extra.allow
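On the Pydantic side, `SseOutputFormat` and `TtssseRequest` are frozen `UniversalBaseModel` subclasses, and `SseOutputFormat.container` defaults to `"raw"`. A minimal sketch assuming the export from `cartesia.tts.types` shown earlier in this diff; the encoding value is an assumed member of `RawEncoding`, and `TtssseRequest` construction is omitted because the voice-specifier shape is not part of this diff:

```python
# Import path assumed from the new files and __init__ exports added under cartesia/tts/types/.
from cartesia.tts.types import SseOutputFormat

# container defaults to "raw"; the encoding value here is an assumption about RawEncoding.
fmt = SseOutputFormat(encoding="pcm_s16le", sample_rate=16000)
print(fmt.container, fmt.sample_rate)  # -> raw 16000
```

Because the model is frozen, fields cannot be reassigned after construction; build a new instance instead.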
@@ -1,16 +1,14 @@
 # This file was auto-generated by Fern from our API Definition.
 
 from .web_socket_base_response import WebSocketBaseResponse
-import typing
-from .flush_id import FlushId
 from ...core.pydantic_utilities import IS_PYDANTIC_V2
+import typing
 import pydantic
 
 
 class WebSocketChunkResponse(WebSocketBaseResponse):
     data: str
     step_time: float
-    flush_id: typing.Optional[FlushId] = None
 
     if IS_PYDANTIC_V2:
         model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2
@@ -3,10 +3,10 @@
 from __future__ import annotations
 from ...core.pydantic_utilities import UniversalBaseModel
 import typing
-from .flush_id import FlushId
 from .context_id import ContextId
 from ...core.pydantic_utilities import IS_PYDANTIC_V2
 import pydantic
+from .flush_id import FlushId
 from .word_timestamps import WordTimestamps
 from .phoneme_timestamps import PhonemeTimestamps
 
@@ -15,7 +15,6 @@ class WebSocketResponse_Chunk(UniversalBaseModel):
     type: typing.Literal["chunk"] = "chunk"
     data: str
     step_time: float
-    flush_id: typing.Optional[FlushId] = None
     context_id: typing.Optional[ContextId] = None
     status_code: int
     done: bool
@@ -22,9 +22,17 @@ class WebSocketTtsRequest(UniversalBaseModel):
     voice: TtsRequestVoiceSpecifier
     duration: typing.Optional[int] = None
     language: typing.Optional[str] = None
-    add_timestamps: typing.Optional[bool] = None
-
-
+    add_timestamps: typing.Optional[bool] = pydantic.Field(default=None)
+    """
+    Whether to return word-level timestamps. If `false` (default), no word timestamps will be produced at all. If `true`, the server will return timestamp events containing word-level timing information.
+    """
+
+    add_phoneme_timestamps: typing.Optional[bool] = pydantic.Field(default=None)
+    """
+    Whether to return phoneme-level timestamps. If `false` (default), no phoneme timestamps will be produced - if `add_timestamps` is `true`, the produced timestamps will be word timestamps instead. If `true`, the server will return timestamp events containing phoneme-level timing information.
+    """
+
+    use_normalized_timestamps: typing.Optional[bool] = None
     continue_: typing_extensions.Annotated[typing.Optional[bool], FieldMetadata(alias="continue")] = None
     context_id: typing.Optional[str] = None
     max_buffer_delay_ms: typing.Optional[int] = None
@@ -4,7 +4,6 @@ from __future__ import annotations
 import typing_extensions
 import typing
 import typing_extensions
-from ...tts.types.flush_id import FlushId
 from ...tts.types.context_id import ContextId
 
 
@@ -12,7 +11,6 @@ class StreamingResponse_ChunkParams(typing_extensions.TypedDict):
     type: typing.Literal["chunk"]
     data: str
     step_time: float
-    flush_id: typing_extensions.NotRequired[FlushId]
     context_id: typing_extensions.NotRequired[ContextId]
     status_code: int
     done: bool
@@ -3,7 +3,6 @@
 from __future__ import annotations
 from ...core.pydantic_utilities import UniversalBaseModel
 import typing
-from ...tts.types.flush_id import FlushId
 from ...tts.types.context_id import ContextId
 from ...core.pydantic_utilities import IS_PYDANTIC_V2
 import pydantic
@@ -13,7 +12,6 @@ class StreamingResponse_Chunk(UniversalBaseModel):
     type: typing.Literal["chunk"] = "chunk"
     data: str
     step_time: float
-    flush_id: typing.Optional[FlushId] = None
     context_id: typing.Optional[ContextId] = None
     status_code: int
     done: bool
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: cartesia
-Version: 2.0.4
+Version: 2.0.6
 Summary:
 Requires-Python: >=3.8,<4.0
 Classifier: Intended Audience :: Developers
@@ -213,6 +213,258 @@ p.terminate()
 ws.close() # Close the websocket connection
 ```
 
+## Speech-to-Text (STT) with Websockets
+
+```python
+from cartesia import Cartesia
+import os
+
+client = Cartesia(api_key=os.getenv("CARTESIA_API_KEY"))
+
+# Load your audio file as bytes
+with open("path/to/audio.wav", "rb") as f:
+    audio_data = f.read()
+
+# Convert to audio chunks (20ms chunks used here for a streaming example)
+# This chunk size is calculated for 16kHz, 16-bit audio: 16000 * 0.02 * 2 = 640 bytes
+chunk_size = 640
+audio_chunks = [audio_data[i:i+chunk_size] for i in range(0, len(audio_data), chunk_size)]
+
+# Create websocket connection with endpointing parameters
+ws = client.stt.websocket(
+    model="ink-whisper", # Model (required)
+    language="en", # Language of your audio (required)
+    encoding="pcm_s16le", # Audio encoding format (required)
+    sample_rate=16000, # Audio sample rate (required)
+    min_volume=0.1, # Volume threshold for voice activity detection
+    max_silence_duration_secs=0.4, # Maximum silence duration before endpointing
+)
+
+# Send audio chunks (streaming approach)
+for chunk in audio_chunks:
+    ws.send(chunk)
+
+# Finalize and close
+ws.send("finalize")
+ws.send("done")
+
+# Receive transcription results with word-level timestamps
+for result in ws.receive():
+    if result['type'] == 'transcript':
+        print(f"Transcription: {result['text']}")
+
+        # Handle word-level timestamps if available
+        if 'words' in result and result['words']:
+            print("Word-level timestamps:")
+            for word_info in result['words']:
+                word = word_info['word']
+                start = word_info['start']
+                end = word_info['end']
+                print(f" '{word}': {start:.2f}s - {end:.2f}s")
+
+        if result['is_final']:
+            print("Final result received")
+    elif result['type'] == 'done':
+        break
+
+ws.close()
+```
+
+### Async Streaming Speech-to-Text (STT) with Websockets
+
+For real-time streaming applications, here's a more practical async example that demonstrates concurrent audio processing and result handling:
+
+```python
+import asyncio
+import os
+from cartesia import AsyncCartesia
+
+async def streaming_stt_example():
+    """
+    Advanced async STT example for real-time streaming applications.
+    This example simulates streaming audio processing with proper error handling
+    and demonstrates the new endpointing and word timestamp features.
+    """
+    client = AsyncCartesia(api_key=os.getenv("CARTESIA_API_KEY"))
+
+    try:
+        # Create websocket connection with voice activity detection
+        ws = await client.stt.websocket(
+            model="ink-whisper", # Model (required)
+            language="en", # Language of your audio (required)
+            encoding="pcm_s16le", # Audio encoding format (required)
+            sample_rate=16000, # Audio sample rate (required)
+            min_volume=0.15, # Volume threshold for voice activity detection
+            max_silence_duration_secs=0.3, # Maximum silence duration before endpointing
+        )
+
+        # Simulate streaming audio data (replace with your audio source)
+        async def audio_stream():
+            """Simulate real-time audio streaming - replace with actual audio capture"""
+            # Load audio file for simulation
+            with open("path/to/audio.wav", "rb") as f:
+                audio_data = f.read()
+
+            # Stream in 100ms chunks (realistic for real-time processing)
+            chunk_size = int(16000 * 0.1 * 2) # 100ms at 16kHz, 16-bit
+
+            for i in range(0, len(audio_data), chunk_size):
+                chunk = audio_data[i:i + chunk_size]
+                if chunk:
+                    yield chunk
+                    # Simulate real-time streaming delay
+                    await asyncio.sleep(0.1)
+
+        # Send audio and receive results concurrently
+        async def send_audio():
+            """Send audio chunks to the STT websocket"""
+            try:
+                async for chunk in audio_stream():
+                    await ws.send(chunk)
+                    print(f"Sent audio chunk of {len(chunk)} bytes")
+                    # Small delay to simulate realtime applications
+                    await asyncio.sleep(0.02)
+
+                # Signal end of audio stream
+                await ws.send("finalize")
+                await ws.send("done")
+                print("Audio streaming completed")
+
+            except Exception as e:
+                print(f"Error sending audio: {e}")
+
+        async def receive_transcripts():
+            """Receive and process transcription results with word timestamps"""
+            full_transcript = ""
+            all_word_timestamps = []
+
+            try:
+                async for result in ws.receive():
+                    if result['type'] == 'transcript':
+                        text = result['text']
+                        is_final = result['is_final']
+
+                        # Handle word-level timestamps
+                        if 'words' in result and result['words']:
+                            word_timestamps = result['words']
+                            all_word_timestamps.extend(word_timestamps)
+
+                            if is_final:
+                                print("Word-level timestamps:")
+                                for word_info in word_timestamps:
+                                    word = word_info['word']
+                                    start = word_info['start']
+                                    end = word_info['end']
+                                    print(f" '{word}': {start:.2f}s - {end:.2f}s")
+
+                        if is_final:
+                            # Final result - this text won't change
+                            full_transcript += text + " "
+                            print(f"FINAL: {text}")
+                        else:
+                            # Partial result - may change as more audio is processed
+                            print(f"PARTIAL: {text}")
+
+                    elif result['type'] == 'done':
+                        print("Transcription completed")
+                        break
+
+            except Exception as e:
+                print(f"Error receiving transcripts: {e}")
+
+            return full_transcript.strip(), all_word_timestamps
+
+        print("Starting streaming STT...")
+
+        # Use asyncio.gather to run audio sending and transcript receiving concurrently
+        _, (final_transcript, word_timestamps) = await asyncio.gather(
+            send_audio(),
+            receive_transcripts()
+        )
+
+        print(f"\nComplete transcript: {final_transcript}")
+        print(f"Total words with timestamps: {len(word_timestamps)}")
+
+        # Clean up
+        await ws.close()
+
+    except Exception as e:
+        print(f"STT streaming error: {e}")
+    finally:
+        await client.close()
+
+# Run the example
+if __name__ == "__main__":
+    asyncio.run(streaming_stt_example())
+```
+
+## Batch Speech-to-Text (STT)
+
+For processing pre-recorded audio files, use the batch STT API which supports uploading complete audio files for transcription:
+
+```python
+from cartesia import Cartesia
+import os
+
+client = Cartesia(api_key=os.getenv("CARTESIA_API_KEY"))
+
+# Transcribe an audio file with word-level timestamps
+with open("path/to/audio.wav", "rb") as audio_file:
+    response = client.stt.transcribe(
+        file=audio_file, # Audio file to transcribe
+        model="ink-whisper", # STT model (required)
+        language="en", # Language of the audio (optional)
+        timestamp_granularities=["word"], # Include word-level timestamps (optional)
+        encoding="pcm_s16le", # Audio encoding (optional)
+        sample_rate=16000, # Audio sample rate (optional)
+    )
+
+# Access transcription results
+print(f"Transcribed text: {response.text}")
+print(f"Audio duration: {response.duration:.2f} seconds")
+
+# Process word-level timestamps if requested
+if response.words:
+    print("\nWord-level timestamps:")
+    for word_info in response.words:
+        word = word_info.word
+        start = word_info.start
+        end = word_info.end
+        print(f" '{word}': {start:.2f}s - {end:.2f}s")
+```
+
+### Async Batch STT
+
+```python
+import asyncio
+from cartesia import AsyncCartesia
+import os
+
+async def transcribe_file():
+    client = AsyncCartesia(api_key=os.getenv("CARTESIA_API_KEY"))
+
+    with open("path/to/audio.wav", "rb") as audio_file:
+        response = await client.stt.transcribe(
+            file=audio_file,
+            model="ink-whisper",
+            language="en",
+            timestamp_granularities=["word"],
+        )
+
+    print(f"Transcribed text: {response.text}")
+
+    # Process word timestamps
+    if response.words:
+        for word_info in response.words:
+            print(f"'{word_info.word}': {word_info.start:.2f}s - {word_info.end:.2f}s")
+
+    await client.close()
+
+asyncio.run(transcribe_file())
+```
+
+> **Note:** Batch STT also supports OpenAI's audio transcriptions format for easy migration from OpenAI Whisper. See our [migration guide](https://docs.cartesia.ai/api-reference/stt/migrate-from-open-ai) for details.
+
 ## Voices
 
 List all available Voices with `client.voices.list`, which returns an iterable that automatically handles pagination:
@@ -358,7 +610,6 @@ new_voice = client.voices.create(
     language="en"
 )
 ```
-
 ### Custom Client
 
 You can override the `httpx` client to customize it for your use-case. Some common use-cases include support for proxies
@@ -412,3 +663,6 @@ $ git commit --amend -m "manually regenerate from docs" # optional
 
 From https://github.com/cartesia-ai/docs click `Actions` then `Release Python SDK`. (Requires permissions.)
 
+
+
+