cartesia 2.0.4__py3-none-any.whl → 2.0.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cartesia/__init__.py +60 -1
- cartesia/auth/client.py +8 -8
- cartesia/auth/requests/token_grant.py +7 -1
- cartesia/auth/requests/token_request.py +3 -3
- cartesia/auth/types/token_grant.py +7 -2
- cartesia/auth/types/token_request.py +3 -3
- cartesia/base_client.py +2 -0
- cartesia/client.py +5 -0
- cartesia/core/client_wrapper.py +1 -1
- cartesia/stt/__init__.py +57 -0
- cartesia/stt/_async_websocket.py +293 -0
- cartesia/stt/_websocket.py +294 -0
- cartesia/stt/client.py +456 -0
- cartesia/stt/requests/__init__.py +29 -0
- cartesia/stt/requests/done_message.py +14 -0
- cartesia/stt/requests/error_message.py +16 -0
- cartesia/stt/requests/flush_done_message.py +14 -0
- cartesia/stt/requests/streaming_transcription_response.py +41 -0
- cartesia/stt/requests/transcript_message.py +40 -0
- cartesia/stt/requests/transcription_response.py +28 -0
- cartesia/stt/requests/transcription_word.py +20 -0
- cartesia/stt/socket_client.py +138 -0
- cartesia/stt/types/__init__.py +33 -0
- cartesia/stt/types/done_message.py +26 -0
- cartesia/stt/types/error_message.py +27 -0
- cartesia/stt/types/flush_done_message.py +26 -0
- cartesia/stt/types/streaming_transcription_response.py +94 -0
- cartesia/stt/types/stt_encoding.py +7 -0
- cartesia/stt/types/timestamp_granularity.py +5 -0
- cartesia/stt/types/transcript_message.py +50 -0
- cartesia/stt/types/transcription_response.py +38 -0
- cartesia/stt/types/transcription_word.py +32 -0
- cartesia/tts/__init__.py +8 -0
- cartesia/tts/client.py +50 -8
- cartesia/tts/requests/__init__.py +4 -0
- cartesia/tts/requests/generation_request.py +4 -4
- cartesia/tts/requests/sse_output_format.py +11 -0
- cartesia/tts/requests/ttssse_request.py +47 -0
- cartesia/tts/requests/web_socket_chunk_response.py +0 -3
- cartesia/tts/requests/web_socket_response.py +1 -2
- cartesia/tts/requests/web_socket_tts_request.py +9 -1
- cartesia/tts/types/__init__.py +4 -0
- cartesia/tts/types/generation_request.py +4 -4
- cartesia/tts/types/sse_output_format.py +22 -0
- cartesia/tts/types/ttssse_request.py +58 -0
- cartesia/tts/types/web_socket_chunk_response.py +1 -3
- cartesia/tts/types/web_socket_response.py +1 -2
- cartesia/tts/types/web_socket_tts_request.py +11 -3
- cartesia/voice_changer/requests/streaming_response.py +0 -2
- cartesia/voice_changer/types/streaming_response.py +0 -2
- {cartesia-2.0.4.dist-info → cartesia-2.0.6.dist-info}/METADATA +256 -2
- {cartesia-2.0.4.dist-info → cartesia-2.0.6.dist-info}/RECORD +53 -26
- {cartesia-2.0.4.dist-info → cartesia-2.0.6.dist-info}/WHEEL +0 -0
@@ -0,0 +1,138 @@
|
|
1
|
+
import typing
|
2
|
+
from typing import Any, Dict, Generator, Optional
|
3
|
+
|
4
|
+
from ..core.client_wrapper import AsyncClientWrapper, SyncClientWrapper
|
5
|
+
from ._async_websocket import AsyncSttWebsocket
|
6
|
+
from ._websocket import SttWebsocket
|
7
|
+
from .client import AsyncSttClient, SttClient
|
8
|
+
from .types.stt_encoding import SttEncoding
|
9
|
+
|
10
|
+
|
11
|
+
class SttClientWithWebsocket(SttClient):
    """
    STT client that can additionally open a synchronous WebSocket STT connection.
    """

    def __init__(self, *, client_wrapper: SyncClientWrapper):
        super().__init__(client_wrapper=client_wrapper)

    def _ws_url(self):
        """Return the configured base URL expressed with a WebSocket scheme.

        A base URL that already starts with ``ws://`` or ``wss://`` is returned
        unchanged. Otherwise any existing scheme is stripped and replaced with
        ``ws`` when the URL contains ``localhost``, or ``wss`` in every other case.
        """
        url = self._client_wrapper.get_base_url()
        if url.startswith(("ws://", "wss://")):
            return url
        # Keep only what follows the last "://" (the whole string if there is none).
        host_and_path = url.rpartition("://")[2]
        scheme = "ws" if "localhost" in url else "wss"
        return f"{scheme}://{host_and_path}"

    def websocket(
        self,
        *,
        model: str = "ink-whisper",
        language: Optional[str] = "en",
        encoding: SttEncoding = "pcm_s16le",
        sample_rate: int = 16000,
        min_volume: Optional[float] = None,
        max_silence_duration_secs: Optional[float] = None,
    ):
        """Open a WebSocket connection for real-time speech transcription.

        Args:
            model: ID of the model to use for transcription (defaults to "ink-whisper")
            language: The language of the input audio in ISO-639-1 format (defaults to "en")
            encoding: The encoding format of the audio data (defaults to "pcm_s16le")
            sample_rate: The sample rate of the audio in Hz (defaults to 16000)
            min_volume: Volume threshold for voice activity detection (0.0-1.0)
            max_silence_duration_secs: Maximum duration of silence before endpointing

        Returns:
            SttWebsocket: The WebSocket client, after ``connect`` has been called on it.

        Example:
            >>> client = Cartesia(api_key="your-api-key")
            >>> ws = client.stt.websocket()
            >>> for result in ws.transcribe(audio_chunks):
            ...     print(result["text"])
        """
        headers = self._client_wrapper.get_headers()
        url = self._ws_url()
        version = headers["Cartesia-Version"]
        key = headers["X-API-Key"]
        socket = SttWebsocket(ws_url=url, cartesia_version=version, api_key=key)

        # Connect immediately so the caller receives a ready-to-use socket.
        connect_options = dict(
            model=model,
            language=language,
            encoding=encoding,
            sample_rate=sample_rate,
            min_volume=min_volume,
            max_silence_duration_secs=max_silence_duration_secs,
        )
        socket.connect(**connect_options)
        return socket
|
73
|
+
|
74
|
+
|
75
|
+
class AsyncSttClientWithWebsocket(AsyncSttClient):
    """
    STT client that can additionally open an asynchronous WebSocket STT connection.
    """

    def __init__(self, *, client_wrapper: AsyncClientWrapper, get_session):
        super().__init__(client_wrapper=client_wrapper)
        # Stored as-is and handed to every AsyncSttWebsocket this client creates.
        self._get_session = get_session

    def _ws_url(self) -> str:
        """Return the configured base URL expressed with a WebSocket scheme.

        A base URL that already starts with ``ws://`` or ``wss://`` is returned
        unchanged. Otherwise any existing scheme is stripped and replaced with
        ``ws`` when the URL contains ``localhost``, or ``wss`` in every other case.
        """
        url = self._client_wrapper.get_base_url()
        if url.startswith(("ws://", "wss://")):
            return url
        # Keep only what follows the last "://" (the whole string if there is none).
        host_and_path = url.rpartition("://")[2]
        scheme = "ws" if "localhost" in url else "wss"
        return f"{scheme}://{host_and_path}"

    async def websocket(
        self,
        *,
        model: str = "ink-whisper",
        language: Optional[str] = "en",
        encoding: SttEncoding = "pcm_s16le",
        sample_rate: int = 16000,
        min_volume: Optional[float] = None,
        max_silence_duration_secs: Optional[float] = None,
    ):
        """Open an async WebSocket connection for real-time speech transcription.

        Args:
            model: ID of the model to use for transcription (defaults to "ink-whisper")
            language: The language of the input audio in ISO-639-1 format (defaults to "en")
            encoding: The encoding format of the audio data (defaults to "pcm_s16le")
            sample_rate: The sample rate of the audio in Hz (defaults to 16000)
            min_volume: Volume threshold for voice activity detection (0.0-1.0)
            max_silence_duration_secs: Maximum duration of silence before endpointing

        Returns:
            AsyncSttWebsocket: The async WebSocket client, after ``connect`` has been awaited on it.

        Example:
            >>> client = AsyncCartesia(api_key="your-api-key")
            >>> ws = await client.stt.websocket()
            >>> async for result in ws.transcribe(audio_chunks):
            ...     print(result["text"])
        """
        headers = self._client_wrapper.get_headers()
        url = self._ws_url()
        version = headers["Cartesia-Version"]
        key = headers["X-API-Key"]
        socket = AsyncSttWebsocket(
            ws_url=url,
            cartesia_version=version,
            api_key=key,
            get_session=self._get_session,
        )

        # Connect immediately so the caller receives a ready-to-use socket.
        connect_options = dict(
            model=model,
            language=language,
            encoding=encoding,
            sample_rate=sample_rate,
            min_volume=min_volume,
            max_silence_duration_secs=max_silence_duration_secs,
        )
        await socket.connect(**connect_options)
        return socket
|
@@ -0,0 +1,33 @@
|
|
1
|
+
# This file was auto-generated by Fern from our API Definition.
|
2
|
+
|
3
|
+
from .done_message import DoneMessage
|
4
|
+
from .error_message import ErrorMessage
|
5
|
+
from .flush_done_message import FlushDoneMessage
|
6
|
+
from .streaming_transcription_response import (
|
7
|
+
StreamingTranscriptionResponse,
|
8
|
+
StreamingTranscriptionResponse_Done,
|
9
|
+
StreamingTranscriptionResponse_Error,
|
10
|
+
StreamingTranscriptionResponse_FlushDone,
|
11
|
+
StreamingTranscriptionResponse_Transcript,
|
12
|
+
)
|
13
|
+
from .stt_encoding import SttEncoding
|
14
|
+
from .timestamp_granularity import TimestampGranularity
|
15
|
+
from .transcript_message import TranscriptMessage
|
16
|
+
from .transcription_response import TranscriptionResponse
|
17
|
+
from .transcription_word import TranscriptionWord
|
18
|
+
|
19
|
+
__all__ = [
|
20
|
+
"DoneMessage",
|
21
|
+
"ErrorMessage",
|
22
|
+
"FlushDoneMessage",
|
23
|
+
"StreamingTranscriptionResponse",
|
24
|
+
"StreamingTranscriptionResponse_Done",
|
25
|
+
"StreamingTranscriptionResponse_Error",
|
26
|
+
"StreamingTranscriptionResponse_FlushDone",
|
27
|
+
"StreamingTranscriptionResponse_Transcript",
|
28
|
+
"SttEncoding",
|
29
|
+
"TimestampGranularity",
|
30
|
+
"TranscriptMessage",
|
31
|
+
"TranscriptionResponse",
|
32
|
+
"TranscriptionWord",
|
33
|
+
]
|
@@ -0,0 +1,26 @@
|
|
1
|
+
# This file was auto-generated by Fern from our API Definition.
|
2
|
+
|
3
|
+
from ...core.pydantic_utilities import UniversalBaseModel
|
4
|
+
import pydantic
|
5
|
+
from ...core.pydantic_utilities import IS_PYDANTIC_V2
|
6
|
+
import typing
|
7
|
+
|
8
|
+
|
9
|
+
class DoneMessage(UniversalBaseModel):
    """
    Server acknowledgment of a `done` command: the session is complete and the WebSocket will close.
    """

    request_id: str = pydantic.Field()
    """
    Unique identifier of the transcription session.
    """

    # Same settings (frozen, extra fields allowed) for whichever pydantic major version is installed.
    if not IS_PYDANTIC_V2:

        class Config:
            extra = pydantic.Extra.allow
            smart_union = True
            frozen = True

    else:
        model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(frozen=True, extra="allow")  # type: ignore # Pydantic v2
|
@@ -0,0 +1,27 @@
|
|
1
|
+
# This file was auto-generated by Fern from our API Definition.
|
2
|
+
|
3
|
+
from ...core.pydantic_utilities import UniversalBaseModel
|
4
|
+
import typing
|
5
|
+
import pydantic
|
6
|
+
from ...core.pydantic_utilities import IS_PYDANTIC_V2
|
7
|
+
|
8
|
+
|
9
|
+
class ErrorMessage(UniversalBaseModel):
    """
    Error payload: a human-readable message plus, when applicable, the related request ID.
    """

    request_id: typing.Optional[str] = pydantic.Field(default=None)
    """
    Request ID the error is associated with, if applicable.
    """

    message: str = pydantic.Field()
    """
    Human-readable description of what went wrong.
    """

    # Same settings (frozen, extra fields allowed) for whichever pydantic major version is installed.
    if not IS_PYDANTIC_V2:

        class Config:
            extra = pydantic.Extra.allow
            smart_union = True
            frozen = True

    else:
        model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(frozen=True, extra="allow")  # type: ignore # Pydantic v2
|
@@ -0,0 +1,26 @@
|
|
1
|
+
# This file was auto-generated by Fern from our API Definition.
|
2
|
+
|
3
|
+
from ...core.pydantic_utilities import UniversalBaseModel
|
4
|
+
import pydantic
|
5
|
+
from ...core.pydantic_utilities import IS_PYDANTIC_V2
|
6
|
+
import typing
|
7
|
+
|
8
|
+
|
9
|
+
class FlushDoneMessage(UniversalBaseModel):
    """
    Server acknowledgment of a `finalize` command: all buffered audio has been flushed and processed.
    """

    request_id: str = pydantic.Field()
    """
    Unique identifier of the transcription session.
    """

    # Same settings (frozen, extra fields allowed) for whichever pydantic major version is installed.
    if not IS_PYDANTIC_V2:

        class Config:
            extra = pydantic.Extra.allow
            smart_union = True
            frozen = True

    else:
        model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(frozen=True, extra="allow")  # type: ignore # Pydantic v2
|
@@ -0,0 +1,94 @@
|
|
1
|
+
# This file was auto-generated by Fern from our API Definition.
|
2
|
+
|
3
|
+
from __future__ import annotations
|
4
|
+
from ...core.pydantic_utilities import UniversalBaseModel
|
5
|
+
import typing
|
6
|
+
from .transcription_word import TranscriptionWord
|
7
|
+
from ...core.pydantic_utilities import IS_PYDANTIC_V2
|
8
|
+
import pydantic
|
9
|
+
|
10
|
+
|
11
|
+
class StreamingTranscriptionResponse_Transcript(UniversalBaseModel):
    """
    Streaming server message variant tagged with `type` = "transcript".

    Server messages carry transcription results, control messages, or errors; the `type` field tells them apart.
    """

    type: typing.Literal["transcript"] = "transcript"
    request_id: str
    text: str
    is_final: bool
    duration: typing.Optional[float] = None
    language: typing.Optional[str] = None
    words: typing.Optional[typing.List[TranscriptionWord]] = None

    # Same settings (frozen, extra fields allowed) for whichever pydantic major version is installed.
    if not IS_PYDANTIC_V2:

        class Config:
            extra = pydantic.Extra.allow
            smart_union = True
            frozen = True

    else:
        model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(frozen=True, extra="allow")  # type: ignore # Pydantic v2
|
32
|
+
|
33
|
+
|
34
|
+
class StreamingTranscriptionResponse_FlushDone(UniversalBaseModel):
    """
    Streaming server message variant tagged with `type` = "flush_done".

    Server messages carry transcription results, control messages, or errors; the `type` field tells them apart.
    """

    type: typing.Literal["flush_done"] = "flush_done"
    request_id: str

    # Same settings (frozen, extra fields allowed) for whichever pydantic major version is installed.
    if not IS_PYDANTIC_V2:

        class Config:
            extra = pydantic.Extra.allow
            smart_union = True
            frozen = True

    else:
        model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(frozen=True, extra="allow")  # type: ignore # Pydantic v2
|
50
|
+
|
51
|
+
|
52
|
+
class StreamingTranscriptionResponse_Done(UniversalBaseModel):
    """
    Streaming server message variant tagged with `type` = "done".

    Server messages carry transcription results, control messages, or errors; the `type` field tells them apart.
    """

    type: typing.Literal["done"] = "done"
    request_id: str

    # Same settings (frozen, extra fields allowed) for whichever pydantic major version is installed.
    if not IS_PYDANTIC_V2:

        class Config:
            extra = pydantic.Extra.allow
            smart_union = True
            frozen = True

    else:
        model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(frozen=True, extra="allow")  # type: ignore # Pydantic v2
|
68
|
+
|
69
|
+
|
70
|
+
class StreamingTranscriptionResponse_Error(UniversalBaseModel):
    """
    Streaming server message variant tagged with `type` = "error".

    Server messages carry transcription results, control messages, or errors; the `type` field tells them apart.
    """

    type: typing.Literal["error"] = "error"
    request_id: typing.Optional[str] = None
    message: str

    # Same settings (frozen, extra fields allowed) for whichever pydantic major version is installed.
    if not IS_PYDANTIC_V2:

        class Config:
            extra = pydantic.Extra.allow
            smart_union = True
            frozen = True

    else:
        model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(frozen=True, extra="allow")  # type: ignore # Pydantic v2
|
87
|
+
|
88
|
+
|
89
|
+
# Any one of the streaming message variants; each variant's `type` literal identifies it.
StreamingTranscriptionResponse = typing.Union[
    StreamingTranscriptionResponse_Transcript,  # type == "transcript"
    StreamingTranscriptionResponse_FlushDone,  # type == "flush_done"
    StreamingTranscriptionResponse_Done,  # type == "done"
    StreamingTranscriptionResponse_Error,  # type == "error"
]
|
@@ -0,0 +1,50 @@
|
|
1
|
+
# This file was auto-generated by Fern from our API Definition.
|
2
|
+
|
3
|
+
from ...core.pydantic_utilities import UniversalBaseModel
|
4
|
+
import pydantic
|
5
|
+
import typing
|
6
|
+
from .transcription_word import TranscriptionWord
|
7
|
+
from ...core.pydantic_utilities import IS_PYDANTIC_V2
|
8
|
+
|
9
|
+
|
10
|
+
class TranscriptMessage(UniversalBaseModel):
    """
    A transcription result (interim or final, see `is_final`) for a transcription session.
    """

    request_id: str = pydantic.Field()
    """
    Unique identifier of the transcription session.
    """

    text: str = pydantic.Field()
    """
    The transcribed text; partial or final depending on is_final.

    **Note**: Early responses may carry empty text while the system is still accumulating enough audio to transcribe. This is expected - wait for a response with non-empty text, or watch is_final for completion status.
    """

    is_final: bool = pydantic.Field()
    """
    True for a final transcription result, False for an interim one.
    """

    duration: typing.Optional[float] = pydantic.Field(default=None)
    """
    Duration, in seconds, of the audio transcribed so far.
    """

    language: typing.Optional[str] = pydantic.Field(default=None)
    """
    The specified language of the input audio.
    """

    words: typing.Optional[typing.List[TranscriptionWord]] = pydantic.Field(default=None)
    """
    Word-level timestamps giving each word's start and end time in seconds. Always included in streaming responses.
    """

    # Same settings (frozen, extra fields allowed) for whichever pydantic major version is installed.
    if not IS_PYDANTIC_V2:

        class Config:
            extra = pydantic.Extra.allow
            smart_union = True
            frozen = True

    else:
        model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(frozen=True, extra="allow")  # type: ignore # Pydantic v2
|
@@ -0,0 +1,38 @@
|
|
1
|
+
# This file was auto-generated by Fern from our API Definition.
|
2
|
+
|
3
|
+
from ...core.pydantic_utilities import UniversalBaseModel
|
4
|
+
import pydantic
|
5
|
+
import typing
|
6
|
+
from .transcription_word import TranscriptionWord
|
7
|
+
from ...core.pydantic_utilities import IS_PYDANTIC_V2
|
8
|
+
|
9
|
+
|
10
|
+
class TranscriptionResponse(UniversalBaseModel):
    """
    Transcription result: the text plus optional language, duration and word-level timestamps.
    """

    text: str = pydantic.Field()
    """
    The transcribed text.
    """

    language: typing.Optional[str] = pydantic.Field(default=None)
    """
    The specified language of the input audio.
    """

    duration: typing.Optional[float] = pydantic.Field(default=None)
    """
    Duration of the input audio, in seconds.
    """

    words: typing.Optional[typing.List[TranscriptionWord]] = pydantic.Field(default=None)
    """
    Word-level timestamps giving each word's start and end time. Only included when `[word]` is passed into `timestamp_granularities[]`.
    """

    # Same settings (frozen, extra fields allowed) for whichever pydantic major version is installed.
    if not IS_PYDANTIC_V2:

        class Config:
            extra = pydantic.Extra.allow
            smart_union = True
            frozen = True

    else:
        model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(frozen=True, extra="allow")  # type: ignore # Pydantic v2
|
@@ -0,0 +1,32 @@
|
|
1
|
+
# This file was auto-generated by Fern from our API Definition.
|
2
|
+
|
3
|
+
from ...core.pydantic_utilities import UniversalBaseModel
|
4
|
+
import pydantic
|
5
|
+
from ...core.pydantic_utilities import IS_PYDANTIC_V2
|
6
|
+
import typing
|
7
|
+
|
8
|
+
|
9
|
+
class TranscriptionWord(UniversalBaseModel):
    """
    A single transcribed word together with its start and end time in seconds.
    """

    word: str = pydantic.Field()
    """
    The transcribed word.
    """

    start: float = pydantic.Field()
    """
    Time at which the word starts, in seconds.
    """

    end: float = pydantic.Field()
    """
    Time at which the word ends, in seconds.
    """

    # Same settings (frozen, extra fields allowed) for whichever pydantic major version is installed.
    if not IS_PYDANTIC_V2:

        class Config:
            extra = pydantic.Extra.allow
            smart_union = True
            frozen = True

    else:
        model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(frozen=True, extra="allow")  # type: ignore # Pydantic v2
|
cartesia/tts/__init__.py
CHANGED
@@ -19,11 +19,13 @@ from .types import (
|
|
19
19
|
RawEncoding,
|
20
20
|
RawOutputFormat,
|
21
21
|
Speed,
|
22
|
+
SseOutputFormat,
|
22
23
|
SupportedLanguage,
|
23
24
|
TtsRequest,
|
24
25
|
TtsRequestEmbeddingSpecifier,
|
25
26
|
TtsRequestIdSpecifier,
|
26
27
|
TtsRequestVoiceSpecifier,
|
28
|
+
TtssseRequest,
|
27
29
|
WavOutputFormat,
|
28
30
|
WebSocketBaseResponse,
|
29
31
|
WebSocketChunkResponse,
|
@@ -58,10 +60,12 @@ from .requests import (
|
|
58
60
|
PhonemeTimestampsParams,
|
59
61
|
RawOutputFormatParams,
|
60
62
|
SpeedParams,
|
63
|
+
SseOutputFormatParams,
|
61
64
|
TtsRequestEmbeddingSpecifierParams,
|
62
65
|
TtsRequestIdSpecifierParams,
|
63
66
|
TtsRequestParams,
|
64
67
|
TtsRequestVoiceSpecifierParams,
|
68
|
+
TtssseRequestParams,
|
65
69
|
WavOutputFormatParams,
|
66
70
|
WebSocketBaseResponseParams,
|
67
71
|
WebSocketChunkResponseParams,
|
@@ -115,6 +119,8 @@ __all__ = [
|
|
115
119
|
"RawOutputFormatParams",
|
116
120
|
"Speed",
|
117
121
|
"SpeedParams",
|
122
|
+
"SseOutputFormat",
|
123
|
+
"SseOutputFormatParams",
|
118
124
|
"SupportedLanguage",
|
119
125
|
"TtsRequest",
|
120
126
|
"TtsRequestEmbeddingSpecifier",
|
@@ -124,6 +130,8 @@ __all__ = [
|
|
124
130
|
"TtsRequestParams",
|
125
131
|
"TtsRequestVoiceSpecifier",
|
126
132
|
"TtsRequestVoiceSpecifierParams",
|
133
|
+
"TtssseRequest",
|
134
|
+
"TtssseRequestParams",
|
127
135
|
"WavOutputFormat",
|
128
136
|
"WavOutputFormatParams",
|
129
137
|
"WebSocketBaseResponse",
|
cartesia/tts/client.py
CHANGED
@@ -10,6 +10,8 @@ from ..core.request_options import RequestOptions
|
|
10
10
|
from ..core.serialization import convert_and_respect_annotation_metadata
|
11
11
|
from json.decoder import JSONDecodeError
|
12
12
|
from ..core.api_error import ApiError
|
13
|
+
from .requests.sse_output_format import SseOutputFormatParams
|
14
|
+
from .types.context_id import ContextId
|
13
15
|
from .types.web_socket_response import WebSocketResponse
|
14
16
|
import httpx_sse
|
15
17
|
from ..core.pydantic_utilities import parse_obj_as
|
@@ -119,10 +121,14 @@ class TtsClient:
|
|
119
121
|
model_id: str,
|
120
122
|
transcript: str,
|
121
123
|
voice: TtsRequestVoiceSpecifierParams,
|
122
|
-
output_format:
|
124
|
+
output_format: SseOutputFormatParams,
|
123
125
|
language: typing.Optional[SupportedLanguage] = OMIT,
|
124
126
|
duration: typing.Optional[float] = OMIT,
|
125
127
|
speed: typing.Optional[ModelSpeed] = OMIT,
|
128
|
+
add_timestamps: typing.Optional[bool] = OMIT,
|
129
|
+
add_phoneme_timestamps: typing.Optional[bool] = OMIT,
|
130
|
+
use_normalized_timestamps: typing.Optional[bool] = OMIT,
|
131
|
+
context_id: typing.Optional[ContextId] = OMIT,
|
126
132
|
request_options: typing.Optional[RequestOptions] = None,
|
127
133
|
) -> typing.Iterator[WebSocketResponse]:
|
128
134
|
"""
|
@@ -135,7 +141,7 @@ class TtsClient:
|
|
135
141
|
|
136
142
|
voice : TtsRequestVoiceSpecifierParams
|
137
143
|
|
138
|
-
output_format :
|
144
|
+
output_format : SseOutputFormatParams
|
139
145
|
|
140
146
|
language : typing.Optional[SupportedLanguage]
|
141
147
|
|
@@ -145,6 +151,18 @@ class TtsClient:
|
|
145
151
|
|
146
152
|
speed : typing.Optional[ModelSpeed]
|
147
153
|
|
154
|
+
add_timestamps : typing.Optional[bool]
|
155
|
+
Whether to return word-level timestamps. If `false` (default), no word timestamps will be produced at all. If `true`, the server will return timestamp events containing word-level timing information.
|
156
|
+
|
157
|
+
add_phoneme_timestamps : typing.Optional[bool]
|
158
|
+
Whether to return phoneme-level timestamps. If `false` (default), no phoneme timestamps will be produced - if `add_timestamps` is `true`, the produced timestamps will be word timestamps instead. If `true`, the server will return timestamp events containing phoneme-level timing information.
|
159
|
+
|
160
|
+
use_normalized_timestamps : typing.Optional[bool]
|
161
|
+
Whether to use normalized timestamps (True) or original timestamps (False).
|
162
|
+
|
163
|
+
context_id : typing.Optional[ContextId]
|
164
|
+
Optional context ID for this request.
|
165
|
+
|
148
166
|
request_options : typing.Optional[RequestOptions]
|
149
167
|
Request-specific configuration.
|
150
168
|
|
@@ -165,9 +183,9 @@ class TtsClient:
|
|
165
183
|
voice={"mode": "id", "id": "694f9389-aac1-45b6-b726-9d9369183238"},
|
166
184
|
language="en",
|
167
185
|
output_format={
|
186
|
+
"container": "raw",
|
168
187
|
"sample_rate": 44100,
|
169
188
|
"encoding": "pcm_f32le",
|
170
|
-
"container": "raw",
|
171
189
|
},
|
172
190
|
)
|
173
191
|
for chunk in response:
|
@@ -184,10 +202,14 @@ class TtsClient:
|
|
184
202
|
),
|
185
203
|
"language": language,
|
186
204
|
"output_format": convert_and_respect_annotation_metadata(
|
187
|
-
object_=output_format, annotation=
|
205
|
+
object_=output_format, annotation=SseOutputFormatParams, direction="write"
|
188
206
|
),
|
189
207
|
"duration": duration,
|
190
208
|
"speed": speed,
|
209
|
+
"add_timestamps": add_timestamps,
|
210
|
+
"add_phoneme_timestamps": add_phoneme_timestamps,
|
211
|
+
"use_normalized_timestamps": use_normalized_timestamps,
|
212
|
+
"context_id": context_id,
|
191
213
|
},
|
192
214
|
request_options=request_options,
|
193
215
|
omit=OMIT,
|
@@ -321,10 +343,14 @@ class AsyncTtsClient:
|
|
321
343
|
model_id: str,
|
322
344
|
transcript: str,
|
323
345
|
voice: TtsRequestVoiceSpecifierParams,
|
324
|
-
output_format:
|
346
|
+
output_format: SseOutputFormatParams,
|
325
347
|
language: typing.Optional[SupportedLanguage] = OMIT,
|
326
348
|
duration: typing.Optional[float] = OMIT,
|
327
349
|
speed: typing.Optional[ModelSpeed] = OMIT,
|
350
|
+
add_timestamps: typing.Optional[bool] = OMIT,
|
351
|
+
add_phoneme_timestamps: typing.Optional[bool] = OMIT,
|
352
|
+
use_normalized_timestamps: typing.Optional[bool] = OMIT,
|
353
|
+
context_id: typing.Optional[ContextId] = OMIT,
|
328
354
|
request_options: typing.Optional[RequestOptions] = None,
|
329
355
|
) -> typing.AsyncIterator[WebSocketResponse]:
|
330
356
|
"""
|
@@ -337,7 +363,7 @@ class AsyncTtsClient:
|
|
337
363
|
|
338
364
|
voice : TtsRequestVoiceSpecifierParams
|
339
365
|
|
340
|
-
output_format :
|
366
|
+
output_format : SseOutputFormatParams
|
341
367
|
|
342
368
|
language : typing.Optional[SupportedLanguage]
|
343
369
|
|
@@ -347,6 +373,18 @@ class AsyncTtsClient:
|
|
347
373
|
|
348
374
|
speed : typing.Optional[ModelSpeed]
|
349
375
|
|
376
|
+
add_timestamps : typing.Optional[bool]
|
377
|
+
Whether to return word-level timestamps. If `false` (default), no word timestamps will be produced at all. If `true`, the server will return timestamp events containing word-level timing information.
|
378
|
+
|
379
|
+
add_phoneme_timestamps : typing.Optional[bool]
|
380
|
+
Whether to return phoneme-level timestamps. If `false` (default), no phoneme timestamps will be produced - if `add_timestamps` is `true`, the produced timestamps will be word timestamps instead. If `true`, the server will return timestamp events containing phoneme-level timing information.
|
381
|
+
|
382
|
+
use_normalized_timestamps : typing.Optional[bool]
|
383
|
+
Whether to use normalized timestamps (True) or original timestamps (False).
|
384
|
+
|
385
|
+
context_id : typing.Optional[ContextId]
|
386
|
+
Optional context ID for this request.
|
387
|
+
|
350
388
|
request_options : typing.Optional[RequestOptions]
|
351
389
|
Request-specific configuration.
|
352
390
|
|
@@ -372,9 +410,9 @@ class AsyncTtsClient:
|
|
372
410
|
voice={"mode": "id", "id": "694f9389-aac1-45b6-b726-9d9369183238"},
|
373
411
|
language="en",
|
374
412
|
output_format={
|
413
|
+
"container": "raw",
|
375
414
|
"sample_rate": 44100,
|
376
415
|
"encoding": "pcm_f32le",
|
377
|
-
"container": "raw",
|
378
416
|
},
|
379
417
|
)
|
380
418
|
async for chunk in response:
|
@@ -394,10 +432,14 @@ class AsyncTtsClient:
|
|
394
432
|
),
|
395
433
|
"language": language,
|
396
434
|
"output_format": convert_and_respect_annotation_metadata(
|
397
|
-
object_=output_format, annotation=
|
435
|
+
object_=output_format, annotation=SseOutputFormatParams, direction="write"
|
398
436
|
),
|
399
437
|
"duration": duration,
|
400
438
|
"speed": speed,
|
439
|
+
"add_timestamps": add_timestamps,
|
440
|
+
"add_phoneme_timestamps": add_phoneme_timestamps,
|
441
|
+
"use_normalized_timestamps": use_normalized_timestamps,
|
442
|
+
"context_id": context_id,
|
401
443
|
},
|
402
444
|
request_options=request_options,
|
403
445
|
omit=OMIT,
|