cartesia 2.0.5__py3-none-any.whl → 2.0.7__py3-none-any.whl
This diff compares two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the package contents exactly as they appear in the public registry.
- cartesia/__init__.py +22 -0
- cartesia/auth/client.py +8 -8
- cartesia/auth/requests/token_grant.py +7 -1
- cartesia/auth/requests/token_request.py +3 -3
- cartesia/auth/types/token_grant.py +7 -2
- cartesia/auth/types/token_request.py +3 -3
- cartesia/core/client_wrapper.py +1 -1
- cartesia/infill/client.py +0 -8
- cartesia/stt/__init__.py +6 -0
- cartesia/stt/_async_websocket.py +81 -72
- cartesia/stt/_websocket.py +42 -20
- cartesia/stt/client.py +450 -0
- cartesia/stt/requests/__init__.py +2 -0
- cartesia/stt/requests/streaming_transcription_response.py +2 -0
- cartesia/stt/requests/transcript_message.py +8 -1
- cartesia/stt/requests/transcription_response.py +8 -1
- cartesia/stt/requests/transcription_word.py +20 -0
- cartesia/stt/socket_client.py +52 -109
- cartesia/stt/types/__init__.py +4 -0
- cartesia/stt/types/streaming_transcription_response.py +2 -0
- cartesia/stt/types/stt_encoding.py +3 -1
- cartesia/stt/types/timestamp_granularity.py +5 -0
- cartesia/stt/types/transcript_message.py +7 -1
- cartesia/stt/types/transcription_response.py +7 -1
- cartesia/stt/types/transcription_word.py +32 -0
- cartesia/tts/__init__.py +16 -0
- cartesia/tts/client.py +63 -8
- cartesia/tts/requests/__init__.py +8 -0
- cartesia/tts/requests/experimental_model_controls.py +17 -0
- cartesia/tts/requests/generation_config.py +23 -0
- cartesia/tts/requests/generation_request.py +4 -4
- cartesia/tts/requests/sse_output_format.py +11 -0
- cartesia/tts/requests/tts_request.py +2 -0
- cartesia/tts/requests/ttssse_request.py +47 -0
- cartesia/tts/requests/web_socket_chunk_response.py +0 -3
- cartesia/tts/requests/web_socket_response.py +1 -2
- cartesia/tts/requests/web_socket_tts_request.py +9 -1
- cartesia/tts/types/__init__.py +8 -0
- cartesia/tts/types/experimental_model_controls.py +28 -0
- cartesia/tts/types/generation_config.py +34 -0
- cartesia/tts/types/generation_request.py +4 -4
- cartesia/tts/types/sse_output_format.py +22 -0
- cartesia/tts/types/tts_request.py +2 -0
- cartesia/tts/types/ttssse_request.py +58 -0
- cartesia/tts/types/web_socket_chunk_response.py +1 -3
- cartesia/tts/types/web_socket_response.py +1 -2
- cartesia/tts/types/web_socket_tts_request.py +11 -3
- cartesia/voice_changer/client.py +0 -8
- cartesia/voice_changer/requests/streaming_response.py +0 -2
- cartesia/voice_changer/types/streaming_response.py +0 -2
- cartesia/voices/client.py +0 -12
- cartesia-2.0.7.dist-info/LICENSE +201 -0
- {cartesia-2.0.5.dist-info → cartesia-2.0.7.dist-info}/METADATA +116 -17
- {cartesia-2.0.5.dist-info → cartesia-2.0.7.dist-info}/RECORD +55 -42
- {cartesia-2.0.5.dist-info → cartesia-2.0.7.dist-info}/WHEEL +1 -1
@@ -0,0 +1,20 @@
+# This file was auto-generated by Fern from our API Definition.
+
+import typing_extensions
+
+
+class TranscriptionWordParams(typing_extensions.TypedDict):
+    word: str
+    """
+    The transcribed word.
+    """
+
+    start: float
+    """
+    Start time of the word in seconds.
+    """
+
+    end: float
+    """
+    End time of the word in seconds.
+    """
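The request-side type is a plain `typing_extensions.TypedDict`, so word-timestamp entries can be written as ordinary dict literals. A minimal sketch (the sample values are illustrative, and the import path assumes the accompanying `cartesia/stt/requests/__init__.py` re-export):

from cartesia.stt.requests import TranscriptionWordParams

# A word timestamp is just a dict matching the TypedDict shape.
word: TranscriptionWordParams = {
    "word": "hello",  # the transcribed word
    "start": 0.12,    # start time in seconds
    "end": 0.48,      # end time in seconds
}
print(word["end"] - word["start"])  # per-word duration in seconds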
cartesia/stt/socket_client.py
CHANGED
@@ -1,18 +1,20 @@
 import typing
-from typing import Any, Dict, Generator, Optional
+from typing import Any, Dict, Generator, Optional

 from ..core.client_wrapper import AsyncClientWrapper, SyncClientWrapper
 from ._async_websocket import AsyncSttWebsocket
 from ._websocket import SttWebsocket
+from .client import AsyncSttClient, SttClient
+from .types.stt_encoding import SttEncoding


-class SttClientWithWebsocket:
+class SttClientWithWebsocket(SttClient):
     """
     Extension of STT functionality that supports a synchronous WebSocket STT connection.
     """

     def __init__(self, *, client_wrapper: SyncClientWrapper):
-
+        super().__init__(client_wrapper=client_wrapper)

     def _ws_url(self):
         base_url = self._client_wrapper.get_base_url()
@@ -23,21 +25,34 @@ class SttClientWithWebsocket:
         base_url_without_protocol = base_url.split("://")[-1]
         return f"{prefix}://{base_url_without_protocol}"

-    def websocket(
-
-
-
-
+    def websocket(
+        self,
+        *,
+        model: str = "ink-whisper",
+        language: Optional[str] = "en",
+        encoding: SttEncoding = "pcm_s16le",
+        sample_rate: int = 16000,
+        min_volume: Optional[float] = None,
+        max_silence_duration_secs: Optional[float] = None,
+    ):
         """Create a WebSocket connection for real-time speech transcription.

         Args:
             model: ID of the model to use for transcription
             language: The language of the input audio in ISO-639-1 format
-            encoding: The encoding format of the audio data
-            sample_rate: The sample rate of the audio in Hz
+            encoding: The encoding format of the audio data (required)
+            sample_rate: The sample rate of the audio in Hz (required)
+            min_volume: Volume threshold for voice activity detection (0.0-1.0)
+            max_silence_duration_secs: Maximum duration of silence before endpointing

         Returns:
             SttWebsocket: A connected WebSocket client for STT operations.
+
+        Example:
+            >>> client = Cartesia(api_key="your-api-key")
+            >>> ws = client.stt.websocket()
+            >>> for result in ws.transcribe(audio_chunks):
+            ...     print(result["text"])
         """
         client_headers = self._client_wrapper.get_headers()
         ws = SttWebsocket(
@@ -51,61 +66,19 @@ class SttClientWithWebsocket:
             language=language,
             encoding=encoding,
             sample_rate=sample_rate,
+            min_volume=min_volume,
+            max_silence_duration_secs=max_silence_duration_secs,
         )
         return ws

-    def transcribe(
-        self,
-        audio_chunks: typing.Iterator[bytes],
-        *,
-        model: str = "ink-whisper",
-        language: Optional[str] = "en",
-        encoding: Optional[str] = "pcm_s16le",
-        sample_rate: int = 16000,
-    ) -> Generator[Dict[str, Any], None, None]:
-        """Transcribe audio chunks using WebSocket.

-        Args:
-            audio_chunks: Iterator of audio chunks as bytes
-            model: ID of the model to use for transcription
-            language: The language of the input audio in ISO-639-1 format
-            encoding: The encoding format of the audio data
-            sample_rate: The sample rate of the audio in Hz
-
-        Yields:
-            Dictionary containing transcription results, flush_done, done, or error messages
-
-        Example:
-            >>> client = Cartesia(api_key="your-api-key")
-            >>> ws_client = client.stt.websocket()
-            >>> for result in ws_client.transcribe(audio_chunks):
-            ...     print(result["text"])
-        """
-        ws = self.websocket(
-            model=model,
-            language=language,
-            encoding=encoding,
-            sample_rate=sample_rate,
-        )
-        try:
-            yield from ws.transcribe(
-                audio_chunks,
-                model=model,
-                language=language,
-                encoding=encoding,
-                sample_rate=sample_rate,
-            )
-        finally:
-            ws.close()
-
-
-class AsyncSttClientWithWebsocket:
+class AsyncSttClientWithWebsocket(AsyncSttClient):
     """
     Extension of STT functionality that supports an asynchronous WebSocket STT connection.
     """

     def __init__(self, *, client_wrapper: AsyncClientWrapper, get_session):
-
+        super().__init__(client_wrapper=client_wrapper)
         self._get_session = get_session

     def _ws_url(self) -> str:
@@ -117,21 +90,34 @@ class AsyncSttClientWithWebsocket:
         base_url_without_protocol = base_url.split("://")[-1]
         return f"{prefix}://{base_url_without_protocol}"

-    async def websocket(
-
-
-
-
+    async def websocket(
+        self,
+        *,
+        model: str = "ink-whisper",
+        language: Optional[str] = "en",
+        encoding: SttEncoding = "pcm_s16le",
+        sample_rate: int = 16000,
+        min_volume: Optional[float] = None,
+        max_silence_duration_secs: Optional[float] = None,
+    ):
         """Create an async WebSocket connection for real-time speech transcription.

         Args:
             model: ID of the model to use for transcription
             language: The language of the input audio in ISO-639-1 format
-            encoding: The encoding format of the audio data
-            sample_rate: The sample rate of the audio in Hz
+            encoding: The encoding format of the audio data (required)
+            sample_rate: The sample rate of the audio in Hz (required)
+            min_volume: Volume threshold for voice activity detection (0.0-1.0)
+            max_silence_duration_secs: Maximum duration of silence before endpointing

         Returns:
             AsyncSttWebsocket: A connected async WebSocket client for STT operations.
+
+        Example:
+            >>> client = AsyncCartesia(api_key="your-api-key")
+            >>> ws = await client.stt.websocket()
+            >>> async for result in ws.transcribe(audio_chunks):
+            ...     print(result["text"])
         """
         client_headers = self._client_wrapper.get_headers()
         ws = AsyncSttWebsocket(
@@ -146,50 +132,7 @@ class AsyncSttClientWithWebsocket:
             language=language,
             encoding=encoding,
             sample_rate=sample_rate,
+            min_volume=min_volume,
+            max_silence_duration_secs=max_silence_duration_secs,
         )
-        return ws
-
-    async def transcribe(
-        self,
-        audio_chunks: typing.AsyncIterator[bytes],
-        *,
-        model: str = "ink-whisper",
-        language: Optional[str] = "en",
-        encoding: Optional[str] = "pcm_s16le",
-        sample_rate: int = 16000,
-    ) -> typing.AsyncGenerator[Dict[str, Any], None]:
-        """Transcribe audio chunks using async WebSocket.
-
-        Args:
-            audio_chunks: Async iterator of audio chunks as bytes
-            model: ID of the model to use for transcription
-            language: The language of the input audio in ISO-639-1 format
-            encoding: The encoding format of the audio data
-            sample_rate: The sample rate of the audio in Hz
-
-        Yields:
-            Dictionary containing transcription results, flush_done, done, or error messages
-
-        Example:
-            >>> client = AsyncCartesia(api_key="your-api-key")
-            >>> ws_client = await client.stt.websocket()
-            >>> async for result in ws_client.transcribe(audio_chunks):
-            ...     print(result["text"])
-        """
-        ws = await self.websocket(
-            model=model,
-            language=language,
-            encoding=encoding,
-            sample_rate=sample_rate,
-        )
-        try:
-            async for result in ws.transcribe(
-                audio_chunks,
-                model=model,
-                language=language,
-                encoding=encoding,
-                sample_rate=sample_rate,
-            ):
-                yield result
-        finally:
-            await ws.close()
+        return ws
cartesia/stt/types/__init__.py
CHANGED
@@ -11,8 +11,10 @@ from .streaming_transcription_response import (
     StreamingTranscriptionResponse_Transcript,
 )
 from .stt_encoding import SttEncoding
+from .timestamp_granularity import TimestampGranularity
 from .transcript_message import TranscriptMessage
 from .transcription_response import TranscriptionResponse
+from .transcription_word import TranscriptionWord

 __all__ = [
     "DoneMessage",
@@ -24,6 +26,8 @@ __all__ = [
     "StreamingTranscriptionResponse_FlushDone",
     "StreamingTranscriptionResponse_Transcript",
     "SttEncoding",
+    "TimestampGranularity",
     "TranscriptMessage",
     "TranscriptionResponse",
+    "TranscriptionWord",
 ]
cartesia/stt/types/streaming_transcription_response.py
CHANGED
@@ -3,6 +3,7 @@
 from __future__ import annotations
 from ...core.pydantic_utilities import UniversalBaseModel
 import typing
+from .transcription_word import TranscriptionWord
 from ...core.pydantic_utilities import IS_PYDANTIC_V2
 import pydantic

@@ -18,6 +19,7 @@ class StreamingTranscriptionResponse_Transcript(UniversalBaseModel):
     is_final: bool
     duration: typing.Optional[float] = None
     language: typing.Optional[str] = None
+    words: typing.Optional[typing.List[TranscriptionWord]] = None

     if IS_PYDANTIC_V2:
         model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True)  # type: ignore # Pydantic v2
cartesia/stt/types/transcript_message.py
CHANGED
@@ -3,6 +3,7 @@
 from ...core.pydantic_utilities import UniversalBaseModel
 import pydantic
 import typing
+from .transcription_word import TranscriptionWord
 from ...core.pydantic_utilities import IS_PYDANTIC_V2


@@ -31,7 +32,12 @@ class TranscriptMessage(UniversalBaseModel):

     language: typing.Optional[str] = pydantic.Field(default=None)
     """
-    The
+    The specified language of the input audio.
+    """
+
+    words: typing.Optional[typing.List[TranscriptionWord]] = pydantic.Field(default=None)
+    """
+    Word-level timestamps showing the start and end time of each word in seconds. Always included in streaming responses.
     """

     if IS_PYDANTIC_V2:
cartesia/stt/types/transcription_response.py
CHANGED
@@ -3,6 +3,7 @@
 from ...core.pydantic_utilities import UniversalBaseModel
 import pydantic
 import typing
+from .transcription_word import TranscriptionWord
 from ...core.pydantic_utilities import IS_PYDANTIC_V2


@@ -14,7 +15,7 @@ class TranscriptionResponse(UniversalBaseModel):

     language: typing.Optional[str] = pydantic.Field(default=None)
     """
-    The
+    The specified language of the input audio.
     """

     duration: typing.Optional[float] = pydantic.Field(default=None)
@@ -22,6 +23,11 @@
     The duration of the input audio in seconds.
     """

+    words: typing.Optional[typing.List[TranscriptionWord]] = pydantic.Field(default=None)
+    """
+    Word-level timestamps showing the start and end time of each word. Only included when `[word]` is passed into `timestamp_granularities[]`.
+    """
+
     if IS_PYDANTIC_V2:
         model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True)  # type: ignore # Pydantic v2
     else:
cartesia/stt/types/transcription_word.py
ADDED
@@ -0,0 +1,32 @@
+# This file was auto-generated by Fern from our API Definition.
+
+from ...core.pydantic_utilities import UniversalBaseModel
+import pydantic
+from ...core.pydantic_utilities import IS_PYDANTIC_V2
+import typing
+
+
+class TranscriptionWord(UniversalBaseModel):
+    word: str = pydantic.Field()
+    """
+    The transcribed word.
+    """
+
+    start: float = pydantic.Field()
+    """
+    Start time of the word in seconds.
+    """
+
+    end: float = pydantic.Field()
+    """
+    End time of the word in seconds.
+    """
+
+    if IS_PYDANTIC_V2:
+        model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True)  # type: ignore # Pydantic v2
+    else:
+
+        class Config:
+            frozen = True
+            smart_union = True
+            extra = pydantic.Extra.allow
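`TranscriptionWord` is the response-side counterpart of the `TranscriptionWordParams` TypedDict: a frozen pydantic model with required `word`, `start`, and `end` fields. A small sketch of consuming the new `words` lists (the sample values are illustrative):

from cartesia.stt.types import TranscriptionWord

words = [
    TranscriptionWord(word="hello", start=0.12, end=0.48),
    TranscriptionWord(word="world", start=0.55, end=0.98),
]

# Instances are frozen; fields are plain attributes.
for w in words:
    print(f"{w.word}: {w.end - w.start:.2f}s")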
cartesia/tts/__init__.py
CHANGED
@@ -5,7 +5,9 @@ from .types import (
     ContextId,
     Controls,
     Emotion,
+    ExperimentalModelControls,
     FlushId,
+    GenerationConfig,
     GenerationRequest,
     ModelSpeed,
     Mp3OutputFormat,
@@ -19,11 +21,13 @@ from .types import (
     RawEncoding,
     RawOutputFormat,
     Speed,
+    SseOutputFormat,
     SupportedLanguage,
     TtsRequest,
     TtsRequestEmbeddingSpecifier,
     TtsRequestIdSpecifier,
     TtsRequestVoiceSpecifier,
+    TtssseRequest,
     WavOutputFormat,
     WebSocketBaseResponse,
     WebSocketChunkResponse,
@@ -49,6 +53,8 @@ from .types import (
 from .requests import (
     CancelContextRequestParams,
     ControlsParams,
+    ExperimentalModelControlsParams,
+    GenerationConfigParams,
     GenerationRequestParams,
     Mp3OutputFormatParams,
     OutputFormatParams,
@@ -58,10 +64,12 @@ from .requests import (
     PhonemeTimestampsParams,
     RawOutputFormatParams,
     SpeedParams,
+    SseOutputFormatParams,
     TtsRequestEmbeddingSpecifierParams,
     TtsRequestIdSpecifierParams,
     TtsRequestParams,
     TtsRequestVoiceSpecifierParams,
+    TtssseRequestParams,
     WavOutputFormatParams,
     WebSocketBaseResponseParams,
     WebSocketChunkResponseParams,
@@ -92,7 +100,11 @@ __all__ = [
     "Controls",
     "ControlsParams",
     "Emotion",
+    "ExperimentalModelControls",
+    "ExperimentalModelControlsParams",
     "FlushId",
+    "GenerationConfig",
+    "GenerationConfigParams",
     "GenerationRequest",
     "GenerationRequestParams",
     "ModelSpeed",
@@ -115,6 +127,8 @@ __all__ = [
     "RawOutputFormatParams",
     "Speed",
     "SpeedParams",
+    "SseOutputFormat",
+    "SseOutputFormatParams",
     "SupportedLanguage",
     "TtsRequest",
     "TtsRequestEmbeddingSpecifier",
@@ -124,6 +138,8 @@ __all__ = [
     "TtsRequestParams",
     "TtsRequestVoiceSpecifier",
     "TtsRequestVoiceSpecifierParams",
+    "TtssseRequest",
+    "TtssseRequestParams",
     "WavOutputFormat",
     "WavOutputFormatParams",
     "WebSocketBaseResponse",
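The net effect on the public surface is a set of new names re-exported from `cartesia.tts`; per the `__all__` additions above, all of the following imports resolve in 2.0.7:

from cartesia.tts import (
    ExperimentalModelControls,
    ExperimentalModelControlsParams,
    GenerationConfig,
    GenerationConfigParams,
    SseOutputFormat,
    SseOutputFormatParams,
    TtssseRequest,
    TtssseRequestParams,
)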
cartesia/tts/client.py
CHANGED
@@ -6,10 +6,13 @@ from .requests.tts_request_voice_specifier import TtsRequestVoiceSpecifierParams
 from .requests.output_format import OutputFormatParams
 from .types.supported_language import SupportedLanguage
 from .types.model_speed import ModelSpeed
+from .requests.generation_config import GenerationConfigParams
 from ..core.request_options import RequestOptions
 from ..core.serialization import convert_and_respect_annotation_metadata
 from json.decoder import JSONDecodeError
 from ..core.api_error import ApiError
+from .requests.sse_output_format import SseOutputFormatParams
+from .types.context_id import ContextId
 from .types.web_socket_response import WebSocketResponse
 import httpx_sse
 from ..core.pydantic_utilities import parse_obj_as
@@ -34,6 +37,7 @@ class TtsClient:
         language: typing.Optional[SupportedLanguage] = OMIT,
         duration: typing.Optional[float] = OMIT,
         speed: typing.Optional[ModelSpeed] = OMIT,
+        generation_config: typing.Optional[GenerationConfigParams] = OMIT,
         request_options: typing.Optional[RequestOptions] = None,
     ) -> typing.Iterator[bytes]:
         """
@@ -56,6 +60,8 @@ class TtsClient:

         speed : typing.Optional[ModelSpeed]

+        generation_config : typing.Optional[GenerationConfigParams]
+
         request_options : typing.Optional[RequestOptions]
             Request-specific configuration. You can pass in configuration such as `chunk_size`, and more to customize the request and response.

@@ -97,6 +103,9 @@ class TtsClient:
                 ),
                 "duration": duration,
                 "speed": speed,
+                "generation_config": convert_and_respect_annotation_metadata(
+                    object_=generation_config, annotation=GenerationConfigParams, direction="write"
+                ),
             },
             request_options=request_options,
             omit=OMIT,
@@ -119,10 +128,14 @@ class TtsClient:
         model_id: str,
         transcript: str,
         voice: TtsRequestVoiceSpecifierParams,
-        output_format:
+        output_format: SseOutputFormatParams,
         language: typing.Optional[SupportedLanguage] = OMIT,
         duration: typing.Optional[float] = OMIT,
         speed: typing.Optional[ModelSpeed] = OMIT,
+        add_timestamps: typing.Optional[bool] = OMIT,
+        add_phoneme_timestamps: typing.Optional[bool] = OMIT,
+        use_normalized_timestamps: typing.Optional[bool] = OMIT,
+        context_id: typing.Optional[ContextId] = OMIT,
         request_options: typing.Optional[RequestOptions] = None,
     ) -> typing.Iterator[WebSocketResponse]:
         """
@@ -135,7 +148,7 @@ class TtsClient:

         voice : TtsRequestVoiceSpecifierParams

-        output_format :
+        output_format : SseOutputFormatParams

         language : typing.Optional[SupportedLanguage]

@@ -145,6 +158,18 @@ class TtsClient:

         speed : typing.Optional[ModelSpeed]

+        add_timestamps : typing.Optional[bool]
+            Whether to return word-level timestamps. If `false` (default), no word timestamps will be produced at all. If `true`, the server will return timestamp events containing word-level timing information.
+
+        add_phoneme_timestamps : typing.Optional[bool]
+            Whether to return phoneme-level timestamps. If `false` (default), no phoneme timestamps will be produced - if `add_timestamps` is `true`, the produced timestamps will be word timestamps instead. If `true`, the server will return timestamp events containing phoneme-level timing information.
+
+        use_normalized_timestamps : typing.Optional[bool]
+            Whether to use normalized timestamps (True) or original timestamps (False).
+
+        context_id : typing.Optional[ContextId]
+            Optional context ID for this request.
+
         request_options : typing.Optional[RequestOptions]
             Request-specific configuration.

@@ -165,9 +190,9 @@ class TtsClient:
             voice={"mode": "id", "id": "694f9389-aac1-45b6-b726-9d9369183238"},
             language="en",
             output_format={
+                "container": "raw",
                 "sample_rate": 44100,
                 "encoding": "pcm_f32le",
-                "container": "raw",
             },
         )
         for chunk in response:
@@ -184,10 +209,14 @@ class TtsClient:
                 ),
                 "language": language,
                 "output_format": convert_and_respect_annotation_metadata(
-                    object_=output_format, annotation=
+                    object_=output_format, annotation=SseOutputFormatParams, direction="write"
                 ),
                 "duration": duration,
                 "speed": speed,
+                "add_timestamps": add_timestamps,
+                "add_phoneme_timestamps": add_phoneme_timestamps,
+                "use_normalized_timestamps": use_normalized_timestamps,
+                "context_id": context_id,
             },
             request_options=request_options,
             omit=OMIT,
@@ -228,6 +257,7 @@ class AsyncTtsClient:
         language: typing.Optional[SupportedLanguage] = OMIT,
         duration: typing.Optional[float] = OMIT,
         speed: typing.Optional[ModelSpeed] = OMIT,
+        generation_config: typing.Optional[GenerationConfigParams] = OMIT,
         request_options: typing.Optional[RequestOptions] = None,
     ) -> typing.AsyncIterator[bytes]:
         """
@@ -250,6 +280,8 @@ class AsyncTtsClient:

         speed : typing.Optional[ModelSpeed]

+        generation_config : typing.Optional[GenerationConfigParams]
+
         request_options : typing.Optional[RequestOptions]
             Request-specific configuration. You can pass in configuration such as `chunk_size`, and more to customize the request and response.

@@ -299,6 +331,9 @@ class AsyncTtsClient:
                 ),
                 "duration": duration,
                 "speed": speed,
+                "generation_config": convert_and_respect_annotation_metadata(
+                    object_=generation_config, annotation=GenerationConfigParams, direction="write"
+                ),
             },
             request_options=request_options,
             omit=OMIT,
@@ -321,10 +356,14 @@ class AsyncTtsClient:
         model_id: str,
         transcript: str,
         voice: TtsRequestVoiceSpecifierParams,
-        output_format:
+        output_format: SseOutputFormatParams,
         language: typing.Optional[SupportedLanguage] = OMIT,
         duration: typing.Optional[float] = OMIT,
         speed: typing.Optional[ModelSpeed] = OMIT,
+        add_timestamps: typing.Optional[bool] = OMIT,
+        add_phoneme_timestamps: typing.Optional[bool] = OMIT,
+        use_normalized_timestamps: typing.Optional[bool] = OMIT,
+        context_id: typing.Optional[ContextId] = OMIT,
         request_options: typing.Optional[RequestOptions] = None,
     ) -> typing.AsyncIterator[WebSocketResponse]:
         """
@@ -337,7 +376,7 @@ class AsyncTtsClient:

         voice : TtsRequestVoiceSpecifierParams

-        output_format :
+        output_format : SseOutputFormatParams

         language : typing.Optional[SupportedLanguage]

@@ -347,6 +386,18 @@ class AsyncTtsClient:

         speed : typing.Optional[ModelSpeed]

+        add_timestamps : typing.Optional[bool]
+            Whether to return word-level timestamps. If `false` (default), no word timestamps will be produced at all. If `true`, the server will return timestamp events containing word-level timing information.
+
+        add_phoneme_timestamps : typing.Optional[bool]
+            Whether to return phoneme-level timestamps. If `false` (default), no phoneme timestamps will be produced - if `add_timestamps` is `true`, the produced timestamps will be word timestamps instead. If `true`, the server will return timestamp events containing phoneme-level timing information.
+
+        use_normalized_timestamps : typing.Optional[bool]
+            Whether to use normalized timestamps (True) or original timestamps (False).
+
+        context_id : typing.Optional[ContextId]
+            Optional context ID for this request.
+
         request_options : typing.Optional[RequestOptions]
             Request-specific configuration.

@@ -372,9 +423,9 @@ class AsyncTtsClient:
             voice={"mode": "id", "id": "694f9389-aac1-45b6-b726-9d9369183238"},
             language="en",
             output_format={
+                "container": "raw",
                 "sample_rate": 44100,
                 "encoding": "pcm_f32le",
-                "container": "raw",
             },
         )
         async for chunk in response:
@@ -394,10 +445,14 @@ class AsyncTtsClient:
                 ),
                 "language": language,
                 "output_format": convert_and_respect_annotation_metadata(
-                    object_=output_format, annotation=
+                    object_=output_format, annotation=SseOutputFormatParams, direction="write"
                 ),
                 "duration": duration,
                 "speed": speed,
+                "add_timestamps": add_timestamps,
+                "add_phoneme_timestamps": add_phoneme_timestamps,
+                "use_normalized_timestamps": use_normalized_timestamps,
+                "context_id": context_id,
             },
             request_options=request_options,
             omit=OMIT,