cartesia 2.0.5__py3-none-any.whl → 2.0.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cartesia/__init__.py +14 -0
- cartesia/auth/client.py +8 -8
- cartesia/auth/requests/token_grant.py +7 -1
- cartesia/auth/requests/token_request.py +3 -3
- cartesia/auth/types/token_grant.py +7 -2
- cartesia/auth/types/token_request.py +3 -3
- cartesia/core/client_wrapper.py +1 -1
- cartesia/stt/__init__.py +6 -0
- cartesia/stt/_async_websocket.py +81 -72
- cartesia/stt/_websocket.py +42 -20
- cartesia/stt/client.py +456 -0
- cartesia/stt/requests/__init__.py +2 -0
- cartesia/stt/requests/streaming_transcription_response.py +2 -0
- cartesia/stt/requests/transcript_message.py +8 -1
- cartesia/stt/requests/transcription_response.py +8 -1
- cartesia/stt/requests/transcription_word.py +20 -0
- cartesia/stt/socket_client.py +52 -109
- cartesia/stt/types/__init__.py +4 -0
- cartesia/stt/types/streaming_transcription_response.py +2 -0
- cartesia/stt/types/stt_encoding.py +3 -1
- cartesia/stt/types/timestamp_granularity.py +5 -0
- cartesia/stt/types/transcript_message.py +7 -1
- cartesia/stt/types/transcription_response.py +7 -1
- cartesia/stt/types/transcription_word.py +32 -0
- cartesia/tts/__init__.py +8 -0
- cartesia/tts/client.py +50 -8
- cartesia/tts/requests/__init__.py +4 -0
- cartesia/tts/requests/generation_request.py +4 -4
- cartesia/tts/requests/sse_output_format.py +11 -0
- cartesia/tts/requests/ttssse_request.py +47 -0
- cartesia/tts/requests/web_socket_chunk_response.py +0 -3
- cartesia/tts/requests/web_socket_response.py +1 -2
- cartesia/tts/requests/web_socket_tts_request.py +9 -1
- cartesia/tts/types/__init__.py +4 -0
- cartesia/tts/types/generation_request.py +4 -4
- cartesia/tts/types/sse_output_format.py +22 -0
- cartesia/tts/types/ttssse_request.py +58 -0
- cartesia/tts/types/web_socket_chunk_response.py +1 -3
- cartesia/tts/types/web_socket_response.py +1 -2
- cartesia/tts/types/web_socket_tts_request.py +11 -3
- cartesia/voice_changer/requests/streaming_response.py +0 -2
- cartesia/voice_changer/types/streaming_response.py +0 -2
- {cartesia-2.0.5.dist-info → cartesia-2.0.6.dist-info}/METADATA +113 -16
- {cartesia-2.0.5.dist-info → cartesia-2.0.6.dist-info}/RECORD +45 -37
- {cartesia-2.0.5.dist-info → cartesia-2.0.6.dist-info}/WHEEL +0 -0
cartesia/stt/socket_client.py
CHANGED
```diff
@@ -1,18 +1,20 @@
 import typing
-from typing import Any, Dict, Generator, Optional
+from typing import Any, Dict, Generator, Optional
 
 from ..core.client_wrapper import AsyncClientWrapper, SyncClientWrapper
 from ._async_websocket import AsyncSttWebsocket
 from ._websocket import SttWebsocket
+from .client import AsyncSttClient, SttClient
+from .types.stt_encoding import SttEncoding
 
 
-class SttClientWithWebsocket:
+class SttClientWithWebsocket(SttClient):
     """
     Extension of STT functionality that supports a synchronous WebSocket STT connection.
     """
 
     def __init__(self, *, client_wrapper: SyncClientWrapper):
-        self._client_wrapper = client_wrapper
+        super().__init__(client_wrapper=client_wrapper)
 
     def _ws_url(self):
        base_url = self._client_wrapper.get_base_url()
@@ -23,21 +25,34 @@ class SttClientWithWebsocket:
         base_url_without_protocol = base_url.split("://")[-1]
         return f"{prefix}://{base_url_without_protocol}"
 
-    def websocket(
-
-
-
-
+    def websocket(
+        self,
+        *,
+        model: str = "ink-whisper",
+        language: Optional[str] = "en",
+        encoding: SttEncoding = "pcm_s16le",
+        sample_rate: int = 16000,
+        min_volume: Optional[float] = None,
+        max_silence_duration_secs: Optional[float] = None,
+    ):
         """Create a WebSocket connection for real-time speech transcription.
 
         Args:
             model: ID of the model to use for transcription
             language: The language of the input audio in ISO-639-1 format
-            encoding: The encoding format of the audio data
-            sample_rate: The sample rate of the audio in Hz
+            encoding: The encoding format of the audio data (required)
+            sample_rate: The sample rate of the audio in Hz (required)
+            min_volume: Volume threshold for voice activity detection (0.0-1.0)
+            max_silence_duration_secs: Maximum duration of silence before endpointing
 
         Returns:
             SttWebsocket: A connected WebSocket client for STT operations.
+
+        Example:
+            >>> client = Cartesia(api_key="your-api-key")
+            >>> ws = client.stt.websocket()
+            >>> for result in ws.transcribe(audio_chunks):
+            ...     print(result["text"])
         """
         client_headers = self._client_wrapper.get_headers()
         ws = SttWebsocket(
@@ -51,61 +66,19 @@ class SttClientWithWebsocket:
             language=language,
             encoding=encoding,
             sample_rate=sample_rate,
+            min_volume=min_volume,
+            max_silence_duration_secs=max_silence_duration_secs,
         )
         return ws
 
-    def transcribe(
-        self,
-        audio_chunks: typing.Iterator[bytes],
-        *,
-        model: str = "ink-whisper",
-        language: Optional[str] = "en",
-        encoding: Optional[str] = "pcm_s16le",
-        sample_rate: int = 16000,
-    ) -> Generator[Dict[str, Any], None, None]:
-        """Transcribe audio chunks using WebSocket.
 
-        Args:
-            audio_chunks: Iterator of audio chunks as bytes
-            model: ID of the model to use for transcription
-            language: The language of the input audio in ISO-639-1 format
-            encoding: The encoding format of the audio data
-            sample_rate: The sample rate of the audio in Hz
-
-        Yields:
-            Dictionary containing transcription results, flush_done, done, or error messages
-
-        Example:
-            >>> client = Cartesia(api_key="your-api-key")
-            >>> ws_client = client.stt.websocket()
-            >>> for result in ws_client.transcribe(audio_chunks):
-            ...     print(result["text"])
-        """
-        ws = self.websocket(
-            model=model,
-            language=language,
-            encoding=encoding,
-            sample_rate=sample_rate,
-        )
-        try:
-            yield from ws.transcribe(
-                audio_chunks,
-                model=model,
-                language=language,
-                encoding=encoding,
-                sample_rate=sample_rate,
-            )
-        finally:
-            ws.close()
-
-
-class AsyncSttClientWithWebsocket:
+class AsyncSttClientWithWebsocket(AsyncSttClient):
     """
     Extension of STT functionality that supports an asynchronous WebSocket STT connection.
     """
 
     def __init__(self, *, client_wrapper: AsyncClientWrapper, get_session):
-        self._client_wrapper = client_wrapper
+        super().__init__(client_wrapper=client_wrapper)
         self._get_session = get_session
 
     def _ws_url(self) -> str:
@@ -117,21 +90,34 @@ class AsyncSttClientWithWebsocket:
         base_url_without_protocol = base_url.split("://")[-1]
         return f"{prefix}://{base_url_without_protocol}"
 
-    async def websocket(
-
-
-
-
+    async def websocket(
+        self,
+        *,
+        model: str = "ink-whisper",
+        language: Optional[str] = "en",
+        encoding: SttEncoding = "pcm_s16le",
+        sample_rate: int = 16000,
+        min_volume: Optional[float] = None,
+        max_silence_duration_secs: Optional[float] = None,
+    ):
         """Create an async WebSocket connection for real-time speech transcription.
 
         Args:
             model: ID of the model to use for transcription
             language: The language of the input audio in ISO-639-1 format
-            encoding: The encoding format of the audio data
-            sample_rate: The sample rate of the audio in Hz
+            encoding: The encoding format of the audio data (required)
+            sample_rate: The sample rate of the audio in Hz (required)
+            min_volume: Volume threshold for voice activity detection (0.0-1.0)
+            max_silence_duration_secs: Maximum duration of silence before endpointing
 
         Returns:
             AsyncSttWebsocket: A connected async WebSocket client for STT operations.
+
+        Example:
+            >>> client = AsyncCartesia(api_key="your-api-key")
+            >>> ws = await client.stt.websocket()
+            >>> async for result in ws.transcribe(audio_chunks):
+            ...     print(result["text"])
         """
         client_headers = self._client_wrapper.get_headers()
         ws = AsyncSttWebsocket(
@@ -146,50 +132,7 @@ class AsyncSttClientWithWebsocket:
             language=language,
             encoding=encoding,
             sample_rate=sample_rate,
+            min_volume=min_volume,
+            max_silence_duration_secs=max_silence_duration_secs,
         )
-        return ws
-
-    async def transcribe(
-        self,
-        audio_chunks: typing.AsyncIterator[bytes],
-        *,
-        model: str = "ink-whisper",
-        language: Optional[str] = "en",
-        encoding: Optional[str] = "pcm_s16le",
-        sample_rate: int = 16000,
-    ) -> typing.AsyncGenerator[Dict[str, Any], None]:
-        """Transcribe audio chunks using async WebSocket.
-
-        Args:
-            audio_chunks: Async iterator of audio chunks as bytes
-            model: ID of the model to use for transcription
-            language: The language of the input audio in ISO-639-1 format
-            encoding: The encoding format of the audio data
-            sample_rate: The sample rate of the audio in Hz
-
-        Yields:
-            Dictionary containing transcription results, flush_done, done, or error messages
-
-        Example:
-            >>> client = AsyncCartesia(api_key="your-api-key")
-            >>> ws_client = await client.stt.websocket()
-            >>> async for result in ws_client.transcribe(audio_chunks):
-            ...     print(result["text"])
-        """
-        ws = await self.websocket(
-            model=model,
-            language=language,
-            encoding=encoding,
-            sample_rate=sample_rate,
-        )
-        try:
-            async for result in ws.transcribe(
-                audio_chunks,
-                model=model,
-                language=language,
-                encoding=encoding,
-                sample_rate=sample_rate,
-            ):
-                yield result
-        finally:
-            await ws.close()
+        return ws
```
cartesia/stt/types/__init__.py
CHANGED
```diff
@@ -11,8 +11,10 @@ from .streaming_transcription_response import (
     StreamingTranscriptionResponse_Transcript,
 )
 from .stt_encoding import SttEncoding
+from .timestamp_granularity import TimestampGranularity
 from .transcript_message import TranscriptMessage
 from .transcription_response import TranscriptionResponse
+from .transcription_word import TranscriptionWord
 
 __all__ = [
     "DoneMessage",
@@ -24,6 +26,8 @@ __all__ = [
     "StreamingTranscriptionResponse_FlushDone",
     "StreamingTranscriptionResponse_Transcript",
     "SttEncoding",
+    "TimestampGranularity",
     "TranscriptMessage",
     "TranscriptionResponse",
+    "TranscriptionWord",
 ]
```
cartesia/stt/types/streaming_transcription_response.py
CHANGED
```diff
@@ -3,6 +3,7 @@
 from __future__ import annotations
 from ...core.pydantic_utilities import UniversalBaseModel
 import typing
+from .transcription_word import TranscriptionWord
 from ...core.pydantic_utilities import IS_PYDANTIC_V2
 import pydantic
 
@@ -18,6 +19,7 @@ class StreamingTranscriptionResponse_Transcript(UniversalBaseModel):
     is_final: bool
     duration: typing.Optional[float] = None
     language: typing.Optional[str] = None
+    words: typing.Optional[typing.List[TranscriptionWord]] = None
 
     if IS_PYDANTIC_V2:
         model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True)  # type: ignore # Pydantic v2
```
cartesia/stt/types/transcript_message.py
CHANGED
```diff
@@ -3,6 +3,7 @@
 from ...core.pydantic_utilities import UniversalBaseModel
 import pydantic
 import typing
+from .transcription_word import TranscriptionWord
 from ...core.pydantic_utilities import IS_PYDANTIC_V2
 
 
@@ -31,7 +32,12 @@ class TranscriptMessage(UniversalBaseModel):
 
     language: typing.Optional[str] = pydantic.Field(default=None)
     """
-    The
+    The specified language of the input audio.
+    """
+
+    words: typing.Optional[typing.List[TranscriptionWord]] = pydantic.Field(default=None)
+    """
+    Word-level timestamps showing the start and end time of each word in seconds. Always included in streaming responses.
     """
 
     if IS_PYDANTIC_V2:
```
cartesia/stt/types/transcription_response.py
CHANGED
```diff
@@ -3,6 +3,7 @@
 from ...core.pydantic_utilities import UniversalBaseModel
 import pydantic
 import typing
+from .transcription_word import TranscriptionWord
 from ...core.pydantic_utilities import IS_PYDANTIC_V2
 
 
@@ -14,7 +15,7 @@ class TranscriptionResponse(UniversalBaseModel):
 
     language: typing.Optional[str] = pydantic.Field(default=None)
     """
-    The
+    The specified language of the input audio.
     """
 
     duration: typing.Optional[float] = pydantic.Field(default=None)
@@ -22,6 +23,11 @@ class TranscriptionResponse(UniversalBaseModel):
     The duration of the input audio in seconds.
     """
 
+    words: typing.Optional[typing.List[TranscriptionWord]] = pydantic.Field(default=None)
+    """
+    Word-level timestamps showing the start and end time of each word. Only included when `[word]` is passed into `timestamp_granularities[]`.
+    """
+
     if IS_PYDANTIC_V2:
         model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True)  # type: ignore # Pydantic v2
     else:
```
cartesia/stt/types/transcription_word.py
ADDED
```diff
@@ -0,0 +1,32 @@
+# This file was auto-generated by Fern from our API Definition.
+
+from ...core.pydantic_utilities import UniversalBaseModel
+import pydantic
+from ...core.pydantic_utilities import IS_PYDANTIC_V2
+import typing
+
+
+class TranscriptionWord(UniversalBaseModel):
+    word: str = pydantic.Field()
+    """
+    The transcribed word.
+    """
+
+    start: float = pydantic.Field()
+    """
+    Start time of the word in seconds.
+    """
+
+    end: float = pydantic.Field()
+    """
+    End time of the word in seconds.
+    """
+
+    if IS_PYDANTIC_V2:
+        model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True)  # type: ignore # Pydantic v2
+    else:
+
+        class Config:
+            frozen = True
+            smart_union = True
+            extra = pydantic.Extra.allow
```
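To make the new model concrete, here is a small sketch (not from the package) that builds and reads `TranscriptionWord` values the way a parsed response would carry them; the `text` field on `TranscriptionResponse` is assumed from the part of the file this diff does not show:

```python
from cartesia.stt.types import TranscriptionResponse, TranscriptionWord

# Batch responses carry `words` only when "word" is requested via
# timestamp_granularities[]; streaming transcripts always include them.
response = TranscriptionResponse(
    text="hello world",  # assumed field, defined outside the shown hunks
    language="en",
    duration=1.2,
    words=[
        TranscriptionWord(word="hello", start=0.0, end=0.48),
        TranscriptionWord(word="world", start=0.52, end=1.10),
    ],
)

# `words` is Optional, so guard against None before iterating.
for w in response.words or []:
    print(f"{w.word}: {w.start:.2f}s -> {w.end:.2f}s")
```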
cartesia/tts/__init__.py
CHANGED
```diff
@@ -19,11 +19,13 @@ from .types import (
     RawEncoding,
     RawOutputFormat,
     Speed,
+    SseOutputFormat,
     SupportedLanguage,
     TtsRequest,
     TtsRequestEmbeddingSpecifier,
     TtsRequestIdSpecifier,
     TtsRequestVoiceSpecifier,
+    TtssseRequest,
     WavOutputFormat,
     WebSocketBaseResponse,
     WebSocketChunkResponse,
@@ -58,10 +60,12 @@ from .requests import (
     PhonemeTimestampsParams,
     RawOutputFormatParams,
     SpeedParams,
+    SseOutputFormatParams,
     TtsRequestEmbeddingSpecifierParams,
     TtsRequestIdSpecifierParams,
     TtsRequestParams,
     TtsRequestVoiceSpecifierParams,
+    TtssseRequestParams,
     WavOutputFormatParams,
     WebSocketBaseResponseParams,
     WebSocketChunkResponseParams,
@@ -115,6 +119,8 @@ __all__ = [
     "RawOutputFormatParams",
     "Speed",
     "SpeedParams",
+    "SseOutputFormat",
+    "SseOutputFormatParams",
     "SupportedLanguage",
     "TtsRequest",
     "TtsRequestEmbeddingSpecifier",
@@ -124,6 +130,8 @@ __all__ = [
     "TtsRequestParams",
     "TtsRequestVoiceSpecifier",
     "TtsRequestVoiceSpecifierParams",
+    "TtssseRequest",
+    "TtssseRequestParams",
     "WavOutputFormat",
     "WavOutputFormatParams",
     "WebSocketBaseResponse",
```
cartesia/tts/client.py
CHANGED
```diff
@@ -10,6 +10,8 @@ from ..core.request_options import RequestOptions
 from ..core.serialization import convert_and_respect_annotation_metadata
 from json.decoder import JSONDecodeError
 from ..core.api_error import ApiError
+from .requests.sse_output_format import SseOutputFormatParams
+from .types.context_id import ContextId
 from .types.web_socket_response import WebSocketResponse
 import httpx_sse
 from ..core.pydantic_utilities import parse_obj_as
@@ -119,10 +121,14 @@ class TtsClient:
         model_id: str,
         transcript: str,
         voice: TtsRequestVoiceSpecifierParams,
-        output_format:
+        output_format: SseOutputFormatParams,
         language: typing.Optional[SupportedLanguage] = OMIT,
         duration: typing.Optional[float] = OMIT,
         speed: typing.Optional[ModelSpeed] = OMIT,
+        add_timestamps: typing.Optional[bool] = OMIT,
+        add_phoneme_timestamps: typing.Optional[bool] = OMIT,
+        use_normalized_timestamps: typing.Optional[bool] = OMIT,
+        context_id: typing.Optional[ContextId] = OMIT,
         request_options: typing.Optional[RequestOptions] = None,
     ) -> typing.Iterator[WebSocketResponse]:
         """
@@ -135,7 +141,7 @@ class TtsClient:
 
         voice : TtsRequestVoiceSpecifierParams
 
-        output_format :
+        output_format : SseOutputFormatParams
 
         language : typing.Optional[SupportedLanguage]
 
@@ -145,6 +151,18 @@ class TtsClient:
 
         speed : typing.Optional[ModelSpeed]
 
+        add_timestamps : typing.Optional[bool]
+            Whether to return word-level timestamps. If `false` (default), no word timestamps will be produced at all. If `true`, the server will return timestamp events containing word-level timing information.
+
+        add_phoneme_timestamps : typing.Optional[bool]
+            Whether to return phoneme-level timestamps. If `false` (default), no phoneme timestamps will be produced - if `add_timestamps` is `true`, the produced timestamps will be word timestamps instead. If `true`, the server will return timestamp events containing phoneme-level timing information.
+
+        use_normalized_timestamps : typing.Optional[bool]
+            Whether to use normalized timestamps (True) or original timestamps (False).
+
+        context_id : typing.Optional[ContextId]
+            Optional context ID for this request.
+
         request_options : typing.Optional[RequestOptions]
             Request-specific configuration.
 
@@ -165,9 +183,9 @@ class TtsClient:
             voice={"mode": "id", "id": "694f9389-aac1-45b6-b726-9d9369183238"},
             language="en",
             output_format={
+                "container": "raw",
                 "sample_rate": 44100,
                 "encoding": "pcm_f32le",
-                "container": "raw",
             },
         )
         for chunk in response:
@@ -184,10 +202,14 @@ class TtsClient:
             ),
             "language": language,
             "output_format": convert_and_respect_annotation_metadata(
-                object_=output_format, annotation=
+                object_=output_format, annotation=SseOutputFormatParams, direction="write"
             ),
             "duration": duration,
             "speed": speed,
+            "add_timestamps": add_timestamps,
+            "add_phoneme_timestamps": add_phoneme_timestamps,
+            "use_normalized_timestamps": use_normalized_timestamps,
+            "context_id": context_id,
         },
         request_options=request_options,
         omit=OMIT,
@@ -321,10 +343,14 @@ class AsyncTtsClient:
         model_id: str,
         transcript: str,
         voice: TtsRequestVoiceSpecifierParams,
-        output_format:
+        output_format: SseOutputFormatParams,
         language: typing.Optional[SupportedLanguage] = OMIT,
         duration: typing.Optional[float] = OMIT,
         speed: typing.Optional[ModelSpeed] = OMIT,
+        add_timestamps: typing.Optional[bool] = OMIT,
+        add_phoneme_timestamps: typing.Optional[bool] = OMIT,
+        use_normalized_timestamps: typing.Optional[bool] = OMIT,
+        context_id: typing.Optional[ContextId] = OMIT,
         request_options: typing.Optional[RequestOptions] = None,
     ) -> typing.AsyncIterator[WebSocketResponse]:
         """
@@ -337,7 +363,7 @@ class AsyncTtsClient:
 
         voice : TtsRequestVoiceSpecifierParams
 
-        output_format :
+        output_format : SseOutputFormatParams
 
         language : typing.Optional[SupportedLanguage]
 
@@ -347,6 +373,18 @@ class AsyncTtsClient:
 
         speed : typing.Optional[ModelSpeed]
 
+        add_timestamps : typing.Optional[bool]
+            Whether to return word-level timestamps. If `false` (default), no word timestamps will be produced at all. If `true`, the server will return timestamp events containing word-level timing information.
+
+        add_phoneme_timestamps : typing.Optional[bool]
+            Whether to return phoneme-level timestamps. If `false` (default), no phoneme timestamps will be produced - if `add_timestamps` is `true`, the produced timestamps will be word timestamps instead. If `true`, the server will return timestamp events containing phoneme-level timing information.
+
+        use_normalized_timestamps : typing.Optional[bool]
+            Whether to use normalized timestamps (True) or original timestamps (False).
+
+        context_id : typing.Optional[ContextId]
+            Optional context ID for this request.
+
         request_options : typing.Optional[RequestOptions]
             Request-specific configuration.
 
@@ -372,9 +410,9 @@ class AsyncTtsClient:
             voice={"mode": "id", "id": "694f9389-aac1-45b6-b726-9d9369183238"},
             language="en",
             output_format={
+                "container": "raw",
                 "sample_rate": 44100,
                 "encoding": "pcm_f32le",
-                "container": "raw",
             },
         )
         async for chunk in response:
@@ -394,10 +432,14 @@ class AsyncTtsClient:
             ),
             "language": language,
             "output_format": convert_and_respect_annotation_metadata(
-                object_=output_format, annotation=
+                object_=output_format, annotation=SseOutputFormatParams, direction="write"
             ),
             "duration": duration,
             "speed": speed,
+            "add_timestamps": add_timestamps,
+            "add_phoneme_timestamps": add_phoneme_timestamps,
+            "use_normalized_timestamps": use_normalized_timestamps,
+            "context_id": context_id,
         },
         request_options=request_options,
         omit=OMIT,
```
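Putting the new parameters together, a 2.0.6 SSE call could look like the sketch below. The voice ID is the placeholder from the docstring example above; the model ID, context ID, and the way events are inspected are assumptions rather than documented behavior:

```python
from cartesia import Cartesia

client = Cartesia(api_key="your-api-key")

response = client.tts.sse(
    model_id="sonic-2",  # assumed model ID
    transcript="Hello, world!",
    voice={"mode": "id", "id": "694f9389-aac1-45b6-b726-9d9369183238"},
    language="en",
    output_format={
        "container": "raw",  # SseOutputFormatParams pins the container to "raw"
        "sample_rate": 44100,
        "encoding": "pcm_f32le",
    },
    add_timestamps=True,             # new in 2.0.6: emit word timestamp events
    use_normalized_timestamps=True,  # new in 2.0.6
    context_id="my-context-123",     # new in 2.0.6: assumed context ID value
)

for chunk in response:
    # Each item is a WebSocketResponse union member: audio chunks plus,
    # with add_timestamps=True, timestamp events and a final done event.
    print(chunk)
```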
cartesia/tts/requests/__init__.py
CHANGED
```diff
@@ -8,10 +8,12 @@ from .output_format import OutputFormatParams, OutputFormat_Mp3Params, OutputFor
 from .phoneme_timestamps import PhonemeTimestampsParams
 from .raw_output_format import RawOutputFormatParams
 from .speed import SpeedParams
+from .sse_output_format import SseOutputFormatParams
 from .tts_request import TtsRequestParams
 from .tts_request_embedding_specifier import TtsRequestEmbeddingSpecifierParams
 from .tts_request_id_specifier import TtsRequestIdSpecifierParams
 from .tts_request_voice_specifier import TtsRequestVoiceSpecifierParams
+from .ttssse_request import TtssseRequestParams
 from .wav_output_format import WavOutputFormatParams
 from .web_socket_base_response import WebSocketBaseResponseParams
 from .web_socket_chunk_response import WebSocketChunkResponseParams
@@ -48,10 +50,12 @@ __all__ = [
     "PhonemeTimestampsParams",
     "RawOutputFormatParams",
     "SpeedParams",
+    "SseOutputFormatParams",
     "TtsRequestEmbeddingSpecifierParams",
     "TtsRequestIdSpecifierParams",
     "TtsRequestParams",
     "TtsRequestVoiceSpecifierParams",
+    "TtssseRequestParams",
     "WavOutputFormatParams",
     "WebSocketBaseResponseParams",
     "WebSocketChunkResponseParams",
```
cartesia/tts/requests/generation_request.py
CHANGED
```diff
@@ -55,15 +55,15 @@ class GenerationRequestParams(typing_extensions.TypedDict):
 
     add_timestamps: typing_extensions.NotRequired[bool]
     """
-    Whether to return word-level timestamps.
+    Whether to return word-level timestamps. If `false` (default), no word timestamps will be produced at all. If `true`, the server will return timestamp events containing word-level timing information.
     """
 
     add_phoneme_timestamps: typing_extensions.NotRequired[bool]
     """
-    Whether to return phoneme-level timestamps.
+    Whether to return phoneme-level timestamps. If `false` (default), no phoneme timestamps will be produced. If `true`, the server will return timestamp events containing phoneme-level timing information.
     """
 
-
+    use_normalized_timestamps: typing_extensions.NotRequired[bool]
     """
-    Whether to use
+    Whether to use normalized timestamps (True) or original timestamps (False).
     """
```
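Because `GenerationRequestParams` is a TypedDict, the clarified flags are plain dictionary keys at runtime. A sketch of a WebSocket generation payload using them; the keys other than the three flags are assumed from the part of the file this hunk does not show:

```python
from cartesia.tts.requests.generation_request import GenerationRequestParams

request: GenerationRequestParams = {
    "model_id": "sonic-2",  # assumed model ID
    "transcript": "Hello from Cartesia.",
    "voice": {"mode": "id", "id": "694f9389-aac1-45b6-b726-9d9369183238"},
    "output_format": {
        "container": "raw",
        "sample_rate": 44100,
        "encoding": "pcm_f32le",
    },
    # Word-level timestamp events; off by default.
    "add_timestamps": True,
    # Leave False to keep word rather than phoneme granularity.
    "add_phoneme_timestamps": False,
    # Normalized (True) vs. original (False) timestamps.
    "use_normalized_timestamps": True,
}
```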
cartesia/tts/requests/sse_output_format.py
ADDED
```diff
@@ -0,0 +1,11 @@
+# This file was auto-generated by Fern from our API Definition.
+
+import typing_extensions
+import typing
+from ..types.raw_encoding import RawEncoding
+
+
+class SseOutputFormatParams(typing_extensions.TypedDict):
+    container: typing.Literal["raw"]
+    encoding: RawEncoding
+    sample_rate: int
```
cartesia/tts/requests/ttssse_request.py
ADDED
```diff
@@ -0,0 +1,47 @@
+# This file was auto-generated by Fern from our API Definition.
+
+import typing_extensions
+from .tts_request_voice_specifier import TtsRequestVoiceSpecifierParams
+import typing_extensions
+from ..types.supported_language import SupportedLanguage
+from .sse_output_format import SseOutputFormatParams
+from ..types.model_speed import ModelSpeed
+from ..types.context_id import ContextId
+
+
+class TtssseRequestParams(typing_extensions.TypedDict):
+    model_id: str
+    """
+    The ID of the model to use for the generation. See [Models](/build-with-cartesia/models) for available models.
+    """
+
+    transcript: str
+    voice: TtsRequestVoiceSpecifierParams
+    language: typing_extensions.NotRequired[SupportedLanguage]
+    output_format: SseOutputFormatParams
+    duration: typing_extensions.NotRequired[float]
+    """
+    The maximum duration of the audio in seconds. You do not usually need to specify this.
+    If the duration is not appropriate for the length of the transcript, the output audio may be truncated.
+    """
+
+    speed: typing_extensions.NotRequired[ModelSpeed]
+    add_timestamps: typing_extensions.NotRequired[bool]
+    """
+    Whether to return word-level timestamps. If `false` (default), no word timestamps will be produced at all. If `true`, the server will return timestamp events containing word-level timing information.
+    """
+
+    add_phoneme_timestamps: typing_extensions.NotRequired[bool]
+    """
+    Whether to return phoneme-level timestamps. If `false` (default), no phoneme timestamps will be produced - if `add_timestamps` is `true`, the produced timestamps will be word timestamps instead. If `true`, the server will return timestamp events containing phoneme-level timing information.
+    """
+
+    use_normalized_timestamps: typing_extensions.NotRequired[bool]
+    """
+    Whether to use normalized timestamps (True) or original timestamps (False).
+    """
+
+    context_id: typing_extensions.NotRequired[ContextId]
+    """
+    Optional context ID for this request.
+    """
```
cartesia/tts/requests/web_socket_chunk_response.py
CHANGED
```diff
@@ -1,11 +1,8 @@
 # This file was auto-generated by Fern from our API Definition.
 
 from .web_socket_base_response import WebSocketBaseResponseParams
-import typing_extensions
-from ..types.flush_id import FlushId
 
 
 class WebSocketChunkResponseParams(WebSocketBaseResponseParams):
     data: str
     step_time: float
-    flush_id: typing_extensions.NotRequired[FlushId]
```