cartesia 2.0.5__py3-none-any.whl → 2.0.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cartesia/__init__.py +22 -0
- cartesia/auth/client.py +8 -8
- cartesia/auth/requests/token_grant.py +7 -1
- cartesia/auth/requests/token_request.py +3 -3
- cartesia/auth/types/token_grant.py +7 -2
- cartesia/auth/types/token_request.py +3 -3
- cartesia/core/client_wrapper.py +1 -1
- cartesia/infill/client.py +0 -8
- cartesia/stt/__init__.py +6 -0
- cartesia/stt/_async_websocket.py +81 -72
- cartesia/stt/_websocket.py +42 -20
- cartesia/stt/client.py +450 -0
- cartesia/stt/requests/__init__.py +2 -0
- cartesia/stt/requests/streaming_transcription_response.py +2 -0
- cartesia/stt/requests/transcript_message.py +8 -1
- cartesia/stt/requests/transcription_response.py +8 -1
- cartesia/stt/requests/transcription_word.py +20 -0
- cartesia/stt/socket_client.py +52 -109
- cartesia/stt/types/__init__.py +4 -0
- cartesia/stt/types/streaming_transcription_response.py +2 -0
- cartesia/stt/types/stt_encoding.py +3 -1
- cartesia/stt/types/timestamp_granularity.py +5 -0
- cartesia/stt/types/transcript_message.py +7 -1
- cartesia/stt/types/transcription_response.py +7 -1
- cartesia/stt/types/transcription_word.py +32 -0
- cartesia/tts/__init__.py +16 -0
- cartesia/tts/client.py +63 -8
- cartesia/tts/requests/__init__.py +8 -0
- cartesia/tts/requests/experimental_model_controls.py +17 -0
- cartesia/tts/requests/generation_config.py +23 -0
- cartesia/tts/requests/generation_request.py +4 -4
- cartesia/tts/requests/sse_output_format.py +11 -0
- cartesia/tts/requests/tts_request.py +2 -0
- cartesia/tts/requests/ttssse_request.py +47 -0
- cartesia/tts/requests/web_socket_chunk_response.py +0 -3
- cartesia/tts/requests/web_socket_response.py +1 -2
- cartesia/tts/requests/web_socket_tts_request.py +9 -1
- cartesia/tts/types/__init__.py +8 -0
- cartesia/tts/types/experimental_model_controls.py +28 -0
- cartesia/tts/types/generation_config.py +34 -0
- cartesia/tts/types/generation_request.py +4 -4
- cartesia/tts/types/sse_output_format.py +22 -0
- cartesia/tts/types/tts_request.py +2 -0
- cartesia/tts/types/ttssse_request.py +58 -0
- cartesia/tts/types/web_socket_chunk_response.py +1 -3
- cartesia/tts/types/web_socket_response.py +1 -2
- cartesia/tts/types/web_socket_tts_request.py +11 -3
- cartesia/voice_changer/client.py +0 -8
- cartesia/voice_changer/requests/streaming_response.py +0 -2
- cartesia/voice_changer/types/streaming_response.py +0 -2
- cartesia/voices/client.py +0 -12
- cartesia-2.0.7.dist-info/LICENSE +201 -0
- {cartesia-2.0.5.dist-info → cartesia-2.0.7.dist-info}/METADATA +116 -17
- {cartesia-2.0.5.dist-info → cartesia-2.0.7.dist-info}/RECORD +55 -42
- {cartesia-2.0.5.dist-info → cartesia-2.0.7.dist-info}/WHEEL +1 -1
cartesia/stt/_websocket.py
CHANGED
@@ -14,6 +14,7 @@ from cartesia.stt.types import (
|
|
14
14
|
StreamingTranscriptionResponse_Error,
|
15
15
|
StreamingTranscriptionResponse_Transcript,
|
16
16
|
)
|
17
|
+
from cartesia.stt.types.stt_encoding import SttEncoding
|
17
18
|
|
18
19
|
from ..core.pydantic_utilities import parse_obj_as
|
19
20
|
|
@@ -45,8 +46,10 @@ class SttWebsocket:
|
|
45
46
|
# Store default connection parameters for auto-connect with proper typing
|
46
47
|
self._default_model: str = "ink-whisper"
|
47
48
|
self._default_language: Optional[str] = "en"
|
48
|
-
self._default_encoding:
|
49
|
+
self._default_encoding: SttEncoding = "pcm_s16le"
|
49
50
|
self._default_sample_rate: int = 16000
|
51
|
+
self._default_min_volume: Optional[float] = None
|
52
|
+
self._default_max_silence_duration_secs: Optional[float] = None
|
50
53
|
|
51
54
|
def __del__(self):
|
52
55
|
try:
|
@@ -59,16 +62,20 @@ class SttWebsocket:
|
|
59
62
|
*,
|
60
63
|
model: str = "ink-whisper",
|
61
64
|
language: Optional[str] = "en",
|
62
|
-
encoding:
|
65
|
+
encoding: SttEncoding = "pcm_s16le",
|
63
66
|
sample_rate: int = 16000,
|
67
|
+
min_volume: Optional[float] = None,
|
68
|
+
max_silence_duration_secs: Optional[float] = None,
|
64
69
|
):
|
65
70
|
"""Connect to the STT WebSocket with the specified parameters.
|
66
71
|
|
67
72
|
Args:
|
68
73
|
model: ID of the model to use for transcription
|
69
74
|
language: The language of the input audio in ISO-639-1 format
|
70
|
-
encoding: The encoding format of the audio data
|
71
|
-
sample_rate: The sample rate of the audio in Hz
|
75
|
+
encoding: The encoding format of the audio data (required)
|
76
|
+
sample_rate: The sample rate of the audio in Hz (required)
|
77
|
+
min_volume: Volume threshold for voice activity detection (0.0-1.0)
|
78
|
+
max_silence_duration_secs: Maximum duration of silence before endpointing
|
72
79
|
|
73
80
|
Raises:
|
74
81
|
RuntimeError: If the connection to the WebSocket fails.
|
@@ -78,6 +85,8 @@ class SttWebsocket:
|
|
78
85
|
self._default_language = language
|
79
86
|
self._default_encoding = encoding
|
80
87
|
self._default_sample_rate = sample_rate
|
88
|
+
self._default_min_volume = min_volume
|
89
|
+
self._default_max_silence_duration_secs = max_silence_duration_secs
|
81
90
|
|
82
91
|
if not IS_WEBSOCKET_SYNC_AVAILABLE:
|
83
92
|
raise ImportError(
|
@@ -89,13 +98,15 @@ class SttWebsocket:
|
|
89
98
|
"model": model,
|
90
99
|
"api_key": self.api_key,
|
91
100
|
"cartesia_version": self.cartesia_version,
|
101
|
+
"encoding": encoding,
|
102
|
+
"sample_rate": str(sample_rate),
|
92
103
|
}
|
93
104
|
if language is not None:
|
94
105
|
params["language"] = language
|
95
|
-
if
|
96
|
-
params["
|
97
|
-
if
|
98
|
-
params["
|
106
|
+
if min_volume is not None:
|
107
|
+
params["min_volume"] = str(min_volume)
|
108
|
+
if max_silence_duration_secs is not None:
|
109
|
+
params["max_silence_duration_secs"] = str(max_silence_duration_secs)
|
99
110
|
|
100
111
|
query_string = "&".join([f"{k}={v}" for k, v in params.items()])
|
101
112
|
url = f"{self.ws_url}/{route}?{query_string}"
|
@@ -143,6 +154,8 @@ class SttWebsocket:
|
|
143
154
|
language=self._default_language,
|
144
155
|
encoding=self._default_encoding,
|
145
156
|
sample_rate=self._default_sample_rate,
|
157
|
+
min_volume=self._default_min_volume,
|
158
|
+
max_silence_duration_secs=self._default_max_silence_duration_secs,
|
146
159
|
)
|
147
160
|
|
148
161
|
assert self.websocket is not None, "WebSocket should be connected after connect() call"
|
@@ -167,6 +180,8 @@ class SttWebsocket:
|
|
167
180
|
language=self._default_language,
|
168
181
|
encoding=self._default_encoding,
|
169
182
|
sample_rate=self._default_sample_rate,
|
183
|
+
min_volume=self._default_min_volume,
|
184
|
+
max_silence_duration_secs=self._default_max_silence_duration_secs,
|
170
185
|
)
|
171
186
|
|
172
187
|
assert self.websocket is not None, "WebSocket should be connected after connect() call"
|
@@ -197,6 +212,8 @@ class SttWebsocket:
|
|
197
212
|
result["duration"] = raw_data["duration"]
|
198
213
|
if "language" in raw_data:
|
199
214
|
result["language"] = raw_data["language"]
|
215
|
+
if "words" in raw_data:
|
216
|
+
result["words"] = raw_data["words"]
|
200
217
|
|
201
218
|
yield result
|
202
219
|
|
@@ -208,23 +225,22 @@ class SttWebsocket:
|
|
208
225
|
}
|
209
226
|
yield result
|
210
227
|
|
211
|
-
# Handle done acknowledgment
|
228
|
+
# Handle done acknowledgment
|
212
229
|
elif raw_data.get("type") == "done":
|
213
230
|
result = {
|
214
231
|
"type": raw_data["type"],
|
215
232
|
"request_id": raw_data.get("request_id", ""),
|
216
233
|
}
|
217
234
|
yield result
|
218
|
-
#
|
219
|
-
break
|
220
|
-
|
221
|
-
except Exception as inner_e:
|
222
|
-
self.close()
|
223
|
-
raise RuntimeError(f"Error receiving transcription: {inner_e}")
|
235
|
+
break # Exit the loop when done
|
224
236
|
|
225
|
-
|
237
|
+
except Exception as e:
|
238
|
+
if "Connection closed" in str(e) or "no active connection" in str(e):
|
239
|
+
break # WebSocket was closed
|
240
|
+
raise e # Re-raise other exceptions
|
241
|
+
except KeyboardInterrupt:
|
226
242
|
self.close()
|
227
|
-
raise
|
243
|
+
raise
|
228
244
|
|
229
245
|
def transcribe(
|
230
246
|
self,
|
@@ -232,8 +248,10 @@ class SttWebsocket:
|
|
232
248
|
*,
|
233
249
|
model: str = "ink-whisper",
|
234
250
|
language: Optional[str] = "en",
|
235
|
-
encoding:
|
251
|
+
encoding: SttEncoding = "pcm_s16le",
|
236
252
|
sample_rate: int = 16000,
|
253
|
+
min_volume: Optional[float] = None,
|
254
|
+
max_silence_duration_secs: Optional[float] = None,
|
237
255
|
) -> Generator[Dict[str, Any], None, None]:
|
238
256
|
"""Transcribe audio chunks using the WebSocket.
|
239
257
|
|
@@ -241,8 +259,10 @@ class SttWebsocket:
|
|
241
259
|
audio_chunks: Iterator of audio chunks as bytes
|
242
260
|
model: ID of the model to use for transcription
|
243
261
|
language: The language of the input audio in ISO-639-1 format
|
244
|
-
encoding: The encoding format of the audio data
|
245
|
-
sample_rate: The sample rate of the audio in Hz
|
262
|
+
encoding: The encoding format of the audio data (required)
|
263
|
+
sample_rate: The sample rate of the audio in Hz (required)
|
264
|
+
min_volume: Volume threshold for voice activity detection (0.0-1.0)
|
265
|
+
max_silence_duration_secs: Maximum duration of silence before endpointing
|
246
266
|
|
247
267
|
Yields:
|
248
268
|
Dictionary containing transcription results, flush_done, done, or error messages
|
@@ -252,6 +272,8 @@ class SttWebsocket:
|
|
252
272
|
language=language,
|
253
273
|
encoding=encoding,
|
254
274
|
sample_rate=sample_rate,
|
275
|
+
min_volume=min_volume,
|
276
|
+
max_silence_duration_secs=max_silence_duration_secs,
|
255
277
|
)
|
256
278
|
|
257
279
|
try:
|
cartesia/stt/client.py
ADDED
@@ -0,0 +1,450 @@
|
|
1
|
+
# This file was auto-generated by Fern from our API Definition.
|
2
|
+
|
3
|
+
import typing
|
4
|
+
from ..core.client_wrapper import SyncClientWrapper
|
5
|
+
from .. import core
|
6
|
+
from .types.stt_encoding import SttEncoding
|
7
|
+
from .types.timestamp_granularity import TimestampGranularity
|
8
|
+
from ..core.request_options import RequestOptions
|
9
|
+
from .types.transcription_response import TranscriptionResponse
|
10
|
+
from ..core.pydantic_utilities import parse_obj_as
|
11
|
+
from json.decoder import JSONDecodeError
|
12
|
+
from ..core.api_error import ApiError
|
13
|
+
from ..core.client_wrapper import AsyncClientWrapper
|
14
|
+
|
15
|
+
# this is used as the default value for optional parameters
|
16
|
+
OMIT = typing.cast(typing.Any, ...)
|
17
|
+
|
18
|
+
|
19
|
+
class SttClient:
|
20
|
+
def __init__(self, *, client_wrapper: SyncClientWrapper):
|
21
|
+
self._client_wrapper = client_wrapper
|
22
|
+
|
23
|
+
def transcribe(
|
24
|
+
self,
|
25
|
+
*,
|
26
|
+
file: core.File,
|
27
|
+
model: str,
|
28
|
+
encoding: typing.Optional[SttEncoding] = None,
|
29
|
+
sample_rate: typing.Optional[int] = None,
|
30
|
+
language: typing.Optional[str] = OMIT,
|
31
|
+
timestamp_granularities: typing.Optional[typing.List[TimestampGranularity]] = OMIT,
|
32
|
+
request_options: typing.Optional[RequestOptions] = None,
|
33
|
+
) -> TranscriptionResponse:
|
34
|
+
"""
|
35
|
+
Transcribes audio files into text using Cartesia's Speech-to-Text API.
|
36
|
+
|
37
|
+
Upload an audio file and receive a complete transcription response. Supports arbitrarily long audio files with automatic intelligent chunking for longer audio.
|
38
|
+
|
39
|
+
**Supported audio formats:** flac, m4a, mp3, mp4, mpeg, mpga, oga, ogg, wav, webm
|
40
|
+
|
41
|
+
**Response format:** Returns JSON with transcribed text, duration, and language. Include `timestamp_granularities: ["word"]` to get word-level timestamps.
|
42
|
+
|
43
|
+
**Pricing:** Batch transcription is priced at **1 credit per 2 seconds** of audio processed.
|
44
|
+
|
45
|
+
<Note>
|
46
|
+
For migrating from the OpenAI SDK, see our [OpenAI Whisper to Cartesia Ink Migration Guide](/api-reference/stt/migrate-from-open-ai).
|
47
|
+
</Note>
|
48
|
+
|
49
|
+
Parameters
|
50
|
+
----------
|
51
|
+
file : core.File
|
52
|
+
See core.File for more documentation
|
53
|
+
|
54
|
+
model : str
|
55
|
+
ID of the model to use for transcription. Use `ink-whisper` for the latest Cartesia Whisper model.
|
56
|
+
|
57
|
+
encoding : typing.Optional[SttEncoding]
|
58
|
+
The encoding format to process the audio as. If not specified, the audio file will be decoded automatically.
|
59
|
+
|
60
|
+
**Supported formats:**
|
61
|
+
- `pcm_s16le` - 16-bit signed integer PCM, little-endian (recommended for best performance)
|
62
|
+
- `pcm_s32le` - 32-bit signed integer PCM, little-endian
|
63
|
+
- `pcm_f16le` - 16-bit floating point PCM, little-endian
|
64
|
+
- `pcm_f32le` - 32-bit floating point PCM, little-endian
|
65
|
+
- `pcm_mulaw` - 8-bit μ-law encoded PCM
|
66
|
+
- `pcm_alaw` - 8-bit A-law encoded PCM
|
67
|
+
|
68
|
+
sample_rate : typing.Optional[int]
|
69
|
+
The sample rate of the audio in Hz.
|
70
|
+
|
71
|
+
language : typing.Optional[str]
|
72
|
+
The language of the input audio in ISO-639-1 format. Defaults to `en`.
|
73
|
+
|
74
|
+
<Accordion title="Supported languages">
|
75
|
+
- `en` (English)
|
76
|
+
- `zh` (Chinese)
|
77
|
+
- `de` (German)
|
78
|
+
- `es` (Spanish)
|
79
|
+
- `ru` (Russian)
|
80
|
+
- `ko` (Korean)
|
81
|
+
- `fr` (French)
|
82
|
+
- `ja` (Japanese)
|
83
|
+
- `pt` (Portuguese)
|
84
|
+
- `tr` (Turkish)
|
85
|
+
- `pl` (Polish)
|
86
|
+
- `ca` (Catalan)
|
87
|
+
- `nl` (Dutch)
|
88
|
+
- `ar` (Arabic)
|
89
|
+
- `sv` (Swedish)
|
90
|
+
- `it` (Italian)
|
91
|
+
- `id` (Indonesian)
|
92
|
+
- `hi` (Hindi)
|
93
|
+
- `fi` (Finnish)
|
94
|
+
- `vi` (Vietnamese)
|
95
|
+
- `he` (Hebrew)
|
96
|
+
- `uk` (Ukrainian)
|
97
|
+
- `el` (Greek)
|
98
|
+
- `ms` (Malay)
|
99
|
+
- `cs` (Czech)
|
100
|
+
- `ro` (Romanian)
|
101
|
+
- `da` (Danish)
|
102
|
+
- `hu` (Hungarian)
|
103
|
+
- `ta` (Tamil)
|
104
|
+
- `no` (Norwegian)
|
105
|
+
- `th` (Thai)
|
106
|
+
- `ur` (Urdu)
|
107
|
+
- `hr` (Croatian)
|
108
|
+
- `bg` (Bulgarian)
|
109
|
+
- `lt` (Lithuanian)
|
110
|
+
- `la` (Latin)
|
111
|
+
- `mi` (Maori)
|
112
|
+
- `ml` (Malayalam)
|
113
|
+
- `cy` (Welsh)
|
114
|
+
- `sk` (Slovak)
|
115
|
+
- `te` (Telugu)
|
116
|
+
- `fa` (Persian)
|
117
|
+
- `lv` (Latvian)
|
118
|
+
- `bn` (Bengali)
|
119
|
+
- `sr` (Serbian)
|
120
|
+
- `az` (Azerbaijani)
|
121
|
+
- `sl` (Slovenian)
|
122
|
+
- `kn` (Kannada)
|
123
|
+
- `et` (Estonian)
|
124
|
+
- `mk` (Macedonian)
|
125
|
+
- `br` (Breton)
|
126
|
+
- `eu` (Basque)
|
127
|
+
- `is` (Icelandic)
|
128
|
+
- `hy` (Armenian)
|
129
|
+
- `ne` (Nepali)
|
130
|
+
- `mn` (Mongolian)
|
131
|
+
- `bs` (Bosnian)
|
132
|
+
- `kk` (Kazakh)
|
133
|
+
- `sq` (Albanian)
|
134
|
+
- `sw` (Swahili)
|
135
|
+
- `gl` (Galician)
|
136
|
+
- `mr` (Marathi)
|
137
|
+
- `pa` (Punjabi)
|
138
|
+
- `si` (Sinhala)
|
139
|
+
- `km` (Khmer)
|
140
|
+
- `sn` (Shona)
|
141
|
+
- `yo` (Yoruba)
|
142
|
+
- `so` (Somali)
|
143
|
+
- `af` (Afrikaans)
|
144
|
+
- `oc` (Occitan)
|
145
|
+
- `ka` (Georgian)
|
146
|
+
- `be` (Belarusian)
|
147
|
+
- `tg` (Tajik)
|
148
|
+
- `sd` (Sindhi)
|
149
|
+
- `gu` (Gujarati)
|
150
|
+
- `am` (Amharic)
|
151
|
+
- `yi` (Yiddish)
|
152
|
+
- `lo` (Lao)
|
153
|
+
- `uz` (Uzbek)
|
154
|
+
- `fo` (Faroese)
|
155
|
+
- `ht` (Haitian Creole)
|
156
|
+
- `ps` (Pashto)
|
157
|
+
- `tk` (Turkmen)
|
158
|
+
- `nn` (Nynorsk)
|
159
|
+
- `mt` (Maltese)
|
160
|
+
- `sa` (Sanskrit)
|
161
|
+
- `lb` (Luxembourgish)
|
162
|
+
- `my` (Myanmar)
|
163
|
+
- `bo` (Tibetan)
|
164
|
+
- `tl` (Tagalog)
|
165
|
+
- `mg` (Malagasy)
|
166
|
+
- `as` (Assamese)
|
167
|
+
- `tt` (Tatar)
|
168
|
+
- `haw` (Hawaiian)
|
169
|
+
- `ln` (Lingala)
|
170
|
+
- `ha` (Hausa)
|
171
|
+
- `ba` (Bashkir)
|
172
|
+
- `jw` (Javanese)
|
173
|
+
- `su` (Sundanese)
|
174
|
+
- `yue` (Cantonese)
|
175
|
+
</Accordion>
|
176
|
+
|
177
|
+
timestamp_granularities : typing.Optional[typing.List[TimestampGranularity]]
|
178
|
+
The timestamp granularities to populate for this transcription. Currently only `word` level timestamps are supported.
|
179
|
+
|
180
|
+
request_options : typing.Optional[RequestOptions]
|
181
|
+
Request-specific configuration.
|
182
|
+
|
183
|
+
Returns
|
184
|
+
-------
|
185
|
+
TranscriptionResponse
|
186
|
+
|
187
|
+
Examples
|
188
|
+
--------
|
189
|
+
from cartesia import Cartesia
|
190
|
+
|
191
|
+
client = Cartesia(
|
192
|
+
api_key="YOUR_API_KEY",
|
193
|
+
)
|
194
|
+
client.stt.transcribe(
|
195
|
+
model="ink-whisper",
|
196
|
+
language="en",
|
197
|
+
)
|
198
|
+
"""
|
199
|
+
_response = self._client_wrapper.httpx_client.request(
|
200
|
+
"stt",
|
201
|
+
method="POST",
|
202
|
+
params={
|
203
|
+
"encoding": encoding,
|
204
|
+
"sample_rate": sample_rate,
|
205
|
+
},
|
206
|
+
data={
|
207
|
+
"model": model,
|
208
|
+
"language": language,
|
209
|
+
"timestamp_granularities[]": timestamp_granularities,
|
210
|
+
},
|
211
|
+
files={
|
212
|
+
"file": file,
|
213
|
+
},
|
214
|
+
request_options=request_options,
|
215
|
+
omit=OMIT,
|
216
|
+
)
|
217
|
+
try:
|
218
|
+
if 200 <= _response.status_code < 300:
|
219
|
+
return typing.cast(
|
220
|
+
TranscriptionResponse,
|
221
|
+
parse_obj_as(
|
222
|
+
type_=TranscriptionResponse, # type: ignore
|
223
|
+
object_=_response.json(),
|
224
|
+
),
|
225
|
+
)
|
226
|
+
_response_json = _response.json()
|
227
|
+
except JSONDecodeError:
|
228
|
+
raise ApiError(status_code=_response.status_code, body=_response.text)
|
229
|
+
raise ApiError(status_code=_response.status_code, body=_response_json)
|
230
|
+
|
231
|
+
|
232
|
+
class AsyncSttClient:
|
233
|
+
def __init__(self, *, client_wrapper: AsyncClientWrapper):
|
234
|
+
self._client_wrapper = client_wrapper
|
235
|
+
|
236
|
+
async def transcribe(
|
237
|
+
self,
|
238
|
+
*,
|
239
|
+
file: core.File,
|
240
|
+
model: str,
|
241
|
+
encoding: typing.Optional[SttEncoding] = None,
|
242
|
+
sample_rate: typing.Optional[int] = None,
|
243
|
+
language: typing.Optional[str] = OMIT,
|
244
|
+
timestamp_granularities: typing.Optional[typing.List[TimestampGranularity]] = OMIT,
|
245
|
+
request_options: typing.Optional[RequestOptions] = None,
|
246
|
+
) -> TranscriptionResponse:
|
247
|
+
"""
|
248
|
+
Transcribes audio files into text using Cartesia's Speech-to-Text API.
|
249
|
+
|
250
|
+
Upload an audio file and receive a complete transcription response. Supports arbitrarily long audio files with automatic intelligent chunking for longer audio.
|
251
|
+
|
252
|
+
**Supported audio formats:** flac, m4a, mp3, mp4, mpeg, mpga, oga, ogg, wav, webm
|
253
|
+
|
254
|
+
**Response format:** Returns JSON with transcribed text, duration, and language. Include `timestamp_granularities: ["word"]` to get word-level timestamps.
|
255
|
+
|
256
|
+
**Pricing:** Batch transcription is priced at **1 credit per 2 seconds** of audio processed.
|
257
|
+
|
258
|
+
<Note>
|
259
|
+
For migrating from the OpenAI SDK, see our [OpenAI Whisper to Cartesia Ink Migration Guide](/api-reference/stt/migrate-from-open-ai).
|
260
|
+
</Note>
|
261
|
+
|
262
|
+
Parameters
|
263
|
+
----------
|
264
|
+
file : core.File
|
265
|
+
See core.File for more documentation
|
266
|
+
|
267
|
+
model : str
|
268
|
+
ID of the model to use for transcription. Use `ink-whisper` for the latest Cartesia Whisper model.
|
269
|
+
|
270
|
+
encoding : typing.Optional[SttEncoding]
|
271
|
+
The encoding format to process the audio as. If not specified, the audio file will be decoded automatically.
|
272
|
+
|
273
|
+
**Supported formats:**
|
274
|
+
- `pcm_s16le` - 16-bit signed integer PCM, little-endian (recommended for best performance)
|
275
|
+
- `pcm_s32le` - 32-bit signed integer PCM, little-endian
|
276
|
+
- `pcm_f16le` - 16-bit floating point PCM, little-endian
|
277
|
+
- `pcm_f32le` - 32-bit floating point PCM, little-endian
|
278
|
+
- `pcm_mulaw` - 8-bit μ-law encoded PCM
|
279
|
+
- `pcm_alaw` - 8-bit A-law encoded PCM
|
280
|
+
|
281
|
+
sample_rate : typing.Optional[int]
|
282
|
+
The sample rate of the audio in Hz.
|
283
|
+
|
284
|
+
language : typing.Optional[str]
|
285
|
+
The language of the input audio in ISO-639-1 format. Defaults to `en`.
|
286
|
+
|
287
|
+
<Accordion title="Supported languages">
|
288
|
+
- `en` (English)
|
289
|
+
- `zh` (Chinese)
|
290
|
+
- `de` (German)
|
291
|
+
- `es` (Spanish)
|
292
|
+
- `ru` (Russian)
|
293
|
+
- `ko` (Korean)
|
294
|
+
- `fr` (French)
|
295
|
+
- `ja` (Japanese)
|
296
|
+
- `pt` (Portuguese)
|
297
|
+
- `tr` (Turkish)
|
298
|
+
- `pl` (Polish)
|
299
|
+
- `ca` (Catalan)
|
300
|
+
- `nl` (Dutch)
|
301
|
+
- `ar` (Arabic)
|
302
|
+
- `sv` (Swedish)
|
303
|
+
- `it` (Italian)
|
304
|
+
- `id` (Indonesian)
|
305
|
+
- `hi` (Hindi)
|
306
|
+
- `fi` (Finnish)
|
307
|
+
- `vi` (Vietnamese)
|
308
|
+
- `he` (Hebrew)
|
309
|
+
- `uk` (Ukrainian)
|
310
|
+
- `el` (Greek)
|
311
|
+
- `ms` (Malay)
|
312
|
+
- `cs` (Czech)
|
313
|
+
- `ro` (Romanian)
|
314
|
+
- `da` (Danish)
|
315
|
+
- `hu` (Hungarian)
|
316
|
+
- `ta` (Tamil)
|
317
|
+
- `no` (Norwegian)
|
318
|
+
- `th` (Thai)
|
319
|
+
- `ur` (Urdu)
|
320
|
+
- `hr` (Croatian)
|
321
|
+
- `bg` (Bulgarian)
|
322
|
+
- `lt` (Lithuanian)
|
323
|
+
- `la` (Latin)
|
324
|
+
- `mi` (Maori)
|
325
|
+
- `ml` (Malayalam)
|
326
|
+
- `cy` (Welsh)
|
327
|
+
- `sk` (Slovak)
|
328
|
+
- `te` (Telugu)
|
329
|
+
- `fa` (Persian)
|
330
|
+
- `lv` (Latvian)
|
331
|
+
- `bn` (Bengali)
|
332
|
+
- `sr` (Serbian)
|
333
|
+
- `az` (Azerbaijani)
|
334
|
+
- `sl` (Slovenian)
|
335
|
+
- `kn` (Kannada)
|
336
|
+
- `et` (Estonian)
|
337
|
+
- `mk` (Macedonian)
|
338
|
+
- `br` (Breton)
|
339
|
+
- `eu` (Basque)
|
340
|
+
- `is` (Icelandic)
|
341
|
+
- `hy` (Armenian)
|
342
|
+
- `ne` (Nepali)
|
343
|
+
- `mn` (Mongolian)
|
344
|
+
- `bs` (Bosnian)
|
345
|
+
- `kk` (Kazakh)
|
346
|
+
- `sq` (Albanian)
|
347
|
+
- `sw` (Swahili)
|
348
|
+
- `gl` (Galician)
|
349
|
+
- `mr` (Marathi)
|
350
|
+
- `pa` (Punjabi)
|
351
|
+
- `si` (Sinhala)
|
352
|
+
- `km` (Khmer)
|
353
|
+
- `sn` (Shona)
|
354
|
+
- `yo` (Yoruba)
|
355
|
+
- `so` (Somali)
|
356
|
+
- `af` (Afrikaans)
|
357
|
+
- `oc` (Occitan)
|
358
|
+
- `ka` (Georgian)
|
359
|
+
- `be` (Belarusian)
|
360
|
+
- `tg` (Tajik)
|
361
|
+
- `sd` (Sindhi)
|
362
|
+
- `gu` (Gujarati)
|
363
|
+
- `am` (Amharic)
|
364
|
+
- `yi` (Yiddish)
|
365
|
+
- `lo` (Lao)
|
366
|
+
- `uz` (Uzbek)
|
367
|
+
- `fo` (Faroese)
|
368
|
+
- `ht` (Haitian Creole)
|
369
|
+
- `ps` (Pashto)
|
370
|
+
- `tk` (Turkmen)
|
371
|
+
- `nn` (Nynorsk)
|
372
|
+
- `mt` (Maltese)
|
373
|
+
- `sa` (Sanskrit)
|
374
|
+
- `lb` (Luxembourgish)
|
375
|
+
- `my` (Myanmar)
|
376
|
+
- `bo` (Tibetan)
|
377
|
+
- `tl` (Tagalog)
|
378
|
+
- `mg` (Malagasy)
|
379
|
+
- `as` (Assamese)
|
380
|
+
- `tt` (Tatar)
|
381
|
+
- `haw` (Hawaiian)
|
382
|
+
- `ln` (Lingala)
|
383
|
+
- `ha` (Hausa)
|
384
|
+
- `ba` (Bashkir)
|
385
|
+
- `jw` (Javanese)
|
386
|
+
- `su` (Sundanese)
|
387
|
+
- `yue` (Cantonese)
|
388
|
+
</Accordion>
|
389
|
+
|
390
|
+
timestamp_granularities : typing.Optional[typing.List[TimestampGranularity]]
|
391
|
+
The timestamp granularities to populate for this transcription. Currently only `word` level timestamps are supported.
|
392
|
+
|
393
|
+
request_options : typing.Optional[RequestOptions]
|
394
|
+
Request-specific configuration.
|
395
|
+
|
396
|
+
Returns
|
397
|
+
-------
|
398
|
+
TranscriptionResponse
|
399
|
+
|
400
|
+
Examples
|
401
|
+
--------
|
402
|
+
import asyncio
|
403
|
+
|
404
|
+
from cartesia import AsyncCartesia
|
405
|
+
|
406
|
+
client = AsyncCartesia(
|
407
|
+
api_key="YOUR_API_KEY",
|
408
|
+
)
|
409
|
+
|
410
|
+
|
411
|
+
async def main() -> None:
|
412
|
+
await client.stt.transcribe(
|
413
|
+
model="ink-whisper",
|
414
|
+
language="en",
|
415
|
+
)
|
416
|
+
|
417
|
+
|
418
|
+
asyncio.run(main())
|
419
|
+
"""
|
420
|
+
_response = await self._client_wrapper.httpx_client.request(
|
421
|
+
"stt",
|
422
|
+
method="POST",
|
423
|
+
params={
|
424
|
+
"encoding": encoding,
|
425
|
+
"sample_rate": sample_rate,
|
426
|
+
},
|
427
|
+
data={
|
428
|
+
"model": model,
|
429
|
+
"language": language,
|
430
|
+
"timestamp_granularities[]": timestamp_granularities,
|
431
|
+
},
|
432
|
+
files={
|
433
|
+
"file": file,
|
434
|
+
},
|
435
|
+
request_options=request_options,
|
436
|
+
omit=OMIT,
|
437
|
+
)
|
438
|
+
try:
|
439
|
+
if 200 <= _response.status_code < 300:
|
440
|
+
return typing.cast(
|
441
|
+
TranscriptionResponse,
|
442
|
+
parse_obj_as(
|
443
|
+
type_=TranscriptionResponse, # type: ignore
|
444
|
+
object_=_response.json(),
|
445
|
+
),
|
446
|
+
)
|
447
|
+
_response_json = _response.json()
|
448
|
+
except JSONDecodeError:
|
449
|
+
raise ApiError(status_code=_response.status_code, body=_response.text)
|
450
|
+
raise ApiError(status_code=_response.status_code, body=_response_json)
|
@@ -12,6 +12,7 @@ from .streaming_transcription_response import (
|
|
12
12
|
)
|
13
13
|
from .transcript_message import TranscriptMessageParams
|
14
14
|
from .transcription_response import TranscriptionResponseParams
|
15
|
+
from .transcription_word import TranscriptionWordParams
|
15
16
|
|
16
17
|
__all__ = [
|
17
18
|
"DoneMessageParams",
|
@@ -24,4 +25,5 @@ __all__ = [
|
|
24
25
|
"StreamingTranscriptionResponse_TranscriptParams",
|
25
26
|
"TranscriptMessageParams",
|
26
27
|
"TranscriptionResponseParams",
|
28
|
+
"TranscriptionWordParams",
|
27
29
|
]
|
@@ -4,6 +4,7 @@ from __future__ import annotations
|
|
4
4
|
import typing_extensions
|
5
5
|
import typing
|
6
6
|
import typing_extensions
|
7
|
+
from .transcription_word import TranscriptionWordParams
|
7
8
|
|
8
9
|
|
9
10
|
class StreamingTranscriptionResponse_TranscriptParams(typing_extensions.TypedDict):
|
@@ -13,6 +14,7 @@ class StreamingTranscriptionResponse_TranscriptParams(typing_extensions.TypedDic
|
|
13
14
|
is_final: bool
|
14
15
|
duration: typing_extensions.NotRequired[float]
|
15
16
|
language: typing_extensions.NotRequired[str]
|
17
|
+
words: typing_extensions.NotRequired[typing.Sequence[TranscriptionWordParams]]
|
16
18
|
|
17
19
|
|
18
20
|
class StreamingTranscriptionResponse_FlushDoneParams(typing_extensions.TypedDict):
|
@@ -2,6 +2,8 @@
|
|
2
2
|
|
3
3
|
import typing_extensions
|
4
4
|
import typing_extensions
|
5
|
+
import typing
|
6
|
+
from .transcription_word import TranscriptionWordParams
|
5
7
|
|
6
8
|
|
7
9
|
class TranscriptMessageParams(typing_extensions.TypedDict):
|
@@ -29,5 +31,10 @@ class TranscriptMessageParams(typing_extensions.TypedDict):
|
|
29
31
|
|
30
32
|
language: typing_extensions.NotRequired[str]
|
31
33
|
"""
|
32
|
-
The
|
34
|
+
The specified language of the input audio.
|
35
|
+
"""
|
36
|
+
|
37
|
+
words: typing_extensions.NotRequired[typing.Sequence[TranscriptionWordParams]]
|
38
|
+
"""
|
39
|
+
Word-level timestamps showing the start and end time of each word in seconds. Always included in streaming responses.
|
33
40
|
"""
|
@@ -2,6 +2,8 @@
|
|
2
2
|
|
3
3
|
import typing_extensions
|
4
4
|
import typing_extensions
|
5
|
+
import typing
|
6
|
+
from .transcription_word import TranscriptionWordParams
|
5
7
|
|
6
8
|
|
7
9
|
class TranscriptionResponseParams(typing_extensions.TypedDict):
|
@@ -12,10 +14,15 @@ class TranscriptionResponseParams(typing_extensions.TypedDict):
|
|
12
14
|
|
13
15
|
language: typing_extensions.NotRequired[str]
|
14
16
|
"""
|
15
|
-
The
|
17
|
+
The specified language of the input audio.
|
16
18
|
"""
|
17
19
|
|
18
20
|
duration: typing_extensions.NotRequired[float]
|
19
21
|
"""
|
20
22
|
The duration of the input audio in seconds.
|
21
23
|
"""
|
24
|
+
|
25
|
+
words: typing_extensions.NotRequired[typing.Sequence[TranscriptionWordParams]]
|
26
|
+
"""
|
27
|
+
Word-level timestamps showing the start and end time of each word. Only included when `[word]` is passed into `timestamp_granularities[]`.
|
28
|
+
"""
|