cartesia 2.0.4__py3-none-any.whl → 2.0.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cartesia/__init__.py +60 -1
- cartesia/auth/client.py +8 -8
- cartesia/auth/requests/token_grant.py +7 -1
- cartesia/auth/requests/token_request.py +3 -3
- cartesia/auth/types/token_grant.py +7 -2
- cartesia/auth/types/token_request.py +3 -3
- cartesia/base_client.py +2 -0
- cartesia/client.py +5 -0
- cartesia/core/client_wrapper.py +1 -1
- cartesia/stt/__init__.py +57 -0
- cartesia/stt/_async_websocket.py +293 -0
- cartesia/stt/_websocket.py +294 -0
- cartesia/stt/client.py +456 -0
- cartesia/stt/requests/__init__.py +29 -0
- cartesia/stt/requests/done_message.py +14 -0
- cartesia/stt/requests/error_message.py +16 -0
- cartesia/stt/requests/flush_done_message.py +14 -0
- cartesia/stt/requests/streaming_transcription_response.py +41 -0
- cartesia/stt/requests/transcript_message.py +40 -0
- cartesia/stt/requests/transcription_response.py +28 -0
- cartesia/stt/requests/transcription_word.py +20 -0
- cartesia/stt/socket_client.py +138 -0
- cartesia/stt/types/__init__.py +33 -0
- cartesia/stt/types/done_message.py +26 -0
- cartesia/stt/types/error_message.py +27 -0
- cartesia/stt/types/flush_done_message.py +26 -0
- cartesia/stt/types/streaming_transcription_response.py +94 -0
- cartesia/stt/types/stt_encoding.py +7 -0
- cartesia/stt/types/timestamp_granularity.py +5 -0
- cartesia/stt/types/transcript_message.py +50 -0
- cartesia/stt/types/transcription_response.py +38 -0
- cartesia/stt/types/transcription_word.py +32 -0
- cartesia/tts/__init__.py +8 -0
- cartesia/tts/client.py +50 -8
- cartesia/tts/requests/__init__.py +4 -0
- cartesia/tts/requests/generation_request.py +4 -4
- cartesia/tts/requests/sse_output_format.py +11 -0
- cartesia/tts/requests/ttssse_request.py +47 -0
- cartesia/tts/requests/web_socket_chunk_response.py +0 -3
- cartesia/tts/requests/web_socket_response.py +1 -2
- cartesia/tts/requests/web_socket_tts_request.py +9 -1
- cartesia/tts/types/__init__.py +4 -0
- cartesia/tts/types/generation_request.py +4 -4
- cartesia/tts/types/sse_output_format.py +22 -0
- cartesia/tts/types/ttssse_request.py +58 -0
- cartesia/tts/types/web_socket_chunk_response.py +1 -3
- cartesia/tts/types/web_socket_response.py +1 -2
- cartesia/tts/types/web_socket_tts_request.py +11 -3
- cartesia/voice_changer/requests/streaming_response.py +0 -2
- cartesia/voice_changer/types/streaming_response.py +0 -2
- {cartesia-2.0.4.dist-info → cartesia-2.0.6.dist-info}/METADATA +256 -2
- {cartesia-2.0.4.dist-info → cartesia-2.0.6.dist-info}/RECORD +53 -26
- {cartesia-2.0.4.dist-info → cartesia-2.0.6.dist-info}/WHEEL +0 -0
cartesia/stt/client.py
ADDED
@@ -0,0 +1,456 @@
|
|
1
|
+
# This file was auto-generated by Fern from our API Definition.
|
2
|
+
|
3
|
+
import typing
|
4
|
+
from ..core.client_wrapper import SyncClientWrapper
|
5
|
+
from .. import core
|
6
|
+
from .types.stt_encoding import SttEncoding
|
7
|
+
from .types.timestamp_granularity import TimestampGranularity
|
8
|
+
from ..core.request_options import RequestOptions
|
9
|
+
from .types.transcription_response import TranscriptionResponse
|
10
|
+
from ..core.pydantic_utilities import parse_obj_as
|
11
|
+
from json.decoder import JSONDecodeError
|
12
|
+
from ..core.api_error import ApiError
|
13
|
+
from ..core.client_wrapper import AsyncClientWrapper
|
14
|
+
|
15
|
+
# Sentinel used as the default value for optional parameters: the Ellipsis
# object typed as Any. It is also handed to the HTTP client as `omit=OMIT`.
# NOTE(review): presumably the client drops fields still equal to OMIT before
# sending, which would keep "not supplied" distinct from an explicit None —
# confirm against the core http client.
OMIT = typing.cast(typing.Any, ...)
|
17
|
+
|
18
|
+
|
19
|
+
class SttClient:
    """Synchronous client for the batch speech-to-text (`stt`) endpoint."""

    def __init__(self, *, client_wrapper: SyncClientWrapper):
        # Wrapper exposing the `httpx_client` that `transcribe` sends requests through.
        self._client_wrapper = client_wrapper

    def transcribe(
        self,
        *,
        file: core.File,
        model: str,
        encoding: typing.Optional[SttEncoding] = None,
        sample_rate: typing.Optional[int] = None,
        language: typing.Optional[str] = OMIT,
        timestamp_granularities: typing.Optional[typing.List[TimestampGranularity]] = OMIT,
        request_options: typing.Optional[RequestOptions] = None,
    ) -> TranscriptionResponse:
        """
        Transcribes audio files into text using Cartesia's Speech-to-Text API.

        Upload an audio file and receive a complete transcription response. Supports arbitrarily long audio files with automatic intelligent chunking for longer audio.

        **Supported audio formats:** flac, m4a, mp3, mp4, mpeg, mpga, oga, ogg, wav, webm

        **Response format:** Returns JSON with transcribed text, duration, and language. Include `timestamp_granularities: ["word"]` to get word-level timestamps.

        **Pricing:** Batch transcription is priced at **1 credit per 2 seconds** of audio processed.

        Note: for migrating from the OpenAI SDK, see our [OpenAI Whisper to Cartesia Ink Migration Guide](/api-reference/stt/migrate-from-open-ai).

        The request is a multipart `POST` to `stt`: `encoding` and `sample_rate` travel as query
        parameters, `model`, `language` and `timestamp_granularities[]` as form fields, and `file`
        as the uploaded part.

        Parameters
        ----------
        file : core.File
            The audio to transcribe (required). See core.File for more documentation.

        model : str
            ID of the model to use for transcription. Use `ink-whisper` for the latest Cartesia Whisper model.

        encoding : typing.Optional[SttEncoding]
            The encoding format to process the audio as. If not specified, the audio file will be decoded automatically.

            **Supported formats:**
            - `pcm_s16le` - 16-bit signed integer PCM, little-endian (recommended for best performance)
            - `pcm_s32le` - 32-bit signed integer PCM, little-endian
            - `pcm_f16le` - 16-bit floating point PCM, little-endian
            - `pcm_f32le` - 32-bit floating point PCM, little-endian
            - `pcm_mulaw` - 8-bit μ-law encoded PCM
            - `pcm_alaw` - 8-bit A-law encoded PCM

        sample_rate : typing.Optional[int]
            The sample rate of the audio in Hz.

        language : typing.Optional[str]
            The language of the input audio in ISO-639-1 format. Defaults to `en`.
            (A few listed codes, e.g. `haw` and `yue`, are three letters long.)

            Supported languages:
            `en` (English), `zh` (Chinese), `de` (German), `es` (Spanish), `ru` (Russian),
            `ko` (Korean), `fr` (French), `ja` (Japanese), `pt` (Portuguese), `tr` (Turkish),
            `pl` (Polish), `ca` (Catalan), `nl` (Dutch), `ar` (Arabic), `sv` (Swedish),
            `it` (Italian), `id` (Indonesian), `hi` (Hindi), `fi` (Finnish), `vi` (Vietnamese),
            `he` (Hebrew), `uk` (Ukrainian), `el` (Greek), `ms` (Malay), `cs` (Czech),
            `ro` (Romanian), `da` (Danish), `hu` (Hungarian), `ta` (Tamil), `no` (Norwegian),
            `th` (Thai), `ur` (Urdu), `hr` (Croatian), `bg` (Bulgarian), `lt` (Lithuanian),
            `la` (Latin), `mi` (Maori), `ml` (Malayalam), `cy` (Welsh), `sk` (Slovak),
            `te` (Telugu), `fa` (Persian), `lv` (Latvian), `bn` (Bengali), `sr` (Serbian),
            `az` (Azerbaijani), `sl` (Slovenian), `kn` (Kannada), `et` (Estonian), `mk` (Macedonian),
            `br` (Breton), `eu` (Basque), `is` (Icelandic), `hy` (Armenian), `ne` (Nepali),
            `mn` (Mongolian), `bs` (Bosnian), `kk` (Kazakh), `sq` (Albanian), `sw` (Swahili),
            `gl` (Galician), `mr` (Marathi), `pa` (Punjabi), `si` (Sinhala), `km` (Khmer),
            `sn` (Shona), `yo` (Yoruba), `so` (Somali), `af` (Afrikaans), `oc` (Occitan),
            `ka` (Georgian), `be` (Belarusian), `tg` (Tajik), `sd` (Sindhi), `gu` (Gujarati),
            `am` (Amharic), `yi` (Yiddish), `lo` (Lao), `uz` (Uzbek), `fo` (Faroese),
            `ht` (Haitian Creole), `ps` (Pashto), `tk` (Turkmen), `nn` (Nynorsk), `mt` (Maltese),
            `sa` (Sanskrit), `lb` (Luxembourgish), `my` (Myanmar), `bo` (Tibetan), `tl` (Tagalog),
            `mg` (Malagasy), `as` (Assamese), `tt` (Tatar), `haw` (Hawaiian), `ln` (Lingala),
            `ha` (Hausa), `ba` (Bashkir), `jw` (Javanese), `su` (Sundanese), `yue` (Cantonese)

        timestamp_granularities : typing.Optional[typing.List[TimestampGranularity]]
            The timestamp granularities to populate for this transcription. Currently only `word` level timestamps are supported.

        request_options : typing.Optional[RequestOptions]
            Request-specific configuration.

        Returns
        -------
        TranscriptionResponse

        Raises
        ------
        ApiError
            If the response status is outside 200-299 (body is the decoded JSON), or if the
            response body is not valid JSON (body is the raw response text).

        Examples
        --------
        from cartesia import Cartesia

        client = Cartesia(
            api_key="YOUR_API_KEY",
        )
        # `file` is required; a binary file handle is shown here - see core.File
        # for the other accepted forms.
        with open("path/to/audio.wav", "rb") as audio_file:
            client.stt.transcribe(
                file=audio_file,
                model="ink-whisper",
                language="en",
            )
        """
        query = {
            "encoding": encoding,
            "sample_rate": sample_rate,
        }
        form_fields = {
            "model": model,
            "language": language,
            "timestamp_granularities[]": timestamp_granularities,
        }
        upload = {
            "file": file,
        }
        http_response = self._client_wrapper.httpx_client.request(
            "stt",
            method="POST",
            params=query,
            data=form_fields,
            files=upload,
            request_options=request_options,
            omit=OMIT,
        )
        status = http_response.status_code
        try:
            # Both the success and the error path need the decoded body, so decode once.
            payload = http_response.json()
            if 200 <= status < 300:
                return typing.cast(
                    TranscriptionResponse,
                    parse_obj_as(
                        type_=TranscriptionResponse,  # type: ignore
                        object_=payload,
                    ),
                )
        except JSONDecodeError:
            # Body was not JSON: surface the raw text instead.
            raise ApiError(status_code=status, body=http_response.text)
        raise ApiError(status_code=status, body=payload)
|
233
|
+
|
234
|
+
|
235
|
+
class AsyncSttClient:
    """Asynchronous client for the batch speech-to-text (`stt`) endpoint."""

    def __init__(self, *, client_wrapper: AsyncClientWrapper):
        # Wrapper exposing the `httpx_client` that `transcribe` awaits requests on.
        self._client_wrapper = client_wrapper

    async def transcribe(
        self,
        *,
        file: core.File,
        model: str,
        encoding: typing.Optional[SttEncoding] = None,
        sample_rate: typing.Optional[int] = None,
        language: typing.Optional[str] = OMIT,
        timestamp_granularities: typing.Optional[typing.List[TimestampGranularity]] = OMIT,
        request_options: typing.Optional[RequestOptions] = None,
    ) -> TranscriptionResponse:
        """
        Transcribes audio files into text using Cartesia's Speech-to-Text API.

        Upload an audio file and receive a complete transcription response. Supports arbitrarily long audio files with automatic intelligent chunking for longer audio.

        **Supported audio formats:** flac, m4a, mp3, mp4, mpeg, mpga, oga, ogg, wav, webm

        **Response format:** Returns JSON with transcribed text, duration, and language. Include `timestamp_granularities: ["word"]` to get word-level timestamps.

        **Pricing:** Batch transcription is priced at **1 credit per 2 seconds** of audio processed.

        Note: for migrating from the OpenAI SDK, see our [OpenAI Whisper to Cartesia Ink Migration Guide](/api-reference/stt/migrate-from-open-ai).

        The request is a multipart `POST` to `stt`: `encoding` and `sample_rate` travel as query
        parameters, `model`, `language` and `timestamp_granularities[]` as form fields, and `file`
        as the uploaded part.

        Parameters
        ----------
        file : core.File
            The audio to transcribe (required). See core.File for more documentation.

        model : str
            ID of the model to use for transcription. Use `ink-whisper` for the latest Cartesia Whisper model.

        encoding : typing.Optional[SttEncoding]
            The encoding format to process the audio as. If not specified, the audio file will be decoded automatically.

            **Supported formats:**
            - `pcm_s16le` - 16-bit signed integer PCM, little-endian (recommended for best performance)
            - `pcm_s32le` - 32-bit signed integer PCM, little-endian
            - `pcm_f16le` - 16-bit floating point PCM, little-endian
            - `pcm_f32le` - 32-bit floating point PCM, little-endian
            - `pcm_mulaw` - 8-bit μ-law encoded PCM
            - `pcm_alaw` - 8-bit A-law encoded PCM

        sample_rate : typing.Optional[int]
            The sample rate of the audio in Hz.

        language : typing.Optional[str]
            The language of the input audio in ISO-639-1 format. Defaults to `en`.
            (A few listed codes, e.g. `haw` and `yue`, are three letters long.)

            Supported languages:
            `en` (English), `zh` (Chinese), `de` (German), `es` (Spanish), `ru` (Russian),
            `ko` (Korean), `fr` (French), `ja` (Japanese), `pt` (Portuguese), `tr` (Turkish),
            `pl` (Polish), `ca` (Catalan), `nl` (Dutch), `ar` (Arabic), `sv` (Swedish),
            `it` (Italian), `id` (Indonesian), `hi` (Hindi), `fi` (Finnish), `vi` (Vietnamese),
            `he` (Hebrew), `uk` (Ukrainian), `el` (Greek), `ms` (Malay), `cs` (Czech),
            `ro` (Romanian), `da` (Danish), `hu` (Hungarian), `ta` (Tamil), `no` (Norwegian),
            `th` (Thai), `ur` (Urdu), `hr` (Croatian), `bg` (Bulgarian), `lt` (Lithuanian),
            `la` (Latin), `mi` (Maori), `ml` (Malayalam), `cy` (Welsh), `sk` (Slovak),
            `te` (Telugu), `fa` (Persian), `lv` (Latvian), `bn` (Bengali), `sr` (Serbian),
            `az` (Azerbaijani), `sl` (Slovenian), `kn` (Kannada), `et` (Estonian), `mk` (Macedonian),
            `br` (Breton), `eu` (Basque), `is` (Icelandic), `hy` (Armenian), `ne` (Nepali),
            `mn` (Mongolian), `bs` (Bosnian), `kk` (Kazakh), `sq` (Albanian), `sw` (Swahili),
            `gl` (Galician), `mr` (Marathi), `pa` (Punjabi), `si` (Sinhala), `km` (Khmer),
            `sn` (Shona), `yo` (Yoruba), `so` (Somali), `af` (Afrikaans), `oc` (Occitan),
            `ka` (Georgian), `be` (Belarusian), `tg` (Tajik), `sd` (Sindhi), `gu` (Gujarati),
            `am` (Amharic), `yi` (Yiddish), `lo` (Lao), `uz` (Uzbek), `fo` (Faroese),
            `ht` (Haitian Creole), `ps` (Pashto), `tk` (Turkmen), `nn` (Nynorsk), `mt` (Maltese),
            `sa` (Sanskrit), `lb` (Luxembourgish), `my` (Myanmar), `bo` (Tibetan), `tl` (Tagalog),
            `mg` (Malagasy), `as` (Assamese), `tt` (Tatar), `haw` (Hawaiian), `ln` (Lingala),
            `ha` (Hausa), `ba` (Bashkir), `jw` (Javanese), `su` (Sundanese), `yue` (Cantonese)

        timestamp_granularities : typing.Optional[typing.List[TimestampGranularity]]
            The timestamp granularities to populate for this transcription. Currently only `word` level timestamps are supported.

        request_options : typing.Optional[RequestOptions]
            Request-specific configuration.

        Returns
        -------
        TranscriptionResponse

        Raises
        ------
        ApiError
            If the response status is outside 200-299 (body is the decoded JSON), or if the
            response body is not valid JSON (body is the raw response text).

        Examples
        --------
        import asyncio

        from cartesia import AsyncCartesia

        client = AsyncCartesia(
            api_key="YOUR_API_KEY",
        )


        async def main() -> None:
            # `file` is required; a binary file handle is shown here - see
            # core.File for the other accepted forms.
            with open("path/to/audio.wav", "rb") as audio_file:
                await client.stt.transcribe(
                    file=audio_file,
                    model="ink-whisper",
                    language="en",
                )


        asyncio.run(main())
        """
        query = {
            "encoding": encoding,
            "sample_rate": sample_rate,
        }
        form_fields = {
            "model": model,
            "language": language,
            "timestamp_granularities[]": timestamp_granularities,
        }
        upload = {
            "file": file,
        }
        http_response = await self._client_wrapper.httpx_client.request(
            "stt",
            method="POST",
            params=query,
            data=form_fields,
            files=upload,
            request_options=request_options,
            omit=OMIT,
        )
        status = http_response.status_code
        try:
            # Both the success and the error path need the decoded body, so decode once.
            payload = http_response.json()
            if 200 <= status < 300:
                return typing.cast(
                    TranscriptionResponse,
                    parse_obj_as(
                        type_=TranscriptionResponse,  # type: ignore
                        object_=payload,
                    ),
                )
        except JSONDecodeError:
            # Body was not JSON: surface the raw text instead.
            raise ApiError(status_code=status, body=http_response.text)
        raise ApiError(status_code=status, body=payload)
|
@@ -0,0 +1,29 @@
|
|
1
|
+
# This file was auto-generated by Fern from our API Definition.
|
2
|
+
|
3
|
+
from .done_message import DoneMessageParams
|
4
|
+
from .error_message import ErrorMessageParams
|
5
|
+
from .flush_done_message import FlushDoneMessageParams
|
6
|
+
from .streaming_transcription_response import (
|
7
|
+
StreamingTranscriptionResponseParams,
|
8
|
+
StreamingTranscriptionResponse_DoneParams,
|
9
|
+
StreamingTranscriptionResponse_ErrorParams,
|
10
|
+
StreamingTranscriptionResponse_FlushDoneParams,
|
11
|
+
StreamingTranscriptionResponse_TranscriptParams,
|
12
|
+
)
|
13
|
+
from .transcript_message import TranscriptMessageParams
|
14
|
+
from .transcription_response import TranscriptionResponseParams
|
15
|
+
from .transcription_word import TranscriptionWordParams
|
16
|
+
|
17
|
+
# Public API of this package: every `*Params` name imported above from the
# sibling request modules is re-exported here.
__all__ = [
    "DoneMessageParams",
    "ErrorMessageParams",
    "FlushDoneMessageParams",
    "StreamingTranscriptionResponseParams",
    "StreamingTranscriptionResponse_DoneParams",
    "StreamingTranscriptionResponse_ErrorParams",
    "StreamingTranscriptionResponse_FlushDoneParams",
    "StreamingTranscriptionResponse_TranscriptParams",
    "TranscriptMessageParams",
    "TranscriptionResponseParams",
    "TranscriptionWordParams",
]
|
@@ -0,0 +1,14 @@
|
|
1
|
+
# This file was auto-generated by Fern from our API Definition.
|
2
|
+
|
3
|
+
import typing_extensions
|
4
|
+
|
5
|
+
|
6
|
+
class DoneMessageParams(typing_extensions.TypedDict):
    """
    Acknowledgment message sent in response to a `done` command, indicating that the session is complete and the WebSocket will close.

    Dict shape: a single required key, `request_id`.
    """

    request_id: str
    """
    Unique identifier for this transcription session.
    """
|
@@ -0,0 +1,16 @@
|
|
1
|
+
# This file was auto-generated by Fern from our API Definition.
|
2
|
+
|
3
|
+
import typing_extensions
|
4
|
+
import typing_extensions
|
5
|
+
|
6
|
+
|
7
|
+
class ErrorMessageParams(typing_extensions.TypedDict):
    """
    Error message payload.

    Dict shape: `message` is required; `request_id` may be absent.
    """

    request_id: typing_extensions.NotRequired[str]
    """
    The request ID associated with the error, if applicable.
    """

    message: str
    """
    Human-readable error message describing what went wrong.
    """
|
@@ -0,0 +1,14 @@
|
|
1
|
+
# This file was auto-generated by Fern from our API Definition.
|
2
|
+
|
3
|
+
import typing_extensions
|
4
|
+
|
5
|
+
|
6
|
+
class FlushDoneMessageParams(typing_extensions.TypedDict):
    """
    Acknowledgment message sent in response to a `finalize` command, indicating that all buffered audio has been flushed and processed.

    Dict shape: a single required key, `request_id`.
    """

    request_id: str
    """
    Unique identifier for this transcription session.
    """
|
@@ -0,0 +1,41 @@
|
|
1
|
+
# This file was auto-generated by Fern from our API Definition.
|
2
|
+
|
3
|
+
from __future__ import annotations
|
4
|
+
import typing_extensions
|
5
|
+
import typing
|
6
|
+
import typing_extensions
|
7
|
+
from .transcription_word import TranscriptionWordParams
|
8
|
+
|
9
|
+
|
10
|
+
class StreamingTranscriptionResponse_TranscriptParams(typing_extensions.TypedDict):
    """
    Streaming response variant tagged `type == "transcript"`.

    `type`, `request_id`, `text` and `is_final` are required; `duration`,
    `language` and `words` may be absent.
    """

    type: typing.Literal["transcript"]
    request_id: str
    text: str
    is_final: bool
    duration: typing_extensions.NotRequired[float]
    language: typing_extensions.NotRequired[str]
    words: typing_extensions.NotRequired[typing.Sequence[TranscriptionWordParams]]
|
18
|
+
|
19
|
+
|
20
|
+
class StreamingTranscriptionResponse_FlushDoneParams(typing_extensions.TypedDict):
    """Streaming response variant tagged `type == "flush_done"`; both keys are required."""

    type: typing.Literal["flush_done"]
    request_id: str
|
23
|
+
|
24
|
+
|
25
|
+
class StreamingTranscriptionResponse_DoneParams(typing_extensions.TypedDict):
    """Streaming response variant tagged `type == "done"`; both keys are required."""

    type: typing.Literal["done"]
    request_id: str
|
28
|
+
|
29
|
+
|
30
|
+
class StreamingTranscriptionResponse_ErrorParams(typing_extensions.TypedDict):
    """
    Streaming response variant tagged `type == "error"`.

    `type` and `message` are required; `request_id` may be absent.
    """

    type: typing.Literal["error"]
    request_id: typing_extensions.NotRequired[str]
    message: str
|
34
|
+
|
35
|
+
|
36
|
+
# Union of the four streaming message variants. Each variant carries a distinct
# `type` literal ("transcript", "flush_done", "done", "error"), so `type` can be
# used to tell them apart.
StreamingTranscriptionResponseParams = typing.Union[
    StreamingTranscriptionResponse_TranscriptParams,
    StreamingTranscriptionResponse_FlushDoneParams,
    StreamingTranscriptionResponse_DoneParams,
    StreamingTranscriptionResponse_ErrorParams,
]
|
@@ -0,0 +1,40 @@
|
|
1
|
+
# This file was auto-generated by Fern from our API Definition.
|
2
|
+
|
3
|
+
import typing_extensions
|
4
|
+
import typing_extensions
|
5
|
+
import typing
|
6
|
+
from .transcription_word import TranscriptionWordParams
|
7
|
+
|
8
|
+
|
9
|
+
class TranscriptMessageParams(typing_extensions.TypedDict):
    """
    Transcript message for a transcription session, either interim or final (see `is_final`).

    `request_id`, `text` and `is_final` are required; `duration`, `language`
    and `words` may be absent.
    """

    request_id: str
    """
    Unique identifier for this transcription session.
    """

    text: str
    """
    The transcribed text. May be partial or final depending on is_final.

    **Note**: Text may be empty in initial responses while the system accumulates sufficient audio for transcription. This is normal behavior - wait for responses with non-empty text or monitor is_final for completion status.
    """

    is_final: bool
    """
    Whether this is a final transcription result or an interim result.
    """

    duration: typing_extensions.NotRequired[float]
    """
    The duration of the audio transcribed so far, in seconds.
    """

    language: typing_extensions.NotRequired[str]
    """
    The specified language of the input audio.
    """

    words: typing_extensions.NotRequired[typing.Sequence[TranscriptionWordParams]]
    """
    Word-level timestamps showing the start and end time of each word in seconds. Always included in streaming responses.
    """
|
@@ -0,0 +1,28 @@
|
|
1
|
+
# This file was auto-generated by Fern from our API Definition.
|
2
|
+
|
3
|
+
import typing_extensions
|
4
|
+
import typing_extensions
|
5
|
+
import typing
|
6
|
+
from .transcription_word import TranscriptionWordParams
|
7
|
+
|
8
|
+
|
9
|
+
class TranscriptionResponseParams(typing_extensions.TypedDict):
    """
    Complete transcription result.

    Only `text` is required; `language`, `duration` and `words` may be absent.
    """

    text: str
    """
    The transcribed text.
    """

    language: typing_extensions.NotRequired[str]
    """
    The specified language of the input audio.
    """

    duration: typing_extensions.NotRequired[float]
    """
    The duration of the input audio in seconds.
    """

    words: typing_extensions.NotRequired[typing.Sequence[TranscriptionWordParams]]
    """
    Word-level timestamps showing the start and end time of each word. Only included when `[word]` is passed into `timestamp_granularities[]`.
    """
|
@@ -0,0 +1,20 @@
|
|
1
|
+
# This file was auto-generated by Fern from our API Definition.
|
2
|
+
|
3
|
+
import typing_extensions
|
4
|
+
|
5
|
+
|
6
|
+
class TranscriptionWordParams(typing_extensions.TypedDict):
    """
    One transcribed word together with its start and end time in seconds.

    All three keys are required.
    """

    word: str
    """
    The transcribed word.
    """

    start: float
    """
    Start time of the word in seconds.
    """

    end: float
    """
    End time of the word in seconds.
    """
|