cartesia-2.0.5-py3-none-any.whl → cartesia-2.0.7-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55)
  1. cartesia/__init__.py +22 -0
  2. cartesia/auth/client.py +8 -8
  3. cartesia/auth/requests/token_grant.py +7 -1
  4. cartesia/auth/requests/token_request.py +3 -3
  5. cartesia/auth/types/token_grant.py +7 -2
  6. cartesia/auth/types/token_request.py +3 -3
  7. cartesia/core/client_wrapper.py +1 -1
  8. cartesia/infill/client.py +0 -8
  9. cartesia/stt/__init__.py +6 -0
  10. cartesia/stt/_async_websocket.py +81 -72
  11. cartesia/stt/_websocket.py +42 -20
  12. cartesia/stt/client.py +450 -0
  13. cartesia/stt/requests/__init__.py +2 -0
  14. cartesia/stt/requests/streaming_transcription_response.py +2 -0
  15. cartesia/stt/requests/transcript_message.py +8 -1
  16. cartesia/stt/requests/transcription_response.py +8 -1
  17. cartesia/stt/requests/transcription_word.py +20 -0
  18. cartesia/stt/socket_client.py +52 -109
  19. cartesia/stt/types/__init__.py +4 -0
  20. cartesia/stt/types/streaming_transcription_response.py +2 -0
  21. cartesia/stt/types/stt_encoding.py +3 -1
  22. cartesia/stt/types/timestamp_granularity.py +5 -0
  23. cartesia/stt/types/transcript_message.py +7 -1
  24. cartesia/stt/types/transcription_response.py +7 -1
  25. cartesia/stt/types/transcription_word.py +32 -0
  26. cartesia/tts/__init__.py +16 -0
  27. cartesia/tts/client.py +63 -8
  28. cartesia/tts/requests/__init__.py +8 -0
  29. cartesia/tts/requests/experimental_model_controls.py +17 -0
  30. cartesia/tts/requests/generation_config.py +23 -0
  31. cartesia/tts/requests/generation_request.py +4 -4
  32. cartesia/tts/requests/sse_output_format.py +11 -0
  33. cartesia/tts/requests/tts_request.py +2 -0
  34. cartesia/tts/requests/ttssse_request.py +47 -0
  35. cartesia/tts/requests/web_socket_chunk_response.py +0 -3
  36. cartesia/tts/requests/web_socket_response.py +1 -2
  37. cartesia/tts/requests/web_socket_tts_request.py +9 -1
  38. cartesia/tts/types/__init__.py +8 -0
  39. cartesia/tts/types/experimental_model_controls.py +28 -0
  40. cartesia/tts/types/generation_config.py +34 -0
  41. cartesia/tts/types/generation_request.py +4 -4
  42. cartesia/tts/types/sse_output_format.py +22 -0
  43. cartesia/tts/types/tts_request.py +2 -0
  44. cartesia/tts/types/ttssse_request.py +58 -0
  45. cartesia/tts/types/web_socket_chunk_response.py +1 -3
  46. cartesia/tts/types/web_socket_response.py +1 -2
  47. cartesia/tts/types/web_socket_tts_request.py +11 -3
  48. cartesia/voice_changer/client.py +0 -8
  49. cartesia/voice_changer/requests/streaming_response.py +0 -2
  50. cartesia/voice_changer/types/streaming_response.py +0 -2
  51. cartesia/voices/client.py +0 -12
  52. cartesia-2.0.7.dist-info/LICENSE +201 -0
  53. {cartesia-2.0.5.dist-info → cartesia-2.0.7.dist-info}/METADATA +116 -17
  54. {cartesia-2.0.5.dist-info → cartesia-2.0.7.dist-info}/RECORD +55 -42
  55. {cartesia-2.0.5.dist-info → cartesia-2.0.7.dist-info}/WHEEL +1 -1
@@ -14,6 +14,7 @@ from cartesia.stt.types import (
14
14
  StreamingTranscriptionResponse_Error,
15
15
  StreamingTranscriptionResponse_Transcript,
16
16
  )
17
+ from cartesia.stt.types.stt_encoding import SttEncoding
17
18
 
18
19
  from ..core.pydantic_utilities import parse_obj_as
19
20
 
@@ -45,8 +46,10 @@ class SttWebsocket:
45
46
  # Store default connection parameters for auto-connect with proper typing
46
47
  self._default_model: str = "ink-whisper"
47
48
  self._default_language: Optional[str] = "en"
48
- self._default_encoding: Optional[str] = "pcm_s16le"
49
+ self._default_encoding: SttEncoding = "pcm_s16le"
49
50
  self._default_sample_rate: int = 16000
51
+ self._default_min_volume: Optional[float] = None
52
+ self._default_max_silence_duration_secs: Optional[float] = None
50
53
 
51
54
  def __del__(self):
52
55
  try:
@@ -59,16 +62,20 @@ class SttWebsocket:
59
62
  *,
60
63
  model: str = "ink-whisper",
61
64
  language: Optional[str] = "en",
62
- encoding: Optional[str] = "pcm_s16le",
65
+ encoding: SttEncoding = "pcm_s16le",
63
66
  sample_rate: int = 16000,
67
+ min_volume: Optional[float] = None,
68
+ max_silence_duration_secs: Optional[float] = None,
64
69
  ):
65
70
  """Connect to the STT WebSocket with the specified parameters.
66
71
 
67
72
  Args:
68
73
  model: ID of the model to use for transcription
69
74
  language: The language of the input audio in ISO-639-1 format
70
- encoding: The encoding format of the audio data
71
- sample_rate: The sample rate of the audio in Hz
75
+ encoding: The encoding format of the audio data (required)
76
+ sample_rate: The sample rate of the audio in Hz (required)
77
+ min_volume: Volume threshold for voice activity detection (0.0-1.0)
78
+ max_silence_duration_secs: Maximum duration of silence before endpointing
72
79
 
73
80
  Raises:
74
81
  RuntimeError: If the connection to the WebSocket fails.
@@ -78,6 +85,8 @@ class SttWebsocket:
78
85
  self._default_language = language
79
86
  self._default_encoding = encoding
80
87
  self._default_sample_rate = sample_rate
88
+ self._default_min_volume = min_volume
89
+ self._default_max_silence_duration_secs = max_silence_duration_secs
81
90
 
82
91
  if not IS_WEBSOCKET_SYNC_AVAILABLE:
83
92
  raise ImportError(
@@ -89,13 +98,15 @@ class SttWebsocket:
89
98
  "model": model,
90
99
  "api_key": self.api_key,
91
100
  "cartesia_version": self.cartesia_version,
101
+ "encoding": encoding,
102
+ "sample_rate": str(sample_rate),
92
103
  }
93
104
  if language is not None:
94
105
  params["language"] = language
95
- if encoding is not None:
96
- params["encoding"] = encoding
97
- if sample_rate is not None:
98
- params["sample_rate"] = str(sample_rate)
106
+ if min_volume is not None:
107
+ params["min_volume"] = str(min_volume)
108
+ if max_silence_duration_secs is not None:
109
+ params["max_silence_duration_secs"] = str(max_silence_duration_secs)
99
110
 
100
111
  query_string = "&".join([f"{k}={v}" for k, v in params.items()])
101
112
  url = f"{self.ws_url}/{route}?{query_string}"
@@ -143,6 +154,8 @@ class SttWebsocket:
143
154
  language=self._default_language,
144
155
  encoding=self._default_encoding,
145
156
  sample_rate=self._default_sample_rate,
157
+ min_volume=self._default_min_volume,
158
+ max_silence_duration_secs=self._default_max_silence_duration_secs,
146
159
  )
147
160
 
148
161
  assert self.websocket is not None, "WebSocket should be connected after connect() call"
@@ -167,6 +180,8 @@ class SttWebsocket:
167
180
  language=self._default_language,
168
181
  encoding=self._default_encoding,
169
182
  sample_rate=self._default_sample_rate,
183
+ min_volume=self._default_min_volume,
184
+ max_silence_duration_secs=self._default_max_silence_duration_secs,
170
185
  )
171
186
 
172
187
  assert self.websocket is not None, "WebSocket should be connected after connect() call"
@@ -197,6 +212,8 @@ class SttWebsocket:
197
212
  result["duration"] = raw_data["duration"]
198
213
  if "language" in raw_data:
199
214
  result["language"] = raw_data["language"]
215
+ if "words" in raw_data:
216
+ result["words"] = raw_data["words"]
200
217
 
201
218
  yield result
202
219
 
@@ -208,23 +225,22 @@ class SttWebsocket:
208
225
  }
209
226
  yield result
210
227
 
211
- # Handle done acknowledgment - session complete
228
+ # Handle done acknowledgment
212
229
  elif raw_data.get("type") == "done":
213
230
  result = {
214
231
  "type": raw_data["type"],
215
232
  "request_id": raw_data.get("request_id", ""),
216
233
  }
217
234
  yield result
218
- # Session is complete, break out of loop
219
- break
220
-
221
- except Exception as inner_e:
222
- self.close()
223
- raise RuntimeError(f"Error receiving transcription: {inner_e}")
235
+ break # Exit the loop when done
224
236
 
225
- except Exception as e:
237
+ except Exception as e:
238
+ if "Connection closed" in str(e) or "no active connection" in str(e):
239
+ break # WebSocket was closed
240
+ raise e # Re-raise other exceptions
241
+ except KeyboardInterrupt:
226
242
  self.close()
227
- raise RuntimeError(f"Failed to receive transcription. {e}")
243
+ raise
228
244
 
229
245
  def transcribe(
230
246
  self,
@@ -232,8 +248,10 @@ class SttWebsocket:
232
248
  *,
233
249
  model: str = "ink-whisper",
234
250
  language: Optional[str] = "en",
235
- encoding: Optional[str] = "pcm_s16le",
251
+ encoding: SttEncoding = "pcm_s16le",
236
252
  sample_rate: int = 16000,
253
+ min_volume: Optional[float] = None,
254
+ max_silence_duration_secs: Optional[float] = None,
237
255
  ) -> Generator[Dict[str, Any], None, None]:
238
256
  """Transcribe audio chunks using the WebSocket.
239
257
 
@@ -241,8 +259,10 @@ class SttWebsocket:
241
259
  audio_chunks: Iterator of audio chunks as bytes
242
260
  model: ID of the model to use for transcription
243
261
  language: The language of the input audio in ISO-639-1 format
244
- encoding: The encoding format of the audio data
245
- sample_rate: The sample rate of the audio in Hz
262
+ encoding: The encoding format of the audio data (required)
263
+ sample_rate: The sample rate of the audio in Hz (required)
264
+ min_volume: Volume threshold for voice activity detection (0.0-1.0)
265
+ max_silence_duration_secs: Maximum duration of silence before endpointing
246
266
 
247
267
  Yields:
248
268
  Dictionary containing transcription results, flush_done, done, or error messages
@@ -252,6 +272,8 @@ class SttWebsocket:
252
272
  language=language,
253
273
  encoding=encoding,
254
274
  sample_rate=sample_rate,
275
+ min_volume=min_volume,
276
+ max_silence_duration_secs=max_silence_duration_secs,
255
277
  )
256
278
 
257
279
  try:
cartesia/stt/client.py ADDED
@@ -0,0 +1,450 @@
1
+ # This file was auto-generated by Fern from our API Definition.
2
+
3
+ import typing
4
+ from ..core.client_wrapper import SyncClientWrapper
5
+ from .. import core
6
+ from .types.stt_encoding import SttEncoding
7
+ from .types.timestamp_granularity import TimestampGranularity
8
+ from ..core.request_options import RequestOptions
9
+ from .types.transcription_response import TranscriptionResponse
10
+ from ..core.pydantic_utilities import parse_obj_as
11
+ from json.decoder import JSONDecodeError
12
+ from ..core.api_error import ApiError
13
+ from ..core.client_wrapper import AsyncClientWrapper
14
+
15
+ # this is used as the default value for optional parameters
16
+ OMIT = typing.cast(typing.Any, ...)
17
+
18
+
19
+ class SttClient:
20
+ def __init__(self, *, client_wrapper: SyncClientWrapper):
21
+ self._client_wrapper = client_wrapper
22
+
23
+ def transcribe(
24
+ self,
25
+ *,
26
+ file: core.File,
27
+ model: str,
28
+ encoding: typing.Optional[SttEncoding] = None,
29
+ sample_rate: typing.Optional[int] = None,
30
+ language: typing.Optional[str] = OMIT,
31
+ timestamp_granularities: typing.Optional[typing.List[TimestampGranularity]] = OMIT,
32
+ request_options: typing.Optional[RequestOptions] = None,
33
+ ) -> TranscriptionResponse:
34
+ """
35
+ Transcribes audio files into text using Cartesia's Speech-to-Text API.
36
+
37
+ Upload an audio file and receive a complete transcription response. Supports arbitrarily long audio files with automatic intelligent chunking for longer audio.
38
+
39
+ **Supported audio formats:** flac, m4a, mp3, mp4, mpeg, mpga, oga, ogg, wav, webm
40
+
41
+ **Response format:** Returns JSON with transcribed text, duration, and language. Include `timestamp_granularities: ["word"]` to get word-level timestamps.
42
+
43
+ **Pricing:** Batch transcription is priced at **1 credit per 2 seconds** of audio processed.
44
+
45
+ <Note>
46
+ For migrating from the OpenAI SDK, see our [OpenAI Whisper to Cartesia Ink Migration Guide](/api-reference/stt/migrate-from-open-ai).
47
+ </Note>
48
+
49
+ Parameters
50
+ ----------
51
+ file : core.File
52
+ See core.File for more documentation
53
+
54
+ model : str
55
+ ID of the model to use for transcription. Use `ink-whisper` for the latest Cartesia Whisper model.
56
+
57
+ encoding : typing.Optional[SttEncoding]
58
+ The encoding format to process the audio as. If not specified, the audio file will be decoded automatically.
59
+
60
+ **Supported formats:**
61
+ - `pcm_s16le` - 16-bit signed integer PCM, little-endian (recommended for best performance)
62
+ - `pcm_s32le` - 32-bit signed integer PCM, little-endian
63
+ - `pcm_f16le` - 16-bit floating point PCM, little-endian
64
+ - `pcm_f32le` - 32-bit floating point PCM, little-endian
65
+ - `pcm_mulaw` - 8-bit μ-law encoded PCM
66
+ - `pcm_alaw` - 8-bit A-law encoded PCM
67
+
68
+ sample_rate : typing.Optional[int]
69
+ The sample rate of the audio in Hz.
70
+
71
+ language : typing.Optional[str]
72
+ The language of the input audio in ISO-639-1 format. Defaults to `en`.
73
+
74
+ <Accordion title="Supported languages">
75
+ - `en` (English)
76
+ - `zh` (Chinese)
77
+ - `de` (German)
78
+ - `es` (Spanish)
79
+ - `ru` (Russian)
80
+ - `ko` (Korean)
81
+ - `fr` (French)
82
+ - `ja` (Japanese)
83
+ - `pt` (Portuguese)
84
+ - `tr` (Turkish)
85
+ - `pl` (Polish)
86
+ - `ca` (Catalan)
87
+ - `nl` (Dutch)
88
+ - `ar` (Arabic)
89
+ - `sv` (Swedish)
90
+ - `it` (Italian)
91
+ - `id` (Indonesian)
92
+ - `hi` (Hindi)
93
+ - `fi` (Finnish)
94
+ - `vi` (Vietnamese)
95
+ - `he` (Hebrew)
96
+ - `uk` (Ukrainian)
97
+ - `el` (Greek)
98
+ - `ms` (Malay)
99
+ - `cs` (Czech)
100
+ - `ro` (Romanian)
101
+ - `da` (Danish)
102
+ - `hu` (Hungarian)
103
+ - `ta` (Tamil)
104
+ - `no` (Norwegian)
105
+ - `th` (Thai)
106
+ - `ur` (Urdu)
107
+ - `hr` (Croatian)
108
+ - `bg` (Bulgarian)
109
+ - `lt` (Lithuanian)
110
+ - `la` (Latin)
111
+ - `mi` (Maori)
112
+ - `ml` (Malayalam)
113
+ - `cy` (Welsh)
114
+ - `sk` (Slovak)
115
+ - `te` (Telugu)
116
+ - `fa` (Persian)
117
+ - `lv` (Latvian)
118
+ - `bn` (Bengali)
119
+ - `sr` (Serbian)
120
+ - `az` (Azerbaijani)
121
+ - `sl` (Slovenian)
122
+ - `kn` (Kannada)
123
+ - `et` (Estonian)
124
+ - `mk` (Macedonian)
125
+ - `br` (Breton)
126
+ - `eu` (Basque)
127
+ - `is` (Icelandic)
128
+ - `hy` (Armenian)
129
+ - `ne` (Nepali)
130
+ - `mn` (Mongolian)
131
+ - `bs` (Bosnian)
132
+ - `kk` (Kazakh)
133
+ - `sq` (Albanian)
134
+ - `sw` (Swahili)
135
+ - `gl` (Galician)
136
+ - `mr` (Marathi)
137
+ - `pa` (Punjabi)
138
+ - `si` (Sinhala)
139
+ - `km` (Khmer)
140
+ - `sn` (Shona)
141
+ - `yo` (Yoruba)
142
+ - `so` (Somali)
143
+ - `af` (Afrikaans)
144
+ - `oc` (Occitan)
145
+ - `ka` (Georgian)
146
+ - `be` (Belarusian)
147
+ - `tg` (Tajik)
148
+ - `sd` (Sindhi)
149
+ - `gu` (Gujarati)
150
+ - `am` (Amharic)
151
+ - `yi` (Yiddish)
152
+ - `lo` (Lao)
153
+ - `uz` (Uzbek)
154
+ - `fo` (Faroese)
155
+ - `ht` (Haitian Creole)
156
+ - `ps` (Pashto)
157
+ - `tk` (Turkmen)
158
+ - `nn` (Nynorsk)
159
+ - `mt` (Maltese)
160
+ - `sa` (Sanskrit)
161
+ - `lb` (Luxembourgish)
162
+ - `my` (Myanmar)
163
+ - `bo` (Tibetan)
164
+ - `tl` (Tagalog)
165
+ - `mg` (Malagasy)
166
+ - `as` (Assamese)
167
+ - `tt` (Tatar)
168
+ - `haw` (Hawaiian)
169
+ - `ln` (Lingala)
170
+ - `ha` (Hausa)
171
+ - `ba` (Bashkir)
172
+ - `jw` (Javanese)
173
+ - `su` (Sundanese)
174
+ - `yue` (Cantonese)
175
+ </Accordion>
176
+
177
+ timestamp_granularities : typing.Optional[typing.List[TimestampGranularity]]
178
+ The timestamp granularities to populate for this transcription. Currently only `word` level timestamps are supported.
179
+
180
+ request_options : typing.Optional[RequestOptions]
181
+ Request-specific configuration.
182
+
183
+ Returns
184
+ -------
185
+ TranscriptionResponse
186
+
187
+ Examples
188
+ --------
189
+ from cartesia import Cartesia
190
+
191
+ client = Cartesia(
192
+ api_key="YOUR_API_KEY",
193
+ )
194
+ client.stt.transcribe(
195
+ model="ink-whisper",
196
+ language="en",
197
+ )
198
+ """
199
+ _response = self._client_wrapper.httpx_client.request(
200
+ "stt",
201
+ method="POST",
202
+ params={
203
+ "encoding": encoding,
204
+ "sample_rate": sample_rate,
205
+ },
206
+ data={
207
+ "model": model,
208
+ "language": language,
209
+ "timestamp_granularities[]": timestamp_granularities,
210
+ },
211
+ files={
212
+ "file": file,
213
+ },
214
+ request_options=request_options,
215
+ omit=OMIT,
216
+ )
217
+ try:
218
+ if 200 <= _response.status_code < 300:
219
+ return typing.cast(
220
+ TranscriptionResponse,
221
+ parse_obj_as(
222
+ type_=TranscriptionResponse, # type: ignore
223
+ object_=_response.json(),
224
+ ),
225
+ )
226
+ _response_json = _response.json()
227
+ except JSONDecodeError:
228
+ raise ApiError(status_code=_response.status_code, body=_response.text)
229
+ raise ApiError(status_code=_response.status_code, body=_response_json)
230
+
231
+
232
+ class AsyncSttClient:
233
+ def __init__(self, *, client_wrapper: AsyncClientWrapper):
234
+ self._client_wrapper = client_wrapper
235
+
236
+ async def transcribe(
237
+ self,
238
+ *,
239
+ file: core.File,
240
+ model: str,
241
+ encoding: typing.Optional[SttEncoding] = None,
242
+ sample_rate: typing.Optional[int] = None,
243
+ language: typing.Optional[str] = OMIT,
244
+ timestamp_granularities: typing.Optional[typing.List[TimestampGranularity]] = OMIT,
245
+ request_options: typing.Optional[RequestOptions] = None,
246
+ ) -> TranscriptionResponse:
247
+ """
248
+ Transcribes audio files into text using Cartesia's Speech-to-Text API.
249
+
250
+ Upload an audio file and receive a complete transcription response. Supports arbitrarily long audio files with automatic intelligent chunking for longer audio.
251
+
252
+ **Supported audio formats:** flac, m4a, mp3, mp4, mpeg, mpga, oga, ogg, wav, webm
253
+
254
+ **Response format:** Returns JSON with transcribed text, duration, and language. Include `timestamp_granularities: ["word"]` to get word-level timestamps.
255
+
256
+ **Pricing:** Batch transcription is priced at **1 credit per 2 seconds** of audio processed.
257
+
258
+ <Note>
259
+ For migrating from the OpenAI SDK, see our [OpenAI Whisper to Cartesia Ink Migration Guide](/api-reference/stt/migrate-from-open-ai).
260
+ </Note>
261
+
262
+ Parameters
263
+ ----------
264
+ file : core.File
265
+ See core.File for more documentation
266
+
267
+ model : str
268
+ ID of the model to use for transcription. Use `ink-whisper` for the latest Cartesia Whisper model.
269
+
270
+ encoding : typing.Optional[SttEncoding]
271
+ The encoding format to process the audio as. If not specified, the audio file will be decoded automatically.
272
+
273
+ **Supported formats:**
274
+ - `pcm_s16le` - 16-bit signed integer PCM, little-endian (recommended for best performance)
275
+ - `pcm_s32le` - 32-bit signed integer PCM, little-endian
276
+ - `pcm_f16le` - 16-bit floating point PCM, little-endian
277
+ - `pcm_f32le` - 32-bit floating point PCM, little-endian
278
+ - `pcm_mulaw` - 8-bit μ-law encoded PCM
279
+ - `pcm_alaw` - 8-bit A-law encoded PCM
280
+
281
+ sample_rate : typing.Optional[int]
282
+ The sample rate of the audio in Hz.
283
+
284
+ language : typing.Optional[str]
285
+ The language of the input audio in ISO-639-1 format. Defaults to `en`.
286
+
287
+ <Accordion title="Supported languages">
288
+ - `en` (English)
289
+ - `zh` (Chinese)
290
+ - `de` (German)
291
+ - `es` (Spanish)
292
+ - `ru` (Russian)
293
+ - `ko` (Korean)
294
+ - `fr` (French)
295
+ - `ja` (Japanese)
296
+ - `pt` (Portuguese)
297
+ - `tr` (Turkish)
298
+ - `pl` (Polish)
299
+ - `ca` (Catalan)
300
+ - `nl` (Dutch)
301
+ - `ar` (Arabic)
302
+ - `sv` (Swedish)
303
+ - `it` (Italian)
304
+ - `id` (Indonesian)
305
+ - `hi` (Hindi)
306
+ - `fi` (Finnish)
307
+ - `vi` (Vietnamese)
308
+ - `he` (Hebrew)
309
+ - `uk` (Ukrainian)
310
+ - `el` (Greek)
311
+ - `ms` (Malay)
312
+ - `cs` (Czech)
313
+ - `ro` (Romanian)
314
+ - `da` (Danish)
315
+ - `hu` (Hungarian)
316
+ - `ta` (Tamil)
317
+ - `no` (Norwegian)
318
+ - `th` (Thai)
319
+ - `ur` (Urdu)
320
+ - `hr` (Croatian)
321
+ - `bg` (Bulgarian)
322
+ - `lt` (Lithuanian)
323
+ - `la` (Latin)
324
+ - `mi` (Maori)
325
+ - `ml` (Malayalam)
326
+ - `cy` (Welsh)
327
+ - `sk` (Slovak)
328
+ - `te` (Telugu)
329
+ - `fa` (Persian)
330
+ - `lv` (Latvian)
331
+ - `bn` (Bengali)
332
+ - `sr` (Serbian)
333
+ - `az` (Azerbaijani)
334
+ - `sl` (Slovenian)
335
+ - `kn` (Kannada)
336
+ - `et` (Estonian)
337
+ - `mk` (Macedonian)
338
+ - `br` (Breton)
339
+ - `eu` (Basque)
340
+ - `is` (Icelandic)
341
+ - `hy` (Armenian)
342
+ - `ne` (Nepali)
343
+ - `mn` (Mongolian)
344
+ - `bs` (Bosnian)
345
+ - `kk` (Kazakh)
346
+ - `sq` (Albanian)
347
+ - `sw` (Swahili)
348
+ - `gl` (Galician)
349
+ - `mr` (Marathi)
350
+ - `pa` (Punjabi)
351
+ - `si` (Sinhala)
352
+ - `km` (Khmer)
353
+ - `sn` (Shona)
354
+ - `yo` (Yoruba)
355
+ - `so` (Somali)
356
+ - `af` (Afrikaans)
357
+ - `oc` (Occitan)
358
+ - `ka` (Georgian)
359
+ - `be` (Belarusian)
360
+ - `tg` (Tajik)
361
+ - `sd` (Sindhi)
362
+ - `gu` (Gujarati)
363
+ - `am` (Amharic)
364
+ - `yi` (Yiddish)
365
+ - `lo` (Lao)
366
+ - `uz` (Uzbek)
367
+ - `fo` (Faroese)
368
+ - `ht` (Haitian Creole)
369
+ - `ps` (Pashto)
370
+ - `tk` (Turkmen)
371
+ - `nn` (Nynorsk)
372
+ - `mt` (Maltese)
373
+ - `sa` (Sanskrit)
374
+ - `lb` (Luxembourgish)
375
+ - `my` (Myanmar)
376
+ - `bo` (Tibetan)
377
+ - `tl` (Tagalog)
378
+ - `mg` (Malagasy)
379
+ - `as` (Assamese)
380
+ - `tt` (Tatar)
381
+ - `haw` (Hawaiian)
382
+ - `ln` (Lingala)
383
+ - `ha` (Hausa)
384
+ - `ba` (Bashkir)
385
+ - `jw` (Javanese)
386
+ - `su` (Sundanese)
387
+ - `yue` (Cantonese)
388
+ </Accordion>
389
+
390
+ timestamp_granularities : typing.Optional[typing.List[TimestampGranularity]]
391
+ The timestamp granularities to populate for this transcription. Currently only `word` level timestamps are supported.
392
+
393
+ request_options : typing.Optional[RequestOptions]
394
+ Request-specific configuration.
395
+
396
+ Returns
397
+ -------
398
+ TranscriptionResponse
399
+
400
+ Examples
401
+ --------
402
+ import asyncio
403
+
404
+ from cartesia import AsyncCartesia
405
+
406
+ client = AsyncCartesia(
407
+ api_key="YOUR_API_KEY",
408
+ )
409
+
410
+
411
+ async def main() -> None:
412
+ await client.stt.transcribe(
413
+ model="ink-whisper",
414
+ language="en",
415
+ )
416
+
417
+
418
+ asyncio.run(main())
419
+ """
420
+ _response = await self._client_wrapper.httpx_client.request(
421
+ "stt",
422
+ method="POST",
423
+ params={
424
+ "encoding": encoding,
425
+ "sample_rate": sample_rate,
426
+ },
427
+ data={
428
+ "model": model,
429
+ "language": language,
430
+ "timestamp_granularities[]": timestamp_granularities,
431
+ },
432
+ files={
433
+ "file": file,
434
+ },
435
+ request_options=request_options,
436
+ omit=OMIT,
437
+ )
438
+ try:
439
+ if 200 <= _response.status_code < 300:
440
+ return typing.cast(
441
+ TranscriptionResponse,
442
+ parse_obj_as(
443
+ type_=TranscriptionResponse, # type: ignore
444
+ object_=_response.json(),
445
+ ),
446
+ )
447
+ _response_json = _response.json()
448
+ except JSONDecodeError:
449
+ raise ApiError(status_code=_response.status_code, body=_response.text)
450
+ raise ApiError(status_code=_response.status_code, body=_response_json)
@@ -12,6 +12,7 @@ from .streaming_transcription_response import (
12
12
  )
13
13
  from .transcript_message import TranscriptMessageParams
14
14
  from .transcription_response import TranscriptionResponseParams
15
+ from .transcription_word import TranscriptionWordParams
15
16
 
16
17
  __all__ = [
17
18
  "DoneMessageParams",
@@ -24,4 +25,5 @@ __all__ = [
24
25
  "StreamingTranscriptionResponse_TranscriptParams",
25
26
  "TranscriptMessageParams",
26
27
  "TranscriptionResponseParams",
28
+ "TranscriptionWordParams",
27
29
  ]
@@ -4,6 +4,7 @@ from __future__ import annotations
4
4
  import typing_extensions
5
5
  import typing
6
6
  import typing_extensions
7
+ from .transcription_word import TranscriptionWordParams
7
8
 
8
9
 
9
10
  class StreamingTranscriptionResponse_TranscriptParams(typing_extensions.TypedDict):
@@ -13,6 +14,7 @@ class StreamingTranscriptionResponse_TranscriptParams(typing_extensions.TypedDic
13
14
  is_final: bool
14
15
  duration: typing_extensions.NotRequired[float]
15
16
  language: typing_extensions.NotRequired[str]
17
+ words: typing_extensions.NotRequired[typing.Sequence[TranscriptionWordParams]]
16
18
 
17
19
 
18
20
  class StreamingTranscriptionResponse_FlushDoneParams(typing_extensions.TypedDict):
@@ -2,6 +2,8 @@
2
2
 
3
3
  import typing_extensions
4
4
  import typing_extensions
5
+ import typing
6
+ from .transcription_word import TranscriptionWordParams
5
7
 
6
8
 
7
9
  class TranscriptMessageParams(typing_extensions.TypedDict):
@@ -29,5 +31,10 @@ class TranscriptMessageParams(typing_extensions.TypedDict):
29
31
 
30
32
  language: typing_extensions.NotRequired[str]
31
33
  """
32
- The detected or specified language of the input audio.
34
+ The specified language of the input audio.
35
+ """
36
+
37
+ words: typing_extensions.NotRequired[typing.Sequence[TranscriptionWordParams]]
38
+ """
39
+ Word-level timestamps showing the start and end time of each word in seconds. Always included in streaming responses.
33
40
  """
@@ -2,6 +2,8 @@
2
2
 
3
3
  import typing_extensions
4
4
  import typing_extensions
5
+ import typing
6
+ from .transcription_word import TranscriptionWordParams
5
7
 
6
8
 
7
9
  class TranscriptionResponseParams(typing_extensions.TypedDict):
@@ -12,10 +14,15 @@ class TranscriptionResponseParams(typing_extensions.TypedDict):
12
14
 
13
15
  language: typing_extensions.NotRequired[str]
14
16
  """
15
- The detected or specified language of the input audio.
17
+ The specified language of the input audio.
16
18
  """
17
19
 
18
20
  duration: typing_extensions.NotRequired[float]
19
21
  """
20
22
  The duration of the input audio in seconds.
21
23
  """
24
+
25
+ words: typing_extensions.NotRequired[typing.Sequence[TranscriptionWordParams]]
26
+ """
27
+ Word-level timestamps showing the start and end time of each word. Only included when `[word]` is passed into `timestamp_granularities[]`.
28
+ """