cartesia 2.0.5__py3-none-any.whl → 2.0.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45)
  1. cartesia/__init__.py +14 -0
  2. cartesia/auth/client.py +8 -8
  3. cartesia/auth/requests/token_grant.py +7 -1
  4. cartesia/auth/requests/token_request.py +3 -3
  5. cartesia/auth/types/token_grant.py +7 -2
  6. cartesia/auth/types/token_request.py +3 -3
  7. cartesia/core/client_wrapper.py +1 -1
  8. cartesia/stt/__init__.py +6 -0
  9. cartesia/stt/_async_websocket.py +81 -72
  10. cartesia/stt/_websocket.py +42 -20
  11. cartesia/stt/client.py +456 -0
  12. cartesia/stt/requests/__init__.py +2 -0
  13. cartesia/stt/requests/streaming_transcription_response.py +2 -0
  14. cartesia/stt/requests/transcript_message.py +8 -1
  15. cartesia/stt/requests/transcription_response.py +8 -1
  16. cartesia/stt/requests/transcription_word.py +20 -0
  17. cartesia/stt/socket_client.py +52 -109
  18. cartesia/stt/types/__init__.py +4 -0
  19. cartesia/stt/types/streaming_transcription_response.py +2 -0
  20. cartesia/stt/types/stt_encoding.py +3 -1
  21. cartesia/stt/types/timestamp_granularity.py +5 -0
  22. cartesia/stt/types/transcript_message.py +7 -1
  23. cartesia/stt/types/transcription_response.py +7 -1
  24. cartesia/stt/types/transcription_word.py +32 -0
  25. cartesia/tts/__init__.py +8 -0
  26. cartesia/tts/client.py +50 -8
  27. cartesia/tts/requests/__init__.py +4 -0
  28. cartesia/tts/requests/generation_request.py +4 -4
  29. cartesia/tts/requests/sse_output_format.py +11 -0
  30. cartesia/tts/requests/ttssse_request.py +47 -0
  31. cartesia/tts/requests/web_socket_chunk_response.py +0 -3
  32. cartesia/tts/requests/web_socket_response.py +1 -2
  33. cartesia/tts/requests/web_socket_tts_request.py +9 -1
  34. cartesia/tts/types/__init__.py +4 -0
  35. cartesia/tts/types/generation_request.py +4 -4
  36. cartesia/tts/types/sse_output_format.py +22 -0
  37. cartesia/tts/types/ttssse_request.py +58 -0
  38. cartesia/tts/types/web_socket_chunk_response.py +1 -3
  39. cartesia/tts/types/web_socket_response.py +1 -2
  40. cartesia/tts/types/web_socket_tts_request.py +11 -3
  41. cartesia/voice_changer/requests/streaming_response.py +0 -2
  42. cartesia/voice_changer/types/streaming_response.py +0 -2
  43. {cartesia-2.0.5.dist-info → cartesia-2.0.6.dist-info}/METADATA +113 -16
  44. {cartesia-2.0.5.dist-info → cartesia-2.0.6.dist-info}/RECORD +45 -37
  45. {cartesia-2.0.5.dist-info → cartesia-2.0.6.dist-info}/WHEEL +0 -0
cartesia/stt/client.py ADDED
@@ -0,0 +1,456 @@
1
+ # This file was auto-generated by Fern from our API Definition.
2
+
3
+ import typing
4
+ from ..core.client_wrapper import SyncClientWrapper
5
+ from .. import core
6
+ from .types.stt_encoding import SttEncoding
7
+ from .types.timestamp_granularity import TimestampGranularity
8
+ from ..core.request_options import RequestOptions
9
+ from .types.transcription_response import TranscriptionResponse
10
+ from ..core.pydantic_utilities import parse_obj_as
11
+ from json.decoder import JSONDecodeError
12
+ from ..core.api_error import ApiError
13
+ from ..core.client_wrapper import AsyncClientWrapper
14
+
15
+ # this is used as the default value for optional parameters
16
+ OMIT = typing.cast(typing.Any, ...)
17
+
18
+
19
class SttClient:
    """Synchronous client for the batch Speech-to-Text (``stt``) endpoint."""

    def __init__(self, *, client_wrapper: SyncClientWrapper):
        # The wrapper owns the configured HTTP client used for every request.
        self._client_wrapper = client_wrapper

    def transcribe(
        self,
        *,
        file: core.File,
        model: str,
        encoding: typing.Optional[SttEncoding] = None,
        sample_rate: typing.Optional[int] = None,
        language: typing.Optional[str] = OMIT,
        timestamp_granularities: typing.Optional[typing.List[TimestampGranularity]] = OMIT,
        request_options: typing.Optional[RequestOptions] = None,
    ) -> TranscriptionResponse:
        """
        Transcribe an audio file into text with Cartesia's Speech-to-Text API.

        The audio file is uploaded in a single request and a complete
        transcription is returned. Arbitrarily long audio is supported; longer
        audio is chunked automatically on the service side.

        Supported audio formats: flac, m4a, mp3, mp4, mpeg, mpga, oga, ogg,
        wav, webm.

        Response format: JSON with the transcribed text, duration, and
        language. Pass ``timestamp_granularities=["word"]`` to also receive
        word-level timestamps.

        Pricing: batch transcription is priced at 1 credit per 2 seconds of
        audio processed.

        Note: when migrating from the OpenAI SDK, see the "OpenAI Whisper to
        Cartesia Ink Migration Guide" (/api-reference/stt/migrate-from-open-ai).

        Parameters
        ----------
        file : core.File
            The audio to transcribe. See core.File for more documentation.

        model : str
            ID of the model to use for transcription. Use `ink-whisper` for the
            latest Cartesia Whisper model.

        encoding : typing.Optional[SttEncoding]
            The encoding format to process the audio as. If not specified, the
            audio file will be decoded automatically. Sent as a query parameter.

            Supported formats:
            - `pcm_s16le` - 16-bit signed integer PCM, little-endian
              (recommended for best performance)
            - `pcm_s32le` - 32-bit signed integer PCM, little-endian
            - `pcm_f16le` - 16-bit floating point PCM, little-endian
            - `pcm_f32le` - 32-bit floating point PCM, little-endian
            - `pcm_mulaw` - 8-bit μ-law encoded PCM
            - `pcm_alaw` - 8-bit A-law encoded PCM

        sample_rate : typing.Optional[int]
            The sample rate of the audio in Hz. Sent as a query parameter.

        language : typing.Optional[str]
            The language of the input audio in ISO-639-1 format. Defaults to
            `en`.

            Supported languages:
            `en` (English), `zh` (Chinese), `de` (German), `es` (Spanish),
            `ru` (Russian), `ko` (Korean), `fr` (French), `ja` (Japanese),
            `pt` (Portuguese), `tr` (Turkish), `pl` (Polish), `ca` (Catalan),
            `nl` (Dutch), `ar` (Arabic), `sv` (Swedish), `it` (Italian),
            `id` (Indonesian), `hi` (Hindi), `fi` (Finnish), `vi` (Vietnamese),
            `he` (Hebrew), `uk` (Ukrainian), `el` (Greek), `ms` (Malay),
            `cs` (Czech), `ro` (Romanian), `da` (Danish), `hu` (Hungarian),
            `ta` (Tamil), `no` (Norwegian), `th` (Thai), `ur` (Urdu),
            `hr` (Croatian), `bg` (Bulgarian), `lt` (Lithuanian), `la` (Latin),
            `mi` (Maori), `ml` (Malayalam), `cy` (Welsh), `sk` (Slovak),
            `te` (Telugu), `fa` (Persian), `lv` (Latvian), `bn` (Bengali),
            `sr` (Serbian), `az` (Azerbaijani), `sl` (Slovenian),
            `kn` (Kannada), `et` (Estonian), `mk` (Macedonian), `br` (Breton),
            `eu` (Basque), `is` (Icelandic), `hy` (Armenian), `ne` (Nepali),
            `mn` (Mongolian), `bs` (Bosnian), `kk` (Kazakh), `sq` (Albanian),
            `sw` (Swahili), `gl` (Galician), `mr` (Marathi), `pa` (Punjabi),
            `si` (Sinhala), `km` (Khmer), `sn` (Shona), `yo` (Yoruba),
            `so` (Somali), `af` (Afrikaans), `oc` (Occitan), `ka` (Georgian),
            `be` (Belarusian), `tg` (Tajik), `sd` (Sindhi), `gu` (Gujarati),
            `am` (Amharic), `yi` (Yiddish), `lo` (Lao), `uz` (Uzbek),
            `fo` (Faroese), `ht` (Haitian Creole), `ps` (Pashto),
            `tk` (Turkmen), `nn` (Nynorsk), `mt` (Maltese), `sa` (Sanskrit),
            `lb` (Luxembourgish), `my` (Myanmar), `bo` (Tibetan),
            `tl` (Tagalog), `mg` (Malagasy), `as` (Assamese), `tt` (Tatar),
            `haw` (Hawaiian), `ln` (Lingala), `ha` (Hausa), `ba` (Bashkir),
            `jw` (Javanese), `su` (Sundanese), `yue` (Cantonese).

        timestamp_granularities : typing.Optional[typing.List[TimestampGranularity]]
            The timestamp granularities to populate for this transcription.
            Currently only `word` level timestamps are supported.

        request_options : typing.Optional[RequestOptions]
            Request-specific configuration.

        Returns
        -------
        TranscriptionResponse

        Raises
        ------
        ApiError
            If the response status is outside the 2xx range, or if the response
            body cannot be decoded as JSON (the raw text is used as the body).

        Examples
        --------
        from cartesia import Cartesia

        client = Cartesia(
            api_key="YOUR_API_KEY",
        )
        with open("path/to/audio.wav", "rb") as audio_file:
            client.stt.transcribe(
                file=audio_file,
                model="ink-whisper",
                language="en",
            )
        """
        # Audio-decoding hints travel in the query string.
        query_params = {
            "encoding": encoding,
            "sample_rate": sample_rate,
        }
        # Everything else is sent as multipart form fields next to the upload;
        # the OMIT sentinel is passed to the HTTP client via `omit=` below.
        form_fields = {
            "model": model,
            "language": language,
            "timestamp_granularities[]": timestamp_granularities,
        }
        response = self._client_wrapper.httpx_client.request(
            "stt",
            method="POST",
            params=query_params,
            data=form_fields,
            files={"file": file},
            request_options=request_options,
            omit=OMIT,
        )
        try:
            # The body is decoded exactly once, whichever branch is taken.
            payload = response.json()
            if 200 <= response.status_code < 300:
                parsed = parse_obj_as(
                    type_=TranscriptionResponse,  # type: ignore
                    object_=payload,
                )
                return typing.cast(TranscriptionResponse, parsed)
        except JSONDecodeError:
            # Non-JSON body: surface the raw text instead.
            raise ApiError(status_code=response.status_code, body=response.text)
        # Non-2xx status with a JSON body.
        raise ApiError(status_code=response.status_code, body=payload)
233
+
234
+
235
class AsyncSttClient:
    """Asynchronous client for the batch Speech-to-Text (``stt``) endpoint."""

    def __init__(self, *, client_wrapper: AsyncClientWrapper):
        # The wrapper owns the configured async HTTP client used for every request.
        self._client_wrapper = client_wrapper

    async def transcribe(
        self,
        *,
        file: core.File,
        model: str,
        encoding: typing.Optional[SttEncoding] = None,
        sample_rate: typing.Optional[int] = None,
        language: typing.Optional[str] = OMIT,
        timestamp_granularities: typing.Optional[typing.List[TimestampGranularity]] = OMIT,
        request_options: typing.Optional[RequestOptions] = None,
    ) -> TranscriptionResponse:
        """
        Transcribe an audio file into text with Cartesia's Speech-to-Text API.

        The audio file is uploaded in a single request and a complete
        transcription is returned. Arbitrarily long audio is supported; longer
        audio is chunked automatically on the service side.

        Supported audio formats: flac, m4a, mp3, mp4, mpeg, mpga, oga, ogg,
        wav, webm.

        Response format: JSON with the transcribed text, duration, and
        language. Pass ``timestamp_granularities=["word"]`` to also receive
        word-level timestamps.

        Pricing: batch transcription is priced at 1 credit per 2 seconds of
        audio processed.

        Note: when migrating from the OpenAI SDK, see the "OpenAI Whisper to
        Cartesia Ink Migration Guide" (/api-reference/stt/migrate-from-open-ai).

        Parameters
        ----------
        file : core.File
            The audio to transcribe. See core.File for more documentation.

        model : str
            ID of the model to use for transcription. Use `ink-whisper` for the
            latest Cartesia Whisper model.

        encoding : typing.Optional[SttEncoding]
            The encoding format to process the audio as. If not specified, the
            audio file will be decoded automatically. Sent as a query parameter.

            Supported formats:
            - `pcm_s16le` - 16-bit signed integer PCM, little-endian
              (recommended for best performance)
            - `pcm_s32le` - 32-bit signed integer PCM, little-endian
            - `pcm_f16le` - 16-bit floating point PCM, little-endian
            - `pcm_f32le` - 32-bit floating point PCM, little-endian
            - `pcm_mulaw` - 8-bit μ-law encoded PCM
            - `pcm_alaw` - 8-bit A-law encoded PCM

        sample_rate : typing.Optional[int]
            The sample rate of the audio in Hz. Sent as a query parameter.

        language : typing.Optional[str]
            The language of the input audio in ISO-639-1 format. Defaults to
            `en`.

            Supported languages:
            `en` (English), `zh` (Chinese), `de` (German), `es` (Spanish),
            `ru` (Russian), `ko` (Korean), `fr` (French), `ja` (Japanese),
            `pt` (Portuguese), `tr` (Turkish), `pl` (Polish), `ca` (Catalan),
            `nl` (Dutch), `ar` (Arabic), `sv` (Swedish), `it` (Italian),
            `id` (Indonesian), `hi` (Hindi), `fi` (Finnish), `vi` (Vietnamese),
            `he` (Hebrew), `uk` (Ukrainian), `el` (Greek), `ms` (Malay),
            `cs` (Czech), `ro` (Romanian), `da` (Danish), `hu` (Hungarian),
            `ta` (Tamil), `no` (Norwegian), `th` (Thai), `ur` (Urdu),
            `hr` (Croatian), `bg` (Bulgarian), `lt` (Lithuanian), `la` (Latin),
            `mi` (Maori), `ml` (Malayalam), `cy` (Welsh), `sk` (Slovak),
            `te` (Telugu), `fa` (Persian), `lv` (Latvian), `bn` (Bengali),
            `sr` (Serbian), `az` (Azerbaijani), `sl` (Slovenian),
            `kn` (Kannada), `et` (Estonian), `mk` (Macedonian), `br` (Breton),
            `eu` (Basque), `is` (Icelandic), `hy` (Armenian), `ne` (Nepali),
            `mn` (Mongolian), `bs` (Bosnian), `kk` (Kazakh), `sq` (Albanian),
            `sw` (Swahili), `gl` (Galician), `mr` (Marathi), `pa` (Punjabi),
            `si` (Sinhala), `km` (Khmer), `sn` (Shona), `yo` (Yoruba),
            `so` (Somali), `af` (Afrikaans), `oc` (Occitan), `ka` (Georgian),
            `be` (Belarusian), `tg` (Tajik), `sd` (Sindhi), `gu` (Gujarati),
            `am` (Amharic), `yi` (Yiddish), `lo` (Lao), `uz` (Uzbek),
            `fo` (Faroese), `ht` (Haitian Creole), `ps` (Pashto),
            `tk` (Turkmen), `nn` (Nynorsk), `mt` (Maltese), `sa` (Sanskrit),
            `lb` (Luxembourgish), `my` (Myanmar), `bo` (Tibetan),
            `tl` (Tagalog), `mg` (Malagasy), `as` (Assamese), `tt` (Tatar),
            `haw` (Hawaiian), `ln` (Lingala), `ha` (Hausa), `ba` (Bashkir),
            `jw` (Javanese), `su` (Sundanese), `yue` (Cantonese).

        timestamp_granularities : typing.Optional[typing.List[TimestampGranularity]]
            The timestamp granularities to populate for this transcription.
            Currently only `word` level timestamps are supported.

        request_options : typing.Optional[RequestOptions]
            Request-specific configuration.

        Returns
        -------
        TranscriptionResponse

        Raises
        ------
        ApiError
            If the response status is outside the 2xx range, or if the response
            body cannot be decoded as JSON (the raw text is used as the body).

        Examples
        --------
        import asyncio

        from cartesia import AsyncCartesia

        client = AsyncCartesia(
            api_key="YOUR_API_KEY",
        )


        async def main() -> None:
            with open("path/to/audio.wav", "rb") as audio_file:
                await client.stt.transcribe(
                    file=audio_file,
                    model="ink-whisper",
                    language="en",
                )


        asyncio.run(main())
        """
        # Audio-decoding hints travel in the query string.
        query_params = {
            "encoding": encoding,
            "sample_rate": sample_rate,
        }
        # Everything else is sent as multipart form fields next to the upload;
        # the OMIT sentinel is passed to the HTTP client via `omit=` below.
        form_fields = {
            "model": model,
            "language": language,
            "timestamp_granularities[]": timestamp_granularities,
        }
        response = await self._client_wrapper.httpx_client.request(
            "stt",
            method="POST",
            params=query_params,
            data=form_fields,
            files={"file": file},
            request_options=request_options,
            omit=OMIT,
        )
        try:
            # The body is decoded exactly once, whichever branch is taken.
            payload = response.json()
            if 200 <= response.status_code < 300:
                parsed = parse_obj_as(
                    type_=TranscriptionResponse,  # type: ignore
                    object_=payload,
                )
                return typing.cast(TranscriptionResponse, parsed)
        except JSONDecodeError:
            # Non-JSON body: surface the raw text instead.
            raise ApiError(status_code=response.status_code, body=response.text)
        # Non-2xx status with a JSON body.
        raise ApiError(status_code=response.status_code, body=payload)
@@ -12,6 +12,7 @@ from .streaming_transcription_response import (
12
12
  )
13
13
  from .transcript_message import TranscriptMessageParams
14
14
  from .transcription_response import TranscriptionResponseParams
15
+ from .transcription_word import TranscriptionWordParams
15
16
 
16
17
  __all__ = [
17
18
  "DoneMessageParams",
@@ -24,4 +25,5 @@ __all__ = [
24
25
  "StreamingTranscriptionResponse_TranscriptParams",
25
26
  "TranscriptMessageParams",
26
27
  "TranscriptionResponseParams",
28
+ "TranscriptionWordParams",
27
29
  ]
@@ -4,6 +4,7 @@ from __future__ import annotations
4
4
  import typing_extensions
5
5
  import typing
6
6
  import typing_extensions
7
+ from .transcription_word import TranscriptionWordParams
7
8
 
8
9
 
9
10
  class StreamingTranscriptionResponse_TranscriptParams(typing_extensions.TypedDict):
@@ -13,6 +14,7 @@ class StreamingTranscriptionResponse_TranscriptParams(typing_extensions.TypedDic
13
14
  is_final: bool
14
15
  duration: typing_extensions.NotRequired[float]
15
16
  language: typing_extensions.NotRequired[str]
17
+ words: typing_extensions.NotRequired[typing.Sequence[TranscriptionWordParams]]
16
18
 
17
19
 
18
20
  class StreamingTranscriptionResponse_FlushDoneParams(typing_extensions.TypedDict):
@@ -2,6 +2,8 @@
2
2
 
3
3
  import typing_extensions
4
4
  import typing_extensions
5
+ import typing
6
+ from .transcription_word import TranscriptionWordParams
5
7
 
6
8
 
7
9
  class TranscriptMessageParams(typing_extensions.TypedDict):
@@ -29,5 +31,10 @@ class TranscriptMessageParams(typing_extensions.TypedDict):
29
31
 
30
32
  language: typing_extensions.NotRequired[str]
31
33
  """
32
- The detected or specified language of the input audio.
34
+ The specified language of the input audio.
35
+ """
36
+
37
+ words: typing_extensions.NotRequired[typing.Sequence[TranscriptionWordParams]]
38
+ """
39
+ Word-level timestamps showing the start and end time of each word in seconds. Always included in streaming responses.
33
40
  """
@@ -2,6 +2,8 @@
2
2
 
3
3
  import typing_extensions
4
4
  import typing_extensions
5
+ import typing
6
+ from .transcription_word import TranscriptionWordParams
5
7
 
6
8
 
7
9
  class TranscriptionResponseParams(typing_extensions.TypedDict):
@@ -12,10 +14,15 @@ class TranscriptionResponseParams(typing_extensions.TypedDict):
12
14
 
13
15
  language: typing_extensions.NotRequired[str]
14
16
  """
15
- The detected or specified language of the input audio.
17
+ The specified language of the input audio.
16
18
  """
17
19
 
18
20
  duration: typing_extensions.NotRequired[float]
19
21
  """
20
22
  The duration of the input audio in seconds.
21
23
  """
24
+
25
+ words: typing_extensions.NotRequired[typing.Sequence[TranscriptionWordParams]]
26
+ """
27
+ Word-level timestamps showing the start and end time of each word. Only included when `[word]` is passed into `timestamp_granularities[]`.
28
+ """
@@ -0,0 +1,20 @@
1
+ # This file was auto-generated by Fern from our API Definition.
2
+
3
+ import typing_extensions
4
+
5
+
6
class TranscriptionWordParams(typing_extensions.TypedDict):
    """
    Dictionary form of a single transcribed word together with its timing.

    All three keys (`word`, `start`, `end`) are required; times are in seconds.
    """

    word: str
    """
    The transcribed word.
    """

    start: float
    """
    Start time of the word in seconds.
    """

    end: float
    """
    End time of the word in seconds.
    """
+ """