cartesia 2.0.4__py3-none-any.whl → 2.0.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53)
  1. cartesia/__init__.py +60 -1
  2. cartesia/auth/client.py +8 -8
  3. cartesia/auth/requests/token_grant.py +7 -1
  4. cartesia/auth/requests/token_request.py +3 -3
  5. cartesia/auth/types/token_grant.py +7 -2
  6. cartesia/auth/types/token_request.py +3 -3
  7. cartesia/base_client.py +2 -0
  8. cartesia/client.py +5 -0
  9. cartesia/core/client_wrapper.py +1 -1
  10. cartesia/stt/__init__.py +57 -0
  11. cartesia/stt/_async_websocket.py +293 -0
  12. cartesia/stt/_websocket.py +294 -0
  13. cartesia/stt/client.py +456 -0
  14. cartesia/stt/requests/__init__.py +29 -0
  15. cartesia/stt/requests/done_message.py +14 -0
  16. cartesia/stt/requests/error_message.py +16 -0
  17. cartesia/stt/requests/flush_done_message.py +14 -0
  18. cartesia/stt/requests/streaming_transcription_response.py +41 -0
  19. cartesia/stt/requests/transcript_message.py +40 -0
  20. cartesia/stt/requests/transcription_response.py +28 -0
  21. cartesia/stt/requests/transcription_word.py +20 -0
  22. cartesia/stt/socket_client.py +138 -0
  23. cartesia/stt/types/__init__.py +33 -0
  24. cartesia/stt/types/done_message.py +26 -0
  25. cartesia/stt/types/error_message.py +27 -0
  26. cartesia/stt/types/flush_done_message.py +26 -0
  27. cartesia/stt/types/streaming_transcription_response.py +94 -0
  28. cartesia/stt/types/stt_encoding.py +7 -0
  29. cartesia/stt/types/timestamp_granularity.py +5 -0
  30. cartesia/stt/types/transcript_message.py +50 -0
  31. cartesia/stt/types/transcription_response.py +38 -0
  32. cartesia/stt/types/transcription_word.py +32 -0
  33. cartesia/tts/__init__.py +8 -0
  34. cartesia/tts/client.py +50 -8
  35. cartesia/tts/requests/__init__.py +4 -0
  36. cartesia/tts/requests/generation_request.py +4 -4
  37. cartesia/tts/requests/sse_output_format.py +11 -0
  38. cartesia/tts/requests/ttssse_request.py +47 -0
  39. cartesia/tts/requests/web_socket_chunk_response.py +0 -3
  40. cartesia/tts/requests/web_socket_response.py +1 -2
  41. cartesia/tts/requests/web_socket_tts_request.py +9 -1
  42. cartesia/tts/types/__init__.py +4 -0
  43. cartesia/tts/types/generation_request.py +4 -4
  44. cartesia/tts/types/sse_output_format.py +22 -0
  45. cartesia/tts/types/ttssse_request.py +58 -0
  46. cartesia/tts/types/web_socket_chunk_response.py +1 -3
  47. cartesia/tts/types/web_socket_response.py +1 -2
  48. cartesia/tts/types/web_socket_tts_request.py +11 -3
  49. cartesia/voice_changer/requests/streaming_response.py +0 -2
  50. cartesia/voice_changer/types/streaming_response.py +0 -2
  51. {cartesia-2.0.4.dist-info → cartesia-2.0.6.dist-info}/METADATA +256 -2
  52. {cartesia-2.0.4.dist-info → cartesia-2.0.6.dist-info}/RECORD +53 -26
  53. {cartesia-2.0.4.dist-info → cartesia-2.0.6.dist-info}/WHEEL +0 -0
@@ -8,10 +8,12 @@ from .output_format import OutputFormatParams, OutputFormat_Mp3Params, OutputFor
  from .phoneme_timestamps import PhonemeTimestampsParams
  from .raw_output_format import RawOutputFormatParams
  from .speed import SpeedParams
+ from .sse_output_format import SseOutputFormatParams
  from .tts_request import TtsRequestParams
  from .tts_request_embedding_specifier import TtsRequestEmbeddingSpecifierParams
  from .tts_request_id_specifier import TtsRequestIdSpecifierParams
  from .tts_request_voice_specifier import TtsRequestVoiceSpecifierParams
+ from .ttssse_request import TtssseRequestParams
  from .wav_output_format import WavOutputFormatParams
  from .web_socket_base_response import WebSocketBaseResponseParams
  from .web_socket_chunk_response import WebSocketChunkResponseParams
@@ -48,10 +50,12 @@ __all__ = [
      "PhonemeTimestampsParams",
      "RawOutputFormatParams",
      "SpeedParams",
+     "SseOutputFormatParams",
      "TtsRequestEmbeddingSpecifierParams",
      "TtsRequestIdSpecifierParams",
      "TtsRequestParams",
      "TtsRequestVoiceSpecifierParams",
+     "TtssseRequestParams",
      "WavOutputFormatParams",
      "WebSocketBaseResponseParams",
      "WebSocketChunkResponseParams",
@@ -55,15 +55,15 @@ class GenerationRequestParams(typing_extensions.TypedDict):

      add_timestamps: typing_extensions.NotRequired[bool]
      """
-     Whether to return word-level timestamps.
+     Whether to return word-level timestamps. If `false` (default), no word timestamps will be produced at all. If `true`, the server will return timestamp events containing word-level timing information.
      """

      add_phoneme_timestamps: typing_extensions.NotRequired[bool]
      """
-     Whether to return phoneme-level timestamps.
+     Whether to return phoneme-level timestamps. If `false` (default), no phoneme timestamps will be produced. If `true`, the server will return timestamp events containing phoneme-level timing information.
      """

-     use_original_timestamps: typing_extensions.NotRequired[bool]
+     use_normalized_timestamps: typing_extensions.NotRequired[bool]
      """
-     Whether to use the original transcript for timestamps.
+     Whether to use normalized timestamps (True) or original timestamps (False).
      """
@@ -0,0 +1,11 @@
+ # This file was auto-generated by Fern from our API Definition.
+
+ import typing_extensions
+ import typing
+ from ..types.raw_encoding import RawEncoding
+
+
+ class SseOutputFormatParams(typing_extensions.TypedDict):
+     container: typing.Literal["raw"]
+     encoding: RawEncoding
+     sample_rate: int
@@ -0,0 +1,47 @@
+ # This file was auto-generated by Fern from our API Definition.
+
+ import typing_extensions
+ from .tts_request_voice_specifier import TtsRequestVoiceSpecifierParams
+ import typing_extensions
+ from ..types.supported_language import SupportedLanguage
+ from .sse_output_format import SseOutputFormatParams
+ from ..types.model_speed import ModelSpeed
+ from ..types.context_id import ContextId
+
+
+ class TtssseRequestParams(typing_extensions.TypedDict):
+     model_id: str
+     """
+     The ID of the model to use for the generation. See [Models](/build-with-cartesia/models) for available models.
+     """
+
+     transcript: str
+     voice: TtsRequestVoiceSpecifierParams
+     language: typing_extensions.NotRequired[SupportedLanguage]
+     output_format: SseOutputFormatParams
+     duration: typing_extensions.NotRequired[float]
+     """
+     The maximum duration of the audio in seconds. You do not usually need to specify this.
+     If the duration is not appropriate for the length of the transcript, the output audio may be truncated.
+     """
+
+     speed: typing_extensions.NotRequired[ModelSpeed]
+     add_timestamps: typing_extensions.NotRequired[bool]
+     """
+     Whether to return word-level timestamps. If `false` (default), no word timestamps will be produced at all. If `true`, the server will return timestamp events containing word-level timing information.
+     """
+
+     add_phoneme_timestamps: typing_extensions.NotRequired[bool]
+     """
+     Whether to return phoneme-level timestamps. If `false` (default), no phoneme timestamps will be produced - if `add_timestamps` is `true`, the produced timestamps will be word timestamps instead. If `true`, the server will return timestamp events containing phoneme-level timing information.
+     """
+
+     use_normalized_timestamps: typing_extensions.NotRequired[bool]
+     """
+     Whether to use normalized timestamps (True) or original timestamps (False).
+     """
+
+     context_id: typing_extensions.NotRequired[ContextId]
+     """
+     Optional context ID for this request.
+     """
@@ -1,11 +1,8 @@
  # This file was auto-generated by Fern from our API Definition.

  from .web_socket_base_response import WebSocketBaseResponseParams
- import typing_extensions
- from ..types.flush_id import FlushId


  class WebSocketChunkResponseParams(WebSocketBaseResponseParams):
      data: str
      step_time: float
-     flush_id: typing_extensions.NotRequired[FlushId]
@@ -4,8 +4,8 @@ from __future__ import annotations
  import typing_extensions
  import typing
  import typing_extensions
- from ..types.flush_id import FlushId
  from ..types.context_id import ContextId
+ from ..types.flush_id import FlushId
  from .word_timestamps import WordTimestampsParams
  from .phoneme_timestamps import PhonemeTimestampsParams

@@ -14,7 +14,6 @@ class WebSocketResponse_ChunkParams(typing_extensions.TypedDict):
      type: typing.Literal["chunk"]
      data: str
      step_time: float
-     flush_id: typing_extensions.NotRequired[FlushId]
      context_id: typing_extensions.NotRequired[ContextId]
      status_code: int
      done: bool
@@ -20,8 +20,16 @@ class WebSocketTtsRequestParams(typing_extensions.TypedDict):
      duration: typing_extensions.NotRequired[int]
      language: typing_extensions.NotRequired[str]
      add_timestamps: typing_extensions.NotRequired[bool]
-     use_original_timestamps: typing_extensions.NotRequired[bool]
+     """
+     Whether to return word-level timestamps. If `false` (default), no word timestamps will be produced at all. If `true`, the server will return timestamp events containing word-level timing information.
+     """
+
      add_phoneme_timestamps: typing_extensions.NotRequired[bool]
+     """
+     Whether to return phoneme-level timestamps. If `false` (default), no phoneme timestamps will be produced - if `add_timestamps` is `true`, the produced timestamps will be word timestamps instead. If `true`, the server will return timestamp events containing phoneme-level timing information.
+     """
+
+     use_normalized_timestamps: typing_extensions.NotRequired[bool]
      continue_: typing_extensions.NotRequired[typing_extensions.Annotated[bool, FieldMetadata(alias="continue")]]
      context_id: typing_extensions.NotRequired[str]
      max_buffer_delay_ms: typing_extensions.NotRequired[int]
@@ -15,11 +15,13 @@ from .phoneme_timestamps import PhonemeTimestamps
  from .raw_encoding import RawEncoding
  from .raw_output_format import RawOutputFormat
  from .speed import Speed
+ from .sse_output_format import SseOutputFormat
  from .supported_language import SupportedLanguage
  from .tts_request import TtsRequest
  from .tts_request_embedding_specifier import TtsRequestEmbeddingSpecifier
  from .tts_request_id_specifier import TtsRequestIdSpecifier
  from .tts_request_voice_specifier import TtsRequestVoiceSpecifier
+ from .ttssse_request import TtssseRequest
  from .wav_output_format import WavOutputFormat
  from .web_socket_base_response import WebSocketBaseResponse
  from .web_socket_chunk_response import WebSocketChunkResponse
@@ -63,11 +65,13 @@ __all__ = [
      "RawEncoding",
      "RawOutputFormat",
      "Speed",
+     "SseOutputFormat",
      "SupportedLanguage",
      "TtsRequest",
      "TtsRequestEmbeddingSpecifier",
      "TtsRequestIdSpecifier",
      "TtsRequestVoiceSpecifier",
+     "TtssseRequest",
      "WavOutputFormat",
      "WebSocketBaseResponse",
      "WebSocketChunkResponse",
@@ -59,17 +59,17 @@ class GenerationRequest(UniversalBaseModel):

      add_timestamps: typing.Optional[bool] = pydantic.Field(default=None)
      """
-     Whether to return word-level timestamps.
+     Whether to return word-level timestamps. If `false` (default), no word timestamps will be produced at all. If `true`, the server will return timestamp events containing word-level timing information.
      """

      add_phoneme_timestamps: typing.Optional[bool] = pydantic.Field(default=None)
      """
-     Whether to return phoneme-level timestamps.
+     Whether to return phoneme-level timestamps. If `false` (default), no phoneme timestamps will be produced. If `true`, the server will return timestamp events containing phoneme-level timing information.
      """

-     use_original_timestamps: typing.Optional[bool] = pydantic.Field(default=None)
+     use_normalized_timestamps: typing.Optional[bool] = pydantic.Field(default=None)
      """
-     Whether to use the original transcript for timestamps.
+     Whether to use normalized timestamps (True) or original timestamps (False).
      """

      if IS_PYDANTIC_V2:
@@ -0,0 +1,22 @@
+ # This file was auto-generated by Fern from our API Definition.
+
+ from ...core.pydantic_utilities import UniversalBaseModel
+ import typing
+ from .raw_encoding import RawEncoding
+ from ...core.pydantic_utilities import IS_PYDANTIC_V2
+ import pydantic
+
+
+ class SseOutputFormat(UniversalBaseModel):
+     container: typing.Literal["raw"] = "raw"
+     encoding: RawEncoding
+     sample_rate: int
+
+     if IS_PYDANTIC_V2:
+         model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2
+     else:
+
+         class Config:
+             frozen = True
+             smart_union = True
+             extra = pydantic.Extra.allow
@@ -0,0 +1,58 @@
+ # This file was auto-generated by Fern from our API Definition.
+
+ from ...core.pydantic_utilities import UniversalBaseModel
+ import pydantic
+ from .tts_request_voice_specifier import TtsRequestVoiceSpecifier
+ import typing
+ from .supported_language import SupportedLanguage
+ from .sse_output_format import SseOutputFormat
+ from .model_speed import ModelSpeed
+ from .context_id import ContextId
+ from ...core.pydantic_utilities import IS_PYDANTIC_V2
+
+
+ class TtssseRequest(UniversalBaseModel):
+     model_id: str = pydantic.Field()
+     """
+     The ID of the model to use for the generation. See [Models](/build-with-cartesia/models) for available models.
+     """
+
+     transcript: str
+     voice: TtsRequestVoiceSpecifier
+     language: typing.Optional[SupportedLanguage] = None
+     output_format: SseOutputFormat
+     duration: typing.Optional[float] = pydantic.Field(default=None)
+     """
+     The maximum duration of the audio in seconds. You do not usually need to specify this.
+     If the duration is not appropriate for the length of the transcript, the output audio may be truncated.
+     """
+
+     speed: typing.Optional[ModelSpeed] = None
+     add_timestamps: typing.Optional[bool] = pydantic.Field(default=None)
+     """
+     Whether to return word-level timestamps. If `false` (default), no word timestamps will be produced at all. If `true`, the server will return timestamp events containing word-level timing information.
+     """
+
+     add_phoneme_timestamps: typing.Optional[bool] = pydantic.Field(default=None)
+     """
+     Whether to return phoneme-level timestamps. If `false` (default), no phoneme timestamps will be produced - if `add_timestamps` is `true`, the produced timestamps will be word timestamps instead. If `true`, the server will return timestamp events containing phoneme-level timing information.
+     """
+
+     use_normalized_timestamps: typing.Optional[bool] = pydantic.Field(default=None)
+     """
+     Whether to use normalized timestamps (True) or original timestamps (False).
+     """
+
+     context_id: typing.Optional[ContextId] = pydantic.Field(default=None)
+     """
+     Optional context ID for this request.
+     """
+
+     if IS_PYDANTIC_V2:
+         model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2
+     else:
+
+         class Config:
+             frozen = True
+             smart_union = True
+             extra = pydantic.Extra.allow
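Taken together, `SseOutputFormat` and `TtssseRequest` above define the request body for the new SSE text-to-speech route in this release (the enlarged `cartesia/tts/client.py` in the file list carries the corresponding method, which is not shown in this diff). A construction sketch, assuming the models are exported from `cartesia.tts.types` as the updated `__init__.py` hunks indicate; the model ID, voice ID, and encoding values are placeholders, and the voice dict shape follows the usual id-based `TtsRequestVoiceSpecifier` form:

```python
from cartesia.tts.types import SseOutputFormat, TtssseRequest  # exports added in 2.0.6

# Build an SSE TTS request body; field names come from the generated models above.
request = TtssseRequest(
    model_id="sonic-2",                           # placeholder model ID
    transcript="Hello from the 2.0.6 SDK.",
    voice={"mode": "id", "id": "YOUR_VOICE_ID"},  # assumed id-specifier shape
    output_format=SseOutputFormat(encoding="pcm_f32le", sample_rate=44100),
    add_timestamps=True,             # request word-level timestamp events
    use_normalized_timestamps=True,  # align them to the normalized transcript
)
```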
@@ -1,16 +1,14 @@
  # This file was auto-generated by Fern from our API Definition.

  from .web_socket_base_response import WebSocketBaseResponse
- import typing
- from .flush_id import FlushId
  from ...core.pydantic_utilities import IS_PYDANTIC_V2
+ import typing
  import pydantic


  class WebSocketChunkResponse(WebSocketBaseResponse):
      data: str
      step_time: float
-     flush_id: typing.Optional[FlushId] = None

      if IS_PYDANTIC_V2:
          model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2
@@ -3,10 +3,10 @@
  from __future__ import annotations
  from ...core.pydantic_utilities import UniversalBaseModel
  import typing
- from .flush_id import FlushId
  from .context_id import ContextId
  from ...core.pydantic_utilities import IS_PYDANTIC_V2
  import pydantic
+ from .flush_id import FlushId
  from .word_timestamps import WordTimestamps
  from .phoneme_timestamps import PhonemeTimestamps

@@ -15,7 +15,6 @@ class WebSocketResponse_Chunk(UniversalBaseModel):
      type: typing.Literal["chunk"] = "chunk"
      data: str
      step_time: float
-     flush_id: typing.Optional[FlushId] = None
      context_id: typing.Optional[ContextId] = None
      status_code: int
      done: bool
@@ -22,9 +22,17 @@ class WebSocketTtsRequest(UniversalBaseModel):
      voice: TtsRequestVoiceSpecifier
      duration: typing.Optional[int] = None
      language: typing.Optional[str] = None
-     add_timestamps: typing.Optional[bool] = None
-     use_original_timestamps: typing.Optional[bool] = None
-     add_phoneme_timestamps: typing.Optional[bool] = None
+     add_timestamps: typing.Optional[bool] = pydantic.Field(default=None)
+     """
+     Whether to return word-level timestamps. If `false` (default), no word timestamps will be produced at all. If `true`, the server will return timestamp events containing word-level timing information.
+     """
+
+     add_phoneme_timestamps: typing.Optional[bool] = pydantic.Field(default=None)
+     """
+     Whether to return phoneme-level timestamps. If `false` (default), no phoneme timestamps will be produced - if `add_timestamps` is `true`, the produced timestamps will be word timestamps instead. If `true`, the server will return timestamp events containing phoneme-level timing information.
+     """
+
+     use_normalized_timestamps: typing.Optional[bool] = None
      continue_: typing_extensions.Annotated[typing.Optional[bool], FieldMetadata(alias="continue")] = None
      context_id: typing.Optional[str] = None
      max_buffer_delay_ms: typing.Optional[int] = None
@@ -4,7 +4,6 @@ from __future__ import annotations
  import typing_extensions
  import typing
  import typing_extensions
- from ...tts.types.flush_id import FlushId
  from ...tts.types.context_id import ContextId


@@ -12,7 +11,6 @@ class StreamingResponse_ChunkParams(typing_extensions.TypedDict):
      type: typing.Literal["chunk"]
      data: str
      step_time: float
-     flush_id: typing_extensions.NotRequired[FlushId]
      context_id: typing_extensions.NotRequired[ContextId]
      status_code: int
      done: bool
@@ -3,7 +3,6 @@
  from __future__ import annotations
  from ...core.pydantic_utilities import UniversalBaseModel
  import typing
- from ...tts.types.flush_id import FlushId
  from ...tts.types.context_id import ContextId
  from ...core.pydantic_utilities import IS_PYDANTIC_V2
  import pydantic
@@ -13,7 +12,6 @@ class StreamingResponse_Chunk(UniversalBaseModel):
      type: typing.Literal["chunk"] = "chunk"
      data: str
      step_time: float
-     flush_id: typing.Optional[FlushId] = None
      context_id: typing.Optional[ContextId] = None
      status_code: int
      done: bool
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: cartesia
- Version: 2.0.4
+ Version: 2.0.6
  Summary:
  Requires-Python: >=3.8,<4.0
  Classifier: Intended Audience :: Developers
@@ -213,6 +213,258 @@ p.terminate()
  ws.close() # Close the websocket connection
  ```

+ ## Speech-to-Text (STT) with Websockets
+
+ ```python
+ from cartesia import Cartesia
+ import os
+
+ client = Cartesia(api_key=os.getenv("CARTESIA_API_KEY"))
+
+ # Load your audio file as bytes
+ with open("path/to/audio.wav", "rb") as f:
+     audio_data = f.read()
+
+ # Convert to audio chunks (20ms chunks used here for a streaming example)
+ # This chunk size is calculated for 16kHz, 16-bit audio: 16000 * 0.02 * 2 = 640 bytes
+ chunk_size = 640
+ audio_chunks = [audio_data[i:i+chunk_size] for i in range(0, len(audio_data), chunk_size)]
+
+ # Create websocket connection with endpointing parameters
+ ws = client.stt.websocket(
+     model="ink-whisper", # Model (required)
+     language="en", # Language of your audio (required)
+     encoding="pcm_s16le", # Audio encoding format (required)
+     sample_rate=16000, # Audio sample rate (required)
+     min_volume=0.1, # Volume threshold for voice activity detection
+     max_silence_duration_secs=0.4, # Maximum silence duration before endpointing
+ )
+
+ # Send audio chunks (streaming approach)
+ for chunk in audio_chunks:
+     ws.send(chunk)
+
+ # Finalize and close
+ ws.send("finalize")
+ ws.send("done")
+
+ # Receive transcription results with word-level timestamps
+ for result in ws.receive():
+     if result['type'] == 'transcript':
+         print(f"Transcription: {result['text']}")
+
+         # Handle word-level timestamps if available
+         if 'words' in result and result['words']:
+             print("Word-level timestamps:")
+             for word_info in result['words']:
+                 word = word_info['word']
+                 start = word_info['start']
+                 end = word_info['end']
+                 print(f" '{word}': {start:.2f}s - {end:.2f}s")
+
+         if result['is_final']:
+             print("Final result received")
+     elif result['type'] == 'done':
+         break
+
+ ws.close()
+ ```
+
+ ### Async Streaming Speech-to-Text (STT) with Websockets
+
+ For real-time streaming applications, here's a more practical async example that demonstrates concurrent audio processing and result handling:
+
+ ```python
+ import asyncio
+ import os
+ from cartesia import AsyncCartesia
+
+ async def streaming_stt_example():
+     """
+     Advanced async STT example for real-time streaming applications.
+     This example simulates streaming audio processing with proper error handling
+     and demonstrates the new endpointing and word timestamp features.
+     """
+     client = AsyncCartesia(api_key=os.getenv("CARTESIA_API_KEY"))
+
+     try:
+         # Create websocket connection with voice activity detection
+         ws = await client.stt.websocket(
+             model="ink-whisper", # Model (required)
+             language="en", # Language of your audio (required)
+             encoding="pcm_s16le", # Audio encoding format (required)
+             sample_rate=16000, # Audio sample rate (required)
+             min_volume=0.15, # Volume threshold for voice activity detection
+             max_silence_duration_secs=0.3, # Maximum silence duration before endpointing
+         )
+
+         # Simulate streaming audio data (replace with your audio source)
+         async def audio_stream():
+             """Simulate real-time audio streaming - replace with actual audio capture"""
+             # Load audio file for simulation
+             with open("path/to/audio.wav", "rb") as f:
+                 audio_data = f.read()
+
+             # Stream in 100ms chunks (realistic for real-time processing)
+             chunk_size = int(16000 * 0.1 * 2) # 100ms at 16kHz, 16-bit
+
+             for i in range(0, len(audio_data), chunk_size):
+                 chunk = audio_data[i:i + chunk_size]
+                 if chunk:
+                     yield chunk
+                     # Simulate real-time streaming delay
+                     await asyncio.sleep(0.1)
+
+         # Send audio and receive results concurrently
+         async def send_audio():
+             """Send audio chunks to the STT websocket"""
+             try:
+                 async for chunk in audio_stream():
+                     await ws.send(chunk)
+                     print(f"Sent audio chunk of {len(chunk)} bytes")
+                     # Small delay to simulate realtime applications
+                     await asyncio.sleep(0.02)
+
+                 # Signal end of audio stream
+                 await ws.send("finalize")
+                 await ws.send("done")
+                 print("Audio streaming completed")
+
+             except Exception as e:
+                 print(f"Error sending audio: {e}")
+
+         async def receive_transcripts():
+             """Receive and process transcription results with word timestamps"""
+             full_transcript = ""
+             all_word_timestamps = []
+
+             try:
+                 async for result in ws.receive():
+                     if result['type'] == 'transcript':
+                         text = result['text']
+                         is_final = result['is_final']
+
+                         # Handle word-level timestamps
+                         if 'words' in result and result['words']:
+                             word_timestamps = result['words']
+                             all_word_timestamps.extend(word_timestamps)
+
+                             if is_final:
+                                 print("Word-level timestamps:")
+                                 for word_info in word_timestamps:
+                                     word = word_info['word']
+                                     start = word_info['start']
+                                     end = word_info['end']
+                                     print(f" '{word}': {start:.2f}s - {end:.2f}s")
+
+                         if is_final:
+                             # Final result - this text won't change
+                             full_transcript += text + " "
+                             print(f"FINAL: {text}")
+                         else:
+                             # Partial result - may change as more audio is processed
+                             print(f"PARTIAL: {text}")
+
+                     elif result['type'] == 'done':
+                         print("Transcription completed")
+                         break
+
+             except Exception as e:
+                 print(f"Error receiving transcripts: {e}")
+
+             return full_transcript.strip(), all_word_timestamps
+
+         print("Starting streaming STT...")
+
+         # Use asyncio.gather to run audio sending and transcript receiving concurrently
+         _, (final_transcript, word_timestamps) = await asyncio.gather(
+             send_audio(),
+             receive_transcripts()
+         )
+
+         print(f"\nComplete transcript: {final_transcript}")
+         print(f"Total words with timestamps: {len(word_timestamps)}")
+
+         # Clean up
+         await ws.close()
+
+     except Exception as e:
+         print(f"STT streaming error: {e}")
+     finally:
+         await client.close()
+
+ # Run the example
+ if __name__ == "__main__":
+     asyncio.run(streaming_stt_example())
+ ```
+
+ ## Batch Speech-to-Text (STT)
+
+ For processing pre-recorded audio files, use the batch STT API which supports uploading complete audio files for transcription:
+
+ ```python
+ from cartesia import Cartesia
+ import os
+
+ client = Cartesia(api_key=os.getenv("CARTESIA_API_KEY"))
+
+ # Transcribe an audio file with word-level timestamps
+ with open("path/to/audio.wav", "rb") as audio_file:
+     response = client.stt.transcribe(
+         file=audio_file, # Audio file to transcribe
+         model="ink-whisper", # STT model (required)
+         language="en", # Language of the audio (optional)
+         timestamp_granularities=["word"], # Include word-level timestamps (optional)
+         encoding="pcm_s16le", # Audio encoding (optional)
+         sample_rate=16000, # Audio sample rate (optional)
+     )
+
+ # Access transcription results
+ print(f"Transcribed text: {response.text}")
+ print(f"Audio duration: {response.duration:.2f} seconds")
+
+ # Process word-level timestamps if requested
+ if response.words:
+     print("\nWord-level timestamps:")
+     for word_info in response.words:
+         word = word_info.word
+         start = word_info.start
+         end = word_info.end
+         print(f" '{word}': {start:.2f}s - {end:.2f}s")
+ ```
+
+ ### Async Batch STT
+
+ ```python
+ import asyncio
+ from cartesia import AsyncCartesia
+ import os
+
+ async def transcribe_file():
+     client = AsyncCartesia(api_key=os.getenv("CARTESIA_API_KEY"))
+
+     with open("path/to/audio.wav", "rb") as audio_file:
+         response = await client.stt.transcribe(
+             file=audio_file,
+             model="ink-whisper",
+             language="en",
+             timestamp_granularities=["word"],
+         )
+
+     print(f"Transcribed text: {response.text}")
+
+     # Process word timestamps
+     if response.words:
+         for word_info in response.words:
+             print(f"'{word_info.word}': {word_info.start:.2f}s - {word_info.end:.2f}s")
+
+     await client.close()
+
+ asyncio.run(transcribe_file())
+ ```
+
+ > **Note:** Batch STT also supports OpenAI's audio transcriptions format for easy migration from OpenAI Whisper. See our [migration guide](https://docs.cartesia.ai/api-reference/stt/migrate-from-open-ai) for details.
+
  ## Voices

  List all available Voices with `client.voices.list`, which returns an iterable that automatically handles pagination:
@@ -358,7 +610,6 @@ new_voice = client.voices.create(
      language="en"
  )
  ```
-
  ### Custom Client

  You can override the `httpx` client to customize it for your use-case. Some common use-cases include support for proxies
@@ -412,3 +663,6 @@ $ git commit --amend -m "manually regenerate from docs" # optional

  From https://github.com/cartesia-ai/docs click `Actions` then `Release Python SDK`. (Requires permissions.)

+
+
+