cartesia 2.0.5__py3-none-any.whl → 2.0.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45)
  1. cartesia/__init__.py +14 -0
  2. cartesia/auth/client.py +8 -8
  3. cartesia/auth/requests/token_grant.py +7 -1
  4. cartesia/auth/requests/token_request.py +3 -3
  5. cartesia/auth/types/token_grant.py +7 -2
  6. cartesia/auth/types/token_request.py +3 -3
  7. cartesia/core/client_wrapper.py +1 -1
  8. cartesia/stt/__init__.py +6 -0
  9. cartesia/stt/_async_websocket.py +81 -72
  10. cartesia/stt/_websocket.py +42 -20
  11. cartesia/stt/client.py +456 -0
  12. cartesia/stt/requests/__init__.py +2 -0
  13. cartesia/stt/requests/streaming_transcription_response.py +2 -0
  14. cartesia/stt/requests/transcript_message.py +8 -1
  15. cartesia/stt/requests/transcription_response.py +8 -1
  16. cartesia/stt/requests/transcription_word.py +20 -0
  17. cartesia/stt/socket_client.py +52 -109
  18. cartesia/stt/types/__init__.py +4 -0
  19. cartesia/stt/types/streaming_transcription_response.py +2 -0
  20. cartesia/stt/types/stt_encoding.py +3 -1
  21. cartesia/stt/types/timestamp_granularity.py +5 -0
  22. cartesia/stt/types/transcript_message.py +7 -1
  23. cartesia/stt/types/transcription_response.py +7 -1
  24. cartesia/stt/types/transcription_word.py +32 -0
  25. cartesia/tts/__init__.py +8 -0
  26. cartesia/tts/client.py +50 -8
  27. cartesia/tts/requests/__init__.py +4 -0
  28. cartesia/tts/requests/generation_request.py +4 -4
  29. cartesia/tts/requests/sse_output_format.py +11 -0
  30. cartesia/tts/requests/ttssse_request.py +47 -0
  31. cartesia/tts/requests/web_socket_chunk_response.py +0 -3
  32. cartesia/tts/requests/web_socket_response.py +1 -2
  33. cartesia/tts/requests/web_socket_tts_request.py +9 -1
  34. cartesia/tts/types/__init__.py +4 -0
  35. cartesia/tts/types/generation_request.py +4 -4
  36. cartesia/tts/types/sse_output_format.py +22 -0
  37. cartesia/tts/types/ttssse_request.py +58 -0
  38. cartesia/tts/types/web_socket_chunk_response.py +1 -3
  39. cartesia/tts/types/web_socket_response.py +1 -2
  40. cartesia/tts/types/web_socket_tts_request.py +11 -3
  41. cartesia/voice_changer/requests/streaming_response.py +0 -2
  42. cartesia/voice_changer/types/streaming_response.py +0 -2
  43. {cartesia-2.0.5.dist-info → cartesia-2.0.6.dist-info}/METADATA +113 -16
  44. {cartesia-2.0.5.dist-info → cartesia-2.0.6.dist-info}/RECORD +45 -37
  45. {cartesia-2.0.5.dist-info → cartesia-2.0.6.dist-info}/WHEEL +0 -0
cartesia/tts/requests/web_socket_response.py
@@ -4,8 +4,8 @@ from __future__ import annotations
  import typing_extensions
  import typing
  import typing_extensions
- from ..types.flush_id import FlushId
  from ..types.context_id import ContextId
+ from ..types.flush_id import FlushId
  from .word_timestamps import WordTimestampsParams
  from .phoneme_timestamps import PhonemeTimestampsParams
@@ -14,7 +14,6 @@ class WebSocketResponse_ChunkParams(typing_extensions.TypedDict):
      type: typing.Literal["chunk"]
      data: str
      step_time: float
-     flush_id: typing_extensions.NotRequired[FlushId]
      context_id: typing_extensions.NotRequired[ContextId]
      status_code: int
      done: bool
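
This request-side TypedDict, the pydantic `WebSocketResponse_Chunk` model later in the diff, and the voice-changer chunk types all drop `flush_id` from chunk payloads in 2.0.6. A minimal compatibility sketch for downstream code that used to read the field directly; the `handle_chunk` helper and the loose typing are hypothetical, not part of the SDK:

```python
from typing import Any, Dict, Optional

def handle_chunk(chunk: Dict[str, Any]) -> Optional[Any]:
    """Hypothetical consumer that tolerates both 2.0.5 and 2.0.6 chunk payloads."""
    # 2.0.5 chunks could carry flush_id; 2.0.6 chunks no longer do.
    # dict.get() returns None instead of raising KeyError when the key is absent.
    flush_id = chunk.get("flush_id")
    print(f"step_time={chunk['step_time']}, flush_id={flush_id}")
    return flush_id
```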
cartesia/tts/requests/web_socket_tts_request.py
@@ -20,8 +20,16 @@ class WebSocketTtsRequestParams(typing_extensions.TypedDict):
      duration: typing_extensions.NotRequired[int]
      language: typing_extensions.NotRequired[str]
      add_timestamps: typing_extensions.NotRequired[bool]
-     use_original_timestamps: typing_extensions.NotRequired[bool]
+     """
+     Whether to return word-level timestamps. If `false` (default), no word timestamps will be produced at all. If `true`, the server will return timestamp events containing word-level timing information.
+     """
+
      add_phoneme_timestamps: typing_extensions.NotRequired[bool]
+     """
+     Whether to return phoneme-level timestamps. If `false` (default), no phoneme timestamps will be produced - if `add_timestamps` is `true`, the produced timestamps will be word timestamps instead. If `true`, the server will return timestamp events containing phoneme-level timing information.
+     """
+
+     use_normalized_timestamps: typing_extensions.NotRequired[bool]
      continue_: typing_extensions.NotRequired[typing_extensions.Annotated[bool, FieldMetadata(alias="continue")]]
      context_id: typing_extensions.NotRequired[str]
      max_buffer_delay_ms: typing_extensions.NotRequired[int]
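
A short sketch of the updated params in use. TypedDicts are ordinary dicts at runtime, so this is just a literal; the `model_id` and `transcript` keys are assumed from the untouched top of the class (not shown in this hunk), and the IDs are placeholders:

```python
from cartesia.tts.requests.web_socket_tts_request import WebSocketTtsRequestParams

request: WebSocketTtsRequestParams = {
    "model_id": "sonic-2",                           # placeholder model ID
    "transcript": "Hello from 2.0.6!",
    "voice": {"mode": "id", "id": "YOUR_VOICE_ID"},  # placeholder voice specifier
    "add_timestamps": True,              # emit word-level timestamp events
    "add_phoneme_timestamps": False,     # skip phoneme-level timing
    "use_normalized_timestamps": False,  # replaces 2.0.5's use_original_timestamps
}
```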
cartesia/tts/types/__init__.py
@@ -15,11 +15,13 @@ from .phoneme_timestamps import PhonemeTimestamps
  from .raw_encoding import RawEncoding
  from .raw_output_format import RawOutputFormat
  from .speed import Speed
+ from .sse_output_format import SseOutputFormat
  from .supported_language import SupportedLanguage
  from .tts_request import TtsRequest
  from .tts_request_embedding_specifier import TtsRequestEmbeddingSpecifier
  from .tts_request_id_specifier import TtsRequestIdSpecifier
  from .tts_request_voice_specifier import TtsRequestVoiceSpecifier
+ from .ttssse_request import TtssseRequest
  from .wav_output_format import WavOutputFormat
  from .web_socket_base_response import WebSocketBaseResponse
  from .web_socket_chunk_response import WebSocketChunkResponse
@@ -63,11 +65,13 @@ __all__ = [
      "RawEncoding",
      "RawOutputFormat",
      "Speed",
+     "SseOutputFormat",
      "SupportedLanguage",
      "TtsRequest",
      "TtsRequestEmbeddingSpecifier",
      "TtsRequestIdSpecifier",
      "TtsRequestVoiceSpecifier",
+     "TtssseRequest",
      "WavOutputFormat",
      "WebSocketBaseResponse",
      "WebSocketChunkResponse",
cartesia/tts/types/generation_request.py
@@ -59,17 +59,17 @@ class GenerationRequest(UniversalBaseModel):

      add_timestamps: typing.Optional[bool] = pydantic.Field(default=None)
      """
-     Whether to return word-level timestamps.
+     Whether to return word-level timestamps. If `false` (default), no word timestamps will be produced at all. If `true`, the server will return timestamp events containing word-level timing information.
      """

      add_phoneme_timestamps: typing.Optional[bool] = pydantic.Field(default=None)
      """
-     Whether to return phoneme-level timestamps.
+     Whether to return phoneme-level timestamps. If `false` (default), no phoneme timestamps will be produced. If `true`, the server will return timestamp events containing phoneme-level timing information.
      """

-     use_original_timestamps: typing.Optional[bool] = pydantic.Field(default=None)
+     use_normalized_timestamps: typing.Optional[bool] = pydantic.Field(default=None)
      """
-     Whether to use the original transcript for timestamps.
+     Whether to use normalized timestamps (True) or original timestamps (False).
      """

      if IS_PYDANTIC_V2:
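
Note the polarity flip hiding in this rename: `use_original_timestamps=True` opted into original-transcript timestamps, whereas original timestamps are now the `False` case of `use_normalized_timestamps`. A hedged migration sketch; the equivalence is inferred from the two docstrings, not stated anywhere in this diff:

```python
def migrate_timestamp_flag(use_original_timestamps: bool) -> bool:
    """Map the 2.0.5 flag onto the 2.0.6 flag (inferred equivalence).

    2.0.5: use_original_timestamps=True    -> original timestamps
    2.0.6: use_normalized_timestamps=False -> original timestamps
    """
    return not use_original_timestamps

# A caller that passed use_original_timestamps=True would now pass
# use_normalized_timestamps=migrate_timestamp_flag(True), i.e. False.
```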
cartesia/tts/types/sse_output_format.py (new file)
@@ -0,0 +1,22 @@
+ # This file was auto-generated by Fern from our API Definition.
+
+ from ...core.pydantic_utilities import UniversalBaseModel
+ import typing
+ from .raw_encoding import RawEncoding
+ from ...core.pydantic_utilities import IS_PYDANTIC_V2
+ import pydantic
+
+
+ class SseOutputFormat(UniversalBaseModel):
+     container: typing.Literal["raw"] = "raw"
+     encoding: RawEncoding
+     sample_rate: int
+
+     if IS_PYDANTIC_V2:
+         model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True)  # type: ignore # Pydantic v2
+     else:
+
+         class Config:
+             frozen = True
+             smart_union = True
+             extra = pydantic.Extra.allow
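
A quick sketch of the new model in use. `SseOutputFormat` is exported from `cartesia.tts.types` per the `__init__.py` hunk above; the field values below are placeholder choices, with `pcm_s16le` assumed to be one of the existing `RawEncoding` literals:

```python
from cartesia.tts.types import SseOutputFormat

# SSE output is pinned to a raw container; only encoding and sample_rate vary.
fmt = SseOutputFormat(
    encoding="pcm_s16le",  # assumed RawEncoding literal: 16-bit little-endian PCM
    sample_rate=16000,     # placeholder; match your playback pipeline
)
print(fmt.container)  # "raw", fixed by the Literal[...] default
```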
cartesia/tts/types/ttssse_request.py (new file)
@@ -0,0 +1,58 @@
+ # This file was auto-generated by Fern from our API Definition.
+
+ from ...core.pydantic_utilities import UniversalBaseModel
+ import pydantic
+ from .tts_request_voice_specifier import TtsRequestVoiceSpecifier
+ import typing
+ from .supported_language import SupportedLanguage
+ from .sse_output_format import SseOutputFormat
+ from .model_speed import ModelSpeed
+ from .context_id import ContextId
+ from ...core.pydantic_utilities import IS_PYDANTIC_V2
+
+
+ class TtssseRequest(UniversalBaseModel):
+     model_id: str = pydantic.Field()
+     """
+     The ID of the model to use for the generation. See [Models](/build-with-cartesia/models) for available models.
+     """
+
+     transcript: str
+     voice: TtsRequestVoiceSpecifier
+     language: typing.Optional[SupportedLanguage] = None
+     output_format: SseOutputFormat
+     duration: typing.Optional[float] = pydantic.Field(default=None)
+     """
+     The maximum duration of the audio in seconds. You do not usually need to specify this.
+     If the duration is not appropriate for the length of the transcript, the output audio may be truncated.
+     """
+
+     speed: typing.Optional[ModelSpeed] = None
+     add_timestamps: typing.Optional[bool] = pydantic.Field(default=None)
+     """
+     Whether to return word-level timestamps. If `false` (default), no word timestamps will be produced at all. If `true`, the server will return timestamp events containing word-level timing information.
+     """
+
+     add_phoneme_timestamps: typing.Optional[bool] = pydantic.Field(default=None)
+     """
+     Whether to return phoneme-level timestamps. If `false` (default), no phoneme timestamps will be produced - if `add_timestamps` is `true`, the produced timestamps will be word timestamps instead. If `true`, the server will return timestamp events containing phoneme-level timing information.
+     """
+
+     use_normalized_timestamps: typing.Optional[bool] = pydantic.Field(default=None)
+     """
+     Whether to use normalized timestamps (True) or original timestamps (False).
+     """
+
+     context_id: typing.Optional[ContextId] = pydantic.Field(default=None)
+     """
+     Optional context ID for this request.
+     """
+
+     if IS_PYDANTIC_V2:
+         model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True)  # type: ignore # Pydantic v2
+     else:
+
+         class Config:
+             frozen = True
+             smart_union = True
+             extra = pydantic.Extra.allow
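
Putting the two new models together, a sketch of building the SSE request body by hand. The model and voice IDs are placeholders, and letting pydantic coerce the plain `voice` dict into a `TtsRequestVoiceSpecifier` is an assumption; in practice the SDK's `client.tts.sse(...)` call is the usual entry point rather than constructing this model directly:

```python
from cartesia.tts.types import SseOutputFormat, TtssseRequest

request = TtssseRequest(
    model_id="sonic-2",                           # placeholder model ID
    transcript="Streaming speech over SSE.",
    voice={"mode": "id", "id": "YOUR_VOICE_ID"},  # placeholder; assumed dict-to-union coercion
    output_format=SseOutputFormat(encoding="pcm_s16le", sample_rate=16000),
    add_timestamps=True,              # word-level timestamp events
    use_normalized_timestamps=False,  # keep original-transcript timing
)
```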
cartesia/tts/types/web_socket_chunk_response.py
@@ -1,16 +1,14 @@
  # This file was auto-generated by Fern from our API Definition.

  from .web_socket_base_response import WebSocketBaseResponse
- import typing
- from .flush_id import FlushId
  from ...core.pydantic_utilities import IS_PYDANTIC_V2
+ import typing
  import pydantic


  class WebSocketChunkResponse(WebSocketBaseResponse):
      data: str
      step_time: float
-     flush_id: typing.Optional[FlushId] = None

      if IS_PYDANTIC_V2:
          model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True)  # type: ignore # Pydantic v2
cartesia/tts/types/web_socket_response.py
@@ -3,10 +3,10 @@
  from __future__ import annotations
  from ...core.pydantic_utilities import UniversalBaseModel
  import typing
- from .flush_id import FlushId
  from .context_id import ContextId
  from ...core.pydantic_utilities import IS_PYDANTIC_V2
  import pydantic
+ from .flush_id import FlushId
  from .word_timestamps import WordTimestamps
  from .phoneme_timestamps import PhonemeTimestamps
@@ -15,7 +15,6 @@ class WebSocketResponse_Chunk(UniversalBaseModel):
      type: typing.Literal["chunk"] = "chunk"
      data: str
      step_time: float
-     flush_id: typing.Optional[FlushId] = None
      context_id: typing.Optional[ContextId] = None
      status_code: int
      done: bool
cartesia/tts/types/web_socket_tts_request.py
@@ -22,9 +22,17 @@ class WebSocketTtsRequest(UniversalBaseModel):
      voice: TtsRequestVoiceSpecifier
      duration: typing.Optional[int] = None
      language: typing.Optional[str] = None
-     add_timestamps: typing.Optional[bool] = None
-     use_original_timestamps: typing.Optional[bool] = None
-     add_phoneme_timestamps: typing.Optional[bool] = None
+     add_timestamps: typing.Optional[bool] = pydantic.Field(default=None)
+     """
+     Whether to return word-level timestamps. If `false` (default), no word timestamps will be produced at all. If `true`, the server will return timestamp events containing word-level timing information.
+     """
+
+     add_phoneme_timestamps: typing.Optional[bool] = pydantic.Field(default=None)
+     """
+     Whether to return phoneme-level timestamps. If `false` (default), no phoneme timestamps will be produced - if `add_timestamps` is `true`, the produced timestamps will be word timestamps instead. If `true`, the server will return timestamp events containing phoneme-level timing information.
+     """
+
+     use_normalized_timestamps: typing.Optional[bool] = None
      continue_: typing_extensions.Annotated[typing.Optional[bool], FieldMetadata(alias="continue")] = None
      context_id: typing.Optional[str] = None
      max_buffer_delay_ms: typing.Optional[int] = None
cartesia/voice_changer/requests/streaming_response.py
@@ -4,7 +4,6 @@ from __future__ import annotations
  import typing_extensions
  import typing
  import typing_extensions
- from ...tts.types.flush_id import FlushId
  from ...tts.types.context_id import ContextId
@@ -12,7 +11,6 @@ class StreamingResponse_ChunkParams(typing_extensions.TypedDict):
      type: typing.Literal["chunk"]
      data: str
      step_time: float
-     flush_id: typing_extensions.NotRequired[FlushId]
      context_id: typing_extensions.NotRequired[ContextId]
      status_code: int
      done: bool
cartesia/voice_changer/types/streaming_response.py
@@ -3,7 +3,6 @@
  from __future__ import annotations
  from ...core.pydantic_utilities import UniversalBaseModel
  import typing
- from ...tts.types.flush_id import FlushId
  from ...tts.types.context_id import ContextId
  from ...core.pydantic_utilities import IS_PYDANTIC_V2
  import pydantic
@@ -13,7 +12,6 @@ class StreamingResponse_Chunk(UniversalBaseModel):
      type: typing.Literal["chunk"] = "chunk"
      data: str
      step_time: float
-     flush_id: typing.Optional[FlushId] = None
      context_id: typing.Optional[ContextId] = None
      status_code: int
      done: bool
{cartesia-2.0.5.dist-info → cartesia-2.0.6.dist-info}/METADATA
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: cartesia
- Version: 2.0.5
+ Version: 2.0.6
  Summary:
  Requires-Python: >=3.8,<4.0
  Classifier: Intended Audience :: Developers
@@ -230,12 +230,14 @@ with open("path/to/audio.wav", "rb") as f:
  chunk_size = 640
  audio_chunks = [audio_data[i:i+chunk_size] for i in range(0, len(audio_data), chunk_size)]

- # Create websocket connection
+ # Create websocket connection with endpointing parameters
  ws = client.stt.websocket(
-     model="ink-whisper",
-     language="en",  # Must match the language of your audio
-     encoding="pcm_s16le",  # Must match your audio's encoding format
-     sample_rate=16000,  # Must match your audio's sample rate
+     model="ink-whisper",  # Model (required)
+     language="en",  # Language of your audio (required)
+     encoding="pcm_s16le",  # Audio encoding format (required)
+     sample_rate=16000,  # Audio sample rate (required)
+     min_volume=0.1,  # Volume threshold for voice activity detection
+     max_silence_duration_secs=0.4,  # Maximum silence duration before endpointing
  )

  # Send audio chunks (streaming approach)
@@ -246,10 +248,20 @@ for chunk in audio_chunks:
  ws.send("finalize")
  ws.send("done")

- # Receive transcription results
+ # Receive transcription results with word-level timestamps
  for result in ws.receive():
      if result['type'] == 'transcript':
          print(f"Transcription: {result['text']}")
+
+         # Handle word-level timestamps if available
+         if 'words' in result and result['words']:
+             print("Word-level timestamps:")
+             for word_info in result['words']:
+                 word = word_info['word']
+                 start = word_info['start']
+                 end = word_info['end']
+                 print(f"  '{word}': {start:.2f}s - {end:.2f}s")
+
          if result['is_final']:
              print("Final result received")
      elif result['type'] == 'done':
@@ -270,17 +282,20 @@ from cartesia import AsyncCartesia
  async def streaming_stt_example():
      """
      Advanced async STT example for real-time streaming applications.
-     This example simulates streaming audio processing with proper error handling.
+     This example simulates streaming audio processing with proper error handling
+     and demonstrates the new endpointing and word timestamp features.
      """
      client = AsyncCartesia(api_key=os.getenv("CARTESIA_API_KEY"))

      try:
-         # Create websocket connection
+         # Create websocket connection with voice activity detection
          ws = await client.stt.websocket(
-             model="ink-whisper",
-             language="en",  # Must match the language of your audio
-             encoding="pcm_s16le",  # Must match your audio's encoding format
-             sample_rate=16000,  # Must match your audio's sample rate
+             model="ink-whisper",  # Model (required)
+             language="en",  # Language of your audio (required)
+             encoding="pcm_s16le",  # Audio encoding format (required)
+             sample_rate=16000,  # Audio sample rate (required)
+             min_volume=0.15,  # Volume threshold for voice activity detection
+             max_silence_duration_secs=0.3,  # Maximum silence duration before endpointing
          )

          # Simulate streaming audio data (replace with your audio source)
@@ -319,8 +334,9 @@ async def streaming_stt_example():
                  print(f"Error sending audio: {e}")

          async def receive_transcripts():
-             """Receive and process transcription results"""
+             """Receive and process transcription results with word timestamps"""
              full_transcript = ""
+             all_word_timestamps = []

              try:
                  async for result in ws.receive():
@@ -328,6 +344,19 @@ async def streaming_stt_example():
                          text = result['text']
                          is_final = result['is_final']

+                         # Handle word-level timestamps
+                         if 'words' in result and result['words']:
+                             word_timestamps = result['words']
+                             all_word_timestamps.extend(word_timestamps)
+
+                             if is_final:
+                                 print("Word-level timestamps:")
+                                 for word_info in word_timestamps:
+                                     word = word_info['word']
+                                     start = word_info['start']
+                                     end = word_info['end']
+                                     print(f"  '{word}': {start:.2f}s - {end:.2f}s")
+
                          if is_final:
                              # Final result - this text won't change
                              full_transcript += text + " "
@@ -343,17 +372,18 @@ async def streaming_stt_example():
              except Exception as e:
                  print(f"Error receiving transcripts: {e}")

-             return full_transcript.strip()
+             return full_transcript.strip(), all_word_timestamps

          print("Starting streaming STT...")

          # Use asyncio.gather to run audio sending and transcript receiving concurrently
-         _, final_transcript = await asyncio.gather(
+         _, (final_transcript, word_timestamps) = await asyncio.gather(
              send_audio(),
              receive_transcripts()
          )

          print(f"\nComplete transcript: {final_transcript}")
+         print(f"Total words with timestamps: {len(word_timestamps)}")

          # Clean up
          await ws.close()
@@ -368,6 +398,73 @@ if __name__ == "__main__":
      asyncio.run(streaming_stt_example())
  ```

+ ## Batch Speech-to-Text (STT)
+
+ For processing pre-recorded audio files, use the batch STT API which supports uploading complete audio files for transcription:
+
+ ```python
+ from cartesia import Cartesia
+ import os
+
+ client = Cartesia(api_key=os.getenv("CARTESIA_API_KEY"))
+
+ # Transcribe an audio file with word-level timestamps
+ with open("path/to/audio.wav", "rb") as audio_file:
+     response = client.stt.transcribe(
+         file=audio_file,  # Audio file to transcribe
+         model="ink-whisper",  # STT model (required)
+         language="en",  # Language of the audio (optional)
+         timestamp_granularities=["word"],  # Include word-level timestamps (optional)
+         encoding="pcm_s16le",  # Audio encoding (optional)
+         sample_rate=16000,  # Audio sample rate (optional)
+     )
+
+ # Access transcription results
+ print(f"Transcribed text: {response.text}")
+ print(f"Audio duration: {response.duration:.2f} seconds")
+
+ # Process word-level timestamps if requested
+ if response.words:
+     print("\nWord-level timestamps:")
+     for word_info in response.words:
+         word = word_info.word
+         start = word_info.start
+         end = word_info.end
+         print(f"  '{word}': {start:.2f}s - {end:.2f}s")
+ ```
+
+ ### Async Batch STT
+
+ ```python
+ import asyncio
+ from cartesia import AsyncCartesia
+ import os
+
+ async def transcribe_file():
+     client = AsyncCartesia(api_key=os.getenv("CARTESIA_API_KEY"))
+
+     with open("path/to/audio.wav", "rb") as audio_file:
+         response = await client.stt.transcribe(
+             file=audio_file,
+             model="ink-whisper",
+             language="en",
+             timestamp_granularities=["word"],
+         )
+
+     print(f"Transcribed text: {response.text}")
+
+     # Process word timestamps
+     if response.words:
+         for word_info in response.words:
+             print(f"'{word_info.word}': {word_info.start:.2f}s - {word_info.end:.2f}s")
+
+     await client.close()
+
+ asyncio.run(transcribe_file())
+ ```
+
+ > **Note:** Batch STT also supports OpenAI's audio transcriptions format for easy migration from OpenAI Whisper. See our [migration guide](https://docs.cartesia.ai/api-reference/stt/migrate-from-open-ai) for details.
+
  ## Voices

  List all available Voices with `client.voices.list`, which returns an iterable that automatically handles pagination:
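
The diff context cuts off here, before the README's own snippet. A minimal sketch of the iteration that sentence describes, assuming the pager yields voice objects with `id` and `name` attributes:

```python
from cartesia import Cartesia
import os

client = Cartesia(api_key=os.getenv("CARTESIA_API_KEY"))

# The returned pager fetches successive pages lazily as you iterate.
for voice in client.voices.list():
    print(voice.id, voice.name)
```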