cartesia 2.0.5__py3-none-any.whl → 2.0.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45)
  1. cartesia/__init__.py +14 -0
  2. cartesia/auth/client.py +8 -8
  3. cartesia/auth/requests/token_grant.py +7 -1
  4. cartesia/auth/requests/token_request.py +3 -3
  5. cartesia/auth/types/token_grant.py +7 -2
  6. cartesia/auth/types/token_request.py +3 -3
  7. cartesia/core/client_wrapper.py +1 -1
  8. cartesia/stt/__init__.py +6 -0
  9. cartesia/stt/_async_websocket.py +81 -72
  10. cartesia/stt/_websocket.py +42 -20
  11. cartesia/stt/client.py +456 -0
  12. cartesia/stt/requests/__init__.py +2 -0
  13. cartesia/stt/requests/streaming_transcription_response.py +2 -0
  14. cartesia/stt/requests/transcript_message.py +8 -1
  15. cartesia/stt/requests/transcription_response.py +8 -1
  16. cartesia/stt/requests/transcription_word.py +20 -0
  17. cartesia/stt/socket_client.py +52 -109
  18. cartesia/stt/types/__init__.py +4 -0
  19. cartesia/stt/types/streaming_transcription_response.py +2 -0
  20. cartesia/stt/types/stt_encoding.py +3 -1
  21. cartesia/stt/types/timestamp_granularity.py +5 -0
  22. cartesia/stt/types/transcript_message.py +7 -1
  23. cartesia/stt/types/transcription_response.py +7 -1
  24. cartesia/stt/types/transcription_word.py +32 -0
  25. cartesia/tts/__init__.py +8 -0
  26. cartesia/tts/client.py +50 -8
  27. cartesia/tts/requests/__init__.py +4 -0
  28. cartesia/tts/requests/generation_request.py +4 -4
  29. cartesia/tts/requests/sse_output_format.py +11 -0
  30. cartesia/tts/requests/ttssse_request.py +47 -0
  31. cartesia/tts/requests/web_socket_chunk_response.py +0 -3
  32. cartesia/tts/requests/web_socket_response.py +1 -2
  33. cartesia/tts/requests/web_socket_tts_request.py +9 -1
  34. cartesia/tts/types/__init__.py +4 -0
  35. cartesia/tts/types/generation_request.py +4 -4
  36. cartesia/tts/types/sse_output_format.py +22 -0
  37. cartesia/tts/types/ttssse_request.py +58 -0
  38. cartesia/tts/types/web_socket_chunk_response.py +1 -3
  39. cartesia/tts/types/web_socket_response.py +1 -2
  40. cartesia/tts/types/web_socket_tts_request.py +11 -3
  41. cartesia/voice_changer/requests/streaming_response.py +0 -2
  42. cartesia/voice_changer/types/streaming_response.py +0 -2
  43. {cartesia-2.0.5.dist-info → cartesia-2.0.6.dist-info}/METADATA +113 -16
  44. {cartesia-2.0.5.dist-info → cartesia-2.0.6.dist-info}/RECORD +45 -37
  45. {cartesia-2.0.5.dist-info → cartesia-2.0.6.dist-info}/WHEEL +0 -0
@@ -1,18 +1,20 @@
1
1
  import typing
2
- from typing import Any, Dict, Generator, Optional, Union
2
+ from typing import Any, Dict, Generator, Optional
3
3
 
4
4
  from ..core.client_wrapper import AsyncClientWrapper, SyncClientWrapper
5
5
  from ._async_websocket import AsyncSttWebsocket
6
6
  from ._websocket import SttWebsocket
7
+ from .client import AsyncSttClient, SttClient
8
+ from .types.stt_encoding import SttEncoding
7
9
 
8
10
 
9
- class SttClientWithWebsocket:
11
+ class SttClientWithWebsocket(SttClient):
10
12
  """
11
13
  Extension of STT functionality that supports a synchronous WebSocket STT connection.
12
14
  """
13
15
 
14
16
  def __init__(self, *, client_wrapper: SyncClientWrapper):
15
- self._client_wrapper = client_wrapper
17
+ super().__init__(client_wrapper=client_wrapper)
16
18
 
17
19
  def _ws_url(self):
18
20
  base_url = self._client_wrapper.get_base_url()
@@ -23,21 +25,34 @@ class SttClientWithWebsocket:
23
25
  base_url_without_protocol = base_url.split("://")[-1]
24
26
  return f"{prefix}://{base_url_without_protocol}"
25
27
 
26
- def websocket(self, *,
27
- model: str = "ink-whisper",
28
- language: Optional[str] = "en",
29
- encoding: Optional[str] = "pcm_s16le",
30
- sample_rate: int = 16000):
28
+ def websocket(
29
+ self,
30
+ *,
31
+ model: str = "ink-whisper",
32
+ language: Optional[str] = "en",
33
+ encoding: SttEncoding = "pcm_s16le",
34
+ sample_rate: int = 16000,
35
+ min_volume: Optional[float] = None,
36
+ max_silence_duration_secs: Optional[float] = None,
37
+ ):
31
38
  """Create a WebSocket connection for real-time speech transcription.
32
39
 
33
40
  Args:
34
41
  model: ID of the model to use for transcription
35
42
  language: The language of the input audio in ISO-639-1 format
36
- encoding: The encoding format of the audio data
37
- sample_rate: The sample rate of the audio in Hz
43
+ encoding: The encoding format of the audio data (required)
44
+ sample_rate: The sample rate of the audio in Hz (required)
45
+ min_volume: Volume threshold for voice activity detection (0.0-1.0)
46
+ max_silence_duration_secs: Maximum duration of silence before endpointing
38
47
 
39
48
  Returns:
40
49
  SttWebsocket: A connected WebSocket client for STT operations.
50
+
51
+ Example:
52
+ >>> client = Cartesia(api_key="your-api-key")
53
+ >>> ws = client.stt.websocket()
54
+ >>> for result in ws.transcribe(audio_chunks):
55
+ ... print(result["text"])
41
56
  """
42
57
  client_headers = self._client_wrapper.get_headers()
43
58
  ws = SttWebsocket(
@@ -51,61 +66,19 @@ class SttClientWithWebsocket:
51
66
  language=language,
52
67
  encoding=encoding,
53
68
  sample_rate=sample_rate,
69
+ min_volume=min_volume,
70
+ max_silence_duration_secs=max_silence_duration_secs,
54
71
  )
55
72
  return ws
56
73
 
57
- def transcribe(
58
- self,
59
- audio_chunks: typing.Iterator[bytes],
60
- *,
61
- model: str = "ink-whisper",
62
- language: Optional[str] = "en",
63
- encoding: Optional[str] = "pcm_s16le",
64
- sample_rate: int = 16000,
65
- ) -> Generator[Dict[str, Any], None, None]:
66
- """Transcribe audio chunks using WebSocket.
67
74
 
68
- Args:
69
- audio_chunks: Iterator of audio chunks as bytes
70
- model: ID of the model to use for transcription
71
- language: The language of the input audio in ISO-639-1 format
72
- encoding: The encoding format of the audio data
73
- sample_rate: The sample rate of the audio in Hz
74
-
75
- Yields:
76
- Dictionary containing transcription results, flush_done, done, or error messages
77
-
78
- Example:
79
- >>> client = Cartesia(api_key="your-api-key")
80
- >>> ws_client = client.stt.websocket()
81
- >>> for result in ws_client.transcribe(audio_chunks):
82
- ... print(result["text"])
83
- """
84
- ws = self.websocket(
85
- model=model,
86
- language=language,
87
- encoding=encoding,
88
- sample_rate=sample_rate,
89
- )
90
- try:
91
- yield from ws.transcribe(
92
- audio_chunks,
93
- model=model,
94
- language=language,
95
- encoding=encoding,
96
- sample_rate=sample_rate,
97
- )
98
- finally:
99
- ws.close()
100
-
101
-
102
- class AsyncSttClientWithWebsocket:
75
+ class AsyncSttClientWithWebsocket(AsyncSttClient):
103
76
  """
104
77
  Extension of STT functionality that supports an asynchronous WebSocket STT connection.
105
78
  """
106
79
 
107
80
  def __init__(self, *, client_wrapper: AsyncClientWrapper, get_session):
108
- self._client_wrapper = client_wrapper
81
+ super().__init__(client_wrapper=client_wrapper)
109
82
  self._get_session = get_session
110
83
 
111
84
  def _ws_url(self) -> str:
@@ -117,21 +90,34 @@ class AsyncSttClientWithWebsocket:
117
90
  base_url_without_protocol = base_url.split("://")[-1]
118
91
  return f"{prefix}://{base_url_without_protocol}"
119
92
 
120
- async def websocket(self, *,
121
- model: str = "ink-whisper",
122
- language: Optional[str] = "en",
123
- encoding: Optional[str] = "pcm_s16le",
124
- sample_rate: int = 16000):
93
+ async def websocket(
94
+ self,
95
+ *,
96
+ model: str = "ink-whisper",
97
+ language: Optional[str] = "en",
98
+ encoding: SttEncoding = "pcm_s16le",
99
+ sample_rate: int = 16000,
100
+ min_volume: Optional[float] = None,
101
+ max_silence_duration_secs: Optional[float] = None,
102
+ ):
125
103
  """Create an async WebSocket connection for real-time speech transcription.
126
104
 
127
105
  Args:
128
106
  model: ID of the model to use for transcription
129
107
  language: The language of the input audio in ISO-639-1 format
130
- encoding: The encoding format of the audio data
131
- sample_rate: The sample rate of the audio in Hz
108
+ encoding: The encoding format of the audio data (required)
109
+ sample_rate: The sample rate of the audio in Hz (required)
110
+ min_volume: Volume threshold for voice activity detection (0.0-1.0)
111
+ max_silence_duration_secs: Maximum duration of silence before endpointing
132
112
 
133
113
  Returns:
134
114
  AsyncSttWebsocket: A connected async WebSocket client for STT operations.
115
+
116
+ Example:
117
+ >>> client = AsyncCartesia(api_key="your-api-key")
118
+ >>> ws = await client.stt.websocket()
119
+ >>> async for result in ws.transcribe(audio_chunks):
120
+ ... print(result["text"])
135
121
  """
136
122
  client_headers = self._client_wrapper.get_headers()
137
123
  ws = AsyncSttWebsocket(
@@ -146,50 +132,7 @@ class AsyncSttClientWithWebsocket:
146
132
  language=language,
147
133
  encoding=encoding,
148
134
  sample_rate=sample_rate,
135
+ min_volume=min_volume,
136
+ max_silence_duration_secs=max_silence_duration_secs,
149
137
  )
150
- return ws
151
-
152
- async def transcribe(
153
- self,
154
- audio_chunks: typing.AsyncIterator[bytes],
155
- *,
156
- model: str = "ink-whisper",
157
- language: Optional[str] = "en",
158
- encoding: Optional[str] = "pcm_s16le",
159
- sample_rate: int = 16000,
160
- ) -> typing.AsyncGenerator[Dict[str, Any], None]:
161
- """Transcribe audio chunks using async WebSocket.
162
-
163
- Args:
164
- audio_chunks: Async iterator of audio chunks as bytes
165
- model: ID of the model to use for transcription
166
- language: The language of the input audio in ISO-639-1 format
167
- encoding: The encoding format of the audio data
168
- sample_rate: The sample rate of the audio in Hz
169
-
170
- Yields:
171
- Dictionary containing transcription results, flush_done, done, or error messages
172
-
173
- Example:
174
- >>> client = AsyncCartesia(api_key="your-api-key")
175
- >>> ws_client = await client.stt.websocket()
176
- >>> async for result in ws_client.transcribe(audio_chunks):
177
- ... print(result["text"])
178
- """
179
- ws = await self.websocket(
180
- model=model,
181
- language=language,
182
- encoding=encoding,
183
- sample_rate=sample_rate,
184
- )
185
- try:
186
- async for result in ws.transcribe(
187
- audio_chunks,
188
- model=model,
189
- language=language,
190
- encoding=encoding,
191
- sample_rate=sample_rate,
192
- ):
193
- yield result
194
- finally:
195
- await ws.close()
138
+ return ws
@@ -11,8 +11,10 @@ from .streaming_transcription_response import (
11
11
  StreamingTranscriptionResponse_Transcript,
12
12
  )
13
13
  from .stt_encoding import SttEncoding
14
+ from .timestamp_granularity import TimestampGranularity
14
15
  from .transcript_message import TranscriptMessage
15
16
  from .transcription_response import TranscriptionResponse
17
+ from .transcription_word import TranscriptionWord
16
18
 
17
19
  __all__ = [
18
20
  "DoneMessage",
@@ -24,6 +26,8 @@ __all__ = [
24
26
  "StreamingTranscriptionResponse_FlushDone",
25
27
  "StreamingTranscriptionResponse_Transcript",
26
28
  "SttEncoding",
29
+ "TimestampGranularity",
27
30
  "TranscriptMessage",
28
31
  "TranscriptionResponse",
32
+ "TranscriptionWord",
29
33
  ]
@@ -3,6 +3,7 @@
3
3
  from __future__ import annotations
4
4
  from ...core.pydantic_utilities import UniversalBaseModel
5
5
  import typing
6
+ from .transcription_word import TranscriptionWord
6
7
  from ...core.pydantic_utilities import IS_PYDANTIC_V2
7
8
  import pydantic
8
9
 
@@ -18,6 +19,7 @@ class StreamingTranscriptionResponse_Transcript(UniversalBaseModel):
18
19
  is_final: bool
19
20
  duration: typing.Optional[float] = None
20
21
  language: typing.Optional[str] = None
22
+ words: typing.Optional[typing.List[TranscriptionWord]] = None
21
23
 
22
24
  if IS_PYDANTIC_V2:
23
25
  model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2
@@ -2,4 +2,6 @@
2
2
 
3
3
  import typing
4
4
 
5
- SttEncoding = typing.Union[typing.Literal["pcm_s16le"], typing.Any]
5
+ SttEncoding = typing.Union[
6
+ typing.Literal["pcm_s16le", "pcm_s32le", "pcm_f16le", "pcm_f32le", "pcm_mulaw", "pcm_alaw"], typing.Any
7
+ ]
@@ -0,0 +1,5 @@
1
+ # This file was auto-generated by Fern from our API Definition.
2
+
3
+ import typing
4
+
5
+ TimestampGranularity = typing.Union[typing.Literal["word"], typing.Any]
@@ -3,6 +3,7 @@
3
3
  from ...core.pydantic_utilities import UniversalBaseModel
4
4
  import pydantic
5
5
  import typing
6
+ from .transcription_word import TranscriptionWord
6
7
  from ...core.pydantic_utilities import IS_PYDANTIC_V2
7
8
 
8
9
 
@@ -31,7 +32,12 @@ class TranscriptMessage(UniversalBaseModel):
31
32
 
32
33
  language: typing.Optional[str] = pydantic.Field(default=None)
33
34
  """
34
- The detected or specified language of the input audio.
35
+ The specified language of the input audio.
36
+ """
37
+
38
+ words: typing.Optional[typing.List[TranscriptionWord]] = pydantic.Field(default=None)
39
+ """
40
+ Word-level timestamps showing the start and end time of each word in seconds. Always included in streaming responses.
35
41
  """
36
42
 
37
43
  if IS_PYDANTIC_V2:
@@ -3,6 +3,7 @@
3
3
  from ...core.pydantic_utilities import UniversalBaseModel
4
4
  import pydantic
5
5
  import typing
6
+ from .transcription_word import TranscriptionWord
6
7
  from ...core.pydantic_utilities import IS_PYDANTIC_V2
7
8
 
8
9
 
@@ -14,7 +15,7 @@ class TranscriptionResponse(UniversalBaseModel):
14
15
 
15
16
  language: typing.Optional[str] = pydantic.Field(default=None)
16
17
  """
17
- The detected or specified language of the input audio.
18
+ The specified language of the input audio.
18
19
  """
19
20
 
20
21
  duration: typing.Optional[float] = pydantic.Field(default=None)
@@ -22,6 +23,11 @@ class TranscriptionResponse(UniversalBaseModel):
22
23
  The duration of the input audio in seconds.
23
24
  """
24
25
 
26
+ words: typing.Optional[typing.List[TranscriptionWord]] = pydantic.Field(default=None)
27
+ """
28
+ Word-level timestamps showing the start and end time of each word. Only included when `[word]` is passed into `timestamp_granularities[]`.
29
+ """
30
+
25
31
  if IS_PYDANTIC_V2:
26
32
  model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2
27
33
  else:
@@ -0,0 +1,32 @@
1
+ # This file was auto-generated by Fern from our API Definition.
2
+
3
+ from ...core.pydantic_utilities import UniversalBaseModel
4
+ import pydantic
5
+ from ...core.pydantic_utilities import IS_PYDANTIC_V2
6
+ import typing
7
+
8
+
9
+ class TranscriptionWord(UniversalBaseModel):
10
+ word: str = pydantic.Field()
11
+ """
12
+ The transcribed word.
13
+ """
14
+
15
+ start: float = pydantic.Field()
16
+ """
17
+ Start time of the word in seconds.
18
+ """
19
+
20
+ end: float = pydantic.Field()
21
+ """
22
+ End time of the word in seconds.
23
+ """
24
+
25
+ if IS_PYDANTIC_V2:
26
+ model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2
27
+ else:
28
+
29
+ class Config:
30
+ frozen = True
31
+ smart_union = True
32
+ extra = pydantic.Extra.allow
cartesia/tts/__init__.py CHANGED
@@ -19,11 +19,13 @@ from .types import (
19
19
  RawEncoding,
20
20
  RawOutputFormat,
21
21
  Speed,
22
+ SseOutputFormat,
22
23
  SupportedLanguage,
23
24
  TtsRequest,
24
25
  TtsRequestEmbeddingSpecifier,
25
26
  TtsRequestIdSpecifier,
26
27
  TtsRequestVoiceSpecifier,
28
+ TtssseRequest,
27
29
  WavOutputFormat,
28
30
  WebSocketBaseResponse,
29
31
  WebSocketChunkResponse,
@@ -58,10 +60,12 @@ from .requests import (
58
60
  PhonemeTimestampsParams,
59
61
  RawOutputFormatParams,
60
62
  SpeedParams,
63
+ SseOutputFormatParams,
61
64
  TtsRequestEmbeddingSpecifierParams,
62
65
  TtsRequestIdSpecifierParams,
63
66
  TtsRequestParams,
64
67
  TtsRequestVoiceSpecifierParams,
68
+ TtssseRequestParams,
65
69
  WavOutputFormatParams,
66
70
  WebSocketBaseResponseParams,
67
71
  WebSocketChunkResponseParams,
@@ -115,6 +119,8 @@ __all__ = [
115
119
  "RawOutputFormatParams",
116
120
  "Speed",
117
121
  "SpeedParams",
122
+ "SseOutputFormat",
123
+ "SseOutputFormatParams",
118
124
  "SupportedLanguage",
119
125
  "TtsRequest",
120
126
  "TtsRequestEmbeddingSpecifier",
@@ -124,6 +130,8 @@ __all__ = [
124
130
  "TtsRequestParams",
125
131
  "TtsRequestVoiceSpecifier",
126
132
  "TtsRequestVoiceSpecifierParams",
133
+ "TtssseRequest",
134
+ "TtssseRequestParams",
127
135
  "WavOutputFormat",
128
136
  "WavOutputFormatParams",
129
137
  "WebSocketBaseResponse",
cartesia/tts/client.py CHANGED
@@ -10,6 +10,8 @@ from ..core.request_options import RequestOptions
10
10
  from ..core.serialization import convert_and_respect_annotation_metadata
11
11
  from json.decoder import JSONDecodeError
12
12
  from ..core.api_error import ApiError
13
+ from .requests.sse_output_format import SseOutputFormatParams
14
+ from .types.context_id import ContextId
13
15
  from .types.web_socket_response import WebSocketResponse
14
16
  import httpx_sse
15
17
  from ..core.pydantic_utilities import parse_obj_as
@@ -119,10 +121,14 @@ class TtsClient:
119
121
  model_id: str,
120
122
  transcript: str,
121
123
  voice: TtsRequestVoiceSpecifierParams,
122
- output_format: OutputFormatParams,
124
+ output_format: SseOutputFormatParams,
123
125
  language: typing.Optional[SupportedLanguage] = OMIT,
124
126
  duration: typing.Optional[float] = OMIT,
125
127
  speed: typing.Optional[ModelSpeed] = OMIT,
128
+ add_timestamps: typing.Optional[bool] = OMIT,
129
+ add_phoneme_timestamps: typing.Optional[bool] = OMIT,
130
+ use_normalized_timestamps: typing.Optional[bool] = OMIT,
131
+ context_id: typing.Optional[ContextId] = OMIT,
126
132
  request_options: typing.Optional[RequestOptions] = None,
127
133
  ) -> typing.Iterator[WebSocketResponse]:
128
134
  """
@@ -135,7 +141,7 @@ class TtsClient:
135
141
 
136
142
  voice : TtsRequestVoiceSpecifierParams
137
143
 
138
- output_format : OutputFormatParams
144
+ output_format : SseOutputFormatParams
139
145
 
140
146
  language : typing.Optional[SupportedLanguage]
141
147
 
@@ -145,6 +151,18 @@ class TtsClient:
145
151
 
146
152
  speed : typing.Optional[ModelSpeed]
147
153
 
154
+ add_timestamps : typing.Optional[bool]
155
+ Whether to return word-level timestamps. If `false` (default), no word timestamps will be produced at all. If `true`, the server will return timestamp events containing word-level timing information.
156
+
157
+ add_phoneme_timestamps : typing.Optional[bool]
158
+ Whether to return phoneme-level timestamps. If `false` (default), no phoneme timestamps will be produced - if `add_timestamps` is `true`, the produced timestamps will be word timestamps instead. If `true`, the server will return timestamp events containing phoneme-level timing information.
159
+
160
+ use_normalized_timestamps : typing.Optional[bool]
161
+ Whether to use normalized timestamps (True) or original timestamps (False).
162
+
163
+ context_id : typing.Optional[ContextId]
164
+ Optional context ID for this request.
165
+
148
166
  request_options : typing.Optional[RequestOptions]
149
167
  Request-specific configuration.
150
168
 
@@ -165,9 +183,9 @@ class TtsClient:
165
183
  voice={"mode": "id", "id": "694f9389-aac1-45b6-b726-9d9369183238"},
166
184
  language="en",
167
185
  output_format={
186
+ "container": "raw",
168
187
  "sample_rate": 44100,
169
188
  "encoding": "pcm_f32le",
170
- "container": "raw",
171
189
  },
172
190
  )
173
191
  for chunk in response:
@@ -184,10 +202,14 @@ class TtsClient:
184
202
  ),
185
203
  "language": language,
186
204
  "output_format": convert_and_respect_annotation_metadata(
187
- object_=output_format, annotation=OutputFormatParams, direction="write"
205
+ object_=output_format, annotation=SseOutputFormatParams, direction="write"
188
206
  ),
189
207
  "duration": duration,
190
208
  "speed": speed,
209
+ "add_timestamps": add_timestamps,
210
+ "add_phoneme_timestamps": add_phoneme_timestamps,
211
+ "use_normalized_timestamps": use_normalized_timestamps,
212
+ "context_id": context_id,
191
213
  },
192
214
  request_options=request_options,
193
215
  omit=OMIT,
@@ -321,10 +343,14 @@ class AsyncTtsClient:
321
343
  model_id: str,
322
344
  transcript: str,
323
345
  voice: TtsRequestVoiceSpecifierParams,
324
- output_format: OutputFormatParams,
346
+ output_format: SseOutputFormatParams,
325
347
  language: typing.Optional[SupportedLanguage] = OMIT,
326
348
  duration: typing.Optional[float] = OMIT,
327
349
  speed: typing.Optional[ModelSpeed] = OMIT,
350
+ add_timestamps: typing.Optional[bool] = OMIT,
351
+ add_phoneme_timestamps: typing.Optional[bool] = OMIT,
352
+ use_normalized_timestamps: typing.Optional[bool] = OMIT,
353
+ context_id: typing.Optional[ContextId] = OMIT,
328
354
  request_options: typing.Optional[RequestOptions] = None,
329
355
  ) -> typing.AsyncIterator[WebSocketResponse]:
330
356
  """
@@ -337,7 +363,7 @@ class AsyncTtsClient:
337
363
 
338
364
  voice : TtsRequestVoiceSpecifierParams
339
365
 
340
- output_format : OutputFormatParams
366
+ output_format : SseOutputFormatParams
341
367
 
342
368
  language : typing.Optional[SupportedLanguage]
343
369
 
@@ -347,6 +373,18 @@ class AsyncTtsClient:
347
373
 
348
374
  speed : typing.Optional[ModelSpeed]
349
375
 
376
+ add_timestamps : typing.Optional[bool]
377
+ Whether to return word-level timestamps. If `false` (default), no word timestamps will be produced at all. If `true`, the server will return timestamp events containing word-level timing information.
378
+
379
+ add_phoneme_timestamps : typing.Optional[bool]
380
+ Whether to return phoneme-level timestamps. If `false` (default), no phoneme timestamps will be produced - if `add_timestamps` is `true`, the produced timestamps will be word timestamps instead. If `true`, the server will return timestamp events containing phoneme-level timing information.
381
+
382
+ use_normalized_timestamps : typing.Optional[bool]
383
+ Whether to use normalized timestamps (True) or original timestamps (False).
384
+
385
+ context_id : typing.Optional[ContextId]
386
+ Optional context ID for this request.
387
+
350
388
  request_options : typing.Optional[RequestOptions]
351
389
  Request-specific configuration.
352
390
 
@@ -372,9 +410,9 @@ class AsyncTtsClient:
372
410
  voice={"mode": "id", "id": "694f9389-aac1-45b6-b726-9d9369183238"},
373
411
  language="en",
374
412
  output_format={
413
+ "container": "raw",
375
414
  "sample_rate": 44100,
376
415
  "encoding": "pcm_f32le",
377
- "container": "raw",
378
416
  },
379
417
  )
380
418
  async for chunk in response:
@@ -394,10 +432,14 @@ class AsyncTtsClient:
394
432
  ),
395
433
  "language": language,
396
434
  "output_format": convert_and_respect_annotation_metadata(
397
- object_=output_format, annotation=OutputFormatParams, direction="write"
435
+ object_=output_format, annotation=SseOutputFormatParams, direction="write"
398
436
  ),
399
437
  "duration": duration,
400
438
  "speed": speed,
439
+ "add_timestamps": add_timestamps,
440
+ "add_phoneme_timestamps": add_phoneme_timestamps,
441
+ "use_normalized_timestamps": use_normalized_timestamps,
442
+ "context_id": context_id,
401
443
  },
402
444
  request_options=request_options,
403
445
  omit=OMIT,
@@ -8,10 +8,12 @@ from .output_format import OutputFormatParams, OutputFormat_Mp3Params, OutputFor
8
8
  from .phoneme_timestamps import PhonemeTimestampsParams
9
9
  from .raw_output_format import RawOutputFormatParams
10
10
  from .speed import SpeedParams
11
+ from .sse_output_format import SseOutputFormatParams
11
12
  from .tts_request import TtsRequestParams
12
13
  from .tts_request_embedding_specifier import TtsRequestEmbeddingSpecifierParams
13
14
  from .tts_request_id_specifier import TtsRequestIdSpecifierParams
14
15
  from .tts_request_voice_specifier import TtsRequestVoiceSpecifierParams
16
+ from .ttssse_request import TtssseRequestParams
15
17
  from .wav_output_format import WavOutputFormatParams
16
18
  from .web_socket_base_response import WebSocketBaseResponseParams
17
19
  from .web_socket_chunk_response import WebSocketChunkResponseParams
@@ -48,10 +50,12 @@ __all__ = [
48
50
  "PhonemeTimestampsParams",
49
51
  "RawOutputFormatParams",
50
52
  "SpeedParams",
53
+ "SseOutputFormatParams",
51
54
  "TtsRequestEmbeddingSpecifierParams",
52
55
  "TtsRequestIdSpecifierParams",
53
56
  "TtsRequestParams",
54
57
  "TtsRequestVoiceSpecifierParams",
58
+ "TtssseRequestParams",
55
59
  "WavOutputFormatParams",
56
60
  "WebSocketBaseResponseParams",
57
61
  "WebSocketChunkResponseParams",
@@ -55,15 +55,15 @@ class GenerationRequestParams(typing_extensions.TypedDict):
55
55
 
56
56
  add_timestamps: typing_extensions.NotRequired[bool]
57
57
  """
58
- Whether to return word-level timestamps.
58
+ Whether to return word-level timestamps. If `false` (default), no word timestamps will be produced at all. If `true`, the server will return timestamp events containing word-level timing information.
59
59
  """
60
60
 
61
61
  add_phoneme_timestamps: typing_extensions.NotRequired[bool]
62
62
  """
63
- Whether to return phoneme-level timestamps.
63
+ Whether to return phoneme-level timestamps. If `false` (default), no phoneme timestamps will be produced. If `true`, the server will return timestamp events containing phoneme-level timing information.
64
64
  """
65
65
 
66
- use_original_timestamps: typing_extensions.NotRequired[bool]
66
+ use_normalized_timestamps: typing_extensions.NotRequired[bool]
67
67
  """
68
- Whether to use the original transcript for timestamps.
68
+ Whether to use normalized timestamps (True) or original timestamps (False).
69
69
  """
@@ -0,0 +1,11 @@
1
+ # This file was auto-generated by Fern from our API Definition.
2
+
3
+ import typing_extensions
4
+ import typing
5
+ from ..types.raw_encoding import RawEncoding
6
+
7
+
8
+ class SseOutputFormatParams(typing_extensions.TypedDict):
9
+ container: typing.Literal["raw"]
10
+ encoding: RawEncoding
11
+ sample_rate: int
@@ -0,0 +1,47 @@
1
+ # This file was auto-generated by Fern from our API Definition.
2
+
3
+ import typing_extensions
4
+ from .tts_request_voice_specifier import TtsRequestVoiceSpecifierParams
5
+ import typing_extensions
6
+ from ..types.supported_language import SupportedLanguage
7
+ from .sse_output_format import SseOutputFormatParams
8
+ from ..types.model_speed import ModelSpeed
9
+ from ..types.context_id import ContextId
10
+
11
+
12
+ class TtssseRequestParams(typing_extensions.TypedDict):
13
+ model_id: str
14
+ """
15
+ The ID of the model to use for the generation. See [Models](/build-with-cartesia/models) for available models.
16
+ """
17
+
18
+ transcript: str
19
+ voice: TtsRequestVoiceSpecifierParams
20
+ language: typing_extensions.NotRequired[SupportedLanguage]
21
+ output_format: SseOutputFormatParams
22
+ duration: typing_extensions.NotRequired[float]
23
+ """
24
+ The maximum duration of the audio in seconds. You do not usually need to specify this.
25
+ If the duration is not appropriate for the length of the transcript, the output audio may be truncated.
26
+ """
27
+
28
+ speed: typing_extensions.NotRequired[ModelSpeed]
29
+ add_timestamps: typing_extensions.NotRequired[bool]
30
+ """
31
+ Whether to return word-level timestamps. If `false` (default), no word timestamps will be produced at all. If `true`, the server will return timestamp events containing word-level timing information.
32
+ """
33
+
34
+ add_phoneme_timestamps: typing_extensions.NotRequired[bool]
35
+ """
36
+ Whether to return phoneme-level timestamps. If `false` (default), no phoneme timestamps will be produced - if `add_timestamps` is `true`, the produced timestamps will be word timestamps instead. If `true`, the server will return timestamp events containing phoneme-level timing information.
37
+ """
38
+
39
+ use_normalized_timestamps: typing_extensions.NotRequired[bool]
40
+ """
41
+ Whether to use normalized timestamps (True) or original timestamps (False).
42
+ """
43
+
44
+ context_id: typing_extensions.NotRequired[ContextId]
45
+ """
46
+ Optional context ID for this request.
47
+ """
@@ -1,11 +1,8 @@
1
1
  # This file was auto-generated by Fern from our API Definition.
2
2
 
3
3
  from .web_socket_base_response import WebSocketBaseResponseParams
4
- import typing_extensions
5
- from ..types.flush_id import FlushId
6
4
 
7
5
 
8
6
  class WebSocketChunkResponseParams(WebSocketBaseResponseParams):
9
7
  data: str
10
8
  step_time: float
11
- flush_id: typing_extensions.NotRequired[FlushId]