cartesia 2.0.5__py3-none-any.whl → 2.0.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55)
  1. cartesia/__init__.py +22 -0
  2. cartesia/auth/client.py +8 -8
  3. cartesia/auth/requests/token_grant.py +7 -1
  4. cartesia/auth/requests/token_request.py +3 -3
  5. cartesia/auth/types/token_grant.py +7 -2
  6. cartesia/auth/types/token_request.py +3 -3
  7. cartesia/core/client_wrapper.py +1 -1
  8. cartesia/infill/client.py +0 -8
  9. cartesia/stt/__init__.py +6 -0
  10. cartesia/stt/_async_websocket.py +81 -72
  11. cartesia/stt/_websocket.py +42 -20
  12. cartesia/stt/client.py +450 -0
  13. cartesia/stt/requests/__init__.py +2 -0
  14. cartesia/stt/requests/streaming_transcription_response.py +2 -0
  15. cartesia/stt/requests/transcript_message.py +8 -1
  16. cartesia/stt/requests/transcription_response.py +8 -1
  17. cartesia/stt/requests/transcription_word.py +20 -0
  18. cartesia/stt/socket_client.py +52 -109
  19. cartesia/stt/types/__init__.py +4 -0
  20. cartesia/stt/types/streaming_transcription_response.py +2 -0
  21. cartesia/stt/types/stt_encoding.py +3 -1
  22. cartesia/stt/types/timestamp_granularity.py +5 -0
  23. cartesia/stt/types/transcript_message.py +7 -1
  24. cartesia/stt/types/transcription_response.py +7 -1
  25. cartesia/stt/types/transcription_word.py +32 -0
  26. cartesia/tts/__init__.py +16 -0
  27. cartesia/tts/client.py +63 -8
  28. cartesia/tts/requests/__init__.py +8 -0
  29. cartesia/tts/requests/experimental_model_controls.py +17 -0
  30. cartesia/tts/requests/generation_config.py +23 -0
  31. cartesia/tts/requests/generation_request.py +4 -4
  32. cartesia/tts/requests/sse_output_format.py +11 -0
  33. cartesia/tts/requests/tts_request.py +2 -0
  34. cartesia/tts/requests/ttssse_request.py +47 -0
  35. cartesia/tts/requests/web_socket_chunk_response.py +0 -3
  36. cartesia/tts/requests/web_socket_response.py +1 -2
  37. cartesia/tts/requests/web_socket_tts_request.py +9 -1
  38. cartesia/tts/types/__init__.py +8 -0
  39. cartesia/tts/types/experimental_model_controls.py +28 -0
  40. cartesia/tts/types/generation_config.py +34 -0
  41. cartesia/tts/types/generation_request.py +4 -4
  42. cartesia/tts/types/sse_output_format.py +22 -0
  43. cartesia/tts/types/tts_request.py +2 -0
  44. cartesia/tts/types/ttssse_request.py +58 -0
  45. cartesia/tts/types/web_socket_chunk_response.py +1 -3
  46. cartesia/tts/types/web_socket_response.py +1 -2
  47. cartesia/tts/types/web_socket_tts_request.py +11 -3
  48. cartesia/voice_changer/client.py +0 -8
  49. cartesia/voice_changer/requests/streaming_response.py +0 -2
  50. cartesia/voice_changer/types/streaming_response.py +0 -2
  51. cartesia/voices/client.py +0 -12
  52. cartesia-2.0.7.dist-info/LICENSE +201 -0
  53. {cartesia-2.0.5.dist-info → cartesia-2.0.7.dist-info}/METADATA +116 -17
  54. {cartesia-2.0.5.dist-info → cartesia-2.0.7.dist-info}/RECORD +55 -42
  55. {cartesia-2.0.5.dist-info → cartesia-2.0.7.dist-info}/WHEEL +1 -1
cartesia/stt/requests/transcription_word.py ADDED
@@ -0,0 +1,20 @@
+# This file was auto-generated by Fern from our API Definition.
+
+import typing_extensions
+
+
+class TranscriptionWordParams(typing_extensions.TypedDict):
+    word: str
+    """
+    The transcribed word.
+    """
+
+    start: float
+    """
+    Start time of the word in seconds.
+    """
+
+    end: float
+    """
+    End time of the word in seconds.
+    """
cartesia/stt/socket_client.py CHANGED
@@ -1,18 +1,20 @@
 import typing
-from typing import Any, Dict, Generator, Optional, Union
+from typing import Any, Dict, Generator, Optional
 
 from ..core.client_wrapper import AsyncClientWrapper, SyncClientWrapper
 from ._async_websocket import AsyncSttWebsocket
 from ._websocket import SttWebsocket
+from .client import AsyncSttClient, SttClient
+from .types.stt_encoding import SttEncoding
 
 
-class SttClientWithWebsocket:
+class SttClientWithWebsocket(SttClient):
     """
     Extension of STT functionality that supports a synchronous WebSocket STT connection.
     """
 
     def __init__(self, *, client_wrapper: SyncClientWrapper):
-        self._client_wrapper = client_wrapper
+        super().__init__(client_wrapper=client_wrapper)
 
     def _ws_url(self):
         base_url = self._client_wrapper.get_base_url()
@@ -23,21 +25,34 @@ class SttClientWithWebsocket:
         base_url_without_protocol = base_url.split("://")[-1]
         return f"{prefix}://{base_url_without_protocol}"
 
-    def websocket(self, *,
-                  model: str = "ink-whisper",
-                  language: Optional[str] = "en",
-                  encoding: Optional[str] = "pcm_s16le",
-                  sample_rate: int = 16000):
+    def websocket(
+        self,
+        *,
+        model: str = "ink-whisper",
+        language: Optional[str] = "en",
+        encoding: SttEncoding = "pcm_s16le",
+        sample_rate: int = 16000,
+        min_volume: Optional[float] = None,
+        max_silence_duration_secs: Optional[float] = None,
+    ):
         """Create a WebSocket connection for real-time speech transcription.
 
         Args:
             model: ID of the model to use for transcription
             language: The language of the input audio in ISO-639-1 format
-            encoding: The encoding format of the audio data
-            sample_rate: The sample rate of the audio in Hz
+            encoding: The encoding format of the audio data (required)
+            sample_rate: The sample rate of the audio in Hz (required)
+            min_volume: Volume threshold for voice activity detection (0.0-1.0)
+            max_silence_duration_secs: Maximum duration of silence before endpointing
 
         Returns:
             SttWebsocket: A connected WebSocket client for STT operations.
+
+        Example:
+            >>> client = Cartesia(api_key="your-api-key")
+            >>> ws = client.stt.websocket()
+            >>> for result in ws.transcribe(audio_chunks):
+            ...     print(result["text"])
         """
         client_headers = self._client_wrapper.get_headers()
         ws = SttWebsocket(
@@ -51,61 +66,19 @@
             language=language,
             encoding=encoding,
             sample_rate=sample_rate,
+            min_volume=min_volume,
+            max_silence_duration_secs=max_silence_duration_secs,
         )
         return ws
 
-    def transcribe(
-        self,
-        audio_chunks: typing.Iterator[bytes],
-        *,
-        model: str = "ink-whisper",
-        language: Optional[str] = "en",
-        encoding: Optional[str] = "pcm_s16le",
-        sample_rate: int = 16000,
-    ) -> Generator[Dict[str, Any], None, None]:
-        """Transcribe audio chunks using WebSocket.
 
-        Args:
-            audio_chunks: Iterator of audio chunks as bytes
-            model: ID of the model to use for transcription
-            language: The language of the input audio in ISO-639-1 format
-            encoding: The encoding format of the audio data
-            sample_rate: The sample rate of the audio in Hz
-
-        Yields:
-            Dictionary containing transcription results, flush_done, done, or error messages
-
-        Example:
-            >>> client = Cartesia(api_key="your-api-key")
-            >>> ws_client = client.stt.websocket()
-            >>> for result in ws_client.transcribe(audio_chunks):
-            ...     print(result["text"])
-        """
-        ws = self.websocket(
-            model=model,
-            language=language,
-            encoding=encoding,
-            sample_rate=sample_rate,
-        )
-        try:
-            yield from ws.transcribe(
-                audio_chunks,
-                model=model,
-                language=language,
-                encoding=encoding,
-                sample_rate=sample_rate,
-            )
-        finally:
-            ws.close()
-
-
-class AsyncSttClientWithWebsocket:
+class AsyncSttClientWithWebsocket(AsyncSttClient):
     """
     Extension of STT functionality that supports an asynchronous WebSocket STT connection.
     """
 
     def __init__(self, *, client_wrapper: AsyncClientWrapper, get_session):
-        self._client_wrapper = client_wrapper
+        super().__init__(client_wrapper=client_wrapper)
         self._get_session = get_session
 
     def _ws_url(self) -> str:
@@ -117,21 +90,34 @@ class AsyncSttClientWithWebsocket:
         base_url_without_protocol = base_url.split("://")[-1]
         return f"{prefix}://{base_url_without_protocol}"
 
-    async def websocket(self, *,
-                        model: str = "ink-whisper",
-                        language: Optional[str] = "en",
-                        encoding: Optional[str] = "pcm_s16le",
-                        sample_rate: int = 16000):
+    async def websocket(
+        self,
+        *,
+        model: str = "ink-whisper",
+        language: Optional[str] = "en",
+        encoding: SttEncoding = "pcm_s16le",
+        sample_rate: int = 16000,
+        min_volume: Optional[float] = None,
+        max_silence_duration_secs: Optional[float] = None,
+    ):
         """Create an async WebSocket connection for real-time speech transcription.
 
         Args:
             model: ID of the model to use for transcription
             language: The language of the input audio in ISO-639-1 format
-            encoding: The encoding format of the audio data
-            sample_rate: The sample rate of the audio in Hz
+            encoding: The encoding format of the audio data (required)
+            sample_rate: The sample rate of the audio in Hz (required)
+            min_volume: Volume threshold for voice activity detection (0.0-1.0)
+            max_silence_duration_secs: Maximum duration of silence before endpointing
 
         Returns:
             AsyncSttWebsocket: A connected async WebSocket client for STT operations.
+
+        Example:
+            >>> client = AsyncCartesia(api_key="your-api-key")
+            >>> ws = await client.stt.websocket()
+            >>> async for result in ws.transcribe(audio_chunks):
+            ...     print(result["text"])
         """
         client_headers = self._client_wrapper.get_headers()
         ws = AsyncSttWebsocket(
@@ -146,50 +132,7 @@
             language=language,
             encoding=encoding,
             sample_rate=sample_rate,
+            min_volume=min_volume,
+            max_silence_duration_secs=max_silence_duration_secs,
         )
-        return ws
-
-    async def transcribe(
-        self,
-        audio_chunks: typing.AsyncIterator[bytes],
-        *,
-        model: str = "ink-whisper",
-        language: Optional[str] = "en",
-        encoding: Optional[str] = "pcm_s16le",
-        sample_rate: int = 16000,
-    ) -> typing.AsyncGenerator[Dict[str, Any], None]:
-        """Transcribe audio chunks using async WebSocket.
-
-        Args:
-            audio_chunks: Async iterator of audio chunks as bytes
-            model: ID of the model to use for transcription
-            language: The language of the input audio in ISO-639-1 format
-            encoding: The encoding format of the audio data
-            sample_rate: The sample rate of the audio in Hz
-
-        Yields:
-            Dictionary containing transcription results, flush_done, done, or error messages
-
-        Example:
-            >>> client = AsyncCartesia(api_key="your-api-key")
-            >>> ws_client = await client.stt.websocket()
-            >>> async for result in ws_client.transcribe(audio_chunks):
-            ...     print(result["text"])
-        """
-        ws = await self.websocket(
-            model=model,
-            language=language,
-            encoding=encoding,
-            sample_rate=sample_rate,
-        )
-        try:
-            async for result in ws.transcribe(
-                audio_chunks,
-                model=model,
-                language=language,
-                encoding=encoding,
-                sample_rate=sample_rate,
-            ):
-                yield result
-        finally:
-            await ws.close()
+        return ws
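Net effect of the socket_client.py changes: the standalone transcribe() helpers move off the client classes (they remain on the websocket objects), both classes now inherit from the plain REST clients, and two endpointing knobs are exposed. A minimal usage sketch of the resulting call pattern, assuming the package's top-level Cartesia export and a local raw-PCM file (both illustrative):

from cartesia import Cartesia

client = Cartesia(api_key="your-api-key")

# New in 2.0.7: min_volume and max_silence_duration_secs tune voice
# activity detection and endpointing; encoding is now typed as SttEncoding.
ws = client.stt.websocket(
    model="ink-whisper",
    language="en",
    encoding="pcm_s16le",
    sample_rate=16000,
    min_volume=0.15,                # VAD volume threshold (0.0-1.0)
    max_silence_duration_secs=2.0,  # end an utterance after 2s of silence
)

def audio_chunks():
    # Illustrative source of raw PCM bytes; substitute a real stream.
    with open("audio.raw", "rb") as f:
        while chunk := f.read(3200):  # ~100ms of 16kHz s16le audio
            yield chunk

# transcribe() now lives only on the websocket object, not on the client.
for result in ws.transcribe(audio_chunks()):
    print(result["text"])
ws.close()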
cartesia/stt/types/__init__.py CHANGED
@@ -11,8 +11,10 @@ from .streaming_transcription_response import (
     StreamingTranscriptionResponse_Transcript,
 )
 from .stt_encoding import SttEncoding
+from .timestamp_granularity import TimestampGranularity
 from .transcript_message import TranscriptMessage
 from .transcription_response import TranscriptionResponse
+from .transcription_word import TranscriptionWord
 
 __all__ = [
     "DoneMessage",
@@ -24,6 +26,8 @@ __all__ = [
     "StreamingTranscriptionResponse_FlushDone",
     "StreamingTranscriptionResponse_Transcript",
     "SttEncoding",
+    "TimestampGranularity",
     "TranscriptMessage",
     "TranscriptionResponse",
+    "TranscriptionWord",
 ]
cartesia/stt/types/streaming_transcription_response.py CHANGED
@@ -3,6 +3,7 @@
 from __future__ import annotations
 from ...core.pydantic_utilities import UniversalBaseModel
 import typing
+from .transcription_word import TranscriptionWord
 from ...core.pydantic_utilities import IS_PYDANTIC_V2
 import pydantic
 
@@ -18,6 +19,7 @@ class StreamingTranscriptionResponse_Transcript(UniversalBaseModel):
     is_final: bool
     duration: typing.Optional[float] = None
     language: typing.Optional[str] = None
+    words: typing.Optional[typing.List[TranscriptionWord]] = None
 
     if IS_PYDANTIC_V2:
         model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True)  # type: ignore # Pydantic v2
cartesia/stt/types/stt_encoding.py CHANGED
@@ -2,4 +2,6 @@
 
 import typing
 
-SttEncoding = typing.Union[typing.Literal["pcm_s16le"], typing.Any]
+SttEncoding = typing.Union[
+    typing.Literal["pcm_s16le", "pcm_s32le", "pcm_f16le", "pcm_f32le", "pcm_mulaw", "pcm_alaw"], typing.Any
+]
cartesia/stt/types/timestamp_granularity.py ADDED
@@ -0,0 +1,5 @@
+# This file was auto-generated by Fern from our API Definition.
+
+import typing
+
+TimestampGranularity = typing.Union[typing.Literal["word"], typing.Any]
cartesia/stt/types/transcript_message.py CHANGED
@@ -3,6 +3,7 @@
 from ...core.pydantic_utilities import UniversalBaseModel
 import pydantic
 import typing
+from .transcription_word import TranscriptionWord
 from ...core.pydantic_utilities import IS_PYDANTIC_V2
 
 
@@ -31,7 +32,12 @@ class TranscriptMessage(UniversalBaseModel):
 
     language: typing.Optional[str] = pydantic.Field(default=None)
     """
-    The detected or specified language of the input audio.
+    The specified language of the input audio.
+    """
+
+    words: typing.Optional[typing.List[TranscriptionWord]] = pydantic.Field(default=None)
+    """
+    Word-level timestamps showing the start and end time of each word in seconds. Always included in streaming responses.
     """
 
     if IS_PYDANTIC_V2:
cartesia/stt/types/transcription_response.py CHANGED
@@ -3,6 +3,7 @@
 from ...core.pydantic_utilities import UniversalBaseModel
 import pydantic
 import typing
+from .transcription_word import TranscriptionWord
 from ...core.pydantic_utilities import IS_PYDANTIC_V2
 
 
@@ -14,7 +15,7 @@ class TranscriptionResponse(UniversalBaseModel):
 
     language: typing.Optional[str] = pydantic.Field(default=None)
     """
-    The detected or specified language of the input audio.
+    The specified language of the input audio.
     """
 
     duration: typing.Optional[float] = pydantic.Field(default=None)
@@ -22,6 +23,11 @@
     The duration of the input audio in seconds.
     """
 
+    words: typing.Optional[typing.List[TranscriptionWord]] = pydantic.Field(default=None)
+    """
+    Word-level timestamps showing the start and end time of each word. Only included when `[word]` is passed into `timestamp_granularities[]`.
+    """
+
     if IS_PYDANTIC_V2:
         model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True)  # type: ignore # Pydantic v2
     else:
cartesia/stt/types/transcription_word.py ADDED
@@ -0,0 +1,32 @@
+# This file was auto-generated by Fern from our API Definition.
+
+from ...core.pydantic_utilities import UniversalBaseModel
+import pydantic
+from ...core.pydantic_utilities import IS_PYDANTIC_V2
+import typing
+
+
+class TranscriptionWord(UniversalBaseModel):
+    word: str = pydantic.Field()
+    """
+    The transcribed word.
+    """
+
+    start: float = pydantic.Field()
+    """
+    Start time of the word in seconds.
+    """
+
+    end: float = pydantic.Field()
+    """
+    End time of the word in seconds.
+    """
+
+    if IS_PYDANTIC_V2:
+        model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True)  # type: ignore # Pydantic v2
+    else:
+
+        class Config:
+            frozen = True
+            smart_union = True
+            extra = pydantic.Extra.allow
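The TranscriptionWord model above is re-exported from cartesia.stt.types (see the types/__init__.py hunk) and populates the new `words` arrays on TranscriptMessage and TranscriptionResponse. A small sketch of consuming it, with the streaming-message dict shape assumed from the docstrings above:

from cartesia.stt.types import TranscriptionWord

# Validate a single entry as it would appear in a `words` array.
w = TranscriptionWord(word="hello", start=0.12, end=0.48)
print(f"{w.word}: {w.start:.2f}s -> {w.end:.2f}s")

def print_word_timings(message: dict) -> None:
    # Streaming transcript messages carry `words` per the TranscriptMessage
    # docstring; the lookup is still guarded for non-transcript messages.
    for entry in message.get("words") or []:
        print(f"{entry['word']}: {entry['start']:.2f}s -> {entry['end']:.2f}s")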
cartesia/tts/__init__.py CHANGED
@@ -5,7 +5,9 @@ from .types import (
     ContextId,
     Controls,
     Emotion,
+    ExperimentalModelControls,
     FlushId,
+    GenerationConfig,
     GenerationRequest,
     ModelSpeed,
     Mp3OutputFormat,
@@ -19,11 +21,13 @@ from .types import (
     RawEncoding,
     RawOutputFormat,
     Speed,
+    SseOutputFormat,
     SupportedLanguage,
     TtsRequest,
     TtsRequestEmbeddingSpecifier,
     TtsRequestIdSpecifier,
     TtsRequestVoiceSpecifier,
+    TtssseRequest,
     WavOutputFormat,
     WebSocketBaseResponse,
     WebSocketChunkResponse,
@@ -49,6 +53,8 @@ from .types import (
 from .requests import (
     CancelContextRequestParams,
     ControlsParams,
+    ExperimentalModelControlsParams,
+    GenerationConfigParams,
     GenerationRequestParams,
     Mp3OutputFormatParams,
     OutputFormatParams,
@@ -58,10 +64,12 @@ from .requests import (
     PhonemeTimestampsParams,
     RawOutputFormatParams,
     SpeedParams,
+    SseOutputFormatParams,
     TtsRequestEmbeddingSpecifierParams,
     TtsRequestIdSpecifierParams,
     TtsRequestParams,
     TtsRequestVoiceSpecifierParams,
+    TtssseRequestParams,
     WavOutputFormatParams,
     WebSocketBaseResponseParams,
     WebSocketChunkResponseParams,
@@ -92,7 +100,11 @@ __all__ = [
     "Controls",
     "ControlsParams",
     "Emotion",
+    "ExperimentalModelControls",
+    "ExperimentalModelControlsParams",
     "FlushId",
+    "GenerationConfig",
+    "GenerationConfigParams",
     "GenerationRequest",
     "GenerationRequestParams",
     "ModelSpeed",
@@ -115,6 +127,8 @@ __all__ = [
     "RawOutputFormatParams",
     "Speed",
     "SpeedParams",
+    "SseOutputFormat",
+    "SseOutputFormatParams",
     "SupportedLanguage",
     "TtsRequest",
     "TtsRequestEmbeddingSpecifier",
@@ -124,6 +138,8 @@ __all__ = [
     "TtsRequestParams",
     "TtsRequestVoiceSpecifier",
     "TtsRequestVoiceSpecifierParams",
+    "TtssseRequest",
+    "TtssseRequestParams",
     "WavOutputFormat",
     "WavOutputFormatParams",
     "WebSocketBaseResponse",
cartesia/tts/client.py CHANGED
@@ -6,10 +6,13 @@ from .requests.tts_request_voice_specifier import TtsRequestVoiceSpecifierParams
 from .requests.output_format import OutputFormatParams
 from .types.supported_language import SupportedLanguage
 from .types.model_speed import ModelSpeed
+from .requests.generation_config import GenerationConfigParams
 from ..core.request_options import RequestOptions
 from ..core.serialization import convert_and_respect_annotation_metadata
 from json.decoder import JSONDecodeError
 from ..core.api_error import ApiError
+from .requests.sse_output_format import SseOutputFormatParams
+from .types.context_id import ContextId
 from .types.web_socket_response import WebSocketResponse
 import httpx_sse
 from ..core.pydantic_utilities import parse_obj_as
@@ -34,6 +37,7 @@ class TtsClient:
         language: typing.Optional[SupportedLanguage] = OMIT,
         duration: typing.Optional[float] = OMIT,
         speed: typing.Optional[ModelSpeed] = OMIT,
+        generation_config: typing.Optional[GenerationConfigParams] = OMIT,
         request_options: typing.Optional[RequestOptions] = None,
     ) -> typing.Iterator[bytes]:
         """
@@ -56,6 +60,8 @@
 
         speed : typing.Optional[ModelSpeed]
 
+        generation_config : typing.Optional[GenerationConfigParams]
+
         request_options : typing.Optional[RequestOptions]
             Request-specific configuration. You can pass in configuration such as `chunk_size`, and more to customize the request and response.
 
@@ -97,6 +103,9 @@
                 ),
                 "duration": duration,
                 "speed": speed,
+                "generation_config": convert_and_respect_annotation_metadata(
+                    object_=generation_config, annotation=GenerationConfigParams, direction="write"
+                ),
             },
             request_options=request_options,
             omit=OMIT,
@@ -119,10 +128,14 @@
         model_id: str,
         transcript: str,
         voice: TtsRequestVoiceSpecifierParams,
-        output_format: OutputFormatParams,
+        output_format: SseOutputFormatParams,
         language: typing.Optional[SupportedLanguage] = OMIT,
         duration: typing.Optional[float] = OMIT,
         speed: typing.Optional[ModelSpeed] = OMIT,
+        add_timestamps: typing.Optional[bool] = OMIT,
+        add_phoneme_timestamps: typing.Optional[bool] = OMIT,
+        use_normalized_timestamps: typing.Optional[bool] = OMIT,
+        context_id: typing.Optional[ContextId] = OMIT,
         request_options: typing.Optional[RequestOptions] = None,
     ) -> typing.Iterator[WebSocketResponse]:
         """
@@ -135,7 +148,7 @@
 
         voice : TtsRequestVoiceSpecifierParams
 
-        output_format : OutputFormatParams
+        output_format : SseOutputFormatParams
 
         language : typing.Optional[SupportedLanguage]
 
@@ -145,6 +158,18 @@
 
         speed : typing.Optional[ModelSpeed]
 
+        add_timestamps : typing.Optional[bool]
+            Whether to return word-level timestamps. If `false` (default), no word timestamps will be produced at all. If `true`, the server will return timestamp events containing word-level timing information.
+
+        add_phoneme_timestamps : typing.Optional[bool]
+            Whether to return phoneme-level timestamps. If `false` (default), no phoneme timestamps will be produced - if `add_timestamps` is `true`, the produced timestamps will be word timestamps instead. If `true`, the server will return timestamp events containing phoneme-level timing information.
+
+        use_normalized_timestamps : typing.Optional[bool]
+            Whether to use normalized timestamps (True) or original timestamps (False).
+
+        context_id : typing.Optional[ContextId]
+            Optional context ID for this request.
+
         request_options : typing.Optional[RequestOptions]
             Request-specific configuration.
 
@@ -165,9 +190,9 @@
             voice={"mode": "id", "id": "694f9389-aac1-45b6-b726-9d9369183238"},
             language="en",
             output_format={
+                "container": "raw",
                 "sample_rate": 44100,
                 "encoding": "pcm_f32le",
-                "container": "raw",
             },
         )
         for chunk in response:
@@ -184,10 +209,14 @@
                 ),
                 "language": language,
                 "output_format": convert_and_respect_annotation_metadata(
-                    object_=output_format, annotation=OutputFormatParams, direction="write"
+                    object_=output_format, annotation=SseOutputFormatParams, direction="write"
                 ),
                 "duration": duration,
                 "speed": speed,
+                "add_timestamps": add_timestamps,
+                "add_phoneme_timestamps": add_phoneme_timestamps,
+                "use_normalized_timestamps": use_normalized_timestamps,
+                "context_id": context_id,
             },
             request_options=request_options,
             omit=OMIT,
@@ -228,6 +257,7 @@ class AsyncTtsClient:
         language: typing.Optional[SupportedLanguage] = OMIT,
         duration: typing.Optional[float] = OMIT,
         speed: typing.Optional[ModelSpeed] = OMIT,
+        generation_config: typing.Optional[GenerationConfigParams] = OMIT,
         request_options: typing.Optional[RequestOptions] = None,
     ) -> typing.AsyncIterator[bytes]:
         """
@@ -250,6 +280,8 @@
 
         speed : typing.Optional[ModelSpeed]
 
+        generation_config : typing.Optional[GenerationConfigParams]
+
         request_options : typing.Optional[RequestOptions]
             Request-specific configuration. You can pass in configuration such as `chunk_size`, and more to customize the request and response.
 
@@ -299,6 +331,9 @@
                 ),
                 "duration": duration,
                 "speed": speed,
+                "generation_config": convert_and_respect_annotation_metadata(
+                    object_=generation_config, annotation=GenerationConfigParams, direction="write"
+                ),
             },
             request_options=request_options,
             omit=OMIT,
@@ -321,10 +356,14 @@
         model_id: str,
         transcript: str,
         voice: TtsRequestVoiceSpecifierParams,
-        output_format: OutputFormatParams,
+        output_format: SseOutputFormatParams,
         language: typing.Optional[SupportedLanguage] = OMIT,
         duration: typing.Optional[float] = OMIT,
         speed: typing.Optional[ModelSpeed] = OMIT,
+        add_timestamps: typing.Optional[bool] = OMIT,
+        add_phoneme_timestamps: typing.Optional[bool] = OMIT,
+        use_normalized_timestamps: typing.Optional[bool] = OMIT,
+        context_id: typing.Optional[ContextId] = OMIT,
         request_options: typing.Optional[RequestOptions] = None,
     ) -> typing.AsyncIterator[WebSocketResponse]:
         """
@@ -337,7 +376,7 @@
 
         voice : TtsRequestVoiceSpecifierParams
 
-        output_format : OutputFormatParams
+        output_format : SseOutputFormatParams
 
         language : typing.Optional[SupportedLanguage]
 
@@ -347,6 +386,18 @@
 
         speed : typing.Optional[ModelSpeed]
 
+        add_timestamps : typing.Optional[bool]
+            Whether to return word-level timestamps. If `false` (default), no word timestamps will be produced at all. If `true`, the server will return timestamp events containing word-level timing information.
+
+        add_phoneme_timestamps : typing.Optional[bool]
+            Whether to return phoneme-level timestamps. If `false` (default), no phoneme timestamps will be produced - if `add_timestamps` is `true`, the produced timestamps will be word timestamps instead. If `true`, the server will return timestamp events containing phoneme-level timing information.
+
+        use_normalized_timestamps : typing.Optional[bool]
+            Whether to use normalized timestamps (True) or original timestamps (False).
+
+        context_id : typing.Optional[ContextId]
+            Optional context ID for this request.
+
         request_options : typing.Optional[RequestOptions]
             Request-specific configuration.
 
@@ -372,9 +423,9 @@
             voice={"mode": "id", "id": "694f9389-aac1-45b6-b726-9d9369183238"},
             language="en",
             output_format={
+                "container": "raw",
                 "sample_rate": 44100,
                 "encoding": "pcm_f32le",
-                "container": "raw",
             },
         )
         async for chunk in response:
@@ -394,10 +445,14 @@
                 ),
                 "language": language,
                 "output_format": convert_and_respect_annotation_metadata(
-                    object_=output_format, annotation=OutputFormatParams, direction="write"
+                    object_=output_format, annotation=SseOutputFormatParams, direction="write"
                 ),
                 "duration": duration,
                 "speed": speed,
+                "add_timestamps": add_timestamps,
+                "add_phoneme_timestamps": add_phoneme_timestamps,
+                "use_normalized_timestamps": use_normalized_timestamps,
+                "context_id": context_id,
             },
             request_options=request_options,
             omit=OMIT,
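Putting the tts/client.py changes together: bytes() gains generation_config, and sse() now takes SseOutputFormatParams plus timestamp and context controls. A hedged sketch of the new SSE call, with the voice id and output_format taken from the docstring example above and the model_id value purely illustrative:

from cartesia import Cartesia

client = Cartesia(api_key="your-api-key")

response = client.tts.sse(
    model_id="sonic-2",  # illustrative model id
    transcript="Hello, world!",
    voice={"mode": "id", "id": "694f9389-aac1-45b6-b726-9d9369183238"},
    language="en",
    output_format={
        "container": "raw",
        "sample_rate": 44100,
        "encoding": "pcm_f32le",
    },
    add_timestamps=True,          # new in 2.0.7: emit word-level timestamp events
    context_id="my-tts-context",  # new in 2.0.7: optional context ID
)
for chunk in response:
    # WebSocketResponse is a union; with add_timestamps=True the stream
    # interleaves timestamp events with audio chunk events.
    print(type(chunk).__name__)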