cartesia-2.0.4-py3-none-any.whl → cartesia-2.0.6-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53)
  1. cartesia/__init__.py +60 -1
  2. cartesia/auth/client.py +8 -8
  3. cartesia/auth/requests/token_grant.py +7 -1
  4. cartesia/auth/requests/token_request.py +3 -3
  5. cartesia/auth/types/token_grant.py +7 -2
  6. cartesia/auth/types/token_request.py +3 -3
  7. cartesia/base_client.py +2 -0
  8. cartesia/client.py +5 -0
  9. cartesia/core/client_wrapper.py +1 -1
  10. cartesia/stt/__init__.py +57 -0
  11. cartesia/stt/_async_websocket.py +293 -0
  12. cartesia/stt/_websocket.py +294 -0
  13. cartesia/stt/client.py +456 -0
  14. cartesia/stt/requests/__init__.py +29 -0
  15. cartesia/stt/requests/done_message.py +14 -0
  16. cartesia/stt/requests/error_message.py +16 -0
  17. cartesia/stt/requests/flush_done_message.py +14 -0
  18. cartesia/stt/requests/streaming_transcription_response.py +41 -0
  19. cartesia/stt/requests/transcript_message.py +40 -0
  20. cartesia/stt/requests/transcription_response.py +28 -0
  21. cartesia/stt/requests/transcription_word.py +20 -0
  22. cartesia/stt/socket_client.py +138 -0
  23. cartesia/stt/types/__init__.py +33 -0
  24. cartesia/stt/types/done_message.py +26 -0
  25. cartesia/stt/types/error_message.py +27 -0
  26. cartesia/stt/types/flush_done_message.py +26 -0
  27. cartesia/stt/types/streaming_transcription_response.py +94 -0
  28. cartesia/stt/types/stt_encoding.py +7 -0
  29. cartesia/stt/types/timestamp_granularity.py +5 -0
  30. cartesia/stt/types/transcript_message.py +50 -0
  31. cartesia/stt/types/transcription_response.py +38 -0
  32. cartesia/stt/types/transcription_word.py +32 -0
  33. cartesia/tts/__init__.py +8 -0
  34. cartesia/tts/client.py +50 -8
  35. cartesia/tts/requests/__init__.py +4 -0
  36. cartesia/tts/requests/generation_request.py +4 -4
  37. cartesia/tts/requests/sse_output_format.py +11 -0
  38. cartesia/tts/requests/ttssse_request.py +47 -0
  39. cartesia/tts/requests/web_socket_chunk_response.py +0 -3
  40. cartesia/tts/requests/web_socket_response.py +1 -2
  41. cartesia/tts/requests/web_socket_tts_request.py +9 -1
  42. cartesia/tts/types/__init__.py +4 -0
  43. cartesia/tts/types/generation_request.py +4 -4
  44. cartesia/tts/types/sse_output_format.py +22 -0
  45. cartesia/tts/types/ttssse_request.py +58 -0
  46. cartesia/tts/types/web_socket_chunk_response.py +1 -3
  47. cartesia/tts/types/web_socket_response.py +1 -2
  48. cartesia/tts/types/web_socket_tts_request.py +11 -3
  49. cartesia/voice_changer/requests/streaming_response.py +0 -2
  50. cartesia/voice_changer/types/streaming_response.py +0 -2
  51. {cartesia-2.0.4.dist-info → cartesia-2.0.6.dist-info}/METADATA +256 -2
  52. {cartesia-2.0.4.dist-info → cartesia-2.0.6.dist-info}/RECORD +53 -26
  53. {cartesia-2.0.4.dist-info → cartesia-2.0.6.dist-info}/WHEEL +0 -0
@@ -0,0 +1,138 @@
1
+ import typing
2
+ from typing import Any, Dict, Generator, Optional
3
+
4
+ from ..core.client_wrapper import AsyncClientWrapper, SyncClientWrapper
5
+ from ._async_websocket import AsyncSttWebsocket
6
+ from ._websocket import SttWebsocket
7
+ from .client import AsyncSttClient, SttClient
8
+ from .types.stt_encoding import SttEncoding
9
+
10
+
11
class SttClientWithWebsocket(SttClient):
    """
    Extension of STT functionality that supports a synchronous WebSocket STT connection.
    """

    def __init__(self, *, client_wrapper: SyncClientWrapper):
        super().__init__(client_wrapper=client_wrapper)

    def _ws_url(self) -> str:
        """Return the client's base URL rewritten to use a WebSocket scheme.

        A base URL that already starts with ``ws://`` or ``wss://`` is returned
        unchanged. Otherwise everything up to the last ``://`` is dropped and the
        remainder is prefixed with ``ws://`` when the URL contains the substring
        ``"localhost"``, and with ``wss://`` in every other case.
        """
        base_url = self._client_wrapper.get_base_url()
        if base_url.startswith(("ws://", "wss://")):
            return base_url
        scheme = "ws" if "localhost" in base_url else "wss"
        # A URL without any "://" passes through split() whole, so it simply
        # gains the chosen scheme.
        remainder = base_url.split("://")[-1]
        return f"{scheme}://{remainder}"

    def websocket(
        self,
        *,
        model: str = "ink-whisper",
        language: Optional[str] = "en",
        encoding: SttEncoding = "pcm_s16le",
        sample_rate: int = 16000,
        min_volume: Optional[float] = None,
        max_silence_duration_secs: Optional[float] = None,
    ) -> SttWebsocket:
        """Create a WebSocket connection for real-time speech transcription.

        All arguments are keyword-only and are forwarded unchanged to
        ``SttWebsocket.connect``.

        Args:
            model: ID of the model to use for transcription. Defaults to "ink-whisper".
            language: The language of the input audio in ISO-639-1 format. Defaults to "en".
            encoding: The encoding format of the audio data. Defaults to "pcm_s16le".
            sample_rate: The sample rate of the audio in Hz. Defaults to 16000.
            min_volume: Volume threshold for voice activity detection (0.0-1.0)
            max_silence_duration_secs: Maximum duration of silence before endpointing

        Returns:
            SttWebsocket: A WebSocket client on which ``connect`` has already been called.

        Example:
            >>> client = Cartesia(api_key="your-api-key")
            >>> ws = client.stt.websocket()
            >>> for result in ws.transcribe(audio_chunks):
            ...     print(result["text"])
        """
        # The API version and key are taken from the headers the wrapper
        # would send on a regular HTTP request.
        client_headers = self._client_wrapper.get_headers()
        ws = SttWebsocket(
            ws_url=self._ws_url(),
            cartesia_version=client_headers["Cartesia-Version"],
            api_key=client_headers["X-API-Key"],
        )
        # Auto-connect like TTS does for consistency
        ws.connect(
            model=model,
            language=language,
            encoding=encoding,
            sample_rate=sample_rate,
            min_volume=min_volume,
            max_silence_duration_secs=max_silence_duration_secs,
        )
        return ws
73
+
74
+
75
class AsyncSttClientWithWebsocket(AsyncSttClient):
    """
    Extension of STT functionality that supports an asynchronous WebSocket STT connection.
    """

    def __init__(self, *, client_wrapper: AsyncClientWrapper, get_session):
        super().__init__(client_wrapper=client_wrapper)
        # Stored as-is and handed to every AsyncSttWebsocket created by websocket().
        self._get_session = get_session

    def _ws_url(self) -> str:
        """Return the client's base URL rewritten to use a WebSocket scheme.

        A base URL that already starts with ``ws://`` or ``wss://`` is returned
        unchanged. Otherwise everything up to the last ``://`` is dropped and the
        remainder is prefixed with ``ws://`` when the URL contains the substring
        ``"localhost"``, and with ``wss://`` in every other case.
        """
        base_url = self._client_wrapper.get_base_url()
        if base_url.startswith(("ws://", "wss://")):
            return base_url
        scheme = "ws" if "localhost" in base_url else "wss"
        # A URL without any "://" passes through split() whole, so it simply
        # gains the chosen scheme.
        remainder = base_url.split("://")[-1]
        return f"{scheme}://{remainder}"

    async def websocket(
        self,
        *,
        model: str = "ink-whisper",
        language: Optional[str] = "en",
        encoding: SttEncoding = "pcm_s16le",
        sample_rate: int = 16000,
        min_volume: Optional[float] = None,
        max_silence_duration_secs: Optional[float] = None,
    ) -> AsyncSttWebsocket:
        """Create an async WebSocket connection for real-time speech transcription.

        All arguments are keyword-only and are forwarded unchanged to
        ``AsyncSttWebsocket.connect``.

        Args:
            model: ID of the model to use for transcription. Defaults to "ink-whisper".
            language: The language of the input audio in ISO-639-1 format. Defaults to "en".
            encoding: The encoding format of the audio data. Defaults to "pcm_s16le".
            sample_rate: The sample rate of the audio in Hz. Defaults to 16000.
            min_volume: Volume threshold for voice activity detection (0.0-1.0)
            max_silence_duration_secs: Maximum duration of silence before endpointing

        Returns:
            AsyncSttWebsocket: An async WebSocket client on which ``connect`` has
            already been awaited.

        Example:
            >>> client = AsyncCartesia(api_key="your-api-key")
            >>> ws = await client.stt.websocket()
            >>> async for result in ws.transcribe(audio_chunks):
            ...     print(result["text"])
        """
        # The API version and key are taken from the headers the wrapper
        # would send on a regular HTTP request.
        client_headers = self._client_wrapper.get_headers()
        ws = AsyncSttWebsocket(
            ws_url=self._ws_url(),
            cartesia_version=client_headers["Cartesia-Version"],
            api_key=client_headers["X-API-Key"],
            get_session=self._get_session,
        )
        # Auto-connect like TTS does for consistency
        await ws.connect(
            model=model,
            language=language,
            encoding=encoding,
            sample_rate=sample_rate,
            min_volume=min_volume,
            max_silence_duration_secs=max_silence_duration_secs,
        )
        return ws
@@ -0,0 +1,33 @@
1
+ # This file was auto-generated by Fern from our API Definition.
2
+
3
+ from .done_message import DoneMessage
4
+ from .error_message import ErrorMessage
5
+ from .flush_done_message import FlushDoneMessage
6
+ from .streaming_transcription_response import (
7
+ StreamingTranscriptionResponse,
8
+ StreamingTranscriptionResponse_Done,
9
+ StreamingTranscriptionResponse_Error,
10
+ StreamingTranscriptionResponse_FlushDone,
11
+ StreamingTranscriptionResponse_Transcript,
12
+ )
13
+ from .stt_encoding import SttEncoding
14
+ from .timestamp_granularity import TimestampGranularity
15
+ from .transcript_message import TranscriptMessage
16
+ from .transcription_response import TranscriptionResponse
17
+ from .transcription_word import TranscriptionWord
18
+
19
# Public names of this package; each entry is imported from a sibling module above.
__all__ = [
    "DoneMessage",
    "ErrorMessage",
    "FlushDoneMessage",
    "StreamingTranscriptionResponse",
    "StreamingTranscriptionResponse_Done",
    "StreamingTranscriptionResponse_Error",
    "StreamingTranscriptionResponse_FlushDone",
    "StreamingTranscriptionResponse_Transcript",
    "SttEncoding",
    "TimestampGranularity",
    "TranscriptMessage",
    "TranscriptionResponse",
    "TranscriptionWord",
]
@@ -0,0 +1,26 @@
1
+ # This file was auto-generated by Fern from our API Definition.
2
+
3
+ from ...core.pydantic_utilities import UniversalBaseModel
4
+ import pydantic
5
+ from ...core.pydantic_utilities import IS_PYDANTIC_V2
6
+ import typing
7
+
8
+
9
class DoneMessage(UniversalBaseModel):
    """
    Acknowledgment message sent in response to a `done` command, indicating that the session is complete and the WebSocket will close.
    """

    # Declared without a default, so a value must be supplied.
    request_id: str = pydantic.Field()
    """
    Unique identifier for this transcription session.
    """

    # Both branches make instances frozen and allow extra (undeclared) fields;
    # which one runs depends on IS_PYDANTIC_V2. The non-v2 branch also sets
    # smart_union.
    if IS_PYDANTIC_V2:
        model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True)  # type: ignore # Pydantic v2
    else:

        class Config:
            frozen = True
            smart_union = True
            extra = pydantic.Extra.allow
@@ -0,0 +1,27 @@
1
+ # This file was auto-generated by Fern from our API Definition.
2
+
3
+ from ...core.pydantic_utilities import UniversalBaseModel
4
+ import typing
5
+ import pydantic
6
+ from ...core.pydantic_utilities import IS_PYDANTIC_V2
7
+
8
+
9
class ErrorMessage(UniversalBaseModel):
    """
    Error payload: a human-readable message plus, when applicable, the ID of the request it relates to.
    """

    request_id: typing.Optional[str] = pydantic.Field(default=None)
    """
    The request ID associated with the error, if applicable.
    """

    # Declared without a default, so a value must be supplied.
    message: str = pydantic.Field()
    """
    Human-readable error message describing what went wrong.
    """

    # Both branches make instances frozen and allow extra (undeclared) fields;
    # which one runs depends on IS_PYDANTIC_V2. The non-v2 branch also sets
    # smart_union.
    if IS_PYDANTIC_V2:
        model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True)  # type: ignore # Pydantic v2
    else:

        class Config:
            frozen = True
            smart_union = True
            extra = pydantic.Extra.allow
@@ -0,0 +1,26 @@
1
+ # This file was auto-generated by Fern from our API Definition.
2
+
3
+ from ...core.pydantic_utilities import UniversalBaseModel
4
+ import pydantic
5
+ from ...core.pydantic_utilities import IS_PYDANTIC_V2
6
+ import typing
7
+
8
+
9
class FlushDoneMessage(UniversalBaseModel):
    """
    Acknowledgment message sent in response to a `finalize` command, indicating that all buffered audio has been flushed and processed.
    """

    # Declared without a default, so a value must be supplied.
    request_id: str = pydantic.Field()
    """
    Unique identifier for this transcription session.
    """

    # Both branches make instances frozen and allow extra (undeclared) fields;
    # which one runs depends on IS_PYDANTIC_V2. The non-v2 branch also sets
    # smart_union.
    if IS_PYDANTIC_V2:
        model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True)  # type: ignore # Pydantic v2
    else:

        class Config:
            frozen = True
            smart_union = True
            extra = pydantic.Extra.allow
@@ -0,0 +1,94 @@
1
+ # This file was auto-generated by Fern from our API Definition.
2
+
3
+ from __future__ import annotations
4
+ from ...core.pydantic_utilities import UniversalBaseModel
5
+ import typing
6
+ from .transcription_word import TranscriptionWord
7
+ from ...core.pydantic_utilities import IS_PYDANTIC_V2
8
+ import pydantic
9
+
10
+
11
class StreamingTranscriptionResponse_Transcript(UniversalBaseModel):
    """
    Streaming response variant whose `type` is `"transcript"`: it carries transcribed text and related metadata.

    The server sends transcription results, control messages, or error messages. Each message has a `type` field to distinguish between different message types.
    """

    # Tag value that identifies this variant.
    type: typing.Literal["transcript"] = "transcript"
    # The next three fields have no default and must be supplied.
    request_id: str
    text: str
    is_final: bool
    # Optional fields; each defaults to None.
    duration: typing.Optional[float] = None
    language: typing.Optional[str] = None
    words: typing.Optional[typing.List[TranscriptionWord]] = None

    # Both branches make instances frozen and allow extra (undeclared) fields;
    # which one runs depends on IS_PYDANTIC_V2. The non-v2 branch also sets
    # smart_union.
    if IS_PYDANTIC_V2:
        model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True)  # type: ignore # Pydantic v2
    else:

        class Config:
            frozen = True
            smart_union = True
            extra = pydantic.Extra.allow
32
+
33
+
34
class StreamingTranscriptionResponse_FlushDone(UniversalBaseModel):
    """
    Streaming response variant whose `type` is `"flush_done"`; besides the tag it carries only a `request_id`.

    The server sends transcription results, control messages, or error messages. Each message has a `type` field to distinguish between different message types.
    """

    # Tag value that identifies this variant.
    type: typing.Literal["flush_done"] = "flush_done"
    # No default, so a value must be supplied.
    request_id: str

    # Both branches make instances frozen and allow extra (undeclared) fields;
    # which one runs depends on IS_PYDANTIC_V2. The non-v2 branch also sets
    # smart_union.
    if IS_PYDANTIC_V2:
        model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True)  # type: ignore # Pydantic v2
    else:

        class Config:
            frozen = True
            smart_union = True
            extra = pydantic.Extra.allow
50
+
51
+
52
class StreamingTranscriptionResponse_Done(UniversalBaseModel):
    """
    Streaming response variant whose `type` is `"done"`; besides the tag it carries only a `request_id`.

    The server sends transcription results, control messages, or error messages. Each message has a `type` field to distinguish between different message types.
    """

    # Tag value that identifies this variant.
    type: typing.Literal["done"] = "done"
    # No default, so a value must be supplied.
    request_id: str

    # Both branches make instances frozen and allow extra (undeclared) fields;
    # which one runs depends on IS_PYDANTIC_V2. The non-v2 branch also sets
    # smart_union.
    if IS_PYDANTIC_V2:
        model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True)  # type: ignore # Pydantic v2
    else:

        class Config:
            frozen = True
            smart_union = True
            extra = pydantic.Extra.allow
68
+
69
+
70
class StreamingTranscriptionResponse_Error(UniversalBaseModel):
    """
    Streaming response variant whose `type` is `"error"`: it carries an error `message` and an optional `request_id`.

    The server sends transcription results, control messages, or error messages. Each message has a `type` field to distinguish between different message types.
    """

    # Tag value that identifies this variant.
    type: typing.Literal["error"] = "error"
    # Unlike the other variants, request_id is optional here and defaults to None.
    request_id: typing.Optional[str] = None
    # No default, so a value must be supplied.
    message: str

    # Both branches make instances frozen and allow extra (undeclared) fields;
    # which one runs depends on IS_PYDANTIC_V2. The non-v2 branch also sets
    # smart_union.
    if IS_PYDANTIC_V2:
        model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True)  # type: ignore # Pydantic v2
    else:

        class Config:
            frozen = True
            smart_union = True
            extra = pydantic.Extra.allow
87
+
88
+
89
# Any one of the four message variants; each member declares a distinct
# literal `type` value ("transcript", "flush_done", "done", "error").
StreamingTranscriptionResponse = typing.Union[
    StreamingTranscriptionResponse_Transcript,
    StreamingTranscriptionResponse_FlushDone,
    StreamingTranscriptionResponse_Done,
    StreamingTranscriptionResponse_Error,
]
@@ -0,0 +1,7 @@
1
+ # This file was auto-generated by Fern from our API Definition.
2
+
3
+ import typing
4
+
5
# Audio encoding names listed for STT input. The union also includes
# typing.Any, so as a type annotation it does not restrict values to this list.
SttEncoding = typing.Union[
    typing.Literal["pcm_s16le", "pcm_s32le", "pcm_f16le", "pcm_f32le", "pcm_mulaw", "pcm_alaw"], typing.Any
]
@@ -0,0 +1,5 @@
1
+ # This file was auto-generated by Fern from our API Definition.
2
+
3
+ import typing
4
+
5
# "word" is the only granularity listed. The union also includes typing.Any,
# so as a type annotation it does not restrict values to that literal.
TimestampGranularity = typing.Union[typing.Literal["word"], typing.Any]
@@ -0,0 +1,50 @@
1
+ # This file was auto-generated by Fern from our API Definition.
2
+
3
+ from ...core.pydantic_utilities import UniversalBaseModel
4
+ import pydantic
5
+ import typing
6
+ from .transcription_word import TranscriptionWord
7
+ from ...core.pydantic_utilities import IS_PYDANTIC_V2
8
+
9
+
10
class TranscriptMessage(UniversalBaseModel):
    """
    A transcription result for a session: the text, whether it is final, and optional duration, language and word timings.
    """

    # request_id, text and is_final are declared without defaults and must be supplied.
    request_id: str = pydantic.Field()
    """
    Unique identifier for this transcription session.
    """

    text: str = pydantic.Field()
    """
    The transcribed text. May be partial or final depending on is_final.

    **Note**: Text may be empty in initial responses while the system accumulates sufficient audio for transcription. This is normal behavior - wait for responses with non-empty text or monitor is_final for completion status.
    """

    is_final: bool = pydantic.Field()
    """
    Whether this is a final transcription result or an interim result.
    """

    # The remaining fields are optional and default to None.
    duration: typing.Optional[float] = pydantic.Field(default=None)
    """
    The duration of the audio transcribed so far, in seconds.
    """

    language: typing.Optional[str] = pydantic.Field(default=None)
    """
    The specified language of the input audio.
    """

    words: typing.Optional[typing.List[TranscriptionWord]] = pydantic.Field(default=None)
    """
    Word-level timestamps showing the start and end time of each word in seconds. Always included in streaming responses.
    """

    # Both branches make instances frozen and allow extra (undeclared) fields;
    # which one runs depends on IS_PYDANTIC_V2. The non-v2 branch also sets
    # smart_union.
    if IS_PYDANTIC_V2:
        model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True)  # type: ignore # Pydantic v2
    else:

        class Config:
            frozen = True
            smart_union = True
            extra = pydantic.Extra.allow
@@ -0,0 +1,38 @@
1
+ # This file was auto-generated by Fern from our API Definition.
2
+
3
+ from ...core.pydantic_utilities import UniversalBaseModel
4
+ import pydantic
5
+ import typing
6
+ from .transcription_word import TranscriptionWord
7
+ from ...core.pydantic_utilities import IS_PYDANTIC_V2
8
+
9
+
10
class TranscriptionResponse(UniversalBaseModel):
    """
    A transcription result: the transcribed text plus optional language, duration and word-level timestamps.
    """

    # Declared without a default, so a value must be supplied.
    text: str = pydantic.Field()
    """
    The transcribed text.
    """

    # The remaining fields are optional and default to None.
    language: typing.Optional[str] = pydantic.Field(default=None)
    """
    The specified language of the input audio.
    """

    duration: typing.Optional[float] = pydantic.Field(default=None)
    """
    The duration of the input audio in seconds.
    """

    words: typing.Optional[typing.List[TranscriptionWord]] = pydantic.Field(default=None)
    """
    Word-level timestamps showing the start and end time of each word. Only included when `[word]` is passed into `timestamp_granularities[]`.
    """

    # Both branches make instances frozen and allow extra (undeclared) fields;
    # which one runs depends on IS_PYDANTIC_V2. The non-v2 branch also sets
    # smart_union.
    if IS_PYDANTIC_V2:
        model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True)  # type: ignore # Pydantic v2
    else:

        class Config:
            frozen = True
            smart_union = True
            extra = pydantic.Extra.allow
@@ -0,0 +1,32 @@
1
+ # This file was auto-generated by Fern from our API Definition.
2
+
3
+ from ...core.pydantic_utilities import UniversalBaseModel
4
+ import pydantic
5
+ from ...core.pydantic_utilities import IS_PYDANTIC_V2
6
+ import typing
7
+
8
+
9
class TranscriptionWord(UniversalBaseModel):
    """
    A single transcribed word together with its start and end times in seconds.
    """

    # All three fields are declared without defaults and must be supplied.
    word: str = pydantic.Field()
    """
    The transcribed word.
    """

    start: float = pydantic.Field()
    """
    Start time of the word in seconds.
    """

    end: float = pydantic.Field()
    """
    End time of the word in seconds.
    """

    # Both branches make instances frozen and allow extra (undeclared) fields;
    # which one runs depends on IS_PYDANTIC_V2. The non-v2 branch also sets
    # smart_union.
    if IS_PYDANTIC_V2:
        model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True)  # type: ignore # Pydantic v2
    else:

        class Config:
            frozen = True
            smart_union = True
            extra = pydantic.Extra.allow
cartesia/tts/__init__.py CHANGED
@@ -19,11 +19,13 @@ from .types import (
19
19
  RawEncoding,
20
20
  RawOutputFormat,
21
21
  Speed,
22
+ SseOutputFormat,
22
23
  SupportedLanguage,
23
24
  TtsRequest,
24
25
  TtsRequestEmbeddingSpecifier,
25
26
  TtsRequestIdSpecifier,
26
27
  TtsRequestVoiceSpecifier,
28
+ TtssseRequest,
27
29
  WavOutputFormat,
28
30
  WebSocketBaseResponse,
29
31
  WebSocketChunkResponse,
@@ -58,10 +60,12 @@ from .requests import (
58
60
  PhonemeTimestampsParams,
59
61
  RawOutputFormatParams,
60
62
  SpeedParams,
63
+ SseOutputFormatParams,
61
64
  TtsRequestEmbeddingSpecifierParams,
62
65
  TtsRequestIdSpecifierParams,
63
66
  TtsRequestParams,
64
67
  TtsRequestVoiceSpecifierParams,
68
+ TtssseRequestParams,
65
69
  WavOutputFormatParams,
66
70
  WebSocketBaseResponseParams,
67
71
  WebSocketChunkResponseParams,
@@ -115,6 +119,8 @@ __all__ = [
115
119
  "RawOutputFormatParams",
116
120
  "Speed",
117
121
  "SpeedParams",
122
+ "SseOutputFormat",
123
+ "SseOutputFormatParams",
118
124
  "SupportedLanguage",
119
125
  "TtsRequest",
120
126
  "TtsRequestEmbeddingSpecifier",
@@ -124,6 +130,8 @@ __all__ = [
124
130
  "TtsRequestParams",
125
131
  "TtsRequestVoiceSpecifier",
126
132
  "TtsRequestVoiceSpecifierParams",
133
+ "TtssseRequest",
134
+ "TtssseRequestParams",
127
135
  "WavOutputFormat",
128
136
  "WavOutputFormatParams",
129
137
  "WebSocketBaseResponse",
cartesia/tts/client.py CHANGED
@@ -10,6 +10,8 @@ from ..core.request_options import RequestOptions
10
10
  from ..core.serialization import convert_and_respect_annotation_metadata
11
11
  from json.decoder import JSONDecodeError
12
12
  from ..core.api_error import ApiError
13
+ from .requests.sse_output_format import SseOutputFormatParams
14
+ from .types.context_id import ContextId
13
15
  from .types.web_socket_response import WebSocketResponse
14
16
  import httpx_sse
15
17
  from ..core.pydantic_utilities import parse_obj_as
@@ -119,10 +121,14 @@ class TtsClient:
119
121
  model_id: str,
120
122
  transcript: str,
121
123
  voice: TtsRequestVoiceSpecifierParams,
122
- output_format: OutputFormatParams,
124
+ output_format: SseOutputFormatParams,
123
125
  language: typing.Optional[SupportedLanguage] = OMIT,
124
126
  duration: typing.Optional[float] = OMIT,
125
127
  speed: typing.Optional[ModelSpeed] = OMIT,
128
+ add_timestamps: typing.Optional[bool] = OMIT,
129
+ add_phoneme_timestamps: typing.Optional[bool] = OMIT,
130
+ use_normalized_timestamps: typing.Optional[bool] = OMIT,
131
+ context_id: typing.Optional[ContextId] = OMIT,
126
132
  request_options: typing.Optional[RequestOptions] = None,
127
133
  ) -> typing.Iterator[WebSocketResponse]:
128
134
  """
@@ -135,7 +141,7 @@ class TtsClient:
135
141
 
136
142
  voice : TtsRequestVoiceSpecifierParams
137
143
 
138
- output_format : OutputFormatParams
144
+ output_format : SseOutputFormatParams
139
145
 
140
146
  language : typing.Optional[SupportedLanguage]
141
147
 
@@ -145,6 +151,18 @@ class TtsClient:
145
151
 
146
152
  speed : typing.Optional[ModelSpeed]
147
153
 
154
+ add_timestamps : typing.Optional[bool]
155
+ Whether to return word-level timestamps. If `false` (default), no word timestamps will be produced at all. If `true`, the server will return timestamp events containing word-level timing information.
156
+
157
+ add_phoneme_timestamps : typing.Optional[bool]
158
+ Whether to return phoneme-level timestamps. If `false` (default), no phoneme timestamps will be produced - if `add_timestamps` is `true`, the produced timestamps will be word timestamps instead. If `true`, the server will return timestamp events containing phoneme-level timing information.
159
+
160
+ use_normalized_timestamps : typing.Optional[bool]
161
+ Whether to use normalized timestamps (True) or original timestamps (False).
162
+
163
+ context_id : typing.Optional[ContextId]
164
+ Optional context ID for this request.
165
+
148
166
  request_options : typing.Optional[RequestOptions]
149
167
  Request-specific configuration.
150
168
 
@@ -165,9 +183,9 @@ class TtsClient:
165
183
  voice={"mode": "id", "id": "694f9389-aac1-45b6-b726-9d9369183238"},
166
184
  language="en",
167
185
  output_format={
186
+ "container": "raw",
168
187
  "sample_rate": 44100,
169
188
  "encoding": "pcm_f32le",
170
- "container": "raw",
171
189
  },
172
190
  )
173
191
  for chunk in response:
@@ -184,10 +202,14 @@ class TtsClient:
184
202
  ),
185
203
  "language": language,
186
204
  "output_format": convert_and_respect_annotation_metadata(
187
- object_=output_format, annotation=OutputFormatParams, direction="write"
205
+ object_=output_format, annotation=SseOutputFormatParams, direction="write"
188
206
  ),
189
207
  "duration": duration,
190
208
  "speed": speed,
209
+ "add_timestamps": add_timestamps,
210
+ "add_phoneme_timestamps": add_phoneme_timestamps,
211
+ "use_normalized_timestamps": use_normalized_timestamps,
212
+ "context_id": context_id,
191
213
  },
192
214
  request_options=request_options,
193
215
  omit=OMIT,
@@ -321,10 +343,14 @@ class AsyncTtsClient:
321
343
  model_id: str,
322
344
  transcript: str,
323
345
  voice: TtsRequestVoiceSpecifierParams,
324
- output_format: OutputFormatParams,
346
+ output_format: SseOutputFormatParams,
325
347
  language: typing.Optional[SupportedLanguage] = OMIT,
326
348
  duration: typing.Optional[float] = OMIT,
327
349
  speed: typing.Optional[ModelSpeed] = OMIT,
350
+ add_timestamps: typing.Optional[bool] = OMIT,
351
+ add_phoneme_timestamps: typing.Optional[bool] = OMIT,
352
+ use_normalized_timestamps: typing.Optional[bool] = OMIT,
353
+ context_id: typing.Optional[ContextId] = OMIT,
328
354
  request_options: typing.Optional[RequestOptions] = None,
329
355
  ) -> typing.AsyncIterator[WebSocketResponse]:
330
356
  """
@@ -337,7 +363,7 @@ class AsyncTtsClient:
337
363
 
338
364
  voice : TtsRequestVoiceSpecifierParams
339
365
 
340
- output_format : OutputFormatParams
366
+ output_format : SseOutputFormatParams
341
367
 
342
368
  language : typing.Optional[SupportedLanguage]
343
369
 
@@ -347,6 +373,18 @@ class AsyncTtsClient:
347
373
 
348
374
  speed : typing.Optional[ModelSpeed]
349
375
 
376
+ add_timestamps : typing.Optional[bool]
377
+ Whether to return word-level timestamps. If `false` (default), no word timestamps will be produced at all. If `true`, the server will return timestamp events containing word-level timing information.
378
+
379
+ add_phoneme_timestamps : typing.Optional[bool]
380
+ Whether to return phoneme-level timestamps. If `false` (default), no phoneme timestamps will be produced - if `add_timestamps` is `true`, the produced timestamps will be word timestamps instead. If `true`, the server will return timestamp events containing phoneme-level timing information.
381
+
382
+ use_normalized_timestamps : typing.Optional[bool]
383
+ Whether to use normalized timestamps (True) or original timestamps (False).
384
+
385
+ context_id : typing.Optional[ContextId]
386
+ Optional context ID for this request.
387
+
350
388
  request_options : typing.Optional[RequestOptions]
351
389
  Request-specific configuration.
352
390
 
@@ -372,9 +410,9 @@ class AsyncTtsClient:
372
410
  voice={"mode": "id", "id": "694f9389-aac1-45b6-b726-9d9369183238"},
373
411
  language="en",
374
412
  output_format={
413
+ "container": "raw",
375
414
  "sample_rate": 44100,
376
415
  "encoding": "pcm_f32le",
377
- "container": "raw",
378
416
  },
379
417
  )
380
418
  async for chunk in response:
@@ -394,10 +432,14 @@ class AsyncTtsClient:
394
432
  ),
395
433
  "language": language,
396
434
  "output_format": convert_and_respect_annotation_metadata(
397
- object_=output_format, annotation=OutputFormatParams, direction="write"
435
+ object_=output_format, annotation=SseOutputFormatParams, direction="write"
398
436
  ),
399
437
  "duration": duration,
400
438
  "speed": speed,
439
+ "add_timestamps": add_timestamps,
440
+ "add_phoneme_timestamps": add_phoneme_timestamps,
441
+ "use_normalized_timestamps": use_normalized_timestamps,
442
+ "context_id": context_id,
401
443
  },
402
444
  request_options=request_options,
403
445
  omit=OMIT,