cartesia 2.0.4__py3-none-any.whl → 2.0.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,272 @@
1
+ import json
2
+ import typing
3
+ import uuid
4
+ from typing import Any, Dict, Generator, Optional, Union
5
+
6
+ try:
7
+ from websockets.sync.client import connect
8
+ IS_WEBSOCKET_SYNC_AVAILABLE = True
9
+ except ImportError:
10
+ IS_WEBSOCKET_SYNC_AVAILABLE = False
11
+
12
+ from cartesia.stt.types import (
13
+ StreamingTranscriptionResponse,
14
+ StreamingTranscriptionResponse_Error,
15
+ StreamingTranscriptionResponse_Transcript,
16
+ )
17
+
18
+ from ..core.pydantic_utilities import parse_obj_as
19
+
20
+
21
class SttWebsocket:
    """This class contains methods to transcribe audio using WebSocket. Ideal for real-time speech transcription.

    Usage:
        >>> ws = client.stt.websocket()
        >>> for audio_chunk in audio_chunks:
        ...     ws.send(audio_chunk)
        >>> ws.send("finalize")  # Flush remaining audio
        >>> ws.send("done")  # Close session
        >>> for transcription in ws.receive():
        ...     print(transcription["text"])
    """

    def __init__(
        self,
        ws_url: str,
        api_key: str,
        cartesia_version: str,
    ):
        self.ws_url = ws_url
        self.api_key = api_key
        self.cartesia_version = cartesia_version
        self.websocket: Optional[Any] = None
        self._is_listening = False
        # Defaults used when send()/receive() auto-connect without an explicit connect().
        self._default_model: str = "ink-whisper"
        self._default_language: Optional[str] = "en"
        self._default_encoding: Optional[str] = "pcm_s16le"
        self._default_sample_rate: int = 16000

    def __del__(self):
        # Best-effort cleanup. Never raise from __del__: CPython ignores (and only
        # prints) exceptions raised here, and raising could mask errors that occur
        # during interpreter shutdown.
        try:
            self.close()
        except Exception:
            pass

    def connect(
        self,
        *,
        model: str = "ink-whisper",
        language: Optional[str] = "en",
        encoding: Optional[str] = "pcm_s16le",
        sample_rate: int = 16000,
    ):
        """Connect to the STT WebSocket with the specified parameters.

        Args:
            model: ID of the model to use for transcription
            language: The language of the input audio in ISO-639-1 format
            encoding: The encoding format of the audio data
            sample_rate: The sample rate of the audio in Hz

        Raises:
            ImportError: If the synchronous websockets client is not installed.
            RuntimeError: If the connection to the WebSocket fails.
        """
        # Remember parameters so later auto-connects reuse them.
        self._default_model = model
        self._default_language = language
        self._default_encoding = encoding
        self._default_sample_rate = sample_rate

        if not IS_WEBSOCKET_SYNC_AVAILABLE:
            raise ImportError(
                "The synchronous WebSocket client is not available. Please ensure that you have 'websockets>=12.0' or compatible version installed."
            )
        if self.websocket is None or self._is_websocket_closed():
            # Local import keeps this module importable even in stripped-down
            # environments; urlencode percent-escapes the query values (the previous
            # hand-rolled join did not escape anything).
            from urllib.parse import urlencode

            params: Dict[str, str] = {
                "model": model,
                "api_key": self.api_key,
                "cartesia_version": self.cartesia_version,
            }
            if language is not None:
                params["language"] = language
            if encoding is not None:
                params["encoding"] = encoding
            if sample_rate is not None:
                params["sample_rate"] = str(sample_rate)

            url = f"{self.ws_url}/stt/websocket?{urlencode(params)}"

            try:
                self.websocket = connect(url)
            except Exception as e:
                # websockets raises InvalidStatus-style errors carrying an HTTP
                # status; surface a friendlier message for the common codes.
                status_code = getattr(e, "status", None)
                if status_code is not None:
                    friendly = {
                        402: "Payment required. Your API key may have insufficient credits or permissions.",
                        401: "Unauthorized. Please check your API key.",
                        403: "Forbidden. You don't have permission to access this resource.",
                        404: "Not found. The requested resource doesn't exist.",
                    }
                    error_message = friendly.get(status_code, str(e))
                    raise RuntimeError(
                        f"Failed to connect to WebSocket.\nStatus: {status_code}. Error message: {error_message}"
                    ) from e
                raise RuntimeError(f"Failed to connect to WebSocket. {e}") from e

    def _is_websocket_closed(self):
        # The sync websockets client exposes its underlying socket; fileno() == -1
        # once that socket has been closed.
        return self.websocket is None or (
            hasattr(self.websocket, "socket") and self.websocket.socket.fileno() == -1
        )

    def close(self):
        """This method closes the WebSocket connection. Highly recommended to call this method when done using the WebSocket."""
        if self.websocket and not self._is_websocket_closed():
            self.websocket.close()

    def _ensure_connected(self):
        """Auto-connect with the stored default parameters if not currently connected."""
        if self.websocket is None or self._is_websocket_closed():
            self.connect(
                model=self._default_model,
                language=self._default_language,
                encoding=self._default_encoding,
                sample_rate=self._default_sample_rate,
            )
        assert self.websocket is not None, "WebSocket should be connected after connect() call"

    def send(self, data: Union[bytes, str]):
        """Send audio data or control commands to the WebSocket.

        Args:
            data: Binary audio data or text command ("finalize" or "done")

        Raises:
            TypeError: If data is neither bytes nor str.
        """
        self._ensure_connected()
        if isinstance(data, (bytes, str)):
            # Audio chunks (bytes) and control commands (str) are forwarded as-is.
            self.websocket.send(data)
        else:
            raise TypeError("Data must be bytes (audio) or str (command)")

    def receive(self) -> Generator[Dict[str, Any], None, None]:
        """Receive transcription results from the WebSocket.

        Yields:
            Dictionary containing transcription results, flush_done, done, or error messages

        Raises:
            RuntimeError: If the server reports an error or receiving fails.
        """
        self._ensure_connected()

        try:
            while True:
                message = self.websocket.recv()
                if not isinstance(message, str):
                    # Binary frames are not expected from the STT endpoint; skip them.
                    continue
                raw_data = json.loads(message)
                msg_type = raw_data.get("type")

                if msg_type == "error":
                    raise RuntimeError(
                        f"Error transcribing audio: {raw_data.get('message', 'Unknown error')}"
                    )

                if msg_type == "transcript":
                    # Tolerate missing fields by providing sensible defaults.
                    result: Dict[str, Any] = {
                        "type": raw_data["type"],
                        "request_id": raw_data.get("request_id", ""),
                        "text": raw_data.get("text", ""),  # may be empty early in a session
                        "is_final": raw_data.get("is_final", False),
                    }
                    if "duration" in raw_data:
                        result["duration"] = raw_data["duration"]
                    if "language" in raw_data:
                        result["language"] = raw_data["language"]
                    yield result
                elif msg_type == "flush_done":
                    yield {"type": raw_data["type"], "request_id": raw_data.get("request_id", "")}
                elif msg_type == "done":
                    yield {"type": raw_data["type"], "request_id": raw_data.get("request_id", "")}
                    break  # Session complete.
        except RuntimeError:
            # Server-reported error: close once and propagate unchanged. (The
            # previous implementation closed twice and re-wrapped the message
            # in two extra RuntimeError layers.)
            self.close()
            raise
        except Exception as e:
            self.close()
            raise RuntimeError(f"Failed to receive transcription. {e}") from e

    def transcribe(
        self,
        audio_chunks: typing.Iterator[bytes],
        *,
        model: str = "ink-whisper",
        language: Optional[str] = "en",
        encoding: Optional[str] = "pcm_s16le",
        sample_rate: int = 16000,
    ) -> Generator[Dict[str, Any], None, None]:
        """Transcribe audio chunks using the WebSocket.

        Args:
            audio_chunks: Iterator of audio chunks as bytes
            model: ID of the model to use for transcription
            language: The language of the input audio in ISO-639-1 format
            encoding: The encoding format of the audio data
            sample_rate: The sample rate of the audio in Hz

        Yields:
            Dictionary containing transcription results, flush_done, done, or error messages
        """
        self.connect(
            model=model,
            language=language,
            encoding=encoding,
            sample_rate=sample_rate,
        )
        try:
            for chunk in audio_chunks:
                self.send(chunk)
            self.send("finalize")  # Flush remaining buffered audio server-side.
            self.send("done")  # Ask the server to end the session cleanly.
            yield from self.receive()
        finally:
            self.close()
@@ -0,0 +1,27 @@
1
# This file was auto-generated by Fern from our API Definition.

from .done_message import DoneMessageParams
from .error_message import ErrorMessageParams
from .flush_done_message import FlushDoneMessageParams
from .streaming_transcription_response import (
    StreamingTranscriptionResponseParams,
    StreamingTranscriptionResponse_DoneParams,
    StreamingTranscriptionResponse_ErrorParams,
    StreamingTranscriptionResponse_FlushDoneParams,
    StreamingTranscriptionResponse_TranscriptParams,
)
from .transcript_message import TranscriptMessageParams
from .transcription_response import TranscriptionResponseParams

# Public API of the stt.requests subpackage.
__all__ = [
    "DoneMessageParams",
    "ErrorMessageParams",
    "FlushDoneMessageParams",
    "StreamingTranscriptionResponseParams",
    "StreamingTranscriptionResponse_DoneParams",
    "StreamingTranscriptionResponse_ErrorParams",
    "StreamingTranscriptionResponse_FlushDoneParams",
    "StreamingTranscriptionResponse_TranscriptParams",
    "TranscriptMessageParams",
    "TranscriptionResponseParams",
]
@@ -0,0 +1,14 @@
1
# This file was auto-generated by Fern from our API Definition.

import typing_extensions


class DoneMessageParams(typing_extensions.TypedDict):
    """
    Acknowledgment sent in reply to a `done` command: the session is complete and the WebSocket is about to close.
    """

    request_id: str
    """
    Unique identifier for this transcription session.
    """
@@ -0,0 +1,16 @@
1
# This file was auto-generated by Fern from our API Definition.

import typing_extensions  # previously imported twice; duplicate removed


class ErrorMessageParams(typing_extensions.TypedDict):
    """
    Error payload sent by the STT WebSocket when a request fails.
    """

    request_id: typing_extensions.NotRequired[str]
    """
    The request ID associated with the error, if applicable.
    """

    message: str
    """
    Human-readable error message describing what went wrong.
    """
@@ -0,0 +1,14 @@
1
# This file was auto-generated by Fern from our API Definition.

import typing_extensions


class FlushDoneMessageParams(typing_extensions.TypedDict):
    """
    Acknowledgment sent in reply to a `finalize` command: all buffered audio has been flushed and processed.
    """

    request_id: str
    """
    Unique identifier for this transcription session.
    """
@@ -0,0 +1,39 @@
1
# This file was auto-generated by Fern from our API Definition.

from __future__ import annotations

import typing

import typing_extensions  # previously imported twice; duplicate removed


class StreamingTranscriptionResponse_TranscriptParams(typing_extensions.TypedDict):
    """A (partial or final) transcript message; discriminated by type == "transcript"."""

    type: typing.Literal["transcript"]
    request_id: str
    text: str
    is_final: bool
    duration: typing_extensions.NotRequired[float]
    language: typing_extensions.NotRequired[str]


class StreamingTranscriptionResponse_FlushDoneParams(typing_extensions.TypedDict):
    """Acknowledgment of a `finalize` command; discriminated by type == "flush_done"."""

    type: typing.Literal["flush_done"]
    request_id: str


class StreamingTranscriptionResponse_DoneParams(typing_extensions.TypedDict):
    """Acknowledgment that the session is complete; discriminated by type == "done"."""

    type: typing.Literal["done"]
    request_id: str


class StreamingTranscriptionResponse_ErrorParams(typing_extensions.TypedDict):
    """An error report; discriminated by type == "error"."""

    type: typing.Literal["error"]
    request_id: typing_extensions.NotRequired[str]
    message: str


# Discriminated union over the "type" field of streaming STT responses.
StreamingTranscriptionResponseParams = typing.Union[
    StreamingTranscriptionResponse_TranscriptParams,
    StreamingTranscriptionResponse_FlushDoneParams,
    StreamingTranscriptionResponse_DoneParams,
    StreamingTranscriptionResponse_ErrorParams,
]
@@ -0,0 +1,33 @@
1
# This file was auto-generated by Fern from our API Definition.

import typing_extensions  # previously imported twice; duplicate removed


class TranscriptMessageParams(typing_extensions.TypedDict):
    """A single transcript update for a streaming STT session."""

    request_id: str
    """
    Unique identifier for this transcription session.
    """

    text: str
    """
    The transcribed text. May be partial or final depending on is_final.

    **Note**: Text may be empty in initial responses while the system accumulates sufficient audio for transcription. This is normal behavior - wait for responses with non-empty text or monitor is_final for completion status.
    """

    is_final: bool
    """
    Whether this is a final transcription result or an interim result.
    """

    duration: typing_extensions.NotRequired[float]
    """
    The duration of the audio transcribed so far, in seconds.
    """

    language: typing_extensions.NotRequired[str]
    """
    The detected or specified language of the input audio.
    """
@@ -0,0 +1,21 @@
1
# This file was auto-generated by Fern from our API Definition.

import typing_extensions  # previously imported twice; duplicate removed


class TranscriptionResponseParams(typing_extensions.TypedDict):
    """The result of a (non-streaming) transcription request."""

    text: str
    """
    The transcribed text.
    """

    language: typing_extensions.NotRequired[str]
    """
    The detected or specified language of the input audio.
    """

    duration: typing_extensions.NotRequired[float]
    """
    The duration of the input audio in seconds.
    """
@@ -0,0 +1,195 @@
1
+ import typing
2
+ from typing import Any, Dict, Generator, Optional, Union
3
+
4
+ from ..core.client_wrapper import AsyncClientWrapper, SyncClientWrapper
5
+ from ._async_websocket import AsyncSttWebsocket
6
+ from ._websocket import SttWebsocket
7
+
8
+
9
class SttClientWithWebsocket:
    """
    Extension of STT functionality that supports a synchronous WebSocket STT connection.
    """

    def __init__(self, *, client_wrapper: SyncClientWrapper):
        self._client_wrapper = client_wrapper

    def _ws_url(self) -> str:
        # `-> str` added for consistency with AsyncSttClientWithWebsocket._ws_url.
        # Derives the WebSocket URL from the configured HTTP base URL, using the
        # insecure "ws" scheme only for localhost.
        base_url = self._client_wrapper.get_base_url()
        if base_url.startswith(("ws://", "wss://")):
            return base_url
        prefix = "ws" if "localhost" in base_url else "wss"
        base_url_without_protocol = base_url.split("://")[-1]
        return f"{prefix}://{base_url_without_protocol}"

    def websocket(self, *,
                  model: str = "ink-whisper",
                  language: Optional[str] = "en",
                  encoding: Optional[str] = "pcm_s16le",
                  sample_rate: int = 16000):
        """Create a WebSocket connection for real-time speech transcription.

        Args:
            model: ID of the model to use for transcription
            language: The language of the input audio in ISO-639-1 format
            encoding: The encoding format of the audio data
            sample_rate: The sample rate of the audio in Hz

        Returns:
            SttWebsocket: A connected WebSocket client for STT operations.
        """
        client_headers = self._client_wrapper.get_headers()
        ws = SttWebsocket(
            ws_url=self._ws_url(),
            cartesia_version=client_headers["Cartesia-Version"],
            api_key=client_headers["X-API-Key"],
        )
        # Auto-connect like TTS does for consistency.
        ws.connect(
            model=model,
            language=language,
            encoding=encoding,
            sample_rate=sample_rate,
        )
        return ws

    def transcribe(
        self,
        audio_chunks: typing.Iterator[bytes],
        *,
        model: str = "ink-whisper",
        language: Optional[str] = "en",
        encoding: Optional[str] = "pcm_s16le",
        sample_rate: int = 16000,
    ) -> Generator[Dict[str, Any], None, None]:
        """Transcribe audio chunks using WebSocket.

        Args:
            audio_chunks: Iterator of audio chunks as bytes
            model: ID of the model to use for transcription
            language: The language of the input audio in ISO-639-1 format
            encoding: The encoding format of the audio data
            sample_rate: The sample rate of the audio in Hz

        Yields:
            Dictionary containing transcription results, flush_done, done, or error messages

        Example:
            >>> client = Cartesia(api_key="your-api-key")
            >>> ws_client = client.stt.websocket()
            >>> for result in ws_client.transcribe(audio_chunks):
            ...     print(result["text"])
        """
        ws = self.websocket(
            model=model,
            language=language,
            encoding=encoding,
            sample_rate=sample_rate,
        )
        try:
            # ws is already connected; SttWebsocket.transcribe's own connect()
            # call is a no-op on an open connection.
            yield from ws.transcribe(
                audio_chunks,
                model=model,
                language=language,
                encoding=encoding,
                sample_rate=sample_rate,
            )
        finally:
            ws.close()
100
+
101
+
102
class AsyncSttClientWithWebsocket:
    """
    Extension of STT functionality that supports an asynchronous WebSocket STT connection.
    """

    def __init__(self, *, client_wrapper: AsyncClientWrapper, get_session):
        self._client_wrapper = client_wrapper
        self._get_session = get_session

    def _ws_url(self) -> str:
        # Reuse the HTTP base URL, swapping in a WebSocket scheme when needed.
        base_url = self._client_wrapper.get_base_url()
        if base_url.startswith("ws://") or base_url.startswith("wss://"):
            return base_url
        scheme = "ws" if "localhost" in base_url else "wss"
        host_and_path = base_url.split("://")[-1]
        return f"{scheme}://{host_and_path}"

    async def websocket(self, *,
                        model: str = "ink-whisper",
                        language: Optional[str] = "en",
                        encoding: Optional[str] = "pcm_s16le",
                        sample_rate: int = 16000):
        """Create an async WebSocket connection for real-time speech transcription.

        Args:
            model: ID of the model to use for transcription
            language: The language of the input audio in ISO-639-1 format
            encoding: The encoding format of the audio data
            sample_rate: The sample rate of the audio in Hz

        Returns:
            AsyncSttWebsocket: A connected async WebSocket client for STT operations.
        """
        headers = self._client_wrapper.get_headers()
        socket = AsyncSttWebsocket(
            ws_url=self._ws_url(),
            cartesia_version=headers["Cartesia-Version"],
            api_key=headers["X-API-Key"],
            get_session=self._get_session,
        )
        # Connect eagerly so the returned client is immediately usable,
        # mirroring the synchronous client's behavior.
        await socket.connect(
            model=model,
            language=language,
            encoding=encoding,
            sample_rate=sample_rate,
        )
        return socket

    async def transcribe(
        self,
        audio_chunks: typing.AsyncIterator[bytes],
        *,
        model: str = "ink-whisper",
        language: Optional[str] = "en",
        encoding: Optional[str] = "pcm_s16le",
        sample_rate: int = 16000,
    ) -> typing.AsyncGenerator[Dict[str, Any], None]:
        """Transcribe audio chunks using async WebSocket.

        Args:
            audio_chunks: Async iterator of audio chunks as bytes
            model: ID of the model to use for transcription
            language: The language of the input audio in ISO-639-1 format
            encoding: The encoding format of the audio data
            sample_rate: The sample rate of the audio in Hz

        Yields:
            Dictionary containing transcription results, flush_done, done, or error messages

        Example:
            >>> client = AsyncCartesia(api_key="your-api-key")
            >>> ws_client = await client.stt.websocket()
            >>> async for result in ws_client.transcribe(audio_chunks):
            ...     print(result["text"])
        """
        socket = await self.websocket(
            model=model,
            language=language,
            encoding=encoding,
            sample_rate=sample_rate,
        )
        try:
            async for item in socket.transcribe(
                audio_chunks,
                model=model,
                language=language,
                encoding=encoding,
                sample_rate=sample_rate,
            ):
                yield item
        finally:
            await socket.close()
@@ -0,0 +1,29 @@
1
# This file was auto-generated by Fern from our API Definition.

from .done_message import DoneMessage
from .error_message import ErrorMessage
from .flush_done_message import FlushDoneMessage
from .streaming_transcription_response import (
    StreamingTranscriptionResponse,
    StreamingTranscriptionResponse_Done,
    StreamingTranscriptionResponse_Error,
    StreamingTranscriptionResponse_FlushDone,
    StreamingTranscriptionResponse_Transcript,
)
from .stt_encoding import SttEncoding
from .transcript_message import TranscriptMessage
from .transcription_response import TranscriptionResponse

# Public API of the stt.types subpackage.
__all__ = [
    "DoneMessage",
    "ErrorMessage",
    "FlushDoneMessage",
    "StreamingTranscriptionResponse",
    "StreamingTranscriptionResponse_Done",
    "StreamingTranscriptionResponse_Error",
    "StreamingTranscriptionResponse_FlushDone",
    "StreamingTranscriptionResponse_Transcript",
    "SttEncoding",
    "TranscriptMessage",
    "TranscriptionResponse",
]