cartesia-2.0.4-py3-none-any.whl → cartesia-2.0.6-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53)
  1. cartesia/__init__.py +60 -1
  2. cartesia/auth/client.py +8 -8
  3. cartesia/auth/requests/token_grant.py +7 -1
  4. cartesia/auth/requests/token_request.py +3 -3
  5. cartesia/auth/types/token_grant.py +7 -2
  6. cartesia/auth/types/token_request.py +3 -3
  7. cartesia/base_client.py +2 -0
  8. cartesia/client.py +5 -0
  9. cartesia/core/client_wrapper.py +1 -1
  10. cartesia/stt/__init__.py +57 -0
  11. cartesia/stt/_async_websocket.py +293 -0
  12. cartesia/stt/_websocket.py +294 -0
  13. cartesia/stt/client.py +456 -0
  14. cartesia/stt/requests/__init__.py +29 -0
  15. cartesia/stt/requests/done_message.py +14 -0
  16. cartesia/stt/requests/error_message.py +16 -0
  17. cartesia/stt/requests/flush_done_message.py +14 -0
  18. cartesia/stt/requests/streaming_transcription_response.py +41 -0
  19. cartesia/stt/requests/transcript_message.py +40 -0
  20. cartesia/stt/requests/transcription_response.py +28 -0
  21. cartesia/stt/requests/transcription_word.py +20 -0
  22. cartesia/stt/socket_client.py +138 -0
  23. cartesia/stt/types/__init__.py +33 -0
  24. cartesia/stt/types/done_message.py +26 -0
  25. cartesia/stt/types/error_message.py +27 -0
  26. cartesia/stt/types/flush_done_message.py +26 -0
  27. cartesia/stt/types/streaming_transcription_response.py +94 -0
  28. cartesia/stt/types/stt_encoding.py +7 -0
  29. cartesia/stt/types/timestamp_granularity.py +5 -0
  30. cartesia/stt/types/transcript_message.py +50 -0
  31. cartesia/stt/types/transcription_response.py +38 -0
  32. cartesia/stt/types/transcription_word.py +32 -0
  33. cartesia/tts/__init__.py +8 -0
  34. cartesia/tts/client.py +50 -8
  35. cartesia/tts/requests/__init__.py +4 -0
  36. cartesia/tts/requests/generation_request.py +4 -4
  37. cartesia/tts/requests/sse_output_format.py +11 -0
  38. cartesia/tts/requests/ttssse_request.py +47 -0
  39. cartesia/tts/requests/web_socket_chunk_response.py +0 -3
  40. cartesia/tts/requests/web_socket_response.py +1 -2
  41. cartesia/tts/requests/web_socket_tts_request.py +9 -1
  42. cartesia/tts/types/__init__.py +4 -0
  43. cartesia/tts/types/generation_request.py +4 -4
  44. cartesia/tts/types/sse_output_format.py +22 -0
  45. cartesia/tts/types/ttssse_request.py +58 -0
  46. cartesia/tts/types/web_socket_chunk_response.py +1 -3
  47. cartesia/tts/types/web_socket_response.py +1 -2
  48. cartesia/tts/types/web_socket_tts_request.py +11 -3
  49. cartesia/voice_changer/requests/streaming_response.py +0 -2
  50. cartesia/voice_changer/types/streaming_response.py +0 -2
  51. {cartesia-2.0.4.dist-info → cartesia-2.0.6.dist-info}/METADATA +256 -2
  52. {cartesia-2.0.4.dist-info → cartesia-2.0.6.dist-info}/RECORD +53 -26
  53. {cartesia-2.0.4.dist-info → cartesia-2.0.6.dist-info}/WHEEL +0 -0
cartesia/stt/_websocket.py (new file)
@@ -0,0 +1,294 @@
+import json
+import typing
+import uuid
+from typing import Any, Dict, Generator, Optional, Union
+
+try:
+    from websockets.sync.client import connect
+    IS_WEBSOCKET_SYNC_AVAILABLE = True
+except ImportError:
+    IS_WEBSOCKET_SYNC_AVAILABLE = False
+
+from cartesia.stt.types import (
+    StreamingTranscriptionResponse,
+    StreamingTranscriptionResponse_Error,
+    StreamingTranscriptionResponse_Transcript,
+)
+from cartesia.stt.types.stt_encoding import SttEncoding
+
+from ..core.pydantic_utilities import parse_obj_as
+
+
+class SttWebsocket:
+    """This class contains methods to transcribe audio using WebSocket. Ideal for real-time speech transcription.
+
+    Usage:
+        >>> ws = client.stt.websocket()
+        >>> for audio_chunk in audio_chunks:
+        ...     ws.send(audio_chunk)
+        >>> ws.send("finalize")  # Flush remaining audio
+        >>> ws.send("done")  # Close session
+        >>> for transcription in ws.receive():
+        ...     print(transcription["text"])
+    """
+
+    def __init__(
+        self,
+        ws_url: str,
+        api_key: str,
+        cartesia_version: str,
+    ):
+        self.ws_url = ws_url
+        self.api_key = api_key
+        self.cartesia_version = cartesia_version
+        self.websocket: Optional[Any] = None
+        self._is_listening = False
+        # Store default connection parameters for auto-connect with proper typing
+        self._default_model: str = "ink-whisper"
+        self._default_language: Optional[str] = "en"
+        self._default_encoding: SttEncoding = "pcm_s16le"
+        self._default_sample_rate: int = 16000
+        self._default_min_volume: Optional[float] = None
+        self._default_max_silence_duration_secs: Optional[float] = None
+
+    def __del__(self):
+        try:
+            self.close()
+        except Exception as e:
+            raise RuntimeError("Failed to close WebSocket: ", e)
+
+    def connect(
+        self,
+        *,
+        model: str = "ink-whisper",
+        language: Optional[str] = "en",
+        encoding: SttEncoding = "pcm_s16le",
+        sample_rate: int = 16000,
+        min_volume: Optional[float] = None,
+        max_silence_duration_secs: Optional[float] = None,
+    ):
+        """Connect to the STT WebSocket with the specified parameters.
+
+        Args:
+            model: ID of the model to use for transcription
+            language: The language of the input audio in ISO-639-1 format
+            encoding: The encoding format of the audio data (required)
+            sample_rate: The sample rate of the audio in Hz (required)
+            min_volume: Volume threshold for voice activity detection (0.0-1.0)
+            max_silence_duration_secs: Maximum duration of silence before endpointing
+
+        Raises:
+            RuntimeError: If the connection to the WebSocket fails.
+        """
+        # Store parameters for future auto-connects
+        self._default_model = model
+        self._default_language = language
+        self._default_encoding = encoding
+        self._default_sample_rate = sample_rate
+        self._default_min_volume = min_volume
+        self._default_max_silence_duration_secs = max_silence_duration_secs
+
+        if not IS_WEBSOCKET_SYNC_AVAILABLE:
+            raise ImportError(
+                "The synchronous WebSocket client is not available. Please ensure that you have 'websockets>=12.0' or compatible version installed."
+            )
+        if self.websocket is None or self._is_websocket_closed():
+            route = "stt/websocket"
+            params = {
+                "model": model,
+                "api_key": self.api_key,
+                "cartesia_version": self.cartesia_version,
+                "encoding": encoding,
+                "sample_rate": str(sample_rate),
+            }
+            if language is not None:
+                params["language"] = language
+            if min_volume is not None:
+                params["min_volume"] = str(min_volume)
+            if max_silence_duration_secs is not None:
+                params["max_silence_duration_secs"] = str(max_silence_duration_secs)
+
+            query_string = "&".join([f"{k}={v}" for k, v in params.items()])
+            url = f"{self.ws_url}/{route}?{query_string}"
+
+            try:
+                self.websocket = connect(url)
+            except Exception as e:
+                status_code = None
+                error_message = str(e)
+
+                if hasattr(e, 'status') and e.status is not None:
+                    status_code = e.status
+
+                    if status_code == 402:
+                        error_message = "Payment required. Your API key may have insufficient credits or permissions."
+                    elif status_code == 401:
+                        error_message = "Unauthorized. Please check your API key."
+                    elif status_code == 403:
+                        error_message = "Forbidden. You don't have permission to access this resource."
+                    elif status_code == 404:
+                        error_message = "Not found. The requested resource doesn't exist."
+
+                    raise RuntimeError(f"Failed to connect to WebSocket.\nStatus: {status_code}. Error message: {error_message}")
+                else:
+                    raise RuntimeError(f"Failed to connect to WebSocket. {e}")
+
+    def _is_websocket_closed(self):
+        return self.websocket is None or (hasattr(self.websocket, 'socket') and self.websocket.socket.fileno() == -1)
+
+    def close(self):
+        """This method closes the WebSocket connection. Highly recommended to call this method when done using the WebSocket."""
+        if self.websocket and not self._is_websocket_closed():
+            self.websocket.close()
+
+    def send(self, data: Union[bytes, str]):
+        """Send audio data or control commands to the WebSocket.
+
+        Args:
+            data: Binary audio data or text command ("finalize" or "done")
+        """
+        # Auto-connect if not connected, like TTS does
+        if self.websocket is None or self._is_websocket_closed():
+            self.connect(
+                model=self._default_model,
+                language=self._default_language,
+                encoding=self._default_encoding,
+                sample_rate=self._default_sample_rate,
+                min_volume=self._default_min_volume,
+                max_silence_duration_secs=self._default_max_silence_duration_secs,
+            )
+
+        assert self.websocket is not None, "WebSocket should be connected after connect() call"
+
+        if isinstance(data, bytes):
+            self.websocket.send(data)
+        elif isinstance(data, str):
+            self.websocket.send(data)
+        else:
+            raise TypeError("Data must be bytes (audio) or str (command)")
+
+    def receive(self) -> Generator[Dict[str, Any], None, None]:
+        """Receive transcription results from the WebSocket.
+
+        Yields:
+            Dictionary containing transcription results, flush_done, done, or error messages
+        """
+        # Auto-connect if not connected, like TTS does
+        if self.websocket is None or self._is_websocket_closed():
+            self.connect(
+                model=self._default_model,
+                language=self._default_language,
+                encoding=self._default_encoding,
+                sample_rate=self._default_sample_rate,
+                min_volume=self._default_min_volume,
+                max_silence_duration_secs=self._default_max_silence_duration_secs,
+            )
+
+        assert self.websocket is not None, "WebSocket should be connected after connect() call"
+
+        try:
+            while True:
+                try:
+                    message = self.websocket.recv()
+                    if isinstance(message, str):
+                        raw_data = json.loads(message)
+
+                        # Handle error responses
+                        if raw_data.get("type") == "error":
+                            raise RuntimeError(f"Error transcribing audio: {raw_data.get('message', 'Unknown error')}")
+
+                        # Handle transcript responses with flexible parsing
+                        if raw_data.get("type") == "transcript":
+                            # Provide defaults for missing required fields
+                            result = {
+                                "type": raw_data["type"],
+                                "request_id": raw_data.get("request_id", ""),
+                                "text": raw_data.get("text", ""),  # Default to empty string if missing
+                                "is_final": raw_data.get("is_final", False),  # Default to False if missing
+                            }
+
+                            # Add optional fields if present
+                            if "duration" in raw_data:
+                                result["duration"] = raw_data["duration"]
+                            if "language" in raw_data:
+                                result["language"] = raw_data["language"]
+                            if "words" in raw_data:
+                                result["words"] = raw_data["words"]
+
+                            yield result
+
+                        # Handle flush_done acknowledgment
+                        elif raw_data.get("type") == "flush_done":
+                            result = {
+                                "type": raw_data["type"],
+                                "request_id": raw_data.get("request_id", ""),
+                            }
+                            yield result
+
+                        # Handle done acknowledgment
+                        elif raw_data.get("type") == "done":
+                            result = {
+                                "type": raw_data["type"],
+                                "request_id": raw_data.get("request_id", ""),
+                            }
+                            yield result
+                            break  # Exit the loop when done
+
+                except Exception as e:
+                    if "Connection closed" in str(e) or "no active connection" in str(e):
+                        break  # WebSocket was closed
+                    raise e  # Re-raise other exceptions
+        except KeyboardInterrupt:
+            self.close()
+            raise
+
+    def transcribe(
+        self,
+        audio_chunks: typing.Iterator[bytes],
+        *,
+        model: str = "ink-whisper",
+        language: Optional[str] = "en",
+        encoding: SttEncoding = "pcm_s16le",
+        sample_rate: int = 16000,
+        min_volume: Optional[float] = None,
+        max_silence_duration_secs: Optional[float] = None,
+    ) -> Generator[Dict[str, Any], None, None]:
+        """Transcribe audio chunks using the WebSocket.
+
+        Args:
+            audio_chunks: Iterator of audio chunks as bytes
+            model: ID of the model to use for transcription
+            language: The language of the input audio in ISO-639-1 format
+            encoding: The encoding format of the audio data (required)
+            sample_rate: The sample rate of the audio in Hz (required)
+            min_volume: Volume threshold for voice activity detection (0.0-1.0)
+            max_silence_duration_secs: Maximum duration of silence before endpointing
+
+        Yields:
+            Dictionary containing transcription results, flush_done, done, or error messages
+        """
+        self.connect(
+            model=model,
+            language=language,
+            encoding=encoding,
+            sample_rate=sample_rate,
+            min_volume=min_volume,
+            max_silence_duration_secs=max_silence_duration_secs,
+        )
+
+        try:
+            # Send all audio chunks
+            for chunk in audio_chunks:
+                self.send(chunk)
+
+            # Send finalize command to flush remaining audio
+            self.send("finalize")
+
+            # Send done command to close session cleanly
+            self.send("done")
+
+            # Receive all responses until done
+            yield from self.receive()
+
+        finally:
+            self.close()
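
The class docstring above already sketches the intended call pattern (stream audio bytes, send "finalize", send "done", then read results). A minimal end-to-end sketch using the convenience transcribe() helper is shown below; the Cartesia(api_key=...) constructor usage, the raw 16 kHz 16-bit mono PCM file, and the 3200-byte chunk size are illustrative assumptions, not part of this diff:

    from cartesia import Cartesia

    client = Cartesia(api_key="your-api-key")  # assumed: existing top-level client, unchanged in this release
    ws = client.stt.websocket()  # factory shown in the class docstring / socket_client.py

    def audio_chunks():
        # Assumption: audio.raw contains 16 kHz, 16-bit little-endian mono PCM (pcm_s16le).
        with open("audio.raw", "rb") as f:
            while chunk := f.read(3200):  # ~100 ms of audio per chunk at 16 kHz
                yield chunk

    # transcribe() connects, streams the chunks, sends "finalize" and "done",
    # then yields transcript/flush_done/done dictionaries until the session ends.
    for message in ws.transcribe(
        audio_chunks(),
        model="ink-whisper",
        language="en",
        encoding="pcm_s16le",
        sample_rate=16000,
    ):
        if message["type"] == "transcript" and message["is_final"]:
            print(message["text"])

No explicit ws.close() is needed on this path, since transcribe() closes the socket in its finally block; callers driving send()/receive() directly should close the WebSocket themselves, as the docstrings above recommend.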