cartesia-2.0.4-py3-none-any.whl → cartesia-2.0.6-py3-none-any.whl
This diff shows the changes between publicly released versions of the package as they appear in their public registry, and is provided for informational purposes only.
- cartesia/__init__.py +60 -1
- cartesia/auth/client.py +8 -8
- cartesia/auth/requests/token_grant.py +7 -1
- cartesia/auth/requests/token_request.py +3 -3
- cartesia/auth/types/token_grant.py +7 -2
- cartesia/auth/types/token_request.py +3 -3
- cartesia/base_client.py +2 -0
- cartesia/client.py +5 -0
- cartesia/core/client_wrapper.py +1 -1
- cartesia/stt/__init__.py +57 -0
- cartesia/stt/_async_websocket.py +293 -0
- cartesia/stt/_websocket.py +294 -0
- cartesia/stt/client.py +456 -0
- cartesia/stt/requests/__init__.py +29 -0
- cartesia/stt/requests/done_message.py +14 -0
- cartesia/stt/requests/error_message.py +16 -0
- cartesia/stt/requests/flush_done_message.py +14 -0
- cartesia/stt/requests/streaming_transcription_response.py +41 -0
- cartesia/stt/requests/transcript_message.py +40 -0
- cartesia/stt/requests/transcription_response.py +28 -0
- cartesia/stt/requests/transcription_word.py +20 -0
- cartesia/stt/socket_client.py +138 -0
- cartesia/stt/types/__init__.py +33 -0
- cartesia/stt/types/done_message.py +26 -0
- cartesia/stt/types/error_message.py +27 -0
- cartesia/stt/types/flush_done_message.py +26 -0
- cartesia/stt/types/streaming_transcription_response.py +94 -0
- cartesia/stt/types/stt_encoding.py +7 -0
- cartesia/stt/types/timestamp_granularity.py +5 -0
- cartesia/stt/types/transcript_message.py +50 -0
- cartesia/stt/types/transcription_response.py +38 -0
- cartesia/stt/types/transcription_word.py +32 -0
- cartesia/tts/__init__.py +8 -0
- cartesia/tts/client.py +50 -8
- cartesia/tts/requests/__init__.py +4 -0
- cartesia/tts/requests/generation_request.py +4 -4
- cartesia/tts/requests/sse_output_format.py +11 -0
- cartesia/tts/requests/ttssse_request.py +47 -0
- cartesia/tts/requests/web_socket_chunk_response.py +0 -3
- cartesia/tts/requests/web_socket_response.py +1 -2
- cartesia/tts/requests/web_socket_tts_request.py +9 -1
- cartesia/tts/types/__init__.py +4 -0
- cartesia/tts/types/generation_request.py +4 -4
- cartesia/tts/types/sse_output_format.py +22 -0
- cartesia/tts/types/ttssse_request.py +58 -0
- cartesia/tts/types/web_socket_chunk_response.py +1 -3
- cartesia/tts/types/web_socket_response.py +1 -2
- cartesia/tts/types/web_socket_tts_request.py +11 -3
- cartesia/voice_changer/requests/streaming_response.py +0 -2
- cartesia/voice_changer/types/streaming_response.py +0 -2
- {cartesia-2.0.4.dist-info → cartesia-2.0.6.dist-info}/METADATA +256 -2
- {cartesia-2.0.4.dist-info → cartesia-2.0.6.dist-info}/RECORD +53 -26
- {cartesia-2.0.4.dist-info → cartesia-2.0.6.dist-info}/WHEEL +0 -0
cartesia/stt/_websocket.py
@@ -0,0 +1,294 @@
import json
import typing
import uuid
from typing import Any, Dict, Generator, Optional, Union

try:
    from websockets.sync.client import connect
    IS_WEBSOCKET_SYNC_AVAILABLE = True
except ImportError:
    IS_WEBSOCKET_SYNC_AVAILABLE = False

from cartesia.stt.types import (
    StreamingTranscriptionResponse,
    StreamingTranscriptionResponse_Error,
    StreamingTranscriptionResponse_Transcript,
)
from cartesia.stt.types.stt_encoding import SttEncoding

from ..core.pydantic_utilities import parse_obj_as


class SttWebsocket:
    """This class contains methods to transcribe audio using WebSocket. Ideal for real-time speech transcription.

    Usage:
        >>> ws = client.stt.websocket()
        >>> for audio_chunk in audio_chunks:
        ...     ws.send(audio_chunk)
        >>> ws.send("finalize")  # Flush remaining audio
        >>> ws.send("done")  # Close session
        >>> for transcription in ws.receive():
        ...     print(transcription["text"])
    """

    def __init__(
        self,
        ws_url: str,
        api_key: str,
        cartesia_version: str,
    ):
        self.ws_url = ws_url
        self.api_key = api_key
        self.cartesia_version = cartesia_version
        self.websocket: Optional[Any] = None
        self._is_listening = False
        # Store default connection parameters for auto-connect with proper typing
        self._default_model: str = "ink-whisper"
        self._default_language: Optional[str] = "en"
        self._default_encoding: SttEncoding = "pcm_s16le"
        self._default_sample_rate: int = 16000
        self._default_min_volume: Optional[float] = None
        self._default_max_silence_duration_secs: Optional[float] = None

    def __del__(self):
        try:
            self.close()
        except Exception as e:
            raise RuntimeError("Failed to close WebSocket: ", e)

    def connect(
        self,
        *,
        model: str = "ink-whisper",
        language: Optional[str] = "en",
        encoding: SttEncoding = "pcm_s16le",
        sample_rate: int = 16000,
        min_volume: Optional[float] = None,
        max_silence_duration_secs: Optional[float] = None,
    ):
        """Connect to the STT WebSocket with the specified parameters.

        Args:
            model: ID of the model to use for transcription
            language: The language of the input audio in ISO-639-1 format
            encoding: The encoding format of the audio data (required)
            sample_rate: The sample rate of the audio in Hz (required)
            min_volume: Volume threshold for voice activity detection (0.0-1.0)
            max_silence_duration_secs: Maximum duration of silence before endpointing

        Raises:
            RuntimeError: If the connection to the WebSocket fails.
        """
        # Store parameters for future auto-connects
        self._default_model = model
        self._default_language = language
        self._default_encoding = encoding
        self._default_sample_rate = sample_rate
        self._default_min_volume = min_volume
        self._default_max_silence_duration_secs = max_silence_duration_secs

        if not IS_WEBSOCKET_SYNC_AVAILABLE:
            raise ImportError(
                "The synchronous WebSocket client is not available. Please ensure that you have 'websockets>=12.0' or compatible version installed."
            )
        if self.websocket is None or self._is_websocket_closed():
            route = "stt/websocket"
            params = {
                "model": model,
                "api_key": self.api_key,
                "cartesia_version": self.cartesia_version,
                "encoding": encoding,
                "sample_rate": str(sample_rate),
            }
            if language is not None:
                params["language"] = language
            if min_volume is not None:
                params["min_volume"] = str(min_volume)
            if max_silence_duration_secs is not None:
                params["max_silence_duration_secs"] = str(max_silence_duration_secs)

            query_string = "&".join([f"{k}={v}" for k, v in params.items()])
            url = f"{self.ws_url}/{route}?{query_string}"

            try:
                self.websocket = connect(url)
            except Exception as e:
                status_code = None
                error_message = str(e)

                if hasattr(e, 'status') and e.status is not None:
                    status_code = e.status

                    if status_code == 402:
                        error_message = "Payment required. Your API key may have insufficient credits or permissions."
                    elif status_code == 401:
                        error_message = "Unauthorized. Please check your API key."
                    elif status_code == 403:
                        error_message = "Forbidden. You don't have permission to access this resource."
                    elif status_code == 404:
                        error_message = "Not found. The requested resource doesn't exist."

                    raise RuntimeError(f"Failed to connect to WebSocket.\nStatus: {status_code}. Error message: {error_message}")
                else:
                    raise RuntimeError(f"Failed to connect to WebSocket. {e}")

    def _is_websocket_closed(self):
        return self.websocket is None or (hasattr(self.websocket, 'socket') and self.websocket.socket.fileno() == -1)

    def close(self):
        """This method closes the WebSocket connection. Highly recommended to call this method when done using the WebSocket."""
        if self.websocket and not self._is_websocket_closed():
            self.websocket.close()

    def send(self, data: Union[bytes, str]):
        """Send audio data or control commands to the WebSocket.

        Args:
            data: Binary audio data or text command ("finalize" or "done")
        """
        # Auto-connect if not connected, like TTS does
        if self.websocket is None or self._is_websocket_closed():
            self.connect(
                model=self._default_model,
                language=self._default_language,
                encoding=self._default_encoding,
                sample_rate=self._default_sample_rate,
                min_volume=self._default_min_volume,
                max_silence_duration_secs=self._default_max_silence_duration_secs,
            )

        assert self.websocket is not None, "WebSocket should be connected after connect() call"

        if isinstance(data, bytes):
            self.websocket.send(data)
        elif isinstance(data, str):
            self.websocket.send(data)
        else:
            raise TypeError("Data must be bytes (audio) or str (command)")

    def receive(self) -> Generator[Dict[str, Any], None, None]:
        """Receive transcription results from the WebSocket.

        Yields:
            Dictionary containing transcription results, flush_done, done, or error messages
        """
        # Auto-connect if not connected, like TTS does
        if self.websocket is None or self._is_websocket_closed():
            self.connect(
                model=self._default_model,
                language=self._default_language,
                encoding=self._default_encoding,
                sample_rate=self._default_sample_rate,
                min_volume=self._default_min_volume,
                max_silence_duration_secs=self._default_max_silence_duration_secs,
            )

        assert self.websocket is not None, "WebSocket should be connected after connect() call"

        try:
            while True:
                try:
                    message = self.websocket.recv()
                    if isinstance(message, str):
                        raw_data = json.loads(message)

                        # Handle error responses
                        if raw_data.get("type") == "error":
                            raise RuntimeError(f"Error transcribing audio: {raw_data.get('message', 'Unknown error')}")

                        # Handle transcript responses with flexible parsing
                        if raw_data.get("type") == "transcript":
                            # Provide defaults for missing required fields
                            result = {
                                "type": raw_data["type"],
                                "request_id": raw_data.get("request_id", ""),
                                "text": raw_data.get("text", ""),  # Default to empty string if missing
                                "is_final": raw_data.get("is_final", False),  # Default to False if missing
                            }

                            # Add optional fields if present
                            if "duration" in raw_data:
                                result["duration"] = raw_data["duration"]
                            if "language" in raw_data:
                                result["language"] = raw_data["language"]
                            if "words" in raw_data:
                                result["words"] = raw_data["words"]

                            yield result

                        # Handle flush_done acknowledgment
                        elif raw_data.get("type") == "flush_done":
                            result = {
                                "type": raw_data["type"],
                                "request_id": raw_data.get("request_id", ""),
                            }
                            yield result

                        # Handle done acknowledgment
                        elif raw_data.get("type") == "done":
                            result = {
                                "type": raw_data["type"],
                                "request_id": raw_data.get("request_id", ""),
                            }
                            yield result
                            break  # Exit the loop when done

                except Exception as e:
                    if "Connection closed" in str(e) or "no active connection" in str(e):
                        break  # WebSocket was closed
                    raise e  # Re-raise other exceptions
        except KeyboardInterrupt:
            self.close()
            raise

    def transcribe(
        self,
        audio_chunks: typing.Iterator[bytes],
        *,
        model: str = "ink-whisper",
        language: Optional[str] = "en",
        encoding: SttEncoding = "pcm_s16le",
        sample_rate: int = 16000,
        min_volume: Optional[float] = None,
        max_silence_duration_secs: Optional[float] = None,
    ) -> Generator[Dict[str, Any], None, None]:
        """Transcribe audio chunks using the WebSocket.

        Args:
            audio_chunks: Iterator of audio chunks as bytes
            model: ID of the model to use for transcription
            language: The language of the input audio in ISO-639-1 format
            encoding: The encoding format of the audio data (required)
            sample_rate: The sample rate of the audio in Hz (required)
            min_volume: Volume threshold for voice activity detection (0.0-1.0)
            max_silence_duration_secs: Maximum duration of silence before endpointing

        Yields:
            Dictionary containing transcription results, flush_done, done, or error messages
        """
        self.connect(
            model=model,
            language=language,
            encoding=encoding,
            sample_rate=sample_rate,
            min_volume=min_volume,
            max_silence_duration_secs=max_silence_duration_secs,
        )

        try:
            # Send all audio chunks
            for chunk in audio_chunks:
                self.send(chunk)

            # Send finalize command to flush remaining audio
            self.send("finalize")

            # Send done command to close session cleanly
            self.send("done")

            # Receive all responses until done
            yield from self.receive()

        finally:
            self.close()