cartesia 2.0.3__py3-none-any.whl → 2.0.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cartesia/__init__.py +46 -1
- cartesia/base_client.py +2 -0
- cartesia/client.py +5 -0
- cartesia/core/client_wrapper.py +1 -1
- cartesia/stt/__init__.py +51 -0
- cartesia/stt/_async_websocket.py +284 -0
- cartesia/stt/_websocket.py +272 -0
- cartesia/stt/requests/__init__.py +27 -0
- cartesia/stt/requests/done_message.py +14 -0
- cartesia/stt/requests/error_message.py +16 -0
- cartesia/stt/requests/flush_done_message.py +14 -0
- cartesia/stt/requests/streaming_transcription_response.py +39 -0
- cartesia/stt/requests/transcript_message.py +33 -0
- cartesia/stt/requests/transcription_response.py +21 -0
- cartesia/stt/socket_client.py +195 -0
- cartesia/stt/types/__init__.py +29 -0
- cartesia/stt/types/done_message.py +26 -0
- cartesia/stt/types/error_message.py +27 -0
- cartesia/stt/types/flush_done_message.py +26 -0
- cartesia/stt/types/streaming_transcription_response.py +92 -0
- cartesia/stt/types/stt_encoding.py +5 -0
- cartesia/stt/types/transcript_message.py +44 -0
- cartesia/stt/types/transcription_response.py +32 -0
- cartesia/tts/_websocket.py +3 -3
- {cartesia-2.0.3.dist-info → cartesia-2.0.5.dist-info}/METADATA +159 -2
- {cartesia-2.0.3.dist-info → cartesia-2.0.5.dist-info}/RECORD +27 -8
- {cartesia-2.0.3.dist-info → cartesia-2.0.5.dist-info}/WHEEL +0 -0
@@ -0,0 +1,272 @@
|
|
1
|
+
import json
|
2
|
+
import typing
|
3
|
+
import uuid
|
4
|
+
from typing import Any, Dict, Generator, Optional, Union
|
5
|
+
|
6
|
+
try:
|
7
|
+
from websockets.sync.client import connect
|
8
|
+
IS_WEBSOCKET_SYNC_AVAILABLE = True
|
9
|
+
except ImportError:
|
10
|
+
IS_WEBSOCKET_SYNC_AVAILABLE = False
|
11
|
+
|
12
|
+
from cartesia.stt.types import (
|
13
|
+
StreamingTranscriptionResponse,
|
14
|
+
StreamingTranscriptionResponse_Error,
|
15
|
+
StreamingTranscriptionResponse_Transcript,
|
16
|
+
)
|
17
|
+
|
18
|
+
from ..core.pydantic_utilities import parse_obj_as
|
19
|
+
|
20
|
+
|
21
|
+
class SttWebsocket:
    """Synchronous WebSocket client for real-time speech transcription (STT).

    Usage:
        >>> ws = client.stt.websocket()
        >>> for audio_chunk in audio_chunks:
        ...     ws.send(audio_chunk)
        >>> ws.send("finalize")  # Flush remaining audio
        >>> ws.send("done")  # Close session
        >>> for transcription in ws.receive():
        ...     print(transcription["text"])
    """

    def __init__(
        self,
        ws_url: str,
        api_key: str,
        cartesia_version: str,
    ):
        """
        Args:
            ws_url: Base WebSocket URL (``ws://`` or ``wss://`` scheme).
            api_key: API key sent as the ``api_key`` query parameter.
            cartesia_version: API version sent as the ``cartesia_version`` query parameter.
        """
        self.ws_url = ws_url
        self.api_key = api_key
        self.cartesia_version = cartesia_version
        self.websocket: Optional[Any] = None
        self._is_listening = False
        # Connection parameters remembered from the last connect() call so
        # send()/receive() can transparently auto-(re)connect.
        self._default_model: str = "ink-whisper"
        self._default_language: Optional[str] = "en"
        self._default_encoding: Optional[str] = "pcm_s16le"
        self._default_sample_rate: int = 16000

    def __del__(self):
        # Best-effort cleanup only. Never raise from __del__: exceptions here
        # are ignored by the interpreter (merely printed to stderr), and
        # __del__ may run during interpreter shutdown when raising is unsafe.
        try:
            self.close()
        except Exception:
            pass

    def connect(
        self,
        *,
        model: str = "ink-whisper",
        language: Optional[str] = "en",
        encoding: Optional[str] = "pcm_s16le",
        sample_rate: int = 16000,
    ):
        """Connect to the STT WebSocket with the specified parameters.

        Args:
            model: ID of the model to use for transcription.
            language: The language of the input audio in ISO-639-1 format.
            encoding: The encoding format of the audio data.
            sample_rate: The sample rate of the audio in Hz.

        Raises:
            ImportError: If a compatible synchronous websockets client is not installed.
            RuntimeError: If the connection to the WebSocket fails.
        """
        from urllib.parse import urlencode

        # Store parameters for future auto-connects.
        self._default_model = model
        self._default_language = language
        self._default_encoding = encoding
        self._default_sample_rate = sample_rate

        if not IS_WEBSOCKET_SYNC_AVAILABLE:
            raise ImportError(
                "The synchronous WebSocket client is not available. Please ensure that you have 'websockets>=12.0' or compatible version installed."
            )
        if self.websocket is None or self._is_websocket_closed():
            route = "stt/websocket"
            params: Dict[str, str] = {
                "model": model,
                "api_key": self.api_key,
                "cartesia_version": self.cartesia_version,
            }
            if language is not None:
                params["language"] = language
            if encoding is not None:
                params["encoding"] = encoding
            if sample_rate is not None:
                params["sample_rate"] = str(sample_rate)

            # urlencode() percent-escapes keys and values; the hand-rolled
            # "&".join it replaces produced a malformed URL whenever a value
            # (e.g. the API key) contained reserved characters.
            url = f"{self.ws_url}/{route}?{urlencode(params)}"

            try:
                self.websocket = connect(url)
            except Exception as e:
                # The websockets handshake exceptions may carry the HTTP
                # status of the rejected upgrade; surface a friendlier message
                # for the common auth/billing failures.
                status_code = getattr(e, "status", None)
                if status_code is not None:
                    error_message = str(e)
                    if status_code == 402:
                        error_message = "Payment required. Your API key may have insufficient credits or permissions."
                    elif status_code == 401:
                        error_message = "Unauthorized. Please check your API key."
                    elif status_code == 403:
                        error_message = "Forbidden. You don't have permission to access this resource."
                    elif status_code == 404:
                        error_message = "Not found. The requested resource doesn't exist."
                    raise RuntimeError(
                        f"Failed to connect to WebSocket.\nStatus: {status_code}. Error message: {error_message}"
                    ) from e
                raise RuntimeError(f"Failed to connect to WebSocket. {e}") from e

    def _is_websocket_closed(self):
        # The sync websockets client exposes its underlying socket; a closed
        # socket reports fileno() == -1.
        return self.websocket is None or (hasattr(self.websocket, 'socket') and self.websocket.socket.fileno() == -1)

    def close(self):
        """This method closes the WebSocket connection. Highly recommended to call this method when done using the WebSocket."""
        if self.websocket and not self._is_websocket_closed():
            self.websocket.close()

    def _ensure_connected(self) -> None:
        """Auto-connect with the stored default parameters if needed (mirrors TTS behavior)."""
        if self.websocket is None or self._is_websocket_closed():
            self.connect(
                model=self._default_model,
                language=self._default_language,
                encoding=self._default_encoding,
                sample_rate=self._default_sample_rate,
            )
        assert self.websocket is not None, "WebSocket should be connected after connect() call"

    def send(self, data: Union[bytes, str]):
        """Send audio data or control commands to the WebSocket.

        Args:
            data: Binary audio data or text command ("finalize" or "done").

        Raises:
            TypeError: If ``data`` is neither bytes nor str.
        """
        self._ensure_connected()
        # bytes and str frames are both forwarded verbatim; anything else is
        # a caller error. (The original had two byte-identical isinstance
        # branches — collapsed into one check.)
        if not isinstance(data, (bytes, str)):
            raise TypeError("Data must be bytes (audio) or str (command)")
        self.websocket.send(data)

    def receive(self) -> Generator[Dict[str, Any], None, None]:
        """Receive transcription results from the WebSocket.

        Yields:
            Dictionary containing transcription results, flush_done, or done
            messages. Iteration ends after the "done" acknowledgment.

        Raises:
            RuntimeError: On a server-reported error or a transport failure;
                the connection is closed before re-raising.
        """
        self._ensure_connected()

        try:
            while True:
                message = self.websocket.recv()
                if not isinstance(message, str):
                    # Only text (JSON) frames are expected from the server.
                    continue
                raw_data = json.loads(message)
                msg_type = raw_data.get("type")

                # Server-reported error: surface it directly.
                if msg_type == "error":
                    raise RuntimeError(f"Error transcribing audio: {raw_data.get('message', 'Unknown error')}")

                if msg_type == "transcript":
                    # Be lenient about missing fields: default text/is_final
                    # so partial server payloads still yield a usable dict.
                    result = {
                        "type": raw_data["type"],
                        "request_id": raw_data.get("request_id", ""),
                        "text": raw_data.get("text", ""),
                        "is_final": raw_data.get("is_final", False),
                    }
                    if "duration" in raw_data:
                        result["duration"] = raw_data["duration"]
                    if "language" in raw_data:
                        result["language"] = raw_data["language"]
                    yield result
                elif msg_type == "flush_done":
                    yield {
                        "type": raw_data["type"],
                        "request_id": raw_data.get("request_id", ""),
                    }
                elif msg_type == "done":
                    yield {
                        "type": raw_data["type"],
                        "request_id": raw_data.get("request_id", ""),
                    }
                    # Session is complete, stop iterating.
                    break
        except RuntimeError:
            # Our own error raises above are already descriptive; close and
            # propagate without wrapping them a second (and third) time, as
            # the previous nested try/except structure did.
            self.close()
            raise
        except Exception as e:
            self.close()
            raise RuntimeError(f"Failed to receive transcription. {e}") from e

    def transcribe(
        self,
        audio_chunks: typing.Iterator[bytes],
        *,
        model: str = "ink-whisper",
        language: Optional[str] = "en",
        encoding: Optional[str] = "pcm_s16le",
        sample_rate: int = 16000,
    ) -> Generator[Dict[str, Any], None, None]:
        """Transcribe audio chunks using the WebSocket.

        Args:
            audio_chunks: Iterator of audio chunks as bytes.
            model: ID of the model to use for transcription.
            language: The language of the input audio in ISO-639-1 format.
            encoding: The encoding format of the audio data.
            sample_rate: The sample rate of the audio in Hz.

        Yields:
            Dictionary containing transcription results, flush_done, done, or error messages.
        """
        self.connect(
            model=model,
            language=language,
            encoding=encoding,
            sample_rate=sample_rate,
        )

        try:
            # Send all audio chunks.
            for chunk in audio_chunks:
                self.send(chunk)

            # Flush remaining buffered audio, then close the session cleanly.
            self.send("finalize")
            self.send("done")

            # Drain responses until the "done" acknowledgment.
            yield from self.receive()
        finally:
            self.close()
|
@@ -0,0 +1,27 @@
|
|
1
|
+
# This file was auto-generated by Fern from our API Definition.
|
2
|
+
|
3
|
+
from .done_message import DoneMessageParams
|
4
|
+
from .error_message import ErrorMessageParams
|
5
|
+
from .flush_done_message import FlushDoneMessageParams
|
6
|
+
from .streaming_transcription_response import (
|
7
|
+
StreamingTranscriptionResponseParams,
|
8
|
+
StreamingTranscriptionResponse_DoneParams,
|
9
|
+
StreamingTranscriptionResponse_ErrorParams,
|
10
|
+
StreamingTranscriptionResponse_FlushDoneParams,
|
11
|
+
StreamingTranscriptionResponse_TranscriptParams,
|
12
|
+
)
|
13
|
+
from .transcript_message import TranscriptMessageParams
|
14
|
+
from .transcription_response import TranscriptionResponseParams
|
15
|
+
|
16
|
+
__all__ = [
|
17
|
+
"DoneMessageParams",
|
18
|
+
"ErrorMessageParams",
|
19
|
+
"FlushDoneMessageParams",
|
20
|
+
"StreamingTranscriptionResponseParams",
|
21
|
+
"StreamingTranscriptionResponse_DoneParams",
|
22
|
+
"StreamingTranscriptionResponse_ErrorParams",
|
23
|
+
"StreamingTranscriptionResponse_FlushDoneParams",
|
24
|
+
"StreamingTranscriptionResponse_TranscriptParams",
|
25
|
+
"TranscriptMessageParams",
|
26
|
+
"TranscriptionResponseParams",
|
27
|
+
]
|
@@ -0,0 +1,14 @@
|
|
1
|
+
# This file was auto-generated by Fern from our API Definition.
|
2
|
+
|
3
|
+
import typing_extensions
|
4
|
+
|
5
|
+
|
6
|
+
class DoneMessageParams(typing_extensions.TypedDict):
    """Acknowledgment emitted in reply to a `done` command: the transcription
    session has finished and the WebSocket is about to close.
    """

    # Unique identifier for this transcription session.
    request_id: str
|
@@ -0,0 +1,16 @@
|
|
1
|
+
# This file was auto-generated by Fern from our API Definition.
|
2
|
+
|
3
|
+
import typing_extensions
|
4
|
+
import typing_extensions
|
5
|
+
|
6
|
+
|
7
|
+
class ErrorMessageParams(typing_extensions.TypedDict):
    """Error payload reported by the server over the STT WebSocket."""

    # The request ID associated with the error, if applicable.
    request_id: typing_extensions.NotRequired[str]

    # Human-readable error message describing what went wrong.
    message: str
|
@@ -0,0 +1,14 @@
|
|
1
|
+
# This file was auto-generated by Fern from our API Definition.
|
2
|
+
|
3
|
+
import typing_extensions
|
4
|
+
|
5
|
+
|
6
|
+
class FlushDoneMessageParams(typing_extensions.TypedDict):
    """Acknowledgment emitted in reply to a `finalize` command: all buffered
    audio has been flushed and processed.
    """

    # Unique identifier for this transcription session.
    request_id: str
|
@@ -0,0 +1,39 @@
|
|
1
|
+
# This file was auto-generated by Fern from our API Definition.
|
2
|
+
|
3
|
+
from __future__ import annotations
|
4
|
+
import typing_extensions
|
5
|
+
import typing
|
6
|
+
import typing_extensions
|
7
|
+
|
8
|
+
|
9
|
+
class StreamingTranscriptionResponse_TranscriptParams(typing_extensions.TypedDict):
    """Transcript variant of the streaming STT response (``type == "transcript"``)."""

    type: typing.Literal["transcript"]
    request_id: str
    text: str
    is_final: bool
    # Optional metadata, present only when the server includes it.
    duration: typing_extensions.NotRequired[float]
    language: typing_extensions.NotRequired[str]
|
16
|
+
|
17
|
+
|
18
|
+
class StreamingTranscriptionResponse_FlushDoneParams(typing_extensions.TypedDict):
    """Flush acknowledgment variant of the streaming STT response (``type == "flush_done"``)."""

    type: typing.Literal["flush_done"]
    request_id: str
|
21
|
+
|
22
|
+
|
23
|
+
class StreamingTranscriptionResponse_DoneParams(typing_extensions.TypedDict):
    """Session-complete variant of the streaming STT response (``type == "done"``)."""

    type: typing.Literal["done"]
    request_id: str
|
26
|
+
|
27
|
+
|
28
|
+
class StreamingTranscriptionResponse_ErrorParams(typing_extensions.TypedDict):
    """Error variant of the streaming STT response (``type == "error"``)."""

    type: typing.Literal["error"]
    # May be absent when the error is not tied to a specific request.
    request_id: typing_extensions.NotRequired[str]
    message: str
|
32
|
+
|
33
|
+
|
34
|
+
# Discriminated union over the `type` field of streaming STT responses.
StreamingTranscriptionResponseParams = typing.Union[
    StreamingTranscriptionResponse_TranscriptParams,
    StreamingTranscriptionResponse_FlushDoneParams,
    StreamingTranscriptionResponse_DoneParams,
    StreamingTranscriptionResponse_ErrorParams,
]
|
@@ -0,0 +1,33 @@
|
|
1
|
+
# This file was auto-generated by Fern from our API Definition.
|
2
|
+
|
3
|
+
import typing_extensions
|
4
|
+
import typing_extensions
|
5
|
+
|
6
|
+
|
7
|
+
class TranscriptMessageParams(typing_extensions.TypedDict):
    """A single (partial or final) transcript result for a session."""

    # Unique identifier for this transcription session.
    request_id: str

    # The transcribed text; partial or final depending on `is_final`.
    # Note: text may be empty in initial responses while the system is still
    # accumulating enough audio to transcribe — this is normal; wait for
    # non-empty text or monitor `is_final` for completion status.
    text: str

    # True for a final transcription result, False for an interim one.
    is_final: bool

    # Duration of the audio transcribed so far, in seconds.
    duration: typing_extensions.NotRequired[float]

    # The detected or specified language of the input audio.
    language: typing_extensions.NotRequired[str]
|
@@ -0,0 +1,21 @@
|
|
1
|
+
# This file was auto-generated by Fern from our API Definition.
|
2
|
+
|
3
|
+
import typing_extensions
|
4
|
+
import typing_extensions
|
5
|
+
|
6
|
+
|
7
|
+
class TranscriptionResponseParams(typing_extensions.TypedDict):
    """Transcription result payload."""

    # The transcribed text.
    text: str

    # The detected or specified language of the input audio.
    language: typing_extensions.NotRequired[str]

    # The duration of the input audio in seconds.
    duration: typing_extensions.NotRequired[float]
|
@@ -0,0 +1,195 @@
|
|
1
|
+
import typing
|
2
|
+
from typing import Any, Dict, Generator, Optional, Union
|
3
|
+
|
4
|
+
from ..core.client_wrapper import AsyncClientWrapper, SyncClientWrapper
|
5
|
+
from ._async_websocket import AsyncSttWebsocket
|
6
|
+
from ._websocket import SttWebsocket
|
7
|
+
|
8
|
+
|
9
|
+
class SttClientWithWebsocket:
    """
    Extension of STT functionality that supports a synchronous WebSocket STT connection.
    """

    def __init__(self, *, client_wrapper: SyncClientWrapper):
        self._client_wrapper = client_wrapper

    def _ws_url(self):
        """Derive the WebSocket endpoint from the configured HTTP base URL."""
        base_url = self._client_wrapper.get_base_url()
        if base_url.startswith(("ws://", "wss://")):
            return base_url
        # Plain ws:// for local development, wss:// everywhere else.
        scheme = "ws" if "localhost" in base_url else "wss"
        host = base_url.split("://")[-1]
        return f"{scheme}://{host}"

    def websocket(self, *,
                  model: str = "ink-whisper",
                  language: Optional[str] = "en",
                  encoding: Optional[str] = "pcm_s16le",
                  sample_rate: int = 16000):
        """Create a WebSocket connection for real-time speech transcription.

        Args:
            model: ID of the model to use for transcription.
            language: The language of the input audio in ISO-639-1 format.
            encoding: The encoding format of the audio data.
            sample_rate: The sample rate of the audio in Hz.

        Returns:
            SttWebsocket: A connected WebSocket client for STT operations.
        """
        headers = self._client_wrapper.get_headers()
        socket = SttWebsocket(
            ws_url=self._ws_url(),
            cartesia_version=headers["Cartesia-Version"],
            api_key=headers["X-API-Key"],
        )
        # Connect eagerly so the returned client is ready to use, mirroring
        # the TTS client's behavior.
        socket.connect(
            model=model,
            language=language,
            encoding=encoding,
            sample_rate=sample_rate,
        )
        return socket

    def transcribe(
        self,
        audio_chunks: typing.Iterator[bytes],
        *,
        model: str = "ink-whisper",
        language: Optional[str] = "en",
        encoding: Optional[str] = "pcm_s16le",
        sample_rate: int = 16000,
    ) -> Generator[Dict[str, Any], None, None]:
        """Transcribe audio chunks using WebSocket.

        Args:
            audio_chunks: Iterator of audio chunks as bytes.
            model: ID of the model to use for transcription.
            language: The language of the input audio in ISO-639-1 format.
            encoding: The encoding format of the audio data.
            sample_rate: The sample rate of the audio in Hz.

        Yields:
            Dictionary containing transcription results, flush_done, done, or error messages.

        Example:
            >>> client = Cartesia(api_key="your-api-key")
            >>> ws_client = client.stt.websocket()
            >>> for result in ws_client.transcribe(audio_chunks):
            ...     print(result["text"])
        """
        socket = self.websocket(
            model=model,
            language=language,
            encoding=encoding,
            sample_rate=sample_rate,
        )
        try:
            yield from socket.transcribe(
                audio_chunks,
                model=model,
                language=language,
                encoding=encoding,
                sample_rate=sample_rate,
            )
        finally:
            # Ensure the connection is released even if the caller abandons
            # the generator early.
            socket.close()
|
100
|
+
|
101
|
+
|
102
|
+
class AsyncSttClientWithWebsocket:
    """
    Extension of STT functionality that supports an asynchronous WebSocket STT connection.
    """

    def __init__(self, *, client_wrapper: AsyncClientWrapper, get_session):
        self._client_wrapper = client_wrapper
        self._get_session = get_session

    def _ws_url(self) -> str:
        """Derive the WebSocket endpoint from the configured HTTP base URL."""
        base_url = self._client_wrapper.get_base_url()
        if base_url.startswith(("ws://", "wss://")):
            return base_url
        # Plain ws:// for local development, wss:// everywhere else.
        scheme = "ws" if "localhost" in base_url else "wss"
        host = base_url.split("://")[-1]
        return f"{scheme}://{host}"

    async def websocket(self, *,
                        model: str = "ink-whisper",
                        language: Optional[str] = "en",
                        encoding: Optional[str] = "pcm_s16le",
                        sample_rate: int = 16000):
        """Create an async WebSocket connection for real-time speech transcription.

        Args:
            model: ID of the model to use for transcription.
            language: The language of the input audio in ISO-639-1 format.
            encoding: The encoding format of the audio data.
            sample_rate: The sample rate of the audio in Hz.

        Returns:
            AsyncSttWebsocket: A connected async WebSocket client for STT operations.
        """
        headers = self._client_wrapper.get_headers()
        socket = AsyncSttWebsocket(
            ws_url=self._ws_url(),
            cartesia_version=headers["Cartesia-Version"],
            api_key=headers["X-API-Key"],
            get_session=self._get_session,
        )
        # Connect eagerly so the returned client is ready to use, mirroring
        # the TTS client's behavior.
        await socket.connect(
            model=model,
            language=language,
            encoding=encoding,
            sample_rate=sample_rate,
        )
        return socket

    async def transcribe(
        self,
        audio_chunks: typing.AsyncIterator[bytes],
        *,
        model: str = "ink-whisper",
        language: Optional[str] = "en",
        encoding: Optional[str] = "pcm_s16le",
        sample_rate: int = 16000,
    ) -> typing.AsyncGenerator[Dict[str, Any], None]:
        """Transcribe audio chunks using async WebSocket.

        Args:
            audio_chunks: Async iterator of audio chunks as bytes.
            model: ID of the model to use for transcription.
            language: The language of the input audio in ISO-639-1 format.
            encoding: The encoding format of the audio data.
            sample_rate: The sample rate of the audio in Hz.

        Yields:
            Dictionary containing transcription results, flush_done, done, or error messages.

        Example:
            >>> client = AsyncCartesia(api_key="your-api-key")
            >>> ws_client = await client.stt.websocket()
            >>> async for result in ws_client.transcribe(audio_chunks):
            ...     print(result["text"])
        """
        socket = await self.websocket(
            model=model,
            language=language,
            encoding=encoding,
            sample_rate=sample_rate,
        )
        try:
            async for item in socket.transcribe(
                audio_chunks,
                model=model,
                language=language,
                encoding=encoding,
                sample_rate=sample_rate,
            ):
                yield item
        finally:
            # Ensure the connection is released even if the caller abandons
            # the generator early.
            await socket.close()
|
@@ -0,0 +1,29 @@
|
|
1
|
+
# This file was auto-generated by Fern from our API Definition.
|
2
|
+
|
3
|
+
from .done_message import DoneMessage
|
4
|
+
from .error_message import ErrorMessage
|
5
|
+
from .flush_done_message import FlushDoneMessage
|
6
|
+
from .streaming_transcription_response import (
|
7
|
+
StreamingTranscriptionResponse,
|
8
|
+
StreamingTranscriptionResponse_Done,
|
9
|
+
StreamingTranscriptionResponse_Error,
|
10
|
+
StreamingTranscriptionResponse_FlushDone,
|
11
|
+
StreamingTranscriptionResponse_Transcript,
|
12
|
+
)
|
13
|
+
from .stt_encoding import SttEncoding
|
14
|
+
from .transcript_message import TranscriptMessage
|
15
|
+
from .transcription_response import TranscriptionResponse
|
16
|
+
|
17
|
+
__all__ = [
|
18
|
+
"DoneMessage",
|
19
|
+
"ErrorMessage",
|
20
|
+
"FlushDoneMessage",
|
21
|
+
"StreamingTranscriptionResponse",
|
22
|
+
"StreamingTranscriptionResponse_Done",
|
23
|
+
"StreamingTranscriptionResponse_Error",
|
24
|
+
"StreamingTranscriptionResponse_FlushDone",
|
25
|
+
"StreamingTranscriptionResponse_Transcript",
|
26
|
+
"SttEncoding",
|
27
|
+
"TranscriptMessage",
|
28
|
+
"TranscriptionResponse",
|
29
|
+
]
|