cartesia 2.0.5__py3-none-any.whl → 2.0.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cartesia/__init__.py +22 -0
- cartesia/auth/client.py +8 -8
- cartesia/auth/requests/token_grant.py +7 -1
- cartesia/auth/requests/token_request.py +3 -3
- cartesia/auth/types/token_grant.py +7 -2
- cartesia/auth/types/token_request.py +3 -3
- cartesia/core/client_wrapper.py +1 -1
- cartesia/infill/client.py +0 -8
- cartesia/stt/__init__.py +6 -0
- cartesia/stt/_async_websocket.py +81 -72
- cartesia/stt/_websocket.py +42 -20
- cartesia/stt/client.py +450 -0
- cartesia/stt/requests/__init__.py +2 -0
- cartesia/stt/requests/streaming_transcription_response.py +2 -0
- cartesia/stt/requests/transcript_message.py +8 -1
- cartesia/stt/requests/transcription_response.py +8 -1
- cartesia/stt/requests/transcription_word.py +20 -0
- cartesia/stt/socket_client.py +52 -109
- cartesia/stt/types/__init__.py +4 -0
- cartesia/stt/types/streaming_transcription_response.py +2 -0
- cartesia/stt/types/stt_encoding.py +3 -1
- cartesia/stt/types/timestamp_granularity.py +5 -0
- cartesia/stt/types/transcript_message.py +7 -1
- cartesia/stt/types/transcription_response.py +7 -1
- cartesia/stt/types/transcription_word.py +32 -0
- cartesia/tts/__init__.py +16 -0
- cartesia/tts/client.py +63 -8
- cartesia/tts/requests/__init__.py +8 -0
- cartesia/tts/requests/experimental_model_controls.py +17 -0
- cartesia/tts/requests/generation_config.py +23 -0
- cartesia/tts/requests/generation_request.py +4 -4
- cartesia/tts/requests/sse_output_format.py +11 -0
- cartesia/tts/requests/tts_request.py +2 -0
- cartesia/tts/requests/ttssse_request.py +47 -0
- cartesia/tts/requests/web_socket_chunk_response.py +0 -3
- cartesia/tts/requests/web_socket_response.py +1 -2
- cartesia/tts/requests/web_socket_tts_request.py +9 -1
- cartesia/tts/types/__init__.py +8 -0
- cartesia/tts/types/experimental_model_controls.py +28 -0
- cartesia/tts/types/generation_config.py +34 -0
- cartesia/tts/types/generation_request.py +4 -4
- cartesia/tts/types/sse_output_format.py +22 -0
- cartesia/tts/types/tts_request.py +2 -0
- cartesia/tts/types/ttssse_request.py +58 -0
- cartesia/tts/types/web_socket_chunk_response.py +1 -3
- cartesia/tts/types/web_socket_response.py +1 -2
- cartesia/tts/types/web_socket_tts_request.py +11 -3
- cartesia/voice_changer/client.py +0 -8
- cartesia/voice_changer/requests/streaming_response.py +0 -2
- cartesia/voice_changer/types/streaming_response.py +0 -2
- cartesia/voices/client.py +0 -12
- cartesia-2.0.7.dist-info/LICENSE +201 -0
- {cartesia-2.0.5.dist-info → cartesia-2.0.7.dist-info}/METADATA +116 -17
- {cartesia-2.0.5.dist-info → cartesia-2.0.7.dist-info}/RECORD +55 -42
- {cartesia-2.0.5.dist-info → cartesia-2.0.7.dist-info}/WHEEL +1 -1
cartesia/__init__.py
CHANGED
@@ -37,10 +37,13 @@ from .stt import (
     StreamingTranscriptionResponse_Transcript,
     StreamingTranscriptionResponse_TranscriptParams,
     SttEncoding,
+    TimestampGranularity,
     TranscriptMessage,
     TranscriptMessageParams,
     TranscriptionResponse,
     TranscriptionResponseParams,
+    TranscriptionWord,
+    TranscriptionWordParams,
 )
 from .tts import (
     CancelContextRequest,
@@ -49,7 +52,11 @@ from .tts import (
     Controls,
     ControlsParams,
     Emotion,
+    ExperimentalModelControls,
+    ExperimentalModelControlsParams,
     FlushId,
+    GenerationConfig,
+    GenerationConfigParams,
     GenerationRequest,
     GenerationRequestParams,
     ModelSpeed,
@@ -72,6 +79,8 @@ from .tts import (
     RawOutputFormatParams,
     Speed,
     SpeedParams,
+    SseOutputFormat,
+    SseOutputFormatParams,
     SupportedLanguage,
     TtsRequest,
     TtsRequestEmbeddingSpecifier,
@@ -81,6 +90,8 @@ from .tts import (
     TtsRequestParams,
     TtsRequestVoiceSpecifier,
     TtsRequestVoiceSpecifierParams,
+    TtssseRequest,
+    TtssseRequestParams,
     WavOutputFormat,
     WavOutputFormatParams,
     WebSocketBaseResponse,
@@ -206,12 +217,16 @@ __all__ = [
     "Emotion",
     "ErrorMessage",
     "ErrorMessageParams",
+    "ExperimentalModelControls",
+    "ExperimentalModelControlsParams",
     "FilePurpose",
     "FlushDoneMessage",
     "FlushDoneMessageParams",
     "FlushId",
     "Gender",
     "GenderPresentation",
+    "GenerationConfig",
+    "GenerationConfigParams",
     "GenerationRequest",
     "GenerationRequestParams",
     "GetVoicesResponse",
@@ -256,6 +271,8 @@ __all__ = [
     "RawOutputFormatParams",
     "Speed",
     "SpeedParams",
+    "SseOutputFormat",
+    "SseOutputFormatParams",
     "StreamingResponse",
     "StreamingResponseParams",
     "StreamingResponse_Chunk",
@@ -276,6 +293,7 @@ __all__ = [
     "StreamingTranscriptionResponse_TranscriptParams",
     "SttEncoding",
     "SupportedLanguage",
+    "TimestampGranularity",
     "TokenGrant",
     "TokenGrantParams",
     "TokenRequest",
@@ -286,6 +304,8 @@ __all__ = [
     "TranscriptMessageParams",
     "TranscriptionResponse",
     "TranscriptionResponseParams",
+    "TranscriptionWord",
+    "TranscriptionWordParams",
     "TtsRequest",
     "TtsRequestEmbeddingSpecifier",
     "TtsRequestEmbeddingSpecifierParams",
@@ -294,6 +314,8 @@ __all__ = [
     "TtsRequestParams",
     "TtsRequestVoiceSpecifier",
     "TtsRequestVoiceSpecifierParams",
+    "TtssseRequest",
+    "TtssseRequestParams",
     "UpdateVoiceRequest",
     "UpdateVoiceRequestParams",
     "Voice",
cartesia/auth/client.py
CHANGED
@@ -22,7 +22,7 @@ class AuthClient:
     def access_token(
         self,
         *,
-        grants: TokenGrantParams,
+        grants: typing.Optional[TokenGrantParams] = OMIT,
         expires_in: typing.Optional[int] = OMIT,
         request_options: typing.Optional[RequestOptions] = None,
     ) -> TokenResponse:
@@ -31,8 +31,8 @@ class AuthClient:
 
        Parameters
        ----------
-        grants : TokenGrantParams
-            The permissions to be granted via the token.
+        grants : typing.Optional[TokenGrantParams]
+            The permissions to be granted via the token. Both TTS and STT grants are optional - specify only the capabilities you need.
 
        expires_in : typing.Optional[int]
            The number of seconds the token will be valid for since the time of generation. The maximum is 1 hour (3600 seconds).
@@ -52,7 +52,7 @@ class AuthClient:
            api_key="YOUR_API_KEY",
        )
        client.auth.access_token(
-            grants={"tts": True},
+            grants={"tts": True, "stt": True},
            expires_in=60,
        )
        """
@@ -90,7 +90,7 @@ class AsyncAuthClient:
     async def access_token(
         self,
         *,
-        grants: TokenGrantParams,
+        grants: typing.Optional[TokenGrantParams] = OMIT,
         expires_in: typing.Optional[int] = OMIT,
         request_options: typing.Optional[RequestOptions] = None,
     ) -> TokenResponse:
@@ -99,8 +99,8 @@ class AsyncAuthClient:
 
        Parameters
        ----------
-        grants : TokenGrantParams
-            The permissions to be granted via the token.
+        grants : typing.Optional[TokenGrantParams]
+            The permissions to be granted via the token. Both TTS and STT grants are optional - specify only the capabilities you need.
 
        expires_in : typing.Optional[int]
            The number of seconds the token will be valid for since the time of generation. The maximum is 1 hour (3600 seconds).
@@ -125,7 +125,7 @@ class AsyncAuthClient:
 
        async def main() -> None:
            await client.auth.access_token(
-                grants={"tts": True},
+                grants={"tts": True, "stt": True},
                expires_in=60,
            )
 
cartesia/auth/requests/token_grant.py
CHANGED
@@ -1,10 +1,16 @@
 # This file was auto-generated by Fern from our API Definition.
 
 import typing_extensions
+import typing_extensions
 
 
 class TokenGrantParams(typing_extensions.TypedDict):
-    tts: bool
+    tts: typing_extensions.NotRequired[bool]
     """
     The `tts` grant allows the token to be used to access any TTS endpoint.
     """
+
+    stt: typing_extensions.NotRequired[bool]
+    """
+    The `stt` grant allows the token to be used to access any STT endpoint.
+    """
cartesia/auth/requests/token_request.py
CHANGED
@@ -1,14 +1,14 @@
 # This file was auto-generated by Fern from our API Definition.
 
 import typing_extensions
-from .token_grant import TokenGrantParams
 import typing_extensions
+from .token_grant import TokenGrantParams
 
 
 class TokenRequestParams(typing_extensions.TypedDict):
-    grants: TokenGrantParams
+    grants: typing_extensions.NotRequired[TokenGrantParams]
     """
-    The permissions to be granted via the token.
+    The permissions to be granted via the token. Both TTS and STT grants are optional - specify only the capabilities you need.
     """
 
     expires_in: typing_extensions.NotRequired[int]
cartesia/auth/types/token_grant.py
CHANGED
@@ -1,17 +1,22 @@
 # This file was auto-generated by Fern from our API Definition.
 
 from ...core.pydantic_utilities import UniversalBaseModel
+import typing
 import pydantic
 from ...core.pydantic_utilities import IS_PYDANTIC_V2
-import typing
 
 
 class TokenGrant(UniversalBaseModel):
-    tts: bool = pydantic.Field()
+    tts: typing.Optional[bool] = pydantic.Field(default=None)
     """
     The `tts` grant allows the token to be used to access any TTS endpoint.
     """
 
+    stt: typing.Optional[bool] = pydantic.Field(default=None)
+    """
+    The `stt` grant allows the token to be used to access any STT endpoint.
+    """
+
     if IS_PYDANTIC_V2:
         model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True)  # type: ignore # Pydantic v2
     else:
cartesia/auth/types/token_request.py
CHANGED
@@ -1,16 +1,16 @@
 # This file was auto-generated by Fern from our API Definition.
 
 from ...core.pydantic_utilities import UniversalBaseModel
+import typing
 from .token_grant import TokenGrant
 import pydantic
-import typing
 from ...core.pydantic_utilities import IS_PYDANTIC_V2
 
 
 class TokenRequest(UniversalBaseModel):
-    grants: TokenGrant = pydantic.Field()
+    grants: typing.Optional[TokenGrant] = pydantic.Field(default=None)
     """
-    The permissions to be granted via the token.
+    The permissions to be granted via the token. Both TTS and STT grants are optional - specify only the capabilities you need.
     """
 
     expires_in: typing.Optional[int] = pydantic.Field(default=None)
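
Taken together, the auth changes make `grants` optional on both the request params and the Pydantic model, and add an independent `stt` grant alongside `tts`. A minimal usage sketch against the updated `access_token` signature, assuming the top-level `Cartesia` client used in the docstring examples above:

from cartesia import Cartesia

client = Cartesia(api_key="YOUR_API_KEY")

# Request a short-lived token scoped to both TTS and STT; either grant may be omitted.
token_response = client.auth.access_token(
    grants={"tts": True, "stt": True},
    expires_in=60,  # seconds; the documented maximum is 3600
)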
cartesia/core/client_wrapper.py
CHANGED
@@ -16,7 +16,7 @@ class BaseClientWrapper:
         headers: typing.Dict[str, str] = {
             "X-Fern-Language": "Python",
             "X-Fern-SDK-Name": "cartesia",
-            "X-Fern-SDK-Version": "2.0.5",
+            "X-Fern-SDK-Version": "2.0.7",
         }
         headers["X-API-Key"] = self.api_key
         headers["Cartesia-Version"] = "2024-11-13"
cartesia/infill/client.py
CHANGED
@@ -83,17 +83,14 @@ class InfillClient:
        output_format_encoding : typing.Optional[RawEncoding]
            Required for `raw` and `wav` containers.
 
-
        output_format_bit_rate : typing.Optional[int]
            Required for `mp3` containers.
 
-
        voice_experimental_controls_speed : typing.Optional[Speed]
            Either a number between -1.0 and 1.0 or a natural language description of speed.
 
            If you specify a number, 0.0 is the default speed, -1.0 is the slowest speed, and 1.0 is the fastest speed.
 
-
        voice_experimental_controls_emotion : typing.Optional[typing.List[Emotion]]
            An array of emotion:level tags.
 
@@ -101,7 +98,6 @@ class InfillClient:
 
            Supported levels are: lowest, low, (omit), high, highest.
 
-
        request_options : typing.Optional[RequestOptions]
            Request-specific configuration. You can pass in configuration such as `chunk_size`, and more to customize the request and response.
 
@@ -230,17 +226,14 @@ class AsyncInfillClient:
        output_format_encoding : typing.Optional[RawEncoding]
            Required for `raw` and `wav` containers.
 
-
        output_format_bit_rate : typing.Optional[int]
            Required for `mp3` containers.
 
-
        voice_experimental_controls_speed : typing.Optional[Speed]
            Either a number between -1.0 and 1.0 or a natural language description of speed.
 
            If you specify a number, 0.0 is the default speed, -1.0 is the slowest speed, and 1.0 is the fastest speed.
 
-
        voice_experimental_controls_emotion : typing.Optional[typing.List[Emotion]]
            An array of emotion:level tags.
 
@@ -248,7 +241,6 @@ class AsyncInfillClient:
 
            Supported levels are: lowest, low, (omit), high, highest.
 
-
        request_options : typing.Optional[RequestOptions]
            Request-specific configuration. You can pass in configuration such as `chunk_size`, and more to customize the request and response.
 
cartesia/stt/__init__.py
CHANGED
@@ -10,8 +10,10 @@ from .types import (
     StreamingTranscriptionResponse_FlushDone,
     StreamingTranscriptionResponse_Transcript,
     SttEncoding,
+    TimestampGranularity,
     TranscriptMessage,
     TranscriptionResponse,
+    TranscriptionWord,
 )
 from .requests import (
     DoneMessageParams,
@@ -24,6 +26,7 @@ from .requests import (
     StreamingTranscriptionResponse_TranscriptParams,
     TranscriptMessageParams,
     TranscriptionResponseParams,
+    TranscriptionWordParams,
 )
 
 __all__ = [
@@ -44,8 +47,11 @@ __all__ = [
     "StreamingTranscriptionResponse_Transcript",
     "StreamingTranscriptionResponse_TranscriptParams",
     "SttEncoding",
+    "TimestampGranularity",
     "TranscriptMessage",
     "TranscriptMessageParams",
     "TranscriptionResponse",
     "TranscriptionResponseParams",
+    "TranscriptionWord",
+    "TranscriptionWordParams",
 ]
cartesia/stt/_async_websocket.py
CHANGED
@@ -11,6 +11,7 @@ from cartesia.stt.types import (
     StreamingTranscriptionResponse_Error,
     StreamingTranscriptionResponse_Transcript,
 )
+from cartesia.stt.types.stt_encoding import SttEncoding
 
 from ..core.pydantic_utilities import parse_obj_as
 from ._websocket import SttWebsocket
@@ -41,8 +42,10 @@ class AsyncSttWebsocket(SttWebsocket):
         self.websocket: Optional[aiohttp.ClientWebSocketResponse] = None
         self._default_model: str = "ink-whisper"
         self._default_language: Optional[str] = "en"
-        self._default_encoding:
+        self._default_encoding: SttEncoding = "pcm_s16le"
         self._default_sample_rate: int = 16000
+        self._default_min_volume: Optional[float] = None
+        self._default_max_silence_duration_secs: Optional[float] = None
 
     def __del__(self):
         try:
@@ -60,16 +63,20 @@ class AsyncSttWebsocket(SttWebsocket):
         *,
         model: str = "ink-whisper",
         language: Optional[str] = "en",
-        encoding:
+        encoding: SttEncoding = "pcm_s16le",
         sample_rate: int = 16000,
+        min_volume: Optional[float] = None,
+        max_silence_duration_secs: Optional[float] = None,
     ):
         """Connect to the STT WebSocket with the specified parameters.
 
         Args:
-            model: ID of the model to use for transcription
-            language: The language of the input audio in ISO-639-1 format
-            encoding: The encoding format of the audio data
-            sample_rate: The sample rate of the audio in Hz
+            model: ID of the model to use for transcription (required)
+            language: The language of the input audio in ISO-639-1 format (defaults to "en")
+            encoding: The encoding format of the audio data (required)
+            sample_rate: The sample rate of the audio in Hz (required)
+            min_volume: Volume threshold for voice activity detection (0.0-1.0)
+            max_silence_duration_secs: Maximum duration of silence before endpointing
 
         Raises:
             RuntimeError: If the connection to the WebSocket fails.
@@ -78,6 +85,8 @@ class AsyncSttWebsocket(SttWebsocket):
         self._default_language = language
         self._default_encoding = encoding
         self._default_sample_rate = sample_rate
+        self._default_min_volume = min_volume
+        self._default_max_silence_duration_secs = max_silence_duration_secs
 
         if self.websocket is None or self._is_websocket_closed():
             route = "stt/websocket"
@@ -87,13 +96,15 @@ class AsyncSttWebsocket(SttWebsocket):
                 "model": model,
                 "api_key": self.api_key,
                 "cartesia_version": self.cartesia_version,
+                "encoding": encoding,
+                "sample_rate": str(sample_rate),
             }
             if language is not None:
                 params["language"] = language
-            if
-                params["
-            if
-                params["
+            if min_volume is not None:
+                params["min_volume"] = str(min_volume)
+            if max_silence_duration_secs is not None:
+                params["max_silence_duration_secs"] = str(max_silence_duration_secs)
 
             query_string = "&".join([f"{k}={v}" for k, v in params.items()])
             url = f"{self.ws_url}/{route}?{query_string}"
@@ -143,6 +154,8 @@ class AsyncSttWebsocket(SttWebsocket):
             language=self._default_language,
             encoding=self._default_encoding,
             sample_rate=self._default_sample_rate,
+            min_volume=self._default_min_volume,
+            max_silence_duration_secs=self._default_max_silence_duration_secs,
         )
 
         assert self.websocket is not None, "WebSocket should be connected after connect() call"
@@ -166,76 +179,66 @@ class AsyncSttWebsocket(SttWebsocket):
             language=self._default_language,
             encoding=self._default_encoding,
             sample_rate=self._default_sample_rate,
+            min_volume=self._default_min_volume,
+            max_silence_duration_secs=self._default_max_silence_duration_secs,
         )
 
         assert self.websocket is not None, "WebSocket should be connected after connect() call"
 
         try:
-
-
-
+            async for message in self.websocket:
+                if message.type == aiohttp.WSMsgType.TEXT:
+                    raw_data = json.loads(message.data)
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-                            "text": raw_data.get("text", ""),  # Default to empty string if missing
-                            "is_final": raw_data.get("is_final", False),  # Default to False if missing
-                        }
-
-                        # Add optional fields if present
-                        if "duration" in raw_data:
-                            result["duration"] = raw_data["duration"]
-                        if "language" in raw_data:
-                            result["language"] = raw_data["language"]
-
-                        yield result
+                    # Handle error responses
+                    if raw_data.get("type") == "error":
+                        raise RuntimeError(f"Error transcribing audio: {raw_data.get('message', 'Unknown error')}")
+
+                    # Handle transcript responses with flexible parsing
+                    if raw_data.get("type") == "transcript":
+                        # Provide defaults for missing required fields
+                        result = {
+                            "type": raw_data["type"],
+                            "request_id": raw_data.get("request_id", ""),
+                            "text": raw_data.get("text", ""),  # Default to empty string if missing
+                            "is_final": raw_data.get("is_final", False),  # Default to False if missing
+                        }
 
-                        #
-
-                        result =
-
-
-
-
+                        # Add optional fields if present
+                        if "duration" in raw_data:
+                            result["duration"] = raw_data["duration"]
+                        if "language" in raw_data:
+                            result["language"] = raw_data["language"]
+                        if "words" in raw_data:
+                            result["words"] = raw_data["words"]
 
-
-                    elif raw_data.get("type") == "done":
-                        result = {
-                            "type": raw_data["type"],
-                            "request_id": raw_data.get("request_id", ""),
-                        }
-                        yield result
-                        # Session is complete, break out of loop
-                        break
+                        yield result
 
-
-
-
-
+                    # Handle flush_done acknowledgment
+                    elif raw_data.get("type") == "flush_done":
+                        result = {
+                            "type": raw_data["type"],
+                            "request_id": raw_data.get("request_id", ""),
+                        }
+                        yield result
 
-
-
+                    # Handle done acknowledgment
+                    elif raw_data.get("type") == "done":
+                        result = {
+                            "type": raw_data["type"],
+                            "request_id": raw_data.get("request_id", ""),
+                        }
+                        yield result
+                        break  # Exit the loop when done
 
-
-
-                    raise RuntimeError(
-
-
-                        raise RuntimeError(f"Error receiving transcription: {inner_e}")
-
+                elif message.type == aiohttp.WSMsgType.ERROR:
+                    error_message = f"WebSocket error: {self.websocket.exception()}"
+                    raise RuntimeError(error_message)
+                elif message.type == aiohttp.WSMsgType.CLOSE:
+                    break  # WebSocket was closed
         except Exception as e:
             await self.close()
-            raise
+            raise e
 
     async def transcribe(  # type: ignore[override]
         self,
@@ -243,17 +246,21 @@ class AsyncSttWebsocket(SttWebsocket):
         *,
         model: str = "ink-whisper",
         language: Optional[str] = "en",
-        encoding:
+        encoding: SttEncoding = "pcm_s16le",
         sample_rate: int = 16000,
+        min_volume: Optional[float] = None,
+        max_silence_duration_secs: Optional[float] = None,
     ) -> AsyncGenerator[Dict[str, Any], None]:
         """Transcribe audio chunks using the WebSocket.
 
         Args:
             audio_chunks: Async iterator of audio chunks as bytes
-            model: ID of the model to use for transcription
-            language: The language of the input audio in ISO-639-1 format
-            encoding: The encoding format of the audio data
-            sample_rate: The sample rate of the audio in Hz
+            model: ID of the model to use for transcription (required)
+            language: The language of the input audio in ISO-639-1 format (defaults to "en")
+            encoding: The encoding format of the audio data (required)
+            sample_rate: The sample rate of the audio in Hz (required)
+            min_volume: Volume threshold for voice activity detection (0.0-1.0)
+            max_silence_duration_secs: Maximum duration of silence before endpointing
 
         Yields:
             Dictionary containing transcription results, flush_done, done, or error messages
@@ -263,6 +270,8 @@ class AsyncSttWebsocket(SttWebsocket):
             language=language,
             encoding=encoding,
             sample_rate=sample_rate,
+            min_volume=min_volume,
+            max_silence_duration_secs=max_silence_duration_secs,
         )
 
         try:
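
For orientation, a hedged sketch of how the updated async STT WebSocket parameters fit together. `AsyncCartesia` and the `client.stt.websocket()` accessor are assumptions not confirmed by this diff, and the audio iterator is a placeholder; only the keyword arguments and result fields mirror the signatures shown above.

import asyncio
from typing import AsyncGenerator

from cartesia import AsyncCartesia  # assumed async client entry point


async def main() -> None:
    client = AsyncCartesia(api_key="YOUR_API_KEY")
    ws = await client.stt.websocket()  # assumed accessor; see socket_client.py in this diff

    async def audio_chunks() -> AsyncGenerator[bytes, None]:
        # Placeholder audio: one second of silence as 16 kHz pcm_s16le samples.
        yield b"\x00\x00" * 16000

    async for result in ws.transcribe(
        audio_chunks(),
        model="ink-whisper",
        language="en",
        encoding="pcm_s16le",
        sample_rate=16000,
        min_volume=0.15,                # new: voice-activity threshold (0.0-1.0)
        max_silence_duration_secs=2.0,  # new: endpoint after this much silence
    ):
        if result["type"] == "transcript":
            print(result["text"], result.get("words"))  # word-level results when present
        elif result["type"] == "done":
            break


asyncio.run(main())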