cartesia 2.0.5__py3-none-any.whl → 2.0.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55)
  1. cartesia/__init__.py +22 -0
  2. cartesia/auth/client.py +8 -8
  3. cartesia/auth/requests/token_grant.py +7 -1
  4. cartesia/auth/requests/token_request.py +3 -3
  5. cartesia/auth/types/token_grant.py +7 -2
  6. cartesia/auth/types/token_request.py +3 -3
  7. cartesia/core/client_wrapper.py +1 -1
  8. cartesia/infill/client.py +0 -8
  9. cartesia/stt/__init__.py +6 -0
  10. cartesia/stt/_async_websocket.py +81 -72
  11. cartesia/stt/_websocket.py +42 -20
  12. cartesia/stt/client.py +450 -0
  13. cartesia/stt/requests/__init__.py +2 -0
  14. cartesia/stt/requests/streaming_transcription_response.py +2 -0
  15. cartesia/stt/requests/transcript_message.py +8 -1
  16. cartesia/stt/requests/transcription_response.py +8 -1
  17. cartesia/stt/requests/transcription_word.py +20 -0
  18. cartesia/stt/socket_client.py +52 -109
  19. cartesia/stt/types/__init__.py +4 -0
  20. cartesia/stt/types/streaming_transcription_response.py +2 -0
  21. cartesia/stt/types/stt_encoding.py +3 -1
  22. cartesia/stt/types/timestamp_granularity.py +5 -0
  23. cartesia/stt/types/transcript_message.py +7 -1
  24. cartesia/stt/types/transcription_response.py +7 -1
  25. cartesia/stt/types/transcription_word.py +32 -0
  26. cartesia/tts/__init__.py +16 -0
  27. cartesia/tts/client.py +63 -8
  28. cartesia/tts/requests/__init__.py +8 -0
  29. cartesia/tts/requests/experimental_model_controls.py +17 -0
  30. cartesia/tts/requests/generation_config.py +23 -0
  31. cartesia/tts/requests/generation_request.py +4 -4
  32. cartesia/tts/requests/sse_output_format.py +11 -0
  33. cartesia/tts/requests/tts_request.py +2 -0
  34. cartesia/tts/requests/ttssse_request.py +47 -0
  35. cartesia/tts/requests/web_socket_chunk_response.py +0 -3
  36. cartesia/tts/requests/web_socket_response.py +1 -2
  37. cartesia/tts/requests/web_socket_tts_request.py +9 -1
  38. cartesia/tts/types/__init__.py +8 -0
  39. cartesia/tts/types/experimental_model_controls.py +28 -0
  40. cartesia/tts/types/generation_config.py +34 -0
  41. cartesia/tts/types/generation_request.py +4 -4
  42. cartesia/tts/types/sse_output_format.py +22 -0
  43. cartesia/tts/types/tts_request.py +2 -0
  44. cartesia/tts/types/ttssse_request.py +58 -0
  45. cartesia/tts/types/web_socket_chunk_response.py +1 -3
  46. cartesia/tts/types/web_socket_response.py +1 -2
  47. cartesia/tts/types/web_socket_tts_request.py +11 -3
  48. cartesia/voice_changer/client.py +0 -8
  49. cartesia/voice_changer/requests/streaming_response.py +0 -2
  50. cartesia/voice_changer/types/streaming_response.py +0 -2
  51. cartesia/voices/client.py +0 -12
  52. cartesia-2.0.7.dist-info/LICENSE +201 -0
  53. {cartesia-2.0.5.dist-info → cartesia-2.0.7.dist-info}/METADATA +116 -17
  54. {cartesia-2.0.5.dist-info → cartesia-2.0.7.dist-info}/RECORD +55 -42
  55. {cartesia-2.0.5.dist-info → cartesia-2.0.7.dist-info}/WHEEL +1 -1
cartesia/__init__.py CHANGED
@@ -37,10 +37,13 @@ from .stt import (
37
37
  StreamingTranscriptionResponse_Transcript,
38
38
  StreamingTranscriptionResponse_TranscriptParams,
39
39
  SttEncoding,
40
+ TimestampGranularity,
40
41
  TranscriptMessage,
41
42
  TranscriptMessageParams,
42
43
  TranscriptionResponse,
43
44
  TranscriptionResponseParams,
45
+ TranscriptionWord,
46
+ TranscriptionWordParams,
44
47
  )
45
48
  from .tts import (
46
49
  CancelContextRequest,
@@ -49,7 +52,11 @@ from .tts import (
49
52
  Controls,
50
53
  ControlsParams,
51
54
  Emotion,
55
+ ExperimentalModelControls,
56
+ ExperimentalModelControlsParams,
52
57
  FlushId,
58
+ GenerationConfig,
59
+ GenerationConfigParams,
53
60
  GenerationRequest,
54
61
  GenerationRequestParams,
55
62
  ModelSpeed,
@@ -72,6 +79,8 @@ from .tts import (
72
79
  RawOutputFormatParams,
73
80
  Speed,
74
81
  SpeedParams,
82
+ SseOutputFormat,
83
+ SseOutputFormatParams,
75
84
  SupportedLanguage,
76
85
  TtsRequest,
77
86
  TtsRequestEmbeddingSpecifier,
@@ -81,6 +90,8 @@ from .tts import (
81
90
  TtsRequestParams,
82
91
  TtsRequestVoiceSpecifier,
83
92
  TtsRequestVoiceSpecifierParams,
93
+ TtssseRequest,
94
+ TtssseRequestParams,
84
95
  WavOutputFormat,
85
96
  WavOutputFormatParams,
86
97
  WebSocketBaseResponse,
@@ -206,12 +217,16 @@ __all__ = [
206
217
  "Emotion",
207
218
  "ErrorMessage",
208
219
  "ErrorMessageParams",
220
+ "ExperimentalModelControls",
221
+ "ExperimentalModelControlsParams",
209
222
  "FilePurpose",
210
223
  "FlushDoneMessage",
211
224
  "FlushDoneMessageParams",
212
225
  "FlushId",
213
226
  "Gender",
214
227
  "GenderPresentation",
228
+ "GenerationConfig",
229
+ "GenerationConfigParams",
215
230
  "GenerationRequest",
216
231
  "GenerationRequestParams",
217
232
  "GetVoicesResponse",
@@ -256,6 +271,8 @@ __all__ = [
256
271
  "RawOutputFormatParams",
257
272
  "Speed",
258
273
  "SpeedParams",
274
+ "SseOutputFormat",
275
+ "SseOutputFormatParams",
259
276
  "StreamingResponse",
260
277
  "StreamingResponseParams",
261
278
  "StreamingResponse_Chunk",
@@ -276,6 +293,7 @@ __all__ = [
276
293
  "StreamingTranscriptionResponse_TranscriptParams",
277
294
  "SttEncoding",
278
295
  "SupportedLanguage",
296
+ "TimestampGranularity",
279
297
  "TokenGrant",
280
298
  "TokenGrantParams",
281
299
  "TokenRequest",
@@ -286,6 +304,8 @@ __all__ = [
286
304
  "TranscriptMessageParams",
287
305
  "TranscriptionResponse",
288
306
  "TranscriptionResponseParams",
307
+ "TranscriptionWord",
308
+ "TranscriptionWordParams",
289
309
  "TtsRequest",
290
310
  "TtsRequestEmbeddingSpecifier",
291
311
  "TtsRequestEmbeddingSpecifierParams",
@@ -294,6 +314,8 @@ __all__ = [
294
314
  "TtsRequestParams",
295
315
  "TtsRequestVoiceSpecifier",
296
316
  "TtsRequestVoiceSpecifierParams",
317
+ "TtssseRequest",
318
+ "TtssseRequestParams",
297
319
  "UpdateVoiceRequest",
298
320
  "UpdateVoiceRequestParams",
299
321
  "Voice",
cartesia/auth/client.py CHANGED
@@ -22,7 +22,7 @@ class AuthClient:
22
22
  def access_token(
23
23
  self,
24
24
  *,
25
- grants: TokenGrantParams,
25
+ grants: typing.Optional[TokenGrantParams] = OMIT,
26
26
  expires_in: typing.Optional[int] = OMIT,
27
27
  request_options: typing.Optional[RequestOptions] = None,
28
28
  ) -> TokenResponse:
@@ -31,8 +31,8 @@ class AuthClient:
31
31
 
32
32
  Parameters
33
33
  ----------
34
- grants : TokenGrantParams
35
- The permissions to be granted via the token.
34
+ grants : typing.Optional[TokenGrantParams]
35
+ The permissions to be granted via the token. Both TTS and STT grants are optional - specify only the capabilities you need.
36
36
 
37
37
  expires_in : typing.Optional[int]
38
38
  The number of seconds the token will be valid for since the time of generation. The maximum is 1 hour (3600 seconds).
@@ -52,7 +52,7 @@ class AuthClient:
52
52
  api_key="YOUR_API_KEY",
53
53
  )
54
54
  client.auth.access_token(
55
- grants={"tts": True},
55
+ grants={"tts": True, "stt": True},
56
56
  expires_in=60,
57
57
  )
58
58
  """
@@ -90,7 +90,7 @@ class AsyncAuthClient:
90
90
  async def access_token(
91
91
  self,
92
92
  *,
93
- grants: TokenGrantParams,
93
+ grants: typing.Optional[TokenGrantParams] = OMIT,
94
94
  expires_in: typing.Optional[int] = OMIT,
95
95
  request_options: typing.Optional[RequestOptions] = None,
96
96
  ) -> TokenResponse:
@@ -99,8 +99,8 @@ class AsyncAuthClient:
99
99
 
100
100
  Parameters
101
101
  ----------
102
- grants : TokenGrantParams
103
- The permissions to be granted via the token.
102
+ grants : typing.Optional[TokenGrantParams]
103
+ The permissions to be granted via the token. Both TTS and STT grants are optional - specify only the capabilities you need.
104
104
 
105
105
  expires_in : typing.Optional[int]
106
106
  The number of seconds the token will be valid for since the time of generation. The maximum is 1 hour (3600 seconds).
@@ -125,7 +125,7 @@ class AsyncAuthClient:
125
125
 
126
126
  async def main() -> None:
127
127
  await client.auth.access_token(
128
- grants={"tts": True},
128
+ grants={"tts": True, "stt": True},
129
129
  expires_in=60,
130
130
  )
131
131
 
cartesia/auth/requests/token_grant.py CHANGED
@@ -1,10 +1,16 @@
1
1
  # This file was auto-generated by Fern from our API Definition.
2
2
 
3
3
  import typing_extensions
4
+ import typing_extensions
4
5
 
5
6
 
6
7
  class TokenGrantParams(typing_extensions.TypedDict):
7
- tts: bool
8
+ tts: typing_extensions.NotRequired[bool]
8
9
  """
9
10
  The `tts` grant allows the token to be used to access any TTS endpoint.
10
11
  """
12
+
13
+ stt: typing_extensions.NotRequired[bool]
14
+ """
15
+ The `stt` grant allows the token to be used to access any STT endpoint.
16
+ """
cartesia/auth/requests/token_request.py CHANGED
@@ -1,14 +1,14 @@
1
1
  # This file was auto-generated by Fern from our API Definition.
2
2
 
3
3
  import typing_extensions
4
- from .token_grant import TokenGrantParams
5
4
  import typing_extensions
5
+ from .token_grant import TokenGrantParams
6
6
 
7
7
 
8
8
  class TokenRequestParams(typing_extensions.TypedDict):
9
- grants: TokenGrantParams
9
+ grants: typing_extensions.NotRequired[TokenGrantParams]
10
10
  """
11
- The permissions to be granted via the token.
11
+ The permissions to be granted via the token. Both TTS and STT grants are optional - specify only the capabilities you need.
12
12
  """
13
13
 
14
14
  expires_in: typing_extensions.NotRequired[int]
cartesia/auth/types/token_grant.py CHANGED
@@ -1,17 +1,22 @@
1
1
  # This file was auto-generated by Fern from our API Definition.
2
2
 
3
3
  from ...core.pydantic_utilities import UniversalBaseModel
4
+ import typing
4
5
  import pydantic
5
6
  from ...core.pydantic_utilities import IS_PYDANTIC_V2
6
- import typing
7
7
 
8
8
 
9
9
  class TokenGrant(UniversalBaseModel):
10
- tts: bool = pydantic.Field()
10
+ tts: typing.Optional[bool] = pydantic.Field(default=None)
11
11
  """
12
12
  The `tts` grant allows the token to be used to access any TTS endpoint.
13
13
  """
14
14
 
15
+ stt: typing.Optional[bool] = pydantic.Field(default=None)
16
+ """
17
+ The `stt` grant allows the token to be used to access any STT endpoint.
18
+ """
19
+
15
20
  if IS_PYDANTIC_V2:
16
21
  model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2
17
22
  else:
cartesia/auth/types/token_request.py CHANGED
@@ -1,16 +1,16 @@
1
1
  # This file was auto-generated by Fern from our API Definition.
2
2
 
3
3
  from ...core.pydantic_utilities import UniversalBaseModel
4
+ import typing
4
5
  from .token_grant import TokenGrant
5
6
  import pydantic
6
- import typing
7
7
  from ...core.pydantic_utilities import IS_PYDANTIC_V2
8
8
 
9
9
 
10
10
  class TokenRequest(UniversalBaseModel):
11
- grants: TokenGrant = pydantic.Field()
11
+ grants: typing.Optional[TokenGrant] = pydantic.Field(default=None)
12
12
  """
13
- The permissions to be granted via the token.
13
+ The permissions to be granted via the token. Both TTS and STT grants are optional - specify only the capabilities you need.
14
14
  """
15
15
 
16
16
  expires_in: typing.Optional[int] = pydantic.Field(default=None)
cartesia/core/client_wrapper.py CHANGED
@@ -16,7 +16,7 @@ class BaseClientWrapper:
16
16
  headers: typing.Dict[str, str] = {
17
17
  "X-Fern-Language": "Python",
18
18
  "X-Fern-SDK-Name": "cartesia",
19
- "X-Fern-SDK-Version": "2.0.5",
19
+ "X-Fern-SDK-Version": "2.0.7",
20
20
  }
21
21
  headers["X-API-Key"] = self.api_key
22
22
  headers["Cartesia-Version"] = "2024-11-13"
cartesia/infill/client.py CHANGED
@@ -83,17 +83,14 @@ class InfillClient:
83
83
  output_format_encoding : typing.Optional[RawEncoding]
84
84
  Required for `raw` and `wav` containers.
85
85
 
86
-
87
86
  output_format_bit_rate : typing.Optional[int]
88
87
  Required for `mp3` containers.
89
88
 
90
-
91
89
  voice_experimental_controls_speed : typing.Optional[Speed]
92
90
  Either a number between -1.0 and 1.0 or a natural language description of speed.
93
91
 
94
92
  If you specify a number, 0.0 is the default speed, -1.0 is the slowest speed, and 1.0 is the fastest speed.
95
93
 
96
-
97
94
  voice_experimental_controls_emotion : typing.Optional[typing.List[Emotion]]
98
95
  An array of emotion:level tags.
99
96
 
@@ -101,7 +98,6 @@ class InfillClient:
101
98
 
102
99
  Supported levels are: lowest, low, (omit), high, highest.
103
100
 
104
-
105
101
  request_options : typing.Optional[RequestOptions]
106
102
  Request-specific configuration. You can pass in configuration such as `chunk_size`, and more to customize the request and response.
107
103
 
@@ -230,17 +226,14 @@ class AsyncInfillClient:
230
226
  output_format_encoding : typing.Optional[RawEncoding]
231
227
  Required for `raw` and `wav` containers.
232
228
 
233
-
234
229
  output_format_bit_rate : typing.Optional[int]
235
230
  Required for `mp3` containers.
236
231
 
237
-
238
232
  voice_experimental_controls_speed : typing.Optional[Speed]
239
233
  Either a number between -1.0 and 1.0 or a natural language description of speed.
240
234
 
241
235
  If you specify a number, 0.0 is the default speed, -1.0 is the slowest speed, and 1.0 is the fastest speed.
242
236
 
243
-
244
237
  voice_experimental_controls_emotion : typing.Optional[typing.List[Emotion]]
245
238
  An array of emotion:level tags.
246
239
 
@@ -248,7 +241,6 @@ class AsyncInfillClient:
248
241
 
249
242
  Supported levels are: lowest, low, (omit), high, highest.
250
243
 
251
-
252
244
  request_options : typing.Optional[RequestOptions]
253
245
  Request-specific configuration. You can pass in configuration such as `chunk_size`, and more to customize the request and response.
254
246
 
cartesia/stt/__init__.py CHANGED
@@ -10,8 +10,10 @@ from .types import (
10
10
  StreamingTranscriptionResponse_FlushDone,
11
11
  StreamingTranscriptionResponse_Transcript,
12
12
  SttEncoding,
13
+ TimestampGranularity,
13
14
  TranscriptMessage,
14
15
  TranscriptionResponse,
16
+ TranscriptionWord,
15
17
  )
16
18
  from .requests import (
17
19
  DoneMessageParams,
@@ -24,6 +26,7 @@ from .requests import (
24
26
  StreamingTranscriptionResponse_TranscriptParams,
25
27
  TranscriptMessageParams,
26
28
  TranscriptionResponseParams,
29
+ TranscriptionWordParams,
27
30
  )
28
31
 
29
32
  __all__ = [
@@ -44,8 +47,11 @@ __all__ = [
44
47
  "StreamingTranscriptionResponse_Transcript",
45
48
  "StreamingTranscriptionResponse_TranscriptParams",
46
49
  "SttEncoding",
50
+ "TimestampGranularity",
47
51
  "TranscriptMessage",
48
52
  "TranscriptMessageParams",
49
53
  "TranscriptionResponse",
50
54
  "TranscriptionResponseParams",
55
+ "TranscriptionWord",
56
+ "TranscriptionWordParams",
51
57
  ]
cartesia/stt/_async_websocket.py CHANGED
@@ -11,6 +11,7 @@ from cartesia.stt.types import (
11
11
  StreamingTranscriptionResponse_Error,
12
12
  StreamingTranscriptionResponse_Transcript,
13
13
  )
14
+ from cartesia.stt.types.stt_encoding import SttEncoding
14
15
 
15
16
  from ..core.pydantic_utilities import parse_obj_as
16
17
  from ._websocket import SttWebsocket
@@ -41,8 +42,10 @@ class AsyncSttWebsocket(SttWebsocket):
41
42
  self.websocket: Optional[aiohttp.ClientWebSocketResponse] = None
42
43
  self._default_model: str = "ink-whisper"
43
44
  self._default_language: Optional[str] = "en"
44
- self._default_encoding: Optional[str] = "pcm_s16le"
45
+ self._default_encoding: SttEncoding = "pcm_s16le"
45
46
  self._default_sample_rate: int = 16000
47
+ self._default_min_volume: Optional[float] = None
48
+ self._default_max_silence_duration_secs: Optional[float] = None
46
49
 
47
50
  def __del__(self):
48
51
  try:
@@ -60,16 +63,20 @@ class AsyncSttWebsocket(SttWebsocket):
60
63
  *,
61
64
  model: str = "ink-whisper",
62
65
  language: Optional[str] = "en",
63
- encoding: Optional[str] = "pcm_s16le",
66
+ encoding: SttEncoding = "pcm_s16le",
64
67
  sample_rate: int = 16000,
68
+ min_volume: Optional[float] = None,
69
+ max_silence_duration_secs: Optional[float] = None,
65
70
  ):
66
71
  """Connect to the STT WebSocket with the specified parameters.
67
72
 
68
73
  Args:
69
- model: ID of the model to use for transcription
70
- language: The language of the input audio in ISO-639-1 format
71
- encoding: The encoding format of the audio data
72
- sample_rate: The sample rate of the audio in Hz
74
+ model: ID of the model to use for transcription (required)
75
+ language: The language of the input audio in ISO-639-1 format (defaults to "en")
76
+ encoding: The encoding format of the audio data (required)
77
+ sample_rate: The sample rate of the audio in Hz (required)
78
+ min_volume: Volume threshold for voice activity detection (0.0-1.0)
79
+ max_silence_duration_secs: Maximum duration of silence before endpointing
73
80
 
74
81
  Raises:
75
82
  RuntimeError: If the connection to the WebSocket fails.
@@ -78,6 +85,8 @@ class AsyncSttWebsocket(SttWebsocket):
78
85
  self._default_language = language
79
86
  self._default_encoding = encoding
80
87
  self._default_sample_rate = sample_rate
88
+ self._default_min_volume = min_volume
89
+ self._default_max_silence_duration_secs = max_silence_duration_secs
81
90
 
82
91
  if self.websocket is None or self._is_websocket_closed():
83
92
  route = "stt/websocket"
@@ -87,13 +96,15 @@ class AsyncSttWebsocket(SttWebsocket):
87
96
  "model": model,
88
97
  "api_key": self.api_key,
89
98
  "cartesia_version": self.cartesia_version,
99
+ "encoding": encoding,
100
+ "sample_rate": str(sample_rate),
90
101
  }
91
102
  if language is not None:
92
103
  params["language"] = language
93
- if encoding is not None:
94
- params["encoding"] = encoding
95
- if sample_rate is not None:
96
- params["sample_rate"] = str(sample_rate)
104
+ if min_volume is not None:
105
+ params["min_volume"] = str(min_volume)
106
+ if max_silence_duration_secs is not None:
107
+ params["max_silence_duration_secs"] = str(max_silence_duration_secs)
97
108
 
98
109
  query_string = "&".join([f"{k}={v}" for k, v in params.items()])
99
110
  url = f"{self.ws_url}/{route}?{query_string}"
@@ -143,6 +154,8 @@ class AsyncSttWebsocket(SttWebsocket):
143
154
  language=self._default_language,
144
155
  encoding=self._default_encoding,
145
156
  sample_rate=self._default_sample_rate,
157
+ min_volume=self._default_min_volume,
158
+ max_silence_duration_secs=self._default_max_silence_duration_secs,
146
159
  )
147
160
 
148
161
  assert self.websocket is not None, "WebSocket should be connected after connect() call"
@@ -166,76 +179,66 @@ class AsyncSttWebsocket(SttWebsocket):
166
179
  language=self._default_language,
167
180
  encoding=self._default_encoding,
168
181
  sample_rate=self._default_sample_rate,
182
+ min_volume=self._default_min_volume,
183
+ max_silence_duration_secs=self._default_max_silence_duration_secs,
169
184
  )
170
185
 
171
186
  assert self.websocket is not None, "WebSocket should be connected after connect() call"
172
187
 
173
188
  try:
174
- while True:
175
- try:
176
- msg = await asyncio.wait_for(self.websocket.receive(), timeout=self.timeout)
189
+ async for message in self.websocket:
190
+ if message.type == aiohttp.WSMsgType.TEXT:
191
+ raw_data = json.loads(message.data)
177
192
 
178
- if msg.type == aiohttp.WSMsgType.TEXT:
179
- raw_data = json.loads(msg.data)
180
-
181
- # Handle error responses
182
- if raw_data.get("type") == "error":
183
- raise RuntimeError(f"Error transcribing audio: {raw_data.get('message', 'Unknown error')}")
184
-
185
- # Handle transcript responses with flexible parsing
186
- if raw_data.get("type") == "transcript":
187
- # Provide defaults for missing required fields
188
- result = {
189
- "type": raw_data["type"],
190
- "request_id": raw_data.get("request_id", ""),
191
- "text": raw_data.get("text", ""), # Default to empty string if missing
192
- "is_final": raw_data.get("is_final", False), # Default to False if missing
193
- }
194
-
195
- # Add optional fields if present
196
- if "duration" in raw_data:
197
- result["duration"] = raw_data["duration"]
198
- if "language" in raw_data:
199
- result["language"] = raw_data["language"]
200
-
201
- yield result
193
+ # Handle error responses
194
+ if raw_data.get("type") == "error":
195
+ raise RuntimeError(f"Error transcribing audio: {raw_data.get('message', 'Unknown error')}")
196
+
197
+ # Handle transcript responses with flexible parsing
198
+ if raw_data.get("type") == "transcript":
199
+ # Provide defaults for missing required fields
200
+ result = {
201
+ "type": raw_data["type"],
202
+ "request_id": raw_data.get("request_id", ""),
203
+ "text": raw_data.get("text", ""), # Default to empty string if missing
204
+ "is_final": raw_data.get("is_final", False), # Default to False if missing
205
+ }
202
206
 
203
- # Handle flush_done acknowledgment
204
- elif raw_data.get("type") == "flush_done":
205
- result = {
206
- "type": raw_data["type"],
207
- "request_id": raw_data.get("request_id", ""),
208
- }
209
- yield result
207
+ # Add optional fields if present
208
+ if "duration" in raw_data:
209
+ result["duration"] = raw_data["duration"]
210
+ if "language" in raw_data:
211
+ result["language"] = raw_data["language"]
212
+ if "words" in raw_data:
213
+ result["words"] = raw_data["words"]
210
214
 
211
- # Handle done acknowledgment - session complete
212
- elif raw_data.get("type") == "done":
213
- result = {
214
- "type": raw_data["type"],
215
- "request_id": raw_data.get("request_id", ""),
216
- }
217
- yield result
218
- # Session is complete, break out of loop
219
- break
215
+ yield result
220
216
 
221
- elif msg.type == aiohttp.WSMsgType.ERROR:
222
- websocket_exception = self.websocket.exception() if self.websocket else None
223
- await self.close()
224
- raise RuntimeError(f"WebSocket error: {websocket_exception}")
217
+ # Handle flush_done acknowledgment
218
+ elif raw_data.get("type") == "flush_done":
219
+ result = {
220
+ "type": raw_data["type"],
221
+ "request_id": raw_data.get("request_id", ""),
222
+ }
223
+ yield result
225
224
 
226
- elif msg.type == aiohttp.WSMsgType.CLOSE:
227
- break
225
+ # Handle done acknowledgment
226
+ elif raw_data.get("type") == "done":
227
+ result = {
228
+ "type": raw_data["type"],
229
+ "request_id": raw_data.get("request_id", ""),
230
+ }
231
+ yield result
232
+ break # Exit the loop when done
228
233
 
229
- except asyncio.TimeoutError:
230
- await self.close()
231
- raise RuntimeError("Timeout while waiting for transcription")
232
- except Exception as inner_e:
233
- await self.close()
234
- raise RuntimeError(f"Error receiving transcription: {inner_e}")
235
-
234
+ elif message.type == aiohttp.WSMsgType.ERROR:
235
+ error_message = f"WebSocket error: {self.websocket.exception()}"
236
+ raise RuntimeError(error_message)
237
+ elif message.type == aiohttp.WSMsgType.CLOSE:
238
+ break # WebSocket was closed
236
239
  except Exception as e:
237
240
  await self.close()
238
- raise RuntimeError(f"Failed to receive transcription. {e}")
241
+ raise e
239
242
 
240
243
  async def transcribe( # type: ignore[override]
241
244
  self,
@@ -243,17 +246,21 @@ class AsyncSttWebsocket(SttWebsocket):
243
246
  *,
244
247
  model: str = "ink-whisper",
245
248
  language: Optional[str] = "en",
246
- encoding: Optional[str] = "pcm_s16le",
249
+ encoding: SttEncoding = "pcm_s16le",
247
250
  sample_rate: int = 16000,
251
+ min_volume: Optional[float] = None,
252
+ max_silence_duration_secs: Optional[float] = None,
248
253
  ) -> AsyncGenerator[Dict[str, Any], None]:
249
254
  """Transcribe audio chunks using the WebSocket.
250
255
 
251
256
  Args:
252
257
  audio_chunks: Async iterator of audio chunks as bytes
253
- model: ID of the model to use for transcription
254
- language: The language of the input audio in ISO-639-1 format
255
- encoding: The encoding format of the audio data
256
- sample_rate: The sample rate of the audio in Hz
258
+ model: ID of the model to use for transcription (required)
259
+ language: The language of the input audio in ISO-639-1 format (defaults to "en")
260
+ encoding: The encoding format of the audio data (required)
261
+ sample_rate: The sample rate of the audio in Hz (required)
262
+ min_volume: Volume threshold for voice activity detection (0.0-1.0)
263
+ max_silence_duration_secs: Maximum duration of silence before endpointing
257
264
 
258
265
  Yields:
259
266
  Dictionary containing transcription results, flush_done, done, or error messages
@@ -263,6 +270,8 @@ class AsyncSttWebsocket(SttWebsocket):
263
270
  language=language,
264
271
  encoding=encoding,
265
272
  sample_rate=sample_rate,
273
+ min_volume=min_volume,
274
+ max_silence_duration_secs=max_silence_duration_secs,
266
275
  )
267
276
 
268
277
  try: