cartesia 2.0.5__py3-none-any.whl → 2.0.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cartesia/__init__.py +14 -0
- cartesia/auth/client.py +8 -8
- cartesia/auth/requests/token_grant.py +7 -1
- cartesia/auth/requests/token_request.py +3 -3
- cartesia/auth/types/token_grant.py +7 -2
- cartesia/auth/types/token_request.py +3 -3
- cartesia/core/client_wrapper.py +1 -1
- cartesia/stt/__init__.py +6 -0
- cartesia/stt/_async_websocket.py +81 -72
- cartesia/stt/_websocket.py +42 -20
- cartesia/stt/client.py +456 -0
- cartesia/stt/requests/__init__.py +2 -0
- cartesia/stt/requests/streaming_transcription_response.py +2 -0
- cartesia/stt/requests/transcript_message.py +8 -1
- cartesia/stt/requests/transcription_response.py +8 -1
- cartesia/stt/requests/transcription_word.py +20 -0
- cartesia/stt/socket_client.py +52 -109
- cartesia/stt/types/__init__.py +4 -0
- cartesia/stt/types/streaming_transcription_response.py +2 -0
- cartesia/stt/types/stt_encoding.py +3 -1
- cartesia/stt/types/timestamp_granularity.py +5 -0
- cartesia/stt/types/transcript_message.py +7 -1
- cartesia/stt/types/transcription_response.py +7 -1
- cartesia/stt/types/transcription_word.py +32 -0
- cartesia/tts/__init__.py +8 -0
- cartesia/tts/client.py +50 -8
- cartesia/tts/requests/__init__.py +4 -0
- cartesia/tts/requests/generation_request.py +4 -4
- cartesia/tts/requests/sse_output_format.py +11 -0
- cartesia/tts/requests/ttssse_request.py +47 -0
- cartesia/tts/requests/web_socket_chunk_response.py +0 -3
- cartesia/tts/requests/web_socket_response.py +1 -2
- cartesia/tts/requests/web_socket_tts_request.py +9 -1
- cartesia/tts/types/__init__.py +4 -0
- cartesia/tts/types/generation_request.py +4 -4
- cartesia/tts/types/sse_output_format.py +22 -0
- cartesia/tts/types/ttssse_request.py +58 -0
- cartesia/tts/types/web_socket_chunk_response.py +1 -3
- cartesia/tts/types/web_socket_response.py +1 -2
- cartesia/tts/types/web_socket_tts_request.py +11 -3
- cartesia/voice_changer/requests/streaming_response.py +0 -2
- cartesia/voice_changer/types/streaming_response.py +0 -2
- {cartesia-2.0.5.dist-info → cartesia-2.0.6.dist-info}/METADATA +113 -16
- {cartesia-2.0.5.dist-info → cartesia-2.0.6.dist-info}/RECORD +45 -37
- {cartesia-2.0.5.dist-info → cartesia-2.0.6.dist-info}/WHEEL +0 -0
@@ -4,8 +4,8 @@ from __future__ import annotations
 import typing_extensions
 import typing
 import typing_extensions
-from ..types.flush_id import FlushId
 from ..types.context_id import ContextId
+from ..types.flush_id import FlushId
 from .word_timestamps import WordTimestampsParams
 from .phoneme_timestamps import PhonemeTimestampsParams
 
@@ -14,7 +14,6 @@ class WebSocketResponse_ChunkParams(typing_extensions.TypedDict):
     type: typing.Literal["chunk"]
     data: str
     step_time: float
-    flush_id: typing_extensions.NotRequired[FlushId]
     context_id: typing_extensions.NotRequired[ContextId]
     status_code: int
     done: bool
@@ -20,8 +20,16 @@ class WebSocketTtsRequestParams(typing_extensions.TypedDict):
     duration: typing_extensions.NotRequired[int]
     language: typing_extensions.NotRequired[str]
     add_timestamps: typing_extensions.NotRequired[bool]
-
+    """
+    Whether to return word-level timestamps. If `false` (default), no word timestamps will be produced at all. If `true`, the server will return timestamp events containing word-level timing information.
+    """
+
     add_phoneme_timestamps: typing_extensions.NotRequired[bool]
+    """
+    Whether to return phoneme-level timestamps. If `false` (default), no phoneme timestamps will be produced - if `add_timestamps` is `true`, the produced timestamps will be word timestamps instead. If `true`, the server will return timestamp events containing phoneme-level timing information.
+    """
+
+    use_normalized_timestamps: typing_extensions.NotRequired[bool]
     continue_: typing_extensions.NotRequired[typing_extensions.Annotated[bool, FieldMetadata(alias="continue")]]
     context_id: typing_extensions.NotRequired[str]
     max_buffer_delay_ms: typing_extensions.NotRequired[int]
cartesia/tts/types/__init__.py
CHANGED
@@ -15,11 +15,13 @@ from .phoneme_timestamps import PhonemeTimestamps
 from .raw_encoding import RawEncoding
 from .raw_output_format import RawOutputFormat
 from .speed import Speed
+from .sse_output_format import SseOutputFormat
 from .supported_language import SupportedLanguage
 from .tts_request import TtsRequest
 from .tts_request_embedding_specifier import TtsRequestEmbeddingSpecifier
 from .tts_request_id_specifier import TtsRequestIdSpecifier
 from .tts_request_voice_specifier import TtsRequestVoiceSpecifier
+from .ttssse_request import TtssseRequest
 from .wav_output_format import WavOutputFormat
 from .web_socket_base_response import WebSocketBaseResponse
 from .web_socket_chunk_response import WebSocketChunkResponse
@@ -63,11 +65,13 @@ __all__ = [
     "RawEncoding",
     "RawOutputFormat",
     "Speed",
+    "SseOutputFormat",
     "SupportedLanguage",
     "TtsRequest",
     "TtsRequestEmbeddingSpecifier",
     "TtsRequestIdSpecifier",
     "TtsRequestVoiceSpecifier",
+    "TtssseRequest",
     "WavOutputFormat",
     "WebSocketBaseResponse",
     "WebSocketChunkResponse",
@@ -59,17 +59,17 @@ class GenerationRequest(UniversalBaseModel):
 
     add_timestamps: typing.Optional[bool] = pydantic.Field(default=None)
     """
-    Whether to return word-level timestamps.
+    Whether to return word-level timestamps. If `false` (default), no word timestamps will be produced at all. If `true`, the server will return timestamp events containing word-level timing information.
     """
 
     add_phoneme_timestamps: typing.Optional[bool] = pydantic.Field(default=None)
     """
-    Whether to return phoneme-level timestamps.
+    Whether to return phoneme-level timestamps. If `false` (default), no phoneme timestamps will be produced. If `true`, the server will return timestamp events containing phoneme-level timing information.
     """
 
-
+    use_normalized_timestamps: typing.Optional[bool] = pydantic.Field(default=None)
     """
-    Whether to use
+    Whether to use normalized timestamps (True) or original timestamps (False).
    """
 
     if IS_PYDANTIC_V2:
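For orientation, here is a minimal, hypothetical sketch of the three timestamp flags this hunk documents on `GenerationRequest`. The field names come from the diff above; the surrounding payload is illustrative only and omits the rest of the request.

```python
# Illustrative sketch only: the timestamp flags documented in 2.0.6.
# Field names are taken from the GenerationRequest hunk above; a real request
# would also carry model, transcript, voice, and output-format fields.
timestamp_options = {
    "add_timestamps": True,              # emit word-level timestamp events
    "add_phoneme_timestamps": True,      # emit phoneme-level timestamp events
    "use_normalized_timestamps": False,  # keep original (non-normalized) timings
}
```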
@@ -0,0 +1,22 @@
+# This file was auto-generated by Fern from our API Definition.
+
+from ...core.pydantic_utilities import UniversalBaseModel
+import typing
+from .raw_encoding import RawEncoding
+from ...core.pydantic_utilities import IS_PYDANTIC_V2
+import pydantic
+
+
+class SseOutputFormat(UniversalBaseModel):
+    container: typing.Literal["raw"] = "raw"
+    encoding: RawEncoding
+    sample_rate: int
+
+    if IS_PYDANTIC_V2:
+        model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True)  # type: ignore # Pydantic v2
+    else:
+
+        class Config:
+            frozen = True
+            smart_union = True
+            extra = pydantic.Extra.allow
@@ -0,0 +1,58 @@
+# This file was auto-generated by Fern from our API Definition.
+
+from ...core.pydantic_utilities import UniversalBaseModel
+import pydantic
+from .tts_request_voice_specifier import TtsRequestVoiceSpecifier
+import typing
+from .supported_language import SupportedLanguage
+from .sse_output_format import SseOutputFormat
+from .model_speed import ModelSpeed
+from .context_id import ContextId
+from ...core.pydantic_utilities import IS_PYDANTIC_V2
+
+
+class TtssseRequest(UniversalBaseModel):
+    model_id: str = pydantic.Field()
+    """
+    The ID of the model to use for the generation. See [Models](/build-with-cartesia/models) for available models.
+    """
+
+    transcript: str
+    voice: TtsRequestVoiceSpecifier
+    language: typing.Optional[SupportedLanguage] = None
+    output_format: SseOutputFormat
+    duration: typing.Optional[float] = pydantic.Field(default=None)
+    """
+    The maximum duration of the audio in seconds. You do not usually need to specify this.
+    If the duration is not appropriate for the length of the transcript, the output audio may be truncated.
+    """
+
+    speed: typing.Optional[ModelSpeed] = None
+    add_timestamps: typing.Optional[bool] = pydantic.Field(default=None)
+    """
+    Whether to return word-level timestamps. If `false` (default), no word timestamps will be produced at all. If `true`, the server will return timestamp events containing word-level timing information.
+    """
+
+    add_phoneme_timestamps: typing.Optional[bool] = pydantic.Field(default=None)
+    """
+    Whether to return phoneme-level timestamps. If `false` (default), no phoneme timestamps will be produced - if `add_timestamps` is `true`, the produced timestamps will be word timestamps instead. If `true`, the server will return timestamp events containing phoneme-level timing information.
+    """
+
+    use_normalized_timestamps: typing.Optional[bool] = pydantic.Field(default=None)
+    """
+    Whether to use normalized timestamps (True) or original timestamps (False).
+    """
+
+    context_id: typing.Optional[ContextId] = pydantic.Field(default=None)
+    """
+    Optional context ID for this request.
+    """
+
+    if IS_PYDANTIC_V2:
+        model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True)  # type: ignore # Pydantic v2
+    else:
+
+        class Config:
+            frozen = True
+            smart_union = True
+            extra = pydantic.Extra.allow
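A minimal sketch of how the two new models above could be constructed, assuming they are imported from `cartesia.tts.types` as the `__init__.py` hunk earlier indicates. The model and voice ids are placeholders, and the `voice` dict assumes the id-specifier shape (`mode`/`id`) of `TtsRequestVoiceSpecifier`.

```python
# Hypothetical usage sketch of the new 2.0.6 SSE request models.
# Field names come from the diff above; the concrete values are placeholders.
from cartesia.tts.types import SseOutputFormat, TtssseRequest

output_format = SseOutputFormat(
    container="raw",       # the only container SseOutputFormat allows
    encoding="pcm_f32le",  # any RawEncoding value
    sample_rate=44100,
)

request = TtssseRequest(
    model_id="your-model-id",                     # placeholder model id
    transcript="Hello from the SSE endpoint.",
    voice={"mode": "id", "id": "your-voice-id"},  # assumed id-specifier shape
    output_format=output_format,
    add_timestamps=True,                          # request word-level timestamp events
)
```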
@@ -1,16 +1,14 @@
 # This file was auto-generated by Fern from our API Definition.
 
 from .web_socket_base_response import WebSocketBaseResponse
-import typing
-from .flush_id import FlushId
 from ...core.pydantic_utilities import IS_PYDANTIC_V2
+import typing
 import pydantic
 
 
 class WebSocketChunkResponse(WebSocketBaseResponse):
     data: str
     step_time: float
-    flush_id: typing.Optional[FlushId] = None
 
     if IS_PYDANTIC_V2:
         model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True)  # type: ignore # Pydantic v2
@@ -3,10 +3,10 @@
 from __future__ import annotations
 from ...core.pydantic_utilities import UniversalBaseModel
 import typing
-from .flush_id import FlushId
 from .context_id import ContextId
 from ...core.pydantic_utilities import IS_PYDANTIC_V2
 import pydantic
+from .flush_id import FlushId
 from .word_timestamps import WordTimestamps
 from .phoneme_timestamps import PhonemeTimestamps
 
@@ -15,7 +15,6 @@ class WebSocketResponse_Chunk(UniversalBaseModel):
     type: typing.Literal["chunk"] = "chunk"
     data: str
     step_time: float
-    flush_id: typing.Optional[FlushId] = None
     context_id: typing.Optional[ContextId] = None
     status_code: int
     done: bool
@@ -22,9 +22,17 @@ class WebSocketTtsRequest(UniversalBaseModel):
     voice: TtsRequestVoiceSpecifier
     duration: typing.Optional[int] = None
     language: typing.Optional[str] = None
-    add_timestamps: typing.Optional[bool] = None
-
-
+    add_timestamps: typing.Optional[bool] = pydantic.Field(default=None)
+    """
+    Whether to return word-level timestamps. If `false` (default), no word timestamps will be produced at all. If `true`, the server will return timestamp events containing word-level timing information.
+    """
+
+    add_phoneme_timestamps: typing.Optional[bool] = pydantic.Field(default=None)
+    """
+    Whether to return phoneme-level timestamps. If `false` (default), no phoneme timestamps will be produced - if `add_timestamps` is `true`, the produced timestamps will be word timestamps instead. If `true`, the server will return timestamp events containing phoneme-level timing information.
+    """
+
+    use_normalized_timestamps: typing.Optional[bool] = None
     continue_: typing_extensions.Annotated[typing.Optional[bool], FieldMetadata(alias="continue")] = None
     context_id: typing.Optional[str] = None
     max_buffer_delay_ms: typing.Optional[int] = None
@@ -4,7 +4,6 @@ from __future__ import annotations
 import typing_extensions
 import typing
 import typing_extensions
-from ...tts.types.flush_id import FlushId
 from ...tts.types.context_id import ContextId
 
 
@@ -12,7 +11,6 @@ class StreamingResponse_ChunkParams(typing_extensions.TypedDict):
     type: typing.Literal["chunk"]
     data: str
     step_time: float
-    flush_id: typing_extensions.NotRequired[FlushId]
     context_id: typing_extensions.NotRequired[ContextId]
     status_code: int
     done: bool
@@ -3,7 +3,6 @@
 from __future__ import annotations
 from ...core.pydantic_utilities import UniversalBaseModel
 import typing
-from ...tts.types.flush_id import FlushId
 from ...tts.types.context_id import ContextId
 from ...core.pydantic_utilities import IS_PYDANTIC_V2
 import pydantic
@@ -13,7 +12,6 @@ class StreamingResponse_Chunk(UniversalBaseModel):
     type: typing.Literal["chunk"] = "chunk"
     data: str
     step_time: float
-    flush_id: typing.Optional[FlushId] = None
     context_id: typing.Optional[ContextId] = None
     status_code: int
     done: bool
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: cartesia
-Version: 2.0.5
+Version: 2.0.6
 Summary: 
 Requires-Python: >=3.8,<4.0
 Classifier: Intended Audience :: Developers
@@ -230,12 +230,14 @@ with open("path/to/audio.wav", "rb") as f:
 chunk_size = 640
 audio_chunks = [audio_data[i:i+chunk_size] for i in range(0, len(audio_data), chunk_size)]
 
-# Create websocket connection
+# Create websocket connection with endpointing parameters
 ws = client.stt.websocket(
-    model="ink-whisper",
-    language="en",
-    encoding="pcm_s16le",
-    sample_rate=16000,
+    model="ink-whisper",            # Model (required)
+    language="en",                  # Language of your audio (required)
+    encoding="pcm_s16le",           # Audio encoding format (required)
+    sample_rate=16000,              # Audio sample rate (required)
+    min_volume=0.1,                 # Volume threshold for voice activity detection
+    max_silence_duration_secs=0.4,  # Maximum silence duration before endpointing
 )
 
 # Send audio chunks (streaming approach)
@@ -246,10 +248,20 @@ for chunk in audio_chunks:
 ws.send("finalize")
 ws.send("done")
 
-# Receive transcription results
+# Receive transcription results with word-level timestamps
 for result in ws.receive():
     if result['type'] == 'transcript':
         print(f"Transcription: {result['text']}")
+
+        # Handle word-level timestamps if available
+        if 'words' in result and result['words']:
+            print("Word-level timestamps:")
+            for word_info in result['words']:
+                word = word_info['word']
+                start = word_info['start']
+                end = word_info['end']
+                print(f"  '{word}': {start:.2f}s - {end:.2f}s")
+
         if result['is_final']:
             print("Final result received")
     elif result['type'] == 'done':
@@ -270,17 +282,20 @@ from cartesia import AsyncCartesia
 
 async def streaming_stt_example():
     """
     Advanced async STT example for real-time streaming applications.
-    This example simulates streaming audio processing with proper error handling
+    This example simulates streaming audio processing with proper error handling
+    and demonstrates the new endpointing and word timestamp features.
     """
     client = AsyncCartesia(api_key=os.getenv("CARTESIA_API_KEY"))
 
     try:
-        # Create websocket connection
+        # Create websocket connection with voice activity detection
         ws = await client.stt.websocket(
-            model="ink-whisper",
-            language="en",
-            encoding="pcm_s16le",
-            sample_rate=16000,
+            model="ink-whisper",            # Model (required)
+            language="en",                  # Language of your audio (required)
+            encoding="pcm_s16le",           # Audio encoding format (required)
+            sample_rate=16000,              # Audio sample rate (required)
+            min_volume=0.15,                # Volume threshold for voice activity detection
+            max_silence_duration_secs=0.3,  # Maximum silence duration before endpointing
         )
 
         # Simulate streaming audio data (replace with your audio source)
@@ -319,8 +334,9 @@ async def streaming_stt_example():
                 print(f"Error sending audio: {e}")
 
         async def receive_transcripts():
-            """Receive and process transcription results"""
+            """Receive and process transcription results with word timestamps"""
             full_transcript = ""
+            all_word_timestamps = []
 
             try:
                 async for result in ws.receive():
@@ -328,6 +344,19 @@ async def streaming_stt_example():
                     text = result['text']
                     is_final = result['is_final']
 
+                    # Handle word-level timestamps
+                    if 'words' in result and result['words']:
+                        word_timestamps = result['words']
+                        all_word_timestamps.extend(word_timestamps)
+
+                        if is_final:
+                            print("Word-level timestamps:")
+                            for word_info in word_timestamps:
+                                word = word_info['word']
+                                start = word_info['start']
+                                end = word_info['end']
+                                print(f"  '{word}': {start:.2f}s - {end:.2f}s")
+
                     if is_final:
                         # Final result - this text won't change
                         full_transcript += text + " "
@@ -343,17 +372,18 @@ async def streaming_stt_example():
             except Exception as e:
                 print(f"Error receiving transcripts: {e}")
 
-            return full_transcript.strip()
+            return full_transcript.strip(), all_word_timestamps
 
         print("Starting streaming STT...")
 
         # Use asyncio.gather to run audio sending and transcript receiving concurrently
-        _, final_transcript = await asyncio.gather(
+        _, (final_transcript, word_timestamps) = await asyncio.gather(
            send_audio(),
            receive_transcripts()
        )
 
         print(f"\nComplete transcript: {final_transcript}")
+        print(f"Total words with timestamps: {len(word_timestamps)}")
 
         # Clean up
         await ws.close()
@@ -368,6 +398,73 @@ if __name__ == "__main__":
     asyncio.run(streaming_stt_example())
 ```
 
+## Batch Speech-to-Text (STT)
+
+For processing pre-recorded audio files, use the batch STT API which supports uploading complete audio files for transcription:
+
+```python
+from cartesia import Cartesia
+import os
+
+client = Cartesia(api_key=os.getenv("CARTESIA_API_KEY"))
+
+# Transcribe an audio file with word-level timestamps
+with open("path/to/audio.wav", "rb") as audio_file:
+    response = client.stt.transcribe(
+        file=audio_file,                   # Audio file to transcribe
+        model="ink-whisper",               # STT model (required)
+        language="en",                     # Language of the audio (optional)
+        timestamp_granularities=["word"],  # Include word-level timestamps (optional)
+        encoding="pcm_s16le",              # Audio encoding (optional)
+        sample_rate=16000,                 # Audio sample rate (optional)
+    )
+
+# Access transcription results
+print(f"Transcribed text: {response.text}")
+print(f"Audio duration: {response.duration:.2f} seconds")
+
+# Process word-level timestamps if requested
+if response.words:
+    print("\nWord-level timestamps:")
+    for word_info in response.words:
+        word = word_info.word
+        start = word_info.start
+        end = word_info.end
+        print(f"  '{word}': {start:.2f}s - {end:.2f}s")
+```
+
+### Async Batch STT
+
+```python
+import asyncio
+from cartesia import AsyncCartesia
+import os
+
+async def transcribe_file():
+    client = AsyncCartesia(api_key=os.getenv("CARTESIA_API_KEY"))
+
+    with open("path/to/audio.wav", "rb") as audio_file:
+        response = await client.stt.transcribe(
+            file=audio_file,
+            model="ink-whisper",
+            language="en",
+            timestamp_granularities=["word"],
+        )
+
+    print(f"Transcribed text: {response.text}")
+
+    # Process word timestamps
+    if response.words:
+        for word_info in response.words:
+            print(f"'{word_info.word}': {word_info.start:.2f}s - {word_info.end:.2f}s")
+
+    await client.close()
+
+asyncio.run(transcribe_file())
+```
+
+> **Note:** Batch STT also supports OpenAI's audio transcriptions format for easy migration from OpenAI Whisper. See our [migration guide](https://docs.cartesia.ai/api-reference/stt/migrate-from-open-ai) for details.
+
 ## Voices
 
 List all available Voices with `client.voices.list`, which returns an iterable that automatically handles pagination: