cartesia 2.0.4.tar.gz → 2.0.6.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {cartesia-2.0.4 → cartesia-2.0.6}/PKG-INFO +256 -2
- {cartesia-2.0.4 → cartesia-2.0.6}/README.md +255 -1
- {cartesia-2.0.4 → cartesia-2.0.6}/pyproject.toml +1 -1
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/__init__.py +60 -1
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/auth/client.py +8 -8
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/auth/requests/token_grant.py +7 -1
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/auth/requests/token_request.py +3 -3
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/auth/types/token_grant.py +7 -2
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/auth/types/token_request.py +3 -3
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/base_client.py +2 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/client.py +5 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/core/client_wrapper.py +1 -1
- cartesia-2.0.6/src/cartesia/stt/__init__.py +57 -0
- cartesia-2.0.6/src/cartesia/stt/_async_websocket.py +293 -0
- cartesia-2.0.6/src/cartesia/stt/_websocket.py +294 -0
- cartesia-2.0.6/src/cartesia/stt/client.py +456 -0
- cartesia-2.0.6/src/cartesia/stt/requests/__init__.py +29 -0
- cartesia-2.0.6/src/cartesia/stt/requests/done_message.py +14 -0
- cartesia-2.0.6/src/cartesia/stt/requests/error_message.py +16 -0
- cartesia-2.0.6/src/cartesia/stt/requests/flush_done_message.py +14 -0
- cartesia-2.0.6/src/cartesia/stt/requests/streaming_transcription_response.py +41 -0
- cartesia-2.0.6/src/cartesia/stt/requests/transcript_message.py +40 -0
- cartesia-2.0.6/src/cartesia/stt/requests/transcription_response.py +28 -0
- cartesia-2.0.6/src/cartesia/stt/requests/transcription_word.py +20 -0
- cartesia-2.0.6/src/cartesia/stt/socket_client.py +138 -0
- cartesia-2.0.6/src/cartesia/stt/types/__init__.py +33 -0
- cartesia-2.0.6/src/cartesia/stt/types/done_message.py +26 -0
- cartesia-2.0.6/src/cartesia/stt/types/error_message.py +27 -0
- cartesia-2.0.6/src/cartesia/stt/types/flush_done_message.py +26 -0
- cartesia-2.0.6/src/cartesia/stt/types/streaming_transcription_response.py +94 -0
- cartesia-2.0.6/src/cartesia/stt/types/stt_encoding.py +7 -0
- cartesia-2.0.6/src/cartesia/stt/types/timestamp_granularity.py +5 -0
- cartesia-2.0.6/src/cartesia/stt/types/transcript_message.py +50 -0
- cartesia-2.0.6/src/cartesia/stt/types/transcription_response.py +38 -0
- cartesia-2.0.6/src/cartesia/stt/types/transcription_word.py +32 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/tts/__init__.py +8 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/tts/client.py +50 -8
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/tts/requests/__init__.py +4 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/tts/requests/generation_request.py +4 -4
- cartesia-2.0.6/src/cartesia/tts/requests/sse_output_format.py +11 -0
- cartesia-2.0.6/src/cartesia/tts/requests/ttssse_request.py +47 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/tts/requests/web_socket_chunk_response.py +0 -3
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/tts/requests/web_socket_response.py +1 -2
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/tts/requests/web_socket_tts_request.py +9 -1
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/tts/types/__init__.py +4 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/tts/types/generation_request.py +4 -4
- cartesia-2.0.6/src/cartesia/tts/types/sse_output_format.py +22 -0
- cartesia-2.0.6/src/cartesia/tts/types/ttssse_request.py +58 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/tts/types/web_socket_chunk_response.py +1 -3
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/tts/types/web_socket_response.py +1 -2
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/tts/types/web_socket_tts_request.py +11 -3
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/voice_changer/requests/streaming_response.py +0 -2
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/voice_changer/types/streaming_response.py +0 -2
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/api_status/__init__.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/api_status/client.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/api_status/requests/__init__.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/api_status/requests/api_info.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/api_status/types/__init__.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/api_status/types/api_info.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/auth/__init__.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/auth/requests/__init__.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/auth/requests/token_response.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/auth/types/__init__.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/auth/types/token_response.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/core/__init__.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/core/api_error.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/core/datetime_utils.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/core/file.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/core/http_client.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/core/jsonable_encoder.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/core/pagination.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/core/pydantic_utilities.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/core/query_encoder.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/core/remove_none_from_dict.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/core/request_options.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/core/serialization.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/datasets/__init__.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/datasets/requests/__init__.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/datasets/requests/create_dataset_request.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/datasets/requests/dataset.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/datasets/requests/dataset_file.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/datasets/requests/paginated_dataset_files.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/datasets/requests/paginated_datasets.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/datasets/types/__init__.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/datasets/types/create_dataset_request.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/datasets/types/dataset.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/datasets/types/dataset_file.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/datasets/types/file_purpose.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/datasets/types/paginated_dataset_files.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/datasets/types/paginated_datasets.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/embedding/__init__.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/embedding/types/__init__.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/embedding/types/embedding.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/environment.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/infill/__init__.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/infill/client.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/py.typed +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/tts/_async_websocket.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/tts/_websocket.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/tts/requests/cancel_context_request.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/tts/requests/controls.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/tts/requests/mp_3_output_format.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/tts/requests/output_format.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/tts/requests/phoneme_timestamps.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/tts/requests/raw_output_format.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/tts/requests/speed.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/tts/requests/tts_request.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/tts/requests/tts_request_embedding_specifier.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/tts/requests/tts_request_id_specifier.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/tts/requests/tts_request_voice_specifier.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/tts/requests/wav_output_format.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/tts/requests/web_socket_base_response.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/tts/requests/web_socket_done_response.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/tts/requests/web_socket_error_response.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/tts/requests/web_socket_flush_done_response.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/tts/requests/web_socket_phoneme_timestamps_response.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/tts/requests/web_socket_raw_output_format.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/tts/requests/web_socket_request.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/tts/requests/web_socket_stream_options.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/tts/requests/web_socket_timestamps_response.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/tts/requests/web_socket_tts_output.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/tts/requests/word_timestamps.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/tts/socket_client.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/tts/types/cancel_context_request.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/tts/types/context_id.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/tts/types/controls.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/tts/types/emotion.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/tts/types/flush_id.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/tts/types/model_speed.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/tts/types/mp_3_output_format.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/tts/types/natural_specifier.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/tts/types/numerical_specifier.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/tts/types/output_format.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/tts/types/phoneme_timestamps.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/tts/types/raw_encoding.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/tts/types/raw_output_format.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/tts/types/speed.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/tts/types/supported_language.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/tts/types/tts_request.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/tts/types/tts_request_embedding_specifier.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/tts/types/tts_request_id_specifier.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/tts/types/tts_request_voice_specifier.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/tts/types/wav_output_format.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/tts/types/web_socket_base_response.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/tts/types/web_socket_done_response.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/tts/types/web_socket_error_response.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/tts/types/web_socket_flush_done_response.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/tts/types/web_socket_phoneme_timestamps_response.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/tts/types/web_socket_raw_output_format.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/tts/types/web_socket_request.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/tts/types/web_socket_stream_options.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/tts/types/web_socket_timestamps_response.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/tts/types/web_socket_tts_output.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/tts/types/word_timestamps.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/tts/utils/constants.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/tts/utils/tts.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/tts/utils/types.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/version.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/voice_changer/__init__.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/voice_changer/client.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/voice_changer/requests/__init__.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/voice_changer/types/__init__.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/voice_changer/types/output_format_container.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/voices/__init__.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/voices/client.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/voices/requests/__init__.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/voices/requests/create_voice_request.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/voices/requests/embedding_response.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/voices/requests/embedding_specifier.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/voices/requests/get_voices_response.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/voices/requests/id_specifier.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/voices/requests/localize_dialect.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/voices/requests/localize_voice_request.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/voices/requests/mix_voice_specifier.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/voices/requests/mix_voices_request.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/voices/requests/update_voice_request.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/voices/requests/voice.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/voices/requests/voice_metadata.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/voices/types/__init__.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/voices/types/base_voice_id.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/voices/types/clone_mode.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/voices/types/create_voice_request.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/voices/types/embedding_response.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/voices/types/embedding_specifier.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/voices/types/gender.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/voices/types/gender_presentation.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/voices/types/get_voices_response.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/voices/types/id_specifier.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/voices/types/localize_dialect.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/voices/types/localize_english_dialect.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/voices/types/localize_french_dialect.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/voices/types/localize_portuguese_dialect.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/voices/types/localize_spanish_dialect.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/voices/types/localize_target_language.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/voices/types/localize_voice_request.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/voices/types/mix_voice_specifier.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/voices/types/mix_voices_request.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/voices/types/update_voice_request.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/voices/types/voice.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/voices/types/voice_expand_options.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/voices/types/voice_id.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/voices/types/voice_metadata.py +0 -0
- {cartesia-2.0.4 → cartesia-2.0.6}/src/cartesia/voices/types/weight.py +0 -0
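The headline change in 2.0.6 is the new `stt` package (a REST client, sync/async websocket clients, and request/response types). As an orientation, here is a minimal sketch that uses only names appearing in this file list and in the README changes reproduced below; the complete, annotated examples follow in the PKG-INFO/README diffs:

```python
import os

from cartesia import Cartesia

client = Cartesia(api_key=os.getenv("CARTESIA_API_KEY"))

# Batch transcription (added in src/cartesia/stt/client.py)
with open("path/to/audio.wav", "rb") as f:
    response = client.stt.transcribe(file=f, model="ink-whisper")
print(response.text)

# Streaming transcription (added in src/cartesia/stt/socket_client.py / _websocket.py)
ws = client.stt.websocket(
    model="ink-whisper", language="en", encoding="pcm_s16le", sample_rate=16000
)
```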
--- cartesia-2.0.4/PKG-INFO
+++ cartesia-2.0.6/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: cartesia
-Version: 2.0.4
+Version: 2.0.6
 Summary:
 Requires-Python: >=3.8,<4.0
 Classifier: Intended Audience :: Developers
@@ -213,6 +213,258 @@ p.terminate()
 ws.close() # Close the websocket connection
 ```

+## Speech-to-Text (STT) with Websockets
+
+```python
+from cartesia import Cartesia
+import os
+
+client = Cartesia(api_key=os.getenv("CARTESIA_API_KEY"))
+
+# Load your audio file as bytes
+with open("path/to/audio.wav", "rb") as f:
+    audio_data = f.read()
+
+# Convert to audio chunks (20ms chunks used here for a streaming example)
+# This chunk size is calculated for 16kHz, 16-bit audio: 16000 * 0.02 * 2 = 640 bytes
+chunk_size = 640
+audio_chunks = [audio_data[i:i+chunk_size] for i in range(0, len(audio_data), chunk_size)]
+
+# Create websocket connection with endpointing parameters
+ws = client.stt.websocket(
+    model="ink-whisper",            # Model (required)
+    language="en",                  # Language of your audio (required)
+    encoding="pcm_s16le",           # Audio encoding format (required)
+    sample_rate=16000,              # Audio sample rate (required)
+    min_volume=0.1,                 # Volume threshold for voice activity detection
+    max_silence_duration_secs=0.4,  # Maximum silence duration before endpointing
+)
+
+# Send audio chunks (streaming approach)
+for chunk in audio_chunks:
+    ws.send(chunk)
+
+# Finalize and close
+ws.send("finalize")
+ws.send("done")
+
+# Receive transcription results with word-level timestamps
+for result in ws.receive():
+    if result['type'] == 'transcript':
+        print(f"Transcription: {result['text']}")
+
+        # Handle word-level timestamps if available
+        if 'words' in result and result['words']:
+            print("Word-level timestamps:")
+            for word_info in result['words']:
+                word = word_info['word']
+                start = word_info['start']
+                end = word_info['end']
+                print(f"  '{word}': {start:.2f}s - {end:.2f}s")
+
+        if result['is_final']:
+            print("Final result received")
+    elif result['type'] == 'done':
+        break
+
+ws.close()
+```
+
+### Async Streaming Speech-to-Text (STT) with Websockets
+
+For real-time streaming applications, here's a more practical async example that demonstrates concurrent audio processing and result handling:
+
+```python
+import asyncio
+import os
+from cartesia import AsyncCartesia
+
+async def streaming_stt_example():
+    """
+    Advanced async STT example for real-time streaming applications.
+    This example simulates streaming audio processing with proper error handling
+    and demonstrates the new endpointing and word timestamp features.
+    """
+    client = AsyncCartesia(api_key=os.getenv("CARTESIA_API_KEY"))
+
+    try:
+        # Create websocket connection with voice activity detection
+        ws = await client.stt.websocket(
+            model="ink-whisper",            # Model (required)
+            language="en",                  # Language of your audio (required)
+            encoding="pcm_s16le",           # Audio encoding format (required)
+            sample_rate=16000,              # Audio sample rate (required)
+            min_volume=0.15,                # Volume threshold for voice activity detection
+            max_silence_duration_secs=0.3,  # Maximum silence duration before endpointing
+        )
+
+        # Simulate streaming audio data (replace with your audio source)
+        async def audio_stream():
+            """Simulate real-time audio streaming - replace with actual audio capture"""
+            # Load audio file for simulation
+            with open("path/to/audio.wav", "rb") as f:
+                audio_data = f.read()
+
+            # Stream in 100ms chunks (realistic for real-time processing)
+            chunk_size = int(16000 * 0.1 * 2)  # 100ms at 16kHz, 16-bit
+
+            for i in range(0, len(audio_data), chunk_size):
+                chunk = audio_data[i:i + chunk_size]
+                if chunk:
+                    yield chunk
+                    # Simulate real-time streaming delay
+                    await asyncio.sleep(0.1)
+
+        # Send audio and receive results concurrently
+        async def send_audio():
+            """Send audio chunks to the STT websocket"""
+            try:
+                async for chunk in audio_stream():
+                    await ws.send(chunk)
+                    print(f"Sent audio chunk of {len(chunk)} bytes")
+                    # Small delay to simulate realtime applications
+                    await asyncio.sleep(0.02)
+
+                # Signal end of audio stream
+                await ws.send("finalize")
+                await ws.send("done")
+                print("Audio streaming completed")
+
+            except Exception as e:
+                print(f"Error sending audio: {e}")
+
+        async def receive_transcripts():
+            """Receive and process transcription results with word timestamps"""
+            full_transcript = ""
+            all_word_timestamps = []
+
+            try:
+                async for result in ws.receive():
+                    if result['type'] == 'transcript':
+                        text = result['text']
+                        is_final = result['is_final']
+
+                        # Handle word-level timestamps
+                        if 'words' in result and result['words']:
+                            word_timestamps = result['words']
+                            all_word_timestamps.extend(word_timestamps)
+
+                            if is_final:
+                                print("Word-level timestamps:")
+                                for word_info in word_timestamps:
+                                    word = word_info['word']
+                                    start = word_info['start']
+                                    end = word_info['end']
+                                    print(f"  '{word}': {start:.2f}s - {end:.2f}s")
+
+                        if is_final:
+                            # Final result - this text won't change
+                            full_transcript += text + " "
+                            print(f"FINAL: {text}")
+                        else:
+                            # Partial result - may change as more audio is processed
+                            print(f"PARTIAL: {text}")
+
+                    elif result['type'] == 'done':
+                        print("Transcription completed")
+                        break
+
+            except Exception as e:
+                print(f"Error receiving transcripts: {e}")
+
+            return full_transcript.strip(), all_word_timestamps
+
+        print("Starting streaming STT...")
+
+        # Use asyncio.gather to run audio sending and transcript receiving concurrently
+        _, (final_transcript, word_timestamps) = await asyncio.gather(
+            send_audio(),
+            receive_transcripts()
+        )
+
+        print(f"\nComplete transcript: {final_transcript}")
+        print(f"Total words with timestamps: {len(word_timestamps)}")
+
+        # Clean up
+        await ws.close()
+
+    except Exception as e:
+        print(f"STT streaming error: {e}")
+    finally:
+        await client.close()
+
+# Run the example
+if __name__ == "__main__":
+    asyncio.run(streaming_stt_example())
+```
+
+## Batch Speech-to-Text (STT)
+
+For processing pre-recorded audio files, use the batch STT API which supports uploading complete audio files for transcription:
+
+```python
+from cartesia import Cartesia
+import os
+
+client = Cartesia(api_key=os.getenv("CARTESIA_API_KEY"))
+
+# Transcribe an audio file with word-level timestamps
+with open("path/to/audio.wav", "rb") as audio_file:
+    response = client.stt.transcribe(
+        file=audio_file,                   # Audio file to transcribe
+        model="ink-whisper",               # STT model (required)
+        language="en",                     # Language of the audio (optional)
+        timestamp_granularities=["word"],  # Include word-level timestamps (optional)
+        encoding="pcm_s16le",              # Audio encoding (optional)
+        sample_rate=16000,                 # Audio sample rate (optional)
+    )
+
+# Access transcription results
+print(f"Transcribed text: {response.text}")
+print(f"Audio duration: {response.duration:.2f} seconds")
+
+# Process word-level timestamps if requested
+if response.words:
+    print("\nWord-level timestamps:")
+    for word_info in response.words:
+        word = word_info.word
+        start = word_info.start
+        end = word_info.end
+        print(f"  '{word}': {start:.2f}s - {end:.2f}s")
+```
+
+### Async Batch STT
+
+```python
+import asyncio
+from cartesia import AsyncCartesia
+import os
+
+async def transcribe_file():
+    client = AsyncCartesia(api_key=os.getenv("CARTESIA_API_KEY"))
+
+    with open("path/to/audio.wav", "rb") as audio_file:
+        response = await client.stt.transcribe(
+            file=audio_file,
+            model="ink-whisper",
+            language="en",
+            timestamp_granularities=["word"],
+        )
+
+    print(f"Transcribed text: {response.text}")
+
+    # Process word timestamps
+    if response.words:
+        for word_info in response.words:
+            print(f"'{word_info.word}': {word_info.start:.2f}s - {word_info.end:.2f}s")
+
+    await client.close()
+
+asyncio.run(transcribe_file())
+```
+
+> **Note:** Batch STT also supports OpenAI's audio transcriptions format for easy migration from OpenAI Whisper. See our [migration guide](https://docs.cartesia.ai/api-reference/stt/migrate-from-open-ai) for details.
+
 ## Voices

 List all available Voices with `client.voices.list`, which returns an iterable that automatically handles pagination:
@@ -358,7 +610,6 @@ new_voice = client.voices.create(
     language="en"
 )
 ```
-
 ### Custom Client

 You can override the `httpx` client to customize it for your use-case. Some common use-cases include support for proxies
@@ -412,3 +663,6 @@ $ git commit --amend -m "manually regenerate from docs" # optional

 From https://github.com/cartesia-ai/docs click `Actions` then `Release Python SDK`. (Requires permissions.)

+
+
+
--- cartesia-2.0.4/README.md
+++ cartesia-2.0.6/README.md
@@ -181,6 +181,258 @@ p.terminate()
@@ -326,7 +578,6 @@ new_voice = client.voices.create(
@@ -379,3 +630,6 @@ $ git commit --amend -m "manually regenerate from docs" # optional

The README.md hunks are identical to the README body embedded in the PKG-INFO diff above: the Speech-to-Text sections are inserted after the TTS websocket example, one stray blank line after the voice-creation example is removed, and three trailing blank lines are appended.
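The note in the diff above states that batch STT accepts OpenAI's audio transcriptions format. Below is a hedged sketch of what migration might look like with the official `openai` client; the `base_url` value is an assumption, not something this diff confirms, so treat the linked migration guide as authoritative:

```python
import os

from openai import OpenAI

# Assumption: Cartesia exposes an OpenAI-compatible transcriptions endpoint.
# The base_url here is hypothetical; see the migration guide for the real value.
client = OpenAI(
    api_key=os.getenv("CARTESIA_API_KEY"),
    base_url="https://api.cartesia.ai",  # hypothetical
)

with open("path/to/audio.wav", "rb") as f:
    result = client.audio.transcriptions.create(model="ink-whisper", file=f)

print(result.text)
```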
--- cartesia-2.0.4/src/cartesia/__init__.py
+++ cartesia-2.0.6/src/cartesia/__init__.py
@@ -1,6 +1,6 @@
 # This file was auto-generated by Fern from our API Definition.

-from . import api_status, auth, datasets, embedding, infill, tts, voice_changer, voices
+from . import api_status, auth, datasets, embedding, infill, stt, tts, voice_changer, voices
 from .api_status import ApiInfo, ApiInfoParams
 from .auth import TokenGrant, TokenGrantParams, TokenRequest, TokenRequestParams, TokenResponse, TokenResponseParams
 from .client import AsyncCartesia, Cartesia
@@ -19,6 +19,32 @@ from .datasets import (
 )
 from .embedding import Embedding
 from .environment import CartesiaEnvironment
+from .stt import (
+    DoneMessage,
+    DoneMessageParams,
+    ErrorMessage,
+    ErrorMessageParams,
+    FlushDoneMessage,
+    FlushDoneMessageParams,
+    StreamingTranscriptionResponse,
+    StreamingTranscriptionResponseParams,
+    StreamingTranscriptionResponse_Done,
+    StreamingTranscriptionResponse_DoneParams,
+    StreamingTranscriptionResponse_Error,
+    StreamingTranscriptionResponse_ErrorParams,
+    StreamingTranscriptionResponse_FlushDone,
+    StreamingTranscriptionResponse_FlushDoneParams,
+    StreamingTranscriptionResponse_Transcript,
+    StreamingTranscriptionResponse_TranscriptParams,
+    SttEncoding,
+    TimestampGranularity,
+    TranscriptMessage,
+    TranscriptMessageParams,
+    TranscriptionResponse,
+    TranscriptionResponseParams,
+    TranscriptionWord,
+    TranscriptionWordParams,
+)
 from .tts import (
     CancelContextRequest,
     CancelContextRequestParams,
@@ -49,6 +75,8 @@ from .tts import (
     RawOutputFormatParams,
     Speed,
     SpeedParams,
+    SseOutputFormat,
+    SseOutputFormatParams,
     SupportedLanguage,
     TtsRequest,
     TtsRequestEmbeddingSpecifier,
@@ -58,6 +86,8 @@ from .tts import (
     TtsRequestParams,
     TtsRequestVoiceSpecifier,
     TtsRequestVoiceSpecifierParams,
+    TtssseRequest,
+    TtssseRequestParams,
     WavOutputFormat,
     WavOutputFormatParams,
     WebSocketBaseResponse,
@@ -173,13 +203,19 @@ __all__ = [
     "DatasetFile",
     "DatasetFileParams",
     "DatasetParams",
+    "DoneMessage",
+    "DoneMessageParams",
     "Embedding",
     "EmbeddingResponse",
     "EmbeddingResponseParams",
     "EmbeddingSpecifier",
     "EmbeddingSpecifierParams",
     "Emotion",
+    "ErrorMessage",
+    "ErrorMessageParams",
     "FilePurpose",
+    "FlushDoneMessage",
+    "FlushDoneMessageParams",
     "FlushId",
     "Gender",
     "GenderPresentation",
@@ -227,6 +263,8 @@ __all__ = [
     "RawOutputFormatParams",
     "Speed",
     "SpeedParams",
+    "SseOutputFormat",
+    "SseOutputFormatParams",
     "StreamingResponse",
     "StreamingResponseParams",
     "StreamingResponse_Chunk",
@@ -235,13 +273,31 @@ __all__ = [
     "StreamingResponse_DoneParams",
     "StreamingResponse_Error",
     "StreamingResponse_ErrorParams",
+    "StreamingTranscriptionResponse",
+    "StreamingTranscriptionResponseParams",
+    "StreamingTranscriptionResponse_Done",
+    "StreamingTranscriptionResponse_DoneParams",
+    "StreamingTranscriptionResponse_Error",
+    "StreamingTranscriptionResponse_ErrorParams",
+    "StreamingTranscriptionResponse_FlushDone",
+    "StreamingTranscriptionResponse_FlushDoneParams",
+    "StreamingTranscriptionResponse_Transcript",
+    "StreamingTranscriptionResponse_TranscriptParams",
+    "SttEncoding",
     "SupportedLanguage",
+    "TimestampGranularity",
     "TokenGrant",
     "TokenGrantParams",
     "TokenRequest",
     "TokenRequestParams",
     "TokenResponse",
     "TokenResponseParams",
+    "TranscriptMessage",
+    "TranscriptMessageParams",
+    "TranscriptionResponse",
+    "TranscriptionResponseParams",
+    "TranscriptionWord",
+    "TranscriptionWordParams",
     "TtsRequest",
     "TtsRequestEmbeddingSpecifier",
     "TtsRequestEmbeddingSpecifierParams",
@@ -250,6 +306,8 @@ __all__ = [
     "TtsRequestParams",
     "TtsRequestVoiceSpecifier",
     "TtsRequestVoiceSpecifierParams",
+    "TtssseRequest",
+    "TtssseRequestParams",
     "UpdateVoiceRequest",
     "UpdateVoiceRequestParams",
     "Voice",
@@ -307,6 +365,7 @@ __all__ = [
     "datasets",
     "embedding",
     "infill",
+    "stt",
     "tts",
     "voice_changer",
     "voices",