typecast-python 0.2.2__tar.gz → 0.3.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {typecast_python-0.2.2 → typecast_python-0.3.1}/.gitignore +5 -0
- {typecast_python-0.2.2 → typecast_python-0.3.1}/PKG-INFO +87 -1
- {typecast_python-0.2.2 → typecast_python-0.3.1}/README.md +86 -0
- {typecast_python-0.2.2 → typecast_python-0.3.1}/pyproject.toml +1 -1
- typecast_python-0.3.1/src/typecast/_voice_clone.py +89 -0
- {typecast_python-0.2.2 → typecast_python-0.3.1}/src/typecast/async_client.py +129 -2
- {typecast_python-0.2.2 → typecast_python-0.3.1}/src/typecast/client.py +122 -1
- {typecast_python-0.2.2 → typecast_python-0.3.1}/src/typecast/models/__init__.py +28 -18
- typecast_python-0.3.1/src/typecast/models/tts.py +449 -0
- {typecast_python-0.2.2 → typecast_python-0.3.1}/src/typecast/models/voices.py +16 -1
- typecast_python-0.2.2/src/typecast/models/tts.py +0 -214
- {typecast_python-0.2.2 → typecast_python-0.3.1}/LICENSE +0 -0
- {typecast_python-0.2.2 → typecast_python-0.3.1}/src/typecast/__init__.py +0 -0
- {typecast_python-0.2.2 → typecast_python-0.3.1}/src/typecast/conf.py +0 -0
- {typecast_python-0.2.2 → typecast_python-0.3.1}/src/typecast/exceptions.py +0 -0
- {typecast_python-0.2.2 → typecast_python-0.3.1}/src/typecast/models/error.py +0 -0
- {typecast_python-0.2.2 → typecast_python-0.3.1}/src/typecast/models/subscription.py +0 -0
- {typecast_python-0.2.2 → typecast_python-0.3.1}/src/typecast/utils.py +0 -0
|
@@ -10,6 +10,8 @@
|
|
|
10
10
|
.env.*.local
|
|
11
11
|
*.env
|
|
12
12
|
!.env.example
|
|
13
|
+
config.yaml
|
|
14
|
+
credentials.json
|
|
13
15
|
|
|
14
16
|
# ----------------
|
|
15
17
|
# Node.js / JavaScript
|
|
@@ -314,6 +316,9 @@ logs/
|
|
|
314
316
|
# Audio output files (in root directories, not source code)
|
|
315
317
|
*.wav
|
|
316
318
|
*.mp3
|
|
319
|
+
# Exception: shared test fixtures
|
|
320
|
+
!test-fixtures/**/*.wav
|
|
321
|
+
!test-fixtures/**/*.mp3
|
|
317
322
|
# Note: Removed 'output.*' pattern as it incorrectly matches Output.java/Output.cs source files
|
|
318
323
|
|
|
319
324
|
# ----------------
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: typecast-python
|
|
3
|
-
Version: 0.2.2
|
|
3
|
+
Version: 0.3.1
|
|
4
4
|
Summary: Official Typecast Python SDK - Convert text to lifelike speech using AI-powered voices
|
|
5
5
|
Project-URL: Homepage, https://typecast.ai
|
|
6
6
|
Project-URL: Documentation, https://typecast.ai/docs/overview
|
|
@@ -269,6 +269,7 @@ Convert text to lifelike speech using AI-powered voices
|
|
|
269
269
|
- [Voice Discovery](#voice-discovery)
|
|
270
270
|
- [Emotion Control](#emotion-control)
|
|
271
271
|
- [Async Client](#async-client)
|
|
272
|
+
- [Timestamp TTS](#timestamp-tts)
|
|
272
273
|
- [Supported Languages](#supported-languages)
|
|
273
274
|
- [Error Handling](#error-handling)
|
|
274
275
|
- [License](#license)
|
|
@@ -465,6 +466,91 @@ async def main():
|
|
|
465
466
|
asyncio.run(main())
|
|
466
467
|
```
|
|
467
468
|
|
|
469
|
+
### Timestamp TTS
|
|
470
|
+
|
|
471
|
+
Use `text_to_speech_with_timestamps()` to receive base64 audio plus
|
|
472
|
+
word/character-level timestamps aligned with the synthesized speech. The
|
|
473
|
+
result object exposes `save_audio()`, `to_srt()`, and `to_vtt()` helpers
|
|
474
|
+
so you can finish the typical "audio + subtitles" flow in one line.
|
|
475
|
+
|
|
476
|
+
```python
|
|
477
|
+
from typecast import Typecast
|
|
478
|
+
from typecast.models import TTSRequestWithTimestamps
|
|
479
|
+
|
|
480
|
+
client = Typecast(api_key="YOUR_API_KEY")
|
|
481
|
+
resp = client.text_to_speech_with_timestamps(
|
|
482
|
+
TTSRequestWithTimestamps(
|
|
483
|
+
voice_id="tc_60e5426de8b95f1d3000d7b5",
|
|
484
|
+
text="Hello. How are you?",
|
|
485
|
+
model="ssfm-v30",
|
|
486
|
+
language="eng",
|
|
487
|
+
),
|
|
488
|
+
)
|
|
489
|
+
resp.save_audio("hello.wav")
|
|
490
|
+
print(resp.to_srt()) # SRT subtitles
|
|
491
|
+
print(resp.to_vtt()) # WebVTT subtitles
|
|
492
|
+
```
|
|
493
|
+
|
|
494
|
+
Caption splits follow BBC/Netflix subtitle guidelines: 7s/42-char cue maximums.
|
|
495
|
+
|
|
496
|
+
```python
|
|
497
|
+
# Real-time karaoke / highlight: iterate the words array directly.
|
|
498
|
+
for w in resp.words or []:
|
|
499
|
+
print(f"[{w.start:.2f}s - {w.end:.2f}s] {w.text}")
|
|
500
|
+
```
|
|
501
|
+
|
|
502
|
+
Pass `granularity="word"` or `granularity="char"` to receive only one of
|
|
503
|
+
the two alignment arrays. For non-whitespace languages (Japanese,
|
|
504
|
+
Chinese), pair with `granularity="char"` — word-level alignment will
|
|
505
|
+
collapse the entire sentence into a single segment.
|
|
506
|
+
|
|
507
|
+
### Instant cloning
|
|
508
|
+
|
|
509
|
+
Clone a custom voice from a short audio sample (≤ 25 MB), then use it just like any built-in voice. The cloned voice ID has a `uc_` prefix and works with `text_to_speech` directly.
|
|
510
|
+
|
|
511
|
+
```python
|
|
512
|
+
from typecast import Typecast
|
|
513
|
+
from typecast.models import TTSRequest
|
|
514
|
+
|
|
515
|
+
client = Typecast(api_key="YOUR_API_KEY")
|
|
516
|
+
|
|
517
|
+
# 1) Clone
|
|
518
|
+
voice = client.clone_voice(
|
|
519
|
+
audio="path/to/sample.wav", # str path | Path | bytes | file object
|
|
520
|
+
name="my-voice", # 1-30 chars
|
|
521
|
+
model="ssfm-v30", # or "ssfm-v21"
|
|
522
|
+
)
|
|
523
|
+
print(voice.voice_id) # uc_64a1b2...
|
|
524
|
+
|
|
525
|
+
# 2) Synthesize with the cloned voice
|
|
526
|
+
audio = client.text_to_speech(TTSRequest(
|
|
527
|
+
text="Hello from my cloned voice!",
|
|
528
|
+
voice_id=voice.voice_id,
|
|
529
|
+
model="ssfm-v30",
|
|
530
|
+
))
|
|
531
|
+
with open("output.wav", "wb") as f:
|
|
532
|
+
f.write(audio.audio_data)
|
|
533
|
+
|
|
534
|
+
# 3) Delete when done
|
|
535
|
+
client.delete_voice(voice.voice_id)
|
|
536
|
+
```
|
|
537
|
+
|
|
538
|
+
**Limits**
|
|
539
|
+
|
|
540
|
+
- Audio file: max 25 MB. Supported formats: WAV, MP3.
|
|
541
|
+
- Voice name: 1–30 characters.
|
|
542
|
+
- Model: `ssfm-v21` or `ssfm-v30`.
|
|
543
|
+
|
|
544
|
+
**Async usage** is identical via `AsyncTypecast`:
|
|
545
|
+
|
|
546
|
+
```python
|
|
547
|
+
from typecast import AsyncTypecast
|
|
548
|
+
|
|
549
|
+
async with AsyncTypecast(api_key="YOUR_API_KEY") as client:
|
|
550
|
+
voice = await client.clone_voice(audio="sample.wav", name="my-voice", model="ssfm-v30")
|
|
551
|
+
await client.delete_voice(voice.voice_id)
|
|
552
|
+
```
|
|
553
|
+
|
|
468
554
|
---
|
|
469
555
|
|
|
470
556
|
## Supported Languages
|
|
@@ -28,6 +28,7 @@ Convert text to lifelike speech using AI-powered voices
|
|
|
28
28
|
- [Voice Discovery](#voice-discovery)
|
|
29
29
|
- [Emotion Control](#emotion-control)
|
|
30
30
|
- [Async Client](#async-client)
|
|
31
|
+
- [Timestamp TTS](#timestamp-tts)
|
|
31
32
|
- [Supported Languages](#supported-languages)
|
|
32
33
|
- [Error Handling](#error-handling)
|
|
33
34
|
- [License](#license)
|
|
@@ -224,6 +225,91 @@ async def main():
|
|
|
224
225
|
asyncio.run(main())
|
|
225
226
|
```
|
|
226
227
|
|
|
228
|
+
### Timestamp TTS
|
|
229
|
+
|
|
230
|
+
Use `text_to_speech_with_timestamps()` to receive base64 audio plus
|
|
231
|
+
word/character-level timestamps aligned with the synthesized speech. The
|
|
232
|
+
result object exposes `save_audio()`, `to_srt()`, and `to_vtt()` helpers
|
|
233
|
+
so you can finish the typical "audio + subtitles" flow in one line.
|
|
234
|
+
|
|
235
|
+
```python
|
|
236
|
+
from typecast import Typecast
|
|
237
|
+
from typecast.models import TTSRequestWithTimestamps
|
|
238
|
+
|
|
239
|
+
client = Typecast(api_key="YOUR_API_KEY")
|
|
240
|
+
resp = client.text_to_speech_with_timestamps(
|
|
241
|
+
TTSRequestWithTimestamps(
|
|
242
|
+
voice_id="tc_60e5426de8b95f1d3000d7b5",
|
|
243
|
+
text="Hello. How are you?",
|
|
244
|
+
model="ssfm-v30",
|
|
245
|
+
language="eng",
|
|
246
|
+
),
|
|
247
|
+
)
|
|
248
|
+
resp.save_audio("hello.wav")
|
|
249
|
+
print(resp.to_srt()) # SRT subtitles
|
|
250
|
+
print(resp.to_vtt()) # WebVTT subtitles
|
|
251
|
+
```
|
|
252
|
+
|
|
253
|
+
Caption splits follow BBC/Netflix subtitle guidelines: 7s/42-char cue maximums.
|
|
254
|
+
|
|
255
|
+
```python
|
|
256
|
+
# Real-time karaoke / highlight: iterate the words array directly.
|
|
257
|
+
for w in resp.words or []:
|
|
258
|
+
print(f"[{w.start:.2f}s - {w.end:.2f}s] {w.text}")
|
|
259
|
+
```
|
|
260
|
+
|
|
261
|
+
Pass `granularity="word"` or `granularity="char"` to receive only one of
|
|
262
|
+
the two alignment arrays. For non-whitespace languages (Japanese,
|
|
263
|
+
Chinese), pair with `granularity="char"` — word-level alignment will
|
|
264
|
+
collapse the entire sentence into a single segment.
|
|
265
|
+
|
|
266
|
+
### Instant cloning
|
|
267
|
+
|
|
268
|
+
Clone a custom voice from a short audio sample (≤ 25 MB), then use it just like any built-in voice. The cloned voice ID has a `uc_` prefix and works with `text_to_speech` directly.
|
|
269
|
+
|
|
270
|
+
```python
|
|
271
|
+
from typecast import Typecast
|
|
272
|
+
from typecast.models import TTSRequest
|
|
273
|
+
|
|
274
|
+
client = Typecast(api_key="YOUR_API_KEY")
|
|
275
|
+
|
|
276
|
+
# 1) Clone
|
|
277
|
+
voice = client.clone_voice(
|
|
278
|
+
audio="path/to/sample.wav", # str path | Path | bytes | file object
|
|
279
|
+
name="my-voice", # 1-30 chars
|
|
280
|
+
model="ssfm-v30", # or "ssfm-v21"
|
|
281
|
+
)
|
|
282
|
+
print(voice.voice_id) # uc_64a1b2...
|
|
283
|
+
|
|
284
|
+
# 2) Synthesize with the cloned voice
|
|
285
|
+
audio = client.text_to_speech(TTSRequest(
|
|
286
|
+
text="Hello from my cloned voice!",
|
|
287
|
+
voice_id=voice.voice_id,
|
|
288
|
+
model="ssfm-v30",
|
|
289
|
+
))
|
|
290
|
+
with open("output.wav", "wb") as f:
|
|
291
|
+
f.write(audio.audio_data)
|
|
292
|
+
|
|
293
|
+
# 3) Delete when done
|
|
294
|
+
client.delete_voice(voice.voice_id)
|
|
295
|
+
```
|
|
296
|
+
|
|
297
|
+
**Limits**
|
|
298
|
+
|
|
299
|
+
- Audio file: max 25 MB. Supported formats: WAV, MP3.
|
|
300
|
+
- Voice name: 1–30 characters.
|
|
301
|
+
- Model: `ssfm-v21` or `ssfm-v30`.
|
|
302
|
+
|
|
303
|
+
**Async usage** is identical via `AsyncTypecast`:
|
|
304
|
+
|
|
305
|
+
```python
|
|
306
|
+
from typecast import AsyncTypecast
|
|
307
|
+
|
|
308
|
+
async with AsyncTypecast(api_key="YOUR_API_KEY") as client:
|
|
309
|
+
voice = await client.clone_voice(audio="sample.wav", name="my-voice", model="ssfm-v30")
|
|
310
|
+
await client.delete_voice(voice.voice_id)
|
|
311
|
+
```
|
|
312
|
+
|
|
227
313
|
---
|
|
228
314
|
|
|
229
315
|
## Supported Languages
|
|
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "typecast-python"
|
|
7
|
-
version = "0.2.2"
|
|
7
|
+
version = "0.3.1"
|
|
8
8
|
description = "Official Typecast Python SDK - Convert text to lifelike speech using AI-powered voices"
|
|
9
9
|
authors = [
|
|
10
10
|
{name = "Neosapience", email = "help@typecast.ai"}
|
|
@@ -0,0 +1,89 @@
|
|
|
1
|
+
"""Internal helpers for instant cloning (sync/async shared)."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
import os
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from typing import BinaryIO, Union
|
|
7
|
+
|
|
8
|
+
CLONING_MAX_FILE_SIZE = 25 * 1024 * 1024 # must match typecast-api `cloning_max_file_size`
|
|
9
|
+
NAME_MIN_LENGTH = 1
|
|
10
|
+
NAME_MAX_LENGTH = 30
|
|
11
|
+
ALLOWED_CLONE_MODELS = frozenset({"ssfm-v21", "ssfm-v30"})
|
|
12
|
+
CUSTOM_VOICE_ID_PREFIX = "uc_"
|
|
13
|
+
|
|
14
|
+
AudioInput = Union[str, Path, bytes, BinaryIO]
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def normalize_clone_model(model: object) -> str:
    """Resolve ``model`` to a model-name string accepted for voice cloning.

    An object exposing a ``value`` attribute (for example a ``TTSModel`` enum
    member) contributes that attribute; anything else goes through ``str()``.

    Args:
        model: Enum-like object or plain string naming the engine model.

    Returns:
        The resolved model name.

    Raises:
        ValueError: The resolved name is not in :data:`ALLOWED_CLONE_MODELS`,
            so the caller fails locally instead of waiting for an API error.
    """
    if hasattr(model, "value"):
        resolved = model.value
    else:
        resolved = str(model)

    if resolved in ALLOWED_CLONE_MODELS:
        return resolved

    # Sort so the message is stable regardless of frozenset iteration order.
    choices = ", ".join(sorted(ALLOWED_CLONE_MODELS))
    raise ValueError(f"model must be one of: {choices}; got {resolved!r}")
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def validate_custom_voice_id(voice_id: str) -> None:
    """Ensure ``voice_id`` is a string carrying the custom-voice prefix.

    Callers run this before issuing a DELETE so that ids without the
    :data:`CUSTOM_VOICE_ID_PREFIX` prefix never reach the endpoint.

    Raises:
        ValueError: ``voice_id`` is not a ``str`` or lacks the prefix.
    """
    has_custom_prefix = isinstance(voice_id, str) and voice_id.startswith(
        CUSTOM_VOICE_ID_PREFIX
    )
    if has_custom_prefix:
        return

    raise ValueError(
        f"voice_id must start with {CUSTOM_VOICE_ID_PREFIX!r}; got {voice_id!r}"
    )
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def validate_clone_inputs(audio: AudioInput, name: str) -> tuple[bytes, str]:
    """Pre-validate ``clone_voice`` inputs and return ``(audio_bytes, filename)``.

    Args:
        audio: A file path (``str``/``Path``), raw ``bytes``/``bytearray``, or
            a readable binary file object.
        name: Voice name; its length must lie between ``NAME_MIN_LENGTH`` and
            ``NAME_MAX_LENGTH`` inclusive.

    Returns:
        ``(audio_bytes, filename)``. The filename is the path's final component
        or the basename of the file object's ``name``. It falls back to
        ``"audio.wav"`` for raw bytes and for file objects whose ``name`` is
        missing, empty, or not path-like (for example an integer descriptor).

    Raises:
        ValueError: Name length out of range, or audio larger than
            ``CLONING_MAX_FILE_SIZE``.
        FileNotFoundError: The path does not refer to an existing regular file.
        TypeError: ``audio`` is none of the accepted types, or the file object
            did not return bytes from ``read()``.
    """
    if not (NAME_MIN_LENGTH <= len(name) <= NAME_MAX_LENGTH):
        raise ValueError(
            f"name must be {NAME_MIN_LENGTH}-{NAME_MAX_LENGTH} characters; got {len(name)}"
        )

    fallback_filename = "audio.wav"

    if isinstance(audio, (str, Path)):
        path = Path(audio)
        # is_file() is already False for a missing path, so one check suffices.
        if not path.is_file():
            raise FileNotFoundError(f"audio file not found: {path}")
        # Check the on-disk size first so an oversized file is rejected
        # without loading it into memory.
        size_on_disk = path.stat().st_size
        if size_on_disk > CLONING_MAX_FILE_SIZE:
            raise ValueError(
                f"audio file exceeds 25MB limit; got {size_on_disk} bytes"
            )
        audio_bytes = path.read_bytes()
        filename = path.name
    elif isinstance(audio, (bytes, bytearray)):
        audio_bytes = bytes(audio)
        filename = fallback_filename
    elif hasattr(audio, "read"):
        audio_bytes = audio.read()
        if isinstance(audio_bytes, bytearray):
            audio_bytes = bytes(audio_bytes)
        if not isinstance(audio_bytes, bytes):
            raise TypeError(
                "audio file object must be opened in binary mode and return bytes"
            )
        raw_name = getattr(audio, "name", None)
        if isinstance(raw_name, (str, bytes, os.PathLike)):
            # fsdecode handles str, bytes and PathLike names uniformly; the
            # backslash swap lets Windows-style paths yield a basename on POSIX.
            filename = os.path.basename(os.fsdecode(raw_name).replace("\\", "/"))
        else:
            # Descriptor-backed streams report an int as ``name``; that is not
            # a usable upload filename.
            filename = ""
        filename = filename or fallback_filename
    else:
        raise TypeError(
            "audio must be a file path (str/Path), bytes, or readable binary file object"
        )

    # Final guard: covers bytes, file objects, and a path that grew after stat().
    if len(audio_bytes) > CLONING_MAX_FILE_SIZE:
        raise ValueError(
            f"audio file exceeds 25MB limit; got {len(audio_bytes)} bytes"
        )

    return audio_bytes, filename
|
|
@@ -1,8 +1,16 @@
|
|
|
1
|
-
from
|
|
1
|
+
from pathlib import Path
|
|
2
|
+
from typing import AsyncIterator, BinaryIO, Optional, Union
|
|
3
|
+
from urllib.parse import quote
|
|
2
4
|
|
|
3
5
|
import aiohttp
|
|
4
6
|
|
|
5
7
|
from . import conf
|
|
8
|
+
from ._voice_clone import (
|
|
9
|
+
normalize_clone_model,
|
|
10
|
+
validate_clone_inputs,
|
|
11
|
+
validate_custom_voice_id,
|
|
12
|
+
)
|
|
13
|
+
from .client import _guess_audio_mime
|
|
6
14
|
from .exceptions import (
|
|
7
15
|
BadRequestError,
|
|
8
16
|
InternalServerError,
|
|
@@ -14,10 +22,14 @@ from .exceptions import (
|
|
|
14
22
|
UnprocessableEntityError,
|
|
15
23
|
)
|
|
16
24
|
from .models import (
|
|
25
|
+
CustomVoice,
|
|
17
26
|
SubscriptionResponse,
|
|
27
|
+
TTSModel,
|
|
18
28
|
TTSRequest,
|
|
19
29
|
TTSRequestStream,
|
|
30
|
+
TTSRequestWithTimestamps,
|
|
20
31
|
TTSResponse,
|
|
32
|
+
TTSWithTimestampsResponse,
|
|
21
33
|
VoicesResponse,
|
|
22
34
|
VoicesV2Filter,
|
|
23
35
|
VoiceV2Response,
|
|
@@ -59,7 +71,9 @@ class AsyncTypecast:
|
|
|
59
71
|
self.session: Optional[aiohttp.ClientSession] = None
|
|
60
72
|
|
|
61
73
|
async def __aenter__(self):
|
|
62
|
-
|
|
74
|
+
# Auth header at session scope; per-request Content-Type is set by aiohttp
|
|
75
|
+
# (json= auto-sets application/json, data=FormData() auto-sets multipart).
|
|
76
|
+
headers = {}
|
|
63
77
|
if self.api_key:
|
|
64
78
|
headers["X-API-KEY"] = self.api_key
|
|
65
79
|
self.session = aiohttp.ClientSession(headers=headers)
|
|
@@ -168,6 +182,119 @@ class AsyncTypecast:
|
|
|
168
182
|
async for chunk in response.content.iter_chunked(chunk_size):
|
|
169
183
|
yield chunk
|
|
170
184
|
|
|
185
|
+
async def text_to_speech_with_timestamps(
    self,
    request: TTSRequestWithTimestamps,
    granularity: Optional[str] = None,
) -> TTSWithTimestampsResponse:
    """Async version of ``Typecast.text_to_speech_with_timestamps``.

    Args:
        request: Request body; serialized with ``model_dump(exclude_none=True)``.
        granularity: Optional ``"word"`` or ``"char"`` filter, sent as a
            query parameter. Omitted from the request when ``None``.

    Returns:
        ``TTSWithTimestampsResponse`` validated from the JSON response body.

    Raises:
        TypecastError: If the client session is not initialized
            (i.e. used outside ``async with``).
        ValueError: If ``granularity`` is not ``None``, ``"word"``, or ``"char"``.
        BadRequestError, UnauthorizedError, PaymentRequiredError,
        NotFoundError, UnprocessableEntityError, RateLimitError,
        InternalServerError, TypecastError: per HTTP status.
    """
    if self.session is None:
        raise TypecastError("Client session not initialized; use 'async with'.")
    if granularity not in (None, "word", "char"):
        raise ValueError(
            f"granularity must be None, 'word', or 'char'; got {granularity!r}"
        )
    endpoint = "/v1/text-to-speech/with-timestamps"
    params = {"granularity": granularity} if granularity else None
    # Explicit budget, matching the other async calls in this client
    # (clone_voice uses the same values) and the sync client's (10, 300).
    timeout = aiohttp.ClientTimeout(total=300, connect=10)
    async with self.session.post(
        f"{self.host}{endpoint}",
        json=request.model_dump(exclude_none=True),
        params=params,
        timeout=timeout,
    ) as response:
        if response.status != 200:
            text = await response.text()
            self._handle_error(response.status, text)
        data = await response.json()
        return TTSWithTimestampsResponse.model_validate(data)
|
|
226
|
+
|
|
227
|
+
async def clone_voice(
    self,
    audio: Union[str, Path, bytes, BinaryIO],
    name: str,
    model: Union[str, "TTSModel"],
) -> CustomVoice:
    """Upload an audio sample and create a cloned custom voice (async).

    Args:
        audio: Audio sample given as a file path (str/Path), raw bytes, or
            a readable binary file object. Max 25 MB.
        name: Voice name, 1-30 characters.
        model: Engine model, ``"ssfm-v21"`` or ``"ssfm-v30"`` (or a
            ``TTSModel`` enum member).

    Returns:
        ``CustomVoice`` validated from the JSON response body.

    Raises:
        ValueError: name length out of range, audio exceeds 25 MB, or
            ``model`` is not an allowed clone model.
        FileNotFoundError: ``audio`` is a path to a non-existent file.
        TypeError: ``audio`` is not one of the accepted input types.
        TypecastError: client session not initialized or HTTP error.
    """
    session = self.session
    if session is None:
        raise TypecastError("Client session not initialized; use 'async with'.")

    # Validate locally before any network traffic.
    payload, upload_name = validate_clone_inputs(audio, name)
    engine = normalize_clone_model(model)

    multipart = aiohttp.FormData()
    multipart.add_field("name", name)
    multipart.add_field("model", engine)
    multipart.add_field(
        "file",
        payload,
        filename=upload_name,
        content_type=_guess_audio_mime(upload_name),
    )

    request_timeout = aiohttp.ClientTimeout(total=300, connect=10)
    url = f"{self.host}/v1/voices/clone"
    async with session.post(url, data=multipart, timeout=request_timeout) as response:
        if response.status != 200:
            error_text = await response.text()
            self._handle_error(response.status, error_text)
        return CustomVoice.model_validate(await response.json())
|
|
275
|
+
|
|
276
|
+
async def delete_voice(self, voice_id: str) -> None:
    """Delete a custom voice by id (async).

    Args:
        voice_id: Voice identifier with ``uc_`` prefix.

    Raises:
        TypecastError: client session not initialized.
        ValueError: ``voice_id`` fails the custom-voice id check.
        TypecastError subclasses: per HTTP status from the API.
    """
    session = self.session
    if session is None:
        raise TypecastError("Client session not initialized; use 'async with'.")

    validate_custom_voice_id(voice_id)

    # Percent-encode every reserved character, including "/", so the id
    # always stays a single path segment.
    url = f"{self.host}/v1/voices/{quote(voice_id, safe='')}"
    request_timeout = aiohttp.ClientTimeout(total=60, connect=10)
    async with session.delete(url, timeout=request_timeout) as response:
        if response.status in (200, 204):
            return
        error_text = await response.text()
        self._handle_error(response.status, error_text)
|
|
297
|
+
|
|
171
298
|
async def voices(self, model: Optional[str] = None) -> list[VoicesResponse]:
|
|
172
299
|
"""Get available voices (V1 API) asynchronously.
|
|
173
300
|
|
|
@@ -1,8 +1,15 @@
|
|
|
1
|
-
from
|
|
1
|
+
from pathlib import Path
|
|
2
|
+
from typing import BinaryIO, Iterator, Optional, Union
|
|
3
|
+
from urllib.parse import quote
|
|
2
4
|
|
|
3
5
|
import requests
|
|
4
6
|
|
|
5
7
|
from . import conf
|
|
8
|
+
from ._voice_clone import (
|
|
9
|
+
normalize_clone_model,
|
|
10
|
+
validate_clone_inputs,
|
|
11
|
+
validate_custom_voice_id,
|
|
12
|
+
)
|
|
6
13
|
from .exceptions import (
|
|
7
14
|
BadRequestError,
|
|
8
15
|
InternalServerError,
|
|
@@ -14,16 +21,30 @@ from .exceptions import (
|
|
|
14
21
|
UnprocessableEntityError,
|
|
15
22
|
)
|
|
16
23
|
from .models import (
|
|
24
|
+
CustomVoice,
|
|
17
25
|
SubscriptionResponse,
|
|
26
|
+
TTSModel,
|
|
18
27
|
TTSRequest,
|
|
19
28
|
TTSRequestStream,
|
|
29
|
+
TTSRequestWithTimestamps,
|
|
20
30
|
TTSResponse,
|
|
31
|
+
TTSWithTimestampsResponse,
|
|
21
32
|
VoicesResponse,
|
|
22
33
|
VoicesV2Filter,
|
|
23
34
|
VoiceV2Response,
|
|
24
35
|
)
|
|
25
36
|
|
|
26
37
|
|
|
38
|
+
def _guess_audio_mime(filename: str) -> str:
    """Map a filename's extension to an audio MIME type.

    Matching is case-insensitive. ``.wav`` maps to ``audio/wav`` and ``.mp3``
    to ``audio/mpeg``; every other name yields ``application/octet-stream``.
    """
    normalized = filename.lower()
    known_types = (
        (".wav", "audio/wav"),
        (".mp3", "audio/mpeg"),
    )
    for suffix, mime in known_types:
        if normalized.endswith(suffix):
            return mime
    return "application/octet-stream"
|
|
46
|
+
|
|
47
|
+
|
|
27
48
|
class Typecast:
|
|
28
49
|
"""Synchronous client for the Typecast Text-to-Speech API.
|
|
29
50
|
|
|
@@ -161,6 +182,106 @@ class Typecast:
|
|
|
161
182
|
finally:
|
|
162
183
|
response.close()
|
|
163
184
|
|
|
185
|
+
def text_to_speech_with_timestamps(
    self,
    request: TTSRequestWithTimestamps,
    granularity: Optional[str] = None,
) -> TTSWithTimestampsResponse:
    """Synthesize speech and return audio together with alignment timestamps.

    Args:
        request: Request body; serialized with ``model_dump(exclude_none=True)``.
        granularity: Optional ``"word"`` or ``"char"``, sent as a query
            parameter to filter the returned alignment arrays. Omit to
            send no filter.

    Returns:
        ``TTSWithTimestampsResponse`` validated from the JSON response body.

    Raises:
        ValueError: If ``granularity`` is not ``None``, ``"word"``, or ``"char"``.
        BadRequestError, UnauthorizedError, PaymentRequiredError,
        NotFoundError, UnprocessableEntityError, RateLimitError,
        InternalServerError, TypecastError: per HTTP status.
    """
    if granularity is not None and granularity not in ("word", "char"):
        raise ValueError(
            f"granularity must be None, 'word', or 'char'; got {granularity!r}"
        )

    # Only attach the query string when a filter was actually requested.
    query = None
    if granularity:
        query = {"granularity": granularity}

    url = f"{self.host}/v1/text-to-speech/with-timestamps"
    body = request.model_dump(exclude_none=True)
    # requests timeout tuple is (connect, read) in seconds.
    response = self.session.post(url, json=body, params=query, timeout=(10, 300))
    if response.status_code != 200:
        self._handle_error(response.status_code, response.text)
    return TTSWithTimestampsResponse.model_validate(response.json())
|
|
223
|
+
|
|
224
|
+
def clone_voice(
    self,
    audio: Union[str, Path, bytes, BinaryIO],
    name: str,
    model: Union[str, "TTSModel"],
) -> CustomVoice:
    """Upload an audio sample and create a cloned custom voice.

    Args:
        audio: Audio sample given as a file path (str/Path), raw bytes, or
            a readable binary file object. Max 25 MB.
        name: Voice name, 1-30 characters.
        model: Engine model, ``"ssfm-v21"`` or ``"ssfm-v30"`` (or a
            ``TTSModel`` enum member).

    Returns:
        ``CustomVoice`` validated from the JSON response body.

    Raises:
        ValueError: name length out of range, audio exceeds 25 MB, or
            ``model`` is not an allowed clone model.
        FileNotFoundError: ``audio`` is a path to a non-existent file.
        TypeError: ``audio`` is not one of the accepted input types.
        TypecastError subclasses: per HTTP status from the API.
    """
    # Validate locally before any network traffic.
    payload, upload_name = validate_clone_inputs(audio, name)
    engine = normalize_clone_model(model)

    upload = {"file": (upload_name, payload, _guess_audio_mime(upload_name))}
    fields = {"name": name, "model": engine}

    # Content-Type is overridden to None for this call so that requests can
    # emit the multipart/form-data header with the correct boundary.
    response = self.session.post(
        f"{self.host}/v1/voices/clone",
        files=upload,
        data=fields,
        headers={"Content-Type": None},
        timeout=(10, 300),
    )
    if response.status_code != 200:
        self._handle_error(response.status_code, response.text)
    return CustomVoice.model_validate(response.json())
|
|
266
|
+
|
|
267
|
+
def delete_voice(self, voice_id: str) -> None:
    """Delete a custom voice by id.

    Args:
        voice_id: Voice identifier with ``uc_`` prefix (as returned by
            ``clone_voice``).

    Raises:
        ValueError: ``voice_id`` fails the custom-voice id check.
        TypecastError subclasses: per HTTP status from the API.
    """
    validate_custom_voice_id(voice_id)

    # Percent-encode every reserved character, including "/", so the id
    # always stays a single path segment.
    url = f"{self.host}/v1/voices/{quote(voice_id, safe='')}"
    response = self.session.delete(url, timeout=(10, 60))
    if response.status_code in (200, 204):
        return
    self._handle_error(response.status_code, response.text)
|
|
284
|
+
|
|
164
285
|
def voices(self, model: Optional[str] = None) -> list[VoicesResponse]:
|
|
165
286
|
"""Get available voices (V1 API).
|
|
166
287
|
|