typecast-python 0.2.2__tar.gz → 0.3.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {typecast_python-0.2.2 → typecast_python-0.3.0}/PKG-INFO +40 -1
- {typecast_python-0.2.2 → typecast_python-0.3.0}/README.md +39 -0
- {typecast_python-0.2.2 → typecast_python-0.3.0}/pyproject.toml +1 -1
- {typecast_python-0.2.2 → typecast_python-0.3.0}/src/typecast/async_client.py +44 -0
- {typecast_python-0.2.2 → typecast_python-0.3.0}/src/typecast/client.py +41 -0
- {typecast_python-0.2.2 → typecast_python-0.3.0}/src/typecast/models/__init__.py +26 -18
- typecast_python-0.3.0/src/typecast/models/tts.py +449 -0
- typecast_python-0.2.2/src/typecast/models/tts.py +0 -214
- {typecast_python-0.2.2 → typecast_python-0.3.0}/.gitignore +0 -0
- {typecast_python-0.2.2 → typecast_python-0.3.0}/LICENSE +0 -0
- {typecast_python-0.2.2 → typecast_python-0.3.0}/src/typecast/__init__.py +0 -0
- {typecast_python-0.2.2 → typecast_python-0.3.0}/src/typecast/conf.py +0 -0
- {typecast_python-0.2.2 → typecast_python-0.3.0}/src/typecast/exceptions.py +0 -0
- {typecast_python-0.2.2 → typecast_python-0.3.0}/src/typecast/models/error.py +0 -0
- {typecast_python-0.2.2 → typecast_python-0.3.0}/src/typecast/models/subscription.py +0 -0
- {typecast_python-0.2.2 → typecast_python-0.3.0}/src/typecast/models/voices.py +0 -0
- {typecast_python-0.2.2 → typecast_python-0.3.0}/src/typecast/utils.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: typecast-python
|
|
3
|
-
Version: 0.2.2
|
|
3
|
+
Version: 0.3.0
|
|
4
4
|
Summary: Official Typecast Python SDK - Convert text to lifelike speech using AI-powered voices
|
|
5
5
|
Project-URL: Homepage, https://typecast.ai
|
|
6
6
|
Project-URL: Documentation, https://typecast.ai/docs/overview
|
|
@@ -269,6 +269,7 @@ Convert text to lifelike speech using AI-powered voices
|
|
|
269
269
|
- [Voice Discovery](#voice-discovery)
|
|
270
270
|
- [Emotion Control](#emotion-control)
|
|
271
271
|
- [Async Client](#async-client)
|
|
272
|
+
- [Timestamp TTS](#timestamp-tts)
|
|
272
273
|
- [Supported Languages](#supported-languages)
|
|
273
274
|
- [Error Handling](#error-handling)
|
|
274
275
|
- [License](#license)
|
|
@@ -465,6 +466,44 @@ async def main():
|
|
|
465
466
|
asyncio.run(main())
|
|
466
467
|
```
|
|
467
468
|
|
|
469
|
+
### Timestamp TTS
|
|
470
|
+
|
|
471
|
+
Use `text_to_speech_with_timestamps()` to receive base64 audio plus
|
|
472
|
+
word/character-level timestamps aligned with the synthesized speech. The
|
|
473
|
+
result object exposes `save_audio()`, `to_srt()`, and `to_vtt()` helpers
|
|
474
|
+
so you can finish the typical "audio + subtitles" flow in one line.
|
|
475
|
+
|
|
476
|
+
```python
|
|
477
|
+
from typecast import Typecast
|
|
478
|
+
from typecast.models import TTSRequestWithTimestamps
|
|
479
|
+
|
|
480
|
+
client = Typecast(api_key="YOUR_API_KEY")
|
|
481
|
+
resp = client.text_to_speech_with_timestamps(
|
|
482
|
+
TTSRequestWithTimestamps(
|
|
483
|
+
voice_id="tc_60e5426de8b95f1d3000d7b5",
|
|
484
|
+
text="Hello. How are you?",
|
|
485
|
+
model="ssfm-v30",
|
|
486
|
+
language="eng",
|
|
487
|
+
),
|
|
488
|
+
)
|
|
489
|
+
resp.save_audio("hello.wav")
|
|
490
|
+
print(resp.to_srt()) # SRT subtitles
|
|
491
|
+
print(resp.to_vtt()) # WebVTT subtitles
|
|
492
|
+
```
|
|
493
|
+
|
|
494
|
+
Caption splits follow BBC/Netflix subtitle guidelines: 7s/42-char cue maximums.
|
|
495
|
+
|
|
496
|
+
```python
|
|
497
|
+
# Real-time karaoke / highlight: iterate the words array directly.
|
|
498
|
+
for w in resp.words or []:
|
|
499
|
+
print(f"[{w.start:.2f}s - {w.end:.2f}s] {w.text}")
|
|
500
|
+
```
|
|
501
|
+
|
|
502
|
+
Pass `granularity="word"` or `granularity="char"` to receive only one of
|
|
503
|
+
the two alignment arrays. For non-whitespace languages (Japanese,
|
|
504
|
+
Chinese), pair with `granularity="char"` — word-level alignment will
|
|
505
|
+
collapse the entire sentence into a single segment.
|
|
506
|
+
|
|
468
507
|
---
|
|
469
508
|
|
|
470
509
|
## Supported Languages
|
|
@@ -28,6 +28,7 @@ Convert text to lifelike speech using AI-powered voices
|
|
|
28
28
|
- [Voice Discovery](#voice-discovery)
|
|
29
29
|
- [Emotion Control](#emotion-control)
|
|
30
30
|
- [Async Client](#async-client)
|
|
31
|
+
- [Timestamp TTS](#timestamp-tts)
|
|
31
32
|
- [Supported Languages](#supported-languages)
|
|
32
33
|
- [Error Handling](#error-handling)
|
|
33
34
|
- [License](#license)
|
|
@@ -224,6 +225,44 @@ async def main():
|
|
|
224
225
|
asyncio.run(main())
|
|
225
226
|
```
|
|
226
227
|
|
|
228
|
+
### Timestamp TTS
|
|
229
|
+
|
|
230
|
+
Use `text_to_speech_with_timestamps()` to receive base64 audio plus
|
|
231
|
+
word/character-level timestamps aligned with the synthesized speech. The
|
|
232
|
+
result object exposes `save_audio()`, `to_srt()`, and `to_vtt()` helpers
|
|
233
|
+
so you can finish the typical "audio + subtitles" flow in one line.
|
|
234
|
+
|
|
235
|
+
```python
|
|
236
|
+
from typecast import Typecast
|
|
237
|
+
from typecast.models import TTSRequestWithTimestamps
|
|
238
|
+
|
|
239
|
+
client = Typecast(api_key="YOUR_API_KEY")
|
|
240
|
+
resp = client.text_to_speech_with_timestamps(
|
|
241
|
+
TTSRequestWithTimestamps(
|
|
242
|
+
voice_id="tc_60e5426de8b95f1d3000d7b5",
|
|
243
|
+
text="Hello. How are you?",
|
|
244
|
+
model="ssfm-v30",
|
|
245
|
+
language="eng",
|
|
246
|
+
),
|
|
247
|
+
)
|
|
248
|
+
resp.save_audio("hello.wav")
|
|
249
|
+
print(resp.to_srt()) # SRT subtitles
|
|
250
|
+
print(resp.to_vtt()) # WebVTT subtitles
|
|
251
|
+
```
|
|
252
|
+
|
|
253
|
+
Caption splits follow BBC/Netflix subtitle guidelines: 7s/42-char cue maximums.
|
|
254
|
+
|
|
255
|
+
```python
|
|
256
|
+
# Real-time karaoke / highlight: iterate the words array directly.
|
|
257
|
+
for w in resp.words or []:
|
|
258
|
+
print(f"[{w.start:.2f}s - {w.end:.2f}s] {w.text}")
|
|
259
|
+
```
|
|
260
|
+
|
|
261
|
+
Pass `granularity="word"` or `granularity="char"` to receive only one of
|
|
262
|
+
the two alignment arrays. For non-whitespace languages (Japanese,
|
|
263
|
+
Chinese), pair with `granularity="char"` — word-level alignment will
|
|
264
|
+
collapse the entire sentence into a single segment.
|
|
265
|
+
|
|
227
266
|
---
|
|
228
267
|
|
|
229
268
|
## Supported Languages
|
|
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "typecast-python"
|
|
7
|
-
version = "0.2.2"
|
|
7
|
+
version = "0.3.0"
|
|
8
8
|
description = "Official Typecast Python SDK - Convert text to lifelike speech using AI-powered voices"
|
|
9
9
|
authors = [
|
|
10
10
|
{name = "Neosapience", email = "help@typecast.ai"}
|
|
@@ -17,7 +17,9 @@ from .models import (
|
|
|
17
17
|
SubscriptionResponse,
|
|
18
18
|
TTSRequest,
|
|
19
19
|
TTSRequestStream,
|
|
20
|
+
TTSRequestWithTimestamps,
|
|
20
21
|
TTSResponse,
|
|
22
|
+
TTSWithTimestampsResponse,
|
|
21
23
|
VoicesResponse,
|
|
22
24
|
VoicesV2Filter,
|
|
23
25
|
VoiceV2Response,
|
|
@@ -168,6 +170,48 @@ class AsyncTypecast:
|
|
|
168
170
|
async for chunk in response.content.iter_chunked(chunk_size):
|
|
169
171
|
yield chunk
|
|
170
172
|
|
|
173
|
+
async def text_to_speech_with_timestamps(
|
|
174
|
+
self,
|
|
175
|
+
request: TTSRequestWithTimestamps,
|
|
176
|
+
granularity: Optional[str] = None,
|
|
177
|
+
) -> TTSWithTimestampsResponse:
|
|
178
|
+
"""Async version of ``Typecast.text_to_speech_with_timestamps``.
|
|
179
|
+
|
|
180
|
+
Args:
|
|
181
|
+
request: Request body (same shape as ``TTSRequest``).
|
|
182
|
+
granularity: Optional ``"word"`` or ``"char"`` filter.
|
|
183
|
+
|
|
184
|
+
Returns:
|
|
185
|
+
``TTSWithTimestampsResponse`` with helpers ``to_srt()``,
|
|
186
|
+
``to_vtt()``, ``save_audio()``.
|
|
187
|
+
|
|
188
|
+
Raises:
|
|
189
|
+
TypecastError: If the client session is not initialized
|
|
190
|
+
(i.e. used outside ``async with``).
|
|
191
|
+
ValueError: If ``granularity`` is not ``None``, ``"word"``, or ``"char"``.
|
|
192
|
+
BadRequestError, UnauthorizedError, PaymentRequiredError,
|
|
193
|
+
NotFoundError, UnprocessableEntityError, RateLimitError,
|
|
194
|
+
InternalServerError, TypecastError: per HTTP status.
|
|
195
|
+
"""
|
|
196
|
+
if self.session is None:
|
|
197
|
+
raise TypecastError("Client session not initialized; use 'async with'.")
|
|
198
|
+
if granularity not in (None, "word", "char"):
|
|
199
|
+
raise ValueError(
|
|
200
|
+
f"granularity must be None, 'word', or 'char'; got {granularity!r}"
|
|
201
|
+
)
|
|
202
|
+
endpoint = "/v1/text-to-speech/with-timestamps"
|
|
203
|
+
params = {"granularity": granularity} if granularity else None
|
|
204
|
+
async with self.session.post(
|
|
205
|
+
f"{self.host}{endpoint}",
|
|
206
|
+
json=request.model_dump(exclude_none=True),
|
|
207
|
+
params=params,
|
|
208
|
+
) as response:
|
|
209
|
+
if response.status != 200:
|
|
210
|
+
text = await response.text()
|
|
211
|
+
self._handle_error(response.status, text)
|
|
212
|
+
data = await response.json()
|
|
213
|
+
return TTSWithTimestampsResponse.model_validate(data)
|
|
214
|
+
|
|
171
215
|
async def voices(self, model: Optional[str] = None) -> list[VoicesResponse]:
|
|
172
216
|
"""Get available voices (V1 API) asynchronously.
|
|
173
217
|
|
|
@@ -17,7 +17,9 @@ from .models import (
|
|
|
17
17
|
SubscriptionResponse,
|
|
18
18
|
TTSRequest,
|
|
19
19
|
TTSRequestStream,
|
|
20
|
+
TTSRequestWithTimestamps,
|
|
20
21
|
TTSResponse,
|
|
22
|
+
TTSWithTimestampsResponse,
|
|
21
23
|
VoicesResponse,
|
|
22
24
|
VoicesV2Filter,
|
|
23
25
|
VoiceV2Response,
|
|
@@ -161,6 +163,45 @@ class Typecast:
|
|
|
161
163
|
finally:
|
|
162
164
|
response.close()
|
|
163
165
|
|
|
166
|
+
def text_to_speech_with_timestamps(
|
|
167
|
+
self,
|
|
168
|
+
request: TTSRequestWithTimestamps,
|
|
169
|
+
granularity: Optional[str] = None,
|
|
170
|
+
) -> TTSWithTimestampsResponse:
|
|
171
|
+
"""Synthesize speech and return base64 audio + alignment timestamps.
|
|
172
|
+
|
|
173
|
+
Args:
|
|
174
|
+
request: Request body (same shape as `TTSRequest`).
|
|
175
|
+
granularity: Optional ``"word"`` or ``"char"`` to filter the
|
|
176
|
+
returned alignment arrays. Omit to receive both.
|
|
177
|
+
|
|
178
|
+
Returns:
|
|
179
|
+
``TTSWithTimestampsResponse`` with ``audio`` (base64),
|
|
180
|
+
``words``, ``characters``, and helper methods ``to_srt()``,
|
|
181
|
+
``to_vtt()``, and ``save_audio()``.
|
|
182
|
+
|
|
183
|
+
Raises:
|
|
184
|
+
ValueError: If ``granularity`` is not ``None``, ``"word"``, or ``"char"``.
|
|
185
|
+
BadRequestError, UnauthorizedError, PaymentRequiredError,
|
|
186
|
+
NotFoundError, UnprocessableEntityError, RateLimitError,
|
|
187
|
+
InternalServerError, TypecastError: per HTTP status.
|
|
188
|
+
"""
|
|
189
|
+
if granularity not in (None, "word", "char"):
|
|
190
|
+
raise ValueError(
|
|
191
|
+
f"granularity must be None, 'word', or 'char'; got {granularity!r}"
|
|
192
|
+
)
|
|
193
|
+
endpoint = "/v1/text-to-speech/with-timestamps"
|
|
194
|
+
params = {"granularity": granularity} if granularity else None
|
|
195
|
+
response = self.session.post(
|
|
196
|
+
f"{self.host}{endpoint}",
|
|
197
|
+
json=request.model_dump(exclude_none=True),
|
|
198
|
+
params=params,
|
|
199
|
+
timeout=(10, 300),
|
|
200
|
+
)
|
|
201
|
+
if response.status_code != 200:
|
|
202
|
+
self._handle_error(response.status_code, response.text)
|
|
203
|
+
return TTSWithTimestampsResponse.model_validate(response.json())
|
|
204
|
+
|
|
164
205
|
def voices(self, model: Optional[str] = None) -> list[VoicesResponse]:
|
|
165
206
|
"""Get available voices (V1 API).
|
|
166
207
|
|
|
@@ -1,6 +1,8 @@
|
|
|
1
1
|
from .error import Error
|
|
2
2
|
from .subscription import Credits, Limits, PlanTier, SubscriptionResponse
|
|
3
3
|
from .tts import (
|
|
4
|
+
AlignmentSegmentCharacter,
|
|
5
|
+
AlignmentSegmentWord,
|
|
4
6
|
EmotionPreset,
|
|
5
7
|
LanguageCode,
|
|
6
8
|
Output,
|
|
@@ -12,7 +14,9 @@ from .tts import (
|
|
|
12
14
|
TTSPrompt,
|
|
13
15
|
TTSRequest,
|
|
14
16
|
TTSRequestStream,
|
|
17
|
+
TTSRequestWithTimestamps,
|
|
15
18
|
TTSResponse,
|
|
19
|
+
TTSWithTimestampsResponse,
|
|
16
20
|
)
|
|
17
21
|
from .voices import (
|
|
18
22
|
AgeEnum,
|
|
@@ -25,28 +29,32 @@ from .voices import (
|
|
|
25
29
|
)
|
|
26
30
|
|
|
27
31
|
__all__ = [
|
|
28
|
-
"
|
|
29
|
-
"
|
|
30
|
-
"
|
|
31
|
-
"
|
|
32
|
-
"Prompt",
|
|
33
|
-
"PresetPrompt",
|
|
34
|
-
"SmartPrompt",
|
|
32
|
+
"AgeEnum",
|
|
33
|
+
"AlignmentSegmentCharacter",
|
|
34
|
+
"AlignmentSegmentWord",
|
|
35
|
+
"Credits",
|
|
35
36
|
"EmotionPreset",
|
|
37
|
+
"Error",
|
|
38
|
+
"GenderEnum",
|
|
39
|
+
"LanguageCode",
|
|
40
|
+
"Limits",
|
|
41
|
+
"ModelInfo",
|
|
36
42
|
"Output",
|
|
37
43
|
"OutputStream",
|
|
44
|
+
"PlanTier",
|
|
45
|
+
"Prompt",
|
|
46
|
+
"PresetPrompt",
|
|
47
|
+
"SmartPrompt",
|
|
48
|
+
"SubscriptionResponse",
|
|
49
|
+
"TTSModel",
|
|
50
|
+
"TTSPrompt",
|
|
51
|
+
"TTSRequest",
|
|
52
|
+
"TTSRequestStream",
|
|
53
|
+
"TTSRequestWithTimestamps",
|
|
38
54
|
"TTSResponse",
|
|
39
|
-
"VoicesResponse",
|
|
55
|
+
"TTSWithTimestampsResponse",
|
|
56
|
+
"UseCaseEnum",
|
|
40
57
|
"VoiceV2Response",
|
|
58
|
+
"VoicesResponse",
|
|
41
59
|
"VoicesV2Filter",
|
|
42
|
-
"ModelInfo",
|
|
43
|
-
"GenderEnum",
|
|
44
|
-
"AgeEnum",
|
|
45
|
-
"UseCaseEnum",
|
|
46
|
-
"Error",
|
|
47
|
-
"LanguageCode",
|
|
48
|
-
"PlanTier",
|
|
49
|
-
"Credits",
|
|
50
|
-
"Limits",
|
|
51
|
-
"SubscriptionResponse",
|
|
52
60
|
]
|
|
@@ -0,0 +1,449 @@
|
|
|
1
|
+
from enum import Enum
|
|
2
|
+
from typing import Literal, Optional, Union
|
|
3
|
+
|
|
4
|
+
from pydantic import BaseModel, ConfigDict, Field, model_validator
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class TTSModel(str, Enum):
|
|
8
|
+
SSFM_V21 = "ssfm-v21"
|
|
9
|
+
SSFM_V30 = "ssfm-v30"
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class LanguageCode(str, Enum):
|
|
13
|
+
"""ISO 639-3 language codes supported by Typecast API
|
|
14
|
+
|
|
15
|
+
ssfm-v21: 27 languages
|
|
16
|
+
ssfm-v30: 37 languages (includes all v21 languages plus additional ones)
|
|
17
|
+
"""
|
|
18
|
+
|
|
19
|
+
ENG = "eng" # English
|
|
20
|
+
KOR = "kor" # Korean
|
|
21
|
+
SPA = "spa" # Spanish
|
|
22
|
+
DEU = "deu" # German
|
|
23
|
+
FRA = "fra" # French
|
|
24
|
+
ITA = "ita" # Italian
|
|
25
|
+
POL = "pol" # Polish
|
|
26
|
+
NLD = "nld" # Dutch
|
|
27
|
+
RUS = "rus" # Russian
|
|
28
|
+
JPN = "jpn" # Japanese
|
|
29
|
+
ELL = "ell" # Greek
|
|
30
|
+
TAM = "tam" # Tamil
|
|
31
|
+
TGL = "tgl" # Tagalog
|
|
32
|
+
FIN = "fin" # Finnish
|
|
33
|
+
ZHO = "zho" # Chinese
|
|
34
|
+
SLK = "slk" # Slovak
|
|
35
|
+
ARA = "ara" # Arabic
|
|
36
|
+
HRV = "hrv" # Croatian
|
|
37
|
+
UKR = "ukr" # Ukrainian
|
|
38
|
+
IND = "ind" # Indonesian
|
|
39
|
+
DAN = "dan" # Danish
|
|
40
|
+
SWE = "swe" # Swedish
|
|
41
|
+
MSA = "msa" # Malay
|
|
42
|
+
CES = "ces" # Czech
|
|
43
|
+
POR = "por" # Portuguese
|
|
44
|
+
BUL = "bul" # Bulgarian
|
|
45
|
+
RON = "ron" # Romanian
|
|
46
|
+
# ssfm-v30 additional languages
|
|
47
|
+
BEN = "ben" # Bengali
|
|
48
|
+
HIN = "hin" # Hindi
|
|
49
|
+
HUN = "hun" # Hungarian
|
|
50
|
+
NAN = "nan" # Min Nan
|
|
51
|
+
NOR = "nor" # Norwegian
|
|
52
|
+
PAN = "pan" # Punjabi
|
|
53
|
+
THA = "tha" # Thai
|
|
54
|
+
TUR = "tur" # Turkish
|
|
55
|
+
VIE = "vie" # Vietnamese
|
|
56
|
+
YUE = "yue" # Cantonese
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
class EmotionPreset(str, Enum):
|
|
60
|
+
"""Emotion preset types
|
|
61
|
+
|
|
62
|
+
ssfm-v21: normal, happy, sad, angry
|
|
63
|
+
ssfm-v30: normal, happy, sad, angry, whisper, toneup, tonedown
|
|
64
|
+
"""
|
|
65
|
+
|
|
66
|
+
NORMAL = "normal"
|
|
67
|
+
HAPPY = "happy"
|
|
68
|
+
SAD = "sad"
|
|
69
|
+
ANGRY = "angry"
|
|
70
|
+
WHISPER = "whisper" # ssfm-v30 only
|
|
71
|
+
TONEUP = "toneup" # ssfm-v30 only
|
|
72
|
+
TONEDOWN = "tonedown" # ssfm-v30 only
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
class Prompt(BaseModel):
|
|
76
|
+
"""Emotion and style settings for ssfm-v21 model"""
|
|
77
|
+
|
|
78
|
+
emotion_preset: Optional[str] = Field(
|
|
79
|
+
default="normal",
|
|
80
|
+
description="Emotion preset",
|
|
81
|
+
examples=["normal", "happy", "sad", "angry"],
|
|
82
|
+
)
|
|
83
|
+
emotion_intensity: Optional[float] = Field(default=1.0, ge=0.0, le=2.0)
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
class PresetPrompt(BaseModel):
|
|
87
|
+
"""Preset-based emotion control for ssfm-v30 model"""
|
|
88
|
+
|
|
89
|
+
emotion_type: Literal["preset"] = Field(
|
|
90
|
+
default="preset",
|
|
91
|
+
description="Must be 'preset' for preset-based emotion control",
|
|
92
|
+
)
|
|
93
|
+
emotion_preset: Optional[str] = Field(
|
|
94
|
+
default="normal",
|
|
95
|
+
description="Emotion preset to apply",
|
|
96
|
+
examples=["normal", "happy", "sad", "angry", "whisper", "toneup", "tonedown"],
|
|
97
|
+
)
|
|
98
|
+
emotion_intensity: Optional[float] = Field(default=1.0, ge=0.0, le=2.0)
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
class SmartPrompt(BaseModel):
|
|
102
|
+
"""Context-aware emotion inference for ssfm-v30 model"""
|
|
103
|
+
|
|
104
|
+
emotion_type: Literal["smart"] = Field(
|
|
105
|
+
default="smart",
|
|
106
|
+
description="Must be 'smart' for context-aware emotion inference",
|
|
107
|
+
)
|
|
108
|
+
previous_text: Optional[str] = Field(
|
|
109
|
+
default=None,
|
|
110
|
+
description="Text that comes BEFORE the main text (max 2000 chars)",
|
|
111
|
+
max_length=2000,
|
|
112
|
+
)
|
|
113
|
+
next_text: Optional[str] = Field(
|
|
114
|
+
default=None,
|
|
115
|
+
description="Text that comes AFTER the main text (max 2000 chars)",
|
|
116
|
+
max_length=2000,
|
|
117
|
+
)
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
# Union type for all prompt types
|
|
121
|
+
TTSPrompt = Union[Prompt, PresetPrompt, SmartPrompt]
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
class Output(BaseModel):
|
|
125
|
+
volume: Optional[int] = Field(
|
|
126
|
+
default=100,
|
|
127
|
+
ge=0,
|
|
128
|
+
le=200,
|
|
129
|
+
description="Volume (0-200). Cannot be used together with target_lufs.",
|
|
130
|
+
)
|
|
131
|
+
target_lufs: Optional[float] = Field(
|
|
132
|
+
default=None,
|
|
133
|
+
ge=-70.0,
|
|
134
|
+
le=0.0,
|
|
135
|
+
description="Target loudness in LUFS for absolute loudness normalization (-70 to 0). Cannot be used together with volume.",
|
|
136
|
+
)
|
|
137
|
+
audio_pitch: Optional[int] = Field(default=0, ge=-12, le=12)
|
|
138
|
+
audio_tempo: Optional[float] = Field(default=1.0, ge=0.5, le=2.0)
|
|
139
|
+
audio_format: Optional[str] = Field(
|
|
140
|
+
default="wav", description="Audio format", examples=["wav", "mp3"]
|
|
141
|
+
)
|
|
142
|
+
|
|
143
|
+
@model_validator(mode="before")
|
|
144
|
+
@classmethod
|
|
145
|
+
def check_volume_and_target_lufs(cls, data: dict) -> dict:
|
|
146
|
+
if isinstance(data, dict):
|
|
147
|
+
target_lufs = data.get("target_lufs")
|
|
148
|
+
volume = data.get("volume")
|
|
149
|
+
volume_explicitly_set = "volume" in data
|
|
150
|
+
if target_lufs is not None and volume is not None and volume_explicitly_set:
|
|
151
|
+
raise ValueError("volume and target_lufs cannot be used together")
|
|
152
|
+
if target_lufs is not None and not volume_explicitly_set:
|
|
153
|
+
data["volume"] = None
|
|
154
|
+
return data
|
|
155
|
+
|
|
156
|
+
|
|
157
|
+
class TTSRequest(BaseModel):
|
|
158
|
+
model_config = ConfigDict(json_schema_extra={"exclude_none": True})
|
|
159
|
+
|
|
160
|
+
voice_id: str = Field(
|
|
161
|
+
description="Voice ID", examples=["tc_62a8975e695ad26f7fb514d1"]
|
|
162
|
+
)
|
|
163
|
+
text: str = Field(description="Text", examples=["Hello. How are you?"])
|
|
164
|
+
model: TTSModel = Field(description="Voice model name", examples=["ssfm-v21"])
|
|
165
|
+
language: Optional[Union[LanguageCode, str]] = Field(
|
|
166
|
+
None, description="Language code (ISO 639-3)", examples=["eng"]
|
|
167
|
+
)
|
|
168
|
+
prompt: Optional[TTSPrompt] = None
|
|
169
|
+
output: Optional[Output] = None
|
|
170
|
+
seed: Optional[int] = None
|
|
171
|
+
|
|
172
|
+
|
|
173
|
+
class TTSResponse(BaseModel):
|
|
174
|
+
audio_data: bytes
|
|
175
|
+
duration: float
|
|
176
|
+
format: str = "wav"
|
|
177
|
+
|
|
178
|
+
|
|
179
|
+
class OutputStream(BaseModel):
|
|
180
|
+
"""Audio output settings for streaming mode.
|
|
181
|
+
|
|
182
|
+
Streaming mode does not support `volume` or `target_lufs` because the
|
|
183
|
+
server has to commit each chunk before the full waveform is known.
|
|
184
|
+
Passing either field raises a validation error so misuse fails fast.
|
|
185
|
+
"""
|
|
186
|
+
|
|
187
|
+
model_config = ConfigDict(extra="forbid")
|
|
188
|
+
|
|
189
|
+
audio_pitch: Optional[int] = Field(default=0, ge=-12, le=12)
|
|
190
|
+
audio_tempo: Optional[float] = Field(default=1.0, ge=0.5, le=2.0)
|
|
191
|
+
audio_format: Optional[str] = Field(
|
|
192
|
+
default="wav", description="Audio format", examples=["wav", "mp3"]
|
|
193
|
+
)
|
|
194
|
+
|
|
195
|
+
|
|
196
|
+
class TTSRequestStream(BaseModel):
|
|
197
|
+
"""Request body for `POST /v1/text-to-speech/stream`.
|
|
198
|
+
|
|
199
|
+
Mirrors `TTSRequest` but uses `OutputStream` (no volume / target_lufs).
|
|
200
|
+
"""
|
|
201
|
+
|
|
202
|
+
model_config = ConfigDict(json_schema_extra={"exclude_none": True})
|
|
203
|
+
|
|
204
|
+
voice_id: str = Field(
|
|
205
|
+
description="Voice ID", examples=["tc_62a8975e695ad26f7fb514d1"]
|
|
206
|
+
)
|
|
207
|
+
text: str = Field(description="Text", examples=["Hello. How are you?"])
|
|
208
|
+
model: TTSModel = Field(description="Voice model name", examples=["ssfm-v21"])
|
|
209
|
+
language: Optional[Union[LanguageCode, str]] = Field(
|
|
210
|
+
None, description="Language code (ISO 639-3)", examples=["eng"]
|
|
211
|
+
)
|
|
212
|
+
prompt: Optional[TTSPrompt] = None
|
|
213
|
+
output: Optional[OutputStream] = None
|
|
214
|
+
seed: Optional[int] = None
|
|
215
|
+
|
|
216
|
+
|
|
217
|
+
class AlignmentSegmentWord(BaseModel):
|
|
218
|
+
"""A single word-level alignment segment between transcript and audio."""
|
|
219
|
+
|
|
220
|
+
text: str = Field(description="Text fragment (with attached punctuation).")
|
|
221
|
+
start: float = Field(ge=0, description="Start time in seconds.")
|
|
222
|
+
end: float = Field(ge=0, description="End time in seconds.")
|
|
223
|
+
|
|
224
|
+
@model_validator(mode="after")
|
|
225
|
+
def _validate_span(self):
|
|
226
|
+
if self.end < self.start:
|
|
227
|
+
raise ValueError("end must be greater than or equal to start")
|
|
228
|
+
return self
|
|
229
|
+
|
|
230
|
+
|
|
231
|
+
class AlignmentSegmentCharacter(BaseModel):
|
|
232
|
+
"""A single character-level alignment segment between transcript and audio."""
|
|
233
|
+
|
|
234
|
+
text: str = Field(description="Character fragment (with punctuation/whitespace).")
|
|
235
|
+
start: float = Field(ge=0, description="Start time in seconds.")
|
|
236
|
+
end: float = Field(ge=0, description="End time in seconds.")
|
|
237
|
+
|
|
238
|
+
@model_validator(mode="after")
|
|
239
|
+
def _validate_span(self):
|
|
240
|
+
if self.end < self.start:
|
|
241
|
+
raise ValueError("end must be greater than or equal to start")
|
|
242
|
+
return self
|
|
243
|
+
|
|
244
|
+
|
|
245
|
+
class TTSRequestWithTimestamps(BaseModel):
|
|
246
|
+
"""Request body for `POST /v1/text-to-speech/with-timestamps`.
|
|
247
|
+
|
|
248
|
+
Mirrors `TTSRequest` (voice_id, text, model, language, prompt, output, seed).
|
|
249
|
+
The optional `granularity` query parameter is *not* part of this body — pass
|
|
250
|
+
it as a method argument to `text_to_speech_with_timestamps()`.
|
|
251
|
+
"""
|
|
252
|
+
|
|
253
|
+
model_config = ConfigDict(json_schema_extra={"exclude_none": True})
|
|
254
|
+
|
|
255
|
+
voice_id: str = Field(
|
|
256
|
+
description="Voice ID", examples=["tc_62a8975e695ad26f7fb514d1"]
|
|
257
|
+
)
|
|
258
|
+
text: str = Field(description="Text", examples=["Hello. How are you?"])
|
|
259
|
+
model: TTSModel = Field(description="Voice model name", examples=["ssfm-v30"])
|
|
260
|
+
language: Optional[Union[LanguageCode, str]] = Field(
|
|
261
|
+
None, description="Language code (ISO 639-3)", examples=["eng"]
|
|
262
|
+
)
|
|
263
|
+
prompt: Optional[TTSPrompt] = None
|
|
264
|
+
output: Optional[Output] = None
|
|
265
|
+
seed: Optional[int] = None
|
|
266
|
+
|
|
267
|
+
|
|
268
|
+
# --- timestamp captioning helpers (module-level, shared by SRT/VTT) ---
|
|
269
|
+
|
|
270
|
+
_SENTENCE_TERMINATORS = (".", "?", "!", "。", "?", "!")
|
|
271
|
+
_MAX_CAPTION_SECONDS = 7.0
|
|
272
|
+
_MAX_CAPTION_CHARS = 42
|
|
273
|
+
|
|
274
|
+
|
|
275
|
+
def _segments_for_captioning(words, characters):
|
|
276
|
+
"""Pick which segment list to use, returning (segments, word_mode) tuple.
|
|
277
|
+
|
|
278
|
+
- words with >= 2 entries -> words (word_mode=True: join parts with space)
|
|
279
|
+
- else if characters with >= 1 entry -> characters (word_mode=False: concat directly)
|
|
280
|
+
- single-entry words with no characters -> words (word_mode=True, one cue)
|
|
281
|
+
- else -> ValueError
|
|
282
|
+
|
|
283
|
+
Raises ValueError if more than 50% of segments have empty text (defense-in-depth:
|
|
284
|
+
the server contract should never produce majority-empty alignment arrays).
|
|
285
|
+
"""
|
|
286
|
+
if words and len(words) >= 2:
|
|
287
|
+
segs = words
|
|
288
|
+
elif characters and len(characters) >= 1:
|
|
289
|
+
return characters, False
|
|
290
|
+
elif words and len(words) == 1 and not characters:
|
|
291
|
+
segs = words # English single-cue is still valid
|
|
292
|
+
else:
|
|
293
|
+
raise ValueError("no alignment segments to caption from")
|
|
294
|
+
|
|
295
|
+
empty_count = sum(1 for s in segs if not s.text.strip())
|
|
296
|
+
if empty_count > len(segs) / 2:
|
|
297
|
+
raise ValueError("alignment segments contain empty text")
|
|
298
|
+
return segs, True
|
|
299
|
+
|
|
300
|
+
|
|
301
|
+
def _group_into_cues(
|
|
302
|
+
segments,
|
|
303
|
+
word_mode: bool = False,
|
|
304
|
+
max_seconds: float = _MAX_CAPTION_SECONDS,
|
|
305
|
+
max_chars: int = _MAX_CAPTION_CHARS,
|
|
306
|
+
):
|
|
307
|
+
"""Group segments into caption cues using shared rules:
|
|
308
|
+
- Split on sentence terminator at end of segment text.
|
|
309
|
+
- Split BEFORE appending if adding the segment would push the cue past
|
|
310
|
+
max_seconds or max_chars (hard cap). Defaults follow BBC/Netflix subtitle
|
|
311
|
+
guidelines (7.0s / 42 chars).
|
|
312
|
+
|
|
313
|
+
word_mode=True: parts are joined with a single space.
|
|
314
|
+
word_mode=False: parts are concatenated directly.
|
|
315
|
+
|
|
316
|
+
Returns list[(text, start, end)] tuples.
|
|
317
|
+
"""
|
|
318
|
+
cues = []
|
|
319
|
+
cur_text_parts = []
|
|
320
|
+
cur_start = None
|
|
321
|
+
last_end = None
|
|
322
|
+
|
|
323
|
+
def _joined():
|
|
324
|
+
if word_mode:
|
|
325
|
+
return " ".join(cur_text_parts).strip()
|
|
326
|
+
return "".join(cur_text_parts).strip()
|
|
327
|
+
|
|
328
|
+
def _flush(end_time):
|
|
329
|
+
text = _joined()
|
|
330
|
+
if text:
|
|
331
|
+
cues.append((text, cur_start, end_time))
|
|
332
|
+
|
|
333
|
+
for seg in segments:
|
|
334
|
+
if cur_text_parts and cur_start is not None and last_end is not None:
|
|
335
|
+
if word_mode:
|
|
336
|
+
would_be_text = " ".join([*cur_text_parts, seg.text]).strip()
|
|
337
|
+
else:
|
|
338
|
+
would_be_text = "".join([*cur_text_parts, seg.text]).strip()
|
|
339
|
+
would_exceed_seconds = (seg.end - cur_start) > max_seconds
|
|
340
|
+
would_exceed_chars = len(would_be_text) > max_chars
|
|
341
|
+
if would_exceed_seconds or would_exceed_chars:
|
|
342
|
+
_flush(last_end)
|
|
343
|
+
cur_text_parts = []
|
|
344
|
+
cur_start = None
|
|
345
|
+
|
|
346
|
+
if cur_start is None:
|
|
347
|
+
cur_start = seg.start
|
|
348
|
+
cur_text_parts.append(seg.text)
|
|
349
|
+
last_end = seg.end
|
|
350
|
+
|
|
351
|
+
ends_in_sentence = seg.text.rstrip().endswith(_SENTENCE_TERMINATORS)
|
|
352
|
+
if ends_in_sentence:
|
|
353
|
+
_flush(seg.end)
|
|
354
|
+
cur_text_parts = []
|
|
355
|
+
cur_start = None
|
|
356
|
+
|
|
357
|
+
if cur_text_parts and last_end is not None:
|
|
358
|
+
_flush(last_end)
|
|
359
|
+
return cues
|
|
360
|
+
|
|
361
|
+
|
|
362
|
+
def _format_srt_time(seconds: float) -> str:
|
|
363
|
+
total_ms = int(round(seconds * 1000))
|
|
364
|
+
hh, rem = divmod(total_ms, 3600 * 1000)
|
|
365
|
+
mm, rem = divmod(rem, 60 * 1000)
|
|
366
|
+
ss, ms = divmod(rem, 1000)
|
|
367
|
+
return f"{hh:02d}:{mm:02d}:{ss:02d},{ms:03d}"
|
|
368
|
+
|
|
369
|
+
|
|
370
|
+
def _format_vtt_time(seconds: float) -> str:
|
|
371
|
+
"""Format time for WebVTT format: HH:MM:SS.mmm (dot decimal, not comma)."""
|
|
372
|
+
total_ms = int(round(seconds * 1000))
|
|
373
|
+
hh, rem = divmod(total_ms, 3600 * 1000)
|
|
374
|
+
mm, rem = divmod(rem, 60 * 1000)
|
|
375
|
+
ss, ms = divmod(rem, 1000)
|
|
376
|
+
return f"{hh:02d}:{mm:02d}:{ss:02d}.{ms:03d}"
|
|
377
|
+
|
|
378
|
+
|
|
379
|
+
class TTSWithTimestampsResponse(BaseModel):
|
|
380
|
+
"""Response payload for `POST /v1/text-to-speech/with-timestamps`.
|
|
381
|
+
|
|
382
|
+
Contains base64-encoded audio plus optional word/character alignment arrays.
|
|
383
|
+
Helper methods (`save_audio()`, `to_srt()`, `to_vtt()`) are added in
|
|
384
|
+
subsequent tasks.
|
|
385
|
+
"""
|
|
386
|
+
|
|
387
|
+
audio: str = Field(description="Base64-encoded audio bytes.")
|
|
388
|
+
audio_format: Literal["wav", "mp3"] = Field(description="Audio encoding format.")
|
|
389
|
+
audio_duration: float = Field(description="Length of audio in seconds.")
|
|
390
|
+
words: Optional[list[AlignmentSegmentWord]] = Field(
|
|
391
|
+
default=None,
|
|
392
|
+
description="Word-level timestamps; null when granularity=char.",
|
|
393
|
+
)
|
|
394
|
+
characters: Optional[list[AlignmentSegmentCharacter]] = Field(
|
|
395
|
+
default=None,
|
|
396
|
+
description="Character-level timestamps; null when granularity=word.",
|
|
397
|
+
)
|
|
398
|
+
|
|
399
|
+
@property
|
|
400
|
+
def audio_bytes(self) -> bytes:
|
|
401
|
+
"""Return decoded audio bytes from the base64 `audio` field."""
|
|
402
|
+
import base64
|
|
403
|
+
return base64.b64decode(self.audio, validate=True)
|
|
404
|
+
|
|
405
|
+
def save_audio(self, path: str) -> None:
|
|
406
|
+
"""Write decoded audio bytes to `path`."""
|
|
407
|
+
with open(path, "wb") as f:
|
|
408
|
+
f.write(self.audio_bytes)
|
|
409
|
+
|
|
410
|
+
def to_srt(self, max_seconds: float = 7.0, max_chars: int = 42) -> str:
|
|
411
|
+
"""Return SRT-formatted caption string for this TTS response.
|
|
412
|
+
|
|
413
|
+
Uses word-level segments when words has >= 2 entries; falls back to
|
|
414
|
+
character-level segments otherwise (e.g. jpn/zho collapsed words).
|
|
415
|
+
Cues are split on sentence terminators (. ? ! 。 ? !) or when a cue
|
|
416
|
+
would exceed max_seconds or max_chars. Default values follow BBC/Netflix
|
|
417
|
+
subtitle guidelines (7.0s / 42 chars).
|
|
418
|
+
"""
|
|
419
|
+
segments, word_mode = _segments_for_captioning(self.words, self.characters)
|
|
420
|
+
cues = _group_into_cues(segments, word_mode=word_mode, max_seconds=max_seconds, max_chars=max_chars)
|
|
421
|
+
if not cues:
|
|
422
|
+
raise ValueError("no alignment segments to caption from")
|
|
423
|
+
lines = []
|
|
424
|
+
for idx, (text, start, end) in enumerate(cues, start=1):
|
|
425
|
+
lines.append(str(idx))
|
|
426
|
+
lines.append(f"{_format_srt_time(start)} --> {_format_srt_time(end)}")
|
|
427
|
+
lines.append(text)
|
|
428
|
+
lines.append("")
|
|
429
|
+
return "\n".join(lines) + "\n"
|
|
430
|
+
|
|
431
|
+
def to_vtt(self, max_seconds: float = 7.0, max_chars: int = 42) -> str:
|
|
432
|
+
"""Return WebVTT-formatted caption string for this TTS response.
|
|
433
|
+
|
|
434
|
+
Uses word-level segments when words has >= 2 entries; falls back to
|
|
435
|
+
character-level segments otherwise (e.g. jpn/zho collapsed words).
|
|
436
|
+
Cues are split on sentence terminators (. ? ! 。 ? !) or when a cue
|
|
437
|
+
would exceed max_seconds or max_chars. Default values follow BBC/Netflix
|
|
438
|
+
subtitle guidelines (7.0s / 42 chars).
|
|
439
|
+
"""
|
|
440
|
+
segments, word_mode = _segments_for_captioning(self.words, self.characters)
|
|
441
|
+
cues = _group_into_cues(segments, word_mode=word_mode, max_seconds=max_seconds, max_chars=max_chars)
|
|
442
|
+
if not cues:
|
|
443
|
+
raise ValueError("no alignment segments to caption from")
|
|
444
|
+
lines = ["WEBVTT", ""]
|
|
445
|
+
for text, start, end in cues:
|
|
446
|
+
lines.append(f"{_format_vtt_time(start)} --> {_format_vtt_time(end)}")
|
|
447
|
+
lines.append(text)
|
|
448
|
+
lines.append("")
|
|
449
|
+
return "\n".join(lines) + "\n"
|
|
@@ -1,214 +0,0 @@
|
|
|
1
|
-
from enum import Enum
|
|
2
|
-
from typing import Literal, Optional, Union
|
|
3
|
-
|
|
4
|
-
from pydantic import BaseModel, ConfigDict, Field, model_validator
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
class TTSModel(str, Enum):
|
|
8
|
-
SSFM_V21 = "ssfm-v21"
|
|
9
|
-
SSFM_V30 = "ssfm-v30"
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
class LanguageCode(str, Enum):
|
|
13
|
-
"""ISO 639-3 language codes supported by Typecast API
|
|
14
|
-
|
|
15
|
-
ssfm-v21: 27 languages
|
|
16
|
-
ssfm-v30: 37 languages (includes all v21 languages plus additional ones)
|
|
17
|
-
"""
|
|
18
|
-
|
|
19
|
-
ENG = "eng" # English
|
|
20
|
-
KOR = "kor" # Korean
|
|
21
|
-
SPA = "spa" # Spanish
|
|
22
|
-
DEU = "deu" # German
|
|
23
|
-
FRA = "fra" # French
|
|
24
|
-
ITA = "ita" # Italian
|
|
25
|
-
POL = "pol" # Polish
|
|
26
|
-
NLD = "nld" # Dutch
|
|
27
|
-
RUS = "rus" # Russian
|
|
28
|
-
JPN = "jpn" # Japanese
|
|
29
|
-
ELL = "ell" # Greek
|
|
30
|
-
TAM = "tam" # Tamil
|
|
31
|
-
TGL = "tgl" # Tagalog
|
|
32
|
-
FIN = "fin" # Finnish
|
|
33
|
-
ZHO = "zho" # Chinese
|
|
34
|
-
SLK = "slk" # Slovak
|
|
35
|
-
ARA = "ara" # Arabic
|
|
36
|
-
HRV = "hrv" # Croatian
|
|
37
|
-
UKR = "ukr" # Ukrainian
|
|
38
|
-
IND = "ind" # Indonesian
|
|
39
|
-
DAN = "dan" # Danish
|
|
40
|
-
SWE = "swe" # Swedish
|
|
41
|
-
MSA = "msa" # Malay
|
|
42
|
-
CES = "ces" # Czech
|
|
43
|
-
POR = "por" # Portuguese
|
|
44
|
-
BUL = "bul" # Bulgarian
|
|
45
|
-
RON = "ron" # Romanian
|
|
46
|
-
# ssfm-v30 additional languages
|
|
47
|
-
BEN = "ben" # Bengali
|
|
48
|
-
HIN = "hin" # Hindi
|
|
49
|
-
HUN = "hun" # Hungarian
|
|
50
|
-
NAN = "nan" # Min Nan
|
|
51
|
-
NOR = "nor" # Norwegian
|
|
52
|
-
PAN = "pan" # Punjabi
|
|
53
|
-
THA = "tha" # Thai
|
|
54
|
-
TUR = "tur" # Turkish
|
|
55
|
-
VIE = "vie" # Vietnamese
|
|
56
|
-
YUE = "yue" # Cantonese
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
class EmotionPreset(str, Enum):
|
|
60
|
-
"""Emotion preset types
|
|
61
|
-
|
|
62
|
-
ssfm-v21: normal, happy, sad, angry
|
|
63
|
-
ssfm-v30: normal, happy, sad, angry, whisper, toneup, tonedown
|
|
64
|
-
"""
|
|
65
|
-
|
|
66
|
-
NORMAL = "normal"
|
|
67
|
-
HAPPY = "happy"
|
|
68
|
-
SAD = "sad"
|
|
69
|
-
ANGRY = "angry"
|
|
70
|
-
WHISPER = "whisper" # ssfm-v30 only
|
|
71
|
-
TONEUP = "toneup" # ssfm-v30 only
|
|
72
|
-
TONEDOWN = "tonedown" # ssfm-v30 only
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
class Prompt(BaseModel):
|
|
76
|
-
"""Emotion and style settings for ssfm-v21 model"""
|
|
77
|
-
|
|
78
|
-
emotion_preset: Optional[str] = Field(
|
|
79
|
-
default="normal",
|
|
80
|
-
description="Emotion preset",
|
|
81
|
-
examples=["normal", "happy", "sad", "angry"],
|
|
82
|
-
)
|
|
83
|
-
emotion_intensity: Optional[float] = Field(default=1.0, ge=0.0, le=2.0)
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
class PresetPrompt(BaseModel):
|
|
87
|
-
"""Preset-based emotion control for ssfm-v30 model"""
|
|
88
|
-
|
|
89
|
-
emotion_type: Literal["preset"] = Field(
|
|
90
|
-
default="preset",
|
|
91
|
-
description="Must be 'preset' for preset-based emotion control",
|
|
92
|
-
)
|
|
93
|
-
emotion_preset: Optional[str] = Field(
|
|
94
|
-
default="normal",
|
|
95
|
-
description="Emotion preset to apply",
|
|
96
|
-
examples=["normal", "happy", "sad", "angry", "whisper", "toneup", "tonedown"],
|
|
97
|
-
)
|
|
98
|
-
emotion_intensity: Optional[float] = Field(default=1.0, ge=0.0, le=2.0)
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
class SmartPrompt(BaseModel):
|
|
102
|
-
"""Context-aware emotion inference for ssfm-v30 model"""
|
|
103
|
-
|
|
104
|
-
emotion_type: Literal["smart"] = Field(
|
|
105
|
-
default="smart",
|
|
106
|
-
description="Must be 'smart' for context-aware emotion inference",
|
|
107
|
-
)
|
|
108
|
-
previous_text: Optional[str] = Field(
|
|
109
|
-
default=None,
|
|
110
|
-
description="Text that comes BEFORE the main text (max 2000 chars)",
|
|
111
|
-
max_length=2000,
|
|
112
|
-
)
|
|
113
|
-
next_text: Optional[str] = Field(
|
|
114
|
-
default=None,
|
|
115
|
-
description="Text that comes AFTER the main text (max 2000 chars)",
|
|
116
|
-
max_length=2000,
|
|
117
|
-
)
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
# Union type for all prompt types
|
|
121
|
-
TTSPrompt = Union[Prompt, PresetPrompt, SmartPrompt]
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
class Output(BaseModel):
|
|
125
|
-
volume: Optional[int] = Field(
|
|
126
|
-
default=100,
|
|
127
|
-
ge=0,
|
|
128
|
-
le=200,
|
|
129
|
-
description="Volume (0-200). Cannot be used together with target_lufs.",
|
|
130
|
-
)
|
|
131
|
-
target_lufs: Optional[float] = Field(
|
|
132
|
-
default=None,
|
|
133
|
-
ge=-70.0,
|
|
134
|
-
le=0.0,
|
|
135
|
-
description="Target loudness in LUFS for absolute loudness normalization (-70 to 0). Cannot be used together with volume.",
|
|
136
|
-
)
|
|
137
|
-
audio_pitch: Optional[int] = Field(default=0, ge=-12, le=12)
|
|
138
|
-
audio_tempo: Optional[float] = Field(default=1.0, ge=0.5, le=2.0)
|
|
139
|
-
audio_format: Optional[str] = Field(
|
|
140
|
-
default="wav", description="Audio format", examples=["wav", "mp3"]
|
|
141
|
-
)
|
|
142
|
-
|
|
143
|
-
@model_validator(mode="before")
|
|
144
|
-
@classmethod
|
|
145
|
-
def check_volume_and_target_lufs(cls, data: dict) -> dict:
|
|
146
|
-
if isinstance(data, dict):
|
|
147
|
-
target_lufs = data.get("target_lufs")
|
|
148
|
-
volume = data.get("volume")
|
|
149
|
-
volume_explicitly_set = "volume" in data
|
|
150
|
-
if target_lufs is not None and volume is not None and volume_explicitly_set:
|
|
151
|
-
raise ValueError("volume and target_lufs cannot be used together")
|
|
152
|
-
if target_lufs is not None and not volume_explicitly_set:
|
|
153
|
-
data["volume"] = None
|
|
154
|
-
return data
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
class TTSRequest(BaseModel):
|
|
158
|
-
model_config = ConfigDict(json_schema_extra={"exclude_none": True})
|
|
159
|
-
|
|
160
|
-
voice_id: str = Field(
|
|
161
|
-
description="Voice ID", examples=["tc_62a8975e695ad26f7fb514d1"]
|
|
162
|
-
)
|
|
163
|
-
text: str = Field(description="Text", examples=["Hello. How are you?"])
|
|
164
|
-
model: TTSModel = Field(description="Voice model name", examples=["ssfm-v21"])
|
|
165
|
-
language: Optional[Union[LanguageCode, str]] = Field(
|
|
166
|
-
None, description="Language code (ISO 639-3)", examples=["eng"]
|
|
167
|
-
)
|
|
168
|
-
prompt: Optional[TTSPrompt] = None
|
|
169
|
-
output: Optional[Output] = None
|
|
170
|
-
seed: Optional[int] = None
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
class TTSResponse(BaseModel):
|
|
174
|
-
audio_data: bytes
|
|
175
|
-
duration: float
|
|
176
|
-
format: str = "wav"
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
class OutputStream(BaseModel):
|
|
180
|
-
"""Audio output settings for streaming mode.
|
|
181
|
-
|
|
182
|
-
Streaming mode does not support `volume` or `target_lufs` because the
|
|
183
|
-
server has to commit each chunk before the full waveform is known.
|
|
184
|
-
Passing either field raises a validation error so misuse fails fast.
|
|
185
|
-
"""
|
|
186
|
-
|
|
187
|
-
model_config = ConfigDict(extra="forbid")
|
|
188
|
-
|
|
189
|
-
audio_pitch: Optional[int] = Field(default=0, ge=-12, le=12)
|
|
190
|
-
audio_tempo: Optional[float] = Field(default=1.0, ge=0.5, le=2.0)
|
|
191
|
-
audio_format: Optional[str] = Field(
|
|
192
|
-
default="wav", description="Audio format", examples=["wav", "mp3"]
|
|
193
|
-
)
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
class TTSRequestStream(BaseModel):
|
|
197
|
-
"""Request body for `POST /v1/text-to-speech/stream`.
|
|
198
|
-
|
|
199
|
-
Mirrors `TTSRequest` but uses `OutputStream` (no volume / target_lufs).
|
|
200
|
-
"""
|
|
201
|
-
|
|
202
|
-
model_config = ConfigDict(json_schema_extra={"exclude_none": True})
|
|
203
|
-
|
|
204
|
-
voice_id: str = Field(
|
|
205
|
-
description="Voice ID", examples=["tc_62a8975e695ad26f7fb514d1"]
|
|
206
|
-
)
|
|
207
|
-
text: str = Field(description="Text", examples=["Hello. How are you?"])
|
|
208
|
-
model: TTSModel = Field(description="Voice model name", examples=["ssfm-v21"])
|
|
209
|
-
language: Optional[Union[LanguageCode, str]] = Field(
|
|
210
|
-
None, description="Language code (ISO 639-3)", examples=["eng"]
|
|
211
|
-
)
|
|
212
|
-
prompt: Optional[TTSPrompt] = None
|
|
213
|
-
output: Optional[OutputStream] = None
|
|
214
|
-
seed: Optional[int] = None
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|