typecast-python 0.2.2__py3-none-any.whl → 0.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,89 @@
1
+ """Internal helpers for instant cloning (sync/async shared)."""
2
+ from __future__ import annotations
3
+
4
+ import os
5
+ from pathlib import Path
6
+ from typing import BinaryIO, Union
7
+
8
# Upload ceiling for a cloning sample, in bytes.
CLONING_MAX_FILE_SIZE = 25 * 1024**2  # must match typecast-api `cloning_max_file_size`
# Inclusive bounds on the voice name length, in characters.
NAME_MIN_LENGTH, NAME_MAX_LENGTH = 1, 30
# Engine models that may be used for cloning.
ALLOWED_CLONE_MODELS = frozenset(("ssfm-v21", "ssfm-v30"))
# Prefix that identifies a custom (cloned) voice id.
CUSTOM_VOICE_ID_PREFIX = "uc_"

# Accepted forms of the audio sample: file path, raw bytes, or binary file object.
AudioInput = Union[str, Path, bytes, BinaryIO]
15
+
16
+
17
def normalize_clone_model(model: object) -> str:
    """Return the string form of ``model`` after checking it is clone-capable.

    Objects exposing a ``value`` attribute (such as a ``TTSModel`` enum member)
    contribute that attribute; anything else is passed through ``str()``.

    Raises:
        ValueError: the resolved value is not in :data:`ALLOWED_CLONE_MODELS`.
            Raising here lets callers fail client-side instead of waiting for
            a 422 from the API.
    """
    if hasattr(model, "value"):
        resolved = model.value
    else:
        resolved = str(model)

    if resolved in ALLOWED_CLONE_MODELS:
        return resolved

    choices = ", ".join(sorted(ALLOWED_CLONE_MODELS))
    raise ValueError(f"model must be one of: {choices}; got {resolved!r}")
29
+
30
+
31
def validate_custom_voice_id(voice_id: str) -> None:
    """Ensure ``voice_id`` is a custom voice id before it is sent to DELETE.

    Raises:
        ValueError: ``voice_id`` is not a string or lacks the
            :data:`CUSTOM_VOICE_ID_PREFIX` prefix.
    """
    looks_custom = isinstance(voice_id, str) and voice_id.startswith(
        CUSTOM_VOICE_ID_PREFIX
    )
    if looks_custom:
        return
    raise ValueError(
        f"voice_id must start with {CUSTOM_VOICE_ID_PREFIX!r}; got {voice_id!r}"
    )
37
+
38
+
39
def validate_clone_inputs(audio: AudioInput, name: str) -> tuple[bytes, str]:
    """Pre-validate `clone_voice` inputs and return (audio_bytes, filename).

    Args:
        audio: One of file path (str/Path), raw bytes, or readable binary file object.
        name: Voice name (1-30 chars).

    Returns:
        (audio_bytes, filename) — filename is derived from the path/file object,
        or defaults to "audio.wav" when the caller passes raw bytes or when no
        usable name can be derived from a file object.

    Raises:
        ValueError: name length out of range or file too large.
        FileNotFoundError: path argument does not refer to an existing regular file.
        TypeError: audio is none of the accepted types, or a file object
            returned something other than bytes.
    """
    if not (NAME_MIN_LENGTH <= len(name) <= NAME_MAX_LENGTH):
        raise ValueError(
            f"name must be {NAME_MIN_LENGTH}-{NAME_MAX_LENGTH} characters; got {len(name)}"
        )

    if isinstance(audio, (str, Path)):
        path = Path(audio)
        # is_file() is False for a missing path as well as for a directory.
        if not path.is_file():
            raise FileNotFoundError(f"audio file not found: {path}")
        # Check the on-disk size first so an oversized file is rejected
        # without first being loaded into memory.
        size_on_disk = path.stat().st_size
        if size_on_disk > CLONING_MAX_FILE_SIZE:
            raise ValueError(
                f"audio file exceeds 25MB limit; got {size_on_disk} bytes"
            )
        audio_bytes = path.read_bytes()
        filename = path.name
    elif isinstance(audio, (bytes, bytearray)):
        audio_bytes = bytes(audio)
        filename = "audio.wav"
    elif hasattr(audio, "read"):
        audio_bytes = audio.read()
        if isinstance(audio_bytes, bytearray):
            audio_bytes = bytes(audio_bytes)
        if not isinstance(audio_bytes, bytes):
            raise TypeError(
                "audio file object must be opened in binary mode and return bytes"
            )
        raw_name = getattr(audio, "name", None) or "audio.wav"
        # Files opened with a bytes path expose a bytes name; decode it rather
        # than letting str() produce a "b'...'" literal.
        if isinstance(raw_name, bytes):
            raw_name = os.fsdecode(raw_name)
        # Normalize Windows separators so basename works for either style, and
        # fall back to the default when the name has no final component.
        filename = os.path.basename(str(raw_name).replace("\\", "/")) or "audio.wav"
    else:
        raise TypeError(
            "audio must be a file path (str/Path), bytes, or readable binary file object"
        )

    # Final guard: covers raw bytes, file objects, and a path whose file grew
    # between the stat() call and the read.
    if len(audio_bytes) > CLONING_MAX_FILE_SIZE:
        raise ValueError(
            f"audio file exceeds 25MB limit; got {len(audio_bytes)} bytes"
        )

    return audio_bytes, filename
typecast/async_client.py CHANGED
@@ -1,8 +1,16 @@
1
- from typing import AsyncIterator, Optional
1
+ from pathlib import Path
2
+ from typing import AsyncIterator, BinaryIO, Optional, Union
3
+ from urllib.parse import quote
2
4
 
3
5
  import aiohttp
4
6
 
5
7
  from . import conf
8
+ from ._voice_clone import (
9
+ normalize_clone_model,
10
+ validate_clone_inputs,
11
+ validate_custom_voice_id,
12
+ )
13
+ from .client import _guess_audio_mime
6
14
  from .exceptions import (
7
15
  BadRequestError,
8
16
  InternalServerError,
@@ -14,10 +22,14 @@ from .exceptions import (
14
22
  UnprocessableEntityError,
15
23
  )
16
24
  from .models import (
25
+ CustomVoice,
17
26
  SubscriptionResponse,
27
+ TTSModel,
18
28
  TTSRequest,
19
29
  TTSRequestStream,
30
+ TTSRequestWithTimestamps,
20
31
  TTSResponse,
32
+ TTSWithTimestampsResponse,
21
33
  VoicesResponse,
22
34
  VoicesV2Filter,
23
35
  VoiceV2Response,
@@ -59,7 +71,9 @@ class AsyncTypecast:
59
71
  self.session: Optional[aiohttp.ClientSession] = None
60
72
 
61
73
  async def __aenter__(self):
62
- headers = {"Content-Type": "application/json"}
74
+ # Auth header at session scope; per-request Content-Type is set by aiohttp
75
+ # (json= auto-sets application/json, data=FormData() auto-sets multipart).
76
+ headers = {}
63
77
  if self.api_key:
64
78
  headers["X-API-KEY"] = self.api_key
65
79
  self.session = aiohttp.ClientSession(headers=headers)
@@ -168,6 +182,119 @@ class AsyncTypecast:
168
182
  async for chunk in response.content.iter_chunked(chunk_size):
169
183
  yield chunk
170
184
 
185
async def text_to_speech_with_timestamps(
    self,
    request: TTSRequestWithTimestamps,
    granularity: Optional[str] = None,
) -> TTSWithTimestampsResponse:
    """Async version of ``Typecast.text_to_speech_with_timestamps``.

    Args:
        request: Request body (same shape as ``TTSRequest``).
        granularity: Optional ``"word"`` or ``"char"`` filter, sent as a
            query parameter when given.

    Returns:
        ``TTSWithTimestampsResponse`` with helpers ``to_srt()``,
        ``to_vtt()``, ``save_audio()``.

    Raises:
        TypecastError: If the client session is not initialized
            (i.e. used outside ``async with``).
        ValueError: If ``granularity`` is not ``None``, ``"word"``, or ``"char"``.
        BadRequestError, UnauthorizedError, PaymentRequiredError,
        NotFoundError, UnprocessableEntityError, RateLimitError,
        InternalServerError, TypecastError: per HTTP status.
    """
    if self.session is None:
        raise TypecastError("Client session not initialized; use 'async with'.")

    accepted = (None, "word", "char")
    if granularity not in accepted:
        raise ValueError(
            f"granularity must be None, 'word', or 'char'; got {granularity!r}"
        )

    # Only attach the query string when a filter was requested.
    query = None
    if granularity:
        query = {"granularity": granularity}

    url = f"{self.host}/v1/text-to-speech/with-timestamps"
    payload = request.model_dump(exclude_none=True)
    async with self.session.post(url, json=payload, params=query) as response:
        if response.status != 200:
            error_text = await response.text()
            self._handle_error(response.status, error_text)
        decoded = await response.json()
        return TTSWithTimestampsResponse.model_validate(decoded)
226
+
227
async def clone_voice(
    self,
    audio: Union[str, Path, bytes, BinaryIO],
    name: str,
    model: Union[str, "TTSModel"],
) -> CustomVoice:
    """Create a quick-cloned custom voice from an audio sample (async).

    Args:
        audio: Audio sample. Accepts file path (str/Path), raw bytes,
            or a readable binary file object. Max 25 MB.
        name: Voice name, 1-30 characters.
        model: Engine model. ``"ssfm-v21"`` or ``"ssfm-v30"`` (or ``TTSModel`` enum).

    Returns:
        ``CustomVoice`` with ``voice_id`` (uc_ prefix), ``name``, and ``model``.

    Raises:
        ValueError: name length out of range, audio exceeds 25 MB, or
            ``model`` is not an allowed clone model.
        FileNotFoundError: ``audio`` is a path to a non-existent file.
        TypecastError: client session not initialized or HTTP error.
    """
    if self.session is None:
        raise TypecastError("Client session not initialized; use 'async with'.")

    # Client-side validation happens before anything is sent.
    sample, sample_name = validate_clone_inputs(audio, name)
    engine = normalize_clone_model(model)

    upload = aiohttp.FormData()
    for field_name, field_value in (("name", name), ("model", engine)):
        upload.add_field(field_name, field_value)
    upload.add_field(
        "file",
        sample,
        filename=sample_name,
        content_type=_guess_audio_mime(sample_name),
    )

    url = f"{self.host}/v1/voices/clone"
    limits = aiohttp.ClientTimeout(total=300, connect=10)
    async with self.session.post(url, data=upload, timeout=limits) as response:
        if response.status != 200:
            error_text = await response.text()
            self._handle_error(response.status, error_text)
        decoded = await response.json()
        return CustomVoice.model_validate(decoded)
275
+
276
async def delete_voice(self, voice_id: str) -> None:
    """Soft-delete a custom voice (async).

    Args:
        voice_id: Voice identifier with ``uc_`` prefix.

    Raises:
        TypecastError: client session not initialized.
        ValueError: ``voice_id`` fails the custom-voice-id check.
        TypecastError subclasses: per HTTP status from the API.
    """
    if self.session is None:
        raise TypecastError("Client session not initialized; use 'async with'.")

    validate_custom_voice_id(voice_id)

    # Percent-encode the whole id (including "/") so it stays one path segment.
    url = f"{self.host}/v1/voices/{quote(voice_id, safe='')}"
    limits = aiohttp.ClientTimeout(total=60, connect=10)
    success_statuses = (200, 204)
    async with self.session.delete(url, timeout=limits) as response:
        if response.status not in success_statuses:
            error_text = await response.text()
            self._handle_error(response.status, error_text)
297
+
171
298
  async def voices(self, model: Optional[str] = None) -> list[VoicesResponse]:
172
299
  """Get available voices (V1 API) asynchronously.
173
300
 
typecast/client.py CHANGED
@@ -1,8 +1,15 @@
1
- from typing import Iterator, Optional
1
+ from pathlib import Path
2
+ from typing import BinaryIO, Iterator, Optional, Union
3
+ from urllib.parse import quote
2
4
 
3
5
  import requests
4
6
 
5
7
  from . import conf
8
+ from ._voice_clone import (
9
+ normalize_clone_model,
10
+ validate_clone_inputs,
11
+ validate_custom_voice_id,
12
+ )
6
13
  from .exceptions import (
7
14
  BadRequestError,
8
15
  InternalServerError,
@@ -14,16 +21,30 @@ from .exceptions import (
14
21
  UnprocessableEntityError,
15
22
  )
16
23
  from .models import (
24
+ CustomVoice,
17
25
  SubscriptionResponse,
26
+ TTSModel,
18
27
  TTSRequest,
19
28
  TTSRequestStream,
29
+ TTSRequestWithTimestamps,
20
30
  TTSResponse,
31
+ TTSWithTimestampsResponse,
21
32
  VoicesResponse,
22
33
  VoicesV2Filter,
23
34
  VoiceV2Response,
24
35
  )
25
36
 
26
37
 
38
+ def _guess_audio_mime(filename: str) -> str:
39
+ """Guess audio MIME type from filename extension; fall back to octet-stream."""
40
+ lower = filename.lower()
41
+ if lower.endswith(".wav"):
42
+ return "audio/wav"
43
+ if lower.endswith(".mp3"):
44
+ return "audio/mpeg"
45
+ return "application/octet-stream"
46
+
47
+
27
48
  class Typecast:
28
49
  """Synchronous client for the Typecast Text-to-Speech API.
29
50
 
@@ -161,6 +182,106 @@ class Typecast:
161
182
  finally:
162
183
  response.close()
163
184
 
185
def text_to_speech_with_timestamps(
    self,
    request: TTSRequestWithTimestamps,
    granularity: Optional[str] = None,
) -> TTSWithTimestampsResponse:
    """Synthesize speech and return base64 audio + alignment timestamps.

    Args:
        request: Request body (same shape as `TTSRequest`).
        granularity: Optional ``"word"`` or ``"char"`` to filter the
            returned alignment arrays. Omit to receive both.

    Returns:
        ``TTSWithTimestampsResponse`` with ``audio`` (base64),
        ``words``, ``characters``, and helper methods ``to_srt()``,
        ``to_vtt()``, and ``save_audio()``.

    Raises:
        ValueError: If ``granularity`` is not ``None``, ``"word"``, or ``"char"``.
        BadRequestError, UnauthorizedError, PaymentRequiredError,
        NotFoundError, UnprocessableEntityError, RateLimitError,
        InternalServerError, TypecastError: per HTTP status.
    """
    accepted = (None, "word", "char")
    if granularity not in accepted:
        raise ValueError(
            f"granularity must be None, 'word', or 'char'; got {granularity!r}"
        )

    # Only attach the query string when a filter was requested.
    query = None
    if granularity:
        query = {"granularity": granularity}

    url = f"{self.host}/v1/text-to-speech/with-timestamps"
    payload = request.model_dump(exclude_none=True)
    response = self.session.post(
        url,
        json=payload,
        params=query,
        timeout=(10, 300),  # (connect, read) seconds
    )
    if response.status_code != 200:
        self._handle_error(response.status_code, response.text)
    return TTSWithTimestampsResponse.model_validate(response.json())
223
+
224
def clone_voice(
    self,
    audio: Union[str, Path, bytes, BinaryIO],
    name: str,
    model: Union[str, "TTSModel"],
) -> CustomVoice:
    """Create a quick-cloned custom voice from an audio sample.

    Args:
        audio: Audio sample. Accepts file path (str/Path), raw bytes,
            or a readable binary file object. Max 25 MB.
        name: Voice name, 1-30 characters.
        model: Engine model. ``"ssfm-v21"`` or ``"ssfm-v30"`` (or ``TTSModel`` enum).

    Returns:
        ``CustomVoice`` with ``voice_id`` (uc_ prefix), ``name``, and ``model``.
        Use ``voice_id`` directly with ``text_to_speech`` to synthesize.

    Raises:
        ValueError: name length out of range, audio exceeds 25 MB, or
            ``model`` is not an allowed clone model.
        FileNotFoundError: ``audio`` is a path to a non-existent file.
        TypecastError subclasses: per HTTP status from the API.
    """
    # Client-side validation happens before anything is sent.
    sample, sample_name = validate_clone_inputs(audio, name)
    engine = normalize_clone_model(model)

    file_part = (sample_name, sample, _guess_audio_mime(sample_name))
    form_fields = {"name": name, "model": engine}

    # Remove the session-level Content-Type so requests can set the
    # correct multipart/form-data boundary for this request.
    response = self.session.post(
        f"{self.host}/v1/voices/clone",
        files={"file": file_part},
        data=form_fields,
        headers={"Content-Type": None},
        timeout=(10, 300),  # (connect, read) seconds
    )
    if response.status_code != 200:
        self._handle_error(response.status_code, response.text)
    return CustomVoice.model_validate(response.json())
266
+
267
def delete_voice(self, voice_id: str) -> None:
    """Soft-delete a custom voice.

    Args:
        voice_id: Voice identifier with ``uc_`` prefix (returned by ``clone_voice``).

    Raises:
        ValueError: ``voice_id`` fails the custom-voice-id check.
        TypecastError subclasses: per HTTP status from the API
            (e.g., ``NotFoundError`` if the voice doesn't exist or isn't owned).
    """
    validate_custom_voice_id(voice_id)

    # Percent-encode the whole id (including "/") so it stays one path segment.
    url = f"{self.host}/v1/voices/{quote(voice_id, safe='')}"
    response = self.session.delete(url, timeout=(10, 60))

    success_statuses = (200, 204)
    if response.status_code not in success_statuses:
        self._handle_error(response.status_code, response.text)
284
+
164
285
  def voices(self, model: Optional[str] = None) -> list[VoicesResponse]:
165
286
  """Get available voices (V1 API).
166
287
 
@@ -1,6 +1,8 @@
1
1
  from .error import Error
2
2
  from .subscription import Credits, Limits, PlanTier, SubscriptionResponse
3
3
  from .tts import (
4
+ AlignmentSegmentCharacter,
5
+ AlignmentSegmentWord,
4
6
  EmotionPreset,
5
7
  LanguageCode,
6
8
  Output,
@@ -12,10 +14,13 @@ from .tts import (
12
14
  TTSPrompt,
13
15
  TTSRequest,
14
16
  TTSRequestStream,
17
+ TTSRequestWithTimestamps,
15
18
  TTSResponse,
19
+ TTSWithTimestampsResponse,
16
20
  )
17
21
  from .voices import (
18
22
  AgeEnum,
23
+ CustomVoice,
19
24
  GenderEnum,
20
25
  ModelInfo,
21
26
  UseCaseEnum,
@@ -25,28 +30,33 @@ from .voices import (
25
30
  )
26
31
 
27
32
  __all__ = [
28
- "TTSRequest",
29
- "TTSRequestStream",
30
- "TTSModel",
31
- "TTSPrompt",
32
- "Prompt",
33
- "PresetPrompt",
34
- "SmartPrompt",
33
+ "AgeEnum",
34
+ "AlignmentSegmentCharacter",
35
+ "AlignmentSegmentWord",
36
+ "Credits",
37
+ "CustomVoice",
35
38
  "EmotionPreset",
39
+ "Error",
40
+ "GenderEnum",
41
+ "LanguageCode",
42
+ "Limits",
43
+ "ModelInfo",
36
44
  "Output",
37
45
  "OutputStream",
46
+ "PlanTier",
47
+ "Prompt",
48
+ "PresetPrompt",
49
+ "SmartPrompt",
50
+ "SubscriptionResponse",
51
+ "TTSModel",
52
+ "TTSPrompt",
53
+ "TTSRequest",
54
+ "TTSRequestStream",
55
+ "TTSRequestWithTimestamps",
38
56
  "TTSResponse",
39
- "VoicesResponse",
57
+ "TTSWithTimestampsResponse",
58
+ "UseCaseEnum",
40
59
  "VoiceV2Response",
60
+ "VoicesResponse",
41
61
  "VoicesV2Filter",
42
- "ModelInfo",
43
- "GenderEnum",
44
- "AgeEnum",
45
- "UseCaseEnum",
46
- "Error",
47
- "LanguageCode",
48
- "PlanTier",
49
- "Credits",
50
- "Limits",
51
- "SubscriptionResponse",
52
62
  ]
typecast/models/tts.py CHANGED
@@ -212,3 +212,238 @@ class TTSRequestStream(BaseModel):
212
212
  prompt: Optional[TTSPrompt] = None
213
213
  output: Optional[OutputStream] = None
214
214
  seed: Optional[int] = None
215
+
216
+
217
class AlignmentSegmentWord(BaseModel):
    """Word-level alignment entry: a text fragment plus the time span it covers."""

    text: str = Field(description="Text fragment (with attached punctuation).")
    start: float = Field(description="Start time in seconds.", ge=0)
    end: float = Field(description="End time in seconds.", ge=0)

    @model_validator(mode="after")
    def _validate_span(self):
        # Runs once the individual fields have been validated; rejects spans
        # that finish before they begin.
        if self.start > self.end:
            raise ValueError("end must be greater than or equal to start")
        return self
229
+
230
+
231
class AlignmentSegmentCharacter(BaseModel):
    """Character-level alignment entry: a character plus the time span it covers."""

    text: str = Field(description="Character fragment (with punctuation/whitespace).")
    start: float = Field(description="Start time in seconds.", ge=0)
    end: float = Field(description="End time in seconds.", ge=0)

    @model_validator(mode="after")
    def _validate_span(self):
        # Runs once the individual fields have been validated; rejects spans
        # that finish before they begin.
        if self.start > self.end:
            raise ValueError("end must be greater than or equal to start")
        return self
243
+
244
+
245
class TTSRequestWithTimestamps(BaseModel):
    """Body of a `POST /v1/text-to-speech/with-timestamps` request.

    Carries the same fields as `TTSRequest` (voice_id, text, model, language,
    prompt, output, seed). The optional `granularity` query parameter is *not*
    a field here — supply it as an argument to
    `text_to_speech_with_timestamps()`.
    """

    model_config = ConfigDict(json_schema_extra={"exclude_none": True})

    voice_id: str = Field(
        description="Voice ID",
        examples=["tc_62a8975e695ad26f7fb514d1"],
    )
    text: str = Field(
        description="Text",
        examples=["Hello. How are you?"],
    )
    model: TTSModel = Field(
        description="Voice model name",
        examples=["ssfm-v30"],
    )
    language: Optional[Union[LanguageCode, str]] = Field(
        default=None,
        description="Language code (ISO 639-3)",
        examples=["eng"],
    )
    prompt: Optional[TTSPrompt] = None
    output: Optional[Output] = None
    seed: Optional[int] = None
266
+
267
+
268
+ # --- timestamp captioning helpers (module-level, shared by SRT/VTT) ---
269
+
270
+ _SENTENCE_TERMINATORS = (".", "?", "!", "。", "?", "!")
271
+ _MAX_CAPTION_SECONDS = 7.0
272
+ _MAX_CAPTION_CHARS = 42
273
+
274
+
275
+ def _segments_for_captioning(words, characters):
276
+ """Pick which segment list to use, returning (segments, word_mode) tuple.
277
+
278
+ - words with >= 2 entries -> words (word_mode=True: join parts with space)
279
+ - else if characters with >= 1 entry -> characters (word_mode=False: concat directly)
280
+ - single-entry words with no characters -> words (word_mode=True, one cue)
281
+ - else -> ValueError
282
+
283
+ Raises ValueError if more than 50% of segments have empty text (defense-in-depth:
284
+ the server contract should never produce majority-empty alignment arrays).
285
+ """
286
+ if words and len(words) >= 2:
287
+ segs = words
288
+ elif characters and len(characters) >= 1:
289
+ return characters, False
290
+ elif words and len(words) == 1 and not characters:
291
+ segs = words # English single-cue is still valid
292
+ else:
293
+ raise ValueError("no alignment segments to caption from")
294
+
295
+ empty_count = sum(1 for s in segs if not s.text.strip())
296
+ if empty_count > len(segs) / 2:
297
+ raise ValueError("alignment segments contain empty text")
298
+ return segs, True
299
+
300
+
301
def _group_into_cues(
    segments,
    word_mode: bool = False,
    max_seconds: float = _MAX_CAPTION_SECONDS,
    max_chars: int = _MAX_CAPTION_CHARS,
):
    """Fold alignment segments into caption cues.

    A cue is closed in two situations:
    - BEFORE a segment is added, when adding it would make the cue span more
      than ``max_seconds`` or hold more than ``max_chars`` characters (hard
      cap). The defaults follow BBC/Netflix subtitle guidelines (7.0s / 42 chars).
    - AFTER a segment is added, when that segment's text ends with a sentence
      terminator.

    ``word_mode=True`` joins the parts of a cue with a single space;
    ``word_mode=False`` concatenates them directly. Cues whose joined text is
    empty after stripping are dropped.

    Returns a list of ``(text, start, end)`` tuples.
    """
    glue = " " if word_mode else ""
    finished = []

    def _emit(parts, begin, finish):
        # Whitespace-only cues are never recorded.
        caption = glue.join(parts).strip()
        if caption:
            finished.append((caption, begin, finish))

    pending = []       # text parts of the cue being assembled
    cue_begin = None   # start time of the first segment in `pending`
    prev_finish = None # end time of the most recently consumed segment

    for segment in segments:
        if pending and cue_begin is not None and prev_finish is not None:
            candidate = glue.join([*pending, segment.text]).strip()
            over_time = (segment.end - cue_begin) > max_seconds
            over_width = len(candidate) > max_chars
            if over_time or over_width:
                # Close the cue at the previous segment; this one opens a new cue.
                _emit(pending, cue_begin, prev_finish)
                pending = []
                cue_begin = None

        if cue_begin is None:
            cue_begin = segment.start
        pending.append(segment.text)
        prev_finish = segment.end

        if segment.text.rstrip().endswith(_SENTENCE_TERMINATORS):
            _emit(pending, cue_begin, segment.end)
            pending = []
            cue_begin = None

    # Flush whatever is left once the input is exhausted.
    if pending and prev_finish is not None:
        _emit(pending, cue_begin, prev_finish)
    return finished
360
+
361
+
362
+ def _format_srt_time(seconds: float) -> str:
363
+ total_ms = int(round(seconds * 1000))
364
+ hh, rem = divmod(total_ms, 3600 * 1000)
365
+ mm, rem = divmod(rem, 60 * 1000)
366
+ ss, ms = divmod(rem, 1000)
367
+ return f"{hh:02d}:{mm:02d}:{ss:02d},{ms:03d}"
368
+
369
+
370
+ def _format_vtt_time(seconds: float) -> str:
371
+ """Format time for WebVTT format: HH:MM:SS.mmm (dot decimal, not comma)."""
372
+ total_ms = int(round(seconds * 1000))
373
+ hh, rem = divmod(total_ms, 3600 * 1000)
374
+ mm, rem = divmod(rem, 60 * 1000)
375
+ ss, ms = divmod(rem, 1000)
376
+ return f"{hh:02d}:{mm:02d}:{ss:02d}.{ms:03d}"
377
+
378
+
379
class TTSWithTimestampsResponse(BaseModel):
    """Response payload for `POST /v1/text-to-speech/with-timestamps`.

    Contains base64-encoded audio plus optional word/character alignment arrays.
    Helpers: `audio_bytes` (decoded audio), `save_audio()` (write it to disk),
    and `to_srt()` / `to_vtt()` (render the alignment as captions).
    """

    audio: str = Field(description="Base64-encoded audio bytes.")
    audio_format: Literal["wav", "mp3"] = Field(description="Audio encoding format.")
    audio_duration: float = Field(description="Length of audio in seconds.")
    words: Optional[list[AlignmentSegmentWord]] = Field(
        default=None,
        description="Word-level timestamps; null when granularity=char.",
    )
    characters: Optional[list[AlignmentSegmentCharacter]] = Field(
        default=None,
        description="Character-level timestamps; null when granularity=word.",
    )

    @property
    def audio_bytes(self) -> bytes:
        """Return decoded audio bytes from the base64 `audio` field.

        Decoding is strict (``validate=True``): characters outside the base64
        alphabet raise ``binascii.Error`` instead of being skipped.
        """
        import base64

        return base64.b64decode(self.audio, validate=True)

    def save_audio(self, path: str) -> None:
        """Write decoded audio bytes to `path` (overwriting any existing file)."""
        with open(path, "wb") as f:
            f.write(self.audio_bytes)

    def _caption_cues(self, max_seconds: float, max_chars: int):
        """Build the ``(text, start, end)`` cues shared by `to_srt()` and `to_vtt()`.

        Raises:
            ValueError: there are no usable alignment segments, or grouping
                them produced no non-empty cue.
        """
        segments, word_mode = _segments_for_captioning(self.words, self.characters)
        cues = _group_into_cues(
            segments,
            word_mode=word_mode,
            max_seconds=max_seconds,
            max_chars=max_chars,
        )
        if not cues:
            raise ValueError("no alignment segments to caption from")
        return cues

    def to_srt(
        self,
        max_seconds: float = _MAX_CAPTION_SECONDS,
        max_chars: int = _MAX_CAPTION_CHARS,
    ) -> str:
        """Return SRT-formatted caption string for this TTS response.

        Uses word-level segments when words has >= 2 entries; falls back to
        character-level segments otherwise (e.g. jpn/zho collapsed words).
        Cues are split on the sentence terminators in `_SENTENCE_TERMINATORS`
        or when a cue would exceed max_seconds or max_chars. Default values
        follow BBC/Netflix subtitle guidelines (7.0s / 42 chars).

        Raises:
            ValueError: no alignment segments are available to caption from.
        """
        lines = []
        cues = self._caption_cues(max_seconds, max_chars)
        for idx, (text, start, end) in enumerate(cues, start=1):
            lines.append(str(idx))
            lines.append(f"{_format_srt_time(start)} --> {_format_srt_time(end)}")
            lines.append(text)
            lines.append("")  # blank line terminates the cue
        return "\n".join(lines) + "\n"

    def to_vtt(
        self,
        max_seconds: float = _MAX_CAPTION_SECONDS,
        max_chars: int = _MAX_CAPTION_CHARS,
    ) -> str:
        """Return WebVTT-formatted caption string for this TTS response.

        Uses word-level segments when words has >= 2 entries; falls back to
        character-level segments otherwise (e.g. jpn/zho collapsed words).
        Cues are split on the sentence terminators in `_SENTENCE_TERMINATORS`
        or when a cue would exceed max_seconds or max_chars. Default values
        follow BBC/Netflix subtitle guidelines (7.0s / 42 chars).

        Raises:
            ValueError: no alignment segments are available to caption from.
        """
        lines = ["WEBVTT", ""]  # required header followed by a blank line
        for text, start, end in self._caption_cues(max_seconds, max_chars):
            lines.append(f"{_format_vtt_time(start)} --> {_format_vtt_time(end)}")
            lines.append(text)
            lines.append("")  # blank line terminates the cue
        return "\n".join(lines) + "\n"
typecast/models/voices.py CHANGED
@@ -1,7 +1,7 @@
1
1
  from enum import Enum
2
2
  from typing import Optional
3
3
 
4
- from pydantic import BaseModel
4
+ from pydantic import BaseModel, Field
5
5
 
6
6
  from .tts import TTSModel
7
7
 
@@ -75,3 +75,18 @@ class VoicesV2Filter(BaseModel):
75
75
  gender: Optional[GenderEnum] = None
76
76
  age: Optional[AgeEnum] = None
77
77
  use_cases: Optional[UseCaseEnum] = None
78
+
79
+
80
class CustomVoice(BaseModel):
    """A quick-cloned custom voice, as returned by `POST /v1/voices/clone`.

    Attributes:
        voice_id: Identifier of the custom voice; carries the `uc_` prefix.
            Pass it as `voice_id` to `text_to_speech` /
            `text_to_speech_with_timestamps`.
        name: Human-readable name (1-30 chars).
        model: Engine model the voice was cloned for (`ssfm-v21` or `ssfm-v30`).
    """

    # All three fields are required: none of them declares a default.
    voice_id: str = Field(description="Custom voice identifier (uc_ prefix)")
    name: str = Field(description="Human-readable voice name")
    model: str = Field(description="Engine model: ssfm-v21 or ssfm-v30")
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: typecast-python
3
- Version: 0.2.2
3
+ Version: 0.3.1
4
4
  Summary: Official Typecast Python SDK - Convert text to lifelike speech using AI-powered voices
5
5
  Project-URL: Homepage, https://typecast.ai
6
6
  Project-URL: Documentation, https://typecast.ai/docs/overview
@@ -269,6 +269,7 @@ Convert text to lifelike speech using AI-powered voices
269
269
  - [Voice Discovery](#voice-discovery)
270
270
  - [Emotion Control](#emotion-control)
271
271
  - [Async Client](#async-client)
272
+ - [Timestamp TTS](#timestamp-tts)
272
273
  - [Supported Languages](#supported-languages)
273
274
  - [Error Handling](#error-handling)
274
275
  - [License](#license)
@@ -465,6 +466,91 @@ async def main():
465
466
  asyncio.run(main())
466
467
  ```
467
468
 
469
+ ### Timestamp TTS
470
+
471
+ Use `text_to_speech_with_timestamps()` to receive base64 audio plus
472
+ word/character-level timestamps aligned with the synthesized speech. The
473
+ result object exposes `save_audio()`, `to_srt()`, and `to_vtt()` helpers
474
+ so you can finish the typical "audio + subtitles" flow in one line.
475
+
476
+ ```python
477
+ from typecast import Typecast
478
+ from typecast.models import TTSRequestWithTimestamps
479
+
480
+ client = Typecast(api_key="YOUR_API_KEY")
481
+ resp = client.text_to_speech_with_timestamps(
482
+ TTSRequestWithTimestamps(
483
+ voice_id="tc_60e5426de8b95f1d3000d7b5",
484
+ text="Hello. How are you?",
485
+ model="ssfm-v30",
486
+ language="eng",
487
+ ),
488
+ )
489
+ resp.save_audio("hello.wav")
490
+ print(resp.to_srt()) # SRT subtitles
491
+ print(resp.to_vtt()) # WebVTT subtitles
492
+ ```
493
+
494
+ Caption splits follow BBC/Netflix subtitle guidelines: 7s/42-char cue maximums.
495
+
496
+ ```python
497
+ # Real-time karaoke / highlight: iterate the words array directly.
498
+ for w in resp.words or []:
499
+ print(f"[{w.start:.2f}s - {w.end:.2f}s] {w.text}")
500
+ ```
501
+
502
+ Pass `granularity="word"` or `granularity="char"` to receive only one of
503
+ the two alignment arrays. For non-whitespace languages (Japanese,
504
+ Chinese), pair with `granularity="char"` — word-level alignment will
505
+ collapse the entire sentence into a single segment.
506
+
507
+ ### Instant cloning
508
+
509
+ Clone a custom voice from a short audio sample (≤ 25 MB), then use it just like any built-in voice. The cloned voice ID has a `uc_` prefix and works with `text_to_speech` directly.
510
+
511
+ ```python
512
+ from typecast import Typecast
513
+ from typecast.models import TTSRequest
514
+
515
+ client = Typecast(api_key="YOUR_API_KEY")
516
+
517
+ # 1) Clone
518
+ voice = client.clone_voice(
519
+ audio="path/to/sample.wav", # str path | Path | bytes | file object
520
+ name="my-voice", # 1-30 chars
521
+ model="ssfm-v30", # or "ssfm-v21"
522
+ )
523
+ print(voice.voice_id) # uc_64a1b2...
524
+
525
+ # 2) Synthesize with the cloned voice
526
+ audio = client.text_to_speech(TTSRequest(
527
+ text="Hello from my cloned voice!",
528
+ voice_id=voice.voice_id,
529
+ model="ssfm-v30",
530
+ ))
531
+ with open("output.wav", "wb") as f:
532
+ f.write(audio.audio_data)
533
+
534
+ # 3) Delete when done
535
+ client.delete_voice(voice.voice_id)
536
+ ```
537
+
538
+ **Limits**
539
+
540
+ - Audio file: max 25 MB. Supported formats: WAV, MP3.
541
+ - Voice name: 1–30 characters.
542
+ - Model: `ssfm-v21` or `ssfm-v30`.
543
+
544
+ **Async usage** is identical via `AsyncTypecast`:
545
+
546
+ ```python
547
+ import asyncio
+
+ from typecast import AsyncTypecast
+
+ async def main():
+     async with AsyncTypecast(api_key="YOUR_API_KEY") as client:
+         voice = await client.clone_voice(audio="sample.wav", name="my-voice", model="ssfm-v30")
+         await client.delete_voice(voice.voice_id)
+
+ asyncio.run(main())
552
+ ```
553
+
468
554
  ---
469
555
 
470
556
  ## Supported Languages
@@ -0,0 +1,16 @@
1
+ typecast/__init__.py,sha256=3pdJqNkXCZ7svzqab4sBR_qwyoM5E2sPjfuci1g1Ub8,1047
2
+ typecast/_voice_clone.py,sha256=TN2tbB3b5lC5uFCBnbERhz37bNJlbv6VZ-vj70EfTs4,3464
3
+ typecast/async_client.py,sha256=8mab1ai_P1TdQilR1l29n5Cg40F7lN960shODemXXWs,17287
4
+ typecast/client.py,sha256=PRFF7hj8Ih88dRl0ZiLcRRD8N4ofPPbYMZAKpk_b89w,15212
5
+ typecast/conf.py,sha256=Fn_T4XW7BaHRnj0tP11BT5at3Y-db7oGcbBA_E1fmF0,479
6
+ typecast/exceptions.py,sha256=Y0ZzYebe8zOSOSAHbXfKR0G_RJgdmZXxi15Z7ZxPLIk,1568
7
+ typecast/utils.py,sha256=XuNuX7gW8_CGKqZ-cv_tKlPVMPBluAYJBw2clwmjIMI,708
8
+ typecast/models/__init__.py,sha256=UEPUjg86fpCMXUvAzcNgxSPPhuPwCY9aQbzK3w90Fj0,1203
9
+ typecast/models/error.py,sha256=XomIjx7jvlCjItqzJuCAT4mXC9jwTjxR8lLDUk6P8KA,152
10
+ typecast/models/subscription.py,sha256=EIaAAo3cCRw8LYT_O6D9AVwxqIHrWCijzl4UTx7FZB8,894
11
+ typecast/models/tts.py,sha256=IDz4IN2d4MCkATB_rVlaEoMS7EgWlQsW0tIP8k67LHo,16001
12
+ typecast/models/voices.py,sha256=-EXP35jDy7_G30k5bDnVrFJHp6svEDTA5jJ8oHAgXNQ,2310
13
+ typecast_python-0.3.1.dist-info/METADATA,sha256=pBQbgmi4_SX07Dx5RKO-YPBXMDAPyZwcQXDO3JWoQV4,25529
14
+ typecast_python-0.3.1.dist-info/WHEEL,sha256=QccIxa26bgl1E6uMy58deGWi-0aeIkkangHcxk2kWfw,87
15
+ typecast_python-0.3.1.dist-info/licenses/LICENSE,sha256=HvtJ-S89uUkuYmt-OvVk4MRxmzwtbn84__qJtSrGU2Q,11348
16
+ typecast_python-0.3.1.dist-info/RECORD,,
@@ -1,15 +0,0 @@
1
- typecast/__init__.py,sha256=3pdJqNkXCZ7svzqab4sBR_qwyoM5E2sPjfuci1g1Ub8,1047
2
- typecast/async_client.py,sha256=NJY2CWuCD3CVcqHuNwezIk39X8YTnkpQeJGdfhv6bqw,12341
3
- typecast/client.py,sha256=HzCSGDglsCz880x8lXesHFXQIcivc7KXImWiXehWvQc,10514
4
- typecast/conf.py,sha256=Fn_T4XW7BaHRnj0tP11BT5at3Y-db7oGcbBA_E1fmF0,479
5
- typecast/exceptions.py,sha256=Y0ZzYebe8zOSOSAHbXfKR0G_RJgdmZXxi15Z7ZxPLIk,1568
6
- typecast/utils.py,sha256=XuNuX7gW8_CGKqZ-cv_tKlPVMPBluAYJBw2clwmjIMI,708
7
- typecast/models/__init__.py,sha256=uvDWzZPo01mjHreUtlHBMYiuZ84Tb02LzU3ZpszYg1I,923
8
- typecast/models/error.py,sha256=XomIjx7jvlCjItqzJuCAT4mXC9jwTjxR8lLDUk6P8KA,152
9
- typecast/models/subscription.py,sha256=EIaAAo3cCRw8LYT_O6D9AVwxqIHrWCijzl4UTx7FZB8,894
10
- typecast/models/tts.py,sha256=clwrqky7CtcKjGtLY1h1hnyn2MXXJjXEaZ8iVZXK85c,6739
11
- typecast/models/voices.py,sha256=VwmK_Ts17QccAaClGYqqstIGJ5RT9Qgj2kxqEoDr6z4,1659
12
- typecast_python-0.2.2.dist-info/METADATA,sha256=2c0Er6ggl9ZnPbr9iyFUAIBLvZdsxH9kHDIwRIEdTOU,22881
13
- typecast_python-0.2.2.dist-info/WHEEL,sha256=QccIxa26bgl1E6uMy58deGWi-0aeIkkangHcxk2kWfw,87
14
- typecast_python-0.2.2.dist-info/licenses/LICENSE,sha256=HvtJ-S89uUkuYmt-OvVk4MRxmzwtbn84__qJtSrGU2Q,11348
15
- typecast_python-0.2.2.dist-info/RECORD,,