lattifai 1.2.2__py3-none-any.whl → 1.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lattifai/_init.py +20 -0
- lattifai/alignment/__init__.py +2 -3
- lattifai/alignment/lattice1_aligner.py +117 -4
- lattifai/alignment/lattice1_worker.py +47 -4
- lattifai/alignment/segmenter.py +3 -2
- lattifai/alignment/text_align.py +2 -1
- lattifai/alignment/tokenizer.py +56 -29
- lattifai/audio2.py +162 -183
- lattifai/cli/alignment.py +5 -0
- lattifai/cli/caption.py +6 -6
- lattifai/cli/transcribe.py +1 -5
- lattifai/cli/youtube.py +3 -0
- lattifai/client.py +41 -12
- lattifai/config/__init__.py +21 -3
- lattifai/config/alignment.py +7 -0
- lattifai/config/caption.py +13 -243
- lattifai/config/client.py +16 -0
- lattifai/config/event.py +102 -0
- lattifai/config/transcription.py +25 -1
- lattifai/data/__init__.py +8 -0
- lattifai/data/caption.py +228 -0
- lattifai/errors.py +78 -53
- lattifai/event/__init__.py +65 -0
- lattifai/event/lattifai.py +166 -0
- lattifai/mixin.py +22 -17
- lattifai/transcription/base.py +2 -1
- lattifai/transcription/gemini.py +147 -16
- lattifai/transcription/lattifai.py +8 -11
- lattifai/types.py +1 -1
- lattifai/youtube/client.py +143 -48
- {lattifai-1.2.2.dist-info → lattifai-1.3.0.dist-info}/METADATA +117 -54
- lattifai-1.3.0.dist-info/RECORD +57 -0
- lattifai/__init__.py +0 -88
- lattifai/alignment/sentence_splitter.py +0 -350
- lattifai/caption/__init__.py +0 -96
- lattifai/caption/caption.py +0 -661
- lattifai/caption/formats/__init__.py +0 -199
- lattifai/caption/formats/base.py +0 -211
- lattifai/caption/formats/gemini.py +0 -722
- lattifai/caption/formats/json.py +0 -194
- lattifai/caption/formats/lrc.py +0 -309
- lattifai/caption/formats/nle/__init__.py +0 -9
- lattifai/caption/formats/nle/audition.py +0 -561
- lattifai/caption/formats/nle/avid.py +0 -423
- lattifai/caption/formats/nle/fcpxml.py +0 -549
- lattifai/caption/formats/nle/premiere.py +0 -589
- lattifai/caption/formats/pysubs2.py +0 -642
- lattifai/caption/formats/sbv.py +0 -147
- lattifai/caption/formats/tabular.py +0 -338
- lattifai/caption/formats/textgrid.py +0 -193
- lattifai/caption/formats/ttml.py +0 -652
- lattifai/caption/formats/vtt.py +0 -469
- lattifai/caption/parsers/__init__.py +0 -9
- lattifai/caption/parsers/text_parser.py +0 -147
- lattifai/caption/standardize.py +0 -636
- lattifai/caption/supervision.py +0 -34
- lattifai/caption/utils.py +0 -474
- lattifai-1.2.2.dist-info/RECORD +0 -76
- {lattifai-1.2.2.dist-info → lattifai-1.3.0.dist-info}/WHEEL +0 -0
- {lattifai-1.2.2.dist-info → lattifai-1.3.0.dist-info}/entry_points.txt +0 -0
- {lattifai-1.2.2.dist-info → lattifai-1.3.0.dist-info}/licenses/LICENSE +0 -0
- {lattifai-1.2.2.dist-info → lattifai-1.3.0.dist-info}/top_level.txt +0 -0
lattifai/caption/formats/ttml.py
DELETED
|
@@ -1,652 +0,0 @@
|
|
|
1
|
-
"""TTML/IMSC1/EBU-TT-D format handler.
|
|
2
|
-
|
|
3
|
-
TTML (Timed Text Markup Language) is a W3C standard used by:
|
|
4
|
-
- Netflix (IMSC1 profile)
|
|
5
|
-
- European broadcasters (EBU-TT-D profile)
|
|
6
|
-
- IMF workflows
|
|
7
|
-
- Apple Music (iTunes timing)
|
|
8
|
-
"""
|
|
9
|
-
|
|
10
|
-
import xml.etree.ElementTree as ET
|
|
11
|
-
from dataclasses import dataclass, field
|
|
12
|
-
from pathlib import Path
|
|
13
|
-
from typing import Dict, List, Optional, Union
|
|
14
|
-
from xml.dom import minidom
|
|
15
|
-
|
|
16
|
-
from lhotse.supervision import AlignmentItem
|
|
17
|
-
from lhotse.utils import Pathlike
|
|
18
|
-
|
|
19
|
-
from ...config.caption import KaraokeConfig
|
|
20
|
-
from ..supervision import Supervision
|
|
21
|
-
from . import register_format
|
|
22
|
-
from .base import FormatHandler
|
|
23
|
-
|
|
24
|
-
# XML namespace URIs used when reading and writing TTML documents.
TTML_NS = "http://www.w3.org/ns/ttml"  # elements: tt, head, styling, layout, body, div, p, span
TTML_STYLE_NS = "http://www.w3.org/ns/ttml#styling"  # registered with the "tts" prefix
TTML_PARAM_NS = "http://www.w3.org/ns/ttml#parameter"  # registered with the "ttp" prefix
XML_NS = "http://www.w3.org/XML/1998/namespace"  # xml:lang and xml:id attributes
ITUNES_NS = "http://music.apple.com/lyric-ttml-internal"  # itunes:timing attribute (karaoke output)
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
@dataclass
class TTMLStyle:
    """Text style configuration for TTML.

    The fields are written as styling-namespace attributes of a ``<style>``
    element by ``TTMLFormatBase._create_style_element``.
    """

    font_family: str = "proportionalSansSerif"  # written as fontFamily
    font_size: str = "100%"  # written as fontSize
    color: str = "#FFFFFF"  # written as color
    background_color: Optional[str] = "#000000C0"  # written as backgroundColor; omitted when falsy
    text_align: str = "center"  # written as textAlign
    display_align: str = "after"  # written as displayAlign
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
@dataclass
class TTMLRegion:
    """Region definition for TTML positioning.

    Serialized as a ``<region>`` element by
    ``TTMLFormatBase._create_region_element``.
    """

    id: str = "bottom"  # written as xml:id and referenced from each <p region="...">
    origin: str = "10% 80%"  # written as the styling-namespace "origin" attribute
    extent: str = "80% 15%"  # written as the styling-namespace "extent" attribute
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
@dataclass
class TTMLConfig:
    """Configuration for TTML export."""

    profile: str = "imsc1"  # "imsc1", "ebu-tt-d", or "basic"
    default_style: TTMLStyle = field(default_factory=TTMLStyle)  # emitted with style id "default"
    default_region: TTMLRegion = field(default_factory=TTMLRegion)  # used when a speaker has no region
    speaker_regions: Dict[str, TTMLRegion] = field(default_factory=dict)  # keyed by supervision speaker
    speaker_styles: Dict[str, TTMLStyle] = field(default_factory=dict)  # keyed by supervision speaker
    language: str = "en"  # written to xml:lang on the root <tt> element
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
class TTMLFormatBase(FormatHandler):
|
|
66
|
-
"""Base TTML format handler (reader/writer)."""
|
|
67
|
-
|
|
68
|
-
@classmethod
def _parse_ttml_time(cls, time_str: str) -> float:
    """Parse a TTML time expression into seconds.

    Supported forms:
    - Clock time: ``HH:MM:SS``, ``HH:MM:SS.mmm`` or ``HH:MM:SS:FF`` (frames)
    - Offset time: ``2h``, ``3m``, ``10s``, ``10.5s``, ``500ms``, ``100f``

    Frame counts are converted assuming 30 fps, because the document's
    frame rate is not available to this helper. This is imprecise but a
    usable fallback.

    Args:
        time_str: Raw attribute value, e.g. from ``begin``/``end``/``dur``.

    Returns:
        The time in seconds, or ``0.0`` when the value is empty or cannot
        be parsed.
    """
    if not time_str:
        return 0.0

    time_str = time_str.strip()
    assumed_fps = 30.0

    try:
        # Offset time: "<number><metric>". "ms" is tested first because it
        # also ends with "s".
        if time_str.endswith("ms"):
            return float(time_str[:-2]) / 1000.0
        if time_str.endswith("h"):
            return float(time_str[:-1]) * 3600.0
        if time_str.endswith("m"):
            return float(time_str[:-1]) * 60.0
        if time_str.endswith("s"):
            return float(time_str[:-1])
        if time_str.endswith("f"):
            return float(time_str[:-1]) / assumed_fps

        # Clock time: HH:MM:SS[.fraction] or HH:MM:SS:FF
        parts = time_str.split(":")
        if len(parts) >= 3:
            hours = float(parts[0])
            minutes = float(parts[1])
            seconds = float(parts[2])
            # A fourth field is a frame count, unless the seconds field
            # already carries a fraction.
            if len(parts) == 4 and "." not in parts[2]:
                seconds += float(parts[3]) / assumed_fps
            return hours * 3600 + minutes * 60 + seconds

        # Fallback: a bare number is taken as seconds.
        return float(time_str)
    except ValueError:
        # Malformed values are treated like missing ones instead of
        # aborting the whole import.
        return 0.0
|
|
119
|
-
|
|
120
|
-
@classmethod
def extract_metadata(cls, source: Union[Pathlike, str], **kwargs) -> Dict:
    """Collect document-level TTML settings from the root element.

    Args:
        source: Either a path to a TTML file or the TTML text itself
            (decided by ``cls.is_content``).

    Returns:
        A dict that may contain:
        - ttml_language: value of xml:lang (or a plain ``lang`` attribute)
        - ttml_profile: "imsc1", "ebu-tt-d" or "basic", derived from the
          profile attribute
        - ttml_timing: the iTunes timing attribute, when present
        The dict is empty when the file cannot be read or the XML cannot
        be parsed.
    """
    treat_as_path = isinstance(source, (str, Path)) and not cls.is_content(source)
    if treat_as_path:
        try:
            content = Path(source).read_text(encoding="utf-8")
        except Exception:
            # Best effort: unreadable input simply yields no metadata.
            return {}
    else:
        content = str(source)

    # Namespaces are kept intact here so qualified attributes can be matched.
    try:
        root = ET.fromstring(content)
    except ET.ParseError:
        return {}

    found = {}

    language = root.get(f"{{{XML_NS}}}lang") or root.get("lang")
    if language:
        found["ttml_language"] = language

    declared_profile = root.get(f"{{{TTML_PARAM_NS}}}profile") or root.get("profile")
    if declared_profile:
        lowered = declared_profile.lower()
        if "imsc1" in lowered:
            found["ttml_profile"] = "imsc1"
        elif "ebu" in lowered:
            found["ttml_profile"] = "ebu-tt-d"
        else:
            found["ttml_profile"] = "basic"

    itunes_timing = root.get(f"{{{ITUNES_NS}}}timing")
    if itunes_timing:
        found["ttml_timing"] = itunes_timing

    return found
|
|
169
|
-
|
|
170
|
-
@classmethod
def read(
    cls,
    source: Union[Pathlike, str],
    normalize_text: bool = True,
    **kwargs,
) -> List[Supervision]:
    """Read TTML content and return supervisions.

    Elements and the speaker attribute are matched by local name, so the
    document may use a default namespace, a prefixed one (``tt:p``) or
    none at all.

    Args:
        source: Path to a TTML file, or the TTML text itself (decided by
            ``cls.is_content``).
        normalize_text: Accepted for interface compatibility; not used by
            this reader.

    Returns:
        One supervision per ``<p>`` that has a ``begin`` attribute and
        non-empty text, sorted by start time. Timed ``<span>`` children
        are exposed as word-level alignment. An empty list is returned
        when the XML cannot be parsed or has no ``<body>``.
    """
    if isinstance(source, (str, Path)) and not cls.is_content(source):
        with open(source, "r", encoding="utf-8") as f:
            content = f.read()
    else:
        content = str(source)

    def local_name(qualified) -> str:
        """Return a tag/attribute name without its ``{namespace}`` part."""
        return qualified.rpartition("}")[2] if isinstance(qualified, str) else ""

    def attr_by_local_name(elem, name):
        """Return attribute *name*, preferring the un-namespaced form."""
        plain = elem.get(name)
        if plain:
            return plain
        for key, value in elem.attrib.items():
            if value and local_name(key) == name:
                return value
        return None

    try:
        root = ET.fromstring(content)
    except ET.ParseError:
        # Legacy fallback for documents that use tts:/ttp: prefixes without
        # declaring them: drop the declarations and prefixes, then retry.
        import re

        stripped = re.sub(r' xmlns="[^"]+"', "", content, count=1)
        stripped = re.sub(r' xmlns:t?ts="[^"]+"', "", stripped)
        stripped = re.sub(r' xmlns:t?tp="[^"]+"', "", stripped)
        stripped = re.sub(r" (t?ts|t?tp):", " ", stripped)
        try:
            root = ET.fromstring(stripped)
        except ET.ParseError:
            return []

    # Explicit search instead of "find(a) or find(b)": an Element without
    # children is falsy, so truth-testing it is unreliable.
    body = next((child for child in root if local_name(child.tag) == "body"), None)
    if body is None:
        return []

    supervisions = []

    # TTML structure can be nested (div -> div -> p), so walk the whole body.
    for p in body.iter():
        if local_name(p.tag) != "p":
            continue

        begin_str = p.get("begin")
        end_str = p.get("end")
        dur_str = p.get("dur")

        if not begin_str:
            continue

        start = cls._parse_ttml_time(begin_str)

        if end_str:
            duration = cls._parse_ttml_time(end_str) - start
        elif dur_str:
            duration = cls._parse_ttml_time(dur_str)
        else:
            duration = 0.0

        alignment = None
        text_parts = []
        word_items = []

        # Text directly in p
        if p.text and p.text.strip():
            text_parts.append(p.text.strip())

        for child in p:
            if local_name(child.tag) == "span":
                span_text = child.text.strip() if child.text else ""
                # Empty spans contribute neither text nor a word item.
                if span_text:
                    span_begin = child.get("begin")
                    span_end = child.get("end")
                    span_dur = child.get("dur")

                    if span_begin and (span_end or span_dur):
                        # Timed span (word-level or phrase-level). Times are
                        # taken as absolute, not relative to the parent p.
                        s_start = cls._parse_ttml_time(span_begin)
                        if span_end:
                            s_dur = cls._parse_ttml_time(span_end) - s_start
                        else:
                            s_dur = cls._parse_ttml_time(span_dur)
                        word_items.append(AlignmentItem(symbol=span_text, start=s_start, duration=s_dur))

                    text_parts.append(span_text)

            # Text following the child element still belongs to the paragraph.
            if child.tail and child.tail.strip():
                text_parts.append(child.tail.strip())

        full_text = " ".join(text_parts).strip()

        if word_items:
            alignment = {"word": word_items}
            # Fall back to word timing when the p timing is missing/zero.
            if duration <= 0:
                start = word_items[0].start
                end = word_items[-1].start + word_items[-1].duration
                duration = end - start

        if full_text:
            supervisions.append(
                Supervision(
                    id=p.get("id", ""),
                    recording_id="ttml_import",
                    start=start,
                    duration=duration,
                    text=full_text,
                    alignment=alignment,
                    # The agent attribute may be plain or in any namespace
                    # (e.g. ttm:agent).
                    speaker=attr_by_local_name(p, "agent"),
                )
            )

    return sorted(supervisions, key=lambda s: s.start)
|
|
296
|
-
|
|
297
|
-
@classmethod
def _seconds_to_ttml_time(cls, seconds: float) -> str:
    """Convert seconds to TTML clock time (``HH:MM:SS.mmm``).

    Negative inputs are clamped to zero.

    Args:
        seconds: Time offset in seconds.

    Returns:
        The formatted clock-time string with millisecond precision.
    """
    # Round to whole milliseconds before splitting into fields. Formatting
    # the seconds field on its own would turn 59.9996 into "60.000" and
    # produce an invalid clock time such as 00:00:60.000.
    total_ms = max(0, int(round(seconds * 1000)))
    total_secs, millis = divmod(total_ms, 1000)
    total_mins, secs = divmod(total_secs, 60)
    hours, minutes = divmod(total_mins, 60)
    return f"{hours:02d}:{minutes:02d}:{secs:02d}.{millis:03d}"
|
|
306
|
-
|
|
307
|
-
@classmethod
def _create_style_element(cls, parent: ET.Element, style_id: str, style: TTMLStyle) -> ET.Element:
    """Append a ``<style>`` child describing *style* to *parent*.

    Args:
        parent: Element that receives the new child.
        style_id: Value for the element's xml:id.
        style: Style values to serialize.

    Returns:
        The newly created element.
    """
    element = ET.SubElement(parent, f"{{{TTML_NS}}}style")
    element.set(f"{{{XML_NS}}}id", style_id)

    # Attribute order is kept stable because it is the serialization order.
    styling_attrs = [
        ("fontFamily", style.font_family),
        ("fontSize", style.font_size),
        ("color", style.color),
        ("textAlign", style.text_align),
        ("displayAlign", style.display_align),
    ]
    # The background is optional and left out entirely when unset.
    if style.background_color:
        styling_attrs.append(("backgroundColor", style.background_color))

    for attr_name, attr_value in styling_attrs:
        element.set(f"{{{TTML_STYLE_NS}}}{attr_name}", attr_value)

    return element
|
|
320
|
-
|
|
321
|
-
@classmethod
def _create_region_element(cls, parent: ET.Element, region: TTMLRegion) -> ET.Element:
    """Append a ``<region>`` child describing *region* to *parent*.

    Args:
        parent: Element that receives the new child.
        region: Region whose id, origin and extent are serialized.

    Returns:
        The newly created element.
    """
    region_attrs = {
        f"{{{XML_NS}}}id": region.id,
        f"{{{TTML_STYLE_NS}}}origin": region.origin,
        f"{{{TTML_STYLE_NS}}}extent": region.extent,
    }
    return ET.SubElement(parent, f"{{{TTML_NS}}}region", attrib=region_attrs)
|
|
329
|
-
|
|
330
|
-
@classmethod
def _build_ttml(
    cls,
    supervisions: List[Supervision],
    config: TTMLConfig,
    include_speaker: bool = True,
    word_level: bool = False,
    karaoke_config: Optional[KaraokeConfig] = None,
) -> ET.Element:
    """Build the TTML document tree.

    Args:
        supervisions: List of supervisions to convert
        config: TTML configuration
        include_speaker: Whether to include speaker names
        word_level: Whether to output word-level timing
        karaoke_config: Karaoke configuration. When provided with enabled=True,
            use span-based karaoke; otherwise use p-per-word

    Returns:
        The root ``<tt>`` element with ``<head>`` (styling and layout) and
        ``<body>`` (one ``<p>`` per supervision).
    """
    from .base import expand_to_word_supervisions

    karaoke_enabled = karaoke_config is not None and karaoke_config.enabled
    use_karaoke = word_level and karaoke_enabled

    # Word-level output without karaoke: one paragraph per word.
    if word_level and not karaoke_enabled:
        supervisions = expand_to_word_supervisions(supervisions)

    # Prefix registration is global to ElementTree and controls the
    # prefixes used when the tree is serialized.
    ET.register_namespace("", TTML_NS)
    ET.register_namespace("tts", TTML_STYLE_NS)
    ET.register_namespace("ttp", TTML_PARAM_NS)
    ET.register_namespace("xml", XML_NS)
    if use_karaoke:
        ET.register_namespace("itunes", ITUNES_NS)

    root = ET.Element(
        f"{{{TTML_NS}}}tt",
        attrib={
            f"{{{XML_NS}}}lang": config.language,
            f"{{{TTML_PARAM_NS}}}timeBase": "media",
        },
    )

    # The "basic" profile carries no profile attribute.
    if config.profile == "imsc1":
        root.set(f"{{{TTML_PARAM_NS}}}profile", "http://www.w3.org/ns/ttml/profile/imsc1/text")
    elif config.profile == "ebu-tt-d":
        root.set(f"{{{TTML_PARAM_NS}}}profile", "urn:ebu:tt:distribution:2014-01")

    if use_karaoke:
        root.set(f"{{{ITUNES_NS}}}timing", karaoke_config.ttml_timing_mode)

    # Head section: styles
    head = ET.SubElement(root, f"{{{TTML_NS}}}head")
    styling = ET.SubElement(head, f"{{{TTML_NS}}}styling")
    cls._create_style_element(styling, "default", config.default_style)

    # xml:id values must be unique, so each style id is emitted only once
    # even when two speaker names map to the same id.
    emitted_style_ids = {"default"}
    for speaker, style in config.speaker_styles.items():
        style_id = f"speaker_{speaker.replace(' ', '_')}"
        if style_id in emitted_style_ids:
            continue
        emitted_style_ids.add(style_id)
        cls._create_style_element(styling, style_id, style)

    # Head section: regions
    layout = ET.SubElement(head, f"{{{TTML_NS}}}layout")
    cls._create_region_element(layout, config.default_region)

    # A speaker region that keeps the default id (or shares an id with
    # another speaker) would otherwise produce duplicate xml:id values.
    emitted_region_ids = {config.default_region.id}
    for region in config.speaker_regions.values():
        if region.id in emitted_region_ids:
            continue
        emitted_region_ids.add(region.id)
        cls._create_region_element(layout, region)

    # Body section
    body = ET.SubElement(root, f"{{{TTML_NS}}}body")
    div = ET.SubElement(body, f"{{{TTML_NS}}}div")

    for sup in supervisions:
        # Karaoke spans are only possible when this supervision actually
        # carries word alignment.
        has_word_alignment = (
            use_karaoke
            and sup.alignment
            and "word" in sup.alignment
            and len(sup.alignment["word"]) > 0
        )

        # Use word timestamps for timing when available (more accurate)
        if has_word_alignment:
            word_items = sup.alignment["word"]
            begin = cls._seconds_to_ttml_time(word_items[0].start)
            end = cls._seconds_to_ttml_time(word_items[-1].end)
        else:
            begin = cls._seconds_to_ttml_time(sup.start)
            end = cls._seconds_to_ttml_time(sup.end)

        p = ET.SubElement(div, f"{{{TTML_NS}}}p")
        p.set("begin", begin)
        p.set("end", end)

        if sup.speaker and sup.speaker in config.speaker_regions:
            p.set("region", config.speaker_regions[sup.speaker].id)
        else:
            p.set("region", config.default_region.id)

        if sup.speaker and sup.speaker in config.speaker_styles:
            p.set("style", f"speaker_{sup.speaker.replace(' ', '_')}")
        else:
            p.set("style", "default")

        include_this_speaker = cls._should_include_speaker(sup, include_speaker)

        if has_word_alignment:
            # Karaoke mode: one timed span per word.
            last_index = len(word_items) - 1
            for i, item in enumerate(word_items):
                span = ET.SubElement(p, f"{{{TTML_NS}}}span")
                span.set("begin", cls._seconds_to_ttml_time(item.start))
                span.set("end", cls._seconds_to_ttml_time(item.start + item.duration))
                span.text = item.symbol
                # Add space between words (except after last word)
                if i < last_index:
                    span.tail = " "
        elif include_this_speaker and config.profile != "basic":
            # Speaker name in a bold span, followed by the caption text.
            span = ET.SubElement(p, f"{{{TTML_NS}}}span")
            span.set(f"{{{TTML_STYLE_NS}}}fontWeight", "bold")
            span.text = f"{sup.speaker} "
            span.tail = sup.text.strip() if sup.text else ""
        else:
            p.text = sup.text.strip() if sup.text else ""

    return root
|
|
459
|
-
|
|
460
|
-
@classmethod
def _prettify_xml(cls, element: ET.Element) -> str:
    """Serialize *element* as indented XML with blank lines removed.

    Args:
        element: Root of the tree to serialize.

    Returns:
        The pretty-printed document, including the XML declaration that
        minidom emits.
    """
    document = minidom.parseString(ET.tostring(element, encoding="unicode"))
    indented = document.toprettyxml(indent=" ")
    # Drop lines that contain only whitespace.
    return "\n".join(filter(str.strip, indented.split("\n")))
|
|
468
|
-
|
|
469
|
-
|
|
470
|
-
@register_format("ttml")
class TTMLFormat(TTMLFormatBase):
    """Standard TTML format."""

    extensions = [".ttml", ".xml"]
    description = "Timed Text Markup Language - W3C standard"

    @classmethod
    def write(
        cls,
        supervisions: List[Supervision],
        output_path,
        include_speaker: bool = True,
        config: Optional[TTMLConfig] = None,
        word_level: bool = False,
        karaoke_config: Optional[KaraokeConfig] = None,
        **kwargs,
    ) -> Path:
        """Write TTML format.

        Args:
            supervisions: List of supervisions to write
            output_path: Output file path. A suffix other than ``.ttml`` or
                ``.xml`` is replaced with ``.ttml``.
            include_speaker: Whether to include speaker names
            config: TTML configuration (defaults to ``TTMLConfig()``)
            word_level: Whether to output word-level timing
            karaoke_config: Karaoke configuration. When provided with enabled=True,
                use span-based karaoke; otherwise use p-per-word

        Returns:
            The path that was actually written.
        """
        if config is None:
            config = TTMLConfig()

        output_path = Path(output_path)
        if output_path.suffix.lower() not in [".ttml", ".xml"]:
            output_path = output_path.with_suffix(".ttml")

        root = cls._build_ttml(
            supervisions,
            config,
            include_speaker=include_speaker,
            word_level=word_level,
            karaoke_config=karaoke_config,
        )
        xml_content = cls._prettify_xml(root)

        output_path.write_text(xml_content, encoding="utf-8")
        return output_path

    @classmethod
    def to_bytes(
        cls,
        supervisions: List[Supervision],
        include_speaker: bool = True,
        config: Optional[TTMLConfig] = None,
        word_level: bool = False,
        karaoke_config: Optional[KaraokeConfig] = None,
        metadata: Optional[Dict] = None,
        **kwargs,
    ) -> bytes:
        """Convert to TTML format bytes.

        Args:
            supervisions: List of supervisions to convert
            include_speaker: Whether to include speaker names
            config: TTML configuration (defaults to ``TTMLConfig()``). The
                object passed in is never modified.
            word_level: Whether to output word-level timing
            karaoke_config: Karaoke configuration. When provided with enabled=True,
                use span-based karaoke; otherwise use p-per-word
            metadata: Optional metadata dict containing ttml_* keys to restore.
                ``ttml_language`` and ``ttml_profile`` override the
                corresponding config values for this call.

        Returns:
            The pretty-printed document encoded as UTF-8.
        """
        from dataclasses import replace

        if config is None:
            config = TTMLConfig()

        # Apply metadata to a copy so the caller's config object is not
        # silently changed for later calls.
        if metadata:
            overrides = {}
            if metadata.get("ttml_language"):
                overrides["language"] = metadata["ttml_language"]
            if metadata.get("ttml_profile"):
                overrides["profile"] = metadata["ttml_profile"]
            if overrides:
                config = replace(config, **overrides)

        root = cls._build_ttml(
            supervisions,
            config,
            include_speaker=include_speaker,
            word_level=word_level,
            karaoke_config=karaoke_config,
        )
        xml_content = cls._prettify_xml(root)
        return xml_content.encode("utf-8")

    @classmethod
    def write_imsc1(
        cls,
        supervisions: List[Supervision],
        output_path,
        language: str = "en",
        **kwargs,
    ) -> Path:
        """Convenience method to write IMSC1 format.

        Extra keyword arguments are forwarded to ``write``.
        """
        config = TTMLConfig(profile="imsc1", language=language)
        return cls.write(supervisions, output_path, config=config, **kwargs)

    @classmethod
    def write_ebu_tt_d(
        cls,
        supervisions: List[Supervision],
        output_path,
        language: str = "en",
        **kwargs,
    ) -> Path:
        """Convenience method to write EBU-TT-D format.

        Extra keyword arguments are forwarded to ``write``.
        """
        config = TTMLConfig(profile="ebu-tt-d", language=language)
        return cls.write(supervisions, output_path, config=config, **kwargs)
|
|
583
|
-
|
|
584
|
-
|
|
585
|
-
@register_format("imsc1")
class IMSC1Format(TTMLFormatBase):
    """IMSC1 format - Netflix/streaming profile."""

    extensions = [".ttml"]
    description = "IMSC1 - Netflix/streaming TTML profile"

    @classmethod
    def write(
        cls,
        supervisions: List[Supervision],
        output_path,
        include_speaker: bool = True,
        language: str = "en",
        **kwargs,
    ) -> Path:
        """Write IMSC1 format.

        Args:
            supervisions: List of supervisions to write
            output_path: Output file path
            include_speaker: Whether to include speaker names
            language: Language code for the document
            **kwargs: Forwarded to ``TTMLFormat.write``

        Returns:
            The path reported by ``TTMLFormat.write``.
        """
        config = TTMLConfig(profile="imsc1", language=language)
        return TTMLFormat.write(
            supervisions,
            output_path,
            include_speaker=include_speaker,
            config=config,
            **kwargs,
        )

    @classmethod
    def to_bytes(
        cls,
        supervisions: List[Supervision],
        include_speaker: bool = True,
        language: str = "en",
        **kwargs,
    ) -> bytes:
        """Convert to IMSC1 format bytes.

        Args:
            supervisions: List of supervisions to convert
            include_speaker: Whether to include speaker names
            language: Language code for the document
            **kwargs: Forwarded to ``TTMLFormat.to_bytes``

        Returns:
            The serialized document as UTF-8 bytes.
        """
        config = TTMLConfig(profile="imsc1", language=language)

        # TTMLFormat.to_bytes lets metadata["ttml_profile"] override the
        # config profile. This handler always emits IMSC1, so that key is
        # dropped from a copy of the metadata before forwarding.
        metadata = kwargs.get("metadata")
        if metadata and "ttml_profile" in metadata:
            kwargs["metadata"] = {key: value for key, value in metadata.items() if key != "ttml_profile"}

        return TTMLFormat.to_bytes(
            supervisions,
            include_speaker=include_speaker,
            config=config,
            **kwargs,
        )
|
|
616
|
-
|
|
617
|
-
|
|
618
|
-
@register_format("ebu_tt_d")
class EBUTD_Format(TTMLFormatBase):
    """EBU-TT-D format - European broadcast profile."""

    extensions = [".ttml"]
    description = "EBU-TT-D - European broadcast TTML profile"

    @classmethod
    def write(
        cls,
        supervisions: List[Supervision],
        output_path,
        include_speaker: bool = True,
        language: str = "en",
        **kwargs,
    ) -> Path:
        """Write EBU-TT-D format.

        Args:
            supervisions: List of supervisions to write
            output_path: Output file path
            include_speaker: Whether to include speaker names
            language: Language code for the document
            **kwargs: Forwarded to ``TTMLFormat.write``

        Returns:
            The path reported by ``TTMLFormat.write``.
        """
        config = TTMLConfig(profile="ebu-tt-d", language=language)
        return TTMLFormat.write(
            supervisions,
            output_path,
            include_speaker=include_speaker,
            config=config,
            **kwargs,
        )

    @classmethod
    def to_bytes(
        cls,
        supervisions: List[Supervision],
        include_speaker: bool = True,
        language: str = "en",
        **kwargs,
    ) -> bytes:
        """Convert to EBU-TT-D format bytes.

        Args:
            supervisions: List of supervisions to convert
            include_speaker: Whether to include speaker names
            language: Language code for the document
            **kwargs: Forwarded to ``TTMLFormat.to_bytes``

        Returns:
            The serialized document as UTF-8 bytes.
        """
        config = TTMLConfig(profile="ebu-tt-d", language=language)

        # TTMLFormat.to_bytes lets metadata["ttml_profile"] override the
        # config profile. This handler always emits EBU-TT-D, so that key
        # is dropped from a copy of the metadata before forwarding.
        metadata = kwargs.get("metadata")
        if metadata and "ttml_profile" in metadata:
            kwargs["metadata"] = {key: value for key, value in metadata.items() if key != "ttml_profile"}

        return TTMLFormat.to_bytes(
            supervisions,
            include_speaker=include_speaker,
            config=config,
            **kwargs,
        )
|
|
649
|
-
|
|
650
|
-
|
|
651
|
-
# Public names of this module: the three registered format handlers, the
# configuration dataclasses, and the iTunes namespace constant.
__all__ = ["TTMLFormat", "IMSC1Format", "EBUTD_Format", "TTMLConfig", "TTMLStyle", "TTMLRegion", "ITUNES_NS"]
|