lattifai 1.2.0__py3-none-any.whl → 1.2.2__py3-none-any.whl
This diff represents the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
- lattifai/__init__.py +0 -24
- lattifai/alignment/__init__.py +10 -1
- lattifai/alignment/lattice1_aligner.py +66 -58
- lattifai/alignment/lattice1_worker.py +1 -6
- lattifai/alignment/punctuation.py +38 -0
- lattifai/alignment/segmenter.py +1 -1
- lattifai/alignment/sentence_splitter.py +350 -0
- lattifai/alignment/text_align.py +440 -0
- lattifai/alignment/tokenizer.py +91 -220
- lattifai/caption/__init__.py +82 -6
- lattifai/caption/caption.py +335 -1143
- lattifai/caption/formats/__init__.py +199 -0
- lattifai/caption/formats/base.py +211 -0
- lattifai/caption/formats/gemini.py +722 -0
- lattifai/caption/formats/json.py +194 -0
- lattifai/caption/formats/lrc.py +309 -0
- lattifai/caption/formats/nle/__init__.py +9 -0
- lattifai/caption/formats/nle/audition.py +561 -0
- lattifai/caption/formats/nle/avid.py +423 -0
- lattifai/caption/formats/nle/fcpxml.py +549 -0
- lattifai/caption/formats/nle/premiere.py +589 -0
- lattifai/caption/formats/pysubs2.py +642 -0
- lattifai/caption/formats/sbv.py +147 -0
- lattifai/caption/formats/tabular.py +338 -0
- lattifai/caption/formats/textgrid.py +193 -0
- lattifai/caption/formats/ttml.py +652 -0
- lattifai/caption/formats/vtt.py +469 -0
- lattifai/caption/parsers/__init__.py +9 -0
- lattifai/caption/{text_parser.py → parsers/text_parser.py} +4 -2
- lattifai/caption/standardize.py +636 -0
- lattifai/caption/utils.py +474 -0
- lattifai/cli/__init__.py +2 -1
- lattifai/cli/caption.py +108 -1
- lattifai/cli/transcribe.py +4 -9
- lattifai/cli/youtube.py +4 -1
- lattifai/client.py +48 -84
- lattifai/config/__init__.py +11 -1
- lattifai/config/alignment.py +9 -2
- lattifai/config/caption.py +267 -23
- lattifai/config/media.py +20 -0
- lattifai/diarization/__init__.py +41 -1
- lattifai/mixin.py +36 -18
- lattifai/transcription/base.py +6 -1
- lattifai/transcription/lattifai.py +19 -54
- lattifai/utils.py +81 -13
- lattifai/workflow/__init__.py +28 -4
- lattifai/workflow/file_manager.py +2 -5
- lattifai/youtube/__init__.py +43 -0
- lattifai/youtube/client.py +1170 -0
- lattifai/youtube/types.py +23 -0
- lattifai-1.2.2.dist-info/METADATA +615 -0
- lattifai-1.2.2.dist-info/RECORD +76 -0
- {lattifai-1.2.0.dist-info → lattifai-1.2.2.dist-info}/entry_points.txt +1 -2
- lattifai/caption/gemini_reader.py +0 -371
- lattifai/caption/gemini_writer.py +0 -173
- lattifai/cli/app_installer.py +0 -142
- lattifai/cli/server.py +0 -44
- lattifai/server/app.py +0 -427
- lattifai/workflow/youtube.py +0 -577
- lattifai-1.2.0.dist-info/METADATA +0 -1133
- lattifai-1.2.0.dist-info/RECORD +0 -57
- {lattifai-1.2.0.dist-info → lattifai-1.2.2.dist-info}/WHEEL +0 -0
- {lattifai-1.2.0.dist-info → lattifai-1.2.2.dist-info}/licenses/LICENSE +0 -0
- {lattifai-1.2.0.dist-info → lattifai-1.2.2.dist-info}/top_level.txt +0 -0
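The new `lattifai/caption/formats/__init__.py` (+199 lines) introduces the `register_format` decorator that the handlers use (e.g. `@register_format("vtt")` in the vtt.py diff below). Its implementation is not part of this excerpt; the following is only a minimal sketch of how such a name-keyed registry typically works, not the package's actual code:

# Minimal sketch of a name-based format registry (assumption: the real
# register_format in lattifai/caption/formats/__init__.py is more featureful,
# e.g. it may also index handlers by file extension).
from typing import Dict

FORMAT_REGISTRY: Dict[str, type] = {}


def register_format(name: str):
    """Class decorator: record a format handler class under `name`."""

    def decorator(cls: type) -> type:
        FORMAT_REGISTRY[name] = cls
        return cls

    return decorator


def get_format(name: str) -> type:
    """Look up a registered handler, e.g. get_format("vtt")."""
    try:
        return FORMAT_REGISTRY[name]
    except KeyError:
        raise ValueError(f"Unknown caption format: {name!r}") from None
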
lattifai/caption/formats/vtt.py
@@ -0,0 +1,469 @@
+"""WebVTT format with YouTube VTT word-level timestamp support.
+
+This module provides a unified VTT format handler that:
+- Reads both standard VTT and YouTube VTT (with word-level timestamps)
+- Writes standard VTT or YouTube VTT (when karaoke_config.enabled=True)
+
+YouTube VTT format uses word-level tags like:
+    Word1<00:00:10.559><c> Word2</c><00:00:11.000><c> Word3</c>
+"""
+
+import re
+from pathlib import Path
+from typing import Dict, List, Optional
+
+import pysubs2
+from lhotse.supervision import AlignmentItem
+
+from ...config.caption import KaraokeConfig
+from ..parsers.text_parser import normalize_text as normalize_text_fn
+from ..parsers.text_parser import parse_speaker_text
+from ..supervision import Supervision
+from . import register_format
+from .base import FormatHandler
+
+
+@register_format("vtt")
+class VTTFormat(FormatHandler):
+    """WebVTT format with YouTube VTT word-level timestamp support.
+
+    Reading:
+    - Auto-detects YouTube VTT format (with word-level timestamps)
+    - Falls back to standard VTT parsing via pysubs2
+
+    Writing:
+    - Standard VTT by default
+    - YouTube VTT style when word_level=True and karaoke_config.enabled=True
+    """
+
+    extensions = [".vtt"]
+    description = "Web Video Text Tracks - HTML5 standard with YouTube VTT support"
+
+    # Pattern to detect YouTube VTT word-level timestamps
+    YOUTUBE_VTT_PATTERN = re.compile(r"<\d{2}:\d{2}:\d{2}[.,]\d{3}><c>")
+
+    @classmethod
+    def can_read(cls, source) -> bool:
+        """Check if source is a VTT file."""
+        if cls.is_content(source):
+            return source.strip().startswith("WEBVTT")
+        try:
+            path_str = str(source).lower()
+            return path_str.endswith(".vtt")
+        except Exception:
+            return False
+
+    @classmethod
+    def _is_youtube_vtt(cls, content: str) -> bool:
+        """Check if content is YouTube VTT format with word-level timestamps."""
+        return bool(cls.YOUTUBE_VTT_PATTERN.search(content))
+
+    @classmethod
+    def read(
+        cls,
+        source,
+        normalize_text: bool = True,
+        **kwargs,
+    ) -> List[Supervision]:
+        """Read VTT format, auto-detecting YouTube VTT word-level timestamps.
+
+        Args:
+            source: File path or content string
+            normalize_text: Whether to normalize text
+
+        Returns:
+            List of Supervision objects
+        """
+        if cls.is_content(source):
+            content = source
+        else:
+            with open(source, "r", encoding="utf-8") as f:
+                content = f.read()
+
+        # Auto-detect YouTube VTT format
+        if cls._is_youtube_vtt(content):
+            return cls._read_youtube_vtt(content, normalize_text)
+        else:
+            return cls._read_standard_vtt(source if not cls.is_content(source) else content, normalize_text)
+
+    @classmethod
+    def _read_standard_vtt(cls, source, normalize_text: bool = True) -> List[Supervision]:
+        """Read standard VTT using pysubs2."""
+        try:
+            if cls.is_content(source):
+                subs = pysubs2.SSAFile.from_string(source, format_="vtt")
+            else:
+                subs = pysubs2.load(str(source), encoding="utf-8", format_="vtt")
+        except Exception:
+            if cls.is_content(source):
+                subs = pysubs2.SSAFile.from_string(source)
+            else:
+                subs = pysubs2.load(str(source), encoding="utf-8")
+
+        supervisions = []
+        for event in subs.events:
+            text = event.text
+            if normalize_text:
+                text = normalize_text_fn(text)
+
+            speaker, text = parse_speaker_text(text)
+
+            supervisions.append(
+                Supervision(
+                    text=text,
+                    speaker=speaker or event.name or None,
+                    start=event.start / 1000.0 if event.start is not None else 0,
+                    duration=(event.end - event.start) / 1000.0 if event.end is not None else 0,
+                )
+            )
+
+        return supervisions
+
+    @classmethod
+    def _read_youtube_vtt(cls, content: str, normalize_text: bool = True) -> List[Supervision]:
+        """Parse YouTube VTT format with word-level timestamps."""
+        supervisions = []
+
+        # Pattern to match timestamp lines: 00:00:14.280 --> 00:00:17.269
+        timestamp_pattern = re.compile(r"(\d{2}:\d{2}:\d{2}[.,]\d{3})\s*-->\s*(\d{2}:\d{2}:\d{2}[.,]\d{3})")
+
+        # Pattern to match word-level timestamps: <00:00:10.559><c> word</c>
+        word_timestamp_pattern = re.compile(r"<(\d{2}:\d{2}:\d{2}[.,]\d{3})><c>\s*([^<]+)</c>")
+
+        # Pattern to match the first word (before first timestamp)
+        first_word_pattern = re.compile(r"^([^<\n]+?)<(\d{2}:\d{2}:\d{2}[.,]\d{3})>")
+
+        def parse_timestamp(ts: str) -> float:
+            """Convert timestamp string to seconds."""
+            ts = ts.replace(",", ".")
+            parts = ts.split(":")
+            hours = int(parts[0])
+            minutes = int(parts[1])
+            seconds = float(parts[2])
+            return hours * 3600 + minutes * 60 + seconds
+
+        def has_word_timestamps(text: str) -> bool:
+            """Check if text contains word-level timestamps."""
+            return bool(word_timestamp_pattern.search(text) or first_word_pattern.match(text))
+
+        lines = content.split("\n")
+        i = 0
+
+        # First pass: collect all cues with their content
+        all_cues = []
+        while i < len(lines):
+            line = lines[i]
+            ts_match = timestamp_pattern.search(line)
+            if ts_match:
+                cue_start = parse_timestamp(ts_match.group(1))
+                cue_end = parse_timestamp(ts_match.group(2))
+
+                cue_lines = []
+                i += 1
+                while i < len(lines):
+                    if timestamp_pattern.search(lines[i]):
+                        break
+                    stripped = lines[i].strip()
+                    if not stripped and cue_lines and not lines[i - 1].strip():
+                        break
+                    if stripped:
+                        cue_lines.append(lines[i])
+                    i += 1
+
+                all_cues.append({"start": cue_start, "end": cue_end, "lines": cue_lines})
+                continue
+            i += 1
+
+        # Second pass: identify cues to skip and merge
+        cues_to_skip = set()
+        cues_to_merge_text = {}
+
+        for idx in range(len(all_cues) - 1):
+            cue = all_cues[idx]
+            duration = cue["end"] - cue["start"]
+
+            if abs(duration - 0.010) < 0.001:
+                cue_text = "\n".join(cue["lines"])
+                if not has_word_timestamps(cue_text):
+                    next_cue = all_cues[idx + 1]
+                    if abs(next_cue["start"] - cue["end"]) < 0.001:
+                        cues_to_skip.add(idx)
+
+                        next_cue_text = "\n".join(next_cue["lines"])
+                        if not has_word_timestamps(next_cue_text):
+                            for prev_idx in range(idx - 1, -1, -1):
+                                if prev_idx not in cues_to_skip:
+                                    if len(next_cue["lines"]) > 1:
+                                        append_text = next_cue["lines"][-1].strip()
+                                        if append_text:
+                                            cues_to_merge_text[prev_idx] = append_text
+                                    cues_to_skip.add(idx + 1)
+                                    break
+
+        # Third pass: process remaining cues
+        for idx, cue in enumerate(all_cues):
+            if idx in cues_to_skip:
+                continue
+
+            cue_start = cue["start"]
+            cue_end = cue["end"]
+            cue_lines = cue["lines"]
+
+            word_alignments = []
+            text_parts = []
+
+            for cue_line in cue_lines:
+                cue_line = cue_line.strip()
+                if not cue_line:
+                    continue
+
+                word_matches = word_timestamp_pattern.findall(cue_line)
+                first_match = first_word_pattern.match(cue_line)
+
+                if word_matches or first_match:
+                    if first_match:
+                        first_word = first_match.group(1).strip()
+                        first_word_next_ts = parse_timestamp(first_match.group(2))
+                        if first_word:
+                            text_parts.append(first_word)
+                            word_alignments.append(
+                                AlignmentItem(
+                                    symbol=first_word,
+                                    start=cue_start,
+                                    duration=max(0.01, first_word_next_ts - cue_start),
+                                )
+                            )
+
+                    for word_idx, (ts, word) in enumerate(word_matches):
+                        word_start = parse_timestamp(ts)
+                        word = word.strip()
+                        if not word:
+                            continue
+
+                        text_parts.append(word)
+
+                        if word_idx + 1 < len(word_matches):
+                            next_ts = parse_timestamp(word_matches[word_idx + 1][0])
+                            duration = next_ts - word_start
+                        else:
+                            duration = cue_end - word_start
+
+                        word_alignments.append(
+                            AlignmentItem(
+                                symbol=word,
+                                start=word_start,
+                                duration=max(0.01, duration),
+                            )
+                        )
+
+            if not text_parts:
+                continue
+
+            full_text = " ".join(text_parts)
+            if idx in cues_to_merge_text:
+                full_text += " " + cues_to_merge_text[idx]
+
+            if normalize_text:
+                full_text = normalize_text_fn(full_text)
+
+            if word_alignments:
+                sup_start = word_alignments[0].start
+                sup_end = word_alignments[-1].start + word_alignments[-1].duration
+            else:
+                sup_start = cue_start
+                sup_end = cue_end
+
+            supervisions.append(
+                Supervision(
+                    text=full_text,
+                    start=sup_start,
+                    duration=max(0.0, sup_end - sup_start),
+                    alignment={"word": word_alignments} if word_alignments else None,
+                )
+            )
+
+        return supervisions
+
+    @classmethod
+    def extract_metadata(cls, source, **kwargs) -> Dict[str, str]:
+        """Extract metadata from VTT header."""
+        if cls.is_content(source):
+            content = source[:4096]
+        else:
+            try:
+                with open(source, "r", encoding="utf-8") as f:
+                    content = f.read(4096)
+            except Exception:
+                return {}
+
+        metadata = {}
+        lines = content.split("\n")
+        for line in lines[:10]:
+            line = line.strip()
+            if line.startswith("Kind:"):
+                metadata["kind"] = line.split(":", 1)[1].strip()
+            elif line.startswith("Language:"):
+                metadata["language"] = line.split(":", 1)[1].strip()
+            elif line.startswith("NOTE"):
+                match = re.search(r"NOTE\s+(\w+):\s*(.+)", line)
+                if match:
+                    key, value = match.groups()
+                    metadata[key.lower()] = value.strip()
+
+        return metadata
+
+    @classmethod
+    def write(
+        cls,
+        supervisions: List[Supervision],
+        output_path,
+        include_speaker: bool = True,
+        **kwargs,
+    ) -> Path:
+        """Write VTT to file."""
+        output_path = Path(output_path)
+        content = cls.to_bytes(supervisions, include_speaker=include_speaker, **kwargs)
+        output_path.write_bytes(content)
+        return output_path
+
+    @classmethod
+    def to_bytes(
+        cls,
+        supervisions: List[Supervision],
+        include_speaker: bool = True,
+        fps: float = 25.0,
+        word_level: bool = False,
+        karaoke_config: Optional[KaraokeConfig] = None,
+        metadata: Optional[Dict] = None,
+        **kwargs,
+    ) -> bytes:
+        """Convert to VTT bytes with optional karaoke and metadata preservation.
+
+        Args:
+            supervisions: List of supervision segments
+            include_speaker: Whether to include speaker in output
+            fps: Frames per second (not used for VTT)
+            word_level: If True and alignment exists, output word-per-segment or karaoke
+            karaoke_config: Karaoke configuration. When enabled, output YouTube VTT
+                style with word-level timestamps: <00:00:10.559><c> word</c>
+            metadata: Optional metadata dict containing kind and language
+
+        Returns:
+            VTT content as bytes
+        """
+        from .base import expand_to_word_supervisions
+
+        karaoke_enabled = karaoke_config is not None and karaoke_config.enabled
+
+        # If karaoke enabled, output YouTube VTT style
+        if word_level and karaoke_enabled:
+            return cls._to_youtube_vtt_bytes(supervisions, include_speaker, metadata)
+
+        # If word_level only (no karaoke), expand to word-per-segment
+        if word_level:
+            supervisions = expand_to_word_supervisions(supervisions)
+
+        # Build VTT with metadata header
+        return cls._to_vtt_bytes_with_metadata(supervisions, include_speaker, metadata)
+
+    @classmethod
+    def _to_vtt_bytes_with_metadata(
+        cls,
+        supervisions: List[Supervision],
+        include_speaker: bool = True,
+        metadata: Optional[Dict] = None,
+    ) -> bytes:
+        """Generate VTT with metadata header."""
+        lines = ["WEBVTT"]
+
+        if metadata:
+            if metadata.get("kind"):
+                lines.append(f"Kind: {metadata['kind']}")
+            if metadata.get("language"):
+                lines.append(f"Language: {metadata['language']}")
+
+        lines.append("")
+
+        subs = pysubs2.SSAFile()
+        for sup in supervisions:
+            text = sup.text or ""
+            if cls._should_include_speaker(sup, include_speaker):
+                text = f"{sup.speaker} {text}"
+            subs.append(
+                pysubs2.SSAEvent(
+                    start=int(sup.start * 1000),
+                    end=int(sup.end * 1000),
+                    text=text,
+                    name=sup.speaker or "",
+                )
+            )
+
+        vtt_content = subs.to_string(format_="vtt")
+        vtt_lines = vtt_content.split("\n")
+        started = False
+        for line in vtt_lines[1:]:
+            if not started and not line.strip():
+                continue
+            started = True
+            lines.append(line)
+
+        return "\n".join(lines).encode("utf-8")
+
+    @classmethod
+    def _to_youtube_vtt_bytes(
+        cls,
+        supervisions: List[Supervision],
+        include_speaker: bool = True,
+        metadata: Optional[Dict] = None,
+    ) -> bytes:
+        """Generate YouTube VTT format with word-level timestamps.
+
+        Format: <00:00:10.559><c> word</c>
+        """
+
+        def format_timestamp(seconds: float) -> str:
+            """Format seconds into HH:MM:SS.mmm."""
+            h = int(seconds // 3600)
+            m = int((seconds % 3600) // 60)
+            s = int(seconds % 60)
+            ms = int(round((seconds % 1) * 1000))
+            if ms == 1000:
+                s += 1
+                ms = 0
+            return f"{h:02d}:{m:02d}:{s:02d}.{ms:03d}"
+
+        lines = ["WEBVTT"]
+
+        if metadata:
+            if metadata.get("kind"):
+                lines.append(f"Kind: {metadata['kind']}")
+            if metadata.get("language"):
+                lines.append(f"Language: {metadata['language']}")
+
+        lines.append("")
+
+        for sup in sorted(supervisions, key=lambda x: x.start):
+            text = sup.text or ""
+            alignment = getattr(sup, "alignment", None)
+            words = alignment.get("word") if alignment else None
+
+            if words:
+                cue_start = words[0].start
+                cue_end = words[-1].end
+                lines.append(f"{format_timestamp(cue_start)} --> {format_timestamp(cue_end)}")
+
+                text_parts = []
+                for i, word in enumerate(words):
+                    symbol = word.symbol
+                    if i == 0 and include_speaker and sup.speaker:
+                        symbol = f"{sup.speaker}: {symbol}"
+                    text_parts.append(f"<{format_timestamp(word.start)}><c> {symbol}</c>")
+                lines.append("".join(text_parts))
+            else:
+                lines.append(f"{format_timestamp(sup.start)} --> {format_timestamp(sup.end)}")
+                if include_speaker and sup.speaker:
+                    text = f"{sup.speaker}: {text}"
+                lines.append(text)
+            lines.append("")
+
+        return "\n".join(lines).encode("utf-8")
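To make the read path concrete, here is a usage sketch against a tiny hypothetical YouTube VTT snippet. It assumes `FormatHandler.is_content()` (defined in formats/base.py, not shown here) recognizes a raw string as inline content rather than a path:

# The <HH:MM:SS.mmm><c> ...</c> tags match YOUTUBE_VTT_PATTERN, so read()
# routes this content to _read_youtube_vtt() instead of pysubs2.
content = """WEBVTT
Kind: captions
Language: en

00:00:10.000 --> 00:00:12.000
Hello<00:00:10.559><c> there</c><00:00:11.000><c> world</c>
"""

sups = VTTFormat.read(content, normalize_text=False)
sup = sups[0]
print(sup.text)                  # Hello there world
words = sup.alignment["word"]    # one AlignmentItem per word
print(words[0].symbol, words[0].start, round(words[0].duration, 3))
# Hello 10.0 0.559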
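And the corresponding write path: with word_level=True and an enabled karaoke config, to_bytes() emits the YouTube-style tags via _to_youtube_vtt_bytes(). A sketch, assuming KaraokeConfig (from lattifai/config/caption.py, extended in this diff with +267 -23) can be constructed with enabled=True:

from lattifai.config.caption import KaraokeConfig

data = VTTFormat.to_bytes(
    sups,  # the supervisions parsed in the sketch above
    word_level=True,
    karaoke_config=KaraokeConfig(enabled=True),
    metadata={"kind": "captions", "language": "en"},
)
print(data.decode("utf-8"))
# WEBVTT
# Kind: captions
# Language: en
#
# 00:00:10.000 --> 00:00:12.000
# <00:00:10.000><c> Hello</c><00:00:10.559><c> there</c><00:00:11.000><c> world</c>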
lattifai/caption/parsers/text_parser.py
@@ -69,7 +69,8 @@ def parse_speaker_text(line) -> Tuple[Optional[str], str]:
 
     match = SPEAKER_LATTIFAI.match(line)
     if match:
-
+        if len(match.groups()) != 2:
+            raise ValueError(f"Expected 2 groups in SPEAKER_LATTIFAI match, got {match.groups()}")
         if not match.group(1):
             logging.error(f"ParseSub LINE [{line}]")
         else:
@@ -77,7 +78,8 @@ def parse_speaker_text(line) -> Tuple[Optional[str], str]:
 
     match = SPEAKER_PATTERN2.match(line)
     if match:
-
+        if len(match.groups()) != 2:
+            raise ValueError(f"Expected 2 groups in SPEAKER_PATTERN2 match, got {match.groups()}")
         return match.group(1).strip(), match.group(2).strip()
 
     return None, line