lattifai 1.2.2__py3-none-any.whl → 1.3.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lattifai/_init.py +20 -0
- lattifai/alignment/__init__.py +2 -3
- lattifai/alignment/lattice1_aligner.py +117 -4
- lattifai/alignment/lattice1_worker.py +47 -4
- lattifai/alignment/segmenter.py +3 -2
- lattifai/alignment/text_align.py +2 -1
- lattifai/alignment/tokenizer.py +56 -29
- lattifai/audio2.py +162 -183
- lattifai/cli/alignment.py +5 -0
- lattifai/cli/caption.py +6 -6
- lattifai/cli/transcribe.py +1 -5
- lattifai/cli/youtube.py +3 -0
- lattifai/client.py +41 -12
- lattifai/config/__init__.py +21 -3
- lattifai/config/alignment.py +7 -0
- lattifai/config/caption.py +13 -243
- lattifai/config/client.py +16 -0
- lattifai/config/event.py +102 -0
- lattifai/config/transcription.py +25 -1
- lattifai/data/__init__.py +8 -0
- lattifai/data/caption.py +228 -0
- lattifai/errors.py +78 -53
- lattifai/event/__init__.py +65 -0
- lattifai/event/lattifai.py +166 -0
- lattifai/mixin.py +22 -17
- lattifai/transcription/base.py +2 -1
- lattifai/transcription/gemini.py +147 -16
- lattifai/transcription/lattifai.py +8 -11
- lattifai/types.py +1 -1
- lattifai/youtube/client.py +143 -48
- {lattifai-1.2.2.dist-info → lattifai-1.3.1.dist-info}/METADATA +129 -58
- lattifai-1.3.1.dist-info/RECORD +57 -0
- lattifai/__init__.py +0 -88
- lattifai/alignment/sentence_splitter.py +0 -350
- lattifai/caption/__init__.py +0 -96
- lattifai/caption/caption.py +0 -661
- lattifai/caption/formats/__init__.py +0 -199
- lattifai/caption/formats/base.py +0 -211
- lattifai/caption/formats/gemini.py +0 -722
- lattifai/caption/formats/json.py +0 -194
- lattifai/caption/formats/lrc.py +0 -309
- lattifai/caption/formats/nle/__init__.py +0 -9
- lattifai/caption/formats/nle/audition.py +0 -561
- lattifai/caption/formats/nle/avid.py +0 -423
- lattifai/caption/formats/nle/fcpxml.py +0 -549
- lattifai/caption/formats/nle/premiere.py +0 -589
- lattifai/caption/formats/pysubs2.py +0 -642
- lattifai/caption/formats/sbv.py +0 -147
- lattifai/caption/formats/tabular.py +0 -338
- lattifai/caption/formats/textgrid.py +0 -193
- lattifai/caption/formats/ttml.py +0 -652
- lattifai/caption/formats/vtt.py +0 -469
- lattifai/caption/parsers/__init__.py +0 -9
- lattifai/caption/parsers/text_parser.py +0 -147
- lattifai/caption/standardize.py +0 -636
- lattifai/caption/supervision.py +0 -34
- lattifai/caption/utils.py +0 -474
- lattifai-1.2.2.dist-info/RECORD +0 -76
- {lattifai-1.2.2.dist-info → lattifai-1.3.1.dist-info}/WHEEL +0 -0
- {lattifai-1.2.2.dist-info → lattifai-1.3.1.dist-info}/entry_points.txt +0 -0
- {lattifai-1.2.2.dist-info → lattifai-1.3.1.dist-info}/licenses/LICENSE +0 -0
- {lattifai-1.2.2.dist-info → lattifai-1.3.1.dist-info}/top_level.txt +0 -0
lattifai/caption/formats/vtt.py
DELETED
|
@@ -1,469 +0,0 @@
|
|
|
1
|
-
"""WebVTT format with YouTube VTT word-level timestamp support.
|
|
2
|
-
|
|
3
|
-
This module provides a unified VTT format handler that:
|
|
4
|
-
- Reads both standard VTT and YouTube VTT (with word-level timestamps)
|
|
5
|
-
- Writes standard VTT or YouTube VTT (when karaoke_config.enabled=True)
|
|
6
|
-
|
|
7
|
-
YouTube VTT format uses word-level tags like:
|
|
8
|
-
Word1<00:00:10.559><c> Word2</c><00:00:11.000><c> Word3</c>
|
|
9
|
-
"""
|
|
10
|
-
|
|
11
|
-
import re
|
|
12
|
-
from pathlib import Path
|
|
13
|
-
from typing import Dict, List, Optional
|
|
14
|
-
|
|
15
|
-
import pysubs2
|
|
16
|
-
from lhotse.supervision import AlignmentItem
|
|
17
|
-
|
|
18
|
-
from ...config.caption import KaraokeConfig
|
|
19
|
-
from ..parsers.text_parser import normalize_text as normalize_text_fn
|
|
20
|
-
from ..parsers.text_parser import parse_speaker_text
|
|
21
|
-
from ..supervision import Supervision
|
|
22
|
-
from . import register_format
|
|
23
|
-
from .base import FormatHandler
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
@register_format("vtt")
class VTTFormat(FormatHandler):
    """WebVTT format with YouTube VTT word-level timestamp support.

    Reading:
        - Auto-detects YouTube VTT format (with word-level timestamps)
        - Falls back to standard VTT parsing via pysubs2

    Writing:
        - Standard VTT by default
        - YouTube VTT style when word_level=True and karaoke_config.enabled=True
    """

    extensions = [".vtt"]
    description = "Web Video Text Tracks - HTML5 standard with YouTube VTT support"

    # Pattern to detect YouTube VTT word-level timestamps, e.g. "<00:00:10.559><c>"
    YOUTUBE_VTT_PATTERN = re.compile(r"<\d{2}:\d{2}:\d{2}[.,]\d{3}><c>")

    @classmethod
    def can_read(cls, source) -> bool:
        """Return True if *source* looks like VTT.

        Raw content must start with the "WEBVTT" signature (after stripping
        whitespace); anything else is treated as a path and matched on the
        ".vtt" suffix, case-insensitively.
        """
        if cls.is_content(source):
            return source.strip().startswith("WEBVTT")
        try:
            return str(source).lower().endswith(".vtt")
        except Exception:
            # Best effort: an object that cannot be stringified is not a VTT path.
            return False

    @classmethod
    def _is_youtube_vtt(cls, content: str) -> bool:
        """Check if content contains YouTube-style word-level timestamp tags."""
        return bool(cls.YOUTUBE_VTT_PATTERN.search(content))

    @classmethod
    def read(
        cls,
        source,
        normalize_text: bool = True,
        **kwargs,
    ) -> List[Supervision]:
        """Read VTT format, auto-detecting YouTube VTT word-level timestamps.

        Args:
            source: File path or content string
            normalize_text: Whether to normalize text

        Returns:
            List of Supervision objects
        """
        if cls.is_content(source):
            content = source
        else:
            with open(source, "r", encoding="utf-8") as f:
                content = f.read()

        if cls._is_youtube_vtt(content):
            return cls._read_youtube_vtt(content, normalize_text)
        # When source is content, `content is source`; when it is a path the
        # standard reader loads the file itself, so `source` is right either way.
        return cls._read_standard_vtt(source, normalize_text)

    @classmethod
    def _read_standard_vtt(cls, source, normalize_text: bool = True) -> List[Supervision]:
        """Read standard VTT using pysubs2.

        Args:
            source: File path or content string
            normalize_text: Whether to run each cue text through the text normalizer

        Returns:
            One Supervision per pysubs2 event, with times converted from
            milliseconds to seconds.
        """
        is_content = cls.is_content(source)
        try:
            if is_content:
                subs = pysubs2.SSAFile.from_string(source, format_="vtt")
            else:
                subs = pysubs2.load(str(source), encoding="utf-8", format_="vtt")
        except Exception:
            # Deliberate fallback: retry without forcing the format so pysubs2
            # can detect it on its own. A second failure propagates.
            if is_content:
                subs = pysubs2.SSAFile.from_string(source)
            else:
                subs = pysubs2.load(str(source), encoding="utf-8")

        supervisions = []
        for event in subs.events:
            text = event.text
            if normalize_text:
                text = normalize_text_fn(text)

            speaker, text = parse_speaker_text(text)

            supervisions.append(
                Supervision(
                    text=text,
                    # A speaker parsed from the text wins over the event's name field.
                    speaker=speaker or event.name or None,
                    start=event.start / 1000.0 if event.start is not None else 0,
                    duration=(event.end - event.start) / 1000.0 if event.end is not None else 0,
                )
            )

        return supervisions

    @classmethod
    def _read_youtube_vtt(cls, content: str, normalize_text: bool = True) -> List[Supervision]:
        """Parse YouTube VTT format with word-level timestamps.

        Works in three passes:
            1. Collect every cue (start, end, non-blank payload lines).
            2. Mark ~10 ms cues without word tags that are immediately followed
               by the next cue as skippable, and record trailing text that
               should be appended to an earlier cue.
            3. Build one Supervision per remaining cue that yielded words,
               with a word-level alignment.

        Args:
            content: Full VTT text
            normalize_text: Whether to normalize each supervision's text

        Returns:
            List of Supervision objects carrying ``alignment={"word": [...]}``.
        """
        supervisions = []

        # Cue timing line: 00:00:14.280 --> 00:00:17.269
        timestamp_pattern = re.compile(r"(\d{2}:\d{2}:\d{2}[.,]\d{3})\s*-->\s*(\d{2}:\d{2}:\d{2}[.,]\d{3})")

        # Word-level tag: <00:00:10.559><c> word</c>
        word_timestamp_pattern = re.compile(r"<(\d{2}:\d{2}:\d{2}[.,]\d{3})><c>\s*([^<]+)</c>")

        # First word of a line (plain text before the first timestamp tag)
        first_word_pattern = re.compile(r"^([^<\n]+?)<(\d{2}:\d{2}:\d{2}[.,]\d{3})>")

        def parse_timestamp(ts: str) -> float:
            """Convert an HH:MM:SS.mmm (or comma-separated) timestamp to seconds."""
            hours, minutes, seconds = ts.replace(",", ".").split(":")
            return int(hours) * 3600 + int(minutes) * 60 + float(seconds)

        def has_word_timestamps(text: str) -> bool:
            """Check if text contains word-level timestamps."""
            return bool(word_timestamp_pattern.search(text) or first_word_pattern.match(text))

        lines = content.split("\n")
        i = 0

        # First pass: collect all cues with their content
        all_cues = []
        while i < len(lines):
            ts_match = timestamp_pattern.search(lines[i])
            if not ts_match:
                i += 1
                continue

            cue_start = parse_timestamp(ts_match.group(1))
            cue_end = parse_timestamp(ts_match.group(2))

            cue_lines = []
            i += 1
            while i < len(lines):
                if timestamp_pattern.search(lines[i]):
                    break  # next cue begins; the outer loop handles this line
                stripped = lines[i].strip()
                # Two consecutive blank lines after some payload end the cue.
                if not stripped and cue_lines and not lines[i - 1].strip():
                    break
                if stripped:
                    cue_lines.append(lines[i])
                i += 1

            all_cues.append({"start": cue_start, "end": cue_end, "lines": cue_lines})

        # Second pass: identify cues to skip and merge
        # NOTE(review): this listing lost its indentation; the nesting of the
        # innermost `add`/`break` below is the most plausible reconstruction
        # (stop at the nearest earlier cue that is not skipped) - confirm
        # against the released 1.2.2 source.
        cues_to_skip = set()
        cues_to_merge_text = {}

        for idx in range(len(all_cues) - 1):
            cue = all_cues[idx]
            duration = cue["end"] - cue["start"]

            # Only ~10 ms cues are candidates.
            if abs(duration - 0.010) >= 0.001:
                continue
            if has_word_timestamps("\n".join(cue["lines"])):
                continue

            next_cue = all_cues[idx + 1]
            if abs(next_cue["start"] - cue["end"]) >= 0.001:
                continue

            cues_to_skip.add(idx)

            if has_word_timestamps("\n".join(next_cue["lines"])):
                continue

            for prev_idx in range(idx - 1, -1, -1):
                if prev_idx not in cues_to_skip:
                    if len(next_cue["lines"]) > 1:
                        append_text = next_cue["lines"][-1].strip()
                        if append_text:
                            cues_to_merge_text[prev_idx] = append_text
                            cues_to_skip.add(idx + 1)
                    break

        # Third pass: process remaining cues
        for idx, cue in enumerate(all_cues):
            if idx in cues_to_skip:
                continue

            cue_start = cue["start"]
            cue_end = cue["end"]

            word_alignments = []
            text_parts = []

            for cue_line in cue["lines"]:
                cue_line = cue_line.strip()
                if not cue_line:
                    continue

                word_matches = word_timestamp_pattern.findall(cue_line)
                first_match = first_word_pattern.match(cue_line)

                if first_match:
                    first_word = first_match.group(1).strip()
                    first_word_next_ts = parse_timestamp(first_match.group(2))
                    if first_word:
                        text_parts.append(first_word)
                        # The untagged first word starts with the cue and lasts
                        # until the first tag (at least 10 ms).
                        word_alignments.append(
                            AlignmentItem(
                                symbol=first_word,
                                start=cue_start,
                                duration=max(0.01, first_word_next_ts - cue_start),
                            )
                        )

                for word_idx, (ts, word) in enumerate(word_matches):
                    word_start = parse_timestamp(ts)
                    word = word.strip()
                    if not word:
                        continue

                    text_parts.append(word)

                    # A word lasts until the next tag on the line, or the cue end.
                    if word_idx + 1 < len(word_matches):
                        next_ts = parse_timestamp(word_matches[word_idx + 1][0])
                        duration = next_ts - word_start
                    else:
                        duration = cue_end - word_start

                    word_alignments.append(
                        AlignmentItem(
                            symbol=word,
                            start=word_start,
                            duration=max(0.01, duration),
                        )
                    )

            if not text_parts:
                continue

            full_text = " ".join(text_parts)
            if idx in cues_to_merge_text:
                full_text += " " + cues_to_merge_text[idx]

            if normalize_text:
                full_text = normalize_text_fn(full_text)

            if word_alignments:
                sup_start = word_alignments[0].start
                sup_end = word_alignments[-1].start + word_alignments[-1].duration
            else:
                sup_start = cue_start
                sup_end = cue_end

            supervisions.append(
                Supervision(
                    text=full_text,
                    start=sup_start,
                    duration=max(0.0, sup_end - sup_start),
                    alignment={"word": word_alignments} if word_alignments else None,
                )
            )

        return supervisions

    @classmethod
    def extract_metadata(cls, source, **kwargs) -> Dict[str, str]:
        """Extract metadata from VTT header.

        Only the first 4096 characters / first 10 lines are inspected.
        Recognizes ``Kind:``, ``Language:`` and ``NOTE key: value`` lines.

        Returns:
            Dict with lower-cased keys; empty if the file cannot be read.
        """
        if cls.is_content(source):
            content = source[:4096]
        else:
            try:
                with open(source, "r", encoding="utf-8") as f:
                    content = f.read(4096)
            except Exception:
                # Metadata is optional; an unreadable file yields none.
                return {}

        metadata = {}
        for line in content.split("\n")[:10]:
            line = line.strip()
            if line.startswith("Kind:"):
                metadata["kind"] = line.split(":", 1)[1].strip()
            elif line.startswith("Language:"):
                metadata["language"] = line.split(":", 1)[1].strip()
            elif line.startswith("NOTE"):
                match = re.search(r"NOTE\s+(\w+):\s*(.+)", line)
                if match:
                    key, value = match.groups()
                    metadata[key.lower()] = value.strip()

        return metadata

    @classmethod
    def write(
        cls,
        supervisions: List[Supervision],
        output_path,
        include_speaker: bool = True,
        **kwargs,
    ) -> Path:
        """Write VTT to file.

        Extra keyword arguments are forwarded to :meth:`to_bytes`.

        Returns:
            The output path as a ``Path``.
        """
        output_path = Path(output_path)
        content = cls.to_bytes(supervisions, include_speaker=include_speaker, **kwargs)
        output_path.write_bytes(content)
        return output_path

    @classmethod
    def to_bytes(
        cls,
        supervisions: List[Supervision],
        include_speaker: bool = True,
        fps: float = 25.0,
        word_level: bool = False,
        karaoke_config: Optional[KaraokeConfig] = None,
        metadata: Optional[Dict] = None,
        **kwargs,
    ) -> bytes:
        """Convert to VTT bytes with optional karaoke and metadata preservation.

        Args:
            supervisions: List of supervision segments
            include_speaker: Whether to include speaker in output
            fps: Frames per second (not used for VTT)
            word_level: If True and alignment exists, output word-per-segment or karaoke
            karaoke_config: Karaoke configuration. When enabled, output YouTube VTT
                style with word-level timestamps: <00:00:10.559><c> word</c>
            metadata: Optional metadata dict containing kind and language

        Returns:
            VTT content as bytes
        """
        from .base import expand_to_word_supervisions

        karaoke_enabled = karaoke_config is not None and karaoke_config.enabled

        # If karaoke enabled, output YouTube VTT style
        if word_level and karaoke_enabled:
            return cls._to_youtube_vtt_bytes(supervisions, include_speaker, metadata)

        # If word_level only (no karaoke), expand to word-per-segment
        if word_level:
            supervisions = expand_to_word_supervisions(supervisions)

        # Build VTT with metadata header
        return cls._to_vtt_bytes_with_metadata(supervisions, include_speaker, metadata)

    @staticmethod
    def _header_lines(metadata: Optional[Dict]) -> List[str]:
        """Return the header lines: "WEBVTT", optional Kind/Language, then a blank line."""
        lines = ["WEBVTT"]
        if metadata:
            if metadata.get("kind"):
                lines.append(f"Kind: {metadata['kind']}")
            if metadata.get("language"):
                lines.append(f"Language: {metadata['language']}")
        lines.append("")
        return lines

    @staticmethod
    def _format_timestamp(seconds: float) -> str:
        """Format seconds into HH:MM:SS.mmm.

        Rounds to whole milliseconds first and then splits, so a value such as
        59.9996 becomes "00:01:00.000" rather than the invalid "00:00:60.000".
        """
        total_ms = int(round(seconds * 1000))
        hours, rem = divmod(total_ms, 3600000)
        minutes, rem = divmod(rem, 60000)
        secs, ms = divmod(rem, 1000)
        return f"{hours:02d}:{minutes:02d}:{secs:02d}.{ms:03d}"

    @classmethod
    def _to_vtt_bytes_with_metadata(
        cls,
        supervisions: List[Supervision],
        include_speaker: bool = True,
        metadata: Optional[Dict] = None,
    ) -> bytes:
        """Generate standard VTT (cues rendered by pysubs2) with a metadata header."""
        lines = cls._header_lines(metadata)

        subs = pysubs2.SSAFile()
        for sup in supervisions:
            text = sup.text or ""
            if cls._should_include_speaker(sup, include_speaker):
                text = f"{sup.speaker} {text}"
            subs.append(
                pysubs2.SSAEvent(
                    # round() rather than int(): 1.001 * 1000 is 1000.999... in
                    # floating point and plain truncation would drop a millisecond.
                    start=int(round(sup.start * 1000)),
                    end=int(round(sup.end * 1000)),
                    text=text,
                    name=sup.speaker or "",
                )
            )

        vtt_lines = subs.to_string(format_="vtt").split("\n")
        # Drop pysubs2's own first line and any blank lines right after it;
        # our header (with metadata) replaces them.
        started = False
        for line in vtt_lines[1:]:
            if not started and not line.strip():
                continue
            started = True
            lines.append(line)

        return "\n".join(lines).encode("utf-8")

    @classmethod
    def _to_youtube_vtt_bytes(
        cls,
        supervisions: List[Supervision],
        include_speaker: bool = True,
        metadata: Optional[Dict] = None,
    ) -> bytes:
        """Generate YouTube VTT format with word-level timestamps.

        Format: <00:00:10.559><c> word</c>

        Supervisions are emitted in start-time order. A supervision without a
        word alignment is written as a plain cue.
        """
        format_timestamp = cls._format_timestamp
        lines = cls._header_lines(metadata)

        for sup in sorted(supervisions, key=lambda x: x.start):
            text = sup.text or ""
            alignment = getattr(sup, "alignment", None)
            words = alignment.get("word") if alignment else None

            if words:
                cue_start = words[0].start
                cue_end = words[-1].end
                lines.append(f"{format_timestamp(cue_start)} --> {format_timestamp(cue_end)}")

                text_parts = []
                for i, word in enumerate(words):
                    symbol = word.symbol
                    # The speaker label rides on the first word of the cue.
                    if i == 0 and include_speaker and sup.speaker:
                        symbol = f"{sup.speaker}: {symbol}"
                    text_parts.append(f"<{format_timestamp(word.start)}><c> {symbol}</c>")
                lines.append("".join(text_parts))
            else:
                lines.append(f"{format_timestamp(sup.start)} --> {format_timestamp(sup.end)}")
                if include_speaker and sup.speaker:
                    text = f"{sup.speaker}: {text}"
                lines.append(text)
            lines.append("")

        return "\n".join(lines).encode("utf-8")
|
|
@@ -1,147 +0,0 @@
|
|
|
1
|
-
import logging
|
|
2
|
-
import re
|
|
3
|
-
from typing import Optional, Tuple
|
|
4
|
-
|
|
5
|
-
# Timestamp pattern: [start-end] text
# Example: [1.23-4.56] Hello world
TIMESTAMP_PATTERN = re.compile(r"^\[([\d.]+)-([\d.]+)\]\s*(.*)$")

# Speaker-label formats commonly seen in captions: a ">>" or ">" marker
# (literal or still HTML-escaped as "&gt;") followed by a name and a colon.
# The colon may be ASCII ":" or full-width "：".
# NOTE(review): the escaped alternatives and the full-width colon were
# flattened to duplicates in the listing this was recovered from - confirm
# against the released source.
SPEAKER_PATTERN = re.compile(r"((?:>>|&gt;&gt;|>|&gt;).*?[:：])\s*(.*)")

# Transcriber Output Example:
# 26:19.919 --> 26:34.921
# [SPEAKER_01]: More and more tech giants are entering...
SPEAKER_LATTIFAI = re.compile(r"(^\[SPEAKER_.*?\][:：])\s*(.*)")

# NISHTHA BHATIA: Hey, everyone.
# DIETER: Oh, hey, Nishtha.
# GEMINI: That might
SPEAKER_PATTERN2 = re.compile(r"^([A-Z]{1,15}(?:\s+[A-Z]{1,15})?[:：])\s*(.*)$")
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
def normalize_text(text: str) -> str:
    """Normalize caption text.

    Steps, in order:
        - Decode a small set of common HTML entities
        - Replace the ASS line break ``\\N`` and the ellipsis character with a space
        - Convert curly apostrophes to straight ones in common English contractions
        - Collapse runs of whitespace into a single space and strip the ends

    HTML tags are NOT removed.

    Args:
        text: Raw caption text; a falsy value (``None`` or ``""``) yields ``""``.

    Returns:
        The normalized text.
    """
    if not text:
        return ""

    # Replacements are applied sequentially in this order. "&amp;" goes first,
    # so double-escaped input such as "&amp;gt;" ends up fully decoded to ">".
    # NOTE(review): the entity spellings were lost in the listing this was
    # recovered from (keys appeared already decoded); the standard names are
    # restored here - confirm against the released source.
    replacements = {
        "&amp;": "&",
        "&lt;": "<",
        "&gt;": ">",
        "&quot;": '"',
        "&#39;": "'",
        "&apos;": "'",
        "&nbsp;": " ",
        "\\N": " ",
        "…": " ",  # replace ellipsis with space to avoid merging words
    }
    for needle, char in replacements.items():
        text = text.replace(needle, char)

    # Convert curly apostrophes to straight apostrophes for common English contractions
    text = re.sub(r"([a-zA-Z])’([tsdm]|ll|re|ve)\b", r"\1'\2", text, flags=re.IGNORECASE)
    text = re.sub(r"([0-9])’([s])\b", r"\1'\2", text, flags=re.IGNORECASE)

    # Collapse whitespace (after replacements)
    text = re.sub(r"\s+", " ", text)

    return text.strip()
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
def parse_speaker_text(line) -> Tuple[Optional[str], str]:
    """Parse a line of text to extract speaker and content.

    The line is tried against SPEAKER_PATTERN, SPEAKER_LATTIFAI and
    SPEAKER_PATTERN2, in that order; the first match wins.

    Args:
        line: One caption line.

    Returns:
        ``(speaker, text)`` where *speaker* is group 1 of the matching pattern
        (stripped) and *text* is group 2 (stripped), or ``(None, line)`` with
        the line untouched when nothing matches.
    """
    # Cheap pre-filter: every speaker pattern needs a colon, ASCII or full-width.
    # NOTE(review): the listing this was recovered from showed the same ASCII
    # colon twice; the second operand is restored as the full-width colon.
    if ":" not in line and "：" not in line:
        return None, line

    for pattern in (SPEAKER_PATTERN, SPEAKER_LATTIFAI, SPEAKER_PATTERN2):
        match = pattern.match(line)
        if match:
            return match.group(1).strip(), match.group(2).strip()

    return None, line
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
def parse_timestamp_text(line: str) -> Tuple[Optional[float], Optional[float], str]:
    """Split a ``[start-end] text`` line into its timestamps and text.

    Example: ``[1.23-4.56] Hello world`` -> ``(1.23, 4.56, "Hello world")``

    Args:
        line: Input line to parse

    Returns:
        Tuple of (start_time, end_time, text)
            - start_time: Start timestamp in seconds, or None if not found
            - end_time: End timestamp in seconds, or None if not found
            - text: The stripped text after the timestamp, or the original
              line unchanged when no usable timestamp prefix is present
    """
    found = TIMESTAMP_PATTERN.match(line)
    if found is None:
        return None, None, line

    raw_start, raw_end, remainder = found.groups()
    try:
        return float(raw_start), float(raw_end), remainder.strip()
    except ValueError:
        # The pattern accepts strings like "1.2.3" that are not valid floats;
        # treat such a line as plain text.
        return None, None, line
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
if __name__ == "__main__":
    # Ad-hoc smoke test: print what the ">>" speaker pattern and
    # SPEAKER_PATTERN2 capture for a few sample lines.
    # NOTE(review): the full-width colon below was restored; the listing this
    # was recovered from showed ASCII colons only - confirm against the
    # released source.
    pattern = re.compile(r"(>>.*?[:：])\s*(.*)")

    test_strings = [
        ">>Key: Value",
        ">> Key with space : Value with space ",
        ">> 全角键 ： 全角值",
        ">>Key:Value xxx. >>Key:Value",
    ]

    for text in test_strings:
        match = pattern.match(text)
        if match:
            print(f"Input: '{text}'")
            print(f"Speaker: '{match.group(1)}'")
            print(f"Content: '{match.group(2)}'")
            print("-------------")

    # pattern2: every sample here must match, otherwise fail loudly
    test_strings2 = ["NISHTHA BHATIA: Hey, everyone.", "DIETER: Oh, hey, Nishtha.", "GEMINI: That might"]
    for text in test_strings2:
        match = SPEAKER_PATTERN2.match(text)
        if match:
            print(f"  Input: '{text}'")
            print(f"Speaker: '{match.group(1)}'")
            print(f"Content: '{match.group(2)}'")
            print("-------------")
        else:
            raise ValueError(f"No match for: '{text}'")
|