lattifai 1.2.0__py3-none-any.whl → 1.2.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lattifai/__init__.py +0 -24
- lattifai/alignment/__init__.py +10 -1
- lattifai/alignment/lattice1_aligner.py +66 -58
- lattifai/alignment/lattice1_worker.py +1 -6
- lattifai/alignment/punctuation.py +38 -0
- lattifai/alignment/segmenter.py +1 -1
- lattifai/alignment/sentence_splitter.py +350 -0
- lattifai/alignment/text_align.py +440 -0
- lattifai/alignment/tokenizer.py +91 -220
- lattifai/caption/__init__.py +82 -6
- lattifai/caption/caption.py +335 -1143
- lattifai/caption/formats/__init__.py +199 -0
- lattifai/caption/formats/base.py +211 -0
- lattifai/caption/formats/gemini.py +722 -0
- lattifai/caption/formats/json.py +194 -0
- lattifai/caption/formats/lrc.py +309 -0
- lattifai/caption/formats/nle/__init__.py +9 -0
- lattifai/caption/formats/nle/audition.py +561 -0
- lattifai/caption/formats/nle/avid.py +423 -0
- lattifai/caption/formats/nle/fcpxml.py +549 -0
- lattifai/caption/formats/nle/premiere.py +589 -0
- lattifai/caption/formats/pysubs2.py +642 -0
- lattifai/caption/formats/sbv.py +147 -0
- lattifai/caption/formats/tabular.py +338 -0
- lattifai/caption/formats/textgrid.py +193 -0
- lattifai/caption/formats/ttml.py +652 -0
- lattifai/caption/formats/vtt.py +469 -0
- lattifai/caption/parsers/__init__.py +9 -0
- lattifai/caption/{text_parser.py → parsers/text_parser.py} +4 -2
- lattifai/caption/standardize.py +636 -0
- lattifai/caption/utils.py +474 -0
- lattifai/cli/__init__.py +2 -1
- lattifai/cli/caption.py +108 -1
- lattifai/cli/transcribe.py +4 -9
- lattifai/cli/youtube.py +4 -1
- lattifai/client.py +48 -84
- lattifai/config/__init__.py +11 -1
- lattifai/config/alignment.py +9 -2
- lattifai/config/caption.py +267 -23
- lattifai/config/media.py +20 -0
- lattifai/diarization/__init__.py +41 -1
- lattifai/mixin.py +36 -18
- lattifai/transcription/base.py +6 -1
- lattifai/transcription/lattifai.py +19 -54
- lattifai/utils.py +81 -13
- lattifai/workflow/__init__.py +28 -4
- lattifai/workflow/file_manager.py +2 -5
- lattifai/youtube/__init__.py +43 -0
- lattifai/youtube/client.py +1170 -0
- lattifai/youtube/types.py +23 -0
- lattifai-1.2.2.dist-info/METADATA +615 -0
- lattifai-1.2.2.dist-info/RECORD +76 -0
- {lattifai-1.2.0.dist-info → lattifai-1.2.2.dist-info}/entry_points.txt +1 -2
- lattifai/caption/gemini_reader.py +0 -371
- lattifai/caption/gemini_writer.py +0 -173
- lattifai/cli/app_installer.py +0 -142
- lattifai/cli/server.py +0 -44
- lattifai/server/app.py +0 -427
- lattifai/workflow/youtube.py +0 -577
- lattifai-1.2.0.dist-info/METADATA +0 -1133
- lattifai-1.2.0.dist-info/RECORD +0 -57
- {lattifai-1.2.0.dist-info → lattifai-1.2.2.dist-info}/WHEEL +0 -0
- {lattifai-1.2.0.dist-info → lattifai-1.2.2.dist-info}/licenses/LICENSE +0 -0
- {lattifai-1.2.0.dist-info → lattifai-1.2.2.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,722 @@
|
|
|
1
|
+
"""Gemini/YouTube transcript format handler.
|
|
2
|
+
|
|
3
|
+
Handles YouTube/Gemini markdown transcript format with timestamps like [HH:MM:SS].
|
|
4
|
+
Supports reading and writing transcript files with speaker labels, events, and sections.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import re
|
|
8
|
+
import tempfile
|
|
9
|
+
from dataclasses import dataclass
|
|
10
|
+
from pathlib import Path
|
|
11
|
+
from typing import Dict, List, Optional, Union
|
|
12
|
+
|
|
13
|
+
from lhotse.utils import Pathlike
|
|
14
|
+
|
|
15
|
+
from ..supervision import Supervision
|
|
16
|
+
from . import register_format
|
|
17
|
+
from .base import FormatHandler
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
@dataclass
class GeminiSegment:
    """One parsed unit of a Gemini/YouTube transcript plus its metadata.

    Fields mirror what the reader can recover from a transcript line:
    optional start/end timestamps (seconds), an optional speaker label,
    the enclosing section title, and the 1-based source line number.
    """

    text: str
    # Start time in seconds; kept under the historical name `timestamp`
    # for backward compatibility.
    timestamp: Optional[float] = None
    # End time in seconds, set when the timestamp appeared at the end of a line.
    end_timestamp: Optional[float] = None
    speaker: Optional[str] = None
    section: Optional[str] = None
    # One of: 'dialogue', 'event', or 'section_header'.
    segment_type: str = "dialogue"
    line_number: int = 0

    @property
    def start(self) -> float:
        """Start time in seconds (0.0 when no start timestamp was parsed)."""
        if self.timestamp is None:
            return 0.0
        return self.timestamp

    @property
    def end(self) -> Optional[float]:
        """End time in seconds, or None when not known."""
        return self.end_timestamp
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
class GeminiReader:
    """Parser for YouTube transcript format with speaker labels and timestamps.

    Recognizes, in priority order per line: markdown section headers,
    YouTube-link section headers, standalone timestamps, event lines,
    speaker dialogue lines, and plain text (with optional inline
    leading/trailing timestamps). Unrecognized plain text is merged into
    the previous dialogue segment when that segment has no end time yet.
    """

    # Regex patterns for parsing (supports both [HH:MM:SS] and [MM:SS] formats)
    TIMESTAMP_PATTERN = re.compile(r"\[(\d{1,2}):(\d{2}):(\d{2})\]|\[(\d{1,2}):(\d{2})\]")
    SECTION_HEADER_PATTERN = re.compile(r"^##\s*\[(\d{1,2}):(\d{2}):(\d{2})\]\s*(.+)$")
    # Matches "**Speaker:** text"; accepts ASCII and full-width colons.
    SPEAKER_PATTERN = re.compile(r"^\*\*(.+?[::])\*\*\s*(.+)$")
    # Event pattern: [Event] [HH:MM:SS] or [Event] [MM:SS] - prioritize HH:MM:SS format
    EVENT_PATTERN = re.compile(r"^\[([^\]]+)\]\s*\[(\d{1,2}):(\d{2})(?::(\d{2}))?\]$")
    # Timestamp at the end indicates end time.
    # Groups: (text, h, m, s, m, s) — exactly one of the (h,m,s)/(m,s) branches matches.
    INLINE_TIMESTAMP_END_PATTERN = re.compile(r"^(.+?)\s*\[(?:(\d{1,2}):(\d{2}):(\d{2})|(\d{1,2}):(\d{2}))\]$")
    # Timestamp at the beginning indicates start time.
    # Groups: (h, m, s, m, s, text).
    INLINE_TIMESTAMP_START_PATTERN = re.compile(r"^\[(?:(\d{1,2}):(\d{2}):(\d{2})|(\d{1,2}):(\d{2}))\]\s*(.+)$")
    # Standalone timestamp on its own line
    STANDALONE_TIMESTAMP_PATTERN = re.compile(r"^\[(?:(\d{1,2}):(\d{2}):(\d{2})|(\d{1,2}):(\d{2}))\]$")

    # New patterns for YouTube link format: [[MM:SS](URL&t=seconds)]
    YOUTUBE_SECTION_PATTERN = re.compile(r"^##\s*\[\[(\d{1,2}):(\d{2})\]\([^)]*&t=(\d+)\)\]\s*(.+)$")
    YOUTUBE_INLINE_PATTERN = re.compile(r"^(.+?)\s*\[\[(\d{1,2}):(\d{2})\]\([^)]*&t=(\d+)\)\]$")

    @classmethod
    def parse_timestamp(cls, *args) -> float:
        """Convert timestamp components to seconds.

        Supports both HH:MM:SS and MM:SS formats.
        Args can be (hours, minutes, seconds) or (minutes, seconds).
        Can also accept a single argument which is seconds
        (e.g. a YouTube URL ``&t=`` value).

        Raises:
            ValueError: if not given exactly 1, 2 or 3 components.
        """
        if len(args) == 3:
            # HH:MM:SS format
            hours, minutes, seconds = args
            return int(hours) * 3600 + int(minutes) * 60 + int(seconds)
        elif len(args) == 2:
            # MM:SS format
            minutes, seconds = args
            return int(minutes) * 60 + int(seconds)
        elif len(args) == 1:
            # Direct seconds (from YouTube &t= parameter)
            return int(args[0])
        else:
            raise ValueError(f"Invalid timestamp args: {args}")

    @classmethod
    def read(
        cls,
        transcript_path: Union[Pathlike, str],
        include_events: bool = False,
        include_sections: bool = False,
    ) -> List[GeminiSegment]:
        """Parse YouTube transcript file or content and return list of transcript segments.

        Args:
            transcript_path: Path to the transcript file or raw string content
            include_events: Whether to include event descriptions like [Applause]
            include_sections: Whether to include section headers

        Returns:
            List of GeminiSegment objects with all metadata
        """
        content = ""
        # Heuristic: a multi-line or very long (>1000 chars) string is treated
        # as raw transcript content rather than a filesystem path.
        is_content = "\n" in str(transcript_path) or len(str(transcript_path)) > 1000

        if is_content:
            content = str(transcript_path)
        else:
            p = Path(transcript_path).expanduser().resolve()
            if p.exists() and p.is_file():
                with open(p, "r", encoding="utf-8") as f:
                    content = f.read()
            else:
                # Fallback: treat as content if path doesn't exist
                content = str(transcript_path)

        segments: List[GeminiSegment] = []
        current_section = None
        current_speaker = None

        lines = content.splitlines()
        for line_num, line in enumerate(lines, start=1):
            line = line.strip()
            if not line:
                continue

            # Skip table-of-contents bullet entries and the ToC heading.
            if line.startswith("* ["):
                continue
            if line.startswith("## Table of Contents"):
                continue

            # Parse section headers: "## [HH:MM:SS] Title"
            section_match = cls.SECTION_HEADER_PATTERN.match(line)
            if section_match:
                hours, minutes, seconds, section_title = section_match.groups()
                timestamp = cls.parse_timestamp(hours, minutes, seconds)
                current_section = section_title.strip()
                if include_sections:
                    segments.append(
                        GeminiSegment(
                            text=section_title.strip(),
                            timestamp=timestamp,
                            section=current_section,
                            segment_type="section_header",
                            line_number=line_num,
                        )
                    )
                continue

            # Parse YouTube format section headers: "## [[MM:SS](URL&t=N)] Title"
            youtube_section_match = cls.YOUTUBE_SECTION_PATTERN.match(line)
            if youtube_section_match:
                minutes, seconds, url_seconds, section_title = youtube_section_match.groups()
                # The URL's &t= value is authoritative; the MM:SS label is ignored.
                timestamp = cls.parse_timestamp(url_seconds)
                current_section = section_title.strip()
                if include_sections:
                    segments.append(
                        GeminiSegment(
                            text=section_title.strip(),
                            timestamp=timestamp,
                            section=current_section,
                            segment_type="section_header",
                            line_number=line_num,
                        )
                    )
                continue

            # Parse standalone timestamp [HH:MM:SS]
            # Often used as an end timestamp for the preceding block
            standalone_match = cls.STANDALONE_TIMESTAMP_PATTERN.match(line)
            if standalone_match:
                groups = standalone_match.groups()
                if groups[0] is not None:
                    ts = cls.parse_timestamp(groups[0], groups[1], groups[2])
                else:
                    ts = cls.parse_timestamp(groups[3], groups[4])

                # Assign to previous dialogue segment if it doesn't have an end time
                if segments and segments[-1].segment_type == "dialogue":
                    if segments[-1].end_timestamp is None:
                        segments[-1].end_timestamp = ts
                    elif segments[-1].timestamp is None:
                        # If it has an end but no start, this standalone might be its start?
                        # Usually standalone is end, but let's be flexible
                        segments[-1].timestamp = ts
                continue

            # Parse event descriptions: "[event] [HH:MM:SS]" or "[event] [MM:SS]"
            event_match = cls.EVENT_PATTERN.match(line)
            if event_match:
                groups = event_match.groups()
                event_text = groups[0]
                hours_or_minutes = groups[1]
                minutes_or_seconds = groups[2]
                seconds_optional = groups[3]

                if seconds_optional is not None:
                    timestamp = cls.parse_timestamp(hours_or_minutes, minutes_or_seconds, seconds_optional)
                else:
                    timestamp = cls.parse_timestamp(hours_or_minutes, minutes_or_seconds)

                if include_events and timestamp is not None:
                    segments.append(
                        GeminiSegment(
                            text=f"[{event_text.strip()}]",
                            timestamp=timestamp,
                            section=current_section,
                            segment_type="event",
                            line_number=line_num,
                        )
                    )
                continue

            # Parse speaker dialogue: **Speaker:** Text [HH:MM:SS]
            speaker_match = cls.SPEAKER_PATTERN.match(line)
            if speaker_match:
                speaker, text_with_timestamp = speaker_match.groups()
                current_speaker = speaker.strip()

                start_match = cls.INLINE_TIMESTAMP_START_PATTERN.match(text_with_timestamp.strip())
                end_match = cls.INLINE_TIMESTAMP_END_PATTERN.match(text_with_timestamp.strip())
                youtube_match = cls.YOUTUBE_INLINE_PATTERN.match(text_with_timestamp.strip())

                start_timestamp = None
                end_timestamp = None
                text = text_with_timestamp.strip()

                # Precedence: leading timestamp > trailing timestamp > YouTube link.
                if start_match:
                    groups = start_match.groups()
                    if groups[0] is not None:
                        start_timestamp = cls.parse_timestamp(groups[0], groups[1], groups[2])
                    elif groups[3] is not None:
                        start_timestamp = cls.parse_timestamp(groups[3], groups[4])
                    text = groups[5]
                elif end_match:
                    groups = end_match.groups()
                    text = groups[0]
                    if groups[1] is not None:
                        end_timestamp = cls.parse_timestamp(groups[1], groups[2], groups[3])
                    elif groups[4] is not None:
                        end_timestamp = cls.parse_timestamp(groups[4], groups[5])
                elif youtube_match:
                    groups = youtube_match.groups()
                    text = groups[0]
                    url_seconds = groups[3]
                    end_timestamp = cls.parse_timestamp(url_seconds)

                segments.append(
                    GeminiSegment(
                        text=text.strip(),
                        timestamp=start_timestamp,
                        end_timestamp=end_timestamp,
                        speaker=current_speaker,
                        section=current_section,
                        segment_type="dialogue",
                        line_number=line_num,
                    )
                )
                # Speaker attribution applies only to this line; reset it so a
                # following plain-text line is not silently credited to them.
                current_speaker = None
                continue

            # Parse plain text (might contain inline timestamp or be a continuation)
            start_match = cls.INLINE_TIMESTAMP_START_PATTERN.match(line)
            end_match = cls.INLINE_TIMESTAMP_END_PATTERN.match(line)
            youtube_inline_match = cls.YOUTUBE_INLINE_PATTERN.match(line)

            if start_match:
                groups = start_match.groups()
                if groups[0] is not None:
                    start_timestamp = cls.parse_timestamp(groups[0], groups[1], groups[2])
                else:
                    start_timestamp = cls.parse_timestamp(groups[3], groups[4])
                text = groups[5]
                segments.append(
                    GeminiSegment(
                        text=text.strip(),
                        timestamp=start_timestamp,
                        speaker=current_speaker,
                        section=current_section,
                        segment_type="dialogue",
                        line_number=line_num,
                    )
                )
            elif end_match:
                groups = end_match.groups()
                text = groups[0]
                if groups[1] is not None:
                    end_timestamp = cls.parse_timestamp(groups[1], groups[2], groups[3])
                else:
                    end_timestamp = cls.parse_timestamp(groups[4], groups[5])
                segments.append(
                    GeminiSegment(
                        text=text.strip(),
                        end_timestamp=end_timestamp,
                        speaker=current_speaker,
                        section=current_section,
                        segment_type="dialogue",
                        line_number=line_num,
                    )
                )
            elif youtube_inline_match:
                groups = youtube_inline_match.groups()
                text = groups[0]
                url_seconds = groups[3]
                segments.append(
                    GeminiSegment(
                        text=text.strip(),
                        end_timestamp=cls.parse_timestamp(url_seconds),
                        speaker=current_speaker,
                        section=current_section,
                        segment_type="dialogue",
                        line_number=line_num,
                    )
                )
            else:
                # Plain text without any recognized markers
                # If it follows a speaker line or another dialogue line without end timestamp,
                # merge it into the last segment to support multi-line text blocks.
                if segments and segments[-1].segment_type == "dialogue" and segments[-1].end_timestamp is None:
                    segments[-1].text += " " + line.strip()
                else:
                    # Skip markdown headers and other formatting
                    if line.startswith("#"):
                        continue

                    segments.append(
                        GeminiSegment(
                            text=line.strip(),
                            speaker=current_speaker,
                            section=current_section,
                            segment_type="dialogue",
                            line_number=line_num,
                        )
                    )

        return segments

    @classmethod
    def extract_for_alignment(
        cls,
        transcript_path: Pathlike,
        merge_consecutive: bool = False,
        min_duration: float = 0.1,
        merge_max_gap: float = 2.0,
        normalize_text: bool = True,
        **kwargs,
    ) -> List[Supervision]:
        """Extract text segments for forced alignment.

        This extracts only dialogue segments (not events or section headers)
        and converts them to Supervision objects suitable for alignment.
        Missing start/end times are estimated from neighboring segments or
        from word count (~0.3 s per whitespace-separated word).

        Args:
            transcript_path: Path to the transcript file
            merge_consecutive: Whether to merge consecutive segments from same speaker
            min_duration: Minimum duration for a segment
            merge_max_gap: Maximum time gap (seconds) to merge consecutive segments
            normalize_text: NOTE(review) — accepted but not referenced anywhere
                in this body; confirm whether normalization happens upstream.
            **kwargs: ignored; absorbed for forward/backward call compatibility.

        Returns:
            List of Supervision objects ready for alignment

        Raises:
            ValueError: if no timestamped dialogue/event segments are found.
        """
        segments = cls.read(transcript_path, include_events=True, include_sections=False)

        # Filter to dialogue and event segments with timestamps (either start or end).
        # Events are kept here so they can anchor neighboring time estimates,
        # even though only dialogue becomes a Supervision below.
        dialogue_segments = [
            s
            for s in segments
            if s.segment_type in ("dialogue", "event") and (s.timestamp is not None or s.end_timestamp is not None)
        ]

        if not dialogue_segments:
            raise ValueError(f"No dialogue segments with timestamps found in {transcript_path}")

        # Sort by timestamp (use start time if available, otherwise end time)
        dialogue_segments.sort(key=lambda x: x.timestamp if x.timestamp is not None else x.end_timestamp)

        # Convert to Supervision objects
        supervisions: List[Supervision] = []
        prev_end_time = 0.0

        for i, segment in enumerate(dialogue_segments):
            seg_start = None
            seg_end = None

            # Determine start and end times based on available timestamps
            if segment.timestamp is not None:
                # Has start time
                seg_start = segment.timestamp
                if segment.end_timestamp is not None:
                    # Has both start and end
                    seg_end = segment.end_timestamp
                else:
                    # Only has start, estimate end
                    if i < len(dialogue_segments) - 1:
                        # Use next segment's time
                        next_seg = dialogue_segments[i + 1]
                        if next_seg.timestamp is not None:
                            seg_end = next_seg.timestamp
                        elif next_seg.end_timestamp is not None:
                            # Next has only end, estimate its start and use that
                            words_next = len(next_seg.text.split())
                            estimated_duration_next = words_next * 0.3
                            seg_end = next_seg.end_timestamp - estimated_duration_next

                    if seg_end is None:
                        # Estimate based on text length (~0.3 s per word)
                        words = len(segment.text.split())
                        seg_end = seg_start + words * 0.3

            elif segment.end_timestamp is not None:
                # Only has end time, need to infer start
                seg_end = segment.end_timestamp
                # Use previous segment's end time as start, or estimate based on text
                if prev_end_time > 0:
                    seg_start = prev_end_time
                else:
                    # Estimate start based on text length
                    words = len(segment.text.split())
                    estimated_duration = words * 0.3
                    seg_start = seg_end - estimated_duration

            if seg_start is not None and seg_end is not None:
                duration = max(seg_end - seg_start, min_duration)
                # Only dialogue becomes a Supervision; events contribute
                # nothing beyond the time anchoring above.
                if segment.segment_type == "dialogue":
                    supervisions.append(
                        Supervision(
                            text=segment.text.strip(),
                            start=seg_start,
                            duration=duration,
                            id=f"segment_{i:05d}",
                            speaker=segment.speaker,
                        )
                    )
                    prev_end_time = seg_start + duration

        # Optionally merge consecutive segments from same speaker
        if merge_consecutive:
            merged = []
            current_speaker = None
            current_texts = []
            current_start = None
            last_end_time = None

            # NOTE(review): dialogue_segments may still contain events, while
            # supervisions holds dialogue only — if any timestamped event was
            # present, this zip pairs segments with the wrong supervisions.
            # Confirm inputs or filter events before zipping.
            for i, (segment, sup) in enumerate(zip(dialogue_segments, supervisions)):
                # Check if we should merge with previous segment
                should_merge = False
                if segment.speaker == current_speaker and current_start is not None:
                    # Same speaker - check time gap
                    time_gap = sup.start - last_end_time if last_end_time else 0
                    if time_gap <= merge_max_gap:
                        should_merge = True

                if should_merge:
                    # Same speaker within time threshold, accumulate
                    current_texts.append(segment.text)
                    last_end_time = sup.start + sup.duration
                else:
                    # Different speaker or gap too large, save previous segment
                    if current_texts:
                        merged_text = " ".join(current_texts)
                        merged.append(
                            Supervision(
                                text=merged_text,
                                start=current_start,
                                duration=last_end_time - current_start,
                                id=f"merged_{len(merged):05d}",
                            )
                        )
                    current_speaker = segment.speaker
                    current_texts = [segment.text]
                    current_start = sup.start
                    last_end_time = sup.start + sup.duration

            # Add final segment
            if current_texts:
                merged_text = " ".join(current_texts)
                merged.append(
                    Supervision(
                        text=merged_text,
                        start=current_start,
                        duration=last_end_time - current_start,
                        id=f"merged_{len(merged):05d}",
                    )
                )

            supervisions = merged

        return supervisions
|
|
490
|
+
|
|
491
|
+
|
|
492
|
+
# NOTE(review): a second `__all__ = ["GeminiWriter"]` assignment later in this
# module overwrites this list, so these names are dropped from the `import *`
# surface — confirm whether the two lists should be merged into one.
__all__ = ["GeminiReader", "GeminiSegment"]
|
|
493
|
+
|
|
494
|
+
|
|
495
|
+
class GeminiWriter:
    """Writer for updating YouTube transcript timestamps based on alignment results."""

    @staticmethod
    def format_timestamp(seconds: float) -> str:
        """Convert seconds to [HH:MM:SS] format (fractional seconds truncated)."""
        hours = int(seconds // 3600)
        minutes = int((seconds % 3600) // 60)
        secs = int(seconds % 60)
        return f"[{hours:02d}:{minutes:02d}:{secs:02d}]"

    @classmethod
    def update_timestamps(
        cls,
        original_transcript: Pathlike,
        aligned_supervisions: List[Supervision],
        output_path: Pathlike,
        timestamp_mapping: Optional[Dict[int, float]] = None,
    ) -> Pathlike:
        """Update transcript file with corrected timestamps from alignment.

        Args:
            original_transcript: Path to the original transcript file
            aligned_supervisions: List of aligned Supervision objects with corrected timestamps
            output_path: Path to write the updated transcript
            timestamp_mapping: Optional manual mapping from line_number to new timestamp;
                computed via text matching when omitted.

        Returns:
            Path to the output file
        """
        original_path = Path(original_transcript)
        output_path = Path(output_path)

        # Read original file
        with open(original_path, "r", encoding="utf-8") as f:
            lines = f.readlines()

        # Parse original segments to get line numbers
        original_segments = GeminiReader.read(original_transcript, include_events=True, include_sections=True)

        # Create mapping from line number to new timestamp
        if timestamp_mapping is None:
            timestamp_mapping = cls._create_timestamp_mapping(original_segments, aligned_supervisions)

        # Update timestamps in lines (1-based, matching GeminiSegment.line_number)
        updated_lines = []
        for line_num, line in enumerate(lines, start=1):
            if line_num in timestamp_mapping:
                new_timestamp = timestamp_mapping[line_num]
                updated_line = cls._replace_timestamp(line, new_timestamp)
                updated_lines.append(updated_line)
            else:
                updated_lines.append(line)

        # Write updated content
        output_path.parent.mkdir(parents=True, exist_ok=True)
        with open(output_path, "w", encoding="utf-8") as f:
            f.writelines(updated_lines)

        return output_path

    @classmethod
    def _create_timestamp_mapping(
        cls, original_segments: List[GeminiSegment], aligned_supervisions: List[Supervision]
    ) -> Dict[int, float]:
        """Create mapping from line numbers to new timestamps based on alignment.

        This performs text matching between original segments and aligned supervisions
        to determine which timestamps should be updated. A match is accepted only
        when the similarity score (length ratio of substring matches, or 1.0 for
        exact equality) exceeds 0.8.
        """
        mapping = {}

        # Create a simple text-based matching over dialogue segments only
        dialogue_segments = [s for s in original_segments if s.segment_type == "dialogue"]

        # Try to match based on text content
        for aligned_sup in aligned_supervisions:
            aligned_text = aligned_sup.text.strip()

            # Find best matching original segment
            best_match = None
            best_score = 0

            for orig_seg in dialogue_segments:
                orig_text = orig_seg.text.strip()

                # Simple text similarity (could be improved with fuzzy matching)
                if aligned_text == orig_text:
                    best_match = orig_seg
                    best_score = 1.0
                    break
                elif aligned_text in orig_text or orig_text in aligned_text:
                    score = min(len(aligned_text), len(orig_text)) / max(len(aligned_text), len(orig_text))
                    if score > best_score:
                        best_score = score
                        best_match = orig_seg

            # If we found a good match, update the mapping
            if best_match and best_score > 0.8:
                mapping[best_match.line_number] = aligned_sup.start

        return mapping

    @classmethod
    def _replace_timestamp(cls, line: str, new_timestamp: float) -> str:
        """Replace timestamp in a line with new timestamp.

        NOTE(review): this substitutes EVERY [HH:MM:SS] occurrence on the line
        (including a trailing end timestamp) with the same new value, and only
        matches 2-digit hour timestamps even though the reader accepts 1-digit
        hours and [MM:SS] — confirm this is the intended behavior.
        """
        new_ts_str = cls.format_timestamp(new_timestamp)

        # Replace timestamp patterns
        # Pattern 1: [HH:MM:SS] at the end or in brackets
        line = re.sub(r"\[\d{2}:\d{2}:\d{2}\]", new_ts_str, line)

        return line

    @classmethod
    def write_aligned_transcript(
        cls,
        aligned_supervisions: List[Supervision],
        output_path: Pathlike,
        include_word_timestamps: bool = False,
    ) -> Pathlike:
        """Write a new transcript file from aligned supervisions.

        This creates a simplified transcript format with accurate timestamps.
        Speaker labels are not emitted; each segment becomes one
        "[HH:MM:SS] text" line followed by a blank line.

        Args:
            aligned_supervisions: List of aligned Supervision objects
            output_path: Path to write the transcript
            include_word_timestamps: Whether to include word-level timestamps if available

        Returns:
            Path to the output file
        """
        output_path = Path(output_path)
        output_path.parent.mkdir(parents=True, exist_ok=True)

        with open(output_path, "w", encoding="utf-8") as f:
            f.write("# Aligned Transcript\n\n")

            for i, sup in enumerate(aligned_supervisions):
                # Write segment with timestamp
                start_ts = cls.format_timestamp(sup.start)
                f.write(f"{start_ts} {sup.text}\n")

                # Optionally write word-level timestamps
                # (assumes sup.alignment["word"] entries carry "start" and
                # "symbol" keys — TODO confirm against the Supervision type)
                if include_word_timestamps and hasattr(sup, "alignment") and sup.alignment:
                    if "word" in sup.alignment:
                        f.write(" Words: ")
                        word_parts = []
                        for word_info in sup.alignment["word"]:
                            word_ts = cls.format_timestamp(word_info["start"])
                            word_parts.append(f'{word_info["symbol"]}{word_ts}')
                        f.write(" ".join(word_parts))
                        f.write("\n")

                f.write("\n")

        return output_path

    @classmethod
    def write(
        cls,
        supervisions: List[Supervision],
        output_path: Pathlike,
        **kwargs,
    ) -> Path:
        """Alias for write_aligned_transcript for Caption API compatibility."""
        return Path(cls.write_aligned_transcript(supervisions, output_path, **kwargs))

    @classmethod
    def to_bytes(
        cls,
        supervisions: List[Supervision],
        **kwargs,
    ) -> bytes:
        """Convert aligned supervisions to Gemini format bytes.

        Writes to a temporary .md file, reads it back, and always removes
        the temp file (delete=False + explicit unlink so the bytes can be
        read after the handle is closed, portably on Windows too).
        """
        with tempfile.NamedTemporaryFile(suffix=".md", delete=False) as tmp:
            tmp_path = Path(tmp.name)
        try:
            cls.write_aligned_transcript(supervisions, tmp_path, **kwargs)
            return tmp_path.read_bytes()
        finally:
            tmp_path.unlink(missing_ok=True)
|
|
678
|
+
|
|
679
|
+
|
|
680
|
+
# Merge with the earlier export list instead of clobbering it: a bare
# `__all__ = ["GeminiWriter"]` re-assignment here previously replaced the
# `__all__ = ["GeminiReader", "GeminiSegment"]` defined above, silently
# hiding those names from `from ... import *`.
__all__ = ["GeminiReader", "GeminiSegment", "GeminiWriter"]
|
|
681
|
+
|
|
682
|
+
|
|
683
|
+
@register_format("gemini")
class GeminiFormat(FormatHandler):
    """YouTube/Gemini markdown transcript format."""

    extensions = [".md"]
    description = "YouTube/Gemini transcript format with timestamps"

    @classmethod
    def can_read(cls, path) -> bool:
        """Check if this is a Gemini format file.

        A path qualifies when it (case-insensitively) contains "gemini" and
        ends with ".md". The previous explicit checks for "gemini.md" and
        "gemini3.md" were fully subsumed by this condition and have been
        removed; accepted paths are unchanged.
        """
        path_str = str(path).lower()
        return "gemini" in path_str and path_str.endswith(".md")

    @classmethod
    def read(cls, path: Pathlike, **kwargs) -> List[Supervision]:
        """Read a Gemini format file into Supervision objects.

        Extra keyword arguments are forwarded to
        ``GeminiReader.extract_for_alignment``.
        """
        return GeminiReader.extract_for_alignment(path, **kwargs)

    @classmethod
    def write(
        cls,
        supervisions: List[Supervision],
        output_path: Pathlike,
        **kwargs,
    ) -> Path:
        """Write supervisions as a Gemini-format transcript file."""
        return GeminiWriter.write(supervisions, output_path, **kwargs)

    @classmethod
    def to_bytes(
        cls,
        supervisions: List[Supervision],
        **kwargs,
    ) -> bytes:
        """Serialize supervisions to Gemini format bytes."""
        return GeminiWriter.to_bytes(supervisions, **kwargs)
|