lattifai 1.2.1__py3-none-any.whl → 1.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lattifai/_init.py +20 -0
- lattifai/alignment/__init__.py +9 -1
- lattifai/alignment/lattice1_aligner.py +175 -54
- lattifai/alignment/lattice1_worker.py +47 -4
- lattifai/alignment/punctuation.py +38 -0
- lattifai/alignment/segmenter.py +3 -2
- lattifai/alignment/text_align.py +441 -0
- lattifai/alignment/tokenizer.py +134 -65
- lattifai/audio2.py +162 -183
- lattifai/cli/__init__.py +2 -1
- lattifai/cli/alignment.py +5 -0
- lattifai/cli/caption.py +111 -4
- lattifai/cli/transcribe.py +2 -6
- lattifai/cli/youtube.py +7 -1
- lattifai/client.py +72 -123
- lattifai/config/__init__.py +28 -0
- lattifai/config/alignment.py +14 -0
- lattifai/config/caption.py +45 -31
- lattifai/config/client.py +16 -0
- lattifai/config/event.py +102 -0
- lattifai/config/media.py +20 -0
- lattifai/config/transcription.py +25 -1
- lattifai/data/__init__.py +8 -0
- lattifai/data/caption.py +228 -0
- lattifai/diarization/__init__.py +41 -1
- lattifai/errors.py +78 -53
- lattifai/event/__init__.py +65 -0
- lattifai/event/lattifai.py +166 -0
- lattifai/mixin.py +49 -32
- lattifai/transcription/base.py +8 -2
- lattifai/transcription/gemini.py +147 -16
- lattifai/transcription/lattifai.py +25 -63
- lattifai/types.py +1 -1
- lattifai/utils.py +7 -13
- lattifai/workflow/__init__.py +28 -4
- lattifai/workflow/file_manager.py +2 -5
- lattifai/youtube/__init__.py +43 -0
- lattifai/youtube/client.py +1265 -0
- lattifai/youtube/types.py +23 -0
- lattifai-1.3.0.dist-info/METADATA +678 -0
- lattifai-1.3.0.dist-info/RECORD +57 -0
- {lattifai-1.2.1.dist-info → lattifai-1.3.0.dist-info}/entry_points.txt +1 -2
- lattifai/__init__.py +0 -88
- lattifai/alignment/sentence_splitter.py +0 -219
- lattifai/caption/__init__.py +0 -20
- lattifai/caption/caption.py +0 -1467
- lattifai/caption/gemini_reader.py +0 -462
- lattifai/caption/gemini_writer.py +0 -173
- lattifai/caption/supervision.py +0 -34
- lattifai/caption/text_parser.py +0 -145
- lattifai/cli/app_installer.py +0 -142
- lattifai/cli/server.py +0 -44
- lattifai/server/app.py +0 -427
- lattifai/workflow/youtube.py +0 -577
- lattifai-1.2.1.dist-info/METADATA +0 -1134
- lattifai-1.2.1.dist-info/RECORD +0 -58
- {lattifai-1.2.1.dist-info → lattifai-1.3.0.dist-info}/WHEEL +0 -0
- {lattifai-1.2.1.dist-info → lattifai-1.3.0.dist-info}/licenses/LICENSE +0 -0
- {lattifai-1.2.1.dist-info → lattifai-1.3.0.dist-info}/top_level.txt +0 -0
|
@@ -1,462 +0,0 @@
|
|
|
1
|
-
"""Reader for YouTube transcript files with speaker labels and timestamps."""
|
|
2
|
-
|
|
3
|
-
import re
|
|
4
|
-
from dataclasses import dataclass
|
|
5
|
-
from pathlib import Path
|
|
6
|
-
from typing import List, Optional
|
|
7
|
-
|
|
8
|
-
from lhotse.utils import Pathlike
|
|
9
|
-
|
|
10
|
-
from .supervision import Supervision
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
@dataclass
class GeminiSegment:
    """A single parsed unit of a Gemini transcript plus its metadata.

    A segment is one of three kinds, recorded in ``segment_type``:
    ``"dialogue"``, ``"event"`` or ``"section_header"``.
    """

    # The textual content of the segment.
    text: str
    # Start time in seconds; kept under this name for backward compatibility.
    timestamp: Optional[float] = None
    # End time in seconds, set when the timestamp was found at the end of the line.
    end_timestamp: Optional[float] = None
    # Speaker label, when the line carried one.
    speaker: Optional[str] = None
    # Title of the section the segment belongs to, if any.
    section: Optional[str] = None
    # 'dialogue', 'event', or 'section_header'
    segment_type: str = "dialogue"
    # 1-based line number of the segment in the transcript file.
    line_number: int = 0

    @property
    def start(self) -> float:
        """Start time in seconds; ``0.0`` when no start timestamp is known."""
        if self.timestamp is None:
            return 0.0
        return self.timestamp

    @property
    def end(self) -> Optional[float]:
        """End time in seconds, or ``None`` when it is not available."""
        return self.end_timestamp
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
class GeminiReader:
    """Parser for YouTube transcript format with speaker labels and timestamps.

    Line shapes recognised by :meth:`read` (each input line is stripped first):

    * ``## [HH:MM:SS] Title`` or ``## [[MM:SS](URL&t=seconds)] Title`` -- section header
    * ``[Event] [HH:MM:SS]`` or ``[Event] [MM:SS]`` -- event description
    * ``**Speaker:** text`` with an optional inline timestamp -- dialogue
    * any other line carrying an inline timestamp -- dialogue without a speaker

    An inline timestamp at the beginning of the text is a start time; one at
    the end of the text (plain or YouTube-link form) is an end time.
    """

    # Regex patterns for parsing (supports both [HH:MM:SS] and [MM:SS] formats)
    TIMESTAMP_PATTERN = re.compile(r"\[(\d{1,2}):(\d{2}):(\d{2})\]|\[(\d{1,2}):(\d{2})\]")
    SECTION_HEADER_PATTERN = re.compile(r"^##\s*\[(\d{1,2}):(\d{2}):(\d{2})\]\s*(.+)$")
    # NOTE(review): the character class lists ':' twice; the second entry was
    # presumably a full-width colon before this file was re-encoded -- confirm.
    SPEAKER_PATTERN = re.compile(r"^\*\*(.+?[::])\*\*\s*(.+)$")
    # Event pattern: [Event] [HH:MM:SS] or [Event] [MM:SS] - prioritize HH:MM:SS format
    EVENT_PATTERN = re.compile(r"^\[([^\]]+)\]\s*\[(\d{1,2}):(\d{2})(?::(\d{2}))?\]$")
    # Timestamp at the end indicates end time
    INLINE_TIMESTAMP_END_PATTERN = re.compile(r"^(.+?)\s*\[(?:(\d{1,2}):(\d{2}):(\d{2})|(\d{1,2}):(\d{2}))\]$")
    # Timestamp at the beginning indicates start time
    INLINE_TIMESTAMP_START_PATTERN = re.compile(r"^\[(?:(\d{1,2}):(\d{2}):(\d{2})|(\d{1,2}):(\d{2}))\]\s*(.+)$")

    # New patterns for YouTube link format: [[MM:SS](URL&t=seconds)]
    YOUTUBE_SECTION_PATTERN = re.compile(r"^##\s*\[\[(\d{1,2}):(\d{2})\]\([^)]*&t=(\d+)\)\]\s*(.+)$")
    YOUTUBE_INLINE_PATTERN = re.compile(r"^(.+?)\s*\[\[(\d{1,2}):(\d{2})\]\([^)]*&t=(\d+)\)\]$")

    # Table-of-contents lines that carry no transcript content.
    _SKIPPED_PREFIXES = ("* [", "## Table of Contents")
    # Speaking-rate heuristic used when a segment boundary has to be estimated.
    _SECONDS_PER_WORD = 0.3

    @classmethod
    def parse_timestamp(cls, *args) -> float:
        """Convert timestamp components to whole seconds.

        Args:
            *args: ``(hours, minutes, seconds)``, ``(minutes, seconds)`` or a
                single ``(seconds,)`` value (e.g. a YouTube ``&t=`` parameter).
                Each component is passed through ``int()``.

        Returns:
            The timestamp in seconds.

        Raises:
            ValueError: If zero or more than three components are given, or a
                component cannot be converted by ``int()``.
        """
        if not 1 <= len(args) <= 3:
            raise ValueError(f"Invalid timestamp args: {args}")
        total = 0
        # Base-60 accumulation: h, m, s -> (h * 60 + m) * 60 + s
        for component in args:
            total = total * 60 + int(component)
        return total

    @classmethod
    def _seconds_from_groups(cls, groups):
        """Convert regex timestamp groups to seconds, ignoring unmatched (None) groups."""
        return cls.parse_timestamp(*(g for g in groups if g is not None))

    @classmethod
    def _estimate_duration(cls, text):
        """Estimate how long ``text`` takes to speak from its word count."""
        return len(text.split()) * cls._SECONDS_PER_WORD

    @classmethod
    def _parse_section_header(cls, line):
        """Return ``(timestamp, title)`` if ``line`` is a section header, else ``None``."""
        plain = cls.SECTION_HEADER_PATTERN.match(line)
        if plain:
            *stamp, title = plain.groups()
            return cls.parse_timestamp(*stamp), title.strip()
        linked = cls.YOUTUBE_SECTION_PATTERN.match(line)
        if linked:
            _minutes, _seconds, url_seconds, title = linked.groups()
            # Use the URL seconds for more accuracy
            return cls.parse_timestamp(url_seconds), title.strip()
        return None

    @classmethod
    def _split_inline_timestamp(cls, text):
        """Separate inline timestamps from ``text``.

        Returns:
            ``(clean_text, start_seconds, end_seconds)``. A leading timestamp
            fills ``start_seconds``; a trailing plain or YouTube-link timestamp
            fills ``end_seconds``. When the text carries no inline timestamp at
            all the result is ``(None, None, None)``.
        """
        start = end = None
        found = False

        leading = cls.INLINE_TIMESTAMP_START_PATTERN.match(text)
        if leading:
            *stamp, text = leading.groups()
            start = cls._seconds_from_groups(stamp)
            found = True

        # Checked even after a leading match so that "[00:01] text [00:05]"
        # yields both boundaries instead of leaving "[00:05]" inside the text.
        trailing = cls.INLINE_TIMESTAMP_END_PATTERN.match(text)
        if trailing:
            text, *stamp = trailing.groups()
            end = cls._seconds_from_groups(stamp)
            found = True
        else:
            linked = cls.YOUTUBE_INLINE_PATTERN.match(text)
            if linked:
                text = linked.group(1)
                # Extract seconds from URL parameter (treat as end time)
                end = cls.parse_timestamp(linked.group(4))
                found = True

        if not found:
            return None, None, None
        return text, start, end

    @classmethod
    def read(
        cls,
        transcript_path: Pathlike,
        include_events: bool = False,
        include_sections: bool = False,
    ) -> List[GeminiSegment]:
        """Parse YouTube transcript file and return list of transcript segments.

        Lines without any recognised structure or inline timestamp (including
        table-of-contents entries and bare markdown headers) are ignored.

        Args:
            transcript_path: Path to the transcript file
            include_events: Whether to include event descriptions like [Applause]
            include_sections: Whether to include section headers

        Returns:
            List of GeminiSegment objects with all metadata

        Raises:
            FileNotFoundError: If ``transcript_path`` does not exist.
        """
        transcript_path = Path(transcript_path).expanduser().resolve()
        if not transcript_path.exists():
            raise FileNotFoundError(f"Transcript file not found: {transcript_path}")

        with open(transcript_path, "r", encoding="utf-8") as f:
            lines = f.readlines()

        segments: List[GeminiSegment] = []
        current_section = None

        for line_num, raw_line in enumerate(lines, start=1):
            line = raw_line.strip()
            if not line or line.startswith(cls._SKIPPED_PREFIXES):
                continue

            # Section headers always update the running section, even when
            # they are not emitted as segments.
            header = cls._parse_section_header(line)
            if header is not None:
                timestamp, current_section = header
                if include_sections:
                    segments.append(
                        GeminiSegment(
                            text=current_section,
                            timestamp=timestamp,
                            section=current_section,
                            segment_type="section_header",
                            line_number=line_num,
                        )
                    )
                continue

            # Event descriptions: [event] [HH:MM:SS] or [MM:SS]
            event_match = cls.EVENT_PATTERN.match(line)
            if event_match:
                if include_events:
                    event_text, *stamp = event_match.groups()
                    segments.append(
                        GeminiSegment(
                            text=f"[{event_text.strip()}]",
                            timestamp=cls._seconds_from_groups(stamp),
                            section=current_section,
                            segment_type="event",
                            line_number=line_num,
                        )
                    )
                continue

            # Speaker dialogue: **Speaker:** Text, timestamp optional.
            speaker_match = cls.SPEAKER_PATTERN.match(line)
            if speaker_match:
                speaker, remainder = speaker_match.groups()
                remainder = remainder.strip()
                text, start_ts, end_ts = cls._split_inline_timestamp(remainder)
                if text is None:
                    text = remainder
                segments.append(
                    GeminiSegment(
                        text=text.strip(),
                        timestamp=start_ts,
                        end_timestamp=end_ts,
                        speaker=speaker.strip(),
                        section=current_section,
                        segment_type="dialogue",
                        line_number=line_num,
                    )
                )
                continue

            # Plain text is kept only when it carries a timestamp. The speaker
            # label applies to its own line only, so these segments have none.
            text, start_ts, end_ts = cls._split_inline_timestamp(line)
            if text is not None:
                segments.append(
                    GeminiSegment(
                        text=text.strip(),
                        timestamp=start_ts,
                        end_timestamp=end_ts,
                        speaker=None,
                        section=current_section,
                        segment_type="dialogue",
                        line_number=line_num,
                    )
                )

        return segments

    @classmethod
    def _merge_consecutive(cls, supervisions, merge_max_gap):
        """Merge runs of supervisions that share a speaker and are at most ``merge_max_gap`` apart."""
        merged = []
        run_speaker = None
        run_texts = []
        run_start = None
        run_end = None

        def flush():
            if run_texts:
                merged.append(
                    Supervision(
                        text=" ".join(run_texts),
                        start=run_start,
                        duration=run_end - run_start,
                        id=f"merged_{len(merged):05d}",
                        speaker=run_speaker,
                    )
                )

        for sup in supervisions:
            continues_run = (
                run_start is not None
                and sup.speaker == run_speaker
                and sup.start - run_end <= merge_max_gap
            )
            if continues_run:
                run_texts.append(sup.text)
            else:
                flush()
                run_speaker = sup.speaker
                run_texts = [sup.text]
                run_start = sup.start
            run_end = sup.start + sup.duration

        flush()
        return merged

    @classmethod
    def extract_for_alignment(
        cls,
        transcript_path: Pathlike,
        merge_consecutive: bool = False,
        min_duration: float = 0.1,
        merge_max_gap: float = 2.0,
    ) -> List[Supervision]:
        """Extract text segments for forced alignment.

        Only dialogue segments become Supervision objects. Event segments are
        read as well, but solely to bound the timing of neighbouring dialogue.
        Missing boundaries are estimated from the adjacent segment or, failing
        that, from the word count of the text.

        Args:
            transcript_path: Path to the transcript file
            merge_consecutive: Whether to merge consecutive segments from same speaker
            min_duration: Minimum duration for a segment
            merge_max_gap: Maximum time gap (seconds) to merge consecutive segments

        Returns:
            List of Supervision objects ready for alignment

        Raises:
            FileNotFoundError: If ``transcript_path`` does not exist.
            ValueError: If no dialogue or event segment carries a timestamp.
        """
        segments = cls.read(transcript_path, include_events=True, include_sections=False)

        # Filter to dialogue and event segments with timestamps (either start or end)
        timed_segments = [
            s
            for s in segments
            if s.segment_type in ("dialogue", "event") and (s.timestamp is not None or s.end_timestamp is not None)
        ]

        if not timed_segments:
            raise ValueError(f"No dialogue segments with timestamps found in {transcript_path}")

        # Sort by timestamp (use start time if available, otherwise end time)
        timed_segments.sort(key=lambda x: x.timestamp if x.timestamp is not None else x.end_timestamp)

        supervisions: List[Supervision] = []
        prev_end_time = 0.0

        for i, segment in enumerate(timed_segments):
            if segment.timestamp is not None:
                seg_start = segment.timestamp
                seg_end = segment.end_timestamp
                if seg_end is None and i + 1 < len(timed_segments):
                    # Only has start: borrow the boundary from the next segment.
                    next_seg = timed_segments[i + 1]
                    if next_seg.timestamp is not None:
                        seg_end = next_seg.timestamp
                    elif next_seg.end_timestamp is not None:
                        # Next has only end, estimate its start and use that
                        seg_end = next_seg.end_timestamp - cls._estimate_duration(next_seg.text)
                if seg_end is None:
                    # Last segment: estimate based on text length
                    seg_end = seg_start + cls._estimate_duration(segment.text)
            else:
                # Only has end time (guaranteed by the filter above), infer start.
                seg_end = segment.end_timestamp
                if prev_end_time > 0:
                    seg_start = prev_end_time
                else:
                    # Estimate from text length; never before the start of the media.
                    seg_start = max(0.0, seg_end - cls._estimate_duration(segment.text))

            duration = max(seg_end - seg_start, min_duration)
            if segment.segment_type == "dialogue":
                supervisions.append(
                    Supervision(
                        text=segment.text,
                        start=seg_start,
                        duration=duration,
                        id=f"segment_{i:05d}",
                        speaker=segment.speaker,
                    )
                )
            prev_end_time = seg_start + duration

        # Optionally merge consecutive segments from same speaker. The merge
        # works on the supervisions themselves: pairing them positionally with
        # timed_segments would misalign as soon as an event segment is present.
        if merge_consecutive:
            supervisions = cls._merge_consecutive(supervisions, merge_max_gap)

        return supervisions
|
|
460
|
-
|
|
461
|
-
|
|
462
|
-
__all__ = ["GeminiReader", "GeminiSegment"]
|
|
@@ -1,173 +0,0 @@
|
|
|
1
|
-
"""Writer for YouTube transcript files with corrected timestamps from alignment."""
|
|
2
|
-
|
|
3
|
-
import re
|
|
4
|
-
from pathlib import Path
|
|
5
|
-
from typing import Dict, List, Optional
|
|
6
|
-
|
|
7
|
-
from lhotse.utils import Pathlike
|
|
8
|
-
|
|
9
|
-
from .gemini_reader import GeminiReader, GeminiSegment
|
|
10
|
-
from .supervision import Supervision
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
class GeminiWriter:
    """Writer for updating YouTube transcript timestamps based on alignment results."""

    # Bracketed [HH:MM:SS], [H:MM:SS] or [MM:SS] timestamp. The negative
    # lookahead skips the label of a markdown link such as
    # [[MM:SS](URL&t=seconds)], whose URL parameter would otherwise go stale.
    _TIMESTAMP_RE = re.compile(r"\[\d{1,2}:\d{2}(?::\d{2})?\](?!\()")

    @staticmethod
    def format_timestamp(seconds: float) -> str:
        """Convert seconds to [HH:MM:SS] format (fractions of a second are truncated)."""
        hours = int(seconds // 3600)
        minutes = int((seconds % 3600) // 60)
        secs = int(seconds % 60)
        return f"[{hours:02d}:{minutes:02d}:{secs:02d}]"

    @classmethod
    def update_timestamps(
        cls,
        original_transcript: Pathlike,
        aligned_supervisions: List[Supervision],
        output_path: Pathlike,
        timestamp_mapping: Optional[Dict[int, float]] = None,
    ) -> Pathlike:
        """Update transcript file with corrected timestamps from alignment.

        Lines whose 1-based number appears in the mapping get their bracketed
        timestamps rewritten; every other line is copied unchanged.

        Args:
            original_transcript: Path to the original transcript file
            aligned_supervisions: List of aligned Supervision objects with corrected timestamps
            output_path: Path to write the updated transcript
            timestamp_mapping: Optional manual mapping from line_number to new timestamp.
                When omitted it is derived by text-matching the transcript's
                dialogue segments against ``aligned_supervisions``.

        Returns:
            Path to the output file
        """
        original_path = Path(original_transcript)
        output_path = Path(output_path)

        with open(original_path, "r", encoding="utf-8") as f:
            lines = f.readlines()

        if timestamp_mapping is None:
            # Parse original segments to get line numbers; skipped when the
            # caller already supplies the mapping.
            original_segments = GeminiReader.read(original_transcript, include_events=True, include_sections=True)
            timestamp_mapping = cls._create_timestamp_mapping(original_segments, aligned_supervisions)

        updated_lines = [
            cls._replace_timestamp(line, timestamp_mapping[line_num]) if line_num in timestamp_mapping else line
            for line_num, line in enumerate(lines, start=1)
        ]

        output_path.parent.mkdir(parents=True, exist_ok=True)
        with open(output_path, "w", encoding="utf-8") as f:
            f.writelines(updated_lines)

        return output_path

    @classmethod
    def _create_timestamp_mapping(
        cls, original_segments: List[GeminiSegment], aligned_supervisions: List[Supervision]
    ) -> Dict[int, float]:
        """Create mapping from line numbers to new timestamps based on alignment.

        Each aligned supervision is matched to the first not-yet-matched
        dialogue segment with identical text, or failing that to the one with
        the best containment score above 0.8. Skipping segments that are
        already matched lets repeated utterances map to successive lines
        instead of all landing on the first occurrence.

        NOTE(review): the mapped value is the supervision's *start* time, even
        for lines whose trailing timestamp GeminiReader treats as an end time.
        """
        mapping: Dict[int, float] = {}
        dialogue_segments = [s for s in original_segments if s.segment_type == "dialogue"]

        for aligned_sup in aligned_supervisions:
            aligned_text = (aligned_sup.text or "").strip()
            best_match = None
            best_score = 0.0

            for orig_seg in dialogue_segments:
                if orig_seg.line_number in mapping:
                    continue
                orig_text = orig_seg.text.strip()

                if aligned_text == orig_text:
                    best_match, best_score = orig_seg, 1.0
                    break
                # Simple text similarity (could be improved with fuzzy matching)
                if aligned_text in orig_text or orig_text in aligned_text:
                    shorter, longer = sorted((len(aligned_text), len(orig_text)))
                    score = shorter / longer
                    if score > best_score:
                        best_match, best_score = orig_seg, score

            if best_match is not None and best_score > 0.8:
                mapping[best_match.line_number] = aligned_sup.start

        return mapping

    @classmethod
    def _replace_timestamp(cls, line: str, new_timestamp: float) -> str:
        """Replace every bracketed timestamp in ``line`` with ``new_timestamp``.

        Matches the formats the reader accepts ([HH:MM:SS], [H:MM:SS] and
        [MM:SS]); timestamps that label a markdown link are left untouched.
        """
        return cls._TIMESTAMP_RE.sub(cls.format_timestamp(new_timestamp), line)

    @staticmethod
    def _word_field(word_info, name):
        """Read ``name`` from a word alignment item given as a dict or as an attribute-style object."""
        if isinstance(word_info, dict):
            return word_info[name]
        return getattr(word_info, name)

    @classmethod
    def write_aligned_transcript(
        cls,
        aligned_supervisions: List[Supervision],
        output_path: Pathlike,
        include_word_timestamps: bool = False,
    ) -> Pathlike:
        """Write a new transcript file from aligned supervisions.

        This creates a simplified transcript format with accurate timestamps:
        a ``# Aligned Transcript`` heading, then one ``[HH:MM:SS] text`` line
        per supervision followed by a blank line.

        Args:
            aligned_supervisions: List of aligned Supervision objects
            output_path: Path to write the transcript
            include_word_timestamps: Whether to include word-level timestamps if available

        Returns:
            Path to the output file
        """
        output_path = Path(output_path)
        output_path.parent.mkdir(parents=True, exist_ok=True)

        chunks = ["# Aligned Transcript\n\n"]
        for sup in aligned_supervisions:
            chunks.append(f"{cls.format_timestamp(sup.start)} {sup.text}\n")

            alignment = getattr(sup, "alignment", None)
            if include_word_timestamps and alignment and "word" in alignment:
                # Word items may be plain dicts or AlignmentItem-style objects.
                word_parts = [
                    f'{cls._word_field(w, "symbol")}{cls.format_timestamp(cls._word_field(w, "start"))}'
                    for w in alignment["word"]
                ]
                chunks.append(" Words: " + " ".join(word_parts) + "\n")

            chunks.append("\n")

        with open(output_path, "w", encoding="utf-8") as f:
            f.writelines(chunks)

        return output_path
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
__all__ = ["GeminiWriter"]
|
lattifai/caption/supervision.py
DELETED
|
@@ -1,34 +0,0 @@
|
|
|
1
|
-
from dataclasses import dataclass
|
|
2
|
-
from typing import Optional
|
|
3
|
-
|
|
4
|
-
from lhotse.supervision import SupervisionSegment
|
|
5
|
-
from lhotse.utils import Seconds
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
@dataclass
class Supervision(SupervisionSegment):
    """SupervisionSegment variant in which every redeclared field has a default.

    Giving ``id``, ``recording_id``, ``start`` and ``duration`` defaults lets
    callers build an instance from just the values they have, e.g.
    ``Supervision(text=..., start=..., duration=...)``.

    Note: The `alignment` field is inherited from SupervisionSegment:
        alignment: Optional[Dict[str, List[AlignmentItem]]] = None

    Structure of alignment when return_details=True:
        {
            'word': [
                AlignmentItem(symbol='hello', start=0.0, duration=0.5, score=0.95),
                AlignmentItem(symbol='world', start=0.6, duration=0.4, score=0.92),
                ...
            ]
        }
    """

    # Transcript text of the segment.
    text: Optional[str] = None
    # Speaker label, if known.
    speaker: Optional[str] = None
    # Segment identifier; empty unless the caller supplies one.
    id: str = ""
    # Identifier of the source recording; empty unless the caller supplies one.
    recording_id: str = ""
    # Start time and length, both in seconds.
    start: Seconds = 0.0
    duration: Seconds = 0.0
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
__all__ = ["Supervision"]
|