lattifai 1.2.1__py3-none-any.whl → 1.2.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lattifai/alignment/__init__.py +10 -1
- lattifai/alignment/lattice1_aligner.py +66 -58
- lattifai/alignment/punctuation.py +38 -0
- lattifai/alignment/sentence_splitter.py +152 -21
- lattifai/alignment/text_align.py +440 -0
- lattifai/alignment/tokenizer.py +82 -40
- lattifai/caption/__init__.py +82 -6
- lattifai/caption/caption.py +335 -1141
- lattifai/caption/formats/__init__.py +199 -0
- lattifai/caption/formats/base.py +211 -0
- lattifai/caption/{gemini_reader.py → formats/gemini.py} +320 -60
- lattifai/caption/formats/json.py +194 -0
- lattifai/caption/formats/lrc.py +309 -0
- lattifai/caption/formats/nle/__init__.py +9 -0
- lattifai/caption/formats/nle/audition.py +561 -0
- lattifai/caption/formats/nle/avid.py +423 -0
- lattifai/caption/formats/nle/fcpxml.py +549 -0
- lattifai/caption/formats/nle/premiere.py +589 -0
- lattifai/caption/formats/pysubs2.py +642 -0
- lattifai/caption/formats/sbv.py +147 -0
- lattifai/caption/formats/tabular.py +338 -0
- lattifai/caption/formats/textgrid.py +193 -0
- lattifai/caption/formats/ttml.py +652 -0
- lattifai/caption/formats/vtt.py +469 -0
- lattifai/caption/parsers/__init__.py +9 -0
- lattifai/caption/{text_parser.py → parsers/text_parser.py} +4 -2
- lattifai/caption/standardize.py +636 -0
- lattifai/caption/utils.py +474 -0
- lattifai/cli/__init__.py +2 -1
- lattifai/cli/caption.py +108 -1
- lattifai/cli/transcribe.py +1 -1
- lattifai/cli/youtube.py +4 -1
- lattifai/client.py +33 -113
- lattifai/config/__init__.py +11 -1
- lattifai/config/alignment.py +7 -0
- lattifai/config/caption.py +267 -23
- lattifai/config/media.py +20 -0
- lattifai/diarization/__init__.py +41 -1
- lattifai/mixin.py +27 -15
- lattifai/transcription/base.py +6 -1
- lattifai/transcription/lattifai.py +19 -54
- lattifai/utils.py +7 -13
- lattifai/workflow/__init__.py +28 -4
- lattifai/workflow/file_manager.py +2 -5
- lattifai/youtube/__init__.py +43 -0
- lattifai/youtube/client.py +1170 -0
- lattifai/youtube/types.py +23 -0
- lattifai-1.2.2.dist-info/METADATA +615 -0
- lattifai-1.2.2.dist-info/RECORD +76 -0
- {lattifai-1.2.1.dist-info → lattifai-1.2.2.dist-info}/entry_points.txt +1 -2
- lattifai/caption/gemini_writer.py +0 -173
- lattifai/cli/app_installer.py +0 -142
- lattifai/cli/server.py +0 -44
- lattifai/server/app.py +0 -427
- lattifai/workflow/youtube.py +0 -577
- lattifai-1.2.1.dist-info/METADATA +0 -1134
- lattifai-1.2.1.dist-info/RECORD +0 -58
- {lattifai-1.2.1.dist-info → lattifai-1.2.2.dist-info}/WHEEL +0 -0
- {lattifai-1.2.1.dist-info → lattifai-1.2.2.dist-info}/licenses/LICENSE +0 -0
- {lattifai-1.2.1.dist-info → lattifai-1.2.2.dist-info}/top_level.txt +0 -0
|
@@ -1,13 +1,20 @@
|
|
|
1
|
-
"""
|
|
1
|
+
"""Gemini/YouTube transcript format handler.
|
|
2
|
+
|
|
3
|
+
Handles YouTube/Gemini markdown transcript format with timestamps like [HH:MM:SS].
|
|
4
|
+
Supports reading and writing transcript files with speaker labels, events, and sections.
|
|
5
|
+
"""
|
|
2
6
|
|
|
3
7
|
import re
|
|
8
|
+
import tempfile
|
|
4
9
|
from dataclasses import dataclass
|
|
5
10
|
from pathlib import Path
|
|
6
|
-
from typing import List, Optional
|
|
11
|
+
from typing import Dict, List, Optional, Union
|
|
7
12
|
|
|
8
13
|
from lhotse.utils import Pathlike
|
|
9
14
|
|
|
10
|
-
from
|
|
15
|
+
from ..supervision import Supervision
|
|
16
|
+
from . import register_format
|
|
17
|
+
from .base import FormatHandler
|
|
11
18
|
|
|
12
19
|
|
|
13
20
|
@dataclass
|
|
@@ -46,6 +53,8 @@ class GeminiReader:
|
|
|
46
53
|
INLINE_TIMESTAMP_END_PATTERN = re.compile(r"^(.+?)\s*\[(?:(\d{1,2}):(\d{2}):(\d{2})|(\d{1,2}):(\d{2}))\]$")
|
|
47
54
|
# Timestamp at the beginning indicates start time
|
|
48
55
|
INLINE_TIMESTAMP_START_PATTERN = re.compile(r"^\[(?:(\d{1,2}):(\d{2}):(\d{2})|(\d{1,2}):(\d{2}))\]\s*(.+)$")
|
|
56
|
+
# Standalone timestamp on its own line
|
|
57
|
+
STANDALONE_TIMESTAMP_PATTERN = re.compile(r"^\[(?:(\d{1,2}):(\d{2}):(\d{2})|(\d{1,2}):(\d{2}))\]$")
|
|
49
58
|
|
|
50
59
|
# New patterns for YouTube link format: [[MM:SS](URL&t=seconds)]
|
|
51
60
|
YOUTUBE_SECTION_PATTERN = re.compile(r"^##\s*\[\[(\d{1,2}):(\d{2})\]\([^)]*&t=(\d+)\)\]\s*(.+)$")
|
|
@@ -76,31 +85,40 @@ class GeminiReader:
|
|
|
76
85
|
@classmethod
|
|
77
86
|
def read(
|
|
78
87
|
cls,
|
|
79
|
-
transcript_path: Pathlike,
|
|
88
|
+
transcript_path: Union[Pathlike, str],
|
|
80
89
|
include_events: bool = False,
|
|
81
90
|
include_sections: bool = False,
|
|
82
91
|
) -> List[GeminiSegment]:
|
|
83
|
-
"""Parse YouTube transcript file and return list of transcript segments.
|
|
92
|
+
"""Parse YouTube transcript file or content and return list of transcript segments.
|
|
84
93
|
|
|
85
94
|
Args:
|
|
86
|
-
transcript_path: Path to the transcript file
|
|
95
|
+
transcript_path: Path to the transcript file or raw string content
|
|
87
96
|
include_events: Whether to include event descriptions like [Applause]
|
|
88
97
|
include_sections: Whether to include section headers
|
|
89
98
|
|
|
90
99
|
Returns:
|
|
91
100
|
List of GeminiSegment objects with all metadata
|
|
92
101
|
"""
|
|
93
|
-
|
|
94
|
-
if
|
|
95
|
-
|
|
102
|
+
content = ""
|
|
103
|
+
# Check if transcript_path is a multi-line string (content) or a short string (likely path)
|
|
104
|
+
is_content = "\n" in str(transcript_path) or len(str(transcript_path)) > 1000
|
|
105
|
+
|
|
106
|
+
if is_content:
|
|
107
|
+
content = str(transcript_path)
|
|
108
|
+
else:
|
|
109
|
+
p = Path(transcript_path).expanduser().resolve()
|
|
110
|
+
if p.exists() and p.is_file():
|
|
111
|
+
with open(p, "r", encoding="utf-8") as f:
|
|
112
|
+
content = f.read()
|
|
113
|
+
else:
|
|
114
|
+
# Fallback: treat as content if path doesn't exist
|
|
115
|
+
content = str(transcript_path)
|
|
96
116
|
|
|
97
117
|
segments: List[GeminiSegment] = []
|
|
98
118
|
current_section = None
|
|
99
119
|
current_speaker = None
|
|
100
120
|
|
|
101
|
-
|
|
102
|
-
lines = f.readlines()
|
|
103
|
-
|
|
121
|
+
lines = content.splitlines()
|
|
104
122
|
for line_num, line in enumerate(lines, start=1):
|
|
105
123
|
line = line.strip()
|
|
106
124
|
if not line:
|
|
@@ -130,11 +148,10 @@ class GeminiReader:
|
|
|
130
148
|
)
|
|
131
149
|
continue
|
|
132
150
|
|
|
133
|
-
# Parse YouTube format section headers
|
|
151
|
+
# Parse YouTube format section headers
|
|
134
152
|
youtube_section_match = cls.YOUTUBE_SECTION_PATTERN.match(line)
|
|
135
153
|
if youtube_section_match:
|
|
136
154
|
minutes, seconds, url_seconds, section_title = youtube_section_match.groups()
|
|
137
|
-
# Use the URL seconds for more accuracy
|
|
138
155
|
timestamp = cls.parse_timestamp(url_seconds)
|
|
139
156
|
current_section = section_title.strip()
|
|
140
157
|
if include_sections:
|
|
@@ -149,21 +166,38 @@ class GeminiReader:
|
|
|
149
166
|
)
|
|
150
167
|
continue
|
|
151
168
|
|
|
152
|
-
# Parse
|
|
169
|
+
# Parse standalone timestamp [HH:MM:SS]
|
|
170
|
+
# Often used as an end timestamp for the preceding block
|
|
171
|
+
standalone_match = cls.STANDALONE_TIMESTAMP_PATTERN.match(line)
|
|
172
|
+
if standalone_match:
|
|
173
|
+
groups = standalone_match.groups()
|
|
174
|
+
if groups[0] is not None:
|
|
175
|
+
ts = cls.parse_timestamp(groups[0], groups[1], groups[2])
|
|
176
|
+
else:
|
|
177
|
+
ts = cls.parse_timestamp(groups[3], groups[4])
|
|
178
|
+
|
|
179
|
+
# Assign to previous dialogue segment if it doesn't have an end time
|
|
180
|
+
if segments and segments[-1].segment_type == "dialogue":
|
|
181
|
+
if segments[-1].end_timestamp is None:
|
|
182
|
+
segments[-1].end_timestamp = ts
|
|
183
|
+
elif segments[-1].timestamp is None:
|
|
184
|
+
# If it has an end but no start, this standalone might be its start?
|
|
185
|
+
# Usually standalone is end, but let's be flexible
|
|
186
|
+
segments[-1].timestamp = ts
|
|
187
|
+
continue
|
|
188
|
+
|
|
189
|
+
# Parse event descriptions [event] [HH:MM:SS]
|
|
153
190
|
event_match = cls.EVENT_PATTERN.match(line)
|
|
154
191
|
if event_match:
|
|
155
192
|
groups = event_match.groups()
|
|
156
193
|
event_text = groups[0]
|
|
157
|
-
# Parse timestamp - groups: (event_text, hours/minutes, minutes/seconds, seconds_optional)
|
|
158
194
|
hours_or_minutes = groups[1]
|
|
159
195
|
minutes_or_seconds = groups[2]
|
|
160
196
|
seconds_optional = groups[3]
|
|
161
197
|
|
|
162
198
|
if seconds_optional is not None:
|
|
163
|
-
# HH:MM:SS format
|
|
164
199
|
timestamp = cls.parse_timestamp(hours_or_minutes, minutes_or_seconds, seconds_optional)
|
|
165
200
|
else:
|
|
166
|
-
# MM:SS format
|
|
167
201
|
timestamp = cls.parse_timestamp(hours_or_minutes, minutes_or_seconds)
|
|
168
202
|
|
|
169
203
|
if include_events and timestamp is not None:
|
|
@@ -178,15 +212,13 @@ class GeminiReader:
|
|
|
178
212
|
)
|
|
179
213
|
continue
|
|
180
214
|
|
|
181
|
-
# Parse speaker dialogue: **Speaker:** Text [HH:MM:SS]
|
|
215
|
+
# Parse speaker dialogue: **Speaker:** Text [HH:MM:SS]
|
|
182
216
|
speaker_match = cls.SPEAKER_PATTERN.match(line)
|
|
183
217
|
if speaker_match:
|
|
184
218
|
speaker, text_with_timestamp = speaker_match.groups()
|
|
185
219
|
current_speaker = speaker.strip()
|
|
186
220
|
|
|
187
|
-
# Check for timestamp at the beginning (start time)
|
|
188
221
|
start_match = cls.INLINE_TIMESTAMP_START_PATTERN.match(text_with_timestamp.strip())
|
|
189
|
-
# Check for timestamp at the end (end time)
|
|
190
222
|
end_match = cls.INLINE_TIMESTAMP_END_PATTERN.match(text_with_timestamp.strip())
|
|
191
223
|
youtube_match = cls.YOUTUBE_INLINE_PATTERN.match(text_with_timestamp.strip())
|
|
192
224
|
|
|
@@ -196,24 +228,21 @@ class GeminiReader:
|
|
|
196
228
|
|
|
197
229
|
if start_match:
|
|
198
230
|
groups = start_match.groups()
|
|
199
|
-
|
|
200
|
-
if groups[0] is not None: # HH:MM:SS format
|
|
231
|
+
if groups[0] is not None:
|
|
201
232
|
start_timestamp = cls.parse_timestamp(groups[0], groups[1], groups[2])
|
|
202
|
-
elif groups[3] is not None:
|
|
233
|
+
elif groups[3] is not None:
|
|
203
234
|
start_timestamp = cls.parse_timestamp(groups[3], groups[4])
|
|
204
|
-
text = groups[5]
|
|
235
|
+
text = groups[5]
|
|
205
236
|
elif end_match:
|
|
206
237
|
groups = end_match.groups()
|
|
207
|
-
text = groups[0]
|
|
208
|
-
|
|
209
|
-
if groups[1] is not None: # HH:MM:SS format
|
|
238
|
+
text = groups[0]
|
|
239
|
+
if groups[1] is not None:
|
|
210
240
|
end_timestamp = cls.parse_timestamp(groups[1], groups[2], groups[3])
|
|
211
|
-
elif groups[4] is not None:
|
|
241
|
+
elif groups[4] is not None:
|
|
212
242
|
end_timestamp = cls.parse_timestamp(groups[4], groups[5])
|
|
213
243
|
elif youtube_match:
|
|
214
244
|
groups = youtube_match.groups()
|
|
215
245
|
text = groups[0]
|
|
216
|
-
# Extract seconds from URL parameter (treat as end time)
|
|
217
246
|
url_seconds = groups[3]
|
|
218
247
|
end_timestamp = cls.parse_timestamp(url_seconds)
|
|
219
248
|
|
|
@@ -228,52 +257,41 @@ class GeminiReader:
|
|
|
228
257
|
line_number=line_num,
|
|
229
258
|
)
|
|
230
259
|
)
|
|
231
|
-
current_speaker = None
|
|
260
|
+
current_speaker = None
|
|
232
261
|
continue
|
|
233
262
|
|
|
234
|
-
# Parse plain text
|
|
263
|
+
# Parse plain text (might contain inline timestamp or be a continuation)
|
|
235
264
|
start_match = cls.INLINE_TIMESTAMP_START_PATTERN.match(line)
|
|
236
265
|
end_match = cls.INLINE_TIMESTAMP_END_PATTERN.match(line)
|
|
237
266
|
youtube_inline_match = cls.YOUTUBE_INLINE_PATTERN.match(line)
|
|
238
267
|
|
|
239
|
-
start_timestamp = None
|
|
240
|
-
end_timestamp = None
|
|
241
|
-
text = None
|
|
242
|
-
|
|
243
268
|
if start_match:
|
|
244
269
|
groups = start_match.groups()
|
|
245
|
-
|
|
246
|
-
if groups[0] is not None: # HH:MM:SS format
|
|
270
|
+
if groups[0] is not None:
|
|
247
271
|
start_timestamp = cls.parse_timestamp(groups[0], groups[1], groups[2])
|
|
248
|
-
|
|
272
|
+
else:
|
|
249
273
|
start_timestamp = cls.parse_timestamp(groups[3], groups[4])
|
|
250
|
-
text = groups[5]
|
|
251
|
-
|
|
274
|
+
text = groups[5]
|
|
252
275
|
segments.append(
|
|
253
276
|
GeminiSegment(
|
|
254
277
|
text=text.strip(),
|
|
255
278
|
timestamp=start_timestamp,
|
|
256
|
-
end_timestamp=None,
|
|
257
279
|
speaker=current_speaker,
|
|
258
280
|
section=current_section,
|
|
259
281
|
segment_type="dialogue",
|
|
260
282
|
line_number=line_num,
|
|
261
283
|
)
|
|
262
284
|
)
|
|
263
|
-
continue
|
|
264
285
|
elif end_match:
|
|
265
286
|
groups = end_match.groups()
|
|
266
|
-
text = groups[0]
|
|
267
|
-
|
|
268
|
-
if groups[1] is not None: # HH:MM:SS format
|
|
287
|
+
text = groups[0]
|
|
288
|
+
if groups[1] is not None:
|
|
269
289
|
end_timestamp = cls.parse_timestamp(groups[1], groups[2], groups[3])
|
|
270
|
-
|
|
290
|
+
else:
|
|
271
291
|
end_timestamp = cls.parse_timestamp(groups[4], groups[5])
|
|
272
|
-
|
|
273
292
|
segments.append(
|
|
274
293
|
GeminiSegment(
|
|
275
294
|
text=text.strip(),
|
|
276
|
-
timestamp=None,
|
|
277
295
|
end_timestamp=end_timestamp,
|
|
278
296
|
speaker=current_speaker,
|
|
279
297
|
section=current_section,
|
|
@@ -281,30 +299,40 @@ class GeminiReader:
|
|
|
281
299
|
line_number=line_num,
|
|
282
300
|
)
|
|
283
301
|
)
|
|
284
|
-
continue
|
|
285
302
|
elif youtube_inline_match:
|
|
286
303
|
groups = youtube_inline_match.groups()
|
|
287
304
|
text = groups[0]
|
|
288
|
-
# Extract seconds from URL parameter (treat as end time)
|
|
289
305
|
url_seconds = groups[3]
|
|
290
|
-
end_timestamp = cls.parse_timestamp(url_seconds)
|
|
291
|
-
|
|
292
306
|
segments.append(
|
|
293
307
|
GeminiSegment(
|
|
294
308
|
text=text.strip(),
|
|
295
|
-
|
|
296
|
-
end_timestamp=end_timestamp,
|
|
309
|
+
end_timestamp=cls.parse_timestamp(url_seconds),
|
|
297
310
|
speaker=current_speaker,
|
|
298
311
|
section=current_section,
|
|
299
312
|
segment_type="dialogue",
|
|
300
313
|
line_number=line_num,
|
|
301
314
|
)
|
|
302
315
|
)
|
|
303
|
-
|
|
316
|
+
else:
|
|
317
|
+
# Plain text without any recognized markers
|
|
318
|
+
# If it follows a speaker line or another dialogue line without end timestamp,
|
|
319
|
+
# merge it into the last segment to support multi-line text blocks.
|
|
320
|
+
if segments and segments[-1].segment_type == "dialogue" and segments[-1].end_timestamp is None:
|
|
321
|
+
segments[-1].text += " " + line.strip()
|
|
322
|
+
else:
|
|
323
|
+
# Skip markdown headers and other formatting
|
|
324
|
+
if line.startswith("#"):
|
|
325
|
+
continue
|
|
304
326
|
|
|
305
|
-
|
|
306
|
-
|
|
307
|
-
|
|
327
|
+
segments.append(
|
|
328
|
+
GeminiSegment(
|
|
329
|
+
text=line.strip(),
|
|
330
|
+
speaker=current_speaker,
|
|
331
|
+
section=current_section,
|
|
332
|
+
segment_type="dialogue",
|
|
333
|
+
line_number=line_num,
|
|
334
|
+
)
|
|
335
|
+
)
|
|
308
336
|
|
|
309
337
|
return segments
|
|
310
338
|
|
|
@@ -315,6 +343,8 @@ class GeminiReader:
|
|
|
315
343
|
merge_consecutive: bool = False,
|
|
316
344
|
min_duration: float = 0.1,
|
|
317
345
|
merge_max_gap: float = 2.0,
|
|
346
|
+
normalize_text: bool = True,
|
|
347
|
+
**kwargs,
|
|
318
348
|
) -> List[Supervision]:
|
|
319
349
|
"""Extract text segments for forced alignment.
|
|
320
350
|
|
|
@@ -395,7 +425,7 @@ class GeminiReader:
|
|
|
395
425
|
if segment.segment_type == "dialogue":
|
|
396
426
|
supervisions.append(
|
|
397
427
|
Supervision(
|
|
398
|
-
text=segment.text,
|
|
428
|
+
text=segment.text.strip(),
|
|
399
429
|
start=seg_start,
|
|
400
430
|
duration=duration,
|
|
401
431
|
id=f"segment_{i:05d}",
|
|
@@ -460,3 +490,233 @@ class GeminiReader:
|
|
|
460
490
|
|
|
461
491
|
|
|
462
492
|
__all__ = ["GeminiReader", "GeminiSegment"]
|
|
493
|
+
|
|
494
|
+
|
|
495
|
+
class GeminiWriter:
    """Writer for Gemini/YouTube markdown transcripts.

    Two workflows are supported:

    * ``update_timestamps`` rewrites the bracketed timestamps of an existing
      transcript in place, using alignment results.
    * ``write_aligned_transcript`` (and its ``write`` / ``to_bytes`` wrappers)
      renders a fresh, simplified transcript from aligned supervisions.
    """

    # Bracketed timestamps in every shape the reader's inline patterns accept:
    # [HH:MM:SS], [H:MM:SS] and [MM:SS]. The negative lookahead skips markdown
    # link labels such as [[MM:SS](url&t=123)], whose URL also encodes the
    # time and would otherwise be left inconsistent with the label.
    _TIMESTAMP_RE = re.compile(r"\[\d{1,2}:\d{2}(?::\d{2})?\](?!\()")

    @staticmethod
    def format_timestamp(seconds: float) -> str:
        """Convert seconds to ``[HH:MM:SS]`` format.

        Fractional seconds are truncated. Negative values are clamped to
        zero so the result is always a well-formed timestamp.
        """
        whole_seconds = max(0, int(seconds))
        hours, remainder = divmod(whole_seconds, 3600)
        minutes, secs = divmod(remainder, 60)
        return f"[{hours:02d}:{minutes:02d}:{secs:02d}]"

    @classmethod
    def update_timestamps(
        cls,
        original_transcript: Pathlike,
        aligned_supervisions: List[Supervision],
        output_path: Pathlike,
        timestamp_mapping: Optional[Dict[int, float]] = None,
    ) -> Pathlike:
        """Update transcript file with corrected timestamps from alignment.

        Args:
            original_transcript: Path to the original transcript file
            aligned_supervisions: List of aligned Supervision objects with corrected timestamps
            output_path: Path to write the updated transcript
            timestamp_mapping: Optional manual mapping from 1-based line number to
                new timestamp (seconds). When omitted, the mapping is derived by
                matching the transcript's dialogue text against the supervisions.

        Returns:
            Path to the output file
        """
        original_path = Path(original_transcript)
        output_path = Path(output_path)

        # Split exactly the way GeminiReader.read() does (str.splitlines) so the
        # line numbers stored on parsed segments index the same lines here.
        # keepends=True preserves the original line terminators on write-back.
        content = original_path.read_text(encoding="utf-8")
        lines = content.splitlines(keepends=True)

        # Only parse the transcript when a mapping has to be derived.
        if timestamp_mapping is None:
            original_segments = GeminiReader.read(original_transcript, include_events=True, include_sections=True)
            timestamp_mapping = cls._create_timestamp_mapping(original_segments, aligned_supervisions)

        updated_lines = [
            cls._replace_timestamp(line, timestamp_mapping[line_num]) if line_num in timestamp_mapping else line
            for line_num, line in enumerate(lines, start=1)
        ]

        # Write updated content
        output_path.parent.mkdir(parents=True, exist_ok=True)
        with open(output_path, "w", encoding="utf-8") as f:
            f.writelines(updated_lines)

        return output_path

    @classmethod
    def _create_timestamp_mapping(
        cls, original_segments: List[GeminiSegment], aligned_supervisions: List[Supervision]
    ) -> Dict[int, float]:
        """Create mapping from line numbers to new timestamps based on alignment.

        Each supervision is matched against the dialogue segments by text. An
        exact match (after stripping) wins outright; otherwise the best
        substring match is used, scored by the length ratio of the two texts,
        and accepted only when that ratio exceeds 0.8. When several
        supervisions match the same segment, the last one wins.
        """
        mapping: Dict[int, float] = {}

        # Strip each original text once instead of once per supervision.
        dialogue = [(seg, seg.text.strip()) for seg in original_segments if seg.segment_type == "dialogue"]

        for aligned_sup in aligned_supervisions:
            # A supervision without text is treated as empty rather than crashing.
            aligned_text = (aligned_sup.text or "").strip()

            best_match = None
            best_score = 0.0

            for orig_seg, orig_text in dialogue:
                if aligned_text == orig_text:
                    best_match = orig_seg
                    best_score = 1.0
                    break
                if aligned_text in orig_text or orig_text in aligned_text:
                    # The texts differ here, so the longer one is non-empty and
                    # the division is safe.
                    score = min(len(aligned_text), len(orig_text)) / max(len(aligned_text), len(orig_text))
                    if score > best_score:
                        best_score = score
                        best_match = orig_seg

            if best_match is not None and best_score > 0.8:
                mapping[best_match.line_number] = aligned_sup.start

        return mapping

    @classmethod
    def _replace_timestamp(cls, line: str, new_timestamp: float) -> str:
        """Replace every bracketed timestamp in ``line`` with ``new_timestamp``.

        Lines without a recognizable timestamp are returned unchanged.
        """
        return cls._TIMESTAMP_RE.sub(cls.format_timestamp(new_timestamp), line)

    @staticmethod
    def _word_field(word_info, name: str):
        """Read ``name`` from a word-alignment item.

        Items are subscripted by key first; when the item does not support
        string subscripts (e.g. a named tuple), attribute access is used.
        NOTE(review): the concrete item type stored under ``alignment["word"]``
        is not visible from this module - confirm against Supervision.
        """
        try:
            return word_info[name]
        except TypeError:
            return getattr(word_info, name)

    @classmethod
    def write_aligned_transcript(
        cls,
        aligned_supervisions: List[Supervision],
        output_path: Pathlike,
        include_word_timestamps: bool = False,
    ) -> Pathlike:
        """Write a new transcript file from aligned supervisions.

        This creates a simplified transcript format with accurate timestamps:
        a ``# Aligned Transcript`` header followed by one
        ``[HH:MM:SS] text`` entry per supervision, each followed by a blank line.

        Args:
            aligned_supervisions: List of aligned Supervision objects
            output_path: Path to write the transcript
            include_word_timestamps: Whether to include word-level timestamps if available

        Returns:
            Path to the output file
        """
        output_path = Path(output_path)
        output_path.parent.mkdir(parents=True, exist_ok=True)

        with open(output_path, "w", encoding="utf-8") as f:
            f.write("# Aligned Transcript\n\n")

            for sup in aligned_supervisions:
                # Write segment with timestamp
                f.write(f"{cls.format_timestamp(sup.start)} {sup.text}\n")

                # Optionally write word-level timestamps
                alignment = getattr(sup, "alignment", None) if include_word_timestamps else None
                if alignment and "word" in alignment:
                    word_parts = [
                        f'{cls._word_field(word_info, "symbol")}'
                        f'{cls.format_timestamp(cls._word_field(word_info, "start"))}'
                        for word_info in alignment["word"]
                    ]
                    f.write(" Words: ")
                    f.write(" ".join(word_parts))
                    f.write("\n")

                f.write("\n")

        return output_path

    @classmethod
    def write(
        cls,
        supervisions: List[Supervision],
        output_path: Pathlike,
        **kwargs,
    ) -> Path:
        """Alias for write_aligned_transcript for Caption API compatibility.

        Only ``include_word_timestamps`` is honoured. Any other keyword
        option is ignored instead of raising ``TypeError``, so generic
        format-level options can be passed through safely.
        """
        include_words = kwargs.get("include_word_timestamps", False)
        return Path(cls.write_aligned_transcript(supervisions, output_path, include_word_timestamps=include_words))

    @classmethod
    def to_bytes(
        cls,
        supervisions: List[Supervision],
        **kwargs,
    ) -> bytes:
        """Convert aligned supervisions to Gemini format bytes.

        The transcript is rendered through a temporary ``.md`` file, which is
        removed afterwards, so the bytes are exactly what ``write`` would
        put on disk. Unknown keyword options are ignored, as in ``write``.
        """
        include_words = kwargs.get("include_word_timestamps", False)
        # delete=False: only the unique name is needed; the file is reopened by
        # write_aligned_transcript after this handle has been closed.
        with tempfile.NamedTemporaryFile(suffix=".md", delete=False) as tmp:
            tmp_path = Path(tmp.name)
        try:
            cls.write_aligned_transcript(supervisions, tmp_path, include_word_timestamps=include_words)
            return tmp_path.read_bytes()
        finally:
            tmp_path.unlink(missing_ok=True)
|
|
678
|
+
|
|
679
|
+
|
|
680
|
+
# Full public API of this module. A plain `__all__ = ["GeminiWriter"]` here
# would overwrite the earlier declaration and drop the reader names from
# star-imports, so the complete list is restated.
__all__ = ["GeminiReader", "GeminiSegment", "GeminiWriter"]
|
|
681
|
+
|
|
682
|
+
|
|
683
|
+
@register_format("gemini")
class GeminiFormat(FormatHandler):
    """Format handler for YouTube/Gemini markdown transcripts.

    Registered under the name ``"gemini"``. Reading is delegated to
    ``GeminiReader`` and writing to ``GeminiWriter``.
    """

    extensions = [".md"]
    description = "YouTube/Gemini transcript format with timestamps"

    @classmethod
    def can_read(cls, path) -> bool:
        """Return True when ``path`` looks like a Gemini transcript.

        The check is case-insensitive: the path must end in ``.md`` and
        contain ``gemini`` somewhere (which also covers names ending in
        ``gemini.md`` or ``gemini3.md``).
        """
        lowered = str(path).lower()
        return "gemini" in lowered and lowered.endswith(".md")

    @classmethod
    def read(cls, path: Pathlike, **kwargs) -> List[Supervision]:
        """Read a Gemini transcript via ``GeminiReader.extract_for_alignment``."""
        supervisions = GeminiReader.extract_for_alignment(path, **kwargs)
        return supervisions

    @classmethod
    def write(
        cls,
        supervisions: List[Supervision],
        output_path: Pathlike,
        **kwargs,
    ) -> Path:
        """Write supervisions as a Gemini transcript; returns the written path."""
        written = GeminiWriter.write(supervisions, output_path, **kwargs)
        return written

    @classmethod
    def to_bytes(
        cls,
        supervisions: List[Supervision],
        **kwargs,
    ) -> bytes:
        """Render supervisions as Gemini transcript bytes."""
        payload = GeminiWriter.to_bytes(supervisions, **kwargs)
        return payload
|