lattifai 1.2.2__py3-none-any.whl → 1.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62) hide show
  1. lattifai/_init.py +20 -0
  2. lattifai/alignment/__init__.py +2 -3
  3. lattifai/alignment/lattice1_aligner.py +117 -4
  4. lattifai/alignment/lattice1_worker.py +47 -4
  5. lattifai/alignment/segmenter.py +3 -2
  6. lattifai/alignment/text_align.py +2 -1
  7. lattifai/alignment/tokenizer.py +56 -29
  8. lattifai/audio2.py +162 -183
  9. lattifai/cli/alignment.py +5 -0
  10. lattifai/cli/caption.py +6 -6
  11. lattifai/cli/transcribe.py +1 -5
  12. lattifai/cli/youtube.py +3 -0
  13. lattifai/client.py +41 -12
  14. lattifai/config/__init__.py +21 -3
  15. lattifai/config/alignment.py +7 -0
  16. lattifai/config/caption.py +13 -243
  17. lattifai/config/client.py +16 -0
  18. lattifai/config/event.py +102 -0
  19. lattifai/config/transcription.py +25 -1
  20. lattifai/data/__init__.py +8 -0
  21. lattifai/data/caption.py +228 -0
  22. lattifai/errors.py +78 -53
  23. lattifai/event/__init__.py +65 -0
  24. lattifai/event/lattifai.py +166 -0
  25. lattifai/mixin.py +22 -17
  26. lattifai/transcription/base.py +2 -1
  27. lattifai/transcription/gemini.py +147 -16
  28. lattifai/transcription/lattifai.py +8 -11
  29. lattifai/types.py +1 -1
  30. lattifai/youtube/client.py +143 -48
  31. {lattifai-1.2.2.dist-info → lattifai-1.3.1.dist-info}/METADATA +129 -58
  32. lattifai-1.3.1.dist-info/RECORD +57 -0
  33. lattifai/__init__.py +0 -88
  34. lattifai/alignment/sentence_splitter.py +0 -350
  35. lattifai/caption/__init__.py +0 -96
  36. lattifai/caption/caption.py +0 -661
  37. lattifai/caption/formats/__init__.py +0 -199
  38. lattifai/caption/formats/base.py +0 -211
  39. lattifai/caption/formats/gemini.py +0 -722
  40. lattifai/caption/formats/json.py +0 -194
  41. lattifai/caption/formats/lrc.py +0 -309
  42. lattifai/caption/formats/nle/__init__.py +0 -9
  43. lattifai/caption/formats/nle/audition.py +0 -561
  44. lattifai/caption/formats/nle/avid.py +0 -423
  45. lattifai/caption/formats/nle/fcpxml.py +0 -549
  46. lattifai/caption/formats/nle/premiere.py +0 -589
  47. lattifai/caption/formats/pysubs2.py +0 -642
  48. lattifai/caption/formats/sbv.py +0 -147
  49. lattifai/caption/formats/tabular.py +0 -338
  50. lattifai/caption/formats/textgrid.py +0 -193
  51. lattifai/caption/formats/ttml.py +0 -652
  52. lattifai/caption/formats/vtt.py +0 -469
  53. lattifai/caption/parsers/__init__.py +0 -9
  54. lattifai/caption/parsers/text_parser.py +0 -147
  55. lattifai/caption/standardize.py +0 -636
  56. lattifai/caption/supervision.py +0 -34
  57. lattifai/caption/utils.py +0 -474
  58. lattifai-1.2.2.dist-info/RECORD +0 -76
  59. {lattifai-1.2.2.dist-info → lattifai-1.3.1.dist-info}/WHEEL +0 -0
  60. {lattifai-1.2.2.dist-info → lattifai-1.3.1.dist-info}/entry_points.txt +0 -0
  61. {lattifai-1.2.2.dist-info → lattifai-1.3.1.dist-info}/licenses/LICENSE +0 -0
  62. {lattifai-1.2.2.dist-info → lattifai-1.3.1.dist-info}/top_level.txt +0 -0
@@ -1,722 +0,0 @@
1
- """Gemini/YouTube transcript format handler.
2
-
3
- Handles YouTube/Gemini markdown transcript format with timestamps like [HH:MM:SS].
4
- Supports reading and writing transcript files with speaker labels, events, and sections.
5
- """
6
-
7
- import re
8
- import tempfile
9
- from dataclasses import dataclass
10
- from pathlib import Path
11
- from typing import Dict, List, Optional, Union
12
-
13
- from lhotse.utils import Pathlike
14
-
15
- from ..supervision import Supervision
16
- from . import register_format
17
- from .base import FormatHandler
18
-
19
-
20
@dataclass
class GeminiSegment:
    """One parsed unit of a Gemini/YouTube markdown transcript.

    A segment is a dialogue line, an event marker or a section header
    (see ``segment_type``), together with whatever timing, speaker and
    section metadata was attached to it while parsing.
    """

    # Segment text.
    text: str
    # Start time in seconds; kept under this name for backward compatibility.
    timestamp: Optional[float] = None
    # End time in seconds, set when the timestamp trails the text.
    end_timestamp: Optional[float] = None
    # Speaker label, if the line carried one.
    speaker: Optional[str] = None
    # Title of the section the segment belongs to, if any.
    section: Optional[str] = None
    # One of 'dialogue', 'event', or 'section_header'.
    segment_type: str = "dialogue"
    # 1-based line number in the source transcript (0 when unknown).
    line_number: int = 0

    @property
    def start(self) -> float:
        """Start time in seconds, falling back to ``0.0`` when no start is set."""
        if self.timestamp is None:
            return 0.0
        return self.timestamp

    @property
    def end(self) -> Optional[float]:
        """End time in seconds, or ``None`` when the segment has no end time."""
        return self.end_timestamp
41
-
42
-
43
class GeminiReader:
    """Parser for YouTube/Gemini markdown transcripts with speaker labels and timestamps.

    Line shapes recognised by the class-level patterns:

    * ``## [HH:MM:SS] Title`` and ``## [[MM:SS](url&t=N)] Title`` section headers
    * ``[Event] [HH:MM:SS]`` / ``[Event] [MM:SS]`` event markers
    * ``**Speaker:** text`` dialogue, with an optional timestamp at either end
    * plain text lines, with an optional timestamp at either end
    * a timestamp alone on a line, applied to the preceding dialogue segment
    """

    # Regex patterns for parsing (supports both [HH:MM:SS] and [MM:SS] formats)
    TIMESTAMP_PATTERN = re.compile(r"\[(\d{1,2}):(\d{2}):(\d{2})\]|\[(\d{1,2}):(\d{2})\]")
    SECTION_HEADER_PATTERN = re.compile(r"^##\s*\[(\d{1,2}):(\d{2}):(\d{2})\]\s*(.+)$")
    # NOTE(review): the character class lists ':' twice as seen here; the second
    # entry was possibly a full-width colon lost in extraction - confirm upstream.
    SPEAKER_PATTERN = re.compile(r"^\*\*(.+?[::])\*\*\s*(.+)$")
    # Event pattern: [Event] [HH:MM:SS] or [Event] [MM:SS] - prioritize HH:MM:SS format
    EVENT_PATTERN = re.compile(r"^\[([^\]]+)\]\s*\[(\d{1,2}):(\d{2})(?::(\d{2}))?\]$")
    # Timestamp at the end indicates end time
    INLINE_TIMESTAMP_END_PATTERN = re.compile(r"^(.+?)\s*\[(?:(\d{1,2}):(\d{2}):(\d{2})|(\d{1,2}):(\d{2}))\]$")
    # Timestamp at the beginning indicates start time
    INLINE_TIMESTAMP_START_PATTERN = re.compile(r"^\[(?:(\d{1,2}):(\d{2}):(\d{2})|(\d{1,2}):(\d{2}))\]\s*(.+)$")
    # Standalone timestamp on its own line
    STANDALONE_TIMESTAMP_PATTERN = re.compile(r"^\[(?:(\d{1,2}):(\d{2}):(\d{2})|(\d{1,2}):(\d{2}))\]$")

    # Patterns for YouTube link format: [[MM:SS](URL&t=seconds)]
    YOUTUBE_SECTION_PATTERN = re.compile(r"^##\s*\[\[(\d{1,2}):(\d{2})\]\([^)]*&t=(\d+)\)\]\s*(.+)$")
    YOUTUBE_INLINE_PATTERN = re.compile(r"^(.+?)\s*\[\[(\d{1,2}):(\d{2})\]\([^)]*&t=(\d+)\)\]$")

    # Seconds of speech assumed per word when a start or end has to be estimated.
    _SECONDS_PER_WORD = 0.3

    @classmethod
    def parse_timestamp(cls, *args) -> float:
        """Convert timestamp components to seconds.

        Args:
            *args: ``(hours, minutes, seconds)``, ``(minutes, seconds)`` or a
                single ``(seconds,)`` value (e.g. a YouTube ``&t=`` parameter).
                Each component is passed through ``int()``.

        Returns:
            Total number of seconds.

        Raises:
            ValueError: If zero or more than three components are given.
        """
        if not 1 <= len(args) <= 3:
            raise ValueError(f"Invalid timestamp args: {args}")
        total = 0
        # Each component is worth 60x the one to its right.
        for component in args:
            total = total * 60 + int(component)
        return total

    @classmethod
    def _groups_to_seconds(cls, hms, ms) -> float:
        """Resolve an ``HH:MM:SS | MM:SS`` regex alternation to seconds.

        ``hms`` holds the three groups of the first alternative and ``ms`` the two
        groups of the second; the alternative that did not match is all ``None``.
        """
        if hms[0] is not None:
            return cls.parse_timestamp(*hms)
        return cls.parse_timestamp(*ms)

    @classmethod
    def _split_inline_timestamp(cls, text: str):
        """Split an inline timestamp off ``text``.

        Patterns are tried in a fixed order: leading timestamp (a start time),
        trailing timestamp (an end time), trailing YouTube link (an end time
        taken from its ``&t=`` value).

        Returns:
            ``(text, start, end, found)``; when nothing matches, ``text`` is
            returned unchanged with both times ``None`` and ``found`` false.
        """
        match = cls.INLINE_TIMESTAMP_START_PATTERN.match(text)
        if match:
            groups = match.groups()
            return groups[5], cls._groups_to_seconds(groups[0:3], groups[3:5]), None, True

        match = cls.INLINE_TIMESTAMP_END_PATTERN.match(text)
        if match:
            groups = match.groups()
            return groups[0], None, cls._groups_to_seconds(groups[1:4], groups[4:6]), True

        match = cls.YOUTUBE_INLINE_PATTERN.match(text)
        if match:
            groups = match.groups()
            return groups[0], None, cls.parse_timestamp(groups[3]), True

        return text, None, None, False

    @staticmethod
    def _load_content(transcript_path) -> str:
        """Return transcript text for a path-or-content argument.

        Multi-line strings and strings longer than 1000 characters are taken
        to be content. Anything else is tried as a file path and, when it does
        not name an existing file, is treated as content as well.
        """
        raw = str(transcript_path)
        if "\n" in raw or len(raw) > 1000:
            return raw

        try:
            candidate = Path(transcript_path).expanduser().resolve()
            is_file = candidate.exists() and candidate.is_file()
        except (OSError, RuntimeError, ValueError):
            # A single-line transcript is not necessarily a usable path (name too
            # long, unresolvable "~user", ...); treat it as content instead of failing.
            is_file = False

        if not is_file:
            return raw
        with open(candidate, "r", encoding="utf-8") as f:
            return f.read()

    @classmethod
    def read(
        cls,
        transcript_path: Union[Pathlike, str],
        include_events: bool = False,
        include_sections: bool = False,
    ) -> List[GeminiSegment]:
        """Parse YouTube transcript file or content and return list of transcript segments.

        Args:
            transcript_path: Path to the transcript file or raw string content
            include_events: Whether to include event descriptions like [Applause]
            include_sections: Whether to include section headers

        Returns:
            List of GeminiSegment objects with all metadata, in source order
        """
        content = cls._load_content(transcript_path)

        segments: List[GeminiSegment] = []
        current_section = None
        current_speaker = None

        for line_num, raw_line in enumerate(content.splitlines(), start=1):
            line = raw_line.strip()
            if not line:
                continue

            # Skip table of contents
            if line.startswith("* [") or line.startswith("## Table of Contents"):
                continue

            # Section headers: plain "[HH:MM:SS]" form first, then the YouTube link form.
            section_match = cls.SECTION_HEADER_PATTERN.match(line)
            if section_match:
                hours, minutes, seconds, section_title = section_match.groups()
                header_time = cls.parse_timestamp(hours, minutes, seconds)
            else:
                section_match = cls.YOUTUBE_SECTION_PATTERN.match(line)
                if section_match:
                    # The link label is ignored; the "&t=" value is authoritative.
                    _, _, url_seconds, section_title = section_match.groups()
                    header_time = cls.parse_timestamp(url_seconds)
            if section_match:
                current_section = section_title.strip()
                if include_sections:
                    segments.append(
                        GeminiSegment(
                            text=current_section,
                            timestamp=header_time,
                            section=current_section,
                            segment_type="section_header",
                            line_number=line_num,
                        )
                    )
                continue

            # Standalone timestamp: usually the end time of the preceding block.
            standalone_match = cls.STANDALONE_TIMESTAMP_PATTERN.match(line)
            if standalone_match:
                groups = standalone_match.groups()
                ts = cls._groups_to_seconds(groups[0:3], groups[3:5])
                if segments and segments[-1].segment_type == "dialogue":
                    previous = segments[-1]
                    if previous.end_timestamp is None:
                        previous.end_timestamp = ts
                    elif previous.timestamp is None:
                        # Already has an end but no start: use it as the start.
                        previous.timestamp = ts
                continue

            # Event descriptions: [event] [HH:MM:SS] or [event] [MM:SS]
            event_match = cls.EVENT_PATTERN.match(line)
            if event_match:
                event_text, first, second, third = event_match.groups()
                if third is not None:
                    event_time = cls.parse_timestamp(first, second, third)
                else:
                    event_time = cls.parse_timestamp(first, second)
                if include_events:
                    segments.append(
                        GeminiSegment(
                            text=f"[{event_text.strip()}]",
                            timestamp=event_time,
                            section=current_section,
                            segment_type="event",
                            line_number=line_num,
                        )
                    )
                continue

            # Speaker dialogue: **Speaker:** Text [HH:MM:SS]
            speaker_match = cls.SPEAKER_PATTERN.match(line)
            if speaker_match:
                speaker, text_with_timestamp = speaker_match.groups()
                current_speaker = speaker.strip()
                text, start_timestamp, end_timestamp, _ = cls._split_inline_timestamp(
                    text_with_timestamp.strip()
                )
                segments.append(
                    GeminiSegment(
                        text=text.strip(),
                        timestamp=start_timestamp,
                        end_timestamp=end_timestamp,
                        speaker=current_speaker,
                        section=current_section,
                        segment_type="dialogue",
                        line_number=line_num,
                    )
                )
                # A speaker label applies to its own line only.
                current_speaker = None
                continue

            # Plain text (might contain inline timestamp or be a continuation)
            text, start_timestamp, end_timestamp, found = cls._split_inline_timestamp(line)
            if found:
                segments.append(
                    GeminiSegment(
                        text=text.strip(),
                        timestamp=start_timestamp,
                        end_timestamp=end_timestamp,
                        speaker=current_speaker,
                        section=current_section,
                        segment_type="dialogue",
                        line_number=line_num,
                    )
                )
                continue

            # No recognised markers. If the previous dialogue segment is still open
            # (no end timestamp), merge into it to support multi-line text blocks.
            if segments and segments[-1].segment_type == "dialogue" and segments[-1].end_timestamp is None:
                segments[-1].text += " " + line
                continue

            # Skip markdown headers and other formatting
            if line.startswith("#"):
                continue

            segments.append(
                GeminiSegment(
                    text=line,
                    speaker=current_speaker,
                    section=current_section,
                    segment_type="dialogue",
                    line_number=line_num,
                )
            )

        return segments

    @classmethod
    def extract_for_alignment(
        cls,
        transcript_path: Pathlike,
        merge_consecutive: bool = False,
        min_duration: float = 0.1,
        merge_max_gap: float = 2.0,
        normalize_text: bool = True,
        **kwargs,
    ) -> List[Supervision]:
        """Extract text segments for forced alignment.

        Dialogue and event segments that carry a timestamp are sorted by time;
        events only serve as timing anchors, and just the dialogue segments
        are converted to Supervision objects.

        Args:
            transcript_path: Path to the transcript file
            merge_consecutive: Whether to merge consecutive segments from same speaker
            min_duration: Minimum duration for a segment
            merge_max_gap: Maximum time gap (seconds) to merge consecutive segments
            normalize_text: Accepted for interface compatibility; not used here
            **kwargs: Accepted for interface compatibility; not used here

        Returns:
            List of Supervision objects ready for alignment

        Raises:
            ValueError: If no dialogue or event segment carries a timestamp
        """
        segments = cls.read(transcript_path, include_events=True, include_sections=False)

        # Keep dialogue and event segments with a timestamp (either start or end)
        timed_segments = [
            s
            for s in segments
            if s.segment_type in ("dialogue", "event") and (s.timestamp is not None or s.end_timestamp is not None)
        ]

        if not timed_segments:
            raise ValueError(f"No dialogue segments with timestamps found in {transcript_path}")

        # Sort by timestamp (use start time if available, otherwise end time)
        timed_segments.sort(key=lambda x: x.timestamp if x.timestamp is not None else x.end_timestamp)

        supervisions: List[Supervision] = []
        # Segment behind each supervision, index-aligned with `supervisions`.
        # Events yield no supervision, so `timed_segments` cannot be zipped
        # against `supervisions` directly.
        sources: List[GeminiSegment] = []
        prev_end_time = 0.0

        for i, segment in enumerate(timed_segments):
            if segment.timestamp is not None:
                seg_start = segment.timestamp
                seg_end = segment.end_timestamp
                if seg_end is None and i < len(timed_segments) - 1:
                    # Only a start: borrow the next segment's start, or estimate
                    # that start from its end and word count.
                    next_seg = timed_segments[i + 1]
                    if next_seg.timestamp is not None:
                        seg_end = next_seg.timestamp
                    elif next_seg.end_timestamp is not None:
                        seg_end = next_seg.end_timestamp - len(next_seg.text.split()) * cls._SECONDS_PER_WORD
                if seg_end is None:
                    # Last segment: estimate from its own text length.
                    seg_end = seg_start + len(segment.text.split()) * cls._SECONDS_PER_WORD
            else:
                # Only an end time (the filter above guarantees it is set).
                seg_end = segment.end_timestamp
                if prev_end_time > 0:
                    seg_start = prev_end_time
                else:
                    estimated = seg_end - len(segment.text.split()) * cls._SECONDS_PER_WORD
                    # An estimate must not place the segment before the audio starts.
                    seg_start = max(estimated, 0.0)

            duration = max(seg_end - seg_start, min_duration)
            if segment.segment_type == "dialogue":
                supervisions.append(
                    Supervision(
                        text=segment.text.strip(),
                        start=seg_start,
                        duration=duration,
                        id=f"segment_{i:05d}",
                        speaker=segment.speaker,
                    )
                )
                sources.append(segment)
            # Events advance the clock too, so an end-only line after an event
            # starts where the event ended.
            prev_end_time = seg_start + duration

        if not merge_consecutive:
            return supervisions

        # Merge consecutive segments from the same speaker within merge_max_gap.
        merged: List[Supervision] = []
        current_speaker = None
        current_texts: List[str] = []
        current_start = None
        last_end_time = None

        for segment, sup in zip(sources, supervisions):
            should_merge = False
            if segment.speaker == current_speaker and current_start is not None:
                time_gap = sup.start - last_end_time if last_end_time else 0
                should_merge = time_gap <= merge_max_gap

            if should_merge:
                current_texts.append(segment.text)
            else:
                # Different speaker or gap too large: flush the pending group.
                if current_texts:
                    merged.append(
                        Supervision(
                            text=" ".join(current_texts),
                            start=current_start,
                            duration=last_end_time - current_start,
                            id=f"merged_{len(merged):05d}",
                            speaker=current_speaker,
                        )
                    )
                current_speaker = segment.speaker
                current_texts = [segment.text]
                current_start = sup.start
            last_end_time = sup.start + sup.duration

        # Flush the final group.
        if current_texts:
            merged.append(
                Supervision(
                    text=" ".join(current_texts),
                    start=current_start,
                    duration=last_end_time - current_start,
                    id=f"merged_{len(merged):05d}",
                    speaker=current_speaker,
                )
            )

        return merged
490
-
491
-
492
# Public names declared at this point in the module: the reader and its segment record.
__all__ = ["GeminiReader", "GeminiSegment"]
493
-
494
-
495
class GeminiWriter:
    """Writer for Gemini/YouTube markdown transcripts.

    Either patches the timestamps of an existing transcript from alignment
    results (``update_timestamps``) or writes a new, simplified transcript
    (``write_aligned_transcript`` / ``write`` / ``to_bytes``).
    """

    # Options understood by write_aligned_transcript; write() and to_bytes()
    # accept arbitrary keyword arguments and forward only these.
    _WRITE_OPTIONS = ("include_word_timestamps",)

    # A bracketed [H:MM:SS], [HH:MM:SS] or [MM:SS] timestamp. The lookarounds
    # skip the label of a YouTube link "[[MM:SS](url&t=N)]": rewriting only the
    # label would leave it out of sync with the "&t=" value.
    _TIMESTAMP_RE = re.compile(r"(?<!\[)\[\d{1,2}:\d{2}(?::\d{2})?\](?!\()")

    @staticmethod
    def format_timestamp(seconds: float) -> str:
        """Convert seconds to [HH:MM:SS] format.

        Fractions of a second are truncated and negative values are clamped
        to ``[00:00:00]``.
        """
        total = max(int(seconds), 0)
        minutes, secs = divmod(total, 60)
        hours, minutes = divmod(minutes, 60)
        return f"[{hours:02d}:{minutes:02d}:{secs:02d}]"

    @classmethod
    def _writer_options(cls, kwargs: dict) -> dict:
        """Keep only the keyword arguments that write_aligned_transcript accepts."""
        return {key: value for key, value in kwargs.items() if key in cls._WRITE_OPTIONS}

    @classmethod
    def update_timestamps(
        cls,
        original_transcript: Pathlike,
        aligned_supervisions: List[Supervision],
        output_path: Pathlike,
        timestamp_mapping: Optional[Dict[int, float]] = None,
    ) -> Pathlike:
        """Update transcript file with corrected timestamps from alignment.

        Args:
            original_transcript: Path to the original transcript file
            aligned_supervisions: List of aligned Supervision objects with corrected timestamps
            output_path: Path to write the updated transcript
            timestamp_mapping: Optional manual mapping from line_number to new timestamp

        Returns:
            Path to the output file
        """
        original_path = Path(original_transcript)
        output_path = Path(output_path)

        # Split the same way GeminiReader.read does (str.splitlines), so the
        # reader's line numbers index these lines exactly.
        with open(original_path, "r", encoding="utf-8") as f:
            lines = f.read().splitlines(keepends=True)

        # Create mapping from line number to new timestamp
        if timestamp_mapping is None:
            original_segments = GeminiReader.read(original_transcript, include_events=True, include_sections=True)
            timestamp_mapping = cls._create_timestamp_mapping(original_segments, aligned_supervisions)

        updated_lines = [
            cls._replace_timestamp(line, timestamp_mapping[line_num]) if line_num in timestamp_mapping else line
            for line_num, line in enumerate(lines, start=1)
        ]

        output_path.parent.mkdir(parents=True, exist_ok=True)
        with open(output_path, "w", encoding="utf-8") as f:
            f.writelines(updated_lines)

        return output_path

    @classmethod
    def _create_timestamp_mapping(
        cls, original_segments: List[GeminiSegment], aligned_supervisions: List[Supervision]
    ) -> Dict[int, float]:
        """Create mapping from line numbers to new timestamps based on alignment.

        Each aligned supervision is matched to an original dialogue segment by
        text: an exact match wins outright; otherwise, when one text contains
        the other, the length ratio is the score. Only matches scoring above
        0.8 are recorded, keyed by the original segment's line number and
        valued with the supervision's start time.
        """
        mapping: Dict[int, float] = {}
        dialogue_segments = [s for s in original_segments if s.segment_type == "dialogue"]

        for aligned_sup in aligned_supervisions:
            aligned_text = aligned_sup.text.strip()
            best_match = None
            best_score = 0

            for orig_seg in dialogue_segments:
                orig_text = orig_seg.text.strip()
                if aligned_text == orig_text:
                    best_match = orig_seg
                    best_score = 1.0
                    break
                if aligned_text in orig_text or orig_text in aligned_text:
                    # The texts differ here, so the longer one is non-empty.
                    score = min(len(aligned_text), len(orig_text)) / max(len(aligned_text), len(orig_text))
                    if score > best_score:
                        best_score = score
                        best_match = orig_seg

            if best_match and best_score > 0.8:
                mapping[best_match.line_number] = aligned_sup.start

        return mapping

    @classmethod
    def _replace_timestamp(cls, line: str, new_timestamp: float) -> str:
        """Replace every bracketed timestamp in ``line`` with ``new_timestamp``.

        Matches the ``[H:MM:SS]``, ``[HH:MM:SS]`` and ``[MM:SS]`` spellings;
        YouTube link labels are left untouched. The replacement is always
        written as ``[HH:MM:SS]``.
        """
        return cls._TIMESTAMP_RE.sub(cls.format_timestamp(new_timestamp), line)

    @classmethod
    def write_aligned_transcript(
        cls,
        aligned_supervisions: List[Supervision],
        output_path: Pathlike,
        include_word_timestamps: bool = False,
    ) -> Pathlike:
        """Write a new transcript file from aligned supervisions.

        The file starts with a ``# Aligned Transcript`` heading, followed by one
        ``[HH:MM:SS] text`` line per supervision and a blank line after each.

        Args:
            aligned_supervisions: List of aligned Supervision objects
            output_path: Path to write the transcript
            include_word_timestamps: Whether to include word-level timestamps if available

        Returns:
            Path to the output file
        """
        output_path = Path(output_path)
        output_path.parent.mkdir(parents=True, exist_ok=True)

        chunks = ["# Aligned Transcript\n\n"]
        for sup in aligned_supervisions:
            chunks.append(f"{cls.format_timestamp(sup.start)} {sup.text}\n")

            # Word-level line only when requested and a "word" alignment exists.
            alignment = getattr(sup, "alignment", None)
            if include_word_timestamps and alignment and "word" in alignment:
                word_parts = [
                    f'{word_info["symbol"]}{cls.format_timestamp(word_info["start"])}'
                    for word_info in alignment["word"]
                ]
                chunks.append(" Words: " + " ".join(word_parts) + "\n")

            chunks.append("\n")

        with open(output_path, "w", encoding="utf-8") as f:
            f.write("".join(chunks))

        return output_path

    @classmethod
    def write(
        cls,
        supervisions: List[Supervision],
        output_path: Pathlike,
        **kwargs,
    ) -> Path:
        """Alias for write_aligned_transcript for Caption API compatibility.

        Keyword arguments that write_aligned_transcript does not accept are
        ignored rather than raising ``TypeError``.
        """
        return Path(cls.write_aligned_transcript(supervisions, output_path, **cls._writer_options(kwargs)))

    @classmethod
    def to_bytes(
        cls,
        supervisions: List[Supervision],
        **kwargs,
    ) -> bytes:
        """Convert aligned supervisions to Gemini format bytes.

        Renders through a temporary ``.md`` file that is removed afterwards.
        Keyword arguments that write_aligned_transcript does not accept are
        ignored.
        """
        with tempfile.NamedTemporaryFile(suffix=".md", delete=False) as tmp:
            tmp_path = Path(tmp.name)
        try:
            cls.write_aligned_transcript(supervisions, tmp_path, **cls._writer_options(kwargs))
            return tmp_path.read_bytes()
        finally:
            tmp_path.unlink(missing_ok=True)
678
-
679
-
680
# Rebinding __all__ to the writer alone would drop the reader names from the
# module's public API, so the complete list is declared here.
__all__ = ["GeminiReader", "GeminiSegment", "GeminiWriter"]
681
-
682
-
683
@register_format("gemini")
class GeminiFormat(FormatHandler):
    """Format handler for YouTube/Gemini markdown transcripts.

    Registered under the name ``"gemini"``; reading is delegated to
    ``GeminiReader`` and writing to ``GeminiWriter``.
    """

    extensions = [".md"]
    description = "YouTube/Gemini transcript format with timestamps"

    @classmethod
    def can_read(cls, path) -> bool:
        """Return True for a ``.md`` path whose name contains "gemini" (case-insensitive)."""
        lowered = str(path).lower()
        # Covers the "gemini.md" and "gemini3.md" suffixes as special cases.
        return lowered.endswith(".md") and "gemini" in lowered

    @classmethod
    def read(cls, path: Pathlike, **kwargs) -> List[Supervision]:
        """Read a Gemini transcript into supervisions via ``GeminiReader.extract_for_alignment``."""
        supervisions = GeminiReader.extract_for_alignment(path, **kwargs)
        return supervisions

    @classmethod
    def write(
        cls,
        supervisions: List[Supervision],
        output_path: Pathlike,
        **kwargs,
    ) -> Path:
        """Write supervisions as a Gemini transcript via ``GeminiWriter.write``."""
        written = GeminiWriter.write(supervisions, output_path, **kwargs)
        return written

    @classmethod
    def to_bytes(
        cls,
        supervisions: List[Supervision],
        **kwargs,
    ) -> bytes:
        """Render supervisions to Gemini transcript bytes via ``GeminiWriter.to_bytes``."""
        payload = GeminiWriter.to_bytes(supervisions, **kwargs)
        return payload