lattifai-1.2.1-py3-none-any.whl → lattifai-1.2.2-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (60)
  1. lattifai/alignment/__init__.py +10 -1
  2. lattifai/alignment/lattice1_aligner.py +66 -58
  3. lattifai/alignment/punctuation.py +38 -0
  4. lattifai/alignment/sentence_splitter.py +152 -21
  5. lattifai/alignment/text_align.py +440 -0
  6. lattifai/alignment/tokenizer.py +82 -40
  7. lattifai/caption/__init__.py +82 -6
  8. lattifai/caption/caption.py +335 -1141
  9. lattifai/caption/formats/__init__.py +199 -0
  10. lattifai/caption/formats/base.py +211 -0
  11. lattifai/caption/{gemini_reader.py → formats/gemini.py} +320 -60
  12. lattifai/caption/formats/json.py +194 -0
  13. lattifai/caption/formats/lrc.py +309 -0
  14. lattifai/caption/formats/nle/__init__.py +9 -0
  15. lattifai/caption/formats/nle/audition.py +561 -0
  16. lattifai/caption/formats/nle/avid.py +423 -0
  17. lattifai/caption/formats/nle/fcpxml.py +549 -0
  18. lattifai/caption/formats/nle/premiere.py +589 -0
  19. lattifai/caption/formats/pysubs2.py +642 -0
  20. lattifai/caption/formats/sbv.py +147 -0
  21. lattifai/caption/formats/tabular.py +338 -0
  22. lattifai/caption/formats/textgrid.py +193 -0
  23. lattifai/caption/formats/ttml.py +652 -0
  24. lattifai/caption/formats/vtt.py +469 -0
  25. lattifai/caption/parsers/__init__.py +9 -0
  26. lattifai/caption/{text_parser.py → parsers/text_parser.py} +4 -2
  27. lattifai/caption/standardize.py +636 -0
  28. lattifai/caption/utils.py +474 -0
  29. lattifai/cli/__init__.py +2 -1
  30. lattifai/cli/caption.py +108 -1
  31. lattifai/cli/transcribe.py +1 -1
  32. lattifai/cli/youtube.py +4 -1
  33. lattifai/client.py +33 -113
  34. lattifai/config/__init__.py +11 -1
  35. lattifai/config/alignment.py +7 -0
  36. lattifai/config/caption.py +267 -23
  37. lattifai/config/media.py +20 -0
  38. lattifai/diarization/__init__.py +41 -1
  39. lattifai/mixin.py +27 -15
  40. lattifai/transcription/base.py +6 -1
  41. lattifai/transcription/lattifai.py +19 -54
  42. lattifai/utils.py +7 -13
  43. lattifai/workflow/__init__.py +28 -4
  44. lattifai/workflow/file_manager.py +2 -5
  45. lattifai/youtube/__init__.py +43 -0
  46. lattifai/youtube/client.py +1170 -0
  47. lattifai/youtube/types.py +23 -0
  48. lattifai-1.2.2.dist-info/METADATA +615 -0
  49. lattifai-1.2.2.dist-info/RECORD +76 -0
  50. {lattifai-1.2.1.dist-info → lattifai-1.2.2.dist-info}/entry_points.txt +1 -2
  51. lattifai/caption/gemini_writer.py +0 -173
  52. lattifai/cli/app_installer.py +0 -142
  53. lattifai/cli/server.py +0 -44
  54. lattifai/server/app.py +0 -427
  55. lattifai/workflow/youtube.py +0 -577
  56. lattifai-1.2.1.dist-info/METADATA +0 -1134
  57. lattifai-1.2.1.dist-info/RECORD +0 -58
  58. {lattifai-1.2.1.dist-info → lattifai-1.2.2.dist-info}/WHEEL +0 -0
  59. {lattifai-1.2.1.dist-info → lattifai-1.2.2.dist-info}/licenses/LICENSE +0 -0
  60. {lattifai-1.2.1.dist-info → lattifai-1.2.2.dist-info}/top_level.txt +0 -0
@@ -1,13 +1,20 @@
1
- """Reader for YouTube transcript files with speaker labels and timestamps."""
1
+ """Gemini/YouTube transcript format handler.
2
+
3
+ Handles YouTube/Gemini markdown transcript format with timestamps like [HH:MM:SS].
4
+ Supports reading and writing transcript files with speaker labels, events, and sections.
5
+ """
2
6
 
3
7
  import re
8
+ import tempfile
4
9
  from dataclasses import dataclass
5
10
  from pathlib import Path
6
- from typing import List, Optional
11
+ from typing import Dict, List, Optional, Union
7
12
 
8
13
  from lhotse.utils import Pathlike
9
14
 
10
- from .supervision import Supervision
15
+ from ..supervision import Supervision
16
+ from . import register_format
17
+ from .base import FormatHandler
11
18
 
12
19
 
13
20
  @dataclass
@@ -46,6 +53,8 @@ class GeminiReader:
46
53
  INLINE_TIMESTAMP_END_PATTERN = re.compile(r"^(.+?)\s*\[(?:(\d{1,2}):(\d{2}):(\d{2})|(\d{1,2}):(\d{2}))\]$")
47
54
  # Timestamp at the beginning indicates start time
48
55
  INLINE_TIMESTAMP_START_PATTERN = re.compile(r"^\[(?:(\d{1,2}):(\d{2}):(\d{2})|(\d{1,2}):(\d{2}))\]\s*(.+)$")
56
+ # Standalone timestamp on its own line
57
+ STANDALONE_TIMESTAMP_PATTERN = re.compile(r"^\[(?:(\d{1,2}):(\d{2}):(\d{2})|(\d{1,2}):(\d{2}))\]$")
49
58
 
50
59
  # New patterns for YouTube link format: [[MM:SS](URL&t=seconds)]
51
60
  YOUTUBE_SECTION_PATTERN = re.compile(r"^##\s*\[\[(\d{1,2}):(\d{2})\]\([^)]*&t=(\d+)\)\]\s*(.+)$")
@@ -76,31 +85,40 @@ class GeminiReader:
76
85
  @classmethod
77
86
  def read(
78
87
  cls,
79
- transcript_path: Pathlike,
88
+ transcript_path: Union[Pathlike, str],
80
89
  include_events: bool = False,
81
90
  include_sections: bool = False,
82
91
  ) -> List[GeminiSegment]:
83
- """Parse YouTube transcript file and return list of transcript segments.
92
+ """Parse YouTube transcript file or content and return list of transcript segments.
84
93
 
85
94
  Args:
86
- transcript_path: Path to the transcript file
95
+ transcript_path: Path to the transcript file or raw string content
87
96
  include_events: Whether to include event descriptions like [Applause]
88
97
  include_sections: Whether to include section headers
89
98
 
90
99
  Returns:
91
100
  List of GeminiSegment objects with all metadata
92
101
  """
93
- transcript_path = Path(transcript_path).expanduser().resolve()
94
- if not transcript_path.exists():
95
- raise FileNotFoundError(f"Transcript file not found: {transcript_path}")
102
+ content = ""
103
+ # Check if transcript_path is a multi-line string (content) or a short string (likely path)
104
+ is_content = "\n" in str(transcript_path) or len(str(transcript_path)) > 1000
105
+
106
+ if is_content:
107
+ content = str(transcript_path)
108
+ else:
109
+ p = Path(transcript_path).expanduser().resolve()
110
+ if p.exists() and p.is_file():
111
+ with open(p, "r", encoding="utf-8") as f:
112
+ content = f.read()
113
+ else:
114
+ # Fallback: treat as content if path doesn't exist
115
+ content = str(transcript_path)
96
116
 
97
117
  segments: List[GeminiSegment] = []
98
118
  current_section = None
99
119
  current_speaker = None
100
120
 
101
- with open(transcript_path, "r", encoding="utf-8") as f:
102
- lines = f.readlines()
103
-
121
+ lines = content.splitlines()
104
122
  for line_num, line in enumerate(lines, start=1):
105
123
  line = line.strip()
106
124
  if not line:
@@ -130,11 +148,10 @@ class GeminiReader:
130
148
  )
131
149
  continue
132
150
 
133
- # Parse YouTube format section headers: ## [[MM:SS](URL&t=seconds)] Title
151
+ # Parse YouTube format section headers
134
152
  youtube_section_match = cls.YOUTUBE_SECTION_PATTERN.match(line)
135
153
  if youtube_section_match:
136
154
  minutes, seconds, url_seconds, section_title = youtube_section_match.groups()
137
- # Use the URL seconds for more accuracy
138
155
  timestamp = cls.parse_timestamp(url_seconds)
139
156
  current_section = section_title.strip()
140
157
  if include_sections:
@@ -149,21 +166,38 @@ class GeminiReader:
149
166
  )
150
167
  continue
151
168
 
152
- # Parse event descriptions [event] [HH:MM:SS] or [MM:SS]
169
+ # Parse standalone timestamp [HH:MM:SS]
170
+ # Often used as an end timestamp for the preceding block
171
+ standalone_match = cls.STANDALONE_TIMESTAMP_PATTERN.match(line)
172
+ if standalone_match:
173
+ groups = standalone_match.groups()
174
+ if groups[0] is not None:
175
+ ts = cls.parse_timestamp(groups[0], groups[1], groups[2])
176
+ else:
177
+ ts = cls.parse_timestamp(groups[3], groups[4])
178
+
179
+ # Assign to previous dialogue segment if it doesn't have an end time
180
+ if segments and segments[-1].segment_type == "dialogue":
181
+ if segments[-1].end_timestamp is None:
182
+ segments[-1].end_timestamp = ts
183
+ elif segments[-1].timestamp is None:
184
+ # If it has an end but no start, this standalone might be its start?
185
+ # Usually standalone is end, but let's be flexible
186
+ segments[-1].timestamp = ts
187
+ continue
188
+
189
+ # Parse event descriptions [event] [HH:MM:SS]
153
190
  event_match = cls.EVENT_PATTERN.match(line)
154
191
  if event_match:
155
192
  groups = event_match.groups()
156
193
  event_text = groups[0]
157
- # Parse timestamp - groups: (event_text, hours/minutes, minutes/seconds, seconds_optional)
158
194
  hours_or_minutes = groups[1]
159
195
  minutes_or_seconds = groups[2]
160
196
  seconds_optional = groups[3]
161
197
 
162
198
  if seconds_optional is not None:
163
- # HH:MM:SS format
164
199
  timestamp = cls.parse_timestamp(hours_or_minutes, minutes_or_seconds, seconds_optional)
165
200
  else:
166
- # MM:SS format
167
201
  timestamp = cls.parse_timestamp(hours_or_minutes, minutes_or_seconds)
168
202
 
169
203
  if include_events and timestamp is not None:
@@ -178,15 +212,13 @@ class GeminiReader:
178
212
  )
179
213
  continue
180
214
 
181
- # Parse speaker dialogue: **Speaker:** Text [HH:MM:SS] or [MM:SS]
215
+ # Parse speaker dialogue: **Speaker:** Text [HH:MM:SS]
182
216
  speaker_match = cls.SPEAKER_PATTERN.match(line)
183
217
  if speaker_match:
184
218
  speaker, text_with_timestamp = speaker_match.groups()
185
219
  current_speaker = speaker.strip()
186
220
 
187
- # Check for timestamp at the beginning (start time)
188
221
  start_match = cls.INLINE_TIMESTAMP_START_PATTERN.match(text_with_timestamp.strip())
189
- # Check for timestamp at the end (end time)
190
222
  end_match = cls.INLINE_TIMESTAMP_END_PATTERN.match(text_with_timestamp.strip())
191
223
  youtube_match = cls.YOUTUBE_INLINE_PATTERN.match(text_with_timestamp.strip())
192
224
 
@@ -196,24 +228,21 @@ class GeminiReader:
196
228
 
197
229
  if start_match:
198
230
  groups = start_match.groups()
199
- # Parse timestamp - can be HH:MM:SS (groups 0,1,2) or MM:SS (groups 3,4)
200
- if groups[0] is not None: # HH:MM:SS format
231
+ if groups[0] is not None:
201
232
  start_timestamp = cls.parse_timestamp(groups[0], groups[1], groups[2])
202
- elif groups[3] is not None: # MM:SS format
233
+ elif groups[3] is not None:
203
234
  start_timestamp = cls.parse_timestamp(groups[3], groups[4])
204
- text = groups[5] # Text is after timestamp
235
+ text = groups[5]
205
236
  elif end_match:
206
237
  groups = end_match.groups()
207
- text = groups[0] # Text is before timestamp
208
- # Parse timestamp - can be HH:MM:SS (groups 1,2,3) or MM:SS (groups 4,5)
209
- if groups[1] is not None: # HH:MM:SS format
238
+ text = groups[0]
239
+ if groups[1] is not None:
210
240
  end_timestamp = cls.parse_timestamp(groups[1], groups[2], groups[3])
211
- elif groups[4] is not None: # MM:SS format
241
+ elif groups[4] is not None:
212
242
  end_timestamp = cls.parse_timestamp(groups[4], groups[5])
213
243
  elif youtube_match:
214
244
  groups = youtube_match.groups()
215
245
  text = groups[0]
216
- # Extract seconds from URL parameter (treat as end time)
217
246
  url_seconds = groups[3]
218
247
  end_timestamp = cls.parse_timestamp(url_seconds)
219
248
 
@@ -228,52 +257,41 @@ class GeminiReader:
228
257
  line_number=line_num,
229
258
  )
230
259
  )
231
- current_speaker = None # Reset speaker after use
260
+ current_speaker = None
232
261
  continue
233
262
 
234
- # Parse plain text with timestamp (check both positions)
263
+ # Parse plain text (might contain inline timestamp or be a continuation)
235
264
  start_match = cls.INLINE_TIMESTAMP_START_PATTERN.match(line)
236
265
  end_match = cls.INLINE_TIMESTAMP_END_PATTERN.match(line)
237
266
  youtube_inline_match = cls.YOUTUBE_INLINE_PATTERN.match(line)
238
267
 
239
- start_timestamp = None
240
- end_timestamp = None
241
- text = None
242
-
243
268
  if start_match:
244
269
  groups = start_match.groups()
245
- # Parse timestamp - can be HH:MM:SS (groups 0,1,2) or MM:SS (groups 3,4)
246
- if groups[0] is not None: # HH:MM:SS format
270
+ if groups[0] is not None:
247
271
  start_timestamp = cls.parse_timestamp(groups[0], groups[1], groups[2])
248
- elif groups[3] is not None: # MM:SS format
272
+ else:
249
273
  start_timestamp = cls.parse_timestamp(groups[3], groups[4])
250
- text = groups[5] # Text is after timestamp
251
-
274
+ text = groups[5]
252
275
  segments.append(
253
276
  GeminiSegment(
254
277
  text=text.strip(),
255
278
  timestamp=start_timestamp,
256
- end_timestamp=None,
257
279
  speaker=current_speaker,
258
280
  section=current_section,
259
281
  segment_type="dialogue",
260
282
  line_number=line_num,
261
283
  )
262
284
  )
263
- continue
264
285
  elif end_match:
265
286
  groups = end_match.groups()
266
- text = groups[0] # Text is before timestamp
267
- # Parse timestamp - can be HH:MM:SS (groups 1,2,3) or MM:SS (groups 4,5)
268
- if groups[1] is not None: # HH:MM:SS format
287
+ text = groups[0]
288
+ if groups[1] is not None:
269
289
  end_timestamp = cls.parse_timestamp(groups[1], groups[2], groups[3])
270
- elif groups[4] is not None: # MM:SS format
290
+ else:
271
291
  end_timestamp = cls.parse_timestamp(groups[4], groups[5])
272
-
273
292
  segments.append(
274
293
  GeminiSegment(
275
294
  text=text.strip(),
276
- timestamp=None,
277
295
  end_timestamp=end_timestamp,
278
296
  speaker=current_speaker,
279
297
  section=current_section,
@@ -281,30 +299,40 @@ class GeminiReader:
281
299
  line_number=line_num,
282
300
  )
283
301
  )
284
- continue
285
302
  elif youtube_inline_match:
286
303
  groups = youtube_inline_match.groups()
287
304
  text = groups[0]
288
- # Extract seconds from URL parameter (treat as end time)
289
305
  url_seconds = groups[3]
290
- end_timestamp = cls.parse_timestamp(url_seconds)
291
-
292
306
  segments.append(
293
307
  GeminiSegment(
294
308
  text=text.strip(),
295
- timestamp=None,
296
- end_timestamp=end_timestamp,
309
+ end_timestamp=cls.parse_timestamp(url_seconds),
297
310
  speaker=current_speaker,
298
311
  section=current_section,
299
312
  segment_type="dialogue",
300
313
  line_number=line_num,
301
314
  )
302
315
  )
303
- continue
316
+ else:
317
+ # Plain text without any recognized markers
318
+ # If it follows a speaker line or another dialogue line without end timestamp,
319
+ # merge it into the last segment to support multi-line text blocks.
320
+ if segments and segments[-1].segment_type == "dialogue" and segments[-1].end_timestamp is None:
321
+ segments[-1].text += " " + line.strip()
322
+ else:
323
+ # Skip markdown headers and other formatting
324
+ if line.startswith("#"):
325
+ continue
304
326
 
305
- # Skip markdown headers and other formatting
306
- if line.startswith("#"):
307
- continue
327
+ segments.append(
328
+ GeminiSegment(
329
+ text=line.strip(),
330
+ speaker=current_speaker,
331
+ section=current_section,
332
+ segment_type="dialogue",
333
+ line_number=line_num,
334
+ )
335
+ )
308
336
 
309
337
  return segments
310
338
 
@@ -315,6 +343,8 @@ class GeminiReader:
315
343
  merge_consecutive: bool = False,
316
344
  min_duration: float = 0.1,
317
345
  merge_max_gap: float = 2.0,
346
+ normalize_text: bool = True,
347
+ **kwargs,
318
348
  ) -> List[Supervision]:
319
349
  """Extract text segments for forced alignment.
320
350
 
@@ -395,7 +425,7 @@ class GeminiReader:
395
425
  if segment.segment_type == "dialogue":
396
426
  supervisions.append(
397
427
  Supervision(
398
- text=segment.text,
428
+ text=segment.text.strip(),
399
429
  start=seg_start,
400
430
  duration=duration,
401
431
  id=f"segment_{i:05d}",
@@ -460,3 +490,233 @@ class GeminiReader:
460
490
 
461
491
 
462
492
  __all__ = ["GeminiReader", "GeminiSegment"]
493
+
494
+
495
+ class GeminiWriter:
496
+ """Writer for updating YouTube transcript timestamps based on alignment results."""
497
+
498
+ @staticmethod
499
+ def format_timestamp(seconds: float) -> str:
500
+ """Convert seconds to [HH:MM:SS] format."""
501
+ hours = int(seconds // 3600)
502
+ minutes = int((seconds % 3600) // 60)
503
+ secs = int(seconds % 60)
504
+ return f"[{hours:02d}:{minutes:02d}:{secs:02d}]"
505
+
506
+ @classmethod
507
+ def update_timestamps(
508
+ cls,
509
+ original_transcript: Pathlike,
510
+ aligned_supervisions: List[Supervision],
511
+ output_path: Pathlike,
512
+ timestamp_mapping: Optional[Dict[int, float]] = None,
513
+ ) -> Pathlike:
514
+ """Update transcript file with corrected timestamps from alignment.
515
+
516
+ Args:
517
+ original_transcript: Path to the original transcript file
518
+ aligned_supervisions: List of aligned Supervision objects with corrected timestamps
519
+ output_path: Path to write the updated transcript
520
+ timestamp_mapping: Optional manual mapping from line_number to new timestamp
521
+
522
+ Returns:
523
+ Path to the output file
524
+ """
525
+ original_path = Path(original_transcript)
526
+ output_path = Path(output_path)
527
+
528
+ # Read original file
529
+ with open(original_path, "r", encoding="utf-8") as f:
530
+ lines = f.readlines()
531
+
532
+ # Parse original segments to get line numbers
533
+ original_segments = GeminiReader.read(original_transcript, include_events=True, include_sections=True)
534
+
535
+ # Create mapping from line number to new timestamp
536
+ if timestamp_mapping is None:
537
+ timestamp_mapping = cls._create_timestamp_mapping(original_segments, aligned_supervisions)
538
+
539
+ # Update timestamps in lines
540
+ updated_lines = []
541
+ for line_num, line in enumerate(lines, start=1):
542
+ if line_num in timestamp_mapping:
543
+ new_timestamp = timestamp_mapping[line_num]
544
+ updated_line = cls._replace_timestamp(line, new_timestamp)
545
+ updated_lines.append(updated_line)
546
+ else:
547
+ updated_lines.append(line)
548
+
549
+ # Write updated content
550
+ output_path.parent.mkdir(parents=True, exist_ok=True)
551
+ with open(output_path, "w", encoding="utf-8") as f:
552
+ f.writelines(updated_lines)
553
+
554
+ return output_path
555
+
556
+ @classmethod
557
+ def _create_timestamp_mapping(
558
+ cls, original_segments: List[GeminiSegment], aligned_supervisions: List[Supervision]
559
+ ) -> Dict[int, float]:
560
+ """Create mapping from line numbers to new timestamps based on alignment.
561
+
562
+ This performs text matching between original segments and aligned supervisions
563
+ to determine which timestamps should be updated.
564
+ """
565
+ mapping = {}
566
+
567
+ # Create a simple text-based matching
568
+ dialogue_segments = [s for s in original_segments if s.segment_type == "dialogue"]
569
+
570
+ # Try to match based on text content
571
+ for aligned_sup in aligned_supervisions:
572
+ aligned_text = aligned_sup.text.strip()
573
+
574
+ # Find best matching original segment
575
+ best_match = None
576
+ best_score = 0
577
+
578
+ for orig_seg in dialogue_segments:
579
+ orig_text = orig_seg.text.strip()
580
+
581
+ # Simple text similarity (could be improved with fuzzy matching)
582
+ if aligned_text == orig_text:
583
+ best_match = orig_seg
584
+ best_score = 1.0
585
+ break
586
+ elif aligned_text in orig_text or orig_text in aligned_text:
587
+ score = min(len(aligned_text), len(orig_text)) / max(len(aligned_text), len(orig_text))
588
+ if score > best_score:
589
+ best_score = score
590
+ best_match = orig_seg
591
+
592
+ # If we found a good match, update the mapping
593
+ if best_match and best_score > 0.8:
594
+ mapping[best_match.line_number] = aligned_sup.start
595
+
596
+ return mapping
597
+
598
+ @classmethod
599
+ def _replace_timestamp(cls, line: str, new_timestamp: float) -> str:
600
+ """Replace timestamp in a line with new timestamp."""
601
+ new_ts_str = cls.format_timestamp(new_timestamp)
602
+
603
+ # Replace timestamp patterns
604
+ # Pattern 1: [HH:MM:SS] at the end or in brackets
605
+ line = re.sub(r"\[\d{2}:\d{2}:\d{2}\]", new_ts_str, line)
606
+
607
+ return line
608
+
609
+ @classmethod
610
+ def write_aligned_transcript(
611
+ cls,
612
+ aligned_supervisions: List[Supervision],
613
+ output_path: Pathlike,
614
+ include_word_timestamps: bool = False,
615
+ ) -> Pathlike:
616
+ """Write a new transcript file from aligned supervisions.
617
+
618
+ This creates a simplified transcript format with accurate timestamps.
619
+
620
+ Args:
621
+ aligned_supervisions: List of aligned Supervision objects
622
+ output_path: Path to write the transcript
623
+ include_word_timestamps: Whether to include word-level timestamps if available
624
+
625
+ Returns:
626
+ Path to the output file
627
+ """
628
+ output_path = Path(output_path)
629
+ output_path.parent.mkdir(parents=True, exist_ok=True)
630
+
631
+ with open(output_path, "w", encoding="utf-8") as f:
632
+ f.write("# Aligned Transcript\n\n")
633
+
634
+ for i, sup in enumerate(aligned_supervisions):
635
+ # Write segment with timestamp
636
+ start_ts = cls.format_timestamp(sup.start)
637
+ f.write(f"{start_ts} {sup.text}\n")
638
+
639
+ # Optionally write word-level timestamps
640
+ if include_word_timestamps and hasattr(sup, "alignment") and sup.alignment:
641
+ if "word" in sup.alignment:
642
+ f.write(" Words: ")
643
+ word_parts = []
644
+ for word_info in sup.alignment["word"]:
645
+ word_ts = cls.format_timestamp(word_info["start"])
646
+ word_parts.append(f'{word_info["symbol"]}{word_ts}')
647
+ f.write(" ".join(word_parts))
648
+ f.write("\n")
649
+
650
+ f.write("\n")
651
+
652
+ return output_path
653
+
654
+ @classmethod
655
+ def write(
656
+ cls,
657
+ supervisions: List[Supervision],
658
+ output_path: Pathlike,
659
+ **kwargs,
660
+ ) -> Path:
661
+ """Alias for write_aligned_transcript for Caption API compatibility."""
662
+ return Path(cls.write_aligned_transcript(supervisions, output_path, **kwargs))
663
+
664
+ @classmethod
665
+ def to_bytes(
666
+ cls,
667
+ supervisions: List[Supervision],
668
+ **kwargs,
669
+ ) -> bytes:
670
+ """Convert aligned supervisions to Gemini format bytes."""
671
+ with tempfile.NamedTemporaryFile(suffix=".md", delete=False) as tmp:
672
+ tmp_path = Path(tmp.name)
673
+ try:
674
+ cls.write_aligned_transcript(supervisions, tmp_path, **kwargs)
675
+ return tmp_path.read_bytes()
676
+ finally:
677
+ tmp_path.unlink(missing_ok=True)
678
+
679
+
680
+ __all__ = ["GeminiWriter"]
681
+
682
+
683
+ @register_format("gemini")
684
+ class GeminiFormat(FormatHandler):
685
+ """YouTube/Gemini markdown transcript format."""
686
+
687
+ extensions = [".md"]
688
+ description = "YouTube/Gemini transcript format with timestamps"
689
+
690
+ @classmethod
691
+ def can_read(cls, path) -> bool:
692
+ """Check if this is a Gemini format file."""
693
+ path_str = str(path).lower()
694
+ return (
695
+ path_str.endswith("gemini.md")
696
+ or path_str.endswith("gemini3.md")
697
+ or ("gemini" in path_str and path_str.endswith(".md"))
698
+ )
699
+
700
+ @classmethod
701
+ def read(cls, path: Pathlike, **kwargs) -> List[Supervision]:
702
+ """Read Gemini format file."""
703
+ return GeminiReader.extract_for_alignment(path, **kwargs)
704
+
705
+ @classmethod
706
+ def write(
707
+ cls,
708
+ supervisions: List[Supervision],
709
+ output_path: Pathlike,
710
+ **kwargs,
711
+ ) -> Path:
712
+ """Write Gemini format file."""
713
+ return GeminiWriter.write(supervisions, output_path, **kwargs)
714
+
715
+ @classmethod
716
+ def to_bytes(
717
+ cls,
718
+ supervisions: List[Supervision],
719
+ **kwargs,
720
+ ) -> bytes:
721
+ """Convert to Gemini format bytes."""
722
+ return GeminiWriter.to_bytes(supervisions, **kwargs)