lattifai 1.2.1__py3-none-any.whl → 1.2.2__py3-none-any.whl

This diff shows the content of publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
Files changed (60)
  1. lattifai/alignment/__init__.py +10 -1
  2. lattifai/alignment/lattice1_aligner.py +66 -58
  3. lattifai/alignment/punctuation.py +38 -0
  4. lattifai/alignment/sentence_splitter.py +152 -21
  5. lattifai/alignment/text_align.py +440 -0
  6. lattifai/alignment/tokenizer.py +82 -40
  7. lattifai/caption/__init__.py +82 -6
  8. lattifai/caption/caption.py +335 -1141
  9. lattifai/caption/formats/__init__.py +199 -0
  10. lattifai/caption/formats/base.py +211 -0
  11. lattifai/caption/{gemini_reader.py → formats/gemini.py} +320 -60
  12. lattifai/caption/formats/json.py +194 -0
  13. lattifai/caption/formats/lrc.py +309 -0
  14. lattifai/caption/formats/nle/__init__.py +9 -0
  15. lattifai/caption/formats/nle/audition.py +561 -0
  16. lattifai/caption/formats/nle/avid.py +423 -0
  17. lattifai/caption/formats/nle/fcpxml.py +549 -0
  18. lattifai/caption/formats/nle/premiere.py +589 -0
  19. lattifai/caption/formats/pysubs2.py +642 -0
  20. lattifai/caption/formats/sbv.py +147 -0
  21. lattifai/caption/formats/tabular.py +338 -0
  22. lattifai/caption/formats/textgrid.py +193 -0
  23. lattifai/caption/formats/ttml.py +652 -0
  24. lattifai/caption/formats/vtt.py +469 -0
  25. lattifai/caption/parsers/__init__.py +9 -0
  26. lattifai/caption/{text_parser.py → parsers/text_parser.py} +4 -2
  27. lattifai/caption/standardize.py +636 -0
  28. lattifai/caption/utils.py +474 -0
  29. lattifai/cli/__init__.py +2 -1
  30. lattifai/cli/caption.py +108 -1
  31. lattifai/cli/transcribe.py +1 -1
  32. lattifai/cli/youtube.py +4 -1
  33. lattifai/client.py +33 -113
  34. lattifai/config/__init__.py +11 -1
  35. lattifai/config/alignment.py +7 -0
  36. lattifai/config/caption.py +267 -23
  37. lattifai/config/media.py +20 -0
  38. lattifai/diarization/__init__.py +41 -1
  39. lattifai/mixin.py +27 -15
  40. lattifai/transcription/base.py +6 -1
  41. lattifai/transcription/lattifai.py +19 -54
  42. lattifai/utils.py +7 -13
  43. lattifai/workflow/__init__.py +28 -4
  44. lattifai/workflow/file_manager.py +2 -5
  45. lattifai/youtube/__init__.py +43 -0
  46. lattifai/youtube/client.py +1170 -0
  47. lattifai/youtube/types.py +23 -0
  48. lattifai-1.2.2.dist-info/METADATA +615 -0
  49. lattifai-1.2.2.dist-info/RECORD +76 -0
  50. {lattifai-1.2.1.dist-info → lattifai-1.2.2.dist-info}/entry_points.txt +1 -2
  51. lattifai/caption/gemini_writer.py +0 -173
  52. lattifai/cli/app_installer.py +0 -142
  53. lattifai/cli/server.py +0 -44
  54. lattifai/server/app.py +0 -427
  55. lattifai/workflow/youtube.py +0 -577
  56. lattifai-1.2.1.dist-info/METADATA +0 -1134
  57. lattifai-1.2.1.dist-info/RECORD +0 -58
  58. {lattifai-1.2.1.dist-info → lattifai-1.2.2.dist-info}/WHEEL +0 -0
  59. {lattifai-1.2.1.dist-info → lattifai-1.2.2.dist-info}/licenses/LICENSE +0 -0
  60. {lattifai-1.2.1.dist-info → lattifai-1.2.2.dist-info}/top_level.txt +0 -0
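
The headline change in this release is the caption I/O rework: the monolithic parsing and writing code in lattifai/caption/caption.py moves into a lattifai/caption/formats/ registry (detect_format, get_reader, get_writer), and Caption gains in-memory APIs (from_string, to_bytes, BytesIO/StringIO support in read/write). A minimal sketch of the reworked surface, assuming Caption is re-exported from lattifai.caption; the file names below are placeholders:

    import io

    from lattifai.caption import Caption

    # Read from disk; the format is auto-detected via the new formats registry.
    caption = Caption.read("episode.srt")

    # Read from an in-memory buffer; format= is now mandatory for streams.
    buffer = io.StringIO("1\n00:00:00,000 --> 00:00:02,000\nHello world\n")
    in_memory = Caption.read(buffer, format="srt")

    # Write to a path, or skip the filesystem entirely and get raw bytes back.
    caption.write("episode.vtt", output_format="vtt")
    vtt_bytes = caption.to_bytes(output_format="vtt")

The full diff of lattifai/caption/caption.py follows.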
@@ -1,19 +1,22 @@
  """Caption data structure for storing subtitle information with metadata."""

- import json
- import re
+ from __future__ import annotations
+
+ import io
  from dataclasses import dataclass, field
  from pathlib import Path
- from typing import Any, Dict, List, Optional, TypeVar
+ from typing import TYPE_CHECKING, Any, Dict, List, Optional, TypeVar, Union
+
+ if TYPE_CHECKING:
+     from ..config.caption import KaraokeConfig

  from lhotse.supervision import AlignmentItem
  from lhotse.utils import Pathlike
  from tgt import TextGrid

  from ..config.caption import InputCaptionFormat, OutputCaptionFormat  # noqa: F401
+ from .formats import detect_format, get_reader, get_writer
  from .supervision import Supervision
- from .text_parser import normalize_text as normalize_text_fn
- from .text_parser import parse_speaker_text, parse_timestamp_text

  DiarizationOutput = TypeVar("DiarizationOutput")

@@ -50,7 +53,7 @@ class Caption:
      kind: Optional[str] = None
      source_format: Optional[str] = None
      source_path: Optional[Pathlike] = None
-     metadata: Dict[str, str] = field(default_factory=dict)
+     metadata: Dict[str, Any] = field(default_factory=dict)

      def __len__(self) -> int:
          """Return the number of supervision segments."""
@@ -66,7 +69,7 @@ class Caption:

      def __bool__(self) -> bool:
          """Return True if caption has supervisions."""
-         return self.__len__() > 0
+         return len(self) > 0

      @property
      def is_empty(self) -> bool:
@@ -147,19 +150,72 @@ class Caption:
          Returns:
              New Caption instance with shifted timestamps
          """
-         shifted_sups = [
-             Supervision(
-                 text=sup.text,
-                 start=sup.start + seconds,
-                 duration=sup.duration,
-                 speaker=sup.speaker,
-                 id=sup.id,
-                 language=sup.language,
-                 alignment=sup.alignment if hasattr(sup, "alignment") else None,
-                 custom=sup.custom,
+         shifted_sups = []
+         for sup in self.supervisions:
+             # Calculate physical time range
+             raw_start = sup.start + seconds
+             raw_end = sup.end + seconds
+
+             # Skip segments that end before 0
+             if raw_end <= 0:
+                 continue
+
+             # Clip start to 0 if negative
+             if raw_start < 0:
+                 final_start = 0.0
+                 final_duration = raw_end
+             else:
+                 final_start = raw_start
+                 final_duration = sup.duration
+
+             # Handle alignment (word-level timestamps)
+             final_alignment = None
+             original_alignment = getattr(sup, "alignment", None)
+             if original_alignment and "word" in original_alignment:
+                 new_words = []
+                 for word in original_alignment["word"]:
+                     w_start = word.start + seconds
+                     w_end = w_start + word.duration
+
+                     # Skip words that end before 0
+                     if w_end <= 0:
+                         continue
+
+                     # Clip start to 0 if negative
+                     if w_start < 0:
+                         w_final_start = 0.0
+                         w_final_duration = w_end
+                     else:
+                         w_final_start = w_start
+                         w_final_duration = word.duration
+
+                     new_words.append(
+                         AlignmentItem(
+                             symbol=word.symbol,
+                             start=w_final_start,
+                             duration=w_final_duration,
+                             score=word.score,
+                         )
+                     )
+
+                 # Copy original alignment dict structure and update words
+                 final_alignment = original_alignment.copy()
+                 final_alignment["word"] = new_words
+
+             shifted_sups.append(
+                 Supervision(
+                     text=sup.text,
+                     start=final_start,
+                     duration=final_duration,
+                     speaker=sup.speaker,
+                     id=sup.id,
+                     recording_id=sup.recording_id if hasattr(sup, "recording_id") else "",
+                     channel=getattr(sup, "channel", 0),
+                     language=sup.language,
+                     alignment=final_alignment,
+                     custom=sup.custom,
+                 )
              )
-             for sup in self.supervisions
-         ]

          return Caption(
              supervisions=shifted_sups,
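
The rewritten shifting logic above no longer produces negative timestamps: segments that end at or before 0 after the shift are dropped, segments that straddle 0 are clipped, and word-level alignment items are shifted and clipped the same way. A hedged sketch of the resulting behavior (the method name is not visible in this hunk, so Caption.shift(seconds) is assumed, as is Supervision being importable from lattifai.caption):

    from lattifai.caption import Caption, Supervision

    cap = Caption(
        supervisions=[
            Supervision(text="intro", start=1.0, duration=2.0),  # ends at 3.0s
            Supervision(text="body", start=6.0, duration=3.0),   # ends at 9.0s
        ]
    )

    shifted = cap.shift(-4.0)  # assumed method name for the body shown above
    # "intro" would end at -1.0s after the shift, so it is skipped entirely.
    # "body" moves to start=2.0s and keeps its original 3.0s duration.
    assert len(shifted) == 1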
@@ -170,52 +226,90 @@
              metadata=self.metadata.copy(),
          )

-     def to_string(self, format: str = "srt") -> str:
+     def with_margins(
+         self,
+         start_margin: float = 0.08,
+         end_margin: float = 0.20,
+         min_gap: float = 0.08,
+         collision_mode: str = "trim",
+     ) -> "Caption":
          """
-         Return caption content in specified format.
+         Create a new Caption with segment boundaries adjusted based on word-level alignment.
+
+         Uses supervision.alignment['word'] to recalculate segment start/end times
+         with the specified margins applied around the actual speech boundaries.

          Args:
-             format: Output format (e.g., 'srt', 'vtt', 'ass')
+             start_margin: Seconds to extend before the first word (default: 0.08)
+             end_margin: Seconds to extend after the last word (default: 0.20)
+             min_gap: Minimum gap between segments for collision handling (default: 0.08)
+             collision_mode: How to handle segment overlap - 'trim' or 'gap' (default: 'trim')

          Returns:
-             String containing formatted captions
-         """
-         import pysubs2
+             New Caption instance with adjusted timestamps
+
+         Note:
+             Segments without alignment data will keep their original timestamps.

-         subs = pysubs2.SSAFile()
+         Example:
+             >>> caption = Caption.read("aligned.srt")
+             >>> adjusted = caption.with_margins(start_margin=0.05, end_margin=0.15)
+             >>> adjusted.write("output.srt")
+         """
+         from .standardize import apply_margins_to_captions

+         # Determine which supervisions to use
          if self.alignments:
-             alignments = self.alignments
+             source_sups = self.alignments
+         elif self.supervisions:
+             source_sups = self.supervisions
          else:
-             alignments = self.supervisions
-
-         if not alignments:
-             alignments = self.transcription
-
-         for sup in alignments:
-             # Add word-level timing as metadata in the caption text
-             word_items = self._parse_alignment_from_supervision(sup)
-             if word_items:
-                 for word in word_items:
-                     subs.append(
-                         pysubs2.SSAEvent(
-                             start=int(word.start * 1000),
-                             end=int(word.end * 1000),
-                             text=word.symbol,
-                             name=sup.speaker or "",
-                         )
-                     )
-             else:
-                 subs.append(
-                     pysubs2.SSAEvent(
-                         start=int(sup.start * 1000),
-                         end=int(sup.end * 1000),
-                         text=sup.text or "",
-                         name=sup.speaker or "",
-                     )
-                 )
+             source_sups = self.transcription
+
+         adjusted_sups = apply_margins_to_captions(
+             source_sups,
+             start_margin=start_margin,
+             end_margin=end_margin,
+             min_gap=min_gap,
+             collision_mode=collision_mode,
+         )
+
+         return Caption(
+             supervisions=adjusted_sups,
+             transcription=self.transcription,
+             audio_events=self.audio_events,
+             speaker_diarization=self.speaker_diarization,
+             alignments=[],  # Clear alignments since we've applied them
+             language=self.language,
+             kind=self.kind,
+             source_format=self.source_format,
+             source_path=self.source_path,
+             metadata=self.metadata.copy(),
+         )
+
+     def to_string(
+         self,
+         format: str = "srt",
+         word_level: bool = False,
+         karaoke_config: Optional["KaraokeConfig"] = None,
+         metadata: Optional[Dict[str, Any]] = None,
+     ) -> str:
+         """
+         Return caption content in specified format.

-         return subs.to_string(format_=format)
+         Args:
+             format: Output format (e.g., 'srt', 'vtt', 'ass')
+             word_level: Use word-level output format if supported (e.g., LRC, ASS, TTML)
+             karaoke_config: Karaoke configuration. When provided with enabled=True,
+                 enables karaoke styling (ASS \\kf tags, enhanced LRC, etc.)
+             metadata: Optional metadata dict to pass to writer. If None, uses self.metadata.
+
+         Returns:
+             String containing formatted captions
+         """
+         return self.to_bytes(
+             output_format=format, word_level=word_level, karaoke_config=karaoke_config, metadata=metadata
+         ).decode("utf-8")

      def to_dict(self) -> Dict:
          """
@@ -269,6 +363,71 @@ class Caption:
              metadata=metadata or {},
          )

+     @classmethod
+     def from_string(
+         cls,
+         content: str,
+         format: str,
+         normalize_text: bool = True,
+     ) -> "Caption":
+         """
+         Create Caption from string content.
+
+         Args:
+             content: Caption content as string
+             format: Caption format (e.g., 'srt', 'vtt', 'ass')
+             normalize_text: Whether to normalize text during reading
+
+         Returns:
+             New Caption instance
+
+         Example:
+             >>> srt_content = \"\"\"1
+             ... 00:00:00,000 --> 00:00:02,000
+             ... Hello world\"\"\"
+             >>> caption = Caption.from_string(srt_content, format=\"srt\")
+         """
+         buffer = io.StringIO(content)
+         return cls.read(buffer, format=format, normalize_text=normalize_text)
+
+     def to_bytes(
+         self,
+         output_format: Optional[str] = None,
+         include_speaker_in_text: bool = True,
+         word_level: bool = False,
+         karaoke_config: Optional["KaraokeConfig"] = None,
+         metadata: Optional[Dict[str, Any]] = None,
+     ) -> bytes:
+         """
+         Convert caption to bytes.
+
+         Args:
+             output_format: Output format (e.g., 'srt', 'vtt', 'ass'). Defaults to source_format or 'srt'
+             include_speaker_in_text: Whether to include speaker labels in text
+             word_level: Use word-level output format if supported (e.g., LRC, ASS, TTML)
+             karaoke_config: Karaoke configuration. When provided with enabled=True,
+                 enables karaoke styling (ASS \\kf tags, enhanced LRC, etc.)
+             metadata: Optional metadata dict to pass to writer. If None, uses self.metadata.
+
+         Returns:
+             Caption content as bytes
+
+         Example:
+             >>> caption = Caption.read("input.srt")
+             >>> # Get as bytes in original format
+             >>> data = caption.to_bytes()
+             >>> # Get as bytes in specific format
+             >>> vtt_data = caption.to_bytes(output_format="vtt")
+         """
+         return self.write(
+             None,
+             output_format=output_format,
+             include_speaker_in_text=include_speaker_in_text,
+             word_level=word_level,
+             karaoke_config=karaoke_config,
+             metadata=metadata,
+         )
+
      @classmethod
      def from_transcription_results(
          cls,
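
from_string and to_bytes make Caption fully round-trippable in memory: parsing routes a StringIO into read(), and serialization routes through write(None, ...). A sketch assembled from the docstrings above:

    srt_content = """1
    00:00:00,000 --> 00:00:02,000
    Hello world"""

    caption = Caption.from_string(srt_content, format="srt")

    data = caption.to_bytes()                         # bytes in the source format (srt)
    vtt_data = caption.to_bytes(output_format="vtt")  # bytes in another format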
@@ -307,82 +466,168 @@
      @classmethod
      def read(
          cls,
-         path: Pathlike,
+         path: Union[Pathlike, io.BytesIO, io.StringIO],
          format: Optional[str] = None,
          normalize_text: bool = True,
      ) -> "Caption":
          """
-         Read caption file and return Caption object.
+         Read caption file or in-memory data and return Caption object.

          Args:
-             path: Path to caption file
-             format: Caption format (auto-detected if not provided)
+             path: Path to caption file, or BytesIO/StringIO object with caption content
+             format: Caption format (auto-detected if not provided, required for in-memory data)
              normalize_text: Whether to normalize text during reading

          Returns:
              Caption object containing supervisions and metadata
-
-         Example:
-             >>> caption = Caption.read("subtitles.srt")
-             >>> print(f"Loaded {len(caption)} segments")
          """
-         caption_path = Path(str(path)) if not isinstance(path, Path) else path
-
          # Detect format if not provided
-         if not format and caption_path.exists():
-             format = caption_path.suffix.lstrip(".").lower()
-         elif format:
-             format = format.lower()
+         if not format:
+             if isinstance(path, (io.BytesIO, io.StringIO)):
+                 raise ValueError("format parameter is required when reading from BytesIO/StringIO")
+             format = detect_format(str(path))
+
+         if not format:
+             # Fallback to extension
+             if not isinstance(path, (io.BytesIO, io.StringIO)):
+                 format = Path(str(path)).suffix.lstrip(".").lower()
+
+         if not format:
+             format = "srt"  # Last resort default

-         # Extract metadata from file
-         metadata = cls._extract_metadata(path, format)
+         # Get content if it's an in-memory buffer
+         source = path
+         if isinstance(path, io.BytesIO):
+             source = path.read().decode("utf-8")
+         elif isinstance(path, io.StringIO):
+             source = path.read()

-         # Parse supervisions
-         supervisions = cls._parse_supervisions(path, format, normalize_text)
+         # Reset buffer position if it was a stream
+         if isinstance(path, (io.BytesIO, io.StringIO)):
+             path.seek(0)
+
+         # Get reader and perform extraction
+         reader_cls = get_reader(format)
+         if not reader_cls:
+             # Use pysubs2 as a generic fallback if no specific reader exists
+             from .formats.pysubs2 import Pysubs2Format
+
+             reader_cls = Pysubs2Format
+
+         supervisions = reader_cls.read(source, normalize_text=normalize_text)
+         metadata = reader_cls.extract_metadata(source)

          # Create Caption object
+         source_path = None
+         if isinstance(path, (str, Path)) and not ("\n" in str(path) or len(str(path)) > 500):
+             try:
+                 p = Path(str(path))
+                 if p.exists():
+                     source_path = str(p)
+             except (OSError, ValueError):
+                 pass
+
          return cls(
              supervisions=supervisions,
              language=metadata.get("language"),
              kind=metadata.get("kind"),
              source_format=format,
-             source_path=str(caption_path) if caption_path.exists() else None,
+             source_path=source_path,
              metadata=metadata,
          )

      def write(
          self,
-         path: Pathlike,
+         path: Union[Pathlike, io.BytesIO, None] = None,
+         output_format: Optional[str] = None,
          include_speaker_in_text: bool = True,
-     ) -> Pathlike:
+         word_level: bool = False,
+         karaoke_config: Optional["KaraokeConfig"] = None,
+         metadata: Optional[Dict[str, Any]] = None,
+     ) -> Union[Pathlike, bytes]:
          """
-         Write caption to file.
+         Write caption to file or return as bytes.

          Args:
-             path: Path to output caption file
+             path: Path to output caption file, BytesIO object, or None to return bytes
+             output_format: Output format (e.g., 'srt', 'vtt', 'ass')
              include_speaker_in_text: Whether to include speaker labels in text
+             word_level: Use word-level output format if supported (e.g., LRC, ASS, TTML)
+             karaoke_config: Karaoke configuration. When provided with enabled=True,
+                 enables karaoke styling (ASS \\kf tags, enhanced LRC, etc.)
+             metadata: Optional metadata dict to pass to writer. If None, uses self.metadata.
+                 Can be used to override or supplement format-specific metadata.

          Returns:
-             Path to the written file
-
-         Example:
-             >>> caption = Caption.read("input.srt")
-             >>> caption.write("output.vtt", include_speaker_in_text=False)
+             Path to the written file if path is a file path, or bytes if path is BytesIO/None
          """
          if self.alignments:
-             alignments = self.alignments
+             supervisions = self.alignments
+         elif self.supervisions:
+             supervisions = self.supervisions
          else:
-             alignments = self.supervisions
-
-         if not alignments:
-             alignments = self.transcription
+             supervisions = self.transcription
+
+         # Merge external metadata with self.metadata (external takes precedence)
+         effective_metadata = dict(self.metadata) if self.metadata else {}
+         if metadata:
+             effective_metadata.update(metadata)
+
+         # Determine output format
+         if output_format:
+             output_format = output_format.lower()
+         elif isinstance(path, (io.BytesIO, type(None))):
+             output_format = self.source_format or "srt"
+         else:
+             output_format = detect_format(str(path)) or Path(str(path)).suffix.lstrip(".").lower() or "srt"
+
+         # Special casing for professional formats as before
+         ext = output_format
+         if isinstance(path, (str, Path)):
+             path_str = str(path)
+             if path_str.endswith("_avid.txt"):
+                 ext = "avid_ds"
+             elif "audition" in path_str.lower() and path_str.endswith(".csv"):
+                 ext = "audition_csv"
+             elif "edimarker" in path_str.lower() and path_str.endswith(".csv"):
+                 ext = "edimarker_csv"
+             elif "imsc" in path_str.lower() and path_str.endswith(".ttml"):
+                 ext = "imsc1"
+             elif "ebu" in path_str.lower() and path_str.endswith(".ttml"):
+                 ext = "ebu_tt_d"
+
+         writer_cls = get_writer(ext)
+         if not writer_cls:
+             from .formats.pysubs2 import Pysubs2Format
+
+             writer_cls = Pysubs2Format
+
+         if isinstance(path, (str, Path)):
+             return writer_cls.write(
+                 supervisions,
+                 path,
+                 include_speaker=include_speaker_in_text,
+                 word_level=word_level,
+                 karaoke_config=karaoke_config,
+                 metadata=effective_metadata,
+             )

-         return self._write_caption(alignments, path, include_speaker_in_text)
+         content = writer_cls.to_bytes(
+             supervisions,
+             include_speaker=include_speaker_in_text,
+             word_level=word_level,
+             karaoke_config=karaoke_config,
+             metadata=effective_metadata,
+         )
+         if isinstance(path, io.BytesIO):
+             path.write(content)
+             path.seek(0)
+         return content

      def read_speaker_diarization(
          self,
          path: Pathlike,
-     ) -> TextGrid:
+     ) -> "DiarizationOutput":
          """
          Read speaker diarization TextGrid from file.
          """
@@ -404,1057 +649,6 @@ class Caption:
          self.speaker_diarization.write(path)
          return path

-     @staticmethod
-     def _parse_alignment_from_supervision(supervision: Any) -> Optional[List[AlignmentItem]]:
-         """
-         Extract word-level alignment items from Supervision object.
-
-         Args:
-             supervision: Supervision object with potential alignment data
-
-         Returns:
-             List of AlignmentItem objects, or None if no alignment data present
-         """
-         if not hasattr(supervision, "alignment") or not supervision.alignment:
-             return None
-
-         if "word" not in supervision.alignment:
-             return None
-
-         return supervision.alignment["word"]
-
-     @classmethod
-     def _write_caption(
-         cls,
-         alignments: List[Supervision],
-         output_path: Pathlike,
-         include_speaker_in_text: bool = True,
-     ) -> Pathlike:
-         """
-         Write caption to file in various formats.
-
-         Args:
-             alignments: List of supervision segments to write
-             output_path: Path to output file
-             include_speaker_in_text: Whether to include speaker in text
-
-         Returns:
-             Path to written file
-         """
-         if str(output_path)[-4:].lower() == ".txt":
-             with open(output_path, "w", encoding="utf-8") as f:
-                 for sup in alignments:
-                     word_items = cls._parse_alignment_from_supervision(sup)
-                     if word_items:
-                         for item in word_items:
-                             f.write(f"[{item.start:.2f}-{item.end:.2f}] {item.symbol}\n")
-                     else:
-                         if include_speaker_in_text and sup.speaker is not None:
-                             # Use [SPEAKER]: format for consistency with parsing
-                             if not sup.has_custom("original_speaker") or sup.custom["original_speaker"]:
-                                 text = f"[{sup.speaker}]: {sup.text}"
-                             else:
-                                 text = f"{sup.text}"
-                         else:
-                             text = sup.text
-                         f.write(f"[{sup.start:.2f}-{sup.end:.2f}] {text}\n")
-
-         elif str(output_path)[-5:].lower() == ".json":
-             with open(output_path, "w", encoding="utf-8") as f:
-                 # Enhanced JSON export with word-level alignment
-                 json_data = []
-                 for sup in alignments:
-                     sup_dict = sup.to_dict()
-                     json_data.append(sup_dict)
-                 json.dump(json_data, f, ensure_ascii=False, indent=4)
-         elif str(output_path).lower().endswith(".textgrid"):
-             from tgt import Interval, IntervalTier, TextGrid, write_to_file
-
-             tg = TextGrid()
-             supervisions, words, scores = [], [], {"utterances": [], "words": []}
-             for supervision in sorted(alignments, key=lambda x: x.start):
-                 # Respect `original_speaker` custom flag: default to include speaker when missing
-                 if (
-                     include_speaker_in_text
-                     and supervision.speaker is not None
-                     and (not supervision.has_custom("original_speaker") or supervision.custom["original_speaker"])
-                 ):
-                     text = f"{supervision.speaker} {supervision.text}"
-                 else:
-                     text = supervision.text
-                 supervisions.append(Interval(supervision.start, supervision.end, text or ""))
-                 # Extract word-level alignment using helper function
-                 word_items = cls._parse_alignment_from_supervision(supervision)
-                 if word_items:
-                     for item in word_items:
-                         words.append(Interval(item.start, item.end, item.symbol))
-                         if item.score is not None:
-                             scores["words"].append(Interval(item.start, item.end, f"{item.score:.2f}"))
-                 if supervision.has_custom("score"):
-                     scores["utterances"].append(
-                         Interval(supervision.start, supervision.end, f"{supervision.score:.2f}")
-                     )
-
-             tg.add_tier(IntervalTier(name="utterances", objects=supervisions))
-             if words:
-                 tg.add_tier(IntervalTier(name="words", objects=words))
-
-             if scores["utterances"]:
-                 tg.add_tier(IntervalTier(name="utterance_scores", objects=scores["utterances"]))
-             if scores["words"]:
-                 tg.add_tier(IntervalTier(name="word_scores", objects=scores["words"]))
-
-             write_to_file(tg, output_path, format="long")
-         elif str(output_path)[-4:].lower() == ".tsv":
-             cls._write_tsv(alignments, output_path, include_speaker_in_text)
-         elif str(output_path)[-4:].lower() == ".csv":
-             cls._write_csv(alignments, output_path, include_speaker_in_text)
-         elif str(output_path)[-4:].lower() == ".aud":
-             cls._write_aud(alignments, output_path, include_speaker_in_text)
-         elif str(output_path)[-4:].lower() == ".sbv":
-             cls._write_sbv(alignments, output_path, include_speaker_in_text)
-         else:
-             import pysubs2
-
-             subs = pysubs2.SSAFile()
-             for sup in alignments:
-                 # Add word-level timing as metadata in the caption text
-                 word_items = cls._parse_alignment_from_supervision(sup)
-                 if word_items:
-                     for word in word_items:
-                         subs.append(
-                             pysubs2.SSAEvent(
-                                 start=int(word.start * 1000),
-                                 end=int(word.end * 1000),
-                                 text=word.symbol,
-                                 name=sup.speaker or "",
-                             )
-                         )
-                 else:
-                     if include_speaker_in_text and sup.speaker is not None:
-                         if not sup.has_custom("original_speaker") or sup.custom["original_speaker"]:
-                             text = f"{sup.speaker} {sup.text}"
-                         else:
-                             text = f"{sup.text}"
-                     else:
-                         text = sup.text
-                     subs.append(
-                         pysubs2.SSAEvent(
-                             start=int(sup.start * 1000),
-                             end=int(sup.end * 1000),
-                             text=text or "",
-                             name=sup.speaker or "",
-                         )
-                     )
-
-             # MicroDVD format requires framerate to be specified
-             output_ext = str(output_path).lower().split(".")[-1]
-             if output_ext == "sub":
-                 # Default to 25 fps for MicroDVD format if not specified
-                 subs.save(output_path, fps=25.0)
-             else:
-                 subs.save(output_path)
-
-         return output_path
-
-     @classmethod
-     def _extract_metadata(cls, caption: Pathlike, format: Optional[str]) -> Dict[str, str]:
-         """
-         Extract metadata from caption file header.
-
-         Args:
-             caption: Caption file path or content
-             format: Caption format
-
-         Returns:
-             Dictionary of metadata key-value pairs
-         """
-         metadata = {}
-         caption_path = Path(str(caption))
-
-         if not caption_path.exists():
-             return metadata
-
-         try:
-             with open(caption_path, "r", encoding="utf-8") as f:
-                 content = f.read(2048)  # Read first 2KB for metadata
-
-             # WebVTT metadata extraction
-             if format == "vtt" or content.startswith("WEBVTT"):
-                 lines = content.split("\n")
-                 for line in lines[:10]:  # Check first 10 lines
-                     line = line.strip()
-                     if line.startswith("Kind:"):
-                         metadata["kind"] = line.split(":", 1)[1].strip()
-                     elif line.startswith("Language:"):
-                         metadata["language"] = line.split(":", 1)[1].strip()
-                     elif line.startswith("NOTE"):
-                         # Extract metadata from NOTE comments
-                         match = re.search(r"NOTE\s+(\w+):\s*(.+)", line)
-                         if match:
-                             key, value = match.groups()
-                             metadata[key.lower()] = value.strip()
-
-             # SRT doesn't have standard metadata, but check for BOM
-             elif format == "srt":
-                 if content.startswith("\ufeff"):
-                     metadata["encoding"] = "utf-8-sig"
-
-             # TextGrid metadata
-             elif format == "textgrid" or caption_path.suffix.lower() == ".textgrid":
-                 match = re.search(r"xmin\s*=\s*([\d.]+)", content)
-                 if match:
-                     metadata["xmin"] = match.group(1)
-                 match = re.search(r"xmax\s*=\s*([\d.]+)", content)
-                 if match:
-                     metadata["xmax"] = match.group(1)
-
-         except Exception:
-             # If metadata extraction fails, continue with empty metadata
-             pass
-
-         return metadata
-
-     @classmethod
-     def _parse_youtube_vtt_with_word_timestamps(
-         cls, content: str, normalize_text: Optional[bool] = False
-     ) -> List[Supervision]:
-         """
-         Parse YouTube VTT format with word-level timestamps.
-
-         YouTube auto-generated captions use this format:
-             Word1<00:00:10.559><c> Word2</c><00:00:11.120><c> Word3</c>...
-
-         Args:
-             content: VTT file content
-             normalize_text: Whether to normalize text
-
-         Returns:
-             List of Supervision objects with word-level alignments
-         """
-         from lhotse.supervision import AlignmentItem
-
-         supervisions = []
-
-         # Pattern to match timestamp lines: 00:00:14.280 --> 00:00:17.269 align:start position:0%
-         timestamp_pattern = re.compile(r"(\d{2}:\d{2}:\d{2}[.,]\d{3})\s*-->\s*(\d{2}:\d{2}:\d{2}[.,]\d{3})")
-
-         # Pattern to match word-level timestamps: <00:00:10.559><c> word</c>
-         word_timestamp_pattern = re.compile(r"<(\d{2}:\d{2}:\d{2}[.,]\d{3})><c>\s*([^<]+)</c>")
-
-         # Pattern to match the first word (before first timestamp)
-         first_word_pattern = re.compile(r"^([^<\n]+?)<(\d{2}:\d{2}:\d{2}[.,]\d{3})>")
-
-         def parse_timestamp(ts: str) -> float:
-             """Convert timestamp string to seconds."""
-             ts = ts.replace(",", ".")
-             parts = ts.split(":")
-             hours = int(parts[0])
-             minutes = int(parts[1])
-             seconds = float(parts[2])
-             return hours * 3600 + minutes * 60 + seconds
-
-         lines = content.split("\n")
-         i = 0
-         while i < len(lines):
-             line = lines[i].strip()
-
-             # Look for timestamp line
-             ts_match = timestamp_pattern.search(line)
-             if ts_match:
-                 cue_start = parse_timestamp(ts_match.group(1))
-                 cue_end = parse_timestamp(ts_match.group(2))
-
-                 # Read the next non-empty lines for cue content
-                 cue_lines = []
-                 i += 1
-                 while i < len(lines) and lines[i].strip() and not timestamp_pattern.search(lines[i]):
-                     cue_lines.append(lines[i])
-                     i += 1
-
-                 # Process cue content
-                 for cue_line in cue_lines:
-                     cue_line = cue_line.strip()
-                     if not cue_line:
-                         continue
-
-                     # Check if this line has word-level timestamps
-                     word_matches = word_timestamp_pattern.findall(cue_line)
-                     if word_matches:
-                         # This line has word-level timing
-                         word_alignments = []
-
-                         # Get the first word (before the first timestamp)
-                         first_match = first_word_pattern.match(cue_line)
-                         if first_match:
-                             first_word = first_match.group(1).strip()
-                             first_word_next_ts = parse_timestamp(first_match.group(2))
-                             if first_word:
-                                 # First word starts at cue_start
-                                 word_alignments.append(
-                                     AlignmentItem(
-                                         symbol=first_word,
-                                         start=cue_start,
-                                         duration=first_word_next_ts - cue_start,
-                                     )
-                                 )
-
-                         # Process remaining words with timestamps
-                         for idx, (ts, word) in enumerate(word_matches):
-                             word_start = parse_timestamp(ts)
-                             word = word.strip()
-                             if not word:
-                                 continue
-
-                             # Calculate duration based on next word's timestamp or cue end
-                             if idx + 1 < len(word_matches):
-                                 next_ts = parse_timestamp(word_matches[idx + 1][0])
-                                 duration = next_ts - word_start
-                             else:
-                                 duration = cue_end - word_start
-
-                             word_alignments.append(
-                                 AlignmentItem(
-                                     symbol=word,
-                                     start=word_start,
-                                     duration=max(0.01, duration),  # Ensure positive duration
-                                 )
-                             )
-
-                         if word_alignments:
-                             # Create supervision with word-level alignment
-                             full_text = " ".join(item.symbol for item in word_alignments)
-                             if normalize_text:
-                                 full_text = normalize_text_fn(full_text)
-
-                             sup_start = word_alignments[0].start
-                             sup_end = word_alignments[-1].start + word_alignments[-1].duration
-
-                             supervisions.append(
-                                 Supervision(
-                                     text=full_text,
-                                     start=sup_start,
-                                     duration=sup_end - sup_start,
-                                     alignment={"word": word_alignments},
-                                 )
-                             )
-                     else:
-                         # Plain text line without word-level timing - skip duplicate lines
-                         # (YouTube VTT often repeats the previous line without timestamps)
-                         pass
-
-                 continue
-             i += 1
-
-         # Merge consecutive supervisions to form complete utterances
-         if supervisions:
-             supervisions = cls._merge_youtube_vtt_supervisions(supervisions)
-
-         return supervisions
-
-     @classmethod
-     def _merge_youtube_vtt_supervisions(cls, supervisions: List[Supervision]) -> List[Supervision]:
-         """
-         Merge consecutive YouTube VTT supervisions into complete utterances.
-
-         YouTube VTT splits utterances across multiple cues. This method merges
-         cues that are close together in time.
-
-         Args:
-             supervisions: List of supervisions to merge
-
-         Returns:
-             List of merged supervisions
-         """
-         if not supervisions:
-             return supervisions
-
-         merged = []
-         current = supervisions[0]
-
-         for next_sup in supervisions[1:]:
-             # Check if next supervision is close enough to merge (within 0.5 seconds)
-             gap = next_sup.start - (current.start + current.duration)
-
-             if gap < 0.5 and current.alignment and next_sup.alignment:
-                 # Merge alignments
-                 current_words = current.alignment.get("word", [])
-                 next_words = next_sup.alignment.get("word", [])
-                 merged_words = list(current_words) + list(next_words)
-
-                 # Create merged supervision
-                 merged_text = current.text + " " + next_sup.text
-                 merged_end = next_sup.start + next_sup.duration
-
-                 current = Supervision(
-                     text=merged_text,
-                     start=current.start,
-                     duration=merged_end - current.start,
-                     alignment={"word": merged_words},
-                 )
-             else:
-                 merged.append(current)
-                 current = next_sup
-
-         merged.append(current)
-         return merged
-
-     @classmethod
-     def _is_youtube_vtt_with_word_timestamps(cls, content: str) -> bool:
-         """
-         Check if content is YouTube VTT format with word-level timestamps.
-
-         Args:
-             content: File content to check
-
-         Returns:
-             True if content contains YouTube-style word timestamps
-         """
-         # Look for pattern like <00:00:10.559><c> word</c>
-         return bool(re.search(r"<\d{2}:\d{2}:\d{2}[.,]\d{3}><c>", content))
-
-     @classmethod
-     def _parse_supervisions(
-         cls, caption: Pathlike, format: Optional[str], normalize_text: Optional[bool] = False
-     ) -> List[Supervision]:
-         """
-         Parse supervisions from caption file.
-
-         Args:
-             caption: Caption file path or content
-             format: Caption format
-             normalize_text: Whether to normalize text
-
-         Returns:
-             List of Supervision objects
-         """
-         if format:
-             format = format.lower()
-
-         # Check for YouTube VTT with word-level timestamps first
-         caption_path = Path(str(caption))
-         if caption_path.exists():
-             with open(caption_path, "r", encoding="utf-8") as f:
-                 content = f.read()
-                 if cls._is_youtube_vtt_with_word_timestamps(content):
-                     return cls._parse_youtube_vtt_with_word_timestamps(content, normalize_text)
-
-         # Match Gemini format: explicit format, or files ending with Gemini.md/Gemini3.md,
-         # or files containing "gemini" in the name with .md extension
-         caption_str = str(caption).lower()
-         is_gemini_format = (
-             format == "gemini"
-             or str(caption).endswith("Gemini.md")
-             or str(caption).endswith("Gemini3.md")
-             or ("gemini" in caption_str and caption_str.endswith(".md"))
-         )
-         if is_gemini_format:
-             from .gemini_reader import GeminiReader
-
-             supervisions = GeminiReader.extract_for_alignment(caption)
-         elif format and (format == "textgrid" or str(caption).lower().endswith("textgrid")):
-             # Internel usage
-             from tgt import read_textgrid
-
-             tgt = read_textgrid(caption)
-             supervisions = []
-             for tier in tgt.tiers:
-                 supervisions.extend(
-                     [
-                         Supervision(
-                             text=interval.text,
-                             start=interval.start_time,
-                             duration=interval.end_time - interval.start_time,
-                             speaker=tier.name,
-                         )
-                         for interval in tier.intervals
-                     ]
-                 )
-             supervisions = sorted(supervisions, key=lambda x: x.start)
-         elif format == "tsv" or str(caption)[-4:].lower() == ".tsv":
-             supervisions = cls._parse_tsv(caption, normalize_text)
-         elif format == "csv" or str(caption)[-4:].lower() == ".csv":
-             supervisions = cls._parse_csv(caption, normalize_text)
-         elif format == "aud" or str(caption)[-4:].lower() == ".aud":
-             supervisions = cls._parse_aud(caption, normalize_text)
-         elif format == "sbv" or str(caption)[-4:].lower() == ".sbv":
-             supervisions = cls._parse_sbv(caption, normalize_text)
-         elif format == "txt" or (format == "auto" and str(caption)[-4:].lower() == ".txt"):
-             if not Path(str(caption)).exists():  # str
-                 lines = [line.strip() for line in str(caption).split("\n")]
-             else:  # file
-                 path_str = str(caption)
-                 with open(path_str, encoding="utf-8") as f:
-                     lines = [line.strip() for line in f.readlines()]
-             if normalize_text:
-                 lines = [normalize_text_fn(line) for line in lines]
-             supervisions = []
-             for line in lines:
-                 if line:
-                     # First try to parse timestamp format: [start-end] text
-                     start, end, remaining_text = parse_timestamp_text(line)
-                     if start is not None and end is not None:
-                         # Has timestamp, now check for speaker in the remaining text
-                         speaker, text = parse_speaker_text(remaining_text)
-                         supervisions.append(
-                             Supervision(
-                                 text=text,
-                                 start=start,
-                                 duration=end - start,
-                                 speaker=speaker,
-                             )
-                         )
-                     else:
-                         # No timestamp, just parse speaker and text
-                         speaker, text = parse_speaker_text(line)
-                         supervisions.append(Supervision(text=text, speaker=speaker))
-         else:
-             try:
-                 supervisions = cls._parse_caption(caption, format=format, normalize_text=normalize_text)
-             except Exception as e:
-                 print(f"Failed to parse caption with Format: {format}, Exception: {e}, trying 'gemini' parser.")
-                 from .gemini_reader import GeminiReader
-
-                 supervisions = GeminiReader.extract_for_alignment(caption)
-
-         return supervisions
-
-     @classmethod
-     def _parse_tsv(cls, caption: Pathlike, normalize_text: Optional[bool] = False) -> List[Supervision]:
-         """
-         Parse TSV (Tab-Separated Values) format caption file.
-
-         Format specifications:
-         - With speaker: speaker\tstart\tend\ttext
-         - Without speaker: start\tend\ttext
-         - Times are in milliseconds
-
-         Args:
-             caption: Caption file path
-             normalize_text: Whether to normalize text
-
-         Returns:
-             List of Supervision objects
-         """
-         caption_path = Path(str(caption))
-         if not caption_path.exists():
-             raise FileNotFoundError(f"Caption file not found: {caption}")
-
-         supervisions = []
-
-         with open(caption_path, "r", encoding="utf-8") as f:
-             lines = f.readlines()
-
-         # Check if first line is a header
-         first_line = lines[0].strip().lower()
-         has_header = "start" in first_line and "end" in first_line and "text" in first_line
-         has_speaker_column = "speaker" in first_line
-
-         start_idx = 1 if has_header else 0
-
-         for line in lines[start_idx:]:
-             line = line.strip()
-             if not line:
-                 continue
-
-             parts = line.split("\t")
-             if len(parts) < 3:
-                 continue
-
-             try:
-                 if has_speaker_column and len(parts) >= 4:
-                     # Format: speaker\tstart\tend\ttext
-                     speaker = parts[0].strip() if parts[0].strip() else None
-                     start = float(parts[1]) / 1000.0  # Convert milliseconds to seconds
-                     end = float(parts[2]) / 1000.0
-                     text = "\t".join(parts[3:]).strip()
-                 else:
-                     # Format: start\tend\ttext
-                     start = float(parts[0]) / 1000.0  # Convert milliseconds to seconds
-                     end = float(parts[1]) / 1000.0
-                     text = "\t".join(parts[2:]).strip()
-                     speaker = None
-
-                 if normalize_text:
-                     text = normalize_text_fn(text)
-
-                 duration = end - start
-                 if duration < 0:
-                     continue
-
-                 supervisions.append(
-                     Supervision(
-                         text=text,
-                         start=start,
-                         duration=duration,
-                         speaker=speaker,
-                     )
-                 )
-             except (ValueError, IndexError):
-                 # Skip malformed lines
-                 continue
-
-         return supervisions
-
-     @classmethod
-     def _parse_csv(cls, caption: Pathlike, normalize_text: Optional[bool] = False) -> List[Supervision]:
-         """
-         Parse CSV (Comma-Separated Values) format caption file.
-
-         Format specifications:
-         - With speaker: speaker,start,end,text
-         - Without speaker: start,end,text
-         - Times are in milliseconds
-
-         Args:
-             caption: Caption file path
-             normalize_text: Whether to normalize text
-
-         Returns:
-             List of Supervision objects
-         """
-         import csv
-
-         caption_path = Path(str(caption))
-         if not caption_path.exists():
-             raise FileNotFoundError(f"Caption file not found: {caption}")
-
-         supervisions = []
-
-         with open(caption_path, "r", encoding="utf-8", newline="") as f:
-             reader = csv.reader(f)
-             lines = list(reader)
-
-         if not lines:
-             return supervisions
-
-         # Check if first line is a header
-         first_line = [col.strip().lower() for col in lines[0]]
-         has_header = "start" in first_line and "end" in first_line and "text" in first_line
-         has_speaker_column = "speaker" in first_line
-
-         start_idx = 1 if has_header else 0
-
-         for parts in lines[start_idx:]:
-             if len(parts) < 3:
-                 continue
-
-             try:
-                 if has_speaker_column and len(parts) >= 4:
-                     # Format: speaker,start,end,text
-                     speaker = parts[0].strip() if parts[0].strip() else None
-                     start = float(parts[1]) / 1000.0  # Convert milliseconds to seconds
-                     end = float(parts[2]) / 1000.0
-                     text = ",".join(parts[3:]).strip()
-                 else:
-                     # Format: start,end,text
-                     start = float(parts[0]) / 1000.0  # Convert milliseconds to seconds
-                     end = float(parts[1]) / 1000.0
-                     text = ",".join(parts[2:]).strip()
-                     speaker = None
-
-                 if normalize_text:
-                     text = normalize_text_fn(text)
-
-                 duration = end - start
-                 if duration < 0:
-                     continue
-
-                 supervisions.append(
-                     Supervision(
-                         text=text,
-                         start=start,
-                         duration=duration,
-                         speaker=speaker,
-                     )
-                 )
-             except (ValueError, IndexError):
-                 # Skip malformed lines
-                 continue
-
-         return supervisions
-
-     @classmethod
-     def _parse_aud(cls, caption: Pathlike, normalize_text: Optional[bool] = False) -> List[Supervision]:
-         """
-         Parse AUD (Audacity Labels) format caption file.
-
-         Format: start\tend\t[[speaker]]text
-         - Times are in seconds (float)
-         - Speaker is optional and enclosed in [[brackets]]
-
-         Args:
-             caption: Caption file path
-             normalize_text: Whether to normalize text
-
-         Returns:
-             List of Supervision objects
-         """
-         caption_path = Path(str(caption))
-         if not caption_path.exists():
-             raise FileNotFoundError(f"Caption file not found: {caption}")
-
-         supervisions = []
-
-         with open(caption_path, "r", encoding="utf-8") as f:
-             lines = f.readlines()
-
-         for line in lines:
-             line = line.strip()
-             if not line:
-                 continue
-
-             parts = line.split("\t")
-             if len(parts) < 3:
-                 continue
-
-             try:
-                 # AUD format: start\tend\ttext (speaker in [[brackets]])
-                 start = float(parts[0])
-                 end = float(parts[1])
-                 text = "\t".join(parts[2:]).strip()
-
-                 # Extract speaker from [[speaker]] prefix
-                 speaker = None
-                 speaker_match = re.match(r"^\[\[([^\]]+)\]\]\s*(.*)$", text)
-                 if speaker_match:
-                     speaker = speaker_match.group(1)
-                     text = speaker_match.group(2)
-
-                 if normalize_text:
-                     text = normalize_text_fn(text)
-
-                 duration = end - start
-                 if duration < 0:
-                     continue
-
-                 supervisions.append(
-                     Supervision(
-                         text=text,
-                         start=start,
-                         duration=duration,
-                         speaker=speaker,
-                     )
-                 )
-             except (ValueError, IndexError):
-                 # Skip malformed lines
-                 continue
-
-         return supervisions
-
-     @classmethod
-     def _parse_sbv(cls, caption: Pathlike, normalize_text: Optional[bool] = False) -> List[Supervision]:
-         """
-         Parse SubViewer (SBV) format caption file.
-
-         Format:
-             0:00:00.000,0:00:02.000
-             Text line 1
-
-             0:00:02.000,0:00:04.000
-             Text line 2
-
-         Args:
-             caption: Caption file path
-             normalize_text: Whether to normalize text
-
-         Returns:
-             List of Supervision objects
-         """
-         caption_path = Path(str(caption))
-         if not caption_path.exists():
-             raise FileNotFoundError(f"Caption file not found: {caption}")
-
-         supervisions = []
-
-         with open(caption_path, "r", encoding="utf-8") as f:
-             content = f.read()
-
-         # Split by double newlines to separate entries
-         entries = content.strip().split("\n\n")
-
-         for entry in entries:
-             lines = entry.strip().split("\n")
-             if len(lines) < 2:
-                 continue
-
-             # First line: timestamp (H:MM:SS.mmm,H:MM:SS.mmm)
-             timestamp_line = lines[0].strip()
-             # Remaining lines: text
-             text_lines = lines[1:]
-
-             try:
-                 # Parse timestamp: 0:00:00.000,0:00:02.000
-                 if "," not in timestamp_line:
-                     continue
-
-                 start_str, end_str = timestamp_line.split(",", 1)
-
-                 # Parse start time
-                 start_parts = start_str.strip().split(":")
-                 if len(start_parts) == 3:
-                     h, m, s = start_parts
-                     s_parts = s.split(".")
-                     start = int(h) * 3600 + int(m) * 60 + int(s_parts[0])
-                     if len(s_parts) > 1:
-                         start += int(s_parts[1]) / 1000.0
-                 else:
-                     continue
-
-                 # Parse end time
-                 end_parts = end_str.strip().split(":")
-                 if len(end_parts) == 3:
-                     h, m, s = end_parts
-                     s_parts = s.split(".")
-                     end = int(h) * 3600 + int(m) * 60 + int(s_parts[0])
-                     if len(s_parts) > 1:
-                         end += int(s_parts[1]) / 1000.0
-                 else:
-                     continue
-
-                 # Parse text and speaker
-                 text = " ".join(text_lines).strip()
-                 speaker, text = parse_speaker_text(text)
-
-                 if normalize_text:
-                     text = normalize_text_fn(text)
-
-                 duration = end - start
-                 if duration < 0:
-                     continue
-
-                 supervisions.append(
-                     Supervision(
-                         text=text,
-                         start=start,
-                         duration=duration,
-                         speaker=speaker,
-                     )
-                 )
-             except (ValueError, IndexError):
-                 # Skip malformed entries
-                 continue
-
-         return supervisions
-
-     @classmethod
-     def _write_tsv(
-         cls,
-         alignments: List[Supervision],
-         output_path: Pathlike,
-         include_speaker_in_text: bool = True,
-     ) -> None:
-         """
-         Write caption to TSV format.
-
-         Format: speaker\tstart\tend\ttext (with speaker)
-         or: start\tend\ttext (without speaker)
-
-         Args:
-             alignments: List of supervision segments to write
-             output_path: Path to output TSV file
-             include_speaker_in_text: Whether to include speaker column
-         """
-         with open(output_path, "w", encoding="utf-8") as file:
-             # Write header
-             if include_speaker_in_text:
-                 file.write("speaker\tstart\tend\ttext\n")
-                 for supervision in alignments:
-                     # Respect `original_speaker` custom flag: default to True when missing
-                     include_speaker = supervision.speaker and (
-                         not supervision.has_custom("original_speaker") or supervision.custom["original_speaker"]
-                     )
-                     speaker = supervision.speaker if include_speaker else ""
-                     start_ms = round(1000 * supervision.start)
-                     end_ms = round(1000 * supervision.end)
-                     text = supervision.text.strip().replace("\t", " ")
-                     file.write(f"{speaker}\t{start_ms}\t{end_ms}\t{text}\n")
-             else:
-                 file.write("start\tend\ttext\n")
-                 for supervision in alignments:
-                     start_ms = round(1000 * supervision.start)
-                     end_ms = round(1000 * supervision.end)
-                     text = supervision.text.strip().replace("\t", " ")
-                     file.write(f"{start_ms}\t{end_ms}\t{text}\n")
-
-     @classmethod
-     def _write_csv(
-         cls,
-         alignments: List[Supervision],
-         output_path: Pathlike,
-         include_speaker_in_text: bool = True,
-     ) -> None:
-         """
-         Write caption to CSV format.
-
-         Format: speaker,start,end,text (with speaker)
-         or: start,end,text (without speaker)
-
-         Args:
-             alignments: List of supervision segments to write
-             output_path: Path to output CSV file
-             include_speaker_in_text: Whether to include speaker column
-         """
-         import csv
-
-         with open(output_path, "w", encoding="utf-8", newline="") as file:
-             if include_speaker_in_text:
-                 writer = csv.writer(file)
-                 writer.writerow(["speaker", "start", "end", "text"])
-                 for supervision in alignments:
-                     include_speaker = supervision.speaker and (
-                         not supervision.has_custom("original_speaker") or supervision.custom["original_speaker"]
-                     )
-                     speaker = supervision.speaker if include_speaker else ""
-                     start_ms = round(1000 * supervision.start)
-                     end_ms = round(1000 * supervision.end)
-                     text = supervision.text.strip()
-                     writer.writerow([speaker, start_ms, end_ms, text])
-             else:
-                 writer = csv.writer(file)
-                 writer.writerow(["start", "end", "text"])
-                 for supervision in alignments:
-                     start_ms = round(1000 * supervision.start)
-                     end_ms = round(1000 * supervision.end)
-                     text = supervision.text.strip()
-                     writer.writerow([start_ms, end_ms, text])
-
-     @classmethod
-     def _write_aud(
-         cls,
-         alignments: List[Supervision],
-         output_path: Pathlike,
-         include_speaker_in_text: bool = True,
-     ) -> None:
-         """
-         Write caption to AUD format.
-
-         Format: start\tend\t[[speaker]]text
-         or: start\tend\ttext (without speaker)
-
-         Args:
-             alignments: List of supervision segments to write
-             output_path: Path to output AUD file
-             include_speaker_in_text: Whether to include speaker in [[brackets]]
-         """
-         with open(output_path, "w", encoding="utf-8") as file:
-             for supervision in alignments:
-                 start = supervision.start
-                 end = supervision.end
-                 text = supervision.text.strip().replace("\t", " ")
-
-                 # Respect `original_speaker` custom flag when adding speaker prefix
-                 if (
-                     include_speaker_in_text
-                     and supervision.speaker
-                     and (not supervision.has_custom("original_speaker") or supervision.custom["original_speaker"])
-                 ):
-                     text = f"[[{supervision.speaker}]]{text}"
-
-                 file.write(f"{start}\t{end}\t{text}\n")
-
-     @classmethod
-     def _write_sbv(
-         cls,
-         alignments: List[Supervision],
-         output_path: Pathlike,
-         include_speaker_in_text: bool = True,
-     ) -> None:
-         """
-         Write caption to SubViewer (SBV) format.
-
-         Format:
-             0:00:00.000,0:00:02.000
-             Text line 1
-
-             0:00:02.000,0:00:04.000
-             Text line 2
-
-         Args:
-             alignments: List of supervision segments to write
-             output_path: Path to output SBV file
-             include_speaker_in_text: Whether to include speaker in text
-         """
-         with open(output_path, "w", encoding="utf-8") as file:
-             for i, supervision in enumerate(alignments):
-                 # Format timestamps as H:MM:SS.mmm
-                 start_h = int(supervision.start // 3600)
-                 start_m = int((supervision.start % 3600) // 60)
-                 start_s = int(supervision.start % 60)
-                 start_ms = int((supervision.start % 1) * 1000)
-
-                 end_h = int(supervision.end // 3600)
-                 end_m = int((supervision.end % 3600) // 60)
-                 end_s = int(supervision.end % 60)
-                 end_ms = int((supervision.end % 1) * 1000)
-
-                 start_time = f"{start_h}:{start_m:02d}:{start_s:02d}.{start_ms:03d}"
-                 end_time = f"{end_h}:{end_m:02d}:{end_s:02d}.{end_ms:03d}"
-
-                 # Write timestamp line
-                 file.write(f"{start_time},{end_time}\n")
-
-                 # Write text (with optional speaker). Respect `original_speaker` custom flag.
-                 text = supervision.text.strip()
-                 if (
-                     include_speaker_in_text
-                     and supervision.speaker
-                     and (not supervision.has_custom("original_speaker") or supervision.custom["original_speaker"])
-                 ):
-                     text = f"{supervision.speaker}: {text}"
-
-                 file.write(f"{text}\n")
-
-                 # Add blank line between entries (except after last one)
-                 if i < len(alignments) - 1:
-                     file.write("\n")
-
-     @classmethod
-     def _parse_caption(
-         cls, caption: Pathlike, format: Optional[OutputCaptionFormat], normalize_text: Optional[bool] = False
-     ) -> List[Supervision]:
-         """
-         Parse caption using pysubs2.
-
-         Args:
-             caption: Caption file path or content
-             format: Caption format
-             normalize_text: Whether to normalize text
-
-         Returns:
-             List of Supervision objects
-         """
-         import pysubs2
-
-         try:
-             subs: pysubs2.SSAFile = pysubs2.load(
-                 caption, encoding="utf-8", format_=format if format != "auto" else None
-             )  # file
-         except IOError:
-             try:
-                 subs: pysubs2.SSAFile = pysubs2.SSAFile.from_string(
-                     caption, format_=format if format != "auto" else None
-                 )  # str
-             except Exception as e:
-                 del e
-                 subs: pysubs2.SSAFile = pysubs2.load(caption, encoding="utf-8")  # auto detect format
-
-         # Parse supervisions
-         supervisions = []
-         for event in subs.events:
-             if normalize_text:
-                 event.text = normalize_text_fn(event.text)
-             speaker, text = parse_speaker_text(event.text)
-             supervisions.append(
-                 Supervision(
-                     text=text,
-                     speaker=speaker or event.name,
-                     start=event.start / 1000.0 if event.start is not None else None,
-                     duration=(event.end - event.start) / 1000.0 if event.end is not None else None,
-                 )
-             )
-         return supervisions
-
      def __repr__(self) -> str:
          """String representation of Caption."""
          lang = f"lang={self.language}" if self.language else "lang=unknown"