PyPI - lattifai - Versions diffs - 1.2.1__py3-none-any.whl → 1.2.2__py3-none-any.whl - Mend

lattifai 1.2.1__py3-none-any.whl → 1.2.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (60) hide show

lattifai/alignment/__init__.py +10 -1
lattifai/alignment/lattice1_aligner.py +66 -58
lattifai/alignment/punctuation.py +38 -0
lattifai/alignment/sentence_splitter.py +152 -21
lattifai/alignment/text_align.py +440 -0
lattifai/alignment/tokenizer.py +82 -40
lattifai/caption/__init__.py +82 -6
lattifai/caption/caption.py +335 -1141
lattifai/caption/formats/__init__.py +199 -0
lattifai/caption/formats/base.py +211 -0
lattifai/caption/{gemini_reader.py → formats/gemini.py} +320 -60
lattifai/caption/formats/json.py +194 -0
lattifai/caption/formats/lrc.py +309 -0
lattifai/caption/formats/nle/__init__.py +9 -0
lattifai/caption/formats/nle/audition.py +561 -0
lattifai/caption/formats/nle/avid.py +423 -0
lattifai/caption/formats/nle/fcpxml.py +549 -0
lattifai/caption/formats/nle/premiere.py +589 -0
lattifai/caption/formats/pysubs2.py +642 -0
lattifai/caption/formats/sbv.py +147 -0
lattifai/caption/formats/tabular.py +338 -0
lattifai/caption/formats/textgrid.py +193 -0
lattifai/caption/formats/ttml.py +652 -0
lattifai/caption/formats/vtt.py +469 -0
lattifai/caption/parsers/__init__.py +9 -0
lattifai/caption/{text_parser.py → parsers/text_parser.py} +4 -2
lattifai/caption/standardize.py +636 -0
lattifai/caption/utils.py +474 -0
lattifai/cli/__init__.py +2 -1
lattifai/cli/caption.py +108 -1
lattifai/cli/transcribe.py +1 -1
lattifai/cli/youtube.py +4 -1
lattifai/client.py +33 -113
lattifai/config/__init__.py +11 -1
lattifai/config/alignment.py +7 -0
lattifai/config/caption.py +267 -23
lattifai/config/media.py +20 -0
lattifai/diarization/__init__.py +41 -1
lattifai/mixin.py +27 -15
lattifai/transcription/base.py +6 -1
lattifai/transcription/lattifai.py +19 -54
lattifai/utils.py +7 -13
lattifai/workflow/__init__.py +28 -4
lattifai/workflow/file_manager.py +2 -5
lattifai/youtube/__init__.py +43 -0
lattifai/youtube/client.py +1170 -0
lattifai/youtube/types.py +23 -0
lattifai-1.2.2.dist-info/METADATA +615 -0
lattifai-1.2.2.dist-info/RECORD +76 -0
{lattifai-1.2.1.dist-info → lattifai-1.2.2.dist-info}/entry_points.txt +1 -2
lattifai/caption/gemini_writer.py +0 -173
lattifai/cli/app_installer.py +0 -142
lattifai/cli/server.py +0 -44
lattifai/server/app.py +0 -427
lattifai/workflow/youtube.py +0 -577
lattifai-1.2.1.dist-info/METADATA +0 -1134
lattifai-1.2.1.dist-info/RECORD +0 -58
{lattifai-1.2.1.dist-info → lattifai-1.2.2.dist-info}/WHEEL +0 -0
{lattifai-1.2.1.dist-info → lattifai-1.2.2.dist-info}/licenses/LICENSE +0 -0
{lattifai-1.2.1.dist-info → lattifai-1.2.2.dist-info}/top_level.txt +0 -0

lattifai/caption/{gemini_reader.py → formats/gemini.py} RENAMED Viewed

@@ -1,13 +1,20 @@
-"""Reader for YouTube transcript files with speaker labels and timestamps."""
+"""Gemini/YouTube transcript format handler.
+Handles YouTube/Gemini markdown transcript format with timestamps like [HH:MM:SS].
+Supports reading and writing transcript files with speaker labels, events, and sections.
+"""
 import re
+import tempfile
 from dataclasses import dataclass
 from pathlib import Path
-from typing import List, Optional
+from typing import Dict, List, Optional, Union
 from lhotse.utils import Pathlike
-from .supervision import Supervision
+from ..supervision import Supervision
+from . import register_format
+from .base import FormatHandler
 @dataclass
@@ -46,6 +53,8 @@ class GeminiReader:
     INLINE_TIMESTAMP_END_PATTERN = re.compile(r"^(.+?)\s*\[(?:(\d{1,2}):(\d{2}):(\d{2})|(\d{1,2}):(\d{2}))\]$")
     # Timestamp at the beginning indicates start time
     INLINE_TIMESTAMP_START_PATTERN = re.compile(r"^\[(?:(\d{1,2}):(\d{2}):(\d{2})|(\d{1,2}):(\d{2}))\]\s*(.+)$")
+    # Standalone timestamp on its own line
+    STANDALONE_TIMESTAMP_PATTERN = re.compile(r"^\[(?:(\d{1,2}):(\d{2}):(\d{2})|(\d{1,2}):(\d{2}))\]$")
     # New patterns for YouTube link format: [[MM:SS](URL&t=seconds)]
     YOUTUBE_SECTION_PATTERN = re.compile(r"^##\s*\[\[(\d{1,2}):(\d{2})\]\([^)]*&t=(\d+)\)\]\s*(.+)$")
@@ -76,31 +85,40 @@ class GeminiReader:
     @classmethod
     def read(
         cls,
-        transcript_path: Pathlike,
+        transcript_path: Union[Pathlike, str],
         include_events: bool = False,
         include_sections: bool = False,
     ) -> List[GeminiSegment]:
-        """Parse YouTube transcript file and return list of transcript segments.
+        """Parse YouTube transcript file or content and return list of transcript segments.
         Args:
-                transcript_path: Path to the transcript file
+                transcript_path: Path to the transcript file or raw string content
                 include_events: Whether to include event descriptions like [Applause]
                 include_sections: Whether to include section headers
         Returns:
                 List of GeminiSegment objects with all metadata
         """
-        transcript_path = Path(transcript_path).expanduser().resolve()
-        if not transcript_path.exists():
-            raise FileNotFoundError(f"Transcript file not found: {transcript_path}")
+        content = ""
+        # Check if transcript_path is a multi-line string (content) or a short string (likely path)
+        is_content = "\n" in str(transcript_path) or len(str(transcript_path)) > 1000
+        if is_content:
+            content = str(transcript_path)
+        else:
+            p = Path(transcript_path).expanduser().resolve()
+            if p.exists() and p.is_file():
+                with open(p, "r", encoding="utf-8") as f:
+                    content = f.read()
+            else:
+                # Fallback: treat as content if path doesn't exist
+                content = str(transcript_path)
         segments: List[GeminiSegment] = []
         current_section = None
         current_speaker = None
-        with open(transcript_path, "r", encoding="utf-8") as f:
-            lines = f.readlines()
+        lines = content.splitlines()
         for line_num, line in enumerate(lines, start=1):
             line = line.strip()
             if not line:
@@ -130,11 +148,10 @@ class GeminiReader:
                     )
                 continue
-            # Parse YouTube format section headers: ## [[MM:SS](URL&t=seconds)] Title
+            # Parse YouTube format section headers
             youtube_section_match = cls.YOUTUBE_SECTION_PATTERN.match(line)
             if youtube_section_match:
                 minutes, seconds, url_seconds, section_title = youtube_section_match.groups()
-                # Use the URL seconds for more accuracy
                 timestamp = cls.parse_timestamp(url_seconds)
                 current_section = section_title.strip()
                 if include_sections:
@@ -149,21 +166,38 @@ class GeminiReader:
                     )
                 continue
-            # Parse event descriptions [event] [HH:MM:SS] or [MM:SS]
+            # Parse standalone timestamp [HH:MM:SS]
+            # Often used as an end timestamp for the preceding block
+            standalone_match = cls.STANDALONE_TIMESTAMP_PATTERN.match(line)
+            if standalone_match:
+                groups = standalone_match.groups()
+                if groups[0] is not None:
+                    ts = cls.parse_timestamp(groups[0], groups[1], groups[2])
+                else:
+                    ts = cls.parse_timestamp(groups[3], groups[4])
+                # Assign to previous dialogue segment if it doesn't have an end time
+                if segments and segments[-1].segment_type == "dialogue":
+                    if segments[-1].end_timestamp is None:
+                        segments[-1].end_timestamp = ts
+                    elif segments[-1].timestamp is None:
+                        # If it has an end but no start, this standalone might be its start?
+                        # Usually standalone is end, but let's be flexible
+                        segments[-1].timestamp = ts
+                continue
+            # Parse event descriptions [event] [HH:MM:SS]
             event_match = cls.EVENT_PATTERN.match(line)
             if event_match:
                 groups = event_match.groups()
                 event_text = groups[0]
-                # Parse timestamp - groups: (event_text, hours/minutes, minutes/seconds, seconds_optional)
                 hours_or_minutes = groups[1]
                 minutes_or_seconds = groups[2]
                 seconds_optional = groups[3]
                 if seconds_optional is not None:
-                    # HH:MM:SS format
                     timestamp = cls.parse_timestamp(hours_or_minutes, minutes_or_seconds, seconds_optional)
                 else:
-                    # MM:SS format
                     timestamp = cls.parse_timestamp(hours_or_minutes, minutes_or_seconds)
                 if include_events and timestamp is not None:
@@ -178,15 +212,13 @@ class GeminiReader:
                     )
                 continue
-            # Parse speaker dialogue: **Speaker:** Text [HH:MM:SS] or [MM:SS]
+            # Parse speaker dialogue: **Speaker:** Text [HH:MM:SS]
             speaker_match = cls.SPEAKER_PATTERN.match(line)
             if speaker_match:
                 speaker, text_with_timestamp = speaker_match.groups()
                 current_speaker = speaker.strip()
-                # Check for timestamp at the beginning (start time)
                 start_match = cls.INLINE_TIMESTAMP_START_PATTERN.match(text_with_timestamp.strip())
-                # Check for timestamp at the end (end time)
                 end_match = cls.INLINE_TIMESTAMP_END_PATTERN.match(text_with_timestamp.strip())
                 youtube_match = cls.YOUTUBE_INLINE_PATTERN.match(text_with_timestamp.strip())
@@ -196,24 +228,21 @@ class GeminiReader:
                 if start_match:
                     groups = start_match.groups()
-                    # Parse timestamp - can be HH:MM:SS (groups 0,1,2) or MM:SS (groups 3,4)
-                    if groups[0] is not None:  # HH:MM:SS format
+                    if groups[0] is not None:
                         start_timestamp = cls.parse_timestamp(groups[0], groups[1], groups[2])
-                    elif groups[3] is not None:  # MM:SS format
+                    elif groups[3] is not None:
                         start_timestamp = cls.parse_timestamp(groups[3], groups[4])
-                    text = groups[5]  # Text is after timestamp
+                    text = groups[5]
                 elif end_match:
                     groups = end_match.groups()
-                    text = groups[0]  # Text is before timestamp
-                    # Parse timestamp - can be HH:MM:SS (groups 1,2,3) or MM:SS (groups 4,5)
-                    if groups[1] is not None:  # HH:MM:SS format
+                    text = groups[0]
+                    if groups[1] is not None:
                         end_timestamp = cls.parse_timestamp(groups[1], groups[2], groups[3])
-                    elif groups[4] is not None:  # MM:SS format
+                    elif groups[4] is not None:
                         end_timestamp = cls.parse_timestamp(groups[4], groups[5])
                 elif youtube_match:
                     groups = youtube_match.groups()
                     text = groups[0]
-                    # Extract seconds from URL parameter (treat as end time)
                     url_seconds = groups[3]
                     end_timestamp = cls.parse_timestamp(url_seconds)
@@ -228,52 +257,41 @@ class GeminiReader:
                         line_number=line_num,
                     )
                 )
-                current_speaker = None  # Reset speaker after use
+                current_speaker = None
                 continue
-            # Parse plain text with timestamp (check both positions)
+            # Parse plain text (might contain inline timestamp or be a continuation)
             start_match = cls.INLINE_TIMESTAMP_START_PATTERN.match(line)
             end_match = cls.INLINE_TIMESTAMP_END_PATTERN.match(line)
             youtube_inline_match = cls.YOUTUBE_INLINE_PATTERN.match(line)
-            start_timestamp = None
-            end_timestamp = None
-            text = None
             if start_match:
                 groups = start_match.groups()
-                # Parse timestamp - can be HH:MM:SS (groups 0,1,2) or MM:SS (groups 3,4)
-                if groups[0] is not None:  # HH:MM:SS format
+                if groups[0] is not None:
                     start_timestamp = cls.parse_timestamp(groups[0], groups[1], groups[2])
-                elif groups[3] is not None:  # MM:SS format
+                else:
                     start_timestamp = cls.parse_timestamp(groups[3], groups[4])
-                text = groups[5]  # Text is after timestamp
+                text = groups[5]
                 segments.append(
                     GeminiSegment(
                         text=text.strip(),
                         timestamp=start_timestamp,
-                        end_timestamp=None,
                         speaker=current_speaker,
                         section=current_section,
                         segment_type="dialogue",
                         line_number=line_num,
                     )
                 )
-                continue
             elif end_match:
                 groups = end_match.groups()
-                text = groups[0]  # Text is before timestamp
-                # Parse timestamp - can be HH:MM:SS (groups 1,2,3) or MM:SS (groups 4,5)
-                if groups[1] is not None:  # HH:MM:SS format
+                text = groups[0]
+                if groups[1] is not None:
                     end_timestamp = cls.parse_timestamp(groups[1], groups[2], groups[3])
-                elif groups[4] is not None:  # MM:SS format
+                else:
                     end_timestamp = cls.parse_timestamp(groups[4], groups[5])
                 segments.append(
                     GeminiSegment(
                         text=text.strip(),
-                        timestamp=None,
                         end_timestamp=end_timestamp,
                         speaker=current_speaker,
                         section=current_section,
@@ -281,30 +299,40 @@ class GeminiReader:
                         line_number=line_num,
                     )
                 )
-                continue
             elif youtube_inline_match:
                 groups = youtube_inline_match.groups()
                 text = groups[0]
-                # Extract seconds from URL parameter (treat as end time)
                 url_seconds = groups[3]
-                end_timestamp = cls.parse_timestamp(url_seconds)
                 segments.append(
                     GeminiSegment(
                         text=text.strip(),
-                        timestamp=None,
-                        end_timestamp=end_timestamp,
+                        end_timestamp=cls.parse_timestamp(url_seconds),
                         speaker=current_speaker,
                         section=current_section,
                         segment_type="dialogue",
                         line_number=line_num,
                     )
                 )
-                continue
+            else:
+                # Plain text without any recognized markers
+                # If it follows a speaker line or another dialogue line without end timestamp,
+                # merge it into the last segment to support multi-line text blocks.
+                if segments and segments[-1].segment_type == "dialogue" and segments[-1].end_timestamp is None:
+                    segments[-1].text += " " + line.strip()
+                else:
+                    # Skip markdown headers and other formatting
+                    if line.startswith("#"):
+                        continue
-            # Skip markdown headers and other formatting
-            if line.startswith("#"):
-                continue
+                    segments.append(
+                        GeminiSegment(
+                            text=line.strip(),
+                            speaker=current_speaker,
+                            section=current_section,
+                            segment_type="dialogue",
+                            line_number=line_num,
+                        )
+                    )
         return segments
@@ -315,6 +343,8 @@ class GeminiReader:
         merge_consecutive: bool = False,
         min_duration: float = 0.1,
         merge_max_gap: float = 2.0,
+        normalize_text: bool = True,
+        **kwargs,
     ) -> List[Supervision]:
         """Extract text segments for forced alignment.
@@ -395,7 +425,7 @@ class GeminiReader:
                 if segment.segment_type == "dialogue":
                     supervisions.append(
                         Supervision(
-                            text=segment.text,
+                            text=segment.text.strip(),
                             start=seg_start,
                             duration=duration,
                             id=f"segment_{i:05d}",
@@ -460,3 +490,233 @@ class GeminiReader:
 __all__ = ["GeminiReader", "GeminiSegment"]
+class GeminiWriter:
+    """Writer for updating YouTube transcript timestamps based on alignment results."""
+    @staticmethod
+    def format_timestamp(seconds: float) -> str:
+        """Convert seconds to [HH:MM:SS] format."""
+        hours = int(seconds // 3600)
+        minutes = int((seconds % 3600) // 60)
+        secs = int(seconds % 60)
+        return f"[{hours:02d}:{minutes:02d}:{secs:02d}]"
+    @classmethod
+    def update_timestamps(
+        cls,
+        original_transcript: Pathlike,
+        aligned_supervisions: List[Supervision],
+        output_path: Pathlike,
+        timestamp_mapping: Optional[Dict[int, float]] = None,
+    ) -> Pathlike:
+        """Update transcript file with corrected timestamps from alignment.
+        Args:
+                original_transcript: Path to the original transcript file
+                aligned_supervisions: List of aligned Supervision objects with corrected timestamps
+                output_path: Path to write the updated transcript
+                timestamp_mapping: Optional manual mapping from line_number to new timestamp
+        Returns:
+                Path to the output file
+        """
+        original_path = Path(original_transcript)
+        output_path = Path(output_path)
+        # Read original file
+        with open(original_path, "r", encoding="utf-8") as f:
+            lines = f.readlines()
+        # Parse original segments to get line numbers
+        original_segments = GeminiReader.read(original_transcript, include_events=True, include_sections=True)
+        # Create mapping from line number to new timestamp
+        if timestamp_mapping is None:
+            timestamp_mapping = cls._create_timestamp_mapping(original_segments, aligned_supervisions)
+        # Update timestamps in lines
+        updated_lines = []
+        for line_num, line in enumerate(lines, start=1):
+            if line_num in timestamp_mapping:
+                new_timestamp = timestamp_mapping[line_num]
+                updated_line = cls._replace_timestamp(line, new_timestamp)
+                updated_lines.append(updated_line)
+            else:
+                updated_lines.append(line)
+        # Write updated content
+        output_path.parent.mkdir(parents=True, exist_ok=True)
+        with open(output_path, "w", encoding="utf-8") as f:
+            f.writelines(updated_lines)
+        return output_path
+    @classmethod
+    def _create_timestamp_mapping(
+        cls, original_segments: List[GeminiSegment], aligned_supervisions: List[Supervision]
+    ) -> Dict[int, float]:
+        """Create mapping from line numbers to new timestamps based on alignment.
+        This performs text matching between original segments and aligned supervisions
+        to determine which timestamps should be updated.
+        """
+        mapping = {}
+        # Create a simple text-based matching
+        dialogue_segments = [s for s in original_segments if s.segment_type == "dialogue"]
+        # Try to match based on text content
+        for aligned_sup in aligned_supervisions:
+            aligned_text = aligned_sup.text.strip()
+            # Find best matching original segment
+            best_match = None
+            best_score = 0
+            for orig_seg in dialogue_segments:
+                orig_text = orig_seg.text.strip()
+                # Simple text similarity (could be improved with fuzzy matching)
+                if aligned_text == orig_text:
+                    best_match = orig_seg
+                    best_score = 1.0
+                    break
+                elif aligned_text in orig_text or orig_text in aligned_text:
+                    score = min(len(aligned_text), len(orig_text)) / max(len(aligned_text), len(orig_text))
+                    if score > best_score:
+                        best_score = score
+                        best_match = orig_seg
+            # If we found a good match, update the mapping
+            if best_match and best_score > 0.8:
+                mapping[best_match.line_number] = aligned_sup.start
+        return mapping
+    @classmethod
+    def _replace_timestamp(cls, line: str, new_timestamp: float) -> str:
+        """Replace timestamp in a line with new timestamp."""
+        new_ts_str = cls.format_timestamp(new_timestamp)
+        # Replace timestamp patterns
+        # Pattern 1: [HH:MM:SS] at the end or in brackets
+        line = re.sub(r"\[\d{2}:\d{2}:\d{2}\]", new_ts_str, line)
+        return line
+    @classmethod
+    def write_aligned_transcript(
+        cls,
+        aligned_supervisions: List[Supervision],
+        output_path: Pathlike,
+        include_word_timestamps: bool = False,
+    ) -> Pathlike:
+        """Write a new transcript file from aligned supervisions.
+        This creates a simplified transcript format with accurate timestamps.
+        Args:
+                aligned_supervisions: List of aligned Supervision objects
+                output_path: Path to write the transcript
+                include_word_timestamps: Whether to include word-level timestamps if available
+        Returns:
+                Path to the output file
+        """
+        output_path = Path(output_path)
+        output_path.parent.mkdir(parents=True, exist_ok=True)
+        with open(output_path, "w", encoding="utf-8") as f:
+            f.write("# Aligned Transcript\n\n")
+            for i, sup in enumerate(aligned_supervisions):
+                # Write segment with timestamp
+                start_ts = cls.format_timestamp(sup.start)
+                f.write(f"{start_ts} {sup.text}\n")
+                # Optionally write word-level timestamps
+                if include_word_timestamps and hasattr(sup, "alignment") and sup.alignment:
+                    if "word" in sup.alignment:
+                        f.write("  Words: ")
+                        word_parts = []
+                        for word_info in sup.alignment["word"]:
+                            word_ts = cls.format_timestamp(word_info["start"])
+                            word_parts.append(f'{word_info["symbol"]}{word_ts}')
+                        f.write(" ".join(word_parts))
+                        f.write("\n")
+                f.write("\n")
+        return output_path
+    @classmethod
+    def write(
+        cls,
+        supervisions: List[Supervision],
+        output_path: Pathlike,
+        **kwargs,
+    ) -> Path:
+        """Alias for write_aligned_transcript for Caption API compatibility."""
+        return Path(cls.write_aligned_transcript(supervisions, output_path, **kwargs))
+    @classmethod
+    def to_bytes(
+        cls,
+        supervisions: List[Supervision],
+        **kwargs,
+    ) -> bytes:
+        """Convert aligned supervisions to Gemini format bytes."""
+        with tempfile.NamedTemporaryFile(suffix=".md", delete=False) as tmp:
+            tmp_path = Path(tmp.name)
+        try:
+            cls.write_aligned_transcript(supervisions, tmp_path, **kwargs)
+            return tmp_path.read_bytes()
+        finally:
+            tmp_path.unlink(missing_ok=True)
+__all__ = ["GeminiWriter"]
+@register_format("gemini")
+class GeminiFormat(FormatHandler):
+    """YouTube/Gemini markdown transcript format."""
+    extensions = [".md"]
+    description = "YouTube/Gemini transcript format with timestamps"
+    @classmethod
+    def can_read(cls, path) -> bool:
+        """Check if this is a Gemini format file."""
+        path_str = str(path).lower()
+        return (
+            path_str.endswith("gemini.md")
+            or path_str.endswith("gemini3.md")
+            or ("gemini" in path_str and path_str.endswith(".md"))
+        )
+    @classmethod
+    def read(cls, path: Pathlike, **kwargs) -> List[Supervision]:
+        """Read Gemini format file."""
+        return GeminiReader.extract_for_alignment(path, **kwargs)
+    @classmethod
+    def write(
+        cls,
+        supervisions: List[Supervision],
+        output_path: Pathlike,
+        **kwargs,
+    ) -> Path:
+        """Write Gemini format file."""
+        return GeminiWriter.write(supervisions, output_path, **kwargs)
+    @classmethod
+    def to_bytes(
+        cls,
+        supervisions: List[Supervision],
+        **kwargs,
+    ) -> bytes:
+        """Convert to Gemini format bytes."""
+        return GeminiWriter.to_bytes(supervisions, **kwargs)

lattifai 1.2.1__py3-none-any.whl → 1.2.2__py3-none-any.whl

lattifai 1.2.1__py3-none-any.whl → 1.2.2__py3-none-any.whl