lattifai 1.2.0__py3-none-any.whl → 1.2.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64)
  1. lattifai/__init__.py +0 -24
  2. lattifai/alignment/__init__.py +10 -1
  3. lattifai/alignment/lattice1_aligner.py +66 -58
  4. lattifai/alignment/lattice1_worker.py +1 -6
  5. lattifai/alignment/punctuation.py +38 -0
  6. lattifai/alignment/segmenter.py +1 -1
  7. lattifai/alignment/sentence_splitter.py +350 -0
  8. lattifai/alignment/text_align.py +440 -0
  9. lattifai/alignment/tokenizer.py +91 -220
  10. lattifai/caption/__init__.py +82 -6
  11. lattifai/caption/caption.py +335 -1143
  12. lattifai/caption/formats/__init__.py +199 -0
  13. lattifai/caption/formats/base.py +211 -0
  14. lattifai/caption/formats/gemini.py +722 -0
  15. lattifai/caption/formats/json.py +194 -0
  16. lattifai/caption/formats/lrc.py +309 -0
  17. lattifai/caption/formats/nle/__init__.py +9 -0
  18. lattifai/caption/formats/nle/audition.py +561 -0
  19. lattifai/caption/formats/nle/avid.py +423 -0
  20. lattifai/caption/formats/nle/fcpxml.py +549 -0
  21. lattifai/caption/formats/nle/premiere.py +589 -0
  22. lattifai/caption/formats/pysubs2.py +642 -0
  23. lattifai/caption/formats/sbv.py +147 -0
  24. lattifai/caption/formats/tabular.py +338 -0
  25. lattifai/caption/formats/textgrid.py +193 -0
  26. lattifai/caption/formats/ttml.py +652 -0
  27. lattifai/caption/formats/vtt.py +469 -0
  28. lattifai/caption/parsers/__init__.py +9 -0
  29. lattifai/caption/{text_parser.py → parsers/text_parser.py} +4 -2
  30. lattifai/caption/standardize.py +636 -0
  31. lattifai/caption/utils.py +474 -0
  32. lattifai/cli/__init__.py +2 -1
  33. lattifai/cli/caption.py +108 -1
  34. lattifai/cli/transcribe.py +4 -9
  35. lattifai/cli/youtube.py +4 -1
  36. lattifai/client.py +48 -84
  37. lattifai/config/__init__.py +11 -1
  38. lattifai/config/alignment.py +9 -2
  39. lattifai/config/caption.py +267 -23
  40. lattifai/config/media.py +20 -0
  41. lattifai/diarization/__init__.py +41 -1
  42. lattifai/mixin.py +36 -18
  43. lattifai/transcription/base.py +6 -1
  44. lattifai/transcription/lattifai.py +19 -54
  45. lattifai/utils.py +81 -13
  46. lattifai/workflow/__init__.py +28 -4
  47. lattifai/workflow/file_manager.py +2 -5
  48. lattifai/youtube/__init__.py +43 -0
  49. lattifai/youtube/client.py +1170 -0
  50. lattifai/youtube/types.py +23 -0
  51. lattifai-1.2.2.dist-info/METADATA +615 -0
  52. lattifai-1.2.2.dist-info/RECORD +76 -0
  53. {lattifai-1.2.0.dist-info → lattifai-1.2.2.dist-info}/entry_points.txt +1 -2
  54. lattifai/caption/gemini_reader.py +0 -371
  55. lattifai/caption/gemini_writer.py +0 -173
  56. lattifai/cli/app_installer.py +0 -142
  57. lattifai/cli/server.py +0 -44
  58. lattifai/server/app.py +0 -427
  59. lattifai/workflow/youtube.py +0 -577
  60. lattifai-1.2.0.dist-info/METADATA +0 -1133
  61. lattifai-1.2.0.dist-info/RECORD +0 -57
  62. {lattifai-1.2.0.dist-info → lattifai-1.2.2.dist-info}/WHEEL +0 -0
  63. {lattifai-1.2.0.dist-info → lattifai-1.2.2.dist-info}/licenses/LICENSE +0 -0
  64. {lattifai-1.2.0.dist-info → lattifai-1.2.2.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,194 @@
+ """JSON format handler for structured caption data.
+
+ JSON is the most flexible format for storing caption data, supporting:
+ - Segment-level timing (start, end)
+ - Word-level alignment (words array with per-word timestamps)
+ - Speaker labels
+ - Custom metadata
+
+ Example JSON structure:
+ ```json
+ [
+     {
+         "text": "Hello world",
+         "start": 0.0,
+         "end": 2.5,
+         "speaker": "Speaker 1",
+         "words": [
+             {"word": "Hello", "start": 0.0, "end": 0.5},
+             {"word": "world", "start": 0.6, "end": 2.5}
+         ]
+     }
+ ]
+ ```
+ """
+
+ import json
+ from pathlib import Path
+ from typing import List
+
+ from ..parsers.text_parser import normalize_text as normalize_text_fn
+ from ..supervision import Supervision
+ from . import register_format
+ from .base import FormatHandler
+
+
+ @register_format("json")
+ class JSONFormat(FormatHandler):
+     """JSON format for structured caption data.
+
+     Features:
+     - Preserves full segment structure with timing
+     - Supports word-level alignment in 'words' field
+     - Round-trip compatible (read/write preserves all data)
+     - Human-readable with indentation
+
+     Input format (read):
+     - Array of objects with: text, start, duration/end
+     - Optional: speaker, words (array of word timing objects)
+     - Words can have: word, start, duration or end
+
+     Output format (write):
+     - word_level=False: Standard segment output
+     - word_level=True: Includes 'words' array with per-word timestamps
+     """
+
+     extensions = [".json"]
+     description = "JSON - structured caption data with word-level support"
+
+     @classmethod
+     def read(cls, source, normalize_text: bool = True, **kwargs) -> List[Supervision]:
+         """Read JSON format.
+
+         Args:
+             source: File path or JSON string content
+             normalize_text: Whether to normalize text content
+
+         Returns:
+             List of Supervision objects with alignment data if present
+
+         Supports word-level alignment data in the 'words' field.
+         Each word item should have: word, start, duration (or end).
+         """
+         from lhotse.supervision import AlignmentItem
+
+         if cls.is_content(source):
+             data = json.loads(source)
+         else:
+             with open(source, "r", encoding="utf-8") as f:
+                 data = json.load(f)
+
+         supervisions = []
+         for item in data:
+             text = item.get("text", "")
+             if normalize_text:
+                 text = normalize_text_fn(text)
+
+             # Parse word-level alignment if present
+             alignment = None
+             if "words" in item and item["words"]:
+                 word_alignments = []
+                 for word_item in item["words"]:
+                     word_text = word_item.get("word", "")
+                     word_start = word_item.get("start", 0)
+                     # Support both 'duration' and 'end' fields
+                     if "duration" in word_item:
+                         word_duration = word_item["duration"]
+                     elif "end" in word_item:
+                         word_duration = word_item["end"] - word_start
+                     else:
+                         word_duration = 0
+                     word_alignments.append(AlignmentItem(symbol=word_text, start=word_start, duration=word_duration))
+                 if word_alignments:
+                     alignment = {"word": word_alignments}
+
+             # Support both 'duration' and 'end' fields for segment timing
+             start = item.get("start", 0)
+             if "duration" in item:
+                 duration = item["duration"]
+             elif "end" in item:
+                 duration = item["end"] - start
+             else:
+                 duration = 0
+
+             supervisions.append(
+                 Supervision(
+                     text=text,
+                     start=start,
+                     duration=duration,
+                     speaker=item.get("speaker"),
+                     alignment=alignment,
+                 )
+             )
+
+         return supervisions
+
+     @classmethod
+     def write(
+         cls,
+         supervisions: List[Supervision],
+         output_path,
+         include_speaker: bool = True,
+         word_level: bool = False,
+         **kwargs,
+     ) -> Path:
+         """Write JSON format.
+
+         Args:
+             supervisions: List of Supervision objects
+             output_path: Output file path
+             include_speaker: Whether to include speaker field
+             word_level: If True, include 'words' field with word-level timestamps
+
+         Returns:
+             Path to written file
+         """
+         output_path = Path(output_path)
+         content = cls.to_bytes(supervisions, include_speaker=include_speaker, word_level=word_level)
+         output_path.write_bytes(content)
+         return output_path
+
+     @classmethod
+     def to_bytes(
+         cls, supervisions: List[Supervision], include_speaker: bool = True, word_level: bool = False, **kwargs
+     ) -> bytes:
+         """Convert to JSON format bytes.
+
+         Args:
+             supervisions: List of Supervision objects
+             include_speaker: Whether to include speaker field
+             word_level: If True, include 'words' field with word-level timestamps
+
+         Returns:
+             JSON content as UTF-8 encoded bytes
+
+         Note:
+             Unlike other formats (SRT, VTT, LRC) that expand word_level=True to
+             one segment per word, JSON preserves the original structure and adds
+             a 'words' array inside each segment. This allows round-trip compatibility
+             and preserves all timing information.
+         """
+         data = []
+         for sup in supervisions:
+             item = {
+                 "text": sup.text,
+                 "start": sup.start,
+                 "end": sup.end,
+             }
+             if include_speaker and sup.speaker:
+                 item["speaker"] = sup.speaker
+
+             # Add words field when word_level=True and alignment exists
+             if word_level and sup.alignment and "word" in sup.alignment:
+                 item["words"] = [
+                     {
+                         "word": w.symbol,
+                         "start": w.start,
+                         "end": w.start + w.duration,
+                     }
+                     for w in sup.alignment["word"]
+                 ]
+
+             data.append(item)
+
+         return json.dumps(data, ensure_ascii=False, indent=4).encode("utf-8")
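
For reference, a minimal round-trip sketch of the `JSONFormat` handler added above. The import paths (`lattifai.caption.formats.json`, `lattifai.caption.supervision`) follow the relative imports shown in this diff, and the `captions.json` file name is purely illustrative; this is a usage sketch, not confirmed public API.

```python
# Round-trip sketch for the JSONFormat handler shown in the hunk above.
# Assumption: Supervision accepts text/start/duration/speaker keyword arguments,
# as it is constructed that way inside JSONFormat.read in this diff.
from lattifai.caption.formats.json import JSONFormat
from lattifai.caption.supervision import Supervision

segments = [
    Supervision(text="Hello world", start=0.0, duration=2.5, speaker="Speaker 1"),
]

# Write segment-level JSON; word_level=False keeps one object per segment.
out = JSONFormat.write(segments, "captions.json", include_speaker=True)

# Read it back: 'end' is converted to duration, and a 'words' array (if present)
# becomes word-level alignment on the Supervision.
restored = JSONFormat.read(out, normalize_text=False)
assert restored[0].text == "Hello world"
```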
@@ -0,0 +1,309 @@
+ """Enhanced LRC format handler.
+
+ LRC (LyRiCs) is a file format for synchronized song lyrics. Enhanced LRC
+ adds word-level timestamps for karaoke applications.
+
+ Standard LRC:
+     [00:15.20]Hello beautiful world
+
+ Enhanced LRC (word-level):
+     [00:15.20]<00:15.20>Hello <00:15.65>beautiful <00:16.40>world
+
+ Metadata tags:
+     [ar:Artist Name]
+     [ti:Song Title]
+     [al:Album Name]
+     [offset:±milliseconds]
+ """
+
+ import re
+ from pathlib import Path
+ from typing import Dict, List, Optional, Union
+
+ from lhotse.supervision import AlignmentItem
+ from lhotse.utils import Pathlike
+
+ from ...config.caption import KaraokeConfig
+ from ..supervision import Supervision
+ from . import register_format
+ from .base import FormatHandler
+
+
+ @register_format("lrc")
+ class LRCFormat(FormatHandler):
+     """Enhanced LRC format with word-level timing support."""
+
+     extensions = [".lrc"]
+     description = "Enhanced LRC - karaoke lyrics format"
+
+     @classmethod
+     def is_content(cls, source) -> bool:
+         """Check if source is LRC content rather than a file path.
+
+         Overrides base class to also detect LRC content by timestamp pattern.
+         """
+         if not isinstance(source, str):
+             return False
+         # If it has newlines or is very long, it's likely content
+         if "\n" in source or len(source) > 500:
+             return True
+         # LRC-specific: check for timestamp pattern at start
+         if source.strip().startswith("[") and re.match(r"\[\d+:\d+", source):
+             return True
+         return False
+
+     @classmethod
+     def extract_metadata(cls, source: Union[Pathlike, str], **kwargs) -> Dict[str, str]:
+         """Extract LRC metadata tags.
+
+         Extracts standard LRC metadata:
+         - ar: Artist name
+         - ti: Title
+         - al: Album
+         - by: Creator
+         - offset: Time offset in milliseconds
+         - length: Song length
+
+         Returns:
+             Dict with lrc_* prefixed keys for metadata preservation
+         """
+         if cls.is_content(source):
+             content = source
+         else:
+             try:
+                 content = Path(str(source)).read_text(encoding="utf-8")
+             except Exception:
+                 return {}
+
+         metadata = {}
+         # Pattern to match [key:value] metadata tags
+         meta_pattern = re.compile(r"^\[([a-z]+):(.+)\]$", re.IGNORECASE)
+
+         for line in content.split("\n")[:50]:  # Only check first 50 lines
+             line = line.strip()
+             match = meta_pattern.match(line)
+             if match:
+                 key, value = match.groups()
+                 key = key.lower()
+                 # Store with lrc_ prefix to avoid conflicts
+                 if key in ("ar", "ti", "al", "by", "offset", "length", "re", "ve"):
+                     metadata[f"lrc_{key}"] = value.strip()
+
+         return metadata
+
+     @classmethod
+     def read(
+         cls,
+         source,
+         normalize_text: bool = True,
+         **kwargs,
+     ) -> List[Supervision]:
+         """Read LRC file and return list of Supervision objects.
+
+         Parses both standard LRC and enhanced LRC with word-level timestamps.
+
+         Args:
+             source: File path or string content
+             normalize_text: Whether to normalize text (currently unused)
+             **kwargs: Additional options
+
+         Returns:
+             List of Supervision objects with timing and optional word alignment
+         """
+         if cls.is_content(source):
+             content = source
+         else:
+             content = Path(source).read_text(encoding="utf-8")
+
+         supervisions = []
+         # Match line timestamp: [mm:ss.xx] or [mm:ss.xxx]
+         line_pattern = re.compile(r"\[(\d+):(\d+)\.(\d+)\](.+)")
+         # Match word timestamp: <mm:ss.xx> or <mm:ss.xxx>
+         word_pattern = re.compile(r"<(\d+):(\d+)\.(\d+)>([^<]+)")
+
+         for line in content.split("\n"):
+             line = line.strip()
+             # Skip empty lines and metadata
+             if not line or line.startswith("[ar:") or line.startswith("[ti:"):
+                 continue
+             if line.startswith("[al:") or line.startswith("[offset:"):
+                 continue
+             if line.startswith("[by:") or line.startswith("[length:"):
+                 continue
+
+             match = line_pattern.match(line)
+             if match:
+                 mins, secs, frac, text = match.groups()
+                 # Handle centisecond vs millisecond
+                 if len(frac) == 2:
+                     start = int(mins) * 60 + int(secs) + int(frac) / 100
+                 else:
+                     start = int(mins) * 60 + int(secs) + int(frac) / 1000
+
+                 # Extract word-level alignment
+                 words = word_pattern.findall(text)
+                 alignment = None
+                 if words:
+                     alignment = {"word": []}
+                     for w_mins, w_secs, w_frac, w_text in words:
+                         if len(w_frac) == 2:
+                             w_start = int(w_mins) * 60 + int(w_secs) + int(w_frac) / 100
+                         else:
+                             w_start = int(w_mins) * 60 + int(w_secs) + int(w_frac) / 1000
+                         alignment["word"].append(
+                             AlignmentItem(
+                                 symbol=w_text.strip(),
+                                 start=w_start,
+                                 duration=0,  # LRC doesn't store duration
+                             )
+                         )
+                     # Clean text (remove timestamp tags)
+                     text = re.sub(r"<\d+:\d+\.\d+>", "", text)
+
+                 supervisions.append(
+                     Supervision(
+                         text=text.strip(),
+                         start=start,
+                         duration=0,  # Will calculate below
+                         alignment=alignment,
+                     )
+                 )
+
+         # Calculate duration from next segment
+         for i, sup in enumerate(supervisions):
+             if i + 1 < len(supervisions):
+                 sup.duration = supervisions[i + 1].start - sup.start
+             else:
+                 sup.duration = 5.0  # Default 5 seconds for last line
+
+         return supervisions
+
+     @classmethod
+     def write(
+         cls,
+         supervisions: List[Supervision],
+         output_path,
+         include_speaker: bool = True,
+         word_level: bool = False,
+         karaoke_config: Optional[KaraokeConfig] = None,
+         **kwargs,
+     ) -> Path:
+         """Write supervisions to LRC file.
+
+         Args:
+             supervisions: List of Supervision objects to write
+             output_path: Path to output file
+             include_speaker: Whether to include speaker labels in text
+             word_level: Enable word-level output
+             karaoke_config: Karaoke configuration. When provided with enabled=True,
+                 use enhanced LRC with inline timestamps
+             **kwargs: Additional options
+
+         Returns:
+             Path to the written file
+         """
+         output_path = Path(output_path)
+         content = cls.to_bytes(
+             supervisions,
+             include_speaker=include_speaker,
+             word_level=word_level,
+             karaoke_config=karaoke_config,
+             **kwargs,
+         )
+         output_path.write_bytes(content)
+         return output_path
+
+     @classmethod
+     def to_bytes(
+         cls,
+         supervisions: List[Supervision],
+         include_speaker: bool = True,
+         word_level: bool = False,
+         karaoke_config: Optional[KaraokeConfig] = None,
+         metadata: Optional[Dict] = None,
+         **kwargs,
+     ) -> bytes:
+         """Convert supervisions to LRC format bytes.
+
+         Args:
+             supervisions: List of Supervision objects
+             include_speaker: Whether to include speaker labels
+             word_level: Enable word-level output
+             karaoke_config: Karaoke configuration. When provided with enabled=True,
+                 use enhanced LRC with inline timestamps
+             metadata: Optional metadata dict containing lrc_* keys to restore
+
+         Returns:
+             Caption content as bytes
+         """
+         config = karaoke_config or KaraokeConfig(enabled=False)
+         karaoke_enabled = config.enabled
+         lines = []
+
+         # Restore metadata from Caption.metadata (lrc_* keys)
+         if metadata:
+             lrc_meta_keys = ["ar", "ti", "al", "by", "offset", "length", "re", "ve"]
+             for key in lrc_meta_keys:
+                 value = metadata.get(f"lrc_{key}")
+                 if value:
+                     lines.append(f"[{key}:{value}]")
+
+         # Also add karaoke config metadata if enabled
+         if karaoke_enabled:
+             for key, value in config.lrc_metadata.items():
+                 # Avoid duplicates
+                 existing_line = f"[{key}:"
+                 if not any(line.startswith(existing_line) for line in lines):
+                     lines.append(f"[{key}:{value}]")
+
+         if lines:
+             lines.append("")
+
+         for sup in supervisions:
+             if word_level and sup.alignment and "word" in sup.alignment:
+                 word_items = sup.alignment["word"]
+                 if karaoke_enabled:
+                     # Enhanced LRC mode: each word has inline timestamp
+                     # Use first word's timestamp for line timing (more accurate)
+                     line_time = cls._format_time(word_items[0].start, config.lrc_precision)
+                     word_parts = []
+                     for word in word_items:
+                         word_time = cls._format_time(word.start, config.lrc_precision)
+                         word_parts.append(f"<{word_time}>{word.symbol}")
+                     lines.append(f"[{line_time}]{' '.join(word_parts)}")
+                 else:
+                     # Word-per-line mode: each word as separate line
+                     for word in sup.alignment["word"]:
+                         word_time = cls._format_time(word.start, config.lrc_precision)
+                         lines.append(f"[{word_time}]{word.symbol}")
+             else:
+                 # Standard LRC mode: only line timestamp
+                 line_time = cls._format_time(sup.start, config.lrc_precision)
+                 text = sup.text or ""
+                 if cls._should_include_speaker(sup, include_speaker):
+                     text = f"{sup.speaker}: {text}"
+                 lines.append(f"[{line_time}]{text}")
+
+         return "\n".join(lines).encode("utf-8")
+
+     @staticmethod
+     def _format_time(seconds: float, precision: str) -> str:
+         """Format time for LRC timestamp.
+
+         Args:
+             seconds: Time in seconds
+             precision: "centisecond" for [mm:ss.xx] or "millisecond" for [mm:ss.xxx]
+
+         Returns:
+             Formatted time string
+         """
+         if seconds < 0:
+             seconds = 0
+         minutes = int(seconds // 60)
+         secs = seconds % 60
+         if precision == "millisecond":
+             return f"{minutes:02d}:{secs:06.3f}"  # 00:15.200
+         return f"{minutes:02d}:{secs:05.2f}"  # 00:15.23
+
+
+ __all__ = ["LRCFormat"]
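
A short sketch of the three output modes implemented in `LRCFormat.to_bytes` above. It assumes a word-aligned `Supervision` shaped like the one `LRCFormat.read` builds, and that `KaraokeConfig(enabled=True)` with its `lrc_precision` default is a valid construction (the config class itself lives in `lattifai/config/caption.py` and is not shown in this section).

```python
# Sketch of the standard, word-per-line, and enhanced (karaoke) LRC writer paths.
# The Supervision/AlignmentItem construction mirrors what LRCFormat.read produces.
from lhotse.supervision import AlignmentItem

from lattifai.caption.formats.lrc import LRCFormat
from lattifai.caption.supervision import Supervision
from lattifai.config.caption import KaraokeConfig

sup = Supervision(
    text="Hello beautiful world",
    start=15.2,
    duration=1.5,
    alignment={"word": [
        AlignmentItem(symbol="Hello", start=15.2, duration=0.45),
        AlignmentItem(symbol="beautiful", start=15.65, duration=0.75),
        AlignmentItem(symbol="world", start=16.4, duration=0.3),
    ]},
)

# Standard LRC: one "[mm:ss.xx]text" line per segment.
print(LRCFormat.to_bytes([sup]).decode())

# Word-per-line LRC: word_level=True without a karaoke config.
print(LRCFormat.to_bytes([sup], word_level=True).decode())

# Enhanced (karaoke) LRC: inline "<mm:ss.xx>word" tags on a single line.
karaoke = KaraokeConfig(enabled=True)  # assumption: enabled is a constructor kwarg
print(LRCFormat.to_bytes([sup], word_level=True, karaoke_config=karaoke).decode())
```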
@@ -0,0 +1,9 @@
+ """Professional NLE format handlers.
+
+ This module provides format handlers for professional non-linear editing systems
+ and digital audio workstations.
+ """
+
+ from . import audition, avid, fcpxml, premiere
+
+ __all__ = ["audition", "avid", "fcpxml", "premiere"]
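
The submodule imports above are load-bearing: each NLE module presumably registers itself via the same `@register_format` decorator seen in `json.py` and `lrc.py`, so importing `lattifai.caption.formats.nle` is what makes the Audition, Avid, FCPXML, and Premiere handlers discoverable. A rough sketch of how such a decorator-based registry typically works; this is illustrative only, not the actual implementation in `formats/__init__.py`, which is not shown in this section.

```python
# Hypothetical sketch of a decorator-based format registry (not the real one).
from typing import Dict, Type

FORMAT_REGISTRY: Dict[str, Type] = {}

def register_format(name: str):
    """Register a FormatHandler subclass under a short format name."""
    def decorator(cls):
        FORMAT_REGISTRY[name] = cls
        return cls
    return decorator

# Importing a module such as formats.nle.premiere would then populate
# FORMAT_REGISTRY as a side effect, which is why nle/__init__.py imports
# its submodules even though it exports nothing else.
```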