lattifai 0.4.5__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lattifai/__init__.py +61 -47
- lattifai/alignment/__init__.py +6 -0
- lattifai/alignment/lattice1_aligner.py +119 -0
- lattifai/alignment/lattice1_worker.py +185 -0
- lattifai/{tokenizer → alignment}/phonemizer.py +4 -4
- lattifai/alignment/segmenter.py +166 -0
- lattifai/{tokenizer → alignment}/tokenizer.py +244 -169
- lattifai/audio2.py +211 -0
- lattifai/caption/__init__.py +20 -0
- lattifai/caption/caption.py +1275 -0
- lattifai/{io → caption}/gemini_reader.py +30 -30
- lattifai/{io → caption}/gemini_writer.py +17 -17
- lattifai/{io → caption}/supervision.py +4 -3
- lattifai/caption/text_parser.py +145 -0
- lattifai/cli/__init__.py +17 -0
- lattifai/cli/alignment.py +153 -0
- lattifai/cli/caption.py +204 -0
- lattifai/cli/server.py +19 -0
- lattifai/cli/transcribe.py +197 -0
- lattifai/cli/youtube.py +128 -0
- lattifai/client.py +460 -251
- lattifai/config/__init__.py +20 -0
- lattifai/config/alignment.py +73 -0
- lattifai/config/caption.py +178 -0
- lattifai/config/client.py +46 -0
- lattifai/config/diarization.py +67 -0
- lattifai/config/media.py +335 -0
- lattifai/config/transcription.py +84 -0
- lattifai/diarization/__init__.py +5 -0
- lattifai/diarization/lattifai.py +89 -0
- lattifai/errors.py +98 -91
- lattifai/logging.py +116 -0
- lattifai/mixin.py +552 -0
- lattifai/server/app.py +420 -0
- lattifai/transcription/__init__.py +76 -0
- lattifai/transcription/base.py +108 -0
- lattifai/transcription/gemini.py +219 -0
- lattifai/transcription/lattifai.py +103 -0
- lattifai/{workflows → transcription}/prompts/__init__.py +4 -4
- lattifai/types.py +30 -0
- lattifai/utils.py +16 -44
- lattifai/workflow/__init__.py +22 -0
- lattifai/workflow/agents.py +6 -0
- lattifai/{workflows → workflow}/base.py +22 -22
- lattifai/{workflows → workflow}/file_manager.py +239 -215
- lattifai/workflow/youtube.py +564 -0
- lattifai-1.0.0.dist-info/METADATA +736 -0
- lattifai-1.0.0.dist-info/RECORD +52 -0
- {lattifai-0.4.5.dist-info → lattifai-1.0.0.dist-info}/WHEEL +1 -1
- lattifai-1.0.0.dist-info/entry_points.txt +13 -0
- {lattifai-0.4.5.dist-info → lattifai-1.0.0.dist-info}/licenses/LICENSE +1 -1
- lattifai/base_client.py +0 -126
- lattifai/bin/__init__.py +0 -3
- lattifai/bin/agent.py +0 -325
- lattifai/bin/align.py +0 -296
- lattifai/bin/cli_base.py +0 -25
- lattifai/bin/subtitle.py +0 -210
- lattifai/io/__init__.py +0 -42
- lattifai/io/reader.py +0 -85
- lattifai/io/text_parser.py +0 -75
- lattifai/io/utils.py +0 -15
- lattifai/io/writer.py +0 -90
- lattifai/tokenizer/__init__.py +0 -3
- lattifai/workers/__init__.py +0 -3
- lattifai/workers/lattice1_alpha.py +0 -284
- lattifai/workflows/__init__.py +0 -34
- lattifai/workflows/agents.py +0 -10
- lattifai/workflows/gemini.py +0 -167
- lattifai/workflows/prompts/README.md +0 -22
- lattifai/workflows/prompts/gemini/README.md +0 -24
- lattifai/workflows/prompts/gemini/transcription_gem.txt +0 -81
- lattifai/workflows/youtube.py +0 -931
- lattifai-0.4.5.dist-info/METADATA +0 -808
- lattifai-0.4.5.dist-info/RECORD +0 -39
- lattifai-0.4.5.dist-info/entry_points.txt +0 -3
- {lattifai-0.4.5.dist-info → lattifai-1.0.0.dist-info}/top_level.txt +0 -0
|
@@ -1,9 +1,9 @@
|
|
|
1
1
|
"""Reader for YouTube transcript files with speaker labels and timestamps."""
|
|
2
2
|
|
|
3
3
|
import re
|
|
4
|
-
from dataclasses import dataclass
|
|
4
|
+
from dataclasses import dataclass
|
|
5
5
|
from pathlib import Path
|
|
6
|
-
from typing import List, Optional
|
|
6
|
+
from typing import List, Optional
|
|
7
7
|
|
|
8
8
|
from lhotse.utils import Pathlike
|
|
9
9
|
|
|
@@ -18,7 +18,7 @@ class GeminiSegment:
|
|
|
18
18
|
timestamp: Optional[float] = None
|
|
19
19
|
speaker: Optional[str] = None
|
|
20
20
|
section: Optional[str] = None
|
|
21
|
-
segment_type: str =
|
|
21
|
+
segment_type: str = "dialogue" # 'dialogue', 'event', or 'section_header'
|
|
22
22
|
line_number: int = 0
|
|
23
23
|
|
|
24
24
|
@property
|
|
@@ -31,15 +31,15 @@ class GeminiReader:
|
|
|
31
31
|
"""Parser for YouTube transcript format with speaker labels and timestamps."""
|
|
32
32
|
|
|
33
33
|
# Regex patterns for parsing (supports both [HH:MM:SS] and [MM:SS] formats)
|
|
34
|
-
TIMESTAMP_PATTERN = re.compile(r
|
|
35
|
-
SECTION_HEADER_PATTERN = re.compile(r
|
|
36
|
-
SPEAKER_PATTERN = re.compile(r
|
|
37
|
-
EVENT_PATTERN = re.compile(r
|
|
38
|
-
INLINE_TIMESTAMP_PATTERN = re.compile(r
|
|
34
|
+
TIMESTAMP_PATTERN = re.compile(r"\[(\d{1,2}):(\d{2}):(\d{2})\]|\[(\d{1,2}):(\d{2})\]")
|
|
35
|
+
SECTION_HEADER_PATTERN = re.compile(r"^##\s*\[(\d{1,2}):(\d{2}):(\d{2})\]\s*(.+)$")
|
|
36
|
+
SPEAKER_PATTERN = re.compile(r"^\*\*(.+?[::])\*\*\s*(.+)$")
|
|
37
|
+
EVENT_PATTERN = re.compile(r"^\[([^\]]+)\]\s*\[(?:(\d{1,2}):(\d{2}):(\d{2})|(\d{1,2}):(\d{2}))\]$")
|
|
38
|
+
INLINE_TIMESTAMP_PATTERN = re.compile(r"^(.+?)\s*\[(?:(\d{1,2}):(\d{2}):(\d{2})|(\d{1,2}):(\d{2}))\]$")
|
|
39
39
|
|
|
40
40
|
# New patterns for YouTube link format: [[MM:SS](URL&t=seconds)]
|
|
41
|
-
YOUTUBE_SECTION_PATTERN = re.compile(r
|
|
42
|
-
YOUTUBE_INLINE_PATTERN = re.compile(r
|
|
41
|
+
YOUTUBE_SECTION_PATTERN = re.compile(r"^##\s*\[\[(\d{1,2}):(\d{2})\]\([^)]*&t=(\d+)\)\]\s*(.+)$")
|
|
42
|
+
YOUTUBE_INLINE_PATTERN = re.compile(r"^(.+?)\s*\[\[(\d{1,2}):(\d{2})\]\([^)]*&t=(\d+)\)\]$")
|
|
43
43
|
|
|
44
44
|
@classmethod
|
|
45
45
|
def parse_timestamp(cls, *args) -> float:
|
|
@@ -61,7 +61,7 @@ class GeminiReader:
|
|
|
61
61
|
# Direct seconds (from YouTube &t= parameter)
|
|
62
62
|
return int(args[0])
|
|
63
63
|
else:
|
|
64
|
-
raise ValueError(f
|
|
64
|
+
raise ValueError(f"Invalid timestamp args: {args}")
|
|
65
65
|
|
|
66
66
|
@classmethod
|
|
67
67
|
def read(
|
|
@@ -82,13 +82,13 @@ class GeminiReader:
|
|
|
82
82
|
"""
|
|
83
83
|
transcript_path = Path(transcript_path).expanduser().resolve()
|
|
84
84
|
if not transcript_path.exists():
|
|
85
|
-
raise FileNotFoundError(f
|
|
85
|
+
raise FileNotFoundError(f"Transcript file not found: {transcript_path}")
|
|
86
86
|
|
|
87
87
|
segments: List[GeminiSegment] = []
|
|
88
88
|
current_section = None
|
|
89
89
|
current_speaker = None
|
|
90
90
|
|
|
91
|
-
with open(transcript_path,
|
|
91
|
+
with open(transcript_path, "r", encoding="utf-8") as f:
|
|
92
92
|
lines = f.readlines()
|
|
93
93
|
|
|
94
94
|
for line_num, line in enumerate(lines, start=1):
|
|
@@ -97,9 +97,9 @@ class GeminiReader:
|
|
|
97
97
|
continue
|
|
98
98
|
|
|
99
99
|
# Skip table of contents
|
|
100
|
-
if line.startswith(
|
|
100
|
+
if line.startswith("* ["):
|
|
101
101
|
continue
|
|
102
|
-
if line.startswith(
|
|
102
|
+
if line.startswith("## Table of Contents"):
|
|
103
103
|
continue
|
|
104
104
|
|
|
105
105
|
# Parse section headers
|
|
@@ -114,7 +114,7 @@ class GeminiReader:
|
|
|
114
114
|
text=section_title.strip(),
|
|
115
115
|
timestamp=timestamp,
|
|
116
116
|
section=current_section,
|
|
117
|
-
segment_type=
|
|
117
|
+
segment_type="section_header",
|
|
118
118
|
line_number=line_num,
|
|
119
119
|
)
|
|
120
120
|
)
|
|
@@ -133,7 +133,7 @@ class GeminiReader:
|
|
|
133
133
|
text=section_title.strip(),
|
|
134
134
|
timestamp=timestamp,
|
|
135
135
|
section=current_section,
|
|
136
|
-
segment_type=
|
|
136
|
+
segment_type="section_header",
|
|
137
137
|
line_number=line_num,
|
|
138
138
|
)
|
|
139
139
|
)
|
|
@@ -158,7 +158,7 @@ class GeminiReader:
|
|
|
158
158
|
text=event_text.strip(),
|
|
159
159
|
timestamp=timestamp,
|
|
160
160
|
section=current_section,
|
|
161
|
-
segment_type=
|
|
161
|
+
segment_type="event",
|
|
162
162
|
line_number=line_num,
|
|
163
163
|
)
|
|
164
164
|
)
|
|
@@ -200,7 +200,7 @@ class GeminiReader:
|
|
|
200
200
|
timestamp=timestamp,
|
|
201
201
|
speaker=current_speaker,
|
|
202
202
|
section=current_section,
|
|
203
|
-
segment_type=
|
|
203
|
+
segment_type="dialogue",
|
|
204
204
|
line_number=line_num,
|
|
205
205
|
)
|
|
206
206
|
)
|
|
@@ -228,7 +228,7 @@ class GeminiReader:
|
|
|
228
228
|
timestamp=timestamp,
|
|
229
229
|
speaker=current_speaker,
|
|
230
230
|
section=current_section,
|
|
231
|
-
segment_type=
|
|
231
|
+
segment_type="dialogue",
|
|
232
232
|
line_number=line_num,
|
|
233
233
|
)
|
|
234
234
|
)
|
|
@@ -246,14 +246,14 @@ class GeminiReader:
|
|
|
246
246
|
timestamp=timestamp,
|
|
247
247
|
speaker=current_speaker,
|
|
248
248
|
section=current_section,
|
|
249
|
-
segment_type=
|
|
249
|
+
segment_type="dialogue",
|
|
250
250
|
line_number=line_num,
|
|
251
251
|
)
|
|
252
252
|
)
|
|
253
253
|
continue
|
|
254
254
|
|
|
255
255
|
# Skip markdown headers and other formatting
|
|
256
|
-
if line.startswith(
|
|
256
|
+
if line.startswith("#"):
|
|
257
257
|
continue
|
|
258
258
|
|
|
259
259
|
return segments
|
|
@@ -283,10 +283,10 @@ class GeminiReader:
|
|
|
283
283
|
segments = cls.read(transcript_path, include_events=False, include_sections=False)
|
|
284
284
|
|
|
285
285
|
# Filter to only dialogue segments with timestamps
|
|
286
|
-
dialogue_segments = [s for s in segments if s.segment_type ==
|
|
286
|
+
dialogue_segments = [s for s in segments if s.segment_type == "dialogue" and s.timestamp is not None]
|
|
287
287
|
|
|
288
288
|
if not dialogue_segments:
|
|
289
|
-
raise ValueError(f
|
|
289
|
+
raise ValueError(f"No dialogue segments with timestamps found in {transcript_path}")
|
|
290
290
|
|
|
291
291
|
# Sort by timestamp
|
|
292
292
|
dialogue_segments.sort(key=lambda x: x.timestamp)
|
|
@@ -308,7 +308,7 @@ class GeminiReader:
|
|
|
308
308
|
text=segment.text,
|
|
309
309
|
start=segment.timestamp,
|
|
310
310
|
duration=max(duration, min_duration),
|
|
311
|
-
id=f
|
|
311
|
+
id=f"segment_{i:05d}",
|
|
312
312
|
speaker=segment.speaker,
|
|
313
313
|
)
|
|
314
314
|
)
|
|
@@ -337,13 +337,13 @@ class GeminiReader:
|
|
|
337
337
|
else:
|
|
338
338
|
# Different speaker or gap too large, save previous segment
|
|
339
339
|
if current_texts:
|
|
340
|
-
merged_text =
|
|
340
|
+
merged_text = " ".join(current_texts)
|
|
341
341
|
merged.append(
|
|
342
342
|
Supervision(
|
|
343
343
|
text=merged_text,
|
|
344
344
|
start=current_start,
|
|
345
345
|
duration=last_end_time - current_start,
|
|
346
|
-
id=f
|
|
346
|
+
id=f"merged_{len(merged):05d}",
|
|
347
347
|
)
|
|
348
348
|
)
|
|
349
349
|
current_speaker = segment.speaker
|
|
@@ -353,13 +353,13 @@ class GeminiReader:
|
|
|
353
353
|
|
|
354
354
|
# Add final segment
|
|
355
355
|
if current_texts:
|
|
356
|
-
merged_text =
|
|
356
|
+
merged_text = " ".join(current_texts)
|
|
357
357
|
merged.append(
|
|
358
358
|
Supervision(
|
|
359
359
|
text=merged_text,
|
|
360
360
|
start=current_start,
|
|
361
361
|
duration=last_end_time - current_start,
|
|
362
|
-
id=f
|
|
362
|
+
id=f"merged_{len(merged):05d}",
|
|
363
363
|
)
|
|
364
364
|
)
|
|
365
365
|
|
|
@@ -368,4 +368,4 @@ class GeminiReader:
|
|
|
368
368
|
return supervisions
|
|
369
369
|
|
|
370
370
|
|
|
371
|
-
__all__ = [
|
|
371
|
+
__all__ = ["GeminiReader", "GeminiSegment"]
|
|
@@ -19,7 +19,7 @@ class GeminiWriter:
|
|
|
19
19
|
hours = int(seconds // 3600)
|
|
20
20
|
minutes = int((seconds % 3600) // 60)
|
|
21
21
|
secs = int(seconds % 60)
|
|
22
|
-
return f
|
|
22
|
+
return f"[{hours:02d}:{minutes:02d}:{secs:02d}]"
|
|
23
23
|
|
|
24
24
|
@classmethod
|
|
25
25
|
def update_timestamps(
|
|
@@ -44,7 +44,7 @@ class GeminiWriter:
|
|
|
44
44
|
output_path = Path(output_path)
|
|
45
45
|
|
|
46
46
|
# Read original file
|
|
47
|
-
with open(original_path,
|
|
47
|
+
with open(original_path, "r", encoding="utf-8") as f:
|
|
48
48
|
lines = f.readlines()
|
|
49
49
|
|
|
50
50
|
# Parse original segments to get line numbers
|
|
@@ -66,7 +66,7 @@ class GeminiWriter:
|
|
|
66
66
|
|
|
67
67
|
# Write updated content
|
|
68
68
|
output_path.parent.mkdir(parents=True, exist_ok=True)
|
|
69
|
-
with open(output_path,
|
|
69
|
+
with open(output_path, "w", encoding="utf-8") as f:
|
|
70
70
|
f.writelines(updated_lines)
|
|
71
71
|
|
|
72
72
|
return output_path
|
|
@@ -83,7 +83,7 @@ class GeminiWriter:
|
|
|
83
83
|
mapping = {}
|
|
84
84
|
|
|
85
85
|
# Create a simple text-based matching
|
|
86
|
-
dialogue_segments = [s for s in original_segments if s.segment_type ==
|
|
86
|
+
dialogue_segments = [s for s in original_segments if s.segment_type == "dialogue"]
|
|
87
87
|
|
|
88
88
|
# Try to match based on text content
|
|
89
89
|
for aligned_sup in aligned_supervisions:
|
|
@@ -120,7 +120,7 @@ class GeminiWriter:
|
|
|
120
120
|
|
|
121
121
|
# Replace timestamp patterns
|
|
122
122
|
# Pattern 1: [HH:MM:SS] at the end or in brackets
|
|
123
|
-
line = re.sub(r
|
|
123
|
+
line = re.sub(r"\[\d{2}:\d{2}:\d{2}\]", new_ts_str, line)
|
|
124
124
|
|
|
125
125
|
return line
|
|
126
126
|
|
|
@@ -146,28 +146,28 @@ class GeminiWriter:
|
|
|
146
146
|
output_path = Path(output_path)
|
|
147
147
|
output_path.parent.mkdir(parents=True, exist_ok=True)
|
|
148
148
|
|
|
149
|
-
with open(output_path,
|
|
150
|
-
f.write(
|
|
149
|
+
with open(output_path, "w", encoding="utf-8") as f:
|
|
150
|
+
f.write("# Aligned Transcript\n\n")
|
|
151
151
|
|
|
152
152
|
for i, sup in enumerate(aligned_supervisions):
|
|
153
153
|
# Write segment with timestamp
|
|
154
154
|
start_ts = cls.format_timestamp(sup.start)
|
|
155
|
-
f.write(f
|
|
155
|
+
f.write(f"{start_ts} {sup.text}\n")
|
|
156
156
|
|
|
157
157
|
# Optionally write word-level timestamps
|
|
158
|
-
if include_word_timestamps and hasattr(sup,
|
|
159
|
-
if
|
|
160
|
-
f.write(
|
|
158
|
+
if include_word_timestamps and hasattr(sup, "alignment") and sup.alignment:
|
|
159
|
+
if "word" in sup.alignment:
|
|
160
|
+
f.write(" Words: ")
|
|
161
161
|
word_parts = []
|
|
162
|
-
for word_info in sup.alignment[
|
|
163
|
-
word_ts = cls.format_timestamp(word_info[
|
|
162
|
+
for word_info in sup.alignment["word"]:
|
|
163
|
+
word_ts = cls.format_timestamp(word_info["start"])
|
|
164
164
|
word_parts.append(f'{word_info["symbol"]}{word_ts}')
|
|
165
|
-
f.write(
|
|
166
|
-
f.write(
|
|
165
|
+
f.write(" ".join(word_parts))
|
|
166
|
+
f.write("\n")
|
|
167
167
|
|
|
168
|
-
f.write(
|
|
168
|
+
f.write("\n")
|
|
169
169
|
|
|
170
170
|
return output_path
|
|
171
171
|
|
|
172
172
|
|
|
173
|
-
__all__ = [
|
|
173
|
+
__all__ = ["GeminiWriter"]
|
|
@@ -24,10 +24,11 @@ class Supervision(SupervisionSegment):
|
|
|
24
24
|
"""
|
|
25
25
|
|
|
26
26
|
text: Optional[str] = None
|
|
27
|
-
|
|
28
|
-
|
|
27
|
+
speaker: Optional[str] = None
|
|
28
|
+
id: str = ""
|
|
29
|
+
recording_id: str = ""
|
|
29
30
|
start: Seconds = 0.0
|
|
30
31
|
duration: Seconds = 0.0
|
|
31
32
|
|
|
32
33
|
|
|
33
|
-
__all__ = [
|
|
34
|
+
__all__ = ["Supervision"]
|
|
@@ -0,0 +1,145 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
import re
|
|
3
|
+
from typing import Optional, Tuple
|
|
4
|
+
|
|
5
|
+
# Timestamp pattern: [start-end] text
# Example: [1.23-4.56] Hello world
TIMESTAMP_PATTERN = re.compile(r"^\[([\d.]+)-([\d.]+)\]\s*(.*)$")

# Speaker marker formats commonly seen in broadcast captions.
# NOTE(review): the published rendering collapsed full-width characters into
# their half-width twins (">>|>>|>|>" and "[::]"); restored here as
# half-width/full-width pairs — confirm against upstream caption sources.
SPEAKER_PATTERN = re.compile(r"((?:>>|＞＞|>|＞).*?[:：])\s*(.*)")

# Transcriber Output Example:
# 26:19.919 --> 26:34.921
# [SPEAKER_01]: 越来越多的科技巨头入...
SPEAKER_LATTIFAI = re.compile(r"(^\[SPEAKER_.*?\][:：])\s*(.*)")

# All-caps speaker labels, e.g.:
# NISHTHA BHATIA: Hey, everyone.
# DIETER: Oh, hey, Nishtha.
# GEMINI: That might
SPEAKER_PATTERN2 = re.compile(r"^([A-Z]{1,15}(?:\s+[A-Z]{1,15})?[:：])\s*(.*)$")
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def normalize_text(text: str) -> str:
    """Normalize caption text.

    Steps:
    - Decode common HTML entities (``&amp;``, ``&lt;``, ...)
    - Replace ASS/SSA hard line breaks (``\\N``) and ellipses with spaces
    - Convert curly apostrophes to straight ones in common contractions
    - Collapse runs of whitespace into a single space and strip the ends

    Args:
        text: Raw caption text; may be empty or ``None``-ish.

    Returns:
        The normalized text, or ``""`` for falsy input.
    """
    if not text:
        return ""

    # # Remove HTML tags first (replace with space to avoid concatenation)
    # text = re.sub(r"<[^>]+>", " ", text)

    # NOTE(review): the published rendering showed these keys already decoded
    # (e.g. "&": "&"), which would make every replacement a no-op and
    # contradict the docstring; entity-form keys restored — confirm against
    # the original sources.
    html_entities = {
        "&amp;": "&",
        "&lt;": "<",
        "&gt;": ">",
        "&quot;": '"',
        "&#39;": "'",
        "&nbsp;": " ",
        "\u00a0": " ",  # literal non-breaking space
        "\\N": " ",  # ASS/SSA hard line break
        "…": " ",  # replace ellipsis with space to avoid merging words
    }
    for entity, char in html_entities.items():
        text = text.replace(entity, char)

    # Convert curly apostrophes to straight apostrophes for common English contractions
    text = re.sub(r"([a-zA-Z])’([tsdm]|ll|re|ve)\b", r"\1'\2", text, flags=re.IGNORECASE)
    text = re.sub(r"([0-9])’([s])\b", r"\1'\2", text, flags=re.IGNORECASE)

    # Collapse whitespace (after replacements)
    text = re.sub(r"\s+", " ", text)

    return text.strip()
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def parse_speaker_text(line) -> Tuple[Optional[str], str]:
    """Parse a line of text to extract speaker and content.

    Args:
        line: A single caption line.

    Returns:
        Tuple of ``(speaker, text)``. ``speaker`` is ``None`` and the line is
        returned unchanged when no speaker marker is recognized.
    """
    # Fast path: no half-width or full-width colon means no speaker marker.
    # NOTE(review): the published rendering showed this condition duplicated
    # (":" twice); the full-width colon "：" is restored here — confirm.
    if ":" not in line and "：" not in line:
        return None, line

    # Lines starting with ">>": strip the leading name and colon.
    match = SPEAKER_PATTERN.match(line)
    if match:
        return match.group(1).strip(), match.group(2).strip()

    # Transcriber-style "[SPEAKER_xx]:" labels.
    match = SPEAKER_LATTIFAI.match(line)
    if match:
        assert len(match.groups()) == 2, match.groups()
        if not match.group(1):
            logging.error(f"ParseSub LINE [{line}]")
        else:
            return match.group(1).strip(), match.group(2).strip()

    # All-caps "NAME:" labels.
    match = SPEAKER_PATTERN2.match(line)
    if match:
        assert len(match.groups()) == 2, match.groups()
        return match.group(1).strip(), match.group(2).strip()

    return None, line
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
def parse_timestamp_text(line: str) -> Tuple[Optional[float], Optional[float], str]:
    """Extract an optional ``[start-end]`` timestamp prefix from a line.

    Format: ``[start-end] text``, e.g. ``[1.23-4.56] Hello world``.

    Args:
        line: Input line to parse.

    Returns:
        ``(start_time, end_time, text)`` where the times are seconds, or
        ``(None, None, line)`` when no parsable timestamp prefix is found.
    """
    match = TIMESTAMP_PATTERN.match(line)
    if match is None:
        return None, None, line
    try:
        return float(match.group(1)), float(match.group(2)), match.group(3).strip()
    except ValueError:
        # Malformed numbers (e.g. "1.2.3"): treat the whole line as plain text.
        return None, None, line
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
if __name__ == "__main__":
    # Manual demo for the speaker-marker patterns.
    # NOTE(review): the first assignment below was dead in the published
    # version (immediately overwritten); kept as a comment for reference:
    #   pattern = re.compile(r">>\s*(.*?)\s*[:：]\s*(.*)")
    pattern = re.compile(r"(>>.*?[:：])\s*(.*)")

    test_strings = [
        ">>Key: Value",
        ">> Key with space : Value with space ",
        ">> 全角键 ： 全角值",  # full-width key/colon/value — NOTE(review): colon restored to full-width
        ">>Key:Value xxx. >>Key:Value",
    ]

    for text in test_strings:
        match = pattern.match(text)
        if match:
            print(f"Input: '{text}'")
            print(f"Speaker: '{match.group(1)}'")
            print(f"Content: '{match.group(2)}'")
            print("-------------")

    # SPEAKER_PATTERN2: all-caps "NAME:" speaker labels.
    test_strings2 = ["NISHTHA BHATIA: Hey, everyone.", "DIETER: Oh, hey, Nishtha.", "GEMINI: That might"]
    for text in test_strings2:
        match = SPEAKER_PATTERN2.match(text)
        if match:
            print(f" Input: '{text}'")
            print(f"Speaker: '{match.group(1)}'")
            print(f"Content: '{match.group(2)}'")
            print("-------------")
        else:
            raise ValueError(f"No match for: '{text}'")
|
lattifai/cli/__init__.py
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
"""CLI module for LattifAI with nemo_run entry points."""
|
|
2
|
+
|
|
3
|
+
import nemo_run as run # noqa: F401
|
|
4
|
+
|
|
5
|
+
# Import and re-export entrypoints at package level so NeMo Run can find them
|
|
6
|
+
from lattifai.cli.alignment import align
|
|
7
|
+
from lattifai.cli.caption import convert
|
|
8
|
+
from lattifai.cli.transcribe import transcribe, transcribe_align
|
|
9
|
+
from lattifai.cli.youtube import youtube
|
|
10
|
+
|
|
11
|
+
__all__ = [
|
|
12
|
+
"align",
|
|
13
|
+
"convert",
|
|
14
|
+
"transcribe",
|
|
15
|
+
"transcribe_align",
|
|
16
|
+
"youtube",
|
|
17
|
+
]
|
|
@@ -0,0 +1,153 @@
|
|
|
1
|
+
"""Alignment CLI entry point with nemo_run."""
|
|
2
|
+
|
|
3
|
+
from typing import Optional
|
|
4
|
+
|
|
5
|
+
import nemo_run as run
|
|
6
|
+
from lhotse.utils import Pathlike
|
|
7
|
+
from typing_extensions import Annotated
|
|
8
|
+
|
|
9
|
+
from lattifai.client import LattifAI
|
|
10
|
+
from lattifai.config import (
|
|
11
|
+
AlignmentConfig,
|
|
12
|
+
CaptionConfig,
|
|
13
|
+
ClientConfig,
|
|
14
|
+
DiarizationConfig,
|
|
15
|
+
MediaConfig,
|
|
16
|
+
TranscriptionConfig,
|
|
17
|
+
)
|
|
18
|
+
|
|
19
|
+
__all__ = ["align"]
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
@run.cli.entrypoint(name="align", namespace="alignment")
def align(
    input_media: Optional[str] = None,
    input_caption: Optional[str] = None,
    output_caption: Optional[str] = None,
    media: Annotated[Optional[MediaConfig], run.Config[MediaConfig]] = None,
    caption: Annotated[Optional[CaptionConfig], run.Config[CaptionConfig]] = None,
    client: Annotated[Optional[ClientConfig], run.Config[ClientConfig]] = None,
    alignment: Annotated[Optional[AlignmentConfig], run.Config[AlignmentConfig]] = None,
    transcription: Annotated[Optional[TranscriptionConfig], run.Config[TranscriptionConfig]] = None,
    diarization: Annotated[Optional[DiarizationConfig], run.Config[DiarizationConfig]] = None,
):
    """
    Align audio/video with a caption file.

    Performs forced alignment between a media file and caption text, producing
    accurate per-segment (and optionally word-level) timestamps. When the input
    media is an HTTP(S) URL, the request is routed to the YouTube
    download-and-align workflow instead.

    Shortcut: invoking ``lai-align`` is equivalent to running ``lai alignment align``.

    Args:
        input_media: Positional media path or URL (exclusive with media.input_path).
        input_caption: Positional caption path (sets caption.input_path).
        output_caption: Positional output path (exclusive with caption.output_path).
        media: Media configuration (input_path, media_format, sample_rate,
            channels, output_dir, output_path, output_format, prefer_audio,
            default_audio_format, default_video_format, force_overwrite).
        caption: Caption I/O configuration (input_format, input_path,
            output_format, output_path, normalize_text, split_sentence,
            word_level, include_speaker_in_text, encoding).
        client: API client configuration (api_key, timeout, max_retries,
            default_headers).
        alignment: Alignment configuration (model_name, device, batch_size).
        transcription: Transcription configuration (optional).
        diarization: Diarization configuration (optional).

    Raises:
        ValueError: If a positional path and its config-field equivalent are
            both given, or if no input media path is provided at all.

    Examples:
        # Basic usage with positional arguments
        lai alignment align audio.wav caption.srt output.srt

        # Mixing positional and keyword arguments
        lai alignment align audio.mp4 caption.srt output.json \\
            alignment.device=cuda caption.word_level=true

        # Keyword-only (traditional) syntax
        lai alignment align input_media=audio.wav input_caption=caption.srt \\
            output_caption=output.srt

        # Full configuration with nested config objects
        lai alignment align audio.wav caption.srt aligned.json \\
            media.output_dir=/tmp/output caption.split_sentence=true \\
            caption.word_level=true caption.normalize_text=true \\
            alignment.device=mps alignment.model_name=Lattifai/Lattice-1-Alpha
    """
    media_cfg = media or MediaConfig()

    # Positional input_media and media.input_path are mutually exclusive.
    if input_media and media_cfg.input_path:
        raise ValueError(
            "Cannot specify both positional input_media and media.input_path. "
            "Use either positional argument or config, not both."
        )
    if input_media:
        media_cfg.set_input_path(input_media)
    if not media_cfg.input_path:
        raise ValueError("Input media path must be specified via positional argument input_media= or media.input_path=")

    caption_cfg = caption or CaptionConfig()

    # Positional output_caption and caption.output_path are mutually exclusive.
    if output_caption and caption_cfg.output_path:
        raise ValueError(
            "Cannot specify both positional output_caption and caption.output_path. "
            "Use either positional argument or config, not both."
        )
    if input_caption:
        caption_cfg.set_input_path(input_caption)
    if output_caption:
        caption_cfg.set_output_path(output_caption)

    # Renamed from "client" to avoid shadowing the ClientConfig parameter.
    api = LattifAI(
        client_config=client,
        alignment_config=alignment,
        caption_config=caption_cfg,
        transcription_config=transcription,
        diarization_config=diarization,
    )

    if media_cfg.input_path.startswith(("http://", "https://")):
        # Remote URL: delegate to the YouTube download-and-align workflow.
        return api.youtube(
            url=media_cfg.input_path,
            output_dir=media_cfg.output_dir,
            output_caption_path=caption_cfg.output_path,
            media_format=media_cfg.normalize_format() if media_cfg.output_format else None,
            force_overwrite=media_cfg.force_overwrite,
            split_sentence=caption_cfg.split_sentence,
            channel_selector=media_cfg.channel_selector,
        )

    return api.alignment(
        input_media=media_cfg.input_path,
        input_caption=caption_cfg.input_path,
        output_caption_path=caption_cfg.output_path,
        split_sentence=caption_cfg.split_sentence,
        channel_selector=media_cfg.channel_selector,
    )
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
def main():
    """Standalone console-script entry point for the ``align`` command."""
    run.cli.main(align)


if __name__ == "__main__":
    main()
|