PyPI - lattifai - Versions diffs - 0.4.5__py3-none-any.whl → 1.0.0__py3-none-any.whl - Mend

lattifai 0.4.5__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (76) hide show

lattifai/__init__.py +61 -47
lattifai/alignment/__init__.py +6 -0
lattifai/alignment/lattice1_aligner.py +119 -0
lattifai/alignment/lattice1_worker.py +185 -0
lattifai/{tokenizer → alignment}/phonemizer.py +4 -4
lattifai/alignment/segmenter.py +166 -0
lattifai/{tokenizer → alignment}/tokenizer.py +244 -169
lattifai/audio2.py +211 -0
lattifai/caption/__init__.py +20 -0
lattifai/caption/caption.py +1275 -0
lattifai/{io → caption}/gemini_reader.py +30 -30
lattifai/{io → caption}/gemini_writer.py +17 -17
lattifai/{io → caption}/supervision.py +4 -3
lattifai/caption/text_parser.py +145 -0
lattifai/cli/__init__.py +17 -0
lattifai/cli/alignment.py +153 -0
lattifai/cli/caption.py +204 -0
lattifai/cli/server.py +19 -0
lattifai/cli/transcribe.py +197 -0
lattifai/cli/youtube.py +128 -0
lattifai/client.py +460 -251
lattifai/config/__init__.py +20 -0
lattifai/config/alignment.py +73 -0
lattifai/config/caption.py +178 -0
lattifai/config/client.py +46 -0
lattifai/config/diarization.py +67 -0
lattifai/config/media.py +335 -0
lattifai/config/transcription.py +84 -0
lattifai/diarization/__init__.py +5 -0
lattifai/diarization/lattifai.py +89 -0
lattifai/errors.py +98 -91
lattifai/logging.py +116 -0
lattifai/mixin.py +552 -0
lattifai/server/app.py +420 -0
lattifai/transcription/__init__.py +76 -0
lattifai/transcription/base.py +108 -0
lattifai/transcription/gemini.py +219 -0
lattifai/transcription/lattifai.py +103 -0
lattifai/{workflows → transcription}/prompts/__init__.py +4 -4
lattifai/types.py +30 -0
lattifai/utils.py +16 -44
lattifai/workflow/__init__.py +22 -0
lattifai/workflow/agents.py +6 -0
lattifai/{workflows → workflow}/base.py +22 -22
lattifai/{workflows → workflow}/file_manager.py +239 -215
lattifai/workflow/youtube.py +564 -0
lattifai-1.0.0.dist-info/METADATA +736 -0
lattifai-1.0.0.dist-info/RECORD +52 -0
{lattifai-0.4.5.dist-info → lattifai-1.0.0.dist-info}/WHEEL +1 -1
lattifai-1.0.0.dist-info/entry_points.txt +13 -0
{lattifai-0.4.5.dist-info → lattifai-1.0.0.dist-info}/licenses/LICENSE +1 -1
lattifai/base_client.py +0 -126
lattifai/bin/__init__.py +0 -3
lattifai/bin/agent.py +0 -325
lattifai/bin/align.py +0 -296
lattifai/bin/cli_base.py +0 -25
lattifai/bin/subtitle.py +0 -210
lattifai/io/__init__.py +0 -42
lattifai/io/reader.py +0 -85
lattifai/io/text_parser.py +0 -75
lattifai/io/utils.py +0 -15
lattifai/io/writer.py +0 -90
lattifai/tokenizer/__init__.py +0 -3
lattifai/workers/__init__.py +0 -3
lattifai/workers/lattice1_alpha.py +0 -284
lattifai/workflows/__init__.py +0 -34
lattifai/workflows/agents.py +0 -10
lattifai/workflows/gemini.py +0 -167
lattifai/workflows/prompts/README.md +0 -22
lattifai/workflows/prompts/gemini/README.md +0 -24
lattifai/workflows/prompts/gemini/transcription_gem.txt +0 -81
lattifai/workflows/youtube.py +0 -931
lattifai-0.4.5.dist-info/METADATA +0 -808
lattifai-0.4.5.dist-info/RECORD +0 -39
lattifai-0.4.5.dist-info/entry_points.txt +0 -3
{lattifai-0.4.5.dist-info → lattifai-1.0.0.dist-info}/top_level.txt +0 -0

lattifai/config/__init__.py ADDED Viewed

@@ -0,0 +1,20 @@
+"""Configuration system for LattifAI using nemo_run."""
+from .alignment import AlignmentConfig
+from .caption import CaptionConfig
+from .client import ClientConfig
+from .diarization import DiarizationConfig
+from .media import AUDIO_FORMATS, MEDIA_FORMATS, VIDEO_FORMATS, MediaConfig
+from .transcription import TranscriptionConfig
+# Public names of the config package: the config dataclasses imported above plus the
+# format constants re-exported from .media.
+__all__ = [
+    "ClientConfig",
+    "AlignmentConfig",
+    "CaptionConfig",
+    "TranscriptionConfig",
+    "DiarizationConfig",
+    "MediaConfig",
+    "AUDIO_FORMATS",
+    "VIDEO_FORMATS",
+    "MEDIA_FORMATS",
+]

lattifai/config/alignment.py ADDED Viewed

@@ -0,0 +1,73 @@
+"""Alignment configuration for LattifAI."""
+from dataclasses import dataclass, field
+from typing import TYPE_CHECKING, Dict, Literal, Optional
+from ..utils import _select_device
+# NOTE(review): lattifai/base_client.py is listed as removed in this release, so this
+# type-checking-only import presumably no longer resolves — confirm where SyncAPIClient lives now.
+if TYPE_CHECKING:
+    from ..base_client import SyncAPIClient
+@dataclass
+class AlignmentConfig:
+    """
+    Core alignment configuration.
+    Defines model selection, compute device, batching, and long-audio segmentation settings for forced alignment.
+    Raises:
+        ValueError: From ``__post_init__`` when ``batch_size``, ``device``, ``segment_duration`` or
+            ``segment_max_gap`` holds an invalid value.
+    """
+    # Alignment configuration
+    model_name: str = "Lattifai/Lattice-1"
+    """Model identifier or path to local model directory (e.g., 'Lattifai/Lattice-1')."""
+    device: Literal["cpu", "cuda", "mps", "auto"] = "auto"
+    """Computation device: 'cpu' for CPU, 'cuda' (or 'cuda:<index>') for NVIDIA GPU, 'mps' for Apple Silicon.
+    'auto' (default) is replaced during initialization by the value returned from `_select_device`.
+    """
+    batch_size: int = 1
+    """Batch size for inference (number of samples processed simultaneously)."""
+    # Segmented Alignment for Long Audio
+    trust_caption_timestamps: bool = False
+    """When True, use original caption timestamps as strong reference constraints during alignment.
+    The alignment process will still adjust timestamps but stay close to the input timing.
+    Use this when you want to re-segment caption sentence boundaries (caption.split_sentence=True)
+    while preserving the approximate timing from the original captions.
+    When False (default), performs unconstrained forced alignment based purely on media-caption matching.
+    """
+    strategy: Literal["caption", "transcription", "entire"] = "entire"
+    """Alignment strategy for long audio alignment:
+    - 'entire': Process entire audio as single alignment (default, suitable for <30 min)
+    - 'caption': Split based on existing caption boundaries and gaps (segment_max_gap)
+        work with `alignment.trust_caption_timestamps=true`
+    - 'transcription': Align media with transcription first, then segment based on transcription
+    Use segmentation for long audio (>30 min) to reduce memory usage and improve performance.
+    """
+    segment_duration: float = 300.0
+    """Target duration (in seconds) for each alignment segment when using 'caption' strategy.
+    Default: 300.0 (5 minutes). Typical range: 30-600 seconds (30s-10min). Must be greater than 0.
+    Shorter segments = lower memory, longer segments = better context for alignment.
+    """
+    segment_max_gap: float = 4.0
+    """Maximum gap (in seconds) between captions to consider them part of the same segment.
+    Used by the 'caption' strategy. Gaps larger than this trigger segment splitting.
+    Default: 4.0 seconds. Must be non-negative. Useful for detecting scene changes or natural breaks in content.
+    """
+    client_wrapper: Optional["SyncAPIClient"] = field(default=None, repr=False)
+    """Reference to the SyncAPIClient instance. Auto-set during client initialization."""
+    def __post_init__(self):
+        """Validate field values, then resolve the 'auto' device.
+        Raises:
+            ValueError: If batch_size < 1, device is not an accepted device string,
+                segment_duration <= 0, or segment_max_gap < 0.
+        """
+        if self.batch_size < 1:
+            raise ValueError("batch_size must be at least 1")
+        # 'cuda:<index>' is accepted in addition to the named devices. The isinstance guard turns a
+        # non-string value into a ValueError instead of an AttributeError from .startswith().
+        named_devices = ("cpu", "cuda", "mps", "auto")
+        if not isinstance(self.device, str) or not (
+            self.device in named_devices or self.device.startswith("cuda:")
+        ):
+            raise ValueError(f"device must be one of {named_devices} or 'cuda:<index>', got '{self.device}'")
+        if self.segment_duration <= 0:
+            raise ValueError("segment_duration must be greater than 0")
+        if self.segment_max_gap < 0:
+            raise ValueError("segment_max_gap must be non-negative")
+        if self.device == "auto":
+            self.device = _select_device(self.device)

lattifai/config/caption.py ADDED Viewed

@@ -0,0 +1,178 @@
+"""Caption I/O configuration for LattifAI."""
+from dataclasses import dataclass
+from pathlib import Path
+from typing import TYPE_CHECKING, Literal, Optional
+from lhotse.utils import Pathlike
+# Supported caption formats for reading/writing
+CAPTION_FORMATS = ["srt", "vtt", "ass", "ssa", "sub", "sbv", "txt", "md", "ttml", "sami", "smi"]
+# NOTE(review): 'md' is in CAPTION_FORMATS but in neither the input nor the output list below —
+# confirm whether that omission is intended.
+# Input caption formats (includes special formats like 'auto' and 'gemini')
+INPUT_CAPTION_FORMATS = ["srt", "vtt", "ass", "ssa", "sub", "sbv", "txt", "ttml", "sami", "smi", "auto", "gemini"]
+# Output caption formats (includes special formats like 'TextGrid' and 'json')
+OUTPUT_CAPTION_FORMATS = ["srt", "vtt", "ass", "ssa", "sub", "sbv", "txt", "ttml", "sami", "smi", "TextGrid", "json"]
+# All caption formats combined (for file detection).
+# dict.fromkeys de-duplicates while keeping insertion order, so the list is the same on every run;
+# list(set(...)) ordered the strings arbitrarily from one process to the next.
+ALL_CAPTION_FORMATS = list(dict.fromkeys(CAPTION_FORMATS + ["TextGrid", "json", "gemini"]))
+# Type aliases for better type hints
+InputCaptionFormat = Literal["auto", "srt", "vtt", "ass", "ssa", "sub", "sbv", "txt", "ttml", "sami", "smi", "gemini"]
+OutputCaptionFormat = Literal[
+    "srt", "vtt", "ass", "ssa", "sub", "sbv", "txt", "ttml", "sami", "smi", "TextGrid", "json"
+]
+@dataclass
+class CaptionConfig:
+    """
+    Caption I/O configuration.
+    Controls caption file reading, writing, and formatting options.
+    Constructing an instance expands ``~`` in the configured paths, creates the parent directory of
+    ``output_path`` when one is given, and validates both format fields.
+    """
+    input_format: InputCaptionFormat = "auto"
+    """Input caption format: 'auto' (default), 'gemini', or a caption file format such as 'srt', 'vtt', 'ass', 'txt'."""
+    input_path: Optional[str] = None
+    """Path to input caption file."""
+    output_format: OutputCaptionFormat = "srt"
+    """Output caption format, e.g. 'srt' (default), 'vtt', 'ass', 'txt', 'TextGrid', or 'json'."""
+    output_path: Optional[str] = None
+    """Path to output caption file."""
+    include_speaker_in_text: bool = True
+    """Preserve speaker labels in caption text content."""
+    normalize_text: bool = False
+    """Clean HTML entities and normalize whitespace in caption text."""
+    split_sentence: bool = False
+    """Re-segment captions intelligently based on punctuation and semantics."""
+    word_level: bool = False
+    """Include word-level timestamps in alignment results (useful for karaoke, dubbing)."""
+    encoding: str = "utf-8"
+    """Character encoding for reading/writing caption files (default: utf-8)."""
+    source_lang: Optional[str] = None
+    """Source language code for the caption content (e.g., 'en', 'zh', 'de')."""
+    def __post_init__(self):
+        """Normalize the configured paths, then validate the format fields."""
+        self._normalize_paths()
+        self._validate_formats()
+    def need_alignment(self, trust_timestamps: bool) -> bool:
+        """Determine if alignment is needed based on configuration.
+        This is a regular method: it takes an argument, so it cannot be exposed as a property
+        (attribute access would fail before the argument could be supplied).
+        Args:
+            trust_timestamps: Whether the caller trusts the timestamps of the input captions.
+        Returns:
+            False when timestamps are trusted and sentences are not re-split; True otherwise.
+        """
+        if not trust_timestamps or self.split_sentence:
+            return True
+        # NOTE(review): word_level=True also ends in False here, which looks unintended for a
+        # setting that requests word-level timestamps — confirm before changing the result.
+        if self.word_level and self.normalize_text:
+            print(
+                "⚠️ Warning: Text normalization with 'trust_input_timestamps=True' and 'split_sentence=False'\n"
+                "💡 Recommended command:\n"
+                "   lai caption normalize input.srt normalized.srt\n"
+            )
+        return False
+    def _normalize_paths(self) -> None:
+        """Expand ``~`` in input/output paths and create the output file's parent directory."""
+        # The input path is expanded but not required to exist yet
+        # (it might be set later after downloading captions).
+        if self.input_path is not None:
+            self.input_path = str(Path(self.input_path).expanduser())
+        if self.output_path is not None:
+            expanded_output = Path(self.output_path).expanduser()
+            self.output_path = str(expanded_output)
+            expanded_output.parent.mkdir(parents=True, exist_ok=True)
+    def _validate_formats(self) -> None:
+        """Validate input and output format fields.
+        Raises:
+            ValueError: If either format is not in the corresponding allowed list.
+        """
+        if self.input_format not in INPUT_CAPTION_FORMATS:
+            raise ValueError(f"input_format must be one of {INPUT_CAPTION_FORMATS}, got '{self.input_format}'")
+        if self.output_format not in OUTPUT_CAPTION_FORMATS:
+            raise ValueError(f"output_format must be one of {OUTPUT_CAPTION_FORMATS}, got '{self.output_format}'")
+    def set_input_path(self, path: Pathlike) -> Path:
+        """
+        Set input caption path and validate it.
+        Args:
+            path: Path to input caption file (str or Path)
+        Returns:
+            Resolved path as Path object
+        Raises:
+            FileNotFoundError: If the file does not exist
+            ValueError: If the path is not a file
+        """
+        resolved = Path(path).expanduser().resolve()
+        # Checked before assignment so a bad path leaves the current input_path untouched.
+        if not resolved.exists():
+            raise FileNotFoundError(f"Input caption file does not exist: '{resolved}'")
+        if not resolved.is_file():
+            raise ValueError(f"Input caption path is not a file: '{resolved}'")
+        self.input_path = str(resolved)
+        self.check_input_sanity()
+        return resolved
+    def set_output_path(self, path: Pathlike) -> Path:
+        """
+        Set output caption path and create parent directories if needed.
+        Args:
+            path: Path to output caption file (str or Path)
+        Returns:
+            Resolved path as Path object
+        """
+        resolved = Path(path).expanduser().resolve()
+        resolved.parent.mkdir(parents=True, exist_ok=True)
+        self.output_path = str(resolved)
+        return resolved
+    def check_input_sanity(self) -> None:
+        """
+        Validate that input_path is properly configured and accessible.
+        Raises:
+            ValueError: If input_path is not set or is invalid
+            FileNotFoundError: If input_path does not exist
+        """
+        if not self.input_path:
+            raise ValueError("input_path is required but not set in CaptionConfig")
+        input_file = Path(self.input_path).expanduser()
+        if not input_file.exists():
+            raise FileNotFoundError(
+                f"Input caption file does not exist: '{input_file}'. Please check the path and try again."
+            )
+        if not input_file.is_file():
+            raise ValueError(
+                f"Input caption path is not a file: '{input_file}'. Expected a valid caption file path."
+            )
+    def check_sanity(self) -> bool:
+        """Perform sanity checks on the configuration.
+        Returns:
+            True when the checks pass.
+        Raises:
+            AssertionError: If the input caption path is unset or is not an existing file.
+        """
+        # Raised explicitly rather than via `assert` so the check still runs under `python -O`.
+        if not self.is_input_path_existed():
+            raise AssertionError("Input caption path must be provided and exist.")
+        return True
+    def is_input_path_existed(self) -> bool:
+        """Check if input caption path is provided and exists.
+        Side effect: rewrites ``input_path`` with ``~`` expanded.
+        """
+        if self.input_path is None:
+            return False
+        input_file = Path(self.input_path).expanduser()
+        self.input_path = str(input_file)
+        return input_file.exists() and input_file.is_file()

lattifai/config/client.py ADDED Viewed

@@ -0,0 +1,46 @@
+"""LattifAI Client configuration."""
+import os
+from dataclasses import dataclass, field
+from typing import Dict, Optional
+@dataclass
+class ClientConfig:
+    """
+    LattifAI API client configuration.
+    Holds the API key, request timeout, retry count, and optional static request headers.
+    Raises:
+        ValueError: From ``__post_init__`` when ``timeout`` is not positive or ``max_retries`` is negative.
+    """
+    # API configuration
+    api_key: Optional[str] = field(default=None)
+    """LattifAI API key. If None, reads from LATTIFAI_API_KEY environment variable."""
+    timeout: float = 120.0
+    """Request timeout in seconds."""
+    max_retries: int = 2
+    """Maximum number of retry attempts for failed requests."""
+    default_headers: Optional[Dict[str, str]] = field(default=None)
+    """Optional static headers to include in all requests."""
+    def __post_init__(self):
+        """Load a .env file, fill in a missing API key from the environment, and validate settings.
+        Raises:
+            ValueError: If timeout <= 0 or max_retries < 0.
+        """
+        # Imported here so python-dotenv is only needed when a config is actually constructed.
+        from dotenv import find_dotenv, load_dotenv
+        # Search for a .env file starting from the current working directory.
+        load_dotenv(find_dotenv(usecwd=True))
+        # The key stays None when LATTIFAI_API_KEY is not set either.
+        if self.api_key is None:
+            self.api_key = os.environ.get("LATTIFAI_API_KEY")
+        # Validate API parameters
+        if self.timeout <= 0:
+            raise ValueError("timeout must be greater than 0")
+        if self.max_retries < 0:
+            raise ValueError("max_retries must be non-negative")

lattifai/config/diarization.py ADDED Viewed

@@ -0,0 +1,67 @@
+"""Speaker diarization configuration for LattifAI."""
+from dataclasses import dataclass, field
+from typing import TYPE_CHECKING, Literal, Optional
+from ..utils import _select_device
+# NOTE(review): lattifai/base_client.py is listed as removed in this release, so this
+# type-checking-only import presumably no longer resolves — confirm where SyncAPIClient lives now.
+if TYPE_CHECKING:
+    from ..base_client import SyncAPIClient
+@dataclass
+class DiarizationConfig:
+    """
+    Speaker diarization configuration.
+    Settings for speaker diarization operations.
+    Raises:
+        ValueError: From ``__post_init__`` when ``device`` or any speaker-count field is invalid.
+    """
+    enabled: bool = False
+    """Enable speaker diarization."""
+    device: Literal["cpu", "cuda", "mps", "auto"] = "auto"
+    """Computation device for diarization models ('cuda:<index>' is also accepted).
+    'auto' (default) is replaced during initialization by the value returned from `_select_device`.
+    """
+    num_speakers: Optional[int] = None
+    """Number of speakers, when known. If not set, diarization will attempt to infer the number of speakers."""
+    min_speakers: Optional[int] = None
+    """Minimum number of speakers. Has no effect when `num_speakers` is provided."""
+    max_speakers: Optional[int] = None
+    """Maximum number of speakers. Has no effect when `num_speakers` is provided."""
+    model_name: str = "pyannote/speaker-diarization-community-1"
+    """Model name for speaker diarization."""
+    verbose: bool = False
+    """Enable debug logging for diarization operations."""
+    debug: bool = False
+    """Enable debug mode for diarization operations."""
+    client_wrapper: Optional["SyncAPIClient"] = field(default=None, repr=False)
+    """Reference to the SyncAPIClient instance. Auto-set during client initialization."""
+    def __post_init__(self):
+        """Validate field values, then resolve the 'auto' device.
+        Raises:
+            ValueError: If device is not an accepted device string, any speaker count is below 1,
+                or min_speakers exceeds max_speakers.
+        """
+        # 'cuda:<index>' is accepted in addition to the named devices. The isinstance guard turns a
+        # non-string value into a ValueError instead of an AttributeError from .startswith().
+        named_devices = ("cpu", "cuda", "mps", "auto")
+        if not isinstance(self.device, str) or not (
+            self.device in named_devices or self.device.startswith("cuda:")
+        ):
+            raise ValueError(f"device must be one of {named_devices} or 'cuda:<index>', got '{self.device}'")
+        if self.device == "auto":
+            self.device = _select_device(self.device)
+        # Validate speaker counts
+        if self.num_speakers is not None and self.num_speakers < 1:
+            raise ValueError("num_speakers must be at least 1")
+        if self.min_speakers is not None and self.min_speakers < 1:
+            raise ValueError("min_speakers must be at least 1")
+        if self.max_speakers is not None and self.max_speakers < 1:
+            raise ValueError("max_speakers must be at least 1")
+        if self.min_speakers is not None and self.max_speakers is not None and self.min_speakers > self.max_speakers:
+            raise ValueError("min_speakers cannot be greater than max_speakers")

lattifai 0.4.5__py3-none-any.whl → 1.0.0__py3-none-any.whl

lattifai 0.4.5__py3-none-any.whl → 1.0.0__py3-none-any.whl