PyPI - lattifai - Versions diffs - 0.4.6__py3-none-any.whl → 1.0.0__py3-none-any.whl - Mend

lattifai 0.4.6__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (74)

lattifai/__init__.py +42 -27
lattifai/alignment/__init__.py +6 -0
lattifai/alignment/lattice1_aligner.py +119 -0
lattifai/{workers/lattice1_alpha.py → alignment/lattice1_worker.py} +33 -132
lattifai/{tokenizer → alignment}/phonemizer.py +1 -1
lattifai/alignment/segmenter.py +166 -0
lattifai/{tokenizer → alignment}/tokenizer.py +186 -112
lattifai/audio2.py +211 -0
lattifai/caption/__init__.py +20 -0
lattifai/caption/caption.py +1275 -0
lattifai/{io → caption}/supervision.py +1 -0
lattifai/{io → caption}/text_parser.py +53 -10
lattifai/cli/__init__.py +17 -0
lattifai/cli/alignment.py +153 -0
lattifai/cli/caption.py +204 -0
lattifai/cli/server.py +19 -0
lattifai/cli/transcribe.py +197 -0
lattifai/cli/youtube.py +128 -0
lattifai/client.py +455 -246
lattifai/config/__init__.py +20 -0
lattifai/config/alignment.py +73 -0
lattifai/config/caption.py +178 -0
lattifai/config/client.py +46 -0
lattifai/config/diarization.py +67 -0
lattifai/config/media.py +335 -0
lattifai/config/transcription.py +84 -0
lattifai/diarization/__init__.py +5 -0
lattifai/diarization/lattifai.py +89 -0
lattifai/errors.py +41 -34
lattifai/logging.py +116 -0
lattifai/mixin.py +552 -0
lattifai/server/app.py +420 -0
lattifai/transcription/__init__.py +76 -0
lattifai/transcription/base.py +108 -0
lattifai/transcription/gemini.py +219 -0
lattifai/transcription/lattifai.py +103 -0
lattifai/types.py +30 -0
lattifai/utils.py +3 -31
lattifai/workflow/__init__.py +22 -0
lattifai/workflow/agents.py +6 -0
lattifai/{workflows → workflow}/file_manager.py +81 -57
lattifai/workflow/youtube.py +564 -0
lattifai-1.0.0.dist-info/METADATA +736 -0
lattifai-1.0.0.dist-info/RECORD +52 -0
{lattifai-0.4.6.dist-info → lattifai-1.0.0.dist-info}/WHEEL +1 -1
lattifai-1.0.0.dist-info/entry_points.txt +13 -0
lattifai/base_client.py +0 -126
lattifai/bin/__init__.py +0 -3
lattifai/bin/agent.py +0 -324
lattifai/bin/align.py +0 -295
lattifai/bin/cli_base.py +0 -25
lattifai/bin/subtitle.py +0 -210
lattifai/io/__init__.py +0 -43
lattifai/io/reader.py +0 -86
lattifai/io/utils.py +0 -15
lattifai/io/writer.py +0 -102
lattifai/tokenizer/__init__.py +0 -3
lattifai/workers/__init__.py +0 -3
lattifai/workflows/__init__.py +0 -34
lattifai/workflows/agents.py +0 -12
lattifai/workflows/gemini.py +0 -167
lattifai/workflows/prompts/README.md +0 -22
lattifai/workflows/prompts/gemini/README.md +0 -24
lattifai/workflows/prompts/gemini/transcription_gem.txt +0 -81
lattifai/workflows/youtube.py +0 -931
lattifai-0.4.6.dist-info/METADATA +0 -806
lattifai-0.4.6.dist-info/RECORD +0 -39
lattifai-0.4.6.dist-info/entry_points.txt +0 -3
/lattifai/{io → caption}/gemini_reader.py +0 -0
/lattifai/{io → caption}/gemini_writer.py +0 -0
/lattifai/{workflows → transcription}/prompts/__init__.py +0 -0
/lattifai/{workflows → workflow}/base.py +0 -0
{lattifai-0.4.6.dist-info → lattifai-1.0.0.dist-info}/licenses/LICENSE +0 -0
{lattifai-0.4.6.dist-info → lattifai-1.0.0.dist-info}/top_level.txt +0 -0

lattifai/{io → caption}/supervision.py RENAMED Viewed

@@ -24,6 +24,7 @@ class Supervision(SupervisionSegment):
     """
     text: Optional[str] = None
+    speaker: Optional[str] = None
     id: str = ""
     recording_id: str = ""
     start: Seconds = 0.0

lattifai/{io → caption}/text_parser.py RENAMED Viewed

@@ -2,6 +2,10 @@ import logging
 import re
 from typing import Optional, Tuple
+# Timestamp pattern: [start-end] text
+# Example: [1.23-4.56] Hello world
+TIMESTAMP_PATTERN = re.compile(r"^\[([\d.]+)-([\d.]+)\]\s*(.*)$")
 # 来自于字幕中常见的说话人标记格式
 SPEAKER_PATTERN = re.compile(r"((?:>>|&gt;&gt;|>|&gt;).*?[:：])\s*(.*)")
@@ -16,8 +20,19 @@ SPEAKER_LATTIFAI = re.compile(r"(^\[SPEAKER_.*?\][:：])\s*(.*)")
 SPEAKER_PATTERN2 = re.compile(r"^([A-Z]{1,15}(?:\s+[A-Z]{1,15})?[:：])\s*(.*)$")
-def normalize_html_text(text: str) -> str:
-    """Normalize HTML text by decoding entities and stripping whitespace."""
+def normalize_text(text: str) -> str:
+    """Normalize caption text by:
+    - Decoding common HTML entities
+    - Removing HTML tags (e.g., <i>, <font>, <b>, <br>)
+    - Collapsing multiple whitespace into a single space
+    - Converting curly apostrophes to straight ones in common contractions
+    """
+    if not text:
+        return ""
+    # # Remove HTML tags first (replace with space to avoid concatenation)
+    # text = re.sub(r"<[^>]+>", " ", text)
     html_entities = {
         "&amp;": "&",
         "&lt;": "<",
@@ -26,20 +41,18 @@ def normalize_html_text(text: str) -> str:
         "&#39;": "'",
         "&nbsp;": " ",
         "\\N": " ",
-        "…": " ",
+        "…": " ",  # replace ellipsis with space to avoid merging words
     }
     for entity, char in html_entities.items():
         text = text.replace(entity, char)
-    text = re.sub(r"\s+", " ", text)  # Replace multiple spaces with a single space
     # Convert curly apostrophes to straight apostrophes for common English contractions
-    # Handles: 't 's 'll 're 've 'd 'm
-    # For example, convert "don't" to "don't"
     text = re.sub(r"([a-zA-Z])’([tsdm]|ll|re|ve)\b", r"\1'\2", text, flags=re.IGNORECASE)
-    # For example, convert "5’s" to "5's"
     text = re.sub(r"([0-9])’([s])\b", r"\1'\2", text, flags=re.IGNORECASE)
+    # Collapse whitespace (after replacements)
+    text = re.sub(r"\s+", " ", text)
     return text.strip()
@@ -70,6 +83,36 @@ def parse_speaker_text(line) -> Tuple[Optional[str], str]:
     return None, line
+def parse_timestamp_text(line: str) -> Tuple[Optional[float], Optional[float], str]:
+    """
+    Parse a line of text to extract timestamp and content.
+    Format: [start-end] text
+    Example: [1.23-4.56] Hello world
+    Args:
+        line: Input line to parse
+    Returns:
+        Tuple of (start_time, end_time, text)
+        - start_time: Start timestamp in seconds, or None if not found
+        - end_time: End timestamp in seconds, or None if not found
+        - text: The text content after the timestamp
+    """
+    match = TIMESTAMP_PATTERN.match(line)
+    if match:
+        try:
+            start = float(match.group(1))
+            end = float(match.group(2))
+            text = match.group(3).strip()
+            return start, end, text
+        except ValueError:
+            # If conversion fails, treat as plain text
+            return None, None, line
+    return None, None, line
 if __name__ == "__main__":
     pattern = re.compile(r">>\s*(.*?)\s*[:：]\s*(.*)")
     pattern = re.compile(r"(>>.*?[:：])\s*(.*)")
@@ -85,8 +128,8 @@ if __name__ == "__main__":
         match = pattern.match(text)
         if match:
             print(f"Input: '{text}'")
-            print(f"  Key:   '{match.group(1)}'")
-            print(f"  Value: '{match.group(2)}'")
+            print(f"Speaker:   '{match.group(1)}'")
+            print(f"Content: '{match.group(2)}'")
             print("-------------")
     # pattern2

lattifai/cli/__init__.py ADDED Viewed

@@ -0,0 +1,17 @@
+"""CLI module for LattifAI with nemo_run entry points."""
+import nemo_run as run  # noqa: F401
+# Import and re-export entrypoints at package level so NeMo Run can find them
+from lattifai.cli.alignment import align
+from lattifai.cli.caption import convert
+from lattifai.cli.transcribe import transcribe, transcribe_align
+from lattifai.cli.youtube import youtube
+__all__ = [
+    "align",
+    "convert",
+    "transcribe",
+    "transcribe_align",
+    "youtube",
+]

lattifai/cli/alignment.py ADDED Viewed

@@ -0,0 +1,153 @@
+"""Alignment CLI entry point with nemo_run."""
+from typing import Optional
+import nemo_run as run
+from lhotse.utils import Pathlike
+from typing_extensions import Annotated
+from lattifai.client import LattifAI
+from lattifai.config import (
+    AlignmentConfig,
+    CaptionConfig,
+    ClientConfig,
+    DiarizationConfig,
+    MediaConfig,
+    TranscriptionConfig,
+)
+__all__ = ["align"]
+@run.cli.entrypoint(name="align", namespace="alignment")
+def align(
+    input_media: Optional[str] = None,
+    input_caption: Optional[str] = None,
+    output_caption: Optional[str] = None,
+    media: Annotated[Optional[MediaConfig], run.Config[MediaConfig]] = None,
+    caption: Annotated[Optional[CaptionConfig], run.Config[CaptionConfig]] = None,
+    client: Annotated[Optional[ClientConfig], run.Config[ClientConfig]] = None,
+    alignment: Annotated[Optional[AlignmentConfig], run.Config[AlignmentConfig]] = None,
+    transcription: Annotated[Optional[TranscriptionConfig], run.Config[TranscriptionConfig]] = None,
+    diarization: Annotated[Optional[DiarizationConfig], run.Config[DiarizationConfig]] = None,
+):
+    """
+    Align audio/video with caption file.
+    This command performs forced alignment between audio/video media and caption text,
+    generating accurate timestamps for each caption segment and optionally word-level
+    timestamps. The alignment engine uses advanced speech recognition models to ensure
+    precise synchronization between audio and text.
+    Shortcut: invoking ``lai-align`` is equivalent to running ``lai alignment align``.
+    Args:
+        media: Media configuration for audio/video input and output handling.
+            Fields: input_path, media_format, sample_rate, channels, output_dir,
+                    output_path, output_format, prefer_audio, default_audio_format,
+                    default_video_format, force_overwrite
+        client: API client configuration.
+            Fields: api_key, timeout, max_retries, default_headers
+        alignment: Alignment configuration (model selection and inference settings).
+            Fields: model_name, device, batch_size
+        caption: Caption I/O configuration (file reading/writing and formatting).
+            Fields: input_format, input_path, output_format, output_path,
+                    normalize_text, split_sentence, word_level,
+                    include_speaker_in_text, encoding
+    Examples:
+        # Basic usage with positional arguments
+        lai alignment align audio.wav caption.srt output.srt
+        # Mixing positional and keyword arguments
+        lai alignment align audio.mp4 caption.srt output.json \\
+            alignment.device=cuda \\
+            caption.word_level=true
+        # Smart sentence splitting with custom output format
+        lai alignment align audio.wav caption.srt output.vtt \\
+            caption.split_sentence=true
+        # Using keyword arguments (traditional syntax)
+        lai alignment align \\
+            input_media=audio.wav \\
+            input_caption=caption.srt \\
+            output_caption=output.srt
+        # Full configuration with nested config objects
+        lai alignment align audio.wav caption.srt aligned.json \\
+            media.output_dir=/tmp/output \\
+            caption.split_sentence=true \\
+            caption.word_level=true \\
+            caption.normalize_text=true \\
+            alignment.device=mps \\
+            alignment.model_name=Lattifai/Lattice-1-Alpha
+    """
+    media_config = media or MediaConfig()
+    # Validate that input_media and media_config.input_path are not both provided
+    if input_media and media_config.input_path:
+        raise ValueError(
+            "Cannot specify both positional input_media and media.input_path. "
+            "Use either positional argument or config, not both."
+        )
+    # Assign input_media to media_config.input_path if provided
+    if input_media:
+        media_config.set_input_path(input_media)
+    if not media_config.input_path:
+        raise ValueError("Input media path must be specified via positional argument input_media= or media.input_path=")
+    caption_config = caption or CaptionConfig()
+    # Validate that output_caption_path and caption_config.output_path are not both provided
+    if output_caption and caption_config.output_path:
+        raise ValueError(
+            "Cannot specify both positional output_caption and caption.output_path. "
+            "Use either positional argument or config, not both."
+        )
+    # Assign paths to caption_config if provided
+    if input_caption:
+        caption_config.set_input_path(input_caption)
+    if output_caption:
+        caption_config.set_output_path(output_caption)
+    client = LattifAI(
+        client_config=client,
+        alignment_config=alignment,
+        caption_config=caption_config,
+        transcription_config=transcription,
+        diarization_config=diarization,
+    )
+    is_url = media_config.input_path.startswith(("http://", "https://"))
+    if is_url:
+        # Call the client's youtube method
+        return client.youtube(
+            url=media_config.input_path,
+            output_dir=media_config.output_dir,
+            output_caption_path=caption_config.output_path,
+            media_format=media_config.normalize_format() if media_config.output_format else None,
+            force_overwrite=media_config.force_overwrite,
+            split_sentence=caption_config.split_sentence,
+            channel_selector=media_config.channel_selector,
+        )
+    return client.alignment(
+        input_media=media_config.input_path,
+        input_caption=caption_config.input_path,
+        output_caption_path=caption_config.output_path,
+        split_sentence=caption_config.split_sentence,
+        channel_selector=media_config.channel_selector,
+    )
+def main():
+    run.cli.main(align)
+if __name__ == "__main__":
+    main()

lattifai/cli/caption.py ADDED Viewed

@@ -0,0 +1,204 @@
+"""Caption CLI entry point with nemo_run."""
+from typing import Optional
+import nemo_run as run
+from lhotse.utils import Pathlike
+from typing_extensions import Annotated
+from lattifai.config import CaptionConfig
+@run.cli.entrypoint(name="convert", namespace="caption")
+def convert(
+    input_path: Pathlike,
+    output_path: Pathlike,
+    include_speaker_in_text: bool = True,
+    normalize_text: bool = False,
+):
+    """
+    Convert caption file to another format.
+    This command reads a caption file from one format and writes it to another format,
+    preserving all timing information, text content, and speaker labels (if present).
+    Supports common caption formats including SRT, VTT, JSON, and Praat TextGrid.
+    Shortcut: invoking ``laisub-convert`` is equivalent to running ``lai caption convert``.
+    Args:
+        input_path: Path to input caption file (supports SRT, VTT, JSON, TextGrid formats)
+        output_path: Path to output caption file (format determined by file extension)
+        include_speaker_in_text: Preserve speaker labels in caption text content.
+        normalize_text: Whether to normalize caption text during conversion.
+            This applies text cleaning such as removing HTML tags, decoding entities,
+            collapsing whitespace, and standardizing punctuation.
+    Examples:
+        # Basic format conversion (positional arguments)
+        lai caption convert input.srt output.vtt
+        # Convert with text normalization
+        lai caption convert input.srt output.json normalize_text=true
+        # Mixing positional and keyword arguments
+        lai caption convert input.srt output.vtt \\
+            include_speaker_in_text=false \\
+            normalize_text=true
+        # Using keyword arguments (traditional syntax)
+        lai caption convert \\
+            input_path=input.srt \\
+            output_path=output.TextGrid
+    """
+    from lattifai.caption import Caption
+    caption = Caption.read(input_path, normalize_text=normalize_text)
+    caption.write(output_path, include_speaker_in_text=include_speaker_in_text)
+    print(f"✅ Converted {input_path} -> {output_path}")
+    return output_path
+@run.cli.entrypoint(name="normalize", namespace="caption")
+def normalize(
+    input_path: Pathlike,
+    output_path: Pathlike,
+    caption: Annotated[Optional[CaptionConfig], run.Config[CaptionConfig]] = None,
+):
+    """
+    Normalize caption text by cleaning HTML entities and whitespace.
+    This command reads a caption file and normalizes all text content by applying
+    the following transformations:
+    - Decode common HTML entities (&amp;, &lt;, &gt;, &quot;, &#39;, &nbsp;)
+    - Remove HTML tags (e.g., <i>, <font>, <b>, <br>)
+    - Collapse multiple whitespace characters into single spaces
+    - Convert curly apostrophes to straight ones in contractions
+    - Strip leading and trailing whitespace from each segment
+    Shortcut: invoking ``laisub-normalize`` is equivalent to running ``lai caption normalize``.
+    Args:
+        input_path: Path to input caption file to normalize
+        output_path: Path to output caption file (defaults to overwriting input file)
+        caption: Caption configuration for text normalization.
+            Fields: input_format, output_format, normalize_text (automatically enabled),
+                    encoding
+    Examples:
+        # Normalize and save to new file (positional arguments)
+        lai caption normalize input.srt output.srt
+        # Normalize with format conversion
+        lai caption normalize input.vtt output.srt
+        # Normalize with custom caption config
+        lai caption normalize input.srt output.srt \\
+            caption.encoding=utf-8
+        # Using keyword arguments (traditional syntax)
+        lai caption normalize \\
+            input_path=input.srt \\
+            output_path=output.srt
+    """
+    from pathlib import Path
+    from lattifai.caption import Caption
+    input_path = Path(input_path).expanduser()
+    output_path = Path(output_path).expanduser()
+    caption_obj = Caption.read(input_path, normalize_text=True)
+    caption_obj.write(output_path, include_speaker_in_text=True)
+    if output_path == input_path:
+        print(f"✅ Normalized {input_path} (in-place)")
+    else:
+        print(f"✅ Normalized {input_path} -> {output_path}")
+    return output_path
+@run.cli.entrypoint(name="shift", namespace="caption")
+def shift(
+    input_path: Pathlike,
+    output_path: Pathlike,
+    seconds: float,
+    caption: Annotated[Optional[CaptionConfig], run.Config[CaptionConfig]] = None,
+):
+    """
+    Shift caption timestamps by a specified number of seconds.
+    This command reads a caption file and adjusts all timestamps by adding or
+    subtracting a specified offset. Use positive values to delay captions and
+    negative values to make them appear earlier.
+    Shortcut: invoking ``laisub-shift`` is equivalent to running ``lai caption shift``.
+    Args:
+        input_path: Path to input caption file
+        output_path: Path to output caption file (can be same as input for in-place modification)
+        seconds: Number of seconds to shift timestamps. Positive values delay captions,
+                 negative values advance them earlier.
+        caption: Caption configuration for reading/writing.
+            Fields: input_format, output_format, encoding
+    Examples:
+        # Delay captions by 2 seconds (positional arguments)
+        lai caption shift input.srt output.srt 2.0
+        # Make captions appear 1.5 seconds earlier
+        lai caption shift input.srt output.srt -1.5
+        # Shift and convert format
+        lai caption shift input.vtt output.srt seconds=0.5
+        # Using keyword arguments (traditional syntax)
+        lai caption shift \\
+            input_path=input.srt \\
+            output_path=output.srt \\
+            seconds=3.0
+    """
+    from pathlib import Path
+    from lattifai.caption import Caption
+    input_path = Path(input_path).expanduser()
+    output_path = Path(output_path).expanduser()
+    # Read captions
+    caption_obj = Caption.read(input_path)
+    # Shift timestamps
+    shifted_caption = caption_obj.shift_time(seconds)
+    # Write shifted captions
+    shifted_caption.write(output_path, include_speaker_in_text=True)
+    if seconds >= 0:
+        direction = f"delayed by {seconds}s"
+    else:
+        direction = f"advanced by {abs(seconds)}s"
+    if output_path == input_path:
+        print(f"✅ Shifted timestamps {direction} in {input_path} (in-place)")
+    else:
+        print(f"✅ Shifted timestamps {direction}: {input_path} -> {output_path}")
+    return output_path
+def main_convert():
+    run.cli.main(convert)
+def main_normalize():
+    run.cli.main(normalize)
+def main_shift():
+    run.cli.main(shift)
+if __name__ == "__main__":
+    main_convert()

lattifai/cli/server.py ADDED Viewed

@@ -0,0 +1,19 @@
+import os
+import colorful
+import uvicorn
+def main():
+    """Launch the LattifAI Web Interface."""
+    print(colorful.bold_green("🚀 Launching LattifAI Web Interface..."))
+    print(colorful.cyan("See http://localhost:8001"))
+    # Ensure the directory contains the app
+    # We might need to adjust python path or just rely on installed package
+    uvicorn.run("lattifai.server.app:app", host="0.0.0.0", port=8001, reload=True, log_level="info")
+if __name__ == "__main__":
+    main()

lattifai 0.4.6__py3-none-any.whl → 1.0.0__py3-none-any.whl

lattifai 0.4.6__py3-none-any.whl → 1.0.0__py3-none-any.whl