lattifai-1.2.0-py3-none-any.whl → lattifai-1.2.2-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lattifai/__init__.py +0 -24
- lattifai/alignment/__init__.py +10 -1
- lattifai/alignment/lattice1_aligner.py +66 -58
- lattifai/alignment/lattice1_worker.py +1 -6
- lattifai/alignment/punctuation.py +38 -0
- lattifai/alignment/segmenter.py +1 -1
- lattifai/alignment/sentence_splitter.py +350 -0
- lattifai/alignment/text_align.py +440 -0
- lattifai/alignment/tokenizer.py +91 -220
- lattifai/caption/__init__.py +82 -6
- lattifai/caption/caption.py +335 -1143
- lattifai/caption/formats/__init__.py +199 -0
- lattifai/caption/formats/base.py +211 -0
- lattifai/caption/formats/gemini.py +722 -0
- lattifai/caption/formats/json.py +194 -0
- lattifai/caption/formats/lrc.py +309 -0
- lattifai/caption/formats/nle/__init__.py +9 -0
- lattifai/caption/formats/nle/audition.py +561 -0
- lattifai/caption/formats/nle/avid.py +423 -0
- lattifai/caption/formats/nle/fcpxml.py +549 -0
- lattifai/caption/formats/nle/premiere.py +589 -0
- lattifai/caption/formats/pysubs2.py +642 -0
- lattifai/caption/formats/sbv.py +147 -0
- lattifai/caption/formats/tabular.py +338 -0
- lattifai/caption/formats/textgrid.py +193 -0
- lattifai/caption/formats/ttml.py +652 -0
- lattifai/caption/formats/vtt.py +469 -0
- lattifai/caption/parsers/__init__.py +9 -0
- lattifai/caption/{text_parser.py → parsers/text_parser.py} +4 -2
- lattifai/caption/standardize.py +636 -0
- lattifai/caption/utils.py +474 -0
- lattifai/cli/__init__.py +2 -1
- lattifai/cli/caption.py +108 -1
- lattifai/cli/transcribe.py +4 -9
- lattifai/cli/youtube.py +4 -1
- lattifai/client.py +48 -84
- lattifai/config/__init__.py +11 -1
- lattifai/config/alignment.py +9 -2
- lattifai/config/caption.py +267 -23
- lattifai/config/media.py +20 -0
- lattifai/diarization/__init__.py +41 -1
- lattifai/mixin.py +36 -18
- lattifai/transcription/base.py +6 -1
- lattifai/transcription/lattifai.py +19 -54
- lattifai/utils.py +81 -13
- lattifai/workflow/__init__.py +28 -4
- lattifai/workflow/file_manager.py +2 -5
- lattifai/youtube/__init__.py +43 -0
- lattifai/youtube/client.py +1170 -0
- lattifai/youtube/types.py +23 -0
- lattifai-1.2.2.dist-info/METADATA +615 -0
- lattifai-1.2.2.dist-info/RECORD +76 -0
- {lattifai-1.2.0.dist-info → lattifai-1.2.2.dist-info}/entry_points.txt +1 -2
- lattifai/caption/gemini_reader.py +0 -371
- lattifai/caption/gemini_writer.py +0 -173
- lattifai/cli/app_installer.py +0 -142
- lattifai/cli/server.py +0 -44
- lattifai/server/app.py +0 -427
- lattifai/workflow/youtube.py +0 -577
- lattifai-1.2.0.dist-info/METADATA +0 -1133
- lattifai-1.2.0.dist-info/RECORD +0 -57
- {lattifai-1.2.0.dist-info → lattifai-1.2.2.dist-info}/WHEEL +0 -0
- {lattifai-1.2.0.dist-info → lattifai-1.2.2.dist-info}/licenses/LICENSE +0 -0
- {lattifai-1.2.0.dist-info → lattifai-1.2.2.dist-info}/top_level.txt +0 -0
lattifai/alignment/tokenizer.py
CHANGED
@@ -2,12 +2,13 @@ import gzip
 import pickle
 import re
 from collections import defaultdict
-from typing import Any, Dict, List, Optional, Tuple, Type, TypeVar
+from typing import Any, Dict, List, Optional, Tuple, Type, TypeVar, Union

 import numpy as np

-from lattifai.caption import Supervision
-from …
+# from lattifai.caption import Supervision
+from lhotse.supervision import SupervisionSegment as Supervision  # NOTE: Transcriber SupervisionSegment
+
 from lattifai.caption import normalize_text as normalize_html_text
 from lattifai.errors import (
     LATTICE_DECODING_FAILURE_HELP,
@@ -16,12 +17,10 @@ from lattifai.errors import (
     QuotaExceededError,
 )

-
-
-
-
-
-GROUPING_SEPARATOR = "✹"
+from .phonemizer import G2Phonemizer
+from .punctuation import PUNCTUATION, PUNCTUATION_SPACE
+from .sentence_splitter import SentenceSplitter
+from .text_align import TextAlignResult

 MAXIMUM_WORD_LENGTH = 40

@@ -79,8 +78,11 @@ def tokenize_multilingual_text(text: str, keep_spaces: bool = True, attach_punct
     ['Kühlschrank']
     >>> tokenize_multilingual_text("Hello, World!", attach_punctuation=True)
     ['Hello,', ' ', 'World!']
+    >>> tokenize_multilingual_text("[AED], World!", keep_spaces=False, attach_punctuation=True)
+    ['[AED],', 'World!']
     """
     # Regex pattern:
+    # - \[[A-Z_]+\] matches bracketed annotations like [APPLAUSE], [MUSIC], [SPEAKER_01]
     # - [a-zA-Z0-9\u00C0-\u024F]+ matches Latin letters (including accented chars like ü, ö, ä, ß, é, etc.)
     # - (?:'[a-zA-Z]{1,2})? optionally matches contractions like 's, 't, 'm, 'll, 're, 've
     # - [\u4e00-\u9fff] matches CJK characters
@@ -89,7 +91,7 @@ def tokenize_multilingual_text(text: str, keep_spaces: bool = True, attach_punct
     # - \u00C0-\u00FF: Latin-1 Supplement (À-ÿ)
     # - \u0100-\u017F: Latin Extended-A
     # - \u0180-\u024F: Latin Extended-B
-    pattern = re.compile(r"([a-zA-Z0-9\u00C0-\u024F]+(?:'[a-zA-Z]{1,2})?|[\u4e00-\u9fff]|.)")
+    pattern = re.compile(r"(\[[A-Z_]+\]|[a-zA-Z0-9\u00C0-\u024F]+(?:'[a-zA-Z]{1,2})?|[\u4e00-\u9fff]|.)")

     # filter(None, ...) removes any empty strings from re.findall results
     tokens = list(filter(None, pattern.findall(text)))
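The only functional change in this hunk is the new leading alternative \[[A-Z_]+\], which turns bracketed annotations into single tokens instead of per-character fragments. A minimal sketch of the pattern in isolation (the space filtering and attach_punctuation merging shown in the doctests happen later in the function, not in the regex):

import re

# 1.2.2 pattern: bracketed annotations are tried before the word/CJK/catch-all alternatives.
pattern = re.compile(r"(\[[A-Z_]+\]|[a-zA-Z0-9\u00C0-\u024F]+(?:'[a-zA-Z]{1,2})?|[\u4e00-\u9fff]|.)")
print([t for t in pattern.findall("[APPLAUSE] Hello, World!") if t.strip()])
# -> ['[APPLAUSE]', 'Hello', ',', 'World', '!']

# The 1.2.0 pattern split the same annotation into three tokens.
old = re.compile(r"([a-zA-Z0-9\u00C0-\u024F]+(?:'[a-zA-Z]{1,2})?|[\u4e00-\u9fff]|.)")
print([t for t in old.findall("[APPLAUSE]") if t.strip()])
# -> ['[', 'APPLAUSE', ']']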
@@ -126,84 +128,12 @@ class LatticeTokenizer:
         self.g2p_model: Any = None  # Placeholder for G2P model
         self.dictionaries = defaultdict(lambda: [])
         self.oov_word = "<unk>"
-        self.sentence_splitter = None
+        self.sentence_splitter: Optional[SentenceSplitter] = None
         self.device = "cpu"

     def init_sentence_splitter(self):
-        if self.sentence_splitter is …
-
-
-        import onnxruntime as ort
-        from wtpsplit import SaT
-
-        providers = []
-        device = self.device
-        if device.startswith("cuda") and ort.get_all_providers().count("CUDAExecutionProvider") > 0:
-            providers.append("CUDAExecutionProvider")
-        elif device.startswith("mps") and ort.get_all_providers().count("MPSExecutionProvider") > 0:
-            providers.append("MPSExecutionProvider")
-
-        if self.model_hub == "modelscope":
-            from modelscope.hub.snapshot_download import snapshot_download as ms_snapshot
-
-            downloaded_path = ms_snapshot("LattifAI/OmniTokenizer")
-            sat = SaT(
-                f"{downloaded_path}/sat-3l-sm",
-                tokenizer_name_or_path=f"{downloaded_path}/xlm-roberta-base",
-                ort_providers=providers + ["CPUExecutionProvider"],
-            )
-        else:
-            sat = SaT(
-                "sat-3l-sm",
-                ort_providers=providers + ["CPUExecutionProvider"],
-            )
-        self.sentence_splitter = sat
-
-    @staticmethod
-    def _resplit_special_sentence_types(sentence: str) -> List[str]:
-        """
-        Re-split special sentence types.
-
-        Examples:
-            '[APPLAUSE] >> MIRA MURATI:' -> ['[APPLAUSE]', '>> MIRA MURATI:']
-            '[MUSIC] >> SPEAKER:' -> ['[MUSIC]', '>> SPEAKER:']
-
-        Special handling patterns:
-        1. Separate special marks at the beginning (e.g., [APPLAUSE], [MUSIC], etc.) from subsequent speaker marks
-        2. Use speaker marks (>> or other separators) as split points
-
-        Args:
-            sentence: Input sentence string
-
-        Returns:
-            List of re-split sentences. If no special marks are found, returns the original sentence in a list
-        """
-        # Detect special mark patterns: [SOMETHING] >> SPEAKER:
-        # or other forms like [SOMETHING] SPEAKER:
-
-        # Pattern 1: [mark] HTML-encoded separator speaker:
-        pattern1 = r"^(\[[^\]]+\])\s+(>>|>>)\s+(.+)$"
-        match1 = re.match(pattern1, sentence.strip())
-        if match1:
-            special_mark = match1.group(1)
-            separator = match1.group(2)
-            speaker_part = match1.group(3)
-            return [special_mark, f"{separator} {speaker_part}"]
-
-        # Pattern 2: [mark] speaker:
-        pattern2 = r"^(\[[^\]]+\])\s+([^:]+:)(.*)$"
-        match2 = re.match(pattern2, sentence.strip())
-        if match2:
-            special_mark = match2.group(1)
-            speaker_label = match2.group(2)
-            remaining = match2.group(3).strip()
-            if remaining:
-                return [special_mark, f"{speaker_label} {remaining}"]
-            else:
-                return [special_mark, speaker_label]
-
-        # If no special pattern matches, return the original sentence
-        return [sentence]
+        if self.sentence_splitter is None:
+            self.sentence_splitter = SentenceSplitter(device=self.device, model_hub=self.model_hub)

     @classmethod
     def from_pretrained(
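For context, the deleted provider-selection and SaT-loading logic is not gone: per the file list it moves into the new lattifai/alignment/sentence_splitter.py (+350 lines), behind the SentenceSplitter class constructed above. A rough sketch of the underlying wtpsplit usage, reconstructed from the deleted lines (the actual SentenceSplitter internals are not shown in this diff, so treat this as an approximation):

from wtpsplit import SaT

# CPU-only variant of the deleted code path: load the small SaT segmentation
# model and split a batch of texts. The threshold and batch_size mirror the
# values the old split_sentences passed to SaT.split.
sat = SaT("sat-3l-sm", ort_providers=["CPUExecutionProvider"])
sentences = list(
    sat.split(["Hello world. How are you? I'm fine."], threshold=0.15, strip_whitespace=True, batch_size=8)
)
print(sentences)  # one list of sentence strings per input text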
@@ -308,127 +238,45 @@ class LatticeTokenizer:

         return {}

-    def split_sentences(self, supervisions: List[Supervision], strip_whitespace=True) -> List[
+    def split_sentences(self, supervisions: List[Supervision], strip_whitespace=True) -> List[Supervision]:
         """Split supervisions into sentences using the sentence splitter.

-
+        Careful about speaker changes.
         """
-        … (old lines 316-349 not captured in this diff view) …
-        assert len(speakers) == len(texts), f"len(speakers)={len(speakers)} != len(texts)={len(texts)}"
-        sentences = self.sentence_splitter.split(texts, threshold=0.15, strip_whitespace=strip_whitespace, batch_size=8)
-
-        supervisions, remainder = [], ""
-        for k, (_speaker, _sentences) in enumerate(zip(speakers, sentences)):
-            # Prepend remainder from previous iteration to the first sentence
-            if _sentences and remainder:
-                _sentences[0] = remainder + _sentences[0]
-                remainder = ""
-
-            if not _sentences:
-                continue
-
-            # Process and re-split special sentence types
-            processed_sentences = []
-            for s, _sentence in enumerate(_sentences):
-                if remainder:
-                    _sentence = remainder + _sentence
-                    remainder = ""
-                # Detect and split special sentence types: e.g., '[APPLAUSE] >> MIRA MURATI:' -> ['[APPLAUSE]', '>> MIRA MURATI:']  # noqa: E501
-                resplit_parts = self._resplit_special_sentence_types(_sentence)
-                if any(resplit_parts[-1].endswith(sp) for sp in [":", ":"]):
-                    if s < len(_sentences) - 1:
-                        _sentences[s + 1] = resplit_parts[-1] + " " + _sentences[s + 1]
-                    else:  # last part
-                        remainder = resplit_parts[-1] + " "
-                    processed_sentences.extend(resplit_parts[:-1])
-                else:
-                    processed_sentences.extend(resplit_parts)
-            _sentences = processed_sentences
-
-            if not _sentences:
-                if remainder:
-                    _sentences, remainder = [remainder.strip()], ""
-                else:
-                    continue
+        self.init_sentence_splitter()
+        return self.sentence_splitter.split_sentences(supervisions, strip_whitespace=strip_whitespace)
+
+    def tokenize(
+        self, supervisions: Union[List[Supervision], TextAlignResult], split_sentence: bool = False, boost: float = 0.0
+    ) -> Tuple[str, Dict[str, Any]]:
+        if isinstance(supervisions[0], Supervision):
+            if split_sentence:
+                supervisions = self.split_sentences(supervisions)
+
+            pronunciation_dictionaries = self.prenormalize([s.text for s in supervisions])
+            response = self.client_wrapper.post(
+                "tokenize",
+                json={
+                    "model_name": self.model_name,
+                    "supervisions": [s.to_dict() for s in supervisions],
+                    "pronunciation_dictionaries": pronunciation_dictionaries,
+                },
+            )
+        else:
+            pronunciation_dictionaries = self.prenormalize([s.text for s in supervisions[0]])
+            pronunciation_dictionaries.update(self.prenormalize([s.text for s in supervisions[1]]))
+
+            response = self.client_wrapper.post(
+                "difftokenize",
+                json={
+                    "model_name": self.model_name,
+                    "supervisions": [s.to_dict() for s in supervisions[0]],
+                    "transcription": [s.to_dict() for s in supervisions[1]],
+                    "pronunciation_dictionaries": pronunciation_dictionaries,
+                    "boost": boost,
+                },
+            )

-            if any(_sentences[-1].endswith(ep) for ep in END_PUNCTUATION):
-                supervisions.extend(
-                    Supervision(text=text, speaker=(_speaker if s == 0 else None)) for s, text in enumerate(_sentences)
-                )
-                _speaker = None  # reset speaker after use
-            else:
-                supervisions.extend(
-                    Supervision(text=text, speaker=(_speaker if s == 0 else None))
-                    for s, text in enumerate(_sentences[:-1])
-                )
-                remainder = _sentences[-1] + " " + remainder
-                if k < len(speakers) - 1 and speakers[k + 1] is not None:  # next speaker is set
-                    supervisions.append(
-                        Supervision(text=remainder.strip(), speaker=_speaker if len(_sentences) == 1 else None)
-                    )
-                    remainder = ""
-                elif len(_sentences) == 1:
-                    if k == len(speakers) - 1:
-                        pass  # keep _speaker for the last supervision
-                    else:
-                        assert speakers[k + 1] is None
-                        speakers[k + 1] = _speaker
-                else:
-                    assert len(_sentences) > 1
-                    _speaker = None  # reset speaker if sentence not ended
-
-        if remainder.strip():
-            supervisions.append(Supervision(text=remainder.strip(), speaker=_speaker))
-
-        return supervisions
-
-    def tokenize(self, supervisions: List[Supervision], split_sentence: bool = False) -> Tuple[str, Dict[str, Any]]:
-        if split_sentence:
-            self.init_sentence_splitter()
-            supervisions = self.split_sentences(supervisions)
-
-        pronunciation_dictionaries = self.prenormalize([s.text for s in supervisions])
-        response = self.client_wrapper.post(
-            "tokenize",
-            json={
-                "model_name": self.model_name,
-                "supervisions": [s.to_dict() for s in supervisions],
-                "pronunciation_dictionaries": pronunciation_dictionaries,
-            },
-        )
         if response.status_code == 402:
             raise QuotaExceededError(response.json().get("detail", "Quota exceeded"))
         if response.status_code != 200:
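tokenize is now polymorphic: a plain List[Supervision] goes to the "tokenize" endpoint, while a TextAlignResult goes to "difftokenize". Judging by the supervisions[0] / supervisions[1] indexing above, a TextAlignResult behaves like a pair of supervision lists (caption text and ASR transcription). A hedged sketch of the two call shapes (variable names and the boost value are illustrative, not from the package):

# Single-source path: optional sentence splitting, then the "tokenize" endpoint.
lattice_id, extra = tokenizer.tokenize(supervisions, split_sentence=True)

# Diff path: caption/transcription pair, "difftokenize" endpoint; `boost`
# presumably biases decoding toward the reference text.
lattice_id, extra = tokenizer.tokenize(text_align_result, boost=0.5)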
@@ -445,28 +293,47 @@ class LatticeTokenizer:
         self,
         lattice_id: str,
         lattice_results: Tuple[np.ndarray, Any, Any, float, float],
-        supervisions: List[Supervision],
+        supervisions: Union[List[Supervision], TextAlignResult],
         return_details: bool = False,
         start_margin: float = 0.08,
         end_margin: float = 0.20,
     ) -> List[Supervision]:
         emission, results, labels, frame_shift, offset, channel = lattice_results  # noqa: F841
-        … (old lines 454-469 not captured in this diff view) …
+        if isinstance(supervisions[0], Supervision):
+            response = self.client_wrapper.post(
+                "detokenize",
+                json={
+                    "model_name": self.model_name,
+                    "lattice_id": lattice_id,
+                    "frame_shift": frame_shift,
+                    "results": [t.to_dict() for t in results[0]],
+                    "labels": labels[0],
+                    "offset": offset,
+                    "channel": channel,
+                    "return_details": False if return_details is None else return_details,
+                    "destroy_lattice": True,
+                    "start_margin": start_margin,
+                    "end_margin": end_margin,
+                },
+            )
+        else:
+            response = self.client_wrapper.post(
+                "diffdetokenize",
+                json={
+                    "model_name": self.model_name,
+                    "lattice_id": lattice_id,
+                    "frame_shift": frame_shift,
+                    "results": [t.to_dict() for t in results[0]],
+                    "labels": labels[0],
+                    "offset": offset,
+                    "channel": channel,
+                    "return_details": False if return_details is None else return_details,
+                    "destroy_lattice": True,
+                    "start_margin": start_margin,
+                    "end_margin": end_margin,
+                },
+            )
+
         if response.status_code == 400:
             raise LatticeDecodingError(
                 lattice_id,
@@ -487,7 +354,11 @@ class LatticeTokenizer:
         # Add emission confidence scores for segments and word-level alignments
         _add_confidence_scores(alignments, emission, labels[0], frame_shift, offset)

-        alignments = _update_alignments_speaker(supervisions, alignments)
+        if isinstance(supervisions[0], Supervision):
+            alignments = _update_alignments_speaker(supervisions, alignments)
+        else:
+            # NOTE: Text Diff Alignment >> speaker has been handled in the backend service
+            pass

         return alignments

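The two detokenize branches above post identical payloads and differ only in the endpoint name, so the whole tokenize/detokenize dispatch reduces to one probe on the first element. A hypothetical consolidation sketch (the helper name is mine, not in the package):

def _detokenize_endpoint(supervisions) -> str:
    # List[Supervision] -> "detokenize"; TextAlignResult pair -> "diffdetokenize".
    return "detokenize" if isinstance(supervisions[0], Supervision) else "diffdetokenize"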
lattifai/caption/__init__.py
CHANGED
@@ -1,20 +1,96 @@
-
+"""Caption processing module for LattifAI.

-
+This module provides comprehensive caption/subtitle processing capabilities:
+- Multi-format reading and writing (SRT, VTT, ASS, TTML, etc.)
+- Professional NLE integration (Avid, Final Cut Pro, Premiere Pro, DaVinci Resolve)
+- Audio workstation support (Pro Tools, Adobe Audition)
+- Advanced features: timecode offset, overlap resolution, word-level timing
+"""

-from ..config.caption import InputCaptionFormat
+from ..config.caption import InputCaptionFormat, OutputCaptionFormat
 from .caption import Caption
-from .…
-from .…
+from .formats.gemini import GeminiReader, GeminiSegment, GeminiWriter
+from .formats.nle.audition import (
+    AuditionCSVConfig,
+    AuditionCSVWriter,
+    EdiMarkerConfig,
+    EdiMarkerWriter,
+)
+
+# Professional NLE format writers (re-exported from formats/)
+from .formats.nle.avid import AvidDSConfig, AvidDSWriter, FrameRate
+from .formats.nle.fcpxml import FCPXMLConfig, FCPXMLStyle, FCPXMLWriter
+from .formats.nle.premiere import PremiereXMLConfig, PremiereXMLWriter
+from .formats.ttml import TTMLConfig, TTMLFormat, TTMLRegion, TTMLStyle
+from .parsers.text_parser import normalize_text
+from .standardize import (
+    CaptionStandardizer,
+    CaptionValidator,
+    StandardizationConfig,
+    ValidationResult,
+    apply_margins_to_captions,
+    standardize_captions,
+)
 from .supervision import Supervision
-
+
+# Create TTMLWriter alias for backward compatibility
+TTMLWriter = TTMLFormat
+
+# Utility functions
+from .utils import (
+    CollisionMode,
+    TimecodeOffset,
+    apply_timecode_offset,
+    detect_overlaps,
+    format_srt_timestamp,
+    generate_srt_content,
+    resolve_overlaps,
+    split_long_lines,
+)

 __all__ = [
+    # Core classes
     "Caption",
     "Supervision",
+    # Standardization
+    "CaptionStandardizer",
+    "CaptionValidator",
+    "StandardizationConfig",
+    "ValidationResult",
+    "standardize_captions",
+    "apply_margins_to_captions",
+    # Gemini format support
     "GeminiReader",
     "GeminiWriter",
     "GeminiSegment",
+    # Text utilities
     "normalize_text",
+    # Format types
     "InputCaptionFormat",
+    "OutputCaptionFormat",
+    # Professional format writers
+    "AvidDSWriter",
+    "AvidDSConfig",
+    "FCPXMLWriter",
+    "FCPXMLConfig",
+    "FCPXMLStyle",
+    "PremiereXMLWriter",
+    "PremiereXMLConfig",
+    "AuditionCSVWriter",
+    "AuditionCSVConfig",
+    "EdiMarkerWriter",
+    "EdiMarkerConfig",
+    "TTMLWriter",
+    "TTMLConfig",
+    "TTMLStyle",
+    "TTMLRegion",
+    # Utilities
+    "CollisionMode",
+    "TimecodeOffset",
+    "apply_timecode_offset",
+    "resolve_overlaps",
+    "detect_overlaps",
+    "split_long_lines",
+    "format_srt_timestamp",
+    "generate_srt_content",
 ]
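Every symbol above is re-exported at the package root, so existing `from lattifai.caption import …` call sites keep resolving after the reorganization into formats/ and parsers/. A quick sanity check using only names from the new __all__:

from lattifai.caption import (
    Caption,
    Supervision,
    TTMLWriter,  # backward-compatibility alias
    TimecodeOffset,
    detect_overlaps,
    resolve_overlaps,
)

# The alias is the TTMLFormat class itself, not a wrapper.
assert TTMLWriter.__name__ == "TTMLFormat"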