lattifai 1.1.0__py3-none-any.whl → 1.2.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lattifai/__init__.py +0 -25
- lattifai/alignment/lattice1_aligner.py +12 -9
- lattifai/alignment/lattice1_worker.py +124 -155
- lattifai/alignment/segmenter.py +1 -1
- lattifai/alignment/sentence_splitter.py +219 -0
- lattifai/alignment/tokenizer.py +23 -179
- lattifai/audio2.py +1 -1
- lattifai/caption/caption.py +0 -2
- lattifai/caption/gemini_reader.py +151 -60
- lattifai/cli/diarization.py +3 -1
- lattifai/cli/transcribe.py +3 -8
- lattifai/cli/youtube.py +11 -0
- lattifai/client.py +96 -47
- lattifai/config/alignment.py +2 -2
- lattifai/config/client.py +5 -0
- lattifai/mixin.py +17 -8
- lattifai/utils.py +40 -4
- lattifai/workflow/youtube.py +55 -57
- {lattifai-1.1.0.dist-info → lattifai-1.2.1.dist-info}/METADATA +331 -48
- {lattifai-1.1.0.dist-info → lattifai-1.2.1.dist-info}/RECORD +24 -23
- {lattifai-1.1.0.dist-info → lattifai-1.2.1.dist-info}/WHEEL +0 -0
- {lattifai-1.1.0.dist-info → lattifai-1.2.1.dist-info}/entry_points.txt +0 -0
- {lattifai-1.1.0.dist-info → lattifai-1.2.1.dist-info}/licenses/LICENSE +0 -0
- {lattifai-1.1.0.dist-info → lattifai-1.2.1.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,219 @@
+import re
+from typing import List, Optional
+
+from lattifai.caption import Supervision
+from lattifai.utils import _resolve_model_path
+
+END_PUNCTUATION = '.!?"]。!?”】'
+
+
+class SentenceSplitter:
+    """Lazy-initialized sentence splitter using wtpsplit."""
+
+    def __init__(self, device: str = "cpu", model_hub: Optional[str] = None, lazy_init: bool = True):
+        """Initialize sentence splitter with lazy loading.
+
+        Args:
+            device: Device to run the model on (cpu, cuda, mps)
+            model_hub: Model hub to use (None for huggingface, "modelscope" for modelscope)
+        """
+        self.device = device
+        self.model_hub = model_hub
+        if lazy_init:
+            self._splitter = None
+        else:
+            self._init_splitter()
+
+    def _init_splitter(self):
+        """Initialize the sentence splitter model on first use."""
+        if self._splitter is not None:
+            return
+
+        import onnxruntime as ort
+        from wtpsplit import SaT
+
+        providers = []
+        device = self.device
+        if device.startswith("cuda") and ort.get_all_providers().count("CUDAExecutionProvider") > 0:
+            providers.append("CUDAExecutionProvider")
+        elif device.startswith("mps") and ort.get_all_providers().count("MPSExecutionProvider") > 0:
+            providers.append("MPSExecutionProvider")
+
+        if self.model_hub == "modelscope":
+            downloaded_path = _resolve_model_path("LattifAI/OmniTokenizer", model_hub="modelscope")
+            sat = SaT(
+                f"{downloaded_path}/sat-3l-sm",
+                tokenizer_name_or_path=f"{downloaded_path}/xlm-roberta-base",
+                ort_providers=providers + ["CPUExecutionProvider"],
+            )
+        else:
+            sat_path = _resolve_model_path("segment-any-text/sat-3l-sm", model_hub="huggingface")
+            sat = SaT(
+                sat_path,
+                tokenizer_name_or_path="facebookAI/xlm-roberta-base",
+                hub_prefix="segment-any-text",
+                ort_providers=providers + ["CPUExecutionProvider"],
+            )
+        self._splitter = sat
+
+    @staticmethod
+    def _resplit_special_sentence_types(sentence: str) -> List[str]:
+        """
+        Re-split special sentence types.
+
+        Examples:
+            '[APPLAUSE] >> MIRA MURATI:' -> ['[APPLAUSE]', '>> MIRA MURATI:']
+            '[MUSIC] >> SPEAKER:' -> ['[MUSIC]', '>> SPEAKER:']
+
+        Special handling patterns:
+        1. Separate special marks at the beginning (e.g., [APPLAUSE], [MUSIC], etc.) from subsequent speaker marks
+        2. Use speaker marks (>> or other separators) as split points
+
+        Args:
+            sentence: Input sentence string
+
+        Returns:
+            List of re-split sentences. If no special marks are found, returns the original sentence in a list
+        """
+        # Detect special mark patterns: [SOMETHING] >> SPEAKER:
+        # or other forms like [SOMETHING] SPEAKER:
+
+        # Pattern 1: [mark] HTML-encoded separator speaker:
+        pattern1 = r"^(\[[^\]]+\])\s+(>>|&gt;&gt;)\s+(.+)$"
+        match1 = re.match(pattern1, sentence.strip())
+        if match1:
+            special_mark = match1.group(1)
+            separator = match1.group(2)
+            speaker_part = match1.group(3)
+            return [special_mark, f"{separator} {speaker_part}"]
+
+        # Pattern 2: [mark] speaker:
+        pattern2 = r"^(\[[^\]]+\])\s+([^:]+:)(.*)$"
+        match2 = re.match(pattern2, sentence.strip())
+        if match2:
+            special_mark = match2.group(1)
+            speaker_label = match2.group(2)
+            remaining = match2.group(3).strip()
+            if remaining:
+                return [special_mark, f"{speaker_label} {remaining}"]
+            else:
+                return [special_mark, speaker_label]
+
+        # If no special pattern matches, return the original sentence
+        return [sentence]
+
+    def split_sentences(self, supervisions: List[Supervision], strip_whitespace=True) -> List[Supervision]:
+        """Split supervisions into sentences using the sentence splitter.
+
+        Careful about speaker changes.
+
+        Args:
+            supervisions: List of Supervision objects to split
+            strip_whitespace: Whether to strip whitespace from split sentences
+
+        Returns:
+            List of Supervision objects with split sentences
+        """
+        self._init_splitter()
+
+        texts, speakers = [], []
+        text_len, sidx = 0, 0
+
+        def flush_segment(end_idx: int, speaker: Optional[str] = None):
+            """Flush accumulated text from sidx to end_idx with given speaker."""
+            nonlocal text_len, sidx
+            if sidx <= end_idx:
+                if len(speakers) < len(texts) + 1:
+                    speakers.append(speaker)
+                text = " ".join(sup.text for sup in supervisions[sidx : end_idx + 1])
+                texts.append(text)
+                sidx = end_idx + 1
+                text_len = 0
+
+        for s, supervision in enumerate(supervisions):
+            text_len += len(supervision.text)
+            is_last = s == len(supervisions) - 1
+
+            if supervision.speaker:
+                # Flush previous segment without speaker (if any)
+                if sidx < s:
+                    flush_segment(s - 1, None)
+                    text_len = len(supervision.text)
+
+                # Check if we should flush this speaker's segment now
+                next_has_speaker = not is_last and supervisions[s + 1].speaker
+                if is_last or next_has_speaker:
+                    flush_segment(s, supervision.speaker)
+                else:
+                    speakers.append(supervision.speaker)
+
+            elif text_len >= 2000 or is_last:
+                flush_segment(s, None)
+
+        assert len(speakers) == len(texts), f"len(speakers)={len(speakers)} != len(texts)={len(texts)}"
+        sentences = self._splitter.split(texts, threshold=0.15, strip_whitespace=strip_whitespace, batch_size=8)
+
+        supervisions, remainder = [], ""
+        for k, (_speaker, _sentences) in enumerate(zip(speakers, sentences)):
+            # Prepend remainder from previous iteration to the first sentence
+            if _sentences and remainder:
+                _sentences[0] = remainder + _sentences[0]
+                remainder = ""
+
+            if not _sentences:
+                continue
+
+            # Process and re-split special sentence types
+            processed_sentences = []
+            for s, _sentence in enumerate(_sentences):
+                if remainder:
+                    _sentence = remainder + _sentence
+                    remainder = ""
+                # Detect and split special sentence types: e.g., '[APPLAUSE] >> MIRA MURATI:' -> ['[APPLAUSE]', '>> MIRA MURATI:']  # noqa: E501
+                resplit_parts = self._resplit_special_sentence_types(_sentence)
+                if any(resplit_parts[-1].endswith(sp) for sp in [":", ":"]):
+                    if s < len(_sentences) - 1:
+                        _sentences[s + 1] = resplit_parts[-1] + " " + _sentences[s + 1]
+                    else:  # last part
+                        remainder = resplit_parts[-1] + " "
+                    processed_sentences.extend(resplit_parts[:-1])
+                else:
+                    processed_sentences.extend(resplit_parts)
+            _sentences = processed_sentences
+
+            if not _sentences:
+                if remainder:
+                    _sentences, remainder = [remainder.strip()], ""
+                else:
+                    continue
+
+            if any(_sentences[-1].endswith(ep) for ep in END_PUNCTUATION):
+                supervisions.extend(
+                    Supervision(text=text, speaker=(_speaker if s == 0 else None)) for s, text in enumerate(_sentences)
+                )
+                _speaker = None  # reset speaker after use
+            else:
+                supervisions.extend(
+                    Supervision(text=text, speaker=(_speaker if s == 0 else None))
+                    for s, text in enumerate(_sentences[:-1])
+                )
+                remainder = _sentences[-1] + " " + remainder
+                if k < len(speakers) - 1 and speakers[k + 1] is not None:  # next speaker is set
+                    supervisions.append(
+                        Supervision(text=remainder.strip(), speaker=_speaker if len(_sentences) == 1 else None)
+                    )
+                    remainder = ""
+                elif len(_sentences) == 1:
+                    if k == len(speakers) - 1:
+                        pass  # keep _speaker for the last supervision
+                    else:
+                        assert speakers[k + 1] is None
+                        speakers[k + 1] = _speaker
+                else:
+                    assert len(_sentences) > 1
+                    _speaker = None  # reset speaker if sentence not ended
+
+        if remainder.strip():
+            supervisions.append(Supervision(text=remainder.strip(), speaker=_speaker))
+
+        return supervisions
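For orientation, a minimal usage sketch of the new class (the texts are made up; it assumes wtpsplit and onnxruntime are installed and that the SaT checkpoint can be resolved or downloaded on first use, as the code above implies):

```python
from lattifai.alignment.sentence_splitter import SentenceSplitter
from lattifai.caption import Supervision

splitter = SentenceSplitter(device="cpu")  # model is loaded lazily on the first call
supervisions = [
    Supervision(text="[APPLAUSE] >> MIRA MURATI: thank you all for coming today we", speaker=None),
    Supervision(text="are excited to share what we have been working on.", speaker=None),
]
for sup in splitter.split_sentences(supervisions):
    print(repr(sup.speaker), sup.text)
```

The splitter first merges consecutive supervisions into speaker-consistent chunks (flushing at speaker changes or roughly every 2000 characters), runs wtpsplit's SaT model over those chunks, then re-attaches speaker labels and carries unterminated fragments forward as a remainder.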
lattifai/alignment/tokenizer.py
CHANGED
@@ -4,9 +4,8 @@ import re
 from collections import defaultdict
 from typing import Any, Dict, List, Optional, Tuple, Type, TypeVar
 
-import
+import numpy as np
 
-from lattifai.alignment.phonemizer import G2Phonemizer
 from lattifai.caption import Supervision
 from lattifai.caption import normalize_text as normalize_html_text
 from lattifai.errors import (
@@ -16,8 +15,10 @@ from lattifai.errors import (
     QuotaExceededError,
 )
 
+from .phonemizer import G2Phonemizer
+from .sentence_splitter import SentenceSplitter
+
 PUNCTUATION = '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~'
-END_PUNCTUATION = '.!?"]。!?”】'
 PUNCTUATION_SPACE = PUNCTUATION + " "
 STAR_TOKEN = "※"
 
@@ -121,78 +122,17 @@ class LatticeTokenizer:
     def __init__(self, client_wrapper: Any):
         self.client_wrapper = client_wrapper
         self.model_name = ""
+        self.model_hub: Optional[str] = None
         self.words: List[str] = []
         self.g2p_model: Any = None  # Placeholder for G2P model
         self.dictionaries = defaultdict(lambda: [])
         self.oov_word = "<unk>"
-        self.sentence_splitter = None
+        self.sentence_splitter: Optional[SentenceSplitter] = None
         self.device = "cpu"
 
     def init_sentence_splitter(self):
-        if self.sentence_splitter is
-
-
-        import onnxruntime as ort
-        from wtpsplit import SaT
-
-        providers = []
-        device = self.device
-        if device.startswith("cuda") and ort.get_all_providers().count("CUDAExecutionProvider") > 0:
-            providers.append("CUDAExecutionProvider")
-        elif device.startswith("mps") and ort.get_all_providers().count("MPSExecutionProvider") > 0:
-            providers.append("MPSExecutionProvider")
-
-        sat = SaT(
-            "sat-3l-sm",
-            ort_providers=providers + ["CPUExecutionProvider"],
-        )
-        self.sentence_splitter = sat
-
-    @staticmethod
-    def _resplit_special_sentence_types(sentence: str) -> List[str]:
-        """
-        Re-split special sentence types.
-
-        Examples:
-            '[APPLAUSE] >> MIRA MURATI:' -> ['[APPLAUSE]', '>> MIRA MURATI:']
-            '[MUSIC] >> SPEAKER:' -> ['[MUSIC]', '>> SPEAKER:']
-
-        Special handling patterns:
-        1. Separate special marks at the beginning (e.g., [APPLAUSE], [MUSIC], etc.) from subsequent speaker marks
-        2. Use speaker marks (>> or other separators) as split points
-
-        Args:
-            sentence: Input sentence string
-
-        Returns:
-            List of re-split sentences. If no special marks are found, returns the original sentence in a list
-        """
-        # Detect special mark patterns: [SOMETHING] >> SPEAKER:
-        # or other forms like [SOMETHING] SPEAKER:
-
-        # Pattern 1: [mark] HTML-encoded separator speaker:
-        pattern1 = r"^(\[[^\]]+\])\s+(>>|&gt;&gt;)\s+(.+)$"
-        match1 = re.match(pattern1, sentence.strip())
-        if match1:
-            special_mark = match1.group(1)
-            separator = match1.group(2)
-            speaker_part = match1.group(3)
-            return [special_mark, f"{separator} {speaker_part}"]
-
-        # Pattern 2: [mark] speaker:
-        pattern2 = r"^(\[[^\]]+\])\s+([^:]+:)(.*)$"
-        match2 = re.match(pattern2, sentence.strip())
-        if match2:
-            special_mark = match2.group(1)
-            speaker_label = match2.group(2)
-            remaining = match2.group(3).strip()
-            if remaining:
-                return [special_mark, f"{speaker_label} {remaining}"]
-            else:
-                return [special_mark, speaker_label]
-
-        # If no special pattern matches, return the original sentence
-        return [sentence]
+        if self.sentence_splitter is None:
+            self.sentence_splitter = SentenceSplitter(device=self.device, model_hub=self.model_hub)
 
     @classmethod
     def from_pretrained(
@@ -200,6 +140,7 @@ class LatticeTokenizer:
         client_wrapper: Any,
         model_path: str,
         model_name: str,
+        model_hub: Optional[str] = None,
         device: str = "cpu",
         compressed: bool = True,
     ) -> TokenizerT:
@@ -227,6 +168,7 @@ class LatticeTokenizer:
 
         tokenizer = cls(client_wrapper=client_wrapper)
         tokenizer.model_name = model_name
+        tokenizer.model_hub = model_hub
         tokenizer.words = data["words"]
         tokenizer.dictionaries = defaultdict(list, data["dictionaries"])
         tokenizer.oov_word = data["oov_word"]
@@ -295,116 +237,16 @@ class LatticeTokenizer:
 
         return {}
 
-    def split_sentences(self, supervisions: List[Supervision], strip_whitespace=True) -> List[
+    def split_sentences(self, supervisions: List[Supervision], strip_whitespace=True) -> List[Supervision]:
         """Split supervisions into sentences using the sentence splitter.
 
-
+        Careful about speaker changes.
         """
-
-
-
-        def flush_segment(end_idx: int, speaker: Optional[str] = None):
-            """Flush accumulated text from sidx to end_idx with given speaker."""
-            nonlocal text_len, sidx
-            if sidx <= end_idx:
-                if len(speakers) < len(texts) + 1:
-                    speakers.append(speaker)
-                text = " ".join(sup.text for sup in supervisions[sidx : end_idx + 1])
-                texts.append(text)
-                sidx = end_idx + 1
-                text_len = 0
-
-        for s, supervision in enumerate(supervisions):
-            text_len += len(supervision.text)
-            is_last = s == len(supervisions) - 1
-
-            if supervision.speaker:
-                # Flush previous segment without speaker (if any)
-                if sidx < s:
-                    flush_segment(s - 1, None)
-                    text_len = len(supervision.text)
-
-                # Check if we should flush this speaker's segment now
-                next_has_speaker = not is_last and supervisions[s + 1].speaker
-                if is_last or next_has_speaker:
-                    flush_segment(s, supervision.speaker)
-                else:
-                    speakers.append(supervision.speaker)
-
-            elif text_len >= 2000 or is_last:
-                flush_segment(s, None)
-
-        assert len(speakers) == len(texts), f"len(speakers)={len(speakers)} != len(texts)={len(texts)}"
-        sentences = self.sentence_splitter.split(texts, threshold=0.15, strip_whitespace=strip_whitespace, batch_size=8)
-
-        supervisions, remainder = [], ""
-        for k, (_speaker, _sentences) in enumerate(zip(speakers, sentences)):
-            # Prepend remainder from previous iteration to the first sentence
-            if _sentences and remainder:
-                _sentences[0] = remainder + _sentences[0]
-                remainder = ""
-
-            if not _sentences:
-                continue
-
-            # Process and re-split special sentence types
-            processed_sentences = []
-            for s, _sentence in enumerate(_sentences):
-                if remainder:
-                    _sentence = remainder + _sentence
-                    remainder = ""
-                # Detect and split special sentence types: e.g., '[APPLAUSE] >> MIRA MURATI:' -> ['[APPLAUSE]', '>> MIRA MURATI:']  # noqa: E501
-                resplit_parts = self._resplit_special_sentence_types(_sentence)
-                if any(resplit_parts[-1].endswith(sp) for sp in [":", ":"]):
-                    if s < len(_sentences) - 1:
-                        _sentences[s + 1] = resplit_parts[-1] + " " + _sentences[s + 1]
-                    else:  # last part
-                        remainder = resplit_parts[-1] + " "
-                    processed_sentences.extend(resplit_parts[:-1])
-                else:
-                    processed_sentences.extend(resplit_parts)
-            _sentences = processed_sentences
-
-            if not _sentences:
-                if remainder:
-                    _sentences, remainder = [remainder.strip()], ""
-                else:
-                    continue
-
-            if any(_sentences[-1].endswith(ep) for ep in END_PUNCTUATION):
-                supervisions.extend(
-                    Supervision(text=text, speaker=(_speaker if s == 0 else None)) for s, text in enumerate(_sentences)
-                )
-                _speaker = None  # reset speaker after use
-            else:
-                supervisions.extend(
-                    Supervision(text=text, speaker=(_speaker if s == 0 else None))
-                    for s, text in enumerate(_sentences[:-1])
-                )
-                remainder = _sentences[-1] + " " + remainder
-                if k < len(speakers) - 1 and speakers[k + 1] is not None:  # next speaker is set
-                    supervisions.append(
-                        Supervision(text=remainder.strip(), speaker=_speaker if len(_sentences) == 1 else None)
-                    )
-                    remainder = ""
-                elif len(_sentences) == 1:
-                    if k == len(speakers) - 1:
-                        pass  # keep _speaker for the last supervision
-                    else:
-                        assert speakers[k + 1] is None
-                        speakers[k + 1] = _speaker
-                else:
-                    assert len(_sentences) > 1
-                    _speaker = None  # reset speaker if sentence not ended
-
-        if remainder.strip():
-            supervisions.append(Supervision(text=remainder.strip(), speaker=_speaker))
-
-        return supervisions
+        self.init_sentence_splitter()
+        return self.sentence_splitter.split_sentences(supervisions, strip_whitespace=strip_whitespace)
 
     def tokenize(self, supervisions: List[Supervision], split_sentence: bool = False) -> Tuple[str, Dict[str, Any]]:
         if split_sentence:
-            self.init_sentence_splitter()
             supervisions = self.split_sentences(supervisions)
 
         pronunciation_dictionaries = self.prenormalize([s.text for s in supervisions])
@@ -431,7 +273,7 @@ class LatticeTokenizer:
     def detokenize(
         self,
         lattice_id: str,
-        lattice_results: Tuple[
+        lattice_results: Tuple[np.ndarray, Any, Any, float, float],
         supervisions: List[Supervision],
         return_details: bool = False,
         start_margin: float = 0.08,
@@ -481,7 +323,7 @@
 
 def _add_confidence_scores(
     supervisions: List[Supervision],
-    emission:
+    emission: np.ndarray,
     labels: List[int],
     frame_shift: float,
     offset: float = 0.0,
@@ -499,17 +341,17 @@ def _add_confidence_scores(
         labels: Token labels corresponding to aligned tokens
         frame_shift: Frame shift in seconds for converting frames to time
     """
-    tokens =
+    tokens = np.array(labels, dtype=np.int64)
 
     for supervision in supervisions:
         start_frame = int((supervision.start - offset) / frame_shift)
         end_frame = int((supervision.end - offset) / frame_shift)
 
         # Compute segment-level confidence
-        probabilities = emission[0, start_frame:end_frame]
+        probabilities = np.exp(emission[0, start_frame:end_frame])
         aligned = probabilities[range(0, end_frame - start_frame), tokens[start_frame:end_frame]]
-        diffprobs =
-        supervision.score = round(1.0 - diffprobs.mean()
+        diffprobs = np.max(probabilities, axis=-1) - aligned
+        supervision.score = round(1.0 - diffprobs.mean(), ndigits=4)
 
         # Compute word-level confidence if alignment exists
         if hasattr(supervision, "alignment") and supervision.alignment:
@@ -517,7 +359,7 @@ def _add_confidence_scores(
             for w, item in enumerate(words):
                 start = int((item.start - offset) / frame_shift) - start_frame
                 end = int((item.end - offset) / frame_shift) - start_frame
-                words[w] = item._replace(score=round(1.0 - diffprobs[start:end].mean()
+                words[w] = item._replace(score=round(1.0 - diffprobs[start:end].mean(), ndigits=4))
 
 
 def _update_alignments_speaker(supervisions: List[Supervision], alignments: List[Supervision]) -> List[Supervision]:
@@ -539,6 +381,7 @@ def _load_tokenizer(
     model_name: str,
     device: str,
     *,
+    model_hub: Optional[str] = None,
     tokenizer_cls: Type[LatticeTokenizer] = LatticeTokenizer,
 ) -> LatticeTokenizer:
     """Instantiate tokenizer with consistent error handling."""
@@ -546,5 +389,6 @@ def _load_tokenizer(
         client_wrapper=client_wrapper,
         model_path=model_path,
         model_name=model_name,
+        model_hub=model_hub,
         device=device,
     )
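The alignment confidence computation now runs on NumPy (log-probability emissions, `np.exp`, `np.max`). As a rough standalone illustration of the per-segment score introduced above (toy shapes and random data, not the package's actual call path; it assumes `emission` holds log-probabilities of shape (1, frames, vocab)):

```python
import numpy as np

# toy log-probability emission: 1 utterance, 6 frames, 5 token classes
rng = np.random.default_rng(0)
emission = np.log(rng.dirichlet(np.ones(5), size=(1, 6)))
labels = [1, 1, 3, 3, 4, 0]  # aligned token id per frame
tokens = np.array(labels, dtype=np.int64)

start_frame, end_frame = 0, 6
probabilities = np.exp(emission[0, start_frame:end_frame])
aligned = probabilities[range(0, end_frame - start_frame), tokens[start_frame:end_frame]]
# per-frame gap between the most probable token and the aligned token
diffprobs = np.max(probabilities, axis=-1) - aligned
score = round(1.0 - diffprobs.mean(), ndigits=4)  # 1.0 when the aligned token is always the argmax
print(score)
```

The same `diffprobs` array is sliced per word to produce the word-level scores.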
lattifai/audio2.py
CHANGED
@@ -36,7 +36,7 @@ class AudioData(namedtuple("AudioData", ["sampling_rate", "ndarray", "path", "st
     @property
     def streaming_mode(self) -> bool:
         """Indicates whether streaming mode is enabled based on streaming_chunk_secs."""
-        if self.streaming_chunk_secs
+        if self.streaming_chunk_secs:
             return self.duration > self.streaming_chunk_secs * 1.1
         return False
 
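The repaired guard treats a falsy `streaming_chunk_secs` (None or 0) as "streaming disabled"; a toy restatement of the property's logic (standalone function, not the real `AudioData`):

```python
def streaming_mode(duration: float, streaming_chunk_secs) -> bool:
    # mirrors AudioData.streaming_mode: a falsy chunk size disables streaming
    if streaming_chunk_secs:
        return duration > streaming_chunk_secs * 1.1
    return False

print(streaming_mode(600.0, 30))    # True: file is much longer than one chunk
print(streaming_mode(600.0, None))  # False: chunking disabled
print(streaming_mode(20.0, 30))     # False: shorter than ~1.1x the chunk size
```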
lattifai/caption/caption.py
CHANGED
@@ -467,7 +467,6 @@ class Caption:
                 sup_dict = sup.to_dict()
                 json_data.append(sup_dict)
             json.dump(json_data, f, ensure_ascii=False, indent=4)
-
         elif str(output_path).lower().endswith(".textgrid"):
             from tgt import Interval, IntervalTier, TextGrid, write_to_file
 
@@ -506,7 +505,6 @@ class Caption:
             tg.add_tier(IntervalTier(name="word_scores", objects=scores["words"]))
 
             write_to_file(tg, output_path, format="long")
-
         elif str(output_path)[-4:].lower() == ".tsv":
             cls._write_tsv(alignments, output_path, include_speaker_in_text)
         elif str(output_path)[-4:].lower() == ".csv":