PyPI - lattifai - Versions diffs - 1.2.1__py3-none-any.whl → 1.3.0__py3-none-any.whl - Mend

lattifai 1.2.1__py3-none-any.whl → 1.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (59) hide show

lattifai/_init.py +20 -0
lattifai/alignment/__init__.py +9 -1
lattifai/alignment/lattice1_aligner.py +175 -54
lattifai/alignment/lattice1_worker.py +47 -4
lattifai/alignment/punctuation.py +38 -0
lattifai/alignment/segmenter.py +3 -2
lattifai/alignment/text_align.py +441 -0
lattifai/alignment/tokenizer.py +134 -65
lattifai/audio2.py +162 -183
lattifai/cli/__init__.py +2 -1
lattifai/cli/alignment.py +5 -0
lattifai/cli/caption.py +111 -4
lattifai/cli/transcribe.py +2 -6
lattifai/cli/youtube.py +7 -1
lattifai/client.py +72 -123
lattifai/config/__init__.py +28 -0
lattifai/config/alignment.py +14 -0
lattifai/config/caption.py +45 -31
lattifai/config/client.py +16 -0
lattifai/config/event.py +102 -0
lattifai/config/media.py +20 -0
lattifai/config/transcription.py +25 -1
lattifai/data/__init__.py +8 -0
lattifai/data/caption.py +228 -0
lattifai/diarization/__init__.py +41 -1
lattifai/errors.py +78 -53
lattifai/event/__init__.py +65 -0
lattifai/event/lattifai.py +166 -0
lattifai/mixin.py +49 -32
lattifai/transcription/base.py +8 -2
lattifai/transcription/gemini.py +147 -16
lattifai/transcription/lattifai.py +25 -63
lattifai/types.py +1 -1
lattifai/utils.py +7 -13
lattifai/workflow/__init__.py +28 -4
lattifai/workflow/file_manager.py +2 -5
lattifai/youtube/__init__.py +43 -0
lattifai/youtube/client.py +1265 -0
lattifai/youtube/types.py +23 -0
lattifai-1.3.0.dist-info/METADATA +678 -0
lattifai-1.3.0.dist-info/RECORD +57 -0
{lattifai-1.2.1.dist-info → lattifai-1.3.0.dist-info}/entry_points.txt +1 -2
lattifai/__init__.py +0 -88
lattifai/alignment/sentence_splitter.py +0 -219
lattifai/caption/__init__.py +0 -20
lattifai/caption/caption.py +0 -1467
lattifai/caption/gemini_reader.py +0 -462
lattifai/caption/gemini_writer.py +0 -173
lattifai/caption/supervision.py +0 -34
lattifai/caption/text_parser.py +0 -145
lattifai/cli/app_installer.py +0 -142
lattifai/cli/server.py +0 -44
lattifai/server/app.py +0 -427
lattifai/workflow/youtube.py +0 -577
lattifai-1.2.1.dist-info/METADATA +0 -1134
lattifai-1.2.1.dist-info/RECORD +0 -58
{lattifai-1.2.1.dist-info → lattifai-1.3.0.dist-info}/WHEEL +0 -0
{lattifai-1.2.1.dist-info → lattifai-1.3.0.dist-info}/licenses/LICENSE +0 -0
{lattifai-1.2.1.dist-info → lattifai-1.3.0.dist-info}/top_level.txt +0 -0

lattifai/alignment/tokenizer.py CHANGED Viewed

@@ -2,11 +2,11 @@ import gzip
 import pickle
 import re
 from collections import defaultdict
-from typing import Any, Dict, List, Optional, Tuple, Type, TypeVar
+from typing import Any, Dict, List, Optional, Tuple, Type, TypeVar, Union
 import numpy as np
-from lattifai.caption import Supervision
+from lattifai.caption import SentenceSplitter, Supervision
 from lattifai.caption import normalize_text as normalize_html_text
 from lattifai.errors import (
     LATTICE_DECODING_FAILURE_HELP,
@@ -15,14 +15,8 @@ from lattifai.errors import (
     QuotaExceededError,
 )
-from .phonemizer import G2Phonemizer
-from .sentence_splitter import SentenceSplitter
-PUNCTUATION = '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~'
-PUNCTUATION_SPACE = PUNCTUATION + " "
-STAR_TOKEN = "※"
-GROUPING_SEPARATOR = "✹"
+from .punctuation import PUNCTUATION, PUNCTUATION_SPACE
+from .text_align import TextAlignResult
 MAXIMUM_WORD_LENGTH = 40
@@ -80,8 +74,11 @@ def tokenize_multilingual_text(text: str, keep_spaces: bool = True, attach_punct
         ['Kühlschrank']
         >>> tokenize_multilingual_text("Hello, World!", attach_punctuation=True)
         ['Hello,', ' ', 'World!']
+        >>> tokenize_multilingual_text("[AED], World!", keep_spaces=False, attach_punctuation=True)
+        ['[AED],', 'World!']
     """
     # Regex pattern:
+    # - \[[A-Z_]+\] matches bracketed annotations like [APPLAUSE], [MUSIC], [SPEAKER_01]
     # - [a-zA-Z0-9\u00C0-\u024F]+ matches Latin letters (including accented chars like ü, ö, ä, ß, é, etc.)
     # - (?:'[a-zA-Z]{1,2})? optionally matches contractions like 's, 't, 'm, 'll, 're, 've
     # - [\u4e00-\u9fff] matches CJK characters
@@ -90,7 +87,7 @@ def tokenize_multilingual_text(text: str, keep_spaces: bool = True, attach_punct
     # - \u00C0-\u00FF: Latin-1 Supplement (À-ÿ)
     # - \u0100-\u017F: Latin Extended-A
     # - \u0180-\u024F: Latin Extended-B
-    pattern = re.compile(r"([a-zA-Z0-9\u00C0-\u024F]+(?:'[a-zA-Z]{1,2})?|[\u4e00-\u9fff]|.)")
+    pattern = re.compile(r"(\[[A-Z_]+\]|[a-zA-Z0-9\u00C0-\u024F]+(?:'[a-zA-Z]{1,2})?|[\u4e00-\u9fff]|.)")
     # filter(None, ...) removes any empty strings from re.findall results
     tokens = list(filter(None, pattern.findall(text)))
@@ -173,13 +170,16 @@ class LatticeTokenizer:
         tokenizer.dictionaries = defaultdict(list, data["dictionaries"])
         tokenizer.oov_word = data["oov_word"]
+        # Lazy load G2P model only if it exists (avoids PyTorch dependency)
         g2pp_model_path = f"{model_path}/g2pp.bin" if Path(f"{model_path}/g2pp.bin").exists() else None
-        if g2pp_model_path:
-            tokenizer.g2p_model = G2Phonemizer(g2pp_model_path, device=device)
+        g2p_model_path = f"{model_path}/g2p.bin" if Path(f"{model_path}/g2p.bin").exists() else None
+        g2p_path = g2pp_model_path or g2p_model_path
+        if g2p_path:
+            from .phonemizer import G2Phonemizer
+            tokenizer.g2p_model = G2Phonemizer(g2p_path, device=device)
         else:
-            g2p_model_path = f"{model_path}/g2p.bin" if Path(f"{model_path}/g2p.bin").exists() else None
-            if g2p_model_path:
-                tokenizer.g2p_model = G2Phonemizer(g2p_model_path, device=device)
+            tokenizer.g2p_model = None
         tokenizer.device = device
         tokenizer.add_special_tokens()
@@ -245,19 +245,55 @@ class LatticeTokenizer:
         self.init_sentence_splitter()
         return self.sentence_splitter.split_sentences(supervisions, strip_whitespace=strip_whitespace)
-    def tokenize(self, supervisions: List[Supervision], split_sentence: bool = False) -> Tuple[str, Dict[str, Any]]:
-        if split_sentence:
-            supervisions = self.split_sentences(supervisions)
-        pronunciation_dictionaries = self.prenormalize([s.text for s in supervisions])
-        response = self.client_wrapper.post(
-            "tokenize",
-            json={
-                "model_name": self.model_name,
-                "supervisions": [s.to_dict() for s in supervisions],
-                "pronunciation_dictionaries": pronunciation_dictionaries,
-            },
-        )
+    def _get_client_info(self) -> Dict[str, Optional[str]]:
+        """Get client identification info for usage tracking."""
+        try:
+            from importlib.metadata import version
+            return {"client_name": "python-sdk", "client_version": version("lattifai")}
+        except Exception:
+            return {"client_name": "python-sdk", "client_version": "unknown"}
+    def tokenize(
+        self,
+        supervisions: Union[List[Supervision], TextAlignResult],
+        split_sentence: bool = False,
+        boost: float = 0.0,
+        transition_penalty: Optional[float] = 0.0,
+    ) -> Tuple[str, Dict[str, Any]]:
+        client_info = self._get_client_info()
+        if isinstance(supervisions[0], Supervision):
+            if split_sentence:
+                supervisions = self.split_sentences(supervisions)
+            pronunciation_dictionaries = self.prenormalize([s.text for s in supervisions])
+            response = self.client_wrapper.post(
+                "tokenize",
+                json={
+                    "model_name": self.model_name,
+                    "supervisions": [s.to_dict() for s in supervisions],
+                    "pronunciation_dictionaries": pronunciation_dictionaries,
+                    **client_info,
+                    "transition_penalty": transition_penalty,
+                },
+            )
+        else:
+            pronunciation_dictionaries = self.prenormalize([s.text for s in supervisions[0]])
+            pronunciation_dictionaries.update(self.prenormalize([s.text for s in supervisions[1]]))
+            response = self.client_wrapper.post(
+                "difftokenize",
+                json={
+                    "model_name": self.model_name,
+                    "supervisions": [s.to_dict() for s in supervisions[0]],
+                    "transcription": [s.to_dict() for s in supervisions[1]],
+                    "pronunciation_dictionaries": pronunciation_dictionaries,
+                    "boost": boost,
+                    **client_info,
+                },
+            )
         if response.status_code == 402:
             raise QuotaExceededError(response.json().get("detail", "Quota exceeded"))
         if response.status_code != 200:
@@ -274,28 +310,51 @@ class LatticeTokenizer:
         self,
         lattice_id: str,
         lattice_results: Tuple[np.ndarray, Any, Any, float, float],
-        supervisions: List[Supervision],
+        supervisions: Union[List[Supervision], TextAlignResult],
         return_details: bool = False,
         start_margin: float = 0.08,
         end_margin: float = 0.20,
+        check_sanity: bool = True,
     ) -> List[Supervision]:
-        emission, results, labels, frame_shift, offset, channel = lattice_results  # noqa: F841
-        response = self.client_wrapper.post(
-            "detokenize",
-            json={
-                "model_name": self.model_name,
-                "lattice_id": lattice_id,
-                "frame_shift": frame_shift,
-                "results": [t.to_dict() for t in results[0]],
-                "labels": labels[0],
-                "offset": offset,
-                "channel": channel,
-                "return_details": False if return_details is None else return_details,
-                "destroy_lattice": True,
-                "start_margin": start_margin,
-                "end_margin": end_margin,
-            },
-        )
+        emission_stats, results, labels, frame_shift, offset, channel = lattice_results  # noqa: F841
+        # emission_stats is a dict with 'max_probs' and 'aligned_probs' (unified for batch and streaming)
+        if isinstance(supervisions[0], Supervision):
+            response = self.client_wrapper.post(
+                "detokenize",
+                json={
+                    "model_name": self.model_name,
+                    "lattice_id": lattice_id,
+                    "frame_shift": frame_shift,
+                    "results": [t.to_dict() for t in results[0]],
+                    "labels": labels[0],
+                    "offset": offset,
+                    "channel": channel,
+                    "return_details": False if return_details is None else return_details,
+                    "destroy_lattice": True,
+                    "start_margin": start_margin,
+                    "end_margin": end_margin,
+                    "check_sanity": check_sanity,
+                },
+            )
+        else:
+            response = self.client_wrapper.post(
+                "diffdetokenize",
+                json={
+                    "model_name": self.model_name,
+                    "lattice_id": lattice_id,
+                    "frame_shift": frame_shift,
+                    "results": [t.to_dict() for t in results[0]],
+                    "labels": labels[0],
+                    "offset": offset,
+                    "channel": channel,
+                    "return_details": False if return_details is None else return_details,
+                    "destroy_lattice": True,
+                    "start_margin": start_margin,
+                    "end_margin": end_margin,
+                    "check_sanity": check_sanity,
+                },
+            )
         if response.status_code == 400:
             raise LatticeDecodingError(
                 lattice_id,
@@ -312,19 +371,21 @@ class LatticeTokenizer:
         alignments = [Supervision.from_dict(s) for s in result["supervisions"]]
-        if emission is not None and return_details:
-            # Add emission confidence scores for segments and word-level alignments
-            _add_confidence_scores(alignments, emission, labels[0], frame_shift, offset)
+        # Add emission confidence scores for segments and word-level alignments
+        _add_confidence_scores(alignments, emission_stats, frame_shift, offset)
-        alignments = _update_alignments_speaker(supervisions, alignments)
+        if isinstance(supervisions[0], Supervision):
+            alignments = _update_alignments_speaker(supervisions, alignments)
+        else:
+            # NOTE: Text Diff Alignment >> speaker has been handled in the backend service
+            pass
         return alignments
 def _add_confidence_scores(
     supervisions: List[Supervision],
-    emission: np.ndarray,
-    labels: List[int],
+    emission_stats: Dict[str, np.ndarray],
     frame_shift: float,
     offset: float = 0.0,
 ) -> None:
@@ -337,29 +398,37 @@ def _add_confidence_scores(
     Args:
         supervisions: List of Supervision objects to add scores to (modified in-place)
-        emission: Emission tensor with shape [batch, time, vocab_size]
-        labels: Token labels corresponding to aligned tokens
+        emission_stats: Dict with 'max_probs' and 'aligned_probs' arrays
         frame_shift: Frame shift in seconds for converting frames to time
+        offset: Time offset in seconds
     """
-    tokens = np.array(labels, dtype=np.int64)
+    max_probs = emission_stats["max_probs"]
+    aligned_probs = emission_stats["aligned_probs"]
+    diffprobs_full = max_probs - aligned_probs
     for supervision in supervisions:
         start_frame = int((supervision.start - offset) / frame_shift)
         end_frame = int((supervision.end - offset) / frame_shift)
-        # Compute segment-level confidence
-        probabilities = np.exp(emission[0, start_frame:end_frame])
-        aligned = probabilities[range(0, end_frame - start_frame), tokens[start_frame:end_frame]]
-        diffprobs = np.max(probabilities, axis=-1) - aligned
-        supervision.score = round(1.0 - diffprobs.mean(), ndigits=4)
+        # Clamp to valid range
+        start_frame = max(0, min(start_frame, len(diffprobs_full) - 1))
+        end_frame = max(start_frame + 1, min(end_frame, len(diffprobs_full)))
+        diffprobs = diffprobs_full[start_frame:end_frame]
+        if len(diffprobs) > 0:
+            supervision.score = round(1.0 - diffprobs.mean().item(), ndigits=4)
-        # Compute word-level confidence if alignment exists
+        # Word-level confidence
         if hasattr(supervision, "alignment") and supervision.alignment:
             words = supervision.alignment.get("word", [])
             for w, item in enumerate(words):
-                start = int((item.start - offset) / frame_shift) - start_frame
-                end = int((item.end - offset) / frame_shift) - start_frame
-                words[w] = item._replace(score=round(1.0 - diffprobs[start:end].mean(), ndigits=4))
+                start = int((item.start - offset) / frame_shift)
+                end = int((item.end - offset) / frame_shift)
+                start = max(0, min(start, len(diffprobs_full) - 1))
+                end = max(start + 1, min(end, len(diffprobs_full)))
+                word_diffprobs = diffprobs_full[start:end]
+                if len(word_diffprobs) > 0:
+                    words[w] = item._replace(score=round(1.0 - word_diffprobs.mean().item(), ndigits=4))
 def _update_alignments_speaker(supervisions: List[Supervision], alignments: List[Supervision]) -> List[Supervision]:

lattifai 1.2.1__py3-none-any.whl → 1.3.0__py3-none-any.whl

lattifai 1.2.1__py3-none-any.whl → 1.3.0__py3-none-any.whl