PyPI - lattifai - Versions diffs - 0.4.5__py3-none-any.whl → 1.0.0__py3-none-any.whl - Mend

lattifai 0.4.5__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (76) hide show

lattifai/__init__.py +61 -47
lattifai/alignment/__init__.py +6 -0
lattifai/alignment/lattice1_aligner.py +119 -0
lattifai/alignment/lattice1_worker.py +185 -0
lattifai/{tokenizer → alignment}/phonemizer.py +4 -4
lattifai/alignment/segmenter.py +166 -0
lattifai/{tokenizer → alignment}/tokenizer.py +244 -169
lattifai/audio2.py +211 -0
lattifai/caption/__init__.py +20 -0
lattifai/caption/caption.py +1275 -0
lattifai/{io → caption}/gemini_reader.py +30 -30
lattifai/{io → caption}/gemini_writer.py +17 -17
lattifai/{io → caption}/supervision.py +4 -3
lattifai/caption/text_parser.py +145 -0
lattifai/cli/__init__.py +17 -0
lattifai/cli/alignment.py +153 -0
lattifai/cli/caption.py +204 -0
lattifai/cli/server.py +19 -0
lattifai/cli/transcribe.py +197 -0
lattifai/cli/youtube.py +128 -0
lattifai/client.py +460 -251
lattifai/config/__init__.py +20 -0
lattifai/config/alignment.py +73 -0
lattifai/config/caption.py +178 -0
lattifai/config/client.py +46 -0
lattifai/config/diarization.py +67 -0
lattifai/config/media.py +335 -0
lattifai/config/transcription.py +84 -0
lattifai/diarization/__init__.py +5 -0
lattifai/diarization/lattifai.py +89 -0
lattifai/errors.py +98 -91
lattifai/logging.py +116 -0
lattifai/mixin.py +552 -0
lattifai/server/app.py +420 -0
lattifai/transcription/__init__.py +76 -0
lattifai/transcription/base.py +108 -0
lattifai/transcription/gemini.py +219 -0
lattifai/transcription/lattifai.py +103 -0
lattifai/{workflows → transcription}/prompts/__init__.py +4 -4
lattifai/types.py +30 -0
lattifai/utils.py +16 -44
lattifai/workflow/__init__.py +22 -0
lattifai/workflow/agents.py +6 -0
lattifai/{workflows → workflow}/base.py +22 -22
lattifai/{workflows → workflow}/file_manager.py +239 -215
lattifai/workflow/youtube.py +564 -0
lattifai-1.0.0.dist-info/METADATA +736 -0
lattifai-1.0.0.dist-info/RECORD +52 -0
{lattifai-0.4.5.dist-info → lattifai-1.0.0.dist-info}/WHEEL +1 -1
lattifai-1.0.0.dist-info/entry_points.txt +13 -0
{lattifai-0.4.5.dist-info → lattifai-1.0.0.dist-info}/licenses/LICENSE +1 -1
lattifai/base_client.py +0 -126
lattifai/bin/__init__.py +0 -3
lattifai/bin/agent.py +0 -325
lattifai/bin/align.py +0 -296
lattifai/bin/cli_base.py +0 -25
lattifai/bin/subtitle.py +0 -210
lattifai/io/__init__.py +0 -42
lattifai/io/reader.py +0 -85
lattifai/io/text_parser.py +0 -75
lattifai/io/utils.py +0 -15
lattifai/io/writer.py +0 -90
lattifai/tokenizer/__init__.py +0 -3
lattifai/workers/__init__.py +0 -3
lattifai/workers/lattice1_alpha.py +0 -284
lattifai/workflows/__init__.py +0 -34
lattifai/workflows/agents.py +0 -10
lattifai/workflows/gemini.py +0 -167
lattifai/workflows/prompts/README.md +0 -22
lattifai/workflows/prompts/gemini/README.md +0 -24
lattifai/workflows/prompts/gemini/transcription_gem.txt +0 -81
lattifai/workflows/youtube.py +0 -931
lattifai-0.4.5.dist-info/METADATA +0 -808
lattifai-0.4.5.dist-info/RECORD +0 -39
lattifai-0.4.5.dist-info/entry_points.txt +0 -3
{lattifai-0.4.5.dist-info → lattifai-1.0.0.dist-info}/top_level.txt +0 -0

lattifai/{tokenizer → alignment}/tokenizer.py RENAMED Viewed

@@ -1,27 +1,118 @@
 import gzip
-import inspect
 import pickle
 import re
 from collections import defaultdict
-from typing import Any, Dict, List, Optional, Tuple, Type, TypeVar, Union
+from typing import Any, Dict, List, Optional, Tuple, Type, TypeVar
 import torch
-from lattifai.errors import LATTICE_DECODING_FAILURE_HELP, LatticeDecodingError
-from lattifai.io import Supervision
-from lattifai.tokenizer.phonemizer import G2Phonemizer
+from lattifai.alignment.phonemizer import G2Phonemizer
+from lattifai.caption import Supervision
+from lattifai.caption import normalize_text as normalize_html_text
+from lattifai.errors import (
+    LATTICE_DECODING_FAILURE_HELP,
+    LatticeDecodingError,
+    ModelLoadError,
+    QuotaExceededError,
+)
 PUNCTUATION = '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~'
 END_PUNCTUATION = '.!?"]。！？”】'
-PUNCTUATION_SPACE = PUNCTUATION + ' '
-STAR_TOKEN = '※'
+PUNCTUATION_SPACE = PUNCTUATION + " "
+STAR_TOKEN = "※"
-GROUPING_SEPARATOR = '✹'
+GROUPING_SEPARATOR = "✹"
 MAXIMUM_WORD_LENGTH = 40
-TokenizerT = TypeVar('TokenizerT', bound='LatticeTokenizer')
+TokenizerT = TypeVar("TokenizerT", bound="LatticeTokenizer")
+def _is_punctuation(char: str) -> bool:
+    """Check if a character is punctuation (not space, not alphanumeric, not CJK)."""
+    if len(char) != 1:
+        return False
+    if char.isspace():
+        return False
+    if char.isalnum():
+        return False
+    # Check if it's a CJK character
+    if "\u4e00" <= char <= "\u9fff":
+        return False
+    # Check if it's an accented Latin character
+    if "\u00c0" <= char <= "\u024f":
+        return False
+    return True
+def tokenize_multilingual_text(text: str, keep_spaces: bool = True, attach_punctuation: bool = False) -> list[str]:
+    """
+    Tokenize a mixed Chinese-English string into individual units.
+    Tokenization rules:
+    - Chinese characters (CJK) are split individually
+    - Consecutive Latin letters (including accented characters) and digits are grouped as one unit
+    - English contractions ('s, 't, 'm, 'll, 're, 've) are kept with the preceding word
+    - Other characters (punctuation, spaces) are split individually by default
+    - If attach_punctuation=True, punctuation marks are attached to the preceding token
+    Args:
+        text: Input string containing mixed Chinese and English text
+        keep_spaces: If True, spaces are included in the output as separate tokens.
+                     If False, spaces are excluded from the output. Default is True.
+        attach_punctuation: If True, punctuation marks are attached to the preceding token.
+                            For example, "Hello, World!" becomes ["Hello,", " ", "World!"].
+                            Default is False.
+    Returns:
+        List of tokenized units
+    Examples:
+        >>> tokenize_multilingual_text("Hello世界")
+        ['Hello', '世', '界']
+        >>> tokenize_multilingual_text("I'm fine")
+        ["I'm", ' ', 'fine']
+        >>> tokenize_multilingual_text("I'm fine", keep_spaces=False)
+        ["I'm", 'fine']
+        >>> tokenize_multilingual_text("Kühlschrank")
+        ['Kühlschrank']
+        >>> tokenize_multilingual_text("Hello, World!", attach_punctuation=True)
+        ['Hello,', ' ', 'World!']
+    """
+    # Regex pattern:
+    # - [a-zA-Z0-9\u00C0-\u024F]+ matches Latin letters (including accented chars like ü, ö, ä, ß, é, etc.)
+    # - (?:'[a-zA-Z]{1,2})? optionally matches contractions like 's, 't, 'm, 'll, 're, 've
+    # - [\u4e00-\u9fff] matches CJK characters
+    # - . matches any other single character
+    # Unicode ranges:
+    # - \u00C0-\u00FF: Latin-1 Supplement (À-ÿ)
+    # - \u0100-\u017F: Latin Extended-A
+    # - \u0180-\u024F: Latin Extended-B
+    pattern = re.compile(r"([a-zA-Z0-9\u00C0-\u024F]+(?:'[a-zA-Z]{1,2})?|[\u4e00-\u9fff]|.)")
+    # filter(None, ...) removes any empty strings from re.findall results
+    tokens = list(filter(None, pattern.findall(text)))
+    if attach_punctuation and len(tokens) > 1:
+        # Attach punctuation to the preceding token
+        # Punctuation characters (excluding spaces) are merged with the previous token
+        merged_tokens = []
+        i = 0
+        while i < len(tokens):
+            token = tokens[i]
+            # Look ahead to collect consecutive punctuation (non-space, non-alphanumeric, non-CJK)
+            if merged_tokens and _is_punctuation(token):
+                merged_tokens[-1] = merged_tokens[-1] + token
+            else:
+                merged_tokens.append(token)
+            i += 1
+        tokens = merged_tokens
+    if not keep_spaces:
+        tokens = [t for t in tokens if not t.isspace()]
+    return tokens
 class LatticeTokenizer:
@@ -29,12 +120,13 @@ class LatticeTokenizer:
     def __init__(self, client_wrapper: Any):
         self.client_wrapper = client_wrapper
+        self.model_name = ""
         self.words: List[str] = []
         self.g2p_model: Any = None  # Placeholder for G2P model
         self.dictionaries = defaultdict(lambda: [])
-        self.oov_word = '<unk>'
+        self.oov_word = "<unk>"
         self.sentence_splitter = None
-        self.device = 'cpu'
+        self.device = "cpu"
     def init_sentence_splitter(self):
         if self.sentence_splitter is not None:
@@ -45,14 +137,14 @@ class LatticeTokenizer:
         providers = []
         device = self.device
-        if device.startswith('cuda') and ort.get_all_providers().count('CUDAExecutionProvider') > 0:
-            providers.append('CUDAExecutionProvider')
-        elif device.startswith('mps') and ort.get_all_providers().count('MPSExecutionProvider') > 0:
-            providers.append('MPSExecutionProvider')
+        if device.startswith("cuda") and ort.get_all_providers().count("CUDAExecutionProvider") > 0:
+            providers.append("CUDAExecutionProvider")
+        elif device.startswith("mps") and ort.get_all_providers().count("MPSExecutionProvider") > 0:
+            providers.append("MPSExecutionProvider")
         sat = SaT(
-            'sat-3l-sm',
-            ort_providers=providers + ['CPUExecutionProvider'],
+            "sat-3l-sm",
+            ort_providers=providers + ["CPUExecutionProvider"],
         )
         self.sentence_splitter = sat
@@ -79,23 +171,23 @@ class LatticeTokenizer:
         # or other forms like [SOMETHING] SPEAKER:
         # Pattern 1: [mark] HTML-encoded separator speaker:
-        pattern1 = r'^(\[[^\]]+\])\s+(&gt;&gt;|>>)\s+(.+)$'
+        pattern1 = r"^(\[[^\]]+\])\s+(&gt;&gt;|>>)\s+(.+)$"
         match1 = re.match(pattern1, sentence.strip())
         if match1:
             special_mark = match1.group(1)
             separator = match1.group(2)
             speaker_part = match1.group(3)
-            return [special_mark, f'{separator} {speaker_part}']
+            return [special_mark, f"{separator} {speaker_part}"]
         # Pattern 2: [mark] speaker:
-        pattern2 = r'^(\[[^\]]+\])\s+([^:]+:)(.*)$'
+        pattern2 = r"^(\[[^\]]+\])\s+([^:]+:)(.*)$"
         match2 = re.match(pattern2, sentence.strip())
         if match2:
             special_mark = match2.group(1)
             speaker_label = match2.group(2)
             remaining = match2.group(3).strip()
             if remaining:
-                return [special_mark, f'{speaker_label} {remaining}']
+                return [special_mark, f"{speaker_label} {remaining}"]
             else:
                 return [special_mark, speaker_label]
@@ -107,28 +199,45 @@ class LatticeTokenizer:
         cls: Type[TokenizerT],
         client_wrapper: Any,
         model_path: str,
-        device: str = 'cpu',
+        model_name: str,
+        device: str = "cpu",
         compressed: bool = True,
     ) -> TokenizerT:
         """Load tokenizer from exported binary file"""
         from pathlib import Path
-        words_model_path = f'{model_path}/words.bin'
-        if compressed:
-            with gzip.open(words_model_path, 'rb') as f:
-                data = pickle.load(f)
-        else:
-            with open(words_model_path, 'rb') as f:
-                data = pickle.load(f)
+        words_model_path = f"{model_path}/words.bin"
+        try:
+            if compressed:
+                with gzip.open(words_model_path, "rb") as f:
+                    data = pickle.load(f)
+            else:
+                with open(words_model_path, "rb") as f:
+                    data = pickle.load(f)
+        except pickle.UnpicklingError as e:
+            del e
+            import msgpack
+            if compressed:
+                with gzip.open(words_model_path, "rb") as f:
+                    data = msgpack.unpack(f, raw=False, strict_map_key=False)
+            else:
+                with open(words_model_path, "rb") as f:
+                    data = msgpack.unpack(f, raw=False, strict_map_key=False)
         tokenizer = cls(client_wrapper=client_wrapper)
-        tokenizer.words = data['words']
-        tokenizer.dictionaries = defaultdict(list, data['dictionaries'])
-        tokenizer.oov_word = data['oov_word']
-        g2p_model_path = f'{model_path}/g2p.bin' if Path(f'{model_path}/g2p.bin').exists() else None
-        if g2p_model_path:
-            tokenizer.g2p_model = G2Phonemizer(g2p_model_path, device=device)
+        tokenizer.model_name = model_name
+        tokenizer.words = data["words"]
+        tokenizer.dictionaries = defaultdict(list, data["dictionaries"])
+        tokenizer.oov_word = data["oov_word"]
+        g2pp_model_path = f"{model_path}/g2pp.bin" if Path(f"{model_path}/g2pp.bin").exists() else None
+        if g2pp_model_path:
+            tokenizer.g2p_model = G2Phonemizer(g2pp_model_path, device=device)
+        else:
+            g2p_model_path = f"{model_path}/g2p.bin" if Path(f"{model_path}/g2p.bin").exists() else None
+            if g2p_model_path:
+                tokenizer.g2p_model = G2Phonemizer(g2p_model_path, device=device)
         tokenizer.device = device
         tokenizer.add_special_tokens()
@@ -136,18 +245,22 @@ class LatticeTokenizer:
     def add_special_tokens(self):
         tokenizer = self
-        for special_token in ['&gt;&gt;', '&gt;']:
+        for special_token in ["&gt;&gt;", "&gt;"]:
             if special_token not in tokenizer.dictionaries:
                 tokenizer.dictionaries[special_token] = tokenizer.dictionaries[tokenizer.oov_word]
         return self
     def prenormalize(self, texts: List[str], language: Optional[str] = None) -> List[str]:
         if not self.g2p_model:
-            raise ValueError('G2P model is not loaded, cannot prenormalize texts')
+            raise ValueError("G2P model is not loaded, cannot prenormalize texts")
         oov_words = []
         for text in texts:
-            words = text.lower().replace('-', ' ').replace('—', ' ').replace('–', ' ').split()
+            text = normalize_html_text(text)
+            # support english, chinese and german tokenization
+            words = tokenize_multilingual_text(
+                text.lower().replace("-", " ").replace("—", " ").replace("–", " "), keep_spaces=False
+            )
             oovs = [w.strip(PUNCTUATION) for w in words if w not in self.words]
             if oovs:
                 oov_words.extend([w for w in oovs if (w not in self.words and len(w) <= MAXIMUM_WORD_LENGTH)])
@@ -156,7 +269,7 @@ class LatticeTokenizer:
         if oov_words:
             indexs = []
             for k, _word in enumerate(oov_words):
-                if any(_word.startswith(p) and _word.endswith(q) for (p, q) in [('(', ')'), ('[', ']')]):
+                if any(_word.startswith(p) and _word.endswith(q) for (p, q) in [("(", ")"), ("[", "]")]):
                     self.dictionaries[_word] = self.dictionaries[self.oov_word]
                 else:
                     _word = _word.strip(PUNCTUATION_SPACE)
@@ -187,38 +300,49 @@ class LatticeTokenizer:
         Carefull about speaker changes.
         """
-        texts, text_len, sidx = [], 0, 0
-        speakers = []
+        texts, speakers = [], []
+        text_len, sidx = 0, 0
+        def flush_segment(end_idx: int, speaker: Optional[str] = None):
+            """Flush accumulated text from sidx to end_idx with given speaker."""
+            nonlocal text_len, sidx
+            if sidx <= end_idx:
+                if len(speakers) < len(texts) + 1:
+                    speakers.append(speaker)
+                text = " ".join(sup.text for sup in supervisions[sidx : end_idx + 1])
+                texts.append(text)
+                sidx = end_idx + 1
+                text_len = 0
         for s, supervision in enumerate(supervisions):
             text_len += len(supervision.text)
+            is_last = s == len(supervisions) - 1
             if supervision.speaker:
+                # Flush previous segment without speaker (if any)
                 if sidx < s:
-                    if len(speakers) < len(texts) + 1:
-                        speakers.append(None)
-                    text = ' '.join([sup.text for sup in supervisions[sidx:s]])
-                    texts.append(text)
-                    sidx = s
+                    flush_segment(s - 1, None)
                     text_len = len(supervision.text)
-                speakers.append(supervision.speaker)
-            else:
-                if text_len >= 2000 or s == len(supervisions) - 1:
-                    if len(speakers) < len(texts) + 1:
-                        speakers.append(None)
-                    text = ' '.join([sup.text for sup in supervisions[sidx : s + 1]])
-                    texts.append(text)
-                    sidx = s + 1
-                    text_len = 0
-        assert len(speakers) == len(texts), f'len(speakers)={len(speakers)} != len(texts)={len(texts)}'
+                # Check if we should flush this speaker's segment now
+                next_has_speaker = not is_last and supervisions[s + 1].speaker
+                if is_last or next_has_speaker:
+                    flush_segment(s, supervision.speaker)
+                else:
+                    speakers.append(supervision.speaker)
+            elif text_len >= 2000 or is_last:
+                flush_segment(s, None)
+        assert len(speakers) == len(texts), f"len(speakers)={len(speakers)} != len(texts)={len(texts)}"
         sentences = self.sentence_splitter.split(texts, threshold=0.15, strip_whitespace=strip_whitespace)
-        supervisions, remainder = [], ''
+        supervisions, remainder = [], ""
         for k, (_speaker, _sentences) in enumerate(zip(speakers, sentences)):
             # Prepend remainder from previous iteration to the first sentence
             if _sentences and remainder:
                 _sentences[0] = remainder + _sentences[0]
-                remainder = ''
+                remainder = ""
             if not _sentences:
                 continue
@@ -228,14 +352,14 @@ class LatticeTokenizer:
             for s, _sentence in enumerate(_sentences):
                 if remainder:
                     _sentence = remainder + _sentence
-                    remainder = ''
+                    remainder = ""
                 # Detect and split special sentence types: e.g., '[APPLAUSE] &gt;&gt; MIRA MURATI:' -> ['[APPLAUSE]', '&gt;&gt; MIRA MURATI:']  # noqa: E501
                 resplit_parts = self._resplit_special_sentence_types(_sentence)
-                if any(resplit_parts[-1].endswith(sp) for sp in [':', '：']):
+                if any(resplit_parts[-1].endswith(sp) for sp in [":", "："]):
                     if s < len(_sentences) - 1:
-                        _sentences[s + 1] = resplit_parts[-1] + ' ' + _sentences[s + 1]
+                        _sentences[s + 1] = resplit_parts[-1] + " " + _sentences[s + 1]
                     else:  # last part
-                        remainder = resplit_parts[-1] + ' '
+                        remainder = resplit_parts[-1] + " "
                     processed_sentences.extend(resplit_parts[:-1])
                 else:
                     processed_sentences.extend(resplit_parts)
@@ -243,7 +367,7 @@ class LatticeTokenizer:
             if not _sentences:
                 if remainder:
-                    _sentences, remainder = [remainder.strip()], ''
+                    _sentences, remainder = [remainder.strip()], ""
                 else:
                     continue
@@ -257,12 +381,12 @@ class LatticeTokenizer:
                     Supervision(text=text, speaker=(_speaker if s == 0 else None))
                     for s, text in enumerate(_sentences[:-1])
                 )
-                remainder = _sentences[-1] + ' ' + remainder
+                remainder = _sentences[-1] + " " + remainder
                 if k < len(speakers) - 1 and speakers[k + 1] is not None:  # next speaker is set
                     supervisions.append(
                         Supervision(text=remainder.strip(), speaker=_speaker if len(_sentences) == 1 else None)
                     )
-                    remainder = ''
+                    remainder = ""
                 elif len(_sentences) == 1:
                     if k == len(speakers) - 1:
                         pass  # keep _speaker for the last supervision
@@ -285,20 +409,23 @@ class LatticeTokenizer:
         pronunciation_dictionaries = self.prenormalize([s.text for s in supervisions])
         response = self.client_wrapper.post(
-            'tokenize',
+            "tokenize",
             json={
-                'supervisions': [s.to_dict() for s in supervisions],
-                'pronunciation_dictionaries': pronunciation_dictionaries,
+                "model_name": self.model_name,
+                "supervisions": [s.to_dict() for s in supervisions],
+                "pronunciation_dictionaries": pronunciation_dictionaries,
             },
         )
+        if response.status_code == 402:
+            raise QuotaExceededError(response.json().get("detail", "Quota exceeded"))
         if response.status_code != 200:
-            raise Exception(f'Failed to tokenize texts: {response.text}')
+            raise Exception(f"Failed to tokenize texts: {response.text}")
         result = response.json()
-        lattice_id = result['id']
+        lattice_id = result["id"]
         return (
             supervisions,
             lattice_id,
-            (result['lattice_graph'], result['final_state'], result.get('acoustic_scale', 1.0)),
+            (result["lattice_graph"], result["final_state"], result.get("acoustic_scale", 1.0)),
         )
     def detokenize(
@@ -310,16 +437,17 @@ class LatticeTokenizer:
     ) -> List[Supervision]:
         emission, results, labels, frame_shift, offset, channel = lattice_results  # noqa: F841
         response = self.client_wrapper.post(
-            'detokenize',
+            "detokenize",
             json={
-                'lattice_id': lattice_id,
-                'frame_shift': frame_shift,
-                'results': [t.to_dict() for t in results[0]],
-                'labels': labels[0],
-                'offset': offset,
-                'channel': channel,
-                'return_details': return_details,
-                'destroy_lattice': True,
+                "model_name": self.model_name,
+                "lattice_id": lattice_id,
+                "frame_shift": frame_shift,
+                "results": [t.to_dict() for t in results[0]],
+                "labels": labels[0],
+                "offset": offset,
+                "channel": channel,
+                "return_details": False if return_details is None else return_details,
+                "destroy_lattice": True,
             },
         )
         if response.status_code == 422:
@@ -327,94 +455,20 @@ class LatticeTokenizer:
                 lattice_id,
                 original_error=Exception(LATTICE_DECODING_FAILURE_HELP),
             )
+        if response.status_code == 402:
+            raise QuotaExceededError(response.json().get("detail", "Quota exceeded"))
         if response.status_code != 200:
-            raise Exception(f'Failed to detokenize lattice: {response.text}')
+            raise Exception(f"Failed to detokenize lattice: {response.text}")
         result = response.json()
-        if not result.get('success'):
-            raise Exception('Failed to detokenize the alignment results.')
+        if not result.get("success"):
+            raise Exception("Failed to detokenize the alignment results.")
-        alignments = [Supervision.from_dict(s) for s in result['supervisions']]
+        alignments = [Supervision.from_dict(s) for s in result["supervisions"]]
         if return_details:
             # Add emission confidence scores for segments and word-level alignments
-            _add_confidence_scores(alignments, emission, labels[0], frame_shift)
-        alignments = _update_alignments_speaker(supervisions, alignments)
-        return alignments
-class AsyncLatticeTokenizer(LatticeTokenizer):
-    async def _post_async(self, endpoint: str, **kwargs):
-        response = self.client_wrapper.post(endpoint, **kwargs)
-        if inspect.isawaitable(response):
-            return await response
-        return response
-    async def tokenize(
-        self, supervisions: List[Supervision], split_sentence: bool = False
-    ) -> Tuple[str, Dict[str, Any]]:
-        if split_sentence:
-            self.init_sentence_splitter()
-            supervisions = self.split_sentences(supervisions)
-        pronunciation_dictionaries = self.prenormalize([s.text for s in supervisions])
-        response = await self._post_async(
-            'tokenize',
-            json={
-                'supervisions': [s.to_dict() for s in supervisions],
-                'pronunciation_dictionaries': pronunciation_dictionaries,
-            },
-        )
-        if response.status_code != 200:
-            raise Exception(f'Failed to tokenize texts: {response.text}')
-        result = response.json()
-        lattice_id = result['id']
-        return (
-            supervisions,
-            lattice_id,
-            (result['lattice_graph'], result['final_state'], result.get('acoustic_scale', 1.0)),
-        )
-    async def detokenize(
-        self,
-        lattice_id: str,
-        lattice_results: Tuple[torch.Tensor, Any, Any, float, float],
-        supervisions: List[Supervision],
-        return_details: bool = False,
-    ) -> List[Supervision]:
-        emission, results, labels, frame_shift, offset, channel = lattice_results  # noqa: F841
-        response = await self._post_async(
-            'detokenize',
-            json={
-                'lattice_id': lattice_id,
-                'frame_shift': frame_shift,
-                'results': [t.to_dict() for t in results[0]],
-                'labels': labels[0],
-                'offset': offset,
-                'channel': channel,
-                'return_details': return_details,
-                'destroy_lattice': True,
-            },
-        )
-        if response.status_code == 422:
-            raise LatticeDecodingError(
-                lattice_id,
-                original_error=Exception(LATTICE_DECODING_FAILURE_HELP),
-            )
-        if response.status_code != 200:
-            raise Exception(f'Failed to detokenize lattice: {response.text}')
-        result = response.json()
-        if not result.get('success'):
-            return Exception('Failed to detokenize the alignment results.')
-        alignments = [Supervision.from_dict(s) for s in result['supervisions']]
-        if return_details:
-            # Add emission confidence scores for segments and word-level alignments
-            _add_confidence_scores(alignments, emission, labels[0], frame_shift)
+            _add_confidence_scores(alignments, emission, labels[0], frame_shift, offset)
         alignments = _update_alignments_speaker(supervisions, alignments)
@@ -426,6 +480,7 @@ def _add_confidence_scores(
     emission: torch.Tensor,
     labels: List[int],
     frame_shift: float,
+    offset: float = 0.0,
 ) -> None:
     """
     Add confidence scores to supervisions and their word-level alignments.
@@ -443,8 +498,8 @@ def _add_confidence_scores(
     tokens = torch.tensor(labels, dtype=torch.int64, device=emission.device)
     for supervision in supervisions:
-        start_frame = int(supervision.start / frame_shift)
-        end_frame = int(supervision.end / frame_shift)
+        start_frame = int((supervision.start - offset) / frame_shift)
+        end_frame = int((supervision.end - offset) / frame_shift)
         # Compute segment-level confidence
         probabilities = emission[0, start_frame:end_frame].softmax(dim=-1)
@@ -453,11 +508,11 @@ def _add_confidence_scores(
         supervision.score = round(1.0 - diffprobs.mean().item(), ndigits=4)
         # Compute word-level confidence if alignment exists
-        if hasattr(supervision, 'alignment') and supervision.alignment:
-            words = supervision.alignment.get('word', [])
+        if hasattr(supervision, "alignment") and supervision.alignment:
+            words = supervision.alignment.get("word", [])
             for w, item in enumerate(words):
-                start = int(item.start / frame_shift) - start_frame
-                end = int(item.end / frame_shift) - start_frame
+                start = int((item.start - offset) / frame_shift) - start_frame
+                end = int((item.end - offset) / frame_shift) - start_frame
                 words[w] = item._replace(score=round(1.0 - diffprobs[start:end].mean().item(), ndigits=4))
@@ -472,3 +527,23 @@ def _update_alignments_speaker(supervisions: List[Supervision], alignments: List
     for supervision, alignment in zip(supervisions, alignments):
         alignment.speaker = supervision.speaker
     return alignments
+def _load_tokenizer(
+    client_wrapper: Any,
+    model_path: str,
+    model_name: str,
+    device: str,
+    *,
+    tokenizer_cls: Type[LatticeTokenizer] = LatticeTokenizer,
+) -> LatticeTokenizer:
+    """Instantiate tokenizer with consistent error handling."""
+    try:
+        return tokenizer_cls.from_pretrained(
+            client_wrapper=client_wrapper,
+            model_path=model_path,
+            model_name=model_name,
+            device=device,
+        )
+    except Exception as e:
+        raise ModelLoadError(f"tokenizer from {model_path}", original_error=e)

lattifai 0.4.5__py3-none-any.whl → 1.0.0__py3-none-any.whl

lattifai 0.4.5__py3-none-any.whl → 1.0.0__py3-none-any.whl