lattifai-1.0.5-py3-none-any.whl → lattifai-1.2.0-py3-none-any.whl
This diff compares two publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.
- lattifai/__init__.py +11 -12
- lattifai/alignment/lattice1_aligner.py +39 -7
- lattifai/alignment/lattice1_worker.py +135 -147
- lattifai/alignment/tokenizer.py +38 -22
- lattifai/audio2.py +1 -1
- lattifai/caption/caption.py +55 -19
- lattifai/cli/__init__.py +2 -0
- lattifai/cli/caption.py +1 -1
- lattifai/cli/diarization.py +110 -0
- lattifai/cli/transcribe.py +3 -1
- lattifai/cli/youtube.py +11 -0
- lattifai/client.py +32 -111
- lattifai/config/alignment.py +14 -0
- lattifai/config/client.py +5 -0
- lattifai/config/transcription.py +4 -0
- lattifai/diarization/lattifai.py +18 -7
- lattifai/mixin.py +26 -5
- lattifai/transcription/__init__.py +1 -1
- lattifai/transcription/base.py +21 -2
- lattifai/transcription/gemini.py +127 -1
- lattifai/transcription/lattifai.py +30 -2
- lattifai/utils.py +62 -69
- lattifai/workflow/youtube.py +55 -57
- {lattifai-1.0.5.dist-info → lattifai-1.2.0.dist-info}/METADATA +352 -56
- {lattifai-1.0.5.dist-info → lattifai-1.2.0.dist-info}/RECORD +29 -28
- {lattifai-1.0.5.dist-info → lattifai-1.2.0.dist-info}/entry_points.txt +2 -0
- {lattifai-1.0.5.dist-info → lattifai-1.2.0.dist-info}/WHEEL +0 -0
- {lattifai-1.0.5.dist-info → lattifai-1.2.0.dist-info}/licenses/LICENSE +0 -0
- {lattifai-1.0.5.dist-info → lattifai-1.2.0.dist-info}/top_level.txt +0 -0
lattifai/__init__.py
CHANGED
@@ -52,28 +52,27 @@ except Exception:
     __version__ = "0.1.0"  # fallback version


-# Check and auto-install
-def
-"""Check if
+# Check and auto-install k2py if not present
+def _check_and_install_k2py():
+    """Check if k2py is installed and attempt to install it if not."""
     try:
-        import
+        import k2py
     except ImportError:
         import subprocess

-        print("
+        print("k2py is not installed. Attempting to install k2py...")
         try:
-            subprocess.check_call([sys.executable, "-m", "pip", "install", "
-
-            import k2  # Try importing again after installation
+            subprocess.check_call([sys.executable, "-m", "pip", "install", "k2py"])
+            import k2py  # Try importing again after installation

-            print("
+            print("k2py installed successfully.")
         except Exception as e:
-            warnings.warn(f"Failed to install
+            warnings.warn(f"Failed to install k2py automatically. Please install it manually. Error: {e}")
     return True


-# Auto-install
-
+# Auto-install k2py on first import
+_check_and_install_k2py()


 __all__ = [
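Note on the hook above: it runs every time the package is imported, so the first import in a fresh environment may invoke pip. A minimal sketch (not part of the package) of checking the dependency up front instead, assuming k2py is the PyPI name used by the added lines:

import importlib.util

def k2py_available() -> bool:
    # Sketch: verify the dependency before importing lattifai (e.g. in CI images)
    # so the import-time pip fallback added above never has to run.
    return importlib.util.find_spec("k2py") is not None

if not k2py_available():
    raise SystemExit("k2py missing; install it first (pip install k2py)")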
lattifai/alignment/lattice1_aligner.py
CHANGED
@@ -4,7 +4,6 @@ from typing import Any, List, Optional, Tuple

 import colorful
 import numpy as np
-import torch

 from lattifai.audio2 import AudioData
 from lattifai.caption import Supervision
@@ -35,31 +34,55 @@ class Lattice1Aligner(object):
             raise ValueError("AlignmentConfig.client_wrapper is not set. It must be initialized by the client.")

         client_wrapper = config.client_wrapper
-
+        # Resolve model path using configured model hub
+        model_path = _resolve_model_path(config.model_name, getattr(config, "model_hub", "huggingface"))

-        self.tokenizer = _load_tokenizer(
+        self.tokenizer = _load_tokenizer(
+            client_wrapper, model_path, config.model_name, config.device, model_hub=config.model_hub
+        )
         self.worker = _load_worker(model_path, config.device, config)

         self.frame_shift = self.worker.frame_shift

-    def emission(self, ndarray: np.ndarray) ->
+    def emission(self, ndarray: np.ndarray) -> np.ndarray:
         """Generate emission probabilities from audio ndarray.

         Args:
             ndarray: Audio data as numpy array of shape (1, T) or (C, T)

         Returns:
-            Emission
+            Emission numpy array of shape (1, T, vocab_size)
         """
         return self.worker.emission(ndarray)

+    def separate(self, audio: np.ndarray) -> np.ndarray:
+        """Separate audio using separator model.
+
+        Args:
+            audio: np.ndarray object containing the audio to separate, shape (1, T)
+
+        Returns:
+            Separated audio as numpy array
+
+        Raises:
+            RuntimeError: If separator model is not available
+        """
+        if self.worker.separator_ort is None:
+            raise RuntimeError("Separator model not available. separator.onnx not found in model path.")
+        # Run separator model
+        separator_output = self.worker.separator_ort.run(
+            None,
+            {"audios": audio},
+        )
+        return separator_output[0]
+
     def alignment(
         self,
         audio: AudioData,
         supervisions: List[Supervision],
         split_sentence: Optional[bool] = False,
         return_details: Optional[bool] = False,
-        emission: Optional[
+        emission: Optional[np.ndarray] = None,
         offset: float = 0.0,
         verbose: bool = True,
     ) -> Tuple[List[Supervision], List[Supervision]]:
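A usage sketch of the torch-free surface shown above. It assumes an already-constructed Lattice1Aligner (the constructor arguments and AlignmentConfig fields are not fully shown in this diff); the method calls mirror the signatures in the hunk.

import numpy as np

def quick_emission_check(aligner) -> None:
    # Sketch only: exercises the numpy-in / numpy-out API added above.
    audio = np.zeros((1, 16000), dtype=np.float32)  # 1 s of 16 kHz audio, shape (1, T)

    emission = aligner.emission(audio)              # np.ndarray of shape (1, T_frames, vocab_size)
    assert emission.ndim == 3

    try:
        vocals = aligner.separate(audio)            # uses separator.onnx when the model dir ships one
    except RuntimeError:
        vocals = audio                              # no separator model available for this checkpoint
    print(emission.shape, vocals.shape)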
@@ -120,7 +143,12 @@ class Lattice1Aligner(object):
         safe_print(colorful.cyan("🎯 Step 4: Decoding lattice results to aligned segments"))
         try:
             alignments = self.tokenizer.detokenize(
-                lattice_id,
+                lattice_id,
+                lattice_results,
+                supervisions=supervisions,
+                return_details=return_details,
+                start_margin=self.config.start_margin,
+                end_margin=self.config.end_margin,
             )
             if verbose:
                 safe_print(colorful.green(f" ✓ Successfully aligned {len(alignments)} segments"))
@@ -137,3 +165,7 @@ class Lattice1Aligner(object):
                 raise
         except Exception as e:
             raise e
+
+    def profile(self) -> None:
+        """Print profiling statistics."""
+        self.worker.profile()
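The new profile() hook simply dumps the worker's accumulated timings, so a typical call site looks like the sketch below (AudioData/Supervision construction omitted; names follow the signatures in this diff).

def align_and_profile(aligner, audio, supervisions):
    # Sketch: run an alignment, then print the per-phase timing table added in this release.
    aligned, details = aligner.alignment(audio, supervisions)
    aligner.profile()  # reports emission / decoding_graph / align_segments time collected in worker.timings
    return aligned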
lattifai/alignment/lattice1_worker.py
CHANGED
@@ -1,11 +1,12 @@
 import json
 import time
 from collections import defaultdict
+from pathlib import Path
 from typing import Any, Dict, Optional, Tuple

+import colorful
 import numpy as np
 import onnxruntime as ort
-import torch
 from lhotse import FbankConfig
 from lhotse.features.kaldi.layers import Wav2LogFilterBank
 from lhotse.utils import Pathlike
@@ -13,6 +14,7 @@ from tqdm import tqdm

 from lattifai.audio2 import AudioData
 from lattifai.errors import AlignmentError, DependencyError, ModelLoadError
+from lattifai.utils import safe_print


 class Lattice1Worker:
@@ -60,104 +62,84 @@ class Lattice1Worker:
         except Exception as e:
             raise ModelLoadError(f"acoustic model from {model_path}", original_error=e)

+        # Get vocab_size from model output
+        self.vocab_size = self.acoustic_ort.get_outputs()[0].shape[-1]
+
         # get input_names
         input_names = [inp.name for inp in self.acoustic_ort.get_inputs()]
-
+        assert "audios" in input_names, f"Input name audios not found in {input_names}"
+
+        # Initialize separator if available
+        separator_model_path = Path(model_path) / "separator.onnx"
+        if separator_model_path.exists():
             try:
-
-
-
-
+                self.separator_ort = ort.InferenceSession(
+                    str(separator_model_path),
+                    providers=["CUDAExecutionProvider", "CPUExecutionProvider"],
+                )
             except Exception as e:
-                raise ModelLoadError(f"
+                raise ModelLoadError(f"separator model from {model_path}", original_error=e)
         else:
-            self.
+            self.separator_ort = None

-        self.device = torch.device(device)
         self.timings = defaultdict(lambda: 0.0)

     @property
     def frame_shift(self) -> float:
         return 0.02  # 20 ms

-
-    def emission(self, ndarray: np.ndarray, acoustic_scale: float = 1.0, device: Optional[str] = None) -> torch.Tensor:
+    def emission(self, ndarray: np.ndarray, acoustic_scale: float = 1.0) -> np.ndarray:
         """Generate emission probabilities from audio ndarray.

         Args:
             ndarray: Audio data as numpy array of shape (1, T) or (C, T)

         Returns:
-            Emission
+            Emission numpy array of shape (1, T, vocab_size)
         """
         _start = time.time()
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-                "features": features.cpu().numpy(),
-                "feature_lengths": np.array([features.size(1)], dtype=np.int64),
-            }
-            emission = self.acoustic_ort.run(None, ort_inputs)[0]  # (1, T, vocab_size) numpy
-            emission = torch.from_numpy(emission).to(device or self.device)
+
+        if ndarray.shape[1] < 160:
+            ndarray = np.pad(ndarray, ((0, 0), (0, 320 - ndarray.shape[1])), mode="constant")
+
+        CHUNK_SIZE = 60 * 16000  # 60 seconds
+        total_samples = ndarray.shape[1]
+
+        if total_samples > CHUNK_SIZE:
+            frame_samples = int(16000 * self.frame_shift)
+            emissions = np.empty((1, total_samples // frame_samples + 1, self.vocab_size), dtype=np.float32)
+            for start in range(0, total_samples, CHUNK_SIZE):
+                chunk = ndarray[:, start : start + CHUNK_SIZE]
+                if chunk.shape[1] < 160:
+                    chunk = np.pad(chunk, ((0, 0), (0, 320 - chunk.shape[1])), mode="constant")
+
+                emission_out = self.acoustic_ort.run(None, {"audios": chunk})[0]
+                if acoustic_scale != 1.0:
+                    emission_out *= acoustic_scale
+                sf = start // frame_samples  # start frame
+                lf = sf + emission_out.shape[1]  # last frame
+                emissions[0, sf:lf, :] = emission_out
+            emissions[:, lf:, :] = 0.0
         else:
-
-
-
-
-
-
-
-            emission = self.acoustic_ort.run(
-                None,
-                {
-                    "audios": ndarray[:, start : start + CHUNK_SIZE],
-                },
-            )  # (1, T, vocab_size) numpy
-            emissions.append(emission[0])
-
-            emission = torch.cat(
-                [torch.from_numpy(emission).to(device or self.device) for emission in emissions], dim=1
-            )  # (1, T, vocab_size)
-            del emissions
-        else:
-            emission = self.acoustic_ort.run(
-                None,
-                {
-                    "audios": ndarray,
-                },
-            )  # (1, T, vocab_size) numpy
-            emission = torch.from_numpy(emission[0]).to(device or self.device)
+            emission_out = self.acoustic_ort.run(
+                None,
+                {
+                    "audios": ndarray,
+                },
+            )  # (1, T, vocab_size) numpy
+            emissions = emission_out[0]

-
-
+        if acoustic_scale != 1.0:
+            emissions *= acoustic_scale

         self.timings["emission"] += time.time() - _start
-        return
+        return emissions  # (1, T, vocab_size) numpy

     def alignment(
         self,
         audio: AudioData,
         lattice_graph: Tuple[str, int, float],
-        emission: Optional[
+        emission: Optional[np.ndarray] = None,
         offset: float = 0.0,
     ) -> Dict[str, Any]:
         """Process audio with LatticeGraph.
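The chunk bookkeeping in the emission hunk above is easiest to verify with concrete numbers: at 16 kHz and a 20 ms frame shift, one frame spans 320 samples and a 60-second chunk yields 3000 frames. A small sketch of that arithmetic, mirroring the hunk's constants:

SAMPLE_RATE = 16000
FRAME_SHIFT = 0.02                                # seconds, matches Lattice1Worker.frame_shift
CHUNK_SIZE = 60 * SAMPLE_RATE                     # 60 s of samples, as in the hunk

frame_samples = int(SAMPLE_RATE * FRAME_SHIFT)    # 320 samples per 20 ms frame
frames_per_chunk = CHUNK_SIZE // frame_samples    # 3000 frames per 60 s chunk

total_samples = 150 * SAMPLE_RATE                 # e.g. a 150 s recording
n_frames = total_samples // frame_samples + 1     # rows preallocated by np.empty above
chunk_offsets = [start // frame_samples for start in range(0, total_samples, CHUNK_SIZE)]
print(frame_samples, frames_per_chunk, n_frames)  # 320 3000 7501
print(chunk_offsets)                              # [0, 3000, 6000] -> "sf" for each chunk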
@@ -165,7 +147,7 @@ class Lattice1Worker:
         Args:
             audio: AudioData object
             lattice_graph: LatticeGraph data
-            emission: Pre-computed emission
+            emission: Pre-computed emission numpy array (ignored if streaming=True)
             offset: Time offset for the audio
             streaming: If True, use streaming mode for memory-efficient processing

@@ -178,25 +160,18 @@
             AlignmentError: If alignment process fails
         """
         try:
-            import k2
-        except ImportError:
-            raise DependencyError("k2", install_command="pip install install-k2 && python -m install_k2")
-
-        try:
-            from lattifai_core.lattice.decode import align_segments
+            import k2py as k2
         except ImportError:
-            raise DependencyError("
+            raise DependencyError("k2py", install_command="pip install k2py")

         lattice_graph_str, final_state, acoustic_scale = lattice_graph

         _start = time.time()
         try:
-            # Create decoding graph
-
-            decoding_graph
-
-            decoding_graph.skip_id = int(final_state)
-            decoding_graph.return_id = int(final_state + 1)
+            # Create decoding graph using k2py
+            graph_dict = k2.CreateFsaVecFromStr(lattice_graph_str, int(final_state), False)
+            decoding_graph = graph_dict["fsa"]
+            aux_labels = graph_dict["aux_labels"]
         except Exception as e:
             raise AlignmentError(
                 "Failed to create decoding graph from lattice",
@@ -204,11 +179,6 @@
             )
         self.timings["decoding_graph"] += time.time() - _start

-        if self.device.type == "mps":
-            device = "cpu"  # k2 does not support mps yet
-        else:
-            device = self.device
-
         _start = time.time()

         # Get beam search parameters from config or use defaults
@@ -218,71 +188,54 @@
         max_active_states = self.alignment_config.max_active_states or 10000

         if emission is None and audio.streaming_mode:
-            #
-
-
-
-
-
-
-
-            total_minutes = int(total_duration / 60.0)
-
-            with tqdm(
-                total=total_minutes,
-                desc=f"Processing audio ({total_minutes} min)",
-                unit="min",
-                unit_scale=False,
-                unit_divisor=1,
-            ) as pbar:
-                for chunk in audio.iter_chunks():
-                    chunk_emission = self.emission(chunk.ndarray, acoustic_scale=acoustic_scale, device=device)
-
-                    # Update progress based on chunk duration in minutes
-                    chunk_duration = int(chunk.duration / 60.0)
-                    pbar.update(chunk_duration)
-                    processed_duration += chunk_duration
-
-                    yield chunk_emission
-
-            # Calculate total frames for supervision_segments
-            total_frames = int(audio.duration / self.frame_shift)
-
-            results, labels = align_segments(
-                emission_iterator(),  # Pass iterator for streaming
-                decoding_graph.to(device),
-                torch.tensor([total_frames], dtype=torch.int32),
-                search_beam=search_beam,
-                output_beam=output_beam,
-                min_active_states=min_active_states,
-                max_active_states=max_active_states,
-                subsampling_factor=1,
-                reject_low_confidence=False,
+            # Initialize OnlineDenseIntersecter for streaming
+            intersecter = k2.OnlineDenseIntersecter(
+                decoding_graph,
+                aux_labels,
+                float(search_beam),
+                float(output_beam),
+                int(min_active_states),
+                int(max_active_states),
             )

-            #
+            # Streaming mode
+            total_duration = audio.duration
+            total_minutes = int(total_duration / 60.0)
+
+            with tqdm(
+                total=total_minutes,
+                desc=f"Processing audio ({total_minutes} min)",
+                unit="min",
+                unit_scale=False,
+                unit_divisor=1,
+            ) as pbar:
+                for chunk in audio.iter_chunks():
+                    chunk_emission = self.emission(chunk.ndarray, acoustic_scale=acoustic_scale)
+                    intersecter.decode(chunk_emission[0])
+
+                    # Update progress
+                    chunk_duration = int(chunk.duration / 60.0)
+                    pbar.update(chunk_duration)
+
             emission_result = None
+            # Get results from intersecter
+            results, labels = intersecter.finish()
         else:
-            # Batch mode
+            # Batch mode
             if emission is None:
-                emission = self.emission(
-                    audio.ndarray, acoustic_scale=acoustic_scale, device=device
-                )  # (1, T, vocab_size)
+                emission = self.emission(audio.ndarray, acoustic_scale=acoustic_scale)  # (1, T, vocab_size)
             else:
-
-
-
-
-
-
-                search_beam
-                output_beam
-                min_active_states
-                max_active_states
-                subsampling_factor=1,
-                reject_low_confidence=False,
+                if acoustic_scale != 1.0:
+                    emission *= acoustic_scale
+            # Use AlignSegments directly
+            results, labels = k2.AlignSegments(
+                graph_dict,
+                emission[0],  # Pass the prepared scores
+                float(search_beam),
+                float(output_beam),
+                int(min_active_states),
+                int(max_active_states),
             )
-
             emission_result = emission

         self.timings["align_segments"] += time.time() - _start
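Condensed, the rewritten decode path reduces to the control flow below. The k2py calls (CreateFsaVecFromStr, OnlineDenseIntersecter, AlignSegments) are used exactly as they appear in the hunk above; the chunk iterator and beam values are stand-ins for audio.iter_chunks() and the AlignmentConfig settings, so treat this as a sketch rather than the shipped implementation.

import k2py as k2  # dependency introduced in this release; see the import hunk above

def decode_lattice(graph_str, final_state, emission_fn, chunks, emission,
                   search_beam, output_beam, min_active_states, max_active_states):
    """Sketch of the streaming-vs-batch branch; beam values come from AlignmentConfig."""
    graph_dict = k2.CreateFsaVecFromStr(graph_str, int(final_state), False)
    if emission is None and chunks is not None:
        # Streaming: feed per-chunk emissions so the full matrix is never materialized.
        intersecter = k2.OnlineDenseIntersecter(
            graph_dict["fsa"], graph_dict["aux_labels"],
            float(search_beam), float(output_beam),
            int(min_active_states), int(max_active_states),
        )
        for chunk in chunks:
            intersecter.decode(emission_fn(chunk)[0])  # (T_chunk, vocab_size) scores
        return intersecter.finish()                    # -> (results, labels)
    # Batch: decode the whole pre-computed emission in one call.
    return k2.AlignSegments(
        graph_dict, emission[0],
        float(search_beam), float(output_beam),
        int(min_active_states), int(max_active_states),
    )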
@@ -290,6 +243,41 @@
         channel = 0
         return emission_result, results, labels, self.frame_shift, offset, channel  # frame_shift=20ms

+    def profile(self) -> None:
+        """Print formatted profiling statistics."""
+        if not self.timings:
+            return
+
+        safe_print(colorful.bold(colorful.cyan("\n⏱️ Alignment Profiling")))
+        safe_print(colorful.gray("─" * 44))
+        safe_print(
+            f"{colorful.bold('Phase'.ljust(21))} "
+            f"{colorful.bold('Time'.ljust(12))} "
+            f"{colorful.bold('Percent'.rjust(8))}"
+        )
+        safe_print(colorful.gray("─" * 44))
+
+        total_time = sum(self.timings.values())
+
+        # Sort by duration descending
+        sorted_stats = sorted(self.timings.items(), key=lambda x: x[1], reverse=True)
+
+        for name, duration in sorted_stats:
+            percentage = (duration / total_time * 100) if total_time > 0 else 0.0
+            # Name: Cyan, Time: Yellow, Percent: Gray
+            safe_print(
+                f"{name:<20} "
+                f"{colorful.yellow(f'{duration:7.4f}s'.ljust(12))} "
+                f"{colorful.gray(f'{percentage:.2f}%'.rjust(8))}"
+            )
+
+        safe_print(colorful.gray("─" * 44))
+        # Pad "Total Time" before coloring to ensure correct alignment (ANSI codes don't count for width)
+        safe_print(
+            f"{colorful.bold('Total Time'.ljust(20))} "
+            f"{colorful.bold(colorful.yellow(f'{total_time:7.4f}s'.ljust(12)))}\n"
+        )
+

 def _load_worker(model_path: str, device: str, config: Optional[Any] = None) -> Lattice1Worker:
     """Instantiate lattice worker with consistent error handling."""
lattifai/alignment/tokenizer.py
CHANGED
@@ -4,7 +4,7 @@ import re
 from collections import defaultdict
 from typing import Any, Dict, List, Optional, Tuple, Type, TypeVar

-import
+import numpy as np

 from lattifai.alignment.phonemizer import G2Phonemizer
 from lattifai.caption import Supervision
@@ -121,6 +121,7 @@ class LatticeTokenizer:
     def __init__(self, client_wrapper: Any):
         self.client_wrapper = client_wrapper
         self.model_name = ""
+        self.model_hub: Optional[str] = None
         self.words: List[str] = []
         self.g2p_model: Any = None  # Placeholder for G2P model
         self.dictionaries = defaultdict(lambda: [])
@@ -142,10 +143,20 @@ class LatticeTokenizer:
         elif device.startswith("mps") and ort.get_all_providers().count("MPSExecutionProvider") > 0:
             providers.append("MPSExecutionProvider")

-
-
-
-
+        if self.model_hub == "modelscope":
+            from modelscope.hub.snapshot_download import snapshot_download as ms_snapshot
+
+            downloaded_path = ms_snapshot("LattifAI/OmniTokenizer")
+            sat = SaT(
+                f"{downloaded_path}/sat-3l-sm",
+                tokenizer_name_or_path=f"{downloaded_path}/xlm-roberta-base",
+                ort_providers=providers + ["CPUExecutionProvider"],
+            )
+        else:
+            sat = SaT(
+                "sat-3l-sm",
+                ort_providers=providers + ["CPUExecutionProvider"],
+            )
         self.sentence_splitter = sat

     @staticmethod
@@ -200,6 +211,7 @@ class LatticeTokenizer:
         client_wrapper: Any,
         model_path: str,
         model_name: str,
+        model_hub: Optional[str] = None,
         device: str = "cpu",
         compressed: bool = True,
     ) -> TokenizerT:
@@ -214,7 +226,7 @@
         else:
             with open(words_model_path, "rb") as f:
                 data = pickle.load(f)
-        except
+        except Exception as e:
             del e
             import msgpack

@@ -227,6 +239,7 @@

         tokenizer = cls(client_wrapper=client_wrapper)
         tokenizer.model_name = model_name
+        tokenizer.model_hub = model_hub
         tokenizer.words = data["words"]
         tokenizer.dictionaries = defaultdict(list, data["dictionaries"])
         tokenizer.oov_word = data["oov_word"]
@@ -431,9 +444,11 @@
     def detokenize(
         self,
         lattice_id: str,
-        lattice_results: Tuple[
+        lattice_results: Tuple[np.ndarray, Any, Any, float, float],
         supervisions: List[Supervision],
         return_details: bool = False,
+        start_margin: float = 0.08,
+        end_margin: float = 0.20,
     ) -> List[Supervision]:
         emission, results, labels, frame_shift, offset, channel = lattice_results  # noqa: F841
         response = self.client_wrapper.post(
@@ -448,6 +463,8 @@
                 "channel": channel,
                 "return_details": False if return_details is None else return_details,
                 "destroy_lattice": True,
+                "start_margin": start_margin,
+                "end_margin": end_margin,
             },
         )
         if response.status_code == 400:
@@ -477,7 +494,7 @@

 def _add_confidence_scores(
     supervisions: List[Supervision],
-    emission:
+    emission: np.ndarray,
     labels: List[int],
     frame_shift: float,
     offset: float = 0.0,
@@ -495,17 +512,17 @@ def _add_confidence_scores(
         labels: Token labels corresponding to aligned tokens
         frame_shift: Frame shift in seconds for converting frames to time
     """
-    tokens =
+    tokens = np.array(labels, dtype=np.int64)

     for supervision in supervisions:
         start_frame = int((supervision.start - offset) / frame_shift)
         end_frame = int((supervision.end - offset) / frame_shift)

         # Compute segment-level confidence
-        probabilities = emission[0, start_frame:end_frame]
+        probabilities = np.exp(emission[0, start_frame:end_frame])
         aligned = probabilities[range(0, end_frame - start_frame), tokens[start_frame:end_frame]]
-        diffprobs =
-        supervision.score = round(1.0 - diffprobs.mean()
+        diffprobs = np.max(probabilities, axis=-1) - aligned
+        supervision.score = round(1.0 - diffprobs.mean(), ndigits=4)

         # Compute word-level confidence if alignment exists
         if hasattr(supervision, "alignment") and supervision.alignment:
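The confidence arithmetic above is easier to read with concrete numbers: the emission holds log-probabilities (hence the added np.exp), and a segment scores 1.0 exactly when the aligned token is the per-frame argmax. A small self-contained sketch with fake data:

import numpy as np

rng = np.random.default_rng(0)
log_probs = np.log(rng.dirichlet(np.ones(5), size=(1, 10)))  # fake emission: (1, T=10, vocab=5)
tokens = rng.integers(0, 5, size=10)                         # aligned label per frame

probabilities = np.exp(log_probs[0, 0:10])                   # back to probabilities, as in the hunk
aligned = probabilities[range(10), tokens]                   # probability mass on the aligned token
diffprobs = np.max(probabilities, axis=-1) - aligned         # gap to each frame's best-scoring token
score = round(1.0 - diffprobs.mean(), ndigits=4)             # 1.0 only if the aligned token is always the argmax
print(score)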
@@ -513,7 +530,7 @@ def _add_confidence_scores(
             for w, item in enumerate(words):
                 start = int((item.start - offset) / frame_shift) - start_frame
                 end = int((item.end - offset) / frame_shift) - start_frame
-                words[w] = item._replace(score=round(1.0 - diffprobs[start:end].mean()
+                words[w] = item._replace(score=round(1.0 - diffprobs[start:end].mean(), ndigits=4))


 def _update_alignments_speaker(supervisions: List[Supervision], alignments: List[Supervision]) -> List[Supervision]:
@@ -535,15 +552,14 @@ def _load_tokenizer(
     model_name: str,
     device: str,
     *,
+    model_hub: Optional[str] = None,
     tokenizer_cls: Type[LatticeTokenizer] = LatticeTokenizer,
 ) -> LatticeTokenizer:
     """Instantiate tokenizer with consistent error handling."""
-
-
-
-
-
-
-
-    except Exception as e:
-        raise ModelLoadError(f"tokenizer from {model_path}", original_error=e)
+    return tokenizer_cls.from_pretrained(
+        client_wrapper=client_wrapper,
+        model_path=model_path,
+        model_name=model_name,
+        model_hub=model_hub,
+        device=device,
+    )