lattifai 0.4.6__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lattifai/__init__.py +42 -27
- lattifai/alignment/__init__.py +6 -0
- lattifai/alignment/lattice1_aligner.py +119 -0
- lattifai/{workers/lattice1_alpha.py → alignment/lattice1_worker.py} +33 -132
- lattifai/{tokenizer → alignment}/phonemizer.py +1 -1
- lattifai/alignment/segmenter.py +166 -0
- lattifai/{tokenizer → alignment}/tokenizer.py +186 -112
- lattifai/audio2.py +211 -0
- lattifai/caption/__init__.py +20 -0
- lattifai/caption/caption.py +1275 -0
- lattifai/{io → caption}/supervision.py +1 -0
- lattifai/{io → caption}/text_parser.py +53 -10
- lattifai/cli/__init__.py +17 -0
- lattifai/cli/alignment.py +153 -0
- lattifai/cli/caption.py +204 -0
- lattifai/cli/server.py +19 -0
- lattifai/cli/transcribe.py +197 -0
- lattifai/cli/youtube.py +128 -0
- lattifai/client.py +455 -246
- lattifai/config/__init__.py +20 -0
- lattifai/config/alignment.py +73 -0
- lattifai/config/caption.py +178 -0
- lattifai/config/client.py +46 -0
- lattifai/config/diarization.py +67 -0
- lattifai/config/media.py +335 -0
- lattifai/config/transcription.py +84 -0
- lattifai/diarization/__init__.py +5 -0
- lattifai/diarization/lattifai.py +89 -0
- lattifai/errors.py +41 -34
- lattifai/logging.py +116 -0
- lattifai/mixin.py +552 -0
- lattifai/server/app.py +420 -0
- lattifai/transcription/__init__.py +76 -0
- lattifai/transcription/base.py +108 -0
- lattifai/transcription/gemini.py +219 -0
- lattifai/transcription/lattifai.py +103 -0
- lattifai/types.py +30 -0
- lattifai/utils.py +3 -31
- lattifai/workflow/__init__.py +22 -0
- lattifai/workflow/agents.py +6 -0
- lattifai/{workflows → workflow}/file_manager.py +81 -57
- lattifai/workflow/youtube.py +564 -0
- lattifai-1.0.0.dist-info/METADATA +736 -0
- lattifai-1.0.0.dist-info/RECORD +52 -0
- {lattifai-0.4.6.dist-info → lattifai-1.0.0.dist-info}/WHEEL +1 -1
- lattifai-1.0.0.dist-info/entry_points.txt +13 -0
- lattifai/base_client.py +0 -126
- lattifai/bin/__init__.py +0 -3
- lattifai/bin/agent.py +0 -324
- lattifai/bin/align.py +0 -295
- lattifai/bin/cli_base.py +0 -25
- lattifai/bin/subtitle.py +0 -210
- lattifai/io/__init__.py +0 -43
- lattifai/io/reader.py +0 -86
- lattifai/io/utils.py +0 -15
- lattifai/io/writer.py +0 -102
- lattifai/tokenizer/__init__.py +0 -3
- lattifai/workers/__init__.py +0 -3
- lattifai/workflows/__init__.py +0 -34
- lattifai/workflows/agents.py +0 -12
- lattifai/workflows/gemini.py +0 -167
- lattifai/workflows/prompts/README.md +0 -22
- lattifai/workflows/prompts/gemini/README.md +0 -24
- lattifai/workflows/prompts/gemini/transcription_gem.txt +0 -81
- lattifai/workflows/youtube.py +0 -931
- lattifai-0.4.6.dist-info/METADATA +0 -806
- lattifai-0.4.6.dist-info/RECORD +0 -39
- lattifai-0.4.6.dist-info/entry_points.txt +0 -3
- /lattifai/{io → caption}/gemini_reader.py +0 -0
- /lattifai/{io → caption}/gemini_writer.py +0 -0
- /lattifai/{workflows → transcription}/prompts/__init__.py +0 -0
- /lattifai/{workflows → workflow}/base.py +0 -0
- {lattifai-0.4.6.dist-info → lattifai-1.0.0.dist-info}/licenses/LICENSE +0 -0
- {lattifai-0.4.6.dist-info → lattifai-1.0.0.dist-info}/top_level.txt +0 -0
lattifai/{tokenizer → alignment}/tokenizer.py
@@ -1,5 +1,4 @@
 import gzip
-import inspect
 import pickle
 import re
 from collections import defaultdict
@@ -7,9 +6,15 @@ from typing import Any, Dict, List, Optional, Tuple, Type, TypeVar

 import torch

-from lattifai.
-from lattifai.
-from lattifai.
+from lattifai.alignment.phonemizer import G2Phonemizer
+from lattifai.caption import Supervision
+from lattifai.caption import normalize_text as normalize_html_text
+from lattifai.errors import (
+    LATTICE_DECODING_FAILURE_HELP,
+    LatticeDecodingError,
+    ModelLoadError,
+    QuotaExceededError,
+)

 PUNCTUATION = '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~'
 END_PUNCTUATION = '.!?"]。!?”】'
@@ -24,11 +29,98 @@ MAXIMUM_WORD_LENGTH = 40
 TokenizerT = TypeVar("TokenizerT", bound="LatticeTokenizer")


+def _is_punctuation(char: str) -> bool:
+    """Check if a character is punctuation (not space, not alphanumeric, not CJK)."""
+    if len(char) != 1:
+        return False
+    if char.isspace():
+        return False
+    if char.isalnum():
+        return False
+    # Check if it's a CJK character
+    if "\u4e00" <= char <= "\u9fff":
+        return False
+    # Check if it's an accented Latin character
+    if "\u00c0" <= char <= "\u024f":
+        return False
+    return True
+
+
+def tokenize_multilingual_text(text: str, keep_spaces: bool = True, attach_punctuation: bool = False) -> list[str]:
+    """
+    Tokenize a mixed Chinese-English string into individual units.
+
+    Tokenization rules:
+    - Chinese characters (CJK) are split individually
+    - Consecutive Latin letters (including accented characters) and digits are grouped as one unit
+    - English contractions ('s, 't, 'm, 'll, 're, 've) are kept with the preceding word
+    - Other characters (punctuation, spaces) are split individually by default
+    - If attach_punctuation=True, punctuation marks are attached to the preceding token
+
+    Args:
+        text: Input string containing mixed Chinese and English text
+        keep_spaces: If True, spaces are included in the output as separate tokens.
+            If False, spaces are excluded from the output. Default is True.
+        attach_punctuation: If True, punctuation marks are attached to the preceding token.
+            For example, "Hello, World!" becomes ["Hello,", " ", "World!"].
+            Default is False.
+
+    Returns:
+        List of tokenized units
+
+    Examples:
+        >>> tokenize_multilingual_text("Hello世界")
+        ['Hello', '世', '界']
+        >>> tokenize_multilingual_text("I'm fine")
+        ["I'm", ' ', 'fine']
+        >>> tokenize_multilingual_text("I'm fine", keep_spaces=False)
+        ["I'm", 'fine']
+        >>> tokenize_multilingual_text("Kühlschrank")
+        ['Kühlschrank']
+        >>> tokenize_multilingual_text("Hello, World!", attach_punctuation=True)
+        ['Hello,', ' ', 'World!']
+    """
+    # Regex pattern:
+    # - [a-zA-Z0-9\u00C0-\u024F]+ matches Latin letters (including accented chars like ü, ö, ä, ß, é, etc.)
+    # - (?:'[a-zA-Z]{1,2})? optionally matches contractions like 's, 't, 'm, 'll, 're, 've
+    # - [\u4e00-\u9fff] matches CJK characters
+    # - . matches any other single character
+    # Unicode ranges:
+    # - \u00C0-\u00FF: Latin-1 Supplement (À-ÿ)
+    # - \u0100-\u017F: Latin Extended-A
+    # - \u0180-\u024F: Latin Extended-B
+    pattern = re.compile(r"([a-zA-Z0-9\u00C0-\u024F]+(?:'[a-zA-Z]{1,2})?|[\u4e00-\u9fff]|.)")
+
+    # filter(None, ...) removes any empty strings from re.findall results
+    tokens = list(filter(None, pattern.findall(text)))
+
+    if attach_punctuation and len(tokens) > 1:
+        # Attach punctuation to the preceding token
+        # Punctuation characters (excluding spaces) are merged with the previous token
+        merged_tokens = []
+        i = 0
+        while i < len(tokens):
+            token = tokens[i]
+            # Look ahead to collect consecutive punctuation (non-space, non-alphanumeric, non-CJK)
+            if merged_tokens and _is_punctuation(token):
+                merged_tokens[-1] = merged_tokens[-1] + token
+            else:
+                merged_tokens.append(token)
+            i += 1
+        tokens = merged_tokens
+
+    if not keep_spaces:
+        tokens = [t for t in tokens if not t.isspace()]
+
+    return tokens
+
+
 class LatticeTokenizer:
     """Tokenizer for converting Lhotse Cut to LatticeGraph."""

     def __init__(self, client_wrapper: Any):
         self.client_wrapper = client_wrapper
+        self.model_name = ""
         self.words: List[str] = []
         self.g2p_model: Any = None  # Placeholder for G2P model
         self.dictionaries = defaultdict(lambda: [])
@@ -107,6 +199,7 @@ class LatticeTokenizer:
         cls: Type[TokenizerT],
         client_wrapper: Any,
         model_path: str,
+        model_name: str,
         device: str = "cpu",
         compressed: bool = True,
     ) -> TokenizerT:
@@ -114,21 +207,37 @@
         from pathlib import Path

         words_model_path = f"{model_path}/words.bin"
-
-
-
-
-
-
+        try:
+            if compressed:
+                with gzip.open(words_model_path, "rb") as f:
+                    data = pickle.load(f)
+            else:
+                with open(words_model_path, "rb") as f:
+                    data = pickle.load(f)
+        except pickle.UnpicklingError as e:
+            del e
+            import msgpack
+
+            if compressed:
+                with gzip.open(words_model_path, "rb") as f:
+                    data = msgpack.unpack(f, raw=False, strict_map_key=False)
+            else:
+                with open(words_model_path, "rb") as f:
+                    data = msgpack.unpack(f, raw=False, strict_map_key=False)

         tokenizer = cls(client_wrapper=client_wrapper)
+        tokenizer.model_name = model_name
         tokenizer.words = data["words"]
         tokenizer.dictionaries = defaultdict(list, data["dictionaries"])
         tokenizer.oov_word = data["oov_word"]

-
-        if
-            tokenizer.g2p_model = G2Phonemizer(
+        g2pp_model_path = f"{model_path}/g2pp.bin" if Path(f"{model_path}/g2pp.bin").exists() else None
+        if g2pp_model_path:
+            tokenizer.g2p_model = G2Phonemizer(g2pp_model_path, device=device)
+        else:
+            g2p_model_path = f"{model_path}/g2p.bin" if Path(f"{model_path}/g2p.bin").exists() else None
+            if g2p_model_path:
+                tokenizer.g2p_model = G2Phonemizer(g2p_model_path, device=device)

         tokenizer.device = device
         tokenizer.add_special_tokens()
@@ -148,7 +257,10 @@ class LatticeTokenizer:
         oov_words = []
         for text in texts:
             text = normalize_html_text(text)
-
+            # support english, chinese and german tokenization
+            words = tokenize_multilingual_text(
+                text.lower().replace("-", " ").replace("—", " ").replace("–", " "), keep_spaces=False
+            )
             oovs = [w.strip(PUNCTUATION) for w in words if w not in self.words]
             if oovs:
                 oov_words.extend([w for w in oovs if (w not in self.words and len(w) <= MAXIMUM_WORD_LENGTH)])
@@ -188,28 +300,39 @@

         Carefull about speaker changes.
         """
-        texts,
-
+        texts, speakers = [], []
+        text_len, sidx = 0, 0
+
+        def flush_segment(end_idx: int, speaker: Optional[str] = None):
+            """Flush accumulated text from sidx to end_idx with given speaker."""
+            nonlocal text_len, sidx
+            if sidx <= end_idx:
+                if len(speakers) < len(texts) + 1:
+                    speakers.append(speaker)
+                text = " ".join(sup.text for sup in supervisions[sidx : end_idx + 1])
+                texts.append(text)
+                sidx = end_idx + 1
+                text_len = 0
+
         for s, supervision in enumerate(supervisions):
             text_len += len(supervision.text)
+            is_last = s == len(supervisions) - 1
+
             if supervision.speaker:
+                # Flush previous segment without speaker (if any)
                 if sidx < s:
-
-                    speakers.append(None)
-                    text = " ".join([sup.text for sup in supervisions[sidx:s]])
-                    texts.append(text)
-                    sidx = s
+                    flush_segment(s - 1, None)
                     text_len = len(supervision.text)
-                speakers.append(supervision.speaker)

-
-
-
-
-
-
-
-
+                # Check if we should flush this speaker's segment now
+                next_has_speaker = not is_last and supervisions[s + 1].speaker
+                if is_last or next_has_speaker:
+                    flush_segment(s, supervision.speaker)
+                else:
+                    speakers.append(supervision.speaker)
+
+            elif text_len >= 2000 or is_last:
+                flush_segment(s, None)

         assert len(speakers) == len(texts), f"len(speakers)={len(speakers)} != len(texts)={len(texts)}"
         sentences = self.sentence_splitter.split(texts, threshold=0.15, strip_whitespace=strip_whitespace)
@@ -288,10 +411,13 @@ class LatticeTokenizer:
         response = self.client_wrapper.post(
             "tokenize",
             json={
+                "model_name": self.model_name,
                 "supervisions": [s.to_dict() for s in supervisions],
                 "pronunciation_dictionaries": pronunciation_dictionaries,
             },
         )
+        if response.status_code == 402:
+            raise QuotaExceededError(response.json().get("detail", "Quota exceeded"))
         if response.status_code != 200:
             raise Exception(f"Failed to tokenize texts: {response.text}")
         result = response.json()
@@ -313,13 +439,14 @@ class LatticeTokenizer:
         response = self.client_wrapper.post(
             "detokenize",
             json={
+                "model_name": self.model_name,
                 "lattice_id": lattice_id,
                 "frame_shift": frame_shift,
                 "results": [t.to_dict() for t in results[0]],
                 "labels": labels[0],
                 "offset": offset,
                 "channel": channel,
-                "return_details": return_details,
+                "return_details": False if return_details is None else return_details,
                 "destroy_lattice": True,
             },
         )
@@ -328,6 +455,8 @@
                 lattice_id,
                 original_error=Exception(LATTICE_DECODING_FAILURE_HELP),
             )
+        if response.status_code == 402:
+            raise QuotaExceededError(response.json().get("detail", "Quota exceeded"))
         if response.status_code != 200:
             raise Exception(f"Failed to detokenize lattice: {response.text}")

@@ -339,83 +468,7 @@ class LatticeTokenizer:

         if return_details:
             # Add emission confidence scores for segments and word-level alignments
-            _add_confidence_scores(alignments, emission, labels[0], frame_shift)
-
-        alignments = _update_alignments_speaker(supervisions, alignments)
-
-        return alignments
-
-
-class AsyncLatticeTokenizer(LatticeTokenizer):
-    async def _post_async(self, endpoint: str, **kwargs):
-        response = self.client_wrapper.post(endpoint, **kwargs)
-        if inspect.isawaitable(response):
-            return await response
-        return response
-
-    async def tokenize(
-        self, supervisions: List[Supervision], split_sentence: bool = False
-    ) -> Tuple[str, Dict[str, Any]]:
-        if split_sentence:
-            self.init_sentence_splitter()
-            supervisions = self.split_sentences(supervisions)
-
-        pronunciation_dictionaries = self.prenormalize([s.text for s in supervisions])
-        response = await self._post_async(
-            "tokenize",
-            json={
-                "supervisions": [s.to_dict() for s in supervisions],
-                "pronunciation_dictionaries": pronunciation_dictionaries,
-            },
-        )
-        if response.status_code != 200:
-            raise Exception(f"Failed to tokenize texts: {response.text}")
-        result = response.json()
-        lattice_id = result["id"]
-        return (
-            supervisions,
-            lattice_id,
-            (result["lattice_graph"], result["final_state"], result.get("acoustic_scale", 1.0)),
-        )
-
-    async def detokenize(
-        self,
-        lattice_id: str,
-        lattice_results: Tuple[torch.Tensor, Any, Any, float, float],
-        supervisions: List[Supervision],
-        return_details: bool = False,
-    ) -> List[Supervision]:
-        emission, results, labels, frame_shift, offset, channel = lattice_results  # noqa: F841
-        response = await self._post_async(
-            "detokenize",
-            json={
-                "lattice_id": lattice_id,
-                "frame_shift": frame_shift,
-                "results": [t.to_dict() for t in results[0]],
-                "labels": labels[0],
-                "offset": offset,
-                "channel": channel,
-                "return_details": return_details,
-                "destroy_lattice": True,
-            },
-        )
-        if response.status_code == 422:
-            raise LatticeDecodingError(
-                lattice_id,
-                original_error=Exception(LATTICE_DECODING_FAILURE_HELP),
-            )
-        if response.status_code != 200:
-            raise Exception(f"Failed to detokenize lattice: {response.text}")
-
-        result = response.json()
-        if not result.get("success"):
-            return Exception("Failed to detokenize the alignment results.")
-
-        alignments = [Supervision.from_dict(s) for s in result["supervisions"]]
-
-        if return_details:
-            # Add emission confidence scores for segments and word-level alignments
-            _add_confidence_scores(alignments, emission, labels[0], frame_shift)
+            _add_confidence_scores(alignments, emission, labels[0], frame_shift, offset)

         alignments = _update_alignments_speaker(supervisions, alignments)

@@ -427,6 +480,7 @@ def _add_confidence_scores(
     emission: torch.Tensor,
     labels: List[int],
     frame_shift: float,
+    offset: float = 0.0,
 ) -> None:
     """
     Add confidence scores to supervisions and their word-level alignments.
@@ -444,8 +498,8 @@
     tokens = torch.tensor(labels, dtype=torch.int64, device=emission.device)

     for supervision in supervisions:
-        start_frame = int(supervision.start / frame_shift)
-        end_frame = int(supervision.end / frame_shift)
+        start_frame = int((supervision.start - offset) / frame_shift)
+        end_frame = int((supervision.end - offset) / frame_shift)

         # Compute segment-level confidence
         probabilities = emission[0, start_frame:end_frame].softmax(dim=-1)
@@ -457,8 +511,8 @@
         if hasattr(supervision, "alignment") and supervision.alignment:
             words = supervision.alignment.get("word", [])
             for w, item in enumerate(words):
-                start = int(item.start / frame_shift) - start_frame
-                end = int(item.end / frame_shift) - start_frame
+                start = int((item.start - offset) / frame_shift) - start_frame
+                end = int((item.end - offset) / frame_shift) - start_frame
                 words[w] = item._replace(score=round(1.0 - diffprobs[start:end].mean().item(), ndigits=4))


@@ -473,3 +527,23 @@ def _update_alignments_speaker(supervisions: List[Supervision], alignments: List
     for supervision, alignment in zip(supervisions, alignments):
         alignment.speaker = supervision.speaker
     return alignments
+
+
+def _load_tokenizer(
+    client_wrapper: Any,
+    model_path: str,
+    model_name: str,
+    device: str,
+    *,
+    tokenizer_cls: Type[LatticeTokenizer] = LatticeTokenizer,
+) -> LatticeTokenizer:
+    """Instantiate tokenizer with consistent error handling."""
+    try:
+        return tokenizer_cls.from_pretrained(
+            client_wrapper=client_wrapper,
+            model_path=model_path,
+            model_name=model_name,
+            device=device,
+        )
+    except Exception as e:
+        raise ModelLoadError(f"tokenizer from {model_path}", original_error=e)
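Editor's note: a minimal usage sketch for the new module-level tokenize_multilingual_text helper shown above. The import path lattifai.alignment.tokenizer is an assumption based on the tokenizer.py rename in the file list, and the expected outputs are taken from the function's own docstring examples, not verified against the released wheel.

    # Assumed import path; tokenizer.py moved from lattifai/tokenizer/ to lattifai/alignment/
    from lattifai.alignment.tokenizer import tokenize_multilingual_text

    tokenize_multilingual_text("Hello世界")                             # ['Hello', '世', '界'] per the docstring
    tokenize_multilingual_text("I'm fine", keep_spaces=False)           # ["I'm", 'fine']
    tokenize_multilingual_text("Hello, World!", attach_punctuation=True)  # ['Hello,', ' ', 'World!']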
lattifai/audio2.py
ADDED
@@ -0,0 +1,211 @@
+"""Audio loading and resampling utilities."""
+
+from collections import namedtuple
+from pathlib import Path
+from typing import BinaryIO, Iterable, Optional, Tuple, Union
+
+import numpy as np
+import soundfile as sf
+import torch
+from lhotse.augmentation import get_or_create_resampler
+from lhotse.utils import Pathlike
+
+from lattifai.errors import AudioLoadError
+
+# ChannelSelectorType = Union[int, Iterable[int], str]
+ChannelSelectorType = Union[int, str]
+
+
+class AudioData(namedtuple("AudioData", ["sampling_rate", "ndarray", "tensor", "device", "path"])):
+    """Audio data container with sampling rate, numpy array, tensor, and device information."""
+
+    def __str__(self) -> str:
+        return self.path
+
+    @property
+    def duration(self) -> float:
+        """Duration of the audio in seconds."""
+        return self.ndarray.shape[-1] / self.sampling_rate
+
+
+class AudioLoader:
+    """Load and preprocess audio files into AudioData format."""
+
+    def __init__(
+        self,
+        device: str = "cpu",
+    ):
+        """Initialize AudioLoader.
+
+        Args:
+            device: Device to load audio tensors on (default: "cpu").
+        """
+        self.device = device
+        self._resampler_cache = {}
+
+    def _resample_audio(
+        self,
+        audio_sr: Tuple[torch.Tensor, int],
+        sampling_rate: int,
+        device: Optional[str],
+        channel_selector: Optional[ChannelSelectorType],
+    ) -> torch.Tensor:
+        """Resample audio to target sampling rate with channel selection.
+
+        Args:
+            audio_sr: Tuple of (audio_tensor, original_sample_rate).
+            sampling_rate: Target sampling rate.
+            device: Device to perform resampling on.
+            channel_selector: How to select channels.
+
+        Returns:
+            Resampled audio tensor of shape (1, T) or (C, T).
+        """
+        audio, sr = audio_sr
+
+        if channel_selector is None:
+            # keep the original multi-channel signal
+            tensor = audio
+        elif isinstance(channel_selector, int):
+            assert audio.shape[0] >= channel_selector, f"Invalid channel: {channel_selector}"
+            tensor = audio[channel_selector : channel_selector + 1].clone()
+            del audio
+        elif isinstance(channel_selector, str):
+            assert channel_selector == "average"
+            tensor = torch.mean(audio.to(device), dim=0, keepdim=True)
+            del audio
+        else:
+            raise ValueError(f"Unsupported channel_selector: {channel_selector}")
+        # assert isinstance(channel_selector, Iterable)
+        # num_channels = audio.shape[0]
+        # print(f"Selecting channels {channel_selector} from the signal with {num_channels} channels.")
+        # if max(channel_selector) >= num_channels:
+        #     raise ValueError(
+        #         f"Cannot select channel subset {channel_selector} from a signal with {num_channels} channels."
+        #     )
+        # tensor = audio[channel_selector]
+
+        tensor = tensor.to(device)
+        if sr != sampling_rate:
+            cache_key = (sr, sampling_rate, device)
+            if cache_key not in self._resampler_cache:
+                self._resampler_cache[cache_key] = get_or_create_resampler(sr, sampling_rate).to(device=device)
+            resampler = self._resampler_cache[cache_key]
+
+            length = tensor.size(-1)
+            chunk_size = sampling_rate * 3600
+            if length > chunk_size:
+                resampled_chunks = []
+                for i in range(0, length, chunk_size):
+                    resampled_chunks.append(resampler(tensor[..., i : i + chunk_size]))
+                tensor = torch.cat(resampled_chunks, dim=-1)
+            else:
+                tensor = resampler(tensor)
+
+        return tensor
+
+    def _load_audio(
+        self,
+        audio: Union[Pathlike, BinaryIO],
+        sampling_rate: int,
+        channel_selector: Optional[ChannelSelectorType],
+    ) -> torch.Tensor:
+        """Load audio from file or binary stream and resample to target rate.
+
+        Args:
+            audio: Path to audio file or binary stream.
+            sampling_rate: Target sampling rate.
+            channel_selector: How to select channels.
+
+        Returns:
+            Resampled audio tensor.
+
+        Raises:
+            ImportError: If PyAV is needed but not installed.
+            ValueError: If no audio stream found.
+            RuntimeError: If audio loading fails.
+        """
+        if isinstance(audio, Pathlike):
+            audio = str(Path(str(audio)).expanduser())
+
+        # load audio
+        try:
+            waveform, sample_rate = sf.read(audio, always_2d=True, dtype="float32")  # numpy array
+            waveform = waveform.T  # (channels, samples)
+        except Exception as primary_error:
+            # Fallback to PyAV for formats not supported by soundfile
+            try:
+                import av
+            except ImportError:
+                raise AudioLoadError(
+                    "PyAV (av) is required for loading certain audio formats. "
+                    f"Install it with: pip install av\n"
+                    f"Primary error was: {primary_error}"
+                )
+
+            try:
+                container = av.open(audio)
+                audio_stream = next((s for s in container.streams if s.type == "audio"), None)
+
+                if audio_stream is None:
+                    raise ValueError(f"No audio stream found in file: {audio}")
+
+                # Resample to target sample rate during decoding
+                audio_stream.codec_context.format = av.AudioFormat("flt")  # 32-bit float
+
+                frames = []
+                for frame in container.decode(audio_stream):
+                    # Convert frame to numpy array
+                    array = frame.to_ndarray()
+                    # Ensure shape is (channels, samples)
+                    if array.ndim == 1:
+                        array = array.reshape(1, -1)
+                    elif array.ndim == 2 and array.shape[0] > array.shape[1]:
+                        array = array.T
+                    frames.append(array)
+
+                container.close()
+
+                if not frames:
+                    raise ValueError(f"No audio data found in file: {audio}")
+
+                # Concatenate all frames
+                waveform = np.concatenate(frames, axis=1).astype(np.float32)  # (channels, samples)
+                sample_rate = audio_stream.codec_context.sample_rate
+            except Exception as e:
+                raise RuntimeError(f"Failed to load audio file {audio}: {e}")
+
+        return self._resample_audio(
+            (torch.from_numpy(waveform), sample_rate),
+            sampling_rate,
+            device=self.device,
+            channel_selector=channel_selector,
+        )
+
+    def __call__(
+        self,
+        audio: Union[Pathlike, BinaryIO],
+        sampling_rate: int = 16000,
+        channel_selector: Optional[ChannelSelectorType] = "average",
+    ) -> AudioData:
+        """
+        Args:
+            audio: Path to audio file or binary stream.
+            channel_selector: How to select channels (default: "average").
+            sampling_rate: Target sampling rate (default: use instance sampling_rate).
+
+        Returns:
+            AudioData namedtuple with sampling_rate, ndarray, and tensor fields.
+        """
+        tensor = self._load_audio(audio, sampling_rate, channel_selector)
+
+        # tensor is (1, T) or (C, T)
+        ndarray = tensor.cpu().numpy()
+
+        return AudioData(
+            sampling_rate=sampling_rate,
+            ndarray=ndarray,
+            tensor=tensor,
+            device=self.device,
+            path=str(audio) if isinstance(audio, Pathlike) else "<BinaryIO>",
+        )
lattifai/caption/__init__.py
ADDED
@@ -0,0 +1,20 @@
+from typing import List, Optional
+
+from lhotse.utils import Pathlike
+
+from ..config.caption import InputCaptionFormat
+from .caption import Caption
+from .gemini_reader import GeminiReader, GeminiSegment
+from .gemini_writer import GeminiWriter
+from .supervision import Supervision
+from .text_parser import normalize_text
+
+__all__ = [
+    "Caption",
+    "Supervision",
+    "GeminiReader",
+    "GeminiWriter",
+    "GeminiSegment",
+    "normalize_text",
+    "InputCaptionFormat",
+]
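Editor's note: a minimal usage sketch for the new AudioLoader in lattifai/audio2.py and the re-exports from lattifai/caption/__init__.py shown above. The file name speech.wav is a placeholder, and the described behaviour is read directly from the added code rather than from package documentation.

    from lattifai.audio2 import AudioLoader
    from lattifai.caption import Supervision, normalize_text  # re-exported by the new caption package

    loader = AudioLoader(device="cpu")
    # Decodes via soundfile (with a PyAV fallback), averages channels to mono,
    # and resamples to 16 kHz; recordings longer than an hour are resampled in chunks.
    data = loader("speech.wav", sampling_rate=16000, channel_selector="average")
    print(data.duration, data.tensor.shape)  # duration in seconds, tensor of shape (1, T)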