lattifai 0.4.4__py3-none-any.whl → 0.4.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,59 +1,110 @@
  import json
  import time
  from collections import defaultdict
- from typing import Any, BinaryIO, Dict, Tuple, Union
+ from typing import Any, BinaryIO, Dict, Iterable, Optional, Tuple, Union

  import numpy as np
  import onnxruntime as ort
- import resampy
  import soundfile as sf
  import torch
  from lhotse import FbankConfig
- from lhotse.audio import read_audio
+ from lhotse.augmentation import get_or_create_resampler
  from lhotse.features.kaldi.layers import Wav2LogFilterBank
  from lhotse.utils import Pathlike

  from lattifai.errors import AlignmentError, AudioFormatError, AudioLoadError, DependencyError, ModelLoadError

+ ChannelSelectorType = Union[int, Iterable[int], str]
+
+
+ def resample_audio(
+     audio_sr: Tuple[torch.Tensor, int],
+     sampling_rate: int,
+     device: Optional[str],
+     channel_selector: Optional[ChannelSelectorType] = "average",
+ ) -> torch.Tensor:
+     """
+     return:
+         (1, T)
+     """
+     audio, sr = audio_sr
+
+     if channel_selector is None:
+         # keep the original multi-channel signal
+         tensor = audio
+     elif isinstance(channel_selector, int):
+         assert audio.shape[0] >= channel_selector, f"Invalid channel: {channel_selector}"
+         tensor = audio[channel_selector : channel_selector + 1].clone()
+         del audio
+     elif isinstance(channel_selector, str):
+         assert channel_selector == "average"
+         tensor = torch.mean(audio.to(device), dim=0, keepdim=True)
+         del audio
+     else:
+         assert isinstance(channel_selector, Iterable)
+         num_channels = audio.shape[0]
+         print(f"Selecting channels {channel_selector} from the signal with {num_channels} channels.")
+         if max(channel_selector) >= num_channels:
+             raise ValueError(
+                 f"Cannot select channel subset {channel_selector} from a signal with {num_channels} channels."
+             )
+         tensor = audio[channel_selector]
+
+     tensor = tensor.to(device)
+     if sr != sampling_rate:
+         resampler = get_or_create_resampler(sr, sampling_rate).to(device=device)
+         length = tensor.size(-1)
+         chunk_size = sampling_rate * 3600
+         if length > chunk_size:
+             resampled_chunks = []
+             for i in range(0, length, chunk_size):
+                 resampled_chunks.append(resampler(tensor[..., i : i + chunk_size]))
+             tensor = torch.cat(resampled_chunks, dim=-1)
+         else:
+             tensor = resampler(tensor)
+
+     return tensor
+

  class Lattice1AlphaWorker:
      """Worker for processing audio with LatticeGraph."""

-     def __init__(self, model_path: Pathlike, device: str = 'cpu', num_threads: int = 8) -> None:
+     def __init__(self, model_path: Pathlike, device: str = "cpu", num_threads: int = 8) -> None:
          try:
-             self.config = json.load(open(f'{model_path}/config.json'))
+             self.config = json.load(open(f"{model_path}/config.json"))
          except Exception as e:
-             raise ModelLoadError(f'config from {model_path}', original_error=e)
+             raise ModelLoadError(f"config from {model_path}", original_error=e)

          # SessionOptions
          sess_options = ort.SessionOptions()
          # sess_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
          sess_options.intra_op_num_threads = num_threads  # CPU cores
          sess_options.execution_mode = ort.ExecutionMode.ORT_PARALLEL
-         sess_options.add_session_config_entry('session.intra_op.allow_spinning', '0')
+         sess_options.add_session_config_entry("session.intra_op.allow_spinning", "0")

          providers = []
-         if device.startswith('cuda') and ort.get_all_providers().count('CUDAExecutionProvider') > 0:
-             providers.append('CUDAExecutionProvider')
-         elif device.startswith('mps') and ort.get_all_providers().count('MPSExecutionProvider') > 0:
-             providers.append('MPSExecutionProvider')
+         if device.startswith("cuda") and ort.get_all_providers().count("CUDAExecutionProvider") > 0:
+             providers.append("CUDAExecutionProvider")
+         elif device.startswith("mps") and ort.get_all_providers().count("MPSExecutionProvider") > 0:
+             providers.append("MPSExecutionProvider")

          try:
              self.acoustic_ort = ort.InferenceSession(
-                 f'{model_path}/acoustic_opt.onnx',
+                 f"{model_path}/acoustic_opt.onnx",
                  sess_options,
-                 providers=providers + ['CoreMLExecutionProvider', 'CPUExecutionProvider'],
+                 providers=providers + ["CPUExecutionProvider", "CoreMLExecutionProvider"],
              )
          except Exception as e:
-             raise ModelLoadError(f'acoustic model from {model_path}', original_error=e)
+             raise ModelLoadError(f"acoustic model from {model_path}", original_error=e)

          try:
              config = FbankConfig(num_mel_bins=80, device=device, snip_edges=False)
              config_dict = config.to_dict()
-             config_dict.pop('device')
+             config_dict.pop("device")
              self.extractor = Wav2LogFilterBank(**config_dict).to(device).eval()
          except Exception as e:
-             raise ModelLoadError(f'feature extractor for device {device}', original_error=e)
+             raise ModelLoadError(f"feature extractor for device {device}", original_error=e)

          self.device = torch.device(device)
          self.timings = defaultdict(lambda: 0.0)
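
The new `resample_audio` helper above replaces the old resampy path: it selects or averages channels first, then resamples with lhotse's cached `get_or_create_resampler`, processing signals longer than `sampling_rate * 3600` samples in chunks to bound peak memory. A minimal usage sketch (the file name and shapes are illustrative, not from the package):

```python
import soundfile as sf
import torch

# soundfile returns (samples, channels) with always_2d=True; transpose to channel-first.
wav, sr = sf.read("speech.wav", always_2d=True, dtype="float32")
audio = torch.from_numpy(wav.T)  # (channels, samples)

mono = resample_audio((audio, sr), 16000, device="cpu")                          # average -> (1, T)
first = resample_audio((audio, sr), 16000, device="cpu", channel_selector=0)     # first channel -> (1, T)
multi = resample_audio((audio, sr), 16000, device="cpu", channel_selector=None)  # keep all -> (C, T)
```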
@@ -68,8 +119,8 @@ class Lattice1AlphaWorker:
            emissions = []
            for features in features_list:
                ort_inputs = {
-                     'features': features.cpu().numpy(),
-                     'feature_lengths': np.array([features.size(1)], dtype=np.int64),
+                     "features": features.cpu().numpy(),
+                     "feature_lengths": np.array([features.size(1)], dtype=np.int64),
                }
                emission = self.acoustic_ort.run(None, ort_inputs)[0]  # (1, T, vocab_size) numpy
                emissions.append(emission)
@@ -78,44 +129,40 @@ class Lattice1AlphaWorker:
            )  # (1, T, vocab_size)
        else:
            ort_inputs = {
-                 'features': features.cpu().numpy(),
-                 'feature_lengths': np.array([features.size(1)], dtype=np.int64),
+                 "features": features.cpu().numpy(),
+                 "feature_lengths": np.array([features.size(1)], dtype=np.int64),
            }
            emission = self.acoustic_ort.run(None, ort_inputs)[0]  # (1, T, vocab_size) numpy
            emission = torch.from_numpy(emission).to(self.device)

-         self.timings['emission'] += time.time() - _start
+         self.timings["emission"] += time.time() - _start
        return emission  # (1, T, vocab_size) torch

-     def load_audio(self, audio: Union[Pathlike, BinaryIO]) -> Tuple[torch.Tensor, int]:
+     def load_audio(
+         self, audio: Union[Pathlike, BinaryIO], channel_selector: Optional[ChannelSelectorType] = "average"
+     ) -> Tuple[torch.Tensor, int]:
        # load audio
        try:
-             waveform, sample_rate = read_audio(audio)  # numpy array
-             if len(waveform.shape) == 1:
-                 waveform = waveform.reshape([1, -1])  # (1, L)
-             else:  # make sure channel first
-                 if waveform.shape[0] > waveform.shape[1]:
-                     waveform = waveform.transpose(0, 1)
-                 # average multiple channels
-                 waveform = np.mean(waveform, axis=0, keepdims=True)  # (1, L)
+             waveform, sample_rate = sf.read(audio, always_2d=True, dtype="float32")  # numpy array
+             waveform = waveform.T  # (channels, samples)
        except Exception as primary_error:
            # Fallback to PyAV for formats not supported by soundfile
            try:
                import av
            except ImportError:
                raise DependencyError(
-                     'av (PyAV)', install_command='pip install av', context={'primary_error': str(primary_error)}
+                     "av (PyAV)", install_command="pip install av", context={"primary_error": str(primary_error)}
                )

            try:
                container = av.open(audio)
-                 audio_stream = next((s for s in container.streams if s.type == 'audio'), None)
+                 audio_stream = next((s for s in container.streams if s.type == "audio"), None)

                if audio_stream is None:
-                     raise AudioFormatError(str(audio), 'No audio stream found in file')
+                     raise AudioFormatError(str(audio), "No audio stream found in file")

                # Resample to target sample rate during decoding
-                 audio_stream.codec_context.format = av.AudioFormat('flt')  # 32-bit float
+                 audio_stream.codec_context.format = av.AudioFormat("flt")  # 32-bit float

                frames = []
                for frame in container.decode(audio_stream):
@@ -131,27 +178,20 @@ class Lattice1AlphaWorker:
                container.close()

                if not frames:
-                     raise AudioFormatError(str(audio), 'No audio data found in file')
+                     raise AudioFormatError(str(audio), "No audio data found in file")

                # Concatenate all frames
                waveform = np.concatenate(frames, axis=1)
-                 # Average multiple channels to mono
-                 if waveform.shape[0] > 1:
-                     waveform = np.mean(waveform, axis=0, keepdims=True)
-
                sample_rate = audio_stream.codec_context.sample_rate
            except Exception as e:
                raise AudioLoadError(str(audio), original_error=e)

-         try:
-             if sample_rate != self.config['sample_rate']:
-                 waveform = resampy.resample(waveform, sample_rate, self.config['sample_rate'], axis=1)
-         except Exception:
-             raise AudioFormatError(
-                 str(audio), f'Failed to resample from {sample_rate}Hz to {self.config["sample_rate"]}Hz'
-             )
-
-         return torch.from_numpy(waveform).to(self.device)  # (1, L)
+         return resample_audio(
+             (torch.from_numpy(waveform), sample_rate),
+             self.config.get("sampling_rate", 16000),
+             device=self.device.type,
+             channel_selector=channel_selector,
+         )

    def alignment(
        self, audio: Union[Union[Pathlike, BinaryIO], torch.tensor], lattice_graph: Tuple[str, int, float]
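
`load_audio` now loads with soundfile (falling back to PyAV) and defers channel handling and resampling to `resample_audio`, so callers choose the channel policy per call. A hedged sketch (the model directory and file paths are illustrative; note that the rewritten body returns a single tensor, so the `Tuple[torch.Tensor, int]` annotation reads as a holdover from 0.4.4):

```python
worker = Lattice1AlphaWorker("models/lattice1-alpha", device="cpu")

mono = worker.load_audio("interview.wav")                          # default "average" -> (1, T)
multi = worker.load_audio("interview.wav", channel_selector=None)  # keep all channels
second = worker.load_audio("interview.wav", channel_selector=1)    # second channel only
```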
@@ -181,21 +221,21 @@ class Lattice1AlphaWorker:
            emission = self.emission(waveform.to(self.device))  # (1, T, vocab_size)
        except Exception as e:
            raise AlignmentError(
-                 'Failed to compute acoustic features from audio',
-                 audio_path=str(audio) if not isinstance(audio, torch.Tensor) else 'tensor',
-                 context={'original_error': str(e)},
+                 "Failed to compute acoustic features from audio",
+                 audio_path=str(audio) if not isinstance(audio, torch.Tensor) else "tensor",
+                 context={"original_error": str(e)},
            )
-         self.timings['emission'] += time.time() - _start
+         self.timings["emission"] += time.time() - _start

        try:
            import k2
        except ImportError:
-             raise DependencyError('k2', install_command='pip install install-k2 && python -m install_k2')
+             raise DependencyError("k2", install_command="pip install install-k2 && python -m install_k2")

        try:
            from lattifai_core.lattice.decode import align_segments
        except ImportError:
-             raise DependencyError('lattifai_core', install_command='Contact support for lattifai_core installation')
+             raise DependencyError("lattifai_core", install_command="Contact support for lattifai_core installation")

        lattice_graph_str, final_state, acoustic_scale = lattice_graph

@@ -209,14 +249,14 @@ class Lattice1AlphaWorker:
            decoding_graph.return_id = int(final_state + 1)
        except Exception as e:
            raise AlignmentError(
-                 'Failed to create decoding graph from lattice',
-                 context={'original_error': str(e), 'lattice_graph_length': len(lattice_graph_str)},
+                 "Failed to create decoding graph from lattice",
+                 context={"original_error": str(e), "lattice_graph_length": len(lattice_graph_str)},
            )
-         self.timings['decoding_graph'] += time.time() - _start
+         self.timings["decoding_graph"] += time.time() - _start

        _start = time.time()
-         if self.device.type == 'mps':
-             device = 'cpu'  # k2 does not support mps yet
+         if self.device.type == "mps":
+             device = "cpu"  # k2 does not support mps yet
        else:
            device = self.device

@@ -234,11 +274,11 @@ class Lattice1AlphaWorker:
            )
        except Exception as e:
            raise AlignmentError(
-                 'Failed to perform forced alignment',
-                 audio_path=str(audio) if not isinstance(audio, torch.Tensor) else 'tensor',
-                 context={'original_error': str(e), 'emission_shape': list(emission.shape), 'device': str(device)},
+                 "Failed to perform forced alignment",
+                 audio_path=str(audio) if not isinstance(audio, torch.Tensor) else "tensor",
+                 context={"original_error": str(e), "emission_shape": list(emission.shape), "device": str(device)},
            )
-         self.timings['align_segments'] += time.time() - _start
+         self.timings["align_segments"] += time.time() - _start

        channel = 0
        return emission, results, labels, 0.02, 0.0, channel  # frame_shift=20ms, offset=0.0s
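
The trailing constants encode a 20 ms frame shift and a 0.0 s offset, so callers can map aligned frame indices to seconds. A sketch of the conversion (the layout of `results` is not shown in this diff; the frame values are illustrative):

```python
emission, results, labels, frame_shift, offset, channel = worker.alignment(audio, lattice_graph)

start_frame, end_frame = 100, 150             # e.g. one aligned token span
start_s = offset + start_frame * frame_shift  # 0.0 + 100 * 0.02 = 2.0 s
end_s = offset + end_frame * frame_shift      # 0.0 + 150 * 0.02 = 3.0 s
```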
@@ -20,15 +20,15 @@ from .base import WorkflowAgent, WorkflowResult, WorkflowStep
  from .file_manager import FileExistenceManager

  __all__ = [
-     'WorkflowAgent',
-     'WorkflowStep',
-     'WorkflowResult',
-     'YouTubeSubtitleAgent',
-     'FileExistenceManager',
-     'GeminiReader',
-     'GeminiWriter',
-     'SUBTITLE_FORMATS',
-     'INPUT_SUBTITLE_FORMATS',
-     'OUTPUT_SUBTITLE_FORMATS',
-     'ALL_SUBTITLE_FORMATS',
+     "WorkflowAgent",
+     "WorkflowStep",
+     "WorkflowResult",
+     "YouTubeSubtitleAgent",
+     "FileExistenceManager",
+     "GeminiReader",
+     "GeminiWriter",
+     "SUBTITLE_FORMATS",
+     "INPUT_SUBTITLE_FORMATS",
+     "OUTPUT_SUBTITLE_FORMATS",
+     "ALL_SUBTITLE_FORMATS",
  ]
@@ -8,3 +8,5 @@ An agentic workflow for processing YouTube(or more) videos through:
  """

  from .youtube import YouTubeSubtitleAgent
+
+ __all__ = ["YouTubeSubtitleAgent"]
@@ -7,20 +7,20 @@ import logging
  import time
  from dataclasses import dataclass
  from enum import Enum
- from typing import Any, Dict, List, Optional, Union
+ from typing import Any, Dict, List, Optional

  import colorful


  def setup_workflow_logger(name: str) -> logging.Logger:
      """Setup a logger with consistent formatting for workflow modules"""
-     logger = logging.getLogger(f'workflows.{name}')
+     logger = logging.getLogger(f"workflows.{name}")

      # Only add handler if it doesn't exist
      if not logger.handlers:
          handler = logging.StreamHandler()
          formatter = logging.Formatter(
-             '%(asctime)s - %(name)+17s.py:%(lineno)-4d - %(levelname)-8s - %(message)s', datefmt='%Y-%m-%d %H:%M:%S'
+             "%(asctime)s - %(name)+17s.py:%(lineno)-4d - %(levelname)-8s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S"
          )
          handler.setFormatter(formatter)
          logger.addHandler(handler)
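
With the format string above, each record renders the logger name right-aligned to 17 characters, the line number left-aligned to 4, and the level left-aligned to 8. A quick sketch (timestamp and line number are illustrative):

```python
logger = setup_workflow_logger("youtube")
logger.info("Downloading subtitles...")
# 2025-01-01 12:00:00 - workflows.youtube.py:42   - INFO     - Downloading subtitles...
```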
@@ -30,17 +30,17 @@ def setup_workflow_logger(name: str) -> logging.Logger:
      return logger


- logger = setup_workflow_logger('base')
+ logger = setup_workflow_logger("base")


  class WorkflowStatus(Enum):
      """Workflow execution status"""

-     PENDING = 'pending'
-     RUNNING = 'running'
-     COMPLETED = 'completed'
-     FAILED = 'failed'
-     RETRYING = 'retrying'
+     PENDING = "pending"
+     RUNNING = "running"
+     COMPLETED = "completed"
+     FAILED = "failed"
+     RETRYING = "retrying"


  @dataclass
@@ -84,7 +84,7 @@ class WorkflowAgent(abc.ABC):
        self.name = name
        self.max_retries = max_retries
        self.steps: List[WorkflowStep] = []
-         self.logger = setup_workflow_logger('agent')
+         self.logger = setup_workflow_logger("agent")

    @abc.abstractmethod
    def define_steps(self) -> List[WorkflowStep]:
@@ -111,11 +111,11 @@ class WorkflowAgent(abc.ABC):
        context = kwargs.copy()
        step_results = []

-         self.logger.info(colorful.bold_white_on_green(f'🚀 Starting workflow: {self.name}'))
+         self.logger.info(colorful.bold_white_on_green(f"🚀 Starting workflow: {self.name}"))

        try:
            for i, step in enumerate(self.steps):
-                 step_info = f'📋 Step {i + 1}/{len(self.steps)}: {step.name}'
+                 step_info = f"📋 Step {i + 1}/{len(self.steps)}: {step.name}"
                self.logger.info(colorful.bold_white_on_green(step_info))

                step_start = time.time()
@@ -123,17 +123,17 @@ class WorkflowAgent(abc.ABC):
                step_duration = time.time() - step_start

                step_results.append(
-                     {'step_name': step.name, 'status': 'completed', 'duration': step_duration, 'result': step_result}
+                     {"step_name": step.name, "status": "completed", "duration": step_duration, "result": step_result}
                )

                # Update context with step result
-                 context[f'step_{i}_result'] = step_result
+                 context[f"step_{i}_result"] = step_result
                context[f'{step.name.lower().replace(" ", "_")}_result'] = step_result

-                 self.logger.info(f'✅ Step {i + 1} completed in {step_duration:.2f}s')
+                 self.logger.info(f"✅ Step {i + 1} completed in {step_duration:.2f}s")

            execution_time = time.time() - start_time
-             self.logger.info(f'🎉 Workflow completed in {execution_time:.2f}s')
+             self.logger.info(f"🎉 Workflow completed in {execution_time:.2f}s")

            return WorkflowResult(
                status=WorkflowStatus.COMPLETED, data=context, execution_time=execution_time, step_results=step_results
@@ -145,9 +145,9 @@ class WorkflowAgent(abc.ABC):
            from lattifai.errors import LattifAIError

            if isinstance(e, LattifAIError):
-                 self.logger.error(f'❌ Workflow failed after {execution_time:.2f}s: [{e.error_code}] {e.message}')
+                 self.logger.error(f"❌ Workflow failed after {execution_time:.2f}s: [{e.error_code}] {e.message}")
            else:
-                 self.logger.error(f'❌ Workflow failed after {execution_time:.2f}s: {str(e)}')
+                 self.logger.error(f"❌ Workflow failed after {execution_time:.2f}s: {str(e)}")

            return WorkflowResult(
                status=WorkflowStatus.FAILED,
@@ -164,7 +164,7 @@ class WorkflowAgent(abc.ABC):
        for attempt in range(step.max_retries + 1):
            try:
                if attempt > 0:
-                     self.logger.info(f'🔄 Retrying step {step.name} (attempt {attempt + 1}/{step.max_retries + 1})')
+                     self.logger.info(f"🔄 Retrying step {step.name} (attempt {attempt + 1}/{step.max_retries + 1})")

                result = await self.execute_step(step, context)
                return result
@@ -176,14 +176,14 @@ class WorkflowAgent(abc.ABC):
                # For LattifAI errors, show simplified message in logs
                from lattifai.errors import LattifAIError

-                 error_summary = f'[{e.error_code}]' if isinstance(e, LattifAIError) else str(e)[:100]
+                 error_summary = f"[{e.error_code}]" if isinstance(e, LattifAIError) else str(e)[:100]

                if step.should_retry():
-                     self.logger.warning(f'⚠️ Step {step.name} failed: {error_summary}. Retrying...')
+                     self.logger.warning(f"⚠️ Step {step.name} failed: {error_summary}. Retrying...")
                    continue
                else:
                    self.logger.error(
-                         f'❌ Step {step.name} failed after {step.max_retries + 1} attempts: {error_summary}'
+                         f"❌ Step {step.name} failed after {step.max_retries + 1} attempts: {error_summary}"
                    )
                    raise e
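
Taken together, the retry loop above runs each step up to `max_retries + 1` times, warning between attempts and re-raising the last error once retries are exhausted. A hedged sketch of the behavior, assuming `should_retry()` tracks attempts against `max_retries` (the `WorkflowStep` constructor fields shown are assumptions):

```python
step = WorkflowStep(name="Download Audio", max_retries=2)

# Attempt 1 fails -> "⚠️ Step Download Audio failed: [...]. Retrying..."
# Attempt 2 fails -> same warning, one more try
# Attempt 3 fails -> "❌ Step Download Audio failed after 3 attempts: [...]" and the error is re-raised
```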