lattifai 1.0.4__py3-none-any.whl → 1.0.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lattifai/__init__.py +10 -0
- lattifai/alignment/lattice1_aligner.py +33 -13
- lattifai/alignment/lattice1_worker.py +121 -50
- lattifai/alignment/segmenter.py +3 -2
- lattifai/alignment/tokenizer.py +3 -3
- lattifai/audio2.py +269 -70
- lattifai/caption/caption.py +161 -3
- lattifai/cli/alignment.py +2 -1
- lattifai/cli/app_installer.py +35 -33
- lattifai/cli/caption.py +8 -18
- lattifai/cli/server.py +3 -1
- lattifai/cli/transcribe.py +53 -38
- lattifai/cli/youtube.py +1 -0
- lattifai/client.py +16 -11
- lattifai/config/alignment.py +23 -2
- lattifai/config/caption.py +1 -1
- lattifai/config/media.py +23 -3
- lattifai/errors.py +7 -3
- lattifai/mixin.py +26 -15
- lattifai/server/app.py +2 -1
- lattifai/utils.py +37 -0
- lattifai/workflow/file_manager.py +15 -13
- lattifai/workflow/youtube.py +16 -1
- {lattifai-1.0.4.dist-info → lattifai-1.0.5.dist-info}/METADATA +65 -15
- {lattifai-1.0.4.dist-info → lattifai-1.0.5.dist-info}/RECORD +29 -29
- {lattifai-1.0.4.dist-info → lattifai-1.0.5.dist-info}/licenses/LICENSE +1 -1
- {lattifai-1.0.4.dist-info → lattifai-1.0.5.dist-info}/WHEEL +0 -0
- {lattifai-1.0.4.dist-info → lattifai-1.0.5.dist-info}/entry_points.txt +0 -0
- {lattifai-1.0.4.dist-info → lattifai-1.0.5.dist-info}/top_level.txt +0 -0
lattifai/audio2.py
CHANGED
@@ -2,7 +2,7 @@
 
 from collections import namedtuple
 from pathlib import Path
-from typing import BinaryIO,
+from typing import BinaryIO, Optional, Tuple, Union
 
 import numpy as np
 import soundfile as sf
@@ -16,8 +16,14 @@ from lattifai.errors import AudioLoadError
 ChannelSelectorType = Union[int, str]
 
 
-class AudioData(namedtuple("AudioData", ["sampling_rate", "ndarray", "
-    """Audio data container with sampling rate
+class AudioData(namedtuple("AudioData", ["sampling_rate", "ndarray", "path", "streaming_chunk_secs", "overlap_secs"])):
+    """Audio data container with sampling rate and numpy array.
+
+    Supports iteration to stream audio chunks for processing long audio files.
+    The streaming_chunk_secs field indicates whether streaming mode should be used downstream.
+    The overlap_secs field specifies the overlap duration between consecutive chunks.
+    Note: tensor field removed to reduce memory usage. Convert ndarray to tensor on-demand.
+    """
 
     def __str__(self) -> str:
         return self.path
@@ -27,6 +33,66 @@ class AudioData(namedtuple("AudioData", ["sampling_rate", "ndarray", "tensor", "
         """Duration of the audio in seconds."""
         return self.ndarray.shape[-1] / self.sampling_rate
 
+    @property
+    def streaming_mode(self) -> bool:
+        """Indicates whether streaming mode is enabled based on streaming_chunk_secs."""
+        if self.streaming_chunk_secs is not None:
+            return self.duration > self.streaming_chunk_secs * 1.1
+        return False
+
+    def __iter__(self):
+        """Initialize iterator for chunk-based audio streaming.
+
+        Returns an iterator that yields audio chunks as AudioData instances.
+        Uses streaming_chunk_secs and overlap_secs from the instance.
+        """
+        return self.iter_chunks()
+
+    def iter_chunks(
+        self,
+        chunk_secs: Optional[float] = None,
+        overlap_secs: Optional[float] = None,
+    ):
+        """Iterate over audio chunks with configurable duration and overlap.
+
+        Args:
+            chunk_secs: Duration of each chunk in seconds (default: uses streaming_chunk_secs or 600.0).
+            overlap_secs: Overlap between consecutive chunks in seconds (default: uses overlap_secs or 0.0).
+
+        Yields:
+            AudioData: Chunks of audio data.
+
+        Example:
+            >>> audio = loader("long_audio.wav")
+            >>> for chunk in audio.iter_chunks(chunk_secs=60.0, overlap_secs=2.0):
+            ...     process(chunk)
+        """
+        chunk_duration = chunk_secs or self.streaming_chunk_secs or 600.0
+        overlap_duration = overlap_secs or self.overlap_secs or 0.0
+
+        chunk_size = int(chunk_duration * self.sampling_rate)
+        overlap_size = int(overlap_duration * self.sampling_rate)
+        step_size = chunk_size - overlap_size
+        total_samples = self.ndarray.shape[-1]
+
+        current_offset = 0
+        while current_offset < total_samples:
+            start = current_offset
+            end = min(start + chunk_size, total_samples)
+
+            # Extract chunk from ndarray only
+            chunk_ndarray = self.ndarray[..., start:end]
+
+            yield AudioData(
+                sampling_rate=self.sampling_rate,
+                ndarray=chunk_ndarray,
+                path=f"{self.path}[{start/self.sampling_rate:.2f}s-{end/self.sampling_rate:.2f}s]",
+                streaming_chunk_secs=None,
+                overlap_secs=None,
+            )
+
+            current_offset += step_size
+
 
 class AudioLoader:
     """Load and preprocess audio files into AudioData format."""
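Reviewer note: the new iterator advances by chunk_size - overlap_size samples, so consecutive chunks share overlap_secs of audio and the final chunk may be shorter. A minimal sketch of driving it directly, using the AudioData class from this diff with a synthetic signal (the 20-second buffer and chunk sizes are illustrative, not from the package):

import numpy as np

# Synthetic 20 s of silence at 16 kHz, standing in for loader output.
sr = 16000
audio = AudioData(
    sampling_rate=sr,
    ndarray=np.zeros((1, sr * 20), dtype=np.float32),
    path="example.wav",
    streaming_chunk_secs=600.0,
    overlap_secs=0.0,
)

# 6 s chunks with 1 s overlap: starts advance by 5 s -> 0-6, 5-11, 10-16, 15-20.
for chunk in audio.iter_chunks(chunk_secs=6.0, overlap_secs=1.0):
    print(chunk.path, round(chunk.duration, 2))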
@@ -45,62 +111,48 @@ class AudioLoader:
 
     def _resample_audio(
         self,
-        audio_sr: Tuple[
+        audio_sr: Tuple[np.ndarray, int],
         sampling_rate: int,
         device: Optional[str],
         channel_selector: Optional[ChannelSelectorType],
-    ) ->
+    ) -> np.ndarray:
         """Resample audio to target sampling rate with channel selection.
 
         Args:
-            audio_sr: Tuple of (
+            audio_sr: Tuple of (audio, original_sample_rate).
             sampling_rate: Target sampling rate.
             device: Device to perform resampling on.
             channel_selector: How to select channels.
 
         Returns:
-            Resampled audio
+            Resampled audio array of shape (1, T) or (C, T).
         """
         audio, sr = audio_sr
 
         if channel_selector is None:
             # keep the original multi-channel signal
-            tensor = audio
+            tensor = audio.T
+            del audio  # Free original audio memory
         elif isinstance(channel_selector, int):
-            assert audio.shape[
-            tensor = audio[channel_selector : channel_selector + 1].
+            assert audio.shape[1] >= channel_selector, f"Invalid channel: {channel_selector}"
+            tensor = audio[:, channel_selector : channel_selector + 1].T.copy()
             del audio
         elif isinstance(channel_selector, str):
             assert channel_selector == "average"
-            tensor =
+            tensor = np.mean(audio, axis=1, keepdims=True).T
             del audio
         else:
             raise ValueError(f"Unsupported channel_selector: {channel_selector}")
-
-
-        # print(f"Selecting channels {channel_selector} from the signal with {num_channels} channels.")
-        # if max(channel_selector) >= num_channels:
-        #     raise ValueError(
-        #         f"Cannot select channel subset {channel_selector} from a signal with {num_channels} channels."
-        #     )
-        #     tensor = audio[channel_selector]
-
-        tensor = tensor.to(device)
+
+        # tensor: np.ndarray (channels, samples)
         if sr != sampling_rate:
             cache_key = (sr, sampling_rate, device)
             if cache_key not in self._resampler_cache:
                 self._resampler_cache[cache_key] = get_or_create_resampler(sr, sampling_rate).to(device=device)
             resampler = self._resampler_cache[cache_key]
 
-
-            if length > chunk_size:
-                resampled_chunks = []
-                for i in range(0, length, chunk_size):
-                    resampled_chunks.append(resampler(tensor[..., i : i + chunk_size]))
-                tensor = torch.cat(resampled_chunks, dim=-1)
-            else:
-                tensor = resampler(tensor)
+            tensor = resampler(torch.from_numpy(tensor).to(device=device))
+            tensor = tensor.cpu().numpy()
 
         return tensor
 
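Reviewer note: _resample_audio now keeps data in NumPy end to end and only round-trips through torch for the actual resampling. The package's get_or_create_resampler is internal; below is a minimal standalone sketch of the same cache-per-(rate, device) pattern, assuming torchaudio's Resample transform as the backend (an assumption, not the package's own helper):

import torch
import torchaudio

_resampler_cache = {}

def resample(ndarray, sr, target_sr, device="cpu"):
    """Resample a float32 (channels, samples) array, reusing one Resample module per key."""
    if sr == target_sr:
        return ndarray
    key = (sr, target_sr, device)
    if key not in _resampler_cache:
        _resampler_cache[key] = torchaudio.transforms.Resample(sr, target_sr).to(device)
    tensor = torch.from_numpy(ndarray).to(device)
    return _resampler_cache[key](tensor).cpu().numpy()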
@@ -109,7 +161,7 @@ class AudioLoader:
         audio: Union[Pathlike, BinaryIO],
         sampling_rate: int,
         channel_selector: Optional[ChannelSelectorType],
-    ) ->
+    ) -> np.ndarray:
         """Load audio from file or binary stream and resample to target rate.
 
         Args:
@@ -118,7 +170,7 @@ class AudioLoader:
         channel_selector: How to select channels.
 
         Returns:
-            Resampled audio
+            Resampled audio as a NumPy array of shape (channels, samples).
 
         Raises:
             ImportError: If PyAV is needed but not installed.
@@ -128,11 +180,69 @@ class AudioLoader:
         if isinstance(audio, Pathlike):
             audio = str(Path(str(audio)).expanduser())
 
-        # load audio
+        # load audio in chunks to reduce memory footprint for long files
         try:
-
-
+            # First check file duration to decide loading strategy
+            info = sf.info(audio)
+            duration = info.duration
+
+            # For very long audio (>60 minutes), use chunk-based loading
+            if duration > 3600:  # 60 minutes
+                with sf.SoundFile(audio, "r") as f:
+                    sample_rate = f.samplerate
+                    total_frames = f.frames
+
+                    # Pre-calculate output size to avoid list accumulation
+                    num_channels = 1 if channel_selector else f.channels
+                    expected_output_samples = int(total_frames * sampling_rate / sample_rate)
+
+                    # Pre-allocate output array
+                    waveform = np.zeros((num_channels, expected_output_samples), dtype=np.float32)
+
+                    # Use source sample rate for reading, not target
+                    chunk_frames = int(sample_rate * 1800)  # 30-minute chunks at source rate
+                    output_offset = 0
+
+                    while True:
+                        chunk = f.read(frames=chunk_frames, dtype="float32", always_2d=True)
+                        if chunk.size == 0:
+                            break
+
+                        # Resample chunk -> (channels, samples)
+                        resampled_chunk = self._resample_audio(
+                            (chunk, sample_rate),
+                            sampling_rate,
+                            device=self.device,
+                            channel_selector=channel_selector,
+                        )
+
+                        # Write directly to pre-allocated array
+                        chunk_length = resampled_chunk.shape[-1]
+                        waveform[..., output_offset : output_offset + chunk_length] = resampled_chunk
+                        output_offset += chunk_length
+
+                        # Clean up immediately
+                        del chunk, resampled_chunk
+
+                    # Trim to actual size if needed (due to rounding in resampling)
+                    if output_offset < expected_output_samples:
+                        waveform = waveform[..., :output_offset]
+
+                    return waveform
+            else:
+                # For shorter audio, use standard loading
+                waveform, sample_rate = sf.read(audio, always_2d=True, dtype="float32")
+                # Resample and return directly to avoid double processing
+                result = self._resample_audio(
+                    (waveform, sample_rate),
+                    sampling_rate,
+                    device=self.device,
+                    channel_selector=channel_selector,
+                )
+                del waveform
+                return result
         except Exception as primary_error:
+            print(f"Primary error with soundfile: {primary_error}")
             # Fallback to PyAV for formats not supported by soundfile
             try:
                 import av
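Reviewer note: the soundfile path now probes the header first and only decodes whole files that are an hour long or shorter. A minimal standalone sketch of the same probe-then-stream pattern (file name, threshold, and block size are illustrative):

import soundfile as sf

def read_in_blocks(path, block_secs=1800.0):
    """Yield float32 (samples, channels) blocks without decoding the whole file at once."""
    if sf.info(path).duration <= 3600:  # header-only probe, no audio decode
        yield sf.read(path, always_2d=True, dtype="float32")[0]
        return
    with sf.SoundFile(path, "r") as f:
        frames_per_block = int(f.samplerate * block_secs)
        while True:
            block = f.read(frames=frames_per_block, dtype="float32", always_2d=True)
            if block.size == 0:
                break
            yield block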
@@ -150,62 +260,151 @@ class AudioLoader:
                 if audio_stream is None:
                     raise ValueError(f"No audio stream found in file: {audio}")
 
-                # Resample to target sample rate during decoding
                 audio_stream.codec_context.format = av.AudioFormat("flt")  # 32-bit float
-
-                frames = []
-                for frame in container.decode(audio_stream):
-                    # Convert frame to numpy array
-                    array = frame.to_ndarray()
-                    # Ensure shape is (channels, samples)
-                    if array.ndim == 1:
-                        array = array.reshape(1, -1)
-                    elif array.ndim == 2 and array.shape[0] > array.shape[1]:
-                        array = array.T
-                    frames.append(array)
-
-                container.close()
-
-                if not frames:
-                    raise ValueError(f"No audio data found in file: {audio}")
-
-                # Concatenate all frames
-                waveform = np.concatenate(frames, axis=1).astype(np.float32)  # (channels, samples)
                 sample_rate = audio_stream.codec_context.sample_rate
+
+                # Estimate duration to decide processing strategy
+                duration_estimate = None
+                if audio_stream.duration and audio_stream.time_base:
+                    duration_estimate = float(audio_stream.duration * audio_stream.time_base)
+                else:
+                    print(f"WARNING: Failed to estimate duration for audio: {audio}")
+
+                # For very long audio (>30 minutes), process and resample in chunks
+                if duration_estimate and duration_estimate > 1800:
+                    # Estimate output size and pre-allocate with buffer
+                    num_channels = 1 if channel_selector else audio_stream.codec_context.channels
+                    estimated_samples = int(duration_estimate * sampling_rate * 1.1)  # 10% buffer
+                    waveform = np.zeros((num_channels, estimated_samples), dtype=np.float32)
+
+                    frames = []
+                    accumulated_samples = 0
+                    output_offset = 0
+                    chunk_sample_target = int(sample_rate * 600)  # 10 minutes at original rate
+
+                    for frame in container.decode(audio_stream):
+                        array = frame.to_ndarray()
+
+                        # Ensure shape is (samples, channels)
+                        if array.ndim == 1:
+                            array = array.reshape(-1, 1)
+                        elif array.ndim == 2 and array.shape[0] < array.shape[1]:
+                            array = array.T
+
+                        frames.append(array)
+                        accumulated_samples += array.shape[0]
+
+                        # Process chunk when accumulated enough samples
+                        if accumulated_samples >= chunk_sample_target:
+                            chunk = np.concatenate(frames, axis=0).astype(np.float32)
+                            del frames  # Free frames list before resampling
+                            # Resample chunk -> (channels, samples)
+                            resampled_chunk = self._resample_audio(
+                                (chunk, sample_rate),
+                                sampling_rate,
+                                device=self.device,
+                                channel_selector=channel_selector,
+                            )
+
+                            chunk_length = resampled_chunk.shape[-1]
+                            if output_offset + chunk_length > waveform.shape[-1]:
+                                print(
+                                    f"WARNING: Trimming resampled chunk from {chunk_length} to {waveform.shape[-1] - output_offset} samples to fit waveform buffer for audio: {audio}"  # noqa: E501
+                                )
+                                resampled_chunk = resampled_chunk[:, : waveform.shape[-1] - output_offset]
+
+                            # Write directly to array
+                            waveform[..., output_offset : output_offset + chunk_length] = resampled_chunk
+                            output_offset += chunk_length
+
+                            # Clean up immediately
+                            del chunk, resampled_chunk
+                            frames = []  # Create new list
+                            accumulated_samples = 0
+
+                    # Process remaining frames
+                    if frames:
+                        chunk = np.concatenate(frames, axis=0).astype(np.float32)
+                        del frames
+                        resampled_chunk = self._resample_audio(
+                            (chunk, sample_rate),
+                            sampling_rate,
+                            device=self.device,
+                            channel_selector=channel_selector,
+                        )
+
+                        chunk_length = resampled_chunk.shape[-1]
+                        if output_offset + chunk_length > waveform.shape[-1]:
+                            print(
+                                f"WARNING: Trimming resampled chunk from {chunk_length} to {waveform.shape[-1] - output_offset} samples to fit waveform buffer for audio: {audio}"  # noqa: E501
+                            )
+                            resampled_chunk = resampled_chunk[:, : waveform.shape[-1] - output_offset]
+
+                        waveform[..., output_offset : output_offset + chunk_length] = resampled_chunk
+                        output_offset += chunk_length
+                        del chunk, resampled_chunk
+
+                    container.close()
+
+                    if output_offset == 0:
+                        raise ValueError(f"No audio data found in file: {audio}")
+
+                    # Trim to actual size
+                    waveform = waveform[..., :output_offset]
+                    return waveform
+                else:
+                    # For shorter audio, process in batches to reduce memory
+                    frames = []
+                    for frame in container.decode(audio_stream):
+                        array = frame.to_ndarray()
+                        # Ensure shape is (channels, samples)
+                        if array.ndim == 1:
+                            array = array.reshape(-1, 1)
+                        elif array.ndim == 2 and array.shape[0] < array.shape[1]:
+                            array = array.T
+                        frames.append(array)
+                    container.close()
+
+                    if not frames:
+                        raise ValueError(f"No audio data found in file: {audio}")
+
+                    # Concatenate remaining frames
+                    waveform = np.concatenate(frames, axis=0).astype(np.float32)
+                    del frames
+                    # Resample and return directly
+                    result = self._resample_audio(
+                        (waveform, sample_rate),
+                        sampling_rate,
+                        device=self.device,
+                        channel_selector=channel_selector,
+                    )
+                    del waveform
+                    return result
             except Exception as e:
                 raise RuntimeError(f"Failed to load audio file {audio}: {e}")
 
-        return self._resample_audio(
-            (torch.from_numpy(waveform), sample_rate),
-            sampling_rate,
-            device=self.device,
-            channel_selector=channel_selector,
-        )
-
     def __call__(
         self,
         audio: Union[Pathlike, BinaryIO],
         sampling_rate: int = 16000,
         channel_selector: Optional[ChannelSelectorType] = "average",
+        streaming_chunk_secs: Optional[float] = None,
     ) -> AudioData:
         """
         Args:
             audio: Path to audio file or binary stream.
             channel_selector: How to select channels (default: "average").
             sampling_rate: Target sampling rate (default: use instance sampling_rate).
+            streaming_chunk_secs: Duration in seconds for streaming chunks (default: None, disabled).
 
         Returns:
-            AudioData namedtuple with sampling_rate, ndarray, and
+            AudioData namedtuple with sampling_rate, ndarray, and streaming_chunk_secs fields.
         """
-
-
-        # tensor is (1, T) or (C, T)
-        ndarray = tensor.cpu().numpy()
-
+        ndarray = self._load_audio(audio, sampling_rate, channel_selector)
         return AudioData(
             sampling_rate=sampling_rate,
             ndarray=ndarray,
-            tensor=tensor,
-            device=self.device,
             path=str(audio) if isinstance(audio, Pathlike) else "<BinaryIO>",
+            streaming_chunk_secs=streaming_chunk_secs,
+            overlap_secs=0.0,
        )
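Reviewer note: taken together, the audio2.py changes drop the tensor/device fields from AudioData and thread streaming_chunk_secs from __call__ through to the chunk iterator. A hypothetical end-to-end sketch (AudioLoader's constructor arguments and the run_alignment consumer are assumptions; only the __call__ signature above is from this diff):

loader = AudioLoader()  # constructor args assumed
audio = loader("podcast.wav", sampling_rate=16000, streaming_chunk_secs=600.0)

if audio.streaming_mode:      # True only when duration > 1.1 * streaming_chunk_secs
    for chunk in audio:       # __iter__ delegates to iter_chunks()
        run_alignment(chunk)  # hypothetical downstream consumer
else:
    run_alignment(audio)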
lattifai/caption/caption.py
CHANGED
@@ -307,7 +307,7 @@ class Caption:
         cls,
         path: Pathlike,
         format: Optional[str] = None,
-        normalize_text: bool =
+        normalize_text: bool = True,
     ) -> "Caption":
         """
         Read caption file and return Caption object.
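Reviewer note: text normalization is now on by default when reading captions, so callers that need the raw text must opt out explicitly. A sketch, assuming this classmethod is Caption.read (the method name is not visible in the hunk):

caption = Caption.read("episode.srt")                    # normalized by default
raw = Caption.read("episode.srt", normalize_text=False)  # opt out explicitly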
@@ -505,6 +505,8 @@ class Caption:
             cls._write_csv(alignments, output_path, include_speaker_in_text)
         elif str(output_path)[-4:].lower() == ".aud":
             cls._write_aud(alignments, output_path, include_speaker_in_text)
+        elif str(output_path)[-4:].lower() == ".sbv":
+            cls._write_sbv(alignments, output_path, include_speaker_in_text)
         else:
             import pysubs2
 
@@ -535,7 +537,14 @@ class Caption:
                     name=sup.speaker or "",
                 )
             )
-
+
+        # MicroDVD format requires framerate to be specified
+        output_ext = str(output_path).lower().split(".")[-1]
+        if output_ext == "sub":
+            # Default to 25 fps for MicroDVD format if not specified
+            subs.save(output_path, fps=25.0)
+        else:
+            subs.save(output_path)
 
         return output_path
 
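Reviewer note: MicroDVD (.sub) is frame-based, so pysubs2 needs a framerate at save time, while time-based formats do not. A minimal pysubs2 illustration of the rule encoded above (pysubs2 is the dependency imported in this file; 25 fps matches the default chosen here):

import pysubs2

subs = pysubs2.SSAFile()
subs.append(pysubs2.SSAEvent(start=0, end=2000, text="Hello"))  # times in ms
subs.save("out.sub", fps=25.0)  # MicroDVD: frame numbers need a framerate
subs.save("out.srt")            # SRT and friends: timestamps, no fps needed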
@@ -821,7 +830,7 @@ class Caption:
         if cls._is_youtube_vtt_with_word_timestamps(content):
             return cls._parse_youtube_vtt_with_word_timestamps(content, normalize_text)
 
-        if format == "gemini" or str(caption).endswith("Gemini.md"):
+        if format == "gemini" or str(caption).endswith("Gemini.md") or str(caption).endswith("Gemini3.md"):
             from .gemini_reader import GeminiReader
 
             supervisions = GeminiReader.extract_for_alignment(caption)
@@ -850,6 +859,8 @@ class Caption:
             supervisions = cls._parse_csv(caption, normalize_text)
         elif format == "aud" or str(caption)[-4:].lower() == ".aud":
             supervisions = cls._parse_aud(caption, normalize_text)
+        elif format == "sbv" or str(caption)[-4:].lower() == ".sbv":
+            supervisions = cls._parse_sbv(caption, normalize_text)
         elif format == "txt" or (format == "auto" and str(caption)[-4:].lower() == ".txt"):
             if not Path(str(caption)).exists():  # str
                 lines = [line.strip() for line in str(caption).split("\n")]
@@ -1113,6 +1124,101 @@ class Caption:
 
         return supervisions
 
+    @classmethod
+    def _parse_sbv(cls, caption: Pathlike, normalize_text: Optional[bool] = False) -> List[Supervision]:
+        """
+        Parse SubViewer (SBV) format caption file.
+
+        Format:
+            0:00:00.000,0:00:02.000
+            Text line 1
+
+            0:00:02.000,0:00:04.000
+            Text line 2
+
+        Args:
+            caption: Caption file path
+            normalize_text: Whether to normalize text
+
+        Returns:
+            List of Supervision objects
+        """
+        caption_path = Path(str(caption))
+        if not caption_path.exists():
+            raise FileNotFoundError(f"Caption file not found: {caption}")
+
+        supervisions = []
+
+        with open(caption_path, "r", encoding="utf-8") as f:
+            content = f.read()
+
+        # Split by double newlines to separate entries
+        entries = content.strip().split("\n\n")
+
+        for entry in entries:
+            lines = entry.strip().split("\n")
+            if len(lines) < 2:
+                continue
+
+            # First line: timestamp (H:MM:SS.mmm,H:MM:SS.mmm)
+            timestamp_line = lines[0].strip()
+            # Remaining lines: text
+            text_lines = lines[1:]
+
+            try:
+                # Parse timestamp: 0:00:00.000,0:00:02.000
+                if "," not in timestamp_line:
+                    continue
+
+                start_str, end_str = timestamp_line.split(",", 1)
+
+                # Parse start time
+                start_parts = start_str.strip().split(":")
+                if len(start_parts) == 3:
+                    h, m, s = start_parts
+                    s_parts = s.split(".")
+                    start = int(h) * 3600 + int(m) * 60 + int(s_parts[0])
+                    if len(s_parts) > 1:
+                        start += int(s_parts[1]) / 1000.0
+                else:
+                    continue
+
+                # Parse end time
+                end_parts = end_str.strip().split(":")
+                if len(end_parts) == 3:
+                    h, m, s = end_parts
+                    s_parts = s.split(".")
+                    end = int(h) * 3600 + int(m) * 60 + int(s_parts[0])
+                    if len(s_parts) > 1:
+                        end += int(s_parts[1]) / 1000.0
+                else:
+                    continue
+
+                # Parse text and speaker
+                text = " ".join(text_lines).strip()
+                speaker, text = parse_speaker_text(text)
+
+                if normalize_text:
+                    text = normalize_text_fn(text)
+
+                duration = end - start
+                if duration < 0:
+                    continue
+
+                supervisions.append(
+                    Supervision(
+                        text=text,
+                        start=start,
+                        duration=duration,
+                        speaker=speaker,
+                    )
+                )
+            except (ValueError, IndexError):
+                # Skip malformed entries
+                continue
+
+        return supervisions
+
     @classmethod
     def _write_tsv(
         cls,
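Reviewer note: a worked instance of the timestamp arithmetic in _parse_sbv, for the start stamp "0:01:02.500" (the fractional part is divided by 1000, so the parser assumes the SBV-standard three millisecond digits):

h, m, s = "0:01:02.500".split(":")
sec, ms = s.split(".")
start = int(h) * 3600 + int(m) * 60 + int(sec) + int(ms) / 1000.0
assert start == 62.5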
@@ -1217,6 +1323,58 @@ class Caption:
 
                 file.write(f"{start}\t{end}\t{text}\n")
 
+    @classmethod
+    def _write_sbv(
+        cls,
+        alignments: List[Supervision],
+        output_path: Pathlike,
+        include_speaker_in_text: bool = True,
+    ) -> None:
+        """
+        Write caption to SubViewer (SBV) format.
+
+        Format:
+            0:00:00.000,0:00:02.000
+            Text line 1
+
+            0:00:02.000,0:00:04.000
+            Text line 2
+
+        Args:
+            alignments: List of supervision segments to write
+            output_path: Path to output SBV file
+            include_speaker_in_text: Whether to include speaker in text
+        """
+        with open(output_path, "w", encoding="utf-8") as file:
+            for i, supervision in enumerate(alignments):
+                # Format timestamps as H:MM:SS.mmm
+                start_h = int(supervision.start // 3600)
+                start_m = int((supervision.start % 3600) // 60)
+                start_s = int(supervision.start % 60)
+                start_ms = int((supervision.start % 1) * 1000)
+
+                end_h = int(supervision.end // 3600)
+                end_m = int((supervision.end % 3600) // 60)
+                end_s = int(supervision.end % 60)
+                end_ms = int((supervision.end % 1) * 1000)
+
+                start_time = f"{start_h}:{start_m:02d}:{start_s:02d}.{start_ms:03d}"
+                end_time = f"{end_h}:{end_m:02d}:{end_s:02d}.{end_ms:03d}"
+
+                # Write timestamp line
+                file.write(f"{start_time},{end_time}\n")
+
+                # Write text (with optional speaker)
+                text = supervision.text.strip()
+                if include_speaker_in_text and supervision.speaker:
+                    text = f"{supervision.speaker}: {text}"
+
+                file.write(f"{text}\n")
+
+                # Add blank line between entries (except after last one)
+                if i < len(alignments) - 1:
+                    file.write("\n")
+
     @classmethod
     def _parse_caption(
         cls, caption: Pathlike, format: Optional[OutputCaptionFormat], normalize_text: Optional[bool] = False
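Reviewer note: _write_sbv truncates rather than rounds the millisecond field (int((t % 1) * 1000)), which can shave a millisecond off some floats. A worked instance of the H:MM:SS.mmm math above for t = 3723.5 seconds:

t = 3723.5
h, m = int(t // 3600), int((t % 3600) // 60)
s, ms = int(t % 60), int((t % 1) * 1000)
assert f"{h}:{m:02d}:{s:02d}.{ms:03d}" == "1:02:03.500"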
lattifai/cli/alignment.py
CHANGED
@@ -81,7 +81,7 @@ def align(
             caption.word_level=true \\
             caption.normalize_text=true \\
             alignment.device=mps \\
-            alignment.model_name=
+            alignment.model_name=LattifAI/Lattice-1-Alpha
     """
     media_config = media or MediaConfig()
 
@@ -142,6 +142,7 @@ def align(
         output_caption_path=caption_config.output_path,
         split_sentence=caption_config.split_sentence,
        channel_selector=media_config.channel_selector,
+        streaming_chunk_secs=media_config.streaming_chunk_secs,
     )
 
 
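Reviewer note: the CLI now forwards media.streaming_chunk_secs into the alignment call, wiring the new MediaConfig field (lattifai/config/media.py also changed in this release) through to the audio loader. A hypothetical programmatic equivalent (MediaConfig's other fields and the full align() signature are assumptions):

from lattifai.config.media import MediaConfig

media = MediaConfig(streaming_chunk_secs=600.0)  # stream long files in 10-minute chunks
# align(..., media=media) then passes media.streaming_chunk_secs to the loader,
# which returns an AudioData that iterates in chunks when streaming_mode is True.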