lightly-studio 0.4.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (356)
  1. lightly_studio/__init__.py +12 -0
  2. lightly_studio/api/__init__.py +0 -0
  3. lightly_studio/api/app.py +131 -0
  4. lightly_studio/api/cache.py +77 -0
  5. lightly_studio/api/db_tables.py +35 -0
  6. lightly_studio/api/features.py +5 -0
  7. lightly_studio/api/routes/api/annotation.py +305 -0
  8. lightly_studio/api/routes/api/annotation_label.py +87 -0
  9. lightly_studio/api/routes/api/annotations/__init__.py +7 -0
  10. lightly_studio/api/routes/api/annotations/create_annotation.py +52 -0
  11. lightly_studio/api/routes/api/caption.py +100 -0
  12. lightly_studio/api/routes/api/classifier.py +384 -0
  13. lightly_studio/api/routes/api/dataset.py +191 -0
  14. lightly_studio/api/routes/api/dataset_tag.py +266 -0
  15. lightly_studio/api/routes/api/embeddings2d.py +90 -0
  16. lightly_studio/api/routes/api/exceptions.py +114 -0
  17. lightly_studio/api/routes/api/export.py +114 -0
  18. lightly_studio/api/routes/api/features.py +17 -0
  19. lightly_studio/api/routes/api/frame.py +241 -0
  20. lightly_studio/api/routes/api/image.py +155 -0
  21. lightly_studio/api/routes/api/metadata.py +161 -0
  22. lightly_studio/api/routes/api/operator.py +75 -0
  23. lightly_studio/api/routes/api/sample.py +103 -0
  24. lightly_studio/api/routes/api/selection.py +87 -0
  25. lightly_studio/api/routes/api/settings.py +41 -0
  26. lightly_studio/api/routes/api/status.py +19 -0
  27. lightly_studio/api/routes/api/text_embedding.py +50 -0
  28. lightly_studio/api/routes/api/validators.py +17 -0
  29. lightly_studio/api/routes/api/video.py +133 -0
  30. lightly_studio/api/routes/healthz.py +13 -0
  31. lightly_studio/api/routes/images.py +104 -0
  32. lightly_studio/api/routes/video_frames_media.py +116 -0
  33. lightly_studio/api/routes/video_media.py +223 -0
  34. lightly_studio/api/routes/webapp.py +51 -0
  35. lightly_studio/api/server.py +94 -0
  36. lightly_studio/core/__init__.py +0 -0
  37. lightly_studio/core/add_samples.py +533 -0
  38. lightly_studio/core/add_videos.py +294 -0
  39. lightly_studio/core/dataset.py +780 -0
  40. lightly_studio/core/dataset_query/__init__.py +14 -0
  41. lightly_studio/core/dataset_query/boolean_expression.py +67 -0
  42. lightly_studio/core/dataset_query/dataset_query.py +317 -0
  43. lightly_studio/core/dataset_query/field.py +113 -0
  44. lightly_studio/core/dataset_query/field_expression.py +79 -0
  45. lightly_studio/core/dataset_query/match_expression.py +23 -0
  46. lightly_studio/core/dataset_query/order_by.py +79 -0
  47. lightly_studio/core/dataset_query/sample_field.py +37 -0
  48. lightly_studio/core/dataset_query/tags_expression.py +46 -0
  49. lightly_studio/core/image_sample.py +36 -0
  50. lightly_studio/core/loading_log.py +56 -0
  51. lightly_studio/core/sample.py +291 -0
  52. lightly_studio/core/start_gui.py +54 -0
  53. lightly_studio/core/video_sample.py +38 -0
  54. lightly_studio/dataset/__init__.py +0 -0
  55. lightly_studio/dataset/edge_embedding_generator.py +155 -0
  56. lightly_studio/dataset/embedding_generator.py +129 -0
  57. lightly_studio/dataset/embedding_manager.py +349 -0
  58. lightly_studio/dataset/env.py +20 -0
  59. lightly_studio/dataset/file_utils.py +49 -0
  60. lightly_studio/dataset/fsspec_lister.py +275 -0
  61. lightly_studio/dataset/mobileclip_embedding_generator.py +158 -0
  62. lightly_studio/dataset/perception_encoder_embedding_generator.py +260 -0
  63. lightly_studio/db_manager.py +166 -0
  64. lightly_studio/dist_lightly_studio_view_app/_app/env.js +1 -0
  65. lightly_studio/dist_lightly_studio_view_app/_app/immutable/assets/0.GcXvs2l7.css +1 -0
  66. lightly_studio/dist_lightly_studio_view_app/_app/immutable/assets/12.Dx6SXgAb.css +1 -0
  67. lightly_studio/dist_lightly_studio_view_app/_app/immutable/assets/17.9X9_k6TP.css +1 -0
  68. lightly_studio/dist_lightly_studio_view_app/_app/immutable/assets/18.BxiimdIO.css +1 -0
  69. lightly_studio/dist_lightly_studio_view_app/_app/immutable/assets/2.CkOblLn7.css +1 -0
  70. lightly_studio/dist_lightly_studio_view_app/_app/immutable/assets/ClassifierSamplesGrid.BJbCDlvs.css +1 -0
  71. lightly_studio/dist_lightly_studio_view_app/_app/immutable/assets/LightlyLogo.BNjCIww-.png +0 -0
  72. lightly_studio/dist_lightly_studio_view_app/_app/immutable/assets/OpenSans-Bold.DGvYQtcs.ttf +0 -0
  73. lightly_studio/dist_lightly_studio_view_app/_app/immutable/assets/OpenSans-Italic-VariableFont_wdth_wght.B4AZ-wl6.ttf +0 -0
  74. lightly_studio/dist_lightly_studio_view_app/_app/immutable/assets/OpenSans-Medium.DVUZMR_6.ttf +0 -0
  75. lightly_studio/dist_lightly_studio_view_app/_app/immutable/assets/OpenSans-Regular.DxJTClRG.ttf +0 -0
  76. lightly_studio/dist_lightly_studio_view_app/_app/immutable/assets/OpenSans-SemiBold.D3TTYgdB.ttf +0 -0
  77. lightly_studio/dist_lightly_studio_view_app/_app/immutable/assets/OpenSans-VariableFont_wdth_wght.BZBpG5Iz.ttf +0 -0
  78. lightly_studio/dist_lightly_studio_view_app/_app/immutable/assets/_layout.CefECEWA.css +1 -0
  79. lightly_studio/dist_lightly_studio_view_app/_app/immutable/assets/_layout.D5tDcjY-.css +1 -0
  80. lightly_studio/dist_lightly_studio_view_app/_app/immutable/assets/_page.9X9_k6TP.css +1 -0
  81. lightly_studio/dist_lightly_studio_view_app/_app/immutable/assets/_page.BxiimdIO.css +1 -0
  82. lightly_studio/dist_lightly_studio_view_app/_app/immutable/assets/_page.Dx6SXgAb.css +1 -0
  83. lightly_studio/dist_lightly_studio_view_app/_app/immutable/assets/transform._-1mPSEI.css +1 -0
  84. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/0dDyq72A.js +20 -0
  85. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/69_IOA4Y.js +1 -0
  86. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/BK4An2kI.js +1 -0
  87. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/BRmB-kJ9.js +1 -0
  88. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/B_1cpokE.js +1 -0
  89. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/BiqpDEr0.js +1 -0
  90. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/BpLiSKgx.js +1 -0
  91. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/BscxbINH.js +39 -0
  92. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/C1FmrZbK.js +1 -0
  93. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/C80h3dJx.js +1 -0
  94. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/C8mfFM-u.js +2 -0
  95. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/CGY1p9L4.js +517 -0
  96. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/COfLknXM.js +1 -0
  97. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/CWj6FrbW.js +1 -0
  98. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/CYgJF_JY.js +1 -0
  99. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/CmLg0ys7.js +1 -0
  100. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/CvGjimpO.js +1 -0
  101. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/D3RDXHoj.js +39 -0
  102. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/D4y7iiT3.js +1 -0
  103. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/D9SC3jBb.js +1 -0
  104. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/DCuAdx1Q.js +20 -0
  105. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/DDBy-_jD.js +1 -0
  106. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/DIeogL5L.js +1 -0
  107. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/DL9a7v5o.js +1 -0
  108. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/DSKECuqX.js +39 -0
  109. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/D_FFv0Oe.js +1 -0
  110. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/DiZ5o5vz.js +1 -0
  111. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/DkbXUtyG.js +1 -0
  112. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/DmK2hulV.js +1 -0
  113. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/DqnHaLTj.js +1 -0
  114. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/DtWZc_tl.js +1 -0
  115. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/DuUalyFS.js +1 -0
  116. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/DwIonDAZ.js +1 -0
  117. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/Il-mSPmK.js +1 -0
  118. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/KNLP4aJU.js +1 -0
  119. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/KjYeVjkE.js +1 -0
  120. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/MErlcOXj.js +1 -0
  121. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/VRI4prUD.js +1 -0
  122. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/VYb2dkNs.js +1 -0
  123. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/VqWvU2yF.js +1 -0
  124. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/dHC3otuL.js +1 -0
  125. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/da7Oy_lO.js +1 -0
  126. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/eAy8rZzC.js +2 -0
  127. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/erjNR5MX.js +1 -0
  128. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/f1oG3eFE.js +1 -0
  129. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/rsLi1iKv.js +20 -0
  130. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/rwuuBP9f.js +1 -0
  131. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/xGHZQ1pe.js +3 -0
  132. lightly_studio/dist_lightly_studio_view_app/_app/immutable/entry/app.DrTRUgT3.js +2 -0
  133. lightly_studio/dist_lightly_studio_view_app/_app/immutable/entry/start.BK5EOJl2.js +1 -0
  134. lightly_studio/dist_lightly_studio_view_app/_app/immutable/nodes/0.CIvTuljF.js +4 -0
  135. lightly_studio/dist_lightly_studio_view_app/_app/immutable/nodes/1.UBvSzxdA.js +1 -0
  136. lightly_studio/dist_lightly_studio_view_app/_app/immutable/nodes/10.CQ_tiLJa.js +1 -0
  137. lightly_studio/dist_lightly_studio_view_app/_app/immutable/nodes/11.KqkAcaxW.js +1 -0
  138. lightly_studio/dist_lightly_studio_view_app/_app/immutable/nodes/12.DoYsmxQc.js +1 -0
  139. lightly_studio/dist_lightly_studio_view_app/_app/immutable/nodes/13.571n2LZA.js +1 -0
  140. lightly_studio/dist_lightly_studio_view_app/_app/immutable/nodes/14.DGs689M-.js +1 -0
  141. lightly_studio/dist_lightly_studio_view_app/_app/immutable/nodes/15.CWG1ehzT.js +1 -0
  142. lightly_studio/dist_lightly_studio_view_app/_app/immutable/nodes/16.Dpq6jbSh.js +1 -0
  143. lightly_studio/dist_lightly_studio_view_app/_app/immutable/nodes/17.B5AZbHUU.js +1 -0
  144. lightly_studio/dist_lightly_studio_view_app/_app/immutable/nodes/18.CBga8cnq.js +1 -0
  145. lightly_studio/dist_lightly_studio_view_app/_app/immutable/nodes/2.D2HXgz-8.js +1090 -0
  146. lightly_studio/dist_lightly_studio_view_app/_app/immutable/nodes/3.f4HAg-y3.js +1 -0
  147. lightly_studio/dist_lightly_studio_view_app/_app/immutable/nodes/4.BKF4xuKQ.js +1 -0
  148. lightly_studio/dist_lightly_studio_view_app/_app/immutable/nodes/5.BAE0Pm_f.js +39 -0
  149. lightly_studio/dist_lightly_studio_view_app/_app/immutable/nodes/6.CouWWpzA.js +1 -0
  150. lightly_studio/dist_lightly_studio_view_app/_app/immutable/nodes/7.UBHT0ktp.js +1 -0
  151. lightly_studio/dist_lightly_studio_view_app/_app/immutable/nodes/8.FiYNElcc.js +1 -0
  152. lightly_studio/dist_lightly_studio_view_app/_app/immutable/nodes/9.B3-UaT23.js +1 -0
  153. lightly_studio/dist_lightly_studio_view_app/_app/immutable/workers/clustering.worker-DKqeLtG0.js +2 -0
  154. lightly_studio/dist_lightly_studio_view_app/_app/immutable/workers/search.worker-vNSty3B0.js +1 -0
  155. lightly_studio/dist_lightly_studio_view_app/_app/version.json +1 -0
  156. lightly_studio/dist_lightly_studio_view_app/apple-touch-icon-precomposed.png +0 -0
  157. lightly_studio/dist_lightly_studio_view_app/apple-touch-icon.png +0 -0
  158. lightly_studio/dist_lightly_studio_view_app/favicon.png +0 -0
  159. lightly_studio/dist_lightly_studio_view_app/index.html +45 -0
  160. lightly_studio/errors.py +5 -0
  161. lightly_studio/examples/example.py +25 -0
  162. lightly_studio/examples/example_coco.py +27 -0
  163. lightly_studio/examples/example_coco_caption.py +29 -0
  164. lightly_studio/examples/example_metadata.py +369 -0
  165. lightly_studio/examples/example_operators.py +111 -0
  166. lightly_studio/examples/example_selection.py +28 -0
  167. lightly_studio/examples/example_split_work.py +48 -0
  168. lightly_studio/examples/example_video.py +22 -0
  169. lightly_studio/examples/example_video_annotations.py +157 -0
  170. lightly_studio/examples/example_yolo.py +22 -0
  171. lightly_studio/export/coco_captions.py +69 -0
  172. lightly_studio/export/export_dataset.py +104 -0
  173. lightly_studio/export/lightly_studio_label_input.py +120 -0
  174. lightly_studio/export_schema.py +18 -0
  175. lightly_studio/export_version.py +57 -0
  176. lightly_studio/few_shot_classifier/__init__.py +0 -0
  177. lightly_studio/few_shot_classifier/classifier.py +80 -0
  178. lightly_studio/few_shot_classifier/classifier_manager.py +644 -0
  179. lightly_studio/few_shot_classifier/random_forest_classifier.py +495 -0
  180. lightly_studio/metadata/complex_metadata.py +47 -0
  181. lightly_studio/metadata/compute_similarity.py +84 -0
  182. lightly_studio/metadata/compute_typicality.py +67 -0
  183. lightly_studio/metadata/gps_coordinate.py +41 -0
  184. lightly_studio/metadata/metadata_protocol.py +17 -0
  185. lightly_studio/models/__init__.py +1 -0
  186. lightly_studio/models/annotation/__init__.py +0 -0
  187. lightly_studio/models/annotation/annotation_base.py +303 -0
  188. lightly_studio/models/annotation/instance_segmentation.py +56 -0
  189. lightly_studio/models/annotation/links.py +17 -0
  190. lightly_studio/models/annotation/object_detection.py +47 -0
  191. lightly_studio/models/annotation/semantic_segmentation.py +44 -0
  192. lightly_studio/models/annotation_label.py +47 -0
  193. lightly_studio/models/caption.py +49 -0
  194. lightly_studio/models/classifier.py +20 -0
  195. lightly_studio/models/dataset.py +70 -0
  196. lightly_studio/models/embedding_model.py +30 -0
  197. lightly_studio/models/image.py +96 -0
  198. lightly_studio/models/metadata.py +208 -0
  199. lightly_studio/models/range.py +17 -0
  200. lightly_studio/models/sample.py +154 -0
  201. lightly_studio/models/sample_embedding.py +36 -0
  202. lightly_studio/models/settings.py +69 -0
  203. lightly_studio/models/tag.py +96 -0
  204. lightly_studio/models/two_dim_embedding.py +16 -0
  205. lightly_studio/models/video.py +161 -0
  206. lightly_studio/plugins/__init__.py +0 -0
  207. lightly_studio/plugins/base_operator.py +60 -0
  208. lightly_studio/plugins/operator_registry.py +47 -0
  209. lightly_studio/plugins/parameter.py +70 -0
  210. lightly_studio/py.typed +0 -0
  211. lightly_studio/resolvers/__init__.py +0 -0
  212. lightly_studio/resolvers/annotation_label_resolver/__init__.py +22 -0
  213. lightly_studio/resolvers/annotation_label_resolver/create.py +27 -0
  214. lightly_studio/resolvers/annotation_label_resolver/delete.py +28 -0
  215. lightly_studio/resolvers/annotation_label_resolver/get_all.py +37 -0
  216. lightly_studio/resolvers/annotation_label_resolver/get_by_id.py +24 -0
  217. lightly_studio/resolvers/annotation_label_resolver/get_by_ids.py +25 -0
  218. lightly_studio/resolvers/annotation_label_resolver/get_by_label_name.py +24 -0
  219. lightly_studio/resolvers/annotation_label_resolver/names_by_ids.py +25 -0
  220. lightly_studio/resolvers/annotation_label_resolver/update.py +38 -0
  221. lightly_studio/resolvers/annotation_resolver/__init__.py +40 -0
  222. lightly_studio/resolvers/annotation_resolver/count_annotations_by_dataset.py +129 -0
  223. lightly_studio/resolvers/annotation_resolver/create_many.py +124 -0
  224. lightly_studio/resolvers/annotation_resolver/delete_annotation.py +87 -0
  225. lightly_studio/resolvers/annotation_resolver/delete_annotations.py +60 -0
  226. lightly_studio/resolvers/annotation_resolver/get_all.py +85 -0
  227. lightly_studio/resolvers/annotation_resolver/get_all_with_payload.py +179 -0
  228. lightly_studio/resolvers/annotation_resolver/get_by_id.py +34 -0
  229. lightly_studio/resolvers/annotation_resolver/get_by_id_with_payload.py +130 -0
  230. lightly_studio/resolvers/annotation_resolver/update_annotation_label.py +142 -0
  231. lightly_studio/resolvers/annotation_resolver/update_bounding_box.py +68 -0
  232. lightly_studio/resolvers/annotations/__init__.py +1 -0
  233. lightly_studio/resolvers/annotations/annotations_filter.py +88 -0
  234. lightly_studio/resolvers/caption_resolver.py +129 -0
  235. lightly_studio/resolvers/dataset_resolver/__init__.py +55 -0
  236. lightly_studio/resolvers/dataset_resolver/check_dataset_type.py +29 -0
  237. lightly_studio/resolvers/dataset_resolver/create.py +20 -0
  238. lightly_studio/resolvers/dataset_resolver/delete.py +20 -0
  239. lightly_studio/resolvers/dataset_resolver/export.py +267 -0
  240. lightly_studio/resolvers/dataset_resolver/get_all.py +19 -0
  241. lightly_studio/resolvers/dataset_resolver/get_by_id.py +16 -0
  242. lightly_studio/resolvers/dataset_resolver/get_by_name.py +12 -0
  243. lightly_studio/resolvers/dataset_resolver/get_dataset_details.py +27 -0
  244. lightly_studio/resolvers/dataset_resolver/get_hierarchy.py +31 -0
  245. lightly_studio/resolvers/dataset_resolver/get_or_create_child_dataset.py +58 -0
  246. lightly_studio/resolvers/dataset_resolver/get_parent_dataset_by_sample_id.py +27 -0
  247. lightly_studio/resolvers/dataset_resolver/get_parent_dataset_id.py +22 -0
  248. lightly_studio/resolvers/dataset_resolver/get_root_dataset.py +61 -0
  249. lightly_studio/resolvers/dataset_resolver/get_root_datasets_overview.py +41 -0
  250. lightly_studio/resolvers/dataset_resolver/update.py +25 -0
  251. lightly_studio/resolvers/embedding_model_resolver.py +120 -0
  252. lightly_studio/resolvers/image_filter.py +50 -0
  253. lightly_studio/resolvers/image_resolver/__init__.py +21 -0
  254. lightly_studio/resolvers/image_resolver/create_many.py +52 -0
  255. lightly_studio/resolvers/image_resolver/delete.py +20 -0
  256. lightly_studio/resolvers/image_resolver/filter_new_paths.py +23 -0
  257. lightly_studio/resolvers/image_resolver/get_all_by_dataset_id.py +117 -0
  258. lightly_studio/resolvers/image_resolver/get_by_id.py +14 -0
  259. lightly_studio/resolvers/image_resolver/get_dimension_bounds.py +75 -0
  260. lightly_studio/resolvers/image_resolver/get_many_by_id.py +22 -0
  261. lightly_studio/resolvers/image_resolver/get_samples_excluding.py +43 -0
  262. lightly_studio/resolvers/metadata_resolver/__init__.py +15 -0
  263. lightly_studio/resolvers/metadata_resolver/metadata_filter.py +163 -0
  264. lightly_studio/resolvers/metadata_resolver/sample/__init__.py +21 -0
  265. lightly_studio/resolvers/metadata_resolver/sample/bulk_update_metadata.py +46 -0
  266. lightly_studio/resolvers/metadata_resolver/sample/get_by_sample_id.py +24 -0
  267. lightly_studio/resolvers/metadata_resolver/sample/get_metadata_info.py +104 -0
  268. lightly_studio/resolvers/metadata_resolver/sample/get_value_for_sample.py +27 -0
  269. lightly_studio/resolvers/metadata_resolver/sample/set_value_for_sample.py +53 -0
  270. lightly_studio/resolvers/sample_embedding_resolver.py +132 -0
  271. lightly_studio/resolvers/sample_resolver/__init__.py +17 -0
  272. lightly_studio/resolvers/sample_resolver/count_by_dataset_id.py +16 -0
  273. lightly_studio/resolvers/sample_resolver/create.py +16 -0
  274. lightly_studio/resolvers/sample_resolver/create_many.py +25 -0
  275. lightly_studio/resolvers/sample_resolver/get_by_id.py +14 -0
  276. lightly_studio/resolvers/sample_resolver/get_filtered_samples.py +56 -0
  277. lightly_studio/resolvers/sample_resolver/get_many_by_id.py +22 -0
  278. lightly_studio/resolvers/sample_resolver/sample_filter.py +74 -0
  279. lightly_studio/resolvers/settings_resolver.py +62 -0
  280. lightly_studio/resolvers/tag_resolver.py +299 -0
  281. lightly_studio/resolvers/twodim_embedding_resolver.py +119 -0
  282. lightly_studio/resolvers/video_frame_resolver/__init__.py +23 -0
  283. lightly_studio/resolvers/video_frame_resolver/count_video_frames_annotations.py +83 -0
  284. lightly_studio/resolvers/video_frame_resolver/create_many.py +57 -0
  285. lightly_studio/resolvers/video_frame_resolver/get_all_by_dataset_id.py +63 -0
  286. lightly_studio/resolvers/video_frame_resolver/get_by_id.py +13 -0
  287. lightly_studio/resolvers/video_frame_resolver/get_table_fields_bounds.py +44 -0
  288. lightly_studio/resolvers/video_frame_resolver/video_frame_annotations_counter_filter.py +47 -0
  289. lightly_studio/resolvers/video_frame_resolver/video_frame_filter.py +57 -0
  290. lightly_studio/resolvers/video_resolver/__init__.py +27 -0
  291. lightly_studio/resolvers/video_resolver/count_video_frame_annotations_by_video_dataset.py +86 -0
  292. lightly_studio/resolvers/video_resolver/create_many.py +58 -0
  293. lightly_studio/resolvers/video_resolver/filter_new_paths.py +33 -0
  294. lightly_studio/resolvers/video_resolver/get_all_by_dataset_id.py +181 -0
  295. lightly_studio/resolvers/video_resolver/get_by_id.py +22 -0
  296. lightly_studio/resolvers/video_resolver/get_table_fields_bounds.py +72 -0
  297. lightly_studio/resolvers/video_resolver/get_view_by_id.py +52 -0
  298. lightly_studio/resolvers/video_resolver/video_count_annotations_filter.py +50 -0
  299. lightly_studio/resolvers/video_resolver/video_filter.py +98 -0
  300. lightly_studio/selection/__init__.py +1 -0
  301. lightly_studio/selection/mundig.py +143 -0
  302. lightly_studio/selection/select.py +203 -0
  303. lightly_studio/selection/select_via_db.py +273 -0
  304. lightly_studio/selection/selection_config.py +49 -0
  305. lightly_studio/services/annotations_service/__init__.py +33 -0
  306. lightly_studio/services/annotations_service/create_annotation.py +64 -0
  307. lightly_studio/services/annotations_service/delete_annotation.py +22 -0
  308. lightly_studio/services/annotations_service/get_annotation_by_id.py +31 -0
  309. lightly_studio/services/annotations_service/update_annotation.py +54 -0
  310. lightly_studio/services/annotations_service/update_annotation_bounding_box.py +36 -0
  311. lightly_studio/services/annotations_service/update_annotation_label.py +48 -0
  312. lightly_studio/services/annotations_service/update_annotations.py +29 -0
  313. lightly_studio/setup_logging.py +59 -0
  314. lightly_studio/type_definitions.py +31 -0
  315. lightly_studio/utils/__init__.py +3 -0
  316. lightly_studio/utils/download.py +94 -0
  317. lightly_studio/vendor/__init__.py +1 -0
  318. lightly_studio/vendor/mobileclip/ACKNOWLEDGEMENTS +422 -0
  319. lightly_studio/vendor/mobileclip/LICENSE +31 -0
  320. lightly_studio/vendor/mobileclip/LICENSE_weights_data +50 -0
  321. lightly_studio/vendor/mobileclip/README.md +5 -0
  322. lightly_studio/vendor/mobileclip/__init__.py +96 -0
  323. lightly_studio/vendor/mobileclip/clip.py +77 -0
  324. lightly_studio/vendor/mobileclip/configs/mobileclip_b.json +18 -0
  325. lightly_studio/vendor/mobileclip/configs/mobileclip_s0.json +18 -0
  326. lightly_studio/vendor/mobileclip/configs/mobileclip_s1.json +18 -0
  327. lightly_studio/vendor/mobileclip/configs/mobileclip_s2.json +18 -0
  328. lightly_studio/vendor/mobileclip/image_encoder.py +67 -0
  329. lightly_studio/vendor/mobileclip/logger.py +154 -0
  330. lightly_studio/vendor/mobileclip/models/__init__.py +10 -0
  331. lightly_studio/vendor/mobileclip/models/mci.py +933 -0
  332. lightly_studio/vendor/mobileclip/models/vit.py +433 -0
  333. lightly_studio/vendor/mobileclip/modules/__init__.py +4 -0
  334. lightly_studio/vendor/mobileclip/modules/common/__init__.py +4 -0
  335. lightly_studio/vendor/mobileclip/modules/common/mobileone.py +341 -0
  336. lightly_studio/vendor/mobileclip/modules/common/transformer.py +451 -0
  337. lightly_studio/vendor/mobileclip/modules/image/__init__.py +4 -0
  338. lightly_studio/vendor/mobileclip/modules/image/image_projection.py +113 -0
  339. lightly_studio/vendor/mobileclip/modules/image/replknet.py +188 -0
  340. lightly_studio/vendor/mobileclip/modules/text/__init__.py +4 -0
  341. lightly_studio/vendor/mobileclip/modules/text/repmixer.py +281 -0
  342. lightly_studio/vendor/mobileclip/modules/text/tokenizer.py +38 -0
  343. lightly_studio/vendor/mobileclip/text_encoder.py +245 -0
  344. lightly_studio/vendor/perception_encoder/LICENSE.PE +201 -0
  345. lightly_studio/vendor/perception_encoder/README.md +11 -0
  346. lightly_studio/vendor/perception_encoder/vision_encoder/__init__.py +0 -0
  347. lightly_studio/vendor/perception_encoder/vision_encoder/bpe_simple_vocab_16e6.txt.gz +0 -0
  348. lightly_studio/vendor/perception_encoder/vision_encoder/config.py +205 -0
  349. lightly_studio/vendor/perception_encoder/vision_encoder/config_src.py +264 -0
  350. lightly_studio/vendor/perception_encoder/vision_encoder/pe.py +766 -0
  351. lightly_studio/vendor/perception_encoder/vision_encoder/rope.py +352 -0
  352. lightly_studio/vendor/perception_encoder/vision_encoder/tokenizer.py +347 -0
  353. lightly_studio/vendor/perception_encoder/vision_encoder/transforms.py +36 -0
  354. lightly_studio-0.4.6.dist-info/METADATA +88 -0
  355. lightly_studio-0.4.6.dist-info/RECORD +356 -0
  356. lightly_studio-0.4.6.dist-info/WHEEL +4 -0
lightly_studio/dataset/fsspec_lister.py
@@ -0,0 +1,275 @@
+ """File listing utilities using fsspec.
+
+ Handles local and remote paths, directories, and glob patterns.
+ """
+
+ from __future__ import annotations
+
+ import logging
+ from collections.abc import Iterator
+ from typing import Any
+
+ import fsspec
+ from tqdm import tqdm
+
+ # Constants
+ PROTOCOL_SEPARATOR = "://"
+ DEFAULT_PROTOCOL = "file"
+ PATH_SEPARATOR = "/"
+
+ # Glob pattern characters
+ GLOB_CHARS = ["*", "?", "[", "]"]
+
+ # Cloud storage protocols
+ CLOUD_PROTOCOLS = ("s3", "gs", "gcs", "azure", "abfs")
+
+ # Image file extensions
+ IMAGE_EXTENSIONS = {
+     ".png",
+     ".jpg",
+     ".jpeg",
+     ".gif",
+     ".webp",
+     ".bmp",
+     ".tiff",
+ }
+
+
+ def iter_files_from_path(path: str, allowed_extensions: set[str] | None = None) -> Iterator[str]:
+     """List all files from a single path, handling directories, globs, and individual files.
+
+     Args:
+         path: A single path, which can be:
+             - an individual file path
+             - a directory path (listed recursively)
+             - a glob pattern
+             - a remote path (s3://, gcs://, etc.)
+         allowed_extensions: Optional set of allowed file extensions (e.g., {".jpg", ".png"}).
+             If None, the default IMAGE_EXTENSIONS is used.
+
+     Yields:
+         File paths as they are discovered, with progress tracking.
+     """
+     seen: set[str] = set()
+     extensions = allowed_extensions or IMAGE_EXTENSIONS
+     with tqdm(desc="Discovering files", unit=" files", dynamic_ncols=True) as pbar:
+         cleaned_path = str(path).strip()
+         if not cleaned_path:
+             return
+         fs = _get_filesystem(cleaned_path)
+         yield from _process_single_path_streaming(fs, cleaned_path, seen, pbar, extensions)
+
+
+ def _process_single_path_streaming(
+     fs: fsspec.AbstractFileSystem, path: str, seen: set[str], pbar: tqdm[Any], extensions: set[str]
+ ) -> Iterator[str]:
+     """Process a single path and yield matching image files.
+
+     Handles different path types: individual files, directories, and glob patterns.
+
+     Args:
+         fs: The filesystem instance.
+         path: The path to process (file, directory, or glob pattern).
+         seen: Set of already processed paths to avoid duplicates.
+         pbar: Progress bar instance for tracking progress.
+         extensions: Set of allowed file extensions.
+
+     Yields:
+         File paths that match the criteria.
+
+     Raises:
+         ValueError: If the path does not exist, or if it points to a file
+             without an allowed extension.
+     """
+     if _is_glob_pattern(path):
+         yield from _process_glob_pattern(fs, path, seen, pbar, extensions)
+     elif not fs.exists(path):
+         raise ValueError(f"Path does not exist: {path}")
+     elif fs.isfile(path):
+         if not _is_image_file(path, extensions):
+             raise ValueError(f"File is not an image: {path}")
+         if path not in seen:
+             seen.add(path)
+             pbar.update(1)
+             yield path
+     elif fs.isdir(path):
+         for file_path in _stream_files_from_directory(fs, path, extensions):
+             if file_path not in seen:
+                 seen.add(file_path)
+                 pbar.update(1)
+                 yield file_path
+
+
+ def _process_glob_pattern(
+     fs: fsspec.AbstractFileSystem, path: str, seen: set[str], pbar: tqdm[Any], extensions: set[str]
+ ) -> Iterator[str]:
+     """Process a glob pattern and yield matching image files.
+
+     Args:
+         fs: The filesystem instance.
+         path: The glob pattern path.
+         seen: Set of already processed paths to avoid duplicates.
+         pbar: Progress bar instance for tracking progress.
+         extensions: Set of allowed file extensions.
+
+     Yields:
+         File paths that match the glob pattern and allowed extensions.
+     """
+     matching_paths = fs.glob(path)
+     for p in matching_paths:
+         path_str = str(p)
+         if _needs_protocol_prefix(path_str, fs):
+             protocol = _get_protocol_string(fs)
+             path_str = f"{protocol}{PROTOCOL_SEPARATOR}{path_str}"
+         if fs.isfile(path_str) and _is_image_file(path_str, extensions) and path_str not in seen:
+             seen.add(path_str)
+             pbar.update(1)
+             yield path_str
+
+
+ def _stream_files_from_directory(
+     fs: fsspec.AbstractFileSystem, path: str, extensions: set[str]
+ ) -> Iterator[str]:
+     """Stream files from a directory with progress tracking.
+
+     Args:
+         fs: The filesystem instance.
+         path: Directory path to list.
+         extensions: Set of allowed file extensions.
+
+     Yields:
+         File paths as they are discovered.
+     """
+     try:
+         protocol = _get_protocol_string(fs)
+         if protocol in CLOUD_PROTOCOLS:
+             yield from _stream_files_using_walk(fs, path, extensions)
+         else:
+             try:
+                 all_paths = fs.find(path, detail=False)
+                 for p in all_paths:
+                     if fs.isfile(p) and _is_image_file(p, extensions):
+                         yield p
+             except Exception as e:
+                 logging.warning(f"fs.find() failed for {path}, trying alternative method: {e}")
+                 yield from _stream_files_using_walk(fs, path, extensions)
+     except Exception as e:
+         logging.error(f"Error streaming files from '{path}': {e}")
+
+
+ def _stream_files_using_walk(
+     fs: fsspec.AbstractFileSystem, path: str, extensions: set[str]
+ ) -> Iterator[str]:
+     """Stream files using the fs.walk() method.
+
+     Args:
+         fs: The filesystem instance.
+         path: The directory path to walk.
+         extensions: Set of allowed file extensions.
+
+     Yields:
+         File paths that match the allowed extensions.
+     """
+
+     def add_protocol_if_needed(p: str) -> str:
+         if _needs_protocol_prefix(p, fs):
+             protocol = _get_protocol_string(fs)
+             return f"{protocol}{PROTOCOL_SEPARATOR}{p}"
+         return p
+
+     for root, _dirs, files in fs.walk(path):
+         for file in files:
+             if root.endswith(PATH_SEPARATOR):
+                 full_path = f"{root}{file}"
+             else:
+                 full_path = f"{root}{PATH_SEPARATOR}{file}"
+             full_path = add_protocol_if_needed(full_path)
+             if _is_image_file(full_path, extensions):
+                 yield full_path
+
+
+ def _get_filesystem(path: str) -> fsspec.AbstractFileSystem:
+     """Get the appropriate filesystem for the given path.
+
+     Args:
+         path: The path to determine the filesystem for. Can be local or remote.
+
+     Returns:
+         An fsspec filesystem instance appropriate for the path's protocol.
+
+     Raises:
+         ValueError: If the protocol cannot be determined or is invalid.
+     """
+     # str.split() always returns strings, so no further normalization is needed.
+     protocol = path.split(PROTOCOL_SEPARATOR)[0] if PROTOCOL_SEPARATOR in path else DEFAULT_PROTOCOL
+     return fsspec.filesystem(protocol)
+
+
+ def _is_glob_pattern(path: str) -> bool:
+     """Check if a path contains glob pattern characters.
+
+     Args:
+         path: The path to check for glob patterns.
+
+     Returns:
+         True if the path contains glob pattern characters (*, ?, [, ]), False otherwise.
+     """
+     return any(char in path for char in GLOB_CHARS)
+
+
+ def _is_image_file(path: str, extensions: set[str]) -> bool:
+     """Check if a file is an image based on its extension.
+
+     Args:
+         path: The file path to check.
+         extensions: Set of allowed file extensions (e.g., {'.jpg', '.png'}).
+
+     Returns:
+         True if the file has an allowed image extension, False otherwise.
+     """
+     path_lower = path.lower()
+     return any(path_lower.endswith(ext) for ext in extensions)
+
+
+ def _needs_protocol_prefix(path: str, fs: fsspec.AbstractFileSystem) -> bool:
+     """Check if a path needs a protocol prefix.
+
+     Args:
+         path: The path to check.
+         fs: The filesystem instance.
+
+     Returns:
+         True if the path needs a protocol prefix (e.g., for cloud storage),
+         False if it is a local path.
+     """
+     if PROTOCOL_SEPARATOR in path:
+         return False
+
+     protocol = getattr(fs, "protocol", DEFAULT_PROTOCOL)
+     # Handle the case where the protocol is a tuple (common with fsspec).
+     if isinstance(protocol, (list, tuple)):
+         protocol = protocol[0]
+
+     return str(protocol) != DEFAULT_PROTOCOL
+
+
+ def _get_protocol_string(fs: fsspec.AbstractFileSystem) -> str:
+     """Get the protocol string from a filesystem.
+
+     Args:
+         fs: The filesystem instance.
+
+     Returns:
+         The protocol string (e.g., 's3', 'file', 'gcs').
+         Returns 'file' as a default if the protocol cannot be determined.
+     """
+     protocol = getattr(fs, "protocol", DEFAULT_PROTOCOL)
+     if isinstance(protocol, (list, tuple)):
+         return str(protocol[0])
+     return str(protocol)
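
Usage note: a minimal sketch of how the lister above can be called. The directory, bucket, and pattern below are illustrative placeholders, not paths shipped with the package. Remote protocols resolve through fsspec, so the matching backend (e.g., s3fs for s3://) must be installed.

    from lightly_studio.dataset.fsspec_lister import iter_files_from_path

    # Recursively discover images under a local directory.
    for image_path in iter_files_from_path("./data/images"):
        print(image_path)

    # Glob patterns and remote paths go through the same entry point; the
    # extension filter can also be narrowed explicitly.
    pngs = list(
        iter_files_from_path("s3://my-bucket/frames/*.png", allowed_extensions={".png"})
    )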
lightly_studio/dataset/mobileclip_embedding_generator.py
@@ -0,0 +1,158 @@
+ """MobileCLIP embedding generator."""
+
+ from __future__ import annotations
+
+ import tempfile
+ from pathlib import Path
+ from typing import Callable
+ from uuid import UUID
+
+ import fsspec
+ import numpy as np
+ import torch
+ from numpy.typing import NDArray
+ from PIL import Image
+ from torch.utils.data import DataLoader, Dataset
+ from tqdm import tqdm
+
+ from lightly_studio.models.embedding_model import EmbeddingModelCreate
+ from lightly_studio.vendor import mobileclip
+
+ from . import file_utils
+ from .embedding_generator import ImageEmbeddingGenerator
+
+ MODEL_NAME = "mobileclip_s0"
+ MOBILECLIP_DOWNLOAD_URL = (
+     f"https://docs-assets.developer.apple.com/ml-research/datasets/mobileclip/{MODEL_NAME}.pt"
+ )
+ MAX_BATCH_SIZE: int = 16
+ EMBEDDING_DIMENSION: int = 512
+
+
+ # Dataset for efficient batched image loading and preprocessing.
+ class _ImageFileDataset(Dataset[torch.Tensor]):
+     """Dataset wrapping image file paths and a preprocess function."""
+
+     def __init__(
+         self,
+         filepaths: list[str],
+         preprocess: Callable[[Image.Image], torch.Tensor],
+     ) -> None:
+         self.filepaths = filepaths
+         self.preprocess = preprocess
+
+     def __len__(self) -> int:
+         return len(self.filepaths)
+
+     def __getitem__(self, idx: int) -> torch.Tensor:
+         with fsspec.open(self.filepaths[idx], "rb") as file:
+             image = Image.open(file).convert("RGB")
+             return self.preprocess(image)
+
+
+ class MobileCLIPEmbeddingGenerator(ImageEmbeddingGenerator):
+     """MobileCLIP embedding model."""
+
+     def __init__(self) -> None:
+         """Initialize the MobileCLIP embedding model.
+
+         This method loads the MobileCLIP model and its tokenizer. The model
+         checkpoint is downloaded and cached locally for future use.
+         """
+         model_path = _get_cached_mobileclip_checkpoint()
+         self._model, _, self._preprocess = mobileclip.create_model_and_transforms(
+             model_name=MODEL_NAME, pretrained=str(model_path)
+         )
+
+         # Auto-select device: CUDA > MPS (Apple Silicon) > CPU.
+         self._device = torch.device(
+             "cuda"
+             if torch.cuda.is_available()
+             else "mps"
+             if torch.backends.mps.is_available()
+             else "cpu"
+         )
+         self._model = self._model.to(self._device)
+         self._tokenizer = mobileclip.get_tokenizer(model_name=MODEL_NAME)
+         self._model_hash = file_utils.get_file_xxhash(model_path)
+
+     def get_embedding_model_input(self, dataset_id: UUID) -> EmbeddingModelCreate:
+         """Generate an EmbeddingModelCreate instance.
+
+         Args:
+             dataset_id: The ID of the dataset.
+
+         Returns:
+             An EmbeddingModelCreate instance with the model details.
+         """
+         return EmbeddingModelCreate(
+             name=MODEL_NAME,
+             embedding_model_hash=self._model_hash,
+             embedding_dimension=EMBEDDING_DIMENSION,
+             dataset_id=dataset_id,
+         )
+
+     def embed_text(self, text: str) -> list[float]:
+         """Embed a text with MobileCLIP.
+
+         Args:
+             text: The text to embed.
+
+         Returns:
+             A list of floats representing the generated embedding.
+         """
+         tokenized = self._tokenizer([text]).to(self._device)
+         with torch.no_grad():
+             embedding = self._model.encode_text(tokenized)[0]
+         # Convert the embedding to a list of floats.
+         embedding_list: list[float] = embedding.cpu().numpy().flatten().tolist()
+         return embedding_list
+
+     def embed_images(self, filepaths: list[str]) -> NDArray[np.float32]:
+         """Embed images with MobileCLIP.
+
+         Args:
+             filepaths: A list of file paths to the images to embed.
+
+         Returns:
+             A numpy array representing the generated embeddings,
+             in the same order as the input file paths.
+         """
+         total_images = len(filepaths)
+         if not total_images:
+             return np.empty((0, EMBEDDING_DIMENSION), dtype=np.float32)
+
+         dataset = _ImageFileDataset(filepaths, self._preprocess)
+
+         # To avoid issues with db locking and multiprocessing we set the
+         # number of workers to 0 (no multiprocessing). The DataLoader is
+         # still very useful for batching and async prefetching of images.
+         loader = DataLoader(
+             dataset,
+             batch_size=MAX_BATCH_SIZE,
+             num_workers=0,  # must be 0 to avoid multiprocessing issues
+         )
+
+         embeddings = np.empty((total_images, EMBEDDING_DIMENSION), dtype=np.float32)
+         position = 0
+         with tqdm(
+             total=total_images, desc="Generating embeddings", unit=" images"
+         ) as progress_bar, torch.no_grad():
+             for images_tensor in loader:
+                 imgs = images_tensor.to(self._device, non_blocking=True)
+                 batch_embeddings = self._model.encode_image(imgs).cpu().numpy()
+                 batch_size = imgs.size(0)
+                 embeddings[position : position + batch_size] = batch_embeddings
+                 position += batch_size
+                 progress_bar.update(batch_size)
+
+         return embeddings
+
+
+ def _get_cached_mobileclip_checkpoint() -> Path:
+     """Download the MobileCLIP checkpoint to the temp directory if needed and return its path."""
+     file_path = Path(tempfile.gettempdir()) / f"{MODEL_NAME}.pt"
+     file_utils.download_file_if_does_not_exist(
+         url=MOBILECLIP_DOWNLOAD_URL,
+         local_filename=file_path,
+     )
+     return file_path
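
Usage note: a short sketch of the generator above; the image file names are illustrative. `embed_images` returns one 512-dimensional row per input, so a text query can be scored against images with a manual cosine similarity (the text embedding is not normalized by `embed_text` here):

    import numpy as np

    from lightly_studio.dataset.mobileclip_embedding_generator import (
        MobileCLIPEmbeddingGenerator,
    )

    generator = MobileCLIPEmbeddingGenerator()
    image_embeddings = generator.embed_images(["cat.jpg", "dog.jpg"])  # shape (2, 512)
    text_embedding = np.asarray(generator.embed_text("a photo of a dog"), dtype=np.float32)

    # Cosine similarity between the text query and each image embedding.
    image_norm = image_embeddings / np.linalg.norm(image_embeddings, axis=1, keepdims=True)
    text_norm = text_embedding / np.linalg.norm(text_embedding)
    print(image_norm @ text_norm)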
lightly_studio/dataset/perception_encoder_embedding_generator.py
@@ -0,0 +1,260 @@
+ """Perception Encoder embedding generator."""
+
+ from __future__ import annotations
+
+ from pathlib import Path
+ from typing import Callable
+ from uuid import UUID
+
+ import fsspec
+ import numpy as np
+ import torch
+ from av import container
+ from numpy.typing import NDArray
+ from PIL import Image
+ from torch.utils.data import DataLoader, Dataset
+ from tqdm import tqdm
+
+ from lightly_studio.models.embedding_model import EmbeddingModelCreate
+ from lightly_studio.vendor.perception_encoder.vision_encoder import pe, transforms
+
+ from . import file_utils
+ from .embedding_generator import ImageEmbeddingGenerator, VideoEmbeddingGenerator
+
+ MODEL_NAME = "PE-Core-T16-384"
+ DEFAULT_VIDEO_CHANNEL = 0
+ MAX_BATCH_SIZE: int = 16
+ VIDEO_FRAMES_PER_SAMPLE: int = 8
+
+
+ # TODO(Jonas, 12/225): Move to a helper.
+ class _ImageFileDataset(Dataset[torch.Tensor]):
+     """Dataset wrapping image file paths and a preprocess function.
+
+     Used for efficient batched image loading and preprocessing.
+     """
+
+     def __init__(
+         self,
+         filepaths: list[str],
+         preprocess: Callable[[Image.Image], torch.Tensor],
+     ) -> None:
+         self.filepaths = filepaths
+         self.preprocess = preprocess
+
+     def __len__(self) -> int:
+         return len(self.filepaths)
+
+     def __getitem__(self, idx: int) -> torch.Tensor:
+         with fsspec.open(self.filepaths[idx], "rb") as file:
+             image = Image.open(file).convert("RGB")
+             return self.preprocess(image)
+
+
+ class _VideoFileDataset(Dataset[torch.Tensor]):
+     """Dataset wrapping video file paths and a preprocess function.
+
+     Used for efficient batched video loading and preprocessing.
+     """
+
+     def __init__(
+         self,
+         filepaths: list[str],
+         preprocess: Callable[[Image.Image], torch.Tensor],
+     ) -> None:
+         self.filepaths = filepaths
+         self.preprocess = preprocess
+
+     def __len__(self) -> int:
+         return len(self.filepaths)
+
+     def __getitem__(self, idx: int) -> torch.Tensor:
+         """Return a tensor [N C H W] for the idx-th video.
+
+         As in the original paper, we subsample N frames from the video and
+         stack them into a tensor, using a default of 8 frames per video
+         (VIDEO_FRAMES_PER_SAMPLE). Note: the video length in the paper was
+         16.7 +/- 9.8 sec, so for longer videos we might consider alternative
+         models or more frames.
+         """
+         video_path = self.filepaths[idx]
+         frames = self._load_frames(video_path)
+         if not frames:
+             raise ValueError(f"Unable to read frames from video '{video_path}'.")
+
+         processed_frames = [self.preprocess(frame) for frame in frames]
+         return torch.stack(processed_frames)
+
+     def _load_frames(self, video_path: str) -> list[Image.Image]:
+         """Sample uniformly spaced frames and return them as PIL images.
+
+         Using seek for sampling is fast; however, it may yield slightly
+         different results on different operating systems (known issue:
+         macOS vs Linux). The alternative, decoding frame-by-frame, is
+         OS-independent but comes with a performance drop.
+         """
+         fs, fs_path = fsspec.core.url_to_fs(url=video_path)
+         with fs.open(path=fs_path, mode="rb") as video_file, container.open(
+             file=video_file
+         ) as video_container:
+             video_stream = video_container.streams.video[DEFAULT_VIDEO_CHANNEL]
+             duration_pts = video_stream.duration
+             raw_time_base = video_stream.time_base
+             # Guard against missing stream metadata before converting to float.
+             if duration_pts is None or raw_time_base is None:
+                 return []
+             time_base = float(raw_time_base)
+             if duration_pts <= 0 or time_base <= 0.0:
+                 return []
+
+             duration_seconds = duration_pts * time_base
+
+             # Sample VIDEO_FRAMES_PER_SAMPLE timestamps evenly spaced inside
+             # [0, duration_seconds).
+             ts_to_sample = np.linspace(
+                 0.0,
+                 duration_seconds,
+                 num=VIDEO_FRAMES_PER_SAMPLE,
+                 endpoint=False,
+                 dtype=np.float64,
+             )
+
+             frames: list[Image.Image] = []
+             for ts_target in ts_to_sample:
+                 pts_target = int(ts_target / time_base)
+                 video_container.seek(offset=pts_target, stream=video_stream)
+                 # The decoder may be exhausted after a seek near the end of the stream.
+                 frame = next(video_container.decode(video=DEFAULT_VIDEO_CHANNEL), None)
+                 if frame is None:
+                     continue
+                 frames.append(frame.to_image())
+
+             return frames
+
+
+ class PerceptionEncoderEmbeddingGenerator(ImageEmbeddingGenerator, VideoEmbeddingGenerator):
+     """Perception Encoder Core embedding model."""
+
+     def __init__(self) -> None:
+         """Initialize the Perception Encoder Core embedding model.
+
+         This method loads the Perception Encoder Core model and its tokenizer.
+         The model checkpoint is downloaded and cached locally for future use.
+         """
+         self._model, model_path = pe.CLIP.from_config(MODEL_NAME, pretrained=True)
+         self._preprocess = transforms.get_image_transform(self._model.image_size)
+         self._tokenizer = transforms.get_text_tokenizer(self._model.context_length)
+
+         # Auto-select device: CUDA > MPS (Apple Silicon) > CPU.
+         self._device = torch.device(
+             "cuda"
+             if torch.cuda.is_available()
+             else "mps"
+             if torch.backends.mps.is_available()
+             else "cpu"
+         )
+         self._model = self._model.to(self._device)
+         self._model_hash = file_utils.get_file_xxhash(Path(model_path))
+
+     def get_embedding_model_input(self, dataset_id: UUID) -> EmbeddingModelCreate:
+         """Generate an EmbeddingModelCreate instance.
+
+         Args:
+             dataset_id: The ID of the dataset.
+
+         Returns:
+             An EmbeddingModelCreate instance with the model details.
+         """
+         return EmbeddingModelCreate(
+             name=MODEL_NAME,
+             embedding_model_hash=self._model_hash,
+             embedding_dimension=self._model.output_dim,
+             dataset_id=dataset_id,
+         )
+
+     def embed_text(self, text: str) -> list[float]:
+         """Embed a text with Perception Encoder.
+
+         Args:
+             text: The text to embed.
+
+         Returns:
+             A list of floats representing the generated embedding.
+         """
+         tokenized = self._tokenizer([text]).to(self._device)
+         with torch.no_grad():
+             embedding = self._model.encode_text(tokenized, normalize=True)[0]
+         # Convert the embedding to a list of floats.
+         embedding_list: list[float] = embedding.cpu().numpy().flatten().tolist()
+         return embedding_list
+
+     def embed_images(self, filepaths: list[str]) -> NDArray[np.float32]:
+         """Embed images with Perception Encoder.
+
+         Args:
+             filepaths: A list of file paths to the images to embed.
+
+         Returns:
+             A numpy array representing the generated embeddings,
+             in the same order as the input file paths.
+         """
+         total_images = len(filepaths)
+         if not total_images:
+             return np.empty((0, self._model.output_dim), dtype=np.float32)
+
+         dataset = _ImageFileDataset(filepaths, self._preprocess)
+
+         # To avoid issues with db locking and multiprocessing we set the
+         # number of workers to 0 (no multiprocessing). The DataLoader is
+         # still very useful for batching and async prefetching of images.
+         loader = DataLoader(
+             dataset,
+             batch_size=MAX_BATCH_SIZE,
+             num_workers=0,  # must be 0 to avoid multiprocessing issues
+         )
+
+         embeddings = np.empty((total_images, self._model.output_dim), dtype=np.float32)
+         position = 0
+         with tqdm(
+             total=total_images, desc="Generating embeddings", unit=" images"
+         ) as progress_bar, torch.no_grad():
+             for images_tensor in loader:
+                 imgs = images_tensor.to(self._device, non_blocking=True)
+                 batch_embeddings = self._model.encode_image(imgs, normalize=True).cpu().numpy()
+                 batch_size = imgs.size(0)
+                 embeddings[position : position + batch_size] = batch_embeddings
+                 position += batch_size
+                 progress_bar.update(batch_size)
+
+         return embeddings
+
+     def embed_videos(self, filepaths: list[str]) -> NDArray[np.float32]:
+         """Embed videos with Perception Encoder.
+
+         Args:
+             filepaths: A list of file paths to the videos to embed.
+
+         Returns:
+             A numpy array representing the generated embeddings,
+             in the same order as the input file paths.
+         """
+         total_videos = len(filepaths)
+         if not total_videos:
+             return np.empty((0, self._model.output_dim), dtype=np.float32)
+
+         dataset = _VideoFileDataset(filepaths, self._preprocess)
+
+         # To avoid issues with db locking and multiprocessing we set the
+         # number of workers to 0 (no multiprocessing). The DataLoader is
+         # still very useful for batching and async prefetching of videos.
+         loader = DataLoader(
+             dataset,
+             batch_size=MAX_BATCH_SIZE,
+             num_workers=0,  # must be 0 to avoid multiprocessing issues
+         )
+
+         embeddings = np.empty((total_videos, self._model.output_dim), dtype=np.float32)
+         position = 0
+         with tqdm(
+             total=total_videos, desc="Generating embeddings", unit=" videos"
+         ) as progress_bar, torch.no_grad():
+             for videos_tensor in loader:
+                 videos = videos_tensor.to(self._device, non_blocking=True)
+                 batch_embeddings = self._model.encode_video(videos, normalize=True).cpu().numpy()
+                 batch_size = videos.size(0)
+                 embeddings[position : position + batch_size] = batch_embeddings
+                 position += batch_size
+                 progress_bar.update(batch_size)
+
+         return embeddings
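
To make the frame sampling in `_load_frames` concrete: for a 10-second video, np.linspace(0.0, 10.0, num=8, endpoint=False) yields the timestamps 0.0, 1.25, 2.5, ..., 8.75 seconds, each of which is divided by the stream time base to obtain a presentation timestamp to seek to. A minimal end-to-end sketch (the video path is an illustrative placeholder):

    from lightly_studio.dataset.perception_encoder_embedding_generator import (
        PerceptionEncoderEmbeddingGenerator,
    )

    generator = PerceptionEncoderEmbeddingGenerator()
    video_embeddings = generator.embed_videos(["clip.mp4"])
    # One row per video; the width is the model's output_dim.
    print(video_embeddings.shape)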