PyPI - lightly-studio - Versions diffs - 0.3.1__py3-none-any.whl - Mend

lightly-studio 0.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of lightly-studio might be problematic. Click here for more details.

Files changed (219) hide show

lightly_studio/dataset/edge_embedding_generator.py ADDED Viewed

@@ -0,0 +1,144 @@
+"""EdgeCLIP embedding generator."""
+from __future__ import annotations
+from collections.abc import Sequence
+from pathlib import Path
+from typing import Tuple
+from uuid import UUID
+import cv2
+from lightly_edge_sdk import (
+    InferenceDeviceType,
+    LightlyEdge,
+    LightlyEdgeConfig,
+    LightlyEdgeDetectorConfig,
+)
+from torch.utils.data import DataLoader, Dataset
+from tqdm import tqdm
+from lightly_studio.models.embedding_model import EmbeddingModelCreate
+from .embedding_generator import EmbeddingGenerator
+# Number of images the DataLoader in EdgeSDKEmbeddingGenerator.embed_images
+# groups into one batch. Frames are handed to the SDK one at a time via
+# embed_frame_rgb_bytes.
+MAX_BATCH_SIZE: int = 1
+class _ImageFileDatasetEdge(Dataset[Tuple[bytes, int, int]]):
+    """Dataset that loads image files as raw RGB byte buffers.
+    Each item is a ``(rgb_bytes, width, height)`` tuple, where ``rgb_bytes``
+    is ``tobytes()`` of the image array after BGR -> RGB conversion.
+    """
+    def __init__(
+        self,
+        filepaths: Sequence[Path],
+    ) -> None:
+        """Store the image paths.
+        Args:
+            filepaths: Paths of the image files to load, in index order.
+        """
+        self.filepaths = filepaths
+    def __len__(self) -> int:
+        """Return the number of image paths."""
+        return len(self.filepaths)
+    def __getitem__(self, idx: int) -> tuple[bytes, int, int]:
+        """Load one image and return its RGB bytes and dimensions.
+        Args:
+            idx: Index into ``filepaths``.
+        Returns:
+            A ``(rgb_bytes, width, height)`` tuple.
+        Raises:
+            OSError: If OpenCV cannot read or decode the file.
+        """
+        filepath = self.filepaths[idx]
+        bgr_image = cv2.imread(str(filepath))
+        # cv2.imread reports failure by returning None instead of raising.
+        # Fail here with the offending path rather than letting cvtColor
+        # raise an error that does not mention the file.
+        if bgr_image is None:
+            raise OSError(f"Could not read image file: {filepath}")
+        rgb_image = cv2.cvtColor(bgr_image, cv2.COLOR_BGR2RGB)
+        height, width, _ = rgb_image.shape
+        return rgb_image.tobytes(), width, height
+class EdgeSDKEmbeddingGenerator(EmbeddingGenerator):
+    """Embedding generator using Edge SDK runtime."""
+    def __init__(self, model_path: str) -> None:
+        """Initialize the LightlyEdge object.
+        Args:
+            model_path: Path to the model tar file.
+        """
+        # Initialize the LightlyEdge SDK.
+        config = _create_edge_config()
+        self.lightly_edge = LightlyEdge(
+            path=model_path,
+            config=config,
+        )
+        # Cache the model details reported by the SDK; they are used to
+        # describe the model in get_embedding_model_input.
+        model_config = self.lightly_edge.get_image_model_config()
+        self._model_hash = model_config.model_hash
+        self._embedding_size = model_config.embedding_size
+        self._model_name = model_config.model_name
+    def get_embedding_model_input(self, dataset_id: UUID) -> EmbeddingModelCreate:
+        """Generate an EmbeddingModelCreate instance.
+        Args:
+            dataset_id: The ID of the dataset.
+        Returns:
+            An EmbeddingModelCreate instance with the model details.
+        """
+        return EmbeddingModelCreate(
+            name=self._model_name,
+            embedding_model_hash=self._model_hash,
+            embedding_dimension=self._embedding_size,
+            dataset_id=dataset_id,
+        )
+    def embed_text(self, text: str) -> list[float]:
+        """Embed a text with EdgeCLIP.
+        Args:
+            text: The text to embed.
+        Returns:
+            A list of floats representing the generated embedding, or an
+            empty list if the SDK returned no embedding.
+        """
+        embeddings = self.lightly_edge.embed_texts([text])
+        if len(embeddings):
+            return embeddings[0]
+        return []
+    def embed_images(self, filepaths: list[Path]) -> list[list[float]]:
+        """Embed images with EdgeSDK.
+        Args:
+            filepaths: A list of file paths to the images to embed.
+        Returns:
+            A list of lists of floats representing the generated embeddings,
+            one per input file path and in the same order.
+        """
+        dataset = _ImageFileDatasetEdge(filepaths)
+        loader = DataLoader(
+            dataset,
+            batch_size=MAX_BATCH_SIZE,
+            num_workers=0,
+            pin_memory=True,
+        )
+        embeddings_list: list[list[float]] = []
+        with tqdm(
+            total=len(filepaths), desc="Generating embeddings", unit=" images"
+        ) as progress_bar:
+            for rgb_bytes_batch, widths, heights in loader:
+                # Walk every item of the batch rather than only index 0, so
+                # no image is skipped if MAX_BATCH_SIZE is raised above 1.
+                for rgb_bytes, width, height in zip(rgb_bytes_batch, widths, heights):
+                    embedding = self.lightly_edge.embed_frame_rgb_bytes(
+                        rgb_bytes=rgb_bytes,
+                        width=width.item(),
+                        height=height.item(),
+                    )
+                    embeddings_list.append(embedding)
+                    progress_bar.update(1)
+        return embeddings_list
+def _create_edge_config() -> LightlyEdgeConfig:
+    """Build the LightlyEdge configuration used by the embedding generator.
+    Starts from the SDK default configuration, sets the inference device
+    type to ``Auto`` and installs a detector configuration with the object
+    detector and classifiers disabled.
+    Returns:
+        The configured LightlyEdgeConfig instance.
+    """
+    edge_config = LightlyEdgeConfig.default()
+    edge_config.inference_device_type = InferenceDeviceType.Auto
+    detector_settings = LightlyEdgeDetectorConfig(
+        object_detector_enable=False,
+        classifiers_enable=False,
+        max_classifications=0,
+    )
+    edge_config.detector_config = detector_settings
+    return edge_config

lightly_studio/dataset/embedding_generator.py ADDED Viewed

@@ -0,0 +1,91 @@
+"""EmbeddingGenerator implementations."""
+from __future__ import annotations
+import random
+from pathlib import Path
+from typing import Protocol, runtime_checkable
+from uuid import UUID
+from lightly_studio.models.embedding_model import EmbeddingModelCreate
+@runtime_checkable
+class EmbeddingGenerator(Protocol):
+    """Structural interface shared by all embedding generators.
+    Any object offering the three methods below satisfies this protocol;
+    how the embeddings are computed is up to the concrete implementation.
+    """
+    def get_embedding_model_input(self, dataset_id: UUID) -> EmbeddingModelCreate:
+        """Describe this model as an EmbeddingModelCreate instance.
+        Args:
+            dataset_id: The ID of the dataset.
+        Returns:
+            An EmbeddingModelCreate instance with the model details.
+        """
+        ...
+    def embed_text(self, text: str) -> list[float]:
+        """Compute the embedding of a single text.
+        Args:
+            text: The text to embed.
+        Returns:
+            The embedding as a list of floats.
+        """
+        ...
+    def embed_images(self, filepaths: list[Path]) -> list[list[float]]:
+        """Compute the embeddings of several images.
+        TODO(Michal, 04/2025): Use DatasetLoader as input instead.
+        Args:
+            filepaths: A list of file paths to the images to embed.
+        Returns:
+            One embedding (a list of floats) per image, in the same order
+            as the input file paths.
+        """
+        ...
+class RandomEmbeddingGenerator(EmbeddingGenerator):
+    """Embedding generator that returns random vectors of a fixed dimension."""
+    def __init__(self, dimension: int = 3):
+        """Set up the generator.
+        Args:
+            dimension: Length of every embedding vector that is produced.
+        """
+        self._dimension = dimension
+    def get_embedding_model_input(self, dataset_id: UUID) -> EmbeddingModelCreate:
+        """Describe this model as an EmbeddingModelCreate instance.
+        Args:
+            dataset_id: The ID of the dataset.
+        Returns:
+            An EmbeddingModelCreate instance named "Random" carrying the
+            configured dimension.
+        """
+        model_details = EmbeddingModelCreate(
+            name="Random",
+            embedding_model_hash="random_model",
+            embedding_dimension=self._dimension,
+            dataset_id=dataset_id,
+        )
+        return model_details
+    def _random_vector(self) -> list[float]:
+        """Draw one vector of ``self._dimension`` values from random.random()."""
+        return [random.random() for _ in range(self._dimension)]
+    def embed_text(self, _text: str) -> list[float]:
+        """Return a random embedding; the text itself is ignored."""
+        return self._random_vector()
+    def embed_images(self, filepaths: list[Path]) -> list[list[float]]:
+        """Return one random embedding per file path; files are not read."""
+        return [self._random_vector() for _ in filepaths]

lightly_studio/dataset/embedding_manager.py ADDED Viewed

@@ -0,0 +1,163 @@
+"""Embedding manager for dataset processing."""
+from __future__ import annotations
+from dataclasses import dataclass
+from pathlib import Path
+from uuid import UUID
+from sqlmodel import Session
+from lightly_studio.dataset.embedding_generator import EmbeddingGenerator
+from lightly_studio.models.embedding_model import EmbeddingModelTable
+from lightly_studio.models.sample_embedding import SampleEmbeddingCreate
+from lightly_studio.resolvers import (
+    embedding_model_resolver,
+    sample_embedding_resolver,
+    sample_resolver,
+)
+class EmbeddingManagerProvider:
+    """Holds the process-wide EmbeddingManager and creates it on demand."""
+    # Cached manager; stays None until the first get_embedding_manager() call.
+    _instance: EmbeddingManager | None = None
+    @classmethod
+    def get_embedding_manager(cls) -> EmbeddingManager:
+        """Return the shared EmbeddingManager, creating it on first use.
+        Returns:
+            The singleton instance of EmbeddingManager.
+        """
+        manager = cls._instance
+        if manager is None:
+            manager = EmbeddingManager()
+            cls._instance = manager
+        return manager
+@dataclass
+class TextEmbedQuery:
+    """Parameters for text embedding generation."""
+    # The text to embed.
+    text: str
+    # ID of the embedding model to use; EmbeddingManager.embed_text falls back
+    # to its default model when this is None.
+    embedding_model_id: UUID | None = None
+class EmbeddingManager:
+    """Manages embedding models and handles embedding generation and storage."""
+    def __init__(self) -> None:
+        """Initialize the embedding manager."""
+        # Registered generators, keyed by the ID of their database record.
+        self._models: dict[UUID, EmbeddingGenerator] = {}
+        # Model used when a call does not name one explicitly.
+        self._default_model_id: UUID | None = None
+    def register_embedding_model(
+        self,
+        session: Session,
+        dataset_id: UUID,
+        embedding_generator: EmbeddingGenerator,
+        set_as_default: bool = False,
+    ) -> EmbeddingModelTable:
+        """Register an embedding model in the database.
+        The model is stored in an internal dictionary for later use.
+        The model is set as default if requested or if it's the first model.
+        Args:
+            session: Database session for resolver operations.
+            dataset_id: The ID of the dataset to associate with the model.
+            embedding_generator: The model implementation used for embeddings.
+            set_as_default: Whether to set this model as the default.
+        Returns:
+            The created EmbeddingModel.
+        """
+        # Create embedding model record in the database.
+        created_model = embedding_model_resolver.create(
+            session=session,
+            embedding_model=embedding_generator.get_embedding_model_input(dataset_id=dataset_id),
+        )
+        model_id = created_model.embedding_model_id
+        # Store the model in our dictionary
+        self._models[model_id] = embedding_generator
+        # Set as default if requested or if it's the first model
+        if set_as_default or self._default_model_id is None:
+            self._default_model_id = model_id
+        return created_model
+    def embed_text(self, text_query: TextEmbedQuery) -> list[float]:
+        """Generate an embedding for a text sample.
+        Args:
+            text_query: Text embedding query containing text and model ID.
+        Returns:
+            A list of floats representing the generated embedding.
+        Raises:
+            ValueError: If no model is specified and no default is set, or
+            if the model ID is not registered.
+        """
+        model_id = text_query.embedding_model_id or self._default_model_id
+        if model_id is None:
+            raise ValueError("No embedding model specified and no default model set.")
+        model = self._models.get(model_id)
+        if model is None:
+            raise ValueError(f"Embedding model with ID {model_id} not found.")
+        return model.embed_text(text_query.text)
+    def embed_images(
+        self,
+        session: Session,
+        sample_ids: list[UUID],
+        embedding_model_id: UUID | None = None,
+    ) -> None:
+        """Generate and store embeddings for samples.
+        Args:
+            session: Database session for resolver operations.
+            sample_ids: List of sample IDs to generate embeddings for.
+            embedding_model_id: ID of the model to use. Uses default if None.
+        Raises:
+            ValueError: If no embedding model is registered, the provided
+            model ID doesn't exist, or the model returns a different number
+            of embeddings than there are sample IDs.
+            KeyError: If a sample ID is not among the samples returned by
+            the sample resolver.
+        """
+        model_id = embedding_model_id or self._default_model_id
+        if model_id is None:
+            raise ValueError("No default embedding model registered.")
+        if model_id not in self._models:
+            raise ValueError(f"No embedding model found with ID {model_id}")
+        # Query image filenames from the database.
+        samples = sample_resolver.get_many_by_id(
+            session=session,
+            sample_ids=sample_ids,
+        )
+        sample_id_to_filepath = {
+            sample.sample_id: Path(sample.file_path_abs) for sample in samples
+        }
+        # Extract filepaths in the same order as sample_ids.
+        filepaths = [sample_id_to_filepath[sample_id] for sample_id in sample_ids]
+        # Generate embeddings for the samples.
+        embeddings = self._models[model_id].embed_images(filepaths=filepaths)
+        # zip() below would silently drop the surplus on a length mismatch
+        # and store an incomplete set of embeddings, so fail loudly instead.
+        if len(embeddings) != len(sample_ids):
+            raise ValueError(
+                f"Embedding model {model_id} returned {len(embeddings)} embeddings "
+                f"for {len(sample_ids)} samples."
+            )
+        # Convert to SampleEmbeddingCreate objects.
+        sample_embeddings = [
+            SampleEmbeddingCreate(
+                sample_id=sample_id,
+                embedding_model_id=model_id,
+                embedding=embedding,
+            )
+            for sample_id, embedding in zip(sample_ids, embeddings)
+        ]
+        # Store the embeddings in the database.
+        sample_embedding_resolver.create_many(session=session, sample_embeddings=sample_embeddings)

lightly_studio/dataset/env.py ADDED Viewed

@@ -0,0 +1,16 @@
+"""Initialize environment variables for the dataset module."""
+from environs import Env
+env = Env()
+# Read a .env file (if any) so the lookups below can see its values.
+env.read_env()
+# Selects the embedding model implementation; defaults to "MOBILE_CLIP".
+LIGHTLY_STUDIO_EMBEDDINGS_MODEL_TYPE: str = env.str(
+    "LIGHTLY_STUDIO_EMBEDDINGS_MODEL_TYPE", "MOBILE_CLIP"
+)
+# NOTE: read from EDGE_MODEL_PATH, which lacks the LIGHTLY_STUDIO_ prefix used
+# by the other variables. The name is kept so existing setups keep working.
+LIGHTLY_STUDIO_EDGE_MODEL_FILE_PATH: str = env.str("EDGE_MODEL_PATH", "./lightly_model.tar")
+LIGHTLY_STUDIO_PROTOCOL: str = env.str("LIGHTLY_STUDIO_PROTOCOL", "http")
+LIGHTLY_STUDIO_PORT: int = env.int("LIGHTLY_STUDIO_PORT", 8001)
+LIGHTLY_STUDIO_HOST: str = env.str("LIGHTLY_STUDIO_HOST", "localhost")
+# env.bool yields a bool, so the annotation is bool and the default is the
+# boolean False rather than the (truthy) string "false".
+LIGHTLY_STUDIO_DEBUG: bool = env.bool("LIGHTLY_STUDIO_DEBUG", False)
+# Base URL of the app, assembled from the protocol, host and port above.
+APP_URL = f"{LIGHTLY_STUDIO_PROTOCOL}://{LIGHTLY_STUDIO_HOST}:{LIGHTLY_STUDIO_PORT}"

lightly_studio/dataset/file_utils.py ADDED Viewed

@@ -0,0 +1,35 @@
+"""File manipulation utilities."""
+from __future__ import annotations
+import shutil
+from pathlib import Path
+import requests
+import xxhash
+def download_file_if_does_not_exist(url: str, local_filename: Path) -> None:
+    """Download a file from a URL if it does not already exist locally.
+    The response is streamed into a temporary sibling file that is renamed
+    to ``local_filename`` only once the download has completed, so a failed
+    or interrupted download never leaves behind a file that a later call
+    would mistake for a finished one.
+    Args:
+        url: URL to download from.
+        local_filename: Destination path of the downloaded file.
+    Raises:
+        requests.HTTPError: If the server answers with an error status code.
+    """
+    if local_filename.exists():
+        return
+    partial_file = local_filename.with_name(local_filename.name + ".part")
+    try:
+        with requests.get(url, stream=True) as r:
+            # Without this check the body of an error page would be saved
+            # as if it were the requested file.
+            r.raise_for_status()
+            with open(partial_file, "wb") as f:
+                # iter_content applies content decoding (gzip/deflate),
+                # which copying from r.raw does not do.
+                for chunk in r.iter_content(chunk_size=1024 * 1024):
+                    f.write(chunk)
+        partial_file.replace(local_filename)
+    finally:
+        # No-op after a successful rename; removes the leftover otherwise.
+        partial_file.unlink(missing_ok=True)
+def get_file_xxhash(file_path: Path) -> str:
+    """Return the 64-bit xxhash digest of a file's contents.
+    XXHash is a fast non-cryptographic hash function. The file is read in
+    8192-byte chunks, so it is never held in memory as a whole.
+    Args:
+        file_path: Path to the file.
+    Returns:
+        The xxhash of the file as a hexadecimal string.
+    """
+    digest = xxhash.xxh64()
+    with file_path.open("rb") as stream:
+        # iter() with a sentinel stops at the first empty read (end of file).
+        for block in iter(lambda: stream.read(8192), b""):
+            digest.update(block)
+    return digest.hexdigest()