PyPI - lightly-studio - Versions diffs - 0.4.6__py3-none-any.whl - Mend

lightly-studio 0.4.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (356) hide show

lightly_studio/resolvers/video_resolver/video_count_annotations_filter.py ADDED Viewed

@@ -0,0 +1,50 @@
+"""Utility functions for building database queries."""
+from typing import List, Optional
+from pydantic import BaseModel
+from sqlmodel import col, select
+from lightly_studio.models.annotation.annotation_base import AnnotationBaseTable
+from lightly_studio.models.annotation_label import AnnotationLabelTable
+from lightly_studio.models.video import VideoFrameTable, VideoTable
+from lightly_studio.resolvers.video_resolver.video_filter import VideoFilter
+from lightly_studio.type_definitions import QueryType
+class VideoCountAnnotationsFilter(BaseModel):
+    """Encapsulates filter parameters for querying video frame annotations counter."""
+    video_filter: Optional[VideoFilter] = None
+    video_frames_annotations_labels: Optional[List[str]] = None
+    def apply(self, query: QueryType) -> QueryType:
+        """Apply the filters to the given query."""
+        query = self._apply_annotations_label(query)
+        if self.video_filter:
+            query = self.video_filter.apply(query)
+        return query
+    def _apply_annotations_label(self, query: QueryType) -> QueryType:
+        if not self.video_frames_annotations_labels:
+            return query
+        frame_filtered_video_ids_subquery = (
+            select(VideoTable.sample_id)
+            .join(VideoTable.frames)
+            .join(
+                AnnotationBaseTable,
+                col(AnnotationBaseTable.parent_sample_id) == VideoFrameTable.sample_id,
+            )
+            .join(AnnotationBaseTable.annotation_label)
+            .where(
+                col(AnnotationLabelTable.annotation_label_name).in_(
+                    self.video_frames_annotations_labels or []
+                )
+            )
+            .distinct()
+        )
+        return query.where(col(VideoTable.sample_id).in_(frame_filtered_video_ids_subquery))

lightly_studio/resolvers/video_resolver/video_filter.py ADDED Viewed

@@ -0,0 +1,98 @@
+"""Utility functions for building database queries."""
+from typing import List, Optional
+from uuid import UUID
+from pydantic import BaseModel
+from sqlmodel import col, select
+from lightly_studio.models.annotation.annotation_base import AnnotationBaseTable
+from lightly_studio.models.range import FloatRange
+from lightly_studio.models.video import VideoFrameTable, VideoTable
+from lightly_studio.resolvers.image_filter import FilterDimensions
+from lightly_studio.resolvers.sample_resolver.sample_filter import SampleFilter
+from lightly_studio.type_definitions import QueryType
+class VideoFilter(BaseModel):
+    """Encapsulates filter parameters for querying videos."""
+    width: Optional[FilterDimensions] = None
+    height: Optional[FilterDimensions] = None
+    fps: Optional[FloatRange] = None
+    duration_s: Optional[FloatRange] = None
+    annotation_frames_label_ids: Optional[List[UUID]] = None
+    sample_filter: Optional[SampleFilter] = None
+    def apply(self, query: QueryType) -> QueryType:
+        """Apply the filters to the given query."""
+        query = self._apply_width_and_height_filters(query)
+        query = self._apply_fps_filters(query)
+        query = self._apply_duration_filters(query)
+        if self.annotation_frames_label_ids:
+            query = self._apply_annotations_ids(query)
+        if self.sample_filter:
+            query = self.sample_filter.apply(query)
+        return query
+    def _apply_width_and_height_filters(self, query: QueryType) -> QueryType:
+        if self.width:
+            if self.width.min is not None:
+                query = query.where(VideoTable.width >= self.width.min)
+            if self.width.max is not None:
+                query = query.where(VideoTable.width <= self.width.max)
+        if self.height:
+            if self.height.min is not None:
+                query = query.where(VideoTable.height >= self.height.min)
+            if self.height.max is not None:
+                query = query.where(VideoTable.height <= self.height.max)
+        return query
+    def _apply_fps_filters(self, query: QueryType) -> QueryType:
+        min_fps = self.fps.min if self.fps and self.fps.min is not None else None
+        max_fps = self.fps.max if self.fps and self.fps.max is not None else None
+        if min_fps is not None:
+            query = query.where(VideoTable.fps >= min_fps)
+        if max_fps is not None:
+            query = query.where(VideoTable.fps <= max_fps)
+        return query
+    def _apply_duration_filters(self, query: QueryType) -> QueryType:
+        min_duration_s = (
+            self.duration_s.min if self.duration_s and self.duration_s.min is not None else None
+        )
+        max_duration_s = (
+            self.duration_s.max if self.duration_s and self.duration_s.max is not None else None
+        )
+        if min_duration_s is not None:
+            query = query.where(col(VideoTable.duration_s) >= min_duration_s)
+        if max_duration_s is not None:
+            query = query.where(col(VideoTable.duration_s) <= max_duration_s)
+        return query
+    def _apply_annotations_ids(self, query: QueryType) -> QueryType:
+        frame_filtered_video_ids_subquery = (
+            select(VideoTable.sample_id)
+            .join(VideoTable.frames)
+            .join(
+                AnnotationBaseTable,
+                col(AnnotationBaseTable.parent_sample_id) == VideoFrameTable.sample_id,
+            )
+            .where(
+                col(AnnotationBaseTable.annotation_label_id).in_(
+                    self.annotation_frames_label_ids or []
+                )
+            )
+            .distinct()
+        )
+        return query.where(col(VideoTable.sample_id).in_(frame_filtered_video_ids_subquery))

lightly_studio/selection/__init__.py ADDED Viewed

	@@ -0,0 +1 @@
1	+ """Selection package."""

lightly_studio/selection/mundig.py ADDED Viewed

@@ -0,0 +1,143 @@
+"""Python interface to the Mundig selection algorithm."""
+from __future__ import annotations
+from typing import Iterable
+# TODO(Malte, 08/2025): About the type ignore:
+# Use pyo3 typing stubs once they are implemented.
+# See https://github.com/PyO3/pyo3/issues/510
+# Or remove the type ignore once typing stubs were added manually.
+import lightly_mundig  # type: ignore[import-untyped]
+import numpy as np
+from lightly_studio.dataset.env import LIGHTLY_STUDIO_LICENSE_KEY
+class Mundig:
+    """Python interface for the Mundig selection algorithm.
+    This class provides a Python interface to the lightly_mundig Rust library
+    for sample selection. It allows combining different selection strategies
+    such as diversity and weighting.
+    """
+    def __init__(self) -> None:
+        """Initialize the Mundig selection interface."""
+        if LIGHTLY_STUDIO_LICENSE_KEY is None:
+            raise ValueError(
+                "LIGHTLY_STUDIO_LICENSE_KEY environment variable is not set. "
+                "Please set it to your LightlyStudio license key."
+            )
+        self.mundig = lightly_mundig.Selection(token=LIGHTLY_STUDIO_LICENSE_KEY)
+        self.n_input_samples: int | None = None
+    def run(self, n_samples: int) -> list[int]:
+        """Run the selection algorithm and return selected sample indices.
+        Args:
+            n_samples: The number of samples to select.
+        Returns:
+            A list of indices of the selected samples.
+        """
+        selected: list[int] = self.mundig.run_selection(
+            n_total_samples=self.n_input_samples, n_samples_to_select=n_samples
+        )
+        return selected
+    def add_diversity(self, embeddings: Iterable[Iterable[float]], strength: float = 1.0) -> None:
+        """Add diversity-based selection using sample embeddings.
+        Args:
+            embeddings:
+                The embeddings of each sample.
+                First dimension is over the samples, the second dimension is
+                the embedding size. The embedding size must be the same for
+                all samples.
+            strength:
+                The strength of the diversity strategy.
+        """
+        # Convert to ndarray with float32 dtype if not already
+        if isinstance(embeddings, np.ndarray) and embeddings.dtype == np.float32:
+            embeddings_ndarray = embeddings
+        else:
+            embeddings_ndarray = np.array(embeddings, dtype=np.float32)
+        self._check_consistent_input_size(embeddings_ndarray.shape[0])
+        self.mundig.add_diversifying_strategy(embeddings=embeddings_ndarray, strength=strength)
+    def add_weighting(self, weights: Iterable[float], strength: float = 1.0) -> None:
+        """Add a weighting strategy.
+        Args:
+            weights:
+                The weight or importance or utility of each sample.
+            strength:
+                The strength of the weighting strategy.
+        """
+        weights_ndarray = np.array(weights, dtype=np.float32)
+        self._check_consistent_input_size(weights_ndarray.shape[0])
+        self.mundig.add_weighting_strategy(weights=weights_ndarray, strength=strength)
+    def add_class_balancing(
+        self,
+        class_distributions: Iterable[Iterable[float]],
+        target: Iterable[float],
+        strength: float = 1.0,
+    ) -> None:
+        """Add a class balancing strategy.
+        This strategy aims to select a subset of samples such that the
+        distribution of classes in the subset is close to the target
+        distribution.
+        Args:
+            class_distributions:
+                First dimension is over all samples, second one is the distribution per sample over
+                the classes.
+            target:
+                The desired target distribution for the classes in the selected subset of samples.
+                The length of the target must match the number of classes in the class
+                distributions.
+            strength:
+                The strength of the balancing strategy.
+        """
+        # Convert to ndarray with float32 dtype if not already
+        if isinstance(class_distributions, np.ndarray) and class_distributions.dtype == np.float32:
+            class_distributions_nparray = class_distributions
+        else:
+            class_distributions_nparray = np.array(class_distributions, dtype=np.float32)
+        self._check_consistent_input_size(class_distributions_nparray.shape[0])
+        target_nparray = np.array(target, dtype=np.float32)
+        if class_distributions_nparray.shape[1] != target_nparray.shape[0]:
+            raise ValueError(
+                f"The length of 'target' {target_nparray.shape[0]} doesn't match the width of "
+                f"'class_distributions': {class_distributions_nparray.shape[0]}"
+            )
+        self.mundig.add_balancing_strategy(
+            class_distributions=class_distributions_nparray,
+            target=target_nparray,
+            strength=strength,
+        )
+    def _check_consistent_input_size(self, n_input_samples_strategy: int) -> None:
+        """Assert that input samples count is consistent across strategies.
+        Args:
+            n_input_samples_strategy:
+                The number of input samples in the currently added strategy.
+        Raises:
+            ValueError:
+                If the number of input samples in the new strategy differs
+                from the one used in previous strategies.
+        """
+        if self.n_input_samples is None:
+            self.n_input_samples = n_input_samples_strategy
+        elif self.n_input_samples != n_input_samples_strategy:
+            raise ValueError(
+                f"Expected {self.n_input_samples} input samples, "
+                f"but the latest strategy passed {n_input_samples_strategy}."
+            )

lightly_studio/selection/select.py ADDED Viewed

@@ -0,0 +1,203 @@
+"""Provides the user python interface to selection bound to sample ids."""
+from __future__ import annotations
+from collections.abc import Iterable
+from typing import Final, Literal
+from uuid import UUID
+from sqlmodel import Session
+from lightly_studio.selection.select_via_db import select_via_database
+from lightly_studio.selection.selection_config import (
+    AnnotationClassBalancingStrategy,
+    AnnotationClassToTarget,
+    EmbeddingDiversityStrategy,
+    MetadataWeightingStrategy,
+    SelectionConfig,
+    SelectionStrategy,
+)
+class Selection:
+    """Smart selection interface.
+    The `Selection` class allows to select a subset of samples from a given set of input
+    samples. There are many different strategies to select samples, e.g. diversity based
+    on embeddings or weighting based on numeric metadata. Multiple strategies can be
+    combined to form more complex selection strategies.
+    The result of a selection is stored as a tag on the selected samples in the database.
+    The `selection_result_tag_name` must be a unique tag name that is not used yet.
+    # Creation of a Selection instance.
+    Creation of an instance of this is easiest via the `DatasetQuery` class. By using
+    a `match()` first, the samples to select from can be filtered down.
+    ```python
+    from lightly_studio.core.dataset_query import SampleField
+    # Select from all samples in the dataset.
+    selection = dataset.query().selection()
+    # Select only from samples with width < 256.
+    query_narrow_images = dataset.query().match(SampleField.width < 256)
+    selection_among_narrow_images = query_narrow_images.selection()
+    ```
+    See the `DatasetQuery.match()` documentation for more information on filtering.
+    By creating the `Selection` instance, the query is executed. Further changes to the
+    query do not affect the selection instance.
+    # Performing single-strategy selections.
+    Once a `Selection` instance is created, different selection strategies can be
+    applied to select samples. Single-strategy selections are performed by calling
+    the respective method on the `Selection` instance. All methods take the number of
+    samples to select and a tag name for the selection result as mandatory arguments.
+    ```python
+    # Select 100 diverse samples based on embeddings
+    selection.diverse(
+        n_samples_to_select=100,
+        selection_result_tag_name="diverse selection",
+    )
+    # Select 50 samples weighted by numeric metadata "difficulty"
+    selection.metadata_weighting(
+        n_samples_to_select=50,
+        selection_result_tag_name="weighted selection",
+        metadata_key="difficulty",
+    )
+    # Select 100 samples with balanced annotation classes (e.g. uniform distribution)
+    selection.annotation_balancing(
+        n_samples_to_select=100,
+        selection_result_tag_name="balanced selection",
+        target_distribution="uniform",
+    )
+    ```
+    # Performing multi-strategy selections.
+    More complex selection strategies can be formed by combining multiple selection
+    strategies. This is done via the `multi_strategies()` method, which takes a
+    list of selection strategies as an argument.
+    ```python
+    from lightly_studio.selection.selection_config import (
+        EmbeddingDiversityStrategy,
+        MetadataWeightingStrategy
+    )
+    # Select 75 samples that are diverse and weighted by "difficulty"
+    selection.multi_strategies(
+        n_samples_to_select=75,
+        selection_result_tag_name="diverse and weighted selection",
+        selection_strategies=[
+            EmbeddingDiversityStrategy(),
+            MetadataWeightingStrategy(metadata_key="difficulty"),
+        ],
+    )
+    ```
+    """
+    def __init__(
+        self,
+        dataset_id: UUID,
+        session: Session,
+        input_sample_ids: Iterable[UUID],
+    ) -> None:
+        """Create the selection interface.
+        Args:
+            dataset_id: Dataset in which the selection is performed.
+            session: Database session to resolve selection dependencies.
+            input_sample_ids: Candidate sample ids considered for selection.
+                The iterable is consumed immediately to capture a stable snapshot.
+        """
+        self._dataset_id: Final[UUID] = dataset_id
+        self._session: Final[Session] = session
+        self._input_sample_ids: list[UUID] = list(input_sample_ids)
+    def metadata_weighting(
+        self,
+        n_samples_to_select: int,
+        selection_result_tag_name: str,
+        metadata_key: str,
+    ) -> None:
+        """Select a subset based on numeric metadata weights.
+        Args:
+            n_samples_to_select: Number of samples to select.
+            selection_result_tag_name: Tag name for the selection result.
+            metadata_key: Metadata key used as weights (float or int values).
+        """
+        strategy = MetadataWeightingStrategy(metadata_key=metadata_key)
+        self.multi_strategies(
+            n_samples_to_select=n_samples_to_select,
+            selection_result_tag_name=selection_result_tag_name,
+            selection_strategies=[strategy],
+        )
+    def diverse(
+        self,
+        n_samples_to_select: int,
+        selection_result_tag_name: str,
+        embedding_model_name: str | None = None,
+    ) -> None:
+        """Select a diverse subset using embeddings.
+        Args:
+            n_samples_to_select: Number of samples to select.
+            selection_result_tag_name: Tag name for the selection result.
+            embedding_model_name: Optional embedding model name. If None, uses the only
+                available model or raises if multiple exist.
+        """
+        strategy = EmbeddingDiversityStrategy(embedding_model_name=embedding_model_name)
+        self.multi_strategies(
+            n_samples_to_select=n_samples_to_select,
+            selection_result_tag_name=selection_result_tag_name,
+            selection_strategies=[strategy],
+        )
+    def annotation_balancing(
+        self,
+        n_samples_to_select: int,
+        selection_result_tag_name: str,
+        target_distribution: AnnotationClassToTarget | Literal["uniform"] | Literal["input"],
+    ) -> None:
+        """Select a subset using annotation class balancing.
+        Args:
+            n_samples_to_select: Number of samples to select.
+            selection_result_tag_name: Tag name for the selection result.
+            target_distribution: Can be 'uniform', 'input',
+                or a dictionary mapping class names to target ratios.
+        """
+        strategy = AnnotationClassBalancingStrategy(target_distribution=target_distribution)
+        self.multi_strategies(
+            n_samples_to_select=n_samples_to_select,
+            selection_result_tag_name=selection_result_tag_name,
+            selection_strategies=[strategy],
+        )
+    def multi_strategies(
+        self,
+        n_samples_to_select: int,
+        selection_result_tag_name: str,
+        selection_strategies: list[SelectionStrategy],
+    ) -> None:
+        """Select a subset based on multiple strategies.
+        Args:
+            n_samples_to_select: Number of samples to select.
+            selection_result_tag_name: Tag name for the selection result.
+            selection_strategies: Strategies to compose for selection.
+        """
+        config = SelectionConfig(
+            dataset_id=self._dataset_id,
+            n_samples_to_select=n_samples_to_select,
+            selection_result_tag_name=selection_result_tag_name,
+            strategies=selection_strategies,
+        )
+        select_via_database(
+            session=self._session,
+            config=config,
+            input_sample_ids=self._input_sample_ids,
+        )