PyPI - lightly-studio - Versions diffs - 0.4.6__py3-none-any.whl - Mend

lightly-studio 0.4.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (356) hide show

lightly_studio/few_shot_classifier/random_forest_classifier.py ADDED Viewed

@@ -0,0 +1,495 @@
+"""RandomForest classifier implementations."""
+from __future__ import annotations
+import io
+import pickle
+from dataclasses import dataclass
+from datetime import datetime, timezone
+from pathlib import Path
+import numpy as np
+import sklearn  # type: ignore[import-untyped]
+from sklearn.ensemble import (  # type: ignore[import-untyped]
+    RandomForestClassifier,
+)
+from sklearn.tree import (  # type: ignore[import-untyped]
+    DecisionTreeClassifier,
+)
+from sklearn.utils import validation  # type: ignore[import-untyped]
+from typing_extensions import assert_never
+from .classifier import AnnotatedEmbedding, ExportType, FewShotClassifier
+# The version of the file format used for exporting and importing classifiers.
+# It is written into the export metadata, and both loaders in this module
+# raise ValueError when the stored version differs from this constant.
+# If the format changes, this version should be incremented.
+FILE_FORMAT_VERSION = "1.0.0"
+@dataclass
+class ModelExportMetadata:
+    """Metadata for exporting a model for traceability and reproducibility.
+    RandomForest.export() creates one instance per export and stores it
+    alongside the model data.
+    """
+    # Name of the classifier.
+    name: str
+    # Value of FILE_FORMAT_VERSION at export time; checked again on load.
+    file_format_version: str
+    # Kind of model; RandomForest.export() writes "RandomForest".
+    model_type: str
+    # Export time as an ISO 8601 string in UTC.
+    created_at: str
+    # Ordered class labels of the classifier.
+    class_names: list[str]
+    # Number of input features the fitted model expects (n_features_in_).
+    num_input_features: int
+    # Number of trees in the fitted forest.
+    num_estimators: int
+    # Hash and name of the model that produced the embeddings.
+    embedding_model_hash: str
+    embedding_model_name: str
+    # sklearn.__version__ at export time; the sklearn loader requires an exact match.
+    sklearn_version: str
+@dataclass
+class InnerNode:
+    """Inner (split) node of an exported decision tree.
+    Defaults are used for tree construction: _export_single_tree first appends
+    default-valued placeholders and replaces them with the real values in a
+    second pass.
+    """
+    # Position in the embedding of the feature compared at this node.
+    feature_index: int = 0
+    # Split value: an embedding whose feature is <= threshold goes to
+    # left_child, otherwise to right_child.
+    threshold: float = 0.0
+    # Child references. A value >= 0 is an index into ExportedTree.inner_nodes;
+    # a negative value v refers to ExportedTree.leaf_nodes[-v - 1].
+    left_child: int = 0
+    right_child: int = 0
+@dataclass
+class LeafNode:
+    """Leaf node of a decision tree."""
+    # Class probabilities returned when a prediction ends in this leaf.
+    # The exporter in this module writes one entry per class of the full class
+    # list, with 0.0 for classes that were not part of the training labels.
+    class_probabilities: list[float]
+@dataclass
+class ExportedTree:
+    """Exported tree structure."""
+    # Split nodes. When non-empty, index 0 is the node prediction starts at.
+    inner_nodes: list[InnerNode]
+    # Leaves, referenced from inner nodes through negative child indices.
+    # A tree without inner nodes is answered by leaf_nodes[0].
+    leaf_nodes: list[LeafNode]
+@dataclass
+class RandomForestExport:
+    """Datastructure for exporting the RandomForest model.
+    This is the object pickled by RandomForest.export(export_type="lightly").
+    """
+    # Export metadata (format version, class names, input size, ...).
+    metadata: ModelExportMetadata
+    # One exported tree per estimator of the forest.
+    trees: list[ExportedTree]
+class RandomForest(FewShotClassifier):
+    """RandomForest classifier.
+    Labels are encoded as their position in `classes` before fitting, so the
+    fitted model's `classes_` holds integer indices into `classes`.
+    """
+    def __init__(
+        self,
+        name: str,
+        classes: list[str],
+        embedding_model_name: str,
+        embedding_model_hash: str,
+    ) -> None:
+        """Initialize the RandomForestClassifier with predefined classes.
+        Args:
+            name: Name of the classifier.
+            classes: Ordered list of class labels that will be used for training
+                and predictions. The order of this list determines the order of
+                probability values in predictions.
+            embedding_model_name: Name of the model used for creating the
+                embeddings.
+            embedding_model_hash: Hash of the model used for creating the
+                embeddings.
+            Note: embedding_model_name and embedding_model_hash are used for
+            traceability in the exported model metadata.
+        Raises:
+            ValueError: If classes list is empty.
+        """
+        if not classes:
+            raise ValueError("Class list cannot be empty.")
+        # Fix the random seed for reproducibility.
+        self._model = RandomForestClassifier(class_weight="balanced", random_state=42)
+        self.name = name
+        self.classes = classes
+        # Label -> position in `classes`; used to encode training labels.
+        self._class_to_index = {label: idx for idx, label in enumerate(classes)}
+        self._embedding_model_name = embedding_model_name
+        self.embedding_model_hash = embedding_model_hash
+    def train(self, annotated_embeddings: list[AnnotatedEmbedding]) -> None:
+        """Trains a classifier using the provided input.
+        Args:
+            annotated_embeddings: A list of annotated embeddings to train the
+            classifier.
+        Raises:
+            ValueError: If annotated_embeddings is empty or contains invalid
+            classes.
+        """
+        if not annotated_embeddings:
+            raise ValueError("annotated_embeddings cannot be empty.")
+        embeddings = [ae.embedding for ae in annotated_embeddings]
+        labels = [ae.annotation for ae in annotated_embeddings]
+        # Validate that all labels are in predefined classes.
+        invalid_labels = set(labels) - set(self.classes)
+        if invalid_labels:
+            raise ValueError(f"Found labels not in predefined classes: {invalid_labels}")
+        # The model is fitted on class indices, not on the label strings.
+        labels_encoded = [self._class_to_index[label] for label in labels]
+        self._model.fit(np.array(embeddings), labels_encoded)
+    def predict(self, embeddings: list[list[float]]) -> list[list[float]]:
+        """Predicts the classification scores for a list of embeddings.
+        Args:
+            embeddings: A list of embeddings, where each embedding is a list of
+            floats.
+        Returns:
+            A list of lists, where each inner list represents the probability
+            distribution over classes for the corresponding input embedding.
+            Each inner list has one entry per class in `self.classes`, in that
+            order; classes that were not part of the training labels get 0.0.
+            If embeddings is empty, returns an empty list.
+        """
+        # len() instead of truthiness so array-like inputs are handled too.
+        if len(embeddings) == 0:
+            return []
+        raw_probabilities = self._model.predict_proba(np.array(embeddings))
+        # predict_proba has one column per class seen during training. The
+        # fitted classes_ are the encoded labels, i.e. column positions in the
+        # full class list, so the columns can be scattered in one assignment.
+        full_probabilities = np.zeros((len(raw_probabilities), len(self.classes)))
+        full_probabilities[:, self._model.classes_] = raw_probabilities
+        return full_probabilities.tolist()
+    def export(
+        self,
+        export_path: Path | None = None,
+        buffer: io.BytesIO | None = None,
+        export_type: ExportType = "sklearn",
+    ) -> None:
+        """Exports the classifier to a specified file.
+        The export is pickled to `buffer` if one is given, otherwise to
+        `export_path`. If neither is given, nothing is written.
+        Args:
+            export_path: The full file path where the export will be saved.
+                Missing parent directories are created.
+            buffer: A BytesIO buffer to save the export to. Takes precedence
+                over export_path.
+            export_type: The type of export. Options are:
+                "sklearn": Exports the RandomForestClassifier instance.
+                "lightly": Exports the model in raw format with metadata
+                and tree details.
+        """
+        metadata = ModelExportMetadata(
+            name=self.name,
+            file_format_version=FILE_FORMAT_VERSION,
+            model_type="RandomForest",
+            created_at=str(datetime.now(timezone.utc).isoformat()),
+            class_names=self.classes,
+            num_input_features=self._model.n_features_in_,
+            num_estimators=len(self._model.estimators_),
+            embedding_model_hash=self.embedding_model_hash,
+            embedding_model_name=self._embedding_model_name,
+            sklearn_version=sklearn.__version__,
+        )
+        payload: object
+        if export_type == "sklearn":
+            # Combine the model and metadata into a single dictionary.
+            payload = {
+                "model": self._model,
+                "metadata": metadata,
+            }
+        elif export_type == "lightly":
+            payload = _export_random_forest_model(
+                model=self._model,
+                metadata=metadata,
+                all_classes=self.classes,
+            )
+        else:
+            assert_never(export_type)
+        self._write_export(payload, export_path=export_path, buffer=buffer)
+    @staticmethod
+    def _write_export(
+        payload: object,
+        export_path: Path | None,
+        buffer: io.BytesIO | None,
+    ) -> None:
+        """Pickles `payload` to `buffer` if given, otherwise to `export_path`.
+        Does nothing when both destinations are None.
+        """
+        if buffer is not None:
+            pickle.dump(payload, buffer)
+        elif export_path is not None:
+            # Ensure parent dirs exist.
+            export_path.parent.mkdir(parents=True, exist_ok=True)
+            with export_path.open("wb") as f:
+                pickle.dump(payload, f)
+    def is_trained(self) -> bool:
+        """Checks if the classifier is trained.
+        Returns:
+            True if the classifier is trained, False otherwise.
+        """
+        try:
+            validation.check_is_fitted(self._model)
+        except sklearn.exceptions.NotFittedError:
+            return False
+        return True
+def load_random_forest_classifier(
+    classifier_path: Path | None, buffer: io.BytesIO | None
+) -> RandomForest:
+    """Loads a RandomForest classifier from a file or a buffer.
+    Only "sklearn" exports (a pickled dict with "model" and "metadata") can
+    be loaded with this function.
+    Args:
+        classifier_path: The path to the exported classifier file.
+        buffer: A BytesIO buffer containing the exported classifier.
+    If both path and buffer are provided, the path will be used.
+    Returns:
+        A fully initialized RandomForest classifier instance.
+    Raises:
+        FileNotFoundError: If the classifier_path does not exist.
+        ValueError: If neither classifier_path nor buffer is provided, if the
+                    file is not a valid 'sklearn' pickled export or if the
+                    version/format mismatches.
+    """
+    # NOTE(security): pickle.load can execute arbitrary code while loading.
+    # Only load exports that come from a trusted source.
+    if classifier_path is not None:
+        if not classifier_path.exists():
+            raise FileNotFoundError(f"The file {classifier_path} does not exist.")
+        with open(classifier_path, "rb") as f:
+            export_data = pickle.load(f)
+    elif buffer is not None:
+        export_data = pickle.load(buffer)
+    else:
+        raise ValueError("Either classifier_path or buffer must be provided.")
+    # Anything but a dict (e.g. a "lightly" export) is not an sklearn export.
+    if not isinstance(export_data, dict):
+        raise ValueError("The loaded file does not contain a valid model or metadata.")
+    model = export_data.get("model")
+    metadata: ModelExportMetadata = export_data.get("metadata")
+    if model is None or metadata is None:
+        raise ValueError("The loaded file does not contain a valid model or metadata.")
+    if metadata.file_format_version != FILE_FORMAT_VERSION:
+        raise ValueError(
+            f"File format version mismatch. Expected '{FILE_FORMAT_VERSION}', "
+            f"got '{metadata.file_format_version}'."
+        )
+    # The pickled estimator is only used with the exact sklearn version that
+    # wrote it.
+    if metadata.sklearn_version != sklearn.__version__:
+        raise ValueError(
+            f"File format mismatch, loading a file format for a different sklearn version. "
+            f"File format uses '{metadata.sklearn_version}', got '{sklearn.__version__}'."
+        )
+    instance = RandomForest(
+        name=metadata.name,
+        classes=metadata.class_names,
+        embedding_model_name=metadata.embedding_model_name,
+        embedding_model_hash=metadata.embedding_model_hash,
+    )
+    # Replace the freshly created, unfitted model with the loaded one.
+    instance._model = model  # noqa: SLF001
+    return instance
+def _export_random_forest_model(
+    model: RandomForestClassifier,
+    metadata: ModelExportMetadata,
+    all_classes: list[str],
+) -> RandomForestExport:
+    """Builds a RandomForestExport from a fitted sk-learn RandomForestClassifier.
+    Args:
+        model: The trained random forest model to export.
+        metadata: Metadata to attach to the export.
+        all_classes: Full list of all class labels.
+    Returns:
+        RandomForestExport: The export object holding the metadata and one
+            exported tree per estimator of the forest.
+    """
+    # The same fitted classes are handed to every tree so that leaf
+    # probabilities can be placed at their position in all_classes.
+    fitted_classes: list[int] = model.classes_
+    exported_trees: list[ExportedTree] = []
+    for estimator in model.estimators_:
+        exported_trees.append(
+            _export_single_tree(
+                tree=estimator,
+                trained_classes=fitted_classes,
+                all_classes=all_classes,
+            )
+        )
+    return RandomForestExport(metadata=metadata, trees=exported_trees)
+def load_lightly_random_forest(path: Path | None, buffer: io.BytesIO | None) -> RandomForestExport:
+    """Loads a Lightly exported RandomForest model from a file or buffer.
+    Args:
+        path: The path to the exported classifier file.
+        buffer: A BytesIO buffer containing the exported classifier.
+    If both path and buffer are provided, the path will be used.
+    Returns:
+        A RandomForestExport instance.
+    Raises:
+        ValueError: If neither path nor buffer is provided, if the file is
+                not a valid RandomForestExport or if the version/format
+                mismatches.
+    """
+    # NOTE(security): pickle.load can execute arbitrary code while loading.
+    # Only load exports that come from a trusted source.
+    if path is not None:
+        with open(path, "rb") as f:
+            data = pickle.load(f)
+    elif buffer is not None:
+        data = pickle.load(buffer)
+    else:
+        raise ValueError("Either path or buffer must be provided.")
+    if not isinstance(data, RandomForestExport):
+        raise ValueError("Loaded object is not a RandomForestExport instance.")
+    if data.metadata.file_format_version != FILE_FORMAT_VERSION:
+        raise ValueError(
+            f"File format version mismatch. Expected '{FILE_FORMAT_VERSION}', "
+            f"got '{data.metadata.file_format_version}'."
+        )
+    return data
+def predict_with_lightly_random_forest(
+    model: RandomForestExport, embeddings: list[list[float]]
+) -> list[list[float]]:
+    """Predicts the classification scores for a list of embeddings.
+    Args:
+        model: A RandomForestExport instance containing the model and metadata.
+        embeddings: A list of embeddings.
+    Returns:
+        A list of lists, where each inner list represents the probability
+            distribution over classes for the corresponding input embedding.
+            If embeddings is empty, returns an empty list.
+    Raises:
+        ValueError: If the provided embeddings have different size than
+            expected.
+    """
+    expected_dim = model.metadata.num_input_features
+    all_probs: list[list[float]] = []
+    for embedding in embeddings:
+        if len(embedding) != expected_dim:
+            raise ValueError(
+                f"Embedding has wrong dimensionality: expected {expected_dim}, "
+                f"got {len(embedding)}"
+            )
+        tree_probs: list[list[float]] = [
+            _predict_tree_probs(tree, embedding) for tree in model.trees
+        ]
+        # The forest's prediction is the unweighted mean over all trees.
+        all_probs.append(np.mean(tree_probs, axis=0).tolist())
+    return all_probs
+def _export_single_tree(
+    tree: DecisionTreeClassifier,
+    trained_classes: list[int],
+    all_classes: list[str],
+) -> ExportedTree:
+    """Flattens one fitted sk-learn decision tree into an ExportedTree.
+    Inner nodes and leaves are stored in two separate lists. A child that is
+    an inner node is referenced by its index in `inner_nodes`; a child that is
+    a leaf is referenced by the negative value `-(leaf index) - 1`.
+    Args:
+        tree: The decision tree to convert.
+        trained_classes: Indices of the classes the tree was trained on.
+        all_classes: Full list of all class labels.
+    Returns:
+        ExportedTree: A representation of the tree with explicit node and leaf
+                    structures, compatible with the Lightly format.
+    """
+    sk_tree = tree.tree_
+    left_ids = sk_tree.children_left
+    right_ids = sk_tree.children_right
+    num_classes = len(all_classes)
+    inner_nodes: list[InnerNode] = []
+    leaf_nodes: list[LeafNode] = []
+    # node_id -> (exported index, is_leaf).
+    exported_index: dict[int, tuple[int, bool]] = {}
+    # Pass 1: give every node its exported index and build the leaves.
+    for node_id in range(sk_tree.node_count):
+        # A node whose two child ids are equal is treated as a leaf.
+        if left_ids[node_id] != right_ids[node_id]:
+            exported_index[node_id] = (len(inner_nodes), False)
+            # Placeholder, filled in pass 2 once all child indices are known.
+            inner_nodes.append(InnerNode())
+            continue
+        # value[node_id] is a 2D array of shape [1, n_classes]; [0] extracts
+        # the 1D array of per-class weights.
+        class_weights = sk_tree.value[node_id][0]
+        total = sum(class_weights)
+        if total > 0:
+            probs = (class_weights / total).tolist()
+        else:
+            probs = [0.0] * len(class_weights)
+        # Place each probability at its class position in all_classes;
+        # classes the tree was not trained on stay at 0.0.
+        full_probs = [0.0] * num_classes
+        for trained_class, prob in zip(trained_classes, probs):
+            full_probs[trained_class] = prob
+        exported_index[node_id] = (-len(leaf_nodes) - 1, True)
+        leaf_nodes.append(LeafNode(class_probabilities=full_probs))
+    # Pass 2: replace the placeholders with the real split data.
+    for node_id, (mapped_idx, is_leaf) in exported_index.items():
+        if is_leaf:
+            continue
+        inner_nodes[mapped_idx] = InnerNode(
+            feature_index=int(sk_tree.feature[node_id]),
+            threshold=float(sk_tree.threshold[node_id]),
+            left_child=exported_index[left_ids[node_id]][0],
+            right_child=exported_index[right_ids[node_id]][0],
+        )
+    return ExportedTree(inner_nodes=inner_nodes, leaf_nodes=leaf_nodes)
+def _predict_tree_probs(tree: ExportedTree, embedding: list[float]) -> list[float]:
+    """Walks a single exported tree and returns the reached leaf's probabilities.
+    Args:
+        tree: A ExportedTree instance used to determine the probability.
+        embedding: A single embedding.
+    Returns:
+        The class probabilities stored in the leaf the embedding ends up in.
+    """
+    splits = tree.inner_nodes
+    # A tree without any split is answered by its first leaf.
+    if not splits:
+        return tree.leaf_nodes[0].class_probabilities
+    # Non-negative values index inner nodes (0 is the root); a negative value
+    # ends the walk and encodes the leaf as -(leaf index) - 1.
+    current = 0
+    while current >= 0:
+        split = splits[current]
+        goes_left = embedding[split.feature_index] <= split.threshold
+        current = split.left_child if goes_left else split.right_child
+    return tree.leaf_nodes[-current - 1].class_probabilities

lightly_studio/metadata/complex_metadata.py ADDED Viewed

@@ -0,0 +1,47 @@
+"""Complex metadata types that can be stored in JSON columns."""
+from typing import Any, Dict, Type
+from lightly_studio.metadata.gps_coordinate import GPSCoordinate
+from lightly_studio.metadata.metadata_protocol import ComplexMetadata
+# Registry of complex metadata types for automatic serialization/deserialization.
+# Keys are the type names passed as `expected_type` to
+# deserialize_complex_metadata; values are the classes whose `from_dict`
+# rebuilds the object from its stored dict.
+COMPLEX_METADATA_TYPES: Dict[str, Type[ComplexMetadata]] = {
+    "gps_coordinate": GPSCoordinate,
+}
+def serialize_complex_metadata(value: Any) -> Any:
+    """Convert a complex metadata object into its dict form for JSON storage.
+    Args:
+        value: Value to serialize.
+    Returns:
+        The result of `value.as_dict()` if value is a ComplexMetadata,
+        otherwise the value itself, unchanged.
+    """
+    return value.as_dict() if isinstance(value, ComplexMetadata) else value
+def deserialize_complex_metadata(value: Any, expected_type: str) -> Any:
+    """Rebuild a complex metadata object from its JSON-stored dict.
+    Args:
+        value: Value to deserialize.
+        expected_type: Expected type name from schema (e.g., "gps_coordinate").
+    Returns:
+        The complex metadata object if value is a dict, expected_type is a
+        registered type name and the conversion succeeds; otherwise the
+        original value.
+    """
+    # Only dicts with a known, non-empty type name are candidates.
+    if not expected_type or not isinstance(value, dict):
+        return value
+    if expected_type not in COMPLEX_METADATA_TYPES:
+        return value
+    metadata_cls = COMPLEX_METADATA_TYPES[expected_type]
+    try:
+        return metadata_cls.from_dict(value)
+    except (KeyError, TypeError):
+        # Best effort: a dict that does not fit the type is returned as is.
+        return value

lightly_studio/metadata/compute_similarity.py ADDED Viewed

@@ -0,0 +1,84 @@
+"""Computes similarity from embeddings."""
+from datetime import datetime, timezone
+from typing import Optional
+from uuid import UUID
+from lightly_mundig import Similarity  # type: ignore[import-untyped]
+from sqlmodel import Session
+from lightly_studio.dataset.env import LIGHTLY_STUDIO_LICENSE_KEY
+from lightly_studio.errors import TagNotFoundError
+from lightly_studio.resolvers import metadata_resolver, sample_embedding_resolver, tag_resolver
+from lightly_studio.resolvers.sample_resolver.sample_filter import SampleFilter
+def compute_similarity_metadata(
+    session: Session,
+    key_dataset_id: UUID,
+    embedding_model_id: UUID,
+    query_tag_id: UUID,
+    metadata_name: Optional[str] = None,
+) -> str:
+    """Computes similarity for each sample in the dataset from embeddings.
+    Similarity is a measure of how similar a sample is to its nearest neighbor
+    in the embedding space. It can be used to find duplicates.
+    The computed similarity values are stored as metadata for each sample.
+    Args:
+        session:
+            The database session.
+        key_dataset_id:
+            The ID of the dataset the similarity is computed on.
+        embedding_model_id:
+            The ID of the embedding model to use for the computation.
+        query_tag_id:
+            The ID of the tag describing the query.
+        metadata_name:
+            The name of the metadata field to store the similarity values in.
+            If None, a name of the form
+            "similarity_<tag name>_<UTC timestamp>" is generated.
+    Raises:
+        ValueError if the LIGHTLY_STUDIO_LICENSE_KEY environment variable is
+            not set.
+        TagNotFoundError if tag with ID `query_tag_id` does not exist.
+    Returns:
+        The name of the metadata storing the similarity values.
+    """
+    license_key = LIGHTLY_STUDIO_LICENSE_KEY
+    if license_key is None:
+        raise ValueError(
+            "LIGHTLY_STUDIO_LICENSE_KEY environment variable is not set. "
+            "Please set it to your LightlyStudio license key."
+        )
+    key_samples = sample_embedding_resolver.get_all_by_dataset_id(
+        session=session, dataset_id=key_dataset_id, embedding_model_id=embedding_model_id
+    )
+    key_embeddings = [sample.embedding for sample in key_samples]
+    similarity = Similarity(key_embeddings=key_embeddings, token=license_key)
+    query_tag = tag_resolver.get_by_id(session=session, tag_id=query_tag_id)
+    if query_tag is None:
+        raise TagNotFoundError(f"Query tag with ID {query_tag_id} not found")
+    # The query samples are the samples of the same dataset carrying the tag.
+    tag_filter = SampleFilter(tag_ids=[query_tag_id])
+    query_samples = sample_embedding_resolver.get_all_by_dataset_id(
+        session=session,
+        dataset_id=key_dataset_id,
+        embedding_model_id=embedding_model_id,
+        filters=tag_filter,
+    )
+    query_embeddings = [sample.embedding for sample in query_samples]
+    similarity_values = similarity.calculate_similarity(query_embeddings=query_embeddings)
+    if metadata_name is None:
+        date = datetime.now(timezone.utc)
+        # Only use whole seconds, such as "2025-11-26T10:11:56". This is 19 characters.
+        formatted_date = date.isoformat()[:19]
+        metadata_name = f"similarity_{query_tag.name}_{formatted_date}"
+    # NOTE(review): the values are paired with key_samples, which presumes
+    # calculate_similarity returns one value per key embedding - confirm
+    # against the lightly_mundig API. zip() silently truncates on a mismatch.
+    metadata = [
+        (sample.sample_id, {metadata_name: similarity_value})
+        for sample, similarity_value in zip(key_samples, similarity_values)
+    ]
+    metadata_resolver.bulk_update_metadata(session, metadata)
+    return metadata_name

lightly_studio/metadata/compute_typicality.py ADDED Viewed

@@ -0,0 +1,67 @@
+"""Computes typicality from embeddings."""
+from uuid import UUID
+from lightly_mundig import Typicality  # type: ignore[import-untyped]
+from sqlmodel import Session
+from lightly_studio.dataset.env import LIGHTLY_STUDIO_LICENSE_KEY
+from lightly_studio.resolvers import (
+    metadata_resolver,
+    sample_embedding_resolver,
+)
+# Number of nearest neighbors passed to Typicality.calculate_typicality by
+# compute_typicality_metadata.
+DEFAULT_NUM_NEAREST_NEIGHBORS = 20
+def compute_typicality_metadata(
+    session: Session,
+    dataset_id: UUID,
+    embedding_model_id: UUID,
+    metadata_name: str = "typicality",
+) -> None:
+    """Computes typicality for each sample in the dataset from embeddings.
+    Typicality is a measure of how representative a sample is of the dataset.
+    It is calculated for each sample from its K-nearest neighbors in the
+    embedding space.
+    The computed typicality values are stored as metadata for each sample.
+    Args:
+        session:
+            The database session.
+        dataset_id:
+            The ID of the dataset for which to compute the typicality.
+        embedding_model_id:
+            The ID of the embedding model to use for the computation.
+        metadata_name:
+            The name of the metadata field to store the typicality values in.
+            Defaults to "typicality".
+    Raises:
+        ValueError if the LIGHTLY_STUDIO_LICENSE_KEY environment variable is
+            not set.
+        AssertionError if the number of computed typicality values differs
+            from the number of samples.
+    """
+    license_key = LIGHTLY_STUDIO_LICENSE_KEY
+    if license_key is None:
+        raise ValueError(
+            "LIGHTLY_STUDIO_LICENSE_KEY environment variable is not set. "
+            "Please set it to your LightlyStudio license key."
+        )
+    samples = sample_embedding_resolver.get_all_by_dataset_id(
+        session=session, dataset_id=dataset_id, embedding_model_id=embedding_model_id
+    )
+    embeddings = [sample.embedding for sample in samples]
+    typicality = Typicality(embeddings=embeddings, token=license_key)
+    typicality_values = typicality.calculate_typicality(
+        num_nearest_neighbors=DEFAULT_NUM_NEAREST_NEIGHBORS
+    )
+    # Raised explicitly instead of using `assert` so the check also runs
+    # under `python -O`; otherwise zip() below would silently drop samples.
+    if len(samples) != len(typicality_values):
+        raise AssertionError("The number of samples and computed typicality values must match")
+    metadata = [
+        (sample.sample_id, {metadata_name: typicality_value})
+        for sample, typicality_value in zip(samples, typicality_values)
+    ]
+    metadata_resolver.bulk_update_metadata(session, metadata)