lightly-studio 0.3.3-py3-none-any.whl → 0.3.4-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of lightly-studio might be problematic.

Files changed (122)
  1. lightly_studio/api/app.py +2 -0
  2. lightly_studio/api/routes/api/caption.py +30 -0
  3. lightly_studio/api/routes/api/embeddings2d.py +36 -4
  4. lightly_studio/api/routes/api/metadata.py +57 -1
  5. lightly_studio/core/add_samples.py +138 -0
  6. lightly_studio/core/dataset.py +143 -16
  7. lightly_studio/dataset/loader.py +2 -8
  8. lightly_studio/db_manager.py +10 -4
  9. lightly_studio/dist_lightly_studio_view_app/_app/immutable/assets/0.B3oFNb6O.css +1 -0
  10. lightly_studio/dist_lightly_studio_view_app/_app/immutable/assets/2.CkOblLn7.css +1 -0
  11. lightly_studio/dist_lightly_studio_view_app/_app/immutable/assets/Samples.CIbricz7.css +1 -0
  12. lightly_studio/dist_lightly_studio_view_app/_app/immutable/assets/_layout.7Ma7YdVg.css +1 -0
  13. lightly_studio/dist_lightly_studio_view_app/_app/immutable/assets/{useFeatureFlags.CV-KWLNP.css → _layout.CefECEWA.css} +1 -1
  14. lightly_studio/dist_lightly_studio_view_app/_app/immutable/assets/transform.2jKMtOWG.css +1 -0
  15. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/-DXuGN29.js +1 -0
  16. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/{Cs1XmhiF.js → B7302SU7.js} +1 -1
  17. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/BeWf8-vJ.js +1 -0
  18. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/Bqz7dyEC.js +1 -0
  19. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/C1FmrZbK.js +1 -0
  20. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/{BdfTHw61.js → CSCQddQS.js} +1 -1
  21. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/CZGpyrcA.js +1 -0
  22. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/CfQ4mGwl.js +1 -0
  23. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/CiaNZCBa.js +1 -0
  24. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/Cqo0Vpvt.js +417 -0
  25. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/Cy4fgWTG.js +1 -0
  26. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/D5w4xp5l.js +1 -0
  27. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/DD63uD-T.js +1 -0
  28. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/DQ8aZ1o-.js +3 -0
  29. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/{keKYsoph.js → DSxvnAMh.js} +1 -1
  30. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/D_JuJOO3.js +20 -0
  31. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/D_ynJAfY.js +2 -0
  32. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/Dafy4oEQ.js +1 -0
  33. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/{BfHVnyNT.js → Dj4O-5se.js} +1 -1
  34. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/DmjAI-UV.js +1 -0
  35. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/Dug7Bq1S.js +1 -0
  36. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/Dv5BSBQG.js +1 -0
  37. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/DzBTnFhV.js +1 -0
  38. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/DzX_yyqb.js +1 -0
  39. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/Frwd2CjB.js +1 -0
  40. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/H4l0JFh9.js +1 -0
  41. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/H60ATh8g.js +2 -0
  42. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/{6t3IJ0vQ.js → qIv1kPyv.js} +1 -1
  43. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/sLqs1uaK.js +20 -0
  44. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/u-it74zV.js +96 -0
  45. lightly_studio/dist_lightly_studio_view_app/_app/immutable/entry/app.BPc0HQPq.js +2 -0
  46. lightly_studio/dist_lightly_studio_view_app/_app/immutable/entry/start.SNvc2nrm.js +1 -0
  47. lightly_studio/dist_lightly_studio_view_app/_app/immutable/nodes/0.5jT7P06o.js +1 -0
  48. lightly_studio/dist_lightly_studio_view_app/_app/immutable/nodes/1.Cdy-7S5q.js +1 -0
  49. lightly_studio/dist_lightly_studio_view_app/_app/immutable/nodes/10.C_uoESTX.js +1 -0
  50. lightly_studio/dist_lightly_studio_view_app/_app/immutable/nodes/12.DcO8wIAc.js +1 -0
  51. lightly_studio/dist_lightly_studio_view_app/_app/immutable/nodes/{2.C8HLK8mj.js → 2.BIldfkxL.js} +268 -113
  52. lightly_studio/dist_lightly_studio_view_app/_app/immutable/nodes/{3.CLvg3QcJ.js → 3.BC9z_TWM.js} +1 -1
  53. lightly_studio/dist_lightly_studio_view_app/_app/immutable/nodes/{4.BQhDtXUI.js → 4.D8X_Ch5n.js} +1 -1
  54. lightly_studio/dist_lightly_studio_view_app/_app/immutable/nodes/5.CAXhxJu6.js +39 -0
  55. lightly_studio/dist_lightly_studio_view_app/_app/immutable/nodes/{6.uBV1Lhat.js → 6.DRA5Ru_2.js} +1 -1
  56. lightly_studio/dist_lightly_studio_view_app/_app/immutable/nodes/7.WVBsruHQ.js +1 -0
  57. lightly_studio/dist_lightly_studio_view_app/_app/immutable/nodes/8.BuKUrCEN.js +20 -0
  58. lightly_studio/dist_lightly_studio_view_app/_app/immutable/nodes/9.CUIn1yCR.js +1 -0
  59. lightly_studio/dist_lightly_studio_view_app/_app/version.json +1 -1
  60. lightly_studio/dist_lightly_studio_view_app/index.html +15 -14
  61. lightly_studio/examples/example.py +4 -0
  62. lightly_studio/examples/example_coco.py +4 -0
  63. lightly_studio/examples/example_coco_caption.py +24 -0
  64. lightly_studio/examples/example_metadata.py +4 -1
  65. lightly_studio/examples/example_selection.py +4 -0
  66. lightly_studio/examples/example_split_work.py +4 -0
  67. lightly_studio/examples/example_yolo.py +4 -0
  68. lightly_studio/export/export_dataset.py +11 -3
  69. lightly_studio/metadata/compute_typicality.py +1 -1
  70. lightly_studio/models/caption.py +73 -0
  71. lightly_studio/models/dataset.py +1 -2
  72. lightly_studio/models/metadata.py +1 -1
  73. lightly_studio/models/sample.py +2 -2
  74. lightly_studio/resolvers/caption_resolver.py +80 -0
  75. lightly_studio/resolvers/dataset_resolver.py +4 -7
  76. lightly_studio/resolvers/metadata_resolver/__init__.py +2 -2
  77. lightly_studio/resolvers/metadata_resolver/sample/__init__.py +3 -3
  78. lightly_studio/resolvers/metadata_resolver/sample/bulk_update_metadata.py +46 -0
  79. lightly_studio/resolvers/samples_filter.py +18 -10
  80. lightly_studio/type_definitions.py +2 -0
  81. {lightly_studio-0.3.3.dist-info → lightly_studio-0.3.4.dist-info}/METADATA +86 -21
  82. {lightly_studio-0.3.3.dist-info → lightly_studio-0.3.4.dist-info}/RECORD +83 -77
  83. lightly_studio/dist_lightly_studio_view_app/_app/immutable/assets/0.CA_CXIBb.css +0 -1
  84. lightly_studio/dist_lightly_studio_view_app/_app/immutable/assets/_layout.DS78jgNY.css +0 -1
  85. lightly_studio/dist_lightly_studio_view_app/_app/immutable/assets/index.BVs_sZj9.css +0 -1
  86. lightly_studio/dist_lightly_studio_view_app/_app/immutable/assets/transform.D487hwJk.css +0 -1
  87. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/8NsknIT2.js +0 -1
  88. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/BND_-4Kp.js +0 -1
  89. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/BjkP1AHA.js +0 -1
  90. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/BuuNVL9G.js +0 -1
  91. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/BzKGpnl4.js +0 -1
  92. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/CCx7Ho51.js +0 -1
  93. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/CH6P3X75.js +0 -1
  94. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/CR2upx_Q.js +0 -4
  95. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/CWPZrTTJ.js +0 -1
  96. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/CwPowJfP.js +0 -1
  97. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/CxFKfZ9T.js +0 -1
  98. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/Cxevwdid.js +0 -1
  99. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/D4whDBUi.js +0 -1
  100. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/D6r9vr07.js +0 -1
  101. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/DA6bFLPR.js +0 -1
  102. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/DEgUu98i.js +0 -3
  103. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/DGTPl6Gk.js +0 -1
  104. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/DKGxBSlK.js +0 -1
  105. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/DQXoLcsF.js +0 -1
  106. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/DQe_kdRt.js +0 -92
  107. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/DcY4jgG3.js +0 -1
  108. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/H7C68rOM.js +0 -1
  109. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/RmD8FzRo.js +0 -1
  110. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/V-MnMC1X.js +0 -1
  111. lightly_studio/dist_lightly_studio_view_app/_app/immutable/entry/app.BVr6DYqP.js +0 -2
  112. lightly_studio/dist_lightly_studio_view_app/_app/immutable/entry/start.u7zsVvqp.js +0 -1
  113. lightly_studio/dist_lightly_studio_view_app/_app/immutable/nodes/0.Da2agmdd.js +0 -1
  114. lightly_studio/dist_lightly_studio_view_app/_app/immutable/nodes/1.B11tVRJV.js +0 -1
  115. lightly_studio/dist_lightly_studio_view_app/_app/immutable/nodes/10.l30Zud4h.js +0 -1
  116. lightly_studio/dist_lightly_studio_view_app/_app/immutable/nodes/12.CgKPGcAP.js +0 -1
  117. lightly_studio/dist_lightly_studio_view_app/_app/immutable/nodes/5.-6XqWX5G.js +0 -1
  118. lightly_studio/dist_lightly_studio_view_app/_app/immutable/nodes/7.BXsgoQZh.js +0 -1
  119. lightly_studio/dist_lightly_studio_view_app/_app/immutable/nodes/8.BkbcnUs8.js +0 -1
  120. lightly_studio/dist_lightly_studio_view_app/_app/immutable/nodes/9.Bkrv-Vww.js +0 -1
  121. lightly_studio/resolvers/metadata_resolver/sample/bulk_set_metadata.py +0 -48
  122. {lightly_studio-0.3.3.dist-info → lightly_studio-0.3.4.dist-info}/WHEEL +0 -0
lightly_studio/api/app.py CHANGED
@@ -16,6 +16,7 @@ from lightly_studio.api.routes import healthz, images, webapp
 from lightly_studio.api.routes.api import (
     annotation,
     annotation_label,
+    caption,
     classifier,
     dataset,
     dataset_tag,
@@ -89,6 +90,7 @@ api_router.include_router(export.export_router)
 api_router.include_router(sample.samples_router)
 api_router.include_router(annotation_label.annotations_label_router)
 api_router.include_router(annotation.annotations_router)
+api_router.include_router(caption.captions_router)
 api_router.include_router(text_embedding.text_embedding_router)
 api_router.include_router(settings.settings_router)
 api_router.include_router(classifier.classifier_router)
lightly_studio/api/routes/api/caption.py ADDED
@@ -0,0 +1,30 @@
+"""API routes for dataset captions."""
+
+from __future__ import annotations
+
+from uuid import UUID
+
+from fastapi import APIRouter, Depends, Path
+from typing_extensions import Annotated
+
+from lightly_studio.api.routes.api.validators import Paginated, PaginatedWithCursor
+from lightly_studio.db_manager import SessionDep
+from lightly_studio.models.caption import CaptionsListView
+from lightly_studio.resolvers import caption_resolver
+from lightly_studio.resolvers.caption_resolver import GetAllCaptionsResult
+
+captions_router = APIRouter(prefix="/datasets/{dataset_id}", tags=["captions"])
+
+
+@captions_router.get("/captions", response_model=CaptionsListView)
+def read_captions(
+    dataset_id: Annotated[UUID, Path(title="Dataset Id")],
+    session: SessionDep,
+    pagination: Annotated[PaginatedWithCursor, Depends()],
+) -> GetAllCaptionsResult:
+    """Retrieve captions for a dataset."""
+    return caption_resolver.get_all(
+        session=session,
+        dataset_id=dataset_id,
+        pagination=Paginated(offset=pagination.offset, limit=pagination.limit),
+    )
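The new captions route is read-only and mounted under the dataset prefix, so a client can page through a dataset's captions with a plain GET request. A minimal client-side sketch follows; the base URL, API prefix, and the offset/limit query parameter names are assumptions, not details confirmed by this diff.

    # Hypothetical client call for the new captions endpoint; base URL, API prefix,
    # and pagination parameter names are assumptions.
    import requests

    BASE_URL = "http://localhost:8000/api"  # assumed API prefix
    DATASET_ID = "00000000-0000-0000-0000-000000000000"  # placeholder dataset UUID

    response = requests.get(
        f"{BASE_URL}/datasets/{DATASET_ID}/captions",
        params={"offset": 0, "limit": 50},  # assumed fields of PaginatedWithCursor
        timeout=10,
    )
    response.raise_for_status()
    captions_page = response.json()  # serialized CaptionsListView
    print(captions_page)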
lightly_studio/api/routes/api/embeddings2d.py CHANGED
@@ -3,25 +3,40 @@
 from __future__ import annotations
 
 import io
+from uuid import UUID
 
 import numpy as np
 import pyarrow as pa
 from fastapi import APIRouter, HTTPException, Response
 from numpy.typing import NDArray
 from pyarrow import ipc
+from pydantic import BaseModel, Field
 from sklearn.manifold import TSNE
 from sqlmodel import select
 
 from lightly_studio.db_manager import SessionDep
 from lightly_studio.models.dataset import DatasetTable
 from lightly_studio.models.embedding_model import EmbeddingModelTable
-from lightly_studio.resolvers import sample_embedding_resolver
+from lightly_studio.resolvers import sample_embedding_resolver, sample_resolver
+from lightly_studio.resolvers.samples_filter import SampleFilter
 
 embeddings2d_router = APIRouter()
 
 
-@embeddings2d_router.get("/embeddings2d/tsne")
-def get_embeddings2d__tsne(session: SessionDep) -> Response:
+class GetEmbeddings2DRequest(BaseModel):
+    """Request body for retrieving 2D embeddings."""
+
+    filters: SampleFilter | None = Field(
+        None,
+        description="Filter parameters identifying matching samples",
+    )
+
+
+@embeddings2d_router.post("/embeddings2d/tsne")
+def get_embeddings2d__tsne(
+    session: SessionDep,
+    body: GetEmbeddings2DRequest | None = None,
+) -> Response:
     """Return 2D embeddings serialized as an Arrow stream."""
     # TODO(Malte, 09/2025): Support choosing the dataset via API parameter.
     dataset = session.exec(select(DatasetTable).limit(1)).first()
@@ -37,7 +52,6 @@ def get_embeddings2d__tsne(session: SessionDep) -> Response:
     if embedding_model is None:
         raise HTTPException(status_code=404, detail="No embedding model configured.")
 
-    # TODO(Malte, 09/2025): Support choosing a subset of samples via API parameter.
     embeddings = sample_embedding_resolver.get_all_by_dataset_id(
         session=session,
         dataset_id=dataset.dataset_id,
@@ -49,6 +63,22 @@ def get_embeddings2d__tsne(session: SessionDep) -> Response:
     x = embedding_values_tsne[:, 0]
     y = embedding_values_tsne[:, 1]
 
+    matching_sample_ids: set[UUID] | None = None
+    filters = body.filters if body else None
+    if filters:
+        matching_samples_result = sample_resolver.get_all_by_dataset_id(
+            session=session,
+            dataset_id=dataset.dataset_id,
+            filters=filters,
+        )
+        matching_sample_ids = {sample.sample_id for sample in matching_samples_result.samples}
+
+    sample_ids = [embedding.sample_id for embedding in embeddings]
+    if matching_sample_ids is None:
+        fulfils_filter = [1] * len(sample_ids)
+    else:
+        fulfils_filter = [1 if sample_id in matching_sample_ids else 0 for sample_id in sample_ids]
+
     # TODO(Malte, 09/2025): Save the 2D-embeddings in the database to avoid recomputing
     # them on every request.
 
@@ -57,6 +87,8 @@ def get_embeddings2d__tsne(session: SessionDep) -> Response:
         {
             "x": pa.array(x, type=pa.float32()),
             "y": pa.array(y, type=pa.float32()),
+            "fulfils_filter": pa.array(fulfils_filter, type=pa.uint8()),
+            "sample_id": pa.array([str(sample_id) for sample_id in sample_ids], type=pa.string()),
         }
     )
 
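Since the t-SNE endpoint now answers POST requests and returns an Arrow IPC stream with x, y, fulfils_filter, and sample_id columns, the response can be read straight into an Arrow table. A sketch under the same assumptions (the base URL and the exact SampleFilter payload shape are not confirmed by this diff):

    # Sketch of consuming the Arrow stream returned by POST /embeddings2d/tsne.
    # The base URL and the filter payload shape are assumptions.
    import pyarrow as pa
    import requests

    BASE_URL = "http://localhost:8000/api"  # assumed API prefix

    response = requests.post(
        f"{BASE_URL}/embeddings2d/tsne",
        json={"filters": None},  # or a SampleFilter-shaped dict to flag matching samples
        timeout=120,
    )
    response.raise_for_status()

    # The body is an Arrow IPC stream; fulfils_filter is 1 for samples matching the filter.
    reader = pa.ipc.open_stream(response.content)
    table = reader.read_all()
    print(table.column_names)  # ['x', 'y', 'fulfils_filter', 'sample_id']
    print(table.num_rows, "embedded samples")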
lightly_studio/api/routes/api/metadata.py CHANGED
@@ -5,11 +5,16 @@ from __future__ import annotations
 from typing import List
 from uuid import UUID
 
-from fastapi import APIRouter, Path
+from fastapi import APIRouter, Depends, Path
+from pydantic import BaseModel, Field
 from typing_extensions import Annotated
 
+from lightly_studio.api.routes.api.dataset import get_and_validate_dataset_id
 from lightly_studio.db_manager import SessionDep
+from lightly_studio.metadata import compute_typicality
+from lightly_studio.models.dataset import DatasetTable
 from lightly_studio.models.metadata import MetadataInfoView
+from lightly_studio.resolvers import embedding_model_resolver
 from lightly_studio.resolvers.metadata_resolver.sample.get_metadata_info import (
     get_all_metadata_keys_and_schema,
 )
@@ -33,3 +38,54 @@ def get_metadata_info(
         for numerical metadata types.
     """
     return get_all_metadata_keys_and_schema(session=session, dataset_id=dataset_id)
+
+
+class ComputeTypicalityRequest(BaseModel):
+    """Request model for computing typicality metadata."""
+
+    embedding_model_name: str | None = Field(
+        default=None,
+        description="Embedding model name (uses default if not specified)",
+    )
+    metadata_name: str = Field(
+        default="typicality",
+        description="Metadata field name (defaults to 'typicality')",
+    )
+
+
+@metadata_router.post(
+    "/metadata/typicality",
+    status_code=204,
+    response_model=None,
+)
+def compute_typicality_metadata(
+    session: SessionDep,
+    dataset: Annotated[
+        DatasetTable,
+        Depends(get_and_validate_dataset_id),
+    ],
+    request: ComputeTypicalityRequest,
+) -> None:
+    """Compute typicality metadata for a dataset.
+
+    Args:
+        session: The database session.
+        dataset: The dataset to compute typicality for.
+        request: Request parameters including optional embedding model name
+            and metadata field name.
+
+    Returns:
+        None (204 No Content on success).
+    """
+    embedding_model = embedding_model_resolver.get_by_name(
+        session=session,
+        dataset_id=dataset.dataset_id,
+        embedding_model_name=request.embedding_model_name,
+    )
+
+    compute_typicality.compute_typicality_metadata(
+        session=session,
+        dataset_id=dataset.dataset_id,
+        embedding_model_id=embedding_model.embedding_model_id,
+        metadata_name=request.metadata_name,
+    )
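The typicality endpoint takes a small JSON body and returns 204 No Content on success. A hedged sketch of invoking it; the base URL and the router's dataset-scoped prefix are assumptions inferred from the other dataset routers in this release.

    # Hypothetical call to the new typicality endpoint; the URL layout is an assumption.
    import requests

    BASE_URL = "http://localhost:8000/api"  # assumed API prefix
    DATASET_ID = "00000000-0000-0000-0000-000000000000"  # placeholder dataset UUID

    response = requests.post(
        f"{BASE_URL}/datasets/{DATASET_ID}/metadata/typicality",
        json={
            "embedding_model_name": None,   # fall back to the default embedding model
            "metadata_name": "typicality",  # metadata key that will hold the scores
        },
        timeout=600,
    )
    assert response.status_code == 204  # success returns no body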
lightly_studio/core/add_samples.py CHANGED
@@ -2,6 +2,8 @@
 
 from __future__ import annotations
 
+import json
+from collections import defaultdict
 from dataclasses import dataclass, field
 from pathlib import Path
 from typing import Iterable
@@ -26,10 +28,12 @@ from tqdm import tqdm
 
 from lightly_studio.models.annotation.annotation_base import AnnotationCreate
 from lightly_studio.models.annotation_label import AnnotationLabelCreate
+from lightly_studio.models.caption import CaptionCreate
 from lightly_studio.models.sample import SampleCreate, SampleTable
 from lightly_studio.resolvers import (
     annotation_label_resolver,
     annotation_resolver,
+    caption_resolver,
     sample_resolver,
 )
 
@@ -218,6 +222,111 @@
     return created_sample_ids
 
 
+def load_into_dataset_from_coco_captions(
+    session: Session,
+    dataset_id: UUID,
+    annotations_json: Path,
+    images_path: Path,
+) -> list[UUID]:
+    """Load samples and captions from a COCO captions file into the dataset.
+
+    Args:
+        session: Database session used for resolver operations.
+        dataset_id: Identifier of the dataset that receives the samples.
+        annotations_json: Path to the COCO captions annotations file.
+        images_path: Directory containing the referenced images.
+
+    Returns:
+        The list of newly created sample identifiers.
+    """
+    with fsspec.open(str(annotations_json), "r") as file:
+        coco_payload = json.load(file)
+
+    images: list[dict[str, object]] = coco_payload.get("images", [])
+    annotations: list[dict[str, object]] = coco_payload.get("annotations", [])
+
+    captions_by_image_id: dict[int, list[str]] = defaultdict(list)
+    for annotation in annotations:
+        image_id = annotation["image_id"]
+        caption = annotation["caption"]
+        if not isinstance(image_id, int):
+            continue
+        if not isinstance(caption, str):
+            continue
+        caption_text = caption.strip()
+        if not caption_text:
+            continue
+        captions_by_image_id[image_id].append(caption_text)
+
+    logging_context = _LoadingLoggingContext(
+        n_samples_to_be_inserted=len(images),
+        n_samples_before_loading=sample_resolver.count_by_dataset_id(
+            session=session, dataset_id=dataset_id
+        ),
+    )
+
+    captions_to_create: list[CaptionCreate] = []
+    samples_to_create: list[SampleCreate] = []
+    created_sample_ids: list[UUID] = []
+    image_path_to_captions: dict[str, list[str]] = {}
+
+    for image_info in tqdm(images, desc="Processing images", unit=" images"):
+        if isinstance(image_info["id"], int):
+            image_id_raw = image_info["id"]
+        else:
+            continue
+        file_name_raw = str(image_info["file_name"])
+
+        width = image_info["width"] if isinstance(image_info["width"], int) else 0
+        height = image_info["height"] if isinstance(image_info["height"], int) else 0
+        sample = SampleCreate(
+            file_name=file_name_raw,
+            file_path_abs=str(images_path / file_name_raw),
+            width=width,
+            height=height,
+            dataset_id=dataset_id,
+        )
+        samples_to_create.append(sample)
+        image_path_to_captions[sample.file_path_abs] = captions_by_image_id.get(image_id_raw, [])
+
+        if len(samples_to_create) >= SAMPLE_BATCH_SIZE:
+            created_samples_batch, paths_not_inserted = _create_batch_samples(
+                session=session, samples=samples_to_create
+            )
+            created_sample_ids.extend(s.sample_id for s in created_samples_batch)
+            logging_context.update_example_paths(paths_not_inserted)
+            _process_batch_captions(
+                session=session,
+                dataset_id=dataset_id,
+                stored_samples=created_samples_batch,
+                image_path_to_captions=image_path_to_captions,
+                captions_to_create=captions_to_create,
+            )
+            samples_to_create.clear()
+            image_path_to_captions.clear()
+
+    if samples_to_create:
+        created_samples_batch, paths_not_inserted = _create_batch_samples(
+            session=session, samples=samples_to_create
+        )
+        created_sample_ids.extend(s.sample_id for s in created_samples_batch)
+        logging_context.update_example_paths(paths_not_inserted)
+        _process_batch_captions(
+            session=session,
+            dataset_id=dataset_id,
+            stored_samples=created_samples_batch,
+            image_path_to_captions=image_path_to_captions,
+            captions_to_create=captions_to_create,
+        )
+
+    if captions_to_create:
+        caption_resolver.create_many(session=session, captions=captions_to_create)
+
+    _log_loading_results(session=session, dataset_id=dataset_id, logging_context=logging_context)
+
+    return created_sample_ids
+
+
 def _log_loading_results(
     session: Session, dataset_id: UUID, logging_context: _LoadingLoggingContext
 ) -> None:
@@ -372,3 +481,32 @@ def _process_batch_annotations(  # noqa: PLR0913
         if len(annotations_to_create) >= ANNOTATION_BATCH_SIZE:
             annotation_resolver.create_many(session=session, annotations=annotations_to_create)
             annotations_to_create.clear()
+
+
+def _process_batch_captions(
+    session: Session,
+    dataset_id: UUID,
+    stored_samples: list[SampleTable],
+    image_path_to_captions: dict[str, list[str]],
+    captions_to_create: list[CaptionCreate],
+) -> None:
+    """Process captions for a batch of samples."""
+    if not stored_samples:
+        return
+
+    for stored_sample in stored_samples:
+        captions = image_path_to_captions[stored_sample.file_path_abs]
+        if not captions:
+            continue
+
+        for caption_text in captions:
+            caption = CaptionCreate(
+                dataset_id=dataset_id,
+                sample_id=stored_sample.sample_id,
+                text=caption_text,
+            )
+            captions_to_create.append(caption)
+
+        if len(captions_to_create) >= ANNOTATION_BATCH_SIZE:
+            caption_resolver.create_many(session=session, captions=captions_to_create)
+            captions_to_create.clear()
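For reference, load_into_dataset_from_coco_captions expects the standard COCO captions layout: an "images" list with integer "id", "file_name", "width", and "height", and an "annotations" list pairing "image_id" with a "caption" string. An illustrative file (image names and dimensions are placeholders) could be produced like this:

    # Illustrative COCO-captions payload matching what the loader above reads;
    # image file names and dimensions are placeholders.
    import json
    from pathlib import Path

    coco_captions = {
        "images": [
            {"id": 1, "file_name": "000001.jpg", "width": 640, "height": 480},
            {"id": 2, "file_name": "000002.jpg", "width": 640, "height": 480},
        ],
        "annotations": [
            {"image_id": 1, "caption": "A dog running on a beach."},
            {"image_id": 1, "caption": "A brown dog plays in the sand."},
            {"image_id": 2, "caption": "A red bicycle leaning against a wall."},
        ],
    }

    Path("captions_train.json").write_text(json.dumps(coco_captions, indent=2))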
lightly_studio/core/dataset.py CHANGED
@@ -6,6 +6,7 @@ from pathlib import Path
 from typing import Iterable, Iterator
 from uuid import UUID
 
+import yaml
 from labelformat.formats import (
     COCOInstanceSegmentationInput,
     COCOObjectDetectionInput,
@@ -38,11 +39,13 @@ from lightly_studio.resolvers import (
     dataset_resolver,
     embedding_model_resolver,
     sample_resolver,
+    tag_resolver,
 )
 from lightly_studio.type_definitions import PathLike
 
 # Constants
 DEFAULT_DATASET_NAME = "default_dataset"
+ALLOWED_YOLO_SPLITS = {"train", "val", "test", "minival"}
 
 _SliceType = slice  # to avoid shadowing built-in slice in type annotations
 
@@ -68,7 +71,7 @@ class Dataset:
 
         dataset = dataset_resolver.create(
             session=db_manager.persistent_session(),
-            dataset=DatasetCreate(name=name, directory=""),
+            dataset=DatasetCreate(name=name),
         )
         return Dataset(dataset=dataset)
 
@@ -262,14 +265,15 @@
     def add_samples_from_yolo(
         self,
         data_yaml: PathLike,
-        input_split: str = "train",
+        input_split: str | None = None,
         embed: bool = True,
     ) -> None:
        """Load a dataset in YOLO format and store in DB.

        Args:
            data_yaml: Path to the YOLO data.yaml file.
-            input_split: The split to load (e.g., 'train', 'val').
+            input_split: The split to load (e.g., 'train', 'val', 'test').
+                If None, all available splits will be loaded and assigned a corresponding tag.
            embed: If True, generate embeddings for the newly added samples.
        """
        if isinstance(data_yaml, str):
@@ -279,24 +283,54 @@
         if not data_yaml.is_file() or data_yaml.suffix != ".yaml":
             raise FileNotFoundError(f"YOLO data yaml file not found: '{data_yaml}'")
 
-        # Load the dataset using labelformat.
-        label_input = YOLOv8ObjectDetectionInput(
-            input_file=data_yaml,
-            input_split=input_split,
-        )
-        images_path = label_input._images_dir()  # noqa: SLF001
+        # Determine which splits to process
+        splits_to_process = _resolve_yolo_splits(data_yaml=data_yaml, input_split=input_split)
 
-        self.add_samples_from_labelformat(
-            input_labels=label_input,
-            images_path=images_path,
-            embed=embed,
-        )
+        all_created_sample_ids = []
+
+        # Process each split
+        for split in splits_to_process:
+            # Load the dataset using labelformat.
+            label_input = YOLOv8ObjectDetectionInput(
+                input_file=data_yaml,
+                input_split=split,
+            )
+            images_path = label_input._images_dir()  # noqa: SLF001
+
+            created_sample_ids = add_samples.load_into_dataset_from_labelformat(
+                session=self.session,
+                dataset_id=self.dataset_id,
+                input_labels=label_input,
+                images_path=images_path,
+            )
+
+            # Tag samples with split name
+            if created_sample_ids:
+                tag = tag_resolver.get_or_create_sample_tag_by_name(
+                    session=self.session,
+                    dataset_id=self.dataset_id,
+                    tag_name=split,
+                )
+                tag_resolver.add_sample_ids_to_tag_id(
+                    session=self.session,
+                    tag_id=tag.tag_id,
+                    sample_ids=created_sample_ids,
+                )
+
+            all_created_sample_ids.extend(created_sample_ids)
+
+        # Generate embeddings for all samples at once
+        if embed:
+            _generate_embeddings(
+                session=self.session, dataset_id=self.dataset_id, sample_ids=all_created_sample_ids
+            )
 
     def add_samples_from_coco(
         self,
         annotations_json: PathLike,
         images_path: PathLike,
         annotation_type: AnnotationType = AnnotationType.OBJECT_DETECTION,
+        split: str | None = None,
         embed: bool = True,
     ) -> None:
         """Load a dataset in COCO Object Detection format and store in DB.
@@ -306,6 +340,8 @@
             images_path: Path to the folder containing the images.
             annotation_type: The type of annotation to be loaded (e.g., 'ObjectDetection',
                 'InstanceSegmentation').
+            split: Optional split name to tag samples (e.g., 'train', 'val').
+                If provided, all samples will be tagged with this name.
             embed: If True, generate embeddings for the newly added samples.
         """
         if isinstance(annotations_json, str):
@@ -330,12 +366,83 @@
 
         images_path = Path(images_path).absolute()
 
-        self.add_samples_from_labelformat(
+        created_sample_ids = add_samples.load_into_dataset_from_labelformat(
+            session=self.session,
+            dataset_id=self.dataset_id,
             input_labels=label_input,
             images_path=images_path,
-            embed=embed,
         )
 
+        # Tag samples with split name if provided
+        if split is not None and created_sample_ids:
+            tag = tag_resolver.get_or_create_sample_tag_by_name(
+                session=self.session,
+                dataset_id=self.dataset_id,
+                tag_name=split,
+            )
+            tag_resolver.add_sample_ids_to_tag_id(
+                session=self.session,
+                tag_id=tag.tag_id,
+                sample_ids=created_sample_ids,
+            )
+
+        if embed:
+            _generate_embeddings(
+                session=self.session, dataset_id=self.dataset_id, sample_ids=created_sample_ids
+            )
+
+    def add_samples_from_coco_caption(
+        self,
+        annotations_json: PathLike,
+        images_path: PathLike,
+        split: str | None = None,
+        embed: bool = True,
+    ) -> None:
+        """Load a dataset in COCO caption format and store in DB.
+
+        Args:
+            annotations_json: Path to the COCO caption JSON file.
+            images_path: Path to the folder containing the images.
+            split: Optional split name to tag samples (e.g., 'train', 'val').
+                If provided, all samples will be tagged with this name.
+            embed: If True, generate embeddings for the newly added samples.
+        """
+        if isinstance(annotations_json, str):
+            annotations_json = Path(annotations_json)
+        annotations_json = annotations_json.absolute()
+
+        if not annotations_json.is_file() or annotations_json.suffix != ".json":
+            raise FileNotFoundError(f"COCO caption json file not found: '{annotations_json}'")
+
+        if isinstance(images_path, str):
+            images_path = Path(images_path)
+        images_path = images_path.absolute()
+
+        created_sample_ids = add_samples.load_into_dataset_from_coco_captions(
+            session=self.session,
+            dataset_id=self.dataset_id,
+            annotations_json=annotations_json,
+            images_path=images_path,
+        )
+
+        # Tag samples with split name if provided
+        if split is not None and created_sample_ids:
+            tag = tag_resolver.get_or_create_sample_tag_by_name(
+                session=self.session,
+                dataset_id=self.dataset_id,
+                tag_name=split,
+            )
+            tag_resolver.add_sample_ids_to_tag_id(
+                session=self.session,
+                tag_id=tag.tag_id,
+                sample_ids=created_sample_ids,
+            )
+
+        if embed:
+            _generate_embeddings(
+                session=self.session, dataset_id=self.dataset_id, sample_ids=created_sample_ids
+            )
+
     def compute_typicality_metadata(
         self,
         embedding_model_name: str | None = None,
@@ -393,3 +500,23 @@ def _generate_embeddings(session: Session, dataset_id: UUID, sample_ids: list[UU
     # Mark the embedding search feature as enabled.
     if "embeddingSearchEnabled" not in features.lightly_studio_active_features:
         features.lightly_studio_active_features.append("embeddingSearchEnabled")
+
+
+def _resolve_yolo_splits(data_yaml: Path, input_split: str | None) -> list[str]:
+    """Determine which YOLO splits to process for the given config."""
+    if input_split is not None:
+        if input_split not in ALLOWED_YOLO_SPLITS:
+            raise ValueError(
+                f"Split '{input_split}' not found in config file '{data_yaml}'. "
+                f"Allowed splits: {sorted(ALLOWED_YOLO_SPLITS)}"
+            )
+        return [input_split]
+
+    with data_yaml.open() as f:
+        config = yaml.safe_load(f)
+
+    config_keys = config.keys() if isinstance(config, dict) else []
+    splits = [key for key in config_keys if key in ALLOWED_YOLO_SPLITS]
+    if not splits:
+        raise ValueError(f"No splits found in config file '{data_yaml}'")
+    return splits
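Taken together, the Dataset changes add split-aware loading: passing input_split=None to add_samples_from_yolo loads every split declared in data.yaml and tags each sample with its split name, and the new add_samples_from_coco_caption loads images plus captions with an optional split tag. A usage sketch follows; the Dataset.create(...) factory call is an assumption inferred from the creation path visible above, and all file paths are placeholders.

    # Usage sketch for the new loading options; only the two method calls below are
    # taken from this diff, the factory call and paths are assumptions.
    from lightly_studio.core.dataset import Dataset

    dataset = Dataset.create(name="demo_dataset")  # hypothetical factory

    # YOLO: with input_split=None every split present in data.yaml ("train", "val",
    # "test", "minival") is loaded, and samples are tagged with their split name.
    dataset.add_samples_from_yolo(
        data_yaml="path/to/data.yaml",
        input_split=None,
        embed=True,
    )

    # COCO captions: loads samples and their captions, optionally tagging them with a split.
    dataset.add_samples_from_coco_caption(
        annotations_json="path/to/captions_train.json",
        images_path="path/to/images",
        split="train",
        embed=False,
    )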
lightly_studio/dataset/loader.py CHANGED
@@ -258,10 +258,7 @@ class DatasetLoader:
         # Create dataset and annotation task.
         dataset = dataset_resolver.create(
             session=self.session,
-            dataset=DatasetCreate(
-                name=dataset_name,
-                directory=str(img_dir_path),
-            ),
+            dataset=DatasetCreate(name=dataset_name),
         )
 
         self._load_into_dataset(
@@ -296,10 +293,7 @@
         # Create dataset.
         dataset = dataset_resolver.create(
             session=self.session,
-            dataset=DatasetCreate(
-                name=dataset_name,
-                directory=img_dir,
-            ),
+            dataset=DatasetCreate(name=dataset_name),
         )
 
         # Collect image file paths with extension filtering.
lightly_studio/db_manager.py CHANGED
@@ -57,6 +57,11 @@ class DatabaseEngine:
         try:
             yield session
             session.commit()
+
+            # Commit the persistent session to ensure it sees the latest data changes.
+            # This prevents the persistent session from having stale data when it's used
+            # after operations in short-lived sessions have modified the database.
+            self.get_persistent_session().commit()
         except Exception:
             session.rollback()
             raise
@@ -66,7 +71,9 @@
     def get_persistent_session(self) -> Session:
         """Get the persistent database session."""
         if self._persistent_session is None:
-            self._persistent_session = Session(self._engine, close_resets_only=False)
+            self._persistent_session = Session(
+                self._engine, close_resets_only=False, expire_on_commit=True
+            )
         return self._persistent_session
 
 
@@ -78,11 +85,10 @@ def get_engine() -> DatabaseEngine:
     """Get the database engine.
 
     If the engine does not exist yet, it is newly created with the default settings.
-    In that case, a pre-existing database file is deleted.
     """
     global _engine  # noqa: PLW0603
     if _engine is None:
-        _engine = DatabaseEngine(cleanup_existing=True)
+        _engine = DatabaseEngine()
     return _engine
 
 
@@ -94,7 +100,7 @@ def set_engine(engine: DatabaseEngine) -> None:
     _engine = engine
 
 
-def connect(db_file: str | None, cleanup_existing: bool = False) -> None:
+def connect(db_file: str | None = None, cleanup_existing: bool = False) -> None:
     """Set up the database connection.
 
     Helper function to set up the database engine.