lightly-studio 0.3.2__py3-none-any.whl → 0.3.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of lightly-studio might be problematic. Click here for more details.

Files changed (163)
  1. lightly_studio/__init__.py +1 -1
  2. lightly_studio/api/app.py +8 -4
  3. lightly_studio/api/db_tables.py +0 -3
  4. lightly_studio/api/routes/api/annotation.py +26 -0
  5. lightly_studio/api/routes/api/annotations/__init__.py +7 -0
  6. lightly_studio/api/routes/api/annotations/create_annotation.py +52 -0
  7. lightly_studio/api/routes/api/caption.py +30 -0
  8. lightly_studio/api/routes/api/dataset.py +3 -5
  9. lightly_studio/api/routes/api/embeddings2d.py +136 -0
  10. lightly_studio/api/routes/api/export.py +73 -0
  11. lightly_studio/api/routes/api/metadata.py +57 -1
  12. lightly_studio/api/routes/api/selection.py +87 -0
  13. lightly_studio/core/add_samples.py +138 -9
  14. lightly_studio/core/dataset.py +174 -63
  15. lightly_studio/core/dataset_query/dataset_query.py +5 -0
  16. lightly_studio/dataset/env.py +4 -0
  17. lightly_studio/dataset/file_utils.py +13 -2
  18. lightly_studio/dataset/loader.py +2 -62
  19. lightly_studio/dataset/mobileclip_embedding_generator.py +3 -2
  20. lightly_studio/db_manager.py +10 -4
  21. lightly_studio/dist_lightly_studio_view_app/_app/immutable/assets/0.B3oFNb6O.css +1 -0
  22. lightly_studio/dist_lightly_studio_view_app/_app/immutable/assets/2.CkOblLn7.css +1 -0
  23. lightly_studio/dist_lightly_studio_view_app/_app/immutable/assets/Samples.CIbricz7.css +1 -0
  24. lightly_studio/dist_lightly_studio_view_app/_app/immutable/assets/_layout.7Ma7YdVg.css +1 -0
  25. lightly_studio/dist_lightly_studio_view_app/_app/immutable/assets/{useFeatureFlags.CV-KWLNP.css → _layout.CefECEWA.css} +1 -1
  26. lightly_studio/dist_lightly_studio_view_app/_app/immutable/assets/transform.2jKMtOWG.css +1 -0
  27. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/-DXuGN29.js +1 -0
  28. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/{Ccq4ZD0B.js → B7302SU7.js} +1 -1
  29. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/BeWf8-vJ.js +1 -0
  30. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/Bqz7dyEC.js +1 -0
  31. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/C1FmrZbK.js +1 -0
  32. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/{DRZO-E-T.js → CSCQddQS.js} +1 -1
  33. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/CZGpyrcA.js +1 -0
  34. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/CfQ4mGwl.js +1 -0
  35. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/CiaNZCBa.js +1 -0
  36. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/Cqo0Vpvt.js +417 -0
  37. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/Cy4fgWTG.js +1 -0
  38. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/D5w4xp5l.js +1 -0
  39. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/DD63uD-T.js +1 -0
  40. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/DQ8aZ1o-.js +3 -0
  41. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/{Df3aMO5B.js → DSxvnAMh.js} +1 -1
  42. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/D_JuJOO3.js +20 -0
  43. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/D_ynJAfY.js +2 -0
  44. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/Dafy4oEQ.js +1 -0
  45. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/{BqBqV92V.js → Dj4O-5se.js} +1 -1
  46. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/DmjAI-UV.js +1 -0
  47. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/Dug7Bq1S.js +1 -0
  48. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/Dv5BSBQG.js +1 -0
  49. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/DzBTnFhV.js +1 -0
  50. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/DzX_yyqb.js +1 -0
  51. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/Frwd2CjB.js +1 -0
  52. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/H4l0JFh9.js +1 -0
  53. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/H60ATh8g.js +2 -0
  54. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/qIv1kPyv.js +1 -0
  55. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/sLqs1uaK.js +20 -0
  56. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/u-it74zV.js +96 -0
  57. lightly_studio/dist_lightly_studio_view_app/_app/immutable/entry/app.BPc0HQPq.js +2 -0
  58. lightly_studio/dist_lightly_studio_view_app/_app/immutable/entry/start.SNvc2nrm.js +1 -0
  59. lightly_studio/dist_lightly_studio_view_app/_app/immutable/nodes/0.5jT7P06o.js +1 -0
  60. lightly_studio/dist_lightly_studio_view_app/_app/immutable/nodes/1.Cdy-7S5q.js +1 -0
  61. lightly_studio/dist_lightly_studio_view_app/_app/immutable/nodes/10.C_uoESTX.js +1 -0
  62. lightly_studio/dist_lightly_studio_view_app/_app/immutable/nodes/12.DcO8wIAc.js +1 -0
  63. lightly_studio/dist_lightly_studio_view_app/_app/immutable/nodes/2.BIldfkxL.js +1012 -0
  64. lightly_studio/dist_lightly_studio_view_app/_app/immutable/nodes/{3.w9g4AcAx.js → 3.BC9z_TWM.js} +1 -1
  65. lightly_studio/dist_lightly_studio_view_app/_app/immutable/nodes/{4.BBI8KwnD.js → 4.D8X_Ch5n.js} +1 -1
  66. lightly_studio/dist_lightly_studio_view_app/_app/immutable/nodes/5.CAXhxJu6.js +39 -0
  67. lightly_studio/dist_lightly_studio_view_app/_app/immutable/nodes/{6.CrbkRPam.js → 6.DRA5Ru_2.js} +1 -1
  68. lightly_studio/dist_lightly_studio_view_app/_app/immutable/nodes/7.WVBsruHQ.js +1 -0
  69. lightly_studio/dist_lightly_studio_view_app/_app/immutable/nodes/8.BuKUrCEN.js +20 -0
  70. lightly_studio/dist_lightly_studio_view_app/_app/immutable/nodes/9.CUIn1yCR.js +1 -0
  71. lightly_studio/dist_lightly_studio_view_app/_app/immutable/workers/clustering.worker-DKqeLtG0.js +2 -0
  72. lightly_studio/dist_lightly_studio_view_app/_app/immutable/workers/search.worker-vNSty3B0.js +1 -0
  73. lightly_studio/dist_lightly_studio_view_app/_app/version.json +1 -1
  74. lightly_studio/dist_lightly_studio_view_app/index.html +15 -14
  75. lightly_studio/examples/example.py +4 -0
  76. lightly_studio/examples/example_coco.py +4 -0
  77. lightly_studio/examples/example_coco_caption.py +24 -0
  78. lightly_studio/examples/example_metadata.py +4 -1
  79. lightly_studio/examples/example_selection.py +4 -0
  80. lightly_studio/examples/example_split_work.py +4 -0
  81. lightly_studio/examples/example_yolo.py +4 -0
  82. lightly_studio/export/export_dataset.py +73 -0
  83. lightly_studio/export/lightly_studio_label_input.py +120 -0
  84. lightly_studio/few_shot_classifier/classifier_manager.py +5 -26
  85. lightly_studio/metadata/compute_typicality.py +67 -0
  86. lightly_studio/models/annotation/annotation_base.py +11 -12
  87. lightly_studio/models/caption.py +73 -0
  88. lightly_studio/models/dataset.py +1 -2
  89. lightly_studio/models/metadata.py +1 -1
  90. lightly_studio/models/sample.py +2 -2
  91. lightly_studio/resolvers/annotation_label_resolver/__init__.py +2 -1
  92. lightly_studio/resolvers/annotation_label_resolver/get_all.py +15 -0
  93. lightly_studio/resolvers/annotation_resolver/__init__.py +2 -3
  94. lightly_studio/resolvers/annotation_resolver/create_many.py +3 -3
  95. lightly_studio/resolvers/annotation_resolver/delete_annotation.py +1 -1
  96. lightly_studio/resolvers/annotation_resolver/delete_annotations.py +7 -3
  97. lightly_studio/resolvers/annotation_resolver/get_by_id.py +19 -1
  98. lightly_studio/resolvers/annotation_resolver/update_annotation_label.py +0 -1
  99. lightly_studio/resolvers/annotations/annotations_filter.py +1 -11
  100. lightly_studio/resolvers/caption_resolver.py +80 -0
  101. lightly_studio/resolvers/dataset_resolver.py +4 -7
  102. lightly_studio/resolvers/metadata_resolver/__init__.py +2 -2
  103. lightly_studio/resolvers/metadata_resolver/sample/__init__.py +3 -3
  104. lightly_studio/resolvers/metadata_resolver/sample/bulk_update_metadata.py +46 -0
  105. lightly_studio/resolvers/samples_filter.py +18 -10
  106. lightly_studio/selection/mundig.py +7 -10
  107. lightly_studio/selection/selection_config.py +4 -1
  108. lightly_studio/services/annotations_service/__init__.py +8 -0
  109. lightly_studio/services/annotations_service/create_annotation.py +63 -0
  110. lightly_studio/services/annotations_service/delete_annotation.py +22 -0
  111. lightly_studio/type_definitions.py +2 -0
  112. {lightly_studio-0.3.2.dist-info → lightly_studio-0.3.4.dist-info}/METADATA +231 -41
  113. {lightly_studio-0.3.2.dist-info → lightly_studio-0.3.4.dist-info}/RECORD +114 -104
  114. lightly_studio/api/routes/api/annotation_task.py +0 -37
  115. lightly_studio/api/routes/api/metrics.py +0 -76
  116. lightly_studio/dist_lightly_studio_view_app/_app/immutable/assets/0.DenzbfeK.css +0 -1
  117. lightly_studio/dist_lightly_studio_view_app/_app/immutable/assets/SelectableSvgGroup.BBm0IWdq.css +0 -1
  118. lightly_studio/dist_lightly_studio_view_app/_app/immutable/assets/SelectableSvgGroup.BNTuXSAe.css +0 -1
  119. lightly_studio/dist_lightly_studio_view_app/_app/immutable/assets/_layout.T-zjSUd3.css +0 -1
  120. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/2O287xak.js +0 -3
  121. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/7YNGEs1C.js +0 -1
  122. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/BBoGk9hq.js +0 -1
  123. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/BRnH9v23.js +0 -92
  124. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/Bg1Y5eUZ.js +0 -1
  125. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/C0JiMuYn.js +0 -1
  126. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/C98Hk3r5.js +0 -1
  127. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/CG0dMCJi.js +0 -1
  128. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/Cpy-nab_.js +0 -1
  129. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/Crk-jcvV.js +0 -1
  130. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/Cs31G8Qn.js +0 -1
  131. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/CsKrY2zA.js +0 -1
  132. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/Cur71c3O.js +0 -1
  133. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/CzgC3GFB.js +0 -1
  134. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/D8GZDMNN.js +0 -1
  135. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/DFRh-Spp.js +0 -1
  136. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/DcGCxgpH.js +0 -1
  137. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/DkR_EZ_B.js +0 -1
  138. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/DqUGznj_.js +0 -1
  139. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/H7C68rOM.js +0 -1
  140. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/KpAtIldw.js +0 -1
  141. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/M1Q1F7bw.js +0 -4
  142. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/OH7-C_mc.js +0 -1
  143. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/gLNdjSzu.js +0 -1
  144. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/i0ZZ4z06.js +0 -1
  145. lightly_studio/dist_lightly_studio_view_app/_app/immutable/entry/app.BI-EA5gL.js +0 -2
  146. lightly_studio/dist_lightly_studio_view_app/_app/immutable/entry/start.CcsRl3cZ.js +0 -1
  147. lightly_studio/dist_lightly_studio_view_app/_app/immutable/nodes/0.BbO4Zc3r.js +0 -1
  148. lightly_studio/dist_lightly_studio_view_app/_app/immutable/nodes/1._I9GR805.js +0 -1
  149. lightly_studio/dist_lightly_studio_view_app/_app/immutable/nodes/10.J2RBFrSr.js +0 -1
  150. lightly_studio/dist_lightly_studio_view_app/_app/immutable/nodes/12.Cmqj25a-.js +0 -1
  151. lightly_studio/dist_lightly_studio_view_app/_app/immutable/nodes/2.C45iKJHA.js +0 -6
  152. lightly_studio/dist_lightly_studio_view_app/_app/immutable/nodes/5.huHuxdiF.js +0 -1
  153. lightly_studio/dist_lightly_studio_view_app/_app/immutable/nodes/7.FomEdhD6.js +0 -1
  154. lightly_studio/dist_lightly_studio_view_app/_app/immutable/nodes/8.Cb_ADSLk.js +0 -1
  155. lightly_studio/dist_lightly_studio_view_app/_app/immutable/nodes/9.CajIG5ce.js +0 -1
  156. lightly_studio/metrics/__init__.py +0 -0
  157. lightly_studio/metrics/detection/__init__.py +0 -0
  158. lightly_studio/metrics/detection/map.py +0 -268
  159. lightly_studio/models/annotation_task.py +0 -28
  160. lightly_studio/resolvers/annotation_resolver/create.py +0 -19
  161. lightly_studio/resolvers/annotation_task_resolver.py +0 -31
  162. lightly_studio/resolvers/metadata_resolver/sample/bulk_set_metadata.py +0 -48
  163. {lightly_studio-0.3.2.dist-info → lightly_studio-0.3.4.dist-info}/WHEEL +0 -0
@@ -2,6 +2,8 @@
2
2
 
3
3
  from __future__ import annotations
4
4
 
5
+ import json
6
+ from collections import defaultdict
5
7
  from dataclasses import dataclass, field
6
8
  from pathlib import Path
7
9
  from typing import Iterable
@@ -26,10 +28,12 @@ from tqdm import tqdm
26
28
 
27
29
  from lightly_studio.models.annotation.annotation_base import AnnotationCreate
28
30
  from lightly_studio.models.annotation_label import AnnotationLabelCreate
31
+ from lightly_studio.models.caption import CaptionCreate
29
32
  from lightly_studio.models.sample import SampleCreate, SampleTable
30
33
  from lightly_studio.resolvers import (
31
34
  annotation_label_resolver,
32
35
  annotation_resolver,
36
+ caption_resolver,
33
37
  sample_resolver,
34
38
  )
35
39
 
@@ -46,7 +50,6 @@ class _AnnotationProcessingContext:
46
50
  dataset_id: UUID
47
51
  sample_id: UUID
48
52
  label_map: dict[int, UUID]
49
- annotation_task_id: UUID
50
53
 
51
54
 
52
55
  @dataclass
@@ -137,7 +140,6 @@ def load_into_dataset_from_labelformat(
137
140
  dataset_id: UUID,
138
141
  input_labels: ObjectDetectionInput | InstanceSegmentationInput,
139
142
  images_path: Path,
140
- annotation_task_id: UUID,
141
143
  ) -> list[UUID]:
142
144
  """Load samples and their annotations from a labelformat input into the dataset.
143
145
 
@@ -146,7 +148,6 @@ def load_into_dataset_from_labelformat(
146
148
  dataset_id: The ID of the dataset to load samples into.
147
149
  input_labels: The labelformat input containing images and annotations.
148
150
  images_path: The path to the directory containing the images.
149
- annotation_task_id: The ID of the annotation task to associate with the annotations.
150
151
 
151
152
  Returns:
152
153
  A list of UUIDs of the created samples.
@@ -192,7 +193,6 @@ def load_into_dataset_from_labelformat(
192
193
  image_path_to_anno_data=image_path_to_anno_data,
193
194
  dataset_id=dataset_id,
194
195
  label_map=label_map,
195
- annotation_task_id=annotation_task_id,
196
196
  annotations_to_create=annotations_to_create,
197
197
  )
198
198
  samples_to_create.clear()
@@ -210,7 +210,6 @@ def load_into_dataset_from_labelformat(
210
210
  image_path_to_anno_data=image_path_to_anno_data,
211
211
  dataset_id=dataset_id,
212
212
  label_map=label_map,
213
- annotation_task_id=annotation_task_id,
214
213
  annotations_to_create=annotations_to_create,
215
214
  )
216
215
 
@@ -223,6 +222,111 @@ def load_into_dataset_from_labelformat(
223
222
  return created_sample_ids
224
223
 
225
224
 
225
+ def load_into_dataset_from_coco_captions(
226
+ session: Session,
227
+ dataset_id: UUID,
228
+ annotations_json: Path,
229
+ images_path: Path,
230
+ ) -> list[UUID]:
231
+ """Load samples and captions from a COCO captions file into the dataset.
232
+
233
+ Args:
234
+ session: Database session used for resolver operations.
235
+ dataset_id: Identifier of the dataset that receives the samples.
236
+ annotations_json: Path to the COCO captions annotations file.
237
+ images_path: Directory containing the referenced images.
238
+
239
+ Returns:
240
+ The list of newly created sample identifiers.
241
+ """
242
+ with fsspec.open(str(annotations_json), "r") as file:
243
+ coco_payload = json.load(file)
244
+
245
+ images: list[dict[str, object]] = coco_payload.get("images", [])
246
+ annotations: list[dict[str, object]] = coco_payload.get("annotations", [])
247
+
248
+ captions_by_image_id: dict[int, list[str]] = defaultdict(list)
249
+ for annotation in annotations:
250
+ image_id = annotation["image_id"]
251
+ caption = annotation["caption"]
252
+ if not isinstance(image_id, int):
253
+ continue
254
+ if not isinstance(caption, str):
255
+ continue
256
+ caption_text = caption.strip()
257
+ if not caption_text:
258
+ continue
259
+ captions_by_image_id[image_id].append(caption_text)
260
+
261
+ logging_context = _LoadingLoggingContext(
262
+ n_samples_to_be_inserted=len(images),
263
+ n_samples_before_loading=sample_resolver.count_by_dataset_id(
264
+ session=session, dataset_id=dataset_id
265
+ ),
266
+ )
267
+
268
+ captions_to_create: list[CaptionCreate] = []
269
+ samples_to_create: list[SampleCreate] = []
270
+ created_sample_ids: list[UUID] = []
271
+ image_path_to_captions: dict[str, list[str]] = {}
272
+
273
+ for image_info in tqdm(images, desc="Processing images", unit=" images"):
274
+ if isinstance(image_info["id"], int):
275
+ image_id_raw = image_info["id"]
276
+ else:
277
+ continue
278
+ file_name_raw = str(image_info["file_name"])
279
+
280
+ width = image_info["width"] if isinstance(image_info["width"], int) else 0
281
+ height = image_info["height"] if isinstance(image_info["height"], int) else 0
282
+ sample = SampleCreate(
283
+ file_name=file_name_raw,
284
+ file_path_abs=str(images_path / file_name_raw),
285
+ width=width,
286
+ height=height,
287
+ dataset_id=dataset_id,
288
+ )
289
+ samples_to_create.append(sample)
290
+ image_path_to_captions[sample.file_path_abs] = captions_by_image_id.get(image_id_raw, [])
291
+
292
+ if len(samples_to_create) >= SAMPLE_BATCH_SIZE:
293
+ created_samples_batch, paths_not_inserted = _create_batch_samples(
294
+ session=session, samples=samples_to_create
295
+ )
296
+ created_sample_ids.extend(s.sample_id for s in created_samples_batch)
297
+ logging_context.update_example_paths(paths_not_inserted)
298
+ _process_batch_captions(
299
+ session=session,
300
+ dataset_id=dataset_id,
301
+ stored_samples=created_samples_batch,
302
+ image_path_to_captions=image_path_to_captions,
303
+ captions_to_create=captions_to_create,
304
+ )
305
+ samples_to_create.clear()
306
+ image_path_to_captions.clear()
307
+
308
+ if samples_to_create:
309
+ created_samples_batch, paths_not_inserted = _create_batch_samples(
310
+ session=session, samples=samples_to_create
311
+ )
312
+ created_sample_ids.extend(s.sample_id for s in created_samples_batch)
313
+ logging_context.update_example_paths(paths_not_inserted)
314
+ _process_batch_captions(
315
+ session=session,
316
+ dataset_id=dataset_id,
317
+ stored_samples=created_samples_batch,
318
+ image_path_to_captions=image_path_to_captions,
319
+ captions_to_create=captions_to_create,
320
+ )
321
+
322
+ if captions_to_create:
323
+ caption_resolver.create_many(session=session, captions=captions_to_create)
324
+
325
+ _log_loading_results(session=session, dataset_id=dataset_id, logging_context=logging_context)
326
+
327
+ return created_sample_ids
328
+
329
+
226
330
  def _log_loading_results(
227
331
  session: Session, dataset_id: UUID, logging_context: _LoadingLoggingContext
228
332
  ) -> None:
@@ -304,7 +408,6 @@ def _process_object_detection_annotations(
304
408
  width=int(width),
305
409
  height=int(height),
306
410
  confidence=obj.confidence,
307
- annotation_task_id=context.annotation_task_id,
308
411
  )
309
412
  )
310
413
  return new_annotations
@@ -339,7 +442,6 @@ def _process_instance_segmentation_annotations(
339
442
  width=int(width),
340
443
  height=int(height),
341
444
  segmentation_mask=segmentation_rle,
342
- annotation_task_id=context.annotation_task_id,
343
445
  )
344
446
  )
345
447
  return new_annotations
@@ -351,7 +453,6 @@ def _process_batch_annotations( # noqa: PLR0913
351
453
  image_path_to_anno_data: dict[str, ImageInstanceSegmentation | ImageObjectDetection],
352
454
  dataset_id: UUID,
353
455
  label_map: dict[int, UUID],
354
- annotation_task_id: UUID,
355
456
  annotations_to_create: list[AnnotationCreate],
356
457
  ) -> None:
357
458
  """Process annotations for a batch of samples."""
@@ -362,7 +463,6 @@ def _process_batch_annotations( # noqa: PLR0913
362
463
  dataset_id=dataset_id,
363
464
  sample_id=stored_sample.sample_id,
364
465
  label_map=label_map,
365
- annotation_task_id=annotation_task_id,
366
466
  )
367
467
 
368
468
  if isinstance(anno_data, ImageInstanceSegmentation):
@@ -381,3 +481,32 @@ def _process_batch_annotations( # noqa: PLR0913
381
481
  if len(annotations_to_create) >= ANNOTATION_BATCH_SIZE:
382
482
  annotation_resolver.create_many(session=session, annotations=annotations_to_create)
383
483
  annotations_to_create.clear()
484
+
485
+
486
+ def _process_batch_captions(
487
+ session: Session,
488
+ dataset_id: UUID,
489
+ stored_samples: list[SampleTable],
490
+ image_path_to_captions: dict[str, list[str]],
491
+ captions_to_create: list[CaptionCreate],
492
+ ) -> None:
493
+ """Process captions for a batch of samples."""
494
+ if not stored_samples:
495
+ return
496
+
497
+ for stored_sample in stored_samples:
498
+ captions = image_path_to_captions[stored_sample.file_path_abs]
499
+ if not captions:
500
+ continue
501
+
502
+ for caption_text in captions:
503
+ caption = CaptionCreate(
504
+ dataset_id=dataset_id,
505
+ sample_id=stored_sample.sample_id,
506
+ text=caption_text,
507
+ )
508
+ captions_to_create.append(caption)
509
+
510
+ if len(captions_to_create) >= ANNOTATION_BATCH_SIZE:
511
+ caption_resolver.create_many(session=session, captions=captions_to_create)
512
+ captions_to_create.clear()
@@ -6,6 +6,7 @@ from pathlib import Path
6
6
  from typing import Iterable, Iterator
7
7
  from uuid import UUID
8
8
 
9
+ import yaml
9
10
  from labelformat.formats import (
10
11
  COCOInstanceSegmentationInput,
11
12
  COCOObjectDetectionInput,
@@ -28,21 +29,23 @@ from lightly_studio.core.dataset_query.order_by import OrderByExpression
28
29
  from lightly_studio.core.sample import Sample
29
30
  from lightly_studio.dataset import fsspec_lister
30
31
  from lightly_studio.dataset.embedding_manager import EmbeddingManagerProvider
31
- from lightly_studio.models.annotation_task import (
32
- AnnotationTaskTable,
32
+ from lightly_studio.metadata import compute_typicality
33
+ from lightly_studio.models.annotation.annotation_base import (
33
34
  AnnotationType,
34
35
  )
35
36
  from lightly_studio.models.dataset import DatasetCreate, DatasetTable
36
37
  from lightly_studio.models.sample import SampleTable
37
38
  from lightly_studio.resolvers import (
38
- annotation_task_resolver,
39
39
  dataset_resolver,
40
+ embedding_model_resolver,
40
41
  sample_resolver,
42
+ tag_resolver,
41
43
  )
42
44
  from lightly_studio.type_definitions import PathLike
43
45
 
44
46
  # Constants
45
47
  DEFAULT_DATASET_NAME = "default_dataset"
48
+ ALLOWED_YOLO_SPLITS = {"train", "val", "test", "minival"}
46
49
 
47
50
  _SliceType = slice # to avoid shadowing built-in slice in type annotations
48
51
 
@@ -68,7 +71,7 @@ class Dataset:
68
71
 
69
72
  dataset = dataset_resolver.create(
70
73
  session=db_manager.persistent_session(),
71
- dataset=DatasetCreate(name=name, directory=""),
74
+ dataset=DatasetCreate(name=name),
72
75
  )
73
76
  return Dataset(dataset=dataset)
74
77
 
@@ -234,8 +237,6 @@ class Dataset:
234
237
  self,
235
238
  input_labels: ObjectDetectionInput | InstanceSegmentationInput,
236
239
  images_path: PathLike,
237
- is_prediction: bool = True,
238
- task_name: str | None = None,
239
240
  embed: bool = True,
240
241
  ) -> None:
241
242
  """Load a dataset from a labelformat object and store in database.
@@ -243,40 +244,17 @@ class Dataset:
243
244
  Args:
244
245
  input_labels: The labelformat input object.
245
246
  images_path: Path to the folder containing the images.
246
- is_prediction: Whether the task is for prediction or labels.
247
- task_name: Optional name for the annotation task. If None, a
248
- default name is generated.
249
247
  embed: If True, generate embeddings for the newly added samples.
250
248
  """
251
249
  if isinstance(images_path, str):
252
250
  images_path = Path(images_path)
253
251
  images_path = images_path.absolute()
254
252
 
255
- # Determine annotation type based on input.
256
- # Currently, we always create BBOX tasks, even for segmentation,
257
- # as segmentation data is stored alongside bounding boxes.
258
- annotation_type = AnnotationType.BBOX
259
-
260
- # Generate a default task name if none is provided.
261
- if task_name is None:
262
- task_name = f"Loaded from labelformat: {self.name}"
263
-
264
- # Create annotation task.
265
- new_annotation_task = annotation_task_resolver.create(
266
- session=self.session,
267
- annotation_task=AnnotationTaskTable(
268
- name=task_name,
269
- annotation_type=annotation_type,
270
- is_prediction=is_prediction,
271
- ),
272
- )
273
-
274
253
  created_sample_ids = add_samples.load_into_dataset_from_labelformat(
275
254
  session=self.session,
276
255
  dataset_id=self.dataset_id,
277
256
  input_labels=input_labels,
278
257
  images_path=images_path,
279
- annotation_task_id=new_annotation_task.annotation_task_id,
280
258
  )
281
259
 
282
260
  if embed:
@@ -287,17 +265,15 @@ class Dataset:
287
265
  def add_samples_from_yolo(
288
266
  self,
289
267
  data_yaml: PathLike,
290
- input_split: str = "train",
291
- task_name: str | None = None,
268
+ input_split: str | None = None,
292
269
  embed: bool = True,
293
270
  ) -> None:
294
271
  """Load a dataset in YOLO format and store in DB.
295
272
 
296
273
  Args:
297
274
  data_yaml: Path to the YOLO data.yaml file.
298
- input_split: The split to load (e.g., 'train', 'val').
299
- task_name: Optional name for the annotation task. If None, a
300
- default name is generated.
275
+ input_split: The split to load (e.g., 'train', 'val', 'test').
276
+ If None, all available splits will be loaded and assigned a corresponding tag.
301
277
  embed: If True, generate embeddings for the newly added samples.
302
278
  """
303
279
  if isinstance(data_yaml, str):
@@ -307,30 +283,54 @@ class Dataset:
307
283
  if not data_yaml.is_file() or data_yaml.suffix != ".yaml":
308
284
  raise FileNotFoundError(f"YOLO data yaml file not found: '{data_yaml}'")
309
285
 
310
- if task_name is None:
311
- task_name = f"Loaded from YOLO: {data_yaml.name} ({input_split} split)"
286
+ # Determine which splits to process
287
+ splits_to_process = _resolve_yolo_splits(data_yaml=data_yaml, input_split=input_split)
312
288
 
313
- # Load the dataset using labelformat.
314
- label_input = YOLOv8ObjectDetectionInput(
315
- input_file=data_yaml,
316
- input_split=input_split,
317
- )
318
- images_path = label_input._images_dir() # noqa: SLF001
289
+ all_created_sample_ids = []
319
290
 
320
- self.add_samples_from_labelformat(
321
- input_labels=label_input,
322
- images_path=images_path,
323
- is_prediction=False,
324
- task_name=task_name,
325
- embed=embed,
326
- )
291
+ # Process each split
292
+ for split in splits_to_process:
293
+ # Load the dataset using labelformat.
294
+ label_input = YOLOv8ObjectDetectionInput(
295
+ input_file=data_yaml,
296
+ input_split=split,
297
+ )
298
+ images_path = label_input._images_dir() # noqa: SLF001
299
+
300
+ created_sample_ids = add_samples.load_into_dataset_from_labelformat(
301
+ session=self.session,
302
+ dataset_id=self.dataset_id,
303
+ input_labels=label_input,
304
+ images_path=images_path,
305
+ )
306
+
307
+ # Tag samples with split name
308
+ if created_sample_ids:
309
+ tag = tag_resolver.get_or_create_sample_tag_by_name(
310
+ session=self.session,
311
+ dataset_id=self.dataset_id,
312
+ tag_name=split,
313
+ )
314
+ tag_resolver.add_sample_ids_to_tag_id(
315
+ session=self.session,
316
+ tag_id=tag.tag_id,
317
+ sample_ids=created_sample_ids,
318
+ )
319
+
320
+ all_created_sample_ids.extend(created_sample_ids)
321
+
322
+ # Generate embeddings for all samples at once
323
+ if embed:
324
+ _generate_embeddings(
325
+ session=self.session, dataset_id=self.dataset_id, sample_ids=all_created_sample_ids
326
+ )
327
327
 
328
328
  def add_samples_from_coco(
329
329
  self,
330
330
  annotations_json: PathLike,
331
331
  images_path: PathLike,
332
- task_name: str | None = None,
333
- annotation_type: AnnotationType = AnnotationType.BBOX,
332
+ annotation_type: AnnotationType = AnnotationType.OBJECT_DETECTION,
333
+ split: str | None = None,
334
334
  embed: bool = True,
335
335
  ) -> None:
336
336
  """Load a dataset in COCO Object Detection format and store in DB.
@@ -338,10 +338,10 @@ class Dataset:
338
338
  Args:
339
339
  annotations_json: Path to the COCO annotations JSON file.
340
340
  images_path: Path to the folder containing the images.
341
- task_name: Optional name for the annotation task. If None, a
342
- default name is generated.
343
341
  annotation_type: The type of annotation to be loaded (e.g., 'ObjectDetection',
344
342
  'InstanceSegmentation').
343
+ split: Optional split name to tag samples (e.g., 'train', 'val').
344
+ If provided, all samples will be tagged with this name.
345
345
  embed: If True, generate embeddings for the newly added samples.
346
346
  """
347
347
  if isinstance(annotations_json, str):
@@ -353,30 +353,121 @@ class Dataset:
353
353
 
354
354
  label_input: COCOObjectDetectionInput | COCOInstanceSegmentationInput
355
355
 
356
- if annotation_type == AnnotationType.BBOX:
356
+ if annotation_type == AnnotationType.OBJECT_DETECTION:
357
357
  label_input = COCOObjectDetectionInput(
358
358
  input_file=annotations_json,
359
359
  )
360
- task_name_default = f"Loaded from COCO Object Detection: {annotations_json.name}"
361
360
  elif annotation_type == AnnotationType.INSTANCE_SEGMENTATION:
362
361
  label_input = COCOInstanceSegmentationInput(
363
362
  input_file=annotations_json,
364
363
  )
365
- task_name_default = f"Loaded from COCO Instance Segmentation: {annotations_json.name}"
366
364
  else:
367
365
  raise ValueError(f"Invalid annotation type: {annotation_type}")
368
366
 
369
- if task_name is None:
370
- task_name = task_name_default
371
-
372
367
  images_path = Path(images_path).absolute()
373
368
 
374
- self.add_samples_from_labelformat(
369
+ created_sample_ids = add_samples.load_into_dataset_from_labelformat(
370
+ session=self.session,
371
+ dataset_id=self.dataset_id,
375
372
  input_labels=label_input,
376
373
  images_path=images_path,
377
- is_prediction=False,
378
- task_name=task_name,
379
- embed=embed,
374
+ )
375
+
376
+ # Tag samples with split name if provided
377
+ if split is not None and created_sample_ids:
378
+ tag = tag_resolver.get_or_create_sample_tag_by_name(
379
+ session=self.session,
380
+ dataset_id=self.dataset_id,
381
+ tag_name=split,
382
+ )
383
+ tag_resolver.add_sample_ids_to_tag_id(
384
+ session=self.session,
385
+ tag_id=tag.tag_id,
386
+ sample_ids=created_sample_ids,
387
+ )
388
+
389
+ if embed:
390
+ _generate_embeddings(
391
+ session=self.session, dataset_id=self.dataset_id, sample_ids=created_sample_ids
392
+ )
393
+
394
+ def add_samples_from_coco_caption(
395
+ self,
396
+ annotations_json: PathLike,
397
+ images_path: PathLike,
398
+ split: str | None = None,
399
+ embed: bool = True,
400
+ ) -> None:
401
+ """Load a dataset in COCO caption format and store in DB.
402
+
403
+ Args:
404
+ annotations_json: Path to the COCO caption JSON file.
405
+ images_path: Path to the folder containing the images.
406
+ split: Optional split name to tag samples (e.g., 'train', 'val').
407
+ If provided, all samples will be tagged with this name.
408
+ embed: If True, generate embeddings for the newly added samples.
409
+ """
410
+ if isinstance(annotations_json, str):
411
+ annotations_json = Path(annotations_json)
412
+ annotations_json = annotations_json.absolute()
413
+
414
+ if not annotations_json.is_file() or annotations_json.suffix != ".json":
415
+ raise FileNotFoundError(f"COCO caption json file not found: '{annotations_json}'")
416
+
417
+ if isinstance(images_path, str):
418
+ images_path = Path(images_path)
419
+ images_path = images_path.absolute()
420
+
421
+ created_sample_ids = add_samples.load_into_dataset_from_coco_captions(
422
+ session=self.session,
423
+ dataset_id=self.dataset_id,
424
+ annotations_json=annotations_json,
425
+ images_path=images_path,
426
+ )
427
+
428
+ # Tag samples with split name if provided
429
+ if split is not None and created_sample_ids:
430
+ tag = tag_resolver.get_or_create_sample_tag_by_name(
431
+ session=self.session,
432
+ dataset_id=self.dataset_id,
433
+ tag_name=split,
434
+ )
435
+ tag_resolver.add_sample_ids_to_tag_id(
436
+ session=self.session,
437
+ tag_id=tag.tag_id,
438
+ sample_ids=created_sample_ids,
439
+ )
440
+
441
+ if embed:
442
+ _generate_embeddings(
443
+ session=self.session, dataset_id=self.dataset_id, sample_ids=created_sample_ids
444
+ )
445
+
446
+ def compute_typicality_metadata(
447
+ self,
448
+ embedding_model_name: str | None = None,
449
+ metadata_name: str = "typicality",
450
+ ) -> None:
451
+ """Computes typicality from embeddings, for K nearest neighbors.
452
+
453
+ Args:
454
+ embedding_model_name:
455
+ The name of the embedding model to use. If not given, the default
456
+ embedding model is used.
457
+ metadata_name:
458
+ The name of the metadata to store the typicality values in. If not give, the default
459
+ name "typicality" is used.
460
+ """
461
+ embedding_model_id = embedding_model_resolver.get_by_name(
462
+ session=self.session,
463
+ dataset_id=self.dataset_id,
464
+ embedding_model_name=embedding_model_name,
465
+ ).embedding_model_id
466
+ compute_typicality.compute_typicality_metadata(
467
+ session=self.session,
468
+ dataset_id=self.dataset_id,
469
+ embedding_model_id=embedding_model_id,
470
+ metadata_name=metadata_name,
380
471
  )
381
472
 
382
473
 
@@ -409,3 +500,23 @@ def _generate_embeddings(session: Session, dataset_id: UUID, sample_ids: list[UU
409
500
  # Mark the embedding search feature as enabled.
410
501
  if "embeddingSearchEnabled" not in features.lightly_studio_active_features:
411
502
  features.lightly_studio_active_features.append("embeddingSearchEnabled")
503
+
504
+
505
+ def _resolve_yolo_splits(data_yaml: Path, input_split: str | None) -> list[str]:
506
+ """Determine which YOLO splits to process for the given config."""
507
+ if input_split is not None:
508
+ if input_split not in ALLOWED_YOLO_SPLITS:
509
+ raise ValueError(
510
+ f"Split '{input_split}' not found in config file '{data_yaml}'. "
511
+ f"Allowed splits: {sorted(ALLOWED_YOLO_SPLITS)}"
512
+ )
513
+ return [input_split]
514
+
515
+ with data_yaml.open() as f:
516
+ config = yaml.safe_load(f)
517
+
518
+ config_keys = config.keys() if isinstance(config, dict) else []
519
+ splits = [key for key in config_keys if key in ALLOWED_YOLO_SPLITS]
520
+ if not splits:
521
+ raise ValueError(f"No splits found in config file '{data_yaml}'")
522
+ return splits
@@ -10,6 +10,7 @@ from lightly_studio.core.dataset_query.match_expression import MatchExpression
10
10
  from lightly_studio.core.dataset_query.order_by import OrderByExpression, OrderByField
11
11
  from lightly_studio.core.dataset_query.sample_field import SampleField
12
12
  from lightly_studio.core.sample import Sample
13
+ from lightly_studio.export.export_dataset import DatasetExport
13
14
  from lightly_studio.models.dataset import DatasetTable
14
15
  from lightly_studio.models.sample import SampleTable
15
16
  from lightly_studio.resolvers import tag_resolver
@@ -209,3 +210,7 @@ class DatasetQuery:
209
210
  session=self.session,
210
211
  input_sample_ids=input_sample_ids,
211
212
  )
213
+
214
+ def export(self) -> DatasetExport:
215
+ """Return a DatasetExport instance which can export the dataset in various formats."""
216
+ return DatasetExport(session=self.session, samples=self)
@@ -1,5 +1,7 @@
1
1
  """Initialize environment variables for the dataset module."""
2
2
 
3
+ from typing import Optional
4
+
3
5
  from environs import Env
4
6
 
5
7
  env = Env()
@@ -14,3 +16,5 @@ LIGHTLY_STUDIO_HOST: str = env.str("LIGHTLY_STUDIO_HOST", "localhost")
14
16
  LIGHTLY_STUDIO_DEBUG: str = env.bool("LIGHTLY_STUDIO_DEBUG", "false")
15
17
 
16
18
  APP_URL = f"{LIGHTLY_STUDIO_PROTOCOL}://{LIGHTLY_STUDIO_HOST}:{LIGHTLY_STUDIO_PORT}"
19
+
20
+ LIGHTLY_STUDIO_LICENSE_KEY: Optional[str] = env.str("LIGHTLY_STUDIO_LICENSE_KEY", default=None)
@@ -13,8 +13,19 @@ def download_file_if_does_not_exist(url: str, local_filename: Path) -> None:
13
13
  """Download a file from a URL if it does not already exist locally."""
14
14
  if local_filename.exists():
15
15
  return
16
- with requests.get(url, stream=True) as r, open(local_filename, "wb") as f:
17
- shutil.copyfileobj(r.raw, f)
16
+
17
+ try:
18
+ print(f"Downloading {url} to {local_filename}")
19
+ with requests.get(url, stream=True, timeout=30) as r:
20
+ # Raise an error for bad status codes
21
+ r.raise_for_status()
22
+ with open(local_filename, "wb") as f:
23
+ shutil.copyfileobj(r.raw, f)
24
+ except Exception:
25
+ # If download fails, remove any partial file to allow retry.
26
+ if local_filename.exists():
27
+ local_filename.unlink()
28
+ raise
18
29
 
19
30
 
20
31
  def get_file_xxhash(file_path: Path) -> str: