lightly-studio 0.3.1__py3-none-any.whl → 0.3.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of lightly-studio might be problematic. Click here for more details.
- lightly_studio/__init__.py +4 -4
- lightly_studio/api/app.py +7 -5
- lightly_studio/api/db_tables.py +0 -3
- lightly_studio/api/routes/api/annotation.py +32 -16
- lightly_studio/api/routes/api/annotation_label.py +2 -5
- lightly_studio/api/routes/api/annotations/__init__.py +7 -0
- lightly_studio/api/routes/api/annotations/create_annotation.py +52 -0
- lightly_studio/api/routes/api/classifier.py +2 -5
- lightly_studio/api/routes/api/dataset.py +5 -8
- lightly_studio/api/routes/api/dataset_tag.py +2 -3
- lightly_studio/api/routes/api/embeddings2d.py +104 -0
- lightly_studio/api/routes/api/export.py +73 -0
- lightly_studio/api/routes/api/metadata.py +2 -4
- lightly_studio/api/routes/api/sample.py +5 -13
- lightly_studio/api/routes/api/selection.py +87 -0
- lightly_studio/api/routes/api/settings.py +2 -6
- lightly_studio/api/routes/images.py +6 -6
- lightly_studio/core/add_samples.py +374 -0
- lightly_studio/core/dataset.py +272 -400
- lightly_studio/core/dataset_query/boolean_expression.py +67 -0
- lightly_studio/core/dataset_query/dataset_query.py +216 -0
- lightly_studio/core/dataset_query/field.py +113 -0
- lightly_studio/core/dataset_query/field_expression.py +79 -0
- lightly_studio/core/dataset_query/match_expression.py +23 -0
- lightly_studio/core/dataset_query/order_by.py +79 -0
- lightly_studio/core/dataset_query/sample_field.py +28 -0
- lightly_studio/core/dataset_query/tags_expression.py +46 -0
- lightly_studio/core/sample.py +159 -32
- lightly_studio/core/start_gui.py +35 -0
- lightly_studio/dataset/edge_embedding_generator.py +13 -8
- lightly_studio/dataset/embedding_generator.py +2 -3
- lightly_studio/dataset/embedding_manager.py +74 -6
- lightly_studio/dataset/env.py +4 -0
- lightly_studio/dataset/file_utils.py +13 -2
- lightly_studio/dataset/fsspec_lister.py +275 -0
- lightly_studio/dataset/loader.py +49 -84
- lightly_studio/dataset/mobileclip_embedding_generator.py +9 -6
- lightly_studio/db_manager.py +145 -0
- lightly_studio/dist_lightly_studio_view_app/_app/immutable/assets/0.CA_CXIBb.css +1 -0
- lightly_studio/dist_lightly_studio_view_app/_app/immutable/assets/_layout.DS78jgNY.css +1 -0
- lightly_studio/dist_lightly_studio_view_app/_app/immutable/assets/index.BVs_sZj9.css +1 -0
- lightly_studio/dist_lightly_studio_view_app/_app/immutable/assets/transform.D487hwJk.css +1 -0
- lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/6t3IJ0vQ.js +1 -0
- lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/{D6su9Aln.js → 8NsknIT2.js} +1 -1
- lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/{x9G_hzyY.js → BND_-4Kp.js} +1 -1
- lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/{BylOuP6i.js → BdfTHw61.js} +1 -1
- lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/{DOlTMNyt.js → BfHVnyNT.js} +1 -1
- lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/BjkP1AHA.js +1 -0
- lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/BuuNVL9G.js +1 -0
- lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/{O-EABkf9.js → BzKGpnl4.js} +1 -1
- lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/CCx7Ho51.js +1 -0
- lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/{l7KrR96u.js → CH6P3X75.js} +1 -1
- lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/{D5-A_Ffd.js → CR2upx_Q.js} +2 -2
- lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/CWPZrTTJ.js +1 -0
- lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/{C8I8rFJQ.js → Cs1XmhiF.js} +1 -1
- lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/{CDnpyLsT.js → CwPowJfP.js} +1 -1
- lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/CxFKfZ9T.js +1 -0
- lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/Cxevwdid.js +1 -0
- lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/{DjfY96ND.js → D4whDBUi.js} +1 -1
- lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/D6r9vr07.js +1 -0
- lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/DA6bFLPR.js +1 -0
- lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/DEgUu98i.js +3 -0
- lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/DGTPl6Gk.js +1 -0
- lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/DKGxBSlK.js +1 -0
- lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/DQXoLcsF.js +1 -0
- lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/DQe_kdRt.js +92 -0
- lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/DcY4jgG3.js +1 -0
- lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/{Bu7uvVrG.js → RmD8FzRo.js} +1 -1
- lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/V-MnMC1X.js +1 -0
- lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/{Bsi3UGy5.js → keKYsoph.js} +1 -1
- lightly_studio/dist_lightly_studio_view_app/_app/immutable/entry/app.BVr6DYqP.js +2 -0
- lightly_studio/dist_lightly_studio_view_app/_app/immutable/entry/start.u7zsVvqp.js +1 -0
- lightly_studio/dist_lightly_studio_view_app/_app/immutable/nodes/0.Da2agmdd.js +1 -0
- lightly_studio/dist_lightly_studio_view_app/_app/immutable/nodes/{1.B4rNYwVp.js → 1.B11tVRJV.js} +1 -1
- lightly_studio/dist_lightly_studio_view_app/_app/immutable/nodes/10.l30Zud4h.js +1 -0
- lightly_studio/dist_lightly_studio_view_app/_app/immutable/nodes/12.CgKPGcAP.js +1 -0
- lightly_studio/dist_lightly_studio_view_app/_app/immutable/nodes/2.C8HLK8mj.js +857 -0
- lightly_studio/dist_lightly_studio_view_app/_app/immutable/nodes/{3.CWHpKonm.js → 3.CLvg3QcJ.js} +1 -1
- lightly_studio/dist_lightly_studio_view_app/_app/immutable/nodes/{4.OUWOLQeV.js → 4.BQhDtXUI.js} +1 -1
- lightly_studio/dist_lightly_studio_view_app/_app/immutable/nodes/5.-6XqWX5G.js +1 -0
- lightly_studio/dist_lightly_studio_view_app/_app/immutable/nodes/6.uBV1Lhat.js +1 -0
- lightly_studio/dist_lightly_studio_view_app/_app/immutable/nodes/7.BXsgoQZh.js +1 -0
- lightly_studio/dist_lightly_studio_view_app/_app/immutable/nodes/8.BkbcnUs8.js +1 -0
- lightly_studio/dist_lightly_studio_view_app/_app/immutable/nodes/{9.CPu3CiBc.js → 9.Bkrv-Vww.js} +1 -1
- lightly_studio/dist_lightly_studio_view_app/_app/immutable/workers/clustering.worker-DKqeLtG0.js +2 -0
- lightly_studio/dist_lightly_studio_view_app/_app/immutable/workers/search.worker-vNSty3B0.js +1 -0
- lightly_studio/dist_lightly_studio_view_app/_app/version.json +1 -1
- lightly_studio/dist_lightly_studio_view_app/index.html +14 -14
- lightly_studio/examples/example.py +13 -12
- lightly_studio/examples/example_coco.py +13 -0
- lightly_studio/examples/example_metadata.py +83 -98
- lightly_studio/examples/example_selection.py +7 -19
- lightly_studio/examples/example_split_work.py +12 -36
- lightly_studio/examples/{example_v2.py → example_yolo.py} +3 -4
- lightly_studio/export/export_dataset.py +65 -0
- lightly_studio/export/lightly_studio_label_input.py +120 -0
- lightly_studio/few_shot_classifier/classifier_manager.py +5 -26
- lightly_studio/metadata/compute_typicality.py +67 -0
- lightly_studio/models/annotation/annotation_base.py +18 -20
- lightly_studio/models/annotation/instance_segmentation.py +8 -8
- lightly_studio/models/annotation/object_detection.py +4 -4
- lightly_studio/models/dataset.py +6 -2
- lightly_studio/models/sample.py +10 -3
- lightly_studio/resolvers/annotation_label_resolver/__init__.py +2 -1
- lightly_studio/resolvers/annotation_label_resolver/get_all.py +15 -0
- lightly_studio/resolvers/annotation_resolver/__init__.py +2 -3
- lightly_studio/resolvers/annotation_resolver/create_many.py +3 -3
- lightly_studio/resolvers/annotation_resolver/delete_annotation.py +1 -1
- lightly_studio/resolvers/annotation_resolver/delete_annotations.py +7 -3
- lightly_studio/resolvers/annotation_resolver/get_by_id.py +19 -1
- lightly_studio/resolvers/annotation_resolver/update_annotation_label.py +0 -1
- lightly_studio/resolvers/annotations/annotations_filter.py +1 -11
- lightly_studio/resolvers/dataset_resolver.py +10 -0
- lightly_studio/resolvers/embedding_model_resolver.py +22 -0
- lightly_studio/resolvers/sample_resolver.py +53 -9
- lightly_studio/resolvers/tag_resolver.py +23 -0
- lightly_studio/selection/mundig.py +7 -10
- lightly_studio/selection/select.py +55 -46
- lightly_studio/selection/select_via_db.py +23 -19
- lightly_studio/selection/selection_config.py +10 -4
- lightly_studio/services/annotations_service/__init__.py +12 -0
- lightly_studio/services/annotations_service/create_annotation.py +63 -0
- lightly_studio/services/annotations_service/delete_annotation.py +22 -0
- lightly_studio/services/annotations_service/update_annotation.py +21 -32
- lightly_studio/services/annotations_service/update_annotation_bounding_box.py +36 -0
- lightly_studio-0.3.3.dist-info/METADATA +814 -0
- {lightly_studio-0.3.1.dist-info → lightly_studio-0.3.3.dist-info}/RECORD +130 -113
- lightly_studio/api/db.py +0 -133
- lightly_studio/api/routes/api/annotation_task.py +0 -38
- lightly_studio/api/routes/api/metrics.py +0 -80
- lightly_studio/dist_lightly_studio_view_app/_app/immutable/assets/0.DenzbfeK.css +0 -1
- lightly_studio/dist_lightly_studio_view_app/_app/immutable/assets/SelectableSvgGroup.OwPEPQZu.css +0 -1
- lightly_studio/dist_lightly_studio_view_app/_app/immutable/assets/SelectableSvgGroup.b653GmVf.css +0 -1
- lightly_studio/dist_lightly_studio_view_app/_app/immutable/assets/_layout.T-zjSUd3.css +0 -1
- lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/B2FVR0s0.js +0 -1
- lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/B9zumHo5.js +0 -1
- lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/BJXwVxaE.js +0 -1
- lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/Bx1xMsFy.js +0 -1
- lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/CcaPhhk3.js +0 -1
- lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/CvOmgdoc.js +0 -93
- lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/CxtLVaYz.js +0 -3
- lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/D6RI2Zrd.js +0 -1
- lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/D98V7j6A.js +0 -1
- lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/DIRAtgl0.js +0 -1
- lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/DjUWrjOv.js +0 -1
- lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/XO7A28GO.js +0 -1
- lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/hQVEETDE.js +0 -1
- lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/nAHhluT7.js +0 -1
- lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/r64xT6ao.js +0 -1
- lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/vC4nQVEB.js +0 -1
- lightly_studio/dist_lightly_studio_view_app/_app/immutable/entry/app.CjnvpsmS.js +0 -2
- lightly_studio/dist_lightly_studio_view_app/_app/immutable/entry/start.0o1H7wM9.js +0 -1
- lightly_studio/dist_lightly_studio_view_app/_app/immutable/nodes/0.XRq_TUwu.js +0 -1
- lightly_studio/dist_lightly_studio_view_app/_app/immutable/nodes/10.DfBwOEhN.js +0 -1
- lightly_studio/dist_lightly_studio_view_app/_app/immutable/nodes/12.CwF2_8mP.js +0 -1
- lightly_studio/dist_lightly_studio_view_app/_app/immutable/nodes/2.CS4muRY-.js +0 -6
- lightly_studio/dist_lightly_studio_view_app/_app/immutable/nodes/5.Dm6t9F5W.js +0 -1
- lightly_studio/dist_lightly_studio_view_app/_app/immutable/nodes/6.Bw5ck4gK.js +0 -1
- lightly_studio/dist_lightly_studio_view_app/_app/immutable/nodes/7.CF0EDTR6.js +0 -1
- lightly_studio/dist_lightly_studio_view_app/_app/immutable/nodes/8.Cw30LEcV.js +0 -1
- lightly_studio/metrics/detection/__init__.py +0 -0
- lightly_studio/metrics/detection/map.py +0 -268
- lightly_studio/models/annotation_task.py +0 -28
- lightly_studio/resolvers/annotation_resolver/create.py +0 -19
- lightly_studio/resolvers/annotation_task_resolver.py +0 -31
- lightly_studio-0.3.1.dist-info/METADATA +0 -520
- /lightly_studio/{metrics → core/dataset_query}/__init__.py +0 -0
- /lightly_studio/dist_lightly_studio_view_app/_app/immutable/assets/{OpenSans- → OpenSans-Medium.DVUZMR_6.ttf} +0 -0
- {lightly_studio-0.3.1.dist-info → lightly_studio-0.3.3.dist-info}/WHEEL +0 -0
|
@@ -0,0 +1,275 @@
|
|
|
1
|
+
"""File listing utilities using fsspec.
|
|
2
|
+
|
|
3
|
+
Handles local and remote paths, directories, and glob patterns.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from __future__ import annotations
|
|
7
|
+
|
|
8
|
+
import logging
|
|
9
|
+
from collections.abc import Iterator
|
|
10
|
+
from typing import Any
|
|
11
|
+
|
|
12
|
+
import fsspec
|
|
13
|
+
from tqdm import tqdm
|
|
14
|
+
|
|
15
|
+
# Constants
# Separator between a protocol and the rest of a URL, e.g. "s3://bucket/key".
PROTOCOL_SEPARATOR = "://"
# Protocol assumed for paths that carry no explicit "://" prefix.
DEFAULT_PROTOCOL = "file"
# fsspec paths use forward slashes regardless of the host OS.
PATH_SEPARATOR = "/"

# Glob pattern characters
GLOB_CHARS = ["*", "?", "[", "]"]

# Cloud storage protocols
# For these, directory listing goes through fs.walk() rather than fs.find()
# (see _stream_files_from_directory).
CLOUD_PROTOCOLS = ("s3", "gs", "gcs", "azure", "abfs")

# Image file extensions
# Default extension filter used when the caller does not supply one.
IMAGE_EXTENSIONS = {
    ".png",
    ".jpg",
    ".jpeg",
    ".gif",
    ".webp",
    ".bmp",
    ".tiff",
}
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def iter_files_from_path(path: str, allowed_extensions: set[str] | None = None) -> Iterator[str]:
    """List all files from a single path, handling directories, globs, and individual files.

    Args:
        path: A single path which can be:
            - Individual file path
            - Directory path (will list all files recursively)
            - Glob pattern
            - Remote path (s3://, gcs://, etc.)
        allowed_extensions: Optional set of allowed file extensions (e.g., {".jpg", ".png"}).
            If None, uses default IMAGE_EXTENSIONS.

    Yields:
        File paths as they are discovered, with progress tracking
    """
    seen: set[str] = set()
    # Fall back to IMAGE_EXTENSIONS only when no set was given. An explicitly
    # empty set means "match nothing", per the documented contract; the former
    # `allowed_extensions or IMAGE_EXTENSIONS` silently overrode it.
    extensions = IMAGE_EXTENSIONS if allowed_extensions is None else allowed_extensions
    with tqdm(desc="Discovering files", unit=" files", dynamic_ncols=True) as pbar:
        cleaned_path = str(path).strip()
        if not cleaned_path:
            # Nothing to discover for a blank path; yield no files.
            return
        fs = _get_filesystem(cleaned_path)
        yield from _process_single_path_streaming(fs, cleaned_path, seen, pbar, extensions)
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def _process_single_path_streaming(
    fs: fsspec.AbstractFileSystem, path: str, seen: set[str], pbar: tqdm[Any], extensions: set[str]
) -> Iterator[str]:
    """Process a single path and yield matching image files.

    Dispatches on the path kind: glob pattern, individual file, or directory.

    Args:
        fs: The filesystem instance.
        path: The path to process (file, directory, or glob pattern).
        seen: Set of already processed paths to avoid duplicates.
        pbar: Progress bar instance for tracking progress.
        extensions: Set of allowed file extensions.

    Yields:
        File paths that match the criteria.

    Raises:
        ValueError: If the path doesn't exist or is not an image file when expected.
    """
    # Glob patterns are handled first: they generally do not "exist" as
    # literal paths, so the existence check below would reject them.
    if _is_glob_pattern(path):
        yield from _process_glob_pattern(fs, path, seen, pbar, extensions)
        return
    if not fs.exists(path):
        raise ValueError(f"Path does not exist: {path}")
    if fs.isfile(path):
        if not _is_image_file(path, extensions):
            raise ValueError(f"File is not an image: {path}")
        if path in seen:
            # Duplicate single-file path: silently skip, matching prior runs.
            return
        seen.add(path)
        pbar.update(1)
        yield path
        return
    if fs.isdir(path):
        for candidate in _stream_files_from_directory(fs, path, extensions):
            if candidate in seen:
                continue
            seen.add(candidate)
            pbar.update(1)
            yield candidate
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
def _process_glob_pattern(
    fs: fsspec.AbstractFileSystem, path: str, seen: set[str], pbar: tqdm[Any], extensions: set[str]
) -> Iterator[str]:
    """Process glob pattern and yield matching image files.

    Args:
        fs: The filesystem instance.
        path: The glob pattern path.
        seen: Set of already processed paths to avoid duplicates.
        pbar: Progress bar instance for tracking progress.
        extensions: Set of allowed file extensions.

    Yields:
        File paths that match the glob pattern and allowed extensions.
    """
    for match in fs.glob(path):
        candidate = str(match)
        # Remote filesystems return protocol-less paths from glob();
        # re-attach the scheme so downstream consumers get full URLs.
        if _needs_protocol_prefix(candidate, fs):
            candidate = f"{_get_protocol_string(fs)}{PROTOCOL_SEPARATOR}{candidate}"
        if not fs.isfile(candidate):
            continue
        if not _is_image_file(candidate, extensions):
            continue
        if candidate in seen:
            continue
        seen.add(candidate)
        pbar.update(1)
        yield candidate
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
def _stream_files_from_directory(
    fs: fsspec.AbstractFileSystem, path: str, extensions: set[str]
) -> Iterator[str]:
    """Stream files from a directory with progress tracking.

    Args:
        fs: The filesystem instance
        path: Directory path to list
        extensions: Set of allowed file extensions

    Yields:
        File paths as they are discovered
    """
    try:
        protocol = _get_protocol_string(fs)
        if protocol in CLOUD_PROTOCOLS:
            # Cloud backends go straight to walk(); find() can issue large
            # up-front listing requests on object stores.
            yield from _stream_files_using_walk(fs, path, extensions)
        else:
            try:
                all_paths = fs.find(path, detail=False)
                for p in all_paths:
                    if fs.isfile(p) and _is_image_file(p, extensions):
                        yield p
            except Exception as e:
                # Lazy %-args: the message is only formatted if the record is
                # actually emitted (f-strings always paid the cost).
                logging.warning(
                    "fs.find() failed for %s, trying alternative method: %s", path, e
                )
                yield from _stream_files_using_walk(fs, path, extensions)
    except Exception as e:
        # Deliberate best-effort: a listing failure is logged, not raised,
        # so one bad directory does not abort the whole discovery run.
        logging.error("Error streaming files from '%s': %s", path, e)
|
|
157
|
+
|
|
158
|
+
|
|
159
|
+
def _stream_files_using_walk(
    fs: fsspec.AbstractFileSystem, path: str, extensions: set[str]
) -> Iterator[str]:
    """Stream files using fs.walk() method.

    Args:
        fs: The filesystem instance.
        path: The directory path to walk.
        extensions: Set of allowed file extensions.

    Yields:
        File paths that match the allowed extensions.
    """
    for root, _subdirs, filenames in fs.walk(path):
        # Root is invariant within the inner loop, so the separator check is
        # hoisted: append "/" only when the root does not already end with it.
        base = root if root.endswith(PATH_SEPARATOR) else root + PATH_SEPARATOR
        for name in filenames:
            candidate = base + name
            if _needs_protocol_prefix(candidate, fs):
                candidate = f"{_get_protocol_string(fs)}{PROTOCOL_SEPARATOR}{candidate}"
            if _is_image_file(candidate, extensions):
                yield candidate
|
|
188
|
+
|
|
189
|
+
|
|
190
|
+
def _get_filesystem(path: str) -> fsspec.AbstractFileSystem:
    """Get the appropriate filesystem for the given path.

    Args:
        path: The path to determine the filesystem for. Can be local or remote.

    Returns:
        An fsspec filesystem instance appropriate for the path's protocol.

    Raises:
        ValueError: If the protocol cannot be determined or is invalid.
    """
    # str.split always returns a list of strings, so element [0] is a str;
    # the previous isinstance(list/tuple) normalization was unreachable.
    if PROTOCOL_SEPARATOR in path:
        protocol = path.split(PROTOCOL_SEPARATOR, 1)[0]
    else:
        protocol = DEFAULT_PROTOCOL
    return fsspec.filesystem(protocol)
|
|
209
|
+
|
|
210
|
+
|
|
211
|
+
def _is_glob_pattern(path: str) -> bool:
    """Check if a path contains glob pattern characters.

    Args:
        path: The path to check for glob patterns.

    Returns:
        True if the path contains glob pattern characters (*, ?, [, ]), False otherwise.
    """
    for special in GLOB_CHARS:
        if special in path:
            return True
    return False
|
|
221
|
+
|
|
222
|
+
|
|
223
|
+
def _is_image_file(path: str, extensions: set[str]) -> bool:
|
|
224
|
+
"""Check if a file is an image based on its extension.
|
|
225
|
+
|
|
226
|
+
Args:
|
|
227
|
+
path: The file path to check.
|
|
228
|
+
extensions: Set of allowed file extensions (e.g., {'.jpg', '.png'}).
|
|
229
|
+
|
|
230
|
+
Returns:
|
|
231
|
+
True if the file has an allowed image extension, False otherwise.
|
|
232
|
+
"""
|
|
233
|
+
path_lower = path.lower()
|
|
234
|
+
return any(path_lower.endswith(ext) for ext in extensions)
|
|
235
|
+
|
|
236
|
+
|
|
237
|
+
def _needs_protocol_prefix(path: str, fs: fsspec.AbstractFileSystem) -> bool:
    """Check if a path needs protocol prefix.

    Args:
        path: The path to check.
        fs: The filesystem instance.

    Returns:
        True if the path needs a protocol prefix (e.g., for cloud storage),
        False if it is a local path or already carries a protocol.
    """
    # Paths that already carry an explicit protocol never need a prefix.
    if PROTOCOL_SEPARATOR in path:
        return False
    # Delegate the protocol extraction (including tuple/list normalization)
    # to _get_protocol_string instead of duplicating it here. When the
    # filesystem has no `protocol` attribute, the helper returns
    # DEFAULT_PROTOCOL, which yields False — same as the old explicit check.
    return _get_protocol_string(fs) != DEFAULT_PROTOCOL
|
|
260
|
+
|
|
261
|
+
|
|
262
|
+
def _get_protocol_string(fs: fsspec.AbstractFileSystem) -> str:
    """Get the protocol string from filesystem.

    Args:
        fs: The filesystem instance.

    Returns:
        The protocol string (e.g., 's3', 'file', 'gcs').
        Returns 'file' as default if protocol cannot be determined.
    """
    raw = getattr(fs, "protocol", DEFAULT_PROTOCOL)
    # fsspec filesystems may advertise multiple protocol aliases as a
    # list/tuple; the first entry is the canonical name.
    if isinstance(raw, (list, tuple)):
        raw = raw[0]
    return str(raw)
|
lightly_studio/dataset/loader.py
CHANGED
|
@@ -8,6 +8,7 @@ from pathlib import Path
|
|
|
8
8
|
from typing import Iterable
|
|
9
9
|
from uuid import UUID
|
|
10
10
|
|
|
11
|
+
import fsspec
|
|
11
12
|
import PIL
|
|
12
13
|
from labelformat.formats import (
|
|
13
14
|
COCOInstanceSegmentationInput,
|
|
@@ -29,10 +30,10 @@ from labelformat.model.object_detection import (
|
|
|
29
30
|
from sqlmodel import Session
|
|
30
31
|
from tqdm import tqdm
|
|
31
32
|
|
|
32
|
-
from lightly_studio
|
|
33
|
+
from lightly_studio import db_manager
|
|
33
34
|
from lightly_studio.api.features import lightly_studio_active_features
|
|
34
35
|
from lightly_studio.api.server import Server
|
|
35
|
-
from lightly_studio.dataset import env
|
|
36
|
+
from lightly_studio.dataset import env, fsspec_lister
|
|
36
37
|
from lightly_studio.dataset.embedding_generator import EmbeddingGenerator
|
|
37
38
|
from lightly_studio.dataset.embedding_manager import (
|
|
38
39
|
EmbeddingManager,
|
|
@@ -40,16 +41,11 @@ from lightly_studio.dataset.embedding_manager import (
|
|
|
40
41
|
)
|
|
41
42
|
from lightly_studio.models.annotation.annotation_base import AnnotationCreate
|
|
42
43
|
from lightly_studio.models.annotation_label import AnnotationLabelCreate
|
|
43
|
-
from lightly_studio.models.annotation_task import (
|
|
44
|
-
AnnotationTaskTable,
|
|
45
|
-
AnnotationType,
|
|
46
|
-
)
|
|
47
44
|
from lightly_studio.models.dataset import DatasetCreate, DatasetTable
|
|
48
45
|
from lightly_studio.models.sample import SampleCreate, SampleTable
|
|
49
46
|
from lightly_studio.resolvers import (
|
|
50
47
|
annotation_label_resolver,
|
|
51
48
|
annotation_resolver,
|
|
52
|
-
annotation_task_resolver,
|
|
53
49
|
dataset_resolver,
|
|
54
50
|
sample_resolver,
|
|
55
51
|
)
|
|
@@ -66,7 +62,6 @@ class AnnotationProcessingContext:
|
|
|
66
62
|
dataset_id: UUID
|
|
67
63
|
sample_id: UUID
|
|
68
64
|
label_map: dict[int, UUID]
|
|
69
|
-
annotation_task_id: UUID
|
|
70
65
|
|
|
71
66
|
|
|
72
67
|
class DatasetLoader:
|
|
@@ -82,7 +77,6 @@ class DatasetLoader:
|
|
|
82
77
|
dataset: DatasetTable,
|
|
83
78
|
input_labels: ObjectDetectionInput | InstanceSegmentationInput,
|
|
84
79
|
img_dir: Path,
|
|
85
|
-
annotation_task_id: UUID,
|
|
86
80
|
) -> None:
|
|
87
81
|
"""Store a loaded dataset in database."""
|
|
88
82
|
# Create label mapping
|
|
@@ -119,7 +113,6 @@ class DatasetLoader:
|
|
|
119
113
|
samples_data=samples_image_data,
|
|
120
114
|
dataset_id=dataset.dataset_id,
|
|
121
115
|
label_map=label_map,
|
|
122
|
-
annotation_task_id=annotation_task_id,
|
|
123
116
|
annotations_to_create=annotations_to_create,
|
|
124
117
|
sample_ids=sample_ids,
|
|
125
118
|
)
|
|
@@ -136,7 +129,6 @@ class DatasetLoader:
|
|
|
136
129
|
samples_data=samples_image_data,
|
|
137
130
|
dataset_id=dataset.dataset_id,
|
|
138
131
|
label_map=label_map,
|
|
139
|
-
annotation_task_id=annotation_task_id,
|
|
140
132
|
annotations_to_create=annotations_to_create,
|
|
141
133
|
sample_ids=sample_ids,
|
|
142
134
|
)
|
|
@@ -187,23 +179,18 @@ class DatasetLoader:
|
|
|
187
179
|
input_labels=label_input,
|
|
188
180
|
dataset_name=dataset_name,
|
|
189
181
|
img_dir=str(img_dir),
|
|
190
|
-
is_prediction=False,
|
|
191
|
-
task_name=task_name,
|
|
192
182
|
)
|
|
193
183
|
|
|
194
184
|
def from_coco_object_detections(
|
|
195
185
|
self,
|
|
196
186
|
annotations_json_path: str,
|
|
197
187
|
img_dir: str,
|
|
198
|
-
task_name: str | None = None,
|
|
199
188
|
) -> DatasetTable:
|
|
200
189
|
"""Load a dataset in COCO Object Detection format and store in DB.
|
|
201
190
|
|
|
202
191
|
Args:
|
|
203
192
|
annotations_json_path: Path to the COCO annotations JSON file.
|
|
204
193
|
img_dir: Path to the folder containing the images.
|
|
205
|
-
task_name: Optional name for the annotation task. If None, a
|
|
206
|
-
default name is generated.
|
|
207
194
|
|
|
208
195
|
Returns:
|
|
209
196
|
DatasetTable: The created dataset table entry.
|
|
@@ -211,9 +198,6 @@ class DatasetLoader:
|
|
|
211
198
|
annotations_json = Path(annotations_json_path)
|
|
212
199
|
dataset_name = annotations_json.parent.name
|
|
213
200
|
|
|
214
|
-
if task_name is None:
|
|
215
|
-
task_name = f"Loaded from COCO Object Detection: {annotations_json.name}"
|
|
216
|
-
|
|
217
201
|
label_input = COCOObjectDetectionInput(
|
|
218
202
|
input_file=annotations_json,
|
|
219
203
|
)
|
|
@@ -223,23 +207,18 @@ class DatasetLoader:
|
|
|
223
207
|
input_labels=label_input,
|
|
224
208
|
dataset_name=dataset_name,
|
|
225
209
|
img_dir=str(img_dir_path),
|
|
226
|
-
is_prediction=False,
|
|
227
|
-
task_name=task_name,
|
|
228
210
|
)
|
|
229
211
|
|
|
230
212
|
def from_coco_instance_segmentations(
|
|
231
213
|
self,
|
|
232
214
|
annotations_json_path: str,
|
|
233
215
|
img_dir: str,
|
|
234
|
-
task_name: str | None = None,
|
|
235
216
|
) -> DatasetTable:
|
|
236
217
|
"""Load a dataset in COCO Instance Segmentation format and store in DB.
|
|
237
218
|
|
|
238
219
|
Args:
|
|
239
220
|
annotations_json_path: Path to the COCO annotations JSON file.
|
|
240
221
|
img_dir: Path to the folder containing the images.
|
|
241
|
-
task_name: Optional name for the annotation task. If None, a
|
|
242
|
-
default name is generated.
|
|
243
222
|
|
|
244
223
|
Returns:
|
|
245
224
|
DatasetTable: The created dataset table entry.
|
|
@@ -247,9 +226,6 @@ class DatasetLoader:
|
|
|
247
226
|
annotations_json = Path(annotations_json_path)
|
|
248
227
|
dataset_name = annotations_json.parent.name
|
|
249
228
|
|
|
250
|
-
if task_name is None:
|
|
251
|
-
task_name = f"Loaded from COCO Instance Segmentation: {annotations_json.name}"
|
|
252
|
-
|
|
253
229
|
label_input = COCOInstanceSegmentationInput(
|
|
254
230
|
input_file=annotations_json,
|
|
255
231
|
)
|
|
@@ -259,8 +235,6 @@ class DatasetLoader:
|
|
|
259
235
|
input_labels=label_input,
|
|
260
236
|
dataset_name=dataset_name,
|
|
261
237
|
img_dir=str(img_dir_path),
|
|
262
|
-
is_prediction=False,
|
|
263
|
-
task_name=task_name,
|
|
264
238
|
)
|
|
265
239
|
|
|
266
240
|
def from_labelformat(
|
|
@@ -268,8 +242,6 @@ class DatasetLoader:
|
|
|
268
242
|
input_labels: ObjectDetectionInput | InstanceSegmentationInput,
|
|
269
243
|
dataset_name: str,
|
|
270
244
|
img_dir: str,
|
|
271
|
-
is_prediction: bool = True,
|
|
272
|
-
task_name: str | None = None,
|
|
273
245
|
) -> DatasetTable:
|
|
274
246
|
"""Load a dataset from a labelformat object and store in database.
|
|
275
247
|
|
|
@@ -277,24 +249,12 @@ class DatasetLoader:
|
|
|
277
249
|
input_labels: The labelformat input object.
|
|
278
250
|
dataset_name: The name for the new dataset.
|
|
279
251
|
img_dir: Path to the folder containing the images.
|
|
280
|
-
is_prediction: Whether the task is for prediction or labels.
|
|
281
|
-
task_name: Optional name for the annotation task. If None, a
|
|
282
|
-
default name is generated.
|
|
283
252
|
|
|
284
253
|
Returns:
|
|
285
254
|
DatasetTable: The created dataset table entry.
|
|
286
255
|
"""
|
|
287
256
|
img_dir_path = Path(img_dir).absolute()
|
|
288
257
|
|
|
289
|
-
# Determine annotation type based on input.
|
|
290
|
-
# Currently, we always create BBOX tasks, even for segmentation,
|
|
291
|
-
# as segmentation data is stored alongside bounding boxes.
|
|
292
|
-
annotation_type = AnnotationType.BBOX
|
|
293
|
-
|
|
294
|
-
# Generate a default task name if none is provided.
|
|
295
|
-
if task_name is None:
|
|
296
|
-
task_name = f"Loaded from labelformat: {dataset_name}"
|
|
297
|
-
|
|
298
258
|
# Create dataset and annotation task.
|
|
299
259
|
dataset = dataset_resolver.create(
|
|
300
260
|
session=self.session,
|
|
@@ -303,20 +263,11 @@ class DatasetLoader:
|
|
|
303
263
|
directory=str(img_dir_path),
|
|
304
264
|
),
|
|
305
265
|
)
|
|
306
|
-
new_annotation_task = annotation_task_resolver.create(
|
|
307
|
-
session=self.session,
|
|
308
|
-
annotation_task=AnnotationTaskTable(
|
|
309
|
-
name=task_name,
|
|
310
|
-
annotation_type=annotation_type,
|
|
311
|
-
is_prediction=is_prediction,
|
|
312
|
-
),
|
|
313
|
-
)
|
|
314
266
|
|
|
315
267
|
self._load_into_dataset(
|
|
316
268
|
dataset=dataset,
|
|
317
269
|
input_labels=input_labels,
|
|
318
270
|
img_dir=img_dir_path,
|
|
319
|
-
annotation_task_id=new_annotation_task.annotation_task_id,
|
|
320
271
|
)
|
|
321
272
|
return dataset
|
|
322
273
|
|
|
@@ -324,7 +275,6 @@ class DatasetLoader:
|
|
|
324
275
|
self,
|
|
325
276
|
dataset_name: str,
|
|
326
277
|
img_dir: str,
|
|
327
|
-
recursive: bool = True,
|
|
328
278
|
allowed_extensions: Iterable[str] = {
|
|
329
279
|
".png",
|
|
330
280
|
".jpg",
|
|
@@ -340,31 +290,22 @@ class DatasetLoader:
|
|
|
340
290
|
Args:
|
|
341
291
|
dataset_name: The name for the new dataset.
|
|
342
292
|
img_dir: Path to the folder containing the images.
|
|
343
|
-
recursive: If True, search for images recursively in subfolders.
|
|
344
293
|
allowed_extensions: An iterable container of allowed image file
|
|
345
294
|
extensions.
|
|
346
295
|
"""
|
|
347
|
-
img_dir_path = Path(img_dir).absolute()
|
|
348
|
-
if not img_dir_path.exists() or not img_dir_path.is_dir():
|
|
349
|
-
raise ValueError(f"Input images folder is not a valid directory: {img_dir_path}")
|
|
350
|
-
|
|
351
296
|
# Create dataset.
|
|
352
297
|
dataset = dataset_resolver.create(
|
|
353
298
|
session=self.session,
|
|
354
299
|
dataset=DatasetCreate(
|
|
355
300
|
name=dataset_name,
|
|
356
|
-
directory=
|
|
301
|
+
directory=img_dir,
|
|
357
302
|
),
|
|
358
303
|
)
|
|
359
304
|
|
|
360
|
-
# Collect image file paths.
|
|
305
|
+
# Collect image file paths with extension filtering.
|
|
361
306
|
allowed_extensions_set = {ext.lower() for ext in allowed_extensions}
|
|
362
|
-
image_paths =
|
|
363
|
-
|
|
364
|
-
for path in path_iter:
|
|
365
|
-
if path.is_file() and path.suffix.lower() in allowed_extensions_set:
|
|
366
|
-
image_paths.append(path)
|
|
367
|
-
print(f"Found {len(image_paths)} images in {img_dir_path}.")
|
|
307
|
+
image_paths = list(fsspec_lister.iter_files_from_path(img_dir, allowed_extensions_set))
|
|
308
|
+
print(f"Found {len(image_paths)} images in {img_dir}.")
|
|
368
309
|
|
|
369
310
|
# Process images.
|
|
370
311
|
sample_ids = _create_samples_from_paths(
|
|
@@ -383,8 +324,37 @@ class DatasetLoader:
|
|
|
383
324
|
|
|
384
325
|
return dataset
|
|
385
326
|
|
|
327
|
+
def _validate_has_samples(self) -> None:
|
|
328
|
+
"""Validate that there are samples in the database before starting GUI.
|
|
329
|
+
|
|
330
|
+
Raises:
|
|
331
|
+
ValueError: If no samples are found in any dataset.
|
|
332
|
+
"""
|
|
333
|
+
# Check if any datasets exist
|
|
334
|
+
datasets = dataset_resolver.get_all(session=self.session, offset=0, limit=1)
|
|
335
|
+
|
|
336
|
+
if not datasets:
|
|
337
|
+
raise ValueError(
|
|
338
|
+
"No datasets found. Please load a dataset using one of the loader methods "
|
|
339
|
+
"(e.g., from_yolo(), from_directory(), etc.) before starting the GUI."
|
|
340
|
+
)
|
|
341
|
+
|
|
342
|
+
# Check if there are any samples in the first dataset
|
|
343
|
+
first_dataset = datasets[0]
|
|
344
|
+
sample_count = sample_resolver.count_by_dataset_id(
|
|
345
|
+
session=self.session, dataset_id=first_dataset.dataset_id
|
|
346
|
+
)
|
|
347
|
+
|
|
348
|
+
if sample_count == 0:
|
|
349
|
+
raise ValueError(
|
|
350
|
+
"No images have been indexed for the first dataset. "
|
|
351
|
+
"Please ensure your dataset contains valid images and try loading again."
|
|
352
|
+
)
|
|
353
|
+
|
|
386
354
|
def start_gui(self) -> None:
|
|
387
355
|
"""Launch the web interface for the loaded dataset."""
|
|
356
|
+
self._validate_has_samples()
|
|
357
|
+
|
|
388
358
|
server = Server(host=env.LIGHTLY_STUDIO_HOST, port=env.LIGHTLY_STUDIO_PORT)
|
|
389
359
|
|
|
390
360
|
print(f"Open the LightlyStudio GUI under: {env.APP_URL}")
|
|
@@ -395,7 +365,7 @@ class DatasetLoader:
|
|
|
395
365
|
def _create_samples_from_paths(
|
|
396
366
|
session: Session,
|
|
397
367
|
dataset_id: UUID,
|
|
398
|
-
image_paths: Iterable[
|
|
368
|
+
image_paths: Iterable[str],
|
|
399
369
|
) -> Iterator[UUID]:
|
|
400
370
|
"""Create samples from a list of image paths.
|
|
401
371
|
|
|
@@ -415,15 +385,14 @@ def _create_samples_from_paths(
|
|
|
415
385
|
unit=" images",
|
|
416
386
|
):
|
|
417
387
|
try:
|
|
418
|
-
|
|
419
|
-
|
|
420
|
-
image.close()
|
|
388
|
+
with fsspec.open(image_path, "rb") as file, PIL.Image.open(file) as img:
|
|
389
|
+
width, height = img.size
|
|
421
390
|
except (FileNotFoundError, PIL.UnidentifiedImageError, OSError):
|
|
422
391
|
continue
|
|
423
392
|
|
|
424
393
|
sample = SampleCreate(
|
|
425
|
-
file_name=image_path.name,
|
|
426
|
-
file_path_abs=
|
|
394
|
+
file_name=Path(image_path).name,
|
|
395
|
+
file_path_abs=image_path,
|
|
427
396
|
width=width,
|
|
428
397
|
height=height,
|
|
429
398
|
dataset_id=dataset_id,
|
|
@@ -477,12 +446,11 @@ def _process_object_detection_annotations(
|
|
|
477
446
|
sample_id=context.sample_id,
|
|
478
447
|
annotation_label_id=context.label_map[obj.category.id],
|
|
479
448
|
annotation_type="object_detection",
|
|
480
|
-
x=x,
|
|
481
|
-
y=y,
|
|
482
|
-
width=width,
|
|
483
|
-
height=height,
|
|
449
|
+
x=int(x),
|
|
450
|
+
y=int(y),
|
|
451
|
+
width=int(width),
|
|
452
|
+
height=int(height),
|
|
484
453
|
confidence=obj.confidence,
|
|
485
|
-
annotation_task_id=context.annotation_task_id,
|
|
486
454
|
)
|
|
487
455
|
)
|
|
488
456
|
return new_annotations
|
|
@@ -512,12 +480,11 @@ def _process_instance_segmentation_annotations(
|
|
|
512
480
|
sample_id=context.sample_id,
|
|
513
481
|
annotation_label_id=context.label_map[obj.category.id],
|
|
514
482
|
annotation_type="instance_segmentation",
|
|
515
|
-
x=x,
|
|
516
|
-
y=y,
|
|
517
|
-
width=width,
|
|
518
|
-
height=height,
|
|
483
|
+
x=int(x),
|
|
484
|
+
y=int(y),
|
|
485
|
+
width=int(width),
|
|
486
|
+
height=int(height),
|
|
519
487
|
segmentation_mask=segmentation_rle,
|
|
520
|
-
annotation_task_id=context.annotation_task_id,
|
|
521
488
|
)
|
|
522
489
|
)
|
|
523
490
|
return new_annotations
|
|
@@ -529,7 +496,6 @@ def _process_batch_annotations( # noqa: PLR0913
|
|
|
529
496
|
samples_data: list[tuple[SampleCreate, ImageInstanceSegmentation | ImageObjectDetection]],
|
|
530
497
|
dataset_id: UUID,
|
|
531
498
|
label_map: dict[int, UUID],
|
|
532
|
-
annotation_task_id: UUID,
|
|
533
499
|
annotations_to_create: list[AnnotationCreate],
|
|
534
500
|
sample_ids: list[UUID],
|
|
535
501
|
) -> None:
|
|
@@ -541,7 +507,6 @@ def _process_batch_annotations( # noqa: PLR0913
|
|
|
541
507
|
dataset_id=dataset_id,
|
|
542
508
|
sample_id=stored_sample.sample_id,
|
|
543
509
|
label_map=label_map,
|
|
544
|
-
annotation_task_id=annotation_task_id,
|
|
545
510
|
)
|
|
546
511
|
|
|
547
512
|
if isinstance(img_data, ImageInstanceSegmentation):
|
|
@@ -7,6 +7,7 @@ from pathlib import Path
|
|
|
7
7
|
from typing import Callable
|
|
8
8
|
from uuid import UUID
|
|
9
9
|
|
|
10
|
+
import fsspec
|
|
10
11
|
import torch
|
|
11
12
|
from PIL import Image
|
|
12
13
|
from torch.utils.data import DataLoader, Dataset
|
|
@@ -23,6 +24,7 @@ MOBILECLIP_DOWNLOAD_URL = (
|
|
|
23
24
|
f"https://docs-assets.developer.apple.com/ml-research/datasets/mobileclip/{MODEL_NAME}.pt"
|
|
24
25
|
)
|
|
25
26
|
MAX_BATCH_SIZE: int = 16
|
|
27
|
+
EMBEDDING_DIMENSION: int = 512
|
|
26
28
|
|
|
27
29
|
|
|
28
30
|
# Dataset for efficient batched image loading and preprocessing
|
|
@@ -31,7 +33,7 @@ class _ImageFileDataset(Dataset[torch.Tensor]):
|
|
|
31
33
|
|
|
32
34
|
def __init__(
|
|
33
35
|
self,
|
|
34
|
-
filepaths: list[
|
|
36
|
+
filepaths: list[str],
|
|
35
37
|
preprocess: Callable[[Image.Image], torch.Tensor],
|
|
36
38
|
) -> None:
|
|
37
39
|
self.filepaths = filepaths
|
|
@@ -41,8 +43,9 @@ class _ImageFileDataset(Dataset[torch.Tensor]):
|
|
|
41
43
|
return len(self.filepaths)
|
|
42
44
|
|
|
43
45
|
def __getitem__(self, idx: int) -> torch.Tensor:
|
|
44
|
-
|
|
45
|
-
|
|
46
|
+
with fsspec.open(self.filepaths[idx], "rb") as file:
|
|
47
|
+
image = Image.open(file).convert("RGB")
|
|
48
|
+
return self.preprocess(image)
|
|
46
49
|
|
|
47
50
|
|
|
48
51
|
class MobileCLIPEmbeddingGenerator(EmbeddingGenerator):
|
|
@@ -83,7 +86,7 @@ class MobileCLIPEmbeddingGenerator(EmbeddingGenerator):
|
|
|
83
86
|
return EmbeddingModelCreate(
|
|
84
87
|
name=MODEL_NAME,
|
|
85
88
|
embedding_model_hash=self._model_hash,
|
|
86
|
-
embedding_dimension=
|
|
89
|
+
embedding_dimension=EMBEDDING_DIMENSION,
|
|
87
90
|
dataset_id=dataset_id,
|
|
88
91
|
)
|
|
89
92
|
|
|
@@ -103,7 +106,7 @@ class MobileCLIPEmbeddingGenerator(EmbeddingGenerator):
|
|
|
103
106
|
embedding_list: list[float] = embedding.cpu().numpy().flatten().tolist()
|
|
104
107
|
return embedding_list
|
|
105
108
|
|
|
106
|
-
def embed_images(self, filepaths: list[
|
|
109
|
+
def embed_images(self, filepaths: list[str]) -> list[list[float]]:
|
|
107
110
|
"""Embed images with MobileCLIP.
|
|
108
111
|
|
|
109
112
|
Args:
|
|
@@ -136,7 +139,7 @@ class MobileCLIPEmbeddingGenerator(EmbeddingGenerator):
|
|
|
136
139
|
|
|
137
140
|
|
|
138
141
|
def _get_cached_mobileclip_checkpoint() -> Path:
|
|
139
|
-
file_path = Path(tempfile.gettempdir()) / "
|
|
142
|
+
file_path = Path(tempfile.gettempdir()) / f"{MODEL_NAME}.pt"
|
|
140
143
|
file_utils.download_file_if_does_not_exist(
|
|
141
144
|
url=MOBILECLIP_DOWNLOAD_URL,
|
|
142
145
|
local_filename=file_path,
|