maite-datasets 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1 @@
+ """Module for MAITE-compliant computer vision datasets."""
@@ -0,0 +1,254 @@
+ from __future__ import annotations
+
+ __all__ = []
+
+ from abc import abstractmethod
+ from pathlib import Path
+ from typing import Any, Generic, Iterator, Literal, NamedTuple, Sequence, TypeVar, cast
+
+ import numpy as np
+
+ from maite_datasets._fileio import _ensure_exists
+ from maite_datasets._protocols import Array, Transform
+ from maite_datasets._types import (
+     AnnotatedDataset,
+     DatasetMetadata,
+     DatumMetadata,
+     ImageClassificationDataset,
+     ObjectDetectionDataset,
+     ObjectDetectionTarget,
+ )
+
+ _TArray = TypeVar("_TArray", bound=Array)
+ _TTarget = TypeVar("_TTarget")
+ _TRawTarget = TypeVar(
+     "_TRawTarget",
+     Sequence[int],
+     Sequence[str],
+     Sequence[tuple[list[int], list[list[float]]]],
+ )
+ _TAnnotation = TypeVar("_TAnnotation", int, str, tuple[list[int], list[list[float]]])
+
+
+ def _to_datum_metadata(index: int, metadata: dict[str, Any]) -> DatumMetadata:
+     _id = metadata.pop("id", index)
+     return DatumMetadata(id=_id, **metadata)
+
+
+ class DataLocation(NamedTuple):
+     url: str
+     filename: str
+     md5: bool
+     checksum: str
+
+
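Concrete datasets declare their downloadable resources as `DataLocation` entries. A minimal sketch of such a declaration — the URL and checksum below are placeholders, not a real resource:

```python
# Hypothetical resource entry; url and checksum are placeholders.
resource = DataLocation(
    url="https://example.com/datasets/tinyshapes.zip",
    filename="tinyshapes.zip",
    md5=False,  # False means the checksum is validated as SHA-256, not MD5
    checksum="<sha256 hex digest of tinyshapes.zip>",
)
```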
+ class BaseDatasetMixin(Generic[_TArray]):
+     index2label: dict[int, str]
+
+     # Hooks implemented by the backend-specific (NumPy / torch) mixins.
+     def _as_array(self, raw: list[Any]) -> _TArray: ...
+     def _one_hot_encode(self, value: int | list[int]) -> _TArray: ...
+     def _read_file(self, path: str) -> _TArray: ...
+
+
+ class BaseDataset(
+     AnnotatedDataset[tuple[_TArray, _TTarget, DatumMetadata]],
+     Generic[_TArray, _TTarget, _TRawTarget, _TAnnotation],
+ ):
+     """
+     Base class for datasets downloaded from the internet.
+     """
+
+     # Each subclass should override the attributes below.
+     # Each resource tuple must contain:
+     #   'url': str, the URL to download from
+     #   'filename': str, the name of the file once downloaded
+     #   'md5': bool, True if the checksum value is an MD5 digest (otherwise SHA-256)
+     #   'checksum': str, the associated checksum for the downloaded file
+     _resources: list[DataLocation]
+     _resource_index: int = 0
+     index2label: dict[int, str]
+
+     def __init__(
+         self,
+         root: str | Path,
+         image_set: Literal["train", "val", "test", "operational", "base"] = "train",
+         transforms: Transform[_TArray] | Sequence[Transform[_TArray]] | None = None,
+         download: bool = False,
+         verbose: bool = False,
+     ) -> None:
+         self._root: Path = Path(root).absolute()
+         transforms = transforms if transforms is not None else []
+         self.transforms: Sequence[Transform[_TArray]] = (
+             transforms if isinstance(transforms, Sequence) else [transforms]
+         )
+         self.image_set = image_set
+         self._verbose = verbose
+
+         # Internal attributes
+         self._download = download
+         self._filepaths: list[str]
+         self._targets: _TRawTarget
+         self._datum_metadata: dict[str, list[Any]]
+         self._resource: DataLocation = self._resources[self._resource_index]
+         self._label2index = {v: k for k, v in self.index2label.items()}
+
+         self.metadata: DatasetMetadata = DatasetMetadata(
+             id=self._unique_id(),
+             index2label=self.index2label,
+             split=self.image_set,
+         )
+
+         # Load the data
+         self.path: Path = self._get_dataset_dir()
+         self._filepaths, self._targets, self._datum_metadata = self._load_data()
+         self.size: int = len(self._filepaths)
+
+     def __str__(self) -> str:
+         nt = "\n    "
+         title = f"{self.__class__.__name__} Dataset"
+         sep = "-" * len(title)
+         attrs = [
+             f"{k.capitalize()}: {v}"
+             for k, v in self.__dict__.items()
+             if not k.startswith("_")
+         ]
+         return f"{title}\n{sep}{nt}{nt.join(attrs)}"
+
+     @property
+     def label2index(self) -> dict[str, int]:
+         return self._label2index
+
+     def __iter__(self) -> Iterator[tuple[_TArray, _TTarget, DatumMetadata]]:
+         for i in range(len(self)):
+             yield self[i]
+
+     def _get_dataset_dir(self) -> Path:
+         # Use the root directly if it is already named after this dataset class;
+         # otherwise create a designated subfolder named after the class.
+         if self._root.stem.lower() == self.__class__.__name__.lower():
+             dataset_dir = self._root
+         else:
+             dataset_dir = self._root / self.__class__.__name__.lower()
+         dataset_dir.mkdir(parents=True, exist_ok=True)
+         return dataset_dir
+
+     def _unique_id(self) -> str:
+         return f"{self.__class__.__name__}_{self.image_set}"
+
+     def _load_data(self) -> tuple[list[str], _TRawTarget, dict[str, Any]]:
+         """
+         Determine whether the data is already available locally, downloading
+         and/or extracting it first if necessary.
+         """
+         if self._verbose:
+             print(f"Determining if {self._resource.filename} needs to be downloaded.")
+
+         try:
+             result = self._load_data_inner()
+             if self._verbose:
+                 print("No download needed, loaded data successfully.")
+         except FileNotFoundError:
+             _ensure_exists(
+                 *self._resource, self.path, self._root, self._download, self._verbose
+             )
+             result = self._load_data_inner()
+         return result
+
+     @abstractmethod
+     def _load_data_inner(self) -> tuple[list[str], _TRawTarget, dict[str, Any]]: ...
+
+     def _transform(self, image: _TArray) -> _TArray:
+         """Apply the configured transforms to the image before returning it."""
+         for transform in self.transforms:
+             image = transform(image)
+         return image
+
+     def __len__(self) -> int:
+         return self.size
+
+
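Transforms are plain callables applied in order by `_transform` each time a datum is fetched. A hedged usage sketch, with `MyDataset` standing in for any concrete subclass (the name is hypothetical):

```python
import numpy as np
from numpy.typing import NDArray

def scale_to_unit(image: NDArray[np.number]) -> NDArray[np.number]:
    # Example transform: rescale pixel values from [0, 255] to [0, 1].
    return image / 255.0

# A single callable or a sequence of callables is accepted;
# each is applied in order inside _transform.
dataset = MyDataset(root="./data", transforms=scale_to_unit, download=True)
```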
+ class BaseICDataset(
+     BaseDataset[_TArray, _TArray, list[int], int],
+     BaseDatasetMixin[_TArray],
+     ImageClassificationDataset[_TArray],
+ ):
+     """
+     Base class for image classification datasets.
+     """
+
+     def __getitem__(self, index: int) -> tuple[_TArray, _TArray, DatumMetadata]:
+         """
+         Args
+         ----
+         index : int
+             Index of the desired data point
+
+         Returns
+         -------
+         tuple[TArray, TArray, DatumMetadata]
+             Image, target, datum_metadata - where target is the one-hot encoding of the class.
+         """
+         # Get the associated label and one-hot encode it as the score
+         label = self._targets[index]
+         score = self._one_hot_encode(label)
+         # Read the image and apply any transforms
+         img = self._read_file(self._filepaths[index])
+         img = self._transform(img)
+
+         img_metadata = {key: val[index] for key, val in self._datum_metadata.items()}
+
+         return img, score, _to_datum_metadata(index, img_metadata)
+
+
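Each datum comes back as an `(image, target, metadata)` triple, where the target is a one-hot vector. An illustrative access pattern, again assuming a hypothetical concrete subclass:

```python
dataset = MyDataset(root="./data", image_set="train", download=True)

image, target, metadata = dataset[0]
# target is one-hot; argmax recovers the class index for the label map
print(dataset.index2label[int(target.argmax())], metadata["id"])

# __iter__ yields every datum in index order
for image, target, metadata in dataset:
    ...
```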
+ class BaseODDataset(
+     BaseDataset[_TArray, ObjectDetectionTarget[_TArray], _TRawTarget, _TAnnotation],
+     BaseDatasetMixin[_TArray],
+     ObjectDetectionDataset[_TArray],
+ ):
+     """
+     Base class for object detection datasets.
+     """
+
+     _bboxes_per_size: bool = False
+
+     def __getitem__(
+         self, index: int
+     ) -> tuple[_TArray, ObjectDetectionTarget[_TArray], DatumMetadata]:
+         """
+         Args
+         ----
+         index : int
+             Index of the desired data point
+
+         Returns
+         -------
+         tuple[TArray, ObjectDetectionTarget[TArray], DatumMetadata]
+             Image, target, datum_metadata - target.boxes returns boxes in x0, y0, x1, y1 format
+         """
+         # Grab the bounding boxes and labels from the annotations
+         annotation = cast(_TAnnotation, self._targets[index])
+         boxes, labels, additional_metadata = self._read_annotations(annotation)
+         # Read the image and apply any transforms
+         img = self._read_file(self._filepaths[index])
+         img_size = img.shape
+         img = self._transform(img)
+         # Scale normalized bounding boxes up to the image size if necessary
+         if self._bboxes_per_size and boxes:
+             boxes = boxes * np.array(
+                 [[img_size[1], img_size[2], img_size[1], img_size[2]]]
+             )
+         # Create the ObjectDetectionTarget
+         target = ObjectDetectionTarget(
+             self._as_array(boxes), self._as_array(labels), self._one_hot_encode(labels)
+         )
+
+         img_metadata = {key: val[index] for key, val in self._datum_metadata.items()}
+         img_metadata = img_metadata | additional_metadata
+
+         return img, target, _to_datum_metadata(index, img_metadata)
+
+     @abstractmethod
+     def _read_annotations(
+         self, annotation: _TAnnotation
+     ) -> tuple[list[list[float]], list[int], dict[str, Any]]: ...
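A concrete dataset only has to supply `index2label`, `_resources`, and the loading hook(s); the base class handles download, extraction, transforms, and metadata. A minimal sketch of an image-classification subclass — the class name, on-disk layout, and mixin import path here are illustrative assumptions, not part of the package:

```python
from pathlib import Path
from typing import Any

from maite_datasets._base import BaseICDataset, DataLocation
from maite_datasets._mixin._numpy import BaseDatasetNumpyMixin  # assumed import path


class TinyShapes(BaseICDataset, BaseDatasetNumpyMixin):
    """Hypothetical two-class dataset, shown only to illustrate the hooks."""

    index2label = {0: "circle", 1: "square"}
    _resources = [
        DataLocation(
            url="https://example.com/tinyshapes.zip",  # placeholder
            filename="tinyshapes.zip",
            md5=False,
            checksum="<sha256>",  # placeholder
        ),
    ]

    def _load_data_inner(self) -> tuple[list[str], list[int], dict[str, Any]]:
        # Expect one folder per class under self.path; raising FileNotFoundError
        # is the signal for the base class to download and extract the resource.
        filepaths: list[str] = []
        targets: list[int] = []
        for class_index, label in self.index2label.items():
            files = sorted((self.path / label).glob("*.png"))
            filepaths.extend(str(f) for f in files)
            targets.extend(class_index for _ in files)
        if not filepaths:
            raise FileNotFoundError(f"No images found under {self.path}")
        return filepaths, targets, {}
```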
@@ -0,0 +1,174 @@
+ from __future__ import annotations
+
+ __all__ = []
+
+ import hashlib
+ import tarfile
+ import zipfile
+ from pathlib import Path
+
+ import requests
+
+ try:
+     from tqdm.auto import tqdm
+ except ImportError:
+     tqdm = None
+
+ ARCHIVE_ENDINGS = [".zip", ".tar", ".tgz"]
+ COMPRESS_ENDINGS = [".gz", ".bz2"]
+
+
+ def _print(text: str, verbose: bool) -> None:
+     if verbose:
+         print(text)
+
+
+ def _validate_file(
+     fpath: Path | str, file_md5: str, md5: bool = False, chunk_size: int = 65535
+ ) -> bool:
+     hasher = hashlib.md5(usedforsecurity=False) if md5 else hashlib.sha256()
+     with open(fpath, "rb") as fpath_file:
+         while chunk := fpath_file.read(chunk_size):
+             hasher.update(chunk)
+     return hasher.hexdigest() == file_md5
+
+
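`_validate_file` streams the file through `hashlib` in fixed-size chunks, so large archives hash without being read fully into memory. The same pattern can produce the `checksum` value when registering a new resource — a small helper sketch:

```python
import hashlib
from pathlib import Path

def sha256_of(path: Path, chunk_size: int = 65535) -> str:
    # Compute the SHA-256 hex digest of a file, chunk by chunk.
    hasher = hashlib.sha256()
    with open(path, "rb") as f:
        while chunk := f.read(chunk_size):
            hasher.update(chunk)
    return hasher.hexdigest()

# Paste the digest into DataLocation.checksum (with md5=False).
print(sha256_of(Path("tinyshapes.zip")))
```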
+ def _download_dataset(
+     url: str, file_path: Path, timeout: int = 60, verbose: bool = False
+ ) -> None:
+     """Download a single resource from its URL to `file_path`."""
+     error_msg = "URL fetch failure on {}: {} -- {}"
+     try:
+         response = requests.get(url, stream=True, timeout=timeout)
+         response.raise_for_status()
+     except requests.exceptions.HTTPError as e:
+         raise RuntimeError(
+             f"{error_msg.format(url, e.response.status_code, e.response.reason)}"
+         ) from e
+     except requests.exceptions.RequestException as e:
+         raise ValueError(f"{error_msg.format(url, 'Unknown error', str(e))}") from e
+
+     total_size = int(response.headers.get("content-length", 0))
+     block_size = 8192  # 8 KB
+     progress_bar = (
+         None
+         if tqdm is None
+         else tqdm(total=total_size, unit="iB", unit_scale=True, disable=not verbose)
+     )
+
+     with open(file_path, "wb") as f:
+         for chunk in response.iter_content(block_size):
+             f.write(chunk)
+             if progress_bar is not None:
+                 progress_bar.update(len(chunk))
+     if progress_bar is not None:
+         progress_bar.close()
+
+
+ def _extract_zip_archive(file_path: Path, extract_to: Path) -> None:
+     """Extracts the zip file to the given directory, removing the archive afterwards."""
+     try:
+         with zipfile.ZipFile(file_path, "r") as zip_ref:
+             zip_ref.extractall(extract_to)  # noqa: S202
+         file_path.unlink()
+     except zipfile.BadZipFile as e:
+         raise FileNotFoundError(
+             f"{file_path.name} is not a valid zip file."
+         ) from e
+
+
+ def _extract_tar_archive(file_path: Path, extract_to: Path) -> None:
+     """Extracts a tar file (or compressed tar) to the specified directory, removing the archive afterwards."""
+     try:
+         with tarfile.open(file_path, "r:*") as tar_ref:
+             tar_ref.extractall(extract_to)  # noqa: S202
+         file_path.unlink()
+     except tarfile.TarError as e:
+         raise FileNotFoundError(
+             f"{file_path.name} is not a valid tar file."
+         ) from e
+
+
+ def _extract_archive(
+     file_ext: str,
+     file_path: Path,
+     directory: Path,
+     compression: bool = False,
+     verbose: bool = False,
+ ) -> None:
+     """
+     Extract an archive and flatten it if necessary.
+     Recursively extracts nested zip files as well,
+     extracting and flattening all folders to the base directory.
+     """
+     if file_ext != ".zip" or compression:
+         _extract_tar_archive(file_path, directory)
+     else:
+         _extract_zip_archive(file_path, directory)
+         # Look for nested zip files in the extraction directory and extract them recursively.
+         # Does NOT extract in place - extracts everything to directory
+         for child in directory.iterdir():
+             if child.suffix == ".zip":
+                 _print(f"Extracting nested zip: {child} to {directory}", verbose)
+                 _extract_zip_archive(child, directory)
+
+
+ def _ensure_exists(
+     url: str,
+     filename: str,
+     md5: bool,
+     checksum: str,
+     directory: Path,
+     root: Path,
+     download: bool = True,
+     verbose: bool = False,
+ ) -> None:
+     """
+     For each resource, download it if it doesn't exist in the dataset_dir.
+     If the resource is a zip file, extract it (including recursively extracting nested zips).
+     """
+     file_path = directory / str(filename)
+     alternate_path = root / str(filename)
+     _, file_ext = file_path.stem, file_path.suffix
+     compression = False
+     if file_ext in COMPRESS_ENDINGS:
+         file_ext = file_path.suffixes[0]
+         compression = True
+
+     check_path = (
+         alternate_path
+         if alternate_path.exists() and not file_path.exists()
+         else file_path
+     )
+
+     # Download the file if it doesn't exist.
+     if not check_path.exists() and download:
+         _print(f"Downloading {filename} from {url}", verbose)
+         _download_dataset(url, check_path, verbose=verbose)
+
+         if not _validate_file(check_path, checksum, md5):
+             raise Exception(
+                 "File checksum mismatch. Remove current file and retry download."
+             )
+
+         # If the file is a zip, tar or tgz extract it into the designated folder.
+         if file_ext in ARCHIVE_ENDINGS:
+             _print(f"Extracting {filename}...", verbose)
+             _extract_archive(file_ext, check_path, directory, compression, verbose)
+
+     elif not check_path.exists() and not download:
+         raise FileNotFoundError(
+             "Data could not be loaded with the provided root directory, "
+             f"the file path to the file {filename} does not exist, "
+             "and the download parameter is set to False."
+         )
+     else:
+         if not _validate_file(check_path, checksum, md5):
+             raise Exception(
+                 "File checksum mismatch. Remove current file and retry download."
+             )
+         _print(f"{filename} already exists, skipping download.", verbose)
+
+         if file_ext in ARCHIVE_ENDINGS:
+             _print(f"Extracting {filename}...", verbose)
+             _extract_archive(file_ext, check_path, directory, compression, verbose)
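`_ensure_exists` is the fallback that `BaseDataset._load_data` invokes when `_load_data_inner` raises `FileNotFoundError`: the `DataLocation` tuple unpacks into the first four arguments, followed by the dataset directory, root, and flags. A hedged sketch of the equivalent direct call, with placeholder paths and checksum:

```python
from pathlib import Path

from maite_datasets._base import DataLocation
from maite_datasets._fileio import _ensure_exists

resource = DataLocation(
    url="https://example.com/tinyshapes.zip",  # placeholder
    filename="tinyshapes.zip",
    md5=False,
    checksum="<sha256>",  # placeholder
)
# *resource expands to (url, filename, md5, checksum), mirroring the call
# made in BaseDataset._load_data.
_ensure_exists(*resource, Path("./data/tinyshapes"), Path("./data"), download=True, verbose=True)
```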
@@ -0,0 +1,28 @@
+ from __future__ import annotations
+
+ __all__ = []
+
+ from typing import Any
+
+ import numpy as np
+ from numpy.typing import NDArray
+ from PIL import Image
+
+ from maite_datasets._base import BaseDatasetMixin
+
+
+ class BaseDatasetNumpyMixin(BaseDatasetMixin[NDArray[np.number[Any]]]):
+     def _as_array(self, raw: list[Any]) -> NDArray[np.number[Any]]:
+         return np.asarray(raw)
+
+     def _one_hot_encode(self, value: int | list[int]) -> NDArray[np.number[Any]]:
+         if isinstance(value, int):
+             encoded = np.zeros(len(self.index2label))
+             encoded[value] = 1
+         else:
+             encoded = np.zeros((len(value), len(self.index2label)))
+             encoded[np.arange(len(value)), value] = 1
+         return encoded
+
+     def _read_file(self, path: str) -> NDArray[np.number[Any]]:
+         # Read HWC image data and transpose to CHW (assumes a 3-channel image)
+         return np.array(Image.open(path)).transpose(2, 0, 1)
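The two `_one_hot_encode` branches produce different shapes: a single int yields a `(num_classes,)` vector, a list of ints a `(num_labels, num_classes)` matrix. Instantiating the mixin directly just to show the shapes (not something the package itself does):

```python
mixin = BaseDatasetNumpyMixin()
mixin.index2label = {0: "circle", 1: "square", 2: "triangle"}

mixin._one_hot_encode(1)       # array([0., 1., 0.])
mixin._one_hot_encode([2, 0])  # array([[0., 0., 1.],
                               #        [1., 0., 0.]])
```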
@@ -0,0 +1,28 @@
+ from __future__ import annotations
+
+ __all__ = []
+
+ from typing import Any
+
+ import numpy as np
+ import torch
+ from PIL import Image
+
+ from maite_datasets._base import BaseDatasetMixin
+
+
+ class BaseDatasetTorchMixin(BaseDatasetMixin[torch.Tensor]):
+     def _as_array(self, raw: list[Any]) -> torch.Tensor:
+         return torch.as_tensor(raw)
+
+     def _one_hot_encode(self, value: int | list[int]) -> torch.Tensor:
+         if isinstance(value, int):
+             encoded = torch.zeros(len(self.index2label))
+             encoded[value] = 1
+         else:
+             encoded = torch.zeros((len(value), len(self.index2label)))
+             encoded[torch.arange(len(value)), value] = 1
+         return encoded
+
+     def _read_file(self, path: str) -> torch.Tensor:
+         # Read HWC image data and transpose to CHW (assumes a 3-channel image)
+         return torch.as_tensor(np.array(Image.open(path)).transpose(2, 0, 1))
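The NumPy and torch mixins are interchangeable backends: a dataset picks its array type by which mixin it inherits alongside `BaseICDataset` or `BaseODDataset`, while the loading hooks stay identical. A sketch of a torch-backed counterpart to the earlier hypothetical `TinyShapes`:

```python
import torch

from maite_datasets._base import BaseICDataset

class TinyShapesTorch(BaseICDataset[torch.Tensor], BaseDatasetTorchMixin):
    # Same label map and resource entries as the NumPy sketch above;
    # only the mixin (and therefore the array backend) changes.
    index2label = {0: "circle", 1: "square"}
    _resources = [...]               # same DataLocation entries as before
    def _load_data_inner(self): ...  # same hook as the NumPy sketch
```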