dataeval 0.86.0__py3-none-any.whl → 0.86.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (66)
  1. dataeval/__init__.py +1 -1
  2. dataeval/_log.py +1 -1
  3. dataeval/config.py +21 -4
  4. dataeval/data/_embeddings.py +2 -2
  5. dataeval/data/_images.py +2 -3
  6. dataeval/data/_metadata.py +188 -178
  7. dataeval/data/_selection.py +1 -2
  8. dataeval/data/_split.py +4 -5
  9. dataeval/data/_targets.py +17 -13
  10. dataeval/data/selections/_classfilter.py +2 -5
  11. dataeval/data/selections/_prioritize.py +6 -9
  12. dataeval/data/selections/_shuffle.py +3 -1
  13. dataeval/detectors/drift/_base.py +4 -5
  14. dataeval/detectors/drift/_mmd.py +3 -6
  15. dataeval/detectors/drift/_nml/_base.py +4 -2
  16. dataeval/detectors/drift/_nml/_chunk.py +11 -19
  17. dataeval/detectors/drift/_nml/_domainclassifier.py +8 -19
  18. dataeval/detectors/drift/_nml/_result.py +8 -9
  19. dataeval/detectors/drift/_nml/_thresholds.py +66 -77
  20. dataeval/detectors/linters/outliers.py +7 -7
  21. dataeval/metadata/_distance.py +10 -7
  22. dataeval/metadata/_ood.py +11 -103
  23. dataeval/metrics/bias/_balance.py +23 -33
  24. dataeval/metrics/bias/_diversity.py +16 -14
  25. dataeval/metrics/bias/_parity.py +18 -18
  26. dataeval/metrics/estimators/_divergence.py +2 -4
  27. dataeval/metrics/stats/_base.py +103 -42
  28. dataeval/metrics/stats/_boxratiostats.py +21 -19
  29. dataeval/metrics/stats/_dimensionstats.py +14 -10
  30. dataeval/metrics/stats/_hashstats.py +1 -1
  31. dataeval/metrics/stats/_pixelstats.py +6 -6
  32. dataeval/metrics/stats/_visualstats.py +3 -3
  33. dataeval/outputs/_base.py +22 -7
  34. dataeval/outputs/_bias.py +24 -70
  35. dataeval/outputs/_drift.py +1 -9
  36. dataeval/outputs/_linters.py +11 -11
  37. dataeval/outputs/_stats.py +82 -23
  38. dataeval/outputs/_workflows.py +2 -2
  39. dataeval/utils/_array.py +6 -9
  40. dataeval/utils/_bin.py +1 -2
  41. dataeval/utils/_clusterer.py +7 -4
  42. dataeval/utils/_fast_mst.py +27 -13
  43. dataeval/utils/_image.py +65 -11
  44. dataeval/utils/_mst.py +1 -3
  45. dataeval/utils/_plot.py +15 -10
  46. dataeval/utils/data/_dataset.py +54 -28
  47. dataeval/utils/data/metadata.py +104 -82
  48. dataeval/utils/datasets/__init__.py +2 -0
  49. dataeval/utils/datasets/_antiuav.py +189 -0
  50. dataeval/utils/datasets/_base.py +11 -8
  51. dataeval/utils/datasets/_cifar10.py +104 -45
  52. dataeval/utils/datasets/_fileio.py +21 -47
  53. dataeval/utils/datasets/_milco.py +22 -12
  54. dataeval/utils/datasets/_mixin.py +2 -4
  55. dataeval/utils/datasets/_mnist.py +3 -4
  56. dataeval/utils/datasets/_ships.py +14 -7
  57. dataeval/utils/datasets/_voc.py +229 -42
  58. dataeval/utils/torch/models.py +5 -10
  59. dataeval/utils/torch/trainer.py +3 -3
  60. dataeval/workflows/sufficiency.py +2 -2
  61. {dataeval-0.86.0.dist-info → dataeval-0.86.2.dist-info}/METADATA +2 -1
  62. dataeval-0.86.2.dist-info/RECORD +114 -0
  63. dataeval/detectors/ood/vae.py +0 -74
  64. dataeval-0.86.0.dist-info/RECORD +0 -114
  65. {dataeval-0.86.0.dist-info → dataeval-0.86.2.dist-info}/LICENSE.txt +0 -0
  66. {dataeval-0.86.0.dist-info → dataeval-0.86.2.dist-info}/WHEEL +0 -0
dataeval/utils/datasets/_antiuav.py (new file)
@@ -0,0 +1,189 @@
+from __future__ import annotations
+
+__all__ = []
+
+from pathlib import Path
+from typing import TYPE_CHECKING, Any, Literal, Sequence
+
+from defusedxml.ElementTree import parse
+from numpy.typing import NDArray
+
+from dataeval.utils.datasets._base import BaseODDataset, DataLocation
+from dataeval.utils.datasets._mixin import BaseDatasetNumpyMixin
+
+if TYPE_CHECKING:
+    from dataeval.typing import Transform
+
+
+class AntiUAVDetection(BaseODDataset[NDArray[Any]], BaseDatasetNumpyMixin):
+    """
+    A UAV detection dataset focused on detecting UAVs in natural images against large variation in backgrounds.
+
+    The dataset comes from the paper
+    `Vision-based Anti-UAV Detection and Tracking <https://ieeexplore.ieee.org/document/9785379>`_
+    by Jie Zhao et. al. (2022).
+
+    The dataset is approximately 1.3 GB and can be found `here <https://github.com/wangdongdut/DUT-Anti-UAV>`_.
+    Images are collected against a variety of different backgrounds with a variety in the number and type of UAV.
+    Ground truth labels are provided for the train, validation and test set.
+    There are 35 different types of drones along with a variety in lighting conditions and weather conditions.
+
+    There are 10,000 images: 5200 images in the training set, 2200 images in the validation set,
+    and 2600 images in the test set.
+    The dataset only has a single UAV class with the focus being on identifying object location in the image.
+    Ground-truth bounding boxes are provided in (x0, y0, x1, y1) format.
+    The images come in a variety of sizes from 3744 x 5616 to 160 x 240.
+
+    Parameters
+    ----------
+    root : str or pathlib.Path
+        Root directory where the data should be downloaded to or
+        the ``antiuavdetection`` folder of the already downloaded data.
+    image_set: "train", "val", "test", or "base", default "train"
+        If "base", then the full dataset is selected (train, val and test).
+    transforms : Transform, Sequence[Transform] or None, default None
+        Transform(s) to apply to the data.
+    download : bool, default False
+        If True, downloads the dataset from the internet and puts it in root directory.
+        Class checks to see if data is already downloaded to ensure it does not create a duplicate download.
+    verbose : bool, default False
+        If True, outputs print statements.
+
+    Attributes
+    ----------
+    path : pathlib.Path
+        Location of the folder containing the data.
+    image_set : "train", "val", "test", or "base"
+        The selected image set from the dataset.
+    index2label : dict[int, str]
+        Dictionary which translates from class integers to the associated class strings.
+    label2index : dict[str, int]
+        Dictionary which translates from class strings to the associated class integers.
+    metadata : DatasetMetadata
+        Typed dictionary containing dataset metadata, such as `id` which returns the dataset class name.
+    transforms : Sequence[Transform]
+        The transforms to be applied to the data.
+    size : int
+        The size of the dataset.
+
+    Note
+    ----
+    Data License: `Apache 2.0 <https://www.apache.org/licenses/LICENSE-2.0.txt>`_
+    """
+
+    # Need to run the sha256 on the files and then store that
+    _resources = [
+        DataLocation(
+            url="https://drive.usercontent.google.com/download?id=1RVsSGPUKTdmoyoPTBTWwroyulLek1eTj&export=download&authuser=0&confirm=t&uuid=6bca4f94-a242-4bc2-9663-fb03cd94ef2c&at=APcmpox0--NroQ_3bqeTFaJxP7Pw%3A1746552902927",
+            filename="train.zip",
+            md5=False,
+            checksum="14f927290556df60e23cedfa80dffc10dc21e4a3b6843e150cfc49644376eece",
+        ),
+        DataLocation(
+            url="https://drive.usercontent.google.com/download?id=1333uEQfGuqTKslRkkeLSCxylh6AQ0X6n&export=download&authuser=0&confirm=t&uuid=c2ad2f01-aca8-4a85-96bb-b8ef6e40feea&at=APcmpozY-8bhk3nZSFaYbE8rq1Fi%3A1746551543297",
+            filename="val.zip",
+            md5=False,
+            checksum="238be0ceb3e7c5be6711ee3247e49df2750d52f91f54f5366c68bebac112ebf8",
+        ),
+        DataLocation(
+            url="https://drive.usercontent.google.com/download?id=1L1zeW1EMDLlXHClSDcCjl3rs_A6sVai0&export=download&authuser=0&confirm=t&uuid=5a1d7650-d8cd-4461-8354-7daf7292f06c&at=APcmpozLQC1CuP-n5_UX2JnP53Zo%3A1746551676177",
+            filename="test.zip",
+            md5=False,
+            checksum="a671989a01cff98c684aeb084e59b86f4152c50499d86152eb970a9fc7fb1cbe",
+        ),
+    ]
+
+    index2label: dict[int, str] = {
+        0: "unknown",
+        1: "UAV",
+    }
+
+    def __init__(
+        self,
+        root: str | Path,
+        image_set: Literal["train", "val", "test", "base"] = "train",
+        transforms: Transform[NDArray[Any]] | Sequence[Transform[NDArray[Any]]] | None = None,
+        download: bool = False,
+        verbose: bool = False,
+    ) -> None:
+        super().__init__(
+            root,
+            image_set,
+            transforms,
+            download,
+            verbose,
+        )
+
+    def _load_data(self) -> tuple[list[str], list[str], dict[str, list[Any]]]:
+        filepaths: list[str] = []
+        targets: list[str] = []
+        datum_metadata: dict[str, list[Any]] = {}
+
+        # If base, load all resources
+        if self.image_set == "base":
+            metadata_list: list[dict[str, Any]] = []
+
+            for resource in self._resources:
+                self._resource = resource
+                resource_filepaths, resource_targets, resource_metadata = super()._load_data()
+                filepaths.extend(resource_filepaths)
+                targets.extend(resource_targets)
+                metadata_list.append(resource_metadata)
+
+            # Combine metadata
+            for data_dict in metadata_list:
+                for key, val in data_dict.items():
+                    str_key = str(key) # Ensure key is string
+                    if str_key not in datum_metadata:
+                        datum_metadata[str_key] = []
+                    datum_metadata[str_key].extend(val)
+
+        else:
+            # Grab only the desired data
+            for resource in self._resources:
+                if self.image_set in resource.filename:
+                    self._resource = resource
+                    resource_filepaths, resource_targets, resource_metadata = super()._load_data()
+                    filepaths.extend(resource_filepaths)
+                    targets.extend(resource_targets)
+                    datum_metadata.update(resource_metadata)
+
+        return filepaths, targets, datum_metadata
+
+    def _load_data_inner(self) -> tuple[list[str], list[str], dict[str, Any]]:
+        resource_name = self._resource.filename[:-4]
+        base_dir = self.path / resource_name
+        data_folder = sorted((base_dir / "img").glob("*.jpg"))
+        if not data_folder:
+            raise FileNotFoundError
+
+        file_data = {"image_id": [f"{resource_name}_{entry.name}" for entry in data_folder]}
+        data = [str(entry) for entry in data_folder]
+        annotations = sorted(str(entry) for entry in (base_dir / "xml").glob("*.xml"))
+
+        return data, annotations, file_data
+
+    def _read_annotations(self, annotation: str) -> tuple[list[list[float]], list[int], dict[str, Any]]:
+        """Function for extracting the info for the label and boxes"""
+        boxes: list[list[float]] = []
+        labels = []
+        root = parse(annotation).getroot()
+        if root is None:
+            raise ValueError(f"Unable to parse {annotation}")
+        additional_meta: dict[str, Any] = {
+            "image_width": int(root.findtext("size/width", default="-1")),
+            "image_height": int(root.findtext("size/height", default="-1")),
+            "image_depth": int(root.findtext("size/depth", default="-1")),
+        }
+        for obj in root.findall("object"):
+            labels.append(1 if obj.findtext("name", default="") == "UAV" else 0)
+            boxes.append(
+                [
+                    float(obj.findtext("bndbox/xmin", default="0")),
+                    float(obj.findtext("bndbox/ymin", default="0")),
+                    float(obj.findtext("bndbox/xmax", default="0")),
+                    float(obj.findtext("bndbox/ymax", default="0")),
+                ]
+            )
+
+        return boxes, labels, additional_meta
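
For orientation, here is a minimal usage sketch of the new dataset class. It assumes AntiUAVDetection is re-exported from dataeval.utils.datasets (the two-line addition to that package's __init__.py in the file list suggests it is), that the Google Drive resources are reachable, and that ObjectDetectionTarget exposes a boxes attribute; the root path is hypothetical.

    from dataeval.utils.datasets import AntiUAVDetection

    # Download (or reuse) the validation split and inspect one sample.
    dataset = AntiUAVDetection(root="./data", image_set="val", download=True, verbose=True)
    image, target, metadata = dataset[0]

    print(len(dataset))   # number of images in the "val" split
    print(image.shape)    # CHW NumPy array; sizes vary per image
    print(target.boxes)   # (x0, y0, x1, y1) ground-truth boxes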
dataeval/utils/datasets/_base.py
@@ -6,6 +6,8 @@ from abc import abstractmethod
 from pathlib import Path
 from typing import TYPE_CHECKING, Any, Generic, Iterator, Literal, NamedTuple, Sequence, TypeVar

+import numpy as np
+
 from dataeval.utils.datasets._fileio import _ensure_exists
 from dataeval.utils.datasets._mixin import BaseDatasetMixin
 from dataeval.utils.datasets._types import (
@@ -101,11 +103,7 @@ class BaseDataset(AnnotatedDataset[tuple[_TArray, _TTarget, dict[str, Any]]], Ge

     def _get_dataset_dir(self) -> Path:
         # Create a designated folder for this dataset (named after the class)
-        if self._root.stem in [
-            self.__class__.__name__.lower(),
-            self.__class__.__name__.upper(),
-            self.__class__.__name__,
-        ]:
+        if self._root.stem.lower() == self.__class__.__name__.lower():
             dataset_dir: Path = self._root
         else:
             dataset_dir: Path = self._root / self.__class__.__name__.lower()
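
The simplified check above treats the dataset folder name case-insensitively rather than matching only the exact, all-lowercase, or all-uppercase class name. A quick illustration of the new condition (path hypothetical):

    from pathlib import Path

    root = Path("./data/Cifar10")
    # Mixed-case folder names now count as the dataset directory too.
    print(root.stem.lower() == "CIFAR10".lower())  # True, so root itself is used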
@@ -114,8 +112,7 @@ class BaseDataset(AnnotatedDataset[tuple[_TArray, _TTarget, dict[str, Any]]], Ge
         return dataset_dir

     def _unique_id(self) -> str:
-        unique_id = f"{self.__class__.__name__}_{self.image_set}"
-        return unique_id
+        return f"{self.__class__.__name__}_{self.image_set}"

     def _load_data(self) -> tuple[list[str], _TRawTarget, dict[str, Any]]:
         """
@@ -188,6 +185,8 @@ class BaseODDataset(
     Base class for object detection datasets.
     """

+    _bboxes_per_size: bool = False
+
     def __getitem__(self, index: int) -> tuple[_TArray, ObjectDetectionTarget[_TArray], dict[str, Any]]:
         """
         Args
@@ -204,8 +203,12 @@ class BaseODDataset(
         boxes, labels, additional_metadata = self._read_annotations(self._targets[index])
         # Get the image
         img = self._read_file(self._filepaths[index])
+        img_size = img.shape
         img = self._transform(img)
-
+        # Adjust labels if necessary
+        if self._bboxes_per_size and boxes:
+            boxes = boxes * np.array([[img_size[1], img_size[2], img_size[1], img_size[2]]])
+        # Create the Object Detection Target
         target = ObjectDetectionTarget(self._as_array(boxes), self._as_array(labels), self._one_hot_encode(labels))

         img_metadata = {key: val[index] for key, val in self._datum_metadata.items()}
dataeval/utils/datasets/_cifar10.py
@@ -7,7 +7,6 @@ from typing import TYPE_CHECKING, Any, Literal, Sequence, TypeVar

 import numpy as np
 from numpy.typing import NDArray
-from PIL import Image

 from dataeval.utils.datasets._base import BaseICDataset, DataLocation
 from dataeval.utils.datasets._mixin import BaseDatasetNumpyMixin
@@ -26,7 +25,7 @@ class CIFAR10(BaseICDataset[NDArray[Any]], BaseDatasetNumpyMixin):
     Parameters
     ----------
     root : str or pathlib.Path
-        Root directory of dataset where the ``mnist`` folder exists.
+        Root directory where the data should be downloaded to or the ``cifar10`` folder of the already downloaded data.
     image_set : "train", "test" or "base", default "train"
         If "base", returns all of the data to allow the user to create their own splits.
     transforms : Transform, Sequence[Transform] or None, default None
@@ -93,50 +92,110 @@ class CIFAR10(BaseICDataset[NDArray[Any]], BaseDatasetNumpyMixin):
             verbose,
         )

+    def _load_bin_data(self, data_folder: list[Path]) -> tuple[list[str], list[int], dict[str, Any]]:
+        batch_nums = np.zeros(60000, dtype=np.uint8)
+        all_labels = np.zeros(60000, dtype=np.uint8)
+        all_images = np.zeros((60000, 3, 32, 32), dtype=np.uint8)
+        # Process each batch file, skipping .meta and .html files
+        for batch_file in data_folder:
+            # Get batch parameters
+            batch_type = "test" if "test" in batch_file.stem else "train"
+            batch_num = 5 if batch_type == "test" else int(batch_file.stem.split("_")[-1]) - 1
+
+            # Load data
+            batch_images, batch_labels = self._unpack_batch_files(batch_file)
+
+            # Stack data
+            num_images = batch_images.shape[0]
+            batch_start = batch_num * num_images
+            all_images[batch_start : batch_start + num_images] = batch_images
+            all_labels[batch_start : batch_start + num_images] = batch_labels
+            batch_nums[batch_start : batch_start + num_images] = batch_num
+
+        # Save data
+        self._loaded_data = all_images
+        np.savez(self.path / "cifar10", images=self._loaded_data, labels=all_labels, batches=batch_nums)
+
+        # Select data
+        image_list = np.arange(all_labels.shape[0]).astype(str)
+        if self.image_set == "train":
+            return (
+                image_list[np.nonzero(batch_nums != 5)[0]].tolist(),
+                all_labels[batch_nums != 5].tolist(),
+                {"batch_num": batch_nums[batch_nums != 5].tolist()},
+            )
+        if self.image_set == "test":
+            return (
+                image_list[np.nonzero(batch_nums == 5)[0]].tolist(),
+                all_labels[batch_nums == 5].tolist(),
+                {"batch_num": batch_nums[batch_nums == 5].tolist()},
+            )
+        return image_list.tolist(), all_labels.tolist(), {"batch_num": batch_nums.tolist()}
+
     def _load_data_inner(self) -> tuple[list[str], list[int], dict[str, Any]]:
         """Function to load in the file paths for the data and labels and retrieve metadata"""
-        file_meta = {"batch_num": []}
-        raw_data = []
-        labels = []
-        data_folder = self.path / "cifar-10-batches-bin"
-        save_folder = self.path / "images"
-        image_sets: dict[str, list[str]] = {"base": [], "train": [], "test": []}
-
-        # Process each batch file, skipping .meta and .html files
-        for entry in data_folder.iterdir():
-            if entry.suffix == ".bin":
-                batch_data, batch_labels = self._unpack_batch_files(entry)
-                raw_data.append(batch_data)
-                group = "train" if "test" not in entry.stem else "test"
-                name_split = entry.stem.split("_")
-                batch_num = int(name_split[-1]) - 1 if group == "train" else 5
-                file_names = [
-                    str(save_folder / f"{i + 10000 * batch_num:05d}_{self.index2label[label]}.png")
-                    for i, label in enumerate(batch_labels)
-                ]
-                image_sets["base"].extend(file_names)
-                image_sets[group].extend(file_names)
-
-                if self.image_set in (group, "base"):
-                    labels.extend(batch_labels)
-                    file_meta["batch_num"].extend([batch_num] * len(labels))
-
-        # Stack and reshape images
-        images = np.vstack(raw_data).reshape(-1, 3, 32, 32)
-
-        # Save the raw data into images if not already there
-        if not save_folder.exists():
-            save_folder.mkdir(exist_ok=True)
-            for i, file in enumerate(image_sets["base"]):
-                Image.fromarray(images[i].transpose(1, 2, 0).astype(np.uint8)).save(file)
-
-        return image_sets[self.image_set], labels, file_meta
-
-    def _unpack_batch_files(self, file_path: Path) -> tuple[NDArray[Any], list[int]]:
+        data_file = self.path / "cifar10.npz"
+        if not data_file.exists():
+            data_folder = sorted((self.path / "cifar-10-batches-bin").glob("*.bin"))
+            if not data_folder:
+                raise FileNotFoundError
+            return self._load_bin_data(data_folder)
+
+        # Load data
+        data = np.load(data_file)
+        self._loaded_data = data["images"]
+        all_labels = data["labels"]
+        batch_nums = data["batches"]
+
+        # Select data
+        image_list = np.arange(all_labels.shape[0]).astype(str)
+        if self.image_set == "train":
+            return (
+                image_list[np.nonzero(batch_nums != 5)[0]].tolist(),
+                all_labels[batch_nums != 5].tolist(),
+                {"batch_num": batch_nums[batch_nums != 5].tolist()},
+            )
+        if self.image_set == "test":
+            return (
+                image_list[np.nonzero(batch_nums == 5)[0]].tolist(),
+                all_labels[batch_nums == 5].tolist(),
+                {"batch_num": batch_nums[batch_nums == 5].tolist()},
+            )
+        return image_list.tolist(), all_labels.tolist(), {"batch_num": batch_nums.tolist()}
+
+    def _unpack_batch_files(self, file_path: Path) -> tuple[NDArray[np.uint8], NDArray[np.uint8]]:
         # Load pickle data with latin1 encoding
         with file_path.open("rb") as f:
-            buffer = np.frombuffer(f.read(), "B")
-            labels = buffer[::3073]
-            pixels = np.delete(buffer, np.arange(0, buffer.size, 3073))
-            images = pixels.reshape(-1, 3072)
-            return images, labels.tolist()
+            buffer = np.frombuffer(f.read(), dtype=np.uint8)
+            # Each entry is 1 byte for label + 3072 bytes for image (3*32*32)
+            entry_size = 1 + 3072
+            num_entries = buffer.size // entry_size
+            # Extract labels (first byte of each entry)
+            labels = buffer[::entry_size]
+
+            # Extract image data and reshape to (N, 3, 32, 32)
+            images = np.zeros((num_entries, 3, 32, 32), dtype=np.uint8)
+            for i in range(num_entries):
+                # Skip the label byte and get image data for this entry
+                start_idx = i * entry_size + 1 # +1 to skip label
+                img_flat = buffer[start_idx : start_idx + 3072]
+
+                # The CIFAR format stores channels in blocks (all R, then all G, then all B)
+                # Each channel block is 1024 bytes (32x32)
+                red_channel = img_flat[0:1024].reshape(32, 32)
+                green_channel = img_flat[1024:2048].reshape(32, 32)
+                blue_channel = img_flat[2048:3072].reshape(32, 32)
+
+                # Stack the channels in the proper C×H×W format
+                images[i, 0] = red_channel # Red channel
+                images[i, 1] = green_channel # Green channel
+                images[i, 2] = blue_channel # Blue channel
+            return images, labels
+
+    def _read_file(self, path: str) -> NDArray[Any]:
+        """
+        Function to grab the correct image from the loaded data.
+        Overwrite of the base `_read_file` because data is an all or nothing load.
+        """
+        index = int(path)
+        return self._loaded_data[index]
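
The loader above parses the standard CIFAR-10 binary batch layout: each record is 1 label byte followed by 3072 image bytes stored channel-planar (1024 bytes of R, then G, then B). A standalone, vectorized sketch of the same decoding (file path hypothetical):

    import numpy as np

    def unpack_cifar10_bin(path: str) -> tuple[np.ndarray, np.ndarray]:
        # Each 3073-byte record: 1 label byte + 3072 channel-planar image bytes.
        records = np.fromfile(path, dtype=np.uint8).reshape(-1, 3073)
        labels = records[:, 0]
        images = records[:, 1:].reshape(-1, 3, 32, 32)  # planar R/G/B maps directly to CHW
        return images, labels

    images, labels = unpack_cifar10_bin("cifar-10-batches-bin/data_batch_1.bin")
    print(images.shape, labels[:10])  # (10000, 3, 32, 32) and the first ten labels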
dataeval/utils/datasets/_fileio.py
@@ -3,7 +3,6 @@ from __future__ import annotations
 __all__ = []

 import hashlib
-import shutil
 import tarfile
 import zipfile
 from pathlib import Path
@@ -15,7 +14,12 @@ ARCHIVE_ENDINGS = [".zip", ".tar", ".tgz"]
 COMPRESS_ENDINGS = [".gz", ".bz2"]


-def _validate_file(fpath, file_md5, md5: bool = False, chunk_size=65535) -> bool:
+def _print(text: str, verbose: bool) -> None:
+    if verbose:
+        print(text)
+
+
+def _validate_file(fpath: Path | str, file_md5: str, md5: bool = False, chunk_size: int = 65535) -> bool:
     hasher = hashlib.md5(usedforsecurity=False) if md5 else hashlib.sha256()
     with open(fpath, "rb") as fpath_file:
         while chunk := fpath_file.read(chunk_size):
@@ -23,7 +27,7 @@ def _validate_file(fpath, file_md5, md5: bool = False, chunk_size=65535) -> bool
     return hasher.hexdigest() == file_md5


-def _download_dataset(url: str, file_path: Path, timeout: int = 60) -> None:
+def _download_dataset(url: str, file_path: Path, timeout: int = 60, verbose: bool = False) -> None:
    """Download a single resource from its URL to the `data_folder`."""
     error_msg = "URL fetch failure on {}: {} -- {}"
     try:
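
_validate_file streams the file through a hash in fixed-size chunks and compares the hex digest against the checksum recorded in the corresponding DataLocation. An equivalent standalone check, using the train.zip SHA-256 from _antiuav.py (the local path is hypothetical):

    import hashlib
    from pathlib import Path

    def sha256_of(fpath: Path, chunk_size: int = 65535) -> str:
        # Stream in chunks so multi-gigabyte archives never need to fit in memory.
        hasher = hashlib.sha256()
        with fpath.open("rb") as f:
            while chunk := f.read(chunk_size):
                hasher.update(chunk)
        return hasher.hexdigest()

    expected = "14f927290556df60e23cedfa80dffc10dc21e4a3b6843e150cfc49644376eece"
    print(sha256_of(Path("./data/antiuavdetection/train.zip")) == expected)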
@@ -36,7 +40,7 @@ def _download_dataset(url: str, file_path: Path, timeout: int = 60) -> None:

     total_size = int(response.headers.get("content-length", 0))
     block_size = 8192 # 8 KB
-    progress_bar = tqdm(total=total_size, unit="iB", unit_scale=True)
+    progress_bar = tqdm(total=total_size, unit="iB", unit_scale=True, disable=not verbose)

     with open(file_path, "wb") as f:
         for chunk in response.iter_content(block_size):
@@ -49,7 +53,7 @@ def _extract_zip_archive(file_path: Path, extract_to: Path) -> None:
     """Extracts the zip file to the given directory."""
     try:
         with zipfile.ZipFile(file_path, "r") as zip_ref:
-            zip_ref.extractall(extract_to)
+            zip_ref.extractall(extract_to) # noqa: S202
         file_path.unlink()
     except zipfile.BadZipFile:
         raise FileNotFoundError(f"{file_path.name} is not a valid zip file, skipping extraction.")
@@ -59,36 +63,15 @@ def _extract_tar_archive(file_path: Path, extract_to: Path) -> None:
     """Extracts a tar file (or compressed tar) to the specified directory."""
     try:
         with tarfile.open(file_path, "r:*") as tar_ref:
-            tar_ref.extractall(extract_to)
+            tar_ref.extractall(extract_to) # noqa: S202
         file_path.unlink()
     except tarfile.TarError:
         raise FileNotFoundError(f"{file_path.name} is not a valid tar file, skipping extraction.")


-def _flatten_extraction(base_directory: Path, verbose: bool = False) -> None:
-    """
-    If the extracted folder contains only directories (and no files),
-    move all its subfolders to the dataset_dir and remove the now-empty folder.
-    """
-    for child in base_directory.iterdir():
-        if child.is_dir():
-            inner_list = list(child.iterdir())
-            if all(subchild.is_dir() for subchild in inner_list):
-                for subchild in child.iterdir():
-                    if verbose:
-                        print(f"Moving {subchild.stem} to {base_directory}")
-                    shutil.move(subchild, base_directory)
-
-                if verbose:
-                    print(f"Removing empty folder {child.stem}")
-                child.rmdir()
-
-                # Checking for additional placeholder folders
-                if len(inner_list) == 1:
-                    _flatten_extraction(base_directory, verbose)
-
-
-def _archive_extraction(file_ext, file_path, directory, compression: bool = False, verbose: bool = False):
+def _extract_archive(
+    file_ext: str, file_path: Path, directory: Path, compression: bool = False, verbose: bool = False
+) -> None:
     """
     Single function to extract and then flatten if necessary.
     Recursively extracts nested zip files as well.
@@ -102,14 +85,9 @@ def _archive_extraction(file_ext, file_path, directory, compression: bool = Fals
     # Does NOT extract in place - extracts everything to directory
     for child in directory.iterdir():
         if child.suffix == ".zip":
-            if verbose:
-                print(f"Extracting nested zip: {child} to {directory}")
+            _print(f"Extracting nested zip: {child} to {directory}", verbose)
             _extract_zip_archive(child, directory)

-    # Determine if there are nested folders and remove them
-    # Helps ensure there that data is at most one folder below main directory
-    _flatten_extraction(directory, verbose)
-

 def _ensure_exists(
     url: str,
@@ -137,18 +115,16 @@ def _ensure_exists(

     # Download file if it doesn't exist.
     if not check_path.exists() and download:
-        if verbose:
-            print(f"Downloading {filename} from {url}")
-        _download_dataset(url, check_path)
+        _print(f"Downloading {filename} from {url}", verbose)
+        _download_dataset(url, check_path, verbose=verbose)

         if not _validate_file(check_path, checksum, md5):
             raise Exception("File checksum mismatch. Remove current file and retry download.")

         # If the file is a zip, tar or tgz extract it into the designated folder.
         if file_ext in ARCHIVE_ENDINGS:
-            if verbose:
-                print(f"Extracting {filename}...")
-            _archive_extraction(file_ext, check_path, directory, compression, verbose)
+            _print(f"Extracting {filename}...", verbose)
+            _extract_archive(file_ext, check_path, directory, compression, verbose)

     elif not check_path.exists() and not download:
         raise FileNotFoundError(
@@ -159,10 +135,8 @@ def _ensure_exists(
     else:
         if not _validate_file(check_path, checksum, md5):
             raise Exception("File checksum mismatch. Remove current file and retry download.")
-        if verbose:
-            print(f"{filename} already exists, skipping download.")
+        _print(f"{filename} already exists, skipping download.", verbose)

         if file_ext in ARCHIVE_ENDINGS:
-            if verbose:
-                print(f"Extracting {filename}...")
-            _archive_extraction(file_ext, check_path, directory, compression, verbose)
+            _print(f"Extracting {filename}...", verbose)
+            _extract_archive(file_ext, check_path, directory, compression, verbose)
dataeval/utils/datasets/_milco.py
@@ -38,7 +38,7 @@ class MILCO(BaseODDataset[NDArray[Any]], BaseDatasetNumpyMixin):
     Parameters
     ----------
     root : str or pathlib.Path
-        Root directory of dataset where the ``milco`` folder exists.
+        Root directory where the data should be downloaded to or the ``milco`` folder of the already downloaded data.
     image_set: "train", "operational", or "base", default "train"
         If "train", then the images from 2015, 2017 and 2021 are selected,
         resulting in 315 MILCO objects and 177 NOMBO objects.
@@ -128,6 +128,7 @@ class MILCO(BaseODDataset[NDArray[Any]], BaseDatasetNumpyMixin):
             download,
             verbose,
         )
+        self._bboxes_per_size = True

     def _load_data(self) -> tuple[list[str], list[str], dict[str, list[Any]]]:
         filepaths: list[str] = []
@@ -160,15 +161,17 @@ class MILCO(BaseODDataset[NDArray[Any]], BaseDatasetNumpyMixin):

     def _load_data_inner(self) -> tuple[list[str], list[str], dict[str, Any]]:
         file_data = {"year": [], "image_id": [], "data_path": [], "label_path": []}
-        data_folder = self.path / self._resource.filename[:-4]
-        for entry in data_folder.iterdir():
-            if entry.is_file() and entry.suffix == ".jpg":
-                # Remove file extension and split by "_"
-                parts = entry.stem.split("_")
-                file_data["image_id"].append(parts[0])
-                file_data["year"].append(parts[1])
-                file_data["data_path"].append(str(entry))
-                file_data["label_path"].append(str(entry.parent / entry.stem) + ".txt")
+        data_folder = sorted((self.path / self._resource.filename[:-4]).glob("*.jpg"))
+        if not data_folder:
+            raise FileNotFoundError
+
+        for entry in data_folder:
+            # Remove file extension and split by "_"
+            parts = entry.stem.split("_")
+            file_data["image_id"].append(parts[0])
+            file_data["year"].append(parts[1])
+            file_data["data_path"].append(str(entry))
+            file_data["label_path"].append(str(entry.parent / entry.stem) + ".txt")
         data = file_data.pop("data_path")
         annotations = file_data.pop("label_path")

@@ -180,8 +183,15 @@ class MILCO(BaseODDataset[NDArray[Any]], BaseDatasetNumpyMixin):
         boxes: list[list[float]] = []
         with open(annotation) as f:
             for line in f.readlines():
-                out = line.strip().split(" ")
+                out = line.strip().split()
                 labels.append(int(out[0]))
-                boxes.append([float(out[1]), float(out[2]), float(out[3]), float(out[4])])
+
+                xcenter, ycenter, width, height = [float(out[1]), float(out[2]), float(out[3]), float(out[4])]
+
+                x0 = xcenter - width / 2
+                x1 = x0 + width
+                y0 = ycenter - height / 2
+                y1 = y0 + height
+                boxes.append([x0, y0, x1, y1])

         return boxes, labels, {}
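
The reworked parsing converts YOLO-style (x_center, y_center, width, height) values into (x0, y0, x1, y1) corners, which the base class can then scale to pixels via the _bboxes_per_size path added in _base.py. A quick worked example with a made-up label line:

    line = "0 0.50 0.40 0.20 0.30"  # class, x_center, y_center, width, height (normalized)
    cls, xc, yc, w, h = line.split()
    xc, yc, w, h = float(xc), float(yc), float(w), float(h)

    x0, y0 = xc - w / 2, yc - h / 2  # (0.40, 0.25)
    x1, y1 = x0 + w, y0 + h          # (0.60, 0.55)
    print(int(cls), [x0, y0, x1, y1])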
dataeval/utils/datasets/_mixin.py
@@ -34,8 +34,7 @@ class BaseDatasetNumpyMixin(BaseDatasetMixin[NDArray[Any]]):
         return encoded

     def _read_file(self, path: str) -> NDArray[Any]:
-        x = np.array(Image.open(path)).transpose(2, 0, 1)
-        return x
+        return np.array(Image.open(path)).transpose(2, 0, 1)


 class BaseDatasetTorchMixin(BaseDatasetMixin[torch.Tensor]):
@@ -52,5 +51,4 @@ class BaseDatasetTorchMixin(BaseDatasetMixin[torch.Tensor]):
         return encoded

     def _read_file(self, path: str) -> torch.Tensor:
-        x = torch.as_tensor(np.array(Image.open(path)).transpose(2, 0, 1))
-        return x
+        return torch.as_tensor(np.array(Image.open(path)).transpose(2, 0, 1))
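
Both mixins return images in channels-first (CHW) order; the one-line returns above are behaviorally identical to the two-step versions they replace. For reference, the transpose rearranges PIL's HWC array like so (file name hypothetical):

    import numpy as np
    from PIL import Image

    arr = np.array(Image.open("example.jpg"))  # PIL yields HWC, e.g. (480, 640, 3)
    chw = arr.transpose(2, 0, 1)               # reorder axes to CHW, (3, 480, 640)
    print(arr.shape, chw.shape)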
dataeval/utils/datasets/_mnist.py
@@ -48,7 +48,7 @@ class MNIST(BaseICDataset[NDArray[Any]], BaseDatasetNumpyMixin):
     Parameters
     ----------
     root : str or pathlib.Path
-        Root directory of dataset where the ``mnist`` folder exists.
+        Root directory where the data should be downloaded to or the ``minst`` folder of the already downloaded data.
     image_set : "train", "test" or "base", default "train"
         If "base", returns all of the data to allow the user to create their own splits.
     corruption : "identity", "shot_noise", "impulse_noise", "glass_blur", "motion_blur", \
@@ -154,7 +154,7 @@ class MNIST(BaseICDataset[NDArray[Any]], BaseDatasetNumpyMixin):
     def _load_corruption(self) -> tuple[NDArray[Any], NDArray[np.uintp]]:
         """Function to load in the file paths for the data and labels for the different corrupt data formats"""
         corruption = self.corruption if self.corruption is not None else "identity"
-        base_path = self.path / corruption
+        base_path = self.path / "mnist_c" / corruption
         if self.image_set == "base":
             raw_data = []
             raw_labels = []
@@ -191,8 +191,7 @@ class MNIST(BaseICDataset[NDArray[Any]], BaseDatasetNumpyMixin):

     def _grab_corruption_data(self, path: Path) -> NDArray[Any]:
         """Function to load in the data numpy array for the previously chosen corrupt format"""
-        x = np.load(path, allow_pickle=False)
-        return x
+        return np.load(path, allow_pickle=False)

     def _read_file(self, path: str) -> NDArray[Any]:
         """