dataeval 0.83.0__py3-none-any.whl → 0.84.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50)
  1. dataeval/__init__.py +1 -1
  2. dataeval/config.py +3 -3
  3. dataeval/detectors/drift/__init__.py +2 -2
  4. dataeval/detectors/drift/_base.py +55 -203
  5. dataeval/detectors/drift/_cvm.py +19 -30
  6. dataeval/detectors/drift/_ks.py +18 -30
  7. dataeval/detectors/drift/_mmd.py +189 -53
  8. dataeval/detectors/drift/_uncertainty.py +52 -56
  9. dataeval/detectors/drift/updates.py +13 -12
  10. dataeval/detectors/linters/duplicates.py +5 -3
  11. dataeval/detectors/linters/outliers.py +2 -2
  12. dataeval/detectors/ood/ae.py +1 -1
  13. dataeval/metrics/bias/__init__.py +11 -1
  14. dataeval/metrics/bias/_completeness.py +130 -0
  15. dataeval/metrics/stats/_base.py +28 -32
  16. dataeval/metrics/stats/_dimensionstats.py +2 -2
  17. dataeval/metrics/stats/_hashstats.py +2 -2
  18. dataeval/metrics/stats/_imagestats.py +4 -4
  19. dataeval/metrics/stats/_labelstats.py +4 -45
  20. dataeval/metrics/stats/_pixelstats.py +2 -2
  21. dataeval/metrics/stats/_visualstats.py +2 -2
  22. dataeval/outputs/__init__.py +2 -1
  23. dataeval/outputs/_bias.py +31 -22
  24. dataeval/outputs/_stats.py +2 -3
  25. dataeval/typing.py +25 -22
  26. dataeval/utils/_array.py +43 -7
  27. dataeval/utils/data/_dataset.py +8 -4
  28. dataeval/utils/data/_embeddings.py +141 -24
  29. dataeval/utils/data/_images.py +38 -15
  30. dataeval/utils/data/_metadata.py +5 -4
  31. dataeval/utils/data/_selection.py +3 -15
  32. dataeval/utils/data/_split.py +76 -129
  33. dataeval/utils/data/datasets/_base.py +7 -4
  34. dataeval/utils/data/datasets/_cifar10.py +9 -9
  35. dataeval/utils/data/datasets/_milco.py +42 -14
  36. dataeval/utils/data/datasets/_mnist.py +9 -5
  37. dataeval/utils/data/datasets/_ships.py +8 -4
  38. dataeval/utils/data/datasets/_voc.py +40 -19
  39. dataeval/utils/data/selections/__init__.py +2 -0
  40. dataeval/utils/data/selections/_classbalance.py +38 -0
  41. dataeval/utils/data/selections/_classfilter.py +14 -29
  42. dataeval/utils/data/selections/_prioritize.py +1 -1
  43. dataeval/utils/data/selections/_shuffle.py +2 -2
  44. dataeval/utils/metadata.py +1 -1
  45. dataeval/utils/torch/_internal.py +12 -35
  46. {dataeval-0.83.0.dist-info → dataeval-0.84.1.dist-info}/METADATA +2 -3
  47. {dataeval-0.83.0.dist-info → dataeval-0.84.1.dist-info}/RECORD +49 -48
  48. dataeval/detectors/drift/_torch.py +0 -222
  49. {dataeval-0.83.0.dist-info → dataeval-0.84.1.dist-info}/LICENSE.txt +0 -0
  50. {dataeval-0.83.0.dist-info → dataeval-0.84.1.dist-info}/WHEEL +0 -0
dataeval/utils/data/_split.py
@@ -2,19 +2,22 @@ from __future__ import annotations

  __all__ = []

+ import logging
  import warnings
- from typing import Any, Iterator, Protocol
+ from typing import Any, Iterator, Protocol, Sequence

  import numpy as np
  from numpy.typing import NDArray
- from sklearn.cluster import KMeans
- from sklearn.metrics import silhouette_score
  from sklearn.model_selection import GroupKFold, KFold, StratifiedGroupKFold, StratifiedKFold
  from sklearn.utils.multiclass import type_of_target

- from dataeval.config import get_seed
+ from dataeval.config import EPSILON
  from dataeval.outputs._base import set_metadata
  from dataeval.outputs._utils import SplitDatasetOutput, TrainValSplit
+ from dataeval.typing import AnnotatedDataset
+ from dataeval.utils.data._metadata import Metadata
+
+ _logger = logging.getLogger(__name__)


  class KFoldSplitter(Protocol):
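With the module-level logger in place, the split internals can be traced through the standard logging machinery. A minimal sketch, assuming getLogger(__name__) resolves to the module path listed above (dataeval/utils/data/_split.py):

    import logging

    # Surface the new DEBUG records emitted by the split helpers.
    logging.basicConfig(level=logging.DEBUG)
    logging.getLogger("dataeval.utils.data._split").setLevel(logging.DEBUG)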
@@ -85,7 +88,7 @@ def calculate_validation_fraction(num_folds: int, test_frac: float, val_frac: fl
  return val_base * (1.0 / num_folds) * (1.0 - test_frac)


- def _validate_labels(labels: NDArray[np.intp], total_partitions: int) -> None:
+ def validate_labels(labels: NDArray[np.intp], total_partitions: int) -> None:
  """
  Check to make sure there is more input data than the total number of partitions requested

@@ -116,7 +119,7 @@ def _validate_labels(labels: NDArray[np.intp], total_partitions: int) -> None:
  raise ValueError("Detected continuous labels. Labels must be discrete for proper stratification")


- def is_stratifiable(labels: NDArray[np.intp], num_partitions: int) -> bool:
+ def validate_stratifiable(labels: NDArray[np.intp], num_partitions: int) -> None:
  """
  Check if the dataset can be stratified by class label over the given number of partitions

@@ -132,26 +135,23 @@ def is_stratifiable(labels: NDArray[np.intp], num_partitions: int) -> bool:
  bool
  True if dataset can be stratified else False

- Warns
- -----
- UserWarning
- Warns user if the dataset cannot be stratified due to the total number of [train, val, test]
+ Raises
+ ------
+ ValueError
+ If the dataset cannot be stratified due to the total number of [train, val, test]
  partitions exceeding the number of instances of the rarest class label.
  """

  # Get the minimum count of all labels
  lowest_label_count = np.unique(labels, return_counts=True)[1].min()
  if lowest_label_count < num_partitions:
- warnings.warn(
+ raise ValueError(
  f"Unable to stratify due to label frequency. The lowest label count ({lowest_label_count}) is fewer "
- f"than the total number of partitions ({num_partitions}) requested.",
- UserWarning,
+ f"than the total number of partitions ({num_partitions}) requested."
  )
- return False
- return True


- def is_groupable(group_ids: NDArray[np.intp], num_partitions: int) -> bool:
+ def validate_groupable(groups: NDArray[np.intp], num_partitions: int) -> None:
  """
  Warns user if the number of unique group_ids is incompatible with a grouped partition containing
  num_folds folds. If this is the case, returns groups=None, which tells the partitioner not to
@@ -159,7 +159,7 @@ def is_groupable(group_ids: NDArray[np.intp], num_partitions: int) -> bool:

  Parameters
  ----------
- group_ids : NDArray of ints
+ groups : NDArray of ints
  The id of the group each sample at the corresponding index belongs to
  num_partitions : int
  Total number of train, val, and test splits requested
@@ -169,60 +169,24 @@ def is_groupable(group_ids: NDArray[np.intp], num_partitions: int) -> bool:
  bool
  True if the dataset can be grouped by the given group ids else False

- Warns
- -----
- UserWarning
- Warns if there are fewer groups than the requested number of partitions plus one
+ Raises
+ ------
+ ValueError
+ If there are is only one unique group.
+ ValueError
+ If there are fewer groups than the requested number of partitions plus one
  """

- num_unique_groups = len(np.unique(group_ids))
+ num_unique_groups = len(np.unique(groups))
  # Cannot separate if only one group exists
  if num_unique_groups == 1:
- return False
+ raise ValueError(f"Unique groups ({num_unique_groups}) must be greater than 1.")

  if num_unique_groups < num_partitions:
- warnings.warn(
- f"Groups must be greater than num partitions. Got {num_unique_groups} and {num_partitions}. "
- "Reverting to ungrouped partitioning",
- UserWarning,
- )
- return False
- return True
-
-
- def bin_kmeans(array: NDArray[Any]) -> NDArray[np.intp]:
- """
- Find bins of continuous data by iteratively applying k-means clustering, and keeping the
- clustering with the highest silhouette score.
-
- Parameters
- ----------
- array : NDArray
- continuous data to bin
+ raise ValueError(f"Unique groups ({num_unique_groups}) must be greater than num partitions ({num_partitions}).")

- Returns
- -------
- NDArray[int]:
- bin numbers assigned by the kmeans best clusterer.
- """

- if array.ndim == 1:
- array = array.reshape([-1, 1])
- best_score = 0.60
- else:
- best_score = 0.50
- bin_index = np.zeros(len(array), dtype=np.intp)
- for k in range(2, 20):
- clusterer = KMeans(n_clusters=k, random_state=get_seed())
- cluster_labels = clusterer.fit_predict(array)
- score = silhouette_score(array, cluster_labels, sample_size=25_000, random_state=get_seed())
- if score > best_score:
- best_score = score
- bin_index = cluster_labels.astype(np.intp)
- return bin_index
-
-
- def get_group_ids(metadata: dict[str, Any], group_names: list[str], num_samples: int) -> NDArray[np.intp]:
+ def get_groups(metadata: Metadata, split_on: Sequence[str] | None) -> NDArray[np.intp] | None:
  """
  Returns individual group numbers based on a subset of metadata defined by groupnames

@@ -232,32 +196,20 @@ def get_group_ids(metadata: dict[str, Any], group_names: list[str], num_samples:
  dictionary containing all metadata
  groupnames : list
  which groups from the metadata dictionary to consider for dataset grouping
- num_samples : int
- number of labels. Used to ensure agreement between input data/labels and metadata entries.
-
- Raises
- ------
- IndexError
- raised if an entry in the metadata dictionary doesn't have the same length as num_samples

  Returns
  -------
  np.ndarray
  group identifiers from metadata
  """
- features2group = {k: np.array(v) for k, v in metadata.items() if k in group_names}
- if not features2group:
- return np.zeros(num_samples, dtype=np.intp)
- for name, feature in features2group.items():
- if len(feature) != num_samples:
- raise ValueError(
- f"Feature length does not match number of labels. Got {len(feature)} features and {num_samples} samples"
- )
-
- if type_of_target(feature) == "continuous":
- features2group[name] = bin_kmeans(feature)
- binned_features = np.stack(list(features2group.values()), axis=1)
- _, group_ids = np.unique(binned_features, axis=0, return_inverse=True)
+ # get only the factors that are present in the metadata
+ if split_on is None:
+ return None
+
+ split_set = set(split_on)
+ indices = [i for i, name in enumerate(metadata.discrete_factor_names) if name in split_set]
+ binned_features = metadata.discrete_data[:, indices]
+ group_ids = np.unique(binned_features, axis=0, return_inverse=True)[1]
  return group_ids

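The rewritten get_groups leans on Metadata's pre-binned discrete factors; a toy illustration of how the unique-row inverse from np.unique becomes one group id per sample (the factor values are made up):

    import numpy as np

    # One row per sample, one column per factor named in split_on (already binned).
    binned_features = np.array([[0, 1],
                                [0, 1],
                                [1, 0],
                                [0, 1]])
    # return_inverse maps every row to the index of its unique combination,
    # so samples 0, 1 and 3 land in the same group.
    group_ids = np.unique(binned_features, axis=0, return_inverse=True)[1]
    print(group_ids)  # [0 0 1 0]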
 
@@ -294,10 +246,18 @@ def make_splits(
  split_defs: list[TrainValSplit] = []
  n_labels = len(np.unique(labels))
  splitter = KFOLD_GROUP_STRATIFIED_MAP[(groups is not None, stratified)](n_folds)
+ _logger.log(logging.DEBUG, f"splitter={splitter.__class__.__name__}(n_splits={n_folds})")
  good = False
  attempts = 0
  while not good and attempts < 3:
  attempts += 1
+ _logger.log(
+ logging.DEBUG,
+ f"attempt={attempts}: splitter.split("
+ f"index=arr(len={len(index)}, unique={np.unique(index)}), "
+ f"labels=arr(len={len(index)}, unique={np.unique(index)}), "
+ ("groups=None" if groups is None else f"groups=arr(len={len(groups)}, unique={np.unique(groups)}))"),
+ )
  splits = splitter.split(index, labels, groups)
  split_defs.clear()
  for train_idx, eval_idx in splits:
@@ -341,20 +301,20 @@ def find_best_split(
  counts = np.bincount(arr, minlength=minlength)
  return counts / np.sum(counts)

- def weight(arr: NDArray, class_freq: NDArray) -> np.float64:
- return np.sum(np.abs(freq(arr, len(class_freq)) - class_freq))
+ def weight(arr: NDArray, class_freq: NDArray) -> float:
+ return float(np.sum(np.abs(freq(arr, len(class_freq)) - class_freq)))

- def class_freq_diff(split: TrainValSplit) -> np.float64:
+ def class_freq_diff(split: TrainValSplit) -> float:
  class_freq = freq(labels)
  return weight(labels[split.train], class_freq) + weight(labels[split.val], class_freq)

- def split_ratio(split: TrainValSplit) -> np.float64:
- return np.float64(len(split.val) / (len(split.val) + len(split.train)))
+ def split_ratio(split: TrainValSplit) -> float:
+ return len(split.val) / (len(split.val) + len(split.train))

- def split_diff(split: TrainValSplit) -> np.float64:
+ def split_diff(split: TrainValSplit) -> float:
  return abs(split_frac - split_ratio(split))

- def split_inv_diff(split: TrainValSplit) -> np.float64:
+ def split_inv_diff(split: TrainValSplit) -> float:
  return abs(1 - split_frac - split_ratio(split))

  # Selects minimization function based on inputs
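For reference, the class-frequency weighting above boils down to an L1 distance between per-partition and overall class frequencies; a toy calculation (labels and indices are made up):

    import numpy as np

    labels = np.array([0, 0, 0, 1, 1, 2])

    def freq(arr, minlength=0):
        counts = np.bincount(arr, minlength=minlength)
        return counts / np.sum(counts)

    class_freq = freq(labels)                       # overall class frequencies
    train, val = np.array([0, 1, 3, 5]), np.array([2, 4])
    # Candidate splits with the smallest combined deviation are preferred.
    score = float(np.sum(np.abs(freq(labels[train], len(class_freq)) - class_freq)))
    score += float(np.sum(np.abs(freq(labels[val], len(class_freq)) - class_freq)))
    print(round(score, 3))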
@@ -399,11 +359,12 @@ def single_split(
  Indices of data partitioned for training and evaluation
  """

- _, label_counts = np.unique(labels, return_counts=True)
- max_folds = label_counts.min()
- min_folds = np.unique(groups).shape[0] if groups is not None else 2
- divisor = split_frac + 1e-06 if split_frac <= 2 / 3 else 1 - split_frac - 1e-06
- n_folds = round(min(max(1 / divisor, min_folds), max_folds)) # Clips value between min_folds and max_folds
+ unique_groups = 2 if groups is None else len(np.unique(groups))
+ max_folds = min(min(np.unique(labels, return_counts=True)[1]), unique_groups) if stratified else unique_groups
+
+ divisor = split_frac if split_frac <= 2 / 3 else 1 - split_frac
+ n_folds = min(max(round(1 / (divisor + EPSILON)), 2), max_folds) # Clips value between 2 and max_folds
+ _logger.log(logging.DEBUG, f"n_folds={n_folds} clipped between[2, {max_folds}]")

  split_candidates = make_splits(index, labels, n_folds, groups, stratified)
  return find_best_split(labels, split_candidates, stratified, split_frac)
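A worked instance of the new fold calculation (EPSILON's real value comes from dataeval.config; the constant and counts below are illustrative):

    EPSILON = 1e-12                  # stand-in for dataeval.config.EPSILON
    split_frac = 0.2                 # e.g. a 20 % holdout
    unique_groups = 7                # illustrative count of unique group ids
    max_folds = unique_groups        # not stratified, so only groups bound the fold count
    divisor = split_frac if split_frac <= 2 / 3 else 1 - split_frac
    n_folds = min(max(round(1 / (divisor + EPSILON)), 2), max_folds)
    print(n_folds)                   # 5 -> round(1 / 0.2), clipped to the range [2, 7]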
@@ -411,22 +372,20 @@ def single_split(

  @set_metadata
  def split_dataset(
- labels: list[int] | NDArray[np.intp],
+ dataset: AnnotatedDataset[Any] | Metadata,
  num_folds: int = 1,
  stratify: bool = False,
- split_on: list[str] | None = None,
- metadata: dict[str, Any] | None = None,
+ split_on: Sequence[str] | None = None,
  test_frac: float = 0.0,
  val_frac: float = 0.0,
  ) -> SplitDatasetOutput:
  """
- Top level splitting function. Returns a dataclass containing a list of train and validation indices.
- Indices for a test holdout may also be optionally included
+ Dataset splitting function. Returns a dataclass containing a list of train and validation indices.

  Parameters
  ----------
- labels : list or NDArray of ints
- Classification Labels used to generate splits. Determines the size of the dataset
+ dataset : AnnotatedDataset or Metadata
+ Dataset to split.
  num_folds : int, default 1
  Number of [train, val] folds. If equal to 1, val_frac must be greater than 0.0
  stratify : bool, default False
@@ -436,8 +395,6 @@ def split_dataset(
  Keys of the metadata dictionary upon which to group the dataset.
  A grouped partition is divided such that no group is present within both the training and
  validation set. Split_on groups should be selected to mitigate validation bias
- metadata : dict or None, default None
- Dict containing data for potential dataset grouping. See split_on above
  test_frac : float, default 0.0
  Fraction of data to be optionally held out for test set
  val_frac : float, default 0.0
@@ -450,13 +407,8 @@ def split_dataset(
  Output class containing a list of indices of training
  and validation data for each fold and optional test indices

- Raises
- ------
- TypeError
- Raised if split_on is passed, but metadata is None or empty
-
- Note
- ----
+ Notes
+ -----
  When specifying groups and/or stratification, ratios for test and validation splits can vary
  as the stratification and grouping take higher priority than the percentages
  """
@@ -464,30 +416,25 @@ def split_dataset(
  val_frac = calculate_validation_fraction(num_folds, test_frac, val_frac)
  total_partitions = num_folds + 1 if test_frac else num_folds

- if isinstance(labels, list):
- labels = np.array(labels, dtype=np.intp)
+ metadata = dataset if isinstance(dataset, Metadata) else Metadata(dataset)
+ labels = metadata.class_labels

- label_length: int = len(labels)
+ validate_labels(labels, total_partitions)
+ if stratify:
+ validate_stratifiable(labels, total_partitions)

- _validate_labels(labels, total_partitions)
- stratify &= is_stratifiable(labels, total_partitions)
- groups = None
- if split_on:
- if metadata is None or metadata == {}:
- raise TypeError("If split_on is specified, metadata must also be provided, got None")
- possible_groups = get_group_ids(metadata, split_on, label_length)
+ groups = get_groups(metadata, split_on)
+ if groups is not None:
  # Accounts for a test set that is 100 % of the data
  group_partitions = total_partitions + 1 if val_frac else total_partitions
- if is_groupable(possible_groups, group_partitions):
- groups = possible_groups
+ validate_groupable(groups, group_partitions)

- index = np.arange(label_length)
+ index = np.arange(len(labels))

- tvs = (
- single_split(index=index, labels=labels, split_frac=test_frac, groups=groups, stratified=stratify)
- if test_frac
- else TrainValSplit(index, np.array([], dtype=np.intp))
- )
+ if test_frac:
+ tvs = single_split(index, labels, test_frac, groups, stratify)
+ else:
+ tvs = TrainValSplit(index, np.array([], dtype=np.intp))

  tv_labels = labels[tvs.train]
  tv_groups = groups[tvs.train] if groups is not None else None
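Taken together, a hedged usage sketch of the new entry point; the dataset object and the "scene" factor are placeholders, and the public import path may differ from the private module shown in the file list. Note that unstratifiable or ungroupable inputs now raise ValueError instead of warning and falling back:

    from dataeval.utils.data._split import split_dataset

    # `dataset` stands in for an AnnotatedDataset (or a precomputed Metadata object).
    splits = split_dataset(
        dataset,
        num_folds=5,
        stratify=True,
        split_on=["scene"],   # hypothetical discrete metadata factor to group on
        test_frac=0.2,
    )
    # Each fold exposes train/val index arrays via TrainValSplit.train / .val.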
dataeval/utils/data/datasets/_base.py
@@ -19,9 +19,12 @@ from dataeval.utils.data.datasets._types import (
  )

  if TYPE_CHECKING:
- from dataeval.typing import Transform
+ from dataeval.typing import Array, Transform
+
+ _TArray = TypeVar("_TArray", bound=Array)
+ else:
+ _TArray = TypeVar("_TArray")

- _TArray = TypeVar("_TArray")
  _TTarget = TypeVar("_TTarget")
  _TRawTarget = TypeVar("_TRawTarget", list[int], list[str])

@@ -51,9 +54,9 @@ class BaseDataset(AnnotatedDataset[tuple[_TArray, _TTarget, dict[str, Any]]], Ge
  def __init__(
  self,
  root: str | Path,
- download: bool = False,
- image_set: Literal["train", "val", "test", "base"] = "train",
+ image_set: Literal["train", "val", "test", "operational", "base"] = "train",
  transforms: Transform[_TArray] | Sequence[Transform[_TArray]] | None = None,
+ download: bool = False,
  verbose: bool = False,
  ) -> None:
  self._root: Path = root.absolute() if isinstance(root, Path) else Path(root).absolute()
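Every dataset constructor below inherits this reordering: `download` now comes after `transforms`, so positional calls written against 0.83.0 silently shift meaning. Passing arguments by keyword stays valid on both versions; a sketch using CIFAR10 (the import path and root directory are assumptions):

    from dataeval.utils.data.datasets import CIFAR10

    # Keyword arguments are order-independent, so the 0.84.1 parameter order is harmless.
    ds = CIFAR10(
        root="./data",       # illustrative root directory
        image_set="train",
        transforms=None,
        download=True,
    )
    print(ds.size)           # documented attribute: number of samples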
dataeval/utils/data/datasets/_cifar10.py
@@ -27,13 +27,13 @@ class CIFAR10(BaseICDataset[NDArray[Any]], BaseDatasetNumpyMixin):
  ----------
  root : str or pathlib.Path
  Root directory of dataset where the ``mnist`` folder exists.
- download : bool, default False
- If True, downloads the dataset from the internet and puts it in root directory.
- Class checks to see if data is already downloaded to ensure it does not create a duplicate download.
  image_set : "train", "test" or "base", default "train"
  If "base", returns all of the data to allow the user to create their own splits.
  transforms : Transform, Sequence[Transform] or None, default None
  Transform(s) to apply to the data.
+ download : bool, default False
+ If True, downloads the dataset from the internet and puts it in root directory.
+ Class checks to see if data is already downloaded to ensure it does not create a duplicate download.
  verbose : bool, default False
  If True, outputs print statements.

@@ -43,16 +43,16 @@ class CIFAR10(BaseICDataset[NDArray[Any]], BaseDatasetNumpyMixin):
  Location of the folder containing the data.
  image_set : "train", "test" or "base"
  The selected image set from the dataset.
+ transforms : Sequence[Transform]
+ The transforms to be applied to the data.
+ size : int
+ The size of the dataset.
  index2label : dict[int, str]
  Dictionary which translates from class integers to the associated class strings.
  label2index : dict[str, int]
  Dictionary which translates from class strings to the associated class integers.
  metadata : DatasetMetadata
  Typed dictionary containing dataset metadata, such as `id` which returns the dataset class name.
- transforms : Sequence[Transform]
- The transforms to be applied to the data.
- size : int
- The size of the dataset.
  """

  _resources = [
@@ -80,16 +80,16 @@ class CIFAR10(BaseICDataset[NDArray[Any]], BaseDatasetNumpyMixin):
  def __init__(
  self,
  root: str | Path,
- download: bool = False,
  image_set: Literal["train", "test", "base"] = "train",
  transforms: Transform[NDArray[Any]] | Sequence[Transform[NDArray[Any]]] | None = None,
+ download: bool = False,
  verbose: bool = False,
  ) -> None:
  super().__init__(
  root,
- download,
  image_set,
  transforms,
+ download,
  verbose,
  )

dataeval/utils/data/datasets/_milco.py
@@ -3,7 +3,7 @@ from __future__ import annotations
  __all__ = []

  from pathlib import Path
- from typing import TYPE_CHECKING, Any, Sequence
+ from typing import TYPE_CHECKING, Any, Literal, Sequence

  from numpy.typing import NDArray

@@ -16,21 +16,20 @@ if TYPE_CHECKING:

  class MILCO(BaseODDataset[NDArray[Any]], BaseDatasetNumpyMixin):
  """
- A side-scan sonar dataset focused on mine (object) detection.
+ A side-scan sonar dataset focused on mine-like object detection.

  The dataset comes from the paper
  `Side-scan sonar imaging data of underwater vehicles for mine detection <https://doi.org/10.1016/j.dib.2024.110132>`_
  by N.P. Santos et. al. (2024).

- This class only accesses a portion of the above dataset due to size constraints.
  The full dataset contains 1170 side-scan sonar images collected using a 900-1800 kHz Marine Sonic
  dual frequency side-scan sonar of a Teledyne Marine Gavia Autonomous Underwater Vehicle.
  All the images were carefully analyzed and annotated, including the image coordinates of the
  Bounding Box (BB) of the detected objects divided into NOn-Mine-like BOttom Objects (NOMBO)
  and MIne-Like COntacts (MILCO) classes.

- This dataset is consists of 261 images (120 images from 2015, 93 images from 2017, and 48 images from 2021).
- In these 261 images, there are 315 MILCO objects, and 175 NOMBO objects.
+ This dataset is consists of 345 images from 2010, 120 images from 2015, 93 images from 2017, 564 images from 2018,
+ and 48 images from 2021). In these 1170 images, there are 432 MILCO objects, and 235 NOMBO objects.
  The class “0” corresponds to a MILCO object and the class “1” corresponds to a NOMBO object.
  The raw BB coordinates provided in the downloaded text files are (x, y, w, h),
  given as percentages of the image (x_BB = x/img_width, y_BB = y/img_height, etc.).
@@ -40,11 +39,17 @@ class MILCO(BaseODDataset[NDArray[Any]], BaseDatasetNumpyMixin):
  ----------
  root : str or pathlib.Path
  Root directory of dataset where the ``milco`` folder exists.
+ image_set: "train", "operational", or "base", default "train"
+ If "train", then the images from 2015, 2017 and 2021 are selected,
+ resulting in 315 MILCO objects and 177 NOMBO objects.
+ If "operational", then the images from 2010 and 2018 are selected,
+ resulting in 117 MILCO objects and 58 NOMBO objects.
+ If "base", then the full dataset is selected.
+ transforms : Transform, Sequence[Transform] or None, default None
+ Transform(s) to apply to the data.
  download : bool, default False
  If True, downloads the dataset from the internet and puts it in root directory.
  Class checks to see if data is already downloaded to ensure it does not create a duplicate download.
- transforms : Transform, Sequence[Transform] or None, default None
- Transform(s) to apply to the data.
  verbose : bool, default False
  If True, outputs print statements.

@@ -52,8 +57,8 @@ class MILCO(BaseODDataset[NDArray[Any]], BaseDatasetNumpyMixin):
  ----------
  path : pathlib.Path
  Location of the folder containing the data.
- image_set : "base"
- The base image set is the only available image set for the MILCO dataset.
+ image_set : "train", "operational" or "base"
+ The selected image set from the dataset.
  index2label : dict[int, str]
  Dictionary which translates from class integers to the associated class strings.
  label2index : dict[str, int]
@@ -64,6 +69,10 @@ class MILCO(BaseODDataset[NDArray[Any]], BaseDatasetNumpyMixin):
  The transforms to be applied to the data.
  size : int
  The size of the dataset.
+
+ Note
+ ----
+ Data License: `CC BY 4.0 <https://creativecommons.org/licenses/by/4.0/>`_
  """

  _resources = [
@@ -85,6 +94,18 @@ class MILCO(BaseODDataset[NDArray[Any]], BaseDatasetNumpyMixin):
  md5=True,
  checksum="b84749b21fa95a4a4c7de3741db78bc7",
  ),
+ DataLocation(
+ url="https://figshare.com/ndownloader/files/43169008",
+ filename="2010.zip",
+ md5=True,
+ checksum="43347a0cc383c0d3dbe0d24ae56f328d",
+ ),
+ DataLocation(
+ url="https://figshare.com/ndownloader/files/43169011",
+ filename="2018.zip",
+ md5=True,
+ checksum="25d091044a10c78674fedad655023e3b",
+ ),
  ]

  index2label: dict[int, str] = {
@@ -95,15 +116,16 @@ class MILCO(BaseODDataset[NDArray[Any]], BaseDatasetNumpyMixin):
  def __init__(
  self,
  root: str | Path,
- download: bool = False,
+ image_set: Literal["train", "operational", "base"] = "train",
  transforms: Transform[NDArray[Any]] | Sequence[Transform[NDArray[Any]]] | None = None,
+ download: bool = False,
  verbose: bool = False,
  ) -> None:
  super().__init__(
  root,
- download,
- "base",
+ image_set,
  transforms,
+ download,
  verbose,
  )

@@ -112,10 +134,16 @@ class MILCO(BaseODDataset[NDArray[Any]], BaseDatasetNumpyMixin):
  targets: list[str] = []
  datum_metadata: dict[str, list[Any]] = {}
  metadata_list: list[dict[str, Any]] = []
+ image_sets: dict[str, list[int]] = {
+ "base": list(range(len(self._resources))),
+ "train": list(range(3)),
+ "operational": list(range(3, len(self._resources))),
+ }

  # Load the data
- for resource in self._resources:
- self._resource = resource
+ resource_indices = image_sets[self.image_set]
+ for idx in resource_indices:
+ self._resource = self._resources[idx]
  filepath, target, metadata = super()._load_data()
  filepaths.extend(filepath)
  targets.extend(target)
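With the image_set plumbing above, the 2010/2018 imagery can be loaded as its own operational split; a hedged sketch (import path and root directory are assumptions):

    from dataeval.utils.data.datasets import MILCO

    # "train" -> 2015/2017/2021 resources, "operational" -> 2010/2018, "base" -> all five.
    operational = MILCO(
        root="./data",            # illustrative root directory
        image_set="operational",
        download=True,
    )
    print(operational.size, operational.index2label)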
dataeval/utils/data/datasets/_mnist.py
@@ -49,9 +49,6 @@ class MNIST(BaseICDataset[NDArray[Any]], BaseDatasetNumpyMixin):
  ----------
  root : str or pathlib.Path
  Root directory of dataset where the ``mnist`` folder exists.
- download : bool, default False
- If True, downloads the dataset from the internet and puts it in root directory.
- Class checks to see if data is already downloaded to ensure it does not create a duplicate download.
  image_set : "train", "test" or "base", default "train"
  If "base", returns all of the data to allow the user to create their own splits.
  corruption : "identity", "shot_noise", "impulse_noise", "glass_blur", "motion_blur", \
@@ -60,6 +57,9 @@ class MNIST(BaseICDataset[NDArray[Any]], BaseDatasetNumpyMixin):
  Corruption to apply to the data.
  transforms : Transform, Sequence[Transform] or None, default None
  Transform(s) to apply to the data.
+ download : bool, default False
+ If True, downloads the dataset from the internet and puts it in root directory.
+ Class checks to see if data is already downloaded to ensure it does not create a duplicate download.
  verbose : bool, default False
  If True, outputs print statements.

@@ -81,6 +81,10 @@ class MNIST(BaseICDataset[NDArray[Any]], BaseDatasetNumpyMixin):
  The transforms to be applied to the data.
  size : int
  The size of the dataset.
+
+ Note
+ ----
+ Data License: `CC BY 4.0 <https://creativecommons.org/licenses/by/4.0/>`_ for corruption dataset
  """

  _resources = [
@@ -114,10 +118,10 @@ class MNIST(BaseICDataset[NDArray[Any]], BaseDatasetNumpyMixin):
  def __init__(
  self,
  root: str | Path,
- download: bool = False,
  image_set: Literal["train", "test", "base"] = "train",
  corruption: CorruptionStringMap | None = None,
  transforms: Transform[NDArray[Any]] | Sequence[Transform[NDArray[Any]]] | None = None,
+ download: bool = False,
  verbose: bool = False,
  ) -> None:
  self.corruption = corruption
@@ -127,9 +131,9 @@ class MNIST(BaseICDataset[NDArray[Any]], BaseDatasetNumpyMixin):

  super().__init__(
  root,
- download,
  image_set,
  transforms,
+ download,
  verbose,
  )

dataeval/utils/data/datasets/_ships.py
@@ -31,11 +31,11 @@ class Ships(BaseICDataset[NDArray[Any]], BaseDatasetNumpyMixin):
  ----------
  root : str or pathlib.Path
  Root directory of dataset where the ``shipdataset`` folder exists.
+ transforms : Transform, Sequence[Transform] or None, default None
+ Transform(s) to apply to the data.
  download : bool, default False
  If True, downloads the dataset from the internet and puts it in root directory.
  Class checks to see if data is already downloaded to ensure it does not create a duplicate download.
- transforms : Transform, Sequence[Transform] or None, default None
- Transform(s) to apply to the data.
  verbose : bool, default False
  If True, outputs print statements.

@@ -55,6 +55,10 @@ class Ships(BaseICDataset[NDArray[Any]], BaseDatasetNumpyMixin):
  The transforms to be applied to the data.
  size : int
  The size of the dataset.
+
+ Note
+ ----
+ Data License: `CC BY-SA 4.0 <https://creativecommons.org/licenses/by-sa/4.0/>`_
  """

  _resources = [
@@ -74,15 +78,15 @@ class Ships(BaseICDataset[NDArray[Any]], BaseDatasetNumpyMixin):
  def __init__(
  self,
  root: str | Path,
- download: bool = False,
  transforms: Transform[NDArray[Any]] | Sequence[Transform[NDArray[Any]]] | None = None,
+ download: bool = False,
  verbose: bool = False,
  ) -> None:
  super().__init__(
  root,
- download,
  "base",
  transforms,
+ download,
  verbose,
  )
  self._scenes: list[str] = self._load_scenes()