dataeval 0.86.0__py3-none-any.whl → 0.86.2__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the changes between the versions as they appear in their respective public registries.
Files changed (66)
  1. dataeval/__init__.py +1 -1
  2. dataeval/_log.py +1 -1
  3. dataeval/config.py +21 -4
  4. dataeval/data/_embeddings.py +2 -2
  5. dataeval/data/_images.py +2 -3
  6. dataeval/data/_metadata.py +188 -178
  7. dataeval/data/_selection.py +1 -2
  8. dataeval/data/_split.py +4 -5
  9. dataeval/data/_targets.py +17 -13
  10. dataeval/data/selections/_classfilter.py +2 -5
  11. dataeval/data/selections/_prioritize.py +6 -9
  12. dataeval/data/selections/_shuffle.py +3 -1
  13. dataeval/detectors/drift/_base.py +4 -5
  14. dataeval/detectors/drift/_mmd.py +3 -6
  15. dataeval/detectors/drift/_nml/_base.py +4 -2
  16. dataeval/detectors/drift/_nml/_chunk.py +11 -19
  17. dataeval/detectors/drift/_nml/_domainclassifier.py +8 -19
  18. dataeval/detectors/drift/_nml/_result.py +8 -9
  19. dataeval/detectors/drift/_nml/_thresholds.py +66 -77
  20. dataeval/detectors/linters/outliers.py +7 -7
  21. dataeval/metadata/_distance.py +10 -7
  22. dataeval/metadata/_ood.py +11 -103
  23. dataeval/metrics/bias/_balance.py +23 -33
  24. dataeval/metrics/bias/_diversity.py +16 -14
  25. dataeval/metrics/bias/_parity.py +18 -18
  26. dataeval/metrics/estimators/_divergence.py +2 -4
  27. dataeval/metrics/stats/_base.py +103 -42
  28. dataeval/metrics/stats/_boxratiostats.py +21 -19
  29. dataeval/metrics/stats/_dimensionstats.py +14 -10
  30. dataeval/metrics/stats/_hashstats.py +1 -1
  31. dataeval/metrics/stats/_pixelstats.py +6 -6
  32. dataeval/metrics/stats/_visualstats.py +3 -3
  33. dataeval/outputs/_base.py +22 -7
  34. dataeval/outputs/_bias.py +24 -70
  35. dataeval/outputs/_drift.py +1 -9
  36. dataeval/outputs/_linters.py +11 -11
  37. dataeval/outputs/_stats.py +82 -23
  38. dataeval/outputs/_workflows.py +2 -2
  39. dataeval/utils/_array.py +6 -9
  40. dataeval/utils/_bin.py +1 -2
  41. dataeval/utils/_clusterer.py +7 -4
  42. dataeval/utils/_fast_mst.py +27 -13
  43. dataeval/utils/_image.py +65 -11
  44. dataeval/utils/_mst.py +1 -3
  45. dataeval/utils/_plot.py +15 -10
  46. dataeval/utils/data/_dataset.py +54 -28
  47. dataeval/utils/data/metadata.py +104 -82
  48. dataeval/utils/datasets/__init__.py +2 -0
  49. dataeval/utils/datasets/_antiuav.py +189 -0
  50. dataeval/utils/datasets/_base.py +11 -8
  51. dataeval/utils/datasets/_cifar10.py +104 -45
  52. dataeval/utils/datasets/_fileio.py +21 -47
  53. dataeval/utils/datasets/_milco.py +22 -12
  54. dataeval/utils/datasets/_mixin.py +2 -4
  55. dataeval/utils/datasets/_mnist.py +3 -4
  56. dataeval/utils/datasets/_ships.py +14 -7
  57. dataeval/utils/datasets/_voc.py +229 -42
  58. dataeval/utils/torch/models.py +5 -10
  59. dataeval/utils/torch/trainer.py +3 -3
  60. dataeval/workflows/sufficiency.py +2 -2
  61. {dataeval-0.86.0.dist-info → dataeval-0.86.2.dist-info}/METADATA +2 -1
  62. dataeval-0.86.2.dist-info/RECORD +114 -0
  63. dataeval/detectors/ood/vae.py +0 -74
  64. dataeval-0.86.0.dist-info/RECORD +0 -114
  65. {dataeval-0.86.0.dist-info → dataeval-0.86.2.dist-info}/LICENSE.txt +0 -0
  66. {dataeval-0.86.0.dist-info → dataeval-0.86.2.dist-info}/WHEEL +0 -0
dataeval/utils/_image.py CHANGED
@@ -12,6 +12,9 @@ from scipy.signal import convolve2d
 EDGE_KERNEL = np.array([[-1, -1, -1], [-1, 8, -1], [-1, -1, -1]], dtype=np.int8)
 BIT_DEPTH = (1, 8, 12, 16, 32)
 
+Box = tuple[int, int, int, int]
+"""Bounding box as tuple of integers in x0, y0, x1, y1 format."""
+
 
 @dataclass
 class BitDepth:
@@ -25,12 +28,11 @@ def get_bitdepth(image: NDArray[Any]) -> BitDepth:
     Approximates the bit depth of the image using the
     min and max pixel values.
     """
-    pmin, pmax = np.min(image), np.max(image)
+    pmin, pmax = np.nanmin(image), np.nanmax(image)
     if pmin < 0:
         return BitDepth(0, pmin, pmax)
-    else:
-        depth = ([x for x in BIT_DEPTH if 2**x > pmax] or [max(BIT_DEPTH)])[0]
-        return BitDepth(depth, 0, 2**depth - 1)
+    depth = ([x for x in BIT_DEPTH if 2**x > pmax] or [max(BIT_DEPTH)])[0]
+    return BitDepth(depth, 0, 2**depth - 1)
 
 
 def rescale(image: NDArray[Any], depth: int = 1) -> NDArray[Any]:
@@ -40,9 +42,8 @@ def rescale(image: NDArray[Any], depth: int = 1) -> NDArray[Any]:
     bitdepth = get_bitdepth(image)
     if bitdepth.depth == depth:
         return image
-    else:
-        normalized = (image + bitdepth.pmin) / (bitdepth.pmax - bitdepth.pmin)
-        return normalized * (2**depth - 1)
+    normalized = (image + bitdepth.pmin) / (bitdepth.pmax - bitdepth.pmin)
+    return normalized * (2**depth - 1)
 
 
 def normalize_image_shape(image: NDArray[Any]) -> NDArray[Any]:
@@ -52,13 +53,12 @@ def normalize_image_shape(image: NDArray[Any]) -> NDArray[Any]:
     ndim = image.ndim
     if ndim == 2:
         return np.expand_dims(image, axis=0)
-    elif ndim == 3:
+    if ndim == 3:
         return image
-    elif ndim > 3:
+    if ndim > 3:
         # Slice all but the last 3 dimensions
         return image[(0,) * (ndim - 3)]
-    else:
-        raise ValueError("Images must have 2 or more dimensions.")
+    raise ValueError("Images must have 2 or more dimensions.")
 
 
 def edge_filter(image: NDArray[Any], offset: float = 0.5) -> NDArray[np.uint8]:
@@ -71,3 +71,57 @@ def edge_filter(image: NDArray[Any], offset: float = 0.5) -> NDArray[np.uint8]:
     edges = convolve2d(image, EDGE_KERNEL, mode="same", boundary="symm") + offset
     np.clip(edges, 0, 255, edges)
     return edges
+
+
+def clip_box(image: NDArray[Any], box: Box) -> Box:
+    """
+    Clip the box to inside the provided image dimensions.
+    """
+    x0, y0, x1, y1 = box
+    h, w = image.shape[-2:]
+
+    return max(0, x0), max(0, y0), min(w, x1), min(h, y1)
+
+
+def is_valid_box(box: Box) -> bool:
+    """
+    Check if the box dimensions provided are a valid image.
+    """
+    return box[2] > box[0] and box[3] > box[1]
+
+
+def clip_and_pad(image: NDArray[Any], box: Box) -> NDArray[Any]:
+    """
+    Extract a region from an image based on a bounding box, clipping to image boundaries
+    and padding out-of-bounds areas with np.nan.
+
+    Parameters:
+    -----------
+    image : NDArray[Any]
+        Input image array in format C, H, W (channels first)
+    box : Box
+        Bounding box coordinates as (x0, y0, x1, y1) where (x0, y0) is top-left and (x1, y1) is bottom-right
+
+    Returns:
+    --------
+    NDArray[Any]
+        The extracted region with out-of-bounds areas padded with np.nan
+    """
+
+    # Create output array filled with NaN with a minimum size of 1x1
+    bw, bh = max(1, box[2] - box[0]), max(1, box[3] - box[1])
+
+    output = np.full((image.shape[-3] if image.ndim > 2 else 1, bh, bw), np.nan)
+
+    # Calculate source box
+    sbox = clip_box(image, box)
+
+    # Calculate destination box
+    x0, y0 = sbox[0] - box[0], sbox[1] - box[1]
+    x1, y1 = x0 + (sbox[2] - sbox[0]), y0 + (sbox[3] - sbox[1])
+
+    # Copy the source if valid from the image to the output
+    if is_valid_box(sbox):
+        output[:, y0:y1, x0:x1] = image[:, sbox[1] : sbox[3], sbox[0] : sbox[2]]
+
+    return output
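The new clip_and_pad helper pairs with the np.nanmin/np.nanmax change in get_bitdepth: pixels outside the image become NaN rather than skewing statistics. A minimal usage sketch, assuming the helpers are imported from the private module dataeval/utils/_image.py shown in this diff:

    import numpy as np
    from dataeval.utils._image import clip_and_pad  # private module; path per this diff

    # 3-channel 4x4 image; the box extends one column past the right edge
    image = np.arange(3 * 4 * 4, dtype=np.float64).reshape(3, 4, 4)
    region = clip_and_pad(image, (2, 1, 5, 3))  # (x0, y0, x1, y1)

    print(region.shape)            # (3, 2, 3): channels, box height, box width
    print(np.isnan(region).any())  # True: the column at x=4 falls outside the image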
dataeval/utils/_mst.py CHANGED
@@ -83,6 +83,4 @@ def compute_neighbors(
 
     nbrs = NearestNeighbors(n_neighbors=k + 1, algorithm=algorithm).fit(B)
     nns = nbrs.kneighbors(A)[1]
-    nns = nns[:, 1:].squeeze()
-
-    return nns
+    return nns[:, 1:].squeeze()
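For context, the pattern above requests k + 1 neighbors because when a point set is queried against itself, each point's nearest neighbor is itself in column 0. A standalone scikit-learn sketch of the same idiom (not dataeval code):

    import numpy as np
    from sklearn.neighbors import NearestNeighbors

    points = np.array([[0.0, 0.0], [1.0, 0.0], [0.0, 1.0], [5.0, 5.0]])
    k = 2
    nbrs = NearestNeighbors(n_neighbors=k + 1).fit(points)
    indices = nbrs.kneighbors(points)[1]  # column 0 is each point itself
    print(indices[:, 1:])                 # the k true nearest neighbors per point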
dataeval/utils/_plot.py CHANGED
@@ -3,6 +3,7 @@ from __future__ import annotations
 __all__ = []
 
 import contextlib
+import math
 from typing import Any
 
 import numpy as np
@@ -160,11 +161,9 @@ def histogram_plot(
     import matplotlib.pyplot as plt
 
     num_metrics = len(data_dict)
-    if num_metrics > 2:
-        rows = int(len(data_dict) / 3)
-        fig, axs = plt.subplots(rows, 3, figsize=(10, rows * 2.5))
-    else:
-        fig, axs = plt.subplots(1, num_metrics, figsize=(4 * num_metrics, 4))
+    rows = math.ceil(num_metrics / 3)
+    cols = min(num_metrics, 3)
+    fig, axs = plt.subplots(rows, 3, figsize=(cols * 3 + 1, rows * 3))
 
     for ax, metric in zip(
         axs.flat,
@@ -178,6 +177,10 @@ def histogram_plot(
         ax.set_ylabel(ylabel)
         ax.set_xlabel(xlabel)
 
+    for ax in axs.flat[num_metrics:]:
+        ax.axis("off")
+        ax.set_visible(False)
+
     fig.tight_layout()
     return fig
 
@@ -216,11 +219,9 @@ def channel_histogram_plot(
     label_kwargs = {"label": [f"Channel {i}" for i in range(max_channels)]}
 
     num_metrics = len(data_keys)
-    if num_metrics > 2:
-        rows = int(len(data_keys) / 3)
-        fig, axs = plt.subplots(rows, 3, figsize=(10, rows * 2.5))
-    else:
-        fig, axs = plt.subplots(1, num_metrics, figsize=(4 * num_metrics, 4))
+    rows = math.ceil(num_metrics / 3)
+    cols = min(num_metrics, 3)
+    fig, axs = plt.subplots(rows, 3, figsize=(cols * 3 + 1, rows * 3))
 
     for ax, metric in zip(
         axs.flat,
@@ -245,5 +246,9 @@ def channel_histogram_plot(
         ax.set_ylabel(ylabel)
         ax.set_xlabel(xlabel)
 
+    for ax in axs.flat[num_metrics:]:
+        ax.axis("off")
+        ax.set_visible(False)
+
     fig.tight_layout()
     return fig
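Both plotting functions now share the same grid arithmetic: always three columns, math.ceil rows, and any leftover axes switched off. The old int(n / 3) branch under-allocated rows (e.g. four metrics produced a 1x3 grid, so the zip over axs.flat silently dropped the fourth plot). A quick check of the new layout math:

    import math

    for num_metrics in (1, 3, 4, 7):
        rows = math.ceil(num_metrics / 3)
        cols = min(num_metrics, 3)
        print(f"{num_metrics} metrics -> {rows} row(s), figsize {(cols * 3 + 1, rows * 3)}")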
dataeval/utils/data/_dataset.py CHANGED
@@ -2,7 +2,7 @@ from __future__ import annotations
 
 __all__ = []
 
-from typing import Any, Generic, Iterable, Literal, Sequence, TypeVar
+from typing import Any, Generic, Iterable, Literal, Sequence, SupportsFloat, SupportsInt, TypeVar, cast
 
 from dataeval.typing import (
     Array,
@@ -17,9 +17,9 @@ from dataeval.utils._array import as_numpy
 def _validate_data(
     datum_type: Literal["ic", "od"],
     images: Array | Sequence[Array],
-    labels: Sequence[int] | Sequence[Sequence[int]],
-    bboxes: Sequence[Sequence[Sequence[float]]] | None,
-    metadata: Sequence[dict[str, Any]] | None,
+    labels: Array | Sequence[int] | Sequence[Array] | Sequence[Sequence[int]],
+    bboxes: Array | Sequence[Array] | Sequence[Sequence[Array]] | Sequence[Sequence[Sequence[float]]] | None,
+    metadata: Sequence[dict[str, Any]] | dict[str, Sequence[Any]] | None,
 ) -> None:
     # Validate inputs
     dataset_len = len(images)
@@ -30,20 +30,31 @@ def _validate_data(
         raise ValueError(f"Number of labels ({len(labels)}) does not match number of images ({dataset_len}).")
     if bboxes is not None and len(bboxes) != dataset_len:
         raise ValueError(f"Number of bboxes ({len(bboxes)}) does not match number of images ({dataset_len}).")
-    if metadata is not None and len(metadata) != dataset_len:
+    if metadata is not None and (
+        len(metadata) != dataset_len
+        if isinstance(metadata, Sequence)
+        else any(
+            not isinstance(metadatum, Sequence) or len(metadatum) != dataset_len for metadatum in metadata.values()
+        )
+    ):
         raise ValueError(f"Number of metadata ({len(metadata)}) does not match number of images ({dataset_len}).")
 
     if datum_type == "ic":
-        if not isinstance(labels, Sequence) or not isinstance(labels[0], int):
+        if not isinstance(labels, (Sequence, Array)) or not isinstance(labels[0], (int, SupportsInt)):
            raise TypeError("Labels must be a sequence of integers for image classification.")
     elif datum_type == "od":
-        if not isinstance(labels, Sequence) or not isinstance(labels[0], Sequence) or not isinstance(labels[0][0], int):
+        if (
+            not isinstance(labels, (Sequence, Array))
+            or not isinstance(labels[0], (Sequence, Array))
+            or not isinstance(cast(Sequence[Any], labels[0])[0], (int, SupportsInt))
+        ):
             raise TypeError("Labels must be a sequence of sequences of integers for object detection.")
         if (
             bboxes is None
             or not isinstance(bboxes, (Sequence, Array))
             or not isinstance(bboxes[0], (Sequence, Array))
             or not isinstance(bboxes[0][0], (Sequence, Array))
+            or not isinstance(bboxes[0][0][0], (float, SupportsFloat))
             or not len(bboxes[0][0]) == 4
         ):
             raise TypeError("Boxes must be a sequence of sequences of (x0, y0, x1, y1) for object detection.")
@@ -51,12 +62,19 @@ def _validate_data(
         raise ValueError(f"Unknown datum type '{datum_type}'. Must be 'ic' or 'od'.")
 
 
+def _listify_metadata(
+    metadata: Sequence[dict[str, Any]] | dict[str, Sequence[Any]] | None,
+) -> Sequence[dict[str, Any]] | None:
+    if isinstance(metadata, dict):
+        return [{k: v[i] for k, v in metadata.items()} for i in range(len(next(iter(metadata.values()))))]
+    return metadata
+
+
 def _find_max(arr: ArrayLike) -> Any:
-    if isinstance(arr, (Iterable, Sequence, Array)):
+    if not isinstance(arr, (bytes, str)) and isinstance(arr, (Iterable, Sequence, Array)):
         if isinstance(arr[0], (Iterable, Sequence, Array)):
             return max([_find_max(x) for x in arr])  # type: ignore
-        else:
-            return max(arr)
+        return max(arr)
     return arr
 
 
@@ -92,12 +110,14 @@ class CustomImageClassificationDataset(BaseAnnotatedDataset[Sequence[int]], Imag
     def __init__(
         self,
         images: Array | Sequence[Array],
-        labels: Sequence[int],
+        labels: Array | Sequence[int],
         metadata: Sequence[dict[str, Any]] | None,
         classes: Sequence[str] | None,
         name: str | None = None,
     ) -> None:
-        super().__init__("ic", images, labels, metadata, classes)
+        super().__init__(
+            "ic", images, as_numpy(labels).tolist() if isinstance(labels, Array) else labels, metadata, classes
+        )
         if name is not None:
             self.__name__ = name
             self.__class__.__name__ = name
@@ -135,18 +155,24 @@ class CustomObjectDetectionDataset(BaseAnnotatedDataset[Sequence[Sequence[int]]]
     def __init__(
         self,
         images: Array | Sequence[Array],
-        labels: Sequence[Sequence[int]],
-        bboxes: Sequence[Sequence[Sequence[float]]],
+        labels: Array | Sequence[Array] | Sequence[Sequence[int]],
+        bboxes: Array | Sequence[Array] | Sequence[Sequence[Array]] | Sequence[Sequence[Sequence[float]]],
        metadata: Sequence[dict[str, Any]] | None,
        classes: Sequence[str] | None,
        name: str | None = None,
     ) -> None:
-        super().__init__("od", images, labels, metadata, classes)
+        super().__init__(
+            "od",
+            images,
+            [as_numpy(label).tolist() if isinstance(label, Array) else label for label in labels],
+            metadata,
+            classes,
+        )
         if name is not None:
             self.__name__ = name
             self.__class__.__name__ = name
             self.__class__.__qualname__ = name
-        self._bboxes = bboxes
+        self._bboxes = [[as_numpy(box).tolist() if isinstance(box, Array) else box for box in bbox] for bbox in bboxes]
 
     @property
     def metadata(self) -> DatasetMetadata:
@@ -162,8 +188,8 @@ class CustomObjectDetectionDataset(BaseAnnotatedDataset[Sequence[Sequence[int]]]
 
 def to_image_classification_dataset(
     images: Array | Sequence[Array],
-    labels: Sequence[int],
-    metadata: Sequence[dict[str, Any]] | None,
+    labels: Array | Sequence[int],
+    metadata: Sequence[dict[str, Any]] | dict[str, Sequence[Any]] | None,
     classes: Sequence[str] | None,
     name: str | None = None,
 ) -> ImageClassificationDataset:
@@ -174,9 +200,9 @@ def to_image_classification_dataset(
     ----------
     images : Array | Sequence[Array]
         The images to use in the dataset.
-    labels : Sequence[int]
+    labels : Array | Sequence[int]
         The labels to use in the dataset.
-    metadata : Sequence[dict[str, Any]] | None
+    metadata : Sequence[dict[str, Any]] | dict[str, Sequence[Any]] | None
         The metadata to use in the dataset.
     classes : Sequence[str] | None
         The classes to use in the dataset.
@@ -186,14 +212,14 @@ def to_image_classification_dataset(
     ImageClassificationDataset
     """
     _validate_data("ic", images, labels, None, metadata)
-    return CustomImageClassificationDataset(images, labels, metadata, classes, name)
+    return CustomImageClassificationDataset(images, labels, _listify_metadata(metadata), classes, name)
 
 
 def to_object_detection_dataset(
     images: Array | Sequence[Array],
-    labels: Sequence[Sequence[int]],
-    bboxes: Sequence[Sequence[Sequence[float]]],
-    metadata: Sequence[dict[str, Any]] | None,
+    labels: Array | Sequence[Array] | Sequence[Sequence[int]],
+    bboxes: Array | Sequence[Array] | Sequence[Sequence[Array]] | Sequence[Sequence[Sequence[float]]],
+    metadata: Sequence[dict[str, Any]] | dict[str, Sequence[Any]] | None,
     classes: Sequence[str] | None,
     name: str | None = None,
 ) -> ObjectDetectionDataset:
@@ -204,11 +230,11 @@ def to_object_detection_dataset(
     ----------
     images : Array | Sequence[Array]
         The images to use in the dataset.
-    labels : Sequence[Sequence[int]]
+    labels : Array | Sequence[Array] | Sequence[Sequence[int]]
        The labels to use in the dataset.
-    bboxes : Sequence[Sequence[Sequence[float]]]
+    bboxes : Array | Sequence[Array] | Sequence[Sequence[Array]] | Sequence[Sequence[Sequence[float]]]
        The bounding boxes (x0,y0,x1,y0) to use in the dataset.
-    metadata : Sequence[dict[str, Any]] | None
+    metadata : Sequence[dict[str, Any]] | dict[str, Sequence[Any]] | None
        The metadata to use in the dataset.
     classes : Sequence[str] | None
        The classes to use in the dataset.
@@ -218,4 +244,4 @@ def to_object_detection_dataset(
     ObjectDetectionDataset
     """
     _validate_data("od", images, labels, bboxes, metadata)
-    return CustomObjectDetectionDataset(images, labels, bboxes, metadata, classes, name)
+    return CustomObjectDetectionDataset(images, labels, bboxes, _listify_metadata(metadata), classes, name)
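The widened signatures accept NumPy-style arrays for labels and boxes and a dict-of-sequences for metadata, which _listify_metadata converts to per-image dicts. A hypothetical usage sketch; the public import path and class names are assumptions, since this diff only shows the private module dataeval/utils/data/_dataset.py:

    import numpy as np
    from dataeval.utils.data import to_object_detection_dataset  # import path assumed

    images = [np.zeros((3, 16, 16)) for _ in range(2)]
    labels = [np.array([0]), np.array([1, 1])]  # Sequence[Array], one label per box
    bboxes = [
        np.array([[0.0, 0.0, 4.0, 4.0]]),
        np.array([[1.0, 1.0, 5.0, 5.0], [2.0, 2.0, 6.0, 6.0]]),
    ]
    metadata = {"altitude": [100, 250]}  # dict[str, Sequence[Any]], one value per image

    dataset = to_object_detection_dataset(images, labels, bboxes, metadata, classes=["boat", "buoy"])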
dataeval/utils/data/metadata.py CHANGED
@@ -228,58 +228,130 @@ def flatten(
 
     if return_dropped:
         return output, size, _sorted_drop_reasons(dropped)
+    if dropped:
+        dropped_items = "\n".join([f" {k}: {v}" for k, v in _sorted_drop_reasons(dropped).items()])
+        warnings.warn(f"Metadata entries were dropped:\n{dropped_items}")
+    return output, size
+
+
+def _flatten_for_merge(
+    metadatum: Mapping[str, Any],
+    ignore_lists: bool,
+    fully_qualified: bool,
+    targets: int | None,
+) -> tuple[dict[str, list[Any]] | dict[str, Any], int, dict[str, list[str]]]:
+    flattened, image_repeats, dropped_inner = flatten(
+        metadatum, return_dropped=True, ignore_lists=ignore_lists, fully_qualified=fully_qualified
+    )
+    if targets is not None:
+        # check for mismatch in targets per image and force ignore_lists
+        if not ignore_lists and targets != image_repeats:
+            flattened, image_repeats, dropped_inner = flatten(
+                metadatum, return_dropped=True, ignore_lists=True, fully_qualified=fully_qualified
+            )
+        if targets != image_repeats:
+            flattened = {k: [v] * targets for k, v in flattened.items()}
+            image_repeats = targets
+    return flattened, image_repeats, dropped_inner
+
+
+def _merge(
+    dicts: list[Mapping[str, Any]],
+    ignore_lists: bool,
+    fully_qualified: bool,
+    targets_per_image: Sequence[int] | None,
+) -> tuple[dict[str, list[Any]], dict[str, set[DropReason]], NDArray[np.intp]]:
+    merged: dict[str, list[Any]] = {}
+    isect: set[str] = set()
+    union: set[str] = set()
+    image_repeats = np.zeros(len(dicts), dtype=np.int_)
+    dropped: dict[str, set[DropReason]] = {}
+    for i, d in enumerate(dicts):
+        targets = None if targets_per_image is None else targets_per_image[i]
+        flattened, image_repeats[i], dropped_inner = _flatten_for_merge(d, ignore_lists, fully_qualified, targets)
+        isect = isect.intersection(flattened.keys()) if isect else set(flattened.keys())
+        union.update(flattened.keys())
+        for k, v in dropped_inner.items():
+            dropped.setdefault(k, set()).update({DropReason(vv) for vv in v})
+        for k, v in flattened.items():
+            merged.setdefault(k, []).extend(flattened[k]) if isinstance(v, list) else merged.setdefault(k, []).append(v)
+
+    for k in union - isect:
+        dropped.setdefault(k, set()).add(DropReason.INCONSISTENT_KEY)
+
+    if image_repeats.sum() == image_repeats.size:
+        image_indices = np.arange(image_repeats.size)
     else:
-        if dropped:
-            dropped_items = "\n".join([f" {k}: {v}" for k, v in _sorted_drop_reasons(dropped).items()])
-            warnings.warn(f"Metadata entries were dropped:\n{dropped_items}")
-        return output, size
+        image_ids = np.arange(image_repeats.size)
+        image_data = np.concatenate(
+            [np.repeat(image_ids[i], image_repeats[i]) for i in range(image_ids.size)], dtype=np.int_
+        )
+        _, image_unsorted = np.unique(image_data, return_inverse=True)
+        image_indices = np.sort(image_unsorted)
 
+    merged = {k: _simplify_type(v) for k, v in merged.items() if k in isect}
+    return merged, dropped, image_indices
 
-def _is_metadata_dict_of_dicts(metadata: Mapping) -> bool:
-    """EXPERIMENTAL: Attempt to detect if metadata is a dict of dicts"""
-    # single dict
-    if len(metadata) < 2:
-        return False
 
-    # dict of non dicts
-    keys = list(metadata)
-    if not isinstance(metadata[keys[0]], Mapping):
-        return False
+@overload
+def merge(
+    metadata: Iterable[Mapping[str, Any]],
+    *,
+    return_dropped: Literal[True],
+    return_numpy: Literal[False] = False,
+    ignore_lists: bool = False,
+    fully_qualified: bool = False,
+    targets_per_image: Sequence[int] | None = None,
+    image_index_key: str = "_image_index",
+) -> tuple[dict[str, list[Any]], dict[str, list[str]]]: ...
 
-    # dict of dicts with matching keys
-    return set(metadata[keys[0]]) == set(metadata[keys[1]])
+@overload
+def merge(
+    metadata: Iterable[Mapping[str, Any]],
+    *,
+    return_dropped: Literal[False] = False,
+    return_numpy: Literal[False] = False,
+    ignore_lists: bool = False,
+    fully_qualified: bool = False,
+    targets_per_image: Sequence[int] | None = None,
+    image_index_key: str = "_image_index",
+) -> dict[str, list[Any]]: ...
 
 
 @overload
 def merge(
     metadata: Iterable[Mapping[str, Any]],
+    *,
     return_dropped: Literal[True],
+    return_numpy: Literal[True],
     ignore_lists: bool = False,
     fully_qualified: bool = False,
-    return_numpy: bool = False,
     targets_per_image: Sequence[int] | None = None,
     image_index_key: str = "_image_index",
-) -> tuple[dict[str, list[Any]] | dict[str, NDArray[Any]], dict[str, list[str]]]: ...
+) -> tuple[dict[str, NDArray[Any]], dict[str, list[str]]]: ...
 
 
 @overload
 def merge(
     metadata: Iterable[Mapping[str, Any]],
+    *,
     return_dropped: Literal[False] = False,
+    return_numpy: Literal[True],
     ignore_lists: bool = False,
     fully_qualified: bool = False,
-    return_numpy: bool = False,
     targets_per_image: Sequence[int] | None = None,
     image_index_key: str = "_image_index",
-) -> dict[str, list[Any]] | dict[str, NDArray[Any]]: ...
+) -> dict[str, NDArray[Any]]: ...
 
 
 def merge(
     metadata: Iterable[Mapping[str, Any]],
+    *,
     return_dropped: bool = False,
+    return_numpy: bool = False,
     ignore_lists: bool = False,
     fully_qualified: bool = False,
-    return_numpy: bool = False,
     targets_per_image: Sequence[int] | None = None,
     image_index_key: str = "_image_index",
 ):
@@ -298,12 +370,12 @@ def merge(
         Iterable collection of metadata dictionaries to flatten and merge
     return_dropped: bool, default False
         Option to return a dictionary of dropped keys and the reason(s) for dropping
+    return_numpy : bool, default False
+        Option to return results as lists or NumPy arrays
     ignore_lists : bool, default False
         Option to skip expanding lists within metadata
     fully_qualified : bool, default False
         Option to return dictionary keys full qualified instead of minimized
-    return_numpy : bool, default False
-        Option to return results as lists or NumPy arrays
     targets_per_image : Sequence[int] or None, default None
         Number of targets for each image metadata entry
     image_index_key : str, default "_image_index"
@@ -330,74 +402,24 @@ def merge(
     >>> dropped_keys
     {'target_c': ['inconsistent_key']}
     """
-    merged: dict[str, list[Any]] = {}
-    isect: set[str] = set()
-    union: set[str] = set()
-    keys: list[str] | None = None
-    dicts: list[Mapping[str, Any]]
-
-    # EXPERIMENTAL
-    if isinstance(metadata, Mapping) and _is_metadata_dict_of_dicts(metadata):
-        warnings.warn("Experimental processing for dict of dicts.")
-        keys = [str(k) for k in metadata]
-        dicts = list(metadata.values())
-        ignore_lists = True
-    else:
-        dicts = list(metadata)
+
+    dicts: list[Mapping[str, Any]] = list(metadata)
 
     if targets_per_image is not None and len(dicts) != len(targets_per_image):
         raise ValueError("Number of targets per image must be equal to number of metadata entries.")
 
-    image_repeats = np.zeros(len(dicts), dtype=np.int_)
-    dropped: dict[str, set[DropReason]] = {}
-    for i, d in enumerate(dicts):
-        flattened, image_repeats[i], dropped_inner = flatten(
-            d, return_dropped=True, ignore_lists=ignore_lists, fully_qualified=fully_qualified
-        )
-        if targets_per_image is not None:
-            # check for mismatch in targets per image and force ignore_lists
-            if not ignore_lists and targets_per_image[i] != image_repeats[i]:
-                flattened, image_repeats[i], dropped_inner = flatten(
-                    d, return_dropped=True, ignore_lists=True, fully_qualified=fully_qualified
-                )
-            if targets_per_image[i] != image_repeats[i]:
-                flattened = {k: [v] * targets_per_image[i] for k, v in flattened.items()}
-                image_repeats[i] = targets_per_image[i]
-        isect = isect.intersection(flattened.keys()) if isect else set(flattened.keys())
-        union.update(flattened.keys())
-        for k, v in dropped_inner.items():
-            dropped.setdefault(k, set()).update({DropReason(vv) for vv in v})
-        for k, v in flattened.items():
-            merged.setdefault(k, []).extend(flattened[k]) if isinstance(v, list) else merged.setdefault(k, []).append(v)
-
-    for k in union - isect:
-        dropped.setdefault(k, set()).add(DropReason.INCONSISTENT_KEY)
-
-    if image_repeats.sum() == image_repeats.size:
-        image_indices = np.arange(image_repeats.size)
-    else:
-        image_ids = np.arange(image_repeats.size)
-        image_data = np.concatenate(
-            [np.repeat(image_ids[i], image_repeats[i]) for i in range(image_ids.size)], dtype=np.int_
-        )
-        _, image_unsorted = np.unique(image_data, return_inverse=True)
-        image_indices = np.sort(image_unsorted)
-
-    output: dict[str, Any] = {}
+    merged, dropped, image_indices = _merge(dicts, ignore_lists, fully_qualified, targets_per_image)
 
-    if keys:
-        output["keys"] = np.array(keys) if return_numpy else keys
+    output: dict[str, Any] = {k: np.asarray(v) for k, v in merged.items()} if return_numpy else merged
 
-    for k in (key for key in merged if key in isect):
-        cv = _simplify_type(merged[k])
-        output[k] = np.array(cv) if return_numpy else cv
     if image_index_key not in output:
         output[image_index_key] = image_indices if return_numpy else image_indices.tolist()
 
     if return_dropped:
         return output, _sorted_drop_reasons(dropped)
-    else:
-        if dropped:
-            dropped_items = "\n".join([f" {k}: {v}" for k, v in _sorted_drop_reasons(dropped).items()])
-            warnings.warn(f"Metadata entries were dropped:\n{dropped_items}")
-        return output
+
+    if dropped:
+        dropped_items = "\n".join([f" {k}: {v}" for k, v in _sorted_drop_reasons(dropped).items()])
+        warnings.warn(f"Metadata entries were dropped:\n{dropped_items}")
+
+    return output
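With the arguments now keyword-only, callers must name return_dropped and return_numpy. A small sketch of the documented inconsistent-key behavior; the expected output is inferred from the docstring example above:

    from dataeval.utils.data.metadata import merge

    metadata = [
        {"time": 1.2, "altitude": 100},
        {"time": 3.4, "altitude": 250, "temperature": 21.0},  # extra key in one entry
    ]
    merged, dropped = merge(metadata, return_dropped=True)
    print(dropped)                 # expected: {'temperature': ['inconsistent_key']}
    print(merged["_image_index"])  # expected: [0, 1], one row per image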
dataeval/utils/datasets/__init__.py CHANGED
@@ -1,5 +1,6 @@
 """Provides access to common Computer Vision datasets."""
 
+from dataeval.utils.datasets._antiuav import AntiUAVDetection
 from dataeval.utils.datasets._cifar10 import CIFAR10
 from dataeval.utils.datasets._milco import MILCO
 from dataeval.utils.datasets._mnist import MNIST
@@ -10,6 +11,7 @@ __all__ = [
     "MNIST",
     "Ships",
     "CIFAR10",
+    "AntiUAVDetection",
     "MILCO",
     "VOCDetection",
     "VOCDetectionTorch",