PyPI - dataeval - Versions diffs - 0.85.0__py3-none-any.whl → 0.86.1__py3-none-any.whl - Mend

dataeval 0.85.0__py3-none-any.whl → 0.86.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (66) hide show

dataeval/__init__.py +1 -1
dataeval/_log.py +1 -1
dataeval/config.py +21 -4
dataeval/data/_embeddings.py +2 -2
dataeval/data/_images.py +2 -3
dataeval/data/_metadata.py +65 -42
dataeval/data/_selection.py +2 -3
dataeval/data/_split.py +2 -3
dataeval/data/_targets.py +17 -13
dataeval/data/selections/_classfilter.py +6 -8
dataeval/data/selections/_prioritize.py +6 -9
dataeval/data/selections/_shuffle.py +3 -1
dataeval/detectors/drift/__init__.py +4 -1
dataeval/detectors/drift/_base.py +4 -5
dataeval/detectors/drift/_mmd.py +3 -6
dataeval/detectors/drift/_mvdc.py +92 -0
dataeval/detectors/drift/_nml/__init__.py +6 -0
dataeval/detectors/drift/_nml/_base.py +70 -0
dataeval/detectors/drift/_nml/_chunk.py +396 -0
dataeval/detectors/drift/_nml/_domainclassifier.py +181 -0
dataeval/detectors/drift/_nml/_result.py +97 -0
dataeval/detectors/drift/_nml/_thresholds.py +269 -0
dataeval/detectors/linters/outliers.py +7 -7
dataeval/metrics/bias/_parity.py +10 -13
dataeval/metrics/estimators/_divergence.py +2 -4
dataeval/metrics/stats/_base.py +103 -42
dataeval/metrics/stats/_boxratiostats.py +21 -19
dataeval/metrics/stats/_dimensionstats.py +14 -10
dataeval/metrics/stats/_hashstats.py +1 -1
dataeval/metrics/stats/_pixelstats.py +6 -6
dataeval/metrics/stats/_visualstats.py +3 -3
dataeval/outputs/__init__.py +2 -1
dataeval/outputs/_base.py +22 -7
dataeval/outputs/_bias.py +27 -31
dataeval/outputs/_drift.py +60 -0
dataeval/outputs/_linters.py +12 -17
dataeval/outputs/_stats.py +83 -29
dataeval/outputs/_workflows.py +2 -2
dataeval/utils/_array.py +6 -9
dataeval/utils/_bin.py +1 -2
dataeval/utils/_clusterer.py +7 -4
dataeval/utils/_fast_mst.py +27 -13
dataeval/utils/_image.py +65 -11
dataeval/utils/_mst.py +1 -3
dataeval/utils/_plot.py +15 -10
dataeval/utils/data/_dataset.py +32 -20
dataeval/utils/data/metadata.py +104 -82
dataeval/utils/datasets/__init__.py +2 -0
dataeval/utils/datasets/_antiuav.py +189 -0
dataeval/utils/datasets/_base.py +11 -8
dataeval/utils/datasets/_cifar10.py +104 -45
dataeval/utils/datasets/_fileio.py +21 -47
dataeval/utils/datasets/_milco.py +19 -11
dataeval/utils/datasets/_mixin.py +2 -4
dataeval/utils/datasets/_mnist.py +3 -4
dataeval/utils/datasets/_ships.py +14 -7
dataeval/utils/datasets/_voc.py +229 -42
dataeval/utils/torch/models.py +5 -10
dataeval/utils/torch/trainer.py +3 -3
dataeval/workflows/sufficiency.py +2 -2
{dataeval-0.85.0.dist-info → dataeval-0.86.1.dist-info}/METADATA +3 -2
dataeval-0.86.1.dist-info/RECORD +114 -0
dataeval/detectors/ood/vae.py +0 -74
dataeval-0.85.0.dist-info/RECORD +0 -107
{dataeval-0.85.0.dist-info → dataeval-0.86.1.dist-info}/LICENSE.txt +0 -0
{dataeval-0.85.0.dist-info → dataeval-0.86.1.dist-info}/WHEEL +0 -0

dataeval/metrics/stats/_base.py CHANGED Viewed

@@ -10,23 +10,86 @@ from copy import deepcopy
 from dataclasses import dataclass
 from functools import partial
 from multiprocessing import Pool
-from typing import Any, Callable, Generic, Iterable, Sequence, TypeVar
+from typing import Any, Callable, Generic, Iterable, Iterator, Sequence, TypeVar
 import numpy as np
 import tqdm
 from numpy.typing import NDArray
 from dataeval.config import get_max_processes
-from dataeval.outputs._stats import BaseStatsOutput, SourceIndex
+from dataeval.outputs._stats import BASE_ATTRS, BaseStatsOutput, SourceIndex
 from dataeval.typing import Array, ArrayLike, Dataset, ObjectDetectionTarget
 from dataeval.utils._array import as_numpy, to_numpy
-from dataeval.utils._image import normalize_image_shape, rescale
+from dataeval.utils._image import clip_and_pad, clip_box, is_valid_box, normalize_image_shape, rescale
 DTYPE_REGEX = re.compile(r"NDArray\[np\.(.*?)\]")
-BoundingBox = tuple[float, float, float, float]
 TStatsOutput = TypeVar("TStatsOutput", bound=BaseStatsOutput, covariant=True)
+_S = TypeVar("_S")
+_T = TypeVar("_T")
+@dataclass
+class BoundingBox:
+    x0: float
+    y0: float
+    x1: float
+    y1: float
+    def __post_init__(self) -> None:
+        # Test for invalid coordinates
+        x_swap = self.x0 > self.x1
+        y_swap = self.y0 > self.y1
+        if x_swap or y_swap:
+            warnings.warn(f"Invalid bounding box coordinates: {self} - swapping invalid coordinates.")
+            if x_swap:
+                self.x0, self.x1 = self.x1, self.x0
+            if y_swap:
+                self.y0, self.y1 = self.y1, self.y0
+    @property
+    def width(self) -> float:
+        return self.x1 - self.x0
+    @property
+    def height(self) -> float:
+        return self.y1 - self.y0
+    def to_int(self) -> tuple[int, int, int, int]:
+        """
+        Returns the bounding box as a tuple of integers.
+        """
+        x0_int = math.floor(self.x0)
+        y0_int = math.floor(self.y0)
+        x1_int = math.ceil(self.x1)
+        y1_int = math.ceil(self.y1)
+        return x0_int, y0_int, x1_int, y1_int
+class PoolWrapper:
+    """
+    Wraps `multiprocessing.Pool` to allow for easy switching between
+    multiprocessing and single-threaded execution.
+    This helps with debugging and profiling, as well as usage with Jupyter notebooks
+    in VS Code, which does not support subprocess debugging.
+    """
+    def __init__(self, processes: int | None) -> None:
+        self.pool = Pool(processes) if processes is not None and processes > 1 else None
+    def imap(self, func: Callable[[_S], _T], iterable: Iterable[_S]) -> Iterator[_T]:
+        return map(func, iterable) if self.pool is None else self.pool.imap(func, iterable)
+    def __enter__(self, *args: Any, **kwargs: Any) -> PoolWrapper:
+        return self
+    def __exit__(self, *args: Any) -> None:
+        if self.pool is not None:
+            self.pool.close()
+            self.pool.join()
 class StatsProcessor(Generic[TStatsOutput]):
     output_class: type[TStatsOutput]
@@ -34,32 +97,26 @@ class StatsProcessor(Generic[TStatsOutput]):
     image_function_map: dict[str, Callable[[StatsProcessor[TStatsOutput]], Any]] = {}
     channel_function_map: dict[str, Callable[[StatsProcessor[TStatsOutput]], Any]] = {}
-    def __init__(self, image: NDArray[Any], box: BoundingBox | None, per_channel: bool) -> None:
+    def __init__(self, image: NDArray[Any], box: BoundingBox | Iterable[Any] | None, per_channel: bool) -> None:
         self.raw = image
         self.width: int = image.shape[-1]
         self.height: int = image.shape[-2]
-        box = BoundingBox((0, 0, self.width, self.height)) if box is None else box
-        # Clip the bounding box to image
-        x0, y0 = (min(j, max(0, math.floor(box[i]))) for i, j in zip((0, 1), (self.width - 1, self.height - 1)))
-        x1, y1 = (min(j, max(1, math.ceil(box[i]))) for i, j in zip((2, 3), (self.width, self.height)))
-        self.box: NDArray[np.int64] = np.array([x0, y0, x1, y1], dtype=np.int64)
+        box = (0, 0, self.width, self.height) if box is None else box
+        self.box = box if isinstance(box, BoundingBox) else BoundingBox(*box)
         self._per_channel = per_channel
         self._image = None
         self._shape = None
         self._scaled = None
         self._cache = {}
         self._fn_map = self.channel_function_map if per_channel else self.image_function_map
-        self._is_valid_slice = box is None or bool(
-            box[0] >= 0 and box[1] >= 0 and box[2] <= image.shape[-1] and box[3] <= image.shape[-2]
-        )
+        self._is_valid_box = is_valid_box(clip_box(image, self.box.to_int()))
     def get(self, fn_key: str) -> NDArray[Any]:
         if fn_key in self.cache_keys:
             if fn_key not in self._cache:
                 self._cache[fn_key] = self._fn_map[fn_key](self)
             return self._cache[fn_key]
-        else:
-            return self._fn_map[fn_key](self)
+        return self._fn_map[fn_key](self)
     def process(self) -> dict[str, Any]:
         return {k: self._fn_map[k](self) for k in self._fn_map}
@@ -67,11 +124,7 @@ class StatsProcessor(Generic[TStatsOutput]):
     @property
     def image(self) -> NDArray[Any]:
         if self._image is None:
-            if self._is_valid_slice:
-                norm = normalize_image_shape(self.raw)
-                self._image = norm[:, self.box[1] : self.box[3], self.box[0] : self.box[2]]
-            else:
-                self._image = np.zeros((self.raw.shape[0], self.box[3] - self.box[1], self.box[2] - self.box[0]))
+            self._image = clip_and_pad(normalize_image_shape(self.raw), self.box.to_int())
         return self._image
     @property
@@ -90,9 +143,9 @@ class StatsProcessor(Generic[TStatsOutput]):
     @classmethod
     def convert_output(
-        cls, source: dict[str, Any], source_index: list[SourceIndex], box_count: list[int]
+        cls, source: dict[str, Any], source_index: list[SourceIndex], object_count: list[int], image_count: int
     ) -> TStatsOutput:
-        output = {}
+        output: dict[str, Any] = {}
         attrs = dict(ChainMap(*(getattr(c, "__annotations__", {}) for c in cls.output_class.__mro__)))
         for key in (key for key in source if key in attrs):
             stat_type: str = attrs[key]
@@ -101,14 +154,17 @@ class StatsProcessor(Generic[TStatsOutput]):
                 output[key] = np.asarray(source[key], dtype=np.dtype(dtype_match.group(1)))
             else:
                 output[key] = source[key]
-        return cls.output_class(**output, source_index=source_index, box_count=np.asarray(box_count, dtype=np.uint16))
+        base_attrs: dict[str, Any] = dict(
+            zip(BASE_ATTRS, (source_index, np.asarray(object_count, dtype=np.uint16), image_count))
+        )
+        return cls.output_class(**output, **base_attrs)
 @dataclass
 class StatsProcessorOutput:
     results: list[dict[str, Any]]
     source_indices: list[SourceIndex]
-    box_counts: list[int]
+    object_counts: list[int]
     warnings_list: list[str]
@@ -119,18 +175,18 @@ def process_stats(
     per_channel: bool,
     stats_processor_cls: Iterable[type[StatsProcessor[TStatsOutput]]],
 ) -> StatsProcessorOutput:
-    image = to_numpy(image)
+    np_image = to_numpy(image)
     results_list: list[dict[str, Any]] = []
     source_indices: list[SourceIndex] = []
     box_counts: list[int] = []
     warnings_list: list[str] = []
     for i_b, box in [(None, None)] if boxes is None else enumerate(boxes):
-        processor_list = [p(image, box, per_channel) for p in stats_processor_cls]
-        if any(not p._is_valid_slice for p in processor_list) and i_b is not None and box is not None:
-            warnings_list.append(f"Bounding box [{i}][{i_b}]: {box} is out of bounds of {image.shape}.")
+        processor_list = [p(np_image, box, per_channel) for p in stats_processor_cls]
+        if any(not p._is_valid_box for p in processor_list) and i_b is not None and box is not None:
+            warnings_list.append(f"Bounding box [{i}][{i_b}]: {box} for image shape {np_image.shape} is invalid.")
         results_list.append({k: v for p in processor_list for k, v in p.process().items()})
         if per_channel:
-            source_indices.extend([SourceIndex(i, i_b, c) for c in range(image.shape[-3])])
+            source_indices.extend([SourceIndex(i, i_b, c) for c in range(np_image.shape[-3])])
         else:
             source_indices.append(SourceIndex(i, i_b, None))
     box_counts.append(0 if boxes is None else len(boxes))
@@ -145,13 +201,18 @@ def process_stats_unpack(
     return process_stats(*args, per_channel=per_channel, stats_processor_cls=stats_processor_cls)
-def _enumerate(dataset: Dataset[ArrayLike] | Dataset[tuple[ArrayLike, Any, Any]], per_box: bool):
+def _enumerate(
+    dataset: Dataset[ArrayLike] | Dataset[tuple[ArrayLike, Any, Any]], per_box: bool
+) -> Iterator[tuple[int, ArrayLike, Any]]:
     for i in range(len(dataset)):
         d = dataset[i]
         image = d[0] if isinstance(d, tuple) else d
         if per_box and isinstance(d, tuple) and isinstance(d[1], ObjectDetectionTarget):
-            boxes = d[1].boxes if isinstance(d[1].boxes, Array) else as_numpy(d[1].boxes)
-            target = [BoundingBox(float(box[i]) for i in range(4)) for box in boxes]
+            try:
+                boxes = d[1].boxes if isinstance(d[1].boxes, Array) else as_numpy(d[1].boxes)
+                target = [BoundingBox(*(float(box[i]) for i in range(4))) for box in boxes]
+            except (ValueError, IndexError):
+                raise ValueError(f"Invalid bounding box format for image {i}: {d[1].boxes}")
         else:
             target = None
@@ -199,12 +260,13 @@ def run_stats(
     """
     results_list: list[dict[str, NDArray[np.float64]]] = []
     source_index: list[SourceIndex] = []
-    box_count: list[int] = []
+    object_count: list[int] = []
+    image_count: int = len(dataset)
     warning_list = []
     stats_processor_cls = stats_processor_cls if isinstance(stats_processor_cls, Iterable) else [stats_processor_cls]
-    with Pool(processes=get_max_processes()) as p:
+    with PoolWrapper(processes=get_max_processes()) as p:
         for r in tqdm.tqdm(
             p.imap(
                 partial(
@@ -214,14 +276,12 @@ def run_stats(
                 ),
                 _enumerate(dataset, per_box),
             ),
-            total=len(dataset),
+            total=image_count,
         ):
             results_list.extend(r.results)
             source_index.extend(r.source_indices)
-            box_count.extend(r.box_counts)
+            object_count.extend(r.object_counts)
             warning_list.extend(r.warnings_list)
-    p.close()
-    p.join()
     # warnings are not emitted while in multiprocessing pools so we emit after gathering all warnings
     for w in warning_list:
@@ -235,8 +295,7 @@ def run_stats(
             else:
                 output.setdefault(stat, []).append(result.tolist() if isinstance(result, np.ndarray) else result)
-    outputs = [s.convert_output(output, source_index, box_count) for s in stats_processor_cls]
-    return outputs
+    return [s.convert_output(output, source_index, object_count, image_count) for s in stats_processor_cls]
 def add_stats(a: TStatsOutput, b: TStatsOutput) -> TStatsOutput:
@@ -246,10 +305,12 @@ def add_stats(a: TStatsOutput, b: TStatsOutput) -> TStatsOutput:
     sum_dict = deepcopy(a.data())
     for k in sum_dict:
-        if isinstance(sum_dict[k], list):
+        if isinstance(sum_dict[k], Sequence):
             sum_dict[k].extend(b.data()[k])
-        else:
+        elif isinstance(sum_dict[k], Array):
             sum_dict[k] = np.concatenate((sum_dict[k], b.data()[k]))
+        else:
+            sum_dict[k] += b.data()[k]
     return type(a)(**sum_dict)

dataeval/metrics/stats/_boxratiostats.py CHANGED Viewed

@@ -8,8 +8,9 @@ from typing import Any, Callable, Generic, TypeVar, cast
 import numpy as np
 from numpy.typing import NDArray
+from dataeval.config import EPSILON
 from dataeval.outputs._base import set_metadata
-from dataeval.outputs._stats import BOX_COUNT, SOURCE_INDEX, BaseStatsOutput, DimensionStatsOutput
+from dataeval.outputs._stats import BASE_ATTRS, BaseStatsOutput
 TStatOutput = TypeVar("TStatOutput", bound=BaseStatsOutput, contravariant=True)
 ArraySlice = tuple[int, int]
@@ -40,15 +41,19 @@ class BoxImageStatsOutputSlice(Generic[TStatOutput]):
         self.img = self.StatSlicer(img_stats, img_slice)
-RATIOSTATS_OVERRIDE_MAP: dict[type, dict[str, Callable[..., NDArray[Any]]]] = {
-    DimensionStatsOutput: dict[str, Callable[[BoxImageStatsOutputSlice[DimensionStatsOutput]], NDArray[Any]]](
-        {
-            "left": lambda x: x.box["left"] / x.img["width"],
-            "top": lambda x: x.box["top"] / x.img["height"],
-            "channels": lambda x: x.box["channels"],
-            "depth": lambda x: x.box["depth"],
-            "distance": lambda x: x.box["distance"],
-        }
+RATIOSTATS_OVERRIDE_MAP: dict[str, Callable[[BoxImageStatsOutputSlice[Any]], NDArray[Any]]] = {
+    "offset_x": lambda x: x.box["offset_x"] / x.img["width"],
+    "offset_y": lambda x: x.box["offset_y"] / x.img["height"],
+    "channels": lambda x: x.box["channels"],
+    "depth": lambda x: x.box["depth"],
+    "distance_center": lambda x: x.box["distance_center"]
+    / (np.sqrt(np.square(x.img["width"]) + np.square(x.img["height"])) / 2),
+    "distance_edge": lambda x: x.box["distance_edge"]
+    / (
+        x.img["width"]
+        if np.min([np.abs(x.box["offset_x"]), np.abs((x.box["width"] + x.box["offset_x"]) - x.img["width"])])
+        < np.min([np.abs(x.box["offset_y"]), np.abs((x.box["height"] + x.box["offset_y"]) - x.img["height"])])
+        else x.img["height"]
     ),
 }
@@ -69,11 +74,9 @@ def calculate_ratios(key: str, box_stats: BaseStatsOutput, img_stats: BaseStatsO
     stats = getattr(box_stats, key)
-    # Copy over stats index maps and box counts
-    if key in (SOURCE_INDEX):
+    # Copy over base attributes
+    if key in BASE_ATTRS:
         return copy.deepcopy(stats)
-    elif key == BOX_COUNT:
-        return np.copy(stats)
     # Calculate ratios for each stat
     out_stats: np.ndarray = np.copy(stats).astype(np.float64)
@@ -84,10 +87,9 @@ def calculate_ratios(key: str, box_stats: BaseStatsOutput, img_stats: BaseStatsO
         box_j = len(box_stats) if i == len(box_map) - 1 else box_map[i + 1]
         img_j = len(img_stats) if i == len(img_map) - 1 else img_map[i + 1]
         stats = BoxImageStatsOutputSlice(box_stats, (box_i, box_j), img_stats, (img_i, img_j))
-        out_type = type(box_stats)
-        use_override = out_type in RATIOSTATS_OVERRIDE_MAP and key in RATIOSTATS_OVERRIDE_MAP[out_type]
+        use_override = key in RATIOSTATS_OVERRIDE_MAP
         with np.errstate(divide="ignore", invalid="ignore"):
-            ratio = RATIOSTATS_OVERRIDE_MAP[out_type][key](stats) if use_override else stats.box[key] / stats.img[key]
+            ratio = RATIOSTATS_OVERRIDE_MAP[key](stats) if use_override else stats.box[key] / (stats.img[key] + EPSILON)
         out_stats[box_i:box_j] = ratio.reshape(-1, *out_stats[box_i].shape)
     return out_stats
@@ -141,8 +143,8 @@ def boxratiostats(
     output_cls = type(boxstats)
     if type(boxstats) is not type(imgstats):
         raise TypeError("Must provide stats outputs of the same type.")
-    if boxstats.source_index[-1].image != imgstats.source_index[-1].image:
-        raise ValueError("Stats index_map length mismatch. Check if the correct box and image stats were provided.")
+    if boxstats.image_count != imgstats.image_count:
+        raise ValueError("Stats image count length mismatch. Check if the correct box and image stats were provided.")
     if any(src_idx.box is None for src_idx in boxstats.source_index):
         raise ValueError("Input for boxstats must contain box information.")
     if any(src_idx.box is not None for src_idx in imgstats.source_index):

dataeval/metrics/stats/_dimensionstats.py CHANGED Viewed

@@ -6,6 +6,7 @@ from typing import Any, Callable
 import numpy as np
+from dataeval.config import EPSILON
 from dataeval.metrics.stats._base import StatsProcessor, run_stats
 from dataeval.outputs import DimensionStatsOutput
 from dataeval.outputs._base import set_metadata
@@ -16,18 +17,21 @@ from dataeval.utils._image import get_bitdepth
 class DimensionStatsProcessor(StatsProcessor[DimensionStatsOutput]):
     output_class: type = DimensionStatsOutput
     image_function_map: dict[str, Callable[[StatsProcessor[DimensionStatsOutput]], Any]] = {
-        "left": lambda x: x.box[0],
-        "top": lambda x: x.box[1],
-        "width": lambda x: x.box[2] - x.box[0],
-        "height": lambda x: x.box[3] - x.box[1],
+        "offset_x": lambda x: x.box.x0,
+        "offset_y": lambda x: x.box.y0,
+        "width": lambda x: x.box.width,
+        "height": lambda x: x.box.height,
         "channels": lambda x: x.shape[-3],
-        "size": lambda x: (x.box[2] - x.box[0]) * (x.box[3] - x.box[1]),
-        "aspect_ratio": lambda x: (x.box[2] - x.box[0]) / (x.box[3] - x.box[1]),
+        "size": lambda x: x.box.width * x.box.height,
+        "aspect_ratio": lambda x: x.box.width / (x.box.height + EPSILON),
         "depth": lambda x: get_bitdepth(x.image).depth,
-        "center": lambda x: np.asarray([(x.box[0] + x.box[2]) / 2, (x.box[1] + x.box[3]) / 2]),
-        "distance": lambda x: np.sqrt(
-            np.square(((x.box[0] + x.box[2]) / 2) - (x.shape[-1] / 2))
-            + np.square(((x.box[1] + x.box[3]) / 2) - (x.shape[-2] / 2))
+        "center": lambda x: np.asarray([(x.box.x0 + x.box.x1) / 2, (x.box.y0 + x.box.y1) / 2]),
+        "distance_center": lambda x: np.sqrt(
+            np.square(((x.box.x0 + x.box.x1) / 2) - (x.raw.shape[-1] / 2))
+            + np.square(((x.box.y0 + x.box.y1) / 2) - (x.raw.shape[-2] / 2))
+        ),
+        "distance_edge": lambda x: np.min(
+            [np.abs(x.box.x0), np.abs(x.box.y0), np.abs(x.box.x1 - x.raw.shape[-1]), np.abs(x.box.y1 - x.raw.shape[-2])]
         ),
     }

dataeval/metrics/stats/_hashstats.py CHANGED Viewed

@@ -137,7 +137,7 @@ def hashstats(
     >>> results = hashstats(dataset)
     >>> print(results.xxhash[:5])
-    ['66a93f556577c086', 'd8b686fb405c4105', '7ffdb4990ad44ac6', '42cd4c34c80f6006', 'c5519e36ac1f8839']
+    ['69b50a5f06af238c', '5a861d7a23d1afe7', '7ffdb4990ad44ac6', '4f0c366a3298ceac', 'c5519e36ac1f8839']
     >>> print(results.pchash[:5])
     ['e666999999266666', 'e666999999266666', 'e666999966666299', 'e666999999266666', '96e91656e91616e9']
     """

dataeval/metrics/stats/_pixelstats.py CHANGED Viewed

@@ -16,18 +16,18 @@ from dataeval.typing import ArrayLike, Dataset
 class PixelStatsProcessor(StatsProcessor[PixelStatsOutput]):
     output_class: type = PixelStatsOutput
     image_function_map: dict[str, Callable[[StatsProcessor[PixelStatsOutput]], Any]] = {
-        "mean": lambda x: np.mean(x.scaled),
-        "std": lambda x: np.std(x.scaled),
-        "var": lambda x: np.var(x.scaled),
+        "mean": lambda x: np.nanmean(x.scaled),
+        "std": lambda x: np.nanstd(x.scaled),
+        "var": lambda x: np.nanvar(x.scaled),
         "skew": lambda x: np.nan_to_num(skew(x.scaled.ravel())),
         "kurtosis": lambda x: np.nan_to_num(kurtosis(x.scaled.ravel())),
         "histogram": lambda x: np.histogram(x.scaled, 256, (0, 1))[0],
         "entropy": lambda x: entropy(x.get("histogram")),
     }
     channel_function_map: dict[str, Callable[[StatsProcessor[PixelStatsOutput]], Any]] = {
-        "mean": lambda x: np.mean(x.scaled, axis=1),
-        "std": lambda x: np.std(x.scaled, axis=1),
-        "var": lambda x: np.var(x.scaled, axis=1),
+        "mean": lambda x: np.nanmean(x.scaled, axis=1),
+        "std": lambda x: np.nanstd(x.scaled, axis=1),
+        "var": lambda x: np.nanvar(x.scaled, axis=1),
         "skew": lambda x: np.nan_to_num(skew(x.scaled, axis=1)),
         "kurtosis": lambda x: np.nan_to_num(kurtosis(x.scaled, axis=1)),
         "histogram": lambda x: np.apply_along_axis(lambda y: np.histogram(y, 256, (0, 1))[0], 1, x.scaled),

dataeval/metrics/stats/_visualstats.py CHANGED Viewed

@@ -24,8 +24,8 @@ class VisualStatsProcessor(StatsProcessor[VisualStatsOutput]):
         else (np.max(x.get("percentiles")) - np.min(x.get("percentiles"))) / np.mean(x.get("percentiles")),
         "darkness": lambda x: x.get("percentiles")[-2],
         "missing": lambda x: np.count_nonzero(np.isnan(np.sum(x.image, axis=0))) / np.prod(x.shape[-2:]),
-        "sharpness": lambda x: np.std(edge_filter(np.mean(x.image, axis=0))),
-        "zeros": lambda x: np.count_nonzero(np.sum(x.image, axis=0) == 0) / np.prod(x.shape[-2:]),
+        "sharpness": lambda x: np.nanstd(edge_filter(np.mean(x.image, axis=0))),
+        "zeros": lambda x: np.count_nonzero(np.nansum(x.image, axis=0) == 0) / np.prod(x.shape[-2:]),
         "percentiles": lambda x: np.nanpercentile(x.scaled, q=QUARTILES),
     }
     channel_function_map: dict[str, Callable[[StatsProcessor[VisualStatsOutput]], Any]] = {
@@ -36,7 +36,7 @@ class VisualStatsProcessor(StatsProcessor[VisualStatsOutput]):
         ),
         "darkness": lambda x: x.get("percentiles")[:, -2],
         "missing": lambda x: np.count_nonzero(np.isnan(x.image), axis=(1, 2)) / np.prod(x.shape[-2:]),
-        "sharpness": lambda x: np.std(np.vectorize(edge_filter, signature="(m,n)->(m,n)")(x.image), axis=(1, 2)),
+        "sharpness": lambda x: np.nanstd(np.vectorize(edge_filter, signature="(m,n)->(m,n)")(x.image), axis=(1, 2)),
         "zeros": lambda x: np.count_nonzero(x.image == 0, axis=(1, 2)) / np.prod(x.shape[-2:]),
         "percentiles": lambda x: np.nanpercentile(x.scaled, q=QUARTILES, axis=1).T,
     }

dataeval/outputs/__init__.py CHANGED Viewed

@@ -5,7 +5,7 @@ as well as runtime metadata for reproducibility and logging.
 from ._base import ExecutionMetadata
 from ._bias import BalanceOutput, CompletenessOutput, CoverageOutput, DiversityOutput, LabelParityOutput, ParityOutput
-from ._drift import DriftMMDOutput, DriftOutput
+from ._drift import DriftMMDOutput, DriftMVDCOutput, DriftOutput
 from ._estimators import BEROutput, ClustererOutput, DivergenceOutput, UAPOutput
 from ._linters import DuplicatesOutput, OutliersOutput
 from ._metadata import MetadataDistanceOutput, MetadataDistanceValues, MostDeviatedFactorsOutput, OODPredictorOutput
@@ -34,6 +34,7 @@ __all__ = [
     "DivergenceOutput",
     "DiversityOutput",
     "DriftMMDOutput",
+    "DriftMVDCOutput",
     "DriftOutput",
     "DuplicatesOutput",
     "ExecutionMetadata",

dataeval/outputs/_base.py CHANGED Viewed

@@ -66,25 +66,40 @@ class GenericOutput(Generic[T]):
     def meta(self) -> ExecutionMetadata:
         """
         Metadata about the execution of the function or method for the Output class.
+        Returns
+        -------
+        ExecutionMetadata
         """
         return self._meta or ExecutionMetadata.empty()
 class Output(GenericOutput[dict[str, Any]]):
     def data(self) -> dict[str, Any]:
-        return {k: v for k, v in self.__dict__.items() if k != "_meta"}
+        """
+        The output data as a dictionary.
-    def __repr__(self) -> str:
-        return str(self)
+        Returns
+        -------
+        dict[str, Any]
+        """
+        return {k: v for k, v in self.__dict__.items() if k != "_meta"}
     def __str__(self) -> str:
-        return f"{self.__class__.__name__}({', '.join([f'{k}={v}' for k, v in self.data().items()])})"
+        return str(self.data())
 class BaseCollectionMixin(Collection[Any]):
     __slots__ = ["_data"]
     def data(self) -> Any:
+        """
+        The output data as a collection.
+        Returns
+        -------
+        Collection
+        """
         return self._data
     def __len__(self) -> int:
@@ -102,7 +117,7 @@ TValue = TypeVar("TValue")
 class MappingOutput(Mapping[TKey, TValue], BaseCollectionMixin, GenericOutput[Mapping[TKey, TValue]]):
-    def __init__(self, data: Mapping[TKey, TValue]):
+    def __init__(self, data: Mapping[TKey, TValue]) -> None:
         self._data = data
     def __getitem__(self, key: TKey) -> TValue:
@@ -113,7 +128,7 @@ class MappingOutput(Mapping[TKey, TValue], BaseCollectionMixin, GenericOutput[Ma
 class SequenceOutput(Sequence[TValue], BaseCollectionMixin, GenericOutput[Sequence[TValue]]):
-    def __init__(self, data: Sequence[TValue]):
+    def __init__(self, data: Sequence[TValue]) -> None:
         self._data = data
     @overload
@@ -140,7 +155,7 @@ def set_metadata(fn: Callable[P, R] | None = None, *, state: list[str] | None =
     @wraps(fn)
     def wrapper(*args: P.args, **kwargs: P.kwargs) -> R:
-        def fmt(v):
+        def fmt(v: Any) -> Any:
             if np.isscalar(v):
                 return v
             if hasattr(v, "shape"):

dataeval/outputs/_bias.py CHANGED Viewed

@@ -7,10 +7,10 @@ from dataclasses import asdict, dataclass
 from typing import Any, Literal, TypeVar, overload
 import numpy as np
+import pandas as pd
 from numpy.typing import NDArray
 with contextlib.suppress(ImportError):
-    import pandas as pd
     from matplotlib.figure import Figure
 from dataeval.data._images import Images
@@ -38,8 +38,6 @@ class ToDataFrameMixin:
         -----
         This method requires `pandas <https://pandas.pydata.org/>`_ to be installed.
         """
-        import pandas as pd
         return pd.DataFrame(
             index=self.factor_names,  # type: ignore - list[str] is documented as acceptable index type
             data={
@@ -130,33 +128,30 @@ class CoverageOutput(Output):
         import matplotlib.pyplot as plt
+        images = Images(images) if isinstance(images, Dataset) else images
+        if np.max(self.uncovered_indices) > len(images):
+            raise ValueError(
+                f"Uncovered indices {self.uncovered_indices} specify images "
+                f"unavailable in the provided number of images {len(images)}."
+            )
         # Determine which images to plot
         selected_indices = self.uncovered_indices[:top_k]
-        images = Images(images) if isinstance(images, Dataset) else images
         # Plot the images
         num_images = min(top_k, len(selected_indices))
         rows = int(np.ceil(num_images / 3))
-        fig, axs = plt.subplots(rows, 3, figsize=(9, 3 * rows))
-        if rows == 1:
-            for j in range(3):
-                if j >= len(selected_indices):
-                    continue
-                image = channels_first_to_last(as_numpy(images[selected_indices[j]]))
-                axs[j].imshow(image)
-                axs[j].axis("off")
-        else:
-            for i in range(rows):
-                for j in range(3):
-                    i_j = i * 3 + j
-                    if i_j >= len(selected_indices):
-                        continue
-                    image = channels_first_to_last(as_numpy(images[selected_indices[i_j]]))
-                    axs[i, j].imshow(image)
-                    axs[i, j].axis("off")
+        cols = min(3, num_images)
+        fig, axs = plt.subplots(rows, cols, figsize=(3 * cols, 3 * rows))
+        for image, ax in zip(images[:num_images], axs.flat):
+            image = channels_first_to_last(as_numpy(image))
+            ax.imshow(image)
+            ax.axis("off")
+        for ax in axs.flat[num_images:]:
+            ax.axis("off")
         fig.tight_layout()
         return fig
@@ -235,14 +230,15 @@ class BalanceOutput(Output):
         # return the masked attribute
         if attr == "factor_names":
             return [x.replace(f"-{factor_type}", "") for x in self.factor_names if mask_lambda(x)]
-        else:
-            factor_type_mask = np.asarray([mask_lambda(x) for x in self.factor_names])
-            if attr == "factors":
-                return self.factors[factor_type_mask[1:]][:, factor_type_mask[1:]]
-            elif attr == "balance":
-                return self.balance[factor_type_mask]
-            elif attr == "classwise":
-                return self.classwise[:, factor_type_mask]
+        factor_type_mask = np.asarray([mask_lambda(x) for x in self.factor_names])
+        if attr == "factors":
+            return self.factors[factor_type_mask[1:]][:, factor_type_mask[1:]]
+        if attr == "balance":
+            return self.balance[factor_type_mask]
+        if attr == "classwise":
+            return self.classwise[:, factor_type_mask]
+        raise ValueError(f"Unknown attr {attr} specified.")
     def plot(
         self,

dataeval 0.85.0__py3-none-any.whl → 0.86.1__py3-none-any.whl

dataeval 0.85.0__py3-none-any.whl → 0.86.1__py3-none-any.whl