PyPI - dataeval - Versions diffs - 0.85.0__py3-none-any.whl → 0.86.1__py3-none-any.whl - Mend

dataeval 0.85.0__py3-none-any.whl → 0.86.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (66) hide show

dataeval/__init__.py +1 -1
dataeval/_log.py +1 -1
dataeval/config.py +21 -4
dataeval/data/_embeddings.py +2 -2
dataeval/data/_images.py +2 -3
dataeval/data/_metadata.py +65 -42
dataeval/data/_selection.py +2 -3
dataeval/data/_split.py +2 -3
dataeval/data/_targets.py +17 -13
dataeval/data/selections/_classfilter.py +6 -8
dataeval/data/selections/_prioritize.py +6 -9
dataeval/data/selections/_shuffle.py +3 -1
dataeval/detectors/drift/__init__.py +4 -1
dataeval/detectors/drift/_base.py +4 -5
dataeval/detectors/drift/_mmd.py +3 -6
dataeval/detectors/drift/_mvdc.py +92 -0
dataeval/detectors/drift/_nml/__init__.py +6 -0
dataeval/detectors/drift/_nml/_base.py +70 -0
dataeval/detectors/drift/_nml/_chunk.py +396 -0
dataeval/detectors/drift/_nml/_domainclassifier.py +181 -0
dataeval/detectors/drift/_nml/_result.py +97 -0
dataeval/detectors/drift/_nml/_thresholds.py +269 -0
dataeval/detectors/linters/outliers.py +7 -7
dataeval/metrics/bias/_parity.py +10 -13
dataeval/metrics/estimators/_divergence.py +2 -4
dataeval/metrics/stats/_base.py +103 -42
dataeval/metrics/stats/_boxratiostats.py +21 -19
dataeval/metrics/stats/_dimensionstats.py +14 -10
dataeval/metrics/stats/_hashstats.py +1 -1
dataeval/metrics/stats/_pixelstats.py +6 -6
dataeval/metrics/stats/_visualstats.py +3 -3
dataeval/outputs/__init__.py +2 -1
dataeval/outputs/_base.py +22 -7
dataeval/outputs/_bias.py +27 -31
dataeval/outputs/_drift.py +60 -0
dataeval/outputs/_linters.py +12 -17
dataeval/outputs/_stats.py +83 -29
dataeval/outputs/_workflows.py +2 -2
dataeval/utils/_array.py +6 -9
dataeval/utils/_bin.py +1 -2
dataeval/utils/_clusterer.py +7 -4
dataeval/utils/_fast_mst.py +27 -13
dataeval/utils/_image.py +65 -11
dataeval/utils/_mst.py +1 -3
dataeval/utils/_plot.py +15 -10
dataeval/utils/data/_dataset.py +32 -20
dataeval/utils/data/metadata.py +104 -82
dataeval/utils/datasets/__init__.py +2 -0
dataeval/utils/datasets/_antiuav.py +189 -0
dataeval/utils/datasets/_base.py +11 -8
dataeval/utils/datasets/_cifar10.py +104 -45
dataeval/utils/datasets/_fileio.py +21 -47
dataeval/utils/datasets/_milco.py +19 -11
dataeval/utils/datasets/_mixin.py +2 -4
dataeval/utils/datasets/_mnist.py +3 -4
dataeval/utils/datasets/_ships.py +14 -7
dataeval/utils/datasets/_voc.py +229 -42
dataeval/utils/torch/models.py +5 -10
dataeval/utils/torch/trainer.py +3 -3
dataeval/workflows/sufficiency.py +2 -2
{dataeval-0.85.0.dist-info → dataeval-0.86.1.dist-info}/METADATA +3 -2
dataeval-0.86.1.dist-info/RECORD +114 -0
dataeval/detectors/ood/vae.py +0 -74
dataeval-0.85.0.dist-info/RECORD +0 -107
{dataeval-0.85.0.dist-info → dataeval-0.86.1.dist-info}/LICENSE.txt +0 -0
{dataeval-0.85.0.dist-info → dataeval-0.86.1.dist-info}/WHEEL +0 -0

dataeval/__init__.py CHANGED Viewed

@@ -8,7 +8,7 @@ shifts that impact performance of deployed models.
 from __future__ import annotations
 __all__ = ["config", "detectors", "log", "metrics", "typing", "utils", "workflows"]
-__version__ = "0.85.0"
+__version__ = "0.86.1"
 import logging

dataeval/_log.py CHANGED Viewed

@@ -8,7 +8,7 @@ class LogMessage:
     Deferred message callback for logging expensive messages.
     """
-    def __init__(self, fn: Callable[..., str]):
+    def __init__(self, fn: Callable[..., str]) -> None:
         self._fn = fn
         self._str = None

dataeval/config.py CHANGED Viewed

@@ -4,10 +4,10 @@ Global configuration settings for DataEval.
 from __future__ import annotations
-__all__ = ["get_device", "set_device", "get_max_processes", "set_max_processes", "DeviceLike"]
+__all__ = ["get_device", "set_device", "get_max_processes", "set_max_processes", "use_max_processes", "DeviceLike"]
 import sys
-from typing import Union
+from typing import Any, Union
 if sys.version_info >= (3, 10):
     from typing import TypeAlias
@@ -78,8 +78,7 @@ def get_device(override: DeviceLike | None = None) -> torch.device:
     if override is None:
         global _device
         return torch.get_default_device() if _device is None else _device
-    else:
-        return _todevice(override)
+    return _todevice(override)
 def set_max_processes(processes: int | None) -> None:
@@ -112,6 +111,24 @@ def get_max_processes() -> int | None:
     return _processes
+class MaxProcessesContextManager:
+    def __init__(self, processes: int) -> None:
+        self._processes = processes
+    def __enter__(self) -> None:
+        global _processes
+        self._old = _processes
+        set_max_processes(self._processes)
+    def __exit__(self, *args: tuple[Any, ...]) -> None:
+        global _processes
+        _processes = self._old
+def use_max_processes(processes: int) -> MaxProcessesContextManager:
+    return MaxProcessesContextManager(processes)
 def set_seed(seed: int | None, all_generators: bool = False) -> None:
     """
     Sets the seed for use by classes that allow for a random state or seed.

dataeval/data/_embeddings.py CHANGED Viewed

@@ -144,8 +144,7 @@ class Embeddings:
         """
         if indices is not None:
             return torch.vstack(list(self._batch(indices))).to(self.device)
-        else:
-            return self[:]
+        return self[:]
     def to_numpy(self, indices: Sequence[int] | None = None) -> NDArray[Any]:
         """
@@ -248,6 +247,7 @@ class Embeddings:
             _logger.log(logging.DEBUG, f"Saved embeddings cache from {path}")
         except Exception as e:
             _logger.log(logging.ERROR, f"Failed to save embeddings cache: {e}")
+            raise e
     @classmethod
     def load(cls, path: Path | str) -> Embeddings:

dataeval/data/_images.py CHANGED Viewed

@@ -73,15 +73,14 @@ class Images(Generic[T]):
     def __getitem__(self, key: int | slice, /) -> Sequence[T] | T:
         if isinstance(key, slice):
             return [self._get_image(k) for k in range(len(self._dataset))[key]]
-        elif hasattr(key, "__int__"):
+        if hasattr(key, "__int__"):
             return self._get_image(int(key))
         raise TypeError(f"Key must be integers or slices, not {type(key)}")
     def _get_image(self, index: int) -> T:
         if self._is_tuple_datum:
             return cast(Dataset[tuple[T, Any, Any]], self._dataset)[index][0]
-        else:
-            return cast(Dataset[T], self._dataset)[index]
+        return cast(Dataset[T], self._dataset)[index]
     def __iter__(self) -> Iterator[T]:
         for i in range(len(self._dataset)):

dataeval/data/_metadata.py CHANGED Viewed

@@ -191,7 +191,12 @@ class Metadata:
         self._process()
         return self._image_indices
-    def _collate(self, force: bool = False):
+    @property
+    def image_count(self) -> int:
+        self._process()
+        return int(self._image_indices.max() + 1)
+    def _collate(self, force: bool = False) -> None:
         if self._collated and not force:
             return
@@ -238,7 +243,7 @@ class Metadata:
         self._class_names = [index2label.get(i, str(i)) for i in np.unique(self._class_labels)]
         self._collated = True
-    def _merge(self, force: bool = False):
+    def _merge(self, force: bool = False) -> None:
         if self._merged is not None and not force:
             return
@@ -261,48 +266,26 @@ class Metadata:
                     "Metadata dictionary needs to be a single dictionary whose values "
                     "are arraylike containing the metadata on a per image or per object basis."
                 )
-            else:
-                check_length = len(v) if check_length is None else check_length
-                if check_length != len(v):
-                    raise ValueError(
-                        "The lists/arrays in the metadata dict have varying lengths. "
-                        "Metadata requires them to be uniform in length."
-                    )
+            check_length = len(v) if check_length is None else check_length
+            if check_length != len(v):
+                raise ValueError(
+                    "The lists/arrays in the metadata dict have varying lengths. "
+                    "Metadata requires them to be uniform in length."
+                )
         if len(self._class_labels) != check_length:
             raise ValueError(
                 f"The length of the label array {len(self._class_labels)} is not the same as "
                 f"the length of the metadata arrays {check_length}."
             )
-    def _process(self, force: bool = False) -> None:
-        if self._processed and not force:
-            return
-        # Create image indices from targets
-        self._image_indices = np.arange(len(self.raw)) if self.targets.source is None else self.targets.source
-        # Validate the metadata dimensions
-        self._validate()
-        # Include specified metadata keys
-        if self.include:
-            metadata = {i: self.merged[i] for i in self.include if i in self.merged}
-            continuous_factor_bins = (
-                {i: self.continuous_factor_bins[i] for i in self.include if i in self.continuous_factor_bins}
-                if self.continuous_factor_bins
-                else {}
-            )
-        else:
-            metadata = self.merged
-            continuous_factor_bins = dict(self.continuous_factor_bins) if self.continuous_factor_bins else {}
-            for k in self.exclude:
-                metadata.pop(k, None)
-                continuous_factor_bins.pop(k, None)
-        # Remove generated "_image_index" if present
-        if "_image_index" in metadata:
-            metadata.pop("_image_index", None)
+    def _filter(self, d: Mapping[str, Any]) -> dict[str, Any]:
+        return (
+            {k: d[k] for k in self.include if k in d} if self.include else {k: d[k] for k in d if k not in self.exclude}
+        )
+    def _split_continuous_discrete(
+        self, metadata: dict[str, NDArray[Any]], continuous_factor_bins: dict[str, int | Sequence[float]]
+    ) -> tuple[dict[str, NDArray[Any]], dict[str, NDArray[np.int64]]]:
         # Bin according to user supplied bins
         continuous_metadata = {}
         discrete_metadata = {}
@@ -341,6 +324,28 @@ class Metadata:
             else:
                 _, discrete_metadata[key] = np.unique(data, return_inverse=True)
+        return continuous_metadata, discrete_metadata
+    def _process(self, force: bool = False) -> None:
+        if self._processed and not force:
+            return
+        # Create image indices from targets
+        self._image_indices = np.arange(len(self.raw)) if self.targets.source is None else self.targets.source
+        # Validate the metadata dimensions
+        self._validate()
+        # Filter the merged metadata and continuous factor bins
+        metadata = self._filter(self.merged)
+        continuous_factor_bins = self._filter(self.continuous_factor_bins)
+        # Remove generated "_image_index" if present
+        metadata.pop("_image_index", None)
+        # Split the metadata into continuous and discrete
+        continuous_metadata, discrete_metadata = self._split_continuous_discrete(metadata, continuous_factor_bins)
         # Split out the dictionaries into the keys and values
         self._discrete_factor_names = list(discrete_metadata.keys())
         self._discrete_data = (
@@ -358,13 +363,31 @@ class Metadata:
         self._processed = True
     def add_factors(self, factors: Mapping[str, ArrayLike]) -> None:
+        """
+        Add additional factors to the metadata.
+        The number of measures per factor must match the number of images
+        in the dataset or the number of detections in the dataset.
+        Parameters
+        ----------
+        factors : Mapping[str, ArrayLike]
+            Dictionary of factors to add to the metadata.
+        """
         self._merge()
-        self._processed = False
-        target_len = len(self.targets.source) if self.targets.source is not None else len(self.targets)
-        if any(len(v if isinstance(v, Sized) else as_numpy(v)) != target_len for v in factors.values()):
+        targets = len(self.targets.source) if self.targets.source is not None else len(self.targets)
+        images = self.image_count
+        lengths = {k: len(v if isinstance(v, Sized) else np.atleast_1d(as_numpy(v))) for k, v in factors.items()}
+        targets_match = all(f == targets for f in lengths.values())
+        images_match = targets_match if images == targets else all(f == images for f in lengths.values())
+        if not targets_match and not images_match:
             raise ValueError(
                 "The lists/arrays in the provided factors have a different length than the current metadata factors."
             )
-        merged = cast(tuple[dict[str, ArrayLike], dict[str, list[str]]], self._merged)[0]
+        merged = cast(dict[str, ArrayLike], self._merged[0] if self._merged is not None else {})
         for k, v in factors.items():
-            merged[k] = v
+            v = as_numpy(v)
+            merged[k] = v if (self.targets.source is None or lengths[k] == targets) else v[self.targets.source]
+        self._processed = False

dataeval/data/_selection.py CHANGED Viewed

@@ -110,8 +110,7 @@ class Select(AnnotatedDataset[_TDatum]):
         grouped: dict[int, list[Selection[_TDatum]]] = {}
         for selection in selections_list:
             grouped.setdefault(selection.stage, []).append(selection)
-        selection_list = [selection for category in sorted(grouped) for selection in grouped[category]]
-        return selection_list
+        return [selection for category in sorted(grouped) for selection in grouped[category]]
     def _apply_selections(self) -> None:
         for selection in self._selections:
@@ -120,7 +119,7 @@ class Select(AnnotatedDataset[_TDatum]):
     def _apply_subselection(self, datum: _TDatum, index: int) -> _TDatum:
         for subselection, indices in self._subselections:
-            datum = subselection(datum) if index in indices else datum
+            datum = subselection(datum) if self._selection[index] in indices else datum
         return datum
     def __getitem__(self, index: int) -> _TDatum:

dataeval/data/_split.py CHANGED Viewed

@@ -23,7 +23,7 @@ _logger = logging.getLogger(__name__)
 class KFoldSplitter(Protocol):
     """Protocol covering sklearn KFold variant splitters"""
-    def __init__(self, n_splits: int): ...
+    def __init__(self, n_splits: int) -> None: ...
     def split(self, X: Any, y: Any, groups: Any) -> Iterator[tuple[NDArray[Any], NDArray[Any]]]: ...
@@ -209,8 +209,7 @@ def get_groups(metadata: Metadata, split_on: Sequence[str] | None) -> NDArray[np
     split_set = set(split_on)
     indices = [i for i, name in enumerate(metadata.discrete_factor_names) if name in split_set]
     binned_features = metadata.discrete_data[:, indices]
-    group_ids = np.unique(binned_features, axis=0, return_inverse=True)[1]
-    return group_ids
+    return np.unique(binned_features, axis=0, return_inverse=True)[1]
 def make_splits(

dataeval/data/_targets.py CHANGED Viewed

@@ -24,11 +24,13 @@ class Targets:
     labels : NDArray[np.intp]
         Labels (N,) for N images or objects
     scores : NDArray[np.float32]
-        Probability scores (N,M) for N images of M classes or confidence score (N,) of objects
+        Probability scores (N, M) for N images of M classes or confidence score (N,) of objects
     bboxes : NDArray[np.float32] | None
-        Bounding boxes (N,4) for N objects in (x0,y0,x1,y1) format
+        Bounding boxes (N, 4) for N objects in (x0, y0, x1, y1) format
     source : NDArray[np.intp] | None
         Source image index (N,) for N objects
+    size : int
+        Count of objects
     """
     labels: NDArray[np.intp]
@@ -55,13 +57,16 @@ class Targets:
             )
         if self.bboxes is not None and len(self.bboxes) > 0 and self.bboxes.shape[-1] != 4:
-            raise ValueError("Bounding boxes must be in (x0,y0,x1,y1) format.")
+            raise ValueError("Bounding boxes must be in (x0, y0, x1, y1) format.")
+    @property
+    def size(self) -> int:
+        return len(self.labels)
     def __len__(self) -> int:
         if self.source is None:
             return len(self.labels)
-        else:
-            return len(np.unique(self.source))
+        return len(np.unique(self.source))
     def __getitem__(self, idx: int, /) -> Targets:
         if self.source is None or self.bboxes is None:
@@ -71,14 +76,13 @@ class Targets:
                 None,
                 None,
             )
-        else:
-            mask = np.where(self.source == idx, True, False)
-            return Targets(
-                np.atleast_1d(self.labels[mask]),
-                np.atleast_1d(self.scores[mask]),
-                np.atleast_2d(self.bboxes[mask]),
-                np.atleast_1d(self.source[mask]),
-            )
+        mask = np.where(self.source == idx, True, False)
+        return Targets(
+            np.atleast_1d(self.labels[mask]),
+            np.atleast_1d(self.scores[mask]),
+            np.atleast_2d(self.bboxes[mask]),
+            np.atleast_1d(self.source[mask]),
+        )
     def __iter__(self) -> Iterator[Targets]:
         for i in range(len(self.labels)) if self.source is None else np.unique(self.source):

dataeval/data/selections/_classfilter.py CHANGED Viewed

@@ -10,7 +10,6 @@ from numpy.typing import NDArray
 from dataeval.data._selection import Select, Selection, SelectionStage, Subselection
 from dataeval.typing import Array, ObjectDetectionDatum, ObjectDetectionTarget, SegmentationDatum, SegmentationTarget
 from dataeval.utils._array import as_numpy
-from dataeval.utils.data.metadata import flatten
 class ClassFilter(Selection[Any]):
@@ -69,11 +68,8 @@ _TTarget = TypeVar("_TTarget", ObjectDetectionTarget, SegmentationTarget)
 def _try_mask_object(obj: _T, mask: NDArray[np.bool_]) -> _T:
-    if isinstance(obj, Sized) and not isinstance(obj, (str, bytes, bytearray)) and len(obj) == len(mask):
-        if isinstance(obj, Array):
-            return obj[mask]
-        elif isinstance(obj, Sequence):
-            return cast(_T, [item for i, item in enumerate(obj) if mask[i]])
+    if not isinstance(obj, (str, bytes, bytearray)) and isinstance(obj, (Sequence, Array)) and len(obj) == len(mask):
+        return obj[mask] if isinstance(obj, Array) else cast(_T, [item for i, item in enumerate(obj) if mask[i]])
     return obj
@@ -96,13 +92,15 @@ class ClassFilterSubSelection(Subselection[Any]):
     def __init__(self, classes: Sequence[int]) -> None:
         self.classes = classes
+    def _filter(self, d: dict[str, Any], mask: NDArray[np.bool_]) -> dict[str, Any]:
+        return {k: self._filter(v, mask) if isinstance(v, dict) else _try_mask_object(v, mask) for k, v in d.items()}
     def __call__(self, datum: _TDatum) -> _TDatum:
         # build a mask for any arrays
         image, target, metadata = datum
         mask = np.isin(as_numpy(target.labels), self.classes)
-        flattened_metadata = flatten(metadata)[0]
-        filtered_metadata = {k: _try_mask_object(v, mask) for k, v in flattened_metadata.items()}
+        filtered_metadata = self._filter(metadata, mask)
         # return a masked datum
         filtered_datum = image, ClassFilterTarget(target, mask), filtered_metadata

dataeval/data/selections/_prioritize.py CHANGED Viewed

@@ -99,8 +99,7 @@ class _KNNSorter(_Sorter):
             np.fill_diagonal(dists, np.inf)
         else:
             dists = pairwise_distances(embeddings, reference)
-        inds = np.argsort(np.sort(dists, axis=1)[:, self._k])
-        return inds
+        return np.argsort(np.sort(dists, axis=1)[:, self._k])
 class _KMeansSorter(_Sorter):
@@ -124,15 +123,13 @@ class _KMeansSorter(_Sorter):
 class _KMeansDistanceSorter(_KMeansSorter):
     def _sort(self, embeddings: NDArray[Any], reference: NDArray[Any] | None = None) -> NDArray[np.intp]:
         clst = self._get_clusters(embeddings if reference is None else reference)
-        inds = np.argsort(clst._dist2center(embeddings))
-        return inds
+        return np.argsort(clst._dist2center(embeddings))
 class _KMeansComplexitySorter(_KMeansSorter):
     def _sort(self, embeddings: NDArray[Any], reference: NDArray[Any] | None = None) -> NDArray[np.intp]:
         clst = self._get_clusters(embeddings if reference is None else reference)
-        inds = clst._sort_by_weights(embeddings)
-        return inds
+        return clst._sort_by_weights(embeddings)
 class Prioritize(Selection[Any]):
@@ -266,10 +263,10 @@ class Prioritize(Selection[Any]):
     def _get_sorter(self, samples: int) -> _Sorter:
         if self._method == "knn":
             return _KNNSorter(samples, self._k)
-        elif self._method == "kmeans_distance":
+        if self._method == "kmeans_distance":
             return _KMeansDistanceSorter(samples, self._c)
-        else:  # self._method == "kmeans_complexity"
-            return _KMeansComplexitySorter(samples, self._c)
+        # self._method == "kmeans_complexity"
+        return _KMeansComplexitySorter(samples, self._c)
     def _to_normalized_ndarray(self, embeddings: Embeddings, selection: list[int] | None = None) -> NDArray[Any]:
         emb: NDArray[Any] = embeddings.to_numpy(selection)

dataeval/data/selections/_shuffle.py CHANGED Viewed

@@ -30,7 +30,9 @@ class Shuffle(Selection[Any]):
     seed: int | NDArray[Any] | SeedSequence | BitGenerator | Generator | None
     stage = SelectionStage.ORDER
-    def __init__(self, seed: int | Sequence[int] | Array | SeedSequence | BitGenerator | Generator | None = None):
+    def __init__(
+        self, seed: int | Sequence[int] | Array | SeedSequence | BitGenerator | Generator | None = None
+    ) -> None:
         self.seed = as_numpy(seed) if isinstance(seed, (Sequence, Array)) else seed
     def __call__(self, dataset: Select[Any]) -> None:

dataeval/detectors/drift/__init__.py CHANGED Viewed

@@ -7,6 +7,8 @@ __all__ = [
     "DriftKS",
     "DriftMMD",
     "DriftMMDOutput",
+    "DriftMVDC",
+    "DriftMVDCOutput",
     "DriftOutput",
     "DriftUncertainty",
     "UpdateStrategy",
@@ -18,5 +20,6 @@ from dataeval.detectors.drift._base import UpdateStrategy
 from dataeval.detectors.drift._cvm import DriftCVM
 from dataeval.detectors.drift._ks import DriftKS
 from dataeval.detectors.drift._mmd import DriftMMD
+from dataeval.detectors.drift._mvdc import DriftMVDC
 from dataeval.detectors.drift._uncertainty import DriftUncertainty
-from dataeval.outputs._drift import DriftMMDOutput, DriftOutput
+from dataeval.outputs._drift import DriftMMDOutput, DriftMVDCOutput, DriftOutput

dataeval/detectors/drift/_base.py CHANGED Viewed

@@ -13,7 +13,7 @@ __all__ = []
 import math
 from abc import abstractmethod
 from functools import wraps
-from typing import Callable, Literal, Protocol, TypeVar, runtime_checkable
+from typing import Any, Callable, Literal, Protocol, TypeVar, runtime_checkable
 import numpy as np
 from numpy.typing import NDArray
@@ -40,7 +40,7 @@ def update_strategy(fn: Callable[..., R]) -> Callable[..., R]:
     """Decorator to update x_ref with x using selected update methodology"""
     @wraps(fn)
-    def _(self: BaseDrift, data: Embeddings | Array, *args, **kwargs) -> R:
+    def _(self: BaseDrift, data: Embeddings | Array, *args: tuple[Any, ...], **kwargs: dict[str, Any]) -> R:
         output = fn(self, data, *args, **kwargs)
         # update reference dataset
@@ -184,7 +184,7 @@ class BaseDriftUnivariate(BaseDrift):
             threshold = self.p_val / self.n_features
             drift_pred = bool((p_vals < threshold).any())
             return drift_pred, threshold
-        elif self.correction == "fdr":
+        if self.correction == "fdr":
             n = p_vals.shape[0]
             i = np.arange(n) + np.int_(1)
             p_sorted = np.sort(p_vals)
@@ -195,8 +195,7 @@ class BaseDriftUnivariate(BaseDrift):
             except ValueError:  # sorted p-values not below thresholds
                 return bool(below_threshold.any()), q_threshold.min()
             return bool(below_threshold.any()), q_threshold[idx_threshold]
-        else:
-            raise ValueError("`correction` needs to be either `bonferroni` or `fdr`.")
+        raise ValueError("`correction` needs to be either `bonferroni` or `fdr`.")
     @set_metadata
     @update_strategy

dataeval/detectors/drift/_mmd.py CHANGED Viewed

@@ -95,8 +95,7 @@ class DriftMMD(BaseDrift):
         k_xy = self._kernel(x, y)
         k_xx = self._k_xx if self._k_xx is not None and self.update_strategy is None else self._kernel(x, x)
         k_yy = self._kernel(y, y)
-        kernel_mat = torch.cat([torch.cat([k_xx, k_xy], 1), torch.cat([k_xy.T, k_yy], 1)], 0)
-        return kernel_mat
+        return torch.cat([torch.cat([k_xx, k_xy], 1), torch.cat([k_xy.T, k_yy], 1)], 0)
     def score(self, data: Embeddings | Array) -> tuple[float, float, float]:
         """
@@ -205,8 +204,7 @@ def sigma_median(x: torch.Tensor, y: torch.Tensor, dist: torch.Tensor) -> torch.
     n = min(x.shape[0], y.shape[0])
     n = n if (x[:n] == y[:n]).all() and x.shape == y.shape else 0
     n_median = n + (torch.prod(torch.as_tensor(dist.shape)) - n) // 2 - 1
-    sigma = (0.5 * dist.flatten().sort().values[int(n_median)].unsqueeze(dim=-1)) ** 0.5
-    return sigma
+    return (0.5 * dist.flatten().sort().values[int(n_median)].unsqueeze(dim=-1)) ** 0.5
 class GaussianRBF(torch.nn.Module):
@@ -310,5 +308,4 @@ def mmd2_from_kernel_matrix(
         kernel_mat = kernel_mat[idx][:, idx]
     k_xx, k_yy, k_xy = kernel_mat[:-m, :-m], kernel_mat[-m:, -m:], kernel_mat[-m:, :-m]
     c_xx, c_yy = 1 / (n * (n - 1)), 1 / (m * (m - 1))
-    mmd2 = c_xx * k_xx.sum() + c_yy * k_yy.sum() - 2.0 * k_xy.mean()
-    return mmd2
+    return c_xx * k_xx.sum() + c_yy * k_yy.sum() - 2.0 * k_xy.mean()

dataeval/detectors/drift/_mvdc.py ADDED Viewed

@@ -0,0 +1,92 @@
+from __future__ import annotations
+from typing import TYPE_CHECKING
+import numpy as np
+import pandas as pd
+from numpy.typing import ArrayLike
+if TYPE_CHECKING:
+    from typing import Self
+else:
+    from typing_extensions import Self
+from dataeval.detectors.drift._nml._chunk import CountBasedChunker, SizeBasedChunker
+from dataeval.detectors.drift._nml._domainclassifier import DomainClassifierCalculator
+from dataeval.detectors.drift._nml._thresholds import ConstantThreshold
+from dataeval.outputs._drift import DriftMVDCOutput
+from dataeval.utils._array import flatten
+class DriftMVDC:
+    """Multivariant Domain Classifier
+    Parameters
+    ----------
+    n_folds : int, default 5
+        Number of cross-validation (CV) folds.
+    chunk_size : int or None, default None
+        Number of samples in a chunk used in CV, will get one metric & prediction per chunk.
+    chunk_count : int or None, default None
+        Number of total chunks used in CV, will get one metric & prediction per chunk.
+    threshold : Tuple[float, float], default (0.45, 0.65)
+        (lower, upper) metric bounds on roc_auc for identifying :term:`drift<Drift>`.
+    """
+    def __init__(
+        self,
+        n_folds: int = 5,
+        chunk_size: int | None = None,
+        chunk_count: int | None = None,
+        threshold: tuple[float, float] = (0.45, 0.65),
+    ) -> None:
+        self.threshold: tuple[float, float] = max(0.0, min(threshold)), min(1.0, max(threshold))
+        chunker = (
+            CountBasedChunker(10 if chunk_count is None else chunk_count)
+            if chunk_size is None
+            else SizeBasedChunker(chunk_size)
+        )
+        self._calc = DomainClassifierCalculator(
+            cv_folds_num=n_folds,
+            chunker=chunker,
+            threshold=ConstantThreshold(lower=self.threshold[0], upper=self.threshold[1]),
+        )
+    def fit(self, x_ref: ArrayLike) -> Self:
+        """
+        Fit the domain classifier on the training dataframe
+        Parameters
+        ----------
+        x_ref : ArrayLike
+            Reference data with dim[n_samples, n_features].
+        Returns
+        -------
+        Self
+        """
+        # for 1D input, assume that is 1 sample: dim[1,n_features]
+        self.x_ref: pd.DataFrame = pd.DataFrame(flatten(np.atleast_2d(np.asarray(x_ref))))
+        self.n_features: int = self.x_ref.shape[-1]
+        self._calc.fit(self.x_ref)
+        return self
+    def predict(self, x: ArrayLike) -> DriftMVDCOutput:
+        """
+        Perform :term:`inference<Inference>` on the test dataframe
+        Parameters
+        ----------
+        x : ArrayLike
+            Test (analysis) data with dim[n_samples, n_features].
+        Returns
+        -------
+        DomainClassifierDriftResult
+        """
+        self.x_test: pd.DataFrame = pd.DataFrame(flatten(np.atleast_2d(np.asarray(x))))
+        if self.x_test.shape[-1] != self.n_features:
+            raise ValueError("Reference and test embeddings have different number of features")
+        return self._calc.calculate(self.x_test)

dataeval/detectors/drift/_nml/__init__.py ADDED Viewed

@@ -0,0 +1,6 @@
+"""
+Source code derived from NannyML 0.13.0
+https://github.com/NannyML/nannyml/
+Licensed under Apache Software License (Apache 2.0)
+"""

dataeval 0.85.0__py3-none-any.whl → 0.86.1__py3-none-any.whl

dataeval 0.85.0__py3-none-any.whl → 0.86.1__py3-none-any.whl