dataeval 0.88.0__py3-none-any.whl → 0.89.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dataeval/_version.py +2 -2
- dataeval/data/_embeddings.py +2 -2
- dataeval/data/_metadata.py +2 -1
- dataeval/detectors/drift/_base.py +152 -27
- dataeval/detectors/drift/_cvm.py +44 -25
- dataeval/detectors/drift/_ks.py +56 -28
- dataeval/detectors/drift/_mmd.py +44 -18
- dataeval/detectors/drift/_uncertainty.py +119 -45
- dataeval/outputs/_drift.py +67 -29
- dataeval/outputs/_workflows.py +81 -17
- dataeval/typing.py +23 -4
- dataeval/workflows/sufficiency.py +1 -2
- {dataeval-0.88.0.dist-info → dataeval-0.89.0.dist-info}/METADATA +1 -1
- {dataeval-0.88.0.dist-info → dataeval-0.89.0.dist-info}/RECORD +16 -16
- {dataeval-0.88.0.dist-info → dataeval-0.89.0.dist-info}/WHEEL +0 -0
- {dataeval-0.88.0.dist-info → dataeval-0.89.0.dist-info}/licenses/LICENSE +0 -0
dataeval/detectors/drift/_uncertainty.py
CHANGED
@@ -31,24 +31,42 @@ def classifier_uncertainty(
     preds: Array,
     preds_type: Literal["probs", "logits"] = "probs",
 ) -> torch.Tensor:
-    """
-
+    """Convert model predictions to uncertainty scores using entropy.
+
+    Computes prediction uncertainty as the entropy of the predicted class
+    probability distribution. Higher entropy indicates greater model uncertainty,
+    with maximum uncertainty at uniform distributions and minimum at confident
+    single-class predictions.
 
     Parameters
     ----------
-
-
-
-
-
-
-
-
+    preds : Array
+        Model predictions for a batch of instances. For "probs" type, should
+        contain class probabilities that sum to 1 across the last dimension.
+        For "logits" type, contains raw model outputs before softmax.
+    preds_type : "probs" or "logits", default "probs"
+        Type of prediction values. "probs" expects probabilities in [0,1] that
+        sum to 1. "logits" expects raw outputs in [-inf,inf] and applies softmax.
+        Default "probs" assumes model outputs normalized probabilities.
 
     Returns
     -------
-
-
+    torch.Tensor
+        Uncertainty scores for each instance with shape (n_samples, 1).
+        Values are always >= 0, with higher values indicating greater uncertainty.
+
+    Raises
+    ------
+    ValueError
+        If preds_type is "probs" but probabilities don't sum to 1 within tolerance.
+    NotImplementedError
+        If preds_type is not "probs" or "logits".
+
+    Notes
+    -----
+    Uncertainty is computed as Shannon entropy: -sum(p * log(p)) where p are
+    the predicted class probabilities. This provides a principled measure of
+    model confidence that is widely used in uncertainty quantification.
     """
     preds_np = as_numpy(preds)
     if preds_type == "probs":
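
The reworked docstring spells out the entropy computation. As an illustration only (a minimal sketch in plain NumPy, not DataEval's internal code path, which also validates that probabilities sum to 1), the described behavior is:

    import numpy as np

    def entropy_uncertainty(preds: np.ndarray, preds_type: str = "probs") -> np.ndarray:
        """Per-sample Shannon entropy -sum(p * log(p)) of the class distribution."""
        if preds_type == "logits":
            # numerically stable softmax over the class dimension
            z = preds - preds.max(axis=-1, keepdims=True)
            preds = np.exp(z) / np.exp(z).sum(axis=-1, keepdims=True)
        elif preds_type != "probs":
            raise NotImplementedError(f"Unknown preds_type: {preds_type}")
        p = np.clip(preds, 1e-12, 1.0)  # avoid log(0) on confident predictions
        return (-(p * np.log(p)).sum(axis=-1)).reshape(-1, 1)

A uniform distribution over k classes yields the maximum value log(k), while a one-hot prediction yields 0, matching the "Returns" description above.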
@@ -65,53 +83,98 @@ def classifier_uncertainty(
 
 
 class DriftUncertainty(BaseDrift):
-    """
-    Test for a change in the number of instances falling into regions on which \
-    the model is uncertain.
+    """Drift detector using model prediction uncertainty.
 
-
+    Detects drift by monitoring changes in the distribution of model prediction
+    uncertainties (entropy) rather than input features directly. Uses
+    :term:`Kolmogorov-Smirnov (K-S) Test` to compare uncertainty distributions
+    between reference and test data.
+
+    This approach is particularly effective for detecting drift that affects model
+    confidence even when input features remain statistically similar, such as
+    out-of-domain samples or adversarial examples.
 
     Parameters
     ----------
-    data : Array
-
-
-        :term:`Classification` model outputting class probabilities (or logits)
+    data : Embeddings or Array
+        Reference dataset used as baseline distribution for drift detection.
+        Should represent the expected "normal" data distribution.
     p_val : float, default 0.05
-
+        Significance threshold for statistical tests, between 0 and 1.
+        For FDR correction, this represents the acceptable false discovery rate.
+        Default 0.05 provides 95% confidence level for drift detection.
     update_strategy : UpdateStrategy or None, default None
-
-
-        or via reservoir sampling with ReservoirSamplingUpdateStrategy.
+        Strategy for updating reference data when new data arrives.
+        When None, reference data remains fixed throughout detection.
     correction : "bonferroni" or "fdr", default "bonferroni"
-
-
+        Multiple testing correction method for multivariate drift detection.
+        "bonferroni" provides conservative family-wise error control by
+        dividing significance threshold by number of features.
+        "fdr" uses Benjamini-Hochberg procedure for less conservative control.
+        Default "bonferroni" minimizes false positive drift detections.
     preds_type : "probs" or "logits", default "probs"
-
-
+        Format of model prediction outputs. "probs" expects normalized
+        probabilities summing to 1. "logits" expects raw model outputs
+        and applies softmax normalization internally.
+        Default "probs" assumes standard classification model outputs.
     batch_size : int, default 32
-        Batch size
-
+        Batch size for model inference during uncertainty computation.
+        Larger batches improve GPU utilization but require more memory.
+        Default 32 balances efficiency and memory usage.
     transforms : Transform, Sequence[Transform] or None, default None
-
+        Data transformations applied before model inference. Should match
+        preprocessing used during model training for consistent predictions.
+        When None, uses raw input data without preprocessing.
     device : DeviceLike or None, default None
-
-
+        Hardware device for computation. When None, automatically selects
+        DataEval's configured device, falling back to PyTorch's default.
+
+    Attributes
+    ----------
+    model : torch.nn.Module
+        Classification model used for uncertainty computation.
+    device : torch.device
+        Hardware device used for model inference.
+    batch_size : int
+        Batch size for model predictions.
+    preds_type : {"probs", "logits"}
+        Format of model prediction outputs.
 
     Example
     -------
     >>> model = ClassificationModel()
-    >>>
+    >>> drift_detector = DriftUncertainty(x_ref, model=model, batch_size=16)
 
     Verify reference images have not drifted
 
-    >>>
-
+    >>> result = drift_detector.predict(x_test)
+    >>> print(f"Drift detected: {result.drifted}")
+    Drift detected: True
 
-
+    >>> print(f"Mean uncertainty change: {result.distance:.4f}")
+    Mean uncertainty change: 0.8160
 
-
-
+    With data preprocessing
+
+    >>> import torchvision.transforms.v2 as T
+    >>> transforms = T.Compose([T.ToDtype(torch.float32)])
+    >>> drift_detector = DriftUncertainty(x_ref, model=model, batch_size=16, transforms=transforms)
+
+    Notes
+    -----
+    Uncertainty-based drift detection is complementary to feature-based methods.
+    It can detect semantic drift (changes in data meaning) that may not be
+    apparent in raw feature statistics, making it valuable for monitoring
+    model performance in production environments.
+
+    The method assumes that model uncertainty is a reliable indicator of
+    data quality. This works best with well-calibrated models trained on
+    representative data. Poorly calibrated models may produce misleading
+    uncertainty estimates.
+
+    For optimal performance, ensure the model and transforms match those used
+    during training, and that the reference data represents the expected
+    operational distribution where the model performs reliably.
     """
 
     def __init__(
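
The docstring above says the drift decision comes from a K-S test on the two uncertainty distributions. The mechanism can be illustrated directly with SciPy (a sketch of the idea, not the detector's actual code path; the beta-distributed entropies are synthetic stand-ins for real model outputs):

    import numpy as np
    from scipy.stats import ks_2samp

    rng = np.random.default_rng(0)
    ref_uncertainty = rng.beta(2, 8, size=500)   # mostly confident reference predictions
    test_uncertainty = rng.beta(5, 5, size=500)  # noticeably less confident test predictions

    # Two-sample Kolmogorov-Smirnov test between the entropy distributions
    stat, p_value = ks_2samp(ref_uncertainty, test_uncertainty)
    print(f"KS statistic: {stat:.3f}, p-value: {p_value:.2e}, drifted: {p_value < 0.05}")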
@@ -142,27 +205,38 @@ class DriftUncertainty(BaseDrift):
         )
 
     def _transform(self, x: torch.Tensor) -> torch.Tensor:
+        """Apply preprocessing transforms to input data."""
         for transform in self._transforms:
             x = transform(x)
         return x
 
     def _preprocess(self, x: Array) -> torch.Tensor:
+        """Convert input data to uncertainty scores via model predictions."""
         preds = predict_batch(x, self.model, self.device, self.batch_size, self._transform)
         return classifier_uncertainty(preds, self.preds_type)
 
     def predict(self, x: Array) -> DriftOutput:
-        """
-
+        """Predict whether model uncertainty distribution has drifted.
+
+        Computes prediction uncertainties for the input data and tests
+        whether their distribution significantly differs from the reference
+        uncertainty distribution using Kolmogorov-Smirnov test.
 
         Parameters
         ----------
         x : Array
-            Batch of instances.
+            Batch of instances to test for uncertainty drift.
 
         Returns
        -------
-
-
-            statistics.
+        DriftOutput
+            Drift detection results including overall prediction, p-values,
+            test statistics, and feature-level analysis of uncertainty values.
+
+        Notes
+        -----
+        The returned DriftOutput treats uncertainty values as "features" for
+        consistency with the underlying KS test implementation, even though
+        uncertainty-based drift typically involves univariate analysis.
         """
         return self._detector.predict(self._preprocess(x).cpu().numpy())
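
Assuming the drift_detector and x_test from the class docstring example above, the fields documented for predict can be read off the result (a usage sketch):

    result = drift_detector.predict(x_test)
    print(result.drifted)   # bool: overall drift decision
    print(result.p_val)     # mean p-value of the KS test on uncertainties
    print(result.distance)  # mean KS statistic; larger means more drift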
dataeval/outputs/_drift.py
CHANGED
@@ -18,8 +18,28 @@ from dataeval.outputs._base import Output
 
 @dataclass(frozen=True)
 class DriftBaseOutput(Output):
-    """
-
+    """Base output class for drift detector classes.
+
+    Provides common fields returned by all drift detection methods, containing
+    instance-level drift predictions and summary statistics. Subclasses extend
+    this with detector-specific additional fields.
+
+    Attributes
+    ----------
+    drifted : bool
+        Whether drift was detected in the analyzed data. True indicates
+        significant drift from reference distribution.
+    threshold : float
+        Significance threshold used for drift detection, typically between 0 and 1.
+        For multivariate methods, this is the corrected threshold after
+        Bonferroni or FDR correction.
+    p_val : float
+        Instance-level p-value from statistical test, between 0 and 1.
+        For univariate methods, this is the mean p-value across all features.
+    distance : float
+        Instance-level test statistic or distance metric, always >= 0.
+        For univariate methods, this is the mean distance across all features.
+        Higher values indicate greater deviation from reference distribution.
     """
 
     drifted: bool
@@ -31,58 +51,76 @@ class DriftBaseOutput(Output):
 @dataclass(frozen=True)
 class DriftMMDOutput(DriftBaseOutput):
     """
-    Output class for :class:`.DriftMMD`
+    Output class for :class:`.DriftMMD` (Maximum Mean Discrepancy) drift detector.
+
+    Extends :class:`.DriftBaseOutput` with MMD-specific distance threshold information.
+    Used by MMD-based drift detectors that compare kernel embeddings between
+    reference and test distributions.
 
     Attributes
     ----------
     drifted : bool
-
+        Whether drift was detected based on MMD permutation test.
     threshold : float
-
+        P-value threshold used for significance of the permutation test.
     p_val : float
-        P-value obtained from the permutation test
+        P-value obtained from the MMD permutation test, between 0 and 1.
     distance : float
-
+        Squared Maximum Mean Discrepancy between reference and test set.
+        Always >= 0, with higher values indicating greater distributional difference.
     distance_threshold : float
-
+        Squared Maximum Mean Discrepancy threshold above which drift is flagged, always >= 0.
+        Determined from permutation test at specified significance level.
+
+    Notes
+    -----
+    MMD uses kernel methods to compare distributions in reproducing kernel
+    Hilbert spaces, making it effective for high-dimensional data like images.
     """
 
-    # drifted: bool
-    # threshold: float
-    # p_val: float
-    # distance: float
     distance_threshold: float
 
 
 @dataclass(frozen=True)
 class DriftOutput(DriftBaseOutput):
-    """
-
+    """Output class for univariate drift detectors.
+
+    Extends :class:`.DriftBaseOutput` with feature-level (per-pixel) drift information.
+    Used by Kolmogorov-Smirnov, Cramér-von Mises, and uncertainty-based
+    drift detectors that analyze each feature independently.
 
     Attributes
     ----------
     drifted : bool
-
+        Overall drift prediction after multivariate correction.
     threshold : float
-
+        Corrected threshold after Bonferroni or FDR correction for multiple testing.
     p_val : float
-
+        Mean p-value across all features, between 0 and 1.
+        For descriptive purposes only; individual feature p-values are used
+        for drift detection decisions. Can appear high even when drifted=True
+        if only a subset of features show drift.
     distance : float
-
-    feature_drift : NDArray
-
+        Mean test statistic across all features, always >= 0.
+    feature_drift : NDArray[bool]
+        Boolean array indicating which features (pixels) show drift.
+        Shape matches the number of features in the input data.
     feature_threshold : float
-
-
-
-
-
+        Uncorrected p-value threshold used for individual feature testing.
+        Typically the original p_val before multivariate correction.
+    p_vals : NDArray[np.float32]
+        P-values for each feature, all values between 0 and 1.
+        Shape matches the number of features in the input data.
+    distances : NDArray[np.float32]
+        Test statistics for each feature, all values >= 0.
+        Shape matches the number of features in the input data.
+
+    Notes
+    -----
+    Feature-level analysis enables identification of specific pixels or regions
+    that contribute most to detected drift, useful for interpretability.
     """
 
-    # drifted: bool
-    # threshold: float
-    # p_val: float
-    # distance: float
    feature_drift: NDArray[np.bool_]
     feature_threshold: float
     p_vals: NDArray[np.float32]
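
Because DriftOutput carries per-feature arrays, the interpretability note above can be acted on directly. A sketch, assuming output is a DriftOutput returned by any univariate detector (e.g. DriftKS):

    import numpy as np

    # output: DriftOutput, e.g. from DriftKS(x_ref).predict(x_test)
    drifted_features = np.flatnonzero(output.feature_drift)  # indices flagged as drifted
    print(f"{drifted_features.size} features drifted")
    print(f"smallest per-feature p-value: {output.p_vals.min():.3g}")
    print(f"largest per-feature distance: {output.distances.max():.3g}")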
dataeval/outputs/_workflows.py
CHANGED
@@ -62,9 +62,12 @@ def project_steps(params: NDArray[Any], projection: NDArray[Any]) -> NDArray[Any
 def plot_measure(
     name: str,
     steps: NDArray[Any],
-
+    averaged_measure: NDArray[Any],
+    measures: NDArray[Any] | None,
     params: NDArray[Any],
     projection: NDArray[Any],
+    error_bars: bool,
+    asymptote: bool,
 ) -> Figure:
     import matplotlib.pyplot
 
@@ -73,21 +76,57 @@ def plot_measure(
     fig.tight_layout()
 
     ax = fig.add_subplot(111)
-
     ax.set_title(f"{name} Sufficiency")
     ax.set_ylabel(f"{name}")
     ax.set_xlabel("Steps")
-    # Plot
-
+    # Plot asymptote
+    if asymptote:
+        bound = 1 - params[2]
+        ax.axhline(y=bound, color="r", label=f"Asymptote: {bound:.4g}", zorder=1)
+    # Calculate error bars
+    # Plot measure over each step with associated error
+    if error_bars:
+        if measures is None:
+            warnings.warn(
+                "Error bars cannot be plotted without full, unaveraged data",
+                UserWarning,
+            )
+        else:
+            error = np.std(measures, axis=0)
+            ax.errorbar(
+                steps,
+                averaged_measure,
+                yerr=error,
+                capsize=7,
+                capthick=1.5,
+                elinewidth=1.5,
+                fmt="o",
+                label=f"Model Results ({name})",
+                markersize=5,
+                color="black",
+                ecolor="orange",
+                zorder=3,
+            )
+    else:
+        ax.scatter(
+            steps,
+            averaged_measure,
+            label=f"Model Results ({name})",
+            zorder=3,
+            c="black",
+        )
     # Plot extrapolation
     ax.plot(
         projection,
         project_steps(params, projection),
         linestyle="dashed",
         label=f"Potential Model Results ({name})",
+        linewidth=2,
+        zorder=2,
     )
+    ax.set_xscale("log")
 
-    ax.legend()
+    ax.legend(loc="best")
     return fig
 
 
@@ -116,7 +155,9 @@ def f_inv_out(y_i: NDArray[Any], x: NDArray[Any]) -> NDArray[np.int64]:
             "Number of samples could not be determined for target(s): "
             f"""{
                 np.array2string(
-                    1 - y_i[unachievable_targets],
+                    1 - y_i[unachievable_targets],
+                    separator=", ",
+                    formatter={"float": lambda x: f"{x}"},
                 )
             }""",
             UserWarning,
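
The effect of the added array2string arguments is easy to demonstrate in isolation (plain NumPy, independent of DataEval):

    import numpy as np

    a = np.array([0.25, 0.125])
    print(np.array2string(a))  # "[0.25  0.125]" -- default space separator
    print(np.array2string(a, separator=", ", formatter={"float": lambda x: f"{x}"}))
    # "[0.25, 0.125]" -- comma separated, each float rendered via str()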
@@ -190,7 +231,9 @@ def calc_params(p_i: NDArray[Any], n_i: NDArray[Any], niter: int) -> NDArray[np.
 
 
 def get_curve_params(
-    averaged_measures: MutableMapping[str, NDArray[Any]],
+    averaged_measures: MutableMapping[str, NDArray[Any]],
+    ranges: NDArray[Any],
+    niter: int,
 ) -> Mapping[str, NDArray[np.float64]]:
     """Calculates and aggregates parameters for both single and multi-class metrics"""
     output = {}
@@ -286,11 +329,16 @@ class SufficiencyOutput(Output):
                 output[name] = np.array(result)
             else:
                 output[name] = project_steps(self.params[name], projection)
-        proj = SufficiencyOutput(projection,
+        proj = SufficiencyOutput(projection, {}, output, self.n_iter)
         proj._params = self._params
         return proj
 
-    def plot(
+    def plot(
+        self,
+        class_names: Sequence[str] | None = None,
+        error_bars: bool = False,
+        asymptote: bool = False,
+    ) -> Sequence[Figure]:
         """
         Plotting function for data :term:`sufficience<Sufficiency>` tasks.
 
@@ -298,6 +346,10 @@ class SufficiencyOutput(Output):
         ----------
         class_names : Sequence[str] | None, default None
             List of class names
+        error_bars : bool, default False
+            True if error bars should be plotted, False if not
+        asymptote : bool, default False
+            True if asymptote should be plotted, False if not
 
         Returns
         -------
@@ -320,25 +372,36 @@ class SufficiencyOutput(Output):
 
         # Stores all plots
         plots = []
-
         # Create a plot for each measure on one figure
-        for name,
-            if
-                if class_names is not None and len(
+        for name, measures in self.averaged_measures.items():
+            if measures.ndim > 1:
+                if class_names is not None and len(measures) != len(class_names):
                     raise IndexError("Class name count does not align with measures")
-                for i,
+                for i, values in enumerate(measures):
                     class_name = str(i) if class_names is None else class_names[i]
                     fig = plot_measure(
                         f"{name}_{class_name}",
                         self.steps,
-
+                        values,
+                        self.measures[name][:, :, i] if len(self.measures) else None,
                         self.params[name][i],
                         extrapolated,
+                        error_bars,
+                        asymptote,
                     )
                     plots.append(fig)
 
             else:
-                fig = plot_measure(
+                fig = plot_measure(
+                    name,
+                    self.steps,
+                    measures,
+                    self.measures.get(name),
+                    self.params[name],
+                    extrapolated,
+                    error_bars,
+                    asymptote,
+                )
                 plots.append(fig)
 
         return plots
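
Putting the new options together, plotting with error bars and the fitted asymptote might look like this (a usage sketch; sufficiency_output stands in for a real SufficiencyOutput, e.g. from Sufficiency.evaluate()):

    # sufficiency_output: SufficiencyOutput with unaveraged measures available
    figs = sufficiency_output.plot(error_bars=True, asymptote=True)
    for i, fig in enumerate(figs):
        fig.savefig(f"sufficiency_{i}.png")

Per the plot_measure diff above, passing error_bars=True without the full, unaveraged measures triggers a UserWarning instead of drawing error bars.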
@@ -376,7 +439,8 @@ class SufficiencyOutput(Output):
             projection[name] = np.zeros((len(measure), len(tarray)))
             for i in range(len(measure)):
                 projection[name][i] = inv_project_steps(
-                    self.params[name][i],
+                    self.params[name][i],
+                    tarray[i] if tarray.ndim == measure.ndim else tarray,
                 )
         else:
             projection[name] = inv_project_steps(self.params[name], tarray)
dataeval/typing.py
CHANGED
@@ -21,7 +21,7 @@ __all__ = [
 ]
 
 
-from collections.abc import Iterator
+from collections.abc import Iterator
 from typing import (
     Any,
     Generic,
@@ -94,6 +94,7 @@ class Array(Protocol):
 
 _T = TypeVar("_T")
 _T_co = TypeVar("_T_co", covariant=True)
+_T_cn = TypeVar("_T_cn", contravariant=True)
 
 
 class DatasetMetadata(TypedDict, total=False):
@@ -128,6 +129,19 @@ class ModelMetadata(TypedDict, total=False):
     index2label: NotRequired[ReadOnly[dict[int, str]]]
 
 
+class DatumMetadata(TypedDict, total=False):
+    """
+    Datum level metadata required for all `AnnotatedDataset` classes.
+
+    Attributes
+    ----------
+    id : Required[str]
+        A unique identifier for the datum
+    """
+
+    id: Required[ReadOnly[str]]
+
+
 @runtime_checkable
 class Dataset(Generic[_T_co], Protocol):
     """
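
Since DatumMetadata is a TypedDict with total=False plus a Required id, any mapping with at least an "id" string satisfies it. A sketch, assuming the dataeval.typing import path added in this release:

    from dataeval.typing import DatumMetadata

    meta: DatumMetadata = {"id": "img_0001"}  # minimal: only "id" is required
    # additional metadata keys may accompany the required identifier at runtime
    richer = {"id": "img_0002", "source": "camera_3"}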
@@ -173,7 +187,7 @@ class AnnotatedDataset(Dataset[_T_co], Generic[_T_co], Protocol):
 # ========== IMAGE CLASSIFICATION DATASETS ==========
 
 
-ImageClassificationDatum: TypeAlias = tuple[ArrayLike, ArrayLike,
+ImageClassificationDatum: TypeAlias = tuple[ArrayLike, ArrayLike, DatumMetadata]
 """
 Type alias for an image classification datum tuple.
 
@@ -213,7 +227,7 @@ class ObjectDetectionTarget(Protocol):
     def scores(self) -> ArrayLike: ...
 
 
-ObjectDetectionDatum: TypeAlias = tuple[ArrayLike, ObjectDetectionTarget,
+ObjectDetectionDatum: TypeAlias = tuple[ArrayLike, ObjectDetectionTarget, DatumMetadata]
 """
 Type alias for an object detection datum tuple.
 
@@ -254,7 +268,7 @@ class SegmentationTarget(Protocol):
     def scores(self) -> ArrayLike: ...
 
 
-SegmentationDatum: TypeAlias = tuple[ArrayLike, SegmentationTarget,
+SegmentationDatum: TypeAlias = tuple[ArrayLike, SegmentationTarget, DatumMetadata]
 """
 Type alias for an image classification datum tuple.
 
@@ -311,3 +325,8 @@ class Transform(Generic[_T], Protocol):
     """
 
     def __call__(self, data: _T, /) -> _T: ...
+
+
+@runtime_checkable
+class Action(Generic[_T_cn, _T_co], Protocol):
+    def __call__(self, evaluator: _T_cn) -> _T_co: ...
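
The new Action protocol is a runtime-checkable callable from an evaluator type to a result type, so any object with a matching __call__ conforms structurally. A hypothetical sketch:

    from dataeval.typing import Action

    class MeanAction:
        """Toy action: consumes a list of floats, returns their mean."""

        def __call__(self, evaluator: list[float]) -> float:
            return sum(evaluator) / len(evaluator)

    action = MeanAction()
    assert isinstance(action, Action)  # structural check via @runtime_checkable
    print(action([1.0, 2.0, 3.0]))     # 2.0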
{dataeval-0.88.0.dist-info → dataeval-0.89.0.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: dataeval
-Version: 0.88.0
+Version: 0.89.0
 Summary: DataEval provides a simple interface to characterize image data and its impact on model performance across classification and object-detection tasks
 Project-URL: Homepage, https://dataeval.ai/
 Project-URL: Repository, https://github.com/aria-ml/dataeval/