dataeval 0.74.2__py3-none-any.whl → 0.76.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (68)
  1. dataeval/__init__.py +27 -23
  2. dataeval/detectors/__init__.py +2 -2
  3. dataeval/detectors/drift/__init__.py +14 -12
  4. dataeval/detectors/drift/base.py +3 -3
  5. dataeval/detectors/drift/cvm.py +1 -1
  6. dataeval/detectors/drift/ks.py +3 -2
  7. dataeval/detectors/drift/mmd.py +9 -7
  8. dataeval/detectors/drift/torch.py +12 -12
  9. dataeval/detectors/drift/uncertainty.py +5 -4
  10. dataeval/detectors/drift/updates.py +1 -1
  11. dataeval/detectors/linters/__init__.py +4 -4
  12. dataeval/detectors/linters/clusterer.py +5 -9
  13. dataeval/detectors/linters/duplicates.py +10 -14
  14. dataeval/detectors/linters/outliers.py +100 -5
  15. dataeval/detectors/ood/__init__.py +4 -11
  16. dataeval/detectors/ood/{ae_torch.py → ae.py} +6 -4
  17. dataeval/detectors/ood/base.py +47 -160
  18. dataeval/detectors/ood/metadata_ks_compare.py +34 -42
  19. dataeval/detectors/ood/metadata_least_likely.py +3 -3
  20. dataeval/detectors/ood/metadata_ood_mi.py +6 -5
  21. dataeval/detectors/ood/mixin.py +146 -0
  22. dataeval/detectors/ood/output.py +63 -0
  23. dataeval/interop.py +7 -6
  24. dataeval/{logging.py → log.py} +2 -0
  25. dataeval/metrics/__init__.py +3 -3
  26. dataeval/metrics/bias/__init__.py +10 -13
  27. dataeval/metrics/bias/balance.py +13 -11
  28. dataeval/metrics/bias/coverage.py +53 -5
  29. dataeval/metrics/bias/diversity.py +56 -24
  30. dataeval/metrics/bias/parity.py +20 -17
  31. dataeval/metrics/estimators/__init__.py +2 -2
  32. dataeval/metrics/estimators/ber.py +7 -4
  33. dataeval/metrics/estimators/divergence.py +4 -4
  34. dataeval/metrics/estimators/uap.py +4 -4
  35. dataeval/metrics/stats/__init__.py +19 -19
  36. dataeval/metrics/stats/base.py +28 -12
  37. dataeval/metrics/stats/boxratiostats.py +13 -14
  38. dataeval/metrics/stats/datasetstats.py +49 -20
  39. dataeval/metrics/stats/dimensionstats.py +8 -8
  40. dataeval/metrics/stats/hashstats.py +14 -10
  41. dataeval/metrics/stats/labelstats.py +94 -11
  42. dataeval/metrics/stats/pixelstats.py +11 -14
  43. dataeval/metrics/stats/visualstats.py +10 -13
  44. dataeval/output.py +23 -14
  45. dataeval/utils/__init__.py +5 -14
  46. dataeval/utils/dataset/__init__.py +7 -0
  47. dataeval/utils/{torch → dataset}/datasets.py +2 -0
  48. dataeval/utils/dataset/read.py +63 -0
  49. dataeval/utils/{split_dataset.py → dataset/split.py} +38 -30
  50. dataeval/utils/image.py +2 -2
  51. dataeval/utils/metadata.py +317 -14
  52. dataeval/{metrics/bias/metadata_utils.py → utils/plot.py} +91 -71
  53. dataeval/utils/torch/__init__.py +2 -17
  54. dataeval/utils/torch/gmm.py +29 -6
  55. dataeval/utils/torch/{utils.py → internal.py} +82 -58
  56. dataeval/utils/torch/models.py +10 -8
  57. dataeval/utils/torch/trainer.py +6 -85
  58. dataeval/workflows/__init__.py +2 -5
  59. dataeval/workflows/sufficiency.py +18 -8
  60. {dataeval-0.74.2.dist-info → dataeval-0.76.0.dist-info}/LICENSE.txt +2 -2
  61. dataeval-0.76.0.dist-info/METADATA +137 -0
  62. dataeval-0.76.0.dist-info/RECORD +67 -0
  63. dataeval/detectors/ood/base_torch.py +0 -109
  64. dataeval/metrics/bias/metadata_preprocessing.py +0 -285
  65. dataeval/utils/gmm.py +0 -26
  66. dataeval-0.74.2.dist-info/METADATA +0 -120
  67. dataeval-0.74.2.dist-info/RECORD +0 -66
  68. {dataeval-0.74.2.dist-info → dataeval-0.76.0.dist-info}/WHEEL +0 -0
dataeval/__init__.py CHANGED
@@ -1,36 +1,40 @@
-__version__ = "0.74.2"
+"""
+DataEval provides a simple interface to characterize image data and its impact on model performance
+across classification and object-detection tasks. It also provides capabilities to select and curate
+datasets to test and train performant, robust, unbiased and reliable AI models and monitor for data
+shifts that impact performance of deployed models.
+"""
+
+from __future__ import annotations
+
+__all__ = ["detectors", "log", "metrics", "utils", "workflows"]
+__version__ = "0.76.0"
 
 import logging
-from importlib.util import find_spec
+
+from dataeval import detectors, metrics, utils, workflows
 
 logging.getLogger(__name__).addHandler(logging.NullHandler())
 
 
-def log_stderr(level: int = logging.DEBUG) -> None:
+def log(level: int = logging.DEBUG, handler: logging.Handler | None = None) -> None:
     """
-    Helper for quickly adding a StreamHandler to the logger. Useful for
-    debugging.
+    Helper for quickly adding a StreamHandler to the logger. Useful for debugging.
+
+    Parameters
+    ----------
+    level : int, default logging.DEBUG(10)
+        Set the logging level for the logger.
+    handler : logging.Handler, optional
+        Sets the logging handler for the logger if provided, otherwise logger will be
+        provided with a StreamHandler.
     """
     import logging
 
     logger = logging.getLogger(__name__)
-    handler = logging.StreamHandler()
-    handler.setFormatter(logging.Formatter("%(asctime)s %(levelname)s %(message)s"))
+    if handler is None:
+        handler = logging.StreamHandler() if handler is None else handler
+        handler.setFormatter(logging.Formatter("%(asctime)s %(levelname)s %(message)s"))
     logger.addHandler(handler)
     logger.setLevel(level)
-    logger.debug("Added a stderr logging handler to logger: %s", __name__)
-
-
-_IS_TORCH_AVAILABLE = find_spec("torch") is not None
-_IS_TORCHVISION_AVAILABLE = find_spec("torchvision") is not None
-
-del find_spec
-
-from dataeval import detectors, metrics  # noqa: E402
-
-__all__ = ["log_stderr", "detectors", "metrics"]
-
-if _IS_TORCH_AVAILABLE:
-    from dataeval import utils, workflows
-
-    __all__ += ["utils", "workflows"]
+    logger.debug(f"Added logging handler {handler} to logger: {__name__}")
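The log_stderr() helper above is renamed to log() and gains an optional handler argument. A minimal usage sketch based on the new signature shown in this hunk (the file-handler target is illustrative):

import logging

import dataeval

# Default: attach a formatted StreamHandler at DEBUG level
dataeval.log()

# Or route DataEval logs through a custom handler at a chosen level
dataeval.log(level=logging.INFO, handler=logging.FileHandler("dataeval.log"))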
dataeval/detectors/__init__.py CHANGED
@@ -2,6 +2,6 @@
 Detectors can determine if a dataset or individual images in a dataset are indicative of a specific issue.
 """
 
-from dataeval.detectors import drift, linters, ood
-
 __all__ = ["drift", "linters", "ood"]
+
+from dataeval.detectors import drift, linters, ood
dataeval/detectors/drift/__init__.py CHANGED
@@ -2,19 +2,21 @@
 :term:`Drift` detectors identify if the statistical properties of the data has changed.
 """
 
-from dataeval import _IS_TORCH_AVAILABLE
+__all__ = [
+    "DriftCVM",
+    "DriftKS",
+    "DriftMMD",
+    "DriftMMDOutput",
+    "DriftOutput",
+    "DriftUncertainty",
+    "preprocess_drift",
+    "updates",
+]
+
 from dataeval.detectors.drift import updates
 from dataeval.detectors.drift.base import DriftOutput
 from dataeval.detectors.drift.cvm import DriftCVM
 from dataeval.detectors.drift.ks import DriftKS
-
-__all__ = ["DriftCVM", "DriftKS", "DriftOutput", "updates"]
-
-if _IS_TORCH_AVAILABLE:
-    from dataeval.detectors.drift.mmd import DriftMMD, DriftMMDOutput
-    from dataeval.detectors.drift.torch import preprocess_drift
-    from dataeval.detectors.drift.uncertainty import DriftUncertainty
-
-    __all__ += ["DriftMMD", "DriftMMDOutput", "DriftUncertainty", "preprocess_drift"]
-
-del _IS_TORCH_AVAILABLE
+from dataeval.detectors.drift.mmd import DriftMMD, DriftMMDOutput
+from dataeval.detectors.drift.torch import preprocess_drift
+from dataeval.detectors.drift.uncertainty import DriftUncertainty
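With the _IS_TORCH_AVAILABLE gate removed, the torch-backed detectors now import unconditionally alongside the univariate ones. A sketch of exercising them together, assuming the detectors keep their reference-data constructor and predict() API from prior releases (the arrays and shapes are illustrative):

import numpy as np

from dataeval.detectors.drift import DriftKS, DriftMMD

rng = np.random.default_rng(0)
x_ref = rng.normal(size=(256, 64)).astype(np.float32)            # reference sample
x_test = rng.normal(loc=0.5, size=(256, 64)).astype(np.float32)  # shifted sample

ks = DriftKS(x_ref)
mmd = DriftMMD(x_ref)
# is_drift is assumed from the DriftOutput attributes documented above
print(ks.predict(x_test).is_drift, mmd.predict(x_test).is_drift)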
dataeval/detectors/drift/base.py CHANGED
@@ -8,7 +8,7 @@ Licensed under Apache Software License (Apache 2.0)
 
 from __future__ import annotations
 
-__all__ = ["DriftOutput"]
+__all__ = []
 
 from abc import ABC, abstractmethod
 from dataclasses import dataclass
@@ -45,7 +45,7 @@ class UpdateStrategy(ABC):
 @dataclass(frozen=True)
 class DriftBaseOutput(Output):
     """
-    Base output class for Drift detector classes
+    Base output class for Drift Detector classes
 
     Attributes
     ----------
@@ -64,7 +64,7 @@ class DriftBaseOutput(Output):
 @dataclass(frozen=True)
 class DriftOutput(DriftBaseOutput):
     """
-    Output class for :class:`DriftCVM`, :class:`DriftKS`, and :class:`DriftUncertainty` drift detectors
+    Output class for :class:`DriftCVM`, :class:`DriftKS`, and :class:`DriftUncertainty` drift detectors.
 
     Attributes
     ----------
dataeval/detectors/drift/cvm.py CHANGED
@@ -8,7 +8,7 @@ Licensed under Apache Software License (Apache 2.0)
 
 from __future__ import annotations
 
-__all__ = ["DriftCVM"]
+__all__ = []
 
 from typing import Callable, Literal
 
dataeval/detectors/drift/ks.py CHANGED
@@ -8,7 +8,7 @@ Licensed under Apache Software License (Apache 2.0)
 
 from __future__ import annotations
 
-__all__ = ["DriftKS"]
+__all__ = []
 
 from typing import Callable, Literal
 
@@ -22,7 +22,8 @@ from dataeval.interop import to_numpy
 
 class DriftKS(BaseDriftUnivariate):
     """
-    :term:`Drift` detector employing the Kolmogorov-Smirnov (KS) distribution test.
+    :term:`Drift` detector employing the :term:`Kolmogorov-Smirnov (KS) \
+    distribution<Kolmogorov-Smirnov (K-S) test>` test.
 
     The KS test detects changes in the maximum distance between two data
     distributions with Bonferroni or :term:`False Discovery Rate (FDR)` correction
dataeval/detectors/drift/mmd.py CHANGED
@@ -8,7 +8,7 @@ Licensed under Apache Software License (Apache 2.0)
 
 from __future__ import annotations
 
-__all__ = ["DriftMMD", "DriftMMDOutput"]
+__all__ = []
 
 from dataclasses import dataclass
 from typing import Callable
@@ -17,15 +17,16 @@ import torch
 from numpy.typing import ArrayLike
 
 from dataeval.detectors.drift.base import BaseDrift, DriftBaseOutput, UpdateStrategy, preprocess_x, update_x_ref
-from dataeval.detectors.drift.torch import _GaussianRBF, _mmd2_from_kernel_matrix, get_device
+from dataeval.detectors.drift.torch import GaussianRBF, mmd2_from_kernel_matrix
 from dataeval.interop import as_numpy
 from dataeval.output import set_metadata
+from dataeval.utils.torch.internal import get_device
 
 
 @dataclass(frozen=True)
 class DriftMMDOutput(DriftBaseOutput):
     """
-    Output class for :class:`DriftMMD` :term:`drift<Drift>` detector
+    Output class for :class:`DriftMMD` :term:`drift<Drift>` detector.
 
     Attributes
     ----------
@@ -50,7 +51,8 @@ class DriftMMDOutput(DriftBaseOutput):
 
 class DriftMMD(BaseDrift):
     """
-    :term:`Maximum Mean Discrepancy (MMD) Drift Detection` algorithm using a permutation test.
+    :term:`Maximum Mean Discrepancy (MMD) Drift Detection` algorithm \
+    using a permutation test.
 
     Parameters
     ----------
@@ -109,7 +111,7 @@ class DriftMMD(BaseDrift):
 
         # initialize kernel
         sigma_tensor = torch.from_numpy(as_numpy(sigma)).to(self.device) if sigma is not None else None
-        self._kernel = _GaussianRBF(sigma_tensor).to(self.device)
+        self._kernel = GaussianRBF(sigma_tensor).to(self.device)
 
         # compute kernel matrix for the reference data
         if self._infer_sigma or isinstance(sigma_tensor, torch.Tensor):
@@ -150,9 +152,9 @@ class DriftMMD(BaseDrift):
         n = x.shape[0]
         kernel_mat = self._kernel_matrix(x_ref, torch.from_numpy(x).to(self.device))
         kernel_mat = kernel_mat - torch.diag(kernel_mat.diag())  # zero diagonal
-        mmd2 = _mmd2_from_kernel_matrix(kernel_mat, n, permute=False, zero_diag=False)
+        mmd2 = mmd2_from_kernel_matrix(kernel_mat, n, permute=False, zero_diag=False)
         mmd2_permuted = torch.Tensor(
-            [_mmd2_from_kernel_matrix(kernel_mat, n, permute=True, zero_diag=False) for _ in range(self.n_permutations)]
+            [mmd2_from_kernel_matrix(kernel_mat, n, permute=True, zero_diag=False) for _ in range(self.n_permutations)]
        )
         mmd2, mmd2_permuted = mmd2.detach().cpu(), mmd2_permuted.detach().cpu()
         p_val = (mmd2 <= mmd2_permuted).float().mean()
dataeval/detectors/drift/torch.py CHANGED
@@ -17,10 +17,10 @@ import torch
 import torch.nn as nn
 from numpy.typing import NDArray
 
-from dataeval.utils.torch.utils import get_device, predict_batch
+from dataeval.utils.torch.internal import get_device, predict_batch
 
 
-def _mmd2_from_kernel_matrix(
+def mmd2_from_kernel_matrix(
     kernel_mat: torch.Tensor, m: int, permute: bool = False, zero_diag: bool = True
 ) -> torch.Tensor:
     """
@@ -127,7 +127,7 @@ def _squared_pairwise_distance(
 
 def sigma_median(x: torch.Tensor, y: torch.Tensor, dist: torch.Tensor) -> torch.Tensor:
     """
-    Bandwidth estimation using the median heuristic :cite:t:`Gretton2012`.
+    Bandwidth estimation using the median heuristic `Gretton2012`
 
     Parameters
     ----------
@@ -151,7 +151,7 @@ def sigma_median(x: torch.Tensor, y: torch.Tensor, dist: torch.Tensor) -> torch.
     return sigma
 
 
-class _GaussianRBF(nn.Module):
+class GaussianRBF(nn.Module):
     """
     Gaussian RBF kernel: k(x,y) = exp(-(1/(2*sigma^2)||x-y||^2).
 
@@ -179,18 +179,18 @@ class _GaussianRBF(nn.Module):
     ) -> None:
         super().__init__()
         init_sigma_fn = sigma_median if init_sigma_fn is None else init_sigma_fn
-        self.config = {
+        self.config: dict[str, Any] = {
             "sigma": sigma,
             "trainable": trainable,
             "init_sigma_fn": init_sigma_fn,
         }
         if sigma is None:
-            self.log_sigma = nn.Parameter(torch.empty(1), requires_grad=trainable)
-            self.init_required = True
+            self.log_sigma: nn.Parameter = nn.Parameter(torch.empty(1), requires_grad=trainable)
+            self.init_required: bool = True
         else:
             sigma = sigma.reshape(-1)  # [Ns,]
-            self.log_sigma = nn.Parameter(sigma.log(), requires_grad=trainable)
-            self.init_required = False
+            self.log_sigma: nn.Parameter = nn.Parameter(sigma.log(), requires_grad=trainable)
+            self.init_required: bool = False
         self.init_sigma_fn = init_sigma_fn
         self.trainable = trainable
 
@@ -200,8 +200,8 @@ class _GaussianRBF(nn.Module):
 
     def forward(
         self,
-        x: np.ndarray | torch.Tensor,
-        y: np.ndarray | torch.Tensor,
+        x: np.ndarray[Any, Any] | torch.Tensor,
+        y: np.ndarray[Any, Any] | torch.Tensor,
         infer_sigma: bool = False,
     ) -> torch.Tensor:
         x, y = torch.as_tensor(x), torch.as_tensor(y)
@@ -213,7 +213,7 @@ class _GaussianRBF(nn.Module):
             sigma = self.init_sigma_fn(x, y, dist)
             with torch.no_grad():
                 self.log_sigma.copy_(sigma.log().clone())
-            self.init_required = False
+            self.init_required: bool = False
 
         gamma = 1.0 / (2.0 * self.sigma**2)  # [Ns,]
         # TODO: do matrix multiplication after all?
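GaussianRBF and mmd2_from_kernel_matrix lose their underscore prefixes here, matching the updated imports in mmd.py above. A sketch of computing a squared-MMD estimate directly with these helpers (the shapes and the fixed bandwidth are illustrative, not from the diff):

import torch

from dataeval.detectors.drift.torch import GaussianRBF, mmd2_from_kernel_matrix

x = torch.randn(32, 16)  # reference sample
y = torch.randn(32, 16)  # test sample

kernel = GaussianRBF(sigma=torch.tensor([1.0]))  # fixed bandwidth, no median heuristic

# Full kernel matrix over the stacked samples, as DriftMMD builds internally
z = torch.cat([x, y])
kernel_mat = kernel(z, z)

# m mirrors the n = x.shape[0] argument used in the DriftMMD._kernel_matrix call above
mmd2 = mmd2_from_kernel_matrix(kernel_mat, y.shape[0], permute=False, zero_diag=True)
print(float(mmd2))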
dataeval/detectors/drift/uncertainty.py CHANGED
@@ -8,7 +8,7 @@ Licensed under Apache Software License (Apache 2.0)
 
 from __future__ import annotations
 
-__all__ = ["DriftUncertainty"]
+__all__ = []
 
 from functools import partial
 from typing import Callable, Literal
@@ -20,7 +20,8 @@ from scipy.stats import entropy
 
 from dataeval.detectors.drift.base import DriftOutput, UpdateStrategy
 from dataeval.detectors.drift.ks import DriftKS
-from dataeval.detectors.drift.torch import get_device, preprocess_drift
+from dataeval.detectors.drift.torch import preprocess_drift
+from dataeval.utils.torch.internal import get_device
 
 
 def classifier_uncertainty(
@@ -65,8 +66,8 @@ def classifier_uncertainty(
 
 class DriftUncertainty:
     """
-    Test for a change in the number of instances falling into regions on which the
-    model is uncertain.
+    Test for a change in the number of instances falling into regions on which \
+    the model is uncertain.
 
     Performs a K-S test on prediction entropies.
 
dataeval/detectors/drift/updates.py CHANGED
@@ -1,5 +1,5 @@
 """
-Update strategies inform how the :term:`drift<Drift>` detector classes update the reference data when monitoring
+Update strategies inform how the :term:`drift<Drift>` detector classes update the reference data when monitoring.
 for drift.
 """
 
dataeval/detectors/linters/__init__.py CHANGED
@@ -2,10 +2,6 @@
 Linters help identify potential issues in training and test data and are an important aspect of data cleaning.
 """
 
-from dataeval.detectors.linters.clusterer import Clusterer, ClustererOutput
-from dataeval.detectors.linters.duplicates import Duplicates, DuplicatesOutput
-from dataeval.detectors.linters.outliers import Outliers, OutliersOutput
-
 __all__ = [
     "Clusterer",
     "ClustererOutput",
@@ -14,3 +10,7 @@ __all__ = [
     "Outliers",
     "OutliersOutput",
 ]
+
+from dataeval.detectors.linters.clusterer import Clusterer, ClustererOutput
+from dataeval.detectors.linters.duplicates import Duplicates, DuplicatesOutput
+from dataeval.detectors.linters.outliers import Outliers, OutliersOutput
dataeval/detectors/linters/clusterer.py CHANGED
@@ -1,6 +1,6 @@
 from __future__ import annotations
 
-__all__ = ["ClustererOutput", "Clusterer"]
+__all__ = []
 
 from dataclasses import dataclass
 from typing import Any, Iterable, NamedTuple, cast
@@ -18,7 +18,7 @@ from dataeval.utils.shared import flatten
 @dataclass(frozen=True)
 class ClustererOutput(Output):
     """
-    Output class for :class:`Clusterer` lint detector
+    Output class for :class:`Clusterer` lint detector.
 
     Attributes
     ----------
@@ -131,7 +131,8 @@ class _ClusterMergeEntry:
 
 class Clusterer:
     """
-    Uses hierarchical clustering to flag dataset properties of interest like Outliers and :term:`duplicates<Duplicates>`
+    Uses hierarchical clustering to flag dataset properties of interest like outliers \
+    and :term:`duplicates<Duplicates>`.
 
     Parameters
     ----------
@@ -147,12 +148,6 @@ class Clusterer:
     ----
     The Clusterer works best when the length of the feature dimension, P, is less than 500.
     If flattening a CxHxW image results in a dimension larger than 500, then it is recommended to reduce the dimensions.
-
-    Example
-    -------
-    Initialize the Clusterer class:
-
-    >>> cluster = Clusterer(dataset)
     """
 
     def __init__(self, dataset: ArrayLike) -> None:
@@ -506,6 +501,7 @@ class Clusterer:
 
         Example
         -------
+        >>> cluster = Clusterer(clusterer_images)
         >>> cluster.evaluate()
         ClustererOutput(outliers=[18, 21, 34, 35, 45], potential_outliers=[13, 15, 42], duplicates=[[9, 24], [23, 48]], potential_duplicates=[[1, 11]])
         """  # noqa: E501
dataeval/detectors/linters/duplicates.py CHANGED
@@ -1,6 +1,6 @@
 from __future__ import annotations
 
-__all__ = ["DuplicatesOutput", "Duplicates"]
+__all__ = []
 
 from dataclasses import dataclass
 from typing import Generic, Iterable, Sequence, TypeVar, overload
@@ -19,7 +19,7 @@ TIndexCollection = TypeVar("TIndexCollection", DuplicateGroup, DatasetDuplicateG
 @dataclass(frozen=True)
 class DuplicatesOutput(Generic[TIndexCollection], Output):
     """
-    Output class for :class:`Duplicates` lint detector
+    Output class for :class:`Duplicates` lint detector.
 
     Attributes
     ----------
@@ -39,8 +39,8 @@ class DuplicatesOutput(Generic[TIndexCollection], Output):
 
 class Duplicates:
     """
-    Finds the duplicate images in a dataset using xxhash for exact :term:`duplicates<Duplicates>`
-    and pchash for near duplicates
+    Finds the duplicate images in a dataset using xxhash for exact \
+    :term:`duplicates<Duplicates>` and pchash for near duplicates.
 
     Attributes
     ----------
@@ -51,13 +51,6 @@ class Duplicates:
     ----------
     only_exact : bool, default False
         Only inspect the dataset for exact image matches
-
-    Example
-    -------
-    Initialize the Duplicates class:
-
-    >>> all_dupes = Duplicates()
-    >>> exact_dupes = Duplicates(only_exact=True)
     """
 
     def __init__(self, only_exact: bool = False) -> None:
@@ -73,7 +66,8 @@ class Duplicates:
         if not self.only_exact:
             near_dict: dict[int, list] = {}
             for i, value in enumerate(stats["pchash"]):
-                near_dict.setdefault(value, []).append(i)
+                if value:
+                    near_dict.setdefault(value, []).append(i)
             near = [sorted(v) for v in near_dict.values() if len(v) > 1 and not any(set(v).issubset(x) for x in exact)]
         else:
             near = []
@@ -98,7 +92,7 @@ class Duplicates:
 
         Parameters
         ----------
-        data : HashStatsOutput | Sequence[HashStatsOutput]
+        hashes : HashStatsOutput | Sequence[HashStatsOutput]
             The output(s) from a hashstats analysis
 
         Returns
@@ -112,6 +106,7 @@ class Duplicates:
 
         Example
         -------
+        >>> exact_dupes = Duplicates(only_exact=True)
        >>> exact_dupes.from_stats([hashes1, hashes2])
        DuplicatesOutput(exact=[{0: [3, 20]}, {0: [16], 1: [12]}], near=[])
        """
@@ -159,7 +154,8 @@ class Duplicates:
 
         Example
         -------
-        >>> all_dupes.evaluate(images)
+        >>> all_dupes = Duplicates()
+        >>> all_dupes.evaluate(duplicate_images)
        DuplicatesOutput(exact=[[3, 20], [16, 37]], near=[[3, 20, 22], [12, 18], [13, 36], [14, 31], [17, 27], [19, 38, 47]])
        """  # noqa: E501
        self.stats = hashstats(data)
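The from_stats() docstring above now constructs its own Duplicates instance, and empty pchash values are skipped when grouping near duplicates. A sketch of the two-dataset flow, assuming hashstats() is exported from dataeval.metrics.stats and ds1/ds2 are ArrayLike image sets (names are illustrative):

from dataeval.detectors.linters import Duplicates
from dataeval.metrics.stats import hashstats

hashes1 = hashstats(ds1)  # per-image xxhash/pchash values
hashes2 = hashstats(ds2)

exact_dupes = Duplicates(only_exact=True)
result = exact_dupes.from_stats([hashes1, hashes2])
print(result.exact)  # duplicate groups within and across the two datasets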
dataeval/detectors/linters/outliers.py CHANGED
@@ -1,7 +1,8 @@
 from __future__ import annotations
 
-__all__ = ["OutliersOutput", "Outliers"]
+__all__ = []
 
+# import contextlib
 from dataclasses import dataclass
 from typing import Generic, Iterable, Literal, Sequence, TypeVar, Union, overload
 
@@ -12,19 +13,78 @@ from dataeval.detectors.linters.merged_stats import combine_stats, get_dataset_s
 from dataeval.metrics.stats.base import BOX_COUNT, SOURCE_INDEX
 from dataeval.metrics.stats.datasetstats import DatasetStatsOutput, datasetstats
 from dataeval.metrics.stats.dimensionstats import DimensionStatsOutput
+from dataeval.metrics.stats.labelstats import LabelStatsOutput
 from dataeval.metrics.stats.pixelstats import PixelStatsOutput
 from dataeval.metrics.stats.visualstats import VisualStatsOutput
 from dataeval.output import Output, set_metadata
 
+# with contextlib.suppress(ImportError):
+#     import pandas as pd
+
+
 IndexIssueMap = dict[int, dict[str, float]]
 OutlierStatsOutput = Union[DimensionStatsOutput, PixelStatsOutput, VisualStatsOutput]
 TIndexIssueMap = TypeVar("TIndexIssueMap", IndexIssueMap, list[IndexIssueMap])
 
 
+def _reorganize_by_class_and_metric(result, lstats):
+    """Flip result from grouping by image to grouping by class and metric"""
+    metrics = {}
+    class_wise = {label: {} for label in lstats.image_indices_per_label}
+
+    # Group metrics and calculate class-wise counts
+    for img, group in result.items():
+        for extreme in group:
+            metrics.setdefault(extreme, []).append(img)
+            for label, images in lstats.image_indices_per_label.items():
+                if img in images:
+                    class_wise[label][extreme] = class_wise[label].get(extreme, 0) + 1
+
+    return metrics, class_wise
+
+
+def _create_table(metrics, class_wise):
+    """Create table for displaying the results"""
+    max_class_length = max(len(str(label)) for label in class_wise) + 2
+    max_total = max(len(metrics[group]) for group in metrics) + 2
+
+    table_header = " | ".join(
+        [f"{'Class':>{max_class_length}}"]
+        + [f"{group:^{max(5, len(str(group))) + 2}}" for group in sorted(metrics.keys())]
+        + [f"{'Total':<{max_total}}"]
+    )
+    table_rows = []
+
+    for class_cat, results in class_wise.items():
+        table_value = [f"{class_cat:>{max_class_length}}"]
+        total = 0
+        for group in sorted(metrics.keys()):
+            count = results.get(group, 0)
+            table_value.append(f"{count:^{max(5, len(str(group))) + 2}}")
+            total += count
+        table_value.append(f"{total:^{max_total}}")
+        table_rows.append(" | ".join(table_value))
+
+    table = [table_header] + table_rows
+    return table
+
+
+# def _create_pandas_dataframe(class_wise):
+#     """Create data for pandas dataframe"""
+#     data = []
+#     for label, metrics_dict in class_wise.items():
+#         row = {"Class": label}
+#         total = sum(metrics_dict.values())
+#         row.update(metrics_dict)  # Add metric counts
#         row["Total"] = total
#         data.append(row)
#     return data
+
+
 @dataclass(frozen=True)
 class OutliersOutput(Generic[TIndexIssueMap], Output):
     """
-    Output class for :class:`Outliers` lint detector
+    Output class for :class:`Outliers` lint detector.
 
     Attributes
     ----------
@@ -45,6 +105,39 @@ class OutliersOutput(Generic[TIndexIssueMap], Output):
         else:
             return sum(len(d) for d in self.issues)
 
+    def to_table(self, labelstats: LabelStatsOutput) -> str:
+        if isinstance(self.issues, dict):
+            metrics, classwise = _reorganize_by_class_and_metric(self.issues, labelstats)
+            listed_table = _create_table(metrics, classwise)
+            table = "\n".join(listed_table)
+        else:
+            outertable = []
+            for d in self.issues:
+                metrics, classwise = _reorganize_by_class_and_metric(d, labelstats)
+                listed_table = _create_table(metrics, classwise)
+                str_table = "\n".join(listed_table)
+                outertable.append(str_table)
+            table = "\n\n".join(outertable)
+        return table
+
+    # def to_dataframe(self, labelstats: LabelStatsOutput) -> pd.DataFrame:
+    #     import pandas as pd
+
+    #     if isinstance(self.issues, dict):
+    #         _, classwise = _reorganize_by_class_and_metric(self.issues, labelstats)
+    #         data = _create_pandas_dataframe(classwise)
+    #         df = pd.DataFrame(data)
+    #     else:
+    #         df_list = []
+    #         for i, d in enumerate(self.issues):
+    #             _, classwise = _reorganize_by_class_and_metric(d, labelstats)
+    #             data = _create_pandas_dataframe(classwise)
+    #             single_df = pd.DataFrame(data)
+    #             single_df["Dataset"] = i
+    #             df_list.append(single_df)
+    #         df = pd.concat(df_list)
+    #     return df
+
 
 def _get_outlier_mask(
     values: NDArray, method: Literal["zscore", "modzscore", "iqr"], threshold: float | None
@@ -71,7 +164,7 @@ def _get_outlier_mask(
 
 class Outliers:
     r"""
-    Calculates statistical Outliers of a dataset using various statistical tests applied to each image
+    Calculates statistical outliers of a dataset using various statistical tests applied to each image.
 
     Parameters
     ----------
@@ -164,7 +257,7 @@ class Outliers:
         self, stats: OutlierStatsOutput | DatasetStatsOutput | Sequence[OutlierStatsOutput]
     ) -> OutliersOutput[IndexIssueMap] | OutliersOutput[list[IndexIssueMap]]:
         """
-        Returns indices of Outliers with the issues identified for each
+        Returns indices of Outliers with the issues identified for each.
 
         Parameters
         ----------
@@ -188,6 +281,7 @@ class Outliers:
         -------
         Evaluate the dataset:
 
+        >>> outliers = Outliers(outlier_method="zscore", outlier_threshold=3.5)
        >>> results = outliers.from_stats([stats1, stats2])
        >>> len(results)
        2
@@ -248,7 +342,8 @@ class Outliers:
         -------
         Evaluate the dataset:
 
-        >>> results = outliers.evaluate(images)
+        >>> outliers = Outliers(outlier_method="zscore", outlier_threshold=3.5)
+        >>> results = outliers.evaluate(outlier_images)
        >>> list(results.issues)
        [10, 12]
        >>> results.issues[10]
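The new to_table() method above renders the per-class counts built by _reorganize_by_class_and_metric() and _create_table(). A sketch of pairing it with label statistics, assuming labelstats() is exported from dataeval.metrics.stats and that images/labels stand in for a real dataset:

from dataeval.detectors.linters import Outliers
from dataeval.metrics.stats import labelstats

outliers = Outliers(outlier_method="zscore", outlier_threshold=3.5)
results = outliers.evaluate(images)  # images: ArrayLike of CxHxW images

lstats = labelstats(labels)          # labels: per-image class labels
print(results.to_table(lstats))      # "Class | <metric> ... | Total" rows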
dataeval/detectors/ood/__init__.py CHANGED
@@ -1,15 +1,8 @@
 """
-Out-of-distribution (OOD)` detectors identify data that is different from the data used to train a particular model.
+Out-of-distribution (OOD) detectors identify data that is different from the data used to train a particular model.
 """
 
-from dataeval import _IS_TORCH_AVAILABLE
-from dataeval.detectors.ood.base import OODOutput, OODScoreOutput
+__all__ = ["OODOutput", "OODScoreOutput", "OOD_AE"]
 
-__all__ = ["OODOutput", "OODScoreOutput"]
-
-if _IS_TORCH_AVAILABLE:
-    from dataeval.detectors.ood.ae_torch import OOD_AE
-
-    __all__ += ["OOD_AE"]
-
-del _IS_TORCH_AVAILABLE
+from dataeval.detectors.ood.ae import OOD_AE
+from dataeval.detectors.ood.output import OODOutput, OODScoreOutput
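After this reorganization the public OOD surface is unchanged: OOD_AE moves from ae_torch.py to ae.py, and the output dataclasses move to output.py. A sketch of the import path; the fit/predict calls are shown only as hedged comments because their signatures are not part of this diff:

from dataeval.detectors.ood import OOD_AE, OODOutput, OODScoreOutput

# Construction and training are assumed to follow prior releases (arguments illustrative):
# detector = OOD_AE(model)                    # model: a torch autoencoder
# detector.fit(train_images)                  # fit on in-distribution data
# result: OODOutput = detector.predict(test_images)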