PyPI - dataeval - Versions diffs - 0.69.4__py3-none-any.whl → 0.70.0__py3-none-any.whl - Mend

dataeval 0.69.4__py3-none-any.whl → 0.70.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (34) hide show

dataeval/__init__.py +3 -3
dataeval/_internal/detectors/drift/base.py +5 -6
dataeval/_internal/detectors/drift/mmd.py +3 -3
dataeval/_internal/detectors/duplicates.py +62 -45
dataeval/_internal/detectors/merged_stats.py +23 -54
dataeval/_internal/detectors/ood/ae.py +3 -3
dataeval/_internal/detectors/outliers.py +133 -61
dataeval/_internal/interop.py +11 -7
dataeval/_internal/metrics/balance.py +9 -9
dataeval/_internal/metrics/ber.py +3 -3
dataeval/_internal/metrics/divergence.py +3 -3
dataeval/_internal/metrics/diversity.py +6 -6
dataeval/_internal/metrics/parity.py +24 -16
dataeval/_internal/metrics/stats/base.py +231 -0
dataeval/_internal/metrics/stats/boxratiostats.py +159 -0
dataeval/_internal/metrics/stats/datasetstats.py +97 -0
dataeval/_internal/metrics/stats/dimensionstats.py +111 -0
dataeval/_internal/metrics/stats/hashstats.py +73 -0
dataeval/_internal/metrics/stats/labelstats.py +125 -0
dataeval/_internal/metrics/stats/pixelstats.py +117 -0
dataeval/_internal/metrics/stats/visualstats.py +122 -0
dataeval/_internal/metrics/uap.py +2 -2
dataeval/_internal/metrics/utils.py +28 -13
dataeval/_internal/output.py +3 -18
dataeval/_internal/workflows/sufficiency.py +123 -133
dataeval/metrics/stats/__init__.py +14 -3
dataeval/workflows/__init__.py +2 -2
{dataeval-0.69.4.dist-info → dataeval-0.70.0.dist-info}/METADATA +3 -3
{dataeval-0.69.4.dist-info → dataeval-0.70.0.dist-info}/RECORD +31 -26
{dataeval-0.69.4.dist-info → dataeval-0.70.0.dist-info}/WHEEL +1 -1
dataeval/_internal/flags.py +0 -77
dataeval/_internal/metrics/stats.py +0 -397
dataeval/flags/__init__.py +0 -3
{dataeval-0.69.4.dist-info → dataeval-0.70.0.dist-info}/LICENSE.txt +0 -0

dataeval/__init__.py CHANGED Viewed

@@ -1,4 +1,4 @@
-__version__ = "0.69.4"
+__version__ = "0.70.0"
 from importlib.util import find_spec
@@ -7,9 +7,9 @@ _IS_TENSORFLOW_AVAILABLE = find_spec("tensorflow") is not None and find_spec("te
 del find_spec
-from . import detectors, flags, metrics  # noqa: E402
+from . import detectors, metrics  # noqa: E402
-__all__ = ["detectors", "flags", "metrics"]
+__all__ = ["detectors", "metrics"]
 if _IS_TORCH_AVAILABLE:  # pragma: no cover
     from . import torch, utils, workflows

dataeval/_internal/detectors/drift/base.py CHANGED Viewed

@@ -16,7 +16,7 @@ from typing import Callable, Literal
 import numpy as np
 from numpy.typing import ArrayLike, NDArray
-from dataeval._internal.interop import to_numpy
+from dataeval._internal.interop import as_numpy, to_numpy
 from dataeval._internal.output import OutputMetadata, set_metadata
@@ -234,7 +234,7 @@ class BaseDrift:
         if correction not in ["bonferroni", "fdr"]:
             raise ValueError("`correction` must be `bonferroni` or `fdr`.")
-        self._x_ref = x_ref
+        self._x_ref = to_numpy(x_ref)
         self.x_ref_preprocessed = x_ref_preprocessed
         # Other attributes
@@ -242,7 +242,7 @@ class BaseDrift:
         self.update_x_ref = update_x_ref
         self.preprocess_fn = preprocess_fn
         self.correction = correction
-        self.n = len(self._x_ref)  # type: ignore
+        self.n = len(self._x_ref)
         # Ref counter for preprocessed x
         self._x_refcount = 0
@@ -260,9 +260,8 @@ class BaseDrift:
         if not self.x_ref_preprocessed:
             self.x_ref_preprocessed = True
             if self.preprocess_fn is not None:
-                self._x_ref = self.preprocess_fn(self._x_ref)
+                self._x_ref = as_numpy(self.preprocess_fn(self._x_ref))
-        self._x_ref = to_numpy(self._x_ref)
         return self._x_ref
     def _preprocess(self, x: ArrayLike) -> ArrayLike:
@@ -380,7 +379,7 @@ class BaseDriftUnivariate(BaseDrift):
                 self._n_features = self.x_ref.reshape(self.x_ref.shape[0], -1).shape[-1]
             else:
                 # infer number of features after applying preprocessing step
-                x = to_numpy(self.preprocess_fn(self._x_ref[0:1]))  # type: ignore
+                x = as_numpy(self.preprocess_fn(self._x_ref[0:1]))  # type: ignore
                 self._n_features = x.reshape(x.shape[0], -1).shape[-1]
         return self._n_features

dataeval/_internal/detectors/drift/mmd.py CHANGED Viewed

@@ -14,7 +14,7 @@ from typing import Callable
 import torch
 from numpy.typing import ArrayLike
-from dataeval._internal.interop import to_numpy
+from dataeval._internal.interop import as_numpy
 from dataeval._internal.output import set_metadata
 from .base import BaseDrift, DriftBaseOutput, UpdateStrategy, preprocess_x, update_x_ref
@@ -110,7 +110,7 @@ class DriftMMD(BaseDrift):
         self.device = get_device(device)
         # initialize kernel
-        sigma_tensor = torch.from_numpy(to_numpy(sigma)).to(self.device) if sigma is not None else None
+        sigma_tensor = torch.from_numpy(as_numpy(sigma)).to(self.device) if sigma is not None else None
         self.kernel = kernel(sigma_tensor).to(self.device) if kernel == GaussianRBF else kernel
         # compute kernel matrix for the reference data
@@ -147,7 +147,7 @@ class DriftMMD(BaseDrift):
             p-value obtained from the permutation test, MMD^2 between the reference and test set,
             and MMD^2 threshold above which drift is flagged
         """
-        x = to_numpy(x)
+        x = as_numpy(x)
         x_ref = torch.from_numpy(self.x_ref).to(self.device)
         n = x.shape[0]
         kernel_mat = self._kernel_matrix(x_ref, torch.from_numpy(x).to(self.device))

dataeval/_internal/detectors/duplicates.py CHANGED Viewed

@@ -1,13 +1,12 @@
 from __future__ import annotations
 from dataclasses import dataclass
-from typing import Generic, Iterable, Sequence, TypeVar, cast
+from typing import Generic, Iterable, Sequence, TypeVar
 from numpy.typing import ArrayLike
 from dataeval._internal.detectors.merged_stats import combine_stats, get_dataset_step_from_idx
-from dataeval._internal.flags import ImageStat
-from dataeval._internal.metrics.stats import StatsOutput, imagestats
+from dataeval._internal.metrics.stats.hashstats import HashStatsOutput, hashstats
 from dataeval._internal.output import OutputMetadata, set_metadata
 DuplicateGroup = list[int]
@@ -53,26 +52,23 @@ class Duplicates:
     -------
     Initialize the Duplicates class:
-    >>> dups = Duplicates()
+    >>> all_dupes = Duplicates()
+    >>> exact_dupes = Duplicates(only_exact=True)
     """
     def __init__(self, only_exact: bool = False):
-        self.stats: StatsOutput
+        self.stats: HashStatsOutput
         self.only_exact = only_exact
-    def _get_duplicates(self) -> dict[str, list[list[int]]]:
-        stats_dict = self.stats.dict()
-        if "xxhash" in stats_dict:
-            exact_dict: dict[int, list] = {}
-            for i, value in enumerate(stats_dict["xxhash"]):
-                exact_dict.setdefault(value, []).append(i)
-            exact = [sorted(v) for v in exact_dict.values() if len(v) > 1]
-        else:
-            exact = []
+    def _get_duplicates(self, stats: dict) -> dict[str, list[list[int]]]:
+        exact_dict: dict[int, list] = {}
+        for i, value in enumerate(stats["xxhash"]):
+            exact_dict.setdefault(value, []).append(i)
+        exact = [sorted(v) for v in exact_dict.values() if len(v) > 1]
-        if "pchash" in stats_dict and not self.only_exact:
+        if not self.only_exact:
             near_dict: dict[int, list] = {}
-            for i, value in enumerate(stats_dict["pchash"]):
+            for i, value in enumerate(stats["pchash"]):
                 near_dict.setdefault(value, []).append(i)
             near = [sorted(v) for v in near_dict.values() if len(v) > 1 and not any(set(v).issubset(x) for x in exact)]
         else:
@@ -84,14 +80,14 @@ class Duplicates:
         }
     @set_metadata("dataeval.detectors", ["only_exact"])
-    def evaluate(self, data: Iterable[ArrayLike] | StatsOutput | Sequence[StatsOutput]) -> DuplicatesOutput:
+    def from_stats(self, hashes: HashStatsOutput | Sequence[HashStatsOutput]) -> DuplicatesOutput:
         """
         Returns duplicate image indices for both exact matches and near matches
         Parameters
         ----------
-        data : Iterable[ArrayLike], shape - (N, C, H, W) | StatsOutput | Sequence[StatsOutput]
-            A dataset of images in an ArrayLike format or the output(s) from an imagestats metric analysis
+        data : HashStatsOutput | Sequence[HashStatsOutput]
+            The output(s) from a hashstats analysis
         Returns
         -------
@@ -100,39 +96,60 @@ class Duplicates:
         See Also
         --------
-        imagestats
+        hashstats
         Example
         -------
-        >>> dups.evaluate(images)
-        DuplicatesOutput(exact=[[3, 20], [16, 37]], near=[[3, 20, 22], [12, 18], [13, 36], [14, 31], [17, 27], [19, 38, 47]])
-        """  # noqa: E501
+        >>> exact_dupes.from_stats([hashes1, hashes2])
+        DuplicatesOutput(exact=[{0: [3, 20]}, {0: [16], 1: [12]}], near=[])
+        """
-        stats, dataset_steps = combine_stats(data)
+        if isinstance(hashes, HashStatsOutput):
+            return DuplicatesOutput(**self._get_duplicates(hashes.dict()))
-        if isinstance(stats, StatsOutput):
-            if not stats.xxhash:
-                raise ValueError("StatsOutput must include xxhash information of the images.")
-            if not self.only_exact and not stats.pchash:
-                raise ValueError("StatsOutput must include pchash information of the images for near matches.")
-            self.stats = stats
-        else:
-            flags = ImageStat.XXHASH | (ImageStat(0) if self.only_exact else ImageStat.PCHASH)
-            self.stats = imagestats(cast(Iterable[ArrayLike], data), flags)
+        if not isinstance(hashes, Sequence):
+            raise TypeError("Invalid stats output type; only use output from hashstats.")
-        duplicates = self._get_duplicates()
+        combined, dataset_steps = combine_stats(hashes)
+        duplicates = self._get_duplicates(combined.dict())
         # split up results from combined dataset into individual dataset buckets
-        if dataset_steps:
-            dup_list: list[list[int]]
-            for dup_type, dup_list in duplicates.items():
-                dup_list_dict = []
-                for idxs in dup_list:
-                    dup_dict = {}
-                    for idx in idxs:
-                        k, v = get_dataset_step_from_idx(idx, dataset_steps)
-                        dup_dict.setdefault(k, []).append(v)
-                    dup_list_dict.append(dup_dict)
-                duplicates[dup_type] = dup_list_dict
+        for dup_type, dup_list in duplicates.items():
+            dup_list_dict = []
+            for idxs in dup_list:
+                dup_dict = {}
+                for idx in idxs:
+                    k, v = get_dataset_step_from_idx(idx, dataset_steps)
+                    dup_dict.setdefault(k, []).append(v)
+                dup_list_dict.append(dup_dict)
+            duplicates[dup_type] = dup_list_dict
+        return DuplicatesOutput(**duplicates)
+    @set_metadata("dataeval.detectors", ["only_exact"])
+    def evaluate(self, data: Iterable[ArrayLike]) -> DuplicatesOutput:
+        """
+        Returns duplicate image indices for both exact matches and near matches
+        Parameters
+        ----------
+        data : Iterable[ArrayLike], shape - (N, C, H, W) | StatsOutput | Sequence[StatsOutput]
+            A dataset of images in an ArrayLike format or the output(s) from a hashstats analysis
+        Returns
+        -------
+        DuplicatesOutput
+            List of groups of indices that are exact and near matches
+        See Also
+        --------
+        hashstats
+        Example
+        -------
+        >>> all_dupes.evaluate(images)
+        DuplicatesOutput(exact=[[3, 20], [16, 37]], near=[[3, 20, 22], [12, 18], [13, 36], [14, 31], [17, 27], [19, 38, 47]])
+        """  # noqa: E501
+        self.stats = hashstats(data)
+        duplicates = self._get_duplicates(self.stats.dict())
         return DuplicatesOutput(**duplicates)

dataeval/_internal/detectors/merged_stats.py CHANGED Viewed

@@ -1,71 +1,40 @@
 from __future__ import annotations
-from typing import Sequence, cast
-from warnings import warn
+from copy import deepcopy
+from typing import Sequence, TypeVar
 import numpy as np
-from dataeval._internal.metrics.stats import StatsOutput
-from dataeval._internal.output import populate_defaults
+from dataeval._internal.metrics.stats.base import BaseStatsOutput
+TStatsOutput = TypeVar("TStatsOutput", bound=BaseStatsOutput)
-def add_stats(a: StatsOutput, b: StatsOutput) -> StatsOutput:
-    if not isinstance(a, StatsOutput) or not isinstance(b, StatsOutput):
-        raise TypeError(f"Cannot add object of type {type(a)} and type {type(b)}.")
-    a_dict = a.dict()
-    b_dict = b.dict()
-    a_keys = set(a_dict)
-    b_keys = set(b_dict)
+def add_stats(a: TStatsOutput, b: TStatsOutput) -> TStatsOutput:
+    if type(a) is not type(b):
+        raise TypeError(f"Types {type(a)} and {type(b)} cannot be added.")
-    missing_keys = a_keys - b_keys
-    if missing_keys:
-        raise ValueError(f"Required keys are missing: {missing_keys}.")
+    sum_dict = deepcopy(a.dict())
-    extra_keys = b_keys - a_keys
-    if extra_keys:
-        warn(f"Extraneous keys will be dropped: {extra_keys}.")
+    for k in sum_dict:
+        if isinstance(sum_dict[k], list):
+            sum_dict[k].extend(b.dict()[k])
+        else:
+            sum_dict[k] = np.concatenate((sum_dict[k], b.dict()[k]))
-    # perform add of multi-channel stats
-    if "ch_idx_map" in a_dict:
-        for k, v in a_dict.items():
-            if k == "ch_idx_map":
-                offset = sum([len(idxs) for idxs in v.values()])
-                for ch_k, ch_v in b_dict[k].items():
-                    if ch_k not in v:
-                        v[ch_k] = []
-                    a_dict[k][ch_k].extend([idx + offset for idx in ch_v])
-            else:
-                for ch_k in b_dict[k]:
-                    if ch_k not in v:
-                        v[ch_k] = b_dict[k][ch_k]
-                    else:
-                        v[ch_k] = np.concatenate((v[ch_k], b_dict[k][ch_k]), axis=1)
-    else:
-        for k in a_dict:
-            if isinstance(a_dict[k], list):
-                a_dict[k].extend(b_dict[k])
-            else:
-                a_dict[k] = np.concatenate((a_dict[k], b_dict[k]))
+    return type(a)(**sum_dict)
-    return StatsOutput(**populate_defaults(a_dict, StatsOutput))
-def combine_stats(stats) -> tuple[StatsOutput | None, list[int]]:
-    dataset_steps = []
-    if isinstance(stats, StatsOutput):
-        return stats, dataset_steps
+def combine_stats(stats: Sequence[TStatsOutput]) -> tuple[TStatsOutput, list[int]]:
     output = None
-    if isinstance(stats, Sequence) and isinstance(stats[0], StatsOutput):
-        stats = cast(Sequence[StatsOutput], stats)
-        cur_len = 0
-        for s in stats:
-            output = s if output is None else add_stats(output, s)
-            cur_len += len(s)
-            dataset_steps.append(cur_len)
+    dataset_steps = []
+    cur_len = 0
+    for s in stats:
+        output = s if output is None else add_stats(output, s)
+        cur_len += len(s)
+        dataset_steps.append(cur_len)
+    if output is None:
+        raise TypeError("Cannot combine empty sequence of stats.")
     return output, dataset_steps

dataeval/_internal/detectors/ood/ae.py CHANGED Viewed

@@ -16,7 +16,7 @@ import tensorflow as tf
 from numpy.typing import ArrayLike
 from dataeval._internal.detectors.ood.base import OODBase, OODScore
-from dataeval._internal.interop import to_numpy
+from dataeval._internal.interop import as_numpy
 from dataeval._internal.models.tensorflow.autoencoder import AE
 from dataeval._internal.models.tensorflow.utils import predict_batch
@@ -46,10 +46,10 @@ class OOD_AE(OODBase):
     ) -> None:
         if loss_fn is None:
             loss_fn = keras.losses.MeanSquaredError()
-        super().fit(to_numpy(x_ref), threshold_perc, loss_fn, optimizer, epochs, batch_size, verbose)
+        super().fit(as_numpy(x_ref), threshold_perc, loss_fn, optimizer, epochs, batch_size, verbose)
     def score(self, X: ArrayLike, batch_size: int = int(1e10)) -> OODScore:
-        self._validate(X := to_numpy(X))
+        self._validate(X := as_numpy(X))
         # reconstruct instances
         X_recon = predict_batch(X, self.model, batch_size=batch_size)

dataeval/_internal/detectors/outliers.py CHANGED Viewed

@@ -1,39 +1,45 @@
 from __future__ import annotations
 from dataclasses import dataclass
-from typing import Iterable, Literal, Sequence, cast
-from warnings import warn
+from typing import Generic, Iterable, Literal, Sequence, TypeVar, Union, overload
 import numpy as np
 from numpy.typing import ArrayLike, NDArray
 from dataeval._internal.detectors.merged_stats import combine_stats, get_dataset_step_from_idx
-from dataeval._internal.flags import ImageStat, to_distinct, verify_supported
-from dataeval._internal.metrics.stats import StatsOutput, imagestats
+from dataeval._internal.metrics.stats.base import BOX_COUNT, SOURCE_INDEX
+from dataeval._internal.metrics.stats.datasetstats import DatasetStatsOutput, datasetstats
+from dataeval._internal.metrics.stats.dimensionstats import DimensionStatsOutput
+from dataeval._internal.metrics.stats.pixelstats import PixelStatsOutput
+from dataeval._internal.metrics.stats.visualstats import VisualStatsOutput
 from dataeval._internal.output import OutputMetadata, set_metadata
 IndexIssueMap = dict[int, dict[str, float]]
-DatasetIndexIssueMap = dict[int, IndexIssueMap]
-"""
-Mapping of image indices to a dictionary of issue types and calculated values
-"""
+OutlierStatsOutput = Union[DimensionStatsOutput, PixelStatsOutput, VisualStatsOutput]
+TIndexIssueMap = TypeVar("TIndexIssueMap", IndexIssueMap, list[IndexIssueMap])
 @dataclass(frozen=True)
-class OutliersOutput(OutputMetadata):
+class OutliersOutput(Generic[TIndexIssueMap], OutputMetadata):
     """
     Attributes
     ----------
-    issues : dict[int, dict[str, float]] | dict[int, dict[int, dict[str, float]]]
+    issues : dict[int, dict[str, float]] | list[dict[int, dict[str, float]]]
         Indices of image outliers with their associated issue type and calculated values.
     - For a single dataset, a dictionary containing the indices of outliers and
       a dictionary showing the issues and calculated values for the given index.
-    - For multiple datasets, a map of dataset indices to the indices of outliers
-      and their associated issues and calculated values.
+    - For multiple stats outputs, a list of dictionaries containing the indices of
+      outliers and their associated issues and calculated values.
     """
-    issues: IndexIssueMap | DatasetIndexIssueMap
+    issues: TIndexIssueMap
+    def __len__(self):
+        if isinstance(self.issues, dict):
+            return len(self.issues)
+        else:
+            return sum(len(d) for d in self.issues)
 def _get_outlier_mask(
@@ -43,7 +49,7 @@ def _get_outlier_mask(
         threshold = threshold if threshold else 3.0
         std = np.std(values)
         abs_diff = np.abs(values - np.mean(values))
-        return (abs_diff / std) > threshold
+        return std != 0 and (abs_diff / std) > threshold
     elif method == "modzscore":
         threshold = threshold if threshold else 3.5
         abs_diff = np.abs(values - np.median(values))
@@ -65,9 +71,6 @@ class Outliers:
     Parameters
     ----------
-    flags : ImageStat, default ImageStat.ALL_PROPERTIES | ImageStat.ALL_VISUALS
-        Metric(s) to calculate for each image - calculates all metrics if None
-        Only supports ImageStat.ALL_STATS
     outlier_method : ["modzscore" | "zscore" | "iqr"], optional - default "modzscore"
         Statistical method used to identify outliers
     outlier_threshold : float, optional - default None
@@ -76,8 +79,8 @@ class Outliers:
     Attributes
     ----------
-    stats : dict[str, Any]
-        Dictionary to hold the value of each metric for each image
+    stats : tuple[DimensionStatsOutput, PixelStatsOutput, VisualStatsOutput]
+        Various stats output classes that hold the value of each metric for each image
     See Also
     --------
@@ -109,52 +112,61 @@ class Outliers:
     >>> outliers = Outliers()
-    Specifying specific metrics to analyze:
-    >>> outliers = Outliers(flags=ImageStat.SIZE | ImageStat.ALL_VISUALS)
     Specifying an outlier method:
     >>> outliers = Outliers(outlier_method="iqr")
     Specifying an outlier method and threshold:
-    >>> outliers = Outliers(outlier_method="zscore", outlier_threshold=2.75)
+    >>> outliers = Outliers(outlier_method="zscore", outlier_threshold=3.5)
     """
     def __init__(
         self,
-        flags: ImageStat = ImageStat.ALL_PROPERTIES | ImageStat.ALL_VISUALS,
+        use_dimension: bool = True,
+        use_pixel: bool = True,
+        use_visual: bool = True,
         outlier_method: Literal["zscore", "modzscore", "iqr"] = "modzscore",
         outlier_threshold: float | None = None,
     ):
-        verify_supported(flags, ImageStat.ALL_STATS)
-        self.flags = flags
+        self.stats: DatasetStatsOutput
+        self.use_dimension = use_dimension
+        self.use_pixel = use_pixel
+        self.use_visual = use_visual
         self.outlier_method: Literal["zscore", "modzscore", "iqr"] = outlier_method
         self.outlier_threshold = outlier_threshold
-    def _get_outliers(self) -> dict:
-        flagged_images = {}
-        stats_dict = self.stats.dict()
-        supported = to_distinct(ImageStat.ALL_STATS)
-        for stat, values in stats_dict.items():
-            if stat in supported.values() and values.ndim == 1 and np.std(values) != 0:
-                mask = _get_outlier_mask(values, self.outlier_method, self.outlier_threshold)
+    def _get_outliers(self, stats: dict) -> dict[int, dict[str, float]]:
+        flagged_images: dict[int, dict[str, float]] = {}
+        for stat, values in stats.items():
+            if stat in (SOURCE_INDEX, BOX_COUNT):
+                continue
+            if values.ndim == 1:
+                mask = _get_outlier_mask(values.astype(np.float64), self.outlier_method, self.outlier_threshold)
                 indices = np.flatnonzero(mask)
                 for i, value in zip(indices, values[mask]):
-                    flagged_images.setdefault(i, {}).update({stat: np.round(value, 2)})
+                    flagged_images.setdefault(i, {}).update({stat: value})
         return dict(sorted(flagged_images.items()))
-    @set_metadata("dataeval.detectors", ["flags", "outlier_method", "outlier_threshold"])
-    def evaluate(self, data: Iterable[ArrayLike] | StatsOutput | Sequence[StatsOutput]) -> OutliersOutput:
+    @overload
+    def from_stats(self, stats: OutlierStatsOutput | DatasetStatsOutput) -> OutliersOutput[IndexIssueMap]: ...
+    @overload
+    def from_stats(self, stats: Sequence[OutlierStatsOutput]) -> OutliersOutput[list[IndexIssueMap]]: ...
+    @set_metadata("dataeval.detectors", ["outlier_method", "outlier_threshold"])
+    def from_stats(
+        self, stats: OutlierStatsOutput | DatasetStatsOutput | Sequence[OutlierStatsOutput]
+    ) -> OutliersOutput:
         """
         Returns indices of outliers with the issues identified for each
         Parameters
         ----------
-        data : Iterable[ArrayLike], shape - (C, H, W) | StatsOutput | Sequence[StatsOutput]
-            A dataset of images in an ArrayLike format or the output(s) from an imagestats metric analysis
+        stats : OutlierStatsOutput | DatasetStatsOutput | Sequence[OutlierStatsOutput]
+            The output(s) from a dimensionstats, pixelstats, or visualstats metric
+            analysis or an aggregate DatasetStatsOutput
         Returns
         -------
@@ -162,36 +174,96 @@ class Outliers:
             Output class containing the indices of outliers and a dictionary showing
             the issues and calculated values for the given index.
+        See Also
+        --------
+        dimensionstats
+        pixelstats
+        visualstats
         Example
         -------
         Evaluate the dataset:
-        >>> outliers.evaluate(images)
-        OutliersOutput(issues={10: {'blurriness': 1.26, 'contrast': 1.06, 'zeros': 0.05}, 12: {'blurriness': 1.51, 'contrast': 1.06, 'zeros': 0.05}})
+        >>> results = outliers.from_stats([stats1, stats2])
+        >>> len(results)
+        2
+        >>> results.issues[0]
+        {10: {'skew': -3.906, 'kurtosis': 13.266, 'entropy': 0.2128}, 12: {'std': 0.00536, 'var': 2.87e-05, 'skew': -3.906, 'kurtosis': 13.266, 'entropy': 0.2128}}
+        >>> results.issues[1]
+        {}
         """  # noqa: E501
-        stats, dataset_steps = combine_stats(data)
-        if isinstance(stats, StatsOutput):
-            selected_flags = set(to_distinct(self.flags).values())
-            provided = set(stats.dict())
-            missing = selected_flags - provided
-            if missing:
-                warn(
-                    f"StatsOutput provided {provided} and is missing {missing} \
-                        from the selected stat flags: {selected_flags}."
+        if isinstance(stats, DatasetStatsOutput):
+            outliers = self._get_outliers({k: v for o in stats.outputs() for k, v in o.dict().items()})
+            return OutliersOutput(outliers)
+        if isinstance(stats, (DimensionStatsOutput, PixelStatsOutput, VisualStatsOutput)):
+            return OutliersOutput(self._get_outliers(stats.dict()))
+        if not isinstance(stats, Sequence):
+            raise TypeError(
+                "Invalid stats output type; only use output from dimensionstats, pixelstats or visualstats."
+            )
+        stats_map: dict[type, list[int]] = {}
+        for i, stats_output in enumerate(stats):
+            if not isinstance(
+                stats_output, (DatasetStatsOutput, DimensionStatsOutput, PixelStatsOutput, VisualStatsOutput)
+            ):
+                raise TypeError(
+                    "Invalid stats output type; only use output from dimensionstats, pixelstats or visualstats."
                 )
-            self.stats = stats
-        else:
-            self.stats = imagestats(cast(Iterable[ArrayLike], data), self.flags)
-        outliers = self._get_outliers()
+            stats_map.setdefault(type(stats_output), []).append(i)
-        # split up results from combined dataset into individual dataset buckets
-        if dataset_steps:
-            out_dict = {}
+        output_list: list[dict[int, dict[str, float]]] = [{} for _ in stats]
+        for _, indices in stats_map.items():
+            substats, dataset_steps = combine_stats([stats[i] for i in indices])
+            outliers = self._get_outliers(substats.dict())
             for idx, issue in outliers.items():
                 k, v = get_dataset_step_from_idx(idx, dataset_steps)
-                out_dict.setdefault(k, {})[v] = issue
-            outliers = out_dict
+                output_list[indices[k]][v] = issue
+        return OutliersOutput(output_list)
+    @set_metadata(
+        "dataeval.detectors",
+        [
+            "use_dimension",
+            "use_pixel",
+            "use_visual",
+            "outlier_method",
+            "outlier_threshold",
+        ],
+    )
+    def evaluate(self, data: Iterable[ArrayLike]) -> OutliersOutput[IndexIssueMap]:
+        """
+        Returns indices of outliers with the issues identified for each
+        Parameters
+        ----------
+        data : Iterable[ArrayLike], shape - (C, H, W)
+            A dataset of images in an ArrayLike format
+        Returns
+        -------
+        OutliersOutput
+            Output class containing the indices of outliers and a dictionary showing
+            the issues and calculated values for the given index.
+        Example
+        -------
+        Evaluate the dataset:
+        >>> results = outliers.evaluate(images)
+        >>> list(results.issues)
+        [10, 12]
+        >>> results.issues[10]
+        {'skew': -3.906, 'kurtosis': 13.266, 'entropy': 0.2128, 'contrast': 1.25, 'zeros': 0.05493}
+        """
+        self.stats = datasetstats(
+            images=data,
+            use_dimension=self.use_dimension,
+            use_pixel=self.use_pixel,
+            use_visual=self.use_visual,
+        )
+        outliers = self._get_outliers({k: v for o in self.stats.outputs() for k, v in o.dict().items()})
         return OutliersOutput(outliers)

dataeval 0.69.4__py3-none-any.whl → 0.70.0__py3-none-any.whl

dataeval 0.69.4__py3-none-any.whl → 0.70.0__py3-none-any.whl