PyPI - dataeval - Versions diffs - 0.72.1__py3-none-any.whl → 0.73.0__py3-none-any.whl - Mend

dataeval 0.72.1__py3-none-any.whl → 0.73.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (95) hide show

dataeval/__init__.py +4 -4
dataeval/detectors/__init__.py +4 -3
dataeval/detectors/drift/__init__.py +9 -10
dataeval/{_internal/detectors → detectors}/drift/base.py +39 -91
dataeval/{_internal/detectors → detectors}/drift/cvm.py +4 -3
dataeval/{_internal/detectors → detectors}/drift/ks.py +4 -3
dataeval/{_internal/detectors → detectors}/drift/mmd.py +23 -25
dataeval/{_internal/detectors → detectors}/drift/torch.py +13 -11
dataeval/{_internal/detectors → detectors}/drift/uncertainty.py +7 -5
dataeval/detectors/drift/updates.py +61 -0
dataeval/detectors/linters/__init__.py +3 -3
dataeval/{_internal/detectors → detectors/linters}/clusterer.py +41 -39
dataeval/{_internal/detectors → detectors/linters}/duplicates.py +19 -9
dataeval/{_internal/detectors → detectors/linters}/merged_stats.py +3 -1
dataeval/{_internal/detectors → detectors/linters}/outliers.py +14 -21
dataeval/detectors/ood/__init__.py +6 -6
dataeval/{_internal/detectors → detectors}/ood/ae.py +20 -12
dataeval/detectors/ood/aegmm.py +66 -0
dataeval/{_internal/detectors → detectors}/ood/base.py +33 -21
dataeval/{_internal/detectors → detectors}/ood/llr.py +43 -33
dataeval/detectors/ood/metadata_ks_compare.py +99 -0
dataeval/detectors/ood/metadata_least_likely.py +119 -0
dataeval/detectors/ood/metadata_ood_mi.py +92 -0
dataeval/{_internal/detectors → detectors}/ood/vae.py +23 -17
dataeval/detectors/ood/vaegmm.py +75 -0
dataeval/interop.py +56 -0
dataeval/metrics/__init__.py +1 -1
dataeval/metrics/bias/__init__.py +4 -4
dataeval/{_internal/metrics → metrics/bias}/balance.py +75 -13
dataeval/{_internal/metrics → metrics/bias}/coverage.py +41 -7
dataeval/{_internal/metrics → metrics/bias}/diversity.py +75 -18
dataeval/metrics/bias/metadata.py +358 -0
dataeval/{_internal/metrics → metrics/bias}/parity.py +54 -44
dataeval/metrics/estimators/__init__.py +3 -3
dataeval/{_internal/metrics → metrics/estimators}/ber.py +25 -22
dataeval/{_internal/metrics → metrics/estimators}/divergence.py +11 -12
dataeval/{_internal/metrics → metrics/estimators}/uap.py +5 -3
dataeval/metrics/stats/__init__.py +7 -7
dataeval/{_internal/metrics → metrics}/stats/base.py +59 -35
dataeval/{_internal/metrics → metrics}/stats/boxratiostats.py +18 -14
dataeval/{_internal/metrics → metrics}/stats/datasetstats.py +18 -16
dataeval/{_internal/metrics → metrics}/stats/dimensionstats.py +9 -7
dataeval/metrics/stats/hashstats.py +156 -0
dataeval/{_internal/metrics → metrics}/stats/labelstats.py +5 -3
dataeval/{_internal/metrics → metrics}/stats/pixelstats.py +9 -8
dataeval/{_internal/metrics → metrics}/stats/visualstats.py +10 -9
dataeval/{_internal/output.py → output.py} +26 -6
dataeval/utils/__init__.py +8 -3
dataeval/utils/image.py +71 -0
dataeval/utils/lazy.py +26 -0
dataeval/utils/metadata.py +258 -0
dataeval/utils/shared.py +151 -0
dataeval/{_internal → utils}/split_dataset.py +98 -33
dataeval/utils/tensorflow/__init__.py +7 -6
dataeval/{_internal/models/tensorflow → utils/tensorflow/_internal}/gmm.py +8 -2
dataeval/{_internal/models/tensorflow/losses.py → utils/tensorflow/_internal/loss.py} +28 -18
dataeval/{_internal/models/tensorflow/pixelcnn.py → utils/tensorflow/_internal/models.py} +387 -97
dataeval/{_internal/models/tensorflow → utils/tensorflow/_internal}/trainer.py +15 -6
dataeval/{_internal/models/tensorflow → utils/tensorflow/_internal}/utils.py +84 -85
dataeval/utils/tensorflow/loss/__init__.py +6 -2
dataeval/utils/torch/__init__.py +7 -3
dataeval/{_internal/models/pytorch → utils/torch}/blocks.py +19 -14
dataeval/{_internal → utils/torch}/datasets.py +48 -42
dataeval/utils/torch/models.py +138 -0
dataeval/{_internal/models/pytorch/autoencoder.py → utils/torch/trainer.py} +7 -136
dataeval/{_internal → utils/torch}/utils.py +3 -1
dataeval/workflows/__init__.py +1 -1
dataeval/{_internal/workflows → workflows}/sufficiency.py +39 -34
{dataeval-0.72.1.dist-info → dataeval-0.73.0.dist-info}/METADATA +4 -3
dataeval-0.73.0.dist-info/RECORD +73 -0
dataeval/_internal/detectors/__init__.py +0 -0
dataeval/_internal/detectors/drift/__init__.py +0 -0
dataeval/_internal/detectors/ood/__init__.py +0 -0
dataeval/_internal/detectors/ood/aegmm.py +0 -78
dataeval/_internal/detectors/ood/vaegmm.py +0 -89
dataeval/_internal/interop.py +0 -49
dataeval/_internal/metrics/__init__.py +0 -0
dataeval/_internal/metrics/stats/hashstats.py +0 -75
dataeval/_internal/metrics/utils.py +0 -447
dataeval/_internal/models/__init__.py +0 -0
dataeval/_internal/models/pytorch/__init__.py +0 -0
dataeval/_internal/models/pytorch/utils.py +0 -67
dataeval/_internal/models/tensorflow/__init__.py +0 -0
dataeval/_internal/models/tensorflow/autoencoder.py +0 -320
dataeval/_internal/workflows/__init__.py +0 -0
dataeval/detectors/drift/kernels/__init__.py +0 -10
dataeval/detectors/drift/updates/__init__.py +0 -8
dataeval/utils/tensorflow/models/__init__.py +0 -9
dataeval/utils/tensorflow/recon/__init__.py +0 -3
dataeval/utils/torch/datasets/__init__.py +0 -12
dataeval/utils/torch/models/__init__.py +0 -11
dataeval/utils/torch/trainer/__init__.py +0 -7
dataeval-0.72.1.dist-info/RECORD +0 -81
{dataeval-0.72.1.dist-info → dataeval-0.73.0.dist-info}/LICENSE.txt +0 -0
{dataeval-0.72.1.dist-info → dataeval-0.73.0.dist-info}/WHEEL +0 -0

dataeval/metrics/stats/hashstats.py ADDED Viewed

@@ -0,0 +1,156 @@
+from __future__ import annotations
+__all__ = ["HashStatsOutput", "hashstats"]
+from dataclasses import dataclass
+from typing import Callable, Iterable
+import numpy as np
+import xxhash as xxh
+from numpy.typing import ArrayLike
+from PIL import Image
+from scipy.fftpack import dct
+from dataeval.interop import as_numpy
+from dataeval.metrics.stats.base import BaseStatsOutput, StatsProcessor, run_stats
+from dataeval.output import set_metadata
+from dataeval.utils.image import normalize_image_shape, rescale
+# Side length of the low-frequency DCT corner kept by pchash (HASH_SIZE x HASH_SIZE bits)
+HASH_SIZE = 8
+# Cap on the resize multiplier in pchash: images are resampled to at most HASH_SIZE * MAX_FACTOR per side
+MAX_FACTOR = 4
+@dataclass(frozen=True)
+class HashStatsOutput(BaseStatsOutput):
+    """
+    Output class for :func:`hashstats` stats metric
+    Attributes
+    ----------
+    xxhash : List[str]
+        xxHash hash of the images as a hex string
+    pchash : List[str]
+        :term:`Perception-based Hash` of the images as a hex string
+    """
+    # Field order determines the generated dataclass constructor signature; keep it stable.
+    xxhash: list[str]
+    pchash: list[str]
+def pchash(image: ArrayLike) -> str:
+    """
+    Computes a perceptual (fuzzy) hash of an image.
+    The image is averaged over its channels, rescaled to 8-bit and resampled
+    with the Lanczos filter to a square of side N, where N is the largest
+    multiple of ``HASH_SIZE`` strictly smaller than the shortest of the last
+    two dimensions, capped at ``HASH_SIZE * MAX_FACTOR``.  A discrete cosine
+    transform is applied along both axes and the ``HASH_SIZE`` x ``HASH_SIZE``
+    low-frequency corner is thresholded against its median; the resulting
+    bits are returned as a hex string.
+    Parameters
+    ----------
+    image : ArrayLike
+        An image as a numpy array in CxHxW format
+    Returns
+    -------
+    str
+        The hex string hash of the image using perceptual hashing
+    Raises
+    ------
+    ValueError
+        If the shortest of the last two dimensions is not larger than ``HASH_SIZE``
+    """
+    pixels = as_numpy(image)
+    shortest_side = min(pixels.shape[-2:])
+    # Reject anything that is not strictly larger than the hash grid itself
+    if shortest_side <= HASH_SIZE:
+        raise ValueError(f"Image must be larger than {HASH_SIZE}x{HASH_SIZE} for fuzzy hashing.")
+    # Side length of the square the image is resampled to
+    factor = min((shortest_side - 1) // HASH_SIZE, MAX_FACTOR)
+    side = HASH_SIZE * factor
+    # Collapse to a single plane by averaging over the channel axis
+    grayscale = np.mean(normalize_image_shape(pixels), axis=0).squeeze()
+    # Rescale the pixel values to 8-bit before handing the array to PIL
+    as_uint8 = rescale(grayscale, 8).astype(np.uint8)
+    resampled = Image.fromarray(as_uint8).resize((side, side), Image.Resampling.LANCZOS)
+    im = np.array(resampled)
+    # DCT along one axis, then the other; keep only the low-frequency corner
+    first_pass = dct(im.T).T
+    low_freq = dct(first_pass)[:HASH_SIZE, :HASH_SIZE]
+    # One bit per coefficient: True where it exceeds the median
+    bits = low_freq > np.median(low_freq)
+    # Left-pad with False so the bit count is a multiple of 8 before packing
+    padded_len = int(np.ceil(bits.size / 8) * 8)
+    padded = np.zeros(padded_len, dtype=bool)
+    padded[-bits.size :] = bits.ravel()
+    # Hex-encode the packed bits and drop leading zeros; an all-zero hash becomes "0"
+    digest = np.packbits(padded).tobytes().hex().lstrip("0")
+    return digest or "0"
+def xxhash(image: ArrayLike) -> str:
+    """
+    Computes an exact-match hash of an image with the non-cryptographic
+    xxHash algorithm (xxhash.com).  The image is flattened and its raw bytes
+    are hashed with the 64-bit XXH3 variant.
+    Parameters
+    ----------
+    image : ArrayLike
+        An image as a numpy array
+    Returns
+    -------
+    str
+        The hex string hash of the image using the xxHash algorithm
+    """
+    raw_bytes = as_numpy(image).ravel().tobytes()
+    return xxh.xxh3_64_hexdigest(raw_bytes)
+class HashStatsProcessor(StatsProcessor[HashStatsOutput]):
+    # Declares the per-image hash functions that hashstats hands to run_stats
+    output_class: type = HashStatsOutput
+    # Keys match the HashStatsOutput field names; each callable reads the processor's image
+    image_function_map: dict[str, Callable[[StatsProcessor[HashStatsOutput]], str]] = dict(
+        xxhash=lambda proc: xxhash(proc.image),
+        pchash=lambda proc: pchash(proc.image),
+    )
+@set_metadata()
+def hashstats(
+    images: Iterable[ArrayLike],
+    bboxes: Iterable[ArrayLike] | None = None,
+) -> HashStatsOutput:
+    """
+    Calculates hashes for each image
+    Both an exact hash (xxHash) and a perception-based hash are computed for every
+    image. The hash values can be used to determine if images are exact or near matches.
+    Parameters
+    ----------
+    images : Iterable[ArrayLike]
+        Images to hash
+    bboxes : Iterable[ArrayLike] or None
+        Bounding boxes in `xyxy` format for each image
+    Returns
+    -------
+    HashStatsOutput
+        A dictionary-like object containing the computed hashes for each image.
+    See Also
+    --------
+    Duplicates
+    Examples
+    --------
+    Calculating the statistics on the images, whose shape is (C, H, W)
+    >>> results = hashstats(images)
+    >>> print(results.xxhash)
+    ['a72434443d6e7336', 'efc12c2f14581d79', '4a1e03483a27d674', '3a3ecedbcf814226']
+    >>> print(results.pchash)
+    ['8f25506af46a7c6a', '8000808000008080', '8e71f18e0ef18e0e', 'a956d6a956d6a928']
+    """
+    # Only one processor class is passed in, so its output is the first entry of the result
+    outputs = run_stats(images, bboxes, False, [HashStatsProcessor])
+    return outputs[0]

dataeval/{_internal/metrics → metrics}/stats/labelstats.py RENAMED Viewed

@@ -1,13 +1,15 @@
 from __future__ import annotations
+__all__ = ["LabelStatsOutput", "labelstats"]
 from collections import Counter, defaultdict
 from dataclasses import dataclass
 from typing import Any, Iterable, Mapping, TypeVar
 from numpy.typing import ArrayLike
-from dataeval._internal.interop import to_numpy
-from dataeval._internal.output import OutputMetadata, set_metadata
+from dataeval.interop import to_numpy
+from dataeval.output import OutputMetadata, set_metadata
 @dataclass(frozen=True)
@@ -55,7 +57,7 @@ def sort(d: Mapping[TKey, Any]) -> dict[TKey, Any]:
     return dict(sorted(d.items(), key=lambda x: x[0]))
-@set_metadata("dataeval.metrics")
+@set_metadata()
 def labelstats(
     labels: Iterable[ArrayLike],
 ) -> LabelStatsOutput:

dataeval/{_internal/metrics → metrics}/stats/pixelstats.py RENAMED Viewed

@@ -1,14 +1,16 @@
 from __future__ import annotations
+__all__ = ["PixelStatsOutput", "pixelstats"]
 from dataclasses import dataclass
-from typing import Iterable
+from typing import Any, Callable, Iterable
 import numpy as np
 from numpy.typing import ArrayLike, NDArray
 from scipy.stats import entropy, kurtosis, skew
-from dataeval._internal.metrics.stats.base import BaseStatsOutput, StatsProcessor, run_stats
-from dataeval._internal.output import set_metadata
+from dataeval.metrics.stats.base import BaseStatsOutput, StatsProcessor, run_stats
+from dataeval.output import set_metadata
 @dataclass(frozen=True)
@@ -44,9 +46,8 @@ class PixelStatsOutput(BaseStatsOutput):
 class PixelStatsProcessor(StatsProcessor[PixelStatsOutput]):
-    output_class = PixelStatsOutput
-    cache_keys = ["histogram"]
-    image_function_map = {
+    output_class: type = PixelStatsOutput
+    image_function_map: dict[str, Callable[[StatsProcessor[PixelStatsOutput]], Any]] = {
         "mean": lambda self: np.mean(self.scaled),
         "std": lambda x: np.std(x.scaled),
         "var": lambda x: np.var(x.scaled),
@@ -55,7 +56,7 @@ class PixelStatsProcessor(StatsProcessor[PixelStatsOutput]):
         "histogram": lambda x: np.histogram(x.scaled, 256, (0, 1))[0],
         "entropy": lambda x: entropy(x.get("histogram")),
     }
-    channel_function_map = {
+    channel_function_map: dict[str, Callable[[StatsProcessor[PixelStatsOutput]], Any]] = {
         "mean": lambda x: np.mean(x.scaled, axis=1),
         "std": lambda x: np.std(x.scaled, axis=1),
         "var": lambda x: np.var(x.scaled, axis=1),
@@ -66,7 +67,7 @@ class PixelStatsProcessor(StatsProcessor[PixelStatsOutput]):
     }
-@set_metadata("dataeval.metrics")
+@set_metadata()
 def pixelstats(
     images: Iterable[ArrayLike],
     bboxes: Iterable[ArrayLike] | None = None,

dataeval/{_internal/metrics → metrics}/stats/visualstats.py RENAMED Viewed

@@ -1,14 +1,16 @@
 from __future__ import annotations
+__all__ = ["VisualStatsOutput", "visualstats"]
 from dataclasses import dataclass
-from typing import Iterable
+from typing import Any, Callable, Iterable
 import numpy as np
 from numpy.typing import ArrayLike, NDArray
-from dataeval._internal.metrics.stats.base import BaseStatsOutput, StatsProcessor, run_stats
-from dataeval._internal.metrics.utils import edge_filter
-from dataeval._internal.output import set_metadata
+from dataeval.metrics.stats.base import BaseStatsOutput, StatsProcessor, run_stats
+from dataeval.output import set_metadata
+from dataeval.utils.image import edge_filter
 QUARTILES = (0, 25, 50, 75, 100)
@@ -46,9 +48,8 @@ class VisualStatsOutput(BaseStatsOutput):
 class VisualStatsProcessor(StatsProcessor[VisualStatsOutput]):
-    output_class = VisualStatsOutput
-    cache_keys = ["percentiles"]
-    image_function_map = {
+    output_class: type = VisualStatsOutput
+    image_function_map: dict[str, Callable[[StatsProcessor[VisualStatsOutput]], Any]] = {
         "brightness": lambda x: x.get("percentiles")[1],
         "contrast": lambda x: np.nan_to_num(
             (np.max(x.get("percentiles")) - np.min(x.get("percentiles"))) / np.mean(x.get("percentiles"))
@@ -59,7 +60,7 @@ class VisualStatsProcessor(StatsProcessor[VisualStatsOutput]):
         "zeros": lambda x: np.count_nonzero(np.sum(x.image, axis=0) == 0) / np.prod(x.shape[-2:]),
         "percentiles": lambda x: np.nanpercentile(x.scaled, q=QUARTILES),
     }
-    channel_function_map = {
+    channel_function_map: dict[str, Callable[[StatsProcessor[VisualStatsOutput]], Any]] = {
         "brightness": lambda x: x.get("percentiles")[:, 1],
         "contrast": lambda x: np.nan_to_num(
             (np.max(x.get("percentiles"), axis=1) - np.min(x.get("percentiles"), axis=1))
@@ -73,7 +74,7 @@ class VisualStatsProcessor(StatsProcessor[VisualStatsOutput]):
     }
-@set_metadata("dataeval.metrics")
+@set_metadata()
 def visualstats(
     images: Iterable[ArrayLike],
     bboxes: Iterable[ArrayLike] | None = None,

dataeval/{_internal/output.py → output.py} RENAMED Viewed

@@ -1,12 +1,20 @@
 from __future__ import annotations
+__all__ = []
 import inspect
+import sys
 from datetime import datetime, timezone
 from functools import wraps
-from typing import Any
+from typing import Any, Callable, Iterable, TypeVar
 import numpy as np
+if sys.version_info >= (3, 10):
+    from typing import ParamSpec
+else:
+    from typing_extensions import ParamSpec
 from dataeval import __version__
@@ -25,10 +33,18 @@ class OutputMetadata:
         return {k.removeprefix("_"): v for k, v in self.__dict__.items() if k.startswith("_")}
-def set_metadata(module_name: str = "", state_attr: list[str] | None = None):
-    def decorator(fn):
+P = ParamSpec("P")
+R = TypeVar("R", bound=OutputMetadata)
+def set_metadata(
+    state_attr: Iterable[str] | None = None,
+) -> Callable[[Callable[P, R]], Callable[P, R]]:
+    """Decorator to stamp OutputMetadata classes with runtime metadata"""
+    def decorator(fn: Callable[P, R]) -> Callable[P, R]:
         @wraps(fn)
-        def wrapper(*args, **kwargs):
+        def wrapper(*args: P.args, **kwargs: P.kwargs) -> R:
             def fmt(v):
                 if np.isscalar(v):
                     return v
@@ -52,9 +68,13 @@ def set_metadata(module_name: str = "", state_attr: list[str] | None = None):
                 if "self" in arguments and state_attr
                 else {}
             )
-            name = args[0].__class__.__name__ if "self" in arguments else fn.__name__
+            name = (
+                f"{args[0].__class__.__module__}.{args[0].__class__.__name__}.{fn.__name__}"
+                if "self" in arguments
+                else f"{fn.__module__}.{fn.__qualname__}"
+            )
             metadata = {
-                "_name": f"{module_name}.{name}",
+                "_name": name,
                 "_execution_time": time,
                 "_execution_duration": duration,
                 "_arguments": {k: v for k, v in arguments.items() if k != "self"},

dataeval/utils/__init__.py CHANGED Viewed

@@ -5,15 +5,20 @@ metrics. Currently DataEval supports both :term:`TensorFlow` and PyTorch backend
 """
 from dataeval import _IS_TENSORFLOW_AVAILABLE, _IS_TORCH_AVAILABLE
+from dataeval.utils.metadata import merge_metadata
+from dataeval.utils.split_dataset import split_dataset
-__all__ = []
+__all__ = ["split_dataset", "merge_metadata"]
 if _IS_TORCH_AVAILABLE:  # pragma: no cover
-    from . import torch
+    from dataeval.utils import torch
     __all__ += ["torch"]
 if _IS_TENSORFLOW_AVAILABLE:  # pragma: no cover
-    from . import tensorflow
+    from dataeval.utils import tensorflow
     __all__ += ["tensorflow"]
+del _IS_TENSORFLOW_AVAILABLE
+del _IS_TORCH_AVAILABLE

dataeval/utils/image.py ADDED Viewed

@@ -0,0 +1,71 @@
+from __future__ import annotations
+__all__ = []
+from typing import Any, NamedTuple
+import numpy as np
+from numpy.typing import ArrayLike, NDArray
+from scipy.signal import convolve2d
+# 3x3 kernel that edge_filter convolves with the image
+EDGE_KERNEL = np.array([[-1, -1, -1], [-1, 8, -1], [-1, -1, -1]], dtype=np.int8)
+# Candidate bit depths that get_bitdepth tries in ascending order
+BIT_DEPTH = (1, 8, 12, 16, 32)
+class BitDepth(NamedTuple):
+    """
+    Result of :func:`get_bitdepth`: an approximate bit depth and the pixel value range that goes with it.
+    """
+    # Approximate bit depth; get_bitdepth reports 0 when the image contains negative values
+    depth: int
+    # Lower bound of the pixel value range
+    pmin: float | int
+    # Upper bound of the pixel value range
+    pmax: float | int
+def get_bitdepth(image: NDArray[Any]) -> BitDepth:
+    """
+    Approximates the bit depth of the image from its min and max pixel values.
+    An image containing negative values is reported with depth 0 and its
+    observed min/max.  Otherwise the depth is the first entry of ``BIT_DEPTH``
+    whose range exceeds the max pixel value (the largest entry if none does),
+    and the range is reported as 0 to ``2**depth - 1``.
+    """
+    pmin, pmax = np.min(image), np.max(image)
+    # Negative pixels do not fit an unsigned bit depth; return the observed range instead
+    if pmin < 0:
+        return BitDepth(0, pmin, pmax)
+    candidates = [bits for bits in BIT_DEPTH if 2**bits > pmax]
+    depth = candidates[0] if candidates else max(BIT_DEPTH)
+    return BitDepth(depth, 0, 2**depth - 1)
+def rescale(image: NDArray[Any], depth: int = 1) -> NDArray[Any]:
+    """
+    Rescales the pixel values of the image to the range of the requested bit depth.
+    The source range is estimated with :func:`get_bitdepth`.  If the estimated
+    depth already equals ``depth`` the image is returned unchanged; otherwise
+    the values are min-max normalized and multiplied by ``2**depth - 1``.
+    Parameters
+    ----------
+    image : NDArray
+        Image pixel values
+    depth : int, default 1
+        Target bit depth
+    Returns
+    -------
+    NDArray
+        The input array itself when no rescaling is needed, otherwise a new rescaled array
+    """
+    bitdepth = get_bitdepth(image)
+    if bitdepth.depth == depth:
+        return image
+    # Min-max normalization subtracts the lower bound.  Adding it is only
+    # correct when pmin == 0 and maps images with negative values outside
+    # of [0, 1].
+    normalized = (image - bitdepth.pmin) / (bitdepth.pmax - bitdepth.pmin)
+    return normalized * (2**depth - 1)
+def normalize_image_shape(image: NDArray[Any]) -> NDArray[Any]:
+    """
+    Normalizes the image shape into (C,H,W).
+    A 2-D image gains a leading axis of length 1, a 3-D image is returned
+    as-is, and for higher ranks the first element along every extra leading
+    axis is taken.
+    Raises
+    ------
+    ValueError
+        If the image has fewer than 2 dimensions
+    """
+    ndim = image.ndim
+    if ndim < 2:
+        raise ValueError("Images must have 2 or more dimensions.")
+    if ndim == 2:
+        return image[np.newaxis, ...]
+    if ndim == 3:
+        return image
+    # Index 0 on each leading axis so that only the last 3 dimensions remain
+    leading_zeros = (0,) * (ndim - 3)
+    return image[leading_zeros]
+def edge_filter(image: ArrayLike, offset: float = 0.5) -> NDArray[np.uint8]:
+    """
+    Returns the image filtered using a 3x3 edge detection kernel:
+    [[ -1, -1, -1 ],
+     [ -1,  8, -1 ],
+     [ -1, -1, -1 ]]
+    The convolution keeps the input size (``mode="same"``) with symmetric
+    boundary handling; ``offset`` is added to the result, which is then
+    clipped to the range 0 to 255.
+    NOTE(review): the result is never cast to uint8, so its dtype is whatever
+    the convolution plus ``offset`` yields - confirm against the return annotation.
+    """
+    filtered = convolve2d(image, EDGE_KERNEL, mode="same", boundary="symm")
+    edges = filtered + offset
+    # Clip in place to avoid allocating a second array
+    np.clip(edges, 0, 255, out=edges)
+    return edges

dataeval/utils/lazy.py ADDED Viewed

@@ -0,0 +1,26 @@
+from __future__ import annotations
+from functools import cached_property
+from importlib import import_module
+from typing import Any
+class LazyModule:
+    """
+    Proxy object that defers importing a module until one of its attributes is accessed.
+    """
+    def __init__(self, name: str) -> None:
+        # Only the module name is stored here; nothing is imported yet
+        self._name = name
+    def __getattr__(self, key: str) -> Any:
+        # Python calls __getattr__ only after normal attribute lookup fails, so
+        # `_name` and the cached `_module` are served by normal lookup and
+        # every other attribute is forwarded to the imported module
+        return getattr(self._module, key)
+    @cached_property
+    def _module(self):
+        # Runs import_module on first access; cached_property stores the result on the instance
+        return import_module(self._name)
+# Registry of proxies created by lazyload, keyed by module name
+LAZY_MODULES: dict[str, LazyModule] = {}
+def lazyload(name: str) -> LazyModule:
+    """
+    Returns the shared LazyModule proxy for ``name``, creating and
+    registering it in ``LAZY_MODULES`` on the first request.
+    """
+    try:
+        return LAZY_MODULES[name]
+    except KeyError:
+        proxy = LAZY_MODULES[name] = LazyModule(name)
+        return proxy

dataeval 0.72.1__py3-none-any.whl → 0.73.0__py3-none-any.whl

dataeval 0.72.1__py3-none-any.whl → 0.73.0__py3-none-any.whl