dataeval 0.86.9__py3-none-any.whl → 0.88.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (78)
  1. dataeval/__init__.py +1 -1
  2. dataeval/_log.py +1 -1
  3. dataeval/_version.py +2 -2
  4. dataeval/config.py +4 -19
  5. dataeval/data/_embeddings.py +78 -35
  6. dataeval/data/_images.py +41 -8
  7. dataeval/data/_metadata.py +348 -66
  8. dataeval/data/_selection.py +22 -7
  9. dataeval/data/_split.py +3 -2
  10. dataeval/data/selections/_classbalance.py +4 -3
  11. dataeval/data/selections/_classfilter.py +9 -8
  12. dataeval/data/selections/_indices.py +4 -3
  13. dataeval/data/selections/_prioritize.py +249 -29
  14. dataeval/data/selections/_reverse.py +1 -1
  15. dataeval/data/selections/_shuffle.py +5 -4
  16. dataeval/detectors/drift/_base.py +2 -1
  17. dataeval/detectors/drift/_mmd.py +2 -1
  18. dataeval/detectors/drift/_nml/_base.py +1 -1
  19. dataeval/detectors/drift/_nml/_chunk.py +2 -1
  20. dataeval/detectors/drift/_nml/_result.py +3 -2
  21. dataeval/detectors/drift/_nml/_thresholds.py +6 -5
  22. dataeval/detectors/drift/_uncertainty.py +2 -1
  23. dataeval/detectors/linters/duplicates.py +2 -1
  24. dataeval/detectors/linters/outliers.py +4 -3
  25. dataeval/detectors/ood/__init__.py +2 -1
  26. dataeval/detectors/ood/ae.py +1 -1
  27. dataeval/detectors/ood/base.py +39 -1
  28. dataeval/detectors/ood/knn.py +95 -0
  29. dataeval/detectors/ood/mixin.py +2 -1
  30. dataeval/metadata/_utils.py +1 -1
  31. dataeval/metrics/bias/_balance.py +29 -22
  32. dataeval/metrics/bias/_diversity.py +4 -4
  33. dataeval/metrics/bias/_parity.py +2 -2
  34. dataeval/metrics/stats/_base.py +3 -29
  35. dataeval/metrics/stats/_boxratiostats.py +2 -1
  36. dataeval/metrics/stats/_dimensionstats.py +2 -1
  37. dataeval/metrics/stats/_hashstats.py +21 -3
  38. dataeval/metrics/stats/_pixelstats.py +2 -1
  39. dataeval/metrics/stats/_visualstats.py +2 -1
  40. dataeval/outputs/_base.py +2 -3
  41. dataeval/outputs/_bias.py +2 -1
  42. dataeval/outputs/_estimators.py +1 -1
  43. dataeval/outputs/_linters.py +3 -3
  44. dataeval/outputs/_stats.py +3 -3
  45. dataeval/outputs/_utils.py +1 -1
  46. dataeval/outputs/_workflows.py +49 -31
  47. dataeval/typing.py +23 -9
  48. dataeval/utils/__init__.py +2 -2
  49. dataeval/utils/_array.py +3 -2
  50. dataeval/utils/_bin.py +9 -7
  51. dataeval/utils/_method.py +2 -3
  52. dataeval/utils/_multiprocessing.py +34 -0
  53. dataeval/utils/_plot.py +2 -1
  54. dataeval/utils/data/__init__.py +6 -5
  55. dataeval/utils/data/{metadata.py → _merge.py} +3 -2
  56. dataeval/utils/data/_validate.py +170 -0
  57. dataeval/utils/data/collate.py +2 -1
  58. dataeval/utils/torch/_internal.py +2 -1
  59. dataeval/utils/torch/trainer.py +1 -1
  60. dataeval/workflows/sufficiency.py +13 -9
  61. {dataeval-0.86.9.dist-info → dataeval-0.88.0.dist-info}/METADATA +8 -21
  62. dataeval-0.88.0.dist-info/RECORD +105 -0
  63. dataeval/utils/data/_dataset.py +0 -246
  64. dataeval/utils/datasets/__init__.py +0 -21
  65. dataeval/utils/datasets/_antiuav.py +0 -189
  66. dataeval/utils/datasets/_base.py +0 -266
  67. dataeval/utils/datasets/_cifar10.py +0 -201
  68. dataeval/utils/datasets/_fileio.py +0 -142
  69. dataeval/utils/datasets/_milco.py +0 -197
  70. dataeval/utils/datasets/_mixin.py +0 -54
  71. dataeval/utils/datasets/_mnist.py +0 -202
  72. dataeval/utils/datasets/_seadrone.py +0 -512
  73. dataeval/utils/datasets/_ships.py +0 -144
  74. dataeval/utils/datasets/_types.py +0 -48
  75. dataeval/utils/datasets/_voc.py +0 -583
  76. dataeval-0.86.9.dist-info/RECORD +0 -115
  77. {dataeval-0.86.9.dist-info → dataeval-0.88.0.dist-info}/WHEEL +0 -0
  78. /dataeval-0.86.9.dist-info/licenses/LICENSE.txt → /dataeval-0.88.0.dist-info/licenses/LICENSE +0 -0
@@ -10,7 +10,8 @@ from __future__ import annotations

  __all__ = []

- from typing import Any, Callable
+ from collections.abc import Callable
+ from typing import Any

  import torch

@@ -9,8 +9,8 @@ from __future__ import annotations

  import logging
  from abc import ABC, abstractmethod
+ from collections.abc import Sequence
  from logging import Logger
- from typing import Sequence

  import pandas as pd
  from typing_extensions import Self
@@ -13,7 +13,8 @@ import copy
  import logging
  import warnings
  from abc import ABC, abstractmethod
- from typing import Any, Generic, Literal, Sequence, TypeVar, cast
+ from collections.abc import Sequence
+ from typing import Any, Generic, Literal, TypeVar, cast

  import pandas as pd
  from pandas import Index, Period
@@ -11,7 +11,8 @@ from __future__ import annotations

  import copy
  from abc import ABC, abstractmethod
- from typing import NamedTuple, Sequence
+ from collections.abc import Sequence
+ from typing import NamedTuple

  import pandas as pd
  from typing_extensions import Self
@@ -52,7 +53,7 @@ class AbstractResult(GenericOutput[pd.DataFrame]):

      def filter(self, period: str = "all", metrics: str | Sequence[str] | None = None) -> Self:
          """Returns filtered result metric data."""
-         if metrics and not isinstance(metrics, (str, Sequence)):
+         if metrics and not isinstance(metrics, str | Sequence):
              raise ValueError("metrics value provided is not a valid metric or sequence of metrics")
          if isinstance(metrics, str):
              metrics = [metrics]
@@ -9,7 +9,8 @@ from __future__ import annotations

  import logging
  from abc import ABC, abstractmethod
- from typing import Any, Callable, ClassVar
+ from collections.abc import Callable
+ from typing import Any, ClassVar

  import numpy as np

@@ -169,10 +170,10 @@ class ConstantThreshold(Threshold, threshold_type="constant"):

      @staticmethod
      def _validate_inputs(lower: float | int | None = None, upper: float | int | None = None) -> None:
-         if lower is not None and not isinstance(lower, (float, int)) or isinstance(lower, bool):
+         if lower is not None and not isinstance(lower, float | int) or isinstance(lower, bool):
              raise ValueError(f"expected type of 'lower' to be 'float', 'int' or None but got '{type(lower).__name__}'")

-         if upper is not None and not isinstance(upper, (float, int)) or isinstance(upper, bool):
+         if upper is not None and not isinstance(upper, float | int) or isinstance(upper, bool):
              raise ValueError(f"expected type of 'upper' to be 'float', 'int' or None but got '{type(upper).__name__}'")

          # explicit None check is required due to special interpretation of the value 0.0 as False
@@ -244,7 +245,7 @@ class StandardDeviationThreshold(Threshold, threshold_type="standard_deviation")
244
245
  ) -> None:
245
246
  if (
246
247
  std_lower_multiplier is not None
247
- and not isinstance(std_lower_multiplier, (float, int))
248
+ and not isinstance(std_lower_multiplier, float | int)
248
249
  or isinstance(std_lower_multiplier, bool)
249
250
  ):
250
251
  raise ValueError(
@@ -257,7 +258,7 @@ class StandardDeviationThreshold(Threshold, threshold_type="standard_deviation")
257
258
 
258
259
  if (
259
260
  std_upper_multiplier is not None
260
- and not isinstance(std_upper_multiplier, (float, int))
261
+ and not isinstance(std_upper_multiplier, float | int)
261
262
  or isinstance(std_upper_multiplier, bool)
262
263
  ):
263
264
  raise ValueError(
@@ -10,7 +10,8 @@ from __future__ import annotations
10
10
 
11
11
  __all__ = []
12
12
 
13
- from typing import Literal, Sequence, cast
13
+ from collections.abc import Sequence
14
+ from typing import Literal, cast
14
15
 
15
16
  import numpy as np
16
17
  import torch
@@ -2,7 +2,8 @@ from __future__ import annotations

  __all__ = []

- from typing import Any, Sequence, overload
+ from collections.abc import Sequence
+ from typing import Any, overload

  from dataeval.data._images import Images
  from dataeval.metrics.stats import hashstats
@@ -2,7 +2,8 @@ from __future__ import annotations

  __all__ = []

- from typing import Any, Literal, Sequence, overload
+ from collections.abc import Sequence
+ from typing import Any, Literal, overload

  import numpy as np
  from numpy.typing import NDArray
@@ -201,7 +202,7 @@ class Outliers:
          >>> results.issues[1]
          {}
          """
-         if isinstance(stats, (ImageStatsOutput, DimensionStatsOutput, PixelStatsOutput, VisualStatsOutput)):
+         if isinstance(stats, ImageStatsOutput | DimensionStatsOutput | PixelStatsOutput | VisualStatsOutput):
              return OutliersOutput(self._get_outliers(stats.data()))

          if not isinstance(stats, Sequence):
@@ -212,7 +213,7 @@ class Outliers:
          stats_map: dict[type, list[int]] = {}
          for i, stats_output in enumerate(stats):
              if not isinstance(
-                 stats_output, (ImageStatsOutput, DimensionStatsOutput, PixelStatsOutput, VisualStatsOutput)
+                 stats_output, ImageStatsOutput | DimensionStatsOutput | PixelStatsOutput | VisualStatsOutput
              ):
                  raise TypeError(
                      "Invalid stats output type; only use output from dimensionstats, pixelstats or visualstats."
@@ -2,7 +2,8 @@
  Out-of-distribution (OOD) detectors identify data that is different from the data used to train a particular model.
  """

- __all__ = ["OODOutput", "OODScoreOutput", "OOD_AE"]
+ __all__ = ["OODOutput", "OODScoreOutput", "OOD_AE", "OOD_KNN"]

  from dataeval.detectors.ood.ae import OOD_AE
+ from dataeval.detectors.ood.knn import OOD_KNN
  from dataeval.outputs._ood import OODOutput, OODScoreOutput
@@ -12,7 +12,7 @@ from __future__ import annotations

  __all__ = []

- from typing import Callable
+ from collections.abc import Callable

  import numpy as np
  import torch
@@ -10,11 +10,16 @@ from __future__ import annotations

  __all__ = []

- from typing import Callable, cast
+ from abc import ABC, abstractmethod
+ from collections.abc import Callable
+ from typing import Any, cast

+ import numpy as np
  import torch
+ from numpy.typing import NDArray

  from dataeval.config import DeviceLike, get_device
+ from dataeval.data import Embeddings
  from dataeval.detectors.ood.mixin import OODBaseMixin, OODFitMixin, OODGMMMixin
  from dataeval.typing import ArrayLike
  from dataeval.utils._array import to_numpy
@@ -93,3 +98,36 @@ class OODBaseGMM(OODBase, OODGMMMixin[GaussianMixtureModelParams]):
          # Calculate the GMM parameters
          _, z, gamma = cast(tuple[torch.Tensor, torch.Tensor, torch.Tensor], self.model(x_ref))
          self._gmm_params = gmm_params(z, gamma)
+
+
+ class EmbeddingBasedOODBase(OODBaseMixin[Callable[[Any], Any]], ABC):
+     """
+     Base class for embedding-based OOD detection methods.
+
+     These methods work directly on embedding representations,
+     using distance metrics or density estimation in embedding space.
+     Inherits from OODBaseMixin to get automatic thresholding.
+     """
+
+     def __init__(self) -> None:
+         """Initialize embedding-based OOD detector."""
+         # Pass a dummy callable as model since we don't use it
+         super().__init__(lambda x: x)
+
+     def _get_data_info(self, X: NDArray) -> tuple[tuple, type]:
+         """Override to skip [0-1] validation for embeddings."""
+         if not isinstance(X, np.ndarray):
+             raise TypeError("Dataset should of type: `NDArray`.")
+         # Skip the [0-1] range check for embeddings
+         return X.shape[1:], X.dtype.type
+
+     @abstractmethod
+     def fit_embeddings(self, embeddings: Embeddings, threshold_perc: float = 95.0) -> None:
+         """
+         Fit using reference embeddings.
+
+         Args:
+             embeddings: Reference (in-distribution) embeddings
+             threshold_perc: Percentage of reference data considered normal
+         """
+         pass
@@ -0,0 +1,95 @@
+ from typing import Literal
+
+ import numpy as np
+ from sklearn.neighbors import NearestNeighbors
+
+ from dataeval.data import Embeddings
+ from dataeval.detectors.ood.base import EmbeddingBasedOODBase
+ from dataeval.outputs._ood import OODScoreOutput
+ from dataeval.typing import ArrayLike
+
+
+ class OOD_KNN(EmbeddingBasedOODBase):
+     """
+     K-Nearest Neighbors Out-of-Distribution detector.
+
+     Uses average cosine distance to k nearest neighbors in embedding space to detect OOD samples.
+     Samples with larger average distances to their k nearest neighbors in the
+     reference (in-distribution) set are considered more likely to be OOD.
+
+     Based on the methodology from:
+     "Back to the Basics: Revisiting Out-of-Distribution Detection Baselines"
+     (Kuan & Mueller, 2022)
+
+     As referenced in:
+     "Safe AI for coral reefs: Benchmarking out-of-distribution detection
+     algorithms for coral reef image surveys"
+     """
+
+     def __init__(self, k: int = 10, distance_metric: Literal["cosine", "euclidean"] = "cosine") -> None:
+         """
+         Initialize KNN OOD detector.
+
+         Args:
+             k: Number of nearest neighbors to consider (default: 10)
+             distance_metric: Distance metric to use ('cosine' or 'euclidean')
+         """
+         super().__init__()
+         self.k = k
+         self.distance_metric = distance_metric
+         self._nn_model: NearestNeighbors
+         self.reference_embeddings: ArrayLike
+
+     def fit_embeddings(self, embeddings: Embeddings, threshold_perc: float = 95.0) -> None:
+         """
+         Fit the detector using reference (in-distribution) embeddings.
+
+         Builds a k-NN index for efficient nearest neighbor search and
+         computes reference scores for automatic thresholding.
+
+         Args:
+             embeddings: Reference embeddings from in-distribution data
+             threshold_perc: Percentage of reference data considered normal
+         """
+         self.reference_embeddings = embeddings.to_numpy()
+
+         if self.k >= len(self.reference_embeddings):
+             raise ValueError(
+                 f"k ({self.k}) must be less than number of reference embeddings ({len(self.reference_embeddings)})"
+             )
+
+         # Build k-NN index using sklearn
+         self._nn_model = NearestNeighbors(
+             n_neighbors=self.k,
+             metric=self.distance_metric,
+             algorithm="auto",  # Let sklearn choose the best algorithm
+         )
+         self._nn_model.fit(self.reference_embeddings)
+
+         # efficiently compute reference scores for automatic thresholding
+         ref_scores = self._compute_reference_scores()
+         self._ref_score = OODScoreOutput(instance_score=ref_scores)
+         self._threshold_perc = threshold_perc
+         self._data_info = self._get_data_info(self.reference_embeddings)
+
+     def _compute_reference_scores(self) -> np.ndarray:
+         """Efficiently compute reference scores by excluding self-matches."""
+         # Find k+1 neighbors (including self) for reference points
+         distances, _ = self._nn_model.kneighbors(self.reference_embeddings, n_neighbors=self.k + 1)
+         # Skip first neighbor (self with distance 0) and average the rest
+         return np.mean(distances[:, 1:], axis=1)
+
+     def _score(self, X: np.ndarray, batch_size: int = int(1e10)) -> OODScoreOutput:
+         """
+         Compute OOD scores for input embeddings.
+
+         Args:
+             X: Input embeddings to score
+             batch_size: Batch size (not used, kept for interface compatibility)
+
+         Returns:
+             OODScoreOutput containing instance-level scores
+         """
+         # Compute OOD scores using sklearn's efficient k-NN search
+         distances, _ = self._nn_model.kneighbors(X)
+         return OODScoreOutput(instance_score=np.mean(distances, axis=1))
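As a usage illustration, the new detector is fit on in-distribution embeddings and then scores new embeddings by their average distance to the k nearest reference neighbors. A minimal sketch, assuming `ref_embeddings` is a dataeval `Embeddings` object built elsewhere and `test_embeddings` is an (N, D) array of embeddings to score; only `OOD_KNN`, `fit_embeddings`, `_score`, and `instance_score` come from the code above, and the public thresholded prediction path lives in `OODBaseMixin`, which is not part of this hunk:

import numpy as np

from dataeval.detectors.ood import OOD_KNN

# k nearest neighbors in cosine-distance embedding space
detector = OOD_KNN(k=10, distance_metric="cosine")

# Builds the sklearn NearestNeighbors index and caches reference scores
# that the mixin later uses for automatic thresholding.
detector.fit_embeddings(ref_embeddings, threshold_perc=95.0)

# Instance scores are the average distance to the k nearest reference neighbors;
# larger scores indicate samples more likely to be out-of-distribution.
scores = detector._score(np.asarray(test_embeddings)).instance_score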
@@ -3,7 +3,8 @@ from __future__ import annotations
  __all__ = []

  from abc import ABC, abstractmethod
- from typing import Callable, Generic, Literal, TypeVar
+ from collections.abc import Callable
+ from typing import Generic, Literal, TypeVar

  import numpy as np
  from numpy.typing import NDArray
@@ -1,6 +1,6 @@
  __all__ = []

- from typing import Sequence
+ from collections.abc import Sequence

  from numpy.typing import NDArray

@@ -16,7 +16,7 @@ from dataeval.utils._bin import get_counts


  def _validate_num_neighbors(num_neighbors: int) -> int:
-     if not isinstance(num_neighbors, (int, float)):
+     if not isinstance(num_neighbors, int | float):
          raise TypeError(
              f"Variable {num_neighbors} is not real-valued numeric type."
              "num_neighbors should be an int, greater than 0 and less than"
@@ -73,9 +73,9 @@ def balance(
      Return intra/interfactor balance (mutual information)

      >>> bal.factors
-     array([[1. , 0.017, 0.015],
-            [0.017, 0.445, 0.245],
-            [0.015, 0.245, 1.063]])
+     array([[1. , 0. , 0.015],
+            [0. , 0.08 , 0.011],
+            [0.015, 0.011, 1.063]])

      Return classwise balance (mutual information) of factors with individual class_labels

@@ -95,32 +95,39 @@ def balance(

      num_neighbors = _validate_num_neighbors(num_neighbors)

-     data = metadata.discretized_data
      factor_types = {"class_label": "categorical"} | {k: v.factor_type for k, v in metadata.factor_info.items()}
      is_discrete = [factor_type != "continuous" for factor_type in factor_types.values()]
      num_factors = len(factor_types)
      class_labels = metadata.class_labels

      mi = np.full((num_factors, num_factors), np.nan, dtype=np.float32)
-     data = np.hstack((class_labels[:, np.newaxis], data))
+
+     # Use numeric data for MI
+     data = np.hstack((class_labels[:, np.newaxis], metadata.digitized_data))
+
+     # Present discrete features composed of distinct values as continuous for `mutual_info_classif`
+     for i, factor_type in enumerate(factor_types):
+         if len(data) == len(np.unique(data[:, i])):
+             is_discrete[i] = False
+             factor_types[factor_type] = "continuous"
+
+     mutual_info_fn_map = {
+         "categorical": mutual_info_classif,
+         "discrete": mutual_info_classif,
+         "continuous": mutual_info_regression,
+     }

      for idx, factor_type in enumerate(factor_types.values()):
-         if factor_type != "continuous":
-             mi[idx, :] = mutual_info_classif(
-                 data,
-                 data[:, idx],
-                 discrete_features=is_discrete,  # type: ignore - sklearn function not typed
-                 n_neighbors=num_neighbors,
-                 random_state=get_seed(),
-             )
-         else:
-             mi[idx, :] = mutual_info_regression(
-                 data,
-                 data[:, idx],
-                 discrete_features=is_discrete,  # type: ignore - sklearn function not typed
-                 n_neighbors=num_neighbors,
-                 random_state=get_seed(),
-             )
+         mi[idx, :] = mutual_info_fn_map[factor_type](
+             data,
+             data[:, idx],
+             discrete_features=is_discrete,
+             n_neighbors=num_neighbors,
+             random_state=get_seed(),
+         )
+
+     # Use binned data for classwise MI
+     data = np.hstack((class_labels[:, np.newaxis], metadata.binned_data))

      # Normalization via entropy
      bin_cnts = get_counts(data)
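The rewrite above replaces the if/else on factor type with a lookup table that routes each factor to the matching sklearn estimator: categorical and discrete factors go through mutual_info_classif, continuous factors through mutual_info_regression. A self-contained toy sketch of that dispatch pattern (the data, factor names, and hyperparameters here are illustrative, not taken from dataeval):

import numpy as np
from sklearn.feature_selection import mutual_info_classif, mutual_info_regression

rng = np.random.default_rng(0)
# Toy "factors": column 0 is a discrete label, column 1 is a continuous value.
data = np.column_stack([rng.integers(0, 3, 100), rng.normal(size=100)])
factor_types = {"label": "categorical", "value": "continuous"}
is_discrete = [t != "continuous" for t in factor_types.values()]

mutual_info_fn_map = {
    "categorical": mutual_info_classif,
    "discrete": mutual_info_classif,
    "continuous": mutual_info_regression,
}

# One row of pairwise mutual information per factor, estimator chosen by factor type.
mi = np.full((len(factor_types), len(factor_types)), np.nan)
for idx, factor_type in enumerate(factor_types.values()):
    mi[idx, :] = mutual_info_fn_map[factor_type](
        data, data[:, idx], discrete_features=is_discrete, n_neighbors=5, random_state=0
    )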
@@ -162,12 +162,12 @@ def diversity(
          raise ValueError("No factors found in provided metadata.")

      diversity_fn = get_method(_DIVERSITY_FN_MAP, method)
-     discretized_data = metadata.discretized_data
+     binned_data = metadata.binned_data
      factor_names = metadata.factor_names
      class_lbl = metadata.class_labels

-     class_labels_with_discretized_data = np.hstack((class_lbl[:, np.newaxis], discretized_data))
-     cnts = get_counts(class_labels_with_discretized_data)
+     class_labels_with_binned_data = np.hstack((class_lbl[:, np.newaxis], binned_data))
+     cnts = get_counts(class_labels_with_binned_data)
      num_bins = np.bincount(np.nonzero(cnts)[1])
      diversity_index = diversity_fn(cnts, num_bins)

@@ -176,7 +176,7 @@ def diversity(
      classwise_div = np.full((len(u_classes), num_factors), np.nan)
      for idx, cls in enumerate(u_classes):
          subset_mask = class_lbl == cls
-         cls_cnts = get_counts(discretized_data[subset_mask], min_num_bins=cnts.shape[0])
+         cls_cnts = get_counts(binned_data[subset_mask], min_num_bins=cnts.shape[0])
          classwise_div[idx, :] = diversity_fn(cls_cnts, num_bins[1:])

      return DiversityOutput(diversity_index, classwise_div, factor_names, metadata.class_names)
@@ -245,10 +245,10 @@ def parity(metadata: Metadata) -> ParityOutput:
      if not metadata.factor_names:
          raise ValueError("No factors found in provided metadata.")

-     chi_scores = np.zeros(metadata.discretized_data.shape[1])
+     chi_scores = np.zeros(metadata.binned_data.shape[1])
      p_values = np.zeros_like(chi_scores)
      insufficient_data: defaultdict[str, defaultdict[int, dict[str, int]]] = defaultdict(lambda: defaultdict(dict))
-     for i, col_data in enumerate(metadata.discretized_data.T):
+     for i, col_data in enumerate(metadata.binned_data.T):
          # Builds a contingency matrix where entry at index (r,c) represents
          # the frequency of current_factor_name achieving value unique_factor_values[r]
          # at a data point with class c.
@@ -6,11 +6,11 @@ import math
  import re
  import warnings
  from collections import ChainMap
+ from collections.abc import Callable, Iterable, Iterator, Sequence
  from copy import deepcopy
  from dataclasses import dataclass
  from functools import partial
- from multiprocessing import Pool
- from typing import Any, Callable, Generic, Iterable, Iterator, Sequence, TypeVar
+ from typing import Any, Generic, TypeVar

  import numpy as np
  from numpy.typing import NDArray
@@ -21,14 +21,12 @@ from dataeval.outputs._stats import BASE_ATTRS, BaseStatsOutput, SourceIndex
  from dataeval.typing import Array, ArrayLike, Dataset, ObjectDetectionTarget
  from dataeval.utils._array import as_numpy, to_numpy
  from dataeval.utils._image import clip_and_pad, clip_box, is_valid_box, normalize_image_shape, rescale
+ from dataeval.utils._multiprocessing import PoolWrapper

  DTYPE_REGEX = re.compile(r"NDArray\[np\.(.*?)\]")

  TStatsOutput = TypeVar("TStatsOutput", bound=BaseStatsOutput, covariant=True)

- _S = TypeVar("_S")
- _T = TypeVar("_T")
-


  @dataclass
@@ -67,30 +65,6 @@ class BoundingBox:
          return x0_int, y0_int, x1_int, y1_int


- class PoolWrapper:
-     """
-     Wraps `multiprocessing.Pool` to allow for easy switching between
-     multiprocessing and single-threaded execution.
-
-     This helps with debugging and profiling, as well as usage with Jupyter notebooks
-     in VS Code, which does not support subprocess debugging.
-     """
-
-     def __init__(self, processes: int | None) -> None:
-         self.pool = Pool(processes) if processes is None or processes > 1 else None
-
-     def imap(self, func: Callable[[_S], _T], iterable: Iterable[_S]) -> Iterator[_T]:
-         return map(func, iterable) if self.pool is None else self.pool.imap(func, iterable)
-
-     def __enter__(self, *args: Any, **kwargs: Any) -> PoolWrapper:
-         return self
-
-     def __exit__(self, *args: Any) -> None:
-         if self.pool is not None:
-             self.pool.close()
-             self.pool.join()
-
-
  class StatsProcessor(Generic[TStatsOutput]):
      output_class: type[TStatsOutput]
      cache_keys: set[str] = set()
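The PoolWrapper class removed here reappears as the new dataeval/utils/_multiprocessing.py module (listed above with +34 lines and imported in the preceding hunk). That file's contents are not included in the hunks shown, but a sketch of the relocated module, reconstructed from the removed code (the module docstring, dunder header, and exact layout are assumptions), would look roughly like:

from __future__ import annotations

from collections.abc import Callable, Iterable, Iterator
from multiprocessing import Pool
from typing import Any, TypeVar

_S = TypeVar("_S")
_T = TypeVar("_T")


class PoolWrapper:
    """
    Wraps `multiprocessing.Pool` to allow for easy switching between
    multiprocessing and single-threaded execution.
    """

    def __init__(self, processes: int | None) -> None:
        # Only spin up a real pool when more than one process is requested
        self.pool = Pool(processes) if processes is None or processes > 1 else None

    def imap(self, func: Callable[[_S], _T], iterable: Iterable[_S]) -> Iterator[_T]:
        # Fall back to the built-in map for single-threaded execution
        return map(func, iterable) if self.pool is None else self.pool.imap(func, iterable)

    def __enter__(self, *args: Any, **kwargs: Any) -> PoolWrapper:
        return self

    def __exit__(self, *args: Any) -> None:
        if self.pool is not None:
            self.pool.close()
            self.pool.join()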
@@ -3,7 +3,8 @@ from __future__ import annotations
  __all__ = []

  import copy
- from typing import Any, Callable, Generic, TypeVar, cast
+ from collections.abc import Callable
+ from typing import Any, Generic, TypeVar, cast

  import numpy as np
  from numpy.typing import NDArray
@@ -2,7 +2,8 @@ from __future__ import annotations

  __all__ = []

- from typing import Any, Callable
+ from collections.abc import Callable
+ from typing import Any

  import numpy as np

@@ -4,12 +4,14 @@ import warnings

  __all__ = []

- from typing import Any, Callable
+ from collections.abc import Callable
+ from typing import Any

  import numpy as np
  import xxhash as xxh
- from PIL import Image
+ from numpy.typing import NDArray
  from scipy.fftpack import dct
+ from scipy.ndimage import zoom

  from dataeval.metrics.stats._base import StatsProcessor, run_stats
  from dataeval.outputs import HashStatsOutput
@@ -18,10 +20,26 @@ from dataeval.typing import ArrayLike, Dataset
  from dataeval.utils._array import as_numpy
  from dataeval.utils._image import normalize_image_shape, rescale

+ try:
+     from PIL import Image
+ except ImportError:
+     Image = None
+
  HASH_SIZE = 8
  MAX_FACTOR = 4


+ def _resize(image: NDArray[np.uint8], resize_dim: int, use_pil: bool = True) -> NDArray[np.uint8]:
+     """Resizes a grayscale (HxW) 8-bit image using PIL or scipy.ndimage.zoom."""
+
+     # Use PIL if available, otherwise resize and resample with scipy.ndimage.zoom
+     if use_pil and Image is not None:
+         return np.array(Image.fromarray(image).resize((resize_dim, resize_dim), Image.Resampling.LANCZOS))
+
+     zoom_factors = (resize_dim / image.shape[0], resize_dim / image.shape[1])
+     return np.clip(zoom(image, zoom_factors, order=5, mode="reflect"), 0, 255, dtype=np.uint8)
+
+
  def pchash(image: ArrayLike) -> str:
      """
      Performs a perceptual hash on an image by resizing to a square NxN image
@@ -59,7 +77,7 @@ def pchash(image: ArrayLike) -> str:
      rescaled = rescale(normalized, 8).astype(np.uint8)

      # Resizes the image using the Lanczos algorithm to a square image
-     im = np.array(Image.fromarray(rescaled).resize((resize_dim, resize_dim), Image.Resampling.LANCZOS))
+     im = _resize(rescaled, resize_dim)

      # Performs discrete cosine transforms to compress the image information and takes the lowest frequency component
      transform = dct(dct(im.T).T)[:HASH_SIZE, :HASH_SIZE]
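The practical effect of this change is that perceptual hashing no longer hard-requires Pillow: when PIL is missing, scipy.ndimage.zoom resamples the image instead. A small sketch of that fallback, importing the private _resize helper shown above purely for illustration (the toy image and target dimension are made up; the module path is the hashstats file from the list above):

import numpy as np

from dataeval.metrics.stats._hashstats import _resize

# Toy 28x28 grayscale uint8 image; any HxW 8-bit array works with _resize.
image = (np.random.default_rng(0).random((28, 28)) * 255).astype(np.uint8)

# With Pillow installed this uses Image.Resampling.LANCZOS; without it (or with
# use_pil=False) scipy.ndimage.zoom resamples to the same target shape.
resized_default = _resize(image, resize_dim=32)
resized_scipy = _resize(image, resize_dim=32, use_pil=False)

assert resized_default.shape == resized_scipy.shape == (32, 32)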
@@ -2,7 +2,8 @@ from __future__ import annotations

  __all__ = []

- from typing import Any, Callable
+ from collections.abc import Callable
+ from typing import Any

  import numpy as np
  from scipy.stats import entropy, kurtosis, skew
@@ -2,7 +2,8 @@ from __future__ import annotations

  __all__ = []

- from typing import Any, Callable
+ from collections.abc import Callable
+ from typing import Any

  import numpy as np

dataeval/outputs/_base.py CHANGED
@@ -4,14 +4,13 @@ __all__ = []

  import inspect
  import logging
- from collections.abc import Collection, Mapping, Sequence
+ from collections.abc import Callable, Collection, Iterator, Mapping, Sequence
  from dataclasses import dataclass
  from datetime import datetime, timezone
  from functools import partial, wraps
- from typing import Any, Callable, Generic, Iterator, TypeVar, overload
+ from typing import Any, Generic, ParamSpec, TypeVar, overload

  import numpy as np
- from typing_extensions import ParamSpec

  from dataeval import __version__

dataeval/outputs/_bias.py CHANGED
@@ -3,8 +3,9 @@ from __future__ import annotations
  __all__ = []

  import contextlib
+ from collections.abc import Mapping, Sequence
  from dataclasses import asdict, dataclass
- from typing import Any, Mapping, Sequence, TypeVar
+ from typing import Any, TypeVar

  import numpy as np
  import pandas as pd
@@ -2,8 +2,8 @@ from __future__ import annotations

  __all__ = []

+ from collections.abc import Sequence
  from dataclasses import dataclass
- from typing import Sequence

  import numpy as np
  from numpy.typing import NDArray