dataeval 0.86.7__py3-none-any.whl → 0.86.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
dataeval/__init__.py CHANGED
@@ -7,12 +7,19 @@ shifts that impact performance of deployed models.
 
 from __future__ import annotations
 
-__all__ = ["config", "detectors", "log", "metrics", "typing", "utils", "workflows"]
-__version__ = "0.86.7"
+try:
+    from ._version import __version__
+except ImportError:
+    __version__ = "unknown"
+
+# Strongly type for pyright
+__version__ = str(__version__)
+
+__all__ = ["__version__", "config", "detectors", "log", "metrics", "typing", "utils", "workflows"]
 
 import logging
 
-from dataeval import config, detectors, metrics, typing, utils, workflows
+from . import config, detectors, metrics, typing, utils, workflows
 
 logging.getLogger(__name__).addHandler(logging.NullHandler())
 
dataeval/_version.py ADDED
@@ -0,0 +1,21 @@
+# file generated by setuptools-scm
+# don't change, don't track in version control
+
+__all__ = ["__version__", "__version_tuple__", "version", "version_tuple"]
+
+TYPE_CHECKING = False
+if TYPE_CHECKING:
+    from typing import Tuple
+    from typing import Union
+
+    VERSION_TUPLE = Tuple[Union[int, str], ...]
+else:
+    VERSION_TUPLE = object
+
+version: str
+__version__: str
+__version_tuple__: VERSION_TUPLE
+version_tuple: VERSION_TUPLE
+
+__version__ = version = '0.86.9'
+__version_tuple__ = version_tuple = (0, 86, 9)
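
With the version now generated by setuptools-scm into `_version.py`, consumers read it the same way as before. A minimal sketch of what the new fallback yields in each situation (the scenarios in the comments are inferred from the try/except above):

```python
import dataeval

# Installed from a wheel: _version.py exists, so __version__ is "0.86.9".
# Run from a raw source checkout without a build step: the ImportError
# branch leaves __version__ as "unknown".
print(dataeval.__version__)
```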
dataeval/config.py CHANGED
@@ -77,7 +77,13 @@ def get_device(override: DeviceLike | None = None) -> torch.device:
     """
     if override is None:
         global _device
-        return torch.get_default_device() if _device is None else _device
+        return (
+            torch.get_default_device()
+            if hasattr(torch, "get_default_device")
+            else torch.device("cpu")
+            if _device is None
+            else _device
+        )
     return _todevice(override)
 
 
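
`torch.get_default_device` only exists in newer PyTorch releases, which is why the hunk guards the call with `hasattr`. A standalone sketch of the same compatibility shim (the function name here is illustrative):

```python
import torch

def default_device() -> torch.device:
    # Newer torch exposes get_default_device(); older builds fall back to CPU.
    if hasattr(torch, "get_default_device"):
        return torch.get_default_device()
    return torch.device("cpu")
```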
@@ -1,16 +1,9 @@
 from __future__ import annotations
 
-from typing import TYPE_CHECKING
-
 import numpy as np
 import pandas as pd
 from numpy.typing import ArrayLike
 
-if TYPE_CHECKING:
-    from typing import Self
-else:
-    from typing_extensions import Self
-
 from dataeval.detectors.drift._nml._chunk import CountBasedChunker, SizeBasedChunker
 from dataeval.detectors.drift._nml._domainclassifier import DomainClassifierCalculator
 from dataeval.detectors.drift._nml._thresholds import ConstantThreshold
@@ -52,7 +45,7 @@ class DriftMVDC:
             threshold=ConstantThreshold(lower=self.threshold[0], upper=self.threshold[1]),
         )
 
-    def fit(self, x_ref: ArrayLike) -> Self:
+    def fit(self, x_ref: ArrayLike) -> DriftMVDC:
         """
         Fit the domain classifier on the training dataframe
 
@@ -63,7 +56,7 @@ class DriftMVDC:
 
         Returns
         -------
-        Self
+        DriftMVDC
 
         """
         # for 1D input, assume that is 1 sample: dim[1,n_features]
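
Returning the concrete class name instead of `typing.Self` drops the `typing_extensions` dependency on Python < 3.11 while keeping `fit` chainable. A hedged sketch of the chaining this preserves, assuming `DriftMVDC` is re-exported from `dataeval.detectors.drift` and its constructor defaults suffice:

```python
import numpy as np
from dataeval.detectors.drift import DriftMVDC

x_ref = np.random.default_rng(0).normal(size=(128, 8))

# fit() returns the detector itself, so construction and fitting chain.
detector = DriftMVDC().fit(x_ref)
```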
@@ -46,10 +46,10 @@ class Chunk(ABC):
         return self.data.shape[0]
 
     @abstractmethod
-    def __add__(self, other: Self) -> Self: ...
+    def __add__(self, other: Any) -> Any: ...
 
     @abstractmethod
-    def __lt__(self, other: Self) -> bool: ...
+    def __lt__(self, other: Any) -> bool: ...
 
     @abstractmethod
     def dict(self) -> dict[str, Any]: ...
@@ -65,7 +65,7 @@ class OOD_AE(OODBase):
         self,
         x_ref: ArrayLike,
         threshold_perc: float,
-        loss_fn: Callable[..., torch.nn.Module] | None = None,
+        loss_fn: Callable[..., torch.Tensor] | None = None,
         optimizer: torch.optim.Optimizer | None = None,
         epochs: int = 20,
         batch_size: int = 64,
@@ -22,7 +22,7 @@ from dataeval.utils.torch._gmm import GaussianMixtureModelParams, gmm_params
 from dataeval.utils.torch._internal import trainer
 
 
-class OODBase(OODBaseMixin[torch.nn.Module], OODFitMixin[Callable[..., torch.nn.Module], torch.optim.Optimizer]):
+class OODBase(OODBaseMixin[torch.nn.Module], OODFitMixin[Callable[..., torch.Tensor], torch.optim.Optimizer]):
     def __init__(self, model: torch.nn.Module, device: DeviceLike | None = None) -> None:
         self.device: torch.device = get_device(device)
         super().__init__(model)
@@ -31,7 +31,7 @@ class OODBase(OODBaseMixin[torch.nn.Module], OODFitMixin[Callable[..., torch.nn.
         self,
         x_ref: ArrayLike,
         threshold_perc: float,
-        loss_fn: Callable[..., torch.nn.Module] | None,
+        loss_fn: Callable[..., torch.Tensor] | None,
         optimizer: torch.optim.Optimizer | None,
         epochs: int,
         batch_size: int,
@@ -82,7 +82,7 @@ class OODBaseGMM(OODBase, OODGMMMixin[GaussianMixtureModelParams]):
         self,
         x_ref: ArrayLike,
         threshold_perc: float,
-        loss_fn: Callable[..., torch.nn.Module] | None,
+        loss_fn: Callable[..., torch.Tensor] | None,
         optimizer: torch.optim.Optimizer | None,
         epochs: int,
         batch_size: int,
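
The previous annotation claimed a loss callable returns a `torch.nn.Module`; what a loss function actually produces is a loss `torch.Tensor`, which the corrected `Callable[..., torch.Tensor]` expresses. A sketch of a conforming callable:

```python
from typing import Callable

import torch
import torch.nn.functional as F

# A loss function maps tensors to a scalar loss tensor, matching
# Callable[..., torch.Tensor].
loss_fn: Callable[..., torch.Tensor] = F.mse_loss

reconstruction = torch.rand(4, 3)
target = torch.rand(4, 3)
loss = loss_fn(reconstruction, target)  # 0-dim torch.Tensor
```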
@@ -9,11 +9,11 @@ import numpy as np
 
 from dataeval.config import EPSILON
 from dataeval.outputs import CompletenessOutput
-from dataeval.typing import ArrayLike
+from dataeval.typing import Array
 from dataeval.utils._array import ensure_embeddings
 
 
-def completeness(embeddings: ArrayLike, quantiles: int) -> CompletenessOutput:
+def completeness(embeddings: Array, quantiles: int) -> CompletenessOutput:
     """
     Calculate the fraction of boxes in a grid defined by quantiles that
     contain at least one data point.
@@ -21,7 +21,7 @@ def completeness(embeddings: ArrayLike, quantiles: int) -> CompletenessOutput:
 
     Parameters
     ----------
-    embeddings : ArrayLike
+    embeddings : Array
         Embedded dataset (or other low-dimensional data) (nxp)
     quantiles : int
         number of quantile values to use for partitioning each dimension
@@ -10,13 +10,13 @@ from scipy.spatial.distance import pdist, squareform
 
 from dataeval.outputs import CoverageOutput
 from dataeval.outputs._base import set_metadata
-from dataeval.typing import ArrayLike
+from dataeval.typing import Array
 from dataeval.utils._array import ensure_embeddings, flatten
 
 
 @set_metadata
 def coverage(
-    embeddings: ArrayLike,
+    embeddings: Array,
     radius_type: Literal["adaptive", "naive"] = "adaptive",
     num_observations: int = 20,
     percent: float = 0.01,
@@ -271,7 +271,7 @@ def parity(metadata: Metadata) -> ParityOutput:
         # because scipy.stats.chi2_contingency fails when there are rows containing only zeros.
         contingency_matrix = contingency_matrix[np.any(contingency_matrix, axis=1)]
 
-        chi_scores[i], p_values[i] = chi2_contingency(contingency_matrix)[:2]
+        chi_scores[i], p_values[i] = chi2_contingency(contingency_matrix)[:2]  # type: ignore
 
     if insufficient_data:
         warnings.warn(
@@ -22,7 +22,7 @@ from scipy.stats import mode
 from dataeval.config import EPSILON
 from dataeval.outputs import BEROutput
 from dataeval.outputs._base import set_metadata
-from dataeval.typing import ArrayLike
+from dataeval.typing import Array
 from dataeval.utils._array import as_numpy, ensure_embeddings
 from dataeval.utils._method import get_method
 from dataeval.utils._mst import compute_neighbors, minimum_spanning_tree
@@ -105,7 +105,7 @@ _BER_FN_MAP = {"KNN": ber_knn, "MST": ber_mst}
 
 
 @set_metadata
-def ber(embeddings: ArrayLike, labels: ArrayLike, k: int = 1, method: Literal["KNN", "MST"] = "KNN") -> BEROutput:
+def ber(embeddings: Array, labels: Array, k: int = 1, method: Literal["KNN", "MST"] = "KNN") -> BEROutput:
     """
     An estimator for Multi-class :term:`Bayes error rate<Bayes Error Rate (BER)>` \
     using FR or KNN test statistic basis.
@@ -14,7 +14,7 @@ from numpy.typing import NDArray
 
 from dataeval.outputs import DivergenceOutput
 from dataeval.outputs._base import set_metadata
-from dataeval.typing import ArrayLike
+from dataeval.typing import Array
 from dataeval.utils._array import ensure_embeddings
 from dataeval.utils._method import get_method
 from dataeval.utils._mst import compute_neighbors, minimum_spanning_tree
@@ -65,7 +65,7 @@ _DIVERGENCE_FN_MAP = {"FNN": divergence_fnn, "MST": divergence_mst}
 
 
 @set_metadata
-def divergence(emb_a: ArrayLike, emb_b: ArrayLike, method: Literal["FNN", "MST"] = "FNN") -> DivergenceOutput:
+def divergence(emb_a: Array, emb_b: Array, method: Literal["FNN", "MST"] = "FNN") -> DivergenceOutput:
     """
     Calculates the :term:`divergence` and any errors between the datasets.
 
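
These signature changes narrow the accepted input from the permissive `ArrayLike` protocol to the stricter `Array` protocol, so callers pass genuine array objects (numpy arrays, torch tensors) rather than nested lists. A hedged sketch of a conforming call; the import path and the output attribute names (`.ber`, `.divergence`) are assumptions, and the shapes are made up:

```python
import numpy as np
from dataeval.metrics.estimators import ber, divergence

rng = np.random.default_rng(0)
emb_a = rng.random((100, 16), dtype=np.float32)
emb_b = rng.random((100, 16), dtype=np.float32)
labels = rng.integers(0, 3, size=100)

# ndarrays satisfy the Array protocol; plain nested lists no longer type-check.
print(ber(emb_a, labels).ber)
print(divergence(emb_a, emb_b).divergence)
```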
@@ -47,11 +47,11 @@ class ClustererOutput(Output):
         The strength of the data point belonging to the assigned cluster
     """
 
-    clusters: NDArray[np.int_]
-    mst: NDArray[np.double]
-    linkage_tree: NDArray[np.double]
-    condensed_tree: NDArray[np.double]
-    membership_strengths: NDArray[np.double]
+    clusters: NDArray[np.intp]
+    mst: NDArray[np.float32]
+    linkage_tree: NDArray[np.float32]
+    condensed_tree: NDArray[np.float32]
+    membership_strengths: NDArray[np.float32]
 
     def find_outliers(self) -> NDArray[np.int_]:
         """
@@ -77,7 +77,7 @@ class ClustererOutput(Output):
         # Delay load numba compiled functions
         from dataeval.utils._clusterer import compare_links_to_cluster_std, sorted_union_find
 
-        exact_indices, near_indices = compare_links_to_cluster_std(self.mst, self.clusters)
+        exact_indices, near_indices = compare_links_to_cluster_std(self.mst, self.clusters)  # type: ignore
         exact_dupes = sorted_union_find(exact_indices)
         near_dupes = sorted_union_find(near_indices)
 
dataeval/utils/_array.py CHANGED
@@ -19,7 +19,7 @@ _logger = logging.getLogger(__name__)
 
 _MODULE_CACHE = {}
 
-T = TypeVar("T", ArrayLike, np.ndarray, torch.Tensor)
+T = TypeVar("T", Array, np.ndarray, torch.Tensor)
 _np_dtype = TypeVar("_np_dtype", bound=np.generic)
 
 
@@ -73,6 +73,19 @@ def to_numpy_iter(iterable: Iterable[ArrayLike]) -> Iterator[NDArray[Any]]:
         yield to_numpy(array)
 
 
+@overload
+def rescale_array(array: NDArray[_np_dtype]) -> NDArray[_np_dtype]: ...
+@overload
+def rescale_array(array: torch.Tensor) -> torch.Tensor: ...
+def rescale_array(array: Array | NDArray[_np_dtype] | torch.Tensor) -> Array | NDArray[_np_dtype] | torch.Tensor:
+    """Rescale an array to the range [0, 1]"""
+    if isinstance(array, (np.ndarray, torch.Tensor)):
+        arr_min = array.min()
+        arr_max = array.max()
+        return (array - arr_min) / (arr_max - arr_min)
+    raise TypeError(f"Unsupported type: {type(array)}")
+
+
 @overload
 def ensure_embeddings(
     embeddings: T,
@@ -137,14 +150,12 @@ def ensure_embeddings(
     if arr.ndim != 2:
         raise ValueError(f"Expected a 2D array, but got a {arr.ndim}D array.")
 
-    if unit_interval:
-        arr_min, arr_max = arr.min(), arr.max()
-        if arr_min < 0 or arr_max > 1:
-            if unit_interval == "force":
-                warnings.warn("Embeddings are not unit interval [0, 1]. Forcing to unit interval.")
-                arr = (arr - arr_min) / (arr_max - arr_min)
-            else:
-                raise ValueError("Embeddings must be unit interval [0, 1].")
+    if unit_interval and (arr.min() < 0 or arr.max() > 1):
+        if unit_interval == "force":
+            warnings.warn("Embeddings are not unit interval [0, 1]. Forcing to unit interval.")
+            arr = rescale_array(arr)
+        else:
+            raise ValueError("Embeddings must be unit interval [0, 1].")
 
     if dtype is None:
         return embeddings
@@ -69,12 +69,12 @@ def compare_links_to_cluster_std(
 @dataclass
 class ClusterData:
     clusters: NDArray[np.intp]
-    mst: NDArray[np.double]
-    linkage_tree: NDArray[np.double]
+    mst: NDArray[np.float32]
+    linkage_tree: NDArray[np.float32]
     condensed_tree: CondensedTree
-    membership_strengths: NDArray[np.double]
+    membership_strengths: NDArray[np.float32]
     k_neighbors: NDArray[np.int32]
-    k_distances: NDArray[np.double]
+    k_distances: NDArray[np.float32]
 
 
 def cluster(data: ArrayLike) -> ClusterData:
@@ -95,9 +95,9 @@ def cluster(data: ArrayLike) -> ClusterData:
 
     max_neighbors = min(25, num_samples - 1)
     kneighbors, kdistances = calculate_neighbor_distances(x, max_neighbors)
-    unsorted_mst: NDArray[np.double] = minimum_spanning_tree(x, kneighbors, kdistances)
-    mst: NDArray[np.double] = unsorted_mst[np.argsort(unsorted_mst.T[2])]
-    linkage_tree: NDArray[np.double] = mst_to_linkage_tree(mst)
+    unsorted_mst: NDArray[np.float32] = minimum_spanning_tree(x, kneighbors, kdistances)
+    mst: NDArray[np.float32] = unsorted_mst[np.argsort(unsorted_mst.T[2])]
+    linkage_tree: NDArray[np.float32] = mst_to_linkage_tree(mst).astype(np.float32)
     condensed_tree: CondensedTree = condense_tree(linkage_tree, min_cluster_size, None)
 
     cluster_tree = cluster_tree_from_condensed_tree(condensed_tree)
@@ -4,6 +4,7 @@ from dataeval.utils.datasets._antiuav import AntiUAVDetection
 from dataeval.utils.datasets._cifar10 import CIFAR10
 from dataeval.utils.datasets._milco import MILCO
 from dataeval.utils.datasets._mnist import MNIST
+from dataeval.utils.datasets._seadrone import SeaDrone
 from dataeval.utils.datasets._ships import Ships
 from dataeval.utils.datasets._voc import VOCDetection, VOCDetectionTorch, VOCSegmentation
 
@@ -13,6 +14,7 @@ __all__ = [
     "CIFAR10",
     "AntiUAVDetection",
     "MILCO",
+    "SeaDrone",
     "VOCDetection",
     "VOCDetectionTorch",
     "VOCSegmentation",
@@ -15,7 +15,7 @@ if TYPE_CHECKING:
     from dataeval.typing import Transform
 
 
-class AntiUAVDetection(BaseODDataset[NDArray[Any]], BaseDatasetNumpyMixin):
+class AntiUAVDetection(BaseODDataset[NDArray[Any], list[str], str], BaseDatasetNumpyMixin):
     """
     A UAV detection dataset focused on detecting UAVs in natural images against large variation in backgrounds.
 
@@ -4,7 +4,7 @@ __all__ = []
 
 from abc import abstractmethod
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, Generic, Iterator, Literal, NamedTuple, Sequence, TypeVar
+from typing import TYPE_CHECKING, Any, Generic, Iterator, Literal, NamedTuple, Sequence, TypeVar, cast
 
 import numpy as np
 
@@ -28,7 +28,8 @@ else:
 _TArray = TypeVar("_TArray")
 
 _TTarget = TypeVar("_TTarget")
-_TRawTarget = TypeVar("_TRawTarget", list[int], list[str])
+_TRawTarget = TypeVar("_TRawTarget", Sequence[int], Sequence[str], Sequence[tuple[list[int], list[list[float]]]])
+_TAnnotation = TypeVar("_TAnnotation", int, str, tuple[list[int], list[list[float]]])
 
 
 class DataLocation(NamedTuple):
@@ -38,7 +39,9 @@ class DataLocation(NamedTuple):
     checksum: str
 
 
-class BaseDataset(AnnotatedDataset[tuple[_TArray, _TTarget, dict[str, Any]]], Generic[_TArray, _TTarget, _TRawTarget]):
+class BaseDataset(
+    AnnotatedDataset[tuple[_TArray, _TTarget, dict[str, Any]]], Generic[_TArray, _TTarget, _TRawTarget, _TAnnotation]
+):
     """
     Base class for internet downloaded datasets.
     """
@@ -144,7 +147,7 @@ class BaseDataset(AnnotatedDataset[tuple[_TArray, _TTarget, dict[str, Any]]], Ge
 
 
 class BaseICDataset(
-    BaseDataset[_TArray, _TArray, list[int]],
+    BaseDataset[_TArray, _TArray, list[int], int],
     BaseDatasetMixin[_TArray],
     ImageClassificationDataset[_TArray],
 ):
@@ -177,7 +180,7 @@ class BaseICDataset(
 
 
 class BaseODDataset(
-    BaseDataset[_TArray, ObjectDetectionTarget[_TArray], list[str]],
+    BaseDataset[_TArray, ObjectDetectionTarget[_TArray], _TRawTarget, _TAnnotation],
     BaseDatasetMixin[_TArray],
     ObjectDetectionDataset[_TArray],
 ):
@@ -200,7 +203,8 @@ class BaseODDataset(
         Image, target, datum_metadata - target.boxes returns boxes in x0, y0, x1, y1 format
         """
         # Grab the bounding boxes and labels from the annotations
-        boxes, labels, additional_metadata = self._read_annotations(self._targets[index])
+        annotation = cast(_TAnnotation, self._targets[index])
+        boxes, labels, additional_metadata = self._read_annotations(annotation)
         # Get the image
         img = self._read_file(self._filepaths[index])
         img_size = img.shape
@@ -217,11 +221,11 @@ class BaseODDataset(
         return img, target, img_metadata
 
     @abstractmethod
-    def _read_annotations(self, annotation: str) -> tuple[list[list[float]], list[int], dict[str, Any]]: ...
+    def _read_annotations(self, annotation: _TAnnotation) -> tuple[list[list[float]], list[int], dict[str, Any]]: ...
 
 
 class BaseSegDataset(
-    BaseDataset[_TArray, SegmentationTarget[_TArray], list[str]],
+    BaseDataset[_TArray, SegmentationTarget[_TArray], list[str], str],
     BaseDatasetMixin[_TArray],
     SegmentationDataset[_TArray],
 ):
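
With the fourth `_TAnnotation` parameter, each subclass now pins down both the raw-target container and the single-annotation type that `_read_annotations` receives, matching the `BaseODDataset[NDArray[Any], list[str], str]` bindings seen in the dataset classes above. A hypothetical subclass sketch (the class name and parsing logic are illustrative, not from the package):

```python
from typing import Any

from numpy.typing import NDArray

# Raw targets are a list of annotation file paths (list[str]); each call to
# _read_annotations receives one such path (str).
class MyODDataset(BaseODDataset[NDArray[Any], list[str], str]):
    def _read_annotations(self, annotation: str) -> tuple[list[list[float]], list[int], dict[str, Any]]:
        # Parse one annotation file into (boxes, labels, extra metadata).
        return [[0.0, 0.0, 10.0, 10.0]], [1], {"path": annotation}
```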
@@ -128,9 +128,9 @@ def _ensure_exists(
 
     elif not check_path.exists() and not download:
         raise FileNotFoundError(
-            "Data could not be loaded with the provided root directory, ",
-            f"the file path to the file {filename} does not exist, ",
-            "and the download parameter is set to False.",
+            "Data could not be loaded with the provided root directory, "
+            f"the file path to the file {filename} does not exist, "
+            "and the download parameter is set to False."
         )
     else:
         if not _validate_file(check_path, checksum, md5):
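
The old code passed three comma-separated strings to `FileNotFoundError`, so the exception carried a tuple of message fragments; removing the commas lets adjacent string literals concatenate into one message. A quick illustration of the difference:

```python
# Trailing commas: args becomes a tuple of three separate fragments.
err = FileNotFoundError("part one, ", "part two, ", "part three.")
print(err.args)   # ('part one, ', 'part two, ', 'part three.')

# Adjacent string literals concatenate into a single message.
err = FileNotFoundError("part one, " "part two, " "part three.")
print(str(err))   # part one, part two, part three.
```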
@@ -14,7 +14,7 @@ if TYPE_CHECKING:
     from dataeval.typing import Transform
 
 
-class MILCO(BaseODDataset[NDArray[Any]], BaseDatasetNumpyMixin):
+class MILCO(BaseODDataset[NDArray[Any], list[str], str], BaseDatasetNumpyMixin):
     """
     A side-scan sonar dataset focused on mine-like object detection.
 