PyPI - dataeval - Versions diffs - 0.86.0__py3-none-any.whl → 0.86.1__py3-none-any.whl - Mend

dataeval 0.86.0__py3-none-any.whl → 0.86.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (62) hide show

dataeval/__init__.py +1 -1
dataeval/_log.py +1 -1
dataeval/config.py +21 -4
dataeval/data/_embeddings.py +2 -2
dataeval/data/_images.py +2 -3
dataeval/data/_metadata.py +48 -37
dataeval/data/_selection.py +1 -2
dataeval/data/_split.py +2 -3
dataeval/data/_targets.py +17 -13
dataeval/data/selections/_classfilter.py +2 -5
dataeval/data/selections/_prioritize.py +6 -9
dataeval/data/selections/_shuffle.py +3 -1
dataeval/detectors/drift/_base.py +4 -5
dataeval/detectors/drift/_mmd.py +3 -6
dataeval/detectors/drift/_nml/_base.py +4 -2
dataeval/detectors/drift/_nml/_chunk.py +11 -19
dataeval/detectors/drift/_nml/_domainclassifier.py +8 -19
dataeval/detectors/drift/_nml/_result.py +8 -9
dataeval/detectors/drift/_nml/_thresholds.py +66 -77
dataeval/detectors/linters/outliers.py +7 -7
dataeval/metrics/bias/_parity.py +10 -13
dataeval/metrics/estimators/_divergence.py +2 -4
dataeval/metrics/stats/_base.py +103 -42
dataeval/metrics/stats/_boxratiostats.py +21 -19
dataeval/metrics/stats/_dimensionstats.py +14 -10
dataeval/metrics/stats/_hashstats.py +1 -1
dataeval/metrics/stats/_pixelstats.py +6 -6
dataeval/metrics/stats/_visualstats.py +3 -3
dataeval/outputs/_base.py +22 -7
dataeval/outputs/_bias.py +26 -28
dataeval/outputs/_drift.py +1 -9
dataeval/outputs/_linters.py +11 -11
dataeval/outputs/_stats.py +82 -23
dataeval/outputs/_workflows.py +2 -2
dataeval/utils/_array.py +6 -9
dataeval/utils/_bin.py +1 -2
dataeval/utils/_clusterer.py +7 -4
dataeval/utils/_fast_mst.py +27 -13
dataeval/utils/_image.py +65 -11
dataeval/utils/_mst.py +1 -3
dataeval/utils/_plot.py +15 -10
dataeval/utils/data/_dataset.py +32 -20
dataeval/utils/data/metadata.py +104 -82
dataeval/utils/datasets/__init__.py +2 -0
dataeval/utils/datasets/_antiuav.py +189 -0
dataeval/utils/datasets/_base.py +11 -8
dataeval/utils/datasets/_cifar10.py +104 -45
dataeval/utils/datasets/_fileio.py +21 -47
dataeval/utils/datasets/_milco.py +19 -11
dataeval/utils/datasets/_mixin.py +2 -4
dataeval/utils/datasets/_mnist.py +3 -4
dataeval/utils/datasets/_ships.py +14 -7
dataeval/utils/datasets/_voc.py +229 -42
dataeval/utils/torch/models.py +5 -10
dataeval/utils/torch/trainer.py +3 -3
dataeval/workflows/sufficiency.py +2 -2
{dataeval-0.86.0.dist-info → dataeval-0.86.1.dist-info}/METADATA +1 -1
dataeval-0.86.1.dist-info/RECORD +114 -0
dataeval/detectors/ood/vae.py +0 -74
dataeval-0.86.0.dist-info/RECORD +0 -114
{dataeval-0.86.0.dist-info → dataeval-0.86.1.dist-info}/LICENSE.txt +0 -0
{dataeval-0.86.0.dist-info → dataeval-0.86.1.dist-info}/WHEEL +0 -0

dataeval/__init__.py CHANGED Viewed

@@ -8,7 +8,7 @@ shifts that impact performance of deployed models.
 from __future__ import annotations
 __all__ = ["config", "detectors", "log", "metrics", "typing", "utils", "workflows"]
-__version__ = "0.86.0"
+__version__ = "0.86.1"
 import logging

dataeval/_log.py CHANGED Viewed

@@ -8,7 +8,7 @@ class LogMessage:
     Deferred message callback for logging expensive messages.
     """
-    def __init__(self, fn: Callable[..., str]):
+    def __init__(self, fn: Callable[..., str]) -> None:
         self._fn = fn
         self._str = None

dataeval/config.py CHANGED Viewed

@@ -4,10 +4,10 @@ Global configuration settings for DataEval.
 from __future__ import annotations
-__all__ = ["get_device", "set_device", "get_max_processes", "set_max_processes", "DeviceLike"]
+__all__ = ["get_device", "set_device", "get_max_processes", "set_max_processes", "use_max_processes", "DeviceLike"]
 import sys
-from typing import Union
+from typing import Any, Union
 if sys.version_info >= (3, 10):
     from typing import TypeAlias
@@ -78,8 +78,7 @@ def get_device(override: DeviceLike | None = None) -> torch.device:
     if override is None:
         global _device
         return torch.get_default_device() if _device is None else _device
-    else:
-        return _todevice(override)
+    return _todevice(override)
 def set_max_processes(processes: int | None) -> None:
@@ -112,6 +111,24 @@ def get_max_processes() -> int | None:
     return _processes
+class MaxProcessesContextManager:
+    def __init__(self, processes: int) -> None:
+        self._processes = processes
+    def __enter__(self) -> None:
+        global _processes
+        self._old = _processes
+        set_max_processes(self._processes)
+    def __exit__(self, *args: tuple[Any, ...]) -> None:
+        global _processes
+        _processes = self._old
+def use_max_processes(processes: int) -> MaxProcessesContextManager:
+    return MaxProcessesContextManager(processes)
 def set_seed(seed: int | None, all_generators: bool = False) -> None:
     """
     Sets the seed for use by classes that allow for a random state or seed.

dataeval/data/_embeddings.py CHANGED Viewed

@@ -144,8 +144,7 @@ class Embeddings:
         """
         if indices is not None:
             return torch.vstack(list(self._batch(indices))).to(self.device)
-        else:
-            return self[:]
+        return self[:]
     def to_numpy(self, indices: Sequence[int] | None = None) -> NDArray[Any]:
         """
@@ -248,6 +247,7 @@ class Embeddings:
             _logger.log(logging.DEBUG, f"Saved embeddings cache from {path}")
         except Exception as e:
             _logger.log(logging.ERROR, f"Failed to save embeddings cache: {e}")
+            raise e
     @classmethod
     def load(cls, path: Path | str) -> Embeddings:

dataeval/data/_images.py CHANGED Viewed

@@ -73,15 +73,14 @@ class Images(Generic[T]):
     def __getitem__(self, key: int | slice, /) -> Sequence[T] | T:
         if isinstance(key, slice):
             return [self._get_image(k) for k in range(len(self._dataset))[key]]
-        elif hasattr(key, "__int__"):
+        if hasattr(key, "__int__"):
             return self._get_image(int(key))
         raise TypeError(f"Key must be integers or slices, not {type(key)}")
     def _get_image(self, index: int) -> T:
         if self._is_tuple_datum:
             return cast(Dataset[tuple[T, Any, Any]], self._dataset)[index][0]
-        else:
-            return cast(Dataset[T], self._dataset)[index]
+        return cast(Dataset[T], self._dataset)[index]
     def __iter__(self) -> Iterator[T]:
         for i in range(len(self._dataset)):

dataeval/data/_metadata.py CHANGED Viewed

@@ -196,7 +196,7 @@ class Metadata:
         self._process()
         return int(self._image_indices.max() + 1)
-    def _collate(self, force: bool = False):
+    def _collate(self, force: bool = False) -> None:
         if self._collated and not force:
             return
@@ -243,7 +243,7 @@ class Metadata:
         self._class_names = [index2label.get(i, str(i)) for i in np.unique(self._class_labels)]
         self._collated = True
-    def _merge(self, force: bool = False):
+    def _merge(self, force: bool = False) -> None:
         if self._merged is not None and not force:
             return
@@ -266,48 +266,26 @@ class Metadata:
                     "Metadata dictionary needs to be a single dictionary whose values "
                     "are arraylike containing the metadata on a per image or per object basis."
                 )
-            else:
-                check_length = len(v) if check_length is None else check_length
-                if check_length != len(v):
-                    raise ValueError(
-                        "The lists/arrays in the metadata dict have varying lengths. "
-                        "Metadata requires them to be uniform in length."
-                    )
+            check_length = len(v) if check_length is None else check_length
+            if check_length != len(v):
+                raise ValueError(
+                    "The lists/arrays in the metadata dict have varying lengths. "
+                    "Metadata requires them to be uniform in length."
+                )
         if len(self._class_labels) != check_length:
             raise ValueError(
                 f"The length of the label array {len(self._class_labels)} is not the same as "
                 f"the length of the metadata arrays {check_length}."
             )
-    def _process(self, force: bool = False) -> None:
-        if self._processed and not force:
-            return
-        # Create image indices from targets
-        self._image_indices = np.arange(len(self.raw)) if self.targets.source is None else self.targets.source
-        # Validate the metadata dimensions
-        self._validate()
-        # Include specified metadata keys
-        if self.include:
-            metadata = {i: self.merged[i] for i in self.include if i in self.merged}
-            continuous_factor_bins = (
-                {i: self.continuous_factor_bins[i] for i in self.include if i in self.continuous_factor_bins}
-                if self.continuous_factor_bins
-                else {}
-            )
-        else:
-            metadata = self.merged
-            continuous_factor_bins = dict(self.continuous_factor_bins) if self.continuous_factor_bins else {}
-            for k in self.exclude:
-                metadata.pop(k, None)
-                continuous_factor_bins.pop(k, None)
-        # Remove generated "_image_index" if present
-        if "_image_index" in metadata:
-            metadata.pop("_image_index", None)
+    def _filter(self, d: Mapping[str, Any]) -> dict[str, Any]:
+        return (
+            {k: d[k] for k in self.include if k in d} if self.include else {k: d[k] for k in d if k not in self.exclude}
+        )
+    def _split_continuous_discrete(
+        self, metadata: dict[str, NDArray[Any]], continuous_factor_bins: dict[str, int | Sequence[float]]
+    ) -> tuple[dict[str, NDArray[Any]], dict[str, NDArray[np.int64]]]:
         # Bin according to user supplied bins
         continuous_metadata = {}
         discrete_metadata = {}
@@ -346,6 +324,28 @@ class Metadata:
             else:
                 _, discrete_metadata[key] = np.unique(data, return_inverse=True)
+        return continuous_metadata, discrete_metadata
+    def _process(self, force: bool = False) -> None:
+        if self._processed and not force:
+            return
+        # Create image indices from targets
+        self._image_indices = np.arange(len(self.raw)) if self.targets.source is None else self.targets.source
+        # Validate the metadata dimensions
+        self._validate()
+        # Filter the merged metadata and continuous factor bins
+        metadata = self._filter(self.merged)
+        continuous_factor_bins = self._filter(self.continuous_factor_bins)
+        # Remove generated "_image_index" if present
+        metadata.pop("_image_index", None)
+        # Split the metadata into continuous and discrete
+        continuous_metadata, discrete_metadata = self._split_continuous_discrete(metadata, continuous_factor_bins)
         # Split out the dictionaries into the keys and values
         self._discrete_factor_names = list(discrete_metadata.keys())
         self._discrete_data = (
@@ -363,6 +363,17 @@ class Metadata:
         self._processed = True
     def add_factors(self, factors: Mapping[str, ArrayLike]) -> None:
+        """
+        Add additional factors to the metadata.
+        The number of measures per factor must match the number of images
+        in the dataset or the number of detections in the dataset.
+        Parameters
+        ----------
+        factors : Mapping[str, ArrayLike]
+            Dictionary of factors to add to the metadata.
+        """
         self._merge()
         targets = len(self.targets.source) if self.targets.source is not None else len(self.targets)

dataeval/data/_selection.py CHANGED Viewed

@@ -110,8 +110,7 @@ class Select(AnnotatedDataset[_TDatum]):
         grouped: dict[int, list[Selection[_TDatum]]] = {}
         for selection in selections_list:
             grouped.setdefault(selection.stage, []).append(selection)
-        selection_list = [selection for category in sorted(grouped) for selection in grouped[category]]
-        return selection_list
+        return [selection for category in sorted(grouped) for selection in grouped[category]]
     def _apply_selections(self) -> None:
         for selection in self._selections:

dataeval/data/_split.py CHANGED Viewed

@@ -23,7 +23,7 @@ _logger = logging.getLogger(__name__)
 class KFoldSplitter(Protocol):
     """Protocol covering sklearn KFold variant splitters"""
-    def __init__(self, n_splits: int): ...
+    def __init__(self, n_splits: int) -> None: ...
     def split(self, X: Any, y: Any, groups: Any) -> Iterator[tuple[NDArray[Any], NDArray[Any]]]: ...
@@ -209,8 +209,7 @@ def get_groups(metadata: Metadata, split_on: Sequence[str] | None) -> NDArray[np
     split_set = set(split_on)
     indices = [i for i, name in enumerate(metadata.discrete_factor_names) if name in split_set]
     binned_features = metadata.discrete_data[:, indices]
-    group_ids = np.unique(binned_features, axis=0, return_inverse=True)[1]
-    return group_ids
+    return np.unique(binned_features, axis=0, return_inverse=True)[1]
 def make_splits(

dataeval/data/_targets.py CHANGED Viewed

@@ -24,11 +24,13 @@ class Targets:
     labels : NDArray[np.intp]
         Labels (N,) for N images or objects
     scores : NDArray[np.float32]
-        Probability scores (N,M) for N images of M classes or confidence score (N,) of objects
+        Probability scores (N, M) for N images of M classes or confidence score (N,) of objects
     bboxes : NDArray[np.float32] | None
-        Bounding boxes (N,4) for N objects in (x0,y0,x1,y1) format
+        Bounding boxes (N, 4) for N objects in (x0, y0, x1, y1) format
     source : NDArray[np.intp] | None
         Source image index (N,) for N objects
+    size : int
+        Count of objects
     """
     labels: NDArray[np.intp]
@@ -55,13 +57,16 @@ class Targets:
             )
         if self.bboxes is not None and len(self.bboxes) > 0 and self.bboxes.shape[-1] != 4:
-            raise ValueError("Bounding boxes must be in (x0,y0,x1,y1) format.")
+            raise ValueError("Bounding boxes must be in (x0, y0, x1, y1) format.")
+    @property
+    def size(self) -> int:
+        return len(self.labels)
     def __len__(self) -> int:
         if self.source is None:
             return len(self.labels)
-        else:
-            return len(np.unique(self.source))
+        return len(np.unique(self.source))
     def __getitem__(self, idx: int, /) -> Targets:
         if self.source is None or self.bboxes is None:
@@ -71,14 +76,13 @@ class Targets:
                 None,
                 None,
             )
-        else:
-            mask = np.where(self.source == idx, True, False)
-            return Targets(
-                np.atleast_1d(self.labels[mask]),
-                np.atleast_1d(self.scores[mask]),
-                np.atleast_2d(self.bboxes[mask]),
-                np.atleast_1d(self.source[mask]),
-            )
+        mask = np.where(self.source == idx, True, False)
+        return Targets(
+            np.atleast_1d(self.labels[mask]),
+            np.atleast_1d(self.scores[mask]),
+            np.atleast_2d(self.bboxes[mask]),
+            np.atleast_1d(self.source[mask]),
+        )
     def __iter__(self) -> Iterator[Targets]:
         for i in range(len(self.labels)) if self.source is None else np.unique(self.source):

dataeval/data/selections/_classfilter.py CHANGED Viewed

@@ -68,11 +68,8 @@ _TTarget = TypeVar("_TTarget", ObjectDetectionTarget, SegmentationTarget)
 def _try_mask_object(obj: _T, mask: NDArray[np.bool_]) -> _T:
-    if isinstance(obj, Sized) and not isinstance(obj, (str, bytes, bytearray)) and len(obj) == len(mask):
-        if isinstance(obj, Array):
-            return obj[mask]
-        elif isinstance(obj, Sequence):
-            return cast(_T, [item for i, item in enumerate(obj) if mask[i]])
+    if not isinstance(obj, (str, bytes, bytearray)) and isinstance(obj, (Sequence, Array)) and len(obj) == len(mask):
+        return obj[mask] if isinstance(obj, Array) else cast(_T, [item for i, item in enumerate(obj) if mask[i]])
     return obj

dataeval/data/selections/_prioritize.py CHANGED Viewed

@@ -99,8 +99,7 @@ class _KNNSorter(_Sorter):
             np.fill_diagonal(dists, np.inf)
         else:
             dists = pairwise_distances(embeddings, reference)
-        inds = np.argsort(np.sort(dists, axis=1)[:, self._k])
-        return inds
+        return np.argsort(np.sort(dists, axis=1)[:, self._k])
 class _KMeansSorter(_Sorter):
@@ -124,15 +123,13 @@ class _KMeansSorter(_Sorter):
 class _KMeansDistanceSorter(_KMeansSorter):
     def _sort(self, embeddings: NDArray[Any], reference: NDArray[Any] | None = None) -> NDArray[np.intp]:
         clst = self._get_clusters(embeddings if reference is None else reference)
-        inds = np.argsort(clst._dist2center(embeddings))
-        return inds
+        return np.argsort(clst._dist2center(embeddings))
 class _KMeansComplexitySorter(_KMeansSorter):
     def _sort(self, embeddings: NDArray[Any], reference: NDArray[Any] | None = None) -> NDArray[np.intp]:
         clst = self._get_clusters(embeddings if reference is None else reference)
-        inds = clst._sort_by_weights(embeddings)
-        return inds
+        return clst._sort_by_weights(embeddings)
 class Prioritize(Selection[Any]):
@@ -266,10 +263,10 @@ class Prioritize(Selection[Any]):
     def _get_sorter(self, samples: int) -> _Sorter:
         if self._method == "knn":
             return _KNNSorter(samples, self._k)
-        elif self._method == "kmeans_distance":
+        if self._method == "kmeans_distance":
             return _KMeansDistanceSorter(samples, self._c)
-        else:  # self._method == "kmeans_complexity"
-            return _KMeansComplexitySorter(samples, self._c)
+        # self._method == "kmeans_complexity"
+        return _KMeansComplexitySorter(samples, self._c)
     def _to_normalized_ndarray(self, embeddings: Embeddings, selection: list[int] | None = None) -> NDArray[Any]:
         emb: NDArray[Any] = embeddings.to_numpy(selection)

dataeval/data/selections/_shuffle.py CHANGED Viewed

@@ -30,7 +30,9 @@ class Shuffle(Selection[Any]):
     seed: int | NDArray[Any] | SeedSequence | BitGenerator | Generator | None
     stage = SelectionStage.ORDER
-    def __init__(self, seed: int | Sequence[int] | Array | SeedSequence | BitGenerator | Generator | None = None):
+    def __init__(
+        self, seed: int | Sequence[int] | Array | SeedSequence | BitGenerator | Generator | None = None
+    ) -> None:
         self.seed = as_numpy(seed) if isinstance(seed, (Sequence, Array)) else seed
     def __call__(self, dataset: Select[Any]) -> None:

dataeval/detectors/drift/_base.py CHANGED Viewed

@@ -13,7 +13,7 @@ __all__ = []
 import math
 from abc import abstractmethod
 from functools import wraps
-from typing import Callable, Literal, Protocol, TypeVar, runtime_checkable
+from typing import Any, Callable, Literal, Protocol, TypeVar, runtime_checkable
 import numpy as np
 from numpy.typing import NDArray
@@ -40,7 +40,7 @@ def update_strategy(fn: Callable[..., R]) -> Callable[..., R]:
     """Decorator to update x_ref with x using selected update methodology"""
     @wraps(fn)
-    def _(self: BaseDrift, data: Embeddings | Array, *args, **kwargs) -> R:
+    def _(self: BaseDrift, data: Embeddings | Array, *args: tuple[Any, ...], **kwargs: dict[str, Any]) -> R:
         output = fn(self, data, *args, **kwargs)
         # update reference dataset
@@ -184,7 +184,7 @@ class BaseDriftUnivariate(BaseDrift):
             threshold = self.p_val / self.n_features
             drift_pred = bool((p_vals < threshold).any())
             return drift_pred, threshold
-        elif self.correction == "fdr":
+        if self.correction == "fdr":
             n = p_vals.shape[0]
             i = np.arange(n) + np.int_(1)
             p_sorted = np.sort(p_vals)
@@ -195,8 +195,7 @@ class BaseDriftUnivariate(BaseDrift):
             except ValueError:  # sorted p-values not below thresholds
                 return bool(below_threshold.any()), q_threshold.min()
             return bool(below_threshold.any()), q_threshold[idx_threshold]
-        else:
-            raise ValueError("`correction` needs to be either `bonferroni` or `fdr`.")
+        raise ValueError("`correction` needs to be either `bonferroni` or `fdr`.")
     @set_metadata
     @update_strategy

dataeval/detectors/drift/_mmd.py CHANGED Viewed

@@ -95,8 +95,7 @@ class DriftMMD(BaseDrift):
         k_xy = self._kernel(x, y)
         k_xx = self._k_xx if self._k_xx is not None and self.update_strategy is None else self._kernel(x, x)
         k_yy = self._kernel(y, y)
-        kernel_mat = torch.cat([torch.cat([k_xx, k_xy], 1), torch.cat([k_xy.T, k_yy], 1)], 0)
-        return kernel_mat
+        return torch.cat([torch.cat([k_xx, k_xy], 1), torch.cat([k_xy.T, k_yy], 1)], 0)
     def score(self, data: Embeddings | Array) -> tuple[float, float, float]:
         """
@@ -205,8 +204,7 @@ def sigma_median(x: torch.Tensor, y: torch.Tensor, dist: torch.Tensor) -> torch.
     n = min(x.shape[0], y.shape[0])
     n = n if (x[:n] == y[:n]).all() and x.shape == y.shape else 0
     n_median = n + (torch.prod(torch.as_tensor(dist.shape)) - n) // 2 - 1
-    sigma = (0.5 * dist.flatten().sort().values[int(n_median)].unsqueeze(dim=-1)) ** 0.5
-    return sigma
+    return (0.5 * dist.flatten().sort().values[int(n_median)].unsqueeze(dim=-1)) ** 0.5
 class GaussianRBF(torch.nn.Module):
@@ -310,5 +308,4 @@ def mmd2_from_kernel_matrix(
         kernel_mat = kernel_mat[idx][:, idx]
     k_xx, k_yy, k_xy = kernel_mat[:-m, :-m], kernel_mat[-m:, -m:], kernel_mat[-m:, :-m]
     c_xx, c_yy = 1 / (n * (n - 1)), 1 / (m * (m - 1))
-    mmd2 = c_xx * k_xx.sum() + c_yy * k_yy.sum() - 2.0 * k_xy.mean()
-    return mmd2
+    return c_xx * k_xx.sum() + c_yy * k_yy.sum() - 2.0 * k_xy.mean()

dataeval/detectors/drift/_nml/_base.py CHANGED Viewed

@@ -27,7 +27,9 @@ def _validate(data: pd.DataFrame, expected_features: int | None = None) -> int:
     return data.shape[-1]
-def _create_multilevel_index(chunks: Sequence[Chunk], result_group_name: str, result_column_names: Sequence[str]):
+def _create_multilevel_index(
+    chunks: Sequence[Chunk], result_group_name: str, result_column_names: Sequence[str]
+) -> pd.MultiIndex:
     chunk_column_names = (*chunks[0].KEYS, "period")
     chunk_tuples = [("chunk", chunk_column_name) for chunk_column_name in chunk_column_names]
     result_tuples = [(result_group_name, column_name) for column_name in result_column_names]
@@ -37,7 +39,7 @@ def _create_multilevel_index(chunks: Sequence[Chunk], result_group_name: str, re
 class AbstractCalculator(ABC):
     """Base class for drift calculation."""
-    def __init__(self, chunker: Chunker | None = None, logger: Logger | None = None):
+    def __init__(self, chunker: Chunker | None = None, logger: Logger | None = None) -> None:
         self.chunker = chunker if isinstance(chunker, Chunker) else CountBasedChunker(10)
         self.result: DriftMVDCOutput | None = None
         self.n_features: int | None = None

dataeval/detectors/drift/_nml/_chunk.py CHANGED Viewed

@@ -16,7 +16,6 @@ from abc import ABC, abstractmethod
 from typing import Any, Generic, Literal, Sequence, TypeVar, cast
 import pandas as pd
-from dateutil.parser import ParserError
 from pandas import Index, Period
 from typing_extensions import Self
@@ -31,7 +30,7 @@ class Chunk(ABC):
     def __init__(
         self,
         data: pd.DataFrame,
-    ):
+    ) -> None:
         self.key: str
         self.data = data
@@ -39,11 +38,11 @@ class Chunk(ABC):
         self.end_index: int = -1
         self.chunk_index: int = -1
-    def __repr__(self):
+    def __repr__(self) -> str:
         attr_str = ", ".join([f"{k}={v}" for k, v in self.dict().items()])
         return f"{self.__class__.__name__}(data=pd.DataFrame(shape={self.data.shape}), {attr_str})"
-    def __len__(self):
+    def __len__(self) -> int:
         return self.data.shape[0]
     @abstractmethod
@@ -76,7 +75,7 @@ class IndexChunk(Chunk):
         data: pd.DataFrame,
         start_index: int,
         end_index: int,
-    ):
+    ) -> None:
         super().__init__(data)
         self.key = f"[{start_index}:{end_index}]"
         self.start_index: int = start_index
@@ -113,7 +112,7 @@ class PeriodChunk(Chunk):
     KEYS = ("key", "chunk_index", "start_date", "end_date", "chunk_size")
-    def __init__(self, data: pd.DataFrame, period: Period, chunk_size: int):
+    def __init__(self, data: pd.DataFrame, period: Period, chunk_size: int) -> None:
         super().__init__(data)
         self.key = str(period)
         self.start_datetime = period.start_time
@@ -127,6 +126,7 @@ class PeriodChunk(Chunk):
         a, b = (self, other) if self < other else (other, self)
         result = copy.deepcopy(a)
         result.data = pd.concat([a.data, b.data])
+        result.end_index = b.end_index
         result.end_datetime = b.end_datetime
         result.chunk_size += b.chunk_size
         return result
@@ -237,13 +237,7 @@ class PeriodBasedChunker(Chunker[PeriodChunk]):
         if self.timestamp_column_name not in data:
             raise ValueError(f"timestamp column '{self.timestamp_column_name}' not in columns")
-        try:
-            grouped = data.groupby(pd.to_datetime(data[self.timestamp_column_name]).dt.to_period(self.offset))
-        except ParserError:
-            raise ValueError(
-                f"could not parse date_column '{self.timestamp_column_name}' values as dates."
-                f"Please verify if you've specified the correct date column."
-            )
+        grouped = data.groupby(pd.to_datetime(data[self.timestamp_column_name]).dt.to_period(self.offset))
         for k, v in grouped.groups.items():
             period, index = cast(Period, k), cast(Index, v)
@@ -281,7 +275,7 @@ class SizeBasedChunker(Chunker[IndexChunk]):
         self,
         chunk_size: int,
         incomplete: Literal["append", "drop", "keep"] = "keep",
-    ):
+    ) -> None:
         """Create a new SizeBasedChunker.
         Parameters
@@ -314,12 +308,11 @@ class SizeBasedChunker(Chunker[IndexChunk]):
     def _split(self, data: pd.DataFrame) -> list[IndexChunk]:
         def _create_chunk(index: int, data: pd.DataFrame, chunk_size: int) -> IndexChunk:
             chunk_data = data.iloc[index : index + chunk_size]
-            chunk = IndexChunk(
+            return IndexChunk(
                 data=chunk_data,
                 start_index=index,
                 end_index=index + chunk_size - 1,
             )
-            return chunk
         chunks = [
             _create_chunk(index=i, data=data, chunk_size=self.chunk_size)
@@ -364,7 +357,7 @@ class CountBasedChunker(Chunker[IndexChunk]):
         self,
         chunk_number: int,
         incomplete: Literal["append", "drop", "keep"] = "keep",
-    ):
+    ) -> None:
         """Creates a new CountBasedChunker.
         It will calculate the amount of observations per chunk based on the given chunk count.
@@ -400,5 +393,4 @@ class CountBasedChunker(Chunker[IndexChunk]):
     def _split(self, data: pd.DataFrame) -> list[IndexChunk]:
         chunk_size = data.shape[0] // self.chunk_number
         chunker = SizeBasedChunker(chunk_size, self.incomplete)
-        chunks = chunker.split(data=data)
-        return chunks
+        return chunker.split(data=data)

dataeval/detectors/drift/_nml/_domainclassifier.py CHANGED Viewed

@@ -20,7 +20,7 @@ from sklearn.model_selection import StratifiedKFold
 from dataeval.config import get_max_processes, get_seed
 from dataeval.detectors.drift._nml._base import AbstractCalculator, _create_multilevel_index
 from dataeval.detectors.drift._nml._chunk import Chunk, Chunker
-from dataeval.detectors.drift._nml._thresholds import ConstantThreshold, Threshold, calculate_threshold_values
+from dataeval.detectors.drift._nml._thresholds import ConstantThreshold, Threshold
 from dataeval.outputs._base import set_metadata
 from dataeval.outputs._drift import DriftMVDCOutput
@@ -38,10 +38,8 @@ DEFAULT_LGBM_HYPERPARAMS = {
     "min_child_weight": 0.001,
     "min_split_gain": 0.0,
     "n_estimators": 100,
-    "n_jobs": get_max_processes() or 0,
     "num_leaves": 31,
     "objective": None,
-    "random_state": get_seed(),
     "reg_alpha": 0.0,
     "reg_lambda": 0.0,
     "subsample": 1.0,
@@ -126,7 +124,7 @@ class DomainClassifierCalculator(AbstractCalculator):
             self.result._data = pd.concat([self.result._data, res], ignore_index=True)
         return self.result
-    def _calculate_chunk(self, chunk: Chunk):
+    def _calculate_chunk(self, chunk: Chunk) -> float:
         if self.result is None:
             # Use information from chunk indices to identify reference chunk's location. This is possible because
             # both the internal reference data copy and the chunk data were sorted by timestamp, so these
@@ -151,7 +149,7 @@ class DomainClassifierCalculator(AbstractCalculator):
             _try = y[train_index]
             _tsx = df_X.iloc[test_index]
             _tsy = y[test_index]
-            model = LGBMClassifier(**self.hyperparameters)
+            model = LGBMClassifier(**self.hyperparameters, n_jobs=get_max_processes(), random_state=get_seed())
             model.fit(_trx, _try)
             preds = np.asarray(model.predict_proba(_tsx), dtype=np.float32)[:, 1]
             all_preds.append(preds)
@@ -159,24 +157,15 @@ class DomainClassifierCalculator(AbstractCalculator):
         np_all_preds = np.concatenate(all_preds, axis=0)
         np_all_tgts = np.concatenate(all_tgts, axis=0)
-        try:
-            # catch case where all rows are duplicates
-            result = roc_auc_score(np_all_tgts, np_all_preds)
-        except ValueError as err:
-            if str(err) != "Only one class present in y_true. ROC AUC score is not defined in that case.":
-                raise
-            else:
-                # by definition if reference and chunk exactly match we can't discriminate
-                result = 0.5
-        return result
+        result = roc_auc_score(np_all_tgts, np_all_preds)
+        return 0.5 if result == np.nan else float(result)
     def _populate_alert_thresholds(self, result_data: pd.DataFrame) -> pd.DataFrame:
         if self.result is None:
-            self._threshold_values = calculate_threshold_values(
-                threshold=self.threshold,
+            self._threshold_values = self.threshold.calculate(
                 data=result_data.loc[:, ("domain_classifier_auroc", "value")],  # type: ignore | dataframe loc
-                lower_threshold_value_limit=0.0,
-                upper_threshold_value_limit=1.0,
+                lower_limit=0.0,
+                upper_limit=1.0,
                 logger=self._logger,
             )

dataeval 0.86.0__py3-none-any.whl → 0.86.1__py3-none-any.whl

dataeval 0.86.0__py3-none-any.whl → 0.86.1__py3-none-any.whl