dataeval 0.86.0__py3-none-any.whl → 0.86.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dataeval/__init__.py +1 -1
- dataeval/_log.py +1 -1
- dataeval/config.py +21 -4
- dataeval/data/_embeddings.py +2 -2
- dataeval/data/_images.py +2 -3
- dataeval/data/_metadata.py +48 -37
- dataeval/data/_selection.py +1 -2
- dataeval/data/_split.py +2 -3
- dataeval/data/_targets.py +17 -13
- dataeval/data/selections/_classfilter.py +2 -5
- dataeval/data/selections/_prioritize.py +6 -9
- dataeval/data/selections/_shuffle.py +3 -1
- dataeval/detectors/drift/_base.py +4 -5
- dataeval/detectors/drift/_mmd.py +3 -6
- dataeval/detectors/drift/_nml/_base.py +4 -2
- dataeval/detectors/drift/_nml/_chunk.py +11 -19
- dataeval/detectors/drift/_nml/_domainclassifier.py +8 -19
- dataeval/detectors/drift/_nml/_result.py +8 -9
- dataeval/detectors/drift/_nml/_thresholds.py +66 -77
- dataeval/detectors/linters/outliers.py +7 -7
- dataeval/metrics/bias/_parity.py +10 -13
- dataeval/metrics/estimators/_divergence.py +2 -4
- dataeval/metrics/stats/_base.py +103 -42
- dataeval/metrics/stats/_boxratiostats.py +21 -19
- dataeval/metrics/stats/_dimensionstats.py +14 -10
- dataeval/metrics/stats/_hashstats.py +1 -1
- dataeval/metrics/stats/_pixelstats.py +6 -6
- dataeval/metrics/stats/_visualstats.py +3 -3
- dataeval/outputs/_base.py +22 -7
- dataeval/outputs/_bias.py +26 -28
- dataeval/outputs/_drift.py +1 -9
- dataeval/outputs/_linters.py +11 -11
- dataeval/outputs/_stats.py +82 -23
- dataeval/outputs/_workflows.py +2 -2
- dataeval/utils/_array.py +6 -9
- dataeval/utils/_bin.py +1 -2
- dataeval/utils/_clusterer.py +7 -4
- dataeval/utils/_fast_mst.py +27 -13
- dataeval/utils/_image.py +65 -11
- dataeval/utils/_mst.py +1 -3
- dataeval/utils/_plot.py +15 -10
- dataeval/utils/data/_dataset.py +32 -20
- dataeval/utils/data/metadata.py +104 -82
- dataeval/utils/datasets/__init__.py +2 -0
- dataeval/utils/datasets/_antiuav.py +189 -0
- dataeval/utils/datasets/_base.py +11 -8
- dataeval/utils/datasets/_cifar10.py +104 -45
- dataeval/utils/datasets/_fileio.py +21 -47
- dataeval/utils/datasets/_milco.py +19 -11
- dataeval/utils/datasets/_mixin.py +2 -4
- dataeval/utils/datasets/_mnist.py +3 -4
- dataeval/utils/datasets/_ships.py +14 -7
- dataeval/utils/datasets/_voc.py +229 -42
- dataeval/utils/torch/models.py +5 -10
- dataeval/utils/torch/trainer.py +3 -3
- dataeval/workflows/sufficiency.py +2 -2
- {dataeval-0.86.0.dist-info → dataeval-0.86.1.dist-info}/METADATA +1 -1
- dataeval-0.86.1.dist-info/RECORD +114 -0
- dataeval/detectors/ood/vae.py +0 -74
- dataeval-0.86.0.dist-info/RECORD +0 -114
- {dataeval-0.86.0.dist-info → dataeval-0.86.1.dist-info}/LICENSE.txt +0 -0
- {dataeval-0.86.0.dist-info → dataeval-0.86.1.dist-info}/WHEEL +0 -0
dataeval/detectors/drift/_nml/_result.py
CHANGED

@@ -42,14 +42,13 @@ class AbstractResult(GenericOutput[pd.DataFrame]):
         """Export results to pandas dataframe."""
         if multilevel:
             return self._data
-        else:
-            column_names = [
-                "_".join(col).replace("chunk_chunk_chunk", "chunk").replace("chunk_chunk", "chunk")
-                for col in self._data.columns.values
-            ]
-            single_level_data = self._data.copy(deep=True)
-            single_level_data.columns = column_names
-            return single_level_data
+        column_names = [
+            "_".join(col).replace("chunk_chunk_chunk", "chunk").replace("chunk_chunk", "chunk")
+            for col in self._data.columns.values
+        ]
+        single_level_data = self._data.copy(deep=True)
+        single_level_data.columns = column_names
+        return single_level_data
 
     def filter(self, period: str = "all", metrics: str | Sequence[str] | None = None) -> Self:
         """Returns filtered result metric data."""
@@ -67,7 +66,7 @@ class Abstract1DResult(AbstractResult, ABC):
     def __init__(self, results_data: pd.DataFrame) -> None:
         super().__init__(results_data)
 
-    def _filter(self, period: str, metrics=None) -> Self:
+    def _filter(self, period: str, metrics: Sequence[str] | None = None) -> Self:
         data = self._data
         if period != "all":
             data = self._data.loc[self._data.loc[:, ("chunk", "period")] == period, :]  # type: ignore | dataframe loc
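For reference, the flattening that `to_df(multilevel=False)` performs joins each MultiIndex column tuple with underscores and collapses the repeated "chunk" prefixes. A standalone sketch of the same transform on a hypothetical result frame:

```python
import pandas as pd

# Hypothetical multilevel frame mimicking the _nml result layout.
df = pd.DataFrame(
    [[0, "reference", 0.57]],
    columns=pd.MultiIndex.from_tuples(
        [("chunk", "chunk", "index"), ("chunk", "chunk", "period"), ("domain_classifier_auroc", "value", "")]
    ),
)

# Same transform as the new to_df(multilevel=False) body.
flat = df.copy(deep=True)
flat.columns = [
    "_".join(col).replace("chunk_chunk_chunk", "chunk").replace("chunk_chunk", "chunk")
    for col in df.columns.values
]
print(list(flat.columns))  # ['chunk_index', 'chunk_period', 'domain_classifier_auroc_value_']
```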
dataeval/detectors/drift/_nml/_thresholds.py
CHANGED

@@ -29,10 +29,10 @@ class Threshold(ABC):
     """Class registry lookup to get threshold subclass from threshold_type string"""
 
     def __str__(self) -> str:
-        return self.
+        return f"{self.__class__.__name__}({str(vars(self))})"
 
     def __repr__(self) -> str:
-        return
+        return str(self)
 
     def __eq__(self, other: object) -> bool:
         return isinstance(other, self.__class__) and other.__dict__ == self.__dict__
@@ -41,7 +41,7 @@ class Threshold(ABC):
         Threshold._registry[threshold_type] = cls
 
     @abstractmethod
-    def
+    def _thresholds(self, data: np.ndarray) -> tuple[float | None, float | None]:
         """Returns lower and upper threshold values when given one or more np.ndarray instances.
 
         Parameters:
@@ -69,6 +69,61 @@ class Threshold(ABC):
 
         return threshold_cls(**obj)
 
+    def calculate(
+        self,
+        data: np.ndarray,
+        lower_limit: float | None = None,
+        upper_limit: float | None = None,
+        override_using_none: bool = False,
+        logger: logging.Logger | None = None,
+    ) -> tuple[float | None, float | None]:
+        """
+        Calculate lower and upper threshold values with respect to the provided Threshold and value limits.
+
+        Parameters
+        ----------
+        data : np.ndarray
+            The data used by the Threshold instance to calculate the lower and upper threshold values.
+            This will often be the values of a drift detection method or performance metric on chunks of reference
+            data.
+        lower_limit : float or None, default None
+            An optional value that serves as a limit for the lower threshold value. Any calculated lower threshold
+            values that end up below this limit will be replaced by this limit value.
+            The limit is often a theoretical constraint enforced by a specific drift detection method or performance
+            metric.
+        upper_limit : float or None, default None
+            An optional value that serves as a limit for the upper threshold value. Any calculated upper threshold
+            values that end up above this limit will be replaced by this limit value.
+            The limit is often a theoretical constraint enforced by a specific drift detection method or performance
+            metric.
+        override_using_none: bool, default False
+            When set to True use None to override threshold values that exceed value limits.
+            This will prevent them from being rendered on plots.
+        logger: Optional[logging.Logger], default=None
+            An optional Logger instance. When provided a warning will be logged when a calculated threshold value
+            gets overridden by a threshold value limit.
+        """
+
+        lower_value, upper_value = self._thresholds(data)
+
+        if lower_limit is not None and lower_value is not None and lower_value <= lower_limit:
+            override_value = None if override_using_none else lower_limit
+            if logger:
+                logger.warning(
+                    f"lower threshold value {lower_value} overridden by lower threshold value limit {override_value}"
+                )
+            lower_value = override_value
+
+        if upper_limit is not None and upper_value is not None and upper_value >= upper_limit:
+            override_value = None if override_using_none else upper_limit
+            if logger:
+                logger.warning(
+                    f"upper threshold value {upper_value} overridden by upper threshold value limit {override_value}"
+                )
+            upper_value = override_value
+
+        return lower_value, upper_value
+
 
 class ConstantThreshold(Threshold, threshold_type="constant"):
     """A `Thresholder` implementation that returns a constant lower and or upper threshold value.
@@ -91,7 +146,7 @@ class ConstantThreshold(Threshold, threshold_type="constant"):
     None 0.1
     """
 
-    def __init__(self, lower: float | int | None = None, upper: float | int | None = None):
+    def __init__(self, lower: float | int | None = None, upper: float | int | None = None) -> None:
         """Creates a new ConstantThreshold instance.
 
         Args:
@@ -109,11 +164,11 @@ class ConstantThreshold(Threshold, threshold_type="constant"):
         self.lower = lower
         self.upper = upper
 
-    def
+    def _thresholds(self, data: np.ndarray) -> tuple[float | None, float | None]:
         return self.lower, self.upper
 
     @staticmethod
-    def _validate_inputs(lower: float | int | None = None, upper: float | int | None = None):
+    def _validate_inputs(lower: float | int | None = None, upper: float | int | None = None) -> None:
         if lower is not None and not isinstance(lower, (float, int)) or isinstance(lower, bool):
             raise ValueError(f"expected type of 'lower' to be 'float', 'int' or None but got '{type(lower).__name__}'")
 
@@ -149,7 +204,7 @@ class StandardDeviationThreshold(Threshold, threshold_type="standard_deviation")
         std_lower_multiplier: float | int | None = 3,
         std_upper_multiplier: float | int | None = 3,
         offset_from: Callable[[np.ndarray], Any] = np.nanmean,
-    ):
+    ) -> None:
         """Creates a new StandardDeviationThreshold instance.
 
         Args:
@@ -173,7 +228,7 @@ class StandardDeviationThreshold(Threshold, threshold_type="standard_deviation")
         self.std_upper_multiplier = std_upper_multiplier
         self.offset_from = offset_from
 
-    def
+    def _thresholds(self, data: np.ndarray) -> tuple[float | None, float | None]:
         aggregate = self.offset_from(data)
         std = np.nanstd(data)
 
@@ -184,7 +239,9 @@ class StandardDeviationThreshold(Threshold, threshold_type="standard_deviation")
         return lower_threshold, upper_threshold
 
     @staticmethod
-    def _validate_inputs(
+    def _validate_inputs(
+        std_lower_multiplier: float | int | None = 3, std_upper_multiplier: float | int | None = 3
+    ) -> None:
         if (
             std_lower_multiplier is not None
             and not isinstance(std_lower_multiplier, (float, int))
@@ -210,71 +267,3 @@ class StandardDeviationThreshold(Threshold, threshold_type="standard_deviation")
 
         if std_upper_multiplier and std_upper_multiplier < 0:
             raise ValueError(f"'std_upper_multiplier' should be greater than 0 but got value {std_upper_multiplier}")
-
-
-def calculate_threshold_values(
-    threshold: Threshold,
-    data: np.ndarray,
-    lower_threshold_value_limit: float | None = None,
-    upper_threshold_value_limit: float | None = None,
-    override_using_none: bool = False,
-    logger: logging.Logger | None = None,
-    metric_name: str | None = None,
-) -> tuple[float | None, float | None]:
-    """Calculate lower and upper threshold values with respect to the provided Threshold and value limits.
-
-    Parameters:
-        threshold: Threshold
-            The Threshold instance that determines how the lower and upper threshold values will be calculated.
-        data: np.ndarray
-            The data used by the Threshold instance to calculate the lower and upper threshold values.
-            This will often be the values of a drift detection method or performance metric on chunks of reference data.
-        lower_threshold_value_limit: Optional[float], default=None
-            An optional value that serves as a limit for the lower threshold value. Any calculated lower threshold
-            values that end up below this limit will be replaced by this limit value.
-            The limit is often a theoretical constraint enforced by a specific drift detection method or performance
-            metric.
-        upper_threshold_value_limit: Optional[float], default=None
-            An optional value that serves as a limit for the lower threshold value. Any calculated lower threshold
-            values that end up below this limit will be replaced by this limit value.
-            The limit is often a theoretical constraint enforced by a specific drift detection method or performance
-            metric.
-        override_using_none: bool, default=False
-            When set to True use None to override threshold values that exceed value limits.
-            This will prevent them from being rendered on plots.
-        logger: Optional[logging.Logger], default=None
-            An optional Logger instance. When provided a warning will be logged when a calculated threshold value
-            gets overridden by a threshold value limit.
-        metric_name: Optional[str], default=None
-            When provided the metric name will be included within any log messages for additional clarity.
-    """
-
-    lower_threshold_value, upper_threshold_value = threshold.thresholds(data)
-
-    if (
-        lower_threshold_value_limit is not None
-        and lower_threshold_value is not None
-        and lower_threshold_value <= lower_threshold_value_limit
-    ):
-        override_value = None if override_using_none else lower_threshold_value_limit
-        if logger:
-            logger.warning(
-                f"{metric_name + ' ' if metric_name else ''}lower threshold value {lower_threshold_value} "
-                f"overridden by lower threshold value limit {override_value}"
-            )
-        lower_threshold_value = override_value
-
-    if (
-        upper_threshold_value_limit is not None
-        and upper_threshold_value is not None
-        and upper_threshold_value >= upper_threshold_value_limit
-    ):
-        override_value = None if override_using_none else upper_threshold_value_limit
-        if logger:
-            logger.warning(
-                f"{metric_name + ' ' if metric_name else ''}upper threshold value {upper_threshold_value} "
-                f"overridden by upper threshold value limit {override_value}"
-            )
-        upper_threshold_value = override_value
-
-    return lower_threshold_value, upper_threshold_value
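Since the module-level `calculate_threshold_values` helper was folded into `Threshold.calculate`, callers now pass value limits directly to the threshold instance. A usage sketch against the classes above (the `_nml` package is private, so the import path is an assumption about internal use, not public API):

```python
import logging
import numpy as np

# Private vendored module per this diff; not part of the public dataeval API.
from dataeval.detectors.drift._nml._thresholds import ConstantThreshold, StandardDeviationThreshold

metric_values = np.array([0.52, 0.55, 0.49, 0.51, 0.53])  # hypothetical per-chunk metric values

# Constant bounds are returned unchanged.
print(ConstantThreshold(lower=0.4, upper=0.6).calculate(metric_values))  # (0.4, 0.6)

# Mean +/- 3 standard deviations, clamped to the metric's theoretical [0, 1] range;
# the logger records a warning whenever a limit overrides a computed bound.
sdt = StandardDeviationThreshold()
lower, upper = sdt.calculate(metric_values, lower_limit=0.0, upper_limit=1.0, logger=logging.getLogger(__name__))
```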
dataeval/detectors/linters/outliers.py
CHANGED

@@ -13,31 +13,31 @@ from dataeval.metrics.stats._imagestats import imagestats
 from dataeval.outputs import DimensionStatsOutput, ImageStatsOutput, OutliersOutput, PixelStatsOutput, VisualStatsOutput
 from dataeval.outputs._base import set_metadata
 from dataeval.outputs._linters import IndexIssueMap, OutlierStatsOutput
-from dataeval.outputs._stats import
+from dataeval.outputs._stats import BASE_ATTRS
 from dataeval.typing import ArrayLike, Dataset
 
 
 def _get_outlier_mask(
     values: NDArray, method: Literal["zscore", "modzscore", "iqr"], threshold: float | None
 ) -> NDArray:
+    values = values.astype(np.float64)
     if method == "zscore":
         threshold = threshold if threshold else 3.0
         std = np.std(values)
         abs_diff = np.abs(values - np.mean(values))
         return std != 0 and (abs_diff / std) > threshold
-    elif method == "modzscore":
+    if method == "modzscore":
         threshold = threshold if threshold else 3.5
         abs_diff = np.abs(values - np.median(values))
         med_abs_diff = np.median(abs_diff) if np.median(abs_diff) != 0 else np.mean(abs_diff)
         mod_z_score = 0.6745 * abs_diff / med_abs_diff
         return mod_z_score > threshold
-    elif method == "iqr":
+    if method == "iqr":
         threshold = threshold if threshold else 1.5
         qrt = np.percentile(values, q=(25, 75), method="midpoint")
         iqr = (qrt[1] - qrt[0]) * threshold
         return (values < (qrt[0] - iqr)) | (values > (qrt[1] + iqr))
-    else:
-        raise ValueError("Outlier method must be 'zscore' 'modzscore' or 'iqr'.")
+    raise ValueError("Outlier method must be 'zscore' 'modzscore' or 'iqr'.")
 
 
 class Outliers:
@@ -103,7 +103,7 @@ class Outliers:
         use_visual: bool = True,
         outlier_method: Literal["zscore", "modzscore", "iqr"] = "modzscore",
         outlier_threshold: float | None = None,
-    ):
+    ) -> None:
         self.stats: ImageStatsOutput
         self.use_dimension = use_dimension
         self.use_pixel = use_pixel
@@ -114,7 +114,7 @@ class Outliers:
     def _get_outliers(self, stats: dict) -> dict[int, dict[str, float]]:
         flagged_images: dict[int, dict[str, float]] = {}
         for stat, values in stats.items():
-            if stat in
+            if stat in BASE_ATTRS:
                 continue
             if values.ndim == 1:
                 mask = _get_outlier_mask(values.astype(np.float64), self.outlier_method, self.outlier_threshold)
dataeval/metrics/bias/_parity.py
CHANGED
@@ -3,6 +3,7 @@ from __future__ import annotations
 __all__ = []
 
 import warnings
+from collections import defaultdict
 from typing import Any
 
 import numpy as np
@@ -246,7 +247,7 @@ def parity(metadata: Metadata) -> ParityOutput:
 
     chi_scores = np.zeros(metadata.discrete_data.shape[1])
     p_values = np.zeros_like(chi_scores)
-    insufficient_data =
+    insufficient_data: defaultdict[str, defaultdict[int, dict[str, int]]] = defaultdict(lambda: defaultdict(dict))
     for i, col_data in enumerate(metadata.discrete_data.T):
         # Builds a contingency matrix where entry at index (r,c) represents
         # the frequency of current_factor_name achieving value unique_factor_values[r]
@@ -261,26 +262,22 @@ def parity(metadata: Metadata) -> ParityOutput:
         for int_factor, int_class in zip(counts[0], counts[1]):
             if contingency_matrix[int_factor, int_class] > 0:
                 factor_category = unique_factor_values[int_factor].item()
-                if current_factor_name not in insufficient_data:
-                    insufficient_data[current_factor_name] = {}
-                if factor_category not in insufficient_data[current_factor_name]:
-                    insufficient_data[current_factor_name][factor_category] = {}
                 class_name = metadata.class_names[int_class]
                 class_count = contingency_matrix[int_factor, int_class].item()
                 insufficient_data[current_factor_name][factor_category][class_name] = class_count
 
         # This deletes rows containing only zeros,
         # because scipy.stats.chi2_contingency fails when there are rows containing only zeros.
-
-        rowmask = np.nonzero(rowsums)[0]
-        contingency_matrix = contingency_matrix[rowmask]
+        contingency_matrix = contingency_matrix[np.any(contingency_matrix, axis=1)]
 
-
-        chi2, p, _, _ = chi2_contingency(contingency_matrix)
-        chi_scores[i] = chi2
-        p_values[i] = p
+        chi_scores[i], p_values[i] = chi2_contingency(contingency_matrix)[:2]
 
     if insufficient_data:
         warnings.warn("Some factors did not meet the recommended 5 occurrences for each value-label combination.")
 
-    return ParityOutput(
+    return ParityOutput(
+        score=chi_scores,
+        p_value=p_values,
+        factor_names=metadata.discrete_factor_names,
+        insufficient_data={k: dict(v) for k, v in insufficient_data.items()},
+    )
dataeval/metrics/estimators/_divergence.py
CHANGED

@@ -38,8 +38,7 @@ def divergence_mst(data: NDArray[np.float64], labels: NDArray[np.int_]) -> int:
     """
     mst = minimum_spanning_tree(data).toarray()
     edgelist = np.transpose(np.nonzero(mst))
-    errors = np.sum(labels[edgelist[:, 0]] != labels[edgelist[:, 1]])
-    return errors
+    return np.sum(labels[edgelist[:, 0]] != labels[edgelist[:, 1]])
 
 
 def divergence_fnn(data: NDArray[np.float64], labels: NDArray[np.int_]) -> int:
@@ -59,8 +58,7 @@ def divergence_fnn(data: NDArray[np.float64], labels: NDArray[np.int_]) -> int:
     Number of label errors when finding nearest neighbors
     """
     nn_indices = compute_neighbors(data, data)
-    errors = np.sum(np.abs(labels[nn_indices] - labels))
-    return errors
+    return np.sum(np.abs(labels[nn_indices] - labels))
 
 
 _DIVERGENCE_FN_MAP = {"FNN": divergence_fnn, "MST": divergence_mst}
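Both estimators now return their error counts inline. For the MST variant, the count is the number of spanning-tree edges whose endpoints carry different labels; a self-contained sketch of that step on toy data (building the distance matrix with `cdist` is an assumption about the expected input):

```python
import numpy as np
from scipy.sparse.csgraph import minimum_spanning_tree
from scipy.spatial.distance import cdist

# Two tight clusters with different labels.
points = np.array([[0.0, 0.0], [0.1, 0.0], [5.0, 5.0], [5.1, 5.0]])
labels = np.array([0, 0, 1, 1])

# Count MST edges that cross label boundaries, as in divergence_mst above.
mst = minimum_spanning_tree(cdist(points, points)).toarray()
edgelist = np.transpose(np.nonzero(mst))
errors = np.sum(labels[edgelist[:, 0]] != labels[edgelist[:, 1]])
print(errors)  # 1 -- only the single edge bridging the two clusters crosses labels
```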
dataeval/metrics/stats/_base.py
CHANGED
@@ -10,23 +10,86 @@ from copy import deepcopy
 from dataclasses import dataclass
 from functools import partial
 from multiprocessing import Pool
-from typing import Any, Callable, Generic, Iterable, Sequence, TypeVar
+from typing import Any, Callable, Generic, Iterable, Iterator, Sequence, TypeVar
 
 import numpy as np
 import tqdm
 from numpy.typing import NDArray
 
 from dataeval.config import get_max_processes
-from dataeval.outputs._stats import BaseStatsOutput, SourceIndex
+from dataeval.outputs._stats import BASE_ATTRS, BaseStatsOutput, SourceIndex
 from dataeval.typing import Array, ArrayLike, Dataset, ObjectDetectionTarget
 from dataeval.utils._array import as_numpy, to_numpy
-from dataeval.utils._image import normalize_image_shape, rescale
+from dataeval.utils._image import clip_and_pad, clip_box, is_valid_box, normalize_image_shape, rescale
 
 DTYPE_REGEX = re.compile(r"NDArray\[np\.(.*?)\]")
 
-BoundingBox = tuple[float, float, float, float]
 TStatsOutput = TypeVar("TStatsOutput", bound=BaseStatsOutput, covariant=True)
 
+_S = TypeVar("_S")
+_T = TypeVar("_T")
+
+
+@dataclass
+class BoundingBox:
+    x0: float
+    y0: float
+    x1: float
+    y1: float
+
+    def __post_init__(self) -> None:
+        # Test for invalid coordinates
+        x_swap = self.x0 > self.x1
+        y_swap = self.y0 > self.y1
+        if x_swap or y_swap:
+            warnings.warn(f"Invalid bounding box coordinates: {self} - swapping invalid coordinates.")
+        if x_swap:
+            self.x0, self.x1 = self.x1, self.x0
+        if y_swap:
+            self.y0, self.y1 = self.y1, self.y0
+
+    @property
+    def width(self) -> float:
+        return self.x1 - self.x0
+
+    @property
+    def height(self) -> float:
+        return self.y1 - self.y0
+
+    def to_int(self) -> tuple[int, int, int, int]:
+        """
+        Returns the bounding box as a tuple of integers.
+        """
+        x0_int = math.floor(self.x0)
+        y0_int = math.floor(self.y0)
+        x1_int = math.ceil(self.x1)
+        y1_int = math.ceil(self.y1)
+        return x0_int, y0_int, x1_int, y1_int
+
+
+class PoolWrapper:
+    """
+    Wraps `multiprocessing.Pool` to allow for easy switching between
+    multiprocessing and single-threaded execution.
+
+    This helps with debugging and profiling, as well as usage with Jupyter notebooks
+    in VS Code, which does not support subprocess debugging.
+    """
+
+    def __init__(self, processes: int | None) -> None:
+        self.pool = Pool(processes) if processes is not None and processes > 1 else None
+
+    def imap(self, func: Callable[[_S], _T], iterable: Iterable[_S]) -> Iterator[_T]:
+        return map(func, iterable) if self.pool is None else self.pool.imap(func, iterable)
+
+    def __enter__(self, *args: Any, **kwargs: Any) -> PoolWrapper:
+        return self
+
+    def __exit__(self, *args: Any) -> None:
+        if self.pool is not None:
+            self.pool.close()
+            self.pool.join()
+
 
 class StatsProcessor(Generic[TStatsOutput]):
     output_class: type[TStatsOutput]
|
|
34
97
|
image_function_map: dict[str, Callable[[StatsProcessor[TStatsOutput]], Any]] = {}
|
35
98
|
channel_function_map: dict[str, Callable[[StatsProcessor[TStatsOutput]], Any]] = {}
|
36
99
|
|
37
|
-
def __init__(self, image: NDArray[Any], box: BoundingBox | None, per_channel: bool) -> None:
|
100
|
+
def __init__(self, image: NDArray[Any], box: BoundingBox | Iterable[Any] | None, per_channel: bool) -> None:
|
38
101
|
self.raw = image
|
39
102
|
self.width: int = image.shape[-1]
|
40
103
|
self.height: int = image.shape[-2]
|
41
|
-
box =
|
42
|
-
|
43
|
-
x0, y0 = (min(j, max(0, math.floor(box[i]))) for i, j in zip((0, 1), (self.width - 1, self.height - 1)))
|
44
|
-
x1, y1 = (min(j, max(1, math.ceil(box[i]))) for i, j in zip((2, 3), (self.width, self.height)))
|
45
|
-
self.box: NDArray[np.int64] = np.array([x0, y0, x1, y1], dtype=np.int64)
|
104
|
+
box = (0, 0, self.width, self.height) if box is None else box
|
105
|
+
self.box = box if isinstance(box, BoundingBox) else BoundingBox(*box)
|
46
106
|
self._per_channel = per_channel
|
47
107
|
self._image = None
|
48
108
|
self._shape = None
|
49
109
|
self._scaled = None
|
50
110
|
self._cache = {}
|
51
111
|
self._fn_map = self.channel_function_map if per_channel else self.image_function_map
|
52
|
-
self.
|
53
|
-
box[0] >= 0 and box[1] >= 0 and box[2] <= image.shape[-1] and box[3] <= image.shape[-2]
|
54
|
-
)
|
112
|
+
self._is_valid_box = is_valid_box(clip_box(image, self.box.to_int()))
|
55
113
|
|
56
114
|
def get(self, fn_key: str) -> NDArray[Any]:
|
57
115
|
if fn_key in self.cache_keys:
|
58
116
|
if fn_key not in self._cache:
|
59
117
|
self._cache[fn_key] = self._fn_map[fn_key](self)
|
60
118
|
return self._cache[fn_key]
|
61
|
-
|
62
|
-
return self._fn_map[fn_key](self)
|
119
|
+
return self._fn_map[fn_key](self)
|
63
120
|
|
64
121
|
def process(self) -> dict[str, Any]:
|
65
122
|
return {k: self._fn_map[k](self) for k in self._fn_map}
|
@@ -67,11 +124,7 @@ class StatsProcessor(Generic[TStatsOutput]):
|
|
67
124
|
@property
|
68
125
|
def image(self) -> NDArray[Any]:
|
69
126
|
if self._image is None:
|
70
|
-
|
71
|
-
norm = normalize_image_shape(self.raw)
|
72
|
-
self._image = norm[:, self.box[1] : self.box[3], self.box[0] : self.box[2]]
|
73
|
-
else:
|
74
|
-
self._image = np.zeros((self.raw.shape[0], self.box[3] - self.box[1], self.box[2] - self.box[0]))
|
127
|
+
self._image = clip_and_pad(normalize_image_shape(self.raw), self.box.to_int())
|
75
128
|
return self._image
|
76
129
|
|
77
130
|
@property
|
@@ -90,9 +143,9 @@ class StatsProcessor(Generic[TStatsOutput]):
|
|
90
143
|
|
91
144
|
@classmethod
|
92
145
|
def convert_output(
|
93
|
-
cls, source: dict[str, Any], source_index: list[SourceIndex],
|
146
|
+
cls, source: dict[str, Any], source_index: list[SourceIndex], object_count: list[int], image_count: int
|
94
147
|
) -> TStatsOutput:
|
95
|
-
output = {}
|
148
|
+
output: dict[str, Any] = {}
|
96
149
|
attrs = dict(ChainMap(*(getattr(c, "__annotations__", {}) for c in cls.output_class.__mro__)))
|
97
150
|
for key in (key for key in source if key in attrs):
|
98
151
|
stat_type: str = attrs[key]
|
@@ -101,14 +154,17 @@ class StatsProcessor(Generic[TStatsOutput]):
|
|
101
154
|
output[key] = np.asarray(source[key], dtype=np.dtype(dtype_match.group(1)))
|
102
155
|
else:
|
103
156
|
output[key] = source[key]
|
104
|
-
|
157
|
+
base_attrs: dict[str, Any] = dict(
|
158
|
+
zip(BASE_ATTRS, (source_index, np.asarray(object_count, dtype=np.uint16), image_count))
|
159
|
+
)
|
160
|
+
return cls.output_class(**output, **base_attrs)
|
105
161
|
|
106
162
|
|
107
163
|
@dataclass
|
108
164
|
class StatsProcessorOutput:
|
109
165
|
results: list[dict[str, Any]]
|
110
166
|
source_indices: list[SourceIndex]
|
111
|
-
|
167
|
+
object_counts: list[int]
|
112
168
|
warnings_list: list[str]
|
113
169
|
|
114
170
|
|
@@ -119,18 +175,18 @@ def process_stats(
|
|
119
175
|
per_channel: bool,
|
120
176
|
stats_processor_cls: Iterable[type[StatsProcessor[TStatsOutput]]],
|
121
177
|
) -> StatsProcessorOutput:
|
122
|
-
|
178
|
+
np_image = to_numpy(image)
|
123
179
|
results_list: list[dict[str, Any]] = []
|
124
180
|
source_indices: list[SourceIndex] = []
|
125
181
|
box_counts: list[int] = []
|
126
182
|
warnings_list: list[str] = []
|
127
183
|
for i_b, box in [(None, None)] if boxes is None else enumerate(boxes):
|
128
|
-
processor_list = [p(
|
129
|
-
if any(not p.
|
130
|
-
warnings_list.append(f"Bounding box [{i}][{i_b}]: {box}
|
184
|
+
processor_list = [p(np_image, box, per_channel) for p in stats_processor_cls]
|
185
|
+
if any(not p._is_valid_box for p in processor_list) and i_b is not None and box is not None:
|
186
|
+
warnings_list.append(f"Bounding box [{i}][{i_b}]: {box} for image shape {np_image.shape} is invalid.")
|
131
187
|
results_list.append({k: v for p in processor_list for k, v in p.process().items()})
|
132
188
|
if per_channel:
|
133
|
-
source_indices.extend([SourceIndex(i, i_b, c) for c in range(
|
189
|
+
source_indices.extend([SourceIndex(i, i_b, c) for c in range(np_image.shape[-3])])
|
134
190
|
else:
|
135
191
|
source_indices.append(SourceIndex(i, i_b, None))
|
136
192
|
box_counts.append(0 if boxes is None else len(boxes))
|
@@ -145,13 +201,18 @@ def process_stats_unpack(
|
|
145
201
|
return process_stats(*args, per_channel=per_channel, stats_processor_cls=stats_processor_cls)
|
146
202
|
|
147
203
|
|
148
|
-
def _enumerate(
|
204
|
+
def _enumerate(
|
205
|
+
dataset: Dataset[ArrayLike] | Dataset[tuple[ArrayLike, Any, Any]], per_box: bool
|
206
|
+
) -> Iterator[tuple[int, ArrayLike, Any]]:
|
149
207
|
for i in range(len(dataset)):
|
150
208
|
d = dataset[i]
|
151
209
|
image = d[0] if isinstance(d, tuple) else d
|
152
210
|
if per_box and isinstance(d, tuple) and isinstance(d[1], ObjectDetectionTarget):
|
153
|
-
|
154
|
-
|
211
|
+
try:
|
212
|
+
boxes = d[1].boxes if isinstance(d[1].boxes, Array) else as_numpy(d[1].boxes)
|
213
|
+
target = [BoundingBox(*(float(box[i]) for i in range(4))) for box in boxes]
|
214
|
+
except (ValueError, IndexError):
|
215
|
+
raise ValueError(f"Invalid bounding box format for image {i}: {d[1].boxes}")
|
155
216
|
else:
|
156
217
|
target = None
|
157
218
|
|
@@ -199,12 +260,13 @@ def run_stats(
|
|
199
260
|
"""
|
200
261
|
results_list: list[dict[str, NDArray[np.float64]]] = []
|
201
262
|
source_index: list[SourceIndex] = []
|
202
|
-
|
263
|
+
object_count: list[int] = []
|
264
|
+
image_count: int = len(dataset)
|
203
265
|
|
204
266
|
warning_list = []
|
205
267
|
stats_processor_cls = stats_processor_cls if isinstance(stats_processor_cls, Iterable) else [stats_processor_cls]
|
206
268
|
|
207
|
-
with
|
269
|
+
with PoolWrapper(processes=get_max_processes()) as p:
|
208
270
|
for r in tqdm.tqdm(
|
209
271
|
p.imap(
|
210
272
|
partial(
|
@@ -214,14 +276,12 @@ def run_stats(
|
|
214
276
|
),
|
215
277
|
_enumerate(dataset, per_box),
|
216
278
|
),
|
217
|
-
total=
|
279
|
+
total=image_count,
|
218
280
|
):
|
219
281
|
results_list.extend(r.results)
|
220
282
|
source_index.extend(r.source_indices)
|
221
|
-
|
283
|
+
object_count.extend(r.object_counts)
|
222
284
|
warning_list.extend(r.warnings_list)
|
223
|
-
p.close()
|
224
|
-
p.join()
|
225
285
|
|
226
286
|
# warnings are not emitted while in multiprocessing pools so we emit after gathering all warnings
|
227
287
|
for w in warning_list:
|
@@ -235,8 +295,7 @@ def run_stats(
|
|
235
295
|
else:
|
236
296
|
output.setdefault(stat, []).append(result.tolist() if isinstance(result, np.ndarray) else result)
|
237
297
|
|
238
|
-
|
239
|
-
return outputs
|
298
|
+
return [s.convert_output(output, source_index, object_count, image_count) for s in stats_processor_cls]
|
240
299
|
|
241
300
|
|
242
301
|
def add_stats(a: TStatsOutput, b: TStatsOutput) -> TStatsOutput:
|
@@ -246,10 +305,12 @@ def add_stats(a: TStatsOutput, b: TStatsOutput) -> TStatsOutput:
|
|
246
305
|
sum_dict = deepcopy(a.data())
|
247
306
|
|
248
307
|
for k in sum_dict:
|
249
|
-
if isinstance(sum_dict[k],
|
308
|
+
if isinstance(sum_dict[k], Sequence):
|
250
309
|
sum_dict[k].extend(b.data()[k])
|
251
|
-
|
310
|
+
elif isinstance(sum_dict[k], Array):
|
252
311
|
sum_dict[k] = np.concatenate((sum_dict[k], b.data()[k]))
|
312
|
+
else:
|
313
|
+
sum_dict[k] += b.data()[k]
|
253
314
|
|
254
315
|
return type(a)(**sum_dict)
|
255
316
|
|
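`PoolWrapper` lets `run_stats` fall back to plain `map` whenever `get_max_processes()` resolves to None or 1, which keeps stats collection debuggable in notebooks and other environments without subprocess debugging. A sketch of the switching behavior, assuming the class as defined in this diff:

```python
from dataeval.metrics.stats._base import PoolWrapper  # private module, per this diff

def square(x: int) -> int:
    return x * x

if __name__ == "__main__":
    # processes=None -> no Pool is created and imap is just the builtin map.
    with PoolWrapper(processes=None) as p:
        print(list(p.imap(square, range(5))))  # [0, 1, 4, 9, 16]

    # processes=4 -> a real multiprocessing.Pool, closed and joined on exit.
    with PoolWrapper(processes=4) as p:
        print(list(p.imap(square, range(5))))  # same result, computed in worker processes
```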