PyPI - dataeval - Versions diffs - 0.85.0__py3-none-any.whl → 0.86.1__py3-none-any.whl - Mend

dataeval 0.85.0py3-none-any.whl → 0.86.1py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (66) hide show

dataeval/__init__.py +1 -1
dataeval/_log.py +1 -1
dataeval/config.py +21 -4
dataeval/data/_embeddings.py +2 -2
dataeval/data/_images.py +2 -3
dataeval/data/_metadata.py +65 -42
dataeval/data/_selection.py +2 -3
dataeval/data/_split.py +2 -3
dataeval/data/_targets.py +17 -13
dataeval/data/selections/_classfilter.py +6 -8
dataeval/data/selections/_prioritize.py +6 -9
dataeval/data/selections/_shuffle.py +3 -1
dataeval/detectors/drift/__init__.py +4 -1
dataeval/detectors/drift/_base.py +4 -5
dataeval/detectors/drift/_mmd.py +3 -6
dataeval/detectors/drift/_mvdc.py +92 -0
dataeval/detectors/drift/_nml/__init__.py +6 -0
dataeval/detectors/drift/_nml/_base.py +70 -0
dataeval/detectors/drift/_nml/_chunk.py +396 -0
dataeval/detectors/drift/_nml/_domainclassifier.py +181 -0
dataeval/detectors/drift/_nml/_result.py +97 -0
dataeval/detectors/drift/_nml/_thresholds.py +269 -0
dataeval/detectors/linters/outliers.py +7 -7
dataeval/metrics/bias/_parity.py +10 -13
dataeval/metrics/estimators/_divergence.py +2 -4
dataeval/metrics/stats/_base.py +103 -42
dataeval/metrics/stats/_boxratiostats.py +21 -19
dataeval/metrics/stats/_dimensionstats.py +14 -10
dataeval/metrics/stats/_hashstats.py +1 -1
dataeval/metrics/stats/_pixelstats.py +6 -6
dataeval/metrics/stats/_visualstats.py +3 -3
dataeval/outputs/__init__.py +2 -1
dataeval/outputs/_base.py +22 -7
dataeval/outputs/_bias.py +27 -31
dataeval/outputs/_drift.py +60 -0
dataeval/outputs/_linters.py +12 -17
dataeval/outputs/_stats.py +83 -29
dataeval/outputs/_workflows.py +2 -2
dataeval/utils/_array.py +6 -9
dataeval/utils/_bin.py +1 -2
dataeval/utils/_clusterer.py +7 -4
dataeval/utils/_fast_mst.py +27 -13
dataeval/utils/_image.py +65 -11
dataeval/utils/_mst.py +1 -3
dataeval/utils/_plot.py +15 -10
dataeval/utils/data/_dataset.py +32 -20
dataeval/utils/data/metadata.py +104 -82
dataeval/utils/datasets/__init__.py +2 -0
dataeval/utils/datasets/_antiuav.py +189 -0
dataeval/utils/datasets/_base.py +11 -8
dataeval/utils/datasets/_cifar10.py +104 -45
dataeval/utils/datasets/_fileio.py +21 -47
dataeval/utils/datasets/_milco.py +19 -11
dataeval/utils/datasets/_mixin.py +2 -4
dataeval/utils/datasets/_mnist.py +3 -4
dataeval/utils/datasets/_ships.py +14 -7
dataeval/utils/datasets/_voc.py +229 -42
dataeval/utils/torch/models.py +5 -10
dataeval/utils/torch/trainer.py +3 -3
dataeval/workflows/sufficiency.py +2 -2
{dataeval-0.85.0.dist-info → dataeval-0.86.1.dist-info}/METADATA +3 -2
dataeval-0.86.1.dist-info/RECORD +114 -0
dataeval/detectors/ood/vae.py +0 -74
dataeval-0.85.0.dist-info/RECORD +0 -107
{dataeval-0.85.0.dist-info → dataeval-0.86.1.dist-info}/LICENSE.txt +0 -0
{dataeval-0.85.0.dist-info → dataeval-0.86.1.dist-info}/WHEEL +0 -0

dataeval/outputs/_drift.py CHANGED Viewed

@@ -2,11 +2,17 @@ from __future__ import annotations
 __all__ = []
+import contextlib
 from dataclasses import dataclass
 import numpy as np
+import pandas as pd
 from numpy.typing import NDArray
+with contextlib.suppress(ImportError):
+    from matplotlib.figure import Figure
+from dataeval.detectors.drift._nml._result import Metric, PerMetricResult
 from dataeval.outputs._base import Output
@@ -81,3 +87,57 @@ class DriftOutput(DriftBaseOutput):
     feature_threshold: float
     p_vals: NDArray[np.float32]
     distances: NDArray[np.float32]
+class DriftMVDCOutput(PerMetricResult):
+    """Class wrapping the results of the classifier for drift detection and providing plotting functionality."""
+    def __init__(self, results_data: pd.DataFrame) -> None:
+        """Initialize a DomainClassifierCalculator results object.
+        Parameters
+        ----------
+        results_data : pd.DataFrame
+            Results data returned by a DomainClassifierCalculator.
+        """
+        # Register the single metric tracked by this output; plot() reads its
+        # values from the "domain_classifier_auroc" column group.
+        metric = Metric(display_name="Domain Classifier", column_name="domain_classifier_auroc")
+        super().__init__(results_data, [metric])
+    def plot(self) -> Figure:
+        """
+        Render the roc_auc metric over the train/test data in relation to the threshold.
+        Returns
+        -------
+        matplotlib.figure.Figure
+            The created figure. Nothing is drawn on its axes unless more than
+            two entries of the "alert" column are truthy.
+        """
+        # Imported lazily so matplotlib is only needed when plotting.
+        import matplotlib.pyplot as plt
+        fig, ax = plt.subplots(dpi=300)
+        resdf = self.to_df()
+        # One tick per result row.
+        xticks = np.arange(resdf.shape[0])
+        # Split rows by the "period" label of their chunk.
+        trndf = resdf[resdf["chunk"]["period"] == "reference"]
+        tstdf = resdf[resdf["chunk"]["period"] == "analysis"]
+        # Get local indices for drift markers
+        driftx = np.where(resdf["domain_classifier_auroc"]["alert"].values)  # type: ignore | dataframe
+        # NOTE(review): the plot is skipped entirely when two or fewer alerts are
+        # raised, so the returned figure is blank in that case - confirm intended.
+        if np.size(driftx) > 2:
+            ax.plot(resdf.index, resdf["domain_classifier_auroc"]["upper_threshold"], "r--", label="thr_up")
+            ax.plot(resdf.index, resdf["domain_classifier_auroc"]["lower_threshold"], "r--", label="thr_low")
+            ax.plot(trndf.index, trndf["domain_classifier_auroc"]["value"], "b", label="train")
+            ax.plot(tstdf.index, tstdf["domain_classifier_auroc"]["value"], "g", label="test")
+            # Overlay a marker on every row flagged as an alert.
+            ax.plot(
+                resdf.index.values[driftx],  # type: ignore | dataframe
+                resdf["domain_classifier_auroc"]["value"].values[driftx],  # type: ignore | dataframe
+                "dm",
+                markersize=3,
+                label="drift",
+            )
+            ax.set_xticks(xticks)
+            ax.tick_params(axis="x", labelsize=6)
+            ax.tick_params(axis="y", labelsize=6)
+            ax.legend(loc="lower left", fontsize=6)
+            ax.set_title("Domain Classifier, Drift Detection", fontsize=8)
+            ax.set_ylabel("ROC AUC", fontsize=7)
+            ax.set_xlabel("Chunk Index", fontsize=7)
+            ax.set_ylim((0.0, 1.1))
+        return fig

dataeval/outputs/_linters.py CHANGED Viewed

@@ -2,15 +2,12 @@ from __future__ import annotations
 __all__ = []
-import contextlib
 from dataclasses import dataclass
 from typing import Generic, TypeVar, Union
+import pandas as pd
 from typing_extensions import TypeAlias
-with contextlib.suppress(ImportError):
-    import pandas as pd
 from dataeval.outputs._base import Output
 from dataeval.outputs._stats import DimensionStatsOutput, LabelStatsOutput, PixelStatsOutput, VisualStatsOutput
@@ -46,10 +43,12 @@ class DuplicatesOutput(Output, Generic[TIndexCollection]):
     near: list[TIndexCollection]
-def _reorganize_by_class_and_metric(result: IndexIssueMap, lstats: LabelStatsOutput):
+def _reorganize_by_class_and_metric(
+    result: IndexIssueMap, lstats: LabelStatsOutput
+) -> tuple[dict[str, list[int]], dict[str, dict[str, int]]]:
     """Flip result from grouping by image to grouping by class and metric"""
-    metrics = {}
-    class_wise = {label: {} for label in lstats.class_names}
+    metrics: dict[str, list[int]] = {}
+    class_wise: dict[str, dict[str, int]] = {label: {} for label in lstats.class_names}
     # Group metrics and calculate class-wise counts
     for img, group in result.items():
@@ -62,7 +61,7 @@ def _reorganize_by_class_and_metric(result: IndexIssueMap, lstats: LabelStatsOut
     return metrics, class_wise
-def _create_table(metrics, class_wise):
+def _create_table(metrics: dict[str, list[int]], class_wise: dict[str, dict[str, int]]) -> list[str]:
     """Create table for displaying the results"""
     max_class_length = max(len(str(label)) for label in class_wise) + 2
     max_total = max(len(metrics[group]) for group in metrics) + 2
@@ -72,7 +71,7 @@ def _create_table(metrics, class_wise):
         + [f"{group:^{max(5, len(str(group))) + 2}}" for group in sorted(metrics.keys())]
         + [f"{'Total':<{max_total}}"]
     )
-    table_rows = []
+    table_rows: list[str] = []
     for class_cat, results in class_wise.items():
         table_value = [f"{class_cat:>{max_class_length}}"]
@@ -84,15 +83,14 @@ def _create_table(metrics, class_wise):
         table_value.append(f"{total:^{max_total}}")
         table_rows.append(" | ".join(table_value))
-    table = [table_header] + table_rows
-    return table
+    return [table_header] + table_rows
-def _create_pandas_dataframe(class_wise):
+def _create_pandas_dataframe(class_wise: dict[str, dict[str, int]]) -> list[dict[str, str | int]]:
     """Create data for pandas dataframe"""
     data = []
     for label, metrics_dict in class_wise.items():
-        row = {"Class": label}
+        row: dict[str, str | int] = {"Class": label}
         total = sum(metrics_dict.values())
         row.update(metrics_dict)  # Add metric counts
         row["Total"] = total
@@ -121,8 +119,7 @@ class OutliersOutput(Output, Generic[TIndexIssueMap]):
     def __len__(self) -> int:
         if isinstance(self.issues, dict):
             return len(self.issues)
-        else:
-            return sum(len(d) for d in self.issues)
+        return sum(len(d) for d in self.issues)
     def to_table(self, labelstats: LabelStatsOutput) -> str:
         """
@@ -168,8 +165,6 @@ class OutliersOutput(Output, Generic[TIndexIssueMap]):
         -----
         This method requires `pandas <https://pandas.pydata.org/>`_ to be installed.
         """
-        import pandas as pd
         if isinstance(self.issues, dict):
             _, classwise = _reorganize_by_class_and_metric(self.issues, labelstats)
             data = _create_pandas_dataframe(classwise)

dataeval/outputs/_stats.py CHANGED Viewed

@@ -2,24 +2,27 @@ from __future__ import annotations
 __all__ = []
-import contextlib
 from dataclasses import dataclass
-from typing import Any, Iterable, NamedTuple, Optional, Union
+from typing import TYPE_CHECKING, Any, Iterable, NamedTuple, Optional, Sequence, Union
 import numpy as np
+import pandas as pd
 from numpy.typing import NDArray
 from typing_extensions import TypeAlias
-with contextlib.suppress(ImportError):
-    import pandas as pd
 from dataeval.outputs._base import Output
 from dataeval.utils._plot import channel_histogram_plot, histogram_plot
+if TYPE_CHECKING:
+    from matplotlib.figure import Figure
 OptionalRange: TypeAlias = Optional[Union[int, Iterable[int]]]
 SOURCE_INDEX = "source_index"
-BOX_COUNT = "box_count"
+OBJECT_COUNT = "object_count"
+IMAGE_COUNT = "image_count"
+BASE_ATTRS = (SOURCE_INDEX, OBJECT_COUNT, IMAGE_COUNT)
 class SourceIndex(NamedTuple):
@@ -54,17 +57,24 @@ class BaseStatsOutput(Output):
     ----------
     source_index : List[SourceIndex]
         Mapping from statistic to source image, box and channel index
-    box_count : NDArray[np.uint16]
+    object_count : NDArray[np.uint16]
+        The number of detected objects in each image
+    image_count : int
+        The number of images; must equal the length of ``object_count``
     """
     source_index: list[SourceIndex]
-    box_count: NDArray[np.uint16]
+    object_count: NDArray[np.uint16]
+    image_count: int
     def __post_init__(self) -> None:
-        length = len(self.source_index)
-        bad = {k: len(v) for k, v in self.data().items() if k not in [SOURCE_INDEX, BOX_COUNT] and len(v) != length}
-        if bad:
-            raise ValueError(f"All values must have the same length as source_index. Bad values: {str(bad)}.")
+        si_length = len(self.source_index)
+        mismatch = {k: len(v) for k, v in self.data().items() if k not in BASE_ATTRS and len(v) != si_length}
+        if mismatch:
+            raise ValueError(f"All values must have the same length as source_index. Bad values: {str(mismatch)}.")
+        oc_length = len(self.object_count)
+        if oc_length != self.image_count:
+            raise ValueError(
+                f"Total object counts per image does not match image count. {oc_length} != {self.image_count}."
+            )
     def get_channel_mask(
         self,
@@ -126,21 +136,64 @@ class BaseStatsOutput(Output):
         return max_channels, ch_mask
-    def factors(self) -> dict[str, NDArray[Any]]:
+    def factors(
+        self,
+        filter: str | Sequence[str] | None = None,  # noqa: A002
+        exclude_constant: bool = False,
+    ) -> dict[str, NDArray[Any]]:
+        """
+        Returns all 1-dimensional data as a dictionary of numpy arrays.
+        Parameters
+        ----------
+        filter : str, Sequence[str] or None, default None
+            If provided, only returns keys that match the filter.
+        exclude_constant : bool, default False
+            If True, exclude arrays that contain only a single unique value.
+        Returns
+        -------
+        dict[str, NDArray[Any]]
+        """
+        filter_ = [filter] if isinstance(filter, str) else filter
         return {
             k: v
             for k, v in self.data().items()
-            if k not in (SOURCE_INDEX, BOX_COUNT) and isinstance(v, np.ndarray) and v[v != 0].size > 0 and v.ndim == 1
+            if k not in BASE_ATTRS
+            and (filter_ is None or k in filter_)
+            and isinstance(v, np.ndarray)
+            and v.ndim == 1
+            and (not exclude_constant or len(np.unique(v)) > 1)
         }
     def plot(
         self, log: bool, channel_limit: int | None = None, channel_index: int | Iterable[int] | None = None
-    ) -> None:
+    ) -> Figure:
+        """
+        Plots the statistics as a set of histograms.
+        Parameters
+        ----------
+        log : bool
+            If True, plots the histograms on a logarithmic scale.
+        channel_limit : int or None
+            The maximum number of channels to plot. If None, all channels are plotted.
+        channel_index : int, Iterable[int] or None
+            The index or indices of the channels to plot. If None, all channels are plotted.
+        Returns
+        -------
+        matplotlib.figure.Figure
+        """
+        from matplotlib.figure import Figure
         max_channels, ch_mask = self._get_channels(channel_limit, channel_index)
+        factors = self.factors(exclude_constant=True)
+        if not factors:
+            return Figure()
         if max_channels == 1:
-            histogram_plot(self.factors(), log)
-        else:
-            channel_histogram_plot(self.factors(), log, max_channels, ch_mask)
+            return histogram_plot(factors, log)
+        return channel_histogram_plot(factors, log, max_channels, ch_mask)
 @dataclass(frozen=True)
@@ -150,9 +203,9 @@ class DimensionStatsOutput(BaseStatsOutput):
     Attributes
     ----------
-    left : NDArray[np.int32]
+    offset_x : NDArray[np.int32]
         Offsets from the left edge of images in pixels
-    top : NDArray[np.int32]
+    offset_y : NDArray[np.int32]
         Offsets from the top edge of images in pixels
     width : NDArray[np.uint32]
         Width of the images in pixels
@@ -163,25 +216,28 @@ class DimensionStatsOutput(BaseStatsOutput):
     size : NDArray[np.uint32]
         Size of the images in pixels
     aspect_ratio : NDArray[np.float16]
-        :term:`ASspect Ratio<Aspect Ratio>` of the images (width/height)
+        :term:`Aspect Ratio<Aspect Ratio>` of the images (width/height)
     depth : NDArray[np.uint8]
         Color depth of the images in bits
-    center : NDArray[np.uint16]
+    center : NDArray[np.int32]
         Offset from center in [x,y] coordinates of the images in pixels
-    distance : NDArray[np.float16]
+    distance_center : NDArray[np.float32]
         Distance in pixels from center
+    distance_edge : NDArray[np.uint32]
+        Distance in pixels from nearest edge
     """
-    left: NDArray[np.int32]
-    top: NDArray[np.int32]
+    offset_x: NDArray[np.int32]
+    offset_y: NDArray[np.int32]
     width: NDArray[np.uint32]
     height: NDArray[np.uint32]
     channels: NDArray[np.uint8]
     size: NDArray[np.uint32]
     aspect_ratio: NDArray[np.float16]
     depth: NDArray[np.uint8]
-    center: NDArray[np.int16]
-    distance: NDArray[np.float16]
+    center: NDArray[np.int32]
+    distance_center: NDArray[np.float32]
+    distance_edge: NDArray[np.uint32]
 @dataclass(frozen=True)
@@ -281,8 +337,6 @@ class LabelStatsOutput(Output):
         -------
         pd.DataFrame
         """
-        import pandas as pd
         total_count = []
         image_count = []
         for cls in range(len(self.class_names)):

dataeval/outputs/_workflows.py CHANGED Viewed

@@ -154,10 +154,10 @@ def calc_params(p_i: NDArray[Any], n_i: NDArray[Any], niter: int) -> NDArray[Any
         Array of parameters to recreate line of best fit
     """
-    def is_valid(f_new, x_new, f_old, x_old):
+    def is_valid(f_new, x_new, f_old, x_old) -> bool:  # noqa: ANN001
+        """Return True when the candidate objective value ``f_new`` is not NaN."""
-        return f_new != np.nan
+        # NaN compares unequal to every value (itself included), so the previous
+        # ``f_new != np.nan`` was always True; np.isnan performs the real check.
+        return not np.isnan(f_new)
-    def f(x):
+    def f(x) -> float:  # noqa: ANN001
         try:
             return np.sum(np.square(p_i - f_out(n_i, x)))
         except RuntimeWarning:

dataeval/utils/_array.py CHANGED Viewed

@@ -23,7 +23,7 @@ T = TypeVar("T", ArrayLike, np.ndarray, torch.Tensor)
 _np_dtype = TypeVar("_np_dtype", bound=np.generic)
-def _try_import(module_name) -> ModuleType | None:
+def _try_import(module_name: str) -> ModuleType | None:
     if module_name in _MODULE_CACHE:
         return _MODULE_CACHE[module_name]
@@ -148,8 +148,7 @@ def ensure_embeddings(
     if dtype is None:
         return embeddings
-    else:
-        return arr
+    return arr
 @overload
@@ -174,10 +173,9 @@ def flatten(array: ArrayLike) -> NDArray[Any] | torch.Tensor:
     if isinstance(array, np.ndarray):
         nparr = as_numpy(array)
         return nparr.reshape((nparr.shape[0], -1))
-    elif isinstance(array, torch.Tensor):
+    if isinstance(array, torch.Tensor):
         return torch.flatten(array, start_dim=1)
-    else:
-        raise TypeError(f"Unsupported array type {type(array)}.")
+    raise TypeError(f"Unsupported array type {type(array)}.")
 _TArray = TypeVar("_TArray", bound=Array)
@@ -199,7 +197,6 @@ def channels_first_to_last(array: _TArray) -> _TArray:
     """
     if isinstance(array, np.ndarray):
         return np.transpose(array, (1, 2, 0))
-    elif isinstance(array, torch.Tensor):
+    if isinstance(array, torch.Tensor):
         return torch.permute(array, (1, 2, 0))
-    else:
-        raise TypeError(f"Unsupported array type {type(array)}.")
+    raise TypeError(f"Unsupported array type {type(array)}.")

dataeval/utils/_bin.py CHANGED Viewed

@@ -195,5 +195,4 @@ def bin_by_clusters(data: NDArray[np.number[Any]]) -> NDArray[np.float64]:
     if extend_bins:
         bin_edges = np.concatenate([bin_edges, extend_bins])
-    bin_edges = np.sort(bin_edges)
-    return bin_edges
+    return np.sort(bin_edges)

dataeval/utils/_clusterer.py CHANGED Viewed

@@ -4,6 +4,7 @@ __all__ = []
 import warnings
 from dataclasses import dataclass
+from typing import Any
 import numba
 import numpy as np
@@ -30,7 +31,9 @@ from dataeval.utils._fast_mst import calculate_neighbor_distances, minimum_spann
 @numba.njit(parallel=True, locals={"i": numba.types.int32})
-def compare_links_to_cluster_std(mst, clusters):
+def compare_links_to_cluster_std(
+    mst: NDArray[np.float32], clusters: NDArray[np.intp]
+) -> tuple[NDArray[np.int32], NDArray[np.int32]]:
     cluster_ids = np.unique(clusters)
     cluster_grouping = np.full(mst.shape[0], -1, dtype=np.int16)
@@ -79,7 +82,7 @@ def cluster(data: ArrayLike) -> ClusterData:
     cluster_selection_epsilon = 0.0
     # cluster_selection_method = "eom"
-    x = flatten(to_numpy(data))
+    x: NDArray[Any] = flatten(to_numpy(data))
     samples, features = x.shape  # Due to flatten(), we know shape has a length of 2
     if samples < 2:
         raise ValueError(f"Data should have at least 2 samples; got {samples}")
@@ -125,9 +128,9 @@ def cluster(data: ArrayLike) -> ClusterData:
     return ClusterData(clusters, mst, linkage_tree, condensed_tree, membership_strengths, kneighbors, kdistances)
-def sorted_union_find(index_groups):
+def sorted_union_find(index_groups: NDArray[np.int32]) -> list[list[np.int32]]:
     """Merges and sorts groups of indices that share any common index"""
-    groups = [[np.int32(x) for x in range(0)] for y in range(0)]
+    groups: list[list[np.int32]] = [[np.int32(x) for x in range(0)] for y in range(0)]
     uniques, inverse = np.unique(index_groups, return_inverse=True)
     inverse = inverse.flatten()
     disjoint_set = ds_rank_create(uniques.size)

dataeval/utils/_fast_mst.py CHANGED Viewed

@@ -6,9 +6,11 @@
 __all__ = []
 import warnings
+from typing import Any
 import numba
 import numpy as np
+from numpy.typing import NDArray
 from sklearn.neighbors import NearestNeighbors
 with warnings.catch_warnings():
@@ -17,24 +19,26 @@ with warnings.catch_warnings():
 @numba.njit()
-def _ds_union_by_rank(disjoint_set, point, nbr):
+def _ds_union_by_rank(disjoint_set: tuple[NDArray[np.int32], NDArray[np.int32]], point: int, nbr: int) -> int:
     y = ds_find(disjoint_set, point)
     x = ds_find(disjoint_set, nbr)
     if x == y:
         return 0
-    if disjoint_set.rank[x] < disjoint_set.rank[y]:
+    if disjoint_set[1][x] < disjoint_set[1][y]:
         x, y = y, x
-    disjoint_set.parent[y] = x
-    if disjoint_set.rank[x] == disjoint_set.rank[y]:
-        disjoint_set.rank[x] += 1
+    disjoint_set[0][y] = x
+    if disjoint_set[1][x] == disjoint_set[1][y]:
+        disjoint_set[1][x] += 1
     return 1
 @numba.njit(locals={"i": numba.types.uint32, "nbr": numba.types.uint32, "dist": numba.types.float32})
-def _init_tree(n_neighbors, n_distance):
+def _init_tree(
+    n_neighbors: NDArray[np.intp], n_distance: NDArray[np.float32]
+) -> tuple[NDArray[np.float32], int, tuple[NDArray[np.int32], NDArray[np.int32]], NDArray[np.uint32]]:
     # Initial graph to hold tree connections
     tree = np.zeros((n_neighbors.size - 1, 3), dtype=np.float32)
     disjoint_set = ds_rank_create(n_neighbors.size)
@@ -56,7 +60,13 @@ def _init_tree(n_neighbors, n_distance):
 @numba.njit(locals={"i": numba.types.uint32, "nbr": numba.types.uint32})
-def _update_tree_by_distance(tree, int_tree, disjoint_set, n_neighbors, n_distance):
+def _update_tree_by_distance(
+    tree: NDArray[np.float32],
+    int_tree: int,
+    disjoint_set: tuple[NDArray[np.int32], NDArray[np.int32]],
+    n_neighbors: NDArray[np.uint32],
+    n_distance: NDArray[np.float32],
+) -> tuple[NDArray[np.float32], int, tuple[NDArray[np.int32], NDArray[np.int32]], NDArray[np.uint32]]:
     cluster_points = np.empty(n_neighbors.size, dtype=np.uint32)
     sort_dist = np.argsort(n_distance)
     dist_sorted = n_distance[sort_dist]
@@ -80,9 +90,9 @@ def _update_tree_by_distance(tree, int_tree, disjoint_set, n_neighbors, n_distan
 @numba.njit(locals={"i": numba.types.uint32})
-def _cluster_edges(tracker, last_idx, cluster_distances):
+def _cluster_edges(tracker: NDArray[Any], last_idx: int, cluster_distances: NDArray[Any]) -> list[NDArray[np.intp]]:
     cluster_ids = np.unique(tracker)
-    edge_points = []
+    edge_points: list[NDArray[np.intp]] = []
     for idx in range(cluster_ids.size):
         cluster_points = np.nonzero(tracker == cluster_ids[idx])[0]
         cluster_size = cluster_points.size
@@ -102,14 +112,16 @@ def _cluster_edges(tracker, last_idx, cluster_distances):
     return edge_points
-def _compute_nn(dataA, dataB, k):
+def _compute_nn(dataA: NDArray[Any], dataB: NDArray[Any], k: int) -> tuple[NDArray[np.int32], NDArray[np.float32]]:
     distances, neighbors = NearestNeighbors(n_neighbors=k + 1, algorithm="brute").fit(dataA).kneighbors(dataB)
     neighbors = np.array(neighbors[:, 1 : k + 1], dtype=np.int32)
     distances = np.array(distances[:, 1 : k + 1], dtype=np.float32)
     return neighbors, distances
-def _calculate_cluster_neighbors(data, groups, point_array):
+def _calculate_cluster_neighbors(
+    data: NDArray[Any], groups: list[NDArray[np.intp]], point_array: NDArray[Any]
+) -> tuple[NDArray[np.uint32], NDArray[np.float32]]:
     """Rerun nearest neighbor based on clusters"""
     cluster_neighbors = np.zeros(point_array.size, dtype=np.uint32)
     cluster_nbr_distances = np.full(point_array.size, np.inf, dtype=np.float32)
@@ -126,7 +138,9 @@ def _calculate_cluster_neighbors(data, groups, point_array):
     return cluster_neighbors, cluster_nbr_distances
-def minimum_spanning_tree(data, neighbors, distances):
+def minimum_spanning_tree(
+    data: NDArray[Any], neighbors: NDArray[np.int32], distances: NDArray[np.float32]
+) -> NDArray[np.float32]:
     # Transpose arrays to get number of samples along a row
     k_neighbors = neighbors.T.astype(np.uint32).copy()
     k_distances = distances.T.astype(np.float32).copy()
@@ -168,7 +182,7 @@ def minimum_spanning_tree(data, neighbors, distances):
     return tree
-def calculate_neighbor_distances(data: np.ndarray, k: int = 10):
+def calculate_neighbor_distances(data: np.ndarray, k: int = 10) -> tuple[NDArray[np.int32], NDArray[np.float32]]:
     # Have the potential to add in other distance calculations - supported calculations:
     # https://github.com/lmcinnes/pynndescent/blob/master/pynndescent/pynndescent_.py#L524
     try:

dataeval/utils/_image.py CHANGED Viewed

@@ -12,6 +12,9 @@ from scipy.signal import convolve2d
 EDGE_KERNEL = np.array([[-1, -1, -1], [-1, 8, -1], [-1, -1, -1]], dtype=np.int8)
 BIT_DEPTH = (1, 8, 12, 16, 32)
+Box = tuple[int, int, int, int]
+"""Bounding box as tuple of integers in x0, y0, x1, y1 format."""
 @dataclass
 class BitDepth:
@@ -25,12 +28,11 @@ def get_bitdepth(image: NDArray[Any]) -> BitDepth:
     Approximates the bit depth of the image using the
     min and max pixel values.
     """
-    pmin, pmax = np.min(image), np.max(image)
+    pmin, pmax = np.nanmin(image), np.nanmax(image)
     if pmin < 0:
         return BitDepth(0, pmin, pmax)
-    else:
-        depth = ([x for x in BIT_DEPTH if 2**x > pmax] or [max(BIT_DEPTH)])[0]
-        return BitDepth(depth, 0, 2**depth - 1)
+    depth = ([x for x in BIT_DEPTH if 2**x > pmax] or [max(BIT_DEPTH)])[0]
+    return BitDepth(depth, 0, 2**depth - 1)
 def rescale(image: NDArray[Any], depth: int = 1) -> NDArray[Any]:
@@ -40,9 +42,8 @@ def rescale(image: NDArray[Any], depth: int = 1) -> NDArray[Any]:
     bitdepth = get_bitdepth(image)
     if bitdepth.depth == depth:
         return image
-    else:
-        normalized = (image + bitdepth.pmin) / (bitdepth.pmax - bitdepth.pmin)
-        return normalized * (2**depth - 1)
+    normalized = (image + bitdepth.pmin) / (bitdepth.pmax - bitdepth.pmin)
+    return normalized * (2**depth - 1)
 def normalize_image_shape(image: NDArray[Any]) -> NDArray[Any]:
@@ -52,13 +53,12 @@ def normalize_image_shape(image: NDArray[Any]) -> NDArray[Any]:
     ndim = image.ndim
     if ndim == 2:
         return np.expand_dims(image, axis=0)
-    elif ndim == 3:
+    if ndim == 3:
         return image
-    elif ndim > 3:
+    if ndim > 3:
         # Slice all but the last 3 dimensions
         return image[(0,) * (ndim - 3)]
-    else:
-        raise ValueError("Images must have 2 or more dimensions.")
+    raise ValueError("Images must have 2 or more dimensions.")
 def edge_filter(image: NDArray[Any], offset: float = 0.5) -> NDArray[np.uint8]:
@@ -71,3 +71,57 @@ def edge_filter(image: NDArray[Any], offset: float = 0.5) -> NDArray[np.uint8]:
     edges = convolve2d(image, EDGE_KERNEL, mode="same", boundary="symm") + offset
     np.clip(edges, 0, 255, edges)
     return edges
+def clip_box(image: NDArray[Any], box: Box) -> Box:
+    """
+    Constrain a bounding box to the extent of the image.
+    The top-left corner is raised to at least (0, 0) and the bottom-right corner
+    is lowered to at most (width, height), read from the last two image axes.
+    """
+    height, width = image.shape[-2:]
+    left, top, right, bottom = box
+    clipped_left = max(0, left)
+    clipped_top = max(0, top)
+    clipped_right = min(width, right)
+    clipped_bottom = min(height, bottom)
+    return clipped_left, clipped_top, clipped_right, clipped_bottom
+def is_valid_box(box: Box) -> bool:
+    """
+    Report whether the box spans a positive width and a positive height.
+    """
+    has_width = box[0] < box[2]
+    has_height = box[1] < box[3]
+    return has_width and has_height
+def clip_and_pad(image: NDArray[Any], box: Box) -> NDArray[Any]:
+    """
+    Extract a region from an image based on a bounding box, clipping to image boundaries
+    and padding out-of-bounds areas with np.nan.
+    Parameters
+    ----------
+    image : NDArray[Any]
+        Input image array in format C, H, W (channels first); a 2-D H, W image is
+        treated as a single channel
+    box : Box
+        Bounding box coordinates as (x0, y0, x1, y1) where (x0, y0) is top-left and (x1, y1) is bottom-right
+    Returns
+    -------
+    NDArray[Any]
+        The extracted region, at least 1x1 in height and width, with out-of-bounds
+        areas padded with np.nan
+    """
+    # Create output array filled with NaN with a minimum size of 1x1
+    bw, bh = max(1, box[2] - box[0]), max(1, box[3] - box[1])
+    channels = image.shape[-3] if image.ndim > 2 else 1
+    output = np.full((channels, bh, bw), np.nan)
+    # Calculate source box
+    sbox = clip_box(image, box)
+    # Calculate destination box
+    x0, y0 = sbox[0] - box[0], sbox[1] - box[1]
+    x1, y1 = x0 + (sbox[2] - sbox[0]), y0 + (sbox[3] - sbox[1])
+    # Copy the source if valid from the image to the output
+    if is_valid_box(sbox):
+        # Ellipsis indexes the trailing (H, W) axes, so a 2-D image no longer
+        # raises IndexError and is broadcast into the single output channel.
+        output[:, y0:y1, x0:x1] = image[..., sbox[1] : sbox[3], sbox[0] : sbox[2]]
+    return output

dataeval 0.85.0__py3-none-any.whl → 0.86.1__py3-none-any.whl

dataeval 0.85.0py3-none-any.whl → 0.86.1py3-none-any.whl