PyPI - dataeval - Versions diffs - 0.86.0__py3-none-any.whl → 0.86.1__py3-none-any.whl - Mend

dataeval 0.86.0__py3-none-any.whl → 0.86.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (62) hide show

dataeval/__init__.py +1 -1
dataeval/_log.py +1 -1
dataeval/config.py +21 -4
dataeval/data/_embeddings.py +2 -2
dataeval/data/_images.py +2 -3
dataeval/data/_metadata.py +48 -37
dataeval/data/_selection.py +1 -2
dataeval/data/_split.py +2 -3
dataeval/data/_targets.py +17 -13
dataeval/data/selections/_classfilter.py +2 -5
dataeval/data/selections/_prioritize.py +6 -9
dataeval/data/selections/_shuffle.py +3 -1
dataeval/detectors/drift/_base.py +4 -5
dataeval/detectors/drift/_mmd.py +3 -6
dataeval/detectors/drift/_nml/_base.py +4 -2
dataeval/detectors/drift/_nml/_chunk.py +11 -19
dataeval/detectors/drift/_nml/_domainclassifier.py +8 -19
dataeval/detectors/drift/_nml/_result.py +8 -9
dataeval/detectors/drift/_nml/_thresholds.py +66 -77
dataeval/detectors/linters/outliers.py +7 -7
dataeval/metrics/bias/_parity.py +10 -13
dataeval/metrics/estimators/_divergence.py +2 -4
dataeval/metrics/stats/_base.py +103 -42
dataeval/metrics/stats/_boxratiostats.py +21 -19
dataeval/metrics/stats/_dimensionstats.py +14 -10
dataeval/metrics/stats/_hashstats.py +1 -1
dataeval/metrics/stats/_pixelstats.py +6 -6
dataeval/metrics/stats/_visualstats.py +3 -3
dataeval/outputs/_base.py +22 -7
dataeval/outputs/_bias.py +26 -28
dataeval/outputs/_drift.py +1 -9
dataeval/outputs/_linters.py +11 -11
dataeval/outputs/_stats.py +82 -23
dataeval/outputs/_workflows.py +2 -2
dataeval/utils/_array.py +6 -9
dataeval/utils/_bin.py +1 -2
dataeval/utils/_clusterer.py +7 -4
dataeval/utils/_fast_mst.py +27 -13
dataeval/utils/_image.py +65 -11
dataeval/utils/_mst.py +1 -3
dataeval/utils/_plot.py +15 -10
dataeval/utils/data/_dataset.py +32 -20
dataeval/utils/data/metadata.py +104 -82
dataeval/utils/datasets/__init__.py +2 -0
dataeval/utils/datasets/_antiuav.py +189 -0
dataeval/utils/datasets/_base.py +11 -8
dataeval/utils/datasets/_cifar10.py +104 -45
dataeval/utils/datasets/_fileio.py +21 -47
dataeval/utils/datasets/_milco.py +19 -11
dataeval/utils/datasets/_mixin.py +2 -4
dataeval/utils/datasets/_mnist.py +3 -4
dataeval/utils/datasets/_ships.py +14 -7
dataeval/utils/datasets/_voc.py +229 -42
dataeval/utils/torch/models.py +5 -10
dataeval/utils/torch/trainer.py +3 -3
dataeval/workflows/sufficiency.py +2 -2
{dataeval-0.86.0.dist-info → dataeval-0.86.1.dist-info}/METADATA +1 -1
dataeval-0.86.1.dist-info/RECORD +114 -0
dataeval/detectors/ood/vae.py +0 -74
dataeval-0.86.0.dist-info/RECORD +0 -114
{dataeval-0.86.0.dist-info → dataeval-0.86.1.dist-info}/LICENSE.txt +0 -0
{dataeval-0.86.0.dist-info → dataeval-0.86.1.dist-info}/WHEEL +0 -0

dataeval/metrics/stats/_boxratiostats.py CHANGED Viewed

@@ -8,8 +8,9 @@ from typing import Any, Callable, Generic, TypeVar, cast
 import numpy as np
 from numpy.typing import NDArray
+from dataeval.config import EPSILON
 from dataeval.outputs._base import set_metadata
-from dataeval.outputs._stats import BOX_COUNT, SOURCE_INDEX, BaseStatsOutput, DimensionStatsOutput
+from dataeval.outputs._stats import BASE_ATTRS, BaseStatsOutput
 TStatOutput = TypeVar("TStatOutput", bound=BaseStatsOutput, contravariant=True)
 ArraySlice = tuple[int, int]
@@ -40,15 +41,19 @@ class BoxImageStatsOutputSlice(Generic[TStatOutput]):
         self.img = self.StatSlicer(img_stats, img_slice)
-RATIOSTATS_OVERRIDE_MAP: dict[type, dict[str, Callable[..., NDArray[Any]]]] = {
-    DimensionStatsOutput: dict[str, Callable[[BoxImageStatsOutputSlice[DimensionStatsOutput]], NDArray[Any]]](
-        {
-            "left": lambda x: x.box["left"] / x.img["width"],
-            "top": lambda x: x.box["top"] / x.img["height"],
-            "channels": lambda x: x.box["channels"],
-            "depth": lambda x: x.box["depth"],
-            "distance": lambda x: x.box["distance"],
-        }
+RATIOSTATS_OVERRIDE_MAP: dict[str, Callable[[BoxImageStatsOutputSlice[Any]], NDArray[Any]]] = {
+    "offset_x": lambda x: x.box["offset_x"] / x.img["width"],
+    "offset_y": lambda x: x.box["offset_y"] / x.img["height"],
+    "channels": lambda x: x.box["channels"],
+    "depth": lambda x: x.box["depth"],
+    "distance_center": lambda x: x.box["distance_center"]
+    / (np.sqrt(np.square(x.img["width"]) + np.square(x.img["height"])) / 2),
+    "distance_edge": lambda x: x.box["distance_edge"]
+    / (
+        x.img["width"]
+        if np.min([np.abs(x.box["offset_x"]), np.abs((x.box["width"] + x.box["offset_x"]) - x.img["width"])])
+        < np.min([np.abs(x.box["offset_y"]), np.abs((x.box["height"] + x.box["offset_y"]) - x.img["height"])])
+        else x.img["height"]
     ),
 }
@@ -69,11 +74,9 @@ def calculate_ratios(key: str, box_stats: BaseStatsOutput, img_stats: BaseStatsO
     stats = getattr(box_stats, key)
-    # Copy over stats index maps and box counts
-    if key in (SOURCE_INDEX):
+    # Copy over base attributes
+    if key in BASE_ATTRS:
         return copy.deepcopy(stats)
-    elif key == BOX_COUNT:
-        return np.copy(stats)
     # Calculate ratios for each stat
     out_stats: np.ndarray = np.copy(stats).astype(np.float64)
@@ -84,10 +87,9 @@ def calculate_ratios(key: str, box_stats: BaseStatsOutput, img_stats: BaseStatsO
         box_j = len(box_stats) if i == len(box_map) - 1 else box_map[i + 1]
         img_j = len(img_stats) if i == len(img_map) - 1 else img_map[i + 1]
         stats = BoxImageStatsOutputSlice(box_stats, (box_i, box_j), img_stats, (img_i, img_j))
-        out_type = type(box_stats)
-        use_override = out_type in RATIOSTATS_OVERRIDE_MAP and key in RATIOSTATS_OVERRIDE_MAP[out_type]
+        use_override = key in RATIOSTATS_OVERRIDE_MAP
         with np.errstate(divide="ignore", invalid="ignore"):
-            ratio = RATIOSTATS_OVERRIDE_MAP[out_type][key](stats) if use_override else stats.box[key] / stats.img[key]
+            ratio = RATIOSTATS_OVERRIDE_MAP[key](stats) if use_override else stats.box[key] / (stats.img[key] + EPSILON)
         out_stats[box_i:box_j] = ratio.reshape(-1, *out_stats[box_i].shape)
     return out_stats
@@ -141,8 +143,8 @@ def boxratiostats(
     output_cls = type(boxstats)
     if type(boxstats) is not type(imgstats):
         raise TypeError("Must provide stats outputs of the same type.")
-    if boxstats.source_index[-1].image != imgstats.source_index[-1].image:
-        raise ValueError("Stats index_map length mismatch. Check if the correct box and image stats were provided.")
+    if boxstats.image_count != imgstats.image_count:
+        raise ValueError("Stats image count length mismatch. Check if the correct box and image stats were provided.")
     if any(src_idx.box is None for src_idx in boxstats.source_index):
         raise ValueError("Input for boxstats must contain box information.")
     if any(src_idx.box is not None for src_idx in imgstats.source_index):

dataeval/metrics/stats/_dimensionstats.py CHANGED Viewed

@@ -6,6 +6,7 @@ from typing import Any, Callable
 import numpy as np
+from dataeval.config import EPSILON
 from dataeval.metrics.stats._base import StatsProcessor, run_stats
 from dataeval.outputs import DimensionStatsOutput
 from dataeval.outputs._base import set_metadata
@@ -16,18 +17,21 @@ from dataeval.utils._image import get_bitdepth
 class DimensionStatsProcessor(StatsProcessor[DimensionStatsOutput]):
     output_class: type = DimensionStatsOutput
     image_function_map: dict[str, Callable[[StatsProcessor[DimensionStatsOutput]], Any]] = {
-        "left": lambda x: x.box[0],
-        "top": lambda x: x.box[1],
-        "width": lambda x: x.box[2] - x.box[0],
-        "height": lambda x: x.box[3] - x.box[1],
+        "offset_x": lambda x: x.box.x0,
+        "offset_y": lambda x: x.box.y0,
+        "width": lambda x: x.box.width,
+        "height": lambda x: x.box.height,
         "channels": lambda x: x.shape[-3],
-        "size": lambda x: (x.box[2] - x.box[0]) * (x.box[3] - x.box[1]),
-        "aspect_ratio": lambda x: (x.box[2] - x.box[0]) / (x.box[3] - x.box[1]),
+        "size": lambda x: x.box.width * x.box.height,
+        "aspect_ratio": lambda x: x.box.width / (x.box.height + EPSILON),
         "depth": lambda x: get_bitdepth(x.image).depth,
-        "center": lambda x: np.asarray([(x.box[0] + x.box[2]) / 2, (x.box[1] + x.box[3]) / 2]),
-        "distance": lambda x: np.sqrt(
-            np.square(((x.box[0] + x.box[2]) / 2) - (x.shape[-1] / 2))
-            + np.square(((x.box[1] + x.box[3]) / 2) - (x.shape[-2] / 2))
+        "center": lambda x: np.asarray([(x.box.x0 + x.box.x1) / 2, (x.box.y0 + x.box.y1) / 2]),
+        "distance_center": lambda x: np.sqrt(
+            np.square(((x.box.x0 + x.box.x1) / 2) - (x.raw.shape[-1] / 2))
+            + np.square(((x.box.y0 + x.box.y1) / 2) - (x.raw.shape[-2] / 2))
+        ),
+        "distance_edge": lambda x: np.min(
+            [np.abs(x.box.x0), np.abs(x.box.y0), np.abs(x.box.x1 - x.raw.shape[-1]), np.abs(x.box.y1 - x.raw.shape[-2])]
         ),
     }

dataeval/metrics/stats/_hashstats.py CHANGED Viewed

@@ -137,7 +137,7 @@ def hashstats(
     >>> results = hashstats(dataset)
     >>> print(results.xxhash[:5])
-    ['66a93f556577c086', 'd8b686fb405c4105', '7ffdb4990ad44ac6', '42cd4c34c80f6006', 'c5519e36ac1f8839']
+    ['69b50a5f06af238c', '5a861d7a23d1afe7', '7ffdb4990ad44ac6', '4f0c366a3298ceac', 'c5519e36ac1f8839']
     >>> print(results.pchash[:5])
     ['e666999999266666', 'e666999999266666', 'e666999966666299', 'e666999999266666', '96e91656e91616e9']
     """

dataeval/metrics/stats/_pixelstats.py CHANGED Viewed

@@ -16,18 +16,18 @@ from dataeval.typing import ArrayLike, Dataset
 class PixelStatsProcessor(StatsProcessor[PixelStatsOutput]):
     output_class: type = PixelStatsOutput
     image_function_map: dict[str, Callable[[StatsProcessor[PixelStatsOutput]], Any]] = {
-        "mean": lambda x: np.mean(x.scaled),
-        "std": lambda x: np.std(x.scaled),
-        "var": lambda x: np.var(x.scaled),
+        "mean": lambda x: np.nanmean(x.scaled),
+        "std": lambda x: np.nanstd(x.scaled),
+        "var": lambda x: np.nanvar(x.scaled),
         "skew": lambda x: np.nan_to_num(skew(x.scaled.ravel())),
         "kurtosis": lambda x: np.nan_to_num(kurtosis(x.scaled.ravel())),
         "histogram": lambda x: np.histogram(x.scaled, 256, (0, 1))[0],
         "entropy": lambda x: entropy(x.get("histogram")),
     }
     channel_function_map: dict[str, Callable[[StatsProcessor[PixelStatsOutput]], Any]] = {
-        "mean": lambda x: np.mean(x.scaled, axis=1),
-        "std": lambda x: np.std(x.scaled, axis=1),
-        "var": lambda x: np.var(x.scaled, axis=1),
+        "mean": lambda x: np.nanmean(x.scaled, axis=1),
+        "std": lambda x: np.nanstd(x.scaled, axis=1),
+        "var": lambda x: np.nanvar(x.scaled, axis=1),
         "skew": lambda x: np.nan_to_num(skew(x.scaled, axis=1)),
         "kurtosis": lambda x: np.nan_to_num(kurtosis(x.scaled, axis=1)),
         "histogram": lambda x: np.apply_along_axis(lambda y: np.histogram(y, 256, (0, 1))[0], 1, x.scaled),

dataeval/metrics/stats/_visualstats.py CHANGED Viewed

@@ -24,8 +24,8 @@ class VisualStatsProcessor(StatsProcessor[VisualStatsOutput]):
         else (np.max(x.get("percentiles")) - np.min(x.get("percentiles"))) / np.mean(x.get("percentiles")),
         "darkness": lambda x: x.get("percentiles")[-2],
         "missing": lambda x: np.count_nonzero(np.isnan(np.sum(x.image, axis=0))) / np.prod(x.shape[-2:]),
-        "sharpness": lambda x: np.std(edge_filter(np.mean(x.image, axis=0))),
-        "zeros": lambda x: np.count_nonzero(np.sum(x.image, axis=0) == 0) / np.prod(x.shape[-2:]),
+        "sharpness": lambda x: np.nanstd(edge_filter(np.mean(x.image, axis=0))),
+        "zeros": lambda x: np.count_nonzero(np.nansum(x.image, axis=0) == 0) / np.prod(x.shape[-2:]),
         "percentiles": lambda x: np.nanpercentile(x.scaled, q=QUARTILES),
     }
     channel_function_map: dict[str, Callable[[StatsProcessor[VisualStatsOutput]], Any]] = {
@@ -36,7 +36,7 @@ class VisualStatsProcessor(StatsProcessor[VisualStatsOutput]):
         ),
         "darkness": lambda x: x.get("percentiles")[:, -2],
         "missing": lambda x: np.count_nonzero(np.isnan(x.image), axis=(1, 2)) / np.prod(x.shape[-2:]),
-        "sharpness": lambda x: np.std(np.vectorize(edge_filter, signature="(m,n)->(m,n)")(x.image), axis=(1, 2)),
+        "sharpness": lambda x: np.nanstd(np.vectorize(edge_filter, signature="(m,n)->(m,n)")(x.image), axis=(1, 2)),
         "zeros": lambda x: np.count_nonzero(x.image == 0, axis=(1, 2)) / np.prod(x.shape[-2:]),
         "percentiles": lambda x: np.nanpercentile(x.scaled, q=QUARTILES, axis=1).T,
     }

dataeval/outputs/_base.py CHANGED Viewed

@@ -66,25 +66,40 @@ class GenericOutput(Generic[T]):
     def meta(self) -> ExecutionMetadata:
         """
         Metadata about the execution of the function or method for the Output class.
+        Returns
+        -------
+        ExecutionMetadata
         """
         return self._meta or ExecutionMetadata.empty()
 class Output(GenericOutput[dict[str, Any]]):
     def data(self) -> dict[str, Any]:
-        return {k: v for k, v in self.__dict__.items() if k != "_meta"}
+        """
+        The output data as a dictionary.
-    def __repr__(self) -> str:
-        return str(self)
+        Returns
+        -------
+        dict[str, Any]
+        """
+        return {k: v for k, v in self.__dict__.items() if k != "_meta"}
     def __str__(self) -> str:
-        return f"{self.__class__.__name__}({', '.join([f'{k}={v}' for k, v in self.data().items()])})"
+        return str(self.data())
 class BaseCollectionMixin(Collection[Any]):
     __slots__ = ["_data"]
     def data(self) -> Any:
+        """
+        The output data as a collection.
+        Returns
+        -------
+        Collection
+        """
         return self._data
     def __len__(self) -> int:
@@ -102,7 +117,7 @@ TValue = TypeVar("TValue")
 class MappingOutput(Mapping[TKey, TValue], BaseCollectionMixin, GenericOutput[Mapping[TKey, TValue]]):
-    def __init__(self, data: Mapping[TKey, TValue]):
+    def __init__(self, data: Mapping[TKey, TValue]) -> None:
         self._data = data
     def __getitem__(self, key: TKey) -> TValue:
@@ -113,7 +128,7 @@ class MappingOutput(Mapping[TKey, TValue], BaseCollectionMixin, GenericOutput[Ma
 class SequenceOutput(Sequence[TValue], BaseCollectionMixin, GenericOutput[Sequence[TValue]]):
-    def __init__(self, data: Sequence[TValue]):
+    def __init__(self, data: Sequence[TValue]) -> None:
         self._data = data
     @overload
@@ -140,7 +155,7 @@ def set_metadata(fn: Callable[P, R] | None = None, *, state: list[str] | None =
     @wraps(fn)
     def wrapper(*args: P.args, **kwargs: P.kwargs) -> R:
-        def fmt(v):
+        def fmt(v: Any) -> Any:
             if np.isscalar(v):
                 return v
             if hasattr(v, "shape"):

dataeval/outputs/_bias.py CHANGED Viewed

@@ -128,33 +128,30 @@ class CoverageOutput(Output):
         import matplotlib.pyplot as plt
+        images = Images(images) if isinstance(images, Dataset) else images
+        if np.max(self.uncovered_indices) > len(images):
+            raise ValueError(
+                f"Uncovered indices {self.uncovered_indices} specify images "
+                f"unavailable in the provided number of images {len(images)}."
+            )
         # Determine which images to plot
         selected_indices = self.uncovered_indices[:top_k]
-        images = Images(images) if isinstance(images, Dataset) else images
         # Plot the images
         num_images = min(top_k, len(selected_indices))
         rows = int(np.ceil(num_images / 3))
-        fig, axs = plt.subplots(rows, 3, figsize=(9, 3 * rows))
-        if rows == 1:
-            for j in range(3):
-                if j >= len(selected_indices):
-                    continue
-                image = channels_first_to_last(as_numpy(images[selected_indices[j]]))
-                axs[j].imshow(image)
-                axs[j].axis("off")
-        else:
-            for i in range(rows):
-                for j in range(3):
-                    i_j = i * 3 + j
-                    if i_j >= len(selected_indices):
-                        continue
-                    image = channels_first_to_last(as_numpy(images[selected_indices[i_j]]))
-                    axs[i, j].imshow(image)
-                    axs[i, j].axis("off")
+        cols = min(3, num_images)
+        fig, axs = plt.subplots(rows, cols, figsize=(3 * cols, 3 * rows))
+        for image, ax in zip(images[:num_images], axs.flat):
+            image = channels_first_to_last(as_numpy(image))
+            ax.imshow(image)
+            ax.axis("off")
+        for ax in axs.flat[num_images:]:
+            ax.axis("off")
         fig.tight_layout()
         return fig
@@ -233,14 +230,15 @@ class BalanceOutput(Output):
         # return the masked attribute
         if attr == "factor_names":
             return [x.replace(f"-{factor_type}", "") for x in self.factor_names if mask_lambda(x)]
-        else:
-            factor_type_mask = np.asarray([mask_lambda(x) for x in self.factor_names])
-            if attr == "factors":
-                return self.factors[factor_type_mask[1:]][:, factor_type_mask[1:]]
-            elif attr == "balance":
-                return self.balance[factor_type_mask]
-            elif attr == "classwise":
-                return self.classwise[:, factor_type_mask]
+        factor_type_mask = np.asarray([mask_lambda(x) for x in self.factor_names])
+        if attr == "factors":
+            return self.factors[factor_type_mask[1:]][:, factor_type_mask[1:]]
+        if attr == "balance":
+            return self.balance[factor_type_mask]
+        if attr == "classwise":
+            return self.classwise[:, factor_type_mask]
+        raise ValueError(f"Unknown attr {attr} specified.")
     def plot(
         self,

dataeval/outputs/_drift.py CHANGED Viewed

@@ -103,19 +103,13 @@ class DriftMVDCOutput(PerMetricResult):
         metric = Metric(display_name="Domain Classifier", column_name="domain_classifier_auroc")
         super().__init__(results_data, [metric])
-    def plot(self, showme: bool = True) -> Figure:
+    def plot(self) -> Figure:
         """
         Render the roc_auc metric over the train/test data in relation to the threshold.
-        Parameters
-        ----------
-        showme : bool, default True
-            Option to display the figure.
         Returns
         -------
         matplotlib.figure.Figure
         """
         import matplotlib.pyplot as plt
@@ -146,6 +140,4 @@ class DriftMVDCOutput(PerMetricResult):
             ax.set_ylabel("ROC AUC", fontsize=7)
             ax.set_xlabel("Chunk Index", fontsize=7)
             ax.set_ylim((0.0, 1.1))
-            if showme:
-                plt.show()
         return fig

dataeval/outputs/_linters.py CHANGED Viewed

@@ -43,10 +43,12 @@ class DuplicatesOutput(Output, Generic[TIndexCollection]):
     near: list[TIndexCollection]
-def _reorganize_by_class_and_metric(result: IndexIssueMap, lstats: LabelStatsOutput):
+def _reorganize_by_class_and_metric(
+    result: IndexIssueMap, lstats: LabelStatsOutput
+) -> tuple[dict[str, list[int]], dict[str, dict[str, int]]]:
     """Flip result from grouping by image to grouping by class and metric"""
-    metrics = {}
-    class_wise = {label: {} for label in lstats.class_names}
+    metrics: dict[str, list[int]] = {}
+    class_wise: dict[str, dict[str, int]] = {label: {} for label in lstats.class_names}
     # Group metrics and calculate class-wise counts
     for img, group in result.items():
@@ -59,7 +61,7 @@ def _reorganize_by_class_and_metric(result: IndexIssueMap, lstats: LabelStatsOut
     return metrics, class_wise
-def _create_table(metrics, class_wise):
+def _create_table(metrics: dict[str, list[int]], class_wise: dict[str, dict[str, int]]) -> list[str]:
     """Create table for displaying the results"""
     max_class_length = max(len(str(label)) for label in class_wise) + 2
     max_total = max(len(metrics[group]) for group in metrics) + 2
@@ -69,7 +71,7 @@ def _create_table(metrics, class_wise):
         + [f"{group:^{max(5, len(str(group))) + 2}}" for group in sorted(metrics.keys())]
         + [f"{'Total':<{max_total}}"]
     )
-    table_rows = []
+    table_rows: list[str] = []
     for class_cat, results in class_wise.items():
         table_value = [f"{class_cat:>{max_class_length}}"]
@@ -81,15 +83,14 @@ def _create_table(metrics, class_wise):
         table_value.append(f"{total:^{max_total}}")
         table_rows.append(" | ".join(table_value))
-    table = [table_header] + table_rows
-    return table
+    return [table_header] + table_rows
-def _create_pandas_dataframe(class_wise):
+def _create_pandas_dataframe(class_wise: dict[str, dict[str, int]]) -> list[dict[str, str | int]]:
     """Create data for pandas dataframe"""
     data = []
     for label, metrics_dict in class_wise.items():
-        row = {"Class": label}
+        row: dict[str, str | int] = {"Class": label}
         total = sum(metrics_dict.values())
         row.update(metrics_dict)  # Add metric counts
         row["Total"] = total
@@ -118,8 +119,7 @@ class OutliersOutput(Output, Generic[TIndexIssueMap]):
     def __len__(self) -> int:
         if isinstance(self.issues, dict):
             return len(self.issues)
-        else:
-            return sum(len(d) for d in self.issues)
+        return sum(len(d) for d in self.issues)
     def to_table(self, labelstats: LabelStatsOutput) -> str:
         """

dataeval/outputs/_stats.py CHANGED Viewed

@@ -3,7 +3,7 @@ from __future__ import annotations
 __all__ = []
 from dataclasses import dataclass
-from typing import Any, Iterable, NamedTuple, Optional, Union
+from typing import TYPE_CHECKING, Any, Iterable, NamedTuple, Optional, Sequence, Union
 import numpy as np
 import pandas as pd
@@ -13,10 +13,16 @@ from typing_extensions import TypeAlias
 from dataeval.outputs._base import Output
 from dataeval.utils._plot import channel_histogram_plot, histogram_plot
+if TYPE_CHECKING:
+    from matplotlib.figure import Figure
 OptionalRange: TypeAlias = Optional[Union[int, Iterable[int]]]
 SOURCE_INDEX = "source_index"
-BOX_COUNT = "box_count"
+OBJECT_COUNT = "object_count"
+IMAGE_COUNT = "image_count"
+BASE_ATTRS = (SOURCE_INDEX, OBJECT_COUNT, IMAGE_COUNT)
 class SourceIndex(NamedTuple):
@@ -51,17 +57,24 @@ class BaseStatsOutput(Output):
     ----------
     source_index : List[SourceIndex]
         Mapping from statistic to source image, box and channel index
-    box_count : NDArray[np.uint16]
+    object_count : NDArray[np.uint16]
+        The number of detected objects in each image
     """
     source_index: list[SourceIndex]
-    box_count: NDArray[np.uint16]
+    object_count: NDArray[np.uint16]
+    image_count: int
     def __post_init__(self) -> None:
-        length = len(self.source_index)
-        bad = {k: len(v) for k, v in self.data().items() if k not in [SOURCE_INDEX, BOX_COUNT] and len(v) != length}
-        if bad:
-            raise ValueError(f"All values must have the same length as source_index. Bad values: {str(bad)}.")
+        si_length = len(self.source_index)
+        mismatch = {k: len(v) for k, v in self.data().items() if k not in BASE_ATTRS and len(v) != si_length}
+        if mismatch:
+            raise ValueError(f"All values must have the same length as source_index. Bad values: {str(mismatch)}.")
+        oc_length = len(self.object_count)
+        if oc_length != self.image_count:
+            raise ValueError(
+                f"Total object counts per image does not match image count. {oc_length} != {self.image_count}."
+            )
     def get_channel_mask(
         self,
@@ -123,21 +136,64 @@ class BaseStatsOutput(Output):
         return max_channels, ch_mask
-    def factors(self) -> dict[str, NDArray[Any]]:
+    def factors(
+        self,
+        filter: str | Sequence[str] | None = None,  # noqa: A002
+        exclude_constant: bool = False,
+    ) -> dict[str, NDArray[Any]]:
+        """
+        Returns all 1-dimensional data as a dictionary of numpy arrays.
+        Parameters
+        ----------
+        filter : str, Sequence[str] or None, default None:
+            If provided, only returns keys that match the filter.
+        exclude_constant : bool, default False
+            If True, exclude arrays that contain only a single unique value.
+        Returns
+        -------
+        dict[str, NDArray[Any]]
+        """
+        filter_ = [filter] if isinstance(filter, str) else filter
         return {
             k: v
             for k, v in self.data().items()
-            if k not in (SOURCE_INDEX, BOX_COUNT) and isinstance(v, np.ndarray) and v[v != 0].size > 0 and v.ndim == 1
+            if k not in BASE_ATTRS
+            and (filter_ is None or k in filter_)
+            and isinstance(v, np.ndarray)
+            and v.ndim == 1
+            and (not exclude_constant or len(np.unique(v)) > 1)
         }
     def plot(
         self, log: bool, channel_limit: int | None = None, channel_index: int | Iterable[int] | None = None
-    ) -> None:
+    ) -> Figure:
+        """
+        Plots the statistics as a set of histograms.
+        Parameters
+        ----------
+        log : bool
+            If True, plots the histograms on a logarithmic scale.
+        channel_limit : int or None
+            The maximum number of channels to plot. If None, all channels are plotted.
+        channel_index : int, Iterable[int] or None
+            The index or indices of the channels to plot. If None, all channels are plotted.
+        Returns
+        -------
+        matplotlib.Figure
+        """
+        from matplotlib.figure import Figure
         max_channels, ch_mask = self._get_channels(channel_limit, channel_index)
+        factors = self.factors(exclude_constant=True)
+        if not factors:
+            return Figure()
         if max_channels == 1:
-            histogram_plot(self.factors(), log)
-        else:
-            channel_histogram_plot(self.factors(), log, max_channels, ch_mask)
+            return histogram_plot(factors, log)
+        return channel_histogram_plot(factors, log, max_channels, ch_mask)
 @dataclass(frozen=True)
@@ -147,9 +203,9 @@ class DimensionStatsOutput(BaseStatsOutput):
     Attributes
     ----------
-    left : NDArray[np.int32]
+    offset_x : NDArray[np.int32]
         Offsets from the left edge of images in pixels
-    top : NDArray[np.int32]
+    offset_y : NDArray[np.int32]
         Offsets from the top edge of images in pixels
     width : NDArray[np.uint32]
         Width of the images in pixels
@@ -160,25 +216,28 @@ class DimensionStatsOutput(BaseStatsOutput):
     size : NDArray[np.uint32]
         Size of the images in pixels
     aspect_ratio : NDArray[np.float16]
-        :term:`ASspect Ratio<Aspect Ratio>` of the images (width/height)
+        :term:`Aspect Ratio<Aspect Ratio>` of the images (width/height)
     depth : NDArray[np.uint8]
         Color depth of the images in bits
-    center : NDArray[np.uint16]
+    center : NDArray[np.uint32]
         Offset from center in [x,y] coordinates of the images in pixels
-    distance : NDArray[np.float16]
+    distance_center : NDArray[np.float32]
         Distance in pixels from center
+    distance_edge : NDArray[np.uint32]
+        Distance in pixels from nearest edge
     """
-    left: NDArray[np.int32]
-    top: NDArray[np.int32]
+    offset_x: NDArray[np.int32]
+    offset_y: NDArray[np.int32]
     width: NDArray[np.uint32]
     height: NDArray[np.uint32]
     channels: NDArray[np.uint8]
     size: NDArray[np.uint32]
     aspect_ratio: NDArray[np.float16]
     depth: NDArray[np.uint8]
-    center: NDArray[np.int16]
-    distance: NDArray[np.float16]
+    center: NDArray[np.int32]
+    distance_center: NDArray[np.float32]
+    distance_edge: NDArray[np.uint32]
 @dataclass(frozen=True)

dataeval/outputs/_workflows.py CHANGED Viewed

@@ -154,10 +154,10 @@ def calc_params(p_i: NDArray[Any], n_i: NDArray[Any], niter: int) -> NDArray[Any
         Array of parameters to recreate line of best fit
     """
-    def is_valid(f_new, x_new, f_old, x_old):
+    def is_valid(f_new, x_new, f_old, x_old) -> bool:  # noqa: ANN001
         return f_new != np.nan
-    def f(x):
+    def f(x) -> float:  # noqa: ANN001
         try:
             return np.sum(np.square(p_i - f_out(n_i, x)))
         except RuntimeWarning:

dataeval/utils/_array.py CHANGED Viewed

@@ -23,7 +23,7 @@ T = TypeVar("T", ArrayLike, np.ndarray, torch.Tensor)
 _np_dtype = TypeVar("_np_dtype", bound=np.generic)
-def _try_import(module_name) -> ModuleType | None:
+def _try_import(module_name: str) -> ModuleType | None:
     if module_name in _MODULE_CACHE:
         return _MODULE_CACHE[module_name]
@@ -148,8 +148,7 @@ def ensure_embeddings(
     if dtype is None:
         return embeddings
-    else:
-        return arr
+    return arr
 @overload
@@ -174,10 +173,9 @@ def flatten(array: ArrayLike) -> NDArray[Any] | torch.Tensor:
     if isinstance(array, np.ndarray):
         nparr = as_numpy(array)
         return nparr.reshape((nparr.shape[0], -1))
-    elif isinstance(array, torch.Tensor):
+    if isinstance(array, torch.Tensor):
         return torch.flatten(array, start_dim=1)
-    else:
-        raise TypeError(f"Unsupported array type {type(array)}.")
+    raise TypeError(f"Unsupported array type {type(array)}.")
 _TArray = TypeVar("_TArray", bound=Array)
@@ -199,7 +197,6 @@ def channels_first_to_last(array: _TArray) -> _TArray:
     """
     if isinstance(array, np.ndarray):
         return np.transpose(array, (1, 2, 0))
-    elif isinstance(array, torch.Tensor):
+    if isinstance(array, torch.Tensor):
         return torch.permute(array, (1, 2, 0))
-    else:
-        raise TypeError(f"Unsupported array type {type(array)}.")
+    raise TypeError(f"Unsupported array type {type(array)}.")

dataeval/utils/_bin.py CHANGED Viewed

@@ -195,5 +195,4 @@ def bin_by_clusters(data: NDArray[np.number[Any]]) -> NDArray[np.float64]:
     if extend_bins:
         bin_edges = np.concatenate([bin_edges, extend_bins])
-    bin_edges = np.sort(bin_edges)
-    return bin_edges
+    return np.sort(bin_edges)

dataeval 0.86.0__py3-none-any.whl → 0.86.1__py3-none-any.whl

dataeval 0.86.0__py3-none-any.whl → 0.86.1__py3-none-any.whl