PyPI - dataeval - Versions diffs - 0.74.1__py3-none-any.whl → 0.75.0__py3-none-any.whl - Mend

dataeval 0.74.1__py3-none-any.whl → 0.75.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (68) hide show

dataeval/__init__.py +33 -10
dataeval/detectors/__init__.py +2 -2
dataeval/detectors/drift/__init__.py +14 -12
dataeval/detectors/drift/base.py +1 -1
dataeval/detectors/drift/cvm.py +1 -1
dataeval/detectors/drift/ks.py +1 -1
dataeval/detectors/drift/mmd.py +6 -5
dataeval/detectors/drift/torch.py +12 -12
dataeval/detectors/drift/uncertainty.py +3 -2
dataeval/detectors/linters/__init__.py +4 -4
dataeval/detectors/linters/clusterer.py +2 -7
dataeval/detectors/linters/duplicates.py +6 -10
dataeval/detectors/linters/outliers.py +4 -2
dataeval/detectors/ood/__init__.py +3 -10
dataeval/detectors/ood/{ae_torch.py → ae.py} +6 -4
dataeval/detectors/ood/base.py +64 -161
dataeval/detectors/ood/metadata_ks_compare.py +34 -42
dataeval/detectors/ood/metadata_least_likely.py +3 -3
dataeval/detectors/ood/metadata_ood_mi.py +6 -5
dataeval/detectors/ood/mixin.py +146 -0
dataeval/detectors/ood/output.py +63 -0
dataeval/interop.py +16 -3
dataeval/log.py +18 -0
dataeval/metrics/__init__.py +2 -2
dataeval/metrics/bias/__init__.py +9 -12
dataeval/metrics/bias/balance.py +10 -8
dataeval/metrics/bias/coverage.py +52 -4
dataeval/metrics/bias/diversity.py +42 -14
dataeval/metrics/bias/parity.py +15 -12
dataeval/metrics/estimators/__init__.py +2 -2
dataeval/metrics/estimators/ber.py +3 -1
dataeval/metrics/estimators/divergence.py +1 -1
dataeval/metrics/estimators/uap.py +1 -1
dataeval/metrics/stats/__init__.py +18 -18
dataeval/metrics/stats/base.py +4 -4
dataeval/metrics/stats/boxratiostats.py +8 -9
dataeval/metrics/stats/datasetstats.py +10 -14
dataeval/metrics/stats/dimensionstats.py +4 -4
dataeval/metrics/stats/hashstats.py +12 -8
dataeval/metrics/stats/labelstats.py +5 -5
dataeval/metrics/stats/pixelstats.py +4 -9
dataeval/metrics/stats/visualstats.py +4 -9
dataeval/output.py +1 -1
dataeval/utils/__init__.py +4 -13
dataeval/utils/dataset/__init__.py +7 -0
dataeval/utils/{torch → dataset}/datasets.py +2 -0
dataeval/utils/dataset/read.py +63 -0
dataeval/utils/dataset/split.py +527 -0
dataeval/utils/image.py +2 -2
dataeval/utils/metadata.py +310 -5
dataeval/{metrics/bias/metadata_utils.py → utils/plot.py} +1 -104
dataeval/utils/torch/__init__.py +2 -17
dataeval/utils/torch/gmm.py +29 -6
dataeval/utils/torch/{utils.py → internal.py} +82 -58
dataeval/utils/torch/models.py +10 -8
dataeval/utils/torch/trainer.py +6 -85
dataeval/workflows/__init__.py +2 -5
dataeval/workflows/sufficiency.py +16 -6
dataeval-0.75.0.dist-info/METADATA +136 -0
dataeval-0.75.0.dist-info/RECORD +67 -0
dataeval/detectors/ood/base_torch.py +0 -109
dataeval/metrics/bias/metadata_preprocessing.py +0 -285
dataeval/utils/gmm.py +0 -26
dataeval/utils/split_dataset.py +0 -492
dataeval-0.74.1.dist-info/METADATA +0 -120
dataeval-0.74.1.dist-info/RECORD +0 -65
{dataeval-0.74.1.dist-info → dataeval-0.75.0.dist-info}/LICENSE.txt +0 -0
{dataeval-0.74.1.dist-info → dataeval-0.75.0.dist-info}/WHEEL +0 -0

dataeval/metrics/__init__.py CHANGED Viewed

@@ -3,6 +3,6 @@ Metrics are a way to measure the performance of your models or datasets that
 can then be analyzed in the context of a given problem.
 """
-from dataeval.metrics import bias, estimators, stats
 __all__ = ["bias", "estimators", "stats"]
+from dataeval.metrics import bias, estimators, stats

dataeval/metrics/bias/__init__.py CHANGED Viewed

@@ -3,22 +3,19 @@ Bias metrics check for skewed or imbalanced datasets and incomplete feature
 representation which may impact model performance.
 """
-from dataeval.metrics.bias.balance import BalanceOutput, balance
-from dataeval.metrics.bias.coverage import CoverageOutput, coverage
-from dataeval.metrics.bias.diversity import DiversityOutput, diversity
-from dataeval.metrics.bias.metadata_preprocessing import MetadataOutput, metadata_preprocessing
-from dataeval.metrics.bias.parity import ParityOutput, label_parity, parity
 __all__ = [
+    "BalanceOutput",
+    "CoverageOutput",
+    "DiversityOutput",
+    "ParityOutput",
     "balance",
     "coverage",
     "diversity",
     "label_parity",
     "parity",
-    "metadata_preprocessing",
-    "BalanceOutput",
-    "CoverageOutput",
-    "DiversityOutput",
-    "ParityOutput",
-    "MetadataOutput",
 ]
+from dataeval.metrics.bias.balance import BalanceOutput, balance
+from dataeval.metrics.bias.coverage import CoverageOutput, coverage
+from dataeval.metrics.bias.diversity import DiversityOutput, diversity
+from dataeval.metrics.bias.parity import ParityOutput, label_parity, parity

dataeval/metrics/bias/balance.py CHANGED Viewed

@@ -1,6 +1,6 @@
 from __future__ import annotations
-__all__ = ["BalanceOutput", "balance"]
+__all__ = []
 import contextlib
 import warnings
@@ -12,9 +12,9 @@ import scipy as sp
 from numpy.typing import NDArray
 from sklearn.feature_selection import mutual_info_classif, mutual_info_regression
-from dataeval.metrics.bias.metadata_preprocessing import MetadataOutput
-from dataeval.metrics.bias.metadata_utils import get_counts, heatmap
 from dataeval.output import Output, set_metadata
+from dataeval.utils.metadata import Metadata, get_counts
+from dataeval.utils.plot import heatmap
 with contextlib.suppress(ImportError):
     from matplotlib.figure import Figure
@@ -119,7 +119,7 @@ def _validate_num_neighbors(num_neighbors: int) -> int:
 @set_metadata
 def balance(
-    metadata: MetadataOutput,
+    metadata: Metadata,
     num_neighbors: int = 5,
 ) -> BalanceOutput:
     """
@@ -127,14 +127,16 @@ def balance(
     Parameters
     ----------
-    metadata : MetadataOutput
-        Output after running `metadata_preprocessing`
+    metadata : Metadata
+        Preprocessed metadata from :func:`dataeval.utils.metadata.preprocess`
+    num_neighbors : int, default 5
+        Number of points to consider as neighbors
     Returns
     -------
     BalanceOutput
-        (num_factors+1) x (num_factors+1) estimate of mutual information
-        between num_factors metadata factors and class label. Symmetry is enforced.
+        (num_factors+1) x (num_factors+1) estimate of mutual information \
+            between num_factors metadata factors and class label. Symmetry is enforced.
     Note
     ----

dataeval/metrics/bias/coverage.py CHANGED Viewed

@@ -1,18 +1,17 @@
 from __future__ import annotations
-__all__ = ["CoverageOutput", "coverage"]
+__all__ = []
 import contextlib
 import math
 from dataclasses import dataclass
-from typing import Literal
+from typing import Any, Literal
 import numpy as np
 from numpy.typing import ArrayLike, NDArray
 from scipy.spatial.distance import pdist, squareform
 from dataeval.interop import to_numpy
-from dataeval.metrics.bias.metadata_utils import coverage_plot
 from dataeval.output import Output, set_metadata
 from dataeval.utils.shared import flatten
@@ -20,6 +19,55 @@ with contextlib.suppress(ImportError):
     from matplotlib.figure import Figure
+def _plot(images: NDArray[Any], num_images: int) -> Figure:
+    """
+    Creates a single plot of all of the provided images
+    Parameters
+    ----------
+    images : NDArray
+        Array containing only the desired images to plot
+    Returns
+    -------
+    matplotlib.figure.Figure
+        Plot of all provided images
+    """
+    import matplotlib.pyplot as plt
+    num_images = min(num_images, len(images))
+    if images.ndim == 4:
+        images = np.moveaxis(images, 1, -1)
+    elif images.ndim == 3:
+        images = np.repeat(images[:, :, :, np.newaxis], 3, axis=-1)
+    else:
+        raise ValueError(
+            f"Expected a (N,C,H,W) or a (N, H, W) set of images, but got a {images.ndim}-dimensional set of images."
+        )
+    rows = int(np.ceil(num_images / 3))
+    fig, axs = plt.subplots(rows, 3, figsize=(9, 3 * rows))
+    if rows == 1:
+        for j in range(3):
+            if j >= len(images):
+                continue
+            axs[j].imshow(images[j])
+            axs[j].axis("off")
+    else:
+        for i in range(rows):
+            for j in range(3):
+                i_j = i * 3 + j
+                if i_j >= len(images):
+                    continue
+                axs[i, j].imshow(images[i_j])
+                axs[i, j].axis("off")
+    fig.tight_layout()
+    return fig
 @dataclass(frozen=True)
 class CoverageOutput(Output):
     """
@@ -62,7 +110,7 @@ class CoverageOutput(Output):
         selected_images = images[highest_uncovered_indices]
         # Plot the images
-        fig = coverage_plot(selected_images, top_k)
+        fig = _plot(selected_images, top_k)
         return fig

dataeval/metrics/bias/diversity.py CHANGED Viewed

@@ -1,6 +1,6 @@
 from __future__ import annotations
-__all__ = ["DiversityOutput", "diversity"]
+__all__ = []
 import contextlib
 from dataclasses import dataclass
@@ -10,15 +10,44 @@ import numpy as np
 import scipy as sp
 from numpy.typing import ArrayLike, NDArray
-from dataeval.metrics.bias.metadata_preprocessing import MetadataOutput
-from dataeval.metrics.bias.metadata_utils import diversity_bar_plot, get_counts, heatmap
 from dataeval.output import Output, set_metadata
+from dataeval.utils.metadata import Metadata, get_counts
+from dataeval.utils.plot import heatmap
 from dataeval.utils.shared import get_method
 with contextlib.suppress(ImportError):
     from matplotlib.figure import Figure
+def _plot(labels: NDArray[Any], bar_heights: NDArray[Any]) -> Figure:
+    """
+    Plots a formatted bar plot
+    Parameters
+    ----------
+    labels : NDArray
+        Array containing the labels for each bar
+    bar_heights : NDArray
+        Array containing the values for each bar
+    Returns
+    -------
+    matplotlib.figure.Figure
+        Bar plot figure
+    """
+    import matplotlib.pyplot as plt
+    fig, ax = plt.subplots(figsize=(10, 10))
+    ax.bar(labels, bar_heights)
+    ax.set_xlabel("Factors")
+    plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor")
+    fig.tight_layout()
+    return fig
 @dataclass(frozen=True)
 class DiversityOutput(Output):
     """
@@ -77,8 +106,7 @@ class DiversityOutput(Output):
         else:
             # Creating label array for heat map axes
             heat_labels = np.concatenate((["class"], self.factor_names))
-            fig = diversity_bar_plot(heat_labels, self.diversity_index)
+            fig = _plot(heat_labels, self.diversity_index)
         return fig
@@ -165,7 +193,7 @@ def diversity_simpson(
 @set_metadata
 def diversity(
-    metadata: MetadataOutput,
+    metadata: Metadata,
     method: Literal["simpson", "shannon"] = "simpson",
 ) -> DiversityOutput:
     """
@@ -179,8 +207,8 @@ def diversity(
     Parameters
     ----------
-    metadata : MetadataOutput
-        Output after running `metadata_preprocessing`
+    metadata : Metadata
+        Preprocessed metadata from :func:`dataeval.utils.metadata.preprocess`
     Note
     ----
@@ -199,21 +227,21 @@ def diversity(
     >>> div_simp = diversity(metadata, method="simpson")
     >>> div_simp.diversity_index
-    array([0.72413793, 0.88636364, 0.72413793])
+    array([0.6       , 0.80882353, 1.        , 0.8       ])
     >>> div_simp.classwise
-    array([[0.69230769, 0.68965517],
-           [0.5       , 0.8       ]])
+    array([[0.5       , 0.8       , 0.8       ],
+           [0.63043478, 0.97560976, 0.52830189]])
     Compute Shannon diversity index of metadata and class labels
     >>> div_shan = diversity(metadata, method="shannon")
     >>> div_shan.diversity_index
-    array([0.8812909 , 0.96748876, 0.8812909 ])
+    array([0.81127812, 0.9426312 , 1.        , 0.91829583])
     >>> div_shan.classwise
-    array([[0.91651644, 0.86312057],
-           [0.68260619, 0.91829583]])
+    array([[0.68260619, 0.91829583, 0.91829583],
+           [0.81443569, 0.99107606, 0.76420451]])
     See Also
     --------

dataeval/metrics/bias/parity.py CHANGED Viewed

@@ -1,6 +1,6 @@
 from __future__ import annotations
-__all__ = ["ParityOutput", "parity", "label_parity"]
+__all__ = []
 import warnings
 from dataclasses import dataclass
@@ -12,8 +12,8 @@ from scipy.stats import chisquare
 from scipy.stats.contingency import chi2_contingency, crosstab
 from dataeval.interop import as_numpy, to_numpy
-from dataeval.metrics.bias.metadata_preprocessing import MetadataOutput
 from dataeval.output import Output, set_metadata
+from dataeval.utils.metadata import Metadata
 TData = TypeVar("TData", np.float64, NDArray[np.float64])
@@ -167,8 +167,9 @@ def label_parity(
     --------
     Randomly creating some label distributions using ``np.random.default_rng``
-    >>> expected_labels = np_random_gen.choice([0, 1, 2, 3, 4], (100))
-    >>> observed_labels = np_random_gen.choice([2, 3, 0, 4, 1], (100))
+    >>> rng = np.random.default_rng(175)
+    >>> expected_labels = rng.choice([0, 1, 2, 3, 4], (100))
+    >>> observed_labels = rng.choice([2, 3, 0, 4, 1], (100))
     >>> label_parity(expected_labels, observed_labels)
     ParityOutput(score=14.007374204742625, p_value=0.0072715574616218, metadata_names=None)
     """
@@ -205,7 +206,7 @@ def label_parity(
 @set_metadata
-def parity(metadata: MetadataOutput) -> ParityOutput[NDArray[np.float64]]:
+def parity(metadata: Metadata) -> ParityOutput[NDArray[np.float64]]:
     """
     Calculate chi-square statistics to assess the linear relationship between multiple factors
     and class labels.
@@ -216,8 +217,8 @@ def parity(metadata: MetadataOutput) -> ParityOutput[NDArray[np.float64]]:
     Parameters
     ----------
-    metadata : MetadataOutput
-        Output after running `metadata_preprocessing`
+    metadata : Metadata
+        Preprocessed metadata from :func:`dataeval.utils.metadata.preprocess`
     Returns
     -------
@@ -249,16 +250,18 @@ def parity(metadata: MetadataOutput) -> ParityOutput[NDArray[np.float64]]:
     --------
     Randomly creating some "continuous" and categorical variables using ``np.random.default_rng``
-    >>> labels = np_random_gen.choice([0, 1, 2], (100))
+    >>> from dataeval.utils.metadata import preprocess
+    >>> rng = np.random.default_rng(175)
+    >>> labels = rng.choice([0, 1, 2], (100))
     >>> metadata_dict = [
     ...     {
-    ...         "age": list(np_random_gen.choice([25, 30, 35, 45], (100))),
-    ...         "income": list(np_random_gen.choice([50000, 65000, 80000], (100))),
-    ...         "gender": list(np_random_gen.choice(["M", "F"], (100))),
+    ...         "age": list(rng.choice([25, 30, 35, 45], (100))),
+    ...         "income": list(rng.choice([50000, 65000, 80000], (100))),
+    ...         "gender": list(rng.choice(["M", "F"], (100))),
     ...     }
     ... ]
     >>> continuous_factor_bincounts = {"age": 4, "income": 3}
-    >>> metadata = metadata_preprocessing(metadata_dict, labels, continuous_factor_bincounts)
+    >>> metadata = preprocess(metadata_dict, labels, continuous_factor_bincounts)
     >>> parity(metadata)
     ParityOutput(score=array([7.35731943, 5.46711299, 0.51506212]), p_value=array([0.28906231, 0.24263543, 0.77295762]), metadata_names=['age', 'income', 'gender'])
     """  # noqa: E501

dataeval/metrics/estimators/__init__.py CHANGED Viewed

@@ -2,8 +2,8 @@
 Estimators calculate performance bounds and the statistical distance between datasets.
 """
+__all__ = ["ber", "divergence", "uap", "BEROutput", "DivergenceOutput", "UAPOutput"]
 from dataeval.metrics.estimators.ber import BEROutput, ber
 from dataeval.metrics.estimators.divergence import DivergenceOutput, divergence
 from dataeval.metrics.estimators.uap import UAPOutput, uap
-__all__ = ["ber", "divergence", "uap", "BEROutput", "DivergenceOutput", "UAPOutput"]

dataeval/metrics/estimators/ber.py CHANGED Viewed

@@ -5,11 +5,12 @@ KNN based estimate for the :term:`Bayes error rate<Bayes Error Rate (BER)>`
 Learning to Bound the Multi-class Bayes Error (Th. 3 and Th. 4)
 https://arxiv.org/abs/1811.06419
 """
 from __future__ import annotations
-__all__ = ["BEROutput", "ber"]
+__all__ = []
 from dataclasses import dataclass
 from typing import Literal
@@ -38,6 +39,7 @@ class BEROutput(Output):
     """
     ber: float
     ber_lower: float

dataeval/metrics/estimators/divergence.py CHANGED Viewed

@@ -5,7 +5,7 @@ using the Fast Nearest Neighbor and Minimum Spanning Tree algorithms
 from __future__ import annotations
-__all__ = ["DivergenceOutput", "divergence"]
+__all__ = []
 from dataclasses import dataclass
 from typing import Literal

dataeval/metrics/estimators/uap.py CHANGED Viewed

@@ -6,7 +6,7 @@ average precision<Upper-Bound Average Precision (UAP)>` using empirical mean pre
 from __future__ import annotations
-__all__ = ["UAPOutput", "uap"]
+__all__ = []
 from dataclasses import dataclass

dataeval/metrics/stats/__init__.py CHANGED Viewed

@@ -3,6 +3,24 @@ Statistics metrics calculate a variety of image properties and pixel statistics
 and label statistics against the images and labels of a dataset.
 """
+__all__ = [
+    "ChannelStatsOutput",
+    "DatasetStatsOutput",
+    "DimensionStatsOutput",
+    "HashStatsOutput",
+    "LabelStatsOutput",
+    "PixelStatsOutput",
+    "VisualStatsOutput",
+    "boxratiostats",
+    "channelstats",
+    "datasetstats",
+    "dimensionstats",
+    "hashstats",
+    "labelstats",
+    "pixelstats",
+    "visualstats",
+]
 from dataeval.metrics.stats.boxratiostats import boxratiostats
 from dataeval.metrics.stats.datasetstats import (
     ChannelStatsOutput,
@@ -15,21 +33,3 @@ from dataeval.metrics.stats.hashstats import HashStatsOutput, hashstats
 from dataeval.metrics.stats.labelstats import LabelStatsOutput, labelstats
 from dataeval.metrics.stats.pixelstats import PixelStatsOutput, pixelstats
 from dataeval.metrics.stats.visualstats import VisualStatsOutput, visualstats
-__all__ = [
-    "boxratiostats",
-    "channelstats",
-    "datasetstats",
-    "dimensionstats",
-    "hashstats",
-    "labelstats",
-    "pixelstats",
-    "visualstats",
-    "ChannelStatsOutput",
-    "DatasetStatsOutput",
-    "DimensionStatsOutput",
-    "HashStatsOutput",
-    "LabelStatsOutput",
-    "PixelStatsOutput",
-    "VisualStatsOutput",
-]

dataeval/metrics/stats/base.py CHANGED Viewed

@@ -193,7 +193,7 @@ class StatsProcessorOutput(NamedTuple):
     results: list[dict[str, Any]]
     source_indices: list[SourceIndex]
     box_counts: list[int]
-    warnings_list: list[tuple[int, int, NDArray[np.float64], tuple[int, ...]]]
+    warnings_list: list[str]
 def process_stats(
@@ -206,13 +206,13 @@ def process_stats(
     results_list: list[dict[str, Any]] = []
     source_indices: list[SourceIndex] = []
     box_counts: list[int] = []
-    warnings_list: list[tuple[int, int, NDArray[np.float64], tuple[int, ...]]] = []
+    warnings_list: list[str] = []
     nboxes = [None] if boxes is None else normalize_box_shape(boxes)
     for i_b, box in enumerate(nboxes):
         i_b = None if box is None else i_b
         processor_list = [p(image, box, per_channel) for p in stats_processor_cls]
         if any(not p._is_valid_slice for p in processor_list) and i_b is not None and box is not None:
-            warnings_list.append((i, i_b, box, image.shape))
+            warnings_list.append(f"Bounding box [{i}][{i_b}]: {box} is out of bounds of {image.shape}.")
         results_list.append({k: v for p in processor_list for k, v in p.process().items()})
         if per_channel:
             source_indices.extend([SourceIndex(i, i_b, c) for c in range(image_boxes[0].shape[-3])])
@@ -302,7 +302,7 @@ def run_stats(
     # warnings are not emitted while in multiprocessing pools so we emit after gathering all warnings
     for w in warning_list:
-        warnings.warn(f"Bounding box [{w[0]}][{w[1]}]: {w[2]} is out of bounds of {w[3]}.", UserWarning)
+        warnings.warn(w, UserWarning)
     output = {}
     for results in results_list:

dataeval/metrics/stats/boxratiostats.py CHANGED Viewed

@@ -1,6 +1,6 @@
 from __future__ import annotations
-__all__ = ["boxratiostats"]
+__all__ = []
 import copy
 from typing import Any, Callable, Generic, TypeVar, cast
@@ -130,17 +130,16 @@ def boxratiostats(
     --------
     Calculating the box ratio statistics using the dimension stats of the boxes and images
-    >>> imagestats = dimensionstats(images)
-    >>> boxstats = dimensionstats(images, bboxes)
+    >>> from dataeval.metrics.stats import dimensionstats
+    >>> imagestats = dimensionstats(stats_images)
+    >>> boxstats = dimensionstats(stats_images, bboxes)
     >>> ratiostats = boxratiostats(boxstats, imagestats)
     >>> print(ratiostats.aspect_ratio)
-    [ 1.15169271  0.78450521 21.33333333  1.5234375   2.25651042  0.77799479
-      0.88867188  3.40625     1.73307292  1.11132812  0.75018315  0.45018315
-      0.69596354 20.          5.11197917  2.33333333  0.75        0.70019531]
+    [ 0.86376953  0.58837891 16.          0.85714286  1.26959707  0.43772894
+      0.66650391  3.83296703  1.95018315]
     >>> print(ratiostats.size)
-    [0.03401693 0.01383464 0.00130208 0.01822917 0.02327474 0.00683594
-     0.01220703 0.0168457  0.01057943 0.00976562 0.00130208 0.01098633
-     0.02246094 0.0012207  0.01123047 0.00911458 0.02636719 0.06835938]
+    [0.0255127  0.01037598 0.00097656 0.01822917 0.02327474 0.00683594
+     0.00915527 0.03369141 0.02115885]
     """
     output_cls = type(boxstats)
     if type(boxstats) is not type(imgstats):

dataeval/metrics/stats/datasetstats.py CHANGED Viewed

@@ -1,6 +1,6 @@
 from __future__ import annotations
-__all__ = ["DatasetStatsOutput", "ChannelStatsOutput", "datasetstats", "channelstats"]
+__all__ = []
 from dataclasses import dataclass
 from typing import Any, Iterable
@@ -25,7 +25,7 @@ class DatasetStatsOutput(Output):
     This class represents the outputs of various stats functions against a single
     dataset, such that each index across all stat outputs are representative of
-    the same source image.  Modifying or mixing outputs will result in inaccurate
+    the same source image. Modifying or mixing outputs will result in inaccurate
     outlier calculations if not created correctly.
     Attributes
@@ -60,7 +60,7 @@ class ChannelStatsOutput(Output):
     This class represents the outputs of various per-channel stats functions against
     a single dataset, such that each index across all stat outputs are representative
-    of the same source image.  Modifying or mixing outputs will result in inaccurate
+    of the same source image. Modifying or mixing outputs will result in inaccurate
     outlier calculations if not created correctly.
     Attributes
@@ -119,13 +119,11 @@ def datasetstats(
     --------
     Calculating the dimension, pixel and visual stats for a dataset with bounding boxes
-    >>> stats = datasetstats(images, bboxes)
+    >>> stats = datasetstats(stats_images, bboxes)
     >>> print(stats.dimensionstats.aspect_ratio)
-    [ 0.864   0.5884 16.      1.143   1.692   0.5835  0.6665  2.555   1.3
-      0.8335  1.      0.6     0.522  15.      3.834   1.75    0.75    0.7   ]
-    >>> print(stats.visualstats.contrast)
-    [1.744   1.946   0.1164  0.0635  0.0633  0.06274 0.0429  0.0317  0.0317
-     0.02576 0.02081 0.02171 0.01915 0.01767 0.01799 0.01595 0.01433 0.01478]
+    [ 0.864   0.5884 16.      1.143   1.692   0.5835  0.6665  2.555   1.3   ]
+    >>> print(stats.visualstats.sharpness)
+    [4.04   4.434  0.2778 4.957  5.145  5.22   4.957  3.076  2.855 ]
     """
     outputs = run_stats(images, bboxes, False, [DimensionStatsProcessor, PixelStatsProcessor, VisualStatsProcessor])
     return DatasetStatsOutput(*outputs, labelstats=labelstats(labels) if labels else None)  # type: ignore
@@ -162,12 +160,10 @@ def channelstats(
     --------
     Calculating the per-channel pixel and visual stats for a dataset
-    >>> stats = channelstats(images)
+    >>> stats = channelstats(stats_images)
     >>> print(stats.visualstats.darkness)
-    [0.07495 0.1748  0.275   0.1047  0.11096 0.1172  0.2047  0.2109  0.2172
-     0.3047  0.311   0.3171  0.4048  0.411   0.4172  0.505   0.5107  0.517
-     0.6045  0.611   0.617   0.7046  0.711   0.7173  0.8047  0.811   0.8174
-     0.905   0.911   0.917  ]
+    [0.1499 0.3499 0.55   0.2094 0.2219 0.2344 0.4194 0.6094 0.622  0.6343
+     0.8154]
     """
     outputs = run_stats(images, bboxes, True, [PixelStatsProcessor, VisualStatsProcessor])
     return ChannelStatsOutput(*outputs)  # type: ignore

dataeval/metrics/stats/dimensionstats.py CHANGED Viewed

@@ -1,6 +1,6 @@
 from __future__ import annotations
-__all__ = ["DimensionStatsOutput", "dimensionstats"]
+__all__ = []
 from dataclasses import dataclass
 from typing import Any, Callable, Iterable
@@ -106,10 +106,10 @@ def dimensionstats(
     --------
     Calculating the dimension statistics on the images, whose shape is (C, H, W)
-    >>> results = dimensionstats(images)
+    >>> results = dimensionstats(stats_images)
     >>> print(results.aspect_ratio)
-    [0.75  0.75  0.75  0.75  0.75  0.75  1.333 0.75  0.75  1.   ]
+    [1.     1.     1.333  1.     0.6665]
     >>> print(results.channels)
-    [1 1 1 1 1 1 3 1 1 3]
+    [3 3 1 3 1]
     """
     return run_stats(images, bboxes, False, [DimensionStatsProcessor])[0]

dataeval/metrics/stats/hashstats.py CHANGED Viewed

@@ -1,6 +1,8 @@
 from __future__ import annotations
-__all__ = ["HashStatsOutput", "hashstats"]
+import warnings
+__all__ = []
 from dataclasses import dataclass
 from typing import Callable, Iterable
@@ -41,7 +43,7 @@ def pchash(image: ArrayLike) -> str:
     """
     Performs a perceptual hash on an image by resizing to a square NxN image
     using the Lanczos algorithm where N is 32x32 or the largest multiple of
-    8 that is smaller than the input image dimensions.  The resampled image
+    8 that is smaller than the input image dimensions. The resampled image
     is compressed using a discrete cosine transform and the lowest frequency
     component is encoded as a bit array of greater or less than median value
     and returned as a hex string.
@@ -54,13 +56,15 @@ def pchash(image: ArrayLike) -> str:
     Returns
     -------
     str
-        The hex string hash of the image using perceptual hashing
+        The hex string hash of the image using perceptual hashing, or empty
+        string if the image is too small to be hashed
     """
     # Verify that the image is at least larger than an 8x8 image
     arr = as_numpy(image)
     min_dim = min(arr.shape[-2:])
     if min_dim < HASH_SIZE + 1:
-        raise ValueError(f"Image must be larger than {HASH_SIZE}x{HASH_SIZE} for fuzzy hashing.")
+        warnings.warn(f"Image must be larger than {HASH_SIZE}x{HASH_SIZE} for fuzzy hashing.")
+        return ""
     # Calculates the dimensions of the resized square image
     resize_dim = HASH_SIZE * min((min_dim - 1) // HASH_SIZE, MAX_FACTOR)
@@ -92,7 +96,7 @@ def pchash(image: ArrayLike) -> str:
 def xxhash(image: ArrayLike) -> str:
     """
     Performs a fast non-cryptographic hash using the xxhash algorithm
-    (xxhash.com) against the image as a flattened bytearray.  The hash
+    (xxhash.com) against the image as a flattened bytearray. The hash
     is returned as a hex string.
     Parameters
@@ -147,10 +151,10 @@ def hashstats(
     --------
     Calculating the statistics on the images, whose shape is (C, H, W)
-    >>> results = hashstats(images)
+    >>> results = hashstats(stats_images)
     >>> print(results.xxhash)
-    ['a72434443d6e7336', 'efc12c2f14581d79', '4a1e03483a27d674', '3a3ecedbcf814226']
+    ['6274f837b34ed9f0', '256504fdb6e3d2a4', '7dd0c56ca8474fb0', '50956ad4592f5bbc', '5ba2354079d42aa5']
     >>> print(results.pchash)
-    ['8f25506af46a7c6a', '8000808000008080', '8e71f18e0ef18e0e', 'a956d6a956d6a928']
+    ['a666999999666666', 'e666999999266666', 'e666999966663299', 'e666999999266666', '96e91656e91616e9']
     """
     return run_stats(images, bboxes, False, [HashStatsProcessor])[0]

dataeval 0.74.1__py3-none-any.whl → 0.75.0__py3-none-any.whl

dataeval 0.74.1__py3-none-any.whl → 0.75.0__py3-none-any.whl