PyPI - dataeval - Versions diffs - 0.64.0__py3-none-any.whl → 0.66.0__py3-none-any.whl - Mend

dataeval 0.64.0__py3-none-any.whl → 0.66.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (65)

dataeval/__init__.py +13 -9
dataeval/_internal/detectors/clusterer.py +63 -49
dataeval/_internal/detectors/drift/base.py +248 -51
dataeval/_internal/detectors/drift/cvm.py +28 -26
dataeval/_internal/detectors/drift/ks.py +31 -28
dataeval/_internal/detectors/drift/mmd.py +62 -42
dataeval/_internal/detectors/drift/torch.py +69 -60
dataeval/_internal/detectors/drift/uncertainty.py +32 -32
dataeval/_internal/detectors/duplicates.py +67 -31
dataeval/_internal/detectors/ood/ae.py +15 -29
dataeval/_internal/detectors/ood/aegmm.py +33 -27
dataeval/_internal/detectors/ood/base.py +86 -47
dataeval/_internal/detectors/ood/llr.py +34 -31
dataeval/_internal/detectors/ood/vae.py +32 -31
dataeval/_internal/detectors/ood/vaegmm.py +34 -28
dataeval/_internal/detectors/{linter.py → outliers.py} +60 -38
dataeval/_internal/flags.py +44 -21
dataeval/_internal/interop.py +5 -3
dataeval/_internal/metrics/balance.py +42 -5
dataeval/_internal/metrics/ber.py +11 -8
dataeval/_internal/metrics/coverage.py +15 -8
dataeval/_internal/metrics/divergence.py +41 -7
dataeval/_internal/metrics/diversity.py +57 -19
dataeval/_internal/metrics/parity.py +141 -66
dataeval/_internal/metrics/stats.py +330 -313
dataeval/_internal/metrics/uap.py +33 -4
dataeval/_internal/metrics/utils.py +79 -40
dataeval/_internal/models/pytorch/autoencoder.py +127 -22
dataeval/_internal/models/tensorflow/autoencoder.py +33 -30
dataeval/_internal/models/tensorflow/gmm.py +4 -2
dataeval/_internal/models/tensorflow/losses.py +17 -13
dataeval/_internal/models/tensorflow/pixelcnn.py +19 -18
dataeval/_internal/models/tensorflow/trainer.py +10 -7
dataeval/_internal/models/tensorflow/utils.py +23 -20
dataeval/_internal/output.py +85 -0
dataeval/_internal/utils.py +5 -3
dataeval/_internal/workflows/sufficiency.py +122 -121
dataeval/detectors/__init__.py +6 -25
dataeval/detectors/drift/__init__.py +16 -0
dataeval/detectors/drift/kernels/__init__.py +6 -0
dataeval/detectors/drift/updates/__init__.py +3 -0
dataeval/detectors/linters/__init__.py +5 -0
dataeval/detectors/ood/__init__.py +11 -0
dataeval/flags/__init__.py +2 -2
dataeval/metrics/__init__.py +2 -26
dataeval/metrics/bias/__init__.py +14 -0
dataeval/metrics/estimators/__init__.py +9 -0
dataeval/metrics/stats/__init__.py +6 -0
dataeval/tensorflow/__init__.py +3 -0
dataeval/tensorflow/loss/__init__.py +3 -0
dataeval/tensorflow/models/__init__.py +5 -0
dataeval/tensorflow/recon/__init__.py +3 -0
dataeval/torch/__init__.py +3 -0
dataeval/{models/torch → torch/models}/__init__.py +1 -2
dataeval/torch/trainer/__init__.py +3 -0
dataeval/utils/__init__.py +3 -6
dataeval/workflows/__init__.py +2 -4
{dataeval-0.64.0.dist-info → dataeval-0.66.0.dist-info}/METADATA +1 -1
dataeval-0.66.0.dist-info/RECORD +72 -0
dataeval/_internal/metrics/base.py +0 -10
dataeval/models/__init__.py +0 -15
dataeval/models/tensorflow/__init__.py +0 -6
dataeval-0.64.0.dist-info/RECORD +0 -60
{dataeval-0.64.0.dist-info → dataeval-0.66.0.dist-info}/LICENSE.txt +0 -0
{dataeval-0.64.0.dist-info → dataeval-0.66.0.dist-info}/WHEEL +0 -0

dataeval/_internal/metrics/parity.py CHANGED Viewed

@@ -1,48 +1,41 @@
+from __future__ import annotations
 import warnings
-from typing import Dict, Mapping, NamedTuple, Optional, Tuple
+from dataclasses import dataclass
+from typing import Generic, Mapping, TypeVar
 import numpy as np
 from numpy.typing import ArrayLike, NDArray
 from scipy.stats import chi2_contingency, chisquare
 from dataeval._internal.interop import to_numpy
+from dataeval._internal.output import OutputMetadata, set_metadata
-class ParityOutput(NamedTuple):
-    """
-    Attributes
-    ----------
-    score : np.float64
-        chi-squared value of the test
-    p_value : np.float64
-        p-value of the test
-    """
-    score: np.float64
-    p_value: np.float64
+TData = TypeVar("TData", np.float64, NDArray[np.float64])
-class ParityMetadataOutput(NamedTuple):
+@dataclass(frozen=True)
+class ParityOutput(Generic[TData], OutputMetadata):
     """
     Attributes
     ----------
-    scores : NDArray[np.float64]
-        chi-squared values of the test
-    p_values : NDArray[np.float64]
-        p-values of the test
+    score : np.float64 | NDArray[np.float64]
+        chi-squared score(s) of the test
+    p_value : np.float64 | NDArray[np.float64]
+        p-value(s) of the test
     """
-    score: NDArray[np.float64]
-    p_value: NDArray[np.float64]
+    score: TData
+    p_value: TData
-def digitize_factor_bins(continuous_values: np.ndarray, bins: int, factor_name: str):
+def digitize_factor_bins(continuous_values: NDArray, bins: int, factor_name: str) -> NDArray:
     """
     Digitizes a list of values into a given number of bins.
     Parameters
     ----------
-    continuous_values: np.ndarray
+    continuous_values: NDArray
         The values to be digitized.
     bins: int
         The number of bins for the discrete values that continuous_values will be digitized into.
@@ -51,10 +44,10 @@ def digitize_factor_bins(continuous_values: np.ndarray, bins: int, factor_name:
     Returns
     -------
-    np.ndarray
+    NDArray
         The digitized values
     """
     if not np.all([np.issubdtype(type(n), np.number) for n in continuous_values]):
         raise TypeError(
             f"Encountered a non-numeric value for factor {factor_name}, but the factor"
@@ -69,14 +62,14 @@ def digitize_factor_bins(continuous_values: np.ndarray, bins: int, factor_name:
 def format_discretize_factors(
-    data_factors: dict[str, np.ndarray], continuous_factor_bincounts: Dict[str, int]
-) -> Tuple[dict, np.ndarray]:
+    data_factors: dict[str, NDArray], continuous_factor_bincounts: dict[str, int]
+) -> tuple[dict[str, NDArray], NDArray]:
     """
     Sets up the internal list of metadata factors.
     Parameters
     ----------
-    data_factors: Dict[str, np.ndarray]
+    data_factors: Dict[str, NDArray]
         The dataset factors, which are per-image attributes including class label and metadata.
         Each key of dataset_factors is a factor, whose value is the per-image factor values.
     continuous_factor_bincounts : Dict[str, int]
@@ -87,12 +80,12 @@ def format_discretize_factors(
     Returns
     -------
-    Dict[str, np.ndarray]
-        Intrinsic per-image metadata information with the formatting that input data_factors uses.
-        Each key is a metadata factor, whose value is the discrete per-image factor values.
-    np.ndarray
-        Per-image labels, whose ith element is the label for the ith element of the dataset.
+    Tuple[Dict[str, NDArray], NDArray]
+        - Intrinsic per-image metadata information with the formatting that input data_factors uses.
+          Each key is a metadata factor, whose value is the discrete per-image factor values.
+        - Per-image labels, whose ith element is the label for the ith element of the dataset.
     """
     invalid_keys = set(continuous_factor_bincounts.keys()) - set(data_factors.keys())
     if invalid_keys:
         raise KeyError(
@@ -123,7 +116,36 @@ def format_discretize_factors(
     return metadata_factors, labels
-def normalize_expected_dist(expected_dist: np.ndarray, observed_dist: np.ndarray) -> np.ndarray:
+def normalize_expected_dist(expected_dist: NDArray, observed_dist: NDArray) -> NDArray:
+    """
+    Normalize the expected label distribution to match the total number of labels in the observed distribution.
+    This function adjusts the expected distribution so that its sum equals the sum of the observed distribution.
+    If the expected distribution is all zeros, an error is raised.
+    Parameters
+    ----------
+    expected_dist : np.ndarray
+        The expected label distribution. This array represents the anticipated distribution of labels.
+    observed_dist : np.ndarray
+        The observed label distribution. This array represents the actual distribution of labels in the dataset.
+    Returns
+    -------
+    np.ndarray
+        The normalized expected distribution, scaled to have the same sum as the observed distribution.
+    Raises
+    ------
+    ValueError
+        If the expected distribution is all zeros.
+    Notes
+    -----
+    The function ensures that the total number of labels in the expected distribution matches the total
+    number of labels in the observed distribution by scaling the expected distribution.
+    """
     exp_sum = np.sum(expected_dist)
     obs_sum = np.sum(observed_dist)
@@ -141,14 +163,14 @@ def normalize_expected_dist(expected_dist: np.ndarray, observed_dist: np.ndarray
     return expected_dist
-def validate_dist(label_dist: np.ndarray, label_name: str):
+def validate_dist(label_dist: NDArray, label_name: str):
     """
     Verifies that the given label distribution has labels and checks if
     any labels have frequencies less than 5.
     Parameters
     ----------
-    label_dist : np.ndarray
+    label_dist : NDArray
         Array representing label distributions
     Raises
@@ -158,6 +180,7 @@ def validate_dist(label_dist: np.ndarray, label_name: str):
     Warning
         If any elements of label_dist are less than 5
     """
     if not len(label_dist):
         raise ValueError(f"No labels found in the {label_name} dataset")
     if np.any(label_dist < 5):
@@ -166,24 +189,20 @@ def validate_dist(label_dist: np.ndarray, label_name: str):
             " dataset have frequencies less than 5. This may lead"
             " to invalid chi-squared evaluation."
         )
-        warnings.warn(
-            f"Labels {np.where(label_dist<5)[0]} in {label_name}"
-            " dataset have frequencies less than 5. This may lead"
-            " to invalid chi-squared evaluation."
-        )
-def parity(
+@set_metadata("dataeval.metrics")
+def label_parity(
     expected_labels: ArrayLike,
     observed_labels: ArrayLike,
-    num_classes: Optional[int] = None,
-) -> ParityOutput:
+    num_classes: int | None = None,
+) -> ParityOutput[np.float64]:
     """
-    Perform a one-way chi-squared test between observation frequencies and expected frequencies that
-    tests the null hypothesis that the observed data has the expected frequencies.
+    Calculate the chi-square statistic to assess the parity between expected and observed label distributions.
-    This function acts as an interface to the scipy.stats.chisquare method, which is documented at
-    https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.chisquare.html
+    This function computes the frequency distribution of classes in both expected and observed labels, normalizes
+    the expected distribution to match the total number of observed labels, and then calculates the chi-square
+    statistic to determine if there is a significant difference between the two distributions.
     Parameters
     ----------
@@ -191,9 +210,9 @@ def parity(
         List of class labels in the expected dataset
     observed_labels : ArrayLike
         List of class labels in the observed dataset
-    num_classes : Optional[int]
-        The number of unique classes in the datasets. If this is not specified, it will
-        be inferred from the set of unique labels in expected_labels and observed_labels
+    num_classes : int | None, default None
+        The number of unique classes in the datasets. If not provided, the function will infer it
+        from the set of unique labels in expected_labels and observed_labels
     Returns
     -------
@@ -203,8 +222,31 @@ def parity(
     Raises
     ------
     ValueError
-        If x is empty
+        If expected label distribution is empty, is all zeros, or if there is a mismatch in the number
+        of unique classes between the observed and expected distributions.
+    Notes
+    -----
+    - Providing ``num_classes`` can be helpful if there are classes with zero instances in one of the distributions.
+    - The function first validates the observed distribution and normalizes the expected distribution so that it
+      has the same total number of labels as the observed distribution.
+    - It then performs a chi-square test to determine if there is a statistically significant difference between
+      the observed and expected label distributions.
+    - This function acts as an interface to the scipy.stats.chisquare method, which is documented at
+      https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.chisquare.html
+    Examples
+    --------
+    Randomly creating some label distributions using ``np.random.default_rng``
+    >>> expected_labels = np_random_gen.choice([0, 1, 2, 3, 4], (100))
+    >>> observed_labels = np_random_gen.choice([2, 3, 0, 4, 1], (100))
+    >>> label_parity(expected_labels, observed_labels)
+    ParityOutput(score=14.007374204742625, p_value=0.0072715574616218)
     """
     # Calculate
     if not num_classes:
         num_classes = 0
@@ -236,27 +278,28 @@ def parity(
     return ParityOutput(cs, p)
-def parity_metadata(
+@set_metadata("dataeval.metrics")
+def parity(
     data_factors: Mapping[str, ArrayLike],
-    continuous_factor_bincounts: Optional[Dict[str, int]] = None,
-) -> ParityMetadataOutput:
+    continuous_factor_bincounts: dict[str, int] | None = None,
+) -> ParityOutput[NDArray[np.float64]]:
     """
-    Evaluates the statistical independence of metadata factors from class labels.
-    This performs a chi-square test, which provides a score and a p-value for
-    statistical independence between each pair of a metadata factor and a class label.
-    A high score with a low p-value suggests that a metadata factor is strongly
-    correlated with a class label.
+    Calculate chi-square statistics to assess the relationship between multiple factors and class labels.
+    This function computes the chi-square statistic for each metadata factor to determine if there is
+    a significant relationship between the factor values and class labels. The function handles both categorical
+    and discretized continuous factors.
     Parameters
     ----------
     data_factors: Mapping[str, ArrayLike]
         The dataset factors, which are per-image attributes including class label and metadata.
         Each key of dataset_factors is a factor, whose value is the per-image factor values.
-    continuous_factor_bincounts : Optional[Dict[str, int]], default None
-        The factors in data_factors that have continuous values and the array of bin counts to
-        discretize values into. All factors are treated as having discrete values unless they
-        are specified as keys in this dictionary. Each element of this array must occur as a key
-        in data_factors.
+    continuous_factor_bincounts : Dict[str, int] | None, default None
+        A dictionary specifying the number of bins for discretizing the continuous factors.
+        The keys should correspond to the names of continuous factors in `data_factors`,
+        and the values should be the number of bins to use for discretization.
+        If not provided, no discretization is applied.
     Returns
     -------
@@ -264,7 +307,39 @@ def parity_metadata(
         Arrays of length (num_factors) whose (i)th element corresponds to the
         chi-square score and p-value for the relationship between factor i and
         the class labels in the dataset.
+    Raises
+    ------
+    Warning
+        If any cell in the contingency matrix has a value between 0 and 5, a warning is issued because this can
+        lead to inaccurate chi-square calculations. It is recommended to ensure that each label co-occurs with
+        factor values either 0 times or at least 5 times. Alternatively, continuous-valued factors can be digitized
+        into fewer bins.
+    Notes
+    -----
+    - Each key of the ``continuous_factor_bincounts`` dictionary must occur as a key in data_factors.
+    - A high score with a low p-value suggests that a metadata factor is strongly correlated with a class label.
+    - The function creates a contingency matrix for each factor, where each entry represents the frequency of a
+      specific factor value co-occurring with a particular class label.
+    - Rows containing only zeros in the contingency matrix are removed before performing the chi-square test
+      to prevent errors in the calculation.
+    Examples
+    --------
+    Randomly creating some "continuous" and categorical variables using ``np.random.default_rng``
+    >>> data_factors = {
+    ...     "age": np_random_gen.choice([25, 30, 35, 45], (100)),
+    ...     "income": np_random_gen.choice([50000, 65000, 80000], (100)),
+    ...     "gender": np_random_gen.choice(["M", "F"], (100)),
+    ...     "class": np_random_gen.choice([0, 1, 2], (100)),
+    ... }
+    >>> continuous_factor_bincounts = {"age": 4, "income": 3}
+    >>> parity(data_factors, continuous_factor_bincounts)
+    ParityOutput(score=array([2.82329785, 1.60625584, 1.38377236]), p_value=array([0.83067563, 0.80766733, 0.5006309 ]))
     """
     data_factors_np = {k: to_numpy(v) for k, v in data_factors.items()}
     continuous_factor_bincounts = continuous_factor_bincounts if continuous_factor_bincounts else {}
@@ -306,4 +381,4 @@ def parity_metadata(
         chi_scores[i] = chi2
         p_values[i] = p
-    return ParityMetadataOutput(chi_scores, p_values)
+    return ParityOutput(chi_scores, p_values)

dataeval 0.64.0__py3-none-any.whl → 0.66.0__py3-none-any.whl

dataeval 0.64.0__py3-none-any.whl → 0.66.0__py3-none-any.whl