dataeval 0.76.1__py3-none-any.whl → 0.82.0__py3-none-any.whl
- dataeval/__init__.py +3 -3
- dataeval/config.py +77 -0
- dataeval/detectors/__init__.py +1 -1
- dataeval/detectors/drift/__init__.py +6 -6
- dataeval/detectors/drift/{base.py → _base.py} +40 -85
- dataeval/detectors/drift/{cvm.py → _cvm.py} +21 -28
- dataeval/detectors/drift/{ks.py → _ks.py} +20 -26
- dataeval/detectors/drift/{mmd.py → _mmd.py} +31 -43
- dataeval/detectors/drift/{torch.py → _torch.py} +2 -1
- dataeval/detectors/drift/{uncertainty.py → _uncertainty.py} +24 -7
- dataeval/detectors/drift/updates.py +20 -3
- dataeval/detectors/linters/__init__.py +3 -5
- dataeval/detectors/linters/duplicates.py +13 -36
- dataeval/detectors/linters/outliers.py +23 -148
- dataeval/detectors/ood/__init__.py +1 -1
- dataeval/detectors/ood/ae.py +30 -9
- dataeval/detectors/ood/base.py +5 -4
- dataeval/detectors/ood/mixin.py +21 -7
- dataeval/detectors/ood/vae.py +73 -0
- dataeval/metadata/__init__.py +6 -0
- dataeval/metadata/_distance.py +167 -0
- dataeval/metadata/_ood.py +217 -0
- dataeval/metadata/_utils.py +44 -0
- dataeval/metrics/__init__.py +1 -1
- dataeval/metrics/bias/__init__.py +6 -4
- dataeval/metrics/bias/{balance.py → _balance.py} +15 -101
- dataeval/metrics/bias/_coverage.py +98 -0
- dataeval/metrics/bias/{diversity.py → _diversity.py} +18 -111
- dataeval/metrics/bias/{parity.py → _parity.py} +39 -77
- dataeval/metrics/estimators/__init__.py +15 -4
- dataeval/metrics/estimators/{ber.py → _ber.py} +42 -29
- dataeval/metrics/estimators/_clusterer.py +44 -0
- dataeval/metrics/estimators/{divergence.py → _divergence.py} +18 -30
- dataeval/metrics/estimators/{uap.py → _uap.py} +4 -18
- dataeval/metrics/stats/__init__.py +16 -13
- dataeval/metrics/stats/{base.py → _base.py} +82 -133
- dataeval/metrics/stats/{boxratiostats.py → _boxratiostats.py} +15 -18
- dataeval/metrics/stats/_dimensionstats.py +75 -0
- dataeval/metrics/stats/{hashstats.py → _hashstats.py} +21 -37
- dataeval/metrics/stats/_imagestats.py +94 -0
- dataeval/metrics/stats/_labelstats.py +131 -0
- dataeval/metrics/stats/{pixelstats.py → _pixelstats.py} +19 -50
- dataeval/metrics/stats/{visualstats.py → _visualstats.py} +23 -54
- dataeval/outputs/__init__.py +53 -0
- dataeval/{output.py → outputs/_base.py} +55 -25
- dataeval/outputs/_bias.py +381 -0
- dataeval/outputs/_drift.py +83 -0
- dataeval/outputs/_estimators.py +114 -0
- dataeval/outputs/_linters.py +184 -0
- dataeval/{detectors/ood/output.py → outputs/_ood.py} +22 -22
- dataeval/outputs/_stats.py +387 -0
- dataeval/outputs/_utils.py +44 -0
- dataeval/outputs/_workflows.py +364 -0
- dataeval/typing.py +234 -0
- dataeval/utils/__init__.py +2 -2
- dataeval/utils/_array.py +169 -0
- dataeval/utils/_bin.py +199 -0
- dataeval/utils/_clusterer.py +144 -0
- dataeval/utils/_fast_mst.py +189 -0
- dataeval/utils/{image.py → _image.py} +6 -4
- dataeval/utils/_method.py +14 -0
- dataeval/utils/{shared.py → _mst.py} +3 -65
- dataeval/utils/{plot.py → _plot.py} +6 -6
- dataeval/utils/data/__init__.py +26 -0
- dataeval/utils/data/_dataset.py +217 -0
- dataeval/utils/data/_embeddings.py +104 -0
- dataeval/utils/data/_images.py +68 -0
- dataeval/utils/data/_metadata.py +360 -0
- dataeval/utils/data/_selection.py +126 -0
- dataeval/utils/{dataset/split.py → data/_split.py} +12 -38
- dataeval/utils/data/_targets.py +85 -0
- dataeval/utils/data/collate.py +103 -0
- dataeval/utils/data/datasets/__init__.py +17 -0
- dataeval/utils/data/datasets/_base.py +254 -0
- dataeval/utils/data/datasets/_cifar10.py +134 -0
- dataeval/utils/data/datasets/_fileio.py +168 -0
- dataeval/utils/data/datasets/_milco.py +153 -0
- dataeval/utils/data/datasets/_mixin.py +56 -0
- dataeval/utils/data/datasets/_mnist.py +183 -0
- dataeval/utils/data/datasets/_ships.py +123 -0
- dataeval/utils/data/datasets/_types.py +52 -0
- dataeval/utils/data/datasets/_voc.py +352 -0
- dataeval/utils/data/selections/__init__.py +15 -0
- dataeval/utils/data/selections/_classfilter.py +57 -0
- dataeval/utils/data/selections/_indices.py +26 -0
- dataeval/utils/data/selections/_limit.py +26 -0
- dataeval/utils/data/selections/_reverse.py +18 -0
- dataeval/utils/data/selections/_shuffle.py +29 -0
- dataeval/utils/metadata.py +51 -376
- dataeval/utils/torch/{gmm.py → _gmm.py} +4 -2
- dataeval/utils/torch/{internal.py → _internal.py} +21 -51
- dataeval/utils/torch/models.py +43 -2
- dataeval/workflows/__init__.py +2 -1
- dataeval/workflows/sufficiency.py +11 -346
- {dataeval-0.76.1.dist-info → dataeval-0.82.0.dist-info}/METADATA +5 -2
- dataeval-0.82.0.dist-info/RECORD +104 -0
- dataeval/detectors/linters/clusterer.py +0 -512
- dataeval/detectors/linters/merged_stats.py +0 -49
- dataeval/detectors/ood/metadata_ks_compare.py +0 -129
- dataeval/detectors/ood/metadata_least_likely.py +0 -119
- dataeval/interop.py +0 -69
- dataeval/metrics/bias/coverage.py +0 -194
- dataeval/metrics/stats/datasetstats.py +0 -202
- dataeval/metrics/stats/dimensionstats.py +0 -115
- dataeval/metrics/stats/labelstats.py +0 -210
- dataeval/utils/dataset/__init__.py +0 -7
- dataeval/utils/dataset/datasets.py +0 -412
- dataeval/utils/dataset/read.py +0 -63
- dataeval-0.76.1.dist-info/RECORD +0 -67
- /dataeval/{log.py → _log.py} +0 -0
- /dataeval/utils/torch/{blocks.py → _blocks.py} +0 -0
- {dataeval-0.76.1.dist-info → dataeval-0.82.0.dist-info}/LICENSE.txt +0 -0
- {dataeval-0.76.1.dist-info → dataeval-0.82.0.dist-info}/WHEEL +0 -0
dataeval/metadata/_distance.py
ADDED
@@ -0,0 +1,167 @@
+from __future__ import annotations
+
+__all__ = []
+
+import warnings
+from typing import NamedTuple, cast
+
+import numpy as np
+from scipy.stats import iqr, ks_2samp
+from scipy.stats import wasserstein_distance as emd
+
+from dataeval.metadata._utils import _compare_keys, _validate_factors_and_data
+from dataeval.outputs._base import MappingOutput
+from dataeval.typing import ArrayLike
+from dataeval.utils.data import Metadata
+
+
+class KSType(NamedTuple):
+    """Used to typehint scipy's internal hidden ks_2samp output"""
+
+    statistic: float
+    statistic_location: float
+    pvalue: float
+
+
+class MetadataKSResult(NamedTuple):
+    """
+    Attributes
+    ----------
+    statistic : float
+        The KS statistic
+    location : float
+        The value at which the KS statistic has its maximum, measured in IQR-normalized units relative
+        to the median of the reference distribution.
+    dist : float
+        The Earth Mover's Distance normalized by the interquartile range (IQR) of the reference
+    pvalue : float
+        The p-value from the KS two-sample test
+    """
+
+    statistic: float
+    location: float
+    dist: float
+    pvalue: float
+
+
+class KSOutput(MappingOutput[str, MetadataKSResult]):
+    """
+    Output class for results of ks_2samp featurewise comparisons of new metadata to reference metadata.
+
+
+    Attributes
+    ----------
+    key: str
+        Metadata feature names
+    value: :class:`MetadataKSResult`
+        Output per feature name containing the statistic, statistic location, distance, and pvalue.
+    """
+
+
+def _calculate_drift(x1: ArrayLike, x2: ArrayLike) -> float:
+    """Calculates the shift magnitude between x1 and x2 scaled by x1"""
+
+    distance = emd(x1, x2)
+
+    X = iqr(x1)
+
+    # Preferred scaling of x1
+    if X:
+        return distance / X
+
+    # Return if single-valued, else scale
+    xmin, xmax = np.min(x1), np.max(x1)
+    return distance if xmin == xmax else distance / (xmax - xmin)
+
+
+def metadata_distance(metadata1: Metadata, metadata2: Metadata) -> KSOutput:
+    """
+    Measures the feature-wise distance between two continuous metadata distributions and
+    computes a p-value to evaluate its significance.
+
+    Uses the Earth Mover's Distance and the Kolmogorov-Smirnov two-sample test, featurewise.
+
+    Parameters
+    ----------
+    metadata1 : Metadata
+        Class containing continuous factor names and values to be used as reference
+    metadata2 : Metadata
+        Class containing continuous factor names and values to be compared with the reference
+
+    Returns
+    -------
+    KSOutput
+        A mapping with keys corresponding to metadata feature names, and values that are
+        :class:`MetadataKSResult` objects with the statistic, location, distance, and p-value.
+
+    See Also
+    --------
+    Earth mover's distance
+
+    Kolmogorov-Smirnov two-sample test
+
+    Note
+    ----
+    This function applies only to continuous data
+
+    Examples
+    --------
+    >>> output = metadata_distance(metadata1, metadata2)
+    >>> list(output)
+    ['time', 'altitude']
+    >>> output["time"]
+    MetadataKSResult(statistic=1.0, location=0.44354838709677413, dist=2.7, pvalue=0.0)
+    """
+
+    _compare_keys(metadata1.continuous_factor_names, metadata2.continuous_factor_names)
+    fnames = metadata1.continuous_factor_names
+
+    cont1 = np.atleast_2d(metadata1.continuous_data)  # (S, F)
+    cont2 = np.atleast_2d(metadata2.continuous_data)  # (S, F)
+
+    _validate_factors_and_data(fnames, cont1)
+    _validate_factors_and_data(fnames, cont2)
+
+    N = len(cont1)
+    M = len(cont2)
+
+    # This is a simplified version of sqrt(N*M / (N+M)) < 4
+    if (N - 16) * (M - 16) < 256:
+        warnings.warn(
+            f"Sample sizes of {N}, {M} will yield unreliable p-values from the KS test. "
+            f"Recommended 32 samples per factor or at least 16 if one set has many more.",
+            UserWarning,
+        )
+
+    # Set default for statistic, location, and magnitude to zero and pvalue to one
+    results: dict[str, MetadataKSResult] = {}
+
+    # Per factor
+    for i, fname in enumerate(fnames):
+        fdata1 = cont1[:, i]  # (S, 1)
+        fdata2 = cont2[:, i]  # (S, 1)
+
+        # Min and max over both distributions
+        xmin = min(np.min(fdata1), np.min(fdata2))
+        xmax = max(np.max(fdata1), np.max(fdata2))
+
+        # Default case
+        if xmin == xmax:
+            results[fname] = MetadataKSResult(statistic=0.0, location=0.0, dist=0.0, pvalue=1.0)
+            continue
+
+        ks_result = cast(KSType, ks_2samp(fdata1, fdata2, method="asymp"))
+
+        # Normalized location
+        loc = float((ks_result.statistic_location - xmin) / (xmax - xmin))
+
+        drift = _calculate_drift(fdata1, fdata2)
+
+        results[fname] = MetadataKSResult(
+            statistic=ks_result.statistic,
+            location=loc,
+            dist=drift,
+            pvalue=ks_result.pvalue,
+        )

+    return KSOutput(results)
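Each feature therefore gets a KS two-sample test plus an IQR-normalized Earth Mover's Distance. The per-feature arithmetic can be reproduced standalone with scipy; the following is a minimal sketch with synthetic arrays (`ref` and `test` are illustrative names, not part of the dataeval API):

```python
import numpy as np
from scipy.stats import iqr, ks_2samp, wasserstein_distance

rng = np.random.default_rng(0)
ref = rng.normal(0.0, 1.0, 64)   # reference samples for one factor
test = rng.normal(0.5, 1.0, 64)  # shifted test samples

# KS two-sample test with the asymptotic method, as in metadata_distance.
# statistic_location is available in recent scipy releases (dataeval relies on it).
ks = ks_2samp(ref, test, method="asymp")

# Earth Mover's Distance scaled by the reference IQR, as in _calculate_drift
dist = wasserstein_distance(ref, test) / iqr(ref)

# Statistic location normalized to [0, 1] over the pooled range
xmin, xmax = min(ref.min(), test.min()), max(ref.max(), test.max())
loc = (ks.statistic_location - xmin) / (xmax - xmin)

print(ks.statistic, loc, dist, ks.pvalue)
```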
dataeval/metadata/_ood.py
ADDED
@@ -0,0 +1,217 @@
+from __future__ import annotations
+
+__all__ = []
+
+import warnings
+
+import numpy as np
+from numpy.typing import NDArray
+
+from dataeval.metadata._utils import _compare_keys, _validate_factors_and_data
+from dataeval.outputs import OODOutput
+from dataeval.utils.data import Metadata
+
+
+def _combine_metadata(metadata_1: Metadata, metadata_2: Metadata) -> tuple[list[str], list[NDArray], list[NDArray]]:
+    """
+    Combines the factor names and data arrays of metadata_1 and metadata_2 when the names
+    match exactly and data has the same number of columns (factors).
+
+    Parameters
+    ----------
+    metadata_1 : Metadata
+        The set of factor names used as reference to determine the correct factor names and length of data
+    metadata_2 : Metadata
+        The compared set of factor names and data that must match metadata_1
+
+    Returns
+    -------
+    list[str]
+        The combined discrete and continuous factor names in that order.
+    list[NDArray]
+        Combined discrete and continuous data of metadata_1
+    list[NDArray]
+        Combined discrete and continuous data of metadata_2
+
+    Raises
+    ------
+    ValueError
+        If keys do not match in metadata_1 and metadata_2
+    ValueError
+        If the length of keys does not match the length of the data
+    """
+    factor_names: list[str] = []
+    m1_data: list[NDArray] = []
+    m2_data: list[NDArray] = []
+
+    # Both metadata must have the same number of factors (cols), but not necessarily samples (rows)
+    if metadata_1.total_num_factors != metadata_2.total_num_factors:
+        raise ValueError(
+            f"Number of factors differs between metadata_1 ({metadata_1.total_num_factors}) "
+            f"and metadata_2 ({metadata_2.total_num_factors})"
+        )
+
+    # Validate and attach discrete data
+    if metadata_1.discrete_factor_names:
+        _compare_keys(metadata_1.discrete_factor_names, metadata_2.discrete_factor_names)
+        _validate_factors_and_data(metadata_1.discrete_factor_names, metadata_1.discrete_data)
+
+        factor_names.extend(metadata_1.discrete_factor_names)
+        m1_data.append(metadata_1.discrete_data)
+        m2_data.append(metadata_2.discrete_data)
+
+    # Validate and attach continuous data
+    if metadata_1.continuous_factor_names:
+        _compare_keys(metadata_1.continuous_factor_names, metadata_2.continuous_factor_names)
+        _validate_factors_and_data(metadata_1.continuous_factor_names, metadata_1.continuous_data)
+
+        factor_names.extend(metadata_1.continuous_factor_names)
+        m1_data.append(metadata_1.continuous_data)
+        m2_data.append(metadata_2.continuous_data)
+
+    # The caller stacks the discrete and continuous lists into single arrays
+    return factor_names, m1_data, m2_data
+
+
+def _calc_median_deviations(reference: NDArray, test: NDArray) -> NDArray:
+    """
+    Calculates deviations of the test data from the median of the reference data
+
+    Parameters
+    ----------
+    reference : NDArray
+        Reference values of shape (samples, factors)
+    test : NDArray
+        Incoming values where each sample's factors will be compared to the median of
+        the reference set's corresponding factors
+
+    Returns
+    -------
+    NDArray
+        Scaled positive and negative deviations of the test data from the reference.
+
+    Note
+    ----
+    All return values are in the range [0, pos_inf]
+    """
+
+    # Take median over samples (rows)
+    ref_median = np.median(reference, axis=0)  # (F, )
+
+    # Shift reference and test distributions by the reference median
+    ref_dev = reference - ref_median  # (S, F) - F
+    test_dev = test - ref_median  # (S_t, F) - F
+
+    # Separate positive and negative distributions
+    # Fills with nans to keep shape in both 1-D and N-D matrices
+    pdev = np.where(ref_dev > 0, ref_dev, np.nan)  # (S, F)
+    ndev = np.where(ref_dev < 0, ref_dev, np.nan)  # (S, F)
+
+    # Calculate middle of positive and negative distributions per feature
+    pscale = np.nanmedian(pdev, axis=0)  # (F, )
+    nscale = np.abs(np.nanmedian(ndev, axis=0))  # (F, )
+
+    # Replace 0's for division. Negatives should not happen
+    pscale = np.where(pscale > 0, pscale, 1.0)  # (F, )
+    nscale = np.where(nscale > 0, nscale, 1.0)  # (F, )
+
+    # Scales positive values by positive scale and negative values by negative
+    return np.abs(np.where(test_dev >= 0, test_dev / pscale, test_dev / nscale))  # (S_t, F)
+
+
+def most_deviated_factors(
+    metadata_1: Metadata,
+    metadata_2: Metadata,
+    ood: OODOutput,
+) -> list[tuple[str, float]]:
+    """
+    Determines the greatest deviation in metadata features per out-of-distribution sample in metadata_2.
+
+    Parameters
+    ----------
+    metadata_1 : Metadata
+        A reference set of Metadata containing factor names and samples
+        with discrete and/or continuous values per factor
+    metadata_2 : Metadata
+        The set of Metadata that is tested against the reference metadata.
+        This set must have the same number of features but does not require the same number of samples.
+    ood : OODOutput
+        A class output by DataEval's OOD functions that contains which examples are OOD.
+
+    Returns
+    -------
+    list[tuple[str, float]]
+        The factor name and deviation of the highest metadata deviation for each OOD example in metadata_2.
+
+    Notes
+    -----
+    1. Both :class:`.Metadata` inputs must have discrete and continuous data in the shape (samples, factors)
+       and have equivalent factor names and lengths
+    2. The flag at index `i` in :attr:`.OODOutput.is_ood` must correspond
+       directly to sample `i` of `metadata_2` being out-of-distribution from `metadata_1`
+
+    Examples
+    --------
+
+    >>> from dataeval.detectors.ood import OODOutput
+
+    All samples are out-of-distribution
+
+    >>> is_ood = OODOutput(np.array([True, True, True]), np.array([]), np.array([]))
+    >>> most_deviated_factors(metadata1, metadata2, is_ood)
+    [('time', 2.0), ('time', 2.592), ('time', 3.51)]
+
+    If there are no out-of-distribution samples, an empty list is returned
+
+    >>> is_ood = OODOutput(np.array([False, False, False]), np.array([]), np.array([]))
+    >>> most_deviated_factors(metadata1, metadata2, is_ood)
+    []
+    """
+
+    ood_mask: NDArray[np.bool] = ood.is_ood
+
+    # No metadata correlated with out of distribution data
+    if not any(ood_mask):
+        return []
+
+    # Combines reference and test factor names and data if they exist and match exactly
+    # shape -> (samples, factors)
+    factor_names, md_1, md_2 = _combine_metadata(
+        metadata_1=metadata_1,
+        metadata_2=metadata_2,
+    )
+
+    # Stack discrete and continuous factors as separate factors. Must have equal sample counts
+    metadata_ref = np.hstack(md_1) if md_1 else np.array([])
+    metadata_tst = np.hstack(md_2) if md_2 else np.array([])
+
+    if len(metadata_ref) < 3:
+        warnings.warn(
+            f"At least 3 reference metadata samples are needed, got {len(metadata_ref)}",
+            UserWarning,
+        )
+        return []
+
+    if len(metadata_tst) != len(ood_mask):
+        raise ValueError(
+            f"ood and test metadata must have the same length, "
+            f"got {len(ood_mask)} and {len(metadata_tst)} respectively."
+        )
+
+    # Calculates deviations of all samples in m2_data
+    # from the median values of the corresponding index in m1_data
+    # Guaranteed for inputs to not be empty
+    deviations = _calc_median_deviations(metadata_ref, metadata_tst)
+
+    # Get most impactful factor deviation of each sample for ood samples only
+    deviation = np.max(deviations, axis=1)[ood_mask].astype(np.float16)
+
+    # Get indices of most impactful factors for ood samples only
+    max_factors = np.argmax(deviations, axis=1)[ood_mask]
+
+    # Get names of most impactful factors TODO: Find better way than np.dtype(<U4)
+    most_ood_factors = np.array(factor_names)[max_factors].tolist()
+
+    # List of tuples matching the factor name with its deviation
+
+    return [(factor, dev) for factor, dev in zip(most_ood_factors, deviation)]
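The asymmetric scaling in `_calc_median_deviations` is easiest to see on a toy example. Below is a standalone sketch that mirrors the arithmetic above with made-up numbers (the arrays are illustrative, not dataeval data):

```python
import numpy as np

reference = np.array([[0.0], [1.0], [2.0], [3.0], [10.0]])  # one factor, five samples
test = np.array([[2.5], [40.0]])                            # two incoming samples

ref_median = np.median(reference, axis=0)  # 2.0
ref_dev = reference - ref_median           # [-2, -1, 0, 1, 8]

# Median of positive deviations (4.5) and of |negative| deviations (1.5)
pscale = np.nanmedian(np.where(ref_dev > 0, ref_dev, np.nan), axis=0)
nscale = np.abs(np.nanmedian(np.where(ref_dev < 0, ref_dev, np.nan), axis=0))

test_dev = test - ref_median
scaled = np.abs(np.where(test_dev >= 0, test_dev / pscale, test_dev / nscale))
print(scaled)  # ~[[0.111], [8.444]] -- 40.0 deviates far more than 2.5
```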
dataeval/metadata/_utils.py
ADDED
@@ -0,0 +1,44 @@
+__all__ = []
+
+from numpy.typing import NDArray
+
+
+def _compare_keys(keys1: list[str], keys2: list[str]) -> None:
+    """
+    Raises an error when two lists are not equivalent, including ordering
+
+    Parameters
+    ----------
+    keys1 : list of strings
+        List of strings to compare
+    keys2 : list of strings
+        List of strings to compare
+
+    Raises
+    ------
+    ValueError
+        If lists do not have the same values, value counts, or ordering
+    """
+
+    if keys1 != keys2:
+        raise ValueError(f"Metadata keys must be identical, got {keys1} and {keys2}")
+
+
+def _validate_factors_and_data(factors: list[str], data: NDArray) -> None:
+    """
+    Raises an error when the number of factors and the number of data columns do not match
+
+    Parameters
+    ----------
+    factors : list of strings
+        List of factor names of size N
+    data : NDArray
+        Array of values with shape (M, N)
+
+    Raises
+    ------
+    ValueError
+        If the length of factors does not equal the length of the transposed data
+    """
+    if len(factors) != len(data.T):
+        raise ValueError(f"Factors and data have mismatched lengths. Got {len(factors)} and {len(data.T)}")
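Both helpers are private, but their contract is simple to exercise in isolation; a minimal sketch (the example values are illustrative):

```python
import numpy as np

from dataeval.metadata._utils import _compare_keys, _validate_factors_and_data

factors = ["time", "altitude"]
data = np.zeros((10, 2))  # 10 samples x 2 factors

_compare_keys(factors, ["time", "altitude"])  # passes: same values, same order
_validate_factors_and_data(factors, data)     # passes: 2 names, 2 columns

_compare_keys(factors, ["altitude", "time"])  # raises ValueError: ordering differs
```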
dataeval/metrics/__init__.py
CHANGED
@@ -7,6 +7,7 @@ __all__ = [
     "BalanceOutput",
     "CoverageOutput",
     "DiversityOutput",
+    "LabelParityOutput",
     "ParityOutput",
     "balance",
     "coverage",
@@ -15,7 +16,8 @@ __all__ = [
     "parity",
 ]
 
-from dataeval.metrics.bias.…
-from dataeval.metrics.bias.…
-from dataeval.metrics.bias.…
-from dataeval.metrics.bias.…
+from dataeval.metrics.bias._balance import balance
+from dataeval.metrics.bias._coverage import coverage
+from dataeval.metrics.bias._diversity import diversity
+from dataeval.metrics.bias._parity import label_parity, parity
+from dataeval.outputs._bias import BalanceOutput, CoverageOutput, DiversityOutput, LabelParityOutput, ParityOutput
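With these re-exports, the public import surface stays at `dataeval.metrics` even though the implementation modules are now underscore-private. A quick sketch of the resulting usage (assumes `metadata` is a prepared dataeval `Metadata` instance):

```python
from dataeval.metrics import BalanceOutput, balance

# `metadata` is assumed to be a prepared dataeval Metadata instance
result: BalanceOutput = balance(metadata)
```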
dataeval/metrics/bias/{balance.py → _balance.py}
CHANGED
@@ -2,99 +2,16 @@ from __future__ import annotations
 
 __all__ = []
 
-import contextlib
 import warnings
-from dataclasses import dataclass
-from typing import Any
 
 import numpy as np
 import scipy as sp
-from numpy.typing import NDArray
 from sklearn.feature_selection import mutual_info_classif, mutual_info_regression
 
-from dataeval.…
-from dataeval.…
-from dataeval.utils.…
-
-with contextlib.suppress(ImportError):
-    from matplotlib.figure import Figure
-
-
-@dataclass(frozen=True)
-class BalanceOutput(Output):
-    """
-    Output class for :func:`balance` :term:`bias<Bias>` metric.
-
-    Attributes
-    ----------
-    balance : NDArray[np.float64]
-        Estimate of mutual information between metadata factors and class label
-    factors : NDArray[np.float64]
-        Estimate of inter/intra-factor mutual information
-    classwise : NDArray[np.float64]
-        Estimate of mutual information between metadata factors and individual class labels
-    factor_names : list[str]
-        Names of each metadata factor
-    class_list : NDArray
-        Array of the class labels present in the dataset
-    """
-
-    balance: NDArray[np.float64]
-    factors: NDArray[np.float64]
-    classwise: NDArray[np.float64]
-    factor_names: list[str]
-    class_list: NDArray[Any]
-
-    def plot(
-        self,
-        row_labels: list[Any] | NDArray[Any] | None = None,
-        col_labels: list[Any] | NDArray[Any] | None = None,
-        plot_classwise: bool = False,
-    ) -> Figure:
-        """
-        Plot a heatmap of balance information
-
-        Parameters
-        ----------
-        row_labels : ArrayLike or None, default None
-            List/Array containing the labels for rows in the histogram
-        col_labels : ArrayLike or None, default None
-            List/Array containing the labels for columns in the histogram
-        plot_classwise : bool, default False
-            Whether to plot per-class balance instead of global balance
-        """
-        if plot_classwise:
-            if row_labels is None:
-                row_labels = self.class_list
-            if col_labels is None:
-                col_labels = self.factor_names
-
-            fig = heatmap(
-                self.classwise,
-                row_labels,
-                col_labels,
-                xlabel="Factors",
-                ylabel="Class",
-                cbarlabel="Normalized Mutual Information",
-            )
-        else:
-            # Combine balance and factors results
-            data = np.concatenate([self.balance[np.newaxis, 1:], self.factors], axis=0)
-            # Create a mask for the upper triangle of the symmetrical array, ignoring the diagonal
-            mask = np.triu(data + 1, k=0) < 1
-            # Finalize the data for the plot, last row is last factor x last factor so it gets dropped
-            heat_data = np.where(mask, np.nan, data)[:-1]
-            # Creating label array for heat map axes
-            heat_labels = self.factor_names
-
-            if row_labels is None:
-                row_labels = heat_labels[:-1]
-            if col_labels is None:
-                col_labels = heat_labels[1:]
-
-            fig = heatmap(heat_data, row_labels, col_labels, cbarlabel="Normalized Mutual Information")
-
-        return fig
+from dataeval.outputs import BalanceOutput
+from dataeval.outputs._base import set_metadata
+from dataeval.utils._bin import get_counts
+from dataeval.utils.data import Metadata
 
 
 def _validate_num_neighbors(num_neighbors: int) -> int:
@@ -128,7 +45,7 @@ def balance(
     Parameters
     ----------
     metadata : Metadata
-        Preprocessed metadata
+        Preprocessed metadata
     num_neighbors : int, default 5
         Number of points to consider as neighbors
 
@@ -150,25 +67,22 @@ def balance(
 
     >>> bal = balance(metadata)
     >>> bal.balance
-    array([0.…
-           0. ])
+    array([1.   , 0.249, 0.03 , 0.134, 0.   , 0.   ])
 
     Return intra/interfactor balance (mutual information)
 
     >>> bal.factors
-    array([[…
-           [0.…
-           [0.…
-           [0.…
-           [0.…
+    array([[1.   , 0.314, 0.269, 0.852, 0.367],
+           [0.314, 1.   , 0.097, 0.158, 1.98 ],
+           [0.269, 0.097, 1.   , 0.037, 0.015],
+           [0.852, 0.158, 0.037, 0.475, 0.255],
+           [0.367, 1.98 , 0.015, 0.255, 1.063]])
 
     Return classwise balance (mutual information) of factors with individual class_labels
 
     >>> bal.classwise
-    array([[0.…
-           …
-           [0.9999982 , 0.2494567 , 0.02994455, 0.13363788, 0.        ,
-            0.        ]])
+    array([[1.   , 0.249, 0.03 , 0.134, 0.   , 0.   ],
+           [1.   , 0.249, 0.03 , 0.134, 0.   , 0.   ]])
 
 
     See Also
@@ -184,7 +98,7 @@ def balance(
     mi = np.full((num_factors, num_factors), np.nan, dtype=np.float32)
     data = np.hstack((metadata.class_labels[:, np.newaxis], metadata.discrete_data))
     discretized_data = data
-    if metadata.continuous_data…
+    if len(metadata.continuous_data):
         data = np.hstack((data, metadata.continuous_data))
         discrete_idx = [metadata.discrete_factor_names.index(name) for name in metadata.continuous_factor_names]
         discretized_data = np.hstack((discretized_data, metadata.discrete_data[:, discrete_idx]))
@@ -218,7 +132,7 @@ def balance(
     factors = nmi[1:, 1:]
 
     # assume class is a factor
-    num_classes = metadata.class_names…
+    num_classes = len(metadata.class_names)
    classwise_mi = np.full((num_classes, num_factors), np.nan, dtype=np.float32)
 
     # classwise targets
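As the hunks above show, `balance` builds a factor-by-factor mutual-information matrix (with the class label treated as the first factor) from sklearn's estimators and then normalizes it. A standalone sketch of the underlying sklearn call on synthetic data follows; it illustrates the flavor of the computation, not the package's exact entropy normalization:

```python
import numpy as np
from sklearn.feature_selection import mutual_info_classif

rng = np.random.default_rng(0)
labels = rng.integers(0, 2, 100)           # synthetic class labels
factor = labels ^ (rng.random(100) < 0.1)  # a factor strongly correlated with the label
data = np.stack([labels, factor], axis=1)  # class label prepended as a factor

# MI of each column against the class label, treating all features as discrete
mi = mutual_info_classif(data, labels, discrete_features=True, random_state=0)
print(mi)  # first entry (label vs itself) equals the label entropy; second is lower
```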