dataeval 0.86.1__py3-none-any.whl → 0.86.3__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as published to a supported public registry. It is provided for informational purposes only.
dataeval/metadata/_ood.py CHANGED
@@ -15,95 +15,6 @@ from dataeval.outputs import MostDeviatedFactorsOutput, OODOutput, OODPredictorO
 from dataeval.outputs._base import set_metadata


-def _combine_discrete_continuous(metadata: Metadata) -> tuple[list[str], NDArray[np.float64]]:
-    """Combines the discrete and continuous data of a :class:`Metadata` object
-
-    Returns
-    -------
-    Tuple[list[str], NDArray]
-        The combined list of factors names and the combined discrete and continuous data
-
-    Note
-    ----
-    Discrete and continuous data must have the same number of samples
-    """
-    names = []
-    data = []
-
-    if metadata.discrete_factor_names and metadata.discrete_data.size != 0:
-        names.extend(metadata.discrete_factor_names)
-        data.append(metadata.discrete_data)
-
-    if metadata.continuous_factor_names and metadata.continuous_data.size != 0:
-        names.extend(metadata.continuous_factor_names)
-        data.append(metadata.continuous_data)
-
-    return names, np.hstack(data, dtype=np.float64) if data else np.array([], dtype=np.float64)
-
-
-def _combine_metadata(
-    metadata_1: Metadata, metadata_2: Metadata
-) -> tuple[list[str], list[NDArray[np.float64 | np.int64]], list[NDArray[np.int64 | np.float64]]]:
-    """
-    Combines the factor names and data arrays of metadata_1 and metadata_2 when the names
-    match exactly and data has the same number of columns (factors).
-
-    Parameters
-    ----------
-    metadata_1 : Metadata
-        The set of factor names used as reference to determine the correct factor names and length of data
-    metadata_2 : Metadata
-        The compared set of factor names and data that must match metadata_1
-
-    Returns
-    -------
-    list[str]
-        The combined discrete and continuous factor names in that order.
-    list[NDArray]
-        Combined discrete and continuous data of metadata_1
-    list[NDArray]
-        Combined discrete and continuous data of metadata_2
-
-    Raises
-    ------
-    ValueError
-        If keys do not match in metadata_1 and metadata_2
-    ValueError
-        If the length of keys do not match the length of the data
-    """
-    factor_names: list[str] = []
-    m1_data: list[NDArray[np.int64 | np.float64]] = []
-    m2_data: list[NDArray[np.int64 | np.float64]] = []
-
-    # Both metadata must have the same number of factors (cols), but not necessarily samples (row)
-    if metadata_1.total_num_factors != metadata_2.total_num_factors:
-        raise ValueError(
-            f"Number of factors differs between metadata_1 ({metadata_1.total_num_factors}) "
-            f"and metadata_2 ({metadata_2.total_num_factors})"
-        )
-
-    # Validate and attach discrete data
-    if metadata_1.discrete_factor_names:
-        _compare_keys(metadata_1.discrete_factor_names, metadata_2.discrete_factor_names)
-        _validate_factors_and_data(metadata_1.discrete_factor_names, metadata_1.discrete_data)
-
-        factor_names.extend(metadata_1.discrete_factor_names)
-        m1_data.append(metadata_1.discrete_data)
-        m2_data.append(metadata_2.discrete_data)
-
-    # Validate and attach continuous data
-    if metadata_1.continuous_factor_names:
-        _compare_keys(metadata_1.continuous_factor_names, metadata_2.continuous_factor_names)
-        _validate_factors_and_data(metadata_1.continuous_factor_names, metadata_1.continuous_data)
-
-        factor_names.extend(metadata_1.continuous_factor_names)
-        m1_data.append(metadata_1.continuous_data)
-        m2_data.append(metadata_2.continuous_data)
-
-    # Turns list of discrete and continuous into one array
-    return factor_names, m1_data, m2_data
-
-
 def _calc_median_deviations(reference: NDArray, test: NDArray) -> NDArray:
     """
     Calculates deviations of the test data from the median of the reference data
@@ -207,16 +118,13 @@ def find_most_deviated_factors(
     if not any(ood_mask):
         return MostDeviatedFactorsOutput([])

-    # Combines reference and test factor names and data if exists and match exactly
-    # shape -> (samples, factors)
-    factor_names, md_1, md_2 = _combine_metadata(
-        metadata_1=metadata_ref,
-        metadata_2=metadata_tst,
-    )
+    factor_names = metadata_ref.factor_names
+    ref_data = metadata_ref.factor_data
+    tst_data = metadata_tst.factor_data

-    # Stack discrete and continuous factors as separate factors. Must have equal sample counts
-    ref_data = np.hstack(md_1) if md_1 else np.array([])  # (S, Fd + Fc)
-    tst_data = np.hstack(md_2) if md_2 else np.array([])  # (S, Fd + Fc)
+    _compare_keys(factor_names, metadata_tst.factor_names)
+    _validate_factors_and_data(factor_names, ref_data)
+    _validate_factors_and_data(factor_names, tst_data)

     if len(ref_data) < 3:
         warnings.warn(
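The removed _combine_discrete_continuous and _combine_metadata helpers are superseded by the consolidated Metadata.factor_names and Metadata.factor_data accessors used above. A minimal numpy sketch of the behavior those helpers provided (the factor names and arrays below are made up for illustration): discrete and continuous columns are stacked side by side into one (samples, factors) array.

    import numpy as np

    # Toy stand-ins for the former metadata.discrete_data and metadata.continuous_data
    discrete = np.array([[0, 1], [1, 0], [2, 1]], dtype=np.float64)  # (3 samples, 2 discrete factors)
    continuous = np.array([[0.5], [0.7], [0.2]], dtype=np.float64)   # (3 samples, 1 continuous factor)

    names = ["weather", "vehicle", "speed"]
    data = np.hstack((discrete, continuous))  # single (samples, factors) array, like factor_data
    assert data.shape == (len(discrete), len(names))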
@@ -256,6 +164,7 @@ which is what many library functions return, multiply it by _NATS2BITS to get it
 """


+@set_metadata
 def find_ood_predictors(
     metadata: Metadata,
     ood: OODOutput,
@@ -305,8 +214,8 @@ def find_ood_predictors(

     ood_mask: NDArray[np.bool_] = ood.is_ood

-    discrete_features_count = len(metadata.discrete_factor_names)
-    factors, data = _combine_discrete_continuous(metadata)  # (F, ), (S, F) => F = Fd + Fc
+    factors = metadata.factor_names
+    data = metadata.factor_data

     # No metadata correlated with out of distribution data, return 0.0 for all factors
     if not any(ood_mask):
@@ -320,14 +229,13 @@ def find_ood_predictors(
     # Calculate mean, std of each factor over all samples
     scaled_data = (data - np.mean(data, axis=0)) / np.std(data, axis=0, ddof=1)  # (S, F)

-    discrete_features = np.zeros_like(factors, dtype=np.bool_)
-    discrete_features[:discrete_features_count] = True
+    discrete_features = [info.factor_type != "continuous" for info in metadata.factor_info.values()]

     mutual_info_values = (
         mutual_info_classif(
             X=scaled_data,
             y=ood_mask,
-            discrete_features=discrete_features,  # type: ignore -> sklearn issue - NDArray[bool] not of accepted type Union[ArrayLike, 'auto']
+            discrete_features=discrete_features,  # type: ignore - sklearn function not typed
             random_state=get_seed(),
         )
         * _NATS2BITS
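For context, the discrete_features argument of sklearn's mutual_info_classif accepts a per-column boolean mask, which is exactly what the factor_type check above now builds. A standalone sketch with made-up data (X, y, and mask are illustrative, not taken from the package):

    import numpy as np
    from sklearn.feature_selection import mutual_info_classif

    rng = np.random.default_rng(0)
    X = np.column_stack([rng.integers(0, 3, 100), rng.normal(size=100)])  # one discrete, one continuous column
    y = (X[:, 0] > 0).astype(int)                                         # binary "is OOD" style target

    mask = [True, False]  # per-column flags, analogous to factor_type != "continuous"
    mi = mutual_info_classif(X, y, discrete_features=mask, random_state=0)
    print(mi)  # mutual information in nats; multiply by a nats-to-bits factor to report bits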
@@ -1,9 +1,11 @@
 __all__ = []

+from typing import Sequence
+
 from numpy.typing import NDArray


-def _compare_keys(keys1: list[str], keys2: list[str]) -> None:
+def _compare_keys(keys1: Sequence[str], keys2: Sequence[str]) -> None:
     """
     Raises error when two lists are not equivalent including ordering

@@ -24,7 +26,7 @@ def _compare_keys(keys1: list[str], keys2: list[str]) -> None:
         raise ValueError(f"Metadata keys must be identical, got {keys1} and {keys2}")


-def _validate_factors_and_data(factors: list[str], data: NDArray) -> None:
+def _validate_factors_and_data(factors: Sequence[str], data: NDArray) -> None:
     """
     Raises error when the number of factors and number of rows do not match

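Widening the annotations from list[str] to Sequence[str] lets callers pass factor names as either a list or a tuple. A hypothetical sketch of the idea (check_same_keys is a stand-in and not the package's implementation):

    from typing import Sequence

    def check_same_keys(keys1: Sequence[str], keys2: Sequence[str]) -> None:
        # Normalize so a tuple and a list with equal contents compare as equal
        if list(keys1) != list(keys2):
            raise ValueError(f"Metadata keys must be identical, got {keys1} and {keys2}")

    check_same_keys(("age", "income"), ["age", "income"])  # accepted under the wider annotation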
@@ -68,22 +68,20 @@ def balance(

     >>> bal = balance(metadata)
     >>> bal.balance
-    array([1. , 0.249, 0.03 , 0.134, 0. , 0. ])
+    array([1. , 0.134, 0. , 0. ])

     Return intra/interfactor balance (mutual information)

     >>> bal.factors
-    array([[1. , 0.314, 0.269, 0.852, 0.367],
-           [0.314, 1. , 0.097, 0.158, 1.98 ],
-           [0.269, 0.097, 1. , 0.037, 0.015],
-           [0.852, 0.158, 0.037, 0.475, 0.255],
-           [0.367, 1.98 , 0.015, 0.255, 1.063]])
+    array([[1. , 0.017, 0.015],
+           [0.017, 0.445, 0.245],
+           [0.015, 0.245, 1.063]])

     Return classwise balance (mutual information) of factors with individual class_labels

     >>> bal.classwise
-    array([[1. , 0.249, 0.03 , 0.134, 0. , 0. ],
-           [1. , 0.249, 0.03 , 0.134, 0. , 0. ]])
+    array([[1. , 0.134, 0. , 0. ],
+           [1. , 0.134, 0. , 0. ]])


     See Also
@@ -92,41 +90,39 @@ def balance(
     sklearn.feature_selection.mutual_info_regression
     sklearn.metrics.mutual_info_score
     """
-    if not metadata.discrete_factor_names and not metadata.continuous_factor_names:
+    if not metadata.factor_names:
         raise ValueError("No factors found in provided metadata.")

     num_neighbors = _validate_num_neighbors(num_neighbors)

-    num_factors = metadata.total_num_factors
-    is_discrete = [True] * (len(metadata.discrete_factor_names) + 1) + [False] * len(metadata.continuous_factor_names)
+    data = metadata.discretized_data
+    factor_types = {"class_label": "categorical"} | {k: v.factor_type for k, v in metadata.factor_info.items()}
+    is_discrete = [factor_type != "continuous" for factor_type in factor_types.values()]
+    num_factors = len(factor_types)
+
     mi = np.full((num_factors, num_factors), np.nan, dtype=np.float32)
-    data = np.hstack((metadata.class_labels[:, np.newaxis], metadata.discrete_data))
-    discretized_data = data
-    if len(metadata.continuous_data):
-        data = np.hstack((data, metadata.continuous_data))
-        discrete_idx = [metadata.discrete_factor_names.index(name) for name in metadata.continuous_factor_names]
-        discretized_data = np.hstack((discretized_data, metadata.discrete_data[:, discrete_idx]))
-
-    for idx in range(num_factors):
-        if idx >= len(metadata.discrete_factor_names) + 1:
-            mi[idx, :] = mutual_info_regression(
+    data = np.hstack((metadata.class_labels[:, np.newaxis], data))
+
+    for idx, factor_type in enumerate(factor_types.values()):
+        if factor_type != "continuous":
+            mi[idx, :] = mutual_info_classif(
                 data,
                 data[:, idx],
-                discrete_features=is_discrete,  # type: ignore
+                discrete_features=is_discrete,  # type: ignore - sklearn function not typed
                 n_neighbors=num_neighbors,
                 random_state=get_seed(),
             )
         else:
-            mi[idx, :] = mutual_info_classif(
+            mi[idx, :] = mutual_info_regression(
                 data,
                 data[:, idx],
-                discrete_features=is_discrete,  # type: ignore
+                discrete_features=is_discrete,  # type: ignore - sklearn function not typed
                 n_neighbors=num_neighbors,
                 random_state=get_seed(),
             )

     # Normalization via entropy
-    bin_cnts = get_counts(discretized_data)
+    bin_cnts = get_counts(data)
     ent_factor = sp.stats.entropy(bin_cnts, axis=0)
     norm_factor = 0.5 * np.add.outer(ent_factor, ent_factor) + EPSILON

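The normalization step above divides each mutual-information entry by the average entropy of the two factors involved. A small self-contained sketch with made-up counts and MI values (not package data):

    import numpy as np
    from scipy.stats import entropy

    bin_cnts = np.array([[10, 5, 7], [3, 9, 2], [6, 6, 6]])  # bin counts for 3 factors (columns)
    ent_factor = entropy(bin_cnts, axis=0)                    # per-factor entropy H_i
    norm_factor = 0.5 * np.add.outer(ent_factor, ent_factor) + 1e-10

    mi = np.array([[1.0, 0.2, 0.1], [0.2, 1.0, 0.3], [0.1, 0.3, 1.0]])  # assumed MI matrix
    balance_matrix = mi / norm_factor  # divide MI by average entropy so entries share a comparable scale
    print(balance_matrix.round(3))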
@@ -149,7 +145,7 @@ def balance(
         classwise_mi[idx, :] = mutual_info_classif(
             data,
             tgt_bin[:, idx],
-            discrete_features=is_discrete,  # type: ignore
+            discrete_features=is_discrete,  # type: ignore - sklearn function not typed
             n_neighbors=num_neighbors,
             random_state=get_seed(),
         )
@@ -161,12 +157,6 @@ def balance(
     classwise = classwise_mi / norm_factor

     # Grabbing factor names for plotting function
-    factor_names = ["class"]
-    for name in metadata.discrete_factor_names:
-        if name in metadata.continuous_factor_names:
-            name = name + "-discrete"
-        factor_names.append(name)
-    for name in metadata.continuous_factor_names:
-        factor_names.append(name + "-continuous")
+    factor_names = ["class_label"] + list(metadata.factor_names)

     return BalanceOutput(balance, factors, classwise, factor_names, metadata.class_names)
@@ -138,43 +138,45 @@ def diversity(

     >>> div_simp = diversity(metadata, method="simpson")
     >>> div_simp.diversity_index
-    array([0.6 , 0.809, 1. , 0.8 ])
+    array([0.6 , 0.8 , 0.809, 1. ])

     >>> div_simp.classwise
-    array([[0.5 , 0.8 , 0.8 ],
-           [0.63 , 0.976, 0.528]])
+    array([[0.8 , 0.5 , 0.8 ],
+           [0.528, 0.63 , 0.976]])

     Compute Shannon diversity index of metadata and class labels

     >>> div_shan = diversity(metadata, method="shannon")
     >>> div_shan.diversity_index
-    array([0.811, 0.943, 1. , 0.918])
+    array([0.811, 0.918, 0.943, 1. ])

     >>> div_shan.classwise
-    array([[0.683, 0.918, 0.918],
-           [0.814, 0.991, 0.764]])
+    array([[0.918, 0.683, 0.918],
+           [0.764, 0.814, 0.991]])

     See Also
     --------
     scipy.stats.entropy
     """
-    if not metadata.discrete_factor_names and not metadata.continuous_factor_names:
+    if not metadata.factor_names:
         raise ValueError("No factors found in provided metadata.")

     diversity_fn = get_method(_DIVERSITY_FN_MAP, method)
-    discretized_data = np.hstack((metadata.class_labels[:, np.newaxis], metadata.discrete_data))
-    cnts = get_counts(discretized_data)
+    discretized_data = metadata.discretized_data
+    factor_names = metadata.factor_names
+    class_lbl = metadata.class_labels
+
+    class_labels_with_discretized_data = np.hstack((class_lbl[:, np.newaxis], discretized_data))
+    cnts = get_counts(class_labels_with_discretized_data)
     num_bins = np.bincount(np.nonzero(cnts)[1])
     diversity_index = diversity_fn(cnts, num_bins)

-    class_lbl = metadata.class_labels
-
     u_classes = np.unique(class_lbl)
-    num_factors = len(metadata.discrete_factor_names)
+    num_factors = len(factor_names)
     classwise_div = np.full((len(u_classes), num_factors), np.nan)
     for idx, cls in enumerate(u_classes):
         subset_mask = class_lbl == cls
-        cls_cnts = get_counts(metadata.discrete_data[subset_mask], min_num_bins=cnts.shape[0])
+        cls_cnts = get_counts(discretized_data[subset_mask], min_num_bins=cnts.shape[0])
         classwise_div[idx, :] = diversity_fn(cls_cnts, num_bins[1:])

-    return DiversityOutput(diversity_index, classwise_div, metadata.discrete_factor_names, metadata.class_names)
+    return DiversityOutput(diversity_index, classwise_div, factor_names, metadata.class_names)
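Both diversity methods are computed from per-factor value counts, which is why the refactor feeds metadata.discretized_data into get_counts. A generic sketch of the two index formulas on made-up counts (illustrative only, not the package's exact implementation):

    import numpy as np

    counts = np.array([5, 3, 2], dtype=float)  # occurrences of each bin for a single factor
    p = counts / counts.sum()
    num_bins = len(counts)

    shannon = -np.sum(p * np.log(p)) / np.log(num_bins)  # normalized Shannon entropy
    simpson = (1 / np.sum(p**2) - 1) / (num_bins - 1)    # normalized inverse Simpson index
    print(round(shannon, 3), round(simpson, 3))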
@@ -242,13 +242,13 @@ def parity(metadata: Metadata) -> ParityOutput:
     >>> parity(metadata)
     ParityOutput(score=array([7.357, 5.467, 0.515]), p_value=array([0.289, 0.243, 0.773]), factor_names=['age', 'income', 'gender'], insufficient_data={'age': {3: {'artist': 4}, 4: {'artist': 4, 'teacher': 3}}, 'income': {1: {'artist': 3}}})
     """  # noqa: E501
-    if not metadata.discrete_factor_names and not metadata.continuous_factor_names:
+    if not metadata.factor_names:
         raise ValueError("No factors found in provided metadata.")

-    chi_scores = np.zeros(metadata.discrete_data.shape[1])
+    chi_scores = np.zeros(metadata.discretized_data.shape[1])
     p_values = np.zeros_like(chi_scores)
     insufficient_data: defaultdict[str, defaultdict[int, dict[str, int]]] = defaultdict(lambda: defaultdict(dict))
-    for i, col_data in enumerate(metadata.discrete_data.T):
+    for i, col_data in enumerate(metadata.discretized_data.T):
         # Builds a contingency matrix where entry at index (r,c) represents
         # the frequency of current_factor_name achieving value unique_factor_values[r]
         # at a data point with class c.
@@ -258,8 +258,9 @@ def parity(metadata: Metadata) -> ParityOutput:
         # Determines if any frequencies are too low
         counts = np.nonzero(contingency_matrix < 5)
         unique_factor_values = np.unique(col_data)
-        current_factor_name = metadata.discrete_factor_names[i]
-        for int_factor, int_class in zip(counts[0], counts[1]):
+        current_factor_name = metadata.factor_names[i]
+        for _factor, _class in zip(counts[0], counts[1]):
+            int_factor, int_class = int(_factor), int(_class)
             if contingency_matrix[int_factor, int_class] > 0:
                 factor_category = unique_factor_values[int_factor].item()
                 class_name = metadata.class_names[int_class]
@@ -273,11 +274,14 @@ def parity(metadata: Metadata) -> ParityOutput:
         chi_scores[i], p_values[i] = chi2_contingency(contingency_matrix)[:2]

     if insufficient_data:
-        warnings.warn("Some factors did not meet the recommended 5 occurrences for each value-label combination.")
+        warnings.warn(
+            f"Factors {list(insufficient_data)} did not meet the recommended "
+            "5 occurrences for each value-label combination."
+        )

     return ParityOutput(
         score=chi_scores,
         p_value=p_values,
-        factor_names=metadata.discrete_factor_names,
+        factor_names=metadata.factor_names,
         insufficient_data={k: dict(v) for k, v in insufficient_data.items()},
     )
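A toy sketch of the chi-square step above, with assumed counts (not package data): each factor column yields a factor-value by class-label contingency matrix, and cells with fewer than 5 observations are the ones flagged in insufficient_data.

    import numpy as np
    from scipy.stats import chi2_contingency

    # rows: unique factor values, columns: class labels (made-up frequencies)
    contingency_matrix = np.array([[12, 7], [9, 15], [4, 11]])
    chi2, p_value = chi2_contingency(contingency_matrix)[:2]

    low_cells = np.nonzero(contingency_matrix < 5)  # indices of under-populated cells
    print(chi2, p_value, list(zip(low_cells[0].tolist(), low_cells[1].tolist())))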
@@ -2,9 +2,10 @@ from __future__ import annotations

 __all__ = []

-from collections import Counter, defaultdict
 from typing import Any, Mapping, TypeVar

+import polars as pl
+
 from dataeval.data._metadata import Metadata
 from dataeval.outputs import LabelStatsOutput
 from dataeval.outputs._base import set_metadata
@@ -52,39 +53,34 @@ def labelstats(dataset: Metadata | AnnotatedDataset[Any]) -> LabelStatsOutput:
     pig: 2 - 2
     chicken: 5 - 5
     """
-    dataset = Metadata(dataset) if isinstance(dataset, AnnotatedDataset) else dataset
-
-    label_counts: Counter[int] = Counter()
-    image_counts: Counter[int] = Counter()
-    index_location = defaultdict(list[int])
-    label_per_image: list[int] = []
-
-    index2label = dict(enumerate(dataset.class_names))
-
-    for i, target in enumerate(dataset.targets):
-        group = target.labels.tolist()
+    metadata = Metadata(dataset) if isinstance(dataset, AnnotatedDataset) else dataset
+    metadata_df = metadata.dataframe

-        # Count occurrences of each label in all sublists
-        label_counts.update(group)
+    # Count occurrences of each label across all images
+    label_counts_df = metadata_df.group_by("class_label").len()
+    label_counts = label_counts_df.sort("class_label")["len"].to_list()

-        # Get the number of labels per image
-        label_per_image.append(len(group))
+    # Count unique images per label (how many images contain each label)
+    image_counts_df = metadata_df.select(["image_index", "class_label"]).unique().group_by("class_label").len()
+    image_counts = image_counts_df.sort("class_label")["len"].to_list()

-        # Create a set of unique items in the current sublist
-        unique_items: set[int] = set(group)
+    # Create index_location mapping (which images contain each label)
+    index_location: list[list[int]] = [[] for _ in range(len(metadata.class_names))]
+    for row in metadata_df.group_by("class_label").agg(pl.col("image_index")).to_dicts():
+        indices = row["image_index"]
+        index_location[row["class_label"]] = sorted(dict.fromkeys(indices)) if isinstance(indices, list) else [indices]

-        # Update image counts and index locations
-        image_counts.update(unique_items)
-        for item in unique_items:
-            index_location[item].append(i)
+    # Count labels per image
+    label_per_image_df = metadata_df.group_by("image_index").agg(pl.count().alias("label_count"))
+    label_per_image = label_per_image_df.sort("image_index")["label_count"].to_list()

     return LabelStatsOutput(
-        label_counts_per_class=_sort_to_list(label_counts),
+        label_counts_per_class=label_counts,
         label_counts_per_image=label_per_image,
-        image_counts_per_class=_sort_to_list(image_counts),
-        image_indices_per_class=_sort_to_list(index_location),
+        image_counts_per_class=image_counts,
+        image_indices_per_class=index_location,
         image_count=len(label_per_image),
-        class_count=len(label_counts),
-        label_count=sum(label_counts.values()),
-        class_names=list(index2label.values()),
+        class_count=len(metadata.class_names),
+        label_count=sum(label_counts),
+        class_names=metadata.class_names,
     )
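The Counter-based loop is replaced by polars aggregations over Metadata.dataframe. A self-contained polars sketch on a toy frame (the column names follow the ones used above; the frame itself is made up):

    import polars as pl

    df = pl.DataFrame({"image_index": [0, 0, 1, 2, 2], "class_label": [1, 0, 1, 1, 1]})

    # Occurrences of each label across all rows (detections)
    label_counts = df.group_by("class_label").len().sort("class_label")["len"].to_list()

    # Number of distinct images containing each label
    image_counts = (
        df.select(["image_index", "class_label"]).unique().group_by("class_label").len().sort("class_label")["len"].to_list()
    )
    print(label_counts, image_counts)  # [1, 4] and [1, 3]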
dataeval/outputs/_base.py CHANGED
@@ -147,7 +147,7 @@ P = ParamSpec("P")
 R = TypeVar("R", bound=GenericOutput)


-def set_metadata(fn: Callable[P, R] | None = None, *, state: list[str] | None = None) -> Callable[P, R]:
+def set_metadata(fn: Callable[P, R] | None = None, *, state: Sequence[str] | None = None) -> Callable[P, R]:
     """Decorator to stamp Output classes with runtime metadata"""

     if fn is None:
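set_metadata follows the fn=None plus keyword-only options pattern, so it works both bare and parameterized, and widening state to Sequence[str] lets callers pass a tuple or a list. A simplified, hypothetical sketch of that decorator pattern (stamp is a stand-in, not the package's implementation):

    from functools import wraps
    from typing import Callable, Sequence, TypeVar

    R = TypeVar("R")

    def stamp(fn: Callable[..., R] | None = None, *, state: Sequence[str] | None = None):
        def decorator(func: Callable[..., R]) -> Callable[..., R]:
            @wraps(func)
            def wrapper(*args, **kwargs) -> R:
                wrapper.requested_state = tuple(state or ())  # record the requested state names
                return func(*args, **kwargs)
            return wrapper
        return decorator if fn is None else decorator(fn)

    @stamp
    def evaluate() -> int:  # used bare: @stamp
        return 1

    @stamp(state=("num_samples",))
    def score() -> int:  # used parameterized: @stamp(state=...)
        return 2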