dataeval-0.72.1-py3-none-any.whl → dataeval-0.73.0-py3-none-any.whl
This diff shows the changes between two publicly released versions of the package as they appear in their public registry, and is provided for informational purposes only.
- dataeval/__init__.py +4 -4
- dataeval/detectors/__init__.py +4 -3
- dataeval/detectors/drift/__init__.py +9 -10
- dataeval/{_internal/detectors → detectors}/drift/base.py +39 -91
- dataeval/{_internal/detectors → detectors}/drift/cvm.py +4 -3
- dataeval/{_internal/detectors → detectors}/drift/ks.py +4 -3
- dataeval/{_internal/detectors → detectors}/drift/mmd.py +23 -25
- dataeval/{_internal/detectors → detectors}/drift/torch.py +13 -11
- dataeval/{_internal/detectors → detectors}/drift/uncertainty.py +7 -5
- dataeval/detectors/drift/updates.py +61 -0
- dataeval/detectors/linters/__init__.py +3 -3
- dataeval/{_internal/detectors → detectors/linters}/clusterer.py +41 -39
- dataeval/{_internal/detectors → detectors/linters}/duplicates.py +19 -9
- dataeval/{_internal/detectors → detectors/linters}/merged_stats.py +3 -1
- dataeval/{_internal/detectors → detectors/linters}/outliers.py +14 -21
- dataeval/detectors/ood/__init__.py +6 -6
- dataeval/{_internal/detectors → detectors}/ood/ae.py +20 -12
- dataeval/detectors/ood/aegmm.py +66 -0
- dataeval/{_internal/detectors → detectors}/ood/base.py +33 -21
- dataeval/{_internal/detectors → detectors}/ood/llr.py +43 -33
- dataeval/detectors/ood/metadata_ks_compare.py +99 -0
- dataeval/detectors/ood/metadata_least_likely.py +119 -0
- dataeval/detectors/ood/metadata_ood_mi.py +92 -0
- dataeval/{_internal/detectors → detectors}/ood/vae.py +23 -17
- dataeval/detectors/ood/vaegmm.py +75 -0
- dataeval/interop.py +56 -0
- dataeval/metrics/__init__.py +1 -1
- dataeval/metrics/bias/__init__.py +4 -4
- dataeval/{_internal/metrics → metrics/bias}/balance.py +75 -13
- dataeval/{_internal/metrics → metrics/bias}/coverage.py +41 -7
- dataeval/{_internal/metrics → metrics/bias}/diversity.py +75 -18
- dataeval/metrics/bias/metadata.py +358 -0
- dataeval/{_internal/metrics → metrics/bias}/parity.py +54 -44
- dataeval/metrics/estimators/__init__.py +3 -3
- dataeval/{_internal/metrics → metrics/estimators}/ber.py +25 -22
- dataeval/{_internal/metrics → metrics/estimators}/divergence.py +11 -12
- dataeval/{_internal/metrics → metrics/estimators}/uap.py +5 -3
- dataeval/metrics/stats/__init__.py +7 -7
- dataeval/{_internal/metrics → metrics}/stats/base.py +59 -35
- dataeval/{_internal/metrics → metrics}/stats/boxratiostats.py +18 -14
- dataeval/{_internal/metrics → metrics}/stats/datasetstats.py +18 -16
- dataeval/{_internal/metrics → metrics}/stats/dimensionstats.py +9 -7
- dataeval/metrics/stats/hashstats.py +156 -0
- dataeval/{_internal/metrics → metrics}/stats/labelstats.py +5 -3
- dataeval/{_internal/metrics → metrics}/stats/pixelstats.py +9 -8
- dataeval/{_internal/metrics → metrics}/stats/visualstats.py +10 -9
- dataeval/{_internal/output.py → output.py} +26 -6
- dataeval/utils/__init__.py +8 -3
- dataeval/utils/image.py +71 -0
- dataeval/utils/lazy.py +26 -0
- dataeval/utils/metadata.py +258 -0
- dataeval/utils/shared.py +151 -0
- dataeval/{_internal → utils}/split_dataset.py +98 -33
- dataeval/utils/tensorflow/__init__.py +7 -6
- dataeval/{_internal/models/tensorflow → utils/tensorflow/_internal}/gmm.py +8 -2
- dataeval/{_internal/models/tensorflow/losses.py → utils/tensorflow/_internal/loss.py} +28 -18
- dataeval/{_internal/models/tensorflow/pixelcnn.py → utils/tensorflow/_internal/models.py} +387 -97
- dataeval/{_internal/models/tensorflow → utils/tensorflow/_internal}/trainer.py +15 -6
- dataeval/{_internal/models/tensorflow → utils/tensorflow/_internal}/utils.py +84 -85
- dataeval/utils/tensorflow/loss/__init__.py +6 -2
- dataeval/utils/torch/__init__.py +7 -3
- dataeval/{_internal/models/pytorch → utils/torch}/blocks.py +19 -14
- dataeval/{_internal → utils/torch}/datasets.py +48 -42
- dataeval/utils/torch/models.py +138 -0
- dataeval/{_internal/models/pytorch/autoencoder.py → utils/torch/trainer.py} +7 -136
- dataeval/{_internal → utils/torch}/utils.py +3 -1
- dataeval/workflows/__init__.py +1 -1
- dataeval/{_internal/workflows → workflows}/sufficiency.py +39 -34
- {dataeval-0.72.1.dist-info → dataeval-0.73.0.dist-info}/METADATA +4 -3
- dataeval-0.73.0.dist-info/RECORD +73 -0
- dataeval/_internal/detectors/__init__.py +0 -0
- dataeval/_internal/detectors/drift/__init__.py +0 -0
- dataeval/_internal/detectors/ood/__init__.py +0 -0
- dataeval/_internal/detectors/ood/aegmm.py +0 -78
- dataeval/_internal/detectors/ood/vaegmm.py +0 -89
- dataeval/_internal/interop.py +0 -49
- dataeval/_internal/metrics/__init__.py +0 -0
- dataeval/_internal/metrics/stats/hashstats.py +0 -75
- dataeval/_internal/metrics/utils.py +0 -447
- dataeval/_internal/models/__init__.py +0 -0
- dataeval/_internal/models/pytorch/__init__.py +0 -0
- dataeval/_internal/models/pytorch/utils.py +0 -67
- dataeval/_internal/models/tensorflow/__init__.py +0 -0
- dataeval/_internal/models/tensorflow/autoencoder.py +0 -320
- dataeval/_internal/workflows/__init__.py +0 -0
- dataeval/detectors/drift/kernels/__init__.py +0 -10
- dataeval/detectors/drift/updates/__init__.py +0 -8
- dataeval/utils/tensorflow/models/__init__.py +0 -9
- dataeval/utils/tensorflow/recon/__init__.py +0 -3
- dataeval/utils/torch/datasets/__init__.py +0 -12
- dataeval/utils/torch/models/__init__.py +0 -11
- dataeval/utils/torch/trainer/__init__.py +0 -7
- dataeval-0.72.1.dist-info/RECORD +0 -81
- {dataeval-0.72.1.dist-info → dataeval-0.73.0.dist-info}/LICENSE.txt +0 -0
- {dataeval-0.72.1.dist-info → dataeval-0.73.0.dist-info}/WHEEL +0 -0
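The largest new module below is dataeval/utils/metadata.py, which introduces the public merge_metadata helper. For orientation, a minimal usage sketch based on the docstring in the diff that follows; the metadata keys and values here are invented for illustration:

from dataeval.utils.metadata import merge_metadata

# Hypothetical per-image metadata, each record with nested per-target entries
metadata = [
    {"time": 1, "targets": [{"area": 10.0}, {"area": 4.5}], "sensor": "A"},
    {"time": 2, "targets": [{"area": 7.2}, {"area": 3.3}], "sensor": "B"},
]

# Nested dicts are flattened, lists are expanded into per-target rows, and
# scalar values are repeated to match the expanded length
merged = merge_metadata(metadata, as_numpy=True)
# merged["time"]   -> array([1, 1, 2, 2])
# merged["area"]   -> array([10. ,  4.5,  7.2,  3.3])
# merged["sensor"] -> array(['A', 'A', 'B', 'B'], dtype='<U1')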
dataeval/utils/metadata.py
ADDED
@@ -0,0 +1,258 @@
+from __future__ import annotations
+
+__all__ = ["merge_metadata"]
+
+import warnings
+from typing import Any, Iterable, Mapping, TypeVar, overload
+
+import numpy as np
+from numpy.typing import NDArray
+
+T = TypeVar("T")
+
+
+def _try_cast(v: Any, t: type[T]) -> T | None:
+    """Casts a value to a type or returns None if unable"""
+    try:
+        return t(v)  # type: ignore
+    except (TypeError, ValueError):
+        return None
+
+
+@overload
+def _convert_type(data: list[str]) -> list[int] | list[float] | list[str]: ...
+@overload
+def _convert_type(data: str) -> int | float | str: ...
+
+
+def _convert_type(data: list[str] | str) -> list[int] | list[float] | list[str] | int | float | str:
+    """
+    Converts a value or a list of values to the simplest form possible, in preferred order of `int`,
+    `float`, or `string`.
+
+    Parameters
+    ----------
+    data : list[str] | str
+        A list of values or a single value
+
+    Returns
+    -------
+    list[int | float | str] | int | float | str
+        The same values converted to the numerical type if possible
+    """
+    if not isinstance(data, list):
+        value = _try_cast(data, float)
+        return str(data) if value is None else int(value) if value.is_integer() else value
+
+    converted = []
+    TYPE_MAP = {int: 0, float: 1, str: 2}
+    max_type = 0
+    for value in data:
+        value = _convert_type(value)
+        max_type = max(max_type, TYPE_MAP.get(type(value), 2))
+        converted.append(value)
+    for i in range(len(converted)):
+        converted[i] = list(TYPE_MAP)[max_type](converted[i])
+    return converted
+
+
+def _get_key_indices(keys: Iterable[tuple[str, ...]]) -> dict[tuple[str, ...], int]:
+    """
+    Finds indices to minimize unique tuple keys
+
+    Parameters
+    ----------
+    keys : Iterable[tuple[str, ...]]
+        Collection of unique expanded tuple keys
+
+    Returns
+    -------
+    dict[tuple[str, ...], int]
+        Mapping of tuple keys to starting index
+    """
+    indices = {k: -1 for k in keys}
+    ks = list(keys)
+    while len(ks) > 0:
+        seen: dict[tuple[str, ...], list[tuple[str, ...]]] = {}
+        for k in ks:
+            seen.setdefault(k[indices[k] :], []).append(k)
+        ks.clear()
+        for sk in seen.values():
+            if len(sk) > 1:
+                ks.extend(sk)
+                for k in sk:
+                    indices[k] -= 1
+    return indices
+
+
+def _flatten_dict_inner(
+    d: Mapping[str, Any], parent_keys: tuple[str, ...], size: int | None = None, nested: bool = False
+) -> tuple[dict[tuple[str, ...], Any], int | None]:
+    """
+    Recursive internal function for flattening a dictionary.
+
+    Parameters
+    ----------
+    d : dict[str, Any]
+        Dictionary to flatten
+    parent_keys : tuple[str, ...]
+        Parent keys to the current dictionary being flattened
+    size : int or None, default None
+        Tracking int for length of lists
+    nested : bool, default False
+        Tracking if inside a list
+
+    Returns
+    -------
+    tuple[dict[tuple[str, ...], Any], int | None]
+        - [0]: Dictionary of flattened values with the keys reformatted as a hierarchical tuple of strings
+        - [1]: Size, if any, of the current list of values
+    """
+    items: dict[tuple[str, ...], Any] = {}
+    for k, v in d.items():
+        new_keys: tuple[str, ...] = parent_keys + (k,)
+        if isinstance(v, dict):
+            fd, size = _flatten_dict_inner(v, new_keys, size=size, nested=nested)
+            items.update(fd)
+        elif isinstance(v, (list, tuple)):
+            if not nested and (size is None or size == len(v)):
+                size = len(v)
+                if all(isinstance(i, dict) for i in v):
+                    for sub_dict in v:
+                        fd, size = _flatten_dict_inner(sub_dict, new_keys, size=size, nested=True)
+                        for fk, fv in fd.items():
+                            items.setdefault(fk, []).append(fv)
+                else:
+                    items[new_keys] = v
+            else:
+                warnings.warn(f"Dropping nested list found in '{parent_keys + (k, )}'.")
+        else:
+            items[new_keys] = v
+    return items, size
+
+
+def _flatten_dict(d: Mapping[str, Any], sep: str, ignore_lists: bool, fully_qualified: bool) -> dict[str, Any]:
+    """
+    Flattens a dictionary and converts values to numeric values when possible.
+
+    Parameters
+    ----------
+    d : dict[str, Any]
+        Dictionary to flatten
+    sep : str
+        String separator to use when concatenating key names
+    ignore_lists : bool
+        Option to skip expanding lists within metadata
+    fully_qualified : bool
+        Option to return dictionary keys full qualified instead of minimized
+
+    Returns
+    -------
+    dict[str, Any]
+        A flattened dictionary
+    """
+    expanded, size = _flatten_dict_inner(d, parent_keys=(), nested=ignore_lists)
+
+    output = {}
+    if fully_qualified:
+        expanded = {sep.join(k): v for k, v in expanded.items()}
+    else:
+        keys = _get_key_indices(expanded)
+        expanded = {sep.join(k[keys[k] :]): v for k, v in expanded.items()}
+    for k, v in expanded.items():
+        cv = _convert_type(v)
+        if isinstance(cv, list) and len(cv) == size:
+            output[k] = cv
+        elif not isinstance(cv, list):
+            output[k] = cv if not size else [cv] * size
+    return output
+
+
+def _is_metadata_dict_of_dicts(metadata: Mapping) -> bool:
+    """EXPERIMENTAL: Attempt to detect if metadata is a dict of dicts"""
+    # single dict
+    if len(metadata) < 2:
+        return False
+
+    # dict of non dicts
+    keys = list(metadata)
+    if not isinstance(metadata[keys[0]], Mapping):
+        return False
+
+    # dict of dicts with matching keys
+    return set(metadata[keys[0]]) == set(metadata[keys[1]])
+
+
+def merge_metadata(
+    metadata: Iterable[Mapping[str, Any]],
+    ignore_lists: bool = False,
+    fully_qualified: bool = False,
+    as_numpy: bool = False,
+) -> dict[str, list[Any]] | dict[str, NDArray[Any]]:
+    """
+    Merges a collection of metadata dictionaries into a single flattened dictionary of keys and values.
+
+    Nested dictionaries are flattened, and lists are expanded. Nested lists are dropped as the
+    expanding into multiple hierarchical trees is not supported.
+
+    Parameters
+    ----------
+    metadata : Iterable[Mapping[str, Any]]
+        Iterable collection of metadata dictionaries to flatten and merge
+    ignore_lists : bool, default False
+        Option to skip expanding lists within metadata
+    fully_qualified : bool, default False
+        Option to return dictionary keys full qualified instead of minimized
+    as_numpy : bool, default False
+        Option to return results as lists or NumPy arrays
+
+    Returns
+    -------
+    dict[str, list[Any]] | dict[str, NDArray[Any]]
+        A single dictionary containing the flattened data as lists or NumPy arrays
+
+    Note
+    ----
+    Nested lists of values and inconsistent keys are dropped in the merged metadata dictionary
+
+    Example
+    -------
+    >>> list_metadata = [{"common": 1, "target": [{"a": 1, "b": 3}, {"a": 2, "b": 4}], "source": "example"}]
+    >>> merge_metadata(list_metadata)
+    {'common': [1, 1], 'a': [1, 2], 'b': [3, 4], 'source': ['example', 'example']}
+    """
+    merged: dict[str, list[Any]] = {}
+    isect: set[str] = set()
+    union: set[str] = set()
+    keys: list[str] | None = None
+    dicts: list[Mapping[str, Any]]
+
+    # EXPERIMENTAL
+    if isinstance(metadata, Mapping) and _is_metadata_dict_of_dicts(metadata):
+        warnings.warn("Experimental processing for dict of dicts.")
+        keys = [str(k) for k in metadata]
+        dicts = list(metadata.values())
+        ignore_lists = True
+    else:
+        dicts = list(metadata)
+
+    for d in dicts:
+        flattened = _flatten_dict(d, sep="_", ignore_lists=ignore_lists, fully_qualified=fully_qualified)
+        isect = isect.intersection(flattened.keys()) if isect else set(flattened.keys())
+        union = union.union(flattened.keys())
+        for k, v in flattened.items():
+            merged.setdefault(k, []).extend(flattened[k]) if isinstance(v, list) else merged.setdefault(k, []).append(v)
+
+    if len(union) > len(isect):
+        warnings.warn(f"Inconsistent metadata keys found. Dropping {union - isect} from metadata.")
+
+    output: dict[str, Any] = {}
+
+    if keys:
+        output["keys"] = np.array(keys) if as_numpy else keys
+
+    for k in (key for key in merged if key in isect):
+        cv = _convert_type(merged[k])
+        output[k] = np.array(cv) if as_numpy else cv
+
+    return output
dataeval/utils/shared.py
ADDED
@@ -0,0 +1,151 @@
+from __future__ import annotations
+
+__all__ = []
+
+import sys
+from typing import Any, Callable, Literal, TypeVar
+
+import numpy as np
+from numpy.typing import ArrayLike, NDArray
+from scipy.sparse import csr_matrix
+from scipy.sparse.csgraph import minimum_spanning_tree as mst
+from scipy.spatial.distance import pdist, squareform
+from sklearn.neighbors import NearestNeighbors
+
+if sys.version_info >= (3, 10):
+    from typing import ParamSpec
+else:
+    from typing_extensions import ParamSpec
+
+from dataeval.interop import as_numpy
+
+EPSILON = 1e-5
+HASH_SIZE = 8
+MAX_FACTOR = 4
+
+
+P = ParamSpec("P")
+R = TypeVar("R")
+
+
+def get_method(method_map: dict[str, Callable[P, R]], method: str) -> Callable[P, R]:
+    if method not in method_map:
+        raise ValueError(f"Specified method {method} is not a valid method: {method_map}.")
+    return method_map[method]
+
+
+def flatten(array: ArrayLike) -> NDArray[Any]:
+    """
+    Flattens input array from (N, ... ) to (N, -1) where all samples N have all data in their last dimension
+
+    Parameters
+    ----------
+    X : NDArray, shape - (N, ... )
+        Input array
+
+    Returns
+    -------
+    NDArray, shape - (N, -1)
+    """
+    nparr = as_numpy(array)
+    return nparr.reshape((nparr.shape[0], -1))
+
+
+def minimum_spanning_tree(X: NDArray[Any]) -> Any:
+    """
+    Returns the minimum spanning tree from a :term:`NumPy` image array.
+
+    Parameters
+    ----------
+    X : NDArray
+        Numpy image array
+
+    Returns
+    -------
+        Data representing the minimum spanning tree
+    """
+    # All features belong on second dimension
+    X = flatten(X)
+    # We add a small constant to the distance matrix to ensure scipy interprets
+    # the input graph as fully-connected.
+    dense_eudist = squareform(pdist(X)) + EPSILON
+    eudist_csr = csr_matrix(dense_eudist)
+    return mst(eudist_csr)
+
+
+def get_classes_counts(labels: NDArray[np.int_]) -> tuple[int, int]:
+    """
+    Returns the classes and counts of from an array of labels
+
+    Parameters
+    ----------
+    label : NDArray
+        Numpy labels array
+
+    Returns
+    -------
+        Classes and counts
+
+    Raises
+    ------
+    ValueError
+        If the number of unique classes is less than 2
+    """
+    classes, counts = np.unique(labels, return_counts=True)
+    M = len(classes)
+    if M < 2:
+        raise ValueError("Label vector contains less than 2 classes!")
+    N = np.sum(counts).astype(int)
+    return M, N
+
+
+def compute_neighbors(
+    A: NDArray[Any],
+    B: NDArray[Any],
+    k: int = 1,
+    algorithm: Literal["auto", "ball_tree", "kd_tree"] = "auto",
+) -> NDArray[Any]:
+    """
+    For each sample in A, compute the nearest neighbor in B
+
+    Parameters
+    ----------
+    A, B : NDArray
+        The n_samples and n_features respectively
+    k : int
+        The number of neighbors to find
+    algorithm : Literal
+        Tree method for nearest neighbor (auto, ball_tree or kd_tree)
+
+    Note
+    ----
+        Do not use kd_tree if n_features > 20
+
+    Returns
+    -------
+    List:
+        Closest points to each point in A and B
+
+    Raises
+    ------
+    ValueError
+        If algorithm is not "auto", "ball_tree", or "kd_tree"
+
+    See Also
+    --------
+    sklearn.neighbors.NearestNeighbors
+    """
+
+    if k < 1:
+        raise ValueError("k must be >= 1")
+    if algorithm not in ["auto", "ball_tree", "kd_tree"]:
+        raise ValueError("Algorithm must be 'auto', 'ball_tree', or 'kd_tree'")
+
+    A = flatten(A)
+    B = flatten(B)
+
+    nbrs = NearestNeighbors(n_neighbors=k + 1, algorithm=algorithm).fit(B)
+    nns = nbrs.kneighbors(A)[1]
+    nns = nns[:, 1:].squeeze()
+
+    return nns
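The new dataeval/utils/shared.py above consolidates internal helpers and exports nothing publicly (__all__ = []). As a hedged sketch of what compute_neighbors computes, importing the internal module purely for illustration:

import numpy as np
from dataeval.utils.shared import compute_neighbors  # internal module, shown for illustration only

rng = np.random.default_rng(0)
images = rng.normal(size=(10, 1, 8, 8))  # 10 single-channel 8x8 images (invented data)

# Inputs are flattened to (N, -1); sklearn finds k+1 neighbors and the first
# (self) match is discarded, so each sample maps to its nearest *other* sample.
nearest = compute_neighbors(images, images, k=1)
print(nearest.shape)  # (10,)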
dataeval/{_internal → utils}/split_dataset.py
@@ -1,20 +1,26 @@
 from __future__ import annotations

+__all__ = ["split_dataset"]
+
 import warnings
+from typing import Any

 import numpy as np
+from numpy.typing import NDArray
 from sklearn.cluster import KMeans
 from sklearn.metrics import silhouette_score
 from sklearn.model_selection import GroupKFold, KFold, StratifiedGroupKFold, StratifiedKFold
 from sklearn.utils.multiclass import type_of_target


-def
-    """Check input
+def validate_test_val(num_folds: int, test_frac: float | None, val_frac: float | None) -> tuple[float, float]:
+    """Check input fractions to ensure unambiguous splitting arguments are passed return calculated
+    test and validation fractions.
+

     Parameters
     ----------
-    num_folds : int
+    num_folds : int
         number of [train, val] cross-validation folds to generate
     test_frac : float, optional
         If specified, also generate a test set containing (test_frac*100)% of the data
@@ -36,19 +42,23 @@ def check_args(num_folds: int = 1, test_frac: float | None = None, val_frac: flo

     Returns
     -------
-
+    tuple[float, float]
+        Tuple of the validated and calculated values as appropriate for test and validation fractions
     """
     if (num_folds > 1) and (val_frac is not None):
         raise ValueError("If specifying val_frac, num_folds must be None or 1")
     if (num_folds == 1) and (val_frac is None):
-        raise
+        raise ValueError("If num_folds is None or 1, must assign a value to val_frac")
     t_frac = 0.0 if test_frac is None else test_frac
     v_frac = 1.0 / num_folds * (1.0 - t_frac) if val_frac is None else val_frac * (1.0 - t_frac)
     if (t_frac + v_frac) >= 1.0:
         raise ValueError(f"val_frac + test_frac must be less that 1.0, currently {v_frac+t_frac}")
+    return t_frac, v_frac


-def check_labels(
+def check_labels(
+    labels: list[int] | NDArray[np.int_], total_partitions: int
+) -> tuple[NDArray[np.int_], NDArray[np.int_]]:
     """Check to make sure there are more input data than the total number of partitions requested
     Also converts labels to a numpy array, if it isn't already

@@ -88,7 +98,7 @@ def check_labels(labels: list | np.ndarray, total_partitions: int):
     return index, labels


-def check_stratifiable(labels: np.
+def check_stratifiable(labels: NDArray[np.int_], total_partitions: int) -> bool:
     """
     Very basic check to see if dataset can be stratified by class label. This is not a
     comprehensive test, as factors such as grouping also affect the ability to stratify by label
@@ -124,7 +134,7 @@ def check_stratifiable(labels: np.ndarray, total_partitions: int):
     return stratifiable


-def check_groups(group_ids: np.
+def check_groups(group_ids: NDArray[np.int_], num_partitions: int) -> bool:
     """
     Warns user if the number of unique group_ids is incompatible with a grouped partition containing
     num_folds folds. If this is the case, returns groups=None, which tells the partitioner not to
@@ -162,7 +172,7 @@ def check_groups(group_ids: np.ndarray, num_partitions: int):
     return groupable


-def bin_kmeans(array: np.
+def bin_kmeans(array: NDArray[Any]) -> NDArray[np.int_]:
     """
     Find bins of continuous data by iteratively applying k-means clustering, and keeping the
     clustering with the highest silhouette score.
@@ -182,18 +192,18 @@ def bin_kmeans(array: np.ndarray):
         best_score = 0.60
     else:
         best_score = 0.50
-    bin_index = np.zeros(len(array))
+    bin_index = np.zeros(len(array), dtype=np.int_)
     for k in range(2, 20):
         clusterer = KMeans(n_clusters=k)
         cluster_labels = clusterer.fit_predict(array)
         score = silhouette_score(array, cluster_labels, sample_size=25_000)
         if score > best_score:
             best_score = score
-            bin_index = cluster_labels
+            bin_index = cluster_labels.astype(np.int_)
     return bin_index


-def angle2xy(angles:
+def angle2xy(angles: NDArray[Any]) -> NDArray[Any]:
     """
     Converts angle measurements to xy coordinates on the unit circle. Needed for binning angle data.

@@ -213,7 +223,7 @@ def angle2xy(angles: np.ndarray):
     return xy


-def get_group_ids(metadata: dict,
+def get_group_ids(metadata: dict[str, Any], group_names: list[str], num_samples: int) -> NDArray[np.int_]:
     """Returns individual group numbers based on a subset of metadata defined by groupnames

     Parameters
@@ -235,7 +245,7 @@ def get_group_ids(metadata: dict, groupnames: list, num_samples: int):
     group_ids: np.ndarray
         group identifiers from metadata
     """
-    features2group = {k: np.array(v) for k, v in metadata.items() if k in
+    features2group = {k: np.array(v) for k, v in metadata.items() if k in group_names}
     if not features2group:
         return np.zeros(num_samples, dtype=int)
     for name, feature in features2group.items():
@@ -252,8 +262,12 @@ def get_group_ids(metadata: dict, groupnames: list, num_samples: int):


 def make_splits(
-    index: np.
-
+    index: NDArray[np.int_],
+    labels: NDArray[np.int_],
+    n_folds: int,
+    groups: NDArray[np.int_] | None = None,
+    stratified: bool = False,
+) -> list[dict[str, NDArray[np.int_]]]:
     """Split data into n_folds partitions of training and validation data.

     Parameters
@@ -290,9 +304,59 @@ def make_splits(
     return split_defs


+def find_best_split(
+    labels: NDArray[np.int_], split_defs: list[dict[str, NDArray[np.int_]]], stratified: bool, eval_frac: float
+) -> tuple[NDArray[np.int_], NDArray[np.int_]]:
+    """Finds the split that most closely satisfies a criterion determined by the arguments passed.
+    If stratified is True, returns the split whose class balance most closely resembles the overall
+    class balance. If false, returns the split with the size closest to the desired eval_frac
+
+    Parameters
+    ----------
+    labels : np.ndarray
+        Labels upon which splits are (optionally) stratified
+    split_defs : list[dict]
+        List of dictionaries, which specifying train index, validation index, and the ratio of
+        validation to all data.
+    stratified: bool
+        If True, maintain dataset class balance within each train/val split
+    eval_frac: float
+        Desired fraction of the dataset sequestered for evaluation
+
+    Returns
+    -------
+    train_index : np.ndarray
+        indices of data partitioned for training
+    eval_index : np.ndarray
+        indices of data partitioned for evaluation
+    """
+
+    def class_freq_diff(split):
+        train_labels = labels[split["train"]]
+        _, train_counts = np.unique(train_labels, return_counts=True)
+        train_freq = train_counts / train_counts.sum()
+        return np.square(train_freq - class_freq).sum()
+
+    if stratified:
+        _, class_counts = np.unique(labels, return_counts=True)
+        class_freq = class_counts / class_counts.sum()
+        best_split = min(split_defs, key=class_freq_diff)
+        return best_split["train"], best_split["eval"]
+    elif eval_frac <= 2 / 3:
+        best_split = min(split_defs, key=lambda x: abs(eval_frac - x["eval_frac"]))  # type: ignore
+        return best_split["train"], best_split["eval"]
+    else:
+        best_split = min(split_defs, key=lambda x: abs(eval_frac - (1 - x["eval_frac"])))  # type: ignore
+        return best_split["eval"], best_split["train"]
+
+
 def single_split(
-    index: np.
-
+    index: NDArray[np.int_],
+    labels: NDArray[np.int_],
+    eval_frac: float,
+    groups: NDArray[np.int_] | None = None,
+    stratified: bool = False,
+) -> tuple[NDArray[np.int_], NDArray[np.int_]]:
     """Handles the special case where only 1 partition of the data is desired (such as when
     generating the test holdout split). In this case, the desired fraction of the data to be
     partitioned into the test data must be specified, and a single [train, eval] pair are returned.
@@ -317,27 +381,28 @@ def single_split(
     eval_index : np.ndarray
         indices of data partitioned for evaluation
     """
-    if
+    if groups is not None:
+        n_unique_groups = np.unique(groups).shape[0]
+        _, label_counts = np.unique(labels, return_counts=True)
+        n_folds = min(n_unique_groups, label_counts.min())
+    elif eval_frac <= 2 / 3:
         n_folds = max(2, int(round(1 / (eval_frac + 1e-6))))
-        split_candidates = make_splits(index, labels, n_folds, groups, stratified)
-        best_split = min(split_candidates, key=lambda x: abs(eval_frac - x["eval_frac"]))
-        return best_split["train"], best_split["eval"]
     else:
-        n_folds = max(2, int(round(1 / (1 - eval_frac
-
-
-
+        n_folds = max(2, int(round(1 / (1 - eval_frac - 1e-6))))
+    split_candidates = make_splits(index, labels, n_folds, groups, stratified)
+    best_train, best_eval = find_best_split(labels, split_candidates, stratified, eval_frac)
+    return best_train, best_eval


 def split_dataset(
-    labels: list | np.
+    labels: list[int] | NDArray[np.int_],
     num_folds: int = 1,
     stratify: bool = False,
-    split_on: list | None = None,
-    metadata: dict | None = None,
+    split_on: list[str] | None = None,
+    metadata: dict[str, Any] | None = None,
     test_frac: float | None = None,
     val_frac: float | None = None,
-):
+) -> dict[str, dict[str, NDArray[np.int_]] | NDArray[np.int_]]:
     """Top level splitting function. Returns a dict with each key-value pair containing
     train and validation indices. Indices for a test holdout may also be optionally included

@@ -386,7 +451,7 @@ def split_dataset(
     }
     """

-
+    test_frac, val_frac = validate_test_val(num_folds, test_frac, val_frac)
     total_partitions = num_folds + 1 if test_frac else num_folds
     index, labels = check_labels(labels, total_partitions)
     stratify &= check_stratifiable(labels, total_partitions)
@@ -399,7 +464,7 @@ def split_dataset(
             groups = None
     else:
         groups = None
-    split_defs = {}
+    split_defs: dict[str, dict[str, NDArray[np.int_]] | NDArray[np.int_]] = {}
     if test_frac:
         tv_idx, test_idx = single_split(index, labels, test_frac, groups, stratify)
         tv_labels = labels[tv_idx]
@@ -410,7 +475,7 @@ def split_dataset(
         tv_labels = labels
         tv_groups = groups
     if num_folds == 1:
-        train_idx, val_idx = single_split(tv_idx, tv_labels, val_frac, tv_groups, stratify)
+        train_idx, val_idx = single_split(tv_idx, tv_labels, val_frac, tv_groups, stratify)
         split_defs["fold_0"] = {"train": tv_idx[train_idx].squeeze(), "val": tv_idx[val_idx].squeeze()}
     else:
         tv_splits = make_splits(tv_idx, tv_labels, num_folds, tv_groups, stratify)
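Before the next file, a hedged usage sketch of the relocated split_dataset with the updated signature shown in the hunks above; the import path and the "test" result key are assumptions based on the rename and surrounding context, and the label array is invented:

import numpy as np
from dataeval.utils.split_dataset import split_dataset  # path assumed after the _internal -> utils move

labels = np.repeat(np.arange(5), 20)  # 100 samples, 5 balanced classes (invented)

# Hold out ~15% for test, then carve ~20% of the remainder into a validation split
splits = split_dataset(labels, num_folds=1, stratify=True, test_frac=0.15, val_frac=0.2)

train_idx = splits["fold_0"]["train"]
val_idx = splits["fold_0"]["val"]
test_idx = splits["test"]  # key name assumed; present only when test_frac is set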
dataeval/utils/tensorflow/__init__.py
@@ -2,17 +2,18 @@
 TensorFlow models are used in :term:`out of distribution<Out-of-distribution (OOD)>` detectors in the
 :mod:`dataeval.detectors.ood` module.

-DataEval provides
-as well as constructors which allow for customization of the encoder, decoder and any other applicable
-layers used by the model.
+DataEval provides basic default models through the utility :func:`dataeval.utils.tensorflow.create_model`.
 """

 from dataeval import _IS_TENSORFLOW_AVAILABLE

-from . import loss, models, recon
-
 __all__ = []


 if _IS_TENSORFLOW_AVAILABLE:
-
+    import dataeval.utils.tensorflow.loss as loss
+    from dataeval.utils.tensorflow._internal.utils import create_model
+
+    __all__ = ["create_model", "loss"]
+
+    del _IS_TENSORFLOW_AVAILABLE