dataeval 0.72.2__py3-none-any.whl → 0.73.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35)
  1. dataeval/__init__.py +3 -3
  2. dataeval/detectors/__init__.py +1 -1
  3. dataeval/detectors/drift/__init__.py +1 -1
  4. dataeval/detectors/drift/base.py +2 -2
  5. dataeval/detectors/linters/clusterer.py +1 -1
  6. dataeval/detectors/ood/__init__.py +1 -1
  7. dataeval/detectors/ood/ae.py +14 -6
  8. dataeval/detectors/ood/aegmm.py +14 -6
  9. dataeval/detectors/ood/base.py +9 -3
  10. dataeval/detectors/ood/llr.py +22 -16
  11. dataeval/detectors/ood/vae.py +14 -6
  12. dataeval/detectors/ood/vaegmm.py +14 -6
  13. dataeval/interop.py +9 -7
  14. dataeval/metrics/bias/balance.py +50 -44
  15. dataeval/metrics/bias/coverage.py +38 -6
  16. dataeval/metrics/bias/diversity.py +117 -65
  17. dataeval/metrics/bias/metadata.py +225 -60
  18. dataeval/metrics/bias/parity.py +68 -54
  19. dataeval/utils/__init__.py +4 -3
  20. dataeval/utils/lazy.py +26 -0
  21. dataeval/utils/metadata.py +258 -0
  22. dataeval/utils/shared.py +1 -1
  23. dataeval/utils/split_dataset.py +12 -6
  24. dataeval/utils/tensorflow/_internal/gmm.py +8 -2
  25. dataeval/utils/tensorflow/_internal/loss.py +20 -11
  26. dataeval/utils/tensorflow/_internal/{pixelcnn.py → models.py} +371 -77
  27. dataeval/utils/tensorflow/_internal/trainer.py +12 -5
  28. dataeval/utils/tensorflow/_internal/utils.py +70 -71
  29. dataeval/utils/torch/datasets.py +2 -2
  30. dataeval/workflows/__init__.py +1 -1
  31. {dataeval-0.72.2.dist-info → dataeval-0.73.1.dist-info}/METADATA +3 -3
  32. {dataeval-0.72.2.dist-info → dataeval-0.73.1.dist-info}/RECORD +34 -33
  33. dataeval/utils/tensorflow/_internal/autoencoder.py +0 -316
  34. {dataeval-0.72.2.dist-info → dataeval-0.73.1.dist-info}/LICENSE.txt +0 -0
  35. {dataeval-0.72.2.dist-info → dataeval-0.73.1.dist-info}/WHEEL +0 -0
dataeval/metrics/bias/parity.py CHANGED
@@ -11,6 +11,7 @@ from numpy.typing import ArrayLike, NDArray
  from scipy.stats import chi2_contingency, chisquare

  from dataeval.interop import to_numpy
+ from dataeval.metrics.bias.metadata import CLASS_LABEL, preprocess_metadata
  from dataeval.output import OutputMetadata, set_metadata

  TData = TypeVar("TData", np.float64, NDArray[np.float64])
@@ -27,10 +28,13 @@ class ParityOutput(Generic[TData], OutputMetadata):
          chi-squared score(s) of the test
      p_value : np.float64 | NDArray[np.float64]
          p-value(s) of the test
+     metadata_names : list[str] | None
+         Names of each metadata factor
      """

      score: TData
      p_value: TData
+     metadata_names: list[str] | None


  def digitize_factor_bins(continuous_values: NDArray[Any], bins: int, factor_name: str) -> NDArray[np.intp]:
@@ -39,16 +43,16 @@ def digitize_factor_bins(continuous_values: NDArray[Any], bins: int, factor_name

      Parameters
      ----------
-     continuous_values: NDArray
+     continuous_values : NDArray
          The values to be digitized.
-     bins: int
+     bins : int
          The number of bins for the discrete values that continuous_values will be digitized into.
-     factor_name: str
+     factor_name : str
          The name of the factor to be digitized.

      Returns
      -------
-     NDArray
+     NDArray[np.intp]
          The digitized values
      """

@@ -66,17 +70,21 @@ def digitize_factor_bins(continuous_values: NDArray[Any], bins: int, factor_name


  def format_discretize_factors(
-     data_factors: Mapping[str, NDArray[Any]], continuous_factor_bincounts: Mapping[str, int]
+     data: NDArray[Any],
+     names: list[str],
+     is_categorical: list[bool],
+     continuous_factor_bincounts: Mapping[str, int] | None,
  ) -> dict[str, NDArray[Any]]:
      """
      Sets up the internal list of metadata factors.

      Parameters
      ----------
-     data_factors: Dict[str, NDArray]
+     data : NDArray
          The dataset factors, which are per-image attributes including class label and metadata.
-         Each key of dataset_factors is a factor, whose value is the per-image factor values.
-     continuous_factor_bincounts : Dict[str, int]
+     names : list[str]
+         The class label
+     continuous_factor_bincounts : Mapping[str, int] or None
          The factors in data_factors that have continuous values and the array of bin counts to
          discretize values into. All factors are treated as having discrete values unless they
          are specified as keys in this dictionary. Each element of this array must occur as a key
@@ -89,30 +97,33 @@ def format_discretize_factors(
          Each key is a metadata factor, whose value is the discrete per-image factor values.
      """

-     invalid_keys = set(continuous_factor_bincounts.keys()) - set(data_factors.keys())
-     if invalid_keys:
-         raise KeyError(
-             f"The continuous factor(s) {invalid_keys} do not exist in data_factors. Delete these "
-             "keys from `continuous_factor_names` or add corresponding entries to `data_factors`."
-         )
+     if continuous_factor_bincounts:
+         invalid_keys = set(continuous_factor_bincounts.keys()) - set(names)
+         if invalid_keys:
+             raise KeyError(
+                 f"The continuous factor(s) {invalid_keys} do not exist in data_factors. Delete these "
+                 "keys from `continuous_factor_names` or add corresponding entries to `data_factors`."
+             )

+     warn = []
      metadata_factors = {}
-
-     # make sure each factor has the same number of entries
-     lengths = []
-     for arr in data_factors.values():
-         lengths.append(arr.shape)
-
-     if lengths[1:] != lengths[:-1]:
-         raise ValueError("The lengths of each entry in the dictionary are not equal." f" Found lengths {lengths}")
-
-     metadata_factors = {
-         name: val
-         if name not in continuous_factor_bincounts
-         else digitize_factor_bins(val, continuous_factor_bincounts[name], name)
-         for name, val in data_factors.items()
-         if name != "class"
-     }
+     for i, name in enumerate(names):
+         if name == CLASS_LABEL:
+             continue
+         if continuous_factor_bincounts and name in continuous_factor_bincounts:
+             metadata_factors[name] = digitize_factor_bins(data[:, i], continuous_factor_bincounts[name], name)
+         elif not is_categorical[i]:
+             warn.append(name)
+             metadata_factors[name] = data[:, i]
+         else:
+             metadata_factors[name] = data[:, i]
+
+     if warn:
+         warnings.warn(
+             f"The following factors appear to be continuous but did not have the desired number of bins specified: \n\
+             {warn}",
+             UserWarning,
+         )

      return metadata_factors

@@ -126,14 +137,14 @@ def normalize_expected_dist(expected_dist: NDArray[Any], observed_dist: NDArray[

      Parameters
      ----------
-     expected_dist : np.ndarray
+     expected_dist : NDArray
          The expected label distribution. This array represents the anticipated distribution of labels.
-     observed_dist : np.ndarray
+     observed_dist : NDArray
          The observed label distribution. This array represents the actual distribution of labels in the dataset.

      Returns
      -------
-     np.ndarray
+     NDArray
          The normalized expected distribution, scaled to have the same sum as the observed distribution.

      Raises
@@ -173,6 +184,8 @@ def validate_dist(label_dist: NDArray[Any], label_name: str) -> None:
      ----------
      label_dist : NDArray
          Array representing label distributions
+     label_name : str
+         String representing label name

      Raises
      ------
@@ -213,7 +226,7 @@ def label_parity(
          List of class labels in the expected dataset
      observed_labels : ArrayLike
          List of class labels in the observed dataset
-     num_classes : int | None, default None
+     num_classes : int or None, default None
          The number of unique classes in the datasets. If not provided, the function will infer it
          from the set of unique labels in expected_labels and observed_labels

@@ -247,7 +260,7 @@ def label_parity(
      >>> expected_labels = np_random_gen.choice([0, 1, 2, 3, 4], (100))
      >>> observed_labels = np_random_gen.choice([2, 3, 0, 4, 1], (100))
      >>> label_parity(expected_labels, observed_labels)
-     ParityOutput(score=14.007374204742625, p_value=0.0072715574616218)
+     ParityOutput(score=14.007374204742625, p_value=0.0072715574616218, metadata_names=None)
      """

      # Calculate
@@ -278,13 +291,13 @@ def label_parity(
          )

      cs, p = chisquare(f_obs=observed_dist, f_exp=expected_dist)
-     return ParityOutput(cs, p)
+     return ParityOutput(cs, p, None)


  @set_metadata()
  def parity(
      class_labels: ArrayLike,
-     data_factors: Mapping[str, ArrayLike],
+     metadata: Mapping[str, ArrayLike],
      continuous_factor_bincounts: Mapping[str, int] | None = None,
  ) -> ParityOutput[NDArray[np.float64]]:
      """
@@ -297,14 +310,14 @@ def parity(

      Parameters
      ----------
-     class_labels: ArrayLike
+     class_labels : ArrayLike
          List of class labels for each image
-     data_factors: Mapping[str, ArrayLike]
+     metadata : Mapping[str, ArrayLike]
          The dataset factors, which are per-image metadata attributes.
          Each key of dataset_factors is a factor, whose value is the per-image factor values.
-     continuous_factor_bincounts : Mapping[str, int] | None, default None
+     continuous_factor_bincounts : Mapping[str, int] or None, default None
          A dictionary specifying the number of bins for discretizing the continuous factors.
-         The keys should correspond to the names of continuous factors in `data_factors`,
+         The keys should correspond to the names of continuous factors in `metadata`,
          and the values should be the number of bins to use for discretization.
          If not provided, no discretization is applied.

@@ -337,42 +350,43 @@ def parity(
      Randomly creating some "continuous" and categorical variables using ``np.random.default_rng``

      >>> labels = np_random_gen.choice([0, 1, 2], (100))
-     >>> data_factors = {
+     >>> metadata = {
      ...     "age": np_random_gen.choice([25, 30, 35, 45], (100)),
      ...     "income": np_random_gen.choice([50000, 65000, 80000], (100)),
      ...     "gender": np_random_gen.choice(["M", "F"], (100)),
      ... }
      >>> continuous_factor_bincounts = {"age": 4, "income": 3}
-     >>> parity(labels, data_factors, continuous_factor_bincounts)
-     ParityOutput(score=array([7.35731943, 5.46711299, 0.51506212]), p_value=array([0.28906231, 0.24263543, 0.77295762]))
-     """
+     >>> parity(labels, metadata, continuous_factor_bincounts)
+     ParityOutput(score=array([7.35731943, 5.46711299, 0.51506212]), p_value=array([0.28906231, 0.24263543, 0.77295762]), metadata_names=['age', 'income', 'gender'])
+     """  # noqa: E501
      if len(np.shape(class_labels)) > 1:
          raise ValueError(
              f"Got class labels with {len(np.shape(class_labels))}-dimensional",
              f" shape {np.shape(class_labels)}, but expected a 1-dimensional array.",
          )

-     data_factors_np = {k: to_numpy(v) for k, v in data_factors.items()}
-     continuous_factor_bincounts = continuous_factor_bincounts if continuous_factor_bincounts else {}
+     data, names, is_categorical, _ = preprocess_metadata(class_labels, metadata)
+
+     factors = format_discretize_factors(data, names, is_categorical, continuous_factor_bincounts)

-     labels = to_numpy(class_labels)
-     factors = format_discretize_factors(data_factors_np, continuous_factor_bincounts)
+     # unique class labels
+     class_idx = names.index(CLASS_LABEL)
+     u_cls = np.unique(data[:, class_idx])

      chi_scores = np.zeros(len(factors))
      p_values = np.zeros(len(factors))
-     n_cls = len(np.unique(labels))
      not_enough_data = {}
      for i, (current_factor_name, factor_values) in enumerate(factors.items()):
          unique_factor_values = np.unique(factor_values)
-         contingency_matrix = np.zeros((len(unique_factor_values), n_cls))
+         contingency_matrix = np.zeros((len(unique_factor_values), u_cls.size))
          # Builds a contingency matrix where entry at index (r,c) represents
          # the frequency of current_factor_name achieving value unique_factor_values[r]
          # at a data point with class c.

          # TODO: Vectorize this nested for loop
          for fi, factor_value in enumerate(unique_factor_values):
-             for label in range(n_cls):
-                 with_both = np.bitwise_and((labels == label), factor_values == factor_value)
+             for label in u_cls:
+                 with_both = np.bitwise_and((data[:, class_idx] == label), factor_values == factor_value)
                  contingency_matrix[fi, label] = np.sum(with_both)
                  if 0 < contingency_matrix[fi, label] < 5:
                      if current_factor_name not in not_enough_data:
@@ -414,4 +428,4 @@ def parity(
              UserWarning,
          )

-     return ParityOutput(chi_scores, p_values)
+     return ParityOutput(chi_scores, p_values, list(metadata.keys()))
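Note: the hunks above rename parity()'s second parameter from data_factors to metadata and add a metadata_names field to ParityOutput. A minimal migration sketch, assuming parity is imported from the module path shown in this diff (the sample data below is illustrative only, not taken from the package):

import numpy as np
from dataeval.metrics.bias.parity import parity

rng = np.random.default_rng(0)
labels = rng.choice([0, 1, 2], (100))
metadata = {
    "age": rng.choice([25, 30, 35, 45], (100)),
    "income": rng.choice([50000, 65000, 80000], (100)),
}

# 0.72.2: parity(labels, data_factors, continuous_factor_bincounts)
# 0.73.1: the second positional argument is now named `metadata`
result = parity(labels, metadata, continuous_factor_bincounts={"age": 4, "income": 3})
print(result.metadata_names)  # ['age', 'income'] -- the field added to ParityOutput in this release
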
dataeval/utils/__init__.py CHANGED
@@ -5,16 +5,17 @@ metrics. Currently DataEval supports both :term:`TensorFlow` and PyTorch backend
  """

  from dataeval import _IS_TENSORFLOW_AVAILABLE, _IS_TORCH_AVAILABLE
+ from dataeval.utils.metadata import merge_metadata
  from dataeval.utils.split_dataset import split_dataset

- __all__ = ["split_dataset"]
+ __all__ = ["split_dataset", "merge_metadata"]

- if _IS_TORCH_AVAILABLE:  # pragma: no cover
+ if _IS_TORCH_AVAILABLE:
      from dataeval.utils import torch

      __all__ += ["torch"]

- if _IS_TENSORFLOW_AVAILABLE:  # pragma: no cover
+ if _IS_TENSORFLOW_AVAILABLE:
      from dataeval.utils import tensorflow

      __all__ += ["tensorflow"]
dataeval/utils/lazy.py ADDED
@@ -0,0 +1,26 @@
+ from __future__ import annotations
+
+ from functools import cached_property
+ from importlib import import_module
+ from typing import Any
+
+
+ class LazyModule:
+     def __init__(self, name: str) -> None:
+         self._name = name
+
+     def __getattr__(self, key: str) -> Any:
+         return getattr(self._module, key)
+
+     @cached_property
+     def _module(self):
+         return import_module(self._name)
+
+
+ LAZY_MODULES: dict[str, LazyModule] = {}
+
+
+ def lazyload(name: str) -> LazyModule:
+     if name not in LAZY_MODULES:
+         LAZY_MODULES[name] = LazyModule(name)
+     return LAZY_MODULES[name]
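Note: LazyModule defers the real import until the first attribute access (the cached_property resolves and caches it once), and lazyload memoizes one proxy per module name. A small sketch of the pattern, using the standard-library json module purely as a stand-in:

from dataeval.utils.lazy import lazyload

json = lazyload("json")                    # no import has happened yet
print(json.dumps({"release": "0.73.1"}))   # first attribute access triggers import_module("json")
assert lazyload("json") is json            # LAZY_MODULES caches one proxy per name
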
dataeval/utils/metadata.py ADDED
@@ -0,0 +1,258 @@
+ from __future__ import annotations
+
+ __all__ = ["merge_metadata"]
+
+ import warnings
+ from typing import Any, Iterable, Mapping, TypeVar, overload
+
+ import numpy as np
+ from numpy.typing import NDArray
+
+ T = TypeVar("T")
+
+
+ def _try_cast(v: Any, t: type[T]) -> T | None:
+     """Casts a value to a type or returns None if unable"""
+     try:
+         return t(v)  # type: ignore
+     except (TypeError, ValueError):
+         return None
+
+
+ @overload
+ def _convert_type(data: list[str]) -> list[int] | list[float] | list[str]: ...
+ @overload
+ def _convert_type(data: str) -> int | float | str: ...
+
+
+ def _convert_type(data: list[str] | str) -> list[int] | list[float] | list[str] | int | float | str:
+     """
+     Converts a value or a list of values to the simplest form possible, in preferred order of `int`,
+     `float`, or `string`.
+
+     Parameters
+     ----------
+     data : list[str] | str
+         A list of values or a single value
+
+     Returns
+     -------
+     list[int | float | str] | int | float | str
+         The same values converted to the numerical type if possible
+     """
+     if not isinstance(data, list):
+         value = _try_cast(data, float)
+         return str(data) if value is None else int(value) if value.is_integer() else value
+
+     converted = []
+     TYPE_MAP = {int: 0, float: 1, str: 2}
+     max_type = 0
+     for value in data:
+         value = _convert_type(value)
+         max_type = max(max_type, TYPE_MAP.get(type(value), 2))
+         converted.append(value)
+     for i in range(len(converted)):
+         converted[i] = list(TYPE_MAP)[max_type](converted[i])
+     return converted
+
+
+ def _get_key_indices(keys: Iterable[tuple[str, ...]]) -> dict[tuple[str, ...], int]:
+     """
+     Finds indices to minimize unique tuple keys
+
+     Parameters
+     ----------
+     keys : Iterable[tuple[str, ...]]
+         Collection of unique expanded tuple keys
+
+     Returns
+     -------
+     dict[tuple[str, ...], int]
+         Mapping of tuple keys to starting index
+     """
+     indices = {k: -1 for k in keys}
+     ks = list(keys)
+     while len(ks) > 0:
+         seen: dict[tuple[str, ...], list[tuple[str, ...]]] = {}
+         for k in ks:
+             seen.setdefault(k[indices[k] :], []).append(k)
+         ks.clear()
+         for sk in seen.values():
+             if len(sk) > 1:
+                 ks.extend(sk)
+                 for k in sk:
+                     indices[k] -= 1
+     return indices
+
+
+ def _flatten_dict_inner(
+     d: Mapping[str, Any], parent_keys: tuple[str, ...], size: int | None = None, nested: bool = False
+ ) -> tuple[dict[tuple[str, ...], Any], int | None]:
+     """
+     Recursive internal function for flattening a dictionary.
+
+     Parameters
+     ----------
+     d : dict[str, Any]
+         Dictionary to flatten
+     parent_keys : tuple[str, ...]
+         Parent keys to the current dictionary being flattened
+     size : int or None, default None
+         Tracking int for length of lists
+     nested : bool, default False
+         Tracking if inside a list
+
+     Returns
+     -------
+     tuple[dict[tuple[str, ...], Any], int | None]
+         - [0]: Dictionary of flattened values with the keys reformatted as a hierarchical tuple of strings
+         - [1]: Size, if any, of the current list of values
+     """
+     items: dict[tuple[str, ...], Any] = {}
+     for k, v in d.items():
+         new_keys: tuple[str, ...] = parent_keys + (k,)
+         if isinstance(v, dict):
+             fd, size = _flatten_dict_inner(v, new_keys, size=size, nested=nested)
+             items.update(fd)
+         elif isinstance(v, (list, tuple)):
+             if not nested and (size is None or size == len(v)):
+                 size = len(v)
+                 if all(isinstance(i, dict) for i in v):
+                     for sub_dict in v:
+                         fd, size = _flatten_dict_inner(sub_dict, new_keys, size=size, nested=True)
+                         for fk, fv in fd.items():
+                             items.setdefault(fk, []).append(fv)
+                 else:
+                     items[new_keys] = v
+             else:
+                 warnings.warn(f"Dropping nested list found in '{parent_keys + (k, )}'.")
+         else:
+             items[new_keys] = v
+     return items, size
+
+
+ def _flatten_dict(d: Mapping[str, Any], sep: str, ignore_lists: bool, fully_qualified: bool) -> dict[str, Any]:
+     """
+     Flattens a dictionary and converts values to numeric values when possible.
+
+     Parameters
+     ----------
+     d : dict[str, Any]
+         Dictionary to flatten
+     sep : str
+         String separator to use when concatenating key names
+     ignore_lists : bool
+         Option to skip expanding lists within metadata
+     fully_qualified : bool
+         Option to return dictionary keys full qualified instead of minimized
+
+     Returns
+     -------
+     dict[str, Any]
+         A flattened dictionary
+     """
+     expanded, size = _flatten_dict_inner(d, parent_keys=(), nested=ignore_lists)
+
+     output = {}
+     if fully_qualified:
+         expanded = {sep.join(k): v for k, v in expanded.items()}
+     else:
+         keys = _get_key_indices(expanded)
+         expanded = {sep.join(k[keys[k] :]): v for k, v in expanded.items()}
+     for k, v in expanded.items():
+         cv = _convert_type(v)
+         if isinstance(cv, list) and len(cv) == size:
+             output[k] = cv
+         elif not isinstance(cv, list):
+             output[k] = cv if not size else [cv] * size
+     return output
+
+
+ def _is_metadata_dict_of_dicts(metadata: Mapping) -> bool:
+     """EXPERIMENTAL: Attempt to detect if metadata is a dict of dicts"""
+     # single dict
+     if len(metadata) < 2:
+         return False
+
+     # dict of non dicts
+     keys = list(metadata)
+     if not isinstance(metadata[keys[0]], Mapping):
+         return False
+
+     # dict of dicts with matching keys
+     return set(metadata[keys[0]]) == set(metadata[keys[1]])
+
+
+ def merge_metadata(
+     metadata: Iterable[Mapping[str, Any]],
+     ignore_lists: bool = False,
+     fully_qualified: bool = False,
+     as_numpy: bool = False,
+ ) -> dict[str, list[Any]] | dict[str, NDArray[Any]]:
+     """
+     Merges a collection of metadata dictionaries into a single flattened dictionary of keys and values.
+
+     Nested dictionaries are flattened, and lists are expanded. Nested lists are dropped as the
+     expanding into multiple hierarchical trees is not supported.
+
+     Parameters
+     ----------
+     metadata : Iterable[Mapping[str, Any]]
+         Iterable collection of metadata dictionaries to flatten and merge
+     ignore_lists : bool, default False
+         Option to skip expanding lists within metadata
+     fully_qualified : bool, default False
+         Option to return dictionary keys full qualified instead of minimized
+     as_numpy : bool, default False
+         Option to return results as lists or NumPy arrays
+
+     Returns
+     -------
+     dict[str, list[Any]] | dict[str, NDArray[Any]]
+         A single dictionary containing the flattened data as lists or NumPy arrays
+
+     Note
+     ----
+     Nested lists of values and inconsistent keys are dropped in the merged metadata dictionary
+
+     Example
+     -------
+     >>> list_metadata = [{"common": 1, "target": [{"a": 1, "b": 3}, {"a": 2, "b": 4}], "source": "example"}]
+     >>> merge_metadata(list_metadata)
+     {'common': [1, 1], 'a': [1, 2], 'b': [3, 4], 'source': ['example', 'example']}
+     """
+     merged: dict[str, list[Any]] = {}
+     isect: set[str] = set()
+     union: set[str] = set()
+     keys: list[str] | None = None
+     dicts: list[Mapping[str, Any]]
+
+     # EXPERIMENTAL
+     if isinstance(metadata, Mapping) and _is_metadata_dict_of_dicts(metadata):
+         warnings.warn("Experimental processing for dict of dicts.")
+         keys = [str(k) for k in metadata]
+         dicts = list(metadata.values())
+         ignore_lists = True
+     else:
+         dicts = list(metadata)
+
+     for d in dicts:
+         flattened = _flatten_dict(d, sep="_", ignore_lists=ignore_lists, fully_qualified=fully_qualified)
+         isect = isect.intersection(flattened.keys()) if isect else set(flattened.keys())
+         union = union.union(flattened.keys())
+         for k, v in flattened.items():
+             merged.setdefault(k, []).extend(flattened[k]) if isinstance(v, list) else merged.setdefault(k, []).append(v)
+
+     if len(union) > len(isect):
+         warnings.warn(f"Inconsistent metadata keys found. Dropping {union - isect} from metadata.")
+
+     output: dict[str, Any] = {}
+
+     if keys:
+         output["keys"] = np.array(keys) if as_numpy else keys
+
+     for k in (key for key in merged if key in isect):
+         cv = _convert_type(merged[k])
+         output[k] = np.array(cv) if as_numpy else cv
+
+     return output
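Note: beyond the docstring example above, keys that do not appear in every input dictionary are dropped with a warning, and as_numpy=True returns arrays instead of lists. A hedged sketch based on the implementation above (the sample data and the commented output are illustrative expectations, not package output):

from dataeval.utils.metadata import merge_metadata

metadata = [
    {"scene": "city", "targets": [{"size": 10}, {"size": 12}]},
    {"scene": "desert", "targets": [{"size": 8}, {"size": 9}], "sensor": "EO"},
]
merged = merge_metadata(metadata, as_numpy=True)
# "sensor" is missing from the first dictionary, so it is dropped with a UserWarning;
# list-of-dict values are expanded, so "scene" repeats once per target, giving roughly:
# {'scene': array(['city', 'city', 'desert', 'desert'], dtype='<U6'), 'size': array([10, 12, 8, 9])}
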
dataeval/utils/shared.py CHANGED
@@ -95,7 +95,7 @@ def get_classes_counts(labels: NDArray[np.int_]) -> tuple[int, int]:
      M = len(classes)
      if M < 2:
          raise ValueError("Label vector contains less than 2 classes!")
-     N = np.sum(counts).astype(int)
+     N = int(np.sum(counts))
      return M, N


dataeval/utils/split_dataset.py CHANGED
@@ -144,7 +144,7 @@ def check_groups(group_ids: NDArray[np.int_], num_partitions: int) -> bool:
      ----------
      group_ids : np.ndarray
          Identifies the group to which a sample at the same index belongs.
-     num_partitions: int
+     num_partitions : int
          How many total (train, val) folds will be generated (+1 if also specifying a test fold).

      Warns
@@ -242,12 +242,12 @@ def get_group_ids(metadata: dict[str, Any], group_names: list[str], num_samples:

      Returns
      -------
-     group_ids: np.ndarray
+     group_ids : np.ndarray
          group identifiers from metadata
      """
      features2group = {k: np.array(v) for k, v in metadata.items() if k in group_names}
      if not features2group:
-         return np.zeros(num_samples, dtype=int)
+         return np.zeros(num_samples, dtype=np.int_)
      for name, feature in features2group.items():
          if len(feature) != num_samples:
              raise IndexError(f"""Feature length does not match number of labels.
@@ -300,7 +300,13 @@ def make_splits(
      splits = splitter.split(index, labels)
      for train_idx, eval_idx in splits:
          test_ratio = len(eval_idx) / index.shape[0]
-         split_defs.append({"train": train_idx.astype(int), "eval": eval_idx.astype(int), "eval_frac": test_ratio})
+         split_defs.append(
+             {
+                 "train": train_idx.astype(np.int_),
+                 "eval": eval_idx.astype(np.int_),
+                 "eval_frac": test_ratio,
+             }
+         )
      return split_defs


@@ -318,9 +324,9 @@ def find_best_split(
      split_defs : list[dict]
          List of dictionaries, which specifying train index, validation index, and the ratio of
          validation to all data.
-     stratified: bool
+     stratified : bool
          If True, maintain dataset class balance within each train/val split
-     eval_frac: float
+     eval_frac : float
          Desired fraction of the dataset sequestered for evaluation

      Returns
dataeval/utils/tensorflow/_internal/gmm.py CHANGED
@@ -8,10 +8,16 @@ Licensed under Apache Software License (Apache 2.0)

  from __future__ import annotations

- from typing import NamedTuple
+ from typing import TYPE_CHECKING, NamedTuple

  import numpy as np
- import tensorflow as tf
+
+ from dataeval.utils.lazy import lazyload
+
+ if TYPE_CHECKING:
+     import tensorflow as tf
+ else:
+     tf = lazyload("tensorflow")


  class GaussianMixtureModelParams(NamedTuple):
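
Note: the TYPE_CHECKING/lazyload split shown above keeps the TensorFlow import out of module import time while preserving static type hints. A sketch of how the same pattern behaves in an arbitrary module (zeros_example is a hypothetical helper for illustration, not part of this diff):

from __future__ import annotations

from typing import TYPE_CHECKING

from dataeval.utils.lazy import lazyload

if TYPE_CHECKING:
    import tensorflow as tf      # evaluated only by static type checkers
else:
    tf = lazyload("tensorflow")  # runtime proxy; import deferred until first attribute access


def zeros_example(shape: tuple[int, ...]) -> "tf.Tensor":
    # TensorFlow is actually imported the first time this function runs
    return tf.zeros(shape)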