PyPI - dataeval - Versions diffs - 0.76.0__py3-none-any.whl → 0.81.0__py3-none-any.whl - Mend

dataeval 0.76.0__py3-none-any.whl → 0.81.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (96) hide show

dataeval/__init__.py +3 -3
dataeval/{output.py → _output.py} +14 -0
dataeval/config.py +77 -0
dataeval/detectors/__init__.py +1 -1
dataeval/detectors/drift/__init__.py +6 -6
dataeval/detectors/drift/{base.py → _base.py} +41 -30
dataeval/detectors/drift/{cvm.py → _cvm.py} +21 -28
dataeval/detectors/drift/{ks.py → _ks.py} +20 -26
dataeval/detectors/drift/{mmd.py → _mmd.py} +33 -19
dataeval/detectors/drift/{torch.py → _torch.py} +2 -1
dataeval/detectors/drift/{uncertainty.py → _uncertainty.py} +23 -7
dataeval/detectors/drift/updates.py +1 -1
dataeval/detectors/linters/__init__.py +0 -3
dataeval/detectors/linters/duplicates.py +17 -8
dataeval/detectors/linters/outliers.py +52 -43
dataeval/detectors/ood/ae.py +29 -8
dataeval/detectors/ood/base.py +5 -4
dataeval/detectors/ood/metadata_ks_compare.py +1 -1
dataeval/detectors/ood/mixin.py +20 -5
dataeval/detectors/ood/output.py +1 -1
dataeval/detectors/ood/vae.py +73 -0
dataeval/metadata/__init__.py +5 -0
dataeval/metadata/_ood.py +238 -0
dataeval/metrics/__init__.py +1 -1
dataeval/metrics/bias/__init__.py +5 -4
dataeval/metrics/bias/{balance.py → _balance.py} +67 -17
dataeval/metrics/bias/{coverage.py → _coverage.py} +41 -35
dataeval/metrics/bias/{diversity.py → _diversity.py} +17 -12
dataeval/metrics/bias/{parity.py → _parity.py} +89 -63
dataeval/metrics/estimators/__init__.py +14 -4
dataeval/metrics/estimators/{ber.py → _ber.py} +42 -11
dataeval/metrics/estimators/_clusterer.py +104 -0
dataeval/metrics/estimators/{divergence.py → _divergence.py} +18 -13
dataeval/metrics/estimators/{uap.py → _uap.py} +4 -4
dataeval/metrics/stats/__init__.py +7 -7
dataeval/metrics/stats/{base.py → _base.py} +52 -16
dataeval/metrics/stats/{boxratiostats.py → _boxratiostats.py} +6 -9
dataeval/metrics/stats/{datasetstats.py → _datasetstats.py} +10 -14
dataeval/metrics/stats/{dimensionstats.py → _dimensionstats.py} +6 -5
dataeval/metrics/stats/{hashstats.py → _hashstats.py} +6 -6
dataeval/metrics/stats/{labelstats.py → _labelstats.py} +25 -25
dataeval/metrics/stats/{pixelstats.py → _pixelstats.py} +5 -4
dataeval/metrics/stats/{visualstats.py → _visualstats.py} +9 -8
dataeval/typing.py +54 -0
dataeval/utils/__init__.py +2 -2
dataeval/utils/_array.py +169 -0
dataeval/utils/_bin.py +199 -0
dataeval/utils/_clusterer.py +144 -0
dataeval/utils/_fast_mst.py +189 -0
dataeval/utils/{image.py → _image.py} +6 -4
dataeval/utils/_method.py +18 -0
dataeval/utils/{shared.py → _mst.py} +3 -65
dataeval/utils/{plot.py → _plot.py} +4 -4
dataeval/utils/data/__init__.py +22 -0
dataeval/utils/data/_embeddings.py +105 -0
dataeval/utils/data/_images.py +65 -0
dataeval/utils/data/_metadata.py +352 -0
dataeval/utils/data/_selection.py +119 -0
dataeval/utils/{dataset/split.py → data/_split.py} +13 -14
dataeval/utils/data/_targets.py +73 -0
dataeval/utils/data/_types.py +58 -0
dataeval/utils/data/collate.py +103 -0
dataeval/utils/data/datasets/__init__.py +17 -0
dataeval/utils/data/datasets/_base.py +254 -0
dataeval/utils/data/datasets/_cifar10.py +134 -0
dataeval/utils/data/datasets/_fileio.py +168 -0
dataeval/utils/data/datasets/_milco.py +153 -0
dataeval/utils/data/datasets/_mixin.py +56 -0
dataeval/utils/data/datasets/_mnist.py +183 -0
dataeval/utils/data/datasets/_ships.py +123 -0
dataeval/utils/data/datasets/_voc.py +352 -0
dataeval/utils/data/selections/__init__.py +15 -0
dataeval/utils/data/selections/_classfilter.py +60 -0
dataeval/utils/data/selections/_indices.py +26 -0
dataeval/utils/data/selections/_limit.py +26 -0
dataeval/utils/data/selections/_reverse.py +18 -0
dataeval/utils/data/selections/_shuffle.py +29 -0
dataeval/utils/metadata.py +198 -376
dataeval/utils/torch/{gmm.py → _gmm.py} +4 -2
dataeval/utils/torch/{internal.py → _internal.py} +21 -51
dataeval/utils/torch/models.py +43 -2
dataeval/workflows/sufficiency.py +10 -9
{dataeval-0.76.0.dist-info → dataeval-0.81.0.dist-info}/METADATA +44 -15
dataeval-0.81.0.dist-info/RECORD +94 -0
dataeval/detectors/linters/clusterer.py +0 -512
dataeval/detectors/linters/merged_stats.py +0 -49
dataeval/detectors/ood/metadata_least_likely.py +0 -119
dataeval/interop.py +0 -69
dataeval/utils/dataset/__init__.py +0 -7
dataeval/utils/dataset/datasets.py +0 -412
dataeval/utils/dataset/read.py +0 -63
dataeval-0.76.0.dist-info/RECORD +0 -67
/dataeval/{log.py → _log.py} +0 -0
/dataeval/utils/torch/{blocks.py → _blocks.py} +0 -0
{dataeval-0.76.0.dist-info → dataeval-0.81.0.dist-info}/LICENSE.txt +0 -0
{dataeval-0.76.0.dist-info → dataeval-0.81.0.dist-info}/WHEEL +0 -0

dataeval/metrics/stats/{base.py → _base.py} RENAMED Viewed

@@ -1,32 +1,31 @@
 from __future__ import annotations
-from dataeval.utils.plot import histogram_plot
 __all__ = []
 import re
 import warnings
+from copy import deepcopy
 from dataclasses import dataclass
 from functools import partial
 from itertools import repeat
 from multiprocessing import Pool
-from typing import Any, Callable, Generic, Iterable, NamedTuple, Optional, TypeVar, Union
+from typing import Any, Callable, Generic, Iterable, Optional, Sequence, Sized, TypeVar, Union
 import numpy as np
 import tqdm
-from numpy.typing import ArrayLike, NDArray
+from numpy.typing import NDArray
-from dataeval.interop import to_numpy_iter
-from dataeval.output import Output
-from dataeval.utils.image import normalize_image_shape, rescale
+from dataeval._output import Output
+from dataeval.config import get_max_processes
+from dataeval.typing import ArrayLike
+from dataeval.utils._array import to_numpy_iter
+from dataeval.utils._image import normalize_image_shape, rescale
+from dataeval.utils._plot import histogram_plot
 DTYPE_REGEX = re.compile(r"NDArray\[np\.(.*?)\]")
 SOURCE_INDEX = "source_index"
 BOX_COUNT = "box_count"
-# TODO: Replace with global config
-DEFAULT_PROCESSES: int | None = None
 OptionalRange = Optional[Union[int, Iterable[int]]]
@@ -49,7 +48,8 @@ def normalize_box_shape(bounding_box: NDArray[Any]) -> NDArray[Any]:
         return bounding_box
-class SourceIndex(NamedTuple):
+@dataclass
+class SourceIndex:
     """
     Attributes
     ----------
@@ -205,7 +205,8 @@ class StatsProcessor(Generic[TStatsOutput]):
         return cls.output_class(**output, source_index=source_index, box_count=np.asarray(box_count, dtype=np.uint16))
-class StatsProcessorOutput(NamedTuple):
+@dataclass
+class StatsProcessorOutput:
     results: list[dict[str, Any]]
     source_indices: list[SourceIndex]
     box_counts: list[int]
@@ -272,8 +273,6 @@ def run_stats(
         A flag which determines if the states should be evaluated on a per-channel basis or not.
     stats_processor_cls : Iterable[type[StatsProcessor]]
         An iterable of stats processor classes that calculate stats and return output classes.
-    processes : int | None, default None
-        Number of processes to use, defaults to None which uses all available CPU cores.
     Returns
     -------
@@ -297,11 +296,11 @@ def run_stats(
     bbox_iter = repeat(None) if bboxes is None else to_numpy_iter(bboxes)
     warning_list = []
-    total_for_status = getattr(images, "__len__")() if hasattr(images, "__len__") else None
+    total_for_status = len(images) if isinstance(images, Sized) else None
     stats_processor_cls = stats_processor_cls if isinstance(stats_processor_cls, Iterable) else [stats_processor_cls]
     # TODO: Introduce global controls for CPU job parallelism and GPU configurations
-    with Pool(processes=DEFAULT_PROCESSES) as p:
+    with Pool(processes=get_max_processes()) as p:
         for r in tqdm.tqdm(
             p.imap(
                 partial(process_stats_unpack, per_channel=per_channel, stats_processor_cls=stats_processor_cls),
@@ -330,3 +329,40 @@ def run_stats(
     outputs = [s.convert_output(output, source_index, box_count) for s in stats_processor_cls]
     return outputs
+def add_stats(a: TStatsOutput, b: TStatsOutput) -> TStatsOutput:
+    if type(a) is not type(b):
+        raise TypeError(f"Types {type(a)} and {type(b)} cannot be added.")
+    sum_dict = deepcopy(a.dict())
+    for k in sum_dict:
+        if isinstance(sum_dict[k], list):
+            sum_dict[k].extend(b.dict()[k])
+        else:
+            sum_dict[k] = np.concatenate((sum_dict[k], b.dict()[k]))
+    return type(a)(**sum_dict)
+def combine_stats(stats: Sequence[TStatsOutput]) -> tuple[TStatsOutput, list[int]]:
+    output = None
+    dataset_steps = []
+    cur_len = 0
+    for s in stats:
+        output = s if output is None else add_stats(output, s)
+        cur_len += len(s)
+        dataset_steps.append(cur_len)
+    if output is None:
+        raise TypeError("Cannot combine empty sequence of stats.")
+    return output, dataset_steps
+def get_dataset_step_from_idx(idx: int, dataset_steps: list[int]) -> tuple[int, int]:
+    last_step = 0
+    for i, step in enumerate(dataset_steps):
+        if idx < step:
+            return i, idx - last_step
+        last_step = step
+    return -1, idx

dataeval/metrics/stats/{boxratiostats.py → _boxratiostats.py} RENAMED Viewed

@@ -8,9 +8,9 @@ from typing import Any, Callable, Generic, TypeVar, cast
 import numpy as np
 from numpy.typing import NDArray
-from dataeval.metrics.stats.base import BOX_COUNT, SOURCE_INDEX, BaseStatsOutput
-from dataeval.metrics.stats.dimensionstats import DimensionStatsOutput
-from dataeval.output import set_metadata
+from dataeval._output import set_metadata
+from dataeval.metrics.stats._base import BOX_COUNT, SOURCE_INDEX, BaseStatsOutput
+from dataeval.metrics.stats._dimensionstats import DimensionStatsOutput
 TStatOutput = TypeVar("TStatOutput", bound=BaseStatsOutput, contravariant=True)
 ArraySlice = tuple[int, int]
@@ -50,7 +50,7 @@ RATIOSTATS_OVERRIDE_MAP: dict[type, dict[str, Callable[..., NDArray[Any]]]] = {
             "depth": lambda x: x.box["depth"],
             "distance": lambda x: x.box["distance"],
         }
-    )
+    ),
 }
@@ -87,11 +87,8 @@ def calculate_ratios(key: str, box_stats: BaseStatsOutput, img_stats: BaseStatsO
         stats = BoxImageStatsOutputSlice(box_stats, (box_i, box_j), img_stats, (img_i, img_j))
         out_type = type(box_stats)
         use_override = out_type in RATIOSTATS_OVERRIDE_MAP and key in RATIOSTATS_OVERRIDE_MAP[out_type]
-        ratio = (
-            RATIOSTATS_OVERRIDE_MAP[out_type][key](stats)
-            if use_override
-            else np.nan_to_num(stats.box[key] / stats.img[key])
-        )
+        with np.errstate(divide="ignore", invalid="ignore"):
+            ratio = RATIOSTATS_OVERRIDE_MAP[out_type][key](stats) if use_override else stats.box[key] / stats.img[key]
         out_stats[box_i:box_j] = ratio.reshape(-1, *out_stats[box_i].shape)
     return out_stats

dataeval/metrics/stats/{datasetstats.py → _datasetstats.py} RENAMED Viewed

@@ -5,24 +5,20 @@ __all__ = []
 from dataclasses import dataclass
 from typing import Any, Iterable
-from numpy.typing import ArrayLike
-from dataeval.metrics.stats.base import BaseStatsOutput, HistogramPlotMixin, _is_plottable, run_stats
-from dataeval.metrics.stats.dimensionstats import (
-    DimensionStatsOutput,
-    DimensionStatsProcessor,
-)
-from dataeval.metrics.stats.labelstats import LabelStatsOutput, labelstats
-from dataeval.metrics.stats.pixelstats import PixelStatsOutput, PixelStatsProcessor
-from dataeval.metrics.stats.visualstats import VisualStatsOutput, VisualStatsProcessor
-from dataeval.output import Output, set_metadata
-from dataeval.utils.plot import channel_histogram_plot
+from dataeval._output import Output, set_metadata
+from dataeval.metrics.stats._base import BaseStatsOutput, HistogramPlotMixin, _is_plottable, run_stats
+from dataeval.metrics.stats._dimensionstats import DimensionStatsOutput, DimensionStatsProcessor
+from dataeval.metrics.stats._labelstats import LabelStatsOutput, labelstats
+from dataeval.metrics.stats._pixelstats import PixelStatsOutput, PixelStatsProcessor
+from dataeval.metrics.stats._visualstats import VisualStatsOutput, VisualStatsProcessor
+from dataeval.typing import ArrayLike
+from dataeval.utils._plot import channel_histogram_plot
 @dataclass(frozen=True)
 class DatasetStatsOutput(Output, HistogramPlotMixin):
     """
-    Output class for :func:`datasetstats` stats metric.
+    Output class for :func:`.datasetstats` stats metric.
     This class represents the outputs of various stats functions against a single
     dataset, such that each index across all stat outputs are representative of
@@ -82,7 +78,7 @@ def _get_channels(cls, channel_limit: int | None = None, channel_index: int | It
 @dataclass(frozen=True)
 class ChannelStatsOutput(Output):
     """
-    Output class for :func:`channelstats` stats metric.
+    Output class for :func:`.channelstats` stats metric.
     This class represents the outputs of various per-channel stats functions against
     a single dataset, such that each index across all stat outputs are representative

dataeval/metrics/stats/{dimensionstats.py → _dimensionstats.py} RENAMED Viewed

@@ -6,17 +6,18 @@ from dataclasses import dataclass
 from typing import Any, Callable, Iterable
 import numpy as np
-from numpy.typing import ArrayLike, NDArray
+from numpy.typing import NDArray
-from dataeval.metrics.stats.base import BaseStatsOutput, HistogramPlotMixin, StatsProcessor, run_stats
-from dataeval.output import set_metadata
-from dataeval.utils.image import get_bitdepth
+from dataeval._output import set_metadata
+from dataeval.metrics.stats._base import BaseStatsOutput, HistogramPlotMixin, StatsProcessor, run_stats
+from dataeval.typing import ArrayLike
+from dataeval.utils._image import get_bitdepth
 @dataclass(frozen=True)
 class DimensionStatsOutput(BaseStatsOutput, HistogramPlotMixin):
     """
-    Output class for :func:`dimensionstats` stats metric.
+    Output class for :func:`.dimensionstats` stats metric.
     Attributes
     ----------

dataeval/metrics/stats/{hashstats.py → _hashstats.py} RENAMED Viewed

@@ -9,14 +9,14 @@ from typing import Callable, Iterable
 import numpy as np
 import xxhash as xxh
-from numpy.typing import ArrayLike
 from PIL import Image
 from scipy.fftpack import dct
-from dataeval.interop import as_numpy
-from dataeval.metrics.stats.base import BaseStatsOutput, StatsProcessor, run_stats
-from dataeval.output import set_metadata
-from dataeval.utils.image import normalize_image_shape, rescale
+from dataeval._output import set_metadata
+from dataeval.metrics.stats._base import BaseStatsOutput, StatsProcessor, run_stats
+from dataeval.typing import ArrayLike
+from dataeval.utils._array import as_numpy
+from dataeval.utils._image import normalize_image_shape, rescale
 HASH_SIZE = 8
 MAX_FACTOR = 4
@@ -25,7 +25,7 @@ MAX_FACTOR = 4
 @dataclass(frozen=True)
 class HashStatsOutput(BaseStatsOutput):
     """
-    Output class for :func:`hashstats` stats metric.
+    Output class for :func:`.hashstats` stats metric.
     Attributes
     ----------

dataeval/metrics/stats/{labelstats.py → _labelstats.py} RENAMED Viewed

@@ -2,25 +2,25 @@ from __future__ import annotations
 __all__ = []
-# import contextlib
+import contextlib
 from collections import Counter, defaultdict
 from dataclasses import dataclass
 from typing import Any, Iterable, Mapping, TypeVar
 import numpy as np
-from numpy.typing import ArrayLike
-from dataeval.interop import as_numpy
-from dataeval.output import Output, set_metadata
+from dataeval._output import Output, set_metadata
+from dataeval.typing import ArrayLike
+from dataeval.utils._array import as_numpy
-# with contextlib.suppress(ImportError):
-#     import pandas as pd
+with contextlib.suppress(ImportError):
+    import pandas as pd
 @dataclass(frozen=True)
 class LabelStatsOutput(Output):
     """
-    Output class for :func:`labelstats` stats metric.
+    Output class for :func:`.labelstats` stats metric.
     Attributes
     ----------
@@ -73,24 +73,24 @@ class LabelStatsOutput(Output):
         return table_str
-    # def to_dataframe(self) -> pd.DataFrame:
-    #     import pandas as pd
-    #     class_list = []
-    #     total_count = []
-    #     image_count = []
-    #     for cls in self.label_counts_per_class:
-    #         class_list.append(cls)
-    #         total_count.append(self.label_counts_per_class[cls])
-    #         image_count.append(self.image_counts_per_label[cls])
-    #     return pd.DataFrame(
-    #         {
-    #             "Label": class_list,
-    #             "Total Count": total_count,
-    #             "Image Count": image_count,
-    #         }
-    #     )
+    def to_dataframe(self) -> pd.DataFrame:
+        import pandas as pd
+        class_list = []
+        total_count = []
+        image_count = []
+        for cls in self.label_counts_per_class:
+            class_list.append(cls)
+            total_count.append(self.label_counts_per_class[cls])
+            image_count.append(self.image_counts_per_label[cls])
+        return pd.DataFrame(
+            {
+                "Label": class_list,
+                "Total Count": total_count,
+                "Image Count": image_count,
+            }
+        )
 TKey = TypeVar("TKey", int, str)

dataeval/metrics/stats/{pixelstats.py → _pixelstats.py} RENAMED Viewed

@@ -6,17 +6,18 @@ from dataclasses import dataclass
 from typing import Any, Callable, Iterable
 import numpy as np
-from numpy.typing import ArrayLike, NDArray
+from numpy.typing import NDArray
 from scipy.stats import entropy, kurtosis, skew
-from dataeval.metrics.stats.base import BaseStatsOutput, HistogramPlotMixin, StatsProcessor, run_stats
-from dataeval.output import set_metadata
+from dataeval._output import set_metadata
+from dataeval.metrics.stats._base import BaseStatsOutput, HistogramPlotMixin, StatsProcessor, run_stats
+from dataeval.typing import ArrayLike
 @dataclass(frozen=True)
 class PixelStatsOutput(BaseStatsOutput, HistogramPlotMixin):
     """
-    Output class for :func:`pixelstats` stats metric.
+    Output class for :func:`.pixelstats` stats metric.
     Attributes
     ----------

dataeval/metrics/stats/{visualstats.py → _visualstats.py} RENAMED Viewed

@@ -6,11 +6,12 @@ from dataclasses import dataclass
 from typing import Any, Callable, Iterable
 import numpy as np
-from numpy.typing import ArrayLike, NDArray
+from numpy.typing import NDArray
-from dataeval.metrics.stats.base import BaseStatsOutput, HistogramPlotMixin, StatsProcessor, run_stats
-from dataeval.output import set_metadata
-from dataeval.utils.image import edge_filter
+from dataeval._output import set_metadata
+from dataeval.metrics.stats._base import BaseStatsOutput, HistogramPlotMixin, StatsProcessor, run_stats
+from dataeval.typing import ArrayLike
+from dataeval.utils._image import edge_filter
 QUARTILES = (0, 25, 50, 75, 100)
@@ -18,7 +19,7 @@ QUARTILES = (0, 25, 50, 75, 100)
 @dataclass(frozen=True)
 class VisualStatsOutput(BaseStatsOutput, HistogramPlotMixin):
     """
-    Output class for :func:`visualstats` stats metric.
+    Output class for :func:`.visualstats` stats metric.
     Attributes
     ----------
@@ -53,9 +54,9 @@ class VisualStatsProcessor(StatsProcessor[VisualStatsOutput]):
     output_class: type = VisualStatsOutput
     image_function_map: dict[str, Callable[[StatsProcessor[VisualStatsOutput]], Any]] = {
         "brightness": lambda x: x.get("percentiles")[1],
-        "contrast": lambda x: np.nan_to_num(
-            (np.max(x.get("percentiles")) - np.min(x.get("percentiles"))) / np.mean(x.get("percentiles"))
-        ),
+        "contrast": lambda x: 0
+        if np.mean(x.get("percentiles")) == 0
+        else (np.max(x.get("percentiles")) - np.min(x.get("percentiles"))) / np.mean(x.get("percentiles")),
         "darkness": lambda x: x.get("percentiles")[-2],
         "missing": lambda x: np.count_nonzero(np.isnan(np.sum(x.image, axis=0))) / np.prod(x.shape[-2:]),
         "sharpness": lambda x: np.std(edge_filter(np.mean(x.image, axis=0))),

dataeval/typing.py ADDED Viewed

@@ -0,0 +1,54 @@
+"""
+Common type hints used for interoperability with DataEval.
+"""
+__all__ = ["Array", "ArrayLike"]
+from typing import Any, Iterator, Protocol, Sequence, TypeVar, Union, runtime_checkable
+@runtime_checkable
+class Array(Protocol):
+    """
+    Protocol for array objects providing interoperability with DataEval.
+    Supports common array representations with popular libraries like
+    PyTorch, Tensorflow and JAX, as well as NumPy arrays.
+    Example
+    -------
+    >>> import numpy as np
+    >>> import torch
+    >>> from dataeval.typing import Array
+    Create array objects
+    >>> ndarray = np.random.random((10, 10))
+    >>> tensor = torch.tensor([1, 2, 3])
+    Check type at runtime
+    >>> isinstance(ndarray, Array)
+    True
+    >>> isinstance(tensor, Array)
+    True
+    """
+    @property
+    def shape(self) -> tuple[int, ...]: ...
+    def __array__(self) -> Any: ...
+    def __getitem__(self, key: Any, /) -> Any: ...
+    def __iter__(self) -> Iterator[Any]: ...
+    def __len__(self) -> int: ...
+TArray = TypeVar("TArray", bound=Array)
+ArrayLike = Union[Sequence[Any], Array]
+"""
+Type alias for array-like objects used for interoperability with DataEval.
+This includes native Python sequences, as well as objects that conform to
+the `Array` protocol.
+"""

dataeval/utils/__init__.py CHANGED Viewed

@@ -4,6 +4,6 @@ in setting up data and architectures that are guaranteed to work with applicable
 DataEval metrics.
 """
-__all__ = ["dataset", "metadata", "torch"]
+__all__ = ["data", "metadata", "torch"]
-from dataeval.utils import dataset, metadata, torch
+from . import data, metadata, torch

dataeval/utils/_array.py ADDED Viewed

@@ -0,0 +1,169 @@
+from __future__ import annotations
+__all__ = []
+import logging
+import warnings
+from importlib import import_module
+from types import ModuleType
+from typing import Any, Iterable, Iterator, Literal, TypeVar, overload
+import numpy as np
+import torch
+from numpy.typing import NDArray
+from dataeval._log import LogMessage
+from dataeval.typing import ArrayLike
+_logger = logging.getLogger(__name__)
+_MODULE_CACHE = {}
+T = TypeVar("T", ArrayLike, np.ndarray, torch.Tensor)
+_np_dtype = TypeVar("_np_dtype", bound=np.generic)
+def _try_import(module_name) -> ModuleType | None:
+    if module_name in _MODULE_CACHE:
+        return _MODULE_CACHE[module_name]
+    try:
+        module = import_module(module_name)
+    except ImportError:  # pragma: no cover
+        _logger.log(logging.INFO, f"Unable to import {module_name}.")
+        module = None
+    _MODULE_CACHE[module_name] = module
+    return module
+def as_numpy(array: ArrayLike | None) -> NDArray[Any]:
+    """Converts an ArrayLike to Numpy array without copying (if possible)"""
+    return to_numpy(array, copy=False)
+def to_numpy(array: ArrayLike | None, copy: bool = True) -> NDArray[Any]:
+    """Converts an ArrayLike to new Numpy array"""
+    if array is None:
+        return np.ndarray([])
+    if isinstance(array, np.ndarray):
+        return array.copy() if copy else array
+    if array.__class__.__module__.startswith("tensorflow"):  # pragma: no cover - removed tf from deps
+        tf = _try_import("tensorflow")
+        if tf and tf.is_tensor(array):
+            _logger.log(logging.INFO, "Converting Tensorflow array to NumPy array.")
+            return array.numpy().copy() if copy else array.numpy()  # type: ignore
+    if array.__class__.__module__.startswith("torch"):
+        torch = _try_import("torch")
+        if torch and isinstance(array, torch.Tensor):
+            _logger.log(logging.INFO, "Converting PyTorch array to NumPy array.")
+            numpy = array.detach().cpu().numpy().copy() if copy else array.detach().cpu().numpy()  # type: ignore
+            _logger.log(logging.DEBUG, LogMessage(lambda: f"{str(array)} -> {str(numpy)}"))
+            return numpy
+    return np.array(array) if copy else np.asarray(array)
+def to_numpy_iter(iterable: Iterable[ArrayLike]) -> Iterator[NDArray[Any]]:
+    """Yields an iterator of numpy arrays from an ArrayLike"""
+    for array in iterable:
+        yield to_numpy(array)
+@overload
+def ensure_embeddings(
+    embeddings: T,
+    dtype: torch.dtype,
+    unit_interval: Literal[True, False, "force"] = False,
+) -> torch.Tensor: ...
+@overload
+def ensure_embeddings(
+    embeddings: T,
+    dtype: type[_np_dtype],
+    unit_interval: Literal[True, False, "force"] = False,
+) -> NDArray[_np_dtype]: ...
+@overload
+def ensure_embeddings(
+    embeddings: T,
+    dtype: None,
+    unit_interval: Literal[True, False, "force"] = False,
+) -> T: ...
+def ensure_embeddings(
+    embeddings: T,
+    dtype: type[_np_dtype] | torch.dtype | None = None,
+    unit_interval: Literal[True, False, "force"] = False,
+) -> torch.Tensor | NDArray[_np_dtype] | T:
+    """
+    Validates the embeddings array and converts it to the specified type
+    Parameters
+    ----------
+    embeddings : ArrayLike
+        Embeddings array
+    dtype : numpy dtype or torch dtype or None, default None
+        The desired dtype of the output array, None to skip conversion
+    unit_interval : bool or "force", default False
+        Whether to validate or force the embeddings to unit interval
+    Returns
+    -------
+        Converted embeddings array
+    Raises
+    ------
+    ValueError
+        If the embeddings array is not 2D
+    ValueError
+        If the embeddings array is not unit interval [0, 1]
+    """
+    if isinstance(dtype, torch.dtype):
+        arr = torch.as_tensor(embeddings, dtype=dtype)
+    else:
+        arr = (
+            embeddings.detach().cpu().numpy().astype(dtype)
+            if isinstance(embeddings, torch.Tensor)
+            else np.asarray(embeddings, dtype=dtype)
+        )
+    if arr.ndim != 2:
+        raise ValueError(f"Expected a 2D array, but got a {arr.ndim}D array.")
+    if unit_interval:
+        arr_min, arr_max = arr.min(), arr.max()
+        if arr_min < 0 or arr_max > 1:
+            if unit_interval == "force":
+                warnings.warn("Embeddings are not unit interval [0, 1]. Forcing to unit interval.")
+                arr = (arr - arr_min) / (arr_max - arr_min)
+            else:
+                raise ValueError("Embeddings must be unit interval [0, 1].")
+    if dtype is None:
+        return embeddings
+    else:
+        return arr
+def flatten(array: ArrayLike) -> NDArray[Any]:
+    """
+    Flattens input array from (N, ... ) to (N, -1) where all samples N have all data in their last dimension
+    Parameters
+    ----------
+    X : NDArray, shape - (N, ... )
+        Input array
+    Returns
+    -------
+    NDArray, shape - (N, -1)
+    """
+    nparr = as_numpy(array)
+    return nparr.reshape((nparr.shape[0], -1))

dataeval 0.76.0__py3-none-any.whl → 0.81.0__py3-none-any.whl

dataeval 0.76.0__py3-none-any.whl → 0.81.0__py3-none-any.whl