dataeval 0.72.1__py3-none-any.whl → 0.73.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dataeval/__init__.py +4 -4
- dataeval/detectors/__init__.py +4 -3
- dataeval/detectors/drift/__init__.py +9 -10
- dataeval/{_internal/detectors → detectors}/drift/base.py +39 -91
- dataeval/{_internal/detectors → detectors}/drift/cvm.py +4 -3
- dataeval/{_internal/detectors → detectors}/drift/ks.py +4 -3
- dataeval/{_internal/detectors → detectors}/drift/mmd.py +23 -25
- dataeval/{_internal/detectors → detectors}/drift/torch.py +13 -11
- dataeval/{_internal/detectors → detectors}/drift/uncertainty.py +7 -5
- dataeval/detectors/drift/updates.py +61 -0
- dataeval/detectors/linters/__init__.py +3 -3
- dataeval/{_internal/detectors → detectors/linters}/clusterer.py +41 -39
- dataeval/{_internal/detectors → detectors/linters}/duplicates.py +19 -9
- dataeval/{_internal/detectors → detectors/linters}/merged_stats.py +3 -1
- dataeval/{_internal/detectors → detectors/linters}/outliers.py +14 -21
- dataeval/detectors/ood/__init__.py +6 -6
- dataeval/{_internal/detectors → detectors}/ood/ae.py +20 -12
- dataeval/detectors/ood/aegmm.py +66 -0
- dataeval/{_internal/detectors → detectors}/ood/base.py +33 -21
- dataeval/{_internal/detectors → detectors}/ood/llr.py +43 -33
- dataeval/detectors/ood/metadata_ks_compare.py +99 -0
- dataeval/detectors/ood/metadata_least_likely.py +119 -0
- dataeval/detectors/ood/metadata_ood_mi.py +92 -0
- dataeval/{_internal/detectors → detectors}/ood/vae.py +23 -17
- dataeval/detectors/ood/vaegmm.py +75 -0
- dataeval/interop.py +56 -0
- dataeval/metrics/__init__.py +1 -1
- dataeval/metrics/bias/__init__.py +4 -4
- dataeval/{_internal/metrics → metrics/bias}/balance.py +75 -13
- dataeval/{_internal/metrics → metrics/bias}/coverage.py +41 -7
- dataeval/{_internal/metrics → metrics/bias}/diversity.py +75 -18
- dataeval/metrics/bias/metadata.py +358 -0
- dataeval/{_internal/metrics → metrics/bias}/parity.py +54 -44
- dataeval/metrics/estimators/__init__.py +3 -3
- dataeval/{_internal/metrics → metrics/estimators}/ber.py +25 -22
- dataeval/{_internal/metrics → metrics/estimators}/divergence.py +11 -12
- dataeval/{_internal/metrics → metrics/estimators}/uap.py +5 -3
- dataeval/metrics/stats/__init__.py +7 -7
- dataeval/{_internal/metrics → metrics}/stats/base.py +59 -35
- dataeval/{_internal/metrics → metrics}/stats/boxratiostats.py +18 -14
- dataeval/{_internal/metrics → metrics}/stats/datasetstats.py +18 -16
- dataeval/{_internal/metrics → metrics}/stats/dimensionstats.py +9 -7
- dataeval/metrics/stats/hashstats.py +156 -0
- dataeval/{_internal/metrics → metrics}/stats/labelstats.py +5 -3
- dataeval/{_internal/metrics → metrics}/stats/pixelstats.py +9 -8
- dataeval/{_internal/metrics → metrics}/stats/visualstats.py +10 -9
- dataeval/{_internal/output.py → output.py} +26 -6
- dataeval/utils/__init__.py +8 -3
- dataeval/utils/image.py +71 -0
- dataeval/utils/lazy.py +26 -0
- dataeval/utils/metadata.py +258 -0
- dataeval/utils/shared.py +151 -0
- dataeval/{_internal → utils}/split_dataset.py +98 -33
- dataeval/utils/tensorflow/__init__.py +7 -6
- dataeval/{_internal/models/tensorflow → utils/tensorflow/_internal}/gmm.py +8 -2
- dataeval/{_internal/models/tensorflow/losses.py → utils/tensorflow/_internal/loss.py} +28 -18
- dataeval/{_internal/models/tensorflow/pixelcnn.py → utils/tensorflow/_internal/models.py} +387 -97
- dataeval/{_internal/models/tensorflow → utils/tensorflow/_internal}/trainer.py +15 -6
- dataeval/{_internal/models/tensorflow → utils/tensorflow/_internal}/utils.py +84 -85
- dataeval/utils/tensorflow/loss/__init__.py +6 -2
- dataeval/utils/torch/__init__.py +7 -3
- dataeval/{_internal/models/pytorch → utils/torch}/blocks.py +19 -14
- dataeval/{_internal → utils/torch}/datasets.py +48 -42
- dataeval/utils/torch/models.py +138 -0
- dataeval/{_internal/models/pytorch/autoencoder.py → utils/torch/trainer.py} +7 -136
- dataeval/{_internal → utils/torch}/utils.py +3 -1
- dataeval/workflows/__init__.py +1 -1
- dataeval/{_internal/workflows → workflows}/sufficiency.py +39 -34
- {dataeval-0.72.1.dist-info → dataeval-0.73.0.dist-info}/METADATA +4 -3
- dataeval-0.73.0.dist-info/RECORD +73 -0
- dataeval/_internal/detectors/__init__.py +0 -0
- dataeval/_internal/detectors/drift/__init__.py +0 -0
- dataeval/_internal/detectors/ood/__init__.py +0 -0
- dataeval/_internal/detectors/ood/aegmm.py +0 -78
- dataeval/_internal/detectors/ood/vaegmm.py +0 -89
- dataeval/_internal/interop.py +0 -49
- dataeval/_internal/metrics/__init__.py +0 -0
- dataeval/_internal/metrics/stats/hashstats.py +0 -75
- dataeval/_internal/metrics/utils.py +0 -447
- dataeval/_internal/models/__init__.py +0 -0
- dataeval/_internal/models/pytorch/__init__.py +0 -0
- dataeval/_internal/models/pytorch/utils.py +0 -67
- dataeval/_internal/models/tensorflow/__init__.py +0 -0
- dataeval/_internal/models/tensorflow/autoencoder.py +0 -320
- dataeval/_internal/workflows/__init__.py +0 -0
- dataeval/detectors/drift/kernels/__init__.py +0 -10
- dataeval/detectors/drift/updates/__init__.py +0 -8
- dataeval/utils/tensorflow/models/__init__.py +0 -9
- dataeval/utils/tensorflow/recon/__init__.py +0 -3
- dataeval/utils/torch/datasets/__init__.py +0 -12
- dataeval/utils/torch/models/__init__.py +0 -11
- dataeval/utils/torch/trainer/__init__.py +0 -7
- dataeval-0.72.1.dist-info/RECORD +0 -81
- {dataeval-0.72.1.dist-info → dataeval-0.73.0.dist-info}/LICENSE.txt +0 -0
- {dataeval-0.72.1.dist-info → dataeval-0.73.0.dist-info}/WHEEL +0 -0
dataeval/{_internal/models/tensorflow → utils/tensorflow/_internal}/trainer.py

@@ -8,20 +8,27 @@ Licensed under Apache Software License (Apache 2.0)
 
 from __future__ import annotations
 
-from typing import Callable, Iterable, cast
+from typing import TYPE_CHECKING, Callable, Iterable, cast
 
 import numpy as np
-import tensorflow as tf
-import tf_keras as keras
 from numpy.typing import NDArray
 
+from dataeval.utils.lazy import lazyload
+
+if TYPE_CHECKING:
+    import tensorflow as tf
+    import tf_keras as keras
+else:
+    tf = lazyload("tensorflow")
+    keras = lazyload("tf_keras")
+
 
 def trainer(
     model: keras.Model,
     x_train: NDArray,
     y_train: NDArray | None = None,
     loss_fn: Callable[..., tf.Tensor] | None = None,
-    optimizer: keras.optimizers.Optimizer =
+    optimizer: keras.optimizers.Optimizer | None = None,
     preprocess_fn: Callable[[tf.Tensor], tf.Tensor] | None = None,
     epochs: int = 20,
     reg_loss_fn: Callable[[keras.Model], tf.Tensor] = (lambda _: cast(tf.Tensor, tf.Variable(0, dtype=tf.float32))),
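The import block above replaces eager `import tensorflow` with a `TYPE_CHECKING` guard plus a `lazyload` helper, so static type checkers still see the real modules while the runtime import is deferred until first use. A minimal sketch of what such a helper could look like; the actual implementation in the new dataeval/utils/lazy.py is not shown in this diff and may differ:

from __future__ import annotations

import importlib
from types import ModuleType


class _LazyModule:
    """Proxy that defers the real import until first attribute access."""

    def __init__(self, name: str) -> None:
        self._name = name
        self._module: ModuleType | None = None

    def __getattr__(self, attr: str):
        # Import on first attribute access, then delegate to the real module.
        if self._module is None:
            self._module = importlib.import_module(self._name)
        return getattr(self._module, attr)


def lazyload(name: str) -> ModuleType:
    return _LazyModule(name)  # type: ignore[return-value]

Deferring the TensorFlow import keeps `import dataeval` fast and functional even when the optional tensorflow extra is not installed.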
@@ -58,9 +65,11 @@ def trainer(
         Whether to print training progress.
     """
     loss_fn = loss_fn() if isinstance(loss_fn, type) else loss_fn
-    optimizer =
+    optimizer = keras.optimizers.Adam() if optimizer is None else optimizer
 
-    train_data =
+    train_data = (
+        x_train.astype(np.float32) if y_train is None else (x_train.astype(np.float32), y_train.astype(np.float32))
+    )
     dataset = tf.data.Dataset.from_tensor_slices(train_data)
     dataset = dataset.shuffle(buffer_size=buffer_size).batch(batch_size)
     n_minibatch = len(dataset)
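The truncated old signature suggests the optimizer defaulted to an optimizer instance; such a default argument is instantiated once at import time and shared, along with its internal state, across every `trainer` call. Resolving `None` inside the body gives each call a fresh instance. A minimal illustration of the pattern, not dataeval code:

from __future__ import annotations

import tf_keras as keras


def resolve_optimizer(optimizer: keras.optimizers.Optimizer | None = None) -> keras.optimizers.Optimizer:
    # Each call gets its own Adam instance instead of one shared default.
    return keras.optimizers.Adam() if optimizer is None else optimizer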
dataeval/{_internal/models/tensorflow → utils/tensorflow/_internal}/utils.py

@@ -9,25 +9,24 @@ Licensed under Apache Software License (Apache 2.0)
 from __future__ import annotations
 
 import math
-from typing import Callable, Union, cast
+from typing import TYPE_CHECKING, Any, Callable, Literal, Union, cast
 
 import numpy as np
-import tensorflow as tf
-import tf_keras as keras
 from numpy.typing import NDArray
-from tensorflow._api.v2.nn import relu, softmax, tanh
-from tf_keras import Sequential
-from tf_keras.layers import (
-    Conv2D,
-    Conv2DTranspose,
-    Dense,
-    Flatten,
-    InputLayer,
-    Reshape,
-)
 
-from dataeval.
-
+from dataeval.utils.lazy import lazyload
+
+if TYPE_CHECKING:
+    import tensorflow as tf
+    import tensorflow._api.v2.nn as nn
+    import tf_keras as keras
+
+    import dataeval.utils.tensorflow._internal.models as tf_models
+else:
+    tf = lazyload("tensorflow")
+    nn = lazyload("tensorflow._api.v2.nn")
+    keras = lazyload("tf_keras")
+    tf_models = lazyload("dataeval.utils.tensorflow._internal.models")
 
 
 def predict_batch(
@@ -95,47 +94,47 @@ def predict_batch(
     return out
 
 
-def
-    return Sequential(
+def get_default_encoder_net(input_shape: tuple[int, int, int], encoding_dim: int):
+    return keras.Sequential(
         [
-            InputLayer(input_shape=input_shape),
-            Conv2D(64, 4, strides=2, padding="same", activation=relu),
-            Conv2D(128, 4, strides=2, padding="same", activation=relu),
-            Conv2D(512, 4, strides=2, padding="same", activation=relu),
-            Flatten(),
-            Dense(encoding_dim),
+            keras.layers.InputLayer(input_shape=input_shape),
+            keras.layers.Conv2D(64, 4, strides=2, padding="same", activation=nn.relu),
+            keras.layers.Conv2D(128, 4, strides=2, padding="same", activation=nn.relu),
+            keras.layers.Conv2D(512, 4, strides=2, padding="same", activation=nn.relu),
+            keras.layers.Flatten(),
+            keras.layers.Dense(encoding_dim),
         ]
     )
 
 
-def
-    return Sequential(
+def get_default_decoder_net(input_shape: tuple[int, int, int], encoding_dim: int):
+    return keras.Sequential(
         [
-            InputLayer(input_shape=(encoding_dim,)),
-            Dense(4 * 4 * 128),
-            Reshape(target_shape=(4, 4, 128)),
-            Conv2DTranspose(256, 4, strides=2, padding="same", activation=relu),
-            Conv2DTranspose(64, 4, strides=2, padding="same", activation=relu),
-            Flatten(),
-            Dense(math.prod(input_shape)),
-            Reshape(target_shape=input_shape),
+            keras.layers.InputLayer(input_shape=(encoding_dim,)),
+            keras.layers.Dense(4 * 4 * 128),
+            keras.layers.Reshape(target_shape=(4, 4, 128)),
+            keras.layers.Conv2DTranspose(256, 4, strides=2, padding="same", activation=nn.relu),
+            keras.layers.Conv2DTranspose(64, 4, strides=2, padding="same", activation=nn.relu),
+            keras.layers.Flatten(),
+            keras.layers.Dense(math.prod(input_shape)),
+            keras.layers.Reshape(target_shape=input_shape),
         ]
     )
 
 
 def create_model(
-    model_type: AE
+    model_type: Literal["AE", "AEGMM", "PixelCNN", "VAE", "VAEGMM"],
     input_shape: tuple[int, int, int],
     encoding_dim: int | None = None,
     n_gmm: int | None = None,
     gmm_latent_dim: int | None = None,
-):
+) -> Any:
     """
     Create a default model for the specified model type.
 
     Parameters
     ----------
-    model_type :
+    model_type : Literal["AE", "AEGMM", "PixelCNN", "VAE", "VAEGMM"]
         The model type to create.
     input_shape : Tuple[int, int, int]
         The input shape of the data used.
@@ -148,93 +147,93 @@ def create_model(
     """
     input_dim = math.prod(input_shape)
     encoding_dim = int(math.pow(2, int(input_dim.bit_length() * 0.8)) if encoding_dim is None else encoding_dim)
-    if model_type == AE:
-        return AE(
-
-
+    if model_type == "AE":
+        return tf_models.AE(
+            get_default_encoder_net(input_shape, encoding_dim),
+            get_default_decoder_net(input_shape, encoding_dim),
         )
 
-    if model_type == VAE:
-        return VAE(
-
-
+    if model_type == "VAE":
+        return tf_models.VAE(
+            get_default_encoder_net(input_shape, encoding_dim),
+            get_default_decoder_net(input_shape, encoding_dim),
             encoding_dim,
         )
 
-    if model_type == AEGMM:
+    if model_type == "AEGMM":
         n_gmm = 2 if n_gmm is None else n_gmm
         gmm_latent_dim = 1 if gmm_latent_dim is None else gmm_latent_dim
         # The outlier detector is an encoder/decoder architecture
-        encoder_net = Sequential(
+        encoder_net = keras.Sequential(
             [
-                Flatten(),
-                InputLayer(input_shape=(input_dim,)),
-                Dense(60, activation=tanh),
-                Dense(30, activation=tanh),
-                Dense(10, activation=tanh),
-                Dense(gmm_latent_dim, activation=None),
+                keras.layers.Flatten(),
+                keras.layers.InputLayer(input_shape=(input_dim,)),
+                keras.layers.Dense(60, activation=nn.tanh),
+                keras.layers.Dense(30, activation=nn.tanh),
+                keras.layers.Dense(10, activation=nn.tanh),
+                keras.layers.Dense(gmm_latent_dim, activation=None),
             ]
         )
         # Here we define the decoder
-        decoder_net = Sequential(
+        decoder_net = keras.Sequential(
             [
-                InputLayer(input_shape=(gmm_latent_dim,)),
-                Dense(10, activation=tanh),
-                Dense(30, activation=tanh),
-                Dense(60, activation=tanh),
-                Dense(input_dim, activation=None),
-                Reshape(target_shape=input_shape),
+                keras.layers.InputLayer(input_shape=(gmm_latent_dim,)),
+                keras.layers.Dense(10, activation=nn.tanh),
+                keras.layers.Dense(30, activation=nn.tanh),
+                keras.layers.Dense(60, activation=nn.tanh),
+                keras.layers.Dense(input_dim, activation=None),
+                keras.layers.Reshape(target_shape=input_shape),
             ]
         )
         # GMM autoencoders have a density network too
-        gmm_density_net = Sequential(
+        gmm_density_net = keras.Sequential(
             [
-                InputLayer(input_shape=(gmm_latent_dim + 2,)),
-                Dense(10, activation=tanh),
-                Dense(n_gmm, activation=softmax),
+                keras.layers.InputLayer(input_shape=(gmm_latent_dim + 2,)),
+                keras.layers.Dense(10, activation=nn.tanh),
+                keras.layers.Dense(n_gmm, activation=nn.softmax),
             ]
         )
-        return AEGMM(
+        return tf_models.AEGMM(
            encoder_net=encoder_net,
            decoder_net=decoder_net,
            gmm_density_net=gmm_density_net,
            n_gmm=n_gmm,
        )
 
-    if model_type == VAEGMM:
+    if model_type == "VAEGMM":
         n_gmm = 2 if n_gmm is None else n_gmm
         gmm_latent_dim = 2 if gmm_latent_dim is None else gmm_latent_dim
         # The outlier detector is an encoder/decoder architecture
         # Here we define the encoder
-        encoder_net = Sequential(
+        encoder_net = keras.Sequential(
             [
-                Flatten(),
-                InputLayer(input_shape=(input_dim,)),
-                Dense(20, activation=relu),
-                Dense(15, activation=relu),
-                Dense(7, activation=relu),
+                keras.layers.Flatten(),
+                keras.layers.InputLayer(input_shape=(input_dim,)),
+                keras.layers.Dense(20, activation=nn.relu),
+                keras.layers.Dense(15, activation=nn.relu),
+                keras.layers.Dense(7, activation=nn.relu),
             ]
         )
         # Here we define the decoder
-        decoder_net = Sequential(
+        decoder_net = keras.Sequential(
             [
-                InputLayer(input_shape=(gmm_latent_dim,)),
-                Dense(7, activation=relu),
-                Dense(15, activation=relu),
-                Dense(20, activation=relu),
-                Dense(input_dim, activation=None),
-                Reshape(target_shape=input_shape),
+                keras.layers.InputLayer(input_shape=(gmm_latent_dim,)),
+                keras.layers.Dense(7, activation=nn.relu),
+                keras.layers.Dense(15, activation=nn.relu),
+                keras.layers.Dense(20, activation=nn.relu),
+                keras.layers.Dense(input_dim, activation=None),
+                keras.layers.Reshape(target_shape=input_shape),
             ]
         )
         # GMM autoencoders have a density network too
-        gmm_density_net = Sequential(
+        gmm_density_net = keras.Sequential(
             [
-                InputLayer(input_shape=(gmm_latent_dim + 2,)),
-                Dense(10, activation=relu),
-                Dense(n_gmm, activation=softmax),
+                keras.layers.InputLayer(input_shape=(gmm_latent_dim + 2,)),
+                keras.layers.Dense(10, activation=nn.relu),
+                keras.layers.Dense(n_gmm, activation=nn.softmax),
             ]
         )
-        return VAEGMM(
+        return tf_models.VAEGMM(
             encoder_net=encoder_net,
             decoder_net=decoder_net,
             gmm_density_net=gmm_density_net,
@@ -242,8 +241,8 @@ def create_model(
             latent_dim=gmm_latent_dim,
         )
 
-    if model_type == PixelCNN:
-        return PixelCNN(
+    if model_type == "PixelCNN":
+        return tf_models.PixelCNN(
             image_shape=input_shape,
             num_resnet=5,
             num_hierarchies=2,
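`create_model` now dispatches on string literals rather than on the model classes themselves, so callers no longer import `AE`, `PixelCNN`, and friends directly. A usage sketch; the import path below is the `_internal` module this diff's file list shows the function living in, and any public re-export path is an assumption:

from dataeval.utils.tensorflow._internal.utils import create_model

# Default convolutional autoencoder for 32x32 RGB inputs, selected by name.
model = create_model("AE", input_shape=(32, 32, 3))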
dataeval/utils/tensorflow/loss/__init__.py

@@ -1,7 +1,11 @@
 from dataeval import _IS_TENSORFLOW_AVAILABLE
-from dataeval._internal.models.tensorflow.losses import Elbo, LossGMM
 
 __all__ = []
 
+
 if _IS_TENSORFLOW_AVAILABLE:
-
+    from dataeval.utils.tensorflow._internal.loss import Elbo, LossGMM
+
+    __all__ = ["Elbo", "LossGMM"]
+
+del _IS_TENSORFLOW_AVAILABLE
dataeval/utils/torch/__init__.py
CHANGED
@@ -6,16 +6,20 @@ to create a seamless integration between custom models and DataEval's metrics.
 """
 
 from dataeval import _IS_TORCH_AVAILABLE, _IS_TORCHVISION_AVAILABLE
-from dataeval._internal.utils import read_dataset
 
 __all__ = []
 
 if _IS_TORCH_AVAILABLE:
-    from . import models, trainer
+    from dataeval.utils.torch import models, trainer
+    from dataeval.utils.torch.utils import read_dataset
 
     __all__ += ["read_dataset", "models", "trainer"]
 
 if _IS_TORCHVISION_AVAILABLE:
-    from . import datasets
+    from dataeval.utils.torch import datasets
 
     __all__ += ["datasets"]
+
+
+del _IS_TORCH_AVAILABLE
+del _IS_TORCHVISION_AVAILABLE
dataeval/{_internal/models/pytorch → utils/torch}/blocks.py

@@ -1,3 +1,7 @@
+from typing import Any
+
+__all__ = []
+
 import torch.nn as nn
 
 
@@ -8,21 +12,22 @@ class Conv(nn.Module):
 
     def __init__(
         self,
-        in_channels,
-        out_channels,
-        k=1,
-        s=1,
-        p=0,
-        activation="relu",
-        norm="instance",
-    ):
+        in_channels: int,
+        out_channels: int,
+        k: int = 1,
+        s: int = 1,
+        p: int = 0,
+        activation: str = "relu",
+        norm: str = "instance",
+    ) -> None:
         super().__init__()
-
-
-
-
+        self.module: nn.Sequential = nn.Sequential(
+            nn.Conv2d(in_channels, out_channels, kernel_size=k, stride=s, padding=p),
+            self.get_norm_func(norm=norm, out_channels=out_channels),
+            self.get_activation_func(activation=activation),
+        )
 
-    def get_norm_func(self, norm: str, out_channels) -> nn.Module:
+    def get_norm_func(self, norm: str, out_channels: int) -> nn.Module:
         if norm == "batch":
             return nn.BatchNorm2d(out_channels)
         if norm == "instance":
@@ -42,5 +47,5 @@ class Conv(nn.Module):
             return nn.Tanh()
         return nn.Identity()
 
-    def forward(self, x):
+    def forward(self, x: Any) -> Any:
         return self.module(x)
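A quick usage sketch of the now fully annotated `Conv` block; the import path follows this diff's file list, and the layer sizes are illustrative:

import torch

from dataeval.utils.torch.blocks import Conv

# conv -> instance norm -> relu, applied to a 1x3x32x32 batch
block = Conv(in_channels=3, out_channels=16, k=3, s=1, p=1, activation="relu", norm="instance")
out = block(torch.randn(1, 3, 32, 32))  # shape (1, 16, 32, 32)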
dataeval/{_internal → utils/torch}/datasets.py

@@ -1,5 +1,7 @@
 from __future__ import annotations
 
+__all__ = ["MNIST", "CIFAR10", "VOCDetection"]
+
 import hashlib
 import os
 import zipfile
@@ -11,7 +13,7 @@ import numpy as np
 import requests
 from numpy.typing import NDArray
 from torch.utils.data import Dataset
-from torchvision.datasets import CIFAR10, VOCDetection
+from torchvision.datasets import CIFAR10, VOCDetection
 
 ClassStringMap = Literal["zero", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine"]
 TClassMap = TypeVar("TClassMap", ClassStringMap, int, list[ClassStringMap], list[int])
@@ -50,6 +52,7 @@ def _get_file(
     file_hash: str | None = None,
     verbose: bool = True,
     md5: bool = False,
+    timeout: int = 60,
 ):
     fpath = os.path.join(root, fname)
     download = True
@@ -64,16 +67,16 @@ def _get_file(
     try:
         error_msg = "URL fetch failure on {}: {} -- {}"
         try:
-            with requests.get(origin, stream=True, timeout=
+            with requests.get(origin, stream=True, timeout=timeout) as r:
                 r.raise_for_status()
                 with open(fpath, "wb") as f:
                     for chunk in r.iter_content(chunk_size=8192):
                         if chunk:
                             f.write(chunk)
         except requests.exceptions.HTTPError as e:
-            raise
+            raise RuntimeError(f"{error_msg.format(origin, e.response.status_code, e.response.reason)}") from e
         except requests.exceptions.RequestException as e:
-            raise
+            raise ValueError(f"{error_msg.format(origin, 'Unknown error', str(e))}") from e
     except (Exception, KeyboardInterrupt):
         if os.path.exists(fpath):
             os.remove(fpath)
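`_get_file` accepts a `file_hash` plus an `md5` switch (the MNIST `_resources` table further down pairs mnist.npz with a sha256 digest and mnist_c.zip with an md5 digest). A sketch of the chunked hash check this implies; the helper name is hypothetical, and dataeval's actual validation code is not shown in this diff:

import hashlib


def hash_matches(fpath: str, file_hash: str, md5: bool = False) -> bool:
    # Stream the file in chunks so large archives are never fully in memory.
    hasher = hashlib.md5() if md5 else hashlib.sha256()
    with open(fpath, "rb") as f:
        for chunk in iter(lambda: f.read(65536), b""):
            hasher.update(chunk)
    return hasher.hexdigest() == file_hash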
@@ -89,7 +92,7 @@ def _get_file(
     return fpath
 
 
-def
+def _check_exists(
     folder: str | Path,
     url: str,
     root: str | Path,
@@ -103,7 +106,7 @@ def check_exists(
     location = str(folder)
     if not os.path.exists(folder):
         if download:
-            location =
+            location = _download_dataset(url, root, fname, file_hash, verbose, md5)
         else:
             raise RuntimeError("Dataset not found. You can use download=True to download it")
     else:
@@ -112,7 +115,7 @@ def check_exists(
     return location
 
 
-def
+def _download_dataset(
     url: str, root: str | Path, fname: str, file_hash: str, verbose: bool = True, md5: bool = False
 ) -> str:
     """Code to download mnist and corruptions, originates from tensorflow_datasets (tfds):
@@ -131,11 +134,11 @@ def download_dataset(
         md5=md5,
     )
     if md5:
-        folder =
+        folder = _extract_archive(fpath, root, remove_finished=True)
     return folder
 
 
-def
+def _extract_archive(
     from_path: str | Path,
     to_path: str | Path | None = None,
     remove_finished: bool = False,
@@ -163,13 +166,13 @@ def extract_archive(
     return str(to_path)
 
 
-def
+def _subselect(arr: NDArray, count: int, from_back: bool = False):
     if from_back:
         return arr[-count:]
     return arr[:count]
 
 
-class MNIST(Dataset):
+class MNIST(Dataset[tuple[NDArray[np.float64], int]]):
     """MNIST Dataset and Corruptions.
 
     Args:
@@ -211,17 +214,17 @@ class MNIST(Dataset):
         If True, outputs print statements.
     """
 
-
+    _mirrors: tuple[str, ...] = (
         "https://storage.googleapis.com/tensorflow/tf-keras-datasets/",
         "https://zenodo.org/record/3239543/files/",
-
+    )
 
-
+    _resources: tuple[tuple[str, str], ...] = (
         ("mnist.npz", "731c5ac602752760c8e48fbffcf8c3b850d9dc2a2aedcf2cc48468fc17b673d1"),
         ("mnist_c.zip", "4b34b33045869ee6d424616cd3a65da3"),
-
+    )
 
-    class_dict = {
+    class_dict: dict[str, int] = {
         "zero": 0,
         "one": 1,
         "two": 2,
@@ -267,43 +270,46 @@ class MNIST(Dataset):
         self.randomize = randomize
         self.from_back = slice_back
         self.verbose = verbose
+        self.data: NDArray[np.float64]
+        self.targets: NDArray[np.int_]
+        self.size: int
 
-        self.
+        self._class_set = []
         if classes is not None:
             if not isinstance(classes, list):
                 classes = [classes]  # type: ignore
 
             for val in classes:  # type: ignore
                 if isinstance(val, int) and 0 <= val < 10:
-                    self.
+                    self._class_set.append(val)
                 elif isinstance(val, str):
-                    self.
-            self.
+                    self._class_set.append(self.class_dict[val])
+            self._class_set = set(self._class_set)
 
-        if not self.
-            self.
+        if not self._class_set:
+            self._class_set = set(self.class_dict.values())
 
-        self.
+        self._num_classes = len(self._class_set)
 
         if self.corruption is None:
-            file_resource = self.
-            mirror = self.
+            file_resource = self._resources[0]
+            mirror = self._mirrors[0]
             md5 = False
         else:
             if self.corruption == "identity" and verbose:
                 print("Identity is not a corrupted dataset but the original MNIST dataset.")
-            file_resource = self.
-            mirror = self.
+            file_resource = self._resources[1]
+            mirror = self._mirrors[1]
             md5 = True
-
+        _check_exists(self.mnist_folder, mirror, self.root, file_resource[0], file_resource[1], download, verbose, md5)
 
         self.data, self.targets = self._load_data()
 
         self._augmentations()
 
-    def _load_data(self):
+    def _load_data(self) -> tuple[NDArray[np.float64], NDArray[np.int64]]:
         if self.corruption is None:
-            image_file = self.
+            image_file = self._resources[0][0]
             data, targets = self._read_normal_file(os.path.join(self.mnist_folder, image_file))
         else:
             image_file = f"{'train' if self.train else 'test'}_images.npy"
@@ -329,27 +335,27 @@ class MNIST(Dataset):
             self.data = self.data[shuffled_indices]
             self.targets = self.targets[shuffled_indices]
 
-        if not self.balance and self.
+        if not self.balance and self._num_classes > self.size:
             if self.size > 0:
-                self.data =
-                self.targets =
+                self.data = _subselect(self.data, self.size, self.from_back)
+                self.targets = _subselect(self.targets, self.size, self.from_back)
         else:
-            label_dict = {label: np.where(self.targets == label)[0] for label in self.
+            label_dict = {label: np.where(self.targets == label)[0] for label in self._class_set}
             min_label_count = min(len(indices) for indices in label_dict.values())
 
-            self.
+            self._per_class_count = int(np.ceil(self.size / self._num_classes)) if self.size > 0 else min_label_count
 
-            if self.
-                self.
+            if self._per_class_count > min_label_count:
+                self._per_class_count = min_label_count
                 if not self.balance and self.verbose:
                     warn(
-                        f"Because of dataset limitations, only {min_label_count*self.
+                        f"Because of dataset limitations, only {min_label_count*self._num_classes} samples "
                         f"will be returned, instead of the desired {self.size}."
                     )
 
-            all_indices = np.empty(shape=(self.
-            for i, label in enumerate(self.
-                all_indices[i] =
+            all_indices: NDArray[np.int_] = np.empty(shape=(self._num_classes, self._per_class_count), dtype=np.int_)
+            for i, label in enumerate(self._class_set):
+                all_indices[i] = _subselect(label_dict[label], self._per_class_count, self.from_back)
             self.data = np.vstack(self.data[all_indices.T])  # type: ignore
             self.targets = np.hstack(self.targets[all_indices.T])  # type: ignore
 
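The balancing branch above requests `ceil(size / num_classes)` samples per class and caps that at the rarest class's count, which is exactly the shortfall the warning message reports. A worked example of the arithmetic:

import numpy as np

size, num_classes, min_label_count = 100, 10, 8
per_class = int(np.ceil(size / num_classes))  # 10 requested per class
per_class = min(per_class, min_label_count)   # capped at 8 by the rarest class
print(per_class * num_classes)                # 80 samples returned, not the desired 100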
@@ -370,7 +376,7 @@ class MNIST(Dataset):
         if self.flatten and self.channels is None:
             self.data = self.data.reshape(self.data.shape[0], -1)
 
-    def __getitem__(self, index: int) -> tuple[NDArray, int]:
+    def __getitem__(self, index: int) -> tuple[NDArray[np.float64], int]:
         """
         Args:
             index (int): Index