dataeval 0.72.1__py3-none-any.whl → 0.72.2__py3-none-any.whl
This diff compares two publicly released versions of the package as they appear in their public registries. It is provided for informational purposes only.
- dataeval/__init__.py +4 -4
- dataeval/detectors/__init__.py +4 -3
- dataeval/detectors/drift/__init__.py +9 -10
- dataeval/{_internal/detectors → detectors}/drift/base.py +39 -91
- dataeval/{_internal/detectors → detectors}/drift/cvm.py +4 -3
- dataeval/{_internal/detectors → detectors}/drift/ks.py +4 -3
- dataeval/{_internal/detectors → detectors}/drift/mmd.py +23 -25
- dataeval/{_internal/detectors → detectors}/drift/torch.py +13 -11
- dataeval/{_internal/detectors → detectors}/drift/uncertainty.py +7 -5
- dataeval/detectors/drift/updates.py +61 -0
- dataeval/detectors/linters/__init__.py +3 -3
- dataeval/{_internal/detectors → detectors/linters}/clusterer.py +41 -39
- dataeval/{_internal/detectors → detectors/linters}/duplicates.py +19 -9
- dataeval/{_internal/detectors → detectors/linters}/merged_stats.py +3 -1
- dataeval/{_internal/detectors → detectors/linters}/outliers.py +14 -21
- dataeval/detectors/ood/__init__.py +6 -6
- dataeval/{_internal/detectors → detectors}/ood/ae.py +7 -7
- dataeval/{_internal/detectors → detectors}/ood/aegmm.py +9 -29
- dataeval/{_internal/detectors → detectors}/ood/base.py +24 -18
- dataeval/{_internal/detectors → detectors}/ood/llr.py +24 -20
- dataeval/detectors/ood/metadata_ks_compare.py +99 -0
- dataeval/detectors/ood/metadata_least_likely.py +119 -0
- dataeval/detectors/ood/metadata_ood_mi.py +92 -0
- dataeval/{_internal/detectors → detectors}/ood/vae.py +10 -12
- dataeval/{_internal/detectors → detectors}/ood/vaegmm.py +10 -32
- dataeval/{_internal/interop.py → interop.py} +12 -7
- dataeval/metrics/__init__.py +1 -1
- dataeval/metrics/bias/__init__.py +4 -4
- dataeval/{_internal/metrics → metrics/bias}/balance.py +75 -9
- dataeval/{_internal/metrics → metrics/bias}/coverage.py +6 -4
- dataeval/{_internal/metrics → metrics/bias}/diversity.py +48 -14
- dataeval/metrics/bias/metadata.py +275 -0
- dataeval/{_internal/metrics → metrics/bias}/parity.py +12 -10
- dataeval/metrics/estimators/__init__.py +3 -3
- dataeval/{_internal/metrics → metrics/estimators}/ber.py +25 -22
- dataeval/{_internal/metrics → metrics/estimators}/divergence.py +11 -12
- dataeval/{_internal/metrics → metrics/estimators}/uap.py +5 -3
- dataeval/metrics/stats/__init__.py +7 -7
- dataeval/{_internal/metrics → metrics}/stats/base.py +59 -35
- dataeval/{_internal/metrics → metrics}/stats/boxratiostats.py +18 -14
- dataeval/{_internal/metrics → metrics}/stats/datasetstats.py +18 -16
- dataeval/{_internal/metrics → metrics}/stats/dimensionstats.py +9 -7
- dataeval/metrics/stats/hashstats.py +156 -0
- dataeval/{_internal/metrics → metrics}/stats/labelstats.py +5 -3
- dataeval/{_internal/metrics → metrics}/stats/pixelstats.py +9 -8
- dataeval/{_internal/metrics → metrics}/stats/visualstats.py +10 -9
- dataeval/{_internal/output.py → output.py} +26 -6
- dataeval/utils/__init__.py +7 -3
- dataeval/utils/image.py +71 -0
- dataeval/utils/shared.py +151 -0
- dataeval/{_internal → utils}/split_dataset.py +98 -33
- dataeval/utils/tensorflow/__init__.py +7 -6
- dataeval/{_internal/models/tensorflow → utils/tensorflow/_internal}/autoencoder.py +60 -64
- dataeval/{_internal/models/tensorflow/losses.py → utils/tensorflow/_internal/loss.py} +9 -8
- dataeval/{_internal/models/tensorflow → utils/tensorflow/_internal}/pixelcnn.py +16 -20
- dataeval/{_internal/models/tensorflow → utils/tensorflow/_internal}/trainer.py +3 -1
- dataeval/{_internal/models/tensorflow → utils/tensorflow/_internal}/utils.py +17 -17
- dataeval/utils/tensorflow/loss/__init__.py +6 -2
- dataeval/utils/torch/__init__.py +7 -3
- dataeval/{_internal/models/pytorch → utils/torch}/blocks.py +19 -14
- dataeval/{_internal → utils/torch}/datasets.py +48 -42
- dataeval/utils/torch/models.py +138 -0
- dataeval/{_internal/models/pytorch/autoencoder.py → utils/torch/trainer.py} +7 -136
- dataeval/{_internal → utils/torch}/utils.py +3 -1
- dataeval/workflows/__init__.py +1 -1
- dataeval/{_internal/workflows → workflows}/sufficiency.py +39 -34
- {dataeval-0.72.1.dist-info → dataeval-0.72.2.dist-info}/METADATA +2 -1
- dataeval-0.72.2.dist-info/RECORD +72 -0
- dataeval/_internal/detectors/__init__.py +0 -0
- dataeval/_internal/detectors/drift/__init__.py +0 -0
- dataeval/_internal/detectors/ood/__init__.py +0 -0
- dataeval/_internal/metrics/__init__.py +0 -0
- dataeval/_internal/metrics/stats/hashstats.py +0 -75
- dataeval/_internal/metrics/utils.py +0 -447
- dataeval/_internal/models/__init__.py +0 -0
- dataeval/_internal/models/pytorch/__init__.py +0 -0
- dataeval/_internal/models/pytorch/utils.py +0 -67
- dataeval/_internal/models/tensorflow/__init__.py +0 -0
- dataeval/_internal/workflows/__init__.py +0 -0
- dataeval/detectors/drift/kernels/__init__.py +0 -10
- dataeval/detectors/drift/updates/__init__.py +0 -8
- dataeval/utils/tensorflow/models/__init__.py +0 -9
- dataeval/utils/tensorflow/recon/__init__.py +0 -3
- dataeval/utils/torch/datasets/__init__.py +0 -12
- dataeval/utils/torch/models/__init__.py +0 -11
- dataeval/utils/torch/trainer/__init__.py +0 -7
- dataeval-0.72.1.dist-info/RECORD +0 -81
- /dataeval/{_internal/models/tensorflow → utils/tensorflow/_internal}/gmm.py +0 -0
- {dataeval-0.72.1.dist-info → dataeval-0.72.2.dist-info}/LICENSE.txt +0 -0
- {dataeval-0.72.1.dist-info → dataeval-0.72.2.dist-info}/WHEEL +0 -0
dataeval/{_internal/models/tensorflow → utils/tensorflow/_internal}/pixelcnn.py
CHANGED
@@ -34,13 +34,9 @@ from tensorflow_probability.python.internal import (
     tensorshape_util,
 )
 
-__all__ = [
-    "Shift",
-]
-
 
 class WeightNorm(keras.layers.Wrapper):
-    def __init__(self, layer, data_init: bool = True, **kwargs):
+    def __init__(self, layer, data_init: bool = True, **kwargs) -> None:
         """Layer wrapper to decouple magnitude and direction of the layer's weights.
 
         This wrapper reparameterizes a layer by decoupling the weight's
@@ -187,7 +183,7 @@ class WeightNorm(keras.layers.Wrapper):
 
 
 class Shift(bijector.Bijector):
-    def __init__(self, shift, validate_args=False, name="shift"):
+    def __init__(self, shift, validate_args=False, name="shift") -> None:
         """Instantiates the `Shift` bijector which computes `Y = g(X; shift) = X + shift`
         where `shift` is a numeric `Tensor`.
 
@@ -276,13 +272,13 @@ class PixelCNN(distribution.Distribution):
 
     def __init__(
         self,
-        image_shape: tuple,
-        conditional_shape: tuple | None = None,
+        image_shape: tuple[int, int, int],
+        conditional_shape: tuple[int, ...] | None = None,
         num_resnet: int = 5,
         num_hierarchies: int = 3,
         num_filters: int = 160,
         num_logistic_mix: int = 10,
-        receptive_field_dims: tuple = (3, 3),
+        receptive_field_dims: tuple[int, int] = (3, 3),
         dropout_p: float = 0.5,
         resnet_activation: str = "concat_elu",
         l2_weight: float = 0.0,
@@ -290,7 +286,7 @@ class PixelCNN(distribution.Distribution):
         use_data_init: bool = True,
         high: int = 255,
         low: int = 0,
-        dtype=tf.float32,
+        dtype: tf.DType = tf.float32,
     ) -> None:
         parameters = dict(locals())
         with tf.name_scope("PixelCNN") as name:
@@ -315,7 +311,7 @@ class PixelCNN(distribution.Distribution):
         self._high = tf.cast(high, self.dtype)
         self._low = tf.cast(low, self.dtype)
         self._num_logistic_mix = num_logistic_mix
-        self.network = _PixelCNNNetwork(
+        self._network = PixelCNNNetwork(
             dropout_p=dropout_p,
             num_resnet=num_resnet,
             num_hierarchies=num_hierarchies,
@@ -338,7 +334,7 @@ class PixelCNN(distribution.Distribution):
 
         self.image_shape = image_shape
         self.conditional_shape = conditional_shape
-        self.network.build(input_shape)
+        self._network.build(input_shape)
 
     def _make_mixture_dist(self, component_logits, locs, scales, return_per_feature: bool = False):
         """Builds a mixture of quantized logistic distributions.
@@ -455,7 +451,7 @@ class PixelCNN(distribution.Distribution):
         transformed_value = (2.0 * (value - self._low) / (self._high - self._low)) - 1.0
         inputs = transformed_value if conditional_input is None else [transformed_value, conditional_input]
 
-        params = self.network(inputs, training=training)
+        params = self._network(inputs, training=training)
 
         num_channels = self.event_shape[-1]
         if num_channels == 1:
@@ -554,7 +550,7 @@ class PixelCNN(distribution.Distribution):
             seed=seed,
         )
         inputs = samples_0 if conditional_input is None else [samples_0, h]
-        params_0 = self.network(inputs, training=training)
+        params_0 = self._network(inputs, training=training)
         samples_0 = self._sample_channels(*params_0, seed=seed)
 
         image_height, image_width, _ = tensorshape_util.as_list(self.event_shape)
@@ -579,7 +575,7 @@ class PixelCNN(distribution.Distribution):
             width, num_channels]`.
             """
             inputs = samples if conditional_input is None else [samples, h]
-            params = self.network(inputs, training=training)
+            params = self._network(inputs, training=training)
             samples_new = self._sample_channels(*params, seed=seed)
 
             # Update the current pixel
@@ -673,7 +669,7 @@ class PixelCNN(distribution.Distribution):
         return tf.TensorShape(self.image_shape)
 
 
-class _PixelCNNNetwork(keras.layers.Layer):
+class PixelCNNNetwork(keras.layers.Layer):
     """Keras `Layer` to parameterize a Pixel CNN++ distribution.
     This is a Keras implementation of the Pixel CNN++ network, as described in
     Salimans et al. (2017)[1] and van den Oord et al. (2016)[2].
@@ -699,12 +695,12 @@ class _PixelCNNNetwork(keras.layers.Layer):
         num_hierarchies: int = 3,
         num_filters: int = 160,
         num_logistic_mix: int = 10,
-        receptive_field_dims: tuple = (3, 3),
+        receptive_field_dims: tuple[int, int] = (3, 3),
         resnet_activation: str = "concat_elu",
         l2_weight: float = 0.0,
         use_weight_norm: bool = True,
         use_data_init: bool = True,
-        dtype=tf.float32,
+        dtype: tf.DType = tf.float32,
     ) -> None:
         """Initialize the :term:`neural network<Neural Network>` for the Pixel CNN++ distribution.
 
@@ -765,7 +761,7 @@ class _PixelCNNNetwork(keras.layers.Layer):
         else:
             self._layer_wrapper = lambda layer: layer
 
-    def build(self, input_shape):
+    def build(self, input_shape: tuple[int, ...]) -> None:
         dtype = self.dtype
         if len(input_shape) == 2:
             batch_image_shape, batch_conditional_shape = input_shape
|
|
1040
1036
|
self._network = keras.Model(inputs=inputs, outputs=outputs)
|
1041
1037
|
super().build(input_shape)
|
1042
1038
|
|
1043
|
-
def call(self, inputs, training=None):
|
1039
|
+
def call(self, inputs: tf.Tensor, training: bool | None = None, mask: tf.Tensor | None = None) -> tf.Tensor:
|
1044
1040
|
"""Call the Pixel CNN network model.
|
1045
1041
|
|
1046
1042
|
Parameters
|
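The changes above tighten the constructor's shape annotations without changing runtime behavior. As a hedged sketch of instantiating the distribution under the new annotations (the shape value is illustrative, not taken from this diff):

```python
# Illustrative only: image_shape below is an assumption; the other values are
# the defaults visible in the diff (num_resnet=5, receptive_field_dims=(3, 3)).
import tensorflow as tf
from dataeval.utils.tensorflow._internal.pixelcnn import PixelCNN

dist = PixelCNN(
    image_shape=(28, 28, 1),        # now tuple[int, int, int] instead of bare tuple
    conditional_shape=None,         # now tuple[int, ...] | None
    receptive_field_dims=(3, 3),    # now tuple[int, int]
    dtype=tf.float32,               # now annotated as tf.DType
)
```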
dataeval/{_internal/models/tensorflow → utils/tensorflow/_internal}/trainer.py
CHANGED
@@ -60,7 +60,9 @@ def trainer(
     loss_fn = loss_fn() if isinstance(loss_fn, type) else loss_fn
     optimizer = optimizer() if isinstance(optimizer, type) else optimizer
 
-    train_data = x_train if y_train is None else (x_train, y_train)
+    train_data = (
+        x_train.astype(np.float32) if y_train is None else (x_train.astype(np.float32), y_train.astype(np.float32))
+    )
     dataset = tf.data.Dataset.from_tensor_slices(train_data)
     dataset = dataset.shuffle(buffer_size=buffer_size).batch(batch_size)
     n_minibatch = len(dataset)
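The added lines force the training arrays to float32 before they reach the tf.data pipeline. A numpy-only sketch of the same coercion (input values are illustrative) that runs standalone:

```python
# Mirrors the coercion added above, outside of TensorFlow.
import numpy as np

x_train = np.zeros((4, 8, 8, 1), dtype=np.uint8)  # e.g. raw image bytes
y_train = None

train_data = (
    x_train.astype(np.float32) if y_train is None else (x_train.astype(np.float32), y_train.astype(np.float32))
)
assert train_data.dtype == np.float32  # integer inputs no longer leak into the dataset
```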
dataeval/{_internal/models/tensorflow → utils/tensorflow/_internal}/utils.py
CHANGED
@@ -9,7 +9,7 @@ Licensed under Apache Software License (Apache 2.0)
 from __future__ import annotations
 
 import math
-from typing import Callable, Union, cast
+from typing import Any, Callable, Literal, Union, cast
 
 import numpy as np
 import tensorflow as tf
@@ -26,8 +26,8 @@ from tf_keras.layers import (
     Reshape,
 )
 
-from dataeval._internal.models.tensorflow.autoencoder import AE, AEGMM, VAE, VAEGMM
-from dataeval._internal.models.tensorflow.pixelcnn import PixelCNN
+from dataeval.utils.tensorflow._internal.autoencoder import AE, AEGMM, VAE, VAEGMM
+from dataeval.utils.tensorflow._internal.pixelcnn import PixelCNN
 
 
 def predict_batch(
@@ -95,7 +95,7 @@ def predict_batch(
     return out
 
 
-def _get_default_encoder_net(input_shape: tuple[int, int, int], encoding_dim: int):
+def get_default_encoder_net(input_shape: tuple[int, int, int], encoding_dim: int):
     return Sequential(
         [
             InputLayer(input_shape=input_shape),
@@ -108,7 +108,7 @@ def _get_default_encoder_net(input_shape: tuple[int, int, int], encoding_dim: in
     )
 
 
-def _get_default_decoder_net(input_shape: tuple[int, int, int], encoding_dim: int):
+def get_default_decoder_net(input_shape: tuple[int, int, int], encoding_dim: int):
     return Sequential(
         [
             InputLayer(input_shape=(encoding_dim,)),
@@ -124,18 +124,18 @@ def _get_default_decoder_net(input_shape: tuple[int, int, int], encoding_dim: in
 
 
 def create_model(
-    model_type: AE | AEGMM | PixelCNN | VAE | VAEGMM,
+    model_type: Literal["AE", "AEGMM", "PixelCNN", "VAE", "VAEGMM"],
     input_shape: tuple[int, int, int],
     encoding_dim: int | None = None,
     n_gmm: int | None = None,
     gmm_latent_dim: int | None = None,
-):
+) -> Any:
     """
     Create a default model for the specified model type.
 
     Parameters
     ----------
-    model_type : AE | AEGMM | PixelCNN | VAE | VAEGMM
+    model_type : Literal["AE", "AEGMM", "PixelCNN", "VAE", "VAEGMM"]
         The model type to create.
     input_shape : Tuple[int, int, int]
         The input shape of the data used.
@@ -148,20 +148,20 @@ def create_model(
     """
     input_dim = math.prod(input_shape)
     encoding_dim = int(math.pow(2, int(input_dim.bit_length() * 0.8)) if encoding_dim is None else encoding_dim)
-    if model_type == AE:
+    if model_type == "AE":
         return AE(
-            _get_default_encoder_net(input_shape, encoding_dim),
-            _get_default_decoder_net(input_shape, encoding_dim),
+            get_default_encoder_net(input_shape, encoding_dim),
+            get_default_decoder_net(input_shape, encoding_dim),
         )
 
-    if model_type == VAE:
+    if model_type == "VAE":
         return VAE(
-            _get_default_encoder_net(input_shape, encoding_dim),
-            _get_default_decoder_net(input_shape, encoding_dim),
+            get_default_encoder_net(input_shape, encoding_dim),
+            get_default_decoder_net(input_shape, encoding_dim),
             encoding_dim,
         )
 
-    if model_type == AEGMM:
+    if model_type == "AEGMM":
         n_gmm = 2 if n_gmm is None else n_gmm
         gmm_latent_dim = 1 if gmm_latent_dim is None else gmm_latent_dim
         # The outlier detector is an encoder/decoder architecture
@@ -201,7 +201,7 @@ def create_model(
             n_gmm=n_gmm,
         )
 
-    if model_type == VAEGMM:
+    if model_type == "VAEGMM":
         n_gmm = 2 if n_gmm is None else n_gmm
         gmm_latent_dim = 2 if gmm_latent_dim is None else gmm_latent_dim
         # The outlier detector is an encoder/decoder architecture
@@ -242,7 +242,7 @@ def create_model(
             latent_dim=gmm_latent_dim,
         )
 
-    if model_type == PixelCNN:
+    if model_type == "PixelCNN":
         return PixelCNN(
             image_shape=input_shape,
             num_resnet=5,
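`create_model` now dispatches on string literals instead of model classes, so callers no longer need to import the model types. A minimal usage sketch against the new signature (the input shape is illustrative):

```python
from dataeval.utils.tensorflow._internal.utils import create_model

# Was: create_model(AE, input_shape=...); the model classes no longer need importing.
model = create_model("AE", input_shape=(32, 32, 3))
```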
dataeval/utils/tensorflow/loss/__init__.py
CHANGED
@@ -1,7 +1,11 @@
 from dataeval import _IS_TENSORFLOW_AVAILABLE
-from dataeval._internal.models.tensorflow.losses import Elbo, LossGMM
 
 __all__ = []
 
+
 if _IS_TENSORFLOW_AVAILABLE:
-    __all__ = ["Elbo", "LossGMM"]
+    from dataeval.utils.tensorflow._internal.loss import Elbo, LossGMM
+
+    __all__ = ["Elbo", "LossGMM"]
+
+del _IS_TENSORFLOW_AVAILABLE
dataeval/utils/torch/__init__.py
CHANGED
@@ -6,16 +6,20 @@ to create a seamless integration between custom models and DataEval's metrics.
 """
 
 from dataeval import _IS_TORCH_AVAILABLE, _IS_TORCHVISION_AVAILABLE
-from dataeval._internal.utils import read_dataset
 
 __all__ = []
 
 if _IS_TORCH_AVAILABLE:
-    from . import models, trainer
+    from dataeval.utils.torch import models, trainer
+    from dataeval.utils.torch.utils import read_dataset
 
     __all__ += ["read_dataset", "models", "trainer"]
 
 if _IS_TORCHVISION_AVAILABLE:
-    from . import datasets
+    from dataeval.utils.torch import datasets
 
     __all__ += ["datasets"]
+
+
+del _IS_TORCH_AVAILABLE
+del _IS_TORCHVISION_AVAILABLE
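The public import surface is unchanged by this reorganization. A quick sketch, assuming the torch extra is installed:

```python
# read_dataset, models, and trainer are still re-exported from the package root.
from dataeval.utils.torch import models, read_dataset, trainer
```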
dataeval/{_internal/models/pytorch → utils/torch}/blocks.py
CHANGED
@@ -1,3 +1,7 @@
+from typing import Any
+
+__all__ = []
+
 import torch.nn as nn
 
 
@@ -8,21 +12,22 @@ class Conv(nn.Module):
 
     def __init__(
         self,
-        in_channels,
-        out_channels,
-        k=1,
-        s=1,
-        p=0,
-        activation="relu",
-        norm="instance",
-    ):
+        in_channels: int,
+        out_channels: int,
+        k: int = 1,
+        s: int = 1,
+        p: int = 0,
+        activation: str = "relu",
+        norm: str = "instance",
+    ) -> None:
         super().__init__()
-        conv = nn.Conv2d(in_channels, out_channels, kernel_size=k, stride=s, padding=p)
-        norm = self.get_norm_func(norm=norm, out_channels=out_channels)
-        act = self.get_activation_func(activation=activation)
-        self.module = nn.Sequential(conv, norm, act)
+        self.module: nn.Sequential = nn.Sequential(
+            nn.Conv2d(in_channels, out_channels, kernel_size=k, stride=s, padding=p),
+            self.get_norm_func(norm=norm, out_channels=out_channels),
+            self.get_activation_func(activation=activation),
+        )
 
-    def get_norm_func(self, norm: str, out_channels) -> nn.Module:
+    def get_norm_func(self, norm: str, out_channels: int) -> nn.Module:
         if norm == "batch":
             return nn.BatchNorm2d(out_channels)
         if norm == "instance":
@@ -42,5 +47,5 @@ class Conv(nn.Module):
             return nn.Tanh()
         return nn.Identity()
 
-    def forward(self, x):
+    def forward(self, x: Any) -> Any:
         return self.module(x)
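A hedged sketch of the now fully annotated `Conv` block in use (the channel sizes and input shape are made up for the example):

```python
import torch
from dataeval.utils.torch.blocks import Conv

layer = Conv(in_channels=3, out_channels=16, k=3, s=1, p=1, activation="relu", norm="instance")
out = layer(torch.randn(1, 3, 32, 32))  # Conv2d -> norm -> activation
assert out.shape == (1, 16, 32, 32)     # 3x3 kernel with padding 1 keeps spatial dims
```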
dataeval/{_internal → utils/torch}/datasets.py
CHANGED
@@ -1,5 +1,7 @@
 from __future__ import annotations
 
+__all__ = ["MNIST", "CIFAR10", "VOCDetection"]
+
 import hashlib
 import os
 import zipfile
@@ -11,7 +13,7 @@ import numpy as np
 import requests
 from numpy.typing import NDArray
 from torch.utils.data import Dataset
-from torchvision.datasets import CIFAR10, VOCDetection
+from torchvision.datasets import CIFAR10, VOCDetection
 
 ClassStringMap = Literal["zero", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine"]
 TClassMap = TypeVar("TClassMap", ClassStringMap, int, list[ClassStringMap], list[int])
@@ -50,6 +52,7 @@ def _get_file(
     file_hash: str | None = None,
     verbose: bool = True,
     md5: bool = False,
+    timeout: int = 60,
 ):
     fpath = os.path.join(root, fname)
     download = True
@@ -64,16 +67,16 @@ def _get_file(
     try:
         error_msg = "URL fetch failure on {}: {} -- {}"
         try:
-            with requests.get(origin, stream=True, timeout=60) as r:
+            with requests.get(origin, stream=True, timeout=timeout) as r:
                 r.raise_for_status()
                 with open(fpath, "wb") as f:
                     for chunk in r.iter_content(chunk_size=8192):
                         if chunk:
                             f.write(chunk)
         except requests.exceptions.HTTPError as e:
-            raise Exception(error_msg.format(origin, e.response.status_code, e.response.reason))
+            raise RuntimeError(f"{error_msg.format(origin, e.response.status_code, e.response.reason)}") from e
         except requests.exceptions.RequestException as e:
-            raise Exception(error_msg.format(origin, "Unknown error", str(e)))
+            raise ValueError(f"{error_msg.format(origin, 'Unknown error', str(e))}") from e
     except (Exception, KeyboardInterrupt):
         if os.path.exists(fpath):
             os.remove(fpath)
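For reference, the download-and-clean-up pattern used above, as a standalone sketch (the names here are local to the sketch, not dataeval's API):

```python
import os
import requests

def fetch(origin: str, fpath: str, timeout: int = 60) -> None:
    """Stream a file to disk, removing partial output on any failure."""
    try:
        with requests.get(origin, stream=True, timeout=timeout) as r:
            r.raise_for_status()
            with open(fpath, "wb") as f:
                for chunk in r.iter_content(chunk_size=8192):
                    if chunk:  # skip keep-alive chunks
                        f.write(chunk)
    except (Exception, KeyboardInterrupt):
        if os.path.exists(fpath):
            os.remove(fpath)  # never leave a truncated download behind
        raise
```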
@@ -89,7 +92,7 @@ def _get_file(
     return fpath
 
 
-def check_exists(
+def _check_exists(
     folder: str | Path,
     url: str,
     root: str | Path,
@@ -103,7 +106,7 @@ def check_exists(
     location = str(folder)
     if not os.path.exists(folder):
         if download:
-            location = download_dataset(url, root, fname, file_hash, verbose, md5)
+            location = _download_dataset(url, root, fname, file_hash, verbose, md5)
         else:
             raise RuntimeError("Dataset not found. You can use download=True to download it")
     else:
@@ -112,7 +115,7 @@ def check_exists(
     return location
 
 
-def download_dataset(
+def _download_dataset(
     url: str, root: str | Path, fname: str, file_hash: str, verbose: bool = True, md5: bool = False
 ) -> str:
     """Code to download mnist and corruptions, originates from tensorflow_datasets (tfds):
@@ -131,11 +134,11 @@ def download_dataset(
         md5=md5,
     )
     if md5:
-        folder = extract_archive(fpath, root, remove_finished=True)
+        folder = _extract_archive(fpath, root, remove_finished=True)
     return folder
 
 
-def extract_archive(
+def _extract_archive(
     from_path: str | Path,
     to_path: str | Path | None = None,
     remove_finished: bool = False,
@@ -163,13 +166,13 @@ def extract_archive(
     return str(to_path)
 
 
-def subselect(arr: NDArray, count: int, from_back: bool = False):
+def _subselect(arr: NDArray, count: int, from_back: bool = False):
     if from_back:
         return arr[-count:]
     return arr[:count]
 
 
-class MNIST(Dataset):
+class MNIST(Dataset[tuple[NDArray[np.float64], int]]):
     """MNIST Dataset and Corruptions.
 
     Args:
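The renamed `_subselect` keeps its behavior: take `count` items from the front, or from the back when `from_back=True`. A self-contained check (the function body is copied from the diff):

```python
import numpy as np
from numpy.typing import NDArray

def _subselect(arr: NDArray, count: int, from_back: bool = False):
    if from_back:
        return arr[-count:]
    return arr[:count]

arr = np.arange(10)
assert _subselect(arr, 3).tolist() == [0, 1, 2]                  # front slice
assert _subselect(arr, 3, from_back=True).tolist() == [7, 8, 9]  # back slice
```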
@@ -211,17 +214,17 @@ class MNIST(Dataset):
         If True, outputs print statements.
     """
 
-    mirrors = [
+    _mirrors: tuple[str, ...] = (
         "https://storage.googleapis.com/tensorflow/tf-keras-datasets/",
         "https://zenodo.org/record/3239543/files/",
-    ]
+    )
 
-    resources = [
+    _resources: tuple[tuple[str, str], ...] = (
         ("mnist.npz", "731c5ac602752760c8e48fbffcf8c3b850d9dc2a2aedcf2cc48468fc17b673d1"),
         ("mnist_c.zip", "4b34b33045869ee6d424616cd3a65da3"),
-    ]
+    )
 
-    class_dict = {
+    class_dict: dict[str, int] = {
         "zero": 0,
         "one": 1,
         "two": 2,
@@ -267,43 +270,46 @@ class MNIST(Dataset):
         self.randomize = randomize
         self.from_back = slice_back
         self.verbose = verbose
+        self.data: NDArray[np.float64]
+        self.targets: NDArray[np.int_]
+        self.size: int
 
-        self.class_set = []
+        self._class_set = []
         if classes is not None:
             if not isinstance(classes, list):
                 classes = [classes]  # type: ignore
 
             for val in classes:  # type: ignore
                 if isinstance(val, int) and 0 <= val < 10:
-                    self.class_set.append(val)
+                    self._class_set.append(val)
                 elif isinstance(val, str):
-                    self.class_set.append(self.class_dict[val])
-            self.class_set = set(self.class_set)
+                    self._class_set.append(self.class_dict[val])
+            self._class_set = set(self._class_set)
 
-        if not self.class_set:
-            self.class_set = set(self.class_dict.values())
+        if not self._class_set:
+            self._class_set = set(self.class_dict.values())
 
-        self.num_classes = len(self.class_set)
+        self._num_classes = len(self._class_set)
 
         if self.corruption is None:
-            file_resource = self.resources[0]
-            mirror = self.mirrors[0]
+            file_resource = self._resources[0]
+            mirror = self._mirrors[0]
             md5 = False
         else:
             if self.corruption == "identity" and verbose:
                 print("Identity is not a corrupted dataset but the original MNIST dataset.")
-            file_resource = self.resources[1]
-            mirror = self.mirrors[1]
+            file_resource = self._resources[1]
+            mirror = self._mirrors[1]
             md5 = True
-        check_exists(self.mnist_folder, mirror, self.root, file_resource[0], file_resource[1], download, verbose, md5)
+        _check_exists(self.mnist_folder, mirror, self.root, file_resource[0], file_resource[1], download, verbose, md5)
 
         self.data, self.targets = self._load_data()
 
         self._augmentations()
 
-    def _load_data(self):
+    def _load_data(self) -> tuple[NDArray[np.float64], NDArray[np.int64]]:
         if self.corruption is None:
-            image_file = self.resources[0][0]
+            image_file = self._resources[0][0]
             data, targets = self._read_normal_file(os.path.join(self.mnist_folder, image_file))
         else:
             image_file = f"{'train' if self.train else 'test'}_images.npy"
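The constructor logic above normalizes `classes` (ints or digit names) into a set of label ids. Sketched standalone with a trimmed `class_dict`, assuming the behavior shown in the diff:

```python
# Standalone rendering of the normalization in MNIST.__init__ above; the
# trimmed class_dict and the helper name are local to this sketch.
class_dict = {"zero": 0, "one": 1, "two": 2}

def normalize(classes) -> set[int]:
    class_set = []
    if classes is not None:
        if not isinstance(classes, list):
            classes = [classes]
        for val in classes:
            if isinstance(val, int) and 0 <= val < 10:
                class_set.append(val)
            elif isinstance(val, str):
                class_set.append(class_dict[val])
    # An empty selection falls back to all known classes
    return set(class_set) if class_set else set(class_dict.values())

assert normalize("one") == {1}
assert normalize([0, "two"]) == {0, 2}
assert normalize(None) == {0, 1, 2}
```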
@@ -329,27 +335,27 @@ class MNIST(Dataset):
             self.data = self.data[shuffled_indices]
             self.targets = self.targets[shuffled_indices]
 
-        if not self.balance and self.num_classes > self.size:
+        if not self.balance and self._num_classes > self.size:
             if self.size > 0:
-                self.data = subselect(self.data, self.size, self.from_back)
-                self.targets = subselect(self.targets, self.size, self.from_back)
+                self.data = _subselect(self.data, self.size, self.from_back)
+                self.targets = _subselect(self.targets, self.size, self.from_back)
         else:
-            label_dict = {label: np.where(self.targets == label)[0] for label in self.class_set}
+            label_dict = {label: np.where(self.targets == label)[0] for label in self._class_set}
             min_label_count = min(len(indices) for indices in label_dict.values())
 
-            self.per_class_count = int(np.ceil(self.size / self.num_classes)) if self.size > 0 else min_label_count
+            self._per_class_count = int(np.ceil(self.size / self._num_classes)) if self.size > 0 else min_label_count
 
-            if self.per_class_count > min_label_count:
-                self.per_class_count = min_label_count
+            if self._per_class_count > min_label_count:
+                self._per_class_count = min_label_count
                 if not self.balance and self.verbose:
                     warn(
-                        f"Because of dataset limitations, only {min_label_count*self.num_classes} samples "
+                        f"Because of dataset limitations, only {min_label_count*self._num_classes} samples "
                         f"will be returned, instead of the desired {self.size}."
                     )
 
-            all_indices = np.empty(shape=(self.num_classes, self.per_class_count), dtype=np.int_)
-            for i, label in enumerate(self.class_set):
-                all_indices[i] = subselect(label_dict[label], self.per_class_count, self.from_back)
+            all_indices: NDArray[np.int_] = np.empty(shape=(self._num_classes, self._per_class_count), dtype=np.int_)
+            for i, label in enumerate(self._class_set):
+                all_indices[i] = _subselect(label_dict[label], self._per_class_count, self.from_back)
             self.data = np.vstack(self.data[all_indices.T])  # type: ignore
             self.targets = np.hstack(self.targets[all_indices.T])  # type: ignore
 
@@ -370,7 +376,7 @@ class MNIST(Dataset):
         if self.flatten and self.channels is None:
             self.data = self.data.reshape(self.data.shape[0], -1)
 
-    def __getitem__(self, index: int) -> tuple[NDArray, int]:
+    def __getitem__(self, index: int) -> tuple[NDArray[np.float64], int]:
         """
         Args:
             index (int): Index
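Finally, a hedged sketch of the relocated dataset in use; only parameters visible in this diff are assumed, and the values are illustrative:

```python
from dataeval.utils.torch.datasets import MNIST

ds = MNIST(root="./data", train=True, download=True)  # fetches mnist.npz on first use
img, label = ds[0]                                    # tuple[NDArray[np.float64], int]
```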