dataeval 0.61.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55) hide show
  1. dataeval/__init__.py +18 -0
  2. dataeval/_internal/detectors/__init__.py +0 -0
  3. dataeval/_internal/detectors/clusterer.py +469 -0
  4. dataeval/_internal/detectors/drift/__init__.py +0 -0
  5. dataeval/_internal/detectors/drift/base.py +265 -0
  6. dataeval/_internal/detectors/drift/cvm.py +97 -0
  7. dataeval/_internal/detectors/drift/ks.py +100 -0
  8. dataeval/_internal/detectors/drift/mmd.py +166 -0
  9. dataeval/_internal/detectors/drift/torch.py +310 -0
  10. dataeval/_internal/detectors/drift/uncertainty.py +149 -0
  11. dataeval/_internal/detectors/duplicates.py +49 -0
  12. dataeval/_internal/detectors/linter.py +78 -0
  13. dataeval/_internal/detectors/ood/__init__.py +0 -0
  14. dataeval/_internal/detectors/ood/ae.py +77 -0
  15. dataeval/_internal/detectors/ood/aegmm.py +69 -0
  16. dataeval/_internal/detectors/ood/base.py +199 -0
  17. dataeval/_internal/detectors/ood/llr.py +284 -0
  18. dataeval/_internal/detectors/ood/vae.py +86 -0
  19. dataeval/_internal/detectors/ood/vaegmm.py +79 -0
  20. dataeval/_internal/flags.py +47 -0
  21. dataeval/_internal/metrics/__init__.py +0 -0
  22. dataeval/_internal/metrics/base.py +92 -0
  23. dataeval/_internal/metrics/ber.py +124 -0
  24. dataeval/_internal/metrics/coverage.py +80 -0
  25. dataeval/_internal/metrics/divergence.py +94 -0
  26. dataeval/_internal/metrics/hash.py +79 -0
  27. dataeval/_internal/metrics/parity.py +180 -0
  28. dataeval/_internal/metrics/stats.py +332 -0
  29. dataeval/_internal/metrics/uap.py +45 -0
  30. dataeval/_internal/metrics/utils.py +158 -0
  31. dataeval/_internal/models/__init__.py +0 -0
  32. dataeval/_internal/models/pytorch/__init__.py +0 -0
  33. dataeval/_internal/models/pytorch/autoencoder.py +202 -0
  34. dataeval/_internal/models/pytorch/blocks.py +46 -0
  35. dataeval/_internal/models/pytorch/utils.py +67 -0
  36. dataeval/_internal/models/tensorflow/__init__.py +0 -0
  37. dataeval/_internal/models/tensorflow/autoencoder.py +317 -0
  38. dataeval/_internal/models/tensorflow/gmm.py +115 -0
  39. dataeval/_internal/models/tensorflow/losses.py +107 -0
  40. dataeval/_internal/models/tensorflow/pixelcnn.py +1106 -0
  41. dataeval/_internal/models/tensorflow/trainer.py +102 -0
  42. dataeval/_internal/models/tensorflow/utils.py +254 -0
  43. dataeval/_internal/workflows/sufficiency.py +555 -0
  44. dataeval/detectors/__init__.py +29 -0
  45. dataeval/flags/__init__.py +3 -0
  46. dataeval/metrics/__init__.py +7 -0
  47. dataeval/models/__init__.py +15 -0
  48. dataeval/models/tensorflow/__init__.py +6 -0
  49. dataeval/models/torch/__init__.py +8 -0
  50. dataeval/py.typed +0 -0
  51. dataeval/workflows/__init__.py +8 -0
  52. dataeval-0.61.0.dist-info/LICENSE.txt +21 -0
  53. dataeval-0.61.0.dist-info/METADATA +114 -0
  54. dataeval-0.61.0.dist-info/RECORD +55 -0
  55. dataeval-0.61.0.dist-info/WHEEL +4 -0
@@ -0,0 +1,158 @@
1
+ from typing import Any, Literal, NamedTuple, Tuple, Union
2
+
3
+ import numpy as np
4
+ from scipy.signal import convolve2d
5
+ from scipy.sparse import csr_matrix
6
+ from scipy.sparse.csgraph import minimum_spanning_tree as mst
7
+ from scipy.spatial.distance import pdist, squareform
8
+ from sklearn.neighbors import NearestNeighbors
9
+
10
# Small positive offset added to the distance matrix in minimum_spanning_tree
EPSILON = 1e-5
# 3x3 edge detection kernel applied by edge_filter
EDGE_KERNEL = np.array([[-1, -1, -1], [-1, 8, -1], [-1, -1, -1]], dtype=np.int8)
# Candidate bit depths considered by get_bitdepth, in ascending order
BIT_DEPTH = (1, 8, 12, 16, 32)
13
+
14
+
15
def minimum_spanning_tree(X: np.ndarray) -> Any:
    """
    Builds the minimum spanning tree over the samples of a NumPy array.

    Parameters
    ----------
    X : np.ndarray
        Array whose first axis indexes samples; all remaining axes are
        flattened into a single feature axis

    Returns
    -------
    Any
        Output of ``scipy.sparse.csgraph.minimum_spanning_tree`` computed on
        the pairwise distance graph of the samples
    """
    n_samples = X.shape[0]
    flattened = X.reshape((n_samples, -1))
    pairwise = squareform(pdist(flattened))
    # Every entry is shifted by EPSILON so that no distance is exactly zero;
    # this makes scipy interpret the input graph as fully-connected.
    graph = csr_matrix(pairwise + EPSILON)
    return mst(graph)
35
+
36
+
37
def get_classes_counts(labels: np.ndarray) -> Tuple[int, int]:
    """
    Returns the number of unique classes and the total number of labels.

    Parameters
    ----------
    labels : np.ndarray
        Numpy labels array

    Returns
    -------
    Tuple[int, int]
        Number of unique classes (M) and total number of labels (N)

    Raises
    ------
    ValueError
        If the number of unique classes is less than 2
    """
    classes, counts = np.unique(labels, return_counts=True)
    M = len(classes)
    if M < 2:
        raise ValueError("Label vector contains less than 2 classes!")
    # Convert the NumPy scalar to a builtin int so the result matches the
    # annotated return type
    N = int(np.sum(counts))
    return M, N
61
+
62
+
63
def compute_neighbors(
    A: np.ndarray,
    B: np.ndarray,
    k: int = 1,
    algorithm: Literal["auto", "ball_tree", "kd_tree"] = "auto",
) -> np.ndarray:
    """
    Finds, for each sample in A, the indices of its neighbors in B.

    The search is fit on B and queried with A for ``k + 1`` neighbors; the
    closest neighbor of every sample is then discarded.

    Parameters
    ----------
    A : np.ndarray
        Query samples
    B : np.ndarray
        Samples the neighbor search is fit on
    k : int
        The number of neighbors to keep per sample
    algorithm : Literal
        Tree method for nearest neighbor (auto, ball_tree or kd_tree)

    Note
    ----
    Do not use kd_tree if n_features > 20

    Returns
    -------
    np.ndarray
        Indices into B of the retained neighbors, with length-one axes
        squeezed out

    See Also
    --------
    :func:`sklearn.neighbors.NearestNeighbors`
    """
    searcher = NearestNeighbors(n_neighbors=k + 1, algorithm=algorithm)
    searcher.fit(B)
    _, indices = searcher.kneighbors(A)
    # NOTE(review): dropping column 0 presumably removes each sample's match
    # with itself, which assumes the samples of A are also in B - confirm
    # against callers.
    return indices[:, 1:].squeeze()
100
+
101
+
102
class BitDepth(NamedTuple):
    """Bit depth estimate for an image together with a pixel value range."""

    depth: int  # bit depth; get_bitdepth reports 0 when the image has negative values
    pmin: Union[float, int]  # lower bound of the pixel value range
    pmax: Union[float, int]  # upper bound of the pixel value range
106
+
107
+
108
def get_bitdepth(image: np.ndarray) -> BitDepth:
    """
    Approximates the bit depth of the image from its smallest and largest
    pixel values.

    Returns
    -------
    BitDepth
        For an image with a negative minimum: depth 0 with the observed
        minimum and maximum. Otherwise: the first entry of ``BIT_DEPTH`` whose
        ``2**depth`` exceeds the maximum (or the largest entry if none does),
        with the range ``0`` to ``2**depth - 1``
    """
    pmin, pmax = np.min(image), np.max(image)
    if pmin < 0:
        return BitDepth(0, pmin, pmax)
    # First candidate depth able to hold pmax; fall back to the largest one
    depth = next((bits for bits in BIT_DEPTH if 2**bits > pmax), max(BIT_DEPTH))
    return BitDepth(depth, 0, 2**depth - 1)
119
+
120
+
121
def rescale(image: np.ndarray, depth: int = 1) -> np.ndarray:
    """
    Rescales the image to the value range of the bit depth provided.

    Parameters
    ----------
    image : np.ndarray
        Image to rescale
    depth : int, default 1
        Target bit depth; the output range is ``0`` to ``2**depth - 1``

    Returns
    -------
    np.ndarray
        The input image itself when its estimated bit depth already equals
        ``depth``, otherwise a rescaled copy
    """
    bitdepth = get_bitdepth(image)
    if bitdepth.depth == depth:
        return image

    span = bitdepth.pmax - bitdepth.pmin
    if span == 0:
        # Only reachable for a constant image with a negative value: there is
        # no range to stretch, so avoid dividing by zero.
        return np.zeros_like(image, dtype=float)
    # Min-max normalization subtracts the lower bound; the lower bound is
    # non-zero only for images with negative values.
    normalized = (image - bitdepth.pmin) / span
    return normalized * (2**depth - 1)
131
+
132
+
133
def normalize_image_shape(image: np.ndarray) -> np.ndarray:
    """
    Coerces the image into a three dimensional (C,H,W) shape.

    A 2 dimensional image gains a leading axis, a 3 dimensional image is
    returned as is, and for more than 3 dimensions the first element along
    every leading axis is taken until 3 dimensions remain.

    Raises
    ------
    ValueError
        If the image has fewer than 2 dimensions
    """
    rank = image.ndim
    if rank < 2:
        raise ValueError("Images must have 2 or more dimensions.")
    if rank == 2:
        return np.expand_dims(image, axis=0)
    if rank == 3:
        return image
    # Index 0 on each leading axis, keeping only the last 3 dimensions
    leading = (0,) * (rank - 3)
    return image[leading]
147
+
148
+
149
def edge_filter(image: np.ndarray, offset: float = 0.5) -> np.ndarray:
    """
    Filters the image with a 3x3 edge detection kernel:
    [[ -1, -1, -1 ],
     [ -1,  8, -1 ],
     [ -1, -1, -1 ]]

    The offset is added to the convolution output, which is then clipped to
    the range 0 to 255.
    """
    convolved = convolve2d(image, EDGE_KERNEL, mode="same", boundary="symm")
    edges = convolved + offset
    # Clip in place to avoid allocating a second array
    np.clip(edges, 0, 255, out=edges)
    return edges
File without changes
File without changes
@@ -0,0 +1,202 @@
1
+ from typing import Any, List, Union
2
+
3
+ import torch
4
+ import torch.nn as nn
5
+ from torch.optim import Adam
6
+ from torch.utils.data import DataLoader, Dataset
7
+
8
+ torch.manual_seed(0)
9
+
10
+
11
def get_images_from_batch(batch: Any) -> Any:
    """Returns the first element of a list or tuple batch, otherwise the batch itself"""
    if isinstance(batch, (list, tuple)):
        return batch[0]
    return batch
14
+
15
+
16
class AETrainer:
    """
    Convenience wrapper for training, evaluating and encoding with an
    autoencoder-style ``nn.Module`` on a chosen device.
    """

    def __init__(
        self,
        model: nn.Module,
        device: Union[str, torch.device] = "auto",
        batch_size: int = 8,
    ):
        """
        model : nn.Module
            Model to be trained
        device : str | torch.device, default "auto"
            Hardware device for the model and data to run on. "auto" selects
            "cuda" when it is available and "cpu" otherwise
        batch_size : int, default 8
            Number of images to group together in `torch.utils.data.DataLoader`
        """
        if device == "auto":
            device = "cuda" if torch.cuda.is_available() else "cpu"
        self.device = device
        self.model = model.to(device)
        self.batch_size = batch_size

    def train(self, dataset: Dataset, epochs: int = 25) -> List[float]:
        """
        Basic training function for Autoencoder models for reconstruction tasks

        Uses `torch.optim.Adam` and `torch.nn.MSELoss` as default hyperparameters

        Parameters
        ----------
        dataset : Dataset
            Torch Dataset containing images in the first return position
        epochs : int, default 25
            Number of full training loops

        Returns
        -------
        List[float]
            Loss of each epoch, averaged over that epoch's batches

        Note
        ----
        To replace this function with a custom function, do
            AETrainer.train = custom_function
        """
        # Setup training
        self.model.train()
        dataloader = DataLoader(dataset, batch_size=self.batch_size)
        opt = Adam(self.model.parameters(), lr=0.001)
        criterion = nn.MSELoss().to(self.device)
        # Record loss
        loss_history: List[float] = []

        for _ in range(epochs):
            epoch_loss: float = 0
            for batch in dataloader:
                imgs = get_images_from_batch(batch)
                imgs = imgs.to(self.device)
                # Gradients accumulate by default, so reset them every batch
                opt.zero_grad()

                # Reconstruct the batch and compare against the input itself
                pred = self.model(imgs)
                loss = criterion(pred, imgs)
                loss.backward()

                # Adjust learning weights
                opt.step()

                epoch_loss += loss.item()
            # Average over all batches of the epoch
            epoch_loss /= len(dataloader)
            loss_history.append(epoch_loss)

        return loss_history

    # Called form `no_grad()` is the documented decorator usage; the bare
    # `@torch.no_grad` form is not accepted by older PyTorch releases.
    @torch.no_grad()
    def eval(self, dataset: Dataset) -> float:
        """
        Basic evaluation function for Autoencoder models for reconstruction tasks

        Uses `torch.nn.MSELoss` as the reconstruction loss

        Parameters
        ----------
        dataset : Dataset
            Torch Dataset containing images in the first return position

        Returns
        -------
        float
            Reconstruction loss averaged over all batches

        Note
        ----
        To replace this function with a custom function, do
            AETrainer.eval = custom_function
        """
        self.model.eval()
        dataloader = DataLoader(dataset, batch_size=self.batch_size)
        criterion = nn.MSELoss().to(self.device)
        total_loss: float = 0.0

        for batch in dataloader:
            imgs = get_images_from_batch(batch)
            imgs = imgs.to(self.device)
            pred = self.model(imgs)
            loss = criterion(pred, imgs)
            total_loss += loss.item()
        return total_loss / len(dataloader)

    @torch.no_grad()
    def encode(self, dataset: Dataset) -> torch.Tensor:
        """
        Encode data through model if it has an encode attribute,
        otherwise passes data through model.forward

        Parameters
        ----------
        dataset: Dataset
            Dataset containing images to be encoded by the model

        Returns
        -------
        torch.Tensor
            Data encoded by the model, moved to the CPU. An empty tensor is
            returned when the dataset yields no batches
        """
        self.model.eval()
        dl = DataLoader(dataset, batch_size=self.batch_size)

        # Get encode function if defined
        encode_func = getattr(self.model, "encode", None) or self.model.forward

        # Collect the per-batch encodings and stack them once at the end
        # instead of re-copying the accumulated tensor on every batch
        batches: List[torch.Tensor] = []
        for batch in dl:
            imgs = get_images_from_batch(batch)
            imgs = imgs.to(self.device)
            batches.append(encode_func(imgs).to("cpu"))

        if not batches:
            return torch.Tensor([])
        if len(batches) == 1:
            # A single batch is returned as is, without stacking
            return batches[0]
        return torch.vstack(batches)
155
+
156
+
157
class AriaAutoencoder(nn.Module):
    """Autoencoder that chains ``Encoder`` and ``Decoder``."""

    def __init__(self, channels=3):
        super().__init__()
        self.encoder = Encoder(channels)
        self.decoder = Decoder(channels)

    def forward(self, x):
        """Encodes the input and decodes the result."""
        latent = self.encoder(x)
        return self.decoder(latent)

    def encode(self, x):
        """Returns only the encoder output for the input."""
        return self.encoder(x)
170
+
171
+
172
class Encoder(nn.Module):
    """Convolutional encoder: three 2x2 convolutions with ReLU and max pooling between them."""

    def __init__(self, channels=3):
        super().__init__()
        layers = [
            nn.Conv2d(channels, 256, kernel_size=2, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2),
            nn.Conv2d(256, 128, kernel_size=2, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2),
            nn.Conv2d(128, 64, kernel_size=2, stride=1),
        ]
        self.encoder = nn.Sequential(*layers)

    def forward(self, x):
        """Passes the input through the layer stack."""
        return self.encoder(x)
187
+
188
+
189
class Decoder(nn.Module):
    """Transposed-convolution decoder ending in a sigmoid."""

    def __init__(self, channels):
        super().__init__()
        layers = [
            nn.ConvTranspose2d(64, 128, kernel_size=2, stride=1),
            nn.ReLU(),
            nn.ConvTranspose2d(128, 256, kernel_size=2, stride=2),
            nn.ReLU(),
            nn.ConvTranspose2d(256, channels, kernel_size=2, stride=2),
            nn.Sigmoid(),
        ]
        self.decoder = nn.Sequential(*layers)

    def forward(self, x):
        """Passes the input through the layer stack."""
        return self.decoder(x)
@@ -0,0 +1,46 @@
1
+ import torch.nn as nn
2
+
3
+
4
class Conv(nn.Module):
    """
    Bundles a 2D convolution with a normalization layer and an activation,
    so callers don't have to spell out all three every time
    """

    def __init__(
        self,
        in_channels,
        out_channels,
        k=1,
        s=1,
        p=0,
        activation="relu",
        norm="instance",
    ):
        super().__init__()
        conv_layer = nn.Conv2d(in_channels, out_channels, kernel_size=k, stride=s, padding=p)
        norm_layer = self.get_norm_func(norm=norm, out_channels=out_channels)
        act_layer = self.get_activation_func(activation=activation)
        self.module = nn.Sequential(conv_layer, norm_layer, act_layer)

    def get_norm_func(self, norm: str, out_channels) -> nn.Module:
        """Returns the normalization layer named by `norm`, or `nn.Identity` for any other value."""
        # NOTE(review): nn.LayerNorm(out_channels) normalizes over the trailing
        # dimension, which for (N, C, H, W) convolution output is W rather
        # than C - confirm this is intended.
        factories = {
            "batch": nn.BatchNorm2d,
            "instance": nn.InstanceNorm2d,
            "layer": nn.LayerNorm,
        }
        factory = factories.get(norm)
        if factory is None:
            return nn.Identity()
        return factory(out_channels)

    def get_activation_func(self, activation: str) -> nn.Module:
        """Returns the activation layer named by `activation`, or `nn.Identity` for any other value."""
        factories = {
            "selu": nn.SELU,
            "relu": nn.ReLU,
            "leaky": nn.LeakyReLU,
            "tanh": nn.Tanh,
        }
        return factories.get(activation, nn.Identity)()

    def forward(self, x):
        """Applies convolution, normalization and activation in that order."""
        return self.module(x)
@@ -0,0 +1,67 @@
1
+ from numpy import float32, ndarray
2
+ from torch import Tensor, from_numpy
3
+
4
+
5
def torch_to_numpy(tensor: Tensor) -> ndarray:
    """
    Converts a PyTorch tensor to a NumPy array

    An ndarray input is returned unchanged; any other non-Tensor input
    raises TypeError.
    """
    if isinstance(tensor, ndarray):
        # Nothing to convert
        return tensor
    if isinstance(tensor, Tensor):
        # Detach from the autograd graph and move to the CPU before converting
        return tensor.detach().cpu().numpy()
    raise TypeError("Tensor is not of type Tensor")
16
+
17
+
18
def numpy_to_torch(array: ndarray) -> Tensor:
    """
    Converts a NumPy array to a float32 PyTorch tensor

    A Tensor input is returned unchanged; any other non-ndarray input
    raises TypeError.
    """
    if isinstance(array, Tensor):
        # Nothing to convert
        return array
    if isinstance(array, ndarray):
        return from_numpy(array.astype(float32))
    raise TypeError("Array is not of type numpy.ndarray")
28
+
29
+
30
def permute_to_torch(array: ndarray) -> Tensor:
    """
    Turns a NumPy image array into a PyTorch image tensor with the channel
    axis moved to position 1.

    Parameters
    ----------
    array: ndarray
        Array containing image data in the format NHWC

    Returns
    -------
    Tensor
        Tensor containing image data in the format NCHW
    """
    # NHWC -> NCHW
    return numpy_to_torch(array).permute(0, 3, 1, 2)
47
+
48
+
49
def permute_to_numpy(tensor: Tensor) -> ndarray:
    """
    Converts and permutes a PyTorch image tensor into a NumPy image array.

    Does not permute if given ndarray

    Parameters
    ----------
    tensor: Tensor
        Tensor containing image data in the format NCHW

    Returns
    -------
    ndarray
        Array containing image data in the format NHWC, or the input itself
        when it already is an ndarray
    """
    if isinstance(tensor, ndarray):
        # ndarray has no `permute`; return it untouched as documented
        return tensor
    x = tensor.permute(0, 2, 3, 1)  # NCHW -> NHWC
    return torch_to_numpy(x)
File without changes