dataeval 0.63.0__py3-none-any.whl → 0.65.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dataeval/__init__.py +4 -4
- dataeval/_internal/detectors/clusterer.py +47 -34
- dataeval/_internal/detectors/drift/base.py +53 -35
- dataeval/_internal/detectors/drift/cvm.py +5 -4
- dataeval/_internal/detectors/drift/ks.py +7 -6
- dataeval/_internal/detectors/drift/mmd.py +39 -19
- dataeval/_internal/detectors/drift/torch.py +6 -5
- dataeval/_internal/detectors/drift/uncertainty.py +7 -8
- dataeval/_internal/detectors/duplicates.py +57 -30
- dataeval/_internal/detectors/linter.py +40 -24
- dataeval/_internal/detectors/ood/ae.py +2 -1
- dataeval/_internal/detectors/ood/aegmm.py +2 -1
- dataeval/_internal/detectors/ood/base.py +37 -15
- dataeval/_internal/detectors/ood/llr.py +9 -8
- dataeval/_internal/detectors/ood/vae.py +2 -1
- dataeval/_internal/detectors/ood/vaegmm.py +2 -1
- dataeval/_internal/flags.py +42 -21
- dataeval/_internal/interop.py +3 -12
- dataeval/_internal/metrics/balance.py +188 -0
- dataeval/_internal/metrics/ber.py +123 -48
- dataeval/_internal/metrics/coverage.py +90 -74
- dataeval/_internal/metrics/divergence.py +101 -67
- dataeval/_internal/metrics/diversity.py +211 -0
- dataeval/_internal/metrics/parity.py +287 -155
- dataeval/_internal/metrics/stats.py +198 -317
- dataeval/_internal/metrics/uap.py +40 -29
- dataeval/_internal/metrics/utils.py +430 -0
- dataeval/_internal/models/tensorflow/losses.py +3 -3
- dataeval/_internal/models/tensorflow/trainer.py +3 -2
- dataeval/_internal/models/tensorflow/utils.py +4 -3
- dataeval/_internal/output.py +82 -0
- dataeval/_internal/utils.py +64 -0
- dataeval/_internal/workflows/sufficiency.py +96 -107
- dataeval/flags/__init__.py +2 -2
- dataeval/metrics/__init__.py +26 -7
- dataeval/utils/__init__.py +9 -0
- {dataeval-0.63.0.dist-info → dataeval-0.65.0.dist-info}/METADATA +1 -1
- dataeval-0.65.0.dist-info/RECORD +60 -0
- dataeval/_internal/functional/__init__.py +0 -0
- dataeval/_internal/functional/ber.py +0 -63
- dataeval/_internal/functional/coverage.py +0 -75
- dataeval/_internal/functional/divergence.py +0 -16
- dataeval/_internal/functional/hash.py +0 -79
- dataeval/_internal/functional/metadata.py +0 -136
- dataeval/_internal/functional/metadataparity.py +0 -190
- dataeval/_internal/functional/uap.py +0 -6
- dataeval/_internal/functional/utils.py +0 -158
- dataeval/_internal/maite/__init__.py +0 -0
- dataeval/_internal/maite/utils.py +0 -30
- dataeval/_internal/metrics/base.py +0 -92
- dataeval/_internal/metrics/metadata.py +0 -610
- dataeval/_internal/metrics/metadataparity.py +0 -67
- dataeval-0.63.0.dist-info/RECORD +0 -68
- {dataeval-0.63.0.dist-info → dataeval-0.65.0.dist-info}/LICENSE.txt +0 -0
- {dataeval-0.63.0.dist-info → dataeval-0.65.0.dist-info}/WHEEL +0 -0
dataeval/_internal/functional/coverage.py (removed in 0.65.0)
@@ -1,75 +0,0 @@
-import math
-from typing import Literal, Tuple
-
-import numpy as np
-from scipy.spatial.distance import pdist, squareform
-
-
-def coverage(
-    embeddings: np.ndarray,
-    radius_type: Literal["adaptive", "naive"] = "adaptive",
-    k: int = 20,
-    percent: np.float64 = np.float64(0.01),
-) -> Tuple[np.ndarray, np.ndarray, float]:
-    """
-    Perform a coverage analysis on the embeddings, identifying observations
-    that lie in under-sampled (uncovered) regions of the embedding space.
-
-    Parameters
-    ----------
-    embeddings : ArrayLike, shape - (N, P)
-        A dataset in an ArrayLike format.
-        Function expects the data to have 2 dimensions, N number of observations in a P-dimensional space.
-    radius_type : Literal["adaptive", "naive"], default "adaptive"
-        The function used to determine radius.
-    k: int, default 20
-        Number of observations required in order to be covered.
-        [1] suggests that a minimum of 20-50 samples is necessary.
-    percent: np.float64, default np.float64(0.01)
-        Percent of observations to be considered uncovered. Only applies to adaptive radius.
-
-    Returns
-    -------
-    np.ndarray
-        Array of uncovered indices
-    np.ndarray
-        Array of critical value radii
-    float
-        Radius for coverage
-
-    Raises
-    ------
-    ValueError
-        If length of embeddings is less than or equal to k
-    ValueError
-        If radius_type is unknown
-
-    Note
-    ----
-    Embeddings should be on the unit interval.
-
-    Reference
-    ---------
-    This implementation is based on https://dl.acm.org/doi/abs/10.1145/3448016.3457315.
-    [1] Seymour Sudman. 1976. Applied sampling. Academic Press New York (1976).
-    """
-
-    # Calculate distance matrix, look at the (k+1)th farthest neighbor for each image.
-    n = len(embeddings)
-    if n <= k:
-        raise ValueError("Number of observations less than or equal to the specified number of neighbors.")
-    mat = squareform(pdist(embeddings))
-    sorted_dists = np.sort(mat, axis=1)
-    crit = sorted_dists[:, k + 1]
-
-    d = np.shape(embeddings)[1]
-    if radius_type == "naive":
-        rho = (1 / math.sqrt(math.pi)) * ((2 * k * math.gamma(d / 2 + 1)) / (n)) ** (1 / d)
-        pvals = np.where(crit > rho)[0]
-    elif radius_type == "adaptive":
-        # Use data adaptive cutoff as rho
-        rho = int(n * percent)
-        pvals = np.argsort(crit)[::-1][:rho]
-    else:
-        raise ValueError("Invalid radius type.")
-    return pvals, crit, rho
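For orientation, here is a minimal sketch of how this removed helper could have been driven against dataeval 0.63.0. The import path and signature come from the hunk above; the embedding values and parameter choices are invented for illustration.

```python
import numpy as np

# 0.63.0 module path shown in the file list above; removed in 0.65.0
from dataeval._internal.functional.coverage import coverage

rng = np.random.default_rng(0)
embeddings = rng.uniform(size=(100, 16))  # N=100 observations on the unit interval, P=16 dimensions

# Adaptive radius: flag the 1% of observations with the largest critical radii as uncovered
uncovered_idx, crit_radii, rho = coverage(embeddings, radius_type="adaptive", k=20)
print(uncovered_idx, crit_radii.shape, rho)
```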
dataeval/_internal/functional/divergence.py (removed in 0.65.0)
@@ -1,16 +0,0 @@
-import numpy as np
-
-from .utils import compute_neighbors, minimum_spanning_tree
-
-
-def divergence_mst(data: np.ndarray, labels: np.ndarray) -> int:
-    mst = minimum_spanning_tree(data).toarray()
-    edgelist = np.transpose(np.nonzero(mst))
-    errors = np.sum(labels[edgelist[:, 0]] != labels[edgelist[:, 1]])
-    return errors
-
-
-def divergence_fnn(data: np.ndarray, labels: np.ndarray) -> int:
-    nn_indices = compute_neighbors(data, data)
-    errors = np.sum(np.abs(labels[nn_indices] - labels))
-    return errors
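As a point of reference, a small sketch of how these removed error counters could be exercised under 0.63.0 (the two-cluster data below is synthetic; only the import path and function names come from the diff):

```python
import numpy as np

from dataeval._internal.functional.divergence import divergence_fnn, divergence_mst  # 0.63.0 path

rng = np.random.default_rng(0)
# Two partially overlapping Gaussian clusters with integer group labels
data = np.vstack([rng.normal(0.0, 1.0, size=(50, 4)), rng.normal(1.0, 1.0, size=(50, 4))])
labels = np.concatenate([np.zeros(50, dtype=int), np.ones(50, dtype=int)])

print(divergence_mst(data, labels))  # label disagreements across minimum-spanning-tree edges
print(divergence_fnn(data, labels))  # label disagreements between first nearest neighbors
```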
dataeval/_internal/functional/hash.py (removed in 0.65.0)
@@ -1,79 +0,0 @@
-import numpy as np
-import xxhash as xxh
-from PIL import Image
-from scipy.fftpack import dct
-
-from dataeval._internal.functional.utils import normalize_image_shape, rescale
-
-HASH_SIZE = 8
-MAX_FACTOR = 4
-
-
-def pchash(image: np.ndarray) -> str:
-    """
-    Performs a perceptual hash on an image by resizing to a square NxN image
-    using the Lanczos algorithm where N is 32x32 or the largest multiple of
-    8 that is smaller than the input image dimensions. The resampled image
-    is compressed using a discrete cosine transform and the lowest frequency
-    component is encoded as a bit array of greater or less than median value
-    and returned as a hex string.
-
-    Parameters
-    ----------
-    image : np.ndarray
-        An image as a numpy array in CxHxW format
-
-    Returns
-    -------
-    str
-        The hex string hash of the image using perceptual hashing
-    """
-    # Verify that the image is at least larger than an 8x8 image
-    min_dim = min(image.shape[-2:])
-    if min_dim < HASH_SIZE + 1:
-        raise ValueError(f"Image must be larger than {HASH_SIZE}x{HASH_SIZE} for fuzzy hashing.")
-
-    # Calculates the dimensions of the resized square image
-    resize_dim = HASH_SIZE * min((min_dim - 1) // HASH_SIZE, MAX_FACTOR)
-
-    # Normalizes the image to CxHxW and takes the mean over all the channels
-    normalized = np.mean(normalize_image_shape(image), axis=0).squeeze()
-
-    # Rescales the pixel values to an 8-bit 0-255 image
-    rescaled = rescale(normalized, 8).astype(np.uint8)
-
-    # Resizes the image using the Lanczos algorithm to a square image
-    im = np.array(Image.fromarray(rescaled).resize((resize_dim, resize_dim), Image.Resampling.LANCZOS))
-
-    # Performs discrete cosine transforms to compress the image information and takes the lowest frequency component
-    transform = dct(dct(im.T).T)[:HASH_SIZE, :HASH_SIZE]
-
-    # Encodes the transform as a bit array over the median value
-    diff = transform > np.median(transform)
-
-    # Pads the front of the bit array to a multiple of 8 with False
-    padded = np.full(int(np.ceil(diff.size / 8) * 8), False)
-    padded[-diff.size :] = diff.ravel()
-
-    # Converts the bit array to a hex string and strips leading 0s
-    hash_hex = np.packbits(padded).tobytes().hex().lstrip("0")
-    return hash_hex if hash_hex else "0"
-
-
-def xxhash(image: np.ndarray) -> str:
-    """
-    Performs a fast non-cryptographic hash using the xxhash algorithm
-    (xxhash.com) against the image as a flattened bytearray. The hash
-    is returned as a hex string.
-
-    Parameters
-    ----------
-    image : np.ndarray
-        An image as a numpy array
-
-    Returns
-    -------
-    str
-        The hex string hash of the image using the xxHash algorithm
-    """
-    return xxh.xxh3_64_hexdigest(image.ravel().tobytes())
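For context, a minimal sketch of calling the two removed hashing helpers under 0.63.0 (the random image is invented; module path and function names are taken from the hunk above):

```python
import numpy as np

from dataeval._internal.functional.hash import pchash, xxhash  # 0.63.0 module path

rng = np.random.default_rng(0)
image = rng.integers(0, 256, size=(3, 64, 64), dtype=np.uint8)  # CxHxW image

print(pchash(image))   # perceptual hash hex string (resize -> DCT -> median bit encoding)
print(xxhash(image))   # xxHash3-64 hex digest of the raw pixel bytes
```

Near-duplicate images would typically produce identical or nearly identical pchash strings, while xxhash only matches on exact byte-for-byte duplicates.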
dataeval/_internal/functional/metadata.py (removed in 0.65.0)
@@ -1,136 +0,0 @@
-from typing import Dict, List
-
-import numpy as np
-from scipy.stats import entropy
-
-
-def _get_counts(
-    data: np.ndarray, names: list[str], is_categorical: List, subset_mask: np.ndarray = np.empty(shape=0)
-) -> tuple[Dict, Dict]:
-    """
-    Initialize dictionary of histogram counts --- treat categorical values
-    as histogram bins.
-
-    Parameters
-    ----------
-    subset_mask: Optional[np.ndarray[bool]]
-        Boolean mask of samples to bin (e.g. when computing per class). True -> include in histogram counts
-
-    Returns
-    -------
-    counts: Dict
-        histogram counts per metadata factor in `factors`. Each
-        factor will have a different number of bins. Counts get reused
-        across metrics, so hist_counts are cached but only if computed
-        globally, i.e. without masked samples.
-    """
-
-    hist_counts, hist_bins = {}, {}
-    # np.where needed to satisfy linter
-    mask = np.where(subset_mask if len(subset_mask) > 0 else np.ones(data.shape[0], dtype=bool))
-
-    for cdx, fn in enumerate(names):
-        # linter doesn't like double indexing
-        col_data = data[mask, cdx].squeeze()
-        if is_categorical[cdx]:
-            # if discrete, use unique values as bins
-            bins, cnts = np.unique(col_data, return_counts=True)
-        else:
-            bins = hist_bins.get(fn, "auto")
-            cnts, bins = np.histogram(col_data, bins=bins, density=True)
-
-        hist_counts[fn] = cnts
-        hist_bins[fn] = bins
-
-    return hist_counts, hist_bins
-
-
-def _entropy(
-    data: np.ndarray,
-    names: list,
-    is_categorical: List,
-    normalized: bool = False,
-    subset_mask: np.ndarray = np.empty(shape=0),
-) -> np.ndarray:
-    """
-    Meant for use with Bias metrics, Balance, Diversity, ClasswiseBalance,
-    and Classwise Diversity.
-
-    Compute entropy for discrete/categorical variables and, through standard
-    histogram binning, for continuous variables.
-
-
-    Parameters
-    ----------
-    normalized: bool
-        Flag that determines whether or not to normalize entropy by log(num_bins)
-    subset_mask: Optional[np.ndarray[bool]]
-        Boolean mask of samples to bin (e.g. when computing per class). True -> include in histogram counts
-
-
-    Notes
-    -----
-    For continuous variables, histogram bins are chosen automatically. See
-    numpy.histogram for details.
-
-    Returns
-    -------
-    ent: np.ndarray[float]
-        Entropy estimate per column of X
-
-    See Also
-    --------
-    numpy.histogram
-    scipy.stats.entropy
-    """
-
-    num_factors = len(names)
-    hist_counts, _ = _get_counts(data, names, is_categorical, subset_mask=subset_mask)
-
-    ev_index = np.empty(num_factors)
-    for col, cnts in enumerate(hist_counts.values()):
-        # entropy in nats, normalizes counts
-        ev_index[col] = entropy(cnts)
-        if normalized:
-            if len(cnts) == 1:
-                # log(0)
-                ev_index[col] = 0
-            else:
-                ev_index[col] /= np.log(len(cnts))
-    return ev_index
-
-
-def _get_num_bins(
-    data: np.ndarray, names: list, is_categorical: List, subset_mask: np.ndarray = np.empty(shape=0)
-) -> np.ndarray:
-    """
-    Number of bins or unique values for each metadata factor, used to
-    normalize entropy/diversity.
-
-    Parameters
-    ----------
-    subset_mask: Optional[np.ndarray[bool]]
-        Boolean mask of samples to bin (e.g. when computing per class). True -> include in histogram counts
-    """
-    # likely cached
-    hist_counts, _ = _get_counts(data, names, is_categorical, subset_mask)
-    num_bins = np.empty(len(hist_counts))
-    for idx, cnts in enumerate(hist_counts.values()):
-        num_bins[idx] = len(cnts)
-
-    return num_bins
-
-
-def _infer_categorical(X: np.ndarray, threshold: float = 0.5) -> np.ndarray:
-    """
-    Compute fraction of feature values that are unique --- intended to be used
-    for inferring whether variables are categorical.
-    """
-    if X.ndim == 1:
-        X = np.expand_dims(X, axis=1)
-    num_samples = X.shape[0]
-    pct_unique = np.empty(X.shape[1])
-    for col in range(X.shape[1]):  # type: ignore
-        uvals = np.unique(X[:, col], axis=0)
-        pct_unique[col] = len(uvals) / num_samples
-    return pct_unique < threshold
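For reference, a hedged sketch of how these removed (private) entropy helpers could be wired together under 0.63.0. The factor matrix, factor names, and threshold behavior below are invented for illustration; only the function names, signatures, and module path come from the diff.

```python
import numpy as np

from dataeval._internal.functional.metadata import _entropy, _infer_categorical  # 0.63.0 path

rng = np.random.default_rng(0)
# Two metadata factors per sample: a 3-value categorical code and a continuous value
data = np.column_stack([rng.integers(0, 3, size=200), rng.normal(size=200)])
names = ["weather", "brightness"]

# Heuristic: columns with a low fraction of unique values are treated as categorical
is_categorical = list(_infer_categorical(data))

# Normalized entropy per factor (categorical columns use unique-value bins,
# continuous columns use numpy's automatic histogram binning)
print(_entropy(data, names, is_categorical, normalized=True))
```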
dataeval/_internal/functional/metadataparity.py (removed in 0.65.0)
@@ -1,190 +0,0 @@
-import warnings
-from typing import Dict, Tuple
-
-import numpy as np
-import scipy
-
-
-def validate_dict(d: Dict) -> None:
-    """
-    Verify that dict-of-arrays (proxy for dataframe) contains arrays of equal
-    length. Future iterations could include type checking, conversion from
-    string to numeric types, etc.
-
-    Parameters
-    ----------
-    d: Dict
-        dictionary of {variable_name: values}
-    """
-    # assert that length of all arrays are equal -- could expand to other properties
-    lengths = []
-    for arr in d.values():
-        lengths.append(arr.shape)
-
-    if lengths[1:] != lengths[:-1]:
-        raise ValueError("The lengths of each entry in the dictionary are not equal." f" Found lengths {lengths}")
-
-
-def digitize_factor_bins(continuous_values: np.ndarray, bins: int, factor_name: str):
-    """
-    Digitizes a list of values into a given number of bins.
-
-    Parameters
-    ----------
-    continuous_values: np.ndarray
-        The values to be digitized.
-    bins: int
-        The number of bins for the discrete values that continuous_values will be digitized into.
-    factor_name: str
-        The name of the factor to be digitized.
-
-    Returns
-    -------
-    np.ndarray
-        The digitized values
-
-    """
-    if not np.all([np.issubdtype(type(n), np.number) for n in continuous_values]):
-        raise TypeError(
-            f"Encountered a non-numeric value for factor {factor_name}, but the factor"
-            " was specified to be continuous. Ensure all occurrences of this factor are numeric types,"
-            f" or do not specify {factor_name} as a continuous factor."
-        )
-
-    _, bin_edges = np.histogram(continuous_values, bins=bins)
-    bin_edges[-1] = np.inf
-    bin_edges[0] = -np.inf
-    return np.digitize(continuous_values, bin_edges)
-
-
-def format_discretize_factors(
-    data_factors: dict[str, np.ndarray], continuous_factor_names: np.ndarray, continuous_factor_bincounts: np.ndarray
-) -> Tuple[dict, np.ndarray]:
-    """
-    Sets up the internal list of metadata factors.
-
-    Parameters
-    ----------
-    data_factors: Dict[str, np.ndarray]
-        The dataset factors, which are per-image attributes including class label and metadata.
-        Each key of dataset_factors is a factor, whose value is the per-image factor values.
-    continuous_factor_names : np.ndarray
-        The factors in data_factors that have continuous values.
-        All factors are treated as having discrete values unless they
-        are specified in this array. Each element of this array must occur as a key in data_factors.
-    continuous_factor_bincounts : np.ndarray
-        Array of the bin counts to discretize values into for each factor in continuous_factor_names.
-
-    Returns
-    -------
-    Dict[str, np.ndarray]
-        Intrinsic per-image metadata information with the formatting that input data_factors uses.
-        Each key is a metadata factor, whose value is the discrete per-image factor values.
-    np.ndarray
-        Per-image labels, whose ith element is the label for the ith element of the dataset.
-    """
-
-    if len(continuous_factor_bincounts) != len(continuous_factor_names):
-        raise ValueError(
-            f"continuous_factor_bincounts has length {len(continuous_factor_bincounts)}, "
-            f"but continuous_factor_names has length {len(continuous_factor_names)}. "
-            "Each element of continuous_factor_names must have a corresponding element "
-            "in continuous_factor_bincounts. Alternatively, leave continuous_factor_bincounts empty "
-            "to use a default digitization of 10 bins."
-        )
-
-    # TODO: add unit test for this
-    for key in continuous_factor_names:
-        if key not in data_factors:
-            raise KeyError(
-                f"The continuous factor name {key} "
-                f"does not exist in data_factors. Delete {key} from "
-                f"continuous_factor_names or add an entry with key {key} to "
-                "data_factors."
-            )
-
-    metadata_factors = {}
-
-    # make sure each factor has the same number of entries
-    validate_dict(data_factors)
-
-    labels = data_factors["class"]
-
-    # Each continuous factor is discretized into some number of bins.
-    # This matches the number of bins for a factor with the factor
-    num_bins = dict(zip(continuous_factor_names, continuous_factor_bincounts))
-
-    metadata_factors = {
-        name: val if name not in continuous_factor_names else digitize_factor_bins(val, num_bins[name], name)
-        for name, val in data_factors.items()
-        if name != "class"
-    }
-
-    return metadata_factors, labels
-
-
-def compute_parity(factors: dict[str, np.ndarray], labels: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
-    """
-    Evaluates the statistical independence of metadata factors from class labels.
-    This performs a chi-square test, which provides a score and a p-value for
-    statistical independence between each pair of a metadata factor and a class label.
-    A high score with a low p-value suggests that a metadata factor is strongly
-    correlated with a class label.
-
-    Parameters
-    ----------
-    factors: Dict[str, np.ndarray]
-        Intrinsic per-image metadata information.
-        factors['key'][i] is the value of the metadata factor 'key' at the ith element of the dataset.
-    labels: np.ndarray
-        Dataset labels.
-        Labels[i] is the label for the ith element of the dataset.
-
-    Returns
-    -------
-    np.ndarray
-        Array of length (num_factors) whose (i)th element corresponds to
-        the chi-square score for the relationship between factor i
-        and the class labels in the dataset.
-    np.ndarray
-        Array of length (num_factors) whose (i)th element corresponds to
-        the p-value for the chi-square test for the relationship between
-        factor i and the class labels in the dataset.
-    """
-
-    chi_scores = np.zeros(len(factors))
-    p_values = np.zeros(len(factors))
-    n_cls = len(np.unique(labels))
-    for i, (current_factor_name, factor_values) in enumerate(factors.items()):
-        unique_factor_values = np.unique(factor_values)
-        contingency_matrix = np.zeros((len(unique_factor_values), n_cls))
-        # Builds a contingency matrix where entry at index (r,c) represents
-        # the frequency of current_factor_name achieving value unique_factor_values[r]
-        # at a data point with class c.
-
-        # TODO: Vectorize this nested for loop
-        for fi, factor_value in enumerate(unique_factor_values):
-            for label in range(n_cls):
-                with_both = np.bitwise_and((labels == label), factor_values == factor_value)
-                contingency_matrix[fi, label] = np.sum(with_both)
-                if 0 < contingency_matrix[fi, label] < 5:
-                    warnings.warn(
-                        f"Factor {current_factor_name} value {factor_value} co-occurs "
-                        f"only {contingency_matrix[fi, label]} times with label {label}. "
-                        "This can cause inaccurate chi_square calculation. Recommend"
-                        "ensuring each label occurs either 0 times or at least 5 times. "
-                        "Alternatively, digitize any continuous-valued factors "
-                        "into fewer bins."
-                    )
-
-        # This deletes rows containing only zeros,
-        # because scipy.stats.chi2_contingency fails when there are rows containing only zeros.
-        rowsums = np.sum(contingency_matrix, axis=1)
-        rowmask = np.where(rowsums)
-        contingency_matrix = contingency_matrix[rowmask]
-
-        chi2, p, _, _ = scipy.stats.chi2_contingency(contingency_matrix)
-
-        chi_scores[i] = chi2
-        p_values[i] = p
-    return chi_scores, p_values
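To make the flow concrete, a hedged end-to-end sketch of the removed parity pipeline as it could have been used in 0.63.0. The toy factor dictionary and bin counts are invented; the function names, argument names, and module path come from the hunk above. Note the tiny sample sizes will trigger the "fewer than 5 co-occurrences" warning by design.

```python
import numpy as np

from dataeval._internal.functional.metadataparity import (  # 0.63.0 module path
    compute_parity,
    format_discretize_factors,
)

# Per-image factors, including the required "class" entry (illustrative data)
data_factors = {
    "class": np.array([0, 0, 1, 1, 0, 1]),
    "time_of_day": np.array(["day", "night", "day", "night", "day", "night"]),
    "altitude": np.array([10.0, 20.0, 15.0, 30.0, 12.0, 25.0]),
}

# Discretize the continuous "altitude" factor into 2 bins; other factors stay as-is
factors, labels = format_discretize_factors(
    data_factors,
    continuous_factor_names=np.array(["altitude"]),
    continuous_factor_bincounts=np.array([2]),
)

# One chi-square score and p-value per metadata factor vs. the class labels
chi_scores, p_values = compute_parity(factors, labels)
print(dict(zip(factors.keys(), zip(chi_scores, p_values))))
```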
dataeval/_internal/functional/utils.py (removed in 0.65.0)
@@ -1,158 +0,0 @@
-from typing import Any, Literal, NamedTuple, Tuple, Union
-
-import numpy as np
-from scipy.signal import convolve2d
-from scipy.sparse import csr_matrix
-from scipy.sparse.csgraph import minimum_spanning_tree as mst
-from scipy.spatial.distance import pdist, squareform
-from sklearn.neighbors import NearestNeighbors
-
-EPSILON = 1e-5
-EDGE_KERNEL = np.array([[-1, -1, -1], [-1, 8, -1], [-1, -1, -1]], dtype=np.int8)
-BIT_DEPTH = (1, 8, 12, 16, 32)
-
-
-def minimum_spanning_tree(X: np.ndarray) -> Any:
-    """
-    Returns the minimum spanning tree from a NumPy image array.
-
-    Parameters
-    ----------
-    X: np.ndarray
-        Numpy image array
-
-    Returns
-    -------
-    Data representing the minimum spanning tree
-    """
-    # All features belong on second dimension
-    X = X.reshape((X.shape[0], -1))
-    # We add a small constant to the distance matrix to ensure scipy interprets
-    # the input graph as fully-connected.
-    dense_eudist = squareform(pdist(X)) + EPSILON
-    eudist_csr = csr_matrix(dense_eudist)
-    return mst(eudist_csr)
-
-
-def get_classes_counts(labels: np.ndarray) -> Tuple[int, int]:
-    """
-    Returns the classes and counts from an array of labels
-
-    Parameters
-    ----------
-    label: np.ndarray
-        Numpy labels array
-
-    Returns
-    -------
-    Classes and counts
-
-    Raises
-    ------
-    ValueError
-        If the number of unique classes is less than 2
-    """
-    classes, counts = np.unique(labels, return_counts=True)
-    M = len(classes)
-    if M < 2:
-        raise ValueError("Label vector contains less than 2 classes!")
-    N = np.sum(counts).astype(int)
-    return M, N
-
-
-def compute_neighbors(
-    A: np.ndarray,
-    B: np.ndarray,
-    k: int = 1,
-    algorithm: Literal["auto", "ball_tree", "kd_tree"] = "auto",
-) -> np.ndarray:
-    """
-    For each sample in A, compute the nearest neighbor in B
-
-    Parameters
-    ----------
-    A, B : np.ndarray
-        The n_samples and n_features respectively
-    k : int
-        The number of neighbors to find
-    algorithm : Literal
-        Tree method for nearest neighbor (auto, ball_tree or kd_tree)
-
-    Note
-    ----
-    Do not use kd_tree if n_features > 20
-
-    Returns
-    -------
-    List:
-        Closest points to each point in A and B
-
-    See Also
-    --------
-    :func:`sklearn.neighbors.NearestNeighbors`
-    """
-
-    nbrs = NearestNeighbors(n_neighbors=k + 1, algorithm=algorithm).fit(B)
-    nns = nbrs.kneighbors(A)[1]
-    nns = nns[:, 1:].squeeze()
-
-    return nns
-
-
-class BitDepth(NamedTuple):
-    depth: int
-    pmin: Union[float, int]
-    pmax: Union[float, int]
-
-
-def get_bitdepth(image: np.ndarray) -> BitDepth:
-    """
-    Approximates the bit depth of the image using the
-    min and max pixel values.
-    """
-    pmin, pmax = np.min(image), np.max(image)
-    if pmin < 0:
-        return BitDepth(0, pmin, pmax)
-    else:
-        depth = ([x for x in BIT_DEPTH if 2**x > pmax] or [max(BIT_DEPTH)])[0]
-        return BitDepth(depth, 0, 2**depth - 1)
-
-
-def rescale(image: np.ndarray, depth: int = 1) -> np.ndarray:
-    """
-    Rescales the image using the bit depth provided.
-    """
-    bitdepth = get_bitdepth(image)
-    if bitdepth.depth == depth:
-        return image
-    else:
-        normalized = (image + bitdepth.pmin) / (bitdepth.pmax - bitdepth.pmin)
-        return normalized * (2**depth - 1)
-
-
-def normalize_image_shape(image: np.ndarray) -> np.ndarray:
-    """
-    Normalizes the image shape into (C,H,W).
-    """
-    ndim = image.ndim
-    if ndim == 2:
-        return np.expand_dims(image, axis=0)
-    elif ndim == 3:
-        return image
-    elif ndim > 3:
-        # Slice all but the last 3 dimensions
-        return image[(0,) * (ndim - 3)]
-    else:
-        raise ValueError("Images must have 2 or more dimensions.")
-
-
-def edge_filter(image: np.ndarray, offset: float = 0.5) -> np.ndarray:
-    """
-    Returns the image filtered using a 3x3 edge detection kernel:
-    [[ -1, -1, -1 ],
-     [ -1,  8, -1 ],
-     [ -1, -1, -1 ]]
-    """
-    edges = convolve2d(image, EDGE_KERNEL, mode="same", boundary="symm") + offset
-    np.clip(edges, 0, 255, edges)
-    return edges
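For reference, a minimal sketch of exercising a few of these removed shared helpers under 0.63.0 (synthetic inputs; import path and signatures from the hunk above):

```python
import numpy as np

from dataeval._internal.functional.utils import (  # 0.63.0 module path
    compute_neighbors,
    minimum_spanning_tree,
    rescale,
)

rng = np.random.default_rng(0)
A = rng.normal(size=(50, 8))
B = rng.normal(size=(50, 8))

nearest = compute_neighbors(A, B, k=1)   # index in B of each A point's nearest neighbor
tree = minimum_spanning_tree(A)          # sparse CSR matrix of MST edge weights
scaled = rescale(rng.integers(0, 256, size=(32, 32)), depth=1)  # pixel values rescaled to [0, 1]
print(nearest.shape, tree.nnz, scaled.max())
```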
File without changes
dataeval/_internal/maite/utils.py (removed in 0.65.0)
@@ -1,30 +0,0 @@
-from typing import Tuple
-
-import numpy as np
-import torch
-
-import maite.protocols.image_classification as ic
-from maite.protocols import ArrayLike
-
-
-def arraylike_to_numpy(xp: ArrayLike) -> np.ndarray:
-    """Converts ArrayLike objects to numpy"""
-
-    # Must ensure Tensors are not on GPU
-    return xp.detach().cpu().numpy() if isinstance(xp, torch.Tensor) else np.asarray(xp)
-
-
-# TODO: Overload with od.Dataset
-# TODO: Check if batching aggregation is faster (e.g. DataLoader)
-# TODO: Add verbosity flags (tqdm?)
-def extract_to_numpy(dataset: ic.Dataset) -> Tuple[np.ndarray, np.ndarray]:
-    """Iterate over dataset and separate images from labels"""
-    images = []
-    labels = []
-
-    # (image, label, metadata)
-    for image, label, _ in dataset:
-        images.append(image)
-        labels.append(label)
-
-    return np.asarray(images), np.asarray(labels)
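A brief sketch of how these removed MAITE interop helpers could have been used in 0.63.0. The toy dataset below is an invented list of (image, label, metadata) tuples standing in for a MAITE image-classification dataset; only the import path and functions come from the hunk above.

```python
import numpy as np
import torch

from dataeval._internal.maite.utils import arraylike_to_numpy, extract_to_numpy  # 0.63.0 path

# Any iterable of (image, label, metadata) tuples satisfies the iteration contract used above
toy_dataset = [
    (np.zeros((3, 16, 16), dtype=np.float32), 0, {"id": 0}),
    (np.ones((3, 16, 16), dtype=np.float32), 1, {"id": 1}),
]

images, labels = extract_to_numpy(toy_dataset)  # type: ignore[arg-type]
print(images.shape, labels.shape)               # (2, 3, 16, 16) (2,)

# Torch tensors are detached and moved to CPU before conversion
print(arraylike_to_numpy(torch.ones(2, 3)).shape)  # (2, 3)
```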