dataeval 0.86.5__py3-none-any.whl → 0.86.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dataeval/__init__.py +1 -1
- dataeval/detectors/linters/outliers.py +41 -39
- {dataeval-0.86.5.dist-info → dataeval-0.86.6.dist-info}/METADATA +1 -1
- {dataeval-0.86.5.dist-info → dataeval-0.86.6.dist-info}/RECORD +6 -6
- {dataeval-0.86.5.dist-info → dataeval-0.86.6.dist-info}/LICENSE.txt +0 -0
- {dataeval-0.86.5.dist-info → dataeval-0.86.6.dist-info}/WHEEL +0 -0
dataeval/__init__.py
CHANGED
dataeval/detectors/linters/outliers.py
CHANGED
@@ -18,57 +18,59 @@ from dataeval.outputs._stats import BASE_ATTRS
|
|
18
18
|
from dataeval.typing import ArrayLike, Dataset
|
19
19
|
|
20
20
|
|
21
|
+
def _get_zscore_mask(values: NDArray[np.float64], threshold: float | None) -> NDArray[np.bool_] | None:
|
22
|
+
threshold = threshold if threshold is not None else 3.0
|
23
|
+
std_val = np.nanstd(values)
|
24
|
+
if std_val > EPSILON:
|
25
|
+
mean_val = np.nanmean(values)
|
26
|
+
abs_diff = np.abs(values - mean_val)
|
27
|
+
return (abs_diff / std_val) > threshold
|
28
|
+
return None
|
29
|
+
|
30
|
+
|
31
|
+
def _get_modzscore_mask(values: NDArray[np.float64], threshold: float | None) -> NDArray[np.bool_] | None:
|
32
|
+
threshold = threshold if threshold is not None else 3.5
|
33
|
+
median_val = np.nanmedian(values)
|
34
|
+
abs_diff = np.abs(values - median_val)
|
35
|
+
m_abs_diff = np.nanmedian(abs_diff)
|
36
|
+
m_abs_diff = np.nanmean(abs_diff) if m_abs_diff <= EPSILON else m_abs_diff
|
37
|
+
if m_abs_diff > EPSILON:
|
38
|
+
mod_z_score = 0.6745 * abs_diff / m_abs_diff
|
39
|
+
return mod_z_score > threshold
|
40
|
+
return None
|
41
|
+
|
42
|
+
|
43
|
+
def _get_iqr_mask(values: NDArray[np.float64], threshold: float | None) -> NDArray[np.bool_] | None:
|
44
|
+
threshold = threshold if threshold is not None else 1.5
|
45
|
+
qrt = np.nanpercentile(values, q=(25, 75), method="midpoint")
|
46
|
+
iqr_val = qrt[1] - qrt[0]
|
47
|
+
if iqr_val > EPSILON:
|
48
|
+
iqr_threshold = iqr_val * threshold
|
49
|
+
return (values < (qrt[0] - iqr_threshold)) | (values > (qrt[1] + iqr_threshold))
|
50
|
+
return None
|
51
|
+
|
52
|
+
|
21
53
|
def _get_outlier_mask(
|
22
54
|
values: NDArray[Any], method: Literal["zscore", "modzscore", "iqr"], threshold: float | None
|
23
55
|
) -> NDArray[np.bool_]:
|
24
56
|
if len(values) == 0:
|
25
57
|
return np.array([], dtype=bool)
|
26
58
|
|
27
|
-
|
28
|
-
|
29
|
-
valid_mask = ~np.isnan(values)
|
30
|
-
outliers = np.full(values.shape, False, dtype=bool)
|
31
|
-
|
32
|
-
if not np.any(valid_mask):
|
33
|
-
return outliers
|
34
|
-
|
35
|
-
if method == "zscore":
|
36
|
-
threshold = threshold if threshold is not None else 3.0
|
37
|
-
|
38
|
-
std_val = np.nanstd(values)
|
39
|
-
|
40
|
-
if std_val > EPSILON:
|
41
|
-
mean_val = np.nanmean(values)
|
42
|
-
abs_diff = np.abs(values - mean_val)
|
43
|
-
outliers = (abs_diff / std_val) > threshold
|
59
|
+
nan_mask = np.isnan(values)
|
44
60
|
|
61
|
+
if np.all(nan_mask):
|
62
|
+
outliers = None
|
63
|
+
elif method == "zscore":
|
64
|
+
outliers = _get_zscore_mask(values.astype(np.float64), threshold)
|
45
65
|
elif method == "modzscore":
|
46
|
-
|
47
|
-
|
48
|
-
median_val = np.nanmedian(values)
|
49
|
-
abs_diff = np.abs(values - median_val)
|
50
|
-
m_abs_diff = np.nanmedian(abs_diff)
|
51
|
-
m_abs_diff = np.nanmean(abs_diff) if m_abs_diff <= EPSILON else m_abs_diff
|
52
|
-
|
53
|
-
if m_abs_diff > EPSILON:
|
54
|
-
mod_z_score = 0.6745 * abs_diff / m_abs_diff
|
55
|
-
outliers = mod_z_score > threshold
|
56
|
-
|
66
|
+
outliers = _get_modzscore_mask(values.astype(np.float64), threshold)
|
57
67
|
elif method == "iqr":
|
58
|
-
|
59
|
-
|
60
|
-
qrt = np.nanpercentile(values, q=(25, 75), method="midpoint")
|
61
|
-
iqr_val = qrt[1] - qrt[0]
|
62
|
-
|
63
|
-
if iqr_val > EPSILON:
|
64
|
-
iqr_threshold = iqr_val * threshold
|
65
|
-
outliers = (values < (qrt[0] - iqr_threshold)) | (values > (qrt[1] + iqr_threshold))
|
66
|
-
|
68
|
+
outliers = _get_iqr_mask(values.astype(np.float64), threshold)
|
67
69
|
else:
|
68
70
|
raise ValueError("Outlier method must be 'zscore' 'modzscore' or 'iqr'.")
|
69
71
|
|
70
|
-
outliers
|
71
|
-
return outliers
|
72
|
+
# If outliers were found, return the mask with NaN values set to False, otherwise return all False
|
73
|
+
return outliers & ~nan_mask if outliers is not None else np.full(values.shape, False, dtype=bool)
|
72
74
|
|
73
75
|
|
74
76
|
class Outliers:
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: dataeval
|
3
|
-
Version: 0.86.5
|
3
|
+
Version: 0.86.6
|
4
4
|
Summary: DataEval provides a simple interface to characterize image data and its impact on model performance across classification and object-detection tasks
|
5
5
|
Home-page: https://dataeval.ai/
|
6
6
|
License: MIT
|
@@ -1,4 +1,4 @@
|
|
1
|
-
dataeval/__init__.py,sha256=
|
1
|
+
dataeval/__init__.py,sha256=9M6Th_pJ371mFO5oLUC0UZJmDclHa8SbNJse71-T84I,1636
|
2
2
|
dataeval/_log.py,sha256=C7AGkIRzymvYJ0LQXtnShiy3i5Xrp8T58JzIHHguk_Q,365
|
3
3
|
dataeval/config.py,sha256=bHa8np4FCtLLv8_xlfdDC4lb1InJ_kT0vXDO5P42rvk,4082
|
4
4
|
dataeval/data/__init__.py,sha256=wzQ6uUFLNB3VJR0a2QnRBYwEmwXT93q0WpHu7FmFW1E,486
|
@@ -32,7 +32,7 @@ dataeval/detectors/drift/_uncertainty.py,sha256=BHlykJ-r7TGLJxdPfoazXnoAJ1qVDzbk
|
|
32
32
|
dataeval/detectors/drift/updates.py,sha256=L1PnrPlIE1x6ujCc5mCwjcAZwadVTn-Zjb6MnTDvzJQ,2251
|
33
33
|
dataeval/detectors/linters/__init__.py,sha256=xn2zPwUcmsuf-Jd9uw6AVI11C9z1b1Y9fYtuFnXenZ0,404
|
34
34
|
dataeval/detectors/linters/duplicates.py,sha256=X5WSEvI_BHkLoXjkaHK6wTnSkx4IjpO_exMRjSlhc70,4963
|
35
|
-
dataeval/detectors/linters/outliers.py,sha256=
|
35
|
+
dataeval/detectors/linters/outliers.py,sha256=GaM9n8yPgBPzVOL_bxJCj0eCwobEEP4JHKHD9liRdlw,10130
|
36
36
|
dataeval/detectors/ood/__init__.py,sha256=juCYBDs7CQEAtMhnEpPqF6uTrOIH9kTBSuQ_GRw6a8o,283
|
37
37
|
dataeval/detectors/ood/ae.py,sha256=fTrUfFxv6xUqzKpwMC8rW3JrizA16M_bgzqLuBKMrS0,2944
|
38
38
|
dataeval/detectors/ood/base.py,sha256=9b-Ljznf0lB1SXF4F_Aj3eJ4Y3ijGEDPMjucUsWOGJM,3051
|
@@ -107,7 +107,7 @@ dataeval/utils/torch/models.py,sha256=1idpXyjrYcCBSsbxxRUOto8xr4MJNjDEqQHiIXVU5Z
|
|
107
107
|
dataeval/utils/torch/trainer.py,sha256=Oc2lK13uPGhmLYbmAqlPWyKxgG4YJFlnSXCqFHUZbdA,5528
|
108
108
|
dataeval/workflows/__init__.py,sha256=ou8y0KO-d6W5lgmcyLjKlf-J_ckP3vilW7wHkgiDlZ4,255
|
109
109
|
dataeval/workflows/sufficiency.py,sha256=j-R8dg4XE6a66p_oTXG2GNzgg3vGk85CTblxhFXaxog,8513
|
110
|
-
dataeval-0.86.5.dist-info/LICENSE.txt,sha256=uAooygKWvX6NbU9Ran9oG2msttoG8aeTeHSTe5JeCnY,1061
|
111
|
-
dataeval-0.86.5.dist-info/METADATA,sha256=
|
112
|
-
dataeval-0.86.5.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
|
113
|
-
dataeval-0.86.5.dist-info/RECORD,,
|
110
|
+
dataeval-0.86.6.dist-info/LICENSE.txt,sha256=uAooygKWvX6NbU9Ran9oG2msttoG8aeTeHSTe5JeCnY,1061
|
111
|
+
dataeval-0.86.6.dist-info/METADATA,sha256=pHhjYhbE3BlgvxtINd333FwljVfELIKQnplaAwLNZVg,5353
|
112
|
+
dataeval-0.86.6.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
|
113
|
+
dataeval-0.86.6.dist-info/RECORD,,
|
File without changes
|
File without changes
|