dataeval 0.84.1__py3-none-any.whl → 0.86.0__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.
- dataeval/__init__.py +1 -1
- dataeval/data/__init__.py +19 -0
- dataeval/{utils/data → data}/_embeddings.py +137 -17
- dataeval/{utils/data → data}/_metadata.py +20 -8
- dataeval/{utils/data → data}/_selection.py +22 -9
- dataeval/{utils/data → data}/_split.py +1 -1
- dataeval/data/selections/__init__.py +19 -0
- dataeval/{utils/data → data}/selections/_classbalance.py +1 -2
- dataeval/data/selections/_classfilter.py +110 -0
- dataeval/{utils/data → data}/selections/_indices.py +1 -1
- dataeval/{utils/data → data}/selections/_limit.py +1 -1
- dataeval/{utils/data → data}/selections/_prioritize.py +2 -2
- dataeval/{utils/data → data}/selections/_reverse.py +1 -1
- dataeval/{utils/data → data}/selections/_shuffle.py +1 -1
- dataeval/detectors/drift/__init__.py +4 -1
- dataeval/detectors/drift/_base.py +1 -1
- dataeval/detectors/drift/_cvm.py +2 -2
- dataeval/detectors/drift/_ks.py +2 -2
- dataeval/detectors/drift/_mmd.py +2 -2
- dataeval/detectors/drift/_mvdc.py +92 -0
- dataeval/detectors/drift/_nml/__init__.py +6 -0
- dataeval/detectors/drift/_nml/_base.py +68 -0
- dataeval/detectors/drift/_nml/_chunk.py +404 -0
- dataeval/detectors/drift/_nml/_domainclassifier.py +192 -0
- dataeval/detectors/drift/_nml/_result.py +98 -0
- dataeval/detectors/drift/_nml/_thresholds.py +280 -0
- dataeval/detectors/linters/duplicates.py +1 -1
- dataeval/detectors/linters/outliers.py +1 -1
- dataeval/metadata/_distance.py +1 -1
- dataeval/metadata/_ood.py +4 -4
- dataeval/metrics/bias/_balance.py +1 -1
- dataeval/metrics/bias/_diversity.py +1 -1
- dataeval/metrics/bias/_parity.py +1 -1
- dataeval/metrics/stats/_labelstats.py +2 -2
- dataeval/outputs/__init__.py +2 -1
- dataeval/outputs/_bias.py +2 -4
- dataeval/outputs/_drift.py +68 -0
- dataeval/outputs/_linters.py +1 -6
- dataeval/outputs/_stats.py +1 -6
- dataeval/typing.py +31 -0
- dataeval/utils/__init__.py +2 -2
- dataeval/utils/data/__init__.py +5 -20
- dataeval/utils/data/collate.py +2 -0
- dataeval/utils/datasets/__init__.py +17 -0
- dataeval/utils/{data/datasets → datasets}/_base.py +3 -3
- dataeval/utils/{data/datasets → datasets}/_cifar10.py +2 -2
- dataeval/utils/{data/datasets → datasets}/_milco.py +2 -2
- dataeval/utils/{data/datasets → datasets}/_mnist.py +2 -2
- dataeval/utils/{data/datasets → datasets}/_ships.py +2 -2
- dataeval/utils/{data/datasets → datasets}/_voc.py +3 -3
- {dataeval-0.84.1.dist-info → dataeval-0.86.0.dist-info}/METADATA +3 -2
- dataeval-0.86.0.dist-info/RECORD +114 -0
- dataeval/utils/data/datasets/__init__.py +0 -17
- dataeval/utils/data/selections/__init__.py +0 -19
- dataeval/utils/data/selections/_classfilter.py +0 -44
- dataeval-0.84.1.dist-info/RECORD +0 -106
- /dataeval/{utils/data → data}/_images.py +0 -0
- /dataeval/{utils/data → data}/_targets.py +0 -0
- /dataeval/utils/{metadata.py → data/metadata.py} +0 -0
- /dataeval/utils/{data/datasets → datasets}/_fileio.py +0 -0
- /dataeval/utils/{data/datasets → datasets}/_mixin.py +0 -0
- /dataeval/utils/{data/datasets → datasets}/_types.py +0 -0
- {dataeval-0.84.1.dist-info → dataeval-0.86.0.dist-info}/LICENSE.txt +0 -0
- {dataeval-0.84.1.dist-info → dataeval-0.86.0.dist-info}/WHEEL +0 -0
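Most of the file moves above are a package reorganization: the data-handling utilities under `dataeval.utils.data` are promoted to `dataeval.data`, and the bundled datasets move from `dataeval.utils.data.datasets` to `dataeval.utils.datasets`, with the new multivariate drift pieces (`_mvdc.py` and the `_nml` subpackage) layered on top. A minimal sketch of how downstream imports would change, inferred from the moves listed above; the `Metadata` import appears verbatim in the hunks below, while the dataset class name is purely illustrative:

```python
# dataeval 0.84.1 (old paths, left side of the moves listed above)
# from dataeval.utils.data import Metadata
# from dataeval.utils.data.datasets import MNIST  # dataset class name is illustrative

# dataeval 0.86.0 (new paths, right side of the moves listed above)
from dataeval.data import Metadata
from dataeval.utils.datasets import MNIST  # dataset class name is illustrative
```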
dataeval/detectors/drift/_nml/_domainclassifier.py
ADDED
@@ -0,0 +1,192 @@
+"""
+Source code derived from NannyML 0.13.0
+https://github.com/NannyML/nannyml/blob/main/nannyml/drift/multivariate/domain_classifier/calculator.py
+
+Licensed under Apache Software License (Apache 2.0)
+"""
+
+from __future__ import annotations
+
+import logging
+from typing import Any
+
+import numpy as np
+import pandas as pd
+from lightgbm import LGBMClassifier
+from numpy.typing import NDArray
+from sklearn.metrics import roc_auc_score
+from sklearn.model_selection import StratifiedKFold
+
+from dataeval.config import get_max_processes, get_seed
+from dataeval.detectors.drift._nml._base import AbstractCalculator, _create_multilevel_index
+from dataeval.detectors.drift._nml._chunk import Chunk, Chunker
+from dataeval.detectors.drift._nml._thresholds import ConstantThreshold, Threshold, calculate_threshold_values
+from dataeval.outputs._base import set_metadata
+from dataeval.outputs._drift import DriftMVDCOutput
+
+logger = logging.getLogger(__name__)
+
+DEFAULT_LGBM_HYPERPARAMS = {
+    "boosting_type": "gbdt",
+    "class_weight": None,
+    "colsample_bytree": 1.0,
+    "deterministic": True,
+    "importance_type": "split",
+    "learning_rate": 0.1,
+    "max_depth": -1,
+    "min_child_samples": 20,
+    "min_child_weight": 0.001,
+    "min_split_gain": 0.0,
+    "n_estimators": 100,
+    "n_jobs": get_max_processes() or 0,
+    "num_leaves": 31,
+    "objective": None,
+    "random_state": get_seed(),
+    "reg_alpha": 0.0,
+    "reg_lambda": 0.0,
+    "subsample": 1.0,
+    "subsample_for_bin": 200000,
+    "subsample_freq": 0,
+    "verbosity": -1,
+}
+
+
+class DomainClassifierCalculator(AbstractCalculator):
+    """
+    DomainClassifierCalculator implementation.
+
+    Uses Drift Detection Classifier's cross validated performance as a measure of drift.
+    """
+
+    def __init__(
+        self,
+        chunker: Chunker | None = None,
+        cv_folds_num: int = 5,
+        hyperparameters: dict[str, Any] | None = None,
+        threshold: Threshold = ConstantThreshold(lower=0.45, upper=0.65),
+    ) -> None:
+        """
+        Create a new DomainClassifierCalculator instance.
+
+        Parameters
+        ----------
+        chunker : Chunker, default=None
+            The `Chunker` used to split the data sets into a list of chunks.
+        cv_folds_num : int, default=5
+            Number of cross-validation folds to use when calculating DC discrimination value.
+        hyperparameters : dict[str, Any], default=None
+            A dictionary used to provide your own custom hyperparameters when training the discrimination model.
+            Check out the available hyperparameter options in the
+            `LightGBM docs <https://lightgbm.readthedocs.io/en/latest/pythonapi/lightgbm.LGBMClassifier.html>`_.
+        threshold : Threshold, default=ConstantThreshold
+            The threshold you wish to evaluate values on. Defaults to a ConstantThreshold with a lower value
+            of 0.45 and an upper value of 0.65.
+        """
+        super().__init__(chunker, logger)
+
+        self.cv_folds_num = cv_folds_num
+        self.hyperparameters = DEFAULT_LGBM_HYPERPARAMS if hyperparameters is None else hyperparameters
+        self.threshold = threshold
+        self.result: DriftMVDCOutput | None = None
+
+    def _fit(self, reference_data: pd.DataFrame) -> DriftMVDCOutput:
+        """Fits the DC calculator to a set of reference data."""
+        self._x_ref = reference_data
+        result = self._calculate(data=self._x_ref)
+        result._data[("chunk", "period")] = "reference"
+
+        return result
+
+    @set_metadata
+    def _calculate(self, data: pd.DataFrame) -> DriftMVDCOutput:
+        """Calculate the data DC calculator metric for a given data set."""
+        chunks = self.chunker.split(data)
+
+        res = pd.DataFrame.from_records(
+            [
+                {
+                    **chunk.dict(),
+                    "period": "analysis",
+                    "classifier_auroc_value": self._calculate_chunk(chunk=chunk),
+                }
+                for chunk in chunks
+            ]
+        )
+
+        multilevel_index = _create_multilevel_index(chunks, "domain_classifier_auroc", ["value"])
+        res.columns = multilevel_index
+        res = res.reset_index(drop=True)
+
+        res = self._populate_alert_thresholds(res)
+
+        if self.result is None:
+            self.result = DriftMVDCOutput(results_data=res)
+        else:
+            self.result = self.result.filter(period="reference")
+            self.result._data = pd.concat([self.result._data, res], ignore_index=True)
+        return self.result
+
+    def _calculate_chunk(self, chunk: Chunk):
+        if self.result is None:
+            # Use information from chunk indices to identify reference chunk's location. This is possible because
+            # both the internal reference data copy and the chunk data were sorted by timestamp, so these
+            # indices align. This way we eliminate the need to combine these two data frames and drop duplicate rows,
+            # which is a costly operation.
+            df_X = self._x_ref
+            y = np.zeros(len(df_X), dtype=np.intp)
+            y[chunk.start_index : chunk.end_index + 1] = 1
+        else:
+            chunk_X = chunk.data
+            reference_X = self._x_ref
+            chunk_y = np.ones(len(chunk_X), dtype=np.intp)
+            reference_y = np.zeros(len(reference_X), dtype=np.intp)
+            df_X = pd.concat([reference_X, chunk_X], ignore_index=True)
+            y = np.concatenate([reference_y, chunk_y])
+
+        skf = StratifiedKFold(n_splits=self.cv_folds_num)
+        all_preds: list[NDArray[np.float32]] = []
+        all_tgts: list[NDArray[np.intp]] = []
+        for i, (train_index, test_index) in enumerate(skf.split(df_X, y)):
+            _trx = df_X.iloc[train_index]
+            _try = y[train_index]
+            _tsx = df_X.iloc[test_index]
+            _tsy = y[test_index]
+            model = LGBMClassifier(**self.hyperparameters)
+            model.fit(_trx, _try)
+            preds = np.asarray(model.predict_proba(_tsx), dtype=np.float32)[:, 1]
+            all_preds.append(preds)
+            all_tgts.append(_tsy)
+
+        np_all_preds = np.concatenate(all_preds, axis=0)
+        np_all_tgts = np.concatenate(all_tgts, axis=0)
+        try:
+            # catch case where all rows are duplicates
+            result = roc_auc_score(np_all_tgts, np_all_preds)
+        except ValueError as err:
+            if str(err) != "Only one class present in y_true. ROC AUC score is not defined in that case.":
+                raise
+            else:
+                # by definition if reference and chunk exactly match we can't discriminate
+                result = 0.5
+        return result
+
+    def _populate_alert_thresholds(self, result_data: pd.DataFrame) -> pd.DataFrame:
+        if self.result is None:
+            self._threshold_values = calculate_threshold_values(
+                threshold=self.threshold,
+                data=result_data.loc[:, ("domain_classifier_auroc", "value")],  # type: ignore | dataframe loc
+                lower_threshold_value_limit=0.0,
+                upper_threshold_value_limit=1.0,
+                logger=self._logger,
+            )
+
+        result_data[("domain_classifier_auroc", "upper_threshold")] = self._threshold_values[1]
+        result_data[("domain_classifier_auroc", "lower_threshold")] = self._threshold_values[0]
+        result_data[("domain_classifier_auroc", "alert")] = result_data.apply(
+            lambda row: bool(
+                row["domain_classifier_auroc", "value"] > row["domain_classifier_auroc", "upper_threshold"]
+                or row["domain_classifier_auroc", "value"] < row["domain_classifier_auroc", "lower_threshold"]
+            ),
+            axis=1,
+        )
+        return result_data
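The calculator above turns multivariate drift detection into a binary classification problem: reference rows are labeled 0, chunk rows are labeled 1, and the cross-validated AUROC of a classifier trained to tell them apart becomes the drift score. An AUROC near 0.5 means the chunk is indistinguishable from the reference data; values outside the default ConstantThreshold band of (0.45, 0.65) raise an alert. A self-contained sketch of the same measure, with scikit-learn's GradientBoostingClassifier standing in for the LGBMClassifier used above; the helper function name and the synthetic data are illustrative:

```python
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold


def domain_classifier_auroc(x_ref: np.ndarray, x_chunk: np.ndarray, n_folds: int = 5) -> float:
    """Cross-validated AUROC of a classifier separating reference rows (0) from chunk rows (1)."""
    x = np.concatenate([x_ref, x_chunk])
    y = np.concatenate([np.zeros(len(x_ref), dtype=np.intp), np.ones(len(x_chunk), dtype=np.intp)])

    preds, tgts = [], []
    for train_idx, test_idx in StratifiedKFold(n_splits=n_folds).split(x, y):
        model = GradientBoostingClassifier().fit(x[train_idx], y[train_idx])
        preds.append(model.predict_proba(x[test_idx])[:, 1])
        tgts.append(y[test_idx])
    return float(roc_auc_score(np.concatenate(tgts), np.concatenate(preds)))


rng = np.random.default_rng(0)
reference = rng.normal(0.0, 1.0, size=(500, 4))
same_distribution = rng.normal(0.0, 1.0, size=(250, 4))
shifted = rng.normal(0.75, 1.0, size=(250, 4))

print(domain_classifier_auroc(reference, same_distribution))  # close to 0.5: no drift
print(domain_classifier_auroc(reference, shifted))            # well above 0.65: drift alert
```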
dataeval/detectors/drift/_nml/_result.py
ADDED
@@ -0,0 +1,98 @@
+"""
+Contains the results of the data reconstruction drift calculation and provides plotting functionality.
+
+Source code derived from NannyML 0.13.0
+https://github.com/NannyML/nannyml/blob/main/nannyml/base.py
+
+Licensed under Apache Software License (Apache 2.0)
+"""
+
+from __future__ import annotations
+
+import copy
+from abc import ABC, abstractmethod
+from typing import NamedTuple, Sequence
+
+import pandas as pd
+from typing_extensions import Self
+
+from dataeval.outputs._base import GenericOutput
+
+
+class Metric(NamedTuple):
+    display_name: str
+    column_name: str
+
+
+class AbstractResult(GenericOutput[pd.DataFrame]):
+    def __init__(self, results_data: pd.DataFrame) -> None:
+        self._data = results_data.copy(deep=True)
+
+    def data(self) -> pd.DataFrame:
+        return self.to_df()
+
+    @property
+    def empty(self) -> bool:
+        return self._data is None or self._data.empty
+
+    def __len__(self) -> int:
+        return 0 if self.empty else len(self._data)
+
+    def to_df(self, multilevel: bool = True) -> pd.DataFrame:
+        """Export results to pandas dataframe."""
+        if multilevel:
+            return self._data
+        else:
+            column_names = [
+                "_".join(col).replace("chunk_chunk_chunk", "chunk").replace("chunk_chunk", "chunk")
+                for col in self._data.columns.values
+            ]
+            single_level_data = self._data.copy(deep=True)
+            single_level_data.columns = column_names
+            return single_level_data
+
+    def filter(self, period: str = "all", metrics: str | Sequence[str] | None = None) -> Self:
+        """Returns filtered result metric data."""
+        if metrics and not isinstance(metrics, (str, Sequence)):
+            raise ValueError("metrics value provided is not a valid metric or sequence of metrics")
+        if isinstance(metrics, str):
+            metrics = [metrics]
+        return self._filter(period, metrics)
+
+    @abstractmethod
+    def _filter(self, period: str, metrics: Sequence[str] | None = None) -> Self: ...
+
+
+class Abstract1DResult(AbstractResult, ABC):
+    def __init__(self, results_data: pd.DataFrame) -> None:
+        super().__init__(results_data)
+
+    def _filter(self, period: str, metrics=None) -> Self:
+        data = self._data
+        if period != "all":
+            data = self._data.loc[self._data.loc[:, ("chunk", "period")] == period, :]  # type: ignore | dataframe loc
+        data = data.reset_index(drop=True)
+
+        res = copy.deepcopy(self)
+        res._data = data
+        return res
+
+
+class PerMetricResult(Abstract1DResult):
+    def __init__(self, results_data: pd.DataFrame, metrics: Sequence[Metric] = []) -> None:
+        super().__init__(results_data)
+        self.metrics = metrics
+
+    def _filter(self, period: str, metrics: Sequence[str] | None = None) -> Self:
+        if metrics is None:
+            metrics = [metric.column_name for metric in self.metrics]
+
+        res = super()._filter(period)
+
+        data = pd.concat([res._data.loc[:, (["chunk"])], res._data.loc[:, (metrics,)]], axis=1)  # type: ignore | dataframe loc
+        data = data.reset_index(drop=True)
+
+        res._data = data
+        res.metrics = [metric for metric in self.metrics if metric.column_name in metrics]
+
+        return res
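The only non-obvious step above is `to_df(multilevel=False)`: the result table keeps a two-level column index (a `chunk` group plus one group per metric), and flattening joins the levels with underscores while collapsing repeated `chunk` prefixes. A small pandas-only illustration of that renaming; the values are made up:

```python
import pandas as pd

# Build a tiny result-like frame with multilevel columns (tuple keys become a MultiIndex)
data = pd.DataFrame(
    {
        ("chunk", "key"): ["[0:999]", "[1000:1999]"],
        ("chunk", "period"): ["reference", "analysis"],
        ("domain_classifier_auroc", "value"): [0.51, 0.92],
        ("domain_classifier_auroc", "alert"): [False, True],
    }
)

# Same flattening expression used in to_df(multilevel=False)
flat_names = [
    "_".join(col).replace("chunk_chunk_chunk", "chunk").replace("chunk_chunk", "chunk")
    for col in data.columns.values
]
print(flat_names)
# ['chunk_key', 'chunk_period', 'domain_classifier_auroc_value', 'domain_classifier_auroc_alert']
```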
dataeval/detectors/drift/_nml/_thresholds.py
ADDED
@@ -0,0 +1,280 @@
+"""
+Source code derived from NannyML 0.13.0
+https://github.com/NannyML/nannyml/blob/main/nannyml/thresholds.py
+
+Licensed under Apache Software License (Apache 2.0)
+"""
+
+from __future__ import annotations
+
+import logging
+from abc import ABC, abstractmethod
+from typing import Any, Callable, ClassVar
+
+import numpy as np
+
+
+class Threshold(ABC):
+    """A base class used to calculate lower and upper threshold values given one or multiple arrays.
+
+    Any subclass should implement the abstract `thresholds` method.
+    It takes an array or list of arrays and converts them into lower and upper threshold values, represented
+    as a tuple of optional floats.
+
+    A `None` threshold value is interpreted as if there is no upper or lower threshold.
+    One or both values might be `None`.
+    """
+
+    _registry: ClassVar[dict[str, type[Threshold]]] = {}
+    """Class registry lookup to get threshold subclass from threshold_type string"""
+
+    def __str__(self) -> str:
+        return self.__str__()
+
+    def __repr__(self) -> str:
+        return self.__class__.__name__ + str(vars(self))
+
+    def __eq__(self, other: object) -> bool:
+        return isinstance(other, self.__class__) and other.__dict__ == self.__dict__
+
+    def __init_subclass__(cls, threshold_type: str) -> None:
+        Threshold._registry[threshold_type] = cls
+
+    @abstractmethod
+    def thresholds(self, data: np.ndarray) -> tuple[float | None, float | None]:
+        """Returns lower and upper threshold values when given one or more np.ndarray instances.
+
+        Parameters:
+            data: np.ndarray
+                An array of values used to calculate the thresholds on. This will most often represent a metric
+                calculated on one or more sets of data, e.g. a list of F1 scores of multiple data chunks.
+            kwargs: dict[str, Any]
+                Optional keyword arguments passed to the implementing subclass.
+
+        Returns:
+            lower, upper: tuple[Optional[float], Optional[float]]
+                The lower and upper threshold values. One or both might be `None`.
+        """
+
+    @classmethod
+    def parse_object(cls, obj: dict[str, Any]) -> Threshold:
+        """Parse object as :class:`Threshold`"""
+        class_name = obj.pop("type", "")
+
+        try:
+            threshold_cls = cls._registry[class_name]
+        except KeyError:
+            accepted_values = ", ".join(map(repr, cls._registry))
+            raise ValueError(f"Expected one of {accepted_values} for threshold type, but received '{class_name}'")
+
+        return threshold_cls(**obj)
+
+
+class ConstantThreshold(Threshold, threshold_type="constant"):
+    """A `Thresholder` implementation that returns a constant lower and/or upper threshold value.
+
+    Attributes:
+        lower: Optional[float]
+            The constant lower threshold value. Defaults to `None`, meaning there is no lower threshold.
+        upper: Optional[float]
+            The constant upper threshold value. Defaults to `None`, meaning there is no upper threshold.
+
+    Raises:
+        ValueError: raised when an argument was given using an incorrect type or name
+        ValueError: raised when the ConstantThreshold could not be created using the given argument values
+
+    Examples:
+        >>> data = np.array(range(10))
+        >>> t = ConstantThreshold(lower=None, upper=0.1)
+        >>> lower, upper = t.thresholds(data)
+        >>> print(lower, upper)
+        None 0.1
+    """
+
+    def __init__(self, lower: float | int | None = None, upper: float | int | None = None):
+        """Creates a new ConstantThreshold instance.
+
+        Args:
+            lower: Optional[Union[float, int]], default=None
+                The constant lower threshold value. Defaults to `None`, meaning there is no lower threshold.
+            upper: Optional[Union[float, int]], default=None
+                The constant upper threshold value. Defaults to `None`, meaning there is no upper threshold.
+
+        Raises:
+            ValueError: raised when an argument was given using an incorrect type or name
+            ValueError: raised when the ConstantThreshold could not be created using the given argument values
+        """
+        self._validate_inputs(lower, upper)
+
+        self.lower = lower
+        self.upper = upper
+
+    def thresholds(self, data: np.ndarray) -> tuple[float | None, float | None]:
+        return self.lower, self.upper
+
+    @staticmethod
+    def _validate_inputs(lower: float | int | None = None, upper: float | int | None = None):
+        if lower is not None and not isinstance(lower, (float, int)) or isinstance(lower, bool):
+            raise ValueError(f"expected type of 'lower' to be 'float', 'int' or None but got '{type(lower).__name__}'")
+
+        if upper is not None and not isinstance(upper, (float, int)) or isinstance(upper, bool):
+            raise ValueError(f"expected type of 'upper' to be 'float', 'int' or None but got '{type(upper).__name__}'")
+
+        # explicit None check is required due to special interpretation of the value 0.0 as False
+        if lower is not None and upper is not None and lower >= upper:
+            raise ValueError(f"lower threshold {lower} must be less than upper threshold {upper}")
+
+
+class StandardDeviationThreshold(Threshold, threshold_type="standard_deviation"):
+    """A Thresholder that offsets the mean of an array by a multiple of the standard deviation of the array values.
+
+    This thresholder will take the aggregate of an array of values, the mean by default, and add or subtract an offset
+    to get the upper and lower threshold values.
+    This offset is calculated as a multiplier, by default 3, times the standard deviation of the given array.
+
+    Attributes:
+        std_lower_multiplier: float
+        std_upper_multiplier: float
+
+    Examples:
+        >>> data = np.array(range(10))
+        >>> t = StandardDeviationThreshold()
+        >>> lower, upper = t.thresholds(data)
+        >>> print(lower, upper)
+        -4.116843969807043 13.116843969807043
+    """
+
+    def __init__(
+        self,
+        std_lower_multiplier: float | int | None = 3,
+        std_upper_multiplier: float | int | None = 3,
+        offset_from: Callable[[np.ndarray], Any] = np.nanmean,
+    ):
+        """Creates a new StandardDeviationThreshold instance.
+
+        Args:
+            std_lower_multiplier: float, default=3
+                The number the standard deviation of the input array will be multiplied with to form the lower offset.
+                This value will be subtracted from the aggregate of the input array.
+                Defaults to 3.
+            std_upper_multiplier: float, default=3
+                The number the standard deviation of the input array will be multiplied with to form the upper offset.
+                This value will be added to the aggregate of the input array.
+                Defaults to 3.
+            offset_from: Callable[[np.ndarray], Any], default=np.nanmean
+                A function that will be applied to the input array to aggregate it into a single value.
+                Adding the upper offset to this value will yield the upper threshold, subtracting the lower offset
+                will yield the lower threshold.
+        """
+
+        self._validate_inputs(std_lower_multiplier, std_upper_multiplier)
+
+        self.std_lower_multiplier = std_lower_multiplier
+        self.std_upper_multiplier = std_upper_multiplier
+        self.offset_from = offset_from
+
+    def thresholds(self, data: np.ndarray) -> tuple[float | None, float | None]:
+        aggregate = self.offset_from(data)
+        std = np.nanstd(data)
+
+        lower_threshold = aggregate - std * self.std_lower_multiplier if self.std_lower_multiplier is not None else None
+
+        upper_threshold = aggregate + std * self.std_upper_multiplier if self.std_upper_multiplier is not None else None
+
+        return lower_threshold, upper_threshold
+
+    @staticmethod
+    def _validate_inputs(std_lower_multiplier: float | int | None = 3, std_upper_multiplier: float | int | None = 3):
+        if (
+            std_lower_multiplier is not None
+            and not isinstance(std_lower_multiplier, (float, int))
+            or isinstance(std_lower_multiplier, bool)
+        ):
+            raise ValueError(
+                f"expected type of 'std_lower_multiplier' to be 'float', 'int' or None "
+                f"but got '{type(std_lower_multiplier).__name__}'"
+            )
+
+        if std_lower_multiplier and std_lower_multiplier < 0:
+            raise ValueError(f"'std_lower_multiplier' should be greater than 0 but got value {std_lower_multiplier}")
+
+        if (
+            std_upper_multiplier is not None
+            and not isinstance(std_upper_multiplier, (float, int))
+            or isinstance(std_upper_multiplier, bool)
+        ):
+            raise ValueError(
+                f"expected type of 'std_upper_multiplier' to be 'float', 'int' or None "
+                f"but got '{type(std_upper_multiplier).__name__}'"
+            )
+
+        if std_upper_multiplier and std_upper_multiplier < 0:
+            raise ValueError(f"'std_upper_multiplier' should be greater than 0 but got value {std_upper_multiplier}")
+
+
+def calculate_threshold_values(
+    threshold: Threshold,
+    data: np.ndarray,
+    lower_threshold_value_limit: float | None = None,
+    upper_threshold_value_limit: float | None = None,
+    override_using_none: bool = False,
+    logger: logging.Logger | None = None,
+    metric_name: str | None = None,
+) -> tuple[float | None, float | None]:
+    """Calculate lower and upper threshold values with respect to the provided Threshold and value limits.
+
+    Parameters:
+        threshold: Threshold
+            The Threshold instance that determines how the lower and upper threshold values will be calculated.
+        data: np.ndarray
+            The data used by the Threshold instance to calculate the lower and upper threshold values.
+            This will often be the values of a drift detection method or performance metric on chunks of reference data.
+        lower_threshold_value_limit: Optional[float], default=None
+            An optional value that serves as a limit for the lower threshold value. Any calculated lower threshold
+            values that end up below this limit will be replaced by this limit value.
+            The limit is often a theoretical constraint enforced by a specific drift detection method or performance
+            metric.
+        upper_threshold_value_limit: Optional[float], default=None
+            An optional value that serves as a limit for the upper threshold value. Any calculated upper threshold
+            values that end up above this limit will be replaced by this limit value.
+            The limit is often a theoretical constraint enforced by a specific drift detection method or performance
+            metric.
+        override_using_none: bool, default=False
+            When set to True use None to override threshold values that exceed value limits.
+            This will prevent them from being rendered on plots.
+        logger: Optional[logging.Logger], default=None
+            An optional Logger instance. When provided a warning will be logged when a calculated threshold value
+            gets overridden by a threshold value limit.
+        metric_name: Optional[str], default=None
+            When provided the metric name will be included within any log messages for additional clarity.
+    """
+
+    lower_threshold_value, upper_threshold_value = threshold.thresholds(data)
+
+    if (
+        lower_threshold_value_limit is not None
+        and lower_threshold_value is not None
+        and lower_threshold_value <= lower_threshold_value_limit
+    ):
+        override_value = None if override_using_none else lower_threshold_value_limit
+        if logger:
+            logger.warning(
+                f"{metric_name + ' ' if metric_name else ''}lower threshold value {lower_threshold_value} "
+                f"overridden by lower threshold value limit {override_value}"
+            )
+        lower_threshold_value = override_value
+
+    if (
+        upper_threshold_value_limit is not None
+        and upper_threshold_value is not None
+        and upper_threshold_value >= upper_threshold_value_limit
+    ):
+        override_value = None if override_using_none else upper_threshold_value_limit
+        if logger:
+            logger.warning(
+                f"{metric_name + ' ' if metric_name else ''}upper threshold value {upper_threshold_value} "
+                f"overridden by upper threshold value limit {override_value}"
+            )
+        upper_threshold_value = override_value
+
+    return lower_threshold_value, upper_threshold_value
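A hedged usage sketch of the threshold helpers defined above. The import path mirrors the private module added in this release rather than a public API, the AUROC values are made up, and `calculate_threshold_values` clamps the computed band to the metric's valid range, just as `_populate_alert_thresholds` does with limits of 0.0 and 1.0:

```python
import numpy as np

from dataeval.detectors.drift._nml._thresholds import (
    ConstantThreshold,
    StandardDeviationThreshold,
    calculate_threshold_values,
)

aurocs = np.array([0.49, 0.52, 0.50, 0.55, 0.97])  # per-chunk AUROC values (made up)

print(ConstantThreshold(lower=0.45, upper=0.65).thresholds(aurocs))  # (0.45, 0.65)
print(StandardDeviationThreshold().thresholds(aurocs))               # mean - 3*std, mean + 3*std

print(
    calculate_threshold_values(
        threshold=StandardDeviationThreshold(),
        data=aurocs,
        lower_threshold_value_limit=0.0,
        upper_threshold_value_limit=1.0,
    )
)  # upper bound clamped to 1.0
```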
dataeval/detectors/linters/duplicates.py
CHANGED
@@ -4,13 +4,13 @@ __all__ = []
 
 from typing import Any, Sequence, overload
 
+from dataeval.data._images import Images
 from dataeval.metrics.stats import hashstats
 from dataeval.metrics.stats._base import combine_stats, get_dataset_step_from_idx
 from dataeval.outputs import DuplicatesOutput, HashStatsOutput
 from dataeval.outputs._base import set_metadata
 from dataeval.outputs._linters import DatasetDuplicateGroupMap, DuplicateGroup
 from dataeval.typing import ArrayLike, Dataset
-from dataeval.utils.data._images import Images
 
 
 class Duplicates:
dataeval/detectors/linters/outliers.py
CHANGED
@@ -7,6 +7,7 @@ from typing import Any, Literal, Sequence, overload
 import numpy as np
 from numpy.typing import NDArray
 
+from dataeval.data._images import Images
 from dataeval.metrics.stats._base import combine_stats, get_dataset_step_from_idx
 from dataeval.metrics.stats._imagestats import imagestats
 from dataeval.outputs import DimensionStatsOutput, ImageStatsOutput, OutliersOutput, PixelStatsOutput, VisualStatsOutput
@@ -14,7 +15,6 @@ from dataeval.outputs._base import set_metadata
 from dataeval.outputs._linters import IndexIssueMap, OutlierStatsOutput
 from dataeval.outputs._stats import BOX_COUNT, SOURCE_INDEX
 from dataeval.typing import ArrayLike, Dataset
-from dataeval.utils.data._images import Images
 
 
 def _get_outlier_mask(
dataeval/metadata/_distance.py
CHANGED
@@ -9,11 +9,11 @@ import numpy as np
 from scipy.stats import iqr, ks_2samp
 from scipy.stats import wasserstein_distance as emd
 
+from dataeval.data import Metadata
 from dataeval.metadata._utils import _compare_keys, _validate_factors_and_data
 from dataeval.outputs import MetadataDistanceOutput, MetadataDistanceValues
 from dataeval.outputs._base import set_metadata
 from dataeval.typing import ArrayLike
-from dataeval.utils.data import Metadata
 
 
 class KSType(NamedTuple):
dataeval/metadata/_ood.py
CHANGED
@@ -9,10 +9,10 @@ from numpy.typing import NDArray
 from sklearn.feature_selection import mutual_info_classif
 
 from dataeval.config import get_seed
+from dataeval.data import Metadata
 from dataeval.metadata._utils import _compare_keys, _validate_factors_and_data
 from dataeval.outputs import MostDeviatedFactorsOutput, OODOutput, OODPredictorOutput
 from dataeval.outputs._base import set_metadata
-from dataeval.utils.data import Metadata
 
 
 def _combine_discrete_continuous(metadata: Metadata) -> tuple[list[str], NDArray[np.float64]]:
@@ -201,7 +201,7 @@ def find_most_deviated_factors(
     MostDeviatedFactorsOutput([])
     """
 
-    ood_mask: NDArray[np.
+    ood_mask: NDArray[np.bool_] = ood.is_ood
 
     # No metadata correlated with out of distribution data
     if not any(ood_mask):
@@ -303,7 +303,7 @@ def find_ood_predictors(
     OODPredictorOutput({})
     """
 
-    ood_mask: NDArray[np.
+    ood_mask: NDArray[np.bool_] = ood.is_ood
 
     discrete_features_count = len(metadata.discrete_factor_names)
     factors, data = _combine_discrete_continuous(metadata)  # (F, ), (S, F) => F = Fd + Fc
@@ -320,7 +320,7 @@ def find_ood_predictors(
     # Calculate mean, std of each factor over all samples
     scaled_data = (data - np.mean(data, axis=0)) / np.std(data, axis=0, ddof=1)  # (S, F)
 
-    discrete_features = np.zeros_like(factors, dtype=np.
+    discrete_features = np.zeros_like(factors, dtype=np.bool_)
     discrete_features[:discrete_features_count] = True
 
     mutual_info_values = (
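The `find_ood_predictors` hunks also show how the predictor scoring is wired: factors are stacked with the discrete ones first, and a boolean mask of dtype `np.bool_` tells `mutual_info_classif` which columns to treat as discrete, with the OOD mask as the target. A small scikit-learn illustration of that masking pattern; the factor names and data are synthetic:

```python
import numpy as np
from sklearn.feature_selection import mutual_info_classif

rng = np.random.default_rng(0)

discrete_features_count = 2
factors = ["sensor_id", "time_of_day_bin", "altitude", "brightness"]  # discrete factors listed first
data = np.column_stack(
    [
        rng.integers(0, 4, 500),        # sensor_id (discrete)
        rng.integers(0, 8, 500),        # time_of_day_bin (discrete)
        rng.normal(1000.0, 50.0, 500),  # altitude (continuous)
        rng.normal(0.5, 0.1, 500),      # brightness (continuous)
    ]
)
ood_mask = rng.random(500) < 0.1  # stand-in for OODOutput.is_ood

# Mark the first `discrete_features_count` columns as discrete, as in find_ood_predictors
discrete_features = np.zeros(len(factors), dtype=np.bool_)
discrete_features[:discrete_features_count] = True

scores = mutual_info_classif(data, ood_mask, discrete_features=discrete_features, random_state=0)
print(dict(zip(factors, scores.round(3))))
```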
dataeval/metrics/bias/_balance.py
CHANGED
@@ -9,10 +9,10 @@ import scipy as sp
 from sklearn.feature_selection import mutual_info_classif, mutual_info_regression
 
 from dataeval.config import EPSILON, get_seed
+from dataeval.data import Metadata
 from dataeval.outputs import BalanceOutput
 from dataeval.outputs._base import set_metadata
 from dataeval.utils._bin import get_counts
-from dataeval.utils.data import Metadata
 
 
 def _validate_num_neighbors(num_neighbors: int) -> int:
dataeval/metrics/bias/_diversity.py
CHANGED
@@ -8,11 +8,11 @@ import numpy as np
 import scipy as sp
 from numpy.typing import NDArray
 
+from dataeval.data import Metadata
 from dataeval.outputs import DiversityOutput
 from dataeval.outputs._base import set_metadata
 from dataeval.utils._bin import get_counts
 from dataeval.utils._method import get_method
-from dataeval.utils.data import Metadata
 
 
 def diversity_shannon(
|