dataeval 0.85.0__py3-none-any.whl → 0.86.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dataeval/__init__.py +1 -1
- dataeval/data/_metadata.py +17 -5
- dataeval/data/_selection.py +1 -1
- dataeval/data/selections/_classfilter.py +4 -3
- dataeval/detectors/drift/__init__.py +4 -1
- dataeval/detectors/drift/_mvdc.py +92 -0
- dataeval/detectors/drift/_nml/__init__.py +6 -0
- dataeval/detectors/drift/_nml/_base.py +68 -0
- dataeval/detectors/drift/_nml/_chunk.py +404 -0
- dataeval/detectors/drift/_nml/_domainclassifier.py +192 -0
- dataeval/detectors/drift/_nml/_result.py +98 -0
- dataeval/detectors/drift/_nml/_thresholds.py +280 -0
- dataeval/outputs/__init__.py +2 -1
- dataeval/outputs/_bias.py +1 -3
- dataeval/outputs/_drift.py +68 -0
- dataeval/outputs/_linters.py +1 -6
- dataeval/outputs/_stats.py +1 -6
- {dataeval-0.85.0.dist-info → dataeval-0.86.0.dist-info}/METADATA +3 -2
- {dataeval-0.85.0.dist-info → dataeval-0.86.0.dist-info}/RECORD +21 -14
- {dataeval-0.85.0.dist-info → dataeval-0.86.0.dist-info}/LICENSE.txt +0 -0
- {dataeval-0.85.0.dist-info → dataeval-0.86.0.dist-info}/WHEEL +0 -0
@@ -0,0 +1,192 @@
|
|
1
|
+
"""
|
2
|
+
Source code derived from NannyML 0.13.0
|
3
|
+
https://github.com/NannyML/nannyml/blob/main/nannyml/drift/multivariate/domain_classifier/calculator.py
|
4
|
+
|
5
|
+
Licensed under Apache Software License (Apache 2.0)
|
6
|
+
"""
|
7
|
+
|
8
|
+
from __future__ import annotations
|
9
|
+
|
10
|
+
import logging
|
11
|
+
from typing import Any
|
12
|
+
|
13
|
+
import numpy as np
|
14
|
+
import pandas as pd
|
15
|
+
from lightgbm import LGBMClassifier
|
16
|
+
from numpy.typing import NDArray
|
17
|
+
from sklearn.metrics import roc_auc_score
|
18
|
+
from sklearn.model_selection import StratifiedKFold
|
19
|
+
|
20
|
+
from dataeval.config import get_max_processes, get_seed
|
21
|
+
from dataeval.detectors.drift._nml._base import AbstractCalculator, _create_multilevel_index
|
22
|
+
from dataeval.detectors.drift._nml._chunk import Chunk, Chunker
|
23
|
+
from dataeval.detectors.drift._nml._thresholds import ConstantThreshold, Threshold, calculate_threshold_values
|
24
|
+
from dataeval.outputs._base import set_metadata
|
25
|
+
from dataeval.outputs._drift import DriftMVDCOutput
|
26
|
+
|
27
|
+
logger = logging.getLogger(__name__)
|
28
|
+
|
29
|
+
# Keyword arguments handed to ``LGBMClassifier`` when the caller does not supply
# their own ``hyperparameters`` to ``DomainClassifierCalculator``.
#
# NOTE: ``get_max_processes()`` and ``get_seed()`` are called once, while this
# module is being imported, so "n_jobs" and "random_state" hold whatever those
# functions returned at import time.
# NOTE(review): values configured after import are presumably not picked up - confirm this is intended.
DEFAULT_LGBM_HYPERPARAMS = {
    "boosting_type": "gbdt",
    "class_weight": None,
    "colsample_bytree": 1.0,
    "deterministic": True,
    "importance_type": "split",
    "learning_rate": 0.1,
    "max_depth": -1,
    "min_child_samples": 20,
    "min_child_weight": 0.001,
    "min_split_gain": 0.0,
    "n_estimators": 100,
    "n_jobs": get_max_processes() or 0,  # falls back to 0 when no process limit is returned
    "num_leaves": 31,
    "objective": None,
    "random_state": get_seed(),
    "reg_alpha": 0.0,
    "reg_lambda": 0.0,
    "subsample": 1.0,
    "subsample_for_bin": 200000,
    "subsample_freq": 0,
    "verbosity": -1,
}
|
52
|
+
|
53
|
+
|
54
|
+
class DomainClassifierCalculator(AbstractCalculator):
    """
    Multivariate drift calculator based on a domain classifier.

    For every chunk a LightGBM classifier is cross-validated on the task of telling
    chunk rows apart from reference rows. The resulting AUROC is the drift measure,
    and it is compared against the configured threshold to raise alerts.
    """

    def __init__(
        self,
        chunker: Chunker | None = None,
        cv_folds_num: int = 5,
        hyperparameters: dict[str, Any] | None = None,
        threshold: Threshold = ConstantThreshold(lower=0.45, upper=0.65),
    ) -> None:
        """
        Create a new DomainClassifierCalculator instance.

        Parameters
        ----------
        chunker : Chunker or None, default=None
            The `Chunker` used to split the data sets into a list of chunks.
        cv_folds_num : int, default=5
            Number of stratified cross-validation folds used when computing the AUROC of a chunk.
        hyperparameters : dict[str, Any] or None, default=None
            Keyword arguments for the discrimination model. When None, a copy of
            ``DEFAULT_LGBM_HYPERPARAMS`` is used. Check out the available options in the
            `LightGBM docs <https://lightgbm.readthedocs.io/en/latest/pythonapi/lightgbm.LGBMClassifier.html>`_.
        threshold : Threshold, default=ConstantThreshold(lower=0.45, upper=0.65)
            The threshold the AUROC values are evaluated against.
        """
        super().__init__(chunker, logger)

        self.cv_folds_num = cv_folds_num
        # Copy the module-level defaults so that tweaking one calculator's hyperparameters
        # can never leak into other instances through the shared dictionary.
        self.hyperparameters = dict(DEFAULT_LGBM_HYPERPARAMS) if hyperparameters is None else hyperparameters
        self.threshold = threshold
        self.result: DriftMVDCOutput | None = None

    def _fit(self, reference_data: pd.DataFrame) -> DriftMVDCOutput:
        """Fit the calculator on the reference data and label the resulting rows as 'reference'."""
        self._x_ref = reference_data
        result = self._calculate(data=self._x_ref)
        # `_calculate` always writes "analysis"; relabel the rows produced while fitting.
        result._data[("chunk", "period")] = "reference"

        return result

    @set_metadata
    def _calculate(self, data: pd.DataFrame) -> DriftMVDCOutput:
        """Compute the domain classifier AUROC for every chunk of `data` and store it in `self.result`."""
        chunks = self.chunker.split(data)

        res = pd.DataFrame.from_records(
            [
                {
                    **chunk.dict(),
                    "period": "analysis",
                    "classifier_auroc_value": self._calculate_chunk(chunk=chunk),
                }
                for chunk in chunks
            ]
        )

        multilevel_index = _create_multilevel_index(chunks, "domain_classifier_auroc", ["value"])
        res.columns = multilevel_index
        res = res.reset_index(drop=True)

        res = self._populate_alert_thresholds(res)

        if self.result is None:
            # First call (made while fitting): these rows become the stored result.
            self.result = DriftMVDCOutput(results_data=res)
        else:
            # Later calls: keep only the reference rows and append the fresh rows after them.
            self.result = self.result.filter(period="reference")
            self.result._data = pd.concat([self.result._data, res], ignore_index=True)
        return self.result

    def _calculate_chunk(self, chunk: Chunk):
        """Return the cross-validated AUROC of a classifier separating `chunk` rows from reference rows."""
        if self.result is None:
            # Use information from chunk indices to identify reference chunk's location. This is possible because
            # both the internal reference data copy and the chunk data were sorted by timestamp, so these
            # indices align. This way we eliminate the need to combine these two data frames and drop duplicate rows,
            # which is a costly operation.
            df_X = self._x_ref
            y = np.zeros(len(df_X), dtype=np.intp)
            y[chunk.start_index : chunk.end_index + 1] = 1
        else:
            chunk_X = chunk.data
            reference_X = self._x_ref
            chunk_y = np.ones(len(chunk_X), dtype=np.intp)
            reference_y = np.zeros(len(reference_X), dtype=np.intp)
            df_X = pd.concat([reference_X, chunk_X], ignore_index=True)
            y = np.concatenate([reference_y, chunk_y])

        skf = StratifiedKFold(n_splits=self.cv_folds_num)
        all_preds: list[NDArray[np.float32]] = []
        all_tgts: list[NDArray[np.intp]] = []
        for train_index, test_index in skf.split(df_X, y):
            _trx = df_X.iloc[train_index]
            _try = y[train_index]
            _tsx = df_X.iloc[test_index]
            _tsy = y[test_index]
            model = LGBMClassifier(**self.hyperparameters)
            model.fit(_trx, _try)
            # Column 1 holds the predicted probability of the positive (chunk) class.
            preds = np.asarray(model.predict_proba(_tsx), dtype=np.float32)[:, 1]
            all_preds.append(preds)
            all_tgts.append(_tsy)

        np_all_preds = np.concatenate(all_preds, axis=0)
        np_all_tgts = np.concatenate(all_tgts, axis=0)
        try:
            # catch case where all rows are duplicates
            result = roc_auc_score(np_all_tgts, np_all_preds)
        except ValueError as err:
            if str(err) != "Only one class present in y_true. ROC AUC score is not defined in that case.":
                raise
            # by definition if reference and chunk exactly match we can't discriminate
            result = 0.5
        return result

    def _populate_alert_thresholds(self, result_data: pd.DataFrame) -> pd.DataFrame:
        """Add threshold and alert columns to `result_data` and return it.

        Threshold values are computed once, on the first call (when no result exists yet),
        and reused afterwards. A threshold of None means that side is unbounded, so it can
        never trigger an alert.
        """
        metric = "domain_classifier_auroc"

        if self.result is None:
            self._threshold_values = calculate_threshold_values(
                threshold=self.threshold,
                data=result_data.loc[:, (metric, "value")],  # type: ignore | dataframe loc
                lower_threshold_value_limit=0.0,
                upper_threshold_value_limit=1.0,
                logger=self._logger,
            )

        lower, upper = self._threshold_values
        result_data[(metric, "upper_threshold")] = upper
        result_data[(metric, "lower_threshold")] = lower

        # Vectorized comparison; substituting +/-inf for a missing bound avoids comparing
        # floats with None, which would raise a TypeError.
        values = result_data[(metric, "value")]
        above_upper = values > (np.inf if upper is None else upper)
        below_lower = values < (-np.inf if lower is None else lower)
        result_data[(metric, "alert")] = above_upper | below_lower
        return result_data
|
@@ -0,0 +1,98 @@
|
|
1
|
+
"""
|
2
|
+
Base classes for holding calculation results as pandas DataFrames, with filtering and export helpers.
|
3
|
+
|
4
|
+
Source code derived from NannyML 0.13.0
|
5
|
+
https://github.com/NannyML/nannyml/blob/main/nannyml/base.py
|
6
|
+
|
7
|
+
Licensed under Apache Software License (Apache 2.0)
|
8
|
+
"""
|
9
|
+
|
10
|
+
from __future__ import annotations
|
11
|
+
|
12
|
+
import copy
|
13
|
+
from abc import ABC, abstractmethod
|
14
|
+
from typing import NamedTuple, Sequence
|
15
|
+
|
16
|
+
import pandas as pd
|
17
|
+
from typing_extensions import Self
|
18
|
+
|
19
|
+
from dataeval.outputs._base import GenericOutput
|
20
|
+
|
21
|
+
|
22
|
+
class Metric(NamedTuple):
    """Pairs a metric's display name with the name of its column in the results data."""

    # Label for the metric; presumably meant for presentation - confirm against callers.
    display_name: str
    # Column label that `PerMetricResult._filter` compares against the requested metric names.
    column_name: str
|
25
|
+
|
26
|
+
|
27
|
+
class AbstractResult(GenericOutput[pd.DataFrame]):
    """Base container that wraps a results DataFrame and offers export and filtering."""

    def __init__(self, results_data: pd.DataFrame) -> None:
        # Keep a private deep copy so the stored frame is independent of the caller's object.
        self._data = results_data.copy(deep=True)

    def data(self) -> pd.DataFrame:
        """Return the results as a DataFrame (same as `to_df()` with default arguments)."""
        return self.to_df()

    @property
    def empty(self) -> bool:
        """True when there is no results frame or it contains no rows/columns."""
        return self._data is None or self._data.empty

    def __len__(self) -> int:
        if self.empty:
            return 0
        return len(self._data)

    def to_df(self, multilevel: bool = True) -> pd.DataFrame:
        """Export results to pandas dataframe.

        With `multilevel=True` the stored frame itself is returned. Otherwise a deep copy
        is returned whose column tuples are joined with "_" into single-level names.
        """
        if multilevel:
            return self._data

        flattened = self._data.copy(deep=True)
        # Joining the levels of the chunk columns repeats "chunk"; collapse the repetition.
        flattened.columns = [
            "_".join(levels).replace("chunk_chunk_chunk", "chunk").replace("chunk_chunk", "chunk")
            for levels in self._data.columns.values
        ]
        return flattened

    def filter(self, period: str = "all", metrics: str | Sequence[str] | None = None) -> Self:
        """Returns filtered result metric data.

        A single metric name is wrapped in a list before being handed to `_filter`.

        Raises
        ------
        ValueError
            If `metrics` is truthy but neither a string nor a sequence.
        """
        if metrics and not isinstance(metrics, (str, Sequence)):
            raise ValueError("metrics value provided is not a valid metric or sequence of metrics")

        requested = [metrics] if isinstance(metrics, str) else metrics
        return self._filter(period, requested)

    @abstractmethod
    def _filter(self, period: str, metrics: Sequence[str] | None = None) -> Self: ...
|
64
|
+
|
65
|
+
|
66
|
+
class Abstract1DResult(AbstractResult, ABC):
    """Result whose rows can be filtered by the value of the ("chunk", "period") column."""

    def __init__(self, results_data: pd.DataFrame) -> None:
        super().__init__(results_data)

    def _filter(self, period: str, metrics=None) -> Self:
        """Return a deep copy of this result restricted to rows of `period` ("all" keeps every row)."""
        selected = self._data
        if period != "all":
            in_period = self._data.loc[:, ("chunk", "period")] == period
            selected = self._data.loc[in_period, :]  # type: ignore | dataframe loc
        # The index is renumbered from zero in every case.
        selected = selected.reset_index(drop=True)

        clone = copy.deepcopy(self)
        clone._data = selected
        return clone
|
79
|
+
|
80
|
+
|
81
|
+
class PerMetricResult(Abstract1DResult):
    """Result that tracks which metrics its data contains and can be filtered per metric."""

    def __init__(self, results_data: pd.DataFrame, metrics: Sequence[Metric] = ()) -> None:
        """
        Parameters
        ----------
        results_data : pd.DataFrame
            The results frame; a deep copy of it is stored by the base class.
        metrics : Sequence[Metric], default=()
            The metrics present in `results_data`.
        """
        super().__init__(results_data)
        # An immutable default plus a per-instance list avoids sharing one mutable
        # default list between all instances created without `metrics`.
        self.metrics = list(metrics)

    def _filter(self, period: str, metrics: Sequence[str] | None = None) -> Self:
        """Return a copy restricted to `period` that keeps the chunk columns and the requested metrics.

        When `metrics` is None, every metric registered on this result is kept.
        """
        if metrics is None:
            metrics = [metric.column_name for metric in self.metrics]

        res = super()._filter(period)

        data = pd.concat([res._data.loc[:, (["chunk"])], res._data.loc[:, (metrics,)]], axis=1)  # type: ignore | dataframe loc
        data = data.reset_index(drop=True)

        res._data = data
        res.metrics = [metric for metric in self.metrics if metric.column_name in metrics]

        return res
|
@@ -0,0 +1,280 @@
|
|
1
|
+
"""
|
2
|
+
Source code derived from NannyML 0.13.0
|
3
|
+
https://github.com/NannyML/nannyml/blob/main/nannyml/thresholds.py
|
4
|
+
|
5
|
+
Licensed under Apache Software License (Apache 2.0)
|
6
|
+
"""
|
7
|
+
|
8
|
+
from __future__ import annotations
|
9
|
+
|
10
|
+
import logging
|
11
|
+
from abc import ABC, abstractmethod
|
12
|
+
from typing import Any, Callable, ClassVar
|
13
|
+
|
14
|
+
import numpy as np
|
15
|
+
|
16
|
+
|
17
|
+
class Threshold(ABC):
|
18
|
+
"""A base class used to calculate lower and upper threshold values given one or multiple arrays.
|
19
|
+
|
20
|
+
Any subclass should implement the abstract `thresholds` method.
|
21
|
+
It takes an array or list of arrays and converts them into lower and upper threshold values, represented
|
22
|
+
as a tuple of optional floats.
|
23
|
+
|
24
|
+
A `None` threshold value is interpreted as if there is no upper or lower threshold.
|
25
|
+
One or both values might be `None`.
|
26
|
+
"""
|
27
|
+
|
28
|
+
_registry: ClassVar[dict[str, type[Threshold]]] = {}
|
29
|
+
"""Class registry lookup to get threshold subclass from threshold_type string"""
|
30
|
+
|
31
|
+
def __str__(self) -> str:
|
32
|
+
return self.__str__()
|
33
|
+
|
34
|
+
def __repr__(self) -> str:
|
35
|
+
return self.__class__.__name__ + str(vars(self))
|
36
|
+
|
37
|
+
def __eq__(self, other: object) -> bool:
|
38
|
+
return isinstance(other, self.__class__) and other.__dict__ == self.__dict__
|
39
|
+
|
40
|
+
def __init_subclass__(cls, threshold_type: str) -> None:
|
41
|
+
Threshold._registry[threshold_type] = cls
|
42
|
+
|
43
|
+
@abstractmethod
|
44
|
+
def thresholds(self, data: np.ndarray) -> tuple[float | None, float | None]:
|
45
|
+
"""Returns lower and upper threshold values when given one or more np.ndarray instances.
|
46
|
+
|
47
|
+
Parameters:
|
48
|
+
data: np.ndarray
|
49
|
+
An array of values used to calculate the thresholds on. This will most often represent a metric
|
50
|
+
calculated on one or more sets of data, e.g. a list of F1 scores of multiple data chunks.
|
51
|
+
kwargs: dict[str, Any]
|
52
|
+
Optional keyword arguments passed to the implementing subclass.
|
53
|
+
|
54
|
+
Returns:
|
55
|
+
lower, upper: tuple[Optional[float], Optional[float]]
|
56
|
+
The lower and upper threshold values. One or both might be `None`.
|
57
|
+
"""
|
58
|
+
|
59
|
+
@classmethod
|
60
|
+
def parse_object(cls, obj: dict[str, Any]) -> Threshold:
|
61
|
+
"""Parse object as :class:`Threshold`"""
|
62
|
+
class_name = obj.pop("type", "")
|
63
|
+
|
64
|
+
try:
|
65
|
+
threshold_cls = cls._registry[class_name]
|
66
|
+
except KeyError:
|
67
|
+
accepted_values = ", ".join(map(repr, cls._registry))
|
68
|
+
raise ValueError(f"Expected one of {accepted_values} for threshold type, but received '{class_name}'")
|
69
|
+
|
70
|
+
return threshold_cls(**obj)
|
71
|
+
|
72
|
+
|
73
|
+
class ConstantThreshold(Threshold, threshold_type="constant"):
    """A `Threshold` implementation that returns a constant lower and or upper threshold value.

    Attributes:
        lower: Optional[float]
            The constant lower threshold value. Defaults to `None`, meaning there is no lower threshold.
        upper: Optional[float]
            The constant upper threshold value. Defaults to `None`, meaning there is no upper threshold.

    Raises:
        ValueError: raised when an argument was given using an incorrect type
        ValueError: raised when the lower value is not strictly less than the upper value

    Examples:
        >>> data = np.array(range(10))
        >>> t = ConstantThreshold(lower=None, upper=0.1)
        >>> lower, upper = t.thresholds(data)
        >>> print(lower, upper)
        None 0.1
    """

    def __init__(self, lower: float | int | None = None, upper: float | int | None = None):
        """Creates a new ConstantThreshold instance.

        Args:
            lower: Optional[Union[float, int]], default=None
                The constant lower threshold value. Defaults to `None`, meaning there is no lower threshold.
            upper: Optional[Union[float, int]], default=None
                The constant upper threshold value. Defaults to `None`, meaning there is no upper threshold.

        Raises:
            ValueError: raised when an argument was given using an incorrect type
            ValueError: raised when the lower value is not strictly less than the upper value
        """
        self._validate_inputs(lower, upper)

        self.lower = lower
        self.upper = upper

    def thresholds(self, data: np.ndarray) -> tuple[float | None, float | None]:
        """Return the configured `(lower, upper)` pair; `data` is not used."""
        return self.lower, self.upper

    @staticmethod
    def _validate_inputs(lower: float | int | None = None, upper: float | int | None = None):
        """Check that both bounds are non-bool numbers or None and that lower < upper when both are set."""
        # bool is a subclass of int, so it has to be rejected explicitly.
        lower_is_invalid = isinstance(lower, bool) or (lower is not None and not isinstance(lower, (float, int)))
        if lower_is_invalid:
            raise ValueError(f"expected type of 'lower' to be 'float', 'int' or None but got '{type(lower).__name__}'")

        upper_is_invalid = isinstance(upper, bool) or (upper is not None and not isinstance(upper, (float, int)))
        if upper_is_invalid:
            raise ValueError(f"expected type of 'upper' to be 'float', 'int' or None but got '{type(upper).__name__}'")

        # explicit None check is required due to special interpretation of the value 0.0 as False
        both_given = lower is not None and upper is not None
        if both_given and lower >= upper:
            raise ValueError(f"lower threshold {lower} must be less than upper threshold {upper}")
|
126
|
+
|
127
|
+
|
128
|
+
class StandardDeviationThreshold(Threshold, threshold_type="standard_deviation"):
    """A `Threshold` that offsets an aggregate of an array by a multiple of the array's standard deviation.

    The aggregate (the NaN-ignoring mean by default) is shifted down and up by an offset
    to get the lower and upper threshold values.
    Each offset is a multiplier, by default 3, times the NaN-ignoring standard deviation of the given array.

    Attributes:
        std_lower_multiplier: float
        std_upper_multiplier: float

    Examples:
        >>> data = np.array(range(10))
        >>> t = StandardDeviationThreshold()
        >>> lower, upper = t.thresholds(data)
        >>> print(lower, upper)
        -4.116843969807043 13.116843969807043
    """

    def __init__(
        self,
        std_lower_multiplier: float | int | None = 3,
        std_upper_multiplier: float | int | None = 3,
        offset_from: Callable[[np.ndarray], Any] = np.nanmean,
    ):
        """Creates a new StandardDeviationThreshold instance.

        Args:
            std_lower_multiplier: float, default=3
                The number the standard deviation of the input array will be multiplied with to form the lower offset.
                This value will be subtracted from the aggregate of the input array.
                `None` disables the lower threshold.
            std_upper_multiplier: float, default=3
                The number the standard deviation of the input array will be multiplied with to form the upper offset.
                This value will be added to the aggregate of the input array.
                `None` disables the upper threshold.
            offset_from: Callable[[np.ndarray], Any], default=np.nanmean
                A function that will be applied to the input array to aggregate it into a single value.
                Adding the upper offset to this value will yield the upper threshold, subtracting the lower offset
                will yield the lower threshold.

        Raises:
            ValueError: raised when a multiplier has an incorrect type or is negative
        """
        self._validate_inputs(std_lower_multiplier, std_upper_multiplier)

        self.std_lower_multiplier = std_lower_multiplier
        self.std_upper_multiplier = std_upper_multiplier
        self.offset_from = offset_from

    def thresholds(self, data: np.ndarray) -> tuple[float | None, float | None]:
        """Return `(aggregate - k_lower * std, aggregate + k_upper * std)`; a side is None when its multiplier is None."""
        center = self.offset_from(data)
        spread = np.nanstd(data)

        lower_threshold = None
        if self.std_lower_multiplier is not None:
            lower_threshold = center - spread * self.std_lower_multiplier

        upper_threshold = None
        if self.std_upper_multiplier is not None:
            upper_threshold = center + spread * self.std_upper_multiplier

        return lower_threshold, upper_threshold

    @staticmethod
    def _validate_inputs(std_lower_multiplier: float | int | None = 3, std_upper_multiplier: float | int | None = 3):
        """Check that each multiplier is None or a non-bool, non-negative number."""
        # bool is a subclass of int, so it has to be rejected explicitly.
        lower_is_invalid = isinstance(std_lower_multiplier, bool) or (
            std_lower_multiplier is not None and not isinstance(std_lower_multiplier, (float, int))
        )
        if lower_is_invalid:
            raise ValueError(
                f"expected type of 'std_lower_multiplier' to be 'float', 'int' or None "
                f"but got '{type(std_lower_multiplier).__name__}'"
            )

        if std_lower_multiplier is not None and std_lower_multiplier < 0:
            raise ValueError(f"'std_lower_multiplier' should be greater than 0 but got value {std_lower_multiplier}")

        upper_is_invalid = isinstance(std_upper_multiplier, bool) or (
            std_upper_multiplier is not None and not isinstance(std_upper_multiplier, (float, int))
        )
        if upper_is_invalid:
            raise ValueError(
                f"expected type of 'std_upper_multiplier' to be 'float', 'int' or None "
                f"but got '{type(std_upper_multiplier).__name__}'"
            )

        if std_upper_multiplier is not None and std_upper_multiplier < 0:
            raise ValueError(f"'std_upper_multiplier' should be greater than 0 but got value {std_upper_multiplier}")
|
213
|
+
|
214
|
+
|
215
|
+
def calculate_threshold_values(
|
216
|
+
threshold: Threshold,
|
217
|
+
data: np.ndarray,
|
218
|
+
lower_threshold_value_limit: float | None = None,
|
219
|
+
upper_threshold_value_limit: float | None = None,
|
220
|
+
override_using_none: bool = False,
|
221
|
+
logger: logging.Logger | None = None,
|
222
|
+
metric_name: str | None = None,
|
223
|
+
) -> tuple[float | None, float | None]:
|
224
|
+
"""Calculate lower and upper threshold values with respect to the provided Threshold and value limits.
|
225
|
+
|
226
|
+
Parameters:
|
227
|
+
threshold: Threshold
|
228
|
+
The Threshold instance that determines how the lower and upper threshold values will be calculated.
|
229
|
+
data: np.ndarray
|
230
|
+
The data used by the Threshold instance to calculate the lower and upper threshold values.
|
231
|
+
This will often be the values of a drift detection method or performance metric on chunks of reference data.
|
232
|
+
lower_threshold_value_limit: Optional[float], default=None
|
233
|
+
An optional value that serves as a limit for the lower threshold value. Any calculated lower threshold
|
234
|
+
values that end up below this limit will be replaced by this limit value.
|
235
|
+
The limit is often a theoretical constraint enforced by a specific drift detection method or performance
|
236
|
+
metric.
|
237
|
+
upper_threshold_value_limit: Optional[float], default=None
|
238
|
+
An optional value that serves as a limit for the lower threshold value. Any calculated lower threshold
|
239
|
+
values that end up below this limit will be replaced by this limit value.
|
240
|
+
The limit is often a theoretical constraint enforced by a specific drift detection method or performance
|
241
|
+
metric.
|
242
|
+
override_using_none: bool, default=False
|
243
|
+
When set to True use None to override threshold values that exceed value limits.
|
244
|
+
This will prevent them from being rendered on plots.
|
245
|
+
logger: Optional[logging.Logger], default=None
|
246
|
+
An optional Logger instance. When provided a warning will be logged when a calculated threshold value
|
247
|
+
gets overridden by a threshold value limit.
|
248
|
+
metric_name: Optional[str], default=None
|
249
|
+
When provided the metric name will be included within any log messages for additional clarity.
|
250
|
+
"""
|
251
|
+
|
252
|
+
lower_threshold_value, upper_threshold_value = threshold.thresholds(data)
|
253
|
+
|
254
|
+
if (
|
255
|
+
lower_threshold_value_limit is not None
|
256
|
+
and lower_threshold_value is not None
|
257
|
+
and lower_threshold_value <= lower_threshold_value_limit
|
258
|
+
):
|
259
|
+
override_value = None if override_using_none else lower_threshold_value_limit
|
260
|
+
if logger:
|
261
|
+
logger.warning(
|
262
|
+
f"{metric_name + ' ' if metric_name else ''}lower threshold value {lower_threshold_value} "
|
263
|
+
f"overridden by lower threshold value limit {override_value}"
|
264
|
+
)
|
265
|
+
lower_threshold_value = override_value
|
266
|
+
|
267
|
+
if (
|
268
|
+
upper_threshold_value_limit is not None
|
269
|
+
and upper_threshold_value is not None
|
270
|
+
and upper_threshold_value >= upper_threshold_value_limit
|
271
|
+
):
|
272
|
+
override_value = None if override_using_none else upper_threshold_value_limit
|
273
|
+
if logger:
|
274
|
+
logger.warning(
|
275
|
+
f"{metric_name + ' ' if metric_name else ''}upper threshold value {upper_threshold_value} "
|
276
|
+
f"overridden by upper threshold value limit {override_value}"
|
277
|
+
)
|
278
|
+
upper_threshold_value = override_value
|
279
|
+
|
280
|
+
return lower_threshold_value, upper_threshold_value
|
dataeval/outputs/__init__.py
CHANGED
@@ -5,7 +5,7 @@ as well as runtime metadata for reproducibility and logging.
|
|
5
5
|
|
6
6
|
from ._base import ExecutionMetadata
|
7
7
|
from ._bias import BalanceOutput, CompletenessOutput, CoverageOutput, DiversityOutput, LabelParityOutput, ParityOutput
|
8
|
-
from ._drift import DriftMMDOutput, DriftOutput
|
8
|
+
from ._drift import DriftMMDOutput, DriftMVDCOutput, DriftOutput
|
9
9
|
from ._estimators import BEROutput, ClustererOutput, DivergenceOutput, UAPOutput
|
10
10
|
from ._linters import DuplicatesOutput, OutliersOutput
|
11
11
|
from ._metadata import MetadataDistanceOutput, MetadataDistanceValues, MostDeviatedFactorsOutput, OODPredictorOutput
|
@@ -34,6 +34,7 @@ __all__ = [
|
|
34
34
|
"DivergenceOutput",
|
35
35
|
"DiversityOutput",
|
36
36
|
"DriftMMDOutput",
|
37
|
+
"DriftMVDCOutput",
|
37
38
|
"DriftOutput",
|
38
39
|
"DuplicatesOutput",
|
39
40
|
"ExecutionMetadata",
|
dataeval/outputs/_bias.py
CHANGED
@@ -7,10 +7,10 @@ from dataclasses import asdict, dataclass
|
|
7
7
|
from typing import Any, Literal, TypeVar, overload
|
8
8
|
|
9
9
|
import numpy as np
|
10
|
+
import pandas as pd
|
10
11
|
from numpy.typing import NDArray
|
11
12
|
|
12
13
|
with contextlib.suppress(ImportError):
|
13
|
-
import pandas as pd
|
14
14
|
from matplotlib.figure import Figure
|
15
15
|
|
16
16
|
from dataeval.data._images import Images
|
@@ -38,8 +38,6 @@ class ToDataFrameMixin:
|
|
38
38
|
-----
|
39
39
|
This method requires `pandas <https://pandas.pydata.org/>`_ to be installed.
|
40
40
|
"""
|
41
|
-
import pandas as pd
|
42
|
-
|
43
41
|
return pd.DataFrame(
|
44
42
|
index=self.factor_names, # type: ignore - list[str] is documented as acceptable index type
|
45
43
|
data={
|