dataeval 0.85.0__py3-none-any.whl → 0.86.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,192 @@
+ """
+ Source code derived from NannyML 0.13.0
+ https://github.com/NannyML/nannyml/blob/main/nannyml/drift/multivariate/domain_classifier/calculator.py
+ 
+ Licensed under Apache Software License (Apache 2.0)
+ """
+ 
+ from __future__ import annotations
+ 
+ import logging
+ from typing import Any
+ 
+ import numpy as np
+ import pandas as pd
+ from lightgbm import LGBMClassifier
+ from numpy.typing import NDArray
+ from sklearn.metrics import roc_auc_score
+ from sklearn.model_selection import StratifiedKFold
+ 
+ from dataeval.config import get_max_processes, get_seed
+ from dataeval.detectors.drift._nml._base import AbstractCalculator, _create_multilevel_index
+ from dataeval.detectors.drift._nml._chunk import Chunk, Chunker
+ from dataeval.detectors.drift._nml._thresholds import ConstantThreshold, Threshold, calculate_threshold_values
+ from dataeval.outputs._base import set_metadata
+ from dataeval.outputs._drift import DriftMVDCOutput
+ 
+ logger = logging.getLogger(__name__)
+ 
+ DEFAULT_LGBM_HYPERPARAMS = {
+     "boosting_type": "gbdt",
+     "class_weight": None,
+     "colsample_bytree": 1.0,
+     "deterministic": True,
+     "importance_type": "split",
+     "learning_rate": 0.1,
+     "max_depth": -1,
+     "min_child_samples": 20,
+     "min_child_weight": 0.001,
+     "min_split_gain": 0.0,
+     "n_estimators": 100,
+     "n_jobs": get_max_processes() or 0,
+     "num_leaves": 31,
+     "objective": None,
+     "random_state": get_seed(),
+     "reg_alpha": 0.0,
+     "reg_lambda": 0.0,
+     "subsample": 1.0,
+     "subsample_for_bin": 200000,
+     "subsample_freq": 0,
+     "verbosity": -1,
+ }
+ 
+ 
+ class DomainClassifierCalculator(AbstractCalculator):
+     """
+     DomainClassifierCalculator implementation.
+ 
+     Uses a domain classifier's cross-validated performance as a measure of drift.
+     """
+ 
+     def __init__(
+         self,
+         chunker: Chunker | None = None,
+         cv_folds_num: int = 5,
+         hyperparameters: dict[str, Any] | None = None,
+         threshold: Threshold = ConstantThreshold(lower=0.45, upper=0.65),
+     ) -> None:
+         """
+         Create a new DomainClassifierCalculator instance.
+ 
+         Parameters
+         ----------
+         chunker : Chunker, default=None
+             The `Chunker` used to split the data sets into lists of chunks.
+         cv_folds_num : int, default=5
+             Number of cross-validation folds to use when calculating the DC discrimination value.
+         hyperparameters : dict[str, Any], default=None
+             A dictionary used to provide your own custom hyperparameters when training the discrimination model.
+             Check out the available hyperparameter options in the
+             `LightGBM docs <https://lightgbm.readthedocs.io/en/latest/pythonapi/lightgbm.LGBMClassifier.html>`_.
+         threshold : Threshold, default=ConstantThreshold(lower=0.45, upper=0.65)
+             The threshold you wish to evaluate values on. Defaults to a ConstantThreshold with a lower value
+             of 0.45 and an upper value of 0.65.
+         """
+         super().__init__(chunker, logger)
+ 
+         self.cv_folds_num = cv_folds_num
+         self.hyperparameters = DEFAULT_LGBM_HYPERPARAMS if hyperparameters is None else hyperparameters
+         self.threshold = threshold
+         self.result: DriftMVDCOutput | None = None
+ 
+     def _fit(self, reference_data: pd.DataFrame) -> DriftMVDCOutput:
+         """Fits the DC calculator to a set of reference data."""
+         self._x_ref = reference_data
+         result = self._calculate(data=self._x_ref)
+         result._data[("chunk", "period")] = "reference"
+ 
+         return result
+ 
+     @set_metadata
+     def _calculate(self, data: pd.DataFrame) -> DriftMVDCOutput:
+         """Calculate the domain classifier metric for the given data set."""
+         chunks = self.chunker.split(data)
+ 
+         res = pd.DataFrame.from_records(
+             [
+                 {
+                     **chunk.dict(),
+                     "period": "analysis",
+                     "classifier_auroc_value": self._calculate_chunk(chunk=chunk),
+                 }
+                 for chunk in chunks
+             ]
+         )
+ 
+         multilevel_index = _create_multilevel_index(chunks, "domain_classifier_auroc", ["value"])
+         res.columns = multilevel_index
+         res = res.reset_index(drop=True)
+ 
+         res = self._populate_alert_thresholds(res)
+ 
+         if self.result is None:
+             self.result = DriftMVDCOutput(results_data=res)
+         else:
+             self.result = self.result.filter(period="reference")
+             self.result._data = pd.concat([self.result._data, res], ignore_index=True)
+         return self.result
+ 
+     def _calculate_chunk(self, chunk: Chunk) -> float:
+         if self.result is None:
+             # Use information from chunk indices to identify the reference chunk's location. This is possible because
+             # both the internal reference data copy and the chunk data were sorted by timestamp, so these
+             # indices align. This way we eliminate the need to combine these two data frames and drop duplicate rows,
+             # which is a costly operation.
+             df_X = self._x_ref
+             y = np.zeros(len(df_X), dtype=np.intp)
+             y[chunk.start_index : chunk.end_index + 1] = 1
+         else:
+             chunk_X = chunk.data
+             reference_X = self._x_ref
+             chunk_y = np.ones(len(chunk_X), dtype=np.intp)
+             reference_y = np.zeros(len(reference_X), dtype=np.intp)
+             df_X = pd.concat([reference_X, chunk_X], ignore_index=True)
+             y = np.concatenate([reference_y, chunk_y])
+ 
+         skf = StratifiedKFold(n_splits=self.cv_folds_num)
+         all_preds: list[NDArray[np.float32]] = []
+         all_tgts: list[NDArray[np.intp]] = []
+         for train_index, test_index in skf.split(df_X, y):
+             _trx = df_X.iloc[train_index]
+             _try = y[train_index]
+             _tsx = df_X.iloc[test_index]
+             _tsy = y[test_index]
+             model = LGBMClassifier(**self.hyperparameters)
+             model.fit(_trx, _try)
+             preds = np.asarray(model.predict_proba(_tsx), dtype=np.float32)[:, 1]
+             all_preds.append(preds)
+             all_tgts.append(_tsy)
+ 
+         np_all_preds = np.concatenate(all_preds, axis=0)
+         np_all_tgts = np.concatenate(all_tgts, axis=0)
+         try:
+             # catch the case where all rows are duplicates
+             result = roc_auc_score(np_all_tgts, np_all_preds)
+         except ValueError as err:
+             if str(err) != "Only one class present in y_true. ROC AUC score is not defined in that case.":
+                 raise
+             else:
+                 # by definition, if reference and chunk exactly match we can't discriminate
+                 result = 0.5
+         return result
+ 
+     def _populate_alert_thresholds(self, result_data: pd.DataFrame) -> pd.DataFrame:
+         if self.result is None:
+             self._threshold_values = calculate_threshold_values(
+                 threshold=self.threshold,
+                 data=result_data.loc[:, ("domain_classifier_auroc", "value")],  # type: ignore | dataframe loc
+                 lower_threshold_value_limit=0.0,
+                 upper_threshold_value_limit=1.0,
+                 logger=self._logger,
+             )
+ 
+         result_data[("domain_classifier_auroc", "upper_threshold")] = self._threshold_values[1]
+         result_data[("domain_classifier_auroc", "lower_threshold")] = self._threshold_values[0]
+         result_data[("domain_classifier_auroc", "alert")] = result_data.apply(
+             lambda row: bool(
+                 row["domain_classifier_auroc", "value"] > row["domain_classifier_auroc", "upper_threshold"]
+                 or row["domain_classifier_auroc", "value"] < row["domain_classifier_auroc", "lower_threshold"]
+             ),
+             axis=1,
+         )
+         return result_data
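
For orientation, the calculator above implements the standard domain-classifier drift test: reference rows are labeled 0, analysis rows 1, a classifier is cross-validated on the combined frame, and the resulting AUROC is the drift score (0.5 means the chunk is indistinguishable from reference; values toward 1.0 indicate drift). A minimal standalone sketch of that loop follows; the function name and classifier defaults here are illustrative, not part of dataeval's API:

    import numpy as np
    import pandas as pd
    from lightgbm import LGBMClassifier
    from sklearn.metrics import roc_auc_score
    from sklearn.model_selection import StratifiedKFold

    def domain_classifier_auroc(reference: pd.DataFrame, analysis: pd.DataFrame, n_folds: int = 5) -> float:
        # Label reference rows 0 and analysis rows 1, then measure separability.
        X = pd.concat([reference, analysis], ignore_index=True)
        y = np.concatenate([np.zeros(len(reference), dtype=np.intp), np.ones(len(analysis), dtype=np.intp)])
        preds, tgts = [], []
        for train_idx, test_idx in StratifiedKFold(n_splits=n_folds).split(X, y):
            model = LGBMClassifier(verbosity=-1)
            model.fit(X.iloc[train_idx], y[train_idx])
            preds.append(model.predict_proba(X.iloc[test_idx])[:, 1])
            tgts.append(y[test_idx])
        # AUROC ~0.5 -> no measurable drift; ~1.0 -> strong drift.
        return float(roc_auc_score(np.concatenate(tgts), np.concatenate(preds)))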
@@ -0,0 +1,98 @@
+ """
+ Contains the base classes for drift calculation results.
+ 
+ Source code derived from NannyML 0.13.0
+ https://github.com/NannyML/nannyml/blob/main/nannyml/base.py
+ 
+ Licensed under Apache Software License (Apache 2.0)
+ """
+ 
+ from __future__ import annotations
+ 
+ import copy
+ from abc import ABC, abstractmethod
+ from typing import NamedTuple, Sequence
+ 
+ import pandas as pd
+ from typing_extensions import Self
+ 
+ from dataeval.outputs._base import GenericOutput
+ 
+ 
+ class Metric(NamedTuple):
+     display_name: str
+     column_name: str
+ 
+ 
+ class AbstractResult(GenericOutput[pd.DataFrame]):
+     def __init__(self, results_data: pd.DataFrame) -> None:
+         self._data = results_data.copy(deep=True)
+ 
+     def data(self) -> pd.DataFrame:
+         return self.to_df()
+ 
+     @property
+     def empty(self) -> bool:
+         return self._data is None or self._data.empty
+ 
+     def __len__(self) -> int:
+         return 0 if self.empty else len(self._data)
+ 
+     def to_df(self, multilevel: bool = True) -> pd.DataFrame:
+         """Export results to pandas dataframe."""
+         if multilevel:
+             return self._data
+         else:
+             column_names = [
+                 "_".join(col).replace("chunk_chunk_chunk", "chunk").replace("chunk_chunk", "chunk")
+                 for col in self._data.columns.values
+             ]
+             single_level_data = self._data.copy(deep=True)
+             single_level_data.columns = column_names
+             return single_level_data
+ 
+     def filter(self, period: str = "all", metrics: str | Sequence[str] | None = None) -> Self:
+         """Returns filtered result metric data."""
+         if metrics and not isinstance(metrics, (str, Sequence)):
+             raise ValueError("metrics value provided is not a valid metric or sequence of metrics")
+         if isinstance(metrics, str):
+             metrics = [metrics]
+         return self._filter(period, metrics)
+ 
+     @abstractmethod
+     def _filter(self, period: str, metrics: Sequence[str] | None = None) -> Self: ...
+ 
+ 
+ class Abstract1DResult(AbstractResult, ABC):
+     def __init__(self, results_data: pd.DataFrame) -> None:
+         super().__init__(results_data)
+ 
+     def _filter(self, period: str, metrics=None) -> Self:
+         data = self._data
+         if period != "all":
+             data = self._data.loc[self._data.loc[:, ("chunk", "period")] == period, :]  # type: ignore | dataframe loc
+         data = data.reset_index(drop=True)
+ 
+         res = copy.deepcopy(self)
+         res._data = data
+         return res
+ 
+ 
+ class PerMetricResult(Abstract1DResult):
+     def __init__(self, results_data: pd.DataFrame, metrics: Sequence[Metric] = ()) -> None:
+         super().__init__(results_data)
+         self.metrics = metrics
+ 
+     def _filter(self, period: str, metrics: Sequence[str] | None = None) -> Self:
+         if metrics is None:
+             metrics = [metric.column_name for metric in self.metrics]
+ 
+         res = super()._filter(period)
+ 
+         data = pd.concat([res._data.loc[:, (["chunk"])], res._data.loc[:, (metrics,)]], axis=1)  # type: ignore | dataframe loc
+         data = data.reset_index(drop=True)
+ 
+         res._data = data
+         res.metrics = [metric for metric in self.metrics if metric.column_name in metrics]
+ 
+         return res
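
For reference, the flattening rule in to_df(multilevel=False) joins each column tuple with underscores and collapses the repeated chunk prefix. A tiny illustration, using column names borrowed from the calculator file above:

    cols = [("chunk", "period"), ("domain_classifier_auroc", "value")]
    flat = ["_".join(c).replace("chunk_chunk_chunk", "chunk").replace("chunk_chunk", "chunk") for c in cols]
    print(flat)  # ['chunk_period', 'domain_classifier_auroc_value']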
@@ -0,0 +1,278 @@
+ """
+ Source code derived from NannyML 0.13.0
+ https://github.com/NannyML/nannyml/blob/main/nannyml/thresholds.py
+ 
+ Licensed under Apache Software License (Apache 2.0)
+ """
+ 
+ from __future__ import annotations
+ 
+ import logging
+ from abc import ABC, abstractmethod
+ from typing import Any, Callable, ClassVar
+ 
+ import numpy as np
+ 
+ 
+ class Threshold(ABC):
+     """A base class used to calculate lower and upper threshold values given one or multiple arrays.
+ 
+     Any subclass should implement the abstract `thresholds` method.
+     It takes an array or list of arrays and converts them into lower and upper threshold values, represented
+     as a tuple of optional floats.
+ 
+     A `None` threshold value is interpreted as if there is no upper or lower threshold.
+     One or both values might be `None`.
+     """
+ 
+     _registry: ClassVar[dict[str, type[Threshold]]] = {}
+     """Class registry lookup to get threshold subclass from threshold_type string"""
+ 
+     def __str__(self) -> str:
+         return self.__repr__()
+ 
+     def __repr__(self) -> str:
+         return self.__class__.__name__ + str(vars(self))
+ 
+     def __eq__(self, other: object) -> bool:
+         return isinstance(other, self.__class__) and other.__dict__ == self.__dict__
+ 
+     def __init_subclass__(cls, threshold_type: str) -> None:
+         Threshold._registry[threshold_type] = cls
+ 
+     @abstractmethod
+     def thresholds(self, data: np.ndarray) -> tuple[float | None, float | None]:
+         """Returns lower and upper threshold values when given one or more np.ndarray instances.
+ 
+         Parameters:
+             data: np.ndarray
+                 An array of values used to calculate the thresholds on. This will most often represent a metric
+                 calculated on one or more sets of data, e.g. a list of F1 scores of multiple data chunks.
+ 
+         Returns:
+             lower, upper: tuple[Optional[float], Optional[float]]
+                 The lower and upper threshold values. One or both might be `None`.
+         """
+ 
+     @classmethod
+     def parse_object(cls, obj: dict[str, Any]) -> Threshold:
+         """Parse object as :class:`Threshold`"""
+         class_name = obj.pop("type", "")
+ 
+         try:
+             threshold_cls = cls._registry[class_name]
+         except KeyError:
+             accepted_values = ", ".join(map(repr, cls._registry))
+             raise ValueError(f"Expected one of {accepted_values} for threshold type, but received '{class_name}'")
+ 
+         return threshold_cls(**obj)
+ 
+ 
+ class ConstantThreshold(Threshold, threshold_type="constant"):
+     """A `Threshold` implementation that returns a constant lower and/or upper threshold value.
+ 
+     Attributes:
+         lower: Optional[float]
+             The constant lower threshold value. Defaults to `None`, meaning there is no lower threshold.
+         upper: Optional[float]
+             The constant upper threshold value. Defaults to `None`, meaning there is no upper threshold.
+ 
+     Raises:
+         ValueError: raised when an argument was given using an incorrect type or name
+         ValueError: raised when the ConstantThreshold could not be created using the given argument values
+ 
+     Examples:
+         >>> data = np.array(range(10))
+         >>> t = ConstantThreshold(lower=None, upper=0.1)
+         >>> lower, upper = t.thresholds(data)
+         >>> print(lower, upper)
+         None 0.1
+     """
+ 
+     def __init__(self, lower: float | int | None = None, upper: float | int | None = None):
+         """Creates a new ConstantThreshold instance.
+ 
+         Args:
+             lower: Optional[Union[float, int]], default=None
+                 The constant lower threshold value. Defaults to `None`, meaning there is no lower threshold.
+             upper: Optional[Union[float, int]], default=None
+                 The constant upper threshold value. Defaults to `None`, meaning there is no upper threshold.
+ 
+         Raises:
+             ValueError: raised when an argument was given using an incorrect type or name
+             ValueError: raised when the ConstantThreshold could not be created using the given argument values
+         """
+         self._validate_inputs(lower, upper)
+ 
+         self.lower = lower
+         self.upper = upper
+ 
+     def thresholds(self, data: np.ndarray) -> tuple[float | None, float | None]:
+         return self.lower, self.upper
+ 
+     @staticmethod
+     def _validate_inputs(lower: float | int | None = None, upper: float | int | None = None):
+         if lower is not None and not isinstance(lower, (float, int)) or isinstance(lower, bool):
+             raise ValueError(f"expected type of 'lower' to be 'float', 'int' or None but got '{type(lower).__name__}'")
+ 
+         if upper is not None and not isinstance(upper, (float, int)) or isinstance(upper, bool):
+             raise ValueError(f"expected type of 'upper' to be 'float', 'int' or None but got '{type(upper).__name__}'")
+ 
+         # explicit None check is required due to special interpretation of the value 0.0 as False
+         if lower is not None and upper is not None and lower >= upper:
+             raise ValueError(f"lower threshold {lower} must be less than upper threshold {upper}")
+ 
+ 
+ class StandardDeviationThreshold(Threshold, threshold_type="standard_deviation"):
+     """A Threshold that offsets the mean of an array by a multiple of the standard deviation of the array values.
+ 
+     This threshold will take the aggregate of an array of values, the mean by default, and add or subtract an
+     offset to get the upper and lower threshold values.
+     This offset is calculated as a multiplier, by default 3, times the standard deviation of the given array.
+ 
+     Attributes:
+         std_lower_multiplier: float
+         std_upper_multiplier: float
+ 
+     Examples:
+         >>> data = np.array(range(10))
+         >>> t = StandardDeviationThreshold()
+         >>> lower, upper = t.thresholds(data)
+         >>> print(lower, upper)
+         -4.116843969807043 13.116843969807043
+     """
+ 
+     def __init__(
+         self,
+         std_lower_multiplier: float | int | None = 3,
+         std_upper_multiplier: float | int | None = 3,
+         offset_from: Callable[[np.ndarray], Any] = np.nanmean,
+     ):
+         """Creates a new StandardDeviationThreshold instance.
+ 
+         Args:
+             std_lower_multiplier: float, default=3
+                 The number the standard deviation of the input array will be multiplied with to form the lower offset.
+                 This value will be subtracted from the aggregate of the input array.
+                 Defaults to 3.
+             std_upper_multiplier: float, default=3
+                 The number the standard deviation of the input array will be multiplied with to form the upper offset.
+                 This value will be added to the aggregate of the input array.
+                 Defaults to 3.
+             offset_from: Callable[[np.ndarray], Any], default=np.nanmean
+                 A function that will be applied to the input array to aggregate it into a single value.
+                 Adding the upper offset to this value will yield the upper threshold, subtracting the lower offset
+                 will yield the lower threshold.
+         """
+ 
+         self._validate_inputs(std_lower_multiplier, std_upper_multiplier)
+ 
+         self.std_lower_multiplier = std_lower_multiplier
+         self.std_upper_multiplier = std_upper_multiplier
+         self.offset_from = offset_from
+ 
+     def thresholds(self, data: np.ndarray) -> tuple[float | None, float | None]:
+         aggregate = self.offset_from(data)
+         std = np.nanstd(data)
+ 
+         lower_threshold = aggregate - std * self.std_lower_multiplier if self.std_lower_multiplier is not None else None
+ 
+         upper_threshold = aggregate + std * self.std_upper_multiplier if self.std_upper_multiplier is not None else None
+ 
+         return lower_threshold, upper_threshold
+ 
+     @staticmethod
+     def _validate_inputs(std_lower_multiplier: float | int | None = 3, std_upper_multiplier: float | int | None = 3):
+         if (
+             std_lower_multiplier is not None
+             and not isinstance(std_lower_multiplier, (float, int))
+             or isinstance(std_lower_multiplier, bool)
+         ):
+             raise ValueError(
+                 f"expected type of 'std_lower_multiplier' to be 'float', 'int' or None "
+                 f"but got '{type(std_lower_multiplier).__name__}'"
+             )
+ 
+         if std_lower_multiplier and std_lower_multiplier < 0:
+             raise ValueError(f"'std_lower_multiplier' should be greater than or equal to 0 but got {std_lower_multiplier}")
+ 
+         if (
+             std_upper_multiplier is not None
+             and not isinstance(std_upper_multiplier, (float, int))
+             or isinstance(std_upper_multiplier, bool)
+         ):
+             raise ValueError(
+                 f"expected type of 'std_upper_multiplier' to be 'float', 'int' or None "
+                 f"but got '{type(std_upper_multiplier).__name__}'"
+             )
+ 
+         if std_upper_multiplier and std_upper_multiplier < 0:
+             raise ValueError(f"'std_upper_multiplier' should be greater than or equal to 0 but got {std_upper_multiplier}")
+ 
+ 
+ def calculate_threshold_values(
+     threshold: Threshold,
+     data: np.ndarray,
+     lower_threshold_value_limit: float | None = None,
+     upper_threshold_value_limit: float | None = None,
+     override_using_none: bool = False,
+     logger: logging.Logger | None = None,
+     metric_name: str | None = None,
+ ) -> tuple[float | None, float | None]:
+     """Calculate lower and upper threshold values with respect to the provided Threshold and value limits.
+ 
+     Parameters:
+         threshold: Threshold
+             The Threshold instance that determines how the lower and upper threshold values will be calculated.
+         data: np.ndarray
+             The data used by the Threshold instance to calculate the lower and upper threshold values.
+             This will often be the values of a drift detection method or performance metric on chunks of reference data.
+         lower_threshold_value_limit: Optional[float], default=None
+             An optional value that serves as a limit for the lower threshold value. Any calculated lower threshold
+             values that end up below this limit will be replaced by this limit value.
+             The limit is often a theoretical constraint enforced by a specific drift detection method or performance
+             metric.
+         upper_threshold_value_limit: Optional[float], default=None
+             An optional value that serves as a limit for the upper threshold value. Any calculated upper threshold
+             values that end up above this limit will be replaced by this limit value.
+             The limit is often a theoretical constraint enforced by a specific drift detection method or performance
+             metric.
+         override_using_none: bool, default=False
+             When set to True use None to override threshold values that exceed value limits.
+             This will prevent them from being rendered on plots.
+         logger: Optional[logging.Logger], default=None
+             An optional Logger instance. When provided a warning will be logged when a calculated threshold value
+             gets overridden by a threshold value limit.
+         metric_name: Optional[str], default=None
+             When provided the metric name will be included within any log messages for additional clarity.
+     """
+ 
+     lower_threshold_value, upper_threshold_value = threshold.thresholds(data)
+ 
+     if (
+         lower_threshold_value_limit is not None
+         and lower_threshold_value is not None
+         and lower_threshold_value <= lower_threshold_value_limit
+     ):
+         override_value = None if override_using_none else lower_threshold_value_limit
+         if logger:
+             logger.warning(
+                 f"{metric_name + ' ' if metric_name else ''}lower threshold value {lower_threshold_value} "
+                 f"overridden by lower threshold value limit {override_value}"
+             )
+         lower_threshold_value = override_value
+ 
+     if (
+         upper_threshold_value_limit is not None
+         and upper_threshold_value is not None
+         and upper_threshold_value >= upper_threshold_value_limit
+     ):
+         override_value = None if override_using_none else upper_threshold_value_limit
+         if logger:
+             logger.warning(
+                 f"{metric_name + ' ' if metric_name else ''}upper threshold value {upper_threshold_value} "
+                 f"overridden by upper threshold value limit {override_value}"
+             )
+         upper_threshold_value = override_value
+ 
+     return lower_threshold_value, upper_threshold_value
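
The calculator file imports these helpers from dataeval.detectors.drift._nml._thresholds, so a brief usage sketch looks like the following; the sample values are made up, and the 0.0/1.0 limits mirror the clamping done in _populate_alert_thresholds:

    import numpy as np

    from dataeval.detectors.drift._nml._thresholds import (
        ConstantThreshold,
        StandardDeviationThreshold,
        calculate_threshold_values,
    )

    data = np.array([0.48, 0.52, 0.50, 0.61, 0.47])  # e.g. per-chunk AUROC values

    lower, upper = ConstantThreshold(lower=0.45, upper=0.65).thresholds(data)  # (0.45, 0.65)
    sd_lower, sd_upper = StandardDeviationThreshold().thresholds(data)  # mean -/+ 3 * nanstd

    # Value limits keep the derived thresholds inside the valid AUROC range [0.0, 1.0].
    lower, upper = calculate_threshold_values(
        threshold=StandardDeviationThreshold(),
        data=data,
        lower_threshold_value_limit=0.0,
        upper_threshold_value_limit=1.0,
    )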
@@ -5,7 +5,7 @@ as well as runtime metadata for reproducibility and logging.
 
  from ._base import ExecutionMetadata
  from ._bias import BalanceOutput, CompletenessOutput, CoverageOutput, DiversityOutput, LabelParityOutput, ParityOutput
- from ._drift import DriftMMDOutput, DriftOutput
+ from ._drift import DriftMMDOutput, DriftMVDCOutput, DriftOutput
  from ._estimators import BEROutput, ClustererOutput, DivergenceOutput, UAPOutput
  from ._linters import DuplicatesOutput, OutliersOutput
  from ._metadata import MetadataDistanceOutput, MetadataDistanceValues, MostDeviatedFactorsOutput, OODPredictorOutput
@@ -34,6 +34,7 @@ __all__ = [
      "DivergenceOutput",
      "DiversityOutput",
      "DriftMMDOutput",
+     "DriftMVDCOutput",
      "DriftOutput",
      "DuplicatesOutput",
      "ExecutionMetadata",
dataeval/outputs/_bias.py CHANGED
@@ -7,10 +7,10 @@ from dataclasses import asdict, dataclass
  from typing import Any, Literal, TypeVar, overload
 
  import numpy as np
+ import pandas as pd
  from numpy.typing import NDArray
 
  with contextlib.suppress(ImportError):
-     import pandas as pd
      from matplotlib.figure import Figure
 
  from dataeval.data._images import Images
@@ -38,8 +38,6 @@ class ToDataFrameMixin:
          -----
          This method requires `pandas <https://pandas.pydata.org/>`_ to be installed.
          """
-         import pandas as pd
- 
          return pd.DataFrame(
              index=self.factor_names,  # type: ignore - list[str] is documented as acceptable index type
              data={