dataeval 0.84.1__py3-none-any.whl → 0.86.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64) hide show
  1. dataeval/__init__.py +1 -1
  2. dataeval/data/__init__.py +19 -0
  3. dataeval/{utils/data → data}/_embeddings.py +137 -17
  4. dataeval/{utils/data → data}/_metadata.py +20 -8
  5. dataeval/{utils/data → data}/_selection.py +22 -9
  6. dataeval/{utils/data → data}/_split.py +1 -1
  7. dataeval/data/selections/__init__.py +19 -0
  8. dataeval/{utils/data → data}/selections/_classbalance.py +1 -2
  9. dataeval/data/selections/_classfilter.py +110 -0
  10. dataeval/{utils/data → data}/selections/_indices.py +1 -1
  11. dataeval/{utils/data → data}/selections/_limit.py +1 -1
  12. dataeval/{utils/data → data}/selections/_prioritize.py +2 -2
  13. dataeval/{utils/data → data}/selections/_reverse.py +1 -1
  14. dataeval/{utils/data → data}/selections/_shuffle.py +1 -1
  15. dataeval/detectors/drift/__init__.py +4 -1
  16. dataeval/detectors/drift/_base.py +1 -1
  17. dataeval/detectors/drift/_cvm.py +2 -2
  18. dataeval/detectors/drift/_ks.py +2 -2
  19. dataeval/detectors/drift/_mmd.py +2 -2
  20. dataeval/detectors/drift/_mvdc.py +92 -0
  21. dataeval/detectors/drift/_nml/__init__.py +6 -0
  22. dataeval/detectors/drift/_nml/_base.py +68 -0
  23. dataeval/detectors/drift/_nml/_chunk.py +404 -0
  24. dataeval/detectors/drift/_nml/_domainclassifier.py +192 -0
  25. dataeval/detectors/drift/_nml/_result.py +98 -0
  26. dataeval/detectors/drift/_nml/_thresholds.py +280 -0
  27. dataeval/detectors/linters/duplicates.py +1 -1
  28. dataeval/detectors/linters/outliers.py +1 -1
  29. dataeval/metadata/_distance.py +1 -1
  30. dataeval/metadata/_ood.py +4 -4
  31. dataeval/metrics/bias/_balance.py +1 -1
  32. dataeval/metrics/bias/_diversity.py +1 -1
  33. dataeval/metrics/bias/_parity.py +1 -1
  34. dataeval/metrics/stats/_labelstats.py +2 -2
  35. dataeval/outputs/__init__.py +2 -1
  36. dataeval/outputs/_bias.py +2 -4
  37. dataeval/outputs/_drift.py +68 -0
  38. dataeval/outputs/_linters.py +1 -6
  39. dataeval/outputs/_stats.py +1 -6
  40. dataeval/typing.py +31 -0
  41. dataeval/utils/__init__.py +2 -2
  42. dataeval/utils/data/__init__.py +5 -20
  43. dataeval/utils/data/collate.py +2 -0
  44. dataeval/utils/datasets/__init__.py +17 -0
  45. dataeval/utils/{data/datasets → datasets}/_base.py +3 -3
  46. dataeval/utils/{data/datasets → datasets}/_cifar10.py +2 -2
  47. dataeval/utils/{data/datasets → datasets}/_milco.py +2 -2
  48. dataeval/utils/{data/datasets → datasets}/_mnist.py +2 -2
  49. dataeval/utils/{data/datasets → datasets}/_ships.py +2 -2
  50. dataeval/utils/{data/datasets → datasets}/_voc.py +3 -3
  51. {dataeval-0.84.1.dist-info → dataeval-0.86.0.dist-info}/METADATA +3 -2
  52. dataeval-0.86.0.dist-info/RECORD +114 -0
  53. dataeval/utils/data/datasets/__init__.py +0 -17
  54. dataeval/utils/data/selections/__init__.py +0 -19
  55. dataeval/utils/data/selections/_classfilter.py +0 -44
  56. dataeval-0.84.1.dist-info/RECORD +0 -106
  57. /dataeval/{utils/data → data}/_images.py +0 -0
  58. /dataeval/{utils/data → data}/_targets.py +0 -0
  59. /dataeval/utils/{metadata.py → data/metadata.py} +0 -0
  60. /dataeval/utils/{data/datasets → datasets}/_fileio.py +0 -0
  61. /dataeval/utils/{data/datasets → datasets}/_mixin.py +0 -0
  62. /dataeval/utils/{data/datasets → datasets}/_types.py +0 -0
  63. {dataeval-0.84.1.dist-info → dataeval-0.86.0.dist-info}/LICENSE.txt +0 -0
  64. {dataeval-0.84.1.dist-info → dataeval-0.86.0.dist-info}/WHEEL +0 -0
@@ -0,0 +1,192 @@
1
+ """
2
+ Source code derived from NannyML 0.13.0
3
+ https://github.com/NannyML/nannyml/blob/main/nannyml/drift/multivariate/domain_classifier/calculator.py
4
+
5
+ Licensed under Apache Software License (Apache 2.0)
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import logging
11
+ from typing import Any
12
+
13
+ import numpy as np
14
+ import pandas as pd
15
+ from lightgbm import LGBMClassifier
16
+ from numpy.typing import NDArray
17
+ from sklearn.metrics import roc_auc_score
18
+ from sklearn.model_selection import StratifiedKFold
19
+
20
+ from dataeval.config import get_max_processes, get_seed
21
+ from dataeval.detectors.drift._nml._base import AbstractCalculator, _create_multilevel_index
22
+ from dataeval.detectors.drift._nml._chunk import Chunk, Chunker
23
+ from dataeval.detectors.drift._nml._thresholds import ConstantThreshold, Threshold, calculate_threshold_values
24
+ from dataeval.outputs._base import set_metadata
25
+ from dataeval.outputs._drift import DriftMVDCOutput
26
+
27
+ logger = logging.getLogger(__name__)
28
+
29
# Default LightGBM hyperparameters. DomainClassifierCalculator falls back to this
# mapping when the caller does not supply its own ``hyperparameters``.
# NOTE: get_max_processes() and get_seed() are evaluated once, when this module is
# imported; this dict does not pick up configuration changes made afterwards.
DEFAULT_LGBM_HYPERPARAMS = {
    "boosting_type": "gbdt",
    "class_weight": None,
    "colsample_bytree": 1.0,
    "deterministic": True,
    "importance_type": "split",
    "learning_rate": 0.1,
    "max_depth": -1,
    "min_child_samples": 20,
    "min_child_weight": 0.001,
    "min_split_gain": 0.0,
    "n_estimators": 100,
    # Falls back to 0 when get_max_processes() returns a falsy value (e.g. None).
    "n_jobs": get_max_processes() or 0,
    "num_leaves": 31,
    "objective": None,
    "random_state": get_seed(),
    "reg_alpha": 0.0,
    "reg_lambda": 0.0,
    "subsample": 1.0,
    "subsample_for_bin": 200000,
    "subsample_freq": 0,
    "verbosity": -1,
}
52
+
53
+
54
class DomainClassifierCalculator(AbstractCalculator):
    """
    DomainClassifierCalculator implementation.

    Uses the cross-validated AUROC of a drift detection classifier (LightGBM) as a
    measure of drift: the classifier is trained to tell reference rows (label 0)
    from chunk rows (label 1). An AUROC outside the configured thresholds raises
    an alert for that chunk.
    """

    def __init__(
        self,
        chunker: Chunker | None = None,
        cv_folds_num: int = 5,
        hyperparameters: dict[str, Any] | None = None,
        # NOTE: this default instance is created once, when the class body is executed,
        # and is shared by every calculator that does not pass its own threshold.
        threshold: Threshold = ConstantThreshold(lower=0.45, upper=0.65),
    ) -> None:
        """
        Create a new DomainClassifierCalculator instance.

        Parameters
        ----------
        chunker : Chunker, default=None
            The `Chunker` used to split the data sets into a list of chunks.
        cv_folds_num : int, default=5
            Number of cross-validation folds to use when calculating the DC discrimination value.
        hyperparameters : dict[str, Any], default=None
            A dictionary used to provide your own custom hyperparameters when training the
            discrimination model. When None, a private copy of ``DEFAULT_LGBM_HYPERPARAMS`` is used.
            Check out the available hyperparameter options in the
            `LightGBM docs <https://lightgbm.readthedocs.io/en/latest/pythonapi/lightgbm.LGBMClassifier.html>`_.
        threshold : Threshold, default=ConstantThreshold(lower=0.45, upper=0.65)
            The threshold you wish to evaluate values on.
        """
        super().__init__(chunker, logger)

        self.cv_folds_num = cv_folds_num
        # Copy the module-level defaults so that mutating one calculator's hyperparameters
        # cannot leak into the shared default mapping (and thereby into other instances).
        self.hyperparameters = dict(DEFAULT_LGBM_HYPERPARAMS) if hyperparameters is None else hyperparameters
        self.threshold = threshold
        self.result: DriftMVDCOutput | None = None

    def _fit(self, reference_data: pd.DataFrame) -> DriftMVDCOutput:
        """Fit the DC calculator to a set of reference data.

        Stores the reference frame, runs the calculation on it and relabels the
        resulting rows' ("chunk", "period") column as "reference".
        """
        self._x_ref = reference_data
        result = self._calculate(data=self._x_ref)
        # _calculate labels every row "analysis"; the fitting pass describes reference data.
        result._data[("chunk", "period")] = "reference"

        return result

    @set_metadata
    def _calculate(self, data: pd.DataFrame) -> DriftMVDCOutput:
        """Calculate the domain classifier AUROC for every chunk of a given data set.

        On the first call (no stored result yet) the result is created from the
        computed rows. On later calls the stored result is reduced to its
        "reference" rows and the newly computed rows are appended.
        """
        chunks = self.chunker.split(data)

        res = pd.DataFrame.from_records(
            [
                {
                    **chunk.dict(),
                    "period": "analysis",
                    "classifier_auroc_value": self._calculate_chunk(chunk=chunk),
                }
                for chunk in chunks
            ]
        )

        # NOTE(review): assumes the helper returns column labels in the same order as the
        # record keys above (chunk fields, period, value) - confirm against _base.py.
        multilevel_index = _create_multilevel_index(chunks, "domain_classifier_auroc", ["value"])
        res.columns = multilevel_index
        res = res.reset_index(drop=True)

        res = self._populate_alert_thresholds(res)

        if self.result is None:
            self.result = DriftMVDCOutput(results_data=res)
        else:
            # Drop rows of any previous analysis run, keep the reference rows, append the new ones.
            self.result = self.result.filter(period="reference")
            self.result._data = pd.concat([self.result._data, res], ignore_index=True)
        return self.result

    def _calculate_chunk(self, chunk: Chunk) -> float:
        """Return the cross-validated AUROC of a classifier separating reference rows from chunk rows.

        Returns 0.5 when scikit-learn reports that only one class is present in the targets.
        Any other ValueError raised by ``roc_auc_score`` is propagated.
        """
        if self.result is None:
            # Use information from chunk indices to identify reference chunk's location. This is possible because
            # both the internal reference data copy and the chunk data were sorted by timestamp, so these
            # indices align. This way we eliminate the need to combine these two data frames and drop duplicate rows,
            # which is a costly operation.
            df_X = self._x_ref
            y = np.zeros(len(df_X), dtype=np.intp)
            # end_index is treated as inclusive here, hence the + 1
            y[chunk.start_index : chunk.end_index + 1] = 1
        else:
            chunk_X = chunk.data
            reference_X = self._x_ref
            chunk_y = np.ones(len(chunk_X), dtype=np.intp)
            reference_y = np.zeros(len(reference_X), dtype=np.intp)
            df_X = pd.concat([reference_X, chunk_X], ignore_index=True)
            y = np.concatenate([reference_y, chunk_y])

        skf = StratifiedKFold(n_splits=self.cv_folds_num)
        all_preds: list[NDArray[np.float32]] = []
        all_tgts: list[NDArray[np.intp]] = []
        for train_index, test_index in skf.split(df_X, y):
            _trx = df_X.iloc[train_index]
            _try = y[train_index]
            _tsx = df_X.iloc[test_index]
            _tsy = y[test_index]
            model = LGBMClassifier(**self.hyperparameters)
            model.fit(_trx, _try)
            # keep only the predicted probability of the positive ("chunk") class
            preds = np.asarray(model.predict_proba(_tsx), dtype=np.float32)[:, 1]
            all_preds.append(preds)
            all_tgts.append(_tsy)

        np_all_preds = np.concatenate(all_preds, axis=0)
        np_all_tgts = np.concatenate(all_tgts, axis=0)
        try:
            # catch case where all rows are duplicates
            result = roc_auc_score(np_all_tgts, np_all_preds)
        except ValueError as err:
            if str(err) != "Only one class present in y_true. ROC AUC score is not defined in that case.":
                raise
            else:
                # by definition if reference and chunk exactly match we can't discriminate
                result = 0.5
        return result

    def _populate_alert_thresholds(self, result_data: pd.DataFrame) -> pd.DataFrame:
        """Add the upper/lower threshold and alert columns to ``result_data`` and return it.

        Threshold values are computed only while no result is stored yet (i.e. during
        fitting) and reused afterwards. A threshold of None means "no bound on that
        side" and never triggers an alert.
        """
        if self.result is None:
            self._threshold_values = calculate_threshold_values(
                threshold=self.threshold,
                data=result_data.loc[:, ("domain_classifier_auroc", "value")],  # type: ignore | dataframe loc
                lower_threshold_value_limit=0.0,
                upper_threshold_value_limit=1.0,
                logger=self._logger,
            )

        lower, upper = self._threshold_values
        result_data[("domain_classifier_auroc", "upper_threshold")] = upper
        result_data[("domain_classifier_auroc", "lower_threshold")] = lower

        values = result_data[("domain_classifier_auroc", "value")]
        alert = pd.Series(False, index=result_data.index)
        # Comparing a float against None raises TypeError, so a missing bound is skipped.
        if upper is not None:
            alert = alert | (values > upper)
        if lower is not None:
            alert = alert | (values < lower)
        result_data[("domain_classifier_auroc", "alert")] = alert
        return result_data
@@ -0,0 +1,98 @@
1
+ """
2
+ Contains the results of the data reconstruction drift calculation and provides plotting functionality.
3
+
4
+ Source code derived from NannyML 0.13.0
5
+ https://github.com/NannyML/nannyml/blob/main/nannyml/base.py
6
+
7
+ Licensed under Apache Software License (Apache 2.0)
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ import copy
13
+ from abc import ABC, abstractmethod
14
+ from typing import NamedTuple, Sequence
15
+
16
+ import pandas as pd
17
+ from typing_extensions import Self
18
+
19
+ from dataeval.outputs._base import GenericOutput
20
+
21
+
22
class Metric(NamedTuple):
    """Pairs a metric's human-readable name with its results column name."""

    # Name intended for display.
    display_name: str
    # Name matched against requested metric names when filtering results.
    column_name: str
25
+
26
+
27
class AbstractResult(GenericOutput[pd.DataFrame]):
    """Base class for calculator results backed by a pandas DataFrame.

    Subclasses implement `_filter` to select rows/columns of the stored frame.
    """

    def __init__(self, results_data: pd.DataFrame) -> None:
        # Keep a private deep copy so the stored results are detached from the caller's frame.
        self._data = results_data.copy(deep=True)

    def data(self) -> pd.DataFrame:
        """Return the results frame (same as `to_df()` with default arguments)."""
        return self.to_df()

    @property
    def empty(self) -> bool:
        """True when there is no stored frame or the stored frame has no entries."""
        if self._data is None:
            return True
        return self._data.empty

    def __len__(self) -> int:
        if self.empty:
            return 0
        return len(self._data)

    def to_df(self, multilevel: bool = True) -> pd.DataFrame:
        """Export results to a pandas DataFrame.

        With ``multilevel=True`` the internally stored frame itself is returned (not a
        copy). Otherwise a deep copy is returned whose column labels are the column
        tuples joined with "_", with repeated "chunk" prefixes collapsed to "chunk".
        """
        if multilevel:
            return self._data

        flat_names = []
        for col in self._data.columns.values:
            joined = "_".join(col)
            joined = joined.replace("chunk_chunk_chunk", "chunk").replace("chunk_chunk", "chunk")
            flat_names.append(joined)

        flattened = self._data.copy(deep=True)
        flattened.columns = flat_names
        return flattened

    def filter(self, period: str = "all", metrics: str | Sequence[str] | None = None) -> Self:
        """Return filtered result metric data.

        A single metric name is wrapped in a list before being handed to `_filter`.

        Raises
        ------
        ValueError
            If ``metrics`` is truthy but neither a string nor a sequence.
        """
        if isinstance(metrics, str):
            metrics = [metrics]
        elif metrics and not isinstance(metrics, Sequence):
            raise ValueError("metrics value provided is not a valid metric or sequence of metrics")
        return self._filter(period, metrics)

    @abstractmethod
    def _filter(self, period: str, metrics: Sequence[str] | None = None) -> Self: ...
64
+
65
+
66
class Abstract1DResult(AbstractResult, ABC):
    """Result whose rows can be filtered by the ("chunk", "period") column."""

    def __init__(self, results_data: pd.DataFrame) -> None:
        super().__init__(results_data)

    def _filter(self, period: str, metrics=None) -> Self:
        """Return a deep copy of this result holding only the rows of ``period``.

        ``period="all"`` keeps every row. ``metrics`` is accepted for signature
        compatibility and is not used here.
        """
        frame = self._data
        if period != "all":
            in_period = self._data.loc[:, ("chunk", "period")] == period
            frame = self._data.loc[in_period, :]  # type: ignore | dataframe loc
        frame = frame.reset_index(drop=True)

        filtered = copy.deepcopy(self)
        filtered._data = frame
        return filtered
79
+
80
+
81
class PerMetricResult(Abstract1DResult):
    """Result that additionally tracks which metrics its columns belong to."""

    def __init__(self, results_data: pd.DataFrame, metrics: Sequence[Metric] | None = None) -> None:
        """
        Parameters
        ----------
        results_data : pd.DataFrame
            Frame holding the calculated results.
        metrics : Sequence[Metric] or None, default=None
            Metrics described by ``results_data``. None means "no metrics".
        """
        super().__init__(results_data)
        # A fresh list per instance: a literal ``[]`` default would be one object shared
        # by every instance constructed without ``metrics``.
        self.metrics = [] if metrics is None else metrics

    def _filter(self, period: str, metrics: Sequence[str] | None = None) -> Self:
        """Return a copy restricted to ``period`` rows and to the requested metric columns.

        The "chunk" columns are always kept. When ``metrics`` is None, all metrics
        known to this result are kept.
        """
        if metrics is None:
            metrics = [metric.column_name for metric in self.metrics]

        res = super()._filter(period)

        data = pd.concat([res._data.loc[:, (["chunk"])], res._data.loc[:, (metrics,)]], axis=1)  # type: ignore | dataframe loc
        data = data.reset_index(drop=True)

        res._data = data
        res.metrics = [metric for metric in self.metrics if metric.column_name in metrics]

        return res
@@ -0,0 +1,280 @@
1
+ """
2
+ Source code derived from NannyML 0.13.0
3
+ https://github.com/NannyML/nannyml/blob/main/nannyml/thresholds.py
4
+
5
+ Licensed under Apache Software License (Apache 2.0)
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import logging
11
+ from abc import ABC, abstractmethod
12
+ from typing import Any, Callable, ClassVar
13
+
14
+ import numpy as np
15
+
16
+
17
class Threshold(ABC):
    """A base class used to calculate lower and upper threshold values given one or multiple arrays.

    Any subclass should implement the abstract `thresholds` method.
    It takes an array or list of arrays and converts them into lower and upper threshold values, represented
    as a tuple of optional floats.

    A `None` threshold value is interpreted as if there is no upper or lower threshold.
    One or both values might be `None`.

    Subclasses register themselves by passing ``threshold_type="..."`` in their class
    statement; `parse_object` uses that name to look the subclass up.
    """

    # Class registry lookup to get threshold subclass from threshold_type string
    _registry: ClassVar[dict[str, type[Threshold]]] = {}

    def __str__(self) -> str:
        # Delegate to __repr__; calling self.__str__() here would recurse without end.
        return self.__repr__()

    def __repr__(self) -> str:
        return self.__class__.__name__ + str(vars(self))

    # NOTE: defining __eq__ without __hash__ makes instances unhashable.
    def __eq__(self, other: object) -> bool:
        return isinstance(other, self.__class__) and other.__dict__ == self.__dict__

    def __init_subclass__(cls, threshold_type: str) -> None:
        Threshold._registry[threshold_type] = cls

    @abstractmethod
    def thresholds(self, data: np.ndarray) -> tuple[float | None, float | None]:
        """Returns lower and upper threshold values when given one or more np.ndarray instances.

        Parameters:
            data: np.ndarray
                An array of values used to calculate the thresholds on. This will most often represent a metric
                calculated on one or more sets of data, e.g. a list of F1 scores of multiple data chunks.

        Returns:
            lower, upper: tuple[Optional[float], Optional[float]]
                The lower and upper threshold values. One or both might be `None`.
        """

    @classmethod
    def parse_object(cls, obj: dict[str, Any]) -> Threshold:
        """Parse object as :class:`Threshold`

        ``obj["type"]`` selects the registered subclass; every other entry is passed
        to that subclass' constructor as a keyword argument. ``obj`` itself is left
        unmodified.

        Raises:
            ValueError: raised when ``obj["type"]`` is missing or not a registered threshold type
        """
        # Work on a shallow copy so the caller's mapping keeps its "type" entry.
        params = dict(obj)
        class_name = params.pop("type", "")

        try:
            threshold_cls = cls._registry[class_name]
        except KeyError:
            accepted_values = ", ".join(map(repr, cls._registry))
            raise ValueError(
                f"Expected one of {accepted_values} for threshold type, but received '{class_name}'"
            ) from None

        return threshold_cls(**params)
71
+
72
+
73
class ConstantThreshold(Threshold, threshold_type="constant"):
    """A `Threshold` implementation that returns a constant lower and/or upper threshold value.

    Attributes:
        lower: Optional[float]
            The constant lower threshold value. Defaults to `None`, meaning there is no lower threshold.
        upper: Optional[float]
            The constant upper threshold value. Defaults to `None`, meaning there is no upper threshold.

    Raises:
        ValueError: raised when an argument was given using an incorrect type
        ValueError: raised when the lower threshold is not strictly less than the upper threshold

    Examples:
        >>> data = np.array(range(10))
        >>> t = ConstantThreshold(lower=None, upper=0.1)
        >>> lower, upper = t.thresholds(data)
        >>> print(lower, upper)
        None 0.1
    """

    def __init__(self, lower: float | int | None = None, upper: float | int | None = None):
        """Creates a new ConstantThreshold instance.

        Args:
            lower: Optional[Union[float, int]], default=None
                The constant lower threshold value. Defaults to `None`, meaning there is no lower threshold.
            upper: Optional[Union[float, int]], default=None
                The constant upper threshold value. Defaults to `None`, meaning there is no upper threshold.

        Raises:
            ValueError: raised when an argument was given using an incorrect type
            ValueError: raised when the lower threshold is not strictly less than the upper threshold
        """
        self._validate_inputs(lower, upper)

        self.lower = lower
        self.upper = upper

    def thresholds(self, data: np.ndarray) -> tuple[float | None, float | None]:
        """Return the configured ``(lower, upper)`` pair; ``data`` is not inspected."""
        return (self.lower, self.upper)

    @staticmethod
    def _validate_inputs(lower: float | int | None = None, upper: float | int | None = None):
        """Raise ValueError unless both bounds are None or non-bool numbers with lower < upper."""
        for label, bound in (("lower", lower), ("upper", upper)):
            # bool is a subclass of int, so it has to be rejected explicitly
            is_number = isinstance(bound, (float, int)) and not isinstance(bound, bool)
            if bound is not None and not is_number:
                raise ValueError(
                    f"expected type of '{label}' to be 'float', 'int' or None but got '{type(bound).__name__}'"
                )

        # explicit None checks: a bound of 0 / 0.0 is falsy but still a valid value
        if lower is None or upper is None:
            return
        if lower >= upper:
            raise ValueError(f"lower threshold {lower} must be less than upper threshold {upper}")
126
+
127
+
128
class StandardDeviationThreshold(Threshold, threshold_type="standard_deviation"):
    """A `Threshold` that offsets an aggregate of an array by a multiple of the array's standard deviation.

    The aggregate (the NaN-ignoring mean by default) is computed from the given values.
    The lower threshold is the aggregate minus ``std_lower_multiplier`` standard deviations,
    the upper threshold is the aggregate plus ``std_upper_multiplier`` standard deviations.
    Both multipliers default to 3; a multiplier of `None` disables that side.

    Attributes:
        std_lower_multiplier: float
        std_upper_multiplier: float
        offset_from: Callable[[np.ndarray], Any]

    Examples:
        >>> data = np.array(range(10))
        >>> t = StandardDeviationThreshold()
        >>> lower, upper = t.thresholds(data)
        >>> print(lower, upper)
        -4.116843969807043 13.116843969807043
    """

    def __init__(
        self,
        std_lower_multiplier: float | int | None = 3,
        std_upper_multiplier: float | int | None = 3,
        offset_from: Callable[[np.ndarray], Any] = np.nanmean,
    ):
        """Creates a new StandardDeviationThreshold instance.

        Args:
            std_lower_multiplier: float, default=3
                The number the standard deviation of the input array will be multiplied with to form the
                lower offset. This value will be subtracted from the aggregate of the input array.
            std_upper_multiplier: float, default=3
                The number the standard deviation of the input array will be multiplied with to form the
                upper offset. This value will be added to the aggregate of the input array.
            offset_from: Callable[[np.ndarray], Any], default=np.nanmean
                A function that will be applied to the input array to aggregate it into a single value.

        Raises:
            ValueError: raised when a multiplier has an incorrect type or is negative
        """
        self._validate_inputs(std_lower_multiplier, std_upper_multiplier)

        self.std_lower_multiplier = std_lower_multiplier
        self.std_upper_multiplier = std_upper_multiplier
        self.offset_from = offset_from

    def thresholds(self, data: np.ndarray) -> tuple[float | None, float | None]:
        """Return ``(aggregate - k_lower * std, aggregate + k_upper * std)``; a side is None when its multiplier is None."""
        center = self.offset_from(data)
        spread = np.nanstd(data)

        if self.std_lower_multiplier is None:
            lower_threshold = None
        else:
            lower_threshold = center - spread * self.std_lower_multiplier

        if self.std_upper_multiplier is None:
            upper_threshold = None
        else:
            upper_threshold = center + spread * self.std_upper_multiplier

        return lower_threshold, upper_threshold

    @staticmethod
    def _validate_inputs(std_lower_multiplier: float | int | None = 3, std_upper_multiplier: float | int | None = 3):
        """Raise ValueError unless each multiplier is None or a non-bool, non-negative number."""
        checks = (
            ("std_lower_multiplier", std_lower_multiplier),
            ("std_upper_multiplier", std_upper_multiplier),
        )
        for label, multiplier in checks:
            # bool is a subclass of int, so it has to be rejected explicitly
            is_number = isinstance(multiplier, (float, int)) and not isinstance(multiplier, bool)
            if multiplier is not None and not is_number:
                raise ValueError(
                    f"expected type of '{label}' to be 'float', 'int' or None "
                    f"but got '{type(multiplier).__name__}'"
                )

            # the truthiness test skips None and lets a multiplier of 0 through
            if multiplier and multiplier < 0:
                raise ValueError(f"'{label}' should be greater than 0 but got value {multiplier}")
213
+
214
+
215
def calculate_threshold_values(
    threshold: Threshold,
    data: np.ndarray,
    lower_threshold_value_limit: float | None = None,
    upper_threshold_value_limit: float | None = None,
    override_using_none: bool = False,
    logger: logging.Logger | None = None,
    metric_name: str | None = None,
) -> tuple[float | None, float | None]:
    """Calculate lower and upper threshold values with respect to the provided Threshold and value limits.

    Parameters:
        threshold: Threshold
            The Threshold instance that determines how the lower and upper threshold values will be calculated.
        data: np.ndarray
            The data handed to ``threshold.thresholds`` to calculate the lower and upper threshold values.
        lower_threshold_value_limit: Optional[float], default=None
            An optional limit for the lower threshold value. A calculated lower threshold that is less than
            or equal to this limit is replaced (by the limit, or by None when ``override_using_none`` is set).
        upper_threshold_value_limit: Optional[float], default=None
            An optional limit for the upper threshold value. A calculated upper threshold that is greater than
            or equal to this limit is replaced (by the limit, or by None when ``override_using_none`` is set).
        override_using_none: bool, default=False
            When set to True use None to override threshold values that exceed value limits.
        logger: Optional[logging.Logger], default=None
            An optional Logger instance. When provided a warning will be logged when a calculated threshold value
            gets overridden by a threshold value limit.
        metric_name: Optional[str], default=None
            When provided the metric name will be included within any log messages for additional clarity.

    Returns:
        lower, upper: tuple[Optional[float], Optional[float]]
            The (possibly overridden) lower and upper threshold values.
    """

    def _limited(side, value, limit, crossed):
        # Swap a threshold that crossed its limit for the limit (or None) and report it.
        if not crossed:
            return value
        replacement = None if override_using_none else limit
        if logger:
            logger.warning(
                f"{metric_name + ' ' if metric_name else ''}{side} threshold value {value} "
                f"overridden by {side} threshold value limit {replacement}"
            )
        return replacement

    lower, upper = threshold.thresholds(data)

    if lower_threshold_value_limit is not None and lower is not None:
        lower = _limited("lower", lower, lower_threshold_value_limit, lower <= lower_threshold_value_limit)

    if upper_threshold_value_limit is not None and upper is not None:
        upper = _limited("upper", upper, upper_threshold_value_limit, upper >= upper_threshold_value_limit)

    return lower, upper
@@ -4,13 +4,13 @@ __all__ = []
4
4
 
5
5
  from typing import Any, Sequence, overload
6
6
 
7
+ from dataeval.data._images import Images
7
8
  from dataeval.metrics.stats import hashstats
8
9
  from dataeval.metrics.stats._base import combine_stats, get_dataset_step_from_idx
9
10
  from dataeval.outputs import DuplicatesOutput, HashStatsOutput
10
11
  from dataeval.outputs._base import set_metadata
11
12
  from dataeval.outputs._linters import DatasetDuplicateGroupMap, DuplicateGroup
12
13
  from dataeval.typing import ArrayLike, Dataset
13
- from dataeval.utils.data._images import Images
14
14
 
15
15
 
16
16
  class Duplicates:
@@ -7,6 +7,7 @@ from typing import Any, Literal, Sequence, overload
7
7
  import numpy as np
8
8
  from numpy.typing import NDArray
9
9
 
10
+ from dataeval.data._images import Images
10
11
  from dataeval.metrics.stats._base import combine_stats, get_dataset_step_from_idx
11
12
  from dataeval.metrics.stats._imagestats import imagestats
12
13
  from dataeval.outputs import DimensionStatsOutput, ImageStatsOutput, OutliersOutput, PixelStatsOutput, VisualStatsOutput
@@ -14,7 +15,6 @@ from dataeval.outputs._base import set_metadata
14
15
  from dataeval.outputs._linters import IndexIssueMap, OutlierStatsOutput
15
16
  from dataeval.outputs._stats import BOX_COUNT, SOURCE_INDEX
16
17
  from dataeval.typing import ArrayLike, Dataset
17
- from dataeval.utils.data._images import Images
18
18
 
19
19
 
20
20
  def _get_outlier_mask(
@@ -9,11 +9,11 @@ import numpy as np
9
9
  from scipy.stats import iqr, ks_2samp
10
10
  from scipy.stats import wasserstein_distance as emd
11
11
 
12
+ from dataeval.data import Metadata
12
13
  from dataeval.metadata._utils import _compare_keys, _validate_factors_and_data
13
14
  from dataeval.outputs import MetadataDistanceOutput, MetadataDistanceValues
14
15
  from dataeval.outputs._base import set_metadata
15
16
  from dataeval.typing import ArrayLike
16
- from dataeval.utils.data import Metadata
17
17
 
18
18
 
19
19
  class KSType(NamedTuple):
dataeval/metadata/_ood.py CHANGED
@@ -9,10 +9,10 @@ from numpy.typing import NDArray
9
9
  from sklearn.feature_selection import mutual_info_classif
10
10
 
11
11
  from dataeval.config import get_seed
12
+ from dataeval.data import Metadata
12
13
  from dataeval.metadata._utils import _compare_keys, _validate_factors_and_data
13
14
  from dataeval.outputs import MostDeviatedFactorsOutput, OODOutput, OODPredictorOutput
14
15
  from dataeval.outputs._base import set_metadata
15
- from dataeval.utils.data import Metadata
16
16
 
17
17
 
18
18
  def _combine_discrete_continuous(metadata: Metadata) -> tuple[list[str], NDArray[np.float64]]:
@@ -201,7 +201,7 @@ def find_most_deviated_factors(
201
201
  MostDeviatedFactorsOutput([])
202
202
  """
203
203
 
204
- ood_mask: NDArray[np.bool] = ood.is_ood
204
+ ood_mask: NDArray[np.bool_] = ood.is_ood
205
205
 
206
206
  # No metadata correlated with out of distribution data
207
207
  if not any(ood_mask):
@@ -303,7 +303,7 @@ def find_ood_predictors(
303
303
  OODPredictorOutput({})
304
304
  """
305
305
 
306
- ood_mask: NDArray[np.bool] = ood.is_ood
306
+ ood_mask: NDArray[np.bool_] = ood.is_ood
307
307
 
308
308
  discrete_features_count = len(metadata.discrete_factor_names)
309
309
  factors, data = _combine_discrete_continuous(metadata) # (F, ), (S, F) => F = Fd + Fc
@@ -320,7 +320,7 @@ def find_ood_predictors(
320
320
  # Calculate mean, std of each factor over all samples
321
321
  scaled_data = (data - np.mean(data, axis=0)) / np.std(data, axis=0, ddof=1) # (S, F)
322
322
 
323
- discrete_features = np.zeros_like(factors, dtype=np.bool)
323
+ discrete_features = np.zeros_like(factors, dtype=np.bool_)
324
324
  discrete_features[:discrete_features_count] = True
325
325
 
326
326
  mutual_info_values = (
@@ -9,10 +9,10 @@ import scipy as sp
9
9
  from sklearn.feature_selection import mutual_info_classif, mutual_info_regression
10
10
 
11
11
  from dataeval.config import EPSILON, get_seed
12
+ from dataeval.data import Metadata
12
13
  from dataeval.outputs import BalanceOutput
13
14
  from dataeval.outputs._base import set_metadata
14
15
  from dataeval.utils._bin import get_counts
15
- from dataeval.utils.data import Metadata
16
16
 
17
17
 
18
18
  def _validate_num_neighbors(num_neighbors: int) -> int:
@@ -8,11 +8,11 @@ import numpy as np
8
8
  import scipy as sp
9
9
  from numpy.typing import NDArray
10
10
 
11
+ from dataeval.data import Metadata
11
12
  from dataeval.outputs import DiversityOutput
12
13
  from dataeval.outputs._base import set_metadata
13
14
  from dataeval.utils._bin import get_counts
14
15
  from dataeval.utils._method import get_method
15
- from dataeval.utils.data import Metadata
16
16
 
17
17
 
18
18
  def diversity_shannon(