dataeval 0.85.0__py3-none-any.whl → 0.86.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (66)
  1. dataeval/__init__.py +1 -1
  2. dataeval/_log.py +1 -1
  3. dataeval/config.py +21 -4
  4. dataeval/data/_embeddings.py +2 -2
  5. dataeval/data/_images.py +2 -3
  6. dataeval/data/_metadata.py +65 -42
  7. dataeval/data/_selection.py +2 -3
  8. dataeval/data/_split.py +2 -3
  9. dataeval/data/_targets.py +17 -13
  10. dataeval/data/selections/_classfilter.py +6 -8
  11. dataeval/data/selections/_prioritize.py +6 -9
  12. dataeval/data/selections/_shuffle.py +3 -1
  13. dataeval/detectors/drift/__init__.py +4 -1
  14. dataeval/detectors/drift/_base.py +4 -5
  15. dataeval/detectors/drift/_mmd.py +3 -6
  16. dataeval/detectors/drift/_mvdc.py +92 -0
  17. dataeval/detectors/drift/_nml/__init__.py +6 -0
  18. dataeval/detectors/drift/_nml/_base.py +70 -0
  19. dataeval/detectors/drift/_nml/_chunk.py +396 -0
  20. dataeval/detectors/drift/_nml/_domainclassifier.py +181 -0
  21. dataeval/detectors/drift/_nml/_result.py +97 -0
  22. dataeval/detectors/drift/_nml/_thresholds.py +269 -0
  23. dataeval/detectors/linters/outliers.py +7 -7
  24. dataeval/metrics/bias/_parity.py +10 -13
  25. dataeval/metrics/estimators/_divergence.py +2 -4
  26. dataeval/metrics/stats/_base.py +103 -42
  27. dataeval/metrics/stats/_boxratiostats.py +21 -19
  28. dataeval/metrics/stats/_dimensionstats.py +14 -10
  29. dataeval/metrics/stats/_hashstats.py +1 -1
  30. dataeval/metrics/stats/_pixelstats.py +6 -6
  31. dataeval/metrics/stats/_visualstats.py +3 -3
  32. dataeval/outputs/__init__.py +2 -1
  33. dataeval/outputs/_base.py +22 -7
  34. dataeval/outputs/_bias.py +27 -31
  35. dataeval/outputs/_drift.py +60 -0
  36. dataeval/outputs/_linters.py +12 -17
  37. dataeval/outputs/_stats.py +83 -29
  38. dataeval/outputs/_workflows.py +2 -2
  39. dataeval/utils/_array.py +6 -9
  40. dataeval/utils/_bin.py +1 -2
  41. dataeval/utils/_clusterer.py +7 -4
  42. dataeval/utils/_fast_mst.py +27 -13
  43. dataeval/utils/_image.py +65 -11
  44. dataeval/utils/_mst.py +1 -3
  45. dataeval/utils/_plot.py +15 -10
  46. dataeval/utils/data/_dataset.py +32 -20
  47. dataeval/utils/data/metadata.py +104 -82
  48. dataeval/utils/datasets/__init__.py +2 -0
  49. dataeval/utils/datasets/_antiuav.py +189 -0
  50. dataeval/utils/datasets/_base.py +11 -8
  51. dataeval/utils/datasets/_cifar10.py +104 -45
  52. dataeval/utils/datasets/_fileio.py +21 -47
  53. dataeval/utils/datasets/_milco.py +19 -11
  54. dataeval/utils/datasets/_mixin.py +2 -4
  55. dataeval/utils/datasets/_mnist.py +3 -4
  56. dataeval/utils/datasets/_ships.py +14 -7
  57. dataeval/utils/datasets/_voc.py +229 -42
  58. dataeval/utils/torch/models.py +5 -10
  59. dataeval/utils/torch/trainer.py +3 -3
  60. dataeval/workflows/sufficiency.py +2 -2
  61. {dataeval-0.85.0.dist-info → dataeval-0.86.1.dist-info}/METADATA +3 -2
  62. dataeval-0.86.1.dist-info/RECORD +114 -0
  63. dataeval/detectors/ood/vae.py +0 -74
  64. dataeval-0.85.0.dist-info/RECORD +0 -107
  65. {dataeval-0.85.0.dist-info → dataeval-0.86.1.dist-info}/LICENSE.txt +0 -0
  66. {dataeval-0.85.0.dist-info → dataeval-0.86.1.dist-info}/WHEEL +0 -0
dataeval/detectors/drift/_nml/_base.py
@@ -0,0 +1,70 @@
+ """
+ Source code derived from NannyML 0.13.0
+ https://github.com/NannyML/nannyml/blob/main/nannyml/base.py
+
+ Licensed under Apache Software License (Apache 2.0)
+ """
+
+ from __future__ import annotations
+
+ import logging
+ from abc import ABC, abstractmethod
+ from logging import Logger
+ from typing import Sequence
+
+ import pandas as pd
+ from typing_extensions import Self
+
+ from dataeval.detectors.drift._nml._chunk import Chunk, Chunker, CountBasedChunker
+ from dataeval.outputs._drift import DriftMVDCOutput
+
+
+ def _validate(data: pd.DataFrame, expected_features: int | None = None) -> int:
+     if data.empty:
+         raise ValueError("data contains no rows. Please provide a valid data set.")
+     if expected_features is not None and data.shape[-1] != expected_features:
+         raise ValueError(f"expected '{expected_features}' features in data set:\n\t{data}")
+     return data.shape[-1]
+
+
+ def _create_multilevel_index(
+     chunks: Sequence[Chunk], result_group_name: str, result_column_names: Sequence[str]
+ ) -> pd.MultiIndex:
+     chunk_column_names = (*chunks[0].KEYS, "period")
+     chunk_tuples = [("chunk", chunk_column_name) for chunk_column_name in chunk_column_names]
+     result_tuples = [(result_group_name, column_name) for column_name in result_column_names]
+     return pd.MultiIndex.from_tuples(chunk_tuples + result_tuples)
+
+
+ class AbstractCalculator(ABC):
+     """Base class for drift calculation."""
+
+     def __init__(self, chunker: Chunker | None = None, logger: Logger | None = None) -> None:
+         self.chunker = chunker if isinstance(chunker, Chunker) else CountBasedChunker(10)
+         self.result: DriftMVDCOutput | None = None
+         self.n_features: int | None = None
+         self._logger = logger if isinstance(logger, Logger) else logging.getLogger(__name__)
+
+     def fit(self, reference_data: pd.DataFrame) -> Self:
+         """Trains the calculator using reference data."""
+         self.n_features = _validate(reference_data)
+
+         self._logger.debug(f"fitting {str(self)}")
+         self.result = self._fit(reference_data)
+         return self
+
+     def calculate(self, data: pd.DataFrame) -> DriftMVDCOutput:
+         """Performs a calculation on the provided data."""
+         if self.result is None:
+             raise RuntimeError("must run fit with reference data before running calculate")
+         _validate(data, self.n_features)
+
+         self._logger.debug(f"calculating {str(self)}")
+         self.result = self._calculate(data)
+         return self.result
+
+     @abstractmethod
+     def _fit(self, reference_data: pd.DataFrame) -> DriftMVDCOutput: ...
+
+     @abstractmethod
+     def _calculate(self, data: pd.DataFrame) -> DriftMVDCOutput: ...
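The new base module defines the fit/calculate contract shared by the drift calculators plus two small helpers. For orientation only, the sketch below (not part of the diff) shows what _create_multilevel_index produces when handed a chunk from the new _chunk module; the toy DataFrame contents are made up for illustration.

    import pandas as pd

    from dataeval.detectors.drift._nml._base import _create_multilevel_index
    from dataeval.detectors.drift._nml._chunk import IndexChunk

    # A toy chunk; only its KEYS attribute matters for building the column index.
    chunk = IndexChunk(pd.DataFrame({"f0": range(10)}), start_index=0, end_index=9)
    cols = _create_multilevel_index(
        [chunk], result_group_name="domain_classifier_auroc", result_column_names=["value"]
    )
    # -> MultiIndex with ('chunk', 'key'), ('chunk', 'chunk_index'), ('chunk', 'start_index'),
    #    ('chunk', 'end_index'), ('chunk', 'period'), ('domain_classifier_auroc', 'value')

These are the two-level columns used by the domain classifier results later in this diff.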
dataeval/detectors/drift/_nml/_chunk.py
@@ -0,0 +1,396 @@
+ """
+ NannyML module providing intelligent splitting of data into chunks.
+
+ Source code derived from NannyML 0.13.0
+ https://github.com/NannyML/nannyml/blob/main/nannyml/chunk.py
+
+ Licensed under Apache Software License (Apache 2.0)
+ """
+
+ from __future__ import annotations
+
+ import copy
+ import logging
+ import warnings
+ from abc import ABC, abstractmethod
+ from typing import Any, Generic, Literal, Sequence, TypeVar, cast
+
+ import pandas as pd
+ from pandas import Index, Period
+ from typing_extensions import Self
+
+ logger = logging.getLogger(__name__)
+
+
+ class Chunk(ABC):
+     """A subset of data that acts as a logical unit during calculations."""
+
+     KEYS: Sequence[str]
+
+     def __init__(
+         self,
+         data: pd.DataFrame,
+     ) -> None:
+         self.key: str
+         self.data = data
+
+         self.start_index: int = -1
+         self.end_index: int = -1
+         self.chunk_index: int = -1
+
+     def __repr__(self) -> str:
+         attr_str = ", ".join([f"{k}={v}" for k, v in self.dict().items()])
+         return f"{self.__class__.__name__}(data=pd.DataFrame(shape={self.data.shape}), {attr_str})"
+
+     def __len__(self) -> int:
+         return self.data.shape[0]
+
+     @abstractmethod
+     def __add__(self, other: Self) -> Self: ...
+
+     @abstractmethod
+     def __lt__(self, other: Self) -> bool: ...
+
+     @abstractmethod
+     def dict(self) -> dict[str, Any]: ...
+
+
+ class IndexChunk(Chunk):
+     """Creates a new chunk.
+
+     Parameters
+     ----------
+     data : DataFrame, required
+         The data to be contained within the chunk.
+     start_datetime: datetime
+         The starting point in time for this chunk.
+     end_datetime: datetime
+         The end point in time for this chunk.
+     """
+
+     KEYS = ("key", "chunk_index", "start_index", "end_index")
+
+     def __init__(
+         self,
+         data: pd.DataFrame,
+         start_index: int,
+         end_index: int,
+     ) -> None:
+         super().__init__(data)
+         self.key = f"[{start_index}:{end_index}]"
+         self.start_index: int = start_index
+         self.end_index: int = end_index
+
+     def __lt__(self, other: Self) -> bool:
+         return self.end_index < other.start_index
+
+     def __add__(self, other: Self) -> Self:
+         a, b = (self, other) if self < other else (other, self)
+         result = copy.deepcopy(a)
+         result.data = pd.concat([a.data, b.data])
+         result.end_index = b.end_index
+         return result
+
+     def dict(self) -> dict[str, Any]:
+         return dict(zip(self.KEYS, (self.key, self.chunk_index, self.start_index, self.end_index)))
+
+
+ class PeriodChunk(Chunk):
+     """Creates a new chunk.
+
+     Parameters
+     ----------
+     data : DataFrame, required
+         The data to be contained within the chunk.
+     start_datetime: datetime
+         The starting point in time for this chunk.
+     end_datetime: datetime
+         The end point in time for this chunk.
+     chunk_size : int
+         The size of the chunk.
+     """
+
+     KEYS = ("key", "chunk_index", "start_date", "end_date", "chunk_size")
+
+     def __init__(self, data: pd.DataFrame, period: Period, chunk_size: int) -> None:
+         super().__init__(data)
+         self.key = str(period)
+         self.start_datetime = period.start_time
+         self.end_datetime = period.end_time
+         self.chunk_size = chunk_size
+
+     def __lt__(self, other: Self) -> bool:
+         return self.end_datetime < other.start_datetime
+
+     def __add__(self, other: Self) -> Self:
+         a, b = (self, other) if self < other else (other, self)
+         result = copy.deepcopy(a)
+         result.data = pd.concat([a.data, b.data])
+         result.end_index = b.end_index
+         result.end_datetime = b.end_datetime
+         result.chunk_size += b.chunk_size
+         return result
+
+     def dict(self) -> dict[str, Any]:
+         return dict(
+             zip(self.KEYS, (self.key, self.chunk_index, self.start_datetime, self.end_datetime, self.chunk_size))
+         )
+
+
+ TChunk = TypeVar("TChunk", bound=Chunk)
+
+
+ class Chunker(Generic[TChunk]):
+     """Base class for Chunker implementations.
+
+     Inheriting classes will split a DataFrame into a list of Chunks.
+     They will do this based on several constraints, e.g. observation timestamps, number of observations per Chunk
+     or a preferred number of Chunks.
+     """
+
+     def split(self, data: pd.DataFrame) -> list[TChunk]:
+         """Splits a given data frame into a list of chunks.
+
+         This method provides a uniform interface across Chunker implementations to keep them interchangeable.
+
+         After performing the implementation-specific `_split` method, there are some checks on the resulting chunk list.
+
+         If the total number of chunks is low a warning will be written out to the logs.
+
+         We dynamically determine the optimal minimum number of observations per chunk and then check if the resulting
+         chunks contain at least as many. If there are any underpopulated chunks a warning will be written out in
+         the logs.
+
+         Parameters
+         ----------
+         data: DataFrame
+             The data to be split into chunks
+
+         Returns
+         -------
+         chunks: List[Chunk]
+             The list of chunks
+
+         """
+         if data.shape[0] == 0:
+             return []
+
+         chunks = self._split(data)
+         for chunk_index, chunk in enumerate(chunks):
+             chunk.start_index = cast(int, chunk.data.index.min())
+             chunk.end_index = cast(int, chunk.data.index.max())
+             chunk.chunk_index = chunk_index
+
+         if len(chunks) < 6:
+             # TODO wording
+             warnings.warn(
+                 "The resulting number of chunks is too low. "
+                 "Please consider splitting your data in a different way or continue at your own risk."
+             )
+
+         return chunks
+
+     @abstractmethod
+     def _split(self, data: pd.DataFrame) -> list[TChunk]: ...
+
+
+ class PeriodBasedChunker(Chunker[PeriodChunk]):
+     """A Chunker that will split data into Chunks based on a date column in the data.
+
+     Examples
+     --------
+     Chunk using monthly periods and providing a column name
+
+     >>> from nannyml.chunk import PeriodBasedChunker
+     >>> df = pd.read_parquet("/path/to/my/data.pq")
+     >>> chunker = PeriodBasedChunker(timestamp_column_name="observation_date", offset="M")
+     >>> chunks = chunker.split(data=df)
+
+     Or chunk using weekly periods
+
+     >>> from nannyml.chunk import PeriodBasedChunker
+     >>> df = pd.read_parquet("/path/to/my/data.pq")
+     >>> chunker = PeriodBasedChunker(timestamp_column_name=df["observation_date"], offset="W", minimum_chunk_size=50)
+     >>> chunks = chunker.split(data=df)
+
+     """
+
+     def __init__(self, timestamp_column_name: str, offset: str = "W") -> None:
+         """Creates a new PeriodBasedChunker.
+
+         Parameters
+         ----------
+         timestamp_column_name : str
+             The column name containing the timestamp to chunk on
+         offset : str
+             A frequency string representing a pandas.tseries.offsets.DateOffset.
+             The offset determines how the time-based grouping will occur. A list of possible values
+             can be found at <https://pandas.pydata.org/docs/user_guide/timeseries.html#offset-aliases>.
+         """
+         self.timestamp_column_name = timestamp_column_name
+         self.offset = offset
+
+     def _split(self, data: pd.DataFrame) -> list[PeriodChunk]:
+         chunks = []
+         if self.timestamp_column_name is None:
+             raise ValueError("timestamp_column_name must be provided")
+         if self.timestamp_column_name not in data:
+             raise ValueError(f"timestamp column '{self.timestamp_column_name}' not in columns")
+
+         grouped = data.groupby(pd.to_datetime(data[self.timestamp_column_name]).dt.to_period(self.offset))
+
+         for k, v in grouped.groups.items():
+             period, index = cast(Period, k), cast(Index, v)
+             chunk = PeriodChunk(
+                 data=grouped.get_group(period),  # type: ignore | dataframe
+                 period=period,
+                 chunk_size=len(index),
+             )
+             chunks.append(chunk)
+
+         return chunks
+
+
+ class SizeBasedChunker(Chunker[IndexChunk]):
+     """A Chunker that will split data into Chunks based on the preferred number of observations per Chunk.
+
+     Notes
+     -----
+     - Chunks are adjacent, not overlapping
+     - There may be "incomplete" chunks, as the remainder of observations after dividing by `chunk_size`
+       will form a chunk of their own.
+
+     Examples
+     --------
+     Chunk using monthly periods and providing a column name
+
+     >>> from nannyml.chunk import SizeBasedChunker
+     >>> df = pd.read_parquet("/path/to/my/data.pq")
+     >>> chunker = SizeBasedChunker(chunk_size=2000, incomplete="drop")
+     >>> chunks = chunker.split(data=df)
+
+     """
+
+     def __init__(
+         self,
+         chunk_size: int,
+         incomplete: Literal["append", "drop", "keep"] = "keep",
+     ) -> None:
+         """Create a new SizeBasedChunker.
+
+         Parameters
+         ----------
+         chunk_size: int
+             The preferred size of the resulting Chunks, i.e. the number of observations in each Chunk.
+         incomplete: str, default='keep'
+             Choose how to handle any leftover observations that don't make up a full Chunk.
+             The following options are available:
+
+             - ``'drop'``: drop the leftover observations
+             - ``'keep'``: keep the incomplete Chunk (containing less than ``chunk_size`` observations)
+             - ``'append'``: append leftover observations to the last complete Chunk (overfilling it)
+
+             Defaults to ``'keep'``.
+
+         Returns
+         -------
+         chunker: a size-based instance used to split data into Chunks of a constant size.
+
+         """
+         if not isinstance(chunk_size, int) or chunk_size <= 0:
+             raise ValueError(f"chunk_size={chunk_size} is invalid - provide an integer greater than 0")
+         if incomplete not in ("append", "drop", "keep"):
+             raise ValueError(f"incomplete={incomplete} is invalid - must be one of ['append', 'drop', 'keep']")
+
+         self.chunk_size = chunk_size
+         self.incomplete = incomplete
+
+     def _split(self, data: pd.DataFrame) -> list[IndexChunk]:
+         def _create_chunk(index: int, data: pd.DataFrame, chunk_size: int) -> IndexChunk:
+             chunk_data = data.iloc[index : index + chunk_size]
+             return IndexChunk(
+                 data=chunk_data,
+                 start_index=index,
+                 end_index=index + chunk_size - 1,
+             )
+
+         chunks = [
+             _create_chunk(index=i, data=data, chunk_size=self.chunk_size)
+             for i in range(0, data.shape[0], self.chunk_size)
+             if i + self.chunk_size - 1 < len(data)
+         ]
+
+         # deal with unassigned observations
+         if data.shape[0] % self.chunk_size != 0 and self.incomplete != "drop":
+             incomplete_chunk = _create_chunk(
+                 index=self.chunk_size * (data.shape[0] // self.chunk_size),
+                 data=data,
+                 chunk_size=(data.shape[0] % self.chunk_size),
+             )
+             if self.incomplete == "append":
+                 chunks[-1] += incomplete_chunk
+             else:
+                 chunks += [incomplete_chunk]
+
+         return chunks
+
+
+ class CountBasedChunker(Chunker[IndexChunk]):
+     """A Chunker that will split data into chunks based on the preferred number of total chunks.
+
+     Notes
+     -----
+     - Chunks are adjacent, not overlapping
+     - There may be "incomplete" chunks, as the remainder of observations after dividing by `chunk_size`
+       will form a chunk of their own.
+
+     Examples
+     --------
+     >>> from nannyml.chunk import CountBasedChunker
+     >>> df = pd.read_parquet("/path/to/my/data.pq")
+     >>> chunker = CountBasedChunker(chunk_number=100)
+     >>> chunks = chunker.split(data=df)
+
+     """
+
+     def __init__(
+         self,
+         chunk_number: int,
+         incomplete: Literal["append", "drop", "keep"] = "keep",
+     ) -> None:
+         """Creates a new CountBasedChunker.
+
+         It will calculate the amount of observations per chunk based on the given chunk count.
+         It then continues to split the data into chunks just like a SizeBasedChunker does.
+
+         Parameters
+         ----------
+         chunk_number: int
+             The amount of chunks to split the data in.
+         incomplete: str, default='keep'
+             Choose how to handle any leftover observations that don't make up a full Chunk.
+             The following options are available:
+
+             - ``'drop'``: drop the leftover observations
+             - ``'keep'``: keep the incomplete Chunk (containing less than ``chunk_size`` observations)
+             - ``'append'``: append leftover observations to the last complete Chunk (overfilling it)
+
+             Defaults to ``'keep'``.
+
+         Returns
+         -------
+         chunker: CountBasedChunker
+
+         """
+         if not isinstance(chunk_number, int) or chunk_number <= 0:
+             raise ValueError(f"given chunk_number {chunk_number} is invalid - provide an integer greater than 0")
+         if incomplete not in ("append", "drop", "keep"):
+             raise ValueError(f"incomplete={incomplete} is invalid - must be one of ['append', 'drop', 'keep']")
+
+         self.chunk_number = chunk_number
+         self.incomplete: Literal["append", "drop", "keep"] = incomplete
+
+     def _split(self, data: pd.DataFrame) -> list[IndexChunk]:
+         chunk_size = data.shape[0] // self.chunk_number
+         chunker = SizeBasedChunker(chunk_size, self.incomplete)
+         return chunker.split(data=data)
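The chunkers above control how reference and analysis data are sliced before the drift metric is computed per chunk. A minimal sketch (not part of the diff) of splitting a frame through the internal module path used by dataeval follows; note that the docstring examples above still import from nannyml.chunk, the library this code was derived from.

    import pandas as pd

    from dataeval.detectors.drift._nml._chunk import CountBasedChunker, SizeBasedChunker

    df = pd.DataFrame({"f0": range(105)})

    # 105 rows with chunk_size=20: five full chunks plus one incomplete chunk of 5 rows,
    # because `incomplete` defaults to "keep".
    size_chunks = SizeBasedChunker(chunk_size=20).split(df)
    print(len(size_chunks), len(size_chunks[-1]))  # 6 5

    # CountBasedChunker derives chunk_size = 105 // 10 = 10 and delegates to SizeBasedChunker,
    # so the 5 leftover rows again form an extra chunk.
    count_chunks = CountBasedChunker(chunk_number=10).split(df)
    print(len(count_chunks))  # 11

Splits that yield fewer than six chunks emit the "too low" warning shown in Chunker.split.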
dataeval/detectors/drift/_nml/_domainclassifier.py
@@ -0,0 +1,181 @@
+ """
+ Source code derived from NannyML 0.13.0
+ https://github.com/NannyML/nannyml/blob/main/nannyml/drift/multivariate/domain_classifier/calculator.py
+
+ Licensed under Apache Software License (Apache 2.0)
+ """
+
+ from __future__ import annotations
+
+ import logging
+ from typing import Any
+
+ import numpy as np
+ import pandas as pd
+ from lightgbm import LGBMClassifier
+ from numpy.typing import NDArray
+ from sklearn.metrics import roc_auc_score
+ from sklearn.model_selection import StratifiedKFold
+
+ from dataeval.config import get_max_processes, get_seed
+ from dataeval.detectors.drift._nml._base import AbstractCalculator, _create_multilevel_index
+ from dataeval.detectors.drift._nml._chunk import Chunk, Chunker
+ from dataeval.detectors.drift._nml._thresholds import ConstantThreshold, Threshold
+ from dataeval.outputs._base import set_metadata
+ from dataeval.outputs._drift import DriftMVDCOutput
+
+ logger = logging.getLogger(__name__)
+
+ DEFAULT_LGBM_HYPERPARAMS = {
+     "boosting_type": "gbdt",
+     "class_weight": None,
+     "colsample_bytree": 1.0,
+     "deterministic": True,
+     "importance_type": "split",
+     "learning_rate": 0.1,
+     "max_depth": -1,
+     "min_child_samples": 20,
+     "min_child_weight": 0.001,
+     "min_split_gain": 0.0,
+     "n_estimators": 100,
+     "num_leaves": 31,
+     "objective": None,
+     "reg_alpha": 0.0,
+     "reg_lambda": 0.0,
+     "subsample": 1.0,
+     "subsample_for_bin": 200000,
+     "subsample_freq": 0,
+     "verbosity": -1,
+ }
+
+
+ class DomainClassifierCalculator(AbstractCalculator):
+     """
+     DomainClassifierCalculator implementation.
+
+     Uses Drift Detection Classifier's cross validated performance as a measure of drift.
+     """
+
+     def __init__(
+         self,
+         chunker: Chunker | None = None,
+         cv_folds_num: int = 5,
+         hyperparameters: dict[str, Any] | None = None,
+         threshold: Threshold = ConstantThreshold(lower=0.45, upper=0.65),
+     ) -> None:
+         """
+         Create a new DomainClassifierCalculator instance.
+
+         Parameters
+         -----------
+         chunker : Chunker, default=None
+             The `Chunker` used to split the data sets into a lists of chunks.
+         cv_folds_num: Optional[int]
+             Number of cross-validation folds to use when calculating DC discrimination value.
+         hyperparameters : dict[str, Any], default = None
+             A dictionary used to provide your own custom hyperparameters when training the discrimination model.
+             Check out the available hyperparameter options in the
+             `LightGBM docs <https://lightgbm.readthedocs.io/en/latest/pythonapi/lightgbm.LGBMClassifier.html>`_.
+         threshold: Threshold, default=ConstantThreshold
+             The threshold you wish to evaluate values on. Defaults to a ConstantThreshold with lower value
+             of 0.45 and upper value of 0.65.
+         """
+         super().__init__(chunker, logger)
+
+         self.cv_folds_num = cv_folds_num
+         self.hyperparameters = DEFAULT_LGBM_HYPERPARAMS if hyperparameters is None else hyperparameters
+         self.threshold = threshold
+         self.result: DriftMVDCOutput | None = None
+
+     def _fit(self, reference_data: pd.DataFrame) -> DriftMVDCOutput:
+         """Fits the DC calculator to a set of reference data."""
+         self._x_ref = reference_data
+         result = self._calculate(data=self._x_ref)
+         result._data[("chunk", "period")] = "reference"
+
+         return result
+
+     @set_metadata
+     def _calculate(self, data: pd.DataFrame) -> DriftMVDCOutput:
+         """Calculate the data DC calculator metric for a given data set."""
+         chunks = self.chunker.split(data)
+
+         res = pd.DataFrame.from_records(
+             [
+                 {
+                     **chunk.dict(),
+                     "period": "analysis",
+                     "classifier_auroc_value": self._calculate_chunk(chunk=chunk),
+                 }
+                 for chunk in chunks
+             ]
+         )
+
+         multilevel_index = _create_multilevel_index(chunks, "domain_classifier_auroc", ["value"])
+         res.columns = multilevel_index
+         res = res.reset_index(drop=True)
+
+         res = self._populate_alert_thresholds(res)
+
+         if self.result is None:
+             self.result = DriftMVDCOutput(results_data=res)
+         else:
+             self.result = self.result.filter(period="reference")
+             self.result._data = pd.concat([self.result._data, res], ignore_index=True)
+         return self.result
+
+     def _calculate_chunk(self, chunk: Chunk) -> float:
+         if self.result is None:
+             # Use information from chunk indices to identify reference chunk's location. This is possible because
+             # both the internal reference data copy and the chunk data were sorted by timestamp, so these
+             # indices align. This way we eliminate the need to combine these two data frames and drop duplicate rows,
+             # which is a costly operation.
+             df_X = self._x_ref
+             y = np.zeros(len(df_X), dtype=np.intp)
+             y[chunk.start_index : chunk.end_index + 1] = 1
+         else:
+             chunk_X = chunk.data
+             reference_X = self._x_ref
+             chunk_y = np.ones(len(chunk_X), dtype=np.intp)
+             reference_y = np.zeros(len(reference_X), dtype=np.intp)
+             df_X = pd.concat([reference_X, chunk_X], ignore_index=True)
+             y = np.concatenate([reference_y, chunk_y])
+
+         skf = StratifiedKFold(n_splits=self.cv_folds_num)
+         all_preds: list[NDArray[np.float32]] = []
+         all_tgts: list[NDArray[np.intp]] = []
+         for i, (train_index, test_index) in enumerate(skf.split(df_X, y)):
+             _trx = df_X.iloc[train_index]
+             _try = y[train_index]
+             _tsx = df_X.iloc[test_index]
+             _tsy = y[test_index]
+             model = LGBMClassifier(**self.hyperparameters, n_jobs=get_max_processes(), random_state=get_seed())
+             model.fit(_trx, _try)
+             preds = np.asarray(model.predict_proba(_tsx), dtype=np.float32)[:, 1]
+             all_preds.append(preds)
+             all_tgts.append(_tsy)
+
+         np_all_preds = np.concatenate(all_preds, axis=0)
+         np_all_tgts = np.concatenate(all_tgts, axis=0)
+         result = roc_auc_score(np_all_tgts, np_all_preds)
+         return 0.5 if result == np.nan else float(result)
+
+     def _populate_alert_thresholds(self, result_data: pd.DataFrame) -> pd.DataFrame:
+         if self.result is None:
+             self._threshold_values = self.threshold.calculate(
+                 data=result_data.loc[:, ("domain_classifier_auroc", "value")],  # type: ignore | dataframe loc
+                 lower_limit=0.0,
+                 upper_limit=1.0,
+                 logger=self._logger,
+             )
+
+         result_data[("domain_classifier_auroc", "upper_threshold")] = self._threshold_values[1]
+         result_data[("domain_classifier_auroc", "lower_threshold")] = self._threshold_values[0]
+         result_data[("domain_classifier_auroc", "alert")] = result_data.apply(
+             lambda row: bool(
+                 row["domain_classifier_auroc", "value"] > row["domain_classifier_auroc", "upper_threshold"]
+                 or row["domain_classifier_auroc", "value"] < row["domain_classifier_auroc", "lower_threshold"]
+             ),
+             axis=1,
+         )
+         return result_data
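Putting the three new modules together: fit on reference data to establish baseline per-chunk AUROC and alert thresholds, then calculate on analysis data to score each chunk with a LightGBM domain classifier against the reference set. The sketch below (not part of the diff) exercises the internal classes directly and assumes lightgbm and scikit-learn are available; the release also adds a public wrapper in dataeval/detectors/drift/_mvdc.py, whose interface is not shown in this section. The synthetic frames and the 0.5 mean shift are made up for illustration.

    import numpy as np
    import pandas as pd

    from dataeval.detectors.drift._nml._chunk import CountBasedChunker
    from dataeval.detectors.drift._nml._domainclassifier import DomainClassifierCalculator

    rng = np.random.default_rng(0)
    cols = [f"f{i}" for i in range(4)]
    reference = pd.DataFrame(rng.normal(0.0, 1.0, size=(600, 4)), columns=cols)
    analysis = pd.DataFrame(rng.normal(0.5, 1.0, size=(600, 4)), columns=cols)  # shifted features

    calc = DomainClassifierCalculator(chunker=CountBasedChunker(10))
    calc.fit(reference)                # baseline per-chunk AUROC on the reference period
    result = calc.calculate(analysis)  # DriftMVDCOutput with per-chunk AUROC vs. reference

    # AUROC near 0.5 means a chunk is indistinguishable from the reference data; values outside
    # the ConstantThreshold band (0.45 to 0.65 by default) set the alert flag in the output.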