dataeval 0.85.0__py3-none-any.whl → 0.86.1__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only.
- dataeval/__init__.py +1 -1
- dataeval/_log.py +1 -1
- dataeval/config.py +21 -4
- dataeval/data/_embeddings.py +2 -2
- dataeval/data/_images.py +2 -3
- dataeval/data/_metadata.py +65 -42
- dataeval/data/_selection.py +2 -3
- dataeval/data/_split.py +2 -3
- dataeval/data/_targets.py +17 -13
- dataeval/data/selections/_classfilter.py +6 -8
- dataeval/data/selections/_prioritize.py +6 -9
- dataeval/data/selections/_shuffle.py +3 -1
- dataeval/detectors/drift/__init__.py +4 -1
- dataeval/detectors/drift/_base.py +4 -5
- dataeval/detectors/drift/_mmd.py +3 -6
- dataeval/detectors/drift/_mvdc.py +92 -0
- dataeval/detectors/drift/_nml/__init__.py +6 -0
- dataeval/detectors/drift/_nml/_base.py +70 -0
- dataeval/detectors/drift/_nml/_chunk.py +396 -0
- dataeval/detectors/drift/_nml/_domainclassifier.py +181 -0
- dataeval/detectors/drift/_nml/_result.py +97 -0
- dataeval/detectors/drift/_nml/_thresholds.py +269 -0
- dataeval/detectors/linters/outliers.py +7 -7
- dataeval/metrics/bias/_parity.py +10 -13
- dataeval/metrics/estimators/_divergence.py +2 -4
- dataeval/metrics/stats/_base.py +103 -42
- dataeval/metrics/stats/_boxratiostats.py +21 -19
- dataeval/metrics/stats/_dimensionstats.py +14 -10
- dataeval/metrics/stats/_hashstats.py +1 -1
- dataeval/metrics/stats/_pixelstats.py +6 -6
- dataeval/metrics/stats/_visualstats.py +3 -3
- dataeval/outputs/__init__.py +2 -1
- dataeval/outputs/_base.py +22 -7
- dataeval/outputs/_bias.py +27 -31
- dataeval/outputs/_drift.py +60 -0
- dataeval/outputs/_linters.py +12 -17
- dataeval/outputs/_stats.py +83 -29
- dataeval/outputs/_workflows.py +2 -2
- dataeval/utils/_array.py +6 -9
- dataeval/utils/_bin.py +1 -2
- dataeval/utils/_clusterer.py +7 -4
- dataeval/utils/_fast_mst.py +27 -13
- dataeval/utils/_image.py +65 -11
- dataeval/utils/_mst.py +1 -3
- dataeval/utils/_plot.py +15 -10
- dataeval/utils/data/_dataset.py +32 -20
- dataeval/utils/data/metadata.py +104 -82
- dataeval/utils/datasets/__init__.py +2 -0
- dataeval/utils/datasets/_antiuav.py +189 -0
- dataeval/utils/datasets/_base.py +11 -8
- dataeval/utils/datasets/_cifar10.py +104 -45
- dataeval/utils/datasets/_fileio.py +21 -47
- dataeval/utils/datasets/_milco.py +19 -11
- dataeval/utils/datasets/_mixin.py +2 -4
- dataeval/utils/datasets/_mnist.py +3 -4
- dataeval/utils/datasets/_ships.py +14 -7
- dataeval/utils/datasets/_voc.py +229 -42
- dataeval/utils/torch/models.py +5 -10
- dataeval/utils/torch/trainer.py +3 -3
- dataeval/workflows/sufficiency.py +2 -2
- {dataeval-0.85.0.dist-info → dataeval-0.86.1.dist-info}/METADATA +3 -2
- dataeval-0.86.1.dist-info/RECORD +114 -0
- dataeval/detectors/ood/vae.py +0 -74
- dataeval-0.85.0.dist-info/RECORD +0 -107
- {dataeval-0.85.0.dist-info → dataeval-0.86.1.dist-info}/LICENSE.txt +0 -0
- {dataeval-0.85.0.dist-info → dataeval-0.86.1.dist-info}/WHEEL +0 -0
dataeval/detectors/drift/_nml/_base.py
@@ -0,0 +1,70 @@
+"""
+Source code derived from NannyML 0.13.0
+https://github.com/NannyML/nannyml/blob/main/nannyml/base.py
+
+Licensed under Apache Software License (Apache 2.0)
+"""
+
+from __future__ import annotations
+
+import logging
+from abc import ABC, abstractmethod
+from logging import Logger
+from typing import Sequence
+
+import pandas as pd
+from typing_extensions import Self
+
+from dataeval.detectors.drift._nml._chunk import Chunk, Chunker, CountBasedChunker
+from dataeval.outputs._drift import DriftMVDCOutput
+
+
+def _validate(data: pd.DataFrame, expected_features: int | None = None) -> int:
+    if data.empty:
+        raise ValueError("data contains no rows. Please provide a valid data set.")
+    if expected_features is not None and data.shape[-1] != expected_features:
+        raise ValueError(f"expected '{expected_features}' features in data set:\n\t{data}")
+    return data.shape[-1]
+
+
+def _create_multilevel_index(
+    chunks: Sequence[Chunk], result_group_name: str, result_column_names: Sequence[str]
+) -> pd.MultiIndex:
+    chunk_column_names = (*chunks[0].KEYS, "period")
+    chunk_tuples = [("chunk", chunk_column_name) for chunk_column_name in chunk_column_names]
+    result_tuples = [(result_group_name, column_name) for column_name in result_column_names]
+    return pd.MultiIndex.from_tuples(chunk_tuples + result_tuples)
+
+
+class AbstractCalculator(ABC):
+    """Base class for drift calculation."""
+
+    def __init__(self, chunker: Chunker | None = None, logger: Logger | None = None) -> None:
+        self.chunker = chunker if isinstance(chunker, Chunker) else CountBasedChunker(10)
+        self.result: DriftMVDCOutput | None = None
+        self.n_features: int | None = None
+        self._logger = logger if isinstance(logger, Logger) else logging.getLogger(__name__)
+
+    def fit(self, reference_data: pd.DataFrame) -> Self:
+        """Trains the calculator using reference data."""
+        self.n_features = _validate(reference_data)
+
+        self._logger.debug(f"fitting {str(self)}")
+        self.result = self._fit(reference_data)
+        return self
+
+    def calculate(self, data: pd.DataFrame) -> DriftMVDCOutput:
+        """Performs a calculation on the provided data."""
+        if self.result is None:
+            raise RuntimeError("must run fit with reference data before running calculate")
+        _validate(data, self.n_features)
+
+        self._logger.debug(f"calculating {str(self)}")
+        self.result = self._calculate(data)
+        return self.result
+
+    @abstractmethod
+    def _fit(self, reference_data: pd.DataFrame) -> DriftMVDCOutput: ...
+
+    @abstractmethod
+    def _calculate(self, data: pd.DataFrame) -> DriftMVDCOutput: ...
dataeval/detectors/drift/_nml/_chunk.py
@@ -0,0 +1,396 @@
+"""
+NannyML module providing intelligent splitting of data into chunks.
+
+Source code derived from NannyML 0.13.0
+https://github.com/NannyML/nannyml/blob/main/nannyml/chunk.py
+
+Licensed under Apache Software License (Apache 2.0)
+"""
+
+from __future__ import annotations
+
+import copy
+import logging
+import warnings
+from abc import ABC, abstractmethod
+from typing import Any, Generic, Literal, Sequence, TypeVar, cast
+
+import pandas as pd
+from pandas import Index, Period
+from typing_extensions import Self
+
+logger = logging.getLogger(__name__)
+
+
+class Chunk(ABC):
+    """A subset of data that acts as a logical unit during calculations."""
+
+    KEYS: Sequence[str]
+
+    def __init__(
+        self,
+        data: pd.DataFrame,
+    ) -> None:
+        self.key: str
+        self.data = data
+
+        self.start_index: int = -1
+        self.end_index: int = -1
+        self.chunk_index: int = -1
+
+    def __repr__(self) -> str:
+        attr_str = ", ".join([f"{k}={v}" for k, v in self.dict().items()])
+        return f"{self.__class__.__name__}(data=pd.DataFrame(shape={self.data.shape}), {attr_str})"
+
+    def __len__(self) -> int:
+        return self.data.shape[0]
+
+    @abstractmethod
+    def __add__(self, other: Self) -> Self: ...
+
+    @abstractmethod
+    def __lt__(self, other: Self) -> bool: ...
+
+    @abstractmethod
+    def dict(self) -> dict[str, Any]: ...
+
+
+class IndexChunk(Chunk):
+    """Creates a new chunk.
+
+    Parameters
+    ----------
+    data : DataFrame, required
+        The data to be contained within the chunk.
+    start_datetime: datetime
+        The starting point in time for this chunk.
+    end_datetime: datetime
+        The end point in time for this chunk.
+    """
+
+    KEYS = ("key", "chunk_index", "start_index", "end_index")
+
+    def __init__(
+        self,
+        data: pd.DataFrame,
+        start_index: int,
+        end_index: int,
+    ) -> None:
+        super().__init__(data)
+        self.key = f"[{start_index}:{end_index}]"
+        self.start_index: int = start_index
+        self.end_index: int = end_index
+
+    def __lt__(self, other: Self) -> bool:
+        return self.end_index < other.start_index
+
+    def __add__(self, other: Self) -> Self:
+        a, b = (self, other) if self < other else (other, self)
+        result = copy.deepcopy(a)
+        result.data = pd.concat([a.data, b.data])
+        result.end_index = b.end_index
+        return result
+
+    def dict(self) -> dict[str, Any]:
+        return dict(zip(self.KEYS, (self.key, self.chunk_index, self.start_index, self.end_index)))
+
+
+class PeriodChunk(Chunk):
+    """Creates a new chunk.
+
+    Parameters
+    ----------
+    data : DataFrame, required
+        The data to be contained within the chunk.
+    start_datetime: datetime
+        The starting point in time for this chunk.
+    end_datetime: datetime
+        The end point in time for this chunk.
+    chunk_size : int
+        The size of the chunk.
+    """
+
+    KEYS = ("key", "chunk_index", "start_date", "end_date", "chunk_size")
+
+    def __init__(self, data: pd.DataFrame, period: Period, chunk_size: int) -> None:
+        super().__init__(data)
+        self.key = str(period)
+        self.start_datetime = period.start_time
+        self.end_datetime = period.end_time
+        self.chunk_size = chunk_size
+
+    def __lt__(self, other: Self) -> bool:
+        return self.end_datetime < other.start_datetime
+
+    def __add__(self, other: Self) -> Self:
+        a, b = (self, other) if self < other else (other, self)
+        result = copy.deepcopy(a)
+        result.data = pd.concat([a.data, b.data])
+        result.end_index = b.end_index
+        result.end_datetime = b.end_datetime
+        result.chunk_size += b.chunk_size
+        return result
+
+    def dict(self) -> dict[str, Any]:
+        return dict(
+            zip(self.KEYS, (self.key, self.chunk_index, self.start_datetime, self.end_datetime, self.chunk_size))
+        )
+
+
+TChunk = TypeVar("TChunk", bound=Chunk)
+
+
+class Chunker(Generic[TChunk]):
+    """Base class for Chunker implementations.
+
+    Inheriting classes will split a DataFrame into a list of Chunks.
+    They will do this based on several constraints, e.g. observation timestamps, number of observations per Chunk
+    or a preferred number of Chunks.
+    """
+
+    def split(self, data: pd.DataFrame) -> list[TChunk]:
+        """Splits a given data frame into a list of chunks.
+
+        This method provides a uniform interface across Chunker implementations to keep them interchangeable.
+
+        After performing the implementation-specific `_split` method, there are some checks on the resulting chunk list.
+
+        If the total number of chunks is low a warning will be written out to the logs.
+
+        We dynamically determine the optimal minimum number of observations per chunk and then check if the resulting
+        chunks contain at least as many. If there are any underpopulated chunks a warning will be written out in
+        the logs.
+
+        Parameters
+        ----------
+        data: DataFrame
+            The data to be split into chunks
+
+        Returns
+        -------
+        chunks: List[Chunk]
+            The list of chunks
+
+        """
+        if data.shape[0] == 0:
+            return []
+
+        chunks = self._split(data)
+        for chunk_index, chunk in enumerate(chunks):
+            chunk.start_index = cast(int, chunk.data.index.min())
+            chunk.end_index = cast(int, chunk.data.index.max())
+            chunk.chunk_index = chunk_index
+
+        if len(chunks) < 6:
+            # TODO wording
+            warnings.warn(
+                "The resulting number of chunks is too low. "
+                "Please consider splitting your data in a different way or continue at your own risk."
+            )
+
+        return chunks
+
+    @abstractmethod
+    def _split(self, data: pd.DataFrame) -> list[TChunk]: ...
+
+
+class PeriodBasedChunker(Chunker[PeriodChunk]):
+    """A Chunker that will split data into Chunks based on a date column in the data.
+
+    Examples
+    --------
+    Chunk using monthly periods and providing a column name
+
+    >>> from nannyml.chunk import PeriodBasedChunker
+    >>> df = pd.read_parquet("/path/to/my/data.pq")
+    >>> chunker = PeriodBasedChunker(timestamp_column_name="observation_date", offset="M")
+    >>> chunks = chunker.split(data=df)
+
+    Or chunk using weekly periods
+
+    >>> from nannyml.chunk import PeriodBasedChunker
+    >>> df = pd.read_parquet("/path/to/my/data.pq")
+    >>> chunker = PeriodBasedChunker(timestamp_column_name=df["observation_date"], offset="W", minimum_chunk_size=50)
+    >>> chunks = chunker.split(data=df)
+
+    """
+
+    def __init__(self, timestamp_column_name: str, offset: str = "W") -> None:
+        """Creates a new PeriodBasedChunker.
+
+        Parameters
+        ----------
+        timestamp_column_name : str
+            The column name containing the timestamp to chunk on
+        offset : str
+            A frequency string representing a pandas.tseries.offsets.DateOffset.
+            The offset determines how the time-based grouping will occur. A list of possible values
+            can be found at <https://pandas.pydata.org/docs/user_guide/timeseries.html#offset-aliases>.
+        """
+        self.timestamp_column_name = timestamp_column_name
+        self.offset = offset
+
+    def _split(self, data: pd.DataFrame) -> list[PeriodChunk]:
+        chunks = []
+        if self.timestamp_column_name is None:
+            raise ValueError("timestamp_column_name must be provided")
+        if self.timestamp_column_name not in data:
+            raise ValueError(f"timestamp column '{self.timestamp_column_name}' not in columns")
+
+        grouped = data.groupby(pd.to_datetime(data[self.timestamp_column_name]).dt.to_period(self.offset))
+
+        for k, v in grouped.groups.items():
+            period, index = cast(Period, k), cast(Index, v)
+            chunk = PeriodChunk(
+                data=grouped.get_group(period),  # type: ignore | dataframe
+                period=period,
+                chunk_size=len(index),
+            )
+            chunks.append(chunk)
+
+        return chunks
+
+
+class SizeBasedChunker(Chunker[IndexChunk]):
+    """A Chunker that will split data into Chunks based on the preferred number of observations per Chunk.
+
+    Notes
+    -----
+    - Chunks are adjacent, not overlapping
+    - There may be "incomplete" chunks, as the remainder of observations after dividing by `chunk_size`
+      will form a chunk of their own.
+
+    Examples
+    --------
+    Chunk using monthly periods and providing a column name
+
+    >>> from nannyml.chunk import SizeBasedChunker
+    >>> df = pd.read_parquet("/path/to/my/data.pq")
+    >>> chunker = SizeBasedChunker(chunk_size=2000, incomplete="drop")
+    >>> chunks = chunker.split(data=df)
+
+    """
+
+    def __init__(
+        self,
+        chunk_size: int,
+        incomplete: Literal["append", "drop", "keep"] = "keep",
+    ) -> None:
+        """Create a new SizeBasedChunker.
+
+        Parameters
+        ----------
+        chunk_size: int
+            The preferred size of the resulting Chunks, i.e. the number of observations in each Chunk.
+        incomplete: str, default='keep'
+            Choose how to handle any leftover observations that don't make up a full Chunk.
+            The following options are available:
+
+            - ``'drop'``: drop the leftover observations
+            - ``'keep'``: keep the incomplete Chunk (containing less than ``chunk_size`` observations)
+            - ``'append'``: append leftover observations to the last complete Chunk (overfilling it)
+
+            Defaults to ``'keep'``.
+
+        Returns
+        -------
+        chunker: a size-based instance used to split data into Chunks of a constant size.
+
+        """
+        if not isinstance(chunk_size, int) or chunk_size <= 0:
+            raise ValueError(f"chunk_size={chunk_size} is invalid - provide an integer greater than 0")
+        if incomplete not in ("append", "drop", "keep"):
+            raise ValueError(f"incomplete={incomplete} is invalid - must be one of ['append', 'drop', 'keep']")
+
+        self.chunk_size = chunk_size
+        self.incomplete = incomplete
+
+    def _split(self, data: pd.DataFrame) -> list[IndexChunk]:
+        def _create_chunk(index: int, data: pd.DataFrame, chunk_size: int) -> IndexChunk:
+            chunk_data = data.iloc[index : index + chunk_size]
+            return IndexChunk(
+                data=chunk_data,
+                start_index=index,
+                end_index=index + chunk_size - 1,
+            )
+
+        chunks = [
+            _create_chunk(index=i, data=data, chunk_size=self.chunk_size)
+            for i in range(0, data.shape[0], self.chunk_size)
+            if i + self.chunk_size - 1 < len(data)
+        ]
+
+        # deal with unassigned observations
+        if data.shape[0] % self.chunk_size != 0 and self.incomplete != "drop":
+            incomplete_chunk = _create_chunk(
+                index=self.chunk_size * (data.shape[0] // self.chunk_size),
+                data=data,
+                chunk_size=(data.shape[0] % self.chunk_size),
+            )
+            if self.incomplete == "append":
+                chunks[-1] += incomplete_chunk
+            else:
+                chunks += [incomplete_chunk]
+
+        return chunks
+
+
+class CountBasedChunker(Chunker[IndexChunk]):
+    """A Chunker that will split data into chunks based on the preferred number of total chunks.
+
+    Notes
+    -----
+    - Chunks are adjacent, not overlapping
+    - There may be "incomplete" chunks, as the remainder of observations after dividing by `chunk_size`
+      will form a chunk of their own.
+
+    Examples
+    --------
+    >>> from nannyml.chunk import CountBasedChunker
+    >>> df = pd.read_parquet("/path/to/my/data.pq")
+    >>> chunker = CountBasedChunker(chunk_number=100)
+    >>> chunks = chunker.split(data=df)
+
+    """
+
+    def __init__(
+        self,
+        chunk_number: int,
+        incomplete: Literal["append", "drop", "keep"] = "keep",
+    ) -> None:
+        """Creates a new CountBasedChunker.
+
+        It will calculate the amount of observations per chunk based on the given chunk count.
+        It then continues to split the data into chunks just like a SizeBasedChunker does.
+
+        Parameters
+        ----------
+        chunk_number: int
+            The amount of chunks to split the data in.
+        incomplete: str, default='keep'
+            Choose how to handle any leftover observations that don't make up a full Chunk.
+            The following options are available:
+
+            - ``'drop'``: drop the leftover observations
+            - ``'keep'``: keep the incomplete Chunk (containing less than ``chunk_size`` observations)
+            - ``'append'``: append leftover observations to the last complete Chunk (overfilling it)
+
+            Defaults to ``'keep'``.
+
+        Returns
+        -------
+        chunker: CountBasedChunker
+
+        """
+        if not isinstance(chunk_number, int) or chunk_number <= 0:
+            raise ValueError(f"given chunk_number {chunk_number} is invalid - provide an integer greater than 0")
+        if incomplete not in ("append", "drop", "keep"):
+            raise ValueError(f"incomplete={incomplete} is invalid - must be one of ['append', 'drop', 'keep']")
+
+        self.chunk_number = chunk_number
+        self.incomplete: Literal["append", "drop", "keep"] = incomplete
+
+    def _split(self, data: pd.DataFrame) -> list[IndexChunk]:
+        chunk_size = data.shape[0] // self.chunk_number
+        chunker = SizeBasedChunker(chunk_size, self.incomplete)
+        return chunker.split(data=data)
dataeval/detectors/drift/_nml/_domainclassifier.py
@@ -0,0 +1,181 @@
+"""
+Source code derived from NannyML 0.13.0
+https://github.com/NannyML/nannyml/blob/main/nannyml/drift/multivariate/domain_classifier/calculator.py
+
+Licensed under Apache Software License (Apache 2.0)
+"""
+
+from __future__ import annotations
+
+import logging
+from typing import Any
+
+import numpy as np
+import pandas as pd
+from lightgbm import LGBMClassifier
+from numpy.typing import NDArray
+from sklearn.metrics import roc_auc_score
+from sklearn.model_selection import StratifiedKFold
+
+from dataeval.config import get_max_processes, get_seed
+from dataeval.detectors.drift._nml._base import AbstractCalculator, _create_multilevel_index
+from dataeval.detectors.drift._nml._chunk import Chunk, Chunker
+from dataeval.detectors.drift._nml._thresholds import ConstantThreshold, Threshold
+from dataeval.outputs._base import set_metadata
+from dataeval.outputs._drift import DriftMVDCOutput
+
+logger = logging.getLogger(__name__)
+
+DEFAULT_LGBM_HYPERPARAMS = {
+    "boosting_type": "gbdt",
+    "class_weight": None,
+    "colsample_bytree": 1.0,
+    "deterministic": True,
+    "importance_type": "split",
+    "learning_rate": 0.1,
+    "max_depth": -1,
+    "min_child_samples": 20,
+    "min_child_weight": 0.001,
+    "min_split_gain": 0.0,
+    "n_estimators": 100,
+    "num_leaves": 31,
+    "objective": None,
+    "reg_alpha": 0.0,
+    "reg_lambda": 0.0,
+    "subsample": 1.0,
+    "subsample_for_bin": 200000,
+    "subsample_freq": 0,
+    "verbosity": -1,
+}
+
+
+class DomainClassifierCalculator(AbstractCalculator):
+    """
+    DomainClassifierCalculator implementation.
+
+    Uses Drift Detection Classifier's cross validated performance as a measure of drift.
+    """
+
+    def __init__(
+        self,
+        chunker: Chunker | None = None,
+        cv_folds_num: int = 5,
+        hyperparameters: dict[str, Any] | None = None,
+        threshold: Threshold = ConstantThreshold(lower=0.45, upper=0.65),
+    ) -> None:
+        """
+        Create a new DomainClassifierCalculator instance.
+
+        Parameters
+        -----------
+        chunker : Chunker, default=None
+            The `Chunker` used to split the data sets into a lists of chunks.
+        cv_folds_num: Optional[int]
+            Number of cross-validation folds to use when calculating DC discrimination value.
+        hyperparameters : dict[str, Any], default = None
+            A dictionary used to provide your own custom hyperparameters when training the discrimination model.
+            Check out the available hyperparameter options in the
+            `LightGBM docs <https://lightgbm.readthedocs.io/en/latest/pythonapi/lightgbm.LGBMClassifier.html>`_.
+        threshold: Threshold, default=ConstantThreshold
+            The threshold you wish to evaluate values on. Defaults to a ConstantThreshold with lower value
+            of 0.45 and upper value of 0.65.
+        """
+        super().__init__(chunker, logger)
+
+        self.cv_folds_num = cv_folds_num
+        self.hyperparameters = DEFAULT_LGBM_HYPERPARAMS if hyperparameters is None else hyperparameters
+        self.threshold = threshold
+        self.result: DriftMVDCOutput | None = None
+
+    def _fit(self, reference_data: pd.DataFrame) -> DriftMVDCOutput:
+        """Fits the DC calculator to a set of reference data."""
+        self._x_ref = reference_data
+        result = self._calculate(data=self._x_ref)
+        result._data[("chunk", "period")] = "reference"
+
+        return result
+
+    @set_metadata
+    def _calculate(self, data: pd.DataFrame) -> DriftMVDCOutput:
+        """Calculate the data DC calculator metric for a given data set."""
+        chunks = self.chunker.split(data)
+
+        res = pd.DataFrame.from_records(
+            [
+                {
+                    **chunk.dict(),
+                    "period": "analysis",
+                    "classifier_auroc_value": self._calculate_chunk(chunk=chunk),
+                }
+                for chunk in chunks
+            ]
+        )
+
+        multilevel_index = _create_multilevel_index(chunks, "domain_classifier_auroc", ["value"])
+        res.columns = multilevel_index
+        res = res.reset_index(drop=True)
+
+        res = self._populate_alert_thresholds(res)
+
+        if self.result is None:
+            self.result = DriftMVDCOutput(results_data=res)
+        else:
+            self.result = self.result.filter(period="reference")
+            self.result._data = pd.concat([self.result._data, res], ignore_index=True)
+        return self.result
+
+    def _calculate_chunk(self, chunk: Chunk) -> float:
+        if self.result is None:
+            # Use information from chunk indices to identify reference chunk's location. This is possible because
+            # both the internal reference data copy and the chunk data were sorted by timestamp, so these
+            # indices align. This way we eliminate the need to combine these two data frames and drop duplicate rows,
+            # which is a costly operation.
+            df_X = self._x_ref
+            y = np.zeros(len(df_X), dtype=np.intp)
+            y[chunk.start_index : chunk.end_index + 1] = 1
+        else:
+            chunk_X = chunk.data
+            reference_X = self._x_ref
+            chunk_y = np.ones(len(chunk_X), dtype=np.intp)
+            reference_y = np.zeros(len(reference_X), dtype=np.intp)
+            df_X = pd.concat([reference_X, chunk_X], ignore_index=True)
+            y = np.concatenate([reference_y, chunk_y])
+
+        skf = StratifiedKFold(n_splits=self.cv_folds_num)
+        all_preds: list[NDArray[np.float32]] = []
+        all_tgts: list[NDArray[np.intp]] = []
+        for i, (train_index, test_index) in enumerate(skf.split(df_X, y)):
+            _trx = df_X.iloc[train_index]
+            _try = y[train_index]
+            _tsx = df_X.iloc[test_index]
+            _tsy = y[test_index]
+            model = LGBMClassifier(**self.hyperparameters, n_jobs=get_max_processes(), random_state=get_seed())
+            model.fit(_trx, _try)
+            preds = np.asarray(model.predict_proba(_tsx), dtype=np.float32)[:, 1]
+            all_preds.append(preds)
+            all_tgts.append(_tsy)
+
+        np_all_preds = np.concatenate(all_preds, axis=0)
+        np_all_tgts = np.concatenate(all_tgts, axis=0)
+        result = roc_auc_score(np_all_tgts, np_all_preds)
+        return 0.5 if result == np.nan else float(result)
+
+    def _populate_alert_thresholds(self, result_data: pd.DataFrame) -> pd.DataFrame:
+        if self.result is None:
+            self._threshold_values = self.threshold.calculate(
+                data=result_data.loc[:, ("domain_classifier_auroc", "value")],  # type: ignore | dataframe loc
+                lower_limit=0.0,
+                upper_limit=1.0,
+                logger=self._logger,
+            )
+
+        result_data[("domain_classifier_auroc", "upper_threshold")] = self._threshold_values[1]
+        result_data[("domain_classifier_auroc", "lower_threshold")] = self._threshold_values[0]
+        result_data[("domain_classifier_auroc", "alert")] = result_data.apply(
+            lambda row: bool(
+                row["domain_classifier_auroc", "value"] > row["domain_classifier_auroc", "upper_threshold"]
+                or row["domain_classifier_auroc", "value"] < row["domain_classifier_auroc", "lower_threshold"]
+            ),
+            axis=1,
+        )
+        return result_data