sliceline-0.0.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sliceline/__init__.py +3 -0
- sliceline/slicefinder.py +732 -0
- sliceline/validation.py +858 -0
- sliceline-0.0.0.dist-info/LICENSE +29 -0
- sliceline-0.0.0.dist-info/METADATA +119 -0
- sliceline-0.0.0.dist-info/RECORD +7 -0
- sliceline-0.0.0.dist-info/WHEEL +4 -0
sliceline/slicefinder.py
ADDED
@@ -0,0 +1,732 @@
"""
The slicefinder module implements the Slicefinder class.
"""

import logging
from typing import Tuple, Union

import numpy as np
from scipy import sparse as sp
from scipy.stats import rankdata
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder
from sklearn.utils.validation import check_is_fitted

from sliceline.validation import check_array, check_X_e

logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)


class Slicefinder(BaseEstimator, TransformerMixin):
    """Slicefinder class.

    SliceLine is a fast, linear-algebra-based slice-finding approach for ML model debugging.

    Given an input dataset (`X`) and a model error vector (`errors`), SliceLine finds
    the `k` slices in `X` that identify where the model performs significantly worse.
    A slice is a subspace of `X` defined by one or more predicates.

    The maximal dimension of this subspace is controlled by `max_l`.

    The slice scoring function is the linear combination of two objectives:
        - Find sufficiently large slices, with more than `min_sup` elements
          (high impact on the overall model)
        - With substantial errors
          (high negative impact on sub-group/model)

    The importance of each objective is controlled through a single parameter `alpha`.

    Slice enumeration and pruning techniques are done via sparse linear algebra.

    Parameters
    ----------
    alpha: float, default=0.6
        Weight parameter for the importance of the average slice error.
        0 < `alpha` <= 1.

    k: int, default=1
        Maximum number of slices to return.
        Note: in case of equality between the `k`-th slice score and the following ones,
        all those slices are returned, leading to `_n_features_out` returned slices
        (`_n_features_out` >= `k`).

    max_l: int, default=4
        Maximum lattice level.
        In other words: the maximum number of predicates used to define a slice.

    min_sup: int or float, default=10
        Minimum support threshold. Inspired by frequent itemset mining,
        it ensures statistical significance. If `min_sup` is a float (0 < `min_sup` < 1),
        it represents the fraction of the input dataset (`X`).

    verbose: bool, default=True
        Controls the verbosity.

    Attributes
    ----------
    top_slices_: np.ndarray of shape (_n_features_out, number of columns of the input dataset)
        The `_n_features_out` slices with the highest score.
        `None` values in slices represent unused columns in the slice.

    average_error_: float
        Mean value of the input error.

    top_slices_statistics_: list of dict of length `len(top_slices_)`
        The statistics of the found slices, sorted by slice score.
        For each slice, the following statistics are stored:
            - slice_score: the score of the slice (defined in the `_score` method)
            - sum_slice_error: the sum of all the errors in the slice
            - max_slice_error: the maximum of all errors in the slice
            - slice_size: the number of elements in the slice
            - slice_average_error: the average error in the slice (sum_slice_error / slice_size)

    References
    ----------
    `SliceLine: Fast, Linear-Algebra-based Slice Finding for ML Model Debugging
    <https://mboehm7.github.io/resources/sigmod2021b_sliceline.pdf>`__,
    from *Svetlana Sagadeeva* and *Matthias Boehm* of Graz University of Technology.
    """

    def __init__(
        self,
        alpha: float = 0.6,
        k: int = 1,
        max_l: int = 4,
        min_sup: Union[int, float] = 10,
        verbose: bool = True,
    ):
        self.alpha = alpha
        self.k = k
        self.max_l = max_l
        self.min_sup = min_sup
        self.verbose = verbose

        self._one_hot_encoder = self._top_slices_enc = None
        self.top_slices_ = self.top_slices_statistics_ = None
        self.average_error_ = None

        if self.verbose:
            logger.setLevel(logging.DEBUG)
        else:
            logger.setLevel(logging.INFO)

    def _check_params(self):
        """Check transformer parameters."""
        if not 0 < self.alpha <= 1:
            raise ValueError(f"Invalid 'alpha' parameter: {self.alpha}")

        if self.k <= 0:
            raise ValueError(f"Invalid 'k' parameter: {self.k}")

        if self.max_l <= 0:
            raise ValueError(f"Invalid 'max_l' parameter: {self.max_l}")

        if self.min_sup < 0 or (
            isinstance(self.min_sup, float) and self.min_sup >= 1
        ):
            raise ValueError(f"Invalid 'min_sup' parameter: {self.min_sup}")

    def _check_top_slices(self):
        """Check if slices have been found."""
        # Check if fit has been called
        check_is_fitted(self)

        # Check if a slice has been found
        if self.top_slices_.size == 0:
            raise ValueError("No transform: Sliceline did not find any slice.")

    def fit(self, X, errors):
        """Search for slice(s) on `X` based on `errors`.

        Parameters
        ----------
        X: array-like of shape (n_samples, n_features)
            Training data, where `n_samples` is the number of samples
            and `n_features` is the number of features.

        errors: array-like of shape (n_samples, )
            Errors of a machine learning model.

        Returns
        -------
        self: object
            Returns the instance itself.
        """
        self._check_params()

        # Update min_sup for a fraction of the input dataset size
        if 0 < self.min_sup < 1:
            self.min_sup = int(self.min_sup * len(X))

        # Check that X and errors have correct shapes
        X_array, errors = check_X_e(X, errors, y_numeric=True)

        self._check_feature_names(X, reset=True)

        self._search_slices(X_array, errors)

        return self

    def transform(self, X):
        """Generate slice masks for `X`.

        Parameters
        ----------
        X: array-like of shape (n_samples, n_features)
            Training data, where `n_samples` is the number of samples
            and `n_features` is the number of features.

        Returns
        -------
        slices_masks: np.ndarray of shape (n_samples, _n_features_out)
            `slices_masks[i, j] == 1`: the `i`-th sample of `X` is in the `j`-th `top_slices_`.
        """
        self._check_top_slices()

        # Input validation
        X = check_array(X)

        slices_masks = self._get_slices_masks(X)

        return slices_masks.T

    def get_slice(self, X, slice_index: int):
        """Filter `X` samples according to the `slice_index`-th slice.

        Parameters
        ----------
        X: array-like of shape (n_samples, n_features)
            Dataset, where `n_samples` is the number of samples
            and `n_features` is the number of features.

        slice_index: int
            Index of the slice to get from `top_slices_`.

        Returns
        -------
        X_slice: np.ndarray of shape (n_samples in the `slice_index`-th slice, n_features)
            The samples of `X` that are in the `slice_index`-th slice.
        """
        self._check_top_slices()

        # Input validation
        X = check_array(X, force_all_finite=False)

        slices_masks = self._get_slices_masks(X)

        return X[np.where(slices_masks[slice_index])[0], :]

    def get_feature_names_out(self):
        """Get output feature names for transformation.

        Returns
        -------
        feature_names_out : ndarray of str objects
            The following output feature names are generated:
            `["slice_0", "slice_1", ..., "slice_(_n_features_out - 1)"]`.
        """
        check_is_fitted(self)

        feature_names = [f"slice_{i}" for i in range(self._n_features_out)]

        return np.array(feature_names, dtype=object)

    def _get_slices_masks(self, X):
        """Private utility function generating slice masks for `X`."""
        X_encoded = self._one_hot_encoder.transform(X)

        # Shape X_encoded: (X.shape[0], total number of modalities in _one_hot_encoder.categories_)
        # Shape _top_slices_enc: (top_slices_.shape[0], X_encoded.shape[1])
        slice_candidates = self._top_slices_enc @ X_encoded.T

        # self._top_slices_enc.sum(axis=1) is the number of predicate(s) for each top_slices_
        slices_masks = (
            slice_candidates == self._top_slices_enc.sum(axis=1)
        ).A.astype(int)

        return slices_masks

    @property
    def _n_features_out(self):
        """Number of transformed output features."""
        return self.top_slices_.shape[0]

    @staticmethod
    def _dummify(array: np.ndarray, n_col_x_encoded: int) -> sp.csr_matrix:
        """Dummify `array` with respect to `n_col_x_encoded`.
        Assumption: `array` does not contain any 0."""
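        # For example (hypothetical values), _dummify(np.array([2, 5]), 6)
        # returns a sparse one-hot matrix with one row per input modality:
        # [[0, 1, 0, 0, 0, 0],
        #  [0, 0, 0, 0, 1, 0]]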
        assert (
            0 not in array
        ), "Modality 0 is not expected to be one-hot encoded."
        one_hot_encoding = sp.lil_matrix(
            (array.size, n_col_x_encoded), dtype=bool
        )
        one_hot_encoding[np.arange(array.size), array - 1] = True
        return one_hot_encoding.tocsr()

    def _maintain_top_k(
        self,
        slices: sp.csr_matrix,
        statistics: np.ndarray,
        top_k_slices: sp.csr_matrix,
        top_k_statistics: np.ndarray,
    ) -> Tuple[sp.csr_matrix, np.ndarray]:
        """Add new `slices` to `top_k_slices` and update the top-k slices."""
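        # Each row of `statistics` stores, in this order:
        # [slice_score, sum_slice_error, max_slice_error, slice_size],
        # hence the column indices 0 and 3 below.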
        # prune invalid min_sup and scores
        valid_slices_mask = (statistics[:, 3] >= self.min_sup) & (
            statistics[:, 0] > 0
        )
        if np.sum(valid_slices_mask) != 0:
            slices, statistics = (
                slices[valid_slices_mask],
                statistics[valid_slices_mask],
            )

            if (slices.shape[1] != top_k_slices.shape[1]) & (
                slices.shape[1] == 1
            ):
                slices, statistics = slices.T, statistics.T

            # evaluated candidates and previous top-k
            slices = sp.vstack([top_k_slices, slices])
            statistics = np.concatenate([top_k_statistics, statistics])

            # extract top-k
            top_slices_bool = (
                rankdata(-statistics[:, 0], method="min") <= self.k
            )
            top_k_slices, top_k_statistics = (
                slices[top_slices_bool],
                statistics[top_slices_bool],
            )
            top_slices_indices = np.argsort(-top_k_statistics[:, 0])
            top_k_slices, top_k_statistics = (
                top_k_slices[top_slices_indices],
                top_k_statistics[top_slices_indices],
            )
        return top_k_slices, top_k_statistics

    def _score_ub(
        self,
        slice_sizes_ub: np.ndarray,
        slice_errors_ub: np.ndarray,
        max_slice_errors_ub: np.ndarray,
        n_col_x_encoded: int,
    ) -> np.ndarray:
        """Compute the upper-bound score for all the slices."""
        # Since slice_scores is either monotonically increasing or decreasing, we
        # probe interesting points of slice_scores in the interval [min_sup, ss],
        # and compute the maximum to serve as the upper bound
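        # The probed sizes are: min_sup itself, the size at which the error
        # sum would be reached with maximal per-row error
        # (slice_errors_ub / max_slice_errors_ub, clamped below by min_sup),
        # and the parent-derived size upper bound (slice_sizes_ub).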
        potential_solutions = np.column_stack(
            (
                self.min_sup * np.ones(slice_sizes_ub.shape[0]),
                np.maximum(
                    slice_errors_ub / max_slice_errors_ub, self.min_sup
                ),
                slice_sizes_ub,
            )
        )
        slice_scores_ub = np.amax(
            (
                self.alpha
                * (
                    np.minimum(
                        potential_solutions.T * max_slice_errors_ub,
                        slice_errors_ub,
                    ).T
                    / self.average_error_
                    - potential_solutions
                )
                - (1 - self.alpha) * (n_col_x_encoded - potential_solutions)
            )
            / potential_solutions,
            axis=1,
        )
        return slice_scores_ub

    @staticmethod
    def _analyse_top_k(top_k_statistics: np.ndarray) -> tuple:
        """Get the maximum and the minimum slice scores."""
        max_slice_scores = min_slice_scores = -np.inf
        if top_k_statistics.shape[0] > 0:
            max_slice_scores = top_k_statistics[0, 0]
            min_slice_scores = top_k_statistics[
                top_k_statistics.shape[0] - 1, 0
            ]
        return max_slice_scores, min_slice_scores

    def _score(
        self,
        slice_sizes: np.ndarray,
        slice_errors: np.ndarray,
        n_row_x_encoded: int,
    ) -> np.ndarray:
        """Compute the score for all the slices."""
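        # Scoring function from the SliceLine paper:
        #   score = alpha * (avg_slice_error / avg_error - 1)
        #           - (1 - alpha) * (n / slice_size - 1)
        # where n is the number of rows and avg_slice_error is
        # slice_errors / slice_sizes.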
        slice_scores = self.alpha * (
            (slice_errors / slice_sizes) / self.average_error_ - 1
        ) - (1 - self.alpha) * (n_row_x_encoded / slice_sizes - 1)
        return np.nan_to_num(slice_scores, nan=-np.inf)

    def _eval_slice(
        self,
        x_encoded: sp.csr_matrix,
        errors: np.ndarray,
        slices: sp.csr_matrix,
        level: int,
    ) -> np.ndarray:
        """Compute several statistics for all the slices."""
        slice_candidates = x_encoded @ slices.T == level
        slice_sizes = slice_candidates.sum(axis=0).A[0]
        slice_errors = errors @ slice_candidates
        # Here we can't use the .A shorthand because it is not
        # implemented in all scipy versions for coo_matrix objects
        max_slice_errors = (
            slice_candidates.T.multiply(errors).max(axis=1).toarray()
        )

        # score of relative error and relative size
        slice_scores = self._score(
            slice_sizes, slice_errors, x_encoded.shape[0]
        )
        return np.column_stack(
            [slice_scores, slice_errors, max_slice_errors, slice_sizes]
        )

    def _create_and_score_basic_slices(
        self,
        x_encoded: sp.csr_matrix,
        n_col_x_encoded: int,
        errors: np.ndarray,
    ) -> Tuple[sp.csr_matrix, np.ndarray]:
        """Initialise 1-slices, i.e. slices with one predicate."""
        slice_sizes = x_encoded.sum(axis=0).A[0]
        slice_errors = errors @ x_encoded
        # Here we can't use the .A shorthand because it is not
        # implemented in all scipy versions for coo_matrix objects
        max_slice_errors = (
            x_encoded.T.multiply(errors).max(axis=1).toarray()[:, 0]
        )

        # working set of active slices (#attr x #slices) and top-k
        valid_slices_mask = (slice_sizes >= self.min_sup) & (slice_errors > 0)
        attr = np.arange(1, n_col_x_encoded + 1)[valid_slices_mask]
        slice_sizes = slice_sizes[valid_slices_mask]
        slice_errors = slice_errors[valid_slices_mask]
        max_slice_errors = max_slice_errors[valid_slices_mask]
        slices = self._dummify(attr, n_col_x_encoded)

        # score 1-slices and create initial top-k
        slice_scores = self._score(
            slice_sizes, slice_errors, x_encoded.shape[0]
        )
        statistics = np.column_stack(
            (slice_scores, slice_errors, max_slice_errors, slice_sizes)
        )

        n_col_dropped = n_col_x_encoded - sum(valid_slices_mask)
        logger.debug(
            "Dropping %i/%i features below min_sup = %i."
            % (n_col_dropped, n_col_x_encoded, self.min_sup)
        )

        return slices, statistics

    def _get_pruned_s_r(
        self, slices: sp.csr_matrix, statistics: np.ndarray
    ) -> Tuple[sp.csr_matrix, np.ndarray]:
        """Prune invalid slices.
        Does not affect overall pruning effectiveness due to the handling of missing parents.
        """
        valid_slices_mask = (statistics[:, 3] >= self.min_sup) & (
            statistics[:, 1] > 0
        )
        return slices[valid_slices_mask], statistics[valid_slices_mask]

    @staticmethod
    def _join_compatible_slices(
        slices: sp.csr_matrix, level: int
    ) -> np.ndarray:
        """Join compatible slices according to `level`."""
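        # Two (level - 1)-slices can be joined into a level-slice when they
        # share exactly level - 2 predicates; the dot product of their
        # indicator rows counts the predicates they have in common.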
        slices_int = slices.astype(int)
        # Here we can't use the .A shorthand because it is not
        # implemented in all scipy versions for coo_matrix objects
        join = (slices_int @ slices_int.T).toarray() == level - 2
        return np.triu(join, 1) * join

    @staticmethod
    def _combine_slices(
        slices: sp.csr_matrix,
        statistics: np.ndarray,
        compatible_slices: np.ndarray,
    ) -> Tuple[sp.csr_matrix, np.ndarray, np.ndarray, np.ndarray]:
        """Combine slices by exploiting parent nodes' statistics."""
        parent_1_idx, parent_2_idx = np.where(compatible_slices == 1)
        pair_candidates = slices[parent_1_idx] + slices[parent_2_idx]

        slice_errors = np.minimum(
            statistics[parent_1_idx, 1], statistics[parent_2_idx, 1]
        )
        max_slice_errors = np.minimum(
            statistics[parent_1_idx, 2], statistics[parent_2_idx, 2]
        )
        slice_sizes = np.minimum(
            statistics[parent_1_idx, 3], statistics[parent_2_idx, 3]
        )
        return pair_candidates, slice_sizes, slice_errors, max_slice_errors

    @staticmethod
    def _prune_invalid_self_joins(
        feature_offset_start: np.ndarray,
        feature_offset_end: np.ndarray,
        pair_candidates: sp.csr_matrix,
        slice_sizes: np.ndarray,
        slice_errors: np.ndarray,
        max_slice_errors: np.ndarray,
    ) -> Tuple[sp.csr_matrix, np.ndarray, np.ndarray, np.ndarray]:
        """Prune invalid self joins (>1 bit per feature)."""
        valid_slices_mask = np.full(pair_candidates.shape[0], True)
        for start, end in zip(feature_offset_start, feature_offset_end):
            valid_slices_mask = (
                valid_slices_mask
                * (pair_candidates[:, start:end].sum(axis=1) <= 1).A[:, 0]
            )
        return (
            pair_candidates[valid_slices_mask],
            slice_sizes[valid_slices_mask],
            slice_errors[valid_slices_mask],
            max_slice_errors[valid_slices_mask],
        )

    @staticmethod
    def _prepare_deduplication_and_pruning(
        feature_offset_start: np.ndarray,
        feature_offset_end: np.ndarray,
        feature_domains: np.ndarray,
        pair_candidates: sp.csr_matrix,
    ) -> np.ndarray:
        """Prepare IDs for deduplication and pruning."""
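        # Each candidate is encoded as a mixed-radix integer: for every
        # feature block, the digit is the (1-based) selected modality index,
        # or 0 when the feature is unused, with base feature_domains + 1.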
        ids = np.zeros(pair_candidates.shape[0])
        dom = feature_domains + 1
        for j, (start, end) in enumerate(
            zip(feature_offset_start, feature_offset_end)
        ):
            sub_pair_candidates = pair_candidates[:, start:end]
            # sub_pair_candidates should not contain multiple True values on the same row
            i = sub_pair_candidates.argmax(axis=1).T + np.any(
                # Here we can't use the .A shorthand because it is not
                # implemented in all scipy versions for coo_matrix objects
                sub_pair_candidates.toarray(),
                axis=1,
            )
            ids = ids + i.A * np.prod(dom[(j + 1) : dom.shape[0]])
        return ids

    def _get_pair_candidates(
        self,
        slices: sp.csr_matrix,
        statistics: np.ndarray,
        top_k_statistics: np.ndarray,
        level: int,
        n_col_x_encoded: int,
        feature_domains: np.ndarray,
        feature_offset_start: np.ndarray,
        feature_offset_end: np.ndarray,
    ) -> sp.csr_matrix:
        """Compute and prune plausible slice candidates."""
        compatible_slices = self._join_compatible_slices(slices, level)

        if np.sum(compatible_slices) == 0:
            return sp.csr_matrix(np.empty((0, slices.shape[1])))

        (
            pair_candidates,
            slice_sizes,
            slice_errors,
            max_slice_errors,
        ) = self._combine_slices(slices, statistics, compatible_slices)

        (
            pair_candidates,
            slice_sizes,
            slice_errors,
            max_slice_errors,
        ) = self._prune_invalid_self_joins(
            feature_offset_start,
            feature_offset_end,
            pair_candidates,
            slice_sizes,
            slice_errors,
            max_slice_errors,
        )

        if pair_candidates.shape[0] == 0:
            return sp.csr_matrix(np.empty((0, slices.shape[1])))

        ids = self._prepare_deduplication_and_pruning(
            feature_offset_start,
            feature_offset_end,
            feature_domains,
            pair_candidates,
        )

        # remove duplicate candidates and select corresponding statistics
        _, unique_candidate_indices, duplicate_counts = np.unique(
            ids, return_index=True, return_counts=True
        )

        # Slices at level i normally have i parents (cf. section 3.1 in the paper).
        # We want to keep only slices whose parents have not been pruned.
        # If all the parents are present, they get combined 2 by 2 in i*(i-1)/2 ways.
        # So, we select only candidates which appear with the correct cardinality.
        all_parents_mask = duplicate_counts == level * (level - 1) / 2
        unique_candidate_indices = unique_candidate_indices[all_parents_mask]

        pair_candidates = pair_candidates[unique_candidate_indices]
        slice_sizes = slice_sizes[unique_candidate_indices]
        slice_errors = slice_errors[unique_candidate_indices]
        max_slice_errors = max_slice_errors[unique_candidate_indices]

        slice_scores = self._score_ub(
            slice_sizes,
            slice_errors,
            max_slice_errors,
            n_col_x_encoded,
        )

        # This mask seems to always be fully True,
        # because _maintain_top_k already applies the slice-size filter
        pruning_sizes = slice_sizes >= self.min_sup

        _, min_slice_scores = self._analyse_top_k(top_k_statistics)

        pruning_scores = (slice_scores > min_slice_scores) & (slice_scores > 0)

        return pair_candidates[pruning_scores & pruning_sizes]

    def _search_slices(
        self,
        input_x: np.ndarray,
        errors: np.ndarray,
    ) -> None:
        """Main function of the SliceLine algorithm."""
        # prepare offset vectors and one-hot encoded input_x
        self._one_hot_encoder = OneHotEncoder(handle_unknown="ignore")
        x_encoded = self._one_hot_encoder.fit_transform(input_x)
        feature_domains: np.ndarray = np.array(
            [len(sub_array) for sub_array in self._one_hot_encoder.categories_]
        )
        feature_offset_end = np.cumsum(feature_domains)
        feature_offset_start = feature_offset_end - feature_domains

        # initialize statistics and basic slices
        n_col_x_encoded = x_encoded.shape[1]
        self.average_error_ = float(np.mean(errors))
        slices, statistics = self._create_and_score_basic_slices(
            x_encoded,
            n_col_x_encoded,
            errors,
        )

        # initialize top-k
        top_k_slices, top_k_statistics = self._maintain_top_k(
            slices,
            statistics,
            sp.csr_matrix((0, n_col_x_encoded)),
            np.zeros((0, 4)),
        )

        max_slice_scores, min_slice_scores = self._analyse_top_k(
            top_k_statistics
        )
        logger.debug(
            "Initial top-K: count=%i, max=%f, min=%f"
            % (top_k_slices.shape[0], max_slice_scores, min_slice_scores)
        )

        # lattice enumeration w/ size/error pruning, one iteration per level
        # termination condition (max #feature levels)
        level = 1
        min_condition = min(input_x.shape[1], self.max_l)
        while (
            (slices.shape[0] > 0)
            & (slices.sum() > 0)
            & (level < min_condition)
        ):
            level += 1

            # enumerate candidate join pairs, including size/error pruning
            slices, statistics = self._get_pruned_s_r(slices, statistics)
            nr_s = slices.shape[0]
            slices = self._get_pair_candidates(
                slices,
                statistics,
                top_k_statistics,
                level,
                n_col_x_encoded,
                feature_domains,
                feature_offset_start,
                feature_offset_end,
            )

            logger.debug("Level %i:" % level)
            logger.debug(
                " -- generated paired slice candidates: %i -> %i"
                % (nr_s, slices.shape[0])
            )

            # extract and evaluate candidate slices
            statistics = self._eval_slice(x_encoded, errors, slices, level)

            # maintain top-k after evaluation
            top_k_slices, top_k_statistics = self._maintain_top_k(
                slices, statistics, top_k_slices, top_k_statistics
            )

            max_slice_scores, min_slice_scores = self._analyse_top_k(
                top_k_statistics
            )
            valid = np.sum(
                (statistics[:, 3] >= self.min_sup) & (statistics[:, 1] > 0)
            )
            logger.debug(
                " -- valid slices after eval: %s/%i" % (valid, slices.shape[0])
            )
            logger.debug(
                " -- top-K: count=%i, max=%f, min=%f"
                % (top_k_slices.shape[0], max_slice_scores, min_slice_scores)
            )

        self._top_slices_enc = top_k_slices.copy()
        if top_k_slices.shape[0] == 0:
            self.top_slices_ = np.empty((0, input_x.shape[1]))
        else:
            self.top_slices_ = self._one_hot_encoder.inverse_transform(
                top_k_slices
            )

        # compute slices' average errors
        top_k_statistics = np.column_stack(
            (
                top_k_statistics,
                np.divide(top_k_statistics[:, 1], top_k_statistics[:, 3]),
            )
        )

        # transform statistics to a list of dict
        statistics_names = [
            "slice_score",
            "sum_slice_error",
            "max_slice_error",
            "slice_size",
            "slice_average_error",
        ]
        self.top_slices_statistics_ = [
            {
                stat_name: stat_value
                for stat_value, stat_name in zip(statistic, statistics_names)
            }
            for statistic in top_k_statistics
        ]

        logger.debug("Terminated at level %i." % level)
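
For reference, here is a minimal usage sketch of the class above. It is not part of the packaged file: the dataset, the error vector, and the parameter values are made-up assumptions for illustration, while `Slicefinder`, its constructor parameters, and the fitted attributes all come from the source shown here.

import numpy as np

from sliceline.slicefinder import Slicefinder

# Hypothetical inputs: a small categorical dataset and a per-sample error
# vector from some model; the model is assumed to fail on ("a", "x") rows.
X = np.array([["a", "x"], ["a", "y"], ["b", "x"], ["b", "y"]] * 25)
errors = np.array([1.0, 0.0, 0.0, 0.0] * 25)

sf = Slicefinder(alpha=0.95, k=1, max_l=2, min_sup=10, verbose=False)
sf.fit(X, errors)

print(sf.top_slices_)  # e.g. [["a", "x"]]; None marks columns unused by a slice
print(sf.top_slices_statistics_)  # score / error / size statistics per slice
slices_masks = sf.transform(X)  # shape (n_samples, number of slices found)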