sliceline 0.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,732 @@
+ """
+ The slicefinder module implements the Slicefinder class.
+ """
+
+ import logging
+ from typing import Tuple, Union
+
+ import numpy as np
+ from scipy import sparse as sp
+ from scipy.stats import rankdata
+ from sklearn.base import BaseEstimator, TransformerMixin
+ from sklearn.preprocessing import OneHotEncoder
+ from sklearn.utils.validation import check_is_fitted
+
+ from sliceline.validation import check_array, check_X_e
+
+ logger = logging.getLogger(__name__)
+ logging.basicConfig(level=logging.INFO)
+
+
+ class Slicefinder(BaseEstimator, TransformerMixin):
+     """Slicefinder class.
+
+     SliceLine is a fast, linear-algebra-based slice-finding approach for ML model debugging.
+
+     Given an input dataset (`X`) and a model error vector (`errors`), SliceLine finds
+     the `k` slices of `X` on which the model performs significantly worse.
+     A slice is a subspace of `X` defined by one or more predicates.
+
+     The maximal dimension of this subspace is controlled by `max_l`.
+
+     The slice scoring function is a linear combination of two objectives:
+         - find sufficiently large slices, with more than `min_sup` elements
+           (high impact on the overall model),
+         - with substantial errors
+           (high negative impact on the sub-group/model).
+
+     The relative importance of the two objectives is controlled through a single parameter `alpha`.
+
+     Slice enumeration and pruning are performed via sparse linear algebra.
+
+     Parameters
+     ----------
+     alpha: float, default=0.6
+         Weight parameter for the importance of the average slice error.
+         0 < `alpha` <= 1.
+
+     k: int, default=1
+         Maximum number of slices to return.
+         Note: in case of ties between the `k`-th slice score and the following ones,
+         all tied slices are returned, leading to `_n_features_out` slices returned
+         (`_n_features_out` >= `k`).
+
+     max_l: int, default=4
+         Maximum lattice level.
+         In other words: the maximum number of predicates used to define a slice.
+
+     min_sup: int or float, default=10
+         Minimum support threshold. Inspired by frequent itemset mining,
+         it ensures statistical significance. If `min_sup` is a float (0 < `min_sup` < 1),
+         it represents the fraction of the input dataset (`X`).
+
+     verbose: bool, default=True
+         Controls the verbosity.
+
+     Attributes
+     ----------
+     top_slices_: np.ndarray of shape (_n_features_out, number of columns of the input dataset)
+         The `_n_features_out` slices with the highest score.
+         `None` values in a slice mark the columns it does not use.
+
+     average_error_: float
+         Mean value of the input errors.
+
+     top_slices_statistics_: list of dict of length `len(top_slices_)`
+         The statistics of the slices found, sorted by slice score.
+         For each slice, the following statistics are stored:
+             - slice_score: the score of the slice (defined in the `_score` method)
+             - sum_slice_error: the sum of all the errors in the slice
+             - max_slice_error: the maximum of all errors in the slice
+             - slice_size: the number of elements in the slice
+             - slice_average_error: the average error in the slice (sum_slice_error / slice_size)
+
+     References
+     ----------
+     `SliceLine: Fast, Linear-Algebra-based Slice Finding for ML Model Debugging
+     <https://mboehm7.github.io/resources/sigmod2021b_sliceline.pdf>`__,
+     from *Svetlana Sagadeeva* and *Matthias Boehm* of Graz University of Technology.
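+
+     Examples
+     --------
+     A minimal, illustrative sketch on toy data; the exact slices returned
+     depend on the parametrization, so the output below is indicative only:
+
+     >>> import numpy as np
+     >>> from sliceline.slicefinder import Slicefinder
+     >>> X = np.array([["a", "x"], ["a", "y"], ["b", "x"], ["b", "y"]])
+     >>> errors = np.array([0.9, 0.8, 0.1, 0.2])
+     >>> sf = Slicefinder(alpha=0.9, k=1, min_sup=1, verbose=False)
+     >>> sf = sf.fit(X, errors)
+     >>> sf.top_slices_  # doctest: +SKIP
+     array([['a', None]], dtype=object)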
+     """
+
+     def __init__(
+         self,
+         alpha: float = 0.6,
+         k: int = 1,
+         max_l: int = 4,
+         min_sup: Union[int, float] = 10,
+         verbose: bool = True,
+     ):
+         self.alpha = alpha
+         self.k = k
+         self.max_l = max_l
+         self.min_sup = min_sup
+         self.verbose = verbose
+
+         self._one_hot_encoder = self._top_slices_enc = None
+         self.top_slices_ = self.top_slices_statistics_ = None
+         self.average_error_ = None
+
+         if self.verbose:
+             logger.setLevel(logging.DEBUG)
+         else:
+             logger.setLevel(logging.INFO)
+
+     def _check_params(self):
+         """Check transformer parameters."""
+         if not 0 < self.alpha <= 1:
+             raise ValueError(f"Invalid 'alpha' parameter: {self.alpha}")
+
+         if self.k <= 0:
+             raise ValueError(f"Invalid 'k' parameter: {self.k}")
+
+         if self.max_l <= 0:
+             raise ValueError(f"Invalid 'max_l' parameter: {self.max_l}")
+
+         if self.min_sup < 0 or (
+             isinstance(self.min_sup, float) and self.min_sup >= 1
+         ):
+             raise ValueError(f"Invalid 'min_sup' parameter: {self.min_sup}")
+
+     def _check_top_slices(self):
+         """Check if slices have been found."""
+         # Check if fit has been called
+         check_is_fitted(self)
+
+         # Check if a slice has been found
+         if self.top_slices_.size == 0:
+             raise ValueError("No transform: Sliceline did not find any slice.")
+
+     def fit(self, X, errors):
+         """Search for slice(s) on `X` based on `errors`.
+
+         Parameters
+         ----------
+         X: array-like of shape (n_samples, n_features)
+             Training data, where `n_samples` is the number of samples
+             and `n_features` is the number of features.
+
+         errors: array-like of shape (n_samples, )
+             Errors of a machine learning model.
+
+         Returns
+         -------
+         self: object
+             Returns the instance itself.
+         """
+         self._check_params()
+
+         # Update min_sup when given as a fraction of the input dataset size
+         if 0 < self.min_sup < 1:
+             self.min_sup = int(self.min_sup * len(X))
+
+         # Check that X and errors have correct shape
+         X_array, errors = check_X_e(X, errors, y_numeric=True)
+
+         self._check_feature_names(X, reset=True)
+
+         self._search_slices(X_array, errors)
+
+         return self
+
+     def transform(self, X):
+         """Generate slices masks for `X`.
+
+         Parameters
+         ----------
+         X: array-like of shape (n_samples, n_features)
+             Training data, where `n_samples` is the number of samples
+             and `n_features` is the number of features.
+
+         Returns
+         -------
+         slices_masks: np.ndarray of shape (n_samples, _n_features_out)
+             `slices_masks[i, j] == 1`: the `i`-th sample of `X` is in the `j`-th `top_slices_`.
+         """
+         self._check_top_slices()
+
+         # Input validation
+         X = check_array(X)
+
+         slices_masks = self._get_slices_masks(X)
+
+         return slices_masks.T
+
+     def get_slice(self, X, slice_index: int):
+         """Filter `X` samples according to the `slice_index`-th slice.
+
+         Parameters
+         ----------
+         X: array-like of shape (n_samples, n_features)
+             Dataset, where `n_samples` is the number of samples
+             and `n_features` is the number of features.
+
+         slice_index: int
+             Index of the slice to get from `top_slices_`.
+
+         Returns
+         -------
+         X_slice: np.ndarray of shape (n_samples in the `slice_index`-th slice, n_features)
+             The `X` samples that are in the `slice_index`-th slice.
+         """
+         self._check_top_slices()
+
+         # Input validation
+         X = check_array(X, force_all_finite=False)
+
+         slices_masks = self._get_slices_masks(X)
+
+         return X[np.where(slices_masks[slice_index])[0], :]
+
+     def get_feature_names_out(self):
+         """Get output feature names for transformation.
+
+         Returns
+         -------
+         feature_names_out : ndarray of str objects
+             The following output feature names are generated:
+             `["slice_0", "slice_1", ..., "slice_(_n_features_out)"]`.
+         """
+         check_is_fitted(self)
+
+         feature_names = [f"slice_{i}" for i in range(self._n_features_out)]
+
+         return np.array(feature_names, dtype=object)
+
+     def _get_slices_masks(self, X):
+         """Private utility function generating slices masks for `X`."""
+         X_encoded = self._one_hot_encoder.transform(X)
+
+         # Shape of X_encoded: (X.shape[0], total number of modalities in _one_hot_encoder.categories_)
+         # Shape of _top_slices_enc: (top_slices_.shape[0], X_encoded.shape[1])
+         slice_candidates = self._top_slices_enc @ X_encoded.T
+
+         # self._top_slices_enc.sum(axis=1) is the number of predicate(s) of each slice in top_slices_
+         slices_masks = (
+             slice_candidates == self._top_slices_enc.sum(axis=1)
+         ).A.astype(int)
+
+         return slices_masks
+
+     @property
+     def _n_features_out(self):
+         """Number of transformed output features."""
+         return self.top_slices_.shape[0]
+
+     @staticmethod
+     def _dummify(array: np.ndarray, n_col_x_encoded: int) -> sp.csr_matrix:
+         """Dummify `array` with respect to `n_col_x_encoded`.
+         Assumption: `array` does not contain any 0."""
+         assert (
+             0 not in array
+         ), "Modality 0 is not expected to be one-hot encoded."
+         one_hot_encoding = sp.lil_matrix(
+             (array.size, n_col_x_encoded), dtype=bool
+         )
+         one_hot_encoding[np.arange(array.size), array - 1] = True
+         return one_hot_encoding.tocsr()
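+
+     # Illustration of `_dummify` above (a hypothetical call, not executed):
+     # _dummify(np.array([1, 3]), 4) maps the 1-based modalities 1 and 3 to the
+     # one-hot rows [[1, 0, 0, 0], [0, 0, 1, 0]] of a sparse boolean CSR matrix.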
+
+     def _maintain_top_k(
+         self,
+         slices: sp.csr_matrix,
+         statistics: np.ndarray,
+         top_k_slices: sp.csr_matrix,
+         top_k_statistics: np.ndarray,
+     ) -> Tuple[sp.csr_matrix, np.ndarray]:
+         """Add new `slices` to `top_k_slices` and update the top-k slices."""
+         # prune invalid min_sup and scores
+         valid_slices_mask = (statistics[:, 3] >= self.min_sup) & (
+             statistics[:, 0] > 0
+         )
+         if np.sum(valid_slices_mask) != 0:
+             slices, statistics = (
+                 slices[valid_slices_mask],
+                 statistics[valid_slices_mask],
+             )
+
+             if (slices.shape[1] != top_k_slices.shape[1]) & (
+                 slices.shape[1] == 1
+             ):
+                 slices, statistics = slices.T, statistics.T
+
+             # merge evaluated candidates and previous top-k
+             slices = sp.vstack([top_k_slices, slices])
+             statistics = np.concatenate([top_k_statistics, statistics])
+
+             # extract top-k
+             top_slices_bool = (
+                 rankdata(-statistics[:, 0], method="min") <= self.k
+             )
+             top_k_slices, top_k_statistics = (
+                 slices[top_slices_bool],
+                 statistics[top_slices_bool],
+             )
+             top_slices_indices = np.argsort(-top_k_statistics[:, 0])
+             top_k_slices, top_k_statistics = (
+                 top_k_slices[top_slices_indices],
+                 top_k_statistics[top_slices_indices],
+             )
+         return top_k_slices, top_k_statistics
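+
+     # Note on ties (illustrative): with slice scores [5, 3, 3, 1] and k=2,
+     # rankdata(-scores, method="min") gives ranks [1, 2, 2, 4], so both tied
+     # slices are kept and three slices survive; this is why `_n_features_out`
+     # can exceed `k`.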
+
+     def _score_ub(
+         self,
+         slice_sizes_ub: np.ndarray,
+         slice_errors_ub: np.ndarray,
+         max_slice_errors_ub: np.ndarray,
+         n_col_x_encoded: int,
+     ) -> np.ndarray:
+         """Compute the upper-bound score for all the slices."""
+         # Since slice_scores is either monotonically increasing or decreasing, we
+         # probe interesting points of slice_scores in the interval [min_sup, ss],
+         # and compute the maximum to serve as the upper bound
+         potential_solutions = np.column_stack(
+             (
+                 self.min_sup * np.ones(slice_sizes_ub.shape[0]),
+                 np.maximum(
+                     slice_errors_ub / max_slice_errors_ub, self.min_sup
+                 ),
+                 slice_sizes_ub,
+             )
+         )
+         slice_scores_ub = np.amax(
+             (
+                 self.alpha
+                 * (
+                     np.minimum(
+                         potential_solutions.T * max_slice_errors_ub,
+                         slice_errors_ub,
+                     ).T
+                     / self.average_error_
+                     - potential_solutions
+                 )
+                 - (1 - self.alpha) * (n_col_x_encoded - potential_solutions)
+             )
+             / potential_solutions,
+             axis=1,
+         )
+         return slice_scores_ub
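+
+     # Probing illustration (hypothetical numbers): with min_sup=10 and a parent
+     # bound of slice_errors_ub=4.0, max_slice_errors_ub=0.5, slice_sizes_ub=30,
+     # the probed sizes are min_sup=10, max(4.0 / 0.5, 10)=10 and 30; the upper
+     # bound is the maximum of the score evaluated at these candidate sizes.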
+
+     @staticmethod
+     def _analyse_top_k(top_k_statistics: np.ndarray) -> tuple:
+         """Get the maximum and the minimum slice scores."""
+         max_slice_scores = min_slice_scores = -np.inf
+         if top_k_statistics.shape[0] > 0:
+             max_slice_scores = top_k_statistics[0, 0]
+             min_slice_scores = top_k_statistics[
+                 top_k_statistics.shape[0] - 1, 0
+             ]
+         return max_slice_scores, min_slice_scores
+
+     def _score(
+         self,
+         slice_sizes: np.ndarray,
+         slice_errors: np.ndarray,
+         n_row_x_encoded: int,
+     ) -> np.ndarray:
+         """Compute the score for all the slices."""
+         slice_scores = self.alpha * (
+             (slice_errors / slice_sizes) / self.average_error_ - 1
+         ) - (1 - self.alpha) * (n_row_x_encoded / slice_sizes - 1)
+         return np.nan_to_num(slice_scores, nan=-np.inf)
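+
+     # Worked example (hypothetical numbers): with alpha=0.6, average_error_=0.1
+     # and n_row_x_encoded=100, a slice of size 20 with a summed error of 4.0
+     # has an average error of 0.2, hence
+     #     0.6 * (0.2 / 0.1 - 1) - 0.4 * (100 / 20 - 1) = 0.6 - 1.6 = -1.0:
+     # twice the average error, but too small to score positively at this alpha.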
+
+     def _eval_slice(
+         self,
+         x_encoded: sp.csr_matrix,
+         errors: np.ndarray,
+         slices: sp.csr_matrix,
+         level: int,
+     ) -> np.ndarray:
+         """Compute several statistics for all the slices."""
+         slice_candidates = x_encoded @ slices.T == level
+         slice_sizes = slice_candidates.sum(axis=0).A[0]
+         slice_errors = errors @ slice_candidates
+         # Here we can't use the .A shorthand because it is not
+         # implemented in all scipy versions for coo_matrix objects
+         max_slice_errors = (
+             slice_candidates.T.multiply(errors).max(axis=1).toarray()
+         )
+
+         # score of relative error and relative size
+         slice_scores = self._score(
+             slice_sizes, slice_errors, x_encoded.shape[0]
+         )
+         return np.column_stack(
+             [slice_scores, slice_errors, max_slice_errors, slice_sizes]
+         )
+
+     def _create_and_score_basic_slices(
+         self,
+         x_encoded: sp.csr_matrix,
+         n_col_x_encoded: int,
+         errors: np.ndarray,
+     ) -> Tuple[sp.csr_matrix, np.ndarray]:
+         """Initialise 1-slices, i.e. slices with one predicate."""
+         slice_sizes = x_encoded.sum(axis=0).A[0]
+         slice_errors = errors @ x_encoded
+         # Here we can't use the .A shorthand because it is not
+         # implemented in all scipy versions for coo_matrix objects
+         max_slice_errors = (
+             x_encoded.T.multiply(errors).max(axis=1).toarray()[:, 0]
+         )
+
+         # working set of active slices (#attr x #slices) and top-k
+         valid_slices_mask = (slice_sizes >= self.min_sup) & (slice_errors > 0)
+         attr = np.arange(1, n_col_x_encoded + 1)[valid_slices_mask]
+         slice_sizes = slice_sizes[valid_slices_mask]
+         slice_errors = slice_errors[valid_slices_mask]
+         max_slice_errors = max_slice_errors[valid_slices_mask]
+         slices = self._dummify(attr, n_col_x_encoded)
+
+         # score 1-slices and create initial top-k
+         slice_scores = self._score(
+             slice_sizes, slice_errors, x_encoded.shape[0]
+         )
+         statistics = np.column_stack(
+             (slice_scores, slice_errors, max_slice_errors, slice_sizes)
+         )
+
+         n_col_dropped = n_col_x_encoded - sum(valid_slices_mask)
+         logger.debug(
+             "Dropping %i/%i features below min_sup = %i."
+             % (n_col_dropped, n_col_x_encoded, self.min_sup)
+         )
+
+         return slices, statistics
+
+     def _get_pruned_s_r(
+         self, slices: sp.csr_matrix, statistics: np.ndarray
+     ) -> Tuple[sp.csr_matrix, np.ndarray]:
+         """Prune invalid slices.
+         Does not affect overall pruning effectiveness due to the handling of missing parents.
+         """
+         valid_slices_mask = (statistics[:, 3] >= self.min_sup) & (
+             statistics[:, 1] > 0
+         )
+         return slices[valid_slices_mask], statistics[valid_slices_mask]
+
+     @staticmethod
+     def _join_compatible_slices(
+         slices: sp.csr_matrix, level: int
+     ) -> np.ndarray:
+         """Join compatible slices according to `level`."""
+         slices_int = slices.astype(int)
+         # Here we can't use the .A shorthand because it is not
+         # implemented in all scipy versions for coo_matrix objects
+         join = (slices_int @ slices_int.T).toarray() == level - 2
+         return np.triu(join, 1) * join
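+
+     # Illustration: at level=3, two 2-predicate slices are join-compatible
+     # when they share exactly one predicate, i.e. the dot product of their
+     # indicator rows equals level - 2 = 1; np.triu(..., 1) keeps each
+     # unordered pair once and drops self-joins.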
+
+     @staticmethod
+     def _combine_slices(
+         slices: sp.csr_matrix,
+         statistics: np.ndarray,
+         compatible_slices: np.ndarray,
+     ) -> Tuple[sp.csr_matrix, np.ndarray, np.ndarray, np.ndarray]:
+         """Combine slices by exploiting parent node statistics."""
+         parent_1_idx, parent_2_idx = np.where(compatible_slices == 1)
+         pair_candidates = slices[parent_1_idx] + slices[parent_2_idx]
+
+         slice_errors = np.minimum(
+             statistics[parent_1_idx, 1], statistics[parent_2_idx, 1]
+         )
+         max_slice_errors = np.minimum(
+             statistics[parent_1_idx, 2], statistics[parent_2_idx, 2]
+         )
+         slice_sizes = np.minimum(
+             statistics[parent_1_idx, 3], statistics[parent_2_idx, 3]
+         )
+         return pair_candidates, slice_sizes, slice_errors, max_slice_errors
+
+     @staticmethod
+     def _prune_invalid_self_joins(
+         feature_offset_start: np.ndarray,
+         feature_offset_end: np.ndarray,
+         pair_candidates: sp.csr_matrix,
+         slice_sizes: np.ndarray,
+         slice_errors: np.ndarray,
+         max_slice_errors: np.ndarray,
+     ) -> Tuple[sp.csr_matrix, np.ndarray, np.ndarray, np.ndarray]:
+         """Prune invalid self joins (>1 bit per feature)."""
+         valid_slices_mask = np.full(pair_candidates.shape[0], True)
+         for start, end in zip(feature_offset_start, feature_offset_end):
+             valid_slices_mask = (
+                 valid_slices_mask
+                 * (pair_candidates[:, start:end].sum(axis=1) <= 1).A[:, 0]
+             )
+         return (
+             pair_candidates[valid_slices_mask],
+             slice_sizes[valid_slices_mask],
+             slice_errors[valid_slices_mask],
+             max_slice_errors[valid_slices_mask],
+         )
+
+     @staticmethod
+     def _prepare_deduplication_and_pruning(
+         feature_offset_start: np.ndarray,
+         feature_offset_end: np.ndarray,
+         feature_domains: np.ndarray,
+         pair_candidates: sp.csr_matrix,
+     ) -> np.ndarray:
+         """Prepare IDs for deduplication and pruning."""
+         ids = np.zeros(pair_candidates.shape[0])
+         dom = feature_domains + 1
+         for j, (start, end) in enumerate(
+             zip(feature_offset_start, feature_offset_end)
+         ):
+             sub_pair_candidates = pair_candidates[:, start:end]
+             # sub_pair_candidates should not contain more than one True per row
+             i = sub_pair_candidates.argmax(axis=1).T + np.any(
+                 # Here we can't use the .A shorthand because it is not
+                 # implemented in all scipy versions for coo_matrix objects
+                 sub_pair_candidates.toarray(),
+                 axis=1,
+             )
+             ids = ids + i.A * np.prod(dom[(j + 1) : dom.shape[0]])
+         return ids
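+
+     # Encoding illustration (hypothetical): with two features of domains
+     # [2, 3], dom = [3, 4]; a candidate using modality 2 of the first feature
+     # and modality 1 of the second gets id = 2 * 4 + 1 = 9. This mixed-radix
+     # encoding is unique per predicate combination, so duplicate pair
+     # candidates can be detected with np.unique.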
+
+     def _get_pair_candidates(
+         self,
+         slices: sp.csr_matrix,
+         statistics: np.ndarray,
+         top_k_statistics: np.ndarray,
+         level: int,
+         n_col_x_encoded: int,
+         feature_domains: np.ndarray,
+         feature_offset_start: np.ndarray,
+         feature_offset_end: np.ndarray,
+     ) -> sp.csr_matrix:
+         """Compute and prune plausible slice candidates."""
+         compatible_slices = self._join_compatible_slices(slices, level)
+
+         if np.sum(compatible_slices) == 0:
+             return sp.csr_matrix(np.empty((0, slices.shape[1])))
+
+         (
+             pair_candidates,
+             slice_sizes,
+             slice_errors,
+             max_slice_errors,
+         ) = self._combine_slices(slices, statistics, compatible_slices)
+
+         (
+             pair_candidates,
+             slice_sizes,
+             slice_errors,
+             max_slice_errors,
+         ) = self._prune_invalid_self_joins(
+             feature_offset_start,
+             feature_offset_end,
+             pair_candidates,
+             slice_sizes,
+             slice_errors,
+             max_slice_errors,
+         )
+
+         if pair_candidates.shape[0] == 0:
+             return sp.csr_matrix(np.empty((0, slices.shape[1])))
+
+         ids = self._prepare_deduplication_and_pruning(
+             feature_offset_start,
+             feature_offset_end,
+             feature_domains,
+             pair_candidates,
+         )
+
+         # remove duplicate candidates and select corresponding statistics
+         _, unique_candidate_indices, duplicate_counts = np.unique(
+             ids, return_index=True, return_counts=True
+         )
+
+         # Slices at level i normally have i parents (cf. section 3.1 in the paper).
+         # We want to keep only slices whose parents have not been pruned.
+         # If all the parents are present, they get combined two by two in i*(i-1)/2 ways,
+         # so we select only candidates that appear with the correct cardinality.
+         all_parents_mask = duplicate_counts == level * (level - 1) / 2
+         unique_candidate_indices = unique_candidate_indices[all_parents_mask]
+
+         pair_candidates = pair_candidates[unique_candidate_indices]
+         slice_sizes = slice_sizes[unique_candidate_indices]
+         slice_errors = slice_errors[unique_candidate_indices]
+         max_slice_errors = max_slice_errors[unique_candidate_indices]
+
+         slice_scores = self._score_ub(
+             slice_sizes,
+             slice_errors,
+             max_slice_errors,
+             n_col_x_encoded,
+         )
+
+         # pruning_sizes seems to be always fully True, because
+         # _maintain_top_k already applies the slice_sizes filter
+         pruning_sizes = slice_sizes >= self.min_sup
+
+         _, min_slice_scores = self._analyse_top_k(top_k_statistics)
+
+         pruning_scores = (slice_scores > min_slice_scores) & (slice_scores > 0)
+
+         return pair_candidates[pruning_scores & pruning_sizes]
+
+     def _search_slices(
+         self,
+         input_x: np.ndarray,
+         errors: np.ndarray,
+     ) -> None:
+         """Main function of the SliceLine algorithm."""
+         # prepare offset vectors and one-hot encoded input_x
+         self._one_hot_encoder = OneHotEncoder(handle_unknown="ignore")
+         x_encoded = self._one_hot_encoder.fit_transform(input_x)
+         feature_domains: np.ndarray = np.array(
+             [len(sub_array) for sub_array in self._one_hot_encoder.categories_]
+         )
+         feature_offset_end = np.cumsum(feature_domains)
+         feature_offset_start = feature_offset_end - feature_domains
+
+         # initialize statistics and basic slices
+         n_col_x_encoded = x_encoded.shape[1]
+         self.average_error_ = float(np.mean(errors))
+         slices, statistics = self._create_and_score_basic_slices(
+             x_encoded,
+             n_col_x_encoded,
+             errors,
+         )
+
+         # initialize top-k
+         top_k_slices, top_k_statistics = self._maintain_top_k(
+             slices,
+             statistics,
+             sp.csr_matrix((0, n_col_x_encoded)),
+             np.zeros((0, 4)),
+         )
+
+         max_slice_scores, min_slice_scores = self._analyse_top_k(
+             top_k_statistics
+         )
+         logger.debug(
+             "Initial top-K: count=%i, max=%f, min=%f"
+             % (top_k_slices.shape[0], max_slice_scores, min_slice_scores)
+         )
+
+         # lattice enumeration w/ size/error pruning, one iteration per level
+         # termination condition (max #feature levels)
+         level = 1
+         min_condition = min(input_x.shape[1], self.max_l)
+         while (
+             (slices.shape[0] > 0)
+             & (slices.sum() > 0)
+             & (level < min_condition)
+         ):
+             level += 1
+
+             # enumerate candidate join pairs, including size/error pruning
+             slices, statistics = self._get_pruned_s_r(slices, statistics)
+             nr_s = slices.shape[0]
+             slices = self._get_pair_candidates(
+                 slices,
+                 statistics,
+                 top_k_statistics,
+                 level,
+                 n_col_x_encoded,
+                 feature_domains,
+                 feature_offset_start,
+                 feature_offset_end,
+             )
+
+             logger.debug("Level %i:" % level)
+             logger.debug(
+                 " -- generated paired slice candidates: %i -> %i"
+                 % (nr_s, slices.shape[0])
+             )
+
+             # extract and evaluate candidate slices
+             statistics = self._eval_slice(x_encoded, errors, slices, level)
+
+             # maintain top-k after evaluation
+             top_k_slices, top_k_statistics = self._maintain_top_k(
+                 slices, statistics, top_k_slices, top_k_statistics
+             )
+
+             max_slice_scores, min_slice_scores = self._analyse_top_k(
+                 top_k_statistics
+             )
+             valid = np.sum(
+                 (statistics[:, 3] >= self.min_sup) & (statistics[:, 1] > 0)
+             )
+             logger.debug(
+                 " -- valid slices after eval: %s/%i" % (valid, slices.shape[0])
+             )
+             logger.debug(
+                 " -- top-K: count=%i, max=%f, min=%f"
+                 % (top_k_slices.shape[0], max_slice_scores, min_slice_scores)
+             )
+
+         self._top_slices_enc = top_k_slices.copy()
+         if top_k_slices.shape[0] == 0:
+             self.top_slices_ = np.empty((0, input_x.shape[1]))
+         else:
+             self.top_slices_ = self._one_hot_encoder.inverse_transform(
+                 top_k_slices
+             )
+
+         # compute slices' average errors
+         top_k_statistics = np.column_stack(
+             (
+                 top_k_statistics,
+                 np.divide(top_k_statistics[:, 1], top_k_statistics[:, 3]),
+             )
+         )
+
+         # transform statistics to a list of dicts
+         statistics_names = [
+             "slice_score",
+             "sum_slice_error",
+             "max_slice_error",
+             "slice_size",
+             "slice_average_error",
+         ]
+         self.top_slices_statistics_ = [
+             {
+                 stat_name: stat_value
+                 for stat_value, stat_name in zip(statistic, statistics_names)
+             }
+             for statistic in top_k_statistics
+         ]
+
+         logger.debug("Terminated at level %i." % level)