sliceline 0.2.11__tar.gz → 0.2.13__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sliceline-0.2.11 → sliceline-0.2.13}/PKG-INFO +11 -4
- sliceline-0.2.13/pyproject.toml +69 -0
- {sliceline-0.2.11 → sliceline-0.2.13}/sliceline/__init__.py +1 -1
- {sliceline-0.2.11 → sliceline-0.2.13}/sliceline/slicefinder.py +110 -33
- {sliceline-0.2.11 → sliceline-0.2.13}/sliceline/validation.py +14 -5
- sliceline-0.2.11/pyproject.toml +0 -31
- {sliceline-0.2.11 → sliceline-0.2.13}/LICENSE +0 -0
- {sliceline-0.2.11 → sliceline-0.2.13}/README.rst +0 -0
|
@@ -1,18 +1,25 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: sliceline
|
|
3
|
-
Version: 0.2.11
|
|
3
|
+
Version: 0.2.13
|
|
4
4
|
Summary: ✂️ Fast slice finding for Machine Learning model debugging.
|
|
5
5
|
Home-page: https://github.com/DataDome/sliceline
|
|
6
6
|
License: BSD-3-Clause
|
|
7
7
|
Author: Antoine de Daran
|
|
8
|
-
Requires-Python: >=3.7,<3.10
|
|
8
|
+
Requires-Python: >=3.7,<3.12
|
|
9
9
|
Classifier: License :: OSI Approved :: BSD License
|
|
10
10
|
Classifier: Programming Language :: Python :: 3
|
|
11
11
|
Classifier: Programming Language :: Python :: 3.7
|
|
12
12
|
Classifier: Programming Language :: Python :: 3.8
|
|
13
13
|
Classifier: Programming Language :: Python :: 3.9
|
|
14
|
-
|
|
15
|
-
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
16
|
+
Requires-Dist: numpy (>=1.21,<2.0) ; python_version < "3.9"
|
|
17
|
+
Requires-Dist: numpy (>=1.25,<2.0) ; python_version >= "3.9"
|
|
18
|
+
Requires-Dist: scikit-learn (>=1,<2) ; python_version < "3.8"
|
|
19
|
+
Requires-Dist: scikit-learn (>=1.3,<2.0) ; python_version >= "3.8" and python_version < "3.9"
|
|
20
|
+
Requires-Dist: scikit-learn (>=1.4,<2.0) ; python_version >= "3.9"
|
|
21
|
+
Requires-Dist: scipy (>=1,<2) ; python_version < "3.9"
|
|
22
|
+
Requires-Dist: scipy (>=1.12,<2.0) ; python_version >= "3.9"
|
|
16
23
|
Project-URL: Documentation, https://sliceline.readthedocs.io/en/stable/
|
|
17
24
|
Project-URL: Repository, https://github.com/DataDome/sliceline
|
|
18
25
|
Description-Content-Type: text/x-rst
|
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["poetry_core>=1.0.0"]
|
|
3
|
+
build-backend = "poetry.core.masonry.api"
|
|
4
|
+
|
|
5
|
+
[tool.poetry]
|
|
6
|
+
name = "sliceline"
|
|
7
|
+
version = "0.2.13" # This version is a generic placeholder. It should not be changed.
|
|
8
|
+
description = "✂️ Fast slice finding for Machine Learning model debugging."
|
|
9
|
+
authors = ["Antoine de Daran"]
|
|
10
|
+
readme = "README.rst"
|
|
11
|
+
license = "BSD-3-Clause"
|
|
12
|
+
repository = "https://github.com/DataDome/sliceline"
|
|
13
|
+
documentation = "https://sliceline.readthedocs.io/en/stable/"
|
|
14
|
+
|
|
15
|
+
[tool.poetry.dependencies]
|
|
16
|
+
python = ">=3.7, <3.12"
|
|
17
|
+
numpy = [
|
|
18
|
+
{ version = "^1.21", python = "<3.9" },
|
|
19
|
+
{ version = "^1.25", python = ">=3.9" },
|
|
20
|
+
]
|
|
21
|
+
scikit-learn = [
|
|
22
|
+
{ version = "^1", python = "<3.8" },
|
|
23
|
+
{ version = "^1.3", python = ">=3.8,<3.9" },
|
|
24
|
+
{ version = "^1.4", python = ">=3.9" },
|
|
25
|
+
]
|
|
26
|
+
scipy = [
|
|
27
|
+
{ version = "^1", python = "<3.9" },
|
|
28
|
+
{ version = "^1.12", python = ">=3.9" },
|
|
29
|
+
]
|
|
30
|
+
|
|
31
|
+
[tool.poetry.dev-dependencies]
|
|
32
|
+
black = [
|
|
33
|
+
{ version = "^23", python = "<3.8" },
|
|
34
|
+
{ version = "^24", python = ">=3.8" },
|
|
35
|
+
]
|
|
36
|
+
flake8 = [
|
|
37
|
+
{ version = "^5", python = "<3.8.1" },
|
|
38
|
+
{ version = "^6", python = ">=3.8.1" },
|
|
39
|
+
]
|
|
40
|
+
jupyter = "^1.0.0"
|
|
41
|
+
matplotlib = [
|
|
42
|
+
{ version = "^3", python = "<3.8" },
|
|
43
|
+
{ version = "^3.7", python = ">=3.8,<3.9" },
|
|
44
|
+
{ version = "^3.8", python = ">=3.9" },
|
|
45
|
+
]
|
|
46
|
+
nbconvert = "^6.5.0"
|
|
47
|
+
optbinning = "^0.15.0"
|
|
48
|
+
pandas = [
|
|
49
|
+
{ version = "^1", python = "<3.9" },
|
|
50
|
+
{ version = "^1.5", python = ">=3.9" },
|
|
51
|
+
]
|
|
52
|
+
pytest = "^6.2.5"
|
|
53
|
+
pytest-benchmark = "^3.4.1"
|
|
54
|
+
pytest-cov = "^3.0.0"
|
|
55
|
+
Sphinx = "^4.0.0"
|
|
56
|
+
sphinx-rtd-theme = "^1.0.0"
|
|
57
|
+
|
|
58
|
+
[tool.black]
|
|
59
|
+
line-length = 79
|
|
60
|
+
include = '\.pyi?$'
|
|
61
|
+
|
|
62
|
+
[tool.isort]
|
|
63
|
+
profile = "black"
|
|
64
|
+
|
|
65
|
+
[tool.coverage.run]
|
|
66
|
+
omit = [".*", "*/site-packages/*", "tests/*", "*/validation.py"]
|
|
67
|
+
|
|
68
|
+
[tool.coverage.report]
|
|
69
|
+
fail_under = 80
|
|
@@ -24,8 +24,8 @@ class Slicefinder(BaseEstimator, TransformerMixin):
|
|
|
24
24
|
|
|
25
25
|
Given an input dataset (`X`) and a model error vector (`errors`), SliceLine finds
|
|
26
26
|
the `k` slices in `X` that identify where the model performs significantly worse.
|
|
27
|
-
A slice is a subspace of `X` defined by one or more predicates.
|
|
28
|
-
of this subspace is controlled by `max_l`.
|
|
27
|
+
A slice is a subspace of `X` defined by one or more predicates.
|
|
28
|
+
The maximal dimension of this subspace is controlled by `max_l`.
|
|
29
29
|
|
|
30
30
|
The slice scoring function is the linear combination of two objectives:
|
|
31
31
|
- Find sufficiently large slices, with more than `min_sup` elements
|
|
@@ -55,7 +55,8 @@ class Slicefinder(BaseEstimator, TransformerMixin):
|
|
|
55
55
|
min_sup: int or float, default=10
|
|
56
56
|
Minimum support threshold.
|
|
57
57
|
Inspired by frequent itemset mining, it ensures statistical significance.
|
|
58
|
-
If `min_sup` is a float (0 < `min_sup` < 1),
|
|
58
|
+
If `min_sup` is a float (0 < `min_sup` < 1),
|
|
59
|
+
it represents the fraction of the input dataset (`X`).
|
|
59
60
|
|
|
60
61
|
verbose: bool, default=True
|
|
61
62
|
Controls the verbosity.
|
|
@@ -69,9 +70,19 @@ class Slicefinder(BaseEstimator, TransformerMixin):
|
|
|
69
70
|
average_error_: float
|
|
70
71
|
Mean value of the input error.
|
|
71
72
|
|
|
73
|
+
top_slices_statistics_: list of dict of length `len(top_slices_)`
|
|
74
|
+
The statistics of the slices found sorted by slice's scores.
|
|
75
|
+
For each slice, the following statistics are stored:
|
|
76
|
+
- slice_score: the score of the slice (defined in `_score` method)
|
|
77
|
+
- sum_slice_error: the sum of all the errors in the slice
|
|
78
|
+
- max_slice_error: the maximum of all errors in the slice
|
|
79
|
+
- slice_size: the number of elements in the slice
|
|
80
|
+
- slice_average_error: the average error in the slice (sum_slice_error / slice_size)
|
|
81
|
+
|
|
72
82
|
References
|
|
73
83
|
----------
|
|
74
|
-
`SliceLine: Fast, Linear-Algebra-based Slice Finding for ML Model Debugging
|
|
84
|
+
`SliceLine: Fast, Linear-Algebra-based Slice Finding for ML Model Debugging
|
|
85
|
+
<https://mboehm7.github.io/resources/sigmod2021b_sliceline.pdf>`__,
|
|
75
86
|
from *Svetlana Sagadeeva* and *Matthias Boehm* of Graz University of Technology.
|
|
76
87
|
"""
|
|
77
88
|
|
|
@@ -90,7 +101,8 @@ class Slicefinder(BaseEstimator, TransformerMixin):
|
|
|
90
101
|
self.verbose = verbose
|
|
91
102
|
|
|
92
103
|
self._one_hot_encoder = self._top_slices_enc = None
|
|
93
|
-
self.top_slices_ = self.
|
|
104
|
+
self.top_slices_ = self.top_slices_statistics_ = None
|
|
105
|
+
self.average_error_ = None
|
|
94
106
|
|
|
95
107
|
if self.verbose:
|
|
96
108
|
logger.setLevel(logging.DEBUG)
|
|
@@ -108,9 +120,8 @@ class Slicefinder(BaseEstimator, TransformerMixin):
|
|
|
108
120
|
if self.max_l <= 0:
|
|
109
121
|
raise ValueError(f"Invalid 'max_l' parameter: {self.max_l}")
|
|
110
122
|
|
|
111
|
-
if (
|
|
112
|
-
self.min_sup
|
|
113
|
-
(isinstance(self.min_sup, float) and self.min_sup >= 1)
|
|
123
|
+
if self.min_sup < 0 or (
|
|
124
|
+
isinstance(self.min_sup, float) and self.min_sup >= 1
|
|
114
125
|
):
|
|
115
126
|
raise ValueError(f"Invalid 'min_sup' parameter: {self.min_sup}")
|
|
116
127
|
|
|
@@ -184,7 +195,7 @@ class Slicefinder(BaseEstimator, TransformerMixin):
|
|
|
184
195
|
Parameters
|
|
185
196
|
----------
|
|
186
197
|
X: array-like of shape (n_samples, n_features)
|
|
187
|
-
|
|
198
|
+
Dataset, where `n_samples` is the number of samples
|
|
188
199
|
and `n_features` is the number of features.
|
|
189
200
|
|
|
190
201
|
slice_index: int
|
|
@@ -198,7 +209,7 @@ class Slicefinder(BaseEstimator, TransformerMixin):
|
|
|
198
209
|
self._check_top_slices()
|
|
199
210
|
|
|
200
211
|
# Input validation
|
|
201
|
-
X = check_array(X)
|
|
212
|
+
X = check_array(X, force_all_finite=False)
|
|
202
213
|
|
|
203
214
|
slices_masks = self._get_slices_masks(X)
|
|
204
215
|
|
|
@@ -228,9 +239,9 @@ class Slicefinder(BaseEstimator, TransformerMixin):
|
|
|
228
239
|
slice_candidates = self._top_slices_enc @ X_encoded.T
|
|
229
240
|
|
|
230
241
|
# self._top_slices_enc.sum(axis=1) is the number of predicate(s) for each top_slices_
|
|
231
|
-
slices_masks = (
|
|
232
|
-
|
|
233
|
-
)
|
|
242
|
+
slices_masks = (
|
|
243
|
+
slice_candidates == self._top_slices_enc.sum(axis=1)
|
|
244
|
+
).A.astype(int)
|
|
234
245
|
|
|
235
246
|
return slices_masks
|
|
236
247
|
|
|
@@ -243,8 +254,12 @@ class Slicefinder(BaseEstimator, TransformerMixin):
|
|
|
243
254
|
def _dummify(array: np.ndarray, n_col_x_encoded: int) -> sp.csr_matrix:
|
|
244
255
|
"""Dummify `array` with respect to `n_col_x_encoded`.
|
|
245
256
|
Assumption: `array` does not contain any 0."""
|
|
246
|
-
assert
|
|
247
|
-
|
|
257
|
+
assert (
|
|
258
|
+
0 not in array
|
|
259
|
+
), "Modality 0 is not expected to be one-hot encoded."
|
|
260
|
+
one_hot_encoding = sp.lil_matrix(
|
|
261
|
+
(array.size, n_col_x_encoded), dtype=bool
|
|
262
|
+
)
|
|
248
263
|
one_hot_encoding[np.arange(array.size), array - 1] = True
|
|
249
264
|
return one_hot_encoding.tocsr()
|
|
250
265
|
|
|
@@ -257,14 +272,18 @@ class Slicefinder(BaseEstimator, TransformerMixin):
|
|
|
257
272
|
) -> Tuple[sp.csr_matrix, np.ndarray]:
|
|
258
273
|
"""Add new `slices` to `top_k_slices` and update the top-k slices."""
|
|
259
274
|
# prune invalid min_sup and scores
|
|
260
|
-
valid_slices_mask = (statistics[:, 3] >= self.min_sup) & (
|
|
275
|
+
valid_slices_mask = (statistics[:, 3] >= self.min_sup) & (
|
|
276
|
+
statistics[:, 0] > 0
|
|
277
|
+
)
|
|
261
278
|
if np.sum(valid_slices_mask) != 0:
|
|
262
279
|
slices, statistics = (
|
|
263
280
|
slices[valid_slices_mask],
|
|
264
281
|
statistics[valid_slices_mask],
|
|
265
282
|
)
|
|
266
283
|
|
|
267
|
-
if (slices.shape[1] != top_k_slices.shape[1]) & (
|
|
284
|
+
if (slices.shape[1] != top_k_slices.shape[1]) & (
|
|
285
|
+
slices.shape[1] == 1
|
|
286
|
+
):
|
|
268
287
|
slices, statistics = slices.T, statistics.T
|
|
269
288
|
|
|
270
289
|
# evaluated candidates and previous top-k
|
|
@@ -272,7 +291,9 @@ class Slicefinder(BaseEstimator, TransformerMixin):
|
|
|
272
291
|
statistics = np.concatenate([top_k_statistics, statistics])
|
|
273
292
|
|
|
274
293
|
# extract top-k
|
|
275
|
-
top_slices_bool =
|
|
294
|
+
top_slices_bool = (
|
|
295
|
+
rankdata(-statistics[:, 0], method="min") <= self.k
|
|
296
|
+
)
|
|
276
297
|
top_k_slices, top_k_statistics = (
|
|
277
298
|
slices[top_slices_bool],
|
|
278
299
|
statistics[top_slices_bool],
|
|
@@ -298,7 +319,9 @@ class Slicefinder(BaseEstimator, TransformerMixin):
|
|
|
298
319
|
potential_solutions = np.column_stack(
|
|
299
320
|
(
|
|
300
321
|
self.min_sup * np.ones(slice_sizes_ub.shape[0]),
|
|
301
|
-
np.maximum(
|
|
322
|
+
np.maximum(
|
|
323
|
+
slice_errors_ub / max_slice_errors_ub, self.min_sup
|
|
324
|
+
),
|
|
302
325
|
slice_sizes_ub,
|
|
303
326
|
)
|
|
304
327
|
)
|
|
@@ -307,7 +330,8 @@ class Slicefinder(BaseEstimator, TransformerMixin):
|
|
|
307
330
|
self.alpha
|
|
308
331
|
* (
|
|
309
332
|
np.minimum(
|
|
310
|
-
potential_solutions.T * max_slice_errors_ub,
|
|
333
|
+
potential_solutions.T * max_slice_errors_ub,
|
|
334
|
+
slice_errors_ub,
|
|
311
335
|
).T
|
|
312
336
|
/ self.average_error_
|
|
313
337
|
- potential_solutions
|
|
@@ -325,7 +349,9 @@ class Slicefinder(BaseEstimator, TransformerMixin):
|
|
|
325
349
|
max_slice_scores = min_slice_scores = -np.inf
|
|
326
350
|
if top_k_statistics.shape[0] > 0:
|
|
327
351
|
max_slice_scores = top_k_statistics[0, 0]
|
|
328
|
-
min_slice_scores = top_k_statistics[
|
|
352
|
+
min_slice_scores = top_k_statistics[
|
|
353
|
+
top_k_statistics.shape[0] - 1, 0
|
|
354
|
+
]
|
|
329
355
|
return max_slice_scores, min_slice_scores
|
|
330
356
|
|
|
331
357
|
def _score(
|
|
@@ -354,7 +380,9 @@ class Slicefinder(BaseEstimator, TransformerMixin):
|
|
|
354
380
|
max_slice_errors = slice_candidates.T.multiply(errors).max(axis=1).A
|
|
355
381
|
|
|
356
382
|
# score of relative error and relative size
|
|
357
|
-
slice_scores = self._score(
|
|
383
|
+
slice_scores = self._score(
|
|
384
|
+
slice_sizes, slice_errors, x_encoded.shape[0]
|
|
385
|
+
)
|
|
358
386
|
return np.column_stack(
|
|
359
387
|
[slice_scores, slice_errors, max_slice_errors, slice_sizes]
|
|
360
388
|
)
|
|
@@ -379,7 +407,9 @@ class Slicefinder(BaseEstimator, TransformerMixin):
|
|
|
379
407
|
slices = self._dummify(attr, n_col_x_encoded)
|
|
380
408
|
|
|
381
409
|
# score 1-slices and create initial top-k
|
|
382
|
-
slice_scores = self._score(
|
|
410
|
+
slice_scores = self._score(
|
|
411
|
+
slice_sizes, slice_errors, x_encoded.shape[0]
|
|
412
|
+
)
|
|
383
413
|
statistics = np.column_stack(
|
|
384
414
|
(slice_scores, slice_errors, max_slice_errors, slice_sizes)
|
|
385
415
|
)
|
|
@@ -397,11 +427,15 @@ class Slicefinder(BaseEstimator, TransformerMixin):
|
|
|
397
427
|
) -> Tuple[sp.csr_matrix, np.ndarray]:
|
|
398
428
|
"""Prune invalid slices.
|
|
399
429
|
Do not affect overall pruning effectiveness due to handling of missing parents."""
|
|
400
|
-
valid_slices_mask = (statistics[:, 3] >= self.min_sup) & (
|
|
430
|
+
valid_slices_mask = (statistics[:, 3] >= self.min_sup) & (
|
|
431
|
+
statistics[:, 1] > 0
|
|
432
|
+
)
|
|
401
433
|
return slices[valid_slices_mask], statistics[valid_slices_mask]
|
|
402
434
|
|
|
403
435
|
@staticmethod
|
|
404
|
-
def _join_compatible_slices(
|
|
436
|
+
def _join_compatible_slices(
|
|
437
|
+
slices: sp.csr_matrix, level: int
|
|
438
|
+
) -> np.ndarray:
|
|
405
439
|
"""Join compatible slices according to `level`."""
|
|
406
440
|
slices_int = slices.astype(int)
|
|
407
441
|
join = (slices_int @ slices_int.T).A == level - 2
|
|
@@ -409,7 +443,9 @@ class Slicefinder(BaseEstimator, TransformerMixin):
|
|
|
409
443
|
|
|
410
444
|
@staticmethod
|
|
411
445
|
def _combine_slices(
|
|
412
|
-
slices: sp.csr_matrix,
|
|
446
|
+
slices: sp.csr_matrix,
|
|
447
|
+
statistics: np.ndarray,
|
|
448
|
+
compatible_slices: np.ndarray,
|
|
413
449
|
) -> Tuple[sp.csr_matrix, np.ndarray, np.ndarray, np.ndarray]:
|
|
414
450
|
"""Combine slices by exploiting parents node statistics."""
|
|
415
451
|
parent_1_idx, parent_2_idx = np.where(compatible_slices == 1)
|
|
@@ -459,7 +495,9 @@ class Slicefinder(BaseEstimator, TransformerMixin):
|
|
|
459
495
|
"""Prepare IDs for deduplication and pruning."""
|
|
460
496
|
ids = np.zeros(pair_candidates.shape[0])
|
|
461
497
|
dom = feature_domains + 1
|
|
462
|
-
for j, (start, end) in enumerate(
|
|
498
|
+
for j, (start, end) in enumerate(
|
|
499
|
+
zip(feature_offset_start, feature_offset_end)
|
|
500
|
+
):
|
|
463
501
|
sub_pair_candidates = pair_candidates[:, start:end]
|
|
464
502
|
# sub_p should not contain multiple True on the same line
|
|
465
503
|
i = sub_pair_candidates.argmax(axis=1).T + np.any(
|
|
@@ -510,7 +548,10 @@ class Slicefinder(BaseEstimator, TransformerMixin):
|
|
|
510
548
|
return sp.csr_matrix(np.empty((0, slices.shape[1])))
|
|
511
549
|
|
|
512
550
|
ids = self._prepare_deduplication_and_pruning(
|
|
513
|
-
feature_offset_start,
|
|
551
|
+
feature_offset_start,
|
|
552
|
+
feature_offset_end,
|
|
553
|
+
feature_domains,
|
|
554
|
+
pair_candidates,
|
|
514
555
|
)
|
|
515
556
|
|
|
516
557
|
# remove duplicate candidates and select corresponding statistics
|
|
@@ -579,7 +620,9 @@ class Slicefinder(BaseEstimator, TransformerMixin):
|
|
|
579
620
|
np.zeros((0, 4)),
|
|
580
621
|
)
|
|
581
622
|
|
|
582
|
-
max_slice_scores, min_slice_scores = self._analyse_top_k(
|
|
623
|
+
max_slice_scores, min_slice_scores = self._analyse_top_k(
|
|
624
|
+
top_k_statistics
|
|
625
|
+
)
|
|
583
626
|
logger.debug(
|
|
584
627
|
"Initial top-K: count=%i, max=%f, min=%f"
|
|
585
628
|
% (top_k_slices.shape[0], max_slice_scores, min_slice_scores)
|
|
@@ -589,7 +632,11 @@ class Slicefinder(BaseEstimator, TransformerMixin):
|
|
|
589
632
|
# termination condition (max #feature levels)
|
|
590
633
|
level = 1
|
|
591
634
|
min_condition = min(input_x.shape[1], self.max_l)
|
|
592
|
-
while (
|
|
635
|
+
while (
|
|
636
|
+
(slices.shape[0] > 0)
|
|
637
|
+
& (slices.sum() > 0)
|
|
638
|
+
& (level < min_condition)
|
|
639
|
+
):
|
|
593
640
|
level += 1
|
|
594
641
|
|
|
595
642
|
# enumerate candidate join pairs, including size/error pruning
|
|
@@ -620,8 +667,12 @@ class Slicefinder(BaseEstimator, TransformerMixin):
|
|
|
620
667
|
slices, statistics, top_k_slices, top_k_statistics
|
|
621
668
|
)
|
|
622
669
|
|
|
623
|
-
max_slice_scores, min_slice_scores = self._analyse_top_k(
|
|
624
|
-
|
|
670
|
+
max_slice_scores, min_slice_scores = self._analyse_top_k(
|
|
671
|
+
top_k_statistics
|
|
672
|
+
)
|
|
673
|
+
valid = np.sum(
|
|
674
|
+
(statistics[:, 3] >= self.min_sup) & (statistics[:, 1] > 0)
|
|
675
|
+
)
|
|
625
676
|
logger.debug(
|
|
626
677
|
" -- valid slices after eval: %s/%i" % (valid, slices.shape[0])
|
|
627
678
|
)
|
|
@@ -634,6 +685,32 @@ class Slicefinder(BaseEstimator, TransformerMixin):
|
|
|
634
685
|
if top_k_slices.shape[0] == 0:
|
|
635
686
|
self.top_slices_ = np.empty((0, input_x.shape[1]))
|
|
636
687
|
else:
|
|
637
|
-
self.top_slices_ = self._one_hot_encoder.inverse_transform(
|
|
688
|
+
self.top_slices_ = self._one_hot_encoder.inverse_transform(
|
|
689
|
+
top_k_slices
|
|
690
|
+
)
|
|
691
|
+
|
|
692
|
+
# compute slices' average errors
|
|
693
|
+
top_k_statistics = np.column_stack(
|
|
694
|
+
(
|
|
695
|
+
top_k_statistics,
|
|
696
|
+
np.divide(top_k_statistics[:, 1], top_k_statistics[:, 3]),
|
|
697
|
+
)
|
|
698
|
+
)
|
|
699
|
+
|
|
700
|
+
# transform statistics to a list of dict
|
|
701
|
+
statistics_names = [
|
|
702
|
+
"slice_score",
|
|
703
|
+
"sum_slice_error",
|
|
704
|
+
"max_slice_error",
|
|
705
|
+
"slice_size",
|
|
706
|
+
"slice_average_error",
|
|
707
|
+
]
|
|
708
|
+
self.top_slices_statistics_ = [
|
|
709
|
+
{
|
|
710
|
+
stat_name: stat_value
|
|
711
|
+
for stat_value, stat_name in zip(statistic, statistics_names)
|
|
712
|
+
}
|
|
713
|
+
for statistic in top_k_statistics
|
|
714
|
+
]
|
|
638
715
|
|
|
639
716
|
logger.debug("Terminated at level %i." % level)
|
|
@@ -99,7 +99,8 @@ def _num_samples(x):
|
|
|
99
99
|
if hasattr(x, "shape") and x.shape is not None:
|
|
100
100
|
if len(x.shape) == 0:
|
|
101
101
|
raise TypeError(
|
|
102
|
-
"Singleton array %r cannot be considered a valid collection."
|
|
102
|
+
"Singleton array %r cannot be considered a valid collection."
|
|
103
|
+
% x
|
|
103
104
|
)
|
|
104
105
|
# Check that shape is returning an integer or default to len
|
|
105
106
|
# Dask dataframes may not return numeric shape[0] value
|
|
@@ -242,7 +243,8 @@ def _ensure_sparse_format(
|
|
|
242
243
|
if force_all_finite:
|
|
243
244
|
if not hasattr(spmatrix, "data"):
|
|
244
245
|
warnings.warn(
|
|
245
|
-
"Can't check %s sparse matrix for nan or inf."
|
|
246
|
+
"Can't check %s sparse matrix for nan or inf."
|
|
247
|
+
% spmatrix.format,
|
|
246
248
|
stacklevel=2,
|
|
247
249
|
)
|
|
248
250
|
else:
|
|
@@ -450,7 +452,10 @@ def check_array(
|
|
|
450
452
|
with suppress(ImportError):
|
|
451
453
|
from pandas.api.types import is_sparse
|
|
452
454
|
|
|
453
|
-
if
|
|
455
|
+
if (
|
|
456
|
+
not hasattr(array, "sparse")
|
|
457
|
+
and array.dtypes.apply(is_sparse).any()
|
|
458
|
+
):
|
|
454
459
|
warnings.warn(
|
|
455
460
|
"pandas.DataFrame with sparse columns found."
|
|
456
461
|
"It will be converted to a dense numpy array."
|
|
@@ -781,7 +786,9 @@ def check_X_e(
|
|
|
781
786
|
input_name="X",
|
|
782
787
|
)
|
|
783
788
|
|
|
784
|
-
y = _check_y(
|
|
789
|
+
y = _check_y(
|
|
790
|
+
y, multi_output=multi_output, y_numeric=y_numeric, estimator=estimator
|
|
791
|
+
)
|
|
785
792
|
|
|
786
793
|
check_consistent_length(X, y)
|
|
787
794
|
|
|
@@ -847,4 +854,6 @@ def column_or_1d(y, *, warn=False):
|
|
|
847
854
|
)
|
|
848
855
|
return np.ravel(y)
|
|
849
856
|
|
|
850
|
-
raise ValueError(
|
|
857
|
+
raise ValueError(
|
|
858
|
+
f"y should be a 1d array, got an array of shape {shape} instead."
|
|
859
|
+
)
|
sliceline-0.2.11/pyproject.toml
DELETED
|
@@ -1,31 +0,0 @@
|
|
|
1
|
-
[build-system]
|
|
2
|
-
requires = ["poetry_core>=1.0.0"]
|
|
3
|
-
build-backend = "poetry.core.masonry.api"
|
|
4
|
-
|
|
5
|
-
[tool.poetry]
|
|
6
|
-
name = "sliceline"
|
|
7
|
-
version = "0.2.11" # This version is a generic placeholder. It should not be changed.
|
|
8
|
-
description = "✂️ Fast slice finding for Machine Learning model debugging."
|
|
9
|
-
authors = ["Antoine de Daran"]
|
|
10
|
-
readme = "README.rst"
|
|
11
|
-
license = "BSD-3-Clause"
|
|
12
|
-
repository = "https://github.com/DataDome/sliceline"
|
|
13
|
-
documentation = "https://sliceline.readthedocs.io/en/stable/"
|
|
14
|
-
|
|
15
|
-
[tool.poetry.dependencies]
|
|
16
|
-
python = ">=3.7, <3.10"
|
|
17
|
-
numpy = "^1.0.0"
|
|
18
|
-
scikit-learn = "^1.0.1"
|
|
19
|
-
|
|
20
|
-
[tool.poetry.dev-dependencies]
|
|
21
|
-
black = "^22.3.0"
|
|
22
|
-
flake8 = "^3.0.0"
|
|
23
|
-
jupyter = "^1.0.0"
|
|
24
|
-
nbconvert = "^6.5.0"
|
|
25
|
-
optbinning = "^0.15.0"
|
|
26
|
-
pandas = "^1.1.0"
|
|
27
|
-
pytest = "^6.2.5"
|
|
28
|
-
pytest-benchmark = "^3.4.1"
|
|
29
|
-
pytest-cov = "^3.0.0"
|
|
30
|
-
Sphinx = "^4.0.0"
|
|
31
|
-
sphinx-rtd-theme = "^1.0.0"
|
|
File without changes
|
|
File without changes
|