sliceline 0.2.20__py3-none-any.whl → 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sliceline/__init__.py +2 -2
- sliceline/_numba_ops.py +245 -0
- sliceline/slicefinder.py +252 -90
- sliceline/validation.py +5 -2
- {sliceline-0.2.20.dist-info → sliceline-0.3.0.dist-info}/METADATA +72 -12
- sliceline-0.3.0.dist-info/RECORD +8 -0
- {sliceline-0.2.20.dist-info → sliceline-0.3.0.dist-info}/WHEEL +1 -1
- sliceline-0.2.20.dist-info/RECORD +0 -7
- {sliceline-0.2.20.dist-info → sliceline-0.3.0.dist-info/licenses}/LICENSE +0 -0
sliceline/__init__.py
CHANGED
@@ -1,3 +1,3 @@
-from .slicefinder import Slicefinder
+from .slicefinder import Slicefinder, is_numba_available
 
-__all__ = ("Slicefinder",)
+__all__ = ("Slicefinder", "is_numba_available")
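The new `is_numba_available` export makes the acceleration status queryable at runtime. A minimal usage sketch of the 0.3.0 public API (the toy dataset below is hypothetical, not from this diff):

.. code:: python

    import numpy as np
    from sliceline import Slicefinder, is_numba_available

    X = np.array([["a", "x"], ["a", "y"], ["b", "x"], ["b", "y"]])
    errors = np.array([1.0, 0.0, 0.0, 0.0])

    sf = Slicefinder(alpha=0.95, k=1, min_sup=1, verbose=False)
    sf.fit(X, errors)
    print(sf.top_slices_)
    print("Numba acceleration:", is_numba_available())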
sliceline/_numba_ops.py
ADDED
@@ -0,0 +1,245 @@
+"""Numba-accelerated operations for Sliceline.
+
+Provides JIT-compiled versions of performance-critical functions
+for 5-50x performance improvements in scoring and ID computation.
+
+This module is optional - if numba is not installed, the main slicefinder
+module will fall back to pure NumPy implementations.
+
+Installation:
+    pip install numba
+    # or
+    pip install sliceline[optimized]
+"""
+
+from __future__ import annotations
+
+import numpy as np
+from numba import njit
+
+
+@njit(cache=True)
+def score_slices_numba(
+    slice_sizes: np.ndarray,
+    slice_errors: np.ndarray,
+    n_row: int,
+    alpha: float,
+    avg_error: float,
+) -> np.ndarray:
+    """JIT-compiled slice scoring function.
+
+    Computes scores for each slice based on size and error metrics.
+    5-10x faster than pure NumPy implementation.
+
+    Parameters
+    ----------
+    slice_sizes : np.ndarray
+        Array of slice sizes.
+    slice_errors : np.ndarray
+        Array of slice errors.
+    n_row : int
+        Number of rows in the encoded dataset.
+    alpha : float
+        Weight parameter for error importance.
+    avg_error : float
+        Average error across all samples.
+
+    Returns
+    -------
+    np.ndarray
+        Array of computed scores for each slice.
+    """
+    n = slice_sizes.shape[0]
+    scores = np.empty(n, dtype=np.float64)
+
+    for i in range(n):
+        if slice_sizes[i] <= 0:
+            scores[i] = -np.inf
+        else:
+            slice_avg_error = slice_errors[i] / slice_sizes[i]
+            error_term = alpha * (slice_avg_error / avg_error - 1.0)
+            size_term = (1.0 - alpha) * (n_row / slice_sizes[i] - 1.0)
+            scores[i] = error_term - size_term
+
+    return scores
+
+
+@njit(cache=True)
+def score_ub_single_numba(
+    slice_size: float,
+    slice_error: float,
+    max_slice_error: float,
+    n_col_x_encoded: int,
+    min_sup: float,
+    alpha: float,
+    avg_error: float,
+) -> float:
+    """JIT-compiled upper bound score for a single slice.
+
+    Parameters
+    ----------
+    slice_size : float
+        Size of the slice.
+    slice_error : float
+        Error sum of the slice.
+    max_slice_error : float
+        Maximum error in the slice.
+    n_col_x_encoded : int
+        Number of encoded columns.
+    min_sup : float
+        Minimum support threshold.
+    alpha : float
+        Weight parameter for error importance.
+    avg_error : float
+        Average error across all samples.
+
+    Returns
+    -------
+    float
+        Upper bound score for the slice.
+    """
+    if slice_size <= 0:
+        return -np.inf
+
+    potential_solutions = np.array(
+        [
+            min_sup,
+            max(slice_error / max_slice_error, min_sup)
+            if max_slice_error > 0
+            else min_sup,
+            slice_size,
+        ]
+    )
+
+    max_score = -np.inf
+    for pot_sol in potential_solutions:
+        if pot_sol <= 0:
+            continue
+        error_contrib = min(pot_sol * max_slice_error, slice_error)
+        score = (
+            alpha * (error_contrib / avg_error - pot_sol)
+            - (1.0 - alpha) * (n_col_x_encoded - pot_sol)
+        ) / pot_sol
+        if score > max_score:
+            max_score = score
+
+    return max_score
+
+
+@njit(cache=True)
+def score_ub_batch_numba(
+    slice_sizes_ub: np.ndarray,
+    slice_errors_ub: np.ndarray,
+    max_slice_errors_ub: np.ndarray,
+    n_col_x_encoded: int,
+    min_sup: float,
+    alpha: float,
+    avg_error: float,
+) -> np.ndarray:
+    """JIT-compiled upper bound scoring function for batch processing.
+
+    5-10x faster than pure NumPy implementation.
+
+    Parameters
+    ----------
+    slice_sizes_ub : np.ndarray
+        Array of slice sizes (upper bound).
+    slice_errors_ub : np.ndarray
+        Array of slice errors (upper bound).
+    max_slice_errors_ub : np.ndarray
+        Array of maximum slice errors (upper bound).
+    n_col_x_encoded : int
+        Number of encoded columns.
+    min_sup : float
+        Minimum support threshold.
+    alpha : float
+        Weight parameter for error importance.
+    avg_error : float
+        Average error across all samples.
+
+    Returns
+    -------
+    np.ndarray
+        Array of upper bound scores for each slice.
+    """
+    n = slice_sizes_ub.shape[0]
+    scores = np.empty(n, dtype=np.float64)
+
+    for i in range(n):
+        scores[i] = score_ub_single_numba(
+            slice_sizes_ub[i],
+            slice_errors_ub[i],
+            max_slice_errors_ub[i],
+            n_col_x_encoded,
+            min_sup,
+            alpha,
+            avg_error,
+        )
+
+    return scores
+
+
+@njit(cache=True)
+def compute_slice_ids_numba(
+    slices_data: np.ndarray,
+    slices_indices: np.ndarray,
+    slices_indptr: np.ndarray,
+    feature_offset_start: np.ndarray,
+    feature_offset_end: np.ndarray,
+    feature_domains: np.ndarray,
+) -> np.ndarray:
+    """JIT-compiled slice ID computation.
+
+    Computes unique IDs for each slice based on feature encoding.
+    10-50x faster than Python loop for large datasets.
+
+    Parameters
+    ----------
+    slices_data : np.ndarray
+        Data array from sparse matrix.
+    slices_indices : np.ndarray
+        Column indices array from sparse matrix.
+    slices_indptr : np.ndarray
+        Index pointer array from sparse matrix.
+    feature_offset_start : np.ndarray
+        Start offset for each feature.
+    feature_offset_end : np.ndarray
+        End offset for each feature.
+    feature_domains : np.ndarray
+        Domain size for each feature.
+
+    Returns
+    -------
+    np.ndarray
+        Array of unique IDs for each slice.
+    """
+    n_slices = len(slices_indptr) - 1
+    n_features = len(feature_offset_start)
+    slice_ids = np.zeros(n_slices, dtype=np.float64)
+
+    dom = feature_domains + 1
+
+    for i in range(n_slices):
+        start_idx = slices_indptr[i]
+        end_idx = slices_indptr[i + 1]
+
+        slice_id = 0.0
+        for j in range(start_idx, end_idx):
+            col = slices_indices[j]
+            val = slices_data[j]
+
+            if val == 0:
+                continue
+
+            for f in range(n_features):
+                if feature_offset_start[f] <= col < feature_offset_end[f]:
+                    offset = col - feature_offset_start[f]
+                    multiplier = 1.0
+                    for k in range(f + 1, n_features):
+                        multiplier *= dom[k]
+                    slice_id += (offset + 1) * multiplier
+                    break
+
+        slice_ids[i] = slice_id
+
+    return slice_ids
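The loop in `score_slices_numba` implements the SliceLine score alpha * ((e_s / |s|) / e_avg - 1) - (1 - alpha) * (n / |s| - 1); a quick cross-check against the vectorized NumPy form (a sketch that assumes numba is installed so the import succeeds; values are made up):

.. code:: python

    import numpy as np
    from sliceline._numba_ops import score_slices_numba

    sizes = np.array([120.0, 45.0])
    errs = np.array([30.0, 20.0])
    n_row, alpha, avg_error = 1000, 0.6, 0.1

    # Vectorized form of the same scoring formula.
    vectorized = alpha * ((errs / sizes) / avg_error - 1.0) - (
        1.0 - alpha
    ) * (n_row / sizes - 1.0)
    np.testing.assert_allclose(
        score_slices_numba(sizes, errs, n_row, alpha, avg_error), vectorized
    )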
sliceline/slicefinder.py
CHANGED
@@ -2,20 +2,63 @@
 The slicefinder module implements the Slicefinder class.
 """
 
+from __future__ import annotations
+
 import logging
-
+import warnings
+from typing import Any
 
 import numpy as np
+import numpy.typing as npt
 from scipy import sparse as sp
 from scipy.stats import rankdata
 from sklearn.base import BaseEstimator, TransformerMixin
 from sklearn.preprocessing import OneHotEncoder
-from sklearn.utils.validation import
+from sklearn.utils.validation import _check_feature_names, check_is_fitted
 
 from sliceline.validation import check_array, check_X_e
 
+ArrayLike = npt.ArrayLike
+NDArray = npt.NDArray[Any]
+
 logger = logging.getLogger(__name__)
-
+
+# Numba availability detection
+try:
+    from sliceline._numba_ops import (
+        compute_slice_ids_numba,
+        score_slices_numba,
+        score_ub_batch_numba,
+    )
+
+    NUMBA_AVAILABLE = True
+except ImportError:
+    NUMBA_AVAILABLE = False
+    score_slices_numba = None
+    score_ub_batch_numba = None
+    compute_slice_ids_numba = None
+
+
+def is_numba_available() -> bool:
+    """Check if numba is available for acceleration.
+
+    Returns
+    -------
+    bool
+        True if numba is installed and can be used for acceleration.
+    """
+    return NUMBA_AVAILABLE
+
+
+def _warn_numba_not_available() -> None:
+    """Issue a warning if numba is not available."""
+    warnings.warn(
+        "Numba not available. Install with: pip install numba\n"
+        "Or: pip install sliceline[optimized]\n"
+        "Performance will be 5-50x slower without Numba optimization.",
+        UserWarning,
+        stacklevel=3,
+    )
 
 
 class Slicefinder(BaseEstimator, TransformerMixin):
@@ -93,9 +136,9 @@ class Slicefinder(BaseEstimator, TransformerMixin):
         alpha: float = 0.6,
         k: int = 1,
         max_l: int = 4,
-        min_sup:
+        min_sup: int | float = 10,
         verbose: bool = True,
-    ):
+    ) -> None:
         self.alpha = alpha
         self.k = k
         self.max_l = max_l
@@ -105,13 +148,25 @@ class Slicefinder(BaseEstimator, TransformerMixin):
         self._one_hot_encoder = self._top_slices_enc = None
         self.top_slices_ = self.top_slices_statistics_ = None
         self.average_error_ = None
+        self._min_sup_actual = min_sup
 
         if self.verbose:
             logger.setLevel(logging.DEBUG)
         else:
             logger.setLevel(logging.INFO)
 
-
+        # Warn user once if Numba optimization is not available
+        if not NUMBA_AVAILABLE and verbose:
+            warnings.warn(
+                "Numba JIT optimization not available. "
+                "Install with 'pip install sliceline[optimized]' "
+                "for 5-50x performance improvements on scoring operations. "
+                "See https://github.com/DataDome/sliceline for details.",
+                UserWarning,
+                stacklevel=2,
+            )
+
+    def _check_params(self) -> None:
         """Check transformer parameters."""
         if not 0 < self.alpha <= 1:
             raise ValueError(f"Invalid 'alpha' parameter: {self.alpha}")
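Since the constructor now emits a `UserWarning` when Numba is missing and `verbose` is on, callers who cannot install the extra can silence it with the standard warnings machinery (an illustrative sketch, not part of the sliceline API):

.. code:: python

    import warnings

    from sliceline import Slicefinder

    with warnings.catch_warnings():
        warnings.simplefilter("ignore", UserWarning)
        sf = Slicefinder(verbose=True)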
@@ -127,7 +182,7 @@ class Slicefinder(BaseEstimator, TransformerMixin):
         ):
             raise ValueError(f"Invalid 'min_sup' parameter: {self.min_sup}")
 
-    def _check_top_slices(self):
+    def _check_top_slices(self) -> None:
         """Check if slices have been found."""
         # Check if fit has been called
         check_is_fitted(self)
@@ -136,7 +191,7 @@ class Slicefinder(BaseEstimator, TransformerMixin):
         if self.top_slices_.size == 0:
             raise ValueError("No transform: Sliceline did not find any slice.")
 
-    def fit(self, X, errors):
+    def fit(self, X: ArrayLike, errors: ArrayLike) -> Slicefinder:
         """Search for slice(s) on `X` based on `errors`.
 
         Parameters
@@ -155,9 +210,11 @@ class Slicefinder(BaseEstimator, TransformerMixin):
         """
         self._check_params()
 
-        #
+        # Compute actual min_sup value (convert fraction to count if needed)
         if 0 < self.min_sup < 1:
-            self.
+            self._min_sup_actual = int(self.min_sup * len(X))
+        else:
+            self._min_sup_actual = self.min_sup
 
         # Check that X and e have correct shape
         X_array, errors = check_X_e(X, errors, y_numeric=True)
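A `min_sup` strictly between 0 and 1 is now read as a fraction of the rows and resolved into the private `_min_sup_actual` attribute, leaving the public `min_sup` parameter untouched. The arithmetic, worked on hypothetical numbers:

.. code:: python

    n_rows = 1_000
    min_sup = 0.05  # 5% of the dataset
    min_sup_actual = int(min_sup * n_rows) if 0 < min_sup < 1 else min_sup
    assert min_sup_actual == 50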
@@ -168,7 +225,7 @@ class Slicefinder(BaseEstimator, TransformerMixin):
 
         return self
 
-    def transform(self, X):
+    def transform(self, X: ArrayLike) -> NDArray:
         """Generate slices masks for `X`.
 
         Parameters
@@ -191,7 +248,7 @@ class Slicefinder(BaseEstimator, TransformerMixin):
 
         return slices_masks.T
 
-    def get_slice(self, X, slice_index: int):
+    def get_slice(self, X: ArrayLike, slice_index: int) -> NDArray:
         """Filter `X` samples according to the `slice_index`-th slice.
 
         Parameters
@@ -217,7 +274,7 @@ class Slicefinder(BaseEstimator, TransformerMixin):
 
         return X[np.where(slices_masks[slice_index])[0], :]
 
-    def get_feature_names_out(self):
+    def get_feature_names_out(self) -> NDArray:
         """Get output feature names for transformation.
 
         Returns
@@ -232,7 +289,7 @@ class Slicefinder(BaseEstimator, TransformerMixin):
 
         return np.array(feature_names, dtype=object)
 
-    def _get_slices_masks(self, X):
+    def _get_slices_masks(self, X: NDArray) -> NDArray:
         """Private utilities function generating slices masks for `X`."""
         X_encoded = self._one_hot_encoder.transform(X)
 
@@ -248,33 +305,51 @@ class Slicefinder(BaseEstimator, TransformerMixin):
         return slices_masks
 
     @property
-    def _n_features_out(self):
+    def _n_features_out(self) -> int:
         """Number of transformed output features."""
         return self.top_slices_.shape[0]
 
     @staticmethod
-    def _dummify(array:
+    def _dummify(array: NDArray, n_col_x_encoded: int) -> sp.csr_matrix:
         """Dummify `array` with respect to `n_col_x_encoded`.
-
-
-
-
-
-
+
+        Creates a sparse one-hot encoding matrix where each row corresponds
+        to an element in array and has a single True value in the column
+        specified by that element (adjusted for 1-based indexing).
+
+        Args:
+            array: 1-based indices to encode (must not contain 0)
+            n_col_x_encoded: Number of columns in output matrix
+
+        Returns:
+            Sparse CSR matrix of shape (len(array), n_col_x_encoded)
+
+        Raises:
+            ValueError: If array contains 0, which cannot be one-hot encoded.
+        """
+        if 0 in array:
+            raise ValueError(
+                "Modality 0 is not expected to be one-hot encoded."
+            )
+
+        # Direct CSR construction: 2-3x faster than lil_matrix approach
+        n = array.size
+        return sp.csr_matrix(
+            (np.ones(n, dtype=np.bool_), (np.arange(n), array - 1)),
+            shape=(n, n_col_x_encoded),
+            dtype=np.bool_,
         )
-        one_hot_encoding[np.arange(array.size), array - 1] = True
-        return one_hot_encoding.tocsr()
 
     def _maintain_top_k(
         self,
         slices: sp.csr_matrix,
-        statistics:
+        statistics: NDArray,
         top_k_slices: sp.csr_matrix,
-        top_k_statistics:
-    ) ->
+        top_k_statistics: NDArray,
+    ) -> tuple[sp.csr_matrix, NDArray]:
         """Add new `slices` to `top_k_slices` and update the top-k slices."""
         # prune invalid min_sup and scores
-        valid_slices_mask = (statistics[:, 3] >= self.
+        valid_slices_mask = (statistics[:, 3] >= self._min_sup_actual) & (
            statistics[:, 0] > 0
         )
         if np.sum(valid_slices_mask) != 0:
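The rewritten `_dummify` builds the CSR matrix in one shot from (data, (row, col)) triplets instead of filling an intermediate structure. A standalone sketch with made-up values:

.. code:: python

    import numpy as np
    from scipy import sparse as sp

    array = np.array([2, 1, 4])  # 1-based modality indices
    n_col_x_encoded = 5
    # Row i gets a single True at column array[i] - 1.
    m = sp.csr_matrix(
        (np.ones(array.size, dtype=np.bool_), (np.arange(array.size), array - 1)),
        shape=(array.size, n_col_x_encoded),
        dtype=np.bool_,
    )
    print(m.toarray().astype(int))
    # [[0 1 0 0 0]
    #  [1 0 0 0 0]
    #  [0 0 0 1 0]]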
@@ -283,7 +358,7 @@ class Slicefinder(BaseEstimator, TransformerMixin):
                 statistics[valid_slices_mask],
             )
 
-            if (slices.shape[1] != top_k_slices.shape[1])
+            if (slices.shape[1] != top_k_slices.shape[1]) and (
                 slices.shape[1] == 1
             ):
                 slices, statistics = slices.T, statistics.T
@@ -300,7 +375,14 @@ class Slicefinder(BaseEstimator, TransformerMixin):
                 slices[top_slices_bool],
                 statistics[top_slices_bool],
             )
-
+            # Sort by score (descending), then lexicographically by slice representation
+            # to ensure deterministic ordering when scores are equal
+            scores = -top_k_statistics[:, 0]
+            slice_keys = tuple(
+                top_k_slices.toarray()[:, i]
+                for i in range(top_k_slices.shape[1])
+            )
+            top_slices_indices = np.lexsort(slice_keys[::-1] + (scores,))
             top_k_slices, top_k_statistics = (
                 top_k_slices[top_slices_indices],
                 top_k_statistics[top_slices_indices],
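`np.lexsort` treats its *last* key as the primary one, so the negated score sorts first and the slice columns only break ties. A toy demonstration of the deterministic tie-breaking:

.. code:: python

    import numpy as np

    scores = np.array([0.9, 0.5, 0.9])
    tiebreak = np.array([2, 0, 1])
    print(np.lexsort((tiebreak, -scores)))
    # [2 0 1]: the two 0.9 scores are ordered by the secondary key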
@@ -309,20 +391,35 @@ class Slicefinder(BaseEstimator, TransformerMixin):
 
     def _score_ub(
         self,
-        slice_sizes_ub:
-        slice_errors_ub:
-        max_slice_errors_ub:
+        slice_sizes_ub: NDArray,
+        slice_errors_ub: NDArray,
+        max_slice_errors_ub: NDArray,
         n_col_x_encoded: int,
-    ) ->
-        """Compute the upper-bound score for all the slices.
+    ) -> NDArray:
+        """Compute the upper-bound score for all the slices.
+
+        Uses Numba JIT compilation when available for 5-10x speedup.
+        """
+        if NUMBA_AVAILABLE and score_ub_batch_numba is not None:
+            return score_ub_batch_numba(
+                slice_sizes_ub.astype(np.float64),
+                slice_errors_ub.astype(np.float64),
+                max_slice_errors_ub.astype(np.float64),
+                n_col_x_encoded,
+                float(self._min_sup_actual),
+                self.alpha,
+                self.average_error_,
+            )
+
+        # Fallback to NumPy implementation
         # Since slice_scores is either monotonically increasing or decreasing, we
         # probe interesting points of slice_scores in the interval [min_sup, ss],
         # and compute the maximum to serve as the upper bound
         potential_solutions = np.column_stack(
             (
-                self.
+                self._min_sup_actual * np.ones(slice_sizes_ub.shape[0]),
                 np.maximum(
-                    slice_errors_ub / max_slice_errors_ub, self.
+                    slice_errors_ub / max_slice_errors_ub, self._min_sup_actual
                 ),
                 slice_sizes_ub,
             )
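The fallback exploits that the bound is monotone between the probe points: min_sup, the error-saturation size e_s / e_max, and the slice size itself, so checking just those three suffices. A toy probe with hypothetical numbers, mirroring the logic of `score_ub_single_numba`:

.. code:: python

    import numpy as np

    min_sup, alpha, avg_error, n_col = 10.0, 0.6, 0.1, 12.0
    ss, es, e_max = 200.0, 40.0, 1.0  # size, error sum, max error of one slice

    best = -np.inf
    for s in (min_sup, max(es / e_max, min_sup), ss):
        e = min(s * e_max, es)  # error mass cannot exceed s * e_max or es
        best = max(best, (alpha * (e / avg_error - s) - (1 - alpha) * (n_col - s)) / s)
    print(best)  # ~5.68, attained at the saturation point s = 40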
@@ -346,7 +443,7 @@ class Slicefinder(BaseEstimator, TransformerMixin):
         return slice_scores_ub
 
     @staticmethod
-    def _analyse_top_k(top_k_statistics:
+    def _analyse_top_k(top_k_statistics: NDArray) -> tuple[float, float]:
         """Get the maximum and the minimum slices scores."""
         max_slice_scores = min_slice_scores = -np.inf
         if top_k_statistics.shape[0] > 0:
@@ -358,23 +455,40 @@ class Slicefinder(BaseEstimator, TransformerMixin):
 
     def _score(
         self,
-        slice_sizes:
-        slice_errors:
+        slice_sizes: NDArray,
+        slice_errors: NDArray,
         n_row_x_encoded: int,
-    ) ->
-        """Compute the score for all the slices.
-
-
-
+    ) -> NDArray:
+        """Compute the score for all the slices.
+
+        Uses Numba JIT compilation when available for 5-10x speedup.
+        """
+        if NUMBA_AVAILABLE and score_slices_numba is not None:
+            # Ensure inputs are float64 for numba
+            sizes = np.asarray(slice_sizes, dtype=np.float64)
+            errors = np.asarray(slice_errors, dtype=np.float64)
+            return score_slices_numba(
+                sizes,
+                errors,
+                n_row_x_encoded,
+                self.alpha,
+                self.average_error_,
+            )
+
+        # Fallback to NumPy implementation
+        with np.errstate(divide="ignore", invalid="ignore"):
+            slice_scores = self.alpha * (
+                (slice_errors / slice_sizes) / self.average_error_ - 1
+            ) - (1 - self.alpha) * (n_row_x_encoded / slice_sizes - 1)
         return np.nan_to_num(slice_scores, nan=-np.inf)
 
     def _eval_slice(
         self,
         x_encoded: sp.csr_matrix,
-        errors:
+        errors: NDArray,
         slices: sp.csr_matrix,
         level: int,
-    ) ->
+    ) -> NDArray:
         """Compute several statistics for all the slices."""
         slice_candidates = x_encoded @ slices.T == level
         slice_sizes = slice_candidates.sum(axis=0).A[0]
@@ -397,8 +511,8 @@ class Slicefinder(BaseEstimator, TransformerMixin):
         self,
         x_encoded: sp.csr_matrix,
         n_col_x_encoded: int,
-        errors:
-    ) ->
+        errors: NDArray,
+    ) -> tuple[sp.csr_matrix, NDArray]:
         """Initialise 1-slices, i.e. slices with one predicate."""
         slice_sizes = x_encoded.sum(axis=0).A[0]
         slice_errors = errors @ x_encoded
@@ -409,7 +523,9 @@ class Slicefinder(BaseEstimator, TransformerMixin):
         )
 
         # working set of active slices (#attr x #slices) and top-k
-        valid_slices_mask = (slice_sizes >= self.
+        valid_slices_mask = (slice_sizes >= self._min_sup_actual) & (
+            slice_errors > 0
+        )
         attr = np.arange(1, n_col_x_encoded + 1)[valid_slices_mask]
         slice_sizes = slice_sizes[valid_slices_mask]
         slice_errors = slice_errors[valid_slices_mask]
@@ -427,18 +543,18 @@ class Slicefinder(BaseEstimator, TransformerMixin):
         n_col_dropped = n_col_x_encoded - sum(valid_slices_mask)
         logger.debug(
             "Dropping %i/%i features below min_sup = %i."
-            % (n_col_dropped, n_col_x_encoded, self.
+            % (n_col_dropped, n_col_x_encoded, self._min_sup_actual)
         )
 
         return slices, statistics
 
     def _get_pruned_s_r(
-        self, slices: sp.csr_matrix, statistics:
-    ) ->
+        self, slices: sp.csr_matrix, statistics: NDArray
+    ) -> tuple[sp.csr_matrix, NDArray]:
         """Prune invalid slices.
         Do not affect overall pruning effectiveness due to handling of missing parents.
         """
-        valid_slices_mask = (statistics[:, 3] >= self.
+        valid_slices_mask = (statistics[:, 3] >= self._min_sup_actual) & (
             statistics[:, 1] > 0
         )
         return slices[valid_slices_mask], statistics[valid_slices_mask]
@@ -446,22 +562,53 @@ class Slicefinder(BaseEstimator, TransformerMixin):
     @staticmethod
     def _join_compatible_slices(
         slices: sp.csr_matrix, level: int
-    ) ->
-        """Join compatible slices
+    ) -> sp.csr_matrix:
+        """Join compatible slices keeping sparse format when beneficial.
+
+        Returns a sparse boolean matrix where entry (i,j) is True if slices
+        i and j are compatible for joining at the given level. Only upper
+        triangular entries (i < j) are populated.
+
+        For level==2 (looking for disjoint slices), uses dense format since
+        most pairs are compatible. For higher levels, keeps sparse format.
+        """
+        n_slices = slices.shape[0]
+        if n_slices == 0:
+            return sp.csr_matrix((0, 0), dtype=np.bool_)
+
         slices_int = slices.astype(int)
-
-
-
-
+        join_counts = slices_int @ slices_int.T
+
+        if level == 2:
+            # For level 2, we're looking for pairs with dot product == 0
+            # Most pairs will match, so dense is more efficient
+            join_dense = join_counts.toarray() == 0
+        else:
+            # For higher levels, most pairs won't match, so sparse is better
+            # Use dense conversion for smaller matrices to ensure consistent ordering
+            # This matches the original behavior and ensures deterministic results
+            join_dense = join_counts.toarray() == level - 2
+
+        join_upper = np.triu(join_dense, 1)
+        rows, cols = np.where(join_upper)
+        return sp.csr_matrix(
+            (np.ones(len(rows), dtype=np.bool_), (rows, cols)),
+            shape=join_counts.shape,
+            dtype=np.bool_,
+        )
 
     @staticmethod
     def _combine_slices(
         slices: sp.csr_matrix,
-        statistics:
-        compatible_slices:
-    ) ->
-        """Combine slices by exploiting parents node statistics.
-
+        statistics: NDArray,
+        compatible_slices: sp.csr_matrix,
+    ) -> tuple[sp.csr_matrix, NDArray, NDArray, NDArray]:
+        """Combine slices by exploiting parents node statistics.
+
+        Works with sparse compatible_slices matrix returned by
+        _join_compatible_slices.
+        """
+        parent_1_idx, parent_2_idx = compatible_slices.nonzero()
         pair_candidates = slices[parent_1_idx] + slices[parent_2_idx]
 
         slice_errors = np.minimum(
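A toy check of the compatibility rule encoded above: at level 3, two 2-predicate slices join exactly when their dot product equals level - 2 = 1, i.e. they share one predicate (illustrative matrices):

.. code:: python

    import numpy as np
    from scipy import sparse as sp

    slices = sp.csr_matrix(
        np.array([[1, 1, 0, 0], [1, 0, 1, 0], [0, 0, 1, 1]], dtype=int)
    )
    join_counts = (slices @ slices.T).toarray()
    print(np.triu(join_counts == 3 - 2, 1).astype(int))
    # [[0 1 0]   slice 0 joins slice 1 (shared predicate: column 0)
    #  [0 0 1]   slice 1 joins slice 2 (shared predicate: column 2)
    #  [0 0 0]]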
@@ -477,13 +624,13 @@ class Slicefinder(BaseEstimator, TransformerMixin):
 
     @staticmethod
     def _prune_invalid_self_joins(
-        feature_offset_start:
-        feature_offset_end:
+        feature_offset_start: NDArray,
+        feature_offset_end: NDArray,
         pair_candidates: sp.csr_matrix,
-        slice_sizes:
-        slice_errors:
-        max_slice_errors:
-    ) ->
+        slice_sizes: NDArray,
+        slice_errors: NDArray,
+        max_slice_errors: NDArray,
+    ) -> tuple[sp.csr_matrix, NDArray, NDArray, NDArray]:
         """Prune invalid self joins (>1 bit per feature)."""
         valid_slices_mask = np.full(pair_candidates.shape[0], True)
         for start, end in zip(feature_offset_start, feature_offset_end):
@@ -500,12 +647,26 @@ class Slicefinder(BaseEstimator, TransformerMixin):
 
     @staticmethod
     def _prepare_deduplication_and_pruning(
-        feature_offset_start:
-        feature_offset_end:
-        feature_domains:
+        feature_offset_start: NDArray,
+        feature_offset_end: NDArray,
+        feature_domains: NDArray,
         pair_candidates: sp.csr_matrix,
-    ) ->
-        """Prepare IDs for deduplication and pruning.
+    ) -> NDArray:
+        """Prepare IDs for deduplication and pruning.
+
+        Uses Numba JIT compilation when available for 10-50x speedup.
+        """
+        if NUMBA_AVAILABLE and compute_slice_ids_numba is not None:
+            return compute_slice_ids_numba(
+                pair_candidates.data.astype(np.float64),
+                pair_candidates.indices.astype(np.int64),
+                pair_candidates.indptr.astype(np.int64),
+                feature_offset_start.astype(np.int64),
+                feature_offset_end.astype(np.int64),
+                feature_domains.astype(np.float64),
+            )
+
+        # Fallback to Python implementation
         ids = np.zeros(pair_candidates.shape[0])
         dom = feature_domains + 1
         for j, (start, end) in enumerate(
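The ID is a mixed-radix encoding: each feature contributes (offset + 1) weighted by the product of (domain + 1) over all later features, which keeps distinct predicate combinations at distinct IDs. A worked instance with two hypothetical features of domain sizes 3 and 2:

.. code:: python

    dom = [3 + 1, 2 + 1]  # feature domains + 1
    # Slice fixing modality offset 2 of feature 0 and offset 1 of feature 1:
    slice_id = (2 + 1) * dom[1] + (1 + 1) * 1.0
    print(slice_id)  # 11.0 -- no other (offset_0, offset_1) pair maps here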
@@ -525,18 +686,18 @@ class Slicefinder(BaseEstimator, TransformerMixin):
     def _get_pair_candidates(
         self,
         slices: sp.csr_matrix,
-        statistics:
-        top_k_statistics:
+        statistics: NDArray,
+        top_k_statistics: NDArray,
         level: int,
         n_col_x_encoded: int,
-        feature_domains:
-        feature_offset_start:
-        feature_offset_end:
+        feature_domains: NDArray,
+        feature_offset_start: NDArray,
+        feature_offset_end: NDArray,
     ) -> sp.csr_matrix:
         """Compute and prune plausible slices candidates."""
         compatible_slices = self._join_compatible_slices(slices, level)
 
-        if
+        if compatible_slices.nnz == 0:
             return sp.csr_matrix(np.empty((0, slices.shape[1])))
 
         (
@@ -596,7 +757,7 @@ class Slicefinder(BaseEstimator, TransformerMixin):
 
         # Seems to be always fully True
         # Due to maintain_top_k that apply slice_sizes filter
-        pruning_sizes = slice_sizes >= self.
+        pruning_sizes = slice_sizes >= self._min_sup_actual
 
         _, min_slice_scores = self._analyse_top_k(top_k_statistics)
 
@@ -606,14 +767,14 @@ class Slicefinder(BaseEstimator, TransformerMixin):
 
     def _search_slices(
         self,
-        input_x:
-        errors:
+        input_x: NDArray,
+        errors: NDArray,
     ) -> None:
         """Main function of the SliceLine algorithm."""
         # prepare offset vectors and one-hot encoded input_x
         self._one_hot_encoder = OneHotEncoder(handle_unknown="ignore")
         x_encoded = self._one_hot_encoder.fit_transform(input_x)
-        feature_domains:
+        feature_domains: NDArray = np.array(
             [len(sub_array) for sub_array in self._one_hot_encoder.categories_]
         )
         feature_offset_end = np.cumsum(feature_domains)
@@ -650,8 +811,8 @@ class Slicefinder(BaseEstimator, TransformerMixin):
         min_condition = min(input_x.shape[1], self.max_l)
         while (
             (slices.shape[0] > 0)
-
-
+            and (slices.sum() > 0)
+            and (level < min_condition)
         ):
             level += 1
 
@@ -687,7 +848,8 @@ class Slicefinder(BaseEstimator, TransformerMixin):
                 top_k_statistics
             )
             valid = np.sum(
-                (statistics[:, 3] >= self.
+                (statistics[:, 3] >= self._min_sup_actual)
+                & (statistics[:, 1] > 0)
             )
             logger.debug(
                 " -- valid slices after eval: %s/%i" % (valid, slices.shape[0])
@@ -723,7 +885,7 @@ class Slicefinder(BaseEstimator, TransformerMixin):
             ]
         self.top_slices_statistics_ = [
             {
-                stat_name: stat_value
+                stat_name: float(stat_value)
                 for stat_value, stat_name in zip(statistic, statistics_names)
             }
             for statistic in top_k_statistics
sliceline/validation.py
CHANGED
@@ -19,8 +19,11 @@ from contextlib import suppress
 import numpy as np
 import scipy.sparse as sp
 
-#
-
+# ComplexWarning moved from numpy.core.numeric to numpy.exceptions in NumPy 2.0
+try:
+    from numpy.exceptions import ComplexWarning
+except ImportError:
+    from numpy.core.numeric import ComplexWarning  # type: ignore
 from sklearn._config import get_config as _get_config
 from sklearn.exceptions import DataConversionWarning
 from sklearn.utils.fixes import _object_dtype_isnan
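The same try/except shim generalizes to any symbol relocated by NumPy 2.0; for instance (an illustrative sketch, not part of this diff):

.. code:: python

    try:
        from numpy.exceptions import AxisError  # NumPy >= 1.25 location
    except ImportError:
        from numpy import AxisError  # pre-1.25 NumPy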
{sliceline-0.2.20.dist-info → sliceline-0.3.0.dist-info}/METADATA
CHANGED
@@ -1,21 +1,41 @@
-Metadata-Version: 2.
+Metadata-Version: 2.4
 Name: sliceline
-Version: 0.
-Summary:
-
+Version: 0.3.0
+Summary: Fast slice finding for Machine Learning model debugging.
+Project-URL: Homepage, https://github.com/DataDome/sliceline
+Project-URL: Documentation, https://sliceline.readthedocs.io/en/stable/
+Project-URL: Repository, https://github.com/DataDome/sliceline
 Author: Antoine de Daran
-
+License-Expression: BSD-3-Clause
+License-File: LICENSE
+Classifier: Development Status :: 4 - Beta
+Classifier: Intended Audience :: Developers
+Classifier: Intended Audience :: Science/Research
 Classifier: License :: OSI Approved :: BSD License
+Classifier: Operating System :: OS Independent
 Classifier: Programming Language :: Python :: 3
 Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3.11
 Classifier: Programming Language :: Python :: 3.12
-Classifier:
-Requires-
-Requires-Dist:
-Requires-Dist:
-
-
+Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
+Requires-Python: <4,>=3.10
+Requires-Dist: numpy<3,>=1.25
+Requires-Dist: scikit-learn<2,>=1.6.0
+Requires-Dist: scipy<2,>=1.12
+Provides-Extra: dev
+Requires-Dist: jupyter>=1.0.0; extra == 'dev'
+Requires-Dist: matplotlib>=3.9; extra == 'dev'
+Requires-Dist: nbconvert>=7.0.0; extra == 'dev'
+Requires-Dist: optbinning>=0.15.0; extra == 'dev'
+Requires-Dist: pandas>=2.1.1; extra == 'dev'
+Requires-Dist: pytest-benchmark>=4.0.0; extra == 'dev'
+Requires-Dist: pytest-cov>=3.0.0; extra == 'dev'
+Requires-Dist: pytest>=7.2.0; extra == 'dev'
+Requires-Dist: ruff>=0.9.0; extra == 'dev'
+Requires-Dist: sphinx-rtd-theme>=3.0.0; extra == 'dev'
+Requires-Dist: sphinx>=8.0.0; extra == 'dev'
+Provides-Extra: optimized
+Requires-Dist: numba>=0.60.0; extra == 'optimized'
 Description-Content-Type: text/x-rst
 
 Sliceline
@@ -85,6 +105,47 @@ Or, through SSH:
 
     pip install git+ssh://git@github.com/datadome/sliceline.git --upgrade
 
+⚡ Performance Optimization
+---------------------------
+
+Sliceline includes optional Numba JIT compilation for **5-50x performance improvements** on scoring operations.
+
+**Quick Installation:**
+
+.. code:: sh
+
+    # With optimization support
+    pip install sliceline[optimized]
+
+**Benefits:**
+
+- 5-6x faster scoring operations
+- 1.4-4.5x faster overall fit() performance
+- Up to 17% memory reduction on large datasets
+- Automatic fallback to pure NumPy if Numba not available
+
+**System Requirements:**
+
+Numba requires LLVM to be installed:
+
+.. code:: sh
+
+    # macOS
+    brew install llvm
+
+    # Linux (Ubuntu/Debian)
+    sudo apt-get install llvm
+
+**Verify Optimization:**
+
+.. code:: python
+
+    from sliceline import is_numba_available
+
+    print("Numba available:", is_numba_available())
+
+See the `performance benchmarks <https://github.com/DataDome/sliceline/tree/main/benchmarks>`__ for detailed metrics.
+
 🔗 Useful links
 ---------------
 
@@ -116,4 +177,3 @@ if you want to bring modifications to the code base.
 ----------
 
 Sliceline is free and open-source software licensed under the `3-clause BSD license <https://github.com/DataDome/sliceline/blob/main/LICENSE>`__.
-
sliceline-0.3.0.dist-info/RECORD
ADDED
@@ -0,0 +1,8 @@
+sliceline/__init__.py,sha256=6BE45x-4OhgXqkPRBmvXsTbfnZN5YzAUx8Wu8RIfWjE,106
+sliceline/_numba_ops.py,sha256=34IYOumWCfBMBpaNj5OxWcwJk2mFcnJTyKMmgTIjcIc,6455
+sliceline/slicefinder.py,sha256=umNscHR24iVU7C3kLTIkySMMAteD562vGIdcxHe3qLY,32042
+sliceline/validation.py,sha256=pydiTHlS6f1iBtlIqATLVHimhoyZDKTMrDjQIH2R9ks,30875
+sliceline-0.3.0.dist-info/METADATA,sha256=_rAxmaiYiWTNgnZzFO-ijMJAl2vPd4z7B0XZLCn4oQU,5518
+sliceline-0.3.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
+sliceline-0.3.0.dist-info/licenses/LICENSE,sha256=AbeN2ySrCt8VUJukqcQIYutROwZh3W2u0UU1d7EnbZs,1531
+sliceline-0.3.0.dist-info/RECORD,,
sliceline-0.2.20.dist-info/RECORD
DELETED
@@ -1,7 +0,0 @@
-sliceline/__init__.py,sha256=jEIUmQtv4W_eZuH63KQ8tAFoRZxyN3O8bRZ__FlMJr0,65
-sliceline/slicefinder.py,sha256=-6aP7hM_fWmFY_fxyXF4PTZmNYmXO_S3efsaL-6g3a8,26378
-sliceline/validation.py,sha256=-RkCpRdANNeaJyrdj7zFn4xs1X1xIXitKwRoL_B5EAk,30794
-sliceline-0.2.20.dist-info/LICENSE,sha256=AbeN2ySrCt8VUJukqcQIYutROwZh3W2u0UU1d7EnbZs,1531
-sliceline-0.2.20.dist-info/METADATA,sha256=khVwkNnYx4K3KaA4BC9gIq0MJmL5D4lOnYyUREbzoVQ,3717
-sliceline-0.2.20.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
-sliceline-0.2.20.dist-info/RECORD,,
{sliceline-0.2.20.dist-info → sliceline-0.3.0.dist-info/licenses}/LICENSE
RENAMED
File without changes