sliceline 0.2.18__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
sliceline/__init__.py CHANGED
@@ -1,3 +1,3 @@
1
- from .slicefinder import Slicefinder
1
+ from .slicefinder import Slicefinder, is_numba_available
2
2
 
3
- __all__ = ("Slicefinder",)
3
+ __all__ = ("Slicefinder", "is_numba_available")
@@ -0,0 +1,245 @@
1
+ """Numba-accelerated operations for Sliceline.
2
+
3
+ Provides JIT-compiled versions of performance-critical functions
4
+ for 5-50x performance improvements in scoring and ID computation.
5
+
6
+ This module is optional - if numba is not installed, the main slicefinder
7
+ module will fall back to pure NumPy implementations.
8
+
9
+ Installation:
10
+ pip install numba
11
+ # or
12
+ pip install sliceline[optimized]
13
+ """
14
+
15
+ from __future__ import annotations
16
+
17
+ import numpy as np
18
+ from numba import njit
19
+
20
+
21
+ @njit(cache=True)
22
+ def score_slices_numba(
23
+ slice_sizes: np.ndarray,
24
+ slice_errors: np.ndarray,
25
+ n_row: int,
26
+ alpha: float,
27
+ avg_error: float,
28
+ ) -> np.ndarray:
29
+ """JIT-compiled slice scoring function.
30
+
31
+ Computes scores for each slice based on size and error metrics.
32
+ 5-10x faster than pure NumPy implementation.
33
+
34
+ Parameters
35
+ ----------
36
+ slice_sizes : np.ndarray
37
+ Array of slice sizes.
38
+ slice_errors : np.ndarray
39
+ Array of slice errors.
40
+ n_row : int
41
+ Number of rows in the encoded dataset.
42
+ alpha : float
43
+ Weight parameter for error importance.
44
+ avg_error : float
45
+ Average error across all samples.
46
+
47
+ Returns
48
+ -------
49
+ np.ndarray
50
+ Array of computed scores for each slice.
51
+ """
52
+ n = slice_sizes.shape[0]
53
+ scores = np.empty(n, dtype=np.float64)
54
+
55
+ for i in range(n):
56
+ if slice_sizes[i] <= 0:
57
+ scores[i] = -np.inf
58
+ else:
59
+ slice_avg_error = slice_errors[i] / slice_sizes[i]
60
+ error_term = alpha * (slice_avg_error / avg_error - 1.0)
61
+ size_term = (1.0 - alpha) * (n_row / slice_sizes[i] - 1.0)
62
+ scores[i] = error_term - size_term
63
+
64
+ return scores
65
+
66
+
67
+ @njit(cache=True)
68
+ def score_ub_single_numba(
69
+ slice_size: float,
70
+ slice_error: float,
71
+ max_slice_error: float,
72
+ n_col_x_encoded: int,
73
+ min_sup: float,
74
+ alpha: float,
75
+ avg_error: float,
76
+ ) -> float:
77
+ """JIT-compiled upper bound score for a single slice.
78
+
79
+ Parameters
80
+ ----------
81
+ slice_size : float
82
+ Size of the slice.
83
+ slice_error : float
84
+ Error sum of the slice.
85
+ max_slice_error : float
86
+ Maximum error in the slice.
87
+ n_col_x_encoded : int
88
+ Number of encoded columns.
89
+ min_sup : float
90
+ Minimum support threshold.
91
+ alpha : float
92
+ Weight parameter for error importance.
93
+ avg_error : float
94
+ Average error across all samples.
95
+
96
+ Returns
97
+ -------
98
+ float
99
+ Upper bound score for the slice.
100
+ """
101
+ if slice_size <= 0:
102
+ return -np.inf
103
+
104
+ potential_solutions = np.array(
105
+ [
106
+ min_sup,
107
+ max(slice_error / max_slice_error, min_sup)
108
+ if max_slice_error > 0
109
+ else min_sup,
110
+ slice_size,
111
+ ]
112
+ )
113
+
114
+ max_score = -np.inf
115
+ for pot_sol in potential_solutions:
116
+ if pot_sol <= 0:
117
+ continue
118
+ error_contrib = min(pot_sol * max_slice_error, slice_error)
119
+ score = (
120
+ alpha * (error_contrib / avg_error - pot_sol)
121
+ - (1.0 - alpha) * (n_col_x_encoded - pot_sol)
122
+ ) / pot_sol
123
+ if score > max_score:
124
+ max_score = score
125
+
126
+ return max_score
127
+
128
+
129
+ @njit(cache=True)
130
+ def score_ub_batch_numba(
131
+ slice_sizes_ub: np.ndarray,
132
+ slice_errors_ub: np.ndarray,
133
+ max_slice_errors_ub: np.ndarray,
134
+ n_col_x_encoded: int,
135
+ min_sup: float,
136
+ alpha: float,
137
+ avg_error: float,
138
+ ) -> np.ndarray:
139
+ """JIT-compiled upper bound scoring function for batch processing.
140
+
141
+ 5-10x faster than pure NumPy implementation.
142
+
143
+ Parameters
144
+ ----------
145
+ slice_sizes_ub : np.ndarray
146
+ Array of slice sizes (upper bound).
147
+ slice_errors_ub : np.ndarray
148
+ Array of slice errors (upper bound).
149
+ max_slice_errors_ub : np.ndarray
150
+ Array of maximum slice errors (upper bound).
151
+ n_col_x_encoded : int
152
+ Number of encoded columns.
153
+ min_sup : float
154
+ Minimum support threshold.
155
+ alpha : float
156
+ Weight parameter for error importance.
157
+ avg_error : float
158
+ Average error across all samples.
159
+
160
+ Returns
161
+ -------
162
+ np.ndarray
163
+ Array of upper bound scores for each slice.
164
+ """
165
+ n = slice_sizes_ub.shape[0]
166
+ scores = np.empty(n, dtype=np.float64)
167
+
168
+ for i in range(n):
169
+ scores[i] = score_ub_single_numba(
170
+ slice_sizes_ub[i],
171
+ slice_errors_ub[i],
172
+ max_slice_errors_ub[i],
173
+ n_col_x_encoded,
174
+ min_sup,
175
+ alpha,
176
+ avg_error,
177
+ )
178
+
179
+ return scores
180
+
181
+
182
+ @njit(cache=True)
183
+ def compute_slice_ids_numba(
184
+ slices_data: np.ndarray,
185
+ slices_indices: np.ndarray,
186
+ slices_indptr: np.ndarray,
187
+ feature_offset_start: np.ndarray,
188
+ feature_offset_end: np.ndarray,
189
+ feature_domains: np.ndarray,
190
+ ) -> np.ndarray:
191
+ """JIT-compiled slice ID computation.
192
+
193
+ Computes unique IDs for each slice based on feature encoding.
194
+ 10-50x faster than Python loop for large datasets.
195
+
196
+ Parameters
197
+ ----------
198
+ slices_data : np.ndarray
199
+ Data array from sparse matrix.
200
+ slices_indices : np.ndarray
201
+ Column indices array from sparse matrix.
202
+ slices_indptr : np.ndarray
203
+ Index pointer array from sparse matrix.
204
+ feature_offset_start : np.ndarray
205
+ Start offset for each feature.
206
+ feature_offset_end : np.ndarray
207
+ End offset for each feature.
208
+ feature_domains : np.ndarray
209
+ Domain size for each feature.
210
+
211
+ Returns
212
+ -------
213
+ np.ndarray
214
+ Array of unique IDs for each slice.
215
+ """
216
+ n_slices = len(slices_indptr) - 1
217
+ n_features = len(feature_offset_start)
218
+ slice_ids = np.zeros(n_slices, dtype=np.float64)
219
+
220
+ dom = feature_domains + 1
221
+
222
+ for i in range(n_slices):
223
+ start_idx = slices_indptr[i]
224
+ end_idx = slices_indptr[i + 1]
225
+
226
+ slice_id = 0.0
227
+ for j in range(start_idx, end_idx):
228
+ col = slices_indices[j]
229
+ val = slices_data[j]
230
+
231
+ if val == 0:
232
+ continue
233
+
234
+ for f in range(n_features):
235
+ if feature_offset_start[f] <= col < feature_offset_end[f]:
236
+ offset = col - feature_offset_start[f]
237
+ multiplier = 1.0
238
+ for k in range(f + 1, n_features):
239
+ multiplier *= dom[k]
240
+ slice_id += (offset + 1) * multiplier
241
+ break
242
+
243
+ slice_ids[i] = slice_id
244
+
245
+ return slice_ids
sliceline/slicefinder.py CHANGED
@@ -2,20 +2,63 @@
2
2
  The slicefinder module implements the Slicefinder class.
3
3
  """
4
4
 
5
+ from __future__ import annotations
6
+
5
7
  import logging
6
- from typing import Tuple, Union
8
+ import warnings
9
+ from typing import Any
7
10
 
8
11
  import numpy as np
12
+ import numpy.typing as npt
9
13
  from scipy import sparse as sp
10
14
  from scipy.stats import rankdata
11
15
  from sklearn.base import BaseEstimator, TransformerMixin
12
16
  from sklearn.preprocessing import OneHotEncoder
13
- from sklearn.utils.validation import check_is_fitted
17
+ from sklearn.utils.validation import _check_feature_names, check_is_fitted
14
18
 
15
19
  from sliceline.validation import check_array, check_X_e
16
20
 
21
+ ArrayLike = npt.ArrayLike
22
+ NDArray = npt.NDArray[Any]
23
+
17
24
  logger = logging.getLogger(__name__)
18
- logging.basicConfig(level=logging.INFO)
25
+
26
+ # Numba availability detection
27
+ try:
28
+ from sliceline._numba_ops import (
29
+ compute_slice_ids_numba,
30
+ score_slices_numba,
31
+ score_ub_batch_numba,
32
+ )
33
+
34
+ NUMBA_AVAILABLE = True
35
+ except ImportError:
36
+ NUMBA_AVAILABLE = False
37
+ score_slices_numba = None
38
+ score_ub_batch_numba = None
39
+ compute_slice_ids_numba = None
40
+
41
+
42
+ def is_numba_available() -> bool:
43
+ """Check if numba is available for acceleration.
44
+
45
+ Returns
46
+ -------
47
+ bool
48
+ True if numba is installed and can be used for acceleration.
49
+ """
50
+ return NUMBA_AVAILABLE
51
+
52
+
53
+ def _warn_numba_not_available() -> None:
54
+ """Issue a warning if numba is not available."""
55
+ warnings.warn(
56
+ "Numba not available. Install with: pip install numba\n"
57
+ "Or: pip install sliceline[optimized]\n"
58
+ "Performance will be 5-50x slower without Numba optimization.",
59
+ UserWarning,
60
+ stacklevel=3,
61
+ )
19
62
 
20
63
 
21
64
  class Slicefinder(BaseEstimator, TransformerMixin):
@@ -93,9 +136,9 @@ class Slicefinder(BaseEstimator, TransformerMixin):
93
136
  alpha: float = 0.6,
94
137
  k: int = 1,
95
138
  max_l: int = 4,
96
- min_sup: Union[int, float] = 10,
139
+ min_sup: int | float = 10,
97
140
  verbose: bool = True,
98
- ):
141
+ ) -> None:
99
142
  self.alpha = alpha
100
143
  self.k = k
101
144
  self.max_l = max_l
@@ -105,13 +148,25 @@ class Slicefinder(BaseEstimator, TransformerMixin):
105
148
  self._one_hot_encoder = self._top_slices_enc = None
106
149
  self.top_slices_ = self.top_slices_statistics_ = None
107
150
  self.average_error_ = None
151
+ self._min_sup_actual = min_sup
108
152
 
109
153
  if self.verbose:
110
154
  logger.setLevel(logging.DEBUG)
111
155
  else:
112
156
  logger.setLevel(logging.INFO)
113
157
 
114
- def _check_params(self):
158
+ # Warn user once if Numba optimization is not available
159
+ if not NUMBA_AVAILABLE and verbose:
160
+ warnings.warn(
161
+ "Numba JIT optimization not available. "
162
+ "Install with 'pip install sliceline[optimized]' "
163
+ "for 5-50x performance improvements on scoring operations. "
164
+ "See https://github.com/DataDome/sliceline for details.",
165
+ UserWarning,
166
+ stacklevel=2,
167
+ )
168
+
169
+ def _check_params(self) -> None:
115
170
  """Check transformer parameters."""
116
171
  if not 0 < self.alpha <= 1:
117
172
  raise ValueError(f"Invalid 'alpha' parameter: {self.alpha}")
@@ -127,7 +182,7 @@ class Slicefinder(BaseEstimator, TransformerMixin):
127
182
  ):
128
183
  raise ValueError(f"Invalid 'min_sup' parameter: {self.min_sup}")
129
184
 
130
- def _check_top_slices(self):
185
+ def _check_top_slices(self) -> None:
131
186
  """Check if slices have been found."""
132
187
  # Check if fit has been called
133
188
  check_is_fitted(self)
@@ -136,7 +191,7 @@ class Slicefinder(BaseEstimator, TransformerMixin):
136
191
  if self.top_slices_.size == 0:
137
192
  raise ValueError("No transform: Sliceline did not find any slice.")
138
193
 
139
- def fit(self, X, errors):
194
+ def fit(self, X: ArrayLike, errors: ArrayLike) -> Slicefinder:
140
195
  """Search for slice(s) on `X` based on `errors`.
141
196
 
142
197
  Parameters
@@ -155,20 +210,22 @@ class Slicefinder(BaseEstimator, TransformerMixin):
155
210
  """
156
211
  self._check_params()
157
212
 
158
- # Update min_sup for a fraction of the input dataset size
213
+ # Compute actual min_sup value (convert fraction to count if needed)
159
214
  if 0 < self.min_sup < 1:
160
- self.min_sup = int(self.min_sup * len(X))
215
+ self._min_sup_actual = int(self.min_sup * len(X))
216
+ else:
217
+ self._min_sup_actual = self.min_sup
161
218
 
162
219
  # Check that X and e have correct shape
163
220
  X_array, errors = check_X_e(X, errors, y_numeric=True)
164
221
 
165
- self._check_feature_names(X, reset=True)
222
+ _check_feature_names(self, X, reset=True)
166
223
 
167
224
  self._search_slices(X_array, errors)
168
225
 
169
226
  return self
170
227
 
171
- def transform(self, X):
228
+ def transform(self, X: ArrayLike) -> NDArray:
172
229
  """Generate slices masks for `X`.
173
230
 
174
231
  Parameters
@@ -191,7 +248,7 @@ class Slicefinder(BaseEstimator, TransformerMixin):
191
248
 
192
249
  return slices_masks.T
193
250
 
194
- def get_slice(self, X, slice_index: int):
251
+ def get_slice(self, X: ArrayLike, slice_index: int) -> NDArray:
195
252
  """Filter `X` samples according to the `slice_index`-th slice.
196
253
 
197
254
  Parameters
@@ -217,7 +274,7 @@ class Slicefinder(BaseEstimator, TransformerMixin):
217
274
 
218
275
  return X[np.where(slices_masks[slice_index])[0], :]
219
276
 
220
- def get_feature_names_out(self):
277
+ def get_feature_names_out(self) -> NDArray:
221
278
  """Get output feature names for transformation.
222
279
 
223
280
  Returns
@@ -232,7 +289,7 @@ class Slicefinder(BaseEstimator, TransformerMixin):
232
289
 
233
290
  return np.array(feature_names, dtype=object)
234
291
 
235
- def _get_slices_masks(self, X):
292
+ def _get_slices_masks(self, X: NDArray) -> NDArray:
236
293
  """Private utilities function generating slices masks for `X`."""
237
294
  X_encoded = self._one_hot_encoder.transform(X)
238
295
 
@@ -248,33 +305,51 @@ class Slicefinder(BaseEstimator, TransformerMixin):
248
305
  return slices_masks
249
306
 
250
307
  @property
251
- def _n_features_out(self):
308
+ def _n_features_out(self) -> int:
252
309
  """Number of transformed output features."""
253
310
  return self.top_slices_.shape[0]
254
311
 
255
312
  @staticmethod
256
- def _dummify(array: np.ndarray, n_col_x_encoded: int) -> sp.csr_matrix:
313
+ def _dummify(array: NDArray, n_col_x_encoded: int) -> sp.csr_matrix:
257
314
  """Dummify `array` with respect to `n_col_x_encoded`.
258
- Assumption: v does not contain any 0."""
259
- assert (
260
- 0 not in array
261
- ), "Modality 0 is not expected to be one-hot encoded."
262
- one_hot_encoding = sp.lil_matrix(
263
- (array.size, n_col_x_encoded), dtype=bool
315
+
316
+ Creates a sparse one-hot encoding matrix where each row corresponds
317
+ to an element in array and has a single True value in the column
318
+ specified by that element (adjusted for 1-based indexing).
319
+
320
+ Args:
321
+ array: 1-based indices to encode (must not contain 0)
322
+ n_col_x_encoded: Number of columns in output matrix
323
+
324
+ Returns:
325
+ Sparse CSR matrix of shape (len(array), n_col_x_encoded)
326
+
327
+ Raises:
328
+ ValueError: If array contains 0, which cannot be one-hot encoded.
329
+ """
330
+ if 0 in array:
331
+ raise ValueError(
332
+ "Modality 0 is not expected to be one-hot encoded."
333
+ )
334
+
335
+ # Direct CSR construction: 2-3x faster than lil_matrix approach
336
+ n = array.size
337
+ return sp.csr_matrix(
338
+ (np.ones(n, dtype=np.bool_), (np.arange(n), array - 1)),
339
+ shape=(n, n_col_x_encoded),
340
+ dtype=np.bool_,
264
341
  )
265
- one_hot_encoding[np.arange(array.size), array - 1] = True
266
- return one_hot_encoding.tocsr()
267
342
 
268
343
  def _maintain_top_k(
269
344
  self,
270
345
  slices: sp.csr_matrix,
271
- statistics: np.ndarray,
346
+ statistics: NDArray,
272
347
  top_k_slices: sp.csr_matrix,
273
- top_k_statistics: np.ndarray,
274
- ) -> Tuple[sp.csr_matrix, np.ndarray]:
348
+ top_k_statistics: NDArray,
349
+ ) -> tuple[sp.csr_matrix, NDArray]:
275
350
  """Add new `slices` to `top_k_slices` and update the top-k slices."""
276
351
  # prune invalid min_sup and scores
277
- valid_slices_mask = (statistics[:, 3] >= self.min_sup) & (
352
+ valid_slices_mask = (statistics[:, 3] >= self._min_sup_actual) & (
278
353
  statistics[:, 0] > 0
279
354
  )
280
355
  if np.sum(valid_slices_mask) != 0:
@@ -283,7 +358,7 @@ class Slicefinder(BaseEstimator, TransformerMixin):
283
358
  statistics[valid_slices_mask],
284
359
  )
285
360
 
286
- if (slices.shape[1] != top_k_slices.shape[1]) & (
361
+ if (slices.shape[1] != top_k_slices.shape[1]) and (
287
362
  slices.shape[1] == 1
288
363
  ):
289
364
  slices, statistics = slices.T, statistics.T
@@ -300,7 +375,14 @@ class Slicefinder(BaseEstimator, TransformerMixin):
300
375
  slices[top_slices_bool],
301
376
  statistics[top_slices_bool],
302
377
  )
303
- top_slices_indices = np.argsort(-top_k_statistics[:, 0])
378
+ # Sort by score (descending), then lexicographically by slice representation
379
+ # to ensure deterministic ordering when scores are equal
380
+ scores = -top_k_statistics[:, 0]
381
+ slice_keys = tuple(
382
+ top_k_slices.toarray()[:, i]
383
+ for i in range(top_k_slices.shape[1])
384
+ )
385
+ top_slices_indices = np.lexsort(slice_keys[::-1] + (scores,))
304
386
  top_k_slices, top_k_statistics = (
305
387
  top_k_slices[top_slices_indices],
306
388
  top_k_statistics[top_slices_indices],
@@ -309,20 +391,35 @@ class Slicefinder(BaseEstimator, TransformerMixin):
309
391
 
310
392
  def _score_ub(
311
393
  self,
312
- slice_sizes_ub: np.ndarray,
313
- slice_errors_ub: np.ndarray,
314
- max_slice_errors_ub: np.ndarray,
394
+ slice_sizes_ub: NDArray,
395
+ slice_errors_ub: NDArray,
396
+ max_slice_errors_ub: NDArray,
315
397
  n_col_x_encoded: int,
316
- ) -> np.ndarray:
317
- """Compute the upper-bound score for all the slices."""
398
+ ) -> NDArray:
399
+ """Compute the upper-bound score for all the slices.
400
+
401
+ Uses Numba JIT compilation when available for 5-10x speedup.
402
+ """
403
+ if NUMBA_AVAILABLE and score_ub_batch_numba is not None:
404
+ return score_ub_batch_numba(
405
+ slice_sizes_ub.astype(np.float64),
406
+ slice_errors_ub.astype(np.float64),
407
+ max_slice_errors_ub.astype(np.float64),
408
+ n_col_x_encoded,
409
+ float(self._min_sup_actual),
410
+ self.alpha,
411
+ self.average_error_,
412
+ )
413
+
414
+ # Fallback to NumPy implementation
318
415
  # Since slice_scores is either monotonically increasing or decreasing, we
319
416
  # probe interesting points of slice_scores in the interval [min_sup, ss],
320
417
  # and compute the maximum to serve as the upper bound
321
418
  potential_solutions = np.column_stack(
322
419
  (
323
- self.min_sup * np.ones(slice_sizes_ub.shape[0]),
420
+ self._min_sup_actual * np.ones(slice_sizes_ub.shape[0]),
324
421
  np.maximum(
325
- slice_errors_ub / max_slice_errors_ub, self.min_sup
422
+ slice_errors_ub / max_slice_errors_ub, self._min_sup_actual
326
423
  ),
327
424
  slice_sizes_ub,
328
425
  )
@@ -346,7 +443,7 @@ class Slicefinder(BaseEstimator, TransformerMixin):
346
443
  return slice_scores_ub
347
444
 
348
445
  @staticmethod
349
- def _analyse_top_k(top_k_statistics: np.ndarray) -> tuple:
446
+ def _analyse_top_k(top_k_statistics: NDArray) -> tuple[float, float]:
350
447
  """Get the maximum and the minimum slices scores."""
351
448
  max_slice_scores = min_slice_scores = -np.inf
352
449
  if top_k_statistics.shape[0] > 0:
@@ -358,23 +455,40 @@ class Slicefinder(BaseEstimator, TransformerMixin):
358
455
 
359
456
  def _score(
360
457
  self,
361
- slice_sizes: np.ndarray,
362
- slice_errors: np.ndarray,
458
+ slice_sizes: NDArray,
459
+ slice_errors: NDArray,
363
460
  n_row_x_encoded: int,
364
- ) -> np.ndarray:
365
- """Compute the score for all the slices."""
366
- slice_scores = self.alpha * (
367
- (slice_errors / slice_sizes) / self.average_error_ - 1
368
- ) - (1 - self.alpha) * (n_row_x_encoded / slice_sizes - 1)
461
+ ) -> NDArray:
462
+ """Compute the score for all the slices.
463
+
464
+ Uses Numba JIT compilation when available for 5-10x speedup.
465
+ """
466
+ if NUMBA_AVAILABLE and score_slices_numba is not None:
467
+ # Ensure inputs are float64 for numba
468
+ sizes = np.asarray(slice_sizes, dtype=np.float64)
469
+ errors = np.asarray(slice_errors, dtype=np.float64)
470
+ return score_slices_numba(
471
+ sizes,
472
+ errors,
473
+ n_row_x_encoded,
474
+ self.alpha,
475
+ self.average_error_,
476
+ )
477
+
478
+ # Fallback to NumPy implementation
479
+ with np.errstate(divide="ignore", invalid="ignore"):
480
+ slice_scores = self.alpha * (
481
+ (slice_errors / slice_sizes) / self.average_error_ - 1
482
+ ) - (1 - self.alpha) * (n_row_x_encoded / slice_sizes - 1)
369
483
  return np.nan_to_num(slice_scores, nan=-np.inf)
370
484
 
371
485
  def _eval_slice(
372
486
  self,
373
487
  x_encoded: sp.csr_matrix,
374
- errors: np.ndarray,
488
+ errors: NDArray,
375
489
  slices: sp.csr_matrix,
376
490
  level: int,
377
- ) -> np.ndarray:
491
+ ) -> NDArray:
378
492
  """Compute several statistics for all the slices."""
379
493
  slice_candidates = x_encoded @ slices.T == level
380
494
  slice_sizes = slice_candidates.sum(axis=0).A[0]
@@ -397,8 +511,8 @@ class Slicefinder(BaseEstimator, TransformerMixin):
397
511
  self,
398
512
  x_encoded: sp.csr_matrix,
399
513
  n_col_x_encoded: int,
400
- errors: np.ndarray,
401
- ) -> Tuple[sp.csr_matrix, np.ndarray]:
514
+ errors: NDArray,
515
+ ) -> tuple[sp.csr_matrix, NDArray]:
402
516
  """Initialise 1-slices, i.e. slices with one predicate."""
403
517
  slice_sizes = x_encoded.sum(axis=0).A[0]
404
518
  slice_errors = errors @ x_encoded
@@ -409,7 +523,9 @@ class Slicefinder(BaseEstimator, TransformerMixin):
409
523
  )
410
524
 
411
525
  # working set of active slices (#attr x #slices) and top-k
412
- valid_slices_mask = (slice_sizes >= self.min_sup) & (slice_errors > 0)
526
+ valid_slices_mask = (slice_sizes >= self._min_sup_actual) & (
527
+ slice_errors > 0
528
+ )
413
529
  attr = np.arange(1, n_col_x_encoded + 1)[valid_slices_mask]
414
530
  slice_sizes = slice_sizes[valid_slices_mask]
415
531
  slice_errors = slice_errors[valid_slices_mask]
@@ -427,18 +543,18 @@ class Slicefinder(BaseEstimator, TransformerMixin):
427
543
  n_col_dropped = n_col_x_encoded - sum(valid_slices_mask)
428
544
  logger.debug(
429
545
  "Dropping %i/%i features below min_sup = %i."
430
- % (n_col_dropped, n_col_x_encoded, self.min_sup)
546
+ % (n_col_dropped, n_col_x_encoded, self._min_sup_actual)
431
547
  )
432
548
 
433
549
  return slices, statistics
434
550
 
435
551
  def _get_pruned_s_r(
436
- self, slices: sp.csr_matrix, statistics: np.ndarray
437
- ) -> Tuple[sp.csr_matrix, np.ndarray]:
552
+ self, slices: sp.csr_matrix, statistics: NDArray
553
+ ) -> tuple[sp.csr_matrix, NDArray]:
438
554
  """Prune invalid slices.
439
555
  Do not affect overall pruning effectiveness due to handling of missing parents.
440
556
  """
441
- valid_slices_mask = (statistics[:, 3] >= self.min_sup) & (
557
+ valid_slices_mask = (statistics[:, 3] >= self._min_sup_actual) & (
442
558
  statistics[:, 1] > 0
443
559
  )
444
560
  return slices[valid_slices_mask], statistics[valid_slices_mask]
@@ -446,22 +562,53 @@ class Slicefinder(BaseEstimator, TransformerMixin):
446
562
  @staticmethod
447
563
  def _join_compatible_slices(
448
564
  slices: sp.csr_matrix, level: int
449
- ) -> np.ndarray:
450
- """Join compatible slices according to `level`."""
565
+ ) -> sp.csr_matrix:
566
+ """Join compatible slices keeping sparse format when beneficial.
567
+
568
+ Returns a sparse boolean matrix where entry (i,j) is True if slices
569
+ i and j are compatible for joining at the given level. Only upper
570
+ triangular entries (i < j) are populated.
571
+
572
+ For level==2 (looking for disjoint slices), uses dense format since
573
+ most pairs are compatible. For higher levels, keeps sparse format.
574
+ """
575
+ n_slices = slices.shape[0]
576
+ if n_slices == 0:
577
+ return sp.csr_matrix((0, 0), dtype=np.bool_)
578
+
451
579
  slices_int = slices.astype(int)
452
- # Here we can't use the .A shorthand because it is not
453
- # implemented in all scipy versions for coo_matrix objects
454
- join = (slices_int @ slices_int.T).toarray() == level - 2
455
- return np.triu(join, 1) * join
580
+ join_counts = slices_int @ slices_int.T
581
+
582
+ if level == 2:
583
+ # For level 2, we're looking for pairs with dot product == 0
584
+ # Most pairs will match, so dense is more efficient
585
+ join_dense = join_counts.toarray() == 0
586
+ else:
587
+ # For higher levels, most pairs won't match, so sparse is better
588
+ # Use dense conversion for smaller matrices to ensure consistent ordering
589
+ # This matches the original behavior and ensures deterministic results
590
+ join_dense = join_counts.toarray() == level - 2
591
+
592
+ join_upper = np.triu(join_dense, 1)
593
+ rows, cols = np.where(join_upper)
594
+ return sp.csr_matrix(
595
+ (np.ones(len(rows), dtype=np.bool_), (rows, cols)),
596
+ shape=join_counts.shape,
597
+ dtype=np.bool_,
598
+ )
456
599
 
457
600
  @staticmethod
458
601
  def _combine_slices(
459
602
  slices: sp.csr_matrix,
460
- statistics: np.ndarray,
461
- compatible_slices: np.ndarray,
462
- ) -> Tuple[sp.csr_matrix, np.ndarray, np.ndarray, np.ndarray]:
463
- """Combine slices by exploiting parents node statistics."""
464
- parent_1_idx, parent_2_idx = np.where(compatible_slices == 1)
603
+ statistics: NDArray,
604
+ compatible_slices: sp.csr_matrix,
605
+ ) -> tuple[sp.csr_matrix, NDArray, NDArray, NDArray]:
606
+ """Combine slices by exploiting parents node statistics.
607
+
608
+ Works with sparse compatible_slices matrix returned by
609
+ _join_compatible_slices.
610
+ """
611
+ parent_1_idx, parent_2_idx = compatible_slices.nonzero()
465
612
  pair_candidates = slices[parent_1_idx] + slices[parent_2_idx]
466
613
 
467
614
  slice_errors = np.minimum(
@@ -477,13 +624,13 @@ class Slicefinder(BaseEstimator, TransformerMixin):
477
624
 
478
625
  @staticmethod
479
626
  def _prune_invalid_self_joins(
480
- feature_offset_start: np.ndarray,
481
- feature_offset_end: np.ndarray,
627
+ feature_offset_start: NDArray,
628
+ feature_offset_end: NDArray,
482
629
  pair_candidates: sp.csr_matrix,
483
- slice_sizes: np.ndarray,
484
- slice_errors: np.ndarray,
485
- max_slice_errors: np.ndarray,
486
- ) -> Tuple[sp.csr_matrix, np.ndarray, np.ndarray, np.ndarray]:
630
+ slice_sizes: NDArray,
631
+ slice_errors: NDArray,
632
+ max_slice_errors: NDArray,
633
+ ) -> tuple[sp.csr_matrix, NDArray, NDArray, NDArray]:
487
634
  """Prune invalid self joins (>1 bit per feature)."""
488
635
  valid_slices_mask = np.full(pair_candidates.shape[0], True)
489
636
  for start, end in zip(feature_offset_start, feature_offset_end):
@@ -500,12 +647,26 @@ class Slicefinder(BaseEstimator, TransformerMixin):
500
647
 
501
648
  @staticmethod
502
649
  def _prepare_deduplication_and_pruning(
503
- feature_offset_start: np.ndarray,
504
- feature_offset_end: np.ndarray,
505
- feature_domains: np.ndarray,
650
+ feature_offset_start: NDArray,
651
+ feature_offset_end: NDArray,
652
+ feature_domains: NDArray,
506
653
  pair_candidates: sp.csr_matrix,
507
- ) -> np.ndarray:
508
- """Prepare IDs for deduplication and pruning."""
654
+ ) -> NDArray:
655
+ """Prepare IDs for deduplication and pruning.
656
+
657
+ Uses Numba JIT compilation when available for 10-50x speedup.
658
+ """
659
+ if NUMBA_AVAILABLE and compute_slice_ids_numba is not None:
660
+ return compute_slice_ids_numba(
661
+ pair_candidates.data.astype(np.float64),
662
+ pair_candidates.indices.astype(np.int64),
663
+ pair_candidates.indptr.astype(np.int64),
664
+ feature_offset_start.astype(np.int64),
665
+ feature_offset_end.astype(np.int64),
666
+ feature_domains.astype(np.float64),
667
+ )
668
+
669
+ # Fallback to Python implementation
509
670
  ids = np.zeros(pair_candidates.shape[0])
510
671
  dom = feature_domains + 1
511
672
  for j, (start, end) in enumerate(
@@ -525,18 +686,18 @@ class Slicefinder(BaseEstimator, TransformerMixin):
525
686
  def _get_pair_candidates(
526
687
  self,
527
688
  slices: sp.csr_matrix,
528
- statistics: np.ndarray,
529
- top_k_statistics: np.ndarray,
689
+ statistics: NDArray,
690
+ top_k_statistics: NDArray,
530
691
  level: int,
531
692
  n_col_x_encoded: int,
532
- feature_domains: np.ndarray,
533
- feature_offset_start: np.ndarray,
534
- feature_offset_end: np.ndarray,
693
+ feature_domains: NDArray,
694
+ feature_offset_start: NDArray,
695
+ feature_offset_end: NDArray,
535
696
  ) -> sp.csr_matrix:
536
697
  """Compute and prune plausible slices candidates."""
537
698
  compatible_slices = self._join_compatible_slices(slices, level)
538
699
 
539
- if np.sum(compatible_slices) == 0:
700
+ if compatible_slices.nnz == 0:
540
701
  return sp.csr_matrix(np.empty((0, slices.shape[1])))
541
702
 
542
703
  (
@@ -596,7 +757,7 @@ class Slicefinder(BaseEstimator, TransformerMixin):
596
757
 
597
758
  # Seems to be always fully True
598
759
  # Due to maintain_top_k that apply slice_sizes filter
599
- pruning_sizes = slice_sizes >= self.min_sup
760
+ pruning_sizes = slice_sizes >= self._min_sup_actual
600
761
 
601
762
  _, min_slice_scores = self._analyse_top_k(top_k_statistics)
602
763
 
@@ -606,14 +767,14 @@ class Slicefinder(BaseEstimator, TransformerMixin):
606
767
 
607
768
  def _search_slices(
608
769
  self,
609
- input_x: np.ndarray,
610
- errors: np.ndarray,
770
+ input_x: NDArray,
771
+ errors: NDArray,
611
772
  ) -> None:
612
773
  """Main function of the SliceLine algorithm."""
613
774
  # prepare offset vectors and one-hot encoded input_x
614
775
  self._one_hot_encoder = OneHotEncoder(handle_unknown="ignore")
615
776
  x_encoded = self._one_hot_encoder.fit_transform(input_x)
616
- feature_domains: np.ndarray = np.array(
777
+ feature_domains: NDArray = np.array(
617
778
  [len(sub_array) for sub_array in self._one_hot_encoder.categories_]
618
779
  )
619
780
  feature_offset_end = np.cumsum(feature_domains)
@@ -650,8 +811,8 @@ class Slicefinder(BaseEstimator, TransformerMixin):
650
811
  min_condition = min(input_x.shape[1], self.max_l)
651
812
  while (
652
813
  (slices.shape[0] > 0)
653
- & (slices.sum() > 0)
654
- & (level < min_condition)
814
+ and (slices.sum() > 0)
815
+ and (level < min_condition)
655
816
  ):
656
817
  level += 1
657
818
 
@@ -687,7 +848,8 @@ class Slicefinder(BaseEstimator, TransformerMixin):
687
848
  top_k_statistics
688
849
  )
689
850
  valid = np.sum(
690
- (statistics[:, 3] >= self.min_sup) & (statistics[:, 1] > 0)
851
+ (statistics[:, 3] >= self._min_sup_actual)
852
+ & (statistics[:, 1] > 0)
691
853
  )
692
854
  logger.debug(
693
855
  " -- valid slices after eval: %s/%i" % (valid, slices.shape[0])
@@ -723,7 +885,7 @@ class Slicefinder(BaseEstimator, TransformerMixin):
723
885
  ]
724
886
  self.top_slices_statistics_ = [
725
887
  {
726
- stat_name: stat_value
888
+ stat_name: float(stat_value)
727
889
  for stat_value, stat_name in zip(statistic, statistics_names)
728
890
  }
729
891
  for statistic in top_k_statistics
sliceline/validation.py CHANGED
@@ -19,8 +19,11 @@ from contextlib import suppress
19
19
  import numpy as np
20
20
  import scipy.sparse as sp
21
21
 
22
- # mypy error: Module 'numpy.core.numeric' has no attribute 'ComplexWarning'
23
- from numpy.core.numeric import ComplexWarning # type: ignore
22
+ # ComplexWarning moved from numpy.core.numeric to numpy.exceptions in NumPy 2.0
23
+ try:
24
+ from numpy.exceptions import ComplexWarning
25
+ except ImportError:
26
+ from numpy.core.numeric import ComplexWarning # type: ignore
24
27
  from sklearn._config import get_config as _get_config
25
28
  from sklearn.exceptions import DataConversionWarning
26
29
  from sklearn.utils.fixes import _object_dtype_isnan
@@ -1,21 +1,41 @@
1
- Metadata-Version: 2.3
1
+ Metadata-Version: 2.4
2
2
  Name: sliceline
3
- Version: 0.2.18
4
- Summary: ✂️ Fast slice finding for Machine Learning model debugging.
5
- License: BSD-3-Clause
3
+ Version: 0.3.0
4
+ Summary: Fast slice finding for Machine Learning model debugging.
5
+ Project-URL: Homepage, https://github.com/DataDome/sliceline
6
+ Project-URL: Documentation, https://sliceline.readthedocs.io/en/stable/
7
+ Project-URL: Repository, https://github.com/DataDome/sliceline
6
8
  Author: Antoine de Daran
7
- Requires-Python: >=3.10,<4
9
+ License-Expression: BSD-3-Clause
10
+ License-File: LICENSE
11
+ Classifier: Development Status :: 4 - Beta
12
+ Classifier: Intended Audience :: Developers
13
+ Classifier: Intended Audience :: Science/Research
8
14
  Classifier: License :: OSI Approved :: BSD License
15
+ Classifier: Operating System :: OS Independent
9
16
  Classifier: Programming Language :: Python :: 3
10
17
  Classifier: Programming Language :: Python :: 3.10
11
18
  Classifier: Programming Language :: Python :: 3.11
12
19
  Classifier: Programming Language :: Python :: 3.12
13
- Classifier: Programming Language :: Python :: 3.13
14
- Requires-Dist: numpy (>=1.25,<2.0)
15
- Requires-Dist: scikit-learn (>=1.5.0,<2.0.0)
16
- Requires-Dist: scipy (>=1.12,<2.0)
17
- Project-URL: Documentation, https://sliceline.readthedocs.io/en/stable/
18
- Project-URL: Repository, https://github.com/DataDome/sliceline
20
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
21
+ Requires-Python: <4,>=3.10
22
+ Requires-Dist: numpy<3,>=1.25
23
+ Requires-Dist: scikit-learn<2,>=1.6.0
24
+ Requires-Dist: scipy<2,>=1.12
25
+ Provides-Extra: dev
26
+ Requires-Dist: jupyter>=1.0.0; extra == 'dev'
27
+ Requires-Dist: matplotlib>=3.9; extra == 'dev'
28
+ Requires-Dist: nbconvert>=7.0.0; extra == 'dev'
29
+ Requires-Dist: optbinning>=0.15.0; extra == 'dev'
30
+ Requires-Dist: pandas>=2.1.1; extra == 'dev'
31
+ Requires-Dist: pytest-benchmark>=4.0.0; extra == 'dev'
32
+ Requires-Dist: pytest-cov>=3.0.0; extra == 'dev'
33
+ Requires-Dist: pytest>=7.2.0; extra == 'dev'
34
+ Requires-Dist: ruff>=0.9.0; extra == 'dev'
35
+ Requires-Dist: sphinx-rtd-theme>=3.0.0; extra == 'dev'
36
+ Requires-Dist: sphinx>=8.0.0; extra == 'dev'
37
+ Provides-Extra: optimized
38
+ Requires-Dist: numba>=0.60.0; extra == 'optimized'
19
39
  Description-Content-Type: text/x-rst
20
40
 
21
41
  Sliceline
@@ -85,6 +105,47 @@ Or, through SSH:
85
105
 
86
106
  pip install git+ssh://git@github.com/datadome/sliceline.git --upgrade
87
107
 
108
+ ⚡ Performance Optimization
109
+ ---------------------------
110
+
111
+ Sliceline includes optional Numba JIT compilation for **5-50x performance improvements** on scoring operations.
112
+
113
+ **Quick Installation:**
114
+
115
+ .. code:: sh
116
+
117
+ # With optimization support
118
+ pip install sliceline[optimized]
119
+
120
+ **Benefits:**
121
+
122
+ - 5-6x faster scoring operations
123
+ - 1.4-4.5x faster overall fit() performance
124
+ - Up to 17% memory reduction on large datasets
125
+ - Automatic fallback to pure NumPy if Numba not available
126
+
127
+ **System Requirements:**
128
+
129
+ Numba requires LLVM to be installed:
130
+
131
+ .. code:: sh
132
+
133
+ # macOS
134
+ brew install llvm
135
+
136
+ # Linux (Ubuntu/Debian)
137
+ sudo apt-get install llvm
138
+
139
+ **Verify Optimization:**
140
+
141
+ .. code:: python
142
+
143
+ from sliceline import is_numba_available
144
+
145
+ print("Numba available:", is_numba_available())
146
+
147
+ See the `performance benchmarks <https://github.com/DataDome/sliceline/tree/main/benchmarks>`__ for detailed metrics.
148
+
88
149
  🔗 Useful links
89
150
  ---------------
90
151
 
@@ -116,4 +177,3 @@ if you want to bring modifications to the code base.
116
177
  ----------
117
178
 
118
179
  Sliceline is free and open-source software licensed under the `3-clause BSD license <https://github.com/DataDome/sliceline/blob/main/LICENSE>`__.
119
-
@@ -0,0 +1,8 @@
1
+ sliceline/__init__.py,sha256=6BE45x-4OhgXqkPRBmvXsTbfnZN5YzAUx8Wu8RIfWjE,106
2
+ sliceline/_numba_ops.py,sha256=34IYOumWCfBMBpaNj5OxWcwJk2mFcnJTyKMmgTIjcIc,6455
3
+ sliceline/slicefinder.py,sha256=umNscHR24iVU7C3kLTIkySMMAteD562vGIdcxHe3qLY,32042
4
+ sliceline/validation.py,sha256=pydiTHlS6f1iBtlIqATLVHimhoyZDKTMrDjQIH2R9ks,30875
5
+ sliceline-0.3.0.dist-info/METADATA,sha256=_rAxmaiYiWTNgnZzFO-ijMJAl2vPd4z7B0XZLCn4oQU,5518
6
+ sliceline-0.3.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
7
+ sliceline-0.3.0.dist-info/licenses/LICENSE,sha256=AbeN2ySrCt8VUJukqcQIYutROwZh3W2u0UU1d7EnbZs,1531
8
+ sliceline-0.3.0.dist-info/RECORD,,
@@ -1,4 +1,4 @@
1
1
  Wheel-Version: 1.0
2
- Generator: poetry-core 2.1.1
2
+ Generator: hatchling 1.28.0
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
@@ -1,7 +0,0 @@
1
- sliceline/__init__.py,sha256=jEIUmQtv4W_eZuH63KQ8tAFoRZxyN3O8bRZ__FlMJr0,65
2
- sliceline/slicefinder.py,sha256=xuGsxGXtihkKNEokRmhycJ6aY-FPkkNjCsPQKTcPABg,26355
3
- sliceline/validation.py,sha256=-RkCpRdANNeaJyrdj7zFn4xs1X1xIXitKwRoL_B5EAk,30794
4
- sliceline-0.2.18.dist-info/LICENSE,sha256=AbeN2ySrCt8VUJukqcQIYutROwZh3W2u0UU1d7EnbZs,1531
5
- sliceline-0.2.18.dist-info/METADATA,sha256=JiKzXGFQfX7pOMhuhafNrtKxB57yTK7z91CySd3MD7I,3717
6
- sliceline-0.2.18.dist-info/WHEEL,sha256=XbeZDeTWKc1w7CSIyre5aMDU_-PohRwTQceYnisIYYY,88
7
- sliceline-0.2.18.dist-info/RECORD,,