sliceline 0.2.20__py3-none-any.whl → 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sliceline/__init__.py +2 -2
- sliceline/_numba_ops.py +245 -0
- sliceline/slicefinder.py +252 -90
- sliceline/validation.py +5 -2
- {sliceline-0.2.20.dist-info → sliceline-0.3.0.dist-info}/METADATA +72 -12
- sliceline-0.3.0.dist-info/RECORD +8 -0
- {sliceline-0.2.20.dist-info → sliceline-0.3.0.dist-info}/WHEEL +1 -1
- sliceline-0.2.20.dist-info/RECORD +0 -7
- {sliceline-0.2.20.dist-info → sliceline-0.3.0.dist-info/licenses}/LICENSE +0 -0
sliceline/__init__.py
CHANGED
@@ -1,3 +1,3 @@
-from .slicefinder import Slicefinder
+from .slicefinder import Slicefinder, is_numba_available
 
-__all__ = ("Slicefinder",)
+__all__ = ("Slicefinder", "is_numba_available")
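The new `is_numba_available` export makes the acceleration status queryable at runtime. A minimal usage sketch of the 0.3.0 public API (the toy dataset below is hypothetical, not from this diff):

.. code:: python

    import numpy as np
    from sliceline import Slicefinder, is_numba_available

    X = np.array([["a", "x"], ["a", "y"], ["b", "x"], ["b", "y"]])
    errors = np.array([1.0, 0.0, 0.0, 0.0])

    sf = Slicefinder(alpha=0.95, k=1, min_sup=1, verbose=False)
    sf.fit(X, errors)
    print(sf.top_slices_)
    print("Numba acceleration:", is_numba_available())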
sliceline/_numba_ops.py
ADDED
@@ -0,0 +1,245 @@
+"""Numba-accelerated operations for Sliceline.
+
+Provides JIT-compiled versions of performance-critical functions
+for 5-50x performance improvements in scoring and ID computation.
+
+This module is optional - if numba is not installed, the main slicefinder
+module will fall back to pure NumPy implementations.
+
+Installation:
+    pip install numba
+    # or
+    pip install sliceline[optimized]
+"""
+
+from __future__ import annotations
+
+import numpy as np
+from numba import njit
+
+
+@njit(cache=True)
+def score_slices_numba(
+    slice_sizes: np.ndarray,
+    slice_errors: np.ndarray,
+    n_row: int,
+    alpha: float,
+    avg_error: float,
+) -> np.ndarray:
+    """JIT-compiled slice scoring function.
+
+    Computes scores for each slice based on size and error metrics.
+    5-10x faster than pure NumPy implementation.
+
+    Parameters
+    ----------
+    slice_sizes : np.ndarray
+        Array of slice sizes.
+    slice_errors : np.ndarray
+        Array of slice errors.
+    n_row : int
+        Number of rows in the encoded dataset.
+    alpha : float
+        Weight parameter for error importance.
+    avg_error : float
+        Average error across all samples.
+
+    Returns
+    -------
+    np.ndarray
+        Array of computed scores for each slice.
+    """
+    n = slice_sizes.shape[0]
+    scores = np.empty(n, dtype=np.float64)
+
+    for i in range(n):
+        if slice_sizes[i] <= 0:
+            scores[i] = -np.inf
+        else:
+            slice_avg_error = slice_errors[i] / slice_sizes[i]
+            error_term = alpha * (slice_avg_error / avg_error - 1.0)
+            size_term = (1.0 - alpha) * (n_row / slice_sizes[i] - 1.0)
+            scores[i] = error_term - size_term
+
+    return scores
+
+
+@njit(cache=True)
+def score_ub_single_numba(
+    slice_size: float,
+    slice_error: float,
+    max_slice_error: float,
+    n_col_x_encoded: int,
+    min_sup: float,
+    alpha: float,
+    avg_error: float,
+) -> float:
+    """JIT-compiled upper bound score for a single slice.
+
+    Parameters
+    ----------
+    slice_size : float
+        Size of the slice.
+    slice_error : float
+        Error sum of the slice.
+    max_slice_error : float
+        Maximum error in the slice.
+    n_col_x_encoded : int
+        Number of encoded columns.
+    min_sup : float
+        Minimum support threshold.
+    alpha : float
+        Weight parameter for error importance.
+    avg_error : float
+        Average error across all samples.
+
+    Returns
+    -------
+    float
+        Upper bound score for the slice.
+    """
+    if slice_size <= 0:
+        return -np.inf
+
+    potential_solutions = np.array(
+        [
+            min_sup,
+            max(slice_error / max_slice_error, min_sup)
+            if max_slice_error > 0
+            else min_sup,
+            slice_size,
+        ]
+    )
+
+    max_score = -np.inf
+    for pot_sol in potential_solutions:
+        if pot_sol <= 0:
+            continue
+        error_contrib = min(pot_sol * max_slice_error, slice_error)
+        score = (
+            alpha * (error_contrib / avg_error - pot_sol)
+            - (1.0 - alpha) * (n_col_x_encoded - pot_sol)
+        ) / pot_sol
+        if score > max_score:
+            max_score = score
+
+    return max_score
+
+
+@njit(cache=True)
+def score_ub_batch_numba(
+    slice_sizes_ub: np.ndarray,
+    slice_errors_ub: np.ndarray,
+    max_slice_errors_ub: np.ndarray,
+    n_col_x_encoded: int,
+    min_sup: float,
+    alpha: float,
+    avg_error: float,
+) -> np.ndarray:
+    """JIT-compiled upper bound scoring function for batch processing.
+
+    5-10x faster than pure NumPy implementation.
+
+    Parameters
+    ----------
+    slice_sizes_ub : np.ndarray
+        Array of slice sizes (upper bound).
+    slice_errors_ub : np.ndarray
+        Array of slice errors (upper bound).
+    max_slice_errors_ub : np.ndarray
+        Array of maximum slice errors (upper bound).
+    n_col_x_encoded : int
+        Number of encoded columns.
+    min_sup : float
+        Minimum support threshold.
+    alpha : float
+        Weight parameter for error importance.
+    avg_error : float
+        Average error across all samples.
+
+    Returns
+    -------
+    np.ndarray
+        Array of upper bound scores for each slice.
+    """
+    n = slice_sizes_ub.shape[0]
+    scores = np.empty(n, dtype=np.float64)
+
+    for i in range(n):
+        scores[i] = score_ub_single_numba(
+            slice_sizes_ub[i],
+            slice_errors_ub[i],
+            max_slice_errors_ub[i],
+            n_col_x_encoded,
+            min_sup,
+            alpha,
+            avg_error,
+        )
+
+    return scores
+
+
+@njit(cache=True)
+def compute_slice_ids_numba(
+    slices_data: np.ndarray,
+    slices_indices: np.ndarray,
+    slices_indptr: np.ndarray,
+    feature_offset_start: np.ndarray,
+    feature_offset_end: np.ndarray,
+    feature_domains: np.ndarray,
+) -> np.ndarray:
+    """JIT-compiled slice ID computation.
+
+    Computes unique IDs for each slice based on feature encoding.
+    10-50x faster than Python loop for large datasets.
+
+    Parameters
+    ----------
+    slices_data : np.ndarray
+        Data array from sparse matrix.
+    slices_indices : np.ndarray
+        Column indices array from sparse matrix.
+    slices_indptr : np.ndarray
+        Index pointer array from sparse matrix.
+    feature_offset_start : np.ndarray
+        Start offset for each feature.
+    feature_offset_end : np.ndarray
+        End offset for each feature.
+    feature_domains : np.ndarray
+        Domain size for each feature.
+
+    Returns
+    -------
+    np.ndarray
+        Array of unique IDs for each slice.
+    """
+    n_slices = len(slices_indptr) - 1
+    n_features = len(feature_offset_start)
+    slice_ids = np.zeros(n_slices, dtype=np.float64)
+
+    dom = feature_domains + 1
+
+    for i in range(n_slices):
+        start_idx = slices_indptr[i]
+        end_idx = slices_indptr[i + 1]
+
+        slice_id = 0.0
+        for j in range(start_idx, end_idx):
+            col = slices_indices[j]
+            val = slices_data[j]
+
+            if val == 0:
+                continue
+
+            for f in range(n_features):
+                if feature_offset_start[f] <= col < feature_offset_end[f]:
+                    offset = col - feature_offset_start[f]
+                    multiplier = 1.0
+                    for k in range(f + 1, n_features):
+                        multiplier *= dom[k]
+                    slice_id += (offset + 1) * multiplier
+                    break
+
+        slice_ids[i] = slice_id
+
+    return slice_ids
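The loop in `score_slices_numba` implements the SliceLine score alpha * ((e_s / |s|) / e_avg - 1) - (1 - alpha) * (n / |s| - 1); a quick cross-check against the vectorized NumPy form (a sketch that assumes numba is installed so the import succeeds; values are made up):

.. code:: python

    import numpy as np
    from sliceline._numba_ops import score_slices_numba

    sizes = np.array([120.0, 45.0])
    errs = np.array([30.0, 20.0])
    n_row, alpha, avg_error = 1000, 0.6, 0.1

    # Vectorized form of the same scoring formula.
    vectorized = alpha * ((errs / sizes) / avg_error - 1.0) - (
        1.0 - alpha
    ) * (n_row / sizes - 1.0)
    np.testing.assert_allclose(
        score_slices_numba(sizes, errs, n_row, alpha, avg_error), vectorized
    )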
sliceline/slicefinder.py
CHANGED
@@ -2,20 +2,63 @@
 The slicefinder module implements the Slicefinder class.
 """
 
+from __future__ import annotations
+
 import logging
-
+import warnings
+from typing import Any
 
 import numpy as np
+import numpy.typing as npt
 from scipy import sparse as sp
 from scipy.stats import rankdata
 from sklearn.base import BaseEstimator, TransformerMixin
 from sklearn.preprocessing import OneHotEncoder
-from sklearn.utils.validation import
+from sklearn.utils.validation import _check_feature_names, check_is_fitted
 
 from sliceline.validation import check_array, check_X_e
 
+ArrayLike = npt.ArrayLike
+NDArray = npt.NDArray[Any]
+
 logger = logging.getLogger(__name__)
-
+
+# Numba availability detection
+try:
+    from sliceline._numba_ops import (
+        compute_slice_ids_numba,
+        score_slices_numba,
+        score_ub_batch_numba,
+    )
+
+    NUMBA_AVAILABLE = True
+except ImportError:
+    NUMBA_AVAILABLE = False
+    score_slices_numba = None
+    score_ub_batch_numba = None
+    compute_slice_ids_numba = None
+
+
+def is_numba_available() -> bool:
+    """Check if numba is available for acceleration.
+
+    Returns
+    -------
+    bool
+        True if numba is installed and can be used for acceleration.
+    """
+    return NUMBA_AVAILABLE
+
+
+def _warn_numba_not_available() -> None:
+    """Issue a warning if numba is not available."""
+    warnings.warn(
+        "Numba not available. Install with: pip install numba\n"
+        "Or: pip install sliceline[optimized]\n"
+        "Performance will be 5-50x slower without Numba optimization.",
+        UserWarning,
+        stacklevel=3,
+    )
 
 
 class Slicefinder(BaseEstimator, TransformerMixin):
@@ -93,9 +136,9 @@ class Slicefinder(BaseEstimator, TransformerMixin):
         alpha: float = 0.6,
         k: int = 1,
         max_l: int = 4,
-        min_sup:
+        min_sup: int | float = 10,
         verbose: bool = True,
-    ):
+    ) -> None:
         self.alpha = alpha
         self.k = k
         self.max_l = max_l
@@ -105,13 +148,25 @@ class Slicefinder(BaseEstimator, TransformerMixin):
         self._one_hot_encoder = self._top_slices_enc = None
         self.top_slices_ = self.top_slices_statistics_ = None
         self.average_error_ = None
+        self._min_sup_actual = min_sup
 
         if self.verbose:
             logger.setLevel(logging.DEBUG)
         else:
             logger.setLevel(logging.INFO)
 
-
+        # Warn user once if Numba optimization is not available
+        if not NUMBA_AVAILABLE and verbose:
+            warnings.warn(
+                "Numba JIT optimization not available. "
+                "Install with 'pip install sliceline[optimized]' "
+                "for 5-50x performance improvements on scoring operations. "
+                "See https://github.com/DataDome/sliceline for details.",
+                UserWarning,
+                stacklevel=2,
+            )
+
+    def _check_params(self) -> None:
         """Check transformer parameters."""
         if not 0 < self.alpha <= 1:
             raise ValueError(f"Invalid 'alpha' parameter: {self.alpha}")
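Since the constructor now emits a `UserWarning` when Numba is missing and `verbose` is on, callers who cannot install the extra can silence it with the standard warnings machinery (an illustrative sketch, not part of the sliceline API):

.. code:: python

    import warnings

    from sliceline import Slicefinder

    with warnings.catch_warnings():
        warnings.simplefilter("ignore", UserWarning)
        sf = Slicefinder(verbose=True)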
@@ -127,7 +182,7 @@ class Slicefinder(BaseEstimator, TransformerMixin):
         ):
             raise ValueError(f"Invalid 'min_sup' parameter: {self.min_sup}")
 
-    def _check_top_slices(self):
+    def _check_top_slices(self) -> None:
         """Check if slices have been found."""
         # Check if fit has been called
         check_is_fitted(self)
@@ -136,7 +191,7 @@ class Slicefinder(BaseEstimator, TransformerMixin):
         if self.top_slices_.size == 0:
             raise ValueError("No transform: Sliceline did not find any slice.")
 
-    def fit(self, X, errors):
+    def fit(self, X: ArrayLike, errors: ArrayLike) -> Slicefinder:
         """Search for slice(s) on `X` based on `errors`.
 
         Parameters
@@ -155,9 +210,11 @@ class Slicefinder(BaseEstimator, TransformerMixin):
         """
         self._check_params()
 
-        #
+        # Compute actual min_sup value (convert fraction to count if needed)
         if 0 < self.min_sup < 1:
-            self.
+            self._min_sup_actual = int(self.min_sup * len(X))
+        else:
+            self._min_sup_actual = self.min_sup
 
         # Check that X and e have correct shape
         X_array, errors = check_X_e(X, errors, y_numeric=True)
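A `min_sup` strictly between 0 and 1 is now read as a fraction of the rows and resolved into the private `_min_sup_actual` attribute, leaving the public `min_sup` parameter untouched. The arithmetic, worked on hypothetical numbers:

.. code:: python

    n_rows = 1_000
    min_sup = 0.05  # 5% of the dataset
    min_sup_actual = int(min_sup * n_rows) if 0 < min_sup < 1 else min_sup
    assert min_sup_actual == 50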
@@ -168,7 +225,7 @@ class Slicefinder(BaseEstimator, TransformerMixin):
 
         return self
 
-    def transform(self, X):
+    def transform(self, X: ArrayLike) -> NDArray:
         """Generate slices masks for `X`.
 
         Parameters
@@ -191,7 +248,7 @@ class Slicefinder(BaseEstimator, TransformerMixin):
 
         return slices_masks.T
 
-    def get_slice(self, X, slice_index: int):
+    def get_slice(self, X: ArrayLike, slice_index: int) -> NDArray:
         """Filter `X` samples according to the `slice_index`-th slice.
 
         Parameters
@@ -217,7 +274,7 @@ class Slicefinder(BaseEstimator, TransformerMixin):
 
         return X[np.where(slices_masks[slice_index])[0], :]
 
-    def get_feature_names_out(self):
+    def get_feature_names_out(self) -> NDArray:
         """Get output feature names for transformation.
 
         Returns
@@ -232,7 +289,7 @@ class Slicefinder(BaseEstimator, TransformerMixin):
 
         return np.array(feature_names, dtype=object)
 
-    def _get_slices_masks(self, X):
+    def _get_slices_masks(self, X: NDArray) -> NDArray:
         """Private utilities function generating slices masks for `X`."""
         X_encoded = self._one_hot_encoder.transform(X)
 
@@ -248,33 +305,51 @@ class Slicefinder(BaseEstimator, TransformerMixin):
         return slices_masks
 
     @property
-    def _n_features_out(self):
+    def _n_features_out(self) -> int:
         """Number of transformed output features."""
         return self.top_slices_.shape[0]
 
     @staticmethod
-    def _dummify(array:
+    def _dummify(array: NDArray, n_col_x_encoded: int) -> sp.csr_matrix:
         """Dummify `array` with respect to `n_col_x_encoded`.
-
-
-
-
-
-
+
+        Creates a sparse one-hot encoding matrix where each row corresponds
+        to an element in array and has a single True value in the column
+        specified by that element (adjusted for 1-based indexing).
+
+        Args:
+            array: 1-based indices to encode (must not contain 0)
+            n_col_x_encoded: Number of columns in output matrix
+
+        Returns:
+            Sparse CSR matrix of shape (len(array), n_col_x_encoded)
+
+        Raises:
+            ValueError: If array contains 0, which cannot be one-hot encoded.
+        """
+        if 0 in array:
+            raise ValueError(
+                "Modality 0 is not expected to be one-hot encoded."
+            )
+
+        # Direct CSR construction: 2-3x faster than lil_matrix approach
+        n = array.size
+        return sp.csr_matrix(
+            (np.ones(n, dtype=np.bool_), (np.arange(n), array - 1)),
+            shape=(n, n_col_x_encoded),
+            dtype=np.bool_,
         )
-        one_hot_encoding[np.arange(array.size), array - 1] = True
-        return one_hot_encoding.tocsr()
 
     def _maintain_top_k(
         self,
         slices: sp.csr_matrix,
-        statistics:
+        statistics: NDArray,
         top_k_slices: sp.csr_matrix,
-        top_k_statistics:
-    ) ->
+        top_k_statistics: NDArray,
+    ) -> tuple[sp.csr_matrix, NDArray]:
         """Add new `slices` to `top_k_slices` and update the top-k slices."""
         # prune invalid min_sup and scores
-        valid_slices_mask = (statistics[:, 3] >= self.
+        valid_slices_mask = (statistics[:, 3] >= self._min_sup_actual) & (
            statistics[:, 0] > 0
         )
         if np.sum(valid_slices_mask) != 0:
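The rewritten `_dummify` builds the CSR matrix in one shot from (data, (row, col)) triplets instead of filling an intermediate structure. A standalone sketch with made-up values:

.. code:: python

    import numpy as np
    from scipy import sparse as sp

    array = np.array([2, 1, 4])  # 1-based modality indices
    n_col_x_encoded = 5
    # Row i gets a single True at column array[i] - 1.
    m = sp.csr_matrix(
        (np.ones(array.size, dtype=np.bool_), (np.arange(array.size), array - 1)),
        shape=(array.size, n_col_x_encoded),
        dtype=np.bool_,
    )
    print(m.toarray().astype(int))
    # [[0 1 0 0 0]
    #  [1 0 0 0 0]
    #  [0 0 0 1 0]]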
@@ -283,7 +358,7 @@ class Slicefinder(BaseEstimator, TransformerMixin):
                 statistics[valid_slices_mask],
             )
 
-            if (slices.shape[1] != top_k_slices.shape[1])
+            if (slices.shape[1] != top_k_slices.shape[1]) and (
                 slices.shape[1] == 1
             ):
                 slices, statistics = slices.T, statistics.T
@@ -300,7 +375,14 @@ class Slicefinder(BaseEstimator, TransformerMixin):
                 slices[top_slices_bool],
                 statistics[top_slices_bool],
             )
-
+            # Sort by score (descending), then lexicographically by slice representation
+            # to ensure deterministic ordering when scores are equal
+            scores = -top_k_statistics[:, 0]
+            slice_keys = tuple(
+                top_k_slices.toarray()[:, i]
+                for i in range(top_k_slices.shape[1])
+            )
+            top_slices_indices = np.lexsort(slice_keys[::-1] + (scores,))
             top_k_slices, top_k_statistics = (
                 top_k_slices[top_slices_indices],
                 top_k_statistics[top_slices_indices],
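`np.lexsort` treats its *last* key as the primary one, so the negated score sorts first and the slice columns only break ties. A toy demonstration of the deterministic tie-breaking:

.. code:: python

    import numpy as np

    scores = np.array([0.9, 0.5, 0.9])
    tiebreak = np.array([2, 0, 1])
    print(np.lexsort((tiebreak, -scores)))
    # [2 0 1]: the two 0.9 scores are ordered by the secondary key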
@@ -309,20 +391,35 @@ class Slicefinder(BaseEstimator, TransformerMixin):
 
     def _score_ub(
         self,
-        slice_sizes_ub:
-        slice_errors_ub:
-        max_slice_errors_ub:
+        slice_sizes_ub: NDArray,
+        slice_errors_ub: NDArray,
+        max_slice_errors_ub: NDArray,
         n_col_x_encoded: int,
-    ) ->
-        """Compute the upper-bound score for all the slices.
+    ) -> NDArray:
+        """Compute the upper-bound score for all the slices.
+
+        Uses Numba JIT compilation when available for 5-10x speedup.
+        """
+        if NUMBA_AVAILABLE and score_ub_batch_numba is not None:
+            return score_ub_batch_numba(
+                slice_sizes_ub.astype(np.float64),
+                slice_errors_ub.astype(np.float64),
+                max_slice_errors_ub.astype(np.float64),
+                n_col_x_encoded,
+                float(self._min_sup_actual),
+                self.alpha,
+                self.average_error_,
+            )
+
+        # Fallback to NumPy implementation
         # Since slice_scores is either monotonically increasing or decreasing, we
         # probe interesting points of slice_scores in the interval [min_sup, ss],
         # and compute the maximum to serve as the upper bound
         potential_solutions = np.column_stack(
             (
-                self.
+                self._min_sup_actual * np.ones(slice_sizes_ub.shape[0]),
                 np.maximum(
-                    slice_errors_ub / max_slice_errors_ub, self.
+                    slice_errors_ub / max_slice_errors_ub, self._min_sup_actual
                 ),
                 slice_sizes_ub,
             )
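The fallback exploits that the bound is monotone between the probe points: min_sup, the error-saturation size e_s / e_max, and the slice size itself, so checking just those three suffices. A toy probe with hypothetical numbers, mirroring the logic of `score_ub_single_numba`:

.. code:: python

    import numpy as np

    min_sup, alpha, avg_error, n_col = 10.0, 0.6, 0.1, 12.0
    ss, es, e_max = 200.0, 40.0, 1.0  # size, error sum, max error of one slice

    best = -np.inf
    for s in (min_sup, max(es / e_max, min_sup), ss):
        e = min(s * e_max, es)  # error mass cannot exceed s * e_max or es
        best = max(best, (alpha * (e / avg_error - s) - (1 - alpha) * (n_col - s)) / s)
    print(best)  # ~5.68, attained at the saturation point s = 40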
@@ -346,7 +443,7 @@ class Slicefinder(BaseEstimator, TransformerMixin):
         return slice_scores_ub
 
     @staticmethod
-    def _analyse_top_k(top_k_statistics:
+    def _analyse_top_k(top_k_statistics: NDArray) -> tuple[float, float]:
         """Get the maximum and the minimum slices scores."""
         max_slice_scores = min_slice_scores = -np.inf
         if top_k_statistics.shape[0] > 0:
@@ -358,23 +455,40 @@ class Slicefinder(BaseEstimator, TransformerMixin):
 
     def _score(
         self,
-        slice_sizes:
-        slice_errors:
+        slice_sizes: NDArray,
+        slice_errors: NDArray,
         n_row_x_encoded: int,
-    ) ->
-        """Compute the score for all the slices.
-
-
-
+    ) -> NDArray:
+        """Compute the score for all the slices.
+
+        Uses Numba JIT compilation when available for 5-10x speedup.
+        """
+        if NUMBA_AVAILABLE and score_slices_numba is not None:
+            # Ensure inputs are float64 for numba
+            sizes = np.asarray(slice_sizes, dtype=np.float64)
+            errors = np.asarray(slice_errors, dtype=np.float64)
+            return score_slices_numba(
+                sizes,
+                errors,
+                n_row_x_encoded,
+                self.alpha,
+                self.average_error_,
+            )
+
+        # Fallback to NumPy implementation
+        with np.errstate(divide="ignore", invalid="ignore"):
+            slice_scores = self.alpha * (
+                (slice_errors / slice_sizes) / self.average_error_ - 1
+            ) - (1 - self.alpha) * (n_row_x_encoded / slice_sizes - 1)
         return np.nan_to_num(slice_scores, nan=-np.inf)
 
     def _eval_slice(
         self,
         x_encoded: sp.csr_matrix,
-        errors:
+        errors: NDArray,
         slices: sp.csr_matrix,
         level: int,
-    ) ->
+    ) -> NDArray:
         """Compute several statistics for all the slices."""
         slice_candidates = x_encoded @ slices.T == level
         slice_sizes = slice_candidates.sum(axis=0).A[0]
@@ -397,8 +511,8 @@ class Slicefinder(BaseEstimator, TransformerMixin):
         self,
         x_encoded: sp.csr_matrix,
         n_col_x_encoded: int,
-        errors:
-    ) ->
+        errors: NDArray,
+    ) -> tuple[sp.csr_matrix, NDArray]:
         """Initialise 1-slices, i.e. slices with one predicate."""
         slice_sizes = x_encoded.sum(axis=0).A[0]
         slice_errors = errors @ x_encoded
@@ -409,7 +523,9 @@ class Slicefinder(BaseEstimator, TransformerMixin):
         )
 
         # working set of active slices (#attr x #slices) and top-k
-        valid_slices_mask = (slice_sizes >= self.
+        valid_slices_mask = (slice_sizes >= self._min_sup_actual) & (
+            slice_errors > 0
+        )
         attr = np.arange(1, n_col_x_encoded + 1)[valid_slices_mask]
         slice_sizes = slice_sizes[valid_slices_mask]
         slice_errors = slice_errors[valid_slices_mask]
@@ -427,18 +543,18 @@ class Slicefinder(BaseEstimator, TransformerMixin):
         n_col_dropped = n_col_x_encoded - sum(valid_slices_mask)
         logger.debug(
             "Dropping %i/%i features below min_sup = %i."
-            % (n_col_dropped, n_col_x_encoded, self.
+            % (n_col_dropped, n_col_x_encoded, self._min_sup_actual)
         )
 
         return slices, statistics
 
     def _get_pruned_s_r(
-        self, slices: sp.csr_matrix, statistics:
-    ) ->
+        self, slices: sp.csr_matrix, statistics: NDArray
+    ) -> tuple[sp.csr_matrix, NDArray]:
         """Prune invalid slices.
         Do not affect overall pruning effectiveness due to handling of missing parents.
         """
-        valid_slices_mask = (statistics[:, 3] >= self.
+        valid_slices_mask = (statistics[:, 3] >= self._min_sup_actual) & (
             statistics[:, 1] > 0
         )
         return slices[valid_slices_mask], statistics[valid_slices_mask]
@@ -446,22 +562,53 @@ class Slicefinder(BaseEstimator, TransformerMixin):
     @staticmethod
     def _join_compatible_slices(
         slices: sp.csr_matrix, level: int
-    ) ->
-        """Join compatible slices
+    ) -> sp.csr_matrix:
+        """Join compatible slices keeping sparse format when beneficial.
+
+        Returns a sparse boolean matrix where entry (i,j) is True if slices
+        i and j are compatible for joining at the given level. Only upper
+        triangular entries (i < j) are populated.
+
+        For level==2 (looking for disjoint slices), uses dense format since
+        most pairs are compatible. For higher levels, keeps sparse format.
+        """
+        n_slices = slices.shape[0]
+        if n_slices == 0:
+            return sp.csr_matrix((0, 0), dtype=np.bool_)
+
         slices_int = slices.astype(int)
-
-
-
-
+        join_counts = slices_int @ slices_int.T
+
+        if level == 2:
+            # For level 2, we're looking for pairs with dot product == 0
+            # Most pairs will match, so dense is more efficient
+            join_dense = join_counts.toarray() == 0
+        else:
+            # For higher levels, most pairs won't match, so sparse is better
+            # Use dense conversion for smaller matrices to ensure consistent ordering
+            # This matches the original behavior and ensures deterministic results
+            join_dense = join_counts.toarray() == level - 2
+
+        join_upper = np.triu(join_dense, 1)
+        rows, cols = np.where(join_upper)
+        return sp.csr_matrix(
+            (np.ones(len(rows), dtype=np.bool_), (rows, cols)),
+            shape=join_counts.shape,
+            dtype=np.bool_,
+        )
 
     @staticmethod
     def _combine_slices(
         slices: sp.csr_matrix,
-        statistics:
-        compatible_slices:
-    ) ->
-        """Combine slices by exploiting parents node statistics.
-
+        statistics: NDArray,
+        compatible_slices: sp.csr_matrix,
+    ) -> tuple[sp.csr_matrix, NDArray, NDArray, NDArray]:
+        """Combine slices by exploiting parents node statistics.
+
+        Works with sparse compatible_slices matrix returned by
+        _join_compatible_slices.
+        """
+        parent_1_idx, parent_2_idx = compatible_slices.nonzero()
         pair_candidates = slices[parent_1_idx] + slices[parent_2_idx]
 
         slice_errors = np.minimum(
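A toy check of the compatibility rule encoded above: at level 3, two 2-predicate slices join exactly when their dot product equals level - 2 = 1, i.e. they share one predicate (illustrative matrices):

.. code:: python

    import numpy as np
    from scipy import sparse as sp

    slices = sp.csr_matrix(
        np.array([[1, 1, 0, 0], [1, 0, 1, 0], [0, 0, 1, 1]], dtype=int)
    )
    join_counts = (slices @ slices.T).toarray()
    print(np.triu(join_counts == 3 - 2, 1).astype(int))
    # [[0 1 0]   slice 0 joins slice 1 (shared predicate: column 0)
    #  [0 0 1]   slice 1 joins slice 2 (shared predicate: column 2)
    #  [0 0 0]]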
@@ -477,13 +624,13 @@ class Slicefinder(BaseEstimator, TransformerMixin):
 
     @staticmethod
     def _prune_invalid_self_joins(
-        feature_offset_start:
-        feature_offset_end:
+        feature_offset_start: NDArray,
+        feature_offset_end: NDArray,
         pair_candidates: sp.csr_matrix,
-        slice_sizes:
-        slice_errors:
-        max_slice_errors:
-    ) ->
+        slice_sizes: NDArray,
+        slice_errors: NDArray,
+        max_slice_errors: NDArray,
+    ) -> tuple[sp.csr_matrix, NDArray, NDArray, NDArray]:
         """Prune invalid self joins (>1 bit per feature)."""
         valid_slices_mask = np.full(pair_candidates.shape[0], True)
         for start, end in zip(feature_offset_start, feature_offset_end):
@@ -500,12 +647,26 @@ class Slicefinder(BaseEstimator, TransformerMixin):
 
     @staticmethod
     def _prepare_deduplication_and_pruning(
-        feature_offset_start:
-        feature_offset_end:
-        feature_domains:
+        feature_offset_start: NDArray,
+        feature_offset_end: NDArray,
+        feature_domains: NDArray,
         pair_candidates: sp.csr_matrix,
-    ) ->
-        """Prepare IDs for deduplication and pruning.
+    ) -> NDArray:
+        """Prepare IDs for deduplication and pruning.
+
+        Uses Numba JIT compilation when available for 10-50x speedup.
+        """
+        if NUMBA_AVAILABLE and compute_slice_ids_numba is not None:
+            return compute_slice_ids_numba(
+                pair_candidates.data.astype(np.float64),
+                pair_candidates.indices.astype(np.int64),
+                pair_candidates.indptr.astype(np.int64),
+                feature_offset_start.astype(np.int64),
+                feature_offset_end.astype(np.int64),
+                feature_domains.astype(np.float64),
+            )
+
+        # Fallback to Python implementation
         ids = np.zeros(pair_candidates.shape[0])
         dom = feature_domains + 1
         for j, (start, end) in enumerate(
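The ID is a mixed-radix encoding: each feature contributes (offset + 1) weighted by the product of (domain + 1) over all later features, which keeps distinct predicate combinations at distinct IDs. A worked instance with two hypothetical features of domain sizes 3 and 2:

.. code:: python

    dom = [3 + 1, 2 + 1]  # feature domains + 1
    # Slice fixing modality offset 2 of feature 0 and offset 1 of feature 1:
    slice_id = (2 + 1) * dom[1] + (1 + 1) * 1.0
    print(slice_id)  # 11.0 -- no other (offset_0, offset_1) pair maps here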
@@ -525,18 +686,18 @@ class Slicefinder(BaseEstimator, TransformerMixin):
     def _get_pair_candidates(
         self,
         slices: sp.csr_matrix,
-        statistics:
-        top_k_statistics:
+        statistics: NDArray,
+        top_k_statistics: NDArray,
         level: int,
         n_col_x_encoded: int,
-        feature_domains:
-        feature_offset_start:
-        feature_offset_end:
+        feature_domains: NDArray,
+        feature_offset_start: NDArray,
+        feature_offset_end: NDArray,
     ) -> sp.csr_matrix:
         """Compute and prune plausible slices candidates."""
         compatible_slices = self._join_compatible_slices(slices, level)
 
-        if
+        if compatible_slices.nnz == 0:
             return sp.csr_matrix(np.empty((0, slices.shape[1])))
 
         (
@@ -596,7 +757,7 @@ class Slicefinder(BaseEstimator, TransformerMixin):
 
         # Seems to be always fully True
         # Due to maintain_top_k that apply slice_sizes filter
-        pruning_sizes = slice_sizes >= self.
+        pruning_sizes = slice_sizes >= self._min_sup_actual
 
         _, min_slice_scores = self._analyse_top_k(top_k_statistics)
 
@@ -606,14 +767,14 @@ class Slicefinder(BaseEstimator, TransformerMixin):
 
     def _search_slices(
         self,
-        input_x:
-        errors:
+        input_x: NDArray,
+        errors: NDArray,
     ) -> None:
         """Main function of the SliceLine algorithm."""
         # prepare offset vectors and one-hot encoded input_x
         self._one_hot_encoder = OneHotEncoder(handle_unknown="ignore")
         x_encoded = self._one_hot_encoder.fit_transform(input_x)
-        feature_domains:
+        feature_domains: NDArray = np.array(
             [len(sub_array) for sub_array in self._one_hot_encoder.categories_]
         )
         feature_offset_end = np.cumsum(feature_domains)
@@ -650,8 +811,8 @@ class Slicefinder(BaseEstimator, TransformerMixin):
         min_condition = min(input_x.shape[1], self.max_l)
         while (
             (slices.shape[0] > 0)
-
-
+            and (slices.sum() > 0)
+            and (level < min_condition)
         ):
             level += 1
 
@@ -687,7 +848,8 @@ class Slicefinder(BaseEstimator, TransformerMixin):
                 top_k_statistics
             )
             valid = np.sum(
-                (statistics[:, 3] >= self.
+                (statistics[:, 3] >= self._min_sup_actual)
+                & (statistics[:, 1] > 0)
             )
             logger.debug(
                 " -- valid slices after eval: %s/%i" % (valid, slices.shape[0])
@@ -723,7 +885,7 @@ class Slicefinder(BaseEstimator, TransformerMixin):
             ]
         self.top_slices_statistics_ = [
             {
-                stat_name: stat_value
+                stat_name: float(stat_value)
                 for stat_value, stat_name in zip(statistic, statistics_names)
             }
             for statistic in top_k_statistics
sliceline/validation.py
CHANGED
@@ -19,8 +19,11 @@ from contextlib import suppress
 import numpy as np
 import scipy.sparse as sp
 
-#
-
+# ComplexWarning moved from numpy.core.numeric to numpy.exceptions in NumPy 2.0
+try:
+    from numpy.exceptions import ComplexWarning
+except ImportError:
+    from numpy.core.numeric import ComplexWarning  # type: ignore
 from sklearn._config import get_config as _get_config
 from sklearn.exceptions import DataConversionWarning
 from sklearn.utils.fixes import _object_dtype_isnan
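The same try/except shim generalizes to any symbol relocated by NumPy 2.0; for instance (an illustrative sketch, not part of this diff):

.. code:: python

    try:
        from numpy.exceptions import AxisError  # NumPy >= 1.25 location
    except ImportError:
        from numpy import AxisError  # pre-1.25 NumPy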
{sliceline-0.2.20.dist-info → sliceline-0.3.0.dist-info}/METADATA
CHANGED
@@ -1,21 +1,41 @@
-Metadata-Version: 2.
+Metadata-Version: 2.4
 Name: sliceline
-Version: 0.
-Summary:
-
+Version: 0.3.0
+Summary: Fast slice finding for Machine Learning model debugging.
+Project-URL: Homepage, https://github.com/DataDome/sliceline
+Project-URL: Documentation, https://sliceline.readthedocs.io/en/stable/
+Project-URL: Repository, https://github.com/DataDome/sliceline
 Author: Antoine de Daran
-
+License-Expression: BSD-3-Clause
+License-File: LICENSE
+Classifier: Development Status :: 4 - Beta
+Classifier: Intended Audience :: Developers
+Classifier: Intended Audience :: Science/Research
 Classifier: License :: OSI Approved :: BSD License
+Classifier: Operating System :: OS Independent
 Classifier: Programming Language :: Python :: 3
 Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3.11
 Classifier: Programming Language :: Python :: 3.12
-Classifier:
-Requires-
-Requires-Dist:
-Requires-Dist:
-
-
+Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
+Requires-Python: <4,>=3.10
+Requires-Dist: numpy<3,>=1.25
+Requires-Dist: scikit-learn<2,>=1.6.0
+Requires-Dist: scipy<2,>=1.12
+Provides-Extra: dev
+Requires-Dist: jupyter>=1.0.0; extra == 'dev'
+Requires-Dist: matplotlib>=3.9; extra == 'dev'
+Requires-Dist: nbconvert>=7.0.0; extra == 'dev'
+Requires-Dist: optbinning>=0.15.0; extra == 'dev'
+Requires-Dist: pandas>=2.1.1; extra == 'dev'
+Requires-Dist: pytest-benchmark>=4.0.0; extra == 'dev'
+Requires-Dist: pytest-cov>=3.0.0; extra == 'dev'
+Requires-Dist: pytest>=7.2.0; extra == 'dev'
+Requires-Dist: ruff>=0.9.0; extra == 'dev'
+Requires-Dist: sphinx-rtd-theme>=3.0.0; extra == 'dev'
+Requires-Dist: sphinx>=8.0.0; extra == 'dev'
+Provides-Extra: optimized
+Requires-Dist: numba>=0.60.0; extra == 'optimized'
 Description-Content-Type: text/x-rst
 
 Sliceline
@@ -85,6 +105,47 @@ Or, through SSH:
 
     pip install git+ssh://git@github.com/datadome/sliceline.git --upgrade
 
+⚡ Performance Optimization
+---------------------------
+
+Sliceline includes optional Numba JIT compilation for **5-50x performance improvements** on scoring operations.
+
+**Quick Installation:**
+
+.. code:: sh
+
+    # With optimization support
+    pip install sliceline[optimized]
+
+**Benefits:**
+
+- 5-6x faster scoring operations
+- 1.4-4.5x faster overall fit() performance
+- Up to 17% memory reduction on large datasets
+- Automatic fallback to pure NumPy if Numba not available
+
+**System Requirements:**
+
+Numba requires LLVM to be installed:
+
+.. code:: sh
+
+    # macOS
+    brew install llvm
+
+    # Linux (Ubuntu/Debian)
+    sudo apt-get install llvm
+
+**Verify Optimization:**
+
+.. code:: python
+
+    from sliceline import is_numba_available
+
+    print("Numba available:", is_numba_available())
+
+See the `performance benchmarks <https://github.com/DataDome/sliceline/tree/main/benchmarks>`__ for detailed metrics.
+
 🔗 Useful links
 ---------------
 
@@ -116,4 +177,3 @@ if you want to bring modifications to the code base.
 ----------
 
 Sliceline is free and open-source software licensed under the `3-clause BSD license <https://github.com/DataDome/sliceline/blob/main/LICENSE>`__.
-
sliceline-0.3.0.dist-info/RECORD
ADDED
@@ -0,0 +1,8 @@
+sliceline/__init__.py,sha256=6BE45x-4OhgXqkPRBmvXsTbfnZN5YzAUx8Wu8RIfWjE,106
+sliceline/_numba_ops.py,sha256=34IYOumWCfBMBpaNj5OxWcwJk2mFcnJTyKMmgTIjcIc,6455
+sliceline/slicefinder.py,sha256=umNscHR24iVU7C3kLTIkySMMAteD562vGIdcxHe3qLY,32042
+sliceline/validation.py,sha256=pydiTHlS6f1iBtlIqATLVHimhoyZDKTMrDjQIH2R9ks,30875
+sliceline-0.3.0.dist-info/METADATA,sha256=_rAxmaiYiWTNgnZzFO-ijMJAl2vPd4z7B0XZLCn4oQU,5518
+sliceline-0.3.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
+sliceline-0.3.0.dist-info/licenses/LICENSE,sha256=AbeN2ySrCt8VUJukqcQIYutROwZh3W2u0UU1d7EnbZs,1531
+sliceline-0.3.0.dist-info/RECORD,,
sliceline-0.2.20.dist-info/RECORD
DELETED
@@ -1,7 +0,0 @@
-sliceline/__init__.py,sha256=jEIUmQtv4W_eZuH63KQ8tAFoRZxyN3O8bRZ__FlMJr0,65
-sliceline/slicefinder.py,sha256=-6aP7hM_fWmFY_fxyXF4PTZmNYmXO_S3efsaL-6g3a8,26378
-sliceline/validation.py,sha256=-RkCpRdANNeaJyrdj7zFn4xs1X1xIXitKwRoL_B5EAk,30794
-sliceline-0.2.20.dist-info/LICENSE,sha256=AbeN2ySrCt8VUJukqcQIYutROwZh3W2u0UU1d7EnbZs,1531
-sliceline-0.2.20.dist-info/METADATA,sha256=khVwkNnYx4K3KaA4BC9gIq0MJmL5D4lOnYyUREbzoVQ,3717
-sliceline-0.2.20.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
-sliceline-0.2.20.dist-info/RECORD,,
{sliceline-0.2.20.dist-info → sliceline-0.3.0.dist-info/licenses}/LICENSE
RENAMED
File without changes