PyPI - sliceline - Versions diffs - 0.2.11__tar.gz → 0.2.13__tar.gz - Mend

sliceline 0.2.11__tar.gz → 0.2.13__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (8) hide show

{sliceline-0.2.11 → sliceline-0.2.13}/PKG-INFO RENAMED Viewed

@@ -1,18 +1,25 @@
 Metadata-Version: 2.1
 Name: sliceline
-Version: 0.2.11
+Version: 0.2.13
 Summary: ✂️ Fast slice finding for Machine Learning model debugging.
 Home-page: https://github.com/DataDome/sliceline
 License: BSD-3-Clause
 Author: Antoine de Daran
-Requires-Python: >=3.7,<3.10
+Requires-Python: >=3.7,<3.12
 Classifier: License :: OSI Approved :: BSD License
 Classifier: Programming Language :: Python :: 3
 Classifier: Programming Language :: Python :: 3.7
 Classifier: Programming Language :: Python :: 3.8
 Classifier: Programming Language :: Python :: 3.9
-Requires-Dist: numpy (>=1.0.0,<2.0.0)
-Requires-Dist: scikit-learn (>=1.0.1,<2.0.0)
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Requires-Dist: numpy (>=1.21,<2.0) ; python_version < "3.9"
+Requires-Dist: numpy (>=1.25,<2.0) ; python_version >= "3.9"
+Requires-Dist: scikit-learn (>=1,<2) ; python_version < "3.8"
+Requires-Dist: scikit-learn (>=1.3,<2.0) ; python_version >= "3.8" and python_version < "3.9"
+Requires-Dist: scikit-learn (>=1.4,<2.0) ; python_version >= "3.9"
+Requires-Dist: scipy (>=1,<2) ; python_version < "3.9"
+Requires-Dist: scipy (>=1.12,<2.0) ; python_version >= "3.9"
 Project-URL: Documentation, https://sliceline.readthedocs.io/en/stable/
 Project-URL: Repository, https://github.com/DataDome/sliceline
 Description-Content-Type: text/x-rst

sliceline-0.2.13/pyproject.toml ADDED Viewed

@@ -0,0 +1,69 @@
+[build-system]
+requires = ["poetry_core>=1.0.0"]
+build-backend = "poetry.core.masonry.api"
+[tool.poetry]
+name = "sliceline"
+version = "0.2.13" # This version is a generic placeholder. It should not be changed.
+description = "✂️ Fast slice finding for Machine Learning model debugging."
+authors = ["Antoine de Daran"]
+readme = "README.rst"
+license = "BSD-3-Clause"
+repository = "https://github.com/DataDome/sliceline"
+documentation = "https://sliceline.readthedocs.io/en/stable/"
+[tool.poetry.dependencies]
+python = ">=3.7, <3.12"
+numpy = [
+    { version = "^1.21", python = "<3.9" },
+    { version = "^1.25", python = ">=3.9" },
+]
+scikit-learn = [
+    { version = "^1", python = "<3.8" },
+    { version = "^1.3", python = ">=3.8,<3.9" },
+    { version = "^1.4", python = ">=3.9" },
+]
+scipy = [
+    { version = "^1", python = "<3.9" },
+    { version = "^1.12", python = ">=3.9" },
+]
+[tool.poetry.dev-dependencies]
+black = [
+    { version = "^23", python = "<3.8" },
+    { version = "^24", python = ">=3.8" },
+]
+flake8 = [
+    { version = "^5", python = "<3.8.1" },
+    { version = "^6", python = ">=3.8.1" },
+]
+jupyter = "^1.0.0"
+matplotlib = [
+    { version = "^3", python = "<3.8" },
+    { version = "^3.7", python = ">=3.8,<3.9" },
+    { version = "^3.8", python = ">=3.9" },
+]
+nbconvert = "^6.5.0"
+optbinning = "^0.15.0"
+pandas = [
+    { version = "^1", python = "<3.9" },
+    { version = "^1.5", python = ">=3.9" },
+]
+pytest = "^6.2.5"
+pytest-benchmark = "^3.4.1"
+pytest-cov = "^3.0.0"
+Sphinx = "^4.0.0"
+sphinx-rtd-theme = "^1.0.0"
+[tool.black]
+line-length = 79
+include = '\.pyi?$'
+[tool.isort]
+profile = "black"
+[tool.coverage.run]
+omit = [".*", "*/site-packages/*", "tests/*", "*/validation.py"]
+[tool.coverage.report]
+fail_under = 80

{sliceline-0.2.11 → sliceline-0.2.13}/sliceline/__init__.py RENAMED Viewed

@@ -1,3 +1,3 @@
 from .slicefinder import Slicefinder
-__all__ = ("Slicefinder",)
+__all__ = ("Slicefinder",)

{sliceline-0.2.11 → sliceline-0.2.13}/sliceline/slicefinder.py RENAMED Viewed

@@ -24,8 +24,8 @@ class Slicefinder(BaseEstimator, TransformerMixin):
     Given an input dataset (`X`) and a model error vector (`errors`), SliceLine finds
     the `k` slices in `X` that identify where the model performs significantly worse.
-    A slice is a subspace of `X` defined by one or more predicates. The maximal dimension
-    of this subspace is controlled by `max_l`.
+    A slice is a subspace of `X` defined by one or more predicates.
+    The maximal dimension of this subspace is controlled by `max_l`.
     The slice scoring function is the linear combination of two objectives:
         - Find sufficiently large slices, with more than `min_sup` elements
@@ -55,7 +55,8 @@ class Slicefinder(BaseEstimator, TransformerMixin):
     min_sup: int or float, default=10
         Minimum support threshold.
         Inspired by frequent itemset mining, it ensures statistical significance.
-        If `min_sup` is a float (0 < `min_sup` < 1), it represents the faction of the input dataset (`X`)
+        If `min_sup` is a float (0 < `min_sup` < 1),
+            it represents the fraction of the input dataset (`X`).
     verbose: bool, default=True
         Controls the verbosity.
@@ -69,9 +70,19 @@ class Slicefinder(BaseEstimator, TransformerMixin):
     average_error_: float
         Mean value of the input error.
+    top_slices_statistics_: list of dict of length `len(top_slices_)`
+        The statistics of the slices found sorted by slice's scores.
+        For each slice, the following statistics are stored:
+            - slice_score: the score of the slice (defined in `_score` method)
+            - sum_slice_error: the sum of all the errors in the slice
+            - max_slice_error: the maximum of all errors in the slice
+            - slice_size: the number of elements in the slice
+            - slice_average_error: the average error in the slice (sum_slice_error / slice_size)
     References
     ----------
-    `SliceLine: Fast, Linear-Algebra-based Slice Finding for ML Model Debugging <https://mboehm7.github.io/resources/sigmod2021b_sliceline.pdf>`__,
+    `SliceLine: Fast, Linear-Algebra-based Slice Finding for ML Model Debugging
+    <https://mboehm7.github.io/resources/sigmod2021b_sliceline.pdf>`__,
     from *Svetlana Sagadeeva* and *Matthias Boehm* of Graz University of Technology.
     """
@@ -90,7 +101,8 @@ class Slicefinder(BaseEstimator, TransformerMixin):
         self.verbose = verbose
         self._one_hot_encoder = self._top_slices_enc = None
-        self.top_slices_ = self.average_error_ = None
+        self.top_slices_ = self.top_slices_statistics_ = None
+        self.average_error_ = None
         if self.verbose:
             logger.setLevel(logging.DEBUG)
@@ -108,9 +120,8 @@ class Slicefinder(BaseEstimator, TransformerMixin):
         if self.max_l <= 0:
             raise ValueError(f"Invalid 'max_l' parameter: {self.max_l}")
-        if (
-            self.min_sup < 0 or
-            (isinstance(self.min_sup, float) and self.min_sup >= 1)
+        if self.min_sup < 0 or (
+            isinstance(self.min_sup, float) and self.min_sup >= 1
         ):
             raise ValueError(f"Invalid 'min_sup' parameter: {self.min_sup}")
@@ -184,7 +195,7 @@ class Slicefinder(BaseEstimator, TransformerMixin):
         Parameters
         ----------
         X: array-like of shape (n_samples, n_features)
-            Training data, where `n_samples` is the number of samples
+            Dataset, where `n_samples` is the number of samples
             and `n_features` is the number of features.
         slice_index: int
@@ -198,7 +209,7 @@ class Slicefinder(BaseEstimator, TransformerMixin):
         self._check_top_slices()
         # Input validation
-        X = check_array(X)
+        X = check_array(X, force_all_finite=False)
         slices_masks = self._get_slices_masks(X)
@@ -228,9 +239,9 @@ class Slicefinder(BaseEstimator, TransformerMixin):
         slice_candidates = self._top_slices_enc @ X_encoded.T
         # self._top_slices_enc.sum(axis=1) is the number of predicate(s) for each top_slices_
-        slices_masks = (slice_candidates == self._top_slices_enc.sum(axis=1)).A.astype(
-            int
-        )
+        slices_masks = (
+            slice_candidates == self._top_slices_enc.sum(axis=1)
+        ).A.astype(int)
         return slices_masks
@@ -243,8 +254,12 @@ class Slicefinder(BaseEstimator, TransformerMixin):
     def _dummify(array: np.ndarray, n_col_x_encoded: int) -> sp.csr_matrix:
         """Dummify `array` with respect to `n_col_x_encoded`.
         Assumption: `array` does not contain any 0."""
-        assert 0 not in array, "Modality 0 is not expected to be one-hot encoded."
-        one_hot_encoding = sp.lil_matrix((array.size, n_col_x_encoded), dtype=bool)
+        assert (
+            0 not in array
+        ), "Modality 0 is not expected to be one-hot encoded."
+        one_hot_encoding = sp.lil_matrix(
+            (array.size, n_col_x_encoded), dtype=bool
+        )
         one_hot_encoding[np.arange(array.size), array - 1] = True
         return one_hot_encoding.tocsr()
@@ -257,14 +272,18 @@ class Slicefinder(BaseEstimator, TransformerMixin):
     ) -> Tuple[sp.csr_matrix, np.ndarray]:
         """Add new `slices` to `top_k_slices` and update the top-k slices."""
         # prune invalid min_sup and scores
-        valid_slices_mask = (statistics[:, 3] >= self.min_sup) & (statistics[:, 0] > 0)
+        valid_slices_mask = (statistics[:, 3] >= self.min_sup) & (
+            statistics[:, 0] > 0
+        )
         if np.sum(valid_slices_mask) != 0:
             slices, statistics = (
                 slices[valid_slices_mask],
                 statistics[valid_slices_mask],
             )
-            if (slices.shape[1] != top_k_slices.shape[1]) & (slices.shape[1] == 1):
+            if (slices.shape[1] != top_k_slices.shape[1]) & (
+                slices.shape[1] == 1
+            ):
                 slices, statistics = slices.T, statistics.T
             # evaluated candidates and previous top-k
@@ -272,7 +291,9 @@ class Slicefinder(BaseEstimator, TransformerMixin):
             statistics = np.concatenate([top_k_statistics, statistics])
             # extract top-k
-            top_slices_bool = rankdata(-statistics[:, 0], method="min") <= self.k
+            top_slices_bool = (
+                rankdata(-statistics[:, 0], method="min") <= self.k
+            )
             top_k_slices, top_k_statistics = (
                 slices[top_slices_bool],
                 statistics[top_slices_bool],
@@ -298,7 +319,9 @@ class Slicefinder(BaseEstimator, TransformerMixin):
         potential_solutions = np.column_stack(
             (
                 self.min_sup * np.ones(slice_sizes_ub.shape[0]),
-                np.maximum(slice_errors_ub / max_slice_errors_ub, self.min_sup),
+                np.maximum(
+                    slice_errors_ub / max_slice_errors_ub, self.min_sup
+                ),
                 slice_sizes_ub,
             )
         )
@@ -307,7 +330,8 @@ class Slicefinder(BaseEstimator, TransformerMixin):
                 self.alpha
                 * (
                     np.minimum(
-                        potential_solutions.T * max_slice_errors_ub, slice_errors_ub
+                        potential_solutions.T * max_slice_errors_ub,
+                        slice_errors_ub,
                     ).T
                     / self.average_error_
                     - potential_solutions
@@ -325,7 +349,9 @@ class Slicefinder(BaseEstimator, TransformerMixin):
         max_slice_scores = min_slice_scores = -np.inf
         if top_k_statistics.shape[0] > 0:
             max_slice_scores = top_k_statistics[0, 0]
-            min_slice_scores = top_k_statistics[top_k_statistics.shape[0] - 1, 0]
+            min_slice_scores = top_k_statistics[
+                top_k_statistics.shape[0] - 1, 0
+            ]
         return max_slice_scores, min_slice_scores
     def _score(
@@ -354,7 +380,9 @@ class Slicefinder(BaseEstimator, TransformerMixin):
         max_slice_errors = slice_candidates.T.multiply(errors).max(axis=1).A
         # score of relative error and relative size
-        slice_scores = self._score(slice_sizes, slice_errors, x_encoded.shape[0])
+        slice_scores = self._score(
+            slice_sizes, slice_errors, x_encoded.shape[0]
+        )
         return np.column_stack(
             [slice_scores, slice_errors, max_slice_errors, slice_sizes]
         )
@@ -379,7 +407,9 @@ class Slicefinder(BaseEstimator, TransformerMixin):
         slices = self._dummify(attr, n_col_x_encoded)
         # score 1-slices and create initial top-k
-        slice_scores = self._score(slice_sizes, slice_errors, x_encoded.shape[0])
+        slice_scores = self._score(
+            slice_sizes, slice_errors, x_encoded.shape[0]
+        )
         statistics = np.column_stack(
             (slice_scores, slice_errors, max_slice_errors, slice_sizes)
         )
@@ -397,11 +427,15 @@ class Slicefinder(BaseEstimator, TransformerMixin):
     ) -> Tuple[sp.csr_matrix, np.ndarray]:
         """Prune invalid slices.
         Does not affect overall pruning effectiveness due to handling of missing parents."""
-        valid_slices_mask = (statistics[:, 3] >= self.min_sup) & (statistics[:, 1] > 0)
+        valid_slices_mask = (statistics[:, 3] >= self.min_sup) & (
+            statistics[:, 1] > 0
+        )
         return slices[valid_slices_mask], statistics[valid_slices_mask]
     @staticmethod
-    def _join_compatible_slices(slices: sp.csr_matrix, level: int) -> np.ndarray:
+    def _join_compatible_slices(
+        slices: sp.csr_matrix, level: int
+    ) -> np.ndarray:
         """Join compatible slices according to `level`."""
         slices_int = slices.astype(int)
         join = (slices_int @ slices_int.T).A == level - 2
@@ -409,7 +443,9 @@ class Slicefinder(BaseEstimator, TransformerMixin):
     @staticmethod
     def _combine_slices(
-        slices: sp.csr_matrix, statistics: np.ndarray, compatible_slices: np.ndarray
+        slices: sp.csr_matrix,
+        statistics: np.ndarray,
+        compatible_slices: np.ndarray,
     ) -> Tuple[sp.csr_matrix, np.ndarray, np.ndarray, np.ndarray]:
         """Combine slices by exploiting parents node statistics."""
         parent_1_idx, parent_2_idx = np.where(compatible_slices == 1)
@@ -459,7 +495,9 @@ class Slicefinder(BaseEstimator, TransformerMixin):
         """Prepare IDs for deduplication and pruning."""
         ids = np.zeros(pair_candidates.shape[0])
         dom = feature_domains + 1
-        for j, (start, end) in enumerate(zip(feature_offset_start, feature_offset_end)):
+        for j, (start, end) in enumerate(
+            zip(feature_offset_start, feature_offset_end)
+        ):
             sub_pair_candidates = pair_candidates[:, start:end]
             # sub_p should not contain multiple True on the same line
             i = sub_pair_candidates.argmax(axis=1).T + np.any(
@@ -510,7 +548,10 @@ class Slicefinder(BaseEstimator, TransformerMixin):
             return sp.csr_matrix(np.empty((0, slices.shape[1])))
         ids = self._prepare_deduplication_and_pruning(
-            feature_offset_start, feature_offset_end, feature_domains, pair_candidates
+            feature_offset_start,
+            feature_offset_end,
+            feature_domains,
+            pair_candidates,
         )
         # remove duplicate candidates and select corresponding statistics
@@ -579,7 +620,9 @@ class Slicefinder(BaseEstimator, TransformerMixin):
             np.zeros((0, 4)),
         )
-        max_slice_scores, min_slice_scores = self._analyse_top_k(top_k_statistics)
+        max_slice_scores, min_slice_scores = self._analyse_top_k(
+            top_k_statistics
+        )
         logger.debug(
             "Initial top-K: count=%i, max=%f, min=%f"
             % (top_k_slices.shape[0], max_slice_scores, min_slice_scores)
@@ -589,7 +632,11 @@ class Slicefinder(BaseEstimator, TransformerMixin):
         # termination condition (max #feature levels)
         level = 1
         min_condition = min(input_x.shape[1], self.max_l)
-        while (slices.shape[0] > 0) & (slices.sum() > 0) & (level < min_condition):
+        while (
+            (slices.shape[0] > 0)
+            & (slices.sum() > 0)
+            & (level < min_condition)
+        ):
             level += 1
             # enumerate candidate join pairs, including size/error pruning
@@ -620,8 +667,12 @@ class Slicefinder(BaseEstimator, TransformerMixin):
                 slices, statistics, top_k_slices, top_k_statistics
             )
-            max_slice_scores, min_slice_scores = self._analyse_top_k(top_k_statistics)
-            valid = np.sum((statistics[:, 3] >= self.min_sup) & (statistics[:, 1] > 0))
+            max_slice_scores, min_slice_scores = self._analyse_top_k(
+                top_k_statistics
+            )
+            valid = np.sum(
+                (statistics[:, 3] >= self.min_sup) & (statistics[:, 1] > 0)
+            )
             logger.debug(
                 " -- valid slices after eval: %s/%i" % (valid, slices.shape[0])
             )
@@ -634,6 +685,32 @@ class Slicefinder(BaseEstimator, TransformerMixin):
         if top_k_slices.shape[0] == 0:
             self.top_slices_ = np.empty((0, input_x.shape[1]))
         else:
-            self.top_slices_ = self._one_hot_encoder.inverse_transform(top_k_slices)
+            self.top_slices_ = self._one_hot_encoder.inverse_transform(
+                top_k_slices
+            )
+        # compute slices' average errors
+        top_k_statistics = np.column_stack(
+            (
+                top_k_statistics,
+                np.divide(top_k_statistics[:, 1], top_k_statistics[:, 3]),
+            )
+        )
+        # transform statistics to a list of dict
+        statistics_names = [
+            "slice_score",
+            "sum_slice_error",
+            "max_slice_error",
+            "slice_size",
+            "slice_average_error",
+        ]
+        self.top_slices_statistics_ = [
+            {
+                stat_name: stat_value
+                for stat_value, stat_name in zip(statistic, statistics_names)
+            }
+            for statistic in top_k_statistics
+        ]
         logger.debug("Terminated at level %i." % level)

{sliceline-0.2.11 → sliceline-0.2.13}/sliceline/validation.py RENAMED Viewed

@@ -99,7 +99,8 @@ def _num_samples(x):
     if hasattr(x, "shape") and x.shape is not None:
         if len(x.shape) == 0:
             raise TypeError(
-                "Singleton array %r cannot be considered a valid collection." % x
+                "Singleton array %r cannot be considered a valid collection."
+                % x
             )
         # Check that shape is returning an integer or default to len
         # Dask dataframes may not return numeric shape[0] value
@@ -242,7 +243,8 @@ def _ensure_sparse_format(
     if force_all_finite:
         if not hasattr(spmatrix, "data"):
             warnings.warn(
-                "Can't check %s sparse matrix for nan or inf." % spmatrix.format,
+                "Can't check %s sparse matrix for nan or inf."
+                % spmatrix.format,
                 stacklevel=2,
             )
         else:
@@ -450,7 +452,10 @@ def check_array(
         with suppress(ImportError):
             from pandas.api.types import is_sparse
-            if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
+            if (
+                not hasattr(array, "sparse")
+                and array.dtypes.apply(is_sparse).any()
+            ):
                 warnings.warn(
                     "pandas.DataFrame with sparse columns found."
                     "It will be converted to a dense numpy array."
@@ -781,7 +786,9 @@ def check_X_e(
         input_name="X",
     )
-    y = _check_y(y, multi_output=multi_output, y_numeric=y_numeric, estimator=estimator)
+    y = _check_y(
+        y, multi_output=multi_output, y_numeric=y_numeric, estimator=estimator
+    )
     check_consistent_length(X, y)
@@ -847,4 +854,6 @@ def column_or_1d(y, *, warn=False):
             )
         return np.ravel(y)
-    raise ValueError(f"y should be a 1d array, got an array of shape {shape} instead.")
+    raise ValueError(
+        f"y should be a 1d array, got an array of shape {shape} instead."
+    )

sliceline-0.2.11/pyproject.toml DELETED Viewed

@@ -1,31 +0,0 @@
-[build-system]
-requires = ["poetry_core>=1.0.0"]
-build-backend = "poetry.core.masonry.api"
-[tool.poetry]
-name = "sliceline"
-version = "0.2.11" # This version is a generic placeholder. It should not be changed.
-description = "✂️ Fast slice finding for Machine Learning model debugging."
-authors = ["Antoine de Daran"]
-readme = "README.rst"
-license = "BSD-3-Clause"
-repository = "https://github.com/DataDome/sliceline"
-documentation = "https://sliceline.readthedocs.io/en/stable/"
-[tool.poetry.dependencies]
-python = ">=3.7, <3.10"
-numpy = "^1.0.0"
-scikit-learn = "^1.0.1"
-[tool.poetry.dev-dependencies]
-black = "^22.3.0"
-flake8 = "^3.0.0"
-jupyter = "^1.0.0"
-nbconvert = "^6.5.0"
-optbinning = "^0.15.0"
-pandas = "^1.1.0"
-pytest = "^6.2.5"
-pytest-benchmark = "^3.4.1"
-pytest-cov = "^3.0.0"
-Sphinx = "^4.0.0"
-sphinx-rtd-theme = "^1.0.0"

{sliceline-0.2.11 → sliceline-0.2.13}/LICENSE RENAMED Viewed

File without changes

{sliceline-0.2.11 → sliceline-0.2.13}/README.rst RENAMED Viewed

File without changes

sliceline 0.2.11__tar.gz → 0.2.13__tar.gz

sliceline 0.2.11__tar.gz → 0.2.13__tar.gz