sliceline 0.2.18__tar.gz → 0.3.0__tar.gz

This diff shows the content of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only.
Files changed (38)
  1. sliceline-0.3.0/.github/CODEOWNERS +3 -0
  2. sliceline-0.3.0/.github/workflows/push-pull.yml +38 -0
  3. sliceline-0.3.0/.github/workflows/release.yml +91 -0
  4. sliceline-0.3.0/.github/workflows/test-release.yml +56 -0
  5. sliceline-0.3.0/.gitignore +144 -0
  6. sliceline-0.3.0/.pre-commit-config.yaml +17 -0
  7. sliceline-0.3.0/.readthedocs.yml +22 -0
  8. sliceline-0.3.0/CLAUDE.md +221 -0
  9. sliceline-0.3.0/CODE_OF_CONDUCT.md +128 -0
  10. sliceline-0.3.0/CONTRIBUTING.md +100 -0
  11. sliceline-0.3.0/Makefile +25 -0
  12. sliceline-0.3.0/NUMBA_OPTIMIZATION.md +173 -0
  13. sliceline-0.2.18/README.rst → sliceline-0.3.0/PKG-INFO +81 -0
  14. sliceline-0.2.18/PKG-INFO → sliceline-0.3.0/README.rst +41 -21
  15. sliceline-0.3.0/benchmarks/benchmark_results.json +200 -0
  16. sliceline-0.3.0/benchmarks/benchmarks.py +641 -0
  17. sliceline-0.3.0/docs/make.bat +35 -0
  18. sliceline-0.3.0/docs/project_notes/python-3.13-readiness.md +133 -0
  19. sliceline-0.3.0/docs/source/Slicefinder.rst +4 -0
  20. sliceline-0.3.0/docs/source/conf.py +32 -0
  21. sliceline-0.3.0/docs/source/index.rst +5 -0
  22. sliceline-0.3.0/notebooks/1. Implementing Sliceline on Titanic dataset.ipynb +1781 -0
  23. sliceline-0.3.0/notebooks/2. Implementing Sliceline on California housing dataset.ipynb +1043 -0
  24. sliceline-0.3.0/pyproject.toml +93 -0
  25. sliceline-0.3.0/scripts/check_python_313_readiness.sh +58 -0
  26. sliceline-0.3.0/setup.cfg +6 -0
  27. sliceline-0.3.0/sliceline/__init__.py +3 -0
  28. sliceline-0.3.0/sliceline/_numba_ops.py +245 -0
  29. {sliceline-0.2.18 → sliceline-0.3.0}/sliceline/slicefinder.py +253 -91
  30. {sliceline-0.2.18 → sliceline-0.3.0}/sliceline/validation.py +5 -2
  31. sliceline-0.3.0/tests/__init__.py +0 -0
  32. sliceline-0.3.0/tests/conftest.py +849 -0
  33. sliceline-0.3.0/tests/experiment.py +68 -0
  34. sliceline-0.3.0/tests/test_performance.py +394 -0
  35. sliceline-0.3.0/tests/test_slicefinder.py +629 -0
  36. sliceline-0.2.18/pyproject.toml +0 -47
  37. sliceline-0.2.18/sliceline/__init__.py +0 -3
  38. {sliceline-0.2.18 → sliceline-0.3.0}/LICENSE +0 -0

+++ sliceline-0.3.0/.github/CODEOWNERS
@@ -0,0 +1,3 @@
+ # @adedaran and @kkontoudi-dd will be requested for
+ # review when someone opens a pull request.
+ * @adedaran @kkontoudi-dd @florent-pajot

+++ sliceline-0.3.0/.github/workflows/push-pull.yml
@@ -0,0 +1,38 @@
+ name: Test
+
+ on:
+   push:
+     branches: [ master ]
+   pull_request:
+
+ jobs:
+   test:
+     runs-on: ubuntu-latest
+     timeout-minutes: 45
+     strategy:
+       fail-fast: false
+       matrix:
+         python-version: ["3.10", "3.11", "3.12"]
+         numba: [true, false]
+     name: Python ${{ matrix.python-version }} (Numba=${{ matrix.numba }})
+     steps:
+       - uses: actions/checkout@v3
+       - name: Set up Python ${{ matrix.python-version }}
+         uses: actions/setup-python@v4
+         with:
+           python-version: ${{ matrix.python-version }}
+       - name: Install LLVM for Numba optimization
+         if: matrix.numba
+         run: |
+           sudo apt-get update
+           sudo apt-get install -y llvm-14
+       - name: Install dependencies
+         run: make init
+       - name: Verify Numba availability
+         run: |
+           uv run python -c "from sliceline import is_numba_available; print(f'Numba available: {is_numba_available()}')"
+       - name: Run test
+         run: make test
+       - name: Run notebooks (with Numba optimization)
+         if: matrix.numba
+         run: make execute-notebooks

+++ sliceline-0.3.0/.github/workflows/release.yml
@@ -0,0 +1,91 @@
+ name: Publish Python 🐍 distribution 📦 to PyPI
+
+ on: release
+
+ jobs:
+   build:
+     name: Build distribution 📦
+     runs-on: ubuntu-latest
+
+     steps:
+       - uses: actions/checkout@v4
+         with:
+           persist-credentials: false
+       - name: Set up Python
+         uses: actions/setup-python@v5
+         with:
+           python-version: "3.10"
+       - name: Install pypa/build
+         run: >-
+           python3 -m
+           pip install
+           build
+           --user
+       - name: Build a binary wheel and a source tarball
+         run: python3 -m build
+       - name: Store the distribution packages
+         uses: actions/upload-artifact@v4
+         with:
+           name: python-package-distributions
+           path: dist/
+
+   publish-to-pypi:
+     name: >-
+       Publish Python 🐍 distribution 📦 to PyPI
+     needs:
+       - build
+     runs-on: ubuntu-latest
+     environment:
+       name: pypi
+       url: https://pypi.org/p/sliceline
+     permissions:
+       id-token: write
+
+     steps:
+       - name: Download all the dists
+         uses: actions/download-artifact@v4
+         with:
+           name: python-package-distributions
+           path: dist/
+       - name: Publish distribution 📦 to PyPI
+         uses: pypa/gh-action-pypi-publish@release/v1
+
+   github-release:
+     name: >-
+       Sign the Python 🐍 distribution 📦 with Sigstore
+       and upload them to GitHub Release
+     needs:
+       - publish-to-pypi
+     runs-on: ubuntu-latest
+
+     permissions:
+       contents: write
+       id-token: write
+
+     steps:
+       - name: Download all the dists
+         uses: actions/download-artifact@v4
+         with:
+           name: python-package-distributions
+           path: dist/
+       - name: Sign the dists with Sigstore
+         uses: sigstore/gh-action-sigstore-python@v3.0.0
+         with:
+           inputs: >-
+             ./dist/*.tar.gz
+             ./dist/*.whl
+       - name: Create GitHub Release
+         env:
+           GITHUB_TOKEN: ${{ github.token }}
+         run: >-
+           gh release create
+           "$GITHUB_REF_NAME"
+           --repo "$GITHUB_REPOSITORY"
+           --notes ""
+       - name: Upload artifact signatures to GitHub Release
+         env:
+           GITHUB_TOKEN: ${{ github.token }}
+         run: >-
+           gh release upload
+           "$GITHUB_REF_NAME" dist/**
+           --repo "$GITHUB_REPOSITORY"

+++ sliceline-0.3.0/.github/workflows/test-release.yml
@@ -0,0 +1,56 @@
+ name: Publish Python 🐍 distribution 📦 to TestPyPI
+
+ on: push
+
+ jobs:
+   build:
+     name: Build distribution 📦
+     runs-on: ubuntu-latest
+
+     steps:
+       - uses: actions/checkout@v4
+         with:
+           persist-credentials: false
+       - name: Set up Python
+         uses: actions/setup-python@v5
+         with:
+           python-version: "3.10"
+       - name: Install pypa/build
+         run: >-
+           python3 -m
+           pip install
+           build
+           --user
+       - name: Build a binary wheel and a source tarball
+         run: python3 -m build
+       - name: Store the distribution packages
+         uses: actions/upload-artifact@v4
+         with:
+           name: python-package-distributions
+           path: dist/
+
+   publish-to-testpypi:
+     name: >-
+       Publish Python 🐍 distribution 📦 to TestPyPI
+     if: startsWith(github.ref, 'refs/tags/')  # only publish to TestPyPI on tag pushes
+     needs:
+       - build
+     runs-on: ubuntu-latest
+
+     environment:
+       name: testpypi
+       url: https://test.pypi.org/p/sliceline
+
+     permissions:
+       id-token: write
+
+     steps:
+       - name: Download all the dists
+         uses: actions/download-artifact@v4
+         with:
+           name: python-package-distributions
+           path: dist/
+       - name: Publish distribution 📦 to TestPyPI
+         uses: pypa/gh-action-pypi-publish@release/v1
+         with:
+           repository-url: https://test.pypi.org/legacy/

+++ sliceline-0.3.0/.gitignore
@@ -0,0 +1,144 @@
+ # Byte-compiled / optimized / DLL files
+ __pycache__/
+ *.py[cod]
+ *$py.class
+
+ # C extensions
+ *.so
+
+ # Distribution / packaging
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ pip-wheel-metadata/
+ share/python-wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+ MANIFEST
+
+ # PyInstaller
+ # Usually these files are written by a python script from a template
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
+ *.manifest
+ *.spec
+
+ # Installer logs
+ pip-log.txt
+ pip-delete-this-directory.txt
+
+ # Unit test / coverage reports
+ htmlcov/
+ .tox/
+ .nox/
+ .coverage
+ .coverage.*
+ .cache
+ nosetests.xml
+ coverage.xml
+ *.cover
+ *.py,cover
+ .hypothesis/
+ .pytest_cache/
+
+ # Translations
+ *.mo
+ *.pot
+
+ # Django stuff:
+ *.log
+ local_settings.py
+ db.sqlite3
+ db.sqlite3-journal
+
+ # Flask stuff:
+ instance/
+ .webassets-cache
+
+ # Scrapy stuff:
+ .scrapy
+
+ # Sphinx documentation
+ docs/_build/
+
+ # PyBuilder
+ target/
+
+ # Jupyter Notebook
+ .ipynb_checkpoints
+
+ # IPython
+ profile_default/
+ ipython_config.py
+
+ # IDE
+ .idea/
+ .DS_Store
+ .metals/
+ .vscode/
+
+ # pyenv
+ .python-version
+
+ # pipenv
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
+ # install all needed dependencies.
+ #Pipfile.lock
+
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow
+ __pypackages__/
+
+ # Celery stuff
+ celerybeat-schedule
+ celerybeat.pid
+
+ # SageMath parsed files
+ *.sage.py
+
+ # Environments
+ .env
+ .venv
+ env/
+ venv/
+ ENV/
+ env.bak/
+ venv.bak/
+
+ # Spyder project settings
+ .spyderproject
+ .spyproject
+
+ # Rope project settings
+ .ropeproject
+
+ # mkdocs documentation
+ /site
+
+ # mypy
+ .mypy_cache/
+ .dmypy.json
+ dmypy.json
+
+ # Pyre type checker
+ .pyre/
+
+ # uv
+ uv.lock
+
+ # Ruff
+ .ruff_cache/
+
+ # data file
+ *.csv

+++ sliceline-0.3.0/.pre-commit-config.yaml
@@ -0,0 +1,17 @@
+ files: sliceline
+ repos:
+   - repo: https://github.com/pre-commit/pre-commit-hooks
+     rev: v5.0.0
+     hooks:
+       - id: check-json
+       - id: check-yaml
+       - id: end-of-file-fixer
+       - id: trailing-whitespace
+       - id: mixed-line-ending
+
+   - repo: https://github.com/astral-sh/ruff-pre-commit
+     rev: v0.9.4
+     hooks:
+       - id: ruff
+         args: [--fix]
+       - id: ruff-format

+++ sliceline-0.3.0/.readthedocs.yml
@@ -0,0 +1,22 @@
+ # Read the Docs configuration file
+ # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
+
+ # Required
+ version: 2
+
+ build:
+   os: ubuntu-24.04
+   tools:
+     python: "3.10"
+
+ # Build documentation in the docs/ directory with Sphinx
+ sphinx:
+   configuration: docs/source/conf.py
+
+ # Optionally build your docs in additional formats such as PDF and ePub
+ formats: all
+
+ python:
+   install:
+     - method: pip
+       path: .

+++ sliceline-0.3.0/CLAUDE.md
@@ -0,0 +1,221 @@
+ # CLAUDE.md
+
+ This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.
+
+ ## Project Overview
+
+ Sliceline is a Python library for fast slice finding for Machine Learning model debugging. It implements the SliceLine algorithm from the paper "SliceLine: Fast, Linear-Algebra-based Slice Finding for ML Model Debugging" by Svetlana Sagadeeva and Matthias Boehm.
+
+ **Core Purpose**: Given an input dataset `X` and a model error vector `errors`, SliceLine identifies the top `k` slices (subspaces defined by predicates) where the ML model performs significantly worse.
+
+ ## Development Commands
+
+ ### Environment Setup
+ ```sh
+ make init                                # Install dependencies via uv
+ pre-commit install --hook-type pre-push  # Install pre-commit hooks
+ ```
+
+ ### Testing
+ ```sh
+ make test                                                    # Run unit tests with coverage (requires 80% coverage minimum)
+ uv run pytest                                                # Run tests without coverage report
+ uv run pytest tests/test_slicefinder.py::test_experiments    # Run specific test
+ uv run pytest -k "experiment_1"                              # Run tests matching pattern
+ ```
+
+ ### Code Quality
+ ```sh
+ uv run ruff check .           # Check code style
+ uv run ruff format . --check  # Check formatting
+ uv run ruff format .          # Apply formatting
+ ```
+
+ ### Documentation
+ ```sh
+ make doc                # Build Sphinx documentation locally
+ make notebook           # Start Jupyter notebook server
+ make execute-notebooks  # Execute all notebooks (run before releases)
+ ```
+
+ ### Benchmarking
+
+ The project includes two types of benchmarks:
+
+ **Standalone benchmark scripts** (in `benchmarks/`):
+ ```sh
+ # Run all benchmarks (cardinality + dataset size scaling)
+ python benchmarks/benchmarks.py
+
+ # Results are saved to:
+ # - benchmarks/benchmark_results.json (cardinality benchmark)
+ # - benchmarks/dataset_size_results.json (dataset size benchmark)
+ ```
+
+ **pytest-benchmark suite** (in `tests/test_performance.py`):
+ ```sh
+ # Run performance regression tests with benchmarks
+ uv run pytest tests/test_performance.py -v --benchmark-only
+
+ # Run with full output
+ uv run pytest tests/test_performance.py -v
+ ```
+
+ The standalone benchmarks are for profiling and manual performance analysis.
+ The pytest-benchmark suite is for regression testing to detect performance regressions.
+
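For orientation, a pytest-benchmark test is sketched below. The `benchmark` fixture usage is real pytest-benchmark API (it re-runs the callable and records timing statistics), but the test name, data shapes, and parameter values are illustrative, not copied from `tests/test_performance.py`.

```python
import numpy as np
from sliceline.slicefinder import Slicefinder

def test_fit_speed_small(benchmark):
    # benchmark is the pytest-benchmark fixture: it calls the target
    # repeatedly and records timing statistics for regression comparison.
    rng = np.random.default_rng(0)
    X = rng.integers(0, 5, size=(1_000, 5)).astype(str)  # 5 categorical features
    errors = rng.random(1_000)                           # per-sample model error
    sf = Slicefinder(alpha=0.9, k=3, max_l=2, min_sup=10)
    benchmark(sf.fit, X, errors)
```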
+ ## Architecture
+
+ ### Core Algorithm (sliceline/slicefinder.py)
+
+ The `Slicefinder` class is a scikit-learn compatible estimator implementing the SliceLine algorithm through sparse linear algebra operations.
+
+ **Key Algorithm Steps**:
+ 1. **One-hot encode input**: Convert categorical/numerical features to binary representation
+ 2. **Initialize 1-slices**: Create and score basic slices (single predicates)
+ 3. **Lattice enumeration**: Iteratively combine slices up to `max_l` levels, pruning based on size and error bounds
+ 4. **Top-k maintenance**: Track best slices throughout enumeration
+
+ **Critical Parameters**:
+ - `alpha` (0 < alpha <= 1): Balance between slice size and average error
+ - `k`: Number of top slices to return
+ - `max_l`: Maximum predicates per slice (controls combinatorial explosion)
+ - `min_sup`: Minimum support threshold (absolute or fraction)
+
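How `alpha` trades slice size against average error is easiest to see in the scoring function from the SliceLine paper. A minimal sketch of the published formula (not the library's vectorized internals):

```python
import numpy as np

def slice_score(slice_errors: np.ndarray, total_error: float, n: int, alpha: float) -> float:
    """SliceLine scoring formula from the paper (sketch only).

    alpha close to 1 rewards slices with high average error;
    alpha close to 0 rewards large slices.
    """
    size = slice_errors.size                # |S|, number of rows in the slice
    avg_slice_error = slice_errors.mean()   # average error inside the slice
    avg_error = total_error / n             # average error over all n rows
    return alpha * (avg_slice_error / avg_error - 1.0) \
        - (1.0 - alpha) * (n / size - 1.0)
```

A slice scores positively only when its average error exceeds the dataset average by enough to offset the size penalty.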
+ **Key Methods**:
+ - `fit(X, errors)`: Main entry point - searches for slices
+ - `transform(X)`: Returns binary masks indicating slice membership
+ - `get_slice(X, slice_index)`: Filters dataset to specific slice
+ - `_search_slices()`: Core algorithm implementation
+ - `_score()` / `_score_ub()`: Slice scoring and upper-bound pruning
+ - `_maintain_top_k()`: Efficiently tracks best slices
+
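A minimal end-to-end sketch tying the public methods together, on toy data (values and parameter choices are illustrative):

```python
import numpy as np
from sliceline.slicefinder import Slicefinder

# Toy data: two categorical features, one row with a large model error.
X = np.array([["a", "x"], ["a", "y"], ["b", "x"], ["b", "y"]])
errors = np.array([0.9, 0.1, 0.2, 0.1])

sf = Slicefinder(alpha=0.95, k=1, max_l=2, min_sup=1)
sf.fit(X, errors)

print(sf.top_slices_)       # predicate values of the best slice(s); None = no predicate
masks = sf.transform(X)     # binary mask per slice: which rows belong to it
worst = sf.get_slice(X, 0)  # rows of X falling in the top slice
```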
+ **Performance Optimizations (v0.3.0)**:
+ - Sparse matrix operations (scipy.sparse) throughout
+ - Direct CSR construction in `_dummify()` (2-3x faster than lil_matrix)
+ - Sparse-preserving join in `_join_compatible_slices()` (memory efficient)
+ - Upper-bound pruning to avoid evaluating unpromising candidates
+ - Missing parent detection to avoid invalid slice combinations
+ - Deduplication via ID-based hashing
+ - Deterministic ordering for reproducible results
+
+ ### Validation Module (sliceline/validation.py)
+
+ Custom validation overriding sklearn's `check_array` to **accept string/object dtype inputs** (line 554-555). This is essential because SliceLine works with categorical data that may be represented as strings. The module is derived from sklearn's validation utilities but modified specifically for this use case.
+
+ ### Numba Optimization Module (sliceline/_numba_ops.py)
+
+ Optional JIT-compiled operations for performance improvement. Contains Numba-accelerated versions of:
+ - `score_slices_numba()`: 5-6x faster slice scoring
+ - `score_ub_single_numba()` / `score_ub_batch_numba()`: Upper-bound scoring
+ - `compute_slice_ids_numba()`: ID computation for deduplication
+
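A common pattern for this kind of optional acceleration, sketched generically here rather than copied from `_numba_ops.py`, is a no-op `njit` fallback so the same function runs as plain Python/NumPy when Numba is missing:

```python
import numpy as np

try:
    from numba import njit
    NUMBA_AVAILABLE = True
except ImportError:
    NUMBA_AVAILABLE = False

    def njit(*args, **kwargs):
        """No-op stand-in: decorated functions run as plain Python."""
        if args and callable(args[0]):  # used as bare @njit
            return args[0]
        return lambda func: func        # used as @njit(...)

def is_numba_available() -> bool:
    # Sketch of the check the CI workflow above prints.
    return NUMBA_AVAILABLE

@njit(cache=True)
def total_error(errors: np.ndarray) -> float:
    # JIT-compiled when Numba is present, interpreted otherwise.
    total = 0.0
    for e in errors:
        total += e
    return total
```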
+ **Coverage exclusion**: This module is excluded from coverage requirements (similar to `validation.py`) because:
+ 1. It's completely optional (only loaded if Numba is installed)
+ 2. Functions are tested indirectly through main slicefinder tests
+ 3. Numba implementations are verified to produce numerically identical results to NumPy fallbacks
+ 4. Direct testing of JIT-compiled functions adds complexity with minimal value
+
+ ### Testing Structure (tests/)
+
+ - `test_slicefinder.py`: Comprehensive unit tests for all private and public methods
+ - `test_performance.py`: Performance benchmark suite using pytest-benchmark
+   - Dataset size scaling tests (1K to 50K samples)
+   - Feature count scaling tests (5 to 30 features)
+   - Lattice level scaling tests (max_l 2 to 5)
+   - Memory efficiency tests for sparse operations
+ - `conftest.py`: Pytest fixtures for test data (17 different experiments)
+ - `experiment.py`: Test case definitions
+ - Tests use `pytest-benchmark` for performance tracking
+ - Parametrized tests (`experiment_1` through `experiment_17`) validate algorithm correctness on various scenarios
+
+ ### Benchmarking (benchmarks/)
+
+ - `benchmarks.py`: Profiling script for performance testing
+   - Cardinality benchmark: Tests cardinality levels 10, 100, 500, 1000
+   - Dataset size benchmark: Tests scaling with 1K to 50K samples
+   - Measures time, memory, and improvement metrics
+   - Outputs `benchmark_results.json` and `dataset_size_results.json`
+   - Run with: `python benchmarks/benchmarks.py`
+
+ ## Development Guidelines
+
+ ### Code Style
+ - Line length: 79 characters (enforced by Black)
+ - Import sorting: Black profile (enforced by isort)
+ - Docstrings: Follow numpydoc convention for all public methods
+ - Type hints: Used where applicable (see slicefinder.py lines 6, 91-97)
+
+ ### Testing Requirements
+ - Unit tests must pass for all changes
+ - Coverage threshold: 80% minimum (configured in pyproject.toml)
+ - Coverage excludes: validation.py, _numba_ops.py, tests/, hidden files
+ - Benchmarking: Available via pytest-benchmark for performance-sensitive changes
+
+ ### Adding New Features
+ - Open a GitHub discussion before starting work
+ - Add docstrings following numpydoc format
+ - Update relevant documentation in docs/source/
+ - Add unit tests achieving 80%+ coverage
+ - Update release notes (when requested)
+
+ ### scikit-learn Compatibility
+ The `Slicefinder` class follows scikit-learn conventions:
+ - Inherits from `BaseEstimator` and `TransformerMixin`
+ - Implements `fit()`, `transform()`, `fit_transform()` pattern
+ - Uses `check_is_fitted()` for state validation
+ - Exposes `get_feature_names_out()` for pipeline integration
+ - Parameters set in `__init__` without validation (validated in `fit()`)
+
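Assuming the fitted `sf` from the earlier sketch, this is roughly how those hooks compose (illustrative; the exact label format returned by `get_feature_names_out()` is up to the library):

```python
import pandas as pd

masks = sf.transform(X)              # shape (n_samples, k): slice membership
names = sf.get_feature_names_out()   # one label per returned slice
report = pd.DataFrame(masks, columns=names)
print(report.mean())                 # fraction of rows falling in each slice
```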
+ ### Working with Sparse Matrices
+ - All internal representations use `scipy.sparse.csr_matrix`
+ - Avoid `.A` shorthand for `.toarray()` - not supported in all scipy versions (see comments at lines 383-386, 405-408, 517-519)
+ - Use explicit `.toarray()` calls when converting to dense
+ - Use `.nnz` for counting non-zero elements (faster than `.sum()`)
+ - Direct CSR construction preferred over lil_matrix for one-hot encoding
+
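As context for the direct-CSR point above: a one-hot encoder for a single integer-coded column can fill the CSR buffers directly, because each row holds exactly one non-zero. A sketch, not `_dummify()` itself:

```python
import numpy as np
from scipy.sparse import csr_matrix

def one_hot_csr(codes: np.ndarray, n_levels: int) -> csr_matrix:
    """One-hot encode an integer-coded column straight into CSR.

    Every row has exactly one non-zero, so data/indices/indptr can be
    written without an intermediate lil_matrix (sketch only; the
    library's _dummify() may differ in details).
    """
    n = codes.shape[0]
    data = np.ones(n, dtype=np.int8)            # one 1 per row
    indices = codes.astype(np.int32)            # column index = category code
    indptr = np.arange(n + 1, dtype=np.int32)   # row i spans [i, i+1)
    return csr_matrix((data, indices, indptr), shape=(n, n_levels))

# one_hot_csr(np.array([0, 2, 1, 2]), n_levels=3).toarray()
```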
+ ## Common Pitfalls
+
+ 1. **Sparse matrix compatibility**: Some scipy versions don't support `.A` attribute on certain sparse matrix types. Always use `.toarray()` explicitly.
+
+ 2. **String dtype handling**: The custom validation module allows string inputs, which sklearn's standard validation rejects. Don't replace with sklearn's built-in validation.
+
+ 3. **min_sup conversion**: When `min_sup` is a float (0 < min_sup < 1), it gets converted to an absolute count in `fit()` (line 160). This modifies the instance attribute.
+
+ 4. **Missing parents**: The `_get_pair_candidates()` method includes logic to handle cases where some parent slices were pruned (lines 578-583). This prevents invalid combinations.
+
+ 5. **Notebook execution**: Notebooks require specific execution with unlimited timeout (see Makefile line 22) due to potentially long-running experiments.
+
+ 6. **Deterministic ordering**: Results are sorted by score first, then lexicographically by slice representation. This ensures reproducible results across runs and Python versions.
+
+ 7. **Memory efficiency**: Use sparse matrices throughout. The `_join_compatible_slices()` method returns sparse format to avoid memory explosion with large numbers of slices.
+
+ ## Performance Considerations
+
+ ### When to Use Sliceline
+
+ Sliceline is designed for datasets where:
+ - You want to find subgroups where your ML model underperforms
+ - Features are categorical or can be binned (continuous values should be discretized)
+ - Dataset size is reasonable (10K-100K samples works well)
+
+ ### Performance Characteristics
+
+ Based on benchmarks:
+ - **Small datasets (1K samples)**: < 100ms
+ - **Medium datasets (10K samples)**: 100ms - 1s
+ - **Large datasets (50K+ samples)**: 1-10s depending on cardinality
+
+ ### Optimization Tips
+
+ 1. **Reduce cardinality**: Bin continuous features or use feature hashing for high-cardinality columns
+ 2. **Limit lattice depth**: Keep `max_l` small (2-3) for faster execution
+ 3. **Increase min_sup**: Higher support threshold prunes more aggressively
+ 4. **Use appropriate k**: Smaller `k` values enable better pruning
+
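Tip 1 in code, as a hypothetical helper (column selection and bin count are up to you):

```python
import pandas as pd

def bin_continuous(df: pd.DataFrame, columns: list[str], n_bins: int = 5) -> pd.DataFrame:
    """Quantile-bin continuous columns so every feature is categorical.

    Hypothetical helper: pick n_bins per column to keep cardinality,
    and therefore the search lattice, small.
    """
    out = df.copy()
    for col in columns:
        out[col] = pd.qcut(out[col], q=n_bins, duplicates="drop").astype(str)
    return out
```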
+ ### Future Optimizations (See NUMBA_OPTIMIZATION.md)
+
+ Planned Numba JIT compilation for:
+ - Scoring functions (`_score`, `_score_ub`)
+ - ID computation for deduplication
+ - Expected speedup: 5-50x on numeric-heavy operations