sliceline 0.2.20.tar.gz → 0.3.0.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sliceline-0.3.0/.github/CODEOWNERS +3 -0
- sliceline-0.3.0/.github/workflows/push-pull.yml +38 -0
- sliceline-0.3.0/.github/workflows/release.yml +91 -0
- sliceline-0.3.0/.github/workflows/test-release.yml +56 -0
- sliceline-0.3.0/.gitignore +144 -0
- sliceline-0.3.0/.pre-commit-config.yaml +17 -0
- sliceline-0.3.0/.readthedocs.yml +22 -0
- sliceline-0.3.0/CLAUDE.md +221 -0
- sliceline-0.3.0/CODE_OF_CONDUCT.md +128 -0
- sliceline-0.3.0/CONTRIBUTING.md +100 -0
- sliceline-0.3.0/Makefile +25 -0
- sliceline-0.3.0/NUMBA_OPTIMIZATION.md +173 -0
- sliceline-0.2.20/README.rst → sliceline-0.3.0/PKG-INFO +81 -0
- sliceline-0.2.20/PKG-INFO → sliceline-0.3.0/README.rst +41 -21
- sliceline-0.3.0/benchmarks/benchmark_results.json +200 -0
- sliceline-0.3.0/benchmarks/benchmarks.py +641 -0
- sliceline-0.3.0/docs/make.bat +35 -0
- sliceline-0.3.0/docs/project_notes/python-3.13-readiness.md +133 -0
- sliceline-0.3.0/docs/source/Slicefinder.rst +4 -0
- sliceline-0.3.0/docs/source/conf.py +32 -0
- sliceline-0.3.0/docs/source/index.rst +5 -0
- sliceline-0.3.0/notebooks/1. Implementing Sliceline on Titanic dataset.ipynb +1781 -0
- sliceline-0.3.0/notebooks/2. Implementing Sliceline on California housing dataset.ipynb +1043 -0
- sliceline-0.3.0/pyproject.toml +93 -0
- sliceline-0.3.0/scripts/check_python_313_readiness.sh +58 -0
- sliceline-0.3.0/setup.cfg +6 -0
- sliceline-0.3.0/sliceline/__init__.py +3 -0
- sliceline-0.3.0/sliceline/_numba_ops.py +245 -0
- {sliceline-0.2.20 → sliceline-0.3.0}/sliceline/slicefinder.py +252 -90
- {sliceline-0.2.20 → sliceline-0.3.0}/sliceline/validation.py +5 -2
- sliceline-0.3.0/tests/__init__.py +0 -0
- sliceline-0.3.0/tests/conftest.py +849 -0
- sliceline-0.3.0/tests/experiment.py +68 -0
- sliceline-0.3.0/tests/test_performance.py +394 -0
- sliceline-0.3.0/tests/test_slicefinder.py +629 -0
- sliceline-0.2.20/pyproject.toml +0 -47
- sliceline-0.2.20/sliceline/__init__.py +0 -3
- {sliceline-0.2.20 → sliceline-0.3.0}/LICENSE +0 -0
sliceline-0.3.0/.github/workflows/push-pull.yml
@@ -0,0 +1,38 @@
+name: Test
+
+on:
+  push:
+    branches: [ master ]
+  pull_request:
+
+jobs:
+  test:
+    runs-on: ubuntu-latest
+    timeout-minutes: 45
+    strategy:
+      fail-fast: false
+      matrix:
+        python-version: ["3.10", "3.11", "3.12"]
+        numba: [true, false]
+    name: Python ${{ matrix.python-version }} (Numba=${{ matrix.numba }})
+    steps:
+      - uses: actions/checkout@v3
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v4
+        with:
+          python-version: ${{ matrix.python-version }}
+      - name: Install LLVM for Numba optimization
+        if: matrix.numba
+        run: |
+          sudo apt-get update
+          sudo apt-get install -y llvm-14
+      - name: Install dependencies
+        run: make init
+      - name: Verify Numba availability
+        run: |
+          uv run python -c "from sliceline import is_numba_available; print(f'Numba available: {is_numba_available()}')"
+      - name: Run test
+        run: make test
+      - name: Run notebooks (with Numba optimization)
+        if: matrix.numba
+        run: make execute-notebooks
sliceline-0.3.0/.github/workflows/release.yml
@@ -0,0 +1,91 @@
+name: Publish Python 🐍 distribution 📦 to PyPI
+
+on: release
+
+jobs:
+  build:
+    name: Build distribution 📦
+    runs-on: ubuntu-latest
+
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          persist-credentials: false
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.10"
+      - name: Install pypa/build
+        run: >-
+          python3 -m
+          pip install
+          build
+          --user
+      - name: Build a binary wheel and a source tarball
+        run: python3 -m build
+      - name: Store the distribution packages
+        uses: actions/upload-artifact@v4
+        with:
+          name: python-package-distributions
+          path: dist/
+
+  publish-to-pypi:
+    name: >-
+      Publish Python 🐍 distribution 📦 to PyPI
+    needs:
+      - build
+    runs-on: ubuntu-latest
+    environment:
+      name: pypi
+      url: https://pypi.org/p/sliceline
+    permissions:
+      id-token: write
+
+    steps:
+      - name: Download all the dists
+        uses: actions/download-artifact@v4
+        with:
+          name: python-package-distributions
+          path: dist/
+      - name: Publish distribution 📦 to PyPI
+        uses: pypa/gh-action-pypi-publish@release/v1
+
+  github-release:
+    name: >-
+      Sign the Python 🐍 distribution 📦 with Sigstore
+      and upload them to GitHub Release
+    needs:
+      - publish-to-pypi
+    runs-on: ubuntu-latest
+
+    permissions:
+      contents: write
+      id-token: write
+
+    steps:
+      - name: Download all the dists
+        uses: actions/download-artifact@v4
+        with:
+          name: python-package-distributions
+          path: dist/
+      - name: Sign the dists with Sigstore
+        uses: sigstore/gh-action-sigstore-python@v3.0.0
+        with:
+          inputs: >-
+            ./dist/*.tar.gz
+            ./dist/*.whl
+      - name: Create GitHub Release
+        env:
+          GITHUB_TOKEN: ${{ github.token }}
+        run: >-
+          gh release create
+          "$GITHUB_REF_NAME"
+          --repo "$GITHUB_REPOSITORY"
+          --notes ""
+      - name: Upload artifact signatures to GitHub Release
+        env:
+          GITHUB_TOKEN: ${{ github.token }}
+        run: >-
+          gh release upload
+          "$GITHUB_REF_NAME" dist/**
+          --repo "$GITHUB_REPOSITORY"
sliceline-0.3.0/.github/workflows/test-release.yml
@@ -0,0 +1,56 @@
+name: Publish Python 🐍 distribution 📦 to TestPyPI
+
+on: push
+
+jobs:
+  build:
+    name: Build distribution 📦
+    runs-on: ubuntu-latest
+
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          persist-credentials: false
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.10"
+      - name: Install pypa/build
+        run: >-
+          python3 -m
+          pip install
+          build
+          --user
+      - name: Build a binary wheel and a source tarball
+        run: python3 -m build
+      - name: Store the distribution packages
+        uses: actions/upload-artifact@v4
+        with:
+          name: python-package-distributions
+          path: dist/
+
+  publish-to-testpypi:
+    name: >-
+      Publish Python 🐍 distribution 📦 to TestPyPI
+    if: startsWith(github.ref, 'refs/tags/')  # only publish to TestPyPI on tag pushes
+    needs:
+      - build
+    runs-on: ubuntu-latest
+
+    environment:
+      name: testpypi
+      url: https://test.pypi.org/p/sliceline
+
+    permissions:
+      id-token: write
+
+    steps:
+      - name: Download all the dists
+        uses: actions/download-artifact@v4
+        with:
+          name: python-package-distributions
+          path: dist/
+      - name: Publish distribution 📦 to TestPyPI
+        uses: pypa/gh-action-pypi-publish@release/v1
+        with:
+          repository-url: https://test.pypi.org/legacy/
sliceline-0.3.0/.gitignore
@@ -0,0 +1,144 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+pip-wheel-metadata/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# IDE
+.idea/
+.DS_Store
+.metals/
+.vscode/
+
+# pyenv
+.python-version
+
+# pipenv
+# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+# However, in case of collaboration, if having platform-specific dependencies or dependencies
+# having no cross-platform support, pipenv may install dependencies that don't work, or not
+# install all needed dependencies.
+#Pipfile.lock
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# uv
+uv.lock
+
+# Ruff
+.ruff_cache/
+
+# data file
+*.csv
sliceline-0.3.0/.pre-commit-config.yaml
@@ -0,0 +1,17 @@
+files: sliceline
+repos:
+  - repo: https://github.com/pre-commit/pre-commit-hooks
+    rev: v5.0.0
+    hooks:
+      - id: check-json
+      - id: check-yaml
+      - id: end-of-file-fixer
+      - id: trailing-whitespace
+      - id: mixed-line-ending
+
+  - repo: https://github.com/astral-sh/ruff-pre-commit
+    rev: v0.9.4
+    hooks:
+      - id: ruff
+        args: [--fix]
+      - id: ruff-format
sliceline-0.3.0/.readthedocs.yml
@@ -0,0 +1,22 @@
+# Read the Docs configuration file
+# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
+
+# Required
+version: 2
+
+build:
+  os: ubuntu-24.04
+  tools:
+    python: "3.10"
+
+# Build documentation in the docs/ directory with Sphinx
+sphinx:
+  configuration: docs/source/conf.py
+
+# Optionally build your docs in additional formats such as PDF and ePub
+formats: all
+
+python:
+  install:
+    - method: pip
+      path: .
sliceline-0.3.0/CLAUDE.md
@@ -0,0 +1,221 @@
+# CLAUDE.md
+
+This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.
+
+## Project Overview
+
+Sliceline is a Python library for fast slice finding for Machine Learning model debugging. It implements the SliceLine algorithm from the paper "SliceLine: Fast, Linear-Algebra-based Slice Finding for ML Model Debugging" by Svetlana Sagadeeva and Matthias Boehm.
+
+**Core Purpose**: Given an input dataset `X` and a model error vector `errors`, SliceLine identifies the top `k` slices (subspaces defined by predicates) where the ML model performs significantly worse.
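In code, that contract looks like the following minimal sketch (synthetic data; the constructor arguments use the parameters documented under "Critical Parameters" below):

```python
import numpy as np
from sliceline.slicefinder import Slicefinder

# Toy data: rows where the first feature equals "a" have higher error.
rng = np.random.default_rng(0)
X = rng.choice(["a", "b", "c"], size=(1000, 3))
errors = np.where(X[:, 0] == "a", 1.0, 0.1)

sf = Slicefinder(alpha=0.95, k=1, max_l=2, min_sup=10)
sf.fit(X, errors)
print(sf.top_slices_)  # predicates defining the worst-performing slice
```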
+
+## Development Commands
+
+### Environment Setup
+```sh
+make init  # Install dependencies via uv
+pre-commit install --hook-type pre-push  # Install pre-commit hooks
+```
+
+### Testing
+```sh
+make test  # Run unit tests with coverage (requires 80% coverage minimum)
+uv run pytest  # Run tests without coverage report
+uv run pytest tests/test_slicefinder.py::test_experiments  # Run a specific test
+uv run pytest -k "experiment_1"  # Run tests matching a pattern
+```
+
+### Code Quality
+```sh
+uv run ruff check .  # Check code style
+uv run ruff format . --check  # Check formatting
+uv run ruff format .  # Apply formatting
+```
+
+### Documentation
+```sh
+make doc  # Build Sphinx documentation locally
+make notebook  # Start Jupyter notebook server
+make execute-notebooks  # Execute all notebooks (run before releases)
+```
+
+### Benchmarking
+
+The project includes two types of benchmarks:
+
+**Standalone benchmark scripts** (in `benchmarks/`):
+```sh
+# Run all benchmarks (cardinality + dataset size scaling)
+python benchmarks/benchmarks.py
+
+# Results are saved to:
+# - benchmarks/benchmark_results.json (cardinality benchmark)
+# - benchmarks/dataset_size_results.json (dataset size benchmark)
+```
+
+**pytest-benchmark suite** (in `tests/test_performance.py`):
+```sh
+# Run performance regression tests with benchmarks
+uv run pytest tests/test_performance.py -v --benchmark-only
+
+# Run with full output
+uv run pytest tests/test_performance.py -v
+```
+
+The standalone benchmarks are for profiling and manual performance analysis.
+The pytest-benchmark suite is for detecting performance regressions.
+
+## Architecture
+
+### Core Algorithm (sliceline/slicefinder.py)
+
+The `Slicefinder` class is a scikit-learn compatible estimator implementing the SliceLine algorithm through sparse linear algebra operations.
+
+**Key Algorithm Steps**:
+1. **One-hot encode input**: Convert categorical/numerical features to a binary representation
+2. **Initialize 1-slices**: Create and score basic slices (single predicates)
+3. **Lattice enumeration**: Iteratively combine slices up to `max_l` levels, pruning based on size and error bounds
+4. **Top-k maintenance**: Track the best slices throughout enumeration
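Step 1 reduces to building a sparse indicator matrix. A rough sketch of direct CSR construction for a single column (illustrative only, not the library's actual `_dummify()` code):

```python
# One-hot encode a categorical column directly into CSR form.
import numpy as np
from scipy.sparse import csr_matrix

col = np.array(["a", "b", "a", "c"])
levels, codes = np.unique(col, return_inverse=True)

data = np.ones(len(col), dtype=np.int8)  # one 1 per row
indptr = np.arange(len(col) + 1)         # exactly one entry per row
X_bin = csr_matrix((data, codes, indptr), shape=(len(col), len(levels)))
print(X_bin.toarray())  # 4x3 indicator matrix
```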
+
+**Critical Parameters**:
+- `alpha` (0 < alpha <= 1): Balance between slice size and average error
+- `k`: Number of top slices to return
+- `max_l`: Maximum predicates per slice (controls combinatorial explosion)
+- `min_sup`: Minimum support threshold (absolute or fraction)
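How `alpha` trades slice error against slice size is easiest to see in the scoring function from the SliceLine paper; a minimal sketch, assuming non-negative errors:

```python
import numpy as np

def score(slice_errors: np.ndarray, all_errors: np.ndarray, alpha: float) -> float:
    """Paper's slice score: relative error gain minus relative size penalty."""
    n, size = len(all_errors), len(slice_errors)
    avg, avg_s = all_errors.mean(), slice_errors.mean()
    return alpha * (avg_s / avg - 1) - (1 - alpha) * (n / size - 1)

errs = np.array([1.0, 1.0, 0.1, 0.1, 0.1, 0.1])
print(score(errs[:2], errs, alpha=0.95))  # > 0: slice is worse than average
```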
+
+**Key Methods**:
+- `fit(X, errors)`: Main entry point - searches for slices
+- `transform(X)`: Returns binary masks indicating slice membership
+- `get_slice(X, slice_index)`: Filters the dataset to a specific slice
+- `_search_slices()`: Core algorithm implementation
+- `_score()` / `_score_ub()`: Slice scoring and upper-bound pruning
+- `_maintain_top_k()`: Efficiently tracks the best slices
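A short sketch of the public methods in sequence (synthetic data; the exact shape of `transform`'s output is assumed from its description above as binary membership masks):

```python
import numpy as np
from sliceline.slicefinder import Slicefinder

X = np.random.default_rng(0).choice(["a", "b", "c"], size=(1000, 3))
errors = np.where(X[:, 0] == "a", 1.0, 0.1)

sf = Slicefinder(k=2, max_l=2, min_sup=10).fit(X, errors)
masks = sf.transform(X)        # binary membership masks, one column per slice
top_rows = sf.get_slice(X, 0)  # the rows of X that fall in the best slice
```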
+
+**Performance Optimizations (v0.3.0)**:
+- Sparse matrix operations (scipy.sparse) throughout
+- Direct CSR construction in `_dummify()` (2-3x faster than lil_matrix)
+- Sparse-preserving join in `_join_compatible_slices()` (memory efficient)
+- Upper-bound pruning to avoid evaluating unpromising candidates
+- Missing parent detection to avoid invalid slice combinations
+- Deduplication via ID-based hashing
+- Deterministic ordering for reproducible results
+
+### Validation Module (sliceline/validation.py)
+
+Custom validation overriding sklearn's `check_array` to **accept string/object dtype inputs** (lines 554-555). This is essential because SliceLine works with categorical data that may be represented as strings. The module is derived from sklearn's validation utilities but modified specifically for this use case.
+
+### Numba Optimization Module (sliceline/_numba_ops.py)
+
+Optional JIT-compiled operations for performance improvement. Contains Numba-accelerated versions of:
+- `score_slices_numba()`: 5-6x faster slice scoring
+- `score_ub_single_numba()` / `score_ub_batch_numba()`: Upper-bound scoring
+- `compute_slice_ids_numba()`: ID computation for deduplication
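Whether the fast paths are active can be checked at runtime; the CI workflow runs exactly this probe:

```python
from sliceline import is_numba_available

print(f"Numba available: {is_numba_available()}")
```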
+
+**Coverage exclusion**: This module is excluded from coverage requirements (like `validation.py`) because:
+1. It's completely optional (only loaded if Numba is installed)
+2. Its functions are tested indirectly through the main slicefinder tests
+3. The Numba implementations are verified to produce numerically identical results to the NumPy fallbacks
+4. Direct testing of JIT-compiled functions adds complexity with minimal value
+
+### Testing Structure (tests/)
+
+- `test_slicefinder.py`: Comprehensive unit tests for all private and public methods
+- `test_performance.py`: Performance benchmark suite using pytest-benchmark
+  - Dataset size scaling tests (1K to 50K samples)
+  - Feature count scaling tests (5 to 30 features)
+  - Lattice level scaling tests (max_l 2 to 5)
+  - Memory efficiency tests for sparse operations
+- `conftest.py`: Pytest fixtures for test data (17 different experiments)
+- `experiment.py`: Test case definitions
+- Tests use `pytest-benchmark` for performance tracking
+- Parametrized tests (`experiment_1` through `experiment_17`) validate algorithm correctness in various scenarios
+
+### Benchmarking (benchmarks/)
+
+- `benchmarks.py`: Profiling script for performance testing
+  - Cardinality benchmark: Tests cardinality levels 10, 100, 500, 1000
+  - Dataset size benchmark: Tests scaling from 1K to 50K samples
+  - Measures time, memory, and improvement metrics
+  - Outputs `benchmark_results.json` and `dataset_size_results.json`
+  - Run with: `python benchmarks/benchmarks.py`
+
+## Development Guidelines
+
+### Code Style
+- Line length: 79 characters (enforced by Black)
+- Import sorting: Black profile (enforced by isort)
+- Docstrings: Follow the numpydoc convention for all public methods
+- Type hints: Used where applicable (see slicefinder.py lines 6, 91-97)
+
+### Testing Requirements
+- Unit tests must pass for all changes
+- Coverage threshold: 80% minimum (configured in pyproject.toml)
+- Coverage excludes: validation.py, _numba_ops.py, tests/, hidden files
+- Benchmarking: Available via pytest-benchmark for performance-sensitive changes
+
+### Adding New Features
+- Open a GitHub discussion before starting work
+- Add docstrings following the numpydoc format
+- Update the relevant documentation in docs/source/
+- Add unit tests achieving 80%+ coverage
+- Update the release notes (when requested)
+
+### scikit-learn Compatibility
+The `Slicefinder` class follows scikit-learn conventions:
+- Inherits from `BaseEstimator` and `TransformerMixin`
+- Implements the `fit()`, `transform()`, `fit_transform()` pattern
+- Uses `check_is_fitted()` for state validation
+- Exposes `get_feature_names_out()` for pipeline integration
+- Parameters are set in `__init__` without validation (validated in `fit()`)
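Because the standard estimator protocol applies, the usual scikit-learn idioms work unchanged; a brief sketch:

```python
import numpy as np
from sklearn.utils.validation import check_is_fitted
from sliceline.slicefinder import Slicefinder

X = np.random.default_rng(1).choice(["x", "y"], size=(500, 4))
errors = np.where(X[:, 1] == "x", 1.0, 0.0)

sf = Slicefinder()
masks = sf.fit_transform(X, errors)  # fit(X, errors) then transform(X)
check_is_fitted(sf)                  # no NotFittedError once fit() has run
print(sf.get_feature_names_out())    # slice names for pipeline integration
```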
+
+### Working with Sparse Matrices
+- All internal representations use `scipy.sparse.csr_matrix`
+- Avoid the `.A` shorthand for `.toarray()` - it is not supported in all scipy versions (see comments at lines 383-386, 405-408, 517-519)
+- Use explicit `.toarray()` calls when converting to dense
+- Use `.nnz` for counting non-zero elements (faster than `.sum()`)
+- Direct CSR construction is preferred over lil_matrix for one-hot encoding
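Concretely, the portable patterns from this list look like:

```python
import numpy as np
from scipy.sparse import csr_matrix

m = csr_matrix(np.eye(3))
dense = m.toarray()  # explicit conversion; avoid the `.A` shorthand
count = m.nnz        # non-zero count, cheaper than summing a mask
```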
+
+## Common Pitfalls
+
+1. **Sparse matrix compatibility**: Some scipy versions don't support the `.A` attribute on certain sparse matrix types. Always use `.toarray()` explicitly.
+
+2. **String dtype handling**: The custom validation module allows string inputs, which sklearn's standard validation rejects. Don't replace it with sklearn's built-in validation.
+
+3. **min_sup conversion**: When `min_sup` is a float (0 < min_sup < 1), it gets converted to an absolute count in `fit()` (line 160). This modifies the instance attribute.
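An illustration of that conversion (a hypothetical `resolve_min_sup` helper, not the library's exact code):

```python
def resolve_min_sup(min_sup, n_samples: int) -> int:
    """A fractional min_sup becomes an absolute row count."""
    if isinstance(min_sup, float) and 0 < min_sup < 1:
        return int(min_sup * n_samples)
    return int(min_sup)

assert resolve_min_sup(0.01, 10_000) == 100
assert resolve_min_sup(25, 10_000) == 25
```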
+
+4. **Missing parents**: The `_get_pair_candidates()` method includes logic to handle cases where some parent slices were pruned (lines 578-583). This prevents invalid combinations.
+
+5. **Notebook execution**: Notebooks must be run with an unlimited cell timeout (see Makefile line 22) because of potentially long-running experiments.
+
+6. **Deterministic ordering**: Results are sorted by score first, then lexicographically by slice representation. This ensures reproducible results across runs and Python versions.
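The tie-breaking rule above, as a sketch with hypothetical slice names:

```python
candidates = [("sex == male", 0.42), ("age == [60, 80]", 0.42), ("class == 3", 0.57)]
ordered = sorted(candidates, key=lambda c: (-c[1], c[0]))
# Higher score first; equal scores fall back to lexicographic order,
# which keeps results stable across runs and Python versions.
```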
+
+7. **Memory efficiency**: Use sparse matrices throughout. The `_join_compatible_slices()` method returns a sparse format to avoid memory explosion with large numbers of slices.
+
+## Performance Considerations
+
+### When to Use Sliceline
+
+Sliceline is designed for datasets where:
+- You want to find subgroups where your ML model underperforms
+- Features are categorical or can be binned (continuous values should be discretized)
+- Dataset size is reasonable (10K-100K samples works well)
+
+### Performance Characteristics
+
+Based on benchmarks:
+- **Small datasets (1K samples)**: < 100 ms
+- **Medium datasets (10K samples)**: 100 ms - 1 s
+- **Large datasets (50K+ samples)**: 1-10 s, depending on cardinality
+
+### Optimization Tips
+
+1. **Reduce cardinality**: Bin continuous features or use feature hashing for high-cardinality columns (see the sketch below)
+2. **Limit lattice depth**: Keep `max_l` small (2-3) for faster execution
+3. **Increase min_sup**: A higher support threshold prunes more aggressively
+4. **Use appropriate k**: Smaller `k` values enable better pruning
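Tip 1 in practice, sketched with scikit-learn's `KBinsDiscretizer` (one option among several; the project's notebooks use their own binning):

```python
import numpy as np
from sklearn.preprocessing import KBinsDiscretizer

age = np.random.default_rng(0).uniform(0, 90, size=(1000, 1))
binner = KBinsDiscretizer(n_bins=5, encode="ordinal", strategy="quantile")
age_binned = binner.fit_transform(age)  # five ordinal bins, low cardinality
```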
+
+### Future Optimizations (See NUMBA_OPTIMIZATION.md)
+
+Planned Numba JIT compilation for:
+- Scoring functions (`_score`, `_score_ub`)
+- ID computation for deduplication
+- Expected speedup: 5-50x on numeric-heavy operations