licketysplit 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- licketysplit-0.1.0/MANIFEST.in +12 -0
- licketysplit-0.1.0/PKG-INFO +127 -0
- licketysplit-0.1.0/README.md +100 -0
- licketysplit-0.1.0/pyproject.toml +47 -0
- licketysplit-0.1.0/setup.cfg +16 -0
- licketysplit-0.1.0/setup.py +96 -0
- licketysplit-0.1.0/src/licketysplit/__init__.py +421 -0
- licketysplit-0.1.0/src/licketysplit/_core.cpp +270 -0
- licketysplit-0.1.0/src/licketysplit/_threshold_guessing.py +338 -0
- licketysplit-0.1.0/src/licketysplit/cpp/licketysplit.cpp +1305 -0
- licketysplit-0.1.0/src/licketysplit.egg-info/PKG-INFO +127 -0
- licketysplit-0.1.0/src/licketysplit.egg-info/SOURCES.txt +14 -0
- licketysplit-0.1.0/src/licketysplit.egg-info/dependency_links.txt +1 -0
- licketysplit-0.1.0/src/licketysplit.egg-info/requires.txt +4 -0
- licketysplit-0.1.0/src/licketysplit.egg-info/top_level.txt +1 -0
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
include README.md
|
|
2
|
+
include pyproject.toml
|
|
3
|
+
include setup.py
|
|
4
|
+
include setup.cfg
|
|
5
|
+
|
|
6
|
+
recursive-include src/licketysplit *.py
|
|
7
|
+
recursive-include src/licketysplit *.cpp
|
|
8
|
+
recursive-include src/licketysplit *.hpp
|
|
9
|
+
recursive-include src/licketysplit *.h
|
|
10
|
+
recursive-include src/licketysplit/cpp *.cpp
|
|
11
|
+
recursive-include src/licketysplit/cpp *.hpp
|
|
12
|
+
recursive-include src/licketysplit/cpp *.h
|
|
@@ -0,0 +1,127 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: licketysplit
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Near-optimal decision trees via LicketySPLIT
|
|
5
|
+
Author: Varun Babbar, Hayden McTavish, Zakk Heile, Margo Seltzer, Cynthia Rudin
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/zakk-h/LicketySPLIT
|
|
8
|
+
Project-URL: Repository, https://github.com/zakk-h/LicketySPLIT
|
|
9
|
+
Project-URL: Issues, https://github.com/zakk-h/LicketySPLIT/issues
|
|
10
|
+
Classifier: Development Status :: 3 - Alpha
|
|
11
|
+
Classifier: Intended Audience :: Science/Research
|
|
12
|
+
Classifier: Intended Audience :: Developers
|
|
13
|
+
Classifier: Programming Language :: Python :: 3
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
19
|
+
Classifier: Programming Language :: C++
|
|
20
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
21
|
+
Requires-Python: >=3.9
|
|
22
|
+
Description-Content-Type: text/markdown
|
|
23
|
+
Requires-Dist: numpy>=1.23
|
|
24
|
+
Requires-Dist: matplotlib>=3.6
|
|
25
|
+
Requires-Dist: scikit-learn>=1.1
|
|
26
|
+
Requires-Dist: pandas>=1.5
|
|
27
|
+
|
|
28
|
+
# LicketySPLIT
|
|
29
|
+
|
|
30
|
+
Near-optimal decision tree learning via LicketySPLIT.
|
|
31
|
+
|
|
32
|
+
## Overview
|
|
33
|
+
|
|
34
|
+
LicketySPLIT is a Python package with a C++ backend for learning sparse decision trees for classification. It builds on the LicketySPLIT algorithm from [Near Optimal Decision Trees in a SPLIT Second](https://arxiv.org/abs/2502.15988) and generalizes it with recursive pilot ideas as in [From Rashomon Theory to PRAXIS: Efficient Decision Tree Rashomon Sets](https://arxiv.org/abs/2606.00202).
|
|
35
|
+
|
|
36
|
+
The package supports binary and multi-class classification, optional sample weights, efficient subproblem caching, and threshold-based binarization for continuous features.
|
|
37
|
+
|
|
38
|
+
See the example notebook [here](https://github.com/zakk-h/LicketySPLIT/blob/main/example.ipynb).
|
|
39
|
+
|
|
40
|
+
## Objective
|
|
41
|
+
|
|
42
|
+
LicketySPLIT learns a sparse decision tree by attempting to minimize a regularized empirical training objective.
|
|
43
|
+
|
|
44
|
+
For an unweighted dataset with `n` training samples, the objective is
|
|
45
|
+
|
|
46
|
+
```text
|
|
47
|
+
objective(tree) = training_mistakes(tree) + lambda_leaf * n * number_of_leaves(tree)
|
|
48
|
+
```
|
|
49
|
+
|
|
50
|
+
where:
|
|
51
|
+
|
|
52
|
+
- `training_mistakes(tree)` is the number of training samples misclassified by the tree
|
|
53
|
+
- `number_of_leaves(tree)` is the number of leaves in the tree
|
|
54
|
+
- `lambda_leaf` is the leaf regularization parameter
|
|
55
|
+
- `n` is the number of training samples
|
|
56
|
+
|
|
57
|
+
For a weighted dataset, `n` is replaced with the sum of the sample weights, and `training_mistakes(tree)` counts each misclassified sample by its weight instead of by 1.
|
|
58
|
+
|
|
59
|
+
## Classification setting
|
|
60
|
+
|
|
61
|
+
Labels should be encoded as contiguous nonnegative integers:
|
|
62
|
+
|
|
63
|
+
```python
|
|
64
|
+
0, 1, 2, ..., num_classes - 1
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
## Features
|
|
68
|
+
|
|
69
|
+
LicketySPLIT expects binary input features. For continuous data, use `ThresholdGuessBinarizer` to generate binary threshold features.
|
|
70
|
+
|
|
71
|
+
```python
|
|
72
|
+
from licketysplit import ThresholdGuessBinarizer
|
|
73
|
+
|
|
74
|
+
binarizer = ThresholdGuessBinarizer(
|
|
75
|
+
learning_rate=0.1,
|
|
76
|
+
n_estimators=100,
|
|
77
|
+
max_depth=3,
|
|
78
|
+
random_state=0,
|
|
79
|
+
column_elimination=False,
|
|
80
|
+
)
|
|
81
|
+
|
|
82
|
+
X_train_bin = binarizer.fit_transform(X_train_raw, y_train)
|
|
83
|
+
X_test_bin = binarizer.transform(X_test_raw)
|
|
84
|
+
|
|
85
|
+
feature_names = binarizer.get_feature_names_out()
|
|
86
|
+
```
|
|
87
|
+
|
|
88
|
+
Each generated feature has the form
|
|
89
|
+
|
|
90
|
+
```text
|
|
91
|
+
original_feature <= threshold
|
|
92
|
+
```
|
|
93
|
+
|
|
94
|
+
## Basic usage
|
|
95
|
+
|
|
96
|
+
```python
|
|
97
|
+
from licketysplit import LicketySPLIT
|
|
98
|
+
|
|
99
|
+
model = LicketySPLIT(
|
|
100
|
+
cache_mode="fingerprint",
|
|
101
|
+
cost_caching_enabled=True,
|
|
102
|
+
)
|
|
103
|
+
|
|
104
|
+
model.fit(
|
|
105
|
+
X_train_bin,
|
|
106
|
+
y_train,
|
|
107
|
+
lambda_leaf=0.001,
|
|
108
|
+
depth_budget=6,
|
|
109
|
+
lookahead_k=1,
|
|
110
|
+
)
|
|
111
|
+
|
|
112
|
+
y_pred = model.predict(X_test_bin)
|
|
113
|
+
```
|
|
114
|
+
|
|
115
|
+
## Full example
|
|
116
|
+
|
|
117
|
+
A complete notebook showing loading data, binarization, fitting, prediction, tree inspection, plotting, caching, and sample weights is available [here](https://github.com/zakk-h/LicketySPLIT/blob/main/example.ipynb).
|
|
118
|
+
|
|
119
|
+
## References
|
|
120
|
+
|
|
121
|
+
This package builds on the LicketySPLIT algorithm from:
|
|
122
|
+
|
|
123
|
+
[Near Optimal Decision Trees in a SPLIT Second](https://arxiv.org/abs/2502.15988)
|
|
124
|
+
|
|
125
|
+
and generalizes it with a pilot algorithm approach as in:
|
|
126
|
+
|
|
127
|
+
[From Rashomon Theory to PRAXIS: Efficient Decision Tree Rashomon Sets](https://arxiv.org/abs/2606.00202)
|
|
@@ -0,0 +1,100 @@
|
|
|
1
|
+
# LicketySPLIT
|
|
2
|
+
|
|
3
|
+
Near-optimal decision tree learning via LicketySPLIT.
|
|
4
|
+
|
|
5
|
+
## Overview
|
|
6
|
+
|
|
7
|
+
LicketySPLIT is a Python package with a C++ backend for learning sparse decision trees for classification. It builds on the LicketySPLIT algorithm from [Near Optimal Decision Trees in a SPLIT Second](https://arxiv.org/abs/2502.15988) and generalizes it with recursive pilot ideas as in [From Rashomon Theory to PRAXIS: Efficient Decision Tree Rashomon Sets](https://arxiv.org/abs/2606.00202).
|
|
8
|
+
|
|
9
|
+
The package supports binary and multi-class classification, optional sample weights, efficient subproblem caching, and threshold-based binarization for continuous features.
|
|
10
|
+
|
|
11
|
+
See the example notebook [here](https://github.com/zakk-h/LicketySPLIT/blob/main/example.ipynb).
|
|
12
|
+
|
|
13
|
+
## Objective
|
|
14
|
+
|
|
15
|
+
LicketySPLIT learns a sparse decision tree by attempting to minimize a regularized empirical training objective.
|
|
16
|
+
|
|
17
|
+
For an unweighted dataset with `n` training samples, the objective is
|
|
18
|
+
|
|
19
|
+
```text
|
|
20
|
+
objective(tree) = training_mistakes(tree) + lambda_leaf * n * number_of_leaves(tree)
|
|
21
|
+
```
|
|
22
|
+
|
|
23
|
+
where:
|
|
24
|
+
|
|
25
|
+
- `training_mistakes(tree)` is the number of training samples misclassified by the tree
|
|
26
|
+
- `number_of_leaves(tree)` is the number of leaves in the tree
|
|
27
|
+
- `lambda_leaf` is the leaf regularization parameter
|
|
28
|
+
- `n` is the number of training samples
|
|
29
|
+
|
|
30
|
+
For a weighted dataset, `n` is replaced with the sum of the sample weights, and `training_mistakes(tree)` counts each misclassified sample by its weight instead of by 1.
|
|
31
|
+
|
|
32
|
+
## Classification setting
|
|
33
|
+
|
|
34
|
+
Labels should be encoded as contiguous nonnegative integers:
|
|
35
|
+
|
|
36
|
+
```python
|
|
37
|
+
0, 1, 2, ..., num_classes - 1
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
## Features
|
|
41
|
+
|
|
42
|
+
LicketySPLIT expects binary input features. For continuous data, use `ThresholdGuessBinarizer` to generate binary threshold features.
|
|
43
|
+
|
|
44
|
+
```python
|
|
45
|
+
from licketysplit import ThresholdGuessBinarizer
|
|
46
|
+
|
|
47
|
+
binarizer = ThresholdGuessBinarizer(
|
|
48
|
+
learning_rate=0.1,
|
|
49
|
+
n_estimators=100,
|
|
50
|
+
max_depth=3,
|
|
51
|
+
random_state=0,
|
|
52
|
+
column_elimination=False,
|
|
53
|
+
)
|
|
54
|
+
|
|
55
|
+
X_train_bin = binarizer.fit_transform(X_train_raw, y_train)
|
|
56
|
+
X_test_bin = binarizer.transform(X_test_raw)
|
|
57
|
+
|
|
58
|
+
feature_names = binarizer.get_feature_names_out()
|
|
59
|
+
```
|
|
60
|
+
|
|
61
|
+
Each generated feature has the form
|
|
62
|
+
|
|
63
|
+
```text
|
|
64
|
+
original_feature <= threshold
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
## Basic usage
|
|
68
|
+
|
|
69
|
+
```python
|
|
70
|
+
from licketysplit import LicketySPLIT
|
|
71
|
+
|
|
72
|
+
model = LicketySPLIT(
|
|
73
|
+
cache_mode="fingerprint",
|
|
74
|
+
cost_caching_enabled=True,
|
|
75
|
+
)
|
|
76
|
+
|
|
77
|
+
model.fit(
|
|
78
|
+
X_train_bin,
|
|
79
|
+
y_train,
|
|
80
|
+
lambda_leaf=0.001,
|
|
81
|
+
depth_budget=6,
|
|
82
|
+
lookahead_k=1,
|
|
83
|
+
)
|
|
84
|
+
|
|
85
|
+
y_pred = model.predict(X_test_bin)
|
|
86
|
+
```
|
|
87
|
+
|
|
88
|
+
## Full example
|
|
89
|
+
|
|
90
|
+
A complete notebook showing loading data, binarization, fitting, prediction, tree inspection, plotting, caching, and sample weights is available [here](https://github.com/zakk-h/LicketySPLIT/blob/main/example.ipynb).
|
|
91
|
+
|
|
92
|
+
## References
|
|
93
|
+
|
|
94
|
+
This package builds on the LicketySPLIT algorithm from:
|
|
95
|
+
|
|
96
|
+
[Near Optimal Decision Trees in a SPLIT Second](https://arxiv.org/abs/2502.15988)
|
|
97
|
+
|
|
98
|
+
and generalizes it with a pilot algorithm approach as in:
|
|
99
|
+
|
|
100
|
+
[From Rashomon Theory to PRAXIS: Efficient Decision Tree Rashomon Sets](https://arxiv.org/abs/2606.00202)
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
[build-system]
# setuptools 77 is the first release that accepts the SPDX string form of
# `project.license` (PEP 639) used in the [project] table of this file;
# older releases reject `license = "MIT"` during metadata validation.
requires = [
    "setuptools>=77",
    "wheel",
    "pybind11>=2.13",
    "numpy>=1.23"
]
build-backend = "setuptools.build_meta"
|
|
9
|
+
|
|
10
|
+
[project]
|
|
11
|
+
name = "licketysplit"
|
|
12
|
+
version = "0.1.0"
|
|
13
|
+
description = "Near-optimal decision trees via LicketySPLIT"
|
|
14
|
+
readme = "README.md"
|
|
15
|
+
requires-python = ">=3.9"
|
|
16
|
+
license = "MIT"
|
|
17
|
+
authors = [
|
|
18
|
+
{ name = "Varun Babbar" },
|
|
19
|
+
{ name = "Hayden McTavish" },
|
|
20
|
+
{ name = "Zakk Heile" },
|
|
21
|
+
{ name = "Margo Seltzer" },
|
|
22
|
+
{ name = "Cynthia Rudin" }
|
|
23
|
+
]
|
|
24
|
+
dependencies = [
|
|
25
|
+
"numpy>=1.23",
|
|
26
|
+
"matplotlib>=3.6",
|
|
27
|
+
"scikit-learn>=1.1",
|
|
28
|
+
"pandas>=1.5"
|
|
29
|
+
]
|
|
30
|
+
classifiers = [
|
|
31
|
+
"Development Status :: 3 - Alpha",
|
|
32
|
+
"Intended Audience :: Science/Research",
|
|
33
|
+
"Intended Audience :: Developers",
|
|
34
|
+
"Programming Language :: Python :: 3",
|
|
35
|
+
"Programming Language :: Python :: 3.9",
|
|
36
|
+
"Programming Language :: Python :: 3.10",
|
|
37
|
+
"Programming Language :: Python :: 3.11",
|
|
38
|
+
"Programming Language :: Python :: 3.12",
|
|
39
|
+
"Programming Language :: Python :: 3.13",
|
|
40
|
+
"Programming Language :: C++",
|
|
41
|
+
"Topic :: Scientific/Engineering :: Artificial Intelligence",
|
|
42
|
+
]
|
|
43
|
+
|
|
44
|
+
[project.urls]
|
|
45
|
+
Homepage = "https://github.com/zakk-h/LicketySPLIT"
|
|
46
|
+
Repository = "https://github.com/zakk-h/LicketySPLIT"
|
|
47
|
+
Issues = "https://github.com/zakk-h/LicketySPLIT/issues"
|
|
@@ -0,0 +1,96 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import platform
|
|
3
|
+
from setuptools import setup, Extension
|
|
4
|
+
from setuptools.command.build_ext import build_ext
|
|
5
|
+
import pybind11
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def is_truthy_env(name: str) -> bool:
    """Return True when the environment variable *name* is set to a truthy flag.

    The value is compared case-insensitively against "1", "true", "yes" and
    "on". Surrounding whitespace is ignored so that values such as "1 " or
    "true\n" (common when the variable is set from a file or CI config) are
    still recognised. An unset or empty variable is treated as false.
    """
    raw_value = os.environ.get(name, "")
    return raw_value.strip().lower() in ("1", "true", "yes", "on")
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class BuildExt(build_ext):
    """``build_ext`` command that selects compiler and linker flags per platform.

    Flags are chosen from the compiler type reported by the active compiler
    (``"unix"`` or ``"msvc"``) and from ``platform.system()`` /
    ``platform.machine()``. Setting the ``AGGRESSIVE`` environment variable to
    a truthy value additionally enables BMI/BMI2/AVX2 code generation on
    x86-64 Unix builds; binaries built that way only run on CPUs that support
    those instruction sets.
    """

    # Baseline compile flags, keyed by compiler type.
    c_opts = {
        "msvc": [
            "/O2",
            "/std:c++17",
            # pybind11 translates C++ exceptions into Python exceptions, which
            # needs standard C++ unwinding; MSVC only enables it with /EHsc.
            "/EHsc",
        ],
        "unix": [
            "-O3",
            "-DNDEBUG",
            "-funroll-loops",
        ],
    }

    # Baseline link flags, keyed by compiler type.
    l_opts = {
        "msvc": [],
        "unix": [],
    }

    def build_extensions(self):
        """Compute the flag set for this build, attach it to every extension, then build."""
        ct = self.compiler.compiler_type
        system = platform.system().lower()
        # NOTE(review): platform.machine() describes the build host; a
        # cross-compiling build may need different flags -- confirm if
        # cross-builds are supported.
        machine = platform.machine().lower()
        is_x86_64 = machine in ("x86_64", "amd64")
        is_arm64 = machine in ("arm64", "aarch64")

        # Copy the class-level defaults so the shared lists are never mutated.
        opts = list(self.c_opts.get(ct, []))
        link_opts = list(self.l_opts.get(ct, []))

        if ct == "unix":
            opts += ["-std=c++17", "-fPIC"]

            # Link-time optimisation is enabled everywhere except macOS.
            if system != "darwin":
                opts += ["-flto"]
                link_opts += ["-flto", "-lm"]

            if is_x86_64:
                opts += ["-mpopcnt"]
                print("** Building LicketySPLIT with x86 POPCNT support")
            elif is_arm64:
                print("** Building LicketySPLIT on ARM64; skipping x86 -mpopcnt")
            else:
                print(f"** Building LicketySPLIT on unknown Unix arch {machine}; skipping popcount-specific flags")

        elif ct == "msvc":
            print("** Building LicketySPLIT with MSVC safe flags")

        aggressive = is_truthy_env("AGGRESSIVE")

        if aggressive and ct == "unix":
            if is_x86_64:
                opts += [
                    "-mbmi",
                    "-mbmi2",
                    "-mavx2",
                ]
                print("** Building LicketySPLIT with additional aggressive x86 flags")
            elif is_arm64:
                print("** AGGRESSIVE requested on ARM64; no extra portable flags added")
            else:
                # Previously this case was silent; report that the request
                # had no effect so the build log is not misleading.
                print(f"** AGGRESSIVE requested on unknown Unix arch {machine}; no extra flags added")
        elif aggressive:
            print("** AGGRESSIVE requested on non-Unix compiler; using safe flags")

        for ext in self.extensions:
            # Give each extension its own list so a later in-place change to
            # one extension's flags cannot leak into another's.
            ext.extra_compile_args = list(opts)
            ext.extra_link_args = list(link_opts)

        super().build_extensions()
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
# The single compiled extension, importable as ``licketysplit._core``. It is
# built from the binding source file, with the pybind11 headers and the
# package's ``cpp`` directory on the include path.
_core_extension = Extension(
    "licketysplit._core",
    sources=["src/licketysplit/_core.cpp"],
    include_dirs=[pybind11.get_include(), "src/licketysplit/cpp"],
    language="c++",
)

ext_modules = [_core_extension]

# Only the extension list and the custom build command are passed here; no
# other metadata is supplied to setup() in this script.
setup(ext_modules=ext_modules, cmdclass={"build_ext": BuildExt})
|