nrgboost 0.0.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nrgboost-0.0.1/PKG-INFO +137 -0
- nrgboost-0.0.1/README.md +114 -0
- nrgboost-0.0.1/setup.cfg +4 -0
- nrgboost-0.0.1/setup.py +41 -0
- nrgboost-0.0.1/src/nrgboost/__init__.py +2 -0
- nrgboost-0.0.1/src/nrgboost/dataset.py +107 -0
- nrgboost-0.0.1/src/nrgboost/distribution.py +1036 -0
- nrgboost-0.0.1/src/nrgboost/domain.py +364 -0
- nrgboost-0.0.1/src/nrgboost/preprocessing.py +275 -0
- nrgboost-0.0.1/src/nrgboost/tree/__init__.py +0 -0
- nrgboost-0.0.1/src/nrgboost/tree/_partitioner.py +377 -0
- nrgboost-0.0.1/src/nrgboost/tree/build_tree.py +131 -0
- nrgboost-0.0.1/src/nrgboost/tree/ensemble.py +372 -0
- nrgboost-0.0.1/src/nrgboost/tree/eval/__init__.py +493 -0
- nrgboost-0.0.1/src/nrgboost/tree/eval/build.py +26 -0
- nrgboost-0.0.1/src/nrgboost/tree/fit_tree.py +162 -0
- nrgboost-0.0.1/src/nrgboost/tree/generative.py +226 -0
- nrgboost-0.0.1/src/nrgboost/tree/partitioner.py +265 -0
- nrgboost-0.0.1/src/nrgboost/utils.py +50 -0
- nrgboost-0.0.1/src/nrgboost/wrapper.py +224 -0
- nrgboost-0.0.1/src/nrgboost.egg-info/PKG-INFO +137 -0
- nrgboost-0.0.1/src/nrgboost.egg-info/SOURCES.txt +24 -0
- nrgboost-0.0.1/src/nrgboost.egg-info/dependency_links.txt +1 -0
- nrgboost-0.0.1/src/nrgboost.egg-info/requires.txt +7 -0
- nrgboost-0.0.1/src/nrgboost.egg-info/top_level.txt +2 -0
nrgboost-0.0.1/PKG-INFO
ADDED
|
@@ -0,0 +1,137 @@
|
|
|
1
|
+
Metadata-Version: 2.2
|
|
2
|
+
Name: nrgboost
|
|
3
|
+
Version: 0.0.1
|
|
4
|
+
Summary: Official NRGBoost implementation
|
|
5
|
+
Home-page: https://github.com/ajoo/nrgboost
|
|
6
|
+
Author: João Bravo
|
|
7
|
+
Description-Content-Type: text/markdown
|
|
8
|
+
Requires-Dist: cffi>=1
|
|
9
|
+
Requires-Dist: numpy
|
|
10
|
+
Requires-Dist: scipy
|
|
11
|
+
Requires-Dist: numba
|
|
12
|
+
Requires-Dist: tqdm
|
|
13
|
+
Requires-Dist: joblib
|
|
14
|
+
Requires-Dist: pandas
|
|
15
|
+
Dynamic: author
|
|
16
|
+
Dynamic: description
|
|
17
|
+
Dynamic: description-content-type
|
|
18
|
+
Dynamic: home-page
|
|
19
|
+
Dynamic: requires-dist
|
|
20
|
+
Dynamic: summary
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
# 🔋 NRGBoost: Energy-Based Generative Boosted Trees
|
|
24
|
+
|
|
25
|
+
Official implementation of the [NRGBoost](https://arxiv.org/abs/2410.03535) algorithm.
|
|
26
|
+
|
|
27
|
+
Github: https://github.com/ajoo/nrgboost
|
|
28
|
+
|
|
29
|
+
## Installation
|
|
30
|
+
|
|
31
|
+
To install the latest version of the python package run:
|
|
32
|
+
|
|
33
|
+
```shell
|
|
34
|
+
pip install nrgboost
|
|
35
|
+
```
|
|
36
|
+
|
|
37
|
+
## NRGBoost Models
|
|
38
|
+
|
|
39
|
+
The following example shows how to train a NRGBoost model on the California Housing dataset:
|
|
40
|
+
|
|
41
|
+
```python
|
|
42
|
+
import numpy as np
|
|
43
|
+
from sklearn.datasets import fetch_california_housing
|
|
44
|
+
from sklearn.model_selection import train_test_split
|
|
45
|
+
from sklearn.metrics import r2_score
|
|
46
|
+
|
|
47
|
+
from nrgboost import Dataset, NRGBooster
|
|
48
|
+
|
|
49
|
+
# Get data
|
|
50
|
+
df, y = fetch_california_housing(return_X_y=True, as_frame=True)
|
|
51
|
+
df.insert(0, y.name, y)
|
|
52
|
+
df, test_df = train_test_split(df, test_size=0.2, random_state=123)
|
|
53
|
+
train_df, val_df = train_test_split(df, test_size=0.2, random_state=124)
|
|
54
|
+
|
|
55
|
+
# Create training set
|
|
56
|
+
train_ds = Dataset(train_df)
|
|
57
|
+
|
|
58
|
+
# Train model
|
|
59
|
+
params = {
|
|
60
|
+
'num_trees': 200,
|
|
61
|
+
'shrinkage': 0.15,
|
|
62
|
+
'max_leaves': 256,
|
|
63
|
+
'max_ratio_in_leaf': 2,
|
|
64
|
+
'num_model_samples': 80_000,
|
|
65
|
+
'p_refresh': 0.1,
|
|
66
|
+
'num_chains': 16,
|
|
67
|
+
'burn_in': 100,
|
|
68
|
+
}
|
|
69
|
+
model = NRGBooster.fit(train_ds, params, seed=1984)
|
|
70
|
+
```
|
|
71
|
+
|
|
72
|
+
**Note:** If your dataset contains categorical variables, they should be cast to pandas `Categorical` dtype before calling the `Dataset` constructor. For example `df[categorical_col] = df[categorical_col].astype("category")`.
|
|
73
|
+
|
|
74
|
+
### Prediction
|
|
75
|
+
|
|
76
|
+
To use the trained model for prediction we can call the `predict` method.
|
|
77
|
+
This allows the user to specify a column name for prediction.
|
|
78
|
+
Unlike discriminative methods, **NRGBoost** can be used to predict any column in the data, not just a specific "target" column.
|
|
79
|
+
|
|
80
|
+
```python
|
|
81
|
+
# Do "early stopping" first:
|
|
82
|
+
# find the best boosting round for prediction in validation
|
|
83
|
+
# with cumulative=True, predict will return an iterator
|
|
84
|
+
# over predictions at different rounds
|
|
85
|
+
preds = model.predict(val_df, y.name, cumulative=True)
|
|
86
|
+
val_r2 = [r2_score(val_df[y.name], yh) for yh in preds]
|
|
87
|
+
best_round = np.argmax(val_r2)
|
|
88
|
+
|
|
89
|
+
#%% Compute test R^2 using only the first `best_round` trees
|
|
90
|
+
test_preds = model.predict(test_df, y.name, num_rounds=best_round)
|
|
91
|
+
test_r2 = r2_score(test_df[y.name], test_preds)
|
|
92
|
+
print('Test R^2:', test_r2)
|
|
93
|
+
```
|
|
94
|
+
|
|
95
|
+
**Note:** For numerical columns, NRGBoost currently predicts the expected value according to its learned distribution. In the future we plan to make this more flexible so that the user can select a different point estimate (e.g., median or another quantile) or have access to the full distribution.
|
|
96
|
+
|
|
97
|
+
For categorical columns, NRGBoost will output logits for each possible outcome. The prediction will be an array with shape (N, K) where N is the number of points and K the cardinality of the column. The order of the logits is determined by the pandas `codes` of the possible values.
|
|
98
|
+
The output logits are already normalized so we can convert them to probabilities simply by exponentiation (i.e., no need to softmax).
|
|
99
|
+
|
|
100
|
+
### Sampling
|
|
101
|
+
|
|
102
|
+
To draw 500 samples from the model we can run:
|
|
103
|
+
|
|
104
|
+
```python
|
|
105
|
+
samples_df = model.sample(500, num_steps=100)
|
|
106
|
+
```
|
|
107
|
+
|
|
108
|
+
`num_steps` is the number of Gibbs sampling steps that are used to generate each individual sample. It allows the user to trade off computation time (which scales linearly in `num_steps`) against bias in the generated samples.
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
### Saving and Loading
|
|
112
|
+
|
|
113
|
+
To save a NRGBoost model simply run
|
|
114
|
+
|
|
115
|
+
```python
|
|
116
|
+
model.save('filename')
|
|
117
|
+
```
|
|
118
|
+
|
|
119
|
+
The model can then be loaded via:
|
|
120
|
+
```python
|
|
121
|
+
from nrgboost import NRGBooster
|
|
122
|
+
|
|
123
|
+
model = NRGBooster.load('filename')
|
|
124
|
+
```
|
|
125
|
+
|
|
126
|
+
## Cite NRGBoost
|
|
127
|
+
|
|
128
|
+
You can cite NRGBoost as:
|
|
129
|
+
|
|
130
|
+
```latex
|
|
131
|
+
@article{bravo2024nrgboost,
|
|
132
|
+
title={NRGBoost: Energy-Based Generative Boosted Trees},
|
|
133
|
+
author={Bravo, Jo{\~a}o},
|
|
134
|
+
journal={arXiv preprint arXiv:2410.03535},
|
|
135
|
+
year={2024}
|
|
136
|
+
}
|
|
137
|
+
```
|
nrgboost-0.0.1/README.md
ADDED
|
@@ -0,0 +1,114 @@
|
|
|
1
|
+
# 🔋 NRGBoost: Energy-Based Generative Boosted Trees 🌳
|
|
2
|
+
|
|
3
|
+
This repository contains the official code of the paper [NRGBoost: Energy-Based Generative Boosted Trees](https://arxiv.org/abs/2410.03535) (ICLR 2025).
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
## Installation
|
|
7
|
+
|
|
8
|
+
To install the latest version of the python package run:
|
|
9
|
+
|
|
10
|
+
```shell
|
|
11
|
+
pip install nrgboost
|
|
12
|
+
```
|
|
13
|
+
|
|
14
|
+
## NRGBoost Models
|
|
15
|
+
|
|
16
|
+
The following example shows how to train a NRGBoost model on the California Housing dataset:
|
|
17
|
+
|
|
18
|
+
```python
|
|
19
|
+
import numpy as np
|
|
20
|
+
from sklearn.datasets import fetch_california_housing
|
|
21
|
+
from sklearn.model_selection import train_test_split
|
|
22
|
+
from sklearn.metrics import r2_score
|
|
23
|
+
|
|
24
|
+
from nrgboost import Dataset, NRGBooster
|
|
25
|
+
|
|
26
|
+
# Get data
|
|
27
|
+
df, y = fetch_california_housing(return_X_y=True, as_frame=True)
|
|
28
|
+
df.insert(0, y.name, y)
|
|
29
|
+
df, test_df = train_test_split(df, test_size=0.2, random_state=123)
|
|
30
|
+
train_df, val_df = train_test_split(df, test_size=0.2, random_state=124)
|
|
31
|
+
|
|
32
|
+
# Create training set
|
|
33
|
+
train_ds = Dataset(train_df)
|
|
34
|
+
|
|
35
|
+
# Train model
|
|
36
|
+
params = {
|
|
37
|
+
'num_trees': 200,
|
|
38
|
+
'shrinkage': 0.15,
|
|
39
|
+
'max_leaves': 256,
|
|
40
|
+
'max_ratio_in_leaf': 2,
|
|
41
|
+
'num_model_samples': 80_000,
|
|
42
|
+
'p_refresh': 0.1,
|
|
43
|
+
'num_chains': 16,
|
|
44
|
+
'burn_in': 100,
|
|
45
|
+
}
|
|
46
|
+
model = NRGBooster.fit(train_ds, params, seed=1984)
|
|
47
|
+
```
|
|
48
|
+
|
|
49
|
+
**Note:** If your dataset contains categorical variables, they should be cast to pandas `Categorical` dtype before calling the `Dataset` constructor. For example `df[categorical_col] = df[categorical_col].astype("category")`.
|
|
50
|
+
|
|
51
|
+
### Prediction
|
|
52
|
+
|
|
53
|
+
To use the trained model for prediction we can call the `predict` method.
|
|
54
|
+
This allows the user to specify a column name for prediction.
|
|
55
|
+
Unlike discriminative methods, **NRGBoost** can be used to predict any column in the data, not just a specific "target" column.
|
|
56
|
+
|
|
57
|
+
```python
|
|
58
|
+
# Do "early stopping" first:
|
|
59
|
+
# find the best boosting round for prediction in validation
|
|
60
|
+
# with cumulative=True, predict will return an iterator
|
|
61
|
+
# over predictions at different rounds
|
|
62
|
+
preds = model.predict(val_df, y.name, cumulative=True)
|
|
63
|
+
val_r2 = [r2_score(val_df[y.name], yh) for yh in preds]
|
|
64
|
+
best_round = np.argmax(val_r2)
|
|
65
|
+
|
|
66
|
+
#%% Compute test R^2 using only the first `best_round` trees
|
|
67
|
+
test_preds = model.predict(test_df, y.name, num_rounds=best_round)
|
|
68
|
+
test_r2 = r2_score(test_df[y.name], test_preds)
|
|
69
|
+
print('Test R^2:', test_r2)
|
|
70
|
+
```
|
|
71
|
+
|
|
72
|
+
**Note:** For numerical columns, NRGBoost currently predicts the expected value according to its learned distribution. In the future we plan to make this more flexible so that the user can select a different point estimate (e.g., median or another quantile) or have access to the full distribution.
|
|
73
|
+
|
|
74
|
+
For categorical columns, NRGBoost will output logits for each possible outcome. The prediction will be an array with shape (N, K) where N is the number of points and K the cardinality of the column. The order of the logits is determined by the pandas `codes` of the possible values.
|
|
75
|
+
The output logits are already normalized so we can convert them to probabilities simply by exponentiation (i.e., no need to softmax).
|
|
76
|
+
|
|
77
|
+
### Sampling
|
|
78
|
+
|
|
79
|
+
To draw 500 samples from the model we can run:
|
|
80
|
+
|
|
81
|
+
```python
|
|
82
|
+
samples_df = model.sample(500, num_steps=100)
|
|
83
|
+
```
|
|
84
|
+
|
|
85
|
+
`num_steps` is the number of Gibbs sampling steps that are used to generate each individual sample. It allows the user to trade off computation time (which scales linearly in `num_steps`) against bias in the generated samples.
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
### Saving and Loading
|
|
89
|
+
|
|
90
|
+
To save a NRGBoost model simply run
|
|
91
|
+
|
|
92
|
+
```python
|
|
93
|
+
model.save('filename')
|
|
94
|
+
```
|
|
95
|
+
|
|
96
|
+
The model can then be loaded via:
|
|
97
|
+
```python
|
|
98
|
+
from nrgboost import NRGBooster
|
|
99
|
+
|
|
100
|
+
model = NRGBooster.load('filename')
|
|
101
|
+
```
|
|
102
|
+
|
|
103
|
+
## Cite NRGBoost
|
|
104
|
+
|
|
105
|
+
You can cite NRGBoost as:
|
|
106
|
+
|
|
107
|
+
```latex
|
|
108
|
+
@article{bravo2024nrgboost,
|
|
109
|
+
title={NRGBoost: Energy-Based Generative Boosted Trees},
|
|
110
|
+
author={Bravo, Jo{\~a}o},
|
|
111
|
+
journal={arXiv preprint arXiv:2410.03535},
|
|
112
|
+
year={2024}
|
|
113
|
+
}
|
|
114
|
+
```
|
nrgboost-0.0.1/setup.cfg
ADDED
nrgboost-0.0.1/setup.py
ADDED
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
from setuptools import setup, find_packages
|
|
2
|
+
from os import path
|
|
3
|
+
|
|
4
|
+
# Build script for the nrgboost distribution.
#
# The PyPI long description is a short hand-written header followed by the
# part of README.md that starts at the "## Installation" heading.

readme_folder = path.dirname(path.abspath(__file__))
readme_path = path.join(readme_folder, 'README.md')

# README.md contains non-ASCII characters, so decode it explicitly as UTF-8
# instead of relying on the platform's locale encoding.
with open(readme_path, encoding='utf-8') as readme_file:
    readme_text = readme_file.read()

# str.find returns -1 when the heading is absent; slicing from -1 would keep
# only the last character of the README, so fall back to the whole text.
installation_start = readme_text.find('## Installation')
if installation_start < 0:
    installation_start = 0

long_description = '''
# 🔋 NRGBoost: Energy-Based Generative Boosted Trees

Official implementation of the [NRGBoost](https://arxiv.org/abs/2410.03535) algorithm.

Github: https://github.com/ajoo/nrgboost

''' + readme_text[installation_start:]

setup(
    name='nrgboost',
    version='0.0.1',
    description='Official NRGBoost implementation',
    author='João Bravo',
    url='https://github.com/ajoo/nrgboost',
    # Packages live under src/ and are discovered automatically.
    packages=find_packages(where='src'),
    package_dir={'': 'src'},
    install_requires=[
        'cffi>=1',
        'numpy',
        'scipy',
        'numba',
        'tqdm',
        'joblib',
        'pandas',
    ],
    # cffi is needed at build time to process the cffi_modules entry below.
    setup_requires=['cffi>=1'],
    cffi_modules=['src/nrgboost/tree/eval/build.py:ffibuilder'],
    long_description=long_description,
    long_description_content_type='text/markdown',
)
|
|
@@ -0,0 +1,107 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
from typing import Optional
|
|
3
|
+
from dataclasses import dataclass
|
|
4
|
+
|
|
5
|
+
from nrgboost.distribution import Distribution
|
|
6
|
+
from nrgboost.preprocessing import (
|
|
7
|
+
convert_fixed_point,
|
|
8
|
+
fit_discretize_dataset,
|
|
9
|
+
transform_dataset,
|
|
10
|
+
infer_discretization_types,
|
|
11
|
+
map_discretization_types,
|
|
12
|
+
pullback_samples
|
|
13
|
+
)
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
@dataclass
class DatasetTransforms:
    """Fitted preprocessing metadata for moving data to and from discretized space.

    Instances are normally created with :meth:`fit_transform`, which fits the
    fixed point conversion and the discretization on a dataframe.
    """

    # Column names of the dataframe the transforms were fitted on, in order.
    columns: list[str]
    # Maps a column name to the power-of-ten exponent ``d`` that
    # ``inverse_transform`` divides by (``df[col] / 10**d``). Empty when fixed
    # point inference was disabled.
    fixed_point_digits: dict
    # Discretization type per column, either user supplied or inferred.
    discretization_types: dict
    # Fitted transforms, as returned by ``fit_discretize_dataset``.
    transforms: dict

    @staticmethod
    def fit_transform(
        df,
        num_bins=255,
        infer_fixed_point=True,
        discretization_types=None,
        infer_ordered_categoricals=False,
        infer_continuous_ordered_categoricals=False,
    ) -> tuple[DatasetTransforms, Distribution, Distribution]:
        """Fit the preprocessing on ``df`` and discretize it.

        Args:
            df: Dataframe to fit on.
            num_bins: Forwarded as ``num_bins`` to ``fit_discretize_dataset``
                and as ``max_value`` to ``infer_discretization_types``.
            infer_fixed_point: Whether to run ``convert_fixed_point`` on ``df``
                before discretizing.
            discretization_types: Optional user specified discretization types.
                Inferred from ``df`` when None.
            infer_ordered_categoricals: Forwarded to
                ``infer_discretization_types``.
            infer_continuous_ordered_categoricals: Forwarded to
                ``infer_discretization_types``.

        Returns:
            A 3-tuple ``(transforms, distribution, uniform)``.
        """
        # NOTE(review): the third element is annotated as Distribution on the
        # assumption that ``uniform`` has the same type as ``distribution``;
        # confirm against fit_discretize_dataset.
        if infer_fixed_point:
            df, fixed_point_digits = convert_fixed_point(df)
        else:
            fixed_point_digits = dict()

        if discretization_types is None:
            discretization_types = infer_discretization_types(
                df,
                max_value=num_bins,
                infer_ordered_categoricals=infer_ordered_categoricals,
                infer_continuous_ordered_categoricals=infer_continuous_ordered_categoricals,
            )

        distribution, uniform, transforms = fit_discretize_dataset(
            df,
            num_bins=num_bins,
            return_uniform=True,
            discretization_types=discretization_types,
        )

        fitted = DatasetTransforms(
            df.columns.to_list(),
            fixed_point_digits,
            discretization_types,
            transforms,
        )
        return fitted, distribution, uniform

    def col_number(self, col):
        """Return the position of column ``col`` in ``self.columns``.

        Raises:
            ValueError: If ``col`` is not one of the fitted columns.
        """
        return self.columns.index(col)

    def transform(self, df):
        """Apply the fitted fixed point conversion and transforms to ``df``."""
        df, _ = convert_fixed_point(df, digits=self.fixed_point_digits)
        return transform_dataset(df, self.transforms)

    # Calling the instance is equivalent to calling ``transform``.
    __call__ = transform

    def inverse_transform(self, data, seed=None):
        """Map discretized ``data`` back to a dataframe in the original space.

        Args:
            data: Discretized data, passed to ``pullback_samples``.
            seed: Optional seed forwarded to ``pullback_samples``.

        Returns:
            The dataframe produced by ``pullback_samples`` with the fixed point
            columns rescaled.
        """
        col_types = map_discretization_types(self.discretization_types, self.columns)
        df = pullback_samples(data, self.transforms, col_types=col_types, seed=seed)
        # Undo the fixed point scaling column by column.
        for col, d in self.fixed_point_digits.items():
            df[col] = df[col] / 10**d
        return df
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
class Dataset:
    """Discretized dataset for training models."""

    def __init__(
        self,
        df,
        num_bins: int = 255,
        infer_fixed_point: bool = True,
        discretization_types: Optional[dict] = None,
        infer_ordered_categoricals: bool = False,
        infer_continuous_ordered_categoricals: bool = False,
    ) -> None:
        """Create a Dataset for training models.

        Automatically infers how to discretize data and creates a discrete dataset.
        Internally stores metadata to transform to and from discretized space.

        Args:
            df (Dataframe): Dataset to transform.
            num_bins (int): Maximum cardinality of each dimension. Defaults to 255.
            infer_fixed_point (bool): Whether to automatically infer fixed point precision columns.
                Defaults to True.
            discretization_types (dict, optional): User specified discretization types.
                Only for advanced use. Defaults to None.
            infer_ordered_categoricals (bool): Whether to infer integer columns that only
                take a small number of possible discrete values. Defaults to False.
            infer_continuous_ordered_categoricals (bool): Whether to infer floating point columns
                that only take a small number of possible discrete values. Defaults to False.

        Attributes:
            transform: First value returned by ``DatasetTransforms.fit_transform``
                (the fitted ``DatasetTransforms``).
            distribution: Second value returned by ``DatasetTransforms.fit_transform``.
            uniform: Third value returned by ``DatasetTransforms.fit_transform``.
        """
        fitted = DatasetTransforms.fit_transform(
            df,
            num_bins=num_bins,
            infer_fixed_point=infer_fixed_point,
            discretization_types=discretization_types,
            infer_ordered_categoricals=infer_ordered_categoricals,
            infer_continuous_ordered_categoricals=infer_continuous_ordered_categoricals,
        )
        self.transform, self.distribution, self.uniform = fitted
|