unifiedbooster 0.6.0__tar.gz → 0.7.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {unifiedbooster-0.6.0 → unifiedbooster-0.7.0}/PKG-INFO +1 -1
- {unifiedbooster-0.6.0 → unifiedbooster-0.7.0}/README.md +3 -1
- {unifiedbooster-0.6.0 → unifiedbooster-0.7.0}/setup.py +1 -1
- {unifiedbooster-0.6.0 → unifiedbooster-0.7.0}/unifiedbooster/gbdt.py +9 -5
- {unifiedbooster-0.6.0 → unifiedbooster-0.7.0}/unifiedbooster/gbdt_classification.py +43 -11
- {unifiedbooster-0.6.0 → unifiedbooster-0.7.0}/unifiedbooster/gbdt_regression.py +45 -12
- {unifiedbooster-0.6.0 → unifiedbooster-0.7.0}/unifiedbooster/gpoptimization.py +32 -25
- {unifiedbooster-0.6.0 → unifiedbooster-0.7.0}/unifiedbooster/nonconformist/__init__.py +7 -1
- {unifiedbooster-0.6.0 → unifiedbooster-0.7.0}/unifiedbooster/nonconformist/base.py +4 -4
- unifiedbooster-0.7.0/unifiedbooster/predictioninterval/__init__.py +3 -0
- unifiedbooster-0.7.0/unifiedbooster/predictioninterval/predictioninterval.py +314 -0
- unifiedbooster-0.7.0/unifiedbooster/predictionset/__init__.py +3 -0
- unifiedbooster-0.7.0/unifiedbooster/predictionset/predictionset.py +111 -0
- {unifiedbooster-0.6.0 → unifiedbooster-0.7.0}/unifiedbooster.egg-info/PKG-INFO +1 -1
- {unifiedbooster-0.6.0 → unifiedbooster-0.7.0}/unifiedbooster.egg-info/SOURCES.txt +5 -1
- {unifiedbooster-0.6.0 → unifiedbooster-0.7.0}/LICENSE +0 -0
- {unifiedbooster-0.6.0 → unifiedbooster-0.7.0}/setup.cfg +0 -0
- {unifiedbooster-0.6.0 → unifiedbooster-0.7.0}/unifiedbooster/__init__.py +0 -0
- {unifiedbooster-0.6.0 → unifiedbooster-0.7.0}/unifiedbooster/nonconformist/acp.py +0 -0
- {unifiedbooster-0.6.0 → unifiedbooster-0.7.0}/unifiedbooster/nonconformist/cp.py +0 -0
- {unifiedbooster-0.6.0 → unifiedbooster-0.7.0}/unifiedbooster/nonconformist/evaluation.py +0 -0
- {unifiedbooster-0.6.0 → unifiedbooster-0.7.0}/unifiedbooster/nonconformist/icp.py +0 -0
- {unifiedbooster-0.6.0 → unifiedbooster-0.7.0}/unifiedbooster/nonconformist/nc.py +0 -0
- {unifiedbooster-0.6.0 → unifiedbooster-0.7.0}/unifiedbooster/nonconformist/util.py +0 -0
- {unifiedbooster-0.6.0 → unifiedbooster-0.7.0}/unifiedbooster.egg-info/dependency_links.txt +0 -0
- {unifiedbooster-0.6.0 → unifiedbooster-0.7.0}/unifiedbooster.egg-info/entry_points.txt +0 -0
- {unifiedbooster-0.6.0 → unifiedbooster-0.7.0}/unifiedbooster.egg-info/not-zip-safe +0 -0
- {unifiedbooster-0.6.0 → unifiedbooster-0.7.0}/unifiedbooster.egg-info/requires.txt +0 -0
- {unifiedbooster-0.6.0 → unifiedbooster-0.7.0}/unifiedbooster.egg-info/top_level.txt +0 -0
--- a/README.md
+++ b/README.md
@@ -7,6 +7,8 @@ Unified interface for Gradient Boosted Decision Trees algorithms
 
 ## Examples
 
+See also https://thierrymoudiki.github.io/blog/2024/08/05/python/r/unibooster
+
 ### classification
 
 ```python
@@ -90,4 +92,4 @@ for dataset in datasets:
     print(f"Regression Root Mean Squared Error xgboost: {mse1:.2f}")
     print(f"Regression Root Mean Squared Error catboost: {mse2:.2f}")
     print(f"Regression Root Mean Squared Error lightgbm: {mse3:.2f}")
-```
+```
--- a/unifiedbooster/gbdt.py
+++ b/unifiedbooster/gbdt.py
@@ -35,7 +35,6 @@ class GBDT(BaseEstimator):
     **kwargs: dict
         additional parameters to be passed to the class
     """
-
     def __init__(
         self,
        model_type="xgboost",
@@ -44,6 +43,8 @@ class GBDT(BaseEstimator):
         max_depth=3,
         rowsample=1.0,
         colsample=1.0,
+        level=None,
+        pi_method=None,
         verbose=0,
         seed=123,
         **kwargs
@@ -55,6 +56,8 @@ class GBDT(BaseEstimator):
         self.max_depth = max_depth
         self.rowsample = rowsample
         self.colsample = colsample
+        self.level = level
+        self.pi_method = pi_method
         self.verbose = verbose
         self.seed = seed
 
@@ -91,7 +94,7 @@ class GBDT(BaseEstimator):
                 "verbose": self.verbose,
                 "random_seed": self.seed,
                 "boosting_type": "Plain",
-                "leaf_estimation_iterations": 1,
+                "leaf_estimation_iterations": 1,
                 "bootstrap_type": "Bernoulli",
                 **kwargs,
             }
@@ -126,7 +129,6 @@ class GBDT(BaseEstimator):
 
         self: object
         """
-
         if getattr(self, "type_fit") == "classification":
             self.classes_ = np.unique(y)  # for compatibility with sklearn
             self.n_classes_ = len(
@@ -152,5 +154,7 @@ class GBDT(BaseEstimator):
 
         model predictions: {array-like}
         """
-
-        return getattr(self, "model").predict(X)
+        if self.level is not None and self.type_fit == "regression":
+            return getattr(self, "model").predict(X, return_pi=True)
+        else:
+            return getattr(self, "model").predict(X)
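To make the new branch concrete: for regressors with `level` set, the wrapped model's `predict(X, return_pi=True)` applies the split-conformal rule implemented in `predictioninterval.py` later in this diff. A standalone numpy sketch of that rule (the function name and variables here are illustrative, not part of the package):

```python
import numpy as np

def split_conformal_bounds(preds_calib, y_calib, preds_test, level=95):
    # Half-width = the `level`-quantile of absolute calibration residuals
    # (the "higher" quantile method requires numpy >= 1.22).
    q = np.quantile(np.abs(y_calib - preds_calib), q=level / 100, method="higher")
    return preds_test - q, preds_test + q
```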
--- a/unifiedbooster/gbdt_classification.py
+++ b/unifiedbooster/gbdt_classification.py
@@ -1,5 +1,6 @@
 from .gbdt import GBDT
 from sklearn.base import ClassifierMixin
+from .predictionset import PredictionSet
 
 try:
     from xgboost import XGBClassifier
@@ -39,6 +40,12 @@ class GBDTClassifier(GBDT, ClassifierMixin):
 
         colsample: float
             percentage of features to use at each node split
+
+        level: float
+            confidence level for prediction sets
+
+        pi_method: str
+            method for constructing the prediction intervals: 'icp' (inductive conformal), 'tcp' (transductive conformal)
 
         verbose: int
             controls verbosity (default=0)
@@ -88,7 +95,6 @@ class GBDTClassifier(GBDT, ClassifierMixin):
     print(f"Classification Accuracy lightgbm: {accuracy3:.2f}")
     ```
     """
-
     def __init__(
         self,
         model_type="xgboost",
@@ -97,6 +103,8 @@ class GBDTClassifier(GBDT, ClassifierMixin):
         max_depth=3,
         rowsample=1.0,
         colsample=1.0,
+        level=None,
+        pi_method="icp",
         verbose=0,
         seed=123,
         **kwargs,
@@ -111,21 +119,46 @@ class GBDTClassifier(GBDT, ClassifierMixin):
             max_depth=max_depth,
             rowsample=rowsample,
             colsample=colsample,
+            level=level,
+            pi_method=pi_method,
             verbose=verbose,
             seed=seed,
             **kwargs,
         )
 
-        if model_type == "xgboost":
-            self.model = XGBClassifier(**self.params)
-        elif model_type == "catboost":
-            self.model = CatBoostClassifier(**self.params)
-        elif model_type == "lightgbm":
-            self.model = LGBMClassifier(**self.params)
-        elif model_type == "gradientboosting":
-            self.model = GradientBoostingClassifier(**self.params)
+        if self.level is not None:
+
+            if model_type == "xgboost":
+                self.model = PredictionSet(XGBClassifier(**self.params),
+                                           level=self.level,
+                                           method=self.pi_method)
+            elif model_type == "catboost":
+                self.model = PredictionSet(CatBoostClassifier(**self.params),
+                                           level=self.level,
+                                           method=self.pi_method)
+            elif model_type == "lightgbm":
+                self.model = PredictionSet(LGBMClassifier(**self.params),
+                                           level=self.level,
+                                           method=self.pi_method)
+            elif model_type == "gradientboosting":
+                self.model = PredictionSet(GradientBoostingClassifier(**self.params),
+                                           level=self.level,
+                                           method=self.pi_method)
+            else:
+                raise ValueError(f"Unknown model_type: {model_type}")
+
         else:
-            raise ValueError(f"Unknown model_type: {model_type}")
+
+            if model_type == "xgboost":
+                self.model = XGBClassifier(**self.params)
+            elif model_type == "catboost":
+                self.model = CatBoostClassifier(**self.params)
+            elif model_type == "lightgbm":
+                self.model = LGBMClassifier(**self.params)
+            elif model_type == "gradientboosting":
+                self.model = GradientBoostingClassifier(**self.params)
+            else:
+                raise ValueError(f"Unknown model_type: {model_type}")
 
     def predict_proba(self, X):
         """Predict probabilities for test data X.
@@ -143,5 +176,4 @@ class GBDTClassifier(GBDT, ClassifierMixin):
 
             probability estimates for test data: {array-like}
         """
-
         return self.model.predict_proba(X)
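A hedged usage sketch for the new classifier arguments (assuming, as in the README and docstring examples, that `GBDTClassifier` is exported at the package top level; the dataset is illustrative):

```python
import unifiedbooster as ub
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=123)

# level=95 routes construction through PredictionSet (see the diff above)
clf = ub.GBDTClassifier(model_type="lightgbm", level=95, pi_method="icp")
clf.fit(X_train, y_train)
sets = clf.predict(X_test)  # conformal prediction sets, one per test row
```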
--- a/unifiedbooster/gbdt_regression.py
+++ b/unifiedbooster/gbdt_regression.py
@@ -1,5 +1,6 @@
 from .gbdt import GBDT
 from sklearn.base import RegressorMixin
+from .predictioninterval import PredictionInterval
 
 try:
     from xgboost import XGBRegressor
@@ -39,6 +40,12 @@ class GBDTRegressor(GBDT, RegressorMixin):
 
         colsample: float
             percentage of features to use at each node split
+
+        level: float
+            confidence level for prediction sets
+
+        pi_method: str
+            method for constructing the prediction intervals: 'splitconformal', 'localconformal'
 
         verbose: int
             controls verbosity (default=0)
@@ -88,7 +95,6 @@ class GBDTRegressor(GBDT, RegressorMixin):
     print(f"Regression Mean Squared Error lightgbm: {mse3:.2f}")
     ```
     """
-
     def __init__(
         self,
         model_type="xgboost",
@@ -97,12 +103,14 @@ class GBDTRegressor(GBDT, RegressorMixin):
         max_depth=3,
         rowsample=1.0,
         colsample=1.0,
+        level=None,
+        pi_method="splitconformal",
         verbose=0,
         seed=123,
         **kwargs,
     ):
 
-        self.type_fit = "regression"
+        self.type_fit = "regression"
 
         super().__init__(
             model_type=model_type,
@@ -111,18 +119,43 @@ class GBDTRegressor(GBDT, RegressorMixin):
             max_depth=max_depth,
             rowsample=rowsample,
             colsample=colsample,
+            level=level,
+            pi_method=pi_method,
             verbose=verbose,
             seed=seed,
             **kwargs,
         )
 
-        if model_type == "xgboost":
-            self.model = XGBRegressor(**self.params)
-        elif model_type == "catboost":
-            self.model = CatBoostRegressor(**self.params)
-        elif model_type == "lightgbm":
-            self.model = LGBMRegressor(**self.params)
-        elif model_type == "gradientboosting":
-            self.model = GradientBoostingRegressor(**self.params)
-        else:
-            raise ValueError(f"Unknown model_type: {model_type}")
+        if self.level is not None:
+
+            if model_type == "xgboost":
+                self.model = PredictionInterval(XGBRegressor(**self.params),
+                                                level=self.level,
+                                                method=self.pi_method)
+            elif model_type == "catboost":
+                self.model = PredictionInterval(CatBoostRegressor(**self.params),
+                                                level=self.level,
+                                                method=self.pi_method)
+            elif model_type == "lightgbm":
+                self.model = PredictionInterval(LGBMRegressor(**self.params),
+                                                level=self.level,
+                                                method=self.pi_method)
+            elif model_type == "gradientboosting":
+                self.model = PredictionInterval(GradientBoostingRegressor(**self.params),
+                                                level=self.level,
+                                                method=self.pi_method)
+            else:
+                raise ValueError(f"Unknown model_type: {model_type}")
+
+        else:
+
+            if model_type == "xgboost":
+                self.model = XGBRegressor(**self.params)
+            elif model_type == "catboost":
+                self.model = CatBoostRegressor(**self.params)
+            elif model_type == "lightgbm":
+                self.model = LGBMRegressor(**self.params)
+            elif model_type == "gradientboosting":
+                self.model = GradientBoostingRegressor(**self.params)
+            else:
+                raise ValueError(f"Unknown model_type: {model_type}")
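And the regressor counterpart, a hedged sketch (same top-level export assumption): with `level` set, `predict` returns the `DescribeResult` namedtuple produced by `PredictionInterval`, so empirical coverage can be checked directly:

```python
import unifiedbooster as ub
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split

X, y = load_diabetes(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=123)

reg = ub.GBDTRegressor(model_type="xgboost", level=95,
                       pi_method="splitconformal")
reg.fit(X_train, y_train)
res = reg.predict(X_test)  # namedtuple with fields mean, lower, upper
coverage = ((res.lower <= y_test) & (y_test <= res.upper)).mean()
print(f"empirical coverage: {coverage:.2f}")  # ~0.95 expected
```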
--- a/unifiedbooster/gpoptimization.py
+++ b/unifiedbooster/gpoptimization.py
@@ -168,7 +168,8 @@ def cross_val_optim(
     ).mean()
 
     # objective function for hyperparams tuning
-    if n_estimators is not None:
+    if n_estimators is not None:
+
         def crossval_objective(xx):
             return gbdt_cv(
                 X_train=X_train,
@@ -185,25 +186,27 @@ def cross_val_optim(
                 scoring=scoring,
                 seed=seed,
             )
-
+
+    else:  # n_estimators is None
+
         def crossval_objective(xx):
             return gbdt_cv(
-                X_train=X_train,
-                y_train=y_train,
-                model_type=model_type,
-                n_estimators=int(10 ** xx[4]),
-                learning_rate=10 ** xx[0],
-                max_depth=int(xx[1]),
-                rowsample=xx[2],
-                colsample=xx[3],
-                cv=cv,
-                n_jobs=n_jobs,
-                type_fit=type_fit,
-                scoring=scoring,
-                seed=seed,
-            )
+                X_train=X_train,
+                y_train=y_train,
+                model_type=model_type,
+                n_estimators=int(10 ** xx[4]),
+                learning_rate=10 ** xx[0],
+                max_depth=int(xx[1]),
+                rowsample=xx[2],
+                colsample=xx[3],
+                cv=cv,
+                n_jobs=n_jobs,
+                type_fit=type_fit,
+                scoring=scoring,
+                seed=seed,
+            )
 
-    if n_estimators is not None:
+    if n_estimators is not None:
         if surrogate_obj is None:
             gp_opt = gp.GPOpt(
                 objective_func=crossval_objective,
@@ -240,7 +243,7 @@ def cross_val_optim(
                 n_iter=n_iter,
                 seed=seed,
             )
-    else:
+    else:  # n_estimators is None
         if surrogate_obj is None:
             gp_opt = gp.GPOpt(
                 objective_func=crossval_objective,
@@ -251,7 +254,7 @@ def cross_val_optim(
                     "max_depth",
                     "rowsample",
                     "colsample",
-                    "n_estimators"
+                    "n_estimators",
                 ],
                 method="bayesian",
                 n_init=n_init,
@@ -268,7 +271,7 @@ def cross_val_optim(
                     "max_depth",
                     "rowsample",
                     "colsample",
-                    "n_estimators"
+                    "n_estimators",
                 ],
                 acquisition="ucb",
                 method="splitconformal",
@@ -282,7 +285,11 @@ def cross_val_optim(
 
     res = gp_opt.optimize(verbose=verbose, abs_tol=abs_tol)
     res.best_params["model_type"] = model_type
-    res.best_params["n_estimators"] = int(n_estimators) if n_estimators is not None else int(10 ** res.best_params["n_estimators"])
+    res.best_params["n_estimators"] = (
+        int(n_estimators)
+        if n_estimators is not None
+        else int(10 ** res.best_params["n_estimators"])
+    )
     res.best_params["learning_rate"] = 10 ** res.best_params["learning_rate"]
     res.best_params["max_depth"] = int(res.best_params["max_depth"])
     res.best_params["rowsample"] = res.best_params["rowsample"]
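The decoding above exists because `cross_val_optim` searches `learning_rate` and `n_estimators` on a log10 scale and `max_depth` as a continuous value; a toy sketch with invented numbers:

```python
# Invented GPOpt output, before decoding
best = {"learning_rate": -2.0, "n_estimators": 2.5, "max_depth": 4.7}

best["learning_rate"] = 10 ** best["learning_rate"]     # -> 0.01
best["n_estimators"] = int(10 ** best["n_estimators"])  # -> 316
best["max_depth"] = int(best["max_depth"])              # -> 4
```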
@@ -355,7 +362,7 @@ def lazy_cross_val_optim(
 
     customize: boolean
         if True, the surrogate is transformed into a quasi-randomized network (default is False)
-
+
     n_estimators: int
         maximum number of trees that can be built (default is None, if None, the parameters is tuned)
 
@@ -383,7 +390,7 @@ def lazy_cross_val_optim(
    Examples:
 
    ```python
-    import os
+    import os
     import unifiedbooster as ub
     from sklearn.datasets import load_breast_cancer
     from sklearn.model_selection import train_test_split
@@ -454,7 +461,7 @@ def lazy_cross_val_optim(
         if customize == True:
             print(f"\n surrogate: CustomRegressor({est[0]})")
             surr_obj = ns.CustomRegressor(obj=est[1]())
-        else:
+        else:
             print(f"\n surrogate: {est[0]}")
             surr_obj = est[1]()
             res = cross_val_optim(
@@ -479,7 +486,7 @@ def lazy_cross_val_optim(
             if customize == True:
                 results.append((f"CustomRegressor({est[0]})", res))
             else:
-                results.append((est[0], res))
+                results.append((est[0], res))
         except:
             pass
 
--- a/unifiedbooster/nonconformist/__init__.py
+++ b/unifiedbooster/nonconformist/__init__.py
@@ -18,13 +18,19 @@ from .nc import (
 )
 from .cp import IcpRegressor, TcpClassifier
 from .icp import IcpClassifier
-from .base import RegressorAdapter
+from .nc import ClassifierNc, MarginErrFunc
+from .base import RegressorAdapter, ClassifierAdapter
 
 __all__ = [
     "AbsErrorErrFunc",
+    "MarginErrFunc",
     "QuantileRegErrFunc",
     "RegressorAdapter",
+    "ClassifierAdapter",
     "RegressorNc",
+    "ClassifierNc",
     "RegressorNormalizer",
     "IcpRegressor",
+    "IcpClassifier",
+    "TcpClassifier"
 ]
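With the widened export list, the conformal building blocks can be imported directly; a sketch mirroring the construction used in `predictionset.py` below (the base classifier here is an arbitrary example, not from the package):

```python
from sklearn.linear_model import LogisticRegression
from unifiedbooster.nonconformist import (
    ClassifierAdapter,
    ClassifierNc,
    IcpClassifier,
    MarginErrFunc,
)

# Inductive conformal classifier around any predict_proba-capable model
icp = IcpClassifier(
    ClassifierNc(ClassifierAdapter(LogisticRegression()), MarginErrFunc())
)
```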
--- a/unifiedbooster/nonconformist/base.py
+++ b/unifiedbooster/nonconformist/base.py
@@ -9,7 +9,7 @@ docstring
 import abc
 import numpy as np
 
-from sklearn.base import BaseEstimator
+from sklearn.base import BaseEstimator, RegressorMixin, ClassifierMixin
 
 
 class RegressorMixin(object):
@@ -102,15 +102,15 @@ class BaseModelAdapter(BaseEstimator):
         pass
 
 
-class ClassifierAdapter(BaseModelAdapter):
+class ClassifierAdapter(BaseModelAdapter, ClassifierMixin):
     def __init__(self, model, fit_params=None):
         super(ClassifierAdapter, self).__init__(model, fit_params)
 
     def _underlying_predict(self, x):
         return self.model.predict_proba(x)
+
 
-
-class RegressorAdapter(BaseModelAdapter):
+class RegressorAdapter(BaseModelAdapter, RegressorMixin):
     def __init__(self, model, fit_params=None):
         super(RegressorAdapter, self).__init__(model, fit_params)
 
--- /dev/null
+++ b/unifiedbooster/predictioninterval/predictioninterval.py
@@ -0,0 +1,314 @@
+from locale import normalize
+import numpy as np
+import pickle
+from collections import namedtuple
+from sklearn.base import BaseEstimator, RegressorMixin
+from sklearn.model_selection import train_test_split
+from sklearn.ensemble import ExtraTreesRegressor
+from sklearn.preprocessing import StandardScaler
+from sklearn.neighbors import KernelDensity
+from sklearn.model_selection import GridSearchCV
+from scipy.stats import gaussian_kde
+from tqdm import tqdm
+from ..nonconformist import IcpRegressor
+from ..nonconformist import RegressorNc
+from ..nonconformist import RegressorNormalizer, AbsErrorErrFunc
+
+
+class PredictionInterval(BaseEstimator, RegressorMixin):
+    """Class PredictionInterval: Obtain prediction intervals.
+
+    Attributes:
+
+        obj: an object;
+            fitted object containing methods `fit` and `predict`
+
+        method: a string;
+            method for constructing the prediction intervals.
+            Currently "splitconformal" (default) and "localconformal"
+
+        level: a float;
+            Confidence level for prediction intervals. Default is 95,
+            equivalent to a miscoverage error of 5 (%)
+
+        replications: an integer;
+            Number of replications for simulated conformal (default is `None`)
+
+        type_pi: a string;
+            type of prediction interval: currently "kde" (default) or "bootstrap"
+
+        type_split: a string;
+            "random" (random split of data) or "sequential" (sequential split of data)
+
+        seed: an integer;
+            Reproducibility of fit (there's a random split between fitting and calibration data)
+    """
+
+    def __init__(
+        self,
+        obj,
+        method="splitconformal",
+        level=95,
+        type_pi="bootstrap",
+        type_split="random",
+        replications=None,
+        kernel=None,
+        agg="mean",
+        seed=123,
+    ):
+
+        self.obj = obj
+        self.method = method
+        self.level = level
+        self.type_pi = type_pi
+        self.type_split = type_split
+        self.replications = replications
+        self.kernel = kernel
+        self.agg = agg
+        self.seed = seed
+        self.alpha_ = 1 - self.level / 100
+        self.quantile_ = None
+        self.icp_ = None
+        self.calibrated_residuals_ = None
+        self.scaled_calibrated_residuals_ = None
+        self.calibrated_residuals_scaler_ = None
+        self.kde_ = None
+
+    def fit(self, X, y):
+        """Fit the `method` to training data (X, y).
+
+        Args:
+
+            X: array-like, shape = [n_samples, n_features];
+                Training set vectors, where n_samples is the number
+                of samples and n_features is the number of features.
+
+            y: array-like, shape = [n_samples, ]; Target values.
+
+        """
+
+        if self.type_split == "random":
+            X_train, X_calibration, y_train, y_calibration = train_test_split(
+                X, y, test_size=0.5, random_state=self.seed
+            )
+        elif self.type_split == "sequential":
+            n_x = X.shape[0]
+            n_x_half = n_x // 2
+            first_half_idx = range(0, n_x_half)
+            second_half_idx = range(n_x_half, n_x)
+            X_train = X[first_half_idx, :]
+            X_calibration = X[second_half_idx, :]
+            y_train = y[first_half_idx]
+            y_calibration = y[second_half_idx]
+
+        if self.method == "splitconformal":
+
+            n_samples_calibration = X_calibration.shape[0]
+            self.obj.fit(X_train, y_train)
+            preds_calibration = self.obj.predict(X_calibration)
+            self.calibrated_residuals_ = y_calibration - preds_calibration
+            absolute_residuals = np.abs(self.calibrated_residuals_)
+            self.calibrated_residuals_scaler_ = StandardScaler(
+                with_mean=True, with_std=True
+            )
+            self.scaled_calibrated_residuals_ = (
+                self.calibrated_residuals_scaler_.fit_transform(
+                    self.calibrated_residuals_.reshape(-1, 1)
+                ).ravel()
+            )
+            try:
+                # numpy version >= 1.22
+                self.quantile_ = np.quantile(
+                    a=absolute_residuals, q=self.level / 100, method="higher"
+                )
+            except:
+                # numpy version < 1.22
+                self.quantile_ = np.quantile(
+                    a=absolute_residuals,
+                    q=self.level / 100,
+                    interpolation="higher",
+                )
+
+        if self.method == "localconformal":
+
+            mad_estimator = ExtraTreesRegressor()
+            normalizer = RegressorNormalizer(
+                self.obj, mad_estimator, AbsErrorErrFunc()
+            )
+            nc = RegressorNc(self.obj, AbsErrorErrFunc(), normalizer)
+            self.icp_ = IcpRegressor(nc)
+            self.icp_.fit(X_train, y_train)
+            self.icp_.calibrate(X_calibration, y_calibration)
+
+        return self
+
+    def predict(self, X, return_pi=False):
+        """Obtain predictions and prediction intervals
+
+        Args:
+
+            X: array-like, shape = [n_samples, n_features];
+                Testing set vectors, where n_samples is the number
+                of samples and n_features is the number of features.
+
+            return_pi: boolean
+                Whether the prediction interval is returned or not.
+                Default is False, for compatibility with other _estimators_.
+                If True, a tuple containing the predictions + lower and upper
+                bounds is returned.
+
+        """
+
+        pred = self.obj.predict(X)
+
+        if self.method == "splitconformal":
+
+            if self.replications is None:
+
+                if return_pi:
+
+                    DescribeResult = namedtuple(
+                        "DescribeResult", ("mean", "lower", "upper")
+                    )
+
+                    return DescribeResult(
+                        pred, pred - self.quantile_, pred + self.quantile_
+                    )
+
+                else:
+
+                    return pred
+
+            else:  # if self.replications is not None
+
+                assert self.type_pi in (
+                    "bootstrap",
+                    "kde",
+                ), "`self.type_pi` must be in ('bootstrap', 'kde')"
+
+                if self.type_pi == "bootstrap":
+                    np.random.seed(self.seed)
+                    self.residuals_sims_ = np.asarray(
+                        [
+                            np.random.choice(
+                                a=self.scaled_calibrated_residuals_,
+                                size=X.shape[0],
+                            )
+                            for _ in range(self.replications)
+                        ]
+                    ).T
+                    self.sims_ = np.asarray(
+                        [
+                            pred
+                            + self.calibrated_residuals_scaler_.scale_[0]
+                            * self.residuals_sims_[:, i].ravel()
+                            for i in range(self.replications)
+                        ]
+                    ).T
+                elif self.type_pi == "kde":
+                    self.kde_ = gaussian_kde(
+                        dataset=self.scaled_calibrated_residuals_
+                    )
+                    self.sims_ = np.asarray(
+                        [
+                            pred
+                            + self.calibrated_residuals_scaler_.scale_[0]
+                            * self.kde_.resample(
+                                size=X.shape[0], seed=self.seed + i
+                            ).ravel()
+                            for i in range(self.replications)
+                        ]
+                    ).T
+
+                self.mean_ = np.mean(self.sims_, axis=1)
+                self.lower_ = np.quantile(
+                    self.sims_, q=self.alpha_ / 200, axis=1
+                )
+                self.upper_ = np.quantile(
+                    self.sims_, q=1 - self.alpha_ / 200, axis=1
+                )
+
+                DescribeResult = namedtuple(
+                    "DescribeResult", ("mean", "sims", "lower", "upper")
+                )
+
+                return DescribeResult(
+                    self.mean_, self.sims_, self.lower_, self.upper_
+                )
+
+        if self.method == "localconformal":
+
+            if self.replications is None:
+
+                if return_pi:
+
+                    predictions_bounds = self.icp_.predict(
+                        X, significance=1 - self.level
+                    )
+                    DescribeResult = namedtuple(
+                        "DescribeResult", ("mean", "lower", "upper")
+                    )
+                    return DescribeResult(
+                        pred, predictions_bounds[:, 0], predictions_bounds[:, 1]
+                    )
+
+                else:
+
+                    return pred
+
+            else:  # if self.replications is not None
+
+                assert self.type_pi in (
+                    "bootstrap",
+                    "kde",
+                ), "`self.type_pi` must be in ('bootstrap', 'kde')"
+
+                if self.type_pi == "bootstrap":
+                    np.random.seed(self.seed)
+                    self.residuals_sims_ = np.asarray(
+                        [
+                            np.random.choice(
+                                a=self.scaled_calibrated_residuals_,
+                                size=X.shape[0],
+                            )
+                            for _ in range(self.replications)
+                        ]
+                    ).T
+                    self.sims_ = np.asarray(
+                        [
+                            pred
+                            + self.calibrated_residuals_scaler_.scale_[0]
+                            * self.residuals_sims_[:, i].ravel()
+                            for i in tqdm(range(self.replications))
+                        ]
+                    ).T
+                elif self.type_pi == "kde":
+                    self.kde_ = gaussian_kde(
+                        dataset=self.scaled_calibrated_residuals_
+                    )
+                    self.sims_ = np.asarray(
+                        [
+                            pred
+                            + self.calibrated_residuals_scaler_.scale_[0]
+                            * self.kde_.resample(
+                                size=X.shape[0], seed=self.seed + i
+                            ).ravel()
+                            for i in tqdm(range(self.replications))
+                        ]
+                    ).T
+
+                self.mean_ = np.mean(self.sims_, axis=1)
+                self.lower_ = np.quantile(
+                    self.sims_, q=self.alpha_ / 200, axis=1
+                )
+                self.upper_ = np.quantile(
+                    self.sims_, q=1 - self.alpha_ / 200, axis=1
+                )
+
+                DescribeResult = namedtuple(
+                    "DescribeResult", ("mean", "sims", "lower", "upper")
+                )
+
+                return DescribeResult(
+                    self.mean_, self.sims_, self.lower_, self.upper_
+                )
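A hedged sketch of using the new class directly; any estimator with `fit`/`predict` should work (sklearn's `GradientBoostingRegressor` is just an example here, not prescribed by the package):

```python
from sklearn.datasets import load_diabetes
from sklearn.ensemble import GradientBoostingRegressor
from unifiedbooster.predictioninterval import PredictionInterval

X, y = load_diabetes(return_X_y=True)
pi = PredictionInterval(GradientBoostingRegressor(),
                        method="splitconformal", level=95)
pi.fit(X, y)  # internally splits into fitting and calibration halves
res = pi.predict(X, return_pi=True)
print(res.mean[:3], res.lower[:3], res.upper[:3])
```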
--- /dev/null
+++ b/unifiedbooster/predictionset/predictionset.py
@@ -0,0 +1,111 @@
+from locale import normalize
+import numpy as np
+import pickle
+from collections import namedtuple
+from sklearn.base import BaseEstimator, ClassifierMixin
+from sklearn.model_selection import train_test_split
+from sklearn.ensemble import ExtraTreesRegressor
+from sklearn.preprocessing import StandardScaler
+from scipy.stats import gaussian_kde
+from tqdm import tqdm
+from ..nonconformist import ClassifierAdapter, IcpClassifier, TcpClassifier, ClassifierNc, MarginErrFunc
+
+
+class PredictionSet(BaseEstimator, ClassifierMixin):
+    """Class PredictionSet: Obtain prediction sets.
+
+    Attributes:
+
+        obj: an object;
+            fitted object containing methods `fit` and `predict`
+
+        method: a string;
+            method for constructing the prediction sets.
+            Currently "icp" (default, inductive conformal) and "tcp" (transductive conformal)
+
+        level: a float;
+            Confidence level for prediction sets. Default is None,
+            95 is equivalent to a miscoverage error of 5 (%)
+
+        seed: an integer;
+            Reproducibility of fit (there's a random split between fitting and calibration data)
+    """
+
+    def __init__(
+        self,
+        obj,
+        method="icp",
+        level=None,
+        seed=123,
+    ):
+
+        self.obj = obj
+        self.method = method
+        self.level = level
+        self.seed = seed
+        if self.level is not None:
+            self.alpha_ = 1 - self.level / 100
+        self.quantile_ = None
+        self.icp_ = None
+        self.tcp_ = None
+
+        if self.method == "icp":
+            self.icp_ = IcpClassifier(
+                ClassifierNc(ClassifierAdapter(self.obj), MarginErrFunc()),
+            )
+        elif self.method == "tcp":
+            self.tcp_ = TcpClassifier(
+                ClassifierNc(ClassifierAdapter(self.obj), MarginErrFunc()),
+            )
+        else:
+            raise ValueError(
+                "`self.method` must be in ('icp', 'tcp')"
+            )
+
+
+    def fit(self, X, y):
+        """Fit the `method` to training data (X, y).
+
+        Args:
+
+            X: array-like, shape = [n_samples, n_features];
+                Training set vectors, where n_samples is the number
+                of samples and n_features is the number of features.
+
+            y: array-like, shape = [n_samples, ]; Target values.
+
+        """
+        if self.method == "icp":
+
+            X_train, X_calibration, y_train, y_calibration = train_test_split(
+                X, y, test_size=0.5, random_state=self.seed)
+            self.icp_.fit(X_train, y_train)
+            self.icp_.calibrate(X_calibration, y_calibration)
+
+        elif self.method == "tcp":
+
+            self.tcp_.fit(X, y)
+
+        return self
+
+    def predict(self, X):
+        """Obtain predictions and prediction sets
+
+        Args:
+
+            X: array-like, shape = [n_samples, n_features];
+                Testing set vectors, where n_samples is the number
+                of samples and n_features is the number of features.
+
+        """
+
+        if self.method == "icp":
+            return self.icp_.predict(X, significance=self.alpha_)
+
+        elif self.method == "tcp":
+            return self.tcp_.predict(X, significance=self.alpha_)
+
+        else:
+            raise ValueError(
+                "`self.method` must be in ('icp', 'tcp')"
+            )
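Similarly, a hedged sketch of direct use; the wrapped classifier must expose `fit` and `predict_proba` (required by `ClassifierAdapter`), and `level` must be set, since `predict` passes `significance=self.alpha_`:

```python
from sklearn.datasets import load_iris
from sklearn.ensemble import GradientBoostingClassifier
from unifiedbooster.predictionset import PredictionSet

X, y = load_iris(return_X_y=True)
ps = PredictionSet(GradientBoostingClassifier(), method="icp", level=95)
ps.fit(X, y)          # "icp": half of the data is held out for calibration
sets = ps.predict(X)  # per the nonconformist convention, typically a boolean
                      # [n_samples, n_classes] array marking set membership
```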
--- a/unifiedbooster.egg-info/SOURCES.txt
+++ b/unifiedbooster.egg-info/SOURCES.txt
@@ -20,4 +20,8 @@ unifiedbooster/nonconformist/cp.py
 unifiedbooster/nonconformist/evaluation.py
 unifiedbooster/nonconformist/icp.py
 unifiedbooster/nonconformist/nc.py
-unifiedbooster/nonconformist/util.py
+unifiedbooster/nonconformist/util.py
+unifiedbooster/predictioninterval/__init__.py
+unifiedbooster/predictioninterval/predictioninterval.py
+unifiedbooster/predictionset/__init__.py
+unifiedbooster/predictionset/predictionset.py
All remaining files listed above are unchanged between 0.6.0 and 0.7.0.