unifiedbooster 0.6.0__tar.gz → 0.9.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (29)
  1. {unifiedbooster-0.6.0 → unifiedbooster-0.9.0}/PKG-INFO +13 -3
  2. {unifiedbooster-0.6.0 → unifiedbooster-0.9.0}/README.md +6 -1
  3. {unifiedbooster-0.6.0 → unifiedbooster-0.9.0}/setup.py +1 -3
  4. {unifiedbooster-0.6.0 → unifiedbooster-0.9.0}/unifiedbooster/gbdt.py +9 -4
  5. {unifiedbooster-0.6.0 → unifiedbooster-0.9.0}/unifiedbooster/gbdt_classification.py +51 -10
  6. {unifiedbooster-0.6.0 → unifiedbooster-0.9.0}/unifiedbooster/gbdt_regression.py +61 -9
  7. {unifiedbooster-0.6.0 → unifiedbooster-0.9.0}/unifiedbooster/gpoptimization.py +32 -25
  8. {unifiedbooster-0.6.0 → unifiedbooster-0.9.0}/unifiedbooster/nonconformist/__init__.py +7 -1
  9. {unifiedbooster-0.6.0 → unifiedbooster-0.9.0}/unifiedbooster/nonconformist/base.py +3 -3
  10. unifiedbooster-0.9.0/unifiedbooster/predictioninterval/__init__.py +3 -0
  11. unifiedbooster-0.9.0/unifiedbooster/predictioninterval/predictioninterval.py +314 -0
  12. unifiedbooster-0.9.0/unifiedbooster/predictionset/__init__.py +3 -0
  13. unifiedbooster-0.9.0/unifiedbooster/predictionset/predictionset.py +113 -0
  14. {unifiedbooster-0.6.0 → unifiedbooster-0.9.0}/unifiedbooster.egg-info/PKG-INFO +13 -3
  15. {unifiedbooster-0.6.0 → unifiedbooster-0.9.0}/unifiedbooster.egg-info/SOURCES.txt +5 -1
  16. {unifiedbooster-0.6.0 → unifiedbooster-0.9.0}/unifiedbooster.egg-info/requires.txt +0 -1
  17. {unifiedbooster-0.6.0 → unifiedbooster-0.9.0}/LICENSE +0 -0
  18. {unifiedbooster-0.6.0 → unifiedbooster-0.9.0}/setup.cfg +0 -0
  19. {unifiedbooster-0.6.0 → unifiedbooster-0.9.0}/unifiedbooster/__init__.py +0 -0
  20. {unifiedbooster-0.6.0 → unifiedbooster-0.9.0}/unifiedbooster/nonconformist/acp.py +0 -0
  21. {unifiedbooster-0.6.0 → unifiedbooster-0.9.0}/unifiedbooster/nonconformist/cp.py +0 -0
  22. {unifiedbooster-0.6.0 → unifiedbooster-0.9.0}/unifiedbooster/nonconformist/evaluation.py +0 -0
  23. {unifiedbooster-0.6.0 → unifiedbooster-0.9.0}/unifiedbooster/nonconformist/icp.py +0 -0
  24. {unifiedbooster-0.6.0 → unifiedbooster-0.9.0}/unifiedbooster/nonconformist/nc.py +0 -0
  25. {unifiedbooster-0.6.0 → unifiedbooster-0.9.0}/unifiedbooster/nonconformist/util.py +0 -0
  26. {unifiedbooster-0.6.0 → unifiedbooster-0.9.0}/unifiedbooster.egg-info/dependency_links.txt +0 -0
  27. {unifiedbooster-0.6.0 → unifiedbooster-0.9.0}/unifiedbooster.egg-info/entry_points.txt +0 -0
  28. {unifiedbooster-0.6.0 → unifiedbooster-0.9.0}/unifiedbooster.egg-info/not-zip-safe +0 -0
  29. {unifiedbooster-0.6.0 → unifiedbooster-0.9.0}/unifiedbooster.egg-info/top_level.txt +0 -0

{unifiedbooster-0.6.0 → unifiedbooster-0.9.0}/PKG-INFO
@@ -1,6 +1,6 @@
- Metadata-Version: 2.1
+ Metadata-Version: 2.4
  Name: unifiedbooster
- Version: 0.6.0
+ Version: 0.9.0
  Summary: Unified interface for Gradient Boosted Decision Trees
  Home-page: https://github.com/thierrymoudiki/unifiedbooster
  Author: T. Moudiki
@@ -22,8 +22,18 @@ Requires-Dist: numpy
  Requires-Dist: scikit-learn
  Requires-Dist: xgboost
  Requires-Dist: lightgbm
- Requires-Dist: catboost
  Requires-Dist: GPopt
  Requires-Dist: nnetsauce
+ Dynamic: author
+ Dynamic: author-email
+ Dynamic: classifier
+ Dynamic: description
+ Dynamic: home-page
+ Dynamic: keywords
+ Dynamic: license
+ Dynamic: license-file
+ Dynamic: requires-dist
+ Dynamic: requires-python
+ Dynamic: summary

  Unified interface for Gradient Boosted Decision Trees

{unifiedbooster-0.6.0 → unifiedbooster-0.9.0}/README.md
@@ -7,6 +7,11 @@ Unified interface for Gradient Boosted Decision Trees algorithms

  ## Examples

+ See also:
+ - Auto XGBoost, Auto LightGBM, Auto CatBoost, Auto GradientBoosting: https://thierrymoudiki.github.io/blog/2024/08/05/python/r/unibooster
+ - Prediction sets and prediction intervals for conformalized Auto XGBoost, Auto LightGBM, Auto CatBoost, Auto GradientBoosting: https://thierrymoudiki.github.io/blog/2024/09/02/python/r/conformalized-unibooster
+ - Notebooks in [/unifiedbooster/demo](/unifiedbooster/demo)
+
  ### classification

  ```python
@@ -90,4 +95,4 @@ for dataset in datasets:
  print(f"Regression Root Mean Squared Error xgboost: {mse1:.2f}")
  print(f"Regression Root Mean Squared Error catboost: {mse2:.2f}")
  print(f"Regression Root Mean Squared Error lightgbm: {mse3:.2f}")
- ```
+ ```

{unifiedbooster-0.6.0 → unifiedbooster-0.9.0}/setup.py
@@ -7,9 +7,7 @@ from setuptools import setup, find_packages
  from codecs import open
  from os import path

- subprocess.check_call(['pip', 'install', 'Cython'])
-
- __version__ = "0.6.0"
+ __version__ = "0.9.0"

  here = path.abspath(path.dirname(__file__))


{unifiedbooster-0.6.0 → unifiedbooster-0.9.0}/unifiedbooster/gbdt.py
@@ -44,6 +44,8 @@ class GBDT(BaseEstimator):
          max_depth=3,
          rowsample=1.0,
          colsample=1.0,
+         level=None,
+         pi_method=None,
          verbose=0,
          seed=123,
          **kwargs
@@ -55,6 +57,8 @@ class GBDT(BaseEstimator):
          self.max_depth = max_depth
          self.rowsample = rowsample
          self.colsample = colsample
+         self.level = level
+         self.pi_method = pi_method
          self.verbose = verbose
          self.seed = seed

@@ -91,7 +95,7 @@ class GBDT(BaseEstimator):
              "verbose": self.verbose,
              "random_seed": self.seed,
              "boosting_type": "Plain",
-             "leaf_estimation_iterations": 1,
+             "leaf_estimation_iterations": 1,
              "bootstrap_type": "Bernoulli",
              **kwargs,
          }
@@ -126,7 +130,6 @@ class GBDT(BaseEstimator):

          self: object
          """
-
          if getattr(self, "type_fit") == "classification":
              self.classes_ = np.unique(y) # for compatibility with sklearn
              self.n_classes_ = len(
@@ -152,5 +155,7 @@

          model predictions: {array-like}
          """
-
-         return getattr(self, "model").predict(X)
+         if self.level is not None and self.type_fit == "regression":
+             return getattr(self, "model").predict(X, return_pi=True)
+         else:
+             return getattr(self, "model").predict(X)
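
The hunk above changes what `predict` returns for conformalized regressors. A minimal sketch of the resulting contract, assuming `GBDTRegressor` is exported at the package top level as in the README examples:

```python
import unifiedbooster as ub
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split

X, y = load_diabetes(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=123)

# Without `level`, predict() returns a plain array, as in 0.6.0
reg = ub.GBDTRegressor(model_type="gradientboosting")
point_preds = reg.fit(X_train, y_train).predict(X_test)

# With `level`, the base class forwards return_pi=True to the wrapped model,
# so predict() returns a namedtuple (mean, lower, upper)
reg_pi = ub.GBDTRegressor(model_type="gradientboosting", level=95)
mean, lower, upper = reg_pi.fit(X_train, y_train).predict(X_test)
```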

{unifiedbooster-0.6.0 → unifiedbooster-0.9.0}/unifiedbooster/gbdt_classification.py
@@ -1,5 +1,6 @@
  from .gbdt import GBDT
  from sklearn.base import ClassifierMixin
+ from .predictionset import PredictionSet

  try:
      from xgboost import XGBClassifier
@@ -40,6 +41,12 @@ class GBDTClassifier(GBDT, ClassifierMixin):
      colsample: float
          percentage of features to use at each node split

+     level: float
+         confidence level for prediction sets
+
+     pi_method: str
+         method for constructing the prediction intervals: 'icp' (inductive conformal), 'tcp' (transductive conformal)
+
      verbose: int
          controls verbosity (default=0)

@@ -97,6 +104,8 @@ class GBDTClassifier(GBDT, ClassifierMixin):
          max_depth=3,
          rowsample=1.0,
          colsample=1.0,
+         level=None,
+         pi_method="icp",
          verbose=0,
          seed=123,
          **kwargs,
@@ -111,21 +120,54 @@ class GBDTClassifier(GBDT, ClassifierMixin):
              max_depth=max_depth,
              rowsample=rowsample,
              colsample=colsample,
+             level=level,
+             pi_method=pi_method,
              verbose=verbose,
              seed=seed,
              **kwargs,
          )

-         if model_type == "xgboost":
-             self.model = XGBClassifier(**self.params)
-         elif model_type == "catboost":
-             self.model = CatBoostClassifier(**self.params)
-         elif model_type == "lightgbm":
-             self.model = LGBMClassifier(**self.params)
-         elif model_type == "gradientboosting":
-             self.model = GradientBoostingClassifier(**self.params)
+         if self.level is not None:
+
+             if model_type in ("xgboost", "xgb"):
+                 self.model = PredictionSet(
+                     XGBClassifier(**self.params),
+                     level=self.level,
+                     method=self.pi_method,
+                 )
+             elif model_type in ("catboost", "cb"):
+                 self.model = PredictionSet(
+                     CatBoostClassifier(**self.params),
+                     level=self.level,
+                     method=self.pi_method,
+                 )
+             elif model_type in ("lightgbm", "lgb"):
+                 self.model = PredictionSet(
+                     LGBMClassifier(**self.params),
+                     level=self.level,
+                     method=self.pi_method,
+                 )
+             elif model_type in ("gradientboosting", "gb"):
+                 self.model = PredictionSet(
+                     GradientBoostingClassifier(**self.params),
+                     level=self.level,
+                     method=self.pi_method,
+                 )
+             else:
+                 raise ValueError(f"Unknown model_type: {model_type}")
+
          else:
-             raise ValueError(f"Unknown model_type: {model_type}")
+
+             if model_type in ("xgboost", "xgb"):
+                 self.model = XGBClassifier(**self.params)
+             elif model_type in ("catboost", "cb"):
+                 self.model = CatBoostClassifier(**self.params)
+             elif model_type in ("lightgbm", "lgb"):
+                 self.model = LGBMClassifier(**self.params)
+             elif model_type in ("gradientboosting", "gb"):
+                 self.model = GradientBoostingClassifier(**self.params)
+             else:
+                 raise ValueError(f"Unknown model_type: {model_type}")

      def predict_proba(self, X):
          """Predict probabilities for test data X.
@@ -143,5 +185,4 @@ class GBDTClassifier(GBDT, ClassifierMixin):

          probability estimates for test data: {array-like}
          """
-
          return self.model.predict_proba(X)
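
In other words, passing `level` now routes the chosen booster through `PredictionSet` instead of using it directly. A hedged usage sketch; the top-level export and the boolean set encoding (one row per sample, per `nonconformist` conventions) are assumptions:

```python
import unifiedbooster as ub
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split

X, y = load_wine(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=123)

# level=95 wraps LGBMClassifier in PredictionSet (inductive conformal by default)
clf = ub.GBDTClassifier(model_type="lightgbm", level=95, pi_method="icp")
clf.fit(X_train, y_train)
sets = clf.predict(X_test)  # flags the labels kept in each 95% prediction set
```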

{unifiedbooster-0.6.0 → unifiedbooster-0.9.0}/unifiedbooster/gbdt_regression.py
@@ -1,5 +1,6 @@
  from .gbdt import GBDT
  from sklearn.base import RegressorMixin
+ from .predictioninterval import PredictionInterval

  try:
      from xgboost import XGBRegressor
@@ -40,6 +41,16 @@ class GBDTRegressor(GBDT, RegressorMixin):
      colsample: float
          percentage of features to use at each node split

+     level: float
+         confidence level for prediction sets
+
+     pi_method: str
+         method for constructing the prediction intervals: 'splitconformal', 'localconformal'
+
+     type_split: a string;
+         Only if `level` is not `None`
+         "random" (random split of data) or "sequential" (sequential split of data)
+
      verbose: int
          controls verbosity (default=0)

@@ -97,12 +108,16 @@ class GBDTRegressor(GBDT, RegressorMixin):
          max_depth=3,
          rowsample=1.0,
          colsample=1.0,
+         level=None,
+         pi_method="splitconformal",
+         type_split="random",
          verbose=0,
          seed=123,
          **kwargs,
      ):

          self.type_fit = "regression"
+         self.type_split = type_split

          super().__init__(
              model_type=model_type,
@@ -111,18 +126,55 @@ class GBDTRegressor(GBDT, RegressorMixin):
              max_depth=max_depth,
              rowsample=rowsample,
              colsample=colsample,
+             level=level,
+             pi_method=pi_method,
              verbose=verbose,
              seed=seed,
              **kwargs,
          )

-         if model_type == "xgboost":
-             self.model = XGBRegressor(**self.params)
-         elif model_type == "catboost":
-             self.model = CatBoostRegressor(**self.params)
-         elif model_type == "lightgbm":
-             self.model = LGBMRegressor(**self.params)
-         elif model_type == "gradientboosting":
-             self.model = GradientBoostingRegressor(**self.params)
+         if self.level is not None:
+
+             if model_type in ("xgboost", "xgb"):
+                 self.model = PredictionInterval(
+                     XGBRegressor(**self.params),
+                     level=self.level,
+                     method=self.pi_method,
+                     type_split=self.type_split
+                 )
+             elif model_type in ("catboost", "cb"):
+                 self.model = PredictionInterval(
+                     CatBoostRegressor(**self.params),
+                     level=self.level,
+                     method=self.pi_method,
+                     type_split=self.type_split
+                 )
+             elif model_type in ("lightgbm", "lgb"):
+                 self.model = PredictionInterval(
+                     LGBMRegressor(**self.params),
+                     level=self.level,
+                     method=self.pi_method,
+                     type_split=self.type_split
+                 )
+             elif model_type in ("gradientboosting", "gb"):
+                 self.model = PredictionInterval(
+                     GradientBoostingRegressor(**self.params),
+                     level=self.level,
+                     method=self.pi_method,
+                     type_split=self.type_split
+                 )
+             else:
+                 raise ValueError(f"Unknown model_type: {model_type}")
+
          else:
-             raise ValueError(f"Unknown model_type: {model_type}")
+
+             if model_type in ("xgboost", "xgb"):
+                 self.model = XGBRegressor(**self.params)
+             elif model_type in ("catboost", "cb"):
+                 self.model = CatBoostRegressor(**self.params)
+             elif model_type in ("lightgbm", "lgb"):
+                 self.model = LGBMRegressor(**self.params)
+             elif model_type in ("gradientboosting", "gb"):
+                 self.model = GradientBoostingRegressor(**self.params)
+             else:
+                 raise ValueError(f"Unknown model_type: {model_type}")
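
Besides the conformal wrapper, this hunk also introduces the short aliases `xgb`, `cb`, `lgb` and `gb` for `model_type`. A sketch of the new options (top-level export assumed):

```python
import unifiedbooster as ub
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split

X, y = load_diabetes(return_X_y=True)
# shuffle=False keeps the order, to pair with type_split="sequential" below
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=False)

reg = ub.GBDTRegressor(
    model_type="xgb",            # new alias for "xgboost"
    level=95,                    # 95% prediction intervals
    pi_method="splitconformal",  # or "localconformal"
    type_split="sequential",     # split fit/calibration data in order
)
res = reg.fit(X_train, y_train).predict(X_test)
print(res.lower[:5], res.upper[:5])
```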

{unifiedbooster-0.6.0 → unifiedbooster-0.9.0}/unifiedbooster/gpoptimization.py
@@ -168,7 +168,8 @@ def cross_val_optim(
      ).mean()

      # objective function for hyperparams tuning
-     if n_estimators is not None:
+     if n_estimators is not None:
+
          def crossval_objective(xx):
              return gbdt_cv(
                  X_train=X_train,
@@ -185,25 +186,27 @@ def cross_val_optim(
                  scoring=scoring,
                  seed=seed,
              )
-     else: # n_estimators is None
+
+     else: # n_estimators is None
+
          def crossval_objective(xx):
              return gbdt_cv(
-             X_train=X_train,
-             y_train=y_train,
-             model_type=model_type,
-             n_estimators=int(10 ** xx[4]),
-             learning_rate=10 ** xx[0],
-             max_depth=int(xx[1]),
-             rowsample=xx[2],
-             colsample=xx[3],
-             cv=cv,
-             n_jobs=n_jobs,
-             type_fit=type_fit,
-             scoring=scoring,
-             seed=seed,
-             )
+                 X_train=X_train,
+                 y_train=y_train,
+                 model_type=model_type,
+                 n_estimators=int(10 ** xx[4]),
+                 learning_rate=10 ** xx[0],
+                 max_depth=int(xx[1]),
+                 rowsample=xx[2],
+                 colsample=xx[3],
+                 cv=cv,
+                 n_jobs=n_jobs,
+                 type_fit=type_fit,
+                 scoring=scoring,
+                 seed=seed,
+             )

-     if n_estimators is not None:
+     if n_estimators is not None:
          if surrogate_obj is None:
              gp_opt = gp.GPOpt(
                  objective_func=crossval_objective,
@@ -240,7 +243,7 @@ def cross_val_optim(
                  n_iter=n_iter,
                  seed=seed,
              )
-     else: # n_estimators is None
+     else: # n_estimators is None
          if surrogate_obj is None:
              gp_opt = gp.GPOpt(
                  objective_func=crossval_objective,
@@ -251,7 +254,7 @@ def cross_val_optim(
                      "max_depth",
                      "rowsample",
                      "colsample",
-                     "n_estimators"
+                     "n_estimators",
                  ],
                  method="bayesian",
                  n_init=n_init,
@@ -268,7 +271,7 @@ def cross_val_optim(
                      "max_depth",
                      "rowsample",
                      "colsample",
-                     "n_estimators"
+                     "n_estimators",
                  ],
                  acquisition="ucb",
                  method="splitconformal",
@@ -282,7 +285,11 @@ def cross_val_optim(

      res = gp_opt.optimize(verbose=verbose, abs_tol=abs_tol)
      res.best_params["model_type"] = model_type
-     res.best_params["n_estimators"] = int(n_estimators) if n_estimators is not None else int(10 ** res.best_params["n_estimators"])
+     res.best_params["n_estimators"] = (
+         int(n_estimators)
+         if n_estimators is not None
+         else int(10 ** res.best_params["n_estimators"])
+     )
      res.best_params["learning_rate"] = 10 ** res.best_params["learning_rate"]
      res.best_params["max_depth"] = int(res.best_params["max_depth"])
      res.best_params["rowsample"] = res.best_params["rowsample"]
@@ -355,7 +362,7 @@ def lazy_cross_val_optim(

      customize: boolean
          if True, the surrogate is transformed into a quasi-randomized network (default is False)
-
+
      n_estimators: int
          maximum number of trees that can be built (default is None, if None, the parameters is tuned)

@@ -383,7 +390,7 @@ def lazy_cross_val_optim(
      Examples:

      ```python
-     import os
+     import os
      import unifiedbooster as ub
      from sklearn.datasets import load_breast_cancer
      from sklearn.model_selection import train_test_split
@@ -454,7 +461,7 @@ def lazy_cross_val_optim(
          if customize == True:
              print(f"\n surrogate: CustomRegressor({est[0]})")
              surr_obj = ns.CustomRegressor(obj=est[1]())
-         else:
+         else:
              print(f"\n surrogate: {est[0]}")
              surr_obj = est[1]()
          res = cross_val_optim(
@@ -479,7 +486,7 @@ def lazy_cross_val_optim(
          if customize == True:
              results.append((f"CustomRegressor({est[0]})", res))
          else:
-             results.append((est[0], res))
+             results.append((est[0], res))
      except:
          pass

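The hunks above are mostly black-style reformatting, but they also confirm `cross_val_optim`'s tuning behavior: when `n_estimators` is `None`, it joins the search space on a log scale (`10 ** xx[4]`) and is back-transformed in `best_params`. A hypothetical call, using only parameter names visible in the hunks; the `ub.cross_val_optim` export path is an assumption:

```python
import unifiedbooster as ub
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

X, y = load_breast_cancer(return_X_y=True)
X_train, _, y_train, _ = train_test_split(X, y, random_state=123)

res = ub.cross_val_optim(
    X_train=X_train,
    y_train=y_train,
    model_type="xgboost",
    n_estimators=None,  # None -> tuned as 10**xx[4], then back-transformed
    cv=5,
    type_fit="classification",
    scoring="accuracy",
    seed=123,
)
print(res.best_params)  # learning_rate and n_estimators already back-transformed
```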

{unifiedbooster-0.6.0 → unifiedbooster-0.9.0}/unifiedbooster/nonconformist/__init__.py
@@ -18,13 +18,19 @@ from .nc import (
  )
  from .cp import IcpRegressor, TcpClassifier
  from .icp import IcpClassifier
- from .base import RegressorAdapter
+ from .nc import ClassifierNc, MarginErrFunc
+ from .base import RegressorAdapter, ClassifierAdapter

  __all__ = [
      "AbsErrorErrFunc",
+     "MarginErrFunc",
      "QuantileRegErrFunc",
      "RegressorAdapter",
+     "ClassifierAdapter",
      "RegressorNc",
+     "ClassifierNc",
      "RegressorNormalizer",
      "IcpRegressor",
+     "IcpClassifier",
+     "TcpClassifier",
  ]

{unifiedbooster-0.6.0 → unifiedbooster-0.9.0}/unifiedbooster/nonconformist/base.py
@@ -9,7 +9,7 @@ docstring
  import abc
  import numpy as np

- from sklearn.base import BaseEstimator
+ from sklearn.base import BaseEstimator, RegressorMixin, ClassifierMixin


  class RegressorMixin(object):
@@ -102,7 +102,7 @@ class BaseModelAdapter(BaseEstimator):
          pass


- class ClassifierAdapter(BaseModelAdapter):
+ class ClassifierAdapter(BaseModelAdapter, ClassifierMixin):
      def __init__(self, model, fit_params=None):
          super(ClassifierAdapter, self).__init__(model, fit_params)

@@ -110,7 +110,7 @@ class ClassifierAdapter(BaseModelAdapter):
          return self.model.predict_proba(x)


- class RegressorAdapter(BaseModelAdapter):
+ class RegressorAdapter(BaseModelAdapter, RegressorMixin):
      def __init__(self, model, fit_params=None):
          super(RegressorAdapter, self).__init__(model, fit_params)

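These adapter changes serve the new classification path: `PredictionSet` (further down in this diff) builds its conformal predictor from exactly these pieces. A minimal sketch of the pattern, lifted from `predictionset.py` below, with `LogisticRegression` as a stand-in classifier:

```python
from sklearn.linear_model import LogisticRegression
from unifiedbooster.nonconformist import (
    ClassifierAdapter,
    ClassifierNc,
    IcpClassifier,
    MarginErrFunc,
)

# Wrap any sklearn-style classifier, attach a margin-based nonconformity
# score, and build an inductive conformal classifier from it
nc = ClassifierNc(ClassifierAdapter(LogisticRegression()), MarginErrFunc())
icp = IcpClassifier(nc)
```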

unifiedbooster-0.9.0/unifiedbooster/predictioninterval/__init__.py
@@ -0,0 +1,3 @@
+ from .predictioninterval import PredictionInterval
+
+ __all__ = ["PredictionInterval"]

unifiedbooster-0.9.0/unifiedbooster/predictioninterval/predictioninterval.py
@@ -0,0 +1,314 @@
+ from locale import normalize
+ import numpy as np
+ import pickle
+ from collections import namedtuple
+ from sklearn.base import BaseEstimator, RegressorMixin
+ from sklearn.model_selection import train_test_split
+ from sklearn.ensemble import ExtraTreesRegressor
+ from sklearn.preprocessing import StandardScaler
+ from sklearn.neighbors import KernelDensity
+ from sklearn.model_selection import GridSearchCV
+ from scipy.stats import gaussian_kde
+ from tqdm import tqdm
+ from ..nonconformist import IcpRegressor
+ from ..nonconformist import RegressorNc
+ from ..nonconformist import RegressorNormalizer, AbsErrorErrFunc
+
+
+ class PredictionInterval(BaseEstimator, RegressorMixin):
+     """Class PredictionInterval: Obtain prediction intervals.
+
+     Attributes:
+
+         obj: an object;
+             fitted object containing methods `fit` and `predict`
+
+         method: a string;
+             method for constructing the prediction intervals.
+             Currently "splitconformal" (default) and "localconformal"
+
+         level: a float;
+             Confidence level for prediction intervals. Default is 95,
+             equivalent to a miscoverage error of 5 (%)
+
+         replications: an integer;
+             Number of replications for simulated conformal (default is `None`)
+
+         type_pi: a string;
+             type of prediction interval: currently "kde" (default) or "bootstrap"
+
+         type_split: a string;
+             "random" (random split of data) or "sequential" (sequential split of data)
+
+         seed: an integer;
+             Reproducibility of fit (there's a random split between fitting and calibration data)
+     """
+
+     def __init__(
+         self,
+         obj,
+         method="splitconformal",
+         level=95,
+         type_pi="bootstrap",
+         type_split="random",
+         replications=None,
+         kernel=None,
+         agg="mean",
+         seed=123,
+     ):
+
+         self.obj = obj
+         self.method = method
+         self.level = level
+         self.type_pi = type_pi
+         self.type_split = type_split
+         self.replications = replications
+         self.kernel = kernel
+         self.agg = agg
+         self.seed = seed
+         self.alpha_ = 1 - self.level / 100
+         self.quantile_ = None
+         self.icp_ = None
+         self.calibrated_residuals_ = None
+         self.scaled_calibrated_residuals_ = None
+         self.calibrated_residuals_scaler_ = None
+         self.kde_ = None
+
+     def fit(self, X, y):
+         """Fit the `method` to training data (X, y).
+
+         Args:
+
+             X: array-like, shape = [n_samples, n_features];
+                 Training set vectors, where n_samples is the number
+                 of samples and n_features is the number of features.
+
+             y: array-like, shape = [n_samples, ]; Target values.
+
+         """
+
+         if self.type_split == "random":
+             X_train, X_calibration, y_train, y_calibration = train_test_split(
+                 X, y, test_size=0.5, random_state=self.seed
+             )
+         elif self.type_split == "sequential":
+             n_x = X.shape[0]
+             n_x_half = n_x // 2
+             first_half_idx = range(0, n_x_half)
+             second_half_idx = range(n_x_half, n_x)
+             X_train = X[first_half_idx, :]
+             X_calibration = X[second_half_idx, :]
+             y_train = y[first_half_idx]
+             y_calibration = y[second_half_idx]
+
+         if self.method == "splitconformal":
+
+             n_samples_calibration = X_calibration.shape[0]
+             self.obj.fit(X_train, y_train)
+             preds_calibration = self.obj.predict(X_calibration)
+             self.calibrated_residuals_ = y_calibration - preds_calibration
+             absolute_residuals = np.abs(self.calibrated_residuals_)
+             self.calibrated_residuals_scaler_ = StandardScaler(
+                 with_mean=True, with_std=True
+             )
+             self.scaled_calibrated_residuals_ = (
+                 self.calibrated_residuals_scaler_.fit_transform(
+                     self.calibrated_residuals_.reshape(-1, 1)
+                 ).ravel()
+             )
+             try:
+                 # numpy version >= 1.22
+                 self.quantile_ = np.quantile(
+                     a=absolute_residuals, q=self.level / 100, method="higher"
+                 )
+             except:
+                 # numpy version < 1.22
+                 self.quantile_ = np.quantile(
+                     a=absolute_residuals,
+                     q=self.level / 100,
+                     interpolation="higher",
+                 )
+
+         if self.method == "localconformal":
+
+             mad_estimator = ExtraTreesRegressor()
+             normalizer = RegressorNormalizer(
+                 self.obj, mad_estimator, AbsErrorErrFunc()
+             )
+             nc = RegressorNc(self.obj, AbsErrorErrFunc(), normalizer)
+             self.icp_ = IcpRegressor(nc)
+             self.icp_.fit(X_train, y_train)
+             self.icp_.calibrate(X_calibration, y_calibration)
+
+         return self
+
+     def predict(self, X, return_pi=False):
+         """Obtain predictions and prediction intervals
+
+         Args:
+
+             X: array-like, shape = [n_samples, n_features];
+                 Testing set vectors, where n_samples is the number
+                 of samples and n_features is the number of features.
+
+             return_pi: boolean
+                 Whether the prediction interval is returned or not.
+                 Default is False, for compatibility with other _estimators_.
+                 If True, a tuple containing the predictions + lower and upper
+                 bounds is returned.
+
+         """
+
+         pred = self.obj.predict(X)
+
+         if self.method == "splitconformal":
+
+             if self.replications is None:
+
+                 if return_pi:
+
+                     DescribeResult = namedtuple(
+                         "DescribeResult", ("mean", "lower", "upper")
+                     )
+
+                     return DescribeResult(
+                         pred, pred - self.quantile_, pred + self.quantile_
+                     )
+
+                 else:
+
+                     return pred
+
+             else: # if self.replications is not None
+
+                 assert self.type_pi in (
+                     "bootstrap",
+                     "kde",
+                 ), "`self.type_pi` must be in ('bootstrap', 'kde')"
+
+                 if self.type_pi == "bootstrap":
+                     np.random.seed(self.seed)
+                     self.residuals_sims_ = np.asarray(
+                         [
+                             np.random.choice(
+                                 a=self.scaled_calibrated_residuals_,
+                                 size=X.shape[0],
+                             )
+                             for _ in range(self.replications)
+                         ]
+                     ).T
+                     self.sims_ = np.asarray(
+                         [
+                             pred
+                             + self.calibrated_residuals_scaler_.scale_[0]
+                             * self.residuals_sims_[:, i].ravel()
+                             for i in range(self.replications)
+                         ]
+                     ).T
+                 elif self.type_pi == "kde":
+                     self.kde_ = gaussian_kde(
+                         dataset=self.scaled_calibrated_residuals_
+                     )
+                     self.sims_ = np.asarray(
+                         [
+                             pred
+                             + self.calibrated_residuals_scaler_.scale_[0]
+                             * self.kde_.resample(
+                                 size=X.shape[0], seed=self.seed + i
+                             ).ravel()
+                             for i in range(self.replications)
+                         ]
+                     ).T
+
+                 self.mean_ = np.mean(self.sims_, axis=1)
+                 self.lower_ = np.quantile(
+                     self.sims_, q=self.alpha_ / 200, axis=1
+                 )
+                 self.upper_ = np.quantile(
+                     self.sims_, q=1 - self.alpha_ / 200, axis=1
+                 )
+
+                 DescribeResult = namedtuple(
+                     "DescribeResult", ("mean", "sims", "lower", "upper")
+                 )
+
+                 return DescribeResult(
+                     self.mean_, self.sims_, self.lower_, self.upper_
+                 )
+
+         if self.method == "localconformal":
+
+             if self.replications is None:
+
+                 if return_pi:
+
+                     predictions_bounds = self.icp_.predict(
+                         X, significance=1 - self.level
+                     )
+                     DescribeResult = namedtuple(
+                         "DescribeResult", ("mean", "lower", "upper")
+                     )
+                     return DescribeResult(
+                         pred, predictions_bounds[:, 0], predictions_bounds[:, 1]
+                     )
+
+                 else:
+
+                     return pred
+
+             else: # if self.replications is not None
+
+                 assert self.type_pi in (
+                     "bootstrap",
+                     "kde",
+                 ), "`self.type_pi` must be in ('bootstrap', 'kde')"
+
+                 if self.type_pi == "bootstrap":
+                     np.random.seed(self.seed)
+                     self.residuals_sims_ = np.asarray(
+                         [
+                             np.random.choice(
+                                 a=self.scaled_calibrated_residuals_,
+                                 size=X.shape[0],
+                             )
+                             for _ in range(self.replications)
+                         ]
+                     ).T
+                     self.sims_ = np.asarray(
+                         [
+                             pred
+                             + self.calibrated_residuals_scaler_.scale_[0]
+                             * self.residuals_sims_[:, i].ravel()
+                             for i in tqdm(range(self.replications))
+                         ]
+                     ).T
+                 elif self.type_pi == "kde":
+                     self.kde_ = gaussian_kde(
+                         dataset=self.scaled_calibrated_residuals_
+                     )
+                     self.sims_ = np.asarray(
+                         [
+                             pred
+                             + self.calibrated_residuals_scaler_.scale_[0]
+                             * self.kde_.resample(
+                                 size=X.shape[0], seed=self.seed + i
+                             ).ravel()
+                             for i in tqdm(range(self.replications))
+                         ]
+                     ).T
+
+                 self.mean_ = np.mean(self.sims_, axis=1)
+                 self.lower_ = np.quantile(
+                     self.sims_, q=self.alpha_ / 200, axis=1
+                 )
+                 self.upper_ = np.quantile(
+                     self.sims_, q=1 - self.alpha_ / 200, axis=1
+                 )
+
+                 DescribeResult = namedtuple(
+                     "DescribeResult", ("mean", "sims", "lower", "upper")
+                 )
+
+                 return DescribeResult(
+                     self.mean_, self.sims_, self.lower_, self.upper_
+                 )
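
A usage sketch for the new class, following its own docstring and `fit`/`predict` code above (the import path mirrors the SOURCES.txt entries further down):

```python
from sklearn.datasets import load_diabetes
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from unifiedbooster.predictioninterval import PredictionInterval

X, y = load_diabetes(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=123)

pi = PredictionInterval(
    obj=GradientBoostingRegressor(),
    method="splitconformal",
    level=95,
)
pi.fit(X_train, y_train)  # fits on one half of the data, calibrates on the other
mean, lower, upper = pi.predict(X_test, return_pi=True)
```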

unifiedbooster-0.9.0/unifiedbooster/predictionset/__init__.py
@@ -0,0 +1,3 @@
+ from .predictionset import PredictionSet
+
+ __all__ = ["PredictionSet"]

unifiedbooster-0.9.0/unifiedbooster/predictionset/predictionset.py
@@ -0,0 +1,113 @@
+ from locale import normalize
+ import numpy as np
+ import pickle
+ from collections import namedtuple
+ from sklearn.base import BaseEstimator, ClassifierMixin
+ from sklearn.model_selection import train_test_split
+ from sklearn.ensemble import ExtraTreesRegressor
+ from sklearn.preprocessing import StandardScaler
+ from scipy.stats import gaussian_kde
+ from tqdm import tqdm
+ from ..nonconformist import (
+     ClassifierAdapter,
+     IcpClassifier,
+     TcpClassifier,
+     ClassifierNc,
+     MarginErrFunc,
+ )
+
+
+ class PredictionSet(BaseEstimator, ClassifierMixin):
+     """Class PredictionSet: Obtain prediction sets.
+
+     Attributes:
+
+         obj: an object;
+             fitted object containing methods `fit` and `predict`
+
+         method: a string;
+             method for constructing the prediction sets.
+             Currently "icp" (default, inductive conformal) and "tcp" (transductive conformal)
+
+         level: a float;
+             Confidence level for prediction sets. Default is None,
+             95 is equivalent to a miscoverage error of 5 (%)
+
+         seed: an integer;
+             Reproducibility of fit (there's a random split between fitting and calibration data)
+     """
+
+     def __init__(
+         self,
+         obj,
+         method="icp",
+         level=None,
+         seed=123,
+     ):
+
+         self.obj = obj
+         self.method = method
+         self.level = level
+         self.seed = seed
+         if self.level is not None:
+             self.alpha_ = 1 - self.level / 100
+         self.quantile_ = None
+         self.icp_ = None
+         self.tcp_ = None
+
+         if self.method == "icp":
+             self.icp_ = IcpClassifier(
+                 ClassifierNc(ClassifierAdapter(self.obj), MarginErrFunc()),
+             )
+         elif self.method == "tcp":
+             self.tcp_ = TcpClassifier(
+                 ClassifierNc(ClassifierAdapter(self.obj), MarginErrFunc()),
+             )
+         else:
+             raise ValueError("`self.method` must be in ('icp', 'tcp')")
+
+     def fit(self, X, y):
+         """Fit the `method` to training data (X, y).
+
+         Args:
+
+             X: array-like, shape = [n_samples, n_features];
+                 Training set vectors, where n_samples is the number
+                 of samples and n_features is the number of features.
+
+             y: array-like, shape = [n_samples, ]; Target values.
+
+         """
+         if self.method == "icp":
+
+             X_train, X_calibration, y_train, y_calibration = train_test_split(
+                 X, y, test_size=0.5, random_state=self.seed
+             )
+             self.icp_.fit(X_train, y_train)
+             self.icp_.calibrate(X_calibration, y_calibration)
+
+         elif self.method == "tcp":
+
+             self.tcp_.fit(X, y)
+
+         return self
+
+     def predict(self, X):
+         """Obtain predictions and prediction sets
+
+         Args:
+
+             X: array-like, shape = [n_samples, n_features];
+                 Testing set vectors, where n_samples is the number
+                 of samples and n_features is the number of features.
+
+         """
+
+         if self.method == "icp":
+             return self.icp_.predict(X, significance=self.alpha_)
+
+         elif self.method == "tcp":
+             return self.tcp_.predict(X, significance=self.alpha_)
+
+         else:
+             raise ValueError("`self.method` must be in ('icp', 'tcp')")
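
And the matching sketch for prediction sets; per `nonconformist` conventions, `predict` should return one row per sample flagging the labels retained at the requested confidence level (an assumption about the encoding, not stated in this diff):

```python
import numpy as np
from sklearn.datasets import load_iris
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from unifiedbooster.predictionset import PredictionSet

X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=123)

ps = PredictionSet(GradientBoostingClassifier(), method="icp", level=95)
ps.fit(X_train, y_train)  # icp: half the data for fitting, half for calibration
sets = ps.predict(X_test)
print(np.mean(np.sum(sets, axis=1)))  # average prediction-set size
```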

{unifiedbooster-0.6.0 → unifiedbooster-0.9.0}/unifiedbooster.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
- Metadata-Version: 2.1
+ Metadata-Version: 2.4
  Name: unifiedbooster
- Version: 0.6.0
+ Version: 0.9.0
  Summary: Unified interface for Gradient Boosted Decision Trees
  Home-page: https://github.com/thierrymoudiki/unifiedbooster
  Author: T. Moudiki
@@ -22,8 +22,18 @@ Requires-Dist: numpy
  Requires-Dist: scikit-learn
  Requires-Dist: xgboost
  Requires-Dist: lightgbm
- Requires-Dist: catboost
  Requires-Dist: GPopt
  Requires-Dist: nnetsauce
+ Dynamic: author
+ Dynamic: author-email
+ Dynamic: classifier
+ Dynamic: description
+ Dynamic: home-page
+ Dynamic: keywords
+ Dynamic: license
+ Dynamic: license-file
+ Dynamic: requires-dist
+ Dynamic: requires-python
+ Dynamic: summary

  Unified interface for Gradient Boosted Decision Trees

{unifiedbooster-0.6.0 → unifiedbooster-0.9.0}/unifiedbooster.egg-info/SOURCES.txt
@@ -20,4 +20,8 @@ unifiedbooster/nonconformist/cp.py
  unifiedbooster/nonconformist/evaluation.py
  unifiedbooster/nonconformist/icp.py
  unifiedbooster/nonconformist/nc.py
- unifiedbooster/nonconformist/util.py
+ unifiedbooster/nonconformist/util.py
+ unifiedbooster/predictioninterval/__init__.py
+ unifiedbooster/predictioninterval/predictioninterval.py
+ unifiedbooster/predictionset/__init__.py
+ unifiedbooster/predictionset/predictionset.py

{unifiedbooster-0.6.0 → unifiedbooster-0.9.0}/unifiedbooster.egg-info/requires.txt
@@ -3,6 +3,5 @@ numpy
  scikit-learn
  xgboost
  lightgbm
- catboost
  GPopt
  nnetsauce