unifiedbooster 0.3.0__py3-none-any.whl → 0.4.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,5 +1,12 @@
1
1
  from .gbdt import GBDT
2
2
  from .gbdt_classification import GBDTClassifier
3
3
  from .gbdt_regression import GBDTRegressor
4
+ from .gpoptimization import cross_val_optim, lazy_cross_val_optim
4
5
 
5
- __all__ = ["GBDT", "GBDTClassifier", "GBDTRegressor"]
6
+ __all__ = [
7
+ "GBDT",
8
+ "GBDTClassifier",
9
+ "GBDTRegressor",
10
+ "cross_val_optim",
11
+ "lazy_cross_val_optim",
12
+ ]
unifiedbooster/gbdt.py CHANGED
@@ -17,6 +17,9 @@ class GBDT(BaseEstimator):
17
17
  learning_rate: float
18
18
  shrinkage rate; used for reducing the gradient step
19
19
 
20
+ max_depth: int
21
+ maximum tree depth
22
+
20
23
  rowsample: float
21
24
  subsample ratio of the training instances
22
25
 
@@ -87,7 +90,7 @@ class GBDT(BaseEstimator):
87
90
  "depth": self.max_depth,
88
91
  "verbose": self.verbose,
89
92
  "random_seed": self.seed,
90
- "bootstrap_type": "Bernoulli",
93
+ "bootstrap_type": "MVS",
91
94
  **kwargs,
92
95
  }
93
96
  elif self.model_type == "gradientboosting":
@@ -1,12 +1,18 @@
1
1
  from .gbdt import GBDT
2
2
  from sklearn.base import ClassifierMixin
3
- from xgboost import XGBClassifier
4
3
 
4
+ try:
5
+ from xgboost import XGBClassifier
6
+ except:
7
+ pass
5
8
  try:
6
9
  from catboost import CatBoostClassifier
7
10
  except:
8
- print("catboost package can't be built")
9
- from lightgbm import LGBMClassifier
11
+ pass
12
+ try:
13
+ from lightgbm import LGBMClassifier
14
+ except:
15
+ pass
10
16
  from sklearn.ensemble import GradientBoostingClassifier
11
17
 
12
18
 
@@ -25,6 +31,9 @@ class GBDTClassifier(GBDT, ClassifierMixin):
25
31
  learning_rate: float
26
32
  shrinkage rate; used for reducing the gradient step
27
33
 
34
+ max_depth: int
35
+ maximum tree depth
36
+
28
37
  rowsample: float
29
38
  subsample ratio of the training instances
30
39
 
@@ -1,12 +1,18 @@
1
1
  from .gbdt import GBDT
2
2
  from sklearn.base import RegressorMixin
3
- from xgboost import XGBRegressor
4
3
 
4
+ try:
5
+ from xgboost import XGBRegressor
6
+ except:
7
+ pass
5
8
  try:
6
9
  from catboost import CatBoostRegressor
7
10
  except:
8
- print("catboost package can't be built")
9
- from lightgbm import LGBMRegressor
11
+ pass
12
+ try:
13
+ from lightgbm import LGBMRegressor
14
+ except:
15
+ pass
10
16
  from sklearn.ensemble import GradientBoostingRegressor
11
17
 
12
18
 
@@ -25,6 +31,9 @@ class GBDTRegressor(GBDT, RegressorMixin):
25
31
  learning_rate: float
26
32
  shrinkage rate; used for reducing the gradient step
27
33
 
34
+ max_depth: int
35
+ maximum tree depth
36
+
28
37
  rowsample: float
29
38
  subsample ratio of the training instances
30
39
 
@@ -0,0 +1,422 @@
1
+ import GPopt as gp
2
+ import nnetsauce as ns
3
+ import numpy as np
4
+ from collections import namedtuple
5
+ from .gbdt_classification import GBDTClassifier
6
+ from .gbdt_regression import GBDTRegressor
7
+ from sklearn.model_selection import cross_val_score
8
+ from sklearn.base import ClassifierMixin, RegressorMixin
9
+ from sklearn.utils import all_estimators
10
+ from sklearn import metrics
11
+
12
+
13
def cross_val_optim(
    X_train,
    y_train,
    X_test=None,
    y_test=None,
    model_type="xgboost",
    type_fit="classification",
    scoring="accuracy",
    n_estimators=100,
    surrogate_obj=None,
    cv=5,
    n_jobs=None,
    n_init=10,
    n_iter=190,
    abs_tol=1e-3,
    verbose=2,
    seed=123,
):
    """Cross-validation function and hyperparameters' search

    Tunes `learning_rate` (searched on a log10 scale), `max_depth`,
    `rowsample` and `colsample` of a `GBDTClassifier`/`GBDTRegressor` by
    optimizing the negated mean cross-validation score with `GPopt`.

    Parameters:

        X_train: array-like,
            Training vectors, where rows is the number of samples
            and columns is the number of features.

        y_train: array-like,
            Training target values, one per row of `X_train`.

        X_test: array-like,
            Testing vectors, where rows is the number of samples
            and columns is the number of features.

        y_test: array-like,
            Testing target values, one per row of `X_test`.

        model_type: str
            type of gradient boosting algorithm: 'xgboost', 'lightgbm',
            'catboost', 'gradientboosting'

        type_fit: str
            "regression" or "classification"

        scoring: str
            scoring metric; see https://scikit-learn.org/stable/modules/model_evaluation.html#the-scoring-parameter-defining-model-evaluation-rules

        n_estimators: int
            maximum number of trees that can be built

        surrogate_obj: an object;
            An ML model for estimating the uncertainty around the objective function

        cv: int;
            number of cross-validation folds

        n_jobs: int;
            number of jobs for parallel execution

        n_init: an integer;
            number of points in the initial setting, when `x_init` and `y_init` are not provided

        n_iter: an integer;
            number of iterations of the minimization algorithm

        abs_tol: a float;
            tolerance for convergence of the optimizer (early stopping based on acquisition function)

        verbose: int
            controls verbosity

        seed: int
            reproducibility seed

    Returns:

        The object returned by `GPOpt.optimize`, whose `best_params` dict is
        completed with `model_type` and `n_estimators`, and has
        `learning_rate` mapped back from log10 scale and `max_depth` cast to
        int. When both `X_test` and `y_test` are given, a namedtuple with the
        same fields plus `test_<scoring>` (score of the refitted best model
        on the test set) is returned instead.

    Raises:

        ValueError: if `type_fit` is neither "regression" nor "classification".

    Examples:

    ```python
    import unifiedbooster as ub
    from sklearn.datasets import load_breast_cancer
    from sklearn.model_selection import train_test_split

    dataset = load_breast_cancer()
    X, y = dataset.data, dataset.target
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    res1 = ub.cross_val_optim(
        X_train,
        y_train,
        X_test=None,
        y_test=None,
        model_type="lightgbm",
        type_fit="classification",
        scoring="accuracy",
        n_estimators=100,
        surrogate_obj=None,
        cv=5,
        n_jobs=None,
        n_init=10,
        n_iter=190,
        abs_tol=1e-3,
        verbose=2,
        seed=123,
    )
    print(res1)
    ```
    """
    # Fail fast: an unknown task type would otherwise only surface as an
    # unbound local variable deep inside the optimizer's objective calls.
    if type_fit not in ("regression", "classification"):
        raise ValueError(
            "type_fit must be 'regression' or 'classification', "
            f"got {type_fit!r}"
        )

    estimator_class = (
        GBDTRegressor if type_fit == "regression" else GBDTClassifier
    )

    # objective function for hyperparams tuning
    def crossval_objective(xx):
        # xx = [log10(learning_rate), max_depth, rowsample, colsample]
        estimator = estimator_class(
            model_type=model_type,
            n_estimators=n_estimators,
            learning_rate=10 ** xx[0],
            max_depth=int(xx[1]),
            rowsample=xx[2],
            colsample=xx[3],
            verbose=0,
            seed=seed,
        )
        # Negated so that a better (higher) CV score gives a lower objective
        # value; the optimizer is described as a minimization algorithm.
        return -cross_val_score(
            estimator,
            X_train,
            y_train,
            scoring=scoring,
            cv=cv,
            n_jobs=n_jobs,
            verbose=0,
        ).mean()

    # Settings shared by both optimizer flavours.
    gp_kwargs = dict(
        objective_func=crossval_objective,
        lower_bound=np.array([-6, 1, 0.5, 0.5]),
        upper_bound=np.array([0, 16, 1.0, 1.0]),
        params_names=[
            "learning_rate",
            "max_depth",
            "rowsample",
            "colsample",
        ],
        method="bayesian",
        n_init=n_init,
        n_iter=n_iter,
        seed=seed,
    )
    if surrogate_obj is not None:
        # A user-supplied surrogate is wrapped into a split-conformal
        # prediction interval and used with the "ucb" acquisition.
        gp_kwargs.update(
            acquisition="ucb",
            method="splitconformal",
            surrogate_obj=ns.PredictionInterval(
                obj=surrogate_obj, method="splitconformal"
            ),
        )
    gp_opt = gp.GPOpt(**gp_kwargs)

    res = gp_opt.optimize(verbose=verbose, abs_tol=abs_tol)
    res.best_params["model_type"] = model_type
    res.best_params["n_estimators"] = int(n_estimators)
    # The search ran on log10(learning_rate); report the actual rate.
    res.best_params["learning_rate"] = 10 ** res.best_params["learning_rate"]
    res.best_params["max_depth"] = int(res.best_params["max_depth"])

    if X_test is None or y_test is None:
        return res

    # out-of-sample error
    estimator = estimator_class(**res.best_params, verbose=0, seed=seed)
    fitted = estimator.fit(X_train, y_train)
    metric_fn = getattr(metrics, scoring + "_score", None)
    if metric_fn is not None:
        oos_err = metric_fn(y_true=y_test, y_pred=fitted.predict(X_test))
    else:
        # Scorer names without a same-named `<scoring>_score` function
        # (e.g. "neg_mean_squared_error") are resolved through scikit-learn's
        # scorer registry, as cross_val_score does for the same string.
        oos_err = metrics.get_scorer(scoring)(fitted, X_test, y_test)
    result = namedtuple("result", res._fields + ("test_" + scoring,))
    return result(*res, oos_err)
248
+
249
+
250
def lazy_cross_val_optim(
    X_train,
    y_train,
    X_test=None,
    y_test=None,
    model_type="xgboost",
    type_fit="classification",
    scoring="accuracy",
    customize=False,
    n_estimators=100,
    cv=5,
    n_jobs=None,
    n_init=10,
    n_iter=190,
    abs_tol=1e-3,
    verbose=1,
    seed=123,
):
    """Automated Cross-validation function and hyperparameters' search using multiple surrogates

    Runs `cross_val_optim` once per scikit-learn regressor (minus an
    exclusion list), using each regressor in turn as the surrogate model.
    Surrogates that raise an exception are skipped.

    Parameters:

        X_train: array-like,
            Training vectors, where rows is the number of samples
            and columns is the number of features.

        y_train: array-like,
            Training target values, one per row of `X_train`.

        X_test: array-like,
            Testing vectors, where rows is the number of samples
            and columns is the number of features.

        y_test: array-like,
            Testing target values, one per row of `X_test`.

        model_type: str
            type of gradient boosting algorithm: 'xgboost', 'lightgbm',
            'catboost', 'gradientboosting'

        type_fit: str
            "regression" or "classification"

        scoring: str
            scoring metric; see https://scikit-learn.org/stable/modules/model_evaluation.html#the-scoring-parameter-defining-model-evaluation-rules

        customize: boolean
            if True, the surrogate is transformed into a quasi-randomized network (default is False)

        n_estimators: int
            maximum number of trees that can be built

        cv: int;
            number of cross-validation folds

        n_jobs: int;
            number of jobs for parallel execution

        n_init: an integer;
            number of points in the initial setting, when `x_init` and `y_init` are not provided

        n_iter: an integer;
            number of iterations of the minimization algorithm

        abs_tol: a float;
            tolerance for convergence of the optimizer (early stopping based on acquisition function)

        verbose: int
            controls verbosity

        seed: int
            reproducibility seed

    Returns:

        A list of `(surrogate_name, result)` tuples, one per surrogate that
        completed, where `result` is the value returned by `cross_val_optim`.

    Examples:

    ```python
    import unifiedbooster as ub
    from sklearn.datasets import load_breast_cancer
    from sklearn.model_selection import train_test_split

    dataset = load_breast_cancer()
    X, y = dataset.data, dataset.target
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    res1 = ub.lazy_cross_val_optim(
        X_train,
        y_train,
        X_test=X_test,
        y_test=y_test,
        model_type="lightgbm",
        type_fit="classification",
        scoring="accuracy",
        customize=False,
        n_estimators=100,
        cv=5,
        n_jobs=None,
        n_init=10,
        n_iter=190,
        abs_tol=1e-3,
        verbose=1,
        seed=123,
    )
    print(res1)
    ```
    """

    # scikit-learn regressors that are never tried as surrogates
    removed_regressors = {
        "TheilSenRegressor",
        "ARDRegression",
        "CCA",
        "GaussianProcessRegressor",
        "GradientBoostingRegressor",
        "HistGradientBoostingRegressor",
        "IsotonicRegression",
        "MultiOutputRegressor",
        "MultiTaskElasticNet",
        "MultiTaskElasticNetCV",
        "MultiTaskLasso",
        "MultiTaskLassoCV",
        "OrthogonalMatchingPursuit",
        "OrthogonalMatchingPursuitCV",
        "PLSCanonical",
        "PLSRegression",
        "RadiusNeighborsRegressor",
        "RegressorChain",
        "StackingRegressor",
        "VotingRegressor",
    }

    results = []

    for name, est_class in all_estimators():
        if not issubclass(est_class, RegressorMixin):
            continue
        if name in removed_regressors:
            continue

        label = f"CustomRegressor({name})" if customize else name
        try:
            print(f"\n surrogate: {label}")
            # Instantiation stays inside the try: an estimator that cannot be
            # built with default arguments is skipped like any other failure.
            surr_obj = est_class()
            if customize:
                surr_obj = ns.CustomRegressor(obj=surr_obj)
            res = cross_val_optim(
                X_train=X_train,
                y_train=y_train,
                X_test=X_test,
                y_test=y_test,
                model_type=model_type,
                n_estimators=n_estimators,
                surrogate_obj=surr_obj,
                cv=cv,
                n_jobs=n_jobs,
                type_fit=type_fit,
                scoring=scoring,
                n_init=n_init,
                n_iter=n_iter,
                abs_tol=abs_tol,
                verbose=verbose,
                seed=seed,
            )
            print(f"\n result: {res}")
            results.append((label, res))
        except Exception as exc:
            # Best effort: one failing surrogate must not abort the sweep.
            # Only `Exception` is caught so Ctrl-C / SystemExit still stop
            # this potentially very long loop.
            if verbose:
                print(f"\n surrogate {label} skipped: {exc!r}")

    return results
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: unifiedbooster
3
- Version: 0.3.0
3
+ Version: 0.4.2
4
4
  Summary: Unified interface for Gradient Boosted Decision Trees
5
5
  Home-page: https://github.com/thierrymoudiki/unifiedbooster
6
6
  Author: T. Moudiki
@@ -24,5 +24,6 @@ Requires-Dist: xgboost
24
24
  Requires-Dist: lightgbm
25
25
  Requires-Dist: catboost
26
26
  Requires-Dist: GPopt
27
+ Requires-Dist: nnetsauce
27
28
 
28
29
  Unified interface for Gradient Boosted Decision Trees
@@ -0,0 +1,11 @@
1
+ unifiedbooster/__init__.py,sha256=8FEkWCZ2tT8xcW46Z0X_BS9_r0kQWVAu37IncLq6QWU,301
2
+ unifiedbooster/gbdt.py,sha256=1qVdOeoEyBxxbJ7HBHZegGJo2d2onXs73o8_JntOtN8,4819
3
+ unifiedbooster/gbdt_classification.py,sha256=RLoM_lCmvEDrpNLRFlEzwKBA2oc0mkYUVKLFOTYAPrs,4099
4
+ unifiedbooster/gbdt_regression.py,sha256=Eavj3mV5Lsjpx-d03GLgT8GrwEYuBmBEWkUyDPcJu_g,3590
5
+ unifiedbooster/gpoptimization.py,sha256=xomHqQHu1wvG2wDdmErY8fYgB39pmNMo0-IvJdwEpoM,12606
6
+ unifiedbooster-0.4.2.dist-info/LICENSE,sha256=3rWw63btcdqbC0XMnpzCQhxDP8Vx7yKkKS7EDgJiY_4,1061
7
+ unifiedbooster-0.4.2.dist-info/METADATA,sha256=FiWDX64O41lbiNDL406XjArYUcnoIKKAZjNdxkzbHGo,955
8
+ unifiedbooster-0.4.2.dist-info/WHEEL,sha256=R0nc6qTxuoLk7ShA2_Y-UWkN8ZdfDBG2B6Eqpz2WXbs,91
9
+ unifiedbooster-0.4.2.dist-info/entry_points.txt,sha256=OVNTsCzMYnaJ11WIByB7G8Lym_dj-ERKZyQxWFUcW30,59
10
+ unifiedbooster-0.4.2.dist-info/top_level.txt,sha256=gOMxxpRtx8_nJXTWsXJDFkNeCsjSJQPs6aUXKK5_nI4,15
11
+ unifiedbooster-0.4.2.dist-info/RECORD,,
@@ -1,10 +0,0 @@
1
- unifiedbooster/__init__.py,sha256=3d8wQVXaeVIxqtk_STM6nvIGZiGTxKn9aAWjuwiDYuo,169
2
- unifiedbooster/gbdt.py,sha256=QCcWfXYfrOXdiSeygPEvVMjg9fVNjRaOnW9KsHK6bvo,4770
3
- unifiedbooster/gbdt_classification.py,sha256=UqZEOjDp_2hSm4jCxVoqz8vNQ-8JRW4Xn5CjFqPqRF4,4028
4
- unifiedbooster/gbdt_regression.py,sha256=ZNX5RJF-Wk2KJpOUD-lgNnqruDHZpzSTxdKeayv6iw0,3519
5
- unifiedbooster-0.3.0.dist-info/LICENSE,sha256=3rWw63btcdqbC0XMnpzCQhxDP8Vx7yKkKS7EDgJiY_4,1061
6
- unifiedbooster-0.3.0.dist-info/METADATA,sha256=E0drgIWtoGZNF1lkxrj_zlbMxq8QmOPIW4iDY_GPKm0,930
7
- unifiedbooster-0.3.0.dist-info/WHEEL,sha256=R0nc6qTxuoLk7ShA2_Y-UWkN8ZdfDBG2B6Eqpz2WXbs,91
8
- unifiedbooster-0.3.0.dist-info/entry_points.txt,sha256=OVNTsCzMYnaJ11WIByB7G8Lym_dj-ERKZyQxWFUcW30,59
9
- unifiedbooster-0.3.0.dist-info/top_level.txt,sha256=gOMxxpRtx8_nJXTWsXJDFkNeCsjSJQPs6aUXKK5_nI4,15
10
- unifiedbooster-0.3.0.dist-info/RECORD,,