unifiedbooster-0.9.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,493 @@
+ import GPopt as gp
+ import nnetsauce as ns
+ import numpy as np
+ from collections import namedtuple
+ from .gbdt_classification import GBDTClassifier
+ from .gbdt_regression import GBDTRegressor
+ from sklearn.model_selection import cross_val_score
+ from sklearn.base import RegressorMixin
+ from sklearn.utils import all_estimators
+ from sklearn import metrics
+
+
+ def cross_val_optim(
+     X_train,
+     y_train,
+     X_test=None,
+     y_test=None,
+     model_type="xgboost",
+     type_fit="classification",
+     scoring="accuracy",
+     n_estimators=None,
+     surrogate_obj=None,
+     cv=5,
+     n_jobs=None,
+     n_init=10,
+     n_iter=190,
+     abs_tol=1e-3,
+     verbose=2,
+     seed=123,
+ ):
+     """Cross-validation function and hyperparameter search for a GBDT model
+
+     Parameters:
+
+         X_train: array-like,
+             Training vectors, where rows is the number of samples
+             and columns is the number of features.
+
+         y_train: array-like,
+             Training target values, of length n_samples.
+
+         X_test: array-like,
+             Testing vectors, where rows is the number of samples
+             and columns is the number of features.
+
+         y_test: array-like,
+             Testing target values, of length n_samples.
+
+         model_type: str
+             type of gradient boosting algorithm: 'xgboost', 'lightgbm',
+             'catboost', 'gradientboosting'
+
+         type_fit: str
+             "regression" or "classification"
+
+         scoring: str
+             scoring metric; see https://scikit-learn.org/stable/modules/model_evaluation.html#the-scoring-parameter-defining-model-evaluation-rules
+
+         n_estimators: int
+             maximum number of trees that can be built (default is None;
+             if None, this parameter is tuned along with the others)
+
+         surrogate_obj: an object;
+             an ML model for estimating the uncertainty around the objective function
+
+         cv: int;
+             number of cross-validation folds
+
+         n_jobs: int;
+             number of jobs for parallel execution
+
+         n_init: an integer;
+             number of points in the initial setting, when `x_init` and `y_init` are not provided
+
+         n_iter: an integer;
+             number of iterations of the minimization algorithm
+
+         abs_tol: a float;
+             tolerance for convergence of the optimizer (early stopping based on the acquisition function)
+
+         verbose: int
+             controls verbosity
+
+         seed: int
+             reproducibility seed
+
+     Returns:
+
+         the optimization result, a namedtuple containing in particular
+         `best_params`; if `X_test` and `y_test` are provided, an extra
+         `test_<scoring>` field contains the out-of-sample score
+
+     Examples:
+
+         ```python
+         import unifiedbooster as ub
+         from sklearn.datasets import load_breast_cancer
+         from sklearn.model_selection import train_test_split
+
+         dataset = load_breast_cancer()
+         X, y = dataset.data, dataset.target
+         X_train, X_test, y_train, y_test = train_test_split(
+             X, y, test_size=0.2, random_state=42
+         )
+
+         res1 = ub.cross_val_optim(
+             X_train,
+             y_train,
+             X_test=None,
+             y_test=None,
+             model_type="lightgbm",
+             type_fit="classification",
+             scoring="accuracy",
+             n_estimators=100,
+             surrogate_obj=None,
+             cv=5,
+             n_jobs=None,
+             n_init=10,
+             n_iter=190,
+             abs_tol=1e-3,
+             verbose=2,
+             seed=123,
+         )
+         print(res1)
+         ```
+     """
+
+     def gbdt_cv(
+         X_train,
+         y_train,
+         model_type="xgboost",
+         n_estimators=100,
+         learning_rate=0.1,
+         max_depth=3,
+         rowsample=1.0,
+         colsample=1.0,
+         cv=5,
+         n_jobs=None,
+         type_fit="classification",
+         scoring="accuracy",
+         seed=123,
+     ):
+         # evaluate one GBDT configuration by cross-validation; returns the
+         # negative mean score so that the optimizer can minimize it
+         if type_fit == "regression":
+             estimator = GBDTRegressor(
+                 model_type=model_type,
+                 n_estimators=n_estimators,
+                 learning_rate=learning_rate,
+                 max_depth=max_depth,
+                 rowsample=rowsample,
+                 colsample=colsample,
+                 verbose=0,
+                 seed=seed,
+             )
+         elif type_fit == "classification":
+             estimator = GBDTClassifier(
+                 model_type=model_type,
+                 n_estimators=n_estimators,
+                 learning_rate=learning_rate,
+                 max_depth=max_depth,
+                 rowsample=rowsample,
+                 colsample=colsample,
+                 verbose=0,
+                 seed=seed,
+             )
+         else:
+             raise ValueError(
+                 "type_fit must be 'regression' or 'classification'"
+             )
+         return -cross_val_score(
+             estimator,
+             X_train,
+             y_train,
+             scoring=scoring,
+             cv=cv,
+             n_jobs=n_jobs,
+             verbose=0,
+         ).mean()
+
+     # objective function for hyperparameter tuning; encoding of the
+     # search vector xx:
+     #   xx[0] = log10(learning_rate), xx[1] = max_depth,
+     #   xx[2] = rowsample, xx[3] = colsample,
+     #   xx[4] = log10(n_estimators) (only when n_estimators is tuned)
+     if n_estimators is not None:
+
+         def crossval_objective(xx):
+             return gbdt_cv(
+                 X_train=X_train,
+                 y_train=y_train,
+                 model_type=model_type,
+                 n_estimators=n_estimators,
+                 learning_rate=10 ** xx[0],
+                 max_depth=int(xx[1]),
+                 rowsample=xx[2],
+                 colsample=xx[3],
+                 cv=cv,
+                 n_jobs=n_jobs,
+                 type_fit=type_fit,
+                 scoring=scoring,
+                 seed=seed,
+             )
+
+     else:  # n_estimators is None: tune it too
+
+         def crossval_objective(xx):
+             return gbdt_cv(
+                 X_train=X_train,
+                 y_train=y_train,
+                 model_type=model_type,
+                 n_estimators=int(10 ** xx[4]),
+                 learning_rate=10 ** xx[0],
+                 max_depth=int(xx[1]),
+                 rowsample=xx[2],
+                 colsample=xx[3],
+                 cv=cv,
+                 n_jobs=n_jobs,
+                 type_fit=type_fit,
+                 scoring=scoring,
+                 seed=seed,
+             )
+
+     # search space: learning_rate in [1e-6, 1] (log10 scale), max_depth in
+     # [1, 16], rowsample and colsample in [0.5, 1.0]; when n_estimators is
+     # tuned, it ranges over [1e2, 1e3] (log10 scale)
+     if n_estimators is not None:
+         if surrogate_obj is None:
+             gp_opt = gp.GPOpt(
+                 objective_func=crossval_objective,
+                 lower_bound=np.array([-6, 1, 0.5, 0.5]),
+                 upper_bound=np.array([0, 16, 1.0, 1.0]),
+                 params_names=[
+                     "learning_rate",
+                     "max_depth",
+                     "rowsample",
+                     "colsample",
+                 ],
+                 method="bayesian",
+                 n_init=n_init,
+                 n_iter=n_iter,
+                 seed=seed,
+             )
+         else:
+             gp_opt = gp.GPOpt(
+                 objective_func=crossval_objective,
+                 lower_bound=np.array([-6, 1, 0.5, 0.5]),
+                 upper_bound=np.array([0, 16, 1.0, 1.0]),
+                 params_names=[
+                     "learning_rate",
+                     "max_depth",
+                     "rowsample",
+                     "colsample",
+                 ],
+                 acquisition="ucb",
+                 method="splitconformal",
+                 surrogate_obj=ns.PredictionInterval(
+                     obj=surrogate_obj, method="splitconformal"
+                 ),
+                 n_init=n_init,
+                 n_iter=n_iter,
+                 seed=seed,
+             )
+     else:  # n_estimators is None
+         if surrogate_obj is None:
+             gp_opt = gp.GPOpt(
+                 objective_func=crossval_objective,
+                 lower_bound=np.array([-6, 1, 0.5, 0.5, 2]),
+                 upper_bound=np.array([0, 16, 1.0, 1.0, 3]),
+                 params_names=[
+                     "learning_rate",
+                     "max_depth",
+                     "rowsample",
+                     "colsample",
+                     "n_estimators",
+                 ],
+                 method="bayesian",
+                 n_init=n_init,
+                 n_iter=n_iter,
+                 seed=seed,
+             )
+         else:
+             gp_opt = gp.GPOpt(
+                 objective_func=crossval_objective,
+                 lower_bound=np.array([-6, 1, 0.5, 0.5, 2]),
+                 upper_bound=np.array([0, 16, 1.0, 1.0, 3]),
+                 params_names=[
+                     "learning_rate",
+                     "max_depth",
+                     "rowsample",
+                     "colsample",
+                     "n_estimators",
+                 ],
+                 acquisition="ucb",
+                 method="splitconformal",
+                 surrogate_obj=ns.PredictionInterval(
+                     obj=surrogate_obj, method="splitconformal"
+                 ),
+                 n_init=n_init,
+                 n_iter=n_iter,
+                 seed=seed,
+             )
+
+     res = gp_opt.optimize(verbose=verbose, abs_tol=abs_tol)
+     # map the optimizer's encoded parameters back to GBDT hyperparameters;
+     # rowsample and colsample are already on their natural scale
+     res.best_params["model_type"] = model_type
+     res.best_params["n_estimators"] = (
+         int(n_estimators)
+         if n_estimators is not None
+         else int(10 ** res.best_params["n_estimators"])
+     )
+     res.best_params["learning_rate"] = 10 ** res.best_params["learning_rate"]
+     res.best_params["max_depth"] = int(res.best_params["max_depth"])
+
+     # out-of-sample error
+     if X_test is not None and y_test is not None:
+         if type_fit == "regression":
+             estimator = GBDTRegressor(**res.best_params, verbose=0, seed=seed)
+         elif type_fit == "classification":
+             estimator = GBDTClassifier(**res.best_params, verbose=0, seed=seed)
+         preds = estimator.fit(X_train, y_train).predict(X_test)
+         # score the refitted model on the held-out test set
+         oos_err = getattr(metrics, scoring + "_score")(
+             y_true=y_test, y_pred=preds
+         )
+         result = namedtuple("result", res._fields + ("test_" + scoring,))
+         return result(*res, oos_err)
+     else:
+         return res
+
+
+ def lazy_cross_val_optim(
+     X_train,
+     y_train,
+     X_test=None,
+     y_test=None,
+     model_type="xgboost",
+     type_fit="classification",
+     scoring="accuracy",
+     customize=False,
+     n_estimators=None,
+     cv=5,
+     n_jobs=None,
+     n_init=10,
+     n_iter=190,
+     abs_tol=1e-3,
+     verbose=1,
+     seed=123,
+ ):
+     """Automated cross-validation and hyperparameter search using multiple surrogates
+
+     Parameters:
+
+         X_train: array-like,
+             Training vectors, where rows is the number of samples
+             and columns is the number of features.
+
+         y_train: array-like,
+             Training target values, of length n_samples.
+
+         X_test: array-like,
+             Testing vectors, where rows is the number of samples
+             and columns is the number of features.
+
+         y_test: array-like,
+             Testing target values, of length n_samples.
+
+         model_type: str
+             type of gradient boosting algorithm: 'xgboost', 'lightgbm',
+             'catboost', 'gradientboosting'
+
+         type_fit: str
+             "regression" or "classification"
+
+         scoring: str
+             scoring metric; see https://scikit-learn.org/stable/modules/model_evaluation.html#the-scoring-parameter-defining-model-evaluation-rules
+
+         customize: boolean
+             if True, each surrogate is transformed into a quasi-randomized network (default is False)
+
+         n_estimators: int
+             maximum number of trees that can be built (default is None;
+             if None, this parameter is tuned along with the others)
+
+         cv: int;
+             number of cross-validation folds
+
+         n_jobs: int;
+             number of jobs for parallel execution
+
+         n_init: an integer;
+             number of points in the initial setting, when `x_init` and `y_init` are not provided
+
+         n_iter: an integer;
+             number of iterations of the minimization algorithm
+
+         abs_tol: a float;
+             tolerance for convergence of the optimizer (early stopping based on the acquisition function)
+
+         verbose: int
+             controls verbosity
+
+         seed: int
+             reproducibility seed
+
+     Returns:
+
+         a list of tuples (surrogate name, optimization result), one per
+         surrogate that ran successfully
+
+     Examples:
+
+         ```python
+         import os
+         import unifiedbooster as ub
+         from sklearn.datasets import load_breast_cancer
+         from sklearn.model_selection import train_test_split
+         from time import time
+
+         print(f"\n ----- Running: {os.path.basename(__file__)}... ----- \n")
+
+         dataset = load_breast_cancer()
+         X, y = dataset.data, dataset.target
+         X_train, X_test, y_train, y_test = train_test_split(
+             X, y, test_size=0.2, random_state=42
+         )
+
+         start = time()
+         res4 = ub.lazy_cross_val_optim(
+             X_train,
+             y_train,
+             X_test=X_test,
+             y_test=y_test,
+             model_type="lightgbm",
+             type_fit="classification",
+             scoring="accuracy",
+             n_estimators=100,
+             cv=5,
+             n_jobs=None,
+             n_init=10,
+             n_iter=190,
+             abs_tol=1e-3,
+             seed=123,
+             customize=False,
+         )
+         print(f"Elapsed: {time() - start}")
+         print(res4)
+         ```
+     """
+
+     removed_regressors = [
+         "TheilSenRegressor",
+         "ARDRegression",
+         "CCA",
+         "GaussianProcessRegressor",
+         "GradientBoostingRegressor",
+         "HistGradientBoostingRegressor",
+         "IsotonicRegression",
+         "MultiOutputRegressor",
+         "MultiTaskElasticNet",
+         "MultiTaskElasticNetCV",
+         "MultiTaskLasso",
+         "MultiTaskLassoCV",
+         "OrthogonalMatchingPursuit",
+         "OrthogonalMatchingPursuitCV",
+         "PLSCanonical",
+         "PLSRegression",
+         "RadiusNeighborsRegressor",
+         "RegressorChain",
+         "StackingRegressor",
+         "VotingRegressor",
+     ]
+
+     results = []
+
+     # try every scikit-learn regressor as a surrogate, skipping the
+     # meta-estimators and models listed in `removed_regressors`
+     for est in all_estimators():
+         if issubclass(est[1], RegressorMixin) and (
+             est[0] not in removed_regressors
+         ):
+             try:
+                 if customize:
+                     print(f"\n surrogate: CustomRegressor({est[0]})")
+                     surr_obj = ns.CustomRegressor(obj=est[1]())
+                 else:
+                     print(f"\n surrogate: {est[0]}")
+                     surr_obj = est[1]()
+                 res = cross_val_optim(
+                     X_train=X_train,
+                     y_train=y_train,
+                     X_test=X_test,
+                     y_test=y_test,
+                     model_type=model_type,
+                     n_estimators=n_estimators,
+                     surrogate_obj=surr_obj,
+                     cv=cv,
+                     n_jobs=n_jobs,
+                     type_fit=type_fit,
+                     scoring=scoring,
+                     n_init=n_init,
+                     n_iter=n_iter,
+                     abs_tol=abs_tol,
+                     verbose=verbose,
+                     seed=seed,
+                 )
+                 print(f"\n result: {res}")
+                 if customize:
+                     results.append((f"CustomRegressor({est[0]})", res))
+                 else:
+                     results.append((est[0], res))
+             except Exception:
+                 # some estimators fail to fit as surrogates; skip them
+                 continue
+
+     return results
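
Both docstring examples above pass `surrogate_obj=None`, so the split-conformal branch of `cross_val_optim` never runs. Below is a minimal sketch of a surrogate-based run; `ExtraTreesRegressor` is an arbitrary illustrative choice (the function wraps whatever scikit-learn regressor it receives in `nnetsauce.PredictionInterval`), and the reduced `n_iter` is only to keep the run short:

```python
import unifiedbooster as ub
from sklearn.datasets import load_breast_cancer
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.model_selection import train_test_split

X, y = load_breast_cancer(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

res = ub.cross_val_optim(
    X_train,
    y_train,
    X_test=X_test,
    y_test=y_test,
    model_type="xgboost",
    type_fit="classification",
    scoring="accuracy",
    n_estimators=100,
    # any scikit-learn regressor; wrapped in ns.PredictionInterval internally
    surrogate_obj=ExtraTreesRegressor(),
    cv=5,
    n_init=10,
    n_iter=50,
    abs_tol=1e-3,
    verbose=1,
    seed=123,
)
print(res.best_params)  # decoded hyperparameters of the best configuration
```

`lazy_cross_val_optim` automates exactly this loop over (nearly) all scikit-learn regressors and returns a list of `(surrogate_name, result)` tuples, so surrogates can be compared on the `test_<scoring>` field.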
@@ -0,0 +1,36 @@
+ #!/usr/bin/env python
+
+ """
+ Conformal prediction classes and nonconformity functions,
+ adapted from the `nonconformist` package.
+ """
+
+ # Authors: Henrik Linusson
+ # Yaniv Romano modified the nc.py file to include CQR
+ # T. Moudiki modified __init__.py to import classes
+
+ # __version__ = '2.1.0'
+
+ from .nc import (
+     AbsErrorErrFunc,
+     QuantileRegErrFunc,
+     RegressorNc,
+     RegressorNormalizer,
+ )
+ from .cp import IcpRegressor, TcpClassifier
+ from .icp import IcpClassifier
+ from .nc import ClassifierNc, MarginErrFunc
+ from .base import RegressorAdapter, ClassifierAdapter
+
+ __all__ = [
+     "AbsErrorErrFunc",
+     "MarginErrFunc",
+     "QuantileRegErrFunc",
+     "RegressorAdapter",
+     "ClassifierAdapter",
+     "RegressorNc",
+     "ClassifierNc",
+     "RegressorNormalizer",
+     "IcpRegressor",
+     "IcpClassifier",
+     "TcpClassifier",
+ ]
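
The re-exports above follow the upstream `nonconformist` API. A minimal sketch of inductive conformal regression using these classes follows, assuming the upstream signatures are unchanged in this fork; the import path is a placeholder, since the diff does not show where this `__init__.py` is mounted inside the package:

```python
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

# placeholder path: adjust to wherever this subpackage lives in unifiedbooster
from unifiedbooster.nonconformist import (
    AbsErrorErrFunc,
    IcpRegressor,
    RegressorAdapter,
    RegressorNc,
)

rng = np.random.default_rng(0)
X = rng.normal(size=(500, 5))
y = X @ rng.normal(size=5) + rng.normal(scale=0.1, size=500)

# proper training / calibration / test split for inductive conformal prediction
X_train, X_rest, y_train, y_rest = train_test_split(X, y, test_size=0.4, random_state=0)
X_cal, X_test, y_cal, y_test = train_test_split(X_rest, y_rest, test_size=0.5, random_state=0)

# nonconformity score = absolute error of the underlying regressor
nc = RegressorNc(RegressorAdapter(RandomForestRegressor()), AbsErrorErrFunc())
icp = IcpRegressor(nc)
icp.fit(X_train, y_train)      # fit the underlying model
icp.calibrate(X_cal, y_cal)    # compute calibration nonconformity scores
intervals = icp.predict(X_test, significance=0.1)  # ~90% prediction intervals
print(intervals[:5])  # each row: [lower bound, upper bound]
```

In upstream nonconformist, `predict` returns an `(n_samples, 2)` array of interval bounds at the requested significance level.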