perpetual 0.7.11__cp39-none-win_amd64.whl → 0.8.0__cp39-none-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of perpetual might be problematic. Click here for more details.
- perpetual/booster.py +150 -56
- perpetual/perpetual.cp39-win_amd64.pyd +0 -0
- perpetual/utils.py +3 -1
- {perpetual-0.7.11.dist-info → perpetual-0.8.0.dist-info}/METADATA +73 -27
- perpetual-0.8.0.dist-info/RECORD +12 -0
- perpetual-0.7.11.dist-info/RECORD +0 -12
- {perpetual-0.7.11.dist-info → perpetual-0.8.0.dist-info}/WHEEL +0 -0
- {perpetual-0.7.11.dist-info → perpetual-0.8.0.dist-info}/license_files/LICENSE +0 -0
perpetual/booster.py
CHANGED
|
@@ -26,18 +26,20 @@ class PerpetualBooster:
|
|
|
26
26
|
# this is useful for parameters that should be
|
|
27
27
|
# attempted to be loaded in and set
|
|
28
28
|
# as attributes on the booster after it is loaded.
|
|
29
|
-
|
|
29
|
+
metadata_attributes: Dict[str, BaseSerializer] = {
|
|
30
30
|
"feature_names_in_": ObjectSerializer(),
|
|
31
31
|
"n_features_": ObjectSerializer(),
|
|
32
32
|
"feature_importance_method": ObjectSerializer(),
|
|
33
33
|
"cat_mapping": ObjectSerializer(),
|
|
34
34
|
"classes_": ObjectSerializer(),
|
|
35
|
+
# "categorical_features": ObjectSerializer(),
|
|
35
36
|
}
|
|
36
37
|
|
|
37
38
|
def __init__(
|
|
38
39
|
self,
|
|
39
40
|
*,
|
|
40
41
|
objective: str = "LogLoss",
|
|
42
|
+
budget: float = 0.5,
|
|
41
43
|
num_threads: Optional[int] = None,
|
|
42
44
|
monotone_constraints: Union[Dict[Any, int], None] = None,
|
|
43
45
|
force_children_to_bound_parent: bool = False,
|
|
@@ -48,8 +50,7 @@ class PerpetualBooster:
|
|
|
48
50
|
missing_node_treatment: str = "None",
|
|
49
51
|
log_iterations: int = 0,
|
|
50
52
|
feature_importance_method: str = "Gain",
|
|
51
|
-
|
|
52
|
-
alpha: Optional[float] = None,
|
|
53
|
+
quantile: Optional[float] = None,
|
|
53
54
|
reset: Optional[bool] = None,
|
|
54
55
|
categorical_features: Union[Iterable[int], Iterable[str], str, None] = "auto",
|
|
55
56
|
timeout: Optional[float] = None,
|
|
@@ -59,16 +60,17 @@ class PerpetualBooster:
|
|
|
59
60
|
max_bin: int = 256,
|
|
60
61
|
max_cat: int = 1000,
|
|
61
62
|
):
|
|
62
|
-
"""PerpetualBooster class, used to
|
|
63
|
-
The following parameters can also be specified in the fit method to override the values in the constructor:
|
|
64
|
-
budget, alpha, reset, categorical_features, timeout, iteration_limit, memory_limit, and stopping_rounds.
|
|
63
|
+
"""PerpetualBooster class, used to create gradient boosted decision tree ensembles.
|
|
65
64
|
|
|
66
65
|
Args:
|
|
67
|
-
objective (str, optional): Learning objective function to be used for optimization.
|
|
68
|
-
|
|
66
|
+
objective (str, optional): Learning objective function to be used for optimization. Valid options are:
|
|
67
|
+
"LogLoss" to use logistic loss (classification),
|
|
69
68
|
"SquaredLoss" to use squared error (regression),
|
|
70
69
|
"QuantileLoss" to use quantile error (regression).
|
|
71
70
|
Defaults to "LogLoss".
|
|
71
|
+
budget (float, optional): a positive number for fitting budget. Increasing this number will more
|
|
72
|
+
likely result in more boosting rounds and more increased predictive power.
|
|
73
|
+
Default value is 0.5.
|
|
72
74
|
num_threads (int, optional): Number of threads to be used during training.
|
|
73
75
|
monotone_constraints (Dict[Any, int], optional): Constraints that are used to enforce a
|
|
74
76
|
specific relationship between the training features and the target variable. A dictionary
|
|
@@ -105,10 +107,7 @@ class PerpetualBooster:
|
|
|
105
107
|
- "AverageNodeWeight": Set the missing node to be equal to the weighted average weight of the left and the right nodes.
|
|
106
108
|
log_iterations (int, optional): Setting to a value (N) other than zero will result in information being logged about every N iterations, info can be interacted with directly with the python [`logging`](https://docs.python.org/3/howto/logging.html) module. For an example of how to utilize the logging information see the example [here](/#logging-output).
|
|
107
109
|
feature_importance_method (str, optional): The feature importance method type that will be used to calculate the `feature_importances_` attribute on the booster.
|
|
108
|
-
|
|
109
|
-
likely result in more boosting rounds and more increased predictive power.
|
|
110
|
-
Default value is 1.0.
|
|
111
|
-
alpha (float, optional): only used in quantile regression.
|
|
110
|
+
quantile (float, optional): only used in quantile regression.
|
|
112
111
|
reset (bool, optional): whether to reset the model or continue training.
|
|
113
112
|
categorical_features (Union[Iterable[int], Iterable[str], str, None], optional): The names or indices for categorical features.
|
|
114
113
|
Defaults to `auto` for Polars or Pandas categorical data types.
|
|
@@ -166,6 +165,7 @@ class PerpetualBooster:
|
|
|
166
165
|
)
|
|
167
166
|
|
|
168
167
|
self.objective = objective
|
|
168
|
+
self.budget = budget
|
|
169
169
|
self.num_threads = num_threads
|
|
170
170
|
self.monotone_constraints = monotone_constraints_
|
|
171
171
|
self.force_children_to_bound_parent = force_children_to_bound_parent
|
|
@@ -176,8 +176,7 @@ class PerpetualBooster:
|
|
|
176
176
|
self.missing_node_treatment = missing_node_treatment
|
|
177
177
|
self.log_iterations = log_iterations
|
|
178
178
|
self.feature_importance_method = feature_importance_method
|
|
179
|
-
self.
|
|
180
|
-
self.alpha = alpha
|
|
179
|
+
self.quantile = quantile
|
|
181
180
|
self.reset = reset
|
|
182
181
|
self.categorical_features = categorical_features
|
|
183
182
|
self.timeout = timeout
|
|
@@ -189,6 +188,7 @@ class PerpetualBooster:
|
|
|
189
188
|
|
|
190
189
|
booster = CratePerpetualBooster(
|
|
191
190
|
objective=self.objective,
|
|
191
|
+
budget=self.budget,
|
|
192
192
|
max_bin=self.max_bin,
|
|
193
193
|
num_threads=self.num_threads,
|
|
194
194
|
monotone_constraints=dict(),
|
|
@@ -199,23 +199,17 @@ class PerpetualBooster:
|
|
|
199
199
|
terminate_missing_features=set(),
|
|
200
200
|
missing_node_treatment=self.missing_node_treatment,
|
|
201
201
|
log_iterations=self.log_iterations,
|
|
202
|
+
quantile=self.quantile,
|
|
203
|
+
reset=self.reset,
|
|
204
|
+
categorical_features=set(),
|
|
205
|
+
timeout=self.timeout,
|
|
206
|
+
iteration_limit=self.iteration_limit,
|
|
207
|
+
memory_limit=self.memory_limit,
|
|
208
|
+
stopping_rounds=self.stopping_rounds,
|
|
202
209
|
)
|
|
203
210
|
self.booster = cast(BoosterType, booster)
|
|
204
211
|
|
|
205
|
-
def fit(
|
|
206
|
-
self,
|
|
207
|
-
X,
|
|
208
|
-
y,
|
|
209
|
-
sample_weight=None,
|
|
210
|
-
budget: Optional[float] = None,
|
|
211
|
-
alpha: Optional[float] = None,
|
|
212
|
-
reset: Optional[bool] = None,
|
|
213
|
-
categorical_features: Union[Iterable[int], Iterable[str], str, None] = "auto",
|
|
214
|
-
timeout: Optional[float] = None,
|
|
215
|
-
iteration_limit: Optional[int] = None,
|
|
216
|
-
memory_limit: Optional[float] = None,
|
|
217
|
-
stopping_rounds: Optional[int] = None,
|
|
218
|
-
) -> Self:
|
|
212
|
+
def fit(self, X, y, sample_weight=None) -> Self:
|
|
219
213
|
"""Fit the gradient booster on a provided dataset.
|
|
220
214
|
|
|
221
215
|
Args:
|
|
@@ -225,26 +219,10 @@ class PerpetualBooster:
|
|
|
225
219
|
sample_weight (Union[ArrayLike, None], optional): Instance weights to use when
|
|
226
220
|
training the model. If None is passed, a weight of 1 will be used for every record.
|
|
227
221
|
Defaults to None.
|
|
228
|
-
budget (float, optional): a positive number for fitting budget. Increasing this number will more
|
|
229
|
-
likely result in more boosting rounds and more increased predictive power.
|
|
230
|
-
Defaults to 1.0.
|
|
231
|
-
alpha (float, optional): only used in quantile regression.
|
|
232
|
-
reset (bool, optional): whether to reset the model or continue training.
|
|
233
|
-
categorical_features (Union[Iterable[int], Iterable[str], str, None], optional): The names or indices for categorical features.
|
|
234
|
-
Defaults to `auto` for Polars or Pandas categorical data types.
|
|
235
|
-
timeout (float, optional): optional fit timeout in seconds
|
|
236
|
-
iteration_limit (int, optional): optional limit for the number of boosting rounds. The default value is 1000 boosting rounds.
|
|
237
|
-
The algorithm automatically stops for most of the cases before hitting this limit.
|
|
238
|
-
If you want to experiment with very high budget (>2.0), you can also increase this limit.
|
|
239
|
-
memory_limit (float, optional): optional limit for memory allocation in GB. If not set, the memory will be allocated based on
|
|
240
|
-
available memory and the algorithm requirements.
|
|
241
|
-
stopping_rounds (int, optional): optional limit for auto stopping. Defaults to 3.
|
|
242
222
|
"""
|
|
243
223
|
|
|
244
224
|
features_, flat_data, rows, cols, categorical_features_, cat_mapping = (
|
|
245
|
-
convert_input_frame(
|
|
246
|
-
X, categorical_features or self.categorical_features, self.max_cat
|
|
247
|
-
)
|
|
225
|
+
convert_input_frame(X, self.categorical_features, self.max_cat)
|
|
248
226
|
)
|
|
249
227
|
self.n_features_ = cols
|
|
250
228
|
self.cat_mapping = cat_mapping
|
|
@@ -268,6 +246,7 @@ class PerpetualBooster:
|
|
|
268
246
|
):
|
|
269
247
|
booster = CratePerpetualBooster(
|
|
270
248
|
objective=self.objective,
|
|
249
|
+
budget=self.budget,
|
|
271
250
|
max_bin=self.max_bin,
|
|
272
251
|
num_threads=self.num_threads,
|
|
273
252
|
monotone_constraints=crate_mc,
|
|
@@ -278,12 +257,20 @@ class PerpetualBooster:
|
|
|
278
257
|
terminate_missing_features=crate_tmf,
|
|
279
258
|
missing_node_treatment=self.missing_node_treatment,
|
|
280
259
|
log_iterations=self.log_iterations,
|
|
260
|
+
quantile=self.quantile,
|
|
261
|
+
reset=self.reset,
|
|
262
|
+
categorical_features=categorical_features_,
|
|
263
|
+
timeout=self.timeout,
|
|
264
|
+
iteration_limit=self.iteration_limit,
|
|
265
|
+
memory_limit=self.memory_limit,
|
|
266
|
+
stopping_rounds=self.stopping_rounds,
|
|
281
267
|
)
|
|
282
268
|
self.booster = cast(BoosterType, booster)
|
|
283
269
|
else:
|
|
284
270
|
booster = CrateMultiOutputBooster(
|
|
285
271
|
n_boosters=len(classes_),
|
|
286
272
|
objective=self.objective,
|
|
273
|
+
budget=self.budget,
|
|
287
274
|
max_bin=self.max_bin,
|
|
288
275
|
num_threads=self.num_threads,
|
|
289
276
|
monotone_constraints=crate_mc,
|
|
@@ -294,6 +281,13 @@ class PerpetualBooster:
|
|
|
294
281
|
terminate_missing_features=crate_tmf,
|
|
295
282
|
missing_node_treatment=self.missing_node_treatment,
|
|
296
283
|
log_iterations=self.log_iterations,
|
|
284
|
+
quantile=self.quantile,
|
|
285
|
+
reset=self.reset,
|
|
286
|
+
categorical_features=categorical_features_,
|
|
287
|
+
timeout=self.timeout,
|
|
288
|
+
iteration_limit=self.iteration_limit,
|
|
289
|
+
memory_limit=self.memory_limit,
|
|
290
|
+
stopping_rounds=self.stopping_rounds,
|
|
297
291
|
)
|
|
298
292
|
self.booster = cast(MultiOutputBoosterType, booster)
|
|
299
293
|
|
|
@@ -305,20 +299,97 @@ class PerpetualBooster:
|
|
|
305
299
|
)
|
|
306
300
|
self._set_metadata_attributes("classes_", self.classes_)
|
|
307
301
|
|
|
302
|
+
self.categorical_features = categorical_features_
|
|
303
|
+
|
|
308
304
|
self.booster.fit(
|
|
309
305
|
flat_data=flat_data,
|
|
310
306
|
rows=rows,
|
|
311
307
|
cols=cols,
|
|
312
308
|
y=y_,
|
|
313
|
-
budget=budget or self.budget,
|
|
314
309
|
sample_weight=sample_weight_, # type: ignore
|
|
315
|
-
|
|
316
|
-
|
|
317
|
-
|
|
318
|
-
|
|
319
|
-
|
|
320
|
-
|
|
321
|
-
|
|
310
|
+
)
|
|
311
|
+
|
|
312
|
+
return self
|
|
313
|
+
|
|
314
|
+
def prune(self, X, y, sample_weight=None) -> Self:
|
|
315
|
+
"""Prune the gradient booster on a provided dataset.
|
|
316
|
+
|
|
317
|
+
Args:
|
|
318
|
+
X (FrameLike): Either a Polars or Pandas DataFrame, or a 2 dimensional Numpy array.
|
|
319
|
+
y (Union[FrameLike, ArrayLike]): Either a Polars or Pandas DataFrame or Series,
|
|
320
|
+
or a 1 or 2 dimensional Numpy array.
|
|
321
|
+
sample_weight (Union[ArrayLike, None], optional): Instance weights to use when
|
|
322
|
+
training the model. If None is passed, a weight of 1 will be used for every record.
|
|
323
|
+
Defaults to None.
|
|
324
|
+
"""
|
|
325
|
+
|
|
326
|
+
_, flat_data, rows, cols = transform_input_frame(X, self.cat_mapping)
|
|
327
|
+
|
|
328
|
+
y_, _ = convert_input_array(y, self.objective)
|
|
329
|
+
|
|
330
|
+
if sample_weight is None:
|
|
331
|
+
sample_weight_ = None
|
|
332
|
+
else:
|
|
333
|
+
sample_weight_, _ = convert_input_array(sample_weight, self.objective)
|
|
334
|
+
|
|
335
|
+
self.booster.prune(
|
|
336
|
+
flat_data=flat_data,
|
|
337
|
+
rows=rows,
|
|
338
|
+
cols=cols,
|
|
339
|
+
y=y_,
|
|
340
|
+
sample_weight=sample_weight_, # type: ignore
|
|
341
|
+
)
|
|
342
|
+
|
|
343
|
+
return self
|
|
344
|
+
|
|
345
|
+
def calibrate(
|
|
346
|
+
self, X_train, y_train, X_cal, y_cal, alpha, sample_weight=None
|
|
347
|
+
) -> Self:
|
|
348
|
+
"""Calibrate the gradient booster on a provided dataset.
|
|
349
|
+
|
|
350
|
+
Args:
|
|
351
|
+
X_train (FrameLike): Either a Polars or Pandas DataFrame, or a 2 dimensional Numpy array.
|
|
352
|
+
y_train (Union[FrameLike, ArrayLike]): Either a Polars or Pandas DataFrame or Series,
|
|
353
|
+
or a 1 or 2 dimensional Numpy array.
|
|
354
|
+
X_cal (FrameLike): Either a Polars or Pandas DataFrame, or a 2 dimensional Numpy array.
|
|
355
|
+
y_cal (Union[FrameLike, ArrayLike]): Either a Polars or Pandas DataFrame or Series,
|
|
356
|
+
or a 1 or 2 dimensional Numpy array.
|
|
357
|
+
alpha (ArrayLike): Between 0 and 1, represents the uncertainty of the confidence interval.
|
|
358
|
+
Lower alpha produces larger (more conservative) prediction intervals.
|
|
359
|
+
alpha is the complement of the target coverage level.
|
|
360
|
+
sample_weight (Union[ArrayLike, None], optional): Instance weights to use when
|
|
361
|
+
training the model. If None is passed, a weight of 1 will be used for every record.
|
|
362
|
+
Defaults to None.
|
|
363
|
+
"""
|
|
364
|
+
|
|
365
|
+
_, flat_data_train, rows_train, cols_train = transform_input_frame(
|
|
366
|
+
X_train, self.cat_mapping
|
|
367
|
+
)
|
|
368
|
+
|
|
369
|
+
y_train_, _ = convert_input_array(y_train, self.objective)
|
|
370
|
+
|
|
371
|
+
_, flat_data_cal, rows_cal, cols_cal = transform_input_frame(
|
|
372
|
+
X_cal, self.cat_mapping
|
|
373
|
+
)
|
|
374
|
+
|
|
375
|
+
y_cal_, _ = convert_input_array(y_cal, self.objective)
|
|
376
|
+
|
|
377
|
+
if sample_weight is None:
|
|
378
|
+
sample_weight_ = None
|
|
379
|
+
else:
|
|
380
|
+
sample_weight_, _ = convert_input_array(sample_weight, self.objective)
|
|
381
|
+
|
|
382
|
+
self.booster.calibrate(
|
|
383
|
+
flat_data=flat_data_train,
|
|
384
|
+
rows=rows_train,
|
|
385
|
+
cols=cols_train,
|
|
386
|
+
y=y_train_,
|
|
387
|
+
flat_data_cal=flat_data_cal,
|
|
388
|
+
rows_cal=rows_cal,
|
|
389
|
+
cols_cal=cols_cal,
|
|
390
|
+
y_cal=y_cal_,
|
|
391
|
+
alpha=np.array(alpha),
|
|
392
|
+
sample_weight=sample_weight_, # type: ignore
|
|
322
393
|
)
|
|
323
394
|
|
|
324
395
|
return self
|
|
@@ -331,6 +402,29 @@ class PerpetualBooster:
|
|
|
331
402
|
f"Columns mismatch between data {features} passed, and data {self.feature_names_in_} used at fit."
|
|
332
403
|
)
|
|
333
404
|
|
|
405
|
+
def predict_intervals(self, X, parallel: Union[bool, None] = None) -> dict:
|
|
406
|
+
"""Predict intervals with the fitted booster on new data.
|
|
407
|
+
|
|
408
|
+
Args:
|
|
409
|
+
X (FrameLike): Either a Polars or Pandas DataFrame, or a 2 dimensional Numpy array.
|
|
410
|
+
parallel (Union[bool, None], optional): Optionally specify if the predict
|
|
411
|
+
function should run in parallel on multiple threads. If `None` is
|
|
412
|
+
passed, the `parallel` attribute of the booster will be used.
|
|
413
|
+
Defaults to `None`.
|
|
414
|
+
|
|
415
|
+
Returns:
|
|
416
|
+
dict: Returns a dictionary of the predicted intervals.
|
|
417
|
+
"""
|
|
418
|
+
features_, flat_data, rows, cols = transform_input_frame(X, self.cat_mapping)
|
|
419
|
+
self._validate_features(features_)
|
|
420
|
+
|
|
421
|
+
return self.booster.predict_intervals(
|
|
422
|
+
flat_data=flat_data,
|
|
423
|
+
rows=rows,
|
|
424
|
+
cols=cols,
|
|
425
|
+
parallel=parallel,
|
|
426
|
+
)
|
|
427
|
+
|
|
334
428
|
def predict(self, X, parallel: Union[bool, None] = None) -> np.ndarray:
|
|
335
429
|
"""Predict with the fitted booster on new data.
|
|
336
430
|
|
|
@@ -706,7 +800,7 @@ class PerpetualBooster:
|
|
|
706
800
|
warnings.simplefilter("ignore")
|
|
707
801
|
c = cls(**params)
|
|
708
802
|
c.booster = booster
|
|
709
|
-
for m in c.
|
|
803
|
+
for m in c.metadata_attributes:
|
|
710
804
|
try:
|
|
711
805
|
m_ = c._get_metadata_attributes(m)
|
|
712
806
|
setattr(c, m, m_)
|
|
@@ -774,12 +868,12 @@ class PerpetualBooster:
|
|
|
774
868
|
return v
|
|
775
869
|
|
|
776
870
|
def _set_metadata_attributes(self, key: str, value: Any) -> None:
|
|
777
|
-
value_ = self.
|
|
871
|
+
value_ = self.metadata_attributes[key].serialize(value)
|
|
778
872
|
self.insert_metadata(key=key, value=value_)
|
|
779
873
|
|
|
780
874
|
def _get_metadata_attributes(self, key: str) -> Any:
|
|
781
875
|
value = self.get_metadata(key)
|
|
782
|
-
return self.
|
|
876
|
+
return self.metadata_attributes[key].deserialize(value)
|
|
783
877
|
|
|
784
878
|
@property
|
|
785
879
|
def base_score(self) -> Union[float, Iterable[float]]:
|
|
Binary file
|
perpetual/utils.py
CHANGED
|
@@ -65,7 +65,9 @@ def convert_input_array(x, objective) -> np.ndarray:
|
|
|
65
65
|
|
|
66
66
|
|
|
67
67
|
def convert_input_frame(
|
|
68
|
-
X,
|
|
68
|
+
X,
|
|
69
|
+
categorical_features,
|
|
70
|
+
max_cat,
|
|
69
71
|
) -> Tuple[List[str], np.ndarray, int, int, Optional[Iterable[int]], Optional[Dict]]:
|
|
70
72
|
"""Convert data to format needed by booster.
|
|
71
73
|
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.3
|
|
2
2
|
Name: perpetual
|
|
3
|
-
Version: 0.
|
|
3
|
+
Version: 0.8.0
|
|
4
4
|
Classifier: Programming Language :: Rust
|
|
5
5
|
Classifier: Programming Language :: Python :: 3
|
|
6
6
|
Classifier: Programming Language :: Python :: 3.9
|
|
@@ -10,6 +10,7 @@ Classifier: Programming Language :: Python :: 3.12
|
|
|
10
10
|
Classifier: Programming Language :: Python :: 3.13
|
|
11
11
|
Requires-Dist: numpy
|
|
12
12
|
Requires-Dist: typing-extensions
|
|
13
|
+
Requires-Dist: black ; extra == 'dev'
|
|
13
14
|
Requires-Dist: pandas ; extra == 'dev'
|
|
14
15
|
Requires-Dist: polars ; extra == 'dev'
|
|
15
16
|
Requires-Dist: pyarrow ; extra == 'dev'
|
|
@@ -24,7 +25,7 @@ Requires-Dist: ruff ; extra == 'dev'
|
|
|
24
25
|
Provides-Extra: dev
|
|
25
26
|
License-File: LICENSE
|
|
26
27
|
License-File: LICENSE
|
|
27
|
-
Summary: A self-generalizing gradient boosting machine
|
|
28
|
+
Summary: A self-generalizing gradient boosting machine that doesn't need hyperparameter optimization
|
|
28
29
|
Keywords: rust,perpetual,machine learning,tree model,decision tree,gradient boosted decision tree,gradient boosting machine
|
|
29
30
|
Home-Page: https://perpetual-ml.com
|
|
30
31
|
Author: Mutlu Simsek
|
|
@@ -49,10 +50,42 @@ Project-URL: Source Code, https://github.com/perpetual-ml/perpetual
|
|
|
49
50
|
|
|
50
51
|
# Perpetual
|
|
51
52
|
|
|
52
|
-
PerpetualBooster is a gradient boosting machine (GBM) algorithm
|
|
53
|
+
PerpetualBooster is a gradient boosting machine (GBM) algorithm that, unlike other GBM algorithms, doesn't need hyperparameter optimization. Similar to AutoML libraries, it has a `budget` parameter. Increasing the `budget` parameter increases the predictive power of the algorithm and gives better results on unseen data. Start with a small budget (e.g. 0.5) and increase it (e.g. 1.0) once you are confident with your features. If you don't see any improvement from further increasing the `budget`, it means that you are already extracting the most predictive power out of your data.
|
|
54
|
+
|
|
55
|
+
## Usage
|
|
56
|
+
|
|
57
|
+
You can use the algorithm like in the example below. Check examples folders for both Rust and Python.
|
|
58
|
+
|
|
59
|
+
```python
|
|
60
|
+
from perpetual import PerpetualBooster
|
|
61
|
+
|
|
62
|
+
model = PerpetualBooster(objective="SquaredLoss", budget=1.0)
|
|
63
|
+
model.fit(X, y)
|
|
64
|
+
```
|
|
65
|
+
|
|
66
|
+
## Documentation
|
|
67
|
+
|
|
68
|
+
Documentation for the Python API can be found [here](https://perpetual-ml.github.io/perpetual) and for the Rust API [here](https://docs.rs/perpetual/latest/perpetual/).
|
|
69
|
+
|
|
70
|
+
## Usage
|
|
71
|
+
|
|
72
|
+
You can use the algorithm like in the example below. Check examples folders for both Rust and Python.
|
|
73
|
+
|
|
74
|
+
```python
|
|
75
|
+
from perpetual import PerpetualBooster
|
|
76
|
+
|
|
77
|
+
model = PerpetualBooster(objective="SquaredLoss", budget=1.0)
|
|
78
|
+
model.fit(X, y)
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
## Documentation
|
|
82
|
+
|
|
83
|
+
Documentation for the Python API can be found [here](https://perpetual-ml.github.io/perpetual) and for the Rust API [here](https://docs.rs/perpetual/latest/perpetual/).
|
|
53
84
|
|
|
54
85
|
## Benchmark
|
|
55
86
|
|
|
87
|
+
### PerpetualBooster vs. Optuna + LightGBM
|
|
88
|
+
|
|
56
89
|
Hyperparameter optimization usually takes 100 iterations with plain GBM algorithms. PerpetualBooster achieves the same accuracy in a single run. Thus, it achieves up to 100x speed-up at the same accuracy with different `budget` levels and with different datasets.
|
|
57
90
|
|
|
58
91
|
The following table summarizes the results for the [California Housing](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.fetch_california_housing.html) dataset (regression):
|
|
@@ -71,38 +104,51 @@ The following table summarizes the results for the [Cover Types](https://scikit-
|
|
|
71
104
|
|
|
72
105
|
The results can be reproduced using the scripts in the [examples](./python-package/examples) folder.
|
|
73
106
|
|
|
74
|
-
PerpetualBooster
|
|
107
|
+
### PerpetualBooster vs. AutoGluon
|
|
75
108
|
|
|
76
|
-
|
|
77
|
-
| -------------------------------------------- | --------------------------- | ----------------------------------------------------------------- | -------------- | --------------------------- | ----------------------------------------------------------------- | -------------- |
|
|
78
|
-
| [Airlines_DepDelay_10M](https://www.openml.org/t/359929) | 518 | 11.3 | 29.0 | 520 | 30.9 <td style="background-color:green;color:white;"> 28.8 </td> |
|
|
79
|
-
| [bates_regr_100](https://www.openml.org/t/361940) | 3421 | 15.1 <td style="background-color:green;color:white;"> 1.084 </td> | OOM | OOM | OOM |
|
|
80
|
-
| [BNG(libras_move)](https://www.openml.org/t/7327) | 1956 | 4.2 <td style="background-color:green;color:white;"> 2.51 </td> | 1922 | 97.6 | 2.53 |
|
|
81
|
-
| [BNG(satellite_image)](https://www.openml.org/t/7326) | 334 | 1.6 | 0.731 | 337 | 10.0 <td style="background-color:green;color:white;"> 0.721 </td> |
|
|
82
|
-
| [COMET_MC](https://www.openml.org/t/14949) | 44 | 1.0 <td style="background-color:green;color:white;"> 0.0615 </td> | 47 | 5.0 | 0.0662 |
|
|
83
|
-
| [friedman1](https://www.openml.org/t/361939) | 275 | 4.2 <td style="background-color:green;color:white;"> 1.047 </td> | 278 | 5.1 | 1.487 |
|
|
84
|
-
| [poker](https://www.openml.org/t/10102) | 38 | 0.6 <td style="background-color:green;color:white;"> 0.256 </td> | 41 | 1.2 | 0.722 |
|
|
85
|
-
| [subset_higgs](https://www.openml.org/t/361955) | 868 | 10.6 <td style="background-color:green;color:white;"> 0.420 </td> | 870 | 24.5 | 0.421 |
|
|
86
|
-
| [BNG(autoHorse)](https://www.openml.org/t/7319) | 107 | 1.1 <td style="background-color:green;color:white;"> 19.0 </td> | 107 | 3.2 | 20.5 |
|
|
87
|
-
| [BNG(pbc)](https://www.openml.org/t/7318) | 48 | 0.6 <td style="background-color:green;color:white;"> 836.5 </td> | 51 | 0.2 | 957.1 |
|
|
88
|
-
| average | 465 | 3.9 | - | 464 | 19.7 | - |
|
|
109
|
+
PerpetualBooster is a GBM but behaves like AutoML, so it is also benchmarked against AutoGluon (v1.2, best quality preset), the current leader in the [AutoML benchmark](https://automlbenchmark.streamlit.app/cd_diagram). The 10 datasets with the largest number of rows are selected from [OpenML datasets](https://www.openml.org/) for both regression and classification tasks.
|
|
89
110
|
|
|
90
|
-
|
|
111
|
+
The results are summarized in the following table for regression tasks:
|
|
91
112
|
|
|
92
|
-
|
|
113
|
+
| OpenML Task | Perpetual Training Duration | Perpetual Inference Duration | Perpetual RMSE | AutoGluon Training Duration | AutoGluon Inference Duration | AutoGluon RMSE |
|
|
114
|
+
| -------------------------------------------------------- | ----- | ----- | ------------------- | -------- | ------ | ------------------ |
|
|
115
|
+
| [Airlines_DepDelay_10M](https://www.openml.org/t/359929) | 518 | 11.3 | 29.0 | 520 | 30.9 | <ins> 28.8 </ins> |
|
|
116
|
+
| [bates_regr_100](https://www.openml.org/t/361940) | 3421 | 15.1 | <ins> 1.084 </ins> | OOM | OOM | OOM |
|
|
117
|
+
| [BNG(libras_move)](https://www.openml.org/t/7327) | 1956 | 4.2 | <ins> 2.51 </ins> | 1922 | 97.6 | 2.53 |
|
|
118
|
+
| [BNG(satellite_image)](https://www.openml.org/t/7326) | 334 | 1.6 | 0.731 | 337 | 10.0 | <ins> 0.721 </ins> |
|
|
119
|
+
| [COMET_MC](https://www.openml.org/t/14949) | 44 | 1.0 | <ins> 0.0615 </ins> | 47 | 5.0 | 0.0662 |
|
|
120
|
+
| [friedman1](https://www.openml.org/t/361939) | 275 | 4.2 | <ins> 1.047 </ins> | 278 | 5.1 | 1.487 |
|
|
121
|
+
| [poker](https://www.openml.org/t/10102) | 38 | 0.6 | <ins> 0.256 </ins> | 41 | 1.2 | 0.722 |
|
|
122
|
+
| [subset_higgs](https://www.openml.org/t/361955) | 868 | 10.6 | <ins> 0.420 </ins> | 870 | 24.5 | 0.421 |
|
|
123
|
+
| [BNG(autoHorse)](https://www.openml.org/t/7319) | 107 | 1.1 | <ins> 19.0 </ins> | 107 | 3.2 | 20.5 |
|
|
124
|
+
| [BNG(pbc)](https://www.openml.org/t/7318) | 48 | 0.6 | <ins> 836.5 </ins> | 51 | 0.2 | 957.1 |
|
|
125
|
+
| average | 465 | 3.9 | - | 464 | 19.7 | - |
|
|
93
126
|
|
|
94
|
-
|
|
127
|
+
PerpetualBooster outperformed AutoGluon on 8 out of 10 regression tasks, training equally fast and inferring 5.1x faster.
|
|
95
128
|
|
|
96
|
-
|
|
97
|
-
from perpetual import PerpetualBooster
|
|
129
|
+
The results are summarized in the following table for classification tasks:
|
|
98
130
|
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
131
|
+
| OpenML Task | Perpetual Training Duration | Perpetual Inference Duration | Perpetual AUC | AutoGluon Training Duration | AutoGluon Inference Duration | AutoGluon AUC |
|
|
132
|
+
| -------------------------------------------------------- | ------- | ------ | ------------------- | -------- | ------ | ------------------ |
|
|
133
|
+
| [BNG(spambase)](https://www.openml.org/t/146163) | 70.1 | 2.1 | <ins> 0.671 </ins> | 73.1 | 3.7 | 0.669 |
|
|
134
|
+
| [BNG(trains)](https://www.openml.org/t/208) | 89.5 | 1.7 | <ins> 0.996 </ins> | 106.4 | 2.4 | 0.994 |
|
|
135
|
+
| [breast](https://www.openml.org/t/361942) | 13699.3 | 97.7 | <ins> 0.991 </ins> | 13330.7 | 79.7 | 0.949 |
|
|
136
|
+
| [Click_prediction_small](https://www.openml.org/t/7291) | 89.1 | 1.0 | <ins> 0.749 </ins> | 101.0 | 2.8 | 0.703 |
|
|
137
|
+
| [colon](https://www.openml.org/t/361938) | 12435.2 | 126.7 | <ins> 0.997 </ins> | 12356.2 | 152.3 | 0.997 |
|
|
138
|
+
| [Higgs](https://www.openml.org/t/362113) | 3485.3 | 40.9 | <ins> 0.843 </ins> | 3501.4 | 67.9 | 0.816 |
|
|
139
|
+
| [SEA(50000)](https://www.openml.org/t/230) | 21.9 | 0.2 | <ins> 0.936 </ins> | 25.6 | 0.5 | 0.935 |
|
|
140
|
+
| [sf-police-incidents](https://www.openml.org/t/359994) | 85.8 | 1.5 | <ins> 0.687 </ins> | 99.4 | 2.8 | 0.659 |
|
|
141
|
+
| [bates_classif_100](https://www.openml.org/t/361941) | 11152.8 | 50.0 | <ins> 0.864 </ins> | OOM | OOM | OOM |
|
|
142
|
+
| [prostate](https://www.openml.org/t/361945) | 13699.9 | 79.8 | <ins> 0.987 </ins> | OOM | OOM | OOM |
|
|
143
|
+
| average | 3747.0 | 34.0 | - | 3699.2 | 39.0 | - |
|
|
144
|
+
|
|
145
|
+
PerpetualBooster outperformed AutoGluon on 10 out of 10 classification tasks, training equally fast and inferring 1.1x faster.
|
|
146
|
+
|
|
147
|
+
PerpetualBooster demonstrates greater robustness compared to AutoGluon, successfully training on all 20 tasks, whereas AutoGluon encountered out-of-memory errors on 3 of those tasks.
|
|
148
|
+
|
|
149
|
+
The results can be reproduced using the automlbenchmark fork [here](https://github.com/deadsoul44/automlbenchmark).
|
|
102
150
|
|
|
103
|
-
## Documentation
|
|
104
151
|
|
|
105
|
-
Documentation for the Python API can be found [here](https://perpetual-ml.github.io/perpetual) and for the Rust API [here](https://docs.rs/perpetual/latest/perpetual/).
|
|
106
152
|
|
|
107
153
|
## Installation
|
|
108
154
|
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
perpetual-0.8.0.dist-info/METADATA,sha256=ltP-CG0Mf7qq2jMoDAS3hhZBSPFJdbDQ3MGlFuPwczc,11199
|
|
2
|
+
perpetual-0.8.0.dist-info/WHEEL,sha256=EqgtSuBVfoo49ZCSzjFhNjgk8PaMP7solNQLg3lziKE,94
|
|
3
|
+
perpetual-0.8.0.dist-info/license_files/LICENSE,sha256=gcuuhKKc5-dwvyvHsXjlC9oM6N5gZ6umYbC8ewW1Yvg,35821
|
|
4
|
+
perpetual-0.8.0.dist-info/license_files/LICENSE,sha256=gcuuhKKc5-dwvyvHsXjlC9oM6N5gZ6umYbC8ewW1Yvg,35821
|
|
5
|
+
perpetual/booster.py,sha256=HB0y3UNFEc0mL9FdmitFdZbPwUxrCN2-fqnCrN4XNrU,49886
|
|
6
|
+
perpetual/data.py,sha256=HiDsv2i1p9cLkXe8vnekxfpafyuxfWXwXrucdIir3xk,614
|
|
7
|
+
perpetual/serialize.py,sha256=FeW4JsUFVsrft9N7gz-ebn5mXvDv4LiJC2sgBEeGxYo,1957
|
|
8
|
+
perpetual/types.py,sha256=idZNsDErNTur_rJ_5Co8Pb6fik-AUn9lkrXmjbQJVX0,3381
|
|
9
|
+
perpetual/utils.py,sha256=nqwO6GFHi7I5iltuvgLT3NFaPm1h9cHlnomjFcdSfHY,7455
|
|
10
|
+
perpetual/__init__.py,sha256=V0RhghaG0CuKxKrzYUBYqrf7Drb-gjmznsbz9KT12lk,122
|
|
11
|
+
perpetual/perpetual.cp39-win_amd64.pyd,sha256=a9x8cCsIHjKcCfs89heVwWAhCkV0NAay90XwoULv1Ag,1670656
|
|
12
|
+
perpetual-0.8.0.dist-info/RECORD,,
|
|
@@ -1,12 +0,0 @@
|
|
|
1
|
-
perpetual-0.7.11.dist-info/METADATA,sha256=powxgbYl7rj8TyDkohrPHlVZvpriGrTNDTiY0sNh0e4,10014
|
|
2
|
-
perpetual-0.7.11.dist-info/WHEEL,sha256=EqgtSuBVfoo49ZCSzjFhNjgk8PaMP7solNQLg3lziKE,94
|
|
3
|
-
perpetual-0.7.11.dist-info/license_files/LICENSE,sha256=gcuuhKKc5-dwvyvHsXjlC9oM6N5gZ6umYbC8ewW1Yvg,35821
|
|
4
|
-
perpetual-0.7.11.dist-info/license_files/LICENSE,sha256=gcuuhKKc5-dwvyvHsXjlC9oM6N5gZ6umYbC8ewW1Yvg,35821
|
|
5
|
-
perpetual/booster.py,sha256=ICWJRuSxoaUgRHo9N8hodz1MlyRBVKPhVnfQJOes968,46919
|
|
6
|
-
perpetual/data.py,sha256=HiDsv2i1p9cLkXe8vnekxfpafyuxfWXwXrucdIir3xk,614
|
|
7
|
-
perpetual/serialize.py,sha256=FeW4JsUFVsrft9N7gz-ebn5mXvDv4LiJC2sgBEeGxYo,1957
|
|
8
|
-
perpetual/types.py,sha256=idZNsDErNTur_rJ_5Co8Pb6fik-AUn9lkrXmjbQJVX0,3381
|
|
9
|
-
perpetual/utils.py,sha256=i_7EB5xQXAGtODONhrOwfxRfH3YR7U0cQJvL8eUNFK8,7444
|
|
10
|
-
perpetual/__init__.py,sha256=V0RhghaG0CuKxKrzYUBYqrf7Drb-gjmznsbz9KT12lk,122
|
|
11
|
-
perpetual/perpetual.cp39-win_amd64.pyd,sha256=i_9sUGs3T__Y6gRwZaPOMxa7nlrMjrUrFCEl8GJcQD0,1513984
|
|
12
|
-
perpetual-0.7.11.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|