perpetual 0.7.12__cp313-none-win_amd64.whl → 0.8.1__cp313-none-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of perpetual might be problematic. Click here for more details.
- perpetual/booster.py +159 -58
- perpetual/perpetual.cp313-win_amd64.pyd +0 -0
- perpetual/utils.py +3 -1
- {perpetual-0.7.12.dist-info → perpetual-0.8.1.dist-info}/METADATA +59 -27
- perpetual-0.8.1.dist-info/RECORD +12 -0
- perpetual-0.7.12.dist-info/RECORD +0 -12
- {perpetual-0.7.12.dist-info → perpetual-0.8.1.dist-info}/WHEEL +0 -0
- {perpetual-0.7.12.dist-info → perpetual-0.8.1.dist-info}/license_files/LICENSE +0 -0
perpetual/booster.py
CHANGED
|
@@ -26,7 +26,7 @@ class PerpetualBooster:
|
|
|
26
26
|
# this is useful for parameters that should be
|
|
27
27
|
# attempted to be loaded in and set
|
|
28
28
|
# as attributes on the booster after it is loaded.
|
|
29
|
-
|
|
29
|
+
metadata_attributes: Dict[str, BaseSerializer] = {
|
|
30
30
|
"feature_names_in_": ObjectSerializer(),
|
|
31
31
|
"n_features_": ObjectSerializer(),
|
|
32
32
|
"feature_importance_method": ObjectSerializer(),
|
|
@@ -38,6 +38,7 @@ class PerpetualBooster:
|
|
|
38
38
|
self,
|
|
39
39
|
*,
|
|
40
40
|
objective: str = "LogLoss",
|
|
41
|
+
budget: float = 0.5,
|
|
41
42
|
num_threads: Optional[int] = None,
|
|
42
43
|
monotone_constraints: Union[Dict[Any, int], None] = None,
|
|
43
44
|
force_children_to_bound_parent: bool = False,
|
|
@@ -48,8 +49,7 @@ class PerpetualBooster:
|
|
|
48
49
|
missing_node_treatment: str = "None",
|
|
49
50
|
log_iterations: int = 0,
|
|
50
51
|
feature_importance_method: str = "Gain",
|
|
51
|
-
|
|
52
|
-
alpha: Optional[float] = None,
|
|
52
|
+
quantile: Optional[float] = None,
|
|
53
53
|
reset: Optional[bool] = None,
|
|
54
54
|
categorical_features: Union[Iterable[int], Iterable[str], str, None] = "auto",
|
|
55
55
|
timeout: Optional[float] = None,
|
|
@@ -59,16 +59,17 @@ class PerpetualBooster:
|
|
|
59
59
|
max_bin: int = 256,
|
|
60
60
|
max_cat: int = 1000,
|
|
61
61
|
):
|
|
62
|
-
"""PerpetualBooster class, used to
|
|
63
|
-
The following parameters can also be specified in the fit method to override the values in the constructor:
|
|
64
|
-
budget, alpha, reset, categorical_features, timeout, iteration_limit, memory_limit, and stopping_rounds.
|
|
62
|
+
"""PerpetualBooster class, used to create gradient boosted decision tree ensembles.
|
|
65
63
|
|
|
66
64
|
Args:
|
|
67
|
-
objective (str, optional): Learning objective function to be used for optimization.
|
|
68
|
-
|
|
65
|
+
objective (str, optional): Learning objective function to be used for optimization. Valid options are:
|
|
66
|
+
"LogLoss" to use logistic loss (classification),
|
|
69
67
|
"SquaredLoss" to use squared error (regression),
|
|
70
68
|
"QuantileLoss" to use quantile error (regression).
|
|
71
69
|
Defaults to "LogLoss".
|
|
70
|
+
budget (float, optional): a positive number for fitting budget. Increasing this number will more
|
|
71
|
+
likely result in more boosting rounds and increased predictive power.
|
|
72
|
+
Default value is 0.5.
|
|
72
73
|
num_threads (int, optional): Number of threads to be used during training.
|
|
73
74
|
monotone_constraints (Dict[Any, int], optional): Constraints that are used to enforce a
|
|
74
75
|
specific relationship between the training features and the target variable. A dictionary
|
|
@@ -105,10 +106,7 @@ class PerpetualBooster:
|
|
|
105
106
|
- "AverageNodeWeight": Set the missing node to be equal to the weighted average weight of the left and the right nodes.
|
|
106
107
|
log_iterations (int, optional): Setting to a value (N) other than zero will result in information being logged about every N iterations. This info can be accessed directly with the Python [`logging`](https://docs.python.org/3/howto/logging.html) module. For an example of how to utilize the logging information see the example [here](/#logging-output).
|
|
107
108
|
feature_importance_method (str, optional): The feature importance method type that will be used to calculate the `feature_importances_` attribute on the booster.
|
|
108
|
-
|
|
109
|
-
likely result in more boosting rounds and more increased predictive power.
|
|
110
|
-
Default value is 1.0.
|
|
111
|
-
alpha (float, optional): only used in quantile regression.
|
|
109
|
+
quantile (float, optional): only used in quantile regression.
|
|
112
110
|
reset (bool, optional): whether to reset the model or continue training.
|
|
113
111
|
categorical_features (Union[Iterable[int], Iterable[str], str, None], optional): The names or indices for categorical features.
|
|
114
112
|
Defaults to `auto` for Polars or Pandas categorical data types.
|
|
@@ -166,6 +164,7 @@ class PerpetualBooster:
|
|
|
166
164
|
)
|
|
167
165
|
|
|
168
166
|
self.objective = objective
|
|
167
|
+
self.budget = budget
|
|
169
168
|
self.num_threads = num_threads
|
|
170
169
|
self.monotone_constraints = monotone_constraints_
|
|
171
170
|
self.force_children_to_bound_parent = force_children_to_bound_parent
|
|
@@ -176,8 +175,7 @@ class PerpetualBooster:
|
|
|
176
175
|
self.missing_node_treatment = missing_node_treatment
|
|
177
176
|
self.log_iterations = log_iterations
|
|
178
177
|
self.feature_importance_method = feature_importance_method
|
|
179
|
-
self.
|
|
180
|
-
self.alpha = alpha
|
|
178
|
+
self.quantile = quantile
|
|
181
179
|
self.reset = reset
|
|
182
180
|
self.categorical_features = categorical_features
|
|
183
181
|
self.timeout = timeout
|
|
@@ -189,6 +187,7 @@ class PerpetualBooster:
|
|
|
189
187
|
|
|
190
188
|
booster = CratePerpetualBooster(
|
|
191
189
|
objective=self.objective,
|
|
190
|
+
budget=self.budget,
|
|
192
191
|
max_bin=self.max_bin,
|
|
193
192
|
num_threads=self.num_threads,
|
|
194
193
|
monotone_constraints=dict(),
|
|
@@ -199,23 +198,17 @@ class PerpetualBooster:
|
|
|
199
198
|
terminate_missing_features=set(),
|
|
200
199
|
missing_node_treatment=self.missing_node_treatment,
|
|
201
200
|
log_iterations=self.log_iterations,
|
|
201
|
+
quantile=self.quantile,
|
|
202
|
+
reset=self.reset,
|
|
203
|
+
categorical_features=set(),
|
|
204
|
+
timeout=self.timeout,
|
|
205
|
+
iteration_limit=self.iteration_limit,
|
|
206
|
+
memory_limit=self.memory_limit,
|
|
207
|
+
stopping_rounds=self.stopping_rounds,
|
|
202
208
|
)
|
|
203
209
|
self.booster = cast(BoosterType, booster)
|
|
204
210
|
|
|
205
|
-
def fit(
|
|
206
|
-
self,
|
|
207
|
-
X,
|
|
208
|
-
y,
|
|
209
|
-
sample_weight=None,
|
|
210
|
-
budget: Optional[float] = None,
|
|
211
|
-
alpha: Optional[float] = None,
|
|
212
|
-
reset: Optional[bool] = None,
|
|
213
|
-
categorical_features: Union[Iterable[int], Iterable[str], str, None] = "auto",
|
|
214
|
-
timeout: Optional[float] = None,
|
|
215
|
-
iteration_limit: Optional[int] = None,
|
|
216
|
-
memory_limit: Optional[float] = None,
|
|
217
|
-
stopping_rounds: Optional[int] = None,
|
|
218
|
-
) -> Self:
|
|
211
|
+
def fit(self, X, y, sample_weight=None) -> Self:
|
|
219
212
|
"""Fit the gradient booster on a provided dataset.
|
|
220
213
|
|
|
221
214
|
Args:
|
|
@@ -225,26 +218,10 @@ class PerpetualBooster:
|
|
|
225
218
|
sample_weight (Union[ArrayLike, None], optional): Instance weights to use when
|
|
226
219
|
training the model. If None is passed, a weight of 1 will be used for every record.
|
|
227
220
|
Defaults to None.
|
|
228
|
-
budget (float, optional): a positive number for fitting budget. Increasing this number will more
|
|
229
|
-
likely result in more boosting rounds and more increased predictive power.
|
|
230
|
-
Defaults to 1.0.
|
|
231
|
-
alpha (float, optional): only used in quantile regression.
|
|
232
|
-
reset (bool, optional): whether to reset the model or continue training.
|
|
233
|
-
categorical_features (Union[Iterable[int], Iterable[str], str, None], optional): The names or indices for categorical features.
|
|
234
|
-
Defaults to `auto` for Polars or Pandas categorical data types.
|
|
235
|
-
timeout (float, optional): optional fit timeout in seconds
|
|
236
|
-
iteration_limit (int, optional): optional limit for the number of boosting rounds. The default value is 1000 boosting rounds.
|
|
237
|
-
The algorithm automatically stops for most of the cases before hitting this limit.
|
|
238
|
-
If you want to experiment with very high budget (>2.0), you can also increase this limit.
|
|
239
|
-
memory_limit (float, optional): optional limit for memory allocation in GB. If not set, the memory will be allocated based on
|
|
240
|
-
available memory and the algorithm requirements.
|
|
241
|
-
stopping_rounds (int, optional): optional limit for auto stopping. Defaults to 3.
|
|
242
221
|
"""
|
|
243
222
|
|
|
244
223
|
features_, flat_data, rows, cols, categorical_features_, cat_mapping = (
|
|
245
|
-
convert_input_frame(
|
|
246
|
-
X, categorical_features or self.categorical_features, self.max_cat
|
|
247
|
-
)
|
|
224
|
+
convert_input_frame(X, self.categorical_features, self.max_cat)
|
|
248
225
|
)
|
|
249
226
|
self.n_features_ = cols
|
|
250
227
|
self.cat_mapping = cat_mapping
|
|
@@ -268,6 +245,7 @@ class PerpetualBooster:
|
|
|
268
245
|
):
|
|
269
246
|
booster = CratePerpetualBooster(
|
|
270
247
|
objective=self.objective,
|
|
248
|
+
budget=self.budget,
|
|
271
249
|
max_bin=self.max_bin,
|
|
272
250
|
num_threads=self.num_threads,
|
|
273
251
|
monotone_constraints=crate_mc,
|
|
@@ -278,12 +256,20 @@ class PerpetualBooster:
|
|
|
278
256
|
terminate_missing_features=crate_tmf,
|
|
279
257
|
missing_node_treatment=self.missing_node_treatment,
|
|
280
258
|
log_iterations=self.log_iterations,
|
|
259
|
+
quantile=self.quantile,
|
|
260
|
+
reset=self.reset,
|
|
261
|
+
categorical_features=categorical_features_,
|
|
262
|
+
timeout=self.timeout,
|
|
263
|
+
iteration_limit=self.iteration_limit,
|
|
264
|
+
memory_limit=self.memory_limit,
|
|
265
|
+
stopping_rounds=self.stopping_rounds,
|
|
281
266
|
)
|
|
282
267
|
self.booster = cast(BoosterType, booster)
|
|
283
268
|
else:
|
|
284
269
|
booster = CrateMultiOutputBooster(
|
|
285
270
|
n_boosters=len(classes_),
|
|
286
271
|
objective=self.objective,
|
|
272
|
+
budget=self.budget,
|
|
287
273
|
max_bin=self.max_bin,
|
|
288
274
|
num_threads=self.num_threads,
|
|
289
275
|
monotone_constraints=crate_mc,
|
|
@@ -294,6 +280,13 @@ class PerpetualBooster:
|
|
|
294
280
|
terminate_missing_features=crate_tmf,
|
|
295
281
|
missing_node_treatment=self.missing_node_treatment,
|
|
296
282
|
log_iterations=self.log_iterations,
|
|
283
|
+
quantile=self.quantile,
|
|
284
|
+
reset=self.reset,
|
|
285
|
+
categorical_features=categorical_features_,
|
|
286
|
+
timeout=self.timeout,
|
|
287
|
+
iteration_limit=self.iteration_limit,
|
|
288
|
+
memory_limit=self.memory_limit,
|
|
289
|
+
stopping_rounds=self.stopping_rounds,
|
|
297
290
|
)
|
|
298
291
|
self.booster = cast(MultiOutputBoosterType, booster)
|
|
299
292
|
|
|
@@ -305,20 +298,97 @@ class PerpetualBooster:
|
|
|
305
298
|
)
|
|
306
299
|
self._set_metadata_attributes("classes_", self.classes_)
|
|
307
300
|
|
|
301
|
+
self.categorical_features = categorical_features_
|
|
302
|
+
|
|
308
303
|
self.booster.fit(
|
|
309
304
|
flat_data=flat_data,
|
|
310
305
|
rows=rows,
|
|
311
306
|
cols=cols,
|
|
312
307
|
y=y_,
|
|
313
|
-
budget=budget or self.budget,
|
|
314
308
|
sample_weight=sample_weight_, # type: ignore
|
|
315
|
-
|
|
316
|
-
|
|
317
|
-
|
|
318
|
-
|
|
319
|
-
|
|
320
|
-
|
|
321
|
-
|
|
309
|
+
)
|
|
310
|
+
|
|
311
|
+
return self
|
|
312
|
+
|
|
313
|
+
def prune(self, X, y, sample_weight=None) -> Self:
|
|
314
|
+
"""Prune the gradient booster on a provided dataset.
|
|
315
|
+
|
|
316
|
+
Args:
|
|
317
|
+
X (FrameLike): Either a Polars or Pandas DataFrame, or a 2 dimensional Numpy array.
|
|
318
|
+
y (Union[FrameLike, ArrayLike]): Either a Polars or Pandas DataFrame or Series,
|
|
319
|
+
or a 1 or 2 dimensional Numpy array.
|
|
320
|
+
sample_weight (Union[ArrayLike, None], optional): Instance weights to use when
|
|
321
|
+
training the model. If None is passed, a weight of 1 will be used for every record.
|
|
322
|
+
Defaults to None.
|
|
323
|
+
"""
|
|
324
|
+
|
|
325
|
+
_, flat_data, rows, cols = transform_input_frame(X, self.cat_mapping)
|
|
326
|
+
|
|
327
|
+
y_, _ = convert_input_array(y, self.objective)
|
|
328
|
+
|
|
329
|
+
if sample_weight is None:
|
|
330
|
+
sample_weight_ = None
|
|
331
|
+
else:
|
|
332
|
+
sample_weight_, _ = convert_input_array(sample_weight, self.objective)
|
|
333
|
+
|
|
334
|
+
self.booster.prune(
|
|
335
|
+
flat_data=flat_data,
|
|
336
|
+
rows=rows,
|
|
337
|
+
cols=cols,
|
|
338
|
+
y=y_,
|
|
339
|
+
sample_weight=sample_weight_, # type: ignore
|
|
340
|
+
)
|
|
341
|
+
|
|
342
|
+
return self
|
|
343
|
+
|
|
344
|
+
def calibrate(
|
|
345
|
+
self, X_train, y_train, X_cal, y_cal, alpha, sample_weight=None
|
|
346
|
+
) -> Self:
|
|
347
|
+
"""Calibrate the gradient booster on a provided dataset.
|
|
348
|
+
|
|
349
|
+
Args:
|
|
350
|
+
X_train (FrameLike): Either a Polars or Pandas DataFrame, or a 2 dimensional Numpy array.
|
|
351
|
+
y_train (Union[FrameLike, ArrayLike]): Either a Polars or Pandas DataFrame or Series,
|
|
352
|
+
or a 1 or 2 dimensional Numpy array.
|
|
353
|
+
X_cal (FrameLike): Either a Polars or Pandas DataFrame, or a 2 dimensional Numpy array.
|
|
354
|
+
y_cal (Union[FrameLike, ArrayLike]): Either a Polars or Pandas DataFrame or Series,
|
|
355
|
+
or a 1 or 2 dimensional Numpy array.
|
|
356
|
+
alpha (ArrayLike): Between 0 and 1, represents the uncertainty of the confidence interval.
|
|
357
|
+
Lower alpha produces larger (more conservative) prediction intervals.
|
|
358
|
+
alpha is the complement of the target coverage level.
|
|
359
|
+
sample_weight (Union[ArrayLike, None], optional): Instance weights to use when
|
|
360
|
+
training the model. If None is passed, a weight of 1 will be used for every record.
|
|
361
|
+
Defaults to None.
|
|
362
|
+
"""
|
|
363
|
+
|
|
364
|
+
_, flat_data_train, rows_train, cols_train = transform_input_frame(
|
|
365
|
+
X_train, self.cat_mapping
|
|
366
|
+
)
|
|
367
|
+
|
|
368
|
+
y_train_, _ = convert_input_array(y_train, self.objective)
|
|
369
|
+
|
|
370
|
+
_, flat_data_cal, rows_cal, cols_cal = transform_input_frame(
|
|
371
|
+
X_cal, self.cat_mapping
|
|
372
|
+
)
|
|
373
|
+
|
|
374
|
+
y_cal_, _ = convert_input_array(y_cal, self.objective)
|
|
375
|
+
|
|
376
|
+
if sample_weight is None:
|
|
377
|
+
sample_weight_ = None
|
|
378
|
+
else:
|
|
379
|
+
sample_weight_, _ = convert_input_array(sample_weight, self.objective)
|
|
380
|
+
|
|
381
|
+
self.booster.calibrate(
|
|
382
|
+
flat_data=flat_data_train,
|
|
383
|
+
rows=rows_train,
|
|
384
|
+
cols=cols_train,
|
|
385
|
+
y=y_train_,
|
|
386
|
+
flat_data_cal=flat_data_cal,
|
|
387
|
+
rows_cal=rows_cal,
|
|
388
|
+
cols_cal=cols_cal,
|
|
389
|
+
y_cal=y_cal_,
|
|
390
|
+
alpha=np.array(alpha),
|
|
391
|
+
sample_weight=sample_weight_, # type: ignore
|
|
322
392
|
)
|
|
323
393
|
|
|
324
394
|
return self
|
|
@@ -331,6 +401,29 @@ class PerpetualBooster:
|
|
|
331
401
|
f"Columns mismatch between data {features} passed, and data {self.feature_names_in_} used at fit."
|
|
332
402
|
)
|
|
333
403
|
|
|
404
|
+
def predict_intervals(self, X, parallel: Union[bool, None] = None) -> dict:
|
|
405
|
+
"""Predict intervals with the fitted booster on new data.
|
|
406
|
+
|
|
407
|
+
Args:
|
|
408
|
+
X (FrameLike): Either a Polars or Pandas DataFrame, or a 2 dimensional Numpy array.
|
|
409
|
+
parallel (Union[bool, None], optional): Optionally specify if the predict
|
|
410
|
+
function should run in parallel on multiple threads. If `None` is
|
|
411
|
+
passed, the `parallel` attribute of the booster will be used.
|
|
412
|
+
Defaults to `None`.
|
|
413
|
+
|
|
414
|
+
Returns:
|
|
415
|
+
dict: Returns a dictionary of the predicted intervals.
|
|
416
|
+
"""
|
|
417
|
+
features_, flat_data, rows, cols = transform_input_frame(X, self.cat_mapping)
|
|
418
|
+
self._validate_features(features_)
|
|
419
|
+
|
|
420
|
+
return self.booster.predict_intervals(
|
|
421
|
+
flat_data=flat_data,
|
|
422
|
+
rows=rows,
|
|
423
|
+
cols=cols,
|
|
424
|
+
parallel=parallel,
|
|
425
|
+
)
|
|
426
|
+
|
|
334
427
|
def predict(self, X, parallel: Union[bool, None] = None) -> np.ndarray:
|
|
335
428
|
"""Predict with the fitted booster on new data.
|
|
336
429
|
|
|
@@ -699,14 +792,17 @@ class PerpetualBooster:
|
|
|
699
792
|
Returns:
|
|
700
793
|
PerpetualBooster: An initialized booster object.
|
|
701
794
|
"""
|
|
702
|
-
|
|
795
|
+
try:
|
|
796
|
+
booster = CratePerpetualBooster.load_booster(str(path))
|
|
797
|
+
except ValueError:
|
|
798
|
+
booster = CrateMultiOutputBooster.load_booster(str(path))
|
|
703
799
|
|
|
704
800
|
params = booster.get_params()
|
|
705
801
|
with warnings.catch_warnings():
|
|
706
802
|
warnings.simplefilter("ignore")
|
|
707
803
|
c = cls(**params)
|
|
708
804
|
c.booster = booster
|
|
709
|
-
for m in c.
|
|
805
|
+
for m in c.metadata_attributes:
|
|
710
806
|
try:
|
|
711
807
|
m_ = c._get_metadata_attributes(m)
|
|
712
808
|
setattr(c, m, m_)
|
|
@@ -774,12 +870,12 @@ class PerpetualBooster:
|
|
|
774
870
|
return v
|
|
775
871
|
|
|
776
872
|
def _set_metadata_attributes(self, key: str, value: Any) -> None:
|
|
777
|
-
value_ = self.
|
|
873
|
+
value_ = self.metadata_attributes[key].serialize(value)
|
|
778
874
|
self.insert_metadata(key=key, value=value_)
|
|
779
875
|
|
|
780
876
|
def _get_metadata_attributes(self, key: str) -> Any:
|
|
781
877
|
value = self.get_metadata(key)
|
|
782
|
-
return self.
|
|
878
|
+
return self.metadata_attributes[key].deserialize(value)
|
|
783
879
|
|
|
784
880
|
@property
|
|
785
881
|
def base_score(self) -> Union[float, Iterable[float]]:
|
|
@@ -810,7 +906,12 @@ class PerpetualBooster:
|
|
|
810
906
|
|
|
811
907
|
def __setstate__(self, d: Dict[Any, Any]) -> None:
|
|
812
908
|
# Load the booster object from the pickled JSON string.
|
|
813
|
-
|
|
909
|
+
try:
|
|
910
|
+
booster_object = CratePerpetualBooster.from_json(d["__booster_json_file__"])
|
|
911
|
+
except ValueError:
|
|
912
|
+
booster_object = CrateMultiOutputBooster.from_json(
|
|
913
|
+
d["__booster_json_file__"]
|
|
914
|
+
)
|
|
814
915
|
d["booster"] = booster_object
|
|
815
916
|
# Are there any new parameters, that need to be added to the python object,
|
|
816
917
|
# that would have been loaded in as defaults on the json object?
|
|
Binary file
|
perpetual/utils.py
CHANGED
|
@@ -65,7 +65,9 @@ def convert_input_array(x, objective) -> np.ndarray:
|
|
|
65
65
|
|
|
66
66
|
|
|
67
67
|
def convert_input_frame(
|
|
68
|
-
X,
|
|
68
|
+
X,
|
|
69
|
+
categorical_features,
|
|
70
|
+
max_cat,
|
|
69
71
|
) -> Tuple[List[str], np.ndarray, int, int, Optional[Iterable[int]], Optional[Dict]]:
|
|
70
72
|
"""Convert data to format needed by booster.
|
|
71
73
|
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.3
|
|
2
2
|
Name: perpetual
|
|
3
|
-
Version: 0.
|
|
3
|
+
Version: 0.8.1
|
|
4
4
|
Classifier: Programming Language :: Rust
|
|
5
5
|
Classifier: Programming Language :: Python :: 3
|
|
6
6
|
Classifier: Programming Language :: Python :: 3.9
|
|
@@ -10,6 +10,7 @@ Classifier: Programming Language :: Python :: 3.12
|
|
|
10
10
|
Classifier: Programming Language :: Python :: 3.13
|
|
11
11
|
Requires-Dist: numpy
|
|
12
12
|
Requires-Dist: typing-extensions
|
|
13
|
+
Requires-Dist: black ; extra == 'dev'
|
|
13
14
|
Requires-Dist: pandas ; extra == 'dev'
|
|
14
15
|
Requires-Dist: polars ; extra == 'dev'
|
|
15
16
|
Requires-Dist: pyarrow ; extra == 'dev'
|
|
@@ -24,7 +25,7 @@ Requires-Dist: ruff ; extra == 'dev'
|
|
|
24
25
|
Provides-Extra: dev
|
|
25
26
|
License-File: LICENSE
|
|
26
27
|
License-File: LICENSE
|
|
27
|
-
Summary: A self-generalizing gradient boosting machine
|
|
28
|
+
Summary: A self-generalizing gradient boosting machine that doesn't need hyperparameter optimization
|
|
28
29
|
Keywords: rust,perpetual,machine learning,tree model,decision tree,gradient boosted decision tree,gradient boosting machine
|
|
29
30
|
Home-Page: https://perpetual-ml.com
|
|
30
31
|
Author: Mutlu Simsek
|
|
@@ -49,10 +50,28 @@ Project-URL: Source Code, https://github.com/perpetual-ml/perpetual
|
|
|
49
50
|
|
|
50
51
|
# Perpetual
|
|
51
52
|
|
|
52
|
-
PerpetualBooster is a gradient boosting machine (GBM) algorithm
|
|
53
|
+
PerpetualBooster is a gradient boosting machine (GBM) algorithm that doesn't need hyperparameter optimization unlike other GBM algorithms. Similar to AutoML libraries, it has a `budget` parameter. Increasing the `budget` parameter increases the predictive power of the algorithm and gives better results on unseen data. Start with a small budget (e.g. 0.5) and increase it (e.g. 1.0) once you are confident with your features. If you don't see any improvement with further increasing the `budget`, it means that you are already extracting the most predictive power out of your data.
|
|
54
|
+
|
|
55
|
+
## Usage
|
|
56
|
+
|
|
57
|
+
You can use the algorithm like in the example below. Check examples folders for both Rust and Python.
|
|
58
|
+
|
|
59
|
+
```python
|
|
60
|
+
from perpetual import PerpetualBooster
|
|
61
|
+
|
|
62
|
+
model = PerpetualBooster(objective="SquaredLoss", budget=0.5)
|
|
63
|
+
model.fit(X, y)
|
|
64
|
+
```
|
|
65
|
+
|
|
66
|
+
## Documentation
|
|
67
|
+
|
|
68
|
+
Documentation for the Python API can be found [here](https://perpetual-ml.github.io/perpetual) and for the Rust API [here](https://docs.rs/perpetual/latest/perpetual/).
|
|
69
|
+
|
|
53
70
|
|
|
54
71
|
## Benchmark
|
|
55
72
|
|
|
73
|
+
### PerpetualBooster vs. Optuna + LightGBM
|
|
74
|
+
|
|
56
75
|
Hyperparameter optimization usually takes 100 iterations with plain GBM algorithms. PerpetualBooster achieves the same accuracy in a single run. Thus, it achieves up to 100x speed-up at the same accuracy with different `budget` levels and with different datasets.
|
|
57
76
|
|
|
58
77
|
The following table summarizes the results for the [California Housing](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.fetch_california_housing.html) dataset (regression):
|
|
@@ -71,38 +90,51 @@ The following table summarizes the results for the [Cover Types](https://scikit-
|
|
|
71
90
|
|
|
72
91
|
The results can be reproduced using the scripts in the [examples](./python-package/examples) folder.
|
|
73
92
|
|
|
74
|
-
PerpetualBooster
|
|
93
|
+
### PerpetualBooster vs. AutoGluon
|
|
75
94
|
|
|
76
|
-
|
|
77
|
-
| -------------------------------------------- | --------------------------- | ----------------------------------------------------------------- | -------------- | --------------------------- | ----------------------------------------------------------------- | -------------- |
|
|
78
|
-
| [Airlines_DepDelay_10M](https://www.openml.org/t/359929) | 518 | 11.3 | 29.0 | 520 | 30.9 <td style="background-color:green;color:white;"> 28.8 </td> |
|
|
79
|
-
| [bates_regr_100](https://www.openml.org/t/361940) | 3421 | 15.1 <td style="background-color:green;color:white;"> 1.084 </td> | OOM | OOM | OOM |
|
|
80
|
-
| [BNG(libras_move)](https://www.openml.org/t/7327) | 1956 | 4.2 <td style="background-color:green;color:white;"> 2.51 </td> | 1922 | 97.6 | 2.53 |
|
|
81
|
-
| [BNG(satellite_image)](https://www.openml.org/t/7326) | 334 | 1.6 | 0.731 | 337 | 10.0 <td style="background-color:green;color:white;"> 0.721 </td> |
|
|
82
|
-
| [COMET_MC](https://www.openml.org/t/14949) | 44 | 1.0 <td style="background-color:green;color:white;"> 0.0615 </td> | 47 | 5.0 | 0.0662 |
|
|
83
|
-
| [friedman1](https://www.openml.org/t/361939) | 275 | 4.2 <td style="background-color:green;color:white;"> 1.047 </td> | 278 | 5.1 | 1.487 |
|
|
84
|
-
| [poker](https://www.openml.org/t/10102) | 38 | 0.6 <td style="background-color:green;color:white;"> 0.256 </td> | 41 | 1.2 | 0.722 |
|
|
85
|
-
| [subset_higgs](https://www.openml.org/t/361955) | 868 | 10.6 <td style="background-color:green;color:white;"> 0.420 </td> | 870 | 24.5 | 0.421 |
|
|
86
|
-
| [BNG(autoHorse)](https://www.openml.org/t/7319) | 107 | 1.1 <td style="background-color:green;color:white;"> 19.0 </td> | 107 | 3.2 | 20.5 |
|
|
87
|
-
| [BNG(pbc)](https://www.openml.org/t/7318) | 48 | 0.6 <td style="background-color:green;color:white;"> 836.5 </td> | 51 | 0.2 | 957.1 |
|
|
88
|
-
| average | 465 | 3.9 | - | 464 | 19.7 | - |
|
|
95
|
+
PerpetualBooster is a GBM but behaves like AutoML, so it is also benchmarked against AutoGluon (v1.2, best quality preset), the current leader in the [AutoML benchmark](https://automlbenchmark.streamlit.app/cd_diagram). The 10 datasets with the largest number of rows are selected from [OpenML datasets](https://www.openml.org/) for both regression and classification tasks.
|
|
89
96
|
|
|
90
|
-
|
|
97
|
+
The results are summarized in the following table for regression tasks:
|
|
91
98
|
|
|
92
|
-
|
|
99
|
+
| OpenML Task | Perpetual Training Duration | Perpetual Inference Duration | Perpetual RMSE | AutoGluon Training Duration | AutoGluon Inference Duration | AutoGluon RMSE |
|
|
100
|
+
| -------------------------------------------------------- | ----- | ----- | ------------------- | -------- | ------ | ------------------ |
|
|
101
|
+
| [Airlines_DepDelay_10M](https://www.openml.org/t/359929) | 518 | 11.3 | 29.0 | 520 | 30.9 | <ins> 28.8 </ins> |
|
|
102
|
+
| [bates_regr_100](https://www.openml.org/t/361940) | 3421 | 15.1 | <ins> 1.084 </ins> | OOM | OOM | OOM |
|
|
103
|
+
| [BNG(libras_move)](https://www.openml.org/t/7327) | 1956 | 4.2 | <ins> 2.51 </ins> | 1922 | 97.6 | 2.53 |
|
|
104
|
+
| [BNG(satellite_image)](https://www.openml.org/t/7326) | 334 | 1.6 | 0.731 | 337 | 10.0 | <ins> 0.721 </ins> |
|
|
105
|
+
| [COMET_MC](https://www.openml.org/t/14949) | 44 | 1.0 | <ins> 0.0615 </ins> | 47 | 5.0 | 0.0662 |
|
|
106
|
+
| [friedman1](https://www.openml.org/t/361939) | 275 | 4.2 | <ins> 1.047 </ins> | 278 | 5.1 | 1.487 |
|
|
107
|
+
| [poker](https://www.openml.org/t/10102) | 38 | 0.6 | <ins> 0.256 </ins> | 41 | 1.2 | 0.722 |
|
|
108
|
+
| [subset_higgs](https://www.openml.org/t/361955) | 868 | 10.6 | <ins> 0.420 </ins> | 870 | 24.5 | 0.421 |
|
|
109
|
+
| [BNG(autoHorse)](https://www.openml.org/t/7319) | 107 | 1.1 | <ins> 19.0 </ins> | 107 | 3.2 | 20.5 |
|
|
110
|
+
| [BNG(pbc)](https://www.openml.org/t/7318) | 48 | 0.6 | <ins> 836.5 </ins> | 51 | 0.2 | 957.1 |
|
|
111
|
+
| average | 465 | 3.9 | - | 464 | 19.7 | - |
|
|
93
112
|
|
|
94
|
-
|
|
113
|
+
PerpetualBooster outperformed AutoGluon on 8 out of 10 regression tasks, training equally fast and inferring 5.1x faster.
|
|
95
114
|
|
|
96
|
-
|
|
97
|
-
from perpetual import PerpetualBooster
|
|
115
|
+
The results are summarized in the following table for classification tasks:
|
|
98
116
|
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
117
|
+
| OpenML Task | Perpetual Training Duration | Perpetual Inference Duration | Perpetual AUC | AutoGluon Training Duration | AutoGluon Inference Duration | AutoGluon AUC |
|
|
118
|
+
| -------------------------------------------------------- | ------- | ------ | ------------------- | -------- | ------ | ------------------ |
|
|
119
|
+
| [BNG(spambase)](https://www.openml.org/t/146163) | 70.1 | 2.1 | <ins> 0.671 </ins> | 73.1 | 3.7 | 0.669 |
|
|
120
|
+
| [BNG(trains)](https://www.openml.org/t/208) | 89.5 | 1.7 | <ins> 0.996 </ins> | 106.4 | 2.4 | 0.994 |
|
|
121
|
+
| [breast](https://www.openml.org/t/361942) | 13699.3 | 97.7 | <ins> 0.991 </ins> | 13330.7 | 79.7 | 0.949 |
|
|
122
|
+
| [Click_prediction_small](https://www.openml.org/t/7291) | 89.1 | 1.0 | <ins> 0.749 </ins> | 101.0 | 2.8 | 0.703 |
|
|
123
|
+
| [colon](https://www.openml.org/t/361938) | 12435.2 | 126.7 | <ins> 0.997 </ins> | 12356.2 | 152.3 | 0.997 |
|
|
124
|
+
| [Higgs](https://www.openml.org/t/362113) | 3485.3 | 40.9 | <ins> 0.843 </ins> | 3501.4 | 67.9 | 0.816 |
|
|
125
|
+
| [SEA(50000)](https://www.openml.org/t/230) | 21.9 | 0.2 | <ins> 0.936 </ins> | 25.6 | 0.5 | 0.935 |
|
|
126
|
+
| [sf-police-incidents](https://www.openml.org/t/359994) | 85.8 | 1.5 | <ins> 0.687 </ins> | 99.4 | 2.8 | 0.659 |
|
|
127
|
+
| [bates_classif_100](https://www.openml.org/t/361941) | 11152.8 | 50.0 | <ins> 0.864 </ins> | OOM | OOM | OOM |
|
|
128
|
+
| [prostate](https://www.openml.org/t/361945) | 13699.9 | 79.8 | <ins> 0.987 </ins> | OOM | OOM | OOM |
|
|
129
|
+
| average | 3747.0 | 34.0 | - | 3699.2 | 39.0 | - |
|
|
130
|
+
|
|
131
|
+
PerpetualBooster outperformed AutoGluon on 10 out of 10 classification tasks, training equally fast and inferring 1.1x faster.
|
|
132
|
+
|
|
133
|
+
PerpetualBooster demonstrates greater robustness compared to AutoGluon, successfully training on all 20 tasks, whereas AutoGluon encountered out-of-memory errors on 3 of those tasks.
|
|
134
|
+
|
|
135
|
+
The results can be reproduced using the automlbenchmark fork [here](https://github.com/deadsoul44/automlbenchmark).
|
|
102
136
|
|
|
103
|
-
## Documentation
|
|
104
137
|
|
|
105
|
-
Documentation for the Python API can be found [here](https://perpetual-ml.github.io/perpetual) and for the Rust API [here](https://docs.rs/perpetual/latest/perpetual/).
|
|
106
138
|
|
|
107
139
|
## Installation
|
|
108
140
|
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
perpetual-0.8.1.dist-info/METADATA,sha256=UNcSudsW5App4W9EnFAgjxFrhbHwfbomtz7jwGIVi5s,10752
|
|
2
|
+
perpetual-0.8.1.dist-info/WHEEL,sha256=iNzfSeughQ6gviCftXhu6zZQCMTOJAdqefPsfmeKgU8,95
|
|
3
|
+
perpetual-0.8.1.dist-info/license_files/LICENSE,sha256=gcuuhKKc5-dwvyvHsXjlC9oM6N5gZ6umYbC8ewW1Yvg,35821
|
|
4
|
+
perpetual-0.8.1.dist-info/license_files/LICENSE,sha256=gcuuhKKc5-dwvyvHsXjlC9oM6N5gZ6umYbC8ewW1Yvg,35821
|
|
5
|
+
perpetual/booster.py,sha256=ne-RgsYIjzQAYiTtI1PJ_IpolnS-C89trIFCdoiZoH4,50118
|
|
6
|
+
perpetual/data.py,sha256=HiDsv2i1p9cLkXe8vnekxfpafyuxfWXwXrucdIir3xk,614
|
|
7
|
+
perpetual/serialize.py,sha256=FeW4JsUFVsrft9N7gz-ebn5mXvDv4LiJC2sgBEeGxYo,1957
|
|
8
|
+
perpetual/types.py,sha256=idZNsDErNTur_rJ_5Co8Pb6fik-AUn9lkrXmjbQJVX0,3381
|
|
9
|
+
perpetual/utils.py,sha256=nqwO6GFHi7I5iltuvgLT3NFaPm1h9cHlnomjFcdSfHY,7455
|
|
10
|
+
perpetual/__init__.py,sha256=V0RhghaG0CuKxKrzYUBYqrf7Drb-gjmznsbz9KT12lk,122
|
|
11
|
+
perpetual/perpetual.cp313-win_amd64.pyd,sha256=lrWyuEZiMH8l75Tprn383NMkPDGh7q18LRIlKYzzE14,1661952
|
|
12
|
+
perpetual-0.8.1.dist-info/RECORD,,
|
|
@@ -1,12 +0,0 @@
|
|
|
1
|
-
perpetual-0.7.12.dist-info/METADATA,sha256=Sfq0haXk0OttukvSaG6ARYPQipfeQ7ZIO_m7iaqkvys,10014
|
|
2
|
-
perpetual-0.7.12.dist-info/WHEEL,sha256=iNzfSeughQ6gviCftXhu6zZQCMTOJAdqefPsfmeKgU8,95
|
|
3
|
-
perpetual-0.7.12.dist-info/license_files/LICENSE,sha256=gcuuhKKc5-dwvyvHsXjlC9oM6N5gZ6umYbC8ewW1Yvg,35821
|
|
4
|
-
perpetual-0.7.12.dist-info/license_files/LICENSE,sha256=gcuuhKKc5-dwvyvHsXjlC9oM6N5gZ6umYbC8ewW1Yvg,35821
|
|
5
|
-
perpetual/booster.py,sha256=ICWJRuSxoaUgRHo9N8hodz1MlyRBVKPhVnfQJOes968,46919
|
|
6
|
-
perpetual/data.py,sha256=HiDsv2i1p9cLkXe8vnekxfpafyuxfWXwXrucdIir3xk,614
|
|
7
|
-
perpetual/serialize.py,sha256=FeW4JsUFVsrft9N7gz-ebn5mXvDv4LiJC2sgBEeGxYo,1957
|
|
8
|
-
perpetual/types.py,sha256=idZNsDErNTur_rJ_5Co8Pb6fik-AUn9lkrXmjbQJVX0,3381
|
|
9
|
-
perpetual/utils.py,sha256=i_7EB5xQXAGtODONhrOwfxRfH3YR7U0cQJvL8eUNFK8,7444
|
|
10
|
-
perpetual/__init__.py,sha256=V0RhghaG0CuKxKrzYUBYqrf7Drb-gjmznsbz9KT12lk,122
|
|
11
|
-
perpetual/perpetual.cp313-win_amd64.pyd,sha256=wB4UC94u1mcKNicec_h62WjjJJwp1WOerPjMAcKyELY,1509376
|
|
12
|
-
perpetual-0.7.12.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|