perpetual-1.1.1-cp312-cp312-macosx_10_12_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
perpetual/utils.py ADDED
@@ -0,0 +1,463 @@
+ import logging
+ from typing import Dict, Iterable, List, Optional, Tuple
+
+ import numpy as np
+
+ logger = logging.getLogger(__name__)
+
+
+ def type_df(df):
+     """Return "pandas_df", "polars_df" or "numpy" depending on what backs `df`."""
+     library_name = type(df).__module__.split(".")[0]
+     if type(df).__name__ == "DataFrame":
+         if library_name == "pandas":
+             return "pandas_df"
+         elif library_name == "polars":
+             return "polars_df"
+     elif library_name == "numpy":
+         return "numpy"
+     else:
+         return ""
+
+
+ def type_series(y):
+     """Return "pandas_series", "polars_series" or "numpy" depending on what backs `y`."""
+     library_name = type(y).__module__.split(".")[0]
+     if type(y).__name__ == "Series":
+         if library_name == "pandas":
+             return "pandas_series"
+         elif library_name == "polars":
+             return "polars_series"
+     elif library_name == "numpy":
+         return "numpy"
+     else:
+         return ""
+
+
+ def convert_input_array(x, objective, is_target=False, is_int=False) -> Tuple[np.ndarray, Iterable]:
+     classes_ = []
+
+     if type(x).__module__.split(".")[0] == "numpy":
+         if len(x.shape) == 2:
+             classes_, x_, *_ = convert_input_frame(x, None, 1000)
+         else:
+             x_ = x
+     elif type_series(x) == "pandas_series":
+         x_ = x.to_numpy()
+     elif type_series(x) == "polars_series":
+         x_ = x.to_numpy(allow_copy=False)
+     elif type_df(x) == "polars_df" or type_df(x) == "pandas_df":
+         classes_, x_, *_ = convert_input_frame(x, None, 1000)
+     else:
+         x_ = x.to_numpy()
+
+     if is_target and objective == "LogLoss" and len(x_.shape) == 1:
+         classes_, x_index = np.unique(x_, return_inverse=True)
+         if len(classes_) > 2:
+             # Multiclass: one-hot encode the integer-coded labels.
+             x_ = np.eye(len(classes_))[x_index]
+         else:
+             # Binary: encode the labels as 0.0 / 1.0.
+             x_ = x_index.astype("float64")
+
+     if is_int and not np.issubdtype(x_.dtype, "uint64"):
+         x_ = x_.astype(dtype="uint64", copy=False)
+
+     if not is_int and not np.issubdtype(x_.dtype, "float64"):
+         x_ = x_.astype(dtype="float64", copy=False)
+
+     if len(x_.shape) == 2:
+         x_ = x_.ravel(order="F")
+
+     return x_, classes_
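+
+ # A quick worked example of the target path above (hypothetical values):
+ # convert_input_array(np.array(["a", "b", "c", "a"]), "LogLoss", is_target=True)
+ # one-hot encodes the labels to a (4, 3) matrix, flattens it in column-major
+ # order, and returns classes_ = ["a", "b", "c"]; a binary target becomes a
+ # flat 0.0/1.0 array instead.
+
+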
+ def convert_input_frame(
+     X,
+     categorical_features,
+     max_cat,
+ ) -> Tuple[List[str], np.ndarray, int, int, Optional[Iterable[int]], Optional[Dict]]:
+     """Convert data to the format needed by the booster.
+
+     Returns:
+         Tuple[List[str], np.ndarray, int, int, Optional[Iterable[int]], Optional[Dict]]:
+             Column names, the flat data, number of rows, number of columns,
+             categorical feature indices and the category mapping.
+     """
+     categorical_features_ = None
+     if type_df(X) == "pandas_df":
+         X_ = X.to_numpy()
+         features_ = X.columns.to_list()
+         if categorical_features == "auto":
+             categorical_columns = X.select_dtypes(include=["category"]).columns.tolist()
+             categorical_features_ = [
+                 features_.index(c) for c in categorical_columns
+             ] or None
+     elif type_df(X) == "numpy":
+         X_ = X
+         features_ = list(map(str, range(X_.shape[1])))
+     else:
+         raise ValueError(f"Object type {type(X)} is not supported.")
+
+     if (
+         categorical_features
+         and isinstance(categorical_features, list)
+         and all(isinstance(s, int) for s in categorical_features)
+     ):
+         categorical_features_ = categorical_features
+     elif (
+         categorical_features
+         and isinstance(categorical_features, list)
+         and all(isinstance(s, str) for s in categorical_features)
+     ):
+         categorical_features_ = [features_.index(c) for c in categorical_features]
+
+     cat_mapping = {}  # key: feature_name, value: ordered category names
+     cat_to_num = []
+     if categorical_features_:
+         for i in categorical_features_:
+             categories, inversed = np.unique(X_[:, i].astype(str), return_inverse=True)
+
+             categories = list(categories)
+             # Shift codes by one so that code 0 is reserved for "nan".
+             inversed = inversed + 1.0
+             if "nan" in categories:
+                 # Missing values become NaN, and codes past the removed "nan"
+                 # slot shift down by one so they keep pointing at the same
+                 # category once "nan" is moved to the front of the list.
+                 nan_idx = categories.index("nan")
+                 inversed[inversed == (nan_idx + 1.0)] = np.nan
+                 inversed[inversed > (nan_idx + 1.0)] -= 1.0
+                 categories.remove("nan")
+             categories.insert(0, "nan")
+
+             if len(categories) > max_cat:
+                 cat_to_num.append(i)
+                 logger.warning(
+                     f"Feature {features_[i]} will be treated as numerical since the number of categories ({len(categories)}) exceeds the max_cat ({max_cat}) threshold."
+                 )
+
+             cat_mapping[features_[i]] = categories
+             X_[:, i] = inversed
+
+         categorical_features_ = [
+             x for x in categorical_features_ if x not in cat_to_num
+         ]
+
+         logger.info(f"Categorical features: {categorical_features_}")
+         logger.info(f"Mapping of categories: {cat_mapping}")
+
+     if not np.issubdtype(X_.dtype, "float64"):
+         X_ = X_.astype(dtype="float64", copy=False)
+     flat_data = X_.ravel(order="F")
+     rows, cols = X_.shape
+
+     if isinstance(categorical_features_, list):
+         categorical_features_ = set(categorical_features_)
+
+     return features_, flat_data, rows, cols, categorical_features_, cat_mapping
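+
+ # Encoding scheme used above (hypothetical values): a pandas category column
+ # with values ["b", NaN, "a"] yields cat_mapping = {"col": ["nan", "a", "b"]},
+ # and the column is rewritten as the float codes [2.0, NaN, 1.0]; i.e. code v
+ # points at cat_mapping["col"][v] and missing values stay NaN.
+
+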
+ def convert_input_frame_columnar(
+     X, categorical_features, max_cat
+ ) -> Tuple[
+     List[str],
+     List[np.ndarray],
+     List[Optional[np.ndarray]],
+     int,
+     int,
+     Optional[set],
+     dict,
+ ]:
+     """Convert a Polars DataFrame to columnar format for zero-copy transfer.
+
+     Returns a list of column arrays and a list of validity masks.
+     """
+     import polars.selectors as cs
+     import pyarrow as pa
+
+     features_ = list(X.columns)
+     rows, cols = X.shape
+
+     # Determine categorical features
+     categorical_features_ = None
+     if categorical_features == "auto":
+         categorical_columns = X.select(cs.categorical()).columns
+         categorical_features_ = [
+             features_.index(c) for c in categorical_columns
+         ] or None
+     elif (
+         categorical_features
+         and isinstance(categorical_features, list)
+         and all(isinstance(s, int) for s in categorical_features)
+     ):
+         categorical_features_ = categorical_features
+     elif (
+         categorical_features
+         and isinstance(categorical_features, list)
+         and all(isinstance(s, str) for s in categorical_features)
+     ):
+         categorical_features_ = [features_.index(c) for c in categorical_features]
+
+     cat_mapping = {}
+     cat_to_num = []
+     categorical_set = set(categorical_features_) if categorical_features_ else set()
+
+     # Convert each column to a numpy array
+     columns = []
+     masks = []
+
+     for i, col_name in enumerate(features_):
+         if i in categorical_set:
+             # Categorical columns need to be encoded. Arrow exposes the codes
+             # and categories without forcing a numpy object conversion.
+             arr = X[col_name].to_arrow()
+             if isinstance(arr, pa.ChunkedArray):
+                 arr = arr.combine_chunks()
+
+             if not isinstance(arr, pa.DictionaryArray):
+                 arr = arr.dictionary_encode()
+
+             # Extract the categories (the Arrow dictionary, usually a StringArray).
+             cats = arr.dictionary.to_pylist()
+
+             # Extract the codes (indices) and cast to float64 for Perpetual.
+             indices = arr.indices.to_numpy(zero_copy_only=False)
+             out_values = indices.astype(np.float64)
+             out_values += 1.0  # Shift: code 0 in Perpetual means "nan"
+
+             # Nulls are masked values in Arrow; decode the validity bitmap
+             # with numpy bit unpacking and set them to NaN.
+             if arr.null_count > 0:
+                 row_count = len(out_values)
+                 if arr.buffers()[0]:
+                     valid_bits = np.frombuffer(arr.buffers()[0], dtype=np.uint8)
+                     valid_mask = np.unpackbits(valid_bits, bitorder="little")[
+                         :row_count
+                     ].astype(bool)
+                     # The mask is 1 where valid, 0 where null; nulls become NaN.
+                     out_values[~valid_mask] = np.nan
+
+             # A literal "nan" string among the categories also means missing:
+             # codes pointing at it become NaN, codes past the removed slot
+             # shift down by one, and "nan" is moved to the front of the list.
+             if "nan" in cats:
+                 nan_idx = cats.index("nan")
+                 out_values[out_values == (nan_idx + 1.0)] = np.nan
+                 out_values[out_values > (nan_idx + 1.0)] -= 1.0
+                 cats.remove("nan")
+
+             cats.insert(0, "nan")
+
+             if len(cats) > max_cat:
+                 cat_to_num.append(i)
+                 logger.warning(
+                     f"Feature {col_name} will be treated as numerical since the number of categories ({len(cats)}) exceeds the max_cat ({max_cat}) threshold."
+                 )
+
+             cat_mapping[col_name] = cats
+             columns.append(out_values)
+             masks.append(None)  # Categorical encoding handles NaNs
+         else:
+             # Non-categorical columns go through Arrow zero-copy.
+             series = X[col_name]
+             arr = series.to_arrow()
+             if isinstance(arr, pa.ChunkedArray):
+                 if arr.num_chunks > 1:
+                     arr = arr.combine_chunks()
+                 else:
+                     arr = arr.chunk(0)
+
+             # buffers[0] is the validity bitmap, buffers[1] holds the values.
+             buffers = arr.buffers()
+             if buffers[0] is None:
+                 masks.append(None)
+             else:
+                 masks.append(np.frombuffer(buffers[0], dtype=np.uint8))
+
+             col_array = np.frombuffer(buffers[1], dtype=np.float64)
+             columns.append(col_array)
+
+     if categorical_features_:
+         categorical_features_ = [
+             x for x in categorical_features_ if x not in cat_to_num
+         ]
+         logger.info(f"Categorical features: {categorical_features_}")
+         logger.info(f"Mapping of categories: {cat_mapping}")
+
+     if isinstance(categorical_features_, list):
+         categorical_features_ = set(categorical_features_)
+
+     return features_, columns, masks, rows, cols, categorical_features_, cat_mapping
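+
+ # Validity bitmaps, as decoded above (hypothetical values): Arrow packs one
+ # bit per row, least-significant bit first, 1 = valid. For three rows with
+ # the middle one null, the bitmap byte is 0b00000101, and
+ # np.unpackbits(np.array([0b101], dtype=np.uint8), bitorder="little")[:3]
+ # gives [1, 0, 1], so row 1 is set to NaN.
+
+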
+ def transform_input_frame(X, cat_mapping) -> Tuple[List[str], np.ndarray, int, int]:
+     """Convert data to the format needed by the booster.
+
+     Returns:
+         Tuple[List[str], np.ndarray, int, int]: Column names, the flat data,
+             number of rows and number of columns.
+     """
+     if type_df(X) == "pandas_df":
+         X_ = X.to_numpy()
+         features_ = X.columns.to_list()
+     elif type_df(X) == "numpy":
+         X_ = X
+         features_ = list(map(str, range(X_.shape[1])))
+     else:
+         raise ValueError(f"Object type {type(X)} is not supported.")
+
+     if cat_mapping:
+         for feature_name, categories in cat_mapping.items():
+             feature_index = features_.index(feature_name)
+             # `categories` holds "nan" at index 0; the remaining names are in
+             # the sorted order np.unique produced at fit time, so a
+             # searchsorted hit + 1 recovers the training code.
+             cats = categories.copy()
+             cats.remove("nan")
+             x_enc = np.searchsorted(cats, X_[:, feature_index].astype(str))
+             x_enc = x_enc + 1.0
+             # Values sorting past the last known category are treated as missing.
+             ind_nan = len(categories)
+             x_enc[x_enc == ind_nan] = np.nan
+             X_[:, feature_index] = x_enc
+
+     if not np.issubdtype(X_.dtype, "float64"):
+         X_ = X_.astype(dtype="float64", copy=False)
+     flat_data = X_.ravel(order="F")
+     rows, cols = X_.shape
+
+     return features_, flat_data, rows, cols
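+
+ # Example (hypothetical values): with cat_mapping = {"col": ["nan", "a", "b"]},
+ # an incoming column ["a", "b", "zzz"] is encoded as [1.0, 2.0, NaN]; known
+ # categories get their training-time codes, and values sorting past the last
+ # known category are treated as missing.
+
+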
+ def transform_input_frame_columnar(
+     X, cat_mapping
+ ) -> Tuple[List[str], List[np.ndarray], List[Optional[np.ndarray]], int, int]:
+     """Convert a Polars DataFrame to columnar format for zero-copy prediction.
+
+     Returns a list of column arrays and masks instead of flattened data, avoiding copies.
+     """
+     import pyarrow as pa
+
+     features_ = list(X.columns)
+     rows, cols = X.shape
+
+     columns = []
+     masks = []
+
+     for i, col_name in enumerate(features_):
+         if cat_mapping and col_name in cat_mapping:
+             # Categorical columns are encoded with the cat_mapping learned at
+             # fit time: "nan" sits at index 0 and a float code v refers to
+             # categories[v].
+             categories = cat_mapping[col_name]
+
+             # Use Arrow for zero-copy extraction
+             arr = X[col_name].to_arrow()
+             if isinstance(arr, pa.ChunkedArray):
+                 arr = arr.combine_chunks()
+             if not isinstance(arr, pa.DictionaryArray):
+                 arr = arr.dictionary_encode()
+
+             # Categories of the incoming data
+             new_cats = arr.dictionary.to_pylist()
+
+             # Extract the codes (indices). Integer codes are needed to index
+             # the lookup table below, but to_numpy() may return floats when
+             # nulls are present, so nulls are filled with 0 here and masked
+             # out again after the lookup.
+             filled_indices_arr = arr.indices.fill_null(0)
+             new_indices = filled_indices_arr.to_numpy()
+
+             # Build a lookup table mapping incoming codes to training codes:
+             # lookup[new_code] = old_float_code. A category's position in
+             # `categories` is exactly its training-time code; unknown
+             # categories and the "nan" category itself stay NaN.
+             lookup = np.full(len(new_cats), np.nan, dtype=np.float64)
+             old_cat_map = {c: j for j, c in enumerate(categories)}
+             for new_code, cat in enumerate(new_cats):
+                 if cat in old_cat_map and cat != "nan":
+                     lookup[new_code] = float(old_cat_map[cat])
+
+             # Apply the lookup; Arrow guarantees the codes are in range.
+             x_enc = lookup[new_indices]
+
+             # Rows that are null at the array level also become NaN.
+             if arr.null_count > 0:
+                 if arr.buffers()[0]:
+                     valid_bits = np.frombuffer(arr.buffers()[0], dtype=np.uint8)
+                     valid_mask = np.unpackbits(valid_bits, bitorder="little")[
+                         : len(x_enc)
+                     ].astype(bool)
+                     x_enc[~valid_mask] = np.nan
+
+             columns.append(x_enc)
+             masks.append(None)
+         else:
+             series = X[col_name]
+             arr = series.to_arrow()
+             if isinstance(arr, pa.ChunkedArray):
+                 if arr.num_chunks > 1:
+                     arr = arr.combine_chunks()  # Fallback for chunked data
+                 else:
+                     arr = arr.chunk(0)
+             # buffers[0] is the validity bitmap, buffers[1] holds the values.
+             buffers = arr.buffers()
+             if buffers[0] is None:
+                 masks.append(None)
+             else:
+                 masks.append(np.frombuffer(buffers[0], dtype=np.uint8))
+             columns.append(np.frombuffer(buffers[1], dtype=np.float64))
+
+     return features_, columns, masks, rows, cols
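+
+ # Lookup example for the categorical branch above (hypothetical values): with
+ # stored categories ["nan", "a", "b"] and an incoming Arrow dictionary
+ # ["b", "c"], the table is lookup = [2.0, nan], so codes [0, 1, 0] map to
+ # [2.0, nan, 2.0] ("c" is unseen and becomes missing).
+
+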
+ CONTRIBUTION_METHODS = {
+     "weight": "Weight",
+     "Weight": "Weight",
+     "average": "Average",
+     "Average": "Average",
+     "branch-difference": "BranchDifference",
+     "branchdifference": "BranchDifference",
+     "BranchDifference": "BranchDifference",
+     "midpoint-difference": "MidpointDifference",
+     "midpointdifference": "MidpointDifference",
+     "MidpointDifference": "MidpointDifference",
+     "mode-difference": "ModeDifference",
+     "modedifference": "ModeDifference",
+     "ModeDifference": "ModeDifference",
+     "ProbabilityChange": "ProbabilityChange",
+     "probabilitychange": "ProbabilityChange",
+     "probability-change": "ProbabilityChange",
+ }
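+
+ # CONTRIBUTION_METHODS maps the user-facing aliases for feature-contribution
+ # calculation to their canonical names, e.g.
+ # CONTRIBUTION_METHODS["midpoint-difference"] == "MidpointDifference".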
perpetual-1.1.1.dist-info/METADATA ADDED
@@ -0,0 +1,177 @@
+ Metadata-Version: 2.4
+ Name: perpetual
+ Version: 1.1.1
+ Classifier: Programming Language :: Python :: 3
+ Classifier: Programming Language :: Python :: 3.10
+ Classifier: Programming Language :: Python :: 3.11
+ Classifier: Programming Language :: Python :: 3.12
+ Classifier: Programming Language :: Python :: 3.13
+ Classifier: Programming Language :: Python :: 3.14
+ Requires-Dist: numpy
+ Requires-Dist: typing-extensions
+ Requires-Dist: pandas ; extra == 'dev'
+ Requires-Dist: polars ; extra == 'dev'
+ Requires-Dist: pyarrow ; extra == 'dev'
+ Requires-Dist: maturin ; extra == 'dev'
+ Requires-Dist: pytest ; extra == 'dev'
+ Requires-Dist: seaborn ; extra == 'dev'
+ Requires-Dist: scikit-learn ; extra == 'dev'
+ Requires-Dist: mkdocs-material ; extra == 'dev'
+ Requires-Dist: mkdocstrings[python] ; extra == 'dev'
+ Requires-Dist: mkdocs-autorefs ; extra == 'dev'
+ Requires-Dist: ruff ; extra == 'dev'
+ Requires-Dist: xgboost ; extra == 'dev'
+ Requires-Dist: onnxmltools ; extra == 'dev'
+ Requires-Dist: onnx ; extra == 'dev'
+ Requires-Dist: onnxruntime ; python_full_version < '3.14' and extra == 'dev'
+ Requires-Dist: nbsphinx ; extra == 'dev'
+ Requires-Dist: onnxmltools ; extra == 'onnx'
+ Requires-Dist: onnx ; extra == 'onnx'
+ Requires-Dist: onnxruntime ; extra == 'onnx'
+ Requires-Dist: xgboost ; extra == 'xgboost'
+ Provides-Extra: dev
+ Provides-Extra: onnx
+ Provides-Extra: xgboost
+ License-File: LICENSE
+ Summary: A self-generalizing gradient boosting machine that doesn't need hyperparameter optimization
+ Keywords: rust,perpetual,machine learning,tree model,decision tree,gradient boosted decision tree,gradient boosting machine
+ Home-Page: https://perpetual-ml.com
+ Author-email: Mutlu Simsek <mutlusims3k@gmail.com>, Serkan Korkmaz <serkor1@duck.com>, Pieter Pel <pelpieter@gmail.com>
+ Requires-Python: >=3.10
+ Description-Content-Type: text/markdown; charset=UTF-8; variant=GFM
+
+ <!-- markdownlint-disable MD033 -->
+ # Perpetual
+
+ <p align="center">
+ <img height="120" src="https://github.com/perpetual-ml/perpetual/raw/main/resources/perp_logo.png" alt="Perpetual Logo">
+ </p>
+
+ <div align="center">
+
+ <a href="https://pypi.org/project/perpetual" target="_blank"><img src="https://img.shields.io/pypi/pyversions/perpetual.svg?logo=python&logoColor=white" alt="Python Versions"></a>
+ <a href="https://pypi.org/project/perpetual" target="_blank"><img src="https://img.shields.io/pypi/v/perpetual.svg?logo=pypi&logoColor=white" alt="PyPI Version"></a>
+ <a href="https://crates.io/crates/perpetual" target="_blank"><img src="https://img.shields.io/crates/v/perpetual?logo=rust&logoColor=white" alt="Crates.io Version"></a>
+ <a href="https://perpetual-ml.r-universe.dev/perpetual" target="_blank"><img src="https://img.shields.io/badge/dynamic/json?url=https://perpetual-ml.r-universe.dev/api/packages/perpetual&query=$.Version&label=r-universe&logo=R&logoColor=white&color=brightgreen" alt="R-Universe Status"></a>
+ <a href="https://discord.gg/AyUK7rr6wy" target="_blank"><img src="https://img.shields.io/badge/join-discord-blue?logo=discord" alt="Discord"></a>
+ ![PyPI - Downloads](https://img.shields.io/pypi/dm/perpetual)
+
+ </div>
+
+ PerpetualBooster is a gradient boosting machine (GBM) that, unlike other GBMs, doesn't need hyperparameter optimization. Similar to AutoML libraries, it has a single `budget` parameter. Increasing the `budget` increases the predictive power of the algorithm and gives better results on unseen data. Start with a small budget (e.g. 0.5) and increase it (e.g. to 1.0) once you are confident in your features. If further increases in `budget` bring no improvement, you are already extracting the most predictive power out of your data.
+
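+ Here is a minimal sketch of that workflow on the California Housing data
+ (scikit-learn is used only for data loading and scoring, and the `predict`
+ call is assumed to follow the usual estimator convention):
+
+ ```python
+ from perpetual import PerpetualBooster
+ from sklearn.datasets import fetch_california_housing
+ from sklearn.metrics import mean_squared_error
+ from sklearn.model_selection import train_test_split
+
+ X, y = fetch_california_housing(return_X_y=True)
+ X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
+
+ # Start with a small budget and raise it only while the test error improves.
+ for budget in (0.5, 1.0):
+     model = PerpetualBooster(objective="SquaredLoss", budget=budget)
+     model.fit(X_train, y_train)
+     print(budget, mean_squared_error(y_test, model.predict(X_test)))
+ ```
+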
+ ## Supported Languages
+
+ Perpetual is built in Rust and provides high-performance bindings for Python and R.
+
+ <!-- markdownlint-disable MD060 -->
+ | Language | Installation | Documentation | Source | Package |
+ | :--- | :--- | :--- | :--- | :--- |
+ | **Python** | `pip install perpetual`<br><br>`conda install -c conda-forge perpetual` | <a href="https://perpetual-ml.github.io/perpetual" target="_blank">Python API</a> | <a href="./package-python" target="_blank">`package-python`</a> | <a href="https://pypi.org/project/perpetual" target="_blank">PyPI</a><br><br><a href="https://anaconda.org/conda-forge/perpetual" target="_blank">Conda Forge</a> |
+ | **Rust** | `cargo add perpetual` | <a href="https://docs.rs/perpetual" target="_blank">docs.rs</a> | <a href="./src" target="_blank">`src`</a> | <a href="https://crates.io/crates/perpetual" target="_blank">crates.io</a> |
+ | **R** | `install.packages("perpetual")` | <a href="https://perpetual-ml.github.io/perpetual/r" target="_blank">pkgdown Site</a> | <a href="./package-r" target="_blank">`package-r`</a> | <a href="https://perpetual-ml.r-universe.dev/perpetual" target="_blank">R-universe</a> |
+
+ ### Optional Dependencies
+
+ * `pandas`: Enables training directly on pandas DataFrames.
+ * `polars`: Enables zero-copy training on Polars DataFrames (see the sketch below).
+ * `scikit-learn`: Provides a scikit-learn compatible wrapper interface.
+ * `xgboost`: Enables saving and loading models in XGBoost format for interoperability.
+ * `onnxruntime`: Enables exporting and loading models in the ONNX standard format.
+
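+ With the `polars` extra installed, for example, a Polars DataFrame and Series
+ can be passed straight to `fit`. A minimal sketch with made-up data:
+
+ ```python
+ import polars as pl
+ from perpetual import PerpetualBooster
+
+ df = pl.DataFrame({"x1": [0.1, 0.7, 0.3, 0.9], "x2": [1.0, 2.0, 3.0, 4.0]})
+ y = pl.Series("y", [1.0, 2.0, 2.0, 3.0])
+
+ model = PerpetualBooster(objective="SquaredLoss", budget=0.5)
+ model.fit(df, y)  # columns are handed over without copying where possible
+ ```
+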
82
+ ## Usage
83
+
84
+ You can use the algorithm like in the example below. Check examples folders for both Rust and Python.
85
+
86
+ ```python
87
+ from perpetual import PerpetualBooster
88
+
89
+ model = PerpetualBooster(objective="SquaredLoss", budget=0.5)
90
+ model.fit(X, y)
91
+ ```
92
+
+ ## Benchmark
+
+ ### PerpetualBooster vs. Optuna + LightGBM
+
+ Hyperparameter optimization usually takes around 100 iterations with plain GBM algorithms, while PerpetualBooster achieves the same accuracy in a single run. It therefore delivers up to a 100x speed-up at the same accuracy, across different `budget` levels and datasets.
+
+ The following table summarizes the results for the <a href="https://scikit-learn.org/stable/modules/generated/sklearn.datasets.fetch_california_housing.html" target="_blank">California Housing</a> dataset (regression):
+
+ | Perpetual budget | LightGBM n_estimators | Perpetual MSE | LightGBM MSE | Speed-up wall time | Speed-up CPU time |
+ | :--- | :--- | :--- | :--- | :--- | :--- |
+ | 0.76 | 50 | 0.201 | 0.201 | 39x | 57x |
+ | 0.85 | 100 | 0.196 | 0.196 | 60x | 87x |
+ | 1.15 | 200 | 0.190 | 0.190 | 230x | 259x |
+
+ The following table summarizes the results for the <a href="https://scikit-learn.org/stable/modules/generated/sklearn.datasets.fetch_covtype.html" target="_blank">Cover Types</a> dataset (classification):
+
+ | Perpetual budget | LightGBM n_estimators | Perpetual ROC AUC | LightGBM ROC AUC | Speed-up wall time | Speed-up CPU time |
+ | :--- | :--- | :--- | :--- | :--- | :--- |
+ | 1.0 | 100 | 0.944 | 0.945 | 39x | 130x |
+
+ The results can be reproduced using the scripts in the <a href="./package-python/examples" target="_blank">examples</a> folder.
+
+ ### PerpetualBooster vs. AutoGluon
+
+ PerpetualBooster is a GBM but behaves like AutoML, so it is also benchmarked against AutoGluon (v1.2, best-quality preset), the current leader in the <a href="https://automlbenchmark.streamlit.app/cd_diagram" target="_blank">AutoML benchmark</a>. The 10 datasets with the most rows were selected from <a href="https://www.openml.org/" target="_blank">OpenML</a> for both regression and classification tasks.
+
+ The results are summarized in the following table for regression tasks:
+
+ | OpenML Task | Perpetual Training Duration | Perpetual Inference Duration | Perpetual RMSE | AutoGluon Training Duration | AutoGluon Inference Duration | AutoGluon RMSE |
+ | :--- | :--- | :--- | :--- | :--- | :--- | :--- |
+ | <a href="https://www.openml.org/t/359929" target="_blank">Airlines_DepDelay_10M</a> | 518 | 11.3 | 29.0 | 520 | 30.9 | <ins> 28.8 </ins> |
+ | <a href="https://www.openml.org/t/361940" target="_blank">bates_regr_100</a> | 3421 | 15.1 | <ins> 1.084 </ins> | OOM | OOM | OOM |
+ | <a href="https://www.openml.org/t/7327" target="_blank">BNG(libras_move)</a> | 1956 | 4.2 | <ins> 2.51 </ins> | 1922 | 97.6 | 2.53 |
+ | <a href="https://www.openml.org/t/7326" target="_blank">BNG(satellite_image)</a> | 334 | 1.6 | 0.731 | 337 | 10.0 | <ins> 0.721 </ins> |
+ | <a href="https://www.openml.org/t/14949" target="_blank">COMET_MC</a> | 44 | 1.0 | <ins> 0.0615 </ins> | 47 | 5.0 | 0.0662 |
+ | <a href="https://www.openml.org/t/361939" target="_blank">friedman1</a> | 275 | 4.2 | <ins> 1.047 </ins> | 278 | 5.1 | 1.487 |
+ | <a href="https://www.openml.org/t/10102" target="_blank">poker</a> | 38 | 0.6 | <ins> 0.256 </ins> | 41 | 1.2 | 0.722 |
+ | <a href="https://www.openml.org/t/361955" target="_blank">subset_higgs</a> | 868 | 10.6 | <ins> 0.420 </ins> | 870 | 24.5 | 0.421 |
+ | <a href="https://www.openml.org/t/7319" target="_blank">BNG(autoHorse)</a> | 107 | 1.1 | <ins> 19.0 </ins> | 107 | 3.2 | 20.5 |
+ | <a href="https://www.openml.org/t/7318" target="_blank">BNG(pbc)</a> | 48 | 0.6 | <ins> 836.5 </ins> | 51 | 0.2 | 957.1 |
+ | average | 465 | 3.9 | - | 464 | 19.7 | - |
+
+ PerpetualBooster outperformed AutoGluon on 8 out of 10 regression tasks, training equally fast and inferring 5.1x faster.
+
+ The results are summarized in the following table for classification tasks:
+
+ | OpenML Task | Perpetual Training Duration | Perpetual Inference Duration | Perpetual AUC | AutoGluon Training Duration | AutoGluon Inference Duration | AutoGluon AUC |
+ | :--- | :--- | :--- | :--- | :--- | :--- | :--- |
+ | <a href="https://www.openml.org/t/146163" target="_blank">BNG(spambase)</a> | 70.1 | 2.1 | <ins> 0.671 </ins> | 73.1 | 3.7 | 0.669 |
+ | <a href="https://www.openml.org/t/208" target="_blank">BNG(trains)</a> | 89.5 | 1.7 | <ins> 0.996 </ins> | 106.4 | 2.4 | 0.994 |
+ | <a href="https://www.openml.org/t/361942" target="_blank">breast</a> | 13699.3 | 97.7 | <ins> 0.991 </ins> | 13330.7 | 79.7 | 0.949 |
+ | <a href="https://www.openml.org/t/7291" target="_blank">Click_prediction_small</a> | 89.1 | 1.0 | <ins> 0.749 </ins> | 101.0 | 2.8 | 0.703 |
+ | <a href="https://www.openml.org/t/361938" target="_blank">colon</a> | 12435.2 | 126.7 | <ins> 0.997 </ins> | 12356.2 | 152.3 | 0.997 |
+ | <a href="https://www.openml.org/t/362113" target="_blank">Higgs</a> | 3485.3 | 40.9 | <ins> 0.843 </ins> | 3501.4 | 67.9 | 0.816 |
+ | <a href="https://www.openml.org/t/230" target="_blank">SEA(50000)</a> | 21.9 | 0.2 | <ins> 0.936 </ins> | 25.6 | 0.5 | 0.935 |
+ | <a href="https://www.openml.org/t/359994" target="_blank">sf-police-incidents</a> | 85.8 | 1.5 | <ins> 0.687 </ins> | 99.4 | 2.8 | 0.659 |
+ | <a href="https://www.openml.org/t/361941" target="_blank">bates_classif_100</a> | 11152.8 | 50.0 | <ins> 0.864 </ins> | OOM | OOM | OOM |
+ | <a href="https://www.openml.org/t/361945" target="_blank">prostate</a> | 13699.9 | 79.8 | <ins> 0.987 </ins> | OOM | OOM | OOM |
+ | average | 3747.0 | 34.0 | - | 3699.2 | 39.0 | - |
+
+ PerpetualBooster outperformed AutoGluon on 10 out of 10 classification tasks, training equally fast and inferring 1.1x faster.
+
+ PerpetualBooster also demonstrates greater robustness than AutoGluon, successfully training on all 20 tasks, whereas AutoGluon encountered out-of-memory errors on 3 of them.
+
+ The results can be reproduced using the <a href="https://github.com/deadsoul44/automlbenchmark" target="_blank">automlbenchmark fork</a>.
+
+ ## Contribution
+
+ Contributions are welcome. Check <a href="./CONTRIBUTING.md" target="_blank">CONTRIBUTING.md</a> for the guidelines.
+
+ ## Paper
+
+ PerpetualBooster prevents overfitting with a generalization algorithm. A paper explaining how the algorithm works is in progress. Check our <a href="https://perpetual-ml.com/blog/how-perpetual-works" target="_blank">blog post</a> for a high-level introduction to the algorithm.
+
+ ## Perpetual ML Suite
+
+ The **Perpetual ML Suite** is a comprehensive, batteries-included ML platform designed to deliver maximum predictive power with minimal effort. It allows you to track experiments, monitor metrics, and manage model drift through an intuitive interface.
+
+ For a fully managed, **serverless ML experience**, visit <a href="https://app.perpetual-ml.com" target="_blank">app.perpetual-ml.com</a>.
+
+ * **Serverless Marimo Notebooks**: Run interactive, reactive notebooks without managing any infrastructure.
+ * **Serverless ML Endpoints**: One-click deployment of models as production-ready endpoints for real-time inference.
+
+ Perpetual is also designed to live where your data lives. It is available as a native application on the <a href="https://app.snowflake.com/marketplace/listing/GZSYZX0EMJ/perpetual-ml-perpetual-ml-suite" target="_blank">Snowflake Marketplace</a>, with support for Databricks and other major data warehouses coming soon.
+
perpetual-1.1.1.dist-info/RECORD ADDED
@@ -0,0 +1,12 @@
+ perpetual/__init__.py,sha256=TVNxt-DXYxEb1BfC6vY5xfH4kSSgPJdhOL84Z35GifQ,285
+ perpetual/booster.py,sha256=lhXoBq7tMwd9Upa7WMdnjr0FRRcMaqPWxLbmcJLfK6U,73005
+ perpetual/data.py,sha256=e2xF5xVq3KYotj5fpIhSfnF3B4qLQpdHYDSaP5NpcxA,768
+ perpetual/perpetual.cpython-312-darwin.so,sha256=SuCSELp5j9m3QikyZWgxhBWR1De7nA5Kmqi4bpUZn4c,1463904
+ perpetual/serialize.py,sha256=Tg2BbuA1jKQ5-ITuVhwtj6hgBaRAbZ66eHctR7fcVk4,1883
+ perpetual/sklearn.py,sha256=kYDDy0ODW6ZYZEKBTLDpy_cVsCae6cZ3ARYZj01gjj4,17122
+ perpetual/types.py,sha256=T0KJu8bK8xiYHaPt8b6RmUR1xP3f5N1FV7qaZTy1rtM,3232
+ perpetual/utils.py,sha256=j3iuvJcSf6sYp1D5ATW4QAyjyOou6MkY4biNrU8TzRg,16903
+ perpetual-1.1.1.dist-info/METADATA,sha256=698cRCoIrH429APmhvPl00LVTfCRnmd7f8Kx03otnIs,17304
+ perpetual-1.1.1.dist-info/WHEEL,sha256=5PbULt6DKUkIiubOjwQPyn-vzBuHJr28QRlczaBjcdg,107
+ perpetual-1.1.1.dist-info/licenses/LICENSE,sha256=ixuiBLtpoK3iv89l7ylKkg9rs2GzF9ukPH7ynZYzK5s,35148
+ perpetual-1.1.1.dist-info/RECORD,,
perpetual-1.1.1.dist-info/WHEEL ADDED
@@ -0,0 +1,4 @@
+ Wheel-Version: 1.0
+ Generator: maturin (1.11.5)
+ Root-Is-Purelib: false
+ Tag: cp312-cp312-macosx_10_12_x86_64