perpetual 1.0.40__cp311-cp311-macosx_10_12_x86_64.whl

perpetual/utils.py ADDED
@@ -0,0 +1,462 @@
+ import logging
+ from typing import Dict, Iterable, List, Optional, Tuple
+
+ import numpy as np
+
+ logger = logging.getLogger(__name__)
+
+
+ def type_df(df):
+     library_name = type(df).__module__.split(".")[0]
+     if type(df).__name__ == "DataFrame":
+         if library_name == "pandas":
+             return "pandas_df"
+         elif library_name == "polars":
+             return "polars_df"
+     elif library_name == "numpy":
+         return "numpy"
+     else:
+         return ""
+
+
+ def type_series(y):
+     library_name = type(y).__module__.split(".")[0]
+     if type(y).__name__ == "Series":
+         if library_name == "pandas":
+             return "pandas_series"
+         elif library_name == "polars":
+             return "polars_series"
+     elif library_name == "numpy":
+         return "numpy"
+     else:
+         return ""
+
+
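These helpers dispatch on the defining module of the object's type, so neither pandas nor polars has to be imported up front. A minimal sketch of the dispatch, assuming pandas and numpy are installed:

```python
import numpy as np
import pandas as pd

from perpetual.utils import type_df, type_series

df = pd.DataFrame({"a": [1, 2]})

# The module path of the type identifies the library without importing it here.
assert type(df).__module__.split(".")[0] == "pandas"
assert type_df(df) == "pandas_df"               # pandas DataFrame
assert type_series(df["a"]) == "pandas_series"  # pandas Series
assert type_df(np.zeros((2, 2))) == "numpy"     # numpy arrays are recognized too
```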
+ def convert_input_array(x, objective, is_target=False, is_int=False) -> Tuple[np.ndarray, Iterable]:
+     classes_ = []
+
+     if type(x).__module__.split(".")[0] == "numpy":
+         if len(x.shape) == 2:
+             # 2-d inputs reuse the frame converter.
+             classes_, x_, *_ = convert_input_frame(x, None, 1000)
+         else:
+             x_ = x
+     elif type_series(x) == "pandas_series":
+         x_ = x.to_numpy()
+     elif type_series(x) == "polars_series":
+         x_ = x.to_numpy(allow_copy=False)
+     elif type_df(x) == "polars_df" or type_df(x) == "pandas_df":
+         classes_, x_, *_ = convert_input_frame(x, None, 1000)
+     else:
+         x_ = x.to_numpy()
+
+     if is_target and objective == "LogLoss" and len(x_.shape) == 1:
+         # One-hot encode multiclass targets; binary targets stay 1-d.
+         classes_ = np.unique(x_)
+         x_index = np.array([np.where(classes_ == i) for i in x_])
+         if len(classes_) > 2:
+             x_ = np.squeeze(np.eye(len(classes_))[x_index])
+
+     if is_int and not np.issubdtype(x_.dtype, "uint64"):
+         x_ = x_.astype(dtype="uint64", copy=False)
+
+     if not is_int and not np.issubdtype(x_.dtype, "float64"):
+         x_ = x_.astype(dtype="float64", copy=False)
+
+     if len(x_.shape) == 2:
+         x_ = x_.ravel(order="F")
+
+     return x_, classes_
+
+
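For multiclass `LogLoss` targets, the block above one-hot encodes the labels through an identity-matrix lookup. A standalone numpy sketch of that step:

```python
import numpy as np

y = np.array(["cat", "dog", "bird", "dog"])
classes_ = np.unique(y)  # array(['bird', 'cat', 'dog']) -- sorted class order
idx = np.array([np.where(classes_ == v) for v in y])  # shape (4, 1, 1)
one_hot = np.squeeze(np.eye(len(classes_))[idx])      # shape (4, 3)
# one_hot[0] == [0., 1., 0.]: "cat" is class 1 in the sorted class list
```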
+ def convert_input_frame(
+     X,
+     categorical_features,
+     max_cat,
+ ) -> Tuple[List[str], np.ndarray, int, int, Optional[Iterable[int]], Optional[Dict]]:
+     """Convert data to the format needed by the booster.
+
+     Returns:
+         Tuple[List[str], np.ndarray, int, int, Optional[Iterable[int]], Optional[Dict]]:
+             column names, the flat data, number of rows, number of columns,
+             categorical feature indices, and the category mapping.
+     """
+     categorical_features_ = None
+     if type_df(X) == "pandas_df":
+         X_ = X.to_numpy()
+         features_ = X.columns.to_list()
+         if categorical_features == "auto":
+             categorical_columns = X.select_dtypes(include=["category"]).columns.tolist()
+             categorical_features_ = [
+                 features_.index(c) for c in categorical_columns
+             ] or None
+     elif type_df(X) == "numpy":
+         X_ = X
+         features_ = list(map(str, range(X_.shape[1])))
+     else:
+         raise ValueError(f"Object type {type(X)} is not supported.")
+
+     if (
+         categorical_features
+         and all(isinstance(s, int) for s in categorical_features)
+         and isinstance(categorical_features, list)
+     ):
+         categorical_features_ = categorical_features
+     elif (
+         categorical_features
+         and all(isinstance(s, str) for s in categorical_features)
+         and isinstance(categorical_features, list)
+     ):
+         categorical_features_ = [features_.index(c) for c in categorical_features]
+
+     cat_mapping = {}  # key: feature_name, value: ordered category names
+     cat_to_num = []
+     if categorical_features_:
+         for i in categorical_features_:
+             categories, inversed = np.unique(X_[:, i].astype(str), return_inverse=True)
+
+             categories = list(categories)
+             if "nan" in categories:
+                 categories.remove("nan")
+             categories.insert(0, "nan")
+
+             inversed = inversed + 1.0
+
+             if len(categories) > max_cat:
+                 cat_to_num.append(i)
+                 logger.warning(
+                     f"Feature {features_[i]} will be treated as numerical since the number of categories ({len(categories)}) exceeds the max_cat ({max_cat}) threshold."
+                 )
+
+             feature_name = features_[i]
+             cat_mapping[feature_name] = categories
+             ind_nan = len(categories)
+             inversed[inversed == ind_nan] = np.nan
+             X_[:, i] = inversed
+
+         categorical_features_ = [
+             x for x in categorical_features_ if x not in cat_to_num
+         ]
+
+         logger.info(f"Categorical features: {categorical_features_}")
+         logger.info(f"Mapping of categories: {cat_mapping}")
+
+     if not np.issubdtype(X_.dtype, "float64"):
+         X_ = X_.astype(dtype="float64", copy=False)
+     flat_data = X_.ravel(order="F")
+     rows, cols = X_.shape
+
+     if isinstance(categorical_features_, list):
+         categorical_features_ = set(categorical_features_)
+
+     return features_, flat_data, rows, cols, categorical_features_, cat_mapping
+
+
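The convention established here reserves slot 0 of every category list for "nan": real categories get float codes starting at 1.0 and missing values become NaN. A minimal sketch of the resulting mapping, assuming pandas is available:

```python
import pandas as pd

from perpetual.utils import convert_input_frame

X = pd.DataFrame({"size": pd.Categorical(["low", "high", None, "low"])})
features, flat, rows, cols, cat_idx, cat_mapping = convert_input_frame(X, "auto", 1000)

# cat_mapping == {"size": ["nan", "high", "low"]}
# flat        == [2.0, 1.0, nan, 2.0]  (high -> 1.0, low -> 2.0, missing -> NaN)
```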
+ def convert_input_frame_columnar(
+     X, categorical_features, max_cat
+ ) -> Tuple[
+     List[str],
+     List[np.ndarray],
+     List[Optional[np.ndarray]],
+     int,
+     int,
+     Optional[set],
+     dict,
+ ]:
+     """Convert a Polars DataFrame to columnar format for zero-copy transfer.
+
+     Returns a list of column arrays and a list of validity masks.
+     """
+     import polars.selectors as cs
+     import pyarrow as pa
+
+     features_ = list(X.columns)
+     rows, cols = X.shape
+
+     # Determine categorical features
+     categorical_features_ = None
+     if categorical_features == "auto":
+         categorical_columns = X.select(cs.categorical()).columns
+         categorical_features_ = [
+             features_.index(c) for c in categorical_columns
+         ] or None
+     elif (
+         categorical_features
+         and all(isinstance(s, int) for s in categorical_features)
+         and isinstance(categorical_features, list)
+     ):
+         categorical_features_ = categorical_features
+     elif (
+         categorical_features
+         and all(isinstance(s, str) for s in categorical_features)
+         and isinstance(categorical_features, list)
+     ):
+         categorical_features_ = [features_.index(c) for c in categorical_features]
+
+     cat_mapping = {}
+     cat_to_num = []
+     categorical_set = set(categorical_features_) if categorical_features_ else set()
+
+     # Convert each column to a numpy array
+     columns = []
+     masks = []
+
+     for i, col_name in enumerate(features_):
+         if i in categorical_set:
+             # Categorical columns are label-encoded. Arrow exposes the codes and
+             # categories without forcing a numpy object conversion.
+             arr = X[col_name].to_arrow()
+             if isinstance(arr, pa.ChunkedArray):
+                 arr = arr.combine_chunks()
+
+             if not isinstance(arr, pa.DictionaryArray):
+                 arr = arr.dictionary_encode()
+
+             # Extract the categories (the Arrow dictionary, usually a StringArray).
+             cats = arr.dictionary.to_pylist()
+
+             # Extract the codes (indices) and cast to float64 for Perpetual.
+             indices = arr.indices.to_numpy(zero_copy_only=False)
+             out_values = indices.astype(np.float64)
+             out_values += 1.0  # Shift by one: code 0 is reserved for "nan".
+
+             # Nulls are masked values in Arrow; decode the validity bitmap with
+             # numpy bit unpacking and set them to NaN.
+             if arr.null_count > 0:
+                 row_count = len(out_values)
+                 if arr.buffers()[0]:
+                     valid_bits = np.frombuffer(arr.buffers()[0], dtype=np.uint8)
+                     valid_mask = np.unpackbits(valid_bits, bitorder="little")[
+                         :row_count
+                     ].astype(bool)
+                     # The mask is 1 where valid, 0 where null.
+                     out_values[~valid_mask] = np.nan
+
+             # A literal "nan" string category also maps to NaN; because of the
+             # shift above, its code in out_values is nan_idx + 1.
+             if "nan" in cats:
+                 nan_idx = cats.index("nan")
+                 out_values[out_values == (nan_idx + 1.0)] = np.nan
+                 cats.remove("nan")
+
+             cats.insert(0, "nan")
+
+             if len(cats) > max_cat:
+                 cat_to_num.append(i)
+                 logger.warning(
+                     f"Feature {col_name} will be treated as numerical since the number of categories ({len(cats)}) exceeds the max_cat ({max_cat}) threshold."
+                 )
+
+             cat_mapping[col_name] = cats
+             columns.append(out_values)
+             masks.append(None)  # Categorical encoding already handles NaNs.
+         else:
+             # Non-categorical columns go through Arrow zero-copy.
+             series = X[col_name]
+             arr = series.to_arrow()
+             if isinstance(arr, pa.ChunkedArray):
+                 if arr.num_chunks > 1:
+                     arr = arr.combine_chunks()
+                 else:
+                     arr = arr.chunk(0)
+
+             # buffers[0] is the validity bitmap, buffers[1] holds the values.
+             buffers = arr.buffers()
+             if buffers[0] is None:
+                 masks.append(None)
+             else:
+                 masks.append(np.frombuffer(buffers[0], dtype=np.uint8))
+
+             col_array = np.frombuffer(buffers[1], dtype=np.float64)
+             columns.append(col_array)
+
+     if categorical_features_:
+         categorical_features_ = [
+             x for x in categorical_features_ if x not in cat_to_num
+         ]
+         logger.info(f"Categorical features: {categorical_features_}")
+         logger.info(f"Mapping of categories: {cat_mapping}")
+
+     if isinstance(categorical_features_, list):
+         categorical_features_ = set(categorical_features_)
+
+     return features_, columns, masks, rows, cols, categorical_features_, cat_mapping
+
+
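The null handling above relies on Arrow's packed validity bitmap: one bit per row, least-significant bit first, 1 meaning valid. A standalone numpy sketch of that decode step:

```python
import numpy as np

# A hypothetical validity bitmap for 10 rows, packed into two bytes.
valid_bits = np.array([0b10110111, 0b00000001], dtype=np.uint8)
valid_mask = np.unpackbits(valid_bits, bitorder="little")[:10].astype(bool)
# valid_mask == [ True, True, True, False, True, True, False, True, True, False]

values = np.arange(10, dtype=np.float64)
values[~valid_mask] = np.nan  # nulls become NaN, matching the encoding above
```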
+ def transform_input_frame(X, cat_mapping) -> Tuple[List[str], np.ndarray, int, int]:
+     """Convert data to the format needed by the booster.
+
+     Returns:
+         Tuple[List[str], np.ndarray, int, int]: column names, the flat data,
+             number of rows, and number of columns.
+     """
+     if type_df(X) == "pandas_df":
+         X_ = X.to_numpy()
+         features_ = X.columns.to_list()
+     elif type_df(X) == "numpy":
+         X_ = X
+         features_ = list(map(str, range(X_.shape[1])))
+     else:
+         raise ValueError(f"Object type {type(X)} is not supported.")
+
+     if cat_mapping:
+         for feature_name, categories in cat_mapping.items():
+             feature_index = features_.index(feature_name)
+             cats = categories.copy()
+             cats.remove("nan")
+             x_enc = np.searchsorted(cats, X_[:, feature_index].astype(str))
+             x_enc = x_enc + 1.0
+             ind_nan = len(categories)
+             x_enc[x_enc == ind_nan] = np.nan
+             X_[:, feature_index] = x_enc
+
+     if not np.issubdtype(X_.dtype, "float64"):
+         X_ = X_.astype(dtype="float64", copy=False)
+     flat_data = X_.ravel(order="F")
+     rows, cols = X_.shape
+
+     return features_, flat_data, rows, cols
+
+
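`np.searchsorted` works here because the stored category list is sorted once the leading "nan" slot is dropped (it came from `np.unique`). A small standalone sketch of the re-encoding:

```python
import numpy as np

categories = ["nan", "high", "low"]  # an entry from cat_mapping
cats = categories.copy()
cats.remove("nan")                   # ["high", "low"], still sorted

new_col = np.array(["low", "high", "nan"])
x_enc = np.searchsorted(cats, new_col) + 1.0  # high -> 1.0, low -> 2.0
x_enc[x_enc == len(categories)] = np.nan      # "nan" searches past the end -> NaN
# x_enc == [2.0, 1.0, nan]
```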
+ def transform_input_frame_columnar(
+     X, cat_mapping
+ ) -> Tuple[List[str], List[np.ndarray], List[Optional[np.ndarray]], int, int]:
+     """Convert a Polars DataFrame to columnar format for zero-copy prediction.
+
+     Returns a list of column arrays and masks instead of flattened data, avoiding copies.
+     """
+     import pyarrow as pa
+
+     features_ = list(X.columns)
+     rows, cols = X.shape
+
+     columns = []
+     masks = []
+
+     for col_name in features_:
+         if cat_mapping and col_name in cat_mapping:
+             # Categorical columns are re-encoded with the cat_mapping learned at
+             # fit time.
+             categories = cat_mapping[col_name]
+
+             # Use Arrow for zero-copy extraction.
+             arr = X[col_name].to_arrow()
+             if isinstance(arr, pa.ChunkedArray):
+                 arr = arr.combine_chunks()
+             if not isinstance(arr, pa.DictionaryArray):
+                 arr = arr.dictionary_encode()
+
+             # Categories as they appear in the incoming frame.
+             new_cats = arr.dictionary.to_pylist()
+
+             # Extract the codes (indices). Integer codes are needed to index the
+             # lookup table below, but to_numpy() may return floats when nulls are
+             # present, so nulls are filled with 0 here and masked out afterwards.
+             filled_indices_arr = arr.indices.fill_null(0)
+             new_indices = filled_indices_arr.to_numpy()
+
+             # Build a lookup table mapping each incoming code to the float code
+             # used at fit time. In `categories`, "nan" sits at index 0 and the
+             # first real category at index 1, matching the +1 shift applied by
+             # convert_input_frame: "nan" -> NaN, category at index k -> float(k).
+             lookup = np.full(len(new_cats), np.nan, dtype=np.float64)
+             old_cat_map = {c: i for i, c in enumerate(categories)}
+
+             for code, cat in enumerate(new_cats):
+                 if cat in old_cat_map:
+                     idx = old_cat_map[cat]
+                     if categories[idx] == "nan":
+                         lookup[code] = np.nan
+                     else:
+                         lookup[code] = float(idx)
+                 else:
+                     # Categories unseen at fit time map to NaN.
+                     lookup[code] = np.nan
+
+             # Apply the lookup in one fancy-indexing pass; `new_indices` are codes
+             # into `new_cats`, and `lookup` already carries NaN for unknown and
+             # "nan" categories.
+             x_enc = lookup[new_indices]
+
+             # Array-level nulls (masked entries) also map to NaN.
+             if arr.null_count > 0:
+                 if arr.buffers()[0]:
+                     valid_bits = np.frombuffer(arr.buffers()[0], dtype=np.uint8)
+                     valid_mask = np.unpackbits(valid_bits, bitorder="little")[
+                         : len(x_enc)
+                     ].astype(bool)
+                     x_enc[~valid_mask] = np.nan
+
+             columns.append(x_enc)
+             masks.append(None)
+         else:
+             series = X[col_name]
+             arr = series.to_arrow()
+             if isinstance(arr, pa.ChunkedArray):
+                 if arr.num_chunks > 1:
+                     arr = arr.combine_chunks()  # Fallback for chunked arrays
+                 else:
+                     arr = arr.chunk(0)
+             # buffers[0] is the validity bitmap, buffers[1] holds the values.
+             buffers = arr.buffers()
+             if buffers[0] is None:
+                 masks.append(None)
+             else:
+                 masks.append(np.frombuffer(buffers[0], dtype=np.uint8))
+             columns.append(np.frombuffer(buffers[1], dtype=np.float64))
+
+     return features_, columns, masks, rows, cols
+
+
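The dictionary-code remapping above reduces to a single fancy-indexing pass once the lookup table is built. A condensed numpy sketch of that mapping:

```python
import numpy as np

categories = ["nan", "blue", "red"]  # fit-time mapping, "nan" pinned at 0
new_cats = ["red", "green", "blue"]  # dictionary of the frame being transformed
codes = np.array([0, 1, 2, 0])       # Arrow dictionary codes into new_cats

old = {c: i for i, c in enumerate(categories)}
lookup = np.array(
    [float(old[c]) if c in old and c != "nan" else np.nan for c in new_cats]
)
x_enc = lookup[codes]
# x_enc == [2.0, nan, 1.0, 2.0]: red -> 2.0, green (unseen) -> NaN, blue -> 1.0
```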
+ CONTRIBUTION_METHODS = {
+     "weight": "Weight",
+     "Weight": "Weight",
+     "average": "Average",
+     "Average": "Average",
+     "branch-difference": "BranchDifference",
+     "branchdifference": "BranchDifference",
+     "BranchDifference": "BranchDifference",
+     "midpoint-difference": "MidpointDifference",
+     "midpointdifference": "MidpointDifference",
+     "MidpointDifference": "MidpointDifference",
+     "mode-difference": "ModeDifference",
+     "modedifference": "ModeDifference",
+     "ModeDifference": "ModeDifference",
+     "ProbabilityChange": "ProbabilityChange",
+     "probabilitychange": "ProbabilityChange",
+     "probability-change": "ProbabilityChange",
+ }
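The table maps user-facing aliases (lowercase, hyphenated, or CamelCase) to the canonical contribution method names. A usage sketch; the `normalize_contribution_method` helper is illustrative, not part of the package:

```python
from perpetual.utils import CONTRIBUTION_METHODS

def normalize_contribution_method(method: str) -> str:
    # Illustrative helper: resolve an alias to its canonical name.
    try:
        return CONTRIBUTION_METHODS[method]
    except KeyError:
        raise ValueError(
            f"Unknown contribution method {method!r}; "
            f"expected one of {sorted(set(CONTRIBUTION_METHODS.values()))}"
        ) from None

assert normalize_contribution_method("midpoint-difference") == "MidpointDifference"
```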
perpetual-1.0.40.dist-info/METADATA ADDED
@@ -0,0 +1,169 @@
+ Metadata-Version: 2.4
+ Name: perpetual
+ Version: 1.0.40
+ Classifier: Programming Language :: Python :: 3
+ Classifier: Programming Language :: Python :: 3.10
+ Classifier: Programming Language :: Python :: 3.11
+ Classifier: Programming Language :: Python :: 3.12
+ Classifier: Programming Language :: Python :: 3.13
+ Classifier: Programming Language :: Python :: 3.14
+ Requires-Dist: numpy
+ Requires-Dist: typing-extensions
+ Requires-Dist: pandas ; extra == 'dev'
+ Requires-Dist: polars ; extra == 'dev'
+ Requires-Dist: pyarrow ; extra == 'dev'
+ Requires-Dist: maturin ; extra == 'dev'
+ Requires-Dist: pytest ; extra == 'dev'
+ Requires-Dist: seaborn ; extra == 'dev'
+ Requires-Dist: scikit-learn ; extra == 'dev'
+ Requires-Dist: mkdocs-material ; extra == 'dev'
+ Requires-Dist: mkdocstrings[python] ; extra == 'dev'
+ Requires-Dist: mkdocs-autorefs ; extra == 'dev'
+ Requires-Dist: ruff ; extra == 'dev'
+ Requires-Dist: xgboost ; extra == 'dev'
+ Requires-Dist: onnxmltools ; extra == 'dev'
+ Requires-Dist: onnx ; extra == 'dev'
+ Requires-Dist: onnxruntime ; python_full_version < '3.14' and extra == 'dev'
+ Provides-Extra: dev
+ License-File: LICENSE
+ Summary: A self-generalizing gradient boosting machine that doesn't need hyperparameter optimization
+ Keywords: rust,perpetual,machine learning,tree model,decision tree,gradient boosted decision tree,gradient boosting machine
+ Home-Page: https://perpetual-ml.com
+ Author-email: Mutlu Simsek <mutlusims3k@gmail.com>, Serkan Korkmaz <serkor1@duck.com>, Pieter Pel <pelpieter@gmail.com>
+ Requires-Python: >=3.10
+ Description-Content-Type: text/markdown; charset=UTF-8; variant=GFM
+
+ <!-- markdownlint-disable MD033 -->
+ # Perpetual
+
+ <p align="center">
+ <img height="120" src="https://github.com/perpetual-ml/perpetual/raw/main/resources/perp_logo.png" alt="Perpetual Logo">
+ </p>
+
+ <div align="center">
+
+ <a href="https://pypi.org/project/perpetual" target="_blank"><img src="https://img.shields.io/pypi/pyversions/perpetual.svg?logo=python&logoColor=white" alt="Python Versions"></a>
+ <a href="https://pypi.org/project/perpetual" target="_blank"><img src="https://img.shields.io/pypi/v/perpetual.svg?logo=pypi&logoColor=white" alt="PyPI Version"></a>
+ <a href="https://crates.io/crates/perpetual" target="_blank"><img src="https://img.shields.io/crates/v/perpetual?logo=rust&logoColor=white" alt="Crates.io Version"></a>
+ <a href="https://perpetual-ml.r-universe.dev/perpetual" target="_blank"><img src="https://img.shields.io/badge/dynamic/json?url=https://perpetual-ml.r-universe.dev/api/packages/perpetual&query=$.Version&label=r-universe&logo=R&logoColor=white&color=brightgreen" alt="R-Universe status"></a>
+ <a href="https://discord.gg/AyUK7rr6wy" target="_blank"><img src="https://img.shields.io/badge/join-discord-blue?logo=discord" alt="Discord"></a>
+ ![PyPI - Downloads](https://img.shields.io/pypi/dm/perpetual)
+
+ </div>
+
+ PerpetualBooster is a gradient boosting machine (GBM) algorithm that, unlike other GBMs, doesn't need hyperparameter optimization. Similar to AutoML libraries, it has a single `budget` parameter. Increasing the `budget` increases the predictive power of the algorithm and gives better results on unseen data. Start with a small budget (e.g. 0.5) and increase it (e.g. to 1.0) once you are confident in your features. If further increasing the `budget` brings no improvement, you are already extracting the most predictive power out of your data.
+
+ ## Supported Languages
+
+ Perpetual is built in Rust and provides high-performance bindings for Python and R.
+
+ | Language   | Installation                    | Documentation                                                                          | Repository                                                      |
+ | :--------- | :------------------------------ | :------------------------------------------------------------------------------------ | :-------------------------------------------------------------- |
+ | **Python** | `pip install perpetual`         | <a href="https://perpetual-ml.github.io/perpetual" target="_blank">Python API</a>      | <a href="./package-python" target="_blank">`package-python`</a> |
+ | **Rust**   | `cargo add perpetual`           | <a href="https://docs.rs/perpetual" target="_blank">docs.rs</a>                        | <a href="./src" target="_blank">`src`</a>                       |
+ | **R**      | `install.packages("perpetual")` | <a href="https://perpetual-ml.github.io/perpetual/r" target="_blank">pkgdown Site</a>  | <a href="./package-r" target="_blank">`package-r`</a>           |
+
+ ## Usage
+
+ You can use the algorithm as in the example below. Check the examples folders for both Rust and Python.
+
+ ```python
+ from perpetual import PerpetualBooster
+
+ model = PerpetualBooster(objective="SquaredLoss", budget=0.5)
+ model.fit(X, y)
+ ```
+
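After fitting, inference follows the same pattern; a minimal continuation of the snippet above (see the Python API reference for the full set of methods):

```python
y_pred = model.predict(X)
```

The contribution method aliases accepted by the library are the ones listed in `CONTRIBUTION_METHODS` in `perpetual/utils.py` above.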
+ ## Documentation
+
+ Comprehensive documentation for all supported languages is available:
+
+ - **Python**: <a href="https://perpetual-ml.github.io/perpetual" target="_blank">API Reference & Guides</a>
+ - **Rust**: <a href="https://docs.rs/perpetual" target="_blank">docs.rs/perpetual</a>
+ - **R**: <a href="https://perpetual-ml.github.io/perpetual/r" target="_blank">pkgdown Documentation</a>
+
+ ## Benchmark
+
+ ### PerpetualBooster vs. Optuna + LightGBM
+
+ Hyperparameter optimization usually takes around 100 iterations with plain GBM algorithms, while PerpetualBooster achieves the same accuracy in a single run. It therefore delivers up to a 100x speed-up at the same accuracy, across different `budget` levels and datasets.
+
+ The following table summarizes the results for the <a href="https://scikit-learn.org/stable/modules/generated/sklearn.datasets.fetch_california_housing.html" target="_blank">California Housing</a> dataset (regression):
+
+ | Perpetual budget | LightGBM n_estimators | Perpetual MSE | LightGBM MSE | Speed-up (wall time) | Speed-up (CPU time) |
+ | :--------------- | :-------------------- | :------------ | :----------- | :------------------- | :------------------ |
+ | 1.0              | 100                   | 0.192         | 0.192        | 54x                  | 56x                 |
+ | 1.5              | 300                   | 0.188         | 0.188        | 59x                  | 58x                 |
+ | 2.1              | 1000                  | 0.185         | 0.186        | 42x                  | 41x                 |
+
+ The following table summarizes the results for the <a href="https://scikit-learn.org/stable/modules/generated/sklearn.datasets.fetch_covtype.html" target="_blank">Cover Types</a> dataset (classification):
+
+ | Perpetual budget | LightGBM n_estimators | Perpetual log loss | LightGBM log loss | Speed-up (wall time) | Speed-up (CPU time) |
+ | :--------------- | :-------------------- | :----------------- | :---------------- | :------------------- | :------------------ |
+ | 0.9              | 100                   | 0.091              | 0.084             | 72x                  | 78x                 |
+
+ The results can be reproduced using the scripts in the <a href="./package-python/examples" target="_blank">examples</a> folder.
+
+ ### PerpetualBooster vs. AutoGluon
+
+ PerpetualBooster is a GBM but behaves like AutoML, so it is also benchmarked against AutoGluon (v1.2, best-quality preset), the current leader in the <a href="https://automlbenchmark.streamlit.app/cd_diagram" target="_blank">AutoML benchmark</a>. The 10 datasets with the most rows were selected from <a href="https://www.openml.org/" target="_blank">OpenML datasets</a> for both regression and classification tasks.
+
+ The results are summarized in the following table for regression tasks:
+
+ | OpenML Task | Perpetual Training Duration | Perpetual Inference Duration | Perpetual RMSE | AutoGluon Training Duration | AutoGluon Inference Duration | AutoGluon RMSE |
+ | :---------- | :-------------------------- | :--------------------------- | :------------- | :-------------------------- | :--------------------------- | :------------- |
+ | <a href="https://www.openml.org/t/359929" target="_blank">Airlines_DepDelay_10M</a> | 518 | 11.3 | 29.0 | 520 | 30.9 | <ins> 28.8 </ins> |
+ | <a href="https://www.openml.org/t/361940" target="_blank">bates_regr_100</a> | 3421 | 15.1 | <ins> 1.084 </ins> | OOM | OOM | OOM |
+ | <a href="https://www.openml.org/t/7327" target="_blank">BNG(libras_move)</a> | 1956 | 4.2 | <ins> 2.51 </ins> | 1922 | 97.6 | 2.53 |
+ | <a href="https://www.openml.org/t/7326" target="_blank">BNG(satellite_image)</a> | 334 | 1.6 | 0.731 | 337 | 10.0 | <ins> 0.721 </ins> |
+ | <a href="https://www.openml.org/t/14949" target="_blank">COMET_MC</a> | 44 | 1.0 | <ins> 0.0615 </ins> | 47 | 5.0 | 0.0662 |
+ | <a href="https://www.openml.org/t/361939" target="_blank">friedman1</a> | 275 | 4.2 | <ins> 1.047 </ins> | 278 | 5.1 | 1.487 |
+ | <a href="https://www.openml.org/t/10102" target="_blank">poker</a> | 38 | 0.6 | <ins> 0.256 </ins> | 41 | 1.2 | 0.722 |
+ | <a href="https://www.openml.org/t/361955" target="_blank">subset_higgs</a> | 868 | 10.6 | <ins> 0.420 </ins> | 870 | 24.5 | 0.421 |
+ | <a href="https://www.openml.org/t/7319" target="_blank">BNG(autoHorse)</a> | 107 | 1.1 | <ins> 19.0 </ins> | 107 | 3.2 | 20.5 |
+ | <a href="https://www.openml.org/t/7318" target="_blank">BNG(pbc)</a> | 48 | 0.6 | <ins> 836.5 </ins> | 51 | 0.2 | 957.1 |
+ | average | 465 | 3.9 | - | 464 | 19.7 | - |
+
+ PerpetualBooster outperformed AutoGluon on 8 out of 10 regression tasks, training equally fast and inferring 5.1x faster.
+
+ The results are summarized in the following table for classification tasks:
+
+ | OpenML Task | Perpetual Training Duration | Perpetual Inference Duration | Perpetual AUC | AutoGluon Training Duration | AutoGluon Inference Duration | AutoGluon AUC |
+ | :---------- | :-------------------------- | :--------------------------- | :------------ | :-------------------------- | :--------------------------- | :------------ |
+ | <a href="https://www.openml.org/t/146163" target="_blank">BNG(spambase)</a> | 70.1 | 2.1 | <ins> 0.671 </ins> | 73.1 | 3.7 | 0.669 |
+ | <a href="https://www.openml.org/t/208" target="_blank">BNG(trains)</a> | 89.5 | 1.7 | <ins> 0.996 </ins> | 106.4 | 2.4 | 0.994 |
+ | <a href="https://www.openml.org/t/361942" target="_blank">breast</a> | 13699.3 | 97.7 | <ins> 0.991 </ins> | 13330.7 | 79.7 | 0.949 |
+ | <a href="https://www.openml.org/t/7291" target="_blank">Click_prediction_small</a> | 89.1 | 1.0 | <ins> 0.749 </ins> | 101.0 | 2.8 | 0.703 |
+ | <a href="https://www.openml.org/t/361938" target="_blank">colon</a> | 12435.2 | 126.7 | <ins> 0.997 </ins> | 12356.2 | 152.3 | 0.997 |
+ | <a href="https://www.openml.org/t/362113" target="_blank">Higgs</a> | 3485.3 | 40.9 | <ins> 0.843 </ins> | 3501.4 | 67.9 | 0.816 |
+ | <a href="https://www.openml.org/t/230" target="_blank">SEA(50000)</a> | 21.9 | 0.2 | <ins> 0.936 </ins> | 25.6 | 0.5 | 0.935 |
+ | <a href="https://www.openml.org/t/359994" target="_blank">sf-police-incidents</a> | 85.8 | 1.5 | <ins> 0.687 </ins> | 99.4 | 2.8 | 0.659 |
+ | <a href="https://www.openml.org/t/361941" target="_blank">bates_classif_100</a> | 11152.8 | 50.0 | <ins> 0.864 </ins> | OOM | OOM | OOM |
+ | <a href="https://www.openml.org/t/361945" target="_blank">prostate</a> | 13699.9 | 79.8 | <ins> 0.987 </ins> | OOM | OOM | OOM |
+ | average | 3747.0 | 34.0 | - | 3699.2 | 39.0 | - |
+
+ PerpetualBooster outperformed AutoGluon on 10 out of 10 classification tasks, training equally fast and inferring 1.1x faster.
+
+ PerpetualBooster also demonstrates greater robustness than AutoGluon, successfully training on all 20 tasks, whereas AutoGluon encountered out-of-memory errors on 3 of them.
+
+ The results can be reproduced using the <a href="https://github.com/deadsoul44/automlbenchmark" target="_blank">automlbenchmark fork</a>.
+
+ ## Contribution
+
+ Contributions are welcome. Check <a href="./CONTRIBUTING.md" target="_blank">CONTRIBUTING.md</a> for the guidelines.
+
+ ## Paper
+
+ PerpetualBooster prevents overfitting with a generalization algorithm. A paper explaining how the algorithm works is in progress. Check our <a href="https://perpetual-ml.com/blog/how-perpetual-works" target="_blank">blog post</a> for a high-level introduction to the algorithm.
+
+ ## Perpetual ML Suite
+
+ The **Perpetual ML Suite** is a comprehensive, batteries-included ML platform designed to deliver maximum predictive power with minimal effort. It allows you to track experiments, monitor metrics, and manage model drift through an intuitive interface.
+
+ For a fully managed, **serverless ML experience**, visit <a href="https://app.perpetual-ml.com" target="_blank">app.perpetual-ml.com</a>.
+
+ - **Serverless Marimo Notebooks**: Run interactive, reactive notebooks without managing any infrastructure.
+ - **Serverless ML Endpoints**: One-click deployment of models as production-ready endpoints for real-time inference.
+
+ Perpetual is also designed to live where your data lives. It is available as a native application on the <a href="https://app.snowflake.com/marketplace/listing/GZSYZX0EMJ/perpetual-ml-perpetual-ml-suite" target="_blank">Snowflake Marketplace</a>, with support for Databricks and other major data warehouses coming soon.
+
perpetual-1.0.40.dist-info/RECORD ADDED
@@ -0,0 +1,12 @@
+ perpetual/__init__.py,sha256=hKi__gjLuv4MIKSdmiuLg-Y3Aj3Wj8O3zOQ0u9dw5Cc,115
+ perpetual/booster.py,sha256=lhXoBq7tMwd9Upa7WMdnjr0FRRcMaqPWxLbmcJLfK6U,73005
+ perpetual/data.py,sha256=e2xF5xVq3KYotj5fpIhSfnF3B4qLQpdHYDSaP5NpcxA,768
+ perpetual/perpetual.cpython-311-darwin.so,sha256=bpV-Pce18WVZmNYUDHqTNsTCa67D9X8teW1TygAeblU,1476216
+ perpetual/serialize.py,sha256=Tg2BbuA1jKQ5-ITuVhwtj6hgBaRAbZ66eHctR7fcVk4,1883
+ perpetual/sklearn.py,sha256=6Kl3dlYBQK0yaFF7I7qzfyOP_dtc_c3q05OjKxCApmk,7011
+ perpetual/types.py,sha256=T0KJu8bK8xiYHaPt8b6RmUR1xP3f5N1FV7qaZTy1rtM,3232
+ perpetual/utils.py,sha256=qxWSlS1yZNtSRECgTOeDILqeA1wJHV5SYPRUgtL9Goc,16894
+ perpetual-1.0.40.dist-info/METADATA,sha256=kXiPgDS3hiLpAv86EH-ytYLeMi9pca_ltLR1GcPo4jE,16083
+ perpetual-1.0.40.dist-info/WHEEL,sha256=uo497LsoCAD-gruBuBwmuPSjitTK7HU_NyyROWcAuhA,107
+ perpetual-1.0.40.dist-info/licenses/LICENSE,sha256=ixuiBLtpoK3iv89l7ylKkg9rs2GzF9ukPH7ynZYzK5s,35148
+ perpetual-1.0.40.dist-info/RECORD,,
perpetual-1.0.40.dist-info/WHEEL ADDED
@@ -0,0 +1,4 @@
+ Wheel-Version: 1.0
+ Generator: maturin (1.11.5)
+ Root-Is-Purelib: false
+ Tag: cp311-cp311-macosx_10_12_x86_64