perpetual 1.0.40__cp311-cp311-macosx_10_12_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- perpetual/__init__.py +5 -0
- perpetual/booster.py +1915 -0
- perpetual/data.py +27 -0
- perpetual/perpetual.cpython-311-darwin.so +0 -0
- perpetual/serialize.py +74 -0
- perpetual/sklearn.py +194 -0
- perpetual/types.py +151 -0
- perpetual/utils.py +462 -0
- perpetual-1.0.40.dist-info/METADATA +169 -0
- perpetual-1.0.40.dist-info/RECORD +12 -0
- perpetual-1.0.40.dist-info/WHEEL +4 -0
- perpetual-1.0.40.dist-info/licenses/LICENSE +674 -0
perpetual/utils.py
ADDED
|
@@ -0,0 +1,462 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
from typing import Dict, Iterable, List, Optional, Tuple
|
|
3
|
+
|
|
4
|
+
import numpy as np
|
|
5
|
+
|
|
6
|
+
logger = logging.getLogger(__name__)
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def type_df(df):
    """Identify which dataframe library *df* comes from.

    Returns:
        str: "pandas_df" or "polars_df" for DataFrame objects, "numpy" for
        numpy arrays, and "" for anything unrecognized.
    """
    library_name = type(df).__module__.split(".")[0]
    if type(df).__name__ == "DataFrame":
        if library_name == "pandas":
            return "pandas_df"
        if library_name == "polars":
            return "polars_df"
        # A DataFrame from an unrecognized library previously fell through and
        # returned None implicitly; return "" so the function always yields str.
        return ""
    if library_name == "numpy":
        return "numpy"
    return ""
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def type_series(y):
    """Identify which series library *y* comes from.

    Returns:
        str: "pandas_series" or "polars_series" for Series objects, "numpy"
        for numpy arrays, and "" for anything unrecognized.
    """
    library_name = type(y).__module__.split(".")[0]
    if type(y).__name__ == "Series":
        if library_name == "pandas":
            return "pandas_series"
        if library_name == "polars":
            return "polars_series"
        # A Series from an unrecognized library previously fell through and
        # returned None implicitly; return "" so the function always yields str.
        return ""
    if library_name == "numpy":
        return "numpy"
    return ""
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def convert_input_array(x, objective, is_target=False, is_int=False) -> Tuple[np.ndarray, Iterable]:
    """Convert a series-like input into the flat numpy layout the booster expects.

    Args:
        x: numpy array, pandas/polars Series, or a 2-d frame-like object.
        objective: objective name; "LogLoss" targets get class handling.
        is_target: whether `x` is the target column (enables LogLoss class
            detection and, for more than two classes, one-hot encoding).
        is_int: cast the output to uint64 instead of float64.

    Returns:
        Tuple of the converted (possibly flattened, column-major) array and the
        detected classes (empty when not applicable).
        Note: the annotation previously claimed a bare np.ndarray return, but a
        2-tuple has always been returned.
    """
    classes_ = []

    if type(x).__module__.split(".")[0] == "numpy":
        if len(x.shape) == 2:
            # 2-d arrays reuse the frame converter for consistent flattening.
            # NOTE(review): the first element of that tuple is column names,
            # which lands in classes_ here — confirm callers expect this.
            classes_, x_, *_ = convert_input_frame(x, None, 1000)
        else:
            x_ = x
    elif type_series(x) == "pandas_series":
        x_ = x.to_numpy()
    elif type_series(x) == "polars_series":
        x_ = x.to_numpy(allow_copy=False)
    elif type_df(x) == "polars_df" or type_df(x) == "pandas_df":
        classes_, x_, *_ = convert_input_frame(x, None, 1000)
    else:
        x_ = x.to_numpy()

    if is_target and objective == "LogLoss" and len(x_.shape) == 1:
        classes_ = np.unique(x_)
        if len(classes_) > 2:
            # Multi-class target: one-hot encode. np.unique returns sorted
            # classes, so searchsorted maps each label to its class index in
            # O(n log k) — the previous per-element np.where scan was O(n*k)
            # and was also computed (then discarded) in the binary case.
            x_index = np.searchsorted(classes_, x_)
            x_ = np.eye(len(classes_))[x_index]

    if is_int and not np.issubdtype(x_.dtype, "uint64"):
        x_ = x_.astype(dtype="uint64", copy=False)

    if not is_int and not np.issubdtype(x_.dtype, "float64"):
        x_ = x_.astype(dtype="float64", copy=False)

    if len(x_.shape) == 2:
        # The booster consumes a flat column-major buffer.
        x_ = x_.ravel(order="F")

    return x_, classes_
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def convert_input_frame(
    X,
    categorical_features,
    max_cat,
) -> Tuple[List[str], np.ndarray, int, int, Optional[Iterable[int]], Optional[Dict]]:
    """Convert data to format needed by booster.

    Accepts a pandas DataFrame or a 2-d numpy array. Categorical columns are
    label-encoded as floats with slot 0 reserved for "nan" and missing values
    stored as np.nan; the data is then cast to float64 and flattened
    column-major (Fortran order).

    NOTE(review): categorical encoding writes back into X_ in place; for numpy
    input X_ aliases X, so the caller's array may be modified — confirm callers
    do not rely on X staying untouched.

    Returns:
        Tuple[List[str], np.ndarray, int, int, Optional[Iterable[int]], Optional[Dict]]: Return column names, the flat data, number of rows, the number of columns, cat_index, cat_mapping
    """
    categorical_features_ = None
    if type_df(X) == "pandas_df":
        X_ = X.to_numpy()
        features_ = X.columns.to_list()
        if categorical_features == "auto":
            # "auto" (pandas only): every `category`-dtype column is categorical.
            categorical_columns = X.select_dtypes(include=["category"]).columns.tolist()
            categorical_features_ = [
                features_.index(c) for c in categorical_columns
            ] or None
    elif type_df(X) == "numpy":
        X_ = X
        # Bare arrays get synthetic string column names "0", "1", ...
        features_ = list(map(str, range(X_.shape[1])))
    else:
        raise ValueError(f"Object type {type(X)} is not supported.")

    # Explicit user-supplied categorical features override auto-detection:
    # either a list of column indices or a list of column names.
    if (
        categorical_features
        and all(isinstance(s, int) for s in categorical_features)
        and isinstance(categorical_features, list)
    ):
        categorical_features_ = categorical_features
    elif (
        categorical_features
        and all(isinstance(s, str) for s in categorical_features)
        and isinstance(categorical_features, list)
    ):
        categorical_features_ = [features_.index(c) for c in categorical_features]

    cat_mapping = {}  # key: feature_name, value: ordered category names
    cat_to_num = []  # categorical columns demoted to numerical (too many categories)
    if categorical_features_:
        for i in categorical_features_:
            # Label-encode through sorted unique values; `inversed` holds the
            # 0-based code of each row into `categories`.
            categories, inversed = np.unique(X_[:, i].astype(str), return_inverse=True)

            categories = list(categories)
            if "nan" in categories:
                categories.remove("nan")
            # Reserve slot 0 for "nan" so code 0 always means missing.
            categories.insert(0, "nan")

            # Shift codes by one to account for the reserved "nan" slot.
            inversed = inversed + 1.0

            if len(categories) > max_cat:
                cat_to_num.append(i)
                logger.warning(
                    f"Feature {features_[i]} will be treated as numerical since the number of categories ({len(categories)}) exceeds max_cat ({max_cat}) threshold."
                )

            feature_name = features_[i]
            cat_mapping[feature_name] = categories
            # A shifted code equal to len(categories) can only come from a
            # literal "nan" string that np.unique sorted last; turn it into NaN.
            # NOTE(review): codes are not re-mapped after "nan" is moved to the
            # front above — this stays aligned only if "nan" sorts after every
            # real category string; confirm intended.
            ind_nan = len(categories)
            inversed[inversed == ind_nan] = np.nan
            X_[:, i] = inversed

        # Drop the demoted columns from the categorical index set.
        categorical_features_ = [
            x for x in categorical_features_ if x not in cat_to_num
        ]

        logger.info(f"Categorical features: {categorical_features_}")
        logger.info(f"Mapping of categories: {cat_mapping}")

    if not np.issubdtype(X_.dtype, "float64"):
        X_ = X_.astype(dtype="float64", copy=False)
    # Booster consumes a single flat buffer in column-major order.
    flat_data = X_.ravel(order="F")
    rows, cols = X_.shape

    if isinstance(categorical_features_, list):
        categorical_features_ = set(categorical_features_)

    return features_, flat_data, rows, cols, categorical_features_, cat_mapping
|
|
149
|
+
|
|
150
|
+
|
|
151
|
+
def convert_input_frame_columnar(
    X, categorical_features, max_cat
) -> Tuple[
    List[str],
    List[np.ndarray],
    List[Optional[np.ndarray]],
    int,
    int,
    Optional[set],
    dict,
]:
    """Convert Polars DataFrame to columnar format for zero-copy transfer.

    Instead of flattening into one buffer, returns per-column numpy arrays plus
    per-column Arrow validity bitmaps (or None), so numeric columns can be
    handed over without copying.

    Returns list of column arrays and list of validity masks.
    """
    import polars.selectors as cs

    features_ = list(X.columns)
    rows, cols = X.shape

    # Determine categorical features: "auto" uses Polars' Categorical dtype,
    # otherwise accept an explicit list of column indices or column names.
    categorical_features_ = None
    if categorical_features == "auto":
        categorical_columns = X.select(cs.categorical()).columns
        categorical_features_ = [
            features_.index(c) for c in categorical_columns
        ] or None
    elif (
        categorical_features
        and all(isinstance(s, int) for s in categorical_features)
        and isinstance(categorical_features, list)
    ):
        categorical_features_ = categorical_features
    elif (
        categorical_features
        and all(isinstance(s, str) for s in categorical_features)
        and isinstance(categorical_features, list)
    ):
        categorical_features_ = [features_.index(c) for c in categorical_features]

    cat_mapping = {}  # key: feature_name, value: ordered category names
    cat_to_num = []  # categorical columns demoted to numerical (too many categories)
    categorical_set = set(categorical_features_) if categorical_features_ else set()

    # Convert each column to a numpy array (plus optional validity mask).
    columns = []
    masks = []
    import pyarrow as pa

    for i, col_name in enumerate(features_):
        if i in categorical_set:
            # Categorical column: dictionary-encode via Arrow so we get integer
            # codes and the category list without a numpy object-dtype round trip.
            arr = X[col_name].to_arrow()
            if isinstance(arr, pa.ChunkedArray):
                arr = arr.combine_chunks()

            if not isinstance(arr, pa.DictionaryArray):
                arr = arr.dictionary_encode()

            # The Arrow dictionary holds the category values (usually strings).
            cats = arr.dictionary.to_pylist()

            # Codes as float64 for Perpetual; zero_copy_only=False because the
            # indices may carry nulls.
            indices = arr.indices.to_numpy(zero_copy_only=False)
            out_values = indices.astype(np.float64)
            out_values += 1.0  # Shift: 0 in Perpetual is "nan"

            # Arrow nulls become NaN: decode the validity bitmap (1 bit per
            # row, little-endian bit order) and blank out the invalid rows.
            if arr.null_count > 0:
                row_count = len(out_values)
                if arr.buffers()[0]:
                    valid_bits = np.frombuffer(arr.buffers()[0], dtype=np.uint8)
                    valid_mask = np.unpackbits(valid_bits, bitorder="little")[
                        :row_count
                    ].astype(bool)
                    # mask is 1 where valid, 0 where null.
                    out_values[~valid_mask] = np.nan

            # A literal "nan" category also maps to NaN; its dictionary slot is
            # removed so slot 0 can be reserved for "nan" below.
            if "nan" in cats:
                nan_idx = cats.index("nan")
                # Codes pointing at the "nan" string (shifted by +1) become NaN.
                out_values[out_values == (nan_idx + 1.0)] = np.nan
                cats.remove("nan")

            # NOTE(review): codes for categories that followed "nan" in the
            # Arrow dictionary are not re-mapped after this reorder — confirm
            # alignment with the decoding convention.
            cats.insert(0, "nan")

            if len(cats) > max_cat:
                cat_to_num.append(i)
                logger.warning(
                    f"Feature {col_name} will be treated as numerical since the number of categories ({len(cats)}) exceeds max_cat ({max_cat}) threshold."
                )

            cat_mapping[col_name] = cats
            columns.append(out_values)
            masks.append(None)  # Categorical encoding handles NaNs
        else:
            # Numeric column: expose Arrow's buffers directly for zero-copy.
            series = X[col_name]
            arr = series.to_arrow()
            if isinstance(arr, pa.ChunkedArray):
                if arr.num_chunks > 1:
                    arr = arr.combine_chunks()
                else:
                    arr = arr.chunk(0)

            # buffers[0] is the validity bitmap, buffers[1] the values.
            buffers = arr.buffers()
            if buffers[0] is None:
                masks.append(None)
            else:
                masks.append(np.frombuffer(buffers[0], dtype=np.uint8))

            # NOTE(review): values are reinterpreted as float64 — assumes the
            # column is already Float64 on the Polars side; confirm upstream.
            col_array = np.frombuffer(buffers[1], dtype=np.float64)
            columns.append(col_array)

    if categorical_features_:
        # Drop columns demoted to numerical from the categorical index set.
        categorical_features_ = [
            x for x in categorical_features_ if x not in cat_to_num
        ]
        logger.info(f"Categorical features: {categorical_features_}")
        logger.info(f"Mapping of categories: {cat_mapping}")

    if isinstance(categorical_features_, list):
        categorical_features_ = set(categorical_features_)

    return features_, columns, masks, rows, cols, categorical_features_, cat_mapping
|
|
290
|
+
|
|
291
|
+
|
|
292
|
+
def transform_input_frame(X, cat_mapping) -> Tuple[List[str], np.ndarray, int, int]:
    """Convert data to format needed by booster.

    Prediction-time counterpart of `convert_input_frame`: re-uses the
    `cat_mapping` built at fit time instead of deriving categories from the
    data, so encoding is consistent between training and inference.

    Returns:
        Tuple[List[str], np.ndarray, int, int]: Return column names, the flat data, number of rows, the number of columns
    """
    if type_df(X) == "pandas_df":
        X_ = X.to_numpy()
        features_ = X.columns.to_list()
    elif type_df(X) == "numpy":
        X_ = X
        # Bare arrays get synthetic string column names "0", "1", ...
        features_ = list(map(str, range(X_.shape[1])))
    else:
        raise ValueError(f"Object type {type(X)} is not supported.")

    if cat_mapping:
        for feature_name, categories in cat_mapping.items():
            feature_index = features_.index(feature_name)
            # `categories` has "nan" reserved at slot 0; drop it so
            # searchsorted operates on the sorted real categories only.
            cats = categories.copy()
            cats.remove("nan")
            # NOTE(review): searchsorted maps any unseen value to its insertion
            # point — an unseen category sorting between known ones silently
            # receives a neighbor's code; only values sorting past the end
            # become NaN below. Confirm this is the intended behavior.
            x_enc = np.searchsorted(cats, X_[:, feature_index].astype(str))
            # Shift by one so codes line up with the "nan"-at-0 convention.
            x_enc = x_enc + 1.0
            ind_nan = len(categories)
            x_enc[x_enc == ind_nan] = np.nan
            X_[:, feature_index] = x_enc

    if not np.issubdtype(X_.dtype, "float64"):
        X_ = X_.astype(dtype="float64", copy=False)
    # Booster consumes a single flat buffer in column-major order.
    flat_data = X_.ravel(order="F")
    rows, cols = X_.shape

    return features_, flat_data, rows, cols
|
|
324
|
+
|
|
325
|
+
|
|
326
|
+
def transform_input_frame_columnar(
    X, cat_mapping
) -> Tuple[List[str], List[np.ndarray], List[Optional[np.ndarray]], int, int]:
    """Convert Polars DataFrame to columnar format for zero-copy prediction.

    Prediction-time counterpart of `convert_input_frame_columnar`: categorical
    columns are re-encoded against the fit-time `cat_mapping` (unknown
    categories become NaN), numeric columns are exposed via Arrow buffers.

    Returns list of column arrays and masks instead of flattened data, avoiding copies.
    """
    features_ = list(X.columns)
    rows, cols = X.shape

    columns = []
    masks = []
    import pyarrow as pa

    for i, col_name in enumerate(features_):
        if cat_mapping and col_name in cat_mapping:
            # Categorical column: map this frame's dictionary codes onto the
            # fit-time encoding. `categories` has "nan" reserved at slot 0, so
            # a category at slot k encodes as float k (k >= 1).
            categories = cat_mapping[col_name]

            # Dictionary-encode via Arrow to get codes + category values.
            arr = X[col_name].to_arrow()
            if isinstance(arr, pa.ChunkedArray):
                arr = arr.combine_chunks()
            if not isinstance(arr, pa.DictionaryArray):
                arr = arr.dictionary_encode()

            # Categories present in the incoming data (may differ from fit time).
            new_cats = arr.dictionary.to_pylist()

            # Null codes are filled with 0 so to_numpy() yields integers usable
            # for indexing; the affected rows are re-masked to NaN further down.
            filled_indices_arr = arr.indices.fill_null(0)
            new_indices = filled_indices_arr.to_numpy()

            # lookup[new_code] -> fit-time float code (NaN for unknown/"nan").
            lookup = np.full(len(new_cats), np.nan, dtype=np.float64)

            old_cat_map = {c: i for i, c in enumerate(categories)}

            # NOTE(review): this inner loop shadows the outer column index `i`;
            # harmless today because `i` is not read again before the outer
            # loop reassigns it, but fragile — consider renaming.
            for i, cat in enumerate(new_cats):
                if cat in old_cat_map:
                    idx = old_cat_map[cat]
                    # A literal "nan" category encodes as missing, matching the
                    # fit-time convention.
                    if categories[idx] == "nan":
                        lookup[i] = np.nan
                    else:
                        # `categories` has "nan" at 0, so idx is already the
                        # shifted (1-based) code used at fit time.
                        lookup[i] = float(idx)
                else:
                    # Category unseen at fit time -> missing.
                    lookup[i] = np.nan

            # Vectorized code translation for the whole column.
            x_enc = lookup[new_indices]

            # Rows that were null in Arrow (masked in the validity bitmap,
            # 1 bit per row, little-endian order) become NaN.
            if arr.null_count > 0:
                if arr.buffers()[0]:
                    valid_bits = np.frombuffer(arr.buffers()[0], dtype=np.uint8)
                    valid_mask = np.unpackbits(valid_bits, bitorder="little")[
                        : len(x_enc)
                    ].astype(bool)
                    x_enc[~valid_mask] = np.nan

            columns.append(x_enc)
            masks.append(None)
        else:
            # Numeric column: expose Arrow's validity and value buffers
            # directly for zero-copy transfer.
            series = X[col_name]
            arr = series.to_arrow()
            if isinstance(arr, pa.ChunkedArray):
                if arr.num_chunks > 1:
                    arr = arr.combine_chunks()  # Fallback for chunked
                else:
                    arr = arr.chunk(0)
            # buffers[0] is the validity bitmap, buffers[1] the values.
            buffers = arr.buffers()
            if buffers[0] is None:
                masks.append(None)
            else:
                masks.append(np.frombuffer(buffers[0], dtype=np.uint8))
            # NOTE(review): values reinterpreted as float64 — assumes the
            # column is already Float64 on the Polars side; confirm upstream.
            columns.append(np.frombuffer(buffers[1], dtype=np.float64))

    return features_, columns, masks, rows, cols
|
|
443
|
+
|
|
444
|
+
|
|
445
|
+
# Accepted spellings for each contribution method, mapped to the canonical
# name understood by the booster. Built from (canonical, aliases) groups so
# every alias resolves to exactly one canonical value.
CONTRIBUTION_METHODS = {
    alias: canonical
    for canonical, aliases in (
        ("Weight", ("weight", "Weight")),
        ("Average", ("average", "Average")),
        ("BranchDifference", ("branch-difference", "branchdifference", "BranchDifference")),
        ("MidpointDifference", ("midpoint-difference", "midpointdifference", "MidpointDifference")),
        ("ModeDifference", ("mode-difference", "modedifference", "ModeDifference")),
        ("ProbabilityChange", ("ProbabilityChange", "probabilitychange", "probability-change")),
    )
    for alias in aliases
}
|
|
@@ -0,0 +1,169 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: perpetual
|
|
3
|
+
Version: 1.0.40
|
|
4
|
+
Classifier: Programming Language :: Python :: 3
|
|
5
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
6
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
7
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
8
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
9
|
+
Classifier: Programming Language :: Python :: 3.14
|
|
10
|
+
Requires-Dist: numpy
|
|
11
|
+
Requires-Dist: typing-extensions
|
|
12
|
+
Requires-Dist: pandas ; extra == 'dev'
|
|
13
|
+
Requires-Dist: polars ; extra == 'dev'
|
|
14
|
+
Requires-Dist: pyarrow ; extra == 'dev'
|
|
15
|
+
Requires-Dist: maturin ; extra == 'dev'
|
|
16
|
+
Requires-Dist: pytest ; extra == 'dev'
|
|
17
|
+
Requires-Dist: seaborn ; extra == 'dev'
|
|
18
|
+
Requires-Dist: scikit-learn ; extra == 'dev'
|
|
19
|
+
Requires-Dist: mkdocs-material ; extra == 'dev'
|
|
20
|
+
Requires-Dist: mkdocstrings[python] ; extra == 'dev'
|
|
21
|
+
Requires-Dist: mkdocs-autorefs ; extra == 'dev'
|
|
22
|
+
Requires-Dist: ruff ; extra == 'dev'
|
|
23
|
+
Requires-Dist: xgboost ; extra == 'dev'
|
|
24
|
+
Requires-Dist: onnxmltools ; extra == 'dev'
|
|
25
|
+
Requires-Dist: onnx ; extra == 'dev'
|
|
26
|
+
Requires-Dist: onnxruntime ; python_full_version < '3.14' and extra == 'dev'
|
|
27
|
+
Provides-Extra: dev
|
|
28
|
+
License-File: LICENSE
|
|
29
|
+
Summary: A self-generalizing gradient boosting machine that doesn't need hyperparameter optimization
|
|
30
|
+
Keywords: rust,perpetual,machine learning,tree model,decision tree,gradient boosted decision tree,gradient boosting machine
|
|
31
|
+
Home-Page: https://perpetual-ml.com
|
|
32
|
+
Author-email: Mutlu Simsek <mutlusims3k@gmail.com>, Serkan Korkmaz <serkor1@duck.com>, Pieter Pel <pelpieter@gmail.com>
|
|
33
|
+
Requires-Python: >=3.10
|
|
34
|
+
Description-Content-Type: text/markdown; charset=UTF-8; variant=GFM
|
|
35
|
+
|
|
36
|
+
<!-- markdownlint-disable MD033 -->
|
|
37
|
+
# Perpetual
|
|
38
|
+
|
|
39
|
+
<p align="center">
|
|
40
|
+
<img height="120" src="https://github.com/perpetual-ml/perpetual/raw/main/resources/perp_logo.png" alt="Perpetual Logo">
|
|
41
|
+
</p>
|
|
42
|
+
|
|
43
|
+
<div align="center">
|
|
44
|
+
|
|
45
|
+
<a href="https://pypi.org/project/perpetual" target="_blank"><img src="https://img.shields.io/pypi/pyversions/perpetual.svg?logo=python&logoColor=white" alt="Python Versions"></a>
|
|
46
|
+
<a href="https://pypi.org/project/perpetual" target="_blank"><img src="https://img.shields.io/pypi/v/perpetual.svg?logo=pypi&logoColor=white" alt="PyPI Version"></a>
|
|
47
|
+
<a href="https://crates.io/crates/perpetual" target="_blank"><img src="https://img.shields.io/crates/v/perpetual?logo=rust&logoColor=white" alt="Crates.io Version"></a>
|
|
48
|
+
<a href="https://perpetual-ml.r-universe.dev/perpetual" target="_blank"><img src="https://img.shields.io/badge/dynamic/json?url=https://perpetual-ml.r-universe.dev/api/packages/perpetual&query=$.Version&label=r-universe&logo=R&logoColor=white&color=brightgreen" alt="R-Universe status"></a>
|
|
49
|
+
<a href="https://discord.gg/AyUK7rr6wy" target="_blank"><img src="https://img.shields.io/badge/join-discord-blue?logo=discord" alt="Static Badge"></a>
|
|
50
|
+

|
|
51
|
+
|
|
52
|
+
</div>
|
|
53
|
+
|
|
54
|
+
PerpetualBooster is a gradient boosting machine (GBM) algorithm that doesn't need hyperparameter optimization unlike other GBMs. Similar to AutoML libraries, it has a `budget` parameter. Increasing the `budget` parameter increases the predictive power of the algorithm and gives better results on unseen data. Start with a small budget (e.g. 0.5) and increase it (e.g. 1.0) once you are confident with your features. If you don't see any improvement with further increasing the `budget`, it means that you are already extracting the most predictive power out of your data.
|
|
55
|
+
|
|
56
|
+
## Supported Languages
|
|
57
|
+
|
|
58
|
+
Perpetual is built in Rust and provides high-performance bindings for Python and R.
|
|
59
|
+
|
|
60
|
+
| Language | Installation | Documentation | Repository |
|
|
61
|
+
| :--------- | :------------------------------ | :------------------------------------------------------------------------------------ | :-------------------------------------------------------------- |
|
|
62
|
+
| **Python** | `pip install perpetual` | <a href="https://perpetual-ml.github.io/perpetual" target="_blank">Python API</a> | <a href="./package-python" target="_blank">`package-python`</a> |
|
|
63
|
+
| **Rust** | `cargo add perpetual` | <a href="https://docs.rs/perpetual" target="_blank">docs.rs</a> | <a href="./src" target="_blank">`src`</a> |
|
|
64
|
+
| **R** | `install.packages("perpetual")` | <a href="https://perpetual-ml.github.io/perpetual/r" target="_blank">pkgdown Site</a> | <a href="./package-r" target="_blank">`package-r`</a> |
|
|
65
|
+
|
|
66
|
+
## Usage
|
|
67
|
+
|
|
68
|
+
You can use the algorithm like in the example below. Check examples folders for both Rust and Python.
|
|
69
|
+
|
|
70
|
+
```python
|
|
71
|
+
from perpetual import PerpetualBooster
|
|
72
|
+
|
|
73
|
+
model = PerpetualBooster(objective="SquaredLoss", budget=0.5)
|
|
74
|
+
model.fit(X, y)
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
## Documentation
|
|
78
|
+
|
|
79
|
+
Comprehensive documentation for all supported languages is available:
|
|
80
|
+
|
|
81
|
+
- **Python**: <a href="https://perpetual-ml.github.io/perpetual" target="_blank">API Reference & Guides</a>
|
|
82
|
+
- **Rust**: <a href="https://docs.rs/perpetual" target="_blank">docs.rs/perpetual</a>
|
|
83
|
+
- **R**: <a href="https://perpetual-ml.github.io/perpetual/r" target="_blank">pkgdown Documentation</a>
|
|
84
|
+
|
|
85
|
+
## Benchmark
|
|
86
|
+
|
|
87
|
+
### PerpetualBooster vs. Optuna + LightGBM
|
|
88
|
+
|
|
89
|
+
Hyperparameter optimization usually takes 100 iterations with plain GBM algorithms. PerpetualBooster achieves the same accuracy in a single run. Thus, it achieves up to 100x speed-up at the same accuracy with different `budget` levels and with different datasets.
|
|
90
|
+
|
|
91
|
+
The following table summarizes the results for the <a href="https://scikit-learn.org/stable/modules/generated/sklearn.datasets.fetch_california_housing.html" target="_blank">California Housing</a> dataset (regression):
|
|
92
|
+
|
|
93
|
+
| Perpetual budget | LightGBM n_estimators | Perpetual mse | LightGBM mse | Speed-up wall time | Speed-up cpu time |
|
|
94
|
+
| :--------------- | :-------------------- | :------------ | :----------- | :----------------- | :---------------- |
|
|
95
|
+
| 1.0 | 100 | 0.192 | 0.192 | 54x | 56x |
|
|
96
|
+
| 1.5 | 300 | 0.188 | 0.188 | 59x | 58x |
|
|
97
|
+
| 2.1 | 1000 | 0.185 | 0.186 | 42x | 41x |
|
|
98
|
+
|
|
99
|
+
The following table summarizes the results for the <a href="https://scikit-learn.org/stable/modules/generated/sklearn.datasets.fetch_covtype.html" target="_blank">Cover Types</a> dataset (classification):
|
|
100
|
+
|
|
101
|
+
| Perpetual budget | LightGBM n_estimators | Perpetual log loss | LightGBM log loss | Speed-up wall time | Speed-up cpu time |
|
|
102
|
+
| :--------------- | :-------------------- | :----------------- | :---------------- | :----------------- | :---------------- |
|
|
103
|
+
| 0.9 | 100 | 0.091 | 0.084 | 72x | 78x |
|
|
104
|
+
|
|
105
|
+
The results can be reproduced using the scripts in the <a href="./package-python/examples" target="_blank">examples</a> folder.
|
|
106
|
+
|
|
107
|
+
### PerpetualBooster vs. AutoGluon
|
|
108
|
+
|
|
109
|
+
PerpetualBooster is a GBM but behaves like AutoML so it is benchmarked also against AutoGluon (v1.2, best quality preset), the current leader in <a href="https://automlbenchmark.streamlit.app/cd_diagram" target="_blank">AutoML benchmark</a>. Top 10 datasets with the most number of rows are selected from <a href="https://www.openml.org/" target="_blank">OpenML datasets</a> for both regression and classification tasks.
|
|
110
|
+
|
|
111
|
+
The results are summarized in the following table for regression tasks:
|
|
112
|
+
|
|
113
|
+
| OpenML Task | Perpetual Training Duration | Perpetual Inference Duration | Perpetual RMSE | AutoGluon Training Duration | AutoGluon Inference Duration | AutoGluon RMSE |
|
|
114
|
+
| :---------------------------------------------------------------------------------- | :-------------------------- | :--------------------------- | :------------------ | :-------------------------- | :--------------------------- | :----------------- |
|
|
115
|
+
| <a href="https://www.openml.org/t/359929" target="_blank">Airlines_DepDelay_10M</a> | 518 | 11.3 | 29.0 | 520 | 30.9 | <ins> 28.8 </ins> |
|
|
116
|
+
| <a href="https://www.openml.org/t/361940" target="_blank">bates_regr_100</a> | 3421 | 15.1 | <ins> 1.084 </ins> | OOM | OOM | OOM |
|
|
117
|
+
| <a href="https://www.openml.org/t/7327" target="_blank">BNG(libras_move)</a> | 1956 | 4.2 | <ins> 2.51 </ins> | 1922 | 97.6 | 2.53 |
|
|
118
|
+
| <a href="https://www.openml.org/t/7326" target="_blank">BNG(satellite_image)</a> | 334 | 1.6 | 0.731 | 337 | 10.0 | <ins> 0.721 </ins> |
|
|
119
|
+
| <a href="https://www.openml.org/t/14949" target="_blank">COMET_MC</a> | 44 | 1.0 | <ins> 0.0615 </ins> | 47 | 5.0 | 0.0662 |
|
|
120
|
+
| <a href="https://www.openml.org/t/361939" target="_blank">friedman1</a> | 275 | 4.2 | <ins> 1.047 </ins> | 278 | 5.1 | 1.487 |
|
|
121
|
+
| <a href="https://www.openml.org/t/10102" target="_blank">poker</a> | 38 | 0.6 | <ins> 0.256 </ins> | 41 | 1.2 | 0.722 |
|
|
122
|
+
| <a href="https://www.openml.org/t/361955" target="_blank">subset_higgs</a> | 868 | 10.6 | <ins> 0.420 </ins> | 870 | 24.5 | 0.421 |
|
|
123
|
+
| <a href="https://www.openml.org/t/7319" target="_blank">BNG(autoHorse)</a> | 107 | 1.1 | <ins> 19.0 </ins> | 107 | 3.2 | 20.5 |
|
|
124
|
+
| <a href="https://www.openml.org/t/7318" target="_blank">BNG(pbc)</a> | 48 | 0.6 | <ins> 836.5 </ins> | 51 | 0.2 | 957.1 |
|
|
125
|
+
| average | 465 | 3.9 | - | 464 | 19.7 | - |
|
|
126
|
+
|
|
127
|
+
PerpetualBooster outperformed AutoGluon on 8 out of 10 regression tasks, training equally fast and inferring 5.1x faster.
|
|
128
|
+
|
|
129
|
+
The results are summarized in the following table for classification tasks:
|
|
130
|
+
|
|
131
|
+
| OpenML Task | Perpetual Training Duration | Perpetual Inference Duration | Perpetual AUC | AutoGluon Training Duration | AutoGluon Inference Duration | AutoGluon AUC |
|
|
132
|
+
| :--------------------------------------------------------------------------------- | :-------------------------- | :--------------------------- | :----------------- | :-------------------------- | :--------------------------- | :------------ |
|
|
133
|
+
| <a href="https://www.openml.org/t/146163" target="_blank">BNG(spambase)</a> | 70.1 | 2.1 | <ins> 0.671 </ins> | 73.1 | 3.7 | 0.669 |
|
|
134
|
+
| <a href="https://www.openml.org/t/208" target="_blank">BNG(trains)</a> | 89.5 | 1.7 | <ins> 0.996 </ins> | 106.4 | 2.4 | 0.994 |
|
|
135
|
+
| <a href="https://www.openml.org/t/361942" target="_blank">breast</a> | 13699.3 | 97.7 | <ins> 0.991 </ins> | 13330.7 | 79.7 | 0.949 |
|
|
136
|
+
| <a href="https://www.openml.org/t/7291" target="_blank">Click_prediction_small</a> | 89.1 | 1.0 | <ins> 0.749 </ins> | 101.0 | 2.8 | 0.703 |
|
|
137
|
+
| <a href="https://www.openml.org/t/361938" target="_blank">colon</a> | 12435.2 | 126.7 | <ins> 0.997 </ins> | 12356.2 | 152.3 | 0.997 |
|
|
138
|
+
| <a href="https://www.openml.org/t/362113" target="_blank">Higgs</a> | 3485.3 | 40.9 | <ins> 0.843 </ins> | 3501.4 | 67.9 | 0.816 |
|
|
139
|
+
| <a href="https://www.openml.org/t/230" target="_blank">SEA(50000)</a> | 21.9 | 0.2 | <ins> 0.936 </ins> | 25.6 | 0.5 | 0.935 |
|
|
140
|
+
| <a href="https://www.openml.org/t/359994" target="_blank">sf-police-incidents</a> | 85.8 | 1.5 | <ins> 0.687 </ins> | 99.4 | 2.8 | 0.659 |
|
|
141
|
+
| <a href="https://www.openml.org/t/361941" target="_blank">bates_classif_100</a> | 11152.8 | 50.0 | <ins> 0.864 </ins> | OOM | OOM | OOM |
|
|
142
|
+
| <a href="https://www.openml.org/t/361945" target="_blank">prostate</a> | 13699.9 | 79.8 | <ins> 0.987 </ins> | OOM | OOM | OOM |
|
|
143
|
+
| average | 3747.0 | 34.0 | - | 3699.2 | 39.0 | - |
|
|
144
|
+
|
|
145
|
+
PerpetualBooster outperformed AutoGluon on 10 out of 10 classification tasks, training equally fast and inferring 1.1x faster.
|
|
146
|
+
|
|
147
|
+
PerpetualBooster demonstrates greater robustness compared to AutoGluon, successfully training on all 20 tasks, whereas AutoGluon encountered out-of-memory errors on 3 of those tasks.
|
|
148
|
+
|
|
149
|
+
The results can be reproduced using the <a href="https://github.com/deadsoul44/automlbenchmark" target="_blank">automlbenchmark fork</a>.
|
|
150
|
+
|
|
151
|
+
## Contribution
|
|
152
|
+
|
|
153
|
+
Contributions are welcome. Check <a href="./CONTRIBUTING.md" target="_blank">CONTRIBUTING.md</a> for the guideline.
|
|
154
|
+
|
|
155
|
+
## Paper
|
|
156
|
+
|
|
157
|
+
PerpetualBooster prevents overfitting with a generalization algorithm. The paper is work-in-progress to explain how the algorithm works. Check our <a href="https://perpetual-ml.com/blog/how-perpetual-works" target="_blank">blog post</a> for a high level introduction to the algorithm.
|
|
158
|
+
|
|
159
|
+
## Perpetual ML Suite
|
|
160
|
+
|
|
161
|
+
The **Perpetual ML Suite** is a comprehensive, batteries-included ML platform designed to deliver maximum predictive power with minimal effort. It allows you to track experiments, monitor metrics, and manage model drift through an intuitive interface.
|
|
162
|
+
|
|
163
|
+
For a fully managed, **serverless ML experience**, visit <a href="https://app.perpetual-ml.com" target="_blank">app.perpetual-ml.com</a>.
|
|
164
|
+
|
|
165
|
+
- **Serverless Marimo Notebooks**: Run interactive, reactive notebooks without managing any infrastructure.
|
|
166
|
+
- **Serverless ML Endpoints**: One-click deployment of models as production-ready endpoints for real-time inference.
|
|
167
|
+
|
|
168
|
+
Perpetual is also designed to live where your data lives. It is available as a native application on the <a href="https://app.snowflake.com/marketplace/listing/GZSYZX0EMJ/perpetual-ml-perpetual-ml-suite" target="_blank">Snowflake Marketplace</a>, with support for Databricks and other major data warehouses coming soon.
|
|
169
|
+
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
perpetual/__init__.py,sha256=hKi__gjLuv4MIKSdmiuLg-Y3Aj3Wj8O3zOQ0u9dw5Cc,115
|
|
2
|
+
perpetual/booster.py,sha256=lhXoBq7tMwd9Upa7WMdnjr0FRRcMaqPWxLbmcJLfK6U,73005
|
|
3
|
+
perpetual/data.py,sha256=e2xF5xVq3KYotj5fpIhSfnF3B4qLQpdHYDSaP5NpcxA,768
|
|
4
|
+
perpetual/perpetual.cpython-311-darwin.so,sha256=bpV-Pce18WVZmNYUDHqTNsTCa67D9X8teW1TygAeblU,1476216
|
|
5
|
+
perpetual/serialize.py,sha256=Tg2BbuA1jKQ5-ITuVhwtj6hgBaRAbZ66eHctR7fcVk4,1883
|
|
6
|
+
perpetual/sklearn.py,sha256=6Kl3dlYBQK0yaFF7I7qzfyOP_dtc_c3q05OjKxCApmk,7011
|
|
7
|
+
perpetual/types.py,sha256=T0KJu8bK8xiYHaPt8b6RmUR1xP3f5N1FV7qaZTy1rtM,3232
|
|
8
|
+
perpetual/utils.py,sha256=qxWSlS1yZNtSRECgTOeDILqeA1wJHV5SYPRUgtL9Goc,16894
|
|
9
|
+
perpetual-1.0.40.dist-info/METADATA,sha256=kXiPgDS3hiLpAv86EH-ytYLeMi9pca_ltLR1GcPo4jE,16083
|
|
10
|
+
perpetual-1.0.40.dist-info/WHEEL,sha256=uo497LsoCAD-gruBuBwmuPSjitTK7HU_NyyROWcAuhA,107
|
|
11
|
+
perpetual-1.0.40.dist-info/licenses/LICENSE,sha256=ixuiBLtpoK3iv89l7ylKkg9rs2GzF9ukPH7ynZYzK5s,35148
|
|
12
|
+
perpetual-1.0.40.dist-info/RECORD,,
|