perpetual-1.0.7-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
perpetual/utils.py ADDED
@@ -0,0 +1,462 @@
+ import logging
+ from typing import Dict, Iterable, List, Optional, Tuple
+
+ import numpy as np
+
+ logger = logging.getLogger(__name__)
+
+
+ def type_df(df):
+     library_name = type(df).__module__.split(".")[0]
+     if type(df).__name__ == "DataFrame":
+         if library_name == "pandas":
+             return "pandas_df"
+         elif library_name == "polars":
+             return "polars_df"
+     elif library_name == "numpy":
+         return "numpy"
+     else:
+         return ""
+
+
+ def type_series(y):
+     library_name = type(y).__module__.split(".")[0]
+     if type(y).__name__ == "Series":
+         if library_name == "pandas":
+             return "pandas_series"
+         elif library_name == "polars":
+             return "polars_series"
+     elif library_name == "numpy":
+         return "numpy"
+     else:
+         return ""
+
+
+ def convert_input_array(x, objective, is_target=False, is_int=False) -> Tuple[np.ndarray, Iterable]:
+     classes_ = []
+
+     if type(x).__module__.split(".")[0] == "numpy":
+         if len(x.shape) == 2:
+             classes_, x_, *_ = convert_input_frame(x, None, 1000)
+         else:
+             x_ = x
+     elif type_series(x) == "pandas_series":
+         x_ = x.to_numpy()
+     elif type_series(x) == "polars_series":
+         x_ = x.to_numpy(allow_copy=False)
+     elif type_df(x) == "polars_df" or type_df(x) == "pandas_df":
+         classes_, x_, *_ = convert_input_frame(x, None, 1000)
+     else:
+         x_ = x.to_numpy()
+
+     if is_target and objective == "LogLoss" and len(x_.shape) == 1:
+         classes_ = np.unique(x_)
+         x_index = np.array([np.where(classes_ == i) for i in x_])
+         if len(classes_) > 2:
+             x_ = np.squeeze(np.eye(len(classes_))[x_index])
+
+     if is_int and not np.issubdtype(x_.dtype, "uint64"):
+         x_ = x_.astype(dtype="uint64", copy=False)
+
+     if not is_int and not np.issubdtype(x_.dtype, "float64"):
+         x_ = x_.astype(dtype="float64", copy=False)
+
+     if len(x_.shape) == 2:
+         x_ = x_.ravel(order="F")
+
+     return x_, classes_
+
+
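For illustration, a minimal sketch (not part of the wheel, assuming only numpy) of what the LogLoss branch above does for a multiclass 1-D target: the class labels are looked up against the sorted unique classes, and an identity-matrix row lookup produces the one-hot encoding.

    import numpy as np

    y = np.array(["a", "b", "c", "a"])
    classes_ = np.unique(y)                                   # ['a', 'b', 'c']
    x_index = np.array([np.where(classes_ == v) for v in y])  # shape (4, 1, 1)
    onehot = np.squeeze(np.eye(len(classes_))[x_index])       # shape (4, 3)
    # onehot[0] == [1., 0., 0.]; binary targets (<= 2 classes) stay 1-D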
+ def convert_input_frame(
+     X,
+     categorical_features,
+     max_cat,
+ ) -> Tuple[List[str], np.ndarray, int, int, Optional[Iterable[int]], Optional[Dict]]:
+     """Convert data to the format needed by the booster.
+
+     Returns:
+         Tuple[List[str], np.ndarray, int, int, Optional[Iterable[int]], Optional[Dict]]: Returns the column names, the flat data, the number of rows, the number of columns, the categorical feature indices (cat_index), and the category mapping (cat_mapping).
+     """
+     categorical_features_ = None
+     if type_df(X) == "pandas_df":
+         X_ = X.to_numpy()
+         features_ = X.columns.to_list()
+         if categorical_features == "auto":
+             categorical_columns = X.select_dtypes(include=["category"]).columns.tolist()
+             categorical_features_ = [
+                 features_.index(c) for c in categorical_columns
+             ] or None
+     elif type_df(X) == "numpy":
+         X_ = X
+         features_ = list(map(str, range(X_.shape[1])))
+     else:
+         raise ValueError(f"Object type {type(X)} is not supported.")
+
+     if (
+         categorical_features
+         and all(isinstance(s, int) for s in categorical_features)
+         and isinstance(categorical_features, list)
+     ):
+         categorical_features_ = categorical_features
+     elif (
+         categorical_features
+         and all(isinstance(s, str) for s in categorical_features)
+         and isinstance(categorical_features, list)
+     ):
+         categorical_features_ = [features_.index(c) for c in categorical_features]
+
+     cat_mapping = {}  # key: feature_name, value: ordered category names
+     cat_to_num = []
+     if categorical_features_:
+         for i in categorical_features_:
+             categories, inversed = np.unique(X_[:, i].astype(str), return_inverse=True)
+
+             categories = list(categories)
+             if "nan" in categories:
+                 categories.remove("nan")
+             categories.insert(0, "nan")
+
+             inversed = inversed + 1.0
+
+             if len(categories) > max_cat:
+                 cat_to_num.append(i)
+                 logger.warning(
+                     f"Feature {features_[i]} will be treated as numerical since the number of categories ({len(categories)}) exceeds the max_cat ({max_cat}) threshold."
+                 )
+
+             feature_name = features_[i]
+             cat_mapping[feature_name] = categories
+             ind_nan = len(categories)
+             inversed[inversed == ind_nan] = np.nan
+             X_[:, i] = inversed
+
+         categorical_features_ = [
+             x for x in categorical_features_ if x not in cat_to_num
+         ]
+
+     logger.info(f"Categorical features: {categorical_features_}")
+     logger.info(f"Mapping of categories: {cat_mapping}")
+
+     if not np.issubdtype(X_.dtype, "float64"):
+         X_ = X_.astype(dtype="float64", copy=False)
+     flat_data = X_.ravel(order="F")
+     rows, cols = X_.shape
+
+     if isinstance(categorical_features_, list):
+         categorical_features_ = set(categorical_features_)
+
+     return features_, flat_data, rows, cols, categorical_features_, cat_mapping
+
+
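A hedged usage sketch of the function above (assumes pandas is installed; the category names are chosen so that "nan" sorts last, which the np.unique-based encoding relies on):

    import pandas as pd

    X = pd.DataFrame({"color": pd.Categorical(["blue", "green", None, "blue"])})
    features, flat, rows, cols, cat_idx, mapping = convert_input_frame(X, "auto", 100)
    # mapping == {"color": ["nan", "blue", "green"]}
    # encoded column: blue -> 1.0, green -> 2.0, missing -> NaN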
+ def convert_input_frame_columnar(
+     X, categorical_features, max_cat
+ ) -> Tuple[
+     List[str],
+     List[np.ndarray],
+     List[Optional[np.ndarray]],
+     int,
+     int,
+     Optional[set],
+     dict,
+ ]:
+     """Convert a Polars DataFrame to columnar format for zero-copy transfer.
+
+     Returns a list of column arrays and a list of validity masks.
+     """
+     import polars.selectors as cs
+
+     features_ = list(X.columns)
+     rows, cols = X.shape
+
+     # Determine categorical features
+     categorical_features_ = None
+     if categorical_features == "auto":
+         categorical_columns = X.select(cs.categorical()).columns
+         categorical_features_ = [
+             features_.index(c) for c in categorical_columns
+         ] or None
+     elif (
+         categorical_features
+         and all(isinstance(s, int) for s in categorical_features)
+         and isinstance(categorical_features, list)
+     ):
+         categorical_features_ = categorical_features
+     elif (
+         categorical_features
+         and all(isinstance(s, str) for s in categorical_features)
+         and isinstance(categorical_features, list)
+     ):
+         categorical_features_ = [features_.index(c) for c in categorical_features]
+
+     cat_mapping = {}
+     cat_to_num = []
+     categorical_set = set(categorical_features_) if categorical_features_ else set()
+
+     # Convert each column to a numpy array
+     columns = []
+     masks = []
+     import pyarrow as pa
+
+     for i, col_name in enumerate(features_):
+         if i in categorical_set:
+             # Encode categorical columns through Arrow to get codes and
+             # categories without forcing a numpy object conversion.
+             arr = X[col_name].to_arrow()
+             if isinstance(arr, pa.ChunkedArray):
+                 arr = arr.combine_chunks()
+
+             if not isinstance(arr, pa.DictionaryArray):
+                 arr = arr.dictionary_encode()
+
+             # Extract the categories (the Arrow dictionary, usually a StringArray)
+             cats = arr.dictionary.to_pylist()
+
+             # Extract the codes (indices) and cast to float64 for Perpetual
+             indices = arr.indices.to_numpy(zero_copy_only=False)
+             out_values = indices.astype(np.float64)
+             out_values += 1.0  # Shift: code 0 is reserved for "nan" in Perpetual
+
+             # Nulls (masked values in Arrow) become NaN: expand the validity
+             # bitmap into a boolean mask with numpy bit unpacking.
+             if arr.null_count > 0:
+                 row_count = len(out_values)
+                 if arr.buffers()[0]:
+                     valid_bits = np.frombuffer(arr.buffers()[0], dtype=np.uint8)
+                     valid_mask = np.unpackbits(valid_bits, bitorder="little")[
+                         :row_count
+                     ].astype(bool)
+                     # The mask is 1 where valid, 0 where null; nulls become NaN.
+                     out_values[~valid_mask] = np.nan
+
+             # A literal "nan" string among the categories is treated as missing:
+             # codes pointing at it (nan_idx + 1 after the shift) become NaN, and
+             # "nan" is moved to index 0 of the mapping.
+             if "nan" in cats:
+                 nan_idx = cats.index("nan")
+                 out_values[out_values == (nan_idx + 1.0)] = np.nan
+                 cats.remove("nan")
+
+             cats.insert(0, "nan")
+
+             if len(cats) > max_cat:
+                 cat_to_num.append(i)
+                 logger.warning(
+                     f"Feature {col_name} will be treated as numerical since the number of categories ({len(cats)}) exceeds the max_cat ({max_cat}) threshold."
+                 )
+
+             cat_mapping[col_name] = cats
+             columns.append(out_values)
+             masks.append(None)  # Categorical encoding already handles NaNs
+         else:
+             # Non-categorical columns are passed through Arrow zero-copy.
+             series = X[col_name]
+             arr = series.to_arrow()
+             if isinstance(arr, pa.ChunkedArray):
+                 if arr.num_chunks > 1:
+                     arr = arr.combine_chunks()
+                 else:
+                     arr = arr.chunk(0)
+
+             # buffers[0] is the validity bitmap, buffers[1] holds the values
+             buffers = arr.buffers()
+             if buffers[0] is None:
+                 masks.append(None)
+             else:
+                 masks.append(np.frombuffer(buffers[0], dtype=np.uint8))
+
+             col_array = np.frombuffer(buffers[1], dtype=np.float64)
+             columns.append(col_array)
+
+     if categorical_features_:
+         categorical_features_ = [
+             x for x in categorical_features_ if x not in cat_to_num
+         ]
+     logger.info(f"Categorical features: {categorical_features_}")
+     logger.info(f"Mapping of categories: {cat_mapping}")
+
+     if isinstance(categorical_features_, list):
+         categorical_features_ = set(categorical_features_)
+
+     return features_, columns, masks, rows, cols, categorical_features_, cat_mapping
+
+
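A minimal sketch (not from the package, assuming pyarrow is installed) of the validity-bitmap handling used in both branches above: Arrow stores nulls as a little-endian bitmap, which np.unpackbits expands into a boolean mask.

    import numpy as np
    import pyarrow as pa

    arr = pa.array([1.0, None, 3.0])
    valid_bits = np.frombuffer(arr.buffers()[0], dtype=np.uint8)
    valid_mask = np.unpackbits(valid_bits, bitorder="little")[: len(arr)].astype(bool)
    # valid_mask == [True, False, True]; null slots can then be set to NaN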
+ def transform_input_frame(X, cat_mapping) -> Tuple[List[str], np.ndarray, int, int]:
+     """Convert data to the format needed by the booster.
+
+     Returns:
+         Tuple[List[str], np.ndarray, int, int]: Returns the column names, the flat data, the number of rows, and the number of columns.
+     """
+     if type_df(X) == "pandas_df":
+         X_ = X.to_numpy()
+         features_ = X.columns.to_list()
+     elif type_df(X) == "numpy":
+         X_ = X
+         features_ = list(map(str, range(X_.shape[1])))
+     else:
+         raise ValueError(f"Object type {type(X)} is not supported.")
+
+     if cat_mapping:
+         for feature_name, categories in cat_mapping.items():
+             feature_index = features_.index(feature_name)
+             cats = categories.copy()
+             cats.remove("nan")
+             x_enc = np.searchsorted(cats, X_[:, feature_index].astype(str))
+             x_enc = x_enc + 1.0
+             ind_nan = len(categories)
+             x_enc[x_enc == ind_nan] = np.nan
+             X_[:, feature_index] = x_enc
+
+     if not np.issubdtype(X_.dtype, "float64"):
+         X_ = X_.astype(dtype="float64", copy=False)
+     flat_data = X_.ravel(order="F")
+     rows, cols = X_.shape
+
+     return features_, flat_data, rows, cols
+
+
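A standalone worked example of the searchsorted re-encoding above (numpy only, names local to the example):

    import numpy as np

    categories = ["nan", "blue", "green"]   # as stored in cat_mapping at fit time
    cats = categories.copy()
    cats.remove("nan")                      # sorted categories without the "nan" slot
    x_enc = np.searchsorted(cats, np.array(["green", "blue", "nan"])) + 1.0
    x_enc[x_enc == len(categories)] = np.nan
    # x_enc == [2.0, 1.0, nan]: codes match categories[1:], and any string
    # sorting past the last known category is mapped to missing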
+ def transform_input_frame_columnar(
+     X, cat_mapping
+ ) -> Tuple[List[str], List[np.ndarray], List[Optional[np.ndarray]], int, int]:
+     """Convert a Polars DataFrame to columnar format for zero-copy prediction.
+
+     Returns a list of column arrays and masks instead of flattened data, avoiding copies.
+     """
+     features_ = list(X.columns)
+     rows, cols = X.shape
+
+     columns = []
+     masks = []
+     import pyarrow as pa
+
+     for i, col_name in enumerate(features_):
+         if cat_mapping and col_name in cat_mapping:
+             # Re-encode categorical columns with the cat_mapping learned at fit time.
+             categories = cat_mapping[col_name]
+
+             # Use Arrow for zero-copy extraction
+             arr = X[col_name].to_arrow()
+             if isinstance(arr, pa.ChunkedArray):
+                 arr = arr.combine_chunks()
+             if not isinstance(arr, pa.DictionaryArray):
+                 arr = arr.dictionary_encode()
+
+             # Categories of the incoming data
+             new_cats = arr.dictionary.to_pylist()
+
+             # Extract the codes (indices). Integer codes are needed to index the
+             # lookup table, but to_numpy() can return floats when the indices
+             # contain nulls, so nulls are filled with 0 here and masked out below.
+             filled_indices_arr = arr.indices.fill_null(0)
+             new_indices = filled_indices_arr.to_numpy()
+
+             # Build a lookup table from incoming codes to stored float codes:
+             # lookup[new_code] = old_float_code. The stored `categories` list has
+             # "nan" at index 0, so the first real category encodes to 1.0, matching
+             # the fit-time encoding. Codes for "nan" or for categories unseen at
+             # fit time map to NaN.
+             lookup = np.full(len(new_cats), np.nan, dtype=np.float64)
+             old_cat_map = {c: idx for idx, c in enumerate(categories)}
+             for j, cat in enumerate(new_cats):
+                 if cat in old_cat_map and cat != "nan":
+                     lookup[j] = float(old_cat_map[cat])
+
+             # Apply the lookup; Arrow guarantees the codes index into new_cats.
+             x_enc = lookup[new_indices]
+
+             # Array-level nulls (masked values in Arrow) also become NaN.
+             if arr.null_count > 0:
+                 if arr.buffers()[0]:
+                     valid_bits = np.frombuffer(arr.buffers()[0], dtype=np.uint8)
+                     valid_mask = np.unpackbits(valid_bits, bitorder="little")[
+                         : len(x_enc)
+                     ].astype(bool)
+                     x_enc[~valid_mask] = np.nan
+
+             columns.append(x_enc)
+             masks.append(None)
+         else:
+             series = X[col_name]
+             arr = series.to_arrow()
+             if isinstance(arr, pa.ChunkedArray):
+                 if arr.num_chunks > 1:
+                     arr = arr.combine_chunks()  # Fallback for multi-chunk columns
+                 else:
+                     arr = arr.chunk(0)
+             # buffers[0] is the validity bitmap, buffers[1] holds the values
+             buffers = arr.buffers()
+             if buffers[0] is None:
+                 masks.append(None)
+             else:
+                 masks.append(np.frombuffer(buffers[0], dtype=np.uint8))
+             columns.append(np.frombuffer(buffers[1], dtype=np.float64))
+
+     return features_, columns, masks, rows, cols
+
+
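A standalone sketch of the lookup-table remapping above (numpy only, names local to the example):

    import numpy as np

    categories = ["nan", "blue", "green"]        # stored mapping, "nan" at index 0
    new_cats = ["green", "red", "blue"]          # dictionary of the incoming column
    old_cat_map = {c: idx for idx, c in enumerate(categories)}
    lookup = np.full(len(new_cats), np.nan, dtype=np.float64)
    for j, cat in enumerate(new_cats):
        if cat in old_cat_map and cat != "nan":
            lookup[j] = float(old_cat_map[cat])  # known category -> fit-time code
    x_enc = lookup[np.array([0, 1, 2, 0])]       # codes into new_cats
    # x_enc == [2.0, nan, 1.0, 2.0]; "red" was unseen at fit time, so it becomes NaN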
+ CONTRIBUTION_METHODS = {
+     "weight": "Weight",
+     "Weight": "Weight",
+     "average": "Average",
+     "Average": "Average",
+     "branch-difference": "BranchDifference",
+     "branchdifference": "BranchDifference",
+     "BranchDifference": "BranchDifference",
+     "midpoint-difference": "MidpointDifference",
+     "midpointdifference": "MidpointDifference",
+     "MidpointDifference": "MidpointDifference",
+     "mode-difference": "ModeDifference",
+     "modedifference": "ModeDifference",
+     "ModeDifference": "ModeDifference",
+     "ProbabilityChange": "ProbabilityChange",
+     "probabilitychange": "ProbabilityChange",
+     "probability-change": "ProbabilityChange",
+ }
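The table normalizes user-facing aliases to the canonical contribution-method names; one plausible lookup pattern (illustrative only, not taken from booster.py):

    method = CONTRIBUTION_METHODS.get("midpoint-difference")  # -> "MidpointDifference"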
perpetual-1.0.7.dist-info/METADATA ADDED
@@ -0,0 +1,33 @@
+ Metadata-Version: 2.4
+ Name: perpetual
+ Version: 1.0.7
+ Classifier: Programming Language :: Python :: 3
+ Classifier: Programming Language :: Python :: 3.10
+ Classifier: Programming Language :: Python :: 3.11
+ Classifier: Programming Language :: Python :: 3.12
+ Classifier: Programming Language :: Python :: 3.13
+ Classifier: Programming Language :: Python :: 3.14
+ Requires-Dist: numpy
+ Requires-Dist: typing-extensions
+ Requires-Dist: pandas ; extra == 'dev'
+ Requires-Dist: polars ; extra == 'dev'
+ Requires-Dist: pyarrow ; extra == 'dev'
+ Requires-Dist: maturin ; extra == 'dev'
+ Requires-Dist: pytest ; extra == 'dev'
+ Requires-Dist: seaborn ; extra == 'dev'
+ Requires-Dist: scikit-learn ; extra == 'dev'
+ Requires-Dist: mkdocs-material ; extra == 'dev'
+ Requires-Dist: mkdocstrings[python] ; extra == 'dev'
+ Requires-Dist: mkdocs-autorefs ; extra == 'dev'
+ Requires-Dist: ruff ; extra == 'dev'
+ Requires-Dist: xgboost ; extra == 'dev'
+ Requires-Dist: onnxmltools ; extra == 'dev'
+ Requires-Dist: onnx ; extra == 'dev'
+ Requires-Dist: onnxruntime ; python_full_version < '3.14' and extra == 'dev'
+ Provides-Extra: dev
+ License-File: LICENSE
+ Summary: A self-generalizing gradient boosting machine that doesn't need hyperparameter optimization
+ Keywords: rust,perpetual,machine learning,tree model,decision tree,gradient boosted decision tree,gradient boosting machine
+ Home-Page: https://perpetual-ml.com
+ Author-email: Mutlu Simsek <mutlusims3k@gmail.com>, Serkan Korkmaz <serkor1@duck.com>, Pieter Pel <pelpieter@gmail.com>
+ Requires-Python: >=3.10
perpetual-1.0.7.dist-info/RECORD ADDED
@@ -0,0 +1,12 @@
+ perpetual/__init__.py,sha256=hKi__gjLuv4MIKSdmiuLg-Y3Aj3Wj8O3zOQ0u9dw5Cc,115
+ perpetual/booster.py,sha256=lhXoBq7tMwd9Upa7WMdnjr0FRRcMaqPWxLbmcJLfK6U,73005
+ perpetual/data.py,sha256=e2xF5xVq3KYotj5fpIhSfnF3B4qLQpdHYDSaP5NpcxA,768
+ perpetual/perpetual.cpython-311-x86_64-linux-gnu.so,sha256=BkQSGRGkePryfzsad_kA9bvNRP2y4TMrHx8KHYOdmz4,1720240
+ perpetual/serialize.py,sha256=Tg2BbuA1jKQ5-ITuVhwtj6hgBaRAbZ66eHctR7fcVk4,1883
+ perpetual/sklearn.py,sha256=6Kl3dlYBQK0yaFF7I7qzfyOP_dtc_c3q05OjKxCApmk,7011
+ perpetual/types.py,sha256=T0KJu8bK8xiYHaPt8b6RmUR1xP3f5N1FV7qaZTy1rtM,3232
+ perpetual/utils.py,sha256=qxWSlS1yZNtSRECgTOeDILqeA1wJHV5SYPRUgtL9Goc,16894
+ perpetual-1.0.7.dist-info/METADATA,sha256=a341Z0Fa89UgDDM33pSMHPv3ti6Vn5CEYFGc33z65ZU,1523
+ perpetual-1.0.7.dist-info/WHEEL,sha256=KmtbzEMhBG7ILlpCgdxkDv7AlFCmdxefhRI54YAwnLk,147
+ perpetual-1.0.7.dist-info/licenses/LICENSE,sha256=ixuiBLtpoK3iv89l7ylKkg9rs2GzF9ukPH7ynZYzK5s,35148
+ perpetual-1.0.7.dist-info/RECORD,,
perpetual-1.0.7.dist-info/WHEEL ADDED
@@ -0,0 +1,5 @@
+ Wheel-Version: 1.0
+ Generator: maturin (1.11.5)
+ Root-Is-Purelib: false
+ Tag: cp311-cp311-manylinux_2_17_x86_64
+ Tag: cp311-cp311-manylinux2014_x86_64