perpetual 0.9.1__cp311-cp311-macosx_11_0_arm64.whl → 1.0.38__cp311-cp311-macosx_11_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
perpetual/booster.py CHANGED
@@ -1,21 +1,28 @@
1
- import json
2
1
  import inspect
2
+ import json
3
3
  import warnings
4
- from typing_extensions import Self
4
+ from types import FunctionType
5
5
  from typing import Any, Dict, Iterable, List, Optional, Set, Tuple, Union, cast
6
6
 
7
7
  import numpy as np
8
+ from typing_extensions import Self
8
9
 
9
- from perpetual.perpetual import PerpetualBooster as CratePerpetualBooster # type: ignore
10
- from perpetual.perpetual import MultiOutputBooster as CrateMultiOutputBooster # type: ignore
10
+ from perpetual.data import Node
11
+ from perpetual.perpetual import (
12
+ MultiOutputBooster as CrateMultiOutputBooster, # type: ignore
13
+ )
14
+ from perpetual.perpetual import (
15
+ PerpetualBooster as CratePerpetualBooster, # type: ignore
16
+ )
11
17
  from perpetual.serialize import BaseSerializer, ObjectSerializer
12
18
  from perpetual.types import BoosterType, MultiOutputBoosterType
13
- from perpetual.data import Node
14
19
  from perpetual.utils import (
15
20
  CONTRIBUTION_METHODS,
16
21
  convert_input_array,
17
22
  convert_input_frame,
23
+ convert_input_frame_columnar,
18
24
  transform_input_frame,
25
+ transform_input_frame_columnar,
19
26
  type_df,
20
27
  )
21
28
 
@@ -37,7 +44,9 @@ class PerpetualBooster:
37
44
  def __init__(
38
45
  self,
39
46
  *,
40
- objective: str = "LogLoss",
47
+ objective: Union[
48
+ str, Tuple[FunctionType, FunctionType, FunctionType]
49
+ ] = "LogLoss",
41
50
  budget: float = 0.5,
42
51
  num_threads: Optional[int] = None,
43
52
  monotone_constraints: Union[Dict[Any, int], None] = None,
@@ -59,101 +68,97 @@ class PerpetualBooster:
59
68
  max_bin: int = 256,
60
69
  max_cat: int = 1000,
61
70
  ):
62
- """PerpetualBooster class, used to create gradient boosted decision tree ensembles.
63
-
64
- Args:
65
- objective (str, optional): Learning objective function to be used for optimization. Valid options are:
66
- "LogLoss" to use logistic loss (classification),
67
- "SquaredLoss" to use squared error (regression),
68
- "QuantileLoss" to use quantile error (regression).
69
- Defaults to "LogLoss".
70
- budget (float, optional): a positive number for fitting budget. Increasing this number will more
71
- likely result in more boosting rounds and more increased predictive power.
72
- Default value is 0.5.
73
- num_threads (int, optional): Number of threads to be used during training.
74
- monotone_constraints (Dict[Any, int], optional): Constraints that are used to enforce a
75
- specific relationship between the training features and the target variable. A dictionary
76
- should be provided where the keys are the feature index value if the model will be fit on
77
- a numpy array, or a feature name if it will be fit on a Dataframe. The values of
78
- the dictionary should be an integer value of -1, 1, or 0 to specify the relationship
79
- that should be estimated between the respective feature and the target variable.
80
- Use a value of -1 to enforce a negative relationship, 1 a positive relationship,
81
- and 0 will enforce no specific relationship at all. Features not included in the
82
- mapping will not have any constraint applied. If `None` is passed, no constraints
83
- will be enforced on any variable. Defaults to `None`.
84
- force_children_to_bound_parent (bool, optional): Setting this parameter to `True` will restrict children nodes, so that they always contain the parent node inside of their range. Without setting this it's possible that both, the left and the right nodes could be greater, than or less than, the parent node. Defaults to `False`.
85
- missing (float, optional): Value to consider missing, when training and predicting
86
- with the booster. Defaults to `np.nan`.
87
- allow_missing_splits (bool, optional): Allow for splits to be made such that all missing values go
88
- down one branch, and all non-missing values go down the other, if this results
89
- in the greatest reduction of loss. If this is false, splits will only be made on non
90
- missing values. If `create_missing_branch` is set to `True` having this parameter be
91
- set to `True` will result in the missing branch further split, if this parameter
92
- is `False` then in that case the missing branch will always be a terminal node.
93
- Defaults to `True`.
94
- create_missing_branch (bool, optional): An experimental parameter, that if `True`, will
95
- create a separate branch for missing, creating a ternary tree, the missing node will be given the same
96
- weight value as the parent node. If this parameter is `False`, missing will be sent
97
- down either the left or right branch, creating a binary tree. Defaults to `False`.
98
- terminate_missing_features (Set[Any], optional): An optional iterable of features
99
- (either strings, or integer values specifying the feature indices if numpy arrays are used for fitting),
100
- for which the missing node will always be terminated, even if `allow_missing_splits` is set to true.
101
- This value is only valid if `create_missing_branch` is also True.
102
- missing_node_treatment (str, optional): Method for selecting the `weight` for the missing node, if `create_missing_branch` is set to `True`. Defaults to "None". Valid options are:
103
- - "None": Calculate missing node weight values without any constraints.
104
- - "AssignToParent": Assign the weight of the missing node to that of the parent.
105
- - "AverageLeafWeight": After training each tree, starting from the bottom of the tree, assign the missing node weight to the weighted average of the left and right child nodes. Next assign the parent to the weighted average of the children nodes. This is performed recursively up through the entire tree. This is performed as a post processing step on each tree after it is built, and prior to updating the predictions for which to train the next tree.
106
- - "AverageNodeWeight": Set the missing node to be equal to the weighted average weight of the left and the right nodes.
107
- log_iterations (int, optional): Setting to a value (N) other than zero will result in information being logged about ever N iterations, info can be interacted with directly with the python [`logging`](https://docs.python.org/3/howto/logging.html) module. For an example of how to utilize the logging information see the example [here](/#logging-output).
108
- feature_importance_method (str, optional): The feature importance method type that will be used to calculate the `feature_importances_` attribute on the booster.
109
- quantile (float, optional): only used in quantile regression.
110
- reset (bool, optional): whether to reset the model or continue training.
111
- categorical_features (Union[Iterable[int], Iterable[str], str, None], optional): The names or indices for categorical features.
112
- Defaults to `auto` for Polars or Pandas categorical data types.
113
- timeout (float, optional): optional fit timeout in seconds
114
- iteration_limit (int, optional): optional limit for the number of boosting rounds. The default value is 1000 boosting rounds.
115
- The algorithm automatically stops for most of the cases before hitting this limit.
116
- If you want to experiment with very high budget (>2.0), you can also increase this limit.
117
- memory_limit (float, optional): optional limit for memory allocation in GB. If not set, the memory will be allocated based on
118
- available memory and the algorithm requirements.
119
- stopping_rounds (int, optional): optional limit for auto stopping.
120
- max_bin (int, optional): maximum number of bins for feature discretization. Defaults to 256.
121
- max_cat (int, optional): Maximum number of unique categories for a categorical feature.
122
- Features with more categories will be treated as numerical.
123
- Defaults to 1000.
124
-
125
- Raises:
126
- TypeError: Raised if an invalid dtype is passed.
127
-
128
- Example:
129
- Once, the booster has been initialized, it can be fit on a provided dataset, and performance field. After fitting, the model can be used to predict on a dataset.
130
- In the case of this example, the predictions are the log odds of a given record being 1.
131
-
132
- ```python
133
- # Small example dataset
134
- from seaborn import load_dataset
135
-
136
- df = load_dataset("titanic")
137
- X = df.select_dtypes("number").drop(columns=["survived"])
138
- y = df["survived"]
139
-
140
- # Initialize a booster with defaults.
141
- from perpetual import PerpetualBooster
142
- model = PerpetualBooster(objective="LogLoss")
143
- model.fit(X, y)
144
-
145
- # Predict on data
146
- model.predict(X.head())
147
- # array([-1.94919663, 2.25863229, 0.32963671, 2.48732194, -3.00371813])
148
-
149
- # predict contributions
150
- model.predict_contributions(X.head())
151
- # array([[-0.63014213, 0.33880048, -0.16520798, -0.07798772, -0.85083578,
152
- # -1.07720813],
153
- # [ 1.05406709, 0.08825999, 0.21662544, -0.12083538, 0.35209258,
154
- # -1.07720813],
155
- ```
156
-
71
+ """
72
+ Gradient Boosting Machine with Perpetual Learning.
73
+
74
+ A self-generalizing gradient boosting machine that doesn't need hyperparameter optimization.
75
+ It automatically finds the best configuration based on the provided budget.
76
+
77
+ Parameters
78
+ ----------
79
+ objective : str or tuple, default="LogLoss"
80
+ Learning objective function to be used for optimization. Valid options are:
81
+
82
+ - "LogLoss": logistic loss for binary classification.
83
+ - "SquaredLoss": squared error for regression.
84
+ - "QuantileLoss": quantile error for quantile regression.
85
+ - "HuberLoss": Huber loss for robust regression.
86
+ - "AdaptiveHuberLoss": adaptive Huber loss for robust regression.
87
+ - "ListNetLoss": ListNet loss for ranking.
88
+ - custom objective: a tuple of (grad, hess, init) functions.
89
+
90
+ budget : float, default=0.5
91
+ A positive number for fitting budget. Increasing this number will more likely result
92
+ in more boosting rounds and increased predictive power.
93
+ num_threads : int, optional
94
+ Number of threads to be used during training and prediction.
95
+ monotone_constraints : dict, optional
96
+ Constraints to enforce a specific relationship between features and target.
97
+ Keys are feature indices or names, values are -1, 1, or 0.
98
+ force_children_to_bound_parent : bool, default=False
99
+ Whether to restrict children nodes to be within the parent's range.
100
+ missing : float, default=np.nan
101
+ Value to consider as missing data.
102
+ allow_missing_splits : bool, default=True
103
+ Whether to allow splits that separate missing from non-missing values.
104
+ create_missing_branch : bool, default=False
105
+ Whether to create a separate branch for missing values (ternary trees).
106
+ terminate_missing_features : iterable, optional
107
+ Features for which missing branches will always be terminated if
108
+ ``create_missing_branch`` is True.
109
+ missing_node_treatment : str, default="None"
110
+ How to handle weights for missing nodes if ``create_missing_branch`` is True.
111
+ Options: "None", "AssignToParent", "AverageLeafWeight", "AverageNodeWeight".
112
+ log_iterations : int, default=0
113
+ Logging frequency (every N iterations). 0 disables logging.
114
+ feature_importance_method : str, default="Gain"
115
+ Method for calculating feature importance. Options: "Gain", "Weight", "Cover",
116
+ "TotalGain", "TotalCover".
117
+ quantile : float, optional
118
+ Target quantile for quantile regression (objective="QuantileLoss").
119
+ reset : bool, optional
120
+ Whether to reset the model or continue training on subsequent calls to fit.
121
+ categorical_features : str or iterable, default="auto"
122
+ Feature indices or names to treat as categorical.
123
+ timeout : float, optional
124
+ Time limit for fitting in seconds.
125
+ iteration_limit : int, optional
126
+ Maximum number of boosting iterations.
127
+ memory_limit : float, optional
128
+ Memory limit for training in GB.
129
+ stopping_rounds : int, optional
130
+ Early stopping rounds.
131
+ max_bin : int, default=256
132
+ Maximum number of bins for feature discretization.
133
+ max_cat : int, default=1000
134
+ Maximum unique categories before a feature is treated as numerical.
135
+
136
+ Attributes
137
+ ----------
138
+ feature_names_in_ : list of str
139
+ Names of features seen during :meth:`fit`.
140
+ n_features_ : int
141
+ Number of features seen during :meth:`fit`.
142
+ classes_ : list
143
+ Class labels for classification tasks.
144
+ feature_importances_ : ndarray of shape (n_features,)
145
+ Feature importances calculated via ``feature_importance_method``.
146
+
147
+ See Also
148
+ --------
149
+ perpetual.sklearn.PerpetualClassifier : Scikit-learn compatible classifier.
150
+ perpetual.sklearn.PerpetualRegressor : Scikit-learn compatible regressor.
151
+
152
+ Examples
153
+ --------
154
+ Basic usage for binary classification:
155
+
156
+ >>> from perpetual import PerpetualBooster
157
+ >>> from sklearn.datasets import make_classification
158
+ >>> X, y = make_classification(n_samples=1000, n_features=20)
159
+ >>> model = PerpetualBooster(objective="LogLoss")
160
+ >>> model.fit(X, y)
161
+ >>> preds = model.predict(X[:5])
157
162
  """
158
163
 
159
164
  terminate_missing_features_ = (
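The rewritten constructor docstring above lists several objectives (including the new Huber, adaptive Huber, and ListNet options) plus the `quantile` parameter. As a small, hedged illustration of the documented quantile-regression setup (synthetic data, arbitrary settings):

```python
# Sketch of quantile regression with the documented "QuantileLoss" objective.
# Data and parameter values here are illustrative only.
import numpy as np
from sklearn.datasets import make_regression
from perpetual import PerpetualBooster

X, y = make_regression(n_samples=500, n_features=10, noise=10.0, random_state=0)

model = PerpetualBooster(objective="QuantileLoss", quantile=0.9, budget=0.5)
model.fit(X, y)

upper = model.predict(X[:5])  # estimates of the 0.9 conditional quantile
```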
@@ -163,7 +168,16 @@ class PerpetualBooster:
163
168
  {} if monotone_constraints is None else monotone_constraints
164
169
  )
165
170
 
166
- self.objective = objective
171
+ if isinstance(objective, str):
172
+ self.objective = objective
173
+ self.loss = None
174
+ self.grad = None
175
+ self.init = None
176
+ else:
177
+ self.objective = None
178
+ self.loss = objective[0]
179
+ self.grad = objective[1]
180
+ self.init = objective[2]
167
181
  self.budget = budget
168
182
  self.num_threads = num_threads
169
183
  self.monotone_constraints = monotone_constraints_
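The branch above unpacks a non-string `objective` into `self.loss`, `self.grad`, and `self.init`, while the docstring describes the tuple as `(grad, hess, init)`. The exact callable signatures expected by the Rust core are not visible in this diff, so the sketch below only shows the shape of the API (three callables passed as a tuple); the argument conventions are assumptions.

```python
# Hypothetical custom-objective sketch: three callables passed as a tuple.
# The signatures are assumed for illustration and are NOT confirmed by this diff.
import numpy as np
from perpetual import PerpetualBooster


def sq_loss(y, pred):   # assumed per-row loss
    return (y - pred) ** 2


def sq_grad(y, pred):   # assumed first-order (gradient) term
    return pred - y


def sq_init(y):         # assumed initial/base prediction
    return float(np.mean(y))


model = PerpetualBooster(objective=(sq_loss, sq_grad, sq_init))
```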
@@ -205,29 +219,64 @@ class PerpetualBooster:
205
219
  iteration_limit=self.iteration_limit,
206
220
  memory_limit=self.memory_limit,
207
221
  stopping_rounds=self.stopping_rounds,
222
+ loss=self.loss,
223
+ grad=self.grad,
224
+ init=self.init,
208
225
  )
209
226
  self.booster = cast(BoosterType, booster)
210
227
 
211
- def fit(self, X, y, sample_weight=None) -> Self:
212
- """Fit the gradient booster on a provided dataset.
213
-
214
- Args:
215
- X (FrameLike): Either a Polars or Pandas DataFrame, or a 2 dimensional Numpy array.
216
- y (Union[FrameLike, ArrayLike]): Either a Polars or Pandas DataFrame or Series,
217
- or a 1 or 2 dimensional Numpy array.
218
- sample_weight (Union[ArrayLike, None], optional): Instance weights to use when
219
- training the model. If None is passed, a weight of 1 will be used for every record.
220
- Defaults to None.
228
+ def fit(self, X, y, sample_weight=None, group=None) -> Self:
229
+ """
230
+ Fit the gradient booster on a provided dataset.
231
+
232
+ Parameters
233
+ ----------
234
+ X : array-like of shape (n_samples, n_features)
235
+ Training data. Can be a Polars or Pandas DataFrame, or a 2D Numpy array.
236
+ Polars DataFrames use a zero-copy columnar path for efficiency.
237
+ y : array-like of shape (n_samples,) or (n_samples, n_targets)
238
+ Target values.
239
+ sample_weight : array-like of shape (n_samples,), optional
240
+ Individual weights for each sample. If None, all samples are weighted equally.
241
+ group : array-like, optional
242
+ Group labels for ranking objectives.
243
+
244
+ Returns
245
+ -------
246
+ self : object
247
+ Returns self.
221
248
  """
222
249
 
223
- features_, flat_data, rows, cols, categorical_features_, cat_mapping = (
224
- convert_input_frame(X, self.categorical_features, self.max_cat)
225
- )
250
+ # Check if input is a Polars DataFrame for zero-copy columnar path
251
+ is_polars = type_df(X) == "polars_df"
252
+
253
+ if is_polars:
254
+ # Use columnar path for Polars DataFrames (true zero-copy)
255
+ (
256
+ features_,
257
+ columns, # list of 1D arrays instead of flat_data
258
+ masks,
259
+ rows,
260
+ cols,
261
+ categorical_features_,
262
+ cat_mapping,
263
+ ) = convert_input_frame_columnar(X, self.categorical_features, self.max_cat)
264
+ else:
265
+ # Use existing flat path for pandas and numpy
266
+ (
267
+ features_,
268
+ flat_data,
269
+ rows,
270
+ cols,
271
+ categorical_features_,
272
+ cat_mapping,
273
+ ) = convert_input_frame(X, self.categorical_features, self.max_cat)
274
+
226
275
  self.n_features_ = cols
227
276
  self.cat_mapping = cat_mapping
228
277
  self.feature_names_in_ = features_
229
278
 
230
- y_, classes_ = convert_input_array(y, self.objective)
279
+ y_, classes_ = convert_input_array(y, self.objective, is_target=True)
231
280
  self.classes_ = np.array(classes_).tolist()
232
281
 
233
282
  if sample_weight is None:
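The hunk above routes Polars DataFrames through `convert_input_frame_columnar`, so the columnar (zero-copy) path is selected automatically from the input type. A minimal sketch, assuming `polars` is installed and using a toy-sized frame:

```python
# Sketch: fitting directly on a Polars DataFrame. The columnar branch shown
# above is chosen automatically when type_df(X) reports a Polars frame.
import numpy as np
import polars as pl
from perpetual import PerpetualBooster

df = pl.DataFrame(
    {
        "age": [22.0, 38.0, 26.0, 35.0, None, 54.0],
        "fare": [7.25, 71.28, 7.92, 53.1, 8.05, 51.86],
    }
)
y = np.array([0, 1, 1, 1, 0, 1])

model = PerpetualBooster(objective="LogLoss")
model.fit(df, y)   # dispatched to the columnar fit path for Polars input
model.predict(df)  # prediction dispatches on the input type as well
```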
@@ -235,6 +284,11 @@ class PerpetualBooster:
235
284
  else:
236
285
  sample_weight_, _ = convert_input_array(sample_weight, self.objective)
237
286
 
287
+ if group is None:
288
+ group_ = None
289
+ else:
290
+ group_, _ = convert_input_array(group, self.objective, is_int=True)
291
+
238
292
  # Convert the monotone constraints into the form needed
239
293
  # by the rust code.
240
294
  crate_mc = self._standardize_monotonicity_map(X)
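The new `group` argument is converted with `is_int=True` above, but its exact convention (per-row group ids versus group sizes) is not shown in this diff; the per-row-id form in the following sketch is an assumption.

```python
# Hypothetical ranking sketch with the "ListNetLoss" objective. The format of
# `group` (one integer id per row) is an assumption, not confirmed by this diff.
import numpy as np
from perpetual import PerpetualBooster

rng = np.random.default_rng(0)
X = rng.normal(size=(8, 3))
y = np.array([3.0, 2.0, 1.0, 0.0, 2.0, 1.0, 0.0, 0.0])  # relevance labels
group = np.array([0, 0, 0, 0, 1, 1, 1, 1])              # assumed query ids

model = PerpetualBooster(objective="ListNetLoss")
model.fit(X, y, group=group)
```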
@@ -263,6 +317,9 @@ class PerpetualBooster:
263
317
  iteration_limit=self.iteration_limit,
264
318
  memory_limit=self.memory_limit,
265
319
  stopping_rounds=self.stopping_rounds,
320
+ loss=self.loss,
321
+ grad=self.grad,
322
+ init=self.init,
266
323
  )
267
324
  self.booster = cast(BoosterType, booster)
268
325
  else:
@@ -287,6 +344,9 @@ class PerpetualBooster:
287
344
  iteration_limit=self.iteration_limit,
288
345
  memory_limit=self.memory_limit,
289
346
  stopping_rounds=self.stopping_rounds,
347
+ loss=self.loss,
348
+ grad=self.grad,
349
+ init=self.init,
290
350
  )
291
351
  self.booster = cast(MultiOutputBoosterType, booster)
292
352
 
@@ -300,26 +360,51 @@ class PerpetualBooster:
300
360
 
301
361
  self.categorical_features = categorical_features_
302
362
 
303
- self.booster.fit(
304
- flat_data=flat_data,
305
- rows=rows,
306
- cols=cols,
307
- y=y_,
308
- sample_weight=sample_weight_, # type: ignore
309
- )
363
+ if is_polars:
364
+ # Use columnar fit for Polars (zero-copy)
365
+ self.booster.fit_columnar(
366
+ columns=columns,
367
+ masks=masks,
368
+ rows=rows,
369
+ y=y_,
370
+ sample_weight=sample_weight_, # type: ignore
371
+ group=group_,
372
+ )
373
+ else:
374
+ # Use standard fit for pandas/numpy
375
+ self.booster.fit(
376
+ flat_data=flat_data,
377
+ rows=rows,
378
+ cols=cols,
379
+ y=y_,
380
+ sample_weight=sample_weight_, # type: ignore
381
+ group=group_,
382
+ )
310
383
 
311
384
  return self
312
385
 
313
- def prune(self, X, y, sample_weight=None) -> Self:
314
- """Prune the gradient booster on a provided dataset.
315
-
316
- Args:
317
- X (FrameLike): Either a Polars or Pandas DataFrame, or a 2 dimensional Numpy array.
318
- y (Union[FrameLike, ArrayLike]): Either a Polars or Pandas DataFrame or Series,
319
- or a 1 or 2 dimensional Numpy array.
320
- sample_weight (Union[ArrayLike, None], optional): Instance weights to use when
321
- training the model. If None is passed, a weight of 1 will be used for every record.
322
- Defaults to None.
386
+ def prune(self, X, y, sample_weight=None, group=None) -> Self:
387
+ """
388
+ Prune the gradient booster on a provided dataset.
389
+
390
+ This removes nodes that do not contribute to a reduction in loss on the provided
391
+ validation set.
392
+
393
+ Parameters
394
+ ----------
395
+ X : array-like of shape (n_samples, n_features)
396
+ Validation data.
397
+ y : array-like of shape (n_samples,)
398
+ Validation targets.
399
+ sample_weight : array-like of shape (n_samples,), optional
400
+ Weights for validation samples.
401
+ group : array-like, optional
402
+ Group labels for ranking objectives.
403
+
404
+ Returns
405
+ -------
406
+ self : object
407
+ Returns self.
323
408
  """
324
409
 
325
410
  _, flat_data, rows, cols = transform_input_frame(X, self.cat_mapping)
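`prune` keeps the same input-conversion path and, per the new docstring, drops nodes that do not reduce loss on a validation set. A hedged usage sketch on a held-out split:

```python
# Sketch: fit on a training split, then prune against a validation split.
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from perpetual import PerpetualBooster

X, y = make_classification(n_samples=1000, n_features=20, random_state=0)
X_tr, X_val, y_tr, y_val = train_test_split(X, y, test_size=0.2, random_state=0)

model = PerpetualBooster(objective="LogLoss")
model.fit(X_tr, y_tr)
model.prune(X_val, y_val)  # removes nodes that don't help on X_val
```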
@@ -331,65 +416,115 @@ class PerpetualBooster:
331
416
  else:
332
417
  sample_weight_, _ = convert_input_array(sample_weight, self.objective)
333
418
 
419
+ if group is None:
420
+ group_ = None
421
+ else:
422
+ group_, _ = convert_input_array(group, self.objective, is_int=True)
423
+
334
424
  self.booster.prune(
335
425
  flat_data=flat_data,
336
426
  rows=rows,
337
427
  cols=cols,
338
428
  y=y_,
339
429
  sample_weight=sample_weight_, # type: ignore
430
+ group=group_,
340
431
  )
341
432
 
342
433
  return self
343
434
 
344
435
  def calibrate(
345
- self, X_train, y_train, X_cal, y_cal, alpha, sample_weight=None
436
+ self, X_train, y_train, X_cal, y_cal, alpha, sample_weight=None, group=None
346
437
  ) -> Self:
347
- """Calibrate the gradient booster on a provided dataset.
348
-
349
- Args:
350
- X_train (FrameLike): Either a Polars or Pandas DataFrame, or a 2 dimensional Numpy array.
351
- y_train (Union[FrameLike, ArrayLike]): Either a Polars or Pandas DataFrame or Series,
352
- or a 1 or 2 dimensional Numpy array.
353
- X_cal (FrameLike): Either a Polars or Pandas DataFrame, or a 2 dimensional Numpy array.
354
- y_cal (Union[FrameLike, ArrayLike]): Either a Polars or Pandas DataFrame or Series,
355
- or a 1 or 2 dimensional Numpy array.
356
- alpha (ArrayLike): Between 0 and 1, represents the uncertainty of the confidence interval.
357
- Lower alpha produce larger (more conservative) prediction intervals.
358
- alpha is the complement of the target coverage level.
359
- sample_weight (Union[ArrayLike, None], optional): Instance weights to use when
360
- training the model. If None is passed, a weight of 1 will be used for every record.
361
- Defaults to None.
438
+ """
439
+ Calibrate the gradient booster for prediction intervals.
440
+
441
+ Uses the provided training and calibration sets to compute scaling factors
442
+ for intervals.
443
+
444
+ Parameters
445
+ ----------
446
+ X_train : array-like
447
+ Data used to train the base model.
448
+ y_train : array-like
449
+ Targets for training data.
450
+ X_cal : array-like
451
+ Independent calibration dataset.
452
+ y_cal : array-like
453
+ Targets for calibration data.
454
+ alpha : float or array-like
455
+ Significance level(s) for the intervals (1 - coverage).
456
+ sample_weight : array-like, optional
457
+ Sample weights.
458
+ group : array-like, optional
459
+ Group labels.
460
+
461
+ Returns
462
+ -------
463
+ self : object
464
+ Returns self.
362
465
  """
363
466
 
364
- _, flat_data_train, rows_train, cols_train = transform_input_frame(
365
- X_train, self.cat_mapping
366
- )
367
-
368
- y_train_, _ = convert_input_array(y_train, self.objective)
467
+ is_polars = type_df(X_train) == "polars_df"
468
+ if is_polars:
469
+ features_train, cols_train, masks_train, rows_train, _ = (
470
+ transform_input_frame_columnar(X_train, self.cat_mapping)
471
+ )
472
+ self._validate_features(features_train)
473
+ features_cal, cols_cal, masks_cal, rows_cal, _ = (
474
+ transform_input_frame_columnar(X_cal, self.cat_mapping)
475
+ )
476
+ # Use columnar calibration
477
+ y_train_, _ = convert_input_array(y_train, self.objective)
478
+ y_cal_, _ = convert_input_array(y_cal, self.objective)
479
+ if sample_weight is None:
480
+ sample_weight_ = None
481
+ else:
482
+ sample_weight_, _ = convert_input_array(sample_weight, self.objective)
483
+
484
+ self.booster.calibrate_columnar(
485
+ columns=cols_train,
486
+ masks=masks_train,
487
+ rows=rows_train,
488
+ y=y_train_,
489
+ columns_cal=cols_cal,
490
+ masks_cal=masks_cal,
491
+ rows_cal=rows_cal,
492
+ y_cal=y_cal_,
493
+ alpha=np.array(alpha),
494
+ sample_weight=sample_weight_, # type: ignore
495
+ group=group,
496
+ )
497
+ else:
498
+ _, flat_data_train, rows_train, cols_train = transform_input_frame(
499
+ X_train, self.cat_mapping
500
+ )
369
501
 
370
- _, flat_data_cal, rows_cal, cols_cal = transform_input_frame(
371
- X_cal, self.cat_mapping
372
- )
502
+ y_train_, _ = convert_input_array(y_train, self.objective)
373
503
 
374
- y_cal_, _ = convert_input_array(y_cal, self.objective)
504
+ _, flat_data_cal, rows_cal, cols_cal = transform_input_frame(
505
+ X_cal, self.cat_mapping
506
+ )
375
507
 
376
- if sample_weight is None:
377
- sample_weight_ = None
378
- else:
379
- sample_weight_, _ = convert_input_array(sample_weight, self.objective)
508
+ y_cal_, _ = convert_input_array(y_cal, self.objective)
380
509
 
381
- self.booster.calibrate(
382
- flat_data=flat_data_train,
383
- rows=rows_train,
384
- cols=cols_train,
385
- y=y_train_,
386
- flat_data_cal=flat_data_cal,
387
- rows_cal=rows_cal,
388
- cols_cal=cols_cal,
389
- y_cal=y_cal_,
390
- alpha=np.array(alpha),
391
- sample_weight=sample_weight_, # type: ignore
392
- )
510
+ if sample_weight is None:
511
+ sample_weight_ = None
512
+ else:
513
+ sample_weight_, _ = convert_input_array(sample_weight, self.objective)
514
+
515
+ self.booster.calibrate(
516
+ flat_data=flat_data_train,
517
+ rows=rows_train,
518
+ cols=cols_train,
519
+ y=y_train_,
520
+ flat_data_cal=flat_data_cal,
521
+ rows_cal=rows_cal,
522
+ cols_cal=cols_cal,
523
+ y_cal=y_cal_,
524
+ alpha=np.array(alpha),
525
+ sample_weight=sample_weight_, # type: ignore
526
+ group=group,
527
+ )
393
528
 
394
529
  return self
395
530
 
@@ -402,18 +537,31 @@ class PerpetualBooster:
402
537
  )
403
538
 
404
539
  def predict_intervals(self, X, parallel: Union[bool, None] = None) -> dict:
405
- """Predict intervals with the fitted booster on new data.
406
-
407
- Args:
408
- X (FrameLike): Either a Polars or Pandas DataFrame, or a 2 dimensional Numpy array.
409
- parallel (Union[bool, None], optional): Optionally specify if the predict
410
- function should run in parallel on multiple threads. If `None` is
411
- passed, the `parallel` attribute of the booster will be used.
412
- Defaults to `None`.
413
-
414
- Returns:
415
- np.ndarray: Returns a numpy array of the predictions.
416
540
  """
541
+ Predict intervals with the fitted booster on new data.
542
+
543
+ Parameters
544
+ ----------
545
+ X : array-like of shape (n_samples, n_features)
546
+ New data for prediction.
547
+ parallel : bool, optional
548
+ Whether to run prediction in parallel. If None, uses class default.
549
+
550
+ Returns
551
+ -------
552
+ intervals : dict
553
+ A dictionary containing lower and upper bounds for the specified alpha levels.
554
+ """
555
+ is_polars = type_df(X) == "polars_df"
556
+ if is_polars:
557
+ features_, columns, masks, rows, cols = transform_input_frame_columnar(
558
+ X, self.cat_mapping
559
+ )
560
+ self._validate_features(features_)
561
+ return self.booster.predict_intervals_columnar(
562
+ columns=columns, masks=masks, rows=rows, parallel=parallel
563
+ )
564
+
417
565
  features_, flat_data, rows, cols = transform_input_frame(X, self.cat_mapping)
418
566
  self._validate_features(features_)
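`calibrate` and `predict_intervals` work together: calibration computes the scaling for the requested alpha level(s), and `predict_intervals` then returns the bounds. The exact keys of the returned dict are not visible in this diff, so the sketch below leaves them unspecified.

```python
# Sketch: calibration on a held-out set followed by interval prediction.
# alpha=0.1 targets roughly 90% coverage; the dict structure beyond
# "lower/upper bounds per alpha" is not shown in this diff.
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from perpetual import PerpetualBooster

X, y = make_regression(n_samples=1000, n_features=10, noise=5.0, random_state=0)
X_train, X_cal, y_train, y_cal = train_test_split(X, y, test_size=0.3, random_state=0)

model = PerpetualBooster(objective="SquaredLoss")
model.fit(X_train, y_train)
model.calibrate(X_train, y_train, X_cal, y_cal, alpha=[0.1])

intervals = model.predict_intervals(X_cal[:5])
```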
419
567
 
@@ -425,98 +573,170 @@ class PerpetualBooster:
425
573
  )
426
574
 
427
575
  def predict(self, X, parallel: Union[bool, None] = None) -> np.ndarray:
428
- """Predict with the fitted booster on new data.
429
-
430
- Args:
431
- X (FrameLike): Either a Polars or Pandas DataFrame, or a 2 dimensional Numpy array.
432
- parallel (Union[bool, None], optional): Optionally specify if the predict
433
- function should run in parallel on multiple threads. If `None` is
434
- passed, the `parallel` attribute of the booster will be used.
435
- Defaults to `None`.
436
-
437
- Returns:
438
- np.ndarray: Returns a numpy array of the predictions.
439
576
  """
440
- features_, flat_data, rows, cols = transform_input_frame(X, self.cat_mapping)
577
+ Predict with the fitted booster on new data.
578
+
579
+ Parameters
580
+ ----------
581
+ X : array-like of shape (n_samples, n_features)
582
+ Input features.
583
+ parallel : bool, optional
584
+ Whether to run prediction in parallel.
585
+
586
+ Returns
587
+ -------
588
+ predictions : ndarray of shape (n_samples,)
589
+ The predicted values (class labels for classification, raw values for regression).
590
+ """
591
+ is_polars = type_df(X) == "polars_df"
592
+ if is_polars:
593
+ features_, columns, masks, rows, cols = transform_input_frame_columnar(
594
+ X, self.cat_mapping
595
+ )
596
+ else:
597
+ features_, flat_data, rows, cols = transform_input_frame(
598
+ X, self.cat_mapping
599
+ )
441
600
  self._validate_features(features_)
442
601
 
443
602
  if len(self.classes_) == 0:
603
+ if is_polars:
604
+ return self.booster.predict_columnar(
605
+ columns=columns, masks=masks, rows=rows, parallel=parallel
606
+ )
444
607
  return self.booster.predict(
445
608
  flat_data=flat_data, rows=rows, cols=cols, parallel=parallel
446
609
  )
447
610
  elif len(self.classes_) == 2:
611
+ if is_polars:
612
+ return np.rint(
613
+ self.booster.predict_proba_columnar(
614
+ columns=columns, masks=masks, rows=rows, parallel=parallel
615
+ )
616
+ ).astype(int)
448
617
  return np.rint(
449
618
  self.booster.predict_proba(
450
619
  flat_data=flat_data, rows=rows, cols=cols, parallel=parallel
451
620
  )
452
621
  ).astype(int)
453
622
  else:
454
- preds = self.booster.predict(
455
- flat_data=flat_data, rows=rows, cols=cols, parallel=parallel
456
- )
623
+ if is_polars:
624
+ preds = self.booster.predict_columnar(
625
+ columns=columns, masks=masks, rows=rows, parallel=parallel
626
+ )
627
+ else:
628
+ preds = self.booster.predict(
629
+ flat_data=flat_data, rows=rows, cols=cols, parallel=parallel
630
+ )
457
631
  preds_matrix = preds.reshape((-1, len(self.classes_)), order="F")
458
632
  indices = np.argmax(preds_matrix, axis=1)
459
633
  return np.array([self.classes_[i] for i in indices])
460
634
 
461
635
  def predict_proba(self, X, parallel: Union[bool, None] = None) -> np.ndarray:
462
- """Predict probabilities with the fitted booster on new data.
636
+ """
637
+ Predict class probabilities with the fitted booster on new data.
463
638
 
464
- Args:
465
- X (FrameLike): Either a Polars or Pandas DataFrame, or a 2 dimensional Numpy array.
466
- parallel (Union[bool, None], optional): Optionally specify if the predict
467
- function should run in parallel on multiple threads. If `None` is
468
- passed, the `parallel` attribute of the booster will be used.
469
- Defaults to `None`.
639
+ Only valid for classification tasks.
470
640
 
471
- Returns:
472
- np.ndarray, shape (n_samples, n_classes): Returns a numpy array of the class probabilities.
641
+ Parameters
642
+ ----------
643
+ X : array-like of shape (n_samples, n_features)
644
+ Input features.
645
+ parallel : bool, optional
646
+ Whether to run prediction in parallel.
647
+
648
+ Returns
649
+ -------
650
+ probabilities : ndarray of shape (n_samples, n_classes)
651
+ The class probabilities.
473
652
  """
474
- features_, flat_data, rows, cols = transform_input_frame(X, self.cat_mapping)
653
+ is_polars = type_df(X) == "polars_df"
654
+ if is_polars:
655
+ features_, columns, masks, rows, cols = transform_input_frame_columnar(
656
+ X, self.cat_mapping
657
+ )
658
+ else:
659
+ features_, flat_data, rows, cols = transform_input_frame(
660
+ X, self.cat_mapping
661
+ )
475
662
  self._validate_features(features_)
476
663
 
477
664
  if len(self.classes_) > 2:
478
- probabilities = self.booster.predict_proba(
479
- flat_data=flat_data, rows=rows, cols=cols, parallel=parallel
480
- )
665
+ if is_polars:
666
+ probabilities = self.booster.predict_proba_columnar(
667
+ columns=columns, masks=masks, rows=rows, parallel=parallel
668
+ )
669
+ else:
670
+ probabilities = self.booster.predict_proba(
671
+ flat_data=flat_data, rows=rows, cols=cols, parallel=parallel
672
+ )
481
673
  return probabilities.reshape((-1, len(self.classes_)), order="C")
482
674
  elif len(self.classes_) == 2:
483
- probabilities = self.booster.predict_proba(
484
- flat_data=flat_data, rows=rows, cols=cols, parallel=parallel
485
- )
675
+ if is_polars:
676
+ probabilities = self.booster.predict_proba_columnar(
677
+ columns=columns, masks=masks, rows=rows, parallel=parallel
678
+ )
679
+ else:
680
+ probabilities = self.booster.predict_proba(
681
+ flat_data=flat_data, rows=rows, cols=cols, parallel=parallel
682
+ )
486
683
  return np.concatenate(
487
684
  [(1.0 - probabilities).reshape(-1, 1), probabilities.reshape(-1, 1)],
488
685
  axis=1,
489
686
  )
490
687
  else:
491
- raise NotImplementedError(
688
+ warnings.warn(
492
689
  f"predict_proba not implemented for regression. n_classes = {len(self.classes_)}"
493
690
  )
691
+ return np.ones((rows, 1))
494
692
 
495
693
  def predict_log_proba(self, X, parallel: Union[bool, None] = None) -> np.ndarray:
496
- """Predict class log-probabilities with the fitted booster on new data.
694
+ """
695
+ Predict class log-probabilities with the fitted booster on new data.
696
+
697
+ Only valid for classification tasks.
497
698
 
498
- Args:
499
- X (FrameLike): Either a Polars or Pandas DataFrame, or a 2 dimensional Numpy array.
500
- parallel (Union[bool, None], optional): Optionally specify if the predict
501
- function should run in parallel on multiple threads. If `None` is
502
- passed, the `parallel` attribute of the booster will be used.
503
- Defaults to `None`.
699
+ Parameters
700
+ ----------
701
+ X : array-like of shape (n_samples, n_features)
702
+ Input features.
703
+ parallel : bool, optional
704
+ Whether to run prediction in parallel.
504
705
 
505
- Returns:
506
- np.ndarray: Returns a numpy array of the predictions.
706
+ Returns
707
+ -------
708
+ log_probabilities : ndarray of shape (n_samples, n_classes)
709
+ The log-probabilities of each class.
507
710
  """
508
- features_, flat_data, rows, cols = transform_input_frame(X, self.cat_mapping)
711
+ is_polars = type_df(X) == "polars_df"
712
+ if is_polars:
713
+ features_, columns, masks, rows, cols = transform_input_frame_columnar(
714
+ X, self.cat_mapping
715
+ )
716
+ else:
717
+ features_, flat_data, rows, cols = transform_input_frame(
718
+ X, self.cat_mapping
719
+ )
509
720
  self._validate_features(features_)
510
721
 
511
722
  if len(self.classes_) > 2:
512
- preds = self.booster.predict(
513
- flat_data=flat_data,
514
- rows=rows,
515
- cols=cols,
516
- parallel=parallel,
517
- )
723
+ if is_polars:
724
+ preds = self.booster.predict_columnar(
725
+ columns=columns, masks=masks, rows=rows, parallel=parallel
726
+ )
727
+ else:
728
+ preds = self.booster.predict(
729
+ flat_data=flat_data,
730
+ rows=rows,
731
+ cols=cols,
732
+ parallel=parallel,
733
+ )
518
734
  return preds.reshape((-1, len(self.classes_)), order="F")
519
735
  elif len(self.classes_) == 2:
736
+ if is_polars:
737
+ return self.booster.predict_columnar(
738
+ columns=columns, masks=masks, rows=rows, parallel=parallel
739
+ )
520
740
  return self.booster.predict(
521
741
  flat_data=flat_data,
522
742
  rows=rows,
@@ -524,23 +744,36 @@ class PerpetualBooster:
524
744
  parallel=parallel,
525
745
  )
526
746
  else:
527
- raise NotImplementedError(
528
- "predict_log_proba not implemented for regression."
529
- )
747
+ warnings.warn("predict_log_proba not implemented for regression.")
748
+ return np.ones((rows, 1))
530
749
 
531
750
  def predict_nodes(self, X, parallel: Union[bool, None] = None) -> List:
532
- """Predict nodes with the fitted booster on new data.
533
-
534
- Args:
535
- X (FrameLike): Either a Polars or Pandas DataFrame, or a 2 dimensional Numpy array.
536
- parallel (Union[bool, None], optional): Optionally specify if the predict
537
- function should run in parallel on multiple threads. If `None` is
538
- passed, the `parallel` attribute of the booster will be used.
539
- Defaults to `None`.
540
-
541
- Returns:
542
- List: Returns a list of node predictions.
543
751
  """
752
+ Predict leaf node indices with the fitted booster on new data.
753
+
754
+ Parameters
755
+ ----------
756
+ X : array-like of shape (n_samples, n_features)
757
+ Input features.
758
+ parallel : bool, optional
759
+ Whether to run prediction in parallel.
760
+
761
+ Returns
762
+ -------
763
+ node_indices : list of ndarray
764
+ A list where each element corresponds to a tree and contains node indices
765
+ for each sample.
766
+ """
767
+ is_polars = type_df(X) == "polars_df"
768
+ if is_polars:
769
+ features_, columns, masks, rows, cols = transform_input_frame_columnar(
770
+ X, self.cat_mapping
771
+ )
772
+ self._validate_features(features_)
773
+ return self.booster.predict_nodes_columnar(
774
+ columns=columns, masks=masks, rows=rows, parallel=parallel
775
+ )
776
+
544
777
  features_, flat_data, rows, cols = transform_input_frame(X, self.cat_mapping)
545
778
  self._validate_features(features_)
546
779
 
@@ -563,39 +796,66 @@ class PerpetualBooster:
563
796
  def predict_contributions(
564
797
  self, X, method: str = "Average", parallel: Union[bool, None] = None
565
798
  ) -> np.ndarray:
566
- """Predict with the fitted booster on new data, returning the feature
567
- contribution matrix. The last column is the bias term.
568
-
569
-
570
- Args:
571
- X (FrameLike): Either a pandas DataFrame, or a 2 dimensional numpy array.
572
- method (str, optional): Method to calculate the contributions, available options are:
573
-
574
- - "Average": If this option is specified, the average internal node values are calculated.
575
- - "Shapley": Using this option will calculate contributions using the tree shap algorithm.
576
- - "Weight": This method will use the internal leaf weights, to calculate the contributions. This is the same as what is described by Saabas [here](https://blog.datadive.net/interpreting-random-forests/).
577
- - "BranchDifference": This method will calculate contributions by subtracting the weight of the node the record will travel down by the weight of the other non-missing branch. This method does not have the property where the contributions summed is equal to the final prediction of the model.
578
- - "MidpointDifference": This method will calculate contributions by subtracting the weight of the node the record will travel down by the mid-point between the right and left node weighted by the cover of each node. This method does not have the property where the contributions summed is equal to the final prediction of the model.
579
- - "ModeDifference": This method will calculate contributions by subtracting the weight of the node the record will travel down by the weight of the node with the largest cover (the mode node). This method does not have the property where the contributions summed is equal to the final prediction of the model.
580
- - "ProbabilityChange": This method is only valid when the objective type is set to "LogLoss". This method will calculate contributions as the change in a records probability of being 1 moving from a parent node to a child node. The sum of the returned contributions matrix, will be equal to the probability a record will be 1. For example, given a model, `model.predict_contributions(X, method="ProbabilityChange") == 1 / (1 + np.exp(-model.predict(X)))`
581
- parallel (Union[bool, None], optional): Optionally specify if the predict
582
- function should run in parallel on multiple threads. If `None` is
583
- passed, the `parallel` attribute of the booster will be used.
584
- Defaults to `None`.
585
-
586
- Returns:
587
- np.ndarray: Returns a numpy array of the predictions.
588
799
  """
589
- features_, flat_data, rows, cols = transform_input_frame(X, self.cat_mapping)
590
- self._validate_features(features_)
800
+ Predict feature contributions (SHAP-like values) for new data.
801
+
802
+ Parameters
803
+ ----------
804
+ X : array-like of shape (n_samples, n_features)
805
+ Input features.
806
+ method : str, default="Average"
807
+ Method to calculate contributions. Options:
808
+
809
+ - "Average": Internal node averages.
810
+ - "Shapley": Exact tree SHAP values.
811
+ - "Weight": Saabas-style leaf weights.
812
+ - "BranchDifference": Difference between chosen and other branch.
813
+ - "MidpointDifference": Weighted difference between branches.
814
+ - "ModeDifference": Difference from the most frequent node.
815
+ - "ProbabilityChange": Change in probability (LogLoss only).
816
+
817
+ parallel : bool, optional
818
+ Whether to run prediction in parallel.
819
+
820
+ Returns
821
+ -------
822
+ contributions : ndarray of shape (n_samples, n_features + 1)
823
+ The contribution of each feature to the prediction. The last column
824
+ is the bias term.
825
+ """
826
+ is_polars = type_df(X) == "polars_df"
827
+ if is_polars:
828
+ features_, columns, masks, rows, cols = transform_input_frame_columnar(
829
+ X, self.cat_mapping
830
+ )
831
+ self._validate_features(features_)
832
+ contributions = self.booster.predict_contributions_columnar(
833
+ columns=columns,
834
+ masks=masks,
835
+ rows=rows,
836
+ method=CONTRIBUTION_METHODS.get(method, method),
837
+ parallel=parallel,
838
+ )
839
+ else:
840
+ features_, flat_data, rows, cols = transform_input_frame(
841
+ X, self.cat_mapping
842
+ )
843
+ self._validate_features(features_)
591
844
 
592
- contributions = self.booster.predict_contributions(
593
- flat_data=flat_data,
594
- rows=rows,
595
- cols=cols,
596
- method=CONTRIBUTION_METHODS.get(method, method),
597
- parallel=parallel,
598
- )
845
+ contributions = self.booster.predict_contributions(
846
+ flat_data=flat_data,
847
+ rows=rows,
848
+ cols=cols,
849
+ method=CONTRIBUTION_METHODS.get(method, method),
850
+ parallel=parallel,
851
+ )
852
+
853
+ if len(self.classes_) > 2:
854
+ return (
855
+ np.reshape(contributions, (len(self.classes_), rows, cols + 1))
856
+ .transpose(1, 0, 2)
857
+ .reshape(rows, -1)
858
+ )
599
859
  return np.reshape(contributions, (rows, cols + 1))
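For reference, a short sketch of the contribution output shape documented above; the last column is the bias term, and whether each row sums exactly to the raw prediction depends on the chosen method, as the earlier docstring noted.

```python
# Sketch: per-feature contribution matrix with the default "Average" method.
from sklearn.datasets import make_classification
from perpetual import PerpetualBooster

X, y = make_classification(n_samples=500, n_features=5, random_state=0)

model = PerpetualBooster(objective="LogLoss")
model.fit(X, y)

contribs = model.predict_contributions(X[:3], method="Average")
print(contribs.shape)  # (3, 6): five feature columns plus a bias column
```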
600
860
 
601
861
  def partial_dependence(
@@ -606,76 +866,49 @@ class PerpetualBooster:
606
866
  exclude_missing: bool = True,
607
867
  percentile_bounds: Tuple[float, float] = (0.2, 0.98),
608
868
  ) -> np.ndarray:
609
- """Calculate the partial dependence values of a feature. For each unique
610
- value of the feature, this gives the estimate of the predicted value for that
611
- feature, with the effects of all features averaged out. This information gives
612
- an estimate of how a given feature impacts the model.
613
-
614
- Args:
615
- X (FrameLike): Either a pandas DataFrame, or a 2 dimensional numpy array.
616
- This should be the same data passed into the models fit, or predict,
617
- with the columns in the same order.
618
- feature (Union[str, int]): The feature for which to calculate the partial
619
- dependence values. This can be the name of a column, if the provided
620
- X is a pandas DataFrame, or the index of the feature.
621
- samples (Optional[int]): Number of evenly spaced samples to select. If None
622
- is passed all unique values will be used. Defaults to 100.
623
- exclude_missing (bool, optional): Should missing excluded from the features? Defaults to True.
624
- percentile_bounds (Tuple[float, float], optional): Upper and lower percentiles to start at
625
- when calculating the samples. Defaults to (0.2, 0.98) to cap the samples selected
626
- at the 5th and 95th percentiles respectively.
627
-
628
- Raises:
629
- ValueError: An error will be raised if the provided X parameter is not a
630
- pandas DataFrame, and a string is provided for the feature.
631
-
632
- Returns:
633
- np.ndarray: A 2 dimensional numpy array, where the first column is the
634
- sorted unique values of the feature, and then the second column
635
- is the partial dependence values for each feature value.
636
-
637
- Example:
638
- This information can be plotted to visualize how a feature is used in the model, like so.
639
-
640
- ```python
641
- from seaborn import lineplot
642
- import matplotlib.pyplot as plt
643
-
644
- pd_values = model.partial_dependence(X=X, feature="age", samples=None)
645
-
646
- fig = lineplot(x=pd_values[:,0], y=pd_values[:,1],)
647
- plt.title("Partial Dependence Plot")
648
- plt.xlabel("Age")
649
- plt.ylabel("Log Odds")
650
- ```
651
- <img height="340" src="https://github.com/jinlow/forust/raw/main/resources/pdp_plot_age.png">
652
-
653
- We can see how this is impacted if a model is created, where a specific constraint is applied to the feature using the `monotone_constraint` parameter.
654
-
655
- ```python
656
- model = PerpetualBooster(
657
- objective="LogLoss",
658
- monotone_constraints={"age": -1},
659
- )
660
- model.fit(X, y)
661
-
662
- pd_values = model.partial_dependence(X=X, feature="age")
663
- fig = lineplot(
664
- x=pd_values[:, 0],
665
- y=pd_values[:, 1],
666
- )
667
- plt.title("Partial Dependence Plot with Monotonicity")
668
- plt.xlabel("Age")
669
- plt.ylabel("Log Odds")
670
- ```
671
- <img height="340" src="https://github.com/jinlow/forust/raw/main/resources/pdp_plot_age_mono.png">
869
+ """
870
+ Calculate the partial dependence values of a feature.
871
+
872
+ For each unique value of the feature, this gives the estimate of the predicted
873
+ value for that feature, with the effects of all other features averaged out.
874
+
875
+ Parameters
876
+ ----------
877
+ X : array-like
878
+ Data used to calculate partial dependence. Should be the same format
879
+ as passed to :meth:`fit`.
880
+ feature : str or int
881
+ The feature for which to calculate partial dependence.
882
+ samples : int, optional, default=100
883
+ Number of evenly spaced samples to select. If None, all unique values are used.
884
+ exclude_missing : bool, optional, default=True
885
+ Whether to exclude missing values from the calculation.
886
+ percentile_bounds : tuple of float, optional, default=(0.2, 0.98)
887
+ Lower and upper percentiles for sample selection.
888
+
889
+ Returns
890
+ -------
891
+ pd_values : ndarray of shape (n_samples, 2)
892
+ The first column contains the feature values, and the second column
893
+ contains the partial dependence values.
894
+
895
+ Examples
896
+ --------
897
+ >>> import matplotlib.pyplot as plt
898
+ >>> pd_values = model.partial_dependence(X, feature="age")
899
+ >>> plt.plot(pd_values[:, 0], pd_values[:, 1])
672
900
  """
673
901
  if isinstance(feature, str):
674
- if not (type_df(X) == "pandas_df" or type_df(X) == "polars_df"):
902
+ is_polars = type_df(X) == "polars_df"
903
+ if not (type_df(X) == "pandas_df" or is_polars):
675
904
  raise ValueError(
676
- "If `feature` is a string, then the object passed as `X` must be a pandas DataFrame."
905
+ "If `feature` is a string, then the object passed as `X` must be a pandas or polars DataFrame."
677
906
  )
678
- values = X.loc[:, feature].to_numpy()
907
+ if is_polars:
908
+ values = X[feature].to_numpy()
909
+ else:
910
+ values = X.loc[:, feature].to_numpy()
911
+
679
912
  if hasattr(self, "feature_names_in_") and self.feature_names_in_[0] != "0":
680
913
  [feature_idx] = [
681
914
  i for i, v in enumerate(self.feature_names_in_) if v == feature
@@ -687,7 +920,8 @@ class PerpetualBooster:
687
920
  + "ensure columns are the same order as data passed when fit."
688
921
  )
689
922
  warnings.warn(w_msg)
690
- [feature_idx] = [i for i, v in enumerate(X.columns) if v == feature]
923
+ features = X.columns if is_polars else X.columns.to_list()
924
+ [feature_idx] = [i for i, v in enumerate(features) if v == feature]
691
925
  elif isinstance(feature, int):
692
926
  feature_idx = feature
693
927
  if type_df(X) == "pandas_df":
@@ -722,32 +956,27 @@ class PerpetualBooster:
722
956
  def calculate_feature_importance(
723
957
  self, method: str = "Gain", normalize: bool = True
724
958
  ) -> Union[Dict[int, float], Dict[str, float]]:
725
- """Feature importance values can be calculated with the `calculate_feature_importance` method. This function will return a dictionary of the features and their importance values. It should be noted that if a feature was never used for splitting it will not be returned in importance dictionary.
726
-
727
- Args:
728
- method (str, optional): Variable importance method. Defaults to "Gain". Valid options are:
729
-
730
- - "Weight": The number of times a feature is used to split the data across all trees.
731
- - "Gain": The average split gain across all splits the feature is used in.
732
- - "Cover": The average coverage across all splits the feature is used in.
733
- - "TotalGain": The total gain across all splits the feature is used in.
734
- - "TotalCover": The total coverage across all splits the feature is used in.
735
- normalize (bool, optional): Should the importance be normalized to sum to 1? Defaults to `True`.
736
-
737
- Returns:
738
- Dict[str, float]: Variable importance values, for features present in the model.
739
-
740
- Example:
741
- ```python
742
- model.calculate_feature_importance("Gain")
743
- # {
744
- # 'parch': 0.0713072270154953,
745
- # 'age': 0.11609109491109848,
746
- # 'sibsp': 0.1486879289150238,
747
- # 'fare': 0.14309120178222656,
748
- # 'pclass': 0.5208225250244141
749
- # }
750
- ```
959
+ """
960
+ Calculate feature importance for the model.
961
+
962
+ Parameters
963
+ ----------
964
+ method : str, optional, default="Gain"
965
+ Importance method. Options:
966
+
967
+ - "Weight": Number of times a feature is used in splits.
968
+ - "Gain": Average improvement in loss brought by a feature.
969
+ - "Cover": Average number of samples affected by splits on a feature.
970
+ - "TotalGain": Total improvement in loss brought by a feature.
971
+ - "TotalCover": Total number of samples affected by splits on a feature.
972
+
973
+ normalize : bool, optional, default=True
974
+ Whether to normalize importance scores to sum to 1.
975
+
976
+ Returns
977
+ -------
978
+ importance : dict
979
+ A dictionary mapping feature names (or indices) to importance scores.
751
980
  """
752
981
  importance_: Dict[int, float] = self.booster.calculate_feature_importance(
753
982
  method=method,
@@ -761,41 +990,41 @@ class PerpetualBooster:
761
990
  return importance_
762
991
 
763
992
  def text_dump(self) -> List[str]:
764
- """Return all of the trees of the model in text form.
765
-
766
- Returns:
767
- List[str]: A list of strings, where each string is a text representation
768
- of the tree.
769
- Example:
770
- ```python
771
- model.text_dump()[0]
772
- # 0:[0 < 3] yes=1,no=2,missing=2,gain=91.50833,cover=209.388307
773
- # 1:[4 < 13.7917] yes=3,no=4,missing=4,gain=28.185467,cover=94.00148
774
- # 3:[1 < 18] yes=7,no=8,missing=8,gain=1.4576768,cover=22.090348
775
- # 7:[1 < 17] yes=15,no=16,missing=16,gain=0.691266,cover=0.705011
776
- # 15:leaf=-0.15120,cover=0.23500
777
- # 16:leaf=0.154097,cover=0.470007
778
- ```
993
+ """
994
+ Return the booster model in a human-readable text format.
995
+
996
+ Returns
997
+ -------
998
+ dump : list of str
999
+ A list where each string represents a tree in the ensemble.
779
1000
  """
780
1001
  return self.booster.text_dump()
781
1002
 
782
1003
  def json_dump(self) -> str:
783
- """Return the booster object as a string.
1004
+ """
1005
+ Return the booster model in JSON format.
784
1006
 
785
- Returns:
786
- str: The booster dumped as a json object in string form.
1007
+ Returns
1008
+ -------
1009
+ dump : str
1010
+ The JSON representation of the model.
787
1011
  """
788
1012
  return self.booster.json_dump()
789
1013
 
790
1014
  @classmethod
791
1015
  def load_booster(cls, path: str) -> Self:
792
- """Load a booster object that was saved with the `save_booster` method.
1016
+ """
1017
+ Load a booster model from a file.
793
1018
 
794
- Args:
795
- path (str): Path to the saved booster file.
1019
+ Parameters
1020
+ ----------
1021
+ path : str
1022
+ Path to the saved booster (JSON format).
796
1023
 
797
- Returns:
798
- PerpetualBooster: An initialized booster object.
1024
+ Returns
1025
+ -------
1026
+ model : PerpetualBooster
1027
+ The loaded booster object.
799
1028
  """
800
1029
  try:
801
1030
  booster = CratePerpetualBooster.load_booster(str(path))
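A round-trip sketch for the persistence API documented above (`save_booster` / `load_booster`); the on-disk format is JSON-based per the docstrings, and the path used here is arbitrary.

```python
# Sketch: save a fitted booster and load it back from the same path.
from sklearn.datasets import make_classification
from perpetual import PerpetualBooster

X, y = make_classification(n_samples=200, n_features=5, random_state=0)

model = PerpetualBooster(objective="LogLoss")
model.fit(X, y)
model.save_booster("model.json")

restored = PerpetualBooster.load_booster("model.json")
assert (restored.predict(X[:5]) == model.predict(X[:5])).all()
```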
@@ -826,10 +1055,15 @@ class PerpetualBooster:
826
1055
  return c
827
1056
 
828
1057
  def save_booster(self, path: str):
829
- """Save a booster object, the underlying representation is a json file.
1058
+ """
1059
+ Save the booster model to a file.
1060
+
1061
+ The model is saved in a JSON-based format.
830
1062
 
831
- Args:
832
- path (str): Path to save the booster object.
1063
+ Parameters
1064
+ ----------
1065
+ path : str
1066
+ Path where the model will be saved.
833
1067
  """
834
1068
  self.booster.save_booster(str(path))
835
1069
 
@@ -854,22 +1088,33 @@ class PerpetualBooster:
854
1088
  return set(feature_map[f] for f in self.terminate_missing_features)
855
1089
 
856
1090
  def insert_metadata(self, key: str, value: str):
857
- """Insert data into the models metadata, this will be saved on the booster object.
1091
+ """
1092
+ Insert metadata into the model.
1093
+
1094
+ Metadata is saved alongside the model and can be retrieved later.
858
1095
 
859
- Args:
860
- key (str): Key to give the inserted value in the metadata.
861
- value (str): String value to assign to the key.
862
- """ # noqa: E501
1096
+ Parameters
1097
+ ----------
1098
+ key : str
1099
+ The key for the metadata item.
1100
+ value : str
1101
+ The value for the metadata item.
1102
+ """
863
1103
  self.booster.insert_metadata(key=key, value=value)
864
1104
 
865
1105
  def get_metadata(self, key: str) -> str:
866
- """Get the value associated with a given key, on the boosters metadata.
1106
+ """
1107
+ Get metadata associated with a given key.
867
1108
 
868
- Args:
869
- key (str): Key of item in metadata.
1109
+ Parameters
1110
+ ----------
1111
+ key : str
1112
+ The key to look up in the metadata.
870
1113
 
871
- Returns:
872
- str: Value associated with the provided key in the boosters metadata.
1114
+ Returns
1115
+ -------
1116
+ value : str
1117
+ The value associated with the key.
873
1118
  """
874
1119
  v = self.booster.get_metadata(key=key)
875
1120
  return v
@@ -884,19 +1129,25 @@ class PerpetualBooster:
884
1129
 
885
1130
  @property
886
1131
  def base_score(self) -> Union[float, Iterable[float]]:
887
- """Base score of the model.
1132
+ """
1133
+ The base score(s) of the model.
888
1134
 
889
- Returns:
890
- Union[float, Iterable[float]]: Base score(s) of the model.
1135
+ Returns
1136
+ -------
1137
+ score : float or iterable of float
1138
+ The initial prediction value(s) of the model.
891
1139
  """
892
1140
  return self.booster.base_score
893
1141
 
894
1142
  @property
895
1143
  def number_of_trees(self) -> Union[int, Iterable[int]]:
896
- """The number of trees in the model.
1144
+ """
1145
+ The number of trees in the ensemble.
897
1146
 
898
- Returns:
899
- int: The total number of trees in the model.
1147
+ Returns
1148
+ -------
1149
+ n_trees : int or iterable of int
1150
+ Total number of trees.
900
1151
  """
901
1152
  return self.booster.number_of_trees
902
1153
 
@@ -931,22 +1182,35 @@ class PerpetualBooster:
931
1182
  # Functions for scikit-learn compatibility, will feel out adding these manually,
932
1183
  # and then if that feels too unwieldy will add scikit-learn as a dependency.
933
1184
  def get_params(self, deep=True) -> Dict[str, Any]:
934
- """Get all of the parameters for the booster.
1185
+ """
1186
+ Get parameters for this booster.
935
1187
 
936
- Args:
937
- deep (bool, optional): This argument does nothing, and is simply here for scikit-learn compatibility.. Defaults to True.
1188
+ Parameters
1189
+ ----------
1190
+ deep : bool, default=True
1191
+ Currently ignored, exists for scikit-learn compatibility.
938
1192
 
939
- Returns:
940
- Dict[str, Any]: The parameters of the booster.
1193
+ Returns
1194
+ -------
1195
+ params : dict
1196
+ Parameter names mapped to their values.
941
1197
  """
942
1198
  args = inspect.getfullargspec(PerpetualBooster).kwonlyargs
943
1199
  return {param: getattr(self, param) for param in args}
944
1200
 
945
1201
  def set_params(self, **params: Any) -> Self:
946
- """Set the parameters of the booster, this has the same effect as reinstating the booster.
1202
+ """
1203
+ Set parameters for this booster.
947
1204
 
948
- Returns:
949
- PerpetualBooster: Booster with new parameters.
1205
+ Parameters
1206
+ ----------
1207
+ **params : dict
1208
+ Booster parameters.
1209
+
1210
+ Returns
1211
+ -------
1212
+ self : object
1213
+ Returns self.
950
1214
  """
951
1215
  old_params = self.get_params()
952
1216
  old_params.update(params)
@@ -954,33 +1218,26 @@ class PerpetualBooster:
954
1218
  return self
955
1219
 
956
1220
  def get_node_lists(self, map_features_names: bool = True) -> List[List[Node]]:
957
- """Return the tree structures representation as a list of python objects.
958
-
959
- Args:
960
- map_features_names (bool, optional): Should the feature names tried to be mapped to a string, if a pandas dataframe was used. Defaults to True.
961
-
962
- Returns:
963
- List[List[Node]]: A list of lists where each sub list is a tree, with all of it's respective nodes.
964
-
965
- Example:
966
- This can be run directly to get the tree structure as python objects.
967
-
968
- ```python
969
- model = PerpetualBooster()
970
- model.fit(X, y)
1221
+ """
1222
+ Return tree structures as lists of node objects.
971
1223
 
972
- model.get_node_lists()[0]
1224
+ Parameters
1225
+ ----------
1226
+ map_features_names : bool, default=True
1227
+ Whether to use feature names instead of indices.
973
1228
 
974
- # [Node(num=0, weight_value...,
975
- # Node(num=1, weight_value...,
976
- # Node(num=2, weight_value...,
977
- # Node(num=3, weight_value...,
978
- # Node(num=4, weight_value...,
979
- # Node(num=5, weight_value...,
980
- # Node(num=6, weight_value...,]
981
- ```
1229
+ Returns
1230
+ -------
1231
+ trees : list of list of Node
1232
+ Each inner list represents a tree.
982
1233
  """
983
- model = json.loads(self.json_dump())["trees"]
1234
+ dump = json.loads(self.json_dump())
1235
+ if "trees" in dump:
1236
+ all_booster_trees = [dump["trees"]]
1237
+ else:
1238
+ # Multi-output
1239
+ all_booster_trees = [b["trees"] for b in dump["boosters"]]
1240
+
984
1241
  feature_map: Union[Dict[int, str], Dict[int, int]]
985
1242
  leaf_split_feature: Union[str, int]
986
1243
  if map_features_names and hasattr(self, "feature_names_in_"):
@@ -991,34 +1248,26 @@ class PerpetualBooster:
991
1248
  leaf_split_feature = -1
992
1249
 
993
1250
  trees = []
994
- for t in model:
995
- nodes = []
996
- for node in t["nodes"].values():
997
- if not node["is_leaf"]:
998
- node["split_feature"] = feature_map[node["split_feature"]]
999
- else:
1000
- node["split_feature"] = leaf_split_feature
1001
- nodes.append(Node(**node))
1002
- trees.append(nodes)
1251
+ for booster_trees in all_booster_trees:
1252
+ for t in booster_trees:
1253
+ nodes = []
1254
+ for node in t["nodes"].values():
1255
+ if not node["is_leaf"]:
1256
+ node["split_feature"] = feature_map[node["split_feature"]]
1257
+ else:
1258
+ node["split_feature"] = leaf_split_feature
1259
+ nodes.append(Node(**node))
1260
+ trees.append(nodes)
1003
1261
  return trees
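Roughly the usage the old docstring example showed, kept here as a hedged sketch (`X`, `y` are any training data; the `Node` fields follow the dump keys used above):

```python
model = PerpetualBooster()
model.fit(X, y)

first_tree = model.get_node_lists()[0]
for node in first_tree:
    # e.g. Node(num=0, weight_value=..., split_feature=..., is_leaf=...)
    print(node.num, node.split_feature, node.is_leaf)
```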
1004
1262
 
1005
- def trees_to_dataframe(self):
1006
- """Return the tree structure as a Polars or Pandas DataFrame object.
1007
-
1008
- Returns:
1009
- DataFrame: Trees in a Polars or Pandas DataFrame.
1010
-
1011
- Example:
1012
- This can be used directly to print out the tree structure as a dataframe. The Leaf values will have the "Gain" column replaced with the weight value.
1013
-
1014
- ```python
1015
- model.trees_to_dataframe().head()
1016
- ```
1263
+ def trees_to_dataframe(self) -> Any:
1264
+ """
1265
+ Return the tree structures as a DataFrame.
1017
1266
 
1018
- | | Tree | Node | ID | Feature | Split | Yes | No | Missing | Gain | Cover |
1019
- |---:|-------:|-------:|:-----|:----------|--------:|:------|:-----|:----------|--------:|---------:|
1020
- | 0 | 0 | 0 | 0-0 | pclass | 3 | 0-1 | 0-2 | 0-2 | 91.5083 | 209.388 |
1021
- | 1 | 0 | 1 | 0-1 | fare | 13.7917 | 0-3 | 0-4 | 0-4 | 28.1855 | 94.0015 |
1267
+ Returns
1268
+ -------
1269
+ df : DataFrame
1270
+ A Polars or Pandas DataFrame containing tree information.
1022
1271
  """
1023
1272
 
1024
1273
  def node_to_row(
@@ -1062,3 +1311,605 @@ class PerpetualBooster:
1062
1311
  return pd.DataFrame.from_records(vals).sort_values(
1063
1312
  ["Tree", "Node"], ascending=[True, True]
1064
1313
  )
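Usage stays the same as the example dropped from the old docstring; leaf rows report the leaf weight in the Gain column:

```python
df = model.trees_to_dataframe()
print(df.head())
# Columns: Tree, Node, ID, Feature, Split, Yes, No, Missing, Gain, Cover
```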
1314
+
1315
+ def _to_xgboost_json(self) -> Dict[str, Any]:
1316
+ """Convert the Perpetual model to an XGBoost JSON model structure."""
1317
+
1318
+        # More than two classes: the model is a multi-output ensemble (one booster per class)
1319
+ is_multi = len(self.classes_) > 2
1320
+
1321
+ # Get raw dump
1322
+ raw_dump = json.loads(self.json_dump())
1323
+
1324
+ # Initialize XGBoost structure
1325
+ xgb_json = {
1326
+ "learner": {
1327
+ "attributes": {},
1328
+ "feature_names": [],
1329
+ "feature_types": [],
1330
+ "gradient_booster": {
1331
+ "model": {
1332
+ "gbtree_model_param": {
1333
+ "num_parallel_tree": "1",
1334
+ },
1335
+ "trees": [],
1336
+ "tree_info": [],
1337
+ "iteration_indptr": [],
1338
+ "cats": {
1339
+ "enc": [],
1340
+ "feature_segments": [],
1341
+ "sorted_idx": [],
1342
+ },
1343
+ },
1344
+ "name": "gbtree",
1345
+ },
1346
+ "learner_model_param": {
1347
+ "boost_from_average": "1",
1348
+ "num_feature": str(self.n_features_),
1349
+ },
1350
+ "objective": {
1351
+ "name": "binary:logistic",
1352
+ },
1353
+ },
1354
+ "version": [3, 1, 3], # Use a reasonably recent version
1355
+ }
1356
+
1357
+ # Fill feature names if available
1358
+ if hasattr(self, "feature_names_in_"):
1359
+ xgb_json["learner"]["feature_names"] = self.feature_names_in_
1360
+ xgb_json["learner"]["feature_types"] = ["float"] * self.n_features_
1361
+ else:
1362
+ xgb_json["learner"]["feature_names"] = [
1363
+ f"f{i}" for i in range(self.n_features_)
1364
+ ]
1365
+ xgb_json["learner"]["feature_types"] = ["float"] * self.n_features_
1366
+
1367
+ # Objective and Base Score Handling
1368
+ if is_multi:
1369
+ # Multi-class
1370
+ n_classes = len(self.classes_)
1371
+ xgb_json["learner"]["objective"]["name"] = "multi:softprob"
1372
+ xgb_json["learner"]["objective"]["softmax_multiclass_param"] = {
1373
+ "num_class": str(n_classes)
1374
+ }
1375
+ xgb_json["learner"]["learner_model_param"]["num_class"] = str(n_classes)
1376
+ xgb_json["learner"]["learner_model_param"]["num_target"] = "1"
1377
+
1378
+            # XGBoost stores one base score per class; write a neutral 0.5 ("5.0E-1") here
+            # and fold each booster's real base score into its first-round leaves below.
1380
+ base_score_str = ",".join(["5.0E-1"] * n_classes)
1381
+ xgb_json["learner"]["learner_model_param"]["base_score"] = (
1382
+ f"[{base_score_str}]"
1383
+ )
1384
+
1385
+ boosters = raw_dump["boosters"]
1386
+
1387
+ trees = []
1388
+ tree_info = []
1389
+            # XGBoost expects the trees of a single boosting round to be contiguous,
+            # tagged with their class via tree_info, while Perpetual keeps one booster
+            # per class (booster 0 -> class 0, booster 1 -> class 1, ...).
+            # Rearrange them round by round: round 0 (C0, C1, C2), round 1 (C0, C1, C2), ...
1395
+
1396
+            # The per-class boosters are not guaranteed to hold the same number of trees.
1397
+ num_trees_per_booster = [len(b["trees"]) for b in boosters]
1398
+ max_trees = max(num_trees_per_booster) if num_trees_per_booster else 0
1399
+
1400
+            # iteration_indptr marks where each boosting round starts (0, 3, 6, ... for
+            # three classes). Because the per-class boosters may stop at different tree
+            # counts, trees are aligned by round below; a round with no tree for a given
+            # class is simply skipped, and tree_info records the class of every emitted tree.
1408
+
1409
+ iteration_indptr = [0]
1410
+ current_ptr = 0
1411
+
1412
+ for round_idx in range(max_trees):
1413
+ # For each class
1414
+ for group_id, booster_dump in enumerate(boosters):
1415
+ booster_trees = booster_dump["trees"]
1416
+ if round_idx < len(booster_trees):
1417
+ tree = booster_trees[round_idx]
1418
+ base_score = booster_dump["base_score"]
1419
+
1420
+ xgb_tree = self._convert_tree(tree, current_ptr)
1421
+
1422
+ if round_idx == 0:
1423
+ self._adjust_tree_leaves(xgb_tree, base_score)
1424
+
1425
+ trees.append(xgb_tree)
1426
+ tree_info.append(group_id)
1427
+ current_ptr += 1
1428
+ else:
1429
+                    # This booster has no tree for this round (it stopped earlier).
+                    # Skip it; tree_info still identifies the class of every emitted tree.
1433
+ pass
1434
+
1435
+ iteration_indptr.append(current_ptr)
1436
+
1437
+ xgb_json["learner"]["gradient_booster"]["model"]["trees"] = trees
1438
+ xgb_json["learner"]["gradient_booster"]["model"]["tree_info"] = tree_info
1439
+ xgb_json["learner"]["gradient_booster"]["model"]["gbtree_model_param"][
1440
+ "num_trees"
1441
+ ] = str(len(trees))
1442
+ xgb_json["learner"]["gradient_booster"]["model"]["iteration_indptr"] = (
1443
+ iteration_indptr
1444
+ )
1445
+
1446
+ else:
1447
+ # Binary or Regression
1448
+ if self.objective == "LogLoss":
1449
+ xgb_json["learner"]["objective"]["name"] = "binary:logistic"
1450
+ xgb_json["learner"]["objective"]["reg_loss_param"] = {
1451
+ "scale_pos_weight": "1"
1452
+ }
1453
+ xgb_json["learner"]["learner_model_param"]["num_class"] = "0"
1454
+ xgb_json["learner"]["learner_model_param"]["num_target"] = "1"
1455
+
1456
+ # Base Score
1457
+ base_score_val = 1.0 / (1.0 + np.exp(-raw_dump["base_score"]))
1458
+ xgb_json["learner"]["learner_model_param"]["base_score"] = (
1459
+ f"[{base_score_val:.6E}]"
1460
+ )
1461
+
1462
+ elif self.objective == "SquaredLoss":
1463
+ xgb_json["learner"]["objective"]["name"] = "reg:squarederror"
1464
+ xgb_json["learner"]["objective"]["reg_loss_param"] = {}
1465
+ xgb_json["learner"]["learner_model_param"]["num_class"] = "0"
1466
+ xgb_json["learner"]["learner_model_param"]["num_target"] = "1"
1467
+ xgb_json["learner"]["learner_model_param"]["base_score"] = (
1468
+ f"[{raw_dump['base_score']:.6E}]"
1469
+ )
1470
+ else:
1471
+ warnings.warn(
1472
+ f"Objective {self.objective} not explicitly supported for XGBoost export. Defaulting to reg:squarederror."
1473
+ )
1474
+ xgb_json["learner"]["objective"]["name"] = "reg:squarederror"
1475
+ xgb_json["learner"]["objective"]["reg_loss_param"] = {}
1476
+ xgb_json["learner"]["learner_model_param"]["num_class"] = "0"
1477
+ xgb_json["learner"]["learner_model_param"]["num_target"] = "1"
1478
+ xgb_json["learner"]["learner_model_param"]["base_score"] = (
1479
+ f"[{raw_dump['base_score']:.6E}]"
1480
+ )
1481
+
1482
+ trees = []
1483
+ tree_info = []
1484
+ for tree_idx, tree in enumerate(raw_dump["trees"]):
1485
+ xgb_tree = self._convert_tree(tree, tree_idx)
1486
+ trees.append(xgb_tree)
1487
+ tree_info.append(0)
1488
+
1489
+ xgb_json["learner"]["gradient_booster"]["model"]["trees"] = trees
1490
+ xgb_json["learner"]["gradient_booster"]["model"]["tree_info"] = tree_info
1491
+ xgb_json["learner"]["gradient_booster"]["model"]["gbtree_model_param"][
1492
+ "num_trees"
1493
+ ] = str(len(trees))
1494
+ xgb_json["learner"]["gradient_booster"]["model"]["iteration_indptr"] = list(
1495
+ range(len(trees) + 1)
1496
+ )
1497
+
1498
+ return xgb_json
1499
+
1500
+ def _convert_tree(self, tree: Dict[str, Any], group_id: int) -> Dict[str, Any]:
1501
+ """Convert a single Perpetual tree to XGBoost dictionary format."""
1502
+
1503
+ nodes_dict = tree["nodes"]
1504
+ # Convert keys to int and sort
1505
+ sorted_keys = sorted(nodes_dict.keys(), key=lambda x: int(x))
1506
+
1507
+ # Mapping from Perpetual ID (int) to XGBoost Array Index (int)
1508
+ node_map = {int(k): i for i, k in enumerate(sorted_keys)}
1509
+
1510
+ num_nodes = len(sorted_keys)
1511
+ # print(f"DEBUG: Converting tree group={group_id}. num_nodes={num_nodes}")
1512
+
1513
+ left_children = [-1] * num_nodes
1514
+ right_children = [-1] * num_nodes
1515
+ parents = [2147483647] * num_nodes
1516
+ split_indices = [0] * num_nodes
1517
+ split_conditions = [0.0] * num_nodes
1518
+ split_type = [0] * num_nodes
1519
+ sum_hessian = [0.0] * num_nodes
1520
+ loss_changes = [0.0] * num_nodes
1521
+ base_weights = [0.0] * num_nodes
1522
+ default_left = [0] * num_nodes
1523
+
1524
+ categories = []
1525
+ categories_nodes = []
1526
+ categories_segments = []
1527
+ categories_sizes = []
1528
+
1529
+ for i, k in enumerate(sorted_keys):
1530
+ node = nodes_dict[k]
1531
+ nid = int(node["num"])
1532
+ idx = node_map[nid]
1533
+
1534
+ # print(f" DEBUG: Node {i} nid={nid} idx={idx}")
1535
+
1536
+ sum_hessian[idx] = node["hessian_sum"]
1537
+ base_weights[idx] = node["weight_value"]
1538
+ loss_changes[idx] = node.get("split_gain", 0.0)
1539
+
1540
+ if node["is_leaf"]:
1541
+ left_children[idx] = -1
1542
+ right_children[idx] = -1
1543
+ split_indices[idx] = 0
1544
+ split_conditions[idx] = node["weight_value"]
1545
+ else:
1546
+ left_id = node["left_child"]
1547
+ right_id = node["right_child"]
1548
+
1549
+ left_idx = node_map[left_id]
1550
+ right_idx = node_map[right_id]
1551
+
1552
+ left_children[idx] = left_idx
1553
+ right_children[idx] = right_idx
1554
+ parents[left_idx] = idx
1555
+ parents[right_idx] = idx
1556
+
1557
+ split_indices[idx] = node["split_feature"]
1558
+ split_conditions[idx] = node["split_value"]
1559
+
1560
+ # Missing handling
1561
+ # If missing_node goes left
1562
+ if node["missing_node"] == left_id:
1563
+ default_left[idx] = 1
1564
+ else:
1565
+ default_left[idx] = 0
1566
+
1567
+ if (
1568
+ "left_cats" in node
1569
+ and node["left_cats"] is not None
1570
+ and len(node["left_cats"]) > 0
1571
+ ):
1572
+ # It's a categorical split
1573
+ cats = node["left_cats"]
1574
+                    # XGBoost marks categorical splits with split_type = 1
+                    # (0 = numerical, 1 = categorical) and lists the node in
+                    # categories_nodes below.
1577
+ split_type[idx] = 1
1578
+
1579
+ # Update categorical arrays
1580
+ categories_nodes.append(idx)
1581
+ categories_sizes.append(len(cats))
1582
+                    # categories_segments is an exclusive running sum of the sizes:
+                    # segments[i] is the start offset in `categories` for the i-th
+                    # entry of categories_nodes (e.g. sizes [2, 3, 1] -> segments [0, 2, 5]).
+                    # The current size was just appended, so the previous size sits at [-2].
1589
+
1590
+ next_segment = (
1591
+ (categories_segments[-1] + categories_sizes[-2])
1592
+ if categories_segments
1593
+ else 0
1594
+ )
1595
+ categories_segments.append(next_segment)
1596
+
1597
+ categories.extend(sorted(cats))
1598
+
1599
+                    # split_conditions[idx] keeps the numeric value set above; for nodes
+                    # marked categorical, routing is driven by the categories arrays.
1602
+
1603
+ return {
1604
+ "base_weights": base_weights,
1605
+ "default_left": default_left,
1606
+ "id": group_id,
1607
+ "left_children": left_children,
1608
+ "loss_changes": loss_changes,
1609
+ "parents": parents,
1610
+ "right_children": right_children,
1611
+ "split_conditions": split_conditions,
1612
+ "split_indices": split_indices,
1613
+ "split_type": split_type,
1614
+ "sum_hessian": sum_hessian,
1615
+ "tree_param": {
1616
+ "num_deleted": "0",
1617
+ "num_feature": str(self.n_features_),
1618
+ "num_nodes": str(num_nodes),
1619
+ "size_leaf_vector": "1",
1620
+ },
1621
+ "categories": categories,
1622
+ "categories_nodes": categories_nodes,
1623
+ "categories_segments": categories_segments,
1624
+ "categories_sizes": categories_sizes,
1625
+ }
1626
+
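To make the categorical bookkeeping above concrete, a tiny hand-built illustration (the sizes and category codes are made up):

```python
# Three categorical nodes whose left_cats have 2, 3 and 1 categories:
categories_sizes = [2, 3, 1]
categories_segments = [0, 2, 5]   # exclusive running sum of the sizes
categories = [0, 4, 1, 2, 7, 3]   # the per-node category lists, flattened
# The i-th entry of categories_nodes owns
# categories[categories_segments[i] : categories_segments[i] + categories_sizes[i]]
```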
1627
+ def _adjust_tree_leaves(self, xgb_tree: Dict[str, Any], adjustment: float):
1628
+ """Add adjustment value to all leaves in an XGBoost tree dict."""
1629
+ left_children = xgb_tree["left_children"]
1630
+ split_conditions = xgb_tree["split_conditions"]
1631
+ base_weights = xgb_tree["base_weights"]
1632
+
1633
+ for i, left in enumerate(left_children):
1634
+ if left == -1: # Leaf
1635
+ split_conditions[i] += adjustment
1636
+ base_weights[i] += adjustment
1637
+
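A quick arithmetic check of what `_adjust_tree_leaves` does, with made-up numbers: in the multi-class export path the header carries a neutral base score, so each per-class booster's learned base score is folded into its first-round leaves.

```python
booster_base_score = 0.3   # Perpetual's learned initial prediction (illustrative)
leaf_weight = 0.12         # a first-round leaf value (illustrative)
exported_leaf = leaf_weight + booster_base_score   # 0.42 is written to the XGBoost tree
```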
1638
+ def save_as_xgboost(self, path: str):
1639
+ """
1640
+ Save the model in XGBoost JSON format.
1641
+
1642
+ Parameters
1643
+ ----------
1644
+ path : str
1645
+ The path where the XGBoost-compatible model will be saved.
1646
+ """
1647
+ xgboost_json = self._to_xgboost_json()
1648
+ with open(path, "w") as f:
1649
+ json.dump(xgboost_json, f, indent=2)
1650
+
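A hedged round-trip sketch, assuming the xgboost package is installed; whether a given XGBoost version accepts an externally generated JSON file should still be verified against that version:

```python
import xgboost as xgb

model.save_as_xgboost("perpetual_model.json")

bst = xgb.Booster()
bst.load_model("perpetual_model.json")
preds = bst.predict(xgb.DMatrix(X))
```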
1651
+ def save_as_onnx(self, path: str, name: str = "perpetual_model"):
1652
+ """
1653
+ Save the model in ONNX format.
1654
+
1655
+ Parameters
1656
+ ----------
1657
+ path : str
1658
+ The path where the ONNX model will be saved.
1659
+ name : str, optional, default="perpetual_model"
1660
+ The name of the graph in the exported model.
1661
+ """
1662
+ import json
1663
+
1664
+ import onnx
1665
+ from onnx import TensorProto, helper
1666
+
1667
+ raw_dump = json.loads(self.json_dump())
1668
+ is_classifier = len(self.classes_) >= 2
1669
+ is_multi = is_classifier and len(self.classes_) > 2
1670
+ n_classes = len(self.classes_) if is_classifier else 1
1671
+
1672
+ if "trees" in raw_dump:
1673
+ booster_data = [{"trees": raw_dump["trees"]}]
1674
+ else:
1675
+ booster_data = raw_dump["boosters"]
1676
+
1677
+ feature_map_inverse = (
1678
+ {v: k for k, v in enumerate(self.feature_names_in_)}
1679
+ if hasattr(self, "feature_names_in_")
1680
+ else None
1681
+ )
1682
+
1683
+ nodes_treeids = []
1684
+ nodes_nodeids = []
1685
+ nodes_featureids = []
1686
+ nodes_values = []
1687
+ nodes_modes = []
1688
+ nodes_truenodeids = []
1689
+ nodes_falsenodeids = []
1690
+ nodes_missing_value_tracks_true = []
1691
+
1692
+ target_treeids = []
1693
+ target_nodeids = []
1694
+ target_ids = []
1695
+ target_weights = []
1696
+
1697
+ # Base score handling
1698
+ base_score = self.base_score
1699
+ if is_classifier:
1700
+ if is_multi:
1701
+ base_values = [float(b) for b in base_score]
1702
+ else:
1703
+ base_values = [float(base_score)]
1704
+ else:
1705
+ base_values = [float(base_score)]
1706
+
1707
+ global_tree_idx = 0
1708
+ for b_idx, booster in enumerate(booster_data):
1709
+ for tree_data in booster["trees"]:
1710
+ nodes_dict = tree_data["nodes"]
1711
+ node_keys = sorted(nodes_dict.keys(), key=lambda x: int(x))
1712
+
1713
+ node_id_to_idx = {}
1714
+ for i, k in enumerate(node_keys):
1715
+ node_id_to_idx[int(k)] = i
1716
+
1717
+ for k in node_keys:
1718
+ node_dict = nodes_dict[k]
1719
+ nid = int(node_dict["num"])
1720
+ idx_for_onnx = node_id_to_idx[nid]
1721
+
1722
+ nodes_treeids.append(global_tree_idx)
1723
+ nodes_nodeids.append(idx_for_onnx)
1724
+
1725
+ if node_dict["is_leaf"]:
1726
+ nodes_modes.append("LEAF")
1727
+ nodes_featureids.append(0)
1728
+ nodes_values.append(0.0)
1729
+ nodes_truenodeids.append(0)
1730
+ nodes_falsenodeids.append(0)
1731
+ nodes_missing_value_tracks_true.append(0)
1732
+
1733
+ target_treeids.append(global_tree_idx)
1734
+ target_nodeids.append(idx_for_onnx)
1735
+ target_ids.append(b_idx if is_multi else 0)
1736
+ target_weights.append(float(node_dict["weight_value"]))
1737
+ else:
1738
+ nodes_modes.append("BRANCH_LT")
1739
+ feat_val = node_dict["split_feature"]
1740
+ f_idx = 0
1741
+ if isinstance(feat_val, int):
1742
+ f_idx = feat_val
1743
+ elif feature_map_inverse and feat_val in feature_map_inverse:
1744
+ f_idx = feature_map_inverse[feat_val]
1745
+ elif isinstance(feat_val, str) and feat_val.isdigit():
1746
+ f_idx = int(feat_val)
1747
+
1748
+ nodes_featureids.append(f_idx)
1749
+ nodes_values.append(float(node_dict["split_value"]))
1750
+
1751
+ tracks_true = 0
1752
+ if node_dict["missing_node"] == node_dict["left_child"]:
1753
+ tracks_true = 1
1754
+ nodes_missing_value_tracks_true.append(tracks_true)
1755
+
1756
+ nodes_truenodeids.append(
1757
+ node_id_to_idx[int(node_dict["left_child"])]
1758
+ )
1759
+ nodes_falsenodeids.append(
1760
+ node_id_to_idx[int(node_dict["right_child"])]
1761
+ )
1762
+
1763
+ global_tree_idx += 1
1764
+
1765
+ input_name = "input"
1766
+ input_type = helper.make_tensor_value_info(
1767
+ input_name, TensorProto.FLOAT, [None, self.n_features_]
1768
+ )
1769
+
1770
+ raw_scores_name = "raw_scores"
1771
+ reg_node = helper.make_node(
1772
+ "TreeEnsembleRegressor",
1773
+ inputs=[input_name],
1774
+ outputs=[raw_scores_name],
1775
+ domain="ai.onnx.ml",
1776
+ nodes_treeids=nodes_treeids,
1777
+ nodes_nodeids=nodes_nodeids,
1778
+ nodes_featureids=nodes_featureids,
1779
+ nodes_values=nodes_values,
1780
+ nodes_modes=nodes_modes,
1781
+ nodes_truenodeids=nodes_truenodeids,
1782
+ nodes_falsenodeids=nodes_falsenodeids,
1783
+ nodes_missing_value_tracks_true=nodes_missing_value_tracks_true,
1784
+ target_treeids=target_treeids,
1785
+ target_nodeids=target_nodeids,
1786
+ target_ids=target_ids,
1787
+ target_weights=target_weights,
1788
+ base_values=base_values,
1789
+ n_targets=n_classes if is_multi else 1,
1790
+ name="PerpetualTreeEnsemble",
1791
+ )
1792
+
1793
+ ops = [reg_node]
1794
+ if is_classifier:
1795
+ # Prepare class labels mapping
1796
+ classes = self.classes_
1797
+ if all(isinstance(c, (int, np.integer)) for c in classes):
1798
+ tensor_type = TensorProto.INT64
1799
+ classes_array = np.array(classes, dtype=np.int64)
1800
+ elif all(isinstance(c, (float, np.floating)) for c in classes):
1801
+ tensor_type = TensorProto.FLOAT
1802
+ classes_array = np.array(classes, dtype=np.float32)
1803
+ else:
1804
+ tensor_type = TensorProto.STRING
1805
+ classes_array = np.array([str(c) for c in classes], dtype=object)
1806
+
1807
+ classes_name = "class_labels"
1808
+ if tensor_type == TensorProto.STRING:
1809
+ classes_const_node = helper.make_node(
1810
+ "Constant",
1811
+ [],
1812
+ [classes_name],
1813
+ value=helper.make_tensor(
1814
+ name="classes_tensor",
1815
+ data_type=tensor_type,
1816
+ dims=[len(classes)],
1817
+ vals=[s.encode("utf-8") for s in classes_array],
1818
+ ),
1819
+ )
1820
+ else:
1821
+ classes_const_node = helper.make_node(
1822
+ "Constant",
1823
+ [],
1824
+ [classes_name],
1825
+ value=helper.make_tensor(
1826
+ name="classes_tensor",
1827
+ data_type=tensor_type,
1828
+ dims=[len(classes)],
1829
+ vals=classes_array.flatten().tolist(),
1830
+ ),
1831
+ )
1832
+ ops.append(classes_const_node)
1833
+
1834
+ if is_multi:
1835
+ prob_name = "probabilities"
1836
+ softmax_node = helper.make_node(
1837
+ "Softmax", [raw_scores_name], [prob_name], axis=1
1838
+ )
1839
+ label_idx_name = "label_idx"
1840
+ argmax_node = helper.make_node(
1841
+ "ArgMax", [prob_name], [label_idx_name], axis=1, keepdims=0
1842
+ )
1843
+ label_name = "label"
1844
+ gather_node = helper.make_node(
1845
+ "Gather", [classes_name, label_idx_name], [label_name], axis=0
1846
+ )
1847
+ ops.extend([softmax_node, argmax_node, gather_node])
1848
+ outputs = [
1849
+ helper.make_tensor_value_info(label_name, tensor_type, [None]),
1850
+ helper.make_tensor_value_info(
1851
+ prob_name, TensorProto.FLOAT, [None, n_classes]
1852
+ ),
1853
+ ]
1854
+ else:
1855
+ p_name = "p"
1856
+ sigmoid_node = helper.make_node("Sigmoid", [raw_scores_name], [p_name])
1857
+ one_name = "one"
1858
+ one_node = helper.make_node(
1859
+ "Constant",
1860
+ [],
1861
+ [one_name],
1862
+ value=helper.make_tensor("one_v", TensorProto.FLOAT, [1, 1], [1.0]),
1863
+ )
1864
+ one_minus_p_name = "one_minus_p"
1865
+ sub_node = helper.make_node(
1866
+ "Sub", [one_name, p_name], [one_minus_p_name]
1867
+ )
1868
+ prob_name = "probabilities"
1869
+ concat_node = helper.make_node(
1870
+ "Concat", [one_minus_p_name, p_name], [prob_name], axis=1
1871
+ )
1872
+ label_idx_name = "label_idx"
1873
+ argmax_node = helper.make_node(
1874
+ "ArgMax", [prob_name], [label_idx_name], axis=1, keepdims=0
1875
+ )
1876
+ label_name = "label"
1877
+ gather_node = helper.make_node(
1878
+ "Gather", [classes_name, label_idx_name], [label_name], axis=0
1879
+ )
1880
+ ops.extend(
1881
+ [
1882
+ sigmoid_node,
1883
+ one_node,
1884
+ sub_node,
1885
+ concat_node,
1886
+ argmax_node,
1887
+ gather_node,
1888
+ ]
1889
+ )
1890
+ outputs = [
1891
+ helper.make_tensor_value_info(label_name, tensor_type, [None]),
1892
+ helper.make_tensor_value_info(
1893
+ prob_name, TensorProto.FLOAT, [None, 2]
1894
+ ),
1895
+ ]
1896
+ else:
1897
+ prediction_name = "prediction"
1898
+ reg_node.output[0] = prediction_name
1899
+ outputs = [
1900
+ helper.make_tensor_value_info(
1901
+ prediction_name, TensorProto.FLOAT, [None, 1]
1902
+ )
1903
+ ]
1904
+
1905
+ graph_def = helper.make_graph(ops, name, [input_type], outputs)
1906
+ model_def = helper.make_model(
1907
+ graph_def,
1908
+ producer_name="perpetual",
1909
+ opset_imports=[
1910
+ helper.make_opsetid("", 13),
1911
+ helper.make_opsetid("ai.onnx.ml", 2),
1912
+ ],
1913
+ )
1914
+ model_def.ir_version = 6
1915
+ onnx.save(model_def, path)
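And a matching inference sketch with onnxruntime (assuming it is installed); the graph input is named "input" by the export code above, and `X` must be a 2-D float array:

```python
import numpy as np
import onnxruntime as ort

model.save_as_onnx("perpetual_model.onnx")

sess = ort.InferenceSession("perpetual_model.onnx")
outputs = sess.run(None, {"input": np.asarray(X, dtype=np.float32)})
# Classifiers return [label, probabilities]; regressors return [prediction].
```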