perpetual 0.9.1__cp39-cp39-win_amd64.whl



perpetual/booster.py ADDED
@@ -0,0 +1,1064 @@
1
+ import json
2
+ import inspect
3
+ import warnings
4
+ from typing_extensions import Self
5
+ from typing import Any, Dict, Iterable, List, Optional, Set, Tuple, Union, cast
6
+
7
+ import numpy as np
8
+
9
+ from perpetual.perpetual import PerpetualBooster as CratePerpetualBooster # type: ignore
10
+ from perpetual.perpetual import MultiOutputBooster as CrateMultiOutputBooster # type: ignore
11
+ from perpetual.serialize import BaseSerializer, ObjectSerializer
12
+ from perpetual.types import BoosterType, MultiOutputBoosterType
13
+ from perpetual.data import Node
14
+ from perpetual.utils import (
15
+ CONTRIBUTION_METHODS,
16
+ convert_input_array,
17
+ convert_input_frame,
18
+ transform_input_frame,
19
+ type_df,
20
+ )
21
+
22
+
23
+ class PerpetualBooster:
24
+ # Define the metadata parameters
25
+ # that are present on all instances of this class
26
+ # this is useful for parameters that should be
27
+ # attempted to be loaded in and set
28
+ # as attributes on the booster after it is loaded.
29
+ metadata_attributes: Dict[str, BaseSerializer] = {
30
+ "feature_names_in_": ObjectSerializer(),
31
+ "n_features_": ObjectSerializer(),
32
+ "feature_importance_method": ObjectSerializer(),
33
+ "cat_mapping": ObjectSerializer(),
34
+ "classes_": ObjectSerializer(),
35
+ }
36
+
37
+ def __init__(
38
+ self,
39
+ *,
40
+ objective: str = "LogLoss",
41
+ budget: float = 0.5,
42
+ num_threads: Optional[int] = None,
43
+ monotone_constraints: Union[Dict[Any, int], None] = None,
44
+ force_children_to_bound_parent: bool = False,
45
+ missing: float = np.nan,
46
+ allow_missing_splits: bool = True,
47
+ create_missing_branch: bool = False,
48
+ terminate_missing_features: Optional[Iterable[Any]] = None,
49
+ missing_node_treatment: str = "None",
50
+ log_iterations: int = 0,
51
+ feature_importance_method: str = "Gain",
52
+ quantile: Optional[float] = None,
53
+ reset: Optional[bool] = None,
54
+ categorical_features: Union[Iterable[int], Iterable[str], str, None] = "auto",
55
+ timeout: Optional[float] = None,
56
+ iteration_limit: Optional[int] = None,
57
+ memory_limit: Optional[float] = None,
58
+ stopping_rounds: Optional[int] = None,
59
+ max_bin: int = 256,
60
+ max_cat: int = 1000,
61
+ ):
62
+ """PerpetualBooster class, used to create gradient boosted decision tree ensembles.
63
+
64
+ Args:
65
+ objective (str, optional): Learning objective function to be used for optimization. Valid options are:
66
+ "LogLoss" to use logistic loss (classification),
67
+ "SquaredLoss" to use squared error (regression),
68
+ "QuantileLoss" to use quantile error (regression).
69
+ Defaults to "LogLoss".
70
+ budget (float, optional): A positive number for the fitting budget. Increasing this number will more
71
+ likely result in more boosting rounds and increased predictive power.
72
+ Default value is 0.5.
73
+ num_threads (int, optional): Number of threads to be used during training.
74
+ monotone_constraints (Dict[Any, int], optional): Constraints that are used to enforce a
75
+ specific relationship between the training features and the target variable. A dictionary
76
+ should be provided where the keys are the feature index value if the model will be fit on
77
+ a numpy array, or a feature name if it will be fit on a Dataframe. The values of
78
+ the dictionary should be an integer value of -1, 1, or 0 to specify the relationship
79
+ that should be estimated between the respective feature and the target variable.
80
+ Use a value of -1 to enforce a negative relationship, 1 a positive relationship,
81
+ and 0 will enforce no specific relationship at all. Features not included in the
82
+ mapping will not have any constraint applied. If `None` is passed, no constraints
83
+ will be enforced on any variable. Defaults to `None`.
84
+ force_children_to_bound_parent (bool, optional): Setting this parameter to `True` will restrict children nodes, so that they always contain the parent node inside of their range. Without setting this, it's possible that both the left and the right nodes could be greater than, or less than, the parent node. Defaults to `False`.
85
+ missing (float, optional): Value to consider missing, when training and predicting
86
+ with the booster. Defaults to `np.nan`.
87
+ allow_missing_splits (bool, optional): Allow for splits to be made such that all missing values go
88
+ down one branch, and all non-missing values go down the other, if this results
89
+ in the greatest reduction of loss. If this is `False`, splits will only be made on non-missing
90
+ values. If `create_missing_branch` is set to `True`, setting this parameter to `True`
91
+ will allow the missing branch to be split further; if this parameter
92
+ is `False`, the missing branch will always be a terminal node.
93
+ Defaults to `True`.
94
+ create_missing_branch (bool, optional): An experimental parameter that, if `True`, will
95
+ create a separate branch for missing values, resulting in a ternary tree; the missing node will be given the same
96
+ weight value as the parent node. If this parameter is `False`, missing values will be sent
97
+ down either the left or right branch, creating a binary tree. Defaults to `False`.
98
+ terminate_missing_features (Set[Any], optional): An optional iterable of features
99
+ (either strings, or integer values specifying the feature indices if numpy arrays are used for fitting),
100
+ for which the missing node will always be terminated, even if `allow_missing_splits` is set to true.
101
+ This value is only valid if `create_missing_branch` is also True.
102
+ missing_node_treatment (str, optional): Method for selecting the `weight` for the missing node, if `create_missing_branch` is set to `True`. Defaults to "None". Valid options are:
103
+ - "None": Calculate missing node weight values without any constraints.
104
+ - "AssignToParent": Assign the weight of the missing node to that of the parent.
105
+ - "AverageLeafWeight": After training each tree, starting from the bottom of the tree, assign the missing node weight to the weighted average of the left and right child nodes. Next assign the parent to the weighted average of the children nodes. This is performed recursively up through the entire tree. This is performed as a post processing step on each tree after it is built, and prior to updating the predictions for which to train the next tree.
106
+ - "AverageNodeWeight": Set the missing node to be equal to the weighted average weight of the left and the right nodes.
107
+ log_iterations (int, optional): Setting this to a value (N) other than zero will result in information being logged every N iterations. The logged info can be interacted with directly via the python [`logging`](https://docs.python.org/3/howto/logging.html) module. For an example of how to utilize the logging information see the example [here](/#logging-output).
108
+ feature_importance_method (str, optional): The feature importance method type that will be used to calculate the `feature_importances_` attribute on the booster.
109
+ quantile (float, optional): only used in quantile regression.
110
+ reset (bool, optional): whether to reset the model or continue training.
111
+ categorical_features (Union[Iterable[int], Iterable[str], str, None], optional): The names or indices for categorical features.
112
+ Defaults to `auto`, which detects Polars or Pandas categorical data types automatically.
113
+ timeout (float, optional): optional fit timeout in seconds
114
+ iteration_limit (int, optional): optional limit for the number of boosting rounds. The default value is 1000 boosting rounds.
115
+ The algorithm automatically stops in most cases before hitting this limit.
116
+ If you want to experiment with a very high budget (>2.0), you can also increase this limit.
117
+ memory_limit (float, optional): optional limit for memory allocation in GB. If not set, the memory will be allocated based on
118
+ available memory and the algorithm requirements.
119
+ stopping_rounds (int, optional): optional limit for auto stopping.
120
+ max_bin (int, optional): maximum number of bins for feature discretization. Defaults to 256.
121
+ max_cat (int, optional): Maximum number of unique categories for a categorical feature.
122
+ Features with more categories will be treated as numerical.
123
+ Defaults to 1000.
124
+
125
+ Raises:
126
+ TypeError: Raised if an invalid dtype is passed.
127
+
128
+ Example:
129
+ Once the booster has been initialized, it can be fit on a provided dataset and target. After fitting, the model can be used to predict on a dataset.
130
+ In the case of this example, the predictions are the log odds of a given record being 1.
131
+
132
+ ```python
133
+ # Small example dataset
134
+ from seaborn import load_dataset
135
+
136
+ df = load_dataset("titanic")
137
+ X = df.select_dtypes("number").drop(columns=["survived"])
138
+ y = df["survived"]
139
+
140
+ # Initialize a booster with defaults.
141
+ from perpetual import PerpetualBooster
142
+ model = PerpetualBooster(objective="LogLoss")
143
+ model.fit(X, y)
144
+
145
+ # Predict on data
146
+ model.predict(X.head())
147
+ # array([-1.94919663, 2.25863229, 0.32963671, 2.48732194, -3.00371813])
148
+
149
+ # predict contributions
150
+ model.predict_contributions(X.head())
151
+ # array([[-0.63014213, 0.33880048, -0.16520798, -0.07798772, -0.85083578,
152
+ # -1.07720813],
153
+ # [ 1.05406709, 0.08825999, 0.21662544, -0.12083538, 0.35209258,
154
+ # -1.07720813],
155
+ ```
156
+
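+ A further minimal sketch (using the same titanic columns as above) showing how monotone
+ constraints and categorical features can be configured at initialization:
+
+ ```python
+ model = PerpetualBooster(
+ objective="LogLoss",
+ monotone_constraints={"age": -1}, # enforce a negative relationship for "age"
+ categorical_features=["pclass"], # treat "pclass" as categorical
+ )
+ model.fit(X, y)
+ ```
+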
157
+ """
158
+
159
+ terminate_missing_features_ = (
160
+ set() if terminate_missing_features is None else terminate_missing_features
161
+ )
162
+ monotone_constraints_ = (
163
+ {} if monotone_constraints is None else monotone_constraints
164
+ )
165
+
166
+ self.objective = objective
167
+ self.budget = budget
168
+ self.num_threads = num_threads
169
+ self.monotone_constraints = monotone_constraints_
170
+ self.force_children_to_bound_parent = force_children_to_bound_parent
171
+ self.allow_missing_splits = allow_missing_splits
172
+ self.missing = missing
173
+ self.create_missing_branch = create_missing_branch
174
+ self.terminate_missing_features = terminate_missing_features_
175
+ self.missing_node_treatment = missing_node_treatment
176
+ self.log_iterations = log_iterations
177
+ self.feature_importance_method = feature_importance_method
178
+ self.quantile = quantile
179
+ self.reset = reset
180
+ self.categorical_features = categorical_features
181
+ self.timeout = timeout
182
+ self.iteration_limit = iteration_limit
183
+ self.memory_limit = memory_limit
184
+ self.stopping_rounds = stopping_rounds
185
+ self.max_bin = max_bin
186
+ self.max_cat = max_cat
187
+
188
+ booster = CratePerpetualBooster(
189
+ objective=self.objective,
190
+ budget=self.budget,
191
+ max_bin=self.max_bin,
192
+ num_threads=self.num_threads,
193
+ monotone_constraints=dict(),
194
+ force_children_to_bound_parent=self.force_children_to_bound_parent,
195
+ missing=self.missing,
196
+ allow_missing_splits=self.allow_missing_splits,
197
+ create_missing_branch=self.create_missing_branch,
198
+ terminate_missing_features=set(),
199
+ missing_node_treatment=self.missing_node_treatment,
200
+ log_iterations=self.log_iterations,
201
+ quantile=self.quantile,
202
+ reset=self.reset,
203
+ categorical_features=set(),
204
+ timeout=self.timeout,
205
+ iteration_limit=self.iteration_limit,
206
+ memory_limit=self.memory_limit,
207
+ stopping_rounds=self.stopping_rounds,
208
+ )
209
+ self.booster = cast(BoosterType, booster)
210
+
211
+ def fit(self, X, y, sample_weight=None) -> Self:
212
+ """Fit the gradient booster on a provided dataset.
213
+
214
+ Args:
215
+ X (FrameLike): Either a Polars or Pandas DataFrame, or a 2 dimensional Numpy array.
216
+ y (Union[FrameLike, ArrayLike]): Either a Polars or Pandas DataFrame or Series,
217
+ or a 1 or 2 dimensional Numpy array.
218
+ sample_weight (Union[ArrayLike, None], optional): Instance weights to use when
219
+ training the model. If None is passed, a weight of 1 will be used for every record.
220
+ Defaults to None.
221
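+
+ Example:
+ A minimal sketch with numpy inputs (`X_np`, `y_np`, and `weights` are hypothetical
+ arrays; Polars or Pandas DataFrames work the same way):
+
+ ```python
+ import numpy as np
+
+ X_np = np.random.rand(100, 3)
+ y_np = np.random.randint(0, 2, size=100)
+ weights = np.ones(100)
+
+ model = PerpetualBooster(objective="LogLoss")
+ model.fit(X_np, y_np, sample_weight=weights)
+ ```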
+ """
222
+
223
+ features_, flat_data, rows, cols, categorical_features_, cat_mapping = (
224
+ convert_input_frame(X, self.categorical_features, self.max_cat)
225
+ )
226
+ self.n_features_ = cols
227
+ self.cat_mapping = cat_mapping
228
+ self.feature_names_in_ = features_
229
+
230
+ y_, classes_ = convert_input_array(y, self.objective)
231
+ self.classes_ = np.array(classes_).tolist()
232
+
233
+ if sample_weight is None:
234
+ sample_weight_ = None
235
+ else:
236
+ sample_weight_, _ = convert_input_array(sample_weight, self.objective)
237
+
238
+ # Convert the monotone constraints into the form needed
239
+ # by the rust code.
240
+ crate_mc = self._standardize_monotonicity_map(X)
241
+ crate_tmf = self._standardize_terminate_missing_features(X)
242
+
243
+ if (len(classes_) <= 2) or (
244
+ len(classes_) > 1 and self.objective == "SquaredLoss"
245
+ ):
246
+ booster = CratePerpetualBooster(
247
+ objective=self.objective,
248
+ budget=self.budget,
249
+ max_bin=self.max_bin,
250
+ num_threads=self.num_threads,
251
+ monotone_constraints=crate_mc,
252
+ force_children_to_bound_parent=self.force_children_to_bound_parent,
253
+ missing=self.missing,
254
+ allow_missing_splits=self.allow_missing_splits,
255
+ create_missing_branch=self.create_missing_branch,
256
+ terminate_missing_features=crate_tmf,
257
+ missing_node_treatment=self.missing_node_treatment,
258
+ log_iterations=self.log_iterations,
259
+ quantile=self.quantile,
260
+ reset=self.reset,
261
+ categorical_features=categorical_features_,
262
+ timeout=self.timeout,
263
+ iteration_limit=self.iteration_limit,
264
+ memory_limit=self.memory_limit,
265
+ stopping_rounds=self.stopping_rounds,
266
+ )
267
+ self.booster = cast(BoosterType, booster)
268
+ else:
269
+ booster = CrateMultiOutputBooster(
270
+ n_boosters=len(classes_),
271
+ objective=self.objective,
272
+ budget=self.budget,
273
+ max_bin=self.max_bin,
274
+ num_threads=self.num_threads,
275
+ monotone_constraints=crate_mc,
276
+ force_children_to_bound_parent=self.force_children_to_bound_parent,
277
+ missing=self.missing,
278
+ allow_missing_splits=self.allow_missing_splits,
279
+ create_missing_branch=self.create_missing_branch,
280
+ terminate_missing_features=crate_tmf,
281
+ missing_node_treatment=self.missing_node_treatment,
282
+ log_iterations=self.log_iterations,
283
+ quantile=self.quantile,
284
+ reset=self.reset,
285
+ categorical_features=categorical_features_,
286
+ timeout=self.timeout,
287
+ iteration_limit=self.iteration_limit,
288
+ memory_limit=self.memory_limit,
289
+ stopping_rounds=self.stopping_rounds,
290
+ )
291
+ self.booster = cast(MultiOutputBoosterType, booster)
292
+
293
+ self._set_metadata_attributes("n_features_", self.n_features_)
294
+ self._set_metadata_attributes("cat_mapping", self.cat_mapping)
295
+ self._set_metadata_attributes("feature_names_in_", self.feature_names_in_)
296
+ self._set_metadata_attributes(
297
+ "feature_importance_method", self.feature_importance_method
298
+ )
299
+ self._set_metadata_attributes("classes_", self.classes_)
300
+
301
+ self.categorical_features = categorical_features_
302
+
303
+ self.booster.fit(
304
+ flat_data=flat_data,
305
+ rows=rows,
306
+ cols=cols,
307
+ y=y_,
308
+ sample_weight=sample_weight_, # type: ignore
309
+ )
310
+
311
+ return self
312
+
313
+ def prune(self, X, y, sample_weight=None) -> Self:
314
+ """Prune the gradient booster on a provided dataset.
315
+
316
+ Args:
317
+ X (FrameLike): Either a Polars or Pandas DataFrame, or a 2 dimensional Numpy array.
318
+ y (Union[FrameLike, ArrayLike]): Either a Polars or Pandas DataFrame or Series,
319
+ or a 1 or 2 dimensional Numpy array.
320
+ sample_weight (Union[ArrayLike, None], optional): Instance weights to use when
321
+ training the model. If None is passed, a weight of 1 will be used for every record.
322
+ Defaults to None.
323
+ """
324
+
325
+ _, flat_data, rows, cols = transform_input_frame(X, self.cat_mapping)
326
+
327
+ y_, _ = convert_input_array(y, self.objective)
328
+
329
+ if sample_weight is None:
330
+ sample_weight_ = None
331
+ else:
332
+ sample_weight_, _ = convert_input_array(sample_weight, self.objective)
333
+
334
+ self.booster.prune(
335
+ flat_data=flat_data,
336
+ rows=rows,
337
+ cols=cols,
338
+ y=y_,
339
+ sample_weight=sample_weight_, # type: ignore
340
+ )
341
+
342
+ return self
343
+
344
+ def calibrate(
345
+ self, X_train, y_train, X_cal, y_cal, alpha, sample_weight=None
346
+ ) -> Self:
347
+ """Calibrate the gradient booster on a provided dataset.
348
+
349
+ Args:
350
+ X_train (FrameLike): Either a Polars or Pandas DataFrame, or a 2 dimensional Numpy array.
351
+ y_train (Union[FrameLike, ArrayLike]): Either a Polars or Pandas DataFrame or Series,
352
+ or a 1 or 2 dimensional Numpy array.
353
+ X_cal (FrameLike): Either a Polars or Pandas DataFrame, or a 2 dimensional Numpy array.
354
+ y_cal (Union[FrameLike, ArrayLike]): Either a Polars or Pandas DataFrame or Series,
355
+ or a 1 or 2 dimensional Numpy array.
356
+ alpha (ArrayLike): Between 0 and 1, represents the uncertainty of the confidence interval.
357
+ Lower alpha values produce larger (more conservative) prediction intervals.
358
+ alpha is the complement of the target coverage level.
359
+ sample_weight (Union[ArrayLike, None], optional): Instance weights to use when
360
+ training the model. If None is passed, a weight of 1 will be used for every record.
361
+ Defaults to None.
362
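+
+ Example:
+ A minimal sketch (assumes pre-split training and calibration data; the exact
+ structure returned by `predict_intervals` is not shown here):
+
+ ```python
+ model = PerpetualBooster(objective="SquaredLoss")
+ model.fit(X_train, y_train)
+
+ # Calibrate for a 90% target coverage level (alpha = 0.1).
+ model.calibrate(X_train, y_train, X_cal, y_cal, alpha=[0.1])
+ intervals = model.predict_intervals(X_cal)
+ ```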
+ """
363
+
364
+ _, flat_data_train, rows_train, cols_train = transform_input_frame(
365
+ X_train, self.cat_mapping
366
+ )
367
+
368
+ y_train_, _ = convert_input_array(y_train, self.objective)
369
+
370
+ _, flat_data_cal, rows_cal, cols_cal = transform_input_frame(
371
+ X_cal, self.cat_mapping
372
+ )
373
+
374
+ y_cal_, _ = convert_input_array(y_cal, self.objective)
375
+
376
+ if sample_weight is None:
377
+ sample_weight_ = None
378
+ else:
379
+ sample_weight_, _ = convert_input_array(sample_weight, self.objective)
380
+
381
+ self.booster.calibrate(
382
+ flat_data=flat_data_train,
383
+ rows=rows_train,
384
+ cols=cols_train,
385
+ y=y_train_,
386
+ flat_data_cal=flat_data_cal,
387
+ rows_cal=rows_cal,
388
+ cols_cal=cols_cal,
389
+ y_cal=y_cal_,
390
+ alpha=np.array(alpha),
391
+ sample_weight=sample_weight_, # type: ignore
392
+ )
393
+
394
+ return self
395
+
396
+ def _validate_features(self, features: List[str]):
397
+ if len(features) > 0 and hasattr(self, "feature_names_in_"):
398
+ if features[0] != "0" and self.feature_names_in_[0] != "0":
399
+ if features != self.feature_names_in_:
400
+ raise ValueError(
401
+ f"Columns mismatch between data {features} passed, and data {self.feature_names_in_} used at fit."
402
+ )
403
+
404
+ def predict_intervals(self, X, parallel: Union[bool, None] = None) -> dict:
405
+ """Predict intervals with the fitted booster on new data.
406
+
407
+ Args:
408
+ X (FrameLike): Either a Polars or Pandas DataFrame, or a 2 dimensional Numpy array.
409
+ parallel (Union[bool, None], optional): Optionally specify if the predict
410
+ function should run in parallel on multiple threads. If `None` is
411
+ passed, the `parallel` attribute of the booster will be used.
412
+ Defaults to `None`.
413
+
414
+ Returns:
415
+ dict: Returns a dict of the prediction intervals.
416
+ """
417
+ features_, flat_data, rows, cols = transform_input_frame(X, self.cat_mapping)
418
+ self._validate_features(features_)
419
+
420
+ return self.booster.predict_intervals(
421
+ flat_data=flat_data,
422
+ rows=rows,
423
+ cols=cols,
424
+ parallel=parallel,
425
+ )
426
+
427
+ def predict(self, X, parallel: Union[bool, None] = None) -> np.ndarray:
428
+ """Predict with the fitted booster on new data.
429
+
430
+ Args:
431
+ X (FrameLike): Either a Polars or Pandas DataFrame, or a 2 dimensional Numpy array.
432
+ parallel (Union[bool, None], optional): Optionally specify if the predict
433
+ function should run in parallel on multiple threads. If `None` is
434
+ passed, the `parallel` attribute of the booster will be used.
435
+ Defaults to `None`.
436
+
437
+ Returns:
438
+ np.ndarray: Returns a numpy array of the predictions (predicted class labels for classification objectives, raw predictions for regression).
439
+ """
440
+ features_, flat_data, rows, cols = transform_input_frame(X, self.cat_mapping)
441
+ self._validate_features(features_)
442
+
443
+ if len(self.classes_) == 0:
444
+ return self.booster.predict(
445
+ flat_data=flat_data, rows=rows, cols=cols, parallel=parallel
446
+ )
447
+ elif len(self.classes_) == 2:
448
+ return np.rint(
449
+ self.booster.predict_proba(
450
+ flat_data=flat_data, rows=rows, cols=cols, parallel=parallel
451
+ )
452
+ ).astype(int)
453
+ else:
454
+ preds = self.booster.predict(
455
+ flat_data=flat_data, rows=rows, cols=cols, parallel=parallel
456
+ )
457
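+ # Multi-class case: the underlying booster appears to return the raw scores for
+ # each class stacked end to end, so reshape column-major (order="F") into an
+ # (n_samples, n_classes) matrix and take the argmax of each row as the label.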
+ preds_matrix = preds.reshape((-1, len(self.classes_)), order="F")
458
+ indices = np.argmax(preds_matrix, axis=1)
459
+ return np.array([self.classes_[i] for i in indices])
460
+
461
+ def predict_proba(self, X, parallel: Union[bool, None] = None) -> np.ndarray:
462
+ """Predict probabilities with the fitted booster on new data.
463
+
464
+ Args:
465
+ X (FrameLike): Either a Polars or Pandas DataFrame, or a 2 dimensional Numpy array.
466
+ parallel (Union[bool, None], optional): Optionally specify if the predict
467
+ function should run in parallel on multiple threads. If `None` is
468
+ passed, the `parallel` attribute of the booster will be used.
469
+ Defaults to `None`.
470
+
471
+ Returns:
472
+ np.ndarray, shape (n_samples, n_classes): Returns a numpy array of the class probabilities.
473
+ """
474
+ features_, flat_data, rows, cols = transform_input_frame(X, self.cat_mapping)
475
+ self._validate_features(features_)
476
+
477
+ if len(self.classes_) > 2:
478
+ probabilities = self.booster.predict_proba(
479
+ flat_data=flat_data, rows=rows, cols=cols, parallel=parallel
480
+ )
481
+ return probabilities.reshape((-1, len(self.classes_)), order="C")
482
+ elif len(self.classes_) == 2:
483
+ probabilities = self.booster.predict_proba(
484
+ flat_data=flat_data, rows=rows, cols=cols, parallel=parallel
485
+ )
486
+ return np.concatenate(
487
+ [(1.0 - probabilities).reshape(-1, 1), probabilities.reshape(-1, 1)],
488
+ axis=1,
489
+ )
490
+ else:
491
+ raise NotImplementedError(
492
+ f"predict_proba not implemented for regression. n_classes = {len(self.classes_)}"
493
+ )
494
+
495
+ def predict_log_proba(self, X, parallel: Union[bool, None] = None) -> np.ndarray:
496
+ """Predict class log-probabilities with the fitted booster on new data.
497
+
498
+ Args:
499
+ X (FrameLike): Either a Polars or Pandas DataFrame, or a 2 dimensional Numpy array.
500
+ parallel (Union[bool, None], optional): Optionally specify if the predict
501
+ function should run in parallel on multiple threads. If `None` is
502
+ passed, the `parallel` attribute of the booster will be used.
503
+ Defaults to `None`.
504
+
505
+ Returns:
506
+ np.ndarray: Returns a numpy array of the predictions.
507
+ """
508
+ features_, flat_data, rows, cols = transform_input_frame(X, self.cat_mapping)
509
+ self._validate_features(features_)
510
+
511
+ if len(self.classes_) > 2:
512
+ preds = self.booster.predict(
513
+ flat_data=flat_data,
514
+ rows=rows,
515
+ cols=cols,
516
+ parallel=parallel,
517
+ )
518
+ return preds.reshape((-1, len(self.classes_)), order="F")
519
+ elif len(self.classes_) == 2:
520
+ return self.booster.predict(
521
+ flat_data=flat_data,
522
+ rows=rows,
523
+ cols=cols,
524
+ parallel=parallel,
525
+ )
526
+ else:
527
+ raise NotImplementedError(
528
+ "predict_log_proba not implemented for regression."
529
+ )
530
+
531
+ def predict_nodes(self, X, parallel: Union[bool, None] = None) -> List:
532
+ """Predict nodes with the fitted booster on new data.
533
+
534
+ Args:
535
+ X (FrameLike): Either a Polars or Pandas DataFrame, or a 2 dimensional Numpy array.
536
+ parallel (Union[bool, None], optional): Optionally specify if the predict
537
+ function should run in parallel on multiple threads. If `None` is
538
+ passed, the `parallel` attribute of the booster will be used.
539
+ Defaults to `None`.
540
+
541
+ Returns:
542
+ List: Returns a list of node predictions.
543
+ """
544
+ features_, flat_data, rows, cols = transform_input_frame(X, self.cat_mapping)
545
+ self._validate_features(features_)
546
+
547
+ return self.booster.predict_nodes(
548
+ flat_data=flat_data, rows=rows, cols=cols, parallel=parallel
549
+ )
550
+
551
+ @property
552
+ def feature_importances_(self) -> np.ndarray:
553
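+ """Feature importance values, aligned with the order of the features used at fit.
+
+ Importances are calculated with the `feature_importance_method` supplied at
+ initialization and normalized to sum to 1; features never used for a split
+ receive a value of 0.
+ """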
+ vals = self.calculate_feature_importance(
554
+ method=self.feature_importance_method, normalize=True
555
+ )
556
+ if hasattr(self, "feature_names_in_"):
557
+ vals = cast(Dict[str, float], vals)
558
+ return np.array([vals.get(ft, 0.0) for ft in self.feature_names_in_])
559
+ else:
560
+ vals = cast(Dict[int, float], vals)
561
+ return np.array([vals.get(ft, 0.0) for ft in range(self.n_features_)])
562
+
563
+ def predict_contributions(
564
+ self, X, method: str = "Average", parallel: Union[bool, None] = None
565
+ ) -> np.ndarray:
566
+ """Predict with the fitted booster on new data, returning the feature
567
+ contribution matrix. The last column is the bias term.
568
+
569
+
570
+ Args:
571
+ X (FrameLike): Either a pandas DataFrame, or a 2 dimensional numpy array.
572
+ method (str, optional): Method to calculate the contributions, available options are:
573
+
574
+ - "Average": If this option is specified, the average internal node values are calculated.
575
+ - "Shapley": Using this option will calculate contributions using the tree shap algorithm.
576
+ - "Weight": This method will use the internal leaf weights, to calculate the contributions. This is the same as what is described by Saabas [here](https://blog.datadive.net/interpreting-random-forests/).
577
+ - "BranchDifference": This method will calculate contributions by subtracting the weight of the node the record will travel down by the weight of the other non-missing branch. This method does not have the property where the contributions summed is equal to the final prediction of the model.
578
+ - "MidpointDifference": This method will calculate contributions by subtracting the weight of the node the record will travel down by the mid-point between the right and left node weighted by the cover of each node. This method does not have the property where the contributions summed is equal to the final prediction of the model.
579
+ - "ModeDifference": This method will calculate contributions by subtracting the weight of the node the record will travel down by the weight of the node with the largest cover (the mode node). This method does not have the property where the contributions summed is equal to the final prediction of the model.
580
+ - "ProbabilityChange": This method is only valid when the objective type is set to "LogLoss". This method will calculate contributions as the change in a records probability of being 1 moving from a parent node to a child node. The sum of the returned contributions matrix, will be equal to the probability a record will be 1. For example, given a model, `model.predict_contributions(X, method="ProbabilityChange") == 1 / (1 + np.exp(-model.predict(X)))`
581
+ parallel (Union[bool, None], optional): Optionally specify if the predict
582
+ function should run in parallel on multiple threads. If `None` is
583
+ passed, the `parallel` attribute of the booster will be used.
584
+ Defaults to `None`.
585
+
586
+ Returns:
587
+ np.ndarray: Returns a numpy array of the feature contributions, of shape (n_samples, n_features + 1).
588
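+
+ Example:
+ A minimal sketch (assumes a fitted model with a regression objective, for which
+ `predict` returns raw values; the *Difference methods do not have this additive
+ property):
+
+ ```python
+ import numpy as np
+
+ contributions = model.predict_contributions(X, method="Average")
+ # Feature contributions plus the bias term should sum to the raw prediction.
+ np.allclose(contributions.sum(axis=1), model.predict(X))
+ ```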
+ """
589
+ features_, flat_data, rows, cols = transform_input_frame(X, self.cat_mapping)
590
+ self._validate_features(features_)
591
+
592
+ contributions = self.booster.predict_contributions(
593
+ flat_data=flat_data,
594
+ rows=rows,
595
+ cols=cols,
596
+ method=CONTRIBUTION_METHODS.get(method, method),
597
+ parallel=parallel,
598
+ )
599
+ return np.reshape(contributions, (rows, cols + 1))
600
+
601
+ def partial_dependence(
602
+ self,
603
+ X,
604
+ feature: Union[str, int],
605
+ samples: Optional[int] = 100,
606
+ exclude_missing: bool = True,
607
+ percentile_bounds: Tuple[float, float] = (0.2, 0.98),
608
+ ) -> np.ndarray:
609
+ """Calculate the partial dependence values of a feature. For each unique
610
+ value of the feature, this gives the estimate of the predicted value for that
611
+ feature, with the effects of all features averaged out. This information gives
612
+ an estimate of how a given feature impacts the model.
613
+
614
+ Args:
615
+ X (FrameLike): Either a pandas DataFrame, or a 2 dimensional numpy array.
616
+ This should be the same data passed into the models fit, or predict,
617
+ with the columns in the same order.
618
+ feature (Union[str, int]): The feature for which to calculate the partial
619
+ dependence values. This can be the name of a column, if the provided
620
+ X is a pandas DataFrame, or the index of the feature.
621
+ samples (Optional[int]): Number of evenly spaced samples to select. If None
622
+ is passed all unique values will be used. Defaults to 100.
623
+ exclude_missing (bool, optional): Should missing values be excluded from the feature values? Defaults to True.
624
+ percentile_bounds (Tuple[float, float], optional): Lower and upper percentiles to start at
625
+ when calculating the samples. Defaults to (0.2, 0.98) to cap the samples selected
626
+ at the 20th and 98th percentiles respectively.
627
+
628
+ Raises:
629
+ ValueError: An error will be raised if the provided X parameter is not a
630
+ pandas or Polars DataFrame, and a string is provided for the feature.
631
+
632
+ Returns:
633
+ np.ndarray: A 2 dimensional numpy array, where the first column is the
634
+ sorted unique values of the feature, and then the second column
635
+ is the partial dependence values for each feature value.
636
+
637
+ Example:
638
+ This information can be plotted to visualize how a feature is used in the model, like so.
639
+
640
+ ```python
641
+ from seaborn import lineplot
642
+ import matplotlib.pyplot as plt
643
+
644
+ pd_values = model.partial_dependence(X=X, feature="age", samples=None)
645
+
646
+ fig = lineplot(x=pd_values[:,0], y=pd_values[:,1],)
647
+ plt.title("Partial Dependence Plot")
648
+ plt.xlabel("Age")
649
+ plt.ylabel("Log Odds")
650
+ ```
651
+ <img height="340" src="https://github.com/jinlow/forust/raw/main/resources/pdp_plot_age.png">
652
+
653
+ We can see how this is impacted if a model is created where a specific constraint is applied to the feature using the `monotone_constraints` parameter.
654
+
655
+ ```python
656
+ model = PerpetualBooster(
657
+ objective="LogLoss",
658
+ monotone_constraints={"age": -1},
659
+ )
660
+ model.fit(X, y)
661
+
662
+ pd_values = model.partial_dependence(X=X, feature="age")
663
+ fig = lineplot(
664
+ x=pd_values[:, 0],
665
+ y=pd_values[:, 1],
666
+ )
667
+ plt.title("Partial Dependence Plot with Monotonicity")
668
+ plt.xlabel("Age")
669
+ plt.ylabel("Log Odds")
670
+ ```
671
+ <img height="340" src="https://github.com/jinlow/forust/raw/main/resources/pdp_plot_age_mono.png">
672
+ """
673
+ if isinstance(feature, str):
674
+ if not (type_df(X) == "pandas_df" or type_df(X) == "polars_df"):
675
+ raise ValueError(
676
+ "If `feature` is a string, then the object passed as `X` must be a pandas DataFrame."
677
+ )
678
+ values = X.loc[:, feature].to_numpy()
679
+ if hasattr(self, "feature_names_in_") and self.feature_names_in_[0] != "0":
680
+ [feature_idx] = [
681
+ i for i, v in enumerate(self.feature_names_in_) if v == feature
682
+ ]
683
+ else:
684
+ w_msg = (
685
+ "No feature names were provided at fit, but feature was a string, attempting to "
686
+ + "determine feature index from DataFrame Column, "
687
+ + "ensure columns are the same order as data passed when fit."
688
+ )
689
+ warnings.warn(w_msg)
690
+ [feature_idx] = [i for i, v in enumerate(X.columns) if v == feature]
691
+ elif isinstance(feature, int):
692
+ feature_idx = feature
693
+ if type_df(X) == "pandas_df":
694
+ values = X.to_numpy()[:, feature]
695
+ elif type_df(X) == "polars_df":
696
+ values = X.to_numpy(allow_copy=False)[:, feature]
697
+ else:
698
+ values = X[:, feature]
699
+ else:
700
+ raise ValueError(
701
+ f"The parameter `feature` must be a string, or an int, however an object of type {type(feature)} was passed."
702
+ )
703
+ min_p, max_p = percentile_bounds
704
+ values = values[~(np.isnan(values) | (values == self.missing))]
705
+ if samples is None:
706
+ search_values = np.sort(np.unique(values))
707
+ else:
708
+ # Exclude missing from this calculation.
709
+ search_values = np.quantile(values, np.linspace(min_p, max_p, num=samples))
710
+
711
+ # Add missing back, if they wanted it...
712
+ if not exclude_missing:
713
+ search_values = np.append([self.missing], search_values)
714
+
715
+ res = []
716
+ for v in search_values:
717
+ res.append(
718
+ (v, self.booster.value_partial_dependence(feature=feature_idx, value=v))
719
+ )
720
+ return np.array(res)
721
+
722
+ def calculate_feature_importance(
723
+ self, method: str = "Gain", normalize: bool = True
724
+ ) -> Union[Dict[int, float], Dict[str, float]]:
725
+ """Feature importance values can be calculated with the `calculate_feature_importance` method. This function will return a dictionary of the features and their importance values. It should be noted that if a feature was never used for splitting it will not be returned in importance dictionary.
726
+
727
+ Args:
728
+ method (str, optional): Variable importance method. Defaults to "Gain". Valid options are:
729
+
730
+ - "Weight": The number of times a feature is used to split the data across all trees.
731
+ - "Gain": The average split gain across all splits the feature is used in.
732
+ - "Cover": The average coverage across all splits the feature is used in.
733
+ - "TotalGain": The total gain across all splits the feature is used in.
734
+ - "TotalCover": The total coverage across all splits the feature is used in.
735
+ normalize (bool, optional): Should the importance be normalized to sum to 1? Defaults to `True`.
736
+
737
+ Returns:
738
+ Dict[str, float]: Variable importance values, for features present in the model.
739
+
740
+ Example:
741
+ ```python
742
+ model.calculate_feature_importance("Gain")
743
+ # {
744
+ # 'parch': 0.0713072270154953,
745
+ # 'age': 0.11609109491109848,
746
+ # 'sibsp': 0.1486879289150238,
747
+ # 'fare': 0.14309120178222656,
748
+ # 'pclass': 0.5208225250244141
749
+ # }
750
+ ```
751
+ """
752
+ importance_: Dict[int, float] = self.booster.calculate_feature_importance(
753
+ method=method,
754
+ normalize=normalize,
755
+ )
756
+ if hasattr(self, "feature_names_in_"):
757
+ feature_map: Dict[int, str] = {
758
+ i: f for i, f in enumerate(self.feature_names_in_)
759
+ }
760
+ return {feature_map[i]: v for i, v in importance_.items()}
761
+ return importance_
762
+
763
+ def text_dump(self) -> List[str]:
764
+ """Return all of the trees of the model in text form.
765
+
766
+ Returns:
767
+ List[str]: A list of strings, where each string is a text representation
768
+ of the tree.
769
+ Example:
770
+ ```python
771
+ model.text_dump()[0]
772
+ # 0:[0 < 3] yes=1,no=2,missing=2,gain=91.50833,cover=209.388307
773
+ # 1:[4 < 13.7917] yes=3,no=4,missing=4,gain=28.185467,cover=94.00148
774
+ # 3:[1 < 18] yes=7,no=8,missing=8,gain=1.4576768,cover=22.090348
775
+ # 7:[1 < 17] yes=15,no=16,missing=16,gain=0.691266,cover=0.705011
776
+ # 15:leaf=-0.15120,cover=0.23500
777
+ # 16:leaf=0.154097,cover=0.470007
778
+ ```
779
+ """
780
+ return self.booster.text_dump()
781
+
782
+ def json_dump(self) -> str:
783
+ """Return the booster object as a string.
784
+
785
+ Returns:
786
+ str: The booster dumped as a json object in string form.
787
+ """
788
+ return self.booster.json_dump()
789
+
790
+ @classmethod
791
+ def load_booster(cls, path: str) -> Self:
792
+ """Load a booster object that was saved with the `save_booster` method.
793
+
794
+ Args:
795
+ path (str): Path to the saved booster file.
796
+
797
+ Returns:
798
+ PerpetualBooster: An initialized booster object.
799
+ """
800
+ try:
801
+ booster = CratePerpetualBooster.load_booster(str(path))
802
+ except ValueError:
803
+ booster = CrateMultiOutputBooster.load_booster(str(path))
804
+
805
+ params = booster.get_params()
806
+ with warnings.catch_warnings():
807
+ warnings.simplefilter("ignore")
808
+ c = cls(**params)
809
+ c.booster = booster
810
+ for m in c.metadata_attributes:
811
+ try:
812
+ m_ = c._get_metadata_attributes(m)
813
+ setattr(c, m, m_)
814
+ # If "feature_names_in_" is present, we know a
815
+ # pandas dataframe was used for fitting, in this case
816
+ # get back the original monotonicity map, with the
817
+ # feature names as keys.
818
+ if m == "feature_names_in_" and c.feature_names_in_[0] != "0":
819
+ if c.monotone_constraints is not None:
820
+ c.monotone_constraints = {
821
+ ft: c.monotone_constraints[i]
822
+ for i, ft in enumerate(c.feature_names_in_)
823
+ }
824
+ except KeyError:
825
+ pass
826
+ return c
827
+
828
+ def save_booster(self, path: str):
829
+ """Save a booster object, the underlying representation is a json file.
830
+
831
+ Args:
832
+ path (str): Path to save the booster object.
833
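+
+ Example:
+ A minimal round-trip sketch (the file name is arbitrary):
+
+ ```python
+ model.save_booster("model.json")
+
+ # Later, or in another process:
+ loaded = PerpetualBooster.load_booster("model.json")
+ loaded.predict(X)
+ ```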
+ """
834
+ self.booster.save_booster(str(path))
835
+
836
+ def _standardize_monotonicity_map(
837
+ self,
838
+ X,
839
+ ) -> Dict[int, Any]:
840
+ if isinstance(X, np.ndarray):
841
+ return self.monotone_constraints
842
+ else:
843
+ feature_map = {f: i for i, f in enumerate(X.columns)}
844
+ return {feature_map[f]: v for f, v in self.monotone_constraints.items()}
845
+
846
+ def _standardize_terminate_missing_features(
847
+ self,
848
+ X,
849
+ ) -> Set[int]:
850
+ if isinstance(X, np.ndarray):
851
+ return set(self.terminate_missing_features)
852
+ else:
853
+ feature_map = {f: i for i, f in enumerate(X.columns)}
854
+ return set(feature_map[f] for f in self.terminate_missing_features)
855
+
856
+ def insert_metadata(self, key: str, value: str):
857
+ """Insert data into the models metadata, this will be saved on the booster object.
858
+
859
+ Args:
860
+ key (str): Key to give the inserted value in the metadata.
861
+ value (str): String value to assign to the key.
862
+ """ # noqa: E501
863
+ self.booster.insert_metadata(key=key, value=value)
864
+
865
+ def get_metadata(self, key: str) -> str:
866
+ """Get the value associated with a given key, on the boosters metadata.
867
+
868
+ Args:
869
+ key (str): Key of item in metadata.
870
+
871
+ Returns:
872
+ str: Value associated with the provided key in the booster's metadata.
873
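+
+ Example:
+ A minimal round-trip sketch (the key and value are arbitrary):
+
+ ```python
+ model.insert_metadata("run_id", "experiment-42")
+ model.get_metadata("run_id")
+ # 'experiment-42'
+ ```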
+ """
874
+ v = self.booster.get_metadata(key=key)
875
+ return v
876
+
877
+ def _set_metadata_attributes(self, key: str, value: Any) -> None:
878
+ value_ = self.metadata_attributes[key].serialize(value)
879
+ self.insert_metadata(key=key, value=value_)
880
+
881
+ def _get_metadata_attributes(self, key: str) -> Any:
882
+ value = self.get_metadata(key)
883
+ return self.metadata_attributes[key].deserialize(value)
884
+
885
+ @property
886
+ def base_score(self) -> Union[float, Iterable[float]]:
887
+ """Base score of the model.
888
+
889
+ Returns:
890
+ Union[float, Iterable[float]]: Base score(s) of the model.
891
+ """
892
+ return self.booster.base_score
893
+
894
+ @property
895
+ def number_of_trees(self) -> Union[int, Iterable[int]]:
896
+ """The number of trees in the model.
897
+
898
+ Returns:
899
+ Union[int, Iterable[int]]: The total number of trees in the model.
900
+ """
901
+ return self.booster.number_of_trees
902
+
903
+ # Make picklable with getstate and setstate
904
+ def __getstate__(self) -> Dict[Any, Any]:
905
+ booster_json = self.json_dump()
906
+ # Delete booster
907
+ # Doing it like this, so it doesn't delete it globally.
908
+ res = {k: v for k, v in self.__dict__.items() if k != "booster"}
909
+ res["__booster_json_file__"] = booster_json
910
+ return res
911
+
912
+ def __setstate__(self, d: Dict[Any, Any]) -> None:
913
+ # Load the booster object from the pickled JSON string.
914
+ try:
915
+ booster_object = CratePerpetualBooster.from_json(d["__booster_json_file__"])
916
+ except ValueError:
917
+ booster_object = CrateMultiOutputBooster.from_json(
918
+ d["__booster_json_file__"]
919
+ )
920
+ d["booster"] = booster_object
921
+ # Are there any new parameters, that need to be added to the python object,
922
+ # that would have been loaded in as defaults on the json object?
923
+ # This makes sure that defaults set with a serde default function get
924
+ # carried through to the python object.
925
+ for p, v in booster_object.get_params().items():
926
+ if p not in d:
927
+ d[p] = v
928
+ del d["__booster_json_file__"]
929
+ self.__dict__ = d
930
+
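+ # A minimal pickling round-trip sketch (hypothetical usage, assuming a fitted
+ # `model`): the booster is serialized through its JSON dump via the
+ # __getstate__/__setstate__ hooks above.
+ #
+ #     import pickle
+ #     restored = pickle.loads(pickle.dumps(model))
+ #     restored.predict(X)
+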
931
+ # Functions for scikit-learn compatibility, will feel out adding these manually,
932
+ # and then if that feels too unwieldy will add scikit-learn as a dependency.
933
+ def get_params(self, deep=True) -> Dict[str, Any]:
934
+ """Get all of the parameters for the booster.
935
+
936
+ Args:
937
+ deep (bool, optional): This argument does nothing and is simply here for scikit-learn compatibility. Defaults to True.
938
+
939
+ Returns:
940
+ Dict[str, Any]: The parameters of the booster.
941
+ """
942
+ args = inspect.getfullargspec(PerpetualBooster).kwonlyargs
943
+ return {param: getattr(self, param) for param in args}
944
+
945
+ def set_params(self, **params: Any) -> Self:
946
+ """Set the parameters of the booster, this has the same effect as reinstating the booster.
947
+
948
+ Returns:
949
+ PerpetualBooster: Booster with new parameters.
950
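+
+ Example:
+ A minimal sketch:
+
+ ```python
+ model = PerpetualBooster()
+ model.set_params(budget=1.0, objective="SquaredLoss")
+ model.get_params()["budget"] # 1.0
+ ```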
+ """
951
+ old_params = self.get_params()
952
+ old_params.update(params)
953
+ PerpetualBooster.__init__(self, **old_params)
954
+ return self
955
+
956
+ def get_node_lists(self, map_features_names: bool = True) -> List[List[Node]]:
957
+ """Return the tree structures representation as a list of python objects.
958
+
959
+ Args:
960
+ map_features_names (bool, optional): Should the feature names be mapped to strings, if a pandas or Polars DataFrame was used at fit? Defaults to True.
961
+
962
+ Returns:
963
+ List[List[Node]]: A list of lists where each sub-list is a tree, with all of its respective nodes.
964
+
965
+ Example:
966
+ This can be run directly to get the tree structure as python objects.
967
+
968
+ ```python
969
+ model = PerpetualBooster()
970
+ model.fit(X, y)
971
+
972
+ model.get_node_lists()[0]
973
+
974
+ # [Node(num=0, weight_value...,
975
+ # Node(num=1, weight_value...,
976
+ # Node(num=2, weight_value...,
977
+ # Node(num=3, weight_value...,
978
+ # Node(num=4, weight_value...,
979
+ # Node(num=5, weight_value...,
980
+ # Node(num=6, weight_value...,]
981
+ ```
982
+ """
983
+ model = json.loads(self.json_dump())["trees"]
984
+ feature_map: Union[Dict[int, str], Dict[int, int]]
985
+ leaf_split_feature: Union[str, int]
986
+ if map_features_names and hasattr(self, "feature_names_in_"):
987
+ feature_map = {i: ft for i, ft in enumerate(self.feature_names_in_)}
988
+ leaf_split_feature = ""
989
+ else:
990
+ feature_map = {i: i for i in range(self.n_features_)}
991
+ leaf_split_feature = -1
992
+
993
+ trees = []
994
+ for t in model:
995
+ nodes = []
996
+ for node in t["nodes"].values():
997
+ if not node["is_leaf"]:
998
+ node["split_feature"] = feature_map[node["split_feature"]]
999
+ else:
1000
+ node["split_feature"] = leaf_split_feature
1001
+ nodes.append(Node(**node))
1002
+ trees.append(nodes)
1003
+ return trees
1004
+
1005
+ def trees_to_dataframe(self):
1006
+ """Return the tree structure as a Polars or Pandas DataFrame object.
1007
+
1008
+ Returns:
1009
+ DataFrame: Trees in a Polars or Pandas DataFrame.
1010
+
1011
+ Example:
1012
+ This can be used directly to print out the tree structure as a dataframe. For leaf nodes, the "Gain" column contains the weight value.
1013
+
1014
+ ```python
1015
+ model.trees_to_dataframe().head()
1016
+ ```
1017
+
1018
+ | | Tree | Node | ID | Feature | Split | Yes | No | Missing | Gain | Cover |
1019
+ |---:|-------:|-------:|:-----|:----------|--------:|:------|:-----|:----------|--------:|---------:|
1020
+ | 0 | 0 | 0 | 0-0 | pclass | 3 | 0-1 | 0-2 | 0-2 | 91.5083 | 209.388 |
1021
+ | 1 | 0 | 1 | 0-1 | fare | 13.7917 | 0-3 | 0-4 | 0-4 | 28.1855 | 94.0015 |
1022
+ """
1023
+
1024
+ def node_to_row(
1025
+ n: Node,
1026
+ tree_n: int,
1027
+ ) -> Dict[str, Any]:
1028
+ def _id(i: int) -> str:
1029
+ return f"{tree_n}-{i}"
1030
+
1031
+ return dict(
1032
+ Tree=tree_n,
1033
+ Node=n.num,
1034
+ ID=_id(n.num),
1035
+ Feature="Leaf" if n.is_leaf else str(n.split_feature),
1036
+ Split=None if n.is_leaf else n.split_value,
1037
+ Yes=None if n.is_leaf else _id(n.left_child),
1038
+ No=None if n.is_leaf else _id(n.right_child),
1039
+ Missing=None if n.is_leaf else _id(n.missing_node),
1040
+ Gain=n.weight_value if n.is_leaf else n.split_gain,
1041
+ Cover=n.hessian_sum,
1042
+ Left_Cats=n.left_cats,
1043
+ Right_Cats=n.right_cats,
1044
+ )
1045
+
1046
+ # Flatten list of lists using list comprehension
1047
+ vals = [
1048
+ node_to_row(n, i)
1049
+ for i, tree in enumerate(self.get_node_lists())
1050
+ for n in tree
1051
+ ]
1052
+
1053
+ try:
1054
+ import polars as pl
1055
+
1056
+ return pl.from_records(vals).sort(
1057
+ ["Tree", "Node"], descending=[False, False]
1058
+ )
1059
+ except ImportError:
1060
+ import pandas as pd
1061
+
1062
+ return pd.DataFrame.from_records(vals).sort_values(
1063
+ ["Tree", "Node"], ascending=[True, True]
1064
+ )