perpetual-1.1.1-cp312-cp312-macosx_10_12_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- perpetual/__init__.py +11 -0
- perpetual/booster.py +1915 -0
- perpetual/data.py +27 -0
- perpetual/perpetual.cpython-312-darwin.so +0 -0
- perpetual/serialize.py +74 -0
- perpetual/sklearn.py +383 -0
- perpetual/types.py +151 -0
- perpetual/utils.py +463 -0
- perpetual-1.1.1.dist-info/METADATA +177 -0
- perpetual-1.1.1.dist-info/RECORD +12 -0
- perpetual-1.1.1.dist-info/WHEEL +4 -0
- perpetual-1.1.1.dist-info/licenses/LICENSE +674 -0
perpetual/data.py
ADDED
@@ -0,0 +1,27 @@
+from dataclasses import dataclass
+from typing import Any, Iterable, Optional, Union
+
+
+@dataclass
+class Node:
+    """Dataclass representation of a node, this represents all of the fields present in a tree node."""
+
+    num: int
+    weight_value: float
+    hessian_sum: float
+    depth: int = 0
+    split_value: float = 0.0
+    split_feature: Union[str, int] = ""
+    split_gain: float = 0.0
+    missing_node: int = 0
+    left_child: int = 0
+    right_child: int = 0
+    is_leaf: bool = False
+    node_type: str = "split"
+    parent_node: int = 0
+    generalization: Optional[float] = None
+    left_cats: Optional[Iterable] = None
+    right_cats: Optional[Iterable] = None
+    count: int = 0
+    weights: Optional[Iterable[float]] = None
+    stats: Optional[Any] = None
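Since `Node` is a plain dataclass, instances can be constructed directly; only `num`, `weight_value`, and `hessian_sum` lack defaults. A minimal sketch (the field values here are illustrative, not taken from a real model dump):

from perpetual.data import Node

# A hypothetical leaf node; every field beyond the required three is optional.
leaf = Node(num=3, weight_value=0.42, hessian_sum=17.5,
            depth=2, is_leaf=True, node_type="leaf")
print(leaf.split_feature)  # "" (the unset default, unused for a leaf)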
perpetual/perpetual.cpython-312-darwin.so
Binary file
perpetual/serialize.py
ADDED
@@ -0,0 +1,74 @@
+from __future__ import annotations
+
+import json
+from abc import ABC, abstractmethod
+from ast import literal_eval
+from dataclasses import dataclass
+from typing import Dict, Generic, List, Tuple, TypeVar, Union
+
+import numpy as np
+import numpy.typing as npt
+
+T = TypeVar("T")
+
+
+class BaseSerializer(ABC, Generic[T]):
+    @abstractmethod
+    def serialize(self, obj: T) -> str:
+        """serialize method - should take an object and return a string"""
+
+    @abstractmethod
+    def deserialize(self, obj_repr: str) -> T:
+        """deserialize method - should take a string and return original object"""
+
+
+Scaler = Union[int, float, str]
+
+
+class ScalerSerializer(BaseSerializer[Scaler]):
+    def serialize(self, obj: Scaler) -> str:
+        if isinstance(obj, str):
+            obj_ = f"'{obj}'"
+        else:
+            obj_ = str(obj)
+        return obj_
+
+    def deserialize(self, obj_repr: str) -> Scaler:
+        return literal_eval(node_or_string=obj_repr)
+
+
+ObjectItem = Union[
+    List[Scaler],
+    Dict[str, Scaler],
+    Scaler,
+]
+
+
+class ObjectSerializer(BaseSerializer[ObjectItem]):
+    def serialize(self, obj: ObjectItem) -> str:
+        return json.dumps(obj)
+
+    def deserialize(self, obj_repr: str) -> ObjectItem:
+        return json.loads(obj_repr)
+
+
+@dataclass
+class NumpyData:
+    array: Union[List[float], List[int]]
+    dtype: str
+    shape: Tuple[int, ...]
+
+
+class NumpySerializer(BaseSerializer[npt.NDArray]):
+    def serialize(self, obj: npt.NDArray) -> str:
+        return json.dumps(
+            {"array": obj.tolist(), "dtype": str(obj.dtype), "shape": obj.shape}
+        )
+
+    def deserialize(self, obj_repr: str) -> npt.NDArray:
+        data = NumpyData(**json.loads(obj_repr))
+        a = np.array(data.array, dtype=data.dtype)  # type: ignore
+        if len(data.shape) == 1:
+            return a
+        else:
+            return a.reshape(data.shape)
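The three serializers round-trip scalars, JSON-compatible containers, and numpy arrays respectively. A minimal round-trip sketch (assumes the wheel is installed so that `perpetual.serialize` is importable; the values are illustrative):

import numpy as np

from perpetual.serialize import NumpySerializer, ObjectSerializer, ScalerSerializer

# Strings are re-quoted on serialize so that literal_eval can recover them.
s = ScalerSerializer()
assert s.deserialize(s.serialize("feature_1")) == "feature_1"
assert s.deserialize(s.serialize(1.5)) == 1.5

# Lists and dicts of scalars go through plain JSON.
o = ObjectSerializer()
assert o.deserialize(o.serialize({"a": 1, "b": 2.0})) == {"a": 1, "b": 2.0}

# Arrays are stored as a JSON payload of (array, dtype, shape) and rebuilt.
n = NumpySerializer()
x = np.arange(6, dtype=np.float32).reshape(2, 3)
y = n.deserialize(n.serialize(x))
assert y.dtype == x.dtype and y.shape == x.shape and np.array_equal(x, y)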
perpetual/sklearn.py
ADDED
@@ -0,0 +1,383 @@
+import warnings
+from types import FunctionType
+from typing import Any, Dict, Optional, Tuple, Union
+
+from sklearn.base import ClassifierMixin, RegressorMixin
+from sklearn.metrics import accuracy_score, r2_score
+from typing_extensions import Self
+
+from perpetual.booster import PerpetualBooster
+
+
+class PerpetualClassifier(PerpetualBooster, ClassifierMixin):
+    """
+    A scikit-learn compatible classifier based on PerpetualBooster.
+    Uses 'LogLoss' as the default objective.
+    """
+
+    # Expose the objective explicitly in the __init__ signature to allow
+    # scikit-learn to correctly discover and set it via set_params.
+    def __init__(
+        self,
+        *,
+        objective: Union[
+            str, Tuple[FunctionType, FunctionType, FunctionType]
+        ] = "LogLoss",
+        budget: float = 0.5,
+        num_threads: Optional[int] = None,
+        monotone_constraints: Union[Dict[Any, int], None] = None,
+        # ... other parameters ...
+        max_bin: int = 256,
+        max_cat: int = 1000,
+        # Capture all parameters in a way that BaseEstimator can handle
+        **kwargs,
+    ):
+        """
+        Gradient Boosting Machine with Perpetual Learning.
+
+        A self-generalizing gradient boosting machine that doesn't need hyperparameter
+        optimization. It automatically finds the best configuration based on the provided budget.
+
+        Parameters
+        ----------
+        objective : str or tuple, default="LogLoss"
+            Learning objective function to be used for optimization. Valid options are:
+
+            - "LogLoss": logistic loss for binary classification.
+            - custom objective: a tuple of (grad, hess, init) functions.
+
+        budget : float, default=0.5
+            A positive number for fitting budget. Increasing this number will more likely result
+            in more boosting rounds and increased predictive power.
+        num_threads : int, optional
+            Number of threads to be used during training and prediction.
+        monotone_constraints : dict, optional
+            Constraints to enforce a specific relationship between features and target.
+            Keys are feature indices or names, values are -1, 1, or 0.
+        force_children_to_bound_parent : bool, default=False
+            Whether to restrict children nodes to be within the parent's range.
+        missing : float, default=np.nan
+            Value to consider as missing data.
+        allow_missing_splits : bool, default=True
+            Whether to allow splits that separate missing from non-missing values.
+        create_missing_branch : bool, default=False
+            Whether to create a separate branch for missing values (ternary trees).
+        terminate_missing_features : iterable, optional
+            Features for which missing branches will always be terminated if
+            ``create_missing_branch`` is True.
+        missing_node_treatment : str, default="None"
+            How to handle weights for missing nodes if ``create_missing_branch`` is True.
+            Options: "None", "AssignToParent", "AverageLeafWeight", "AverageNodeWeight".
+        log_iterations : int, default=0
+            Logging frequency (every N iterations). 0 disables logging.
+        feature_importance_method : str, default="Gain"
+            Method for calculating feature importance. Options: "Gain", "Weight", "Cover",
+            "TotalGain", "TotalCover".
+        quantile : float, optional
+            Target quantile for quantile regression (objective="QuantileLoss").
+        reset : bool, optional
+            Whether to reset the model or continue training on subsequent calls to fit.
+        categorical_features : str or iterable, default="auto"
+            Feature indices or names to treat as categorical.
+        timeout : float, optional
+            Time limit for fitting in seconds.
+        iteration_limit : int, optional
+            Maximum number of boosting iterations.
+        memory_limit : float, optional
+            Memory limit for training in GB.
+        stopping_rounds : int, optional
+            Early stopping rounds.
+        max_bin : int, default=256
+            Maximum number of bins for feature discretization.
+        max_cat : int, default=1000
+            Maximum unique categories before a feature is treated as numerical.
+        **kwargs
+            Arbitrary keyword arguments to be passed to the base class.
+        """
+        # Ensure the objective is one of the valid classification objectives
+        valid_objectives = {
+            "LogLoss"
+        }  # Assuming only LogLoss for classification for simplicity
+        if isinstance(objective, str) and objective not in valid_objectives:
+            # Custom objectives are allowed via the tuple form
+            pass
+
+        super().__init__(
+            objective=objective,
+            budget=budget,
+            num_threads=num_threads,
+            monotone_constraints=monotone_constraints,
+            # ... pass all other parameters ...
+            max_bin=max_bin,
+            max_cat=max_cat,
+            **kwargs,  # Catch-all for any other parameters passed by user or set_params
+        )
+
+    # fit, predict, predict_proba, and predict_log_proba are inherited
+    # and properly adapted in PerpetualBooster.
+
+    def score(self, X, y, sample_weight=None):
+        """Returns the mean accuracy on the given test data and labels."""
+        preds = self.predict(X)
+        return accuracy_score(y, preds, sample_weight=sample_weight)
+
+    def fit(self, X, y, sample_weight=None, **fit_params) -> Self:
+        """A wrapper for the base fit method."""
+        # Check if objective is appropriate for classification if it's a string
+        if isinstance(self.objective, str) and self.objective not in ["LogLoss"]:
+            warnings.warn(
+                f"Objective '{self.objective}' is typically for regression/ranking but used in PerpetualClassifier. Consider 'LogLoss'."
+            )
+
+        # In classification, the labels (classes_) are set in the base fit.
+        return super().fit(X, y, sample_weight=sample_weight, **fit_params)
+
+
+class PerpetualRegressor(PerpetualBooster, RegressorMixin):
+    """
+    A scikit-learn compatible regressor based on PerpetualBooster.
+    Uses 'SquaredLoss' as the default objective.
+    """
+
+    def __init__(
+        self,
+        *,
+        objective: Union[
+            str, Tuple[FunctionType, FunctionType, FunctionType]
+        ] = "SquaredLoss",
+        budget: float = 0.5,
+        num_threads: Optional[int] = None,
+        monotone_constraints: Union[Dict[Any, int], None] = None,
+        # ... other parameters ...
+        max_bin: int = 256,
+        max_cat: int = 1000,
+        **kwargs,
+    ):
+        """
+        Gradient Boosting Machine with Perpetual Learning.
+
+        A self-generalizing gradient boosting machine that doesn't need hyperparameter
+        optimization. It automatically finds the best configuration based on the provided budget.
+
+        Parameters
+        ----------
+        objective : str or tuple, default="SquaredLoss"
+            Learning objective function to be used for optimization. Valid options are:
+
+            - "SquaredLoss": squared error for regression.
+            - "QuantileLoss": quantile error for quantile regression.
+            - "HuberLoss": Huber loss for robust regression.
+            - "AdaptiveHuberLoss": adaptive Huber loss for robust regression.
+            - custom objective: a tuple of (grad, hess, init) functions.
+
+        budget : float, default=0.5
+            A positive number for fitting budget. Increasing this number will more likely result
+            in more boosting rounds and increased predictive power.
+        num_threads : int, optional
+            Number of threads to be used during training and prediction.
+        monotone_constraints : dict, optional
+            Constraints to enforce a specific relationship between features and target.
+            Keys are feature indices or names, values are -1, 1, or 0.
+        force_children_to_bound_parent : bool, default=False
+            Whether to restrict children nodes to be within the parent's range.
+        missing : float, default=np.nan
+            Value to consider as missing data.
+        allow_missing_splits : bool, default=True
+            Whether to allow splits that separate missing from non-missing values.
+        create_missing_branch : bool, default=False
+            Whether to create a separate branch for missing values (ternary trees).
+        terminate_missing_features : iterable, optional
+            Features for which missing branches will always be terminated if
+            ``create_missing_branch`` is True.
+        missing_node_treatment : str, default="None"
+            How to handle weights for missing nodes if ``create_missing_branch`` is True.
+            Options: "None", "AssignToParent", "AverageLeafWeight", "AverageNodeWeight".
+        log_iterations : int, default=0
+            Logging frequency (every N iterations). 0 disables logging.
+        feature_importance_method : str, default="Gain"
+            Method for calculating feature importance. Options: "Gain", "Weight", "Cover",
+            "TotalGain", "TotalCover".
+        quantile : float, optional
+            Target quantile for quantile regression (objective="QuantileLoss").
+        reset : bool, optional
+            Whether to reset the model or continue training on subsequent calls to fit.
+        categorical_features : str or iterable, default="auto"
+            Feature indices or names to treat as categorical.
+        timeout : float, optional
+            Time limit for fitting in seconds.
+        iteration_limit : int, optional
+            Maximum number of boosting iterations.
+        memory_limit : float, optional
+            Memory limit for training in GB.
+        stopping_rounds : int, optional
+            Early stopping rounds.
+        max_bin : int, default=256
+            Maximum number of bins for feature discretization.
+        max_cat : int, default=1000
+            Maximum unique categories before a feature is treated as numerical.
+        **kwargs
+            Arbitrary keyword arguments to be passed to the base class.
+        """
+        # Enforce or warn about regression objectives
+        valid_objectives = {
+            "SquaredLoss",
+            "QuantileLoss",
+            "HuberLoss",
+            "AdaptiveHuberLoss",
+        }
+        if isinstance(objective, str) and objective not in valid_objectives:
+            pass  # Allow for custom string or tuple objective
+
+        super().__init__(
+            objective=objective,
+            budget=budget,
+            num_threads=num_threads,
+            monotone_constraints=monotone_constraints,
+            # ... pass all other parameters ...
+            max_bin=max_bin,
+            max_cat=max_cat,
+            **kwargs,
+        )
+
+    def fit(self, X, y, sample_weight=None, **fit_params) -> Self:
+        """A wrapper for the base fit method."""
+        # For regression, we typically enforce len(self.classes_) == 0 after fit
+        if isinstance(self.objective, str) and self.objective not in [
+            "SquaredLoss",
+            "QuantileLoss",
+            "HuberLoss",
+            "AdaptiveHuberLoss",
+        ]:
+            warnings.warn(
+                f"Objective '{self.objective}' may not be suitable for PerpetualRegressor. Consider 'SquaredLoss' or a quantile/huber loss."
+            )
+
+        return super().fit(X, y, sample_weight=sample_weight, **fit_params)
+
+    def score(self, X, y, sample_weight=None):
+        """Returns the coefficient of determination ($R^2$) of the prediction."""
+        preds = self.predict(X)
+        return r2_score(y, preds, sample_weight=sample_weight)
+
+
+class PerpetualRanker(
+    PerpetualBooster, RegressorMixin
+):  # Ranking models sometimes inherit from RegressorMixin for compatibility
+    """
+    A scikit-learn compatible ranker based on PerpetualBooster.
+    Uses 'ListNetLoss' as the default objective.
+    Requires the 'group' parameter to be passed to fit.
+    """
+
+    def __init__(
+        self,
+        *,
+        objective: Union[
+            str, Tuple[FunctionType, FunctionType, FunctionType]
+        ] = "ListNetLoss",
+        budget: float = 0.5,
+        num_threads: Optional[int] = None,
+        monotone_constraints: Union[Dict[Any, int], None] = None,
+        # ... other parameters ...
+        max_bin: int = 256,
+        max_cat: int = 1000,
+        **kwargs,
+    ):
+        """
+        Gradient Boosting Machine with Perpetual Learning.
+
+        A self-generalizing gradient boosting machine that doesn't need hyperparameter
+        optimization. It automatically finds the best configuration based on the provided budget.
+
+        Parameters
+        ----------
+        objective : str or tuple, default="ListNetLoss"
+            Learning objective function to be used for optimization. Valid options are:
+
+            - "ListNetLoss": ListNet loss for ranking.
+            - custom objective: a tuple of (grad, hess, init) functions.
+
+        budget : float, default=0.5
+            A positive number for fitting budget. Increasing this number will more likely result
+            in more boosting rounds and increased predictive power.
+        num_threads : int, optional
+            Number of threads to be used during training and prediction.
+        monotone_constraints : dict, optional
+            Constraints to enforce a specific relationship between features and target.
+            Keys are feature indices or names, values are -1, 1, or 0.
+        force_children_to_bound_parent : bool, default=False
+            Whether to restrict children nodes to be within the parent's range.
+        missing : float, default=np.nan
+            Value to consider as missing data.
+        allow_missing_splits : bool, default=True
+            Whether to allow splits that separate missing from non-missing values.
+        create_missing_branch : bool, default=False
+            Whether to create a separate branch for missing values (ternary trees).
+        terminate_missing_features : iterable, optional
+            Features for which missing branches will always be terminated if
+            ``create_missing_branch`` is True.
+        missing_node_treatment : str, default="None"
+            How to handle weights for missing nodes if ``create_missing_branch`` is True.
+            Options: "None", "AssignToParent", "AverageLeafWeight", "AverageNodeWeight".
+        log_iterations : int, default=0
+            Logging frequency (every N iterations). 0 disables logging.
+        feature_importance_method : str, default="Gain"
+            Method for calculating feature importance. Options: "Gain", "Weight", "Cover",
+            "TotalGain", "TotalCover".
+        quantile : float, optional
+            Target quantile for quantile regression (objective="QuantileLoss").
+        reset : bool, optional
+            Whether to reset the model or continue training on subsequent calls to fit.
+        categorical_features : str or iterable, default="auto"
+            Feature indices or names to treat as categorical.
+        timeout : float, optional
+            Time limit for fitting in seconds.
+        iteration_limit : int, optional
+            Maximum number of boosting iterations.
+        memory_limit : float, optional
+            Memory limit for training in GB.
+        stopping_rounds : int, optional
+            Early stopping rounds.
+        max_bin : int, default=256
+            Maximum number of bins for feature discretization.
+        max_cat : int, default=1000
+            Maximum unique categories before a feature is treated as numerical.
+        **kwargs
+            Arbitrary keyword arguments to be passed to the base class.
+        """
+        if isinstance(objective, str) and objective not in {"ListNetLoss"}:
+            warnings.warn(
+                f"Objective '{objective}' may not be suitable for PerpetualRanker. Consider 'ListNetLoss'."
+            )
+
+        super().__init__(
+            objective=objective,
+            budget=budget,
+            num_threads=num_threads,
+            monotone_constraints=monotone_constraints,
+            # ... pass all other parameters ...
+            max_bin=max_bin,
+            max_cat=max_cat,
+            **kwargs,
+        )
+
+    def fit(self, X, y, group=None, sample_weight=None, **fit_params) -> Self:
+        """
+        Fit the ranker. Requires the 'group' parameter.
+
+        Args:
+            X: Training data.
+            y: Target relevance scores.
+            group: Group lengths to use for a ranking objective. (Required for ListNetLoss).
+            sample_weight: Instance weights.
+        """
+        if (
+            group is None
+            and isinstance(self.objective, str)
+            and self.objective == "ListNetLoss"
+        ):
+            raise ValueError(
+                "The 'group' parameter must be provided when using the 'ListNetLoss' objective for ranking."
+            )
+
+        return super().fit(X, y, sample_weight=sample_weight, group=group, **fit_params)
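All three estimators follow the usual scikit-learn fit/predict/score pattern. A minimal usage sketch (assumes `perpetual` and `scikit-learn` are installed; the synthetic datasets and the group sizes are illustrative):

import numpy as np
from sklearn.datasets import make_classification, make_regression

from perpetual.sklearn import PerpetualClassifier, PerpetualRanker, PerpetualRegressor

# Classification: default "LogLoss" objective; score() is mean accuracy.
Xc, yc = make_classification(n_samples=200, n_features=10, random_state=0)
clf = PerpetualClassifier(budget=0.5).fit(Xc, yc)
print(clf.score(Xc, yc))

# Regression: default "SquaredLoss" objective; score() is R^2.
Xr, yr = make_regression(n_samples=200, n_features=10, random_state=0)
reg = PerpetualRegressor(budget=1.0).fit(Xr, yr)
print(reg.score(Xr, yr))

# Ranking: "ListNetLoss" needs group lengths; fit() raises ValueError without them.
group = np.array([100, 100])  # two query groups covering the 200 rows
rnk = PerpetualRanker(budget=0.5).fit(Xr, yr, group=group)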
perpetual/types.py
ADDED
@@ -0,0 +1,151 @@
+from typing import Any, Dict, Iterable, Protocol, Set
+
+import numpy as np
+from typing_extensions import Self
+
+
+class BoosterType(Protocol):
+    monotone_constraints: Dict[int, int]
+    terminate_missing_features: Set[int]
+    number_of_trees: int
+    base_score: float
+
+    def fit(
+        self,
+        flat_data: np.ndarray,
+        rows: int,
+        cols: int,
+        y: np.ndarray,
+        budget: float,
+        sample_weight: np.ndarray,
+        parallel: bool = False,
+    ):
+        """Fit method"""
+
+    def predict(
+        self,
+        flat_data: np.ndarray,
+        rows: int,
+        cols: int,
+        parallel: bool = True,
+    ) -> np.ndarray:
+        """predict method"""
+
+    def predict_proba(
+        self,
+        flat_data: np.ndarray,
+        rows: int,
+        cols: int,
+        parallel: bool = True,
+    ) -> np.ndarray:
+        """predict probabilities method"""
+
+    def predict_contributions(
+        self,
+        flat_data: np.ndarray,
+        rows: int,
+        cols: int,
+        method: str,
+        parallel: bool = True,
+    ) -> np.ndarray:
+        """method"""
+
+    def value_partial_dependence(
+        self,
+        feature: int,
+        value: float,
+    ) -> float:
+        """pass"""
+
+    def calculate_feature_importance(
+        self,
+        method: str,
+        normalize: bool,
+    ) -> Dict[int, float]:
+        """pass"""
+
+    def text_dump(self) -> Iterable[str]:
+        """pass"""
+
+    @classmethod
+    def load_booster(cls, path: str) -> Self:
+        """pass"""
+
+    def save_booster(self, path: str):
+        """pass"""
+
+    @classmethod
+    def from_json(cls, json_str: str) -> Self:
+        """pass"""
+
+    def json_dump(self) -> str:
+        """pass"""
+
+    def get_params(self) -> Dict[str, Any]:
+        """pass"""
+
+    def insert_metadata(self, key: str, value: str) -> None:
+        """pass"""
+
+    def get_metadata(self, key: str) -> str:
+        """pass"""
+
+
+class MultiOutputBoosterType(Protocol):
+    monotone_constraints: Dict[int, int]
+    terminate_missing_features: Set[int]
+    number_of_trees: Iterable[int]
+    base_score: Iterable[float]
+
+    def fit(
+        self,
+        flat_data: np.ndarray,
+        rows: int,
+        cols: int,
+        y: np.ndarray,
+        budget: float,
+        sample_weight: np.ndarray,
+        parallel: bool = False,
+    ):
+        """Fit method"""
+
+    def predict(
+        self,
+        flat_data: np.ndarray,
+        rows: int,
+        cols: int,
+        parallel: bool = True,
+    ) -> np.ndarray:
+        """predict method"""
+
+    def predict_proba(
+        self,
+        flat_data: np.ndarray,
+        rows: int,
+        cols: int,
+        parallel: bool = True,
+    ) -> np.ndarray:
+        """predict probabilities method"""
+
+    @classmethod
+    def load_booster(cls, path: str) -> Self:
+        """pass"""
+
+    def save_booster(self, path: str):
+        """pass"""
+
+    @classmethod
+    def from_json(cls, json_str: str) -> Self:
+        """pass"""
+
+    def json_dump(self) -> str:
+        """pass"""
+
+    def get_params(self) -> Dict[str, Any]:
+        """pass"""
+
+    def insert_metadata(self, key: str, value: str) -> None:
+        """pass"""
+
+    def get_metadata(self, key: str) -> str:
+        """pass"""
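Both of these are structural typing Protocols: any object exposing the listed attributes and methods satisfies them for a static type checker, with no inheritance required (and since neither is decorated with `@runtime_checkable`, `isinstance` checks will not work). A sketch of how `BoosterType` might be consumed, with a hypothetical `summarize` helper; the compiled `perpetual.cpython-312-darwin.so` extension is presumably the implementation these Protocols describe:

from perpetual.types import BoosterType


def summarize(booster: BoosterType) -> str:
    # mypy or pyright verify structurally that whatever is passed in
    # provides these members; no runtime check is performed here.
    gains = booster.calculate_feature_importance(method="Gain", normalize=True)
    return f"{booster.number_of_trees} trees, base_score={booster.base_score}, gains={gains}"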