perpetual 1.0.40__cp311-cp311-macosx_10_12_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- perpetual/__init__.py +5 -0
- perpetual/booster.py +1915 -0
- perpetual/data.py +27 -0
- perpetual/perpetual.cpython-311-darwin.so +0 -0
- perpetual/serialize.py +74 -0
- perpetual/sklearn.py +194 -0
- perpetual/types.py +151 -0
- perpetual/utils.py +462 -0
- perpetual-1.0.40.dist-info/METADATA +169 -0
- perpetual-1.0.40.dist-info/RECORD +12 -0
- perpetual-1.0.40.dist-info/WHEEL +4 -0
- perpetual-1.0.40.dist-info/licenses/LICENSE +674 -0
perpetual/data.py
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
from dataclasses import dataclass
|
|
2
|
+
from typing import Any, Iterable, Optional, Union
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
@dataclass
class Node:
    """Dataclass representation of a node, this represents all of the fields present in a tree node."""

    # Node identifier — presumably its index within the tree; confirm against booster.py.
    num: int
    # Node weight (the prediction contribution when this node is a leaf — presumed from name).
    weight_value: float
    # Sum of hessians over the training rows routed through this node — presumed; confirm.
    hessian_sum: float
    depth: int = 0
    # Split threshold for numeric splits.
    split_value: float = 0.0
    # Feature used for the split; may be referenced by column name (str) or index (int).
    split_feature: Union[str, int] = ""
    split_gain: float = 0.0
    # Child node ids; missing_node is where rows with a missing feature value go.
    missing_node: int = 0
    left_child: int = 0
    right_child: int = 0
    is_leaf: bool = False
    node_type: str = "split"
    parent_node: int = 0
    # NOTE(review): semantics of the fields below are not visible in this file —
    # they are populated by the native booster; confirm against booster.py.
    generalization: Optional[float] = None
    # Category sets routed left/right for categorical splits — presumed from names.
    left_cats: Optional[Iterable] = None
    right_cats: Optional[Iterable] = None
    count: int = 0
    weights: Optional[Iterable[float]] = None
    stats: Optional[Any] = None
|
|
Binary file
|
perpetual/serialize.py
ADDED
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
from abc import ABC, abstractmethod
|
|
5
|
+
from ast import literal_eval
|
|
6
|
+
from dataclasses import dataclass
|
|
7
|
+
from typing import Dict, Generic, List, Tuple, TypeVar, Union
|
|
8
|
+
|
|
9
|
+
import numpy as np
|
|
10
|
+
import numpy.typing as npt
|
|
11
|
+
|
|
12
|
+
T = TypeVar("T")


class BaseSerializer(ABC, Generic[T]):
    """Abstract interface for converting objects of type ``T`` to and from strings."""

    @abstractmethod
    def serialize(self, obj: T) -> str:
        """serialize method - should take an object and return a string"""

    @abstractmethod
    def deserialize(self, obj_repr: str) -> T:
        """deserialize method - should take a string and return original object"""


# Scalar values that ScalerSerializer can round-trip.
Scaler = Union[int, float, str]


class ScalerSerializer(BaseSerializer[Scaler]):
    """Serializer for simple scalar values (int, float, str).

    Strings are emitted as Python string literals so that ``deserialize``
    can both recover them and distinguish them from numbers via
    ``ast.literal_eval``.
    """

    def serialize(self, obj: Scaler) -> str:
        if isinstance(obj, str):
            # repr() yields a properly quoted *and escaped* Python literal.
            # The previous f"'{obj}'" form broke round-tripping for any
            # string containing a quote (literal_eval raised on deserialize).
            obj_ = repr(obj)
        else:
            obj_ = str(obj)
        return obj_

    def deserialize(self, obj_repr: str) -> Scaler:
        # literal_eval safely parses the literal back to int/float/str.
        return literal_eval(node_or_string=obj_repr)
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
# A JSON-representable value: a scalar, a list of scalars, or a mapping of
# string keys to scalars.
ObjectItem = Union[
    List[Scaler],
    Dict[str, Scaler],
    Scaler,
]


class ObjectSerializer(BaseSerializer[ObjectItem]):
    """Serializer for JSON-compatible objects.

    NOTE(review): relies on a JSON round trip, so tuples would come back as
    lists and non-string dict keys would be stringified — acceptable for the
    ObjectItem union declared above.
    """

    def serialize(self, obj: ObjectItem) -> str:
        return json.dumps(obj)

    def deserialize(self, obj_repr: str) -> ObjectItem:
        return json.loads(obj_repr)
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
@dataclass
class NumpyData:
    """Shape of the JSON payload produced/consumed by NumpySerializer."""

    # Nested-list representation of the array values (from ``ndarray.tolist()``).
    array: Union[List[float], List[int]]
    # Numpy dtype name, e.g. "float64".
    dtype: str
    # Original array shape. NOTE(review): after a JSON round trip this is
    # actually a list at runtime, not a tuple — np.reshape accepts either.
    shape: Tuple[int, ...]
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
class NumpySerializer(BaseSerializer[npt.NDArray]):
    """Round-trips numpy arrays through a JSON payload of values, dtype and shape."""

    def serialize(self, obj: npt.NDArray) -> str:
        payload = {"array": obj.tolist(), "dtype": str(obj.dtype), "shape": obj.shape}
        return json.dumps(payload)

    def deserialize(self, obj_repr: str) -> npt.NDArray:
        # Unpacking through NumpyData validates that exactly the expected
        # keys are present in the payload.
        data = NumpyData(**json.loads(obj_repr))
        restored = np.array(data.array, dtype=data.dtype)  # type: ignore
        if len(data.shape) != 1:
            # Re-impose the stored shape for 0-d and multi-dimensional arrays;
            # 1-d arrays already come back with the right shape.
            restored = restored.reshape(data.shape)
        return restored
|
perpetual/sklearn.py
ADDED
|
@@ -0,0 +1,194 @@
|
|
|
1
|
+
import warnings
|
|
2
|
+
from types import FunctionType
|
|
3
|
+
from typing import Any, Dict, Optional, Tuple, Union
|
|
4
|
+
|
|
5
|
+
from sklearn.base import ClassifierMixin, RegressorMixin
|
|
6
|
+
from sklearn.metrics import accuracy_score, r2_score
|
|
7
|
+
from typing_extensions import Self
|
|
8
|
+
|
|
9
|
+
from perpetual.booster import PerpetualBooster
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class PerpetualClassifier(PerpetualBooster, ClassifierMixin):
    """
    A scikit-learn compatible classifier based on PerpetualBooster.

    Uses 'LogLoss' as the default objective.
    """

    # Expose the objective explicitly in the __init__ signature to allow
    # scikit-learn to correctly discover and set it via set_params.
    def __init__(
        self,
        *,
        objective: Union[
            str, Tuple[FunctionType, FunctionType, FunctionType]
        ] = "LogLoss",
        budget: float = 0.5,
        num_threads: Optional[int] = None,
        monotone_constraints: Union[Dict[Any, int], None] = None,
        # ... other parameters ...
        max_bin: int = 256,
        max_cat: int = 1000,
        # Capture all parameters in a way that BaseEstimator can handle
        **kwargs,
    ):
        """Store hyperparameters; no validation happens here.

        Per scikit-learn convention, ``__init__`` only records parameters so
        that ``get_params``/``set_params`` round-trip cleanly. Objective
        suitability is checked (with a warning) in :meth:`fit`. A previous
        no-op check (``if ...: pass``) against a hard-coded objective set was
        removed — it had no effect, and custom objectives are allowed via the
        tuple form anyway.
        """
        super().__init__(
            objective=objective,
            budget=budget,
            num_threads=num_threads,
            monotone_constraints=monotone_constraints,
            # ... pass all other parameters ...
            max_bin=max_bin,
            max_cat=max_cat,
            **kwargs,  # Catch-all for any other parameters passed by user or set_params
        )

    # fit, predict, predict_proba, and predict_log_proba are inherited
    # and properly adapted in PerpetualBooster.

    def score(self, X, y, sample_weight=None):
        """Returns the mean accuracy on the given test data and labels."""
        preds = self.predict(X)
        return accuracy_score(y, preds, sample_weight=sample_weight)

    def fit(self, X, y, sample_weight=None, **fit_params) -> Self:
        """A wrapper for the base fit method.

        Warns when a string objective other than 'LogLoss' is used, since the
        other built-in objectives target regression/ranking; custom (tuple)
        objectives pass through silently.
        """
        if isinstance(self.objective, str) and self.objective not in ["LogLoss"]:
            warnings.warn(
                f"Objective '{self.objective}' is typically for regression/ranking but used in PerpetualClassifier. Consider 'LogLoss'."
            )

        # In classification, the labels (classes_) are set in the base fit.
        return super().fit(X, y, sample_weight=sample_weight, **fit_params)
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
class PerpetualRegressor(PerpetualBooster, RegressorMixin):
    """
    A scikit-learn compatible regressor based on PerpetualBooster.

    Uses 'SquaredLoss' as the default objective.
    """

    def __init__(
        self,
        *,
        objective: Union[
            str, Tuple[FunctionType, FunctionType, FunctionType]
        ] = "SquaredLoss",
        budget: float = 0.5,
        num_threads: Optional[int] = None,
        monotone_constraints: Union[Dict[Any, int], None] = None,
        # ... other parameters ...
        max_bin: int = 256,
        max_cat: int = 1000,
        **kwargs,
    ):
        """Store hyperparameters; no validation happens here.

        Per scikit-learn convention, ``__init__`` only records parameters so
        that ``get_params``/``set_params`` round-trip cleanly. Objective
        suitability is checked (with a warning) in :meth:`fit`. A previous
        no-op check (``if ...: pass``) against a hard-coded objective set was
        removed — it had no effect, and custom objectives are allowed via the
        tuple form anyway.
        """
        super().__init__(
            objective=objective,
            budget=budget,
            num_threads=num_threads,
            monotone_constraints=monotone_constraints,
            # ... pass all other parameters ...
            max_bin=max_bin,
            max_cat=max_cat,
            **kwargs,
        )

    def fit(self, X, y, sample_weight=None, **fit_params) -> Self:
        """A wrapper for the base fit method.

        Warns when a string objective outside the known regression losses is
        used; custom (tuple) objectives pass through silently.
        """
        if isinstance(self.objective, str) and self.objective not in [
            "SquaredLoss",
            "QuantileLoss",
            "HuberLoss",
            "AdaptiveHuberLoss",
        ]:
            warnings.warn(
                f"Objective '{self.objective}' may not be suitable for PerpetualRegressor. Consider 'SquaredLoss' or a quantile/huber loss."
            )

        return super().fit(X, y, sample_weight=sample_weight, **fit_params)

    def score(self, X, y, sample_weight=None):
        """Returns the coefficient of determination ($R^2$) of the prediction."""
        preds = self.predict(X)
        return r2_score(y, preds, sample_weight=sample_weight)
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
class PerpetualRanker(
    PerpetualBooster, RegressorMixin
):  # Ranking models sometimes inherit from RegressorMixin for compatibility
    """
    A scikit-learn compatible ranker based on PerpetualBooster.

    Uses 'ListNetLoss' as the default objective.
    Requires the 'group' parameter to be passed to fit.
    """

    def __init__(
        self,
        *,
        objective: Union[
            str, Tuple[FunctionType, FunctionType, FunctionType]
        ] = "ListNetLoss",
        budget: float = 0.5,
        num_threads: Optional[int] = None,
        monotone_constraints: Union[Dict[Any, int], None] = None,
        # ... other parameters ...
        max_bin: int = 256,
        max_cat: int = 1000,
        **kwargs,
    ):
        # Only string objectives can be checked by name; tuple-form custom
        # objectives are accepted as-is.
        if isinstance(objective, str) and objective != "ListNetLoss":
            warnings.warn(
                f"Objective '{objective}' may not be suitable for PerpetualRanker. Consider 'ListNetLoss'."
            )

        super().__init__(
            objective=objective,
            budget=budget,
            num_threads=num_threads,
            monotone_constraints=monotone_constraints,
            # ... pass all other parameters ...
            max_bin=max_bin,
            max_cat=max_cat,
            **kwargs,
        )

    def fit(self, X, y, group=None, sample_weight=None, **fit_params) -> Self:
        """
        Fit the ranker. Requires the 'group' parameter.

        Args:
            X: Training data.
            y: Target relevance scores.
            group: Group lengths to use for a ranking objective. (Required for ListNetLoss).
            sample_weight: Instance weights.
        """
        uses_listnet = (
            isinstance(self.objective, str) and self.objective == "ListNetLoss"
        )
        if uses_listnet and group is None:
            raise ValueError(
                "The 'group' parameter must be provided when using the 'ListNetLoss' objective for ranking."
            )

        return super().fit(X, y, sample_weight=sample_weight, group=group, **fit_params)
|
perpetual/types.py
ADDED
|
@@ -0,0 +1,151 @@
|
|
|
1
|
+
from typing import Any, Dict, Iterable, Protocol, Set
|
|
2
|
+
|
|
3
|
+
import numpy as np
|
|
4
|
+
from typing_extensions import Self
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class BoosterType(Protocol):
    """Structural (duck-typed) interface of the native single-output booster.

    NOTE(review): the concrete implementation lives in the compiled extension
    (perpetual.*.so) and is not visible here; docstrings below describe the
    intended contract implied by the signatures — confirm details against
    booster.py. ``flat_data`` is presumed to be a flattened (rows x cols)
    feature matrix; the memory layout (row- vs column-major) is not shown here.
    """

    # Maps feature index -> monotonicity constraint (presumably -1/0/1 — confirm).
    monotone_constraints: Dict[int, int]
    # Feature indices whose missing-value branches are terminal — presumed; confirm.
    terminate_missing_features: Set[int]
    number_of_trees: int
    base_score: float

    def fit(
        self,
        flat_data: np.ndarray,
        rows: int,
        cols: int,
        y: np.ndarray,
        budget: float,
        sample_weight: np.ndarray,
        parallel: bool = False,
    ):
        """Train the booster on a flattened feature matrix with targets ``y``."""

    def predict(
        self,
        flat_data: np.ndarray,
        rows: int,
        cols: int,
        parallel: bool = True,
    ) -> np.ndarray:
        """Return predictions for the flattened feature matrix."""

    def predict_proba(
        self,
        flat_data: np.ndarray,
        rows: int,
        cols: int,
        parallel: bool = True,
    ) -> np.ndarray:
        """Return predicted probabilities for the flattened feature matrix."""

    def predict_contributions(
        self,
        flat_data: np.ndarray,
        rows: int,
        cols: int,
        method: str,
        parallel: bool = True,
    ) -> np.ndarray:
        """Return per-feature prediction contributions computed via ``method``."""

    def value_partial_dependence(
        self,
        feature: int,
        value: float,
    ) -> float:
        """Return the partial-dependence value for ``feature`` at ``value``."""

    def calculate_feature_importance(
        self,
        method: str,
        normalize: bool,
    ) -> Dict[int, float]:
        """Return feature importances keyed by feature index."""

    def text_dump(self) -> Iterable[str]:
        """Return a textual dump of the model (presumably one string per tree)."""

    @classmethod
    def load_booster(cls, path: str) -> Self:
        """Load a booster previously persisted with ``save_booster``."""

    def save_booster(self, path: str):
        """Persist the booster to ``path``."""

    @classmethod
    def from_json(cls, json_str: str) -> Self:
        """Reconstruct a booster from a JSON string produced by ``json_dump``."""

    def json_dump(self) -> str:
        """Serialize the booster to a JSON string."""

    def get_params(self) -> Dict[str, Any]:
        """Return the booster's parameters as a dict."""

    def insert_metadata(self, key: str, value: str) -> None:
        """Attach a string key/value metadata pair to the booster."""

    def get_metadata(self, key: str) -> str:
        """Return the metadata value stored under ``key``."""
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
class MultiOutputBoosterType(Protocol):
    """Structural interface of the native multi-output booster.

    Mirrors ``BoosterType`` but with per-output attributes: ``number_of_trees``
    and ``base_score`` are iterables, presumably one entry per output/class —
    confirm against booster.py. The concrete implementation lives in the
    compiled extension and is not visible here.
    """

    # Maps feature index -> monotonicity constraint (presumably -1/0/1 — confirm).
    monotone_constraints: Dict[int, int]
    # Feature indices whose missing-value branches are terminal — presumed; confirm.
    terminate_missing_features: Set[int]
    number_of_trees: Iterable[int]
    base_score: Iterable[float]

    def fit(
        self,
        flat_data: np.ndarray,
        rows: int,
        cols: int,
        y: np.ndarray,
        budget: float,
        sample_weight: np.ndarray,
        parallel: bool = False,
    ):
        """Train the booster on a flattened feature matrix with targets ``y``."""

    def predict(
        self,
        flat_data: np.ndarray,
        rows: int,
        cols: int,
        parallel: bool = True,
    ) -> np.ndarray:
        """Return predictions for the flattened feature matrix."""

    def predict_proba(
        self,
        flat_data: np.ndarray,
        rows: int,
        cols: int,
        parallel: bool = True,
    ) -> np.ndarray:
        """Return predicted probabilities for the flattened feature matrix."""

    @classmethod
    def load_booster(cls, path: str) -> Self:
        """Load a booster previously persisted with ``save_booster``."""

    def save_booster(self, path: str):
        """Persist the booster to ``path``."""

    @classmethod
    def from_json(cls, json_str: str) -> Self:
        """Reconstruct a booster from a JSON string produced by ``json_dump``."""

    def json_dump(self) -> str:
        """Serialize the booster to a JSON string."""

    def get_params(self) -> Dict[str, Any]:
        """Return the booster's parameters as a dict."""

    def insert_metadata(self, key: str, value: str) -> None:
        """Attach a string key/value metadata pair to the booster."""

    def get_metadata(self, key: str) -> str:
        """Return the metadata value stored under ``key``."""
|