perpetual 0.9.1__cp39-cp39-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of perpetual might be problematic. Click here for more details.
- perpetual/__init__.py +6 -0
- perpetual/booster.py +1064 -0
- perpetual/data.py +25 -0
- perpetual/perpetual.cp39-win_amd64.pyd +0 -0
- perpetual/serialize.py +74 -0
- perpetual/types.py +150 -0
- perpetual/utils.py +217 -0
- perpetual-0.9.1.dist-info/METADATA +165 -0
- perpetual-0.9.1.dist-info/RECORD +11 -0
- perpetual-0.9.1.dist-info/WHEEL +4 -0
- perpetual-0.9.1.dist-info/licenses/LICENSE +674 -0
perpetual/data.py
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
from dataclasses import dataclass
|
|
2
|
+
from typing import Iterable, Optional, Union
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
@dataclass
class Node:
    """Dataclass representation of a node, this represents all of the fields present in a tree node."""

    # Identifier of this node within its tree.
    num: int
    # Value contributed to the prediction when a record lands on this node.
    weight_value: float
    # Sum of hessian values aggregated at this node (per field name).
    hessian_sum: float
    # Depth of the node within the tree.
    depth: int
    # Threshold used to route records to the left/right children.
    split_value: float
    # Feature the split is made on; a column name or a column index.
    split_feature: Union[str, int]
    # Gain achieved by this split.
    split_gain: float
    # Node that records with a missing feature value are routed to.
    missing_node: int
    # Identifier (``num``) of the left child node.
    left_child: int
    # Identifier (``num``) of the right child node.
    right_child: int
    # True when the node is terminal (has no children).
    is_leaf: bool
    # Node type label as emitted by the backing booster (semantics defined there).
    node_type: str
    # Identifier (``num``) of the parent node.
    parent_node: int
    # Generalization metric for the node, when available — presumably produced
    # by the booster's generalization algorithm; confirm against native docs.
    generalization: Optional[float]
    # For categorical splits: categories routed to the left child, if any.
    left_cats: Optional[Iterable]
    # For categorical splits: categories routed to the right child, if any.
    right_cats: Optional[Iterable]
    # Number of records that reached this node.
    count: int
|
|
Binary file
|
perpetual/serialize.py
ADDED
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
from abc import ABC, abstractmethod
|
|
5
|
+
from ast import literal_eval
|
|
6
|
+
from dataclasses import dataclass
|
|
7
|
+
from typing import Dict, Generic, List, Tuple, TypeVar, Union
|
|
8
|
+
|
|
9
|
+
import numpy as np
|
|
10
|
+
import numpy.typing as npt
|
|
11
|
+
|
|
12
|
+
T = TypeVar("T")
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class BaseSerializer(ABC, Generic[T]):
    """Abstract interface for converting objects of type ``T`` to and from strings.

    Concrete subclasses must provide both directions of the round trip:
    ``deserialize(serialize(obj))`` should reproduce ``obj``.
    """

    @abstractmethod
    def serialize(self, obj: T) -> str:
        """Encode ``obj`` into its string representation."""

    @abstractmethod
    def deserialize(self, obj_repr: str) -> T:
        """Reconstruct the original object from the string ``obj_repr``."""
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
# Scalar value types handled by ScalerSerializer.  ("Scaler" is this module's
# historical spelling of "Scalar"; kept because it is part of the public API.)
Scaler = Union[int, float, str]
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
class ScalerSerializer(BaseSerializer[Scaler]):
    """Serialize scalar values (int, float, str) as Python literals.

    The output of :meth:`serialize` is parsed back by ``ast.literal_eval``
    in :meth:`deserialize`.
    """

    def serialize(self, obj: Scaler) -> str:
        """Return a Python-literal representation of ``obj``.

        Strings are emitted via ``repr`` so embedded quotes and backslashes
        are escaped correctly.  The previous naive ``f"'{obj}'"`` form
        produced output that ``literal_eval`` could not parse for any input
        containing a single quote (e.g. ``"it's"``); for simple strings the
        ``repr`` form is byte-identical to the old output.
        """
        if isinstance(obj, str):
            obj_ = repr(obj)
        else:
            obj_ = str(obj)
        return obj_

    def deserialize(self, obj_repr: str) -> Scaler:
        """Parse the literal produced by :meth:`serialize` back into a scalar.

        ``literal_eval`` only evaluates Python literals, so this is safe on
        untrusted input (no arbitrary code execution).
        """
        return literal_eval(node_or_string=obj_repr)
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
# JSON-serializable payloads accepted by ObjectSerializer: a single scalar,
# a list of scalars, or a string-keyed dict of scalars.
ObjectItem = Union[
    List[Scaler],
    Dict[str, Scaler],
    Scaler,
]
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
class ObjectSerializer(BaseSerializer[ObjectItem]):
    """JSON-backed serializer for plain scalars, lists, and string-keyed dicts."""

    def serialize(self, obj: ObjectItem) -> str:
        """Dump ``obj`` to its JSON string form."""
        encoded = json.dumps(obj)
        return encoded

    def deserialize(self, obj_repr: str) -> ObjectItem:
        """Load the object back from the JSON string ``obj_repr``."""
        decoded = json.loads(obj_repr)
        return decoded
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
@dataclass
class NumpyData:
    """Intermediate payload used to round-trip a numpy array through JSON."""

    # Array values as (possibly nested) Python lists, as produced by tolist().
    array: Union[List[float], List[int]]
    # numpy dtype name, e.g. "float64".
    dtype: str
    # Original array shape, used to restore dimensionality on deserialize.
    shape: Tuple[int, ...]
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
class NumpySerializer(BaseSerializer[npt.NDArray]):
    """Round-trip numpy arrays through JSON, preserving values, dtype, and shape."""

    def serialize(self, obj: npt.NDArray) -> str:
        """Encode the array's values, dtype name, and shape as one JSON object."""
        payload = {
            "array": obj.tolist(),
            "dtype": str(obj.dtype),
            "shape": obj.shape,
        }
        return json.dumps(payload)

    def deserialize(self, obj_repr: str) -> npt.NDArray:
        """Rebuild the array from the JSON payload produced by :meth:`serialize`."""
        data = NumpyData(**json.loads(obj_repr))
        restored = np.array(data.array, dtype=data.dtype)  # type: ignore
        # A 1-D array already carries its final shape; higher-dimensional
        # arrays are restored from the recorded shape.
        if len(data.shape) != 1:
            restored = restored.reshape(data.shape)
        return restored
|
perpetual/types.py
ADDED
|
@@ -0,0 +1,150 @@
|
|
|
1
|
+
import numpy as np
|
|
2
|
+
from typing_extensions import Self
|
|
3
|
+
from typing import Any, Dict, Iterable, Protocol, Set
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class BoosterType(Protocol):
    """Structural (duck-typed) interface of the native single-output booster.

    The concrete implementation lives in the compiled ``perpetual`` extension
    module; this Protocol only documents the surface used by the Python layer.
    """

    # Feature index -> monotonicity constraint flag.
    monotone_constraints: Dict[int, int]
    # Feature indices with special missing-value termination behavior
    # (semantics defined by the native booster — confirm against its docs).
    terminate_missing_features: Set[int]
    # Number of trees in the fitted model.
    number_of_trees: int
    # Initial prediction applied before any tree.
    base_score: float

    def fit(
        self,
        flat_data: np.ndarray,
        rows: int,
        cols: int,
        y: np.ndarray,
        budget: float,
        sample_weight: np.ndarray,
        parallel: bool = False,
    ):
        """Fit the booster on a feature matrix flattened to 1-D.

        ``rows`` x ``cols`` gives the 2-D shape of ``flat_data`` (flattened
        column-major by the helpers in ``utils`` — confirm against callers).
        """

    def predict(
        self,
        flat_data: np.ndarray,
        rows: int,
        cols: int,
        parallel: bool = True,
    ) -> np.ndarray:
        """Return predictions for the flattened ``rows`` x ``cols`` input."""

    def predict_proba(
        self,
        flat_data: np.ndarray,
        rows: int,
        cols: int,
        parallel: bool = True,
    ) -> np.ndarray:
        """Return predicted probabilities for the flattened input."""

    def predict_contributions(
        self,
        flat_data: np.ndarray,
        rows: int,
        cols: int,
        method: str,
        parallel: bool = True,
    ) -> np.ndarray:
        """Return per-feature prediction contributions.

        ``method`` selects the contribution algorithm — presumably one of the
        canonical names in ``utils.CONTRIBUTION_METHODS``; verify against the
        native API.
        """

    def value_partial_dependence(
        self,
        feature: int,
        value: float,
    ) -> float:
        """Return the partial-dependence value of ``feature`` at ``value``."""

    def calculate_feature_importance(
        self,
        method: str,
        normalize: bool,
    ) -> Dict[int, float]:
        """Return feature importances keyed by feature index."""

    def text_dump(self) -> Iterable[str]:
        """Return a textual dump of the model."""

    @classmethod
    def load_booster(cls, path: str) -> Self:
        """Load a booster previously persisted with :meth:`save_booster`."""

    def save_booster(self, path: str):
        """Persist the booster to ``path``."""

    @classmethod
    def from_json(cls, json_str: str) -> Self:
        """Construct a booster from its JSON representation."""

    def json_dump(self) -> str:
        """Return the booster serialized as a JSON string."""

    def get_params(self) -> Dict[str, Any]:
        """Return the booster's parameters as a dict."""

    def insert_metadata(self, key: str, value: str) -> None:
        """Attach a ``key``/``value`` metadata string to the model."""

    def get_metadata(self, key: str) -> str:
        """Return the metadata value previously stored under ``key``."""
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
class MultiOutputBoosterType(Protocol):
    """Structural interface of the native multi-output booster.

    Mirrors :class:`BoosterType`, but ``number_of_trees`` and ``base_score``
    are iterables — presumably one entry per output/class; confirm against
    the native implementation.
    """

    # Feature index -> monotonicity constraint flag.
    monotone_constraints: Dict[int, int]
    # Feature indices with special missing-value termination behavior.
    terminate_missing_features: Set[int]
    # Tree counts, one per output.
    number_of_trees: Iterable[int]
    # Initial predictions, one per output.
    base_score: Iterable[float]

    def fit(
        self,
        flat_data: np.ndarray,
        rows: int,
        cols: int,
        y: np.ndarray,
        budget: float,
        sample_weight: np.ndarray,
        parallel: bool = False,
    ):
        """Fit the booster on a feature matrix flattened to 1-D.

        ``rows`` x ``cols`` gives the 2-D shape of ``flat_data``.
        """

    def predict(
        self,
        flat_data: np.ndarray,
        rows: int,
        cols: int,
        parallel: bool = True,
    ) -> np.ndarray:
        """Return predictions for the flattened ``rows`` x ``cols`` input."""

    def predict_proba(
        self,
        flat_data: np.ndarray,
        rows: int,
        cols: int,
        parallel: bool = True,
    ) -> np.ndarray:
        """Return predicted probabilities for the flattened input."""

    @classmethod
    def load_booster(cls, path: str) -> Self:
        """Load a booster previously persisted with :meth:`save_booster`."""

    def save_booster(self, path: str):
        """Persist the booster to ``path``."""

    @classmethod
    def from_json(cls, json_str: str) -> Self:
        """Construct a booster from its JSON representation."""

    def json_dump(self) -> str:
        """Return the booster serialized as a JSON string."""

    def get_params(self) -> Dict[str, Any]:
        """Return the booster's parameters as a dict."""

    def insert_metadata(self, key: str, value: str) -> None:
        """Attach a ``key``/``value`` metadata string to the model."""

    def get_metadata(self, key: str) -> str:
        """Return the metadata value previously stored under ``key``."""
|
perpetual/utils.py
ADDED
|
@@ -0,0 +1,217 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
import numpy as np
|
|
3
|
+
from typing import Dict, Iterable, List, Optional, Tuple
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
logger = logging.getLogger(__name__)
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def type_df(df):
    """Identify the dataframe flavor of ``df`` without importing pandas/polars.

    Inspects the class name and defining package only, so neither library
    needs to be installed.

    Returns:
        str: "pandas_df" or "polars_df" for the matching ``DataFrame`` class,
        "numpy" for a numpy-defined ``DataFrame`` (see note below), and ""
        for anything else.

    The original implementation implicitly returned ``None`` for
    non-DataFrame inputs while returning "" for DataFrames from unknown
    libraries; every non-match now yields "" so the result is always a
    string.  Callers only compare against the specific labels, so this is
    backward-compatible.
    """
    # Top-level package the object's class was defined in (e.g. "pandas").
    library_name = type(df).__module__.split(".")[0]
    if type(df).__name__ == "DataFrame":
        if library_name == "pandas":
            return "pandas_df"
        if library_name == "polars":
            return "polars_df"
        # NOTE(review): numpy ships no ``DataFrame`` class, so this branch
        # looks unreachable; kept for parity with the original behavior.
        if library_name == "numpy":
            return "numpy"
    return ""
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def type_series(y):
    """Identify the series flavor of ``y`` without importing pandas/polars.

    Inspects the class name and defining package only, so neither library
    needs to be installed.

    Returns:
        str: "pandas_series" or "polars_series" for the matching ``Series``
        class, "numpy" for a numpy-defined ``Series`` (see note below), and
        "" for anything else.

    The original implementation implicitly returned ``None`` for non-Series
    inputs while returning "" for Series from unknown libraries; every
    non-match now yields "" so the result is always a string.  Callers only
    compare against the specific labels, so this is backward-compatible.
    """
    # Top-level package the object's class was defined in (e.g. "polars").
    library_name = type(y).__module__.split(".")[0]
    if type(y).__name__ == "Series":
        if library_name == "pandas":
            return "pandas_series"
        if library_name == "polars":
            return "polars_series"
        # NOTE(review): numpy ships no ``Series`` class, so this branch
        # looks unreachable; kept for parity with the original behavior.
        if library_name == "numpy":
            return "numpy"
    return ""
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def convert_input_array(x, objective) -> Tuple[np.ndarray, Iterable]:
    """Convert a target/series-like input into the flat float64 array the booster expects.

    Args:
        x: Target values as a numpy array, pandas/polars Series, or a 2-D frame.
        objective: Objective name; "LogLoss" triggers label encoding on 1-D input.

    Returns:
        Tuple[np.ndarray, Iterable]: ``(x_, classes_)`` where ``x_`` is the
        flattened float64 data and ``classes_`` is the sorted class labels
        (an empty list unless LogLoss label encoding ran or the 2-D frame
        path was taken).

    Note: the original signature was annotated ``-> np.ndarray`` even though
    the function has always returned a 2-tuple; the annotation is corrected
    here without changing behavior.
    """
    classes_ = []

    if type(x).__module__.split(".")[0] == "numpy":
        if len(x.shape) == 2:
            # NOTE(review): for 2-D input the first element returned by
            # convert_input_frame is the list of column names, which is
            # surfaced here as ``classes_`` — verify this is intentional.
            classes_, x_, *_ = convert_input_frame(x, None, 1000)
        else:
            x_ = x
    elif type_series(x) == "pandas_series":
        x_ = x.to_numpy()
    elif type_series(x) == "polars_series":
        x_ = x.to_numpy(allow_copy=False)
    elif type_df(x) == "polars_df" or type_df(x) == "pandas_df":
        classes_, x_, *_ = convert_input_frame(x, None, 1000)
    else:
        x_ = x.to_numpy()

    if objective == "LogLoss" and len(x_.shape) == 1:
        # Encode labels as indices into the sorted unique class values.
        classes_ = np.unique(x_)
        x_index = np.array([np.where(classes_ == i) for i in x_])
        if len(classes_) > 2:
            # Multiclass: one-hot encode; binary targets are left as-is.
            x_ = np.squeeze(np.eye(len(classes_))[x_index])

    if not np.issubdtype(x_.dtype, "float64"):
        x_ = x_.astype(dtype="float64", copy=False)

    if len(x_.shape) == 2:
        # The booster consumes column-major (Fortran-order) flat data.
        x_ = x_.ravel(order="F")

    return x_, classes_
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def convert_input_frame(
    X,
    categorical_features,
    max_cat,
) -> Tuple[List[str], np.ndarray, int, int, Optional[Iterable[int]], Optional[Dict]]:
    """Convert data to format needed by booster.

    Accepts a pandas DataFrame, a polars DataFrame, or a numpy 2-D array.
    Categorical columns (auto-detected or given by ``categorical_features``
    as column names or indices) are encoded to 1-based float codes with
    missing values mapped to NaN; features with more than ``max_cat``
    categories fall back to numerical treatment.

    Returns:
        Tuple[List[str], np.ndarray, int, int, Optional[Iterable[int]], Optional[Dict]]: Return column names, the flat data, number of rows, the number of columns, cat_index, cat_mapping
    """
    categorical_features_ = None
    if type_df(X) == "pandas_df":
        X_ = X.to_numpy()
        features_ = X.columns.to_list()
        if categorical_features == "auto":
            # Auto-detect pandas "category" dtype columns; None when none found.
            categorical_columns = X.select_dtypes(include=["category"]).columns.tolist()
            categorical_features_ = [
                features_.index(c) for c in categorical_columns
            ] or None
    elif type_df(X) == "polars_df":
        import polars.selectors as cs

        # Prefer a zero-copy view; polars raises RuntimeError when the
        # buffer cannot be handed over, in which case we copy.
        try:
            X_ = X.to_numpy(allow_copy=False)
        except RuntimeError:
            X_ = X.to_numpy(allow_copy=True)

        features_ = X.columns
        if categorical_features == "auto":
            categorical_columns = X.select(cs.categorical()).columns
            categorical_features_ = [
                features_.index(c) for c in categorical_columns
            ] or None
    else:
        # Assume it's a numpy array.
        X_ = X
        features_ = list(map(str, range(X_.shape[1])))

    # An explicit list of int indices or str names overrides auto-detection.
    if (
        categorical_features
        and all(isinstance(s, int) for s in categorical_features)
        and isinstance(categorical_features, list)
    ):
        categorical_features_ = categorical_features
    elif (
        categorical_features
        and all(isinstance(s, str) for s in categorical_features)
        and isinstance(categorical_features, list)
    ):
        categorical_features_ = [features_.index(c) for c in categorical_features]

    cat_mapping = {}  # key: feature_name, value: ordered category names
    cat_to_num = []  # features demoted to numerical for exceeding max_cat
    if categorical_features_:
        for i in categorical_features_:
            # ``inversed`` indexes into np.unique's sorted category order.
            categories, inversed = np.unique(X_[:, i].astype(str), return_inverse=True)

            # Missing values stringify to "nan"; move that label to slot 0 so
            # code 0 is reserved for missing.
            categories = list(categories)
            if "nan" in categories:
                categories.remove("nan")
                categories.insert(0, "nan")

            # Shift to 1-based codes (slot 0 reserved for missing).
            inversed = inversed + 1.0

            if len(categories) > max_cat:
                cat_to_num.append(i)
                logger.warning(
                    f"Feature {features_[i]} will be treated as numerical since the number of categories ({len(categories)}) exceeds max_cat ({max_cat}) threshold."
                )

            feature_name = features_[i]
            cat_mapping[feature_name] = categories
            # NOTE(review): ``inversed`` still refers to np.unique's sorted
            # order, so code ``len(categories)`` marks the alphabetically
            # last unique value — this equals "nan" only when "nan" sorts
            # last among the observed labels; verify upstream intent.
            ind_nan = len(categories)
            inversed[inversed == ind_nan] = np.nan
            X_[:, i] = inversed

        # Drop the demoted (too-many-categories) features from the index set.
        categorical_features_ = [
            x for x in categorical_features_ if x not in cat_to_num
        ]

    logger.info(f"Categorical features: {categorical_features_}")
    logger.info(f"Mapping of categories: {cat_mapping}")

    if not np.issubdtype(X_.dtype, "float64"):
        X_ = X_.astype(dtype="float64", copy=False)
    # The booster consumes column-major (Fortran-order) flat data.
    flat_data = X_.ravel(order="F")
    rows, cols = X_.shape

    if isinstance(categorical_features_, list):
        categorical_features_ = set(categorical_features_)

    return features_, flat_data, rows, cols, categorical_features_, cat_mapping
|
|
159
|
+
|
|
160
|
+
|
|
161
|
+
def transform_input_frame(X, cat_mapping) -> Tuple[List[str], np.ndarray, int, int]:
    """Convert data to format needed by booster.

    Applies the categorical encoding learned by ``convert_input_frame``
    (captured in ``cat_mapping``) to a new frame or array.

    Args:
        X: A pandas/polars DataFrame or a numpy 2-D array.
        cat_mapping: Mapping of feature name -> ordered category names, as
            produced by ``convert_input_frame``.

    Returns:
        Tuple[List[str], np.ndarray, int, int]: Return column names, the flat data, number of rows, the number of columns
    """
    if type_df(X) == "pandas_df":
        X_ = X.to_numpy()
        features_ = X.columns.to_list()
    elif type_df(X) == "polars_df":
        # Prefer a zero-copy view; fall back to copying when polars cannot
        # hand over the buffer.
        try:
            X_ = X.to_numpy(allow_copy=False)
        except RuntimeError:
            X_ = X.to_numpy(allow_copy=True)
        features_ = X.columns
    else:
        # Assume it's a numpy array.
        X_ = X
        features_ = list(map(str, range(X_.shape[1])))

    if cat_mapping:
        for feature_name, categories in cat_mapping.items():
            feature_index = features_.index(feature_name)
            cats = categories.copy()
            # "nan" is only inserted into the mapping by convert_input_frame
            # when the training data contained missing values; the original
            # unconditional ``cats.remove("nan")`` raised ValueError for every
            # feature without missing values, so guard it.
            if "nan" in cats:
                cats.remove("nan")
            # 1-based codes so slot 0 stays reserved for missing values.
            x_enc = np.searchsorted(cats, X_[:, feature_index].astype(str))
            x_enc = x_enc + 1.0
            # NOTE(review): values sorting past every known category receive
            # code len(categories) and are remapped to NaN below — this only
            # lines up when "nan" occupies the mapping's first slot; verify.
            ind_nan = len(categories)
            x_enc[x_enc == ind_nan] = np.nan
            X_[:, feature_index] = x_enc

    if not np.issubdtype(X_.dtype, "float64"):
        X_ = X_.astype(dtype="float64", copy=False)
    # The booster consumes column-major (Fortran-order) flat data.
    flat_data = X_.ravel(order="F")
    rows, cols = X_.shape

    return features_, flat_data, rows, cols
|
|
198
|
+
|
|
199
|
+
|
|
200
|
+
# Canonical contribution-method names, keyed by every accepted spelling:
# the CamelCase form, the all-lowercase form, and the hyphenated-lowercase
# form (the latter two coincide for single-word names).
CONTRIBUTION_METHODS = {}
for _canonical_name in (
    "Weight",
    "Average",
    "BranchDifference",
    "MidpointDifference",
    "ModeDifference",
    "ProbabilityChange",
):
    # e.g. "BranchDifference" -> "branch-difference"
    _hyphenated = "".join(
        "-" + ch.lower() if ch.isupper() else ch for ch in _canonical_name
    ).lstrip("-")
    CONTRIBUTION_METHODS[_canonical_name] = _canonical_name
    CONTRIBUTION_METHODS[_canonical_name.lower()] = _canonical_name
    CONTRIBUTION_METHODS[_hyphenated] = _canonical_name
del _canonical_name, _hyphenated
|
|
@@ -0,0 +1,165 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: perpetual
|
|
3
|
+
Version: 0.9.1
|
|
4
|
+
Classifier: Programming Language :: Rust
|
|
5
|
+
Classifier: Programming Language :: Python :: 3
|
|
6
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
7
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
8
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
9
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
10
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
11
|
+
Requires-Dist: numpy
|
|
12
|
+
Requires-Dist: typing-extensions
|
|
13
|
+
Requires-Dist: black ; extra == 'dev'
|
|
14
|
+
Requires-Dist: pandas ; extra == 'dev'
|
|
15
|
+
Requires-Dist: polars ; extra == 'dev'
|
|
16
|
+
Requires-Dist: pyarrow ; extra == 'dev'
|
|
17
|
+
Requires-Dist: maturin ; extra == 'dev'
|
|
18
|
+
Requires-Dist: pytest ; extra == 'dev'
|
|
19
|
+
Requires-Dist: seaborn ; extra == 'dev'
|
|
20
|
+
Requires-Dist: scikit-learn ; extra == 'dev'
|
|
21
|
+
Requires-Dist: mkdocs-material ; extra == 'dev'
|
|
22
|
+
Requires-Dist: mkdocstrings[python] ; extra == 'dev'
|
|
23
|
+
Requires-Dist: mkdocs-autorefs ; extra == 'dev'
|
|
24
|
+
Requires-Dist: ruff ; extra == 'dev'
|
|
25
|
+
Provides-Extra: dev
|
|
26
|
+
License-File: LICENSE
|
|
27
|
+
Summary: A self-generalizing gradient boosting machine that doesn't need hyperparameter optimization
|
|
28
|
+
Keywords: rust,perpetual,machine learning,tree model,decision tree,gradient boosted decision tree,gradient boosting machine
|
|
29
|
+
Home-Page: https://perpetual-ml.com
|
|
30
|
+
Author: Mutlu Simsek
|
|
31
|
+
Author-email: Mutlu Simsek <msimsek@perpetual-ml.com>
|
|
32
|
+
Requires-Python: >=3.9
|
|
33
|
+
Description-Content-Type: text/markdown; charset=UTF-8; variant=GFM
|
|
34
|
+
Project-URL: Source Code, https://github.com/perpetual-ml/perpetual
|
|
35
|
+
|
|
36
|
+
<p align="center">
|
|
37
|
+
<img height="120" src="https://github.com/perpetual-ml/perpetual/raw/main/resources/perp_logo.png">
|
|
38
|
+
</p>
|
|
39
|
+
|
|
40
|
+
<div align="center">
|
|
41
|
+
|
|
42
|
+
[](https://pypi.org/project/perpetual)
|
|
43
|
+
[](https://pypi.org/project/perpetual)
|
|
44
|
+
[](https://crates.io/crates/perpetual)
|
|
45
|
+
[](https://discord.gg/AyUK7rr6wy)
|
|
46
|
+

|
|
47
|
+
|
|
48
|
+
</div>
|
|
49
|
+
|
|
50
|
+
# Perpetual
|
|
51
|
+
|
|
52
|
+
PerpetualBooster is a gradient boosting machine (GBM) algorithm that doesn't need hyperparameter optimization unlike other GBM algorithms. Similar to AutoML libraries, it has a `budget` parameter. Increasing the `budget` parameter increases the predictive power of the algorithm and gives better results on unseen data. Start with a small budget (e.g. 0.5) and increase it (e.g. 1.0) once you are confident with your features. If you don't see any improvement with further increasing the `budget`, it means that you are already extracting the most predictive power out of your data.
|
|
53
|
+
|
|
54
|
+
## Usage
|
|
55
|
+
|
|
56
|
+
You can use the algorithm like in the example below. Check examples folders for both Rust and Python.
|
|
57
|
+
|
|
58
|
+
```python
|
|
59
|
+
from perpetual import PerpetualBooster
|
|
60
|
+
|
|
61
|
+
model = PerpetualBooster(objective="SquaredLoss", budget=0.5)
|
|
62
|
+
model.fit(X, y)
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
## Documentation
|
|
66
|
+
|
|
67
|
+
Documentation for the Python API can be found [here](https://perpetual-ml.github.io/perpetual) and for the Rust API [here](https://docs.rs/perpetual/latest/perpetual/).
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
## Benchmark
|
|
71
|
+
|
|
72
|
+
### PerpetualBooster vs. Optuna + LightGBM
|
|
73
|
+
|
|
74
|
+
Hyperparameter optimization usually takes 100 iterations with plain GBM algorithms. PerpetualBooster achieves the same accuracy in a single run. Thus, it achieves up to 100x speed-up at the same accuracy with different `budget` levels and with different datasets.
|
|
75
|
+
|
|
76
|
+
The following table summarizes the results for the [California Housing](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.fetch_california_housing.html) dataset (regression):
|
|
77
|
+
|
|
78
|
+
| Perpetual budget | LightGBM n_estimators | Perpetual mse | LightGBM mse | Speed-up wall time | Speed-up cpu time |
|
|
79
|
+
| ---------------- | --------------------- | ------------- | ------------ | ------------------ | ----------------- |
|
|
80
|
+
| 1.0 | 100 | 0.192 | 0.192 | 54x | 56x |
|
|
81
|
+
| 1.5 | 300 | 0.188 | 0.188 | 59x | 58x |
|
|
82
|
+
| 2.1 | 1000 | 0.185 | 0.186 | 42x | 41x |
|
|
83
|
+
|
|
84
|
+
The following table summarizes the results for the [Cover Types](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.fetch_covtype.html) dataset (classification):
|
|
85
|
+
|
|
86
|
+
| Perpetual budget | LightGBM n_estimators | Perpetual log loss | LightGBM log loss | Speed-up wall time | Speed-up cpu time |
|
|
87
|
+
| ---------------- | --------------------- | ------------------ | ----------------- | ------------------ | ----------------- |
|
|
88
|
+
| 0.9 | 100 | 0.091 | 0.084 | 72x | 78x |
|
|
89
|
+
|
|
90
|
+
The results can be reproduced using the scripts in the [examples](./python-package/examples) folder.
|
|
91
|
+
|
|
92
|
+
### PerpetualBooster vs. AutoGluon
|
|
93
|
+
|
|
94
|
+
PerpetualBooster is a GBM but behaves like AutoML so it is benchmarked also against AutoGluon (v1.2, best quality preset), the current leader in [AutoML benchmark](https://automlbenchmark.streamlit.app/cd_diagram). Top 10 datasets with the most number of rows are selected from [OpenML datasets](https://www.openml.org/) for both regression and classification tasks.
|
|
95
|
+
|
|
96
|
+
The results are summarized in the following table for regression tasks:
|
|
97
|
+
|
|
98
|
+
| OpenML Task | Perpetual Training Duration | Perpetual Inference Duration | Perpetual RMSE | AutoGluon Training Duration | AutoGluon Inference Duration | AutoGluon RMSE |
|
|
99
|
+
| -------------------------------------------------------- | ----- | ----- | ------------------- | -------- | ------ | ------------------ |
|
|
100
|
+
| [Airlines_DepDelay_10M](https://www.openml.org/t/359929) | 518 | 11.3 | 29.0 | 520 | 30.9 | <ins> 28.8 </ins> |
|
|
101
|
+
| [bates_regr_100](https://www.openml.org/t/361940) | 3421 | 15.1 | <ins> 1.084 </ins> | OOM | OOM | OOM |
|
|
102
|
+
| [BNG(libras_move)](https://www.openml.org/t/7327) | 1956 | 4.2 | <ins> 2.51 </ins> | 1922 | 97.6 | 2.53 |
|
|
103
|
+
| [BNG(satellite_image)](https://www.openml.org/t/7326) | 334 | 1.6 | 0.731 | 337 | 10.0 | <ins> 0.721 </ins> |
|
|
104
|
+
| [COMET_MC](https://www.openml.org/t/14949) | 44 | 1.0 | <ins> 0.0615 </ins> | 47 | 5.0 | 0.0662 |
|
|
105
|
+
| [friedman1](https://www.openml.org/t/361939) | 275 | 4.2 | <ins> 1.047 </ins> | 278 | 5.1 | 1.487 |
|
|
106
|
+
| [poker](https://www.openml.org/t/10102) | 38 | 0.6 | <ins> 0.256 </ins> | 41 | 1.2 | 0.722 |
|
|
107
|
+
| [subset_higgs](https://www.openml.org/t/361955) | 868 | 10.6 | <ins> 0.420 </ins> | 870 | 24.5 | 0.421 |
|
|
108
|
+
| [BNG(autoHorse)](https://www.openml.org/t/7319) | 107 | 1.1 | <ins> 19.0 </ins> | 107 | 3.2 | 20.5 |
|
|
109
|
+
| [BNG(pbc)](https://www.openml.org/t/7318) | 48 | 0.6 | <ins> 836.5 </ins> | 51 | 0.2 | 957.1 |
|
|
110
|
+
| average | 465 | 3.9 | - | 464 | 19.7 | - |
|
|
111
|
+
|
|
112
|
+
PerpetualBooster outperformed AutoGluon on 8 out of 10 regression tasks, training equally fast and inferring 5.1x faster.
|
|
113
|
+
|
|
114
|
+
The results are summarized in the following table for classification tasks:
|
|
115
|
+
|
|
116
|
+
| OpenML Task | Perpetual Training Duration | Perpetual Inference Duration | Perpetual AUC | AutoGluon Training Duration | AutoGluon Inference Duration | AutoGluon AUC |
|
|
117
|
+
| -------------------------------------------------------- | ------- | ------ | ------------------- | -------- | ------ | ------------------ |
|
|
118
|
+
| [BNG(spambase)](https://www.openml.org/t/146163) | 70.1 | 2.1 | <ins> 0.671 </ins> | 73.1 | 3.7 | 0.669 |
|
|
119
|
+
| [BNG(trains)](https://www.openml.org/t/208) | 89.5 | 1.7 | <ins> 0.996 </ins> | 106.4 | 2.4 | 0.994 |
|
|
120
|
+
| [breast](https://www.openml.org/t/361942) | 13699.3 | 97.7 | <ins> 0.991 </ins> | 13330.7 | 79.7 | 0.949 |
|
|
121
|
+
| [Click_prediction_small](https://www.openml.org/t/7291) | 89.1 | 1.0 | <ins> 0.749 </ins> | 101.0 | 2.8 | 0.703 |
|
|
122
|
+
| [colon](https://www.openml.org/t/361938) | 12435.2 | 126.7 | <ins> 0.997 </ins> | 12356.2 | 152.3 | 0.997 |
|
|
123
|
+
| [Higgs](https://www.openml.org/t/362113) | 3485.3 | 40.9 | <ins> 0.843 </ins> | 3501.4 | 67.9 | 0.816 |
|
|
124
|
+
| [SEA(50000)](https://www.openml.org/t/230) | 21.9 | 0.2 | <ins> 0.936 </ins> | 25.6 | 0.5 | 0.935 |
|
|
125
|
+
| [sf-police-incidents](https://www.openml.org/t/359994) | 85.8 | 1.5 | <ins> 0.687 </ins> | 99.4 | 2.8 | 0.659 |
|
|
126
|
+
| [bates_classif_100](https://www.openml.org/t/361941) | 11152.8 | 50.0 | <ins> 0.864 </ins> | OOM | OOM | OOM |
|
|
127
|
+
| [prostate](https://www.openml.org/t/361945) | 13699.9 | 79.8 | <ins> 0.987 </ins> | OOM | OOM | OOM |
|
|
128
|
+
| average | 3747.0 | 34.0 | - | 3699.2 | 39.0 | - |
|
|
129
|
+
|
|
130
|
+
PerpetualBooster outperformed AutoGluon on 10 out of 10 classification tasks, training equally fast and inferring 1.1x faster.
|
|
131
|
+
|
|
132
|
+
PerpetualBooster demonstrates greater robustness compared to AutoGluon, successfully training on all 20 tasks, whereas AutoGluon encountered out-of-memory errors on 3 of those tasks.
|
|
133
|
+
|
|
134
|
+
The results can be reproduced using the automlbenchmark fork [here](https://github.com/deadsoul44/automlbenchmark).
|
|
135
|
+
|
|
136
|
+
|
|
137
|
+
|
|
138
|
+
## Installation
|
|
139
|
+
|
|
140
|
+
The package can be installed directly from [pypi](https://pypi.org/project/perpetual):
|
|
141
|
+
|
|
142
|
+
```shell
|
|
143
|
+
pip install perpetual
|
|
144
|
+
```
|
|
145
|
+
|
|
146
|
+
Using [conda-forge](https://anaconda.org/conda-forge/perpetual):
|
|
147
|
+
|
|
148
|
+
```shell
|
|
149
|
+
conda install conda-forge::perpetual
|
|
150
|
+
```
|
|
151
|
+
|
|
152
|
+
To use in a Rust project and to get the package from [crates.io](https://crates.io/crates/perpetual):
|
|
153
|
+
|
|
154
|
+
```shell
|
|
155
|
+
cargo add perpetual
|
|
156
|
+
```
|
|
157
|
+
|
|
158
|
+
## Contribution
|
|
159
|
+
|
|
160
|
+
Contributions are welcome. Check CONTRIBUTING.md for the guideline.
|
|
161
|
+
|
|
162
|
+
## Paper
|
|
163
|
+
|
|
164
|
+
PerpetualBooster prevents overfitting with a generalization algorithm. The paper is work-in-progress to explain how the algorithm works. Check our [blog post](https://perpetual-ml.com/blog/how-perpetual-works) for a high level introduction to the algorithm.
|
|
165
|
+
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
perpetual-0.9.1.dist-info/METADATA,sha256=CVBQY3JrR_yM4_gNV4rmUmwlC2TLdZRUZZ5DR0uQV3g,10722
|
|
2
|
+
perpetual-0.9.1.dist-info/WHEEL,sha256=mDFV3bKFgwlxLHvOsPqpR9up9dUKYzsUQNKBdkW5c08,94
|
|
3
|
+
perpetual-0.9.1.dist-info/licenses/LICENSE,sha256=gcuuhKKc5-dwvyvHsXjlC9oM6N5gZ6umYbC8ewW1Yvg,35821
|
|
4
|
+
perpetual/booster.py,sha256=QvKEz-tZmhrRwh39quy1WNbeBF7IWtitquW_8Q8LkII,50750
|
|
5
|
+
perpetual/data.py,sha256=vhjWEc_ESYWoaczz0GkUPtfS0iRSKdVZSrCkQn8yLPw,630
|
|
6
|
+
perpetual/serialize.py,sha256=FeW4JsUFVsrft9N7gz-ebn5mXvDv4LiJC2sgBEeGxYo,1957
|
|
7
|
+
perpetual/types.py,sha256=idZNsDErNTur_rJ_5Co8Pb6fik-AUn9lkrXmjbQJVX0,3381
|
|
8
|
+
perpetual/utils.py,sha256=nqwO6GFHi7I5iltuvgLT3NFaPm1h9cHlnomjFcdSfHY,7455
|
|
9
|
+
perpetual/__init__.py,sha256=V0RhghaG0CuKxKrzYUBYqrf7Drb-gjmznsbz9KT12lk,122
|
|
10
|
+
perpetual/perpetual.cp39-win_amd64.pyd,sha256=xA-4ubUas4061qgWqCdZTUcV8E56O4h4tz6Ag7UnrYo,1698304
|
|
11
|
+
perpetual-0.9.1.dist-info/RECORD,,
|