linearrf 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- linearrf-1.0.0.dist-info/LICENSE.md +0 -0
- linearrf-1.0.0.dist-info/METADATA +17 -0
- linearrf-1.0.0.dist-info/RECORD +13 -0
- linearrf-1.0.0.dist-info/WHEEL +5 -0
- linearrf-1.0.0.dist-info/top_level.txt +1 -0
- lrf/__init__.py +1 -0
- lrf/_base_lrf.py +360 -0
- lrf/_bfgs.py +176 -0
- lrf/_criterion.py +147 -0
- lrf/_linear_models.py +255 -0
- lrf/_node.py +18 -0
- lrf/_preprocessor.py +152 -0
- lrf/lrf.py +221 -0
linearrf-1.0.0.dist-info/LICENSE.md
ADDED
File without changes (empty file)

linearrf-1.0.0.dist-info/METADATA
ADDED
@@ -0,0 +1,17 @@
Metadata-Version: 2.1
Name: linearrf
Version: 1.0.0
Summary: A Python library to build Random Forests with Linear Models at the leaves.
Author-email: Marian Biermann <marianbiermann@gmx.de>
Project-URL: homepage, https://github.com/marianbiermann/lrf
Keywords: ml,rf,linear model,tree,dart,model tree,linear tree
Classifier: License :: OSI Approved :: MIT License
Classifier: Programming Language :: Python
Classifier: Programming Language :: Python :: 3
Requires-Python: >=3.7
Description-Content-Type: text/markdown
License-File: LICENSE.md
Requires-Dist: numpy>=1.20.3
Provides-Extra: dev
Requires-Dist: sklearn; extra == "dev"

linearrf-1.0.0.dist-info/RECORD
ADDED
@@ -0,0 +1,13 @@
lrf/__init__.py,sha256=uiaviCEyab13xWdm0xvVwliqyK5Tk0_KyOvMIvXdKwE,44
lrf/_base_lrf.py,sha256=xgVQItslZd1c8lHosr-oX8C3jigGyFELpWAl_HO3EVE,14446
lrf/_bfgs.py,sha256=xMgi3vV3JuYLpAF8VIjmoCtT4vmMmrc-1T2rpCWyOlc,6341
lrf/_criterion.py,sha256=iKBzfroMT7AuN6urZxFDuMlvH46FZZcG7y36pEpGr24,3989
lrf/_linear_models.py,sha256=b-hwsKJc3sH7vqtf_VVTj7Vfp0IC85WUlaNdbCc7L5U,8512
lrf/_node.py,sha256=_4EmSxUZwWPtu0CJhC5iGJ0MOFNyHQlQByLwYF_r4tA,448
lrf/_preprocessor.py,sha256=KibBIUhXOoVCvX3v3ZBsaZ4rjq5do7oA0Zjm26Qur2w,4103
lrf/lrf.py,sha256=EZujkK7tCf9sMEjBT2lI5JjyAvXAhiImcYSf5LVEy2E,10027
linearrf-1.0.0.dist-info/LICENSE.md,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
linearrf-1.0.0.dist-info/METADATA,sha256=BIflfzWZuSBx1DFgVz9CpB2JnJetwJz6_iZlJpE6JEA,631
linearrf-1.0.0.dist-info/WHEEL,sha256=iAkIy5fosb7FzIOwONchHf19Qu7_1wCWyFNR5gu9nU0,91
linearrf-1.0.0.dist-info/top_level.txt,sha256=Er_kmcN7GxzkBUkt7tBX7sAOd4tNlNDLkEPEX61pVok,4
linearrf-1.0.0.dist-info/RECORD,,

linearrf-1.0.0.dist-info/top_level.txt
ADDED
@@ -0,0 +1 @@
lrf

lrf/__init__.py
ADDED
@@ -0,0 +1 @@
from .lrf import LRFClassifier, LRFRegressor
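The public API is just these two estimators. A minimal usage sketch (toy data; everything beyond the imported names is illustrative), run under a __main__ guard because fitting spawns worker processes:

import numpy as np
from lrf import LRFRegressor

if __name__ == '__main__':
    # toy regression data: 500 samples, 4 features (illustrative only)
    rng = np.random.default_rng(0)
    x = rng.normal(size=(500, 4))
    y = x @ np.array([1.5, -2.0, 0.3, 0.0]) + rng.normal(scale=0.1, size=500)

    model = LRFRegressor(n_estimators=10, max_depth=3, random_state=0)
    model.fit(x, y)
    print(model.predict(x[:5]))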
lrf/_base_lrf.py
ADDED
@@ -0,0 +1,360 @@
import copy
import datetime
import time
from concurrent.futures import ProcessPoolExecutor, as_completed
from multiprocessing import cpu_count
from typing import List, Union

import numpy as np

from lrf._linear_models import Regressor, Classifier
from lrf._node import Node


class _LinearRandomForest:
    def __init__(self, linear_model: Union[Regressor, Classifier] = None, n_estimators: int = 100, max_depth: int = 5,
                 criterion: str = None, n_splits: int = 15, split_samples_to_features_ratio: float = 4.5,
                 leaf_samples_to_features_ratio: float = 2.0, min_abs_improvement: float = 5*10**(-4),
                 warm_start: bool = True, n_jobs: int = -1, random_state: int = None, verbose: bool = False,
                 classification: bool = False):
        self.linear_model = linear_model
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.criterion = criterion
        self.n_splits = n_splits
        self.split_samples_to_features_ratio = split_samples_to_features_ratio
        self.leaf_samples_to_features_ratio = leaf_samples_to_features_ratio
        self.min_abs_improvement = min_abs_improvement
        self.warm_start = warm_start
        self.n_jobs = n_jobs
        self.random_state = random_state
        self.verbose = verbose
        self.classification = classification

    def _init_more_attributes(self, y):
        if self.classification:
            self.classes_ = None

        self.forest = None
        self.min_samples_split = None
        self.min_samples_leaf = None

        if self.max_depth is None:
            self.max_depth = 10 ** 32

        if self.n_jobs == -1 or self.n_jobs == 0:
            self.n_jobs = cpu_count()
        else:
            self.n_jobs = min(self.n_jobs, cpu_count())

        if self.split_samples_to_features_ratio < self.leaf_samples_to_features_ratio * 2:
            self.split_samples_to_features_ratio = self.leaf_samples_to_features_ratio * 2

        self.min_samples_split = None
        self.min_samples_leaf = None

        self.total_data_points = y.shape[0]

    def fit(self, x: np.ndarray, y: np.ndarray):

        assert y.ndim == 1
        assert not np.all(y == y[0])

        self._init_more_attributes(y=y)

        if self.classification:
            self._check_targets_classification(y)

        random_state_list = np.random.default_rng(self.random_state).integers(2**63, size=self.n_estimators)

        self.min_samples_split = self.split_samples_to_features_ratio * x.shape[1]
        self.min_samples_leaf = self.leaf_samples_to_features_ratio * x.shape[1]

        forest = []

        # add intercept here and not inside linear model for performance reasons
        x = np.insert(x, 0, 1, axis=1)

        # parallel process combinations of chunks of the data
        with ProcessPoolExecutor(max_workers=self.n_jobs) as executor:
            if self.verbose:
                print('\nStart growing trees...')
            finished_tasks = 0
            start_time = time.time()

            results = [executor.submit(self._grow_tree, x=x, y=y, random_state=i) for i in random_state_list]

            # collect the results and print the progress
            for r in as_completed(results):
                # collecting results
                grown_tree = r.result()
                forest.append(grown_tree)

                # printing progress
                if self.verbose:
                    finished_tasks += 1
                    self._print_progress(frac=finished_tasks/self.n_estimators, start_time=start_time)

        self.forest = forest

        if self.verbose:
            elapsed_seconds = round(time.time() - start_time)
            print('Finished planting the forest in {} '.format(str(datetime.timedelta(seconds=elapsed_seconds))))

    @staticmethod
    def _print_progress(frac: float, start_time: float):
        """
        Prints the progress of the parallel multiprocessing.

        Args:
            frac (float): Fraction of tasks which are already finished

        """
        elapsed_seconds = round(time.time() - start_time)
        remaining_seconds = round(elapsed_seconds / frac - elapsed_seconds)
        print('LRF - Progress: {}%, [{}<{}]'.format(
            round(100 * frac, 2),
            str(datetime.timedelta(seconds=elapsed_seconds)),
            str(datetime.timedelta(seconds=remaining_seconds))
        ), end='\r')

    def _grow_tree(self, x: np.ndarray, y: np.ndarray, random_state: int):

        rng = np.random.default_rng(random_state)

        # bootstrap sample of the data
        idx = rng.choice(np.arange(x.shape[0]), x.shape[0])
        x = x[idx]
        y = y[idx]

        tree = self._root_node(x=x, y=y)

        # split
        tree = self._split(node=tree, x=x, y=y, depth=0, rng=rng)

        return tree

    def _root_node(self, x: np.ndarray, y: np.ndarray):
        # initial linear model
        root_model = copy.deepcopy(self.linear_model)

        if isinstance(root_model, (Regressor, Classifier)):
            root_model.fit(x, y, None)
        else:
            root_model.fit(x, y)

        if self.criterion == 'cross_entropy':
            y_pred = root_model.predict_proba(x)
        elif (self.criterion == 'neg_roc_auc') or (self.criterion == 'neg_pr_auc'):
            y_pred = root_model.predict_proba(x)[:, 1]
        else:
            y_pred = root_model.predict(x)

        metric = self._calculate_metric(y_true=y, y_pred=y_pred)

        # create node object
        tree = Node(depth=0, metric=metric, model=root_model)

        return tree

    def _split(self, node: Node, x: np.ndarray, y: np.ndarray, depth: int, rng: np.random.Generator):
        if (depth == self.max_depth) or np.all(np.all(x == x[0, :], axis=1)) or (x.shape[0] < self.min_samples_split):
            return node
        else:
            split = self._find_best_split(x=x, y=y, last_metric=node.metric, old_coefs=node.model.coef_, rng=rng)

            if split.get('threshold') is not None:
                node.threshold = split['threshold']
                node.split_col_idx = split['column']

                left_node = Node(depth=depth + 1, model=split['model_left'], metric=split['metric_left'])
                left_node = self._split(node=left_node, x=split['x_left'], y=split['y_left'],
                                        depth=depth + 1, rng=rng)

                right_node = Node(depth=depth + 1, model=split['model_right'], metric=split['metric_right'])
                right_node = self._split(node=right_node, x=split['x_right'], y=split['y_right'],
                                         depth=depth + 1, rng=rng)

                node.left_node = left_node
                node.right_node = right_node
                node.model = None

            return node

    def _find_best_split(self, x: np.ndarray, y: np.ndarray,
                         last_metric: float, old_coefs: np.ndarray, rng: np.random.Generator):
        split = {}

        # column 0 is the intercept, so sample sqrt(n_features) candidate columns from index 1 onwards
        random_col_ids = rng.choice(np.arange(1, (x.shape[1])), int(round(np.sqrt(x.shape[1] - 1))), replace=False)

        for col in random_col_ids:
            split_candidates = self._split_values(x[:, col], rng=rng)

            for thresh in split_candidates:
                left_idx = x[:, col] <= thresh
                left_idx, right_idx = left_idx.nonzero()[0], (~left_idx).nonzero()[0]

                if x[:, col].max() == thresh:
                    continue

                x_left, y_left = x.take(left_idx, axis=0), y.take(left_idx, axis=0)
                x_right, y_right = x.take(right_idx, axis=0), y.take(right_idx, axis=0)

                if np.all(y_left == y_left[0]) or np.all(y_right == y_right[0]):
                    continue

                observations_left, observations_right = y_left.shape[0], y_right.shape[0]

                if (observations_left < self.min_samples_leaf) or (observations_right < self.min_samples_leaf):
                    continue

                # initialize models
                model_left, model_right = copy.deepcopy(self.linear_model), copy.deepcopy(self.linear_model)

                # fit models
                if self.warm_start and isinstance(model_left, (Regressor, Classifier)) and isinstance(
                        model_right, (Regressor, Classifier)):
                    model_left.fit(x_left, y_left, initial_coefs=old_coefs)
                    model_right.fit(x_right, y_right, initial_coefs=old_coefs)
                else:
                    model_left.fit(x_left, y_left, None)
                    model_right.fit(x_right, y_right, None)

                # get prediction for these nodes
                if self.criterion == 'cross_entropy':
                    y_pred_left = model_left.predict_proba(x_left)
                    y_pred_right = model_right.predict_proba(x_right)
                elif (self.criterion == 'neg_roc_auc') or (self.criterion == 'neg_pr_auc'):
                    y_pred_left = model_left.predict_proba(x_left)[:, 1]
                    y_pred_right = model_right.predict_proba(x_right)[:, 1]
                else:
                    y_pred_left = model_left.predict(x_left)
                    y_pred_right = model_right.predict(x_right)

                # get metrics for these nodes
                metric_left = self._calculate_metric(y_true=y_left, y_pred=y_pred_left)
                metric_right = self._calculate_metric(y_true=y_right, y_pred=y_pred_right)

                new_metric = ((metric_left * observations_left + metric_right * observations_right)
                              / (observations_left + observations_right))
                better_split = new_metric < (last_metric - self.min_abs_improvement)

                if better_split:
                    last_metric = new_metric

                    split = {'column': col,
                             'threshold': thresh,
                             'model_left': model_left,
                             'model_right': model_right,
                             'x_right': x_right,
                             'y_right': y_right,
                             'x_left': x_left,
                             'y_left': y_left,
                             'metric_left': metric_left,
                             'metric_right': metric_right}

        return split

    def _split_values(self, values: np.ndarray, rng: np.random.Generator) -> List:
        unique_values = np.unique(values)
        if unique_values.shape[0] <= self.n_splits:
            split_values = unique_values.tolist()
        else:
            # roughly two thirds of the candidates come from percentiles of the observed values
            perc_splits = np.ceil(2 * self.n_splits / 3)
            perc_splits = np.unique(np.percentile(values, np.arange(50 / perc_splits, 100, 100 / perc_splits),
                                                  method='closest_observation'))

            n_smart_splits = self.n_splits - perc_splits.shape[0]

            # the remaining "smart" candidates sit at unusually large gaps between consecutive unique values
            diff = np.diff(unique_values, prepend=unique_values[0])
            std = diff[1:].std()
            length = unique_values.shape[0]

            mask = np.array([False] * length)
            k = np.arange(0, 15, 0.02)
            for j in k:
                mask += diff > (0.5 - j / 200) * length ** (1 / (2 + j)) * std
                if np.count_nonzero(mask) >= n_smart_splits:
                    break

            mask = mask.nonzero()[0]
            smart_splits = unique_values.take(mask, axis=0)
            if smart_splits.shape[0] > n_smart_splits:
                smart_splits = rng.choice(smart_splits, n_smart_splits, replace=False)

            split_values = perc_splits.tolist() + smart_splits.tolist()

        # the maximum cannot be a threshold, since the right child would be empty
        max_value = unique_values[-1]
        split_values = [val for val in split_values if val != max_value]
        return split_values

    def predict(self, x: np.ndarray):
        raise NotImplementedError()

    def _calculate_metric(self, y_true: np.ndarray, y_pred: np.ndarray) -> float:
        raise NotImplementedError()

    def export_text(self, tree: int = None, column_names: List[str] = None, ndigits: int = 5):
        txt = ''
        if tree is None:
            for i, node in enumerate(self.forest):
                txt += 'Tree {}:\n'.format(i)
                txt += self._node_to_text(node=node, column_names=column_names, ndigits=ndigits)
                txt += '\n' + '\n'
        else:
            txt += 'Tree {}:\n'.format(tree)
            node = self.forest[tree]
            txt += self._node_to_text(node=node, column_names=column_names, ndigits=ndigits)

        return txt

    def _node_to_text(self, node: Node, column_names: List[str] = None, ndigits: int = 3):

        txt = ''.join(['| ']*node.depth)
        txt += '|---'

        if node.model is None:
            if column_names is None:
                col = 'col_{}'.format(node.split_col_idx - 1)
            else:
                col = column_names[node.split_col_idx - 1]

            txt += ' '.join([col, '<', str(round(node.threshold, ndigits))])
            txt += '\n'

            txt += self._node_to_text(node=node.left_node, column_names=column_names, ndigits=ndigits)

            txt += ''.join(['| '] * node.depth)
            txt += '|---'
            txt += ' '.join([col, '>=', str(round(node.threshold, ndigits))])
            txt += '\n'

            txt += self._node_to_text(node=node.right_node, column_names=column_names, ndigits=ndigits)

        else:
            intercept = node.model.coef_[0]
            weights = node.model.coef_[1:]
            weights = ['+' + str(round(w, ndigits)) if w > 0 else str(round(w, ndigits)) for w in weights]
            if column_names is None:
                cols = ['col_{}'.format(i) for i in range(len(weights))]
            else:
                cols = column_names

            weights_and_cols = ' '.join(['*'.join(p) for p in list(zip(weights, cols))])

            txt += ' '.join(['model: y =', str(round(intercept, ndigits)), weights_and_cols])

            txt += '\n'

        return txt

    def set_params(self, **parameters):
        for parameter, value in parameters.items():
            setattr(self, parameter, value)
        return self

    def _check_targets_classification(self, y: np.ndarray):
        self.classes_ = np.unique(y)
        assert issubclass(self.classes_.dtype.type, np.integer), 'Please convert targets to integer values'
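export_text renders each tree with threshold rules at internal nodes and a fitted linear equation at each leaf. A hedged sketch of calling it; the printed shape is inferred from _node_to_text above, not taken from a real run:

# after fitting an LRFRegressor as `model`:
txt = model.export_text(tree=0, column_names=['f0', 'f1', 'f2', 'f3'])
print(txt)
# expected shape of the output (illustrative values):
# Tree 0:
# |---f1 < 0.42
# | |---model: y = 0.117 +1.493*f0 -1.981*f1 +0.3*f2 -0.001*f3
# |---f1 >= 0.42
# | |---model: y = ...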
lrf/_bfgs.py
ADDED
@@ -0,0 +1,176 @@
import numpy as np

from lrf._criterion import cross_entropy


class BFGS:
    def __init__(self, n_iter: int = 100, tol: float = 10**(-4), intercept: bool = True):
        self.n_iter = n_iter
        self.tol = tol
        self.intercept = intercept

    def classification(self, x: np.ndarray, y_true: np.ndarray, coef_: np.ndarray, C: float = 1.0):
        y_true = y_true[:, np.newaxis]

        coef_ = coef_[:, np.newaxis]
        new_grad = self._grad_cross_entropy_logistic(y_true=y_true, x=x, coef_=coef_, C=C,
                                                     y_pred=self._sigmoid(x@coef_))

        # initial guess for the inverse Hessian
        H_inv = np.eye(coef_.shape[0]) / 0.2

        alpha = 1
        for _ in range(self.n_iter):

            grad = new_grad

            direction = -H_inv @ grad

            alpha, new_grad = self._line_search(x=x, y=y_true, coef_=coef_, direction=direction,
                                                grad=grad, C=C, alpha=alpha)

            if alpha is None:
                break

            s = alpha * direction

            change_mask = coef_ != 0
            change = np.abs(s[change_mask] / coef_[change_mask]) if np.count_nonzero(change_mask) > 0 else 1

            coef_ += s

            if (np.max(change) <= self.tol) or np.all(new_grad == 0):
                break
            else:
                # BFGS update of the inverse Hessian approximation
                grad_diff = new_grad - grad

                st_grad_diff = s.T @ grad_diff

                A = ((st_grad_diff + grad_diff.T @ H_inv @ grad_diff) * (s @ s.T)) / (st_grad_diff**2)

                B = (H_inv @ grad_diff @ s.T + s @ grad_diff.T @ H_inv) / st_grad_diff

                H_inv += A - B

        return coef_.flatten()

    @staticmethod
    def _sigmoid(y: np.ndarray):
        """
        Sigmoid function to map input to values between 0 and 1 on the characteristic s-shaped curve (sigmoid curve).
        This is the probability for the positive class.

        Args:
            y: np.ndarray
                Input values, which will be mapped to values between 0 and 1.

        Returns:
            np.ndarray: Returns the probability for the positive class.
        """
        return np.exp(-np.logaddexp(0, -y))

    def _grad_cross_entropy_logistic(self, y_true: np.ndarray, x: np.ndarray, y_pred: np.ndarray,
                                     coef_: np.ndarray, C: float):

        weights = coef_.copy()
        if self.intercept:
            weights[0] = 0

        norm = np.linalg.norm(weights)
        if norm != 0.0:
            penalty = np.einsum('ij->', weights) / (C * norm)
        else:
            penalty = 0

        grad = x.T @ (y_pred - y_true) + penalty
        norm = np.linalg.norm(grad)
        if norm == 0:
            grad = np.zeros(grad.shape)
        else:
            grad /= norm
        return grad

    def _armijo(self, y: np.ndarray, y_pred: np.ndarray, coef_: np.ndarray, alpha: float, C: float,
                c1: float, grad_dir: float, cross_entropy_value: float):

        penalty = self.get_penalty(coef=coef_, C=C)

        left_armijo = cross_entropy(y_true=y, y_pred=y_pred, penalty=penalty)
        right_armijo = cross_entropy_value + c1 * alpha * grad_dir

        armijo = left_armijo <= right_armijo

        return armijo

    def _wolfe(self, x: np.ndarray, y: np.ndarray, coef_: np.ndarray, alpha: float, C: float,
               direction: np.ndarray, c1: float, c2: float, grad_dir: float, cross_entropy_value: float,
               x_coef: np.ndarray, x_direction: np.ndarray):

        y_pred = self._sigmoid(x_coef + alpha*x_direction)

        armijo = self._armijo(y=y, coef_=coef_, alpha=alpha, c1=c1, grad_dir=grad_dir,
                              cross_entropy_value=cross_entropy_value, C=C, y_pred=y_pred)

        if armijo:
            grad = self._grad_cross_entropy_logistic(y_true=y, x=x, coef_=coef_ + alpha * direction, C=C, y_pred=y_pred)
            left_curvature = (direction.T @ grad).item()
            right_curvature = c2 * grad_dir

            # since wolfe conditions are armijo and weak/strong curvature, the curvature directly implies weak or
            # strong wolfe since armijo is given to be True at this point
            weak_wolfe = left_curvature >= right_curvature
            strong_wolfe = np.abs(left_curvature) <= np.abs(right_curvature)
        else:
            weak_wolfe, strong_wolfe = False, False
            grad = None

        return weak_wolfe, strong_wolfe, grad

    def _line_search(self, x: np.ndarray, y: np.ndarray, coef_: np.ndarray,
                     direction: np.ndarray, grad: np.ndarray, C: float,
                     c1: float = 10 ** (-4), c2: float = 0.9,
                     alpha_upper: float = 2.0, alpha_lower: float = 10**-10, alpha: float = 1.0,
                     n_iter: int = 10):

        grad_dir = (direction.T @ grad).item()
        x_coef = x @ coef_
        x_direction = x @ direction

        penalty = self.get_penalty(coef=coef_, C=C)

        cross_entropy_value = cross_entropy(y_true=y, y_pred=self._sigmoid(x_coef), penalty=penalty)

        weak_wolfe_value, grad_value = 0, 0
        for _ in range(n_iter):
            weak_wolfe, strong_wolfe, grad = self._wolfe(x=x, y=y, coef_=coef_, alpha=alpha, direction=direction, c1=c1,
                                                         c2=c2, grad_dir=grad_dir,
                                                         cross_entropy_value=cross_entropy_value,
                                                         C=C, x_coef=x_coef, x_direction=x_direction)

            if strong_wolfe:
                break
            else:
                if weak_wolfe and alpha > weak_wolfe_value:
                    weak_wolfe_value = alpha
                    grad_value = grad

                    alpha_lower = alpha
                else:
                    alpha_upper = alpha

                alpha = (alpha_lower + alpha_upper) / 2
        else:
            # no step satisfied the strong Wolfe conditions; fall back to the best weak-Wolfe step, if any
            if weak_wolfe_value != 0:
                alpha = weak_wolfe_value
                grad = grad_value
            else:
                alpha, grad = None, None

        return alpha, grad

    def get_penalty(self, coef: np.ndarray, C: float):
        if self.intercept:
            penalty = np.linalg.norm(coef[1:]) / C
        else:
            penalty = np.linalg.norm(coef) / C

        return penalty
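The optimizer can be exercised on its own. A minimal sketch, assuming the input already carries the intercept column at index 0 that BFGS expects when intercept=True (toy data; all names beyond the import are illustrative):

import numpy as np
from lrf._bfgs import BFGS

rng = np.random.default_rng(1)
x = rng.normal(size=(200, 2))
x = np.insert(x, 0, 1, axis=1)        # intercept column at index 0
y = (x[:, 1] - 0.5 * x[:, 2] > 0).astype(int)

bfgs = BFGS(n_iter=100, tol=1e-4, intercept=True)
coef = bfgs.classification(x=x, y_true=y, coef_=np.zeros(x.shape[1]), C=1.0)
print(coef)                            # learned [intercept, w1, w2]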
lrf/_criterion.py
ADDED
@@ -0,0 +1,147 @@
import numpy as np


def mse(y_true: np.ndarray, y_pred: np.ndarray):
    return ((y_true - y_pred)**2).mean()


def rmse(y_true: np.ndarray, y_pred: np.ndarray):
    return np.sqrt(((y_true - y_pred)**2).mean())


def mae(y_true: np.ndarray, y_pred: np.ndarray):
    return (np.abs(y_true - y_pred)).mean()


def mape(y_true: np.ndarray, y_pred: np.ndarray):
    return np.abs((y_true - y_pred)/y_true).mean()


def neg_explained_variance(y_true: np.ndarray, y_pred: np.ndarray):
    return np.var(y_true - y_pred)/np.var(y_true) - 1


def neg_r2(y_true: np.ndarray, y_pred: np.ndarray):
    # negative R^2 = SS_res / SS_tot - 1, so that lower values mean a better fit
    return np.einsum('i->', (y_true - y_pred)**2)/np.einsum('i->', (y_true - y_true.mean())**2) - 1


def wape(y_true: np.ndarray, y_pred: np.ndarray):
    return np.einsum('i->', np.abs(y_true - y_pred))/np.einsum('i->', np.abs(y_true))


def _confusion_matrix(y_true: np.ndarray, y_pred: np.ndarray, thresh: float = 0.5):
    mask_ones = y_pred >= thresh

    tmp = y_true[mask_ones] == 1
    tp = np.count_nonzero(tmp)
    fp = tmp.shape[0] - tp

    tmp = y_true[~mask_ones] == 1
    fn = np.count_nonzero(tmp)
    tn = tmp.shape[0] - fn

    return tn, fp, fn, tp


def _thresholds(y_pred: np.ndarray):
    if y_pred.shape[0] < 10_000:
        thresholds = np.unique(y_pred).tolist()
    else:
        step = 1 / 6180
        thresholds = np.arange(max(np.min(y_pred) - 2 * step, 0), min(np.max(y_pred) + 2 * step, 1), step).tolist()

    if np.min(y_pred) > 0:
        thresholds = [0] + thresholds

    if np.max(y_pred) < 1:
        thresholds = thresholds + [1]

    return thresholds


def hamming(y_true: np.ndarray, y_pred: np.ndarray):
    return (y_true != y_pred).mean()


def cross_entropy(y_true: np.ndarray, y_pred: np.ndarray, penalty: float = 0):
    if y_true.ndim == 1:
        y_true = y_true[:, np.newaxis]

    y_pred = np.clip(y_pred, 1e-12, 1 - 1e-12)
    return -(y_true*np.log(y_pred)+(1-y_true)*np.log(1-y_pred)).sum()/y_pred.shape[0] + penalty


def neg_mcc(y_true: np.ndarray, y_pred: np.ndarray):
    samples = y_true.shape[0]
    tn, fp, fn, tp = _confusion_matrix(y_true=y_true, y_pred=y_pred, thresh=0.5)
    cm = np.array([[tn, fp], [fn, tp]]).reshape(2, 2)

    c = np.einsum('ii', cm)
    t = np.einsum('ij->j', cm)
    p = np.einsum('ij->i', cm)

    dividend = (c*samples - t @ p)
    divisor = (np.sqrt(samples**2 - p @ p) * np.sqrt(samples**2 - t @ t))

    if divisor == 0:
        mcc = 0.0
    else:
        mcc = dividend / divisor

    return -mcc


def neg_roc_auc(y_true: np.ndarray, y_pred: np.ndarray):
    thresholds = _thresholds(y_pred=y_pred)

    positives = y_true.sum()
    negatives = y_true.shape[0] - positives

    fpr, tpr = [], []

    for thresh in thresholds:
        _, fp, _, tp = _confusion_matrix(y_true=y_true, y_pred=y_pred, thresh=thresh)

        tpr.append(tp / positives)
        fpr.append(fp / negatives)

    # integration is from right to left, therefore this is already the negative ROC AUC
    neg_auc = np.trapz(tpr, fpr)

    return neg_auc


def neg_pr_auc(y_true: np.ndarray, y_pred: np.ndarray):
    thresholds = _thresholds(y_pred=y_pred)

    precision, recall = [], []

    for thresh in thresholds:
        _, fp, fn, tp = _confusion_matrix(y_true=y_true, y_pred=y_pred, thresh=thresh)

        p = tp / (tp + fp) if (tp + fp) > 0 else 0
        r = tp / (tp + fn) if (tp + fn) > 0 else 0

        precision.append(p)
        recall.append(r)

    # integration is from right to left, therefore this is already the negative PR AUC
    neg_auc = np.trapz(precision, recall)

    return neg_auc


def neg_auk(y_true: np.ndarray, y_pred: np.ndarray):
    raise NotImplementedError()


def neg_g_mean(y_true: np.ndarray, y_pred: np.ndarray):
    tn, fp, fn, tp = _confusion_matrix(y_true=y_true, y_pred=y_pred)

    # product of sensitivity and specificity, which is monotone in the geometric mean
    return -(tp * tn / ((tp + fn) * (tn + fp)))


def neg_f1(y_true: np.ndarray, y_pred: np.ndarray):
    _, fp, fn, tp = _confusion_matrix(y_true=y_true, y_pred=y_pred)

    return -(tp/(tp + 0.5*(fp + fn)))
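Because sklearn is declared only as a dev extra, the hand-rolled criteria can be cross-checked against it in a test. A sketch of such a check (assumes sklearn is installed; note that neg_mcc thresholds the probabilities at 0.5 internally):

import numpy as np
from sklearn.metrics import mean_squared_error, matthews_corrcoef
from lrf._criterion import mse, neg_mcc

rng = np.random.default_rng(2)
y_true = rng.integers(0, 2, size=100)
y_prob = rng.random(100)

assert np.isclose(mse(y_true, y_prob), mean_squared_error(y_true, y_prob))
assert np.isclose(-neg_mcc(y_true, y_prob),
                  matthews_corrcoef(y_true, (y_prob >= 0.5).astype(int)))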
lrf/_linear_models.py
ADDED
@@ -0,0 +1,255 @@
import numpy as np

from lrf._bfgs import BFGS
from lrf._preprocessor import Preprocessor


class Regressor:
    def __init__(self, alpha: float = 2.0, preprocessing: str = None, fit_intercept: bool = True,
                 intercept_in_input: bool = False):
        """
        Linear least-squares with l2 regularization, also known as Ridge Regression or Tikhonov regularization. This
        implementation supports several preprocessing methods, namely centering, normalizing and standardizing the
        data.

        Args:
            alpha : float, default=2.0
                Regularization strength, larger values imply stronger regularization.
                Must be a positive float. If alpha=0, there is no regularization and this implementation is equal to
                a linear regression using least-squares.

            preprocessing: str, default=None
                Specifies the method for data preprocessing. Can be either 'center', 'normalize', 'standardize' or
                None (default)

            fit_intercept: bool, default=True
                Whether to calculate the intercept.

            intercept_in_input: bool, default=False
                Whether there is an intercept column at index 0 in the data.
        """

        self.alpha = alpha
        self.preprocessing = preprocessing
        self.fit_intercept = fit_intercept
        self.intercept_in_input = intercept_in_input

        if self.preprocessing is not None:
            self.preprocessor = Preprocessor(method=self.preprocessing)

        self.coef_ = None

    def fit(self, x: np.ndarray, y: np.ndarray, initial_coefs: np.ndarray = None):
        """
        Fit linear regression model.

        Args:
            x: np.ndarray
                Training data, containing the feature values.

            y: np.ndarray
                Target values.

        Returns:
            self:
                Returns an instance of self.
        """

        assert self.alpha >= 0

        x = self._preprocessing(x, fit=True, intercept=self.intercept_in_input)

        if self.fit_intercept and not self.intercept_in_input:
            # insert a 1 as the intercept
            x = np.insert(x, 0, 1, axis=1)

        # ridge regression, by l2 regularization
        A = self.alpha * np.identity(x.shape[1])
        # we do not want to regularize the intercept
        A[0, 0] = 0

        # self.coef_ = np.dot(np.dot(np.linalg.pinv(np.dot(x.T, x) + A), x.T), y)
        self.coef_ = np.linalg.lstsq(np.dot(x.T, x) + A, np.dot(x.T, y), rcond=None)[0]

        return self

    def predict(self, x: np.ndarray):
        """
        Make a prediction using the linear regression model.

        Args:
            x: Samples of the features to derive the prediction from.

        Returns:
            np.ndarray: Returns the predicted values.
        """

        assert self.coef_ is not None, 'This linear model is not fitted yet. Call "fit" before using this model to ' \
                                       'make predictions'

        x = self._preprocessing(x, fit=False, intercept=self.intercept_in_input)

        if self.fit_intercept and not self.intercept_in_input:
            # insert a 1 as the intercept
            x = np.insert(x, 0, 1, axis=1)

        return np.dot(x, self.coef_)

    def _preprocessing(self, x: np.ndarray, fit: bool, intercept: bool):
        """
        Fit the preprocessor or transform the data according to a given method.

        Args:
            x: np.ndarray
                Samples to be processed or on which the processor should be fitted.

            fit: bool
                Whether to call the 'fit' method of the preprocessor or to call the 'transform' method.

            intercept: bool
                Whether there is an intercept column at index 0 of x.

        Returns:
            np.ndarray: Returns the data, which is transformed if fit=False.
        """

        if self.preprocessing is not None:
            if fit:
                self.preprocessor.fit(x)

            x = self.preprocessor.transform(x, intercept)

        return x


class Classifier:
    def __init__(self, n_iter: int = 100, tol: float = 10**(-6), C: float = 1,
                 preprocessing: str = None, fit_intercept: bool = True, intercept_in_input: bool = False):
        self.n_iter = n_iter
        self.tol = tol
        self.C = C
        self.preprocessing = preprocessing
        self.fit_intercept = fit_intercept
        self.intercept_in_input = intercept_in_input

        if self.preprocessing is not None:
            self.preprocessor = Preprocessor(method=self.preprocessing)

        self.coef_ = None

    def fit(self, x: np.ndarray, y: np.ndarray, initial_coefs: np.ndarray = None):
        """
        Fit linear classification model.

        Args:
            x: np.ndarray
                Training data, containing the feature values.

            y: np.ndarray
                Target classes.

        Returns:
            self:
                Returns an instance of self.
        """

        if np.unique(y).shape[0] > 2:
            raise ValueError('You can only use the internal linear classification model for binary classification. '
                             'For multi-class classification provide a suitable model to the "linear_model" parameter.')

        x = self._preprocessing(x, fit=True, intercept=self.intercept_in_input)

        if self.fit_intercept and not self.intercept_in_input:
            # insert a 1 as the intercept
            x = np.insert(x, 0, 1, axis=1)

        self._logistic_regression(x=x, y=y, initial_coefs=initial_coefs)

        return self

    def predict_proba(self, x: np.ndarray):
        """
        Make a prediction of the probability of each class using the linear classification model.

        Args:
            x: np.ndarray
                Samples of the features to derive the prediction from.

        Returns:
            np.ndarray: Returns the predicted probability of each class.
        """

        x = self._preprocessing(x, fit=False, intercept=self.intercept_in_input)

        if self.fit_intercept and not self.intercept_in_input:
            # insert a 1 as the intercept
            x = np.insert(x, 0, 1, axis=1)

        one_proba = self._sigmoid(np.dot(x, self.coef_))
        y_pred_proba = np.ones((x.shape[0], 2))

        y_pred_proba[:, 0] -= one_proba
        y_pred_proba[:, 1] = one_proba

        return y_pred_proba

    def predict(self, x: np.ndarray):
        """
        Predict the classes using the linear classification model.

        Args:
            x: np.ndarray
                Samples of the features to derive the prediction from.

        Returns:
            np.ndarray: Returns the predicted classes.
        """

        return np.argmax(self.predict_proba(x), axis=1)

    @staticmethod
    def _sigmoid(y: np.ndarray):
        """
        Sigmoid function to map input to values between 0 and 1 on the characteristic s-shaped curve (sigmoid curve).
        This is the probability for the positive class.

        Args:
            y: np.ndarray
                Input values, which will be mapped to values between 0 and 1.

        Returns:
            np.ndarray: Returns the probability for the positive class.
        """
        return np.exp(-np.logaddexp(0, -y))

    def _preprocessing(self, x: np.ndarray, fit: bool, intercept: bool):
        """
        Fit the preprocessor or transform the data according to a given method.

        Args:
            x: np.ndarray
                Samples to be processed or on which the processor should be fitted.

            fit: bool
                Whether to call the 'fit' method of the preprocessor or to call the 'transform' method.

        Returns:
            np.ndarray: Returns the data, which is transformed if fit=False.
        """

        if self.preprocessing is not None:
            if fit:
                self.preprocessor.fit(x)

            x = self.preprocessor.transform(x, intercept)

        return x

    def _logistic_regression(self, x: np.ndarray, y: np.ndarray, initial_coefs: np.ndarray):
        intercept = self.fit_intercept or self.intercept_in_input

        bfgs = BFGS(intercept=intercept)
        if initial_coefs is None:
            self.coef_ = bfgs.classification(x=x, y_true=y, coef_=np.zeros((x.shape[1], )), C=self.C)
        else:
            self.coef_ = bfgs.classification(x=x, y_true=y, coef_=initial_coefs, C=self.C)
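The leaf models are usable standalone. A minimal sketch of the ridge Regressor, which solves the normal equations (X'X + alpha*I)w = X'y with the intercept entry of the penalty zeroed; with alpha=0 it reduces to ordinary least squares (toy data, illustrative):

import numpy as np
from lrf._linear_models import Regressor

rng = np.random.default_rng(3)
x = rng.normal(size=(100, 3))
y = x @ np.array([2.0, -1.0, 0.5]) + 4.0

reg = Regressor(alpha=0.0).fit(x, y)
print(reg.coef_)          # ~[4.0, 2.0, -1.0, 0.5]: intercept first
print(reg.predict(x[:3]))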
lrf/_node.py
ADDED
@@ -0,0 +1,18 @@
from dataclasses import dataclass
from typing import Union

from lrf._linear_models import Regressor, Classifier


@dataclass
class Node:
    """
    Consolidates all attributes needed at a node, regardless of whether the node is a leaf or not.
    """
    depth: int = None
    split_col_idx: int = None
    threshold: float = None
    metric: float = None
    # annotated so that the child links are real dataclass fields, not class attributes
    left_node: 'Node' = None
    right_node: 'Node' = None
    model: Union[Regressor, Classifier] = None
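A leaf node keeps its fitted model; an internal node instead keeps a threshold, the split column, and child links, with model reset to None by _split. A trivial construction sketch:

from lrf._node import Node

root = Node(depth=0, split_col_idx=2, threshold=0.5)   # internal node: no model
leaf = Node(depth=1, metric=0.03)                      # model would be set after fitting
root.left_node, root.right_node = leaf, leaf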
lrf/_preprocessor.py
ADDED
@@ -0,0 +1,152 @@
import numpy as np


class Preprocessor:
    def __init__(self, method: str):
        """
        Preprocessor for data transformation by a given method.

        Args:
            method: str
                Specifies the transformation method, which could be 'center', 'normalize' or 'standardize'.
        """

        assert method in ['center', 'normalize', 'standardize'],\
            'The scaling method should be "center", "normalize" or "standardize".'

        self.method = method

        self.mean = None
        self.min = None
        self.max = None
        self.std = None

    def fit(self, x: np.ndarray):
        """
        Compute the values which are needed for the given preprocessing method.

        Args:
            x: np.ndarray
                The data used to compute the values which are needed for the given preprocessing method.

        Returns:
            self: Returns an instance of self.
        """

        if self.method == 'center':
            self.mean = x.mean(axis=0)
        elif self.method == 'normalize':
            self.min = x.min(axis=0)
            self.max = x.max(axis=0)

            # if the difference between min and max is 0, this would raise an error. In this case, do not preprocess
            # the data.
            diff = self.max - self.min
            diff_zero = diff == 0
            self.min[diff_zero] = 0
            self.max[diff_zero] = 1
        elif self.method == 'standardize':
            self.mean = x.mean(axis=0)
            self.std = x.std(axis=0)

            # if the standard deviation is 0, this would raise an error. In this case, do not preprocess the data.
            zero_std = self.std == 0
            self.mean[zero_std] = 0
            self.std[zero_std] = 1

        return self

    def transform(self, x: np.ndarray, intercept: bool):
        """
        Transform the data according to the given method.

        Args:
            x: np.ndarray
                The data which will be transformed.

            intercept: bool
                Whether there is an intercept column at index 0 of x.

        Returns:
            np.ndarray: Returns the transformed data.
        """
        if self.method == 'center':
            return self._center(x, intercept)
        elif self.method == 'normalize':
            return self._normalize(x, intercept)
        elif self.method == 'standardize':
            return self._standardize(x, intercept)

    def _center(self, x: np.ndarray, intercept: bool):
        """
        Center the data.

        Args:
            x: np.ndarray
                The data which will be centered.

            intercept: bool
                Whether there is an intercept column at index 0 of x.

        Returns:
            np.ndarray: Returns the centered data.
        """
        x -= self.mean

        if intercept:
            if x.ndim == 2:
                x[:, 0] = 1
            else:
                x[0] = 1

        return x

    def _normalize(self, x: np.ndarray, intercept: bool):
        """
        Normalize the data.

        Args:
            x: np.ndarray
                The data which will be normalized.

            intercept: bool
                Whether there is an intercept column at index 0 of x.

        Returns:
            np.ndarray: Returns the normalized data.
        """

        x = (x - self.min)/(self.max - self.min)

        if intercept:
            if x.ndim == 2:
                x[:, 0] = 1
            else:
                x[0] = 1

        return x

    def _standardize(self, x: np.ndarray, intercept: bool):
        """
        Standardize the data.

        Args:
            x: np.ndarray
                The data which will be standardized.

            intercept: bool
                Whether there is an intercept column at index 0 of x.

        Returns:
            np.ndarray: Returns the standardized data.
        """

        x = (x - self.mean) / self.std

        if intercept:
            if x.ndim == 2:
                x[:, 0] = 1
            else:
                x[0] = 1

        return x
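A minimal sketch of the preprocessor on its own; the guards in fit make constant columns pass through unchanged (toy data, illustrative):

import numpy as np
from lrf._preprocessor import Preprocessor

x = np.array([[1.0, 10.0], [2.0, 10.0], [3.0, 10.0]])
pre = Preprocessor(method='standardize').fit(x)
print(pre.transform(x.copy(), intercept=False))
# first column standardized; the constant second column is passed through
# (its std is forced to 1 and its mean to 0 in fit)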
lrf/lrf.py
ADDED
@@ -0,0 +1,221 @@
from typing import List, Any

import numpy as np

from lrf._base_lrf import _LinearRandomForest
from lrf._linear_models import Regressor, Classifier
from lrf._criterion import (mse, rmse, mae, mape, wape, neg_explained_variance, neg_r2,
                            hamming, cross_entropy, neg_mcc, neg_roc_auc, neg_pr_auc)
from lrf._node import Node


class LRFRegressor(_LinearRandomForest):
    def __init__(self, linear_model: Any = None, alpha: float = 2.0, preprocessing: str = None,
                 n_estimators: int = 100, max_depth: int = 5, n_splits: int = 15,
                 split_samples_to_features_ratio: float = 4.5, min_abs_improvement: float = 1 * 10 ** (-4),
                 leaf_samples_to_features_ratio: float = 2.0, criterion: str = 'mse',
                 n_jobs: int = -1, random_state: int = None, verbose: bool = False):

        self.alpha = alpha
        self.preprocessing = preprocessing
        self._estimator_type = 'regressor'

        if linear_model is None:
            linear_model = Regressor(alpha=self.alpha, preprocessing=self.preprocessing, intercept_in_input=True)
        else:
            assert hasattr(linear_model, 'fit')
            assert hasattr(linear_model, 'predict')

        super().__init__(linear_model=linear_model,
                         n_estimators=n_estimators, max_depth=max_depth, criterion=criterion,
                         n_splits=n_splits, split_samples_to_features_ratio=split_samples_to_features_ratio,
                         leaf_samples_to_features_ratio=leaf_samples_to_features_ratio,
                         min_abs_improvement=min_abs_improvement,
                         n_jobs=n_jobs, random_state=random_state, verbose=verbose)

    def predict(self, x: np.ndarray):
        # add intercept here and not inside linear model for performance reasons
        x = np.insert(x, 0, 1, axis=1)

        # add columns with row index for sorting after multiprocessing
        x = np.insert(x, 0, np.arange(x.shape[0]), axis=1)

        results = [self._predict_tree(node=node, x=x, results=[]) for node in self.forest]

        results = [np.vstack(i) for i in results]
        results = np.array([i[np.argsort(i[:, 0])][:, 1:] for i in results])
        results = results.mean(axis=0).flatten()
        return results

    def _predict_tree(self, node: Node, x: np.ndarray, results: List):
        if node.model is None:
            if x.shape[0] > 0:
                left_indices = x[:, node.split_col_idx + 1] < node.threshold
                right_indices = ~left_indices

                results = self._predict_tree(node.left_node, x[left_indices], results)
                results = self._predict_tree(node.right_node, x[right_indices], results)
        else:
            if x.shape[0] > 0:
                node_results = node.model.predict(x[:, 1:])
                node_results = np.insert(node_results[:, np.newaxis], 0, x[:, 0], axis=1)

                results.extend(node_results)

        return results

    def _calculate_metric(self, y_true: np.ndarray, y_pred: np.ndarray):
        if self.criterion == 'mse':
            val = mse(y_true=y_true, y_pred=y_pred)
        elif self.criterion == 'rmse':
            val = rmse(y_true=y_true, y_pred=y_pred)
        elif self.criterion == 'mae':
            val = mae(y_true=y_true, y_pred=y_pred)
        elif self.criterion == 'mape':
            val = mape(y_true=y_true, y_pred=y_pred)
        elif self.criterion == 'wape':
            val = wape(y_true=y_true, y_pred=y_pred)
        elif self.criterion == 'neg_explained_variance':
            val = neg_explained_variance(y_true=y_true, y_pred=y_pred)
        elif self.criterion == 'neg_r2':
            val = neg_r2(y_true=y_true, y_pred=y_pred)
        else:
            print(' Metric "{}" is not implemented, MSE is used instead.'.format(self.criterion))
            val = mse(y_true=y_true, y_pred=y_pred)

        return val

    def get_params(self, deep: bool = True):
        return {'linear_model': self.linear_model, 'alpha': self.alpha, 'preprocessing': self.preprocessing,
                'n_estimators': self.n_estimators, 'max_depth': self.max_depth, 'n_splits': self.n_splits,
                'split_samples_to_features_ratio': self.split_samples_to_features_ratio,
                'leaf_samples_to_features_ratio': self.leaf_samples_to_features_ratio, 'criterion': self.criterion,
                'n_jobs': self.n_jobs, 'random_state': self.random_state, 'verbose': self.verbose}

    def score(self, X: np.ndarray, y: np.ndarray):
        y_pred = self.predict(X)
        return -neg_r2(y_true=y, y_pred=y_pred)


class LRFClassifier(_LinearRandomForest):
    def __init__(self, linear_model: Any = None, C: float = 1.0, n_estimators: int = 100, max_depth: int = 5,
                 n_splits: int = 15, split_samples_to_features_ratio: float = 4.5,
                 leaf_samples_to_features_ratio: float = 2.0, criterion: str = 'neg_mcc',
                 min_abs_improvement: float = 5*10**(-4), n_jobs: int = -1, random_state: int = None,
                 verbose: bool = False, preprocessing: str = 'standardize', warm_start: bool = True):

        self.C = C
        self.preprocessing = preprocessing
        self._estimator_type = 'classifier'

        if linear_model is None:
            linear_model = Classifier(C=self.C, preprocessing=self.preprocessing, intercept_in_input=True)
        else:
            assert hasattr(linear_model, 'fit')
            assert hasattr(linear_model, 'predict')
            assert hasattr(linear_model, 'predict_proba')

        super().__init__(linear_model=linear_model,
                         n_estimators=n_estimators, max_depth=max_depth, criterion=criterion,
                         n_splits=n_splits, min_abs_improvement=min_abs_improvement,
                         split_samples_to_features_ratio=split_samples_to_features_ratio, random_state=random_state,
                         leaf_samples_to_features_ratio=leaf_samples_to_features_ratio, n_jobs=n_jobs, verbose=verbose,
                         classification=True, warm_start=warm_start)

    def predict(self, x: np.ndarray):
        """
        Predict the classes using the linear random forest.

        Args:
            x: Samples of the features to derive the prediction from.

        Returns:
            np.ndarray: Returns the predicted classes.
        """

        return np.argmax(self.predict_proba(x=x), axis=1)

    def predict_proba(self, x: np.ndarray):
        """
        Make a prediction of the probability of each class using the linear random forest.

        Args:
            x: Samples of the features to derive the prediction from.

        Returns:
            np.ndarray: Returns the predicted probability of each class.
        """

        # add intercept here and not inside linear model for performance reasons
        x = np.insert(x, 0, 1, axis=1)

        # add columns with row index for sorting after multiprocessing
        x = np.insert(x, 0, np.arange(x.shape[0]), axis=1)

        results = [self._predict_proba_tree(node=node, x=x, results=[]) for node in self.forest]

        c = results[0][0].shape[0]

        results = [np.concatenate(i).reshape((-1, c)) for i in results]
        results = np.array([i[np.argsort(i[:, 0])][:, 1:] for i in results])
        results = results.mean(axis=0)
        return results

    def _predict_proba_tree(self, node: Node, x: np.ndarray, results: List):
        """
        Make a prediction of the probability of each class using one given tree of the linear random forest.

        Args:
            x: Samples of the features to derive the prediction from.

        Returns:
            np.ndarray: Returns the predicted probability of each class.
        """

        if node.model is None:
            if x.shape[0] > 0:
                left_indices = x[:, node.split_col_idx + 1] < node.threshold
                right_indices = ~left_indices

                results = self._predict_proba_tree(node.left_node, x[left_indices], results)
                results = self._predict_proba_tree(node.right_node, x[right_indices], results)
        else:
            if x.shape[0] > 0:
                node_results = node.model.predict_proba(x[:, 1:])
                n, m = node_results.shape
                res = np.empty((n, m + 1))
                res[:, 0] = x[:, 0]
                res[:, 1:] = node_results

                results.extend(res)

        return results

    def _calculate_metric(self, y_true: np.ndarray, y_pred: np.ndarray):
        if self.criterion == 'neg_mcc':
            val = neg_mcc(y_true=y_true, y_pred=y_pred)
        elif self.criterion == 'neg_pr_auc':
            val = neg_pr_auc(y_true=y_true, y_pred=y_pred)
        elif self.criterion == 'hamming':
            val = hamming(y_true=y_true, y_pred=y_pred)
        elif self.criterion == 'cross_entropy':
            val = cross_entropy(y_true=y_true, y_pred=y_pred)
        elif self.criterion == 'neg_roc_auc':
            val = neg_roc_auc(y_true=y_true, y_pred=y_pred)
        else:
            print(' Metric "{}" is not implemented, the negative Matthews Correlation Coefficient is used '
                  'instead.'.format(self.criterion))
            val = neg_mcc(y_true=y_true, y_pred=y_pred)

        return val

    def get_params(self, deep=True):
        return {'linear_model': self.linear_model, 'C': self.C, 'preprocessing': self.preprocessing,
                'n_estimators': self.n_estimators, 'max_depth': self.max_depth, 'n_splits': self.n_splits,
                'split_samples_to_features_ratio': self.split_samples_to_features_ratio,
                'leaf_samples_to_features_ratio': self.leaf_samples_to_features_ratio, 'criterion': self.criterion,
                'n_jobs': self.n_jobs, 'random_state': self.random_state, 'verbose': self.verbose}

    def score(self, X: np.ndarray, y: np.ndarray):
        y_pred = self.predict(X)
        return np.count_nonzero(y == y_pred)/y.shape[0]