linearrf 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
File without changes
@@ -0,0 +1,17 @@
1
+ Metadata-Version: 2.1
2
+ Name: linearrf
3
+ Version: 1.0.0
4
+ Summary: A Python library to build Random Forests with Linear Models at the leaves.
5
+ Author-email: Marian Biermann <marianbiermann@gmx.de>
6
+ Project-URL: homepage, https://github.com/marianbiermann/lrf
7
+ Keywords: ml,rf,linear model,tree,dart,model tree,linear tree
8
+ Classifier: License :: OSI Approved :: MIT License
9
+ Classifier: Programming Language :: Python
10
+ Classifier: Programming Language :: Python :: 3
11
+ Requires-Python: >=3.7
12
+ Description-Content-Type: text/markdown
13
+ License-File: LICENSE.md
14
+ Requires-Dist: numpy>=1.20.3
15
+ Provides-Extra: dev
16
+ Requires-Dist: scikit-learn; extra == "dev"
17
+
@@ -0,0 +1,13 @@
1
+ lrf/__init__.py,sha256=uiaviCEyab13xWdm0xvVwliqyK5Tk0_KyOvMIvXdKwE,44
2
+ lrf/_base_lrf.py,sha256=xgVQItslZd1c8lHosr-oX8C3jigGyFELpWAl_HO3EVE,14446
3
+ lrf/_bfgs.py,sha256=xMgi3vV3JuYLpAF8VIjmoCtT4vmMmrc-1T2rpCWyOlc,6341
4
+ lrf/_criterion.py,sha256=iKBzfroMT7AuN6urZxFDuMlvH46FZZcG7y36pEpGr24,3989
5
+ lrf/_linear_models.py,sha256=b-hwsKJc3sH7vqtf_VVTj7Vfp0IC85WUlaNdbCc7L5U,8512
6
+ lrf/_node.py,sha256=_4EmSxUZwWPtu0CJhC5iGJ0MOFNyHQlQByLwYF_r4tA,448
7
+ lrf/_preprocessor.py,sha256=KibBIUhXOoVCvX3v3ZBsaZ4rjq5do7oA0Zjm26Qur2w,4103
8
+ lrf/lrf.py,sha256=EZujkK7tCf9sMEjBT2lI5JjyAvXAhiImcYSf5LVEy2E,10027
9
+ linearrf-1.0.0.dist-info/LICENSE.md,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
10
+ linearrf-1.0.0.dist-info/METADATA,sha256=BIflfzWZuSBx1DFgVz9CpB2JnJetwJz6_iZlJpE6JEA,631
11
+ linearrf-1.0.0.dist-info/WHEEL,sha256=iAkIy5fosb7FzIOwONchHf19Qu7_1wCWyFNR5gu9nU0,91
12
+ linearrf-1.0.0.dist-info/top_level.txt,sha256=Er_kmcN7GxzkBUkt7tBX7sAOd4tNlNDLkEPEX61pVok,4
13
+ linearrf-1.0.0.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (75.3.2)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+
@@ -0,0 +1 @@
1
+ lrf
lrf/__init__.py ADDED
@@ -0,0 +1 @@
1
+ from .lrf import LRFClassifier, LRFRegressor
lrf/_base_lrf.py ADDED
@@ -0,0 +1,360 @@
1
+ import copy
2
+ import datetime
3
+ import time
4
+ from concurrent.futures import ProcessPoolExecutor, as_completed
5
+ from multiprocessing import cpu_count
6
+ from typing import List, Union
7
+
8
+ import numpy as np
9
+
10
+ from lrf._linear_models import Regressor, Classifier
11
+ from lrf._node import Node
12
+
13
+
14
+ class _LinearRandomForest:
15
+ def __init__(self, linear_model: Union[Regressor, Classifier] = None, n_estimators: int = 100, max_depth: int = 5,
16
+ criterion: str = None, n_splits: int = 15, split_samples_to_features_ratio: float = 4.5,
17
+ leaf_samples_to_features_ratio: float = 2.0, min_abs_improvement: float = 5*10**(-4),
18
+ warm_start: bool = True, n_jobs: int = -1, random_state: int = None, verbose: bool = False,
19
+ classification: bool = False):
20
+ self.linear_model = linear_model
21
+ self.n_estimators = n_estimators
22
+ self.max_depth = max_depth
23
+ self.criterion = criterion
24
+ self.n_splits = n_splits
25
+ self.split_samples_to_features_ratio = split_samples_to_features_ratio
26
+ self.leaf_samples_to_features_ratio = leaf_samples_to_features_ratio
27
+ self.min_abs_improvement = min_abs_improvement
28
+ self.warm_start = warm_start
29
+ self.n_jobs = n_jobs
30
+ self.random_state = random_state
31
+ self.verbose = verbose
32
+ self.classification = classification
33
+
34
+ def _init_more_attributes(self, y):
35
+ if self.classification:
36
+ self.classes_ = None
37
+
38
+ self.forest = None
39
+ self.min_samples_split = None
40
+ self.min_samples_leaf = None
41
+
42
+ if self.max_depth is None:
43
+ self.max_depth = 10 ** 32
44
+
45
+ if self.n_jobs == -1 or self.n_jobs == 0:
46
+ self.n_jobs = cpu_count()
47
+ else:
48
+ self.n_jobs = min(self.n_jobs, cpu_count())
49
+
50
+ if self.split_samples_to_features_ratio < self.leaf_samples_to_features_ratio * 2:
51
+ self.split_samples_to_features_ratio = self.leaf_samples_to_features_ratio * 2
52
+
53
+ self.min_samples_split = None
54
+ self.min_samples_leaf = None
55
+
56
+ self.total_data_points = y.shape[0]
57
+
58
+ def fit(self, x: np.ndarray, y: np.ndarray):
59
+
60
+ assert y.ndim == 1
61
+ assert not np.all(y == y[0])
62
+
63
+ self._init_more_attributes(y=y)
64
+
65
+ if self.classification:
66
+ self._check_targets_classification(y)
67
+
68
+ random_state_list = np.random.default_rng(self.random_state).integers(2**63, size=self.n_estimators)
69
+
70
+ self.min_samples_split = self.split_samples_to_features_ratio * x.shape[1]
71
+ self.min_samples_leaf = self.leaf_samples_to_features_ratio * x.shape[1]
72
+
73
+ forest = []
74
+
75
+ # add intercept here and not inside linear model for performance reasons
76
+ x = np.insert(x, 0, 1, axis=1)
77
+
78
+ # parallel process combinations of chunks of the data
79
+ with ProcessPoolExecutor(max_workers=self.n_jobs) as executor:
80
+ if self.verbose:
81
+ print('\nStart growing trees...')
82
+ finished_tasks = 0
83
+ start_time = time.time()
84
+
85
+ results = [executor.submit(self._grow_tree, x=x, y=y, random_state=i) for i in random_state_list]
86
+
87
+ # collect the results and print the progress
88
+ for r in as_completed(results):
89
+ # collecting results
90
+ grown_tree = r.result()
91
+ forest.append(grown_tree)
92
+
93
+ # printing progress
94
+ if self.verbose:
95
+ finished_tasks += 1
96
+ self._print_progress(frac=finished_tasks/self.n_estimators, start_time=start_time)
97
+
98
+ self.forest = forest
99
+
100
+ if self.verbose:
101
+ elapsed_seconds = round(time.time() - start_time)
102
+ print('Finished planting the forest in {} '.format(str(datetime.timedelta(seconds=elapsed_seconds))))
103
+
104
+ @staticmethod
105
+ def _print_progress(frac: float, start_time: float):
106
+ """
107
+ Prints the progress of the parallel multiprocessing
108
+ Args:
109
+ frac (int): Fraction of tasks which are already finished
110
+
111
+ """
112
+ elapsed_seconds = round(time.time() - start_time)
113
+ remaining_seconds = round(elapsed_seconds / frac - elapsed_seconds)
114
+ print('LRF - Progress: {}%, [{}<{}]'.format(
115
+ round(100 * frac, 2),
116
+ str(datetime.timedelta(seconds=elapsed_seconds)),
117
+ str(datetime.timedelta(seconds=remaining_seconds))
118
+ ), end='\r')
119
+
120
+ def _grow_tree(self, x: np.ndarray, y: np.ndarray, random_state: int):
121
+
122
+ rng = np.random.default_rng(random_state)
123
+
124
+ idx = rng.choice(np.arange(x.shape[0]), x.shape[0])
125
+ x = x[idx]
126
+ y = y[idx]
127
+
128
+ tree = self._root_node(x=x, y=y)
129
+
130
+ # split
131
+ tree = self._split(node=tree, x=x, y=y, depth=0, rng=rng)
132
+
133
+ return tree
134
+
135
+ def _root_node(self, x: np.ndarray, y: np.ndarray):
136
+ # initial linear model
137
+ root_model = copy.deepcopy(self.linear_model)
138
+
139
+ if isinstance(root_model, (Regressor, Classifier)):
140
+ root_model.fit(x, y, None)
141
+ else:
142
+ root_model.fit(x, y)
143
+
144
+ if self.criterion == 'cross_entropy':
145
+ y_pred = root_model.predict_proba(x)
146
+ elif (self.criterion == 'neg_roc_auc') or (self.criterion == 'neg_pr_auc'):
147
+ y_pred = root_model.predict_proba(x)[:, 1]
148
+ else:
149
+ y_pred = root_model.predict(x)
150
+
151
+ metric = self._calculate_metric(y_true=y, y_pred=y_pred)
152
+
153
+ # create node object
154
+ tree = Node(depth=0, metric=metric, model=root_model)
155
+
156
+ return tree
157
+
158
+ def _split(self, node: Node, x: np.ndarray, y: np.ndarray, depth: int, rng: np.random.Generator):
159
+ if (depth == self.max_depth) or np.all(np.all(x == x[0, :], axis=1)) or (x.shape[0] < self.min_samples_split):
160
+ return node
161
+ else:
162
+ split = self._find_best_split(x=x, y=y, last_metric=node.metric, old_coefs=node.model.coef_, rng=rng)
163
+
164
+ if split.get('threshold') is not None:
165
+ node.threshold = split['threshold']
166
+ node.split_col_idx = split['column']
167
+
168
+ left_node = Node(depth=depth + 1, model=split['model_left'], metric=split['metric_left'])
169
+ left_node = self._split(node=left_node, x=split['x_left'], y=split['y_left'],
170
+ depth=depth + 1, rng=rng)
171
+
172
+ right_node = Node(depth=depth + 1, model=split['model_right'], metric=split['metric_right'])
173
+ right_node = self._split(node=right_node, x=split['x_right'], y=split['y_right'],
174
+ depth=depth + 1, rng=rng)
175
+
176
+ node.left_node = left_node
177
+ node.right_node = right_node
178
+ node.model = None
179
+
180
+ return node
181
+
182
+ def _find_best_split(self, x: np.ndarray, y: np.ndarray,
183
+ last_metric: float, old_coefs: np.ndarray, rng: np.random.Generator):
184
+ split = {}
185
+
186
+ random_col_ids = rng.choice(np.arange(1, (x.shape[1])), int(round(np.sqrt(x.shape[1] - 1))), replace=False)
187
+
188
+ for col in random_col_ids:
189
+ split_candidates = self._split_values(x[:, col], rng=rng)
190
+
191
+ for thresh in split_candidates:
192
+ left_idx = x[:, col] <= thresh
193
+ left_idx, right_idx = left_idx.nonzero()[0], (~left_idx).nonzero()[0]
194
+
195
+ if x[:, col].max() == thresh:
196
+ continue
197
+
198
+ x_left, y_left = x.take(left_idx, axis=0), y.take(left_idx, axis=0)
199
+ x_right, y_right = x.take(right_idx, axis=0), y.take(right_idx, axis=0)
200
+
201
+ if np.all(y_left == y_left[0]) or np.all(y_right == y_right[0]):
202
+ continue
203
+
204
+ observations_left, observations_right = y_left.shape[0], y_right.shape[0]
205
+
206
+ if (
207
+ observations_left < self.min_samples_leaf
208
+ ) or (
209
+ observations_right < self.min_samples_leaf
210
+ ):
211
+ continue
212
+
213
+ # initialize models
214
+ model_left, model_right = copy.deepcopy(self.linear_model), copy.deepcopy(self.linear_model)
215
+
216
+ # fit models
217
+ if self.warm_start and isinstance(model_left, (Regressor, Classifier)) and isinstance(
218
+ model_right, (Regressor, Classifier)):
219
+ model_left.fit(x_left, y_left, initial_coefs=old_coefs)
220
+ model_right.fit(x_right, y_right, initial_coefs=old_coefs)
221
+ else:
222
+ model_left.fit(x_left, y_left, None)
223
+ model_right.fit(x_right, y_right, None)
224
+
225
+ # get prediction for these nodes
226
+ if self.criterion == 'cross_entropy':
227
+ y_pred_left = model_left.predict_proba(x_left)
228
+ y_pred_right = model_right.predict_proba(x_right)
229
+ elif (self.criterion == 'neg_roc_auc') or (self.criterion == 'neg_pr_auc'):
230
+ y_pred_left = model_left.predict_proba(x_left)[:, 1]
231
+ y_pred_right = model_right.predict_proba(x_right)[:, 1]
232
+ else:
233
+ y_pred_left = model_left.predict(x_left)
234
+ y_pred_right = model_right.predict(x_right)
235
+
236
+ # get metrics for these nodes
237
+ metric_left = self._calculate_metric(y_true=y_left, y_pred=y_pred_left)
238
+ metric_right = self._calculate_metric(y_true=y_right, y_pred=y_pred_right)
239
+
240
+ new_metric = ((metric_left * observations_left + metric_right * observations_right)
241
+ / (observations_left + observations_right))
242
+ better_split = new_metric < (last_metric - self.min_abs_improvement)
243
+
244
+ if better_split:
245
+ last_metric = new_metric
246
+
247
+ split = {'column': col,
248
+ 'threshold': thresh,
249
+ 'model_left': model_left,
250
+ 'model_right': model_right,
251
+ 'x_right': x_right,
252
+ 'y_right': y_right,
253
+ 'x_left': x_left,
254
+ 'y_left': y_left,
255
+ 'metric_left': metric_left,
256
+ 'metric_right': metric_right}
257
+
258
+ return split
259
+
260
+ def _split_values(self, values: np.ndarray, rng: np.random.Generator) -> List:
261
+ unique_values = np.unique(values)
262
+ if unique_values.shape[0] <= self.n_splits:
263
+ split_values = unique_values.tolist()
264
+ else:
265
+ perc_splits = np.ceil(2 * self.n_splits / 3)
266
+ perc_splits = np.unique(np.percentile(values, np.arange(50 / perc_splits, 100, 100 / perc_splits),
267
+ method='closest_observation'))
268
+
269
+ n_smart_splits = self.n_splits - perc_splits.shape[0]
270
+
271
+ diff = np.diff(unique_values, prepend=unique_values[0])
272
+ std = diff[1:].std()
273
+ length = unique_values.shape[0]
274
+
275
+ mask = np.array([False] * length)
276
+ k = np.arange(0, 15, 0.02)
277
+ for j in k:
278
+ mask += diff > (0.5 - j / 200) * length ** (1 / (2 + j)) * std
279
+ if np.count_nonzero(mask) >= n_smart_splits:
280
+ break
281
+
282
+ mask = mask.nonzero()[0]
283
+ smart_splits = unique_values.take(mask, axis=0)
284
+ if smart_splits.shape[0] > n_smart_splits:
285
+ smart_splits = rng.choice(smart_splits, n_smart_splits, replace=False)
286
+
287
+ split_values = perc_splits.tolist() + smart_splits.tolist()
288
+
289
+ max_value = unique_values[-1]
290
+ split_values = [val for val in split_values if val != max_value]
291
+ return split_values
292
+
293
+ def predict(self, x: np.ndarray):
294
+ NotImplementedError()
295
+
296
+ def _calculate_metric(self, y_true: np.ndarray, y_pred: np.ndarray) -> float:
297
+ NotImplementedError()
298
+
299
+ def export_text(self, tree: int = None, column_names: List[str] = None, ndigits: int = 5):
300
+ txt = ''
301
+ if tree is None:
302
+ for i, node in enumerate(self.forest):
303
+ txt += 'Tree {}:\n'.format(i)
304
+ txt += self._node_to_text(node=node, column_names=column_names, ndigits=ndigits)
305
+ txt += '\n' + '\n'
306
+ else:
307
+ txt += 'Tree {}:\n'.format(tree)
308
+ node = self.forest[tree]
309
+ txt += self._node_to_text(node=node, column_names=column_names, ndigits=ndigits)
310
+
311
+ return txt
312
+
313
+ def _node_to_text(self, node: Node, column_names: List[str] = None, ndigits: int = 3):
314
+
315
+ txt = ''.join(['| ']*node.depth)
316
+ txt += '|---'
317
+
318
+ if node.model is None:
319
+ if column_names is None:
320
+ col = 'col_{}'.format(node.split_col_idx - 1)
321
+ else:
322
+ col = column_names[node.split_col_idx - 1]
323
+
324
+ txt += ' '.join([col, '<', str(round(node.threshold, ndigits))])
325
+ txt += '\n'
326
+
327
+ txt += self._node_to_text(node=node.left_node, column_names=column_names, ndigits=ndigits)
328
+
329
+ txt += ''.join(['| '] * node.depth)
330
+ txt += '|---'
331
+ txt += ' '.join([col, '>=', str(round(node.threshold, ndigits))])
332
+ txt += '\n'
333
+
334
+ txt += self._node_to_text(node=node.right_node, column_names=column_names, ndigits=ndigits)
335
+
336
+ else:
337
+ intercept = node.model.coef_[0]
338
+ weights = node.model.coef_[1:]
339
+ weights = ['+' + str(round(w, ndigits)) if w > 0 else str(round(w, ndigits)) for w in weights]
340
+ if column_names is None:
341
+ cols = ['col_{}'.format(i) for i in range(len(weights))]
342
+ else:
343
+ cols = column_names
344
+
345
+ weights_and_cols = ' '.join(['*'.join(p) for p in list(zip(weights, cols))])
346
+
347
+ txt += ' '.join(['model: y =', str(round(intercept, ndigits)), weights_and_cols])
348
+
349
+ txt += '\n'
350
+
351
+ return txt
352
+
353
+ def set_params(self, **parameters):
354
+ for parameter, value in parameters.items():
355
+ setattr(self, parameter, value)
356
+ return self
357
+
358
+ def _check_targets_classification(self, y: np.ndarray):
359
+ self.classes_ = np.unique(y)
360
+ assert issubclass(self.classes_.dtype.type, np.integer), 'Please convert targets to integer values'
lrf/_bfgs.py ADDED
@@ -0,0 +1,176 @@
1
+ import numpy as np
2
+
3
+ from lrf._criterion import cross_entropy
4
+
5
+
6
class BFGS:
    """
    Quasi-Newton BFGS optimizer used to fit the logistic-regression weights of
    the internal linear classifier.  Maintains an approximation of the inverse
    Hessian which is updated after every accepted line-search step.
    """

    def __init__(self, n_iter: int = 100, tol: float = 10**(-4), intercept: bool = True):
        # n_iter: maximum number of BFGS iterations
        # tol: relative coefficient-change threshold used as a stopping criterion
        # intercept: whether index 0 of the coefficients is an intercept,
        #            which is then excluded from the l2 penalty
        self.n_iter = n_iter
        self.tol = tol
        self.intercept = intercept

    def classification(self, x: np.ndarray, y_true: np.ndarray, coef_: np.ndarray, C: float = 1.0):
        """
        Minimize the l2-penalized cross entropy of a logistic model via BFGS.

        Args:
            x: Design matrix (n_samples, n_features), intercept column included.
            y_true: Binary target vector of length n_samples.
            coef_: Initial coefficient vector of length n_features.
            C: Inverse regularization strength.

        Returns:
            np.ndarray: The optimized coefficients, flattened to 1-d.
        """
        y_true = y_true[:, np.newaxis]

        # NOTE(review): coef_[:, np.newaxis] is a view, so the in-place
        # "coef_ += s" below also mutates the caller's array — presumably
        # intentional for the warm-start mechanism; confirm.
        coef_ = coef_[:, np.newaxis]
        new_grad = self._grad_cross_entropy_logistic(y_true=y_true, x=x, coef_=coef_, C=C,
                                                     y_pred=self._sigmoid(x@coef_))

        # initial inverse-Hessian guess: scaled identity (1/0.2 = 5 * I)
        H_inv = np.eye(coef_.shape[0]) / 0.2

        alpha = 1
        for _ in range(self.n_iter):

            grad = new_grad

            # quasi-Newton descent direction
            direction = -H_inv @ grad

            alpha, new_grad = self._line_search(x=x, y=y_true, coef_=coef_, direction=direction,
                                                grad=grad, C=C, alpha=alpha)

            # line search failed to satisfy the Wolfe conditions -> stop
            if alpha is None:
                break

            s = alpha * direction

            # relative change of the nonzero coefficients (1 when all are zero)
            change_mask = coef_ != 0
            change = np.abs(s[change_mask] / coef_[change_mask]) if np.count_nonzero(change_mask) > 0 else 1

            coef_ += s

            # stop on convergence (small relative step or vanished gradient)
            if (np.max(change) <= self.tol) or np.all(new_grad == 0):
                break
            else:
                # standard BFGS update of the inverse Hessian:
                # H' = H + (s'y + y'Hy)(ss')/(s'y)^2 - (Hys' + sy'H)/(s'y)
                grad_diff = new_grad - grad

                st_grad_diff = s.T @ grad_diff

                A = ((st_grad_diff + grad_diff.T @ H_inv @ grad_diff) * (s @ s.T)) / (st_grad_diff**2)

                B = (H_inv @ grad_diff @ s.T + s @ grad_diff.T @ H_inv) / st_grad_diff

                H_inv += A - B

        return coef_.flatten()

    @staticmethod
    def _sigmoid(y: np.ndarray):
        """
        Sigmoid function to map input to values between 0 and 1 on the characteristic s-shaped curve (sigmoid curve).
        This is the probability for the positive class.

        Args:
            y: np.ndarray
                Input values, which will be mapped to values between 0 and 1.

        Returns:
            np.ndarray: Returns the probability for the positive class.
        """
        # numerically stable form of 1 / (1 + exp(-y))
        return np.exp(-np.logaddexp(0, -y))

    def _grad_cross_entropy_logistic(self, y_true: np.ndarray, x: np.ndarray, y_pred: np.ndarray,
                                     coef_: np.ndarray, C: float):
        """
        Normalized gradient of the penalized cross entropy w.r.t. the coefficients.
        Returns a unit-norm gradient (or all zeros when the gradient vanishes).
        """
        weights = coef_.copy()
        if self.intercept:
            # the intercept is not penalized
            weights[0] = 0

        norm = np.linalg.norm(weights)
        if norm != 0.0:
            # NOTE(review): a single scalar penalty is added to every gradient
            # component here — looks like the gradient of the l2 term collapsed
            # via einsum; confirm this is the intended derivative.
            penalty = np.einsum('ij->', weights) / (C * norm)
        else:
            penalty = 0

        grad = x.T @ (y_pred - y_true) + penalty
        # normalize so the line search works on a unit direction scale
        norm = np.linalg.norm(grad)
        if norm == 0:
            grad = np.zeros(grad.shape)
        else:
            grad /= norm
        return grad

    def _armijo(self, y: np.ndarray, y_pred: np.ndarray, coef_: np.ndarray, alpha: float, C: float,
                c1: float, grad_dir: float, cross_entropy_value: float):
        """
        Armijo (sufficient decrease) condition: the penalized loss at the trial
        point must lie below the linear extrapolation with slope c1 * grad_dir.
        """
        penalty = self.get_penalty(coef=coef_, C=C)

        left_armijo = cross_entropy(y_true=y, y_pred=y_pred, penalty=penalty)
        right_armijo = cross_entropy_value + c1 * alpha * grad_dir

        armijo = left_armijo <= right_armijo

        return armijo

    def _wolfe(self, x: np.ndarray, y: np.ndarray, coef_: np.ndarray, alpha: float, C: float,
               direction: np.ndarray, c1: float, c2: float, grad_dir: float, cross_entropy_value: float,
               x_coef: np.ndarray, x_direction: np.ndarray):
        """
        Evaluate the weak and strong Wolfe conditions at step size ``alpha``.

        Returns:
            (weak_wolfe, strong_wolfe, grad): the condition flags and the
            gradient at the trial point (None when Armijo already failed).
        """
        # x@(coef + alpha*direction) assembled from precomputed products
        y_pred = self._sigmoid(x_coef + alpha*x_direction)

        armijo = self._armijo(y=y, coef_=coef_, alpha=alpha, c1=c1, grad_dir=grad_dir,
                              cross_entropy_value=cross_entropy_value, C=C, y_pred=y_pred)

        if armijo:
            grad = self._grad_cross_entropy_logistic(y_true=y, x=x, coef_=coef_ + alpha * direction, C=C, y_pred=y_pred)
            left_curvature = (direction.T @ grad).item()
            right_curvature = c2 * grad_dir

            # since wolfe conditions are armijo and weak/strong curvature, the curvature directly implies weak or
            # strong wolfe since armijo is given to be True at this point
            weak_wolfe = left_curvature >= right_curvature
            strong_wolfe = np.abs(left_curvature) <= np.abs(right_curvature)
        else:
            weak_wolfe, strong_wolfe = False, False
            grad = None

        return weak_wolfe, strong_wolfe, grad

    def _line_search(self, x: np.ndarray, y: np.ndarray, coef_: np.ndarray,
                     direction: np.ndarray, grad: np.ndarray, C: float,
                     c1: float = 10 ** (-4), c2: float = 0.9,
                     alpha_upper: float = 2.0, alpha_lower: float = 10**-10, alpha: float = 1.0,
                     n_iter: int = 10):
        """
        Bisection line search for a step size satisfying the strong Wolfe
        conditions, falling back to the best weakly-Wolfe step seen, and to
        (None, None) when no acceptable step exists.
        """
        grad_dir = (direction.T @ grad).item()
        # hoist the two matrix-vector products out of the bisection loop
        x_coef = x @ coef_
        x_direction = x @ direction

        penalty = self.get_penalty(coef=coef_, C=C)

        cross_entropy_value = cross_entropy(y_true=y, y_pred=self._sigmoid(x_coef), penalty=penalty)

        weak_wolfe_value, grad_value = 0, 0
        for _ in range(n_iter):
            weak_wolfe, strong_wolfe, grad = self._wolfe(x=x, y=y, coef_=coef_, alpha=alpha, direction=direction, c1=c1,
                                                         c2=c2, grad_dir=grad_dir,
                                                         cross_entropy_value=cross_entropy_value,
                                                         C=C, x_coef=x_coef, x_direction=x_direction)

            if strong_wolfe:
                break
            else:
                if weak_wolfe and alpha > weak_wolfe_value:
                    # remember the largest weakly acceptable step as a fallback
                    weak_wolfe_value = alpha
                    grad_value = grad

                    alpha_lower = alpha
                else:
                    alpha_upper = alpha

                alpha = (alpha_lower + alpha_upper) / 2
        else:
            # loop exhausted without a strong-Wolfe step
            if weak_wolfe_value != 0:
                alpha = weak_wolfe_value
                grad = grad_value
            else:
                alpha, grad = None, None

        return alpha, grad

    def get_penalty(self, coef: np.ndarray, C: float):
        """l2 penalty of the coefficients, excluding the intercept when present."""
        if self.intercept:
            penalty = np.linalg.norm(coef[1:]) / C
        else:
            penalty = np.linalg.norm(coef) / C

        return penalty
lrf/_criterion.py ADDED
@@ -0,0 +1,147 @@
1
+ import numpy as np
2
+
3
+
4
def mse(y_true: np.ndarray, y_pred: np.ndarray):
    """Mean squared error between targets and predictions."""
    residuals = y_true - y_pred
    return np.mean(residuals ** 2)
6
+
7
+
8
def rmse(y_true: np.ndarray, y_pred: np.ndarray):
    """Root mean squared error between targets and predictions."""
    residuals = y_true - y_pred
    return np.sqrt(np.mean(residuals ** 2))
10
+
11
+
12
def mae(y_true: np.ndarray, y_pred: np.ndarray):
    """Mean absolute error between targets and predictions."""
    return np.mean(np.abs(y_true - y_pred))
14
+
15
+
16
def mape(y_true: np.ndarray, y_pred: np.ndarray):
    """Mean absolute percentage error.

    NOTE(review): divides by y_true, so zero targets produce inf/nan — confirm
    callers guarantee nonzero targets.
    """
    relative_errors = np.abs((y_true - y_pred) / y_true)
    return relative_errors.mean()
18
+
19
+
20
def neg_explained_variance(y_true: np.ndarray, y_pred: np.ndarray):
    """Negative explained variance: Var(residuals)/Var(y) - 1 (lower is better)."""
    residual_variance = np.var(y_true - y_pred)
    return residual_variance / np.var(y_true) - 1
22
+
23
+
24
def neg_r2(y_true: np.ndarray, y_pred: np.ndarray):
    """Negative coefficient of determination: SS_res/SS_tot - 1 = -(R^2).

    Lower is better; -1 for a perfect fit, 0 for a mean-only model.
    BUG FIX: the original returned -SS_res/SS_tot, whose *minimization*
    rewards a larger residual sum of squares (i.e. prefers worse fits) —
    the constant term of R^2 was dropped and the sign thereby inverted.
    """
    ss_res = np.einsum('i->', (y_true - y_pred) ** 2)
    ss_tot = np.einsum('i->', (y_true - y_true.mean()) ** 2)
    return ss_res / ss_tot - 1
26
+
27
+
28
def wape(y_true: np.ndarray, y_pred: np.ndarray):
    """Weighted absolute percentage error: sum|error| / sum|target|."""
    total_error = np.einsum('i->', np.abs(y_true - y_pred))
    total_actual = np.einsum('i->', np.abs(y_true))
    return total_error / total_actual
30
+
31
+
32
+ def _confusion_matrix(y_true: np.ndarray, y_pred: np.ndarray, thresh: float = 0.5):
33
+ mask_ones = y_pred >= thresh
34
+
35
+ tmp = y_true[mask_ones] == 1
36
+ tp = np.count_nonzero(tmp)
37
+ fp = tmp.shape[0] - tp
38
+
39
+ tmp = y_true[~mask_ones] == 1
40
+ fn = np.count_nonzero(tmp)
41
+ tn = tmp.shape[0] - fn
42
+
43
+ return tn, fp, fn, tp
44
+
45
+
46
+ def _thresholds(y_pred: np.ndarray):
47
+ if y_pred.shape[0] < 10_000:
48
+ thresholds = np.unique(y_pred).tolist()
49
+ else:
50
+ step = 1 / 6180
51
+ thresholds = np.arange(max(np.min(y_pred) - 2 * step, 0), min(np.max(y_pred) + 2 * step, 1), step).tolist()
52
+
53
+ if np.min(y_pred) > 0:
54
+ thresholds = [0] + thresholds
55
+
56
+ if np.max(y_pred) < 1:
57
+ thresholds = thresholds + [1]
58
+
59
+ return thresholds
60
+
61
+
62
def hamming(y_true: np.ndarray, y_pred: np.ndarray):
    """Fraction of positions where prediction and target disagree."""
    return np.mean(y_true != y_pred)
64
+
65
+
66
def cross_entropy(y_true: np.ndarray, y_pred: np.ndarray, penalty: float = 0):
    """Binary cross entropy (plus an optional additive penalty term).

    1-d targets are promoted to a column vector; predictions are clipped away
    from 0/1 to keep the logs finite.
    """
    if y_true.ndim == 1:
        y_true = y_true[:, np.newaxis]

    y_pred = np.clip(y_pred, 1e-12, 1 - 1e-12)
    log_likelihood = y_true * np.log(y_pred) + (1 - y_true) * np.log(1 - y_pred)
    return penalty + -(log_likelihood).sum() / y_pred.shape[0]
72
+
73
+
74
def neg_mcc(y_true: np.ndarray, y_pred: np.ndarray):
    """Negative Matthews correlation coefficient at a 0.5 cut (0 when undefined)."""
    samples = y_true.shape[0]
    tn, fp, fn, tp = _confusion_matrix(y_true=y_true, y_pred=y_pred, thresh=0.5)
    cm = np.array([[tn, fp], [fn, tp]]).reshape(2, 2)

    # trace, column sums and row sums of the confusion matrix
    correct = np.einsum('ii', cm)
    col_sums = np.einsum('ij->j', cm)
    row_sums = np.einsum('ij->i', cm)

    dividend = (correct * samples - col_sums @ row_sums)
    divisor = (np.sqrt(samples**2 - row_sums @ row_sums) * np.sqrt(samples**2 - col_sums @ col_sums))

    # divisor is zero when any marginal is degenerate; MCC is defined as 0 then
    mcc = 0.0 if divisor == 0 else dividend / divisor

    return -mcc
92
+
93
+
94
def neg_roc_auc(y_true: np.ndarray, y_pred: np.ndarray):
    """Negative area under the ROC curve (lower is better)."""
    cut_points = _thresholds(y_pred=y_pred)

    positives = y_true.sum()
    negatives = y_true.shape[0] - positives

    fpr, tpr = [], []

    for cut in cut_points:
        _, fp, _, tp = _confusion_matrix(y_true=y_true, y_pred=y_pred, thresh=cut)

        tpr.append(tp / positives)
        fpr.append(fp / negatives)

    # the curve is traversed right to left, so trapezoidal integration
    # directly yields the negative ROC AUC
    return np.trapz(tpr, fpr)
112
+
113
+
114
def neg_pr_auc(y_true: np.ndarray, y_pred: np.ndarray):
    """Negative area under the precision-recall curve (lower is better)."""
    cut_points = _thresholds(y_pred=y_pred)

    precision, recall = [], []

    for cut in cut_points:
        _, fp, fn, tp = _confusion_matrix(y_true=y_true, y_pred=y_pred, thresh=cut)

        # guard against empty prediction / empty positive sets
        precision.append(tp / (tp + fp) if (tp + fp) > 0 else 0)
        recall.append(tp / (tp + fn) if (tp + fn) > 0 else 0)

    # the curve is traversed right to left, so trapezoidal integration
    # directly yields the negative PR AUC
    return np.trapz(precision, recall)
132
+
133
+
134
def neg_auk(y_true: np.ndarray, y_pred: np.ndarray):
    """Negative area under the kappa curve — not implemented.

    Raises:
        NotImplementedError: always.
    """
    # BUG FIX: the original instantiated NotImplementedError() without raising
    # it, so the call silently returned None instead of signalling the gap.
    raise NotImplementedError()
136
+
137
+
138
def neg_g_mean(y_true: np.ndarray, y_pred: np.ndarray):
    """Negative product of sensitivity and specificity at a 0.5 cut.

    NOTE(review): the classic G-mean is the *square root* of this product;
    the square root is omitted here (monotonically equivalent for ranking
    splits) — confirm intent before renaming or changing.
    """
    tn, fp, fn, tp = _confusion_matrix(y_true=y_true, y_pred=y_pred)

    numerator = tp * tn
    denominator = (tp + fn) * (tn + fp)
    return -(numerator / denominator)
142
+
143
+
144
def neg_f1(y_true: np.ndarray, y_pred: np.ndarray):
    """Negative F1 score at a 0.5 cut (lower is better)."""
    _, fp, fn, tp = _confusion_matrix(y_true=y_true, y_pred=y_pred)

    # F1 = tp / (tp + (fp + fn)/2)
    return -(tp / (tp + 0.5 * (fp + fn)))
lrf/_linear_models.py ADDED
@@ -0,0 +1,255 @@
1
+ import numpy as np
2
+
3
+ from lrf._bfgs import BFGS
4
+ from lrf._preprocessor import Preprocessor
5
+
6
+
7
class Regressor:
    def __init__(self, alpha: float = 2.0, preprocessing: str = None, fit_intercept: bool = True,
                 intercept_in_input: bool = False):
        """
        Linear least-squares with l2 regularization, also known as Ridge Regression or Tikhonov regularization. This
        implementation supports several preprocessing methods, namely centering, normalizing and standardizing the
        data.

        Args:
            alpha : float, default=2.0
                Regularization strength, larger values imply stronger regularization.
                Must be a positive float. If alpha=0, there is no regularization and this implementation is equal to
                a linear regression using least-squares.

            preprocessing: str, default=None
                Specifies the method for data preprocessing. Can be either 'center', 'normalize', 'standardize' or
                None (default)

            fit_intercept: bool, default=True
                Whether to calculate the intercept.

            intercept_in_input: bool, default=False
                Whether there is an intercept column at index 0 in the data.
        """

        self.alpha = alpha
        self.preprocessing = preprocessing
        self.fit_intercept = fit_intercept
        self.intercept_in_input = intercept_in_input

        # a preprocessor is only materialized when a scaling method is requested
        if self.preprocessing is not None:
            self.preprocessor = Preprocessor(method=self.preprocessing)

        self.coef_ = None

    def fit(self, x: np.ndarray, y: np.ndarray, initial_coefs: np.ndarray = None):
        """
        Fit linear regression model.

        Args:
            x: np.ndarray
                Training data, containing the feature values.

            y: np.ndarray
                Target values.

            initial_coefs: np.ndarray, default=None
                Ignored. Accepted only for signature parity with ``Classifier``
                so the forest can call both model types uniformly.

        Returns:
            self:
                Returns an instance of self.
        """

        assert self.alpha >= 0

        x = self._preprocessing(x, fit=True, intercept=self.intercept_in_input)

        if self.fit_intercept and not self.intercept_in_input:
            # insert a 1 as the intercept
            x = np.insert(x, 0, 1, axis=1)

        # ridge regression, by l2 regularization
        A = self.alpha * np.identity(x.shape[1])
        # we do not want to regularize the intercept — but only exempt column 0
        # when it actually is an intercept.  BUG FIX: previously A[0, 0] was
        # zeroed unconditionally, leaving the first *feature* unregularized
        # when the model had no intercept at all.
        if self.fit_intercept or self.intercept_in_input:
            A[0, 0] = 0

        # solve the regularized normal equations (lstsq is robust to rank deficiency)
        self.coef_ = np.linalg.lstsq(np.dot(x.T, x) + A, np.dot(x.T, y), rcond=None)[0]

        return self

    def predict(self, x: np.ndarray):
        """
        Make a prediction using the linear regression model.

        Args:
            x: Samples of the features to derive the prediction from.

        Returns:
            np.ndarray: Returns the predicted values.
        """

        assert self.coef_ is not None, 'This linear model is not fitted yet. Call "fit" before using this model to ' \
                                       'make predictions'

        x = self._preprocessing(x, fit=False, intercept=self.intercept_in_input)

        if self.fit_intercept and not self.intercept_in_input:
            # insert a 1 as the intercept
            x = np.insert(x, 0, 1, axis=1)

        return np.dot(x, self.coef_)

    def _preprocessing(self, x: np.ndarray, fit: bool, intercept: bool):
        """
        Fit the preprocessor or transform the data according to a given method.

        Args:
            x: np.ndarray
                Samples to be processed or on which the processor should be fitted.

            fit: bool
                Whether to call the 'fit' method of the preprocessor or to call the 'transform' method.

            intercept: bool
                Whether there is an intercept column at index 0 of x.

        Returns:
            np.ndarray: Returns the data, which is transformed if fit=False.
        """

        if self.preprocessing is not None:
            if fit:
                self.preprocessor.fit(x)

            x = self.preprocessor.transform(x, intercept)

        return x
123
+
124
+
125
class Classifier:
    """
    Binary logistic-regression classifier fitted with the internal BFGS
    optimizer.  Supports optional input preprocessing and warm starts via
    ``initial_coefs``.
    """

    def __init__(self, n_iter: int = 100, tol: float = 10**(-6), C: float = 1,
                 preprocessing: str = None, fit_intercept: bool = True, intercept_in_input: bool = False):
        # n_iter / tol: optimizer budget and convergence tolerance
        # C: inverse regularization strength
        # preprocessing: 'center' / 'normalize' / 'standardize' / None
        # fit_intercept: add an intercept column before fitting
        # intercept_in_input: the data already carries an intercept at index 0
        self.n_iter = n_iter
        self.tol = tol
        self.C = C
        self.preprocessing = preprocessing
        self.fit_intercept = fit_intercept
        self.intercept_in_input = intercept_in_input

        # a preprocessor is only materialized when a scaling method is requested
        if self.preprocessing is not None:
            self.preprocessor = Preprocessor(method=self.preprocessing)

        self.coef_ = None

    def fit(self, x: np.ndarray, y: np.ndarray, initial_coefs: np.ndarray = None):
        """
        Fit the binary logistic model.

        Args:
            x: np.ndarray
                Training data, containing the feature values.
            y: np.ndarray
                Binary target classes.
            initial_coefs: np.ndarray, default=None
                Optional warm-start coefficients for the BFGS optimizer.

        Returns:
            self: Returns an instance of self.
        """

        if np.unique(y).shape[0] > 2:
            raise ValueError('You can only use the internal linear classification model for binary classification.'
                             'For multi-class classification provide a suitable model to the "linear_model" parameter.')

        x = self._preprocessing(x, fit=True, intercept=self.intercept_in_input)

        # insert a 1 as the intercept unless the data already carries one
        if self.fit_intercept and not self.intercept_in_input:
            x = np.insert(x, 0, 1, axis=1)

        self._logistic_regression(x=x, y=y, initial_coefs=initial_coefs)

        return self

    def predict_proba(self, x: np.ndarray):
        """
        Predict the probability of each class.

        Args:
            x: np.ndarray
                Samples of the features to derive the prediction from.

        Returns:
            np.ndarray: Array of shape (n_samples, 2) with P(class 0) and P(class 1).
        """

        x = self._preprocessing(x, fit=False, intercept=self.intercept_in_input)

        # insert a 1 as the intercept unless the data already carries one
        if self.fit_intercept and not self.intercept_in_input:
            x = np.insert(x, 0, 1, axis=1)

        positive_proba = self._sigmoid(np.dot(x, self.coef_))

        probabilities = np.ones((x.shape[0], 2))
        probabilities[:, 0] -= positive_proba
        probabilities[:, 1] = positive_proba

        return probabilities

    def predict(self, x: np.ndarray):
        """
        Predict the class labels (0 or 1) for the given samples.

        Args:
            x: np.ndarray
                Samples of the features to derive the prediction from.

        Returns:
            np.ndarray: Returns the predicted classes.
        """

        # argmax keeps the original tie behavior (class 0 wins at p == 0.5)
        return self.predict_proba(x).argmax(axis=1)

    @staticmethod
    def _sigmoid(y: np.ndarray):
        """
        Map inputs to (0, 1) via the logistic sigmoid — the probability of the
        positive class.

        Args:
            y: np.ndarray
                Input values, which will be mapped to values between 0 and 1.

        Returns:
            np.ndarray: Returns the probability for the positive class.
        """
        # numerically stable form of 1 / (1 + exp(-y))
        return np.exp(-np.logaddexp(0, -y))

    def _preprocessing(self, x: np.ndarray, fit: bool, intercept: bool):
        """
        Fit the preprocessor and/or transform the data with the configured method.

        Args:
            x: np.ndarray
                Samples to be processed or on which the processor should be fitted.
            fit: bool
                Whether to (re)fit the preprocessor before transforming.
            intercept: bool
                Whether there is an intercept column at index 0 of x.

        Returns:
            np.ndarray: Returns the (possibly transformed) data.
        """

        if self.preprocessing is None:
            return x

        if fit:
            self.preprocessor.fit(x)

        return self.preprocessor.transform(x, intercept)

    def _logistic_regression(self, x: np.ndarray, y: np.ndarray, initial_coefs: np.ndarray):
        """Run BFGS on the cross-entropy objective and store the coefficients."""
        # the intercept (if any) must be exempted from the l2 penalty
        has_intercept = self.fit_intercept or self.intercept_in_input
        solver = BFGS(intercept=has_intercept)

        start = np.zeros((x.shape[1],)) if initial_coefs is None else initial_coefs
        self.coef_ = solver.classification(x=x, y_true=y, coef_=start, C=self.C)
lrf/_node.py ADDED
@@ -0,0 +1,18 @@
1
+ from dataclasses import dataclass
2
+ from typing import Union
3
+
4
+ from lrf._linear_models import Regressor, Classifier
5
+
6
+
7
@dataclass
class Node:
    """
    Consolidates all attributes needed at a node, regardless of whether the node is a leaf or not.

    A node is a leaf exactly when ``model`` is set; inner nodes carry the
    split information and references to their two children.
    """
    # Depth of this node within its tree.
    depth: int = None
    # Index of the feature column this (inner) node splits on.
    split_col_idx: int = None
    # Split threshold; samples with feature value < threshold go to the left child.
    threshold: float = None
    # Criterion value achieved at this node.
    metric: float = None
    # Linear model fitted at the leaf; None for inner nodes.
    model: 'Union[Regressor, Classifier]' = None
    # Child nodes. The original bare assignments (without annotations) made
    # these shared class attributes that the dataclass machinery ignored;
    # annotating them turns them into proper fields. They are placed after
    # 'model' so positional construction of the original fields keeps working.
    left_node: 'Node' = None
    right_node: 'Node' = None
lrf/_preprocessor.py ADDED
@@ -0,0 +1,152 @@
1
+ import numpy as np
2
+
3
+
4
class Preprocessor:
    def __init__(self, method: str):
        """
        Preprocessor for data transformation by a given method.

        Args:
            method: str
                Specifies the transformation method, which could be 'center', 'normalize' or 'standardize'.
        """

        assert method in ['center', 'normalize', 'standardize'],\
            'The scaling method should be "center", "normalize" or "standardize".'

        self.method = method

        # Statistics computed by fit(); None until fit() has been called.
        self.mean = None
        self.min = None
        self.max = None
        self.std = None

    def fit(self, x: np.ndarray):
        """
        Compute the values which are needed for the given preprocessing method.

        Args:
            x: np.ndarray
                The data used to compute the values which are needed for the given preprocessing method.

        Returns:
            self: Returns an instance of self.
        """

        if self.method == 'center':
            self.mean = x.mean(axis=0)
        elif self.method == 'normalize':
            self.min = x.min(axis=0)
            self.max = x.max(axis=0)

            # A zero min/max difference would cause a division by zero in the
            # transform. For such columns use min=0, max=1, which leaves the
            # data unchanged.
            diff_zero = (self.max - self.min) == 0
            self.min[diff_zero] = 0
            self.max[diff_zero] = 1
        elif self.method == 'standardize':
            self.mean = x.mean(axis=0)
            self.std = x.std(axis=0)

            # A zero standard deviation would cause a division by zero in the
            # transform. For such columns use mean=0, std=1, which leaves the
            # data unchanged.
            zero_std = self.std == 0
            self.mean[zero_std] = 0
            self.std[zero_std] = 1

        return self

    def transform(self, x: np.ndarray, intercept: bool):
        """
        Transform the data according to the given method.

        Args:
            x: np.ndarray
                The data which will be transformed.

            intercept: bool
                Whether there is an intercept column at index 0 of x.

        Returns:
            np.ndarray: Returns the transformed data.
        """
        if self.method == 'center':
            return self._center(x, intercept)
        elif self.method == 'normalize':
            return self._normalize(x, intercept)
        elif self.method == 'standardize':
            return self._standardize(x, intercept)

    @staticmethod
    def _restore_intercept(x: np.ndarray, intercept: bool):
        """
        Reset the intercept column/entry (index 0) to 1 after a transformation.

        Args:
            x: np.ndarray
                Transformed data (1- or 2-dimensional).

            intercept: bool
                Whether there is an intercept column at index 0 of x.

        Returns:
            np.ndarray: The data with the intercept restored (if requested).
        """
        if intercept:
            if x.ndim == 2:
                x[:, 0] = 1
            else:
                x[0] = 1

        return x

    def _center(self, x: np.ndarray, intercept: bool):
        """
        Center the data. The input array is not modified.

        Args:
            x: np.ndarray
                The data which will be centered.

            intercept: bool
                Whether there is an intercept column at index 0 of x.

        Returns:
            np.ndarray: Returns the centered data.
        """
        # Out-of-place subtraction: the previous in-place `x -= self.mean`
        # mutated the caller's array and raised for integer dtypes, and was
        # inconsistent with _normalize/_standardize which return new arrays.
        x = x - self.mean

        return self._restore_intercept(x, intercept)

    def _normalize(self, x: np.ndarray, intercept: bool):
        """
        Normalize the data to the range [0, 1] per column.

        Args:
            x: np.ndarray
                The data which will be normalized.

            intercept: bool
                Whether there is an intercept column at index 0 of x.

        Returns:
            np.ndarray: Returns the normalized data.
        """

        x = (x - self.min)/(self.max - self.min)

        return self._restore_intercept(x, intercept)

    def _standardize(self, x: np.ndarray, intercept: bool):
        """
        Standardize the data to zero mean and unit variance per column.

        Args:
            x: np.ndarray
                The data which will be standardized.

            intercept: bool
                Whether there is an intercept column at index 0 of x.

        Returns:
            np.ndarray: Returns the standardized data.
        """

        x = (x - self.mean) / self.std

        return self._restore_intercept(x, intercept)
lrf/lrf.py ADDED
@@ -0,0 +1,221 @@
1
+ from typing import List, Any
2
+
3
+ import numpy as np
4
+
5
+ from lrf._base_lrf import _LinearRandomForest
6
+ from lrf._linear_models import Regressor, Classifier
7
+ from lrf._criterion import (mse, rmse, mae, mape, wape, neg_explained_variance, neg_r2,
8
+ hamming, cross_entropy, neg_mcc, neg_roc_auc, neg_pr_auc)
9
+ from lrf._node import Node
10
+
11
+
12
class LRFRegressor(_LinearRandomForest):
    """
    Random forest regressor with linear models at the leaves.

    The forest prediction is the average of the individual tree predictions.
    """

    # Maps criterion names to metric functions. All metrics follow the
    # "lower is better" convention (hence the neg_* variants).
    _CRITERIA = {'mse': mse, 'rmse': rmse, 'mae': mae, 'mape': mape, 'wape': wape,
                 'neg_explained_variance': neg_explained_variance, 'neg_r2': neg_r2}

    def __init__(self, linear_model: Any = None, alpha: float = 2.0, preprocessing: str = None,
                 n_estimators: int = 100, max_depth: int = 5, n_splits: int = 15,
                 split_samples_to_features_ratio: float = 4.5, min_abs_improvement: float = 1 * 10 ** (-4),
                 leaf_samples_to_features_ratio: float = 2.0, criterion: str = 'mse',
                 n_jobs: int = -1, random_state: int = None, verbose: bool = False):
        """
        Args:
            linear_model: Model fitted at the leaves; must provide 'fit' and
                'predict'. Defaults to the built-in Regressor.
            alpha: Regularization strength for the default leaf Regressor.
            preprocessing: Preprocessing method for the default leaf Regressor
                ('center', 'normalize' or 'standardize'), or None.
            criterion: Name of the split criterion (see _CRITERIA).
            (remaining parameters are forwarded to _LinearRandomForest)
        """

        self.alpha = alpha
        self.preprocessing = preprocessing
        self._estimator_type = 'regressor'

        if linear_model is None:
            linear_model = Regressor(alpha=self.alpha, preprocessing=self.preprocessing, intercept_in_input=True)
        else:
            # A custom leaf model only needs the minimal regressor interface.
            assert hasattr(linear_model, 'fit')
            assert hasattr(linear_model, 'predict')

        super().__init__(linear_model=linear_model,
                         n_estimators=n_estimators, max_depth=max_depth, criterion=criterion,
                         n_splits=n_splits, split_samples_to_features_ratio=split_samples_to_features_ratio,
                         leaf_samples_to_features_ratio=leaf_samples_to_features_ratio,
                         min_abs_improvement=min_abs_improvement,
                         n_jobs=n_jobs, random_state=random_state, verbose=verbose)

    def predict(self, x: np.ndarray):
        """
        Predict targets by averaging the per-tree predictions.

        Args:
            x: np.ndarray
                Samples of the features to derive the prediction from.

        Returns:
            np.ndarray: Returns the predicted values, one per row of x.
        """

        # add intercept here and not inside linear model for performance reasons
        x = np.insert(x, 0, 1, axis=1)

        # add columns with row index for sorting after multiprocessing
        x = np.insert(x, 0, np.arange(x.shape[0]), axis=1)

        results = [self._predict_tree(node=node, x=x, results=[]) for node in self.forest]

        # Leaves emit rows out of order: restore the original row order via
        # the index column, drop it, then average over the trees.
        results = [np.vstack(i) for i in results]
        results = np.array([i[np.argsort(i[:, 0])][:, 1:] for i in results])
        results = results.mean(axis=0).flatten()
        return results

    def _predict_tree(self, node: Node, x: np.ndarray, results: List):
        """
        Recursively route the samples in x through one tree and collect the
        leaf-model predictions (each prefixed with its row index) in results.

        Args:
            node: Current node of the tree.
            x: Samples with the row index in column 0 and the intercept in column 1.
            results: Accumulator list for the per-row predictions.

        Returns:
            List: The accumulated [row_index, prediction] rows.
        """

        if node.model is None:
            # Inner node: split the samples and recurse. Column 0 holds the
            # row index, hence the +1 offset on the split column index.
            if x.shape[0] > 0:
                left_indices = x[:, node.split_col_idx + 1] < node.threshold
                right_indices = ~left_indices

                results = self._predict_tree(node.left_node, x[left_indices], results)
                results = self._predict_tree(node.right_node, x[right_indices], results)
        else:
            # Leaf node: predict with the linear model (index column removed).
            if x.shape[0] > 0:
                node_results = node.model.predict(x[:, 1:])
                node_results = np.insert(node_results[:, np.newaxis], 0, x[:, 0], axis=1)

                results.extend(node_results)

        return results

    def _calculate_metric(self, y_true: np.ndarray, y_pred: np.ndarray):
        """
        Evaluate the configured criterion; unknown names fall back to MSE.

        Args:
            y_true: True target values.
            y_pred: Predicted target values.

        Returns:
            float: The criterion value (lower is better).
        """

        metric = self._CRITERIA.get(self.criterion)
        if metric is None:
            print(' Metric "{}" is not implemented, MSE is used instead.'.format(self.criterion))
            metric = mse

        return metric(y_true=y_true, y_pred=y_pred)

    def get_params(self, deep: bool = True):
        """Return the constructor parameters (sklearn-compatible)."""
        return {'linear_model': self.linear_model, 'alpha': self.alpha, 'preprocessing': self.preprocessing,
                'n_estimators': self.n_estimators, 'max_depth': self.max_depth, 'n_splits': self.n_splits,
                'split_samples_to_features_ratio': self.split_samples_to_features_ratio,
                'leaf_samples_to_features_ratio': self.leaf_samples_to_features_ratio, 'criterion': self.criterion,
                'n_jobs': self.n_jobs, 'random_state': self.random_state, 'verbose': self.verbose}

    def score(self, X: np.ndarray, y: np.ndarray):
        """Return the R^2 score of the prediction on (X, y) (sklearn-compatible)."""
        y_pred = self.predict(X)
        return -neg_r2(y_true=y, y_pred=y_pred)
98
+
99
+
100
class LRFClassifier(_LinearRandomForest):
    """
    Random forest classifier with linear classification models at the leaves.

    The forest probabilities are the average of the per-tree class
    probabilities; the predicted class is the argmax of that average.
    """

    # Maps criterion names to metric functions. All metrics follow the
    # "lower is better" convention (hence the neg_* variants).
    _CRITERIA = {'neg_mcc': neg_mcc, 'neg_pr_auc': neg_pr_auc, 'hamming': hamming,
                 'cross_entropy': cross_entropy, 'neg_roc_auc': neg_roc_auc}

    def __init__(self, linear_model: Any = None, C: float = 1.0, n_estimators: int = 100, max_depth: int = 5,
                 n_splits: int = 15, split_samples_to_features_ratio: float = 4.5,
                 leaf_samples_to_features_ratio: float = 2.0, criterion: str = 'neg_mcc',
                 min_abs_improvement: float = 5*10**(-4), n_jobs: int = -1, random_state: int = None,
                 verbose: bool = False, preprocessing: str = 'standardize', warm_start: bool = True):
        """
        Args:
            linear_model: Model fitted at the leaves; must provide 'fit',
                'predict' and 'predict_proba'. Defaults to the built-in Classifier.
            C: Regularization strength for the default leaf Classifier.
            preprocessing: Preprocessing method for the default leaf Classifier.
            criterion: Name of the split criterion (see _CRITERIA).
            (remaining parameters are forwarded to _LinearRandomForest)
        """

        self.C = C
        self.preprocessing = preprocessing
        self._estimator_type = 'classifier'

        if linear_model is None:
            linear_model = Classifier(C=self.C, preprocessing=self.preprocessing, intercept_in_input=True)
        else:
            # A custom leaf model needs the full probabilistic classifier interface.
            assert hasattr(linear_model, 'fit')
            assert hasattr(linear_model, 'predict')
            assert hasattr(linear_model, 'predict_proba')

        super().__init__(linear_model=linear_model,
                         n_estimators=n_estimators, max_depth=max_depth, criterion=criterion,
                         n_splits=n_splits, min_abs_improvement=min_abs_improvement,
                         split_samples_to_features_ratio=split_samples_to_features_ratio, random_state=random_state,
                         leaf_samples_to_features_ratio=leaf_samples_to_features_ratio, n_jobs=n_jobs, verbose=verbose,
                         classification=True, warm_start=warm_start)

    def predict(self, x: np.ndarray):
        """
        Predict the classes using the linear random forest.

        Args:
            x: Samples of the features to derive the prediction from.

        Returns:
            np.ndarray: Returns the predicted classes.
        """

        return np.argmax(self.predict_proba(x=x), axis=1)

    def predict_proba(self, x: np.ndarray):
        """
        Make a prediction of the probability of each class using the linear random forest.

        Args:
            x: Samples of the features to derive the prediction from.

        Returns:
            np.ndarray: Returns the predicted probability of each class.
        """

        # add intercept here and not inside linear model for performance reasons
        x = np.insert(x, 0, 1, axis=1)

        # add columns with row index for sorting after multiprocessing
        x = np.insert(x, 0, np.arange(x.shape[0]), axis=1)

        results = [self._predict_proba_tree(node=node, x=x, results=[]) for node in self.forest]

        # Number of columns per result row: row index + one probability per class.
        c = results[0][0].shape[0]

        # Restore the original row order via the index column, drop it, then
        # average the class probabilities over the trees.
        results = [np.concatenate(i).reshape((-1, c)) for i in results]
        results = np.array([i[np.argsort(i[:, 0])][:, 1:] for i in results])
        results = results.mean(axis=0)
        return results

    def _predict_proba_tree(self, node: Node, x: np.ndarray, results: List):
        """
        Recursively route the samples in x through one tree and collect the
        leaf-model class probabilities (each row prefixed with its row index).

        Args:
            node: Current node of the tree.
            x: Samples with the row index in column 0 and the intercept in column 1.
            results: Accumulator list for the per-row probability rows.

        Returns:
            List: The accumulated [row_index, p_class_0, ..., p_class_k] rows.
        """

        if node.model is None:
            # Inner node: split the samples and recurse. Column 0 holds the
            # row index, hence the +1 offset on the split column index.
            if x.shape[0] > 0:
                left_indices = x[:, node.split_col_idx + 1] < node.threshold
                right_indices = ~left_indices

                results = self._predict_proba_tree(node.left_node, x[left_indices], results)
                results = self._predict_proba_tree(node.right_node, x[right_indices], results)
        else:
            # Leaf node: predict probabilities (index column removed) and
            # prepend the row index to each result row.
            if x.shape[0] > 0:
                node_results = node.model.predict_proba(x[:, 1:])
                n, m = node_results.shape
                res = np.empty((n, m + 1))
                res[:, 0] = x[:, 0]
                res[:, 1:] = node_results

                results.extend(res)

        return results

    def _calculate_metric(self, y_true: np.ndarray, y_pred: np.ndarray):
        """
        Evaluate the configured criterion; unknown names fall back to the
        negative Matthews Correlation Coefficient.

        Args:
            y_true: True class labels.
            y_pred: Predicted class labels (or probabilities, depending on metric).

        Returns:
            float: The criterion value (lower is better).
        """

        metric = self._CRITERIA.get(self.criterion)
        if metric is None:
            print(' Metric "{}" is not implemented, the negative Matthews Correlation Coefficient is used '
                  'instead.'.format(self.criterion))
            metric = neg_mcc

        return metric(y_true=y_true, y_pred=y_pred)

    def get_params(self, deep=True):
        """Return the constructor parameters (sklearn-compatible)."""
        return {'linear_model': self.linear_model, 'C': self.C, 'preprocessing': self.preprocessing,
                'n_estimators': self.n_estimators, 'max_depth': self.max_depth, 'n_splits': self.n_splits,
                'split_samples_to_features_ratio': self.split_samples_to_features_ratio,
                'leaf_samples_to_features_ratio': self.leaf_samples_to_features_ratio, 'criterion': self.criterion,
                'n_jobs': self.n_jobs, 'random_state': self.random_state, 'verbose': self.verbose}

    def score(self, X: np.ndarray, y: np.ndarray):
        """Return the mean accuracy of the prediction on (X, y) (sklearn-compatible)."""
        y_pred = self.predict(X)
        return np.count_nonzero(y == y_pred)/y.shape[0]