scikit_survival-0.23.1-cp313-cp313-macosx_11_0_arm64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scikit_survival-0.23.1.dist-info/COPYING +674 -0
- scikit_survival-0.23.1.dist-info/METADATA +888 -0
- scikit_survival-0.23.1.dist-info/RECORD +55 -0
- scikit_survival-0.23.1.dist-info/WHEEL +5 -0
- scikit_survival-0.23.1.dist-info/top_level.txt +1 -0
- sksurv/__init__.py +138 -0
- sksurv/base.py +103 -0
- sksurv/bintrees/__init__.py +15 -0
- sksurv/bintrees/_binarytrees.cpython-313-darwin.so +0 -0
- sksurv/column.py +201 -0
- sksurv/compare.py +123 -0
- sksurv/datasets/__init__.py +10 -0
- sksurv/datasets/base.py +436 -0
- sksurv/datasets/data/GBSG2.arff +700 -0
- sksurv/datasets/data/actg320.arff +1169 -0
- sksurv/datasets/data/breast_cancer_GSE7390-metastasis.arff +283 -0
- sksurv/datasets/data/flchain.arff +7887 -0
- sksurv/datasets/data/veteran.arff +148 -0
- sksurv/datasets/data/whas500.arff +520 -0
- sksurv/ensemble/__init__.py +2 -0
- sksurv/ensemble/_coxph_loss.cpython-313-darwin.so +0 -0
- sksurv/ensemble/boosting.py +1610 -0
- sksurv/ensemble/forest.py +947 -0
- sksurv/ensemble/survival_loss.py +151 -0
- sksurv/exceptions.py +18 -0
- sksurv/functions.py +114 -0
- sksurv/io/__init__.py +2 -0
- sksurv/io/arffread.py +58 -0
- sksurv/io/arffwrite.py +145 -0
- sksurv/kernels/__init__.py +1 -0
- sksurv/kernels/_clinical_kernel.cpython-313-darwin.so +0 -0
- sksurv/kernels/clinical.py +328 -0
- sksurv/linear_model/__init__.py +3 -0
- sksurv/linear_model/_coxnet.cpython-313-darwin.so +0 -0
- sksurv/linear_model/aft.py +205 -0
- sksurv/linear_model/coxnet.py +543 -0
- sksurv/linear_model/coxph.py +618 -0
- sksurv/meta/__init__.py +4 -0
- sksurv/meta/base.py +35 -0
- sksurv/meta/ensemble_selection.py +642 -0
- sksurv/meta/stacking.py +349 -0
- sksurv/metrics.py +996 -0
- sksurv/nonparametric.py +588 -0
- sksurv/preprocessing.py +155 -0
- sksurv/svm/__init__.py +11 -0
- sksurv/svm/_minlip.cpython-313-darwin.so +0 -0
- sksurv/svm/_prsvm.cpython-313-darwin.so +0 -0
- sksurv/svm/minlip.py +606 -0
- sksurv/svm/naive_survival_svm.py +221 -0
- sksurv/svm/survival_svm.py +1228 -0
- sksurv/testing.py +108 -0
- sksurv/tree/__init__.py +1 -0
- sksurv/tree/_criterion.cpython-313-darwin.so +0 -0
- sksurv/tree/tree.py +703 -0
- sksurv/util.py +333 -0

sksurv/meta/ensemble_selection.py
@@ -0,0 +1,642 @@
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
import numbers

from joblib import Parallel, delayed
import numpy as np
from scipy.stats import kendalltau, rankdata, spearmanr
from sklearn.base import BaseEstimator, clone
from sklearn.model_selection import check_cv
from sklearn.utils._param_validation import Interval, StrOptions

from .base import _fit_and_score
from .stacking import Stacking

__all__ = ["EnsembleSelection", "EnsembleSelectionRegressor", "MeanEstimator"]

def _corr_kendalltau(X):
    n_variables = X.shape[1]
    mat = np.empty((n_variables, n_variables), dtype=float)
    for i in range(n_variables):
        for j in range(i):
            v = kendalltau(X[:, i], X[:, j]).correlation
            mat[i, j] = v
            mat[j, i] = v
    return mat


class EnsembleAverage(BaseEstimator):
    def __init__(self, base_estimators, name=None):
        self.base_estimators = base_estimators
        self.name = name
        assert not hasattr(self.base_estimators[0], "classes_"), "base estimator cannot be a classifier"

    def get_base_params(self):
        return self.base_estimators[0].get_params()

    def fit(self, X, y=None, **kwargs):  # pragma: no cover; # pylint: disable=unused-argument
        return self

    def predict(self, X):
        prediction = np.zeros(X.shape[0])
        for est in self.base_estimators:
            prediction += est.predict(X)

        return prediction / len(self.base_estimators)


class MeanEstimator(BaseEstimator):
    def fit(self, X, y=None, **kwargs):  # pragma: no cover; # pylint: disable=unused-argument
        return self

    def predict(self, X):  # pylint: disable=no-self-use
        return X.mean(axis=X.ndim - 1)


class MeanRankEstimator(BaseEstimator):
    def fit(self, X, y=None, **kwargs):  # pragma: no cover; # pylint: disable=unused-argument
        return self

    def predict(self, X):  # pylint: disable=no-self-use
        # convert predictions of individual models into ranks
        ranks = np.apply_along_axis(rankdata, 0, X)
        # average predicted ranks
        return ranks.mean(axis=X.ndim - 1)


def _fit_and_score_fold(est, x, y, scorer, train_index, test_index, fit_params, idx, fold):
    score = _fit_and_score(est, x, y, scorer, train_index, test_index, est.get_params(), fit_params, {})
    return idx, fold, score, est


def _predict(estimator, X, idx):
    return idx, estimator.predict(X)


def _score_regressor(estimator, X, y, idx):
    name_time = y.dtype.names[1]
    error = (estimator.predict(X).ravel() - y[name_time]) ** 2
    return idx, error

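# Editorial illustration (not part of the original file): how MeanRankEstimator
# combines base model predictions. Given a prediction matrix with one column per
# model, each column is converted to ranks and the ranks are averaged per sample:
#
#   X = np.array([[0.1, 2.0],
#                 [0.5, 1.0],
#                 [0.9, 3.0]])
#   MeanRankEstimator().predict(X)  # -> array([1.5, 1.5, 3.0])
#
# Per column the ranks are (1, 2, 3) and (2, 1, 3), so the row-wise averages are
# 1.5, 1.5, and 3.0.
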
class BaseEnsembleSelection(Stacking):
    _parameter_constraints = {
        **Stacking._parameter_constraints,
        "scorer": [callable],
        "n_estimators": [
            Interval(numbers.Integral, 1, None, closed="left"),
            Interval(numbers.Real, 0.0, 1.0, closed="right"),
        ],
        "min_score": [numbers.Real],
        "correlation": [StrOptions({"pearson", "kendall", "spearman"})],
        "min_correlation": [Interval(numbers.Real, -1, 1, closed="both")],
        "cv": ["cv_object"],
        "n_jobs": [Interval(numbers.Integral, 1, None, closed="left")],
        "verbose": ["verbose"],
    }
    _parameter_constraints.pop("probabilities")

    def __init__(
        self,
        meta_estimator,
        base_estimators,
        scorer=None,
        n_estimators=0.2,
        min_score=0.66,
        correlation="pearson",
        min_correlation=0.6,
        cv=None,
        n_jobs=1,
        verbose=0,
    ):
        super().__init__(meta_estimator=meta_estimator, base_estimators=base_estimators)

        self.scorer = scorer
        self.n_estimators = n_estimators
        self.min_score = min_score
        self.correlation = correlation
        self.min_correlation = min_correlation
        self.cv = cv
        self.n_jobs = n_jobs
        self.verbose = verbose

        self._extra_params.extend(["scorer", "n_estimators", "min_score", "min_correlation", "cv", "n_jobs", "verbose"])

    def __len__(self):
        if hasattr(self, "fitted_models_"):
            return len(self.fitted_models_)
        return 0

    def _check_params(self):
        self._validate_params()

        if self.n_estimators > len(self.base_estimators):
            raise ValueError(
                f"n_estimators ({self.n_estimators}) must not exceed"
                f" number of base learners ({len(self.base_estimators)})"
            )

        if isinstance(self.n_estimators, numbers.Integral):
            self.n_estimators_ = self.n_estimators
        else:
            self.n_estimators_ = max(int(self.n_estimators * len(self.base_estimators)), 1)

        if self.correlation == "pearson":
            self._corr_func = lambda x: np.corrcoef(x, rowvar=0)
        elif self.correlation == "kendall":
            self._corr_func = _corr_kendalltau
        elif self.correlation == "spearman":
            self._corr_func = lambda x: spearmanr(x, axis=0).correlation

    def _create_base_ensemble(self, out, n_estimators, n_folds):
        """For each base estimator collect models trained on each fold"""
        if hasattr(self, "feature_names_in_"):
            # Delete the attribute when the estimator is fitted on a new dataset
            # that has no feature names.
            delattr(self, "feature_names_in_")

        ensemble_scores = np.empty((n_estimators, n_folds))
        base_ensemble = np.empty_like(ensemble_scores, dtype=object)
        for model, fold, score, est in out:
            ensemble_scores[model, fold] = score
            base_ensemble[model, fold] = est

        if hasattr(est, "n_features_in_"):
            self.n_features_in_ = est.n_features_in_
        if hasattr(est, "feature_names_in_"):
            self.feature_names_in_ = est.feature_names_in_

        self.final_estimator_ = self.meta_estimator

        return ensemble_scores, base_ensemble

    def _create_cv_ensemble(self, base_ensemble, idx_models_included, model_names=None):
        """For each selected base estimator, average models trained on each fold"""
        fitted_models = np.empty(len(idx_models_included), dtype=object)
        for i, idx in enumerate(idx_models_included):
            model_name = self.base_estimators[idx][0] if model_names is None else model_names[idx]
            avg_model = EnsembleAverage(base_ensemble[idx, :], name=model_name)
            fitted_models[i] = avg_model

        return fitted_models

    def _get_base_estimators(self, X):
        """Takes special care of estimators using a custom kernel function

        Parameters
        ----------
        X : array, shape = (n_samples, n_features)
            Samples to pre-compute kernel matrix from.

        Returns
        -------
        base_estimators : list
            Same as `self.base_estimators`, except that estimators with a custom kernel function
            use ``kernel='precomputed'``.

        kernel_cache : dict
            Maps estimator name to kernel matrix. Use this for cross-validation instead of `X`.
        """
        base_estimators = []

        kernel_cache = {}
        kernel_fns = {}
        for i, (name, estimator) in enumerate(self.base_estimators):
            if hasattr(estimator, "kernel") and callable(estimator.kernel):
                if not hasattr(estimator, "_get_kernel"):
                    raise ValueError(
                        f"estimator {name} uses a custom kernel function, but does not have a _get_kernel method"
                    )

                kernel_mat = kernel_fns.get(estimator.kernel, None)
                if kernel_mat is None:
                    kernel_mat = estimator._get_kernel(X)
                    kernel_cache[i] = kernel_mat
                    kernel_fns[estimator.kernel] = kernel_mat

                kernel_cache[i] = kernel_mat

                # We precompute kernel, but only for training, for testing use original custom kernel function
                kernel_estimator = clone(estimator)
                kernel_estimator.set_params(kernel="precomputed")
                base_estimators.append((name, kernel_estimator))
            else:
                base_estimators.append((name, estimator))

        return base_estimators, kernel_cache

    def _restore_base_estimators(self, kernel_cache, out, X, cv):
        """Restore custom kernel functions of estimators for predictions"""
        train_folds = {fold: train_index for fold, (train_index, _) in enumerate(cv)}

        for idx, fold, _, est in out:
            if idx in kernel_cache:
                if not hasattr(est, "fit_X_"):
                    raise ValueError(
                        f"estimator {self.base_estimators[idx][0]} uses a custom kernel function, "
                        "but does not have the attribute `fit_X_` after training"
                    )

                est.set_params(kernel=self.base_estimators[idx][1].kernel)
                est.fit_X_ = X[train_folds[fold]]

        return out

    def _fit_and_score_ensemble(self, X, y, cv, **fit_params):
        """Create a cross-validated model by training a model for each fold with the same model parameters"""
        fit_params_steps = self._split_fit_params(fit_params)

        folds = list(cv.split(X, y))

        # Take care of custom kernel functions
        base_estimators, kernel_cache = self._get_base_estimators(X)

        out = Parallel(n_jobs=self.n_jobs, verbose=self.verbose)(
            delayed(_fit_and_score_fold)(
                clone(estimator),
                X if i not in kernel_cache else kernel_cache[i],
                y,
                self.scorer,
                train_index,
                test_index,
                fit_params_steps[name],
                i,
                fold,
            )
            for i, (name, estimator) in enumerate(base_estimators)
            for fold, (train_index, test_index) in enumerate(folds)
        )

        if len(kernel_cache) > 0:
            out = self._restore_base_estimators(kernel_cache, out, X, folds)

        return self._create_base_ensemble(out, len(base_estimators), len(folds))

    def _add_diversity_score(self, scores, predictions):
        n_models = predictions.shape[1]

        cor = self._corr_func(predictions)
        assert cor.shape == (n_models, n_models)
        np.fill_diagonal(cor, 0)

        final_scores = scores.copy()
        diversity = np.apply_along_axis(lambda x: (n_models - np.sum(x >= self.min_correlation)) / n_models, 0, cor)

        final_scores += diversity
        return final_scores

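    # Editorial illustration (not part of the original file): the diversity term added
    # by _add_diversity_score. With min_correlation=0.6 and three models whose pairwise
    # prediction correlations (diagonal already zeroed) are
    #
    #   cor = [[0.0, 0.9, 0.2],
    #          [0.9, 0.0, 0.1],
    #          [0.2, 0.1, 0.0]]
    #
    # models 0 and 1 each have one partner with correlation >= 0.6, so their diversity is
    # (3 - 1) / 3 = 0.667, while model 2 has none and receives (3 - 0) / 3 = 1.0. These
    # values are added to the accuracy scores before the top n_estimators_ models are kept.
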
    def _fit(self, X, y, cv, **fit_params):
        raise NotImplementedError()

    def fit(self, X, y=None, **fit_params):
        """Fit ensemble of models

        Parameters
        ----------
        X : array-like, shape = (n_samples, n_features)
            Training data.

        y : array-like, optional
            Target data if base estimators are supervised.

        Returns
        -------
        self
        """
        self._check_params()

        cv = check_cv(self.cv, X)
        self._fit(X, y, cv, **fit_params)

        return self


class EnsembleSelection(BaseEnsembleSelection):
    """Ensemble selection for survival analysis that accounts for a score and correlations between predictions.

    During training, the ensemble is pruned only according to the specified score (accuracy);
    for prediction, it is additionally pruned according to the correlation between predictions (diversity).

    The hillclimbing is based on cross-validation to avoid having to create a separate validation set.

    See [1]_, [2]_, [3]_ for further description.

    Parameters
    ----------
    base_estimators : list
        List of (name, estimator) tuples (implementing fit/predict) that are
        part of the ensemble.

    scorer : callable
        Function with signature ``func(estimator, X_test, y_test, **test_predict_params)`` that evaluates the error
        of the prediction on the test data. The function should return a scalar value.
        *Larger* values of the score are assumed to be better.

    n_estimators : float or int, optional, default: 0.2
        If a float, the fraction of estimators in the ensemble to retain; if an int, the
        absolute number of estimators to retain.

    min_score : float, optional, default: 0.66
        Threshold for pruning estimators based on the scoring metric. After `fit`, only estimators
        with a score above `min_score` are retained.

    min_correlation : float, optional, default: 0.6
        Threshold for Pearson's correlation coefficient that determines when predictions of
        two estimators are significantly correlated.

    cv : int, a cv generator instance, or None, optional
        The input specifying which cv generator to use. It can be an
        integer, in which case it is the number of folds in a KFold,
        None, in which case 3-fold cross-validation is used, or another object, that
        will then be used as a cv generator. The generator has to ensure
        that each sample is only used once for testing.

    n_jobs : int, optional, default: 1
        Number of jobs to run in parallel.

    verbose : int, optional, default: 0
        Controls the verbosity: the higher, the more messages.

    Attributes
    ----------
    scores_ : ndarray, shape = (n_base_estimators,)
        Array of scores (relative to the best performing estimator).

    fitted_models_ : ndarray
        Models selected during training based on `scorer`.

    n_features_in_ : int
        Number of features seen during ``fit``.

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during ``fit``. Defined only when `X`
        has feature names that are all strings.

    References
    ----------

    .. [1] Pölsterl, S., Gupta, P., Wang, L., Conjeti, S., Katouzian, A., and Navab, N.,
           "Heterogeneous ensembles for predicting survival of metastatic, castrate-resistant prostate cancer patients".
           F1000Research, vol. 5, no. 2676, 2016

    .. [2] Caruana, R., Munson, A., Niculescu-Mizil, A.
           "Getting the most out of ensemble selection". 6th IEEE International Conference on Data Mining, 828-833, 2006

    .. [3] Rooney, N., Patterson, D., Anand, S., Tsymbal, A.
           "Dynamic integration of regression models". International Workshop on Multiple Classifier Systems,
           Lecture Notes in Computer Science, vol. 3181, 164-173, 2004
    """

    _parameter_constraints = {
        **BaseEnsembleSelection._parameter_constraints,
    }
    _parameter_constraints.pop("meta_estimator")

    def __init__(
        self,
        base_estimators,
        *,
        scorer=None,
        n_estimators=0.2,
        min_score=0.2,
        correlation="pearson",
        min_correlation=0.6,
        cv=None,
        n_jobs=1,
        verbose=0,
    ):
        super().__init__(
            meta_estimator=MeanRankEstimator(),
            base_estimators=base_estimators,
            scorer=scorer,
            n_estimators=n_estimators,
            min_score=min_score,
            correlation=correlation,
            min_correlation=min_correlation,
            cv=cv,
            n_jobs=n_jobs,
            verbose=verbose,
        )

    def _fit(self, X, y, cv, **fit_params):
        scores, base_ensemble = self._fit_and_score_ensemble(X, y, cv, **fit_params)
        self.fitted_models_, self.scores_ = self._prune_by_cv_score(scores, base_ensemble)

    def _prune_by_cv_score(self, scores, base_ensemble, model_names=None):
        mean_scores = scores.mean(axis=1)
        idx_good_models = np.flatnonzero(mean_scores >= self.min_score)
        if len(idx_good_models) == 0:
            raise ValueError("no base estimator exceeds min_score, try decreasing it")

        total_score = mean_scores[idx_good_models]
        max_score = total_score.max()
        total_score /= max_score

        fitted_models = self._create_cv_ensemble(base_ensemble, idx_good_models, model_names)

        return fitted_models, total_score

    def _prune_by_correlation(self, X):
        n_models = len(self.fitted_models_)

        out = Parallel(n_jobs=self.n_jobs, verbose=self.verbose)(
            delayed(_predict)(est, X, i) for i, est in enumerate(self.fitted_models_)
        )

        predictions = np.empty((X.shape[0], n_models), order="F")
        for i, p in out:
            predictions[:, i] = p

        if n_models > self.n_estimators_:
            final_scores = self._add_diversity_score(self.scores_, predictions)
            sorted_idx = np.argsort(-final_scores, kind="mergesort")

            selected_models = sorted_idx[: self.n_estimators_]
            return predictions[:, selected_models]

        return predictions

    def _predict_estimators(self, X):
        predictions = self._prune_by_correlation(X)
        return predictions


class EnsembleSelectionRegressor(BaseEnsembleSelection):
    """Ensemble selection for regression that accounts for the accuracy and correlation of errors.

    The ensemble is pruned during training according to estimators' accuracy and the correlation
    between prediction errors per sample. The accuracy of the *i*-th estimator is defined as
    :math:`\\frac{ \\min_{i=1,\\ldots, n}(error_i) }{ error_i }`.
    In addition to the accuracy, models are selected based on the correlation between residuals
    of different models (diversity). The diversity of the *i*-th estimator is defined as
    :math:`\\frac{n-count}{n}`, where *count* is the number of estimators for which the correlation
    of residuals exceeds `min_correlation`.

    The hillclimbing is based on cross-validation to avoid having to create a separate validation set.

    See [1]_, [2]_, [3]_ for further description.

    Parameters
    ----------
    base_estimators : list
        List of (name, estimator) tuples (implementing fit/predict) that are
        part of the ensemble.

    scorer : callable
        Function with signature ``func(estimator, X_test, y_test, **test_predict_params)`` that evaluates the error
        of the prediction on the test data. The function should return a scalar value.
        *Smaller* values of the score are assumed to be better.

    n_estimators : float or int, optional, default: 0.2
        If a float, the fraction of estimators in the ensemble to retain; if an int, the
        absolute number of estimators to retain.

    min_score : float, optional, default: 0.66
        Threshold for pruning estimators based on the scoring metric. After `fit`, only estimators
        with an accuracy above `min_score` are retained.

    min_correlation : float, optional, default: 0.6
        Threshold for Pearson's correlation coefficient that determines when residuals of
        two estimators are significantly correlated.

    cv : int, a cv generator instance, or None, optional
        The input specifying which cv generator to use. It can be an
        integer, in which case it is the number of folds in a KFold,
        None, in which case 3-fold cross-validation is used, or another object, that
        will then be used as a cv generator. The generator has to ensure
        that each sample is only used once for testing.

    n_jobs : int, optional, default: 1
        Number of jobs to run in parallel.

    verbose : int, optional, default: 0
        Controls the verbosity: the higher, the more messages.

    Attributes
    ----------
    scores_ : ndarray, shape = (n_base_estimators,)
        Array of scores (relative to the best performing estimator).

    fitted_models_ : ndarray
        Models selected during training based on `scorer`.

    n_features_in_ : int
        Number of features seen during ``fit``.

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during ``fit``. Defined only when `X`
        has feature names that are all strings.

    References
    ----------

    .. [1] Pölsterl, S., Gupta, P., Wang, L., Conjeti, S., Katouzian, A., and Navab, N.,
           "Heterogeneous ensembles for predicting survival of metastatic, castrate-resistant prostate cancer patients".
           F1000Research, vol. 5, no. 2676, 2016

    .. [2] Caruana, R., Munson, A., Niculescu-Mizil, A.
           "Getting the most out of ensemble selection". 6th IEEE International Conference on Data Mining, 828-833, 2006

    .. [3] Rooney, N., Patterson, D., Anand, S., Tsymbal, A.
           "Dynamic integration of regression models". International Workshop on Multiple Classifier Systems,
           Lecture Notes in Computer Science, vol. 3181, 164-173, 2004
    """

    _parameter_constraints = {
        **BaseEnsembleSelection._parameter_constraints,
    }
    _parameter_constraints.pop("meta_estimator")

    def __init__(
        self,
        base_estimators,
        *,
        scorer=None,
        n_estimators=0.2,
        min_score=0.66,
        correlation="pearson",
        min_correlation=0.6,
        cv=None,
        n_jobs=1,
        verbose=0,
    ):
        super().__init__(
            meta_estimator=MeanEstimator(),
            base_estimators=base_estimators,
            scorer=scorer,
            n_estimators=n_estimators,
            min_score=min_score,
            correlation=correlation,
            min_correlation=min_correlation,
            cv=cv,
            n_jobs=n_jobs,
            verbose=verbose,
        )

    @property
    def _predict_risk_score(self):
        return False

    def _fit(self, X, y, cv, **fit_params):
        scores, base_ensemble = self._fit_and_score_ensemble(X, y, cv, **fit_params)
        fitted_models, scores = self._prune_by_cv_score(scores, base_ensemble)

        if len(fitted_models) > self.n_estimators_:
            fitted_models, scores = self._prune_by_correlation(fitted_models, scores, X, y)

        self.fitted_models_ = fitted_models
        self.scores_ = scores

    def _prune_by_cv_score(self, scores, base_ensemble, model_names=None):
        mean_scores = scores.mean(axis=1)
        mean_scores = mean_scores.min() / mean_scores

        idx_good_models = np.flatnonzero(mean_scores >= self.min_score)
        if len(idx_good_models) == 0:
            raise ValueError("no base estimator exceeds min_score, try decreasing it")

        fitted_models = self._create_cv_ensemble(base_ensemble, idx_good_models, model_names)

        return fitted_models, mean_scores[idx_good_models]

    def _prune_by_correlation(self, fitted_models, scores, X, y):
        n_models = len(fitted_models)

        out = Parallel(n_jobs=self.n_jobs, verbose=self.verbose)(
            delayed(_score_regressor)(est, X, y, i) for i, est in enumerate(fitted_models)
        )

        error = np.empty((X.shape[0], n_models), order="F")
        for i, err in out:
            error[:, i] = err

        final_scores = self._add_diversity_score(scores, error)
        sorted_idx = np.argsort(-final_scores, kind="mergesort")

        selected_models = sorted_idx[: self.n_estimators_]

        return fitted_models[selected_models], final_scores

    def _predict_estimators(self, X):
        n_models = len(self.fitted_models_)

        out = Parallel(n_jobs=self.n_jobs, verbose=self.verbose)(
            delayed(_predict)(est, X, i) for i, est in enumerate(self.fitted_models_)
        )

        predictions = np.empty((X.shape[0], n_models), order="F")
        for i, p in out:
            predictions[:, i] = p

        return predictions
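
The file above only ships the estimator classes; for context, the sketch below shows one way they might be wired together. It is not part of the wheel: the choice of base estimators, the concordance-based scorer, the whas500 dataset, and the specific parameter values are illustrative assumptions, not a prescribed recipe.

# Illustrative usage sketch only -- not part of the packaged module.
from sksurv.column import encode_categorical
from sksurv.datasets import load_whas500
from sksurv.ensemble import GradientBoostingSurvivalAnalysis
from sksurv.linear_model import CoxPHSurvivalAnalysis
from sksurv.meta import EnsembleSelection
from sksurv.metrics import concordance_index_censored


def score_cindex(estimator, X_test, y_test, **predict_params):
    # scorer with the signature EnsembleSelection expects; larger values are better
    event, time = y_test.dtype.names
    prediction = estimator.predict(X_test)
    return concordance_index_censored(y_test[event], y_test[time], prediction)[0]


X, y = load_whas500()
X = encode_categorical(X)  # one-hot encode categorical columns

base_estimators = [
    ("coxph", CoxPHSurvivalAnalysis(alpha=0.1)),
    ("gbsa", GradientBoostingSurvivalAnalysis(n_estimators=100, random_state=0)),
]

model = EnsembleSelection(
    base_estimators,
    scorer=score_cindex,
    n_estimators=1,   # keep the single best model after diversity-aware pruning
    min_score=0.6,    # drop base models whose cross-validated c-index falls below 0.6
    cv=3,
    n_jobs=1,
)
model.fit(X.values, y)
risk_scores = model.predict(X.values)

Passing plain NumPy arrays (``X.values``) keeps the fold indexing inside the cross-validation loop straightforward; whether a DataFrame works as-is depends on the base estimators, so this sketch sidesteps the question.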