scikit-survival 0.26.0__cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scikit_survival-0.26.0.dist-info/METADATA +185 -0
- scikit_survival-0.26.0.dist-info/RECORD +58 -0
- scikit_survival-0.26.0.dist-info/WHEEL +6 -0
- scikit_survival-0.26.0.dist-info/licenses/COPYING +674 -0
- scikit_survival-0.26.0.dist-info/top_level.txt +1 -0
- sksurv/__init__.py +183 -0
- sksurv/base.py +115 -0
- sksurv/bintrees/__init__.py +15 -0
- sksurv/bintrees/_binarytrees.cpython-311-x86_64-linux-gnu.so +0 -0
- sksurv/column.py +204 -0
- sksurv/compare.py +123 -0
- sksurv/datasets/__init__.py +12 -0
- sksurv/datasets/base.py +614 -0
- sksurv/datasets/data/GBSG2.arff +700 -0
- sksurv/datasets/data/actg320.arff +1169 -0
- sksurv/datasets/data/bmt.arff +46 -0
- sksurv/datasets/data/breast_cancer_GSE7390-metastasis.arff +283 -0
- sksurv/datasets/data/cgvhd.arff +118 -0
- sksurv/datasets/data/flchain.arff +7887 -0
- sksurv/datasets/data/veteran.arff +148 -0
- sksurv/datasets/data/whas500.arff +520 -0
- sksurv/docstrings.py +99 -0
- sksurv/ensemble/__init__.py +2 -0
- sksurv/ensemble/_coxph_loss.cpython-311-x86_64-linux-gnu.so +0 -0
- sksurv/ensemble/boosting.py +1564 -0
- sksurv/ensemble/forest.py +902 -0
- sksurv/ensemble/survival_loss.py +151 -0
- sksurv/exceptions.py +18 -0
- sksurv/functions.py +114 -0
- sksurv/io/__init__.py +2 -0
- sksurv/io/arffread.py +91 -0
- sksurv/io/arffwrite.py +181 -0
- sksurv/kernels/__init__.py +1 -0
- sksurv/kernels/_clinical_kernel.cpython-311-x86_64-linux-gnu.so +0 -0
- sksurv/kernels/clinical.py +348 -0
- sksurv/linear_model/__init__.py +3 -0
- sksurv/linear_model/_coxnet.cpython-311-x86_64-linux-gnu.so +0 -0
- sksurv/linear_model/aft.py +208 -0
- sksurv/linear_model/coxnet.py +592 -0
- sksurv/linear_model/coxph.py +637 -0
- sksurv/meta/__init__.py +4 -0
- sksurv/meta/base.py +35 -0
- sksurv/meta/ensemble_selection.py +724 -0
- sksurv/meta/stacking.py +370 -0
- sksurv/metrics.py +1028 -0
- sksurv/nonparametric.py +911 -0
- sksurv/preprocessing.py +195 -0
- sksurv/svm/__init__.py +11 -0
- sksurv/svm/_minlip.cpython-311-x86_64-linux-gnu.so +0 -0
- sksurv/svm/_prsvm.cpython-311-x86_64-linux-gnu.so +0 -0
- sksurv/svm/minlip.py +695 -0
- sksurv/svm/naive_survival_svm.py +249 -0
- sksurv/svm/survival_svm.py +1236 -0
- sksurv/testing.py +155 -0
- sksurv/tree/__init__.py +1 -0
- sksurv/tree/_criterion.cpython-311-x86_64-linux-gnu.so +0 -0
- sksurv/tree/tree.py +790 -0
- sksurv/util.py +416 -0
sksurv/meta/ensemble_selection.py
@@ -0,0 +1,724 @@
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
import numbers

from joblib import Parallel, delayed
import numpy as np
from scipy.stats import kendalltau, rankdata, spearmanr
from sklearn.base import BaseEstimator, clone
from sklearn.model_selection import check_cv
from sklearn.utils._param_validation import Interval, StrOptions

from .base import _fit_and_score
from .stacking import Stacking

__all__ = ["EnsembleSelection", "EnsembleSelectionRegressor", "MeanEstimator"]


def _corr_kendalltau(X):
    n_variables = X.shape[1]
    mat = np.empty((n_variables, n_variables), dtype=float)
    for i in range(n_variables):
        for j in range(i):
            v = kendalltau(X[:, i], X[:, j]).correlation
            mat[i, j] = v
            mat[j, i] = v
    return mat


class EnsembleAverage(BaseEstimator):
    """A meta-estimator that averages the predictions of base estimators.

    This estimator is for internal use by :class:`BaseEnsembleSelection`.
    It takes a list of estimators that have already been fitted and
    averages their predictions.

    Parameters
    ----------
    base_estimators : list of estimators
        The base estimators to average. The estimators must be fitted.

    name : str, optional, default: None
        The name of the ensemble.
    """

    def __init__(self, base_estimators, name=None):
        self.base_estimators = base_estimators
        self.name = name
        assert not hasattr(self.base_estimators[0], "classes_"), "base estimator cannot be a classifier"

    def get_base_params(self):
        """Get parameters for this estimator's first base estimator.

        Returns
        -------
        params : dict
            Parameter names mapped to their values.
        """
        return self.base_estimators[0].get_params()

    def fit(self, X, y=None, **kwargs):  # pragma: no cover; # pylint: disable=unused-argument
        return self

    def predict(self, X):
        """Predict using the ensemble of estimators.

        The prediction is the average of the predictions of all base
        estimators.

        Parameters
        ----------
        X : array-like, shape = (n_samples, n_features)
            Data to predict on.

        Returns
        -------
        y_pred : ndarray, shape = (n_samples,)
            The predicted values.
        """
        prediction = np.zeros(X.shape[0])
        for est in self.base_estimators:
            prediction += est.predict(X)

        return prediction / len(self.base_estimators)


class MeanEstimator(BaseEstimator):
    """A meta-estimator that averages predictions.

    This estimator computes the mean of an array along its last axis.
    It is intended to be used as a ``meta_estimator`` in an ensemble model,
    where it averages the predictions of the base estimators.
    """

    def fit(self, X, y=None, **kwargs):  # pragma: no cover; # pylint: disable=unused-argument
        return self

    def predict(self, X):  # pylint: disable=no-self-use
        """Return the mean of an array along its last axis.

        Parameters
        ----------
        X : array-like, shape = (n_samples, n_estimators)
            The predictions of base estimators.

        Returns
        -------
        y_pred : ndarray, shape = (n_samples,)
            The averaged predictions.
        """
        return X.mean(axis=X.ndim - 1)


class MeanRankEstimator(BaseEstimator):
    """A meta-estimator that averages the ranks of predictions of base estimators.

    This estimator first converts the predictions of each base estimator
    into ranks and then averages the ranks. It is intended to be used as
    a ``meta_estimator`` in an ensemble model.
    """

    def fit(self, X, y=None, **kwargs):  # pragma: no cover; # pylint: disable=unused-argument
        return self

    def predict(self, X):  # pylint: disable=no-self-use
        """Return the mean of ranks.

        The predictions of each base estimator are first converted into
        ranks and then averaged.

        Parameters
        ----------
        X : array-like, shape = (n_samples, n_estimators)
            The predictions of base estimators.

        Returns
        -------
        y_pred : ndarray, shape = (n_samples,)
            The averaged ranks.
        """
        # convert predictions of individual models into ranks
        ranks = np.apply_along_axis(rankdata, 0, X)
        # average predicted ranks
        return ranks.mean(axis=X.ndim - 1)


def _fit_and_score_fold(est, x, y, scorer, train_index, test_index, fit_params, idx, fold):
    score = _fit_and_score(est, x, y, scorer, train_index, test_index, est.get_params(), fit_params, {})
    return idx, fold, score, est


def _predict(estimator, X, idx):
    return idx, estimator.predict(X)


def _score_regressor(estimator, X, y, idx):
    name_time = y.dtype.names[1]
    error = (estimator.predict(X).ravel() - y[name_time]) ** 2
    return idx, error


class BaseEnsembleSelection(Stacking):
    _parameter_constraints = {
        **Stacking._parameter_constraints,
        "scorer": [callable],
        "n_estimators": [
            Interval(numbers.Integral, 1, None, closed="left"),
            Interval(numbers.Real, 0.0, 1.0, closed="right"),
        ],
        "min_score": [numbers.Real],
        "correlation": [StrOptions({"pearson", "kendall", "spearman"})],
        "min_correlation": [Interval(numbers.Real, -1, 1, closed="both")],
        "cv": ["cv_object"],
        "n_jobs": [Interval(numbers.Integral, 1, None, closed="left")],
        "verbose": ["verbose"],
    }
    _parameter_constraints.pop("probabilities")

    def __init__(
        self,
        meta_estimator,
        base_estimators,
        scorer=None,
        n_estimators=0.2,
        min_score=0.66,
        correlation="pearson",
        min_correlation=0.6,
        cv=None,
        n_jobs=1,
        verbose=0,
    ):
        super().__init__(meta_estimator=meta_estimator, base_estimators=base_estimators)

        self.scorer = scorer
        self.n_estimators = n_estimators
        self.min_score = min_score
        self.correlation = correlation
        self.min_correlation = min_correlation
        self.cv = cv
        self.n_jobs = n_jobs
        self.verbose = verbose

        self._extra_params.extend(["scorer", "n_estimators", "min_score", "min_correlation", "cv", "n_jobs", "verbose"])

    def __len__(self):
        """Return the number of fitted models."""
        if hasattr(self, "fitted_models_"):
            return len(self.fitted_models_)
        return 0

    def _check_params(self):
        self._validate_params()

        if self.n_estimators > len(self.base_estimators):
            raise ValueError(
                f"n_estimators ({self.n_estimators}) must not exceed"
                f" number of base learners ({len(self.base_estimators)})"
            )

        if isinstance(self.n_estimators, numbers.Integral):
            self.n_estimators_ = self.n_estimators
        else:
            self.n_estimators_ = max(int(self.n_estimators * len(self.base_estimators)), 1)

        if self.correlation == "pearson":
            self._corr_func = lambda x: np.corrcoef(x, rowvar=0)
        elif self.correlation == "kendall":
            self._corr_func = _corr_kendalltau
        elif self.correlation == "spearman":
            self._corr_func = lambda x: spearmanr(x, axis=0).correlation

    def _create_base_ensemble(self, out, n_estimators, n_folds):
        """For each base estimator collect models trained on each fold"""
        if hasattr(self, "feature_names_in_"):
            # Delete the attribute when the estimator is fitted on a new dataset
            # that has no feature names.
            delattr(self, "feature_names_in_")

        ensemble_scores = np.empty((n_estimators, n_folds))
        base_ensemble = np.empty_like(ensemble_scores, dtype=object)
        for model, fold, score, est in out:
            ensemble_scores[model, fold] = score
            base_ensemble[model, fold] = est

            if hasattr(est, "n_features_in_"):
                self.n_features_in_ = est.n_features_in_
            if hasattr(est, "feature_names_in_"):
                self.feature_names_in_ = est.feature_names_in_

        self.final_estimator_ = self.meta_estimator

        return ensemble_scores, base_ensemble

    def _create_cv_ensemble(self, base_ensemble, idx_models_included, model_names=None):
        """For each selected base estimator, average models trained on each fold"""
        fitted_models = np.empty(len(idx_models_included), dtype=object)
        for i, idx in enumerate(idx_models_included):
            model_name = self.base_estimators[idx][0] if model_names is None else model_names[idx]
            avg_model = EnsembleAverage(base_ensemble[idx, :], name=model_name)
            fitted_models[i] = avg_model

        return fitted_models

    def _get_base_estimators(self, X):
        """Takes special care of estimators using custom kernel function

        Parameters
        ----------
        X : array, shape = (n_samples, n_features)
            Samples to pre-compute kernel matrix from.

        Returns
        -------
        base_estimators : list
            Same as `self.base_estimators`, except that estimators with custom kernel function
            use ``kernel='precomputed'``.

        kernel_cache : dict
            Maps estimator name to kernel matrix. Use this for cross-validation instead of `X`.
        """
        base_estimators = []

        kernel_cache = {}
        kernel_fns = {}
        for i, (name, estimator) in enumerate(self.base_estimators):
            if hasattr(estimator, "kernel") and callable(estimator.kernel):
                if not hasattr(estimator, "_get_kernel"):
                    raise ValueError(
                        f"estimator {name} uses a custom kernel function, but does not have a _get_kernel method"
                    )

                kernel_mat = kernel_fns.get(estimator.kernel, None)
                if kernel_mat is None:
                    kernel_mat = estimator._get_kernel(X)
                    kernel_cache[i] = kernel_mat
                    kernel_fns[estimator.kernel] = kernel_mat

                kernel_cache[i] = kernel_mat

                # We precompute kernel, but only for training, for testing use original custom kernel function
                kernel_estimator = clone(estimator)
                kernel_estimator.set_params(kernel="precomputed")
                base_estimators.append((name, kernel_estimator))
            else:
                base_estimators.append((name, estimator))

        return base_estimators, kernel_cache

    def _restore_base_estimators(self, kernel_cache, out, X, cv):
        """Restore custom kernel functions of estimators for predictions"""
        train_folds = {fold: train_index for fold, (train_index, _) in enumerate(cv)}

        for idx, fold, _, est in out:
            if idx in kernel_cache:
                if not hasattr(est, "fit_X_"):
                    raise ValueError(
                        f"estimator {self.base_estimators[idx][0]} uses a custom kernel function, "
                        "but does not have the attribute `fit_X_` after training"
                    )

                est.set_params(kernel=self.base_estimators[idx][1].kernel)
                est.fit_X_ = X[train_folds[fold]]

        return out

    def _fit_and_score_ensemble(self, X, y, cv, **fit_params):
        """Create a cross-validated model by training a model for each fold with the same model parameters"""
        fit_params_steps = self._split_fit_params(fit_params)

        folds = list(cv.split(X, y))

        # Take care of custom kernel functions
        base_estimators, kernel_cache = self._get_base_estimators(X)

        out = Parallel(n_jobs=self.n_jobs, verbose=self.verbose)(
            delayed(_fit_and_score_fold)(
                clone(estimator),
                X if i not in kernel_cache else kernel_cache[i],
                y,
                self.scorer,
                train_index,
                test_index,
                fit_params_steps[name],
                i,
                fold,
            )
            for i, (name, estimator) in enumerate(base_estimators)
            for fold, (train_index, test_index) in enumerate(folds)
        )

        if len(kernel_cache) > 0:
            out = self._restore_base_estimators(kernel_cache, out, X, folds)

        return self._create_base_ensemble(out, len(base_estimators), len(folds))

    def _add_diversity_score(self, scores, predictions):
        n_models = predictions.shape[1]

        cor = self._corr_func(predictions)
        assert cor.shape == (n_models, n_models)
        np.fill_diagonal(cor, 0)

        final_scores = scores.copy()
        diversity = np.apply_along_axis(lambda x: (n_models - np.sum(x >= self.min_correlation)) / n_models, 0, cor)

        final_scores += diversity
        return final_scores

    def _fit(self, X, y, cv, **fit_params):
        raise NotImplementedError()

    def fit(self, X, y=None, **fit_params):
        """Fit ensemble of models.

        Parameters
        ----------
        X : array-like, shape = (n_samples, n_features)
            Training data.

        y : array-like, shape = (n_samples,), optional
            Target data if base estimators are supervised.

        **fit_params : dict
            Parameters passed to the ``fit`` method of each base estimator.

        Returns
        -------
        self
        """
        self._check_params()

        cv = check_cv(self.cv, X)
        self._fit(X, y, cv, **fit_params)

        return self


class EnsembleSelection(BaseEnsembleSelection):
    """Ensemble selection for survival analysis that accounts for a score and correlations between predictions.

    The ensemble is pruned during training only according to the specified score (accuracy) and
    additionally for prediction according to the correlation between predictions (diversity).

    The hillclimbing is based on cross-validation to avoid having to create a separate validation set.

    See [1]_, [2]_, [3]_ for further description.

    Parameters
    ----------
    base_estimators : list
        List of (name, estimator) tuples (implementing fit/predict) that are
        part of the ensemble.

    scorer : callable
        Function with signature ``func(estimator, X_test, y_test, **test_predict_params)`` that evaluates the error
        of the prediction on the test data. The function should return a scalar value.
        *Larger* values of the score are assumed to be better.

    n_estimators : float or int, optional, default: 0.2
        If a float, the percentage of estimators in the ensemble to retain, if an int the
        absolute number of estimators to retain.

    min_score : float, optional, default: 0.2
        Threshold for pruning estimators based on scoring metric. After `fit`, only estimators
        with a score above `min_score` are retained.

    min_correlation : float, optional, default: 0.6
        Threshold for Pearson's correlation coefficient that determines when predictions of
        two estimators are significantly correlated.

    cv : int, a cv generator instance, or None, optional
        The input specifying which cv generator to use. It can be an
        integer, in which case it is the number of folds in a KFold,
        None, in which case 3 fold is used, or another object, that
        will then be used as a cv generator. The generator has to ensure
        that each sample is only used once for testing.

    n_jobs : int, optional, default: 1
        Number of jobs to run in parallel.

    verbose : integer
        Controls the verbosity: the higher, the more messages.

    Attributes
    ----------
    scores_ : ndarray, shape = (n_base_estimators,)
        Array of scores (relative to best performing estimator)

    fitted_models_ : ndarray
        Selected models during training based on `scorer`.

    n_features_in_ : int
        Number of features seen during ``fit``.

    feature_names_in_ : ndarray, shape = (`n_features_in_`,)
        Names of features seen during ``fit``. Defined only when `X`
        has feature names that are all strings.

    References
    ----------

    .. [1] Pölsterl, S., Gupta, P., Wang, L., Conjeti, S., Katouzian, A., and Navab, N.,
           "Heterogeneous ensembles for predicting survival of metastatic, castrate-resistant prostate cancer patients".
           F1000Research, vol. 5, no. 2676, 2016

    .. [2] Caruana, R., Munson, A., Niculescu-Mizil, A.
           "Getting the most out of ensemble selection". 6th IEEE International Conference on Data Mining, 828-833, 2006

    .. [3] Rooney, N., Patterson, D., Anand, S., Tsymbal, A.
           "Dynamic integration of regression models. International Workshop on Multiple Classifier Systems".
           Lecture Notes in Computer Science, vol. 3181, 164-173, 2004
    """

    _parameter_constraints = {
        **BaseEnsembleSelection._parameter_constraints,
    }
    _parameter_constraints.pop("meta_estimator")

    def __init__(
        self,
        base_estimators,
        *,
        scorer=None,
        n_estimators=0.2,
        min_score=0.2,
        correlation="pearson",
        min_correlation=0.6,
        cv=None,
        n_jobs=1,
        verbose=0,
    ):
        super().__init__(
            meta_estimator=MeanRankEstimator(),
            base_estimators=base_estimators,
            scorer=scorer,
            n_estimators=n_estimators,
            min_score=min_score,
            correlation=correlation,
            min_correlation=min_correlation,
            cv=cv,
            n_jobs=n_jobs,
            verbose=verbose,
        )

    def _fit(self, X, y, cv, **fit_params):
        scores, base_ensemble = self._fit_and_score_ensemble(X, y, cv, **fit_params)
        self.fitted_models_, self.scores_ = self._prune_by_cv_score(scores, base_ensemble)

    def _prune_by_cv_score(self, scores, base_ensemble, model_names=None):
        mean_scores = scores.mean(axis=1)
        idx_good_models = np.flatnonzero(mean_scores >= self.min_score)
        if len(idx_good_models) == 0:
            raise ValueError("no base estimator exceeds min_score, try decreasing it")

        total_score = mean_scores[idx_good_models]
        max_score = total_score.max()
        total_score /= max_score

        fitted_models = self._create_cv_ensemble(base_ensemble, idx_good_models, model_names)

        return fitted_models, total_score

    def _prune_by_correlation(self, X):
        n_models = len(self.fitted_models_)

        out = Parallel(n_jobs=self.n_jobs, verbose=self.verbose)(
            delayed(_predict)(est, X, i) for i, est in enumerate(self.fitted_models_)
        )

        predictions = np.empty((X.shape[0], n_models), order="F")
        for i, p in out:
            predictions[:, i] = p

        if n_models > self.n_estimators_:
            final_scores = self._add_diversity_score(self.scores_, predictions)
            sorted_idx = np.argsort(-final_scores, kind="mergesort")

            selected_models = sorted_idx[: self.n_estimators_]
            return predictions[:, selected_models]

        return predictions

    def _predict_estimators(self, X):
        predictions = self._prune_by_correlation(X)
        return predictions


class EnsembleSelectionRegressor(BaseEnsembleSelection):
    r"""Ensemble selection for regression that accounts for the accuracy and correlation of errors.

    The ensemble is pruned during training according to estimators' accuracy and the correlation
    between prediction errors per sample. The accuracy of the *i*-th estimator is defined as
    :math:`\frac{ \min_{i=1,\ldots, n}(error_i) }{ error_i }`.
    In addition to the accuracy, models are selected based on the correlation between residuals
    of different models (diversity). The diversity of the *i*-th estimator is defined as
    :math:`\frac{n-count}{n}`, where *count* is the number of estimators for whom the correlation
    of residuals exceeds `min_correlation`.

    The hillclimbing is based on cross-validation to avoid having to create a separate validation set.

    See [1]_, [2]_, [3]_ for further description.

    Parameters
    ----------
    base_estimators : list
        List of (name, estimator) tuples (implementing fit/predict) that are
        part of the ensemble.

    scorer : callable
        Function with signature ``func(estimator, X_test, y_test, **test_predict_params)`` that evaluates the error
        of the prediction on the test data. The function should return a scalar value.
        *Smaller* values of the score are assumed to be better.

    n_estimators : float or int, optional, default: 0.2
        If a float, the percentage of estimators in the ensemble to retain, if an int the
        absolute number of estimators to retain.

    min_score : float, optional, default: 0.66
        Threshold for pruning estimators based on scoring metric. After `fit`, only estimators
        with an accuracy above `min_score` are retained.

    min_correlation : float, optional, default: 0.6
        Threshold for Pearson's correlation coefficient that determines when residuals of
        two estimators are significantly correlated.

    cv : int, a cv generator instance, or None, optional
        The input specifying which cv generator to use. It can be an
        integer, in which case it is the number of folds in a KFold,
        None, in which case 3 fold is used, or another object, that
        will then be used as a cv generator. The generator has to ensure
        that each sample is only used once for testing.

    n_jobs : int, optional, default: 1
        Number of jobs to run in parallel.

    verbose : int, optional, default: 0
        Controls the verbosity: the higher, the more messages.

    Attributes
    ----------
    scores_ : ndarray, shape = (n_base_estimators,)
        Array of scores (relative to best performing estimator)

    fitted_models_ : ndarray
        Selected models during training based on `scorer`.

    n_features_in_ : int
        Number of features seen during ``fit``.

    feature_names_in_ : ndarray, shape = (`n_features_in_`,)
        Names of features seen during ``fit``. Defined only when `X`
        has feature names that are all strings.

    References
    ----------

    .. [1] Pölsterl, S., Gupta, P., Wang, L., Conjeti, S., Katouzian, A., and Navab, N.,
           "Heterogeneous ensembles for predicting survival of metastatic, castrate-resistant prostate cancer patients".
           F1000Research, vol. 5, no. 2676, 2016

    .. [2] Caruana, R., Munson, A., Niculescu-Mizil, A.
           "Getting the most out of ensemble selection". 6th IEEE International Conference on Data Mining, 828-833, 2006

    .. [3] Rooney, N., Patterson, D., Anand, S., Tsymbal, A.
           "Dynamic integration of regression models. International Workshop on Multiple Classifier Systems".
           Lecture Notes in Computer Science, vol. 3181, 164-173, 2004
    """

    _parameter_constraints = {
        **BaseEnsembleSelection._parameter_constraints,
    }
    _parameter_constraints.pop("meta_estimator")

    def __init__(
        self,
        base_estimators,
        *,
        scorer=None,
        n_estimators=0.2,
        min_score=0.66,
        correlation="pearson",
        min_correlation=0.6,
        cv=None,
        n_jobs=1,
        verbose=0,
    ):
        super().__init__(
            meta_estimator=MeanEstimator(),
            base_estimators=base_estimators,
            scorer=scorer,
            n_estimators=n_estimators,
            min_score=min_score,
            correlation=correlation,
            min_correlation=min_correlation,
            cv=cv,
            n_jobs=n_jobs,
            verbose=verbose,
        )

    @property
    def _predict_risk_score(self):
        return False

    def _fit(self, X, y, cv, **fit_params):
        scores, base_ensemble = self._fit_and_score_ensemble(X, y, cv, **fit_params)
        fitted_models, scores = self._prune_by_cv_score(scores, base_ensemble)

        if len(fitted_models) > self.n_estimators_:
            fitted_models, scores = self._prune_by_correlation(fitted_models, scores, X, y)

        self.fitted_models_ = fitted_models
        self.scores_ = scores

    def _prune_by_cv_score(self, scores, base_ensemble, model_names=None):
        mean_scores = scores.mean(axis=1)
        mean_scores = mean_scores.min() / mean_scores

        idx_good_models = np.flatnonzero(mean_scores >= self.min_score)
        if len(idx_good_models) == 0:
            raise ValueError("no base estimator exceeds min_score, try decreasing it")

        fitted_models = self._create_cv_ensemble(base_ensemble, idx_good_models, model_names)

        return fitted_models, mean_scores[idx_good_models]

    def _prune_by_correlation(self, fitted_models, scores, X, y):
        n_models = len(fitted_models)

        out = Parallel(n_jobs=self.n_jobs, verbose=self.verbose)(
            delayed(_score_regressor)(est, X, y, i) for i, est in enumerate(fitted_models)
        )

        error = np.empty((X.shape[0], n_models), order="F")
        for i, err in out:
            error[:, i] = err

        final_scores = self._add_diversity_score(scores, error)
        sorted_idx = np.argsort(-final_scores, kind="mergesort")

        selected_models = sorted_idx[: self.n_estimators_]

        return fitted_models[selected_models], final_scores

    def _predict_estimators(self, X):
        n_models = len(self.fitted_models_)

        out = Parallel(n_jobs=self.n_jobs, verbose=self.verbose)(
            delayed(_predict)(est, X, i) for i, est in enumerate(self.fitted_models_)
        )

        predictions = np.empty((X.shape[0], n_models), order="F")
        for i, p in out:
            predictions[:, i] = p

        return predictions
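
Editor's note: the snippet below is an added usage sketch, not part of the packaged file above. It assumes the other sksurv modules listed at the top of this diff (load_whas500, encode_categorical, concordance_index_censored, CoxPHSurvivalAnalysis, GradientBoostingSurvivalAnalysis) and simply wraps the concordance index in a scorer matching the ``func(estimator, X_test, y_test, **test_predict_params)`` contract documented for EnsembleSelection; the base estimators and parameter values are illustrative only.

# Hypothetical usage sketch for EnsembleSelection (not part of the released file).
from sksurv.column import encode_categorical
from sksurv.datasets import load_whas500
from sksurv.ensemble import GradientBoostingSurvivalAnalysis
from sksurv.linear_model import CoxPHSurvivalAnalysis
from sksurv.meta import EnsembleSelection
from sksurv.metrics import concordance_index_censored


def score_cindex(estimator, X_test, y_test, **predict_params):
    # Larger is better, as required by EnsembleSelection's scorer contract.
    event, time = y_test.dtype.names
    prediction = estimator.predict(X_test)
    return concordance_index_censored(y_test[event], y_test[time], prediction)[0]


X, y = load_whas500()
X = encode_categorical(X)  # dummy-code categorical columns

base_estimators = [
    ("cox", CoxPHSurvivalAnalysis(alpha=0.1)),
    ("gbm", GradientBoostingSurvivalAnalysis(n_estimators=50)),
]
# n_estimators=1.0 keeps every base estimator that passes the min_score threshold.
ensemble = EnsembleSelection(base_estimators, scorer=score_cindex, cv=3, n_estimators=1.0)
ensemble.fit(X.values, y)
risk_scores = ensemble.predict(X.values)

Each base estimator is fit once per cross-validation fold; the retained models are fold-averaged EnsembleAverage wrappers, and predictions are combined by MeanRankEstimator, as defined in the file above.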