scikit_survival-0.26.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (58)
  1. scikit_survival-0.26.0.dist-info/METADATA +185 -0
  2. scikit_survival-0.26.0.dist-info/RECORD +58 -0
  3. scikit_survival-0.26.0.dist-info/WHEEL +6 -0
  4. scikit_survival-0.26.0.dist-info/licenses/COPYING +674 -0
  5. scikit_survival-0.26.0.dist-info/top_level.txt +1 -0
  6. sksurv/__init__.py +183 -0
  7. sksurv/base.py +115 -0
  8. sksurv/bintrees/__init__.py +15 -0
  9. sksurv/bintrees/_binarytrees.cpython-312-x86_64-linux-gnu.so +0 -0
  10. sksurv/column.py +204 -0
  11. sksurv/compare.py +123 -0
  12. sksurv/datasets/__init__.py +12 -0
  13. sksurv/datasets/base.py +614 -0
  14. sksurv/datasets/data/GBSG2.arff +700 -0
  15. sksurv/datasets/data/actg320.arff +1169 -0
  16. sksurv/datasets/data/bmt.arff +46 -0
  17. sksurv/datasets/data/breast_cancer_GSE7390-metastasis.arff +283 -0
  18. sksurv/datasets/data/cgvhd.arff +118 -0
  19. sksurv/datasets/data/flchain.arff +7887 -0
  20. sksurv/datasets/data/veteran.arff +148 -0
  21. sksurv/datasets/data/whas500.arff +520 -0
  22. sksurv/docstrings.py +99 -0
  23. sksurv/ensemble/__init__.py +2 -0
  24. sksurv/ensemble/_coxph_loss.cpython-312-x86_64-linux-gnu.so +0 -0
  25. sksurv/ensemble/boosting.py +1564 -0
  26. sksurv/ensemble/forest.py +902 -0
  27. sksurv/ensemble/survival_loss.py +151 -0
  28. sksurv/exceptions.py +18 -0
  29. sksurv/functions.py +114 -0
  30. sksurv/io/__init__.py +2 -0
  31. sksurv/io/arffread.py +91 -0
  32. sksurv/io/arffwrite.py +181 -0
  33. sksurv/kernels/__init__.py +1 -0
  34. sksurv/kernels/_clinical_kernel.cpython-312-x86_64-linux-gnu.so +0 -0
  35. sksurv/kernels/clinical.py +348 -0
  36. sksurv/linear_model/__init__.py +3 -0
  37. sksurv/linear_model/_coxnet.cpython-312-x86_64-linux-gnu.so +0 -0
  38. sksurv/linear_model/aft.py +208 -0
  39. sksurv/linear_model/coxnet.py +592 -0
  40. sksurv/linear_model/coxph.py +637 -0
  41. sksurv/meta/__init__.py +4 -0
  42. sksurv/meta/base.py +35 -0
  43. sksurv/meta/ensemble_selection.py +724 -0
  44. sksurv/meta/stacking.py +370 -0
  45. sksurv/metrics.py +1028 -0
  46. sksurv/nonparametric.py +911 -0
  47. sksurv/preprocessing.py +195 -0
  48. sksurv/svm/__init__.py +11 -0
  49. sksurv/svm/_minlip.cpython-312-x86_64-linux-gnu.so +0 -0
  50. sksurv/svm/_prsvm.cpython-312-x86_64-linux-gnu.so +0 -0
  51. sksurv/svm/minlip.py +695 -0
  52. sksurv/svm/naive_survival_svm.py +249 -0
  53. sksurv/svm/survival_svm.py +1236 -0
  54. sksurv/testing.py +155 -0
  55. sksurv/tree/__init__.py +1 -0
  56. sksurv/tree/_criterion.cpython-312-x86_64-linux-gnu.so +0 -0
  57. sksurv/tree/tree.py +790 -0
  58. sksurv/util.py +416 -0
@@ -0,0 +1,724 @@
+ # This program is free software: you can redistribute it and/or modify
+ # it under the terms of the GNU General Public License as published by
+ # the Free Software Foundation, either version 3 of the License, or
+ # (at your option) any later version.
+ #
+ # This program is distributed in the hope that it will be useful,
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ # GNU General Public License for more details.
+ #
+ # You should have received a copy of the GNU General Public License
+ # along with this program. If not, see <http://www.gnu.org/licenses/>.
+ import numbers
+
+ from joblib import Parallel, delayed
+ import numpy as np
+ from scipy.stats import kendalltau, rankdata, spearmanr
+ from sklearn.base import BaseEstimator, clone
+ from sklearn.model_selection import check_cv
+ from sklearn.utils._param_validation import Interval, StrOptions
+
+ from .base import _fit_and_score
+ from .stacking import Stacking
+
+ __all__ = ["EnsembleSelection", "EnsembleSelectionRegressor", "MeanEstimator"]
+
+
+ def _corr_kendalltau(X):
+     n_variables = X.shape[1]
+     mat = np.empty((n_variables, n_variables), dtype=float)
+     for i in range(n_variables):
+         for j in range(i):
+             v = kendalltau(X[:, i], X[:, j]).correlation
+             mat[i, j] = v
+             mat[j, i] = v
+     return mat
+
+
+ class EnsembleAverage(BaseEstimator):
+     """A meta-estimator that averages the predictions of base estimators.
+
+     This estimator is for internal use by :class:`BaseEnsembleSelection`.
+     It takes a list of estimators that have already been fitted and
+     averages their predictions.
+
+     Parameters
+     ----------
+     base_estimators : list of estimators
+         The base estimators to average. The estimators must be fitted.
+
+     name : str, optional, default: None
+         The name of the ensemble.
+     """
+
+     def __init__(self, base_estimators, name=None):
+         self.base_estimators = base_estimators
+         self.name = name
+         assert not hasattr(self.base_estimators[0], "classes_"), "base estimator cannot be a classifier"
+
+     def get_base_params(self):
+         """Get parameters for this estimator's first base estimator.
+
+         Returns
+         -------
+         params : dict
+             Parameter names mapped to their values.
+         """
+         return self.base_estimators[0].get_params()
+
+     def fit(self, X, y=None, **kwargs):  # pragma: no cover; # pylint: disable=unused-argument
+         return self
+
+     def predict(self, X):
+         """Predict using the ensemble of estimators.
+
+         The prediction is the average of the predictions of all base
+         estimators.
+
+         Parameters
+         ----------
+         X : array-like, shape = (n_samples, n_features)
+             Data to predict on.
+
+         Returns
+         -------
+         y_pred : ndarray, shape = (n_samples,)
+             The predicted values.
+         """
+         prediction = np.zeros(X.shape[0])
+         for est in self.base_estimators:
+             prediction += est.predict(X)
+
+         return prediction / len(self.base_estimators)
+
+
+ class MeanEstimator(BaseEstimator):
+     """A meta-estimator that averages predictions.
+
+     This estimator computes the mean of an array along its last axis.
+     It is intended to be used as a ``meta_estimator`` in an ensemble model,
+     where it averages the predictions of the base estimators.
+     """
+
+     def fit(self, X, y=None, **kwargs):  # pragma: no cover; # pylint: disable=unused-argument
+         return self
+
+     def predict(self, X):  # pylint: disable=no-self-use
+         """Return the mean of an array along its last axis.
+
+         Parameters
+         ----------
+         X : array-like, shape = (n_samples, n_estimators)
+             The predictions of base estimators.
+
+         Returns
+         -------
+         y_pred : ndarray, shape = (n_samples,)
+             The averaged predictions.
+         """
+         return X.mean(axis=X.ndim - 1)
+
+
+ class MeanRankEstimator(BaseEstimator):
+     """A meta-estimator that averages the ranks of predictions of base estimators.
+
+     This estimator first converts the predictions of each base estimator
+     into ranks and then averages the ranks. It is intended to be used as
+     a ``meta_estimator`` in an ensemble model.
+     """
+
+     def fit(self, X, y=None, **kwargs):  # pragma: no cover; # pylint: disable=unused-argument
+         return self
+
+     def predict(self, X):  # pylint: disable=no-self-use
+         """Return the mean of ranks.
+
+         The predictions of each base estimator are first converted into
+         ranks and then averaged.
+
+         Parameters
+         ----------
+         X : array-like, shape = (n_samples, n_estimators)
+             The predictions of base estimators.
+
+         Returns
+         -------
+         y_pred : ndarray, shape = (n_samples,)
+             The averaged ranks.
+         """
+         # convert predictions of individual models into ranks
+         ranks = np.apply_along_axis(rankdata, 0, X)
+         # average predicted ranks
+         return ranks.mean(axis=X.ndim - 1)
+
+
+ def _fit_and_score_fold(est, x, y, scorer, train_index, test_index, fit_params, idx, fold):
+     score = _fit_and_score(est, x, y, scorer, train_index, test_index, est.get_params(), fit_params, {})
+     return idx, fold, score, est
+
+
+ def _predict(estimator, X, idx):
+     return idx, estimator.predict(X)
+
+
+ def _score_regressor(estimator, X, y, idx):
+     name_time = y.dtype.names[1]
+     error = (estimator.predict(X).ravel() - y[name_time]) ** 2
+     return idx, error
+
+
+ class BaseEnsembleSelection(Stacking):
+     _parameter_constraints = {
+         **Stacking._parameter_constraints,
+         "scorer": [callable],
+         "n_estimators": [
+             Interval(numbers.Integral, 1, None, closed="left"),
+             Interval(numbers.Real, 0.0, 1.0, closed="right"),
+         ],
+         "min_score": [numbers.Real],
+         "correlation": [StrOptions({"pearson", "kendall", "spearman"})],
+         "min_correlation": [Interval(numbers.Real, -1, 1, closed="both")],
+         "cv": ["cv_object"],
+         "n_jobs": [Interval(numbers.Integral, 1, None, closed="left")],
+         "verbose": ["verbose"],
+     }
+     _parameter_constraints.pop("probabilities")
+
+     def __init__(
+         self,
+         meta_estimator,
+         base_estimators,
+         scorer=None,
+         n_estimators=0.2,
+         min_score=0.66,
+         correlation="pearson",
+         min_correlation=0.6,
+         cv=None,
+         n_jobs=1,
+         verbose=0,
+     ):
+         super().__init__(meta_estimator=meta_estimator, base_estimators=base_estimators)
+
+         self.scorer = scorer
+         self.n_estimators = n_estimators
+         self.min_score = min_score
+         self.correlation = correlation
+         self.min_correlation = min_correlation
+         self.cv = cv
+         self.n_jobs = n_jobs
+         self.verbose = verbose
+
+         self._extra_params.extend(["scorer", "n_estimators", "min_score", "min_correlation", "cv", "n_jobs", "verbose"])
+
+     def __len__(self):
+         """Return the number of fitted models."""
+         if hasattr(self, "fitted_models_"):
+             return len(self.fitted_models_)
+         return 0
+
+     def _check_params(self):
+         self._validate_params()
+
+         if self.n_estimators > len(self.base_estimators):
+             raise ValueError(
+                 f"n_estimators ({self.n_estimators}) must not exceed"
+                 f" number of base learners ({len(self.base_estimators)})"
+             )
+
+         if isinstance(self.n_estimators, numbers.Integral):
+             self.n_estimators_ = self.n_estimators
+         else:
+             self.n_estimators_ = max(int(self.n_estimators * len(self.base_estimators)), 1)
+
+         if self.correlation == "pearson":
+             self._corr_func = lambda x: np.corrcoef(x, rowvar=0)
+         elif self.correlation == "kendall":
+             self._corr_func = _corr_kendalltau
+         elif self.correlation == "spearman":
+             self._corr_func = lambda x: spearmanr(x, axis=0).correlation
+
+     def _create_base_ensemble(self, out, n_estimators, n_folds):
+         """For each base estimator, collect models trained on each fold"""
+         if hasattr(self, "feature_names_in_"):
+             # Delete the attribute when the estimator is fitted on a new dataset
+             # that has no feature names.
+             delattr(self, "feature_names_in_")
+
+         ensemble_scores = np.empty((n_estimators, n_folds))
+         base_ensemble = np.empty_like(ensemble_scores, dtype=object)
+         for model, fold, score, est in out:
+             ensemble_scores[model, fold] = score
+             base_ensemble[model, fold] = est
+
+         if hasattr(est, "n_features_in_"):
+             self.n_features_in_ = est.n_features_in_
+         if hasattr(est, "feature_names_in_"):
+             self.feature_names_in_ = est.feature_names_in_
+
+         self.final_estimator_ = self.meta_estimator
+
+         return ensemble_scores, base_ensemble
+
+     def _create_cv_ensemble(self, base_ensemble, idx_models_included, model_names=None):
+         """For each selected base estimator, average models trained on each fold"""
+         fitted_models = np.empty(len(idx_models_included), dtype=object)
+         for i, idx in enumerate(idx_models_included):
+             model_name = self.base_estimators[idx][0] if model_names is None else model_names[idx]
+             avg_model = EnsembleAverage(base_ensemble[idx, :], name=model_name)
+             fitted_models[i] = avg_model
+
+         return fitted_models
+
+     def _get_base_estimators(self, X):
+         """Take special care of estimators using a custom kernel function.
+
+         Parameters
+         ----------
+         X : array, shape = (n_samples, n_features)
+             Samples to pre-compute the kernel matrix from.
+
+         Returns
+         -------
+         base_estimators : list
+             Same as `self.base_estimators`, except that estimators with a custom kernel function
+             use ``kernel='precomputed'``.
+
+         kernel_cache : dict
+             Maps estimator name to kernel matrix. Use this for cross-validation instead of `X`.
+         """
+         base_estimators = []
+
+         kernel_cache = {}
+         kernel_fns = {}
+         for i, (name, estimator) in enumerate(self.base_estimators):
+             if hasattr(estimator, "kernel") and callable(estimator.kernel):
+                 if not hasattr(estimator, "_get_kernel"):
+                     raise ValueError(
+                         f"estimator {name} uses a custom kernel function, but does not have a _get_kernel method"
+                     )
+
+                 kernel_mat = kernel_fns.get(estimator.kernel, None)
+                 if kernel_mat is None:
+                     kernel_mat = estimator._get_kernel(X)
+                     kernel_cache[i] = kernel_mat
+                     kernel_fns[estimator.kernel] = kernel_mat
+
+                 kernel_cache[i] = kernel_mat
+
+                 # We precompute the kernel, but only for training; for testing, the original custom kernel function is used
+                 kernel_estimator = clone(estimator)
+                 kernel_estimator.set_params(kernel="precomputed")
+                 base_estimators.append((name, kernel_estimator))
+             else:
+                 base_estimators.append((name, estimator))
+
+         return base_estimators, kernel_cache
+
+     def _restore_base_estimators(self, kernel_cache, out, X, cv):
+         """Restore custom kernel functions of estimators for predictions"""
+         train_folds = {fold: train_index for fold, (train_index, _) in enumerate(cv)}
+
+         for idx, fold, _, est in out:
+             if idx in kernel_cache:
+                 if not hasattr(est, "fit_X_"):
+                     raise ValueError(
+                         f"estimator {self.base_estimators[idx][0]} uses a custom kernel function, "
+                         "but does not have the attribute `fit_X_` after training"
+                     )
+
+                 est.set_params(kernel=self.base_estimators[idx][1].kernel)
+                 est.fit_X_ = X[train_folds[fold]]
+
+         return out
+
+     def _fit_and_score_ensemble(self, X, y, cv, **fit_params):
+         """Create a cross-validated model by training a model for each fold with the same model parameters"""
+         fit_params_steps = self._split_fit_params(fit_params)
+
+         folds = list(cv.split(X, y))
+
+         # Take care of custom kernel functions
+         base_estimators, kernel_cache = self._get_base_estimators(X)
+
+         out = Parallel(n_jobs=self.n_jobs, verbose=self.verbose)(
+             delayed(_fit_and_score_fold)(
+                 clone(estimator),
+                 X if i not in kernel_cache else kernel_cache[i],
+                 y,
+                 self.scorer,
+                 train_index,
+                 test_index,
+                 fit_params_steps[name],
+                 i,
+                 fold,
+             )
+             for i, (name, estimator) in enumerate(base_estimators)
+             for fold, (train_index, test_index) in enumerate(folds)
+         )
+
+         if len(kernel_cache) > 0:
+             out = self._restore_base_estimators(kernel_cache, out, X, folds)
+
+         return self._create_base_ensemble(out, len(base_estimators), len(folds))
+
+     def _add_diversity_score(self, scores, predictions):
+         n_models = predictions.shape[1]
+
+         cor = self._corr_func(predictions)
+         assert cor.shape == (n_models, n_models)
+         np.fill_diagonal(cor, 0)
+
+         final_scores = scores.copy()
+         diversity = np.apply_along_axis(lambda x: (n_models - np.sum(x >= self.min_correlation)) / n_models, 0, cor)
+
+         final_scores += diversity
+         return final_scores
+
+     def _fit(self, X, y, cv, **fit_params):
+         raise NotImplementedError()
+
+     def fit(self, X, y=None, **fit_params):
+         """Fit ensemble of models.
+
+         Parameters
+         ----------
+         X : array-like, shape = (n_samples, n_features)
+             Training data.
+
+         y : array-like, shape = (n_samples,), optional
+             Target data if base estimators are supervised.
+
+         **fit_params : dict
+             Parameters passed to the ``fit`` method of each base estimator.
+
+         Returns
+         -------
+         self
+         """
+         self._check_params()
+
+         cv = check_cv(self.cv, X)
+         self._fit(X, y, cv, **fit_params)
+
+         return self
+
+
+ class EnsembleSelection(BaseEnsembleSelection):
+     """Ensemble selection for survival analysis that accounts for a score and correlations between predictions.
+
+     The ensemble is pruned during training only according to the specified score (accuracy) and
+     additionally for prediction according to the correlation between predictions (diversity).
+
+     The hillclimbing is based on cross-validation to avoid having to create a separate validation set.
+
+     See [1]_, [2]_, [3]_ for further description.
+
+     Parameters
+     ----------
+     base_estimators : list
+         List of (name, estimator) tuples (implementing fit/predict) that are
+         part of the ensemble.
+
+     scorer : callable
+         Function with signature ``func(estimator, X_test, y_test, **test_predict_params)`` that evaluates the error
+         of the prediction on the test data. The function should return a scalar value.
+         *Larger* values of the score are assumed to be better.
+
+     n_estimators : float or int, optional, default: 0.2
+         If a float, the fraction of estimators in the ensemble to retain; if an int, the
+         absolute number of estimators to retain.
+
+     min_score : float, optional, default: 0.2
+         Threshold for pruning estimators based on the scoring metric. After `fit`, only estimators
+         with a score above `min_score` are retained.
+
+     min_correlation : float, optional, default: 0.6
+         Threshold for Pearson's correlation coefficient that determines when predictions of
+         two estimators are significantly correlated.
+
+     cv : int, a cv generator instance, or None, optional
+         The input specifying which cv generator to use. It can be an
+         integer, in which case it is the number of folds in a KFold,
+         None, in which case 3-fold cross-validation is used, or another object
+         that will then be used as a cv generator. The generator has to ensure
+         that each sample is used only once for testing.
+
+     n_jobs : int, optional, default: 1
+         Number of jobs to run in parallel.
+
+     verbose : int, optional, default: 0
+         Controls the verbosity: the higher, the more messages.
+
+     Attributes
+     ----------
+     scores_ : ndarray, shape = (n_base_estimators,)
+         Array of scores (relative to the best performing estimator).
+
+     fitted_models_ : ndarray
+         Models selected during training based on `scorer`.
+
+     n_features_in_ : int
+         Number of features seen during ``fit``.
+
+     feature_names_in_ : ndarray, shape = (`n_features_in_`,)
+         Names of features seen during ``fit``. Defined only when `X`
+         has feature names that are all strings.
+
+     References
+     ----------
+
+     .. [1] Pölsterl, S., Gupta, P., Wang, L., Conjeti, S., Katouzian, A., and Navab, N.,
+            "Heterogeneous ensembles for predicting survival of metastatic, castrate-resistant prostate cancer patients".
+            F1000Research, vol. 5, no. 2676, 2016
+
+     .. [2] Caruana, R., Munson, A., Niculescu-Mizil, A.
+            "Getting the most out of ensemble selection". 6th IEEE International Conference on Data Mining, 828-833, 2006
+
+     .. [3] Rooney, N., Patterson, D., Anand, S., Tsymbal, A.
+            "Dynamic integration of regression models". International Workshop on Multiple Classifier Systems,
+            Lecture Notes in Computer Science, vol. 3181, 164-173, 2004
+     """
+
+     _parameter_constraints = {
+         **BaseEnsembleSelection._parameter_constraints,
+     }
+     _parameter_constraints.pop("meta_estimator")
+
+     def __init__(
+         self,
+         base_estimators,
+         *,
+         scorer=None,
+         n_estimators=0.2,
+         min_score=0.2,
+         correlation="pearson",
+         min_correlation=0.6,
+         cv=None,
+         n_jobs=1,
+         verbose=0,
+     ):
+         super().__init__(
+             meta_estimator=MeanRankEstimator(),
+             base_estimators=base_estimators,
+             scorer=scorer,
+             n_estimators=n_estimators,
+             min_score=min_score,
+             correlation=correlation,
+             min_correlation=min_correlation,
+             cv=cv,
+             n_jobs=n_jobs,
+             verbose=verbose,
+         )
+
+     def _fit(self, X, y, cv, **fit_params):
+         scores, base_ensemble = self._fit_and_score_ensemble(X, y, cv, **fit_params)
+         self.fitted_models_, self.scores_ = self._prune_by_cv_score(scores, base_ensemble)
+
+     def _prune_by_cv_score(self, scores, base_ensemble, model_names=None):
+         mean_scores = scores.mean(axis=1)
+         idx_good_models = np.flatnonzero(mean_scores >= self.min_score)
+         if len(idx_good_models) == 0:
+             raise ValueError("no base estimator exceeds min_score, try decreasing it")
+
+         total_score = mean_scores[idx_good_models]
+         max_score = total_score.max()
+         total_score /= max_score
+
+         fitted_models = self._create_cv_ensemble(base_ensemble, idx_good_models, model_names)
+
+         return fitted_models, total_score
+
+     def _prune_by_correlation(self, X):
+         n_models = len(self.fitted_models_)
+
+         out = Parallel(n_jobs=self.n_jobs, verbose=self.verbose)(
+             delayed(_predict)(est, X, i) for i, est in enumerate(self.fitted_models_)
+         )
+
+         predictions = np.empty((X.shape[0], n_models), order="F")
+         for i, p in out:
+             predictions[:, i] = p
+
+         if n_models > self.n_estimators_:
+             final_scores = self._add_diversity_score(self.scores_, predictions)
+             sorted_idx = np.argsort(-final_scores, kind="mergesort")
+
+             selected_models = sorted_idx[: self.n_estimators_]
+             return predictions[:, selected_models]
+
+         return predictions
+
+     def _predict_estimators(self, X):
+         predictions = self._prune_by_correlation(X)
+         return predictions
+
+
+ class EnsembleSelectionRegressor(BaseEnsembleSelection):
+     r"""Ensemble selection for regression that accounts for the accuracy and correlation of errors.
+
+     The ensemble is pruned during training according to estimators' accuracy and the correlation
+     between prediction errors per sample. The accuracy of the *i*-th estimator is defined as
+     :math:`\frac{ \min_{i=1,\ldots, n}(error_i) }{ error_i }`.
+     In addition to the accuracy, models are selected based on the correlation between residuals
+     of different models (diversity). The diversity of the *i*-th estimator is defined as
+     :math:`\frac{n-count}{n}`, where *count* is the number of estimators for which the correlation
+     of residuals exceeds `min_correlation`.
+
+     The hillclimbing is based on cross-validation to avoid having to create a separate validation set.
+
+     See [1]_, [2]_, [3]_ for further description.
+
+     Parameters
+     ----------
+     base_estimators : list
+         List of (name, estimator) tuples (implementing fit/predict) that are
+         part of the ensemble.
+
+     scorer : callable
+         Function with signature ``func(estimator, X_test, y_test, **test_predict_params)`` that evaluates the error
+         of the prediction on the test data. The function should return a scalar value.
+         *Smaller* values of the score are assumed to be better.
+
+     n_estimators : float or int, optional, default: 0.2
+         If a float, the fraction of estimators in the ensemble to retain; if an int, the
+         absolute number of estimators to retain.
+
+     min_score : float, optional, default: 0.66
+         Threshold for pruning estimators based on the scoring metric. After `fit`, only estimators
+         with an accuracy above `min_score` are retained.
+
+     min_correlation : float, optional, default: 0.6
+         Threshold for Pearson's correlation coefficient that determines when residuals of
+         two estimators are significantly correlated.
+
+     cv : int, a cv generator instance, or None, optional
+         The input specifying which cv generator to use. It can be an
+         integer, in which case it is the number of folds in a KFold,
+         None, in which case 3-fold cross-validation is used, or another object
+         that will then be used as a cv generator. The generator has to ensure
+         that each sample is used only once for testing.
+
+     n_jobs : int, optional, default: 1
+         Number of jobs to run in parallel.
+
+     verbose : int, optional, default: 0
+         Controls the verbosity: the higher, the more messages.
+
+     Attributes
+     ----------
+     scores_ : ndarray, shape = (n_base_estimators,)
+         Array of scores (relative to the best performing estimator).
+
+     fitted_models_ : ndarray
+         Models selected during training based on `scorer`.
+
+     n_features_in_ : int
+         Number of features seen during ``fit``.
+
+     feature_names_in_ : ndarray, shape = (`n_features_in_`,)
+         Names of features seen during ``fit``. Defined only when `X`
+         has feature names that are all strings.
+
+     References
+     ----------
+
+     .. [1] Pölsterl, S., Gupta, P., Wang, L., Conjeti, S., Katouzian, A., and Navab, N.,
+            "Heterogeneous ensembles for predicting survival of metastatic, castrate-resistant prostate cancer patients".
+            F1000Research, vol. 5, no. 2676, 2016
+
+     .. [2] Caruana, R., Munson, A., Niculescu-Mizil, A.
+            "Getting the most out of ensemble selection". 6th IEEE International Conference on Data Mining, 828-833, 2006
+
+     .. [3] Rooney, N., Patterson, D., Anand, S., Tsymbal, A.
+            "Dynamic integration of regression models". International Workshop on Multiple Classifier Systems,
+            Lecture Notes in Computer Science, vol. 3181, 164-173, 2004
+     """
+
+     _parameter_constraints = {
+         **BaseEnsembleSelection._parameter_constraints,
+     }
+     _parameter_constraints.pop("meta_estimator")
+
+     def __init__(
+         self,
+         base_estimators,
+         *,
+         scorer=None,
+         n_estimators=0.2,
+         min_score=0.66,
+         correlation="pearson",
+         min_correlation=0.6,
+         cv=None,
+         n_jobs=1,
+         verbose=0,
+     ):
+         super().__init__(
+             meta_estimator=MeanEstimator(),
+             base_estimators=base_estimators,
+             scorer=scorer,
+             n_estimators=n_estimators,
+             min_score=min_score,
+             correlation=correlation,
+             min_correlation=min_correlation,
+             cv=cv,
+             n_jobs=n_jobs,
+             verbose=verbose,
+         )
+
+     @property
+     def _predict_risk_score(self):
+         return False
+
+     def _fit(self, X, y, cv, **fit_params):
+         scores, base_ensemble = self._fit_and_score_ensemble(X, y, cv, **fit_params)
+         fitted_models, scores = self._prune_by_cv_score(scores, base_ensemble)
+
+         if len(fitted_models) > self.n_estimators_:
+             fitted_models, scores = self._prune_by_correlation(fitted_models, scores, X, y)
+
+         self.fitted_models_ = fitted_models
+         self.scores_ = scores
+
+     def _prune_by_cv_score(self, scores, base_ensemble, model_names=None):
+         mean_scores = scores.mean(axis=1)
+         mean_scores = mean_scores.min() / mean_scores
+
+         idx_good_models = np.flatnonzero(mean_scores >= self.min_score)
+         if len(idx_good_models) == 0:
+             raise ValueError("no base estimator exceeds min_score, try decreasing it")
+
+         fitted_models = self._create_cv_ensemble(base_ensemble, idx_good_models, model_names)
+
+         return fitted_models, mean_scores[idx_good_models]
+
+     def _prune_by_correlation(self, fitted_models, scores, X, y):
+         n_models = len(fitted_models)
+
+         out = Parallel(n_jobs=self.n_jobs, verbose=self.verbose)(
+             delayed(_score_regressor)(est, X, y, i) for i, est in enumerate(fitted_models)
+         )
+
+         error = np.empty((X.shape[0], n_models), order="F")
+         for i, err in out:
+             error[:, i] = err
+
+         final_scores = self._add_diversity_score(scores, error)
+         sorted_idx = np.argsort(-final_scores, kind="mergesort")
+
+         selected_models = sorted_idx[: self.n_estimators_]
+
+         return fitted_models[selected_models], final_scores
+
+     def _predict_estimators(self, X):
+         n_models = len(self.fitted_models_)
+
+         out = Parallel(n_jobs=self.n_jobs, verbose=self.verbose)(
+             delayed(_predict)(est, X, i) for i, est in enumerate(self.fitted_models_)
+         )
+
+         predictions = np.empty((X.shape[0], n_models), order="F")
+         for i, p in out:
+             predictions[:, i] = p
+
+         return predictions
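
For context, the following is a minimal usage sketch and is not part of the released file above. It shows how the EnsembleSelection estimator added in this file might be combined with other scikit-survival models, assuming only the public API visible in the diff. The helper name cindex_scorer, the choice of dataset, base estimators, and hyperparameter values are illustrative assumptions; the scorer simply delegates to each estimator's score method, which returns Harrell's concordance index (larger is better, as EnsembleSelection expects).

    # Illustrative sketch; not part of the scikit-survival 0.26.0 wheel contents shown above.
    from sklearn.model_selection import KFold

    from sksurv.column import encode_categorical
    from sksurv.datasets import load_whas500
    from sksurv.ensemble import GradientBoostingSurvivalAnalysis, RandomSurvivalForest
    from sksurv.linear_model import CoxPHSurvivalAnalysis
    from sksurv.meta import EnsembleSelection


    def cindex_scorer(estimator, X_test, y_test, **predict_params):
        # Larger values are better; score() returns the concordance index.
        return estimator.score(X_test, y_test)


    X, y = load_whas500()
    X = encode_categorical(X)  # one-hot encode categorical columns

    base_estimators = [
        ("coxph", CoxPHSurvivalAnalysis(alpha=0.1)),
        ("rsf", RandomSurvivalForest(n_estimators=100, random_state=0)),
        ("gbsa", GradientBoostingSurvivalAnalysis(n_estimators=100, random_state=0)),
    ]

    ensemble = EnsembleSelection(
        base_estimators,
        scorer=cindex_scorer,
        n_estimators=2,  # keep the two most accurate and diverse models
        min_score=0.5,   # drop models that are no better than random
        cv=KFold(n_splits=3, shuffle=True, random_state=0),
        n_jobs=1,
    )
    ensemble.fit(X.values, y)
    risk_scores = ensemble.predict(X.values)

EnsembleSelectionRegressor is used in the same way, except that its scorer must return an error (smaller is better) and its meta-estimator averages the predictions directly instead of averaging their ranks.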