scikit-survival 0.23.1 (scikit_survival-0.23.1-cp313-cp313-macosx_11_0_arm64.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55)
  1. scikit_survival-0.23.1.dist-info/COPYING +674 -0
  2. scikit_survival-0.23.1.dist-info/METADATA +888 -0
  3. scikit_survival-0.23.1.dist-info/RECORD +55 -0
  4. scikit_survival-0.23.1.dist-info/WHEEL +5 -0
  5. scikit_survival-0.23.1.dist-info/top_level.txt +1 -0
  6. sksurv/__init__.py +138 -0
  7. sksurv/base.py +103 -0
  8. sksurv/bintrees/__init__.py +15 -0
  9. sksurv/bintrees/_binarytrees.cpython-313-darwin.so +0 -0
  10. sksurv/column.py +201 -0
  11. sksurv/compare.py +123 -0
  12. sksurv/datasets/__init__.py +10 -0
  13. sksurv/datasets/base.py +436 -0
  14. sksurv/datasets/data/GBSG2.arff +700 -0
  15. sksurv/datasets/data/actg320.arff +1169 -0
  16. sksurv/datasets/data/breast_cancer_GSE7390-metastasis.arff +283 -0
  17. sksurv/datasets/data/flchain.arff +7887 -0
  18. sksurv/datasets/data/veteran.arff +148 -0
  19. sksurv/datasets/data/whas500.arff +520 -0
  20. sksurv/ensemble/__init__.py +2 -0
  21. sksurv/ensemble/_coxph_loss.cpython-313-darwin.so +0 -0
  22. sksurv/ensemble/boosting.py +1610 -0
  23. sksurv/ensemble/forest.py +947 -0
  24. sksurv/ensemble/survival_loss.py +151 -0
  25. sksurv/exceptions.py +18 -0
  26. sksurv/functions.py +114 -0
  27. sksurv/io/__init__.py +2 -0
  28. sksurv/io/arffread.py +58 -0
  29. sksurv/io/arffwrite.py +145 -0
  30. sksurv/kernels/__init__.py +1 -0
  31. sksurv/kernels/_clinical_kernel.cpython-313-darwin.so +0 -0
  32. sksurv/kernels/clinical.py +328 -0
  33. sksurv/linear_model/__init__.py +3 -0
  34. sksurv/linear_model/_coxnet.cpython-313-darwin.so +0 -0
  35. sksurv/linear_model/aft.py +205 -0
  36. sksurv/linear_model/coxnet.py +543 -0
  37. sksurv/linear_model/coxph.py +618 -0
  38. sksurv/meta/__init__.py +4 -0
  39. sksurv/meta/base.py +35 -0
  40. sksurv/meta/ensemble_selection.py +642 -0
  41. sksurv/meta/stacking.py +349 -0
  42. sksurv/metrics.py +996 -0
  43. sksurv/nonparametric.py +588 -0
  44. sksurv/preprocessing.py +155 -0
  45. sksurv/svm/__init__.py +11 -0
  46. sksurv/svm/_minlip.cpython-313-darwin.so +0 -0
  47. sksurv/svm/_prsvm.cpython-313-darwin.so +0 -0
  48. sksurv/svm/minlip.py +606 -0
  49. sksurv/svm/naive_survival_svm.py +221 -0
  50. sksurv/svm/survival_svm.py +1228 -0
  51. sksurv/testing.py +108 -0
  52. sksurv/tree/__init__.py +1 -0
  53. sksurv/tree/_criterion.cpython-313-darwin.so +0 -0
  54. sksurv/tree/tree.py +703 -0
  55. sksurv/util.py +333 -0
sksurv/meta/ensemble_selection.py
@@ -0,0 +1,642 @@
+ # This program is free software: you can redistribute it and/or modify
+ # it under the terms of the GNU General Public License as published by
+ # the Free Software Foundation, either version 3 of the License, or
+ # (at your option) any later version.
+ #
+ # This program is distributed in the hope that it will be useful,
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ # GNU General Public License for more details.
+ #
+ # You should have received a copy of the GNU General Public License
+ # along with this program. If not, see <http://www.gnu.org/licenses/>.
+ import numbers
+
+ from joblib import Parallel, delayed
+ import numpy as np
+ from scipy.stats import kendalltau, rankdata, spearmanr
+ from sklearn.base import BaseEstimator, clone
+ from sklearn.model_selection import check_cv
+ from sklearn.utils._param_validation import Interval, StrOptions
+
+ from .base import _fit_and_score
+ from .stacking import Stacking
+
+ __all__ = ["EnsembleSelection", "EnsembleSelectionRegressor", "MeanEstimator"]
+
+
+ def _corr_kendalltau(X):
+     n_variables = X.shape[1]
+     mat = np.empty((n_variables, n_variables), dtype=float)
+     for i in range(n_variables):
+         for j in range(i):
+             v = kendalltau(X[:, i], X[:, j]).correlation
+             mat[i, j] = v
+             mat[j, i] = v
+     return mat
+
+
+ class EnsembleAverage(BaseEstimator):
+     def __init__(self, base_estimators, name=None):
+         self.base_estimators = base_estimators
+         self.name = name
+         assert not hasattr(self.base_estimators[0], "classes_"), "base estimator cannot be a classifier"
+
+     def get_base_params(self):
+         return self.base_estimators[0].get_params()
+
+     def fit(self, X, y=None, **kwargs):  # pragma: no cover; # pylint: disable=unused-argument
+         return self
+
+     def predict(self, X):
+         prediction = np.zeros(X.shape[0])
+         for est in self.base_estimators:
+             prediction += est.predict(X)
+
+         return prediction / len(self.base_estimators)
+
+
+ class MeanEstimator(BaseEstimator):
+     def fit(self, X, y=None, **kwargs):  # pragma: no cover; # pylint: disable=unused-argument
+         return self
+
+     def predict(self, X):  # pylint: disable=no-self-use
+         return X.mean(axis=X.ndim - 1)
+
+
+ class MeanRankEstimator(BaseEstimator):
+     def fit(self, X, y=None, **kwargs):  # pragma: no cover; # pylint: disable=unused-argument
+         return self
+
+     def predict(self, X):  # pylint: disable=no-self-use
+         # convert predictions of individual models into ranks
+         ranks = np.apply_along_axis(rankdata, 0, X)
+         # average predicted ranks
+         return ranks.mean(axis=X.ndim - 1)
+
+
+ def _fit_and_score_fold(est, x, y, scorer, train_index, test_index, fit_params, idx, fold):
+     score = _fit_and_score(est, x, y, scorer, train_index, test_index, est.get_params(), fit_params, {})
+     return idx, fold, score, est
+
+
+ def _predict(estimator, X, idx):
+     return idx, estimator.predict(X)
+
+
+ def _score_regressor(estimator, X, y, idx):
+     name_time = y.dtype.names[1]
+     error = (estimator.predict(X).ravel() - y[name_time]) ** 2
+     return idx, error
+
+
+ class BaseEnsembleSelection(Stacking):
+     _parameter_constraints = {
+         **Stacking._parameter_constraints,
+         "scorer": [callable],
+         "n_estimators": [
+             Interval(numbers.Integral, 1, None, closed="left"),
+             Interval(numbers.Real, 0.0, 1.0, closed="right"),
+         ],
+         "min_score": [numbers.Real],
+         "correlation": [StrOptions({"pearson", "kendall", "spearman"})],
+         "min_correlation": [Interval(numbers.Real, -1, 1, closed="both")],
+         "cv": ["cv_object"],
+         "n_jobs": [Interval(numbers.Integral, 1, None, closed="left")],
+         "verbose": ["verbose"],
+     }
+     _parameter_constraints.pop("probabilities")
+
+     def __init__(
+         self,
+         meta_estimator,
+         base_estimators,
+         scorer=None,
+         n_estimators=0.2,
+         min_score=0.66,
+         correlation="pearson",
+         min_correlation=0.6,
+         cv=None,
+         n_jobs=1,
+         verbose=0,
+     ):
+         super().__init__(meta_estimator=meta_estimator, base_estimators=base_estimators)
+
+         self.scorer = scorer
+         self.n_estimators = n_estimators
+         self.min_score = min_score
+         self.correlation = correlation
+         self.min_correlation = min_correlation
+         self.cv = cv
+         self.n_jobs = n_jobs
+         self.verbose = verbose
+
+         self._extra_params.extend(["scorer", "n_estimators", "min_score", "min_correlation", "cv", "n_jobs", "verbose"])
+
+     def __len__(self):
+         if hasattr(self, "fitted_models_"):
+             return len(self.fitted_models_)
+         return 0
+
+     def _check_params(self):
+         self._validate_params()
+
+         if self.n_estimators > len(self.base_estimators):
+             raise ValueError(
+                 f"n_estimators ({self.n_estimators}) must not exceed"
+                 f" number of base learners ({len(self.base_estimators)})"
+             )
+
+         if isinstance(self.n_estimators, numbers.Integral):
+             self.n_estimators_ = self.n_estimators
+         else:
+             self.n_estimators_ = max(int(self.n_estimators * len(self.base_estimators)), 1)
+
+         if self.correlation == "pearson":
+             self._corr_func = lambda x: np.corrcoef(x, rowvar=0)
+         elif self.correlation == "kendall":
+             self._corr_func = _corr_kendalltau
+         elif self.correlation == "spearman":
+             self._corr_func = lambda x: spearmanr(x, axis=0).correlation
+
+     def _create_base_ensemble(self, out, n_estimators, n_folds):
+         """For each base estimator collect models trained on each fold"""
+         if hasattr(self, "feature_names_in_"):
+             # Delete the attribute when the estimator is fitted on a new dataset
+             # that has no feature names.
+             delattr(self, "feature_names_in_")
+
+         ensemble_scores = np.empty((n_estimators, n_folds))
+         base_ensemble = np.empty_like(ensemble_scores, dtype=object)
+         for model, fold, score, est in out:
+             ensemble_scores[model, fold] = score
+             base_ensemble[model, fold] = est
+
+             if hasattr(est, "n_features_in_"):
+                 self.n_features_in_ = est.n_features_in_
+             if hasattr(est, "feature_names_in_"):
+                 self.feature_names_in_ = est.feature_names_in_
+
+         self.final_estimator_ = self.meta_estimator
+
+         return ensemble_scores, base_ensemble
+
+     def _create_cv_ensemble(self, base_ensemble, idx_models_included, model_names=None):
+         """For each selected base estimator, average models trained on each fold"""
+         fitted_models = np.empty(len(idx_models_included), dtype=object)
+         for i, idx in enumerate(idx_models_included):
+             model_name = self.base_estimators[idx][0] if model_names is None else model_names[idx]
+             avg_model = EnsembleAverage(base_ensemble[idx, :], name=model_name)
+             fitted_models[i] = avg_model
+
+         return fitted_models
+
+     def _get_base_estimators(self, X):
+         """Takes special care of estimators using custom kernel function
+
+         Parameters
+         ----------
+         X : array, shape = (n_samples, n_features)
+             Samples to pre-compute kernel matrix from.
+
+         Returns
+         -------
+         base_estimators : list
+             Same as `self.base_estimators`, except that estimators with custom kernel function
+             use ``kernel='precomputed'``.
+
+         kernel_cache : dict
+             Maps estimator name to kernel matrix. Use this for cross-validation instead of `X`.
+         """
+         base_estimators = []
+
+         kernel_cache = {}
+         kernel_fns = {}
+         for i, (name, estimator) in enumerate(self.base_estimators):
+             if hasattr(estimator, "kernel") and callable(estimator.kernel):
+                 if not hasattr(estimator, "_get_kernel"):
+                     raise ValueError(
+                         f"estimator {name} uses a custom kernel function, but does not have a _get_kernel method"
+                     )
+
+                 kernel_mat = kernel_fns.get(estimator.kernel, None)
+                 if kernel_mat is None:
+                     kernel_mat = estimator._get_kernel(X)
+                     kernel_cache[i] = kernel_mat
+                     kernel_fns[estimator.kernel] = kernel_mat
+
+                 kernel_cache[i] = kernel_mat
+
+                 # We precompute kernel, but only for training, for testing use original custom kernel function
+                 kernel_estimator = clone(estimator)
+                 kernel_estimator.set_params(kernel="precomputed")
+                 base_estimators.append((name, kernel_estimator))
+             else:
+                 base_estimators.append((name, estimator))
+
+         return base_estimators, kernel_cache
+
+     def _restore_base_estimators(self, kernel_cache, out, X, cv):
+         """Restore custom kernel functions of estimators for predictions"""
+         train_folds = {fold: train_index for fold, (train_index, _) in enumerate(cv)}
+
+         for idx, fold, _, est in out:
+             if idx in kernel_cache:
+                 if not hasattr(est, "fit_X_"):
+                     raise ValueError(
+                         f"estimator {self.base_estimators[idx][0]} uses a custom kernel function, "
+                         "but does not have the attribute `fit_X_` after training"
+                     )
+
+                 est.set_params(kernel=self.base_estimators[idx][1].kernel)
+                 est.fit_X_ = X[train_folds[fold]]
+
+         return out
+
+     def _fit_and_score_ensemble(self, X, y, cv, **fit_params):
+         """Create a cross-validated model by training a model for each fold with the same model parameters"""
+         fit_params_steps = self._split_fit_params(fit_params)
+
+         folds = list(cv.split(X, y))
+
+         # Take care of custom kernel functions
+         base_estimators, kernel_cache = self._get_base_estimators(X)
+
+         out = Parallel(n_jobs=self.n_jobs, verbose=self.verbose)(
+             delayed(_fit_and_score_fold)(
+                 clone(estimator),
+                 X if i not in kernel_cache else kernel_cache[i],
+                 y,
+                 self.scorer,
+                 train_index,
+                 test_index,
+                 fit_params_steps[name],
+                 i,
+                 fold,
+             )
+             for i, (name, estimator) in enumerate(base_estimators)
+             for fold, (train_index, test_index) in enumerate(folds)
+         )
+
+         if len(kernel_cache) > 0:
+             out = self._restore_base_estimators(kernel_cache, out, X, folds)
+
+         return self._create_base_ensemble(out, len(base_estimators), len(folds))
+
+     def _add_diversity_score(self, scores, predictions):
+         n_models = predictions.shape[1]
+
+         cor = self._corr_func(predictions)
+         assert cor.shape == (n_models, n_models)
+         np.fill_diagonal(cor, 0)
+
+         final_scores = scores.copy()
+         diversity = np.apply_along_axis(lambda x: (n_models - np.sum(x >= self.min_correlation)) / n_models, 0, cor)
+
+         final_scores += diversity
+         return final_scores
+
+     def _fit(self, X, y, cv, **fit_params):
+         raise NotImplementedError()
+
+     def fit(self, X, y=None, **fit_params):
+         """Fit ensemble of models
+
+         Parameters
+         ----------
+         X : array-like, shape = (n_samples, n_features)
+             Training data.
+
+         y : array-like, optional
+             Target data if base estimators are supervised.
+
+         Returns
+         -------
+         self
+         """
+         self._check_params()
+
+         cv = check_cv(self.cv, X)
+         self._fit(X, y, cv, **fit_params)
+
+         return self
+
+
+ class EnsembleSelection(BaseEnsembleSelection):
+     """Ensemble selection for survival analysis that accounts for a score and correlations between predictions.
+
+     During training, the ensemble is pruned only according to the specified score (accuracy); for
+     prediction, it is additionally pruned according to the correlation between predictions (diversity).
+
+     The hillclimbing is based on cross-validation to avoid having to create a separate validation set.
+
+     See [1]_, [2]_, [3]_ for further description.
+
+     Parameters
+     ----------
+     base_estimators : list
+         List of (name, estimator) tuples (implementing fit/predict) that are
+         part of the ensemble.
+
+     scorer : callable
+         Function with signature ``func(estimator, X_test, y_test, **test_predict_params)`` that evaluates the error
+         of the prediction on the test data. The function should return a scalar value.
+         *Larger* values of the score are assumed to be better.
+
+     n_estimators : float or int, optional, default: 0.2
+         If a float, the percentage of estimators in the ensemble to retain, if an int the
+         absolute number of estimators to retain.
+
+     min_score : float, optional, default: 0.66
+         Threshold for pruning estimators based on scoring metric. After `fit`, only estimators
+         with a score above `min_score` are retained.
+
+     min_correlation : float, optional, default: 0.6
+         Threshold for Pearson's correlation coefficient that determines when predictions of
+         two estimators are significantly correlated.
+
+     cv : int, a cv generator instance, or None, optional
+         The input specifying which cv generator to use. It can be an
+         integer, in which case it is the number of folds in a KFold,
+         None, in which case 3-fold cross-validation is used, or another object
+         that will then be used as a cv generator. The generator has to ensure
+         that each sample is only used once for testing.
+
+     n_jobs : int, optional, default: 1
+         Number of jobs to run in parallel.
+
+     verbose : int, optional, default: 0
+         Controls the verbosity: the higher, the more messages.
+
+     Attributes
+     ----------
+     scores_ : ndarray, shape = (n_base_estimators,)
+         Array of scores (relative to best performing estimator)
+
+     fitted_models_ : ndarray
+         Selected models during training based on `scorer`.
+
+     n_features_in_ : int
+         Number of features seen during ``fit``.
+
+     feature_names_in_ : ndarray of shape (`n_features_in_`,)
+         Names of features seen during ``fit``. Defined only when `X`
+         has feature names that are all strings.
+
+     References
+     ----------
+
+     .. [1] Pölsterl, S., Gupta, P., Wang, L., Conjeti, S., Katouzian, A., and Navab, N.,
+            "Heterogeneous ensembles for predicting survival of metastatic, castrate-resistant prostate cancer patients".
+            F1000Research, vol. 5, no. 2676, 2016
+
+     .. [2] Caruana, R., Munson, A., Niculescu-Mizil, A.
+            "Getting the most out of ensemble selection". 6th IEEE International Conference on Data Mining, 828-833, 2006
+
+     .. [3] Rooney, N., Patterson, D., Anand, S., Tsymbal, A.
+            "Dynamic integration of regression models". International Workshop on Multiple Classifier Systems,
+            Lecture Notes in Computer Science, vol. 3181, 164-173, 2004
+     """
+
+     _parameter_constraints = {
+         **BaseEnsembleSelection._parameter_constraints,
+     }
+     _parameter_constraints.pop("meta_estimator")
+
+     def __init__(
+         self,
+         base_estimators,
+         *,
+         scorer=None,
+         n_estimators=0.2,
+         min_score=0.2,
+         correlation="pearson",
+         min_correlation=0.6,
+         cv=None,
+         n_jobs=1,
+         verbose=0,
+     ):
+         super().__init__(
+             meta_estimator=MeanRankEstimator(),
+             base_estimators=base_estimators,
+             scorer=scorer,
+             n_estimators=n_estimators,
+             min_score=min_score,
+             correlation=correlation,
+             min_correlation=min_correlation,
+             cv=cv,
+             n_jobs=n_jobs,
+             verbose=verbose,
+         )
+
+     def _fit(self, X, y, cv, **fit_params):
+         scores, base_ensemble = self._fit_and_score_ensemble(X, y, cv, **fit_params)
+         self.fitted_models_, self.scores_ = self._prune_by_cv_score(scores, base_ensemble)
+
+     def _prune_by_cv_score(self, scores, base_ensemble, model_names=None):
+         mean_scores = scores.mean(axis=1)
+         idx_good_models = np.flatnonzero(mean_scores >= self.min_score)
+         if len(idx_good_models) == 0:
+             raise ValueError("no base estimator exceeds min_score, try decreasing it")
+
+         total_score = mean_scores[idx_good_models]
+         max_score = total_score.max()
+         total_score /= max_score
+
+         fitted_models = self._create_cv_ensemble(base_ensemble, idx_good_models, model_names)
+
+         return fitted_models, total_score
+
+     def _prune_by_correlation(self, X):
+         n_models = len(self.fitted_models_)
+
+         out = Parallel(n_jobs=self.n_jobs, verbose=self.verbose)(
+             delayed(_predict)(est, X, i) for i, est in enumerate(self.fitted_models_)
+         )
+
+         predictions = np.empty((X.shape[0], n_models), order="F")
+         for i, p in out:
+             predictions[:, i] = p
+
+         if n_models > self.n_estimators_:
+             final_scores = self._add_diversity_score(self.scores_, predictions)
+             sorted_idx = np.argsort(-final_scores, kind="mergesort")
+
+             selected_models = sorted_idx[: self.n_estimators_]
+             return predictions[:, selected_models]
+
+         return predictions
+
+     def _predict_estimators(self, X):
+         predictions = self._prune_by_correlation(X)
+         return predictions
+
+
+ class EnsembleSelectionRegressor(BaseEnsembleSelection):
+     """Ensemble selection for regression that accounts for the accuracy and correlation of errors.
+
+     The ensemble is pruned during training according to estimators' accuracy and the correlation
+     between prediction errors per sample. The accuracy of the *i*-th estimator is defined as
+     :math:`\\frac{ \\min_{j=1,\\ldots, n}(error_j) }{ error_i }`.
+     In addition to the accuracy, models are selected based on the correlation between residuals
+     of different models (diversity). The diversity of the *i*-th estimator is defined as
+     :math:`\\frac{n-count}{n}`, where *count* is the number of estimators for which the correlation
+     of residuals exceeds `min_correlation`.
+
+     The hillclimbing is based on cross-validation to avoid having to create a separate validation set.
+
+     See [1]_, [2]_, [3]_ for further description.
+
+     Parameters
+     ----------
+     base_estimators : list
+         List of (name, estimator) tuples (implementing fit/predict) that are
+         part of the ensemble.
+
+     scorer : callable
+         Function with signature ``func(estimator, X_test, y_test, **test_predict_params)`` that evaluates the error
+         of the prediction on the test data. The function should return a scalar value.
+         *Smaller* values of the score are assumed to be better.
+
+     n_estimators : float or int, optional, default: 0.2
+         If a float, the percentage of estimators in the ensemble to retain, if an int the
+         absolute number of estimators to retain.
+
+     min_score : float, optional, default: 0.66
+         Threshold for pruning estimators based on scoring metric. After `fit`, only estimators
+         with an accuracy above `min_score` are retained.
+
+     min_correlation : float, optional, default: 0.6
+         Threshold for Pearson's correlation coefficient that determines when residuals of
+         two estimators are significantly correlated.
+
+     cv : int, a cv generator instance, or None, optional
+         The input specifying which cv generator to use. It can be an
+         integer, in which case it is the number of folds in a KFold,
+         None, in which case 3-fold cross-validation is used, or another object
+         that will then be used as a cv generator. The generator has to ensure
+         that each sample is only used once for testing.
+
+     n_jobs : int, optional, default: 1
+         Number of jobs to run in parallel.
+
+     verbose : int, optional, default: 0
+         Controls the verbosity: the higher, the more messages.
+
+     Attributes
+     ----------
+     scores_ : ndarray, shape = (n_base_estimators,)
+         Array of scores (relative to best performing estimator)
+
+     fitted_models_ : ndarray
+         Selected models during training based on `scorer`.
+
+     n_features_in_ : int
+         Number of features seen during ``fit``.
+
+     feature_names_in_ : ndarray of shape (`n_features_in_`,)
+         Names of features seen during ``fit``. Defined only when `X`
+         has feature names that are all strings.
+
+     References
+     ----------
+
+     .. [1] Pölsterl, S., Gupta, P., Wang, L., Conjeti, S., Katouzian, A., and Navab, N.,
+            "Heterogeneous ensembles for predicting survival of metastatic, castrate-resistant prostate cancer patients".
+            F1000Research, vol. 5, no. 2676, 2016
+
+     .. [2] Caruana, R., Munson, A., Niculescu-Mizil, A.
+            "Getting the most out of ensemble selection". 6th IEEE International Conference on Data Mining, 828-833, 2006
+
+     .. [3] Rooney, N., Patterson, D., Anand, S., Tsymbal, A.
+            "Dynamic integration of regression models". International Workshop on Multiple Classifier Systems,
+            Lecture Notes in Computer Science, vol. 3181, 164-173, 2004
+     """
+
+     _parameter_constraints = {
+         **BaseEnsembleSelection._parameter_constraints,
+     }
+     _parameter_constraints.pop("meta_estimator")
+
+     def __init__(
+         self,
+         base_estimators,
+         *,
+         scorer=None,
+         n_estimators=0.2,
+         min_score=0.66,
+         correlation="pearson",
+         min_correlation=0.6,
+         cv=None,
+         n_jobs=1,
+         verbose=0,
+     ):
+         super().__init__(
+             meta_estimator=MeanEstimator(),
+             base_estimators=base_estimators,
+             scorer=scorer,
+             n_estimators=n_estimators,
+             min_score=min_score,
+             correlation=correlation,
+             min_correlation=min_correlation,
+             cv=cv,
+             n_jobs=n_jobs,
+             verbose=verbose,
+         )
+
+     @property
+     def _predict_risk_score(self):
+         return False
+
+     def _fit(self, X, y, cv, **fit_params):
+         scores, base_ensemble = self._fit_and_score_ensemble(X, y, cv, **fit_params)
+         fitted_models, scores = self._prune_by_cv_score(scores, base_ensemble)
+
+         if len(fitted_models) > self.n_estimators_:
+             fitted_models, scores = self._prune_by_correlation(fitted_models, scores, X, y)
+
+         self.fitted_models_ = fitted_models
+         self.scores_ = scores
+
+     def _prune_by_cv_score(self, scores, base_ensemble, model_names=None):
+         mean_scores = scores.mean(axis=1)
+         mean_scores = mean_scores.min() / mean_scores
+
+         idx_good_models = np.flatnonzero(mean_scores >= self.min_score)
+         if len(idx_good_models) == 0:
+             raise ValueError("no base estimator exceeds min_score, try decreasing it")
+
+         fitted_models = self._create_cv_ensemble(base_ensemble, idx_good_models, model_names)
+
+         return fitted_models, mean_scores[idx_good_models]
+
+     def _prune_by_correlation(self, fitted_models, scores, X, y):
+         n_models = len(fitted_models)
+
+         out = Parallel(n_jobs=self.n_jobs, verbose=self.verbose)(
+             delayed(_score_regressor)(est, X, y, i) for i, est in enumerate(fitted_models)
+         )
+
+         error = np.empty((X.shape[0], n_models), order="F")
+         for i, err in out:
+             error[:, i] = err
+
+         final_scores = self._add_diversity_score(scores, error)
+         sorted_idx = np.argsort(-final_scores, kind="mergesort")
+
+         selected_models = sorted_idx[: self.n_estimators_]
+
+         return fitted_models[selected_models], final_scores
+
+     def _predict_estimators(self, X):
+         n_models = len(self.fitted_models_)
+
+         out = Parallel(n_jobs=self.n_jobs, verbose=self.verbose)(
+             delayed(_predict)(est, X, i) for i, est in enumerate(self.fitted_models_)
+         )
+
+         predictions = np.empty((X.shape[0], n_models), order="F")
+         for i, p in out:
+             predictions[:, i] = p
+
+         return predictions
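
For orientation only, the following is a minimal usage sketch of the EnsembleSelection estimator defined in the file above; it is not part of the packaged code. It assumes that EnsembleSelection is exported from sksurv.meta, that predict is inherited from the Stacking base class, and it wraps concordance_index_censored in a scorer matching the documented func(estimator, X_test, y_test, **test_predict_params) signature. The choice of base estimators, dataset, and parameter values is illustrative, not a recommended configuration.

from sksurv.column import encode_categorical
from sksurv.datasets import load_whas500
from sksurv.ensemble import GradientBoostingSurvivalAnalysis, RandomSurvivalForest
from sksurv.linear_model import CoxPHSurvivalAnalysis
from sksurv.meta import EnsembleSelection
from sksurv.metrics import concordance_index_censored


def cindex_scorer(estimator, X_test, y_test, **predict_params):
    # Larger is better, as required by EnsembleSelection's scorer contract.
    prediction = estimator.predict(X_test, **predict_params)
    event, time = y_test.dtype.names
    return concordance_index_censored(y_test[event], y_test[time], prediction)[0]


X, y = load_whas500()
X = encode_categorical(X).values  # one-hot encode categorical columns, pass plain ndarrays

base_estimators = [
    ("coxph", CoxPHSurvivalAnalysis(alpha=0.1)),
    ("gbsa", GradientBoostingSurvivalAnalysis(n_estimators=100, random_state=0)),
    ("rsf", RandomSurvivalForest(n_estimators=100, random_state=0)),
]

# Keep at most 2 of the 3 cross-validated base models, dropping any whose mean
# concordance index across the folds falls below 0.5.
ensemble = EnsembleSelection(base_estimators, scorer=cindex_scorer, n_estimators=2, min_score=0.5, cv=3)
ensemble.fit(X, y)
risk_scores = ensemble.predict(X)  # predict() comes from the Stacking base class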