scikit_survival-0.26.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scikit_survival-0.26.0.dist-info/METADATA +185 -0
- scikit_survival-0.26.0.dist-info/RECORD +58 -0
- scikit_survival-0.26.0.dist-info/WHEEL +6 -0
- scikit_survival-0.26.0.dist-info/licenses/COPYING +674 -0
- scikit_survival-0.26.0.dist-info/top_level.txt +1 -0
- sksurv/__init__.py +183 -0
- sksurv/base.py +115 -0
- sksurv/bintrees/__init__.py +15 -0
- sksurv/bintrees/_binarytrees.cpython-312-x86_64-linux-gnu.so +0 -0
- sksurv/column.py +204 -0
- sksurv/compare.py +123 -0
- sksurv/datasets/__init__.py +12 -0
- sksurv/datasets/base.py +614 -0
- sksurv/datasets/data/GBSG2.arff +700 -0
- sksurv/datasets/data/actg320.arff +1169 -0
- sksurv/datasets/data/bmt.arff +46 -0
- sksurv/datasets/data/breast_cancer_GSE7390-metastasis.arff +283 -0
- sksurv/datasets/data/cgvhd.arff +118 -0
- sksurv/datasets/data/flchain.arff +7887 -0
- sksurv/datasets/data/veteran.arff +148 -0
- sksurv/datasets/data/whas500.arff +520 -0
- sksurv/docstrings.py +99 -0
- sksurv/ensemble/__init__.py +2 -0
- sksurv/ensemble/_coxph_loss.cpython-312-x86_64-linux-gnu.so +0 -0
- sksurv/ensemble/boosting.py +1564 -0
- sksurv/ensemble/forest.py +902 -0
- sksurv/ensemble/survival_loss.py +151 -0
- sksurv/exceptions.py +18 -0
- sksurv/functions.py +114 -0
- sksurv/io/__init__.py +2 -0
- sksurv/io/arffread.py +91 -0
- sksurv/io/arffwrite.py +181 -0
- sksurv/kernels/__init__.py +1 -0
- sksurv/kernels/_clinical_kernel.cpython-312-x86_64-linux-gnu.so +0 -0
- sksurv/kernels/clinical.py +348 -0
- sksurv/linear_model/__init__.py +3 -0
- sksurv/linear_model/_coxnet.cpython-312-x86_64-linux-gnu.so +0 -0
- sksurv/linear_model/aft.py +208 -0
- sksurv/linear_model/coxnet.py +592 -0
- sksurv/linear_model/coxph.py +637 -0
- sksurv/meta/__init__.py +4 -0
- sksurv/meta/base.py +35 -0
- sksurv/meta/ensemble_selection.py +724 -0
- sksurv/meta/stacking.py +370 -0
- sksurv/metrics.py +1028 -0
- sksurv/nonparametric.py +911 -0
- sksurv/preprocessing.py +195 -0
- sksurv/svm/__init__.py +11 -0
- sksurv/svm/_minlip.cpython-312-x86_64-linux-gnu.so +0 -0
- sksurv/svm/_prsvm.cpython-312-x86_64-linux-gnu.so +0 -0
- sksurv/svm/minlip.py +695 -0
- sksurv/svm/naive_survival_svm.py +249 -0
- sksurv/svm/survival_svm.py +1236 -0
- sksurv/testing.py +155 -0
- sksurv/tree/__init__.py +1 -0
- sksurv/tree/_criterion.cpython-312-x86_64-linux-gnu.so +0 -0
- sksurv/tree/tree.py +790 -0
- sksurv/util.py +416 -0
sksurv/ensemble/boosting.py
@@ -0,0 +1,1564 @@
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
import numbers

import numpy as np
from scipy.sparse import csc_matrix, csr_matrix, issparse
from sklearn.base import BaseEstimator
from sklearn.ensemble._base import BaseEnsemble
from sklearn.ensemble._gb import BaseGradientBoosting, VerboseReporter
from sklearn.ensemble._gradient_boosting import _random_sample_mask
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree._tree import DTYPE
from sklearn.utils._param_validation import Interval, StrOptions
from sklearn.utils.extmath import squared_norm
from sklearn.utils.validation import (
    _check_sample_weight,
    check_array,
    check_is_fitted,
    check_random_state,
    validate_data,
)

from ..base import SurvivalAnalysisMixin
from ..docstrings import append_cumulative_hazard_example, append_survival_function_example
from ..linear_model.coxph import BreslowEstimator
from ..util import check_array_survival
from .survival_loss import LOSS_FUNCTIONS, CensoredSquaredLoss, CoxPH, IPCWLeastSquaresError

__all__ = ["ComponentwiseGradientBoostingSurvivalAnalysis", "GradientBoostingSurvivalAnalysis"]


def _sample_binomial_plus_one(p, size, random_state):
    drop_model = random_state.binomial(1, p=p, size=size)
    n_dropped = np.sum(drop_model)
    if n_dropped == 0:
        idx = random_state.randint(0, size)
        drop_model[idx] = 1
        n_dropped = 1
    return drop_model, n_dropped


class _ComponentwiseLeastSquares(BaseEstimator):
    def __init__(self, component):
        self.component = component

    def fit(self, X, y, sample_weight):
        xw = X[:, self.component] * sample_weight
        b = np.dot(xw, y)
        if b == 0:
            self.coef_ = 0
        else:
            a = np.dot(xw, xw)
            self.coef_ = b / a

        return self

    def predict(self, X):
        return X[:, self.component] * self.coef_


def _fit_stage_componentwise(X, residuals, sample_weight, **fit_params):  # pylint: disable=unused-argument
    """Fit component-wise weighted least squares model"""
    n_features = X.shape[1]

    base_learners = []
    error = np.empty(n_features)
    for component in range(n_features):
        learner = _ComponentwiseLeastSquares(component).fit(X, residuals, sample_weight)
        l_pred = learner.predict(X)
        error[component] = squared_norm(residuals - l_pred)
        base_learners.append(learner)

    # TODO: could use bottleneck.nanargmin for speed
    best_component = np.nanargmin(error)
    best_learner = base_learners[best_component]
    return best_learner


class ComponentwiseGradientBoostingSurvivalAnalysis(BaseEnsemble, SurvivalAnalysisMixin):
    r"""Gradient boosting with component-wise least squares as base learner.

    See the :ref:`User Guide </user_guide/boosting.ipynb>` and [1]_ for further description.

    Parameters
    ----------
    loss : {'coxph', 'squared', 'ipcwls'}, optional, default: 'coxph'
        loss function to be optimized. 'coxph' refers to partial likelihood loss
        of Cox's proportional hazards model. The loss 'squared' minimizes a
        squared regression loss that ignores predictions beyond the time of censoring,
        and 'ipcwls' refers to inverse-probability of censoring weighted least squares error.

    learning_rate : float, optional, default: 0.1
        learning rate shrinks the contribution of each base learner by `learning_rate`.
        There is a trade-off between `learning_rate` and `n_estimators`.
        Values must be in the range `[0.0, inf)`.

    n_estimators : int, optional, default: 100
        The number of boosting stages to perform. Gradient boosting
        is fairly robust to over-fitting so a large number usually
        results in better performance.
        Values must be in the range `[1, inf)`.

    subsample : float, optional, default: 1.0
        The fraction of samples to be used for fitting the individual base
        learners. If smaller than 1.0 this results in Stochastic Gradient
        Boosting. `subsample` interacts with the parameter `n_estimators`.
        Choosing `subsample < 1.0` leads to a reduction of variance
        and an increase in bias.
        Values must be in the range `(0.0, 1.0]`.

    warm_start : bool, optional, default: False
        When set to ``True``, reuse the solution of the previous call to fit
        and add more estimators to the ensemble, otherwise, just erase the
        previous solution.

    dropout_rate : float, optional, default: 0.0
        If larger than zero, the residuals at each iteration are only computed
        from a random subset of base learners. The value corresponds to the
        percentage of base learners that are dropped. In each iteration,
        at least one base learner is dropped. This is an alternative regularization
        to shrinkage, i.e., setting `learning_rate < 1.0`.
        Values must be in the range `[0.0, 1.0)`.

    random_state : int, RandomState instance or None, optional, default: None
        Controls the randomness of the subsampling of the data if ``subsample < 1.0``,
        and the random selection of base learners to drop if ``dropout_rate > 0``.
        Pass an int for reproducible output across multiple function calls.

    verbose : int, optional, default: 0
        Enable verbose output. If 1 then it prints progress and performance
        once in a while.
        Values must be in the range `[0, inf)`.

    Attributes
    ----------
    coef_ : ndarray, shape = (n_features + 1,), dtype = float
        The aggregated coefficients. The first element `coef\_[0]` corresponds
        to the intercept. If loss is `coxph`, the intercept will always be zero.

    estimators_ : list of base learners
        The collection of fitted sub-estimators.

    train_score_ : ndarray, shape = (n_estimators,)
        The i-th score ``train_score_[i]`` is the loss of the
        model at iteration ``i`` on the in-bag sample.
        If ``subsample == 1`` this is the loss on the training data.

    oob_improvement_ : ndarray, shape = (n_estimators,)
        The improvement in loss on the out-of-bag samples
        relative to the previous iteration.
        ``oob_improvement_[0]`` is the improvement in
        loss of the first stage over the ``init`` estimator.
        Only available if ``subsample < 1.0``.

    oob_scores_ : ndarray, shape = (n_estimators,)
        The full history of the loss values on the out-of-bag
        samples. Only available if ``subsample < 1.0``.

    oob_score_ : float
        The last value of the loss on the out-of-bag samples. It is
        the same as ``oob_scores_[-1]``. Only available if ``subsample < 1.0``.

    n_features_in_ : int
        Number of features seen during ``fit``.

    feature_names_in_ : ndarray, shape = (`n_features_in_`,)
        Names of features seen during ``fit``. Defined only when `X`
        has feature names that are all strings.

    unique_times_ : ndarray, shape = (n_unique_times,)
        Unique time points.

    References
    ----------
    .. [1] Hothorn, T., Bühlmann, P., Dudoit, S., Molinaro, A., van der Laan, M. J.,
           "Survival ensembles", Biostatistics, 7(3), 355-73, 2006
    """

    _parameter_constraints = {
        "loss": [StrOptions(frozenset(LOSS_FUNCTIONS.keys()))],
        "learning_rate": [Interval(numbers.Real, 0.0, None, closed="left")],
        "n_estimators": [Interval(numbers.Integral, 1, None, closed="left")],
        "subsample": [Interval(numbers.Real, 0.0, 1.0, closed="right")],
        "warm_start": ["boolean"],
        "dropout_rate": [Interval(numbers.Real, 0.0, 1.0, closed="left")],
        "random_state": ["random_state"],
        "verbose": ["verbose"],
    }

    def __init__(
        self,
        *,
        loss="coxph",
        learning_rate=0.1,
        n_estimators=100,
        subsample=1.0,
        warm_start=False,
        dropout_rate=0,
        random_state=None,
        verbose=0,
    ):
        self.loss = loss
        self.n_estimators = n_estimators
        self.learning_rate = learning_rate
        self.subsample = subsample
        self.warm_start = warm_start
        self.dropout_rate = dropout_rate
        self.random_state = random_state
        self.verbose = verbose

    @property
    def _predict_risk_score(self):
        return isinstance(self._loss, CoxPH)

    def _is_fitted(self):
        return len(getattr(self, "estimators_", [])) > 0

    def _init_state(self):
        self.estimators_ = np.empty(self.n_estimators, dtype=object)

        self.train_score_ = np.zeros(self.n_estimators, dtype=np.float64)
        # do oob?
        if self.subsample < 1.0:
            self.oob_improvement_ = np.zeros(self.n_estimators, dtype=np.float64)
            self.oob_scores_ = np.zeros(self.n_estimators, dtype=np.float64)
            self.oob_score_ = np.nan

        if self.dropout_rate > 0:
            self._scale = np.ones(int(self.n_estimators), dtype=float)

    def _resize_state(self):
        """Add additional ``n_estimators`` entries to all attributes."""
        # self.n_estimators is the number of additional est to fit
        total_n_estimators = self.n_estimators

        self.estimators_ = np.resize(self.estimators_, total_n_estimators)
        self.train_score_ = np.resize(self.train_score_, total_n_estimators)
        if self.subsample < 1 or hasattr(self, "oob_improvement_"):
            # if do oob resize arrays or create new if not available
            if hasattr(self, "oob_improvement_"):
                self.oob_improvement_ = np.resize(self.oob_improvement_, total_n_estimators)
                self.oob_scores_ = np.resize(self.oob_scores_, total_n_estimators)
                self.oob_score_ = np.nan
            else:
                self.oob_improvement_ = np.zeros(total_n_estimators, dtype=np.float64)
                self.oob_scores_ = np.zeros((total_n_estimators,), dtype=np.float64)
                self.oob_score_ = np.nan

        if self.dropout_rate > 0:
            if not hasattr(self, "_scale"):
                raise ValueError(
                    "fitting with warm_start=True and dropout_rate > 0 is only "
                    "supported if the previous fit used dropout_rate > 0 too"
                )

            self._scale = np.resize(self._scale, total_n_estimators)
            self._scale[self.n_estimators_ :] = 1

    def _clear_state(self):
        """Clear the state of the gradient boosting model."""
        if hasattr(self, "estimators_"):
            self.estimators_ = np.empty(0, dtype=object)
        if hasattr(self, "train_score_"):
            del self.train_score_
        if hasattr(self, "oob_improvement_"):
            del self.oob_improvement_
        if hasattr(self, "oob_scores_"):
            del self.oob_scores_
        if hasattr(self, "oob_score_"):
            del self.oob_score_
        if hasattr(self, "_rng"):
            del self._rng
        if hasattr(self, "_scale"):
            del self._scale

    def _update_with_dropout(self, i, X, raw_predictions, scale, random_state):
        # select base learners to be dropped for next iteration
        drop_model, n_dropped = _sample_binomial_plus_one(self.dropout_rate, i + 1, random_state)

        # adjust scaling factor of tree that is going to be trained in next iteration
        scale[i + 1] = 1.0 / (n_dropped + 1.0)

        raw_predictions[:] = 0
        for m in range(i + 1):
            if drop_model[m] == 1:
                # adjust scaling factor of dropped trees
                scale[m] *= n_dropped / (n_dropped + 1.0)
            else:
                # pseudoresponse of next iteration (without contribution of dropped trees)
                raw_predictions += self.learning_rate * scale[m] * self.estimators_[m].predict(X)

    def _fit(self, X, event, time, y_pred, sample_weight, random_state, begin_at_stage=0):  # noqa: C901
        n_samples = X.shape[0]
        # account for intercept
        y = np.fromiter(zip(event, time), dtype=[("event", bool), ("time", np.float64)])

        do_oob = self.subsample < 1.0
        if do_oob:
            n_inbag = max(1, int(self.subsample * n_samples))

        do_dropout = self.dropout_rate > 0
        if do_dropout:
            scale = self._scale

        if self.verbose:
            verbose_reporter = VerboseReporter(verbose=self.verbose)
            verbose_reporter.init(self, 0)

        # perform boosting iterations
        i = begin_at_stage
        for i in range(begin_at_stage, int(self.n_estimators)):
            # subsampling
            if do_oob:
                sample_mask = _random_sample_mask(n_samples, n_inbag, random_state)
                subsample_weight = sample_weight * sample_mask.astype(np.float64)

                # OOB score before adding this stage
                y_oob_masked = y[~sample_mask]
                sample_weight_oob_masked = sample_weight[~sample_mask]
                if i == 0:  # store the initial loss to compute the OOB score
                    initial_loss = self._loss(
                        y_true=y_oob_masked,
                        raw_prediction=y_pred[~sample_mask],
                        sample_weight=sample_weight_oob_masked,
                    )
            else:
                subsample_weight = sample_weight

            residuals = self._loss.gradient(y, y_pred, sample_weight=sample_weight)

            best_learner = _fit_stage_componentwise(X, residuals, subsample_weight)
            self.estimators_[i] = best_learner

            if do_dropout and i < len(scale) - 1:
                self._update_with_dropout(i, X, y_pred, scale, random_state)
            else:
                y_pred += self.learning_rate * best_learner.predict(X)

            # track loss
            if do_oob:
                self.train_score_[i] = self._loss(
                    y_true=y[sample_mask],
                    raw_prediction=y_pred[sample_mask],
                    sample_weight=sample_weight[sample_mask],
                )
                self.oob_scores_[i] = self._loss(
                    y_true=y_oob_masked,
                    raw_prediction=y_pred[~sample_mask],
                    sample_weight=sample_weight_oob_masked,
                )
                previous_loss = initial_loss if i == 0 else self.oob_scores_[i - 1]
                self.oob_improvement_[i] = previous_loss - self.oob_scores_[i]
                self.oob_score_ = self.oob_scores_[-1]
            else:
                # no need to fancy index w/ no subsampling
                self.train_score_[i] = self._loss(y_true=y, raw_prediction=y_pred, sample_weight=sample_weight)

            if self.verbose > 0:
                verbose_reporter.update(i, self)

        return i + 1

    def fit(self, X, y, sample_weight=None):
        """Fit estimator.

        Parameters
        ----------
        X : array-like, shape = (n_samples, n_features)
            Data matrix

        y : structured array, shape = (n_samples,)
            A structured array with two fields. The first field is a boolean
            where ``True`` indicates an event and ``False`` indicates right-censoring.
            The second field is a float with the time of event or time of censoring.

        sample_weight : array-like, shape = (n_samples,), optional
            Weights given to each sample. If omitted, all samples have weight 1.

        Returns
        -------
        self
        """
        self._validate_params()

        if not self.warm_start:
            self._clear_state()

        X = validate_data(self, X, ensure_min_samples=2)
        event, time = check_array_survival(X, y)

        sample_weight = _check_sample_weight(sample_weight, X)

        n_samples = X.shape[0]
        Xi = np.column_stack((np.ones(n_samples), X))

        self._loss = LOSS_FUNCTIONS[self.loss]()
        if isinstance(self._loss, CensoredSquaredLoss | IPCWLeastSquaresError):
            time = np.log(time)

        if not self._is_fitted():
            self._init_state()

            y_pred = np.zeros(n_samples, dtype=np.float64)

            begin_at_stage = 0

            self._rng = check_random_state(self.random_state)
        else:
            # add more estimators to fitted model
            # invariant: warm_start = True
            if self.n_estimators < self.estimators_.shape[0]:
                raise ValueError(
                    "n_estimators=%d must be larger or equal to "
                    "estimators_.shape[0]=%d when "
                    "warm_start==True" % (self.n_estimators, self.estimators_.shape[0])
                )
            begin_at_stage = self.estimators_.shape[0]
            y_pred = self._raw_predict(Xi)
            self._resize_state()

            # apply dropout to last stage of previous fit
            if hasattr(self, "_scale") and self.dropout_rate > 0:
                # pylint: disable-next=access-member-before-definition
                self._update_with_dropout(self.n_estimators_ - 1, Xi, y_pred, self._scale, self._rng)

        self.n_estimators_ = self._fit(Xi, event, time, y_pred, sample_weight, self._rng, begin_at_stage)

        self._set_baseline_model(X, event, time)
        return self

    def _set_baseline_model(self, X, event, time):
        if isinstance(self._loss, CoxPH):
            risk_scores = self._predict(X)
            self._baseline_model = BreslowEstimator().fit(risk_scores, event, time)
        else:
            self._baseline_model = None

    def _raw_predict(self, X):
        pred = np.zeros(X.shape[0], dtype=float)
        for estimator in self.estimators_:
            pred += self.learning_rate * estimator.predict(X)
        return pred

    def _predict(self, X):
        # account for intercept
        Xi = np.column_stack((np.ones(X.shape[0]), X))
        pred = self._raw_predict(Xi)
        return self._loss._scale_raw_prediction(pred)

    def predict(self, X):
        """Predict risk scores.

        If `loss='coxph'`, predictions can be interpreted as log hazard ratio
        corresponding to the linear predictor of a Cox proportional hazards
        model. If `loss='squared'` or `loss='ipcwls'`, predictions are the
        time to event.

        Parameters
        ----------
        X : array-like, shape = (n_samples, n_features)
            Data matrix.

        Returns
        -------
        risk_score : array, shape = (n_samples,)
            Predicted risk scores.
        """
        check_is_fitted(self, "estimators_")
        X = validate_data(self, X, reset=False)

        return self._predict(X)

    def _get_baseline_model(self):
        if self._baseline_model is None:
            raise ValueError("`fit` must be called with the loss option set to 'coxph'.")
        return self._baseline_model

    @append_cumulative_hazard_example(
        estimator_mod="ensemble", estimator_class="ComponentwiseGradientBoostingSurvivalAnalysis"
    )
    def predict_cumulative_hazard_function(self, X, return_array=False):
        r"""Predict cumulative hazard function.

        Only available if :meth:`fit` has been called with `loss = "coxph"`.

        The cumulative hazard function for an individual
        with feature vector :math:`x` is defined as

        .. math::

            H(t \mid x) = \exp(f(x)) H_0(t) ,

        where :math:`f(\cdot)` is the additive ensemble of base learners,
        and :math:`H_0(t)` is the baseline hazard function,
        estimated by Breslow's estimator.

        Parameters
        ----------
        X : array-like, shape = (n_samples, n_features)
            Data matrix.

        return_array : bool, default: False
            Whether to return a single array of cumulative hazard values
            or a list of step functions.

            If `False`, a list of :class:`sksurv.functions.StepFunction`
            objects is returned.

            If `True`, a 2d-array of shape `(n_samples, n_unique_times)` is
            returned, where `n_unique_times` is the number of unique
            event times in the training data. Each row represents the cumulative
            hazard function of an individual evaluated at `unique_times_`.

        Returns
        -------
        cum_hazard : ndarray
            If `return_array` is `False`, an array of `n_samples`
            :class:`sksurv.functions.StepFunction` instances is returned.

            If `return_array` is `True`, a numeric array of shape
            `(n_samples, n_unique_times_)` is returned.

        Examples
        --------
        """
        return self._predict_cumulative_hazard_function(self._get_baseline_model(), self.predict(X), return_array)

    @append_survival_function_example(
        estimator_mod="ensemble", estimator_class="ComponentwiseGradientBoostingSurvivalAnalysis"
    )
    def predict_survival_function(self, X, return_array=False):
        r"""Predict survival function.

        Only available if :meth:`fit` has been called with `loss = "coxph"`.

        The survival function for an individual
        with feature vector :math:`x` is defined as

        .. math::
            S(t \mid x) = S_0(t)^{\exp(f(x))} ,
553
|
+
|
|
554
|
+
where :math:`f(\cdot)` is the additive ensemble of base learners,
|
|
555
|
+
and :math:`S_0(t)` is the baseline survival function,
|
|
556
|
+
estimated by Breslow's estimator.
|
|
557
|
+
|
|
558
|
+
Parameters
|
|
559
|
+
----------
|
|
560
|
+
X : array-like, shape = (n_samples, n_features)
|
|
561
|
+
Data matrix.
|
|
562
|
+
|
|
563
|
+
return_array : bool, default: False
|
|
564
|
+
Whether to return a single array of survival probabilities
|
|
565
|
+
or a list of step functions.
|
|
566
|
+
|
|
567
|
+
If `False`, a list of :class:`sksurv.functions.StepFunction`
|
|
568
|
+
objects is returned.
|
|
569
|
+
|
|
570
|
+
If `True`, a 2d-array of shape `(n_samples, n_unique_times)` is
|
|
571
|
+
returned, where `n_unique_times` is the number of unique
|
|
572
|
+
event times in the training data. Each row represents the survival
|
|
573
|
+
function of an individual evaluated at `unique_times_`.
|
|
574
|
+
|
|
575
|
+
Returns
|
|
576
|
+
-------
|
|
577
|
+
survival : ndarray
|
|
578
|
+
If `return_array` is `False`, an array of `n_samples`
|
|
579
|
+
:class:`sksurv.functions.StepFunction` instances is returned.
|
|
580
|
+
|
|
581
|
+
If `return_array` is `True`, a numeric array of shape
|
|
582
|
+
`(n_samples, n_unique_times_)` is returned.
|
|
583
|
+
|
|
584
|
+
Examples
|
|
585
|
+
--------
|
|
586
|
+
"""
|
|
587
|
+
return self._predict_survival_function(self._get_baseline_model(), self.predict(X), return_array)
|
|
588
|
+
|
|
589
|
+
@property
|
|
590
|
+
def coef_(self):
|
|
591
|
+
coef = np.zeros(self.n_features_in_ + 1, dtype=float)
|
|
592
|
+
|
|
593
|
+
for estimator in self.estimators_:
|
|
594
|
+
coef[estimator.component] += self.learning_rate * estimator.coef_
|
|
595
|
+
|
|
596
|
+
return coef
|
|
597
|
+
|
|
598
|
+
@property
|
|
599
|
+
def unique_times_(self):
|
|
600
|
+
return self._get_baseline_model().unique_times_
|
|
601
|
+
|
|
602
|
+
@property
|
|
603
|
+
def feature_importances_(self):
|
|
604
|
+
imp = np.empty(self.n_features_in_ + 1, dtype=object)
|
|
605
|
+
for i in range(imp.shape[0]):
|
|
606
|
+
imp[i] = []
|
|
607
|
+
|
|
608
|
+
for k, estimator in enumerate(self.estimators_):
|
|
609
|
+
imp[estimator.component].append(k + 1)
|
|
610
|
+
|
|
611
|
+
def _importance(x):
|
|
612
|
+
if len(x) > 0:
|
|
613
|
+
return np.min(x)
|
|
614
|
+
return np.nan
|
|
615
|
+
|
|
616
|
+
ret = np.array([_importance(x) for x in imp])
|
|
617
|
+
return ret
|
|
618
|
+
|
|
619
|
+
def _make_estimator(self, append=True, random_state=None):
|
|
620
|
+
# we don't need _make_estimator
|
|
621
|
+
raise NotImplementedError()
|
|
622
|
+
|
|
623
|
+
|
|
624
|
+
class GradientBoostingSurvivalAnalysis(BaseGradientBoosting, SurvivalAnalysisMixin):
|
|
625
|
+
r"""Gradient-boosted Cox proportional hazard loss with
|
|
626
|
+
regression trees as base learner.
|
|
627
|
+
|
|
628
|
+
In each stage, a regression tree is fit on the negative gradient
|
|
629
|
+
of the loss function.
|
|
630
|
+
|
|
631
|
+
For more details on gradient boosting see [1]_ and [2]_. If `loss='coxph'`,
|
|
632
|
+
the partial likelihood of the proportional hazards model is optimized as
|
|
633
|
+
described in [3]_. If `loss='ipcwls'`, the accelerated failure time model with
|
|
634
|
+
inverse-probability of censoring weighted least squares error is optimized as
|
|
635
|
+
described in [4]_. When using a non-zero `dropout_rate`, regularization is
|
|
636
|
+
applied during training following [5]_.
|
|
637
|
+
|
|
638
|
+
See the :ref:`User Guide </user_guide/boosting.ipynb>` for examples.
|
|
639
|
+
|
|
640
|
+
Parameters
|
|
641
|
+
----------
|
|
642
|
+
loss : {'coxph', 'squared', 'ipcwls'}, optional, default: 'coxph'
|
|
643
|
+
loss function to be optimized. 'coxph' refers to partial likelihood loss
|
|
644
|
+
of Cox's proportional hazards model. The loss 'squared' minimizes a
|
|
645
|
+
squared regression loss that ignores predictions beyond the time of censoring,
|
|
646
|
+
and 'ipcwls' refers to inverse-probability of censoring weighted least squares error.
|
|
647
|
+
|
|
648
|
+
learning_rate : float, optional, default: 0.1
|
|
649
|
+
learning rate shrinks the contribution of each tree by `learning_rate`.
|
|
650
|
+
There is a trade-off between `learning_rate` and `n_estimators`.
|
|
651
|
+
Values must be in the range `[0.0, inf)`.
|
|
652
|
+
|
|
653
|
+
n_estimators : int, optional, default: 100
|
|
654
|
+
The number of regression trees to create. Gradient boosting
|
|
655
|
+
is fairly robust to over-fitting so a large number usually
|
|
656
|
+
results in better performance.
|
|
657
|
+
Values must be in the range `[1, inf)`.
|
|
658
|
+
|
|
659
|
+
subsample : float, optional, default: 1.0
|
|
660
|
+
The fraction of samples to be used for fitting the individual base
|
|
661
|
+
learners. If smaller than 1.0 this results in Stochastic Gradient
|
|
662
|
+
Boosting. `subsample` interacts with the parameter `n_estimators`.
|
|
663
|
+
Choosing `subsample < 1.0` leads to a reduction of variance
|
|
664
|
+
and an increase in bias.
|
|
665
|
+
Values must be in the range `(0.0, 1.0]`.
|
|
666
|
+
|
|
667
|
+
criterion : {'friedman_mse', 'squared_error'}, optional, default: 'friedman_mse'
|
|
668
|
+
The function to measure the quality of a split. Supported criteria are
|
|
669
|
+
'friedman_mse' for the mean squared error with improvement score by
|
|
670
|
+
Friedman, 'squared_error' for mean squared error. The default value of
|
|
671
|
+
'friedman_mse' is generally the best as it can provide a better
|
|
672
|
+
approximation in some cases.
|
|
673
|
+
|
|
674
|
+
min_samples_split : int or float, optional, default: 2
|
|
675
|
+
The minimum number of samples required to split an internal node:
|
|
676
|
+
|
|
677
|
+
- If int, values must be in the range `[2, inf)`.
|
|
678
|
+
- If float, values must be in the range `(0.0, 1.0]` and `min_samples_split`
|
|
679
|
+
will be `ceil(min_samples_split * n_samples)`.
|
|
680
|
+
|
|
681
|
+
min_samples_leaf : int or float, optional, default: 1
|
|
682
|
+
The minimum number of samples required to be at a leaf node.
|
|
683
|
+
A split point at any depth will only be considered if it leaves at
|
|
684
|
+
least ``min_samples_leaf`` training samples in each of the left and
|
|
685
|
+
right branches. This may have the effect of smoothing the model,
|
|
686
|
+
especially in regression.
|
|
687
|
+
|
|
688
|
+
- If int, values must be in the range `[1, inf)`.
|
|
689
|
+
- If float, values must be in the range `(0.0, 1.0)` and `min_samples_leaf`
|
|
690
|
+
will be `ceil(min_samples_leaf * n_samples)`.
|
|
691
|
+
|
|
692
|
+
min_weight_fraction_leaf : float, optional, default: 0.
|
|
693
|
+
The minimum weighted fraction of the sum total of weights (of all
|
|
694
|
+
the input samples) required to be at a leaf node. Samples have
|
|
695
|
+
equal weight when `sample_weight` is not provided.
|
|
696
|
+
Values must be in the range `[0.0, 0.5]`.
|
|
697
|
+
|
|
698
|
+
max_depth : int or None, optional, default: 3
|
|
699
|
+
Maximum depth of the individual regression estimators. The maximum
|
|
700
|
+
depth limits the number of nodes in the tree. Tune this parameter
|
|
701
|
+
for best performance; the best value depends on the interaction
|
|
702
|
+
of the input variables. If None, then nodes are expanded until
|
|
703
|
+
all leaves are pure or until all leaves contain less than
|
|
704
|
+
`min_samples_split` samples.
|
|
705
|
+
If int, values must be in the range `[1, inf)`.
|
|
706
|
+
|
|
707
|
+
min_impurity_decrease : float, optional, default: 0.
|
|
708
|
+
A node will be split if this split induces a decrease of the impurity
|
|
709
|
+
greater than or equal to this value.
|
|
710
|
+
|
|
711
|
+
The weighted impurity decrease equation is the following::
|
|
712
|
+
|
|
713
|
+
N_t / N * (impurity - N_t_R / N_t * right_impurity
|
|
714
|
+
- N_t_L / N_t * left_impurity)
|
|
715
|
+
|
|
716
|
+
where ``N`` is the total number of samples, ``N_t`` is the number of
|
|
717
|
+
samples at the current node, ``N_t_L`` is the number of samples in the
|
|
718
|
+
left child, and ``N_t_R`` is the number of samples in the right child.
|
|
719
|
+
|
|
720
|
+
``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum,
|
|
721
|
+
if ``sample_weight`` is passed.
|
|
722
|
+
|
|
723
|
+
random_state : int, RandomState instance, or None, optional, default: None
|
|
724
|
+
Controls the random seed given to each Tree estimator at each
|
|
725
|
+
boosting iteration.
|
|
726
|
+
In addition, it controls the random permutation of the features at
|
|
727
|
+
each split.
|
|
728
|
+
It also controls the random splitting of the training data to obtain a
|
|
729
|
+
validation set if `n_iter_no_change` is not None.
|
|
730
|
+
Pass an int for reproducible output across multiple function calls.
|
|
731
|
+
|
|
732
|
+
max_features : int, float, {'sqrt', 'log2'} or None, optional, default: None
|
|
733
|
+
The number of features to consider when looking for the best split:
|
|
734
|
+
|
|
735
|
+
- If int, values must be in the range `[1, inf)`.
|
|
736
|
+
- If float, values must be in the range `(0.0, 1.0]` and the features
|
|
737
|
+
considered at each split will be `max(1, int(max_features * n_features_in_))`.
|
|
738
|
+
- If 'sqrt', then `max_features=sqrt(n_features)`.
|
|
739
|
+
- If 'log2', then `max_features=log2(n_features)`.
|
|
740
|
+
- If None, then `max_features=n_features`.
|
|
741
|
+
|
|
742
|
+
Choosing `max_features < n_features` leads to a reduction of variance
|
|
743
|
+
and an increase in bias.
|
|
744
|
+
|
|
745
|
+
Note: the search for a split does not stop until at least one
|
|
746
|
+
valid partition of the node samples is found, even if it requires to
|
|
747
|
+
effectively inspect more than ``max_features`` features.
|
|
748
|
+
|
|
749
|
+
max_leaf_nodes : int or None, optional, default: None
|
|
750
|
+
Grow trees with ``max_leaf_nodes`` in best-first fashion.
|
|
751
|
+
Best nodes are defined as relative reduction in impurity.
|
|
752
|
+
Values must be in the range `[2, inf)`.
|
|
753
|
+
If `None`, then unlimited number of leaf nodes.
|
|
754
|
+
|
|
755
|
+
warm_start : bool, optional, default: False
|
|
756
|
+
When set to ``True``, reuse the solution of the previous call to fit
|
|
757
|
+
and add more estimators to the ensemble, otherwise, just erase the
|
|
758
|
+
previous solution.
|
|
759
|
+
|
|
760
|
+
validation_fraction : float, optional, default: 0.1
|
|
761
|
+
The proportion of training data to set aside as validation set for
|
|
762
|
+
early stopping. Values must be in the range `(0.0, 1.0)`.
|
|
763
|
+
Only used if ``n_iter_no_change`` is set to an integer.
|
|
764
|
+
|
|
765
|
+
n_iter_no_change : int, optional, default: None
|
|
766
|
+
``n_iter_no_change`` is used to decide if early stopping will be used
|
|
767
|
+
to terminate training when validation score is not improving. By
|
|
768
|
+
default it is set to None to disable early stopping. If set to a
|
|
769
|
+
number, it will set aside ``validation_fraction`` size of the training
|
|
770
|
+
data as validation and terminate training when validation score is not
|
|
771
|
+
improving in all of the previous ``n_iter_no_change`` numbers of
|
|
772
|
+
iterations. The split is stratified.
|
|
773
|
+
Values must be in the range `[1, inf)`.
|
|
774
|
+
|
|
775
|
+
tol : float, optional, default: 1e-4
|
|
776
|
+
Tolerance for the early stopping. When the loss is not improving
|
|
777
|
+
by at least tol for ``n_iter_no_change`` iterations (if set to a
|
|
778
|
+
number), the training stops.
|
|
779
|
+
Values must be in the range `[0.0, inf)`.
|
|
780
|
+
|
|
781
|
+
dropout_rate : float, optional, default: 0.0
|
|
782
|
+
If larger than zero, the residuals at each iteration are only computed
|
|
783
|
+
from a random subset of base learners. The value corresponds to the
|
|
784
|
+
percentage of base learners that are dropped. In each iteration,
|
|
785
|
+
at least one base learner is dropped. This is an alternative regularization
|
|
786
|
+
to shrinkage, i.e., setting `learning_rate < 1.0`.
|
|
787
|
+
Values must be in the range `[0.0, 1.0)`.
|
|
788
|
+
|
|
789
|
+
verbose : int, optional, default: 0
|
|
790
|
+
Enable verbose output. If 1 then it prints progress and performance
|
|
791
|
+
once in a while (the more trees the lower the frequency). If greater
|
|
792
|
+
than 1 then it prints progress and performance for every tree.
|
|
793
|
+
Values must be in the range `[0, inf)`.
|
|
794
|
+
|
|
795
|
+
ccp_alpha : float, optional, default: 0.0
|
|
796
|
+
Complexity parameter used for Minimal Cost-Complexity Pruning. The
|
|
797
|
+
subtree with the largest cost complexity that is smaller than
|
|
798
|
+
``ccp_alpha`` will be chosen. By default, no pruning is performed.
|
|
799
|
+
Values must be in the range `[0.0, inf)`.
|
|
800
|
+
|
|
801
|
+
Attributes
|
|
802
|
+
----------
|
|
803
|
+
n_estimators_ : int
|
|
804
|
+
The number of estimators as selected by early stopping (if
|
|
805
|
+
``n_iter_no_change`` is specified). Otherwise it is set to
|
|
806
|
+
``n_estimators``.
|
|
807
|
+
|
|
808
|
+
feature_importances_ : ndarray, shape = (n_features,)
|
|
809
|
+
The feature importances (the higher, the more important the feature).
|
|
810
|
+
|
|
811
|
+
estimators_ : ndarray of DecisionTreeRegressor, shape = (n_estimators, 1)
|
|
812
|
+
The collection of fitted sub-estimators.
|
|
813
|
+
|
|
814
|
+
train_score_ : ndarray, shape = (n_estimators,)
|
|
815
|
+
The i-th score ``train_score_[i]`` is the loss of the
|
|
816
|
+
model at iteration ``i`` on the in-bag sample.
|
|
817
|
+
If ``subsample == 1`` this is the loss on the training data.
|
|
818
|
+
|
|
819
|
+
oob_improvement_ : ndarray, shape = (n_estimators,)
|
|
820
|
+
The improvement in loss on the out-of-bag samples
|
|
821
|
+
relative to the previous iteration.
|
|
822
|
+
``oob_improvement_[0]`` is the improvement in
|
|
823
|
+
loss of the first stage over the ``init`` estimator.
|
|
824
|
+
Only available if ``subsample < 1.0``.
|
|
825
|
+
|
|
826
|
+
oob_scores_ : ndarray, shape = (n_estimators,)
|
|
827
|
+
The full history of the loss values on the out-of-bag
|
|
828
|
+
samples. Only available if ``subsample < 1.0``.
|
|
829
|
+
|
|
830
|
+
oob_score_ : float
|
|
831
|
+
The last value of the loss on the out-of-bag samples. It is
|
|
832
|
+
the same as ``oob_scores_[-1]``. Only available if ``subsample < 1.0``.
|
|
833
|
+
|
|
834
|
+
n_features_in_ : int
|
|
835
|
+
Number of features seen during ``fit``.
|
|
836
|
+
|
|
837
|
+
feature_names_in_ : ndarray, shape = (`n_features_in_`,)
|
|
838
|
+
Names of features seen during ``fit``. Defined only when `X`
|
|
839
|
+
has feature names that are all strings.
|
|
840
|
+
|
|
841
|
+
max_features_ : int
|
|
842
|
+
The inferred value of max_features.
|
|
843
|
+
|
|
844
|
+
unique_times_ : ndarray, shape = (n_unique_times,)
|
|
845
|
+
Unique time points.
|
|
846
|
+
|
|
847
|
+
See also
|
|
848
|
+
--------
|
|
849
|
+
sksurv.ensemble.ComponentwiseGradientBoostingSurvivalAnalysis
|
|
850
|
+
Gradient boosting with component-wise least squares as base learner.
|
|
851
|
+
|
|
852
|
+
References
|
|
853
|
+
----------
|
|
854
|
+
.. [1] J. H. Friedman, "Greedy function approximation: A gradient boosting machine,"
|
|
855
|
+
The Annals of Statistics, 29(5), 1189–1232, 2001.
|
|
856
|
+
.. [2] J. H. Friedman, "Stochastic gradient boosting,"
|
|
857
|
+
Computational Statistics & Data Analysis, 38(4), 367–378, 2002.
|
|
858
|
+
.. [3] G. Ridgeway, "The state of boosting,"
|
|
859
|
+
Computing Science and Statistics, 172–181, 1999.
|
|
860
|
+
.. [4] Hothorn, T., Bühlmann, P., Dudoit, S., Molinaro, A., van der Laan, M. J.,
|
|
861
|
+
"Survival ensembles", Biostatistics, 7(3), 355-73, 2006.
|
|
862
|
+
.. [5] K. V. Rashmi and R. Gilad-Bachrach,
|
|
863
|
+
"DART: Dropouts meet multiple additive regression trees,"
|
|
864
|
+
in 18th International Conference on Artificial Intelligence and Statistics,
|
|
865
|
+
2015, 489–497.
|
|
866
|
+
"""
|
|
867
|
+
|
|
868
|
+
_parameter_constraints = {
|
|
869
|
+
**BaseGradientBoosting._parameter_constraints,
|
|
870
|
+
"loss": [StrOptions(frozenset(LOSS_FUNCTIONS.keys()))],
|
|
871
|
+
"dropout_rate": [Interval(numbers.Real, 0.0, 1.0, closed="left")],
|
|
872
|
+
}
|
|
873
|
+
|
|
874
|
+
def __init__(
|
|
875
|
+
self,
|
|
876
|
+
*,
|
|
877
|
+
loss="coxph",
|
|
878
|
+
learning_rate=0.1,
|
|
879
|
+
n_estimators=100,
|
|
880
|
+
subsample=1.0,
|
|
881
|
+
criterion="friedman_mse",
|
|
882
|
+
min_samples_split=2,
|
|
883
|
+
min_samples_leaf=1,
|
|
884
|
+
min_weight_fraction_leaf=0.0,
|
|
885
|
+
max_depth=3,
|
|
886
|
+
min_impurity_decrease=0.0,
|
|
887
|
+
random_state=None,
|
|
888
|
+
max_features=None,
|
|
889
|
+
max_leaf_nodes=None,
|
|
890
|
+
warm_start=False,
|
|
891
|
+
validation_fraction=0.1,
|
|
892
|
+
n_iter_no_change=None,
|
|
893
|
+
tol=1e-4,
|
|
894
|
+
dropout_rate=0.0,
|
|
895
|
+
verbose=0,
|
|
896
|
+
ccp_alpha=0.0,
|
|
897
|
+
):
|
|
898
|
+
super().__init__(
|
|
899
|
+
loss=loss,
|
|
900
|
+
learning_rate=learning_rate,
|
|
901
|
+
n_estimators=n_estimators,
|
|
902
|
+
criterion=criterion,
|
|
903
|
+
min_samples_split=min_samples_split,
|
|
904
|
+
min_samples_leaf=min_samples_leaf,
|
|
905
|
+
min_weight_fraction_leaf=min_weight_fraction_leaf,
|
|
906
|
+
max_depth=max_depth,
|
|
907
|
+
init="zero",
|
|
908
|
+
subsample=subsample,
|
|
909
|
+
max_features=max_features,
|
|
910
|
+
random_state=random_state,
|
|
911
|
+
verbose=verbose,
|
|
912
|
+
max_leaf_nodes=max_leaf_nodes,
|
|
913
|
+
warm_start=warm_start,
|
|
914
|
+
min_impurity_decrease=min_impurity_decrease,
|
|
915
|
+
validation_fraction=validation_fraction,
|
|
916
|
+
n_iter_no_change=n_iter_no_change,
|
|
917
|
+
tol=tol,
|
|
918
|
+
ccp_alpha=ccp_alpha,
|
|
919
|
+
)
|
|
920
|
+
self.dropout_rate = dropout_rate
|
|
921
|
+
|
|
922
|
+
def _encode_y(self, y, sample_weight):
|
|
923
|
+
self.n_trees_per_iteration_ = 1
|
|
924
|
+
return y
|
|
925
|
+
|
|
926
|
+
def _get_loss(self, sample_weight):
|
|
927
|
+
return LOSS_FUNCTIONS[self.loss]()
|
|
928
|
+
|
|
929
|
+
@property
|
|
930
|
+
def _predict_risk_score(self):
|
|
931
|
+
return isinstance(self._loss, CoxPH)
|
|
932
|
+
|
|
933
|
+
def _set_max_features(self):
|
|
934
|
+
"""Set self.max_features_."""
|
|
935
|
+
if isinstance(self.max_features, str):
|
|
936
|
+
if self.max_features == "sqrt":
|
|
937
|
+
max_features = max(1, int(np.sqrt(self.n_features_in_)))
|
|
938
|
+
elif self.max_features == "log2":
|
|
939
|
+
max_features = max(1, int(np.log2(self.n_features_in_)))
|
|
940
|
+
elif self.max_features is None:
|
|
941
|
+
max_features = self.n_features_in_
|
|
942
|
+
elif isinstance(self.max_features, numbers.Integral):
|
|
943
|
+
max_features = self.max_features
|
|
944
|
+
else: # float
|
|
945
|
+
max_features = max(1, int(self.max_features * self.n_features_in_))
|
|
946
|
+
|
|
947
|
+
self.max_features_ = max_features
|
|
948
|
+
|
|
949
|
+
def _update_with_dropout(self, i, X, raw_predictions, k, scale, random_state):
|
|
950
|
+
# select base learners to be dropped for next iteration
|
|
951
|
+
drop_model, n_dropped = _sample_binomial_plus_one(self.dropout_rate, i + 1, random_state)
|
|
952
|
+
|
|
953
|
+
# adjust scaling factor of tree that is going to be trained in next iteration
|
|
954
|
+
scale[i + 1] = 1.0 / (n_dropped + 1.0)
|
|
955
|
+
|
|
956
|
+
raw_predictions[:, k] = 0
|
|
957
|
+
for m in range(i + 1):
|
|
958
|
+
if drop_model[m] == 1:
|
|
959
|
+
# adjust scaling factor of dropped trees
|
|
960
|
+
scale[m] *= n_dropped / (n_dropped + 1.0)
|
|
961
|
+
else:
|
|
962
|
+
# pseudoresponse of next iteration (without contribution of dropped trees)
|
|
963
|
+
raw_predictions[:, k] += self.learning_rate * scale[m] * self.estimators_[m, k].predict(X).ravel()
|
|
964
|
+
|
|
965
|
+
def _fit_stage(
|
|
966
|
+
self,
|
|
967
|
+
i,
|
|
968
|
+
X,
|
|
969
|
+
y,
|
|
970
|
+
raw_predictions,
|
|
971
|
+
sample_weight,
|
|
972
|
+
sample_mask,
|
|
973
|
+
random_state,
|
|
974
|
+
scale,
|
|
975
|
+
X_csc=None,
|
|
976
|
+
X_csr=None,
|
|
977
|
+
):
|
|
978
|
+
"""Fit another stage of ``n_classes_`` trees to the boosting model."""
|
|
979
|
+
|
|
980
|
+
assert sample_mask.dtype == bool
|
|
981
|
+
|
|
982
|
+
# whether to use dropout in next iteration
|
|
983
|
+
do_dropout = self.dropout_rate > 0.0 and i < len(scale) - 1
|
|
984
|
+
|
|
985
|
+
# Need to pass a copy of raw_predictions to negative_gradient()
|
|
986
|
+
# because raw_predictions is partially updated at the end of the loop
|
|
987
|
+
# in update_terminal_regions(), and gradients need to be evaluated at
|
|
988
|
+
# iteration i - 1.
|
|
989
|
+
raw_predictions_copy = raw_predictions.copy()
|
|
990
|
+
|
|
991
|
+
neg_gradient = self._loss.gradient(
|
|
992
|
+
y_true=y,
|
|
993
|
+
raw_prediction=raw_predictions_copy,
|
|
994
|
+
sample_weight=None, # We pass sample_weights to the tree directly.
|
|
995
|
+
)
|
|
996
|
+
|
|
997
|
+
for k in range(self.n_trees_per_iteration_):
|
|
998
|
+
# induce regression tree on the negative gradient
|
|
999
|
+
tree = DecisionTreeRegressor(
|
|
1000
|
+
criterion=self.criterion,
|
|
1001
|
+
splitter="best",
|
|
1002
|
+
max_depth=self.max_depth,
|
|
1003
|
+
min_samples_split=self.min_samples_split,
|
|
1004
|
+
min_samples_leaf=self.min_samples_leaf,
|
|
1005
|
+
min_weight_fraction_leaf=self.min_weight_fraction_leaf,
|
|
1006
|
+
min_impurity_decrease=self.min_impurity_decrease,
|
|
1007
|
+
max_features=self.max_features,
|
|
1008
|
+
max_leaf_nodes=self.max_leaf_nodes,
|
|
1009
|
+
random_state=random_state,
|
|
1010
|
+
ccp_alpha=self.ccp_alpha,
|
|
1011
|
+
)
|
|
1012
|
+
|
|
1013
|
+
if self.subsample < 1.0:
|
|
1014
|
+
# no inplace multiplication!
|
|
1015
|
+
sample_weight = sample_weight * sample_mask.astype(np.float64)
|
|
1016
|
+
|
|
1017
|
+
X = X_csc if X_csc is not None else X
|
|
1018
|
+
tree.fit(X, neg_gradient, sample_weight=sample_weight, check_input=False)
|
|
1019
|
+
|
|
1020
|
+
# add tree to ensemble
|
|
1021
|
+
self.estimators_[i, k] = tree
|
|
1022
|
+
|
|
1023
|
+
# update tree leaves
|
|
1024
|
+
if do_dropout:
|
|
1025
|
+
self._update_with_dropout(i, X, raw_predictions, k, scale, random_state)
|
|
1026
|
+
else:
|
|
1027
|
+
# update tree leaves
|
|
1028
|
+
X_for_tree_update = X_csr if X_csr is not None else X
|
|
1029
|
+
self._loss.update_terminal_regions(
|
|
1030
|
+
tree.tree_,
|
|
1031
|
+
X_for_tree_update,
|
|
1032
|
+
y,
|
|
1033
|
+
neg_gradient,
|
|
1034
|
+
raw_predictions,
|
|
1035
|
+
sample_weight,
|
|
1036
|
+
sample_mask,
|
|
1037
|
+
learning_rate=self.learning_rate,
|
|
1038
|
+
k=k,
|
|
1039
|
+
)
|
|
1040
|
+
|
|
1041
|
+
return raw_predictions
|
|
1042
|
+
|
|
1043
|
+
def _fit_stages( # noqa: C901
|
|
1044
|
+
self,
|
|
1045
|
+
X,
|
|
1046
|
+
y,
|
|
1047
|
+
raw_predictions,
|
|
1048
|
+
sample_weight,
|
|
1049
|
+
random_state,
|
|
1050
|
+
X_val,
|
|
1051
|
+
y_val,
|
|
1052
|
+
sample_weight_val,
|
|
1053
|
+
scale,
|
|
1054
|
+
begin_at_stage=0,
|
|
1055
|
+
monitor=None,
|
|
1056
|
+
):
|
|
1057
|
+
"""Iteratively fits the stages.
|
|
1058
|
+
|
|
1059
|
+
For each stage it computes the progress (OOB, train score)
|
|
1060
|
+
and delegates to ``_fit_stage``.
|
|
1061
|
+
Returns the number of stages fit; might differ from ``n_estimators``
|
|
1062
|
+
due to early stopping.
|
|
1063
|
+
"""
|
|
1064
|
+
n_samples = X.shape[0]
|
|
1065
|
+
do_oob = self.subsample < 1.0
|
|
1066
|
+
sample_mask = np.ones((n_samples,), dtype=bool)
|
|
1067
|
+
n_inbag = max(1, int(self.subsample * n_samples))
|
|
1068
|
+
|
|
1069
|
+
if self.verbose:
|
|
1070
|
+
verbose_reporter = VerboseReporter(verbose=self.verbose)
|
|
1071
|
+
verbose_reporter.init(self, begin_at_stage)
|
|
1072
|
+
|
|
1073
|
+
X_csc = csc_matrix(X) if issparse(X) else None
|
|
1074
|
+
X_csr = csr_matrix(X) if issparse(X) else None
|
|
1075
|
+
|
|
1076
|
+
if self.n_iter_no_change is not None:
|
|
1077
|
+
loss_history = np.full(self.n_iter_no_change, np.inf)
|
|
1078
|
+
# We create a generator to get the predictions for X_val after
|
|
1079
|
+
# the addition of each successive stage
|
|
1080
|
+
y_val_pred_iter = self._staged_raw_predict(X_val, check_input=False)
|
|
1081
|
+
|
|
1082
|
+
# perform boosting iterations
|
|
1083
|
+
i = begin_at_stage
|
|
1084
|
+
for i in range(begin_at_stage, self.n_estimators):
|
|
1085
|
+
# subsampling
|
|
1086
|
+
if do_oob:
|
|
1087
|
+
sample_mask = _random_sample_mask(n_samples, n_inbag, random_state)
|
|
1088
|
+
# OOB score before adding this stage
|
|
1089
|
+
y_oob_masked = y[~sample_mask]
|
|
1090
|
+
sample_weight_oob_masked = sample_weight[~sample_mask]
|
|
1091
|
+
if i == 0: # store the initial loss to compute the OOB score
|
|
1092
|
+
initial_loss = self._loss(
|
|
1093
|
+
y_true=y_oob_masked,
|
|
1094
|
+
raw_prediction=raw_predictions[~sample_mask],
|
|
1095
|
+
sample_weight=sample_weight_oob_masked,
|
|
1096
|
+
)
|
|
1097
|
+
|
|
1098
|
+
# fit next stage of trees
|
|
1099
|
+
raw_predictions = self._fit_stage(
|
|
1100
|
+
i,
|
|
1101
|
+
X,
|
|
1102
|
+
y,
|
|
1103
|
+
raw_predictions,
|
|
1104
|
+
sample_weight,
|
|
1105
|
+
sample_mask,
|
|
1106
|
+
random_state,
|
|
1107
|
+
scale,
|
|
1108
|
+
X_csc=X_csc,
|
|
1109
|
+
X_csr=X_csr,
|
|
1110
|
+
)
|
|
1111
|
+
|
|
1112
|
+
# track loss
|
|
1113
|
+
if do_oob:
|
|
1114
|
+
self.train_score_[i] = self._loss(
|
|
1115
|
+
y_true=y[sample_mask],
|
|
1116
|
+
raw_prediction=raw_predictions[sample_mask],
|
|
1117
|
+
sample_weight=sample_weight[sample_mask],
|
|
1118
|
+
)
|
|
1119
|
+
self.oob_scores_[i] = self._loss(
|
|
1120
|
+
y_true=y_oob_masked,
|
|
1121
|
+
raw_prediction=raw_predictions[~sample_mask],
|
|
1122
|
+
sample_weight=sample_weight_oob_masked,
|
|
1123
|
+
)
|
|
1124
|
+
previous_loss = initial_loss if i == 0 else self.oob_scores_[i - 1]
|
|
1125
|
+
self.oob_improvement_[i] = previous_loss - self.oob_scores_[i]
|
|
1126
|
+
self.oob_score_ = self.oob_scores_[-1]
|
|
1127
|
+
else:
|
|
1128
|
+
# no need to fancy index w/ no subsampling
|
|
1129
|
+
self.train_score_[i] = self._loss(y_true=y, raw_prediction=raw_predictions, sample_weight=sample_weight)
|
|
1130
|
+
|
|
1131
|
+
if self.verbose > 0:
|
|
1132
|
+
verbose_reporter.update(i, self)
|
|
1133
|
+
|
|
1134
|
+
if monitor is not None:
|
|
1135
|
+
early_stopping = monitor(i, self, locals())
|
|
1136
|
+
if early_stopping:
|
|
1137
|
+
break
|
|
1138
|
+
|
|
1139
|
+
# We also provide an early stopping based on the score from
|
|
1140
|
+
# validation set (X_val, y_val), if n_iter_no_change is set
|
|
1141
|
+
if self.n_iter_no_change is not None:
|
|
1142
|
+
# By calling next(y_val_pred_iter), we get the predictions
|
|
1143
|
+
# for X_val after the addition of the current stage
|
|
1144
|
+
validation_loss = self._loss(y_val, next(y_val_pred_iter), sample_weight_val)
|
|
1145
|
+
|
|
1146
|
+
# Require validation_score to be better (less) than at least
|
|
1147
|
+
# one of the last n_iter_no_change evaluations
|
|
1148
|
+
if np.any(validation_loss + self.tol < loss_history):
|
|
1149
|
+
loss_history[i % len(loss_history)] = validation_loss
|
|
1150
|
+
else:
|
|
1151
|
+
break
|
|
1152
|
+
|
|
1153
|
+
return i + 1
|
|
1154
|
+
|
|
1155
|
+
def _init_state(self):
|
|
1156
|
+
super()._init_state()
|
|
1157
|
+
|
|
1158
|
+
if self.dropout_rate > 0.0:
|
|
1159
|
+
self._scale = np.ones(self.n_estimators, dtype=float)
|
|
1160
|
+
|
|
1161
|
+
def _resize_state(self):
|
|
1162
|
+
super()._resize_state()
|
|
1163
|
+
|
|
1164
|
+
if self.dropout_rate > 0:
|
|
1165
|
+
if not hasattr(self, "_scale"):
|
|
1166
|
+
raise ValueError(
|
|
1167
|
+
"fitting with warm_start=True and dropout_rate > 0 is only "
|
|
1168
|
+
"supported if the previous fit used dropout_rate > 0 too"
|
|
1169
|
+
)
|
|
1170
|
+
|
|
1171
|
+
self._scale = np.resize(self._scale, self.n_estimators)
|
|
1172
|
+
self._scale[self.n_estimators_ :] = 1
|
|
1173
|
+
|
|
1174
|
    def _shrink_state(self, n_stages):
        self.estimators_ = self.estimators_[:n_stages]
        self.train_score_ = self.train_score_[:n_stages]
        if hasattr(self, "oob_improvement_"):
            self.oob_improvement_ = self.oob_improvement_[:n_stages]
            self.oob_scores_ = self.oob_scores_[:n_stages]
            self.oob_score_ = self.oob_scores_[-1]
        if hasattr(self, "_scale"):
            self._scale = self._scale[:n_stages]

    def fit(self, X, y, sample_weight=None, monitor=None):
        """Fit the gradient boosting model.

        Parameters
        ----------
        X : array-like, shape = (n_samples, n_features)
            Data matrix.

        y : structured array, shape = (n_samples,)
            A structured array with two fields. The first field is a boolean
            where ``True`` indicates an event and ``False`` indicates right-censoring.
            The second field is a float with the time of event or time of censoring.

        sample_weight : array-like, shape = (n_samples,), optional
            Weights given to each sample. If omitted, all samples have weight 1.

        monitor : callable, optional
            The monitor is called after each iteration with the current
            iteration, a reference to the estimator and the local variables of
            ``_fit_stages`` as keyword arguments ``callable(i, self,
            locals())``. If the callable returns ``True`` the fitting procedure
            is stopped. The monitor can be used for various things such as
            computing held-out estimates, early stopping, model introspection, and
            snapshotting.

        Returns
        -------
        self : object
            Returns self.
        """
        self._validate_params()

        if not self.warm_start:
            self._clear_state()

        X = validate_data(
            self,
            X,
            ensure_min_samples=2,
            order="C",
            accept_sparse=["csr", "csc", "coo"],
            dtype=DTYPE,
        )
        event, time = check_array_survival(X, y)

        sample_weight_is_none = sample_weight is None
        sample_weight = _check_sample_weight(sample_weight, X)

        if sample_weight_is_none:
            y = self._encode_y(y=y, sample_weight=None)
        else:
            y = self._encode_y(y=y, sample_weight=sample_weight)

        self._set_max_features()

        # self.loss is guaranteed to be a string
        self._loss = self._get_loss(sample_weight=sample_weight)

        if isinstance(self._loss, CensoredSquaredLoss | IPCWLeastSquaresError):
            time = np.log(time)

        if self.n_iter_no_change is not None:
            (
                X_train,
                X_val,
                event_train,
                event_val,
                time_train,
                time_val,
                sample_weight_train,
                sample_weight_val,
            ) = train_test_split(
                X,
                event,
                time,
                sample_weight,
                random_state=self.random_state,
                test_size=self.validation_fraction,
                stratify=event,
            )
            y_val = np.fromiter(zip(event_val, time_val), dtype=[("event", bool), ("time", np.float64)])
        else:
            X_train, sample_weight_train = X, sample_weight
            event_train, time_train = event, time
            X_val = y_val = sample_weight_val = None

        y_train = np.fromiter(zip(event_train, time_train), dtype=[("event", bool), ("time", np.float64)])
        n_samples = X_train.shape[0]

        # First time calling fit.
        if not self._is_fitted():
            # init state
            self._init_state()

            raw_predictions = np.zeros(
                shape=(n_samples, self.n_trees_per_iteration_),
                dtype=np.float64,
            )

            begin_at_stage = 0

            # The rng state must be preserved if warm_start is True
            self._rng = check_random_state(self.random_state)

        # warm start: this is not the first time fit was called
        else:
            # add more estimators to fitted model
            # invariant: warm_start = True
            if self.n_estimators < self.estimators_.shape[0]:
                raise ValueError(
                    "n_estimators=%d must be larger or equal to "
                    "estimators_.shape[0]=%d when "
                    "warm_start==True" % (self.n_estimators, self.estimators_.shape[0])
                )
            begin_at_stage = self.estimators_.shape[0]
            # The requirements of _raw_predict
            # are more constrained than fit. It accepts only CSR
            # matrices. Finite values have already been checked in validate_data.
            X_train = check_array(
                X_train,
                dtype=DTYPE,
                order="C",
                accept_sparse="csr",
                ensure_all_finite=False,
            )
            raw_predictions = self._raw_predict(X_train)
            self._resize_state()

            # apply dropout to last stage of previous fit
            if hasattr(self, "_scale") and self.dropout_rate > 0:
                for k in range(self.n_trees_per_iteration_):
                    self._update_with_dropout(
                        # pylint: disable-next=access-member-before-definition
                        self.n_estimators_ - 1,
                        X_train,
                        raw_predictions,
                        k,
                        self._scale,
                        self._rng,
                    )

        scale = getattr(self, "_scale", None)

        # fit the boosting stages
        n_stages = self._fit_stages(
            X_train,
            y_train,
            raw_predictions,
            sample_weight_train,
            self._rng,
            X_val,
            y_val,
            sample_weight_val,
            scale,
            begin_at_stage,
            monitor,
        )
        # change shape of arrays after fit (early-stopping or additional tests)
        if n_stages != self.estimators_.shape[0]:
            self._shrink_state(n_stages)
        self.n_estimators_ = n_stages

        self._set_baseline_model(X_train, event_train, time_train)

        return self

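`fit` expects the outcome `y` as a structured array pairing an event indicator with an observed time, exactly the format the method itself assembles with `np.fromiter`. A minimal sketch of building such an array by hand (field names and values are illustrative):

```python
import numpy as np

event = np.array([True, False, True, False])  # True = event observed, False = right-censored
time = np.array([12.3, 31.0, 7.5, 24.1])      # time of event or time of censoring
y = np.fromiter(zip(event, time), dtype=[("event", bool), ("time", np.float64)])
```

`sksurv.util.Surv.from_arrays` offers a convenience constructor for the same format.
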
    def _set_baseline_model(self, X, event, time):
        if isinstance(self._loss, CoxPH):
            X_pred = X
            if issparse(X):
                X_pred = X.asformat("csr")
            risk_scores = self._predict(X_pred)
            self._baseline_model = BreslowEstimator().fit(risk_scores, event, time)
        else:
            self._baseline_model = None

    def _dropout_predict_stage(self, X, i, K, score):
        for k in range(K):
            tree = self.estimators_[i, k].tree_
            score += self.learning_rate * self._scale[i] * tree.predict(X).reshape((-1, 1))
        return score

    def _dropout_raw_predict(self, X):
        raw_predictions = self._raw_predict_init(X)

        n_estimators, K = self.estimators_.shape
        for i in range(n_estimators):
            self._dropout_predict_stage(X, i, K, raw_predictions)

        return raw_predictions

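With dropout, each stage's contribution to the raw prediction is scaled by its entry in `_scale` in addition to the learning rate, and the per-stage contributions are summed. A small numeric sketch of that accumulation for a single sample (all numbers are made up):

```python
import numpy as np

learning_rate = 0.1
scale = np.array([1.0, 0.5, 0.5])          # per-stage dropout scaling (_scale)
stage_preds = np.array([0.8, -0.2, 0.4])   # tree_i(x) for one sample, one tree per stage
# raw prediction = sum over stages of learning_rate * scale[i] * tree_i(x)
raw_prediction = np.sum(learning_rate * scale * stage_preds)
```
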
    def _dropout_staged_raw_predict(self, X):
        X = validate_data(self, X, dtype=DTYPE, order="C", accept_sparse="csr")
        raw_predictions = self._raw_predict_init(X)

        n_estimators, K = self.estimators_.shape
        for i in range(n_estimators):
            self._dropout_predict_stage(X, i, K, raw_predictions)
            yield raw_predictions.copy()

    def _raw_predict(self, X):
        # if dropout wasn't used during training, proceed as usual,
        # otherwise consider scaling factor of individual trees
        if not hasattr(self, "_scale"):
            return super()._raw_predict(X)
        return self._dropout_raw_predict(X)

    def _init_decision_function(self, X):  # pragma: no cover
        return super()._init_decision_function(X).reshape(-1, 1)

    def _decision_function(self, X):  # pragma: no cover
        return self._raw_predict(X)

    def _predict(self, X):
        score = self._raw_predict(X)
        if score.shape[1] == 1:
            score = score.ravel()

        return self._loss._scale_raw_prediction(score)

    def predict(self, X):
        """Predict risk scores.

        If `loss='coxph'`, predictions can be interpreted as log hazard ratio
        similar to the linear predictor of a Cox proportional hazards
        model. If `loss='squared'` or `loss='ipcwls'`, predictions are the
        time to event.

        Parameters
        ----------
        X : array-like, shape = (n_samples, n_features)
            The input samples.

        Returns
        -------
        y : ndarray, shape = (n_samples,)
            The risk scores.
        """
        check_is_fitted(self, "estimators_")

        X = validate_data(self, X, reset=False, order="C", accept_sparse="csr", dtype=DTYPE)
        return self._predict(X)

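Putting the pieces together, a fitted model turns feature vectors into risk scores via `predict`. A self-contained sketch on synthetic data (the data-generating process and all parameter values are illustrative):

```python
import numpy as np
from sksurv.ensemble import GradientBoostingSurvivalAnalysis

rng = np.random.default_rng(0)
X = rng.standard_normal((200, 5))
time = rng.exponential(scale=np.exp(-X[:, 0]))  # shorter times for larger X[:, 0]
event = rng.random(200) < 0.7                   # roughly 70% of samples experience the event
y = np.fromiter(zip(event, time), dtype=[("event", bool), ("time", np.float64)])

est = GradientBoostingSurvivalAnalysis(n_estimators=100, learning_rate=0.1, random_state=0)
est.fit(X, y)
risk_scores = est.predict(X)  # with the default loss="coxph", higher scores indicate higher risk
```
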
    def staged_predict(self, X):
        """Predict risk scores at each stage for X.

        This method allows monitoring (i.e. determining the error on a test set)
        after each stage.

        If `loss='coxph'`, predictions can be interpreted as log hazard ratio
        similar to the linear predictor of a Cox proportional hazards
        model. If `loss='squared'` or `loss='ipcwls'`, predictions are the
        time to event.

        Parameters
        ----------
        X : array-like, shape = (n_samples, n_features)
            The input samples.

        Returns
        -------
        y : generator of array of shape = (n_samples,)
            The predicted value of the input samples.
        """
        check_is_fitted(self, "estimators_")

        # if dropout wasn't used during training, proceed as usual,
        # otherwise consider scaling factor of individual trees
        if not hasattr(self, "_scale"):
            predictions_iter = self._staged_raw_predict(X)
        else:
            predictions_iter = self._dropout_staged_raw_predict(X)

        for raw_predictions in predictions_iter:
            y = self._loss._scale_raw_prediction(raw_predictions)
            yield y.ravel()

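`staged_predict` makes it easy to track a test-set metric after every boosting stage, for example the concordance index. A sketch assuming `est`, `X_test`, and `y_test` come from an earlier fit and train/test split, with the structured-array field names used above (the variable names are illustrative):

```python
from sksurv.metrics import concordance_index_censored

cindex_per_stage = []
for stage_scores in est.staged_predict(X_test):
    result = concordance_index_censored(y_test["event"], y_test["time"], stage_scores)
    cindex_per_stage.append(result[0])  # first entry of the returned tuple is the c-index
```
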
    def _get_baseline_model(self):
        if self._baseline_model is None:
            raise ValueError("`fit` must be called with the loss option set to 'coxph'.")
        return self._baseline_model

    @append_cumulative_hazard_example(estimator_mod="ensemble", estimator_class="GradientBoostingSurvivalAnalysis")
    def predict_cumulative_hazard_function(self, X, return_array=False):
        r"""Predict cumulative hazard function.

        Only available if :meth:`fit` has been called with `loss = "coxph"`.

        The cumulative hazard function for an individual
        with feature vector :math:`x` is defined as

        .. math::

            H(t \mid x) = \exp(f(x)) H_0(t) ,

        where :math:`f(\cdot)` is the additive ensemble of base learners,
        and :math:`H_0(t)` is the baseline hazard function,
        estimated by Breslow's estimator.

        Parameters
        ----------
        X : array-like, shape = (n_samples, n_features)
            Data matrix.

        return_array : bool, default: False
            Whether to return a single array of cumulative hazard values
            or a list of step functions.

            If `False`, a list of :class:`sksurv.functions.StepFunction`
            objects is returned.

            If `True`, a 2d-array of shape `(n_samples, n_unique_times)` is
            returned, where `n_unique_times` is the number of unique
            event times in the training data. Each row represents the cumulative
            hazard function of an individual evaluated at `unique_times_`.

        Returns
        -------
        cum_hazard : ndarray
            If `return_array` is `False`, an array of `n_samples`
            :class:`sksurv.functions.StepFunction` instances is returned.

            If `return_array` is `True`, a numeric array of shape
            `(n_samples, n_unique_times_)` is returned.

        Examples
        --------
        """
        return self._predict_cumulative_hazard_function(self._get_baseline_model(), self.predict(X), return_array)

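As the docstring above describes, each predicted cumulative hazard is Breslow's baseline hazard scaled by `exp(f(x))`. A sketch of evaluating the predicted step functions on the training event-time grid (assumes a fitted `est` with the default `loss="coxph"` and a feature matrix `X_new`):

```python
import numpy as np

chf_funcs = est.predict_cumulative_hazard_function(X_new)
times = est.unique_times_
chf_values = np.vstack([fn(times) for fn in chf_funcs])  # shape (n_samples, n_unique_times)

# equivalently, request the evaluated array directly
chf_values_alt = est.predict_cumulative_hazard_function(X_new, return_array=True)
```
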
    @append_survival_function_example(estimator_mod="ensemble", estimator_class="GradientBoostingSurvivalAnalysis")
    def predict_survival_function(self, X, return_array=False):
        r"""Predict survival function.

        Only available if :meth:`fit` has been called with `loss = "coxph"`.

        The survival function for an individual
        with feature vector :math:`x` is defined as

        .. math::

            S(t \mid x) = S_0(t)^{\exp(f(x))} ,

        where :math:`f(\cdot)` is the additive ensemble of base learners,
        and :math:`S_0(t)` is the baseline survival function,
        estimated by Breslow's estimator.

        Parameters
        ----------
        X : array-like, shape = (n_samples, n_features)
            Data matrix.

        return_array : bool, default: False
            Whether to return a single array of survival probabilities
            or a list of step functions.

            If `False`, a list of :class:`sksurv.functions.StepFunction`
            objects is returned.

            If `True`, a 2d-array of shape `(n_samples, n_unique_times)` is
            returned, where `n_unique_times` is the number of unique
            event times in the training data. Each row represents the survival
            function of an individual evaluated at `unique_times_`.

        Returns
        -------
        survival : ndarray
            If `return_array` is `False`, an array of `n_samples`
            :class:`sksurv.functions.StepFunction` instances is returned.

            If `return_array` is `True`, a numeric array of shape
            `(n_samples, n_unique_times_)` is returned.

        Examples
        --------
        """
        return self._predict_survival_function(self._get_baseline_model(), self.predict(X), return_array)

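A common way to inspect the predicted survival functions is to plot each step function over its support. A short sketch (assumes `est` and `X_new` as above; matplotlib is only used for display):

```python
import matplotlib.pyplot as plt

surv_funcs = est.predict_survival_function(X_new)
for fn in surv_funcs:
    # fn is a StepFunction; evaluate it at its own grid points and draw a step plot
    plt.step(fn.x, fn(fn.x), where="post")
plt.ylim(0, 1)
plt.xlabel("time")
plt.ylabel("predicted survival probability")
plt.show()
```
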
    @property
    def unique_times_(self):
        return self._get_baseline_model().unique_times_