scikit-survival 0.25.0__cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scikit_survival-0.25.0.dist-info/METADATA +185 -0
- scikit_survival-0.25.0.dist-info/RECORD +58 -0
- scikit_survival-0.25.0.dist-info/WHEEL +6 -0
- scikit_survival-0.25.0.dist-info/licenses/COPYING +674 -0
- scikit_survival-0.25.0.dist-info/top_level.txt +1 -0
- sksurv/__init__.py +183 -0
- sksurv/base.py +115 -0
- sksurv/bintrees/__init__.py +15 -0
- sksurv/bintrees/_binarytrees.cpython-311-x86_64-linux-gnu.so +0 -0
- sksurv/column.py +205 -0
- sksurv/compare.py +123 -0
- sksurv/datasets/__init__.py +12 -0
- sksurv/datasets/base.py +614 -0
- sksurv/datasets/data/GBSG2.arff +700 -0
- sksurv/datasets/data/actg320.arff +1169 -0
- sksurv/datasets/data/bmt.arff +46 -0
- sksurv/datasets/data/breast_cancer_GSE7390-metastasis.arff +283 -0
- sksurv/datasets/data/cgvhd.arff +118 -0
- sksurv/datasets/data/flchain.arff +7887 -0
- sksurv/datasets/data/veteran.arff +148 -0
- sksurv/datasets/data/whas500.arff +520 -0
- sksurv/docstrings.py +99 -0
- sksurv/ensemble/__init__.py +2 -0
- sksurv/ensemble/_coxph_loss.cpython-311-x86_64-linux-gnu.so +0 -0
- sksurv/ensemble/boosting.py +1564 -0
- sksurv/ensemble/forest.py +902 -0
- sksurv/ensemble/survival_loss.py +151 -0
- sksurv/exceptions.py +18 -0
- sksurv/functions.py +114 -0
- sksurv/io/__init__.py +2 -0
- sksurv/io/arffread.py +89 -0
- sksurv/io/arffwrite.py +181 -0
- sksurv/kernels/__init__.py +1 -0
- sksurv/kernels/_clinical_kernel.cpython-311-x86_64-linux-gnu.so +0 -0
- sksurv/kernels/clinical.py +348 -0
- sksurv/linear_model/__init__.py +3 -0
- sksurv/linear_model/_coxnet.cpython-311-x86_64-linux-gnu.so +0 -0
- sksurv/linear_model/aft.py +208 -0
- sksurv/linear_model/coxnet.py +592 -0
- sksurv/linear_model/coxph.py +637 -0
- sksurv/meta/__init__.py +4 -0
- sksurv/meta/base.py +35 -0
- sksurv/meta/ensemble_selection.py +724 -0
- sksurv/meta/stacking.py +370 -0
- sksurv/metrics.py +1028 -0
- sksurv/nonparametric.py +911 -0
- sksurv/preprocessing.py +183 -0
- sksurv/svm/__init__.py +11 -0
- sksurv/svm/_minlip.cpython-311-x86_64-linux-gnu.so +0 -0
- sksurv/svm/_prsvm.cpython-311-x86_64-linux-gnu.so +0 -0
- sksurv/svm/minlip.py +690 -0
- sksurv/svm/naive_survival_svm.py +249 -0
- sksurv/svm/survival_svm.py +1236 -0
- sksurv/testing.py +108 -0
- sksurv/tree/__init__.py +1 -0
- sksurv/tree/_criterion.cpython-311-x86_64-linux-gnu.so +0 -0
- sksurv/tree/tree.py +790 -0
- sksurv/util.py +415 -0
sksurv/nonparametric.py
ADDED
|
@@ -0,0 +1,911 @@
|
|
|
1
|
+
# This program is free software: you can redistribute it and/or modify
|
|
2
|
+
# it under the terms of the GNU General Public License as published by
|
|
3
|
+
# the Free Software Foundation, either version 3 of the License, or
|
|
4
|
+
# (at your option) any later version.
|
|
5
|
+
#
|
|
6
|
+
# This program is distributed in the hope that it will be useful,
|
|
7
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
8
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
9
|
+
# GNU General Public License for more details.
|
|
10
|
+
#
|
|
11
|
+
# You should have received a copy of the GNU General Public License
|
|
12
|
+
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
13
|
+
import numbers
|
|
14
|
+
|
|
15
|
+
import numpy as np
|
|
16
|
+
from scipy import stats
|
|
17
|
+
from sklearn.base import BaseEstimator
|
|
18
|
+
from sklearn.utils._param_validation import Interval, StrOptions
|
|
19
|
+
from sklearn.utils.validation import check_array, check_consistent_length, check_is_fitted
|
|
20
|
+
|
|
21
|
+
from .util import check_y_survival
|
|
22
|
+
|
|
23
|
+
__all__ = [
|
|
24
|
+
"CensoringDistributionEstimator",
|
|
25
|
+
"kaplan_meier_estimator",
|
|
26
|
+
"nelson_aalen_estimator",
|
|
27
|
+
"ipc_weights",
|
|
28
|
+
"SurvivalFunctionEstimator",
|
|
29
|
+
"cumulative_incidence_competing_risks",
|
|
30
|
+
]
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def _compute_counts(event, time, order=None):
|
|
34
|
+
"""Count right-censored and uncensored samples at each unique time point.
|
|
35
|
+
|
|
36
|
+
Parameters
|
|
37
|
+
----------
|
|
38
|
+
event : ndarray
|
|
39
|
+
Boolean event indicator.
|
|
40
|
+
Integer in the case of multiple risks.
|
|
41
|
+
Zero means right-censored event.
|
|
42
|
+
Positive values for each of the possible risk events.
|
|
43
|
+
|
|
44
|
+
time : ndarray
|
|
45
|
+
Survival time or time of censoring.
|
|
46
|
+
|
|
47
|
+
order : ndarray or None
|
|
48
|
+
Indices to order time in ascending order.
|
|
49
|
+
If None, order will be computed.
|
|
50
|
+
|
|
51
|
+
Returns
|
|
52
|
+
-------
|
|
53
|
+
times : ndarray
|
|
54
|
+
Unique time points.
|
|
55
|
+
|
|
56
|
+
n_events : ndarray
|
|
57
|
+
Number of events at each time point.
|
|
58
|
+
2D array with shape `(n_unique_time_points, n_risks + 1)` in the case of competing risks.
|
|
59
|
+
|
|
60
|
+
n_at_risk : ndarray
|
|
61
|
+
Number of samples that have not been censored or have not had an event at each time point.
|
|
62
|
+
|
|
63
|
+
n_censored : ndarray
|
|
64
|
+
Number of censored samples at each time point.
|
|
65
|
+
"""
|
|
66
|
+
n_samples = event.shape[0]
|
|
67
|
+
n_risks = event.max() if (np.issubdtype(event.dtype, np.integer) and event.max() > 1) else 0
|
|
68
|
+
|
|
69
|
+
if order is None:
|
|
70
|
+
order = np.argsort(time, kind="mergesort")
|
|
71
|
+
|
|
72
|
+
uniq_times = np.empty(n_samples, dtype=time.dtype)
|
|
73
|
+
uniq_events = np.empty((n_samples, n_risks + 1), dtype=int)
|
|
74
|
+
uniq_counts = np.empty(n_samples, dtype=int)
|
|
75
|
+
|
|
76
|
+
i = 0
|
|
77
|
+
prev_val = time[order[0]]
|
|
78
|
+
j = 0
|
|
79
|
+
while True:
|
|
80
|
+
count_event = np.zeros(n_risks + 1, dtype=int)
|
|
81
|
+
count = 0
|
|
82
|
+
while i < n_samples and prev_val == time[order[i]]:
|
|
83
|
+
event_type = event[order[i]]
|
|
84
|
+
if event_type:
|
|
85
|
+
count_event[0] += 1
|
|
86
|
+
if n_risks:
|
|
87
|
+
count_event[event_type] += 1
|
|
88
|
+
|
|
89
|
+
count += 1
|
|
90
|
+
i += 1
|
|
91
|
+
|
|
92
|
+
uniq_times[j] = prev_val
|
|
93
|
+
uniq_events[j] = count_event
|
|
94
|
+
uniq_counts[j] = count
|
|
95
|
+
j += 1
|
|
96
|
+
|
|
97
|
+
if i == n_samples:
|
|
98
|
+
break
|
|
99
|
+
|
|
100
|
+
prev_val = time[order[i]]
|
|
101
|
+
|
|
102
|
+
times = np.resize(uniq_times, j)
|
|
103
|
+
total_count = np.resize(uniq_counts, j)
|
|
104
|
+
if n_risks:
|
|
105
|
+
n_events = np.resize(uniq_events, (j, n_risks + 1))
|
|
106
|
+
n_censored = total_count - n_events[:, 0]
|
|
107
|
+
else:
|
|
108
|
+
n_events = np.resize(uniq_events, j)
|
|
109
|
+
n_censored = total_count - n_events
|
|
110
|
+
|
|
111
|
+
# offset cumulative sum by one
|
|
112
|
+
total_count = np.r_[0, total_count]
|
|
113
|
+
n_at_risk = n_samples - np.cumsum(total_count)
|
|
114
|
+
|
|
115
|
+
return times, n_events, n_at_risk[:-1], n_censored
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
def _compute_counts_truncated(event, time_enter, time_exit):
|
|
119
|
+
"""Compute counts for left truncated and right-censored survival data.
|
|
120
|
+
|
|
121
|
+
Parameters
|
|
122
|
+
----------
|
|
123
|
+
event : ndarray
|
|
124
|
+
Boolean event indicator.
|
|
125
|
+
|
|
126
|
+
time_enter : ndarray
|
|
127
|
+
Time when a subject entered the study.
|
|
128
|
+
|
|
129
|
+
time_exit : ndarray
|
|
130
|
+
Time when a subject left the study due to an
|
|
131
|
+
event or censoring.
|
|
132
|
+
|
|
133
|
+
Returns
|
|
134
|
+
-------
|
|
135
|
+
times : ndarray
|
|
136
|
+
Unique time points.
|
|
137
|
+
|
|
138
|
+
n_events : ndarray
|
|
139
|
+
Number of events at each time point.
|
|
140
|
+
|
|
141
|
+
n_at_risk : ndarray
|
|
142
|
+
Number of samples that are censored or have an event at each time point.
|
|
143
|
+
"""
|
|
144
|
+
if (time_enter > time_exit).any():
|
|
145
|
+
raise ValueError("exit time must be larger start time for all samples")
|
|
146
|
+
|
|
147
|
+
n_samples = event.shape[0]
|
|
148
|
+
|
|
149
|
+
uniq_times = np.sort(np.unique(np.r_[time_enter, time_exit]), kind="mergesort")
|
|
150
|
+
total_counts = np.empty(len(uniq_times), dtype=int)
|
|
151
|
+
event_counts = np.empty(len(uniq_times), dtype=int)
|
|
152
|
+
|
|
153
|
+
order_enter = np.argsort(time_enter, kind="mergesort")
|
|
154
|
+
order_exit = np.argsort(time_exit, kind="mergesort")
|
|
155
|
+
s_time_enter = time_enter[order_enter]
|
|
156
|
+
s_time_exit = time_exit[order_exit]
|
|
157
|
+
|
|
158
|
+
t0 = uniq_times[0]
|
|
159
|
+
# everything larger is included
|
|
160
|
+
idx_enter = np.searchsorted(s_time_enter, t0, side="right")
|
|
161
|
+
# everything smaller is excluded
|
|
162
|
+
idx_exit = np.searchsorted(s_time_exit, t0, side="left")
|
|
163
|
+
|
|
164
|
+
total_counts[0] = idx_enter
|
|
165
|
+
# except people die on the day they enter
|
|
166
|
+
event_counts[0] = 0
|
|
167
|
+
|
|
168
|
+
for i in range(1, len(uniq_times)):
|
|
169
|
+
ti = uniq_times[i]
|
|
170
|
+
|
|
171
|
+
while idx_enter < n_samples and s_time_enter[idx_enter] < ti:
|
|
172
|
+
idx_enter += 1
|
|
173
|
+
|
|
174
|
+
while idx_exit < n_samples and s_time_exit[idx_exit] < ti:
|
|
175
|
+
idx_exit += 1
|
|
176
|
+
|
|
177
|
+
risk_set = np.setdiff1d(order_enter[:idx_enter], order_exit[:idx_exit], assume_unique=True)
|
|
178
|
+
total_counts[i] = len(risk_set)
|
|
179
|
+
|
|
180
|
+
count_event = 0
|
|
181
|
+
k = idx_exit
|
|
182
|
+
while k < n_samples and s_time_exit[k] == ti:
|
|
183
|
+
if event[order_exit[k]]:
|
|
184
|
+
count_event += 1
|
|
185
|
+
k += 1
|
|
186
|
+
event_counts[i] = count_event
|
|
187
|
+
|
|
188
|
+
return uniq_times, event_counts, total_counts
|
|
189
|
+
|
|
190
|
+
|
|
191
|
+
def _ci_logmlog(s, sigma_t, conf_level):
|
|
192
|
+
r"""Compute the pointwise log-minus-log transformed confidence intervals.
|
|
193
|
+
s refers to the prob_survival or the cum_inc (for the competing risks case).
|
|
194
|
+
sigma_t is the square root of the variance of the log of the estimator of s.
|
|
195
|
+
|
|
196
|
+
.. math::
|
|
197
|
+
|
|
198
|
+
\sigma_t = \mathrm{Var}(\log(\hat{S}(t)))
|
|
199
|
+
"""
|
|
200
|
+
eps = np.finfo(s.dtype).eps
|
|
201
|
+
mask = s > eps
|
|
202
|
+
log_p = np.zeros_like(s)
|
|
203
|
+
np.log(s, where=mask, out=log_p)
|
|
204
|
+
theta = np.zeros_like(s)
|
|
205
|
+
np.true_divide(sigma_t, log_p, where=log_p < -eps, out=theta)
|
|
206
|
+
|
|
207
|
+
z = stats.norm.isf((1.0 - conf_level) / 2.0)
|
|
208
|
+
theta = z * np.multiply.outer([-1, 1], theta)
|
|
209
|
+
ci = np.exp(np.exp(theta) * log_p)
|
|
210
|
+
ci[:, ~mask] = 0.0
|
|
211
|
+
return ci
|
|
212
|
+
|
|
213
|
+
|
|
214
|
+
def _km_ci_estimator(prob_survival, ratio_var, conf_level, conf_type):
|
|
215
|
+
"""Helper to compute confidence intervals for the Kaplan-Meier estimate.
|
|
216
|
+
|
|
217
|
+
Parameters
|
|
218
|
+
----------
|
|
219
|
+
prob_survival : ndarray, shape = (n_times,)
|
|
220
|
+
Survival probability at each unique time point.
|
|
221
|
+
|
|
222
|
+
ratio_var : ndarray, shape = (n_times,)
|
|
223
|
+
The variance ratio term for each unique time point.
|
|
224
|
+
|
|
225
|
+
conf_level : float
|
|
226
|
+
The level for a two-sided confidence interval.
|
|
227
|
+
|
|
228
|
+
conf_type : {'log-log'}
|
|
229
|
+
The type of confidence intervals to estimate.
|
|
230
|
+
|
|
231
|
+
Returns
|
|
232
|
+
-------
|
|
233
|
+
ci : ndarray, shape = (2, n_times)
|
|
234
|
+
Pointwise confidence interval.
|
|
235
|
+
"""
|
|
236
|
+
if conf_type not in {"log-log"}:
|
|
237
|
+
raise ValueError(f"conf_type must be None or a str among {{'log-log'}}, but was {conf_type!r}")
|
|
238
|
+
|
|
239
|
+
if not isinstance(conf_level, numbers.Real) or not np.isfinite(conf_level) or conf_level <= 0 or conf_level >= 1.0:
|
|
240
|
+
raise ValueError(f"conf_level must be a float in the range (0.0, 1.0), but was {conf_level!r}")
|
|
241
|
+
|
|
242
|
+
sigma = np.sqrt(np.cumsum(ratio_var))
|
|
243
|
+
ci = _ci_logmlog(prob_survival, sigma, conf_level)
|
|
244
|
+
return ci
|
|
245
|
+
|
|
246
|
+
|
|
247
|
+
def kaplan_meier_estimator(
    event,
    time_exit,
    time_enter=None,
    time_min=None,
    reverse=False,
    conf_level=0.95,
    conf_type=None,
):
    """Computes the Kaplan-Meier estimate of the survival function.

    See [1]_ for further description.

    Parameters
    ----------
    event : array-like, shape = (n_samples,)
        A boolean array where ``True`` indicates an event and ``False`` indicates
        right-censoring.

    time_exit : array-like, shape = (n_samples,)
        Time of event or censoring.

    time_enter : array-like, shape = (n_samples,), optional
        Contains time when each individual entered the study for
        left truncated survival data.

    time_min : float, optional
        Compute estimator conditional on survival at least up to
        the specified time.

    reverse : bool, optional, default: False
        Whether to estimate the censoring distribution.
        When there are ties between times at which events are observed,
        then events come first and are subtracted from the denominator.
        Only available for right-censored data, i.e. `time_enter` must
        be None.

    conf_level : float, optional, default: 0.95
        The level for a two-sided confidence interval on the survival curves.

    conf_type : None or {'log-log'}, optional, default: None.
        The type of confidence intervals to estimate.
        If `None`, no confidence intervals are estimated.
        If "log-log", estimate confidence intervals using
        the log hazard or :math:`log(-log(S(t)))` as described in [2]_.

    Returns
    -------
    time : ndarray, shape = (n_times,)
        Unique times.

    prob_survival : ndarray, shape = (n_times,)
        Survival probability at each unique time point.
        If `time_enter` is provided, estimates are conditional probabilities.

    conf_int : ndarray, shape = (2, n_times)
        Pointwise confidence interval of the Kaplan-Meier estimator
        at each unique time point.
        Only provided if `conf_type` is not None.

    Examples
    --------
    Creating a Kaplan-Meier curve:

    .. plot::

        >>> import matplotlib.pyplot as plt
        >>> from sksurv.datasets import load_veterans_lung_cancer
        >>> from sksurv.nonparametric import kaplan_meier_estimator
        >>>
        >>> _, y = load_veterans_lung_cancer()
        >>> time, prob_surv, conf_int = kaplan_meier_estimator(
        ...     y["Status"], y["Survival_in_days"], conf_type="log-log"
        ... )
        >>> plt.step(time, prob_surv, where="post")
        [...]
        >>> plt.fill_between(time, conf_int[0], conf_int[1], alpha=0.25, step="post")
        <matplotlib.collections.PolyCollection object at 0x...>
        >>> plt.ylim(0, 1)
        (0.0, 1.0)
        >>> plt.show()  # doctest: +SKIP

    See also
    --------
    sksurv.nonparametric.SurvivalFunctionEstimator
        Estimator API of the Kaplan-Meier estimator.

    References
    ----------
    .. [1] Kaplan, E. L. and Meier, P., "Nonparametric estimation from incomplete observations",
           Journal of The American Statistical Association, vol. 53, pp. 457-481, 1958.
    .. [2] Borgan Ø. and Liestøl K., "A Note on Confidence Intervals and Bands for the
           Survival Function Based on Transformations", Scandinavian Journal of
           Statistics. 1990;17(1):35–41.
    """
    event, time_enter, time_exit = check_y_survival(event, time_enter, time_exit, allow_all_censored=True)
    check_consistent_length(event, time_enter, time_exit)

    if conf_type is not None and reverse:
        raise NotImplementedError("Confidence intervals of the censoring distribution is not implemented.")

    if time_enter is None:
        uniq_times, n_events, n_at_risk, n_censored = _compute_counts(event, time_exit)

        if reverse:
            # estimate the censoring distribution: censored observations become
            # the "events"; with ties, events are removed from the risk set first
            n_at_risk -= n_events
            n_events = n_censored
    else:
        if reverse:
            raise ValueError("The censoring distribution cannot be estimated from left truncated data")

        uniq_times, n_events, n_at_risk = _compute_counts_truncated(event, time_enter, time_exit)

    # account for 0/0 = nan
    ratio = np.divide(
        n_events,
        n_at_risk,
        out=np.zeros(uniq_times.shape[0], dtype=float),
        where=n_events != 0,
    )
    values = 1.0 - ratio

    if conf_type is not None:
        # Greenwood's formula term: d_i / (n_i * (n_i - d_i)), guarded against 0/0
        ratio_var = np.divide(
            n_events,
            n_at_risk * (n_at_risk - n_events),
            out=np.zeros(uniq_times.shape[0], dtype=float),
            where=(n_events != 0) & (n_at_risk != n_events),
        )

    if time_min is not None:
        # condition on surviving up to time_min by dropping earlier factors
        mask = uniq_times >= time_min
        uniq_times = np.compress(mask, uniq_times)
        values = np.compress(mask, values)

    prob_survival = np.cumprod(values)

    if conf_type is None:
        return uniq_times, prob_survival

    if time_min is not None:
        # note: variance terms before time_min are dropped BEFORE the
        # cumulative sum performed in _km_ci_estimator
        ratio_var = np.compress(mask, ratio_var)

    ci = _km_ci_estimator(prob_survival, ratio_var, conf_level, conf_type)

    return uniq_times, prob_survival, ci
def nelson_aalen_estimator(event, time):
    """Computes the Nelson-Aalen estimate of the cumulative hazard function.

    See [1]_, [2]_ for further description.

    Parameters
    ----------
    event : array-like, shape = (n_samples,)
        A boolean array where ``True`` indicates an event and ``False`` indicates
        right-censoring.

    time : array-like, shape = (n_samples,)
        Time of event or censoring.

    Returns
    -------
    time : ndarray, shape = (n_times,)
        Unique times.

    cum_hazard : ndarray, shape = (n_times,)
        Cumulative hazard at each unique time point.

    Examples
    --------
    Creating a cumulative hazard curve:

    .. plot::

        >>> import matplotlib.pyplot as plt
        >>> from sksurv.datasets import load_aids
        >>> from sksurv.nonparametric import nelson_aalen_estimator
        >>>
        >>> _, y = load_aids(endpoint="death")
        >>> time, cum_hazard = nelson_aalen_estimator(y["censor_d"], y["time_d"])
        >>>
        >>> plt.step(time, cum_hazard, where="post")
        [...]
        >>> plt.show()  # doctest: +SKIP

    References
    ----------
    .. [1] Nelson, W., "Theory and applications of hazard plotting for censored failure data",
           Technometrics, vol. 14, pp. 945-965, 1972.

    .. [2] Aalen, O. O., "Nonparametric inference for a family of counting processes",
           Annals of Statistics, vol. 6, pp. 701–726, 1978.
    """
    event, time = check_y_survival(event, time)
    check_consistent_length(event, time)
    uniq_times, n_events, n_at_risk, _ = _compute_counts(event, time)

    # sum of per-time hazard increments d_i / n_i
    hazard_increments = n_events / n_at_risk
    cum_hazard = np.cumsum(hazard_increments)

    return uniq_times, cum_hazard
def ipc_weights(event, time):
    """Compute inverse probability of censoring weights

    Parameters
    ----------
    event : array-like, shape = (n_samples,)
        A boolean array where ``True`` indicates an event and ``False`` indicates
        right-censoring.

    time : array-like, shape = (n_samples,)
        Time when a subject experienced an event or was censored.

    Returns
    -------
    weights : ndarray, shape = (n_samples,)
        inverse probability of censoring weights

    Raises
    ------
    ValueError
        If the estimated censoring survival function is zero
        at one or more event times.

    See also
    --------
    CensoringDistributionEstimator
        An estimator interface for estimating inverse probability
        of censoring weights for unseen time points.
    """
    # no censoring at all: every weight is one
    if event.all():
        return np.ones(time.shape[0])

    # Kaplan-Meier estimate of the censoring distribution G(t)
    unique_time, p = kaplan_meier_estimator(event, time, reverse=True)

    idx = np.searchsorted(unique_time, time[event])
    Ghat = p[idx]

    # raise instead of assert: assertions are stripped under `python -O`;
    # message matches CensoringDistributionEstimator.predict_ipcw
    if (Ghat <= 0).any():
        raise ValueError("censoring survival function is zero at one or more time points")

    weights = np.zeros(time.shape[0])
    weights[event] = 1.0 / Ghat

    return weights
class SurvivalFunctionEstimator(BaseEstimator):
    """Kaplan–Meier estimate of the survival function.

    Parameters
    ----------
    conf_level : float, optional, default: 0.95
        The level for a two-sided confidence interval on the survival curves.

    conf_type : None or {'log-log'}, optional, default: None.
        The type of confidence intervals to estimate.
        If `None`, no confidence intervals are estimated.
        If "log-log", estimate confidence intervals using
        the log hazard or :math:`log(-log(S(t)))`.

    See also
    --------
    sksurv.nonparametric.kaplan_meier_estimator
        Functional API of the Kaplan-Meier estimator.
    """

    _parameter_constraints = {
        "conf_level": [Interval(numbers.Real, 0.0, 1.0, closed="neither")],
        "conf_type": [None, StrOptions({"log-log"})],
    }

    def __init__(self, conf_level=0.95, conf_type=None):
        self.conf_level = conf_level
        self.conf_type = conf_type

    def fit(self, y):
        """Estimate survival distribution from training data.

        Parameters
        ----------
        y : structured array, shape = (n_samples,)
            A structured array with two fields. The first field is a boolean
            where ``True`` indicates an event and ``False`` indicates right-censoring.
            The second field is a float with the time of event or time of censoring.

        Returns
        -------
        self
        """
        self._validate_params()
        event, time = check_y_survival(y, allow_all_censored=True)

        values = kaplan_meier_estimator(event, time, conf_level=self.conf_level, conf_type=self.conf_type)
        if self.conf_type is None:
            unique_time, prob = values
        else:
            unique_time, prob, conf_int = values
            # prepend the trivial interval [1, 1] for the virtual time point
            # added below
            self.conf_int_ = np.column_stack((np.ones((2, 1)), conf_int))

        # prepend a virtual time point at -inf with S(t) = 1 so queries
        # before the first observed time map to probability one
        self.unique_time_ = np.r_[-np.inf, unique_time]
        self.prob_ = np.r_[1.0, prob]

        return self

    def predict_proba(self, time, return_conf_int=False):
        r"""Return probability of remaining event-free at given time points.

        :math:`\hat{S}(t) = P(T > t)`

        Parameters
        ----------
        time : array-like, shape = (n_samples,)
            Time to estimate probability at.

        return_conf_int : bool, optional, default: False
            Whether to return the pointwise confidence interval
            of the survival function.
            Only available if :meth:`fit()` has been called
            with the `conf_type` parameter set.

        Returns
        -------
        prob : ndarray, shape = (n_samples,)
            Probability of remaining event-free at the given time points.

        conf_int : ndarray, shape = (2, n_samples)
            Pointwise confidence interval at the passed time points.
            Only provided if `return_conf_int` is True.
        """
        check_is_fitted(self, "unique_time_")
        if return_conf_int and not hasattr(self, "conf_int_"):
            raise ValueError(
                "If return_conf_int is True, SurvivalFunctionEstimator must be fitted with conf_int != None"
            )

        time = check_array(time, ensure_2d=False, estimator=self, input_name="time")

        # K-M is undefined if estimate at last time point is non-zero
        extends = time > self.unique_time_[-1]
        if self.prob_[-1] > 0 and extends.any():
            raise ValueError(f"time must be smaller than largest observed time point: {self.unique_time_[-1]}")

        # beyond last time point is zero probability
        Shat = np.empty(time.shape, dtype=float)
        Shat[extends] = 0.0

        valid = ~extends
        time = time[valid]
        idx = np.searchsorted(self.unique_time_, time)
        # for non-exact matches, we need to shift the index to left
        eps = np.finfo(self.unique_time_.dtype).eps
        exact = np.absolute(self.unique_time_[idx] - time) < eps
        idx[~exact] -= 1
        Shat[valid] = self.prob_[idx]

        if not return_conf_int:
            return Shat

        # allocate one column per QUERIED time point; `time` was filtered to
        # the valid subset above, so `time.shape[0]` would be too small
        # whenever a query extends beyond the last observed time point
        ci = np.empty((2, extends.shape[0]), dtype=float)
        ci[:, extends] = np.nan
        ci[:, valid] = self.conf_int_[:, idx]
        return Shat, ci
class CensoringDistributionEstimator(SurvivalFunctionEstimator):
    """Kaplan–Meier estimator for the censoring distribution."""

    def fit(self, y):
        """Estimate censoring distribution from training data.

        Parameters
        ----------
        y : structured array, shape = (n_samples,)
            A structured array with two fields. The first field is a boolean
            where ``True`` indicates an event and ``False`` indicates right-censoring.
            The second field is a float with the time of event or time of censoring.

        Returns
        -------
        self
        """
        event, time = check_y_survival(y)

        # without any censoring, the censoring survival function is one everywhere
        if event.all():
            self.unique_time_ = np.unique(time)
            self.prob_ = np.ones(self.unique_time_.shape[0])
            return self

        unique_time, prob = kaplan_meier_estimator(event, time, reverse=True)
        # virtual time point at -inf with probability one, matching the parent class
        self.unique_time_ = np.r_[-np.inf, unique_time]
        self.prob_ = np.r_[1.0, prob]

        return self

    def predict_ipcw(self, y):
        r"""Return inverse probability of censoring weights at given time points.

        :math:`\omega_i = \delta_i / \hat{G}(y_i)`

        Parameters
        ----------
        y : structured array, shape = (n_samples,)
            A structured array with two fields. The first field is a boolean
            where ``True`` indicates an event and ``False`` indicates right-censoring.
            The second field is a float with the time of event or time of censoring.

        Returns
        -------
        ipcw : ndarray, shape = (n_samples,)
            Inverse probability of censoring weights.
        """
        event, time = check_y_survival(y)
        cens_survival = self.predict_proba(time[event])

        if np.any(cens_survival == 0.0):
            raise ValueError("censoring survival function is zero at one or more time points")

        # censored samples receive zero weight; events are weighted by 1 / G(t)
        ipcw = np.zeros(time.shape[0])
        ipcw[event] = 1.0 / cens_survival

        return ipcw
def _cum_inc_cr_ci_estimator(cum_inc, var, conf_level, conf_type):
|
|
666
|
+
if conf_type not in {"log-log"}:
|
|
667
|
+
raise ValueError(f"conf_type must be None or a str among {{'log-log'}}, but was {conf_type!r}")
|
|
668
|
+
|
|
669
|
+
if not isinstance(conf_level, numbers.Real) or not np.isfinite(conf_level) or conf_level <= 0 or conf_level >= 1.0:
|
|
670
|
+
raise ValueError(f"conf_level must be a float in the range (0.0, 1.0), but was {conf_level!r}")
|
|
671
|
+
eps = np.finfo(var.dtype).eps
|
|
672
|
+
sigma = np.zeros_like(var)
|
|
673
|
+
np.divide(np.sqrt(var), cum_inc, where=var > eps, out=sigma)
|
|
674
|
+
ci = _ci_logmlog(cum_inc, sigma, conf_level)
|
|
675
|
+
# make first axis the competing risks, the second axis the lower and upper confidence interval
|
|
676
|
+
ci = np.swapaxes(ci, 0, 1)
|
|
677
|
+
return ci
|
|
678
|
+
|
|
679
|
+
|
|
680
|
+
def cumulative_incidence_competing_risks(
    event,
    time_exit,
    time_min=None,
    conf_level=0.95,
    conf_type=None,
    var_type="Aalen",
):
    """Non-parametric estimator of Cumulative Incidence function in the case of competing risks.

    See the :ref:`User Guide </user_guide/competing-risks.ipynb>` and [1]_ for further details.

    Parameters
    ----------
    event : array-like, shape = (n_samples,), dtype = int
        Contains event indicators. A value of 0 indicates right-censoring,
        while a positive integer from 1 to `n_risks` corresponds to a specific risk.
        `n_risks` is the total number of different risks.
        It assumes there are events for all possible risks.

    time_exit : array-like, shape = (n_samples,)
        Contains event or censoring times.

    time_min : float, optional, default: None
        Compute estimator conditional on survival at least up to
        the specified time.

    conf_level : float, optional, default: 0.95
        The level for a two-sided confidence interval on the cumulative incidence curves.

    conf_type : None or {'log-log'}, optional, default: None.
        The type of confidence intervals to estimate.
        If `None`, no confidence intervals are estimated.
        If "log-log", estimate confidence intervals using
        the log hazard or :math:`log(-log(S(t)))`.

    var_type : {'Aalen', 'Dinse', 'Dinse_Approx'}, optional, default: 'Aalen'
        The method for estimating the variance of the estimator.
        See [2]_, [3]_ and [4]_ for each of the methods.
        Only used if `conf_type` is not None.

    Returns
    -------
    time : ndarray, shape = (n_times,)
        Unique times.

    cum_incidence : ndarray, shape = (n_risks + 1, n_times)
        Cumulative incidence for each risk. The first row (``cum_incidence[0]``)
        is the cumulative incidence of any risk (total risk). The remaining
        rows (``cum_incidence[1:]``) are the cumulative incidences for each
        competing risk.

    conf_int : ndarray, shape = (n_risks + 1, 2, n_times)
        Pointwise confidence interval (second axis) of the cumulative incidence function
        at each unique time point (last axis)
        for all possible risks (first axis), including overall risk (``conf_int[0]``).
        Only provided if `conf_type` is not None.

    Examples
    --------
    Creating cumulative incidence curves:

    .. plot::

        >>> import matplotlib.pyplot as plt
        >>> from sksurv.datasets import load_bmt
        >>> from sksurv.nonparametric import cumulative_incidence_competing_risks
        >>>
        >>> dis, bmt_df = load_bmt()
        >>> event = bmt_df["status"]
        >>> time = bmt_df["ftime"]
        >>> n_risks = event.max()
        >>>
        >>> x, y, conf_int = cumulative_incidence_competing_risks(
        ...     event, time, conf_type="log-log"
        ... )
        >>>
        >>> plt.step(x, y[0], where="post", label="Total risk")
        [...]
        >>> plt.fill_between(x, conf_int[0, 0], conf_int[0, 1], alpha=0.25, step="post")
        <matplotlib.collections.PolyCollection object at 0x...>
        >>> for i in range(1, n_risks + 1):
        ...     plt.step(x, y[i], where="post", label=f"{i}-risk")
        ...     plt.fill_between(x, conf_int[i, 0], conf_int[i, 1], alpha=0.25, step="post")
        [...]
        <matplotlib.collections.PolyCollection object at 0x...>
        >>> plt.ylim(0, 1)
        (0.0, 1.0)
        >>> plt.legend()
        <matplotlib.legend.Legend object at 0x...>
        >>> plt.show()  # doctest: +SKIP

    References
    ----------
    .. [1] Kalbfleisch, J.D. and Prentice, R.L. (2002)
        The Statistical Analysis of Failure Time Data. 2nd Edition, John Wiley and Sons, New York.
    .. [2] Aalen, O. (1978a). Annals of Statistics, 6, 534–545.
        We implement the formula in M. Pintilie: "Competing Risks: A Practical Perspective".
        John Wiley & Sons, 2006, Eq. 4.5
    .. [3] Dinse and Larson, Biometrika (1986), 379. Sect. 4, Eqs. 4 and 5.
    .. [4] Dinse and Larson, Biometrika (1986), 379. Sect. 4, Eq. 6.
    """
    event, time_exit = check_y_survival(event, time_exit, allow_all_censored=True, competing_risks=True)
    check_consistent_length(event, time_exit)

    n_risks = event.max()
    uniq_times, n_events_cr, n_at_risk, _n_censored = _compute_counts(event, time_exit)

    # account for 0/0 = nan
    ratio = np.divide(
        n_events_cr,
        n_at_risk[..., np.newaxis],
        out=np.zeros((uniq_times.shape[0], n_risks + 1), dtype=float),
        where=n_events_cr != 0,
    )

    if time_min is not None:
        # Restrict *all* per-time-point arrays to the same truncated time
        # axis. Previously only `uniq_times` and `ratio` were compressed,
        # which left `n_t`, `n_events_cr`, and `n_at_risk` at the full
        # length and caused shape mismatches below whenever `time_min`
        # was specified.
        mask = uniq_times >= time_min
        uniq_times = np.compress(mask, uniq_times)
        ratio = np.compress(mask, ratio, axis=0)
        n_events_cr = np.compress(mask, n_events_cr, axis=0)
        n_at_risk = np.compress(mask, n_at_risk)

    # number of time points after the optional restriction to `time_min`
    n_t = uniq_times.shape[0]

    # Kaplan-Meier estimate of overall (any-risk) event-free survival,
    # and its left-shifted version (survival just before each time point).
    kpe = np.cumprod(1.0 - ratio[:, 0])
    kpe_prime = np.r_[1.0, kpe[:-1]]
    cum_inc = np.empty((n_risks + 1, n_t), dtype=float)
    cum_inc[0] = 1.0 - kpe
    cum_inc[1:] = np.cumsum((ratio[:, 1:].T * kpe_prime), axis=1)

    if conf_type is None:
        return uniq_times, cum_inc

    if var_type == "Aalen":
        var = _var_aalen(n_events_cr, kpe_prime, n_at_risk, cum_inc)
    elif var_type == "Dinse_Approx":
        var = _var_dinse_approx(n_events_cr, kpe_prime, n_at_risk, cum_inc)
    elif var_type == "Dinse":
        var = _var_dinse(n_events_cr, kpe_prime, n_at_risk)
    else:
        raise ValueError(f"{var_type=} must be one of 'Aalen', 'Dinse', or 'Dinse_Approx'.")

    # The CI of the total risk is derived from the Kaplan-Meier estimator.
    # Forward `time_min` and `conf_level` so it is consistent with the
    # curves computed above (previously the defaults were always used).
    _x, _y, conf_int_km = kaplan_meier_estimator(
        event > 0,
        time_exit,
        time_min=time_min,
        conf_level=conf_level,
        conf_type="log-log",
    )
    ci = np.empty(shape=(n_risks + 1, 2, n_t), dtype=conf_int_km.dtype)
    ci[0, :, :] = 1 - conf_int_km
    ci[1:, :, :] = _cum_inc_cr_ci_estimator(cum_inc[1:], var, conf_level, conf_type)

    return uniq_times, cum_inc, ci
def _var_dinse_approx(n_events_cr, kpe_prime, n_at_risk, cum_inc):
|
|
829
|
+
"""
|
|
830
|
+
Variance estimator from Dinse and Larson, Biometrika (1986), 379
|
|
831
|
+
See Section 4, Eqs. 6.
|
|
832
|
+
This is an approximation from the _var_dinse, so that one should be preferred.
|
|
833
|
+
However, this seems to be more common in the literature.
|
|
834
|
+
"""
|
|
835
|
+
dr = n_events_cr[:, 0]
|
|
836
|
+
dr_cr = n_events_cr[:, 1:].T
|
|
837
|
+
irt = cum_inc[1:, :, np.newaxis] - cum_inc[1:, np.newaxis, :]
|
|
838
|
+
mask = np.tril(np.ones_like(irt[0]))
|
|
839
|
+
|
|
840
|
+
# var_a = np.sum(irt**2 * mask * (dr / (n_at_risk * (n_at_risk - dr))), axis=2)
|
|
841
|
+
var_a = np.einsum("rjk,jk,k->rj", irt**2, mask, dr / (n_at_risk * (n_at_risk - dr)))
|
|
842
|
+
var_b = np.cumsum(((n_at_risk - dr_cr) / n_at_risk) * (dr_cr / n_at_risk**2) * kpe_prime**2, axis=1)
|
|
843
|
+
# var_c = -2 * np.sum(irt * mask * dr_cr[:, np.newaxis, :] * (kpe_prime / n_at_risk**2), axis=2)
|
|
844
|
+
var_c = -2 * np.einsum("rjk,jk,rk,k->rj", irt, mask, dr_cr, kpe_prime / n_at_risk**2)
|
|
845
|
+
|
|
846
|
+
var = var_a + var_b + var_c
|
|
847
|
+
return var
|
|
848
|
+
|
|
849
|
+
|
|
850
|
+
def _var_dinse(n_events_cr, kpe_prime, n_at_risk):
|
|
851
|
+
"""
|
|
852
|
+
Variance estimator from Dinse and Larson, Biometrika (1986), 379
|
|
853
|
+
See Section 4, Eqs. 4 and 5
|
|
854
|
+
"""
|
|
855
|
+
dr = n_events_cr[:, 0]
|
|
856
|
+
dr_cr = n_events_cr[:, 1:].T
|
|
857
|
+
theta = dr_cr * kpe_prime / n_at_risk
|
|
858
|
+
x = dr / (n_at_risk * (n_at_risk - dr))
|
|
859
|
+
cprod = np.cumprod(1 + x) / (1 + x)
|
|
860
|
+
|
|
861
|
+
nt_range = np.arange(dr.size)
|
|
862
|
+
i_idx = nt_range[:, None, None]
|
|
863
|
+
j_idx = nt_range[None, :, None]
|
|
864
|
+
k_idx = nt_range[None, None, :]
|
|
865
|
+
mask = ((j_idx < i_idx) & (k_idx > j_idx) & (k_idx <= i_idx)).astype(int)
|
|
866
|
+
|
|
867
|
+
_v1 = np.zeros_like(theta)
|
|
868
|
+
np.divide((n_at_risk - dr_cr), n_at_risk * dr_cr, out=_v1, where=dr_cr > 0)
|
|
869
|
+
v1 = np.cumsum(theta**2 * ((1 + _v1) * cprod - 1), axis=1)
|
|
870
|
+
|
|
871
|
+
corr = (1 - 1 / n_at_risk) * cprod - 1
|
|
872
|
+
v2 = 2 * np.einsum("rj,rk,ijk->ri", theta * corr, theta, mask)
|
|
873
|
+
var = v1 + v2
|
|
874
|
+
|
|
875
|
+
return var
|
|
876
|
+
|
|
877
|
+
|
|
878
|
+
def _var_aalen(n_events_cr, kpe_prime, n_at_risk, cum_inc):
|
|
879
|
+
"""
|
|
880
|
+
Variance estimator from Aalen
|
|
881
|
+
Aalen, O. (1978a). Nonparametric estimation of partial transition
|
|
882
|
+
probabilities in multiple decrement models. Annals of Statistics, 6, 534–545.
|
|
883
|
+
We implement it as shown in
|
|
884
|
+
M. Pintilie: "Competing Risks: A Practical Perspective". John Wiley & Sons, 2006, Eq. 4.5
|
|
885
|
+
This seems to be the estimator used in cmprsk, but there are some numerical differences with our implementation.
|
|
886
|
+
"""
|
|
887
|
+
dr = n_events_cr[:, 0]
|
|
888
|
+
dr_cr = n_events_cr[:, 1:].T
|
|
889
|
+
irt = cum_inc[1:, :, np.newaxis] - cum_inc[1:, np.newaxis, :]
|
|
890
|
+
mask = np.tril(np.ones_like(irt[0]))
|
|
891
|
+
|
|
892
|
+
_va = np.zeros_like(kpe_prime)
|
|
893
|
+
den_a = (n_at_risk - 1) * (n_at_risk - dr)
|
|
894
|
+
np.divide(dr, den_a, out=_va, where=den_a > 0)
|
|
895
|
+
# var_a = np.sum(irt**2 * mask * _va, axis=2)
|
|
896
|
+
var_a = np.einsum("rjk,jk,k->rj", irt**2, mask, _va)
|
|
897
|
+
|
|
898
|
+
_vb = np.zeros_like(kpe_prime)
|
|
899
|
+
den_b = (n_at_risk - 1) * n_at_risk**2
|
|
900
|
+
np.divide(1.0, den_b, out=_vb, where=den_b > 0)
|
|
901
|
+
var_b = np.cumsum((n_at_risk - dr_cr) * dr_cr * _vb * kpe_prime**2, axis=1)
|
|
902
|
+
|
|
903
|
+
_vca = dr_cr * (n_at_risk - dr_cr)
|
|
904
|
+
_vcb = np.zeros_like(kpe_prime)
|
|
905
|
+
den_c = n_at_risk * (n_at_risk - dr) * (n_at_risk - 1)
|
|
906
|
+
np.divide(kpe_prime, den_c, out=_vcb, where=den_c > 0)
|
|
907
|
+
# var_c = -2 * np.sum(irt * mask * _vca[:, np.newaxis, :] * _vcb, axis=2)
|
|
908
|
+
var_c = -2 * np.einsum("rjk,jk,rk,k->rj", irt, mask, _vca, _vcb)
|
|
909
|
+
|
|
910
|
+
var = var_a + var_b + var_c
|
|
911
|
+
return var
|