scikit-survival 0.26.0__cp314-cp314-macosx_10_15_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (58)
  1. scikit_survival-0.26.0.dist-info/METADATA +185 -0
  2. scikit_survival-0.26.0.dist-info/RECORD +58 -0
  3. scikit_survival-0.26.0.dist-info/WHEEL +6 -0
  4. scikit_survival-0.26.0.dist-info/licenses/COPYING +674 -0
  5. scikit_survival-0.26.0.dist-info/top_level.txt +1 -0
  6. sksurv/__init__.py +183 -0
  7. sksurv/base.py +115 -0
  8. sksurv/bintrees/__init__.py +15 -0
  9. sksurv/bintrees/_binarytrees.cpython-314-darwin.so +0 -0
  10. sksurv/column.py +204 -0
  11. sksurv/compare.py +123 -0
  12. sksurv/datasets/__init__.py +12 -0
  13. sksurv/datasets/base.py +614 -0
  14. sksurv/datasets/data/GBSG2.arff +700 -0
  15. sksurv/datasets/data/actg320.arff +1169 -0
  16. sksurv/datasets/data/bmt.arff +46 -0
  17. sksurv/datasets/data/breast_cancer_GSE7390-metastasis.arff +283 -0
  18. sksurv/datasets/data/cgvhd.arff +118 -0
  19. sksurv/datasets/data/flchain.arff +7887 -0
  20. sksurv/datasets/data/veteran.arff +148 -0
  21. sksurv/datasets/data/whas500.arff +520 -0
  22. sksurv/docstrings.py +99 -0
  23. sksurv/ensemble/__init__.py +2 -0
  24. sksurv/ensemble/_coxph_loss.cpython-314-darwin.so +0 -0
  25. sksurv/ensemble/boosting.py +1564 -0
  26. sksurv/ensemble/forest.py +902 -0
  27. sksurv/ensemble/survival_loss.py +151 -0
  28. sksurv/exceptions.py +18 -0
  29. sksurv/functions.py +114 -0
  30. sksurv/io/__init__.py +2 -0
  31. sksurv/io/arffread.py +91 -0
  32. sksurv/io/arffwrite.py +181 -0
  33. sksurv/kernels/__init__.py +1 -0
  34. sksurv/kernels/_clinical_kernel.cpython-314-darwin.so +0 -0
  35. sksurv/kernels/clinical.py +348 -0
  36. sksurv/linear_model/__init__.py +3 -0
  37. sksurv/linear_model/_coxnet.cpython-314-darwin.so +0 -0
  38. sksurv/linear_model/aft.py +208 -0
  39. sksurv/linear_model/coxnet.py +592 -0
  40. sksurv/linear_model/coxph.py +637 -0
  41. sksurv/meta/__init__.py +4 -0
  42. sksurv/meta/base.py +35 -0
  43. sksurv/meta/ensemble_selection.py +724 -0
  44. sksurv/meta/stacking.py +370 -0
  45. sksurv/metrics.py +1028 -0
  46. sksurv/nonparametric.py +911 -0
  47. sksurv/preprocessing.py +195 -0
  48. sksurv/svm/__init__.py +11 -0
  49. sksurv/svm/_minlip.cpython-314-darwin.so +0 -0
  50. sksurv/svm/_prsvm.cpython-314-darwin.so +0 -0
  51. sksurv/svm/minlip.py +695 -0
  52. sksurv/svm/naive_survival_svm.py +249 -0
  53. sksurv/svm/survival_svm.py +1236 -0
  54. sksurv/testing.py +155 -0
  55. sksurv/tree/__init__.py +1 -0
  56. sksurv/tree/_criterion.cpython-314-darwin.so +0 -0
  57. sksurv/tree/tree.py +790 -0
  58. sksurv/util.py +416 -0
sksurv/nonparametric.py
@@ -0,0 +1,911 @@
+ # This program is free software: you can redistribute it and/or modify
+ # it under the terms of the GNU General Public License as published by
+ # the Free Software Foundation, either version 3 of the License, or
+ # (at your option) any later version.
+ #
+ # This program is distributed in the hope that it will be useful,
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ # GNU General Public License for more details.
+ #
+ # You should have received a copy of the GNU General Public License
+ # along with this program. If not, see <http://www.gnu.org/licenses/>.
+ import numbers
+
+ import numpy as np
+ from scipy import stats
+ from sklearn.base import BaseEstimator
+ from sklearn.utils._param_validation import Interval, StrOptions
+ from sklearn.utils.validation import check_array, check_consistent_length, check_is_fitted
+
+ from .util import check_y_survival
+
+ __all__ = [
+     "CensoringDistributionEstimator",
+     "kaplan_meier_estimator",
+     "nelson_aalen_estimator",
+     "ipc_weights",
+     "SurvivalFunctionEstimator",
+     "cumulative_incidence_competing_risks",
+ ]
+
+
+ def _compute_counts(event, time, order=None):
+     """Count right-censored and uncensored samples at each unique time point.
+
+     Parameters
+     ----------
+     event : ndarray
+         Boolean event indicator, or integer in the case of multiple risks.
+         Zero means right-censored; positive values identify each of the
+         possible risk events.
+
+     time : ndarray
+         Survival time or time of censoring.
+
+     order : ndarray or None
+         Indices to order time in ascending order.
+         If None, order will be computed.
+
+     Returns
+     -------
+     times : ndarray
+         Unique time points.
+
+     n_events : ndarray
+         Number of events at each time point.
+         2D array with shape `(n_unique_time_points, n_risks + 1)` in the case of competing risks.
+
+     n_at_risk : ndarray
+         Number of samples that have neither experienced an event nor been
+         censored before each time point, i.e. the size of the risk set.
+
+     n_censored : ndarray
+         Number of censored samples at each time point.
+     """
+     n_samples = event.shape[0]
+     n_risks = event.max() if (np.issubdtype(event.dtype, np.integer) and event.max() > 1) else 0
+
+     if order is None:
+         order = np.argsort(time, kind="mergesort")
+
+     uniq_times = np.empty(n_samples, dtype=time.dtype)
+     uniq_events = np.empty((n_samples, n_risks + 1), dtype=int)
+     uniq_counts = np.empty(n_samples, dtype=int)
+
+     i = 0
+     prev_val = time[order[0]]
+     j = 0
+     while True:
+         count_event = np.zeros(n_risks + 1, dtype=int)
+         count = 0
+         while i < n_samples and prev_val == time[order[i]]:
+             event_type = event[order[i]]
+             if event_type:
+                 count_event[0] += 1
+                 if n_risks:
+                     count_event[event_type] += 1
+
+             count += 1
+             i += 1
+
+         uniq_times[j] = prev_val
+         uniq_events[j] = count_event
+         uniq_counts[j] = count
+         j += 1
+
+         if i == n_samples:
+             break
+
+         prev_val = time[order[i]]
+
+     times = np.resize(uniq_times, j)
+     total_count = np.resize(uniq_counts, j)
+     if n_risks:
+         n_events = np.resize(uniq_events, (j, n_risks + 1))
+         n_censored = total_count - n_events[:, 0]
+     else:
+         n_events = np.resize(uniq_events, j)
+         n_censored = total_count - n_events
+
+     # offset cumulative sum by one
+     total_count = np.r_[0, total_count]
+     n_at_risk = n_samples - np.cumsum(total_count)
+
+     return times, n_events, n_at_risk[:-1], n_censored
+
+
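Reviewer sketch (not part of the upstream file): a minimal illustration of what `_compute_counts` returns for plain right-censored data. The values below were worked out by hand from the loop above; the helper is module-private, so importing it is for illustration only.

    import numpy as np
    from sksurv.nonparametric import _compute_counts  # module-private helper

    event = np.array([True, False, True])
    time = np.array([1.0, 2.0, 2.0])
    times, n_events, n_at_risk, n_censored = _compute_counts(event, time)
    # times -> [1., 2.], n_events -> [1, 1], n_at_risk -> [3, 2], n_censored -> [0, 1]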
+ def _compute_counts_truncated(event, time_enter, time_exit):
+     """Compute counts for left-truncated and right-censored survival data.
+
+     Parameters
+     ----------
+     event : ndarray
+         Boolean event indicator.
+
+     time_enter : ndarray
+         Time when a subject entered the study.
+
+     time_exit : ndarray
+         Time when a subject left the study due to an
+         event or censoring.
+
+     Returns
+     -------
+     times : ndarray
+         Unique time points.
+
+     n_events : ndarray
+         Number of events at each time point.
+
+     n_at_risk : ndarray
+         Number of samples at risk at each time point, i.e. samples that have
+         entered the study and have not yet had an event or been censored.
+     """
+     if (time_enter > time_exit).any():
+         raise ValueError("exit time must be larger than start time for all samples")
+
+     n_samples = event.shape[0]
+
+     uniq_times = np.sort(np.unique(np.r_[time_enter, time_exit]), kind="mergesort")
+     total_counts = np.empty(len(uniq_times), dtype=int)
+     event_counts = np.empty(len(uniq_times), dtype=int)
+
+     order_enter = np.argsort(time_enter, kind="mergesort")
+     order_exit = np.argsort(time_exit, kind="mergesort")
+     s_time_enter = time_enter[order_enter]
+     s_time_exit = time_exit[order_exit]
+
+     t0 = uniq_times[0]
+     # everything larger is included
+     idx_enter = np.searchsorted(s_time_enter, t0, side="right")
+     # everything smaller is excluded
+     idx_exit = np.searchsorted(s_time_exit, t0, side="left")
+
+     total_counts[0] = idx_enter
+     # zero, except for subjects who die on the day they enter
+     event_counts[0] = 0
+
+     for i in range(1, len(uniq_times)):
+         ti = uniq_times[i]
+
+         while idx_enter < n_samples and s_time_enter[idx_enter] < ti:
+             idx_enter += 1
+
+         while idx_exit < n_samples and s_time_exit[idx_exit] < ti:
+             idx_exit += 1
+
+         risk_set = np.setdiff1d(order_enter[:idx_enter], order_exit[:idx_exit], assume_unique=True)
+         total_counts[i] = len(risk_set)
+
+         count_event = 0
+         k = idx_exit
+         while k < n_samples and s_time_exit[k] == ti:
+             if event[order_exit[k]]:
+                 count_event += 1
+             k += 1
+         event_counts[i] = count_event
+
+     return uniq_times, event_counts, total_counts
+
+
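Reviewer sketch (not part of the upstream file): a hand-checked example of the left-truncated counts, using the module-private helper for illustration only. Note that a subject whose entry time coincides with a later time point is not yet counted at risk at that point, because `idx_enter` advances on a strict `< ti` comparison.

    import numpy as np
    from sksurv.nonparametric import _compute_counts_truncated  # module-private helper

    event = np.array([True, False, True])
    time_enter = np.array([0.0, 0.0, 1.0])
    time_exit = np.array([2.0, 3.0, 3.0])
    times, n_events, n_at_risk = _compute_counts_truncated(event, time_enter, time_exit)
    # times -> [0., 1., 2., 3.], n_events -> [0, 0, 1, 1], n_at_risk -> [2, 2, 3, 2]
    # the subject entering at t=1 only joins the risk set at t=2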
+ def _ci_logmlog(s, sigma_t, conf_level):
+     r"""Compute the pointwise log-minus-log transformed confidence intervals.
+
+     s refers to prob_survival, or to cum_inc in the competing risks case.
+     sigma_t is the square root of the variance of the log of the estimator of s:
+
+     .. math::
+
+         \sigma_t = \sqrt{\mathrm{Var}(\log(\hat{S}(t)))}
+     """
+     eps = np.finfo(s.dtype).eps
+     mask = s > eps
+     log_p = np.zeros_like(s)
+     np.log(s, where=mask, out=log_p)
+     theta = np.zeros_like(s)
+     np.true_divide(sigma_t, log_p, where=log_p < -eps, out=theta)
+
+     z = stats.norm.isf((1.0 - conf_level) / 2.0)
+     theta = z * np.multiply.outer([-1, 1], theta)
+     ci = np.exp(np.exp(theta) * log_p)
+     ci[:, ~mask] = 0.0
+     return ci
+
+
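Reviewer sketch (not part of the upstream file): a small sanity check of the transform. The log-minus-log interval always stays inside [0, 1] and brackets the point estimate, which is the reason this transform is preferred over a plain normal interval on S(t).

    import numpy as np
    from sksurv.nonparametric import _ci_logmlog  # module-private helper

    s = np.array([0.9, 0.8, 0.5])
    sigma_t = np.array([0.05, 0.1, 0.2])
    ci = _ci_logmlog(s, sigma_t, conf_level=0.95)  # shape (2, 3): lower, upper
    assert ((0.0 <= ci) & (ci <= 1.0)).all()
    assert (ci[0] <= s).all() and (s <= ci[1]).all()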
+ def _km_ci_estimator(prob_survival, ratio_var, conf_level, conf_type):
+     """Helper to compute confidence intervals for the Kaplan-Meier estimate.
+
+     Parameters
+     ----------
+     prob_survival : ndarray, shape = (n_times,)
+         Survival probability at each unique time point.
+
+     ratio_var : ndarray, shape = (n_times,)
+         The variance ratio term for each unique time point.
+
+     conf_level : float
+         The level for a two-sided confidence interval.
+
+     conf_type : {'log-log'}
+         The type of confidence intervals to estimate.
+
+     Returns
+     -------
+     ci : ndarray, shape = (2, n_times)
+         Pointwise confidence interval.
+     """
+     if conf_type not in {"log-log"}:
+         raise ValueError(f"conf_type must be None or a str among {{'log-log'}}, but was {conf_type!r}")
+
+     if not isinstance(conf_level, numbers.Real) or not np.isfinite(conf_level) or conf_level <= 0 or conf_level >= 1.0:
+         raise ValueError(f"conf_level must be a float in the range (0.0, 1.0), but was {conf_level!r}")
+
+     sigma = np.sqrt(np.cumsum(ratio_var))
+     ci = _ci_logmlog(prob_survival, sigma, conf_level)
+     return ci
+
+
+ def kaplan_meier_estimator(
+     event,
+     time_exit,
+     time_enter=None,
+     time_min=None,
+     reverse=False,
+     conf_level=0.95,
+     conf_type=None,
+ ):
+     """Computes the Kaplan-Meier estimate of the survival function.
+
+     See [1]_ for further description.
+
+     Parameters
+     ----------
+     event : array-like, shape = (n_samples,)
+         A boolean array where ``True`` indicates an event and ``False`` indicates
+         right-censoring.
+
+     time_exit : array-like, shape = (n_samples,)
+         Time of event or censoring.
+
+     time_enter : array-like, shape = (n_samples,), optional
+         Contains time when each individual entered the study for
+         left-truncated survival data.
+
+     time_min : float, optional
+         Compute estimator conditional on survival at least up to
+         the specified time.
+
+     reverse : bool, optional, default: False
+         Whether to estimate the censoring distribution.
+         When there are ties between times at which events are observed,
+         then events come first and are subtracted from the denominator.
+         Only available for right-censored data, i.e. `time_enter` must
+         be None.
+
+     conf_level : float, optional, default: 0.95
+         The level for a two-sided confidence interval on the survival curves.
+
+     conf_type : None or {'log-log'}, optional, default: None
+         The type of confidence intervals to estimate.
+         If `None`, no confidence intervals are estimated.
+         If "log-log", estimate confidence intervals using
+         the log hazard or :math:`log(-log(S(t)))` as described in [2]_.
+
+     Returns
+     -------
+     time : ndarray, shape = (n_times,)
+         Unique times.
+
+     prob_survival : ndarray, shape = (n_times,)
+         Survival probability at each unique time point.
+         If `time_enter` is provided, estimates are conditional probabilities.
+
+     conf_int : ndarray, shape = (2, n_times)
+         Pointwise confidence interval of the Kaplan-Meier estimator
+         at each unique time point.
+         Only provided if `conf_type` is not None.
+
+     Examples
+     --------
+     Creating a Kaplan-Meier curve:
+
+     .. plot::
+
+         >>> import matplotlib.pyplot as plt
+         >>> from sksurv.datasets import load_veterans_lung_cancer
+         >>> from sksurv.nonparametric import kaplan_meier_estimator
+         >>>
+         >>> _, y = load_veterans_lung_cancer()
+         >>> time, prob_surv, conf_int = kaplan_meier_estimator(
+         ...     y["Status"], y["Survival_in_days"], conf_type="log-log"
+         ... )
+         >>> plt.step(time, prob_surv, where="post")
+         [...]
+         >>> plt.fill_between(time, conf_int[0], conf_int[1], alpha=0.25, step="post")
+         <matplotlib.collections.FillBetweenPolyCollection object at 0x...>
+         >>> plt.ylim(0, 1)
+         (0.0, 1.0)
+         >>> plt.show()  # doctest: +SKIP
+
+     See also
+     --------
+     sksurv.nonparametric.SurvivalFunctionEstimator
+         Estimator API of the Kaplan-Meier estimator.
+
+     References
+     ----------
+     .. [1] Kaplan, E. L. and Meier, P., "Nonparametric estimation from incomplete observations",
+            Journal of The American Statistical Association, vol. 53, pp. 457-481, 1958.
+     .. [2] Borgan Ø. and Liestøl K., "A Note on Confidence Intervals and Bands for the
+            Survival Function Based on Transformations", Scandinavian Journal of
+            Statistics, vol. 17, no. 1, pp. 35-41, 1990.
+     """
+     event, time_enter, time_exit = check_y_survival(event, time_enter, time_exit, allow_all_censored=True)
+     check_consistent_length(event, time_enter, time_exit)
+
+     if conf_type is not None and reverse:
+         raise NotImplementedError("Confidence intervals of the censoring distribution are not implemented.")
+
+     if time_enter is None:
+         uniq_times, n_events, n_at_risk, n_censored = _compute_counts(event, time_exit)
+
+         if reverse:
+             n_at_risk -= n_events
+             n_events = n_censored
+     else:
+         if reverse:
+             raise ValueError("The censoring distribution cannot be estimated from left-truncated data")
+
+         uniq_times, n_events, n_at_risk = _compute_counts_truncated(event, time_enter, time_exit)
+
+     # account for 0/0 = nan
+     ratio = np.divide(
+         n_events,
+         n_at_risk,
+         out=np.zeros(uniq_times.shape[0], dtype=float),
+         where=n_events != 0,
+     )
+     values = 1.0 - ratio
+
+     if conf_type is not None:
+         ratio_var = np.divide(
+             n_events,
+             n_at_risk * (n_at_risk - n_events),
+             out=np.zeros(uniq_times.shape[0], dtype=float),
+             where=(n_events != 0) & (n_at_risk != n_events),
+         )
+
+     if time_min is not None:
+         mask = uniq_times >= time_min
+         uniq_times = np.compress(mask, uniq_times)
+         values = np.compress(mask, values)
+
+     prob_survival = np.cumprod(values)
+
+     if conf_type is None:
+         return uniq_times, prob_survival
+
+     if time_min is not None:
+         ratio_var = np.compress(mask, ratio_var)
+
+     ci = _km_ci_estimator(prob_survival, ratio_var, conf_level, conf_type)
+
+     return uniq_times, prob_survival, ci
+
+
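Reviewer sketch (not part of the upstream file): beyond the docstring example, a short illustration of the `time_enter` and `time_min` options on synthetic data; all variable names and numbers below are made up for illustration.

    import numpy as np
    from sksurv.nonparametric import kaplan_meier_estimator

    rng = np.random.default_rng(0)
    n = 100
    enter = rng.uniform(0, 2, size=n)           # delayed study entry
    exit_ = enter + rng.exponential(3, size=n)  # exit is always after entry
    event = rng.random(n) < 0.7                 # ~70% events, rest censored

    # Kaplan-Meier under left truncation (conditional probabilities)
    t, s = kaplan_meier_estimator(event, exit_, time_enter=enter)

    # Estimate conditional on having survived at least up to time 1.0
    t1, s1 = kaplan_meier_estimator(event, exit_, time_min=1.0)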
+ def nelson_aalen_estimator(event, time):
+     """Computes the Nelson-Aalen estimate of the cumulative hazard function.
+
+     See [1]_, [2]_ for further description.
+
+     Parameters
+     ----------
+     event : array-like, shape = (n_samples,)
+         A boolean array where ``True`` indicates an event and ``False`` indicates
+         right-censoring.
+
+     time : array-like, shape = (n_samples,)
+         Time of event or censoring.
+
+     Returns
+     -------
+     time : ndarray, shape = (n_times,)
+         Unique times.
+
+     cum_hazard : ndarray, shape = (n_times,)
+         Cumulative hazard at each unique time point.
+
+     Examples
+     --------
+     Creating a cumulative hazard curve:
+
+     .. plot::
+
+         >>> import matplotlib.pyplot as plt
+         >>> from sksurv.datasets import load_aids
+         >>> from sksurv.nonparametric import nelson_aalen_estimator
+         >>>
+         >>> _, y = load_aids(endpoint="death")
+         >>> time, cum_hazard = nelson_aalen_estimator(y["censor_d"], y["time_d"])
+         >>>
+         >>> plt.step(time, cum_hazard, where="post")
+         [...]
+         >>> plt.show()  # doctest: +SKIP
+
+     References
+     ----------
+     .. [1] Nelson, W., "Theory and applications of hazard plotting for censored failure data",
+            Technometrics, vol. 14, pp. 945-965, 1972.
+
+     .. [2] Aalen, O. O., "Nonparametric inference for a family of counting processes",
+            Annals of Statistics, vol. 6, pp. 701-726, 1978.
+     """
+     event, time = check_y_survival(event, time)
+     check_consistent_length(event, time)
+     uniq_times, n_events, n_at_risk, _ = _compute_counts(event, time)
+
+     y = np.cumsum(n_events / n_at_risk)
+
+     return uniq_times, y
+
+
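Reviewer sketch (not part of the upstream file): the cumulative hazard relates to the survival function via S(t) = exp(-H(t)), so exp(-Nelson-Aalen) gives a Fleming-Harrington-style survival estimate that tracks Kaplan-Meier closely when risk sets are large. A quick comparison on the same dataset used in the docstrings above:

    import numpy as np
    from sksurv.datasets import load_veterans_lung_cancer
    from sksurv.nonparametric import kaplan_meier_estimator, nelson_aalen_estimator

    _, y = load_veterans_lung_cancer()
    t_km, s_km = kaplan_meier_estimator(y["Status"], y["Survival_in_days"])
    t_na, h_na = nelson_aalen_estimator(y["Status"], y["Survival_in_days"])
    # both estimators are evaluated at the same unique time points
    assert np.array_equal(t_km, t_na)
    s_from_hazard = np.exp(-h_na)  # close to s_km, especially at early times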
+ def ipc_weights(event, time):
+     """Compute inverse probability of censoring weights.
+
+     Parameters
+     ----------
+     event : array-like, shape = (n_samples,)
+         A boolean array where ``True`` indicates an event and ``False`` indicates
+         right-censoring.
+
+     time : array-like, shape = (n_samples,)
+         Time when a subject experienced an event or was censored.
+
+     Returns
+     -------
+     weights : ndarray, shape = (n_samples,)
+         Inverse probability of censoring weights.
+
+     See also
+     --------
+     CensoringDistributionEstimator
+         An estimator interface for estimating inverse probability
+         of censoring weights for unseen time points.
+     """
+     if event.all():
+         return np.ones(time.shape[0])
+
+     unique_time, p = kaplan_meier_estimator(event, time, reverse=True)
+
+     idx = np.searchsorted(unique_time, time[event])
+     Ghat = p[idx]
+
+     assert (Ghat > 0).all()
+
+     weights = np.zeros(time.shape[0])
+     weights[event] = 1.0 / Ghat
+
+     return weights
+
+
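Reviewer sketch (not part of the upstream file): the two defining properties of the weights, checked on synthetic data. Making the largest observation an event keeps the censoring survival function positive at every event time, so the assertion inside `ipc_weights` holds; ties have negligible probability for continuous random times.

    import numpy as np
    from sksurv.nonparametric import ipc_weights

    rng = np.random.default_rng(1)
    time = rng.exponential(1.0, size=50)
    event = rng.random(50) < 0.6
    event[np.argmax(time)] = True  # keep G(t) > 0 at all event times

    w = ipc_weights(event, time)
    assert (w[~event] == 0).all()   # censored samples get zero weight
    assert (w[event] >= 1.0).all()  # events are up-weighted by 1 / G(t)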
+ class SurvivalFunctionEstimator(BaseEstimator):
+     """Kaplan–Meier estimate of the survival function.
+
+     Parameters
+     ----------
+     conf_level : float, optional, default: 0.95
+         The level for a two-sided confidence interval on the survival curves.
+
+     conf_type : None or {'log-log'}, optional, default: None
+         The type of confidence intervals to estimate.
+         If `None`, no confidence intervals are estimated.
+         If "log-log", estimate confidence intervals using
+         the log hazard or :math:`log(-log(S(t)))`.
+
+     See also
+     --------
+     sksurv.nonparametric.kaplan_meier_estimator
+         Functional API of the Kaplan-Meier estimator.
+     """
+
+     _parameter_constraints = {
+         "conf_level": [Interval(numbers.Real, 0.0, 1.0, closed="neither")],
+         "conf_type": [None, StrOptions({"log-log"})],
+     }
+
+     def __init__(self, conf_level=0.95, conf_type=None):
+         self.conf_level = conf_level
+         self.conf_type = conf_type
+
+     def fit(self, y):
+         """Estimate survival distribution from training data.
+
+         Parameters
+         ----------
+         y : structured array, shape = (n_samples,)
+             A structured array with two fields. The first field is a boolean
+             where ``True`` indicates an event and ``False`` indicates right-censoring.
+             The second field is a float with the time of event or time of censoring.
+
+         Returns
+         -------
+         self
+         """
+         self._validate_params()
+         event, time = check_y_survival(y, allow_all_censored=True)
+
+         values = kaplan_meier_estimator(event, time, conf_level=self.conf_level, conf_type=self.conf_type)
+         if self.conf_type is None:
+             unique_time, prob = values
+         else:
+             unique_time, prob, conf_int = values
+             self.conf_int_ = np.column_stack((np.ones((2, 1)), conf_int))
+
+         self.unique_time_ = np.r_[-np.inf, unique_time]
+         self.prob_ = np.r_[1.0, prob]
+
+         return self
+
+     def predict_proba(self, time, return_conf_int=False):
+         r"""Return probability of remaining event-free at given time points.
+
+         :math:`\hat{S}(t) = P(T > t)`
+
+         Parameters
+         ----------
+         time : array-like, shape = (n_samples,)
+             Time to estimate probability at.
+
+         return_conf_int : bool, optional, default: False
+             Whether to return the pointwise confidence interval
+             of the survival function.
+             Only available if :meth:`fit()` has been called
+             with the `conf_type` parameter set.
+
+         Returns
+         -------
+         prob : ndarray, shape = (n_samples,)
+             Probability of remaining event-free at the given time points.
+
+         conf_int : ndarray, shape = (2, n_samples)
+             Pointwise confidence interval at the passed time points.
+             Only provided if `return_conf_int` is True.
+         """
+         check_is_fitted(self, "unique_time_")
+         if return_conf_int and not hasattr(self, "conf_int_"):
+             raise ValueError(
+                 "If return_conf_int is True, SurvivalFunctionEstimator must be fitted with conf_type set"
+             )
+
+         time = check_array(time, ensure_2d=False, estimator=self, input_name="time")
+
+         # K-M is undefined if estimate at last time point is non-zero
+         extends = time > self.unique_time_[-1]
+         if self.prob_[-1] > 0 and extends.any():
+             raise ValueError(f"time must be smaller than largest observed time point: {self.unique_time_[-1]}")
+
+         # beyond last time point is zero probability
+         Shat = np.empty(time.shape, dtype=float)
+         Shat[extends] = 0.0
+
+         valid = ~extends
+         time = time[valid]
+         idx = np.searchsorted(self.unique_time_, time)
+         # for non-exact matches, we need to shift the index to the left
+         eps = np.finfo(self.unique_time_.dtype).eps
+         exact = np.absolute(self.unique_time_[idx] - time) < eps
+         idx[~exact] -= 1
+         Shat[valid] = self.prob_[idx]
+
+         if not return_conf_int:
+             return Shat
+
+         # allocate with the original number of query points; `extends` and
+         # `valid` index into the unfiltered array
+         ci = np.empty((2, Shat.shape[0]), dtype=float)
+         ci[:, extends] = np.nan
+         ci[:, valid] = self.conf_int_[:, idx]
+         return Shat, ci
+
+
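Reviewer sketch (not part of the upstream file): a usage example of the estimator API with pointwise confidence intervals; the query times are arbitrary values below the largest observed time.

    from sksurv.datasets import load_veterans_lung_cancer
    from sksurv.nonparametric import SurvivalFunctionEstimator

    _, y = load_veterans_lung_cancer()
    est = SurvivalFunctionEstimator(conf_type="log-log").fit(y)
    prob, ci = est.predict_proba([30, 60, 90], return_conf_int=True)
    # prob has shape (3,), ci has shape (2, 3): lower and upper bounds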
+ class CensoringDistributionEstimator(SurvivalFunctionEstimator):
+     """Kaplan–Meier estimator for the censoring distribution."""
+
+     def fit(self, y):
+         """Estimate censoring distribution from training data.
+
+         Parameters
+         ----------
+         y : structured array, shape = (n_samples,)
+             A structured array with two fields. The first field is a boolean
+             where ``True`` indicates an event and ``False`` indicates right-censoring.
+             The second field is a float with the time of event or time of censoring.
+
+         Returns
+         -------
+         self
+         """
+         event, time = check_y_survival(y)
+         if event.all():
+             self.unique_time_ = np.unique(time)
+             self.prob_ = np.ones(self.unique_time_.shape[0])
+         else:
+             unique_time, prob = kaplan_meier_estimator(event, time, reverse=True)
+             self.unique_time_ = np.r_[-np.inf, unique_time]
+             self.prob_ = np.r_[1.0, prob]
+
+         return self
+
+     def predict_ipcw(self, y):
+         r"""Return inverse probability of censoring weights at given time points.
+
+         :math:`\omega_i = \delta_i / \hat{G}(y_i)`
+
+         Parameters
+         ----------
+         y : structured array, shape = (n_samples,)
+             A structured array with two fields. The first field is a boolean
+             where ``True`` indicates an event and ``False`` indicates right-censoring.
+             The second field is a float with the time of event or time of censoring.
+
+         Returns
+         -------
+         ipcw : ndarray, shape = (n_samples,)
+             Inverse probability of censoring weights.
+         """
+         event, time = check_y_survival(y)
+         Ghat = self.predict_proba(time[event])
+
+         if (Ghat == 0.0).any():
+             raise ValueError("censoring survival function is zero at one or more time points")
+
+         weights = np.zeros(time.shape[0])
+         weights[event] = 1.0 / Ghat
+
+         return weights
+
+
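Reviewer sketch (not part of the upstream file): on the training data, `predict_ipcw` should reproduce the functional `ipc_weights`, since both evaluate the same reverse Kaplan-Meier estimate at the event times. The structured array is built with `sksurv.util.Surv`; the synthetic data follows the same construction as the sketch above.

    import numpy as np
    from sksurv.nonparametric import CensoringDistributionEstimator, ipc_weights
    from sksurv.util import Surv

    rng = np.random.default_rng(2)
    time = rng.exponential(1.0, size=50)
    event = rng.random(50) < 0.6
    event[np.argmax(time)] = True  # keep G(t) > 0 at all event times
    y = Surv.from_arrays(event=event, time=time)

    cens = CensoringDistributionEstimator().fit(y)
    np.testing.assert_allclose(cens.predict_ipcw(y), ipc_weights(event, time))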
+ def _cum_inc_cr_ci_estimator(cum_inc, var, conf_level, conf_type):
+     if conf_type not in {"log-log"}:
+         raise ValueError(f"conf_type must be None or a str among {{'log-log'}}, but was {conf_type!r}")
+
+     if not isinstance(conf_level, numbers.Real) or not np.isfinite(conf_level) or conf_level <= 0 or conf_level >= 1.0:
+         raise ValueError(f"conf_level must be a float in the range (0.0, 1.0), but was {conf_level!r}")
+
+     eps = np.finfo(var.dtype).eps
+     sigma = np.zeros_like(var)
+     np.divide(np.sqrt(var), cum_inc, where=var > eps, out=sigma)
+     ci = _ci_logmlog(cum_inc, sigma, conf_level)
+     # make the first axis the competing risks and the second axis the lower and upper confidence interval
+     ci = np.swapaxes(ci, 0, 1)
+     return ci
+
+
+ def cumulative_incidence_competing_risks(
+     event,
+     time_exit,
+     time_min=None,
+     conf_level=0.95,
+     conf_type=None,
+     var_type="Aalen",
+ ):
+     """Non-parametric estimator of the cumulative incidence function in the case of competing risks.
+
+     See the :ref:`User Guide </user_guide/competing-risks.ipynb>` and [1]_ for further details.
+
+     Parameters
+     ----------
+     event : array-like, shape = (n_samples,), dtype = int
+         Contains event indicators. A value of 0 indicates right-censoring,
+         while a positive integer from 1 to `n_risks` corresponds to a specific risk.
+         `n_risks` is the total number of different risks.
+         It is assumed that events occur for every possible risk.
+
+     time_exit : array-like, shape = (n_samples,)
+         Contains event or censoring times.
+
+     time_min : float, optional, default: None
+         Compute estimator conditional on survival at least up to
+         the specified time.
+
+     conf_level : float, optional, default: 0.95
+         The level for a two-sided confidence interval on the cumulative incidence curves.
+
+     conf_type : None or {'log-log'}, optional, default: None
+         The type of confidence intervals to estimate.
+         If `None`, no confidence intervals are estimated.
+         If "log-log", estimate confidence intervals using
+         the log hazard or :math:`log(-log(S(t)))`.
+
+     var_type : {'Aalen', 'Dinse', 'Dinse_Approx'}, optional, default: 'Aalen'
+         The method for estimating the variance of the estimator.
+         See [2]_, [3]_ and [4]_ for each of the methods.
+         Only used if `conf_type` is not None.
+
+     Returns
+     -------
+     time : ndarray, shape = (n_times,)
+         Unique times.
+
+     cum_incidence : ndarray, shape = (n_risks + 1, n_times)
+         Cumulative incidence for each risk. The first row (``cum_incidence[0]``)
+         is the cumulative incidence of any risk (total risk). The remaining
+         rows (``cum_incidence[1:]``) are the cumulative incidences for each
+         competing risk.
+
+     conf_int : ndarray, shape = (n_risks + 1, 2, n_times)
+         Pointwise confidence interval (second axis) of the cumulative incidence function
+         at each unique time point (last axis)
+         for all possible risks (first axis), including overall risk (``conf_int[0]``).
+         Only provided if `conf_type` is not None.
+
+     Examples
+     --------
+     Creating cumulative incidence curves:
+
+     .. plot::
+
+         >>> import matplotlib.pyplot as plt
+         >>> from sksurv.datasets import load_bmt
+         >>> from sksurv.nonparametric import cumulative_incidence_competing_risks
+         >>>
+         >>> dis, bmt_df = load_bmt()
+         >>> event = bmt_df["status"]
+         >>> time = bmt_df["ftime"]
+         >>> n_risks = event.max()
+         >>>
+         >>> x, y, conf_int = cumulative_incidence_competing_risks(
+         ...     event, time, conf_type="log-log"
+         ... )
+         >>>
+         >>> plt.step(x, y[0], where="post", label="Total risk")
+         [...]
+         >>> plt.fill_between(x, conf_int[0, 0], conf_int[0, 1], alpha=0.25, step="post")
+         <matplotlib.collections.FillBetweenPolyCollection object at 0x...>
+         >>> for i in range(1, n_risks + 1):
+         ...     plt.step(x, y[i], where="post", label=f"{i}-risk")
+         ...     plt.fill_between(x, conf_int[i, 0], conf_int[i, 1], alpha=0.25, step="post")
+         [...]
+         <matplotlib.collections.FillBetweenPolyCollection object at 0x...>
+         >>> plt.ylim(0, 1)
+         (0.0, 1.0)
+         >>> plt.legend()
+         <matplotlib.legend.Legend object at 0x...>
+         >>> plt.show()  # doctest: +SKIP
+
+     References
+     ----------
+     .. [1] Kalbfleisch, J.D. and Prentice, R.L., "The Statistical Analysis of Failure Time Data",
+            2nd edition, John Wiley and Sons, New York, 2002.
+     .. [2] Aalen, O., "Nonparametric estimation of partial transition probabilities in multiple
+            decrement models", Annals of Statistics, vol. 6, pp. 534-545, 1978.
+            We implement the formula given in M. Pintilie, "Competing Risks: A Practical
+            Perspective", John Wiley & Sons, 2006, Eq. 4.5.
+     .. [3] Dinse and Larson, Biometrika (1986), 379. Sect. 4, Eqs. 4 and 5.
+     .. [4] Dinse and Larson, Biometrika (1986), 379. Sect. 4, Eq. 6.
+     """
+     event, time_exit = check_y_survival(event, time_exit, allow_all_censored=True, competing_risks=True)
+     check_consistent_length(event, time_exit)
+
+     n_risks = event.max()
+     uniq_times, n_events_cr, n_at_risk, _n_censored = _compute_counts(event, time_exit)
+
+     # account for 0/0 = nan
+     n_t = uniq_times.shape[0]
+     ratio = np.divide(
+         n_events_cr,
+         n_at_risk[..., np.newaxis],
+         out=np.zeros((n_t, n_risks + 1), dtype=float),
+         where=n_events_cr != 0,
+     )
+
+     if time_min is not None:
+         mask = uniq_times >= time_min
+         uniq_times = np.compress(mask, uniq_times)
+         ratio = np.compress(mask, ratio, axis=0)
+         # keep n_t in sync with the compressed time axis
+         n_t = uniq_times.shape[0]
+
+     kpe = np.cumprod(1.0 - ratio[:, 0])
+     kpe_prime = np.r_[1.0, kpe[:-1]]
+     cum_inc = np.empty((n_risks + 1, n_t), dtype=float)
+     cum_inc[0] = 1.0 - kpe
+     cum_inc[1:] = np.cumsum((ratio[:, 1:].T * kpe_prime), axis=1)
+
+     if conf_type is None:
+         return uniq_times, cum_inc
+
+     if var_type == "Aalen":
+         var = _var_aalen(n_events_cr, kpe_prime, n_at_risk, cum_inc)
+     elif var_type == "Dinse_Approx":
+         var = _var_dinse_approx(n_events_cr, kpe_prime, n_at_risk, cum_inc)
+     elif var_type == "Dinse":
+         var = _var_dinse(n_events_cr, kpe_prime, n_at_risk)
+     else:
+         raise ValueError(f"{var_type=} must be one of 'Aalen', 'Dinse', or 'Dinse_Approx'.")
+
+     # pass conf_level through so the overall-risk interval matches the cause-specific ones
+     _x, _y, conf_int_km = kaplan_meier_estimator(event > 0, time_exit, conf_level=conf_level, conf_type="log-log")
+     ci = np.empty(shape=(n_risks + 1, 2, n_t), dtype=conf_int_km.dtype)
+     ci[0, :, :] = 1 - conf_int_km
+     ci[1:, :, :] = _cum_inc_cr_ci_estimator(cum_inc[1:], var, conf_level, conf_type)
+
+     return uniq_times, cum_inc, ci
+
+
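Reviewer sketch (not part of the upstream file): the cause-specific curves returned by this estimator sum exactly to the overall incidence, since the all-cause hazard increment is the sum of the cause-specific increments (the Aalen-Johansen identity). A quick check on the bmt data used in the docstring:

    import numpy as np
    from sksurv.datasets import load_bmt
    from sksurv.nonparametric import cumulative_incidence_competing_risks

    _, bmt_df = load_bmt()
    times, cum_inc = cumulative_incidence_competing_risks(bmt_df["status"], bmt_df["ftime"])
    # sum of cause-specific incidences equals 1 - S(t), up to floating point
    np.testing.assert_allclose(cum_inc[1:].sum(axis=0), cum_inc[0])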
+ def _var_dinse_approx(n_events_cr, kpe_prime, n_at_risk, cum_inc):
+     """
+     Variance estimator from Dinse and Larson, Biometrika (1986), 379.
+     See Section 4, Eq. 6.
+     This is an approximation of `_var_dinse`, so that one should be preferred.
+     However, this form seems to be more common in the literature.
+     """
+     dr = n_events_cr[:, 0]
+     dr_cr = n_events_cr[:, 1:].T
+     irt = cum_inc[1:, :, np.newaxis] - cum_inc[1:, np.newaxis, :]
+     mask = np.tril(np.ones_like(irt[0]))
+
+     # var_a = np.sum(irt**2 * mask * (dr / (n_at_risk * (n_at_risk - dr))), axis=2)
+     var_a = np.einsum("rjk,jk,k->rj", irt**2, mask, dr / (n_at_risk * (n_at_risk - dr)))
+     var_b = np.cumsum(((n_at_risk - dr_cr) / n_at_risk) * (dr_cr / n_at_risk**2) * kpe_prime**2, axis=1)
+     # var_c = -2 * np.sum(irt * mask * dr_cr[:, np.newaxis, :] * (kpe_prime / n_at_risk**2), axis=2)
+     var_c = -2 * np.einsum("rjk,jk,rk,k->rj", irt, mask, dr_cr, kpe_prime / n_at_risk**2)
+
+     var = var_a + var_b + var_c
+     return var
+
+
+ def _var_dinse(n_events_cr, kpe_prime, n_at_risk):
+     """
+     Variance estimator from Dinse and Larson, Biometrika (1986), 379.
+     See Section 4, Eqs. 4 and 5.
+     """
+     dr = n_events_cr[:, 0]
+     dr_cr = n_events_cr[:, 1:].T
+     theta = dr_cr * kpe_prime / n_at_risk
+     x = dr / (n_at_risk * (n_at_risk - dr))
+     cprod = np.cumprod(1 + x) / (1 + x)
+
+     nt_range = np.arange(dr.size)
+     i_idx = nt_range[:, None, None]
+     j_idx = nt_range[None, :, None]
+     k_idx = nt_range[None, None, :]
+     mask = ((j_idx < i_idx) & (k_idx > j_idx) & (k_idx <= i_idx)).astype(int)
+
+     _v1 = np.zeros_like(theta)
+     np.divide((n_at_risk - dr_cr), n_at_risk * dr_cr, out=_v1, where=dr_cr > 0)
+     v1 = np.cumsum(theta**2 * ((1 + _v1) * cprod - 1), axis=1)
+
+     corr = (1 - 1 / n_at_risk) * cprod - 1
+     v2 = 2 * np.einsum("rj,rk,ijk->ri", theta * corr, theta, mask)
+     var = v1 + v2
+
+     return var
+
+
+ def _var_aalen(n_events_cr, kpe_prime, n_at_risk, cum_inc):
+     """
+     Variance estimator from Aalen:
+     Aalen, O. (1978a). Nonparametric estimation of partial transition
+     probabilities in multiple decrement models. Annals of Statistics, 6, 534-545.
+     We implement it as shown in
+     M. Pintilie, "Competing Risks: A Practical Perspective", John Wiley & Sons, 2006, Eq. 4.5.
+     This seems to be the estimator used in cmprsk, but there are some
+     numerical differences with our implementation.
+     """
+     dr = n_events_cr[:, 0]
+     dr_cr = n_events_cr[:, 1:].T
+     irt = cum_inc[1:, :, np.newaxis] - cum_inc[1:, np.newaxis, :]
+     mask = np.tril(np.ones_like(irt[0]))
+
+     _va = np.zeros_like(kpe_prime)
+     den_a = (n_at_risk - 1) * (n_at_risk - dr)
+     np.divide(dr, den_a, out=_va, where=den_a > 0)
+     # var_a = np.sum(irt**2 * mask * _va, axis=2)
+     var_a = np.einsum("rjk,jk,k->rj", irt**2, mask, _va)
+
+     _vb = np.zeros_like(kpe_prime)
+     den_b = (n_at_risk - 1) * n_at_risk**2
+     np.divide(1.0, den_b, out=_vb, where=den_b > 0)
+     var_b = np.cumsum((n_at_risk - dr_cr) * dr_cr * _vb * kpe_prime**2, axis=1)
+
+     _vca = dr_cr * (n_at_risk - dr_cr)
+     _vcb = np.zeros_like(kpe_prime)
+     den_c = n_at_risk * (n_at_risk - dr) * (n_at_risk - 1)
+     np.divide(kpe_prime, den_c, out=_vcb, where=den_c > 0)
+     # var_c = -2 * np.sum(irt * mask * _vca[:, np.newaxis, :] * _vcb, axis=2)
+     var_c = -2 * np.einsum("rjk,jk,rk,k->rj", irt, mask, _vca, _vcb)
+
+     var = var_a + var_b + var_c
+     return var
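Reviewer sketch (not part of the upstream file): the commented-out `np.sum` lines are kept in the variance helpers because they are easier to read than the `einsum` contractions. A quick check with random arrays (shapes are my own) confirming the two forms agree:

    import numpy as np

    rng = np.random.default_rng(3)
    r, n = 2, 5
    irt = rng.normal(size=(r, n, n))
    mask = np.tril(np.ones((n, n)))
    x = rng.random(n)

    a = np.einsum("rjk,jk,k->rj", irt**2, mask, x)
    b = np.sum(irt**2 * mask * x, axis=2)  # broadcast over (r, n, n), sum over k
    np.testing.assert_allclose(a, b)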