scikit-survival 0.26.0__cp314-cp314-macosx_10_15_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (58) hide show
  1. scikit_survival-0.26.0.dist-info/METADATA +185 -0
  2. scikit_survival-0.26.0.dist-info/RECORD +58 -0
  3. scikit_survival-0.26.0.dist-info/WHEEL +6 -0
  4. scikit_survival-0.26.0.dist-info/licenses/COPYING +674 -0
  5. scikit_survival-0.26.0.dist-info/top_level.txt +1 -0
  6. sksurv/__init__.py +183 -0
  7. sksurv/base.py +115 -0
  8. sksurv/bintrees/__init__.py +15 -0
  9. sksurv/bintrees/_binarytrees.cpython-314-darwin.so +0 -0
  10. sksurv/column.py +204 -0
  11. sksurv/compare.py +123 -0
  12. sksurv/datasets/__init__.py +12 -0
  13. sksurv/datasets/base.py +614 -0
  14. sksurv/datasets/data/GBSG2.arff +700 -0
  15. sksurv/datasets/data/actg320.arff +1169 -0
  16. sksurv/datasets/data/bmt.arff +46 -0
  17. sksurv/datasets/data/breast_cancer_GSE7390-metastasis.arff +283 -0
  18. sksurv/datasets/data/cgvhd.arff +118 -0
  19. sksurv/datasets/data/flchain.arff +7887 -0
  20. sksurv/datasets/data/veteran.arff +148 -0
  21. sksurv/datasets/data/whas500.arff +520 -0
  22. sksurv/docstrings.py +99 -0
  23. sksurv/ensemble/__init__.py +2 -0
  24. sksurv/ensemble/_coxph_loss.cpython-314-darwin.so +0 -0
  25. sksurv/ensemble/boosting.py +1564 -0
  26. sksurv/ensemble/forest.py +902 -0
  27. sksurv/ensemble/survival_loss.py +151 -0
  28. sksurv/exceptions.py +18 -0
  29. sksurv/functions.py +114 -0
  30. sksurv/io/__init__.py +2 -0
  31. sksurv/io/arffread.py +91 -0
  32. sksurv/io/arffwrite.py +181 -0
  33. sksurv/kernels/__init__.py +1 -0
  34. sksurv/kernels/_clinical_kernel.cpython-314-darwin.so +0 -0
  35. sksurv/kernels/clinical.py +348 -0
  36. sksurv/linear_model/__init__.py +3 -0
  37. sksurv/linear_model/_coxnet.cpython-314-darwin.so +0 -0
  38. sksurv/linear_model/aft.py +208 -0
  39. sksurv/linear_model/coxnet.py +592 -0
  40. sksurv/linear_model/coxph.py +637 -0
  41. sksurv/meta/__init__.py +4 -0
  42. sksurv/meta/base.py +35 -0
  43. sksurv/meta/ensemble_selection.py +724 -0
  44. sksurv/meta/stacking.py +370 -0
  45. sksurv/metrics.py +1028 -0
  46. sksurv/nonparametric.py +911 -0
  47. sksurv/preprocessing.py +195 -0
  48. sksurv/svm/__init__.py +11 -0
  49. sksurv/svm/_minlip.cpython-314-darwin.so +0 -0
  50. sksurv/svm/_prsvm.cpython-314-darwin.so +0 -0
  51. sksurv/svm/minlip.py +695 -0
  52. sksurv/svm/naive_survival_svm.py +249 -0
  53. sksurv/svm/survival_svm.py +1236 -0
  54. sksurv/testing.py +155 -0
  55. sksurv/tree/__init__.py +1 -0
  56. sksurv/tree/_criterion.cpython-314-darwin.so +0 -0
  57. sksurv/tree/tree.py +790 -0
  58. sksurv/util.py +416 -0
@@ -0,0 +1,614 @@
1
+ import warnings
2
+
3
+ import numpy as np
4
+ import pandas as pd
5
+ from pandas.api.types import CategoricalDtype
6
+
7
+ from ..column import categorical_to_numeric, standardize
8
+ from ..io import loadarff
9
+ from ..util import safe_concat
10
+
11
# Names exported by ``from <this module> import *``: the generic helpers
# followed by the loaders for the bundled datasets.
__all__ = [
    "get_x_y",
    "load_arff_files_standardized",
    "load_aids",
    "load_bmt",
    "load_cgvhd",
    "load_breast_cancer",
    "load_flchain",
    "load_gbsg2",
    "load_whas500",
    "load_veterans_lung_cancer",
]
23
+
24
+
25
def _get_data_path(name):
    """Return the location of the bundled data file called `name`.

    The file is looked up in the ``data`` directory of this package
    via :func:`importlib.resources.files`.
    """
    # Imported lazily, as in the original implementation.
    from importlib import resources

    data_dir = resources.files(__package__) / "data"
    return data_dir / name
29
+
30
+
31
+ def _get_x_y_survival(dataset, col_event, col_time, val_outcome, competing_risks=False):
32
+ if col_event is None or col_time is None:
33
+ y = None
34
+ x_frame = dataset
35
+ else:
36
+ event_type = np.int64 if competing_risks else bool
37
+ y = np.empty(dtype=[(col_event, event_type), (col_time, np.float64)], shape=dataset.shape[0])
38
+ if competing_risks:
39
+ y[col_event] = dataset[col_event].to_numpy()
40
+ else:
41
+ y[col_event] = (dataset[col_event] == val_outcome).to_numpy()
42
+ y[col_time] = dataset[col_time].to_numpy()
43
+
44
+ x_frame = dataset.drop([col_event, col_time], axis=1)
45
+
46
+ return x_frame, y
47
+
48
+
49
+ def _get_x_y_other(dataset, col_label):
50
+ if col_label is None:
51
+ y = None
52
+ x_frame = dataset
53
+ else:
54
+ y = dataset.loc[:, col_label]
55
+ x_frame = dataset.drop(col_label, axis=1)
56
+
57
+ return x_frame, y
58
+
59
+
60
def get_x_y(data_frame, attr_labels, pos_label=None, survival=True, competing_risks=False):
    """Separate a data frame into features and labels.

    Parameters
    ----------
    data_frame : pandas.DataFrame, shape = (n_samples, n_columns)
        Data containing both feature and label columns.

    attr_labels : sequence of str or None
        Names of the column(s) that make up the label.
        If `survival` is `True`, it must contain exactly two elements:
        1) the name of the event indicator column, and
        2) the name of the survival time column.
        If either of the two elements is `None`, no labels are extracted
        and only the features are returned.

    pos_label : any, optional
        Value of the event indicator column denoting that a patient
        experienced an event. Required if `survival` is `True` and
        `competing_risks` is `False`; ignored if `survival` is `False`.

    survival : bool, optional, default: True
        Whether to return `y` in a form suitable for survival analysis.

    competing_risks : bool, optional, default: False
        Whether `y` describes a competing risks situation.
        Only used if `survival` is `True`.

    Returns
    -------
    X : pandas.DataFrame, shape = (n_samples, n_columns - len(attr_labels))
        Data frame containing features.

    y : structured array, shape = (n_samples,), or pandas.DataFrame, shape = (n_samples, len(attr_labels)), or None
        If `survival` is `True`, a structured array with two fields.
        The first field is a boolean where ``True`` indicates an event and
        ``False`` indicates right-censoring; if `competing_risks` is `True`,
        it holds the values of the event column as integers instead.
        The second field is a float with the time of event or time of censoring.

        If `survival` is `False` and `attr_labels` is not `None`, the columns
        of `data_frame` selected by `attr_labels`.

        If no labels were requested, `y` is `None`.

    Raises
    ------
    ValueError
        If `survival` is `True` and `attr_labels` does not have exactly two
        elements, or if `pos_label` is `None` while `competing_risks` is `False`.
    """
    if not survival:
        return _get_x_y_other(data_frame, attr_labels)

    if len(attr_labels) != 2:
        raise ValueError(f"expected sequence of length two for attr_labels, but got {len(attr_labels)}")
    if not competing_risks and pos_label is None:
        raise ValueError("pos_label needs to be specified if survival=True")

    col_event = attr_labels[0]
    col_time = attr_labels[1]
    return _get_x_y_survival(data_frame, col_event, col_time, pos_label, competing_risks)
111
+
112
+
113
def _loadarff_with_index(filename):
    """Load an ARFF file and use its ``index`` column, if any, as row index.

    Parameters
    ----------
    filename : str
        Path to the ARFF file, passed on to ``loadarff``.

    Returns
    -------
    dataset : pandas.DataFrame
        The loaded data. If it has a column named ``index``, that column
        becomes the index of the data frame.
    """
    frame = loadarff(filename)
    if "index" not in frame.columns:
        return frame

    if isinstance(frame["index"].dtype, CategoricalDtype):
        # concatenating categorical index may raise TypeError
        # see https://github.com/pandas-dev/pandas/issues/14586
        frame = frame.astype({"index": "str"})
    frame.set_index("index", inplace=True)
    return frame
122
+
123
+
124
def load_arff_files_standardized(
    path_training,
    attr_labels,
    pos_label=None,
    path_testing=None,
    survival=True,
    standardize_numeric=True,
    to_numeric=True,
):
    """Load training data, and optionally hold-out data, in ARFF format.

    Parameters
    ----------
    path_training : str
        Path to the ARFF file containing the training data.

    attr_labels : sequence of str
        Names of the attributes denoting dependent variables.
        If ``survival`` is set, it must be a sequence with two items:
        the name of the event indicator and the name of the survival/censoring time.

    pos_label : any type, optional
        Value corresponding to an event in survival analysis.
        Only considered if ``survival`` is ``True``.

    path_testing : str, optional
        Path to the ARFF file containing hold-out data. Only columns available
        in both training and testing data are kept (excluding dependent variables).
        If ``standardize_numeric`` is set, the data is normalized on the
        concatenation of training and testing data.

    survival : bool, optional, default: True
        Whether the dependent variables denote event indicator and survival/censoring time.

    standardize_numeric : bool, optional, default: True
        Whether to standardize data to zero mean and unit variance.
        See :func:`sksurv.column.standardize`.

    to_numeric : bool, optional, default: True
        Whether to convert categorical variables to numeric values.
        See :func:`sksurv.column.categorical_to_numeric`.

    Returns
    -------
    x_train : pandas.DataFrame, shape = (n_train, n_features)
        Training data.

    y_train : structured array, shape = (n_train,), or pandas.DataFrame, shape = (n_train, len(attr_labels))
        Dependent variables of the training data.

        If `survival` is `True`, a structured array with two fields.
        The first field is a boolean where ``True`` indicates an event and ``False``
        indicates right-censoring. The second field is a float with the time of
        event or time of censoring.

        If `survival` is `False` and `attr_labels` not `None`, a :class:`pandas.DataFrame`
        with columns specified by `attr_labels`.

        If `survival` is `False` and `attr_labels` is `None`, `y_train` is set to `None`.

    x_test : None or pandas.DataFrame, shape = (n_test, n_features)
        Testing data if `path_testing` was provided.

    y_test : None or structured array, shape = (n_test,)
        Dependent variables of the testing data if `path_testing` was provided,
        in the same format as `y_train`. It is `None` if the testing data
        does not contain all columns listed in `attr_labels`.

    Raises
    ------
    ValueError
        If training and testing data have no feature column in common.
    """
    train_frame = _loadarff_with_index(path_training)
    x_train, y_train = get_x_y(train_frame, attr_labels, pos_label, survival)

    # Training data only: transform it on its own and return early.
    if path_testing is None:
        if standardize_numeric:
            x_train = standardize(x_train)
        if to_numeric:
            x_train = categorical_to_numeric(x_train)
        return x_train, y_train, None, None

    x_test, y_test = _load_arff_testing(path_testing, attr_labels, pos_label, survival)

    mismatched = x_train.columns.symmetric_difference(x_test.columns)
    if len(mismatched) > 0:
        warnings.warn("Restricting columns to intersection between training and testing data", stacklevel=2)

        shared = x_train.columns.intersection(x_test.columns)
        if len(shared) == 0:
            raise ValueError("columns of training and test data do not intersect")

        x_train = x_train.loc[:, shared]
        x_test = x_test.loc[:, shared]

    # Transform both parts jointly so that they share a single encoding and
    # scaling, then split them again at the row where the training data ends.
    n_train = x_train.shape[0]
    combined = safe_concat((x_train, x_test), axis=0)
    if standardize_numeric:
        combined = standardize(combined)
    if to_numeric:
        combined = categorical_to_numeric(combined)

    x_train = combined.iloc[:n_train, :]
    x_test = combined.iloc[n_train:, :]
    return x_train, y_train, x_test, y_test
236
+
237
+
238
def _load_arff_testing(path_testing, attr_labels, pos_label, survival):
    """Load hold-out data and split it into features and labels.

    Labels are only extracted if every column listed in `attr_labels` is
    present in the file; otherwise all columns are treated as features
    and the returned labels are `None`.

    Parameters
    ----------
    path_testing : str
        Path to the ARFF file containing the hold-out data.

    attr_labels : sequence of str or None
        Names of the attributes denoting dependent variables.
        `None` means that no labels are requested.

    pos_label : any
        Value corresponding to an event, passed on to :func:`get_x_y`.

    survival : bool
        Whether the dependent variables denote event indicator and
        survival/censoring time.

    Returns
    -------
    x_test : pandas.DataFrame
        Features of the hold-out data.

    y_test : structured array, pandas.DataFrame, or None
        Labels of the hold-out data, as returned by :func:`get_x_y`.
    """
    test_dataset = _loadarff_with_index(path_testing)

    # pd.Index(None) raises TypeError, therefore "no labels requested" has to
    # be checked before looking the label columns up in the data.
    has_labels = attr_labels is not None and pd.Index(attr_labels).isin(test_dataset.columns).all()
    if not has_labels:
        # Use the placeholder get_x_y() understands as "do not extract labels".
        attr_labels = [None, None] if survival else None
    return get_x_y(test_dataset, attr_labels, pos_label, survival)
248
+
249
+
250
def load_whas500():
    """Load and return the Worcester Heart Attack Study dataset

    The dataset consists of 500 samples and 14 features.
    The endpoint is death, which was observed for 215 patients (43.0%).

    See [1]_, [2]_ for further description.

    Returns
    -------
    x : pandas.DataFrame
        The measurements for each patient.

    y : structured array with 2 fields
        *fstat*: boolean indicating whether the endpoint has been reached
        or the event time is right-censored.

        *lenfol*: total length of follow-up (days from hospital admission date
        to date of last follow-up)

    References
    ----------
    .. [1] https://web.archive.org/web/20170114043458/http://www.umass.edu/statdata/statdata/data/

    .. [2] Hosmer, D., Lemeshow, S., May, S.:
        "Applied Survival Analysis: Regression Modeling of Time to Event Data."
        John Wiley & Sons, Inc. (2008)
    """
    dataset = loadarff(_get_data_path("whas500.arff"))
    return get_x_y(dataset, attr_labels=["fstat", "lenfol"], pos_label="1")
280
+
281
+
282
def load_gbsg2():
    """Load and return the German Breast Cancer Study Group 2 dataset

    The dataset consists of 686 samples and 8 features.
    The endpoint is recurrence free survival, which was observed for 299 patients (43.6%).

    See [1]_, [2]_ for further description.

    Returns
    -------
    x : pandas.DataFrame
        The measurements for each patient.

    y : structured array with 2 fields
        *cens*: boolean indicating whether the endpoint has been reached
        or the event time is right-censored.

        *time*: total length of follow-up

    References
    ----------
    .. [1] http://ascopubs.org/doi/abs/10.1200/jco.1994.12.10.2086

    .. [2] Schumacher, M., Basert, G., Bojar, H., et al.
        "Randomized 2 × 2 trial evaluating hormonal treatment and the duration of chemotherapy
        in node-positive breast cancer patients."
        Journal of Clinical Oncology 12, 2086–2093. (1994)
    """
    dataset = loadarff(_get_data_path("GBSG2.arff"))
    return get_x_y(dataset, attr_labels=["cens", "time"], pos_label="1")
312
+
313
+
314
def load_veterans_lung_cancer():
    """Load and return data from the Veterans' Administration
    Lung Cancer Trial

    The dataset consists of 137 samples and 6 features.
    The endpoint is death, which was observed for 128 patients (93.4%).

    See [1]_ for further description.

    Returns
    -------
    x : pandas.DataFrame
        The measurements for each patient.

    y : structured array with 2 fields
        *Status*: boolean indicating whether the endpoint has been reached
        or the event time is right-censored.

        *Survival_in_days*: total length of follow-up

    References
    ----------
    .. [1] Kalbfleisch, J.D., Prentice, R.L.:
        "The Statistical Analysis of Failure Time Data." John Wiley & Sons, Inc. (2002)
    """
    dataset = loadarff(_get_data_path("veteran.arff"))
    return get_x_y(dataset, attr_labels=["Status", "Survival_in_days"], pos_label="dead")
341
+
342
+
343
def load_aids(endpoint="aids"):
    """Load and return the AIDS Clinical Trial dataset

    The dataset consists of 1,151 samples and 11 features.
    Two endpoints are available:

    1. AIDS defining event, which occurred for 96 patients (8.3%)
    2. Death, which occurred for 26 patients (2.3%)

    See [1]_, [2]_ for further description.

    Parameters
    ----------
    endpoint : {'aids', 'death'}, default: 'aids'
        The endpoint to return as `y`. The columns belonging to the
        other endpoint are removed from the features.

    Returns
    -------
    x : pandas.DataFrame
        The measurements for each patient.

    y : structured array with 2 fields
        *censor*: boolean indicating whether the endpoint has been reached
        or the event time is right-censored.

        *time*: total length of follow-up

        If ``endpoint`` is death, the fields are named *censor_d* and *time_d*.

    Raises
    ------
    ValueError
        If `endpoint` is neither 'aids' nor 'death'.

    References
    ----------
    .. [1] https://web.archive.org/web/20170114043458/http://www.umass.edu/statdata/statdata/data/

    .. [2] Hosmer, D., Lemeshow, S., May, S.:
        "Applied Survival Analysis: Regression Modeling of Time to Event Data."
        John Wiley & Sons, Inc. (2008)
    """
    # endpoint -> (label columns, columns of the other endpoint to discard)
    columns_by_endpoint = {
        "aids": (["censor", "time"], ["censor_d", "time_d"]),
        "death": (["censor_d", "time_d"], ["censor", "time"]),
    }
    if not isinstance(endpoint, str) or endpoint not in columns_by_endpoint:
        raise ValueError("endpoint must be 'aids' or 'death'")
    attr_labels, drop_columns = columns_by_endpoint[endpoint]

    dataset = loadarff(_get_data_path("actg320.arff"))
    features, y = get_x_y(dataset, attr_labels=attr_labels, pos_label="1")
    features = features.drop(columns=drop_columns)
    return features, y
395
+
396
+
397
def load_breast_cancer():
    """Load and return the breast cancer dataset

    The dataset consists of 198 samples and 80 features.
    The endpoint is the presence of distance metastases, which was observed for 51 patients (25.8%).

    See [1]_, [2]_ for further description.

    Returns
    -------
    x : pandas.DataFrame
        The measurements for each patient.

    y : structured array with 2 fields
        *e.tdm*: boolean indicating whether the endpoint has been reached
        or the event time is right-censored.

        *t.tdm*: time to distant metastasis (days)

    References
    ----------
    .. [1] https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE7390

    .. [2] Desmedt, C., Piette, F., Loi et al.:
        "Strong Time Dependence of the 76-Gene Prognostic Signature for Node-Negative Breast Cancer
        Patients in the TRANSBIG Multicenter Independent Validation Series."
        Clin. Cancer Res. 13(11), 3207–14 (2007)
    """
    dataset = loadarff(_get_data_path("breast_cancer_GSE7390-metastasis.arff"))
    return get_x_y(dataset, attr_labels=["e.tdm", "t.tdm"], pos_label="1")
427
+
428
+
429
def load_flchain():
    """Load and return assay of serum free light chain for 7874 subjects.

    The dataset consists of 7874 samples and 9 features:

    1. age: age in years
    2. sex: F=female, M=male
    3. sample.yr: the calendar year in which a blood sample was obtained
    4. kappa: serum free light chain, kappa portion
    5. lambda: serum free light chain, lambda portion
    6. flc.grp: the serum free light chain group for the subject, as used in the original analysis
    7. creatinine: serum creatinine
    8. mgus: whether the subject had been diagnosed with monoclonal gammopathy (MGUS)
    9. chapter: for those who died, a grouping of their primary cause of death by chapter headings
       of the International Code of Diseases ICD-9

    The endpoint is death, which was observed for 2169 subjects (27.5%).

    See [1]_, [2]_ for further description.

    Returns
    -------
    x : pandas.DataFrame
        The measurements for each patient.

    y : structured array with 2 fields
        *death*: boolean indicating whether the subject died
        or the event time is right-censored.

        *futime*: total length of follow-up or time of death.

    References
    ----------
    .. [1] https://doi.org/10.1016/j.mayocp.2012.03.009

    .. [2] Dispenzieri, A., Katzmann, J., Kyle, R., Larson, D., Therneau, T., Colby, C., Clark, R.,
           Mead, G., Kumar, S., Melton III, LJ. and Rajkumar, SV.
           Use of nonclonal serum immunoglobulin free light chains to predict overall survival in
           the general population, Mayo Clinic Proceedings 87:512-523. (2012)
    """
    dataset = loadarff(_get_data_path("flchain.arff"))
    return get_x_y(dataset, attr_labels=["death", "futime"], pos_label="dead")
471
+
472
+
473
def load_bmt():
    """Load and return response to hematopoietic stem cell transplantation (HSCT) for acute leukemia patients.

    The dataset consists of 35 samples and 1 feature "dis" indicating the type of leukemia::

        0=ALL (Acute Lymphoblastic Leukemia)
        1=AML (Acute Myeloid Leukemia)

    The endpoint (status) is defined as

    +-------+------------------------------------+---------------------+
    | Value | Description                        | Count (%)           |
    +=======+====================================+=====================+
    | 0     | Survival (Right-censored data)     | 11 patients (31.4%) |
    +-------+------------------------------------+---------------------+
    | 1     | Transplant related mortality (TRM) | 9 events (25.7%)    |
    +-------+------------------------------------+---------------------+
    | 2     | Relapse                            | 15 events (42.8%)   |
    +-------+------------------------------------+---------------------+

    See [1]_ for further description and [2]_ for the dataset.

    Returns
    -------
    x : pandas.DataFrame
        The measurements for each patient.

    y : structured array with 2 fields
        *status*: Integer indicating the endpoint: 0-(survival i.e. right-censored data), 1-(TRM), 2-(relapse)

        *ftime*: total length of follow-up or time of event.

    References
    ----------
    .. [1] https://doi.org/10.1038/sj.bmt.1705727
           Scrucca, L., Santucci, A. & Aversa, F.:
           "Competing risk analysis using R: an easy guide for clinicians. Bone Marrow Transplant 40, 381–387 (2007)"

    .. [2] https://luca-scr.github.io/R/bmt.csv
    """
    dataset = loadarff(_get_data_path("bmt.arff"))
    # Cast the follow-up time column to integers before building the labels.
    dataset = dataset.astype({"ftime": int})
    return get_x_y(dataset, attr_labels=["status", "ftime"], competing_risks=True)
517
+
518
+
519
def load_cgvhd():
    r"""Load and return data from multicentre randomized clinical trial
    initiated for patients with a myeloid malignancy who were to
    undergo an allogeneic bone marrow transplant.

    The dataset is a 100 patient subsample of the full data set. See [2]_ for further details.

    The raw data has the following columns:

    1. dx: Diagnosis; AML=acute myeloid leukaemia, CML=chronic myeloid leukaemia
    2. tx: Randomized treatment; BM=cell harvested from the bone marrow,
       PB=cell harvested from peripheral blood
    3. extent: Extent of disease; L=limited, E=extensive
    4. agvhdgd: Grade of acute GVHD
    5. age: Age in years
    6. survtime: Time from date of transplant to death or last follow-up, in years
    7. reltime: Time from date of transplant to relapse or last follow-up, in years
    8. agvhtime: Time from date of transplant to acute GVHD or last follow-up, in years
    9. cgvhtime: Time from date of transplant to chronic GVHD or last follow-up, in years
    10. stat: Status; 1=Dead, 0=Alive
    11. rcens: Relapse; 1=Yes, 0=No
    12. agvh: Acute GVHD; 1=Yes, 0=No
    13. cgvh: Chronic GVHD; 1=Yes, 0=No
    14. stnum: patient ID

    Columns 6, 7 and 9 contain the time to death, relapse and CGVHD
    calculated in years (survtime, reltime, cgvhtime) and the
    respective indicator variables are in columns 10, 11 and 13 (stat,
    rcens, cgvh). The earliest time that any of these events happened
    is calculated by taking the minimum of the observed times. The
    status is coded as 0 when no events were observed, 1 if CGVHD was
    observed as first event, 2 if a relapse was observed as the first
    event and 3 if death occurred before either of the events.
    The endpoint (status) is therefore defined as

    +-------+-------------------------------------------+-----------------+
    | Value | Description                               | Count (%)       |
    +=======+===========================================+=================+
    | 0     | Survival (Right-censored data)            | 4 patients (4%) |
    +-------+-------------------------------------------+-----------------+
    | 1     | Chronic graft versus host disease (CGVHD) | 86 events (86%) |
    +-------+-------------------------------------------+-----------------+
    | 2     | Relapse (TRM)                             | 5 events (5%)   |
    +-------+-------------------------------------------+-----------------+
    | 3     | Death                                     | 5 events (5%)   |
    +-------+-------------------------------------------+-----------------+

    The returned features are the columns dx, tx, extent, and age.

    The dataset has been obtained from [1]_.

    Returns
    -------
    x : pandas.DataFrame
        The measurements for each patient.

    y : structured array with 2 fields
        *status*: Integer indicating the endpoint: 0: right-censored data; 1: CGVHD; 2: relapse; 3: death.

        *ftime*: total length of follow-up or time of event.

    References
    ----------
    .. [1] https://sites.google.com/view/melaniapintiliemscstatistics/home/statistics

    .. [2] Melania Pintilie: "Competing Risks: A Practical Perspective". John Wiley & Sons, 2006
    """
    data = loadarff(_get_data_path("cgvhd.arff"))

    # Time of the first event is the earliest of the three observed times.
    time_columns = ["survtime", "reltime", "cgvhtime"]
    data.loc[:, "ftime"] = data[time_columns].min(axis=1)

    # An event counts as the first event if its time equals the earliest time
    # and its indicator column is set.
    first_time = data["ftime"]
    cgvhd_first = (first_time == data["cgvhtime"]) & (data["cgvh"] == "1")
    relapse_first = (first_time == data["reltime"]) & (data["rcens"] == "1")
    death_first = (first_time == data["survtime"]) & (data["stat"] == "1")

    # NOTE(review): the codes are summed, so events tied at `ftime` would add up
    # to a different code; presumably no such ties exist in the bundled data -- verify.
    data.loc[:, "status"] = (
        cgvhd_first.astype(int) + 2 * relapse_first.astype(int) + 3 * death_first.astype(int)
    )

    kept_columns = ["ftime", "status", "dx", "tx", "extent", "age"]
    data = data[kept_columns]

    return get_x_y(data, attr_labels=["status", "ftime"], competing_risks=True)