scikit-survival 0.26.0__cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scikit_survival-0.26.0.dist-info/METADATA +185 -0
- scikit_survival-0.26.0.dist-info/RECORD +58 -0
- scikit_survival-0.26.0.dist-info/WHEEL +6 -0
- scikit_survival-0.26.0.dist-info/licenses/COPYING +674 -0
- scikit_survival-0.26.0.dist-info/top_level.txt +1 -0
- sksurv/__init__.py +183 -0
- sksurv/base.py +115 -0
- sksurv/bintrees/__init__.py +15 -0
- sksurv/bintrees/_binarytrees.cpython-312-x86_64-linux-gnu.so +0 -0
- sksurv/column.py +204 -0
- sksurv/compare.py +123 -0
- sksurv/datasets/__init__.py +12 -0
- sksurv/datasets/base.py +614 -0
- sksurv/datasets/data/GBSG2.arff +700 -0
- sksurv/datasets/data/actg320.arff +1169 -0
- sksurv/datasets/data/bmt.arff +46 -0
- sksurv/datasets/data/breast_cancer_GSE7390-metastasis.arff +283 -0
- sksurv/datasets/data/cgvhd.arff +118 -0
- sksurv/datasets/data/flchain.arff +7887 -0
- sksurv/datasets/data/veteran.arff +148 -0
- sksurv/datasets/data/whas500.arff +520 -0
- sksurv/docstrings.py +99 -0
- sksurv/ensemble/__init__.py +2 -0
- sksurv/ensemble/_coxph_loss.cpython-312-x86_64-linux-gnu.so +0 -0
- sksurv/ensemble/boosting.py +1564 -0
- sksurv/ensemble/forest.py +902 -0
- sksurv/ensemble/survival_loss.py +151 -0
- sksurv/exceptions.py +18 -0
- sksurv/functions.py +114 -0
- sksurv/io/__init__.py +2 -0
- sksurv/io/arffread.py +91 -0
- sksurv/io/arffwrite.py +181 -0
- sksurv/kernels/__init__.py +1 -0
- sksurv/kernels/_clinical_kernel.cpython-312-x86_64-linux-gnu.so +0 -0
- sksurv/kernels/clinical.py +348 -0
- sksurv/linear_model/__init__.py +3 -0
- sksurv/linear_model/_coxnet.cpython-312-x86_64-linux-gnu.so +0 -0
- sksurv/linear_model/aft.py +208 -0
- sksurv/linear_model/coxnet.py +592 -0
- sksurv/linear_model/coxph.py +637 -0
- sksurv/meta/__init__.py +4 -0
- sksurv/meta/base.py +35 -0
- sksurv/meta/ensemble_selection.py +724 -0
- sksurv/meta/stacking.py +370 -0
- sksurv/metrics.py +1028 -0
- sksurv/nonparametric.py +911 -0
- sksurv/preprocessing.py +195 -0
- sksurv/svm/__init__.py +11 -0
- sksurv/svm/_minlip.cpython-312-x86_64-linux-gnu.so +0 -0
- sksurv/svm/_prsvm.cpython-312-x86_64-linux-gnu.so +0 -0
- sksurv/svm/minlip.py +695 -0
- sksurv/svm/naive_survival_svm.py +249 -0
- sksurv/svm/survival_svm.py +1236 -0
- sksurv/testing.py +155 -0
- sksurv/tree/__init__.py +1 -0
- sksurv/tree/_criterion.cpython-312-x86_64-linux-gnu.so +0 -0
- sksurv/tree/tree.py +790 -0
- sksurv/util.py +416 -0
sksurv/datasets/base.py
ADDED
|
@@ -0,0 +1,614 @@
|
|
|
1
|
+
import warnings
|
|
2
|
+
|
|
3
|
+
import numpy as np
|
|
4
|
+
import pandas as pd
|
|
5
|
+
from pandas.api.types import CategoricalDtype
|
|
6
|
+
|
|
7
|
+
from ..column import categorical_to_numeric, standardize
|
|
8
|
+
from ..io import loadarff
|
|
9
|
+
from ..util import safe_concat
|
|
10
|
+
|
|
11
|
+
__all__ = [
|
|
12
|
+
"get_x_y",
|
|
13
|
+
"load_arff_files_standardized",
|
|
14
|
+
"load_aids",
|
|
15
|
+
"load_bmt",
|
|
16
|
+
"load_cgvhd",
|
|
17
|
+
"load_breast_cancer",
|
|
18
|
+
"load_flchain",
|
|
19
|
+
"load_gbsg2",
|
|
20
|
+
"load_whas500",
|
|
21
|
+
"load_veterans_lung_cancer",
|
|
22
|
+
]
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def _get_data_path(name):
    """Return the location of a data file bundled with this package.

    Parameters
    ----------
    name : str
        File name inside the package's ``data`` directory.

    Returns
    -------
    path : traversable resource
        Resource handle pointing to ``data/<name>`` of this package.
    """
    # Imported lazily so the resource machinery is only loaded when a
    # bundled dataset is actually requested.
    import importlib.resources

    data_dir = importlib.resources.files(__package__) / "data"
    return data_dir / name
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def _get_x_y_survival(dataset, col_event, col_time, val_outcome, competing_risks=False):
    """Split a data frame into features and a structured survival outcome.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Data frame holding features and, optionally, the outcome columns.

    col_event : str or None
        Name of the column with the event indicator.

    col_time : str or None
        Name of the column with the observed time.

    val_outcome : any
        Value of ``col_event`` that denotes an event. Not used if
        ``competing_risks`` is true.

    competing_risks : bool, optional, default: False
        If true, the event column is copied as 64-bit integers instead of
        being compared against ``val_outcome``.

    Returns
    -------
    x_frame : pandas.DataFrame
        ``dataset`` without the two outcome columns, or ``dataset`` itself
        if either column name is ``None``.

    y : structured array or None
        Array with the fields ``col_event`` and ``col_time`` (float64),
        or ``None`` if either column name is ``None``.
    """
    # Without both column names there is no outcome to extract.
    if col_event is None or col_time is None:
        return dataset, None

    event_dtype = np.int64 if competing_risks else bool
    record_dtype = [(col_event, event_dtype), (col_time, np.float64)]
    y = np.empty(dataset.shape[0], dtype=record_dtype)

    event_column = dataset[col_event]
    if not competing_risks:
        # Binary outcome: an event happened where the column equals val_outcome.
        event_column = event_column == val_outcome
    y[col_event] = event_column.to_numpy()
    y[col_time] = dataset[col_time].to_numpy()

    features = dataset.drop([col_event, col_time], axis=1)
    return features, y
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def _get_x_y_other(dataset, col_label):
    """Split a data frame into features and plain (non-survival) labels.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Data frame holding features and, optionally, label columns.

    col_label : str, sequence of str, or None
        Column(s) to use as label.

    Returns
    -------
    x_frame : pandas.DataFrame
        ``dataset`` without the label column(s), or ``dataset`` itself
        if ``col_label`` is ``None``.

    y : pandas object or None
        The selected label column(s), or ``None`` if ``col_label`` is ``None``.
    """
    if col_label is None:
        return dataset, None

    labels = dataset.loc[:, col_label]
    features = dataset.drop(col_label, axis=1)
    return features, labels
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def get_x_y(data_frame, attr_labels, pos_label=None, survival=True, competing_risks=False):
    """Separate the label column(s) of a data frame from its features.

    Parameters
    ----------
    data_frame : pandas.DataFrame, shape = (n_samples, n_columns)
        A data frame.

    attr_labels : sequence of str or None
        A list of one or more columns that are considered the label.
        If `survival` is `True`, then attr_labels has two elements:
        1) the name of the column denoting the event indicator, and
        2) the name of the column denoting the survival time.
        If the sequence contains `None`, then labels are not retrieved
        and only a data frame with features is returned.

    pos_label : any, optional
        Which value of the event indicator column denotes that a
        patient experienced an event. This value is ignored if
        `survival` is `False`.

    survival : bool, optional, default: True
        Whether to return `y` that can be used for survival analysis.

    competing_risks : bool, optional, default: False
        Whether `y` refers to competing risks situation. Only used if `survival` is `True`.

    Returns
    -------
    X : pandas.DataFrame, shape = (n_samples, n_columns - len(attr_labels))
        Data frame containing features.

    y : structured array, shape = (n_samples,), or pandas.DataFrame, shape = (n_samples, len(attr_labels)), or None
        If `survival` is `True`, a structured array with two fields.
        The first field is a boolean where ``True`` indicates an event and ``False``
        indicates right-censoring (with `competing_risks`, it is an integer holding
        the values of the event column instead). The second field is a float with
        the time of event or time of censoring.

        If `survival` is `False` and `attr_labels` not `None`, a :class:`pandas.DataFrame`
        with columns specified by `attr_labels`.

        If `survival` is `False` and `attr_labels` is `None`, `y` is set to `None`.

    Raises
    ------
    ValueError
        If `survival` is `True` and `attr_labels` does not have exactly two elements,
        or if `pos_label` is `None` while `competing_risks` is `False`.
    """
    if not survival:
        return _get_x_y_other(data_frame, attr_labels)

    n_labels = len(attr_labels)
    if n_labels != 2:
        raise ValueError(f"expected sequence of length two for attr_labels, but got {n_labels}")

    # A binary outcome needs to know which value marks an event;
    # competing risks keep the raw event codes instead.
    if not competing_risks and pos_label is None:
        raise ValueError("pos_label needs to be specified if survival=True")

    col_event = attr_labels[0]
    col_time = attr_labels[1]
    return _get_x_y_survival(data_frame, col_event, col_time, pos_label, competing_risks)
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
def _loadarff_with_index(filename):
    """Load an ARFF file and use its ``index`` column, if any, as row index.

    Parameters
    ----------
    filename : str or path-like
        ARFF file to read.

    Returns
    -------
    dataset : pandas.DataFrame
        The loaded data. If the file has a column named ``index``, it
        becomes the index of the data frame.
    """
    frame = loadarff(filename)
    if "index" not in frame.columns:
        return frame

    if isinstance(frame["index"].dtype, CategoricalDtype):
        # concatenating categorical index may raise TypeError
        # see https://github.com/pandas-dev/pandas/issues/14586
        frame = frame.astype({"index": "str"})

    frame.set_index("index", inplace=True)
    return frame
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
def load_arff_files_standardized(
    path_training,
    attr_labels,
    pos_label=None,
    path_testing=None,
    survival=True,
    standardize_numeric=True,
    to_numeric=True,
):
    """Load training (and optionally testing) data stored in ARFF format.

    Parameters
    ----------
    path_training : str
        Path to ARFF file containing data.

    attr_labels : sequence of str
        Names of attributes denoting dependent variables.
        If ``survival`` is set, it must be a sequence with two items:
        the name of the event indicator and the name of the survival/censoring time.

    pos_label : any type, optional
        Value corresponding to an event in survival analysis.
        Only considered if ``survival`` is ``True``.

    path_testing : str, optional
        Path to ARFF file containing hold-out data. Only columns that are available in both
        training and testing are considered (excluding dependent variables).
        If ``standardize_numeric`` is set, data is normalized by considering both training
        and testing data.

    survival : bool, optional, default: True
        Whether the dependent variables denote event indicator and survival/censoring time.

    standardize_numeric : bool, optional, default: True
        Whether to standardize data to zero mean and unit variance.
        See :func:`sksurv.column.standardize`.

    to_numeric : bool, optional, default: True
        Whether to convert categorical variables to numeric values.
        See :func:`sksurv.column.categorical_to_numeric`.

    Returns
    -------
    x_train : pandas.DataFrame, shape = (n_train, n_features)
        Training data.

    y_train : structured array, shape = (n_train,), or pandas.DataFrame, shape = (n_train, len(attr_labels))
        Dependent variables of training data.

        If `survival` is `True`, a structured array with two fields.
        The first field is a boolean where ``True`` indicates an event and ``False``
        indicates right-censoring. The second field is a float with the time of
        event or time of censoring.

        If `survival` is `False` and `attr_labels` not `None`, a :class:`pandas.DataFrame`
        with columns specified by `attr_labels`.

        If `survival` is `False` and `attr_labels` is `None`, `y_train` is set to `None`.

    x_test : None or pandas.DataFrame, shape = (n_test, n_features)
        Testing data if `path_testing` was provided.

    y_test : None or structured array, shape = (n_test,)
        Dependent variables of testing data if `path_testing` was provided.

        If `survival` is `True`, a structured array with two fields.
        The first field is a boolean where ``True`` indicates an event and ``False``
        indicates right-censoring. The second field is a float with the time of
        event or time of censoring.

        If `survival` is `False` and `attr_labels` not `None`, a :class:`pandas.DataFrame`
        with columns specified by `attr_labels`.

        If `survival` is `False` and `attr_labels` is `None`, `y_test` is set to `None`.

    Raises
    ------
    ValueError
        If training and testing data have no feature column in common.
    """
    training_data = _loadarff_with_index(path_training)
    x_train, y_train = get_x_y(training_data, attr_labels, pos_label, survival)

    # Training data only: transform it on its own and report no test split.
    if path_testing is None:
        if standardize_numeric:
            x_train = standardize(x_train)
        if to_numeric:
            x_train = categorical_to_numeric(x_train)
        return x_train, y_train, None, None

    x_test, y_test = _load_arff_testing(path_testing, attr_labels, pos_label, survival)

    if len(x_train.columns.symmetric_difference(x_test.columns)) > 0:
        warnings.warn("Restricting columns to intersection between training and testing data", stacklevel=2)

    shared_columns = x_train.columns.intersection(x_test.columns)
    if len(shared_columns) == 0:
        raise ValueError("columns of training and test data do not intersect")

    x_train = x_train.loc[:, shared_columns]
    x_test = x_test.loc[:, shared_columns]
    n_train = x_train.shape[0]

    # Transform both splits jointly so they share the same scaling and
    # encoding, then cut the stacked frame apart again by position.
    combined = safe_concat((x_train, x_test), axis=0)
    if standardize_numeric:
        combined = standardize(combined)
    if to_numeric:
        combined = categorical_to_numeric(combined)

    x_train = combined.iloc[:n_train, :]
    x_test = combined.iloc[n_train:, :]
    return x_train, y_train, x_test, y_test
|
|
236
|
+
|
|
237
|
+
|
|
238
|
+
def _load_arff_testing(path_testing, attr_labels, pos_label, survival):
    """Load hold-out data from an ARFF file, with labels if it has them.

    Parameters
    ----------
    path_testing : str
        Path to ARFF file containing hold-out data.

    attr_labels : sequence of str
        Names of the label columns.

    pos_label : any
        Value of the event indicator that denotes an event.

    survival : bool
        Whether the labels describe a survival outcome.

    Returns
    -------
    x_test : pandas.DataFrame
        Features of the hold-out data.

    y_test : structured array, pandas.DataFrame, or None
        Labels of the hold-out data; ``None`` if at least one of the
        label columns is absent from the file.
    """
    test_dataset = _loadarff_with_index(path_testing)

    labels_present = pd.Index(attr_labels).isin(test_dataset.columns).all()
    if not labels_present:
        # Ask get_x_y for features only, in the form each mode expects.
        attr_labels = [None, None] if survival else None

    return get_x_y(test_dataset, attr_labels, pos_label, survival)
|
|
248
|
+
|
|
249
|
+
|
|
250
|
+
def load_whas500():
    """Load and return the Worcester Heart Attack Study dataset

    The dataset has 500 samples and 14 features.
    The endpoint is death, which occurred for 215 patients (43.0%).

    See [1]_, [2]_ for further description.

    Returns
    -------
    x : pandas.DataFrame
        The measurements for each patient.

    y : structured array with 2 fields
        *fstat*: boolean indicating whether the endpoint has been reached
        or the event time is right-censored.

        *lenfol*: total length of follow-up (days from hospital admission date
        to date of last follow-up)

    References
    ----------
    .. [1] https://web.archive.org/web/20170114043458/http://www.umass.edu/statdata/statdata/data/

    .. [2] Hosmer, D., Lemeshow, S., May, S.:
        "Applied Survival Analysis: Regression Modeling of Time to Event Data."
        John Wiley & Sons, Inc. (2008)
    """
    dataset = loadarff(_get_data_path("whas500.arff"))
    # The event indicator is compared against the string "1".
    return get_x_y(dataset, attr_labels=["fstat", "lenfol"], pos_label="1")
|
|
280
|
+
|
|
281
|
+
|
|
282
|
+
def load_gbsg2():
    """Load and return the German Breast Cancer Study Group 2 dataset

    The dataset has 686 samples and 8 features.
    The endpoint is recurrence free survival, which occurred for 299 patients (43.6%).

    See [1]_, [2]_ for further description.

    Returns
    -------
    x : pandas.DataFrame
        The measurements for each patient.

    y : structured array with 2 fields
        *cens*: boolean indicating whether the endpoint has been reached
        or the event time is right-censored.

        *time*: total length of follow-up

    References
    ----------
    .. [1] http://ascopubs.org/doi/abs/10.1200/jco.1994.12.10.2086

    .. [2] Schumacher, M., Basert, G., Bojar, H., et al.
        "Randomized 2 × 2 trial evaluating hormonal treatment and the duration of chemotherapy
        in node-positive breast cancer patients."
        Journal of Clinical Oncology 12, 2086–2093. (1994)
    """
    dataset = loadarff(_get_data_path("GBSG2.arff"))
    # The event indicator is compared against the string "1".
    return get_x_y(dataset, attr_labels=["cens", "time"], pos_label="1")
|
|
312
|
+
|
|
313
|
+
|
|
314
|
+
def load_veterans_lung_cancer():
    """Load and return data from the Veterans' Administration
    Lung Cancer Trial

    The dataset has 137 samples and 6 features.
    The endpoint is death, which occurred for 128 patients (93.4%).

    See [1]_ for further description.

    Returns
    -------
    x : pandas.DataFrame
        The measurements for each patient.

    y : structured array with 2 fields
        *Status*: boolean indicating whether the endpoint has been reached
        or the event time is right-censored.

        *Survival_in_days*: total length of follow-up

    References
    ----------
    .. [1] Kalbfleisch, J.D., Prentice, R.L.:
        "The Statistical Analysis of Failure Time Data." John Wiley & Sons, Inc. (2002)
    """
    dataset = loadarff(_get_data_path("veteran.arff"))
    # Rows whose status equals "dead" count as events.
    return get_x_y(dataset, attr_labels=["Status", "Survival_in_days"], pos_label="dead")
|
|
341
|
+
|
|
342
|
+
|
|
343
|
+
def load_aids(endpoint="aids"):
    """Load and return the AIDS Clinical Trial dataset

    The dataset has 1,151 samples and 11 features.
    The dataset has 2 endpoints:

    1. AIDS defining event, which occurred for 96 patients (8.3%)
    2. Death, which occurred for 26 patients (2.3%)

    See [1]_, [2]_ for further description.

    Parameters
    ----------
    endpoint : {'aids', 'death'}, default: 'aids'
        The endpoint.

    Returns
    -------
    x : pandas.DataFrame
        The measurements for each patient.

    y : structured array with 2 fields
        *censor*: boolean indicating whether the endpoint has been reached
        or the event time is right-censored.

        *time*: total length of follow-up

        If ``endpoint`` is death, the fields are named *censor_d* and *time_d*.

    Raises
    ------
    ValueError
        If ``endpoint`` is neither 'aids' nor 'death'.

    References
    ----------
    .. [1] https://web.archive.org/web/20170114043458/http://www.umass.edu/statdata/statdata/data/

    .. [2] Hosmer, D., Lemeshow, S., May, S.:
        "Applied Survival Analysis: Regression Modeling of Time to Event Data."
        John Wiley & Sons, Inc. (2008)
    """
    labels_aids = ["censor", "time"]
    labels_death = ["censor_d", "time_d"]

    # Validate the requested endpoint before touching the data file.
    is_aids = endpoint == "aids"
    if not is_aids and not endpoint == "death":
        raise ValueError("endpoint must be 'aids' or 'death'")

    # The columns of the endpoint that was not requested are outcome
    # information too, so they must not remain among the features.
    if is_aids:
        attr_labels, drop_columns = labels_aids, labels_death
    else:
        attr_labels, drop_columns = labels_death, labels_aids

    dataset = loadarff(_get_data_path("actg320.arff"))
    x, y = get_x_y(dataset, attr_labels=attr_labels, pos_label="1")
    return x.drop(drop_columns, axis=1), y
|
|
395
|
+
|
|
396
|
+
|
|
397
|
+
def load_breast_cancer():
    """Load and return the breast cancer dataset

    The dataset has 198 samples and 80 features.
    The endpoint is the presence of distant metastases, which occurred for 51 patients (25.8%).

    See [1]_, [2]_ for further description.

    Returns
    -------
    x : pandas.DataFrame
        The measurements for each patient.

    y : structured array with 2 fields
        *e.tdm*: boolean indicating whether the endpoint has been reached
        or the event time is right-censored.

        *t.tdm*: time to distant metastasis (days)

    References
    ----------
    .. [1] https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE7390

    .. [2] Desmedt, C., Piette, F., Loi et al.:
        "Strong Time Dependence of the 76-Gene Prognostic Signature for Node-Negative Breast Cancer
        Patients in the TRANSBIG Multicenter Independent Validation Series."
        Clin. Cancer Res. 13(11), 3207–14 (2007)
    """
    dataset = loadarff(_get_data_path("breast_cancer_GSE7390-metastasis.arff"))
    # The event indicator is compared against the string "1".
    return get_x_y(dataset, attr_labels=["e.tdm", "t.tdm"], pos_label="1")
|
|
427
|
+
|
|
428
|
+
|
|
429
|
+
def load_flchain():
    """Load and return assay of serum free light chain for 7874 subjects.

    The dataset has 7874 samples and 9 features:

    1. age: age in years
    2. sex: F=female, M=male
    3. sample.yr: the calendar year in which a blood sample was obtained
    4. kappa: serum free light chain, kappa portion
    5. lambda: serum free light chain, lambda portion
    6. flc.grp: the serum free light chain group for the subject, as used in the original analysis
    7. creatinine: serum creatinine
    8. mgus: whether the subject had been diagnosed with monoclonal gammopathy (MGUS)
    9. chapter: for those who died, a grouping of their primary cause of death by chapter headings
       of the International Code of Diseases ICD-9

    The endpoint is death, which occurred for 2169 subjects (27.5%).

    See [1]_, [2]_ for further description.

    Returns
    -------
    x : pandas.DataFrame
        The measurements for each patient.

    y : structured array with 2 fields
        *death*: boolean indicating whether the subject died
        or the event time is right-censored.

        *futime*: total length of follow-up or time of death.

    References
    ----------
    .. [1] https://doi.org/10.1016/j.mayocp.2012.03.009

    .. [2] Dispenzieri, A., Katzmann, J., Kyle, R., Larson, D., Therneau, T., Colby, C., Clark, R.,
           Mead, G., Kumar, S., Melton III, LJ. and Rajkumar, SV.
           Use of nonclonal serum immunoglobulin free light chains to predict overall survival in
           the general population, Mayo Clinic Proceedings 87:512-523. (2012)
    """
    dataset = loadarff(_get_data_path("flchain.arff"))
    # Rows whose death column equals "dead" count as events.
    return get_x_y(dataset, attr_labels=["death", "futime"], pos_label="dead")
|
|
471
|
+
|
|
472
|
+
|
|
473
|
+
def load_bmt():
    """Load and return response to hematopoietic stem cell transplantation (HSCT) for acute leukemia patients.

    The dataset has 35 samples and 1 feature "dis" indicating the type of leukemia::

        0=ALL (Acute Lymphoblastic Leukemia)
        1=AML (Acute Myeloid Leukemia)

    The endpoint (status) is defined as

    +-------+------------------------------------+---------------------+
    | Value | Description                        | Count (%)           |
    +=======+====================================+=====================+
    | 0     | Survival (Right-censored data)     | 11 patients (31.4%) |
    +-------+------------------------------------+---------------------+
    | 1     | Transplant related mortality (TRM) | 9 events (25.7%)    |
    +-------+------------------------------------+---------------------+
    | 2     | Relapse                            | 15 events (42.9%)   |
    +-------+------------------------------------+---------------------+

    See [1]_ for further description and [2]_ for the dataset.

    Returns
    -------
    x : pandas.DataFrame
        The measurements for each patient.

    y : structured array with 2 fields
        *status*: Integer indicating the endpoint: 0-(survival i.e. right-censored data), 1-(TRM), 2-(relapse)

        *ftime*: total length of follow-up or time of event.

    References
    ----------
    .. [1] https://doi.org/10.1038/sj.bmt.1705727
        Scrucca, L., Santucci, A. & Aversa, F.:
        "Competing risk analysis using R: an easy guide for clinicians. Bone Marrow Transplant 40, 381–387 (2007)"

    .. [2] https://luca-scr.github.io/R/bmt.csv
    """
    dataset = loadarff(_get_data_path("bmt.arff"))
    # Follow-up times are cast to integers before the outcome is built.
    dataset = dataset.astype({"ftime": int})
    return get_x_y(dataset, attr_labels=["status", "ftime"], competing_risks=True)
|
|
517
|
+
|
|
518
|
+
|
|
519
|
+
def load_cgvhd():
    r"""Load and return data from multicentre randomized clinical trial
    initiated for patients with a myeloid malignancy who were to
    undergo an allogeneic bone marrow transplant.

    The dataset is a 100 patient subsample of the full data set. See [2]_ for further details.

    +-------+------------+----------------------------------------------+-------------------------------------------+
    | Index | Name       | Description                                  | Encoding                                  |
    +=======+============+==============================================+===========================================+
    | 1     | dx         | Diagnosis                                    | | AML=acute myeloid leukaemia             |
    |       |            |                                              | | CML=chronic myeloid leukaemia           |
    +-------+------------+----------------------------------------------+-------------------------------------------+
    | 2     | tx         | Randomized treatment                         | | BM=cell harvested from the bone marrow  |
    |       |            |                                              | | PB=cell harvested from peripheral blood |
    +-------+------------+----------------------------------------------+-------------------------------------------+
    | 3     | extent     | Extent of disease                            | L=limited, E=extensive                    |
    +-------+------------+----------------------------------------------+-------------------------------------------+
    | 4     | agvhdgd    | Grade of acute GVHD                          |                                           |
    +-------+------------+----------------------------------------------+-------------------------------------------+
    | 5     | age        | Age                                          | Years                                     |
    +-------+------------+----------------------------------------------+-------------------------------------------+
    | 6     | survtime   | Time from date of transplant to death        | Years                                     |
    |       |            | or last follow-up                            |                                           |
    +-------+------------+----------------------------------------------+-------------------------------------------+
    | 7     | reltime    | Time from date of transplant to relapse      | Years                                     |
    |       |            | or last follow-up                            |                                           |
    +-------+------------+----------------------------------------------+-------------------------------------------+
    | 8     | agvhtime   | Time from date of transplant to acute GVHD   | Years                                     |
    |       |            | or last follow-up                            |                                           |
    +-------+------------+----------------------------------------------+-------------------------------------------+
    | 9     | cgvhtime   | Time from date of transplant to chronic GVHD | Years                                     |
    |       |            | or last follow-up                            |                                           |
    +-------+------------+----------------------------------------------+-------------------------------------------+
    | 10    | stat       | Status                                       | 1=Dead, 0=Alive                           |
    +-------+------------+----------------------------------------------+-------------------------------------------+
    | 11    | rcens      | Relapse                                      | 1=Yes, 0=No                               |
    +-------+------------+----------------------------------------------+-------------------------------------------+
    | 12    | agvh       | Acute GVHD                                   | 1=Yes, 0=No                               |
    +-------+------------+----------------------------------------------+-------------------------------------------+
    | 13    | cgvh       | Chronic GVHD                                 | 1=Yes, 0=No                               |
    +-------+------------+----------------------------------------------+-------------------------------------------+
    | 14    | stnum      | patient ID                                   |                                           |
    +-------+------------+----------------------------------------------+-------------------------------------------+

    Columns 6,7 and 9 contain the time to death, relapse and CGVHD
    calculated in years (survtime, reltime, cgvhtime) and the
    respective indicator variables are in columns 10,11 and 13 (stat,
    rcens, cgvh). The earliest time that any of these events happened
    is calculated by taking the minimum of the observed times. The
    censoring variable cens is coded as 0 when no events were
    observed, 1 if CGVHD was observed as first event, 2 if a relapse
    was observed as the first event and 3 if death occurred before
    either of the events: The endpoint (status) is therefore defined as

    +-------+-------------------------------------------+-----------------+
    | Value | Description                               | Count (%)       |
    +=======+===========================================+=================+
    | 0     | Survival (Right-censored data)            | 4 patients (4%) |
    +-------+-------------------------------------------+-----------------+
    | 1     | Chronic graft versus host disease (CGVHD) | 86 events (86%) |
    +-------+-------------------------------------------+-----------------+
    | 2     | Relapse (TRM)                             | 5 events (5%)   |
    +-------+-------------------------------------------+-----------------+
    | 3     | Death                                     | 5 events (5%)   |
    +-------+-------------------------------------------+-----------------+

    The dataset has been obtained from [1]_.

    Returns
    -------
    x : pandas.DataFrame
        The measurements for each patient.

    y : structured array with 2 fields
        *status*: Integer indicating the endpoint: 0: right-censored data; 1: CGVHD; 2: relapse; 3: death.

        *ftime*: total length of follow-up or time of event.

    References
    ----------
    .. [1] https://sites.google.com/view/melaniapintiliemscstatistics/home/statistics

    .. [2] Melania Pintilie: "Competing Risks: A Practical Perspective". John Wiley & Sons, 2006
    """
    data = loadarff(_get_data_path("cgvhd.arff"))

    # Time of whichever of death, relapse or chronic GVHD was recorded first.
    data.loc[:, "ftime"] = data[["survtime", "reltime", "cgvhtime"]].min(axis=1)
    first_time = data["ftime"]

    # An event type counts only if its time is the earliest one and its
    # indicator equals the string "1".
    cgvhd_first = (first_time == data["cgvhtime"]) & (data["cgvh"] == "1")
    relapse_first = (first_time == data["reltime"]) & (data["rcens"] == "1")
    death_first = (first_time == data["survtime"]) & (data["stat"] == "1")

    # Weight the masks by their event code; rows matching no mask stay 0.
    data.loc[:, "status"] = (
        cgvhd_first.astype(int) + 2 * relapse_first.astype(int) + 3 * death_first.astype(int)
    )

    selected = data[["ftime", "status", "dx", "tx", "extent", "age"]]
    return get_x_y(selected, attr_labels=["status", "ftime"], competing_risks=True)
|