scikit-survival 0.25.0__cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (58) hide show
  1. scikit_survival-0.25.0.dist-info/METADATA +185 -0
  2. scikit_survival-0.25.0.dist-info/RECORD +58 -0
  3. scikit_survival-0.25.0.dist-info/WHEEL +6 -0
  4. scikit_survival-0.25.0.dist-info/licenses/COPYING +674 -0
  5. scikit_survival-0.25.0.dist-info/top_level.txt +1 -0
  6. sksurv/__init__.py +183 -0
  7. sksurv/base.py +115 -0
  8. sksurv/bintrees/__init__.py +15 -0
  9. sksurv/bintrees/_binarytrees.cpython-313-x86_64-linux-gnu.so +0 -0
  10. sksurv/column.py +205 -0
  11. sksurv/compare.py +123 -0
  12. sksurv/datasets/__init__.py +12 -0
  13. sksurv/datasets/base.py +614 -0
  14. sksurv/datasets/data/GBSG2.arff +700 -0
  15. sksurv/datasets/data/actg320.arff +1169 -0
  16. sksurv/datasets/data/bmt.arff +46 -0
  17. sksurv/datasets/data/breast_cancer_GSE7390-metastasis.arff +283 -0
  18. sksurv/datasets/data/cgvhd.arff +118 -0
  19. sksurv/datasets/data/flchain.arff +7887 -0
  20. sksurv/datasets/data/veteran.arff +148 -0
  21. sksurv/datasets/data/whas500.arff +520 -0
  22. sksurv/docstrings.py +99 -0
  23. sksurv/ensemble/__init__.py +2 -0
  24. sksurv/ensemble/_coxph_loss.cpython-313-x86_64-linux-gnu.so +0 -0
  25. sksurv/ensemble/boosting.py +1564 -0
  26. sksurv/ensemble/forest.py +902 -0
  27. sksurv/ensemble/survival_loss.py +151 -0
  28. sksurv/exceptions.py +18 -0
  29. sksurv/functions.py +114 -0
  30. sksurv/io/__init__.py +2 -0
  31. sksurv/io/arffread.py +89 -0
  32. sksurv/io/arffwrite.py +181 -0
  33. sksurv/kernels/__init__.py +1 -0
  34. sksurv/kernels/_clinical_kernel.cpython-313-x86_64-linux-gnu.so +0 -0
  35. sksurv/kernels/clinical.py +348 -0
  36. sksurv/linear_model/__init__.py +3 -0
  37. sksurv/linear_model/_coxnet.cpython-313-x86_64-linux-gnu.so +0 -0
  38. sksurv/linear_model/aft.py +208 -0
  39. sksurv/linear_model/coxnet.py +592 -0
  40. sksurv/linear_model/coxph.py +637 -0
  41. sksurv/meta/__init__.py +4 -0
  42. sksurv/meta/base.py +35 -0
  43. sksurv/meta/ensemble_selection.py +724 -0
  44. sksurv/meta/stacking.py +370 -0
  45. sksurv/metrics.py +1028 -0
  46. sksurv/nonparametric.py +911 -0
  47. sksurv/preprocessing.py +183 -0
  48. sksurv/svm/__init__.py +11 -0
  49. sksurv/svm/_minlip.cpython-313-x86_64-linux-gnu.so +0 -0
  50. sksurv/svm/_prsvm.cpython-313-x86_64-linux-gnu.so +0 -0
  51. sksurv/svm/minlip.py +690 -0
  52. sksurv/svm/naive_survival_svm.py +249 -0
  53. sksurv/svm/survival_svm.py +1236 -0
  54. sksurv/testing.py +108 -0
  55. sksurv/tree/__init__.py +1 -0
  56. sksurv/tree/_criterion.cpython-313-x86_64-linux-gnu.so +0 -0
  57. sksurv/tree/tree.py +790 -0
  58. sksurv/util.py +415 -0
sksurv/util.py ADDED
@@ -0,0 +1,415 @@
1
+ # This program is free software: you can redistribute it and/or modify
2
+ # it under the terms of the GNU General Public License as published by
3
+ # the Free Software Foundation, either version 3 of the License, or
4
+ # (at your option) any later version.
5
+ #
6
+ # This program is distributed in the hope that it will be useful,
7
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
8
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
9
+ # GNU General Public License for more details.
10
+ #
11
+ # You should have received a copy of the GNU General Public License
12
+ # along with this program. If not, see <http://www.gnu.org/licenses/>.
13
+ import numpy as np
14
+ import pandas as pd
15
+ from pandas.api.types import CategoricalDtype
16
+ from sklearn.utils.validation import check_array, check_consistent_length
17
+
18
# Public API of this module, as seen by ``from sksurv.util import *``.
__all__ = ["check_array_survival", "check_y_survival", "safe_concat", "Surv"]
19
+
20
+
21
class Surv:
    """Factory for structured outcome arrays used in survival analysis.

    The static methods on this class build a :class:`numpy.ndarray` with two
    fields — a boolean event indicator and a float observed time — which is
    the recommended format for the ``y`` argument of scikit-survival's
    estimators.
    """

    @staticmethod
    def from_arrays(event, time, name_event=None, name_time=None):
        """Create a structured array from separate event and time arrays.

        Parameters
        ----------
        event : array-like, shape=(n_samples,)
            Event indicator. Either a boolean array, or an array containing
            only the values 0 and 1, where ``True``/1 marks an event and
            ``False``/0 marks right-censoring.
        time : array-like, shape=(n_samples,)
            Observed time: time of the event or time of censoring.
        name_event : str, optional, default: 'event'
            Field name for the event indicator in the structured array.
        name_time : str, optional, default: 'time'
            Field name for the observed time in the structured array.

        Returns
        -------
        y : numpy.ndarray
            Structured array with two fields: a boolean event indicator
            followed by a float time. Field names are taken from
            `name_event` and `name_time`.

        Examples
        --------
        >>> from sksurv.util import Surv
        >>>
        >>> y = Surv.from_arrays(event=[True, False, True],
        ...                      time=[10, 25, 15])
        >>> y
        array([( True, 10.), (False, 25.), ( True, 15.)],
              dtype=[('event', '?'), ('time', '<f8')])
        >>> y['event']
        array([ True, False,  True])
        >>> y['time']
        array([10., 25., 15.])
        """
        field_event = name_event or "event"
        field_time = name_time or "time"
        # The two fields of a structured dtype must have distinct names.
        if field_time == field_event:
            raise ValueError("name_time must be different from name_event")

        times = np.asanyarray(time, dtype=float)
        result = np.empty(times.shape[0], dtype=[(field_event, bool), (field_time, float)])
        result[field_time] = times

        events = np.asanyarray(event)
        check_consistent_length(times, events)

        if np.issubdtype(events.dtype, np.bool_):
            result[field_event] = events
            return result

        # Non-boolean input is only accepted if it encodes exactly {0, 1}.
        distinct = np.unique(events)
        distinct.sort()
        if len(distinct) != 2:
            raise ValueError("event indicator must be binary")
        if not np.all(distinct == np.array([0, 1], dtype=distinct.dtype)):
            raise ValueError("non-boolean event indicator must contain 0 and 1 only")
        result[field_event] = events.astype(bool)
        return result

    @staticmethod
    def from_dataframe(event, time, data):
        """Create a structured array from two columns of a pandas DataFrame.

        Parameters
        ----------
        event : str
            Name of the column in ``data`` holding the event indicator.
            The column must be boolean or contain only the values 0/1,
            where ``True``/1 marks an event and ``False``/0 marks
            right-censoring.
        time : str
            Name of the column in ``data`` holding the observed time
            (time of event or time of censoring).
        data : pandas.DataFrame
            DataFrame providing the event and time columns.

        Returns
        -------
        y : numpy.ndarray
            Structured array with a boolean event field and a float time
            field, named after the respective columns.

        Examples
        --------
        >>> import pandas as pd
        >>> from sksurv.util import Surv
        >>>
        >>> df = pd.DataFrame({
        ...     'status': [True, False, True],
        ...     'followup_time': [10, 25, 15],
        ... })
        >>> y = Surv.from_dataframe(
        ...     event='status', time='followup_time', data=df,
        ... )
        >>> y
        array([( True, 10.), (False, 25.), ( True, 15.)],
              dtype=[('status', '?'), ('followup_time', '<f8')])
        >>> y['status']
        array([ True, False,  True])
        >>> y['followup_time']
        array([10., 25., 15.])
        """
        if not isinstance(data, pd.DataFrame):
            raise TypeError(f"expected pandas.DataFrame, but got {type(data)!r}")

        event_values = data.loc[:, event].values
        time_values = data.loc[:, time].values
        return Surv.from_arrays(
            event_values,
            time_values,
            name_event=str(event),
            name_time=str(time),
        )
147
+
148
+
149
def check_y_survival(y_or_event, *args, allow_all_censored=False, allow_time_zero=True, competing_risks=False):
    """Check that array correctly represents an outcome for survival analysis.

    Parameters
    ----------
    y_or_event : structured array with two fields, or boolean array
        If a structured array, it must contain the binary event indicator
        as first field, and time of event or time of censoring as
        second field. Otherwise, it is assumed that a boolean array
        representing the event indicator is passed.
        If `competing_risks` is `True`, it should be a non-negative valued
        integer array, and every risk must appear at least once.

    *args : list of array-likes
        Any number of array-like objects representing time information.
        Elements that are `None` are passed along in the return value.

    allow_all_censored : bool, optional, default: False
        Whether to allow all events to be censored.

    allow_time_zero : bool, optional, default: True
        Whether to allow event times to be zero.

    competing_risks : bool, optional, default: False
        Whether there are multiple risks. (See y_or_event)

    Returns
    -------
    event : array, shape=[n_samples,], dtype=bool
        Binary event indicator.

    time : array, shape=[n_samples,], dtype=float
        Time of event or censoring.
    """
    if args:
        # Event indicator and time arrays were passed separately.
        event_candidate = np.asanyarray(y_or_event)
        time_arrays = args
    else:
        # A single structured array holds both event and time.
        y = y_or_event
        is_structured = isinstance(y, np.ndarray) and y.dtype.fields is not None and len(y.dtype.fields) == 2
        if not is_structured:
            raise ValueError(
                "y must be a structured array with the first field"
                " being a binary class event indicator and the second field"
                " the time of the event/censoring"
            )
        event_field, time_field = y.dtype.names
        event_candidate = y[event_field]
        time_arrays = (y[time_field],)

    event = check_array(event_candidate, ensure_2d=False)
    check_event_dtype(event, competing_risks)
    if competing_risks:
        # Every risk label from 1..max must occur at least once (0 = censored).
        expected_risks = range(1, np.max(event) + 1)
        if not np.all(np.isin(expected_risks, event)):
            raise ValueError("Some risks do not appear in the event array.")

    if not (allow_all_censored or np.any(event)):
        raise ValueError("all samples are censored")

    checked = [event]
    for position, times in enumerate(time_arrays):
        # None entries are forwarded untouched.
        if times is None:
            checked.append(times)
            continue

        times = check_array(times, ensure_2d=False)
        if not np.issubdtype(times.dtype, np.number):
            raise ValueError(f"time must be numeric, but found {times.dtype} for argument {position + 2}")

        if allow_time_zero:
            invalid = times < 0
            message = "observed time contains values smaller zero"
        else:
            invalid = times <= 0
            message = "observed time contains values smaller or equal to zero"
        if np.any(invalid):
            raise ValueError(message)

        checked.append(times)

    return tuple(checked)
230
+
231
+
232
def check_event_dtype(event, competing_risks=False):
    """Validate the dtype of an event indicator array.

    Single-event survival analysis requires a boolean event indicator,
    whereas competing risks analysis requires a non-negative integer
    indicator (0 meaning censored).

    Parameters
    ----------
    event : ndarray, shape=(n_samples,), dtype=bool | int
        Array containing the event indicator.

    competing_risks : bool, optional, default: False
        Whether `event` is for a competing risks analysis.
    """
    if not competing_risks:
        if not np.issubdtype(event.dtype, np.bool_):
            raise ValueError(f"elements of event indicator must be boolean, but found {event.dtype}")
        return

    if not np.issubdtype(event.dtype, np.integer):
        raise ValueError(f"Elements of event indicator must be integer, but found {event.dtype}")
    if np.any(event < 0):
        raise ValueError("Elements of event indicator must be non-negative")
256
+
257
+
258
def check_array_survival(X, y, **kwargs):
    """Check that feature matrix and survival outcome have consistent lengths.

    Parameters
    ----------
    X : array-like
        Data matrix containing feature vectors.

    y : structured array with two fields
        A structured array containing the binary event indicator
        as first field, and time of event or time of censoring as
        second field.

    kwargs : dict
        Additional arguments passed to :func:`check_y_survival`.

    Returns
    -------
    event : array, shape=[n_samples,], dtype=bool
        Binary event indicator.

    time : array, shape=[n_samples,], dtype=float
        Time of event or censoring.
    """
    # Validate the outcome first, then ensure X matches its length.
    checked = check_y_survival(y, **kwargs)
    check_consistent_length(X, *checked)
    return checked
285
+
286
+
287
+ def safe_concat(objs, *args, **kwargs):
288
+ """Alternative to :func:`pandas.concat` that preserves categorical variables.
289
+
290
+ Parameters
291
+ ----------
292
+ objs : a sequence or mapping of Series, DataFrame, or Panel objects
293
+ If a dict is passed, the sorted keys will be used as the `keys`
294
+ argument, unless it is passed, in which case the values will be
295
+ selected (see below). Any None objects will be dropped silently unless
296
+ they are all None in which case a ValueError will be raised
297
+ axis : {0, 1, ...}, default 0
298
+ The axis to concatenate along
299
+ join : {'inner', 'outer'}, default 'outer'
300
+ How to handle indexes on other axis(es)
301
+ join_axes : list of Index objects
302
+ Specific indexes to use for the other n - 1 axes instead of performing
303
+ inner/outer set logic
304
+ verify_integrity : boolean, default False
305
+ Check whether the new concatenated axis contains duplicates. This can
306
+ be very expensive relative to the actual data concatenation
307
+ keys : sequence, default None
308
+ If multiple levels passed, should contain tuples. Construct
309
+ hierarchical index using the passed keys as the outermost level
310
+ levels : list of sequences, default None
311
+ Specific levels (unique values) to use for constructing a
312
+ MultiIndex. Otherwise they will be inferred from the keys
313
+ names : list, default None
314
+ Names for the levels in the resulting hierarchical index
315
+ ignore_index : boolean, default False
316
+ If True, do not use the index values along the concatenation axis. The
317
+ resulting axis will be labeled 0, ..., n - 1. This is useful if you are
318
+ concatenating objects where the concatenation axis does not have
319
+ meaningful indexing information. Note that the index values on the other
320
+ axes are still respected in the join.
321
+ copy : boolean, default True
322
+ If False, do not copy data unnecessarily
323
+
324
+ Notes
325
+ -----
326
+ The keys, levels, and names arguments are all optional
327
+
328
+ Returns
329
+ -------
330
+ concatenated : type of objects
331
+ """
332
+ axis = kwargs.pop("axis", 0)
333
+ categories = {}
334
+ for df in objs:
335
+ if isinstance(df, pd.Series):
336
+ if isinstance(df.dtype, CategoricalDtype):
337
+ categories[df.name] = {"categories": df.cat.categories, "ordered": df.cat.ordered}
338
+ else:
339
+ dfc = df.select_dtypes(include=["category"])
340
+ for name, s in dfc.items():
341
+ if name in categories:
342
+ if axis == 1:
343
+ raise ValueError(f"duplicate columns {name}")
344
+ if not categories[name]["categories"].equals(s.cat.categories):
345
+ raise ValueError(f"categories for column {name} do not match")
346
+ else:
347
+ categories[name] = {"categories": s.cat.categories, "ordered": s.cat.ordered}
348
+ df[name] = df[name].astype(object)
349
+
350
+ concatenated = pd.concat(objs, *args, axis=axis, **kwargs)
351
+
352
+ for name, params in categories.items():
353
+ concatenated[name] = pd.Categorical(concatenated[name], **params)
354
+
355
+ return concatenated
356
+
357
+
358
+ class _PropertyAvailableIfDescriptor:
359
+ """Implements a conditional property using the descriptor protocol based on the property decorator.
360
+
361
+ The corresponding class in scikit-learn (`_AvailableIfDescriptor`) only supports callables.
362
+ This class adopts the property decorator as described in the descriptor guide in the offical Python documentation.
363
+
364
+ See also
365
+ --------
366
+ https://docs.python.org/3/howto/descriptor.html
367
+ Descriptor HowTo Guide
368
+
369
+ :class:`sklearn.utils.available_if._AvailableIfDescriptor`
370
+ The original class in scikit-learn.
371
+ """
372
+
373
+ def __init__(self, check, fget, doc=None):
374
+ self.check = check
375
+ self.fget = fget
376
+ if doc is None and fget is not None:
377
+ doc = fget.__doc__
378
+ self.__doc__ = doc
379
+ self._name = ""
380
+
381
+ def __set_name__(self, owner, name):
382
+ self._name = name
383
+
384
+ def __get__(self, obj, objtype=None):
385
+ if obj is None:
386
+ return self
387
+
388
+ attr_err = AttributeError(f"This {obj!r} has no attribute {self._name!r}")
389
+ if not self.check(obj):
390
+ raise attr_err
391
+
392
+ if self.fget is None:
393
+ raise AttributeError(f"property '{self._name}' has no getter")
394
+ return self.fget(obj)
395
+
396
+
397
def property_available_if(check):
    """A property attribute that is available only if check returns a truthy value.

    Only supports getting an attribute value; setting or deleting an
    attribute value is not supported.

    Parameters
    ----------
    check : callable
        When passed the object of the decorated method, this should return
        `True` if the property attribute is available, and either return `False`
        or raise an `AttributeError` if not available.

    Returns
    -------
    callable
        Callable makes the decorated property available if `check` returns
        `True`, otherwise the decorated property is unavailable.
    """

    def decorator(fget):
        # Wrap the getter in the conditional-property descriptor.
        return _PropertyAvailableIfDescriptor(check=check, fget=fget)

    return decorator