scikit-survival 0.23.1 (scikit_survival-0.23.1-cp313-cp313-macosx_11_0_arm64.whl)

This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between package versions as they appear in the public registry.
Files changed (55)
  1. scikit_survival-0.23.1.dist-info/COPYING +674 -0
  2. scikit_survival-0.23.1.dist-info/METADATA +888 -0
  3. scikit_survival-0.23.1.dist-info/RECORD +55 -0
  4. scikit_survival-0.23.1.dist-info/WHEEL +5 -0
  5. scikit_survival-0.23.1.dist-info/top_level.txt +1 -0
  6. sksurv/__init__.py +138 -0
  7. sksurv/base.py +103 -0
  8. sksurv/bintrees/__init__.py +15 -0
  9. sksurv/bintrees/_binarytrees.cpython-313-darwin.so +0 -0
  10. sksurv/column.py +201 -0
  11. sksurv/compare.py +123 -0
  12. sksurv/datasets/__init__.py +10 -0
  13. sksurv/datasets/base.py +436 -0
  14. sksurv/datasets/data/GBSG2.arff +700 -0
  15. sksurv/datasets/data/actg320.arff +1169 -0
  16. sksurv/datasets/data/breast_cancer_GSE7390-metastasis.arff +283 -0
  17. sksurv/datasets/data/flchain.arff +7887 -0
  18. sksurv/datasets/data/veteran.arff +148 -0
  19. sksurv/datasets/data/whas500.arff +520 -0
  20. sksurv/ensemble/__init__.py +2 -0
  21. sksurv/ensemble/_coxph_loss.cpython-313-darwin.so +0 -0
  22. sksurv/ensemble/boosting.py +1610 -0
  23. sksurv/ensemble/forest.py +947 -0
  24. sksurv/ensemble/survival_loss.py +151 -0
  25. sksurv/exceptions.py +18 -0
  26. sksurv/functions.py +114 -0
  27. sksurv/io/__init__.py +2 -0
  28. sksurv/io/arffread.py +58 -0
  29. sksurv/io/arffwrite.py +145 -0
  30. sksurv/kernels/__init__.py +1 -0
  31. sksurv/kernels/_clinical_kernel.cpython-313-darwin.so +0 -0
  32. sksurv/kernels/clinical.py +328 -0
  33. sksurv/linear_model/__init__.py +3 -0
  34. sksurv/linear_model/_coxnet.cpython-313-darwin.so +0 -0
  35. sksurv/linear_model/aft.py +205 -0
  36. sksurv/linear_model/coxnet.py +543 -0
  37. sksurv/linear_model/coxph.py +618 -0
  38. sksurv/meta/__init__.py +4 -0
  39. sksurv/meta/base.py +35 -0
  40. sksurv/meta/ensemble_selection.py +642 -0
  41. sksurv/meta/stacking.py +349 -0
  42. sksurv/metrics.py +996 -0
  43. sksurv/nonparametric.py +588 -0
  44. sksurv/preprocessing.py +155 -0
  45. sksurv/svm/__init__.py +11 -0
  46. sksurv/svm/_minlip.cpython-313-darwin.so +0 -0
  47. sksurv/svm/_prsvm.cpython-313-darwin.so +0 -0
  48. sksurv/svm/minlip.py +606 -0
  49. sksurv/svm/naive_survival_svm.py +221 -0
  50. sksurv/svm/survival_svm.py +1228 -0
  51. sksurv/testing.py +108 -0
  52. sksurv/tree/__init__.py +1 -0
  53. sksurv/tree/_criterion.cpython-313-darwin.so +0 -0
  54. sksurv/tree/tree.py +703 -0
  55. sksurv/util.py +333 -0
sksurv/kernels/clinical.py
@@ -0,0 +1,328 @@
+ # This program is free software: you can redistribute it and/or modify
+ # it under the terms of the GNU General Public License as published by
+ # the Free Software Foundation, either version 3 of the License, or
+ # (at your option) any later version.
+ #
+ # This program is distributed in the hope that it will be useful,
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ # GNU General Public License for more details.
+ #
+ # You should have received a copy of the GNU General Public License
+ # along with this program. If not, see <http://www.gnu.org/licenses/>.
+ import numpy as np
+ import pandas as pd
+ from pandas.api.types import CategoricalDtype, is_numeric_dtype
+ from sklearn.base import BaseEstimator, TransformerMixin
+ from sklearn.utils.validation import check_is_fitted
+
+ from ._clinical_kernel import (
+     continuous_ordinal_kernel,
+     continuous_ordinal_kernel_with_ranges,
+     pairwise_continuous_ordinal_kernel,
+     pairwise_nominal_kernel,
+ )
+
+ __all__ = ["clinical_kernel", "ClinicalKernelTransform"]
+
+
+ def _nominal_kernel(x, y, out):
+     """Number of features that match exactly"""
+     for i in range(x.shape[0]):
+         for j in range(y.shape[0]):
+             out[i, j] += (x[i, :] == y[j, :]).sum()
+
+     return out
+
+
+ def _get_continuous_and_ordinal_array(x):
+     """Convert array from continuous and ordered categorical columns"""
+     nominal_columns = x.select_dtypes(include=["object", "category"]).columns
+     ordinal_columns = pd.Index([v for v in nominal_columns if x[v].cat.ordered])
+     continuous_columns = x.select_dtypes(include=[np.number]).columns
+
+     x_num = x.loc[:, continuous_columns].astype(np.float64).values
+     if len(ordinal_columns) > 0:
+         x = _ordinal_as_numeric(x, ordinal_columns)
+
+         nominal_columns = nominal_columns.difference(ordinal_columns)
+         x_out = np.column_stack((x_num, x))
+     else:
+         x_out = x_num
+
+     return x_out, nominal_columns
+
+
+ def _ordinal_as_numeric(x, ordinal_columns):
+     x_numeric = np.empty((x.shape[0], len(ordinal_columns)), dtype=np.float64)
+
+     for i, c in enumerate(ordinal_columns):
+         x_numeric[:, i] = x[c].cat.codes
+     return x_numeric
+
+
+ def clinical_kernel(x, y=None):
+     """Computes the clinical kernel.
+
+     The clinical kernel distinguishes between continuous,
+     ordinal, and nominal variables.
+
+     See [1]_ for further description.
+
+     Parameters
+     ----------
+     x : pandas.DataFrame, shape = (n_samples_x, n_features)
+         Training data
+
+     y : pandas.DataFrame, shape = (n_samples_y, n_features)
+         Testing data
+
+     Returns
+     -------
+     kernel : array, shape = (n_samples_x, n_samples_y)
+         Kernel matrix. Values are normalized to lie within [0, 1].
+
+     References
+     ----------
+     .. [1] Daemen, A., De Moor, B.,
+            "Development of a kernel function for clinical data".
+            Annual International Conference of the IEEE Engineering in Medicine and Biology Society, 5913-7, 2009
+     """
+     if y is not None:
+         if x.shape[1] != y.shape[1]:
+             raise ValueError("x and y have different number of features")
+         if not x.columns.equals(y.columns):
+             raise ValueError("columns do not match")
+     else:
+         y = x
+
+     mat = np.zeros((x.shape[0], y.shape[0]), dtype=float)
+
+     x_numeric, nominal_columns = _get_continuous_and_ordinal_array(x)
+     if id(x) != id(y):
+         y_numeric, _ = _get_continuous_and_ordinal_array(y)
+     else:
+         y_numeric = x_numeric
+
+     continuous_ordinal_kernel(x_numeric, y_numeric, mat)
+     _nominal_kernel(x.loc[:, nominal_columns].values, y.loc[:, nominal_columns].values, mat)
+     mat /= x.shape[1]
+     return mat
+
+
+ class ClinicalKernelTransform(BaseEstimator, TransformerMixin):
+     """Transform data using a clinical kernel.
+
+     The clinical kernel distinguishes between continuous,
+     ordinal, and nominal variables.
+
+     See [1]_ for further description.
+
+     Parameters
+     ----------
+     fit_once : bool, optional
+         If set to ``True``, ``fit()`` only stores the training data, but does not
+         update the kernel's internal state; you should call ``prepare()`` once
+         before calling ``transform()``. If set to ``False``, it behaves like a
+         regular estimator, i.e., you need to call ``fit()`` before ``transform()``.
+
+     Attributes
+     ----------
+     n_features_in_ : int
+         Number of features seen during ``fit``.
+
+     feature_names_in_ : ndarray of shape (`n_features_in_`,)
+         Names of features seen during ``fit``. Defined only when `X`
+         has feature names that are all strings.
+
+     References
+     ----------
+     .. [1] Daemen, A., De Moor, B.,
+            "Development of a kernel function for clinical data".
+            Annual International Conference of the IEEE Engineering in Medicine and Biology Society, 5913-7, 2009
+     """
+
+     def __init__(self, *, fit_once=False, _numeric_ranges=None, _numeric_columns=None, _nominal_columns=None):
+         self.fit_once = fit_once
+
+         self._numeric_ranges = _numeric_ranges
+         self._numeric_columns = _numeric_columns
+         self._nominal_columns = _nominal_columns
+
+     def prepare(self, X):
+         """Determine transformation parameters from data in X.
+
+         Use if `fit_once` is `True`, in which case `fit()` does
+         not set the parameters of the clinical kernel.
+
+         Parameters
+         ----------
+         X : pandas.DataFrame, shape = (n_samples, n_features)
+             Data to estimate parameters from.
+         """
+         if not self.fit_once:
+             raise ValueError("prepare can only be used if fit_once parameter is set to True")
+
+         self._prepare_by_column_dtype(X)
+
+     def _prepare_by_column_dtype(self, X):
+         """Get distance functions for each column's dtype"""
+         if not isinstance(X, pd.DataFrame):
+             raise TypeError("X must be a pandas DataFrame")
+
+         numeric_columns = []
+         nominal_columns = []
+         numeric_ranges = []
+
+         fit_data = np.empty(X.shape, dtype=np.float64)
+
+         for i, dt in enumerate(X.dtypes):
+             col = X.iloc[:, i]
+             if isinstance(dt, CategoricalDtype):
+                 if col.cat.ordered:
+                     numeric_ranges.append(col.cat.codes.max() - col.cat.codes.min())
+                     numeric_columns.append(i)
+                 else:
+                     nominal_columns.append(i)
+
+                 col = col.cat.codes
+             elif is_numeric_dtype(dt):
+                 numeric_ranges.append(col.max() - col.min())
+                 numeric_columns.append(i)
+             else:
+                 raise TypeError(f"unsupported dtype: {dt!r}")
+
+             fit_data[:, i] = col.values
+
+         self._numeric_columns = np.asarray(numeric_columns)
+         self._nominal_columns = np.asarray(nominal_columns)
+         self._numeric_ranges = np.asarray(numeric_ranges, dtype=float)
+         self.X_fit_ = fit_data
+
+     def fit(self, X, y=None, **kwargs):  # pylint: disable=unused-argument
+         """Determine transformation parameters from data in X.
+
+         Subsequent calls to `transform(Y)` compute the pairwise
+         distance to `X`.
+         Parameters of the clinical kernel are only updated
+         if `fit_once` is `False`, otherwise you have to
+         explicitly call `prepare()` once.
+
+         Parameters
+         ----------
+         X : pandas.DataFrame, shape = (n_samples, n_features)
+             Data to estimate parameters from.
+
+         y : None
+             Argument is ignored (included for compatibility reasons).
+
+         kwargs : dict
+             Argument is ignored (included for compatibility reasons).
+
+         Returns
+         -------
+         self : object
+             Returns the instance itself.
+         """
+         if X.ndim != 2:
+             raise ValueError(f"expected 2d array, but got {X.ndim}")
+
+         self._check_feature_names(X, reset=True)
+         self._check_n_features(X, reset=True)
+
+         if self.fit_once:
+             self.X_fit_ = X
+         else:
+             self._prepare_by_column_dtype(X)
+
+         return self
+
+     def transform(self, Y):
+         r"""Compute all pairwise distances between `self.X_fit_` and `Y`.
+
+         Parameters
+         ----------
+         Y : array-like, shape = (n_samples_y, n_features)
+
+         Returns
+         -------
+         kernel : ndarray, shape = (n_samples_y, n_samples_X_fit\_)
+             Kernel matrix. Values are normalized to lie within [0, 1].
+         """
+         check_is_fitted(self, "X_fit_")
+
+         self._check_feature_names(Y, reset=False)
+         self._check_n_features(Y, reset=False)
+
+         n_samples_x = self.X_fit_.shape[0]
+
+         Y = np.asarray(Y)
+
+         n_samples_y = Y.shape[0]
+
+         mat = np.zeros((n_samples_y, n_samples_x), dtype=float)
+
+         continuous_ordinal_kernel_with_ranges(
+             Y[:, self._numeric_columns].astype(np.float64),
+             self.X_fit_[:, self._numeric_columns].astype(np.float64),
+             self._numeric_ranges,
+             mat,
+         )
+
+         if len(self._nominal_columns) > 0:
+             _nominal_kernel(Y[:, self._nominal_columns], self.X_fit_[:, self._nominal_columns], mat)
+
+         mat /= self.n_features_in_
+
+         return mat
+
+     def __call__(self, X, Y):
+         """Compute kernel matrix between `X` and `Y`.
+
+         Parameters
+         ----------
+         X : array-like, shape = (n_samples_x, n_features)
+             Training data
+
+         Y : array-like, shape = (n_samples_y, n_features)
+             Testing data
+
+         Returns
+         -------
+         kernel : ndarray, shape = (n_samples_x, n_samples_y)
+             Kernel matrix. Values are normalized to lie within [0, 1].
+         """
+         return self.fit(X).transform(Y).T
+
+     def pairwise_kernel(self, X, Y):
+         """Function to use with :func:`sklearn.metrics.pairwise.pairwise_kernels`
+
+         Parameters
+         ----------
+         X : array, shape = (n_features,)
+
+         Y : array, shape = (n_features,)
+
+         Returns
+         -------
+         similarity : float
+             Similarities are normalized to be within [0, 1]
+         """
+         check_is_fitted(self, "X_fit_")
+         if X.shape[0] != Y.shape[0]:
+             raise ValueError(
+                 f"Incompatible dimension for X and Y matrices: X.shape[0] == {X.shape[0]} "
+                 f"while Y.shape[0] == {Y.shape[0]}"
+             )
+
+         val = pairwise_continuous_ordinal_kernel(
+             X[self._numeric_columns], Y[self._numeric_columns], self._numeric_ranges
+         )
+         if len(self._nominal_columns) > 0:
+             val += pairwise_nominal_kernel(
+                 X[self._nominal_columns].astype(np.int8), Y[self._nominal_columns].astype(np.int8)
+             )
+
+         val /= X.shape[0]
+
+         return val
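
For orientation, here is a minimal usage sketch of the two public entry points defined above, `clinical_kernel` and `ClinicalKernelTransform`. The toy DataFrame, its column names, and its values are invented for illustration and are not part of the package:

    import pandas as pd
    from sklearn.metrics.pairwise import pairwise_kernels
    from sksurv.kernels import ClinicalKernelTransform, clinical_kernel

    # Toy mixed-type clinical data (values invented for illustration).
    data = pd.DataFrame({
        "age": [54.0, 61.0, 47.0],                    # continuous
        "stage": pd.Categorical(["I", "II", "II"],
                                categories=["I", "II", "III"],
                                ordered=True),        # ordinal
        "sex": pd.Categorical(["F", "M", "F"]),       # nominal
    })

    # Kernel matrix over all pairs of samples; entries lie in [0, 1].
    K = clinical_kernel(data)  # shape (3, 3)

    # With fit_once=True, prepare() encodes the columns once; pairwise_kernel
    # can then be passed as a callable metric to sklearn's pairwise_kernels.
    trans = ClinicalKernelTransform(fit_once=True)
    trans.prepare(data)
    K2 = pairwise_kernels(trans.X_fit_, metric=trans.pairwise_kernel)
    # K2 should match K (same kernel, computed entry by entry).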
sksurv/linear_model/__init__.py
@@ -0,0 +1,3 @@
+ from .aft import IPCRidge  # noqa: F401
+ from .coxnet import CoxnetSurvivalAnalysis  # noqa: F401
+ from .coxph import CoxPHSurvivalAnalysis  # noqa: F401
sksurv/linear_model/aft.py
@@ -0,0 +1,205 @@
+ # This program is free software: you can redistribute it and/or modify
+ # it under the terms of the GNU General Public License as published by
+ # the Free Software Foundation, either version 3 of the License, or
+ # (at your option) any later version.
+ #
+ # This program is distributed in the hope that it will be useful,
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ # GNU General Public License for more details.
+ #
+ # You should have received a copy of the GNU General Public License
+ # along with this program. If not, see <http://www.gnu.org/licenses/>.
+ import numpy as np
+ from sklearn.linear_model import Ridge
+
+ from ..base import SurvivalAnalysisMixin
+ from ..nonparametric import ipc_weights
+ from ..util import check_array_survival
+
+
+ class IPCRidge(Ridge, SurvivalAnalysisMixin):
+     """Accelerated failure time model with inverse probability of censoring weights.
+
+     This model assumes a regression model of the form
+
+     .. math::
+
+         \\log y = \\beta_0 + \\mathbf{X} \\beta + \\epsilon
+
+     L2-shrinkage is applied to the coefficients :math:`\\beta` and
+     each sample is weighted by the inverse probability of censoring
+     to account for right censoring (under the assumption that
+     censoring is independent of the features, i.e., random censoring).
+
+     See [1]_ for further description.
+
+     Parameters
+     ----------
+     alpha : float, optional, default: 1.0
+         Small positive values of alpha improve the conditioning of the problem
+         and reduce the variance of the estimates.
+         `alpha` must be a non-negative float i.e. in `[0, inf)`.
+
+         For numerical reasons, using `alpha = 0` is not advised.
+
+     fit_intercept : bool, default: True
+         Whether to fit the intercept for this model. If set
+         to false, no intercept will be used in calculations
+         (i.e. ``X`` and ``y`` are expected to be centered).
+
+     copy_X : bool, default: True
+         If True, X will be copied; else, it may be overwritten.
+
+     max_iter : int, default: None
+         Maximum number of iterations for conjugate gradient solver.
+         For 'sparse_cg' and 'lsqr' solvers, the default value is determined
+         by scipy.sparse.linalg. For 'sag' solver, the default value is 1000.
+         For 'lbfgs' solver, the default value is 15000.
+
+     tol : float, default: 1e-3
+         Precision of the solution. Note that `tol` has no effect for solvers 'svd' and
+         'cholesky'.
+
+     solver : {'auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', \
+             'sag', 'saga', 'lbfgs'}, default: 'auto'
+         Solver to use in the computational routines:
+
+         - 'auto' chooses the solver automatically based on the type of data.
+
+         - 'svd' uses a Singular Value Decomposition of X to compute the Ridge
+           coefficients. It is the most stable solver, in particular more stable
+           for singular matrices than 'cholesky' at the cost of being slower.
+
+         - 'cholesky' uses the standard scipy.linalg.solve function to
+           obtain a closed-form solution.
+
+         - 'sparse_cg' uses the conjugate gradient solver as found in
+           scipy.sparse.linalg.cg. As an iterative algorithm, this solver is
+           more appropriate than 'cholesky' for large-scale data
+           (possibility to set `tol` and `max_iter`).
+
+         - 'lsqr' uses the dedicated regularized least-squares routine
+           scipy.sparse.linalg.lsqr. It is the fastest and uses an iterative
+           procedure.
+
+         - 'sag' uses a Stochastic Average Gradient descent, and 'saga' uses
+           its improved, unbiased version named SAGA. Both methods also use an
+           iterative procedure, and are often faster than other solvers when
+           both n_samples and n_features are large. Note that 'sag' and
+           'saga' fast convergence is only guaranteed on features with
+           approximately the same scale. You can preprocess the data with a
+           scaler from sklearn.preprocessing.
+
+         - 'lbfgs' uses L-BFGS-B algorithm implemented in
+           `scipy.optimize.minimize`. It can be used only when `positive`
+           is True.
+
+         All solvers except 'svd' support both dense and sparse data. However, only
+         'lsqr', 'sag', 'sparse_cg', and 'lbfgs' support sparse input when
+         `fit_intercept` is True.
+
+     positive : bool, default: False
+         When set to ``True``, forces the coefficients to be positive.
+         Only 'lbfgs' solver is supported in this case.
+
+     random_state : int, RandomState instance, default: None
+         Used when ``solver`` == 'sag' or 'saga' to shuffle the data.
+
+     Attributes
+     ----------
+     coef_ : ndarray, shape = (n_features,)
+         Weight vector.
+
+     intercept_ : float or ndarray of shape (n_targets,)
+         Independent term in decision function. Set to 0.0 if
+         ``fit_intercept = False``.
+
+     n_iter_ : None or ndarray of shape (n_targets,)
+         Actual number of iterations for each target. Available only for
+         sag and lsqr solvers. Other solvers will return None.
+
+     n_features_in_ : int
+         Number of features seen during ``fit``.
+
+     feature_names_in_ : ndarray of shape (`n_features_in_`,)
+         Names of features seen during ``fit``. Defined only when `X`
+         has feature names that are all strings.
+
+     References
+     ----------
+     .. [1] W. Stute, "Consistent estimation under random censorship when covariables are
+            present", Journal of Multivariate Analysis, vol. 45, no. 1, pp. 89-103, 1993.
+            doi:10.1006/jmva.1993.1028.
+     """
+
+     _parameter_constraints = {**Ridge._parameter_constraints}
+
+     def __init__(
+         self,
+         alpha=1.0,
+         *,
+         fit_intercept=True,
+         copy_X=True,
+         max_iter=None,
+         tol=1e-3,
+         solver="auto",
+         positive=False,
+         random_state=None,
+     ):
+         super().__init__(
+             alpha=alpha,
+             fit_intercept=fit_intercept,
+             copy_X=copy_X,
+             max_iter=max_iter,
+             tol=tol,
+             solver=solver,
+             positive=positive,
+             random_state=random_state,
+         )
+
+     @property
+     def _predict_risk_score(self):
+         return False
+
+     def fit(self, X, y):
+         """Build an accelerated failure time model.
+
+         Parameters
+         ----------
+         X : array-like, shape = (n_samples, n_features)
+             Data matrix.
+
+         y : structured array, shape = (n_samples,)
+             A structured array containing the binary event indicator
+             as first field, and time of event or time of censoring as
+             second field.
+
+         Returns
+         -------
+         self
+         """
+         event, time = check_array_survival(X, y)
+
+         weights = ipc_weights(event, time)
+         super().fit(X, np.log(time), sample_weight=weights)
+
+         return self
+
+     def predict(self, X):
+         """Predict using the linear accelerated failure time model.
+
+         Parameters
+         ----------
+         X : {array-like, sparse matrix}, shape = (n_samples, n_features)
+             Samples.
+
+         Returns
+         -------
+         C : array, shape = (n_samples,)
+             Returns predicted values on original scale (NOT log scale).
+         """
+         return np.exp(super().predict(X))
+
+     def score(self, X, y, sample_weight=None):
+         return SurvivalAnalysisMixin.score(self, X, y)
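
For orientation, a minimal end-to-end sketch of `IPCRidge`. It uses the whas500 dataset bundled with the package (see the file listing above) and drops its categorical columns for simplicity; the dataset choice and the `alpha` value are illustrative only:

    from sksurv.datasets import load_whas500
    from sksurv.linear_model import IPCRidge

    # Bundled example data; keep only numeric columns so Ridge can consume them.
    X, y = load_whas500()
    X = X.select_dtypes("number")

    model = IPCRidge(alpha=1.0)
    model.fit(X, y)  # fits on log(time) with inverse-probability-of-censoring weights

    # Predictions are on the original time scale (not the log scale).
    predicted_time = model.predict(X)

    # score() delegates to SurvivalAnalysisMixin and reports the concordance index.
    cindex = model.score(X, y)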