scikit_survival-0.25.0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scikit_survival-0.25.0.dist-info/METADATA +185 -0
- scikit_survival-0.25.0.dist-info/RECORD +58 -0
- scikit_survival-0.25.0.dist-info/WHEEL +6 -0
- scikit_survival-0.25.0.dist-info/licenses/COPYING +674 -0
- scikit_survival-0.25.0.dist-info/top_level.txt +1 -0
- sksurv/__init__.py +183 -0
- sksurv/base.py +115 -0
- sksurv/bintrees/__init__.py +15 -0
- sksurv/bintrees/_binarytrees.cpython-311-x86_64-linux-gnu.so +0 -0
- sksurv/column.py +205 -0
- sksurv/compare.py +123 -0
- sksurv/datasets/__init__.py +12 -0
- sksurv/datasets/base.py +614 -0
- sksurv/datasets/data/GBSG2.arff +700 -0
- sksurv/datasets/data/actg320.arff +1169 -0
- sksurv/datasets/data/bmt.arff +46 -0
- sksurv/datasets/data/breast_cancer_GSE7390-metastasis.arff +283 -0
- sksurv/datasets/data/cgvhd.arff +118 -0
- sksurv/datasets/data/flchain.arff +7887 -0
- sksurv/datasets/data/veteran.arff +148 -0
- sksurv/datasets/data/whas500.arff +520 -0
- sksurv/docstrings.py +99 -0
- sksurv/ensemble/__init__.py +2 -0
- sksurv/ensemble/_coxph_loss.cpython-311-x86_64-linux-gnu.so +0 -0
- sksurv/ensemble/boosting.py +1564 -0
- sksurv/ensemble/forest.py +902 -0
- sksurv/ensemble/survival_loss.py +151 -0
- sksurv/exceptions.py +18 -0
- sksurv/functions.py +114 -0
- sksurv/io/__init__.py +2 -0
- sksurv/io/arffread.py +89 -0
- sksurv/io/arffwrite.py +181 -0
- sksurv/kernels/__init__.py +1 -0
- sksurv/kernels/_clinical_kernel.cpython-311-x86_64-linux-gnu.so +0 -0
- sksurv/kernels/clinical.py +348 -0
- sksurv/linear_model/__init__.py +3 -0
- sksurv/linear_model/_coxnet.cpython-311-x86_64-linux-gnu.so +0 -0
- sksurv/linear_model/aft.py +208 -0
- sksurv/linear_model/coxnet.py +592 -0
- sksurv/linear_model/coxph.py +637 -0
- sksurv/meta/__init__.py +4 -0
- sksurv/meta/base.py +35 -0
- sksurv/meta/ensemble_selection.py +724 -0
- sksurv/meta/stacking.py +370 -0
- sksurv/metrics.py +1028 -0
- sksurv/nonparametric.py +911 -0
- sksurv/preprocessing.py +183 -0
- sksurv/svm/__init__.py +11 -0
- sksurv/svm/_minlip.cpython-311-x86_64-linux-gnu.so +0 -0
- sksurv/svm/_prsvm.cpython-311-x86_64-linux-gnu.so +0 -0
- sksurv/svm/minlip.py +690 -0
- sksurv/svm/naive_survival_svm.py +249 -0
- sksurv/svm/survival_svm.py +1236 -0
- sksurv/testing.py +108 -0
- sksurv/tree/__init__.py +1 -0
- sksurv/tree/_criterion.cpython-311-x86_64-linux-gnu.so +0 -0
- sksurv/tree/tree.py +790 -0
- sksurv/util.py +415 -0
sksurv/kernels/clinical.py
@@ -0,0 +1,348 @@
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
import numpy as np
import pandas as pd
from pandas.api.types import CategoricalDtype, is_numeric_dtype
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils.validation import _check_feature_names, _check_n_features, check_is_fitted

from ._clinical_kernel import (
    continuous_ordinal_kernel,
    continuous_ordinal_kernel_with_ranges,
    pairwise_continuous_ordinal_kernel,
    pairwise_nominal_kernel,
)

__all__ = ["clinical_kernel", "ClinicalKernelTransform"]


def _nominal_kernel(x, y, out):
    """Number of features that match exactly"""
    for i in range(x.shape[0]):
        for j in range(y.shape[0]):
            out[i, j] += (x[i, :] == y[j, :]).sum()

    return out
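
# Editorial sketch, not part of the package source: the double loop in
# _nominal_kernel is equivalent to a single NumPy broadcast. The demo arrays
# below are hypothetical.
_x_demo = np.array([["A", "u"], ["B", "v"]])
_y_demo = np.array([["A", "v"], ["B", "v"], ["A", "u"]])
# Compare every row of _x_demo against every row of _y_demo and count exact
# feature matches; _k_demo[0, 2] == 2.0 because both features agree.
_k_demo = (_x_demo[:, None, :] == _y_demo[None, :, :]).sum(axis=-1).astype(float)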


def _get_continuous_and_ordinal_array(x):
    """Convert array from continuous and ordered categorical columns"""
    nominal_columns = x.select_dtypes(include=["object", "category"]).columns
    ordinal_columns = pd.Index([v for v in nominal_columns if x[v].cat.ordered])
    continuous_columns = x.select_dtypes(include=[np.number]).columns

    x_num = x.loc[:, continuous_columns].astype(np.float64).values
    if len(ordinal_columns) > 0:
        x = _ordinal_as_numeric(x, ordinal_columns)

        nominal_columns = nominal_columns.difference(ordinal_columns)
        x_out = np.column_stack((x_num, x))
    else:
        x_out = x_num

    return x_out, nominal_columns


def _ordinal_as_numeric(x, ordinal_columns):
    x_numeric = np.empty((x.shape[0], len(ordinal_columns)), dtype=np.float64)

    for i, c in enumerate(ordinal_columns):
        x_numeric[:, i] = x[c].cat.codes
    return x_numeric


def clinical_kernel(x, y=None):
    """Computes the clinical kernel.

    The clinical kernel distinguishes between continuous,
    ordinal, and nominal variables.
    Kernel values are normalized to lie within [0, 1].

    See [1]_ for further description.

    Parameters
    ----------
    x : pandas.DataFrame, shape = (n_samples_x, n_features)
        Training data

    y : pandas.DataFrame, shape = (n_samples_y, n_features)
        Testing data

    Returns
    -------
    kernel : array, shape = (n_samples_x, n_samples_y)
        Kernel matrix.

    References
    ----------
    .. [1] Daemen, A., De Moor, B.,
           "Development of a kernel function for clinical data".
           Annual International Conference of the IEEE Engineering in Medicine and Biology Society, 5913-7, 2009

    Examples
    --------
    >>> import pandas as pd
    >>> from sksurv.kernels import clinical_kernel
    >>>
    >>> data = pd.DataFrame({
    ...     'feature_num': [1.0, 2.0, 3.0],
    ...     'feature_ord': pd.Categorical(['low', 'medium', 'high'], ordered=True),
    ...     'feature_nom': pd.Categorical(['A', 'B', 'A'])
    ... })
    >>>
    >>> kernel_matrix = clinical_kernel(data)
    >>> print(kernel_matrix)
    [[1.         0.33333333 0.5       ]
     [0.33333333 1.         0.16666667]
     [0.5        0.16666667 1.        ]]
    """
    if y is not None:
        if x.shape[1] != y.shape[1]:
            raise ValueError("x and y have different number of features")
        if not x.columns.equals(y.columns):
            raise ValueError("columns do not match")
    else:
        y = x

    mat = np.zeros((x.shape[0], y.shape[0]), dtype=float)

    x_numeric, nominal_columns = _get_continuous_and_ordinal_array(x)
    if id(x) != id(y):
        y_numeric, _ = _get_continuous_and_ordinal_array(y)
    else:
        y_numeric = x_numeric

    continuous_ordinal_kernel(x_numeric, y_numeric, mat)
    _nominal_kernel(x.loc[:, nominal_columns].values, y.loc[:, nominal_columns].values, mat)
    mat /= x.shape[1]
    return mat
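
# Editorial sketch, not part of the package source: the two-argument form of
# clinical_kernel returns the cross kernel between training and test samples.
# The column names are hypothetical; both frames must share the same columns,
# and the ordinal categories must match.
_x_train = pd.DataFrame({
    "age": [60.0, 52.0, 71.0],
    "grade": pd.Categorical(["I", "II", "III"], ordered=True),
})
_x_test = pd.DataFrame({
    "age": [65.0],
    "grade": pd.Categorical(["II"], categories=["I", "II", "III"], ordered=True),
})
_k_cross = clinical_kernel(_x_train, _x_test)  # shape (3, 1)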


class ClinicalKernelTransform(BaseEstimator, TransformerMixin):
    """Transform data using a clinical kernel.

    The clinical kernel distinguishes between continuous,
    ordinal, and nominal variables.

    See [1]_ for further description.

    Parameters
    ----------
    fit_once : bool, optional
        If set to ``True``, fit() only stores the training data, but does not
        update its internal state. You should call prepare() once before
        calling transform().
        If set to ``False``, it behaves like a regular estimator, i.e., you need to
        call fit() before transform().

    Attributes
    ----------
    n_features_in_ : int
        Number of features seen during ``fit``.

    feature_names_in_ : ndarray, shape = (`n_features_in_`,)
        Names of features seen during ``fit``. Defined only when `X`
        has feature names that are all strings.

    References
    ----------
    .. [1] Daemen, A., De Moor, B.,
           "Development of a kernel function for clinical data".
           Annual International Conference of the IEEE Engineering in Medicine and Biology Society, 5913-7, 2009
    """

    def __init__(self, *, fit_once=False, _numeric_ranges=None, _numeric_columns=None, _nominal_columns=None):
        self.fit_once = fit_once

        self._numeric_ranges = _numeric_ranges
        self._numeric_columns = _numeric_columns
        self._nominal_columns = _nominal_columns

    def prepare(self, X):
        """Determine transformation parameters from data in X.

        Use if `fit_once` is `True`, in which case `fit()` does
        not set the parameters of the clinical kernel.

        Parameters
        ----------
        X : pandas.DataFrame, shape = (n_samples, n_features)
            Data to estimate parameters from.
        """
        if not self.fit_once:
            raise ValueError("prepare can only be used if fit_once parameter is set to True")

        self._prepare_by_column_dtype(X)

    def _prepare_by_column_dtype(self, X):
        """Get distance functions for each column's dtype"""
        if not isinstance(X, pd.DataFrame):
            raise TypeError("X must be a pandas DataFrame")

        numeric_columns = []
        nominal_columns = []
        numeric_ranges = []

        fit_data = np.empty(X.shape, dtype=np.float64)

        for i, dt in enumerate(X.dtypes):
            col = X.iloc[:, i]
            if isinstance(dt, CategoricalDtype):
                if col.cat.ordered:
                    numeric_ranges.append(col.cat.codes.max() - col.cat.codes.min())
                    numeric_columns.append(i)
                else:
                    nominal_columns.append(i)

                col = col.cat.codes
            elif is_numeric_dtype(dt):
                numeric_ranges.append(col.max() - col.min())
                numeric_columns.append(i)
            else:
                raise TypeError(f"unsupported dtype: {dt!r}")

            fit_data[:, i] = col.values

        self._numeric_columns = np.asarray(numeric_columns)
        self._nominal_columns = np.asarray(nominal_columns)
        self._numeric_ranges = np.asarray(numeric_ranges, dtype=float)
        self.X_fit_ = fit_data

    def fit(self, X, y=None, **kwargs):  # pylint: disable=unused-argument
        """Determine transformation parameters from data in X.

        Subsequent calls to `transform(Y)` compute the pairwise
        distance to `X`.
        Parameters of the clinical kernel are only updated
        if `fit_once` is `False`, otherwise you have to
        explicitly call `prepare()` once.

        Parameters
        ----------
        X : pandas.DataFrame, shape = (n_samples, n_features)
            Data to estimate parameters from.

        y : None
            Ignored. This parameter exists only for compatibility with
            :class:`sklearn.pipeline.Pipeline`.

        kwargs : dict
            Ignored. This parameter exists only for compatibility with
            :class:`sklearn.pipeline.Pipeline`.

        Returns
        -------
        self : object
            Returns the instance itself.
        """
        if X.ndim != 2:
            raise ValueError(f"expected 2d array, but got {X.ndim}")

        _check_feature_names(self, X, reset=True)
        _check_n_features(self, X, reset=True)

        if self.fit_once:
            self.X_fit_ = X
        else:
            self._prepare_by_column_dtype(X)

        return self

    def transform(self, Y):
        r"""Compute all pairwise distances between `self.X_fit_` and `Y`.

        Parameters
        ----------
        Y : array-like, shape = (n_samples_y, n_features)

        Returns
        -------
        kernel : ndarray, shape = (n_samples_y, n_samples_X_fit\_)
            Kernel matrix. Values are normalized to lie within [0, 1].
        """
        check_is_fitted(self, "X_fit_")

        _check_feature_names(self, Y, reset=False)
        _check_n_features(self, Y, reset=False)

        n_samples_x = self.X_fit_.shape[0]

        Y = np.asarray(Y)

        n_samples_y = Y.shape[0]

        mat = np.zeros((n_samples_y, n_samples_x), dtype=float)

        continuous_ordinal_kernel_with_ranges(
            Y[:, self._numeric_columns].astype(np.float64),
            self.X_fit_[:, self._numeric_columns].astype(np.float64),
            self._numeric_ranges,
            mat,
        )

        if len(self._nominal_columns) > 0:
            _nominal_kernel(Y[:, self._nominal_columns], self.X_fit_[:, self._nominal_columns], mat)

        mat /= self.n_features_in_

        return mat
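
    # Editorial sketch, not part of the package source: default fit/transform
    # usage with continuous-only toy data, so that transform() receives purely
    # numeric input:
    #
    #     data = pd.DataFrame({"age": [60.0, 52.0, 71.0, 48.0],
    #                          "size": [2.0, 1.0, 4.0, 3.0]})
    #     t = ClinicalKernelTransform().fit(data.iloc[:3])
    #     k = t.transform(data.iloc[3:])  # kernel of shape (1, 3)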

    def __call__(self, X, Y):
        """Compute kernel matrix between `X` and `Y`.

        Parameters
        ----------
        X : pandas.DataFrame, shape = (n_samples_x, n_features)
            Training data

        Y : pandas.DataFrame, shape = (n_samples_y, n_features)
            Testing data

        Returns
        -------
        kernel : ndarray, shape = (n_samples_x, n_samples_y)
            Kernel matrix. Values are normalized to lie within [0, 1].
        """
        return self.fit(X).transform(Y).T

    def pairwise_kernel(self, X, Y):
        """Function to use with :func:`sklearn.metrics.pairwise.pairwise_kernels`.

        Parameters
        ----------
        X : ndarray, shape = (n_features,)

        Y : ndarray, shape = (n_features,)

        Returns
        -------
        similarity : float
            Similarities are normalized to be within [0, 1].
        """
        check_is_fitted(self, "X_fit_")
        if X.shape[0] != Y.shape[0]:
            raise ValueError(
                f"Incompatible dimension for X and Y matrices: X.shape[0] == {X.shape[0]} "
                f"while Y.shape[0] == {Y.shape[0]}"
            )

        val = pairwise_continuous_ordinal_kernel(
            X[self._numeric_columns], Y[self._numeric_columns], self._numeric_ranges
        )
        if len(self._nominal_columns) > 0:
            val += pairwise_nominal_kernel(
                X[self._nominal_columns].astype(np.int8), Y[self._nominal_columns].astype(np.int8)
            )

        val /= X.shape[0]

        return val
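
Editorial note: a minimal sketch of the fit_once workflow described in the class
docstring, assuming a hypothetical mixed-type DataFrame (this example is not part
of the package source). prepare() computes the per-column ranges and a numeric
coding once; pairwise_kernel then scores one pair of rows, so it can serve as a
callable metric for scikit-learn:

import pandas as pd
from sklearn.metrics.pairwise import pairwise_kernels
from sksurv.kernels import ClinicalKernelTransform

data = pd.DataFrame({
    "age": [60.0, 52.0, 71.0],
    "grade": pd.Categorical(["I", "II", "III"], ordered=True),
    "site": pd.Categorical(["lung", "breast", "lung"]),
})

t = ClinicalKernelTransform(fit_once=True)
t.prepare(data)

# t.X_fit_ holds the numeric coding of `data`.
K = pairwise_kernels(t.X_fit_, t.X_fit_, metric=t.pairwise_kernel)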

Binary file

sksurv/linear_model/aft.py
@@ -0,0 +1,208 @@
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
import numpy as np
from sklearn.linear_model import Ridge

from ..base import SurvivalAnalysisMixin
from ..nonparametric import ipc_weights
from ..util import check_array_survival


class IPCRidge(Ridge, SurvivalAnalysisMixin):
    r"""Accelerated failure time model with inverse probability of censoring weights.

    This model assumes a regression model of the form

    .. math::

        \log y = \beta_0 + \mathbf{X} \beta + \epsilon

    L2-shrinkage is applied to the coefficients :math:`\beta` and
    each sample is weighted by the inverse probability of censoring
    to account for right censoring (under the assumption that
    censoring is independent of the features, i.e., random censoring).

    See [1]_ for further description.

    Parameters
    ----------
    alpha : float, optional, default: 1.0
        Small positive values of alpha improve the conditioning of the problem
        and reduce the variance of the estimates.
        `alpha` must be a non-negative float, i.e., in `[0, inf)`.

        For numerical reasons, using `alpha = 0` is not advised.

    fit_intercept : bool, default: True
        Whether to fit the intercept for this model. If set
        to false, no intercept will be used in calculations
        (i.e. ``X`` and ``y`` are expected to be centered).

    copy_X : bool, default: True
        If True, X will be copied; else, it may be overwritten.

    max_iter : int, default: None
        Maximum number of iterations for the conjugate gradient solver.
        For the 'sparse_cg' and 'lsqr' solvers, the default value is determined
        by scipy.sparse.linalg. For the 'sag' solver, the default value is 1000.
        For the 'lbfgs' solver, the default value is 15000.

    tol : float, default: 1e-3
        Precision of the solution. Note that `tol` has no effect for solvers 'svd' and
        'cholesky'.

    solver : {'auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', \
            'sag', 'saga', 'lbfgs'}, default: 'auto'
        Solver to use in the computational routines:

        - 'auto' chooses the solver automatically based on the type of data.

        - 'svd' uses a Singular Value Decomposition of X to compute the Ridge
          coefficients. It is the most stable solver, in particular more stable
          for singular matrices than 'cholesky', at the cost of being slower.

        - 'cholesky' uses the standard scipy.linalg.solve function to
          obtain a closed-form solution.

        - 'sparse_cg' uses the conjugate gradient solver as found in
          scipy.sparse.linalg.cg. As an iterative algorithm, this solver is
          more appropriate than 'cholesky' for large-scale data
          (possibility to set `tol` and `max_iter`).

        - 'lsqr' uses the dedicated regularized least-squares routine
          scipy.sparse.linalg.lsqr. It is the fastest and uses an iterative
          procedure.

        - 'sag' uses a Stochastic Average Gradient descent, and 'saga' uses
          its improved, unbiased version named SAGA. Both methods also use an
          iterative procedure, and are often faster than other solvers when
          both n_samples and n_features are large. Note that fast convergence
          of 'sag' and 'saga' is only guaranteed on features with approximately
          the same scale. You can preprocess the data with a scaler from
          sklearn.preprocessing.

        - 'lbfgs' uses the L-BFGS-B algorithm implemented in
          `scipy.optimize.minimize`. It can be used only when `positive`
          is True.

        All solvers except 'svd' support both dense and sparse data. However, only
        'lsqr', 'sag', 'sparse_cg', and 'lbfgs' support sparse input when
        `fit_intercept` is True.

    positive : bool, default: False
        When set to ``True``, forces the coefficients to be positive.
        Only the 'lbfgs' solver is supported in this case.

    random_state : int, RandomState instance, default: None
        Used when ``solver`` == 'sag' or 'saga' to shuffle the data.

    Attributes
    ----------
    coef_ : ndarray, shape = (n_features,)
        Weight vector.

    intercept_ : float or ndarray, shape = (n_targets,)
        Independent term in the decision function. Set to 0.0 if
        ``fit_intercept = False``.

    n_iter_ : None or ndarray, shape = (n_targets,)
        Actual number of iterations for each target. Available only for
        the sag and lsqr solvers. Other solvers will return None.

    n_features_in_ : int
        Number of features seen during ``fit``.

    feature_names_in_ : ndarray, shape = (`n_features_in_`,)
        Names of features seen during ``fit``. Defined only when `X`
        has feature names that are all strings.

    References
    ----------
    .. [1] W. Stute, "Consistent estimation under random censorship when covariables are
           present", Journal of Multivariate Analysis, vol. 45, no. 1, pp. 89-103, 1993.
           doi:10.1006/jmva.1993.1028.
    """

    _parameter_constraints = {**Ridge._parameter_constraints}

    def __init__(
        self,
        alpha=1.0,
        *,
        fit_intercept=True,
        copy_X=True,
        max_iter=None,
        tol=1e-3,
        solver="auto",
        positive=False,
        random_state=None,
    ):
        super().__init__(
            alpha=alpha,
            fit_intercept=fit_intercept,
            copy_X=copy_X,
            max_iter=max_iter,
            tol=tol,
            solver=solver,
            positive=positive,
            random_state=random_state,
        )

    @property
    def _predict_risk_score(self):
        return False

    def fit(self, X, y):
        """Build an accelerated failure time model.

        Parameters
        ----------
        X : array-like, shape = (n_samples, n_features)
            Data matrix.

        y : structured array, shape = (n_samples,)
            A structured array with two fields. The first field is a boolean
            where ``True`` indicates an event and ``False`` indicates right-censoring.
            The second field is a float with the time of event or time of censoring.

        Returns
        -------
        self
        """
        event, time = check_array_survival(X, y)

        weights = ipc_weights(event, time)
        super().fit(X, np.log(time), sample_weight=weights)

        return self

    def predict(self, X):
        """Predict using the linear accelerated failure time model.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape = (n_samples, n_features)
            Samples.

        Returns
        -------
        y_pred : array, shape = (n_samples,)
            Predicted values on the original scale (NOT the log scale).
        """
        return np.exp(super().predict(X))

    def score(self, X, y, sample_weight=None):
        return SurvivalAnalysisMixin.score(self, X, y)


IPCRidge.score.__doc__ = SurvivalAnalysisMixin.score.__doc__
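Editorial note: a minimal usage sketch for IPCRidge (not part of the package
source), with hypothetical toy data in the structured-array format fit() expects:

import numpy as np
from sksurv.linear_model import IPCRidge

rng = np.random.default_rng(0)
X = rng.normal(size=(5, 2))
# First field: event indicator; second field: observed time.
y = np.array(
    [(True, 4.0), (False, 6.0), (True, 2.5), (True, 8.0), (False, 3.0)],
    dtype=[("event", "?"), ("time", "<f8")],
)

model = IPCRidge(alpha=1.0).fit(X, y)
t_pred = model.predict(X)  # predicted times on the original (non-log) scale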