PyPI - scikit-survival - Versions diffs - 0.24.1__cp311-cp311-win_amd64.whl → 0.25.0__cp311-cp311-win_amd64.whl - Mend

scikit-survival 0.24.1__cp311-cp311-win_amd64.whl → 0.25.0__cp311-cp311-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (39)

scikit_survival-0.25.0.dist-info/METADATA +185 -0
scikit_survival-0.25.0.dist-info/RECORD +58 -0
{scikit_survival-0.24.1.dist-info → scikit_survival-0.25.0.dist-info}/WHEEL +1 -1
sksurv/__init__.py +51 -6
sksurv/base.py +12 -2
sksurv/bintrees/_binarytrees.cp311-win_amd64.pyd +0 -0
sksurv/column.py +33 -29
sksurv/compare.py +22 -22
sksurv/datasets/base.py +45 -20
sksurv/docstrings.py +99 -0
sksurv/ensemble/_coxph_loss.cp311-win_amd64.pyd +0 -0
sksurv/ensemble/boosting.py +116 -168
sksurv/ensemble/forest.py +94 -151
sksurv/functions.py +29 -29
sksurv/io/arffread.py +34 -3
sksurv/io/arffwrite.py +38 -2
sksurv/kernels/_clinical_kernel.cp311-win_amd64.pyd +0 -0
sksurv/kernels/clinical.py +33 -13
sksurv/linear_model/_coxnet.cp311-win_amd64.pyd +0 -0
sksurv/linear_model/aft.py +14 -11
sksurv/linear_model/coxnet.py +138 -89
sksurv/linear_model/coxph.py +102 -83
sksurv/meta/ensemble_selection.py +91 -9
sksurv/meta/stacking.py +47 -26
sksurv/metrics.py +257 -224
sksurv/nonparametric.py +150 -81
sksurv/preprocessing.py +55 -27
sksurv/svm/_minlip.cp311-win_amd64.pyd +0 -0
sksurv/svm/_prsvm.cp311-win_amd64.pyd +0 -0
sksurv/svm/minlip.py +160 -79
sksurv/svm/naive_survival_svm.py +63 -34
sksurv/svm/survival_svm.py +103 -103
sksurv/tree/_criterion.cp311-win_amd64.pyd +0 -0
sksurv/tree/tree.py +170 -84
sksurv/util.py +80 -26
scikit_survival-0.24.1.dist-info/METADATA +0 -889
scikit_survival-0.24.1.dist-info/RECORD +0 -57
{scikit_survival-0.24.1.dist-info → scikit_survival-0.25.0.dist-info}/licenses/COPYING +0 -0
{scikit_survival-0.24.1.dist-info → scikit_survival-0.25.0.dist-info}/top_level.txt +0 -0

sksurv/tree/tree.py CHANGED Viewed

@@ -20,6 +20,7 @@ from sklearn.utils.validation import (
 )
 from ..base import SurvivalAnalysisMixin
+from ..docstrings import append_cumulative_hazard_example, append_survival_function_example
 from ..functions import StepFunction
 from ..util import check_array_survival
 from ._criterion import LogrankCriterion, get_unique_times
@@ -38,10 +39,9 @@ def _array_to_step_function(x, array):
 class SurvivalTree(BaseEstimator, SurvivalAnalysisMixin):
-    """A survival tree.
+    """A single survival tree.
-    The quality of a split is measured by the
-    log-rank splitting rule.
+    The quality of a split is measured by the log-rank splitting rule.
     If ``splitter='best'``, fit and predict methods support
     missing values. See :ref:`tree_missing_value_support` for details.
@@ -85,7 +85,7 @@ class SurvivalTree(BaseEstimator, SurvivalAnalysisMixin):
         the input samples) required to be at a leaf node. Samples have
         equal weight when sample_weight is not provided.
-    max_features : int, float, string or None, optional, default: None
+    max_features : int, float or {'sqrt', 'log2'} or None, optional, default: None
         The number of features to consider when looking for the best split:
         - If int, then consider `max_features` features at each split.
@@ -116,22 +116,22 @@ class SurvivalTree(BaseEstimator, SurvivalAnalysisMixin):
         Best nodes are defined as relative reduction in impurity.
         If None then unlimited number of leaf nodes.
-    low_memory : boolean, default: False
-        If set, ``predict`` computations use reduced memory but ``predict_cumulative_hazard_function``
-        and ``predict_survival_function`` are not implemented.
+    low_memory : bool, optional, default: False
+        If set, :meth:`predict` computations use reduced memory but :meth:`predict_cumulative_hazard_function`
+        and :meth:`predict_survival_function` are not implemented.
     Attributes
     ----------
-    unique_times_ : array of shape = (n_unique_times,)
+    unique_times_ : ndarray, shape = (n_unique_times,), dtype = float
         Unique time points.
-    max_features_ : int,
+    max_features_ : int
         The inferred value of max_features.
     n_features_in_ : int
         Number of features seen during ``fit``.
-    feature_names_in_ : ndarray of shape (`n_features_in_`,)
+    feature_names_in_ : ndarray, shape = (`n_features_in_`,), dtype = object
         Names of features seen during ``fit``. Defined only when `X`
         has feature names that are all strings.
@@ -141,8 +141,7 @@ class SurvivalTree(BaseEstimator, SurvivalAnalysisMixin):
     See also
     --------
-    sksurv.ensemble.RandomSurvivalForest
-        An ensemble of SurvivalTrees.
+    sksurv.ensemble.RandomSurvivalForest : An ensemble of SurvivalTrees.
     References
     ----------
@@ -219,7 +218,7 @@ class SurvivalTree(BaseEstimator, SurvivalAnalysisMixin):
         Parameters
         ----------
-        X : array-like of shape (n_samples, n_features), dtype=DOUBLE
+        X : array-like, shape = (n_samples, n_features), dtype = DOUBLE
             Input data.
         estimator_name : str or None, default=None
@@ -267,9 +266,9 @@ class SurvivalTree(BaseEstimator, SurvivalAnalysisMixin):
             Data matrix
         y : structured array, shape = (n_samples,)
-            A structured array containing the binary event indicator
-            as first field, and time of event or time of censoring as
-            second field.
+            A structured array with two fields. The first field is a boolean
+            where ``True`` indicates an event and ``False`` indicates right-censoring.
+            The second field is a float with the time of event or time of censoring.
         check_input : boolean, default: True
             Allow to bypass several input checking.
@@ -440,15 +439,15 @@ class SurvivalTree(BaseEstimator, SurvivalAnalysisMixin):
         return X
     def predict(self, X, check_input=True):
-        """Predict risk score.
+        r"""Predict risk score.
         The risk score is the total number of events, which can
         be estimated by the sum of the estimated cumulative
-        hazard function :math:`\\hat{H}_h` in terminal node :math:`h`.
+        hazard function :math:`\hat{H}_h` in terminal node :math:`h`.
         .. math::
-            \\sum_{j=1}^{n(h)} \\hat{H}_h(T_{j} \\mid x) ,
+            \sum_{j=1}^{n(h)} \hat{H}_h(T_{j} \mid x) ,
         where :math:`n(h)` denotes the number of distinct event times
         of samples belonging to the same terminal node as :math:`x`.
@@ -467,7 +466,7 @@ class SurvivalTree(BaseEstimator, SurvivalAnalysisMixin):
         Returns
         -------
-        risk_scores : ndarray, shape = (n_samples,)
+        risk_scores : ndarray, shape = (n_samples,), dtype=float
             Predicted risk scores.
         """
@@ -480,6 +479,7 @@ class SurvivalTree(BaseEstimator, SurvivalAnalysisMixin):
         chf = self.predict_cumulative_hazard_function(X, check_input, return_array=True)
         return chf[:, self.is_event_time_].sum(1)
+    @append_cumulative_hazard_example(estimator_mod="tree", estimator_class="SurvivalTree")
     def predict_cumulative_hazard_function(self, X, check_input=True, return_array=False):
         """Predict cumulative hazard function.
@@ -501,44 +501,29 @@ class SurvivalTree(BaseEstimator, SurvivalAnalysisMixin):
             Allow to bypass several input checking.
             Don't use this parameter unless you know what you do.
-        return_array : boolean, default: False
-            If set, return an array with the cumulative hazard rate
-            for each `self.unique_times_`, otherwise an array of
-            :class:`sksurv.functions.StepFunction`.
+        return_array : bool, default: False
+            Whether to return a single array of cumulative hazard values
+            or a list of step functions.
+            If `False`, a list of :class:`sksurv.functions.StepFunction`
+            objects is returned.
+            If `True`, a 2d-array of shape `(n_samples, n_unique_times)` is
+            returned, where `n_unique_times` is the number of unique
+            event times in the training data. Each row represents the cumulative
+            hazard function of an individual evaluated at `unique_times_`.
         Returns
         -------
         cum_hazard : ndarray
-            If `return_array` is set, an array with the cumulative hazard rate
-            for each `self.unique_times_`, otherwise an array of length `n_samples`
-            of :class:`sksurv.functions.StepFunction` instances will be returned.
+            If `return_array` is `False`, an array of `n_samples`
+            :class:`sksurv.functions.StepFunction` instances is returned.
+            If `return_array` is `True`, a numeric array of shape
+            `(n_samples, n_unique_times)` is returned.
         Examples
         --------
-        >>> import matplotlib.pyplot as plt
-        >>> from sksurv.datasets import load_whas500
-        >>> from sksurv.tree import SurvivalTree
-        Load and prepare the data.
-        >>> X, y = load_whas500()
-        >>> X = X.astype(float)
-        Fit the model.
-        >>> estimator = SurvivalTree().fit(X, y)
-        Estimate the cumulative hazard function for the first 5 samples.
-        >>> chf_funcs = estimator.predict_cumulative_hazard_function(X.iloc[:5])
-        Plot the estimated cumulative hazard functions.
-        >>> for fn in chf_funcs:
-        ...    plt.step(fn.x, fn(fn.x), where="post")
-        ...
-        >>> plt.ylim(0, 1)
-        >>> plt.show()
         """
         self._check_low_memory("predict_cumulative_hazard_function")
         check_is_fitted(self, "tree_")
@@ -550,6 +535,7 @@ class SurvivalTree(BaseEstimator, SurvivalAnalysisMixin):
             return arr
         return _array_to_step_function(self.unique_times_, arr)
+    @append_survival_function_example(estimator_mod="tree", estimator_class="SurvivalTree")
     def predict_survival_function(self, X, check_input=True, return_array=False):
         """Predict survival function.
@@ -571,45 +557,29 @@ class SurvivalTree(BaseEstimator, SurvivalAnalysisMixin):
             Allow to bypass several input checking.
             Don't use this parameter unless you know what you do.
-        return_array : boolean, default: False
-            If set, return an array with the probability
-            of survival for each `self.unique_times_`,
-            otherwise an array of :class:`sksurv.functions.StepFunction`.
+        return_array : bool, default: False
+            Whether to return a single array of survival probabilities
+            or a list of step functions.
+            If `False`, a list of :class:`sksurv.functions.StepFunction`
+            objects is returned.
+            If `True`, a 2d-array of shape `(n_samples, n_unique_times)` is
+            returned, where `n_unique_times` is the number of unique
+            event times in the training data. Each row represents the survival
+            function of an individual evaluated at `unique_times_`.
         Returns
         -------
         survival : ndarray
-            If `return_array` is set, an array with the probability of
-            survival for each `self.unique_times_`, otherwise an array of
-            length `n_samples` of :class:`sksurv.functions.StepFunction`
-            instances will be returned.
+            If `return_array` is `False`, an array of `n_samples`
+            :class:`sksurv.functions.StepFunction` instances is returned.
+            If `return_array` is `True`, a numeric array of shape
+            `(n_samples, n_unique_times)` is returned.
         Examples
         --------
-        >>> import matplotlib.pyplot as plt
-        >>> from sksurv.datasets import load_whas500
-        >>> from sksurv.tree import SurvivalTree
-        Load and prepare the data.
-        >>> X, y = load_whas500()
-        >>> X = X.astype(float)
-        Fit the model.
-        >>> estimator = SurvivalTree().fit(X, y)
-        Estimate the survival function for the first 5 samples.
-        >>> surv_funcs = estimator.predict_survival_function(X.iloc[:5])
-        Plot the estimated survival functions.
-        >>> for fn in surv_funcs:
-        ...    plt.step(fn.x, fn(fn.x), where="post")
-        ...
-        >>> plt.ylim(0, 1)
-        >>> plt.show()
         """
         self._check_low_memory("predict_survival_function")
         check_is_fitted(self, "tree_")
@@ -640,7 +610,7 @@ class SurvivalTree(BaseEstimator, SurvivalAnalysisMixin):
         Returns
         -------
-        X_leaves : array-like, shape = (n_samples,)
+        X_leaves : ndarray, shape = (n_samples,), dtype=int
             For each datapoint x in X, return the index of the leaf x
             ends up in. Leaves are numbered within
             ``[0; self.tree_.node_count)``, possibly with gaps in the
@@ -678,6 +648,110 @@ class SurvivalTree(BaseEstimator, SurvivalAnalysisMixin):
 class ExtraSurvivalTree(SurvivalTree):
+    """An Extremely Randomized Survival Tree.
+    This class implements an Extremely Randomized Tree for survival analysis.
+    It differs from :class:`SurvivalTree` in how splits are chosen:
+    instead of searching for the optimal split, it considers a random subset
+    of features and random thresholds for each feature, then picks the best
+    among these random candidates.
+    Parameters
+    ----------
+    splitter : {'best', 'random'}, default: 'random'
+        The strategy used to choose the split at each node. Supported
+        strategies are 'best' to choose the best split and 'random' to choose
+        the best random split.
+    max_depth : int or None, optional, default: None
+        The maximum depth of the tree. If None, then nodes are expanded until
+        all leaves are pure or until all leaves contain less than
+        `min_samples_split` samples.
+    min_samples_split : int, float, optional, default: 6
+        The minimum number of samples required to split an internal node:
+        - If int, then consider `min_samples_split` as the minimum number.
+        - If float, then `min_samples_split` is a fraction and
+          `ceil(min_samples_split * n_samples)` are the minimum
+          number of samples for each split.
+    min_samples_leaf : int, float, optional, default: 3
+        The minimum number of samples required to be at a leaf node.
+        A split point at any depth will only be considered if it leaves at
+        least ``min_samples_leaf`` training samples in each of the left and
+        right branches.  This may have the effect of smoothing the model,
+        especially in regression.
+        - If int, then consider `min_samples_leaf` as the minimum number.
+        - If float, then `min_samples_leaf` is a fraction and
+          `ceil(min_samples_leaf * n_samples)` are the minimum
+          number of samples for each node.
+    min_weight_fraction_leaf : float, optional, default: 0.
+        The minimum weighted fraction of the sum total of weights (of all
+        the input samples) required to be at a leaf node. Samples have
+        equal weight when sample_weight is not provided.
+    max_features : int, float or {'sqrt', 'log2'} or None, optional, default: None
+        The number of features to consider when looking for the best split:
+        - If int, then consider `max_features` features at each split.
+        - If float, then `max_features` is a fraction and
+          `max(1, int(max_features * n_features_in_))` features are considered at
+          each split.
+        - If "sqrt", then `max_features=sqrt(n_features)`.
+        - If "log2", then `max_features=log2(n_features)`.
+        - If None, then `max_features=n_features`.
+        Note: the search for a split does not stop until at least one
+        valid partition of the node samples is found, even if it requires to
+        effectively inspect more than ``max_features`` features.
+    random_state : int, RandomState instance or None, optional, default: None
+        Controls the randomness of the estimator. The features are always
+        randomly permuted at each split, even if ``splitter`` is set to
+        ``"best"``. When ``max_features < n_features``, the algorithm will
+        select ``max_features`` at random at each split before finding the best
+        split among them. But the best found split may vary across different
+        runs, even if ``max_features=n_features``. That is the case, if the
+        improvement of the criterion is identical for several splits and one
+        split has to be selected at random. To obtain a deterministic behavior
+        during fitting, ``random_state`` has to be fixed to an integer.
+    max_leaf_nodes : int or None, optional, default: None
+        Grow a tree with ``max_leaf_nodes`` in best-first fashion.
+        Best nodes are defined as relative reduction in impurity.
+        If None then unlimited number of leaf nodes.
+    low_memory : bool, optional, default: False
+        If set, :meth:`predict` computations use reduced memory but :meth:`predict_cumulative_hazard_function`
+        and :meth:`predict_survival_function` are not implemented.
+    Attributes
+    ----------
+    unique_times_ : ndarray, shape = (n_unique_times,), dtype = float
+        Unique time points.
+    max_features_ : int
+        The inferred value of max_features.
+    n_features_in_ : int
+        Number of features seen during ``fit``.
+    feature_names_in_ : ndarray, shape = (`n_features_in_`,), dtype = object
+        Names of features seen during ``fit``. Defined only when `X`
+        has feature names that are all strings.
+    tree_ : Tree object
+        The underlying Tree object. Please refer to
+        ``help(sklearn.tree._tree.Tree)`` for attributes of Tree object.
+    See also
+    --------
+    sksurv.ensemble.ExtraSurvivalTrees : An ensemble of ExtraSurvivalTrees.
+    """
     def __init__(
         self,
         *,
@@ -702,3 +776,15 @@ class ExtraSurvivalTree(SurvivalTree):
             max_leaf_nodes=max_leaf_nodes,
             low_memory=low_memory,
         )
+    def predict_cumulative_hazard_function(self, X, check_input=True, return_array=False):
+        ExtraSurvivalTree.predict_cumulative_hazard_function.__doc__ = (
+            SurvivalTree.predict_cumulative_hazard_function.__doc__.replace("SurvivalTree", "ExtraSurvivalTree")
+        )
+        return super().predict_cumulative_hazard_function(X, check_input=check_input, return_array=return_array)
+    def predict_survival_function(self, X, check_input=True, return_array=False):
+        ExtraSurvivalTree.predict_survival_function.__doc__ = SurvivalTree.predict_survival_function.__doc__.replace(
+            "SurvivalTree", "ExtraSurvivalTree"
+        )
+        return super().predict_survival_function(X, check_input=check_input, return_array=return_array)

sksurv/util.py CHANGED Viewed

@@ -19,29 +19,52 @@ __all__ = ["check_array_survival", "check_y_survival", "safe_concat", "Surv"]
 class Surv:
-    """
-    Helper class to construct structured array of event indicator and observed time.
+    """A helper class to create a structured array for survival analysis.
+    This class provides helper functions to create a structured array that
+    encapsulates the event indicator and the observed time. The resulting
+    structured array is the recommended format for the ``y`` argument in
+    scikit-survival's estimators.
     """
     @staticmethod
     def from_arrays(event, time, name_event=None, name_time=None):
-        """Create structured array.
+        """Create structured array from event indicator and time arrays.
         Parameters
         ----------
-        event : array-like
-            Event indicator. A boolean array or array with values 0/1.
-        time : array-like
-            Observed time.
-        name_event : str|None
-            Name of event, optional, default: 'event'
-        name_time : str|None
-            Name of observed time, optional, default: 'time'
+        event : array-like, shape=(n_samples,)
+            Event indicator. A boolean array or array with values 0/1,
+            where ``True`` or 1 indicates an event and ``False`` or 0
+            indicates right-censoring.
+        time : array-like, shape=(n_samples,)
+            Observed time. Time to event or time of censoring.
+        name_event : str, optional, default: 'event'
+            Name of the event field in the structured array.
+        name_time : str, optional, default: 'time'
+            Name of the observed time field in the structured array.
         Returns
         -------
-        y : np.array
-            Structured array with two fields.
+        y : numpy.ndarray
+            A structured array with two fields. The first field is a boolean
+            where ``True`` indicates an event and ``False`` indicates right-censoring.
+            The second field is a float with the time of event or time of censoring.
+            The names of the fields are set to the values of `name_event` and `name_time`.
+        Examples
+        --------
+        >>> from sksurv.util import Surv
+        >>>
+        >>> y = Surv.from_arrays(event=[True, False, True],
+        ...                      time=[10, 25, 15])
+        >>> y
+        array([( True, 10.), (False, 25.), ( True, 15.)],
+            dtype=[('event', '?'), ('time', '<f8')])
+        >>> y['event']
+        array([ True, False,  True])
+        >>> y['time']
+        array([10., 25., 15.])
         """
         name_event = name_event or "event"
         name_time = name_time or "time"
@@ -72,21 +95,48 @@ class Surv:
     @staticmethod
     def from_dataframe(event, time, data):
-        """Create structured array from data frame.
+        """Create structured array from columns in a pandas DataFrame.
         Parameters
         ----------
-        event : object
-            Identifier of column containing event indicator.
-        time : object
-            Identifier of column containing time.
+        event : str
+            Name of the column in ``data`` containing the event indicator.
+            It must be a boolean or have values 0/1,
+            where ``True`` or 1 indicates an event and ``False`` or 0
+            indicates right-censoring.
+        time : str
+            Name of the column in ``data`` containing the observed time
+            (time to event or time of censoring).
         data : pandas.DataFrame
-            Dataset.
+            A DataFrame with columns for event and time.
         Returns
         -------
-        y : np.array
-            Structured array with two fields.
+        y : numpy.ndarray
+            A structured array with two fields. The first field is a boolean
+            where ``True`` indicates an event and ``False`` indicates right-censoring.
+            The second field is a float with the time of event or time of censoring.
+            The names of the fields are the respective column names.
+        Examples
+        --------
+        >>> import pandas as pd
+        >>> from sksurv.util import Surv
+        >>>
+        >>> df = pd.DataFrame({
+        ...     'status': [True, False, True],
+        ...     'followup_time': [10, 25, 15],
+        ... })
+        >>> y = Surv.from_dataframe(
+        ...     event='status', time='followup_time', data=df,
+        ... )
+        >>> y
+        array([( True, 10.), (False, 25.), ( True, 15.)],
+            dtype=[('status', '?'), ('followup_time', '<f8')])
+        >>> y['status']
+        array([ True, False,  True])
+        >>> y['followup_time']
+        array([10., 25., 15.])
         """
         if not isinstance(data, pd.DataFrame):
             raise TypeError(f"expected pandas.DataFrame, but got {type(data)!r}")
@@ -180,15 +230,19 @@ def check_y_survival(y_or_event, *args, allow_all_censored=False, allow_time_zer
 def check_event_dtype(event, competing_risks=False):
-    """Check that the event array has the correct dtypes:
-        Boolean for the general case and Integer in the case of competing risks.
+    """Check that the event array has the correct dtype.
+    For single-event survival analysis, the event indicator must be a
+    boolean array. For competing risk analysis, it must be an integer
+    array.
     Parameters
     ----------
-    event : array, shape=(n_samples,), dtype=bool | int
-            Array containing the censoring events.
+    event : ndarray, shape=(n_samples,), dtype=bool | int
+        Array containing the event indicator.
     competing_risks : bool, optional, default: False
-            Whether `event` contains competing risks.
+        Whether `event` is for a competing risks analysis.
     """
     if competing_risks:
         if not np.issubdtype(event.dtype, np.integer):