scikit-survival 0.24.1-cp310-cp310-macosx_11_0_arm64.whl → 0.25.0-cp310-cp310-macosx_11_0_arm64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scikit_survival-0.25.0.dist-info/METADATA +185 -0
- scikit_survival-0.25.0.dist-info/RECORD +58 -0
- {scikit_survival-0.24.1.dist-info → scikit_survival-0.25.0.dist-info}/WHEEL +1 -1
- sksurv/__init__.py +51 -6
- sksurv/base.py +12 -2
- sksurv/bintrees/_binarytrees.cpython-310-darwin.so +0 -0
- sksurv/column.py +33 -29
- sksurv/compare.py +22 -22
- sksurv/datasets/base.py +45 -20
- sksurv/docstrings.py +99 -0
- sksurv/ensemble/_coxph_loss.cpython-310-darwin.so +0 -0
- sksurv/ensemble/boosting.py +116 -168
- sksurv/ensemble/forest.py +94 -151
- sksurv/functions.py +29 -29
- sksurv/io/arffread.py +34 -3
- sksurv/io/arffwrite.py +38 -2
- sksurv/kernels/_clinical_kernel.cpython-310-darwin.so +0 -0
- sksurv/kernels/clinical.py +33 -13
- sksurv/linear_model/_coxnet.cpython-310-darwin.so +0 -0
- sksurv/linear_model/aft.py +14 -11
- sksurv/linear_model/coxnet.py +138 -89
- sksurv/linear_model/coxph.py +102 -83
- sksurv/meta/ensemble_selection.py +91 -9
- sksurv/meta/stacking.py +47 -26
- sksurv/metrics.py +257 -224
- sksurv/nonparametric.py +150 -81
- sksurv/preprocessing.py +55 -27
- sksurv/svm/_minlip.cpython-310-darwin.so +0 -0
- sksurv/svm/_prsvm.cpython-310-darwin.so +0 -0
- sksurv/svm/minlip.py +160 -79
- sksurv/svm/naive_survival_svm.py +63 -34
- sksurv/svm/survival_svm.py +103 -103
- sksurv/tree/_criterion.cpython-310-darwin.so +0 -0
- sksurv/tree/tree.py +170 -84
- sksurv/util.py +80 -26
- scikit_survival-0.24.1.dist-info/METADATA +0 -889
- scikit_survival-0.24.1.dist-info/RECORD +0 -57
- {scikit_survival-0.24.1.dist-info → scikit_survival-0.25.0.dist-info}/licenses/COPYING +0 -0
- {scikit_survival-0.24.1.dist-info → scikit_survival-0.25.0.dist-info}/top_level.txt +0 -0
sksurv/nonparametric.py
CHANGED
@@ -31,36 +31,36 @@ __all__ = [
 
 
 def _compute_counts(event, time, order=None):
-    """Count right
+    """Count right-censored and uncensored samples at each unique time point.
 
     Parameters
     ----------
-    event :
+    event : ndarray
         Boolean event indicator.
         Integer in the case of multiple risks.
         Zero means right-censored event.
         Positive values for each of the possible risk events.
 
-    time :
+    time : ndarray
         Survival time or time of censoring.
 
-    order :
+    order : ndarray or None
         Indices to order time in ascending order.
         If None, order will be computed.
 
     Returns
     -------
-    times :
+    times : ndarray
         Unique time points.
 
-    n_events :
+    n_events : ndarray
         Number of events at each time point.
         2D array with shape `(n_unique_time_points, n_risks + 1)` in the case of competing risks.
 
-    n_at_risk :
+    n_at_risk : ndarray
         Number of samples that have not been censored or have not had an event at each time point.
 
-    n_censored :
+    n_censored : ndarray
         Number of censored samples at each time point.
     """
     n_samples = event.shape[0]
@@ -116,29 +116,29 @@ def _compute_counts(event, time, order=None):
 
 
 def _compute_counts_truncated(event, time_enter, time_exit):
-    """Compute counts for left truncated and right
+    """Compute counts for left truncated and right-censored survival data.
 
     Parameters
     ----------
-    event :
+    event : ndarray
         Boolean event indicator.
 
-
+    time_enter : ndarray
         Time when a subject entered the study.
 
-    time_exit :
+    time_exit : ndarray
         Time when a subject left the study due to an
         event or censoring.
 
     Returns
     -------
-    times :
+    times : ndarray
         Unique time points.
 
-    n_events :
+    n_events : ndarray
         Number of events at each time point.
 
-    n_at_risk :
+    n_at_risk : ndarray
         Number of samples that are censored or have an event at each time point.
     """
     if (time_enter > time_exit).any():
@@ -212,6 +212,27 @@ def _ci_logmlog(s, sigma_t, conf_level):
 
 
 def _km_ci_estimator(prob_survival, ratio_var, conf_level, conf_type):
+    """Helper to compute confidence intervals for the Kaplan-Meier estimate.
+
+    Parameters
+    ----------
+    prob_survival : ndarray, shape = (n_times,)
+        Survival probability at each unique time point.
+
+    ratio_var : ndarray, shape = (n_times,)
+        The variance ratio term for each unique time point.
+
+    conf_level : float
+        The level for a two-sided confidence interval.
+
+    conf_type : {'log-log'}
+        The type of confidence intervals to estimate.
+
+    Returns
+    -------
+    ci : ndarray, shape = (2, n_times)
+        Pointwise confidence interval.
+    """
     if conf_type not in {"log-log"}:
         raise ValueError(f"conf_type must be None or a str among {{'log-log'}}, but was {conf_type!r}")
 
@@ -232,17 +253,18 @@ def kaplan_meier_estimator(
     conf_level=0.95,
     conf_type=None,
 ):
-    """Kaplan-Meier
+    """Computes the Kaplan-Meier estimate of the survival function.
 
     See [1]_ for further description.
 
     Parameters
     ----------
     event : array-like, shape = (n_samples,)
-
+        A boolean array where ``True`` indicates an event and ``False`` indicates
+        right-censoring.
 
     time_exit : array-like, shape = (n_samples,)
-
+        Time of event or censoring.
 
     time_enter : array-like, shape = (n_samples,), optional
         Contains time when each individual entered the study for
@@ -270,14 +292,14 @@ def kaplan_meier_estimator(
 
     Returns
     -------
-    time :
+    time : ndarray, shape = (n_times,)
         Unique times.
 
-    prob_survival :
+    prob_survival : ndarray, shape = (n_times,)
         Survival probability at each unique time point.
         If `time_enter` is provided, estimates are conditional probabilities.
 
-    conf_int :
+    conf_int : ndarray, shape = (2, n_times)
         Pointwise confidence interval of the Kaplan-Meier estimator
         at each unique time point.
         Only provided if `conf_type` is not None.
@@ -286,11 +308,23 @@ def kaplan_meier_estimator(
     --------
     Creating a Kaplan-Meier curve:
 
-
-
-
-
-
+    .. plot::
+
+        >>> import matplotlib.pyplot as plt
+        >>> from sksurv.datasets import load_veterans_lung_cancer
+        >>> from sksurv.nonparametric import kaplan_meier_estimator
+        >>>
+        >>> _, y = load_veterans_lung_cancer()
+        >>> time, prob_surv, conf_int = kaplan_meier_estimator(
+        ...     y["Status"], y["Survival_in_days"], conf_type="log-log"
+        ... )
+        >>> plt.step(time, prob_surv, where="post")
+        [...]
+        >>> plt.fill_between(time, conf_int[0], conf_int[1], alpha=0.25, step="post")
+        <matplotlib.collections.PolyCollection object at 0x...>
+        >>> plt.ylim(0, 1)
+        (0.0, 1.0)
+        >>> plt.show()  # doctest: +SKIP
 
     See also
     --------
@@ -359,26 +393,44 @@ def kaplan_meier_estimator(
 
 
 def nelson_aalen_estimator(event, time):
-    """Nelson-Aalen
+    """Computes the Nelson-Aalen estimate of the cumulative hazard function.
 
     See [1]_, [2]_ for further description.
 
     Parameters
     ----------
     event : array-like, shape = (n_samples,)
-
+        A boolean array where ``True`` indicates an event and ``False`` indicates
+        right-censoring.
 
     time : array-like, shape = (n_samples,)
-
+        Time of event or censoring.
 
     Returns
     -------
-    time :
+    time : ndarray, shape = (n_times,)
         Unique times.
 
-    cum_hazard :
+    cum_hazard : ndarray, shape = (n_times,)
         Cumulative hazard at each unique time point.
 
+    Examples
+    --------
+    Creating a cumulative hazard curve:
+
+    .. plot::
+
+        >>> import matplotlib.pyplot as plt
+        >>> from sksurv.datasets import load_aids
+        >>> from sksurv.nonparametric import nelson_aalen_estimator
+        >>>
+        >>> _, y = load_aids(endpoint="death")
+        >>> time, cum_hazard = nelson_aalen_estimator(y["censor_d"], y["time_d"])
+        >>>
+        >>> plt.step(time, cum_hazard, where="post")
+        [...]
+        >>> plt.show()  # doctest: +SKIP
+
     References
     ----------
     .. [1] Nelson, W., "Theory and applications of hazard plotting for censored failure data",
@@ -401,15 +453,16 @@ def ipc_weights(event, time):
 
     Parameters
     ----------
-    event : array, shape = (n_samples,)
-
+    event : array-like, shape = (n_samples,)
+        A boolean array where ``True`` indicates an event and ``False`` indicates
+        right-censoring.
 
-    time : array, shape = (n_samples,)
+    time : array-like, shape = (n_samples,)
         Time when a subject experienced an event or was censored.
 
     Returns
     -------
-    weights :
+    weights : ndarray, shape = (n_samples,)
         inverse probability of censoring weights
 
     See also
@@ -469,9 +522,9 @@ class SurvivalFunctionEstimator(BaseEstimator):
         Parameters
         ----------
         y : structured array, shape = (n_samples,)
-            A structured array
-
-            second field.
+            A structured array with two fields. The first field is a boolean
+            where ``True`` indicates an event and ``False`` indicates right-censoring.
+            The second field is a float with the time of event or time of censoring.
 
         Returns
         -------
@@ -493,13 +546,13 @@ class SurvivalFunctionEstimator(BaseEstimator):
         return self
 
     def predict_proba(self, time, return_conf_int=False):
-        """Return probability of
+        r"""Return probability of remaining event-free at given time points.
 
-        :math
+        :math:`\hat{S}(t) = P(T > t)`
 
         Parameters
         ----------
-        time : array, shape = (n_samples,)
+        time : array-like, shape = (n_samples,)
             Time to estimate probability at.
 
         return_conf_int : bool, optional, default: False
@@ -510,10 +563,10 @@ class SurvivalFunctionEstimator(BaseEstimator):
 
         Returns
         -------
-        prob :
-            Probability of
+        prob : ndarray, shape = (n_samples,)
+            Probability of remaining event-free at the given time points.
 
-        conf_int :
+        conf_int : ndarray, shape = (2, n_samples)
             Pointwise confidence interval at the passed time points.
             Only provided if `return_conf_int` is True.
         """
@@ -561,9 +614,9 @@ class CensoringDistributionEstimator(SurvivalFunctionEstimator):
         Parameters
         ----------
         y : structured array, shape = (n_samples,)
-            A structured array
-
-            second field.
+            A structured array with two fields. The first field is a boolean
+            where ``True`` indicates an event and ``False`` indicates right-censoring.
+            The second field is a float with the time of event or time of censoring.
 
         Returns
        -------
@@ -581,20 +634,20 @@ class CensoringDistributionEstimator(SurvivalFunctionEstimator):
         return self
 
     def predict_ipcw(self, y):
-        """Return inverse probability of censoring weights at given time points.
+        r"""Return inverse probability of censoring weights at given time points.
 
-        :math
+        :math:`\omega_i = \delta_i / \hat{G}(y_i)`
 
         Parameters
         ----------
         y : structured array, shape = (n_samples,)
-            A structured array
-
-            second field.
+            A structured array with two fields. The first field is a boolean
+            where ``True`` indicates an event and ``False`` indicates right-censoring.
+            The second field is a float with the time of event or time of censoring.
 
         Returns
         -------
-        ipcw :
+        ipcw : ndarray, shape = (n_samples,)
             Inverse probability of censoring weights.
         """
         event, time = check_y_survival(y)
@@ -638,14 +691,14 @@ def cumulative_incidence_competing_risks(
 
     Parameters
    ----------
-    event : array-like, shape = (n_samples,)
-        Contains event indicators.
+    event : array-like, shape = (n_samples,), dtype = int
+        Contains event indicators. A value of 0 indicates right-censoring,
+        while a positive integer from 1 to `n_risks` corresponds to a specific risk.
+        `n_risks` is the total number of different risks.
+        It assumes there are events for all possible risks.
 
     time_exit : array-like, shape = (n_samples,)
-        Contains event
-        Positive integers (between 1 and n_risks, n_risks being the total number of different risks)
-        indicate the possible different risks.
-        It assumes there are events for all possible risks.
+        Contains event or censoring times.
 
     time_min : float, optional, default: None
         Compute estimator conditional on survival at least up to
@@ -660,23 +713,24 @@ def cumulative_incidence_competing_risks(
         If "log-log", estimate confidence intervals using
         the log hazard or :math:`log(-log(S(t)))`.
 
-    var_type :
+    var_type : {'Aalen', 'Dinse', 'Dinse_Approx'}, optional, default: 'Aalen'
         The method for estimating the variance of the estimator.
         See [2]_, [3]_ and [4]_ for each of the methods.
         Only used if `conf_type` is not None.
 
     Returns
     -------
-    time :
+    time : ndarray, shape = (n_times,)
         Unique times.
 
-    cum_incidence :
-        Cumulative incidence
-
-
+    cum_incidence : ndarray, shape = (n_risks + 1, n_times)
+        Cumulative incidence for each risk. The first row (``cum_incidence[0]``)
+        is the cumulative incidence of any risk (total risk). The remaining
+        rows (``cum_incidence[1:]``) are the cumulative incidences for each
+        competing risk.
 
-    conf_int :
-        Pointwise confidence interval (second axis) of the
+    conf_int : ndarray, shape = (n_risks + 1, 2, n_times)
+        Pointwise confidence interval (second axis) of the cumulative incidence function
         at each unique time point (last axis)
         for all possible risks (first axis), including overall risk (``conf_int[0]``).
         Only provided if `conf_type` is not None.
@@ -685,20 +739,35 @@ def cumulative_incidence_competing_risks(
     --------
     Creating cumulative incidence curves:
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    .. plot::
+
+        >>> import matplotlib.pyplot as plt
+        >>> from sksurv.datasets import load_bmt
+        >>> from sksurv.nonparametric import cumulative_incidence_competing_risks
+        >>>
+        >>> dis, bmt_df = load_bmt()
+        >>> event = bmt_df["status"]
+        >>> time = bmt_df["ftime"]
+        >>> n_risks = event.max()
+        >>>
+        >>> x, y, conf_int = cumulative_incidence_competing_risks(
+        ...     event, time, conf_type="log-log"
+        ... )
+        >>>
+        >>> plt.step(x, y[0], where="post", label="Total risk")
+        [...]
+        >>> plt.fill_between(x, conf_int[0, 0], conf_int[0, 1], alpha=0.25, step="post")
+        <matplotlib.collections.PolyCollection object at 0x...>
+        >>> for i in range(1, n_risks + 1):
+        ...     plt.step(x, y[i], where="post", label=f"{i}-risk")
+        ...     plt.fill_between(x, conf_int[i, 0], conf_int[i, 1], alpha=0.25, step="post")
+        [...]
+        <matplotlib.collections.PolyCollection object at 0x...>
+        >>> plt.ylim(0, 1)
+        (0.0, 1.0)
+        >>> plt.legend()
+        <matplotlib.legend.Legend object at 0x...>
+        >>> plt.show()  # doctest: +SKIP
 
     References
     ----------
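The `predict_ipcw` docstring above now spells out the weight formula ω_i = δ_i / Ĝ(y_i). As a quick illustration of that API, here is a minimal sketch (not part of the diff): it uses only the public classes shown above plus `sksurv.util.Surv`, and the toy data is made up.

import numpy as np

from sksurv.nonparametric import CensoringDistributionEstimator
from sksurv.util import Surv

# Toy right-censored data: True marks an observed event, False marks censoring.
event = np.array([True, False, True, True, False, True])
time = np.array([1.0, 2.0, 3.0, 4.0, 5.0, 6.0])
y = Surv.from_arrays(event=event, time=time)

# Fit the Kaplan-Meier estimate G(t) of the censoring distribution, then
# compute w_i = delta_i / G(y_i); censored samples receive weight 0.
cens = CensoringDistributionEstimator().fit(y)
weights = cens.predict_ipcw(y)
print(weights.shape)  # one weight per sample

The same `Surv.from_arrays` structured array can be passed to `SurvivalFunctionEstimator.fit` and `predict_proba` documented above.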
sksurv/preprocessing.py
CHANGED
@@ -19,40 +19,60 @@ __all__ = ["OneHotEncoder"]
 
 
 def check_columns_exist(actual, expected):
+    """Check if all expected columns are present in a dataframe.
+
+    Parameters
+    ----------
+    actual : pandas.Index
+        The actual columns of a dataframe.
+    expected : pandas.Index
+        The expected columns.
+
+    Raises
+    ------
+    ValueError
+        If any of the expected columns are missing from the actual columns.
+    """
     missing_features = expected.difference(actual)
     if len(missing_features) != 0:
         raise ValueError(f"{len(missing_features)} features are missing from data: {missing_features.tolist()}")
 
 
 class OneHotEncoder(BaseEstimator, TransformerMixin):
-    """Encode categorical
-    to the one-hot scheme.
+    """Encode categorical features using a one-hot scheme.
 
-
-
+    This transformer only works on pandas DataFrames. It identifies columns
+    with `category` or `object` data type as categorical features.
+    The features are encoded using a one-hot (or dummy) encoding scheme, which
+    creates a binary column for each category. By default, one category per feature
+    is dropped. a column with `M` categories is encoded as `M-1` integer columns
+    according to the one-hot scheme.
+
+    The order of non-categorical columns is preserved. Encoded columns are inserted
+    in place of the original column.
 
     Parameters
     ----------
-    allow_drop :
+    allow_drop : bool, optional, default: True
         Whether to allow dropping categorical columns that only consist
         of a single category.
 
     Attributes
     ----------
     feature_names_ : pandas.Index
-
+        Names of categorical features that were encoded.
 
     categories_ : dict
-
+        A dictionary mapping each categorical feature name to a list of its
+        categories.
 
-    encoded_columns_ :
-
-        Includes names of non-categorical columns.
+    encoded_columns_ : pandas.Index
+        The full list of feature names in the transformed output.
 
     n_features_in_ : int
         Number of features seen during ``fit``.
 
-    feature_names_in_ : ndarray
+    feature_names_in_ : ndarray, shape = (`n_features_in_`,)
         Names of features seen during ``fit``. Defined only when `X`
         has feature names that are all strings.
     """
@@ -61,18 +81,20 @@ class OneHotEncoder(BaseEstimator, TransformerMixin):
         self.allow_drop = allow_drop
 
     def fit(self, X, y=None):  # pylint: disable=unused-argument
-        """
+        """Determine which features are categorical and should be one-hot encoded.
 
         Parameters
         ----------
         X : pandas.DataFrame
-
-        y :
-            Ignored.
+            The data to determine categorical features from.
+        y : None
+            Ignored. This parameter exists only for compatibility with
+            :class:`sklearn.pipeline.Pipeline`.
+
         Returns
         -------
         self : object
-            Returns
+            Returns the instance itself.
         """
         self.fit_transform(X)
         return self
@@ -81,21 +103,27 @@ class OneHotEncoder(BaseEstimator, TransformerMixin):
         return encode_categorical(X, columns=columns_to_encode, allow_drop=self.allow_drop)
 
     def fit_transform(self, X, y=None, **fit_params):  # pylint: disable=unused-argument
-        """
+        """Fit to data, then transform it.
+
+        Fits the transformer to ``X`` by identifying categorical features and
+        then returns a transformed version of ``X`` with categorical features
+        one-hot encoded.
 
         Parameters
         ----------
         X : pandas.DataFrame
-
-        y :
-            Ignored.
-
-
+            The data to fit and transform.
+        y : None, optional
+            Ignored. This parameter exists only for compatibility with
+            :class:`sklearn.pipeline.Pipeline`.
+        fit_params : dict, optional
+            Ignored. This parameter exists only for compatibility with
+            :class:`sklearn.pipeline.Pipeline`.
 
         Returns
         -------
         Xt : pandas.DataFrame
-
+            The transformed data.
         """
         _check_feature_names(self, X, reset=True)
         _check_n_features(self, X, reset=True)
@@ -108,17 +136,17 @@ class OneHotEncoder(BaseEstimator, TransformerMixin):
         return x_dummy
 
     def transform(self, X):
-        """
+        """Transform ``X`` by one-hot encoding categorical features.
 
         Parameters
         ----------
         X : pandas.DataFrame
-
+            The data to transform.
 
         Returns
         -------
         Xt : pandas.DataFrame
-
+            The transformed data.
         """
         check_is_fitted(self, "encoded_columns_")
         _check_n_features(self, X, reset=False)
@@ -136,7 +164,7 @@ class OneHotEncoder(BaseEstimator, TransformerMixin):
 
         Parameters
         ----------
-        input_features : array-like of str or None, default
+        input_features : array-like of str or None, default: None
            Input features.
 
            - If `input_features` is `None`, then `feature_names_in_` is

Binary files changed (contents not shown).
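For context on the reworked OneHotEncoder docstrings above, here is a minimal usage sketch (not part of the diff); the column names and values are made up, and it relies only on the behaviour described in the class docstring.

import pandas as pd

from sksurv.preprocessing import OneHotEncoder

frame = pd.DataFrame(
    {
        "age": [55.0, 61.0, 47.0, 52.0],
        # Columns with `category` (or `object`) dtype are treated as categorical.
        "stage": pd.Categorical(["I", "II", "III", "II"]),
    }
)

encoder = OneHotEncoder()  # allow_drop=True is the default
encoded = encoder.fit_transform(frame)

# "age" keeps its position; "stage" (M=3 categories) is replaced in place
# by M-1 = 2 indicator columns, since one category per feature is dropped.
print(encoded.columns.tolist())
print(encoder.encoded_columns_)  # full list of output feature names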