PyPI - skfolio - Versions diffs - 0.4.2__py3-none-any.whl → 0.5.0__py3-none-any.whl - Mend

skfolio 0.4.2__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (18) hide show

skfolio/optimization/cluster/hierarchical/_base.py +12 -65
skfolio/optimization/cluster/hierarchical/_herc.py +75 -26
skfolio/optimization/cluster/hierarchical/_hrp.py +68 -12
skfolio/population/_population.py +1 -1
skfolio/pre_selection/__init__.py +12 -6
skfolio/pre_selection/_drop_correlated.py +108 -0
skfolio/pre_selection/_select_complete.py +116 -0
skfolio/pre_selection/_select_k_extremes.py +100 -0
skfolio/pre_selection/_select_non_dominated.py +161 -0
skfolio/pre_selection/_select_non_expiring.py +148 -0
skfolio/preprocessing/_returns.py +9 -3
skfolio/utils/stats.py +87 -0
{skfolio-0.4.2.dist-info → skfolio-0.5.0.dist-info}/METADATA +2 -2
{skfolio-0.4.2.dist-info → skfolio-0.5.0.dist-info}/RECORD +17 -13
{skfolio-0.4.2.dist-info → skfolio-0.5.0.dist-info}/WHEEL +1 -1
skfolio/pre_selection/_pre_selection.py +0 -343
{skfolio-0.4.2.dist-info → skfolio-0.5.0.dist-info}/LICENSE +0 -0
{skfolio-0.4.2.dist-info → skfolio-0.5.0.dist-info}/top_level.txt +0 -0

skfolio/optimization/cluster/hierarchical/_base.py CHANGED Viewed

@@ -52,8 +52,6 @@ class BaseHierarchicalOptimization(BaseOptimization, ABC):
             * ENTROPIC_RISK_MEASURE
             * FOURTH_CENTRAL_MOMENT
             * FOURTH_LOWER_PARTIAL_MOMENT
-            * SKEW
-            * KURTOSIS
         The default is `RiskMeasure.VARIANCE`.
@@ -80,12 +78,12 @@ class BaseHierarchicalOptimization(BaseOptimization, ABC):
     min_weights : float | dict[str, float] | array-like of shape (n_assets, ), default=0.0
         Minimum assets weights (weights lower bounds). Negative weights are not allowed.
-        If a float is provided, it is applied to each asset. `None` is equivalent to
-        `-np.Inf` (no lower bound). If a dictionary is provided, its (key/value) pair
-        must be the (asset name/asset minium weight) and the input `X` of the `fit`
-        methods must be a DataFrame with the assets names in columns. When using a
-        dictionary, assets values that are not provided are assigned a minimum weight
-        of `0.0`. The default is 0.0 (no short selling).
+        If a float is provided, it is applied to each asset.
+        If a dictionary is provided, its (key/value) pair must be the
+        (asset name/asset minimum weight) and the input `X` of the `fit` method must be
+        a DataFrame with the assets names in columns.
+        When using a dictionary, assets values that are not provided are assigned a
+        minimum weight of `0.0`. The default is 0.0 (no short selling).
         Example:
@@ -96,12 +94,12 @@ class BaseHierarchicalOptimization(BaseOptimization, ABC):
     max_weights : float | dict[str, float] | array-like of shape (n_assets, ), default=1.0
         Maximum assets weights (weights upper bounds). Weights above 1.0 are not
-        allowed. If a float is provided, it is applied to each asset. `None` is
-        equivalent to `+np.Inf` (no upper bound). If a dictionary is provided, its
-        (key/value) pair must be the (asset name/asset maximum weight) and the input `X`
-        of the `fit` method must be a DataFrame with the assets names in columns. When
-        using a dictionary, assets values that are not provided are assigned a minimum
-        weight of `1.0`. The default is 1.0 (each asset is below 100%).
+        allowed. If a float is provided, it is applied to each asset.
+        If a dictionary is provided, its (key/value) pair must be the
+        (asset name/asset maximum weight) and the input `X` of the `fit` method must be
+        a DataFrame with the assets names in columns.
+        When using a dictionary, assets values that are not provided are assigned a
+        maximum weight of `1.0`. The default is 1.0 (each asset is below 100%).
         Example:
@@ -388,57 +386,6 @@ class BaseHierarchicalOptimization(BaseOptimization, ABC):
         return min_weights, max_weights
-    @staticmethod
-    def _apply_weight_constraints_to_alpha(
-        alpha: float,
-        max_weights: np.ndarray,
-        min_weights: np.ndarray,
-        weights: np.ndarray,
-        left_cluster: np.ndarray,
-        right_cluster: np.ndarray,
-    ) -> float:
-        """Apply weight constraints to the alpha multiplication factor of the
-        Hierarchical Tree Clustering algorithm.
-        Parameters
-        ----------
-        alpha : float
-            The alpha multiplication factor of the Hierarchical Tree Clustering
-            algorithm.
-         min_weights : ndarray of shape (n_assets,)
-            The weight lower bound 1D array.
-        max_weights : ndarray of shape (n_assets,)
-            The weight upper bound 1D array.
-        weights : np.ndarray of shape (n_assets,)
-            The assets weights.
-        left_cluster : ndarray of shape (n_left_cluster,)
-            Indices of the left cluster weights.
-        right_cluster : ndarray of shape (n_right_cluster,)
-            Indices of the right cluster weights.
-        Returns
-        -------
-        value : float
-            The transformed alpha incorporating the weight constraints.
-        """
-        alpha = min(
-            np.sum(max_weights[left_cluster]) / weights[left_cluster[0]],
-            max(np.sum(min_weights[left_cluster]) / weights[left_cluster[0]], alpha),
-        )
-        alpha = 1 - min(
-            np.sum(max_weights[right_cluster]) / weights[right_cluster[0]],
-            max(
-                np.sum(min_weights[right_cluster]) / weights[right_cluster[0]],
-                1 - alpha,
-            ),
-        )
-        return alpha
     def get_metadata_routing(self):
         # noinspection PyTypeChecker
         router = (

skfolio/optimization/cluster/hierarchical/_herc.py CHANGED Viewed

@@ -3,8 +3,7 @@
 # Copyright (c) 2023
 # Author: Hugo Delatte <delatte.hugo@gmail.com>
 # License: BSD 3 clause
-# The risk measure generalization and constraint features are derived
-# from Riskfolio-Lib, Copyright (c) 2020-2023, Dany Cajas, Licensed under BSD 3 clause.
+# Weight constraints is a novel implementation, see docstring for more details.
 import numpy as np
 import numpy.typing as npt
@@ -20,6 +19,7 @@ from skfolio.optimization.cluster.hierarchical._base import (
     BaseHierarchicalOptimization,
 )
 from skfolio.prior import BasePrior, EmpiricalPrior
+from skfolio.utils.stats import minimize_relative_weight_deviation
 from skfolio.utils.tools import check_estimator
@@ -45,6 +45,32 @@ class HierarchicalEqualRiskContribution(BaseHierarchicalOptimization):
         which is more stable and has better properties than the single-linkage
         method [4]_.
+        Also, the initial paper does not provide an algorithm for handling weight
+        constraints, and no standard solution currently exists.
+        In contrast to HRP (Hierarchical Risk Parity), where weight constraints
+        can be applied to the split factor at each bisection step, HERC
+        (Hierarchical Equal Risk Contribution) cannot incorporate weight constraints
+        during the intermediate steps of the allocation. Therefore, in HERC, the
+        weight constraints must be enforced after the top-down allocation has been
+        completed.
+        In skfolio, we minimize the relative deviation of the final weights from
+        the initial weights. This is formulated as a convex optimization problem:
+        .. math::
+            \begin{cases}
+            \begin{aligned}
+            &\min_{w} & & \Vert \frac{w - w_{init}}{w_{init}} \Vert_{2}^{2} \\
+            &\text{s.t.} & & \sum_{i=1}^{N} w_{i} = 1 \\
+            & & & w_{min} \leq w_i \leq w_{max}, \quad \forall i
+            \end{aligned}
+            \end{cases}
+        The reason for minimizing the relative deviation (as opposed to the absolute
+        deviation) is that we want to limit the impact on the risk contribution of
+        each asset. Since HERC allocates inversely to risk, adjusting the weights
+        based on relative deviation ensures that the assets' risk contributions
+        remain proportionally consistent with the initial allocation.
     Parameters
     ----------
     risk_measure : RiskMeasure or ExtraRiskMeasure, default=RiskMeasure.VARIANCE
@@ -70,8 +96,6 @@ class HierarchicalEqualRiskContribution(BaseHierarchicalOptimization):
             * ENTROPIC_RISK_MEASURE
             * FOURTH_CENTRAL_MOMENT
             * FOURTH_LOWER_PARTIAL_MOMENT
-            * SKEW
-            * KURTOSIS
         The default is `RiskMeasure.VARIANCE`.
@@ -98,12 +122,12 @@ class HierarchicalEqualRiskContribution(BaseHierarchicalOptimization):
     min_weights : float | dict[str, float] | array-like of shape (n_assets, ), default=0.0
         Minimum assets weights (weights lower bounds). Negative weights are not allowed.
-        If a float is provided, it is applied to each asset. `None` is equivalent to
-        `-np.Inf` (no lower bound). If a dictionary is provided, its (key/value) pair
-        must be the (asset name/asset minium weight) and the input `X` of the `fit`
-        methods must be a DataFrame with the assets names in columns. When using a
-        dictionary, assets values that are not provided are assigned a minimum weight
-        of `0.0`. The default is 0.0 (no short selling).
+        If a float is provided, it is applied to each asset.
+        If a dictionary is provided, its (key/value) pair must be the
+        (asset name/asset minimum weight) and the input `X` of the `fit` method must be
+        a DataFrame with the assets names in columns.
+        When using a dictionary, assets values that are not provided are assigned a
+        minimum weight of `0.0`. The default is 0.0 (no short selling).
         Example:
@@ -114,12 +138,12 @@ class HierarchicalEqualRiskContribution(BaseHierarchicalOptimization):
     max_weights : float | dict[str, float] | array-like of shape (n_assets, ), default=1.0
         Maximum assets weights (weights upper bounds). Weights above 1.0 are not
-        allowed. If a float is provided, it is applied to each asset. `None` is
-        equivalent to `+np.Inf` (no upper bound). If a dictionary is provided, its
-        (key/value) pair must be the (asset name/asset maximum weight) and the input `X`
-        of the `fit` method must be a DataFrame with the assets names in columns. When
-        using a dictionary, assets values that are not provided are assigned a minimum
-        weight of `1.0`. The default is 1.0 (each asset is below 100%).
+        allowed. If a float is provided, it is applied to each asset.
+        If a dictionary is provided, its (key/value) pair must be the
+        (asset name/asset maximum weight) and the input `X` of the `fit` method must be
+        a DataFrame with the assets names in columns.
+        When using a dictionary, assets values that are not provided are assigned a
+        maximum weight of `1.0`. The default is 1.0 (each asset is below 100%).
         Example:
@@ -208,6 +232,19 @@ class HierarchicalEqualRiskContribution(BaseHierarchicalOptimization):
         `management_fees`, `previous_weights` and `risk_free_rate` are copied from the
         optimization model and passed to the portfolio.
+    solver : str, default="CLARABEL"
+        The solver used for the weights constraints optimization. The default is
+        "CLARABEL" which is written in Rust and has better numerical stability and
+        performance than ECOS and SCS.
+        For more details about available solvers, check the CVXPY documentation:
+        https://www.cvxpy.org/tutorial/advanced/index.html#choosing-a-solver
+    solver_params : dict, optional
+        Solver parameters. For example, `solver_params=dict(verbose=True)`.
+        The default (`None`) is to use the CVXPY default.
+        For more details about solver arguments, check the CVXPY documentation:
+        https://www.cvxpy.org/tutorial/advanced/index.html#setting-solver-options
     Attributes
     ----------
     weights_ : ndarray of shape (n_assets,)
@@ -251,6 +288,8 @@ class HierarchicalEqualRiskContribution(BaseHierarchicalOptimization):
         hierarchical_clustering_estimator: HierarchicalClustering | None = None,
         min_weights: skt.MultiInput | None = 0.0,
         max_weights: skt.MultiInput | None = 1.0,
+        solver: str = "CLARABEL",
+        solver_params: dict | None = None,
         transaction_costs: skt.MultiInput = 0.0,
         management_fees: skt.MultiInput = 0.0,
         previous_weights: skt.MultiInput | None = None,
@@ -268,6 +307,8 @@ class HierarchicalEqualRiskContribution(BaseHierarchicalOptimization):
             previous_weights=previous_weights,
             portfolio_params=portfolio_params,
         )
+        self.solver = solver
+        self.solver_params = solver_params
     def fit(
         self, X: npt.ArrayLike, y: None = None, **fit_params
@@ -301,6 +342,13 @@ class HierarchicalEqualRiskContribution(BaseHierarchicalOptimization):
             raise TypeError(
                 "`risk_measure` must be of type `RiskMeasure` or `ExtraRiskMeasure`"
             )
+        if self.risk_measure in [ExtraRiskMeasure.SKEW, ExtraRiskMeasure.KURTOSIS]:
+            # Because Skew and Kurtosis can take negative values
+            raise ValueError(
+                f"risk_measure {self.risk_measure} currently not supported" f"in HERC"
+            )
         self.prior_estimator_ = check_estimator(
             self.prior_estimator,
             default=EmpiricalPrior(),
@@ -393,21 +441,12 @@ class HierarchicalEqualRiskContribution(BaseHierarchicalOptimization):
             left_cluster = np.array(left_cluster)
             right_cluster = np.array(right_cluster)
             left_risk = np.sum(cluster_risks[left_cluster])
             right_risk = np.sum(cluster_risks[right_cluster])
             alpha = 1 - left_risk / (left_risk + right_risk)
-            # Weights constraints
-            alpha = self._apply_weight_constraints_to_alpha(
-                alpha=alpha,
-                weights=weights,
-                max_weights=max_weights,
-                min_weights=min_weights,
-                left_cluster=left_cluster,
-                right_cluster=right_cluster,
-            )
             clusters_weights[left_cluster] *= alpha
             clusters_weights[right_cluster] *= 1 - alpha
@@ -421,5 +460,15 @@ class HierarchicalEqualRiskContribution(BaseHierarchicalOptimization):
         for i, cluster_ids in enumerate(clusters):
             weights[cluster_ids] *= clusters_weights[i]
+        # Apply weights constraints
+        weights = minimize_relative_weight_deviation(
+            weights=weights,
+            min_weights=min_weights,
+            max_weights=max_weights,
+            solver=self.solver,
+            solver_params=self.solver_params,
+        )
         self.weights_ = weights
         return self

skfolio/optimization/cluster/hierarchical/_hrp.py CHANGED Viewed

@@ -72,8 +72,6 @@ class HierarchicalRiskParity(BaseHierarchicalOptimization):
             * ENTROPIC_RISK_MEASURE
             * FOURTH_CENTRAL_MOMENT
             * FOURTH_LOWER_PARTIAL_MOMENT
-            * SKEW
-            * KURTOSIS
         The default is `RiskMeasure.VARIANCE`.
@@ -100,9 +98,9 @@ class HierarchicalRiskParity(BaseHierarchicalOptimization):
     min_weights : float | dict[str, float] | array-like of shape (n_assets, ), default=0.0
         Minimum assets weights (weights lower bounds). Negative weights are not allowed.
-        If a float is provided, it is applied to each asset. `None` is equivalent to
-        `-np.Inf` (no lower bound). If a dictionary is provided, its (key/value) pair
-        must be the (asset name/asset minium weight) and the input `X` of the `fit`
+        If a float is provided, it is applied to each asset.
+        If a dictionary is provided, its (key/value) pair must be the
+        (asset name/asset minimum weight) and the input `X` of the `fit`
         methods must be a DataFrame with the assets names in columns. When using a
         dictionary, assets values that are not provided are assigned a minimum weight
         of `0.0`. The default is 0.0 (no short selling).
@@ -116,12 +114,12 @@ class HierarchicalRiskParity(BaseHierarchicalOptimization):
     max_weights : float | dict[str, float] | array-like of shape (n_assets, ), default=1.0
         Maximum assets weights (weights upper bounds). Weights above 1.0 are not
-        allowed. If a float is provided, it is applied to each asset. `None` is
-        equivalent to `+np.Inf` (no upper bound). If a dictionary is provided, its
-        (key/value) pair must be the (asset name/asset maximum weight) and the input `X`
-        of the `fit` method must be a DataFrame with the assets names in columns. When
-        using a dictionary, assets values that are not provided are assigned a minimum
-        weight of `1.0`. The default is 1.0 (each asset is below 100%).
+        allowed. If a float is provided, it is applied to each asset.
+        If a dictionary is provided, its (key/value) pair must be the
+        (asset name/asset maximum weight) and the input `X` of the `fit` method must
+        be a DataFrame with the assets names in columns.
+        When using a dictionary, assets values that are not provided are assigned a
+        maximum weight of `1.0`. The default is 1.0 (each asset is below 100%).
         Example:
@@ -296,6 +294,13 @@ class HierarchicalRiskParity(BaseHierarchicalOptimization):
             raise TypeError(
                 "`risk_measure` must be of type `RiskMeasure` or `ExtraRiskMeasure`"
             )
+        if self.risk_measure in [ExtraRiskMeasure.SKEW, ExtraRiskMeasure.KURTOSIS]:
+            # Because Skew and Kurtosis can take negative values
+            raise ValueError(
+                f"risk_measure {self.risk_measure} currently not supported" f"in HRP"
+            )
         self.prior_estimator_ = check_estimator(
             self.prior_estimator,
             default=EmpiricalPrior(),
@@ -365,7 +370,7 @@ class HierarchicalRiskParity(BaseHierarchicalOptimization):
                 left_cluster, right_cluster = clusters_ids
                 alpha = 1 - left_risk / (left_risk + right_risk)
                 # Weights constraints
-                alpha = self._apply_weight_constraints_to_alpha(
+                alpha = _apply_weight_constraints_to_split_factor(
                     alpha=alpha,
                     weights=weights,
                     max_weights=max_weights,
@@ -379,3 +384,54 @@ class HierarchicalRiskParity(BaseHierarchicalOptimization):
         self.weights_ = weights
         return self
+def _apply_weight_constraints_to_split_factor(
+    alpha: float,
+    max_weights: np.ndarray,
+    min_weights: np.ndarray,
+    weights: np.ndarray,
+    left_cluster: np.ndarray,
+    right_cluster: np.ndarray,
+) -> float:
+    """
+    Apply weight constraints to the split factor alpha of the Hierarchical Tree
+    Clustering algorithm.
+    Parameters
+    ----------
+    alpha : float
+        The split factor alpha of the Hierarchical Tree Clustering algorithm.
+    min_weights : ndarray of shape (n_assets,)
+        The weight lower bound 1D array.
+    max_weights : ndarray of shape (n_assets,)
+        The weight upper bound 1D array.
+    weights : np.ndarray of shape (n_assets,)
+        The assets weights.
+    left_cluster : ndarray of shape (n_left_cluster,)
+        Indices of the left cluster weights.
+    right_cluster : ndarray of shape (n_right_cluster,)
+        Indices of the right cluster weights.
+    Returns
+    -------
+    value : float
+        The transformed split factor alpha incorporating the weight constraints.
+    """
+    alpha = min(
+        np.sum(max_weights[left_cluster]) / weights[left_cluster[0]],
+        max(np.sum(min_weights[left_cluster]) / weights[left_cluster[0]], alpha),
+    )
+    alpha = 1 - min(
+        np.sum(max_weights[right_cluster]) / weights[right_cluster[0]],
+        max(
+            np.sum(min_weights[right_cluster]) / weights[right_cluster[0]],
+            1 - alpha,
+        ),
+    )
+    return alpha

skfolio/population/_population.py CHANGED Viewed

@@ -653,7 +653,7 @@ class Population(list):
         spacing: float | None = None,
         display_sub_ptf_name: bool = True,
     ) -> go.Figure:
-        """Plot the contribution of each asset to a given measure of the portfolios
+        r"""Plot the contribution of each asset to a given measure of the portfolios
         in the population.
         Parameters

skfolio/pre_selection/__init__.py CHANGED Viewed

@@ -1,7 +1,13 @@
-from skfolio.pre_selection._pre_selection import (
-    DropCorrelated,
-    SelectKExtremes,
-    SelectNonDominated,
-)
+from skfolio.pre_selection._drop_correlated import DropCorrelated
+from skfolio.pre_selection._select_complete import SelectComplete
+from skfolio.pre_selection._select_k_extremes import SelectKExtremes
+from skfolio.pre_selection._select_non_dominated import SelectNonDominated
+from skfolio.pre_selection._select_non_expiring import SelectNonExpiring
-__all__ = ["DropCorrelated", "SelectKExtremes", "SelectNonDominated"]
+__all__ = [
+    "DropCorrelated",
+    "SelectKExtremes",
+    "SelectNonDominated",
+    "SelectComplete",
+    "SelectNonExpiring",
+]

skfolio/pre_selection/_drop_correlated.py ADDED Viewed

@@ -0,0 +1,108 @@
+"""Pre-selection DropCorrelated module"""
+# Copyright (c) 2023
+# Author: Hugo Delatte <delatte.hugo@gmail.com>
+# License: BSD 3 clause
+import numpy as np
+import numpy.typing as npt
+import sklearn.base as skb
+import sklearn.feature_selection as skf
+import sklearn.utils.validation as skv
+class DropCorrelated(skf.SelectorMixin, skb.BaseEstimator):
+    """Transformer for dropping highly correlated assets.
+    Simply removing all correlation pairs above the threshold will remove more assets
+    than necessary and a naive sequential removal is suboptimal and depends on the
+    initial assets ordering.
+    Let's suppose X,Y,Z are three random variables with corr(X,Y) and corr(X,Z) above
+    the threshold and corr(Y,Z) below.
+    The first approach would remove X,Y,Z and the second approach would remove either
+    Y and Z or X depending on the initial ordering.
+    To avoid these shortcomings, we implement the below algorithm:
+        * Step 1: select all correlation pairs above the threshold.
+        * Step 2: sort all the selected correlation pairs from highest to lowest.
+        * Step 3: for each pair, if none of the two assets has been removed, keep the
+          asset with the lowest average correlation against the other assets.
+    Parameters
+    ----------
+    threshold : float, default=0.95
+        Correlation threshold. The default value is `0.95`.
+    absolute : bool, default=False
+        If this is set to True, we take the absolute value of the correlation. As a
+        result, strongly negatively correlated assets are also included.
+    Attributes
+    ----------
+    to_keep_ : ndarray of shape (n_assets, )
+        Boolean array indicating which assets are remaining.
+    n_features_in_ : int
+        Number of assets seen during `fit`.
+    feature_names_in_ : ndarray of shape (`n_features_in_`,)
+        Names of assets seen during `fit`. Defined only when `X`
+        has assets names that are all strings.
+    """
+    to_keep_: np.ndarray
+    def __init__(self, threshold: float = 0.95, absolute: bool = False):
+        self.threshold = threshold
+        self.absolute = absolute
+    def fit(self, X: npt.ArrayLike, y=None):
+        """Run the correlation transformer and get the appropriate assets.
+        Parameters
+        ----------
+        X : array-like of shape (n_observations, n_assets)
+            Price returns of the assets.
+        y : Ignored
+            Not used, present for API consistency by convention.
+        Returns
+        -------
+        self : DropCorrelated
+            Fitted estimator.
+        """
+        X = self._validate_data(X)
+        if not -1 <= self.threshold <= 1:
+            raise ValueError("`threshold` must be between -1 and 1")
+        n_assets = X.shape[1]
+        corr = np.corrcoef(X.T)
+        mean_corr = corr.mean(axis=0)
+        triu_idx = np.triu_indices(n_assets, 1)
+        # select all correlation pairs above the threshold
+        selected_idx = np.argwhere(corr[triu_idx] > self.threshold).flatten()
+        # sort all the selected correlation pairs from highest to lowest
+        selected_idx = selected_idx[np.argsort(-corr[triu_idx][selected_idx])]
+        # for each pair, if none of the two assets has been removed, keep the asset with
+        # the lowest average correlation with other assets
+        to_remove = set()
+        for idx in selected_idx:
+            i, j = triu_idx[0][idx], triu_idx[1][idx]
+            if i not in to_remove and j not in to_remove:
+                if mean_corr[i] > mean_corr[j]:
+                    to_remove.add(i)
+                else:
+                    to_remove.add(j)
+        self.to_keep_ = ~np.isin(np.arange(n_assets), list(to_remove))
+        return self
+    def _get_support_mask(self):
+        skv.check_is_fitted(self)
+        return self.to_keep_

skfolio/pre_selection/_select_complete.py ADDED Viewed

@@ -0,0 +1,116 @@
+"""Pre-selection SelectComplete module"""
+# Copyright (c) 2023
+# Author: Hugo Delatte <delatte.hugo@gmail.com>
+# License: BSD 3 clause
+import numpy as np
+import numpy.typing as npt
+import sklearn.base as skb
+import sklearn.feature_selection as skf
+import sklearn.utils.validation as skv
+class SelectComplete(skf.SelectorMixin, skb.BaseEstimator):
+    """
+    Transformer to select assets with complete data across the entire observation
+    period.
+    This transformer removes assets (columns) that have missing values (NaNs) at the
+    beginning or end of the period.
+    This transformer is especially useful for financial datasets where assets
+    (e.g., stocks, bonds) may have data gaps due to late inception (assets that started
+    trading later), early expiry or default (assets that stopped trading before the
+    end of the period).
+    If missing values are not at the beginning or end but occur between non-missing
+    values, the asset is not removed unless `drop_assets_with_internal_nan` is set to
+    `True`.
+    Parameters
+    ----------
+    drop_assets_with_internal_nan : bool, default=False
+        If set to True, assets with missing values (NaNs) that appear between
+        non-missing values (i.e., internal NaNs) will also be removed. By default,
+        only assets with leading or trailing NaNs are removed.
+    Attributes
+    ----------
+    to_keep_ : ndarray of shape (n_assets, )
+       Boolean array indicating which assets are remaining.
+    n_features_in_ : int
+       Number of assets seen during `fit`.
+    feature_names_in_ : ndarray of shape (`n_features_in_`,)
+       Names of features seen during `fit`. Defined only when `X`
+       has feature names that are all strings.
+    Examples
+    --------
+        >>> import numpy as np
+        >>> import pandas as pd
+        >>> from skfolio.pre_selection import SelectComplete
+        >>> X = pd.DataFrame({
+        ...     'asset1': [np.nan, np.nan, 2, 3, 4],    # Starts late (inception)
+        ...     'asset2': [1, 2, 3, 4, 5],         # Complete data
+        ...     'asset3': [1, 2, 3, np.nan, 5], # Missing values within data
+        ...     'asset4': [1, 2, 3, 4, np.nan]      # Ends early (expiration)
+        ... })
+        >>> selector = SelectComplete()
+        >>> selector.fit_transform(X)
+        array([[ 1.,  1.],
+               [ 2.,  2.],
+               [ 3.,  3.],
+               [ 4., nan],
+               [ 5.,  5.]])
+        >>> selector = SelectComplete(drop_assets_with_internal_nan=True)
+        >>> selector.fit_transform(X)
+        array([[1.],
+               [2.],
+               [3.],
+               [4.],
+               [5.]])
+    """
+    to_keep_: np.ndarray
+    def __init__(self, drop_assets_with_internal_nan: bool = False):
+        self.drop_assets_with_internal_nan = drop_assets_with_internal_nan
+    def fit(self, X: npt.ArrayLike, y=None) -> "SelectComplete":
+        """Run the SelectComplete transformer and get the appropriate assets.
+        Parameters
+        ----------
+        X : array-like of shape (n_observations, n_assets)
+            Returns of the assets.
+        y : Ignored
+            Not used, present for API consistency by convention.
+        Returns
+        -------
+        self : SelectComplete
+            Fitted estimator.
+        """
+        # Validate by allowing NaNs
+        X = self._validate_data(X, force_all_finite="allow-nan")
+        if self.drop_assets_with_internal_nan:
+            # Identify columns with any NaNs
+            self.to_keep_ = ~np.isnan(X).any(axis=0)
+        else:
+            # Identify columns with no leading or trailing NaNs
+            self.to_keep_ = ~np.isnan(X[0, :]) & ~np.isnan(X[-1, :])
+        return self
+    def _get_support_mask(self):
+        skv.check_is_fitted(self)
+        return self.to_keep_
+    def _more_tags(self):
+        return {"allow_nan": True}

skfolio 0.4.2__py3-none-any.whl → 0.5.0__py3-none-any.whl

skfolio 0.4.2__py3-none-any.whl → 0.5.0__py3-none-any.whl