PyPI - skfolio - Versions diffs - 0.0.1__py3-none-any.whl - Mend

skfolio 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (79) hide show

skfolio/__init__.py +29 -0
skfolio/cluster/__init__.py +8 -0
skfolio/cluster/_hierarchical.py +387 -0
skfolio/datasets/__init__.py +20 -0
skfolio/datasets/_base.py +389 -0
skfolio/datasets/data/__init__.py +0 -0
skfolio/datasets/data/factors_dataset.csv.gz +0 -0
skfolio/datasets/data/sp500_dataset.csv.gz +0 -0
skfolio/datasets/data/sp500_index.csv.gz +0 -0
skfolio/distance/__init__.py +26 -0
skfolio/distance/_base.py +55 -0
skfolio/distance/_distance.py +574 -0
skfolio/exceptions.py +30 -0
skfolio/measures/__init__.py +76 -0
skfolio/measures/_enums.py +355 -0
skfolio/measures/_measures.py +607 -0
skfolio/metrics/__init__.py +3 -0
skfolio/metrics/_scorer.py +121 -0
skfolio/model_selection/__init__.py +18 -0
skfolio/model_selection/_combinatorial.py +407 -0
skfolio/model_selection/_validation.py +194 -0
skfolio/model_selection/_walk_forward.py +221 -0
skfolio/moments/__init__.py +41 -0
skfolio/moments/covariance/__init__.py +29 -0
skfolio/moments/covariance/_base.py +101 -0
skfolio/moments/covariance/_covariance.py +1108 -0
skfolio/moments/expected_returns/__init__.py +21 -0
skfolio/moments/expected_returns/_base.py +31 -0
skfolio/moments/expected_returns/_expected_returns.py +415 -0
skfolio/optimization/__init__.py +36 -0
skfolio/optimization/_base.py +147 -0
skfolio/optimization/cluster/__init__.py +13 -0
skfolio/optimization/cluster/_nco.py +348 -0
skfolio/optimization/cluster/hierarchical/__init__.py +13 -0
skfolio/optimization/cluster/hierarchical/_base.py +440 -0
skfolio/optimization/cluster/hierarchical/_herc.py +406 -0
skfolio/optimization/cluster/hierarchical/_hrp.py +368 -0
skfolio/optimization/convex/__init__.py +16 -0
skfolio/optimization/convex/_base.py +1944 -0
skfolio/optimization/convex/_distributionally_robust.py +392 -0
skfolio/optimization/convex/_maximum_diversification.py +417 -0
skfolio/optimization/convex/_mean_risk.py +974 -0
skfolio/optimization/convex/_risk_budgeting.py +560 -0
skfolio/optimization/ensemble/__init__.py +6 -0
skfolio/optimization/ensemble/_base.py +87 -0
skfolio/optimization/ensemble/_stacking.py +326 -0
skfolio/optimization/naive/__init__.py +3 -0
skfolio/optimization/naive/_naive.py +173 -0
skfolio/population/__init__.py +3 -0
skfolio/population/_population.py +883 -0
skfolio/portfolio/__init__.py +13 -0
skfolio/portfolio/_base.py +1096 -0
skfolio/portfolio/_multi_period_portfolio.py +610 -0
skfolio/portfolio/_portfolio.py +842 -0
skfolio/pre_selection/__init__.py +7 -0
skfolio/pre_selection/_pre_selection.py +342 -0
skfolio/preprocessing/__init__.py +3 -0
skfolio/preprocessing/_returns.py +114 -0
skfolio/prior/__init__.py +18 -0
skfolio/prior/_base.py +63 -0
skfolio/prior/_black_litterman.py +238 -0
skfolio/prior/_empirical.py +163 -0
skfolio/prior/_factor_model.py +268 -0
skfolio/typing.py +50 -0
skfolio/uncertainty_set/__init__.py +23 -0
skfolio/uncertainty_set/_base.py +108 -0
skfolio/uncertainty_set/_bootstrap.py +281 -0
skfolio/uncertainty_set/_empirical.py +237 -0
skfolio/utils/__init__.py +0 -0
skfolio/utils/bootstrap.py +115 -0
skfolio/utils/equations.py +350 -0
skfolio/utils/sorting.py +117 -0
skfolio/utils/stats.py +466 -0
skfolio/utils/tools.py +567 -0
skfolio-0.0.1.dist-info/LICENSE +29 -0
skfolio-0.0.1.dist-info/METADATA +568 -0
skfolio-0.0.1.dist-info/RECORD +79 -0
skfolio-0.0.1.dist-info/WHEEL +5 -0
skfolio-0.0.1.dist-info/top_level.txt +1 -0

skfolio/pre_selection/__init__.py ADDED Viewed

@@ -0,0 +1,7 @@
+from skfolio.pre_selection._pre_selection import (
+    DropCorrelated,
+    SelectKExtremes,
+    SelectNonDominated,
+)
+__all__ = ["DropCorrelated", "SelectKExtremes", "SelectNonDominated"]

skfolio/pre_selection/_pre_selection.py ADDED Viewed

@@ -0,0 +1,342 @@
+"""pre-selection estimators module"""
+# Author: Hugo Delatte <delatte.hugo@gmail.com>
+# License: BSD 3 clause
+import numpy as np
+import numpy.typing as npt
+import sklearn.base as skb
+import sklearn.feature_selection as skf
+import sklearn.utils.validation as skv
+import skfolio.typing as skt
+from skfolio.measures import RatioMeasure
+from skfolio.population import Population
+from skfolio.portfolio import Portfolio
class DropCorrelated(skf.SelectorMixin, skb.BaseEstimator):
    """Transformer for dropping highly correlated assets.

    Simply removing all correlation pairs above the threshold will remove more assets
    than necessary and a naive sequential removal is suboptimal and depends on the
    initial assets ordering.

    Let's suppose X,Y,Z are three random variables with corr(X,Y) and corr(X,Z) above
    the threshold and corr(Y,Z) below.
    The first approach would remove X,Y,Z and the second approach would remove either
    Y and Z or X depending on the initial ordering.

    To avoid these shortcomings, we implement the below algorithm:

        * Step 1: select all correlation pairs above the threshold.
        * Step 2: sort all the selected correlation pairs from highest to lowest.
        * Step 3: for each pair, if none of the two assets has been removed, keep the
          asset with the lowest average correlation against the other assets.

    Parameters
    ----------
    threshold : float, default=0.95
        Correlation threshold. The default value is `0.95`.

    absolute : bool, default=False
        If this is set to True, the absolute value of the correlation is used in
        every step of the algorithm (pair selection, pair ordering and average
        correlation). This has for effect to also include negatively correlated
        assets.

    Attributes
    ----------
    to_keep_ : ndarray of shape (n_assets, )
        Boolean array indicating which assets are remaining.

    n_features_in_ : int
        Number of assets seen during `fit`.

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of assets seen during `fit`. Defined only when `X`
        has assets names that are all strings.
    """

    to_keep_: np.ndarray

    def __init__(self, threshold: float = 0.95, absolute: bool = False):
        self.threshold = threshold
        self.absolute = absolute

    def fit(self, X: npt.ArrayLike, y=None) -> "DropCorrelated":
        """Run the correlation transformer and get the appropriate assets.

        Parameters
        ----------
        X : array-like of shape (n_observations, n_assets)
            Price returns of the assets.

        y : Ignored
            Not used, present for API consistency by convention.

        Returns
        -------
        self : DropCorrelated
            Fitted estimator.

        Raises
        ------
        ValueError
            If `threshold` is not between -1 and 1.
        """
        X = self._validate_data(X)
        if not -1 <= self.threshold <= 1:
            raise ValueError("`threshold` must be between -1 and 1")

        n_assets = X.shape[1]
        # `np.corrcoef` returns a 0-d scalar when there is a single asset; promote it
        # to a (1, 1) matrix so the axis-based operations below remain valid.
        corr = np.atleast_2d(np.corrcoef(X.T))
        if self.absolute:
            # Honour the `absolute` parameter: strongly negative correlations are
            # treated like strongly positive ones.
            corr = np.abs(corr)
        mean_corr = corr.mean(axis=0)

        # Only the strict upper triangle is needed: each pair (i, j), i < j, once.
        triu_idx = np.triu_indices(n_assets, 1)
        pair_corr = corr[triu_idx]

        # select all correlation pairs above the threshold
        selected_idx = np.argwhere(pair_corr > self.threshold).flatten()

        # sort all the selected correlation pairs from highest to lowest
        selected_idx = selected_idx[np.argsort(-pair_corr[selected_idx])]

        # for each pair, if none of the two assets has been removed, keep the asset with
        # the lowest average correlation with other assets
        to_remove = set()
        for idx in selected_idx:
            i, j = triu_idx[0][idx], triu_idx[1][idx]
            if i in to_remove or j in to_remove:
                continue
            to_remove.add(i if mean_corr[i] > mean_corr[j] else j)

        self.to_keep_ = ~np.isin(np.arange(n_assets), list(to_remove))
        return self

    def _get_support_mask(self):
        """Return the boolean mask of the assets kept by `fit`."""
        skv.check_is_fitted(self)
        return self.to_keep_
class SelectKExtremes(skf.SelectorMixin, skb.BaseEstimator):
    """Transformer keeping the `k` best or the `k` worst assets.

    Every asset is ranked on its own according to a given measure and only the `k`
    extreme ones (highest or lowest) are retained.

    Parameters
    ----------
    k : int, default=10
        Number of assets to select. If `k` is higher than the number of assets, all
        assets are selected.

    measure : Measure, default=RatioMeasure.SHARPE_RATIO
        The :ref:`measure <measures_ref>` used to sort the assets.
        The default is `RatioMeasure.SHARPE_RATIO`.

    highest : bool, default=True
        If this is set to True, the `k` assets with the highest `measure` are selected,
        otherwise it is the `k` lowest.

    Attributes
    ----------
    to_keep_ : ndarray of shape (n_assets, )
        Boolean array indicating which assets are remaining.

    n_features_in_ : int
        Number of assets seen during `fit`.

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during `fit`. Defined only when `X`
        has feature names that are all strings.
    """

    to_keep_: np.ndarray

    def __init__(
        self,
        k: int = 10,
        measure: skt.Measure = RatioMeasure.SHARPE_RATIO,
        highest: bool = True,
    ):
        self.k = k
        self.measure = measure
        self.highest = highest

    def fit(self, X: npt.ArrayLike, y=None) -> "SelectKExtremes":
        """Rank the assets individually and flag the `k` extreme ones.

        Parameters
        ----------
        X : array-like of shape (n_observations, n_assets)
            Price returns of the assets.

        y : Ignored
            Not used, present for API consistency by convention.

        Returns
        -------
        self : SelectKExtremes
            Fitted estimator.

        Raises
        ------
        ValueError
            If `k` is not strictly positive.
        """
        X = self._validate_data(X)
        n_selected = int(self.k)
        if n_selected <= 0:
            raise ValueError("`k` must be strictly positive")

        n_assets = X.shape[1]

        # One fully-invested portfolio per asset, so that the measure of each
        # portfolio is the measure of the corresponding asset.
        single_asset_portfolios = Population([])
        for asset in range(n_assets):
            one_hot = np.zeros(n_assets)
            one_hot[asset] = 1
            single_asset_portfolios.append(Portfolio(X=X, weights=one_hot))

        ranked = single_asset_portfolios.sort_measure(
            measure=self.measure, reverse=self.highest
        )
        # Each portfolio holds exactly one asset: its first non-zero index.
        kept_assets = [ptf.nonzero_assets_index[0] for ptf in ranked[:n_selected]]

        self.to_keep_ = np.isin(np.arange(n_assets), kept_assets)
        return self

    def _get_support_mask(self):
        """Return the boolean mask of the assets kept by `fit`."""
        skv.check_is_fitted(self)
        return self.to_keep_
class SelectNonDominated(skf.SelectorMixin, skb.BaseEstimator):
    """Transformer for selecting non dominated assets.

    Pre-selection based on the Assets Preselection Process 2 [1]_.

    Good single asset (for example with high return and low risk) is likely to
    contribute to the final optimized portfolio. Each asset is considered as a portfolio
    and these assets are ranked using the non-domination sorting method. The selection
    is based on the ranks assigned to each asset based on their fitness until the number
    of selected assets reaches the user-defined number.

    Considering only the fitness of individual asset is insufficient because a pair of
    negatively correlated assets has the potential to reduce the risk. Therefore,
    negatively correlated pairs of assets are also considered.

    Parameters
    ----------
    min_n_assets : int, optional
        The minimum number of assets to select. If `min_n_assets` is reached before the
        end of the current non-dominated front, we return the remaining assets of this
        front. This is because all assets in the same front have same rank.
        The default (`None`) is to select the first front.

    threshold : float, default=-0.5
        Asset pair with a correlation below this threshold are included in the
        non-domination sorting. The default value is `-0.5`.

    fitness_measures : list[Measure], optional
        A list of :ref:`measure <measures_ref>` used to compute the portfolio fitness.
        The fitness is used to compare portfolios in terms of domination, compute the
        pareto fronts and run the portfolio selection using non-dominated sorting.
        The default (`None`) is to use the list [PerfMeasure.MEAN, RiskMeasure.VARIANCE]

    Attributes
    ----------
    to_keep_ : ndarray of shape (n_assets, )
        Boolean array indicating which assets are remaining.

    n_features_in_ : int
        Number of assets seen during `fit`.

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during `fit`. Defined only when `X`
        has feature names that are all strings.

    References
    ----------
    .. [1]  "Large-Scale Portfolio Optimization Using Multi-objective Evolutionary
        Algorithms and Preselection Methods",
        B.Y. Qu and Q.Zhou (2017).
    """

    to_keep_: np.ndarray

    def __init__(
        self,
        min_n_assets: int | None = None,
        threshold: float = -0.5,
        fitness_measures: list[skt.Measure] | None = None,
    ):
        self.min_n_assets = min_n_assets
        self.threshold = threshold
        self.fitness_measures = fitness_measures

    def fit(self, X: npt.ArrayLike, y=None) -> "SelectNonDominated":
        """Run the Non Dominated transformer and get the appropriate assets.

        Parameters
        ----------
        X : array-like of shape (n_observations, n_assets)
            Price returns of the assets.

        y : Ignored
            Not used, present for API consistency by convention.

        Returns
        -------
        self : SelectNonDominated
            Fitted estimator.

        Raises
        ------
        ValueError
            If `threshold` is not between -1 and 1.
        """
        X = self._validate_data(X)
        if not -1 <= self.threshold <= 1:
            raise ValueError("`threshold` must be between -1 and 1")

        n_assets = X.shape[1]

        # Nothing to filter when at least as many assets are requested as available.
        if self.min_n_assets is not None and self.min_n_assets >= n_assets:
            self.to_keep_ = np.full(n_assets, True)
            return self

        # Build a population of portfolio
        population = Population([])

        # Add single assets
        for i in range(n_assets):
            weights = np.zeros(n_assets)
            weights[i] = 1
            population.append(
                Portfolio(X=X, weights=weights, fitness_measures=self.fitness_measures)
            )

        # Add pairs with correlation below threshold with minimum variance
        # ptf_variance = s1^2 w1^2 + s2^2 w2^2 + 2 s12 w1 w2 (1)
        # with w1 + w2 = 1
        # To find the minimum we substitute w2 = 1 - w1 in (1) and differentiate with
        # respect to w1 and set to zero.
        # By solving the obtained equation, we get:
        # w1 = (s2^2 - s12) / (s1^2 + s2^2 - 2 s12)
        # w2 = 1 - w1
        corr = np.corrcoef(X.T)
        covariance = np.cov(X.T)
        for i, j in zip(*np.triu_indices(n_assets, 1), strict=True):
            if corr[i, j] < self.threshold:
                cov = covariance[i, j]
                var1 = covariance[i, i]
                var2 = covariance[j, j]
                weights = np.zeros(n_assets)
                weights[i] = (var2 - cov) / (var1 + var2 - 2 * cov)
                weights[j] = 1 - weights[i]
                population.append(
                    Portfolio(
                        X=X, weights=weights, fitness_measures=self.fitness_measures
                    )
                )

        fronts = population.non_denominated_sort(
            first_front_only=self.min_n_assets is None
        )

        # Consume the fronts in rank order. A front is always taken in full (all its
        # members share the same rank) and we stop as soon as `min_n_assets` is
        # reached, so that no extra front is added once the minimum is satisfied.
        new_assets_idx = set()
        for front in fronts:
            for idx in front:
                new_assets_idx.update(population[idx].nonzero_assets_index)
            if (
                self.min_n_assets is not None
                and len(new_assets_idx) >= self.min_n_assets
            ):
                break

        self.to_keep_ = np.isin(np.arange(n_assets), list(new_assets_idx))
        return self

    def _get_support_mask(self):
        """Return the boolean mask of the assets kept by `fit`."""
        skv.check_is_fitted(self)
        return self.to_keep_

skfolio/preprocessing/__init__.py ADDED Viewed

@@ -0,0 +1,3 @@
+from skfolio.preprocessing._returns import prices_to_returns
+__all__ = ["prices_to_returns"]

skfolio/preprocessing/_returns.py ADDED Viewed

@@ -0,0 +1,114 @@
+"""Preprocessing module to transform X to returns."""
+# Author: Hugo Delatte <delatte.hugo@gmail.com>
+# License: BSD 3 clause
+import numpy as np
+import pandas as pd
+def prices_to_returns(
+    X: pd.DataFrame,
+    y: pd.DataFrame | None = None,
+    log_returns: bool = False,
+    nan_threshold: float = 1,
+    join: str = "outer",
+) -> pd.DataFrame | tuple[pd.DataFrame, pd.DataFrame]:
+    r"""Transforms a DataFrame of prices to linear or logarithmic returns.
+    Linear returns (also called simple returns) are defined as:
+        .. math:: \frac{S_{t}}{S_{t-1}} - 1
+    Logarithmic returns (also called continuously compounded return) are defined as:
+        .. math:: ln\Biggl(\frac{S_{t}}{S_{t-1}}\Biggr)
+    With :math:`S_{t}` the asset price at time :math:`t`.
+    .. warning::
+        The linear returns aggregate across securities, meaning that the linear return
+        of the portfolio is the weighted average of the linear returns of the
+        securities. For this reason, **portfolio optimization should be performed
+        using linear returns** [1]_.
+        On the other hand, the logarithmic returns aggregate across time, meaning that
+        the total logarithmic return over K time periods is the sum of all K
+        single-period logarithmic returns.
+    .. seealso::
+        :ref:`data preparation <data_preparation>`
+    Parameters
+    ----------
+    X : DataFrame
+        The DataFrame of assets prices.
+    y : DataFrame, optional
+        The DataFrame of target or factors prices.
+        If provided, it is joined with the DataFrame of prices to ensure identical
+        observations.
+    log_returns : bool, default=True
+        If this is set to True, logarithmic returns are used instead of simple returns.
+    join : str, default='outer
+        The join method between `X` and `y` when `y` is provided.
+    nan_threshold : float, default=1.0
+        Drop observations (rows) that have a percentage of missing assets prices above
+        this threshold. The default (`1.0`) is to keep all the observations.
+    Returns
+    -------
+    X : DataFrame
+        The DataFrame of price returns of the input `X`.
+    y : DataFrame, optional
+        The DataFrame of price returns of the input `y` when provided.
+    References
+    ----------
+    .. [1]  "Linear vs. Compounded Returns – Common Pitfalls in Portfolio Management".
+        GARP Risk Professional.
+        Attilio Meucci (2010).
+    """
+    if not isinstance(X, pd.DataFrame):
+        raise TypeError("`X` must be a DataFrame")
+    if y is None:
+        df = X.copy()
+    else:
+        if not isinstance(y, pd.DataFrame):
+            raise TypeError("`y` must be a DataFrame")
+        df = pd.concat([X, y], join=join, axis=1)
+    n_observations, n_assets = X.shape
+    # Remove observations with missing X above threshold
+    if nan_threshold is not None:
+        nan_threshold = float(nan_threshold)
+        if not 0 < nan_threshold <= 1:
+            raise ValueError("`nan_threshold` must be between 0 and 1")
+        count_nan = df.isna().sum(axis=1)
+        to_drop = count_nan[count_nan > n_assets * nan_threshold].index
+        if len(to_drop) > 0:
+            df.drop(to_drop, axis=0, inplace=True)
+    # Forward fill missing values
+    df.ffill(inplace=True)
+    # Drop rows if any of its values is missing
+    df.dropna(axis=0, how="any", inplace=True)
+    # Drop column if all its values are missing
+    df.dropna(axis=1, how="all", inplace=True)
+    # returns
+    all_returns = df.pct_change().dropna()
+    if log_returns:
+        all_returns = np.log1p(all_returns)
+    if y is None:
+        return all_returns
+    returns = all_returns[[x for x in X.columns if x in df.columns]]
+    factor_returns = all_returns[[x for x in y.columns if x in df.columns]]
+    return returns, factor_returns

skfolio/prior/__init__.py ADDED Viewed

@@ -0,0 +1,18 @@
+from skfolio.prior._base import BasePrior, PriorModel
+from skfolio.prior._black_litterman import BlackLitterman
+from skfolio.prior._empirical import EmpiricalPrior
+from skfolio.prior._factor_model import (
+    FactorModel,
+    BaseLoadingMatrix,
+    LoadingMatrixRegression,
+)
+__all__ = [
+    "PriorModel",
+    "BasePrior",
+    "EmpiricalPrior",
+    "BlackLitterman",
+    "FactorModel",
+    "BaseLoadingMatrix",
+    "LoadingMatrixRegression",
+]

skfolio/prior/_base.py ADDED Viewed

@@ -0,0 +1,63 @@
+"""Base Prior estimator"""
+# Author: Hugo Delatte <delatte.hugo@gmail.com>
+# License: BSD 3 clause
+from abc import ABC, abstractmethod
+from dataclasses import dataclass
+import numpy as np
+import numpy.typing as npt
+import sklearn.base as skb
+# frozen=True with eq=False will lead to an id-based hashing which is needed for
+# caching CVX models in Optimization without impacting performance
+@dataclass(frozen=True, eq=False)
+class PriorModel:
+    """Prior model dataclass.
+    Attributes
+    ----------
+    mu : ndarray of shape (n_assets,)
+        Estimation of the assets expected returns.
+    covariance : ndarray of shape (n_assets, n_assets)
+        Estimation of the assets covariance matrix.
+    returns : ndarray of shape (n_observations, n_assets)
+        Estimation of the assets returns.
+    cholesky : ndarray, optional
+        Lower-triangular Cholesky factor of the covariance. In some cases it is possible
+        to obtain a cholesky factor with less dimension compared to the one obtained
+        directly by applying the cholesky decomposition to the covariance estimation
+        (for example in Factor Models). When provided, this cholesky factor is use in
+        some optimizations (for example in mean-variance) to improve performance and
+        convergence. The default is `None`.
+    """
+    mu: np.ndarray
+    covariance: np.ndarray
+    returns: np.ndarray
+    cholesky: np.ndarray | None = None
class BasePrior(skb.BaseEstimator, ABC):
    """Abstract base class shared by all prior estimators in skfolio.

    Notes
    -----
    All estimators should specify all the parameters that can be set
    at the class level in their ``__init__`` as explicit keyword
    arguments (no ``*args`` or ``**kwargs``).
    """

    prior_model_: PriorModel

    @abstractmethod
    def __init__(self):
        """Abstract constructor: concrete priors must define their own."""

    @abstractmethod
    def fit(self, X: npt.ArrayLike, y=None):
        """Abstract fitting method: concrete priors must implement it."""