skfolio-0.0.1-py3-none-any.whl
- skfolio/__init__.py +29 -0
- skfolio/cluster/__init__.py +8 -0
- skfolio/cluster/_hierarchical.py +387 -0
- skfolio/datasets/__init__.py +20 -0
- skfolio/datasets/_base.py +389 -0
- skfolio/datasets/data/__init__.py +0 -0
- skfolio/datasets/data/factors_dataset.csv.gz +0 -0
- skfolio/datasets/data/sp500_dataset.csv.gz +0 -0
- skfolio/datasets/data/sp500_index.csv.gz +0 -0
- skfolio/distance/__init__.py +26 -0
- skfolio/distance/_base.py +55 -0
- skfolio/distance/_distance.py +574 -0
- skfolio/exceptions.py +30 -0
- skfolio/measures/__init__.py +76 -0
- skfolio/measures/_enums.py +355 -0
- skfolio/measures/_measures.py +607 -0
- skfolio/metrics/__init__.py +3 -0
- skfolio/metrics/_scorer.py +121 -0
- skfolio/model_selection/__init__.py +18 -0
- skfolio/model_selection/_combinatorial.py +407 -0
- skfolio/model_selection/_validation.py +194 -0
- skfolio/model_selection/_walk_forward.py +221 -0
- skfolio/moments/__init__.py +41 -0
- skfolio/moments/covariance/__init__.py +29 -0
- skfolio/moments/covariance/_base.py +101 -0
- skfolio/moments/covariance/_covariance.py +1108 -0
- skfolio/moments/expected_returns/__init__.py +21 -0
- skfolio/moments/expected_returns/_base.py +31 -0
- skfolio/moments/expected_returns/_expected_returns.py +415 -0
- skfolio/optimization/__init__.py +36 -0
- skfolio/optimization/_base.py +147 -0
- skfolio/optimization/cluster/__init__.py +13 -0
- skfolio/optimization/cluster/_nco.py +348 -0
- skfolio/optimization/cluster/hierarchical/__init__.py +13 -0
- skfolio/optimization/cluster/hierarchical/_base.py +440 -0
- skfolio/optimization/cluster/hierarchical/_herc.py +406 -0
- skfolio/optimization/cluster/hierarchical/_hrp.py +368 -0
- skfolio/optimization/convex/__init__.py +16 -0
- skfolio/optimization/convex/_base.py +1944 -0
- skfolio/optimization/convex/_distributionally_robust.py +392 -0
- skfolio/optimization/convex/_maximum_diversification.py +417 -0
- skfolio/optimization/convex/_mean_risk.py +974 -0
- skfolio/optimization/convex/_risk_budgeting.py +560 -0
- skfolio/optimization/ensemble/__init__.py +6 -0
- skfolio/optimization/ensemble/_base.py +87 -0
- skfolio/optimization/ensemble/_stacking.py +326 -0
- skfolio/optimization/naive/__init__.py +3 -0
- skfolio/optimization/naive/_naive.py +173 -0
- skfolio/population/__init__.py +3 -0
- skfolio/population/_population.py +883 -0
- skfolio/portfolio/__init__.py +13 -0
- skfolio/portfolio/_base.py +1096 -0
- skfolio/portfolio/_multi_period_portfolio.py +610 -0
- skfolio/portfolio/_portfolio.py +842 -0
- skfolio/pre_selection/__init__.py +7 -0
- skfolio/pre_selection/_pre_selection.py +342 -0
- skfolio/preprocessing/__init__.py +3 -0
- skfolio/preprocessing/_returns.py +114 -0
- skfolio/prior/__init__.py +18 -0
- skfolio/prior/_base.py +63 -0
- skfolio/prior/_black_litterman.py +238 -0
- skfolio/prior/_empirical.py +163 -0
- skfolio/prior/_factor_model.py +268 -0
- skfolio/typing.py +50 -0
- skfolio/uncertainty_set/__init__.py +23 -0
- skfolio/uncertainty_set/_base.py +108 -0
- skfolio/uncertainty_set/_bootstrap.py +281 -0
- skfolio/uncertainty_set/_empirical.py +237 -0
- skfolio/utils/__init__.py +0 -0
- skfolio/utils/bootstrap.py +115 -0
- skfolio/utils/equations.py +350 -0
- skfolio/utils/sorting.py +117 -0
- skfolio/utils/stats.py +466 -0
- skfolio/utils/tools.py +567 -0
- skfolio-0.0.1.dist-info/LICENSE +29 -0
- skfolio-0.0.1.dist-info/METADATA +568 -0
- skfolio-0.0.1.dist-info/RECORD +79 -0
- skfolio-0.0.1.dist-info/WHEEL +5 -0
- skfolio-0.0.1.dist-info/top_level.txt +1 -0
skfolio/model_selection/_validation.py
@@ -0,0 +1,194 @@
"""Model validation module."""

# Author: Hugo Delatte <delatte.hugo@gmail.com>
# License: BSD 3 clause

import numpy as np
import numpy.typing as npt
import sklearn as sk
import sklearn.base as skb
import sklearn.model_selection as skm
import sklearn.utils as sku
import sklearn.utils.parallel as skp

from skfolio.model_selection._combinatorial import BaseCombinatorialCV
from skfolio.population import Population
from skfolio.portfolio import MultiPeriodPortfolio
from skfolio.utils.tools import fit_and_predict, safe_split


def cross_val_predict(
    estimator: skb.BaseEstimator,
    X: npt.ArrayLike,
    y: npt.ArrayLike | None = None,
    groups: np.ndarray | None = None,
    cv: skm.BaseCrossValidator | BaseCombinatorialCV | int | None = None,
    n_jobs: int | None = None,
    method: str = "predict",
    verbose: int = 0,
    fit_params: dict | None = None,
    pre_dispatch: str = "2*n_jobs",
    column_indices: np.ndarray | None = None,
    portfolio_params: dict | None = None,
) -> MultiPeriodPortfolio | Population:
    """Generate cross-validated `Portfolios` estimates.

    The data is split according to the `cv` parameter.
    The optimization estimator is fitted on the training set and portfolios are
    predicted on the corresponding test set.

    For non-combinatorial cross-validation like `KFold`, the output is the predicted
    :class:`~skfolio.portfolio.MultiPeriodPortfolio` where
    each :class:`~skfolio.portfolio.Portfolio` corresponds to the prediction on each
    train/test pair (`k` portfolios for `KFold`).

    For combinatorial cross-validation
    like :class:`~skfolio.model_selection.CombinatorialPurgedCV`, the output is the
    predicted :class:`~skfolio.population.Population` of multiple
    :class:`~skfolio.portfolio.MultiPeriodPortfolio` (each test output is a
    collection of multiple paths instead of a single path).

    Parameters
    ----------
    estimator : BaseOptimization
        :ref:`Optimization estimator <optimization>` used to fit the data.

    X : array-like of shape (n_observations, n_assets)
        Price returns of the assets.

    y : array-like of shape (n_observations, n_targets), optional
        Target data (optional).
        For example, the price returns of the factors.

    groups : array-like of shape (n_observations,), optional
        Group labels for the samples used while splitting the dataset into
        train/test set. Only used in conjunction with a "Group" `cv`
        instance (e.g., `GroupKFold`).

    cv : int | cross-validation generator, optional
        Determines the cross-validation splitting strategy.
        Possible inputs for cv are:

        * None, to use the default 5-fold cross validation,
        * int, to specify the number of folds in a `(Stratified)KFold`,
        * `CV splitter`,
        * An iterable that generates (train, test) splits as arrays of indices.

    n_jobs : int, optional
        The number of jobs to run in parallel for `fit` of all `estimators`.
        `None` means 1 unless in a `joblib.parallel_backend` context. -1 means
        using all processors.

    method : str
        Invokes the passed method name of the passed estimator.

    verbose : int, default=0
        The verbosity level.

    fit_params : dict, optional
        Parameters to pass to the fit method of the estimator.

    pre_dispatch : int or str, default='2*n_jobs'
        Controls the number of jobs that get dispatched during parallel
        execution. Reducing this number can be useful to avoid an
        explosion of memory consumption when more jobs get dispatched
        than CPUs can process. This parameter can be:

        * None, in which case all the jobs are immediately
          created and spawned. Use this for lightweight and
          fast-running jobs, to avoid delays due to on-demand
          spawning of the jobs

        * An int, giving the exact number of total jobs that are
          spawned

        * A str, giving an expression as a function of n_jobs,
          as in '2*n_jobs'

    column_indices : ndarray, optional
        Indices of the `X` columns to cross-validate on.

    portfolio_params : dict, optional
        Additional portfolio parameters passed to `MultiPeriodPortfolio`.

    Returns
    -------
    predictions : MultiPeriodPortfolio | Population
        The result of calling `predict` on each train/test pair: a single
        `MultiPeriodPortfolio` for non-combinatorial CV, or a `Population` of
        `MultiPeriodPortfolio` (one per path) for combinatorial CV.
    """
    X, y = safe_split(X, y, indices=column_indices, axis=1)
    X, y, groups = sku.indexable(X, y, groups)
    cv = skm.check_cv(cv, y)
    splits = list(cv.split(X, y, groups))
    portfolio_params = {} if portfolio_params is None else portfolio_params.copy()

    # We ensure that the folds are not shuffled
    if not isinstance(cv, BaseCombinatorialCV):
        try:
            if cv.shuffle:
                raise ValueError(
                    "`cross_val_predict` only works with cross-validation setting"
                    " `shuffle=False`"
                )
        except AttributeError:
            # If we cannot find the attribute shuffle, we check if the first folds
            # are shuffled
            for fold in splits[0]:
                if not np.all(np.diff(fold) > 0):
                    raise ValueError(
                        "`cross_val_predict` only works with un-shuffled folds"
                    ) from None

    # We clone the estimator to make sure that all the folds are independent
    # and that it is pickle-able.
    parallel = skp.Parallel(n_jobs=n_jobs, verbose=verbose, pre_dispatch=pre_dispatch)
    # TODO remove when https://github.com/joblib/joblib/issues/1071 is fixed
    predictions = parallel(
        skp.delayed(fit_and_predict)(
            sk.clone(estimator),
            X,
            y,
            train=train,
            test=test,
            fit_params=fit_params,
            method=method,
        )
        for train, test in splits
    )

    if isinstance(cv, BaseCombinatorialCV):
        path_ids = cv.get_path_ids()
        path_nb = np.max(path_ids) + 1
        portfolios = [[] for _ in range(path_nb)]
        for i, prediction in enumerate(predictions):
            for j, p in enumerate(prediction):
                path_id = path_ids[i, j]
                portfolios[path_id].append(p)
        name = portfolio_params.pop("name", "path")
        pred = Population(
            [
                MultiPeriodPortfolio(
                    name=f"{name}_{i}", portfolios=portfolios[i], **portfolio_params
                )
                for i in range(path_nb)
            ]
        )
    else:
        # We need to re-order the test folds in case they were un-ordered by the
        # CV generator.
        # Because the test folds are not shuffled, we use the first index of each
        # fold to order them.
        test_indices = np.concatenate([test for _, test in splits])
        if np.unique(test_indices, axis=0).shape[0] != test_indices.shape[0]:
            raise ValueError(
                "`cross_val_predict` only works with non-duplicated test indices"
            )
        test_indices = [test for _, test in splits]
        sorted_fold_id = np.argsort([x[0] for x in test_indices])
        pred = MultiPeriodPortfolio(
            portfolios=[predictions[fold_id] for fold_id in sorted_fold_id],
            check_observations_order=False,
            **portfolio_params,
        )

    return pred
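A minimal usage sketch of `cross_val_predict` with the `WalkForward` splitter from the next file (synthetic returns, so no dataset helpers are needed). `InverseVolatility` is assumed to be one of the naive estimators in `skfolio/optimization/naive/_naive.py` listed above, and the `portfolios` accessor on `MultiPeriodPortfolio` is likewise assumed; neither is shown in this diff.

import numpy as np

from skfolio.model_selection import WalkForward, cross_val_predict
from skfolio.optimization import InverseVolatility  # assumed export, see file list

rng = np.random.default_rng(0)
X = rng.normal(0.0005, 0.01, size=(500, 10))  # 500 observations of 10 asset returns

# Refit on the trailing 252 observations, then predict the next 60.
pred = cross_val_predict(
    InverseVolatility(),
    X,
    cv=WalkForward(train_size=252, test_size=60),
)
# With a non-combinatorial CV, `pred` is a single MultiPeriodPortfolio:
# one Portfolio per test fold, stitched into one out-of-sample path.
print(len(pred.portfolios))  # (500 - 252) // 60 = 4 folds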
skfolio/model_selection/_walk_forward.py
@@ -0,0 +1,221 @@
"""Walk Forward cross-validator."""

# Author: Hugo Delatte <delatte.hugo@gmail.com>
# License: BSD 3 clause

from collections.abc import Iterator

import numpy as np
import numpy.typing as npt
import sklearn.model_selection as skm
import sklearn.utils as sku


class WalkForward(skm.BaseCrossValidator):
    """Walk Forward cross-validator.

    Provides train/test indices to split time series data samples in a walk forward
    logic.

    In each split, test indices must be higher than in the previous one; shuffling
    in a cross validator is therefore inappropriate.

    Compared to `sklearn.model_selection.TimeSeriesSplit`, you control the train/test
    folds by providing a number of training and test samples instead of a number of
    splits, making it more suitable for portfolio cross-validation.

    Parameters
    ----------
    test_size : int
        Number of observations in each test set.

    train_size : int
        Number of observations in each training set.

    expend_train : bool, default=False
        If this is set to True, each subsequent training set after the first one will
        use all past observations.
        The default is `False`.

    reduce_test : bool, default=False
        If this is set to True, the last train/test split will be returned even if the
        test set is partial (if it contains fewer observations than `test_size`),
        otherwise it will be ignored.
        The default is `False`.

    purged_size : int, default=0
        Number of observations to exclude from the end of each train set before the
        test set.
        The default value is `0`.

    Examples
    --------
    >>> import numpy as np
    >>> from skfolio.model_selection import WalkForward
    >>> X = np.random.randn(6, 2)
    >>> cv = WalkForward(test_size=1, train_size=2)
    >>> for i, (train_index, test_index) in enumerate(cv.split(X)):
    ...     print(f"Fold {i}:")
    ...     print(f"  Train: index={train_index}")
    ...     print(f"  Test:  index={test_index}")
    Fold 0:
      Train: index=[0 1]
      Test:  index=[2]
    Fold 1:
      Train: index=[1 2]
      Test:  index=[3]
    Fold 2:
      Train: index=[2 3]
      Test:  index=[4]
    Fold 3:
      Train: index=[3 4]
      Test:  index=[5]
    >>> cv = WalkForward(test_size=1, train_size=2, purged_size=1)
    >>> for i, (train_index, test_index) in enumerate(cv.split(X)):
    ...     print(f"Fold {i}:")
    ...     print(f"  Train: index={train_index}")
    ...     print(f"  Test:  index={test_index}")
    Fold 0:
      Train: index=[0 1]
      Test:  index=[3]
    Fold 1:
      Train: index=[1 2]
      Test:  index=[4]
    Fold 2:
      Train: index=[2 3]
      Test:  index=[5]
    >>> cv = WalkForward(test_size=2, train_size=3)
    >>> for i, (train_index, test_index) in enumerate(cv.split(X)):
    ...     print(f"Fold {i}:")
    ...     print(f"  Train: index={train_index}")
    ...     print(f"  Test:  index={test_index}")
    Fold 0:
      Train: index=[0 1 2]
      Test:  index=[3 4]
    >>> cv = WalkForward(test_size=2, train_size=3, reduce_test=True)
    >>> for i, (train_index, test_index) in enumerate(cv.split(X)):
    ...     print(f"Fold {i}:")
    ...     print(f"  Train: index={train_index}")
    ...     print(f"  Test:  index={test_index}")
    Fold 0:
      Train: index=[0 1 2]
      Test:  index=[3 4]
    Fold 1:
      Train: index=[2 3 4]
      Test:  index=[5]
    >>> cv = WalkForward(test_size=2, train_size=3, expend_train=True, reduce_test=True)
    >>> for i, (train_index, test_index) in enumerate(cv.split(X)):
    ...     print(f"Fold {i}:")
    ...     print(f"  Train: index={train_index}")
    ...     print(f"  Test:  index={test_index}")
    Fold 0:
      Train: index=[0 1 2]
      Test:  index=[3 4]
    Fold 1:
      Train: index=[0 1 2 3 4]
      Test:  index=[5]
    """

    def __init__(
        self,
        test_size: int,
        train_size: int,
        expend_train: bool = False,
        reduce_test: bool = False,
        purged_size: int = 0,
    ):
        self.test_size = test_size
        self.train_size = train_size
        self.expend_train = expend_train
        self.reduce_test = reduce_test
        self.purged_size = purged_size

    def split(
        self, X: npt.ArrayLike, y=None, groups=None
    ) -> Iterator[tuple[np.ndarray, np.ndarray]]:
        """Generate indices to split data into training and test set.

        Parameters
        ----------
        X : array-like of shape (n_observations, n_assets)
            Price returns of the assets.

        y : array-like of shape (n_observations, n_targets)
            Always ignored, exists for compatibility.

        groups : array-like of shape (n_observations,)
            Always ignored, exists for compatibility.

        Yields
        ------
        train : ndarray
            The training set indices for that split.

        test : ndarray
            The testing set indices for that split.
        """
        X, y = sku.indexable(X, y)
        n_samples = X.shape[0]
        # Make sure we have enough samples for the given split parameters
        if self.train_size + self.purged_size >= n_samples:
            raise ValueError(
                "The sum of `train_size` with `purged_size` "
                f"({self.train_size + self.purged_size}) must be smaller than the"
                f" number of samples ({n_samples})."
            )

        indices = np.arange(n_samples)

        test_start = self.train_size + self.purged_size
        while True:
            if test_start >= n_samples:
                return
            test_end = test_start + self.test_size
            train_end = test_start - self.purged_size
            if self.expend_train:
                train_start = 0
            else:
                train_start = train_end - self.train_size

            if test_end > n_samples:
                if not self.reduce_test:
                    return
                yield (
                    indices[train_start:train_end],
                    indices[test_start:],
                )
            else:
                yield (
                    indices[train_start:train_end],
                    indices[test_start:test_end],
                )
            test_start = test_end

    def get_n_splits(self, X: npt.ArrayLike, y=None, groups=None) -> int:
        """Return the number of splitting iterations in the cross-validator.

        Parameters
        ----------
        X : array-like of shape (n_observations, n_assets)
            Price returns of the assets.

        y : array-like of shape (n_observations, n_targets)
            Always ignored, exists for compatibility.

        groups : array-like of shape (n_observations,)
            Always ignored, exists for compatibility.

        Returns
        -------
        n_folds : int
            Returns the number of splitting iterations in the cross-validator.
        """
        if X is None:
            raise ValueError("The 'X' parameter should not be None.")
        X, y = sku.indexable(X, y)
        n_samples = X.shape[0]
        n = n_samples - self.train_size - self.purged_size

        if self.reduce_test and n % self.test_size != 0:
            return n // self.test_size + 1
        return n // self.test_size
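The fold count returned by `get_n_splits` mirrors the loop in `split`: with `n = n_samples - train_size - purged_size`, it is `n // test_size`, plus one partial fold when `reduce_test=True` and `test_size` does not divide `n` evenly. A small self-check of that arithmetic, using only the API defined above:

import numpy as np

from skfolio.model_selection import WalkForward

X = np.zeros((10, 2))  # 10 observations, 2 assets

for cv in (
    WalkForward(train_size=4, test_size=3),                    # floor((10-4)/3) = 2 folds
    WalkForward(train_size=4, test_size=4, reduce_test=True),  # 1 full fold + 1 partial
    WalkForward(train_size=4, test_size=3, purged_size=1),     # floor((10-4-1)/3) = 1 fold
):
    assert cv.get_n_splits(X) == len(list(cv.split(X)))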
skfolio/moments/__init__.py
@@ -0,0 +1,41 @@
"""Moments module."""

from skfolio.moments.covariance import (
    OAS,
    BaseCovariance,
    DenoiseCovariance,
    DenoteCovariance,
    EWCovariance,
    EmpiricalCovariance,
    GerberCovariance,
    GraphicalLassoCV,
    LedoitWolf,
    ShrunkCovariance,
)
from skfolio.moments.expected_returns import (
    BaseMu,
    EWMu,
    EmpiricalMu,
    EquilibriumMu,
    ShrunkMu,
    ShrunkMuMethods,
)

__all__ = [
    "BaseMu",
    "EmpiricalMu",
    "EWMu",
    "ShrunkMu",
    "EquilibriumMu",
    "ShrunkMuMethods",
    "BaseCovariance",
    "EmpiricalCovariance",
    "EWCovariance",
    "GerberCovariance",
    "DenoiseCovariance",
    "DenoteCovariance",
    "LedoitWolf",
    "OAS",
    "ShrunkCovariance",
    "GraphicalLassoCV",
]
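These exports follow the scikit-learn fit/attribute pattern. A hedged usage sketch: the `covariance_` attribute is documented in `BaseCovariance` below, while the `mu_` attribute on the expected-return estimators and `fit` returning `self` are assumed by analogy with scikit-learn conventions and are not shown in this diff.

import numpy as np

from skfolio.moments import EmpiricalCovariance, EmpiricalMu, LedoitWolf

rng = np.random.default_rng(42)
X = rng.normal(0.0005, 0.01, size=(1000, 5))  # asset returns

mu = EmpiricalMu().fit(X).mu_                   # assumed attribute, shape (5,)
cov = EmpiricalCovariance().fit(X).covariance_  # sample covariance, shape (5, 5)
cov_lw = LedoitWolf().fit(X).covariance_        # shrunk covariance, shape (5, 5)

assert cov.shape == cov_lw.shape == (5, 5)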
skfolio/moments/covariance/__init__.py
@@ -0,0 +1,29 @@
"""Covariance module."""

from skfolio.moments.covariance._base import (
    BaseCovariance,
)
from skfolio.moments.covariance._covariance import (
    OAS,
    DenoiseCovariance,
    DenoteCovariance,
    EWCovariance,
    EmpiricalCovariance,
    GerberCovariance,
    GraphicalLassoCV,
    LedoitWolf,
    ShrunkCovariance,
)

__all__ = [
    "BaseCovariance",
    "EmpiricalCovariance",
    "EWCovariance",
    "GerberCovariance",
    "DenoiseCovariance",
    "DenoteCovariance",
    "LedoitWolf",
    "OAS",
    "ShrunkCovariance",
    "GraphicalLassoCV",
]
skfolio/moments/covariance/_base.py
@@ -0,0 +1,101 @@
"""Base Covariance Estimators."""

from abc import ABC, abstractmethod

import numpy as np
import numpy.typing as npt
import sklearn.base as skb

from skfolio.exceptions import NonPositiveVarianceError
from skfolio.utils.stats import cov_nearest


class BaseCovariance(skb.BaseEstimator, ABC):
    """Base class for all covariance estimators in `skfolio`.

    Parameters
    ----------
    nearest : bool, default=False
        If this is set to True, the covariance is replaced by the nearest covariance
        matrix that is positive definite and for which a Cholesky decomposition can
        be computed. The variance is left unchanged. A covariance matrix is in
        theory PSD. However, due to floating-point inaccuracies, we can end up with
        a covariance matrix that is slightly non-PSD or where the Cholesky
        decomposition fails. This often occurs in high-dimensional problems.
        For more details, see :func:`~skfolio.utils.stats.cov_nearest`.
        The default is `False`.

    higham : bool, default=False
        If this is set to True, the Higham (2002) algorithm is used to find the
        nearest PSD covariance, otherwise the eigenvalues are clipped to a threshold
        above zero (1e-13). The default is `False` (clipping), as the Higham
        algorithm can be slow for large datasets.

    higham_max_iteration : int, default=100
        Maximum number of iterations of the Higham (2002) algorithm.
        The default value is `100`.

    Attributes
    ----------
    covariance_ : ndarray of shape (n_assets, n_assets)
        Estimated covariance matrix.

    Notes
    -----
    All estimators should specify all the parameters that can be set
    at the class level in their ``__init__`` as explicit keyword
    arguments (no ``*args`` or ``**kwargs``).
    """

    covariance_: np.ndarray

    @abstractmethod
    def __init__(
        self,
        nearest: bool = False,
        higham: bool = False,
        higham_max_iteration: int = 100,
    ):
        self.nearest = nearest
        self.higham = higham
        self.higham_max_iteration = higham_max_iteration

    @abstractmethod
    def fit(self, X: npt.ArrayLike, y=None):
        pass

    def _sanity_check(self, covariance: np.ndarray) -> None:
        """Perform a sanity check on the covariance matrix by verifying that all
        diagonal elements are strictly positive.
        The goal is to detect corrupted asset data (with zero variance) early, as it
        would lead to optimization errors.
        """
        cond = np.diag(covariance) < 1e-15
        if np.any(cond):
            corrupted_assets = list(np.argwhere(cond).flatten())
            detail = "assets indices"
            if hasattr(self, "feature_names_in_"):
                corrupted_assets = list(self.feature_names_in_[corrupted_assets])
                detail = "assets"
            raise NonPositiveVarianceError(
                f"The following {detail} have a non positive variance:"
                f" {corrupted_assets}"
            )

    def _set_covariance(self, covariance: np.ndarray) -> None:
        """Perform checks, convert to the nearest PSD matrix if specified, and save
        the covariance.

        Parameters
        ----------
        covariance : array-like of shape (n_assets, n_assets)
            Estimated covariance matrix to be stored.
        """
        self._sanity_check(covariance)
        if self.nearest:
            covariance = cov_nearest(
                covariance,
                higham=self.higham,
                higham_max_iteration=self.higham_max_iteration,
            )
        # set covariance
        self.covariance_ = covariance
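For intuition, here is a minimal numpy sketch of the eigenvalue-clipping repair described for the `higham=False` path: eigenvalues are floored at a small positive threshold (1e-13) so that a slightly non-PSD covariance becomes positive definite. This is not skfolio's actual `cov_nearest` (skfolio/utils/stats.py), which per the docstring also leaves the variances unchanged; it only illustrates the clipping step.

import numpy as np

def clip_to_psd(cov: np.ndarray, threshold: float = 1e-13) -> np.ndarray:
    eigvals, eigvecs = np.linalg.eigh(cov)       # symmetric eigendecomposition
    eigvals = np.clip(eigvals, threshold, None)  # floor eigenvalues at the threshold
    fixed = eigvecs @ np.diag(eigvals) @ eigvecs.T
    return (fixed + fixed.T) / 2                 # re-symmetrize floating-point noise

# A 2x2 "covariance" whose floating-point error yields a tiny negative eigenvalue:
c = np.array([[1.0, 1.0 + 1e-9], [1.0 + 1e-9, 1.0]])
print(np.linalg.eigvalsh(c))               # one eigenvalue is negative
print(np.linalg.eigvalsh(clip_to_psd(c)))  # all eigenvalues >= 1e-13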