skfolio-0.0.1-py3-none-any.whl
This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their public registries.
- skfolio/__init__.py +29 -0
- skfolio/cluster/__init__.py +8 -0
- skfolio/cluster/_hierarchical.py +387 -0
- skfolio/datasets/__init__.py +20 -0
- skfolio/datasets/_base.py +389 -0
- skfolio/datasets/data/__init__.py +0 -0
- skfolio/datasets/data/factors_dataset.csv.gz +0 -0
- skfolio/datasets/data/sp500_dataset.csv.gz +0 -0
- skfolio/datasets/data/sp500_index.csv.gz +0 -0
- skfolio/distance/__init__.py +26 -0
- skfolio/distance/_base.py +55 -0
- skfolio/distance/_distance.py +574 -0
- skfolio/exceptions.py +30 -0
- skfolio/measures/__init__.py +76 -0
- skfolio/measures/_enums.py +355 -0
- skfolio/measures/_measures.py +607 -0
- skfolio/metrics/__init__.py +3 -0
- skfolio/metrics/_scorer.py +121 -0
- skfolio/model_selection/__init__.py +18 -0
- skfolio/model_selection/_combinatorial.py +407 -0
- skfolio/model_selection/_validation.py +194 -0
- skfolio/model_selection/_walk_forward.py +221 -0
- skfolio/moments/__init__.py +41 -0
- skfolio/moments/covariance/__init__.py +29 -0
- skfolio/moments/covariance/_base.py +101 -0
- skfolio/moments/covariance/_covariance.py +1108 -0
- skfolio/moments/expected_returns/__init__.py +21 -0
- skfolio/moments/expected_returns/_base.py +31 -0
- skfolio/moments/expected_returns/_expected_returns.py +415 -0
- skfolio/optimization/__init__.py +36 -0
- skfolio/optimization/_base.py +147 -0
- skfolio/optimization/cluster/__init__.py +13 -0
- skfolio/optimization/cluster/_nco.py +348 -0
- skfolio/optimization/cluster/hierarchical/__init__.py +13 -0
- skfolio/optimization/cluster/hierarchical/_base.py +440 -0
- skfolio/optimization/cluster/hierarchical/_herc.py +406 -0
- skfolio/optimization/cluster/hierarchical/_hrp.py +368 -0
- skfolio/optimization/convex/__init__.py +16 -0
- skfolio/optimization/convex/_base.py +1944 -0
- skfolio/optimization/convex/_distributionally_robust.py +392 -0
- skfolio/optimization/convex/_maximum_diversification.py +417 -0
- skfolio/optimization/convex/_mean_risk.py +974 -0
- skfolio/optimization/convex/_risk_budgeting.py +560 -0
- skfolio/optimization/ensemble/__init__.py +6 -0
- skfolio/optimization/ensemble/_base.py +87 -0
- skfolio/optimization/ensemble/_stacking.py +326 -0
- skfolio/optimization/naive/__init__.py +3 -0
- skfolio/optimization/naive/_naive.py +173 -0
- skfolio/population/__init__.py +3 -0
- skfolio/population/_population.py +883 -0
- skfolio/portfolio/__init__.py +13 -0
- skfolio/portfolio/_base.py +1096 -0
- skfolio/portfolio/_multi_period_portfolio.py +610 -0
- skfolio/portfolio/_portfolio.py +842 -0
- skfolio/pre_selection/__init__.py +7 -0
- skfolio/pre_selection/_pre_selection.py +342 -0
- skfolio/preprocessing/__init__.py +3 -0
- skfolio/preprocessing/_returns.py +114 -0
- skfolio/prior/__init__.py +18 -0
- skfolio/prior/_base.py +63 -0
- skfolio/prior/_black_litterman.py +238 -0
- skfolio/prior/_empirical.py +163 -0
- skfolio/prior/_factor_model.py +268 -0
- skfolio/typing.py +50 -0
- skfolio/uncertainty_set/__init__.py +23 -0
- skfolio/uncertainty_set/_base.py +108 -0
- skfolio/uncertainty_set/_bootstrap.py +281 -0
- skfolio/uncertainty_set/_empirical.py +237 -0
- skfolio/utils/__init__.py +0 -0
- skfolio/utils/bootstrap.py +115 -0
- skfolio/utils/equations.py +350 -0
- skfolio/utils/sorting.py +117 -0
- skfolio/utils/stats.py +466 -0
- skfolio/utils/tools.py +567 -0
- skfolio-0.0.1.dist-info/LICENSE +29 -0
- skfolio-0.0.1.dist-info/METADATA +568 -0
- skfolio-0.0.1.dist-info/RECORD +79 -0
- skfolio-0.0.1.dist-info/WHEEL +5 -0
- skfolio-0.0.1.dist-info/top_level.txt +1 -0
skfolio/optimization/cluster/_nco.py
@@ -0,0 +1,348 @@
"""Nested Clusters Optimization estimator."""

# Author: Hugo Delatte <delatte.hugo@gmail.com>
# License: BSD 3 clause

from copy import deepcopy

import numpy as np
import numpy.typing as npt
import pandas as pd
import sklearn as sk
import sklearn.base as skb
import sklearn.model_selection as skm
import sklearn.utils.parallel as skp

import skfolio.typing as skt
from skfolio.cluster import HierarchicalClustering
from skfolio.distance import BaseDistance, PearsonDistance
from skfolio.measures import RatioMeasure
from skfolio.model_selection import BaseCombinatorialCV, cross_val_predict
from skfolio.optimization._base import BaseOptimization
from skfolio.optimization.convex import MeanRisk
from skfolio.utils.tools import check_estimator, fit_single_estimator


class NestedClustersOptimization(BaseOptimization):
    """Nested Clusters Optimization estimator.

    Nested Clusters Optimization (NCO) is a portfolio optimization method developed by
    Marcos Lopez de Prado.

    It uses a distance matrix to compute clusters using a clustering algorithm
    (Hierarchical Tree Clustering, KMeans, etc.). For each cluster, the inner-cluster
    weights are computed by fitting the inner-estimator on each cluster using the whole
    training data. Then the outer-cluster weights are computed by training the
    outer-estimator using out-of-sample estimates of the inner-estimators with
    cross-validation. Finally, the final asset weights are the dot-product of the
    inner-weights and outer-weights.

    .. note ::

        The original paper uses KMeans as the clustering algorithm, minimum Variance for
        the inner-estimator and equal-weight for the outer-estimator. Here we generalize
        it to all `sklearn` and `skfolio` clustering algorithms (HierarchicalClustering,
        KMeans, ...), all optimization estimators (Mean-Variance, HRP, ...)
        and risk measures (Variance, CVaR, ...).
        To avoid data leakage at the outer-estimator, we use out-of-sample estimates to
        fit the outer optimization.

    Parameters
    ----------
    inner_estimator : BaseOptimization, optional
        :ref:`Optimization estimator <optimization>` used to estimate the inner-weights
        (also called intra-weights), which are the asset weights inside each cluster.
        The default (`None`) is to use :class:`~skfolio.optimization.MeanRisk`.

    outer_estimator : BaseOptimization, optional
        :ref:`Optimization estimator <optimization>` used to estimate the outer-weights
        (also called inter-weights), which are the weights applied to each cluster.
        The default (`None`) is to use :class:`~skfolio.optimization.MeanRisk`.

    distance_estimator : BaseDistance, optional
        :ref:`Distance estimator <distance>`.
        The distance estimator is used to estimate the codependence and the distance
        matrix needed for the computation of the linkage matrix.
        The default (`None`) is to use :class:`~skfolio.distance.PearsonDistance`.

    clustering_estimator : BaseEstimator, optional
        Clustering estimator. Must expose a `labels_` attribute after fitting.
        The clustering estimator is used to compute the clusters of the assets based on
        the distance matrix. The default (`None`) is to use
        :class:`~skfolio.cluster.HierarchicalClustering`.

        .. note ::

            Clustering estimators from `sklearn` are also supported. For example:
            `sklearn.cluster.KMeans`.

    cv : BaseCrossValidator | BaseCombinatorialCV | int | "ignore", optional
        Determines the cross-validation splitting strategy.
        The default (`None`) is to use the 5-fold cross-validation `KFold()`.
        It is applied to the inner-estimators. Their out-of-sample outputs are used to
        train the outer-estimator.
        Possible inputs for `cv` are:

        * "ignore": no cross-validation is used (note that this will likely lead to data leakage with a high risk of overfitting)
        * Integer, to specify the number of folds in a :class:`sklearn.model_selection.KFold`
        * An object to be used as a cross-validation generator
        * An iterable yielding train, test splits
        * A :class:`~skfolio.model_selection.CombinatorialPurgedCV`

        If a `CombinatorialCV` cross-validator is used, each cluster's out-of-sample
        outputs become a collection of multiple paths instead of one single path. The
        selected out-of-sample path among this collection of paths is chosen according
        to the `quantile` and `quantile_measure` parameters.

    n_jobs : int, optional
        The number of jobs to run in parallel for `fit` of all `estimators`.
        The value `-1` means using all processors.
        The default (`None`) means 1 unless in a `joblib.parallel_backend` context.

    quantile : float, default=0.5
        Quantile for a given measure (`quantile_measure`) of the out-of-sample
        inner-estimator paths when the `cv` parameter is a
        :class:`~skfolio.model_selection.CombinatorialPurgedCV` cross-validator.
        The default value is `0.5`, corresponding to the path with the median measure
        (see `cv`).

    quantile_measure : PerfMeasure or RatioMeasure or RiskMeasure or ExtraRiskMeasure, default=RatioMeasure.SHARPE_RATIO
        Measure used for the quantile path selection (see `quantile` and `cv`).
        The default is `RatioMeasure.SHARPE_RATIO`.

    verbose : int, default=0
        The verbosity level. The default value is `0`.

    portfolio_params : dict, optional
        Portfolio parameters passed to the portfolio evaluated by the `predict` and
        `score` methods. If not provided, the `name` is copied from the optimization
        model and systematically passed to the portfolio.

    Attributes
    ----------
    weights_ : ndarray of shape (n_assets,)
        Weights of the assets.

    distance_estimator_ : BaseDistance
        Fitted `distance_estimator`.

    inner_estimators_ : list[BaseOptimization]
        List of fitted `inner_estimator`. One per cluster for clusters containing more
        than one asset.

    outer_estimator_ : BaseOptimization
        Fitted `outer_estimator`.

    clustering_estimator_ : BaseEstimator
        Fitted `clustering_estimator`.

    n_features_in_ : int
        Number of assets seen during `fit`.

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of assets seen during `fit`. Defined only when `X`
        has asset names that are all strings.

    References
    ----------
    .. [1] "Building diversified portfolios that outperform out of sample",
        The Journal of Portfolio Management,
        Marcos López de Prado (2016)

    .. [2] "A robust estimator of the efficient frontier",
        SSRN Electronic Journal,
        Marcos López de Prado (2019)

    .. [3] "Machine Learning for Asset Managers",
        Elements in Quantitative Finance. Cambridge University Press,
        Marcos López de Prado (2020)
    """

    inner_estimators_: list[BaseOptimization]
    outer_estimator_: BaseOptimization
    distance_estimator_: BaseDistance
    clustering_estimator_: skb.BaseEstimator

    def __init__(
        self,
        inner_estimator: BaseOptimization | None = None,
        outer_estimator: BaseOptimization | None = None,
        distance_estimator: BaseDistance | None = None,
        clustering_estimator: skb.BaseEstimator | None = None,
        cv: skm.BaseCrossValidator | BaseCombinatorialCV | str | int | None = None,
        quantile: float = 0.5,
        quantile_measure: skt.Measure = RatioMeasure.SHARPE_RATIO,
        n_jobs: int | None = None,
        verbose: int = 0,
        portfolio_params: dict | None = None,
    ):
        super().__init__(portfolio_params=portfolio_params)
        self.distance_estimator = distance_estimator
        self.clustering_estimator = clustering_estimator
        self.inner_estimator = inner_estimator
        self.outer_estimator = outer_estimator
        self.cv = cv
        self.quantile = quantile
        self.quantile_measure = quantile_measure
        self.n_jobs = n_jobs
        self.verbose = verbose

    def fit(
        self, X: npt.ArrayLike, y: npt.ArrayLike | None = None
    ) -> "NestedClustersOptimization":
        """Fit the Nested Clusters Optimization estimator.

        Parameters
        ----------
        X : array-like of shape (n_observations, n_assets)
            Price returns of the assets.

        y : array-like of shape (n_observations, n_targets), optional
            Price returns of factors or a target benchmark.
            The default is `None`.

        Returns
        -------
        self : NestedClustersOptimization
            Fitted estimator.
        """
        self.distance_estimator_ = check_estimator(
            self.distance_estimator,
            default=PearsonDistance(),
            check_type=BaseDistance,
        )
        self.clustering_estimator_ = check_estimator(
            self.clustering_estimator,
            default=HierarchicalClustering(),
            check_type=skb.BaseEstimator,
        )
        self.outer_estimator_ = check_estimator(
            self.outer_estimator,
            default=MeanRisk(),
            check_type=BaseOptimization,
        )
        _inner_estimator = check_estimator(
            self.inner_estimator,
            default=MeanRisk(),
            check_type=BaseOptimization,
        )

        self.distance_estimator_.fit(X)
        distance = self.distance_estimator_.distance_
        n_assets = distance.shape[0]

        # To keep the asset_names --> used for visualisation
        if isinstance(X, pd.DataFrame):
            distance = pd.DataFrame(distance, columns=X.columns)

        # noinspection PyUnresolvedReferences
        self.clustering_estimator_.fit(distance)
        # noinspection PyUnresolvedReferences
        labels = self.clustering_estimator_.labels_
        n_clusters = max(labels) + 1
        clusters = [np.argwhere(labels == i).flatten() for i in range(n_clusters)]

        # Intra cluster weights
        # Fit the inner estimators on the whole training data. These
        # base estimators will be used to retrieve the inner weights.
        # They are exposed publicly.
        # noinspection PyCallingNonCallable
        fitted_inner_estimators = skp.Parallel(n_jobs=self.n_jobs)(
            skp.delayed(fit_single_estimator)(
                sk.clone(_inner_estimator), X, y, indices=cluster_ids, axis=1
            )
            for cluster_ids in clusters
            if len(cluster_ids) != 1
        )
        fitted_inner_estimators = iter(fitted_inner_estimators)

        self.inner_estimators_ = []
        inner_weights = []
        for cluster_ids in clusters:
            w = np.zeros(n_assets)
            # For single assets, we don't run the inner optimization estimator.
            if len(cluster_ids) == 1:
                w[cluster_ids] = 1
            else:
                fitted_inner_estimator = next(fitted_inner_estimators)
                self.inner_estimators_.append(fitted_inner_estimator)
                w[cluster_ids] = fitted_inner_estimator.weights_
            inner_weights.append(w)
        inner_weights = np.array(inner_weights)
        assert not any(
            fitted_inner_estimators
        ), "fitted_inner_estimator iterator must be empty"

        # Outer cluster weights
        # To train the outer-estimator using as much data as possible, we use
        # cross-validation to obtain the output of the cluster estimators.
        # To ensure that the data provided to each estimator are the same,
        # we need to set the random state of the cv if there is one and we
        # need to take a copy.
        if self.cv == "ignore":
            cv_predictions = None
            test_indices = slice(None)
        else:
            cv = skm.check_cv(self.cv)
            if hasattr(cv, "random_state") and cv.random_state is None:
                cv.random_state = np.random.RandomState()
            # noinspection PyCallingNonCallable
            cv_predictions = skp.Parallel(n_jobs=self.n_jobs)(
                skp.delayed(cross_val_predict)(
                    sk.clone(_inner_estimator),
                    X,
                    y,
                    cv=deepcopy(cv),
                    n_jobs=self.n_jobs,
                    verbose=self.verbose,
                    column_indices=cluster_ids,
                    method="predict",
                )
                for cluster_ids in clusters
                if len(cluster_ids) != 1
            )
            cv_predictions = iter(cv_predictions)
            if isinstance(self.cv, BaseCombinatorialCV):
                test_indices = slice(None)
            else:
                test_indices = np.sort(
                    np.concatenate([test for _, test in cv.split(X, y)])
                )

        # We validate and convert to numpy array only after inner-estimator fitting to
        # keep the asset names in case they are used in the estimator.
        if y is not None:
            X, y = self._validate_data(X, y)
            y_pred = y[test_indices]
        else:
            X = self._validate_data(X)
            y_pred = None

        X_pred = []
        fitted_inner_estimators = iter(self.inner_estimators_)
        for cluster_ids in clusters:
            if len(cluster_ids) == 1:
                pred = X[test_indices, cluster_ids[0]]
            else:
                if cv_predictions is None:
                    fitted_inner_estimator = next(fitted_inner_estimators)
                    pred = fitted_inner_estimator.predict(X[test_indices, cluster_ids])
                else:
                    pred = next(cv_predictions)
                    if isinstance(self.cv, BaseCombinatorialCV):
                        pred = pred.quantile(
                            measure=self.quantile_measure, q=self.quantile
                        )
            X_pred.append(np.asarray(pred))
        X_pred = np.array(X_pred).T
        if cv_predictions is None:
            assert not any(
                fitted_inner_estimators
            ), "fitted_inner_estimator iterator must be empty"
        else:
            assert not any(cv_predictions), "cv_predictions iterator must be empty"

        fit_single_estimator(self.outer_estimator_, X=X_pred, y=y_pred)
        outer_weights = self.outer_estimator_.weights_
        self.weights_ = outer_weights @ inner_weights
        return self
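For orientation, a minimal usage sketch of the estimator added in this file follows. It is not part of the package diff: the synthetic return matrix and the asset names are illustrative assumptions, and the public import path `skfolio.optimization` is assumed to re-export the class defined in `skfolio.optimization.cluster._nco`; the constructor defaults, `fit` signature, and `weights_` attribute are taken from the source above.

# Illustrative sketch (not from the package): fit NCO on synthetic asset returns.
import numpy as np
import pandas as pd

from skfolio.optimization import NestedClustersOptimization  # assumed public re-export

# Hypothetical data: 500 observations of price returns for 6 assets.
rng = np.random.default_rng(0)
X = pd.DataFrame(
    rng.normal(loc=0.0005, scale=0.01, size=(500, 6)),
    columns=["A", "B", "C", "D", "E", "F"],
)

# Defaults per the docstring: PearsonDistance, HierarchicalClustering,
# MeanRisk inner/outer estimators, and 5-fold KFold to produce the
# out-of-sample inputs of the outer-estimator.
model = NestedClustersOptimization()
model.fit(X)
print(model.weights_)  # ndarray of shape (n_assets,): outer-weights @ inner-weights

With the default `cv=None`, the inner-estimators are refit per fold to build the outer-estimator's training inputs, so fitting is noticeably slower than with `cv="ignore"` but avoids the data-leakage risk called out in the docstring.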
skfolio/optimization/cluster/hierarchical/__init__.py
@@ -0,0 +1,13 @@
from skfolio.optimization.cluster.hierarchical._base import (
    BaseHierarchicalOptimization,
)
from skfolio.optimization.cluster.hierarchical._herc import (
    HierarchicalEqualRiskContribution,
)
from skfolio.optimization.cluster.hierarchical._hrp import HierarchicalRiskParity

__all__ = [
    "BaseHierarchicalOptimization",
    "HierarchicalRiskParity",
    "HierarchicalEqualRiskContribution",
]
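The `__all__` above makes the two hierarchical optimizers importable directly from the subpackage. A short sketch, not part of the package diff: only the class names and import path come from this hunk, while the synthetic returns, the default constructors, and the sklearn-style `fit`/`weights_` usage are assumptions based on the NCO source shown earlier.

# Illustrative sketch (not from the package): the names exported above.
import numpy as np

from skfolio.optimization.cluster.hierarchical import (
    HierarchicalEqualRiskContribution,
    HierarchicalRiskParity,
)

# Hypothetical asset returns, shape (n_observations, n_assets).
X = np.random.default_rng(42).normal(0.0005, 0.01, size=(500, 6))

hrp = HierarchicalRiskParity().fit(X)    # assumed default parameters
herc = HierarchicalEqualRiskContribution().fit(X)
print(hrp.weights_, herc.weights_)       # per-asset weights after fitting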