skfolio 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (79)
  1. skfolio/__init__.py +29 -0
  2. skfolio/cluster/__init__.py +8 -0
  3. skfolio/cluster/_hierarchical.py +387 -0
  4. skfolio/datasets/__init__.py +20 -0
  5. skfolio/datasets/_base.py +389 -0
  6. skfolio/datasets/data/__init__.py +0 -0
  7. skfolio/datasets/data/factors_dataset.csv.gz +0 -0
  8. skfolio/datasets/data/sp500_dataset.csv.gz +0 -0
  9. skfolio/datasets/data/sp500_index.csv.gz +0 -0
  10. skfolio/distance/__init__.py +26 -0
  11. skfolio/distance/_base.py +55 -0
  12. skfolio/distance/_distance.py +574 -0
  13. skfolio/exceptions.py +30 -0
  14. skfolio/measures/__init__.py +76 -0
  15. skfolio/measures/_enums.py +355 -0
  16. skfolio/measures/_measures.py +607 -0
  17. skfolio/metrics/__init__.py +3 -0
  18. skfolio/metrics/_scorer.py +121 -0
  19. skfolio/model_selection/__init__.py +18 -0
  20. skfolio/model_selection/_combinatorial.py +407 -0
  21. skfolio/model_selection/_validation.py +194 -0
  22. skfolio/model_selection/_walk_forward.py +221 -0
  23. skfolio/moments/__init__.py +41 -0
  24. skfolio/moments/covariance/__init__.py +29 -0
  25. skfolio/moments/covariance/_base.py +101 -0
  26. skfolio/moments/covariance/_covariance.py +1108 -0
  27. skfolio/moments/expected_returns/__init__.py +21 -0
  28. skfolio/moments/expected_returns/_base.py +31 -0
  29. skfolio/moments/expected_returns/_expected_returns.py +415 -0
  30. skfolio/optimization/__init__.py +36 -0
  31. skfolio/optimization/_base.py +147 -0
  32. skfolio/optimization/cluster/__init__.py +13 -0
  33. skfolio/optimization/cluster/_nco.py +348 -0
  34. skfolio/optimization/cluster/hierarchical/__init__.py +13 -0
  35. skfolio/optimization/cluster/hierarchical/_base.py +440 -0
  36. skfolio/optimization/cluster/hierarchical/_herc.py +406 -0
  37. skfolio/optimization/cluster/hierarchical/_hrp.py +368 -0
  38. skfolio/optimization/convex/__init__.py +16 -0
  39. skfolio/optimization/convex/_base.py +1944 -0
  40. skfolio/optimization/convex/_distributionally_robust.py +392 -0
  41. skfolio/optimization/convex/_maximum_diversification.py +417 -0
  42. skfolio/optimization/convex/_mean_risk.py +974 -0
  43. skfolio/optimization/convex/_risk_budgeting.py +560 -0
  44. skfolio/optimization/ensemble/__init__.py +6 -0
  45. skfolio/optimization/ensemble/_base.py +87 -0
  46. skfolio/optimization/ensemble/_stacking.py +326 -0
  47. skfolio/optimization/naive/__init__.py +3 -0
  48. skfolio/optimization/naive/_naive.py +173 -0
  49. skfolio/population/__init__.py +3 -0
  50. skfolio/population/_population.py +883 -0
  51. skfolio/portfolio/__init__.py +13 -0
  52. skfolio/portfolio/_base.py +1096 -0
  53. skfolio/portfolio/_multi_period_portfolio.py +610 -0
  54. skfolio/portfolio/_portfolio.py +842 -0
  55. skfolio/pre_selection/__init__.py +7 -0
  56. skfolio/pre_selection/_pre_selection.py +342 -0
  57. skfolio/preprocessing/__init__.py +3 -0
  58. skfolio/preprocessing/_returns.py +114 -0
  59. skfolio/prior/__init__.py +18 -0
  60. skfolio/prior/_base.py +63 -0
  61. skfolio/prior/_black_litterman.py +238 -0
  62. skfolio/prior/_empirical.py +163 -0
  63. skfolio/prior/_factor_model.py +268 -0
  64. skfolio/typing.py +50 -0
  65. skfolio/uncertainty_set/__init__.py +23 -0
  66. skfolio/uncertainty_set/_base.py +108 -0
  67. skfolio/uncertainty_set/_bootstrap.py +281 -0
  68. skfolio/uncertainty_set/_empirical.py +237 -0
  69. skfolio/utils/__init__.py +0 -0
  70. skfolio/utils/bootstrap.py +115 -0
  71. skfolio/utils/equations.py +350 -0
  72. skfolio/utils/sorting.py +117 -0
  73. skfolio/utils/stats.py +466 -0
  74. skfolio/utils/tools.py +567 -0
  75. skfolio-0.0.1.dist-info/LICENSE +29 -0
  76. skfolio-0.0.1.dist-info/METADATA +568 -0
  77. skfolio-0.0.1.dist-info/RECORD +79 -0
  78. skfolio-0.0.1.dist-info/WHEEL +5 -0
  79. skfolio-0.0.1.dist-info/top_level.txt +1 -0
skfolio/optimization/cluster/_nco.py
@@ -0,0 +1,348 @@
+ """Nested Clusters Optimization estimator."""
+
+ # Author: Hugo Delatte <delatte.hugo@gmail.com>
+ # License: BSD 3 clause
+
+ from copy import deepcopy
+
+ import numpy as np
+ import numpy.typing as npt
+ import pandas as pd
+ import sklearn as sk
+ import sklearn.base as skb
+ import sklearn.model_selection as skm
+ import sklearn.utils.parallel as skp
+
+ import skfolio.typing as skt
+ from skfolio.cluster import HierarchicalClustering
+ from skfolio.distance import BaseDistance, PearsonDistance
+ from skfolio.measures import RatioMeasure
+ from skfolio.model_selection import BaseCombinatorialCV, cross_val_predict
+ from skfolio.optimization._base import BaseOptimization
+ from skfolio.optimization.convex import MeanRisk
+ from skfolio.utils.tools import check_estimator, fit_single_estimator
+
+
+ class NestedClustersOptimization(BaseOptimization):
+     """Nested Clusters Optimization estimator.
+
+     Nested Clusters Optimization (NCO) is a portfolio optimization method developed by
+     Marcos López de Prado.
+
+     It uses a distance matrix to compute clusters using a clustering algorithm
+     (Hierarchical Tree Clustering, KMeans, etc.). For each cluster, the inner-cluster
+     weights are computed by fitting the inner-estimator on each cluster using the whole
+     training data. Then the outer-cluster weights are computed by training the
+     outer-estimator using out-of-sample estimates of the inner-estimators with
+     cross-validation. Finally, the final asset weights are the dot-product of the
+     inner-weights and outer-weights.
+
+     .. note ::
+
+         The original paper uses KMeans as the clustering algorithm, minimum Variance for
+         the inner-estimator and equal-weight for the outer-estimator. Here we generalize
+         it to all `sklearn` and `skfolio` clustering algorithms (HierarchicalClustering,
+         KMeans, ...), all optimization estimators (Mean-Variance, HRP, ...)
+         and all risk measures (Variance, CVaR, ...).
+         To avoid data leakage at the outer-estimator, we use out-of-sample estimates to
+         fit the outer optimization.
+
+     Parameters
+     ----------
+     inner_estimator : BaseOptimization, optional
+         :ref:`Optimization estimator <optimization>` used to estimate the inner-weights
+         (also called intra-weights), which are the asset weights inside each cluster.
+         The default (`None`) is to use :class:`~skfolio.optimization.MeanRisk`.
+
+     outer_estimator : BaseOptimization, optional
+         :ref:`Optimization estimator <optimization>` used to estimate the outer-weights
+         (also called inter-weights), which are the weights applied to each cluster.
+         The default (`None`) is to use :class:`~skfolio.optimization.MeanRisk`.
+
+     distance_estimator : BaseDistance, optional
+         :ref:`Distance estimator <distance>`.
+         The distance estimator is used to estimate the codependence and the distance
+         matrix needed for the computation of the linkage matrix.
+         The default (`None`) is to use :class:`~skfolio.distance.PearsonDistance`.
+
+     clustering_estimator : BaseEstimator, optional
+         Clustering estimator. Must expose a `labels_` attribute after fitting.
+         The clustering estimator is used to compute the clusters of the assets based on
+         the distance matrix. The default (`None`) is to use
+         :class:`~skfolio.cluster.HierarchicalClustering`.
+
+         .. note ::
+
+             Clustering estimators from `sklearn` are also supported. For example:
+             `sklearn.cluster.KMeans`.
+
+     cv : BaseCrossValidator | BaseCombinatorialCV | int | "ignore", optional
+         Determines the cross-validation splitting strategy.
+         The default (`None`) is to use the 5-fold cross-validation `KFold()`.
+         It is applied to the inner-estimators. Their out-of-sample outputs are used to
+         train the outer-estimator.
+         Possible inputs for `cv` are:
+
+         * "ignore": no cross-validation is used (note that it will likely lead to data leakage with a high risk of overfitting)
+         * Integer, to specify the number of folds in a :class:`sklearn.model_selection.KFold`
+         * An object to be used as a cross-validation generator
+         * An iterable yielding train, test splits
+         * A :class:`~skfolio.model_selection.CombinatorialPurgedCV`
+
+         If a `CombinatorialCV` cross-validator is used, each cluster's out-of-sample
+         outputs become a collection of multiple paths instead of one single path. The
+         selected out-of-sample path among this collection of paths is chosen according
+         to the `quantile` and `quantile_measure` parameters.
+
+     n_jobs : int, optional
+         The number of jobs to run in parallel for `fit` of all `estimators`.
+         The value `-1` means using all processors.
+         The default (`None`) means 1 unless in a `joblib.parallel_backend` context.
+
+     quantile : float, default=0.5
+         Quantile for a given measure (`quantile_measure`) of the out-of-sample
+         inner-estimator paths when the `cv` parameter is a
+         :class:`~skfolio.model_selection.CombinatorialPurgedCV` cross-validator.
+         The default value is `0.5`, corresponding to the path with the median measure
+         (see `cv`).
+
+     quantile_measure : PerfMeasure or RatioMeasure or RiskMeasure or ExtraRiskMeasure, default=RatioMeasure.SHARPE_RATIO
+         Measure used for the quantile path selection (see `quantile` and `cv`).
+         The default is `RatioMeasure.SHARPE_RATIO`.
+
+     verbose : int, default=0
+         The verbosity level. The default value is `0`.
+
+     portfolio_params : dict, optional
+         Portfolio parameters passed to the portfolio evaluated by the `predict` and
+         `score` methods. If not provided, the `name` is copied from the optimization
+         model and systematically passed to the portfolio.
+
+     Attributes
+     ----------
+     weights_ : ndarray of shape (n_assets,)
+         Weights of the assets.
+
+     distance_estimator_ : BaseDistance
+         Fitted `distance_estimator`.
+
+     inner_estimators_ : list[BaseOptimization]
+         List of fitted `inner_estimator`. One per cluster for clusters containing more
+         than one asset.
+
+     outer_estimator_ : BaseOptimization
+         Fitted `outer_estimator`.
+
+     clustering_estimator_ : BaseEstimator
+         Fitted `clustering_estimator`.
+
+     n_features_in_ : int
+         Number of assets seen during `fit`.
+
+     feature_names_in_ : ndarray of shape (`n_features_in_`,)
+         Names of assets seen during `fit`. Defined only when `X`
+         has asset names that are all strings.
+
+     References
+     ----------
+     .. [1] "Building diversified portfolios that outperform out of sample",
+         The Journal of Portfolio Management,
+         Marcos López de Prado (2016)
+
+     .. [2] "A robust estimator of the efficient frontier",
+         SSRN Electronic Journal,
+         Marcos López de Prado (2019)
+
+     .. [3] "Machine Learning for Asset Managers",
+         Elements in Quantitative Finance, Cambridge University Press,
+         Marcos López de Prado (2020)
+     """
+
+     inner_estimators_: list[BaseOptimization]
+     outer_estimator_: BaseOptimization
+     distance_estimator_: BaseDistance
+     clustering_estimator_: skb.BaseEstimator
+
+     def __init__(
+         self,
+         inner_estimator: BaseOptimization | None = None,
+         outer_estimator: BaseOptimization | None = None,
+         distance_estimator: BaseDistance | None = None,
+         clustering_estimator: skb.BaseEstimator | None = None,
+         cv: skm.BaseCrossValidator | BaseCombinatorialCV | str | int | None = None,
+         quantile: float = 0.5,
+         quantile_measure: skt.Measure = RatioMeasure.SHARPE_RATIO,
+         n_jobs: int | None = None,
+         verbose: int = 0,
+         portfolio_params: dict | None = None,
+     ):
+         super().__init__(portfolio_params=portfolio_params)
+         self.distance_estimator = distance_estimator
+         self.clustering_estimator = clustering_estimator
+         self.inner_estimator = inner_estimator
+         self.outer_estimator = outer_estimator
+         self.cv = cv
+         self.quantile = quantile
+         self.quantile_measure = quantile_measure
+         self.n_jobs = n_jobs
+         self.verbose = verbose
+
+     def fit(
+         self, X: npt.ArrayLike, y: npt.ArrayLike | None = None
+     ) -> "NestedClustersOptimization":
+         """Fit the Nested Clusters Optimization estimator.
+
+         Parameters
+         ----------
+         X : array-like of shape (n_observations, n_assets)
+             Price returns of the assets.
+
+         y : array-like of shape (n_observations, n_targets), optional
+             Price returns of factors or a target benchmark.
+             The default is `None`.
+
+         Returns
+         -------
+         self : NestedClustersOptimization
+             Fitted estimator.
+         """
+         self.distance_estimator_ = check_estimator(
+             self.distance_estimator,
+             default=PearsonDistance(),
+             check_type=BaseDistance,
+         )
+         self.clustering_estimator_ = check_estimator(
+             self.clustering_estimator,
+             default=HierarchicalClustering(),
+             check_type=skb.BaseEstimator,
+         )
+         self.outer_estimator_ = check_estimator(
+             self.outer_estimator,
+             default=MeanRisk(),
+             check_type=BaseOptimization,
+         )
+         _inner_estimator = check_estimator(
+             self.inner_estimator,
+             default=MeanRisk(),
+             check_type=BaseOptimization,
+         )
+
+         self.distance_estimator_.fit(X)
+         distance = self.distance_estimator_.distance_
+         n_assets = distance.shape[0]
+
+         # Keep the asset names (used for visualisation).
+         if isinstance(X, pd.DataFrame):
+             distance = pd.DataFrame(distance, columns=X.columns)
+
+         # noinspection PyUnresolvedReferences
+         self.clustering_estimator_.fit(distance)
+         # noinspection PyUnresolvedReferences
+         labels = self.clustering_estimator_.labels_
+         n_clusters = max(labels) + 1
+         clusters = [np.argwhere(labels == i).flatten() for i in range(n_clusters)]
+
+         # Intra-cluster weights
+         # Fit the inner estimator on the whole training data. Those
+         # base estimators will be used to retrieve the inner weights.
+         # They are exposed publicly.
+         # noinspection PyCallingNonCallable
+         fitted_inner_estimators = skp.Parallel(n_jobs=self.n_jobs)(
+             skp.delayed(fit_single_estimator)(
+                 sk.clone(_inner_estimator), X, y, indices=cluster_ids, axis=1
+             )
+             for cluster_ids in clusters
+             if len(cluster_ids) != 1
+         )
+         fitted_inner_estimators = iter(fitted_inner_estimators)
+
+         self.inner_estimators_ = []
+         inner_weights = []
+         for cluster_ids in clusters:
+             w = np.zeros(n_assets)
+             # For single assets, we don't run the inner optimization estimator.
+             if len(cluster_ids) == 1:
+                 w[cluster_ids] = 1
+             else:
+                 fitted_inner_estimator = next(fitted_inner_estimators)
+                 self.inner_estimators_.append(fitted_inner_estimator)
+                 w[cluster_ids] = fitted_inner_estimator.weights_
+             inner_weights.append(w)
+         inner_weights = np.array(inner_weights)
+         assert not any(
+             fitted_inner_estimators
+         ), "fitted_inner_estimator iterator must be empty"
+
+         # Outer-cluster weights
+         # To train the outer-estimator using as much data as possible, we use
+         # cross-validation to obtain the outputs of the cluster estimators.
+         # To ensure that the data provided to each estimator are the same,
+         # we need to set the random state of the cv if there is one, and we
+         # need to take a copy.
+         if self.cv == "ignore":
+             cv_predictions = None
+             test_indices = slice(None)
+         else:
+             cv = skm.check_cv(self.cv)
+             if hasattr(cv, "random_state") and cv.random_state is None:
+                 cv.random_state = np.random.RandomState()
+             # noinspection PyCallingNonCallable
+             cv_predictions = skp.Parallel(n_jobs=self.n_jobs)(
+                 skp.delayed(cross_val_predict)(
+                     sk.clone(_inner_estimator),
+                     X,
+                     y,
+                     cv=deepcopy(cv),
+                     n_jobs=self.n_jobs,
+                     verbose=self.verbose,
+                     column_indices=cluster_ids,
+                     method="predict",
+                 )
+                 for cluster_ids in clusters
+                 if len(cluster_ids) != 1
+             )
+             cv_predictions = iter(cv_predictions)
+             if isinstance(self.cv, BaseCombinatorialCV):
+                 test_indices = slice(None)
+             else:
+                 test_indices = np.sort(
+                     np.concatenate([test for _, test in cv.split(X, y)])
+                 )
+
+         # We validate and convert to numpy array only after inner-estimator fitting to
+         # keep the asset names in case they are used in the estimator.
+         if y is not None:
+             X, y = self._validate_data(X, y)
+             y_pred = y[test_indices]
+         else:
+             X = self._validate_data(X)
+             y_pred = None
+
+         X_pred = []
+         fitted_inner_estimators = iter(self.inner_estimators_)
+         for cluster_ids in clusters:
+             if len(cluster_ids) == 1:
+                 pred = X[test_indices, cluster_ids[0]]
+             else:
+                 if cv_predictions is None:
+                     fitted_inner_estimator = next(fitted_inner_estimators)
+                     pred = fitted_inner_estimator.predict(X[test_indices, cluster_ids])
+                 else:
+                     pred = next(cv_predictions)
+                     if isinstance(self.cv, BaseCombinatorialCV):
+                         pred = pred.quantile(
+                             measure=self.quantile_measure, q=self.quantile
+                         )
+             X_pred.append(np.asarray(pred))
+         X_pred = np.array(X_pred).T
+         if cv_predictions is None:
+             assert not any(
+                 fitted_inner_estimators
+             ), "fitted_inner_estimator iterator must be empty"
+         else:
+             assert not any(cv_predictions), "cv_predictions iterator must be empty"
+
+         fit_single_estimator(self.outer_estimator_, X=X_pred, y=y_pred)
+         outer_weights = self.outer_estimator_.weights_
+         self.weights_ = outer_weights @ inner_weights
+         return self
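
The class docstring above describes the NCO pipeline: cluster the assets on a distance matrix, fit an inner-estimator per cluster, then fit the outer-estimator on the clusters' out-of-sample outputs and combine the two sets of weights. The following is a minimal usage sketch of that API, not part of the package itself: the return data is synthetic, and the flat imports assume the estimators are re-exported from `skfolio.optimization` by the package's `__init__` files (otherwise they can be imported from the modules shown in this diff).

import numpy as np
import pandas as pd

from skfolio.cluster import HierarchicalClustering
from skfolio.optimization import MeanRisk, NestedClustersOptimization

# Synthetic daily returns for 20 assets, standing in for real price returns.
rng = np.random.default_rng(0)
X = pd.DataFrame(
    rng.normal(loc=0.0005, scale=0.01, size=(500, 20)),
    columns=[f"ASSET_{i}" for i in range(20)],
)

model = NestedClustersOptimization(
    inner_estimator=MeanRisk(),                     # intra-cluster weights
    outer_estimator=MeanRisk(),                     # inter-cluster weights
    clustering_estimator=HierarchicalClustering(),  # clusters from the distance matrix
    n_jobs=-1,
)
model.fit(X)
print(model.weights_)  # final weights = outer-weights @ inner-weights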
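The `cv`, `quantile` and `quantile_measure` parameters documented above control how one out-of-sample path is selected when a combinatorial cross-validator is used. A short sketch, assuming `CombinatorialPurgedCV` (exposed by `skfolio.model_selection` per this release) can be constructed with its defaults, which are not shown in this diff:

from skfolio.measures import RatioMeasure
from skfolio.model_selection import CombinatorialPurgedCV
from skfolio.optimization import NestedClustersOptimization

# Each cluster's out-of-sample output becomes a collection of paths; the path at the
# 75th Sharpe-ratio quantile is the one used to fit the outer-estimator.
model = NestedClustersOptimization(
    cv=CombinatorialPurgedCV(),  # assumed default construction
    quantile=0.75,
    quantile_measure=RatioMeasure.SHARPE_RATIO,
)
model.fit(X)  # X: returns DataFrame as in the previous sketch

# cv="ignore" would skip cross-validation entirely, at the documented risk of
# data leakage and overfitting.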
skfolio/optimization/cluster/hierarchical/__init__.py
@@ -0,0 +1,13 @@
+ from skfolio.optimization.cluster.hierarchical._base import (
+     BaseHierarchicalOptimization,
+ )
+ from skfolio.optimization.cluster.hierarchical._herc import (
+     HierarchicalEqualRiskContribution,
+ )
+ from skfolio.optimization.cluster.hierarchical._hrp import HierarchicalRiskParity
+
+ __all__ = [
+     "BaseHierarchicalOptimization",
+     "HierarchicalRiskParity",
+     "HierarchicalEqualRiskContribution",
+ ]