skfolio 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (79) hide show
  1. skfolio/__init__.py +29 -0
  2. skfolio/cluster/__init__.py +8 -0
  3. skfolio/cluster/_hierarchical.py +387 -0
  4. skfolio/datasets/__init__.py +20 -0
  5. skfolio/datasets/_base.py +389 -0
  6. skfolio/datasets/data/__init__.py +0 -0
  7. skfolio/datasets/data/factors_dataset.csv.gz +0 -0
  8. skfolio/datasets/data/sp500_dataset.csv.gz +0 -0
  9. skfolio/datasets/data/sp500_index.csv.gz +0 -0
  10. skfolio/distance/__init__.py +26 -0
  11. skfolio/distance/_base.py +55 -0
  12. skfolio/distance/_distance.py +574 -0
  13. skfolio/exceptions.py +30 -0
  14. skfolio/measures/__init__.py +76 -0
  15. skfolio/measures/_enums.py +355 -0
  16. skfolio/measures/_measures.py +607 -0
  17. skfolio/metrics/__init__.py +3 -0
  18. skfolio/metrics/_scorer.py +121 -0
  19. skfolio/model_selection/__init__.py +18 -0
  20. skfolio/model_selection/_combinatorial.py +407 -0
  21. skfolio/model_selection/_validation.py +194 -0
  22. skfolio/model_selection/_walk_forward.py +221 -0
  23. skfolio/moments/__init__.py +41 -0
  24. skfolio/moments/covariance/__init__.py +29 -0
  25. skfolio/moments/covariance/_base.py +101 -0
  26. skfolio/moments/covariance/_covariance.py +1108 -0
  27. skfolio/moments/expected_returns/__init__.py +21 -0
  28. skfolio/moments/expected_returns/_base.py +31 -0
  29. skfolio/moments/expected_returns/_expected_returns.py +415 -0
  30. skfolio/optimization/__init__.py +36 -0
  31. skfolio/optimization/_base.py +147 -0
  32. skfolio/optimization/cluster/__init__.py +13 -0
  33. skfolio/optimization/cluster/_nco.py +348 -0
  34. skfolio/optimization/cluster/hierarchical/__init__.py +13 -0
  35. skfolio/optimization/cluster/hierarchical/_base.py +440 -0
  36. skfolio/optimization/cluster/hierarchical/_herc.py +406 -0
  37. skfolio/optimization/cluster/hierarchical/_hrp.py +368 -0
  38. skfolio/optimization/convex/__init__.py +16 -0
  39. skfolio/optimization/convex/_base.py +1944 -0
  40. skfolio/optimization/convex/_distributionally_robust.py +392 -0
  41. skfolio/optimization/convex/_maximum_diversification.py +417 -0
  42. skfolio/optimization/convex/_mean_risk.py +974 -0
  43. skfolio/optimization/convex/_risk_budgeting.py +560 -0
  44. skfolio/optimization/ensemble/__init__.py +6 -0
  45. skfolio/optimization/ensemble/_base.py +87 -0
  46. skfolio/optimization/ensemble/_stacking.py +326 -0
  47. skfolio/optimization/naive/__init__.py +3 -0
  48. skfolio/optimization/naive/_naive.py +173 -0
  49. skfolio/population/__init__.py +3 -0
  50. skfolio/population/_population.py +883 -0
  51. skfolio/portfolio/__init__.py +13 -0
  52. skfolio/portfolio/_base.py +1096 -0
  53. skfolio/portfolio/_multi_period_portfolio.py +610 -0
  54. skfolio/portfolio/_portfolio.py +842 -0
  55. skfolio/pre_selection/__init__.py +7 -0
  56. skfolio/pre_selection/_pre_selection.py +342 -0
  57. skfolio/preprocessing/__init__.py +3 -0
  58. skfolio/preprocessing/_returns.py +114 -0
  59. skfolio/prior/__init__.py +18 -0
  60. skfolio/prior/_base.py +63 -0
  61. skfolio/prior/_black_litterman.py +238 -0
  62. skfolio/prior/_empirical.py +163 -0
  63. skfolio/prior/_factor_model.py +268 -0
  64. skfolio/typing.py +50 -0
  65. skfolio/uncertainty_set/__init__.py +23 -0
  66. skfolio/uncertainty_set/_base.py +108 -0
  67. skfolio/uncertainty_set/_bootstrap.py +281 -0
  68. skfolio/uncertainty_set/_empirical.py +237 -0
  69. skfolio/utils/__init__.py +0 -0
  70. skfolio/utils/bootstrap.py +115 -0
  71. skfolio/utils/equations.py +350 -0
  72. skfolio/utils/sorting.py +117 -0
  73. skfolio/utils/stats.py +466 -0
  74. skfolio/utils/tools.py +567 -0
  75. skfolio-0.0.1.dist-info/LICENSE +29 -0
  76. skfolio-0.0.1.dist-info/METADATA +568 -0
  77. skfolio-0.0.1.dist-info/RECORD +79 -0
  78. skfolio-0.0.1.dist-info/WHEEL +5 -0
  79. skfolio-0.0.1.dist-info/top_level.txt +1 -0
skfolio/__init__.py ADDED
@@ -0,0 +1,29 @@
1
+ """skfolio package"""
2
+
3
+ # Author: Hugo Delatte <delatte.hugo@gmail.com>
4
+ # License: BSD 3 clause
5
+ import importlib.metadata
6
+
7
+ from skfolio.measures import (
8
+ BaseMeasure,
9
+ ExtraRiskMeasure,
10
+ PerfMeasure,
11
+ RatioMeasure,
12
+ RiskMeasure,
13
+ )
14
+ from skfolio.population import Population
15
+ from skfolio.portfolio import BasePortfolio, MultiPeriodPortfolio, Portfolio
16
+
17
# Version string looked up from the installed "skfolio" distribution metadata.
# NOTE: `importlib.metadata.version` raises `PackageNotFoundError` when no
# distribution with that name is installed.
__version__ = importlib.metadata.version("skfolio")

# Names exported at the package top level; each one is imported above from
# `skfolio.measures`, `skfolio.portfolio` or `skfolio.population`.
__all__ = [
    "BaseMeasure",
    "PerfMeasure",
    "RiskMeasure",
    "ExtraRiskMeasure",
    "RatioMeasure",
    "BasePortfolio",
    "Portfolio",
    "MultiPeriodPortfolio",
    "Population",
]
skfolio/cluster/__init__.py ADDED
@@ -0,0 +1,8 @@
1
+ """Hierarchical Clustering estimators."""
2
+
3
+ # Author: Hugo Delatte <delatte.hugo@gmail.com>
4
+ # License: BSD 3 clause
5
+
6
+ from skfolio.cluster._hierarchical import HierarchicalClustering, LinkageMethod
7
+
8
# Public names of the `skfolio.cluster` package, both re-exported from
# `skfolio.cluster._hierarchical`.
__all__ = ["LinkageMethod", "HierarchicalClustering"]
skfolio/cluster/_hierarchical.py ADDED
@@ -0,0 +1,387 @@
1
+ """Hierarchical Clustering estimators."""
2
+
3
+ # Author: Hugo Delatte <delatte.hugo@gmail.com>
4
+ # License: BSD 3 clause
5
+
6
+ from enum import auto
7
+
8
+ import numpy as np
9
+ import numpy.typing as npt
10
+ import plotly.figure_factory as ff
11
+ import plotly.graph_objects as go
12
+ import scipy.cluster.hierarchy as sch
13
+ import scipy.spatial.distance as scd
14
+ import sklearn.base as skb
15
+ import sklearn.utils.validation as skv
16
+
17
+ from skfolio.utils.stats import assert_is_distance, compute_optimal_n_clusters
18
+ from skfolio.utils.tools import AutoEnum, default_asset_names
19
+
20
+
21
class LinkageMethod(AutoEnum):
    r"""Methods for calculating the distance between clusters in the linkage matrix.

    See the `Linkage Methods` section of `scipy.cluster.hierarchy.linkage`
    for full descriptions.

    The value of the selected member is forwarded (as a string) to the ``method``
    argument of `scipy.cluster.hierarchy.linkage` by `HierarchicalClustering.fit`.
    NOTE(review): the values are produced by `AutoEnum` / `auto()`; presumably they
    are the lower-cased member names expected by scipy -- confirm in
    `skfolio.utils.tools`.

    Attributes
    ----------
    SINGLE : str
        Assigns

        .. math:: d(u,v) = \min(dist(u[i],v[j]))

        for all points :math:`i` in cluster :math:`u` and
        :math:`j` in cluster :math:`v`. This is also known as the
        Nearest Point Algorithm.

    COMPLETE : str
        Assigns

        .. math:: d(u, v) = \max(dist(u[i],v[j]))

        for all points :math:`i` in cluster :math:`u` and :math:`j` in
        cluster :math:`v`. This is also known as the Farthest Point
        Algorithm or Voor Hees Algorithm.

    AVERAGE : str
        Assigns

        .. math:: d(u,v) = \sum_{ij} \frac{d(u[i], v[j])}{(|u|*|v|)}

        for all points :math:`i` and :math:`j` where :math:`|u|`
        and :math:`|v|` are the cardinalities of clusters :math:`u`
        and :math:`v`, respectively. This is also called the UPGMA
        algorithm.

    WEIGHTED : str
        Assigns

        .. math:: d(u,v) = (dist(s,v) + dist(t,v))/2

        where cluster :math:`u` was formed with clusters :math:`s` and :math:`t`
        and :math:`v` is a remaining cluster in the forest (also called WPGMA).

    CENTROID : str
        Assigns

        .. math::
            dist(s,t) = ||c_s-c_t||_2

        where :math:`c_s` and :math:`c_t` are the centroids of
        clusters :math:`s` and :math:`t`, respectively.
        This is also known as the UPGMC algorithm.

    MEDIAN : str
        Assigns :math:`d(s,t)` like the ``centroid`` method.
        This is also known as the WPGMC algorithm.

    WARD : str
        Uses the Ward variance minimization algorithm.
        The new entry :math:`d(u,v)` is computed as follows,

        .. math::

            d(u,v) = \sqrt{\frac{|v|+|s|}
                               {T}d(v,s)^2
                        + \frac{|v|+|t|}
                               {T}d(v,t)^2
                        - \frac{|v|}
                               {T}d(s,t)^2}

        where :math:`u` is the newly joined cluster consisting of
        clusters :math:`s` and :math:`t`, :math:`v` is an unused
        cluster in the forest, :math:`T=|v|+|s|+|t|`, and
        :math:`|*|` is the cardinality of its argument. This is also
        known as the incremental algorithm.
    """

    SINGLE = auto()
    COMPLETE = auto()
    AVERAGE = auto()
    WEIGHTED = auto()
    CENTROID = auto()
    MEDIAN = auto()
    WARD = auto()
106
+
107
+
108
class HierarchicalClustering(skb.ClusterMixin, skb.BaseEstimator):
    r"""Hierarchical Clustering.

    Parameters
    ----------
    max_clusters : int, optional
        For coherent clustering, the algorithm finds a minimum threshold ``r`` so that
        the cophenetic distance between any two original observations in the same flat
        cluster is no more than ``r`` and no more than `max_clusters` flat clusters are
        formed. The default (`None`) is to estimate the maximal number of clusters
        based on the Two-Order Difference to Gap Statistic [1]_.

    linkage_method : LinkageMethod, default=LinkageMethod.WARD
        Methods for calculating the distance between clusters in the linkage matrix.
        See the `Linkage Methods` section of `scipy.cluster.hierarchy.linkage` for
        the full descriptions.
        The default is the Ward variance minimization algorithm `LinkageMethod.WARD`.

    Attributes
    ----------
    n_clusters_ : int
        Number of formed clusters.

    labels_ : ndarray of shape (n_assets,)
        Cluster label of each asset, starting at 0.

    linkage_matrix_ : ndarray of shape (n_assets - 1, 4)
        Linkage matrix computed from the distance matrix `X` passed to `fit`.

    condensed_distance_ : ndarray of shape (n_assets * (n_assets - 1) / 2,)
        The 1-D condensed form of the distance matrix `X` passed to `fit`.

    n_features_in_ : int
        Number of assets seen during `fit`.

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of assets seen during `fit`. Defined only when `X`
        has assets names that are all strings.

    References
    ----------
    .. [1] "Application of two-order difference to gap statistic".
        Yue, Wang & Wei (2009)
    """

    n_clusters_: int
    labels_: np.ndarray
    linkage_matrix_: np.ndarray
    condensed_distance_: np.ndarray

    def __init__(
        self,
        max_clusters: int | None = None,
        linkage_method: LinkageMethod = LinkageMethod.WARD,
    ):
        self.max_clusters = max_clusters
        self.linkage_method = linkage_method

    def fit(self, X: npt.ArrayLike, y: None = None) -> "HierarchicalClustering":
        """Fit the Hierarchical Clustering estimator.

        Parameters
        ----------
        X : array-like of shape (n_assets, n_assets)
            Distance matrix of the assets.

        y : Ignored
            Not used, present for API consistency by convention.

        Returns
        -------
        self : HierarchicalClustering
            Fitted estimator.

        Raises
        ------
        ValueError
            If `max_clusters` is provided and is lower than 1.
        """
        max_clusters = self.max_clusters
        # Explicit validation: the sanity `assert` further down is stripped when
        # Python runs with `-O`, so it must not be the only guard on user input.
        if max_clusters is not None and max_clusters < 1:
            raise ValueError(
                f"`max_clusters` must be at least 1, got {max_clusters}"
            )

        X = self._validate_data(X)
        assert_is_distance(X)
        self.condensed_distance_ = scd.squareform(X, checks=False)
        self.linkage_matrix_ = sch.linkage(
            self.condensed_distance_,
            method=str(self.linkage_method.value),
            # Not needed for clustering, only for
            # visualization and can be slow. So we perform the optimal ordering only
            # in `plot_dendrogram`.
            optimal_ordering=False,
        )
        if max_clusters is None:
            max_clusters = compute_optimal_n_clusters(
                distance=X,
                linkage_matrix=self.linkage_matrix_,
            )
        # Get the flat clusters from the linkage matrix
        labels = sch.fcluster(
            self.linkage_matrix_, t=max_clusters, criterion="maxclust"
        )
        labels -= 1  # `fcluster` labels start at 1; shift them to start at 0
        self.n_clusters_ = len(np.unique(labels))
        # Internal sanity check: labels are contiguous and the cap is respected.
        assert self.n_clusters_ == labels.max() + 1 <= max_clusters
        self.labels_ = labels
        return self

    def plot_dendrogram(self, heatmap: bool = True) -> go.Figure:
        """Plot the dendrogram.

        The blue lines represent distinct clusters composed of a single asset.
        The remaining colors represent clusters of more than one asset.

        When `heatmap` is set to True, the heatmap of the reordered distance matrix is
        displayed below the dendrogram and clusters are outlined with yellow squares.

        The number of clusters used in the plot is the fitted `n_clusters_`
        attribute.

        Parameters
        ----------
        heatmap : bool, default=True
            If this is set to True, the distance heatmap is returned with the clusters
            outlined in yellow.

        Returns
        -------
        fig : Figure
            The dendrogram figure.
        """
        skv.check_is_fitted(self, "linkage_matrix_")
        linkage_matrix = sch.optimal_leaf_ordering(
            self.linkage_matrix_, self.condensed_distance_
        )

        n_assets = linkage_matrix.shape[0] + 1

        # Links whose height is >= the color threshold are drawn with the
        # "above threshold" color. Cutting into k clusters undoes the last k - 1
        # merges, so the threshold is the height of the lowest of those merges.
        if self.n_clusters_ > 1:
            cophenetic_distance_threshold = linkage_matrix[
                -(self.n_clusters_ - 1), 2
            ]
        else:
            # With a single cluster no merge is undone: indexing with `-0` would
            # select the *first* merge and color every link as "above threshold".
            # Use a threshold strictly above the highest merge instead.
            cophenetic_distance_threshold = np.nextafter(
                linkage_matrix[-1, 2], np.inf
            )

        if hasattr(self, "feature_names_in_"):
            asset_names = self.feature_names_in_
        else:
            asset_names = default_asset_names(n_assets=n_assets)
        # Fancy indexing below requires an array, not a plain sequence.
        asset_names = np.asarray(asset_names)

        def _make_dendrogram(orientation: str) -> go.Figure:
            # The first argument is a placeholder: `distfun` and `linkagefun`
            # ignore their input and return the precomputed linkage matrix.
            return ff.create_dendrogram(
                np.ones(1),
                orientation=orientation,
                distfun=lambda x: None,
                linkagefun=lambda x: linkage_matrix,
                color_threshold=cophenetic_distance_threshold,
                labels=asset_names,
            )

        if not heatmap:
            fig = _make_dendrogram("bottom")
            fig.update_layout(
                title="Dendrogram",
                width=800,
                height=400,
                showlegend=False,
                hovermode="closest",
                xaxis={"title": "Assets"},
                yaxis={"title": "Distance"},
            )
            return fig

        # Initialize figure by creating upper dendrogram
        fig = _make_dendrogram("bottom")
        for trace in fig["data"]:
            trace["yaxis"] = "y2"

        # Create side dendrogram and add its traces to the figure
        side_dendrogram = _make_dendrogram("right")
        for trace in side_dendrogram["data"]:
            trace["xaxis"] = "x2"
            fig.add_trace(trace)

        # Recover the leaf order chosen by the dendrogram as indices into
        # `asset_names`
        ordered_asset_names = side_dendrogram["layout"]["yaxis"]["ticktext"]
        ordered_asset_names_idx = np.array(
            [np.argwhere(x == asset_names)[0][0] for x in ordered_asset_names]
        )
        assert np.array_equal(asset_names[ordered_asset_names_idx], ordered_asset_names)

        # Create heatmap of the distance matrix reordered like the dendrogram
        # leaves and positioned on the dendrogram tick values
        distance = scd.squareform(self.condensed_distance_, checks=False)
        heat_data = distance[ordered_asset_names_idx, :][:, ordered_asset_names_idx]
        heatmap_trace = go.Heatmap(
            x=fig["layout"]["xaxis"]["tickvals"],
            y=side_dendrogram["layout"]["yaxis"]["tickvals"],
            z=heat_data,
            colorscale="Blues",
            name="",
        )
        fig.add_trace(heatmap_trace)

        # Outline clusters: `delta` is the spacing between two consecutive leaves
        delta = heatmap_trace["x"][1] - heatmap_trace["x"][0]
        clusters_ids = self.labels_[ordered_asset_names_idx]
        for cluster_id in range(clusters_ids.max() + 1):
            positions = np.argwhere(clusters_ids == cluster_id).ravel()
            start = positions[0] * delta
            end = (positions[-1] + 1) * delta
            fig.add_shape(
                type="rect",
                x0=start,
                y0=start,
                x1=end,
                y1=end,
                line=dict(
                    color="gold",
                    width=2,
                ),
            )

        # Axis settings shared by the four axes (no grid, frame or tick marks)
        bare_axis = {
            "mirror": False,
            "showgrid": False,
            "showline": False,
            "zeroline": False,
            "ticks": "",
        }
        fig.update_layout(
            title="Dendrogram",
            width=800,
            height=800,
            showlegend=False,
            hovermode="closest",
            xaxis={
                **bare_axis,
                "title": "Assets",
                "domain": [0.15, 1],
            },
            xaxis2={
                **bare_axis,
                "domain": [0, 0.15],
                "showticklabels": False,
            },
            yaxis={
                **bare_axis,
                "title": "Assets",
                "domain": [0, 0.85],
                "showticklabels": False,
                "tickvals": fig["layout"]["xaxis"]["tickvals"],
                "ticktext": fig["layout"]["xaxis"]["ticktext"],
            },
            yaxis2={
                **bare_axis,
                "domain": [0.825, 0.975],
                "showticklabels": False,
            },
        )

        return fig
skfolio/datasets/__init__.py ADDED
@@ -0,0 +1,20 @@
1
+ """Datasets module."""
2
+
3
+ # Author: Hugo Delatte <delatte.hugo@gmail.com>
4
+ # License: BSD 3 clause
5
+
6
+ from skfolio.datasets._base import (
7
+ load_factors_dataset,
8
+ load_ftse100_dataset,
9
+ load_nasdaq_dataset,
10
+ load_sp500_dataset,
11
+ load_sp500_index,
12
+ )
13
+
14
+ __all__ = [
15
+ "load_nasdaq_dataset",
16
+ "load_factors_dataset",
17
+ "load_ftse100_dataset",
18
+ "load_sp500_dataset",
19
+ "load_sp500_index",
20
+ ]