skfolio 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (79)
  1. skfolio/__init__.py +29 -0
  2. skfolio/cluster/__init__.py +8 -0
  3. skfolio/cluster/_hierarchical.py +387 -0
  4. skfolio/datasets/__init__.py +20 -0
  5. skfolio/datasets/_base.py +389 -0
  6. skfolio/datasets/data/__init__.py +0 -0
  7. skfolio/datasets/data/factors_dataset.csv.gz +0 -0
  8. skfolio/datasets/data/sp500_dataset.csv.gz +0 -0
  9. skfolio/datasets/data/sp500_index.csv.gz +0 -0
  10. skfolio/distance/__init__.py +26 -0
  11. skfolio/distance/_base.py +55 -0
  12. skfolio/distance/_distance.py +574 -0
  13. skfolio/exceptions.py +30 -0
  14. skfolio/measures/__init__.py +76 -0
  15. skfolio/measures/_enums.py +355 -0
  16. skfolio/measures/_measures.py +607 -0
  17. skfolio/metrics/__init__.py +3 -0
  18. skfolio/metrics/_scorer.py +121 -0
  19. skfolio/model_selection/__init__.py +18 -0
  20. skfolio/model_selection/_combinatorial.py +407 -0
  21. skfolio/model_selection/_validation.py +194 -0
  22. skfolio/model_selection/_walk_forward.py +221 -0
  23. skfolio/moments/__init__.py +41 -0
  24. skfolio/moments/covariance/__init__.py +29 -0
  25. skfolio/moments/covariance/_base.py +101 -0
  26. skfolio/moments/covariance/_covariance.py +1108 -0
  27. skfolio/moments/expected_returns/__init__.py +21 -0
  28. skfolio/moments/expected_returns/_base.py +31 -0
  29. skfolio/moments/expected_returns/_expected_returns.py +415 -0
  30. skfolio/optimization/__init__.py +36 -0
  31. skfolio/optimization/_base.py +147 -0
  32. skfolio/optimization/cluster/__init__.py +13 -0
  33. skfolio/optimization/cluster/_nco.py +348 -0
  34. skfolio/optimization/cluster/hierarchical/__init__.py +13 -0
  35. skfolio/optimization/cluster/hierarchical/_base.py +440 -0
  36. skfolio/optimization/cluster/hierarchical/_herc.py +406 -0
  37. skfolio/optimization/cluster/hierarchical/_hrp.py +368 -0
  38. skfolio/optimization/convex/__init__.py +16 -0
  39. skfolio/optimization/convex/_base.py +1944 -0
  40. skfolio/optimization/convex/_distributionally_robust.py +392 -0
  41. skfolio/optimization/convex/_maximum_diversification.py +417 -0
  42. skfolio/optimization/convex/_mean_risk.py +974 -0
  43. skfolio/optimization/convex/_risk_budgeting.py +560 -0
  44. skfolio/optimization/ensemble/__init__.py +6 -0
  45. skfolio/optimization/ensemble/_base.py +87 -0
  46. skfolio/optimization/ensemble/_stacking.py +326 -0
  47. skfolio/optimization/naive/__init__.py +3 -0
  48. skfolio/optimization/naive/_naive.py +173 -0
  49. skfolio/population/__init__.py +3 -0
  50. skfolio/population/_population.py +883 -0
  51. skfolio/portfolio/__init__.py +13 -0
  52. skfolio/portfolio/_base.py +1096 -0
  53. skfolio/portfolio/_multi_period_portfolio.py +610 -0
  54. skfolio/portfolio/_portfolio.py +842 -0
  55. skfolio/pre_selection/__init__.py +7 -0
  56. skfolio/pre_selection/_pre_selection.py +342 -0
  57. skfolio/preprocessing/__init__.py +3 -0
  58. skfolio/preprocessing/_returns.py +114 -0
  59. skfolio/prior/__init__.py +18 -0
  60. skfolio/prior/_base.py +63 -0
  61. skfolio/prior/_black_litterman.py +238 -0
  62. skfolio/prior/_empirical.py +163 -0
  63. skfolio/prior/_factor_model.py +268 -0
  64. skfolio/typing.py +50 -0
  65. skfolio/uncertainty_set/__init__.py +23 -0
  66. skfolio/uncertainty_set/_base.py +108 -0
  67. skfolio/uncertainty_set/_bootstrap.py +281 -0
  68. skfolio/uncertainty_set/_empirical.py +237 -0
  69. skfolio/utils/__init__.py +0 -0
  70. skfolio/utils/bootstrap.py +115 -0
  71. skfolio/utils/equations.py +350 -0
  72. skfolio/utils/sorting.py +117 -0
  73. skfolio/utils/stats.py +466 -0
  74. skfolio/utils/tools.py +567 -0
  75. skfolio-0.0.1.dist-info/LICENSE +29 -0
  76. skfolio-0.0.1.dist-info/METADATA +568 -0
  77. skfolio-0.0.1.dist-info/RECORD +79 -0
  78. skfolio-0.0.1.dist-info/WHEEL +5 -0
  79. skfolio-0.0.1.dist-info/top_level.txt +1 -0
@@ -0,0 +1,1108 @@
+ """Covariance Estimators."""
+
+
+ import numpy as np
+ import numpy.typing as npt
+ import pandas as pd
+ import scipy.optimize as sco
+ import sklearn.covariance as skc
+ import sklearn.neighbors as skn
+
+ from skfolio.moments.covariance._base import BaseCovariance
+ from skfolio.utils.stats import corr_to_cov, cov_to_corr
+ from skfolio.utils.tools import check_estimator
+
+
+ class EmpiricalCovariance(BaseCovariance):
+     """Empirical covariance estimator.
+
+     Parameters
+     ----------
+     window_size : int, optional
+         Window size. The model is fitted on the last `window_size` observations.
+         The default (`None`) is to use all the data.
+
+     ddof : int, default=1
+         Normalization is by `(n_observations - ddof)`.
+         Note that `ddof=1` returns the unbiased estimate and `ddof=0` returns
+         the simple average. The default value is `1`.
+
+     nearest : bool, default=False
+         If this is set to True, the covariance is replaced by the nearest covariance
+         matrix that is positive definite and with a Cholesky decomposition that can
+         be computed. The variance is left unchanged. A covariance matrix is in
+         theory PSD; however, due to floating-point inaccuracies, we can end up with
+         a covariance matrix that is slightly non-PSD or whose Cholesky decomposition
+         fails. This often occurs in high-dimensional problems.
+         For more details, see :func:`~skfolio.utils.stats.cov_nearest`.
+         The default is `False`.
+
+     higham : bool, default=False
+         If this is set to True, the Higham (2002) algorithm is used to find the
+         nearest PSD covariance; otherwise, the eigenvalues are clipped to a
+         threshold above zero (1e-13). The default is `False`, which uses the
+         clipping method, as the Higham algorithm can be slow for large datasets.
+
+     higham_max_iteration : int, default=100
+         Maximum number of iterations of the Higham (2002) algorithm.
+         The default value is `100`.
+
+     Attributes
+     ----------
+     covariance_ : ndarray of shape (n_assets, n_assets)
+         Estimated covariance matrix.
+
+     n_features_in_ : int
+         Number of assets seen during `fit`.
+
+     feature_names_in_ : ndarray of shape (`n_features_in_`,)
+         Names of assets seen during `fit`. Defined only when `X`
+         has asset names that are all strings.
+     """
+
+     def __init__(
+         self,
+         window_size: int | None = None,
+         ddof: int = 1,
+         nearest: bool = False,
+         higham: bool = False,
+         higham_max_iteration: int = 100,
+     ):
+         super().__init__(
+             nearest=nearest,
+             higham=higham,
+             higham_max_iteration=higham_max_iteration,
+         )
+         self.window_size = window_size
+         self.ddof = ddof
+
+     def fit(self, X: npt.ArrayLike, y=None) -> "EmpiricalCovariance":
+         """Fit the empirical covariance estimator.
+
+         Parameters
+         ----------
+         X : array-like of shape (n_observations, n_assets)
+             Price returns of the assets.
+
+         y : Ignored
+             Not used, present for API consistency by convention.
+
+         Returns
+         -------
+         self : EmpiricalCovariance
+             Fitted estimator.
+         """
+         X = self._validate_data(X)
+         if self.window_size is not None:
+             X = X[-self.window_size :]
+         covariance = np.cov(X.T, ddof=self.ddof)
+         self._set_covariance(covariance)
+         return self
+
+
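A minimal usage sketch (editorial addition, not part of the diff): it assumes the class is re-exported as `skfolio.moments.EmpiricalCovariance`, as the docstring cross-references suggest, and uses synthetic returns.

    import numpy as np
    from skfolio.moments import EmpiricalCovariance  # assumed re-export path

    rng = np.random.default_rng(0)
    X = rng.normal(scale=0.01, size=(250, 5))  # 250 observations of 5 asset returns

    model = EmpiricalCovariance(window_size=100)  # fit on the last 100 rows only
    model.fit(X)
    print(model.covariance_.shape)  # (5, 5)
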
+ class GerberCovariance(BaseCovariance):
+     """Gerber covariance estimator.
+
+     Robust co-movement measure that ignores fluctuations below a certain threshold
+     while simultaneously limiting the effects of extreme movements.
+     The Gerber statistic extends Kendall's Tau by counting the proportion of
+     simultaneous co-movements in series when their amplitudes exceed data-dependent
+     thresholds.
+
+     Three variants have been published:
+
+     * Gerber et al. (2015): tends to produce matrices that are non-PSD.
+     * Gerber et al. (2019): alteration of the denominator of the above statistic.
+     * Gerber et al. (2022): final alteration to ensure a PSD matrix.
+
+     The last two variants are implemented.
+
+     Parameters
+     ----------
+     window_size : int, optional
+         Window size. The model is fitted on the last `window_size` observations.
+         The default (`None`) is to use all the data.
+
+     threshold : float, default=0.5
+         Gerber threshold. Must be strictly between 0 and 1.
+         The default value is `0.5`.
+
+     psd_variant : bool, default=True
+         If this is set to True, the Gerber et al. (2022) variant is used to ensure a
+         positive semi-definite matrix.
+         Otherwise, the Gerber et al. (2019) variant is used.
+         The default is `True`.
+
+     nearest : bool, default=False
+         If this is set to True, the covariance is replaced by the nearest covariance
+         matrix that is positive definite and with a Cholesky decomposition that can
+         be computed. The variance is left unchanged. A covariance matrix is in
+         theory PSD; however, due to floating-point inaccuracies, we can end up with
+         a covariance matrix that is slightly non-PSD or whose Cholesky decomposition
+         fails. This often occurs in high-dimensional problems.
+         For more details, see :func:`~skfolio.utils.stats.cov_nearest`.
+         The default is `False`.
+
+     higham : bool, default=False
+         If this is set to True, the Higham (2002) algorithm is used to find the
+         nearest PSD covariance; otherwise, the eigenvalues are clipped to a
+         threshold above zero (1e-13). The default is `False`, which uses the
+         clipping method, as the Higham algorithm can be slow for large datasets.
+
+     higham_max_iteration : int, default=100
+         Maximum number of iterations of the Higham (2002) algorithm.
+         The default value is `100`.
+
+     Attributes
+     ----------
+     covariance_ : ndarray of shape (n_assets, n_assets)
+         Estimated covariance.
+
+     n_features_in_ : int
+         Number of assets seen during `fit`.
+
+     feature_names_in_ : ndarray of shape (`n_features_in_`,)
+         Names of assets seen during `fit`. Defined only when `X`
+         has asset names that are all strings.
+
+     References
+     ----------
+     .. [1] "The Gerber statistic: A robust co-movement measure for portfolio
+         optimization".
+         The Journal of Portfolio Management.
+         Gerber, S., B. Javid, H. Markowitz, P. Sargen, and D. Starer (2022).
+
+     .. [2] "The Gerber statistic: A robust measure of correlation".
+         Gerber, S., B. Javid, H. Markowitz, P. Sargen, and D. Starer (2019).
+
+     .. [3] "Enhancing multi-asset portfolio construction under modern portfolio
+         theory with a robust co-movement measure".
+         Social Science Research Network Working Paper Series.
+         Gerber, S., H. Markowitz, and P. Pujara (2015).
+
+     .. [4] "Deconstructing the Gerber Statistic".
+         Flint and Polakow (2023).
+     """
+
+     def __init__(
+         self,
+         window_size: int | None = None,
+         threshold: float = 0.5,
+         psd_variant: bool = True,
+         nearest: bool = False,
+         higham: bool = False,
+         higham_max_iteration: int = 100,
+     ):
+         super().__init__(
+             nearest=nearest,
+             higham=higham,
+             higham_max_iteration=higham_max_iteration,
+         )
+         self.window_size = window_size
+         self.threshold = threshold
+         self.psd_variant = psd_variant
+
+     def fit(self, X: npt.ArrayLike, y=None) -> "GerberCovariance":
+         """Fit the Gerber covariance estimator.
+
+         Parameters
+         ----------
+         X : array-like of shape (n_observations, n_assets)
+             Price returns of the assets.
+
+         y : Ignored
+             Not used, present for API consistency by convention.
+
+         Returns
+         -------
+         self : GerberCovariance
+             Fitted estimator.
+         """
+         X = self._validate_data(X)
+         if self.window_size is not None:
+             X = X[-self.window_size :]
+         if not (1 > self.threshold > 0):
+             raise ValueError("The threshold must be between 0 and 1")
+         n_observations = X.shape[0]
+         std = X.std(axis=0).reshape((-1, 1))
+         # Upward and downward co-movements beyond the threshold.
+         u = X >= std.T * self.threshold
+         d = X <= -std.T * self.threshold
+         n = np.invert(u) & np.invert(d)  # np.invert preferred over ~ for type hints
+         n = n.astype(int)
+         u = u.astype(int)
+         d = d.astype(int)
+         concordant = u.T @ u + d.T @ d
+         discordant = u.T @ d + d.T @ u
+         h = concordant - discordant
+         if self.psd_variant:
+             corr = h / (n_observations - n.T @ n)
+         else:
+             h_sqrt = np.sqrt(np.diag(h)).reshape((-1, 1))
+             corr = h / (h_sqrt @ h_sqrt.T)
+         covariance = corr_to_cov(corr, std.reshape(-1))
+         self._set_covariance(covariance)
+         return self
+
+
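For intuition: a pair of observations counts as concordant when both returns exceed `threshold` standard deviations in the same direction and discordant when they exceed it in opposite directions. A usage sketch on synthetic data (same assumed import path as above):

    import numpy as np
    from skfolio.moments import GerberCovariance  # assumed re-export path

    X = np.random.default_rng(42).normal(scale=0.02, size=(500, 4))
    model = GerberCovariance(threshold=0.5, psd_variant=True)
    model.fit(X)
    print(model.covariance_)  # (4, 4) Gerber covariance matrix
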
+ class DenoiseCovariance(BaseCovariance):
+     """Covariance Denoising estimator.
+
+     The goal of covariance denoising is to reduce the noise and enhance the signal
+     of the empirical covariance matrix [1]_.
+     It reduces the ill-conditioning of the traditional covariance estimate by
+     differentiating the eigenvalues associated with noise from the eigenvalues
+     associated with signal.
+     Denoising replaces the eigenvalues of the eigenvectors classified as noise by
+     the Marčenko-Pastur distribution with a constant eigenvalue.
+
+     Parameters
+     ----------
+     covariance_estimator : BaseCovariance, optional
+         :ref:`Covariance estimator <covariance_estimator>` to estimate the covariance
+         matrix that will be denoised.
+         The default (`None`) is to use :class:`~skfolio.moments.EmpiricalCovariance`.
+
+     nearest : bool, default=False
+         If this is set to True, the covariance is replaced by the nearest covariance
+         matrix that is positive definite and with a Cholesky decomposition that can
+         be computed. The variance is left unchanged. A covariance matrix is in
+         theory PSD; however, due to floating-point inaccuracies, we can end up with
+         a covariance matrix that is slightly non-PSD or whose Cholesky decomposition
+         fails. This often occurs in high-dimensional problems.
+         For more details, see :func:`~skfolio.utils.stats.cov_nearest`.
+         The default is `False`.
+
+     higham : bool, default=False
+         If this is set to True, the Higham (2002) algorithm is used to find the
+         nearest PSD covariance; otherwise, the eigenvalues are clipped to a
+         threshold above zero (1e-13). The default is `False`, which uses the
+         clipping method, as the Higham algorithm can be slow for large datasets.
+
+     higham_max_iteration : int, default=100
+         Maximum number of iterations of the Higham (2002) algorithm.
+         The default value is `100`.
+
+     Attributes
+     ----------
+     covariance_ : ndarray of shape (n_assets, n_assets)
+         Estimated covariance.
+
+     covariance_estimator_ : BaseCovariance
+         Fitted `covariance_estimator`.
+
+     n_features_in_ : int
+         Number of assets seen during `fit`.
+
+     feature_names_in_ : ndarray of shape (`n_features_in_`,)
+         Names of assets seen during `fit`. Defined only when `X`
+         has asset names that are all strings.
+
+     References
+     ----------
+     .. [1] "Machine Learning for Asset Managers".
+         Elements in Quantitative Finance.
+         López de Prado (2020).
+     """
+
+     covariance_estimator_: BaseCovariance
+
+     def __init__(
+         self,
+         covariance_estimator: BaseCovariance | None = None,
+         nearest: bool = False,
+         higham: bool = False,
+         higham_max_iteration: int = 100,
+     ):
+         super().__init__(
+             nearest=nearest,
+             higham=higham,
+             higham_max_iteration=higham_max_iteration,
+         )
+         self.covariance_estimator = covariance_estimator
+
+     def fit(self, X: npt.ArrayLike, y=None) -> "DenoiseCovariance":
+         """Fit the Covariance Denoising estimator.
+
+         Parameters
+         ----------
+         X : array-like of shape (n_observations, n_assets)
+             Price returns of the assets.
+
+         y : Ignored
+             Not used, present for API consistency by convention.
+
+         Returns
+         -------
+         self : DenoiseCovariance
+             Fitted estimator.
+         """
+         # Fit the underlying covariance estimator.
+         self.covariance_estimator_ = check_estimator(
+             self.covariance_estimator,
+             default=EmpiricalCovariance(),
+             check_type=BaseCovariance,
+         )
+         self.covariance_estimator_.fit(X)
+
+         # We validate and convert to numpy after all models have been fitted to
+         # keep the feature names information.
+         X = self._validate_data(X)
+         n_observations, n_assets = X.shape
+         q = n_observations / n_assets
+         corr, std = cov_to_corr(self.covariance_estimator_.covariance_)
+         e_val, e_vec = np.linalg.eigh(corr)
+         indices = e_val.argsort()[::-1]
+         e_val, e_vec = e_val[indices], e_vec[:, indices]
+
+         # Squared distance between the Marčenko-Pastur pdf and a kernel density
+         # estimate of the empirical eigenvalues, as a function of the noise
+         # variance.
+         def _marchenko(x_var):
+             e_min, e_max = (
+                 x_var * (1 - (1.0 / q) ** 0.5) ** 2,
+                 x_var * (1 + (1.0 / q) ** 0.5) ** 2,
+             )
+             e_val_lin = np.linspace(e_min, e_max, 1000)
+             pdf_0 = (
+                 q
+                 / (2 * np.pi * x_var * e_val_lin)
+                 * ((e_max - e_val_lin) * (e_val_lin - e_min)) ** 0.5
+             )
+             kde = skn.KernelDensity(kernel="gaussian", bandwidth=0.01).fit(
+                 e_val.reshape(-1, 1)
+             )
+             # noinspection PyUnresolvedReferences
+             pdf_1 = np.exp(kde.score_samples(pdf_0.reshape(-1, 1)))
+             return np.sum((pdf_1 - pdf_0) ** 2)
+
+         # noinspection PyTypeChecker
+         res = sco.minimize(_marchenko, x0=0.5, bounds=((1e-5, 1 - 1e-5),))
+
+         var = res["x"][0]
+         # Number of eigenvalues above the Marčenko-Pastur upper bound (signal).
+         n_facts = e_val.shape[0] - e_val[::-1].searchsorted(
+             var * (1 + (1.0 / q) ** 0.5) ** 2
+         )
+         e_val_ = e_val.copy()
+         e_val_[n_facts:] = e_val_[n_facts:].sum() / float(e_val_.shape[0] - n_facts)
+         corr = e_vec @ np.diag(e_val_) @ e_vec.T
+         corr, _ = cov_to_corr(corr)
+         covariance = corr_to_cov(corr, std)
+         self._set_covariance(covariance)
+         return self
+
+
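Since `covariance_estimator` accepts any `BaseCovariance`, denoising can wrap another estimator. A sketch (assumed import paths, as before):

    from skfolio.moments import DenoiseCovariance, GerberCovariance

    model = DenoiseCovariance(covariance_estimator=GerberCovariance())
    model.fit(X)  # X: (n_observations, n_assets) array of returns
    denoised = model.covariance_
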
+ class DetoneCovariance(BaseCovariance):
+     """Covariance Detoning estimator.
+
+     Financial covariance matrices usually incorporate a market component
+     corresponding to the first eigenvectors [1]_.
+     For some applications like clustering, removing the market component (loud tone)
+     allows a greater portion of the covariance to be explained by components that
+     affect specific subsets of the securities.
+
+     Parameters
+     ----------
+     covariance_estimator : BaseCovariance, optional
+         :ref:`Covariance estimator <covariance_estimator>` to estimate the covariance
+         matrix prior to detoning.
+         The default (`None`) is to use :class:`~skfolio.moments.EmpiricalCovariance`.
+
+     n_markets : int, default=1
+         Number of eigenvectors related to the market.
+         The default value is `1`.
+
+     nearest : bool, default=False
+         If this is set to True, the covariance is replaced by the nearest covariance
+         matrix that is positive definite and with a Cholesky decomposition that can
+         be computed. The variance is left unchanged. A covariance matrix is in
+         theory PSD; however, due to floating-point inaccuracies, we can end up with
+         a covariance matrix that is slightly non-PSD or whose Cholesky decomposition
+         fails. This often occurs in high-dimensional problems.
+         For more details, see :func:`~skfolio.utils.stats.cov_nearest`.
+         The default is `False`.
+
+     higham : bool, default=False
+         If this is set to True, the Higham (2002) algorithm is used to find the
+         nearest PSD covariance; otherwise, the eigenvalues are clipped to a
+         threshold above zero (1e-13). The default is `False`, which uses the
+         clipping method, as the Higham algorithm can be slow for large datasets.
+
+     higham_max_iteration : int, default=100
+         Maximum number of iterations of the Higham (2002) algorithm.
+         The default value is `100`.
+
+     Attributes
+     ----------
+     covariance_ : ndarray of shape (n_assets, n_assets)
+         Estimated covariance.
+
+     covariance_estimator_ : BaseCovariance
+         Fitted `covariance_estimator`.
+
+     n_features_in_ : int
+         Number of assets seen during `fit`.
+
+     feature_names_in_ : ndarray of shape (`n_features_in_`,)
+         Names of assets seen during `fit`. Defined only when `X`
+         has asset names that are all strings.
+
+     References
+     ----------
+     .. [1] "Machine Learning for Asset Managers".
+         Elements in Quantitative Finance.
+         López de Prado (2020).
+     """
+
+     covariance_estimator_: BaseCovariance
+
+     def __init__(
+         self,
+         covariance_estimator: BaseCovariance | None = None,
+         n_markets: int = 1,
+         nearest: bool = False,
+         higham: bool = False,
+         higham_max_iteration: int = 100,
+     ):
+         super().__init__(
+             nearest=nearest,
+             higham=higham,
+             higham_max_iteration=higham_max_iteration,
+         )
+         self.covariance_estimator = covariance_estimator
+         self.n_markets = n_markets
+
+     def fit(self, X: npt.ArrayLike, y=None) -> "DetoneCovariance":
+         """Fit the Covariance Detoning estimator.
+
+         Parameters
+         ----------
+         X : array-like of shape (n_observations, n_assets)
+             Price returns of the assets.
+
+         y : Ignored
+             Not used, present for API consistency by convention.
+
+         Returns
+         -------
+         self : DetoneCovariance
+             Fitted estimator.
+         """
+         # Fit the underlying covariance estimator.
+         self.covariance_estimator_ = check_estimator(
+             self.covariance_estimator,
+             default=EmpiricalCovariance(),
+             check_type=BaseCovariance,
+         )
+         self.covariance_estimator_.fit(X)
+
+         # We validate and convert to numpy after all models have been fitted to
+         # keep the feature names information.
+         _ = self._validate_data(X)
+         corr, std = cov_to_corr(self.covariance_estimator_.covariance_)
+         e_val, e_vec = np.linalg.eigh(corr)
+         indices = e_val.argsort()[::-1]
+         e_val, e_vec = e_val[indices], e_vec[:, indices]
+         # Market eigenvalues and eigenvectors.
+         market_e_val, market_e_vec = e_val[: self.n_markets], e_vec[:, : self.n_markets]
+         # Market correlation.
+         market_corr = market_e_vec @ np.diag(market_e_val) @ market_e_vec.T
+         # Removing the market correlation.
+         corr -= market_corr
+         corr, _ = cov_to_corr(corr)
+         covariance = corr_to_cov(corr, std)
+         self._set_covariance(covariance)
+         return self
+
+
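A usage sketch (assumed import path), stripping the first eigenvector before a clustering-oriented workflow:

    from skfolio.moments import DetoneCovariance  # assumed re-export path

    model = DetoneCovariance(n_markets=1)  # remove the first (market) eigenvector
    model.fit(X)
    detoned = model.covariance_  # better suited to clustering distances
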
+ class EWCovariance(BaseCovariance):
+     r"""Exponentially Weighted Covariance estimator.
+
+     Estimator of the covariance using the historical exponentially weighted returns.
+
+     Parameters
+     ----------
+     window_size : int, optional
+         Window size. The model is fitted on the last `window_size` observations.
+         The default (`None`) is to use all the data.
+
+     alpha : float, default=0.2
+         Exponential smoothing factor. The default value is `0.2`.
+
+         :math:`0 < \alpha \leq 1`.
+
+     nearest : bool, default=False
+         If this is set to True, the covariance is replaced by the nearest covariance
+         matrix that is positive definite and with a Cholesky decomposition that can
+         be computed. The variance is left unchanged. A covariance matrix is in
+         theory PSD; however, due to floating-point inaccuracies, we can end up with
+         a covariance matrix that is slightly non-PSD or whose Cholesky decomposition
+         fails. This often occurs in high-dimensional problems.
+         For more details, see :func:`~skfolio.utils.stats.cov_nearest`.
+         The default is `False`.
+
+     higham : bool, default=False
+         If this is set to True, the Higham (2002) algorithm is used to find the
+         nearest PSD covariance; otherwise, the eigenvalues are clipped to a
+         threshold above zero (1e-13). The default is `False`, which uses the
+         clipping method, as the Higham algorithm can be slow for large datasets.
+
+     higham_max_iteration : int, default=100
+         Maximum number of iterations of the Higham (2002) algorithm.
+         The default value is `100`.
+
+     Attributes
+     ----------
+     covariance_ : ndarray of shape (n_assets, n_assets)
+         Estimated covariance.
+
+     n_features_in_ : int
+         Number of assets seen during `fit`.
+
+     feature_names_in_ : ndarray of shape (`n_features_in_`,)
+         Names of features seen during `fit`. Defined only when `X`
+         has feature names that are all strings.
+     """
+
+     def __init__(
+         self,
+         window_size: int | None = None,
+         alpha: float = 0.2,
+         nearest: bool = False,
+         higham: bool = False,
+         higham_max_iteration: int = 100,
+     ):
+         super().__init__(
+             nearest=nearest,
+             higham=higham,
+             higham_max_iteration=higham_max_iteration,
+         )
+         self.window_size = window_size
+         self.alpha = alpha
+
+     def fit(self, X: npt.ArrayLike, y=None) -> "EWCovariance":
+         """Fit the Exponentially Weighted Covariance estimator.
+
+         Parameters
+         ----------
+         X : array-like of shape (n_observations, n_assets)
+             Price returns of the assets.
+
+         y : Ignored
+             Not used, present for API consistency by convention.
+
+         Returns
+         -------
+         self : EWCovariance
+             Fitted estimator.
+         """
+         X = self._validate_data(X)
+         if self.window_size is not None:
+             X = X[-self.window_size :]
+         n_observations = X.shape[0]
+         covariance = (
+             pd.DataFrame(X)
+             .ewm(alpha=self.alpha)
+             .cov()
+             .loc[(n_observations - 1, slice(None)), :]
+             .to_numpy()
+         )
+         self._set_covariance(covariance)
+         return self
+
+
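The `fit` above reduces to taking the last block of pandas' exponentially weighted covariance. The sketch below reproduces that computation directly with pandas on synthetic data:

    import numpy as np
    import pandas as pd

    alpha = 0.2
    X = np.random.default_rng(1).normal(size=(100, 3))
    n_observations = X.shape[0]
    ew_cov = (
        pd.DataFrame(X)
        .ewm(alpha=alpha)
        .cov()
        .loc[(n_observations - 1, slice(None)), :]
        .to_numpy()
    )  # same matrix as EWCovariance(alpha=0.2).fit(X).covariance_
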
+ class LedoitWolf(BaseCovariance, skc.LedoitWolf):
+     """Ledoit-Wolf estimator.
+
+     Ledoit-Wolf is a particular form of shrinkage, where the shrinkage
+     coefficient is computed using O. Ledoit and M. Wolf's formula as
+     described in [1]_.
+
+     Read more in `scikit-learn
+     <https://scikit-learn.org/stable/modules/generated/sklearn.covariance.LedoitWolf.html>`_.
+
+     Parameters
+     ----------
+     store_precision : bool, default=True
+         Specify if the estimated precision is stored.
+
+     assume_centered : bool, default=False
+         If True, data will not be centered before computation.
+         Useful when working with data whose mean is almost, but not exactly,
+         zero.
+         If False (default), data will be centered before computation.
+
+     block_size : int, default=1000
+         Size of blocks into which the covariance matrix will be split
+         during its Ledoit-Wolf estimation. This is purely a memory
+         optimization and does not affect results.
+
+     nearest : bool, default=False
+         If this is set to True, the covariance is replaced by the nearest covariance
+         matrix that is positive definite and with a Cholesky decomposition that can
+         be computed. The variance is left unchanged. A covariance matrix is in
+         theory PSD; however, due to floating-point inaccuracies, we can end up with
+         a covariance matrix that is slightly non-PSD or whose Cholesky decomposition
+         fails. This often occurs in high-dimensional problems.
+         For more details, see :func:`~skfolio.utils.stats.cov_nearest`.
+         The default is `False`.
+
+     higham : bool, default=False
+         If this is set to True, the Higham (2002) algorithm is used to find the
+         nearest PSD covariance; otherwise, the eigenvalues are clipped to a
+         threshold above zero (1e-13). The default is `False`, which uses the
+         clipping method, as the Higham algorithm can be slow for large datasets.
+
+     higham_max_iteration : int, default=100
+         Maximum number of iterations of the Higham (2002) algorithm.
+         The default value is `100`.
+
+     Attributes
+     ----------
+     covariance_ : ndarray of shape (n_assets, n_assets)
+         Estimated covariance.
+
+     location_ : ndarray of shape (n_assets,)
+         Estimated location, i.e. the estimated mean.
+
+     precision_ : ndarray of shape (n_assets, n_assets)
+         Estimated pseudo inverse matrix.
+         (stored only if store_precision is True)
+
+     shrinkage_ : float
+         Coefficient in the convex combination used for the computation
+         of the shrunk estimate. Range is [0, 1].
+
+     n_features_in_ : int
+         Number of assets seen during `fit`.
+
+     feature_names_in_ : ndarray of shape (`n_features_in_`,)
+         Names of features seen during `fit`. Defined only when `X`
+         has feature names that are all strings.
+
+     Notes
+     -----
+     The regularised covariance is:
+
+         (1 - shrinkage) * cov + shrinkage * mu * np.identity(n_features)
+
+     where mu = trace(cov) / n_features
+     and shrinkage is given by the Ledoit and Wolf formula (see References).
+
+     References
+     ----------
+     .. [1] "A Well-Conditioned Estimator for Large-Dimensional Covariance Matrices".
+         Ledoit and Wolf, Journal of Multivariate Analysis, Volume 88, Issue 2,
+         February 2004, pages 365-411.
+     """
+
+     def __init__(
+         self,
+         store_precision=True,
+         assume_centered=False,
+         block_size=1000,
+         nearest: bool = False,
+         higham: bool = False,
+         higham_max_iteration: int = 100,
+     ):
+         super().__init__(
+             nearest=nearest,
+             higham=higham,
+             higham_max_iteration=higham_max_iteration,
+         )
+         skc.LedoitWolf.__init__(
+             self,
+             store_precision=store_precision,
+             assume_centered=assume_centered,
+             block_size=block_size,
+         )
+
+     def fit(self, X: npt.ArrayLike, y=None) -> "LedoitWolf":
+         """Fit the Ledoit-Wolf shrunk covariance model to X.
+
+         Parameters
+         ----------
+         X : array-like of shape (n_observations, n_assets)
+             Price returns of the assets.
+
+         y : Ignored
+             Not used, present for API consistency by convention.
+
+         Returns
+         -------
+         self : LedoitWolf
+             Fitted estimator.
+         """
+         skc.LedoitWolf.fit(self, X)
+         self._set_covariance(self.covariance_)
+         return self
+
+
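Unlike `ShrunkCovariance` below, the shrinkage intensity here is estimated from the data. A sketch (assumed import path):

    from skfolio.moments import LedoitWolf  # assumed re-export path

    model = LedoitWolf().fit(X)
    print(model.shrinkage_)  # data-driven shrinkage intensity in [0, 1]
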
+ class OAS(BaseCovariance, skc.OAS):
+     """Oracle Approximating Shrinkage estimator as proposed in [1]_.
+
+     Read more in `scikit-learn
+     <https://scikit-learn.org/stable/modules/generated/sklearn.covariance.OAS.html>`_.
+
+     Parameters
+     ----------
+     store_precision : bool, default=True
+         Specify if the estimated precision is stored.
+
+     assume_centered : bool, default=False
+         If True, data will not be centered before computation.
+         Useful when working with data whose mean is almost, but not exactly,
+         zero.
+         If False (default), data will be centered before computation.
+
+     nearest : bool, default=False
+         If this is set to True, the covariance is replaced by the nearest covariance
+         matrix that is positive definite and with a Cholesky decomposition that can
+         be computed. The variance is left unchanged. A covariance matrix is in
+         theory PSD; however, due to floating-point inaccuracies, we can end up with
+         a covariance matrix that is slightly non-PSD or whose Cholesky decomposition
+         fails. This often occurs in high-dimensional problems.
+         For more details, see :func:`~skfolio.utils.stats.cov_nearest`.
+         The default is `False`.
+
+     higham : bool, default=False
+         If this is set to True, the Higham (2002) algorithm is used to find the
+         nearest PSD covariance; otherwise, the eigenvalues are clipped to a
+         threshold above zero (1e-13). The default is `False`, which uses the
+         clipping method, as the Higham algorithm can be slow for large datasets.
+
+     higham_max_iteration : int, default=100
+         Maximum number of iterations of the Higham (2002) algorithm.
+         The default value is `100`.
+
+     Attributes
+     ----------
+     covariance_ : ndarray of shape (n_assets, n_assets)
+         Estimated covariance.
+
+     location_ : ndarray of shape (n_assets,)
+         Estimated location, i.e. the estimated mean.
+
+     precision_ : ndarray of shape (n_assets, n_assets)
+         Estimated pseudo inverse matrix.
+         (stored only if store_precision is True)
+
+     shrinkage_ : float
+         Coefficient in the convex combination used for the computation
+         of the shrunk estimate. Range is [0, 1].
+
+     n_features_in_ : int
+         Number of assets seen during `fit`.
+
+     feature_names_in_ : ndarray of shape (`n_features_in_`,)
+         Names of features seen during `fit`. Defined only when `X`
+         has feature names that are all strings.
+
+     Notes
+     -----
+     The regularised covariance is:
+
+         (1 - shrinkage) * cov + shrinkage * mu * np.identity(n_features),
+
+     where mu = trace(cov) / n_features and shrinkage is given by the OAS formula
+     (see [1]_).
+
+     The shrinkage formulation implemented here differs from Eq. 23 in [1]_. In
+     the original article, formula (23) states that 2/p (p being the number of
+     features) is multiplied by Trace(cov*cov) in both the numerator and
+     denominator, but this operation is omitted because for a large p, the value
+     of 2/p is so small that it doesn't affect the value of the estimator.
+
+     References
+     ----------
+     .. [1] "Shrinkage algorithms for MMSE covariance estimation".
+         Chen, Y., Wiesel, A., Eldar, Y. C., & Hero, A. O.
+         IEEE Transactions on Signal Processing, 58(10), 5016-5029, 2010.
+     """
+
+     def __init__(
+         self,
+         store_precision=True,
+         assume_centered=False,
+         nearest: bool = False,
+         higham: bool = False,
+         higham_max_iteration: int = 100,
+     ):
+         super().__init__(
+             nearest=nearest,
+             higham=higham,
+             higham_max_iteration=higham_max_iteration,
+         )
+         skc.OAS.__init__(
+             self,
+             store_precision=store_precision,
+             assume_centered=assume_centered,
+         )
+
+     def fit(self, X: npt.ArrayLike, y=None) -> "OAS":
+         """Fit the Oracle Approximating Shrinkage covariance model to X.
+
+         Parameters
+         ----------
+         X : array-like of shape (n_observations, n_assets)
+             Price returns of the assets.
+
+         y : Ignored
+             Not used, present for API consistency by convention.
+
+         Returns
+         -------
+         self : OAS
+             Fitted estimator.
+         """
+         skc.OAS.fit(self, X)
+         self._set_covariance(self.covariance_)
+         return self
+
+
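OAS usage mirrors `LedoitWolf`; only the shrinkage formula differs. A sketch (assumed import path):

    from skfolio.moments import OAS  # assumed re-export path

    model = OAS().fit(X)
    print(model.shrinkage_)  # shrinkage intensity given by the OAS formula
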
+ class ShrunkCovariance(BaseCovariance, skc.ShrunkCovariance):
+     """Covariance estimator with shrinkage.
+
+     Read more in `scikit-learn
+     <https://scikit-learn.org/stable/modules/generated/sklearn.covariance.ShrunkCovariance.html>`_.
+
+     Parameters
+     ----------
+     store_precision : bool, default=True
+         Specify if the estimated precision is stored.
+
+     assume_centered : bool, default=False
+         If True, data will not be centered before computation.
+         Useful when working with data whose mean is almost, but not exactly,
+         zero.
+         If False (default), data will be centered before computation.
+
+     shrinkage : float, default=0.1
+         Coefficient in the convex combination used for the computation
+         of the shrunk estimate. Range is [0, 1].
+
+     nearest : bool, default=False
+         If this is set to True, the covariance is replaced by the nearest covariance
+         matrix that is positive definite and with a Cholesky decomposition that can
+         be computed. The variance is left unchanged. A covariance matrix is in
+         theory PSD; however, due to floating-point inaccuracies, we can end up with
+         a covariance matrix that is slightly non-PSD or whose Cholesky decomposition
+         fails. This often occurs in high-dimensional problems.
+         For more details, see :func:`~skfolio.utils.stats.cov_nearest`.
+         The default is `False`.
+
+     higham : bool, default=False
+         If this is set to True, the Higham (2002) algorithm is used to find the
+         nearest PSD covariance; otherwise, the eigenvalues are clipped to a
+         threshold above zero (1e-13). The default is `False`, which uses the
+         clipping method, as the Higham algorithm can be slow for large datasets.
+
+     higham_max_iteration : int, default=100
+         Maximum number of iterations of the Higham (2002) algorithm.
+         The default value is `100`.
+
+     Attributes
+     ----------
+     covariance_ : ndarray of shape (n_assets, n_assets)
+         Estimated covariance.
+
+     location_ : ndarray of shape (n_assets,)
+         Estimated location, i.e. the estimated mean.
+
+     precision_ : ndarray of shape (n_assets, n_assets)
+         Estimated pseudo inverse matrix.
+         (stored only if store_precision is True)
+
+     n_features_in_ : int
+         Number of assets seen during `fit`.
+
+     feature_names_in_ : ndarray of shape (`n_features_in_`,)
+         Names of features seen during `fit`. Defined only when `X`
+         has feature names that are all strings.
+
+     Notes
+     -----
+     The regularized covariance is given by:
+
+         (1 - shrinkage) * cov + shrinkage * mu * np.identity(n_features)
+
+     where mu = trace(cov) / n_features
+     """
+
+     def __init__(
+         self,
+         store_precision=True,
+         assume_centered=False,
+         shrinkage=0.1,
+         nearest: bool = False,
+         higham: bool = False,
+         higham_max_iteration: int = 100,
+     ):
+         super().__init__(
+             nearest=nearest,
+             higham=higham,
+             higham_max_iteration=higham_max_iteration,
+         )
+         skc.ShrunkCovariance.__init__(
+             self,
+             store_precision=store_precision,
+             assume_centered=assume_centered,
+             shrinkage=shrinkage,
+         )
+
+     def fit(self, X: npt.ArrayLike, y=None) -> "ShrunkCovariance":
+         """Fit the shrunk covariance model to X.
+
+         Parameters
+         ----------
+         X : array-like of shape (n_observations, n_assets)
+             Price returns of the assets.
+
+         y : Ignored
+             Not used, present for API consistency by convention.
+
+         Returns
+         -------
+         self : ShrunkCovariance
+             Fitted estimator.
+         """
+         skc.ShrunkCovariance.fit(self, X)
+         self._set_covariance(self.covariance_)
+         return self
+
+
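The formula in the Notes can be checked directly. A sketch; note that scikit-learn computes the base covariance with a 1/n normalization, hence `ddof=0` below:

    import numpy as np

    shrinkage = 0.1
    cov = np.cov(X.T, ddof=0)  # empirical covariance, 1/n normalization
    mu = np.trace(cov) / cov.shape[0]
    shrunk = (1 - shrinkage) * cov + shrinkage * mu * np.identity(cov.shape[0])
    # matches ShrunkCovariance(shrinkage=0.1).fit(X).covariance_
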
+ class GraphicalLassoCV(BaseCovariance, skc.GraphicalLassoCV):
+     """Sparse inverse covariance with cross-validated choice of the l1 penalty.
+
+     Read more in `scikit-learn
+     <https://scikit-learn.org/stable/modules/generated/sklearn.covariance.GraphicalLassoCV.html>`_.
+
+     Parameters
+     ----------
+     alphas : int or array-like of shape (n_alphas,), dtype=float, default=4
+         If an integer is given, it fixes the number of points on the
+         grids of alpha to be used. If a list is given, it gives the
+         grid to be used. See the notes in the class docstring for
+         more details. Range is [1, inf) for an integer.
+         Range is (0, inf] for an array-like of floats.
+
+     n_refinements : int, default=4
+         The number of times the grid is refined. Not used if explicit
+         values of alphas are passed. Range is [1, inf).
+
+     cv : int, cross-validation generator or iterable, default=None
+         Determines the cross-validation splitting strategy.
+         Possible inputs for cv are:
+
+         - None, to use the default 5-fold cross-validation,
+         - integer, to specify the number of folds,
+         - `CV splitter`,
+         - an iterable yielding (train, test) splits as arrays of indices.
+
+         For integer/None inputs, :class:`KFold` is used.
+
+     tol : float, default=1e-4
+         The tolerance to declare convergence: if the dual gap goes below
+         this value, iterations are stopped. Range is (0, inf].
+
+     enet_tol : float, default=1e-4
+         The tolerance for the elastic net solver used to calculate the descent
+         direction. This parameter controls the accuracy of the search direction
+         for a given column update, not of the overall parameter estimate. Only
+         used for mode='cd'. Range is (0, inf].
+
+     max_iter : int, default=100
+         Maximum number of iterations.
+
+     mode : {'cd', 'lars'}, default='cd'
+         The Lasso solver to use: coordinate descent or LARS. Use LARS for
+         very sparse underlying graphs, where the number of features is greater
+         than the number of samples. Elsewhere prefer cd, which is more
+         numerically stable.
+
+     n_jobs : int, default=None
+         Number of jobs to run in parallel.
+         `None` means 1 unless in a :obj:`joblib.parallel_backend` context.
+         `-1` means using all processors.
+
+     verbose : bool, default=False
+         If verbose is True, the objective function and duality gap are
+         printed at each iteration.
+
+     assume_centered : bool, default=False
+         If True, data are not centered before computation.
+         Useful when working with data whose mean is almost, but not exactly,
+         zero.
+         If False, data are centered before computation.
+
+     nearest : bool, default=False
+         If this is set to True, the covariance is replaced by the nearest covariance
+         matrix that is positive definite and with a Cholesky decomposition that can
+         be computed. The variance is left unchanged. A covariance matrix is in
+         theory PSD; however, due to floating-point inaccuracies, we can end up with
+         a covariance matrix that is slightly non-PSD or whose Cholesky decomposition
+         fails. This often occurs in high-dimensional problems.
+         For more details, see :func:`~skfolio.utils.stats.cov_nearest`.
+         The default is `False`.
+
+     higham : bool, default=False
+         If this is set to True, the Higham (2002) algorithm is used to find the
+         nearest PSD covariance; otherwise, the eigenvalues are clipped to a
+         threshold above zero (1e-13). The default is `False`, which uses the
+         clipping method, as the Higham algorithm can be slow for large datasets.
+
+     higham_max_iteration : int, default=100
+         Maximum number of iterations of the Higham (2002) algorithm.
+         The default value is `100`.
+
+     Attributes
+     ----------
+     covariance_ : ndarray of shape (n_assets, n_assets)
+         Estimated covariance.
+
+     location_ : ndarray of shape (n_assets,)
+         Estimated location, i.e. the estimated mean.
+
+     precision_ : ndarray of shape (n_assets, n_assets)
+         Estimated pseudo inverse matrix.
+         (stored only if store_precision is True)
+
+     alpha_ : float
+         Penalization parameter selected.
+
+     cv_results_ : dict of ndarrays
+         A dict with keys:
+
+         alphas : ndarray of shape (n_alphas,)
+             All penalization parameters explored.
+
+         split(k)_test_score : ndarray of shape (n_alphas,)
+             Log-likelihood score on left-out data across the (k)th fold.
+
+             .. versionadded:: 1.0
+
+         mean_test_score : ndarray of shape (n_alphas,)
+             Mean of scores over the folds.
+
+             .. versionadded:: 1.0
+
+         std_test_score : ndarray of shape (n_alphas,)
+             Standard deviation of scores over the folds.
+
+             .. versionadded:: 1.0
+
+     n_iter_ : int
+         Number of iterations run for the optimal alpha.
+
+     n_features_in_ : int
+         Number of assets seen during `fit`.
+
+     feature_names_in_ : ndarray of shape (`n_features_in_`,)
+         Names of features seen during `fit`. Defined only when `X`
+         has feature names that are all strings.
+
+     Notes
+     -----
+     The search for the optimal penalization parameter (`alpha`) is done on an
+     iteratively refined grid: first the cross-validated scores on a grid are
+     computed, then a new refined grid is centered around the maximum, and so
+     on.
+
+     One of the challenges faced here is that the solvers can
+     fail to converge to a well-conditioned estimate. The corresponding
+     values of `alpha` then come out as missing values, but the optimum may
+     be close to these missing values.
+
+     In `fit`, once the best parameter `alpha` is found through
+     cross-validation, the model is fitted again using the entire training set.
+     """
+
+     def __init__(
+         self,
+         alphas=4,
+         n_refinements=4,
+         cv=None,
+         tol=1e-4,
+         enet_tol=1e-4,
+         max_iter=100,
+         mode="cd",
+         n_jobs=None,
+         verbose=False,
+         assume_centered=False,
+         nearest: bool = False,
+         higham: bool = False,
+         higham_max_iteration: int = 100,
+     ):
+         super().__init__(
+             nearest=nearest,
+             higham=higham,
+             higham_max_iteration=higham_max_iteration,
+         )
+         skc.GraphicalLassoCV.__init__(
+             self,
+             alphas=alphas,
+             n_refinements=n_refinements,
+             cv=cv,
+             tol=tol,
+             enet_tol=enet_tol,
+             max_iter=max_iter,
+             mode=mode,
+             n_jobs=n_jobs,
+             verbose=verbose,
+             assume_centered=assume_centered,
+         )
+
+     def fit(self, X: npt.ArrayLike, y=None) -> "GraphicalLassoCV":
+         """Fit the GraphicalLasso covariance model to X.
+
+         Parameters
+         ----------
+         X : array-like of shape (n_observations, n_assets)
+             Price returns of the assets.
+
+         y : Ignored
+             Not used, present for API consistency by convention.
+
+         Returns
+         -------
+         self : GraphicalLassoCV
+             Fitted estimator.
+         """
+         skc.GraphicalLassoCV.fit(self, X)
+         self._set_covariance(self.covariance_)
+         return self
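
A closing usage sketch (assumed import path): cross-validation selects the l1 penalty, and the fitted precision matrix is sparse.

    from skfolio.moments import GraphicalLassoCV  # assumed re-export path

    model = GraphicalLassoCV(cv=5).fit(X)
    print(model.alpha_)                  # selected l1 penalty
    sparse_precision = model.precision_  # sparse inverse covariance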