skfolio 0.2.2__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52) hide show
  1. skfolio/datasets/__init__.py +2 -0
  2. skfolio/datasets/_base.py +51 -0
  3. skfolio/distance/_distance.py +15 -4
  4. skfolio/model_selection/_combinatorial.py +2 -2
  5. skfolio/model_selection/_validation.py +70 -15
  6. skfolio/model_selection/_walk_forward.py +3 -3
  7. skfolio/moments/__init__.py +2 -0
  8. skfolio/moments/covariance/__init__.py +11 -11
  9. skfolio/moments/covariance/_base.py +10 -9
  10. skfolio/moments/covariance/_denoise_covariance.py +181 -0
  11. skfolio/moments/covariance/_detone_covariance.py +158 -0
  12. skfolio/moments/covariance/_empirical_covariance.py +100 -0
  13. skfolio/moments/covariance/_ew_covariance.py +109 -0
  14. skfolio/moments/covariance/_gerber_covariance.py +157 -0
  15. skfolio/moments/covariance/_graphical_lasso_cv.py +194 -0
  16. skfolio/moments/covariance/_implied_covariance.py +454 -0
  17. skfolio/moments/covariance/_ledoit_wolf.py +140 -0
  18. skfolio/moments/covariance/_oas.py +115 -0
  19. skfolio/moments/covariance/_shrunk_covariance.py +104 -0
  20. skfolio/moments/expected_returns/__init__.py +4 -7
  21. skfolio/moments/expected_returns/_empirical_mu.py +63 -0
  22. skfolio/moments/expected_returns/_equilibrium_mu.py +124 -0
  23. skfolio/moments/expected_returns/_ew_mu.py +69 -0
  24. skfolio/moments/expected_returns/{_expected_returns.py → _shrunk_mu.py} +22 -200
  25. skfolio/optimization/cluster/_nco.py +46 -8
  26. skfolio/optimization/cluster/hierarchical/_base.py +21 -1
  27. skfolio/optimization/cluster/hierarchical/_herc.py +18 -4
  28. skfolio/optimization/cluster/hierarchical/_hrp.py +13 -4
  29. skfolio/optimization/convex/_base.py +10 -1
  30. skfolio/optimization/convex/_distributionally_robust.py +12 -2
  31. skfolio/optimization/convex/_maximum_diversification.py +9 -2
  32. skfolio/optimization/convex/_mean_risk.py +33 -6
  33. skfolio/optimization/convex/_risk_budgeting.py +5 -2
  34. skfolio/optimization/ensemble/_stacking.py +32 -9
  35. skfolio/optimization/naive/_naive.py +20 -2
  36. skfolio/population/_population.py +2 -0
  37. skfolio/prior/_base.py +1 -1
  38. skfolio/prior/_black_litterman.py +20 -2
  39. skfolio/prior/_empirical.py +38 -5
  40. skfolio/prior/_factor_model.py +44 -7
  41. skfolio/uncertainty_set/_base.py +30 -9
  42. skfolio/uncertainty_set/_bootstrap.py +26 -10
  43. skfolio/uncertainty_set/_empirical.py +25 -10
  44. skfolio/utils/stats.py +24 -3
  45. skfolio/utils/tools.py +213 -79
  46. {skfolio-0.2.2.dist-info → skfolio-0.3.0.dist-info}/METADATA +4 -3
  47. skfolio-0.3.0.dist-info/RECORD +91 -0
  48. {skfolio-0.2.2.dist-info → skfolio-0.3.0.dist-info}/WHEEL +1 -1
  49. skfolio/moments/covariance/_covariance.py +0 -1114
  50. skfolio-0.2.2.dist-info/RECORD +0 -79
  51. {skfolio-0.2.2.dist-info → skfolio-0.3.0.dist-info}/LICENSE +0 -0
  52. {skfolio-0.2.2.dist-info → skfolio-0.3.0.dist-info}/top_level.txt +0 -0
@@ -1,1114 +0,0 @@
1
- """Covariance Estimators."""
2
-
3
- # Copyright (c) 2023
4
- # Author: Hugo Delatte <delatte.hugo@gmail.com>
5
- # License: BSD 3 clause
6
- # Implementation derived from:
7
- # scikit-learn, Copyright (c) 2007-2010 David Cournapeau, Fabian Pedregosa, Olivier
8
- # Grisel Licensed under BSD 3 clause.
9
-
10
- import numpy as np
11
- import numpy.typing as npt
12
- import pandas as pd
13
- import scipy.optimize as sco
14
- import sklearn.covariance as skc
15
- import sklearn.neighbors as skn
16
-
17
- from skfolio.moments.covariance._base import BaseCovariance
18
- from skfolio.utils.stats import corr_to_cov, cov_to_corr
19
- from skfolio.utils.tools import check_estimator
20
-
21
-
22
class EmpiricalCovariance(BaseCovariance):
    """Sample (empirical) covariance estimator.

    Parameters
    ----------
    window_size : int, optional
        Number of trailing observations used for the fit.
        With the default (`None`), all the data is used.

    ddof : int, default=1
        The estimate is normalized by `(n_observations - ddof)`.
        `ddof=1` gives the unbiased estimate and `ddof=0` the simple average.
        The default value is `1`.

    nearest : bool, default=False
        If True, the covariance is replaced by the nearest covariance matrix that
        is positive definite and for which a Cholesky decomposition can be
        computed. The variance is left unchanged. A covariance matrix is PSD in
        theory, but floating-point inaccuracies can produce a matrix that is
        slightly non-PSD or whose Cholesky decomposition fails, which often
        happens in high-dimensional problems.
        For more details, see :func:`~skfolio.utils.stats.cov_nearest`.
        The default is `False`.

    higham : bool, default=False
        If True, the Higham & Nick (2002) algorithm is used to find the nearest
        PSD covariance; otherwise the eigenvalues are clipped to a threshold
        just above zero (1e-13). The default is `False` (clipping) because the
        Higham & Nick algorithm can be slow for large datasets.

    higham_max_iteration : int, default=100
        Maximum number of iterations of the Higham & Nick (2002) algorithm.
        The default value is `100`.

    Attributes
    ----------
    covariance_ : ndarray of shape (n_assets, n_assets)
        Estimated covariance matrix.

    n_features_in_ : int
        Number of assets seen during `fit`.

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of assets seen during `fit`. Defined only when `X`
        has assets names that are all strings.
    """

    def __init__(
        self,
        window_size: int | None = None,
        ddof: int = 1,
        nearest: bool = False,
        higham: bool = False,
        higham_max_iteration: int = 100,
    ):
        # The nearest-PSD options are stored by the base class.
        super().__init__(
            nearest=nearest, higham=higham, higham_max_iteration=higham_max_iteration
        )
        self.window_size = window_size
        self.ddof = ddof

    def fit(self, X: npt.ArrayLike, y=None) -> "EmpiricalCovariance":
        """Estimate the sample covariance of the asset returns.

        Parameters
        ----------
        X : array-like of shape (n_observations, n_assets)
            Price returns of the assets.

        y : Ignored
            Not used, present for API consistency by convention.

        Returns
        -------
        self : EmpiricalCovariance
            Fitted estimator.
        """
        returns = self._validate_data(X)
        window = self.window_size
        if window is not None:
            # Only the most recent `window` observations take part in the estimate.
            returns = returns[-window:]
        # np.cov expects one variable per row, hence the transpose.
        self._set_covariance(np.cov(returns.T, ddof=self.ddof))
        return self
107
-
108
-
109
class GerberCovariance(BaseCovariance):
    """Gerber covariance estimator.

    Robust co-movement measure that ignores fluctuations below a certain
    threshold while simultaneously limiting the effect of extreme movements.
    The Gerber statistic extends Kendall's Tau by counting the proportion of
    simultaneous co-movements in series when their amplitudes exceed
    data-dependent thresholds.

    Three variants have been published:

        * Gerber et al. (2015): tends to produce matrices that are non-PSD.
        * Gerber et al. (2019): alteration of the denominator of the above statistic.
        * Gerber et al. (2022): final alteration to ensure a PSD matrix.

    The last two variants are implemented.

    Parameters
    ----------
    window_size : int, optional
        Number of trailing observations used for the fit.
        With the default (`None`), all the data is used.

    threshold : float, default=0.5
        Gerber threshold, strictly between 0 and 1. The default value is `0.5`.

    psd_variant : bool, default=True
        If True, the Gerber et al. (2022) variant is used to ensure a positive
        semi-definite matrix.
        Otherwise, the Gerber et al. (2019) variant is used.
        The default is `True`.

    nearest : bool, default=False
        If True, the covariance is replaced by the nearest covariance matrix that
        is positive definite and for which a Cholesky decomposition can be
        computed. The variance is left unchanged. A covariance matrix is PSD in
        theory, but floating-point inaccuracies can produce a matrix that is
        slightly non-PSD or whose Cholesky decomposition fails, which often
        happens in high-dimensional problems.
        For more details, see :func:`~skfolio.utils.stats.cov_nearest`.
        The default is `False`.

    higham : bool, default=False
        If True, the Higham & Nick (2002) algorithm is used to find the nearest
        PSD covariance; otherwise the eigenvalues are clipped to a threshold
        just above zero (1e-13). The default is `False` (clipping) because the
        Higham & Nick algorithm can be slow for large datasets.

    higham_max_iteration : int, default=100
        Maximum number of iterations of the Higham & Nick (2002) algorithm.
        The default value is `100`.

    Attributes
    ----------
    covariance_ : ndarray of shape (n_assets, n_assets)
        Estimated covariance.

    n_features_in_ : int
        Number of assets seen during `fit`.

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of assets seen during `fit`. Defined only when `X`
        has assets names that are all strings.

    References
    ----------
    .. [1] "The gerber statistic: A robust co-movement measure for portfolio
        optimization".
        The Journal of Portfolio Management.
        Gerber, S., B. Javid, H. Markowitz, P. Sargen, and D. Starer (2022).

    .. [2] "The gerber statistic: A robust measure of correlation".
        Gerber, S., B. Javid, H. Markowitz, P. Sargen, and D. Starer (2019).

    .. [3] "Enhancing multi-asset portfolio construction under modern portfolio theory
        with a robust co-movement measure".
        Social Science Research network Working Paper Series.
        Gerber, S., H. Markowitz, and P. Pujara (2015).

    .. [4] "Deconstructing the Gerber Statistic".
        Flint & Polakow, 2023.
    """

    def __init__(
        self,
        window_size: int | None = None,
        threshold: float = 0.5,
        psd_variant: bool = True,
        nearest: bool = False,
        higham: bool = False,
        higham_max_iteration: int = 100,
    ):
        # The nearest-PSD options are stored by the base class.
        super().__init__(
            nearest=nearest, higham=higham, higham_max_iteration=higham_max_iteration
        )
        self.window_size = window_size
        self.threshold = threshold
        self.psd_variant = psd_variant

    def fit(self, X: npt.ArrayLike, y=None) -> "GerberCovariance":
        """Estimate the Gerber covariance of the asset returns.

        Parameters
        ----------
        X : array-like of shape (n_observations, n_assets)
            Price returns of the assets.

        y : Ignored
            Not used, present for API consistency by convention.

        Returns
        -------
        self : GerberCovariance
            Fitted estimator.

        Raises
        ------
        ValueError
            If `threshold` is not strictly between 0 and 1.
        """
        returns = self._validate_data(X)
        if self.window_size is not None:
            returns = returns[-self.window_size :]
        if not 0 < self.threshold < 1:
            raise ValueError("The threshold must be between 0 and 1")

        n_observations = returns.shape[0]
        std = returns.std(axis=0)
        # Per-asset amplitude a return must reach to count as a move.
        limit = std * self.threshold
        is_up = returns >= limit
        is_down = returns <= -limit
        # An observation is neutral when it is neither an up nor a down move.
        neutral = np.invert(is_up | is_down).astype(int)
        up = is_up.astype(int)
        down = is_down.astype(int)

        # Pairwise counts: same-direction moves minus opposite-direction moves.
        same_direction = up.T @ up + down.T @ down
        opposite_direction = up.T @ down + down.T @ up
        h = same_direction - opposite_direction

        if self.psd_variant:
            corr = h / (n_observations - neutral.T @ neutral)
        else:
            scale = np.sqrt(np.diag(h))
            corr = h / np.outer(scale, scale)

        self._set_covariance(corr_to_cov(corr, std))
        return self
250
-
251
-
252
class DenoiseCovariance(BaseCovariance):
    """Covariance Denoising estimator.

    The goal of Covariance Denoising is to reduce the noise and enhance the
    signal of the empirical covariance matrix [1]_.
    It reduces the ill-conditioning of the traditional covariance estimate by
    differentiating the eigenvalues associated with noise from the eigenvalues
    associated with signal.
    Denoising replaces the eigenvalues of the eigenvectors classified as random
    by Marčenko-Pastur with a constant eigenvalue.

    Parameters
    ----------
    covariance_estimator : BaseCovariance, optional
        :ref:`Covariance estimator <covariance_estimator>` to estimate the
        covariance matrix that will be denoised.
        The default (`None`) is to use :class:`~skfolio.moments.EmpiricalCovariance`.

    nearest : bool, default=False
        If True, the covariance is replaced by the nearest covariance matrix that
        is positive definite and for which a Cholesky decomposition can be
        computed. The variance is left unchanged. A covariance matrix is PSD in
        theory, but floating-point inaccuracies can produce a matrix that is
        slightly non-PSD or whose Cholesky decomposition fails, which often
        happens in high-dimensional problems.
        For more details, see :func:`~skfolio.utils.stats.cov_nearest`.
        The default is `False`.

    higham : bool, default=False
        If True, the Higham & Nick (2002) algorithm is used to find the nearest
        PSD covariance; otherwise the eigenvalues are clipped to a threshold
        just above zero (1e-13). The default is `False` (clipping) because the
        Higham & Nick algorithm can be slow for large datasets.

    higham_max_iteration : int, default=100
        Maximum number of iterations of the Higham & Nick (2002) algorithm.
        The default value is `100`.

    Attributes
    ----------
    covariance_ : ndarray of shape (n_assets, n_assets)
        Estimated covariance.

    covariance_estimator_ : BaseCovariance
        Fitted `covariance_estimator`.

    n_features_in_ : int
        Number of assets seen during `fit`.

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of assets seen during `fit`. Defined only when `X`
        has assets names that are all strings.

    References
    ----------
    .. [1] "Machine Learning for Asset Managers".
        Elements in Quantitative Finance.
        Lòpez de Prado (2020).
    """

    covariance_estimator_: BaseCovariance

    def __init__(
        self,
        covariance_estimator: BaseCovariance | None = None,
        nearest: bool = False,
        higham: bool = False,
        higham_max_iteration: int = 100,
    ):
        super().__init__(
            nearest=nearest,
            higham=higham,
            higham_max_iteration=higham_max_iteration,
        )
        self.covariance_estimator = covariance_estimator

    def fit(self, X: npt.ArrayLike, y=None) -> "DenoiseCovariance":
        """Fit the Covariance Denoising estimator.

        The inner covariance estimator is fitted first. The variance of the
        Marčenko-Pastur distribution that best matches the density of the
        correlation eigenvalues is then found numerically, and every eigenvalue
        not above the corresponding upper bound is replaced by the average of
        those eigenvalues.

        Parameters
        ----------
        X : array-like of shape (n_observations, n_assets)
            Price returns of the assets.

        y : Ignored
            Not used, present for API consistency by convention.

        Returns
        -------
        self : DenoiseCovariance
            Fitted estimator.
        """
        # fitting estimators
        self.covariance_estimator_ = check_estimator(
            self.covariance_estimator,
            default=EmpiricalCovariance(),
            check_type=BaseCovariance,
        )
        self.covariance_estimator_.fit(X)

        # we validate and convert to numpy after all models have been fitted to keep
        # features names information.
        X = self._validate_data(X)
        n_observations, n_assets = X.shape
        q = n_observations / n_assets
        corr, std = cov_to_corr(self.covariance_estimator_.covariance_)
        e_val, e_vec = np.linalg.eigh(corr)
        # eigh returns ascending eigenvalues; work in descending order.
        indices = e_val.argsort()[::-1]
        e_val, e_vec = e_val[indices], e_vec[:, indices]

        # The kernel density of the observed eigenvalues does not depend on the
        # candidate variance, so it is fitted once instead of on every
        # evaluation of the objective.
        kde = skn.KernelDensity(kernel="gaussian", bandwidth=0.01).fit(
            e_val.reshape(-1, 1)
        )

        def _marchenko(x_var):
            # The optimizer passes a length-1 array; use a scalar so that the
            # grid and both densities stay one-dimensional (array endpoints
            # would make `np.linspace` return a (1000, 1) grid and the
            # difference below would broadcast to (1000, 1000)).
            x_var = float(np.asarray(x_var).ravel()[0])
            e_min, e_max = (
                x_var * (1 - (1.0 / q) ** 0.5) ** 2,
                x_var * (1 + (1.0 / q) ** 0.5) ** 2,
            )
            e_val_lin = np.linspace(e_min, e_max, 1000)
            # Theoretical Marčenko-Pastur density on the eigenvalue grid.
            pdf_0 = (
                q
                / (2 * np.pi * x_var * e_val_lin)
                * ((e_max - e_val_lin) * (e_val_lin - e_min)) ** 0.5
            )
            # Empirical density evaluated on the SAME eigenvalue grid, so that
            # both densities are compared point by point.
            # noinspection PyUnresolvedReferences
            pdf_1 = np.exp(kde.score_samples(e_val_lin.reshape(-1, 1)))
            return np.sum((pdf_1 - pdf_0) ** 2)

        # noinspection PyTypeChecker
        res = sco.minimize(_marchenko, x0=0.5, bounds=((1e-5, 1 - 1e-5),))

        var = res["x"][0]
        # Number of eigenvalues above the fitted Marčenko-Pastur upper bound.
        n_facts = e_val.shape[0] - e_val[::-1].searchsorted(
            var * (1 + (1.0 / q) ** 0.5) ** 2
        )
        e_val_ = e_val.copy()
        n_noise = e_val_.shape[0] - n_facts
        if n_noise > 0:
            # Replace the noise eigenvalues by their average. Skipped when there
            # are none, which would otherwise divide by zero.
            e_val_[n_facts:] = e_val_[n_facts:].sum() / float(n_noise)
        corr = e_vec @ np.diag(e_val_) @ e_vec.T
        # Rescale so that the rebuilt matrix is a correlation matrix again.
        corr, _ = cov_to_corr(corr)
        covariance = corr_to_cov(corr, std)
        self._set_covariance(covariance)
        return self
394
-
395
-
396
class DetoneCovariance(BaseCovariance):
    """Covariance Detoning estimator.

    Financial covariance matrices usually incorporate a market component
    corresponding to the first eigenvectors [1]_.
    For some applications like clustering, removing the market component (loud
    tone) allows a greater portion of the covariance to be explained by
    components that affect specific subsets of the securities.

    Parameters
    ----------
    covariance_estimator : BaseCovariance, optional
        :ref:`Covariance estimator <covariance_estimator>` to estimate the
        covariance matrix prior to detoning.
        The default (`None`) is to use :class:`~skfolio.moments.EmpiricalCovariance`.

    n_markets : int, default=1
        Number of eigenvectors related to the market.
        The default value is `1`.

    nearest : bool, default=False
        If True, the covariance is replaced by the nearest covariance matrix that
        is positive definite and for which a Cholesky decomposition can be
        computed. The variance is left unchanged. A covariance matrix is PSD in
        theory, but floating-point inaccuracies can produce a matrix that is
        slightly non-PSD or whose Cholesky decomposition fails, which often
        happens in high-dimensional problems.
        For more details, see :func:`~skfolio.utils.stats.cov_nearest`.
        The default is `False`.

    higham : bool, default=False
        If True, the Higham & Nick (2002) algorithm is used to find the nearest
        PSD covariance; otherwise the eigenvalues are clipped to a threshold
        just above zero (1e-13). The default is `False` (clipping) because the
        Higham & Nick algorithm can be slow for large datasets.

    higham_max_iteration : int, default=100
        Maximum number of iterations of the Higham & Nick (2002) algorithm.
        The default value is `100`.

    Attributes
    ----------
    covariance_ : ndarray of shape (n_assets, n_assets)
        Estimated covariance.

    covariance_estimator_ : BaseCovariance
        Fitted `covariance_estimator`.

    n_features_in_ : int
        Number of assets seen during `fit`.

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of assets seen during `fit`. Defined only when `X`
        has assets names that are all strings.

    References
    ----------
    .. [1] "Machine Learning for Asset Managers".
        Elements in Quantitative Finance.
        Lòpez de Prado (2020).
    """

    covariance_estimator_: BaseCovariance

    def __init__(
        self,
        covariance_estimator: BaseCovariance | None = None,
        # Annotated as int: the value is documented as a count and is used as a
        # slice bound in `fit`.
        n_markets: int = 1,
        nearest: bool = False,
        higham: bool = False,
        higham_max_iteration: int = 100,
    ):
        super().__init__(
            nearest=nearest,
            higham=higham,
            higham_max_iteration=higham_max_iteration,
        )
        self.covariance_estimator = covariance_estimator
        self.n_markets = n_markets

    def fit(self, X: npt.ArrayLike, y=None) -> "DetoneCovariance":
        """Fit the Covariance Detoning estimator.

        The inner covariance estimator is fitted first; the contribution of the
        `n_markets` largest eigenvalues is then subtracted from its correlation
        matrix, which is rescaled back to a covariance.

        Parameters
        ----------
        X : array-like of shape (n_observations, n_assets)
            Price returns of the assets.

        y : Ignored
            Not used, present for API consistency by convention.

        Returns
        -------
        self : DetoneCovariance
            Fitted estimator.
        """
        # fitting estimators
        self.covariance_estimator_ = check_estimator(
            self.covariance_estimator,
            default=EmpiricalCovariance(),
            check_type=BaseCovariance,
        )
        self.covariance_estimator_.fit(X)

        # we validate and convert to numpy after all models have been fitted to keep
        # features names information.
        # The converted array itself is not needed: only the inner estimator's
        # covariance is used below.
        _ = self._validate_data(X)
        corr, std = cov_to_corr(self.covariance_estimator_.covariance_)
        e_val, e_vec = np.linalg.eigh(corr)
        # eigh returns ascending eigenvalues; work in descending order.
        indices = e_val.argsort()[::-1]
        e_val, e_vec = e_val[indices], e_vec[:, indices]
        # market eigenvalues and eigenvectors
        market_e_val, market_e_vec = e_val[: self.n_markets], e_vec[:, : self.n_markets]
        # market correlation
        market_corr = market_e_vec @ np.diag(market_e_val) @ market_e_vec.T
        # Removing the market correlation
        corr -= market_corr
        # Rescale so that the remainder is a correlation matrix again.
        corr, _ = cov_to_corr(corr)
        covariance = corr_to_cov(corr, std)
        self._set_covariance(covariance)
        return self
517
-
518
-
519
class EWCovariance(BaseCovariance):
    r"""Exponentially Weighted Covariance estimator.

    Estimator of the covariance using the historical exponentially weighted returns.

    Parameters
    ----------
    window_size : int, optional
        Number of trailing observations used for the fit.
        With the default (`None`), all the data is used.

    alpha : float, default=0.2
        Exponential smoothing factor. The default value is `0.2`.

        :math:`0 < \alpha \leq 1`.

    nearest : bool, default=False
        If True, the covariance is replaced by the nearest covariance matrix that
        is positive definite and for which a Cholesky decomposition can be
        computed. The variance is left unchanged. A covariance matrix is PSD in
        theory, but floating-point inaccuracies can produce a matrix that is
        slightly non-PSD or whose Cholesky decomposition fails, which often
        happens in high-dimensional problems.
        For more details, see :func:`~skfolio.utils.stats.cov_nearest`.
        The default is `False`.

    higham : bool, default=False
        If True, the Higham & Nick (2002) algorithm is used to find the nearest
        PSD covariance; otherwise the eigenvalues are clipped to a threshold
        just above zero (1e-13). The default is `False` (clipping) because the
        Higham & Nick algorithm can be slow for large datasets.

    higham_max_iteration : int, default=100
        Maximum number of iterations of the Higham & Nick (2002) algorithm.
        The default value is `100`.

    Attributes
    ----------
    covariance_ : ndarray of shape (n_assets, n_assets)
        Estimated covariance.

    n_features_in_ : int
        Number of assets seen during `fit`.

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of assets seen during `fit`. Defined only when `X`
        has assets names that are all strings.
    """

    def __init__(
        self,
        window_size: int | None = None,
        alpha: float = 0.2,
        nearest: bool = False,
        higham: bool = False,
        higham_max_iteration: int = 100,
    ):
        super().__init__(
            nearest=nearest,
            higham=higham,
            higham_max_iteration=higham_max_iteration,
        )
        self.window_size = window_size
        self.alpha = alpha

    def fit(self, X: npt.ArrayLike, y=None) -> "EWCovariance":
        """Fit the Exponentially Weighted Covariance estimator.

        Parameters
        ----------
        X : array-like of shape (n_observations, n_assets)
            Price returns of the assets.

        y : Ignored
            Not used, present for API consistency by convention.

        Returns
        -------
        self : EWCovariance
            Fitted estimator.
        """
        X = self._validate_data(X)
        if self.window_size is not None:
            X = X[-self.window_size :]
        n_observations = X.shape[0]
        # `ewm(...).cov()` yields one covariance matrix per observation, indexed
        # by (observation, asset); only the matrix of the last observation is kept.
        covariance = (
            pd.DataFrame(X)
            .ewm(alpha=self.alpha)
            .cov()
            .loc[(n_observations - 1, slice(None)), :]
            .to_numpy()
        )
        self._set_covariance(covariance)
        return self
613
-
614
-
615
class LedoitWolf(BaseCovariance, skc.LedoitWolf):
    """LedoitWolf Estimator.

    Ledoit-Wolf is a particular form of shrinkage, where the shrinkage
    coefficient is computed using O. Ledoit and M. Wolf's formula as
    described in [1]_.

    Read more in `scikit-learn
    <https://scikit-learn.org/stable/modules/generated/sklearn.covariance.LedoitWolf.html>`_.

    Parameters
    ----------
    store_precision : bool, default=True
        Specify if the estimated precision is stored.

    assume_centered : bool, default=False
        If True, data will not be centered before computation.
        Useful when working with data whose mean is almost, but not exactly
        zero.
        If False (default), data will be centered before computation.

    block_size : int, default=1000
        Size of blocks into which the covariance matrix will be split
        during its Ledoit-Wolf estimation. This is purely a memory
        optimization and does not affect results.

    nearest : bool, default=False
        If this is set to True, the covariance is replaced by the nearest covariance
        matrix that is positive definite and with a Cholesky decomposition that can be
        computed. The variance is left unchanged. A covariance matrix is in theory PSD.
        However, due to floating-point inaccuracies, we can end up with a covariance
        matrix that is slightly non-PSD or where Cholesky decomposition is failing.
        This often occurs in high dimensional problems.
        For more details, see :func:`~skfolio.utils.stats.cov_nearest`.
        The default is `False`.

    higham : bool, default=False
        If this is set to True, the Higham & Nick (2002) algorithm is used to find the
        nearest PSD covariance, otherwise the eigenvalues are clipped to a threshold
        above zero (1e-13). The default is `False` and uses the clipping method as the
        Higham & Nick algorithm can be slow for large datasets.

    higham_max_iteration : int, default=100
        Maximum number of iterations of the Higham & Nick (2002) algorithm.
        The default value is `100`.

    Attributes
    ----------
    covariance_ : ndarray of shape (n_assets, n_assets)
        Estimated covariance.

    location_ : ndarray of shape (n_assets,)
        Estimated location, i.e. the estimated mean.

    precision_ : ndarray of shape (n_assets, n_assets)
        Estimated pseudo inverse matrix.
        (stored only if store_precision is True)

    shrinkage_ : float
        Coefficient in the convex combination used for the computation
        of the shrunk estimate. Range is [0, 1].

    n_features_in_ : int
        Number of assets seen during `fit`.

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during `fit`. Defined only when `X`
        has feature names that are all strings.

    Notes
    -----
    The regularised covariance is:

    (1 - shrinkage) * cov + shrinkage * mu * np.identity(n_features)

    where mu = trace(cov) / n_features
    and shrinkage is given by the Ledoit and Wolf formula (see References)

    References
    ----------
    .. [1] "A Well-Conditioned Estimator for Large-Dimensional Covariance Matrices".
        Ledoit and Wolf, Journal of Multivariate Analysis, Volume 88, Issue 2.
        February 2004, pages 365-411.
    """

    def __init__(
        self,
        store_precision: bool = True,
        assume_centered: bool = False,
        block_size: int = 1000,
        nearest: bool = False,
        higham: bool = False,
        higham_max_iteration: int = 100,
    ):
        # Both parents are initialised explicitly: the skfolio base receives the
        # nearest-PSD options, the scikit-learn estimator its own parameters.
        super().__init__(
            nearest=nearest,
            higham=higham,
            higham_max_iteration=higham_max_iteration,
        )
        skc.LedoitWolf.__init__(
            self,
            store_precision=store_precision,
            assume_centered=assume_centered,
            block_size=block_size,
        )

    def fit(self, X: npt.ArrayLike, y=None) -> "LedoitWolf":
        """Fit the Ledoit-Wolf shrunk covariance model to X.

        Parameters
        ----------
        X : array-like of shape (n_observations, n_assets)
            Price returns of the assets.

        y : Ignored
            Not used, present for API consistency by convention.

        Returns
        -------
        self : LedoitWolf
            Fitted estimator.
        """
        # The estimation itself is delegated to scikit-learn's implementation.
        skc.LedoitWolf.fit(self, X)
        # Pass the scikit-learn estimate through the base-class setter.
        # NOTE(review): presumably where the `nearest` post-processing is
        # applied; `_set_covariance` is defined in BaseCovariance - confirm.
        self._set_covariance(self.covariance_)
        return self
740
-
741
-
742
class OAS(BaseCovariance, skc.OAS):
    """Oracle Approximating Shrinkage Estimator as proposed in [1]_.

    Read more in `scikit-learn
    <https://scikit-learn.org/stable/modules/generated/sklearn.covariance.OAS.html>`_.

    Parameters
    ----------
    store_precision : bool, default=True
        Specify if the estimated precision is stored.

    assume_centered : bool, default=False
        If True, data will not be centered before computation.
        Useful when working with data whose mean is almost, but not exactly
        zero.
        If False (default), data will be centered before computation.

    nearest : bool, default=False
        If this is set to True, the covariance is replaced by the nearest covariance
        matrix that is positive definite and with a Cholesky decomposition that can be
        computed. The variance is left unchanged. A covariance matrix is in theory PSD.
        However, due to floating-point inaccuracies, we can end up with a covariance
        matrix that is slightly non-PSD or where Cholesky decomposition is failing.
        This often occurs in high dimensional problems.
        For more details, see :func:`~skfolio.utils.stats.cov_nearest`.
        The default is `False`.

    higham : bool, default=False
        If this is set to True, the Higham & Nick (2002) algorithm is used to find the
        nearest PSD covariance, otherwise the eigenvalues are clipped to a threshold
        above zero (1e-13). The default is `False` and uses the clipping method as the
        Higham & Nick algorithm can be slow for large datasets.

    higham_max_iteration : int, default=100
        Maximum number of iterations of the Higham & Nick (2002) algorithm.
        The default value is `100`.

    Attributes
    ----------
    covariance_ : ndarray of shape (n_assets, n_assets)
        Estimated covariance.

    location_ : ndarray of shape (n_assets,)
        Estimated location, i.e. the estimated mean.

    precision_ : ndarray of shape (n_assets, n_assets)
        Estimated pseudo inverse matrix.
        (stored only if store_precision is True)

    shrinkage_ : float
        Coefficient in the convex combination used for the computation
        of the shrunk estimate. Range is [0, 1].

    n_features_in_ : int
        Number of assets seen during `fit`.

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during `fit`. Defined only when `X`
        has feature names that are all strings.

    Notes
    -----
    The regularised covariance is:

    (1 - shrinkage) * cov + shrinkage * mu * np.identity(n_features),

    where mu = trace(cov) / n_features and shrinkage is given by the OAS formula
    (see [1]_).

    The shrinkage formulation implemented here differs from Eq. 23 in [1]_. In
    the original article, formula (23) states that 2/p (p being the number of
    features) is multiplied by Trace(cov*cov) in both the numerator and
    denominator, but this operation is omitted because for a large p, the value
    of 2/p is so small that it doesn't affect the value of the estimator.

    References
    ----------
    .. [1] "Shrinkage algorithms for MMSE covariance estimation".
        Chen, Y., Wiesel, A., Eldar, Y. C., & Hero, A. O.
        IEEE Transactions on Signal Processing, 58(10), 5016-5029, 2010.
    """

    def __init__(
        self,
        store_precision: bool = True,
        assume_centered: bool = False,
        nearest: bool = False,
        higham: bool = False,
        higham_max_iteration: int = 100,
    ):
        # Both parents are initialised explicitly: the skfolio base receives the
        # nearest-PSD options, the scikit-learn estimator its own parameters.
        super().__init__(
            nearest=nearest,
            higham=higham,
            higham_max_iteration=higham_max_iteration,
        )
        skc.OAS.__init__(
            self,
            store_precision=store_precision,
            assume_centered=assume_centered,
        )

    def fit(self, X: npt.ArrayLike, y=None) -> "OAS":
        """Fit the Oracle Approximating Shrinkage covariance model to X.

        Parameters
        ----------
        X : array-like of shape (n_observations, n_assets)
            Price returns of the assets.

        y : Ignored
            Not used, present for API consistency by convention.

        Returns
        -------
        self : OAS
            Fitted estimator.
        """
        # The estimation itself is delegated to scikit-learn's implementation.
        skc.OAS.fit(self, X)
        # Pass the scikit-learn estimate through the base-class setter.
        # NOTE(review): presumably where the `nearest` post-processing is
        # applied; `_set_covariance` is defined in BaseCovariance - confirm.
        self._set_covariance(self.covariance_)
        return self
842
-
843
-
844
class ShrunkCovariance(BaseCovariance, skc.ShrunkCovariance):
    """Covariance estimator using a fixed shrinkage coefficient.

    Read more in `scikit-learn
    <https://scikit-learn.org/stable/modules/generated/sklearn.covariance.ShrunkCovariance.html>`_.

    Parameters
    ----------
    store_precision : bool, default=True
        Whether the estimated precision is stored.

    assume_centered : bool, default=False
        When True, the data is not centered before computation, which is
        useful for data whose mean is almost, but not exactly, zero.
        When False (default), the data is centered before computation.

    shrinkage : float, default=0.1
        Coefficient of the convex combination used to compute the shrunk
        estimate. Range is [0, 1].

    Attributes
    ----------
    covariance_ : ndarray of shape (n_assets, n_assets)
        Estimated covariance.

    location_ : ndarray of shape (n_assets,)
        Estimated location, i.e. the estimated mean.

    precision_ : ndarray of shape (n_assets, n_assets)
        Estimated pseudo inverse matrix.
        (stored only if store_precision is True)

    n_features_in_ : int
        Number of assets seen during `fit`.

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during `fit`. Defined only when `X`
        has feature names that are all strings.

    Notes
    -----
    The regularized covariance is given by:

    (1 - shrinkage) * cov + shrinkage * mu * np.identity(n_features)

    where mu = trace(cov) / n_features
    """

    def __init__(
        self,
        store_precision=True,
        assume_centered=False,
        shrinkage=0.1,
        nearest: bool = False,
        higham: bool = False,
        higham_max_iteration: int = 100,
    ):
        # Options owned by the first parent in the MRO.
        base_options = {
            "nearest": nearest,
            "higham": higham,
            "higham_max_iteration": higham_max_iteration,
        }
        super().__init__(**base_options)

        # Options owned by the scikit-learn estimator.
        sklearn_options = {
            "store_precision": store_precision,
            "assume_centered": assume_centered,
            "shrinkage": shrinkage,
        }
        skc.ShrunkCovariance.__init__(self, **sklearn_options)

    def fit(self, X: npt.ArrayLike, y=None) -> "ShrunkCovariance":
        """Estimate the shrunk covariance from X.

        Parameters
        ----------
        X : array-like of shape (n_observations, n_assets)
            Price returns of the assets.

        y : Ignored
            Not used, present for API consistency by convention.

        Returns
        -------
        self : ShrunkCovariance
            Fitted estimator.
        """
        skc.ShrunkCovariance.fit(self, X)
        # Pass the scikit-learn estimate to the inherited `_set_covariance`.
        shrunk_estimate = self.covariance_
        self._set_covariance(shrunk_estimate)
        return self
933
-
934
-
935
class GraphicalLassoCV(BaseCovariance, skc.GraphicalLassoCV):
    """Sparse inverse covariance with cross-validated choice of the l1 penalty.

    Read more in `scikit-learn
    <https://scikit-learn.org/stable/auto_examples/covariance/plot_sparse_cov.html>`_.

    Parameters
    ----------
    alphas : int or array-like of shape (n_alphas,), dtype=float, default=4
        If an integer is given, it fixes the number of points on the
        grids of alpha to be used. If a list is given, it gives the
        grid to be used. See the notes in the class docstring for
        more details. Range is [1, inf) for an integer.
        Range is (0, inf] for an array-like of floats.

    n_refinements : int, default=4
        The number of times the grid is refined. Not used if explicit
        values of alphas are passed. Range is [1, inf).

    cv : int, cross-validation generator or iterable, default=None
        Determines the cross-validation splitting strategy.
        Possible inputs for cv are:

        - None, to use the default 5-fold cross-validation,
        - integer, to specify the number of folds.
        - `CV splitter`,
        - An iterable yielding (train, test) splits as arrays of indices.

        For integer/None inputs :class:`KFold` is used.

    tol : float, default=1e-4
        The tolerance to declare convergence: if the dual gap goes below
        this value, iterations are stopped. Range is (0, inf].

    enet_tol : float, default=1e-4
        The tolerance for the elastic net solver used to calculate the descent
        direction. This parameter controls the accuracy of the search direction
        for a given column update, not of the overall parameter estimate. Only
        used for mode='cd'. Range is (0, inf].

    max_iter : int, default=100
        Maximum number of iterations.

    mode : {'cd', 'lars'}, default='cd'
        The Lasso solver to use: coordinate descent or LARS. Use LARS for
        very sparse underlying graphs, where number of features is greater
        than number of samples. Elsewhere prefer cd which is more numerically
        stable.

    n_jobs : int, default=None
        Number of jobs to run in parallel.
        `None` means 1 unless in a :obj:`joblib.parallel_backend` context.
        `-1` means using all processors.

    verbose : bool, default=False
        If verbose is True, the objective function and duality gap are
        printed at each iteration.

    assume_centered : bool, default=False
        If True, data are not centered before computation.
        Useful when working with data whose mean is almost, but not exactly
        zero.
        If False, data are centered before computation.

    nearest : bool, default=False
        Forwarded unchanged to `BaseCovariance.__init__`.

    higham : bool, default=False
        Forwarded unchanged to `BaseCovariance.__init__`.

    higham_max_iteration : int, default=100
        Forwarded unchanged to `BaseCovariance.__init__`.

    Attributes
    ----------
    covariance_ : ndarray of shape (n_assets, n_assets)
        Estimated covariance.

    location_ : ndarray of shape (n_assets,)
        Estimated location, i.e. the estimated mean.

    precision_ : ndarray of shape (n_assets, n_assets)
        Estimated precision matrix (inverse covariance).

    alpha_ : float
        Penalization parameter selected.

    cv_results_ : dict of ndarrays
        A dict with keys:

        alphas : ndarray of shape (n_alphas,)
            All penalization parameters explored.

        split(k)_test_score : ndarray of shape (n_alphas,)
            Log-likelihood score on left-out data across (k)th fold.

            .. versionadded:: 1.0

        mean_test_score : ndarray of shape (n_alphas,)
            Mean of scores over the folds.

            .. versionadded:: 1.0

        std_test_score : ndarray of shape (n_alphas,)
            Standard deviation of scores over the folds.

            .. versionadded:: 1.0

    n_iter_ : int
        Number of iterations run for the optimal alpha.

    n_features_in_ : int
        Number of assets seen during `fit`.

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during `fit`. Defined only when `X`
        has feature names that are all strings.

    Notes
    -----
    The search for the optimal penalization parameter (`alpha`) is done on an
    iteratively refined grid: first the cross-validated scores on a grid are
    computed, then a new refined grid is centered around the maximum, and so
    on.

    One of the challenges which is faced here is that the solvers can
    fail to converge to a well-conditioned estimate. The corresponding
    values of `alpha` then come out as missing values, but the optimum may
    be close to these missing values.

    In `fit`, once the best parameter `alpha` is found through
    cross-validation, the model is fit again using the entire training set.
    """

    def __init__(
        self,
        alphas=4,
        n_refinements=4,
        cv=None,
        tol=1e-4,
        enet_tol=1e-4,
        max_iter=100,
        mode="cd",
        n_jobs=None,
        verbose=False,
        assume_centered=False,
        nearest: bool = False,
        higham: bool = False,
        higham_max_iteration: int = 100,
    ):
        # Each parent is initialised explicitly with only the keyword
        # arguments it owns.
        super().__init__(
            nearest=nearest,
            higham=higham,
            higham_max_iteration=higham_max_iteration,
        )
        skc.GraphicalLassoCV.__init__(
            self,
            alphas=alphas,
            n_refinements=n_refinements,
            cv=cv,
            tol=tol,
            enet_tol=enet_tol,
            max_iter=max_iter,
            mode=mode,
            n_jobs=n_jobs,
            verbose=verbose,
            assume_centered=assume_centered,
        )

    def fit(self, X: npt.ArrayLike, y=None) -> "GraphicalLassoCV":
        """Fit the GraphicalLasso covariance model to X.

        Parameters
        ----------
        X : array-like of shape (n_observations, n_assets)
            Price returns of the assets.

        y : Ignored
            Not used, present for API consistency by convention.

        Returns
        -------
        self : GraphicalLassoCV
            Fitted estimator.
        """
        # Run the scikit-learn fit first, then pass its `covariance_` to the
        # inherited `_set_covariance` hook (defined outside this block).
        skc.GraphicalLassoCV.fit(self, X)
        self._set_covariance(self.covariance_)
        return self