skfolio 0.2.3__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52)
  1. skfolio/datasets/__init__.py +2 -0
  2. skfolio/datasets/_base.py +51 -0
  3. skfolio/distance/_distance.py +15 -4
  4. skfolio/model_selection/_combinatorial.py +2 -2
  5. skfolio/model_selection/_validation.py +70 -15
  6. skfolio/model_selection/_walk_forward.py +3 -3
  7. skfolio/moments/__init__.py +2 -0
  8. skfolio/moments/covariance/__init__.py +11 -11
  9. skfolio/moments/covariance/_base.py +10 -9
  10. skfolio/moments/covariance/_denoise_covariance.py +181 -0
  11. skfolio/moments/covariance/_detone_covariance.py +158 -0
  12. skfolio/moments/covariance/_empirical_covariance.py +100 -0
  13. skfolio/moments/covariance/_ew_covariance.py +109 -0
  14. skfolio/moments/covariance/_gerber_covariance.py +157 -0
  15. skfolio/moments/covariance/_graphical_lasso_cv.py +194 -0
  16. skfolio/moments/covariance/_implied_covariance.py +454 -0
  17. skfolio/moments/covariance/_ledoit_wolf.py +140 -0
  18. skfolio/moments/covariance/_oas.py +115 -0
  19. skfolio/moments/covariance/_shrunk_covariance.py +104 -0
  20. skfolio/moments/expected_returns/__init__.py +4 -7
  21. skfolio/moments/expected_returns/_empirical_mu.py +63 -0
  22. skfolio/moments/expected_returns/_equilibrium_mu.py +124 -0
  23. skfolio/moments/expected_returns/_ew_mu.py +69 -0
  24. skfolio/moments/expected_returns/{_expected_returns.py → _shrunk_mu.py} +22 -200
  25. skfolio/optimization/cluster/_nco.py +46 -8
  26. skfolio/optimization/cluster/hierarchical/_base.py +21 -1
  27. skfolio/optimization/cluster/hierarchical/_herc.py +18 -4
  28. skfolio/optimization/cluster/hierarchical/_hrp.py +13 -4
  29. skfolio/optimization/convex/_base.py +10 -1
  30. skfolio/optimization/convex/_distributionally_robust.py +12 -2
  31. skfolio/optimization/convex/_maximum_diversification.py +9 -2
  32. skfolio/optimization/convex/_mean_risk.py +33 -6
  33. skfolio/optimization/convex/_risk_budgeting.py +5 -2
  34. skfolio/optimization/ensemble/_stacking.py +32 -9
  35. skfolio/optimization/naive/_naive.py +20 -2
  36. skfolio/population/_population.py +2 -0
  37. skfolio/prior/_base.py +1 -1
  38. skfolio/prior/_black_litterman.py +20 -2
  39. skfolio/prior/_empirical.py +38 -5
  40. skfolio/prior/_factor_model.py +44 -7
  41. skfolio/uncertainty_set/_base.py +30 -9
  42. skfolio/uncertainty_set/_bootstrap.py +26 -10
  43. skfolio/uncertainty_set/_empirical.py +25 -10
  44. skfolio/utils/stats.py +24 -3
  45. skfolio/utils/tools.py +213 -79
  46. {skfolio-0.2.3.dist-info → skfolio-0.3.0.dist-info}/METADATA +3 -2
  47. skfolio-0.3.0.dist-info/RECORD +91 -0
  48. {skfolio-0.2.3.dist-info → skfolio-0.3.0.dist-info}/WHEEL +1 -1
  49. skfolio/moments/covariance/_covariance.py +0 -1114
  50. skfolio-0.2.3.dist-info/RECORD +0 -79
  51. {skfolio-0.2.3.dist-info → skfolio-0.3.0.dist-info}/LICENSE +0 -0
  52. {skfolio-0.2.3.dist-info → skfolio-0.3.0.dist-info}/top_level.txt +0 -0
skfolio/datasets/__init__.py CHANGED
@@ -8,6 +8,7 @@ from skfolio.datasets._base import (
     load_ftse100_dataset,
     load_nasdaq_dataset,
     load_sp500_dataset,
+    load_sp500_implied_vol_dataset,
     load_sp500_index,
 )
 
@@ -17,4 +18,5 @@ __all__ = [
     "load_ftse100_dataset",
     "load_sp500_dataset",
     "load_sp500_index",
+    "load_sp500_implied_vol_dataset",
 ]
skfolio/datasets/_base.py CHANGED
@@ -392,3 +392,54 @@ def load_nasdaq_dataset(data_home=None, download_if_missing=True) -> pd.DataFram
         data_filename, data_home=data_home, download_if_missing=download_if_missing
     )
     return df
+
+
+def load_sp500_implied_vol_dataset(
+    data_home=None, download_if_missing=True
+) -> pd.DataFrame:
+    """Load the 3 months ATM implied volatility of the 20 assets from the
+    SP500 dataset.
+
+    This dataset is composed of the 3 months ATM implied volatility of 20 assets
+    from the S&P 500 composition starting from 2010-01-04 up to 2022-12-28.
+
+    The data comes from the Yahoo public API option chains.
+
+    ==============  ==================
+    Observations    3270
+    Assets          20
+    ==============  ==================
+
+    Parameters
+    ----------
+    data_home : str, optional
+        Specify another download and cache folder for the datasets.
+        By default, all skfolio data is stored in `~/skfolio_data` subfolders.
+
+    download_if_missing : bool, default=True
+        If False, raise an OSError if the data is not locally available
+        instead of trying to download the data from the source site.
+
+    Returns
+    -------
+    df : DataFrame of shape (n_observations, n_assets)
+        Implied volatility DataFrame
+
+    Examples
+    --------
+    >>> from skfolio.datasets import load_sp500_implied_vol_dataset
+    >>> implied_vol = load_sp500_implied_vol_dataset()
+    >>> implied_vol.head()
+                    AAPL       AMD       BAC  ...       UNH       WMT       XOM
+    Date                                      ...
+    2010-01-04  0.364353  0.572056  0.382926  ...  0.362751  0.171737  0.201485
+    2010-01-05  0.371865  0.568791  0.374699  ...  0.368504  0.174764  0.203852
+    2010-01-06  0.356746  0.558054  0.349220  ...  0.368514  0.171892  0.197475
+    2010-01-07  0.361084  0.560475  0.354942  ...  0.355792  0.169083  0.200046
+    2010-01-08  0.348085  0.543932  0.360345  ...  0.351130  0.170897  0.204832
+    """
+    data_filename = "sp500_implied_vol_dataset"
+    df = download_dataset(
+        data_filename, data_home=data_home, download_if_missing=download_if_missing
+    )
+    return df
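The new dataset pairs naturally with the `ImpliedCovariance` estimator added in this release. Below is a minimal sketch of that pairing; it assumes `ImpliedCovariance` accepts the volatilities through an `implied_vol` fit parameter (its module is added in this release but its diff is not shown here) and uses `prices_to_returns` from `skfolio.preprocessing`:

```python
from skfolio.datasets import load_sp500_dataset, load_sp500_implied_vol_dataset
from skfolio.moments import ImpliedCovariance
from skfolio.preprocessing import prices_to_returns

prices = load_sp500_dataset()
implied_vol = load_sp500_implied_vol_dataset()

# Convert prices to returns and align both frames on their common dates.
X = prices_to_returns(prices)
X, implied_vol = X.align(implied_vol, join="inner", axis=0)

# Assumption: ImpliedCovariance exposes an `implied_vol` fit parameter.
model = ImpliedCovariance()
model.fit(X, implied_vol=implied_vol)
print(model.covariance_.shape)  # (20, 20)
```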
skfolio/distance/_distance.py CHANGED
@@ -9,7 +9,8 @@ import numpy.typing as npt
 import pandas as pd
 import scipy.spatial.distance as scd
 import scipy.stats as sct
-import sklearn.metrics as skm
+import sklearn.metrics as skmc
+import sklearn.utils.metadata_routing as skm
 
 from skfolio.distance._base import BaseDistance
 from skfolio.moments import BaseCovariance, GerberCovariance
@@ -300,7 +301,15 @@ class CovarianceDistance(BaseDistance):
         self.absolute = absolute
         self.power = power
 
-    def fit(self, X: npt.ArrayLike, y=None) -> "CovarianceDistance":
+    def get_metadata_routing(self):
+        # noinspection PyTypeChecker
+        router = skm.MetadataRouter(owner=self.__class__.__name__).add(
+            covariance_estimator=self.covariance_estimator,
+            method_mapping=skm.MethodMapping().add(caller="fit", callee="fit"),
+        )
+        return router
+
+    def fit(self, X: npt.ArrayLike, y=None, **fit_params) -> "CovarianceDistance":
         """Fit the Covariance Distance estimator.
 
         Parameters
@@ -316,13 +325,15 @@ class CovarianceDistance(BaseDistance):
         self : CovarianceDistance
             Fitted estimator.
         """
+        routed_params = skm.process_routing(self, "fit", **fit_params)
+
         # fitting estimators
         self.covariance_estimator_ = check_estimator(
             self.covariance_estimator,
             default=GerberCovariance(),
             check_type=BaseCovariance,
         )
-        self.covariance_estimator_.fit(X)
+        self.covariance_estimator_.fit(X, y, **routed_params.covariance_estimator.fit)
 
         # we validate and convert to numpy after all models have been fitted to keep the
         # features names information.
@@ -512,7 +523,7 @@ class MutualInformation(BaseDistance):
             x = X[:, i]
             y = X[:, j]
             contingency = np.histogram2d(x, y, bins=n_bins)[0]
-            mutual_information = skm.mutual_info_score(
+            mutual_information = skmc.mutual_info_score(
                 None, None, contingency=contingency
             )
             entropy_x = sct.entropy(np.histogram(x, n_bins)[0])
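The new `get_metadata_routing` method lets fit parameters flow through `CovarianceDistance` to its inner covariance estimator. A hedged sketch of the intended usage, following scikit-learn's metadata-routing convention (the `set_fit_request` method is generated by scikit-learn for estimators whose `fit` accepts extra metadata; `ImpliedCovariance` as the consumer is an assumption):

```python
import sklearn

from skfolio.datasets import load_sp500_dataset, load_sp500_implied_vol_dataset
from skfolio.distance import CovarianceDistance
from skfolio.moments import ImpliedCovariance
from skfolio.preprocessing import prices_to_returns

# Routing is opt-in via scikit-learn's global configuration flag.
sklearn.set_config(enable_metadata_routing=True)

X = prices_to_returns(load_sp500_dataset())
implied_vol = load_sp500_implied_vol_dataset()
X, implied_vol = X.align(implied_vol, join="inner", axis=0)

# The inner estimator must explicitly request the metadata it consumes.
distance = CovarianceDistance(
    covariance_estimator=ImpliedCovariance().set_fit_request(implied_vol=True)
)
distance.fit(X, implied_vol=implied_vol)
print(distance.distance_.shape)  # (n_assets, n_assets)
```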
skfolio/model_selection/_combinatorial.py CHANGED
@@ -18,7 +18,7 @@ import numpy as np
 import numpy.typing as npt
 import pandas as pd
 import plotly.graph_objects as go
-import sklearn.model_selection as skm
+import sklearn.model_selection as sks
 import sklearn.utils as sku
 
 import skfolio.typing as skt
@@ -39,7 +39,7 @@ class BaseCombinatorialCV(ABC):
         """Return the path id of each test sets in each split"""
         pass
 
-    __repr__ = skm.BaseCrossValidator.__repr__
+    __repr__ = sks.BaseCrossValidator.__repr__
 
 
 # TODO: review params and function naming
skfolio/model_selection/_validation.py CHANGED
@@ -12,8 +12,10 @@ import numpy as np
 import numpy.typing as npt
 import sklearn as sk
 import sklearn.base as skb
-import sklearn.model_selection as skm
+import sklearn.exceptions as ske
+import sklearn.model_selection as sks
 import sklearn.utils as sku
+import sklearn.utils.metadata_routing as skm
 import sklearn.utils.parallel as skp
 
 from skfolio.model_selection._combinatorial import BaseCombinatorialCV
@@ -22,16 +24,26 @@ from skfolio.portfolio import MultiPeriodPortfolio
 from skfolio.utils.tools import fit_and_predict, safe_split
 
 
+def _routing_enabled():
+    """Return whether metadata routing is enabled.
+
+    Returns
+    -------
+    enabled : bool
+        Whether metadata routing is enabled. If the config is not set, it
+        defaults to False.
+    """
+    return sk.get_config().get("enable_metadata_routing", False)
+
+
 def cross_val_predict(
     estimator: skb.BaseEstimator,
     X: npt.ArrayLike,
     y: npt.ArrayLike = None,
-    groups: np.ndarray | None = None,
-    cv: skm.BaseCrossValidator | BaseCombinatorialCV | int | None = None,
+    cv: sks.BaseCrossValidator | BaseCombinatorialCV | int | None = None,
     n_jobs: int | None = None,
     method: str = "predict",
     verbose: int = 0,
-    fit_params: dict | None = None,
+    params: dict | None = None,
     pre_dispatch: str = "2*n_jobs",
     column_indices: np.ndarray | None = None,
     portfolio_params: dict | None = None,
@@ -65,11 +77,6 @@ def cross_val_predict(
         Target data (optional).
         For example, the price returns of the factors.
 
-    groups : array-like of shape (n_observations,), optional
-        Group labels for the samples used while splitting the dataset into
-        train/test set. Only used in conjunction with a "Group" `cv`
-        instance (e.g., `GroupKFold`).
-
     cv : int | cross-validation generator, optional
         Determines the cross-validation splitting strategy.
         Possible inputs for cv are:
@@ -90,8 +97,8 @@ def cross_val_predict(
     verbose : int, default=0
         The verbosity level.
 
-    fit_params : dict, optional
-        Parameters to pass to the fit method of the estimator.
+    params : dict, optional
+        Parameters to pass to the underlying estimator's ``fit`` and the CV splitter.
 
     pre_dispatch : int or str, default='2*n_jobs'
         Controls the number of jobs that get dispatched during parallel
@@ -121,10 +128,57 @@ def cross_val_predict(
     predictions : MultiPeriodPortfolio | Population
         This is the result of calling `predict`
     """
+    params = {} if params is None else params
+
     X, y = safe_split(X, y, indices=column_indices, axis=1)
-    X, y, groups = sku.indexable(X, y, groups)
-    cv = skm.check_cv(cv, y)
-    splits = list(cv.split(X, y, groups))
+    X, y = sku.indexable(X, y)
+
+    if _routing_enabled():
+        # For estimators, a MetadataRouter is created in get_metadata_routing
+        # methods. For these router methods, we create the router to use
+        # `process_routing` on it.
+        # noinspection PyTypeChecker
+        router = (
+            skm.MetadataRouter(owner="cross_validate")
+            .add(
+                splitter=cv,
+                method_mapping=skm.MethodMapping().add(caller="fit", callee="split"),
+            )
+            .add(
+                estimator=estimator,
+                method_mapping=skm.MethodMapping().add(caller="fit", callee="fit"),
+            )
+        )
+        try:
+            routed_params = skm.process_routing(router, "fit", **params)
+        except ske.UnsetMetadataPassedError as e:
+            # The default exception would mention `fit` since in the above
+            # `process_routing` code, we pass `fit` as the caller. However,
+            # the user is not calling `fit` directly, so we change the message
+            # to make it more suitable for this case.
+            unrequested_params = sorted(e.unrequested_params)
+            raise ske.UnsetMetadataPassedError(
+                message=(
+                    f"{unrequested_params} are passed to `cross_val_predict` but are"
+                    " not explicitly set as requested or not requested for"
+                    f" cross_validate's estimator: {estimator.__class__.__name__} Call"
+                    " `.set_fit_request({{metadata}}=True)` on the estimator for"
+                    f" each metadata in {unrequested_params} that you want to use and"
+                    " `metadata=False` for not using it. See the Metadata Routing User"
+                    " guide <https://scikit-learn.org/stable/metadata_routing.html>"
+                    " for more information."
+                ),
+                unrequested_params=e.unrequested_params,
+                routed_params=e.routed_params,
+            ) from None
+    else:
+        routed_params = sku.Bunch()
+        routed_params.splitter = sku.Bunch(split={})
+        routed_params.estimator = sku.Bunch(fit=params)
+
+    cv = sks.check_cv(cv, y)
+    splits = list(cv.split(X, y, **routed_params.splitter.split))
+
     portfolio_params = {} if portfolio_params is None else portfolio_params.copy()
 
     # We ensure that the folds are not shuffled
@@ -148,6 +202,7 @@ def cross_val_predict(
     # and that it is pickle-able.
     parallel = skp.Parallel(n_jobs=n_jobs, verbose=verbose, pre_dispatch=pre_dispatch)
     # TODO remove when https://github.com/joblib/joblib/issues/1071 is fixed
+    # noinspection PyCallingNonCallable
     predictions = parallel(
         skp.delayed(fit_and_predict)(
             sk.clone(estimator),
@@ -155,7 +210,7 @@ def cross_val_predict(
             y,
             train=train,
             test=test,
-            fit_params=fit_params,
+            fit_params=routed_params.estimator.fit,
             method=method,
         )
         for train, test in splits
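With the `groups` argument gone and `fit_params` renamed to `params`, metadata now reaches both the splitter and the estimator through one routed dictionary. A sketch of the new calling convention; the `MeanRisk` → `EmpiricalPrior` → `ImpliedCovariance` chain is an assumption based on the routing methods added elsewhere in this release:

```python
import sklearn

from skfolio.datasets import load_sp500_dataset, load_sp500_implied_vol_dataset
from skfolio.model_selection import WalkForward, cross_val_predict
from skfolio.moments import ImpliedCovariance
from skfolio.optimization import MeanRisk
from skfolio.preprocessing import prices_to_returns
from skfolio.prior import EmpiricalPrior

sklearn.set_config(enable_metadata_routing=True)

X = prices_to_returns(load_sp500_dataset())
implied_vol = load_sp500_implied_vol_dataset()
X, implied_vol = X.align(implied_vol, join="inner", axis=0)

# `params` is routed down the estimator chain: MeanRisk -> EmpiricalPrior
# -> ImpliedCovariance, which requested `implied_vol` below.
model = MeanRisk(
    prior_estimator=EmpiricalPrior(
        covariance_estimator=ImpliedCovariance().set_fit_request(implied_vol=True)
    )
)
pred = cross_val_predict(
    model,
    X,
    cv=WalkForward(test_size=60, train_size=252),
    params={"implied_vol": implied_vol},
)
print(pred.sharpe_ratio)
```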
skfolio/model_selection/_walk_forward.py CHANGED
@@ -12,11 +12,11 @@ from collections.abc import Iterator
 
 import numpy as np
 import numpy.typing as npt
-import sklearn.model_selection as skm
+import sklearn.model_selection as sks
 import sklearn.utils as sku
 
 
-class WalkForward(skm.BaseCrossValidator):
+class WalkForward(sks.BaseCrossValidator):
     """Walk Forward cross-validator.
 
     Provides train/test indices to split time series data samples in a walk forward
@@ -196,7 +196,7 @@ class WalkForward(skm.BaseCrossValidator):
             )
             test_start = test_end
 
-    def get_n_splits(self, X: npt.ArrayLike, y=None, groups=None) -> int:
+    def get_n_splits(self, X=None, y=None, groups=None) -> int:
         """Returns the number of splitting iterations in the cross-validator
 
         Parameters
skfolio/moments/__init__.py CHANGED
@@ -9,6 +9,7 @@ from skfolio.moments.covariance import (
     EmpiricalCovariance,
     GerberCovariance,
     GraphicalLassoCV,
+    ImpliedCovariance,
     LedoitWolf,
     ShrunkCovariance,
 )
@@ -38,4 +39,5 @@ __all__ = [
     "OAS",
     "ShrunkCovariance",
     "GraphicalLassoCV",
+    "ImpliedCovariance",
 ]
skfolio/moments/covariance/__init__.py CHANGED
@@ -3,17 +3,16 @@
 from skfolio.moments.covariance._base import (
     BaseCovariance,
 )
-from skfolio.moments.covariance._covariance import (
-    OAS,
-    DenoiseCovariance,
-    DetoneCovariance,
-    EWCovariance,
-    EmpiricalCovariance,
-    GerberCovariance,
-    GraphicalLassoCV,
-    LedoitWolf,
-    ShrunkCovariance,
-)
+from skfolio.moments.covariance._denoise_covariance import DenoiseCovariance
+from skfolio.moments.covariance._detone_covariance import DetoneCovariance
+from skfolio.moments.covariance._empirical_covariance import EmpiricalCovariance
+from skfolio.moments.covariance._ew_covariance import EWCovariance
+from skfolio.moments.covariance._gerber_covariance import GerberCovariance
+from skfolio.moments.covariance._graphical_lasso_cv import GraphicalLassoCV
+from skfolio.moments.covariance._implied_covariance import ImpliedCovariance
+from skfolio.moments.covariance._ledoit_wolf import LedoitWolf
+from skfolio.moments.covariance._oas import OAS
+from skfolio.moments.covariance._shrunk_covariance import ShrunkCovariance
 
 __all__ = [
     "BaseCovariance",
@@ -26,4 +25,5 @@ __all__ = [
     "OAS",
     "ShrunkCovariance",
     "GraphicalLassoCV",
+    "ImpliedCovariance",
 ]
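Since `__init__.py` re-exports every estimator from the new per-estimator modules, the split of the old `_covariance.py` is invisible to downstream code; only imports that reached into the private module would break:

```python
# Public imports are unchanged by the module split:
from skfolio.moments import DenoiseCovariance, GerberCovariance
from skfolio.moments.covariance import ImpliedCovariance  # new in 0.3.0

# Only private-path imports like the following break in 0.3.0:
# from skfolio.moments.covariance._covariance import GerberCovariance
```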
skfolio/moments/covariance/_base.py CHANGED
@@ -22,19 +22,19 @@ class BaseCovariance(skb.BaseEstimator, ABC):
 
     Parameters
     ----------
-    nearest : bool, default=False
+    nearest : bool, default=True
         If this is set to True, the covariance is replaced by the nearest covariance
         matrix that is positive definite and with a Cholesky decomposition than can be
-        computed. The variance is left unchanged. A covariance matrix is in theory PSD.
-        However, due to floating-point inaccuracies, we can end up with a covariance
-        matrix that is slightly non-PSD or where Cholesky decomposition is failing.
-        This often occurs in high dimensional problems.
-        For more details, see :func:`~skfolio.units.stats.cov_nearest`.
-        The default is `False`.
+        computed. The variance is left unchanged.
+        A covariance matrix that is not positive definite often occurs in high
+        dimensional problems. It can be due to multicollinearity, floating-point
+        inaccuracies, or when the number of observations is smaller than the number of
+        assets. For more details, see :func:`~skfolio.utils.stats.cov_nearest`.
+        The default is `True`.
 
     higham : bool, default=False
         If this is set to True, the Higham & Nick (2002) algorithm is used to find the
-        nearest PSD covariance, otherwise the eigenvalues are clipped to a threshold
+        nearest PD covariance, otherwise the eigenvalues are clipped to a threshold
         above zeros (1e-13). The default is `False` and use the clipping method as the
         Higham & Nick algorithm can be slow for large datasets.
@@ -59,7 +59,7 @@ class BaseCovariance(skb.BaseEstimator, ABC):
     @abstractmethod
     def __init__(
         self,
-        nearest: bool = False,
+        nearest: bool = True,
         higham: bool = False,
         higham_max_iteration: int = 100,
     ):
@@ -103,6 +103,7 @@ class BaseCovariance(skb.BaseEstimator, ABC):
             covariance,
             higham=self.higham,
             higham_max_iteration=self.higham_max_iteration,
+            warn=True,
         )
         # set covariance
         self.covariance_ = covariance
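For intuition, here is a minimal sketch of the eigenvalue-clipping method the docstring describes, not skfolio's exact `cov_nearest` implementation (which lives in `skfolio.utils.stats`): eigenvalues below a small threshold are raised to it, then the matrix is rescaled so the variances are left unchanged.

```python
import numpy as np


def nearest_pd_by_clipping(cov: np.ndarray, threshold: float = 1e-13) -> np.ndarray:
    """Illustrative sketch of the clipping approach to the nearest PD matrix."""
    # Symmetrize to guard against floating-point asymmetry.
    sym = (cov + cov.T) / 2
    e_val, e_vec = np.linalg.eigh(sym)
    # Clip eigenvalues to a small threshold above zero.
    e_val = np.clip(e_val, a_min=threshold, a_max=None)
    fixed = e_vec @ np.diag(e_val) @ e_vec.T
    # Rescale so the diagonal (the variances) is left unchanged.
    scale = np.sqrt(np.diag(sym) / np.diag(fixed))
    return fixed * np.outer(scale, scale)
```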
skfolio/moments/covariance/_denoise_covariance.py ADDED
@@ -0,0 +1,181 @@
+"""Covariance Denoising Estimators."""
+
+# Copyright (c) 2023
+# Author: Hugo Delatte <delatte.hugo@gmail.com>
+# License: BSD 3 clause
+# Implementation derived from:
+# scikit-learn, Copyright (c) 2007-2010 David Cournapeau, Fabian Pedregosa, Olivier
+# Grisel Licensed under BSD 3 clause.
+
+import numpy as np
+import numpy.typing as npt
+import scipy.optimize as sco
+import sklearn.neighbors as skn
+import sklearn.utils.metadata_routing as skm
+
+from skfolio.moments.covariance._base import BaseCovariance
+from skfolio.moments.covariance._empirical_covariance import EmpiricalCovariance
+from skfolio.utils.stats import corr_to_cov, cov_to_corr
+from skfolio.utils.tools import check_estimator
+
+
+class DenoiseCovariance(BaseCovariance):
+    """Covariance Denoising estimator.
+
+    The goal of Covariance Denoising is to reduce the noise and enhance the signal of
+    the empirical covariance matrix [1]_.
+    It reduces the ill-conditioning of the traditional covariance estimate by
+    differentiating the eigenvalues associated with noise from the eigenvalues
+    associated with signal.
+    Denoising replaces the eigenvalues of the eigenvectors classified as random by
+    Marčenko-Pastur with a constant eigenvalue.
+
+    Parameters
+    ----------
+    covariance_estimator : BaseCovariance, optional
+        :ref:`Covariance estimator <covariance_estimator>` to estimate the covariance
+        matrix that will be denoised.
+        The default (`None`) is to use :class:`~skfolio.moments.EmpiricalCovariance`.
+
+    nearest : bool, default=True
+        If this is set to True, the covariance is replaced by the nearest covariance
+        matrix that is positive definite and with a Cholesky decomposition than can be
+        computed. The variance is left unchanged.
+        A covariance matrix that is not positive definite often occurs in high
+        dimensional problems. It can be due to multicollinearity, floating-point
+        inaccuracies, or when the number of observations is smaller than the number of
+        assets. For more details, see :func:`~skfolio.utils.stats.cov_nearest`.
+        The default is `True`.
+
+    higham : bool, default=False
+        If this is set to True, the Higham & Nick (2002) algorithm is used to find the
+        nearest PD covariance, otherwise the eigenvalues are clipped to a threshold
+        above zeros (1e-13). The default is `False` and use the clipping method as the
+        Higham & Nick algorithm can be slow for large datasets.
+
+    higham_max_iteration : int, default=100
+        Maximum number of iteration of the Higham & Nick (2002) algorithm.
+        The default value is `100`.
+
+    Attributes
+    ----------
+    covariance_ : ndarray of shape (n_assets, n_assets)
+        Estimated covariance.
+
+    covariance_estimator_ : BaseCovariance
+        Fitted `covariance_estimator`.
+
+    n_features_in_ : int
+        Number of assets seen during `fit`.
+
+    feature_names_in_ : ndarray of shape (`n_features_in_`,)
+        Names of assets seen during `fit`. Defined only when `X`
+        has assets names that are all strings.
+
+    References
+    ----------
+    .. [1] "Machine Learning for Asset Managers".
+        Elements in Quantitative Finance.
+        Lòpez de Prado (2020).
+    """
+
+    covariance_estimator_: BaseCovariance
+
+    def __init__(
+        self,
+        covariance_estimator: BaseCovariance | None = None,
+        nearest: bool = True,
+        higham: bool = False,
+        higham_max_iteration: int = 100,
+    ):
+        super().__init__(
+            nearest=nearest,
+            higham=higham,
+            higham_max_iteration=higham_max_iteration,
+        )
+        self.covariance_estimator = covariance_estimator
+
+    def get_metadata_routing(self):
+        # noinspection PyTypeChecker
+        router = skm.MetadataRouter(owner=self.__class__.__name__).add(
+            covariance_estimator=self.covariance_estimator,
+            method_mapping=skm.MethodMapping().add(caller="fit", callee="fit"),
+        )
+        return router
+
+    def fit(self, X: npt.ArrayLike, y=None, **fit_params) -> "DenoiseCovariance":
+        """Fit the Covariance Denoising estimator.
+
+        Parameters
+        ----------
+        X : array-like of shape (n_observations, n_assets)
+            Price returns of the assets.
+
+        y : Ignored
+            Not used, present for API consistency by convention.
+
+        **fit_params : dict
+            Parameters to pass to the underlying estimators.
+            Only available if `enable_metadata_routing=True`, which can be
+            set by using ``sklearn.set_config(enable_metadata_routing=True)``.
+            See :ref:`Metadata Routing User Guide <metadata_routing>` for
+            more details.
+
+        Returns
+        -------
+        self : DenoiseCovariance
+            Fitted estimator.
+        """
+        routed_params = skm.process_routing(self, "fit", **fit_params)
+
+        # fitting estimators
+        self.covariance_estimator_ = check_estimator(
+            self.covariance_estimator,
+            default=EmpiricalCovariance(),
+            check_type=BaseCovariance,
+        )
+        # noinspection PyArgumentList
+        self.covariance_estimator_.fit(X, y, **routed_params.covariance_estimator.fit)
+
+        # we validate and convert to numpy after all models have been fitted to keep
+        # features names information.
+        X = self._validate_data(X)
+        n_observations, n_assets = X.shape
+        q = n_observations / n_assets
+        corr, std = cov_to_corr(self.covariance_estimator_.covariance_)
+        e_val, e_vec = np.linalg.eigh(corr)
+        indices = e_val.argsort()[::-1]
+        e_val, e_vec = e_val[indices], e_vec[:, indices]
+
+        def _marchenko(x_var):
+            e_min, e_max = (
+                x_var * (1 - (1.0 / q) ** 0.5) ** 2,
+                x_var * (1 + (1.0 / q) ** 0.5) ** 2,
+            )
+            e_val_lin = np.linspace(e_min, e_max, 1000)
+            pdf_0 = (
+                q
+                / (2 * np.pi * x_var * e_val_lin)
+                * ((e_max - e_val_lin) * (e_val_lin - e_min)) ** 0.5
+            )
+            kde = skn.KernelDensity(kernel="gaussian", bandwidth=0.01).fit(
+                e_val.reshape(-1, 1)
+            )
+            # noinspection PyUnresolvedReferences
+            pdf_1 = np.exp(kde.score_samples(pdf_0.reshape(-1, 1)))
+            return np.sum((pdf_1 - pdf_0) ** 2)
+
+        # noinspection PyTypeChecker
+        res = sco.minimize(_marchenko, x0=0.5, bounds=((1e-5, 1 - 1e-5),))
+
+        var = res["x"][0]
+        n_facts = e_val.shape[0] - e_val[::-1].searchsorted(
+            var * (1 + (1.0 / q) ** 0.5) ** 2
+        )
+        e_val_ = e_val.copy()
+        e_val_[n_facts:] = e_val_[n_facts:].sum() / float(e_val_.shape[0] - n_facts)
+        corr = e_vec @ np.diag(e_val_) @ e_vec.T
+        corr, _ = cov_to_corr(corr)
+        covariance = corr_to_cov(corr, std)
+        self._set_covariance(covariance)
+        return self
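For reference, the `_marchenko` helper above fits the Marčenko-Pastur density to the empirical eigenvalue spectrum of the correlation matrix. With aspect ratio q = n_observations / n_assets and fitted noise variance σ², the support and density used in the code are:

```latex
\lambda_{\pm} = \sigma^2 \left(1 \pm \sqrt{1/q}\right)^2,
\qquad
f(\lambda) = \frac{q}{2\pi\sigma^2\lambda}
\sqrt{(\lambda_{+}-\lambda)(\lambda-\lambda_{-})},
\quad \lambda_{-} \le \lambda \le \lambda_{+}.
```

Eigenvalues above λ₊ are kept as signal; the remaining (noise) eigenvalues are replaced by their average, which is what the `e_val_[n_facts:]` assignment does before the correlation matrix is rebuilt and rescaled to a covariance.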