skfolio 0.4.2__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -52,8 +52,6 @@ class BaseHierarchicalOptimization(BaseOptimization, ABC):
52
52
  * ENTROPIC_RISK_MEASURE
53
53
  * FOURTH_CENTRAL_MOMENT
54
54
  * FOURTH_LOWER_PARTIAL_MOMENT
55
- * SKEW
56
- * KURTOSIS
57
55
 
58
56
  The default is `RiskMeasure.VARIANCE`.
59
57
 
@@ -80,12 +78,12 @@ class BaseHierarchicalOptimization(BaseOptimization, ABC):
80
78
 
81
79
  min_weights : float | dict[str, float] | array-like of shape (n_assets, ), default=0.0
82
80
  Minimum assets weights (weights lower bounds). Negative weights are not allowed.
83
- If a float is provided, it is applied to each asset. `None` is equivalent to
84
- `-np.Inf` (no lower bound). If a dictionary is provided, its (key/value) pair
85
- must be the (asset name/asset minium weight) and the input `X` of the `fit`
86
- methods must be a DataFrame with the assets names in columns. When using a
87
- dictionary, assets values that are not provided are assigned a minimum weight
88
- of `0.0`. The default is 0.0 (no short selling).
81
+ If a float is provided, it is applied to each asset.
82
+ If a dictionary is provided, its (key/value) pair must be the
83
+ (asset name/asset minimum weight) and the input `X` of the `fit` method must be
84
+ a DataFrame with the assets names in columns.
85
+ When using a dictionary, assets values that are not provided are assigned a
86
+ minimum weight of `0.0`. The default is 0.0 (no short selling).
89
87
 
90
88
  Example:
91
89
 
@@ -96,12 +94,12 @@ class BaseHierarchicalOptimization(BaseOptimization, ABC):
96
94
 
97
95
  max_weights : float | dict[str, float] | array-like of shape (n_assets, ), default=1.0
98
96
  Maximum assets weights (weights upper bounds). Weights above 1.0 are not
99
- allowed. If a float is provided, it is applied to each asset. `None` is
100
- equivalent to `+np.Inf` (no upper bound). If a dictionary is provided, its
101
- (key/value) pair must be the (asset name/asset maximum weight) and the input `X`
102
- of the `fit` method must be a DataFrame with the assets names in columns. When
103
- using a dictionary, assets values that are not provided are assigned a minimum
104
- weight of `1.0`. The default is 1.0 (each asset is below 100%).
97
+ allowed. If a float is provided, it is applied to each asset.
98
+ If a dictionary is provided, its (key/value) pair must be the
99
+ (asset name/asset maximum weight) and the input `X` of the `fit` method must be
100
+ a DataFrame with the assets names in columns.
101
+ When using a dictionary, assets values that are not provided are assigned a
102
+ maximum weight of `1.0`. The default is 1.0 (each asset is below 100%).
105
103
 
106
104
  Example:
107
105
 
@@ -388,57 +386,6 @@ class BaseHierarchicalOptimization(BaseOptimization, ABC):
388
386
 
389
387
  return min_weights, max_weights
390
388
 
391
- @staticmethod
392
- def _apply_weight_constraints_to_alpha(
393
- alpha: float,
394
- max_weights: np.ndarray,
395
- min_weights: np.ndarray,
396
- weights: np.ndarray,
397
- left_cluster: np.ndarray,
398
- right_cluster: np.ndarray,
399
- ) -> float:
400
- """Apply weight constraints to the alpha multiplication factor of the
401
- Hierarchical Tree Clustering algorithm.
402
-
403
- Parameters
404
- ----------
405
- alpha : float
406
- The alpha multiplication factor of the Hierarchical Tree Clustering
407
- algorithm.
408
-
409
- min_weights : ndarray of shape (n_assets,)
410
- The weight lower bound 1D array.
411
-
412
- max_weights : ndarray of shape (n_assets,)
413
- The weight upper bound 1D array.
414
-
415
- weights : np.ndarray of shape (n_assets,)
416
- The assets weights.
417
-
418
- left_cluster : ndarray of shape (n_left_cluster,)
419
- Indices of the left cluster weights.
420
-
421
- right_cluster : ndarray of shape (n_right_cluster,)
422
- Indices of the right cluster weights.
423
-
424
- Returns
425
- -------
426
- value : float
427
- The transformed alpha incorporating the weight constraints.
428
- """
429
- alpha = min(
430
- np.sum(max_weights[left_cluster]) / weights[left_cluster[0]],
431
- max(np.sum(min_weights[left_cluster]) / weights[left_cluster[0]], alpha),
432
- )
433
- alpha = 1 - min(
434
- np.sum(max_weights[right_cluster]) / weights[right_cluster[0]],
435
- max(
436
- np.sum(min_weights[right_cluster]) / weights[right_cluster[0]],
437
- 1 - alpha,
438
- ),
439
- )
440
- return alpha
441
-
442
389
  def get_metadata_routing(self):
443
390
  # noinspection PyTypeChecker
444
391
  router = (
@@ -3,8 +3,7 @@
3
3
  # Copyright (c) 2023
4
4
  # Author: Hugo Delatte <delatte.hugo@gmail.com>
5
5
  # License: BSD 3 clause
6
- # The risk measure generalization and constraint features are derived
7
- # from Riskfolio-Lib, Copyright (c) 2020-2023, Dany Cajas, Licensed under BSD 3 clause.
6
+ # The weight-constraint handling is a novel implementation; see the class docstring for details.
8
7
 
9
8
  import numpy as np
10
9
  import numpy.typing as npt
@@ -20,6 +19,7 @@ from skfolio.optimization.cluster.hierarchical._base import (
20
19
  BaseHierarchicalOptimization,
21
20
  )
22
21
  from skfolio.prior import BasePrior, EmpiricalPrior
22
+ from skfolio.utils.stats import minimize_relative_weight_deviation
23
23
  from skfolio.utils.tools import check_estimator
24
24
 
25
25
 
@@ -45,6 +45,32 @@ class HierarchicalEqualRiskContribution(BaseHierarchicalOptimization):
45
45
  which is more stable and has better properties than the single-linkage
46
46
  method [4]_.
47
47
 
48
+ Also, the initial paper does not provide an algorithm for handling weight
49
+ constraints, and no standard solution currently exists.
50
+ In contrast to HRP (Hierarchical Risk Parity), where weight constraints
51
+ can be applied to the split factor at each bisection step, HERC
52
+ (Hierarchical Equal Risk Contribution) cannot incorporate weight constraints
53
+ during the intermediate steps of the allocation. Therefore, in HERC, the
54
+ weight constraints must be enforced after the top-down allocation has been
55
+ completed.
56
+ In skfolio, we minimize the relative deviation of the final weights from
57
+ the initial weights. This is formulated as a convex optimization problem:
58
+
59
+ .. math::
60
+ \begin{cases}
61
+ \begin{aligned}
62
+ &\min_{w} & & \Vert \frac{w - w_{init}}{w_{init}} \Vert_{2}^{2} \\
63
+ &\text{s.t.} & & \sum_{i=1}^{N} w_{i} = 1 \\
64
+ & & & w_{min} \leq w_i \leq w_{max}, \quad \forall i
65
+ \end{aligned}
66
+ \end{cases}
67
+
68
+ The reason for minimizing the relative deviation (as opposed to the absolute
69
+ deviation) is that we want to limit the impact on the risk contribution of
70
+ each asset. Since HERC allocates inversely to risk, adjusting the weights
71
+ based on relative deviation ensures that the assets' risk contributions
72
+ remain proportionally consistent with the initial allocation.
73
+
48
74
  Parameters
49
75
  ----------
50
76
  risk_measure : RiskMeasure or ExtraRiskMeasure, default=RiskMeasure.VARIANCE
@@ -70,8 +96,6 @@ class HierarchicalEqualRiskContribution(BaseHierarchicalOptimization):
70
96
  * ENTROPIC_RISK_MEASURE
71
97
  * FOURTH_CENTRAL_MOMENT
72
98
  * FOURTH_LOWER_PARTIAL_MOMENT
73
- * SKEW
74
- * KURTOSIS
75
99
 
76
100
  The default is `RiskMeasure.VARIANCE`.
77
101
 
@@ -98,12 +122,12 @@ class HierarchicalEqualRiskContribution(BaseHierarchicalOptimization):
98
122
 
99
123
  min_weights : float | dict[str, float] | array-like of shape (n_assets, ), default=0.0
100
124
  Minimum assets weights (weights lower bounds). Negative weights are not allowed.
101
- If a float is provided, it is applied to each asset. `None` is equivalent to
102
- `-np.Inf` (no lower bound). If a dictionary is provided, its (key/value) pair
103
- must be the (asset name/asset minium weight) and the input `X` of the `fit`
104
- methods must be a DataFrame with the assets names in columns. When using a
105
- dictionary, assets values that are not provided are assigned a minimum weight
106
- of `0.0`. The default is 0.0 (no short selling).
125
+ If a float is provided, it is applied to each asset.
126
+ If a dictionary is provided, its (key/value) pair must be the
127
+ (asset name/asset minimum weight) and the input `X` of the `fit` method must be
128
+ a DataFrame with the assets names in columns.
129
+ When using a dictionary, assets values that are not provided are assigned a
130
+ minimum weight of `0.0`. The default is 0.0 (no short selling).
107
131
 
108
132
  Example:
109
133
 
@@ -114,12 +138,12 @@ class HierarchicalEqualRiskContribution(BaseHierarchicalOptimization):
114
138
 
115
139
  max_weights : float | dict[str, float] | array-like of shape (n_assets, ), default=1.0
116
140
  Maximum assets weights (weights upper bounds). Weights above 1.0 are not
117
- allowed. If a float is provided, it is applied to each asset. `None` is
118
- equivalent to `+np.Inf` (no upper bound). If a dictionary is provided, its
119
- (key/value) pair must be the (asset name/asset maximum weight) and the input `X`
120
- of the `fit` method must be a DataFrame with the assets names in columns. When
121
- using a dictionary, assets values that are not provided are assigned a minimum
122
- weight of `1.0`. The default is 1.0 (each asset is below 100%).
141
+ allowed. If a float is provided, it is applied to each asset.
142
+ If a dictionary is provided, its (key/value) pair must be the
143
+ (asset name/asset maximum weight) and the input `X` of the `fit` method must be
144
+ a DataFrame with the assets names in columns.
145
+ When using a dictionary, assets values that are not provided are assigned a
146
+ maximum weight of `1.0`. The default is 1.0 (each asset is below 100%).
123
147
 
124
148
  Example:
125
149
 
@@ -208,6 +232,19 @@ class HierarchicalEqualRiskContribution(BaseHierarchicalOptimization):
208
232
  `management_fees`, `previous_weights` and `risk_free_rate` are copied from the
209
233
  optimization model and passed to the portfolio.
210
234
 
235
+ solver : str, default="CLARABEL"
236
+ The solver used for the weights constraints optimization. The default is
237
+ "CLARABEL" which is written in Rust and has better numerical stability and
238
+ performance than ECOS and SCS.
239
+ For more details about available solvers, check the CVXPY documentation:
240
+ https://www.cvxpy.org/tutorial/advanced/index.html#choosing-a-solver
241
+
242
+ solver_params : dict, optional
243
+ Solver parameters. For example, `solver_params=dict(verbose=True)`.
244
+ The default (`None`) is to use the CVXPY default.
245
+ For more details about solver arguments, check the CVXPY documentation:
246
+ https://www.cvxpy.org/tutorial/advanced/index.html#setting-solver-options
247
+
211
248
  Attributes
212
249
  ----------
213
250
  weights_ : ndarray of shape (n_assets,)
@@ -251,6 +288,8 @@ class HierarchicalEqualRiskContribution(BaseHierarchicalOptimization):
251
288
  hierarchical_clustering_estimator: HierarchicalClustering | None = None,
252
289
  min_weights: skt.MultiInput | None = 0.0,
253
290
  max_weights: skt.MultiInput | None = 1.0,
291
+ solver: str = "CLARABEL",
292
+ solver_params: dict | None = None,
254
293
  transaction_costs: skt.MultiInput = 0.0,
255
294
  management_fees: skt.MultiInput = 0.0,
256
295
  previous_weights: skt.MultiInput | None = None,
@@ -268,6 +307,8 @@ class HierarchicalEqualRiskContribution(BaseHierarchicalOptimization):
268
307
  previous_weights=previous_weights,
269
308
  portfolio_params=portfolio_params,
270
309
  )
310
+ self.solver = solver
311
+ self.solver_params = solver_params
271
312
 
272
313
  def fit(
273
314
  self, X: npt.ArrayLike, y: None = None, **fit_params
@@ -301,6 +342,13 @@ class HierarchicalEqualRiskContribution(BaseHierarchicalOptimization):
301
342
  raise TypeError(
302
343
  "`risk_measure` must be of type `RiskMeasure` or `ExtraRiskMeasure`"
303
344
  )
345
+
346
+ if self.risk_measure in [ExtraRiskMeasure.SKEW, ExtraRiskMeasure.KURTOSIS]:
347
+ # Because Skew and Kurtosis can take negative values
348
+ raise ValueError(
349
+ f"risk_measure {self.risk_measure} currently not supported" f"in HERC"
350
+ )
351
+
304
352
  self.prior_estimator_ = check_estimator(
305
353
  self.prior_estimator,
306
354
  default=EmpiricalPrior(),
@@ -393,21 +441,12 @@ class HierarchicalEqualRiskContribution(BaseHierarchicalOptimization):
393
441
 
394
442
  left_cluster = np.array(left_cluster)
395
443
  right_cluster = np.array(right_cluster)
444
+
396
445
  left_risk = np.sum(cluster_risks[left_cluster])
397
446
  right_risk = np.sum(cluster_risks[right_cluster])
398
447
 
399
448
  alpha = 1 - left_risk / (left_risk + right_risk)
400
449
 
401
- # Weights constraints
402
- alpha = self._apply_weight_constraints_to_alpha(
403
- alpha=alpha,
404
- weights=weights,
405
- max_weights=max_weights,
406
- min_weights=min_weights,
407
- left_cluster=left_cluster,
408
- right_cluster=right_cluster,
409
- )
410
-
411
450
  clusters_weights[left_cluster] *= alpha
412
451
  clusters_weights[right_cluster] *= 1 - alpha
413
452
 
@@ -421,5 +460,15 @@ class HierarchicalEqualRiskContribution(BaseHierarchicalOptimization):
421
460
  for i, cluster_ids in enumerate(clusters):
422
461
  weights[cluster_ids] *= clusters_weights[i]
423
462
 
463
+ # Apply weights constraints
464
+ weights = minimize_relative_weight_deviation(
465
+ weights=weights,
466
+ min_weights=min_weights,
467
+ max_weights=max_weights,
468
+ solver=self.solver,
469
+ solver_params=self.solver_params,
470
+ )
471
+
424
472
  self.weights_ = weights
473
+
425
474
  return self
@@ -72,8 +72,6 @@ class HierarchicalRiskParity(BaseHierarchicalOptimization):
72
72
  * ENTROPIC_RISK_MEASURE
73
73
  * FOURTH_CENTRAL_MOMENT
74
74
  * FOURTH_LOWER_PARTIAL_MOMENT
75
- * SKEW
76
- * KURTOSIS
77
75
 
78
76
  The default is `RiskMeasure.VARIANCE`.
79
77
 
@@ -100,9 +98,9 @@ class HierarchicalRiskParity(BaseHierarchicalOptimization):
100
98
 
101
99
  min_weights : float | dict[str, float] | array-like of shape (n_assets, ), default=0.0
102
100
  Minimum assets weights (weights lower bounds). Negative weights are not allowed.
103
- If a float is provided, it is applied to each asset. `None` is equivalent to
104
- `-np.Inf` (no lower bound). If a dictionary is provided, its (key/value) pair
105
- must be the (asset name/asset minium weight) and the input `X` of the `fit`
101
+ If a float is provided, it is applied to each asset.
102
+ If a dictionary is provided, its (key/value) pair must be the
103
+ (asset name/asset minimum weight) and the input `X` of the `fit`
106
104
  methods must be a DataFrame with the assets names in columns. When using a
107
105
  dictionary, assets values that are not provided are assigned a minimum weight
108
106
  of `0.0`. The default is 0.0 (no short selling).
@@ -116,12 +114,12 @@ class HierarchicalRiskParity(BaseHierarchicalOptimization):
116
114
 
117
115
  max_weights : float | dict[str, float] | array-like of shape (n_assets, ), default=1.0
118
116
  Maximum assets weights (weights upper bounds). Weights above 1.0 are not
119
- allowed. If a float is provided, it is applied to each asset. `None` is
120
- equivalent to `+np.Inf` (no upper bound). If a dictionary is provided, its
121
- (key/value) pair must be the (asset name/asset maximum weight) and the input `X`
122
- of the `fit` method must be a DataFrame with the assets names in columns. When
123
- using a dictionary, assets values that are not provided are assigned a minimum
124
- weight of `1.0`. The default is 1.0 (each asset is below 100%).
117
+ allowed. If a float is provided, it is applied to each asset.
118
+ If a dictionary is provided, its (key/value) pair must be the
119
+ (asset name/asset maximum weight) and the input `X` of the `fit` method must
120
+ be a DataFrame with the assets names in columns.
121
+ When using a dictionary, assets values that are not provided are assigned a
122
+ maximum weight of `1.0`. The default is 1.0 (each asset is below 100%).
125
123
 
126
124
  Example:
127
125
 
@@ -296,6 +294,13 @@ class HierarchicalRiskParity(BaseHierarchicalOptimization):
296
294
  raise TypeError(
297
295
  "`risk_measure` must be of type `RiskMeasure` or `ExtraRiskMeasure`"
298
296
  )
297
+
298
+ if self.risk_measure in [ExtraRiskMeasure.SKEW, ExtraRiskMeasure.KURTOSIS]:
299
+ # Because Skew and Kurtosis can take negative values
300
+ raise ValueError(
301
+ f"risk_measure {self.risk_measure} currently not supported" f"in HRP"
302
+ )
303
+
299
304
  self.prior_estimator_ = check_estimator(
300
305
  self.prior_estimator,
301
306
  default=EmpiricalPrior(),
@@ -365,7 +370,7 @@ class HierarchicalRiskParity(BaseHierarchicalOptimization):
365
370
  left_cluster, right_cluster = clusters_ids
366
371
  alpha = 1 - left_risk / (left_risk + right_risk)
367
372
  # Weights constraints
368
- alpha = self._apply_weight_constraints_to_alpha(
373
+ alpha = _apply_weight_constraints_to_split_factor(
369
374
  alpha=alpha,
370
375
  weights=weights,
371
376
  max_weights=max_weights,
@@ -379,3 +384,54 @@ class HierarchicalRiskParity(BaseHierarchicalOptimization):
379
384
 
380
385
  self.weights_ = weights
381
386
  return self
387
+
388
+
389
+ def _apply_weight_constraints_to_split_factor(
390
+ alpha: float,
391
+ max_weights: np.ndarray,
392
+ min_weights: np.ndarray,
393
+ weights: np.ndarray,
394
+ left_cluster: np.ndarray,
395
+ right_cluster: np.ndarray,
396
+ ) -> float:
397
+ """
398
+ Apply weight constraints to the split factor alpha of the Hierarchical Tree
399
+ Clustering algorithm.
400
+
401
+ Parameters
402
+ ----------
403
+ alpha : float
404
+ The split factor alpha of the Hierarchical Tree Clustering algorithm.
405
+
406
+ min_weights : ndarray of shape (n_assets,)
407
+ The weight lower bound 1D array.
408
+
409
+ max_weights : ndarray of shape (n_assets,)
410
+ The weight upper bound 1D array.
411
+
412
+ weights : np.ndarray of shape (n_assets,)
413
+ The assets weights.
414
+
415
+ left_cluster : ndarray of shape (n_left_cluster,)
416
+ Indices of the left cluster weights.
417
+
418
+ right_cluster : ndarray of shape (n_right_cluster,)
419
+ Indices of the right cluster weights.
420
+
421
+ Returns
422
+ -------
423
+ value : float
424
+ The transformed split factor alpha incorporating the weight constraints.
425
+ """
426
+ alpha = min(
427
+ np.sum(max_weights[left_cluster]) / weights[left_cluster[0]],
428
+ max(np.sum(min_weights[left_cluster]) / weights[left_cluster[0]], alpha),
429
+ )
430
+ alpha = 1 - min(
431
+ np.sum(max_weights[right_cluster]) / weights[right_cluster[0]],
432
+ max(
433
+ np.sum(min_weights[right_cluster]) / weights[right_cluster[0]],
434
+ 1 - alpha,
435
+ ),
436
+ )
437
+ return alpha
@@ -653,7 +653,7 @@ class Population(list):
653
653
  spacing: float | None = None,
654
654
  display_sub_ptf_name: bool = True,
655
655
  ) -> go.Figure:
656
- """Plot the contribution of each asset to a given measure of the portfolios
656
+ r"""Plot the contribution of each asset to a given measure of the portfolios
657
657
  in the population.
658
658
 
659
659
  Parameters
@@ -1,7 +1,13 @@
1
- from skfolio.pre_selection._pre_selection import (
2
- DropCorrelated,
3
- SelectKExtremes,
4
- SelectNonDominated,
5
- )
1
+ from skfolio.pre_selection._drop_correlated import DropCorrelated
2
+ from skfolio.pre_selection._select_complete import SelectComplete
3
+ from skfolio.pre_selection._select_k_extremes import SelectKExtremes
4
+ from skfolio.pre_selection._select_non_dominated import SelectNonDominated
5
+ from skfolio.pre_selection._select_non_expiring import SelectNonExpiring
6
6
 
7
- __all__ = ["DropCorrelated", "SelectKExtremes", "SelectNonDominated"]
7
+ __all__ = [
8
+ "DropCorrelated",
9
+ "SelectKExtremes",
10
+ "SelectNonDominated",
11
+ "SelectComplete",
12
+ "SelectNonExpiring",
13
+ ]
@@ -0,0 +1,108 @@
1
+ """Pre-selection DropCorrelated module"""
2
+
3
+ # Copyright (c) 2023
4
+ # Author: Hugo Delatte <delatte.hugo@gmail.com>
5
+ # License: BSD 3 clause
6
+
7
+ import numpy as np
8
+ import numpy.typing as npt
9
+ import sklearn.base as skb
10
+ import sklearn.feature_selection as skf
11
+ import sklearn.utils.validation as skv
12
+
13
+
14
+ class DropCorrelated(skf.SelectorMixin, skb.BaseEstimator):
15
+ """Transformer for dropping highly correlated assets.
16
+
17
+ Simply removing all correlation pairs above the threshold will remove more assets
18
+ than necessary and a naive sequential removal is suboptimal and depends on the
19
+ initial assets ordering.
20
+
21
+ Let's suppose X,Y,Z are three random variables with corr(X,Y) and corr(X,Z) above
22
+ the threshold and corr(Y,Z) below.
23
+ The first approach would remove X,Y,Z and the second approach would remove either
24
+ Y and Z or X depending on the initial ordering.
25
+
26
+ To avoid these shortcomings, we implement the below algorithm:
27
+
28
+ * Step 1: select all correlation pairs above the threshold.
29
+ * Step 2: sort all the selected correlation pairs from highest to lowest.
30
+ * Step 3: for each pair, if neither of the two assets has been removed, keep the
31
+ asset with the lowest average correlation against the other assets.
32
+
33
+ Parameters
34
+ ----------
35
+ threshold : float, default=0.95
36
+ Correlation threshold. The default value is `0.95`.
37
+
38
+ absolute : bool, default=False
39
+ If this is set to True, we take the absolute value of the correlation. This has
40
+ the effect of also including negatively correlated assets.
41
+
42
+ Attributes
43
+ ----------
44
+ to_keep_ : ndarray of shape (n_assets, )
45
+ Boolean array indicating which assets are remaining.
46
+
47
+ n_features_in_ : int
48
+ Number of assets seen during `fit`.
49
+
50
+ feature_names_in_ : ndarray of shape (`n_features_in_`,)
51
+ Names of assets seen during `fit`. Defined only when `X`
52
+ has assets names that are all strings.
53
+ """
54
+
55
+ to_keep_: np.ndarray
56
+
57
+ def __init__(self, threshold: float = 0.95, absolute: bool = False):
58
+ self.threshold = threshold
59
+ self.absolute = absolute
60
+
61
+ def fit(self, X: npt.ArrayLike, y=None):
62
+ """Run the correlation transformer and get the appropriate assets.
63
+
64
+ Parameters
65
+ ----------
66
+ X : array-like of shape (n_observations, n_assets)
67
+ Price returns of the assets.
68
+
69
+ y : Ignored
70
+ Not used, present for API consistency by convention.
71
+
72
+ Returns
73
+ -------
74
+ self : DropCorrelated
75
+ Fitted estimator.
76
+ """
77
+ X = self._validate_data(X)
78
+ if not -1 <= self.threshold <= 1:
79
+ raise ValueError("`threshold` must be between -1 and 1")
80
+
81
+ n_assets = X.shape[1]
82
+ corr = np.corrcoef(X.T)
83
+ mean_corr = corr.mean(axis=0)
84
+
85
+ triu_idx = np.triu_indices(n_assets, 1)
86
+
87
+ # select all correlation pairs above the threshold
88
+ selected_idx = np.argwhere(corr[triu_idx] > self.threshold).flatten()
89
+
90
+ # sort all the selected correlation pairs from highest to lowest
91
+ selected_idx = selected_idx[np.argsort(-corr[triu_idx][selected_idx])]
92
+
93
+ # for each pair, if none of the two assets has been removed, keep the asset with
94
+ # the lowest average correlation with other assets
95
+ to_remove = set()
96
+ for idx in selected_idx:
97
+ i, j = triu_idx[0][idx], triu_idx[1][idx]
98
+ if i not in to_remove and j not in to_remove:
99
+ if mean_corr[i] > mean_corr[j]:
100
+ to_remove.add(i)
101
+ else:
102
+ to_remove.add(j)
103
+ self.to_keep_ = ~np.isin(np.arange(n_assets), list(to_remove))
104
+ return self
105
+
106
+ def _get_support_mask(self):
107
+ skv.check_is_fitted(self)
108
+ return self.to_keep_
@@ -0,0 +1,116 @@
1
+ """Pre-selection SelectComplete module"""
2
+
3
+ # Copyright (c) 2023
4
+ # Author: Hugo Delatte <delatte.hugo@gmail.com>
5
+ # License: BSD 3 clause
6
+
7
+ import numpy as np
8
+ import numpy.typing as npt
9
+ import sklearn.base as skb
10
+ import sklearn.feature_selection as skf
11
+ import sklearn.utils.validation as skv
12
+
13
+
14
+ class SelectComplete(skf.SelectorMixin, skb.BaseEstimator):
15
+ """
16
+ Transformer to select assets with complete data across the entire observation
17
+ period.
18
+
19
+ This transformer removes assets (columns) that have missing values (NaNs) at the
20
+ beginning or end of the period.
21
+
22
+ This transformer is especially useful for financial datasets where assets
23
+ (e.g., stocks, bonds) may have data gaps due to late inception (assets that started
24
+ trading later), early expiry or default (assets that stopped trading before the
25
+ end of the period).
26
+
27
+ If missing values are not at the beginning or end but occur between non-missing
28
+ values, the asset is not removed unless `drop_assets_with_internal_nan` is set to
29
+ `True`.
30
+
31
+ Parameters
32
+ ----------
33
+ drop_assets_with_internal_nan : bool, default=False
34
+ If set to True, assets with missing values (NaNs) that appear between
35
+ non-missing values (i.e., internal NaNs) will also be removed. By default,
36
+ only assets with leading or trailing NaNs are removed.
37
+
38
+ Attributes
39
+ ----------
40
+ to_keep_ : ndarray of shape (n_assets, )
41
+ Boolean array indicating which assets are remaining.
42
+
43
+ n_features_in_ : int
44
+ Number of assets seen during `fit`.
45
+
46
+ feature_names_in_ : ndarray of shape (`n_features_in_`,)
47
+ Names of features seen during `fit`. Defined only when `X`
48
+ has feature names that are all strings.
49
+
50
+ Examples
51
+ --------
52
+
53
+ >>> import numpy as np
54
+ >>> import pandas as pd
55
+ >>> from skfolio.pre_selection import SelectComplete
56
+ >>> X = pd.DataFrame({
57
+ ... 'asset1': [np.nan, np.nan, 2, 3, 4], # Starts late (inception)
58
+ ... 'asset2': [1, 2, 3, 4, 5], # Complete data
59
+ ... 'asset3': [1, 2, 3, np.nan, 5], # Missing values within data
60
+ ... 'asset4': [1, 2, 3, 4, np.nan] # Ends early (expiration)
61
+ ... })
62
+ >>> selector = SelectComplete()
63
+ >>> selector.fit_transform(X)
64
+ array([[ 1., 1.],
65
+ [ 2., 2.],
66
+ [ 3., 3.],
67
+ [ 4., nan],
68
+ [ 5., 5.]])
69
+ >>> selector = SelectComplete(drop_assets_with_internal_nan=True)
70
+ >>> selector.fit_transform(X)
71
+ array([[1.],
72
+ [2.],
73
+ [3.],
74
+ [4.],
75
+ [5.]])
76
+ """
77
+
78
+ to_keep_: np.ndarray
79
+
80
+ def __init__(self, drop_assets_with_internal_nan: bool = False):
81
+ self.drop_assets_with_internal_nan = drop_assets_with_internal_nan
82
+
83
+ def fit(self, X: npt.ArrayLike, y=None) -> "SelectComplete":
84
+ """Run the SelectComplete transformer and get the appropriate assets.
85
+
86
+ Parameters
87
+ ----------
88
+ X : array-like of shape (n_observations, n_assets)
89
+ Returns of the assets.
90
+
91
+ y : Ignored
92
+ Not used, present for API consistency by convention.
93
+
94
+ Returns
95
+ -------
96
+ self : SelectComplete
97
+ Fitted estimator.
98
+ """
99
+ # Validate by allowing NaNs
100
+ X = self._validate_data(X, force_all_finite="allow-nan")
101
+
102
+ if self.drop_assets_with_internal_nan:
103
+ # Identify columns with any NaNs
104
+ self.to_keep_ = ~np.isnan(X).any(axis=0)
105
+ else:
106
+ # Identify columns with no leading or trailing NaNs
107
+ self.to_keep_ = ~np.isnan(X[0, :]) & ~np.isnan(X[-1, :])
108
+
109
+ return self
110
+
111
+ def _get_support_mask(self):
112
+ skv.check_is_fitted(self)
113
+ return self.to_keep_
114
+
115
+ def _more_tags(self):
116
+ return {"allow_nan": True}