skfolio 0.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- skfolio/__init__.py +29 -0
- skfolio/cluster/__init__.py +8 -0
- skfolio/cluster/_hierarchical.py +387 -0
- skfolio/datasets/__init__.py +20 -0
- skfolio/datasets/_base.py +389 -0
- skfolio/datasets/data/__init__.py +0 -0
- skfolio/datasets/data/factors_dataset.csv.gz +0 -0
- skfolio/datasets/data/sp500_dataset.csv.gz +0 -0
- skfolio/datasets/data/sp500_index.csv.gz +0 -0
- skfolio/distance/__init__.py +26 -0
- skfolio/distance/_base.py +55 -0
- skfolio/distance/_distance.py +574 -0
- skfolio/exceptions.py +30 -0
- skfolio/measures/__init__.py +76 -0
- skfolio/measures/_enums.py +355 -0
- skfolio/measures/_measures.py +607 -0
- skfolio/metrics/__init__.py +3 -0
- skfolio/metrics/_scorer.py +121 -0
- skfolio/model_selection/__init__.py +18 -0
- skfolio/model_selection/_combinatorial.py +407 -0
- skfolio/model_selection/_validation.py +194 -0
- skfolio/model_selection/_walk_forward.py +221 -0
- skfolio/moments/__init__.py +41 -0
- skfolio/moments/covariance/__init__.py +29 -0
- skfolio/moments/covariance/_base.py +101 -0
- skfolio/moments/covariance/_covariance.py +1108 -0
- skfolio/moments/expected_returns/__init__.py +21 -0
- skfolio/moments/expected_returns/_base.py +31 -0
- skfolio/moments/expected_returns/_expected_returns.py +415 -0
- skfolio/optimization/__init__.py +36 -0
- skfolio/optimization/_base.py +147 -0
- skfolio/optimization/cluster/__init__.py +13 -0
- skfolio/optimization/cluster/_nco.py +348 -0
- skfolio/optimization/cluster/hierarchical/__init__.py +13 -0
- skfolio/optimization/cluster/hierarchical/_base.py +440 -0
- skfolio/optimization/cluster/hierarchical/_herc.py +406 -0
- skfolio/optimization/cluster/hierarchical/_hrp.py +368 -0
- skfolio/optimization/convex/__init__.py +16 -0
- skfolio/optimization/convex/_base.py +1944 -0
- skfolio/optimization/convex/_distributionally_robust.py +392 -0
- skfolio/optimization/convex/_maximum_diversification.py +417 -0
- skfolio/optimization/convex/_mean_risk.py +974 -0
- skfolio/optimization/convex/_risk_budgeting.py +560 -0
- skfolio/optimization/ensemble/__init__.py +6 -0
- skfolio/optimization/ensemble/_base.py +87 -0
- skfolio/optimization/ensemble/_stacking.py +326 -0
- skfolio/optimization/naive/__init__.py +3 -0
- skfolio/optimization/naive/_naive.py +173 -0
- skfolio/population/__init__.py +3 -0
- skfolio/population/_population.py +883 -0
- skfolio/portfolio/__init__.py +13 -0
- skfolio/portfolio/_base.py +1096 -0
- skfolio/portfolio/_multi_period_portfolio.py +610 -0
- skfolio/portfolio/_portfolio.py +842 -0
- skfolio/pre_selection/__init__.py +7 -0
- skfolio/pre_selection/_pre_selection.py +342 -0
- skfolio/preprocessing/__init__.py +3 -0
- skfolio/preprocessing/_returns.py +114 -0
- skfolio/prior/__init__.py +18 -0
- skfolio/prior/_base.py +63 -0
- skfolio/prior/_black_litterman.py +238 -0
- skfolio/prior/_empirical.py +163 -0
- skfolio/prior/_factor_model.py +268 -0
- skfolio/typing.py +50 -0
- skfolio/uncertainty_set/__init__.py +23 -0
- skfolio/uncertainty_set/_base.py +108 -0
- skfolio/uncertainty_set/_bootstrap.py +281 -0
- skfolio/uncertainty_set/_empirical.py +237 -0
- skfolio/utils/__init__.py +0 -0
- skfolio/utils/bootstrap.py +115 -0
- skfolio/utils/equations.py +350 -0
- skfolio/utils/sorting.py +117 -0
- skfolio/utils/stats.py +466 -0
- skfolio/utils/tools.py +567 -0
- skfolio-0.0.1.dist-info/LICENSE +29 -0
- skfolio-0.0.1.dist-info/METADATA +568 -0
- skfolio-0.0.1.dist-info/RECORD +79 -0
- skfolio-0.0.1.dist-info/WHEEL +5 -0
- skfolio-0.0.1.dist-info/top_level.txt +1 -0
@@ -0,0 +1,121 @@
|
|
1
|
+
"""Scorer module"""
|
2
|
+
|
3
|
+
# Author: Hugo Delatte <delatte.hugo@gmail.com>
|
4
|
+
# License: BSD 3 clause
|
5
|
+
|
6
|
+
from collections.abc import Callable
|
7
|
+
|
8
|
+
import numpy.typing as npt
|
9
|
+
|
10
|
+
import skfolio.typing as skt
|
11
|
+
from skfolio.optimization import BaseOptimization
|
12
|
+
from skfolio.portfolio import Portfolio
|
13
|
+
|
14
|
+
|
15
|
+
class _PortfolioScorer:
    """Portfolio Scorer wrapper.

    Wraps a score function so that it can be called with the signature
    `scorer(estimator, X)`: the estimator prediction on `X` is passed to the
    score function and the result is multiplied by `sign`.

    Parameters
    ----------
    score_func : callable
        Score function called as `score_func(pred, **kwargs)` where `pred` is the
        value returned by `estimator.predict(X)`.

    sign : int
        Multiplier applied to the value returned by `score_func`.
        `make_scorer` passes `1` when greater is better and `-1` otherwise.

    kwargs : dict
        Additional keyword arguments forwarded to `score_func`.
    """

    def __init__(self, score_func: Callable, sign: int, kwargs: dict):
        self._score_func = score_func
        self._kwargs = kwargs
        self._sign = sign

    def __repr__(self) -> str:
        """String representation of the `PortfolioScorer`."""
        # Not every callable exposes `__name__` (e.g. `functools.partial` objects
        # or instances implementing `__call__`); fall back to their own repr
        # instead of raising `AttributeError`.
        func_name = getattr(self._score_func, "__name__", None)
        if func_name is None:
            func_name = repr(self._score_func)
        kwargs_string = "".join(f", {k}={v}" for k, v in self._kwargs.items())
        return (
            f"make_scorer({func_name}"
            f"{'' if self._sign > 0 else ', greater_is_better=False'}"
            f"{kwargs_string})"
        )

    def __call__(self, estimator: BaseOptimization, X: npt.ArrayLike) -> float:
        """Compute the score of the estimator prediction on X.

        Parameters
        ----------
        estimator : BaseOptimization
            Trained estimator to use for scoring.

        X : array-like of shape (n_observations, n_assets)
            Test data that will be fed to estimator.predict.

        Returns
        -------
        score : float
            Score of the estimator prediction on X.
        """
        pred = estimator.predict(X)
        return self._sign * self._score_func(pred, **self._kwargs)
|
50
|
+
|
51
|
+
|
52
|
+
def make_scorer(
    score_func: skt.Measure | Callable,
    greater_is_better: bool | None = None,
    **kwargs,
) -> Callable:
    """Make a scorer from a :ref:`measure <measures_ref>` or from a custom score
    function.

    This is a modified version from `scikit-learn` `make_scorer` for enhanced
    functionalities with `Portfolio` objects.

    This factory function wraps scoring functions for use in
    `sklearn.model_selection.GridSearchCV` and
    `sklearn.model_selection.cross_val_score`.

    Parameters
    ----------
    score_func : Measure | callable
        If `score_func` is a :ref:`measure <measures_ref>`, we return the measure of
        the predicted :class:`~skfolio.portfolio.Portfolio` times `1` or `-1`
        depending on the `greater_is_better` parameter.

        Otherwise, `score_func` must be a score function (or loss function) with
        signature `score_func(pred, **kwargs)`. The argument `pred` is the predicted
        :class:`~skfolio.portfolio.Portfolio`.

        Note that you can convert this portfolio object into a numpy array of price
        returns with `np.asarray(pred)`.

    greater_is_better : bool, optional
        If this is set to True, `score_func` is a score function (default) meaning high
        is good, otherwise it is a loss function, meaning low is good.
        In the latter case, the scorer object will sign-flip the outcome of the
        `score_func`.
        The default (`None`) is to use:

            * If `score_func` is a :ref:`measure <measures_ref>`:

                * True for `PerfMeasure` and `RatioMeasure`
                * False for `RiskMeasure` and `ExtraRiskMeasure`.

            * Otherwise, True.

    **kwargs : additional arguments
        Additional parameters to be passed to score_func.
        Only supported when `score_func` is a callable.

    Returns
    -------
    scorer : callable
        Callable object that returns a scalar score.

    Raises
    ------
    TypeError
        If `score_func` is neither a callable nor a measure, or if `**kwargs` are
        provided together with a measure.
    """
    if callable(score_func):
        if greater_is_better is None:
            greater_is_better = True

    else:
        measure = score_func
        if not isinstance(measure, skt.Measure):
            raise TypeError("`score_func` must be a callable or a measure")
        if kwargs:
            # The measure is read as an attribute of the predicted portfolio by a
            # function accepting only `pred`, so any keyword argument would make
            # every scorer call fail. Fail here with an explicit message instead.
            raise TypeError(
                "Additional keyword arguments are only supported when `score_func` "
                f"is a callable, got {sorted(kwargs)} with measure {measure}."
            )
        if greater_is_better is None:
            greater_is_better = bool(measure.is_perf or measure.is_ratio)

        def score_func(pred: Portfolio) -> float:
            """Score function"""
            return getattr(pred, measure.value)

    sign = 1 if greater_is_better else -1
    return _PortfolioScorer(score_func, sign, kwargs)
|
@@ -0,0 +1,18 @@
|
|
1
|
+
"""Model selection module"""
|
2
|
+
|
3
|
+
# Author: Hugo Delatte <delatte.hugo@gmail.com>
|
4
|
+
# License: BSD 3 clause
|
5
|
+
|
6
|
+
from skfolio.model_selection._combinatorial import (
|
7
|
+
BaseCombinatorialCV,
|
8
|
+
CombinatorialPurgedCV,
|
9
|
+
)
|
10
|
+
from skfolio.model_selection._validation import cross_val_predict
|
11
|
+
from skfolio.model_selection._walk_forward import WalkForward
|
12
|
+
|
13
|
+
__all__ = [
|
14
|
+
"cross_val_predict",
|
15
|
+
"WalkForward",
|
16
|
+
"BaseCombinatorialCV",
|
17
|
+
"CombinatorialPurgedCV",
|
18
|
+
]
|
@@ -0,0 +1,407 @@
|
|
1
|
+
"""Combinatorial module"""
|
2
|
+
|
3
|
+
# Author: Hugo Delatte <delatte.hugo@gmail.com>
|
4
|
+
# License: BSD 3 clause
|
5
|
+
|
6
|
+
import itertools
|
7
|
+
import math
|
8
|
+
import numbers
|
9
|
+
from abc import ABC, abstractmethod
|
10
|
+
from collections.abc import Iterator
|
11
|
+
|
12
|
+
import numpy as np
|
13
|
+
import numpy.typing as npt
|
14
|
+
import pandas as pd
|
15
|
+
import plotly.graph_objects as go
|
16
|
+
import sklearn.model_selection as skm
|
17
|
+
import sklearn.utils as sku
|
18
|
+
|
19
|
+
import skfolio.typing as skt
|
20
|
+
|
21
|
+
|
22
|
+
class BaseCombinatorialCV(ABC):
    """Base class for all combinatorial cross-validators.

    Implementations must define `split` and `get_path_ids`.
    """

    @abstractmethod
    def split(
        self, X: npt.ArrayLike, y=None, groups=None
    ) -> Iterator[tuple[np.ndarray, list[np.ndarray]]]:
        """Generate indices to split data into a training set and test sets.

        Parameters
        ----------
        X : array-like
            Data to split.

        y : optional
            Target variable.

        groups : optional
            Group labels for the samples.

        Yields
        ------
        train : ndarray
            The training set indices for that split.

        tests : list of ndarray
            The test set indices of each test set for that split.
        """
        pass

    @abstractmethod
    def get_path_ids(self) -> np.ndarray:
        """Return the path id of each test sets in each split"""
        pass

    # Borrow the `__repr__` implementation of scikit-learn's `BaseCrossValidator`.
    __repr__ = skm.BaseCrossValidator.__repr__
|
38
|
+
|
39
|
+
|
40
|
+
# TODO: review params and function naming
|
41
|
+
class CombinatorialPurgedCV(BaseCombinatorialCV):
    """Combinatorial Purged Cross-Validation.

    Provides train/test indices to split time series data samples based on
    Combinatorial Purged Cross-Validation [1]_.

    Compared to `KFold` which splits the data into `k` folds with `1` fold for the
    test set and `k - 1` folds for the training set, `CombinatorialPurgedCV` uses
    `k - p` folds for the training set with `p > 1` being the number of test folds.

    `KFold` can recombine one single testing path while `CombinatorialPurgedCV` can
    recombine multiple testing paths from the combinations of the train/test sets.

    To avoid data leakage, purging and embargoing can be performed.

    Purging consists of removing from the training set all observations whose labels
    overlapped in time with those labels included in the testing set.

    Embargoing consists of removing from the training set all observations that
    immediately follow an observation in the testing set since financial features
    often incorporate series that exhibit serial correlation (like ARMA processes).

    Parameters
    ----------
    n_folds : int, default=10
        Number of folds. Must be at least 3.

    n_test_folds : int, default=8
        Number of test folds. Must be at least 2.
        For only one test fold, use `sklearn.model_selection.KFold`.

    purged_size : int, default=0
        Number of observations to exclude from the start of each train set that are
        after a test set **and** the number of observations to exclude from the end of
        each training set that are before a test set.

    embargo_size : int, default=0
        Number of observations to exclude from the start of each training set that are
        after a test set.

    Attributes
    ----------
    index_train_test_ : ndarray of shape (n_observations, n_splits)
        Set when iterating over `split`. For each split (column), `0` marks the
        training observations, `1` the test observations and `-1` the purged or
        embargoed observations.

    Examples
    --------
    >>> import numpy as np
    >>> from skfolio.model_selection import CombinatorialPurgedCV
    >>> X = np.random.randn(12, 2)
    >>> cv = CombinatorialPurgedCV(n_folds=3, n_test_folds=2)
    >>> for i, (train_index, tests) in enumerate(cv.split(X)):
    ...     print(f"Split {i}:")
    ...     print(f"  Train: index={train_index}")
    ...     for j, test_index in enumerate(tests):
    ...         print(f"  Test {j}: index={test_index}")
    Split 0:
      Train: index=[ 8  9 10 11]
      Test 0: index=[0 1 2 3]
      Test 1: index=[4 5 6 7]
    Split 1:
      Train: index=[4 5 6 7]
      Test 0: index=[0 1 2 3]
      Test 1: index=[ 8  9 10 11]
    Split 2:
      Train: index=[0 1 2 3]
      Test 0: index=[4 5 6 7]
      Test 1: index=[ 8  9 10 11]
    >>> cv = CombinatorialPurgedCV(n_folds=3, n_test_folds=2, purged_size=1)
    >>> for i, (train_index, tests) in enumerate(cv.split(X)):
    ...     print(f"Split {i}:")
    ...     print(f"  Train: index={train_index}")
    ...     for j, test_index in enumerate(tests):
    ...         print(f"  Test {j}: index={test_index}")
    Split 0:
      Train: index=[ 9 10 11]
      Test 0: index=[0 1 2 3]
      Test 1: index=[4 5 6 7]
    Split 1:
      Train: index=[5 6]
      Test 0: index=[0 1 2 3]
      Test 1: index=[ 8  9 10 11]
    Split 2:
      Train: index=[0 1 2]
      Test 0: index=[4 5 6 7]
      Test 1: index=[ 8  9 10 11]
    >>> cv = CombinatorialPurgedCV(n_folds=3, n_test_folds=2, embargo_size=1)
    >>> for i, (train_index, tests) in enumerate(cv.split(X)):
    ...     print(f"Split {i}:")
    ...     print(f"  Train: index={train_index}")
    ...     for j, test_index in enumerate(tests):
    ...         print(f"  Test {j}: index={test_index}")
    Split 0:
      Train: index=[ 9 10 11]
      Test 0: index=[0 1 2 3]
      Test 1: index=[4 5 6 7]
    Split 1:
      Train: index=[5 6 7]
      Test 0: index=[0 1 2 3]
      Test 1: index=[ 8  9 10 11]
    Split 2:
      Train: index=[0 1 2 3]
      Test 0: index=[4 5 6 7]
      Test 1: index=[ 8  9 10 11]

    References
    ----------
    .. [1] "Advances in Financial Machine Learning",
        Marcos López de Prado (2018)
    """

    index_train_test_: np.ndarray

    def __init__(
        self,
        n_folds: int = 10,
        n_test_folds: int = 8,
        purged_size: int = 0,
        embargo_size: int = 0,
    ):
        if not isinstance(n_folds, numbers.Integral):
            raise ValueError(
                "The number of folds must be of Integral type. "
                f"{n_folds} of type {type(n_folds)} was passed."
            )
        n_folds = int(n_folds)

        # The other sizes are later used with `range` and `itertools.combinations`,
        # which only accept integers: reject anything else upfront.
        for name, value in (
            ("n_test_folds", n_test_folds),
            ("purged_size", purged_size),
            ("embargo_size", embargo_size),
        ):
            if not isinstance(value, numbers.Integral):
                raise ValueError(
                    f"`{name}` must be of Integral type. "
                    f"{value} of type {type(value)} was passed."
                )

        if n_folds <= 2:
            raise ValueError(f"`n_folds` must be at least 3, got `n_folds={n_folds}`.")

        if n_test_folds <= 1:
            raise ValueError(
                f"`n_test_folds` must be at least 2, got `n_test_folds={n_test_folds}`."
            )

        if n_test_folds >= n_folds:
            raise ValueError(
                "Combinatorial purged cross-validation requires `n_folds` "
                "to be greater than `n_test_folds`."
            )

        if purged_size < 0:
            raise ValueError("`purged_size` cannot be negative")

        if embargo_size < 0:
            raise ValueError("`embargo_size` cannot be negative")

        self.n_folds = n_folds
        self.n_test_folds = n_test_folds
        self.purged_size = purged_size
        self.embargo_size = embargo_size

    @property
    def n_splits(self) -> int:
        """Number of splits"""
        # Exact integer binomial coefficient: the number of ways to choose the
        # `n_test_folds` test folds among the `n_folds` folds.
        return math.comb(self.n_folds, self.n_test_folds)

    @property
    def n_test_paths(self) -> int:
        """Number of test paths that can be reconstructed from the train/test
        combinations"""
        return self.n_splits * self.n_test_folds // self.n_folds

    @property
    def test_set_index(self) -> np.ndarray:
        """Location of each test set"""
        # One row per split, containing the ids of the folds used as test sets.
        return np.array(
            list(itertools.combinations(np.arange(self.n_folds), self.n_test_folds))
        ).reshape(-1, self.n_test_folds)

    @property
    def binary_train_test_sets(self) -> np.ndarray:
        """Identify training and test folds for each combinations by assigning `0` to
        training folds and `1` to test folds"""
        folds_train_test = np.zeros((self.n_folds, self.n_splits))
        # Row index: test fold ids of each split, column index: the split itself.
        folds_train_test[
            self.test_set_index, np.arange(self.n_splits)[:, np.newaxis]
        ] = 1
        return folds_train_test

    @property
    def recombined_paths(self) -> np.ndarray:
        """Recombine each test path by returning the test set location in each split."""
        # Array of shape (n_folds, n_test_paths): row `f` lists the splits in which
        # fold `f` is used as a test fold.
        return np.argwhere(self.binary_train_test_sets == 1)[:, 1].reshape(
            self.n_folds, -1
        )

    def get_path_ids(self) -> np.ndarray:
        """Return the path id of each test sets in each split"""
        recombined_paths = self.recombined_paths
        path_ids = np.zeros((self.n_splits, self.n_test_folds), dtype=int)
        for i in range(self.n_splits):
            # Each split appears once per test fold; the column of each occurrence
            # is the id of the path this test set belongs to.
            path_ids[i] = np.argwhere(recombined_paths == i)[:, 1]
        return path_ids

    def split(
        self, X: npt.ArrayLike, y=None, groups=None
    ) -> Iterator[tuple[np.ndarray, list[np.ndarray]]]:
        """Generate indices to split data into training and test set.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training data, where `n_samples` is the number of samples
            and `n_features` is the number of features.

        y : array-like of shape (n_samples,), optional
            The (multi-)target variable

        groups : array-like of shape (n_samples,), optional
            Group labels for the samples used while splitting the dataset into
            train/test set. Not used by this splitter.

        Yields
        ------
        train : ndarray
            The training set indices for that split.

        tests : list of ndarray
            The testing set indices of each test fold for that split.

        Raises
        ------
        ValueError
            If there are fewer samples than folds, or if the purged and embargo
            sizes are too large compared to the fold size.
        """
        recombined_paths = self.recombined_paths

        X, y = sku.indexable(X, y)
        n_samples = X.shape[0]
        if n_samples < self.n_folds:
            raise ValueError(
                f"Cannot have number of folds `n_folds={self.n_folds}` greater than "
                f"the number of samples `n_samples={n_samples}`."
            )
        min_fold_size = n_samples // self.n_folds
        if self.purged_size + self.embargo_size >= min_fold_size - 1:
            raise ValueError(
                "The sum of `purged_size` and `embargo_size` must be smaller than the"
                f" size of a train fold which is {min_fold_size}"
            )

        # Fold id of each observation. When `n_samples` is not a multiple of
        # `n_folds`, the trailing observations would get a fold id of `n_folds` or
        # more: they are all assigned to the last fold.
        fold_index_num = np.minimum(
            np.arange(n_samples) // min_fold_size, self.n_folds - 1
        )

        # Shape (n_samples, n_splits): `1` where the fold of the observation is a
        # test fold of the split, `0` otherwise (fancy indexing returns a copy).
        index_train_test = self.binary_train_test_sets[fold_index_num]

        # `diff[r] == 1`: row `r` is train and row `r + 1` is test.
        # `diff[r] == -1`: row `r` is test and row `r + 1` is train.
        diff = np.diff(index_train_test, axis=0)

        # Purge before
        before_index = np.argwhere(diff == 1)
        for k in range(self.purged_size):
            rows = np.maximum(0, before_index[:, 0] - k)
            index_train_test[rows, before_index[:, 1]] = -1

        # Purge after and Embargo
        after_index = np.argwhere(diff == -1)
        for k in range(self.purged_size + self.embargo_size):
            rows = np.minimum(n_samples - 1, after_index[:, 0] + k + 1)
            index_train_test[rows, after_index[:, 1]] = -1
        self.index_train_test_ = index_train_test

        fold_index = {
            fold_id: np.flatnonzero(fold_index_num == fold_id)
            for fold_id in range(self.n_folds)
        }
        for i in range(self.n_splits):
            train_index = np.flatnonzero(index_train_test[:, i] == 0)
            test_index_list = [
                fold_index[fold_id]
                for fold_id, _ in np.argwhere(recombined_paths == i)
            ]
            yield train_index, test_index_list

    def summary(self, X) -> pd.Series:
        """Summarize the cross-validation settings for the data `X`.

        Parameters
        ----------
        X : array-like exposing a `shape` attribute
            Data whose first dimension is the number of observations.

        Returns
        -------
        summary : Series
            Number of observations, fold and test fold counts, purge and embargo
            sizes, average training size, number of test paths and number of
            training combinations.
        """
        n_samples = X.shape[0]
        return pd.Series(
            {
                "Number of Observations": n_samples,
                "Total Number of Folds": self.n_folds,
                "Number of Test Folds": self.n_test_folds,
                "Purge Size": self.purged_size,
                "Embargo Size": self.embargo_size,
                "Average Training Size": int(
                    n_samples / self.n_folds * (self.n_folds - self.n_test_folds)
                ),
                "Number of Test Paths": self.n_test_paths,
                "Number of Training Combinations": self.n_splits,
            }
        )

    def plot_train_test_folds(self) -> skt.Figure:
        """Plot the train/test fold locations"""
        values = self.binary_train_test_sets
        fill_color = np.where(values == 0, "blue", "red")
        fill_color = fill_color.astype(object)
        # Prepend the first table column holding the split ids.
        fill_color = np.insert(
            fill_color, 0, np.array(["darkblue" for _ in range(self.n_splits)]), axis=0
        )
        values = np.insert(values, 0, np.arange(self.n_splits), axis=0)
        fig = go.Figure(
            data=[
                go.Table(
                    header=dict(
                        values=["Train Combinations"]
                        + [f"Fold {i}" for i in range(self.n_folds)],
                        fill_color="darkblue",
                        font=dict(color="white"),
                        align="left",
                    ),
                    cells=dict(
                        values=values,
                        font=dict(color="white"),
                        fill_color=fill_color,
                        line_color="grey",
                        align="left",
                    ),
                )
            ]
        )
        fig.update_layout(title="Split Train (0) /Test (1) Folds per Combination")
        return fig

    def plot_train_test_index(self, X) -> skt.Figure:
        """Plot the training and test indices for each combinations by assigning `0` to
        training, `1` to test and `-1` to both purge and embargo indices."""
        # Advancing the generator once is enough to populate `index_train_test_`.
        next(self.split(X))
        n_samples = X.shape[0]
        cond = [
            self.index_train_test_ == -1,
            self.index_train_test_ == 0,
            self.index_train_test_ == 1,
        ]
        values = self.index_train_test_.T
        # Prepend the first table column holding the observation ids.
        values = np.insert(values, 0, np.arange(n_samples), axis=0)
        fill_color = np.select(cond, ["green", "blue", "red"]).T
        fill_color = fill_color.astype(object)
        fill_color = np.insert(
            fill_color, 0, np.array(["darkblue" for _ in range(n_samples)]), axis=0
        )
        fig = go.Figure(
            data=[
                go.Table(
                    header=dict(
                        values=["observations"]
                        + [f"Split {i}" for i in range(self.n_splits)],
                        fill_color="darkblue",
                        font=dict(color="white"),
                        align="left",
                    ),
                    cells=dict(
                        values=values,
                        font=dict(color="white"),
                        fill_color=fill_color,
                        line_color="grey",
                        align="left",
                    ),
                )
            ]
        )
        fig.update_layout(
            title="Train (0), Test (1) and Purge/Embargo (-1) observations per splits"
        )

        return fig
|