skfolio 0.3.1__tar.gz → 0.4.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {skfolio-0.3.1/src/skfolio.egg-info → skfolio-0.4.1}/PKG-INFO +2 -2
- {skfolio-0.3.1 → skfolio-0.4.1}/pyproject.toml +4 -3
- {skfolio-0.3.1 → skfolio-0.4.1}/src/skfolio/datasets/_base.py +1 -1
- {skfolio-0.3.1 → skfolio-0.4.1}/src/skfolio/measures/_measures.py +1 -1
- {skfolio-0.3.1 → skfolio-0.4.1}/src/skfolio/model_selection/_combinatorial.py +1 -1
- skfolio-0.4.1/src/skfolio/model_selection/_walk_forward.py +440 -0
- {skfolio-0.3.1 → skfolio-0.4.1}/src/skfolio/population/_population.py +215 -152
- {skfolio-0.3.1 → skfolio-0.4.1}/src/skfolio/portfolio/_base.py +48 -9
- {skfolio-0.3.1 → skfolio-0.4.1}/src/skfolio/portfolio/_multi_period_portfolio.py +45 -0
- {skfolio-0.3.1 → skfolio-0.4.1}/src/skfolio/portfolio/_portfolio.py +82 -46
- {skfolio-0.3.1 → skfolio-0.4.1}/src/skfolio/utils/tools.py +18 -1
- {skfolio-0.3.1 → skfolio-0.4.1/src/skfolio.egg-info}/PKG-INFO +2 -2
- {skfolio-0.3.1 → skfolio-0.4.1}/src/skfolio.egg-info/requires.txt +1 -1
- skfolio-0.3.1/src/skfolio/model_selection/_walk_forward.py +0 -226
- {skfolio-0.3.1 → skfolio-0.4.1}/LICENSE +0 -0
- {skfolio-0.3.1 → skfolio-0.4.1}/MANIFEST.in +0 -0
- {skfolio-0.3.1 → skfolio-0.4.1}/README.rst +0 -0
- {skfolio-0.3.1 → skfolio-0.4.1}/setup.cfg +0 -0
- {skfolio-0.3.1 → skfolio-0.4.1}/src/skfolio/__init__.py +0 -0
- {skfolio-0.3.1 → skfolio-0.4.1}/src/skfolio/cluster/__init__.py +0 -0
- {skfolio-0.3.1 → skfolio-0.4.1}/src/skfolio/cluster/_hierarchical.py +0 -0
- {skfolio-0.3.1 → skfolio-0.4.1}/src/skfolio/datasets/__init__.py +0 -0
- {skfolio-0.3.1 → skfolio-0.4.1}/src/skfolio/datasets/data/__init__.py +0 -0
- {skfolio-0.3.1 → skfolio-0.4.1}/src/skfolio/datasets/data/factors_dataset.csv.gz +0 -0
- {skfolio-0.3.1 → skfolio-0.4.1}/src/skfolio/datasets/data/sp500_dataset.csv.gz +0 -0
- {skfolio-0.3.1 → skfolio-0.4.1}/src/skfolio/datasets/data/sp500_index.csv.gz +0 -0
- {skfolio-0.3.1 → skfolio-0.4.1}/src/skfolio/distance/__init__.py +0 -0
- {skfolio-0.3.1 → skfolio-0.4.1}/src/skfolio/distance/_base.py +0 -0
- {skfolio-0.3.1 → skfolio-0.4.1}/src/skfolio/distance/_distance.py +0 -0
- {skfolio-0.3.1 → skfolio-0.4.1}/src/skfolio/exceptions.py +0 -0
- {skfolio-0.3.1 → skfolio-0.4.1}/src/skfolio/measures/__init__.py +0 -0
- {skfolio-0.3.1 → skfolio-0.4.1}/src/skfolio/measures/_enums.py +0 -0
- {skfolio-0.3.1 → skfolio-0.4.1}/src/skfolio/metrics/__init__.py +0 -0
- {skfolio-0.3.1 → skfolio-0.4.1}/src/skfolio/metrics/_scorer.py +0 -0
- {skfolio-0.3.1 → skfolio-0.4.1}/src/skfolio/model_selection/__init__.py +0 -0
- {skfolio-0.3.1 → skfolio-0.4.1}/src/skfolio/model_selection/_validation.py +0 -0
- {skfolio-0.3.1 → skfolio-0.4.1}/src/skfolio/moments/__init__.py +0 -0
- {skfolio-0.3.1 → skfolio-0.4.1}/src/skfolio/moments/covariance/__init__.py +0 -0
- {skfolio-0.3.1 → skfolio-0.4.1}/src/skfolio/moments/covariance/_base.py +0 -0
- {skfolio-0.3.1 → skfolio-0.4.1}/src/skfolio/moments/covariance/_denoise_covariance.py +0 -0
- {skfolio-0.3.1 → skfolio-0.4.1}/src/skfolio/moments/covariance/_detone_covariance.py +0 -0
- {skfolio-0.3.1 → skfolio-0.4.1}/src/skfolio/moments/covariance/_empirical_covariance.py +0 -0
- {skfolio-0.3.1 → skfolio-0.4.1}/src/skfolio/moments/covariance/_ew_covariance.py +0 -0
- {skfolio-0.3.1 → skfolio-0.4.1}/src/skfolio/moments/covariance/_gerber_covariance.py +0 -0
- {skfolio-0.3.1 → skfolio-0.4.1}/src/skfolio/moments/covariance/_graphical_lasso_cv.py +0 -0
- {skfolio-0.3.1 → skfolio-0.4.1}/src/skfolio/moments/covariance/_implied_covariance.py +0 -0
- {skfolio-0.3.1 → skfolio-0.4.1}/src/skfolio/moments/covariance/_ledoit_wolf.py +0 -0
- {skfolio-0.3.1 → skfolio-0.4.1}/src/skfolio/moments/covariance/_oas.py +0 -0
- {skfolio-0.3.1 → skfolio-0.4.1}/src/skfolio/moments/covariance/_shrunk_covariance.py +0 -0
- {skfolio-0.3.1 → skfolio-0.4.1}/src/skfolio/moments/expected_returns/__init__.py +0 -0
- {skfolio-0.3.1 → skfolio-0.4.1}/src/skfolio/moments/expected_returns/_base.py +0 -0
- {skfolio-0.3.1 → skfolio-0.4.1}/src/skfolio/moments/expected_returns/_empirical_mu.py +0 -0
- {skfolio-0.3.1 → skfolio-0.4.1}/src/skfolio/moments/expected_returns/_equilibrium_mu.py +0 -0
- {skfolio-0.3.1 → skfolio-0.4.1}/src/skfolio/moments/expected_returns/_ew_mu.py +0 -0
- {skfolio-0.3.1 → skfolio-0.4.1}/src/skfolio/moments/expected_returns/_shrunk_mu.py +0 -0
- {skfolio-0.3.1 → skfolio-0.4.1}/src/skfolio/optimization/__init__.py +0 -0
- {skfolio-0.3.1 → skfolio-0.4.1}/src/skfolio/optimization/_base.py +0 -0
- {skfolio-0.3.1 → skfolio-0.4.1}/src/skfolio/optimization/cluster/__init__.py +0 -0
- {skfolio-0.3.1 → skfolio-0.4.1}/src/skfolio/optimization/cluster/_nco.py +0 -0
- {skfolio-0.3.1 → skfolio-0.4.1}/src/skfolio/optimization/cluster/hierarchical/__init__.py +0 -0
- {skfolio-0.3.1 → skfolio-0.4.1}/src/skfolio/optimization/cluster/hierarchical/_base.py +0 -0
- {skfolio-0.3.1 → skfolio-0.4.1}/src/skfolio/optimization/cluster/hierarchical/_herc.py +0 -0
- {skfolio-0.3.1 → skfolio-0.4.1}/src/skfolio/optimization/cluster/hierarchical/_hrp.py +0 -0
- {skfolio-0.3.1 → skfolio-0.4.1}/src/skfolio/optimization/convex/__init__.py +0 -0
- {skfolio-0.3.1 → skfolio-0.4.1}/src/skfolio/optimization/convex/_base.py +0 -0
- {skfolio-0.3.1 → skfolio-0.4.1}/src/skfolio/optimization/convex/_distributionally_robust.py +0 -0
- {skfolio-0.3.1 → skfolio-0.4.1}/src/skfolio/optimization/convex/_maximum_diversification.py +0 -0
- {skfolio-0.3.1 → skfolio-0.4.1}/src/skfolio/optimization/convex/_mean_risk.py +0 -0
- {skfolio-0.3.1 → skfolio-0.4.1}/src/skfolio/optimization/convex/_risk_budgeting.py +0 -0
- {skfolio-0.3.1 → skfolio-0.4.1}/src/skfolio/optimization/ensemble/__init__.py +0 -0
- {skfolio-0.3.1 → skfolio-0.4.1}/src/skfolio/optimization/ensemble/_base.py +0 -0
- {skfolio-0.3.1 → skfolio-0.4.1}/src/skfolio/optimization/ensemble/_stacking.py +0 -0
- {skfolio-0.3.1 → skfolio-0.4.1}/src/skfolio/optimization/naive/__init__.py +0 -0
- {skfolio-0.3.1 → skfolio-0.4.1}/src/skfolio/optimization/naive/_naive.py +0 -0
- {skfolio-0.3.1 → skfolio-0.4.1}/src/skfolio/population/__init__.py +0 -0
- {skfolio-0.3.1 → skfolio-0.4.1}/src/skfolio/portfolio/__init__.py +0 -0
- {skfolio-0.3.1 → skfolio-0.4.1}/src/skfolio/pre_selection/__init__.py +0 -0
- {skfolio-0.3.1 → skfolio-0.4.1}/src/skfolio/pre_selection/_pre_selection.py +0 -0
- {skfolio-0.3.1 → skfolio-0.4.1}/src/skfolio/preprocessing/__init__.py +0 -0
- {skfolio-0.3.1 → skfolio-0.4.1}/src/skfolio/preprocessing/_returns.py +0 -0
- {skfolio-0.3.1 → skfolio-0.4.1}/src/skfolio/prior/__init__.py +0 -0
- {skfolio-0.3.1 → skfolio-0.4.1}/src/skfolio/prior/_base.py +0 -0
- {skfolio-0.3.1 → skfolio-0.4.1}/src/skfolio/prior/_black_litterman.py +0 -0
- {skfolio-0.3.1 → skfolio-0.4.1}/src/skfolio/prior/_empirical.py +0 -0
- {skfolio-0.3.1 → skfolio-0.4.1}/src/skfolio/prior/_factor_model.py +0 -0
- {skfolio-0.3.1 → skfolio-0.4.1}/src/skfolio/typing.py +0 -0
- {skfolio-0.3.1 → skfolio-0.4.1}/src/skfolio/uncertainty_set/__init__.py +0 -0
- {skfolio-0.3.1 → skfolio-0.4.1}/src/skfolio/uncertainty_set/_base.py +0 -0
- {skfolio-0.3.1 → skfolio-0.4.1}/src/skfolio/uncertainty_set/_bootstrap.py +0 -0
- {skfolio-0.3.1 → skfolio-0.4.1}/src/skfolio/uncertainty_set/_empirical.py +0 -0
- {skfolio-0.3.1 → skfolio-0.4.1}/src/skfolio/utils/__init__.py +0 -0
- {skfolio-0.3.1 → skfolio-0.4.1}/src/skfolio/utils/bootstrap.py +0 -0
- {skfolio-0.3.1 → skfolio-0.4.1}/src/skfolio/utils/equations.py +0 -0
- {skfolio-0.3.1 → skfolio-0.4.1}/src/skfolio/utils/sorting.py +0 -0
- {skfolio-0.3.1 → skfolio-0.4.1}/src/skfolio/utils/stats.py +0 -0
- {skfolio-0.3.1 → skfolio-0.4.1}/src/skfolio.egg-info/SOURCES.txt +0 -0
- {skfolio-0.3.1 → skfolio-0.4.1}/src/skfolio.egg-info/dependency_links.txt +0 -0
- {skfolio-0.3.1 → skfolio-0.4.1}/src/skfolio.egg-info/top_level.txt +0 -0

{skfolio-0.3.1/src/skfolio.egg-info → skfolio-0.4.1}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: skfolio
-Version: 0.3.1
+Version: 0.4.1
 Summary: Portfolio optimization built on top of scikit-learn
 Author-email: Hugo Delatte <delatte.hugo@gmail.com>
 Maintainer-email: Hugo Delatte <delatte.hugo@gmail.com>

@@ -56,7 +56,7 @@ Classifier: Topic :: Software Development
 Requires-Python: >=3.10
 Description-Content-Type: text/x-rst
 License-File: LICENSE
-Requires-Dist: numpy
+Requires-Dist: numpy>=1.23.4
 Requires-Dist: scipy>=1.8.0
 Requires-Dist: pandas>=1.4.1
 Requires-Dist: cvxpy>=1.4.1
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
4
4
|
|
5
5
|
[project]
|
6
6
|
name = "skfolio"
|
7
|
-
version = "0.
|
7
|
+
version = "0.4.1"
|
8
8
|
maintainers = [
|
9
9
|
{ name = "Hugo Delatte", email = "delatte.hugo@gmail.com" },
|
10
10
|
]
|
@@ -15,7 +15,7 @@ description = "Portfolio optimization built on top of scikit-learn"
|
|
15
15
|
readme = "README.rst"
|
16
16
|
requires-python = ">=3.10"
|
17
17
|
dependencies = [
|
18
|
-
"numpy>=1.23.4
|
18
|
+
"numpy>=1.23.4",
|
19
19
|
"scipy>=1.8.0",
|
20
20
|
"pandas>=1.4.1",
|
21
21
|
"cvxpy>=1.4.1",
|

@@ -95,7 +95,8 @@ version_toml = [
     "pyproject.toml:project.version",
 ]
 branch = "main"
-
+allow_zero_version = true
+major_on_zero = false
 tag_format = "v{version}"
 upload_to_pypi = false
 upload_to_vcs_release = true
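
The two added keys are python-semantic-release settings that keep the project in the 0.x range: `allow_zero_version = true` permits version numbers below 1.0.0, and `major_on_zero = false` prevents a breaking change from bumping the version to 1.0.0.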

{skfolio-0.3.1 → skfolio-0.4.1}/src/skfolio/measures/_measures.py
@@ -347,7 +347,7 @@ def entropic_risk_measure(
     """Compute the entropic risk measure.
 
     The entropic risk measure is a risk measure which depends on the risk aversion
-    defined by the investor (
+    defined by the investor (theta) through the exponential utility function at a given
     confidence level (beta).
 
     Parameters
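
The docstring references a risk-aversion parameter (theta) and a confidence level (beta). Under the textbook definition, the entropic risk measure is ERM(X) = theta * ln(E[exp(-X / theta)] / beta). A minimal NumPy sketch of that formula, assuming the textbook definition; `entropic_risk_measure_sketch` is illustrative, not skfolio's exact implementation or signature:

import numpy as np

def entropic_risk_measure_sketch(returns, theta=1.0, beta=1.0):
    # Assumed textbook definition: theta * log(E[exp(-returns / theta)] / beta),
    # where theta is the investor's risk aversion and beta the confidence level
    # referenced by the docstring above. Illustration only.
    return theta * np.log(np.mean(np.exp(-returns / theta)) / beta)

# Tiny example on made-up daily returns:
returns = np.array([0.01, -0.02, 0.005, -0.01])
print(entropic_risk_measure_sketch(returns, theta=1.0, beta=0.95))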

{skfolio-0.3.1 → skfolio-0.4.1}/src/skfolio/model_selection/_combinatorial.py
@@ -377,7 +377,7 @@ class CombinatorialPurgedCV(BaseCombinatorialCV):
         ]
         values = self.index_train_test_.T
         values = np.insert(values, 0, np.arange(n_samples), axis=0)
-        fill_color = np.select(cond, ["green", "blue", "red"]).T
+        fill_color = np.select(cond, ["green", "blue", "red"], default="green").T
         fill_color = fill_color.astype(object)
         fill_color = np.insert(
             fill_color, 0, np.array(["darkblue" for _ in range(n_samples)]), axis=0
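
The added `default` argument is the substance of this fix: `np.select` fills positions where no condition matches with `default`, which is the integer `0` when omitted. Combining that integer fill value with string choices either raises a type-promotion error or injects an invalid color name, depending on the NumPy version; `default="green"` keeps every element a valid color. A minimal sketch of the failure mode and the fix (array values are illustrative):

import numpy as np

# Two conditions that leave the third position unmatched:
cond = [np.array([True, False, False]), np.array([False, True, False])]

try:
    np.select(cond, ["green", "blue"])  # implicit default=0, an integer
except TypeError as exc:
    print("fails:", exc)  # string choices and the integer 0 cannot be promoted

# With a string default, every unmatched position gets a valid color:
print(np.select(cond, ["green", "blue"], default="green"))
# -> ['green' 'blue' 'green']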

skfolio-0.4.1/src/skfolio/model_selection/_walk_forward.py (new file)
@@ -0,0 +1,440 @@
+"""Walk Forward cross-validator."""
+
+# Copyright (c) 2023
+# Author: Hugo Delatte <delatte.hugo@gmail.com>
+# License: BSD 3 clause
+# Implementation derived from:
+# scikit-portfolio, Copyright (c) 2022, Carlo Nicolini, Licensed under MIT Licence.
+# scikit-learn, Copyright (c) 2007-2010 David Cournapeau, Fabian Pedregosa, Olivier
+# Grisel, Licensed under BSD 3 clause.
+
+import datetime as dt
+from collections.abc import Iterator
+
+import numpy as np
+import numpy.typing as npt
+import pandas as pd
+import sklearn.model_selection as sks
+import sklearn.utils as sku
+
+
+class WalkForward(sks.BaseCrossValidator):
+    """Walk Forward Cross-Validator.
+
+    Provides train/test indices to split time series data samples using a walk-forward
+    logic.
+
+    In each split, test indices must be higher than the previous ones; therefore,
+    shuffling in cross-validation is inappropriate.
+
+    Compared to `sklearn.model_selection.TimeSeriesSplit`, you control the train/test
+    folds by specifying the number of training and test samples instead of the number
+    of splits, making it more suitable for portfolio cross-validation.
+
+    If your data is a DataFrame indexed with a DatetimeIndex, you can split the data
+    using specific datetime frequencies and offsets.
+
+    Parameters
+    ----------
+    test_size : int
+        Length of each test set.
+        If `freq` is `None` (default), it represents the number of observations.
+        Otherwise, it represents the number of periods defined by `freq`.
+
+    train_size : int | pandas.offsets.DateOffset | datetime.timedelta
+        Length of each training set.
+        If `freq` is `None` (default), it represents the number of observations.
+        Otherwise, for integers, it represents the number of periods defined by `freq`;
+        for a pandas DateOffset or datetime timedelta, it represents the date offset
+        applied to the start of each period.
+
+    freq : str | pandas.offsets.DateOffset, optional
+        If provided, it must be a frequency string or a pandas DateOffset, and the
+        returns `X` must be a DataFrame with an index of type `DatetimeIndex`.
+        For a list of pandas frequencies and offsets, see `here <https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#timeseries-offset-aliases>`_.
+        The default (`None`) means `test_size` and `train_size` represent the number of
+        observations.
+
+        Below are some common examples:
+
+        * Rebalancing : Monthly on the first day
+        * Test Duration : 1 month
+        * Train Duration : 6 months
+
+        >>> cv = WalkForward(test_size=1, train_size=6, freq="MS")
+
+        * Rebalancing : Quarterly on the first day
+        * Test Duration : 1 quarter
+        * Train Duration : 2 months
+
+        >>> cv = WalkForward(test_size=1, train_size=pd.DateOffset(months=2), freq="QS")
+
+        * Rebalancing : Monthly on the third Friday
+        * Test Duration : 1 month
+        * Train Duration : 6 weeks
+
+        >>> cv = WalkForward(test_size=1, train_size=pd.offsets.Week(6), freq="WOM-3FRI")
+
+        * Rebalancing : Semi-annually on the last day
+        * Test Duration : 6 months
+        * Train Duration : 1 year
+
+        >>> cv = WalkForward(test_size=1, train_size=2, freq=pd.offsets.SemiMonthEnd())
+
+        * Rebalancing : Every 2 months on the second day
+        * Test Duration : 2 months
+        * Train Duration : 6 months
+
+        >>> cv = WalkForward(test_size=2, train_size=6, freq="MS", freq_offset=dt.timedelta(days=2))
+
+    freq_offset : pandas.offsets.DateOffset | datetime.timedelta, optional
+        Only used if `freq` is provided. Offsets the `freq` by a pandas DateOffset or a
+        datetime timedelta offset.
+
+    previous : bool, default=False
+        Only used if `freq` is provided. If set to `True`, and if the period start
+        or period end is not in the `DatetimeIndex`, the previous observation is used;
+        otherwise, the next observation is used (default).
+
+    expend_train : bool, default=False
+        If set to `True`, each subsequent training set after the first one will
+        use all past observations.
+        The default is `False`.
+
+    reduce_test : bool, default=False
+        If set to `True`, the last train/test split will be returned even if the
+        test set is partial (i.e., it contains fewer observations than `test_size`);
+        otherwise, it will be ignored.
+        The default is `False`.
+
+    purged_size : int, default=0
+        The number of observations to exclude from the end of each training set before
+        the test set.
+        The default value is `0`.
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> from skfolio.model_selection import WalkForward
+    >>> X = np.random.randn(6, 2)
+    >>> cv = WalkForward(test_size=1, train_size=2)
+    >>> for i, (train_index, test_index) in enumerate(cv.split(X)):
+    ...     print(f"Fold {i}:")
+    ...     print(f"  Train: index={train_index}")
+    ...     print(f"  Test:  index={test_index}")
+    Fold 0:
+      Train: index=[0 1]
+      Test:  index=[2]
+    Fold 1:
+      Train: index=[1 2]
+      Test:  index=[3]
+    Fold 2:
+      Train: index=[2 3]
+      Test:  index=[4]
+    Fold 3:
+      Train: index=[3 4]
+      Test:  index=[5]
+    >>> cv = WalkForward(test_size=1, train_size=2, purged_size=1)
+    >>> for i, (train_index, test_index) in enumerate(cv.split(X)):
+    ...     print(f"Fold {i}:")
+    ...     print(f"  Train: index={train_index}")
+    ...     print(f"  Test:  index={test_index}")
+    Fold 0:
+      Train: index=[0 1]
+      Test:  index=[3]
+    Fold 1:
+      Train: index=[1 2]
+      Test:  index=[4]
+    Fold 2:
+      Train: index=[2 3]
+      Test:  index=[5]
+    >>> cv = WalkForward(test_size=2, train_size=3)
+    >>> for i, (train_index, test_index) in enumerate(cv.split(X)):
+    ...     print(f"Fold {i}:")
+    ...     print(f"  Train: index={train_index}")
+    ...     print(f"  Test:  index={test_index}")
+    Fold 0:
+      Train: index=[0 1 2]
+      Test:  index=[3 4]
+    >>> cv = WalkForward(test_size=2, train_size=3, reduce_test=True)
+    >>> for i, (train_index, test_index) in enumerate(cv.split(X)):
+    ...     print(f"Fold {i}:")
+    ...     print(f"  Train: index={train_index}")
+    ...     print(f"  Test:  index={test_index}")
+    Fold 0:
+      Train: index=[0 1 2]
+      Test:  index=[3 4]
+    Fold 1:
+      Train: index=[2 3 4]
+      Test:  index=[5]
+    >>> cv = WalkForward(test_size=2, train_size=3, expend_train=True, reduce_test=True)
+    >>> for i, (train_index, test_index) in enumerate(cv.split(X)):
+    ...     print(f"Fold {i}:")
+    ...     print(f"  Train: index={train_index}")
+    ...     print(f"  Test:  index={test_index}")
+    Fold 0:
+      Train: index=[0 1 2]
+      Test:  index=[3 4]
+    Fold 1:
+      Train: index=[0 1 2 3 4]
+      Test:  index=[5]
+    """
+
+    def __init__(
+        self,
+        test_size: int,
+        train_size: int | pd.offsets.BaseOffset | dt.timedelta,
+        freq: str | pd.offsets.BaseOffset | None = None,
+        freq_offset: pd.offsets.BaseOffset | dt.timedelta | None = None,
+        previous: bool = False,
+        expend_train: bool = False,
+        reduce_test: bool = False,
+        purged_size: int = 0,
+    ):
+        self.test_size = test_size
+        self.train_size = train_size
+        self.freq = freq
+        self.freq_offset = freq_offset
+        self.previous = previous
+        self.expend_train = expend_train
+        self.reduce_test = reduce_test
+        self.purged_size = purged_size
+
+    def split(
+        self, X: npt.ArrayLike, y=None, groups=None
+    ) -> Iterator[tuple[np.ndarray, np.ndarray]]:
+        """Generate indices to split data into training and test set.
+
+        Parameters
+        ----------
+        X : array-like of shape (n_observations, n_assets)
+            Price returns of the assets.
+
+        y : array-like of shape (n_observations, n_targets)
+            Always ignored, exists for compatibility.
+
+        groups : array-like of shape (n_observations,)
+            Always ignored, exists for compatibility.
+
+        Yields
+        ------
+        train : ndarray
+            The training set indices for that split.
+
+        test : ndarray
+            The testing set indices for that split.
+        """
+        X, y = sku.indexable(X, y)
+        n_samples = X.shape[0]
+
+        if not isinstance(self.test_size, int):
+            raise ValueError("`test_size` must be an integer")
+
+        if self.freq is None:
+            if not isinstance(self.train_size, int):
+                raise ValueError("When `freq` is None, `train_size` must be an integer")
+            return _split_without_period(
+                n_samples=n_samples,
+                train_size=self.train_size,
+                test_size=self.test_size,
+                purged_size=self.purged_size,
+                expend_train=self.expend_train,
+                reduce_test=self.reduce_test,
+            )
+
+        if not hasattr(X, "index") or not isinstance(X.index, pd.DatetimeIndex):
+            raise ValueError(
+                "X must be a DataFrame with an index of type DatetimeIndex"
+            )
+        if isinstance(self.train_size, int):
+            return _split_from_period_without_train_offset(
+                n_samples=n_samples,
+                train_size=self.train_size,
+                test_size=self.test_size,
+                freq=self.freq,
+                freq_offset=self.freq_offset,
+                previous=self.previous,
+                purged_size=self.purged_size,
+                expend_train=self.expend_train,
+                reduce_test=self.reduce_test,
+                ts_index=X.index,
+            )
+        return _split_from_period_with_train_offset(
+            n_samples=n_samples,
+            train_size=self.train_size,
+            test_size=self.test_size,
+            freq=self.freq,
+            freq_offset=self.freq_offset,
+            previous=self.previous,
+            purged_size=self.purged_size,
+            expend_train=self.expend_train,
+            reduce_test=self.reduce_test,
+            ts_index=X.index,
+        )
+
+    def get_n_splits(self, X=None, y=None, groups=None) -> int:
+        """Return the number of splitting iterations in the cross-validator.
+
+        Parameters
+        ----------
+        X : array-like of shape (n_observations, n_assets)
+            Price returns of the assets.
+
+        y : array-like of shape (n_observations, n_targets)
+            Always ignored, exists for compatibility.
+
+        groups : array-like of shape (n_observations,)
+            Always ignored, exists for compatibility.
+
+        Returns
+        -------
+        n_folds : int
+            Returns the number of splitting iterations in the cross-validator.
+        """
+        if X is None:
+            raise ValueError("The 'X' parameter should not be None.")
+        X, y = sku.indexable(X, y)
+        n_samples = X.shape[0]
+        n = n_samples - self.train_size - self.purged_size
+
+        if self.reduce_test and n % self.test_size != 0:
+            return n // self.test_size + 1
+        return n // self.test_size
+
+
+def _split_without_period(
+    n_samples: int,
+    train_size: int,
+    test_size: int,
+    purged_size: int,
+    expend_train: bool,
+    reduce_test: bool,
+) -> Iterator[tuple[np.ndarray, np.ndarray]]:
+    if train_size + purged_size >= n_samples:
+        raise ValueError(
+            "The sum of `train_size` with `purged_size` "
+            f"({train_size + purged_size}) cannot be greater than the"
+            f" number of samples ({n_samples})."
+        )
+
+    indices = np.arange(n_samples)
+
+    test_start = train_size + purged_size
+    while True:
+        if test_start >= n_samples:
+            return
+        test_end = test_start + test_size
+        train_end = test_start - purged_size
+        if expend_train:
+            train_start = 0
+        else:
+            train_start = train_end - train_size
+
+        if test_end > n_samples:
+            if not reduce_test:
+                return
+            test_indices = indices[test_start:]
+        else:
+            test_indices = indices[test_start:test_end]
+        train_indices = indices[train_start:train_end]
+        yield train_indices, test_indices
+
+        test_start = test_end
+
+
+def _split_from_period_without_train_offset(
+    n_samples: int,
+    train_size: int,
+    test_size: int,
+    freq: str,
+    freq_offset: pd.offsets.BaseOffset | dt.timedelta | None,
+    previous: bool,
+    purged_size: int,
+    expend_train: bool,
+    reduce_test: bool,
+    ts_index,
+) -> Iterator[tuple[np.ndarray, np.ndarray]]:
+    start = ts_index[0]
+    end = ts_index[-1]
+    if freq_offset is not None:
+        start = min(start, start - freq_offset)
+
+    date_range = pd.date_range(start=start, end=end, freq=freq)
+    if freq_offset is not None:
+        date_range += freq_offset
+
+    idx = ts_index.get_indexer(date_range, method="ffill" if previous else "bfill")
+    n = len(idx)
+    i = 0
+    while True:
+        if i + train_size >= n:
+            return
+
+        if i + train_size + test_size >= n:
+            if not reduce_test:
+                return
+            test_indices = np.arange(idx[i + train_size], n_samples)
+
+        else:
+            test_indices = np.arange(
+                idx[i + train_size], idx[i + train_size + test_size]
+            )
+        if expend_train:
+            train_start = 0
+        else:
+            train_start = idx[i]
+        train_indices = np.arange(train_start, idx[i + train_size] - purged_size)
+        yield train_indices, test_indices
+
+        i += test_size
+
+
+def _split_from_period_with_train_offset(
+    n_samples: int,
+    train_size: pd.offsets.BaseOffset | dt.timedelta,
+    test_size: int,
+    freq: str,
+    freq_offset: pd.offsets.BaseOffset | dt.timedelta | None,
+    previous: bool,
+    purged_size: int,
+    expend_train: bool,
+    reduce_test: bool,
+    ts_index,
+) -> Iterator[tuple[np.ndarray, np.ndarray]]:
+    start = ts_index[0]
+    end = ts_index[-1]
+    if freq_offset is not None:
+        start = min(start, start - freq_offset)
+
+    date_range = pd.date_range(start=start, end=end, freq=freq)
+    if freq_offset is not None:
+        date_range += freq_offset
+
+    idx = ts_index.get_indexer(date_range, method="ffill" if previous else "bfill")
+    train_idx = ts_index.get_indexer(date_range - train_size, method="ffill")
+
+    n = len(idx)
+
+    if np.all(train_idx == -1):
+        return
+
+    i = np.argmax(train_idx > -1)
+    while True:
+        if i >= n:
+            return
+
+        if i + test_size >= n:
+            if not reduce_test:
+                return
+            test_indices = np.arange(idx[i], n_samples)
+        else:
+            test_indices = np.arange(idx[i], idx[i + test_size] - purged_size)
+
+        if expend_train:
+            train_start = 0
+        else:
+            train_start = train_idx[i]
+        train_indices = np.arange(train_start, idx[i])
+        yield train_indices, test_indices
+
+        i += test_size