purgedcv 0.0.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- purgedcv-0.0.1/.github/workflows/ci.yml +127 -0
- purgedcv-0.0.1/.gitignore +227 -0
- purgedcv-0.0.1/.pre-commit-config.yaml +16 -0
- purgedcv-0.0.1/LICENSE +21 -0
- purgedcv-0.0.1/PKG-INFO +232 -0
- purgedcv-0.0.1/README.md +196 -0
- purgedcv-0.0.1/examples/README.md +127 -0
- purgedcv-0.0.1/examples/clinical_mortality_physionet.ipynb +789 -0
- purgedcv-0.0.1/examples/energy_demand_pjm.ipynb +839 -0
- purgedcv-0.0.1/examples/precipitation_noaa.ipynb +640 -0
- purgedcv-0.0.1/examples/predictive_maintenance_nasa.ipynb +582 -0
- purgedcv-0.0.1/pyproject.toml +104 -0
- purgedcv-0.0.1/src/purgedcv/__init__.py +47 -0
- purgedcv-0.0.1/src/purgedcv/_base.py +167 -0
- purgedcv-0.0.1/src/purgedcv/_cpcv.py +222 -0
- purgedcv-0.0.1/src/purgedcv/_embargo.py +63 -0
- purgedcv-0.0.1/src/purgedcv/_metrics.py +226 -0
- purgedcv-0.0.1/src/purgedcv/_paths.py +125 -0
- purgedcv-0.0.1/src/purgedcv/_purge.py +66 -0
- purgedcv-0.0.1/src/purgedcv/_purged_kfold.py +221 -0
- purgedcv-0.0.1/src/purgedcv/_time.py +136 -0
- purgedcv-0.0.1/src/purgedcv/_typing.py +17 -0
- purgedcv-0.0.1/src/purgedcv/_walk_forward.py +170 -0
- purgedcv-0.0.1/src/purgedcv/diagnostics.py +207 -0
- purgedcv-0.0.1/src/purgedcv/exceptions.py +25 -0
- purgedcv-0.0.1/tests/__init__.py +0 -0
- purgedcv-0.0.1/tests/conftest.py +12 -0
- purgedcv-0.0.1/tests/e2e/__init__.py +0 -0
- purgedcv-0.0.1/tests/e2e/test_e2e_apply_embargo.py +90 -0
- purgedcv-0.0.1/tests/e2e/test_e2e_assert_embargo_respected.py +92 -0
- purgedcv-0.0.1/tests/e2e/test_e2e_assert_groups_disjoint.py +87 -0
- purgedcv-0.0.1/tests/e2e/test_e2e_assert_no_temporal_leakage.py +101 -0
- purgedcv-0.0.1/tests/e2e/test_e2e_backtest_paths.py +104 -0
- purgedcv-0.0.1/tests/e2e/test_e2e_compute_overlap_fraction.py +96 -0
- purgedcv-0.0.1/tests/e2e/test_e2e_cpcv.py +72 -0
- purgedcv-0.0.1/tests/e2e/test_e2e_horizons_overlap.py +75 -0
- purgedcv-0.0.1/tests/e2e/test_e2e_metrics.py +110 -0
- purgedcv-0.0.1/tests/e2e/test_e2e_parse_horizon.py +77 -0
- purgedcv-0.0.1/tests/e2e/test_e2e_purge.py +96 -0
- purgedcv-0.0.1/tests/e2e/test_e2e_purged_group_kfold.py +76 -0
- purgedcv-0.0.1/tests/e2e/test_e2e_purged_kfold.py +67 -0
- purgedcv-0.0.1/tests/e2e/test_e2e_validate_times.py +82 -0
- purgedcv-0.0.1/tests/e2e/test_e2e_walk_forward.py +95 -0
- purgedcv-0.0.1/tests/e2e/test_install_smoke.py +44 -0
- purgedcv-0.0.1/tests/e2e/test_quality_gate.py +61 -0
- purgedcv-0.0.1/tests/test_base_splitter.py +301 -0
- purgedcv-0.0.1/tests/test_cpcv.py +227 -0
- purgedcv-0.0.1/tests/test_diagnostics.py +243 -0
- purgedcv-0.0.1/tests/test_embargo.py +88 -0
- purgedcv-0.0.1/tests/test_exceptions.py +47 -0
- purgedcv-0.0.1/tests/test_metrics.py +304 -0
- purgedcv-0.0.1/tests/test_paths.py +163 -0
- purgedcv-0.0.1/tests/test_public_api.py +120 -0
- purgedcv-0.0.1/tests/test_purge.py +176 -0
- purgedcv-0.0.1/tests/test_purge_embargo_properties.py +191 -0
- purgedcv-0.0.1/tests/test_purged_kfold.py +223 -0
- purgedcv-0.0.1/tests/test_sklearn_integration.py +153 -0
- purgedcv-0.0.1/tests/test_splitter_properties.py +148 -0
- purgedcv-0.0.1/tests/test_time.py +171 -0
- purgedcv-0.0.1/tests/test_time_properties.py +96 -0
- purgedcv-0.0.1/tests/test_walk_forward.py +146 -0
|
@@ -0,0 +1,127 @@
|
|
|
1
|
+
name: CI
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
branches: [main]
|
|
6
|
+
pull_request:
|
|
7
|
+
branches: [main]
|
|
8
|
+
|
|
9
|
+
permissions:
|
|
10
|
+
contents: write
|
|
11
|
+
|
|
12
|
+
jobs:
|
|
13
|
+
test:
|
|
14
|
+
runs-on: ubuntu-latest
|
|
15
|
+
strategy:
|
|
16
|
+
fail-fast: false
|
|
17
|
+
matrix:
|
|
18
|
+
python-version: ["3.10", "3.11", "3.12"]
|
|
19
|
+
steps:
|
|
20
|
+
- uses: actions/checkout@v4
|
|
21
|
+
|
|
22
|
+
- name: Set up Python ${{ matrix.python-version }}
|
|
23
|
+
uses: actions/setup-python@v5
|
|
24
|
+
with:
|
|
25
|
+
python-version: ${{ matrix.python-version }}
|
|
26
|
+
|
|
27
|
+
- name: Install
|
|
28
|
+
run: |
|
|
29
|
+
python -m pip install --upgrade pip
|
|
30
|
+
pip install -e ".[dev]"
|
|
31
|
+
|
|
32
|
+
- name: Ruff lint
|
|
33
|
+
run: ruff check .
|
|
34
|
+
|
|
35
|
+
- name: Ruff format check
|
|
36
|
+
run: ruff format --check .
|
|
37
|
+
|
|
38
|
+
- name: Mypy (strict)
|
|
39
|
+
run: mypy src tests
|
|
40
|
+
|
|
41
|
+
- name: Pytest
|
|
42
|
+
run: pytest -q
|
|
43
|
+
|
|
44
|
+
# Publish the current version, then bump the patch for next time.
|
|
45
|
+
# Order is deliberate: pyproject.toml starts at 0.0.1, so the first automated
|
|
46
|
+
# release is exactly 0.0.1; --skip-existing makes re-runs safe.
|
|
47
|
+
release:
|
|
48
|
+
needs: test
|
|
49
|
+
if: >-
|
|
50
|
+
github.event_name == 'push' &&
|
|
51
|
+
github.ref == 'refs/heads/main' &&
|
|
52
|
+
github.actor != 'github-actions[bot]'
|
|
53
|
+
runs-on: ubuntu-latest
|
|
54
|
+
steps:
|
|
55
|
+
- uses: actions/checkout@v4
|
|
56
|
+
with:
|
|
57
|
+
token: ${{ secrets.GITHUB_TOKEN }}
|
|
58
|
+
|
|
59
|
+
- name: Set up Python
|
|
60
|
+
uses: actions/setup-python@v5
|
|
61
|
+
with:
|
|
62
|
+
python-version: "3.11"
|
|
63
|
+
|
|
64
|
+
- name: Install tooling
|
|
65
|
+
run: |
|
|
66
|
+
python -m pip install --upgrade pip
|
|
67
|
+
pip install build twine anybadge
|
|
68
|
+
pip install -e ".[dev]"
|
|
69
|
+
|
|
70
|
+
- name: Read current version
|
|
71
|
+
id: ver
|
|
72
|
+
run: |
|
|
73
|
+
v=$(python -c "import tomllib,pathlib;print(tomllib.loads(pathlib.Path('pyproject.toml').read_text())['project']['version'])")
|
|
74
|
+
echo "version=$v" >> "$GITHUB_OUTPUT"
|
|
75
|
+
|
|
76
|
+
- name: Build
|
|
77
|
+
run: python -m build
|
|
78
|
+
|
|
79
|
+
- name: Publish to PyPI
|
|
80
|
+
env:
|
|
81
|
+
TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }}
|
|
82
|
+
TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }}
|
|
83
|
+
run: twine upload --non-interactive --skip-existing dist/*
|
|
84
|
+
|
|
85
|
+
- name: Coverage badge
|
|
86
|
+
run: |
|
|
87
|
+
pytest -q --cov=purgedcv --cov-report=json:coverage.json >/dev/null
|
|
88
|
+
cov=$(jq '.totals.percent_covered | round' coverage.json)
|
|
89
|
+
echo "coverage: ${cov}%"
|
|
90
|
+
mkdir -p .github/badges
|
|
91
|
+
anybadge -o --value="${cov}%" --file=.github/badges/coverage.svg --label=coverage --color="#007ec6"
|
|
92
|
+
|
|
93
|
+
- name: Bump patch version for next release
|
|
94
|
+
id: bump
|
|
95
|
+
run: |
|
|
96
|
+
new=$(python - <<'PY'
|
|
97
|
+
import re, pathlib
|
|
98
|
+
p = pathlib.Path("pyproject.toml")
|
|
99
|
+
t = p.read_text()
|
|
100
|
+
m = re.search(r'(?m)^version = "(\d+)\.(\d+)\.(\d+)"$', t)
|
|
101
|
+
if not m:
|
|
102
|
+
raise SystemExit('expected version = "X.Y.Z" in pyproject.toml')
|
|
103
|
+
maj, mnr, pat = map(int, m.groups())
|
|
104
|
+
nv = f"{maj}.{mnr}.{pat + 1}"
|
|
105
|
+
p.write_text(t[: m.start()] + f'version = "{nv}"' + t[m.end() :])
|
|
106
|
+
print(nv)
|
|
107
|
+
PY
|
|
108
|
+
)
|
|
109
|
+
echo "next=$new" >> "$GITHUB_OUTPUT"
|
|
110
|
+
|
|
111
|
+
- name: Commit badge + version bump
|
|
112
|
+
run: |
|
|
113
|
+
git config user.name 'github-actions[bot]'
|
|
114
|
+
git config user.email 'github-actions[bot]@users.noreply.github.com'
|
|
115
|
+
git add pyproject.toml .github/badges/coverage.svg
|
|
116
|
+
git commit -m "chore: release v${{ steps.ver.outputs.version }}, bump to v${{ steps.bump.outputs.next }}"
|
|
117
|
+
git push
|
|
118
|
+
|
|
119
|
+
- name: GitHub Release
|
|
120
|
+
uses: softprops/action-gh-release@v2
|
|
121
|
+
with:
|
|
122
|
+
tag_name: "v${{ steps.ver.outputs.version }}"
|
|
123
|
+
name: "v${{ steps.ver.outputs.version }}"
|
|
124
|
+
body: "Automated release of v${{ steps.ver.outputs.version }}."
|
|
125
|
+
generate_release_notes: true
|
|
126
|
+
env:
|
|
127
|
+
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
|
@@ -0,0 +1,227 @@
|
|
|
1
|
+
# Byte-compiled / optimized / DLL files
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.py[codz]
|
|
4
|
+
*$py.class
|
|
5
|
+
|
|
6
|
+
# C extensions
|
|
7
|
+
*.so
|
|
8
|
+
|
|
9
|
+
# Distribution / packaging
|
|
10
|
+
.Python
|
|
11
|
+
build/
|
|
12
|
+
develop-eggs/
|
|
13
|
+
dist/
|
|
14
|
+
downloads/
|
|
15
|
+
eggs/
|
|
16
|
+
.eggs/
|
|
17
|
+
lib/
|
|
18
|
+
lib64/
|
|
19
|
+
parts/
|
|
20
|
+
sdist/
|
|
21
|
+
var/
|
|
22
|
+
wheels/
|
|
23
|
+
share/python-wheels/
|
|
24
|
+
*.egg-info/
|
|
25
|
+
.installed.cfg
|
|
26
|
+
*.egg
|
|
27
|
+
MANIFEST
|
|
28
|
+
|
|
29
|
+
# PyInstaller
|
|
30
|
+
# Usually these files are written by a python script from a template
|
|
31
|
+
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
|
32
|
+
*.manifest
|
|
33
|
+
*.spec
|
|
34
|
+
|
|
35
|
+
# Installer logs
|
|
36
|
+
pip-log.txt
|
|
37
|
+
pip-delete-this-directory.txt
|
|
38
|
+
|
|
39
|
+
# Unit test / coverage reports
|
|
40
|
+
htmlcov/
|
|
41
|
+
.tox/
|
|
42
|
+
.nox/
|
|
43
|
+
.coverage
|
|
44
|
+
.coverage.*
|
|
45
|
+
.cache
|
|
46
|
+
nosetests.xml
|
|
47
|
+
coverage.xml
|
|
48
|
+
*.cover
|
|
49
|
+
*.py.cover
|
|
50
|
+
.hypothesis/
|
|
51
|
+
.pytest_cache/
|
|
52
|
+
cover/
|
|
53
|
+
|
|
54
|
+
# Translations
|
|
55
|
+
*.mo
|
|
56
|
+
*.pot
|
|
57
|
+
|
|
58
|
+
# Django stuff:
|
|
59
|
+
*.log
|
|
60
|
+
local_settings.py
|
|
61
|
+
db.sqlite3
|
|
62
|
+
db.sqlite3-journal
|
|
63
|
+
|
|
64
|
+
# Flask stuff:
|
|
65
|
+
instance/
|
|
66
|
+
.webassets-cache
|
|
67
|
+
|
|
68
|
+
# Scrapy stuff:
|
|
69
|
+
.scrapy
|
|
70
|
+
|
|
71
|
+
# Sphinx documentation
|
|
72
|
+
docs/_build/
|
|
73
|
+
|
|
74
|
+
# PyBuilder
|
|
75
|
+
.pybuilder/
|
|
76
|
+
target/
|
|
77
|
+
|
|
78
|
+
# Jupyter Notebook
|
|
79
|
+
.ipynb_checkpoints
|
|
80
|
+
|
|
81
|
+
# IPython
|
|
82
|
+
profile_default/
|
|
83
|
+
ipython_config.py
|
|
84
|
+
|
|
85
|
+
# pyenv
|
|
86
|
+
# For a library or package, you might want to ignore these files since the code is
|
|
87
|
+
# intended to run in multiple environments; otherwise, check them in:
|
|
88
|
+
# .python-version
|
|
89
|
+
|
|
90
|
+
# pipenv
|
|
91
|
+
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
|
92
|
+
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
|
93
|
+
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
|
94
|
+
# install all needed dependencies.
|
|
95
|
+
# Pipfile.lock
|
|
96
|
+
|
|
97
|
+
# UV
|
|
98
|
+
# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
|
|
99
|
+
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
|
100
|
+
# commonly ignored for libraries.
|
|
101
|
+
# uv.lock
|
|
102
|
+
|
|
103
|
+
# poetry
|
|
104
|
+
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
|
|
105
|
+
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
|
106
|
+
# commonly ignored for libraries.
|
|
107
|
+
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
|
|
108
|
+
# poetry.lock
|
|
109
|
+
# poetry.toml
|
|
110
|
+
|
|
111
|
+
# pdm
|
|
112
|
+
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
|
|
113
|
+
# pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.
|
|
114
|
+
# https://pdm-project.org/en/latest/usage/project/#working-with-version-control
|
|
115
|
+
# pdm.lock
|
|
116
|
+
# pdm.toml
|
|
117
|
+
.pdm-python
|
|
118
|
+
.pdm-build/
|
|
119
|
+
|
|
120
|
+
# pixi
|
|
121
|
+
# Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.
|
|
122
|
+
# pixi.lock
|
|
123
|
+
# Pixi creates a virtual environment in the .pixi directory, just like venv module creates one
|
|
124
|
+
# in the .venv directory. It is recommended not to include this directory in version control.
|
|
125
|
+
.pixi
|
|
126
|
+
|
|
127
|
+
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
|
|
128
|
+
__pypackages__/
|
|
129
|
+
|
|
130
|
+
# Celery stuff
|
|
131
|
+
celerybeat-schedule
|
|
132
|
+
celerybeat.pid
|
|
133
|
+
|
|
134
|
+
# Redis
|
|
135
|
+
*.rdb
|
|
136
|
+
*.aof
|
|
137
|
+
*.pid
|
|
138
|
+
|
|
139
|
+
# RabbitMQ
|
|
140
|
+
mnesia/
|
|
141
|
+
rabbitmq/
|
|
142
|
+
rabbitmq-data/
|
|
143
|
+
|
|
144
|
+
# ActiveMQ
|
|
145
|
+
activemq-data/
|
|
146
|
+
|
|
147
|
+
# SageMath parsed files
|
|
148
|
+
*.sage.py
|
|
149
|
+
|
|
150
|
+
# Environments
|
|
151
|
+
.env
|
|
152
|
+
.envrc
|
|
153
|
+
.venv
|
|
154
|
+
env/
|
|
155
|
+
venv/
|
|
156
|
+
ENV/
|
|
157
|
+
env.bak/
|
|
158
|
+
venv.bak/
|
|
159
|
+
|
|
160
|
+
# Spyder project settings
|
|
161
|
+
.spyderproject
|
|
162
|
+
.spyproject
|
|
163
|
+
|
|
164
|
+
# Rope project settings
|
|
165
|
+
.ropeproject
|
|
166
|
+
|
|
167
|
+
# mkdocs documentation
|
|
168
|
+
/site
|
|
169
|
+
|
|
170
|
+
# mypy
|
|
171
|
+
.mypy_cache/
|
|
172
|
+
.dmypy.json
|
|
173
|
+
dmypy.json
|
|
174
|
+
|
|
175
|
+
# Pyre type checker
|
|
176
|
+
.pyre/
|
|
177
|
+
|
|
178
|
+
# pytype static type analyzer
|
|
179
|
+
.pytype/
|
|
180
|
+
|
|
181
|
+
# Cython debug symbols
|
|
182
|
+
cython_debug/
|
|
183
|
+
|
|
184
|
+
# PyCharm
|
|
185
|
+
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
|
|
186
|
+
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
|
|
187
|
+
# and can be added to the global gitignore or merged into this file. For a more nuclear
|
|
188
|
+
# option (not recommended) you can ignore the entire idea folder, as the rule below does.
|
|
189
|
+
.idea/
|
|
190
|
+
|
|
191
|
+
# Abstra
|
|
192
|
+
# Abstra is an AI-powered process automation framework.
|
|
193
|
+
# Ignore directories containing user credentials, local state, and settings.
|
|
194
|
+
# Learn more at https://abstra.io/docs
|
|
195
|
+
.abstra/
|
|
196
|
+
|
|
197
|
+
# Visual Studio Code
|
|
198
|
+
# Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
|
|
199
|
+
# that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
|
|
200
|
+
# and can be added to the global gitignore or merged into this file. However, if you prefer,
|
|
201
|
+
# you could uncomment the following to ignore the entire vscode folder
|
|
202
|
+
# .vscode/
|
|
203
|
+
# Temporary file for partial code execution
|
|
204
|
+
tempCodeRunnerFile.py
|
|
205
|
+
|
|
206
|
+
# Ruff stuff:
|
|
207
|
+
.ruff_cache/
|
|
208
|
+
|
|
209
|
+
# PyPI configuration file
|
|
210
|
+
.pypirc
|
|
211
|
+
|
|
212
|
+
# Marimo
|
|
213
|
+
marimo/_static/
|
|
214
|
+
marimo/_lsp/
|
|
215
|
+
__marimo__/
|
|
216
|
+
|
|
217
|
+
# Streamlit
|
|
218
|
+
.streamlit/secrets.toml
|
|
219
|
+
|
|
220
|
+
# Example notebook downloaded datasets (lazily fetched on first run)
|
|
221
|
+
examples/data/
|
|
222
|
+
|
|
223
|
+
# docs/ kept locally for reference, not published (internal plans, drafts)
|
|
224
|
+
docs/
|
|
225
|
+
|
|
226
|
+
# Local dev tooling, not published
|
|
227
|
+
tools/
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
repos:
|
|
2
|
+
- repo: https://github.com/astral-sh/ruff-pre-commit
|
|
3
|
+
rev: v0.15.12
|
|
4
|
+
hooks:
|
|
5
|
+
- id: ruff
|
|
6
|
+
args: [--fix]
|
|
7
|
+
- id: ruff-format
|
|
8
|
+
- repo: https://github.com/pre-commit/pre-commit-hooks
|
|
9
|
+
rev: v6.0.0
|
|
10
|
+
hooks:
|
|
11
|
+
- id: trailing-whitespace
|
|
12
|
+
- id: end-of-file-fixer
|
|
13
|
+
- id: check-yaml
|
|
14
|
+
- id: check-toml
|
|
15
|
+
- id: check-added-large-files
|
|
16
|
+
args: [--maxkb=500]
|
purgedcv-0.0.1/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Evgenii Lazarev
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
purgedcv-0.0.1/PKG-INFO
ADDED
|
@@ -0,0 +1,232 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: purgedcv
|
|
3
|
+
Version: 0.0.1
|
|
4
|
+
Summary: scikit-learn-compatible cross-validation for time-series machine learning: purging, embargoes, combinatorial backtest paths.
|
|
5
|
+
Project-URL: Homepage, https://github.com/eslazarev/purged-cross-validation
|
|
6
|
+
Project-URL: Repository, https://github.com/eslazarev/purged-cross-validation
|
|
7
|
+
Author: Evgenii Lazarev
|
|
8
|
+
License: MIT
|
|
9
|
+
License-File: LICENSE
|
|
10
|
+
Keywords: combinatorial-cv,cross-validation,machine-learning,purged-kfold,scikit-learn,time-series
|
|
11
|
+
Classifier: Development Status :: 3 - Alpha
|
|
12
|
+
Classifier: Intended Audience :: Science/Research
|
|
13
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
17
|
+
Classifier: Topic :: Scientific/Engineering
|
|
18
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
19
|
+
Requires-Python: >=3.10
|
|
20
|
+
Requires-Dist: numpy>=1.24
|
|
21
|
+
Requires-Dist: pandas>=2.0
|
|
22
|
+
Requires-Dist: scikit-learn>=1.3
|
|
23
|
+
Requires-Dist: scipy>=1.10
|
|
24
|
+
Provides-Extra: dev
|
|
25
|
+
Requires-Dist: hypothesis>=6.80; extra == 'dev'
|
|
26
|
+
Requires-Dist: mypy>=1.10; extra == 'dev'
|
|
27
|
+
Requires-Dist: pandas-stubs>=2.0; extra == 'dev'
|
|
28
|
+
Requires-Dist: pre-commit>=3.5; extra == 'dev'
|
|
29
|
+
Requires-Dist: pytest-cov>=4.1; extra == 'dev'
|
|
30
|
+
Requires-Dist: pytest>=7.4; extra == 'dev'
|
|
31
|
+
Requires-Dist: ruff>=0.5; extra == 'dev'
|
|
32
|
+
Provides-Extra: examples
|
|
33
|
+
Requires-Dist: jupyter>=1.0; extra == 'examples'
|
|
34
|
+
Requires-Dist: matplotlib>=3.7; extra == 'examples'
|
|
35
|
+
Description-Content-Type: text/markdown
|
|
36
|
+
|
|
37
|
+
# Purged cross validation
|
|
38
|
+
|
|
39
|
+
**scikit-learn-compatible cross-validation for time-series machine learning: purging, embargoes, and combinatorial backtest paths.**
|
|
40
|
+
|
|
41
|
+
[CI](https://github.com/eslazarev/purged-cross-validation/actions/workflows/ci.yml)
|
|
42
|
+

|
|
43
|
+
[PyPI](https://pypi.org/project/purgedcv/)
|
|
44
|
+
[Downloads](https://pepy.tech/project/purgedcv)
|
|
45
|
+
[PyPI files](https://pypi.org/project/purgedcv/#files)
|
|
46
|
+
|
|
47
|
+
[Python](https://www.python.org/downloads/)
|
|
48
|
+
[License: MIT](LICENSE)
|
|
49
|
+
[Ruff](https://github.com/astral-sh/ruff)
|
|
50
|
+
[mypy](https://mypy-lang.org/)
|
|
51
|
+
[pre-commit](https://pre-commit.com/)
|
|
52
|
+
[](https://pypi.org/project/purgedcv/)
|
|
53
|
+
|
|
54
|
+
**[Example notebooks →](examples/)** — purge/embargo, walk-forward, and CPCV with PSR/DSR worked end to end on real ICU-mortality, turbofan-RUL, rainfall, and electricity-demand data.
|
|
55
|
+
|
|
56
|
+
---
|
|
57
|
+
|
|
58
|
+
## The problem
|
|
59
|
+
|
|
60
|
+
Standard k-fold cross-validation assumes the rows are independent. Time-series data is not. When a label resolves over the next few days, it overlaps the labels sitting right next to it, so an ordinary shuffle-split leaks tomorrow's answer back into training. The rows immediately after a test window leak too, because they are serially correlated with it. Both effects quietly inflate backtested Sharpe ratios and hand you strategies that look great on a chart and bleed money once they go live. This library removes both.
|
|
61
|
+
|
|
62
|
+
Why write another one? People have asked scikit-learn, auto-sklearn, and mlpack for purging and embargo support and been turned down or left waiting for years. The one mature implementation, mlfinlab, went closed-source and paid. The free alternative has been unmaintained since 2018. That gap is the reason this exists.
|
|
63
|
+
|
|
64
|
+
---
|
|
65
|
+
|
|
66
|
+
## Installation
|
|
67
|
+
|
|
68
|
+
```bash
|
|
69
|
+
|
|
70
|
+
pip install purgedcv
|
|
71
|
+
|
|
72
|
+
# Directly from the repository
|
|
73
|
+
pip install git+https://github.com/eslazarev/purged-cross-validation.git
|
|
74
|
+
```
|
|
75
|
+
|
|
76
|
+
---
|
|
77
|
+
|
|
78
|
+
## Quickstart
|
|
79
|
+
|
|
80
|
+
### 1. Foundation primitives: `purge`, `apply_embargo`, and diagnostics
|
|
81
|
+
|
|
82
|
+
Build a manual split, clean it with the purge and embargo primitives, then audit it with the diagnostics submodule.
|
|
83
|
+
|
|
84
|
+
```python
|
|
85
|
+
import numpy as np
|
|
86
|
+
import pandas as pd
|
|
87
|
+
from purgedcv import purge, apply_embargo
|
|
88
|
+
from purgedcv.diagnostics import assert_no_temporal_leakage, assert_embargo_respected
|
|
89
|
+
|
|
90
|
+
# 30 daily bars; each bar's label resolves 2 days later
|
|
91
|
+
pred = pd.Series(pd.date_range("2024-01-01", periods=30, freq="D"))
|
|
92
|
+
evalu = pred + pd.Timedelta(days=2)
|
|
93
|
+
|
|
94
|
+
train_idx = np.arange(0, 20)
|
|
95
|
+
test_idx = np.arange(20, 30)
|
|
96
|
+
|
|
97
|
+
# Remove training rows whose label horizon overlaps the test window
|
|
98
|
+
clean_train = purge(train_idx, test_idx, pred, evalu)
|
|
99
|
+
|
|
100
|
+
# Drop the 3-day post-test buffer from training
|
|
101
|
+
clean_train = apply_embargo(
|
|
102
|
+
clean_train, test_idx, pred, evalu, embargo=pd.Timedelta(days=3)
|
|
103
|
+
)
|
|
104
|
+
|
|
105
|
+
# Assert the split is now leak-free (raises TemporalLeakageError if not)
|
|
106
|
+
assert_no_temporal_leakage(clean_train, test_idx, pred, evalu)
|
|
107
|
+
assert_embargo_respected(clean_train, test_idx, pred, evalu, embargo="3D")
|
|
108
|
+
print(f"Clean training rows: {len(clean_train)}") # 19
|
|
109
|
+
```
|
|
110
|
+
|
|
111
|
+
---
|
|
112
|
+
|
|
113
|
+
### 2. Splitters with scikit-learn: `PurgedKFold` inside `cross_val_score`
|
|
114
|
+
|
|
115
|
+
Drop-in replacement for `KFold` that applies purge and embargo automatically on every fold.
|
|
116
|
+
|
|
117
|
+
```python
|
|
118
|
+
import numpy as np
|
|
119
|
+
import pandas as pd
|
|
120
|
+
from sklearn.linear_model import Ridge
|
|
121
|
+
from sklearn.model_selection import cross_val_score
|
|
122
|
+
from purgedcv import PurgedKFold
|
|
123
|
+
|
|
124
|
+
rng = np.random.default_rng(0)
|
|
125
|
+
n = 200
|
|
126
|
+
pred = pd.Series(pd.date_range("2022-01-01", periods=n, freq="D"))
|
|
127
|
+
evalu = pred + pd.Timedelta(days=3)
|
|
128
|
+
X = rng.standard_normal((n, 5))
|
|
129
|
+
y = X @ rng.standard_normal(5) + rng.standard_normal(n) * 0.5
|
|
130
|
+
|
|
131
|
+
cv = PurgedKFold(
|
|
132
|
+
n_splits=5,
|
|
133
|
+
prediction_times=pred,
|
|
134
|
+
evaluation_times=evalu,
|
|
135
|
+
purge_horizon="3D", # matches label horizon
|
|
136
|
+
embargo="1D", # 1-day post-test buffer
|
|
137
|
+
)
|
|
138
|
+
|
|
139
|
+
scores = cross_val_score(Ridge(), X, y, cv=cv, scoring="r2")
|
|
140
|
+
print(f"R² per fold: {scores.round(3)}")
|
|
141
|
+
```
|
|
142
|
+
|
|
143
|
+
All four splitters (`WalkForwardSplit`, `PurgedKFold`, `PurgedGroupKFold`, `CombinatorialPurgedCV`) satisfy the sklearn splitter protocol and can be passed as `cv=` to `cross_val_score` and `GridSearchCV`, including when the estimator is a `Pipeline`.
|
|
144
|
+
|
|
145
|
+
---
|
|
146
|
+
|
|
147
|
+
### 3. CPCV + path reconstruction + metrics: the full workflow
|
|
148
|
+
|
|
149
|
+
Combinatorial Purged CV produces C(N, K) folds that tile into multiple out-of-sample backtest paths. Use PSR and DSR to evaluate them with corrections for non-normality and selection bias.
|
|
150
|
+
|
|
151
|
+
```python
|
|
152
|
+
import numpy as np
|
|
153
|
+
import pandas as pd
|
|
154
|
+
from sklearn.dummy import DummyRegressor
|
|
155
|
+
from purgedcv import (
|
|
156
|
+
CombinatorialPurgedCV,
|
|
157
|
+
probabilistic_sharpe_ratio,
|
|
158
|
+
deflated_sharpe_ratio,
|
|
159
|
+
min_track_record_length,
|
|
160
|
+
)
|
|
161
|
+
|
|
162
|
+
rng = np.random.default_rng(42)
|
|
163
|
+
n = 120
|
|
164
|
+
pred = pd.Series(pd.date_range("2023-01-01", periods=n, freq="D"))
|
|
165
|
+
evalu = pred + pd.Timedelta(days=2)
|
|
166
|
+
X = rng.standard_normal((n, 3))
|
|
167
|
+
y = X @ np.array([0.5, -0.3, 0.2]) + rng.standard_normal(n) * 0.1
|
|
168
|
+
|
|
169
|
+
# N=6, K=2 → C(6,2) = 15 folds → 15 × 2 / 6 = 5 backtest paths
|
|
170
|
+
cv = CombinatorialPurgedCV(
|
|
171
|
+
n_splits=6,
|
|
172
|
+
n_test_groups=2,
|
|
173
|
+
prediction_times=pred,
|
|
174
|
+
evaluation_times=evalu,
|
|
175
|
+
)
|
|
176
|
+
|
|
177
|
+
# paths.shape == (n_paths, n_samples); NaN where a sample was not OOS
|
|
178
|
+
paths = cv.backtest_paths(DummyRegressor(strategy="mean"), X, y)
|
|
179
|
+
print(f"Backtest paths: {paths.shape}") # (5, 120)
|
|
180
|
+
|
|
181
|
+
# Derive a toy "return" series and compute per-path PSR
|
|
182
|
+
per_path_returns = paths - y[np.newaxis, :]
|
|
183
|
+
per_path_psr = [
|
|
184
|
+
probabilistic_sharpe_ratio(row[np.isfinite(row)], benchmark_skill=0.0)
|
|
185
|
+
for row in per_path_returns
|
|
186
|
+
]
|
|
187
|
+
print(f"PSR per path: {[round(p, 3) for p in per_path_psr]}")
|
|
188
|
+
|
|
189
|
+
# DSR corrects for testing 5 paths simultaneously
|
|
190
|
+
first = per_path_returns[0]
|
|
191
|
+
dsr = deflated_sharpe_ratio(first[np.isfinite(first)], n_trials=5, var_sharpe=0.01**2)
|
|
192
|
+
print(f"Deflated SR (first path): {dsr:.3f}")
|
|
193
|
+
|
|
194
|
+
# Minimum observations needed to prove SR=0.7 beats benchmark SR=0.5 at 95% confidence
|
|
195
|
+
n_min = min_track_record_length(
|
|
196
|
+
observed_sharpe=0.7, target_sharpe=0.5, alpha=0.05, skew=0.0, kurtosis=3.0
|
|
197
|
+
)
|
|
198
|
+
print(f"MinTRL: {int(n_min)} observations")
|
|
199
|
+
```
|
|
200
|
+
|
|
201
|
+
---
|
|
202
|
+
|
|
203
|
+
## API summary
|
|
204
|
+
|
|
205
|
+
| Symbol | Domain | Description |
|
|
206
|
+
|---|---|---|
|
|
207
|
+
| `purge` | D2 | Remove overlapping-horizon training rows |
|
|
208
|
+
| `apply_embargo` | D3 | Remove post-test buffer rows |
|
|
209
|
+
| `WalkForwardSplit` | D5.1 | Sliding / expanding walk-forward CV |
|
|
210
|
+
| `PurgedKFold` | D5.2 | Contiguous test folds with purge + embargo |
|
|
211
|
+
| `PurgedGroupKFold` | D5.3 | Group-aware purged k-fold |
|
|
212
|
+
| `CombinatorialPurgedCV` | D5.4 | C(N,K) combinatorial folds |
|
|
213
|
+
| `reconstruct_paths` | D6 | Assemble CPCV folds into backtest paths |
|
|
214
|
+
| `probabilistic_sharpe_ratio` | D7 | PSR: P(true SR > benchmark) |
|
|
215
|
+
| `deflated_sharpe_ratio` | D7 | DSR: PSR corrected for multiple testing |
|
|
216
|
+
| `min_track_record_length` | D7 | Minimum observations to establish SR |
|
|
217
|
+
| `diagnostics.*` | D8 | Leakage and embargo audit functions |
|
|
218
|
+
|
|
219
|
+
---
|
|
220
|
+
|
|
221
|
+
|
|
222
|
+
## Methodology references
|
|
223
|
+
|
|
224
|
+
- Lopez de Prado, M. (2018). *Advances in Financial Machine Learning*. Wiley. Chapters 7 (purge/embargo) and 12 (CPCV).
|
|
225
|
+
- Bailey, D. H., & Lopez de Prado, M. (2012). The Sharpe Ratio Efficient Frontier. *Journal of Risk*, 15(2).
|
|
226
|
+
- Bailey, D. H., & Lopez de Prado, M. (2014). The Deflated Sharpe Ratio: Correcting for Selection Bias, Backtest Overfitting and Non-Normality. *Journal of Portfolio Management*, 40(5).
|
|
227
|
+
|
|
228
|
+
---
|
|
229
|
+
|
|
230
|
+
## License
|
|
231
|
+
|
|
232
|
+
MIT. See [LICENSE](LICENSE).
|