purgedcv 0.0.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61) hide show
  1. purgedcv-0.0.1/.github/workflows/ci.yml +127 -0
  2. purgedcv-0.0.1/.gitignore +227 -0
  3. purgedcv-0.0.1/.pre-commit-config.yaml +16 -0
  4. purgedcv-0.0.1/LICENSE +21 -0
  5. purgedcv-0.0.1/PKG-INFO +232 -0
  6. purgedcv-0.0.1/README.md +196 -0
  7. purgedcv-0.0.1/examples/README.md +127 -0
  8. purgedcv-0.0.1/examples/clinical_mortality_physionet.ipynb +789 -0
  9. purgedcv-0.0.1/examples/energy_demand_pjm.ipynb +839 -0
  10. purgedcv-0.0.1/examples/precipitation_noaa.ipynb +640 -0
  11. purgedcv-0.0.1/examples/predictive_maintenance_nasa.ipynb +582 -0
  12. purgedcv-0.0.1/pyproject.toml +104 -0
  13. purgedcv-0.0.1/src/purgedcv/__init__.py +47 -0
  14. purgedcv-0.0.1/src/purgedcv/_base.py +167 -0
  15. purgedcv-0.0.1/src/purgedcv/_cpcv.py +222 -0
  16. purgedcv-0.0.1/src/purgedcv/_embargo.py +63 -0
  17. purgedcv-0.0.1/src/purgedcv/_metrics.py +226 -0
  18. purgedcv-0.0.1/src/purgedcv/_paths.py +125 -0
  19. purgedcv-0.0.1/src/purgedcv/_purge.py +66 -0
  20. purgedcv-0.0.1/src/purgedcv/_purged_kfold.py +221 -0
  21. purgedcv-0.0.1/src/purgedcv/_time.py +136 -0
  22. purgedcv-0.0.1/src/purgedcv/_typing.py +17 -0
  23. purgedcv-0.0.1/src/purgedcv/_walk_forward.py +170 -0
  24. purgedcv-0.0.1/src/purgedcv/diagnostics.py +207 -0
  25. purgedcv-0.0.1/src/purgedcv/exceptions.py +25 -0
  26. purgedcv-0.0.1/tests/__init__.py +0 -0
  27. purgedcv-0.0.1/tests/conftest.py +12 -0
  28. purgedcv-0.0.1/tests/e2e/__init__.py +0 -0
  29. purgedcv-0.0.1/tests/e2e/test_e2e_apply_embargo.py +90 -0
  30. purgedcv-0.0.1/tests/e2e/test_e2e_assert_embargo_respected.py +92 -0
  31. purgedcv-0.0.1/tests/e2e/test_e2e_assert_groups_disjoint.py +87 -0
  32. purgedcv-0.0.1/tests/e2e/test_e2e_assert_no_temporal_leakage.py +101 -0
  33. purgedcv-0.0.1/tests/e2e/test_e2e_backtest_paths.py +104 -0
  34. purgedcv-0.0.1/tests/e2e/test_e2e_compute_overlap_fraction.py +96 -0
  35. purgedcv-0.0.1/tests/e2e/test_e2e_cpcv.py +72 -0
  36. purgedcv-0.0.1/tests/e2e/test_e2e_horizons_overlap.py +75 -0
  37. purgedcv-0.0.1/tests/e2e/test_e2e_metrics.py +110 -0
  38. purgedcv-0.0.1/tests/e2e/test_e2e_parse_horizon.py +77 -0
  39. purgedcv-0.0.1/tests/e2e/test_e2e_purge.py +96 -0
  40. purgedcv-0.0.1/tests/e2e/test_e2e_purged_group_kfold.py +76 -0
  41. purgedcv-0.0.1/tests/e2e/test_e2e_purged_kfold.py +67 -0
  42. purgedcv-0.0.1/tests/e2e/test_e2e_validate_times.py +82 -0
  43. purgedcv-0.0.1/tests/e2e/test_e2e_walk_forward.py +95 -0
  44. purgedcv-0.0.1/tests/e2e/test_install_smoke.py +44 -0
  45. purgedcv-0.0.1/tests/e2e/test_quality_gate.py +61 -0
  46. purgedcv-0.0.1/tests/test_base_splitter.py +301 -0
  47. purgedcv-0.0.1/tests/test_cpcv.py +227 -0
  48. purgedcv-0.0.1/tests/test_diagnostics.py +243 -0
  49. purgedcv-0.0.1/tests/test_embargo.py +88 -0
  50. purgedcv-0.0.1/tests/test_exceptions.py +47 -0
  51. purgedcv-0.0.1/tests/test_metrics.py +304 -0
  52. purgedcv-0.0.1/tests/test_paths.py +163 -0
  53. purgedcv-0.0.1/tests/test_public_api.py +120 -0
  54. purgedcv-0.0.1/tests/test_purge.py +176 -0
  55. purgedcv-0.0.1/tests/test_purge_embargo_properties.py +191 -0
  56. purgedcv-0.0.1/tests/test_purged_kfold.py +223 -0
  57. purgedcv-0.0.1/tests/test_sklearn_integration.py +153 -0
  58. purgedcv-0.0.1/tests/test_splitter_properties.py +148 -0
  59. purgedcv-0.0.1/tests/test_time.py +171 -0
  60. purgedcv-0.0.1/tests/test_time_properties.py +96 -0
  61. purgedcv-0.0.1/tests/test_walk_forward.py +146 -0
@@ -0,0 +1,127 @@
1
+ name: CI
2
+
3
+ on:
4
+ push:
5
+ branches: [main]
6
+ pull_request:
7
+ branches: [main]
8
+
9
+ permissions:
10
+ contents: write
11
+
12
+ jobs:
13
+ test:
14
+ runs-on: ubuntu-latest
15
+ strategy:
16
+ fail-fast: false
17
+ matrix:
18
+ python-version: ["3.10", "3.11", "3.12"]
19
+ steps:
20
+ - uses: actions/checkout@v4
21
+
22
+ - name: Set up Python ${{ matrix.python-version }}
23
+ uses: actions/setup-python@v5
24
+ with:
25
+ python-version: ${{ matrix.python-version }}
26
+
27
+ - name: Install
28
+ run: |
29
+ python -m pip install --upgrade pip
30
+ pip install -e ".[dev]"
31
+
32
+ - name: Ruff lint
33
+ run: ruff check .
34
+
35
+ - name: Ruff format check
36
+ run: ruff format --check .
37
+
38
+ - name: Mypy (strict)
39
+ run: mypy src tests
40
+
41
+ - name: Pytest
42
+ run: pytest -q
43
+
44
+ # Publish the current version, then bump the patch for next time.
45
+ # Order is deliberate: pyproject.toml starts at 0.0.1, so the first automated
46
+ # release is exactly 0.0.1; --skip-existing makes re-runs safe.
47
+ release:
48
+ needs: test
49
+ if: >-
50
+ github.event_name == 'push' &&
51
+ github.ref == 'refs/heads/main' &&
52
+ github.actor != 'github-actions[bot]'
53
+ runs-on: ubuntu-latest
54
+ steps:
55
+ - uses: actions/checkout@v4
56
+ with:
57
+ token: ${{ secrets.GITHUB_TOKEN }}
58
+
59
+ - name: Set up Python
60
+ uses: actions/setup-python@v5
61
+ with:
62
+ python-version: "3.11"
63
+
64
+ - name: Install tooling
65
+ run: |
66
+ python -m pip install --upgrade pip
67
+ pip install build twine anybadge
68
+ pip install -e ".[dev]"
69
+
70
+ - name: Read current version
71
+ id: ver
72
+ run: |
73
+ v=$(python -c "import tomllib,pathlib;print(tomllib.loads(pathlib.Path('pyproject.toml').read_text())['project']['version'])")
74
+ echo "version=$v" >> "$GITHUB_OUTPUT"
75
+
76
+ - name: Build
77
+ run: python -m build
78
+
79
+ - name: Publish to PyPI
80
+ env:
81
+ TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }}
82
+ TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }}
83
+ run: twine upload --non-interactive --skip-existing dist/*
84
+
85
+ - name: Coverage badge
86
+ run: |
87
+ pytest -q --cov=purgedcv --cov-report=json:coverage.json >/dev/null
88
+ cov=$(jq '.totals.percent_covered | round' coverage.json)
89
+ echo "coverage: ${cov}%"
90
+ mkdir -p .github/badges
91
+ anybadge -o --value="${cov}%" --file=.github/badges/coverage.svg --label=coverage --color="#007ec6"
92
+
93
+ - name: Bump patch version for next release
94
+ id: bump
95
+ run: |
96
+ new=$(python - <<'PY'
97
+ import re, pathlib
98
+ p = pathlib.Path("pyproject.toml")
99
+ t = p.read_text()
100
+ m = re.search(r'(?m)^version = "(\d+)\.(\d+)\.(\d+)"$', t)
101
+ if not m:
102
+ raise SystemExit('expected version = "X.Y.Z" in pyproject.toml')
103
+ maj, mnr, pat = map(int, m.groups())
104
+ nv = f"{maj}.{mnr}.{pat + 1}"
105
+ p.write_text(t[: m.start()] + f'version = "{nv}"' + t[m.end() :])
106
+ print(nv)
107
+ PY
108
+ )
109
+ echo "next=$new" >> "$GITHUB_OUTPUT"
110
+
111
+ - name: Commit badge + version bump
112
+ run: |
113
+ git config user.name 'github-actions[bot]'
114
+ git config user.email 'github-actions[bot]@users.noreply.github.com'
115
+ git add pyproject.toml .github/badges/coverage.svg
116
+ git commit -m "chore: release v${{ steps.ver.outputs.version }}, bump to v${{ steps.bump.outputs.next }}"
117
+ git push
118
+
119
+ - name: GitHub Release
120
+ uses: softprops/action-gh-release@v2
121
+ with:
122
+ tag_name: "v${{ steps.ver.outputs.version }}"
123
+ name: "v${{ steps.ver.outputs.version }}"
124
+ body: "Automated release of v${{ steps.ver.outputs.version }}."
125
+ generate_release_notes: true
126
+ env:
127
+ GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
@@ -0,0 +1,227 @@
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[codz]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # PyInstaller
30
+ # Usually these files are written by a python script from a template
31
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
32
+ *.manifest
33
+ *.spec
34
+
35
+ # Installer logs
36
+ pip-log.txt
37
+ pip-delete-this-directory.txt
38
+
39
+ # Unit test / coverage reports
40
+ htmlcov/
41
+ .tox/
42
+ .nox/
43
+ .coverage
44
+ .coverage.*
45
+ .cache
46
+ nosetests.xml
47
+ coverage.xml
48
+ *.cover
49
+ *.py.cover
50
+ .hypothesis/
51
+ .pytest_cache/
52
+ cover/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ .pybuilder/
76
+ target/
77
+
78
+ # Jupyter Notebook
79
+ .ipynb_checkpoints
80
+
81
+ # IPython
82
+ profile_default/
83
+ ipython_config.py
84
+
85
+ # pyenv
86
+ # For a library or package, you might want to ignore these files since the code is
87
+ # intended to run in multiple environments; otherwise, check them in:
88
+ # .python-version
89
+
90
+ # pipenv
91
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
93
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
94
+ # install all needed dependencies.
95
+ # Pipfile.lock
96
+
97
+ # UV
98
+ # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
99
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
100
+ # commonly ignored for libraries.
101
+ # uv.lock
102
+
103
+ # poetry
104
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
105
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
106
+ # commonly ignored for libraries.
107
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
108
+ # poetry.lock
109
+ # poetry.toml
110
+
111
+ # pdm
112
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
113
+ # pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.
114
+ # https://pdm-project.org/en/latest/usage/project/#working-with-version-control
115
+ # pdm.lock
116
+ # pdm.toml
117
+ .pdm-python
118
+ .pdm-build/
119
+
120
+ # pixi
121
+ # Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.
122
+ # pixi.lock
123
+ # Pixi creates a virtual environment in the .pixi directory, just like venv module creates one
124
+ # in the .venv directory. It is recommended not to include this directory in version control.
125
+ .pixi
126
+
127
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
128
+ __pypackages__/
129
+
130
+ # Celery stuff
131
+ celerybeat-schedule
132
+ celerybeat.pid
133
+
134
+ # Redis
135
+ *.rdb
136
+ *.aof
137
+ *.pid
138
+
139
+ # RabbitMQ
140
+ mnesia/
141
+ rabbitmq/
142
+ rabbitmq-data/
143
+
144
+ # ActiveMQ
145
+ activemq-data/
146
+
147
+ # SageMath parsed files
148
+ *.sage.py
149
+
150
+ # Environments
151
+ .env
152
+ .envrc
153
+ .venv
154
+ env/
155
+ venv/
156
+ ENV/
157
+ env.bak/
158
+ venv.bak/
159
+
160
+ # Spyder project settings
161
+ .spyderproject
162
+ .spyproject
163
+
164
+ # Rope project settings
165
+ .ropeproject
166
+
167
+ # mkdocs documentation
168
+ /site
169
+
170
+ # mypy
171
+ .mypy_cache/
172
+ .dmypy.json
173
+ dmypy.json
174
+
175
+ # Pyre type checker
176
+ .pyre/
177
+
178
+ # pytype static type analyzer
179
+ .pytype/
180
+
181
+ # Cython debug symbols
182
+ cython_debug/
183
+
184
+ # PyCharm
185
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
186
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
187
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
188
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
189
+ .idea/
190
+
191
+ # Abstra
192
+ # Abstra is an AI-powered process automation framework.
193
+ # Ignore directories containing user credentials, local state, and settings.
194
+ # Learn more at https://abstra.io/docs
195
+ .abstra/
196
+
197
+ # Visual Studio Code
198
+ # Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
199
+ # that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
200
+ # and can be added to the global gitignore or merged into this file. However, if you prefer,
201
+ # you could uncomment the following to ignore the entire vscode folder
202
+ # .vscode/
203
+ # Temporary file for partial code execution
204
+ tempCodeRunnerFile.py
205
+
206
+ # Ruff stuff:
207
+ .ruff_cache/
208
+
209
+ # PyPI configuration file
210
+ .pypirc
211
+
212
+ # Marimo
213
+ marimo/_static/
214
+ marimo/_lsp/
215
+ __marimo__/
216
+
217
+ # Streamlit
218
+ .streamlit/secrets.toml
219
+
220
+ # Example notebook downloaded datasets (lazily fetched on first run)
221
+ examples/data/
222
+
223
+ # docs/ kept locally for reference, not published (internal plans, drafts)
224
+ docs/
225
+
226
+ # Local dev tooling, not published
227
+ tools/
@@ -0,0 +1,16 @@
1
+ repos:
2
+ - repo: https://github.com/astral-sh/ruff-pre-commit
3
+ rev: v0.15.12
4
+ hooks:
5
+ - id: ruff
6
+ args: [--fix]
7
+ - id: ruff-format
8
+ - repo: https://github.com/pre-commit/pre-commit-hooks
9
+ rev: v6.0.0
10
+ hooks:
11
+ - id: trailing-whitespace
12
+ - id: end-of-file-fixer
13
+ - id: check-yaml
14
+ - id: check-toml
15
+ - id: check-added-large-files
16
+ args: [--maxkb=500]
purgedcv-0.0.1/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Evgenii Lazarev
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,232 @@
1
+ Metadata-Version: 2.4
2
+ Name: purgedcv
3
+ Version: 0.0.1
4
+ Summary: scikit-learn-compatible cross-validation for time-series machine learning: purging, embargoes, combinatorial backtest paths.
5
+ Project-URL: Homepage, https://github.com/eslazarev/purged-cross-validation
6
+ Project-URL: Repository, https://github.com/eslazarev/purged-cross-validation
7
+ Author: Evgenii Lazarev
8
+ License: MIT
9
+ License-File: LICENSE
10
+ Keywords: combinatorial-cv,cross-validation,machine-learning,purged-kfold,scikit-learn,time-series
11
+ Classifier: Development Status :: 3 - Alpha
12
+ Classifier: Intended Audience :: Science/Research
13
+ Classifier: License :: OSI Approved :: MIT License
14
+ Classifier: Programming Language :: Python :: 3.10
15
+ Classifier: Programming Language :: Python :: 3.11
16
+ Classifier: Programming Language :: Python :: 3.12
17
+ Classifier: Topic :: Scientific/Engineering
18
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
19
+ Requires-Python: >=3.10
20
+ Requires-Dist: numpy>=1.24
21
+ Requires-Dist: pandas>=2.0
22
+ Requires-Dist: scikit-learn>=1.3
23
+ Requires-Dist: scipy>=1.10
24
+ Provides-Extra: dev
25
+ Requires-Dist: hypothesis>=6.80; extra == 'dev'
26
+ Requires-Dist: mypy>=1.10; extra == 'dev'
27
+ Requires-Dist: pandas-stubs>=2.0; extra == 'dev'
28
+ Requires-Dist: pre-commit>=3.5; extra == 'dev'
29
+ Requires-Dist: pytest-cov>=4.1; extra == 'dev'
30
+ Requires-Dist: pytest>=7.4; extra == 'dev'
31
+ Requires-Dist: ruff>=0.5; extra == 'dev'
32
+ Provides-Extra: examples
33
+ Requires-Dist: jupyter>=1.0; extra == 'examples'
34
+ Requires-Dist: matplotlib>=3.7; extra == 'examples'
35
+ Description-Content-Type: text/markdown
36
+
37
+ # Purged cross validation
38
+
39
+ **scikit-learn-compatible cross-validation for time-series machine learning: purging, embargoes, and combinatorial backtest paths.**
40
+
41
+ [![CI](https://github.com/eslazarev/purged-cross-validation/actions/workflows/ci.yml/badge.svg)](https://github.com/eslazarev/purged-cross-validation/actions/workflows/ci.yml)
42
+ ![Coverage](https://raw.githubusercontent.com/eslazarev/purged-cross-validation/refs/heads/main/.github/badges/coverage.svg)
43
+ [![PyPI version](https://img.shields.io/pypi/v/purgedcv)](https://pypi.org/project/purgedcv/)
44
+ [![PyPI downloads](https://static.pepy.tech/badge/purgedcv)](https://pepy.tech/project/purgedcv)
45
+ [![PyPI wheel](https://img.shields.io/pypi/wheel/purgedcv)](https://pypi.org/project/purgedcv/#files)
46
+
47
+ [![Python versions](https://img.shields.io/badge/python-3.10%20%7C%203.11%20%7C%203.12-blue)](https://www.python.org/downloads/)
48
+ [![License: MIT](https://img.shields.io/badge/license-MIT-green)](LICENSE)
49
+ [![Ruff](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/astral-sh/ruff/main/assets/badge/v2.json)](https://github.com/astral-sh/ruff)
50
+ [![Checked with mypy](https://www.mypy-lang.org/static/mypy_badge.svg)](https://mypy-lang.org/)
51
+ [![pre-commit](https://img.shields.io/badge/pre--commit-enabled-brightgreen?logo=pre-commit&logoColor=white)](https://pre-commit.com/)
52
+ [![Development status: alpha](https://img.shields.io/badge/status-alpha-orange)](https://pypi.org/project/purgedcv/)
53
+
54
+ **[Example notebooks →](examples/)** — purge/embargo, walk-forward, and CPCV with PSR/DSR worked end to end on real ICU-mortality, turbofan-RUL, rainfall, and electricity-demand data.
55
+
56
+ ---
57
+
58
+ ## The problem
59
+
60
+ Standard k-fold cross-validation assumes the rows are independent. Time-series data is not. When a label resolves over the next few days, it overlaps the labels sitting right next to it, so an ordinary shuffle-split leaks tomorrow's answer back into training. The rows immediately after a test window leak too, because they are serially correlated with it. Both effects quietly inflate backtested Sharpe ratios and hand you strategies that look great on a chart and bleed money once they go live. This library removes both.
61
+
62
+ Why write another one? People have asked scikit-learn, auto-sklearn, and mlpack for purging and embargo support and been turned down or left waiting for years. The one mature implementation, mlfinlab, went closed-source and paid. The free alternative has been unmaintained since 2018. That gap is the reason this exists.
63
+
64
+ ---
65
+
66
+ ## Installation
67
+
68
+ ```bash
69
+
70
+ pip install purgedcv
71
+
72
+ # Directly from the repository
73
+ pip install git+https://github.com/eslazarev/purged-cross-validation.git
74
+ ```
75
+
76
+ ---
77
+
78
+ ## Quickstart
79
+
80
+ ### 1. Foundation primitives: `purge`, `apply_embargo`, and diagnostics
81
+
82
+ Build a manual split, clean it with the purge and embargo primitives, then audit it with the diagnostics submodule.
83
+
84
+ ```python
85
+ import numpy as np
86
+ import pandas as pd
87
+ from purgedcv import purge, apply_embargo
88
+ from purgedcv.diagnostics import assert_no_temporal_leakage, assert_embargo_respected
89
+
90
+ # 30 daily bars; each bar's label resolves 2 days later
91
+ pred = pd.Series(pd.date_range("2024-01-01", periods=30, freq="D"))
92
+ evalu = pred + pd.Timedelta(days=2)
93
+
94
+ train_idx = np.arange(0, 20)
95
+ test_idx = np.arange(20, 30)
96
+
97
+ # Remove training rows whose label horizon overlaps the test window
98
+ clean_train = purge(train_idx, test_idx, pred, evalu)
99
+
100
+ # Drop the 3-day post-test buffer from training
101
+ clean_train = apply_embargo(
102
+ clean_train, test_idx, pred, evalu, embargo=pd.Timedelta(days=3)
103
+ )
104
+
105
+ # Assert the split is now leak-free (raises TemporalLeakageError if not)
106
+ assert_no_temporal_leakage(clean_train, test_idx, pred, evalu)
107
+ assert_embargo_respected(clean_train, test_idx, pred, evalu, embargo="3D")
108
+ print(f"Clean training rows: {len(clean_train)}") # 19
109
+ ```
110
+
111
+ ---
112
+
113
+ ### 2. Splitters with scikit-learn: `PurgedKFold` inside `cross_val_score`
114
+
115
+ Drop-in replacement for `KFold` that applies purge and embargo automatically on every fold.
116
+
117
+ ```python
118
+ import numpy as np
119
+ import pandas as pd
120
+ from sklearn.linear_model import Ridge
121
+ from sklearn.model_selection import cross_val_score
122
+ from purgedcv import PurgedKFold
123
+
124
+ rng = np.random.default_rng(0)
125
+ n = 200
126
+ pred = pd.Series(pd.date_range("2022-01-01", periods=n, freq="D"))
127
+ evalu = pred + pd.Timedelta(days=3)
128
+ X = rng.standard_normal((n, 5))
129
+ y = X @ rng.standard_normal(5) + rng.standard_normal(n) * 0.5
130
+
131
+ cv = PurgedKFold(
132
+ n_splits=5,
133
+ prediction_times=pred,
134
+ evaluation_times=evalu,
135
+ purge_horizon="3D", # matches label horizon
136
+ embargo="1D", # 1-day post-test buffer
137
+ )
138
+
139
+ scores = cross_val_score(Ridge(), X, y, cv=cv, scoring="r2")
140
+ print(f"R² per fold: {scores.round(3)}")
141
+ ```
142
+
143
+ All four splitters (`WalkForwardSplit`, `PurgedKFold`, `PurgedGroupKFold`, `CombinatorialPurgedCV`) satisfy the sklearn splitter protocol and work inside `GridSearchCV` and `Pipeline`.
144
+
145
+ ---
146
+
147
+ ### 3. CPCV + path reconstruction + metrics: the full workflow
148
+
149
+ Combinatorial Purged CV produces C(N, K) folds that tile into multiple out-of-sample backtest paths. Use PSR and DSR to evaluate them with corrections for non-normality and selection bias.
150
+
151
+ ```python
152
+ import numpy as np
153
+ import pandas as pd
154
+ from sklearn.dummy import DummyRegressor
155
+ from purgedcv import (
156
+ CombinatorialPurgedCV,
157
+ probabilistic_sharpe_ratio,
158
+ deflated_sharpe_ratio,
159
+ min_track_record_length,
160
+ )
161
+
162
+ rng = np.random.default_rng(42)
163
+ n = 120
164
+ pred = pd.Series(pd.date_range("2023-01-01", periods=n, freq="D"))
165
+ evalu = pred + pd.Timedelta(days=2)
166
+ X = rng.standard_normal((n, 3))
167
+ y = X @ np.array([0.5, -0.3, 0.2]) + rng.standard_normal(n) * 0.1
168
+
169
+ # N=6, K=2 → C(6,2) = 15 folds → 15 × 2 / 6 = 5 backtest paths
170
+ cv = CombinatorialPurgedCV(
171
+ n_splits=6,
172
+ n_test_groups=2,
173
+ prediction_times=pred,
174
+ evaluation_times=evalu,
175
+ )
176
+
177
+ # paths.shape == (n_paths, n_samples); NaN where a sample was not OOS
178
+ paths = cv.backtest_paths(DummyRegressor(strategy="mean"), X, y)
179
+ print(f"Backtest paths: {paths.shape}") # (5, 120)
180
+
181
+ # Derive a toy "return" series and compute per-path PSR
182
+ per_path_returns = paths - y[np.newaxis, :]
183
+ per_path_psr = [
184
+ probabilistic_sharpe_ratio(row[np.isfinite(row)], benchmark_skill=0.0)
185
+ for row in per_path_returns
186
+ ]
187
+ print(f"PSR per path: {[round(p, 3) for p in per_path_psr]}")
188
+
189
+ # DSR corrects for testing 5 paths simultaneously
190
+ first = per_path_returns[0]
191
+ dsr = deflated_sharpe_ratio(first[np.isfinite(first)], n_trials=5, var_sharpe=0.01**2)
192
+ print(f"Deflated SR (first path): {dsr:.3f}")
193
+
194
+ # Minimum observations needed to prove SR=0.7 beats benchmark SR=0.5 at 95% confidence
195
+ n_min = min_track_record_length(
196
+ observed_sharpe=0.7, target_sharpe=0.5, alpha=0.05, skew=0.0, kurtosis=3.0
197
+ )
198
+ print(f"MinTRL: {int(n_min)} observations")
199
+ ```
200
+
201
+ ---
202
+
203
+ ## API summary
204
+
205
+ | Symbol | Domain | Description |
206
+ |---|---|---|
207
+ | `purge` | D2 | Remove overlapping-horizon training rows |
208
+ | `apply_embargo` | D3 | Remove post-test buffer rows |
209
+ | `WalkForwardSplit` | D5.1 | Sliding / expanding walk-forward CV |
210
+ | `PurgedKFold` | D5.2 | Contiguous test folds with purge + embargo |
211
+ | `PurgedGroupKFold` | D5.3 | Group-aware purged k-fold |
212
+ | `CombinatorialPurgedCV` | D5.4 | C(N,K) combinatorial folds |
213
+ | `reconstruct_paths` | D6 | Assemble CPCV folds into backtest paths |
214
+ | `probabilistic_sharpe_ratio` | D7 | PSR: P(true SR > benchmark) |
215
+ | `deflated_sharpe_ratio` | D7 | DSR: PSR corrected for multiple testing |
216
+ | `min_track_record_length` | D7 | Minimum observations to establish SR |
217
+ | `diagnostics.*` | D8 | Leakage and embargo audit functions |
218
+
219
+ ---
220
+
221
+
222
+ ## Methodology references
223
+
224
+ - Lopez de Prado, M. (2018). *Advances in Financial Machine Learning*. Wiley. Chapters 7 (purge/embargo) and 12 (CPCV).
225
+ - Bailey, D. H., & Lopez de Prado, M. (2012). The Sharpe Ratio Efficient Frontier. *Journal of Risk*, 15(2).
226
+ - Bailey, D. H., & Lopez de Prado, M. (2014). The Deflated Sharpe Ratio: Correcting for Selection Bias, Backtest Overfitting and Non-Normality. *Journal of Portfolio Management*, 40(5).
227
+
228
+ ---
229
+
230
+ ## License
231
+
232
+ MIT. See [LICENSE](LICENSE).