eval-toolkit 0.27.2__tar.gz → 0.28.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/.gitignore +3 -0
- {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/CHANGELOG.md +208 -0
- {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/PKG-INFO +12 -1
- {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/README.md +6 -0
- {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/pyproject.toml +12 -1
- {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/src/eval_toolkit/__init__.py +2 -0
- {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/src/eval_toolkit/_version.py +1 -1
- {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/src/eval_toolkit/harness.py +5 -3
- {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/src/eval_toolkit/splits.py +282 -0
- eval_toolkit-0.28.1/tests/golden/bootstrap_ci/cases.json +50 -0
- eval_toolkit-0.28.1/tests/golden/public_api/snapshot.json +1513 -0
- {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/tests/test_artifacts.py +91 -0
- eval_toolkit-0.28.1/tests/test_bootstrap_calibration_mc.py +306 -0
- eval_toolkit-0.28.1/tests/test_bootstrap_golden.py +215 -0
- eval_toolkit-0.28.1/tests/test_calibration_determinism.py +114 -0
- eval_toolkit-0.28.1/tests/test_harness_fault_injection.py +179 -0
- {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/tests/test_metrics_props.py +99 -0
- eval_toolkit-0.28.1/tests/test_pipeline_e2e.py +258 -0
- eval_toolkit-0.28.1/tests/test_public_api.py +186 -0
- eval_toolkit-0.28.1/tests/test_splits.py +365 -0
- eval_toolkit-0.28.1/tests/test_thresholds.py +265 -0
- eval_toolkit-0.27.2/tests/test_splits.py +0 -162
- eval_toolkit-0.27.2/tests/test_thresholds.py +0 -155
- {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/LICENSE +0 -0
- {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/STYLE.md +0 -0
- {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/docs/methodology/README.md +0 -0
- {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/docs/research/README.md +0 -0
- {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/docs/research/datasets/README.md +0 -0
- {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/docs/research/papers/data-integrity/README.md +0 -0
- {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/docs/research/papers/eval-ecosystem/README.md +0 -0
- {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/docs/research/papers/inference/README.md +0 -0
- {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/docs/research/papers/prompt-injection/README.md +0 -0
- {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/src/eval_toolkit/__main__.py +0 -0
- {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/src/eval_toolkit/analysis.py +0 -0
- {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/src/eval_toolkit/artifacts.py +0 -0
- {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/src/eval_toolkit/bootstrap.py +0 -0
- {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/src/eval_toolkit/calibration.py +0 -0
- {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/src/eval_toolkit/claims.py +0 -0
- {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/src/eval_toolkit/config.py +0 -0
- {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/src/eval_toolkit/docs.py +0 -0
- {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/src/eval_toolkit/evidence.py +0 -0
- {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/src/eval_toolkit/leakage.py +0 -0
- {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/src/eval_toolkit/loaders.py +0 -0
- {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/src/eval_toolkit/manifest.py +0 -0
- {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/src/eval_toolkit/metrics.py +0 -0
- {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/src/eval_toolkit/operating_points.py +0 -0
- {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/src/eval_toolkit/paths.py +0 -0
- {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/src/eval_toolkit/plotting.py +0 -0
- {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/src/eval_toolkit/protocols.py +0 -0
- {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/src/eval_toolkit/provenance.py +0 -0
- {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/src/eval_toolkit/py.typed +0 -0
- {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/src/eval_toolkit/schemas/manifest.v1.json +0 -0
- {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/src/eval_toolkit/schemas/manifest.v2.json +0 -0
- {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/src/eval_toolkit/schemas/manifest.v3.json +0 -0
- {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/src/eval_toolkit/schemas/results.v1.json +0 -0
- {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/src/eval_toolkit/schemas/results_full.v1.json +0 -0
- {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/src/eval_toolkit/seeds.py +0 -0
- {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/src/eval_toolkit/text_dedup.py +0 -0
- {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/src/eval_toolkit/thresholds.py +0 -0
- {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/tests/baseline/test_plotting_visual/plot_bootstrap_distribution.png +0 -0
- {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/tests/baseline/test_plotting_visual/plot_confusion_matrix_grid.png +0 -0
- {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/tests/baseline/test_plotting_visual/plot_lift_ci.png +0 -0
- {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/tests/baseline/test_plotting_visual/plot_metric_bars.png +0 -0
- {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/tests/baseline/test_plotting_visual/plot_pr_curve.png +0 -0
- {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/tests/baseline/test_plotting_visual/plot_reliability_diagram.png +0 -0
- {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/tests/baseline/test_plotting_visual/plot_score_histograms.png +0 -0
- {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/tests/conftest.py +0 -0
- {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/tests/golden/docs/expected.md +0 -0
- {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/tests/golden/docs/input.md +0 -0
- {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/tests/golden/docs/metrics.json +0 -0
- {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/tests/strategies.py +0 -0
- {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/tests/test_analysis.py +0 -0
- {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/tests/test_bootstrap_edge_cases.py +0 -0
- {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/tests/test_bootstrap_props.py +0 -0
- {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/tests/test_bootstrap_research_grounded.py +0 -0
- {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/tests/test_bootstrap_unit.py +0 -0
- {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/tests/test_calibration_bootstrap_chain.py +0 -0
- {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/tests/test_calibration_optimization_failures.py +0 -0
- {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/tests/test_calibration_props.py +0 -0
- {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/tests/test_calibration_research_grounded.py +0 -0
- {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/tests/test_calibration_unit.py +0 -0
- {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/tests/test_claims.py +0 -0
- {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/tests/test_claims_coverage.py +0 -0
- {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/tests/test_claims_props.py +0 -0
- {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/tests/test_cli.py +0 -0
- {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/tests/test_config.py +0 -0
- {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/tests/test_coverage_gap.py +0 -0
- {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/tests/test_dedup_split_leakage_chain.py +0 -0
- {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/tests/test_docs_golden.py +0 -0
- {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/tests/test_docs_props.py +0 -0
- {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/tests/test_evidence_validators.py +0 -0
- {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/tests/test_harness_edge_cases.py +0 -0
- {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/tests/test_harness_internals.py +0 -0
- {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/tests/test_harness_smoke.py +0 -0
- {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/tests/test_harness_v07.py +0 -0
- {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/tests/test_harness_v22.py +0 -0
- {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/tests/test_import_boundaries.py +0 -0
- {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/tests/test_leakage.py +0 -0
- {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/tests/test_leakage_error_paths.py +0 -0
- {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/tests/test_leakage_props.py +0 -0
- {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/tests/test_loaders.py +0 -0
- {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/tests/test_loaders_coverage.py +0 -0
- {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/tests/test_loaders_props.py +0 -0
- {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/tests/test_manifest.py +0 -0
- {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/tests/test_manifest_contamination_round_trip.py +0 -0
- {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/tests/test_manifest_props.py +0 -0
- {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/tests/test_manifest_validation.py +0 -0
- {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/tests/test_metrics_stratified_subsets.py +0 -0
- {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/tests/test_metrics_unit.py +0 -0
- {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/tests/test_misc_coverage.py +0 -0
- {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/tests/test_numeric_edge_cases.py +0 -0
- {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/tests/test_operating_points.py +0 -0
- {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/tests/test_operating_points_props.py +0 -0
- {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/tests/test_paths.py +0 -0
- {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/tests/test_plotting_edge.py +0 -0
- {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/tests/test_plotting_smoke.py +0 -0
- {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/tests/test_plotting_visual.py +0 -0
- {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/tests/test_protocol_conformance.py +0 -0
- {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/tests/test_provenance.py +0 -0
- {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/tests/test_reference_equivalence.py +0 -0
- {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/tests/test_reproducibility_integration.py +0 -0
- {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/tests/test_schemas.py +0 -0
- {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/tests/test_seeds.py +0 -0
- {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/tests/test_splits_leakage_integration.py +0 -0
- {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/tests/test_splits_props.py +0 -0
- {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/tests/test_text_dedup.py +0 -0
- {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/tests/test_text_dedup_coverage.py +0 -0
- {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/tests/test_text_dedup_props.py +0 -0
- {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/tests/test_text_dedup_strategies.py +0 -0
- {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/tests/test_thresholds_constant_score.py +0 -0
- {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/tests/test_thresholds_coverage.py +0 -0
- {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/tests/test_thresholds_props.py +0 -0
- {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/tests/test_thresholds_research_grounded.py +0 -0
- {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/tests/test_v09_contracts.py +0 -0
|
@@ -7,6 +7,214 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
|
|
7
7
|
|
|
8
8
|
## [Unreleased]
|
|
9
9
|
|
|
10
|
+
## [0.28.1] — 2026-05-15 — security-patch (CodeQL + pip-audit)
|
|
11
|
+
|
|
12
|
+
Tier α of the post-v0.28.0 best-practice gap audit. Pure CI/security
|
|
13
|
+
infrastructure additions; zero source-code or behavior changes.
|
|
14
|
+
|
|
15
|
+
### Added
|
|
16
|
+
|
|
17
|
+
- `.github/workflows/codeql.yml`: GitHub's CodeQL static analyzer
|
|
18
|
+
on push/PR/weekly cron (Sundays 04:00 UTC). Uses the
|
|
19
|
+
`security-extended` query suite. Findings populate the repo's
|
|
20
|
+
Security → Code scanning tab.
|
|
21
|
+
- pip-audit step in the existing `test-base-install` CI job:
|
|
22
|
+
scans the runtime-only venv (`numpy` / `scipy` / `scikit-learn` /
|
|
23
|
+
`jsonschema`) for known CVEs on every PR. Fails CI on any finding.
|
|
24
|
+
Dev-extras vulns (pytest, hypothesis, etc.) are not gated —
|
|
25
|
+
they are surfaced through Dependabot instead. Per the v0.28.1 plan Q3=C
|
|
26
|
+
(runtime-deps-only gate).
|
|
27
|
+
|
|
28
|
+
### Internal
|
|
29
|
+
|
|
30
|
+
- Audit discovered that `mypy --strict --no-implicit-reexport src/`
|
|
31
|
+
already passes with zero issues on the v0.28.0 source. The
|
|
32
|
+
planned Tier α #3 "chase remaining Any leaks" task was a no-op —
|
|
33
|
+
no commit shipped for it.
|
|
34
|
+
- pip-audit on current runtime deps: zero known vulnerabilities.
|
|
35
|
+
|
|
36
|
+
## [0.28.0] — 2026-05-15 — temporalcv cross-pollination bundle
|
|
37
|
+
|
|
38
|
+
Six-section bundle adopting the highest-value patterns from the
|
|
39
|
+
sibling `temporalcv` project plus public-repo polish + hosted docs.
|
|
40
|
+
Major additions: `PurgedKFoldSplitter` for label-overlap-protected
|
|
41
|
+
cross-validation, nightly Monte Carlo bootstrap CI calibration
|
|
42
|
+
testing, 6-example documentation gallery, a hosted mkdocs-material
|
|
43
|
+
docs site with MathJax + tikzjax for full LaTeX + TikZ rendering,
|
|
44
|
+
SECURITY.md + CITATION.cff for public-repo polish, and a
|
|
45
|
+
documentary mutmut audit cataloguing math-kernel test strength.
|
|
46
|
+
|
|
47
|
+
### Added
|
|
48
|
+
|
|
49
|
+
- Section F (mutmut audit, from temporalcv-cross-pollination bundle):
|
|
50
|
+
added `docs/internals/mutmut_audit.md` — documentary code-analysis
|
|
51
|
+
audit of the 5 math kernel modules (`metrics`, `bootstrap`,
|
|
52
|
+
`calibration`, `operating_points`, `thresholds`). Per Q10=A
|
|
53
|
+
acceptance (audit-only, no kill-rate target), the deliverable is
|
|
54
|
+
a catalog of likely surviving mutant patterns per module + an
|
|
55
|
+
assessment of whether the existing test suite would catch them.
|
|
56
|
+
Identifies 3 specific high-leverage gaps for future work:
|
|
57
|
+
(a) calibration fit-vs-eval data isolation, (b) BCa degenerate-
|
|
58
|
+
jackknife fallback assertion strengthening, (c) `empty_strategy`
|
|
59
|
+
default lock-in tests. Programmatic mutmut run deferred: mutmut
|
|
60
|
+
3.5.0 has a config-parsing bug in our env where `tests_dir =
|
|
61
|
+
"tests/"` is split character-by-character — revisit with mutmut
|
|
62
|
+
v4 or cosmic-ray. Re-run instructions captured in the audit doc.
|
|
63
|
+
|
|
64
|
+
- Section E.2 (mkdocs link cleanup, from temporalcv-cross-pollination
|
|
65
|
+
bundle): fixed 30+ broken relative links across 18 documentation
|
|
66
|
+
files. Pattern: docs that link to `../src/eval_toolkit/<X>.py`
|
|
67
|
+
(works on GitHub render but breaks in mkdocs) now point at the
|
|
68
|
+
auto-generated API reference page (`api/<X>.md`). CHANGELOG.md
|
|
69
|
+
references (also outside the docs tree) repointed to absolute
|
|
70
|
+
GitHub URLs. Build warnings dropped from 93 to 1: the remaining
|
|
71
|
+
`griffe: π : float` is a documented tool limitation — griffe
|
|
72
|
+
doesn't parse Unicode parameter names; the project's STYLE.md
|
|
73
|
+
intentionally allows Unicode in math kernels (`π`, `α`, etc.).
|
|
74
|
+
Also patched `harness.py` RunResult docstring: replaced the Sphinx
|
|
75
|
+
`.. versionchanged::` directive with a NumPy "Notes" section so
|
|
76
|
+
mkdocstrings renders it cleanly. `mkdocs build --strict` would
|
|
77
|
+
fail on the 1 remaining griffe warning, so the docs.yml workflow
|
|
78
|
+
intentionally runs without `--strict`. The link-cleanup deliverable
|
|
79
|
+
is complete; the source-docstring + methodology enrichment passes
|
|
80
|
+
originally scoped for E.2 are deferred (existing docstrings already
|
|
81
|
+
carry References + LaTeX where it matters; methodology pages are
|
|
82
|
+
already strong content-wise — only the link structure needed fixing).
|
|
83
|
+
|
|
84
|
+
- Section E.1 (hosted documentation site, from
|
|
85
|
+
temporalcv-cross-pollination bundle): new mkdocs-material site at
|
|
86
|
+
`https://brandon-behring.github.io/eval-toolkit/`, auto-generated
|
|
87
|
+
from existing Markdown docs + `mkdocstrings`-rendered API reference.
|
|
88
|
+
- `mkdocs.yml` configures the material theme (auto light/dark,
|
|
89
|
+
tabs nav, code-copy buttons, full-text search) with MathJax v3 +
|
|
90
|
+
tikzjax loaded from CDN for full LaTeX + TikZ rendering
|
|
91
|
+
(per Q12=B).
|
|
92
|
+
- `docs/index.md` — site landing page
|
|
93
|
+
- `docs/api/index.md` — curated API landing organized by README's
|
|
94
|
+
three-tier architecture (Tier 1 functional core, Tier 2 protocol
|
|
95
|
+
orchestration, Tier 3 reproducibility scaffolding); per Q8=C.
|
|
96
|
+
- `docs/api/<module>.md` — 22 per-module auto-gen stubs invoking
|
|
97
|
+
`::: eval_toolkit.<module>` mkdocstrings directives.
|
|
98
|
+
- `docs/javascripts/mathjax-config.js` — MathJax v3 init script
|
|
99
|
+
matching mkdocs-material's pymdownx.arithmatex (generic: true).
|
|
100
|
+
- `.github/workflows/docs.yml` deploys to GitHub Pages on every
|
|
101
|
+
push to main + every tag push. Single-version site (no `mike`,
|
|
102
|
+
per Q11=A).
|
|
103
|
+
- `[docs]` optional extra added to `pyproject.toml` listing
|
|
104
|
+
mkdocs-material, mkdocstrings[python], pymdown-extensions.
|
|
105
|
+
- `pyproject.urls.Documentation` repointed at the hosted-docs URL.
|
|
106
|
+
- README badge added: `Docs` linking to the GitHub Pages site.
|
|
107
|
+
- `.gitignore` extended to exclude the mkdocs build output (`/site/`).
|
|
108
|
+
- **Known follow-up**: 30+ relative-link warnings in
|
|
109
|
+
`docs/methodology/*.md` files (links to `../../src/...` and
|
|
110
|
+
`../../CHANGELOG.md`). Workflow temporarily runs without
|
|
111
|
+
`--strict`; Section E.2 will fix these and re-enable strict mode.
|
|
112
|
+
|
|
113
|
+
- Section D (public-repo polish from temporalcv-cross-pollination bundle):
|
|
114
|
+
added `SECURITY.md` (security disclosure policy with response SLAs,
|
|
115
|
+
scope, and reporter-credit policy); added `CITATION.cff` (machine-
|
|
116
|
+
readable academic citation metadata, exposing the GitHub web UI
|
|
117
|
+
"Cite this repository" button — methodology-relevant primary
|
|
118
|
+
references listed for `bootstrap_ci`, `brier_score`,
|
|
119
|
+
`fit_platt_calibrator`, `delong_roc_variance`, `PurgedKFoldSplitter`).
|
|
120
|
+
Added four trust-set badges to README (CI status, PyPI version,
|
|
121
|
+
Python ≥3.13, License MIT). Extended `pyproject.urls` with a
|
|
122
|
+
`Documentation` key pointing at `docs/getting-started.md` (the
|
|
123
|
+
hosted-docs URL replaces this in Section E.1). Module-docstring
|
|
124
|
+
audit across all 22 `src/eval_toolkit/*.py` modules — all already
|
|
125
|
+
carry adequate module-level docstrings; no patches needed.
|
|
126
|
+
|
|
127
|
+
- Section C (example gallery from temporalcv-cross-pollination bundle):
|
|
128
|
+
six new minimal worked examples in `docs/examples/`, each covering one
|
|
129
|
+
concept per file, Sybil-validated end-to-end in CI:
|
|
130
|
+
- `metrics_and_bootstrap.md` — `pr_auc` / `roc_auc` / `brier_score`
|
|
131
|
+
+ `bootstrap_ci` (BCa vs percentile)
|
|
132
|
+
- `evaluate_harness.md` — slice-aware `evaluate(...)` with two
|
|
133
|
+
scorers, `write_run_result(...)`, JSON schema validation
|
|
134
|
+
- `calibration.md` — Platt + isotonic recalibration, ECE before/after
|
|
135
|
+
- `leakage_detection.md` — `ExactDuplicateCheck` +
|
|
136
|
+
`NormalizedFormLeakageCheck` + `LabelConflictCheck` on a
|
|
137
|
+
contaminated train/test pair
|
|
138
|
+
- `claims_and_gates.md` — `EvidenceGate` composition (metric
|
|
139
|
+
threshold + minimum slice size) for release-decision gating
|
|
140
|
+
- `paired_comparison.md` — `paired_bootstrap_diff` for two-scorer
|
|
141
|
+
significance + `mde_from_ci` for power analysis
|
|
142
|
+
- `index.md` — examples landing page mapping each example to the
|
|
143
|
+
capability it demonstrates + the minimum extras required
|
|
144
|
+
Total: 28 sybil-validated code blocks. Each is the headline-import
|
|
145
|
+
→ usable-output minimum surface; together they cover the public API
|
|
146
|
+
surface a new user needs to be productive.
|
|
147
|
+
|
|
148
|
+
- Section B (PurgedKFold splitter from temporalcv-cross-pollination bundle):
|
|
149
|
+
`PurgedKFoldSplitter(n_splits, purge_gap, embargo_pct, time_col)` and a
|
|
150
|
+
standalone `compute_label_overlap(t_train, t_test, horizon)` helper, both
|
|
151
|
+
now public via `from eval_toolkit import ...`. Time-aware k-fold with
|
|
152
|
+
explicit purge gap straddling the test fold + post-test embargo —
|
|
153
|
+
prevents label-window leakage when labels have a forward horizon
|
|
154
|
+
(e.g., H-step forward returns). The standalone helper audits arbitrary
|
|
155
|
+
train/test overlap independent of the splitter. Adapted from López de
|
|
156
|
+
Prado (2018) Chapter 7 via temporalcv's `cv_financial.py`; API names
|
|
157
|
+
preserved verbatim for cross-library muscle memory. Public-API
|
|
158
|
+
drift-guard snapshot regenerated for the two new exports.
|
|
159
|
+
|
|
160
|
+
### Internal
|
|
161
|
+
|
|
162
|
+
- Section A (Monte Carlo bootstrap CI calibration, from temporalcv-cross-pollination
|
|
163
|
+
bundle): added `tests/test_bootstrap_calibration_mc.py` (slow-marker) that runs
|
|
164
|
+
500-replicate MC validation of `bootstrap_ci` coverage + bias across 5 cases
|
|
165
|
+
(pr_auc / roc_auc × balanced / imbalanced × n=200 / n=1000 × BCa / percentile
|
|
166
|
+
method). Asserts empirical coverage ∈ [0.90, 0.99] for nominal 95% CIs and
|
|
167
|
+
|bias| < 0.05. Complements Tier 1's golden tests: goldens pin exact numerical
|
|
168
|
+
output (drift detection), MC tests validate that the math is correct (a buggy
|
|
169
|
+
implementation producing self-consistent wrong values fails MC but passes
|
|
170
|
+
goldens). Also added CI width-scaling test (width should shrink as ~1/√n).
|
|
171
|
+
New workflow `.github/workflows/nightly-mc.yml` triggers this suite weekly — despite the "nightly" name —
|
|
172
|
+
on Sundays at 03:00 UTC (plus `workflow_dispatch` for manual runs). Harness
|
|
173
|
+
pattern adapted from temporalcv's `tests/conftest.py` Monte Carlo helpers.
|
|
174
|
+
|
|
175
|
+
- Test coverage (Tier 1 — math kernel correctness + integration backbone):
|
|
176
|
+
added end-to-end pipeline tests (`tests/test_pipeline_e2e.py`) that
|
|
177
|
+
exercise loader → `evaluate` → `write_run_result` → JSON schema
|
|
178
|
+
validation for `DataFrameLoader` and `SingleSliceLoader` (incl.
|
|
179
|
+
paired-diffs path). Extended `tests/test_metrics_props.py` with
|
|
180
|
+
Brier-score bounds + label/score inversion symmetry properties. Added
|
|
181
|
+
bootstrap CI golden tests (`tests/test_bootstrap_golden.py`, fixture at
|
|
182
|
+
`tests/golden/bootstrap_ci/cases.json`) pinning BCa/percentile output
|
|
183
|
+
on 6 canonical stress points (balanced, imbalanced 5%, small-n=10,
|
|
184
|
+
tied scores) to ±1e-9. Expanded the `golden` pytest marker doc.
|
|
185
|
+
|
|
186
|
+
- Test coverage (Tier 3 — resilience moat): added multi-slice
|
|
187
|
+
fault-injection tests (`tests/test_harness_fault_injection.py`) that
|
|
188
|
+
exercise `on_scorer_error="record"` across three slices where the
|
|
189
|
+
scorer succeeds on the middle one and fails on the outer two —
|
|
190
|
+
asserts per-(slice, scorer) independence (no error-state bleed) plus
|
|
191
|
+
a healthy-vs-faulting scorer parity check against a no-fault control.
|
|
192
|
+
Added exactness tests for `TargetFPRSelector`
|
|
193
|
+
(`tests/test_thresholds.py`): analytical answer on
|
|
194
|
+
perfectly-separable data plus a golden-style pinned threshold value
|
|
195
|
+
for a canonical (n=500, seed=42) overlapping distribution across
|
|
196
|
+
target FPRs 0.01 / 0.05 / 0.10 / 0.20, with a monotonicity invariant.
|
|
197
|
+
Added calibration determinism tests
|
|
198
|
+
(`tests/test_calibration_determinism.py`): same `(y, score)` produces
|
|
199
|
+
bit-identical Platt fit `a`/`b` parameters and isotonic transform
|
|
200
|
+
output across runs, parametrized over 1% / 50% / 99% positive
|
|
201
|
+
prevalence. Added NaN/+inf/-inf rejection tests for `pr_auc`,
|
|
202
|
+
`roc_auc`, `brier_score` to `tests/test_metrics_props.py` —
|
|
203
|
+
parametrized; locks the input-validation contract.
|
|
204
|
+
|
|
205
|
+
- Test coverage (Tier 2 — public-contract + integration breadth):
|
|
206
|
+
added public-API drift guard (`tests/test_public_api.py`, fixture at
|
|
207
|
+
`tests/golden/public_api/snapshot.json`) that snapshots all 199 names
|
|
208
|
+
in `eval_toolkit.__all__` with signatures, class bases, first
|
|
209
|
+
docstring lines, and primitive-value summaries. Drift now requires an
|
|
210
|
+
explicit golden-regeneration commit. Extended
|
|
211
|
+
`tests/test_pipeline_e2e.py` with `ParquetGlobLoader` round-trip
|
|
212
|
+
(synthetic parquet → glob → load → evaluate → schema-validate; gated
|
|
213
|
+
on `pyarrow`). Extended `tests/test_artifacts.py` with four manifest
|
|
214
|
+
v2↔v3 dispatcher tests: v3 well-formed accepted; v3 missing
|
|
215
|
+
`contamination_flags` rejected; v3 with unknown enum value rejected;
|
|
216
|
+
v2 payloads still routed to v2 schema (no eager v3 demotion).
|
|
217
|
+
|
|
10
218
|
## [0.27.2] — 2026-05-15 — fix base-install pandas import
|
|
11
219
|
|
|
12
220
|
Base install of `eval-toolkit` (no extras) was broken in 0.27.1: every
|
|
@@ -1,8 +1,9 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: eval-toolkit
|
|
3
|
-
Version: 0.27.2
|
|
3
|
+
Version: 0.28.1
|
|
4
4
|
Summary: Reusable evaluation contracts for binary classification: metrics, bootstrap CIs, calibration, artifacts, and evidence gates.
|
|
5
5
|
Project-URL: Homepage, https://github.com/brandon-behring/eval-toolkit
|
|
6
|
+
Project-URL: Documentation, https://brandon-behring.github.io/eval-toolkit/
|
|
6
7
|
Project-URL: Repository, https://github.com/brandon-behring/eval-toolkit.git
|
|
7
8
|
Project-URL: Issues, https://github.com/brandon-behring/eval-toolkit/issues
|
|
8
9
|
Project-URL: Changelog, https://github.com/brandon-behring/eval-toolkit/blob/main/CHANGELOG.md
|
|
@@ -49,6 +50,10 @@ Requires-Dist: pytest>=8.0; extra == 'dev'
|
|
|
49
50
|
Requires-Dist: pyyaml>=6.0; extra == 'dev'
|
|
50
51
|
Requires-Dist: ruff>=0.5; extra == 'dev'
|
|
51
52
|
Requires-Dist: sybil>=10.0; extra == 'dev'
|
|
53
|
+
Provides-Extra: docs
|
|
54
|
+
Requires-Dist: mkdocs-material>=9.5; extra == 'docs'
|
|
55
|
+
Requires-Dist: mkdocstrings[python]>=0.24; extra == 'docs'
|
|
56
|
+
Requires-Dist: pymdown-extensions>=10.7; extra == 'docs'
|
|
52
57
|
Provides-Extra: parquet
|
|
53
58
|
Requires-Dist: pyarrow>=15.0; extra == 'parquet'
|
|
54
59
|
Provides-Extra: plotting
|
|
@@ -63,6 +68,12 @@ Description-Content-Type: text/markdown
|
|
|
63
68
|
|
|
64
69
|
# eval-toolkit
|
|
65
70
|
|
|
71
|
+
[CI status](https://github.com/brandon-behring/eval-toolkit/actions/workflows/ci.yml)
|
|
72
|
+
[Docs](https://brandon-behring.github.io/eval-toolkit/)
|
|
73
|
+
[PyPI version](https://pypi.org/project/eval-toolkit/)
|
|
74
|
+
[Python ≥3.13](https://pypi.org/project/eval-toolkit/)
|
|
75
|
+
[License: MIT](LICENSE)
|
|
76
|
+
|
|
66
77
|
A **methodology-aware evaluation harness for binary classification**:
|
|
67
78
|
metrics, bootstrap CIs, calibration, leakage detection, splitting,
|
|
68
79
|
threshold selection, dataset loading, reproducibility manifests, and a
|
|
@@ -1,5 +1,11 @@
|
|
|
1
1
|
# eval-toolkit
|
|
2
2
|
|
|
3
|
+
[CI status](https://github.com/brandon-behring/eval-toolkit/actions/workflows/ci.yml)
|
|
4
|
+
[Docs](https://brandon-behring.github.io/eval-toolkit/)
|
|
5
|
+
[PyPI version](https://pypi.org/project/eval-toolkit/)
|
|
6
|
+
[Python ≥3.13](https://pypi.org/project/eval-toolkit/)
|
|
7
|
+
[License: MIT](LICENSE)
|
|
8
|
+
|
|
3
9
|
A **methodology-aware evaluation harness for binary classification**:
|
|
4
10
|
metrics, bootstrap CIs, calibration, leakage detection, splitting,
|
|
5
11
|
threshold selection, dataset loading, reproducibility manifests, and a
|
|
@@ -54,6 +54,15 @@ parquet = ["pyarrow>=15.0"]
|
|
|
54
54
|
# resolve cleanly; jsonschema is now in the base deps. Remove in v0.17.0
|
|
55
55
|
# after a deprecation window if no downstream consumer complains.
|
|
56
56
|
validation = []
|
|
57
|
+
# v0.28.0 (Section E.1): docs site via mkdocs-material + mkdocstrings.
|
|
58
|
+
# Per Q4=B in the temporalcv-cross-pollination plan; LaTeX via MathJax v3
|
|
59
|
+
# (configured in docs/javascripts/mathjax-config.js) and tikzjax loaded
|
|
60
|
+
# from CDN in mkdocs.yml.
|
|
61
|
+
docs = [
|
|
62
|
+
"mkdocs-material>=9.5",
|
|
63
|
+
"mkdocstrings[python]>=0.24",
|
|
64
|
+
"pymdown-extensions>=10.7",
|
|
65
|
+
]
|
|
57
66
|
# `all` references the sub-extras directly (PEP 685 self-reference). This
|
|
58
67
|
# avoids drift: adding a dep to e.g. `dataframe` no longer requires a
|
|
59
68
|
# mirroring edit here.
|
|
@@ -72,6 +81,7 @@ dev = [
|
|
|
72
81
|
|
|
73
82
|
[project.urls]
|
|
74
83
|
Homepage = "https://github.com/brandon-behring/eval-toolkit"
|
|
84
|
+
Documentation = "https://brandon-behring.github.io/eval-toolkit/"
|
|
75
85
|
Repository = "https://github.com/brandon-behring/eval-toolkit.git"
|
|
76
86
|
Issues = "https://github.com/brandon-behring/eval-toolkit/issues"
|
|
77
87
|
Changelog = "https://github.com/brandon-behring/eval-toolkit/blob/main/CHANGELOG.md"
|
|
@@ -146,8 +156,9 @@ markers = [
|
|
|
146
156
|
"unit: Sklearn-reference and analytical correctness tests",
|
|
147
157
|
"property: Hypothesis property-based invariant tests",
|
|
148
158
|
"smoke: End-to-end smoke tests",
|
|
149
|
-
"golden: Snapshot tests for deterministic outputs (docs.
|
|
159
|
+
"golden: Snapshot tests for deterministic outputs (docs renderer, bootstrap CI numerical pins, public API surface).",
|
|
150
160
|
"slow: Tests > 2s (bootstrap-t studentized, multi-seed K-fold). Opt out with `pytest -m 'not slow'`.",
|
|
161
|
+
"monte_carlo: Monte Carlo calibration suite (~14 min). Skipped in PR CI; runs only in the nightly-mc workflow via `-m monte_carlo`.",
|
|
151
162
|
]
|
|
152
163
|
|
|
153
164
|
[tool.coverage.run]
|
|
@@ -183,10 +183,12 @@ _EXPORTS: dict[str, str] = {
|
|
|
183
183
|
"GroupKFoldSplitter": "eval_toolkit.splits",
|
|
184
184
|
"HoldoutSplitter": "eval_toolkit.splits",
|
|
185
185
|
"PoolBuilder": "eval_toolkit.splits",
|
|
186
|
+
"PurgedKFoldSplitter": "eval_toolkit.splits",
|
|
186
187
|
"SourceDisjointKFoldSplitter": "eval_toolkit.splits",
|
|
187
188
|
"Splitter": "eval_toolkit.splits",
|
|
188
189
|
"StratifiedKFoldSplitter": "eval_toolkit.splits",
|
|
189
190
|
"TimeSeriesSplitter": "eval_toolkit.splits",
|
|
191
|
+
"compute_label_overlap": "eval_toolkit.splits",
|
|
190
192
|
"iter_folds_with_pool": "eval_toolkit.splits",
|
|
191
193
|
"DEFAULT_DEDUP_THRESHOLD": "eval_toolkit.text_dedup",
|
|
192
194
|
"DedupReport": "eval_toolkit.text_dedup",
|
|
@@ -185,9 +185,11 @@ class RunResult:
|
|
|
185
185
|
JSON schema version. ``"v1"`` for v0.7.0+; downstream parsers gate
|
|
186
186
|
on this.
|
|
187
187
|
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
188
|
+
Notes
|
|
189
|
+
-----
|
|
190
|
+
Changed in 0.7.0: added ``by_fold``, ``fold_summary``,
|
|
191
|
+
``schema_version`` (additive, defaults empty / ``"v1"`` — backward
|
|
192
|
+
compatible).
|
|
191
193
|
"""
|
|
192
194
|
|
|
193
195
|
run_id: str
|
|
@@ -36,10 +36,12 @@ __all__ = [
|
|
|
36
36
|
"GroupKFoldSplitter",
|
|
37
37
|
"HoldoutSplitter",
|
|
38
38
|
"PoolBuilder",
|
|
39
|
+
"PurgedKFoldSplitter",
|
|
39
40
|
"SourceDisjointKFoldSplitter",
|
|
40
41
|
"Splitter",
|
|
41
42
|
"StratifiedKFoldSplitter",
|
|
42
43
|
"TimeSeriesSplitter",
|
|
44
|
+
"compute_label_overlap",
|
|
43
45
|
"iter_folds_with_pool",
|
|
44
46
|
]
|
|
45
47
|
|
|
@@ -513,3 +515,283 @@ def iter_folds_with_pool(
|
|
|
513
515
|
# PoolBuilder's keys (train, val, possibly more) take precedence;
|
|
514
516
|
# test is reattached from the Splitter.
|
|
515
517
|
yield {**built, "test": test}
|
|
518
|
+
|
|
519
|
+
|
|
520
|
+
# ---------------------------------------------------------------------------
|
|
521
|
+
# Purged K-fold for label-overlap protection (v0.28.0)
|
|
522
|
+
#
|
|
523
|
+
# Adapted from temporalcv (Behring 2026) for the financial / forecasting
|
|
524
|
+
# label-overlap case: when labels use future data (e.g., H-day forward
|
|
525
|
+
# returns), train and test folds can overlap in their LABEL windows even
|
|
526
|
+
# when their FEATURE windows don't. Purging drops a band of training
|
|
527
|
+
# samples within ``purge_gap`` of each test fold; embargo drops an
|
|
528
|
+
# additional fraction of n samples immediately after each test fold.
|
|
529
|
+
# ---------------------------------------------------------------------------
|
|
530
|
+
|
|
531
|
+
|
|
532
|
+
def compute_label_overlap(
    t_train: np.ndarray,
    t_test: np.ndarray,
    horizon: int,
) -> np.ndarray:
    r"""Boolean ``(n_train, n_test)`` matrix: True where label windows overlap.

    For h-step forward labels, the label at time ``t`` depends on the data
    at times ``[t, t+h)`` (half-open). Two samples ``t_train[i]`` and
    ``t_test[j]`` have label-window overlap if their windows share at least
    one time point — equivalently, if ``|t_train[i] - t_test[j]| < horizon``.

    Use this to audit whether a given train/test split has any label
    leakage. Standalone helper; does NOT require a particular splitter.

    Parameters
    ----------
    t_train : np.ndarray, shape (n_train,)
        Time indices of the training set (any sortable numeric type,
        including unsigned integers).
    t_test : np.ndarray, shape (n_test,)
        Time indices of the test set.
    horizon : int
        Label horizon (e.g., ``5`` for 5-step forward returns). Must be
        non-negative; ``horizon=0`` means no overlap is possible.

    Returns
    -------
    np.ndarray, shape (n_train, n_test), dtype bool
        Entry ``(i, j)`` is ``True`` iff
        ``|t_train[i] - t_test[j]| < horizon``.

    Raises
    ------
    ValueError
        If ``horizon`` is negative.

    Examples
    --------
    >>> import numpy as np
    >>> t_train = np.array([0, 1, 5, 6])
    >>> t_test = np.array([3, 4])
    >>> overlap = compute_label_overlap(t_train, t_test, horizon=3)
    >>> overlap
    array([[False, False],
           [ True, False],
           [ True,  True],
           [False,  True]])
    >>> # Sample 0 (t=0): no overlap with test (|0-3|=3, |0-4|=4 ≥ horizon)
    >>> # Sample 1 (t=1): overlaps test[0]=3 (|1-3|=2 < 3)
    >>> # Sample 2 (t=5): overlaps both (|5-3|=2, |5-4|=1)
    >>> # Sample 3 (t=6): overlaps test[1]=4 (|6-4|=2 < 3)

    Notes
    -----
    The check is **symmetric in time**: ``|t_train - t_test| < horizon``
    treats overlap in either temporal direction equally. For strictly
    forward-only label overlap (train-before-test), filter the result
    with ``(t_test[None, :] - t_train[:, None]) > 0``.

    For h-step forward labels: label at time t covers ``[t, t+h)``, so
    two labels at times ``t1, t2`` share data iff their intervals
    overlap, which holds iff ``|t1 - t2| < h``.

    The distance is computed as ``max - min`` rather than ``abs(a - b)``
    so that unsigned integer time indices cannot wrap around and hide a
    genuine overlap.

    References
    ----------
    .. [1] López de Prado, M. (2018). "Advances in Financial Machine
       Learning." Wiley. Chapter 7: Cross-Validation in Finance.
    """
    if horizon < 0:
        raise ValueError(f"horizon must be >= 0, got {horizon}")
    # Coerce first so both code paths accept exactly the same inputs.
    t_train_arr = np.asarray(t_train)
    t_test_arr = np.asarray(t_test)
    if horizon == 0:
        return np.zeros((len(t_train_arr), len(t_test_arr)), dtype=bool)
    # Broadcast to (n_train, n_test).
    rows = t_train_arr[:, None]
    cols = t_test_arr[None, :]
    # max - min == |a - b| but never goes negative, so unsigned dtypes are safe.
    dist = np.maximum(rows, cols) - np.minimum(rows, cols)
    overlap: np.ndarray = dist < horizon
    return overlap
|
|
610
|
+
|
|
611
|
+
|
|
612
|
+
def _apply_purge_embargo(
    test_idx: np.ndarray,
    n_samples: int,
    purge_gap: int,
    embargo_pct: float,
) -> np.ndarray:
    """Return the training positions that survive test, purge and embargo removal.

    Positions ``0 .. n_samples - 1`` are considered. Two ranges are removed:

    * the span from ``min(test_idx) - purge_gap`` through
      ``max(test_idx) + purge_gap`` inclusive (test fold plus a purge band
      on both sides), clipped to the valid range;
    * ``int(embargo_pct * n_samples)`` positions starting immediately after
      ``max(test_idx)`` (one-sided, post-test only), also clipped.

    Both ranges start counting from the end of the test fold, so the
    embargo window overlaps the post-test purge band instead of extending
    it: the number of rows removed after the test fold is the larger of
    ``purge_gap`` and the embargo length, not their sum.

    Only the minimum and maximum of ``test_idx`` are used, i.e. the test
    fold is treated as one contiguous block.

    Parameters
    ----------
    test_idx : np.ndarray
        Positional indices of the test fold (must be non-empty).
    n_samples : int
        Total number of rows being split.
    purge_gap : int
        Rows to drop on each side of the test block.
    embargo_pct : float
        Fraction of ``n_samples`` to drop after the test block.

    Returns
    -------
    np.ndarray
        Sorted positional indices kept for training.
    """
    first_test = int(np.min(test_idx))
    after_test = int(np.max(test_idx)) + 1  # first position past the test block

    excluded = np.zeros(n_samples, dtype=bool)

    # Test block widened by ``purge_gap`` on either side, clipped to [0, n_samples).
    purge_lo = max(0, first_test - purge_gap)
    purge_hi = min(n_samples, after_test + purge_gap)
    excluded[purge_lo:purge_hi] = True

    # Post-test embargo; int() truncates the fractional row count.
    embargo_len = int(embargo_pct * n_samples)
    embargo_hi = min(n_samples, after_test + embargo_len)
    excluded[after_test:embargo_hi] = True

    return np.arange(n_samples)[~excluded]
|
|
643
|
+
|
|
644
|
+
|
|
645
|
+
@dataclass(frozen=True, slots=True)
|
|
646
|
+
class PurgedKFoldSplitter:
|
|
647
|
+
r"""Time-aware k-fold with explicit purge gap + post-test embargo.
|
|
648
|
+
|
|
649
|
+
Pattern from López de Prado (2018) Ch. 7: when labels have a forward
|
|
650
|
+
lookahead (e.g., H-step returns), train and test folds can overlap in
|
|
651
|
+
their **label windows** even when their **feature windows** don't.
|
|
652
|
+
Standard k-fold leaks information through this overlap. PurgedKFold
|
|
653
|
+
drops a ``purge_gap``-sample band straddling each test fold's boundary
|
|
654
|
+
plus a post-test ``embargo_pct * n`` window — preventing both
|
|
655
|
+
backward and forward label-overlap leakage.
|
|
656
|
+
|
|
657
|
+
Implements the :class:`Splitter` Protocol, yielding
|
|
658
|
+
``{"train": EvalSlice, "test": EvalSlice}`` dicts.
|
|
659
|
+
|
|
660
|
+
Parameters
|
|
661
|
+
----------
|
|
662
|
+
n_splits : int, optional
|
|
663
|
+
Number of folds. Default 5. Must be ≥ 2.
|
|
664
|
+
purge_gap : int, optional
|
|
665
|
+
Samples to drop on each side of every test fold's boundary.
|
|
666
|
+
Default 0 (no purging — equivalent to vanilla TimeSeriesSplit).
|
|
667
|
+
For h-step forward labels, ``purge_gap=h`` is the canonical choice.
|
|
668
|
+
embargo_pct : float, optional
|
|
669
|
+
Additional embargo as a fraction of total ``n``, applied **after**
|
|
670
|
+
each test fold (one-sided, López de Prado convention). Default
|
|
671
|
+
0.0. Typical: 0.01 (1%).
|
|
672
|
+
time_col : str or None, optional
|
|
673
|
+
Column carrying a sortable timestamp. If set, the parent slice is
|
|
674
|
+
sorted by this column before splitting. ``None`` assumes the slice
|
|
675
|
+
is already in temporal order. Default ``"timestamp"``.
|
|
676
|
+
|
|
677
|
+
Raises
|
|
678
|
+
------
|
|
679
|
+
ValueError
|
|
680
|
+
At construction time if ``n_splits < 2`` or ``purge_gap < 0`` or
|
|
681
|
+
``embargo_pct ∉ [0, 1)``.
|
|
682
|
+
KeyError
|
|
683
|
+
At ``iter_folds`` time if ``time_col`` is set but not present in
|
|
684
|
+
the slice DataFrame.
|
|
685
|
+
|
|
686
|
+
Examples
|
|
687
|
+
--------
|
|
688
|
+
>>> import pandas as pd
|
|
689
|
+
>>> from eval_toolkit.harness import EvalSlice
|
|
690
|
+
>>> from eval_toolkit.splits import PurgedKFoldSplitter
|
|
691
|
+
>>> df = pd.DataFrame({
|
|
692
|
+
... "text": [f"row{i}" for i in range(50)],
|
|
693
|
+
... "label": [i % 2 for i in range(50)],
|
|
694
|
+
... "t": list(range(50)),
|
|
695
|
+
... })
|
|
696
|
+
>>> parent = EvalSlice(name="all", df=df)
|
|
697
|
+
>>> spl = PurgedKFoldSplitter(n_splits=5, purge_gap=2, embargo_pct=0.02, time_col="t")
|
|
698
|
+
>>> folds = list(spl.iter_folds(parent))
|
|
699
|
+
>>> len(folds)
|
|
700
|
+
5
|
|
701
|
+
>>> sorted(folds[0].keys())
|
|
702
|
+
['test', 'train']
|
|
703
|
+
|
|
704
|
+
Notes
|
|
705
|
+
-----
|
|
706
|
+
**Two units in one signature**: ``purge_gap`` is an absolute count of
|
|
707
|
+
samples (int) while ``embargo_pct`` is a fraction (float). This
|
|
708
|
+
mirrors López de Prado / temporalcv conventions verbatim — users
|
|
709
|
+
moving between libraries see the same parameter names. Use the
|
|
710
|
+
standalone helper :func:`compute_label_overlap` to size ``purge_gap``
|
|
711
|
+
for a known label horizon.
|
|
712
|
+
|
|
713
|
+
See Also
|
|
714
|
+
--------
|
|
715
|
+
eval_toolkit.splits.compute_label_overlap :
|
|
716
|
+
Audit label-window overlap between arbitrary train/test sets.
|
|
717
|
+
eval_toolkit.splits.TimeSeriesSplitter :
|
|
718
|
+
Time-aware k-fold without purging — use when labels have no
|
|
719
|
+
lookahead horizon.
|
|
720
|
+
|
|
721
|
+
References
|
|
722
|
+
----------
|
|
723
|
+
.. [1] López de Prado, M. (2018). "Advances in Financial Machine
|
|
724
|
+
Learning." Wiley. Chapter 7.
|
|
725
|
+
"""
|
|
726
|
+
|
|
727
|
+
n_splits: int = 5
|
|
728
|
+
purge_gap: int = 0
|
|
729
|
+
embargo_pct: float = 0.0
|
|
730
|
+
time_col: str | None = "timestamp"
|
|
731
|
+
|
|
732
|
+
def __post_init__(self) -> None:
|
|
733
|
+
"""Validate parameters."""
|
|
734
|
+
if self.n_splits < 2:
|
|
735
|
+
raise ValueError(f"n_splits must be >= 2, got {self.n_splits}")
|
|
736
|
+
if self.purge_gap < 0:
|
|
737
|
+
raise ValueError(f"purge_gap must be >= 0, got {self.purge_gap}")
|
|
738
|
+
if not 0.0 <= self.embargo_pct < 1.0:
|
|
739
|
+
raise ValueError(f"embargo_pct must be in [0, 1), got {self.embargo_pct}")
|
|
740
|
+
|
|
741
|
+
def iter_folds(
|
|
742
|
+
self,
|
|
743
|
+
slice_: EvalSlice,
|
|
744
|
+
*,
|
|
745
|
+
groups: np.ndarray | None = None,
|
|
746
|
+
) -> Iterator[dict[str, EvalSlice]]:
|
|
747
|
+
"""Yield ``n_splits`` fold dicts with purge + embargo applied.
|
|
748
|
+
|
|
749
|
+
Raises
|
|
750
|
+
------
|
|
751
|
+
KeyError
|
|
752
|
+
If ``self.time_col`` is set but not present in ``slice_.df``.
|
|
753
|
+
"""
|
|
754
|
+
if self.time_col is not None:
|
|
755
|
+
if self.time_col not in slice_.df.columns:
|
|
756
|
+
raise KeyError(
|
|
757
|
+
f"time_col {self.time_col!r} not in slice columns " f"{list(slice_.df.columns)}"
|
|
758
|
+
)
|
|
759
|
+
sorted_df = slice_.df.sort_values(self.time_col).reset_index(drop=True)
|
|
760
|
+
sorted_slice = EvalSlice(
|
|
761
|
+
name=slice_.name,
|
|
762
|
+
df=sorted_df,
|
|
763
|
+
description=slice_.description,
|
|
764
|
+
feature_col=slice_.feature_col,
|
|
765
|
+
label_col=slice_.label_col,
|
|
766
|
+
strata_col=slice_.strata_col,
|
|
767
|
+
)
|
|
768
|
+
else:
|
|
769
|
+
sorted_slice = slice_
|
|
770
|
+
|
|
771
|
+
n_samples = len(sorted_slice.df)
|
|
772
|
+
if self.n_splits >= n_samples:
|
|
773
|
+
raise ValueError(f"n_splits ({self.n_splits}) must be < n_samples ({n_samples})")
|
|
774
|
+
|
|
775
|
+
# Fold sizes (mirrors TimeSeriesSplit / temporalcv: trailing folds
|
|
776
|
+
# absorb the remainder)
|
|
777
|
+
fold_sizes = np.full(self.n_splits, n_samples // self.n_splits)
|
|
778
|
+
fold_sizes[: n_samples % self.n_splits] += 1
|
|
779
|
+
|
|
780
|
+
current = 0
|
|
781
|
+
for fold_size in fold_sizes:
|
|
782
|
+
test_idx = np.arange(current, current + fold_size)
|
|
783
|
+
train_idx = _apply_purge_embargo(
|
|
784
|
+
test_idx,
|
|
785
|
+
n_samples=n_samples,
|
|
786
|
+
purge_gap=self.purge_gap,
|
|
787
|
+
embargo_pct=self.embargo_pct,
|
|
788
|
+
)
|
|
789
|
+
yield {
|
|
790
|
+
"train": _slice_subset(sorted_slice, train_idx, "train"),
|
|
791
|
+
"test": _slice_subset(sorted_slice, test_idx, "test"),
|
|
792
|
+
}
|
|
793
|
+
current += fold_size
|
|
794
|
+
|
|
795
|
+
def get_n_splits(self, slice_: EvalSlice) -> int:
|
|
796
|
+
"""Return ``self.n_splits``."""
|
|
797
|
+
return self.n_splits
|