eval-toolkit 0.27.2__tar.gz → 0.28.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (134) hide show
  1. {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/.gitignore +3 -0
  2. {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/CHANGELOG.md +208 -0
  3. {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/PKG-INFO +12 -1
  4. {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/README.md +6 -0
  5. {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/pyproject.toml +12 -1
  6. {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/src/eval_toolkit/__init__.py +2 -0
  7. {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/src/eval_toolkit/_version.py +1 -1
  8. {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/src/eval_toolkit/harness.py +5 -3
  9. {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/src/eval_toolkit/splits.py +282 -0
  10. eval_toolkit-0.28.1/tests/golden/bootstrap_ci/cases.json +50 -0
  11. eval_toolkit-0.28.1/tests/golden/public_api/snapshot.json +1513 -0
  12. {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/tests/test_artifacts.py +91 -0
  13. eval_toolkit-0.28.1/tests/test_bootstrap_calibration_mc.py +306 -0
  14. eval_toolkit-0.28.1/tests/test_bootstrap_golden.py +215 -0
  15. eval_toolkit-0.28.1/tests/test_calibration_determinism.py +114 -0
  16. eval_toolkit-0.28.1/tests/test_harness_fault_injection.py +179 -0
  17. {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/tests/test_metrics_props.py +99 -0
  18. eval_toolkit-0.28.1/tests/test_pipeline_e2e.py +258 -0
  19. eval_toolkit-0.28.1/tests/test_public_api.py +186 -0
  20. eval_toolkit-0.28.1/tests/test_splits.py +365 -0
  21. eval_toolkit-0.28.1/tests/test_thresholds.py +265 -0
  22. eval_toolkit-0.27.2/tests/test_splits.py +0 -162
  23. eval_toolkit-0.27.2/tests/test_thresholds.py +0 -155
  24. {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/LICENSE +0 -0
  25. {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/STYLE.md +0 -0
  26. {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/docs/methodology/README.md +0 -0
  27. {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/docs/research/README.md +0 -0
  28. {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/docs/research/datasets/README.md +0 -0
  29. {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/docs/research/papers/data-integrity/README.md +0 -0
  30. {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/docs/research/papers/eval-ecosystem/README.md +0 -0
  31. {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/docs/research/papers/inference/README.md +0 -0
  32. {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/docs/research/papers/prompt-injection/README.md +0 -0
  33. {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/src/eval_toolkit/__main__.py +0 -0
  34. {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/src/eval_toolkit/analysis.py +0 -0
  35. {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/src/eval_toolkit/artifacts.py +0 -0
  36. {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/src/eval_toolkit/bootstrap.py +0 -0
  37. {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/src/eval_toolkit/calibration.py +0 -0
  38. {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/src/eval_toolkit/claims.py +0 -0
  39. {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/src/eval_toolkit/config.py +0 -0
  40. {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/src/eval_toolkit/docs.py +0 -0
  41. {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/src/eval_toolkit/evidence.py +0 -0
  42. {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/src/eval_toolkit/leakage.py +0 -0
  43. {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/src/eval_toolkit/loaders.py +0 -0
  44. {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/src/eval_toolkit/manifest.py +0 -0
  45. {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/src/eval_toolkit/metrics.py +0 -0
  46. {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/src/eval_toolkit/operating_points.py +0 -0
  47. {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/src/eval_toolkit/paths.py +0 -0
  48. {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/src/eval_toolkit/plotting.py +0 -0
  49. {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/src/eval_toolkit/protocols.py +0 -0
  50. {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/src/eval_toolkit/provenance.py +0 -0
  51. {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/src/eval_toolkit/py.typed +0 -0
  52. {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/src/eval_toolkit/schemas/manifest.v1.json +0 -0
  53. {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/src/eval_toolkit/schemas/manifest.v2.json +0 -0
  54. {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/src/eval_toolkit/schemas/manifest.v3.json +0 -0
  55. {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/src/eval_toolkit/schemas/results.v1.json +0 -0
  56. {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/src/eval_toolkit/schemas/results_full.v1.json +0 -0
  57. {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/src/eval_toolkit/seeds.py +0 -0
  58. {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/src/eval_toolkit/text_dedup.py +0 -0
  59. {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/src/eval_toolkit/thresholds.py +0 -0
  60. {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/tests/baseline/test_plotting_visual/plot_bootstrap_distribution.png +0 -0
  61. {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/tests/baseline/test_plotting_visual/plot_confusion_matrix_grid.png +0 -0
  62. {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/tests/baseline/test_plotting_visual/plot_lift_ci.png +0 -0
  63. {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/tests/baseline/test_plotting_visual/plot_metric_bars.png +0 -0
  64. {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/tests/baseline/test_plotting_visual/plot_pr_curve.png +0 -0
  65. {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/tests/baseline/test_plotting_visual/plot_reliability_diagram.png +0 -0
  66. {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/tests/baseline/test_plotting_visual/plot_score_histograms.png +0 -0
  67. {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/tests/conftest.py +0 -0
  68. {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/tests/golden/docs/expected.md +0 -0
  69. {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/tests/golden/docs/input.md +0 -0
  70. {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/tests/golden/docs/metrics.json +0 -0
  71. {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/tests/strategies.py +0 -0
  72. {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/tests/test_analysis.py +0 -0
  73. {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/tests/test_bootstrap_edge_cases.py +0 -0
  74. {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/tests/test_bootstrap_props.py +0 -0
  75. {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/tests/test_bootstrap_research_grounded.py +0 -0
  76. {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/tests/test_bootstrap_unit.py +0 -0
  77. {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/tests/test_calibration_bootstrap_chain.py +0 -0
  78. {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/tests/test_calibration_optimization_failures.py +0 -0
  79. {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/tests/test_calibration_props.py +0 -0
  80. {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/tests/test_calibration_research_grounded.py +0 -0
  81. {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/tests/test_calibration_unit.py +0 -0
  82. {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/tests/test_claims.py +0 -0
  83. {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/tests/test_claims_coverage.py +0 -0
  84. {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/tests/test_claims_props.py +0 -0
  85. {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/tests/test_cli.py +0 -0
  86. {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/tests/test_config.py +0 -0
  87. {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/tests/test_coverage_gap.py +0 -0
  88. {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/tests/test_dedup_split_leakage_chain.py +0 -0
  89. {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/tests/test_docs_golden.py +0 -0
  90. {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/tests/test_docs_props.py +0 -0
  91. {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/tests/test_evidence_validators.py +0 -0
  92. {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/tests/test_harness_edge_cases.py +0 -0
  93. {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/tests/test_harness_internals.py +0 -0
  94. {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/tests/test_harness_smoke.py +0 -0
  95. {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/tests/test_harness_v07.py +0 -0
  96. {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/tests/test_harness_v22.py +0 -0
  97. {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/tests/test_import_boundaries.py +0 -0
  98. {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/tests/test_leakage.py +0 -0
  99. {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/tests/test_leakage_error_paths.py +0 -0
  100. {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/tests/test_leakage_props.py +0 -0
  101. {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/tests/test_loaders.py +0 -0
  102. {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/tests/test_loaders_coverage.py +0 -0
  103. {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/tests/test_loaders_props.py +0 -0
  104. {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/tests/test_manifest.py +0 -0
  105. {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/tests/test_manifest_contamination_round_trip.py +0 -0
  106. {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/tests/test_manifest_props.py +0 -0
  107. {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/tests/test_manifest_validation.py +0 -0
  108. {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/tests/test_metrics_stratified_subsets.py +0 -0
  109. {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/tests/test_metrics_unit.py +0 -0
  110. {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/tests/test_misc_coverage.py +0 -0
  111. {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/tests/test_numeric_edge_cases.py +0 -0
  112. {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/tests/test_operating_points.py +0 -0
  113. {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/tests/test_operating_points_props.py +0 -0
  114. {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/tests/test_paths.py +0 -0
  115. {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/tests/test_plotting_edge.py +0 -0
  116. {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/tests/test_plotting_smoke.py +0 -0
  117. {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/tests/test_plotting_visual.py +0 -0
  118. {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/tests/test_protocol_conformance.py +0 -0
  119. {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/tests/test_provenance.py +0 -0
  120. {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/tests/test_reference_equivalence.py +0 -0
  121. {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/tests/test_reproducibility_integration.py +0 -0
  122. {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/tests/test_schemas.py +0 -0
  123. {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/tests/test_seeds.py +0 -0
  124. {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/tests/test_splits_leakage_integration.py +0 -0
  125. {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/tests/test_splits_props.py +0 -0
  126. {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/tests/test_text_dedup.py +0 -0
  127. {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/tests/test_text_dedup_coverage.py +0 -0
  128. {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/tests/test_text_dedup_props.py +0 -0
  129. {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/tests/test_text_dedup_strategies.py +0 -0
  130. {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/tests/test_thresholds_constant_score.py +0 -0
  131. {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/tests/test_thresholds_coverage.py +0 -0
  132. {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/tests/test_thresholds_props.py +0 -0
  133. {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/tests/test_thresholds_research_grounded.py +0 -0
  134. {eval_toolkit-0.27.2 → eval_toolkit-0.28.1}/tests/test_v09_contracts.py +0 -0
@@ -40,3 +40,6 @@ coverage.xml
40
40
 
41
41
  # Claude Code project settings (machine-local)
42
42
  .claude/
43
+
44
+ # mkdocs build output (Section E.1 v0.28.0)
45
+ /site/
@@ -7,6 +7,214 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
7
7
 
8
8
  ## [Unreleased]
9
9
 
10
+ ## [0.28.1] — 2026-05-15 — security-patch (CodeQL + pip-audit)
11
+
12
+ Tier α of the post-v0.28.0 best-practice gap audit. Pure CI/security
13
+ infrastructure additions; zero source-code or behavior changes.
14
+
15
+ ### Added
16
+
17
+ - `.github/workflows/codeql.yml`: GitHub's CodeQL static analyzer
18
+ on push/PR/weekly cron (Sundays 04:00 UTC). Uses the
19
+ `security-extended` query suite. Findings populate the repo's
20
+ Security → Code scanning tab.
21
+ - pip-audit step in the existing `test-base-install` CI job:
22
+ scans the runtime-only venv (`numpy` / `scipy` / `scikit-learn` /
23
+ `jsonschema`) for known CVEs on every PR. Fails CI on any finding.
24
+ Dev-extras vulns (pytest, hypothesis, etc.) are not gated —
25
+ surfaced through Dependabot. Per the v0.28.1 plan Q3=C
26
+ (runtime-deps-only gate).
27
+
28
+ ### Internal
29
+
30
+ - Audit discovered that `mypy --strict --no-implicit-reexport src/`
31
+ already passes with zero issues on the v0.28.0 source. The
32
+ planned Tier α #3 "chase remaining Any leaks" task was a no-op —
33
+ no commit shipped for it.
34
+ - pip-audit on current runtime deps: zero known vulnerabilities.
35
+
36
+ ## [0.28.0] — 2026-05-15 — temporalcv cross-pollination bundle
37
+
38
+ Six-section bundle adopting the highest-value patterns from the
39
+ sibling `temporalcv` project plus public-repo polish + hosted docs.
40
+ Major additions: `PurgedKFoldSplitter` for label-overlap-protected
41
+ cross-validation, nightly Monte Carlo bootstrap CI calibration
42
+ testing, 6-example documentation gallery, a hosted mkdocs-material
43
+ docs site with MathJax + tikzjax for full LaTeX + TikZ rendering,
44
+ SECURITY.md + CITATION.cff for public-repo polish, and a
45
+ documentary mutmut audit cataloguing math-kernel test strength.
46
+
47
+ ### Added
48
+
49
+ - Section F (mutmut audit, from temporalcv-cross-pollination bundle):
50
+ added `docs/internals/mutmut_audit.md` — documentary code-analysis
51
+ audit of the 5 math kernel modules (`metrics`, `bootstrap`,
52
+ `calibration`, `operating_points`, `thresholds`). Per Q10=A
53
+ acceptance (audit-only, no kill-rate target), the deliverable is
54
+ a catalog of likely surviving mutant patterns per module + an
55
+ assessment of whether the existing test suite would catch them.
56
+ Identifies 3 specific high-leverage gaps for future work:
57
+ (a) calibration fit-vs-eval data isolation, (b) BCa degenerate-
58
+ jackknife fallback assertion strengthening, (c) `empty_strategy`
59
+ default lock-in tests. Programmatic mutmut run deferred: mutmut
60
+ 3.5.0 has a config-parsing bug in our env where `tests_dir =
61
+ "tests/"` is unpacked character-by-character — revisit with mutmut
62
+ v4 or cosmic-ray. Re-run instructions captured in the audit doc.
63
+
64
+ - Section E.2 (mkdocs link cleanup, from temporalcv-cross-pollination
65
+ bundle): fixed 30+ broken relative links across 18 documentation
66
+ files. Pattern: docs that link to `../src/eval_toolkit/<X>.py`
67
+ (works on GitHub render but breaks in mkdocs) now point at the
68
+ auto-generated API reference page (`api/<X>.md`). CHANGELOG.md
69
+ references (also outside the docs tree) repointed to absolute
70
+ GitHub URLs. Down from 93 warnings to 1: the remaining
71
+ `griffe: π : float` is a documented tool limitation — griffe
72
+ doesn't parse Unicode parameter names; the project's STYLE.md
73
+ intentionally allows Unicode in math kernels (`π`, `α`, etc.).
74
+ Also patched `harness.py` RunResult docstring: replaced the Sphinx
75
+ `.. versionchanged::` directive with a NumPy "Notes" section so
76
+ mkdocstrings renders it cleanly. `mkdocs build --strict` would
77
+ fail on the 1 remaining griffe warning, so the docs.yml workflow
78
+ intentionally runs without `--strict`. The link-cleanup deliverable
79
+ is complete; the source-docstring + methodology enrichment passes
80
+ originally scoped for E.2 are deferred (existing docstrings already
81
+ carry References + LaTeX where it matters; methodology pages are
82
+ already strong content-wise — only the link structure needed fixing).
83
+
84
+ - Section E.1 (hosted documentation site, from
85
+ temporalcv-cross-pollination bundle): new mkdocs-material site at
86
+ `https://brandon-behring.github.io/eval-toolkit/`, auto-generated
87
+ from existing Markdown docs + `mkdocstrings`-rendered API reference.
88
+ - `mkdocs.yml` configures the material theme (auto light/dark,
89
+ tabs nav, code-copy buttons, full-text search) with MathJax v3 +
90
+ tikzjax loaded from CDN for full LaTeX + TikZ rendering
91
+ (per Q12=B).
92
+ - `docs/index.md` — site landing page
93
+ - `docs/api/index.md` — curated API landing organized by README's
94
+ three-tier architecture (Tier 1 functional core, Tier 2 protocol
95
+ orchestration, Tier 3 reproducibility scaffolding); per Q8=C.
96
+ - `docs/api/<module>.md` — 22 per-module auto-gen stubs invoking
97
+ `::: eval_toolkit.<module>` mkdocstrings directives.
98
+ - `docs/javascripts/mathjax-config.js` — MathJax v3 init script
99
+ matching mkdocs-material's pymdownx.arithmatex (generic: true).
100
+ - `.github/workflows/docs.yml` deploys to GitHub Pages on every
101
+ push to main + every tag push. Single-version site (no `mike`,
102
+ per Q11=A).
103
+ - `[docs]` optional extra added to `pyproject.toml` listing
104
+ mkdocs-material, mkdocstrings[python], pymdown-extensions.
105
+ - `pyproject.urls.Documentation` repointed at the hosted-docs URL.
106
+ - README badge added: `Docs` linking to the GitHub Pages site.
107
+ - `.gitignore` extended to exclude the mkdocs build output (`/site/`).
108
+ - **Known follow-up**: 30+ relative-link warnings in
109
+ `docs/methodology/*.md` files (links to `../../src/...` and
110
+ `../../CHANGELOG.md`). Workflow temporarily runs without
111
+ `--strict`; Section E.2 fixes these links, though strict mode stays off (one griffe warning remains).
112
+
113
+ - Section D (public-repo polish from temporalcv-cross-pollination bundle):
114
+ added `SECURITY.md` (security disclosure policy with response SLAs,
115
+ scope, and reporter-credit policy); added `CITATION.cff` (machine-
116
+ readable academic citation metadata, exposing the GitHub web UI
117
+ "Cite this repository" button — methodology-relevant primary
118
+ references listed for `bootstrap_ci`, `brier_score`,
119
+ `fit_platt_calibrator`, `delong_roc_variance`, `PurgedKFoldSplitter`).
120
+ Added four trust-set badges to README (CI status, PyPI version,
121
+ Python ≥3.13, License MIT). Extended `pyproject.urls` with a
122
+ `Documentation` key pointing at `docs/getting-started.md` (the
123
+ hosted-docs URL replaces this in Section E.1). Module-docstring
124
+ audit across all 22 `src/eval_toolkit/*.py` modules — all already
125
+ carry adequate module-level docstrings; no patches needed.
126
+
127
+ - Section C (example gallery from temporalcv-cross-pollination bundle):
128
+ six new minimal worked examples in `docs/examples/`, one
129
+ concept per file, Sybil-validated end-to-end in CI:
130
+ - `metrics_and_bootstrap.md` — `pr_auc` / `roc_auc` / `brier_score`
131
+ + `bootstrap_ci` (BCa vs percentile)
132
+ - `evaluate_harness.md` — slice-aware `evaluate(...)` with two
133
+ scorers, `write_run_result(...)`, JSON schema validation
134
+ - `calibration.md` — Platt + isotonic recalibration, ECE before/after
135
+ - `leakage_detection.md` — `ExactDuplicateCheck` +
136
+ `NormalizedFormLeakageCheck` + `LabelConflictCheck` on a
137
+ contaminated train/test pair
138
+ - `claims_and_gates.md` — `EvidenceGate` composition (metric
139
+ threshold + minimum slice size) for release-decision gating
140
+ - `paired_comparison.md` — `paired_bootstrap_diff` for two-scorer
141
+ significance + `mde_from_ci` for power analysis
142
+ - `index.md` — examples landing page mapping each example to the
143
+ capability it demonstrates + the minimum extras required
144
+ Total: 28 Sybil-validated code blocks. Each is the headline-import
145
+ → usable-output minimum surface; together they cover the public API
146
+ surface a new user needs to be productive.
147
+
148
+ - Section B (PurgedKFold splitter from temporalcv-cross-pollination bundle):
149
+ `PurgedKFoldSplitter(n_splits, purge_gap, embargo_pct, time_col)` and a
150
+ standalone `compute_label_overlap(t_train, t_test, horizon)` helper, both
151
+ now public via `from eval_toolkit import ...`. Time-aware k-fold with
152
+ explicit purge gap straddling the test fold + post-test embargo —
153
+ prevents label-window leakage when labels have a forward horizon
154
+ (e.g., H-step forward returns). The standalone helper audits arbitrary
155
+ train/test overlap independent of the splitter. Adapted from López de
156
+ Prado (2018) Chapter 7 via temporalcv's `cv_financial.py`; API names
157
+ preserved verbatim for cross-library muscle memory. Public-API
158
+ drift-guard snapshot regenerated for the two new exports.
159
+
160
+ ### Internal
161
+
162
+ - Section A (Monte Carlo bootstrap CI calibration, from temporalcv-cross-pollination
163
+ bundle): added `tests/test_bootstrap_calibration_mc.py` (slow-marker) that runs
164
+ 500-replicate MC validation of `bootstrap_ci` coverage + bias across 5 cases
165
+ (pr_auc / roc_auc × balanced / imbalanced × n=200 / n=1000 × BCa / percentile
166
+ method). Asserts empirical coverage ∈ [0.90, 0.99] for nominal 95% CIs and
167
+ |bias| < 0.05. Complements Tier 1's golden tests: goldens pin exact numerical
168
+ output (drift detection), MC tests validate that the math is correct (a buggy
169
+ implementation producing self-consistent wrong values fails MC but passes
170
+ goldens). Also added CI width-scaling test (width should shrink as ~1/√n).
171
+ New workflow `.github/workflows/nightly-mc.yml` triggers this suite weekly
172
+ on Sundays at 03:00 UTC (plus `workflow_dispatch` for manual runs). Harness
173
+ pattern adapted from temporalcv's `tests/conftest.py` Monte Carlo helpers.
174
+
175
+ - Test coverage (Tier 1 — math kernel correctness + integration backbone):
176
+ added end-to-end pipeline tests (`tests/test_pipeline_e2e.py`) that
177
+ exercise loader → `evaluate` → `write_run_result` → JSON schema
178
+ validation for `DataFrameLoader` and `SingleSliceLoader` (incl.
179
+ paired-diffs path). Extended `tests/test_metrics_props.py` with
180
+ Brier-score bounds + label/score inversion symmetry properties. Added
181
+ bootstrap CI golden tests (`tests/test_bootstrap_golden.py`, fixture at
182
+ `tests/golden/bootstrap_ci/cases.json`) pinning BCa/percentile output
183
+ on 6 canonical stress points (balanced, imbalanced 5%, small-n=10,
184
+ tied scores) to ±1e-9. Expanded the `golden` pytest marker doc.
185
+
186
+ - Test coverage (Tier 3 — resilience moat): added multi-slice
187
+ fault-injection tests (`tests/test_harness_fault_injection.py`) that
188
+ exercise `on_scorer_error="record"` across three slices where the
189
+ scorer succeeds on the middle one and fails on the outer two —
190
+ asserts per-(slice, scorer) independence (no error-state bleed) plus
191
+ a healthy-vs-faulting scorer parity check against a no-fault control.
192
+ Added exactness tests for `TargetFPRSelector`
193
+ (`tests/test_thresholds.py`): analytical answer on
194
+ perfectly-separable data plus a golden-style pinned threshold value
195
+ for a canonical (n=500, seed=42) overlapping distribution across
196
+ target FPRs 0.01 / 0.05 / 0.10 / 0.20, with a monotonicity invariant.
197
+ Added calibration determinism tests
198
+ (`tests/test_calibration_determinism.py`): same `(y, score)` produces
199
+ bit-identical Platt fit `a`/`b` parameters and isotonic transform
200
+ output across runs, parametrized over 1% / 50% / 99% positive
201
+ prevalence. Added NaN/+inf/-inf rejection tests for `pr_auc`,
202
+ `roc_auc`, `brier_score` to `tests/test_metrics_props.py` —
203
+ parametrized; locks the input-validation contract.
204
+
205
+ - Test coverage (Tier 2 — public-contract + integration breadth):
206
+ added public-API drift guard (`tests/test_public_api.py`, fixture at
207
+ `tests/golden/public_api/snapshot.json`) that snapshots all 199 names
208
+ in `eval_toolkit.__all__` with signatures, class bases, first
209
+ docstring lines, and primitive-value summaries. Drift now requires an
210
+ explicit golden-regeneration commit. Extended
211
+ `tests/test_pipeline_e2e.py` with `ParquetGlobLoader` round-trip
212
+ (synthetic parquet → glob → load → evaluate → schema-validate; gated
213
+ on `pyarrow`). Extended `tests/test_artifacts.py` with four manifest
214
+ v2↔v3 dispatcher tests: v3 well-formed accepted; v3 missing
215
+ `contamination_flags` rejected; v3 with unknown enum value rejected;
216
+ v2 payloads still routed to v2 schema (no eager v3 demotion).
217
+
10
218
  ## [0.27.2] — 2026-05-15 — fix base-install pandas import
11
219
 
12
220
  Base install of `eval-toolkit` (no extras) was broken in 0.27.1: every
@@ -1,8 +1,9 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: eval-toolkit
3
- Version: 0.27.2
3
+ Version: 0.28.1
4
4
  Summary: Reusable evaluation contracts for binary classification: metrics, bootstrap CIs, calibration, artifacts, and evidence gates.
5
5
  Project-URL: Homepage, https://github.com/brandon-behring/eval-toolkit
6
+ Project-URL: Documentation, https://brandon-behring.github.io/eval-toolkit/
6
7
  Project-URL: Repository, https://github.com/brandon-behring/eval-toolkit.git
7
8
  Project-URL: Issues, https://github.com/brandon-behring/eval-toolkit/issues
8
9
  Project-URL: Changelog, https://github.com/brandon-behring/eval-toolkit/blob/main/CHANGELOG.md
@@ -49,6 +50,10 @@ Requires-Dist: pytest>=8.0; extra == 'dev'
49
50
  Requires-Dist: pyyaml>=6.0; extra == 'dev'
50
51
  Requires-Dist: ruff>=0.5; extra == 'dev'
51
52
  Requires-Dist: sybil>=10.0; extra == 'dev'
53
+ Provides-Extra: docs
54
+ Requires-Dist: mkdocs-material>=9.5; extra == 'docs'
55
+ Requires-Dist: mkdocstrings[python]>=0.24; extra == 'docs'
56
+ Requires-Dist: pymdown-extensions>=10.7; extra == 'docs'
52
57
  Provides-Extra: parquet
53
58
  Requires-Dist: pyarrow>=15.0; extra == 'parquet'
54
59
  Provides-Extra: plotting
@@ -63,6 +68,12 @@ Description-Content-Type: text/markdown
63
68
 
64
69
  # eval-toolkit
65
70
 
71
+ [![CI](https://github.com/brandon-behring/eval-toolkit/actions/workflows/ci.yml/badge.svg)](https://github.com/brandon-behring/eval-toolkit/actions/workflows/ci.yml)
72
+ [![Docs](https://github.com/brandon-behring/eval-toolkit/actions/workflows/docs.yml/badge.svg)](https://brandon-behring.github.io/eval-toolkit/)
73
+ [![PyPI version](https://img.shields.io/pypi/v/eval-toolkit.svg)](https://pypi.org/project/eval-toolkit/)
74
+ [![Python ≥3.13](https://img.shields.io/badge/python-%E2%89%A53.13-blue.svg)](https://pypi.org/project/eval-toolkit/)
75
+ [![License: MIT](https://img.shields.io/badge/license-MIT-yellow.svg)](LICENSE)
76
+
66
77
  A **methodology-aware evaluation harness for binary classification**:
67
78
  metrics, bootstrap CIs, calibration, leakage detection, splitting,
68
79
  threshold selection, dataset loading, reproducibility manifests, and a
@@ -1,5 +1,11 @@
1
1
  # eval-toolkit
2
2
 
3
+ [![CI](https://github.com/brandon-behring/eval-toolkit/actions/workflows/ci.yml/badge.svg)](https://github.com/brandon-behring/eval-toolkit/actions/workflows/ci.yml)
4
+ [![Docs](https://github.com/brandon-behring/eval-toolkit/actions/workflows/docs.yml/badge.svg)](https://brandon-behring.github.io/eval-toolkit/)
5
+ [![PyPI version](https://img.shields.io/pypi/v/eval-toolkit.svg)](https://pypi.org/project/eval-toolkit/)
6
+ [![Python ≥3.13](https://img.shields.io/badge/python-%E2%89%A53.13-blue.svg)](https://pypi.org/project/eval-toolkit/)
7
+ [![License: MIT](https://img.shields.io/badge/license-MIT-yellow.svg)](LICENSE)
8
+
3
9
  A **methodology-aware evaluation harness for binary classification**:
4
10
  metrics, bootstrap CIs, calibration, leakage detection, splitting,
5
11
  threshold selection, dataset loading, reproducibility manifests, and a
@@ -54,6 +54,15 @@ parquet = ["pyarrow>=15.0"]
54
54
  # resolve cleanly; jsonschema is now in the base deps. Remove in v0.17.0
55
55
  # after a deprecation window if no downstream consumer complains.
56
56
  validation = []
57
+ # v0.28.0 (Section E.1): docs site via mkdocs-material + mkdocstrings.
58
+ # Per Q4=B in the temporalcv-cross-pollination plan; LaTeX via MathJax v3
59
+ # (configured in docs/javascripts/mathjax-config.js) and tikzjax loaded
60
+ # from CDN in mkdocs.yml.
61
+ docs = [
62
+ "mkdocs-material>=9.5",
63
+ "mkdocstrings[python]>=0.24",
64
+ "pymdown-extensions>=10.7",
65
+ ]
57
66
  # `all` references the sub-extras directly (PEP 685 self-reference). This
58
67
  # avoids drift: adding a dep to e.g. `dataframe` no longer requires a
59
68
  # mirroring edit here.
@@ -72,6 +81,7 @@ dev = [
72
81
 
73
82
  [project.urls]
74
83
  Homepage = "https://github.com/brandon-behring/eval-toolkit"
84
+ Documentation = "https://brandon-behring.github.io/eval-toolkit/"
75
85
  Repository = "https://github.com/brandon-behring/eval-toolkit.git"
76
86
  Issues = "https://github.com/brandon-behring/eval-toolkit/issues"
77
87
  Changelog = "https://github.com/brandon-behring/eval-toolkit/blob/main/CHANGELOG.md"
@@ -146,8 +156,9 @@ markers = [
146
156
  "unit: Sklearn-reference and analytical correctness tests",
147
157
  "property: Hypothesis property-based invariant tests",
148
158
  "smoke: End-to-end smoke tests",
149
- "golden: Snapshot tests for deterministic outputs (docs.py)",
159
+ "golden: Snapshot tests for deterministic outputs (docs renderer, bootstrap CI numerical pins, public API surface).",
150
160
  "slow: Tests > 2s (bootstrap-t studentized, multi-seed K-fold). Opt out with `pytest -m 'not slow'`.",
161
+ "monte_carlo: Monte Carlo calibration suite (~14 min). Skipped in PR CI; runs only in the nightly-mc workflow via `-m monte_carlo`.",
151
162
  ]
152
163
 
153
164
  [tool.coverage.run]
@@ -183,10 +183,12 @@ _EXPORTS: dict[str, str] = {
183
183
  "GroupKFoldSplitter": "eval_toolkit.splits",
184
184
  "HoldoutSplitter": "eval_toolkit.splits",
185
185
  "PoolBuilder": "eval_toolkit.splits",
186
+ "PurgedKFoldSplitter": "eval_toolkit.splits",
186
187
  "SourceDisjointKFoldSplitter": "eval_toolkit.splits",
187
188
  "Splitter": "eval_toolkit.splits",
188
189
  "StratifiedKFoldSplitter": "eval_toolkit.splits",
189
190
  "TimeSeriesSplitter": "eval_toolkit.splits",
191
+ "compute_label_overlap": "eval_toolkit.splits",
190
192
  "iter_folds_with_pool": "eval_toolkit.splits",
191
193
  "DEFAULT_DEDUP_THRESHOLD": "eval_toolkit.text_dedup",
192
194
  "DedupReport": "eval_toolkit.text_dedup",
@@ -2,4 +2,4 @@
2
2
 
3
3
  __all__ = ["__version__"]
4
4
 
5
- __version__ = "0.27.2"
5
+ __version__ = "0.28.1"
@@ -185,9 +185,11 @@ class RunResult:
185
185
  JSON schema version. ``"v1"`` for v0.7.0+; downstream parsers gate
186
186
  on this.
187
187
 
188
- .. versionchanged:: 0.7.0
189
- Added ``by_fold``, ``fold_summary``, ``schema_version`` (additive,
190
- defaults empty / ``"v1"`` — backward compatible).
188
+ Notes
189
+ -----
190
+ Changed in 0.7.0: added ``by_fold``, ``fold_summary``,
191
+ ``schema_version`` (additive, defaults empty / ``"v1"`` — backward
192
+ compatible).
191
193
  """
192
194
 
193
195
  run_id: str
@@ -36,10 +36,12 @@ __all__ = [
36
36
  "GroupKFoldSplitter",
37
37
  "HoldoutSplitter",
38
38
  "PoolBuilder",
39
+ "PurgedKFoldSplitter",
39
40
  "SourceDisjointKFoldSplitter",
40
41
  "Splitter",
41
42
  "StratifiedKFoldSplitter",
42
43
  "TimeSeriesSplitter",
44
+ "compute_label_overlap",
43
45
  "iter_folds_with_pool",
44
46
  ]
45
47
 
@@ -513,3 +515,283 @@ def iter_folds_with_pool(
513
515
  # PoolBuilder's keys (train, val, possibly more) take precedence;
514
516
  # test is reattached from the Splitter.
515
517
  yield {**built, "test": test}
518
+
519
+
520
+ # ---------------------------------------------------------------------------
521
+ # Purged K-fold for label-overlap protection (v0.28.0)
522
+ #
523
+ # Adapted from temporalcv (Behring 2026) for the financial / forecasting
524
+ # label-overlap case: when labels use future data (e.g., H-day forward
525
+ # returns), train and test folds can overlap in their LABEL windows even
526
+ # when their FEATURE windows don't. Purging drops a band of training
527
+ # samples within ``purge_gap`` of each test fold; embargo drops an
528
+ # additional fraction of n samples bordering each test fold.
529
+ # ---------------------------------------------------------------------------
530
+
531
+
532
+ def compute_label_overlap(
533
+ t_train: np.ndarray,
534
+ t_test: np.ndarray,
535
+ horizon: int,
536
+ ) -> np.ndarray:
537
+ r"""Boolean ``(n_train, n_test)`` matrix: True where label windows overlap.
538
+
539
+ For h-step forward labels, the label at time ``t`` depends on the data
540
+ at times ``[t, t+h)``. Two samples ``t_train[i]`` and ``t_test[j]``
541
+ have label-window overlap if their windows share at least one time
542
+ point — equivalently, if ``|t_train[i] - t_test[j]| < horizon``.
543
+
544
+ Use this to audit whether a given train/test split has any label
545
+ leakage. Standalone helper; does NOT require a particular splitter.
546
+
547
+ Parameters
548
+ ----------
549
+ t_train : np.ndarray, shape (n_train,)
550
+ Time indices of the training set (any sortable numeric type).
551
+ t_test : np.ndarray, shape (n_test,)
552
+ Time indices of the test set.
553
+ horizon : int
554
+ Label horizon (e.g., ``5`` for 5-step forward returns). Must be
555
+ non-negative; ``horizon=0`` means no overlap is possible.
556
+
557
+ Returns
558
+ -------
559
+ np.ndarray, shape (n_train, n_test), dtype bool
560
+ Entry ``(i, j)`` is ``True`` iff
561
+ ``|t_train[i] - t_test[j]| < horizon``.
562
+
563
+ Raises
564
+ ------
565
+ ValueError
566
+ If ``horizon`` is negative.
567
+
568
+ Examples
569
+ --------
570
+ >>> import numpy as np
571
+ >>> t_train = np.array([0, 1, 5, 6])
572
+ >>> t_test = np.array([3, 4])
573
+ >>> overlap = compute_label_overlap(t_train, t_test, horizon=3)
574
+ >>> overlap
575
+ array([[False, False],
576
+ [ True, False],
577
+ [ True, True],
578
+ [False, True]])
579
+ >>> # Sample 0 (t=0): no overlap with test (|0-3|=3, |0-4|=4 ≥ horizon)
580
+ >>> # Sample 1 (t=1): overlaps test[0]=3 (|1-3|=2 < 3)
581
+ >>> # Sample 2 (t=5): overlaps both (|5-3|=2, |5-4|=1)
582
+ >>> # Sample 3 (t=6): overlaps test[1]=4 (|6-4|=2 < 3)
583
+
584
+ Notes
585
+ -----
586
+ The check is **symmetric in time**: ``|t_train - t_test| < horizon``
587
+ treats overlap in either temporal direction equally. For strictly
588
+ forward-only label overlap (train-before-test), filter the result
589
+ with ``(t_test[None, :] - t_train[:, None]) > 0``.
590
+
591
+ For h-step forward labels: label at time t covers ``[t, t+h)``, so
592
+ two labels at times ``t1, t2`` share data iff their intervals
593
+ overlap, which holds iff ``|t1 - t2| < h``.
594
+
595
+ References
596
+ ----------
597
+ .. [1] López de Prado, M. (2018). "Advances in Financial Machine
598
+ Learning." Wiley. Chapter 7: Cross-Validation in Finance.
599
+ """
600
+ if horizon < 0:
601
+ raise ValueError(f"horizon must be >= 0, got {horizon}")
602
+ if horizon == 0:
603
+ return np.zeros((len(t_train), len(t_test)), dtype=bool)
604
+ t_train_arr = np.asarray(t_train)
605
+ t_test_arr = np.asarray(t_test)
606
+ # Outer absolute difference: (n_train, n_test)
607
+ dist = np.abs(t_train_arr[:, None] - t_test_arr[None, :])
608
+ overlap: np.ndarray = dist < horizon
609
+ return overlap
610
+
611
+
612
+ def _apply_purge_embargo(
613
+ test_idx: np.ndarray,
614
+ n_samples: int,
615
+ purge_gap: int,
616
+ embargo_pct: float,
617
+ ) -> np.ndarray:
618
+ """Build a training-index array excluding the test fold + purge + embargo.
619
+
620
+ The test fold's indices are contiguous (TimeSeriesSplit-style); purging
621
+ drops `[test_min - purge_gap, test_max + purge_gap]` from training;
622
+ embargo drops the `floor(embargo_pct * n_samples)` indices immediately
623
+ after the test fold, overlapping the purge band rather than extending it (one-sided: protects the post-test region from
624
+ label-window leakage when labels are forward-looking).
625
+
626
+ Adapted from temporalcv's ``_apply_purge_and_embargo`` but vectorized
627
+ (no Python-level set/loop) and asymmetric-by-default (embargo only on
628
+ the post-test side, matching López de Prado's original definition).
629
+ """
630
+ test_min = int(np.min(test_idx))
631
+ test_max = int(np.max(test_idx))
632
+ purge_start = max(0, test_min - purge_gap)
633
+ purge_end = min(n_samples, test_max + 1 + purge_gap)
634
+ n_embargo = int(embargo_pct * n_samples)
635
+ embargo_end = min(n_samples, test_max + 1 + n_embargo)
636
+
637
+ full_idx = np.arange(n_samples)
638
+ # Mask out: the test fold itself + purge band on both sides + post-test embargo
639
+ keep = np.ones(n_samples, dtype=bool)
640
+ keep[purge_start:purge_end] = False # zeroes out test + purge band
641
+ keep[test_max + 1 : embargo_end] = False # post-test embargo
642
+ return full_idx[keep]
643
+
644
+
645
+ @dataclass(frozen=True, slots=True)
646
+ class PurgedKFoldSplitter:
647
+ r"""Time-aware k-fold with explicit purge gap + post-test embargo.
648
+
649
+ Pattern from López de Prado (2018) Ch. 7: when labels have a forward
650
+ lookahead (e.g., H-step returns), train and test folds can overlap in
651
+ their **label windows** even when their **feature windows** don't.
652
+ Standard k-fold leaks information through this overlap. PurgedKFold
653
+ drops a ``purge_gap``-sample band straddling each test fold's boundary
654
+ plus a post-test ``embargo_pct * n`` window — preventing both
655
+ backward and forward label-overlap leakage.
656
+
657
+ Implements the :class:`Splitter` Protocol, yielding
658
+ ``{"train": EvalSlice, "test": EvalSlice}`` dicts.
659
+
660
+ Parameters
661
+ ----------
662
+ n_splits : int, optional
663
+ Number of folds. Default 5. Must be ≥ 2.
664
+ purge_gap : int, optional
665
+ Samples to drop on each side of every test fold's boundary.
666
+ Default 0 (no purging — equivalent to plain contiguous k-fold on the time-ordered rows).
667
+ For h-step forward labels, ``purge_gap=h`` is the canonical choice.
668
+ embargo_pct : float, optional
669
+ Additional embargo as a fraction of total ``n``, applied **after**
670
+ each test fold (one-sided, López de Prado convention). Default
671
+ 0.0. Typical: 0.01 (1%).
672
+ time_col : str or None, optional
673
+ Column carrying a sortable timestamp. If set, the parent slice is
674
+ sorted by this column before splitting. ``None`` assumes the slice
675
+ is already in temporal order. Default ``"timestamp"``.
676
+
677
+ Raises
678
+ ------
679
+ ValueError
680
+ At construction time if ``n_splits < 2`` or ``purge_gap < 0`` or
681
+ ``embargo_pct ∉ [0, 1)``.
682
+ KeyError
683
+ At ``iter_folds`` time if ``time_col`` is set but not present in
684
+ the slice DataFrame.
685
+
686
+ Examples
687
+ --------
688
+ >>> import pandas as pd
689
+ >>> from eval_toolkit.harness import EvalSlice
690
+ >>> from eval_toolkit.splits import PurgedKFoldSplitter
691
+ >>> df = pd.DataFrame({
692
+ ... "text": [f"row{i}" for i in range(50)],
693
+ ... "label": [i % 2 for i in range(50)],
694
+ ... "t": list(range(50)),
695
+ ... })
696
+ >>> parent = EvalSlice(name="all", df=df)
697
+ >>> spl = PurgedKFoldSplitter(n_splits=5, purge_gap=2, embargo_pct=0.02, time_col="t")
698
+ >>> folds = list(spl.iter_folds(parent))
699
+ >>> len(folds)
700
+ 5
701
+ >>> sorted(folds[0].keys())
702
+ ['test', 'train']
703
+
704
+ Notes
705
+ -----
706
+ **Two units in one signature**: ``purge_gap`` is an absolute count of
707
+ samples (int) while ``embargo_pct`` is a fraction (float). This
708
+ mirrors López de Prado / temporalcv conventions verbatim — users
709
+ moving between libraries see the same parameter names. Use the
710
+ standalone helper :func:`compute_label_overlap` to size ``purge_gap``
711
+ for a known label horizon.
712
+
713
+ See Also
714
+ --------
715
+ eval_toolkit.splits.compute_label_overlap :
716
+ Audit label-window overlap between arbitrary train/test sets.
717
+ eval_toolkit.splits.TimeSeriesSplitter :
718
+ Time-aware k-fold without purging — use when labels have no
719
+ lookahead horizon.
720
+
721
+ References
722
+ ----------
723
+ .. [1] López de Prado, M. (2018). "Advances in Financial Machine
724
+ Learning." Wiley. Chapter 7.
725
+ """
726
+
727
+ n_splits: int = 5
728
+ purge_gap: int = 0
729
+ embargo_pct: float = 0.0
730
+ time_col: str | None = "timestamp"
731
+
732
+ def __post_init__(self) -> None:
733
+ """Validate parameters."""
734
+ if self.n_splits < 2:
735
+ raise ValueError(f"n_splits must be >= 2, got {self.n_splits}")
736
+ if self.purge_gap < 0:
737
+ raise ValueError(f"purge_gap must be >= 0, got {self.purge_gap}")
738
+ if not 0.0 <= self.embargo_pct < 1.0:
739
+ raise ValueError(f"embargo_pct must be in [0, 1), got {self.embargo_pct}")
740
+
741
+ def iter_folds(
742
+ self,
743
+ slice_: EvalSlice,
744
+ *,
745
+ groups: np.ndarray | None = None,
746
+ ) -> Iterator[dict[str, EvalSlice]]:
747
+ """Yield ``n_splits`` fold dicts with purge + embargo applied.
748
+
749
+ Raises
750
+ ------
751
+ KeyError
752
+ If ``self.time_col`` is set but not present in ``slice_.df``.
753
+ """
754
+ if self.time_col is not None:
755
+ if self.time_col not in slice_.df.columns:
756
+ raise KeyError(
757
+ f"time_col {self.time_col!r} not in slice columns " f"{list(slice_.df.columns)}"
758
+ )
759
+ sorted_df = slice_.df.sort_values(self.time_col).reset_index(drop=True)
760
+ sorted_slice = EvalSlice(
761
+ name=slice_.name,
762
+ df=sorted_df,
763
+ description=slice_.description,
764
+ feature_col=slice_.feature_col,
765
+ label_col=slice_.label_col,
766
+ strata_col=slice_.strata_col,
767
+ )
768
+ else:
769
+ sorted_slice = slice_
770
+
771
+ n_samples = len(sorted_slice.df)
772
+ if self.n_splits >= n_samples:
773
+ raise ValueError(f"n_splits ({self.n_splits}) must be < n_samples ({n_samples})")
774
+
775
+ # Fold sizes (mirrors KFold / temporalcv: leading folds
776
+ # absorb the remainder)
777
+ fold_sizes = np.full(self.n_splits, n_samples // self.n_splits)
778
+ fold_sizes[: n_samples % self.n_splits] += 1
779
+
780
+ current = 0
781
+ for fold_size in fold_sizes:
782
+ test_idx = np.arange(current, current + fold_size)
783
+ train_idx = _apply_purge_embargo(
784
+ test_idx,
785
+ n_samples=n_samples,
786
+ purge_gap=self.purge_gap,
787
+ embargo_pct=self.embargo_pct,
788
+ )
789
+ yield {
790
+ "train": _slice_subset(sorted_slice, train_idx, "train"),
791
+ "test": _slice_subset(sorted_slice, test_idx, "test"),
792
+ }
793
+ current += fold_size
794
+
795
+ def get_n_splits(self, slice_: EvalSlice) -> int:
796
+ """Return ``self.n_splits``."""
797
+ return self.n_splits