eval-toolkit 0.27.2__tar.gz → 0.28.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (134)
  1. {eval_toolkit-0.27.2 → eval_toolkit-0.28.0}/.gitignore +3 -0
  2. {eval_toolkit-0.27.2 → eval_toolkit-0.28.0}/CHANGELOG.md +182 -0
  3. {eval_toolkit-0.27.2 → eval_toolkit-0.28.0}/PKG-INFO +12 -1
  4. {eval_toolkit-0.27.2 → eval_toolkit-0.28.0}/README.md +6 -0
  5. {eval_toolkit-0.27.2 → eval_toolkit-0.28.0}/pyproject.toml +12 -1
  6. {eval_toolkit-0.27.2 → eval_toolkit-0.28.0}/src/eval_toolkit/__init__.py +2 -0
  7. {eval_toolkit-0.27.2 → eval_toolkit-0.28.0}/src/eval_toolkit/_version.py +1 -1
  8. {eval_toolkit-0.27.2 → eval_toolkit-0.28.0}/src/eval_toolkit/harness.py +5 -3
  9. {eval_toolkit-0.27.2 → eval_toolkit-0.28.0}/src/eval_toolkit/splits.py +282 -0
  10. eval_toolkit-0.28.0/tests/golden/bootstrap_ci/cases.json +50 -0
  11. eval_toolkit-0.28.0/tests/golden/public_api/snapshot.json +1513 -0
  12. {eval_toolkit-0.27.2 → eval_toolkit-0.28.0}/tests/test_artifacts.py +91 -0
  13. eval_toolkit-0.28.0/tests/test_bootstrap_calibration_mc.py +306 -0
  14. eval_toolkit-0.28.0/tests/test_bootstrap_golden.py +215 -0
  15. eval_toolkit-0.28.0/tests/test_calibration_determinism.py +114 -0
  16. eval_toolkit-0.28.0/tests/test_harness_fault_injection.py +179 -0
  17. {eval_toolkit-0.27.2 → eval_toolkit-0.28.0}/tests/test_metrics_props.py +99 -0
  18. eval_toolkit-0.28.0/tests/test_pipeline_e2e.py +258 -0
  19. eval_toolkit-0.28.0/tests/test_public_api.py +186 -0
  20. eval_toolkit-0.28.0/tests/test_splits.py +365 -0
  21. eval_toolkit-0.28.0/tests/test_thresholds.py +265 -0
  22. eval_toolkit-0.27.2/tests/test_splits.py +0 -162
  23. eval_toolkit-0.27.2/tests/test_thresholds.py +0 -155
  24. {eval_toolkit-0.27.2 → eval_toolkit-0.28.0}/LICENSE +0 -0
  25. {eval_toolkit-0.27.2 → eval_toolkit-0.28.0}/STYLE.md +0 -0
  26. {eval_toolkit-0.27.2 → eval_toolkit-0.28.0}/docs/methodology/README.md +0 -0
  27. {eval_toolkit-0.27.2 → eval_toolkit-0.28.0}/docs/research/README.md +0 -0
  28. {eval_toolkit-0.27.2 → eval_toolkit-0.28.0}/docs/research/datasets/README.md +0 -0
  29. {eval_toolkit-0.27.2 → eval_toolkit-0.28.0}/docs/research/papers/data-integrity/README.md +0 -0
  30. {eval_toolkit-0.27.2 → eval_toolkit-0.28.0}/docs/research/papers/eval-ecosystem/README.md +0 -0
  31. {eval_toolkit-0.27.2 → eval_toolkit-0.28.0}/docs/research/papers/inference/README.md +0 -0
  32. {eval_toolkit-0.27.2 → eval_toolkit-0.28.0}/docs/research/papers/prompt-injection/README.md +0 -0
  33. {eval_toolkit-0.27.2 → eval_toolkit-0.28.0}/src/eval_toolkit/__main__.py +0 -0
  34. {eval_toolkit-0.27.2 → eval_toolkit-0.28.0}/src/eval_toolkit/analysis.py +0 -0
  35. {eval_toolkit-0.27.2 → eval_toolkit-0.28.0}/src/eval_toolkit/artifacts.py +0 -0
  36. {eval_toolkit-0.27.2 → eval_toolkit-0.28.0}/src/eval_toolkit/bootstrap.py +0 -0
  37. {eval_toolkit-0.27.2 → eval_toolkit-0.28.0}/src/eval_toolkit/calibration.py +0 -0
  38. {eval_toolkit-0.27.2 → eval_toolkit-0.28.0}/src/eval_toolkit/claims.py +0 -0
  39. {eval_toolkit-0.27.2 → eval_toolkit-0.28.0}/src/eval_toolkit/config.py +0 -0
  40. {eval_toolkit-0.27.2 → eval_toolkit-0.28.0}/src/eval_toolkit/docs.py +0 -0
  41. {eval_toolkit-0.27.2 → eval_toolkit-0.28.0}/src/eval_toolkit/evidence.py +0 -0
  42. {eval_toolkit-0.27.2 → eval_toolkit-0.28.0}/src/eval_toolkit/leakage.py +0 -0
  43. {eval_toolkit-0.27.2 → eval_toolkit-0.28.0}/src/eval_toolkit/loaders.py +0 -0
  44. {eval_toolkit-0.27.2 → eval_toolkit-0.28.0}/src/eval_toolkit/manifest.py +0 -0
  45. {eval_toolkit-0.27.2 → eval_toolkit-0.28.0}/src/eval_toolkit/metrics.py +0 -0
  46. {eval_toolkit-0.27.2 → eval_toolkit-0.28.0}/src/eval_toolkit/operating_points.py +0 -0
  47. {eval_toolkit-0.27.2 → eval_toolkit-0.28.0}/src/eval_toolkit/paths.py +0 -0
  48. {eval_toolkit-0.27.2 → eval_toolkit-0.28.0}/src/eval_toolkit/plotting.py +0 -0
  49. {eval_toolkit-0.27.2 → eval_toolkit-0.28.0}/src/eval_toolkit/protocols.py +0 -0
  50. {eval_toolkit-0.27.2 → eval_toolkit-0.28.0}/src/eval_toolkit/provenance.py +0 -0
  51. {eval_toolkit-0.27.2 → eval_toolkit-0.28.0}/src/eval_toolkit/py.typed +0 -0
  52. {eval_toolkit-0.27.2 → eval_toolkit-0.28.0}/src/eval_toolkit/schemas/manifest.v1.json +0 -0
  53. {eval_toolkit-0.27.2 → eval_toolkit-0.28.0}/src/eval_toolkit/schemas/manifest.v2.json +0 -0
  54. {eval_toolkit-0.27.2 → eval_toolkit-0.28.0}/src/eval_toolkit/schemas/manifest.v3.json +0 -0
  55. {eval_toolkit-0.27.2 → eval_toolkit-0.28.0}/src/eval_toolkit/schemas/results.v1.json +0 -0
  56. {eval_toolkit-0.27.2 → eval_toolkit-0.28.0}/src/eval_toolkit/schemas/results_full.v1.json +0 -0
  57. {eval_toolkit-0.27.2 → eval_toolkit-0.28.0}/src/eval_toolkit/seeds.py +0 -0
  58. {eval_toolkit-0.27.2 → eval_toolkit-0.28.0}/src/eval_toolkit/text_dedup.py +0 -0
  59. {eval_toolkit-0.27.2 → eval_toolkit-0.28.0}/src/eval_toolkit/thresholds.py +0 -0
  60. {eval_toolkit-0.27.2 → eval_toolkit-0.28.0}/tests/baseline/test_plotting_visual/plot_bootstrap_distribution.png +0 -0
  61. {eval_toolkit-0.27.2 → eval_toolkit-0.28.0}/tests/baseline/test_plotting_visual/plot_confusion_matrix_grid.png +0 -0
  62. {eval_toolkit-0.27.2 → eval_toolkit-0.28.0}/tests/baseline/test_plotting_visual/plot_lift_ci.png +0 -0
  63. {eval_toolkit-0.27.2 → eval_toolkit-0.28.0}/tests/baseline/test_plotting_visual/plot_metric_bars.png +0 -0
  64. {eval_toolkit-0.27.2 → eval_toolkit-0.28.0}/tests/baseline/test_plotting_visual/plot_pr_curve.png +0 -0
  65. {eval_toolkit-0.27.2 → eval_toolkit-0.28.0}/tests/baseline/test_plotting_visual/plot_reliability_diagram.png +0 -0
  66. {eval_toolkit-0.27.2 → eval_toolkit-0.28.0}/tests/baseline/test_plotting_visual/plot_score_histograms.png +0 -0
  67. {eval_toolkit-0.27.2 → eval_toolkit-0.28.0}/tests/conftest.py +0 -0
  68. {eval_toolkit-0.27.2 → eval_toolkit-0.28.0}/tests/golden/docs/expected.md +0 -0
  69. {eval_toolkit-0.27.2 → eval_toolkit-0.28.0}/tests/golden/docs/input.md +0 -0
  70. {eval_toolkit-0.27.2 → eval_toolkit-0.28.0}/tests/golden/docs/metrics.json +0 -0
  71. {eval_toolkit-0.27.2 → eval_toolkit-0.28.0}/tests/strategies.py +0 -0
  72. {eval_toolkit-0.27.2 → eval_toolkit-0.28.0}/tests/test_analysis.py +0 -0
  73. {eval_toolkit-0.27.2 → eval_toolkit-0.28.0}/tests/test_bootstrap_edge_cases.py +0 -0
  74. {eval_toolkit-0.27.2 → eval_toolkit-0.28.0}/tests/test_bootstrap_props.py +0 -0
  75. {eval_toolkit-0.27.2 → eval_toolkit-0.28.0}/tests/test_bootstrap_research_grounded.py +0 -0
  76. {eval_toolkit-0.27.2 → eval_toolkit-0.28.0}/tests/test_bootstrap_unit.py +0 -0
  77. {eval_toolkit-0.27.2 → eval_toolkit-0.28.0}/tests/test_calibration_bootstrap_chain.py +0 -0
  78. {eval_toolkit-0.27.2 → eval_toolkit-0.28.0}/tests/test_calibration_optimization_failures.py +0 -0
  79. {eval_toolkit-0.27.2 → eval_toolkit-0.28.0}/tests/test_calibration_props.py +0 -0
  80. {eval_toolkit-0.27.2 → eval_toolkit-0.28.0}/tests/test_calibration_research_grounded.py +0 -0
  81. {eval_toolkit-0.27.2 → eval_toolkit-0.28.0}/tests/test_calibration_unit.py +0 -0
  82. {eval_toolkit-0.27.2 → eval_toolkit-0.28.0}/tests/test_claims.py +0 -0
  83. {eval_toolkit-0.27.2 → eval_toolkit-0.28.0}/tests/test_claims_coverage.py +0 -0
  84. {eval_toolkit-0.27.2 → eval_toolkit-0.28.0}/tests/test_claims_props.py +0 -0
  85. {eval_toolkit-0.27.2 → eval_toolkit-0.28.0}/tests/test_cli.py +0 -0
  86. {eval_toolkit-0.27.2 → eval_toolkit-0.28.0}/tests/test_config.py +0 -0
  87. {eval_toolkit-0.27.2 → eval_toolkit-0.28.0}/tests/test_coverage_gap.py +0 -0
  88. {eval_toolkit-0.27.2 → eval_toolkit-0.28.0}/tests/test_dedup_split_leakage_chain.py +0 -0
  89. {eval_toolkit-0.27.2 → eval_toolkit-0.28.0}/tests/test_docs_golden.py +0 -0
  90. {eval_toolkit-0.27.2 → eval_toolkit-0.28.0}/tests/test_docs_props.py +0 -0
  91. {eval_toolkit-0.27.2 → eval_toolkit-0.28.0}/tests/test_evidence_validators.py +0 -0
  92. {eval_toolkit-0.27.2 → eval_toolkit-0.28.0}/tests/test_harness_edge_cases.py +0 -0
  93. {eval_toolkit-0.27.2 → eval_toolkit-0.28.0}/tests/test_harness_internals.py +0 -0
  94. {eval_toolkit-0.27.2 → eval_toolkit-0.28.0}/tests/test_harness_smoke.py +0 -0
  95. {eval_toolkit-0.27.2 → eval_toolkit-0.28.0}/tests/test_harness_v07.py +0 -0
  96. {eval_toolkit-0.27.2 → eval_toolkit-0.28.0}/tests/test_harness_v22.py +0 -0
  97. {eval_toolkit-0.27.2 → eval_toolkit-0.28.0}/tests/test_import_boundaries.py +0 -0
  98. {eval_toolkit-0.27.2 → eval_toolkit-0.28.0}/tests/test_leakage.py +0 -0
  99. {eval_toolkit-0.27.2 → eval_toolkit-0.28.0}/tests/test_leakage_error_paths.py +0 -0
  100. {eval_toolkit-0.27.2 → eval_toolkit-0.28.0}/tests/test_leakage_props.py +0 -0
  101. {eval_toolkit-0.27.2 → eval_toolkit-0.28.0}/tests/test_loaders.py +0 -0
  102. {eval_toolkit-0.27.2 → eval_toolkit-0.28.0}/tests/test_loaders_coverage.py +0 -0
  103. {eval_toolkit-0.27.2 → eval_toolkit-0.28.0}/tests/test_loaders_props.py +0 -0
  104. {eval_toolkit-0.27.2 → eval_toolkit-0.28.0}/tests/test_manifest.py +0 -0
  105. {eval_toolkit-0.27.2 → eval_toolkit-0.28.0}/tests/test_manifest_contamination_round_trip.py +0 -0
  106. {eval_toolkit-0.27.2 → eval_toolkit-0.28.0}/tests/test_manifest_props.py +0 -0
  107. {eval_toolkit-0.27.2 → eval_toolkit-0.28.0}/tests/test_manifest_validation.py +0 -0
  108. {eval_toolkit-0.27.2 → eval_toolkit-0.28.0}/tests/test_metrics_stratified_subsets.py +0 -0
  109. {eval_toolkit-0.27.2 → eval_toolkit-0.28.0}/tests/test_metrics_unit.py +0 -0
  110. {eval_toolkit-0.27.2 → eval_toolkit-0.28.0}/tests/test_misc_coverage.py +0 -0
  111. {eval_toolkit-0.27.2 → eval_toolkit-0.28.0}/tests/test_numeric_edge_cases.py +0 -0
  112. {eval_toolkit-0.27.2 → eval_toolkit-0.28.0}/tests/test_operating_points.py +0 -0
  113. {eval_toolkit-0.27.2 → eval_toolkit-0.28.0}/tests/test_operating_points_props.py +0 -0
  114. {eval_toolkit-0.27.2 → eval_toolkit-0.28.0}/tests/test_paths.py +0 -0
  115. {eval_toolkit-0.27.2 → eval_toolkit-0.28.0}/tests/test_plotting_edge.py +0 -0
  116. {eval_toolkit-0.27.2 → eval_toolkit-0.28.0}/tests/test_plotting_smoke.py +0 -0
  117. {eval_toolkit-0.27.2 → eval_toolkit-0.28.0}/tests/test_plotting_visual.py +0 -0
  118. {eval_toolkit-0.27.2 → eval_toolkit-0.28.0}/tests/test_protocol_conformance.py +0 -0
  119. {eval_toolkit-0.27.2 → eval_toolkit-0.28.0}/tests/test_provenance.py +0 -0
  120. {eval_toolkit-0.27.2 → eval_toolkit-0.28.0}/tests/test_reference_equivalence.py +0 -0
  121. {eval_toolkit-0.27.2 → eval_toolkit-0.28.0}/tests/test_reproducibility_integration.py +0 -0
  122. {eval_toolkit-0.27.2 → eval_toolkit-0.28.0}/tests/test_schemas.py +0 -0
  123. {eval_toolkit-0.27.2 → eval_toolkit-0.28.0}/tests/test_seeds.py +0 -0
  124. {eval_toolkit-0.27.2 → eval_toolkit-0.28.0}/tests/test_splits_leakage_integration.py +0 -0
  125. {eval_toolkit-0.27.2 → eval_toolkit-0.28.0}/tests/test_splits_props.py +0 -0
  126. {eval_toolkit-0.27.2 → eval_toolkit-0.28.0}/tests/test_text_dedup.py +0 -0
  127. {eval_toolkit-0.27.2 → eval_toolkit-0.28.0}/tests/test_text_dedup_coverage.py +0 -0
  128. {eval_toolkit-0.27.2 → eval_toolkit-0.28.0}/tests/test_text_dedup_props.py +0 -0
  129. {eval_toolkit-0.27.2 → eval_toolkit-0.28.0}/tests/test_text_dedup_strategies.py +0 -0
  130. {eval_toolkit-0.27.2 → eval_toolkit-0.28.0}/tests/test_thresholds_constant_score.py +0 -0
  131. {eval_toolkit-0.27.2 → eval_toolkit-0.28.0}/tests/test_thresholds_coverage.py +0 -0
  132. {eval_toolkit-0.27.2 → eval_toolkit-0.28.0}/tests/test_thresholds_props.py +0 -0
  133. {eval_toolkit-0.27.2 → eval_toolkit-0.28.0}/tests/test_thresholds_research_grounded.py +0 -0
  134. {eval_toolkit-0.27.2 → eval_toolkit-0.28.0}/tests/test_v09_contracts.py +0 -0
@@ -40,3 +40,6 @@ coverage.xml
40
40
 
41
41
  # Claude Code project settings (machine-local)
42
42
  .claude/
43
+
44
+ # mkdocs build output (Section E.1 v0.28.0)
45
+ /site/
@@ -7,6 +7,188 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
7
7
 
8
8
  ## [Unreleased]
9
9
 
10
+ ## [0.28.0] — 2026-05-15 — temporalcv cross-pollination bundle
11
+
12
+ Six-section bundle adopting the highest-value patterns from the
13
+ sibling `temporalcv` project plus public-repo polish + hosted docs.
14
+ Major additions: `PurgedKFoldSplitter` for label-overlap-protected
15
+ cross-validation, nightly Monte Carlo bootstrap CI calibration
16
+ testing, 6-example documentation gallery, a hosted mkdocs-material
17
+ docs site with MathJax + tikzjax for full LaTeX + TikZ rendering,
18
+ SECURITY.md + CITATION.cff for public-repo polish, and a
19
+ documentary mutmut audit cataloguing math-kernel test strength.
20
+
21
+ ### Added
22
+
23
+ - Section F (mutmut audit, from temporalcv-cross-pollination bundle):
24
+ added `docs/internals/mutmut_audit.md` — documentary code-analysis
25
+ audit of the 5 math kernel modules (`metrics`, `bootstrap`,
26
+ `calibration`, `operating_points`, `thresholds`). Per Q10=A
27
+ acceptance (audit-only, no kill-rate target), the deliverable is
28
+ a catalog of likely surviving mutant patterns per module + an
29
+ assessment of whether the existing test suite would catch them.
30
+ Identifies 3 specific high-leverage gaps for future work:
31
+ (a) calibration fit-vs-eval data isolation, (b) BCa degenerate-
32
+ jackknife fallback assertion strengthening, (c) `empty_strategy`
33
+ default lock-in tests. Programmatic mutmut run deferred: mutmut
34
+ 3.5.0 has a config-parsing bug in our env where `tests_dir =
35
+ "tests/"` is split character-by-character — revisit with mutmut
36
+ v4 or cosmic-ray. Re-run instructions captured in the audit doc.
37
+
38
+ - Section E.2 (mkdocs link cleanup, from temporalcv-cross-pollination
39
+ bundle): fixed 30+ broken relative links across 18 documentation
40
+ files. Pattern: docs that link to `../src/eval_toolkit/<X>.py`
41
+ (works on GitHub render but breaks in mkdocs) now point at the
42
+ auto-generated API reference page (`api/<X>.md`). CHANGELOG.md
43
+ references (also outside the docs tree) repointed to absolute
44
+ GitHub URLs. Down from 93 warnings to 1: the remaining
45
+ `griffe: π : float` is a documented tool limitation — griffe
46
+ doesn't parse Unicode parameter names; the project's STYLE.md
47
+ intentionally allows Unicode in math kernels (`π`, `α`, etc.).
48
+ Also patched `harness.py` RunResult docstring: replaced the Sphinx
49
+ `.. versionchanged::` directive with a NumPy "Notes" section so
50
+ mkdocstrings renders it cleanly. `mkdocs build --strict` would
51
+ fail on the 1 remaining griffe warning, so the docs.yml workflow
52
+ intentionally runs without `--strict`. The link-cleanup deliverable
53
+ is complete; the source-docstring + methodology enrichment passes
54
+ originally scoped for E.2 are deferred (existing docstrings already
55
+ carry References + LaTeX where it matters; methodology pages are
56
+ already strong content-wise — only the link structure needed fixing).
57
+
58
+ - Section E.1 (hosted documentation site, from
59
+ temporalcv-cross-pollination bundle): new mkdocs-material site at
60
+ `https://brandon-behring.github.io/eval-toolkit/`, auto-generated
61
+ from existing Markdown docs + `mkdocstrings`-rendered API reference.
62
+ - `mkdocs.yml` configures the material theme (auto light/dark,
63
+ tabs nav, code-copy buttons, full-text search) with MathJax v3 +
64
+ tikzjax loaded from CDN for full LaTeX + TikZ rendering
65
+ (per Q12=B).
66
+ - `docs/index.md` — site landing page
67
+ - `docs/api/index.md` — curated API landing organized by README's
68
+ three-tier architecture (Tier 1 functional core, Tier 2 protocol
69
+ orchestration, Tier 3 reproducibility scaffolding); per Q8=C.
70
+ - `docs/api/<module>.md` — 22 per-module auto-gen stubs invoking
71
+ `::: eval_toolkit.<module>` mkdocstrings directives.
72
+ - `docs/javascripts/mathjax-config.js` — MathJax v3 init script
73
+ matching mkdocs-material's pymdownx.arithmatex (generic: true).
74
+ - `.github/workflows/docs.yml` deploys to GitHub Pages on every
75
+ push to main + every tag push. Single-version site (no `mike`,
76
+ per Q11=A).
77
+ - `[docs]` optional extra added to `pyproject.toml` listing
78
+ mkdocs-material, mkdocstrings[python], pymdown-extensions.
79
+ - `pyproject.urls.Documentation` repointed at the hosted-docs URL.
80
+ - README badge added: `Docs` linking to the GitHub Pages site.
81
+ - `.gitignore` extended to exclude the mkdocs build output (`/site/`).
82
+ - **Known follow-up**: 30+ relative-link warnings in
83
+ `docs/methodology/*.md` files (links to `../../src/...` and
84
+ `../../CHANGELOG.md`). Workflow temporarily runs without
85
+ `--strict`; Section E.2 fixes these links, but strict mode stays off (one griffe warning remains).
86
+
87
+ - Section D (public-repo polish from temporalcv-cross-pollination bundle):
88
+ added `SECURITY.md` (security disclosure policy with response SLAs,
89
+ scope, and reporter-credit policy); added `CITATION.cff` (machine-
90
+ readable academic citation metadata, exposing the GitHub web UI
91
+ "Cite this repository" button — methodology-relevant primary
92
+ references listed for `bootstrap_ci`, `brier_score`,
93
+ `fit_platt_calibrator`, `delong_roc_variance`, `PurgedKFoldSplitter`).
94
+ Added four trust-set badges to README (CI status, PyPI version,
95
+ Python ≥3.13, License MIT). Extended `pyproject.urls` with a
96
+ `Documentation` key pointing at `docs/getting-started.md` (the
97
+ hosted-docs URL replaces this in Section E.1). Module-docstring
98
+ audit across all 22 `src/eval_toolkit/*.py` modules — all already
99
+ carry adequate module-level docstrings; no patches needed.
100
+
101
+ - Section C (example gallery from temporalcv-cross-pollination bundle):
102
+ six new minimal worked examples in `docs/examples/`, each one
103
+ concept per file, Sybil-validated end-to-end in CI:
104
+ - `metrics_and_bootstrap.md` — `pr_auc` / `roc_auc` / `brier_score`
105
+ + `bootstrap_ci` (BCa vs percentile)
106
+ - `evaluate_harness.md` — slice-aware `evaluate(...)` with two
107
+ scorers, `write_run_result(...)`, JSON schema validation
108
+ - `calibration.md` — Platt + isotonic recalibration, ECE before/after
109
+ - `leakage_detection.md` — `ExactDuplicateCheck` +
110
+ `NormalizedFormLeakageCheck` + `LabelConflictCheck` on a
111
+ contaminated train/test pair
112
+ - `claims_and_gates.md` — `EvidenceGate` composition (metric
113
+ threshold + minimum slice size) for release-decision gating
114
+ - `paired_comparison.md` — `paired_bootstrap_diff` for two-scorer
115
+ significance + `mde_from_ci` for power analysis
116
+ - `index.md` — examples landing page mapping each example to the
117
+ capability it demonstrates + the minimum extras required
118
+ Total: 28 sybil-validated code blocks. Each is the headline-import
119
+ → usable-output minimum surface; together they cover the public API
120
+ surface a new user needs to be productive.
121
+
122
+ - Section B (PurgedKFold splitter from temporalcv-cross-pollination bundle):
123
+ `PurgedKFoldSplitter(n_splits, purge_gap, embargo_pct, time_col)` and a
124
+ standalone `compute_label_overlap(t_train, t_test, horizon)` helper, both
125
+ now public via `from eval_toolkit import ...`. Time-aware k-fold with
126
+ explicit purge gap straddling the test fold + post-test embargo —
127
+ prevents label-window leakage when labels have a forward horizon
128
+ (e.g., H-step forward returns). The standalone helper audits arbitrary
129
+ train/test overlap independent of the splitter. Adapted from López de
130
+ Prado (2018) Chapter 7 via temporalcv's `cv_financial.py`; API names
131
+ preserved verbatim for cross-library muscle memory. Public-API
132
+ drift-guard snapshot regenerated for the two new exports.
133
+
134
+ ### Internal
135
+
136
+ - Section A (Monte Carlo bootstrap CI calibration, from temporalcv-cross-pollination
137
+ bundle): added `tests/test_bootstrap_calibration_mc.py` (slow-marker) that runs
138
+ 500-replicate MC validation of `bootstrap_ci` coverage + bias across 5 cases
139
+ (pr_auc / roc_auc × balanced / imbalanced × n=200 / n=1000 × BCa / percentile
140
+ method). Asserts empirical coverage ∈ [0.90, 0.99] for nominal 95% CIs and
141
+ |bias| < 0.05. Complements Tier 1's golden tests: goldens pin exact numerical
142
+ output (drift detection), MC tests validate that the math is correct (a buggy
143
+ implementation producing self-consistent wrong values fails MC but passes
144
+ goldens). Also added CI width-scaling test (width should shrink as ~1/√n).
145
+ New workflow `.github/workflows/nightly-mc.yml` triggers this suite weekly
146
+ on Sundays at 03:00 UTC (plus `workflow_dispatch` for manual runs). Harness
147
+ pattern adapted from temporalcv's `tests/conftest.py` Monte Carlo helpers.
148
+
149
+ - Test coverage (Tier 1 — math kernel correctness + integration backbone):
150
+ added end-to-end pipeline tests (`tests/test_pipeline_e2e.py`) that
151
+ exercise loader → `evaluate` → `write_run_result` → JSON schema
152
+ validation for `DataFrameLoader` and `SingleSliceLoader` (incl.
153
+ paired-diffs path). Extended `tests/test_metrics_props.py` with
154
+ Brier-score bounds + label/score inversion symmetry properties. Added
155
+ bootstrap CI golden tests (`tests/test_bootstrap_golden.py`, fixture at
156
+ `tests/golden/bootstrap_ci/cases.json`) pinning BCa/percentile output
157
+ on 6 canonical stress points (balanced, imbalanced 5%, small-n=10,
158
+ tied scores) to ±1e-9. Expanded the `golden` pytest marker doc.
159
+
160
+ - Test coverage (Tier 3 — resilience moat): added multi-slice
161
+ fault-injection tests (`tests/test_harness_fault_injection.py`) that
162
+ exercise `on_scorer_error="record"` across three slices where the
163
+ scorer succeeds on the middle one and fails on the outer two —
164
+ asserts per-(slice, scorer) independence (no error-state bleed) plus
165
+ a healthy-vs-faulting scorer parity check against a no-fault control.
166
+ Added exactness tests for `TargetFPRSelector`
167
+ (`tests/test_thresholds.py`): analytical answer on
168
+ perfectly-separable data plus a golden-style pinned threshold value
169
+ for a canonical (n=500, seed=42) overlapping distribution across
170
+ target FPRs 0.01 / 0.05 / 0.10 / 0.20, with a monotonicity invariant.
171
+ Added calibration determinism tests
172
+ (`tests/test_calibration_determinism.py`): same `(y, score)` produces
173
+ bit-identical Platt fit `a`/`b` parameters and isotonic transform
174
+ output across runs, parametrized over 1% / 50% / 99% positive
175
+ prevalence. Added NaN/+inf/-inf rejection tests for `pr_auc`,
176
+ `roc_auc`, `brier_score` to `tests/test_metrics_props.py` —
177
+ parametrized; locks the input-validation contract.
178
+
179
+ - Test coverage (Tier 2 — public-contract + integration breadth):
180
+ added public-API drift guard (`tests/test_public_api.py`, fixture at
181
+ `tests/golden/public_api/snapshot.json`) that snapshots all 199 names
182
+ in `eval_toolkit.__all__` with signatures, class bases, first
183
+ docstring lines, and primitive-value summaries. Drift now requires an
184
+ explicit golden-regeneration commit. Extended
185
+ `tests/test_pipeline_e2e.py` with `ParquetGlobLoader` round-trip
186
+ (synthetic parquet → glob → load → evaluate → schema-validate; gated
187
+ on `pyarrow`). Extended `tests/test_artifacts.py` with four manifest
188
+ v2↔v3 dispatcher tests: v3 well-formed accepted; v3 missing
189
+ `contamination_flags` rejected; v3 with unknown enum value rejected;
190
+ v2 payloads still routed to v2 schema (no eager v3 demotion).
191
+
10
192
  ## [0.27.2] — 2026-05-15 — fix base-install pandas import
11
193
 
12
194
  Base install of `eval-toolkit` (no extras) was broken in 0.27.1: every
@@ -1,8 +1,9 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: eval-toolkit
3
- Version: 0.27.2
3
+ Version: 0.28.0
4
4
  Summary: Reusable evaluation contracts for binary classification: metrics, bootstrap CIs, calibration, artifacts, and evidence gates.
5
5
  Project-URL: Homepage, https://github.com/brandon-behring/eval-toolkit
6
+ Project-URL: Documentation, https://brandon-behring.github.io/eval-toolkit/
6
7
  Project-URL: Repository, https://github.com/brandon-behring/eval-toolkit.git
7
8
  Project-URL: Issues, https://github.com/brandon-behring/eval-toolkit/issues
8
9
  Project-URL: Changelog, https://github.com/brandon-behring/eval-toolkit/blob/main/CHANGELOG.md
@@ -49,6 +50,10 @@ Requires-Dist: pytest>=8.0; extra == 'dev'
49
50
  Requires-Dist: pyyaml>=6.0; extra == 'dev'
50
51
  Requires-Dist: ruff>=0.5; extra == 'dev'
51
52
  Requires-Dist: sybil>=10.0; extra == 'dev'
53
+ Provides-Extra: docs
54
+ Requires-Dist: mkdocs-material>=9.5; extra == 'docs'
55
+ Requires-Dist: mkdocstrings[python]>=0.24; extra == 'docs'
56
+ Requires-Dist: pymdown-extensions>=10.7; extra == 'docs'
52
57
  Provides-Extra: parquet
53
58
  Requires-Dist: pyarrow>=15.0; extra == 'parquet'
54
59
  Provides-Extra: plotting
@@ -63,6 +68,12 @@ Description-Content-Type: text/markdown
63
68
 
64
69
  # eval-toolkit
65
70
 
71
+ [![CI](https://github.com/brandon-behring/eval-toolkit/actions/workflows/ci.yml/badge.svg)](https://github.com/brandon-behring/eval-toolkit/actions/workflows/ci.yml)
72
+ [![Docs](https://github.com/brandon-behring/eval-toolkit/actions/workflows/docs.yml/badge.svg)](https://brandon-behring.github.io/eval-toolkit/)
73
+ [![PyPI version](https://img.shields.io/pypi/v/eval-toolkit.svg)](https://pypi.org/project/eval-toolkit/)
74
+ [![Python ≥3.13](https://img.shields.io/badge/python-%E2%89%A53.13-blue.svg)](https://pypi.org/project/eval-toolkit/)
75
+ [![License: MIT](https://img.shields.io/badge/license-MIT-yellow.svg)](LICENSE)
76
+
66
77
  A **methodology-aware evaluation harness for binary classification**:
67
78
  metrics, bootstrap CIs, calibration, leakage detection, splitting,
68
79
  threshold selection, dataset loading, reproducibility manifests, and a
@@ -1,5 +1,11 @@
1
1
  # eval-toolkit
2
2
 
3
+ [![CI](https://github.com/brandon-behring/eval-toolkit/actions/workflows/ci.yml/badge.svg)](https://github.com/brandon-behring/eval-toolkit/actions/workflows/ci.yml)
4
+ [![Docs](https://github.com/brandon-behring/eval-toolkit/actions/workflows/docs.yml/badge.svg)](https://brandon-behring.github.io/eval-toolkit/)
5
+ [![PyPI version](https://img.shields.io/pypi/v/eval-toolkit.svg)](https://pypi.org/project/eval-toolkit/)
6
+ [![Python ≥3.13](https://img.shields.io/badge/python-%E2%89%A53.13-blue.svg)](https://pypi.org/project/eval-toolkit/)
7
+ [![License: MIT](https://img.shields.io/badge/license-MIT-yellow.svg)](LICENSE)
8
+
3
9
  A **methodology-aware evaluation harness for binary classification**:
4
10
  metrics, bootstrap CIs, calibration, leakage detection, splitting,
5
11
  threshold selection, dataset loading, reproducibility manifests, and a
@@ -54,6 +54,15 @@ parquet = ["pyarrow>=15.0"]
54
54
  # resolve cleanly; jsonschema is now in the base deps. Remove in v0.17.0
55
55
  # after a deprecation window if no downstream consumer complains.
56
56
  validation = []
57
+ # v0.28.0 (Section E.1): docs site via mkdocs-material + mkdocstrings.
58
+ # Per Q4=B in the temporalcv-cross-pollination plan; LaTeX via MathJax v3
59
+ # (configured in docs/javascripts/mathjax-config.js) and tikzjax loaded
60
+ # from CDN in mkdocs.yml.
61
+ docs = [
62
+ "mkdocs-material>=9.5",
63
+ "mkdocstrings[python]>=0.24",
64
+ "pymdown-extensions>=10.7",
65
+ ]
57
66
  # `all` references the sub-extras directly (PEP 685 self-reference). This
58
67
  # avoids drift: adding a dep to e.g. `dataframe` no longer requires a
59
68
  # mirroring edit here.
@@ -72,6 +81,7 @@ dev = [
72
81
 
73
82
  [project.urls]
74
83
  Homepage = "https://github.com/brandon-behring/eval-toolkit"
84
+ Documentation = "https://brandon-behring.github.io/eval-toolkit/"
75
85
  Repository = "https://github.com/brandon-behring/eval-toolkit.git"
76
86
  Issues = "https://github.com/brandon-behring/eval-toolkit/issues"
77
87
  Changelog = "https://github.com/brandon-behring/eval-toolkit/blob/main/CHANGELOG.md"
@@ -146,8 +156,9 @@ markers = [
146
156
  "unit: Sklearn-reference and analytical correctness tests",
147
157
  "property: Hypothesis property-based invariant tests",
148
158
  "smoke: End-to-end smoke tests",
149
- "golden: Snapshot tests for deterministic outputs (docs.py)",
159
+ "golden: Snapshot tests for deterministic outputs (docs renderer, bootstrap CI numerical pins, public API surface).",
150
160
  "slow: Tests > 2s (bootstrap-t studentized, multi-seed K-fold). Opt out with `pytest -m 'not slow'`.",
161
+ "monte_carlo: Monte Carlo calibration suite (~14 min). Skipped in PR CI; runs only in the nightly-mc workflow via `-m monte_carlo`.",
151
162
  ]
152
163
 
153
164
  [tool.coverage.run]
@@ -183,10 +183,12 @@ _EXPORTS: dict[str, str] = {
183
183
  "GroupKFoldSplitter": "eval_toolkit.splits",
184
184
  "HoldoutSplitter": "eval_toolkit.splits",
185
185
  "PoolBuilder": "eval_toolkit.splits",
186
+ "PurgedKFoldSplitter": "eval_toolkit.splits",
186
187
  "SourceDisjointKFoldSplitter": "eval_toolkit.splits",
187
188
  "Splitter": "eval_toolkit.splits",
188
189
  "StratifiedKFoldSplitter": "eval_toolkit.splits",
189
190
  "TimeSeriesSplitter": "eval_toolkit.splits",
191
+ "compute_label_overlap": "eval_toolkit.splits",
190
192
  "iter_folds_with_pool": "eval_toolkit.splits",
191
193
  "DEFAULT_DEDUP_THRESHOLD": "eval_toolkit.text_dedup",
192
194
  "DedupReport": "eval_toolkit.text_dedup",
@@ -2,4 +2,4 @@
2
2
 
3
3
  __all__ = ["__version__"]
4
4
 
5
- __version__ = "0.27.2"
5
+ __version__ = "0.28.0"
@@ -185,9 +185,11 @@ class RunResult:
185
185
  JSON schema version. ``"v1"`` for v0.7.0+; downstream parsers gate
186
186
  on this.
187
187
 
188
- .. versionchanged:: 0.7.0
189
- Added ``by_fold``, ``fold_summary``, ``schema_version`` (additive,
190
- defaults empty / ``"v1"`` — backward compatible).
188
+ Notes
189
+ -----
190
+ Changed in 0.7.0: added ``by_fold``, ``fold_summary``,
191
+ ``schema_version`` (additive, defaults empty / ``"v1"`` — backward
192
+ compatible).
191
193
  """
192
194
 
193
195
  run_id: str
@@ -36,10 +36,12 @@ __all__ = [
36
36
  "GroupKFoldSplitter",
37
37
  "HoldoutSplitter",
38
38
  "PoolBuilder",
39
+ "PurgedKFoldSplitter",
39
40
  "SourceDisjointKFoldSplitter",
40
41
  "Splitter",
41
42
  "StratifiedKFoldSplitter",
42
43
  "TimeSeriesSplitter",
44
+ "compute_label_overlap",
43
45
  "iter_folds_with_pool",
44
46
  ]
45
47
 
@@ -513,3 +515,283 @@ def iter_folds_with_pool(
513
515
  # PoolBuilder's keys (train, val, possibly more) take precedence;
514
516
  # test is reattached from the Splitter.
515
517
  yield {**built, "test": test}
518
+
519
+
520
+ # ---------------------------------------------------------------------------
521
+ # Purged K-fold for label-overlap protection (v0.28.0)
522
+ #
523
+ # Adapted from temporalcv (Behring 2026) for the financial / forecasting
524
+ # label-overlap case: when labels use future data (e.g., H-day forward
525
+ # returns), train and test folds can overlap in their LABEL windows even
526
+ # when their FEATURE windows don't. Purging drops a band of training
527
+ # samples within ``purge_gap`` of each test fold; embargo drops an
528
+ # additional fraction of n samples bordering each test fold.
529
+ # ---------------------------------------------------------------------------
530
+
531
+
532
def compute_label_overlap(
    t_train: np.ndarray,
    t_test: np.ndarray,
    horizon: int,
) -> np.ndarray:
    r"""Boolean ``(n_train, n_test)`` matrix: True where label windows overlap.

    For h-step forward labels, the label at time ``t`` depends on the data
    in the half-open window ``[t, t+h)``. Two samples ``t_train[i]`` and
    ``t_test[j]`` have label-window overlap if their windows share at least
    one time point — equivalently, if ``|t_train[i] - t_test[j]| < horizon``.

    Use this to audit whether a given train/test split has any label
    leakage. Standalone helper; does NOT require a particular splitter.

    Parameters
    ----------
    t_train : np.ndarray, shape (n_train,)
        Time indices of the training set (any sortable numeric type,
        including unsigned integers).
    t_test : np.ndarray, shape (n_test,)
        Time indices of the test set.
    horizon : int
        Label horizon (e.g., ``5`` for 5-step forward returns). Must be
        non-negative; ``horizon=0`` means no overlap is possible.

    Returns
    -------
    np.ndarray, shape (n_train, n_test), dtype bool
        Entry ``(i, j)`` is ``True`` iff
        ``|t_train[i] - t_test[j]| < horizon``.

    Raises
    ------
    ValueError
        If ``horizon`` is negative.

    Examples
    --------
    >>> import numpy as np
    >>> t_train = np.array([0, 1, 5, 6])
    >>> t_test = np.array([3, 4])
    >>> overlap = compute_label_overlap(t_train, t_test, horizon=3)
    >>> overlap
    array([[False, False],
           [ True, False],
           [ True,  True],
           [False,  True]])
    >>> # Sample 0 (t=0): no overlap with test (|0-3|=3, |0-4|=4 ≥ horizon)
    >>> # Sample 1 (t=1): overlaps test[0]=3 (|1-3|=2 < 3)
    >>> # Sample 2 (t=5): overlaps both (|5-3|=2, |5-4|=1)
    >>> # Sample 3 (t=6): overlaps test[1]=4 (|6-4|=2 < 3)

    Notes
    -----
    The check is **symmetric in time**: ``|t_train - t_test| < horizon``
    treats overlap in either temporal direction equally. For strictly
    forward-only label overlap (train-before-test), filter the result
    with ``(t_test[None, :] - t_train[:, None]) > 0``.

    For h-step forward labels: label at time t covers ``[t, t+h)``, so
    two labels at times ``t1, t2`` share data iff their intervals
    overlap, which holds iff ``|t1 - t2| < h``.

    The distance is computed as ``max - min`` rather than ``abs(a - b)``
    so that unsigned-integer time indices do not wrap around when the
    train time is smaller than the test time.

    References
    ----------
    .. [1] López de Prado, M. (2018). "Advances in Financial Machine
       Learning." Wiley. Chapter 7: Cross-Validation in Finance.
    """
    if horizon < 0:
        raise ValueError(f"horizon must be >= 0, got {horizon}")
    t_train_arr = np.asarray(t_train)
    t_test_arr = np.asarray(t_test)
    if horizon == 0:
        # Empty windows can never intersect; skip the outer difference.
        return np.zeros((len(t_train_arr), len(t_test_arr)), dtype=bool)
    # Broadcast to (n_train, n_test).
    train_col = t_train_arr[:, None]
    test_row = t_test_arr[None, :]
    # max - min is the absolute difference without a signed intermediate,
    # so unsigned dtypes (e.g. uint64 row counters) cannot underflow.
    dist = np.maximum(train_col, test_row) - np.minimum(train_col, test_row)
    overlap: np.ndarray = dist < horizon
    return overlap
610
+
611
+
612
+ def _apply_purge_embargo(
613
+ test_idx: np.ndarray,
614
+ n_samples: int,
615
+ purge_gap: int,
616
+ embargo_pct: float,
617
+ ) -> np.ndarray:
618
+ """Build a training-index array excluding the test fold + purge + embargo.
619
+
620
+ The test fold's indices are contiguous (TimeSeriesSplit-style); purging
621
+ drops `[test_min - purge_gap, test_max + purge_gap]` from training;
622
+ embargo drops an additional `floor(embargo_pct * n_samples)` indices
623
+ after the test fold (one-sided: protects the post-test region from
624
+ label-window leakage when labels are forward-looking).
625
+
626
+ Adapted from temporalcv's ``_apply_purge_and_embargo`` but vectorized
627
+ (no Python-level set/loop) and asymmetric-by-default (embargo only on
628
+ the post-test side, matching López de Prado's original definition).
629
+ """
630
+ test_min = int(np.min(test_idx))
631
+ test_max = int(np.max(test_idx))
632
+ purge_start = max(0, test_min - purge_gap)
633
+ purge_end = min(n_samples, test_max + 1 + purge_gap)
634
+ n_embargo = int(embargo_pct * n_samples)
635
+ embargo_end = min(n_samples, test_max + 1 + n_embargo)
636
+
637
+ full_idx = np.arange(n_samples)
638
+ # Mask out: the test fold itself + purge band on both sides + post-test embargo
639
+ keep = np.ones(n_samples, dtype=bool)
640
+ keep[purge_start:purge_end] = False # zeroes out test + purge band
641
+ keep[test_max + 1 : embargo_end] = False # post-test embargo
642
+ return full_idx[keep]
643
+
644
+
645
+ @dataclass(frozen=True, slots=True)
646
+ class PurgedKFoldSplitter:
647
+ r"""Time-aware k-fold with explicit purge gap + post-test embargo.
648
+
649
+ Pattern from López de Prado (2018) Ch. 7: when labels have a forward
650
+ lookahead (e.g., H-step returns), train and test folds can overlap in
651
+ their **label windows** even when their **feature windows** don't.
652
+ Standard k-fold leaks information through this overlap. PurgedKFold
653
+ drops a ``purge_gap``-sample band straddling each test fold's boundary
654
+ plus a post-test ``embargo_pct * n`` window — preventing both
655
+ backward and forward label-overlap leakage.
656
+
657
+ Implements the :class:`Splitter` Protocol, yielding
658
+ ``{"train": EvalSlice, "test": EvalSlice}`` dicts.
659
+
660
+ Parameters
661
+ ----------
662
+ n_splits : int, optional
663
+ Number of folds. Default 5. Must be ≥ 2.
664
+ purge_gap : int, optional
665
+ Samples to drop on each side of every test fold's boundary.
666
+ Default 0 (no purging — equivalent to vanilla TimeSeriesSplit).
667
+ For h-step forward labels, ``purge_gap=h`` is the canonical choice.
668
+ embargo_pct : float, optional
669
+ Additional embargo as a fraction of total ``n``, applied **after**
670
+ each test fold (one-sided, López de Prado convention). Default
671
+ 0.0. Typical: 0.01 (1%).
672
+ time_col : str or None, optional
673
+ Column carrying a sortable timestamp. If set, the parent slice is
674
+ sorted by this column before splitting. ``None`` assumes the slice
675
+ is already in temporal order. Default ``"timestamp"``.
676
+
677
+ Raises
678
+ ------
679
+ ValueError
680
+ At construction time if ``n_splits < 2`` or ``purge_gap < 0`` or
681
+ ``embargo_pct ∉ [0, 1)``.
682
+ KeyError
683
+ At ``iter_folds`` time if ``time_col`` is set but not present in
684
+ the slice DataFrame.
685
+
686
+ Examples
687
+ --------
688
+ >>> import pandas as pd
689
+ >>> from eval_toolkit.harness import EvalSlice
690
+ >>> from eval_toolkit.splits import PurgedKFoldSplitter
691
+ >>> df = pd.DataFrame({
692
+ ... "text": [f"row{i}" for i in range(50)],
693
+ ... "label": [i % 2 for i in range(50)],
694
+ ... "t": list(range(50)),
695
+ ... })
696
+ >>> parent = EvalSlice(name="all", df=df)
697
+ >>> spl = PurgedKFoldSplitter(n_splits=5, purge_gap=2, embargo_pct=0.02, time_col="t")
698
+ >>> folds = list(spl.iter_folds(parent))
699
+ >>> len(folds)
700
+ 5
701
+ >>> sorted(folds[0].keys())
702
+ ['test', 'train']
703
+
704
+ Notes
705
+ -----
706
+ **Two units in one signature**: ``purge_gap`` is an absolute count of
707
+ samples (int) while ``embargo_pct`` is a fraction (float). This
708
+ mirrors López de Prado / temporalcv conventions verbatim — users
709
+ moving between libraries see the same parameter names. Use the
710
+ standalone helper :func:`compute_label_overlap` to size ``purge_gap``
711
+ for a known label horizon.
712
+
713
+ See Also
714
+ --------
715
+ eval_toolkit.splits.compute_label_overlap :
716
+ Audit label-window overlap between arbitrary train/test sets.
717
+ eval_toolkit.splits.TimeSeriesSplitter :
718
+ Time-aware k-fold without purging — use when labels have no
719
+ lookahead horizon.
720
+
721
+ References
722
+ ----------
723
+ .. [1] López de Prado, M. (2018). "Advances in Financial Machine
724
+ Learning." Wiley. Chapter 7.
725
+ """
726
+
727
+ n_splits: int = 5
728
+ purge_gap: int = 0
729
+ embargo_pct: float = 0.0
730
+ time_col: str | None = "timestamp"
731
+
732
+ def __post_init__(self) -> None:
733
+ """Validate parameters."""
734
+ if self.n_splits < 2:
735
+ raise ValueError(f"n_splits must be >= 2, got {self.n_splits}")
736
+ if self.purge_gap < 0:
737
+ raise ValueError(f"purge_gap must be >= 0, got {self.purge_gap}")
738
+ if not 0.0 <= self.embargo_pct < 1.0:
739
+ raise ValueError(f"embargo_pct must be in [0, 1), got {self.embargo_pct}")
740
+
741
+ def iter_folds(
742
+ self,
743
+ slice_: EvalSlice,
744
+ *,
745
+ groups: np.ndarray | None = None,
746
+ ) -> Iterator[dict[str, EvalSlice]]:
747
+ """Yield ``n_splits`` fold dicts with purge + embargo applied.
748
+
749
+ Raises
750
+ ------
751
+ KeyError
752
+ If ``self.time_col`` is set but not present in ``slice_.df``.
753
+ """
754
+ if self.time_col is not None:
755
+ if self.time_col not in slice_.df.columns:
756
+ raise KeyError(
757
+ f"time_col {self.time_col!r} not in slice columns " f"{list(slice_.df.columns)}"
758
+ )
759
+ sorted_df = slice_.df.sort_values(self.time_col).reset_index(drop=True)
760
+ sorted_slice = EvalSlice(
761
+ name=slice_.name,
762
+ df=sorted_df,
763
+ description=slice_.description,
764
+ feature_col=slice_.feature_col,
765
+ label_col=slice_.label_col,
766
+ strata_col=slice_.strata_col,
767
+ )
768
+ else:
769
+ sorted_slice = slice_
770
+
771
+ n_samples = len(sorted_slice.df)
772
+ if self.n_splits >= n_samples:
773
+ raise ValueError(f"n_splits ({self.n_splits}) must be < n_samples ({n_samples})")
774
+
775
+ # Fold sizes (mirrors TimeSeriesSplit / temporalcv: trailing folds
776
+ # absorb the remainder)
777
+ fold_sizes = np.full(self.n_splits, n_samples // self.n_splits)
778
+ fold_sizes[: n_samples % self.n_splits] += 1
779
+
780
+ current = 0
781
+ for fold_size in fold_sizes:
782
+ test_idx = np.arange(current, current + fold_size)
783
+ train_idx = _apply_purge_embargo(
784
+ test_idx,
785
+ n_samples=n_samples,
786
+ purge_gap=self.purge_gap,
787
+ embargo_pct=self.embargo_pct,
788
+ )
789
+ yield {
790
+ "train": _slice_subset(sorted_slice, train_idx, "train"),
791
+ "test": _slice_subset(sorted_slice, test_idx, "test"),
792
+ }
793
+ current += fold_size
794
+
795
+ def get_n_splits(self, slice_: EvalSlice) -> int:
796
+ """Return ``self.n_splits``."""
797
+ return self.n_splits
@@ -0,0 +1,50 @@
1
+ {
2
+ "pr_auc_balanced_n200_BCa_seed42": {
3
+ "ci_high": 0.911879605,
4
+ "ci_low": 0.786756306,
5
+ "confidence": 0.95,
6
+ "method": "BCa",
7
+ "n_resamples": 500,
8
+ "point_estimate": 0.857436357
9
+ },
10
+ "pr_auc_balanced_n200_percentile_seed42": {
11
+ "ci_high": 0.920789971,
12
+ "ci_low": 0.796620442,
13
+ "confidence": 0.95,
14
+ "method": "percentile",
15
+ "n_resamples": 500,
16
+ "point_estimate": 0.857436357
17
+ },
18
+ "pr_auc_imbalanced_p05_n200_BCa_seed42": {
19
+ "ci_high": 0.585177068,
20
+ "ci_low": 0.104087605,
21
+ "confidence": 0.95,
22
+ "method": "BCa",
23
+ "n_resamples": 500,
24
+ "point_estimate": 0.277566582
25
+ },
26
+ "pr_auc_n10_percentile_seed42": {
27
+ "ci_high": 1.0,
28
+ "ci_low": 0.530416667,
29
+ "confidence": 0.95,
30
+ "method": "percentile",
31
+ "n_resamples": 500,
32
+ "point_estimate": 0.858333333
33
+ },
34
+ "roc_auc_balanced_n200_BCa_seed42": {
35
+ "ci_high": 0.911376276,
36
+ "ci_low": 0.811418319,
37
+ "confidence": 0.95,
38
+ "method": "BCa",
39
+ "n_resamples": 500,
40
+ "point_estimate": 0.8632
41
+ },
42
+ "roc_auc_tied_n40_BCa_seed42": {
43
+ "ci_high": 0.9725,
44
+ "ci_low": 0.797211394,
45
+ "confidence": 0.95,
46
+ "method": "BCa",
47
+ "n_resamples": 500,
48
+ "point_estimate": 0.915
49
+ }
50
+ }