eval-toolkit 0.46.0__tar.gz → 0.47.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/.gitignore +10 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/CHANGELOG.md +199 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/PKG-INFO +6 -3
- {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/README.md +5 -2
- {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/docs/source/adr/README.md +3 -1
- {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/src/eval_toolkit/__init__.py +32 -76
- {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/src/eval_toolkit/_scorecard.py +49 -5
- eval_toolkit-0.47.0/src/eval_toolkit/_sweep.py +184 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/src/eval_toolkit/_version.py +1 -1
- {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/src/eval_toolkit/adversarial.py +293 -173
- {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/src/eval_toolkit/metric_specs.py +92 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/src/eval_toolkit/preprocessing.py +75 -99
- {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/src/eval_toolkit/protocols.py +35 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/benchmarks/test_kernel_benchmarks.py +3 -1
- {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/golden/public_api/snapshot.json +178 -22
- eval_toolkit-0.47.0/tests/test_adversarial.py +420 -0
- eval_toolkit-0.47.0/tests/test_deprecated_scalars_shim.py +211 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/test_logging.py +2 -1
- {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/test_preprocessing.py +91 -52
- {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/test_public_api.py +125 -1
- {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/test_scorecard.py +316 -14
- {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/test_stacking.py +1 -1
- eval_toolkit-0.47.0/tests/test_sweep.py +180 -0
- eval_toolkit-0.46.0/tests/test_adversarial.py +0 -351
- eval_toolkit-0.46.0/tests/test_deprecated_scalars_shim.py +0 -184
- {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/LICENSE +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/STYLE.md +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/docs/archive/README.md +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/docs/research/README.md +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/docs/research/datasets/README.md +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/docs/research/papers/data-integrity/README.md +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/docs/research/papers/eval-ecosystem/README.md +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/docs/research/papers/inference/README.md +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/docs/research/papers/prompt-injection/README.md +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/docs/source/methodology/README.md +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/pyproject.toml +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/src/eval_toolkit/__main__.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/src/eval_toolkit/_deprecated.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/src/eval_toolkit/_parallel.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/src/eval_toolkit/analysis.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/src/eval_toolkit/artifacts.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/src/eval_toolkit/bootstrap.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/src/eval_toolkit/calibration.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/src/eval_toolkit/claims.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/src/eval_toolkit/config.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/src/eval_toolkit/docs.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/src/eval_toolkit/embeddings.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/src/eval_toolkit/evidence.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/src/eval_toolkit/harness.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/src/eval_toolkit/leakage.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/src/eval_toolkit/loaders.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/src/eval_toolkit/losses.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/src/eval_toolkit/manifest.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/src/eval_toolkit/metrics.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/src/eval_toolkit/operating_points.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/src/eval_toolkit/paths.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/src/eval_toolkit/plotting.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/src/eval_toolkit/probes.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/src/eval_toolkit/provenance.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/src/eval_toolkit/py.typed +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/src/eval_toolkit/schemas/manifest.v1.json +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/src/eval_toolkit/schemas/manifest.v2.json +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/src/eval_toolkit/schemas/manifest.v3.json +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/src/eval_toolkit/schemas/ood_manifest.v1.json +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/src/eval_toolkit/schemas/results.v1.json +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/src/eval_toolkit/schemas/results_full.v1.json +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/src/eval_toolkit/seeds.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/src/eval_toolkit/splits.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/src/eval_toolkit/stacking.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/src/eval_toolkit/text_dedup.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/src/eval_toolkit/thresholds.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/baseline/test_plotting_visual/plot_bootstrap_distribution.png +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/baseline/test_plotting_visual/plot_confusion_matrix_grid.png +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/baseline/test_plotting_visual/plot_lift_ci.png +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/baseline/test_plotting_visual/plot_metric_bars.png +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/baseline/test_plotting_visual/plot_pareto_frontier.png +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/baseline/test_plotting_visual/plot_pr_curve.png +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/baseline/test_plotting_visual/plot_reliability_diagram.png +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/baseline/test_plotting_visual/plot_roc_curve.png +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/baseline/test_plotting_visual/plot_score_histograms.png +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/baseline/test_plotting_visual/plot_slice_metric_heatmap.png +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/benchmarks/__init__.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/conftest.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/golden/bootstrap_ci/cases.json +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/golden/data/dedup_holdout.jsonl +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/golden/data/dedup_holdout_expected.json +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/golden/data/dedup_holdout_provenance.md +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/golden/docs/expected.md +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/golden/docs/input.md +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/golden/docs/metrics.json +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/golden/test_dedup_holdout_calibration.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/strategies.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/test_analysis.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/test_artifacts.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/test_block_bootstrap_on_folds.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/test_bootstrap_calibration_mc.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/test_bootstrap_edge_cases.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/test_bootstrap_golden.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/test_bootstrap_njobs.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/test_bootstrap_props.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/test_bootstrap_research_grounded.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/test_bootstrap_unit.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/test_calibration_binary_adapters.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/test_calibration_bootstrap_chain.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/test_calibration_determinism.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/test_calibration_optimization_failures.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/test_calibration_props.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/test_calibration_research_grounded.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/test_calibration_unit.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/test_claims.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/test_claims_coverage.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/test_claims_props.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/test_cli.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/test_config.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/test_coverage_bootstrap.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/test_coverage_calibration.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/test_coverage_harness.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/test_coverage_metrics.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/test_coverage_plotting.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/test_croissant_e2e.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/test_dedup_split_leakage_chain.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/test_deprecations.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/test_docs_golden.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/test_docs_props.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/test_embeddings.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/test_evidence_validators.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/test_harness_edge_cases.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/test_harness_fault_injection.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/test_harness_folded.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/test_harness_internals.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/test_harness_metric_options.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/test_harness_parallelism.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/test_harness_smoke.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/test_import_boundaries.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/test_is_metric_defined_for_slice.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/test_leakage.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/test_leakage_error_paths.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/test_leakage_props.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/test_loaders.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/test_loaders_coverage.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/test_loaders_props.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/test_losses.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/test_manifest.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/test_manifest_contamination_round_trip.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/test_manifest_props.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/test_manifest_validation.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/test_metrics_props.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/test_metrics_stratified_subsets.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/test_metrics_unit.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/test_misc_coverage.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/test_numeric_edge_cases.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/test_ood_loader.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/test_operating_points.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/test_operating_points_props.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/test_parallel.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/test_paths.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/test_pipeline_e2e.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/test_plotting_edge.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/test_plotting_smoke.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/test_plotting_visual.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/test_probes.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/test_protocol_conformance.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/test_provenance.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/test_recall_at_fpr.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/test_reference_equivalence.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/test_reproducibility_integration.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/test_schemas.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/test_seeds.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/test_splits.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/test_splits_leakage_integration.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/test_splits_props.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/test_text_dedup.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/test_text_dedup_coverage.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/test_text_dedup_props.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/test_text_dedup_strategies.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/test_thresholds.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/test_thresholds_constant_score.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/test_thresholds_coverage.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/test_thresholds_props.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/test_thresholds_research_grounded.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/test_tokenization_leakage_check.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/test_v09_contracts.py +0 -0
|
@@ -45,6 +45,16 @@ coverage.json
|
|
|
45
45
|
# Mutation-testing output (mutmut / cargo-mutants — local run artifacts)
|
|
46
46
|
mutants/
|
|
47
47
|
|
|
48
|
+
# Local audit artifacts (Round 5+ Gate 3 LLM cross-review packets + reports).
|
|
49
|
+
# The canonical prompt lives at ~/.claude/plans/gate3-audit-prompt.md and the
|
|
50
|
+
# canonical findings ledger lives at docs/source/audit_findings.md; per-run
|
|
51
|
+
# raw model outputs are author-local working copies.
|
|
52
|
+
# Tracked: per-round briefing files (`gate3-audit-round-<N>.md`).
|
|
53
|
+
# Untracked: prompt template, generic report, per-round report files.
|
|
54
|
+
gate3-audit-prompt.md
|
|
55
|
+
gate3-audit-report.md
|
|
56
|
+
gate3-audit-round-*-report.md
|
|
57
|
+
|
|
48
58
|
# Claude Code project settings (machine-local)
|
|
49
59
|
.claude/
|
|
50
60
|
|
|
@@ -5,6 +5,205 @@ All notable changes to this project will be documented in this file.
|
|
|
5
5
|
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
|
|
6
6
|
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
|
7
7
|
|
|
8
|
+
## [0.47.0] — 2026-05-21 — Sweep unification + TextTransform + advanced-6 + cleanup + Round 6 follow-on
|
|
9
|
+
|
|
10
|
+
Second BREAKING minor of the staggered v0.45 → v0.46 → v0.46.1 → v0.47 →
|
|
11
|
+
v0.48 → v1.0 release sequence (plan
|
|
12
|
+
``~/.claude/plans/evaluate-all-the-work-twinkly-kite.md``, Step 3).
|
|
13
|
+
|
|
14
|
+
Closes:
|
|
15
|
+
|
|
16
|
+
- The v0.43 CHANGELOG forward-look re: advanced-6 character-injection
|
|
17
|
+
techniques (Decision Q11→11.3 — "12-technique suite + new sweep API
|
|
18
|
+
in one migration step").
|
|
19
|
+
- Round 6 audit follow-on items per Decision R6-E (R6-A docstring,
|
|
20
|
+
R6-B duplicate name guard, R6-C to_pandas schema, R6-D Protocol
|
|
21
|
+
method-shape drift guard, R6-F5 narrow except, R6-F6 plan/roadmap
|
|
22
|
+
refresh, R6-H make_spec_name helper) — see ``docs/source/audit_findings.md``
|
|
23
|
+
for the per-finding ledger.
|
|
24
|
+
|
|
25
|
+
### Removed (BREAKING)
|
|
26
|
+
|
|
27
|
+
- **Top-level scalar metric names** (``eval_toolkit.pr_auc``,
|
|
28
|
+
``eval_toolkit.roc_auc``, ``eval_toolkit.brier_score``, all 5
|
|
29
|
+
``expected_calibration_*`` variants) — the v0.46 ``__getattr__``
|
|
30
|
+
deprecation shim has been deleted. These names now raise
|
|
31
|
+
``AttributeError`` at the top-level. Migration: use ``scorecard(...)``
|
|
32
|
+
with ``metric_specs`` (primary) OR import from the
|
|
33
|
+
``eval_toolkit.metrics`` submodule (internal API per ADR 0002).
|
|
34
|
+
(Decision L; plan §4D.)
|
|
35
|
+
- **Module-level ``adversarial.sweep`` + ``preprocessing.sweep``** —
|
|
36
|
+
consolidated into the top-level :func:`sweep` (Decision D + plan §4C).
|
|
37
|
+
Parity tests in Sub-PR 4 of this release proved 1:1 output equivalence
|
|
38
|
+
on the neutral subset.
|
|
39
|
+
- **``adversarial.character_injection`` + ``preprocessing.spotlighting``
|
|
40
|
+
``SimpleNamespace`` shortcuts** — removed (Decision N + plan §4E).
|
|
41
|
+
The 12 adversarial dataclasses + the 3 preprocessing variants + the
|
|
42
|
+
underlying functional API are the only public paths.
|
|
43
|
+
- **``adversarial.CharacterInjectionStrategy``** per-module Protocol —
|
|
44
|
+
removed. The top-level :class:`TextTransform` Protocol (Decision K)
|
|
45
|
+
is the single canonical contract; all 12 character-injection
|
|
46
|
+
dataclasses + 3 preprocessing variants satisfy it structurally.
|
|
47
|
+
|
|
48
|
+
### Added
|
|
49
|
+
|
|
50
|
+
- **``TextTransform`` Protocol** (top-level; ``eval_toolkit.protocols`` module).
|
|
51
|
+
Decision K + Audit R5-F3 (Codex Round 5): unifies the "name + transform(text)"
|
|
52
|
+
shape across preprocessing (defence) and adversarial (attack) strategies so
|
|
53
|
+
the v0.47 top-level :func:`sweep` (next sub-PR) can mix them in one call. The
|
|
54
|
+
9th strict Tier-2 Protocol per ADR 0003.
|
|
55
|
+
- **Advanced 6 character-injection techniques** (plan §4F, Decision Q11→11.3) —
|
|
56
|
+
closes the v0.43.0 CHANGELOG forward-look that referenced these as "scheduled
|
|
57
|
+
for v0.43.1" (a version that never shipped). Each satisfies the top-level
|
|
58
|
+
:class:`TextTransform` Protocol structurally; all are frozen + ``slots=True``
|
|
59
|
+
dataclasses with deterministic behaviour under their ``seed`` kwarg where
|
|
60
|
+
applicable:
|
|
61
|
+
|
|
62
|
+
- :class:`BidiRTLInjection` — wrap input in ``U+202E … U+202C``
|
|
63
|
+
RIGHT-TO-LEFT OVERRIDE block.
|
|
64
|
+
- :class:`TagStrippingInjection` — strip HTML/XML-like ``<…>`` tags
|
|
65
|
+
(idempotent).
|
|
66
|
+
- :class:`SynonymSubstitution` — replace whitelisted prompt-injection-
|
|
67
|
+
relevant function words / verbs with semantic-preserving synonyms.
|
|
68
|
+
- :class:`TokenSplitting` — insert a single space inside sufficiently long
|
|
69
|
+
words; forces subword tokenizers to re-segment.
|
|
70
|
+
- :class:`UnicodeNormalization` — NFC / NFD / NFKC / NFKD; default NFKC
|
|
71
|
+
folds compatibility chars (e.g., fullwidth ``ＡＢＣ`` → ``ABC``).
|
|
72
|
+
- :class:`InvisibleCharsInjection` — sample from the 5-element invisible-
|
|
73
|
+
code-point set (ZWSP, ZWNJ, ZWJ, word joiner, BOM) — distinct from the
|
|
74
|
+
single-codepoint :class:`ZeroWidthSpaceInjection`.
|
|
75
|
+
|
|
76
|
+
Also exported: ``ADVANCED_TECHNIQUES`` (6-tuple) and ``ALL_TECHNIQUES``
|
|
77
|
+
(12-tuple = core 6 + advanced 6).
|
|
78
|
+
- **Top-level :func:`sweep`** — single ``TextTransform`` enumeration entry
|
|
79
|
+
point (Decision K + Decision D + Audit R5-F3). Replaces the per-module
|
|
80
|
+
``adversarial.sweep`` + ``preprocessing.sweep`` (those are removed in a
|
|
81
|
+
subsequent sub-PR of this release). New contract:
|
|
82
|
+
|
|
83
|
+
- ``sweep(strategies, texts)`` → neutral DataFrame with ``text_id`` /
|
|
84
|
+
``variant`` / ``transformed_text`` columns. Pure text-transform
|
|
85
|
+
enumeration; defence + attack strategies compose freely.
|
|
86
|
+
- ``sweep(..., scorer=...)`` → also emits ``original_score`` /
|
|
87
|
+
``transformed_score`` columns (single batched scorer call per
|
|
88
|
+
strategy, not per-row).
|
|
89
|
+
- ``sweep(..., scorer=..., attack_threshold=t)`` → also emits ``asr``
|
|
90
|
+
(per-row attack-success flag). Explicit threshold REQUIRED to
|
|
91
|
+
materialize ``asr``; no magic ``threshold=0.5`` default.
|
|
92
|
+
``attack_threshold`` without ``scorer`` raises ``ValueError``.
|
|
93
|
+
|
|
94
|
+
Parity tests against the existing module-level sweeps ship in this
|
|
95
|
+
sub-PR (``tests/test_sweep.py``) and prove the v0.47 consolidation
|
|
96
|
+
produces identical transformed-text rows for the 6 core character-
|
|
97
|
+
injection techniques + the 3 spotlighting variants.
|
|
98
|
+
- **3 preprocessing dataclasses** (``DelimitVariant``, ``DatamarkVariant``,
|
|
99
|
+
``EncodeVariant``) in :mod:`eval_toolkit.preprocessing`. Frozen +
|
|
100
|
+
``slots=True`` thin wrappers over the existing :func:`delimit` /
|
|
101
|
+
:func:`datamark` / :func:`encode` functions. Closes Audit R5-F3
|
|
102
|
+
(Codex Round 5) — prior to this commit, ``preprocessing.__all__`` exported
|
|
103
|
+
only functions, so the "concrete classes satisfy ``TextTransform``
|
|
104
|
+
structurally" claim only held on the adversarial side. Now both sides
|
|
105
|
+
share the dataclass-strategy shape.
|
|
106
|
+
- ``metric_specs.make_spec_name(prefix, **kwargs)`` canonicalization helper
|
|
107
|
+
for custom parameterized :class:`MetricSpec` implementations. Alphabetized
|
|
108
|
+
kwargs joined by underscore — same convention the v0.46 ECE factory uses.
|
|
109
|
+
Lands in ``metric_specs.__all__`` only; **not** top-level ``__all__`` per
|
|
110
|
+
Decision R6-H. (Closes Round 6 Gemini R6-F4.)
|
|
111
|
+
|
|
112
|
+
### Changed (Round 6 follow-on)
|
|
113
|
+
|
|
114
|
+
- ``scorecard()`` now raises ``ValueError`` when two :class:`MetricSpec`
|
|
115
|
+
instances in the ``metrics`` list share a ``name``. Forces caller
|
|
116
|
+
disambiguation; the ``Mapping[str, MetricResult]`` contract never silently
|
|
117
|
+
drops a cell. Error message reports both indices. (Decision R6-B; closes
|
|
118
|
+
Round 6 Codex R6-F3.)
|
|
119
|
+
- ``scorecard(seed=None)`` docstring rewritten to document the deterministic-
|
|
120
|
+
by-default contract (``None`` is treated as ``seed=0``). No behavior
|
|
121
|
+
change; v0.46 documented the wrong contract. (Decision R6-A; closes Round 6
|
|
122
|
+
Codex R6-F4 + Gemini R6-F1.)
|
|
123
|
+
- ``_evaluate_spec()`` exception catches narrowed: ``MemoryError``,
|
|
124
|
+
``RecursionError``, ``KeyboardInterrupt``, and ``SystemExit`` now propagate
|
|
125
|
+
out of ``scorecard()`` instead of being captured as a ``status="error"``
|
|
126
|
+
cell. Per-cell isolation remains for ordinary application errors.
|
|
127
|
+
(Decision R6-F5; closes Round 6 Gemini R6-F5.)
|
|
128
|
+
- ``Scorecard.to_pandas()`` MultiIndex schema extended with two new inner-
|
|
129
|
+
field columns: ``n_resamples`` (int / NaN sentinel) and ``method``
|
|
130
|
+
(string / ``""`` sentinel). The DataFrame view is now lossless against
|
|
131
|
+
:meth:`BootstrapCI.to_dict` — trace provenance (resample count + CI
|
|
132
|
+
method) no longer drops at the DataFrame boundary. Callers indexing the
|
|
133
|
+
MultiIndex by name keep working; callers indexing by position must
|
|
134
|
+
re-check column offsets. (Decision R6-C; closes Round 6 Gemini R6-F3.)
|
|
135
|
+
- ``tests/test_public_api.py`` drift guard now captures method signatures
|
|
136
|
+
for ``typing.Protocol`` classes in ``__all__`` (a ``protocol_methods``
|
|
137
|
+
sub-entry in the snapshot). Together with a Tier-2 coverage test, this
|
|
138
|
+
actually enforces the strict method-shape stability ADR 0003 promises
|
|
139
|
+
for the 9 Tier-2 Protocols. (Decision R6-D; closes Round 6 Codex R6-F5.)
|
|
140
|
+
Public-API golden regenerated alongside this change.
|
|
141
|
+
|
|
142
|
+
## [0.46.1] — 2026-05-21 — Round 6 hotfix: ECE strategy validation + deprecation warning content
|
|
143
|
+
|
|
144
|
+
Hotfix release per **Decision Q** (data correctness regression + time-sensitive
|
|
145
|
+
warning content) + **Decision R6-E** (scope: R6-F1 + R6-F2 only; R6-A docstring
|
|
146
|
+
rolls forward to v0.47). All other Round 6 findings dispositioned to v0.47.0.
|
|
147
|
+
|
|
148
|
+
See [`docs/source/audit_findings.md`](docs/source/audit_findings.md) Round 6 for
|
|
149
|
+
the full disposition ledger.
|
|
150
|
+
|
|
151
|
+
### Fixed
|
|
152
|
+
|
|
153
|
+
- **`metric_specs.ece(strategy=<value>)` strategy validation** (Round 6 Codex
|
|
154
|
+
R6-F1). Prior to v0.46.1, an invalid strategy string (e.g.
|
|
155
|
+
`metric_specs.ece(strategy="typo")`) silently dispatched to quantile ECE and
|
|
156
|
+
returned a `scorecard()` cell with `status="ok"` under an invalid encoded key
|
|
157
|
+
(`"ece_n_bins_15_strategy_typo"`) — wrong-by-design data correctness path.
|
|
158
|
+
Verified by Codex via runtime probe. Now both the `ece()` factory and
|
|
159
|
+
`_EceSpec.compute()` raise:
|
|
160
|
+
```
|
|
161
|
+
ValueError: ECE strategy must be 'uniform' or 'quantile'; got 'typo'
|
|
162
|
+
```
|
|
163
|
+
Defence-in-depth: the factory validates eagerly (before LRU cache hit) AND
|
|
164
|
+
`compute()` validates at the compute boundary so direct construction of
|
|
165
|
+
`_EceSpec(strategy="typo")` (bypassing the factory) also raises.
|
|
166
|
+
|
|
167
|
+
- **Deprecation warning content for all 5 ECE variants** (Round 6 Codex R6-F2 +
|
|
168
|
+
Gemini R6-F2, with Decisions R6-F + R6-G). The v0.46.0 `__getattr__`
|
|
169
|
+
deprecation shim's warning messages produced broken migration snippets:
|
|
170
|
+
- For `expected_calibration_error` + `expected_calibration_error_equal_mass`:
|
|
171
|
+
the suggested `Scorecard` lookup key was the factory-call expression
|
|
172
|
+
(`"ece(n_bins=10)"`) instead of the encoded spec name
|
|
173
|
+
(`"ece_n_bins_10_strategy_uniform"`). Now uses the correct encoded key.
|
|
174
|
+
- For `expected_calibration_error_debiased` / `_l2` / `_l2_debiased`: these
|
|
175
|
+
variants are not in the v0.46 `metric_specs` namespace (Decision R6-G;
|
|
176
|
+
research-completeness primitives, deferred to v1.x if user demand
|
|
177
|
+
surfaces). Their warnings now point at the submodule path
|
|
178
|
+
(`from eval_toolkit.metrics import expected_calibration_error_debiased`)
|
|
179
|
+
instead of an unconstructable scorecard snippet.
|
|
180
|
+
- Pre-v0.46 default verification: Gemini's report claimed
|
|
181
|
+
`expected_calibration_error` defaulted to `n_bins=15`; verified against
|
|
182
|
+
`metrics.py:730-734` that the actual default is `n_bins=10`. Per Decision
|
|
183
|
+
R6-F, warning snippets use `n_bins=10` to preserve bit-identical pre-v0.46
|
|
184
|
+
math + add a migration note explaining the new `metric_specs.ece()` factory
|
|
185
|
+
default of `n_bins=15` (matching Hines et al.).
|
|
186
|
+
|
|
187
|
+
### Tests
|
|
188
|
+
|
|
189
|
+
- `tests/test_scorecard.py`: 4 new tests for ECE strategy validation
|
|
190
|
+
(parametrized factory-rejection + compute-defence-in-depth).
|
|
191
|
+
- `tests/test_deprecated_scalars_shim.py`: 4 new test classes — verify each
|
|
192
|
+
warning contains correct factory expression + encoded scorecard key, ECE
|
|
193
|
+
warnings carry the n_bins=10/15 migration note, submodule-only warnings cite
|
|
194
|
+
`eval_toolkit.metrics` path, and the snippet in each first-party warning is
|
|
195
|
+
EXECUTABLE (parses + runs against synthetic data + produces ok-status cell).
|
|
196
|
+
|
|
197
|
+
### Rolled forward to v0.47 (Decision R6-E)
|
|
198
|
+
|
|
199
|
+
- R6-A `seed=None` docstring fix (non-blocker per Decision Q).
|
|
200
|
+
- R6-F3 (Codex) duplicate `MetricSpec.name` rejection.
|
|
201
|
+
- R6-F5 (Codex) Protocol method-shape drift guard.
|
|
202
|
+
- R6-F3 (Gemini) `Scorecard.to_pandas()` schema expansion.
|
|
203
|
+
- R6-F4 (Gemini) `make_spec_name()` helper.
|
|
204
|
+
- R6-F5 (Gemini) narrow `_evaluate_spec()` exception catch.
|
|
205
|
+
- R6-F6 (Codex) plan + roadmap state-drift refresh.
|
|
206
|
+
|
|
8
207
|
## [0.46.0] — 2026-05-21 — Scorecard: primary v1.0 metric surface (closes #36)
|
|
9
208
|
|
|
10
209
|
Second minor of the staggered v0.45 → v0.46 → v0.47 → v0.48 → v1.0 sequence.
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: eval-toolkit
|
|
3
|
-
Version: 0.46.0
|
|
3
|
+
Version: 0.47.0
|
|
4
4
|
Summary: Reusable evaluation contracts for binary classification: metrics, bootstrap CIs, calibration, artifacts, and evidence gates.
|
|
5
5
|
Project-URL: Homepage, https://github.com/brandon-behring/eval-toolkit
|
|
6
6
|
Project-URL: Documentation, https://brandon-behring.github.io/eval-toolkit/
|
|
@@ -215,7 +215,7 @@ pip install "eval-toolkit[all]" # everything
|
|
|
215
215
|
|
|
216
216
|
```python
|
|
217
217
|
import numpy as np
|
|
218
|
-
from eval_toolkit import pr_auc, roc_auc, expected_calibration_error
|
|
218
|
+
from eval_toolkit.metrics import pr_auc, roc_auc, expected_calibration_error
|
|
219
219
|
|
|
220
220
|
rng = np.random.default_rng(42)
|
|
221
221
|
y = rng.integers(0, 2, size=200)
|
|
@@ -230,7 +230,8 @@ print(f"ECE (10 bins): {expected_calibration_error(y, s, n_bins=10):.3f}")
|
|
|
230
230
|
### Bootstrap confidence intervals
|
|
231
231
|
|
|
232
232
|
```python
|
|
233
|
-
from eval_toolkit import bootstrap_ci, paired_bootstrap_diff, pr_auc
|
|
233
|
+
from eval_toolkit import bootstrap_ci, paired_bootstrap_diff
|
|
234
|
+
from eval_toolkit.metrics import pr_auc
|
|
234
235
|
|
|
235
236
|
ci = bootstrap_ci(y, s, pr_auc, n_resamples=1000, seed=42)
|
|
236
237
|
print(f"PR-AUC: {ci.point_estimate:.3f} 95% CI: [{ci.ci_low:.3f}, {ci.ci_high:.3f}]")
|
|
@@ -244,8 +245,10 @@ print(f"Δ PR-AUC: {diff.delta:.3f} overlaps zero: {diff.overlaps_zero}")
|
|
|
244
245
|
### Temperature scaling (Guo et al. 2017)
|
|
245
246
|
|
|
246
247
|
```python
|
|
248
|
+
import numpy as np
|
|
247
249
|
from eval_toolkit import fit_temperature
|
|
248
250
|
|
|
251
|
+
rng = np.random.default_rng(42)
|
|
249
252
|
logits = rng.normal(size=(500, 2))
|
|
250
253
|
labels = (logits[:, 1] > logits[:, 0]).astype(int)
|
|
251
254
|
result = fit_temperature(logits, labels)
|
|
@@ -132,7 +132,7 @@ pip install "eval-toolkit[all]" # everything
|
|
|
132
132
|
|
|
133
133
|
```python
|
|
134
134
|
import numpy as np
|
|
135
|
-
from eval_toolkit import pr_auc, roc_auc, expected_calibration_error
|
|
135
|
+
from eval_toolkit.metrics import pr_auc, roc_auc, expected_calibration_error
|
|
136
136
|
|
|
137
137
|
rng = np.random.default_rng(42)
|
|
138
138
|
y = rng.integers(0, 2, size=200)
|
|
@@ -147,7 +147,8 @@ print(f"ECE (10 bins): {expected_calibration_error(y, s, n_bins=10):.3f}")
|
|
|
147
147
|
### Bootstrap confidence intervals
|
|
148
148
|
|
|
149
149
|
```python
|
|
150
|
-
from eval_toolkit import bootstrap_ci, paired_bootstrap_diff
|
|
150
|
+
from eval_toolkit import bootstrap_ci, paired_bootstrap_diff
|
|
151
|
+
from eval_toolkit.metrics import pr_auc
|
|
151
152
|
|
|
152
153
|
ci = bootstrap_ci(y, s, pr_auc, n_resamples=1000, seed=42)
|
|
153
154
|
print(f"PR-AUC: {ci.point_estimate:.3f} 95% CI: [{ci.ci_low:.3f}, {ci.ci_high:.3f}]")
|
|
@@ -161,8 +162,10 @@ print(f"Δ PR-AUC: {diff.delta:.3f} overlaps zero: {diff.overlaps_zero}")
|
|
|
161
162
|
### Temperature scaling (Guo et al. 2017)
|
|
162
163
|
|
|
163
164
|
```python
|
|
165
|
+
import numpy as np
|
|
164
166
|
from eval_toolkit import fit_temperature
|
|
165
167
|
|
|
168
|
+
rng = np.random.default_rng(42)
|
|
166
169
|
logits = rng.normal(size=(500, 2))
|
|
167
170
|
labels = (logits[:, 1] > logits[:, 0]).astype(int)
|
|
168
171
|
result = fit_temperature(logits, labels)
|
|
@@ -73,4 +73,6 @@ What would have to change for this decision to be reopened?
|
|
|
73
73
|
|
|
74
74
|
| # | Title | Status | Date |
|
|
75
75
|
|---|---|---|---|
|
|
76
|
-
|
|
|
76
|
+
| [0001](0001-flat-module-layout.md) | Flat single-file modules through v1.x | Accepted | 2026-05-21 |
|
|
77
|
+
| [0002](0002-scorecard-as-primary-metric-surface.md) | `scorecard()` as the primary v1.0 metric surface | Accepted | 2026-05-21 |
|
|
78
|
+
| [0003](0003-stability-contract-and-gate3-methodology.md) | v1.0 stability contract + Gate 3 methodology | Accepted | 2026-05-21 |
|
|
@@ -31,22 +31,36 @@ _logging.getLogger("eval_toolkit").addHandler(_logging.NullHandler())
|
|
|
31
31
|
# tests/golden/public_api/ reads dict keys + values, not comments.
|
|
32
32
|
_EXPORTS: dict[str, str] = {
|
|
33
33
|
# --- adversarial ---
|
|
34
|
+
"ADVANCED_TECHNIQUES": "eval_toolkit.adversarial",
|
|
35
|
+
"ALL_TECHNIQUES": "eval_toolkit.adversarial",
|
|
36
|
+
"BidiRTLInjection": "eval_toolkit.adversarial",
|
|
34
37
|
"CORE_TECHNIQUES": "eval_toolkit.adversarial",
|
|
35
38
|
"CaseRandomization": "eval_toolkit.adversarial",
|
|
36
|
-
"CharacterInjectionStrategy": "eval_toolkit.adversarial",
|
|
37
39
|
"DiacriticInjection": "eval_toolkit.adversarial",
|
|
38
40
|
"HomoglyphSubstitution": "eval_toolkit.adversarial",
|
|
41
|
+
"InvisibleCharsInjection": "eval_toolkit.adversarial",
|
|
39
42
|
"PunctuationInjection": "eval_toolkit.adversarial",
|
|
43
|
+
"SynonymSubstitution": "eval_toolkit.adversarial",
|
|
44
|
+
"TagStrippingInjection": "eval_toolkit.adversarial",
|
|
45
|
+
"TokenSplitting": "eval_toolkit.adversarial",
|
|
46
|
+
"UnicodeNormalization": "eval_toolkit.adversarial",
|
|
40
47
|
"WhitespaceInjection": "eval_toolkit.adversarial",
|
|
41
48
|
"ZeroWidthSpaceInjection": "eval_toolkit.adversarial",
|
|
42
|
-
|
|
49
|
+
# CharacterInjectionStrategy + character_injection SimpleNamespace
|
|
50
|
+
# removed at v0.47 (Decision N + plan §4E). TextTransform Protocol +
|
|
51
|
+
# the 12 concrete dataclasses are now the only public path.
|
|
43
52
|
# --- losses ---
|
|
44
53
|
"RecallAtLowFPR": "eval_toolkit.losses",
|
|
45
54
|
# --- preprocessing ---
|
|
55
|
+
# `spotlighting` SimpleNamespace removed at v0.47 (Decision N + plan §4E).
|
|
56
|
+
# The 3 Variant dataclasses + the underlying functional API are the
|
|
57
|
+
# only public path.
|
|
58
|
+
"DatamarkVariant": "eval_toolkit.preprocessing",
|
|
59
|
+
"DelimitVariant": "eval_toolkit.preprocessing",
|
|
60
|
+
"EncodeVariant": "eval_toolkit.preprocessing",
|
|
46
61
|
"datamark": "eval_toolkit.preprocessing",
|
|
47
62
|
"delimit": "eval_toolkit.preprocessing",
|
|
48
63
|
"encode": "eval_toolkit.preprocessing",
|
|
49
|
-
"spotlighting": "eval_toolkit.preprocessing",
|
|
50
64
|
# --- probes ---
|
|
51
65
|
"ActivationDeltaProbe": "eval_toolkit.probes",
|
|
52
66
|
"ActivationExtractor": "eval_toolkit.probes",
|
|
@@ -247,6 +261,7 @@ _EXPORTS: dict[str, str] = {
|
|
|
247
261
|
"PredictionReader": "eval_toolkit.protocols",
|
|
248
262
|
"Scorer": "eval_toolkit.protocols",
|
|
249
263
|
"SliceAwareScorer": "eval_toolkit.protocols",
|
|
264
|
+
"TextTransform": "eval_toolkit.protocols",
|
|
250
265
|
"Versioned": "eval_toolkit.protocols",
|
|
251
266
|
# --- seeds ---
|
|
252
267
|
"set_global_seeds": "eval_toolkit.seeds",
|
|
@@ -298,64 +313,28 @@ _EXPORTS: dict[str, str] = {
|
|
|
298
313
|
"MetricSpec": "eval_toolkit._scorecard",
|
|
299
314
|
"Scorecard": "eval_toolkit._scorecard",
|
|
300
315
|
"scorecard": "eval_toolkit._scorecard",
|
|
316
|
+
# --- sweep (top-level v0.47 unification — Decision K + Decision D) ---
|
|
317
|
+
"sweep": "eval_toolkit._sweep",
|
|
301
318
|
}
|
|
302
319
|
|
|
303
320
|
__all__ = ["__version__", *_EXPORTS.keys()]
|
|
304
321
|
|
|
305
322
|
|
|
306
|
-
# ── BEGIN TRANSITIONAL DEPRECATION BRANCH (Decision L; REMOVE AT v0.47) ──
|
|
307
|
-
# At v0.46 the scalar metric functions left the top-level `_EXPORTS` map (above)
|
|
308
|
-
# in favor of the `scorecard()` surface (Decision A). To give the consumer one
|
|
309
|
-
# release of overlap before the hard removal at v0.47, the names below remain
|
|
310
|
-
# reachable via the package-level `__getattr__` (which delegates to the
|
|
311
|
-
# `eval_toolkit.metrics` submodule) but emit a `DeprecationWarning` on first
|
|
312
|
-
# lookup pointing at the new API.
|
|
313
|
-
#
|
|
314
|
-
# WHY THIS IS A BRANCH, NOT A REPLACEMENT (Audit F4 — Round 5):
|
|
315
|
-
# `__getattr__` below is the load-bearing lazy export resolver for every name
|
|
316
|
-
# in `_EXPORTS`. The deprecation branch is a discrete `if name in
|
|
317
|
-
# _DEPRECATED_SCALARS` check ABOVE the resolver — the resolver's existing
|
|
318
|
-
# behavior for non-deprecated names is unchanged. At v0.47 we delete this
|
|
319
|
-
# transitional block and the resolver continues to work for every remaining
|
|
320
|
-
# `_EXPORTS` entry.
|
|
321
|
-
_DEPRECATED_SCALARS: frozenset[str] = frozenset(
|
|
322
|
-
{
|
|
323
|
-
"pr_auc",
|
|
324
|
-
"roc_auc",
|
|
325
|
-
"brier_score",
|
|
326
|
-
"expected_calibration_error",
|
|
327
|
-
"expected_calibration_error_debiased",
|
|
328
|
-
"expected_calibration_error_equal_mass",
|
|
329
|
-
"expected_calibration_error_l2",
|
|
330
|
-
"expected_calibration_error_l2_debiased",
|
|
331
|
-
}
|
|
332
|
-
)
|
|
333
|
-
# ── END TRANSITIONAL DEPRECATION (Decision L; REMOVE AT v0.47) ──
|
|
334
|
-
|
|
335
|
-
|
|
336
323
|
def __getattr__(name: str) -> Any:
|
|
337
|
-
"""Resolve public symbols lazily.
|
|
324
|
+
"""Resolve public symbols lazily.
|
|
325
|
+
|
|
326
|
+
v0.47 cleanup (Decision L): the BEGIN/END TRANSITIONAL DEPRECATION
|
|
327
|
+
BRANCH that v0.46 inserted in front of the resolver — together with the
|
|
328
|
+
``_DEPRECATED_SCALARS`` frozenset and the ``_scorecard_spec_for``
|
|
329
|
+
helper — has been removed. The lazy resolver below is the v0.46 base
|
|
330
|
+
behavior; with the transitional block gone, deprecated v0.45 scalar names
|
|
331
|
+
(``pr_auc``, ``roc_auc``, ``brier_score``, the 5 ``expected_calibration_*``
|
|
332
|
+
variants) now raise :class:`AttributeError` cleanly. Submodule-level
|
|
333
|
+
access (e.g., ``from eval_toolkit.metrics import pr_auc``) is unaffected
|
|
334
|
+
per Decision C / ADR 0002.
|
|
335
|
+
"""
|
|
338
336
|
if name == "__version__":
|
|
339
337
|
return __version__
|
|
340
|
-
# ── BEGIN TRANSITIONAL DEPRECATION BRANCH (Decision L; REMOVE AT v0.47) ──
|
|
341
|
-
if name in _DEPRECATED_SCALARS:
|
|
342
|
-
import warnings
|
|
343
|
-
|
|
344
|
-
warnings.warn(
|
|
345
|
-
f"eval_toolkit.{name} is deprecated and will be removed in v0.47. "
|
|
346
|
-
f"Use `scorecard(y, s, metrics=[metric_specs.{_scorecard_spec_for(name)}])"
|
|
347
|
-
f'["{_scorecard_spec_for(name)}"].value` instead, or "import from the'
|
|
348
|
-
f" `eval_toolkit.metrics` submodule directly (internal API).",
|
|
349
|
-
DeprecationWarning,
|
|
350
|
-
stacklevel=2,
|
|
351
|
-
)
|
|
352
|
-
module = import_module("eval_toolkit.metrics")
|
|
353
|
-
value = getattr(module, name)
|
|
354
|
-
# Do NOT cache in globals() — repeated lookups should keep re-warning
|
|
355
|
-
# (one warning per call site, modulo Python's default
|
|
356
|
-
# DeprecationWarning de-duplication).
|
|
357
|
-
return value
|
|
358
|
-
# ── END TRANSITIONAL DEPRECATION (Decision L; REMOVE AT v0.47) ──
|
|
359
338
|
module_name = _EXPORTS.get(name)
|
|
360
339
|
if module_name is None:
|
|
361
340
|
raise AttributeError(f"module 'eval_toolkit' has no attribute {name!r}")
|
|
@@ -365,29 +344,6 @@ def __getattr__(name: str) -> Any:
|
|
|
365
344
|
return value
|
|
366
345
|
|
|
367
346
|
|
|
368
|
-
# ── BEGIN TRANSITIONAL DEPRECATION HELPER (Decision L; REMOVE AT v0.47) ──
|
|
369
|
-
def _scorecard_spec_for(deprecated_name: str) -> str:
|
|
370
|
-
"""Map a deprecated-scalar name to its `metric_specs` replacement name.
|
|
371
|
-
|
|
372
|
-
Used only inside the v0.46 deprecation warning message. Returns the
|
|
373
|
-
closest equivalent first-party spec name where one exists; falls back
|
|
374
|
-
to the original name for ECE variants whose exact-match spec isn't in
|
|
375
|
-
the v0.46 first-party namespace (e.g., the L2 / debiased variants —
|
|
376
|
-
callers either implement a custom `MetricSpec` or stay on the
|
|
377
|
-
submodule path).
|
|
378
|
-
"""
|
|
379
|
-
return {
|
|
380
|
-
"pr_auc": "pr_auc",
|
|
381
|
-
"roc_auc": "roc_auc",
|
|
382
|
-
"brier_score": "brier",
|
|
383
|
-
"expected_calibration_error": "ece(n_bins=10)",
|
|
384
|
-
"expected_calibration_error_equal_mass": 'ece(n_bins=10, strategy="quantile")',
|
|
385
|
-
}.get(deprecated_name, deprecated_name)
|
|
386
|
-
|
|
387
|
-
|
|
388
|
-
# ── END TRANSITIONAL DEPRECATION HELPER (Decision L; REMOVE AT v0.47) ──
|
|
389
|
-
|
|
390
|
-
|
|
391
347
|
def __dir__() -> list[str]:
|
|
392
348
|
"""Expose lazy public symbols to introspection."""
|
|
393
349
|
return sorted(__all__)
|
|
@@ -261,10 +261,17 @@ class Scorecard(Mapping[str, MetricResult]):
|
|
|
261
261
|
``ImportError`` with an install hint when pandas is missing.
|
|
262
262
|
|
|
263
263
|
The DataFrame has 1 row (one slice) and a 2-level column index:
|
|
264
|
-
outer = metric name, inner = field name in
|
|
265
|
-
|
|
266
|
-
|
|
267
|
-
|
|
264
|
+
outer = metric name, inner = field name in ``{"value", "status",
|
|
265
|
+
"reason", "ci_low", "ci_high", "confidence", "n_resamples",
|
|
266
|
+
"method"}``. CI-related columns (``ci_low``, ``ci_high``,
|
|
267
|
+
``confidence``, ``n_resamples``, ``method``) are sentinel-valued
|
|
268
|
+
(``NaN`` for numeric, ``""`` for string) when no CI is present
|
|
269
|
+
(status="skipped" / "error", or bootstrap=False).
|
|
270
|
+
|
|
271
|
+
Decision R6-C (Round 6 audit, Gemini F3): the v0.47 expansion adds
|
|
272
|
+
``n_resamples`` + ``method`` so the schema is lossless against
|
|
273
|
+
:meth:`BootstrapCI.to_dict` — trace provenance no longer drops in
|
|
274
|
+
the DataFrame view.
|
|
268
275
|
"""
|
|
269
276
|
try:
|
|
270
277
|
import pandas as pd
|
|
@@ -285,6 +292,8 @@ class Scorecard(Mapping[str, MetricResult]):
|
|
|
285
292
|
(name, "ci_low"),
|
|
286
293
|
(name, "ci_high"),
|
|
287
294
|
(name, "confidence"),
|
|
295
|
+
(name, "n_resamples"),
|
|
296
|
+
(name, "method"),
|
|
288
297
|
]
|
|
289
298
|
)
|
|
290
299
|
values.extend(
|
|
@@ -295,6 +304,8 @@ class Scorecard(Mapping[str, MetricResult]):
|
|
|
295
304
|
result.ci.ci_low if result.ci is not None else float("nan"),
|
|
296
305
|
result.ci.ci_high if result.ci is not None else float("nan"),
|
|
297
306
|
result.ci.confidence if result.ci is not None else float("nan"),
|
|
307
|
+
result.ci.n_resamples if result.ci is not None else float("nan"),
|
|
308
|
+
result.ci.method if result.ci is not None else "",
|
|
298
309
|
]
|
|
299
310
|
)
|
|
300
311
|
|
|
@@ -341,7 +352,13 @@ def scorecard(
|
|
|
341
352
|
confidence : float, optional
|
|
342
353
|
Two-sided CI level ∈ ``(0, 1)``. Default ``0.95``.
|
|
343
354
|
seed : int or None, optional
|
|
344
|
-
Bootstrap RNG seed. Default ``None``
|
|
355
|
+
Bootstrap RNG seed. Default ``None``, which is treated as ``seed=0``
|
|
356
|
+
for reproducibility — eval-toolkit's evaluation pipelines are
|
|
357
|
+
deterministic by default. Pass an explicit integer to control the
|
|
358
|
+
bootstrap RNG; pass a value derived from
|
|
359
|
+
``np.random.SeedSequence().entropy`` for non-deterministic sampling.
|
|
360
|
+
Decision R6-A (Round 6 audit) locked the deterministic-by-default
|
|
361
|
+
contract; the prior docstring framing was incorrect.
|
|
345
362
|
|
|
346
363
|
Returns
|
|
347
364
|
-------
|
|
@@ -410,6 +427,7 @@ def scorecard(
|
|
|
410
427
|
confidence=confidence,
|
|
411
428
|
bootstrap=bootstrap,
|
|
412
429
|
)
|
|
430
|
+
_validate_unique_spec_names(metrics)
|
|
413
431
|
|
|
414
432
|
is_single_class = bool(np.unique(y_true_arr).size < 2)
|
|
415
433
|
|
|
@@ -450,6 +468,11 @@ def _evaluate_spec(
|
|
|
450
468
|
|
|
451
469
|
try:
|
|
452
470
|
point = float(spec.compute(y_true, y_score))
|
|
471
|
+
except (MemoryError, RecursionError, KeyboardInterrupt, SystemExit):
|
|
472
|
+
# Process-exhaustion / user-interrupt signals must propagate;
|
|
473
|
+
# per-cell isolation is for application-level errors only.
|
|
474
|
+
# Decision R6-F5 (Round 6 audit, Gemini).
|
|
475
|
+
raise
|
|
453
476
|
except Exception as exc: # noqa: BLE001 — broad catch is intentional (per-cell isolation)
|
|
454
477
|
return MetricResult(
|
|
455
478
|
value=None,
|
|
@@ -469,6 +492,9 @@ def _evaluate_spec(
|
|
|
469
492
|
confidence=confidence,
|
|
470
493
|
seed=seed if seed is not None else 0,
|
|
471
494
|
)
|
|
495
|
+
except (MemoryError, RecursionError, KeyboardInterrupt, SystemExit):
|
|
496
|
+
# Same R6-F5 invariant for the bootstrap path.
|
|
497
|
+
raise
|
|
472
498
|
except Exception as exc: # noqa: BLE001
|
|
473
499
|
# Point estimate succeeded; the bootstrap couldn't (e.g., n < 10
|
|
474
500
|
# floor from bootstrap.py:198, BCa degeneracy, etc.). Record the
|
|
@@ -507,3 +533,21 @@ def _validate_scorecard_inputs(
|
|
|
507
533
|
raise ValueError(f"n_resamples must be >= 1 when bootstrap=True; got {n_resamples}")
|
|
508
534
|
if not 0.0 < confidence < 1.0:
|
|
509
535
|
raise ValueError(f"confidence must be in (0, 1); got {confidence}")
|
|
536
|
+
|
|
537
|
+
|
|
538
|
+
def _validate_unique_spec_names(metrics: Sequence[MetricSpec]) -> None:
|
|
539
|
+
"""Reject duplicate MetricSpec.name values in a single scorecard() call.
|
|
540
|
+
|
|
541
|
+
Locked by Decision R6-B (Round 6 audit, Codex R6-F3): silent last-wins
|
|
542
|
+
overwrite is not a documented Mapping[str, MetricResult] contract. Force
|
|
543
|
+
the caller to disambiguate so we never lose data on user error.
|
|
544
|
+
"""
|
|
545
|
+
seen: dict[str, int] = {}
|
|
546
|
+
for i, spec in enumerate(metrics):
|
|
547
|
+
if spec.name in seen:
|
|
548
|
+
raise ValueError(
|
|
549
|
+
f"Duplicate MetricSpec name {spec.name!r} at index {i} "
|
|
550
|
+
f"(previously at index {seen[spec.name]}); each spec must have a "
|
|
551
|
+
f"unique name for the Scorecard Mapping[str, MetricResult] contract."
|
|
552
|
+
)
|
|
553
|
+
seen[spec.name] = i
|