eval-toolkit 1.0.1__tar.gz → 1.0.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/CHANGELOG.md +49 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/PKG-INFO +3 -2
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/README.md +2 -1
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/src/eval_toolkit/_version.py +1 -1
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/src/eval_toolkit/metrics.py +38 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/golden/public_api/snapshot.json +1 -1
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_harness_folded.py +23 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/.gitignore +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/LICENSE +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/STYLE.md +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/docs/archive/README.md +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/docs/research/README.md +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/docs/research/datasets/README.md +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/docs/research/papers/data-integrity/README.md +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/docs/research/papers/eval-ecosystem/README.md +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/docs/research/papers/inference/README.md +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/docs/research/papers/prompt-injection/README.md +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/docs/source/adr/README.md +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/docs/source/methodology/README.md +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/pyproject.toml +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/src/eval_toolkit/__init__.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/src/eval_toolkit/__main__.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/src/eval_toolkit/_deprecated.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/src/eval_toolkit/_parallel.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/src/eval_toolkit/_rng.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/src/eval_toolkit/_sweep.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/src/eval_toolkit/adversarial.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/src/eval_toolkit/analysis.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/src/eval_toolkit/artifacts.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/src/eval_toolkit/audit_citation_alignment.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/src/eval_toolkit/bootstrap.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/src/eval_toolkit/calibration.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/src/eval_toolkit/claims.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/src/eval_toolkit/config.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/src/eval_toolkit/docs.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/src/eval_toolkit/embeddings.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/src/eval_toolkit/evidence.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/src/eval_toolkit/harness.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/src/eval_toolkit/leakage.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/src/eval_toolkit/loaders.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/src/eval_toolkit/losses.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/src/eval_toolkit/manifest.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/src/eval_toolkit/metric_specs.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/src/eval_toolkit/operating_points.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/src/eval_toolkit/paths.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/src/eval_toolkit/plotting.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/src/eval_toolkit/preprocessing.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/src/eval_toolkit/probes.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/src/eval_toolkit/protocols.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/src/eval_toolkit/provenance.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/src/eval_toolkit/py.typed +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/src/eval_toolkit/schemas/manifest.v1.json +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/src/eval_toolkit/schemas/manifest.v2.json +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/src/eval_toolkit/schemas/manifest.v3.json +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/src/eval_toolkit/schemas/ood_manifest.v1.json +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/src/eval_toolkit/schemas/results.v1.json +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/src/eval_toolkit/schemas/results_full.v1.json +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/src/eval_toolkit/scorecards.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/src/eval_toolkit/seeds.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/src/eval_toolkit/splits.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/src/eval_toolkit/stacking.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/src/eval_toolkit/text_dedup.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/src/eval_toolkit/thresholds.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/baseline/test_plotting_visual/plot_bootstrap_distribution.png +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/baseline/test_plotting_visual/plot_confusion_matrix_grid.png +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/baseline/test_plotting_visual/plot_lift_ci.png +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/baseline/test_plotting_visual/plot_metric_bars.png +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/baseline/test_plotting_visual/plot_pareto_frontier.png +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/baseline/test_plotting_visual/plot_pr_curve.png +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/baseline/test_plotting_visual/plot_reliability_diagram.png +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/baseline/test_plotting_visual/plot_roc_curve.png +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/baseline/test_plotting_visual/plot_score_histograms.png +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/baseline/test_plotting_visual/plot_slice_metric_heatmap.png +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/benchmarks/__init__.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/benchmarks/test_kernel_benchmarks.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/conftest.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/golden/bootstrap_ci/cases.json +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/golden/data/dedup_holdout.jsonl +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/golden/data/dedup_holdout_expected.json +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/golden/data/dedup_holdout_provenance.md +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/golden/docs/expected.md +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/golden/docs/input.md +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/golden/docs/metrics.json +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/golden/test_dedup_holdout_calibration.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/strategies.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_adversarial.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_analysis.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_artifacts.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_audit_citation_alignment.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_block_bootstrap_on_folds.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_bootstrap_calibration_mc.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_bootstrap_edge_cases.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_bootstrap_golden.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_bootstrap_njobs.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_bootstrap_props.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_bootstrap_research_grounded.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_bootstrap_unit.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_calibration_binary_adapters.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_calibration_bootstrap_chain.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_calibration_determinism.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_calibration_optimization_failures.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_calibration_props.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_calibration_research_grounded.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_calibration_unit.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_claims.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_claims_coverage.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_claims_props.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_cli.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_config.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_coverage_bootstrap.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_coverage_calibration.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_coverage_harness.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_coverage_metrics.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_coverage_plotting.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_croissant_e2e.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_dedup_split_leakage_chain.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_deprecated_scalars_shim.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_deprecations.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_docs_golden.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_docs_props.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_embeddings.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_evidence_validators.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_harness_edge_cases.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_harness_fault_injection.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_harness_internals.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_harness_metric_options.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_harness_parallelism.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_harness_smoke.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_import_boundaries.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_is_metric_defined_for_slice.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_lazy_extras_messages.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_leakage.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_leakage_error_paths.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_leakage_props.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_loaders.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_loaders_coverage.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_loaders_props.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_logging.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_losses.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_manifest.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_manifest_contamination_round_trip.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_manifest_props.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_manifest_validation.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_metrics_props.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_metrics_stratified_subsets.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_metrics_unit.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_misc_coverage.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_numeric_edge_cases.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_ood_loader.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_operating_points.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_operating_points_props.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_parallel.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_paths.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_pipeline_e2e.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_plotting_edge.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_plotting_smoke.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_plotting_visual.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_preprocessing.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_probes.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_protocol_conformance.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_provenance.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_public_api.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_recall_at_fpr.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_reference_equivalence.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_reproducibility_integration.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_rng.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_schemas.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_scorecard.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_seeds.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_splits.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_splits_leakage_integration.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_splits_props.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_stacking.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_sweep.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_text_dedup.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_text_dedup_coverage.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_text_dedup_props.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_text_dedup_strategies.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_thresholds.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_thresholds_constant_score.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_thresholds_coverage.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_thresholds_props.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_thresholds_research_grounded.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_tokenization_leakage_check.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/test_v09_contracts.py +0 -0
|
@@ -5,6 +5,55 @@ All notable changes to this project will be documented in this file.
|
|
|
5
5
|
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
|
|
6
6
|
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
|
7
7
|
|
|
8
|
+
## [1.0.2] — 2026-05-26 — #76 cleanup batch closes (RC2 + RC3 + F-metrics-1/3/4)
|
|
9
|
+
|
|
10
|
+
Closes the GH #76 v1.0.1 cleanup tracker. All 6 items shipped across
|
|
11
|
+
v1.0.1 (RC4) and v1.0.2 (this release). All P3, all NON-BREAKING.
|
|
12
|
+
|
|
13
|
+
### Changed (Tier-2 ADDITIVE: contract clarification only)
|
|
14
|
+
|
|
15
|
+
- **RC2** (#76) — `SimilarityStrategy` Protocol promoted from
|
|
16
|
+
"pre-v0.7 internal interface" (prose framing only) to formal
|
|
17
|
+
10th strict Tier-2 Protocol per [ADR 0003](docs/source/adr/0003-stability-contract-and-gate3-methodology.md).
|
|
18
|
+
Aligns prose surfaces (README, extending.md, strict_tier2_protocols.md,
|
|
19
|
+
api/protocols.md, ADR 0004 §D6, roadmap.md) with the contract
|
|
20
|
+
already locked in `tests/golden/public_api/snapshot.json` +
|
|
21
|
+
`src/eval_toolkit/__init__.py:_EXPORTS` since v1.0.0. **No code
|
|
22
|
+
change — documentation-only reconciliation.** Strict-Tier-2 count
|
|
23
|
+
goes 9 → 10 (+ 1 opt-in `Versioned`).
|
|
24
|
+
|
|
25
|
+
### Fixed
|
|
26
|
+
|
|
27
|
+
- **RC3** (#76) — `tests/test_harness_folded.py::test_evaluate_folded_reseed_splitter_varies_partitions`
|
|
28
|
+
test hardening. Previous assertions covered count + key existence
|
|
29
|
+
only; a regression silently reusing the splitter (R8-C1 pre-fix
|
|
30
|
+
behavior) could still pass. v1.0.2 adds row-content comparison:
|
|
31
|
+
replays `reseed_splitter` against the splitter for `seed=1` vs
|
|
32
|
+
`seed=2` and asserts fold-0 test partitions differ via feature-text
|
|
33
|
+
set membership (robust to `_slice_subset`'s `reset_index(drop=True)`
|
|
34
|
+
via stable text-column identifiers).
|
|
35
|
+
|
|
36
|
+
- **F-metrics-1** (#76) — `brier_score` docstring input-domain clarity.
|
|
37
|
+
Added explicit "Input domain" Notes subsection clarifying binary
|
|
38
|
+
labels in `{0, 1}` + calibrated probabilities in `[0, 1]` are
|
|
39
|
+
required; raw logits or unbounded ranking scores pass the finiteness
|
|
40
|
+
check but produce out-of-range MSE that misrepresents calibration
|
|
41
|
+
quality. Includes calibration-applying recipe pointer.
|
|
42
|
+
|
|
43
|
+
- **F-metrics-3** (#76) — `expected_calibration_error` docstring
|
|
44
|
+
uniform-scores note. Added explicit Notes subsection documenting
|
|
45
|
+
that constant `y_score` returns 0.0 (per-bin formula trivially
|
|
46
|
+
satisfied) but is semantically misleading — uninformative scorers
|
|
47
|
+
look "perfectly calibrated" despite zero discriminative power.
|
|
48
|
+
Callers should filter constant inputs before ECE.
|
|
49
|
+
|
|
50
|
+
- **F-metrics-4** (#76) — `brier_score` docstring single-class
|
|
51
|
+
edge-case explicit. Added Notes subsection with closed-form
|
|
52
|
+
expressions for all-zeros (`BS = mean(p²)`) and all-ones
|
|
53
|
+
(`BS = mean((1-p)²)`) cases. Explicit confirmation that
|
|
54
|
+
per-slice degenerate-class evaluation is supported (unlike
|
|
55
|
+
PR-AUC / ROC-AUC).
|
|
56
|
+
|
|
8
57
|
## [1.0.1] — 2026-05-25 — audit_citation_alignment + RC4 docs polish
|
|
9
58
|
|
|
10
59
|
First v1.x patch release. Ships the `audit_citation_alignment` validator
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: eval-toolkit
|
|
3
|
-
Version: 1.0.1
|
|
3
|
+
Version: 1.0.2
|
|
4
4
|
Summary: Reusable evaluation contracts for binary classification: metrics, bootstrap CIs, calibration, artifacts, and evidence gates.
|
|
5
5
|
Project-URL: Homepage, https://github.com/brandon-behring/eval-toolkit
|
|
6
6
|
Project-URL: Documentation, https://brandon-behring.github.io/eval-toolkit/
|
|
@@ -115,7 +115,8 @@ format changes.
|
|
|
115
115
|
├─ Tier 2 ─ Protocol-based orchestration ────────────────┤
|
|
116
116
|
│ Scorer / SliceAwareScorer / LeakageCheck / Splitter │
|
|
117
117
|
│ ThresholdSelector / DatasetLoader / MetricSpec │
|
|
118
|
-
│ MetaLearner / Probe / TextTransform │
|
|
118
|
+
│ MetaLearner / Probe / TextTransform / │
|
|
119
|
+
│ SimilarityStrategy (10 strict) │
|
|
119
120
|
│ Versioned (opt-in: per-object versions in manifest) │
|
|
120
121
|
├─ Tier 1 ─ Functional core ─────────────────────────────┤
|
|
121
122
|
│ pr_auc / roc_auc / ECE variants / Brier / bootstrap_ci│
|
|
@@ -32,7 +32,8 @@ format changes.
|
|
|
32
32
|
├─ Tier 2 ─ Protocol-based orchestration ────────────────┤
|
|
33
33
|
│ Scorer / SliceAwareScorer / LeakageCheck / Splitter │
|
|
34
34
|
│ ThresholdSelector / DatasetLoader / MetricSpec │
|
|
35
|
-
│ MetaLearner / Probe / TextTransform │
|
|
35
|
+
│ MetaLearner / Probe / TextTransform / │
|
|
36
|
+
│ SimilarityStrategy (10 strict) │
|
|
36
37
|
│ Versioned (opt-in: per-object versions in manifest) │
|
|
37
38
|
├─ Tier 1 ─ Functional core ─────────────────────────────┤
|
|
38
39
|
│ pr_auc / roc_auc / ECE variants / Brier / bootstrap_ci│
|
|
@@ -792,6 +792,20 @@ def expected_calibration_error(
|
|
|
792
792
|
empirical positive rate in the bin, and :math:`\\mathrm{conf}` is the
|
|
793
793
|
mean predicted score.
|
|
794
794
|
|
|
795
|
+
**Uniform / uninformative scores** (F-metrics-3 v1.0.2 clarity pass):
|
|
796
|
+
when ``y_score`` is constant (e.g., ``[0.5] * n`` — an uninformative
|
|
797
|
+
detector), this function returns ``0.0`` whenever that constant equals
|
|
798
|
+
the empirical positive rate. That's technically correct per the formula —
|
|
799
|
+
:math:`|\\mathrm{acc}(B_m) - \\mathrm{conf}(B_m)|` measures bin-level
|
|
800
|
+
calibration, and a single occupied bin with ``conf = base rate``
|
|
801
|
+
achieves perfect calibration locally. But it is semantically
|
|
802
|
+
misleading: an uninformative scorer looks "perfectly calibrated"
|
|
803
|
+
even though it has zero discriminative power. **Callers should
|
|
804
|
+
detect and filter uninformative inputs before passing to ECE** —
|
|
805
|
+
e.g., reject when ``np.unique(y_score).size == 1`` or when the
|
|
806
|
+
score variance is below a domain-specific threshold. Use
|
|
807
|
+
:func:`brier_score` or :func:`pr_auc` for resolution-aware metrics.
|
|
808
|
+
|
|
795
809
|
References
|
|
796
810
|
----------
|
|
797
811
|
.. [1] DeGroot, M. H. & Fienberg, S. E. "The comparison and evaluation of
|
|
@@ -1240,6 +1254,30 @@ def brier_score(
|
|
|
1240
1254
|
-----
|
|
1241
1255
|
.. math:: \mathrm{BS} = \frac{1}{n} \sum_i (p_i - y_i)^2
|
|
1242
1256
|
|
|
1257
|
+
**Input domain** (F-metrics-1 v1.0.2 clarity pass): ``y_true`` must
|
|
1258
|
+
be binary labels in ``{0, 1}`` (other label values raise
|
|
1259
|
+
``ValueError``). ``y_score`` must be calibrated probabilities in
|
|
1260
|
+
``[0, 1]`` — raw logits or unbounded ranking scores will pass the
|
|
1261
|
+
finiteness check but produce an out-of-range MSE that misrepresents
|
|
1262
|
+
calibration quality. If your scorer produces logits, apply
|
|
1263
|
+
sigmoid / softmax / a fitted calibrator (see
|
|
1264
|
+
:mod:`eval_toolkit.calibration`) before passing to ``brier_score``.
|
|
1265
|
+
|
|
1266
|
+
**Single-class behavior** (F-metrics-4 v1.0.2 clarity pass): unlike
|
|
1267
|
+
PR-AUC / ROC-AUC, ``brier_score`` is well-defined when ``y_true``
|
|
1268
|
+
is all-zeros or all-ones — it degenerates to the MSE around the
|
|
1269
|
+
constant class label. Specifically:
|
|
1270
|
+
|
|
1271
|
+
- All-zeros: :math:`\mathrm{BS} = \frac{1}{n} \sum_i p_i^2` —
|
|
1272
|
+
forecasting any positive probability incurs squared-error loss.
|
|
1273
|
+
- All-ones: :math:`\mathrm{BS} = \frac{1}{n} \sum_i (1 - p_i)^2`
|
|
1274
|
+
— forecasting low probability incurs squared-error loss.
|
|
1275
|
+
|
|
1276
|
+
This is the deliberate Brier-as-strict-proper-scoring-rule behavior
|
|
1277
|
+
(Brier 1950). Per-slice degenerate-class evaluation is supported:
|
|
1278
|
+
the ``empty_strategy`` parameter applies to ``n=0`` slices only; non-empty
|
|
1279
|
+
single-class slices score normally.
|
|
1280
|
+
|
|
1243
1281
|
See Also
|
|
1244
1282
|
--------
|
|
1245
1283
|
eval_toolkit.metrics.brier_decomposition :
|
|
@@ -1373,7 +1373,7 @@
|
|
|
1373
1373
|
"doc_first_line": "str(object='') -> str",
|
|
1374
1374
|
"kind": "value",
|
|
1375
1375
|
"type": "str",
|
|
1376
|
-
"value": "'1.0.1'"
|
|
1376
|
+
"value": "'1.0.2'"
|
|
1377
1377
|
},
|
|
1378
1378
|
"apply_operating_points": {
|
|
1379
1379
|
"doc_first_line": "Apply fitted thresholds to a mixed-class or single-class target slice.",
|
|
@@ -162,6 +162,29 @@ def test_evaluate_folded_reseed_splitter_varies_partitions() -> None:
|
|
|
162
162
|
assert "seed=1/fold=0" in fold_ids
|
|
163
163
|
assert "seed=2/fold=0" in fold_ids
|
|
164
164
|
|
|
165
|
+
# R10-RC3 v1.0.2 hardening (#76): the previous assertions covered
|
|
166
|
+
# COUNT + key existence but did NOT verify the actual partition
|
|
167
|
+
# indices differ across seeds — a regression that silently reused
|
|
168
|
+
# the splitter (R8-C1 pre-fix behavior) could still pass. Directly
|
|
169
|
+
# verify the reseed_splitter callback yields different partitions
|
|
170
|
+
# by replaying it against the splitter.
|
|
171
|
+
splitter = StratifiedKFoldSplitter(k=2, seed=42)
|
|
172
|
+
splits_seed_1 = list(dataclasses.replace(splitter, seed=1).iter_folds(parent, groups=None))
|
|
173
|
+
splits_seed_2 = list(dataclasses.replace(splitter, seed=2).iter_folds(parent, groups=None))
|
|
174
|
+
# _slice_subset resets the child df index to [0..n-1], so compare
|
|
175
|
+
# the underlying text feature values instead (stable across the
|
|
176
|
+
# reset_index drop). Each child slice's `text` column carries the
|
|
177
|
+
# original row labels.
|
|
178
|
+
fold_0_test_texts_seed_1 = set(splits_seed_1[0]["test"].df["text"].tolist())
|
|
179
|
+
fold_0_test_texts_seed_2 = set(splits_seed_2[0]["test"].df["text"].tolist())
|
|
180
|
+
# Different seeds → different fold-0 test partitions (the whole
|
|
181
|
+
# point of reseed_splitter).
|
|
182
|
+
assert fold_0_test_texts_seed_1 != fold_0_test_texts_seed_2, (
|
|
183
|
+
"reseed_splitter callback failed to vary partitions: "
|
|
184
|
+
f"seed=1 fold=0 texts={sorted(fold_0_test_texts_seed_1)[:5]}... "
|
|
185
|
+
f"seed=2 fold=0 texts={sorted(fold_0_test_texts_seed_2)[:5]}..."
|
|
186
|
+
)
|
|
187
|
+
|
|
165
188
|
|
|
166
189
|
@pytest.mark.unit
|
|
167
190
|
def test_evaluate_folded_single_seed_no_deprecation_warning() -> None:
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/baseline/test_plotting_visual/plot_lift_ci.png
RENAMED
|
File without changes
|
{eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/baseline/test_plotting_visual/plot_metric_bars.png
RENAMED
|
File without changes
|
|
File without changes
|
{eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/baseline/test_plotting_visual/plot_pr_curve.png
RENAMED
|
File without changes
|
|
File without changes
|
{eval_toolkit-1.0.1 → eval_toolkit-1.0.2}/tests/baseline/test_plotting_visual/plot_roc_curve.png
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|