eval-toolkit 1.0.1__tar.gz → 1.0.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.3}/CHANGELOG.md +86 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.3}/PKG-INFO +3 -2
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.3}/README.md +2 -1
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.3}/src/eval_toolkit/__init__.py +7 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.3}/src/eval_toolkit/_version.py +1 -1
- eval_toolkit-1.0.3/src/eval_toolkit/audit_value_bindings.py +448 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.3}/src/eval_toolkit/metrics.py +38 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.3}/tests/golden/public_api/snapshot.json +34 -1
- eval_toolkit-1.0.3/tests/test_audit_value_bindings.py +338 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.3}/tests/test_harness_folded.py +23 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.3}/.gitignore +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.3}/LICENSE +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.3}/STYLE.md +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.3}/docs/archive/README.md +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.3}/docs/research/README.md +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.3}/docs/research/datasets/README.md +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.3}/docs/research/papers/data-integrity/README.md +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.3}/docs/research/papers/eval-ecosystem/README.md +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.3}/docs/research/papers/inference/README.md +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.3}/docs/research/papers/prompt-injection/README.md +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.3}/docs/source/adr/README.md +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.3}/docs/source/methodology/README.md +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.3}/pyproject.toml +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.3}/src/eval_toolkit/__main__.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.3}/src/eval_toolkit/_deprecated.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.3}/src/eval_toolkit/_parallel.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.3}/src/eval_toolkit/_rng.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.3}/src/eval_toolkit/_sweep.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.3}/src/eval_toolkit/adversarial.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.3}/src/eval_toolkit/analysis.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.3}/src/eval_toolkit/artifacts.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.3}/src/eval_toolkit/audit_citation_alignment.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.3}/src/eval_toolkit/bootstrap.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.3}/src/eval_toolkit/calibration.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.3}/src/eval_toolkit/claims.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.3}/src/eval_toolkit/config.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.3}/src/eval_toolkit/docs.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.3}/src/eval_toolkit/embeddings.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.3}/src/eval_toolkit/evidence.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.3}/src/eval_toolkit/harness.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.3}/src/eval_toolkit/leakage.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.3}/src/eval_toolkit/loaders.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.3}/src/eval_toolkit/losses.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.3}/src/eval_toolkit/manifest.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.3}/src/eval_toolkit/metric_specs.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.3}/src/eval_toolkit/operating_points.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.3}/src/eval_toolkit/paths.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.3}/src/eval_toolkit/plotting.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.3}/src/eval_toolkit/preprocessing.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.3}/src/eval_toolkit/probes.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.3}/src/eval_toolkit/protocols.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.3}/src/eval_toolkit/provenance.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.3}/src/eval_toolkit/py.typed +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.3}/src/eval_toolkit/schemas/manifest.v1.json +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.3}/src/eval_toolkit/schemas/manifest.v2.json +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.3}/src/eval_toolkit/schemas/manifest.v3.json +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.3}/src/eval_toolkit/schemas/ood_manifest.v1.json +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.3}/src/eval_toolkit/schemas/results.v1.json +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.3}/src/eval_toolkit/schemas/results_full.v1.json +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.3}/src/eval_toolkit/scorecards.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.3}/src/eval_toolkit/seeds.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.3}/src/eval_toolkit/splits.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.3}/src/eval_toolkit/stacking.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.3}/src/eval_toolkit/text_dedup.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.3}/src/eval_toolkit/thresholds.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.3}/tests/baseline/test_plotting_visual/plot_bootstrap_distribution.png +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.3}/tests/baseline/test_plotting_visual/plot_confusion_matrix_grid.png +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.3}/tests/baseline/test_plotting_visual/plot_lift_ci.png +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.3}/tests/baseline/test_plotting_visual/plot_metric_bars.png +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.3}/tests/baseline/test_plotting_visual/plot_pareto_frontier.png +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.3}/tests/baseline/test_plotting_visual/plot_pr_curve.png +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.3}/tests/baseline/test_plotting_visual/plot_reliability_diagram.png +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.3}/tests/baseline/test_plotting_visual/plot_roc_curve.png +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.3}/tests/baseline/test_plotting_visual/plot_score_histograms.png +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.3}/tests/baseline/test_plotting_visual/plot_slice_metric_heatmap.png +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.3}/tests/benchmarks/__init__.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.3}/tests/benchmarks/test_kernel_benchmarks.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.3}/tests/conftest.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.3}/tests/golden/bootstrap_ci/cases.json +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.3}/tests/golden/data/dedup_holdout.jsonl +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.3}/tests/golden/data/dedup_holdout_expected.json +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.3}/tests/golden/data/dedup_holdout_provenance.md +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.3}/tests/golden/docs/expected.md +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.3}/tests/golden/docs/input.md +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.3}/tests/golden/docs/metrics.json +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.3}/tests/golden/test_dedup_holdout_calibration.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.3}/tests/strategies.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.3}/tests/test_adversarial.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.3}/tests/test_analysis.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.3}/tests/test_artifacts.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.3}/tests/test_audit_citation_alignment.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.3}/tests/test_block_bootstrap_on_folds.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.3}/tests/test_bootstrap_calibration_mc.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.3}/tests/test_bootstrap_edge_cases.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.3}/tests/test_bootstrap_golden.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.3}/tests/test_bootstrap_njobs.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.3}/tests/test_bootstrap_props.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.3}/tests/test_bootstrap_research_grounded.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.3}/tests/test_bootstrap_unit.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.3}/tests/test_calibration_binary_adapters.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.3}/tests/test_calibration_bootstrap_chain.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.3}/tests/test_calibration_determinism.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.3}/tests/test_calibration_optimization_failures.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.3}/tests/test_calibration_props.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.3}/tests/test_calibration_research_grounded.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.3}/tests/test_calibration_unit.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.3}/tests/test_claims.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.3}/tests/test_claims_coverage.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.3}/tests/test_claims_props.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.3}/tests/test_cli.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.3}/tests/test_config.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.3}/tests/test_coverage_bootstrap.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.3}/tests/test_coverage_calibration.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.3}/tests/test_coverage_harness.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.3}/tests/test_coverage_metrics.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.3}/tests/test_coverage_plotting.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.3}/tests/test_croissant_e2e.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.3}/tests/test_dedup_split_leakage_chain.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.3}/tests/test_deprecated_scalars_shim.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.3}/tests/test_deprecations.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.3}/tests/test_docs_golden.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.3}/tests/test_docs_props.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.3}/tests/test_embeddings.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.3}/tests/test_evidence_validators.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.3}/tests/test_harness_edge_cases.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.3}/tests/test_harness_fault_injection.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.3}/tests/test_harness_internals.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.3}/tests/test_harness_metric_options.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.3}/tests/test_harness_parallelism.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.3}/tests/test_harness_smoke.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.3}/tests/test_import_boundaries.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.3}/tests/test_is_metric_defined_for_slice.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.3}/tests/test_lazy_extras_messages.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.3}/tests/test_leakage.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.3}/tests/test_leakage_error_paths.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.3}/tests/test_leakage_props.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.3}/tests/test_loaders.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.3}/tests/test_loaders_coverage.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.3}/tests/test_loaders_props.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.3}/tests/test_logging.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.3}/tests/test_losses.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.3}/tests/test_manifest.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.3}/tests/test_manifest_contamination_round_trip.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.3}/tests/test_manifest_props.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.3}/tests/test_manifest_validation.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.3}/tests/test_metrics_props.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.3}/tests/test_metrics_stratified_subsets.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.3}/tests/test_metrics_unit.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.3}/tests/test_misc_coverage.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.3}/tests/test_numeric_edge_cases.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.3}/tests/test_ood_loader.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.3}/tests/test_operating_points.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.3}/tests/test_operating_points_props.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.3}/tests/test_parallel.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.3}/tests/test_paths.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.3}/tests/test_pipeline_e2e.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.3}/tests/test_plotting_edge.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.3}/tests/test_plotting_smoke.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.3}/tests/test_plotting_visual.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.3}/tests/test_preprocessing.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.3}/tests/test_probes.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.3}/tests/test_protocol_conformance.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.3}/tests/test_provenance.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.3}/tests/test_public_api.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.3}/tests/test_recall_at_fpr.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.3}/tests/test_reference_equivalence.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.3}/tests/test_reproducibility_integration.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.3}/tests/test_rng.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.3}/tests/test_schemas.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.3}/tests/test_scorecard.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.3}/tests/test_seeds.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.3}/tests/test_splits.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.3}/tests/test_splits_leakage_integration.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.3}/tests/test_splits_props.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.3}/tests/test_stacking.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.3}/tests/test_sweep.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.3}/tests/test_text_dedup.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.3}/tests/test_text_dedup_coverage.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.3}/tests/test_text_dedup_props.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.3}/tests/test_text_dedup_strategies.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.3}/tests/test_thresholds.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.3}/tests/test_thresholds_constant_score.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.3}/tests/test_thresholds_coverage.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.3}/tests/test_thresholds_props.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.3}/tests/test_thresholds_research_grounded.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.3}/tests/test_tokenization_leakage_check.py +0 -0
- {eval_toolkit-1.0.1 → eval_toolkit-1.0.3}/tests/test_v09_contracts.py +0 -0
|
@@ -5,6 +5,92 @@ All notable changes to this project will be documented in this file.
|
|
|
5
5
|
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
|
|
6
6
|
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
|
7
7
|
|
|
8
|
+
## [1.0.3] — 2026-05-26 — `audit_value_bindings` module (closes #71)
|
|
9
|
+
|
|
10
|
+
Tier-2 ADDITIVE — second member of the audit-validator family
|
|
11
|
+
following `audit_citation_alignment` (v1.0.1). Flat-module per
|
|
12
|
+
[ADR 0001](docs/source/adr/0001-flat-module-layout.md).
|
|
13
|
+
|
|
14
|
+
### Added
|
|
15
|
+
|
|
16
|
+
- **`audit_value_bindings` module** exporting
|
|
17
|
+
`validate_reader_value_bindings()` + `Match` + `Violation` +
|
|
18
|
+
`ValueBindingsReport` as Tier 1 STRICT (per
|
|
19
|
+
[ADR 0003](docs/source/adr/0003-stability-contract-and-gate3-methodology.md)).
|
|
20
|
+
Catches the bug class where a markdown surface pairs a detector name
|
|
21
|
+
with the **wrong** canonical value — both values exist in the
|
|
22
|
+
source-of-truth table but the binding is misordered. Motivated by
|
|
23
|
+
the consumer V1.3.1 ADR-080 audit-fix patch closure (2026-05-22)
|
|
24
|
+
where `WRITEUP_NARRATIVE.md:38` said "TF-IDF + logistic regression
|
|
25
|
+
baseline reaches 0.974 AUPRC" but canonical TF-IDF direct val AUPRC
|
|
26
|
+
is 0.971 (0.974 was LoRA's value). The existing `audit_numbers.py`
|
|
27
|
+
validates VALUES against source data but not BINDINGS — this
|
|
28
|
+
validator closes that gap.
|
|
29
|
+
- Cross-detector disambiguation: when multiple detectors and values
|
|
30
|
+
appear in the same paragraph (e.g., "TF-IDF achieves 0.971, while
|
|
31
|
+
LoRA reaches 0.974"), each value pairs with the LAST detector
|
|
32
|
+
appearing before it in text order (falling back to first detector
|
|
33
|
+
after if no before-detector is in range). Avoids false-positive
|
|
34
|
+
bindings across closely-spaced detector mentions.
|
|
35
|
+
- Coverage metric: `ValueBindingsReport.coverage` reports the fraction
|
|
36
|
+
of `(detector, metric)` keys in the canonical `bindings` dict that
|
|
37
|
+
produced at least one `Match` — useful for detecting stale or
|
|
38
|
+
unreferenced bindings in reader prose.
|
|
39
|
+
- 13 tests at `tests/test_audit_value_bindings.py` including the
|
|
40
|
+
verbatim WRITEUP_NARRATIVE seed-case regression, alias resolution
|
|
41
|
+
(detector + metric), distance-window edge, value-without-metric
|
|
42
|
+
skip, coverage fraction, tolerance band, multi-detector
|
|
43
|
+
disambiguation, frozen-dataclass invariants. Closes #71.
|
|
44
|
+
|
|
45
|
+
## [1.0.2] — 2026-05-26 — #76 cleanup batch closes (RC2 + RC3 + F-metrics-1/3/4)
|
|
46
|
+
|
|
47
|
+
Closes the GH #76 v1.0.1 cleanup tracker. All 6 items shipped across
|
|
48
|
+
v1.0.1 (RC4) and v1.0.2 (this release). All P3, all NON-BREAKING.
|
|
49
|
+
|
|
50
|
+
### Changed (Tier-2 ADDITIVE: contract clarification only)
|
|
51
|
+
|
|
52
|
+
- **RC2** (#76) — `SimilarityStrategy` Protocol promoted from
|
|
53
|
+
"pre-v0.7 internal interface" (prose framing only) to formal
|
|
54
|
+
10th strict Tier-2 Protocol per [ADR 0003](docs/source/adr/0003-stability-contract-and-gate3-methodology.md).
|
|
55
|
+
Aligns prose surfaces (README, extending.md, strict_tier2_protocols.md,
|
|
56
|
+
api/protocols.md, ADR 0004 §D6, roadmap.md) with the contract
|
|
57
|
+
already locked in `tests/golden/public_api/snapshot.json` +
|
|
58
|
+
`src/eval_toolkit/__init__.py:_EXPORTS` since v1.0.0. **No code
|
|
59
|
+
change — documentation-only reconciliation.** Strict-Tier-2 count
|
|
60
|
+
goes 9 → 10 (+ 1 opt-in `Versioned`).
|
|
61
|
+
|
|
62
|
+
### Fixed
|
|
63
|
+
|
|
64
|
+
- **RC3** (#76) — `tests/test_harness_folded.py::test_evaluate_folded_reseed_splitter_varies_partitions`
|
|
65
|
+
test hardening. Previous assertions covered count + key existence
|
|
66
|
+
only; a regression silently reusing the splitter (R8-C1 pre-fix
|
|
67
|
+
behavior) could still pass. v1.0.2 adds row-content comparison:
|
|
68
|
+
replays `reseed_splitter` against the splitter for `seed=1` vs
|
|
69
|
+
`seed=2` and asserts fold-0 test partitions differ via feature-text
|
|
70
|
+
set membership (robust to `_slice_subset`'s `reset_index(drop=True)`
|
|
71
|
+
via stable text-column identifiers).
|
|
72
|
+
|
|
73
|
+
- **F-metrics-1** (#76) — `brier_score` docstring input-domain clarity.
|
|
74
|
+
Added explicit "Input domain" Notes subsection clarifying binary
|
|
75
|
+
labels in `{0, 1}` + calibrated probabilities in `[0, 1]` are
|
|
76
|
+
required; raw logits or unbounded ranking scores pass the finiteness
|
|
77
|
+
check but produce out-of-range MSE that misrepresents calibration
|
|
78
|
+
quality. Includes calibration-applying recipe pointer.
|
|
79
|
+
|
|
80
|
+
- **F-metrics-3** (#76) — `expected_calibration_error` docstring
|
|
81
|
+
uniform-scores note. Added explicit Notes subsection documenting
|
|
82
|
+
that constant `y_score` returns 0.0 (per-bin formula trivially
|
|
83
|
+
satisfied) but is semantically misleading — uninformative scorers
|
|
84
|
+
look "perfectly calibrated" despite zero discriminative power.
|
|
85
|
+
Callers should filter constant inputs before ECE.
|
|
86
|
+
|
|
87
|
+
- **F-metrics-4** (#76) — `brier_score` docstring single-class
|
|
88
|
+
edge-case explicit. Added Notes subsection with closed-form
|
|
89
|
+
expressions for all-zeros (`BS = mean(p²)`) and all-ones
|
|
90
|
+
(`BS = mean((1-p)²)`) cases. Explicit confirmation that
|
|
91
|
+
per-slice degenerate-class evaluation is supported (unlike
|
|
92
|
+
PR-AUC / ROC-AUC).
|
|
93
|
+
|
|
8
94
|
## [1.0.1] — 2026-05-25 — audit_citation_alignment + RC4 docs polish
|
|
9
95
|
|
|
10
96
|
First v1.x patch release. Ships the `audit_citation_alignment` validator
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: eval-toolkit
|
|
3
|
-
Version: 1.0.1
|
|
3
|
+
Version: 1.0.3
|
|
4
4
|
Summary: Reusable evaluation contracts for binary classification: metrics, bootstrap CIs, calibration, artifacts, and evidence gates.
|
|
5
5
|
Project-URL: Homepage, https://github.com/brandon-behring/eval-toolkit
|
|
6
6
|
Project-URL: Documentation, https://brandon-behring.github.io/eval-toolkit/
|
|
@@ -115,7 +115,8 @@ format changes.
|
|
|
115
115
|
├─ Tier 2 ─ Protocol-based orchestration ────────────────┤
|
|
116
116
|
│ Scorer / SliceAwareScorer / LeakageCheck / Splitter │
|
|
117
117
|
│ ThresholdSelector / DatasetLoader / MetricSpec │
|
|
118
|
-
│ MetaLearner / Probe / TextTransform │
|
|
118
|
+
│ MetaLearner / Probe / TextTransform / │
|
|
119
|
+
│ SimilarityStrategy (10 strict) │
|
|
119
120
|
│ Versioned (opt-in: per-object versions in manifest) │
|
|
120
121
|
├─ Tier 1 ─ Functional core ─────────────────────────────┤
|
|
121
122
|
│ pr_auc / roc_auc / ECE variants / Brier / bootstrap_ci│
|
|
@@ -32,7 +32,8 @@ format changes.
|
|
|
32
32
|
├─ Tier 2 ─ Protocol-based orchestration ────────────────┤
|
|
33
33
|
│ Scorer / SliceAwareScorer / LeakageCheck / Splitter │
|
|
34
34
|
│ ThresholdSelector / DatasetLoader / MetricSpec │
|
|
35
|
-
│ MetaLearner / Probe / TextTransform │
|
|
35
|
+
│ MetaLearner / Probe / TextTransform / │
|
|
36
|
+
│ SimilarityStrategy (10 strict) │
|
|
36
37
|
│ Versioned (opt-in: per-object versions in manifest) │
|
|
37
38
|
├─ Tier 1 ─ Functional core ─────────────────────────────┤
|
|
38
39
|
│ pr_auc / roc_auc / ECE variants / Brier / bootstrap_ci│
|
|
@@ -60,6 +60,13 @@ _EXPORTS: dict[str, str] = {
|
|
|
60
60
|
"CitationMisalignment": "eval_toolkit.audit_citation_alignment",
|
|
61
61
|
"extract_adr_subject_category": "eval_toolkit.audit_citation_alignment",
|
|
62
62
|
"validate_citations": "eval_toolkit.audit_citation_alignment",
|
|
63
|
+
# --- audit_value_bindings ---
|
|
64
|
+
# Flat-module per ADR 0001. Closes #71. Motivated by consumer V1.3.1
|
|
65
|
+
# ADR-080 audit-fix finding (TF-IDF / LoRA 0.974 value mis-binding).
|
|
66
|
+
"Match": "eval_toolkit.audit_value_bindings",
|
|
67
|
+
"ValueBindingsReport": "eval_toolkit.audit_value_bindings",
|
|
68
|
+
"Violation": "eval_toolkit.audit_value_bindings",
|
|
69
|
+
"validate_reader_value_bindings": "eval_toolkit.audit_value_bindings",
|
|
63
70
|
# --- losses ---
|
|
64
71
|
"RecallAtLowFPR": "eval_toolkit.losses",
|
|
65
72
|
# --- preprocessing ---
|
|
@@ -0,0 +1,448 @@
|
|
|
1
|
+
r"""Reader-prose value-binding validator.
|
|
2
|
+
|
|
3
|
+
Catches the bug class where a reader-facing markdown surface pairs a
|
|
4
|
+
detector name with the **wrong** canonical value — both values are
|
|
5
|
+
present in the source-of-truth table, but the binding is misordered.
|
|
6
|
+
|
|
7
|
+
Motivating test case (from `prompt-injection-detection-prototype`
|
|
8
|
+
v1.3.1 audit-fix, ADR-080 patch closure 2026-05-22)::
|
|
9
|
+
|
|
10
|
+
WRITEUP_NARRATIVE.md:38:
|
|
11
|
+
"The TF-IDF + logistic regression baseline reaches 0.974 AUPRC
|
|
12
|
+
on balanced direct-versus-benign validation."
|
|
13
|
+
|
|
14
|
+
Canonical: TF-IDF direct val AUPRC = 0.971; LoRA direct val AUPRC =
|
|
15
|
+
0.974. Both values exist in the bindings table; the bug is the wrong
|
|
16
|
+
(detector, value) pairing. The pre-existing ``audit_numbers.py``-style
|
|
17
|
+
primitive validates VALUES against source data; this validator
|
|
18
|
+
validates BINDINGS — that each prose-mentioned (detector_token,
|
|
19
|
+
metric_token, value) triple matches the canonical binding.
|
|
20
|
+
|
|
21
|
+
Design (per ADR 0001 flat-module + ADR 0002 closed-config + ADR 0003
|
|
22
|
+
Tier 1 STRICT public-API contract):
|
|
23
|
+
|
|
24
|
+
- Consumer supplies the canonical-binding table + value/metric/detector
|
|
25
|
+
regex patterns; validator handles position-aware regex scan + binding
|
|
26
|
+
lookup + report assembly.
|
|
27
|
+
- Flat-module: `eval_toolkit.audit_value_bindings.*` (NOT a subpackage
|
|
28
|
+
per ADR 0001 stay-flat-through-v1.x).
|
|
29
|
+
- All Tier-1 STRICT public symbols (`validate_reader_value_bindings`,
|
|
30
|
+
`Match`, `Violation`, `ValueBindingsReport`) re-exported at top level
|
|
31
|
+
via `_EXPORTS` lazy resolver.
|
|
32
|
+
|
|
33
|
+
Closes upstream issue #71. v1.0.3.
|
|
34
|
+
"""
|
|
35
|
+
|
|
36
|
+
from __future__ import annotations
|
|
37
|
+
|
|
38
|
+
import re
|
|
39
|
+
from collections.abc import Mapping, Sequence
|
|
40
|
+
from dataclasses import dataclass
|
|
41
|
+
from pathlib import Path
|
|
42
|
+
from types import MappingProxyType
|
|
43
|
+
|
|
44
|
+
__all__ = [
|
|
45
|
+
"Match",
|
|
46
|
+
"ValueBindingsReport",
|
|
47
|
+
"Violation",
|
|
48
|
+
"validate_reader_value_bindings",
|
|
49
|
+
]
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
DEFAULT_VALUE_PATTERN: str = r"\d+\.\d{2,4}"
|
|
53
|
+
DEFAULT_MAX_DISTANCE_CHARS: int = 80
|
|
54
|
+
DEFAULT_TOLERANCE: float = 1e-4
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
@dataclass(frozen=True)
|
|
58
|
+
class Match:
|
|
59
|
+
"""A reader-prose (detector, metric, value) triple that matches the canonical binding.
|
|
60
|
+
|
|
61
|
+
Attributes
|
|
62
|
+
----------
|
|
63
|
+
file : Path
|
|
64
|
+
File where the match was found.
|
|
65
|
+
line : int
|
|
66
|
+
1-indexed line number of the value occurrence.
|
|
67
|
+
detector : str
|
|
68
|
+
Canonical detector key from the ``bindings`` dict (NOT the
|
|
69
|
+
regex-matched surface form).
|
|
70
|
+
metric : str
|
|
71
|
+
Canonical metric key from the ``bindings`` dict.
|
|
72
|
+
value : float
|
|
73
|
+
The numeric value found in the prose.
|
|
74
|
+
"""
|
|
75
|
+
|
|
76
|
+
file: Path
|
|
77
|
+
line: int
|
|
78
|
+
detector: str
|
|
79
|
+
metric: str
|
|
80
|
+
value: float
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
@dataclass(frozen=True)
|
|
84
|
+
class Violation:
|
|
85
|
+
"""A reader-prose (detector, metric, value) triple where the value disagrees with the canonical binding.
|
|
86
|
+
|
|
87
|
+
Attributes
|
|
88
|
+
----------
|
|
89
|
+
file : Path
|
|
90
|
+
File where the violation was found.
|
|
91
|
+
line : int
|
|
92
|
+
1-indexed line number of the offending value occurrence.
|
|
93
|
+
detector : str
|
|
94
|
+
Canonical detector key from the ``bindings`` dict (NOT the
|
|
95
|
+
regex-matched surface form).
|
|
96
|
+
metric : str
|
|
97
|
+
Canonical metric key from the ``bindings`` dict.
|
|
98
|
+
found_value : float
|
|
99
|
+
The numeric value the prose claims.
|
|
100
|
+
expected_value : float
|
|
101
|
+
The canonical value from the ``bindings`` dict.
|
|
102
|
+
surrounding_text : str
|
|
103
|
+
Excerpt centered on the value (configurable window) for
|
|
104
|
+
diagnostic display.
|
|
105
|
+
"""
|
|
106
|
+
|
|
107
|
+
file: Path
|
|
108
|
+
line: int
|
|
109
|
+
detector: str
|
|
110
|
+
metric: str
|
|
111
|
+
found_value: float
|
|
112
|
+
expected_value: float
|
|
113
|
+
surrounding_text: str
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
@dataclass(frozen=True)
|
|
117
|
+
class ValueBindingsReport:
|
|
118
|
+
"""Result of :func:`validate_reader_value_bindings`.
|
|
119
|
+
|
|
120
|
+
Attributes
|
|
121
|
+
----------
|
|
122
|
+
violations : tuple[Violation, ...]
|
|
123
|
+
Each detected (detector, metric) → wrong-value triple. Empty
|
|
124
|
+
tuple if all reader-prose bindings match the canonical table.
|
|
125
|
+
matched : tuple[Match, ...]
|
|
126
|
+
Each detected (detector, metric, value) triple that matched
|
|
127
|
+
the canonical binding. Useful for coverage analysis +
|
|
128
|
+
regression-testing that the validator's regexes still fire.
|
|
129
|
+
coverage : float
|
|
130
|
+
Fraction of ``(detector, metric)`` keys in the ``bindings``
|
|
131
|
+
dict that produced at least one :class:`Match`. Range
|
|
132
|
+
``[0.0, 1.0]``. ``1.0`` means every binding was referenced in
|
|
133
|
+
the scanned prose; lower values flag potentially un-cited
|
|
134
|
+
bindings (which may be expected OR may indicate stale prose).
|
|
135
|
+
"""
|
|
136
|
+
|
|
137
|
+
violations: tuple[Violation, ...]
|
|
138
|
+
matched: tuple[Match, ...]
|
|
139
|
+
coverage: float
|
|
140
|
+
|
|
141
|
+
|
|
142
|
+
def validate_reader_value_bindings(
    *,
    files: Sequence[Path | str],
    bindings: Mapping[tuple[str, str], float],
    value_pattern: str = DEFAULT_VALUE_PATTERN,
    max_distance_chars: int = DEFAULT_MAX_DISTANCE_CHARS,
    metric_aliases: Mapping[str, Sequence[str]] = MappingProxyType({}),
    detector_aliases: Mapping[str, Sequence[str]] = MappingProxyType({}),
    tolerance: float = DEFAULT_TOLERANCE,
) -> ValueBindingsReport:
    """Validate (detector, metric, value) bindings in reader-prose markdown.

    For each ``(detector_token, metric_token) -> expected_value`` entry
    in ``bindings``, scan each file for triples of (detector mention,
    metric mention, numeric value) within a ``max_distance_chars``
    window. Compare the found value to the expected value; emit a
    :class:`Violation` on mismatch, a :class:`Match` on agreement.

    Both the detector and the metric must appear within the window
    surrounding a candidate value for the triple to be considered —
    a value that has only a detector or only a metric nearby is
    ignored (those belong to a value-existence audit, not a binding
    audit).

    Parameters
    ----------
    files : Sequence[Path | str]
        Markdown files to scan. UTF-8 encoded.
    bindings : Mapping[tuple[str, str], float]
        Canonical (detector_name, metric_name) → expected_value table.
        Keys are the canonical *identifiers* used in the report — the
        regex patterns that match these in prose come from the
        ``*_aliases`` dicts (with the canonical name as a default
        fallback pattern).
    value_pattern : str, optional
        Regex matching numeric values in prose. Default matches
        ``\\d+\\.\\d{2,4}`` (1+ integer part, 2-4 decimals).
    max_distance_chars : int, optional
        Maximum character distance allowed between a detector mention,
        a metric mention, and a numeric value for them to be treated
        as a triple. Default 80.
    metric_aliases : Mapping[str, Sequence[str]], optional
        ``metric_name -> [regex_alternatives, ...]``. Each canonical
        metric name in ``bindings`` may have multiple natural-language
        forms (e.g., ``"direct_val_auprc"`` matches both
        ``"direct .*? AUPRC"`` and ``"validation AUPRC"``). The escaped
        canonical name is always included as one alternative, so keys
        without aliases match the canonical name itself.
    detector_aliases : Mapping[str, Sequence[str]], optional
        Same shape as ``metric_aliases``, applied case-insensitively.
        Useful for ``"tf-idf + lr"`` → ``["TF-IDF", "TfIdf", "tfidf"]``.
    tolerance : float, optional
        Absolute tolerance for float comparison. Default ``1e-4``
        (i.e., ``0.974`` and ``0.9740`` are considered equal).

    Returns
    -------
    ValueBindingsReport
        ``violations``, ``matched``, ``coverage`` per the dataclass.

    Examples
    --------
    >>> from pathlib import Path
    >>> import tempfile
    >>> with tempfile.NamedTemporaryFile(suffix=".md", mode="w", delete=False) as f:
    ...     _ = f.write("TF-IDF + LR reaches 0.974 AUPRC on direct val.\\n")
    ...     path = Path(f.name)
    >>> report = validate_reader_value_bindings(
    ...     files=[path],
    ...     bindings={("tf-idf + lr", "direct_val_auprc"): 0.971},
    ...     detector_aliases={"tf-idf + lr": ["TF-IDF"]},
    ...     metric_aliases={"direct_val_auprc": ["direct val"]},
    ... )
    >>> len(report.violations)
    1
    >>> report.violations[0].found_value
    0.974
    >>> report.violations[0].expected_value
    0.971

    Notes
    -----
    The validator is **pure**: consumer-side scripts glob markdown
    files and parse canonical-binding tables (e.g., from a JSON
    results file); this function does the regex + window + comparison
    work and returns a structured report.

    Multiple candidate values within the same detector+metric window
    each produce their own Match / Violation entry. A single value
    occurrence is reported at most once per (detector, metric) binding
    and file, even when several mentions of the same detector have
    windows that overlap it. Coverage counts a (detector, metric) key
    as covered iff at least one Match was emitted for it (Violations
    don't count toward coverage — a misbound mention proves the
    binding was REACHED but disproves it was correct; the report makes
    both signals available).

    Case-sensitivity: detector and metric regexes are applied with
    ``re.IGNORECASE``. The canonical names in ``bindings`` are used
    verbatim in report keys regardless of how they were matched in
    prose.

    See Also
    --------
    eval_toolkit.audit_citation_alignment.validate_citations :
        Sibling validator catching ADR-citation alignment drift.
    """
    files_resolved = tuple(Path(f) for f in files)

    # Snapshot the mappings so later mutation by the caller cannot
    # affect an in-progress scan.
    bindings_dict = dict(bindings)
    metric_aliases_dict = dict(metric_aliases)
    detector_aliases_dict = dict(detector_aliases)

    detector_keys = sorted({d for d, _ in bindings_dict})
    metric_keys = sorted({m for _, m in bindings_dict})

    # Compile each pattern once, outside the per-file loop.
    detector_patterns: dict[str, re.Pattern[str]] = {
        d: _build_pattern(d, detector_aliases_dict.get(d, ()), case_insensitive=True)
        for d in detector_keys
    }
    metric_patterns: dict[str, re.Pattern[str]] = {
        m: _build_pattern(m, metric_aliases_dict.get(m, ()), case_insensitive=True)
        for m in metric_keys
    }
    value_re = re.compile(value_pattern)

    violations: list[Violation] = []
    matched: list[Match] = []
    matched_keys: set[tuple[str, str]] = set()

    for file_path in files_resolved:
        text = file_path.read_text(encoding="utf-8")
        line_starts = _line_starts(text)

        # Pre-collect ALL detector positions (across every canonical
        # detector key) so each value can be paired with exactly one
        # detector. This avoids cross-detector contamination — e.g.,
        # "TF-IDF achieves 0.971, while LoRA reaches 0.974" should
        # pair 0.971 with TF-IDF and 0.974 with LoRA, NOT pair the
        # 0.974 with TF-IDF's binding just because they happen to be
        # within max_distance_chars of each other.
        detector_positions: list[tuple[int, str]] = []  # (position, canonical_key)
        for det_key, det_re in detector_patterns.items():
            for det_match in det_re.finditer(text):
                detector_positions.append((det_match.start(), det_key))
        detector_positions.sort()

        # Value occurrences already reported for this file, keyed by
        # (detector, metric, absolute value offset). Windows of nearby
        # mentions of the same detector overlap, and without this guard
        # the same occurrence would be emitted once per mention.
        emitted: set[tuple[str, str, int]] = set()

        # For each binding, look in this file for triples.
        for (det_key, met_key), expected in bindings_dict.items():
            det_re = detector_patterns[det_key]
            met_re = metric_patterns[met_key]

            for det_match in det_re.finditer(text):
                window_start = max(0, det_match.start() - max_distance_chars)
                window_end = min(len(text), det_match.end() + max_distance_chars)
                window_text = text[window_start:window_end]

                # Both metric and a value must appear in the window.
                met_hits = list(met_re.finditer(window_text))
                if not met_hits:
                    continue

                for val_match in value_re.finditer(window_text):
                    # Skip values immediately adjacent to digits (avoid
                    # picking up e.g., "0.974" inside "10.974" or version
                    # strings like "1.0.974"). Simple heuristic: the
                    # character before the match (if any) must not be a
                    # digit or dot.
                    val_start_in_full = window_start + val_match.start()
                    if val_start_in_full > 0:
                        prev_char = text[val_start_in_full - 1]
                        if prev_char.isdigit() or prev_char == ".":
                            continue

                    val_str = val_match.group(0)
                    try:
                        found = float(val_str)
                    except ValueError:  # pragma: no cover
                        # A custom ``value_pattern`` may match text that
                        # is not a parseable float; ignore such hits.
                        continue

                    # Cross-detector disambiguation: require the current
                    # det_key to be the detector paired with this value
                    # by the text-order rule (last detector before; else
                    # first detector after). Avoids cross-contamination
                    # on multi-detector prose like "TF-IDF achieves
                    # 0.971, while LoRA reaches 0.974".
                    paired_key = _nearest_detector_key(
                        detector_positions, val_start_in_full, max_distance_chars
                    )
                    if paired_key != det_key:
                        continue

                    # Require the metric mention be within distance of the
                    # value too, not just within the detector window. Both
                    # offsets are window-relative, so the difference equals
                    # the distance in the full text.
                    met_close = any(
                        abs(mh.start() - val_match.start()) <= max_distance_chars
                        for mh in met_hits
                    )
                    if not met_close:
                        continue

                    # Report each value occurrence once per binding.
                    occurrence = (det_key, met_key, val_start_in_full)
                    if occurrence in emitted:
                        continue
                    emitted.add(occurrence)

                    line_no = _position_to_line(line_starts, val_start_in_full)
                    if abs(found - expected) <= tolerance:
                        matched.append(
                            Match(
                                file=file_path,
                                line=line_no,
                                detector=det_key,
                                metric=met_key,
                                value=found,
                            )
                        )
                        matched_keys.add((det_key, met_key))
                    else:
                        # Widen the surrounding context for diagnostic
                        # clarity. Center on the value but include
                        # ±60 chars to typically capture the detector
                        # mention.
                        ctx_start = max(0, val_start_in_full - 60)
                        ctx_end = min(len(text), val_start_in_full + len(val_str) + 60)
                        surrounding = text[ctx_start:ctx_end].replace("\n", " ").strip()
                        violations.append(
                            Violation(
                                file=file_path,
                                line=line_no,
                                detector=det_key,
                                metric=met_key,
                                found_value=found,
                                expected_value=expected,
                                surrounding_text=surrounding,
                            )
                        )

    # An empty bindings table has nothing to cover; report 0.0 rather
    # than dividing by zero.
    coverage = len(matched_keys) / len(bindings_dict) if bindings_dict else 0.0
    return ValueBindingsReport(
        violations=tuple(violations),
        matched=tuple(matched),
        coverage=coverage,
    )
|
|
378
|
+
|
|
379
|
+
|
|
380
|
+
def _build_pattern(
|
|
381
|
+
canonical: str,
|
|
382
|
+
aliases: Sequence[str],
|
|
383
|
+
*,
|
|
384
|
+
case_insensitive: bool,
|
|
385
|
+
) -> re.Pattern[str]:
|
|
386
|
+
"""Build an OR-joined regex covering canonical name + aliases."""
|
|
387
|
+
parts = [re.escape(canonical), *aliases]
|
|
388
|
+
pattern = "|".join(f"(?:{p})" for p in parts)
|
|
389
|
+
flags = re.IGNORECASE if case_insensitive else 0
|
|
390
|
+
return re.compile(pattern, flags)
|
|
391
|
+
|
|
392
|
+
|
|
393
|
+
def _line_starts(text: str) -> list[int]:
|
|
394
|
+
"""Return character positions where each line starts. line[i] starts at line_starts[i]."""
|
|
395
|
+
starts = [0]
|
|
396
|
+
for i, ch in enumerate(text):
|
|
397
|
+
if ch == "\n":
|
|
398
|
+
starts.append(i + 1)
|
|
399
|
+
return starts
|
|
400
|
+
|
|
401
|
+
|
|
402
|
+
def _nearest_detector_key(
|
|
403
|
+
detector_positions: Sequence[tuple[int, str]],
|
|
404
|
+
value_pos: int,
|
|
405
|
+
max_distance: int,
|
|
406
|
+
) -> str | None:
|
|
407
|
+
"""Return the canonical detector key paired with ``value_pos``, or None.
|
|
408
|
+
|
|
409
|
+
Pairing rule: pick the LAST detector that appears BEFORE the value
|
|
410
|
+
(text-order); if none is within ``max_distance``, fall back to the
|
|
411
|
+
FIRST detector that appears AFTER the value within the same range.
|
|
412
|
+
This matches natural English prose patterns "<detector> ...
|
|
413
|
+
<value>" (predominant) and "<value> ... by <detector>" (rare).
|
|
414
|
+
|
|
415
|
+
The previous "absolute-distance nearest" heuristic produced false
|
|
416
|
+
positives on prose like "TF-IDF achieves 0.971, while LoRA reaches
|
|
417
|
+
0.974" where 0.971 is closer to LoRA in raw distance even though
|
|
418
|
+
it semantically belongs to TF-IDF.
|
|
419
|
+
"""
|
|
420
|
+
if not detector_positions:
|
|
421
|
+
return None
|
|
422
|
+
# Look for the LAST detector strictly before the value, within range.
|
|
423
|
+
last_before: str | None = None
|
|
424
|
+
for pos, key in detector_positions:
|
|
425
|
+
if pos < value_pos and (value_pos - pos) <= max_distance:
|
|
426
|
+
last_before = key
|
|
427
|
+
elif pos >= value_pos:
|
|
428
|
+
break
|
|
429
|
+
if last_before is not None:
|
|
430
|
+
return last_before
|
|
431
|
+
# Fall back: FIRST detector after the value, within range.
|
|
432
|
+
for pos, key in detector_positions:
|
|
433
|
+
if pos >= value_pos and (pos - value_pos) <= max_distance:
|
|
434
|
+
return key
|
|
435
|
+
return None
|
|
436
|
+
|
|
437
|
+
|
|
438
|
+
def _position_to_line(line_starts: list[int], pos: int) -> int:
|
|
439
|
+
"""Convert a 0-indexed character position to a 1-indexed line number."""
|
|
440
|
+
# Binary-search-like; line_starts is sorted.
|
|
441
|
+
lo, hi = 0, len(line_starts) - 1
|
|
442
|
+
while lo < hi:
|
|
443
|
+
mid = (lo + hi + 1) // 2
|
|
444
|
+
if line_starts[mid] <= pos:
|
|
445
|
+
lo = mid
|
|
446
|
+
else:
|
|
447
|
+
hi = mid - 1
|
|
448
|
+
return lo + 1
|
|
@@ -792,6 +792,20 @@ def expected_calibration_error(
|
|
|
792
792
|
empirical positive rate in the bin, and :math:`\\mathrm{conf}` is the
|
|
793
793
|
mean predicted score.
|
|
794
794
|
|
|
795
|
+
**Uniform / uninformative scores** (F-metrics-3 v1.0.2 clarity pass):
|
|
796
|
+
when ``y_score`` is constant (e.g., ``[0.5] * n`` — an uninformative
|
|
797
|
+
detector), this function returns ``0.0`` regardless of the true label
|
|
798
|
+
distribution. That's technically correct per the formula —
|
|
799
|
+
:math:`|\\mathrm{acc}(B_m) - \\mathrm{conf}(B_m)|` measures bin-level
|
|
800
|
+
calibration, and a single occupied bin with ``conf = base rate``
|
|
801
|
+
achieves perfect calibration locally. But it is semantically
|
|
802
|
+
misleading: an uninformative scorer looks "perfectly calibrated"
|
|
803
|
+
even though it has zero discriminative power. **Callers should
|
|
804
|
+
detect and filter uninformative inputs before passing to ECE** —
|
|
805
|
+
e.g., reject when ``np.unique(y_score).size == 1`` or when the
|
|
806
|
+
score variance is below a domain-specific threshold. Use
|
|
807
|
+
:func:`brier_score` or :func:`pr_auc` for resolution-aware metrics.
|
|
808
|
+
|
|
795
809
|
References
|
|
796
810
|
----------
|
|
797
811
|
.. [1] DeGroot, M. H. & Fienberg, S. E. "The comparison and evaluation of
|
|
@@ -1240,6 +1254,30 @@ def brier_score(
|
|
|
1240
1254
|
-----
|
|
1241
1255
|
.. math:: \mathrm{BS} = \frac{1}{n} \sum_i (p_i - y_i)^2
|
|
1242
1256
|
|
|
1257
|
+
**Input domain** (F-metrics-1 v1.0.2 clarity pass): ``y_true`` must
|
|
1258
|
+
be binary labels in ``{0, 1}`` (other label values raise
|
|
1259
|
+
``ValueError``). ``y_score`` must be calibrated probabilities in
|
|
1260
|
+
``[0, 1]`` — raw logits or unbounded ranking scores will pass the
|
|
1261
|
+
finiteness check but produce an out-of-range MSE that misrepresents
|
|
1262
|
+
calibration quality. If your scorer produces logits, apply
|
|
1263
|
+
sigmoid / softmax / a fitted calibrator (see
|
|
1264
|
+
:mod:`eval_toolkit.calibration`) before passing to ``brier_score``.
|
|
1265
|
+
|
|
1266
|
+
**Single-class behavior** (F-metrics-4 v1.0.2 clarity pass): unlike
|
|
1267
|
+
PR-AUC / ROC-AUC, ``brier_score`` is well-defined when ``y_true``
|
|
1268
|
+
is all-zeros or all-ones — it degenerates to the MSE around the
|
|
1269
|
+
constant class label. Specifically:
|
|
1270
|
+
|
|
1271
|
+
- All-zeros: :math:`\mathrm{BS} = \frac{1}{n} \sum_i p_i^2` —
|
|
1272
|
+
forecasting any positive probability incurs squared-error loss.
|
|
1273
|
+
- All-ones: :math:`\mathrm{BS} = \frac{1}{n} \sum_i (1 - p_i)^2`
|
|
1274
|
+
— forecasting low probability incurs squared-error loss.
|
|
1275
|
+
|
|
1276
|
+
This is the deliberate Brier-as-strict-proper-scoring-rule behavior
|
|
1277
|
+
(Brier 1950). Per-slice degenerate-class evaluation is supported
|
|
1278
|
+
via the ``empty_strategy`` parameter for ``n=0`` only; non-empty
|
|
1279
|
+
single-class slices score normally.
|
|
1280
|
+
|
|
1243
1281
|
See Also
|
|
1244
1282
|
--------
|
|
1245
1283
|
eval_toolkit.metrics.brier_decomposition :
|