eval-toolkit 1.0.2__tar.gz → 1.0.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/CHANGELOG.md +37 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/PKG-INFO +1 -1
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/src/eval_toolkit/__init__.py +7 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/src/eval_toolkit/_version.py +1 -1
- eval_toolkit-1.0.3/src/eval_toolkit/audit_value_bindings.py +448 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/golden/public_api/snapshot.json +34 -1
- eval_toolkit-1.0.3/tests/test_audit_value_bindings.py +338 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/.gitignore +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/LICENSE +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/README.md +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/STYLE.md +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/docs/archive/README.md +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/docs/research/README.md +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/docs/research/datasets/README.md +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/docs/research/papers/data-integrity/README.md +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/docs/research/papers/eval-ecosystem/README.md +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/docs/research/papers/inference/README.md +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/docs/research/papers/prompt-injection/README.md +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/docs/source/adr/README.md +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/docs/source/methodology/README.md +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/pyproject.toml +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/src/eval_toolkit/__main__.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/src/eval_toolkit/_deprecated.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/src/eval_toolkit/_parallel.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/src/eval_toolkit/_rng.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/src/eval_toolkit/_sweep.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/src/eval_toolkit/adversarial.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/src/eval_toolkit/analysis.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/src/eval_toolkit/artifacts.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/src/eval_toolkit/audit_citation_alignment.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/src/eval_toolkit/bootstrap.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/src/eval_toolkit/calibration.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/src/eval_toolkit/claims.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/src/eval_toolkit/config.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/src/eval_toolkit/docs.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/src/eval_toolkit/embeddings.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/src/eval_toolkit/evidence.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/src/eval_toolkit/harness.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/src/eval_toolkit/leakage.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/src/eval_toolkit/loaders.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/src/eval_toolkit/losses.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/src/eval_toolkit/manifest.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/src/eval_toolkit/metric_specs.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/src/eval_toolkit/metrics.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/src/eval_toolkit/operating_points.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/src/eval_toolkit/paths.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/src/eval_toolkit/plotting.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/src/eval_toolkit/preprocessing.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/src/eval_toolkit/probes.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/src/eval_toolkit/protocols.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/src/eval_toolkit/provenance.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/src/eval_toolkit/py.typed +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/src/eval_toolkit/schemas/manifest.v1.json +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/src/eval_toolkit/schemas/manifest.v2.json +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/src/eval_toolkit/schemas/manifest.v3.json +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/src/eval_toolkit/schemas/ood_manifest.v1.json +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/src/eval_toolkit/schemas/results.v1.json +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/src/eval_toolkit/schemas/results_full.v1.json +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/src/eval_toolkit/scorecards.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/src/eval_toolkit/seeds.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/src/eval_toolkit/splits.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/src/eval_toolkit/stacking.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/src/eval_toolkit/text_dedup.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/src/eval_toolkit/thresholds.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/baseline/test_plotting_visual/plot_bootstrap_distribution.png +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/baseline/test_plotting_visual/plot_confusion_matrix_grid.png +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/baseline/test_plotting_visual/plot_lift_ci.png +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/baseline/test_plotting_visual/plot_metric_bars.png +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/baseline/test_plotting_visual/plot_pareto_frontier.png +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/baseline/test_plotting_visual/plot_pr_curve.png +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/baseline/test_plotting_visual/plot_reliability_diagram.png +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/baseline/test_plotting_visual/plot_roc_curve.png +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/baseline/test_plotting_visual/plot_score_histograms.png +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/baseline/test_plotting_visual/plot_slice_metric_heatmap.png +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/benchmarks/__init__.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/benchmarks/test_kernel_benchmarks.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/conftest.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/golden/bootstrap_ci/cases.json +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/golden/data/dedup_holdout.jsonl +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/golden/data/dedup_holdout_expected.json +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/golden/data/dedup_holdout_provenance.md +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/golden/docs/expected.md +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/golden/docs/input.md +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/golden/docs/metrics.json +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/golden/test_dedup_holdout_calibration.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/strategies.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_adversarial.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_analysis.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_artifacts.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_audit_citation_alignment.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_block_bootstrap_on_folds.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_bootstrap_calibration_mc.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_bootstrap_edge_cases.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_bootstrap_golden.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_bootstrap_njobs.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_bootstrap_props.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_bootstrap_research_grounded.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_bootstrap_unit.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_calibration_binary_adapters.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_calibration_bootstrap_chain.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_calibration_determinism.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_calibration_optimization_failures.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_calibration_props.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_calibration_research_grounded.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_calibration_unit.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_claims.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_claims_coverage.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_claims_props.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_cli.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_config.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_coverage_bootstrap.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_coverage_calibration.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_coverage_harness.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_coverage_metrics.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_coverage_plotting.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_croissant_e2e.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_dedup_split_leakage_chain.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_deprecated_scalars_shim.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_deprecations.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_docs_golden.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_docs_props.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_embeddings.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_evidence_validators.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_harness_edge_cases.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_harness_fault_injection.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_harness_folded.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_harness_internals.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_harness_metric_options.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_harness_parallelism.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_harness_smoke.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_import_boundaries.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_is_metric_defined_for_slice.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_lazy_extras_messages.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_leakage.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_leakage_error_paths.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_leakage_props.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_loaders.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_loaders_coverage.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_loaders_props.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_logging.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_losses.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_manifest.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_manifest_contamination_round_trip.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_manifest_props.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_manifest_validation.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_metrics_props.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_metrics_stratified_subsets.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_metrics_unit.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_misc_coverage.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_numeric_edge_cases.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_ood_loader.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_operating_points.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_operating_points_props.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_parallel.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_paths.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_pipeline_e2e.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_plotting_edge.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_plotting_smoke.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_plotting_visual.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_preprocessing.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_probes.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_protocol_conformance.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_provenance.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_public_api.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_recall_at_fpr.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_reference_equivalence.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_reproducibility_integration.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_rng.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_schemas.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_scorecard.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_seeds.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_splits.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_splits_leakage_integration.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_splits_props.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_stacking.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_sweep.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_text_dedup.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_text_dedup_coverage.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_text_dedup_props.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_text_dedup_strategies.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_thresholds.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_thresholds_constant_score.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_thresholds_coverage.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_thresholds_props.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_thresholds_research_grounded.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_tokenization_leakage_check.py +0 -0
- {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_v09_contracts.py +0 -0
|
@@ -5,6 +5,43 @@ All notable changes to this project will be documented in this file.
|
|
|
5
5
|
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
|
|
6
6
|
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
|
7
7
|
|
|
8
|
+
## [1.0.3] — 2026-05-26 — `audit_value_bindings` module (closes #71)
|
|
9
|
+
|
|
10
|
+
Tier-2 ADDITIVE — second member of the audit-validator family
|
|
11
|
+
following `audit_citation_alignment` (v1.0.1). Flat-module per
|
|
12
|
+
[ADR 0001](docs/source/adr/0001-flat-module-layout.md).
|
|
13
|
+
|
|
14
|
+
### Added
|
|
15
|
+
|
|
16
|
+
- **`audit_value_bindings` module** exporting
|
|
17
|
+
`validate_reader_value_bindings()` + `Match` + `Violation` +
|
|
18
|
+
`ValueBindingsReport` as Tier 1 STRICT (per
|
|
19
|
+
[ADR 0003](docs/source/adr/0003-stability-contract-and-gate3-methodology.md)).
|
|
20
|
+
Catches the bug class where a markdown surface pairs a detector name
|
|
21
|
+
with the **wrong** canonical value — both values exist in the
|
|
22
|
+
source-of-truth table but the binding is misordered. Motivated by
|
|
23
|
+
the consumer V1.3.1 ADR-080 audit-fix patch closure (2026-05-22)
|
|
24
|
+
where `WRITEUP_NARRATIVE.md:38` said "TF-IDF + logistic regression
|
|
25
|
+
baseline reaches 0.974 AUPRC" but canonical TF-IDF direct val AUPRC
|
|
26
|
+
is 0.971 (0.974 was LoRA's value). The existing `audit_numbers.py`
|
|
27
|
+
validates VALUES against source data but not BINDINGS — this
|
|
28
|
+
validator closes that gap.
|
|
29
|
+
- Cross-detector disambiguation: when multiple detectors and values
|
|
30
|
+
appear in the same paragraph (e.g., "TF-IDF achieves 0.971, while
|
|
31
|
+
LoRA reaches 0.974"), each value pairs with the LAST detector
|
|
32
|
+
appearing before it in text order (falling back to first detector
|
|
33
|
+
after if no before-detector is in range). Avoids false-positive
|
|
34
|
+
bindings across closely-spaced detector mentions.
|
|
35
|
+
- Coverage metric: `ValueBindingsReport.coverage` reports the fraction
|
|
36
|
+
of `(detector, metric)` keys in the canonical `bindings` dict that
|
|
37
|
+
produced at least one `Match` — useful for detecting stale or
|
|
38
|
+
unreferenced bindings in reader prose.
|
|
39
|
+
- 13 tests at `tests/test_audit_value_bindings.py` including the
|
|
40
|
+
verbatim WRITEUP_NARRATIVE seed-case regression, alias resolution
|
|
41
|
+
(detector + metric), distance-window edge, value-without-metric
|
|
42
|
+
skip, coverage fraction, tolerance band, multi-detector
|
|
43
|
+
disambiguation, frozen-dataclass invariants. Closes #71.
|
|
44
|
+
|
|
8
45
|
## [1.0.2] — 2026-05-26 — #76 cleanup batch closes (RC2 + RC3 + F-metrics-1/3/4)
|
|
9
46
|
|
|
10
47
|
Closes the GH #76 v1.0.1 cleanup tracker. All 6 items shipped across
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: eval-toolkit
|
|
3
|
-
Version: 1.0.2
|
|
3
|
+
Version: 1.0.3
|
|
4
4
|
Summary: Reusable evaluation contracts for binary classification: metrics, bootstrap CIs, calibration, artifacts, and evidence gates.
|
|
5
5
|
Project-URL: Homepage, https://github.com/brandon-behring/eval-toolkit
|
|
6
6
|
Project-URL: Documentation, https://brandon-behring.github.io/eval-toolkit/
|
|
@@ -60,6 +60,13 @@ _EXPORTS: dict[str, str] = {
|
|
|
60
60
|
"CitationMisalignment": "eval_toolkit.audit_citation_alignment",
|
|
61
61
|
"extract_adr_subject_category": "eval_toolkit.audit_citation_alignment",
|
|
62
62
|
"validate_citations": "eval_toolkit.audit_citation_alignment",
|
|
63
|
+
# --- audit_value_bindings ---
|
|
64
|
+
# Flat-module per ADR 0001. Closes #71. Motivated by consumer V1.3.1
|
|
65
|
+
# ADR-080 audit-fix finding (TF-IDF / LoRA 0.974 value mis-binding).
|
|
66
|
+
"Match": "eval_toolkit.audit_value_bindings",
|
|
67
|
+
"ValueBindingsReport": "eval_toolkit.audit_value_bindings",
|
|
68
|
+
"Violation": "eval_toolkit.audit_value_bindings",
|
|
69
|
+
"validate_reader_value_bindings": "eval_toolkit.audit_value_bindings",
|
|
63
70
|
# --- losses ---
|
|
64
71
|
"RecallAtLowFPR": "eval_toolkit.losses",
|
|
65
72
|
# --- preprocessing ---
|
|
@@ -0,0 +1,448 @@
|
|
|
1
|
+
r"""Reader-prose value-binding validator.
|
|
2
|
+
|
|
3
|
+
Catches the bug class where a reader-facing markdown surface pairs a
|
|
4
|
+
detector name with the **wrong** canonical value — both values are
|
|
5
|
+
present in the source-of-truth table, but the binding is misordered.
|
|
6
|
+
|
|
7
|
+
Motivating test case (from `prompt-injection-detection-prototype`
|
|
8
|
+
v1.3.1 audit-fix, ADR-080 patch closure 2026-05-22)::
|
|
9
|
+
|
|
10
|
+
WRITEUP_NARRATIVE.md:38:
|
|
11
|
+
"The TF-IDF + logistic regression baseline reaches 0.974 AUPRC
|
|
12
|
+
on balanced direct-versus-benign validation."
|
|
13
|
+
|
|
14
|
+
Canonical: TF-IDF direct val AUPRC = 0.971; LoRA direct val AUPRC =
|
|
15
|
+
0.974. Both values exist in the bindings table; the bug is the wrong
|
|
16
|
+
(detector, value) pairing. The pre-existing ``audit_numbers.py``-style
|
|
17
|
+
primitive validates VALUES against source data; this validator
|
|
18
|
+
validates BINDINGS — that each prose-mentioned (detector_token,
|
|
19
|
+
metric_token, value) triple matches the canonical binding.
|
|
20
|
+
|
|
21
|
+
Design (per ADR 0001 flat-module + ADR 0002 closed-config + ADR 0003
|
|
22
|
+
Tier 1 STRICT public-API contract):
|
|
23
|
+
|
|
24
|
+
- Consumer supplies the canonical-binding table + value/metric/detector
|
|
25
|
+
regex patterns; validator handles position-aware regex scan + binding
|
|
26
|
+
lookup + report assembly.
|
|
27
|
+
- Flat-module: `eval_toolkit.audit_value_bindings.*` (NOT a subpackage
|
|
28
|
+
per ADR 0001 stay-flat-through-v1.x).
|
|
29
|
+
- All Tier-1 STRICT public symbols (`validate_reader_value_bindings`,
|
|
30
|
+
`Match`, `Violation`, `ValueBindingsReport`) re-exported at top level
|
|
31
|
+
via `_EXPORTS` lazy resolver.
|
|
32
|
+
|
|
33
|
+
Closes upstream issue #71. v1.0.3.
|
|
34
|
+
"""
|
|
35
|
+
|
|
36
|
+
from __future__ import annotations
|
|
37
|
+
|
|
38
|
+
import re
|
|
39
|
+
from collections.abc import Mapping, Sequence
|
|
40
|
+
from dataclasses import dataclass
|
|
41
|
+
from pathlib import Path
|
|
42
|
+
from types import MappingProxyType
|
|
43
|
+
|
|
44
|
+
# Public names of this module.
__all__ = [
    "Match",
    "ValueBindingsReport",
    "Violation",
    "validate_reader_value_bindings",
]


# Default ``value_pattern``: one or more integer digits, a literal dot, then
# two to four decimal digits (e.g. "0.974").
DEFAULT_VALUE_PATTERN: str = r"\d+\.\d{2,4}"
# Default ``max_distance_chars``: the maximum character distance allowed
# between a detector mention, a metric mention, and a numeric value for them
# to be treated as one triple.
DEFAULT_MAX_DISTANCE_CHARS: int = 80
# Default ``tolerance``: absolute tolerance used when comparing a found value
# with the expected value, so that 0.974 and 0.9740 compare equal.
DEFAULT_TOLERANCE: float = 1e-4
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
@dataclass(frozen=True)
class Match:
    """Immutable record of one prose mention that agrees with the canonical table.

    Each instance describes a single (detector, metric, value) triple found in
    a scanned file whose value matched the expected value for that binding.

    Attributes
    ----------
    file : Path
        The scanned file in which the triple was found.
    line : int
        Line number of the value occurrence, counted from 1.
    detector : str
        Detector key as it appears in the ``bindings`` dict; this is the
        canonical identifier, not the text matched by the regex.
    metric : str
        Metric key as it appears in the ``bindings`` dict.
    value : float
        The number read from the prose.
    """

    file: Path
    line: int
    detector: str
    metric: str
    value: float
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
@dataclass(frozen=True)
class Violation:
    """Immutable record of one prose mention that contradicts the canonical table.

    Each instance describes a single (detector, metric, value) triple found in
    a scanned file whose value differs from the expected value for that
    binding.

    Attributes
    ----------
    file : Path
        The scanned file in which the triple was found.
    line : int
        Line number of the offending value occurrence, counted from 1.
    detector : str
        Detector key as it appears in the ``bindings`` dict; this is the
        canonical identifier, not the text matched by the regex.
    metric : str
        Metric key as it appears in the ``bindings`` dict.
    found_value : float
        The number the prose states.
    expected_value : float
        The number the ``bindings`` dict holds for this (detector, metric).
    surrounding_text : str
        Excerpt of the prose around the value, kept for diagnostic display.
    """

    file: Path
    line: int
    detector: str
    metric: str
    found_value: float
    expected_value: float
    surrounding_text: str
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
@dataclass(frozen=True)
class ValueBindingsReport:
    """Immutable outcome returned by :func:`validate_reader_value_bindings`.

    Attributes
    ----------
    violations : tuple[Violation, ...]
        One entry per detected (detector, metric) mention carrying the wrong
        value. The tuple is empty when every binding found in the prose
        agrees with the canonical table.
    matched : tuple[Match, ...]
        One entry per detected (detector, metric, value) mention that agrees
        with the canonical binding. Handy for coverage analysis and for
        regression tests that confirm the validator's regexes still fire.
    coverage : float
        Share of ``(detector, metric)`` keys in the ``bindings`` dict for
        which at least one :class:`Match` was produced, in ``[0.0, 1.0]``.
        A value of ``1.0`` means the scanned prose referenced every binding;
        anything lower points at bindings that may be un-cited (which can be
        intentional, or can signal stale prose).
    """

    violations: tuple[Violation, ...]
    matched: tuple[Match, ...]
    coverage: float
|
|
140
|
+
|
|
141
|
+
|
|
142
|
+
def validate_reader_value_bindings(
|
|
143
|
+
*,
|
|
144
|
+
files: Sequence[Path | str],
|
|
145
|
+
bindings: Mapping[tuple[str, str], float],
|
|
146
|
+
value_pattern: str = DEFAULT_VALUE_PATTERN,
|
|
147
|
+
max_distance_chars: int = DEFAULT_MAX_DISTANCE_CHARS,
|
|
148
|
+
metric_aliases: Mapping[str, Sequence[str]] = MappingProxyType({}),
|
|
149
|
+
detector_aliases: Mapping[str, Sequence[str]] = MappingProxyType({}),
|
|
150
|
+
tolerance: float = DEFAULT_TOLERANCE,
|
|
151
|
+
) -> ValueBindingsReport:
|
|
152
|
+
"""Validate (detector, metric, value) bindings in reader-prose markdown.
|
|
153
|
+
|
|
154
|
+
For each ``(detector_token, metric_token) -> expected_value`` entry
|
|
155
|
+
in ``bindings``, scan each file for triples of (detector mention,
|
|
156
|
+
metric mention, numeric value) within a ``max_distance_chars``
|
|
157
|
+
window. Compare the found value to the expected value; emit a
|
|
158
|
+
:class:`Violation` on mismatch, a :class:`Match` on agreement.
|
|
159
|
+
|
|
160
|
+
Both the detector and the metric must appear within the window
|
|
161
|
+
surrounding a candidate value for the triple to be considered —
|
|
162
|
+
a value that has only a detector or only a metric nearby is
|
|
163
|
+
ignored (those belong to a value-existence audit, not a binding
|
|
164
|
+
audit).
|
|
165
|
+
|
|
166
|
+
Parameters
|
|
167
|
+
----------
|
|
168
|
+
files : Sequence[Path | str]
|
|
169
|
+
Markdown files to scan. UTF-8 encoded.
|
|
170
|
+
bindings : Mapping[tuple[str, str], float]
|
|
171
|
+
Canonical (detector_name, metric_name) → expected_value table.
|
|
172
|
+
Keys are the canonical *identifiers* used in the report — the
|
|
173
|
+
regex patterns that match these in prose come from the
|
|
174
|
+
``*_aliases`` dicts (with the canonical name as a default
|
|
175
|
+
fallback pattern).
|
|
176
|
+
value_pattern : str, optional
|
|
177
|
+
Regex matching numeric values in prose. Default matches
|
|
178
|
+
``\\d+\\.\\d{2,4}`` (1+ integer part, 2-4 decimals).
|
|
179
|
+
max_distance_chars : int, optional
|
|
180
|
+
Maximum character distance allowed between a detector mention,
|
|
181
|
+
a metric mention, and a numeric value for them to be treated
|
|
182
|
+
as a triple. Default 80.
|
|
183
|
+
metric_aliases : Mapping[str, Sequence[str]], optional
|
|
184
|
+
``metric_name -> [regex_alternatives, ...]``. Each canonical
|
|
185
|
+
metric name in ``bindings`` may have multiple natural-language
|
|
186
|
+
forms (e.g., ``"direct_val_auprc"`` matches both ``"direct .*?
|
|
187
|
+
AUPRC"`` and ``"validation AUPRC"``). Missing keys default to
|
|
188
|
+
the canonical name itself, escaped.
|
|
189
|
+
detector_aliases : Mapping[str, Sequence[str]], optional
|
|
190
|
+
Same shape as ``metric_aliases``, applied case-insensitively.
|
|
191
|
+
Useful for ``"tf-idf + lr"`` → ``["TF-IDF", "TfIdf", "tfidf"]``.
|
|
192
|
+
tolerance : float, optional
|
|
193
|
+
Absolute tolerance for float comparison. Default ``1e-4``
|
|
194
|
+
(i.e., ``0.974`` and ``0.9740`` are considered equal).
|
|
195
|
+
|
|
196
|
+
Returns
|
|
197
|
+
-------
|
|
198
|
+
ValueBindingsReport
|
|
199
|
+
``violations``, ``matched``, ``coverage`` per the dataclass.
|
|
200
|
+
|
|
201
|
+
Examples
|
|
202
|
+
--------
|
|
203
|
+
>>> from pathlib import Path
|
|
204
|
+
>>> import tempfile
|
|
205
|
+
>>> import textwrap
|
|
206
|
+
>>> with tempfile.NamedTemporaryFile(suffix=".md", mode="w", delete=False) as f:
|
|
207
|
+
... _ = f.write("TF-IDF + LR reaches 0.974 AUPRC on direct val.\\n")
|
|
208
|
+
... path = Path(f.name)
|
|
209
|
+
>>> report = validate_reader_value_bindings(
|
|
210
|
+
... files=[path],
|
|
211
|
+
... bindings={("tf-idf + lr", "direct_val_auprc"): 0.971},
|
|
212
|
+
... detector_aliases={"tf-idf + lr": ["TF-IDF"]},
|
|
213
|
+
... metric_aliases={"direct_val_auprc": ["direct val"]},
|
|
214
|
+
... )
|
|
215
|
+
>>> len(report.violations)
|
|
216
|
+
1
|
|
217
|
+
>>> report.violations[0].found_value
|
|
218
|
+
0.974
|
|
219
|
+
>>> report.violations[0].expected_value
|
|
220
|
+
0.971
|
|
221
|
+
|
|
222
|
+
Notes
|
|
223
|
+
-----
|
|
224
|
+
The validator is **pure**: consumer-side scripts glob markdown
|
|
225
|
+
files and parse canonical-binding tables (e.g., from a JSON
|
|
226
|
+
results file); this function does the regex + window + comparison
|
|
227
|
+
work and returns a structured report.
|
|
228
|
+
|
|
229
|
+
Multiple candidate values within the same detector+metric window
|
|
230
|
+
each produce their own Match / Violation entry. Coverage counts
|
|
231
|
+
a (detector, metric) key as covered iff at least one Match was
|
|
232
|
+
emitted for it (Violations don't count toward coverage — a
|
|
233
|
+
misbound mention proves the binding was REACHED but disproves
|
|
234
|
+
it was correct; the report makes both signals available).
|
|
235
|
+
|
|
236
|
+
Case-sensitivity: detector and metric regexes are applied with
|
|
237
|
+
``re.IGNORECASE``. The canonical names in ``bindings`` are used
|
|
238
|
+
verbatim in report keys regardless of how they were matched in
|
|
239
|
+
prose.
|
|
240
|
+
|
|
241
|
+
See Also
|
|
242
|
+
--------
|
|
243
|
+
eval_toolkit.audit_citation_alignment.validate_citations :
|
|
244
|
+
Sibling validator catching ADR-citation alignment drift.
|
|
245
|
+
"""
|
|
246
|
+
files_resolved = tuple(Path(f) for f in files)
|
|
247
|
+
|
|
248
|
+
bindings_dict = dict(bindings)
|
|
249
|
+
metric_aliases_dict = dict(metric_aliases)
|
|
250
|
+
detector_aliases_dict = dict(detector_aliases)
|
|
251
|
+
|
|
252
|
+
detector_keys = sorted({d for d, _ in bindings_dict})
|
|
253
|
+
metric_keys = sorted({m for _, m in bindings_dict})
|
|
254
|
+
|
|
255
|
+
detector_patterns: dict[str, re.Pattern[str]] = {
|
|
256
|
+
d: _build_pattern(d, detector_aliases_dict.get(d, ()), case_insensitive=True)
|
|
257
|
+
for d in detector_keys
|
|
258
|
+
}
|
|
259
|
+
metric_patterns: dict[str, re.Pattern[str]] = {
|
|
260
|
+
m: _build_pattern(m, metric_aliases_dict.get(m, ()), case_insensitive=True)
|
|
261
|
+
for m in metric_keys
|
|
262
|
+
}
|
|
263
|
+
value_re = re.compile(value_pattern)
|
|
264
|
+
|
|
265
|
+
violations: list[Violation] = []
|
|
266
|
+
matched: list[Match] = []
|
|
267
|
+
matched_keys: set[tuple[str, str]] = set()
|
|
268
|
+
|
|
269
|
+
for file_path in files_resolved:
|
|
270
|
+
text = file_path.read_text(encoding="utf-8")
|
|
271
|
+
line_starts = _line_starts(text)
|
|
272
|
+
|
|
273
|
+
# Pre-collect ALL detector positions (across every canonical
|
|
274
|
+
# detector key) so each value can be paired with its NEAREST
|
|
275
|
+
# detector. This avoids cross-detector contamination — e.g.,
|
|
276
|
+
# "TF-IDF achieves 0.971, while LoRA reaches 0.974" should
|
|
277
|
+
# pair 0.971 with TF-IDF and 0.974 with LoRA, NOT pair the
|
|
278
|
+
# 0.974 with TF-IDF's binding just because they happen to be
|
|
279
|
+
# within max_distance_chars of each other.
|
|
280
|
+
detector_positions: list[tuple[int, str]] = [] # (position, canonical_key)
|
|
281
|
+
for det_key, det_re in detector_patterns.items():
|
|
282
|
+
for det_match in det_re.finditer(text):
|
|
283
|
+
detector_positions.append((det_match.start(), det_key))
|
|
284
|
+
detector_positions.sort()
|
|
285
|
+
|
|
286
|
+
# For each binding, look in each file for triples.
|
|
287
|
+
for (det_key, met_key), expected in bindings_dict.items():
|
|
288
|
+
det_re = detector_patterns[det_key]
|
|
289
|
+
met_re = metric_patterns[met_key]
|
|
290
|
+
|
|
291
|
+
for det_match in det_re.finditer(text):
|
|
292
|
+
window_start = max(0, det_match.start() - max_distance_chars)
|
|
293
|
+
window_end = min(len(text), det_match.end() + max_distance_chars)
|
|
294
|
+
window_text = text[window_start:window_end]
|
|
295
|
+
window_offset = window_start
|
|
296
|
+
|
|
297
|
+
# Both metric and a value must appear in the window.
|
|
298
|
+
met_hits = list(met_re.finditer(window_text))
|
|
299
|
+
if not met_hits:
|
|
300
|
+
continue
|
|
301
|
+
|
|
302
|
+
for val_match in value_re.finditer(window_text):
|
|
303
|
+
# Skip values immediately adjacent to digits (avoid
|
|
304
|
+
# picking up e.g., "0.974" inside "10.974" or version
|
|
305
|
+
# strings like "1.0.974"). Simple heuristic: the
|
|
306
|
+
# character before the match (if any) must not be a
|
|
307
|
+
# digit or dot.
|
|
308
|
+
val_start_in_full = window_offset + val_match.start()
|
|
309
|
+
if val_start_in_full > 0:
|
|
310
|
+
prev_char = text[val_start_in_full - 1]
|
|
311
|
+
if prev_char.isdigit() or prev_char == ".":
|
|
312
|
+
continue
|
|
313
|
+
|
|
314
|
+
val_str = val_match.group(0)
|
|
315
|
+
try:
|
|
316
|
+
found = float(val_str)
|
|
317
|
+
except ValueError: # pragma: no cover
|
|
318
|
+
continue
|
|
319
|
+
|
|
320
|
+
# Cross-detector disambiguation: require the current
|
|
321
|
+
# det_key to be the detector paired with this value
|
|
322
|
+
# by the text-order rule (last detector before; else
|
|
323
|
+
# first detector after). Avoids cross-contamination
|
|
324
|
+
# on multi-detector prose like "TF-IDF achieves
|
|
325
|
+
# 0.971, while LoRA reaches 0.974".
|
|
326
|
+
paired_key = _nearest_detector_key(
|
|
327
|
+
detector_positions, val_start_in_full, max_distance_chars
|
|
328
|
+
)
|
|
329
|
+
if paired_key != det_key:
|
|
330
|
+
continue
|
|
331
|
+
|
|
332
|
+
# Require the metric mention be within distance of the value too,
|
|
333
|
+
# not just within the detector window.
|
|
334
|
+
met_close = any(
|
|
335
|
+
abs(mh.start() - val_match.start()) <= max_distance_chars for mh in met_hits
|
|
336
|
+
)
|
|
337
|
+
if not met_close:
|
|
338
|
+
continue
|
|
339
|
+
|
|
340
|
+
line_no = _position_to_line(line_starts, val_start_in_full)
|
|
341
|
+
if abs(found - expected) <= tolerance:
|
|
342
|
+
matched.append(
|
|
343
|
+
Match(
|
|
344
|
+
file=file_path,
|
|
345
|
+
line=line_no,
|
|
346
|
+
detector=det_key,
|
|
347
|
+
metric=met_key,
|
|
348
|
+
value=found,
|
|
349
|
+
)
|
|
350
|
+
)
|
|
351
|
+
matched_keys.add((det_key, met_key))
|
|
352
|
+
else:
|
|
353
|
+
# Widen the surrounding context for diagnostic
|
|
354
|
+
# clarity. Center on the value but include
|
|
355
|
+
# ±60 chars to typically capture the detector
|
|
356
|
+
# mention.
|
|
357
|
+
ctx_start = max(0, val_start_in_full - 60)
|
|
358
|
+
ctx_end = min(len(text), val_start_in_full + len(val_str) + 60)
|
|
359
|
+
surrounding = text[ctx_start:ctx_end].replace("\n", " ").strip()
|
|
360
|
+
violations.append(
|
|
361
|
+
Violation(
|
|
362
|
+
file=file_path,
|
|
363
|
+
line=line_no,
|
|
364
|
+
detector=det_key,
|
|
365
|
+
metric=met_key,
|
|
366
|
+
found_value=found,
|
|
367
|
+
expected_value=expected,
|
|
368
|
+
surrounding_text=surrounding,
|
|
369
|
+
)
|
|
370
|
+
)
|
|
371
|
+
|
|
372
|
+
coverage = len(matched_keys) / len(bindings_dict) if bindings_dict else 0.0
|
|
373
|
+
return ValueBindingsReport(
|
|
374
|
+
violations=tuple(violations),
|
|
375
|
+
matched=tuple(matched),
|
|
376
|
+
coverage=coverage,
|
|
377
|
+
)
|
|
378
|
+
|
|
379
|
+
|
|
380
|
+
def _build_pattern(
    canonical: str,
    aliases: Sequence[str],
    *,
    case_insensitive: bool,
) -> re.Pattern[str]:
    """Compile one alternation regex matching ``canonical`` or any alias.

    Parameters
    ----------
    canonical
        Literal name; it is passed through ``re.escape`` so regex
        metacharacters in it match themselves.
    aliases
        Alternative spellings. These are inserted into the pattern
        unescaped, i.e. each alias is interpreted as a regex fragment.
    case_insensitive
        When true the pattern is compiled with ``re.IGNORECASE``.

    Returns
    -------
    re.Pattern[str]
        Pattern of the form ``(?:canonical)|(?:alias1)|...``; the canonical
        name is always the first alternative.
    """
    # Each alternative gets its own non-capturing group so that a "|" inside
    # an alias cannot leak into the neighbouring alternatives.
    alternatives = [f"(?:{re.escape(canonical)})"]
    for alias in aliases:
        alternatives.append(f"(?:{alias})")
    if case_insensitive:
        return re.compile("|".join(alternatives), re.IGNORECASE)
    return re.compile("|".join(alternatives), 0)
|
|
391
|
+
|
|
392
|
+
|
|
393
|
+
def _line_starts(text: str) -> list[int]:
    """Return the character offset at which every line of ``text`` begins.

    The first entry is always ``0``; each ``"\\n"`` in ``text`` contributes
    the offset immediately after it, so entry ``i`` is the start of the
    0-indexed line ``i``. Only ``"\\n"`` is treated as a line break.
    """
    offsets = [0]
    newline_at = text.find("\n")
    while newline_at != -1:
        # The next line begins right after the newline character.
        offsets.append(newline_at + 1)
        newline_at = text.find("\n", newline_at + 1)
    return offsets
|
|
400
|
+
|
|
401
|
+
|
|
402
|
+
def _nearest_detector_key(
|
|
403
|
+
detector_positions: Sequence[tuple[int, str]],
|
|
404
|
+
value_pos: int,
|
|
405
|
+
max_distance: int,
|
|
406
|
+
) -> str | None:
|
|
407
|
+
"""Return the canonical detector key paired with ``value_pos``, or None.
|
|
408
|
+
|
|
409
|
+
Pairing rule: pick the LAST detector that appears BEFORE the value
|
|
410
|
+
(text-order); if none is within ``max_distance``, fall back to the
|
|
411
|
+
FIRST detector that appears AFTER the value within the same range.
|
|
412
|
+
This matches natural English prose patterns "<detector> ...
|
|
413
|
+
<value>" (predominant) and "<value> ... by <detector>" (rare).
|
|
414
|
+
|
|
415
|
+
The previous "absolute-distance nearest" heuristic produced false
|
|
416
|
+
positives on prose like "TF-IDF achieves 0.971, while LoRA reaches
|
|
417
|
+
0.974" where 0.971 is closer to LoRA in raw distance even though
|
|
418
|
+
it semantically belongs to TF-IDF.
|
|
419
|
+
"""
|
|
420
|
+
if not detector_positions:
|
|
421
|
+
return None
|
|
422
|
+
# Look for the LAST detector strictly before the value, within range.
|
|
423
|
+
last_before: str | None = None
|
|
424
|
+
for pos, key in detector_positions:
|
|
425
|
+
if pos < value_pos and (value_pos - pos) <= max_distance:
|
|
426
|
+
last_before = key
|
|
427
|
+
elif pos >= value_pos:
|
|
428
|
+
break
|
|
429
|
+
if last_before is not None:
|
|
430
|
+
return last_before
|
|
431
|
+
# Fall back: FIRST detector after the value, within range.
|
|
432
|
+
for pos, key in detector_positions:
|
|
433
|
+
if pos >= value_pos and (pos - value_pos) <= max_distance:
|
|
434
|
+
return key
|
|
435
|
+
return None
|
|
436
|
+
|
|
437
|
+
|
|
438
|
+
def _position_to_line(line_starts: list[int], pos: int) -> int:
    """Convert a 0-indexed character position to a 1-indexed line number.

    Parameters
    ----------
    line_starts
        Ascending character offsets at which each line begins (entry ``i``
        is the start of 0-indexed line ``i``).
    pos
        0-indexed character position to locate.

    Returns
    -------
    int
        1-indexed number of the last line whose start offset is ``<= pos``.
        The result is never less than 1, including when ``line_starts`` is
        empty or ``pos`` lies before the first recorded start.
    """
    from bisect import bisect_right

    # bisect_right counts the line starts that are <= pos, which is exactly
    # the 1-indexed line number; clamp so degenerate inputs still yield 1.
    return max(1, bisect_right(line_starts, pos))
|
|
@@ -68,6 +68,7 @@
|
|
|
68
68
|
"LogisticStacker",
|
|
69
69
|
"MANIFEST_SCHEMA_VERSION",
|
|
70
70
|
"MDEEstimate",
|
|
71
|
+
"Match",
|
|
71
72
|
"MaxF1Selector",
|
|
72
73
|
"MetaLearner",
|
|
73
74
|
"MetricFn",
|
|
@@ -127,7 +128,9 @@
|
|
|
127
128
|
"TokenSplittingInjection",
|
|
128
129
|
"TokenizationLeakageCheck",
|
|
129
130
|
"UnicodeNormalizationInjection",
|
|
131
|
+
"ValueBindingsReport",
|
|
130
132
|
"Versioned",
|
|
133
|
+
"Violation",
|
|
131
134
|
"WhitespaceInjection",
|
|
132
135
|
"WilsonInterval",
|
|
133
136
|
"YoudenJSelector",
|
|
@@ -242,6 +245,7 @@
|
|
|
242
245
|
"validate_manifest",
|
|
243
246
|
"validate_payload",
|
|
244
247
|
"validate_prediction_artifact_ref",
|
|
248
|
+
"validate_reader_value_bindings",
|
|
245
249
|
"validate_results",
|
|
246
250
|
"validate_source_roles",
|
|
247
251
|
"walk_path",
|
|
@@ -795,6 +799,14 @@
|
|
|
795
799
|
"kind": "class",
|
|
796
800
|
"signature": "(mde: 'float', sigma_delta: 'float', delta_observed: 'float', alpha: 'float', power: 'float', n_resamples: 'int', n: 'int') -> None"
|
|
797
801
|
},
|
|
802
|
+
"Match": {
|
|
803
|
+
"bases": [
|
|
804
|
+
"object"
|
|
805
|
+
],
|
|
806
|
+
"doc_first_line": "A reader-prose (detector, metric, value) triple that matches the canonical binding.",
|
|
807
|
+
"kind": "class",
|
|
808
|
+
"signature": "(file: 'Path', line: 'int', detector: 'str', metric: 'str', value: 'float') -> None"
|
|
809
|
+
},
|
|
798
810
|
"MaxF1Selector": {
|
|
799
811
|
"bases": [
|
|
800
812
|
"object"
|
|
@@ -1326,6 +1338,14 @@
|
|
|
1326
1338
|
"kind": "class",
|
|
1327
1339
|
"signature": "(form: 'str' = 'NFKC', name: 'str' = 'unicode_normalize') -> None"
|
|
1328
1340
|
},
|
|
1341
|
+
"ValueBindingsReport": {
|
|
1342
|
+
"bases": [
|
|
1343
|
+
"object"
|
|
1344
|
+
],
|
|
1345
|
+
"doc_first_line": "Result of :func:`validate_reader_value_bindings`.",
|
|
1346
|
+
"kind": "class",
|
|
1347
|
+
"signature": "(violations: 'tuple[Violation, ...]', matched: 'tuple[Match, ...]', coverage: 'float') -> None"
|
|
1348
|
+
},
|
|
1329
1349
|
"Versioned": {
|
|
1330
1350
|
"bases": [
|
|
1331
1351
|
"Protocol"
|
|
@@ -1337,6 +1357,14 @@
|
|
|
1337
1357
|
},
|
|
1338
1358
|
"signature": "(*args, **kwargs)"
|
|
1339
1359
|
},
|
|
1360
|
+
"Violation": {
|
|
1361
|
+
"bases": [
|
|
1362
|
+
"object"
|
|
1363
|
+
],
|
|
1364
|
+
"doc_first_line": "A reader-prose (detector, metric, value) triple where the value disagrees with the canonical binding.",
|
|
1365
|
+
"kind": "class",
|
|
1366
|
+
"signature": "(file: 'Path', line: 'int', detector: 'str', metric: 'str', found_value: 'float', expected_value: 'float', surrounding_text: 'str') -> None"
|
|
1367
|
+
},
|
|
1340
1368
|
"WhitespaceInjection": {
|
|
1341
1369
|
"bases": [
|
|
1342
1370
|
"object"
|
|
@@ -1373,7 +1401,7 @@
|
|
|
1373
1401
|
"doc_first_line": "str(object='') -> str",
|
|
1374
1402
|
"kind": "value",
|
|
1375
1403
|
"type": "str",
|
|
1376
|
-
"value": "'1.0.2'"
|
|
1404
|
+
"value": "'1.0.3'"
|
|
1377
1405
|
},
|
|
1378
1406
|
"apply_operating_points": {
|
|
1379
1407
|
"doc_first_line": "Apply fitted thresholds to a mixed-class or single-class target slice.",
|
|
@@ -1920,6 +1948,11 @@
|
|
|
1920
1948
|
"kind": "function",
|
|
1921
1949
|
"signature": "(payload: 'Mapping[str, object]') -> 'None'"
|
|
1922
1950
|
},
|
|
1951
|
+
"validate_reader_value_bindings": {
|
|
1952
|
+
"doc_first_line": "Validate (detector, metric, value) bindings in reader-prose markdown.",
|
|
1953
|
+
"kind": "function",
|
|
1954
|
+
"signature": "(*, files: 'Sequence[Path | str]', bindings: 'Mapping[tuple[str, str], float]', value_pattern: 'str' = '\\\\d+\\\\.\\\\d{2,4}', max_distance_chars: 'int' = 80, metric_aliases: 'Mapping[str, Sequence[str]]' = mappingproxy({}), detector_aliases: 'Mapping[str, Sequence[str]]' = mappingproxy({}), tolerance: 'float' = 0.0001) -> 'ValueBindingsReport'"
|
|
1955
|
+
},
|
|
1923
1956
|
"validate_results": {
|
|
1924
1957
|
"doc_first_line": "Validate a serialized ``RunResult`` payload against ``results.v1.json``.",
|
|
1925
1958
|
"kind": "function",
|