eval-toolkit 1.1.0__tar.gz → 1.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {eval_toolkit-1.1.0 → eval_toolkit-1.2.0}/CHANGELOG.md +135 -0
- {eval_toolkit-1.1.0 → eval_toolkit-1.2.0}/PKG-INFO +1 -1
- {eval_toolkit-1.1.0 → eval_toolkit-1.2.0}/docs/source/adr/README.md +2 -0
- {eval_toolkit-1.1.0 → eval_toolkit-1.2.0}/src/eval_toolkit/_version.py +1 -1
- {eval_toolkit-1.1.0 → eval_toolkit-1.2.0}/src/eval_toolkit/audit_value_bindings.py +334 -21
- {eval_toolkit-1.1.0 → eval_toolkit-1.2.0}/tests/golden/public_api/snapshot.json +2 -2
- {eval_toolkit-1.1.0 → eval_toolkit-1.2.0}/tests/test_audit_value_bindings.py +268 -0
- {eval_toolkit-1.1.0 → eval_toolkit-1.2.0}/.gitignore +0 -0
- {eval_toolkit-1.1.0 → eval_toolkit-1.2.0}/LICENSE +0 -0
- {eval_toolkit-1.1.0 → eval_toolkit-1.2.0}/README.md +0 -0
- {eval_toolkit-1.1.0 → eval_toolkit-1.2.0}/STYLE.md +0 -0
- {eval_toolkit-1.1.0 → eval_toolkit-1.2.0}/docs/archive/README.md +0 -0
- {eval_toolkit-1.1.0 → eval_toolkit-1.2.0}/docs/research/README.md +0 -0
- {eval_toolkit-1.1.0 → eval_toolkit-1.2.0}/docs/research/datasets/README.md +0 -0
- {eval_toolkit-1.1.0 → eval_toolkit-1.2.0}/docs/research/papers/data-integrity/README.md +0 -0
- {eval_toolkit-1.1.0 → eval_toolkit-1.2.0}/docs/research/papers/eval-ecosystem/README.md +0 -0
- {eval_toolkit-1.1.0 → eval_toolkit-1.2.0}/docs/research/papers/inference/README.md +0 -0
- {eval_toolkit-1.1.0 → eval_toolkit-1.2.0}/docs/research/papers/prompt-injection/README.md +0 -0
- {eval_toolkit-1.1.0 → eval_toolkit-1.2.0}/docs/source/methodology/README.md +0 -0
- {eval_toolkit-1.1.0 → eval_toolkit-1.2.0}/pyproject.toml +0 -0
- {eval_toolkit-1.1.0 → eval_toolkit-1.2.0}/src/eval_toolkit/__init__.py +0 -0
- {eval_toolkit-1.1.0 → eval_toolkit-1.2.0}/src/eval_toolkit/__main__.py +0 -0
- {eval_toolkit-1.1.0 → eval_toolkit-1.2.0}/src/eval_toolkit/_deprecated.py +0 -0
- {eval_toolkit-1.1.0 → eval_toolkit-1.2.0}/src/eval_toolkit/_parallel.py +0 -0
- {eval_toolkit-1.1.0 → eval_toolkit-1.2.0}/src/eval_toolkit/_rng.py +0 -0
- {eval_toolkit-1.1.0 → eval_toolkit-1.2.0}/src/eval_toolkit/_sweep.py +0 -0
- {eval_toolkit-1.1.0 → eval_toolkit-1.2.0}/src/eval_toolkit/adversarial.py +0 -0
- {eval_toolkit-1.1.0 → eval_toolkit-1.2.0}/src/eval_toolkit/analysis.py +0 -0
- {eval_toolkit-1.1.0 → eval_toolkit-1.2.0}/src/eval_toolkit/artifacts.py +0 -0
- {eval_toolkit-1.1.0 → eval_toolkit-1.2.0}/src/eval_toolkit/audit_citation_alignment.py +0 -0
- {eval_toolkit-1.1.0 → eval_toolkit-1.2.0}/src/eval_toolkit/audit_sister_doc_concept_drift.py +0 -0
- {eval_toolkit-1.1.0 → eval_toolkit-1.2.0}/src/eval_toolkit/bootstrap.py +0 -0
- {eval_toolkit-1.1.0 → eval_toolkit-1.2.0}/src/eval_toolkit/calibration.py +0 -0
- {eval_toolkit-1.1.0 → eval_toolkit-1.2.0}/src/eval_toolkit/claims.py +0 -0
- {eval_toolkit-1.1.0 → eval_toolkit-1.2.0}/src/eval_toolkit/config.py +0 -0
- {eval_toolkit-1.1.0 → eval_toolkit-1.2.0}/src/eval_toolkit/docs.py +0 -0
- {eval_toolkit-1.1.0 → eval_toolkit-1.2.0}/src/eval_toolkit/embeddings.py +0 -0
- {eval_toolkit-1.1.0 → eval_toolkit-1.2.0}/src/eval_toolkit/evidence.py +0 -0
- {eval_toolkit-1.1.0 → eval_toolkit-1.2.0}/src/eval_toolkit/harness.py +0 -0
- {eval_toolkit-1.1.0 → eval_toolkit-1.2.0}/src/eval_toolkit/leakage.py +0 -0
- {eval_toolkit-1.1.0 → eval_toolkit-1.2.0}/src/eval_toolkit/loaders.py +0 -0
- {eval_toolkit-1.1.0 → eval_toolkit-1.2.0}/src/eval_toolkit/losses.py +0 -0
- {eval_toolkit-1.1.0 → eval_toolkit-1.2.0}/src/eval_toolkit/manifest.py +0 -0
- {eval_toolkit-1.1.0 → eval_toolkit-1.2.0}/src/eval_toolkit/metric_specs.py +0 -0
- {eval_toolkit-1.1.0 → eval_toolkit-1.2.0}/src/eval_toolkit/metrics.py +0 -0
- {eval_toolkit-1.1.0 → eval_toolkit-1.2.0}/src/eval_toolkit/operating_points.py +0 -0
- {eval_toolkit-1.1.0 → eval_toolkit-1.2.0}/src/eval_toolkit/paths.py +0 -0
- {eval_toolkit-1.1.0 → eval_toolkit-1.2.0}/src/eval_toolkit/plotting.py +0 -0
- {eval_toolkit-1.1.0 → eval_toolkit-1.2.0}/src/eval_toolkit/preprocessing.py +0 -0
- {eval_toolkit-1.1.0 → eval_toolkit-1.2.0}/src/eval_toolkit/probes.py +0 -0
- {eval_toolkit-1.1.0 → eval_toolkit-1.2.0}/src/eval_toolkit/protocols.py +0 -0
- {eval_toolkit-1.1.0 → eval_toolkit-1.2.0}/src/eval_toolkit/provenance.py +0 -0
- {eval_toolkit-1.1.0 → eval_toolkit-1.2.0}/src/eval_toolkit/py.typed +0 -0
- {eval_toolkit-1.1.0 → eval_toolkit-1.2.0}/src/eval_toolkit/schemas/manifest.v1.json +0 -0
- {eval_toolkit-1.1.0 → eval_toolkit-1.2.0}/src/eval_toolkit/schemas/manifest.v2.json +0 -0
- {eval_toolkit-1.1.0 → eval_toolkit-1.2.0}/src/eval_toolkit/schemas/manifest.v3.json +0 -0
- {eval_toolkit-1.1.0 → eval_toolkit-1.2.0}/src/eval_toolkit/schemas/ood_manifest.v1.json +0 -0
- {eval_toolkit-1.1.0 → eval_toolkit-1.2.0}/src/eval_toolkit/schemas/results.v1.json +0 -0
- {eval_toolkit-1.1.0 → eval_toolkit-1.2.0}/src/eval_toolkit/schemas/results_full.v1.json +0 -0
- {eval_toolkit-1.1.0 → eval_toolkit-1.2.0}/src/eval_toolkit/scorecards.py +0 -0
- {eval_toolkit-1.1.0 → eval_toolkit-1.2.0}/src/eval_toolkit/seeds.py +0 -0
- {eval_toolkit-1.1.0 → eval_toolkit-1.2.0}/src/eval_toolkit/splits.py +0 -0
- {eval_toolkit-1.1.0 → eval_toolkit-1.2.0}/src/eval_toolkit/stacking.py +0 -0
- {eval_toolkit-1.1.0 → eval_toolkit-1.2.0}/src/eval_toolkit/text_dedup.py +0 -0
- {eval_toolkit-1.1.0 → eval_toolkit-1.2.0}/src/eval_toolkit/thresholds.py +0 -0
- {eval_toolkit-1.1.0 → eval_toolkit-1.2.0}/tests/baseline/test_plotting_visual/plot_bootstrap_distribution.png +0 -0
- {eval_toolkit-1.1.0 → eval_toolkit-1.2.0}/tests/baseline/test_plotting_visual/plot_confusion_matrix_grid.png +0 -0
- {eval_toolkit-1.1.0 → eval_toolkit-1.2.0}/tests/baseline/test_plotting_visual/plot_lift_ci.png +0 -0
- {eval_toolkit-1.1.0 → eval_toolkit-1.2.0}/tests/baseline/test_plotting_visual/plot_metric_bars.png +0 -0
- {eval_toolkit-1.1.0 → eval_toolkit-1.2.0}/tests/baseline/test_plotting_visual/plot_pareto_frontier.png +0 -0
- {eval_toolkit-1.1.0 → eval_toolkit-1.2.0}/tests/baseline/test_plotting_visual/plot_pr_curve.png +0 -0
- {eval_toolkit-1.1.0 → eval_toolkit-1.2.0}/tests/baseline/test_plotting_visual/plot_reliability_diagram.png +0 -0
- {eval_toolkit-1.1.0 → eval_toolkit-1.2.0}/tests/baseline/test_plotting_visual/plot_roc_curve.png +0 -0
- {eval_toolkit-1.1.0 → eval_toolkit-1.2.0}/tests/baseline/test_plotting_visual/plot_score_histograms.png +0 -0
- {eval_toolkit-1.1.0 → eval_toolkit-1.2.0}/tests/baseline/test_plotting_visual/plot_slice_metric_heatmap.png +0 -0
- {eval_toolkit-1.1.0 → eval_toolkit-1.2.0}/tests/benchmarks/__init__.py +0 -0
- {eval_toolkit-1.1.0 → eval_toolkit-1.2.0}/tests/benchmarks/test_kernel_benchmarks.py +0 -0
- {eval_toolkit-1.1.0 → eval_toolkit-1.2.0}/tests/conftest.py +0 -0
- {eval_toolkit-1.1.0 → eval_toolkit-1.2.0}/tests/golden/bootstrap_ci/cases.json +0 -0
- {eval_toolkit-1.1.0 → eval_toolkit-1.2.0}/tests/golden/data/dedup_holdout.jsonl +0 -0
- {eval_toolkit-1.1.0 → eval_toolkit-1.2.0}/tests/golden/data/dedup_holdout_expected.json +0 -0
- {eval_toolkit-1.1.0 → eval_toolkit-1.2.0}/tests/golden/data/dedup_holdout_provenance.md +0 -0
- {eval_toolkit-1.1.0 → eval_toolkit-1.2.0}/tests/golden/docs/expected.md +0 -0
- {eval_toolkit-1.1.0 → eval_toolkit-1.2.0}/tests/golden/docs/input.md +0 -0
- {eval_toolkit-1.1.0 → eval_toolkit-1.2.0}/tests/golden/docs/metrics.json +0 -0
- {eval_toolkit-1.1.0 → eval_toolkit-1.2.0}/tests/golden/test_dedup_holdout_calibration.py +0 -0
- {eval_toolkit-1.1.0 → eval_toolkit-1.2.0}/tests/strategies.py +0 -0
- {eval_toolkit-1.1.0 → eval_toolkit-1.2.0}/tests/test_adversarial.py +0 -0
- {eval_toolkit-1.1.0 → eval_toolkit-1.2.0}/tests/test_analysis.py +0 -0
- {eval_toolkit-1.1.0 → eval_toolkit-1.2.0}/tests/test_artifacts.py +0 -0
- {eval_toolkit-1.1.0 → eval_toolkit-1.2.0}/tests/test_audit_citation_alignment.py +0 -0
- {eval_toolkit-1.1.0 → eval_toolkit-1.2.0}/tests/test_audit_sister_doc_concept_drift.py +0 -0
- {eval_toolkit-1.1.0 → eval_toolkit-1.2.0}/tests/test_block_bootstrap_on_folds.py +0 -0
- {eval_toolkit-1.1.0 → eval_toolkit-1.2.0}/tests/test_bootstrap_calibration_mc.py +0 -0
- {eval_toolkit-1.1.0 → eval_toolkit-1.2.0}/tests/test_bootstrap_edge_cases.py +0 -0
- {eval_toolkit-1.1.0 → eval_toolkit-1.2.0}/tests/test_bootstrap_golden.py +0 -0
- {eval_toolkit-1.1.0 → eval_toolkit-1.2.0}/tests/test_bootstrap_njobs.py +0 -0
- {eval_toolkit-1.1.0 → eval_toolkit-1.2.0}/tests/test_bootstrap_props.py +0 -0
- {eval_toolkit-1.1.0 → eval_toolkit-1.2.0}/tests/test_bootstrap_research_grounded.py +0 -0
- {eval_toolkit-1.1.0 → eval_toolkit-1.2.0}/tests/test_bootstrap_unit.py +0 -0
- {eval_toolkit-1.1.0 → eval_toolkit-1.2.0}/tests/test_calibration_binary_adapters.py +0 -0
- {eval_toolkit-1.1.0 → eval_toolkit-1.2.0}/tests/test_calibration_bootstrap_chain.py +0 -0
- {eval_toolkit-1.1.0 → eval_toolkit-1.2.0}/tests/test_calibration_determinism.py +0 -0
- {eval_toolkit-1.1.0 → eval_toolkit-1.2.0}/tests/test_calibration_optimization_failures.py +0 -0
- {eval_toolkit-1.1.0 → eval_toolkit-1.2.0}/tests/test_calibration_props.py +0 -0
- {eval_toolkit-1.1.0 → eval_toolkit-1.2.0}/tests/test_calibration_research_grounded.py +0 -0
- {eval_toolkit-1.1.0 → eval_toolkit-1.2.0}/tests/test_calibration_unit.py +0 -0
- {eval_toolkit-1.1.0 → eval_toolkit-1.2.0}/tests/test_claims.py +0 -0
- {eval_toolkit-1.1.0 → eval_toolkit-1.2.0}/tests/test_claims_coverage.py +0 -0
- {eval_toolkit-1.1.0 → eval_toolkit-1.2.0}/tests/test_claims_props.py +0 -0
- {eval_toolkit-1.1.0 → eval_toolkit-1.2.0}/tests/test_cli.py +0 -0
- {eval_toolkit-1.1.0 → eval_toolkit-1.2.0}/tests/test_config.py +0 -0
- {eval_toolkit-1.1.0 → eval_toolkit-1.2.0}/tests/test_coverage_bootstrap.py +0 -0
- {eval_toolkit-1.1.0 → eval_toolkit-1.2.0}/tests/test_coverage_calibration.py +0 -0
- {eval_toolkit-1.1.0 → eval_toolkit-1.2.0}/tests/test_coverage_harness.py +0 -0
- {eval_toolkit-1.1.0 → eval_toolkit-1.2.0}/tests/test_coverage_metrics.py +0 -0
- {eval_toolkit-1.1.0 → eval_toolkit-1.2.0}/tests/test_coverage_plotting.py +0 -0
- {eval_toolkit-1.1.0 → eval_toolkit-1.2.0}/tests/test_croissant_e2e.py +0 -0
- {eval_toolkit-1.1.0 → eval_toolkit-1.2.0}/tests/test_dedup_split_leakage_chain.py +0 -0
- {eval_toolkit-1.1.0 → eval_toolkit-1.2.0}/tests/test_deprecated_scalars_shim.py +0 -0
- {eval_toolkit-1.1.0 → eval_toolkit-1.2.0}/tests/test_deprecations.py +0 -0
- {eval_toolkit-1.1.0 → eval_toolkit-1.2.0}/tests/test_docs_golden.py +0 -0
- {eval_toolkit-1.1.0 → eval_toolkit-1.2.0}/tests/test_docs_props.py +0 -0
- {eval_toolkit-1.1.0 → eval_toolkit-1.2.0}/tests/test_embeddings.py +0 -0
- {eval_toolkit-1.1.0 → eval_toolkit-1.2.0}/tests/test_evidence_validators.py +0 -0
- {eval_toolkit-1.1.0 → eval_toolkit-1.2.0}/tests/test_harness_edge_cases.py +0 -0
- {eval_toolkit-1.1.0 → eval_toolkit-1.2.0}/tests/test_harness_fault_injection.py +0 -0
- {eval_toolkit-1.1.0 → eval_toolkit-1.2.0}/tests/test_harness_folded.py +0 -0
- {eval_toolkit-1.1.0 → eval_toolkit-1.2.0}/tests/test_harness_internals.py +0 -0
- {eval_toolkit-1.1.0 → eval_toolkit-1.2.0}/tests/test_harness_metric_options.py +0 -0
- {eval_toolkit-1.1.0 → eval_toolkit-1.2.0}/tests/test_harness_parallelism.py +0 -0
- {eval_toolkit-1.1.0 → eval_toolkit-1.2.0}/tests/test_harness_smoke.py +0 -0
- {eval_toolkit-1.1.0 → eval_toolkit-1.2.0}/tests/test_import_boundaries.py +0 -0
- {eval_toolkit-1.1.0 → eval_toolkit-1.2.0}/tests/test_is_metric_defined_for_slice.py +0 -0
- {eval_toolkit-1.1.0 → eval_toolkit-1.2.0}/tests/test_lazy_extras_messages.py +0 -0
- {eval_toolkit-1.1.0 → eval_toolkit-1.2.0}/tests/test_leakage.py +0 -0
- {eval_toolkit-1.1.0 → eval_toolkit-1.2.0}/tests/test_leakage_error_paths.py +0 -0
- {eval_toolkit-1.1.0 → eval_toolkit-1.2.0}/tests/test_leakage_props.py +0 -0
- {eval_toolkit-1.1.0 → eval_toolkit-1.2.0}/tests/test_loaders.py +0 -0
- {eval_toolkit-1.1.0 → eval_toolkit-1.2.0}/tests/test_loaders_coverage.py +0 -0
- {eval_toolkit-1.1.0 → eval_toolkit-1.2.0}/tests/test_loaders_props.py +0 -0
- {eval_toolkit-1.1.0 → eval_toolkit-1.2.0}/tests/test_logging.py +0 -0
- {eval_toolkit-1.1.0 → eval_toolkit-1.2.0}/tests/test_losses.py +0 -0
- {eval_toolkit-1.1.0 → eval_toolkit-1.2.0}/tests/test_manifest.py +0 -0
- {eval_toolkit-1.1.0 → eval_toolkit-1.2.0}/tests/test_manifest_contamination_round_trip.py +0 -0
- {eval_toolkit-1.1.0 → eval_toolkit-1.2.0}/tests/test_manifest_props.py +0 -0
- {eval_toolkit-1.1.0 → eval_toolkit-1.2.0}/tests/test_manifest_validation.py +0 -0
- {eval_toolkit-1.1.0 → eval_toolkit-1.2.0}/tests/test_metrics_props.py +0 -0
- {eval_toolkit-1.1.0 → eval_toolkit-1.2.0}/tests/test_metrics_stratified_subsets.py +0 -0
- {eval_toolkit-1.1.0 → eval_toolkit-1.2.0}/tests/test_metrics_unit.py +0 -0
- {eval_toolkit-1.1.0 → eval_toolkit-1.2.0}/tests/test_misc_coverage.py +0 -0
- {eval_toolkit-1.1.0 → eval_toolkit-1.2.0}/tests/test_numeric_edge_cases.py +0 -0
- {eval_toolkit-1.1.0 → eval_toolkit-1.2.0}/tests/test_ood_loader.py +0 -0
- {eval_toolkit-1.1.0 → eval_toolkit-1.2.0}/tests/test_operating_points.py +0 -0
- {eval_toolkit-1.1.0 → eval_toolkit-1.2.0}/tests/test_operating_points_props.py +0 -0
- {eval_toolkit-1.1.0 → eval_toolkit-1.2.0}/tests/test_parallel.py +0 -0
- {eval_toolkit-1.1.0 → eval_toolkit-1.2.0}/tests/test_paths.py +0 -0
- {eval_toolkit-1.1.0 → eval_toolkit-1.2.0}/tests/test_pipeline_e2e.py +0 -0
- {eval_toolkit-1.1.0 → eval_toolkit-1.2.0}/tests/test_plotting_edge.py +0 -0
- {eval_toolkit-1.1.0 → eval_toolkit-1.2.0}/tests/test_plotting_smoke.py +0 -0
- {eval_toolkit-1.1.0 → eval_toolkit-1.2.0}/tests/test_plotting_visual.py +0 -0
- {eval_toolkit-1.1.0 → eval_toolkit-1.2.0}/tests/test_preprocessing.py +0 -0
- {eval_toolkit-1.1.0 → eval_toolkit-1.2.0}/tests/test_probes.py +0 -0
- {eval_toolkit-1.1.0 → eval_toolkit-1.2.0}/tests/test_protocol_conformance.py +0 -0
- {eval_toolkit-1.1.0 → eval_toolkit-1.2.0}/tests/test_provenance.py +0 -0
- {eval_toolkit-1.1.0 → eval_toolkit-1.2.0}/tests/test_public_api.py +0 -0
- {eval_toolkit-1.1.0 → eval_toolkit-1.2.0}/tests/test_recall_at_fpr.py +0 -0
- {eval_toolkit-1.1.0 → eval_toolkit-1.2.0}/tests/test_reference_equivalence.py +0 -0
- {eval_toolkit-1.1.0 → eval_toolkit-1.2.0}/tests/test_reproducibility_integration.py +0 -0
- {eval_toolkit-1.1.0 → eval_toolkit-1.2.0}/tests/test_rng.py +0 -0
- {eval_toolkit-1.1.0 → eval_toolkit-1.2.0}/tests/test_schemas.py +0 -0
- {eval_toolkit-1.1.0 → eval_toolkit-1.2.0}/tests/test_scorecard.py +0 -0
- {eval_toolkit-1.1.0 → eval_toolkit-1.2.0}/tests/test_seeds.py +0 -0
- {eval_toolkit-1.1.0 → eval_toolkit-1.2.0}/tests/test_splits.py +0 -0
- {eval_toolkit-1.1.0 → eval_toolkit-1.2.0}/tests/test_splits_leakage_integration.py +0 -0
- {eval_toolkit-1.1.0 → eval_toolkit-1.2.0}/tests/test_splits_props.py +0 -0
- {eval_toolkit-1.1.0 → eval_toolkit-1.2.0}/tests/test_stacking.py +0 -0
- {eval_toolkit-1.1.0 → eval_toolkit-1.2.0}/tests/test_sweep.py +0 -0
- {eval_toolkit-1.1.0 → eval_toolkit-1.2.0}/tests/test_text_dedup.py +0 -0
- {eval_toolkit-1.1.0 → eval_toolkit-1.2.0}/tests/test_text_dedup_coverage.py +0 -0
- {eval_toolkit-1.1.0 → eval_toolkit-1.2.0}/tests/test_text_dedup_props.py +0 -0
- {eval_toolkit-1.1.0 → eval_toolkit-1.2.0}/tests/test_text_dedup_strategies.py +0 -0
- {eval_toolkit-1.1.0 → eval_toolkit-1.2.0}/tests/test_thresholds.py +0 -0
- {eval_toolkit-1.1.0 → eval_toolkit-1.2.0}/tests/test_thresholds_constant_score.py +0 -0
- {eval_toolkit-1.1.0 → eval_toolkit-1.2.0}/tests/test_thresholds_coverage.py +0 -0
- {eval_toolkit-1.1.0 → eval_toolkit-1.2.0}/tests/test_thresholds_props.py +0 -0
- {eval_toolkit-1.1.0 → eval_toolkit-1.2.0}/tests/test_thresholds_research_grounded.py +0 -0
- {eval_toolkit-1.1.0 → eval_toolkit-1.2.0}/tests/test_tokenization_leakage_check.py +0 -0
- {eval_toolkit-1.1.0 → eval_toolkit-1.2.0}/tests/test_v09_contracts.py +0 -0
|
@@ -5,6 +5,141 @@ All notable changes to this project will be documented in this file.
|
|
|
5
5
|
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
|
|
6
6
|
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
|
7
7
|
|
|
8
|
+
## [1.2.0] — 2026-05-26 — `audit_value_bindings` context-aware noise reduction (consumer-feedback follow-on to #80)
|
|
9
|
+
|
|
10
|
+
Tier-1 ADDITIVE per [ADR 0003](docs/source/adr/0003-stability-contract-and-gate3-methodology.md).
|
|
11
|
+
Consumer-feedback follow-on after v1.1.0's adoption at
|
|
12
|
+
`prompt-injection-detection-submission@v1.3.11`. The v1.1.0
|
|
13
|
+
slice-axis fix achieved 62% noise reduction (96 → 36 warnings) on
|
|
14
|
+
the consumer's writeup; the residual 36 were positional-heuristic
|
|
15
|
+
limitations that [ADR 0005](docs/source/adr/0005-structured-keys-for-audit-validators.md)
|
|
16
|
+
named as "Future work (deferred)" for v1.2.0+. This release
|
|
17
|
+
addresses 81% of that residual (36 → 7) via four context-aware
|
|
18
|
+
extensions to `scope="narrative"`. Combined with v1.1.0,
|
|
19
|
+
**93% total noise reduction** vs the pre-fix v1.0.5 baseline.
|
|
20
|
+
|
|
21
|
+
### Added — `audit_value_bindings.py` context-aware narrative filters
|
|
22
|
+
|
|
23
|
+
All four filters activate ONLY when `scope="narrative"`. Legacy
|
|
24
|
+
`scope="all"` callers see zero behavior change (Tier-1 ADDITIVE).
|
|
25
|
+
No new public kwargs; no signature drift; the keyword lists are
|
|
26
|
+
hardcoded module-level `frozenset` constants. Issue [#80](https://github.com/brandon-behring/eval-toolkit/issues/80)'s
|
|
27
|
+
acceptance criterion was ≤5 warnings; v1.2.0 hits 7 (close to the
|
|
28
|
+
target; the remaining 7 are pure cross-detector list-grammar cases
|
|
29
|
+
that require parser-level work — see "Out of scope" below).
|
|
30
|
+
|
|
31
|
+
- **T1: Delta-context filter.** Suppresses values that are
|
|
32
|
+
comparative magnitudes rather than binding claims. Two
|
|
33
|
+
sub-filters:
|
|
34
|
+
- Sign-prefix skip: values immediately preceded by `+` or `-`
|
|
35
|
+
(signed delta-magnitude markers like `-0.071 AUPRC`,
|
|
36
|
+
`+0.073 lift`) are dropped.
|
|
37
|
+
- Delta-keyword skip: values within 30 chars AFTER a
|
|
38
|
+
delta-marker token are dropped. The before-only window
|
|
39
|
+
prevents mis-firing on prose like `"frozen probe's 0.515
|
|
40
|
+
(delta -0.132)"` where the `"delta"` token refers to the
|
|
41
|
+
following `-0.132`, not the preceding `0.515`.
|
|
42
|
+
|
|
43
|
+
Keyword list (`_DELTA_KEYWORDS`, hardcoded frozenset):
|
|
44
|
+
`delta`, `drop`, `drops`, `lift`, `lifts`, `gap`, `margin`,
|
|
45
|
+
`regresses`, `improves`, `beats`, `exceeds`, `trails`,
|
|
46
|
+
`underperforms`, `vs`, `versus`, `below`. Excluded:
|
|
47
|
+
`against`, `above`, `ahead`, `behind` (too ambiguous; common
|
|
48
|
+
comparison prepositions in legitimate binding prose).
|
|
49
|
+
|
|
50
|
+
- **T2: Floor-context filter.** Suppresses values near random-
|
|
51
|
+
baseline / floor mentions. Window is asymmetric (50 chars
|
|
52
|
+
before, 5 chars after) because floor mentions canonically
|
|
53
|
+
precede the value (`"random AUPRC is 0.374"`).
|
|
54
|
+
|
|
55
|
+
Keyword list (`_FLOOR_KEYWORDS`): `random`, `floor`, `chance`,
|
|
56
|
+
`trivial`. Intentionally narrow — `baseline`, `prior`,
|
|
57
|
+
`majority` excluded because they have legitimate non-floor
|
|
58
|
+
senses (`"TF-IDF baseline"`, `"prior work"`). Multi-word
|
|
59
|
+
patterns like `"below the prevalence baseline of 0.374"` are
|
|
60
|
+
caught by T1's `"below"` keyword instead.
|
|
61
|
+
|
|
62
|
+
- **T3: Consume-on-match within sentence.** After a value
|
|
63
|
+
produces a Match for `(detector, metric, slice)`, subsequent
|
|
64
|
+
values for the same canonical binding in the same sentence are
|
|
65
|
+
suppressed. Catches dense multi-detector enumerations like
|
|
66
|
+
`"AUPRC 0.556 vs 0.519"` where the second value is implicitly
|
|
67
|
+
a contrasting detector's binding (cross-detector inference
|
|
68
|
+
remains out of scope per ADR 0005 A4).
|
|
69
|
+
|
|
70
|
+
- **T4: Sentence-boundary detector-pair reject.** When pairing a
|
|
71
|
+
detector mention with a value, if a sentence terminator (`.`,
|
|
72
|
+
`!`, `?`, `\n\n`) lies between them, the pair is rejected.
|
|
73
|
+
Sentence detection uses paragraph-aware abbreviation guarding
|
|
74
|
+
(`vs.`, `e.g.`, `i.e.`, `c.f.`, `etc.`, `cf.`, `fig.`,
|
|
75
|
+
`eq.`, `pp.`, `viz.`, `ca.` excluded; decimal numbers and
|
|
76
|
+
letter-dot-letter patterns also guarded). Single `\n` is a
|
|
77
|
+
soft break (markdown line-wrap, NOT a sentence boundary);
|
|
78
|
+
`\n\n` is hard.
|
|
79
|
+
|
|
80
|
+
### Internal changes (no public API impact)
|
|
81
|
+
|
|
82
|
+
- `_nearest_canonical_key()` now returns `(key, position)`
|
|
83
|
+
instead of just `key`. The position is needed for T4's
|
|
84
|
+
sentence-boundary check. The slice-pairing call site unpacks
|
|
85
|
+
and discards the position. Private helper; no consumer impact.
|
|
86
|
+
- New private helpers: `_is_sentence_terminator_dot`,
|
|
87
|
+
`_sentence_boundary_positions`, `_sentence_id_of`,
|
|
88
|
+
`_crosses_sentence_boundary`, `_is_signed_value`,
|
|
89
|
+
`_has_keyword_in_window`, `_compile_keyword_pattern`. All
|
|
90
|
+
underscore-prefixed; Tier-3 FREE.
|
|
91
|
+
|
|
92
|
+
### Dogfood evidence
|
|
93
|
+
|
|
94
|
+
| Configuration | Warnings on `prompt-injection-detection-submission` HEAD | Reduction vs v1.0.5 baseline |
|
|
95
|
+
|---|---|---|
|
|
96
|
+
| v1.0.5 (legacy 2-tuple) | 95 | — |
|
|
97
|
+
| v1.1.0 BindingKey + scope='narrative' (content-type filter only) | 23 | 76% |
|
|
98
|
+
| **v1.2.0 + context filters (this release)** | **7** | **93%** |
|
|
99
|
+
|
|
100
|
+
The 7 v1.2.0 residuals are all cross-detector list constructions
|
|
101
|
+
(e.g., `"0.293 versus 0.364 for the frozen probe and 0.291 for
|
|
102
|
+
TF-IDF + LR"` where the validator can't infer that 0.364 / 0.291
|
|
103
|
+
belong to ProtectAI-v1 and TF-IDF respectively because they're
|
|
104
|
+
introduced by `"and"` / `"for"` without an immediately-preceding
|
|
105
|
+
detector mention). These require true list-grammar parsing
|
|
106
|
+
(rejected for v1.x in ADR 0005 A4) and are tracked for v1.3.0+
|
|
107
|
+
with their own ADR design review.
|
|
108
|
+
|
|
109
|
+
### Consumer adoption path
|
|
110
|
+
|
|
111
|
+
`prompt-injection-detection-submission` and other consumers using
|
|
112
|
+
`scope="narrative"` get the v1.2.0 filters automatically with no
|
|
113
|
+
code change. Consumers on `scope="all"` (default) continue with
|
|
114
|
+
v1.1.0 behavior. Recommended consumer migration:
|
|
115
|
+
|
|
116
|
+
1. Re-pin `eval-toolkit>=1.2.0,<2` (additive; no consumer code
|
|
117
|
+
change required).
|
|
118
|
+
2. HARD-gate promotion is now credible: 7 residual warnings is
|
|
119
|
+
below the actionable threshold; consumer can promote
|
|
120
|
+
`audit_value_bindings` from SOFT to HARD bundled with
|
|
121
|
+
`audit_citation_alignment` per the v1.3.8 plan.
|
|
122
|
+
|
|
123
|
+
### Tests
|
|
124
|
+
|
|
125
|
+
36 in `tests/test_audit_value_bindings.py` (28 from v1.1.0 + 8
|
|
126
|
+
new for T1–T4 + sentence-boundary helper unit test). All pass.
|
|
127
|
+
Public API snapshot regenerated for `__version__` bump only (no
|
|
128
|
+
signature changes beyond an inspect-formatting normalization on
|
|
129
|
+
the `validate_reader_value_bindings` `bindings` annotation; same
|
|
130
|
+
type semantically).
|
|
131
|
+
|
|
132
|
+
### Out of scope (deferred)
|
|
133
|
+
|
|
134
|
+
- **Cross-detector list-grammar parsing** — the 7 residual
|
|
135
|
+
warnings. Requires lookahead context-aware list parsing
|
|
136
|
+
(`"X scored Y vs Z for W and V for U"`). Track as a v1.3.0+
|
|
137
|
+
candidate; needs ADR design before implementation.
|
|
138
|
+
- **Markdown AST parsing** (ADR 0005 A4) — v2.0 territory.
|
|
139
|
+
- **`extra_*_keywords` kwargs** for runtime extension of the
|
|
140
|
+
hardcoded keyword lists — YAGNI for now (consumer's prose is
|
|
141
|
+
covered); add in a v1.2.x patch if concrete demand emerges.
|
|
142
|
+
|
|
8
143
|
## [1.1.0] — 2026-05-26 — `audit_value_bindings` slice-aware matching via `BindingKey` (closes #80)
|
|
9
144
|
|
|
10
145
|
Tier-1 ADDITIVE per [ADR 0003](docs/source/adr/0003-stability-contract-and-gate3-methodology.md).
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: eval-toolkit
|
|
3
|
-
Version: 1.1.0
|
|
3
|
+
Version: 1.2.0
|
|
4
4
|
Summary: Reusable evaluation contracts for binary classification: metrics, bootstrap CIs, calibration, artifacts, and evidence gates.
|
|
5
5
|
Project-URL: Homepage, https://github.com/brandon-behring/eval-toolkit
|
|
6
6
|
Project-URL: Documentation, https://brandon-behring.github.io/eval-toolkit/
|
|
@@ -76,3 +76,5 @@ What would have to change for this decision to be reopened?
|
|
|
76
76
|
| [0001](0001-flat-module-layout.md) | Flat single-file modules through v1.x | Accepted | 2026-05-21 |
|
|
77
77
|
| [0002](0002-scorecard-as-primary-metric-surface.md) | `scorecard()` as the primary v1.0 metric surface | Accepted | 2026-05-21 |
|
|
78
78
|
| [0003](0003-stability-contract-and-gate3-methodology.md) | v1.0 stability contract + Gate 3 methodology | Accepted | 2026-05-21 |
|
|
79
|
+
| [0004](0004-naming-conventions.md) | Naming conventions for modules, classes, and parameters | Accepted | 2026-05-23 |
|
|
80
|
+
| [0005](0005-structured-keys-for-audit-validators.md) | Structured keys over positional tuples for canonical-identity types in audit validators | Accepted | 2026-05-26 |
|
|
@@ -35,6 +35,7 @@ Closes upstream issue #71. v1.0.3.
|
|
|
35
35
|
|
|
36
36
|
from __future__ import annotations
|
|
37
37
|
|
|
38
|
+
import bisect
|
|
38
39
|
import logging
|
|
39
40
|
import re
|
|
40
41
|
from collections.abc import Mapping, Sequence
|
|
@@ -61,6 +62,98 @@ DEFAULT_SLICE_WINDOW_CHARS: int = 120
|
|
|
61
62
|
DEFAULT_TOLERANCE: float = 1e-4
|
|
62
63
|
|
|
63
64
|
|
|
65
|
+
# v1.2.0 context-aware narrative filters. Keyword lists are hardcoded
|
|
66
|
+
# module-level frozensets (per ADR 0005 §4: Tier-1 ADDITIVE — no new
|
|
67
|
+
# public kwargs; consumers can file an issue to extend the default
|
|
68
|
+
# lists if their prose surfaces missed patterns).
|
|
69
|
+
#
|
|
70
|
+
# _DELTA_KEYWORDS: case-insensitive whole-token markers indicating a
|
|
71
|
+
# value is a paired-delta or comparative magnitude, not a binding claim.
|
|
72
|
+
# T1 filter suppresses candidate values when any of these appears within
|
|
73
|
+
# the 30 chars BEFORE the value position (under scope="narrative").
|
|
74
|
+
_DELTA_KEYWORDS: frozenset[str] = frozenset(
|
|
75
|
+
{
|
|
76
|
+
# Unambiguous delta nouns/verbs (consumer prose patterns):
|
|
77
|
+
"delta",
|
|
78
|
+
"drop",
|
|
79
|
+
"drops",
|
|
80
|
+
"lift",
|
|
81
|
+
"lifts",
|
|
82
|
+
"gap",
|
|
83
|
+
"margin",
|
|
84
|
+
# Comparison verbs that signal "this is a relative magnitude":
|
|
85
|
+
"regresses",
|
|
86
|
+
"improves",
|
|
87
|
+
"beats",
|
|
88
|
+
"exceeds",
|
|
89
|
+
"trails",
|
|
90
|
+
"underperforms",
|
|
91
|
+
# "vs"/"versus" intentionally INCLUDED — they're the canonical
|
|
92
|
+
# delta separator in consumer prose ("AUPRC 0.556 vs 0.519").
|
|
93
|
+
# The before-only window keeps these tight: "X vs Y" fires on
|
|
94
|
+
# Y (preceded by "vs"), not X. T3 also catches the same-sentence
|
|
95
|
+
# duplicate-binding flag separately.
|
|
96
|
+
"vs",
|
|
97
|
+
"versus",
|
|
98
|
+
# Comparison directions — kept under before-only window so
|
|
99
|
+
# "drops -0.071 below" suppresses -0.071 (sign also catches),
|
|
100
|
+
# but "0.515 (delta -0.132)" doesn't suppress 0.515 ("delta"
|
|
101
|
+
# is AFTER 0.515).
|
|
102
|
+
# Excluded: "against", "above", "ahead", "behind" — too
|
|
103
|
+
# ambiguous; common comparison prepositions that appear in
|
|
104
|
+
# legitimate binding claims.
|
|
105
|
+
"below",
|
|
106
|
+
}
|
|
107
|
+
)
|
|
108
|
+
|
|
109
|
+
# _FLOOR_KEYWORDS: markers indicating a value is a random-baseline or
|
|
110
|
+
# floor reference, not a detector binding. T2 filter suppresses
|
|
111
|
+
# candidate values when any of these appears within −50 / +5 chars
|
|
112
|
+
# (asymmetric: floor mentions canonically precede the value, e.g.,
|
|
113
|
+
# "random AUPRC is 0.374").
|
|
114
|
+
#
|
|
115
|
+
# Intentionally narrow: "baseline", "prior", "majority" are EXCLUDED
|
|
116
|
+
# because they have legitimate non-floor senses ("TF-IDF baseline",
|
|
117
|
+
# "prior work", "majority of detectors"). The consumer's prose
|
|
118
|
+
# patterns with these words ("below the prevalence baseline of 0.374")
|
|
119
|
+
# are caught by T1 via "below" instead — the comparative
|
|
120
|
+
# preposition is the reliable signal, not the noun.
|
|
121
|
+
_FLOOR_KEYWORDS: frozenset[str] = frozenset(
|
|
122
|
+
{
|
|
123
|
+
"random",
|
|
124
|
+
"floor",
|
|
125
|
+
"chance",
|
|
126
|
+
"trivial",
|
|
127
|
+
}
|
|
128
|
+
)
|
|
129
|
+
|
|
130
|
+
# _ABBREV_BEFORE_DOT: tokens that should NOT trigger a sentence
|
|
131
|
+
# boundary when followed by `.`. The multi-letter pattern (e.g., i.e.,
|
|
132
|
+
# c.f.) is handled separately via letter-dot-letter detection.
|
|
133
|
+
_ABBREV_BEFORE_DOT: frozenset[str] = frozenset(
|
|
134
|
+
{
|
|
135
|
+
"vs",
|
|
136
|
+
"etc",
|
|
137
|
+
"cf",
|
|
138
|
+
"fig",
|
|
139
|
+
"eq",
|
|
140
|
+
"pp",
|
|
141
|
+
"viz",
|
|
142
|
+
"ca",
|
|
143
|
+
}
|
|
144
|
+
)
|
|
145
|
+
|
|
146
|
+
|
|
147
|
+
def _compile_keyword_pattern(keywords: frozenset[str]) -> re.Pattern[str]:
|
|
148
|
+
"""Compile case-insensitive word-boundary OR regex matching any keyword."""
|
|
149
|
+
parts = sorted(re.escape(kw) for kw in keywords)
|
|
150
|
+
return re.compile(r"\b(?:" + "|".join(parts) + r")\b", re.IGNORECASE)
|
|
151
|
+
|
|
152
|
+
|
|
153
|
+
_DELTA_PATTERN: re.Pattern[str] = _compile_keyword_pattern(_DELTA_KEYWORDS)
|
|
154
|
+
_FLOOR_PATTERN: re.Pattern[str] = _compile_keyword_pattern(_FLOOR_KEYWORDS)
|
|
155
|
+
|
|
156
|
+
|
|
64
157
|
@dataclass(frozen=True)
|
|
65
158
|
class BindingKey:
|
|
66
159
|
"""Canonical identity for a `(detector, metric, slice)` measurement.
|
|
@@ -409,6 +502,19 @@ def validate_reader_value_bindings(
|
|
|
409
502
|
# for scope="all" (legacy semantics; no exclusion).
|
|
410
503
|
excluded_ranges = _build_exclusion_ranges(text, line_starts) if scope == "narrative" else []
|
|
411
504
|
|
|
505
|
+
# v1.2.0 T3 + T4 (narrative-scope only): precompute sentence
|
|
506
|
+
# boundaries once per file (paragraph-aware abbreviation guard).
|
|
507
|
+
# T3 uses a per-(sentence, canonical_key) set to suppress
|
|
508
|
+
# duplicate matches of the same binding within one sentence
|
|
509
|
+
# (e.g., "0.556 vs 0.519" — the second value belongs to a
|
|
510
|
+
# contrasting detector implicit in the prose). T4 uses the
|
|
511
|
+
# boundaries to reject (detector, value) pairings that cross
|
|
512
|
+
# a sentence terminator.
|
|
513
|
+
sentence_positions: Sequence[int] = (
|
|
514
|
+
_sentence_boundary_positions(text) if scope == "narrative" else ()
|
|
515
|
+
)
|
|
516
|
+
consumed_in_sentence: set[tuple[int, BindingKey]] = set()
|
|
517
|
+
|
|
412
518
|
# Pre-collect ALL detector positions (across every canonical
|
|
413
519
|
# detector key) so each value can be paired with its NEAREST
|
|
414
520
|
# detector. This avoids cross-detector contamination — e.g.,
|
|
@@ -460,12 +566,16 @@ def validate_reader_value_bindings(
|
|
|
460
566
|
# picking up e.g., "0.974" inside "10.974" or version
|
|
461
567
|
# strings like "1.0.974"). Simple heuristic: the
|
|
462
568
|
# character before the match (if any) must not be a
|
|
463
|
-
# digit or dot.
|
|
569
|
+
# digit or dot. v1.2.0 T1a (narrative-scope only):
|
|
570
|
+
# also skip values immediately preceded by `+` or
|
|
571
|
+
# `-` (delta-magnitude markers like "-0.071 AUPRC").
|
|
464
572
|
val_start_in_full = window_offset + val_match.start()
|
|
465
573
|
if val_start_in_full > 0:
|
|
466
574
|
prev_char = text[val_start_in_full - 1]
|
|
467
575
|
if prev_char.isdigit() or prev_char == ".":
|
|
468
576
|
continue
|
|
577
|
+
if scope == "narrative" and prev_char in "+-":
|
|
578
|
+
continue
|
|
469
579
|
|
|
470
580
|
val_str = val_match.group(0)
|
|
471
581
|
try:
|
|
@@ -479,16 +589,53 @@ def validate_reader_value_bindings(
|
|
|
479
589
|
if excluded_ranges and _is_excluded(val_start_in_full, excluded_ranges):
|
|
480
590
|
continue
|
|
481
591
|
|
|
592
|
+
# v1.2.0 T1b (narrative-scope only): delta-keyword
|
|
593
|
+
# context filter. Skip values whose preceding 30
|
|
594
|
+
# chars contain a delta-marker token (e.g.,
|
|
595
|
+
# "delta", "drop", "lift", "vs", "below"). Window
|
|
596
|
+
# is BEFORE-only: delta keywords canonically
|
|
597
|
+
# introduce the delta magnitude ("delta -0.132",
|
|
598
|
+
# "drops -0.071"). Symmetric ±30 windows
|
|
599
|
+
# mis-fire on prose like "X scored 0.515 (delta
|
|
600
|
+
# -0.132)" where "delta" describes a DIFFERENT
|
|
601
|
+
# value (-0.132), not the preceding 0.515.
|
|
602
|
+
if scope == "narrative" and _has_keyword_in_window(
|
|
603
|
+
text, val_start_in_full, _DELTA_PATTERN, 30, 0
|
|
604
|
+
):
|
|
605
|
+
continue
|
|
606
|
+
|
|
607
|
+
# v1.2.0 T2 (narrative-scope only): floor-keyword
|
|
608
|
+
# context filter. Skip values within −50/+5 chars of
|
|
609
|
+
# a floor-marker token (e.g., "random", "floor",
|
|
610
|
+
# "baseline"). Floor mentions canonically precede
|
|
611
|
+
# the value ("random AUPRC is 0.374"), hence the
|
|
612
|
+
# asymmetric window.
|
|
613
|
+
if scope == "narrative" and _has_keyword_in_window(
|
|
614
|
+
text, val_start_in_full, _FLOOR_PATTERN, 50, 5
|
|
615
|
+
):
|
|
616
|
+
continue
|
|
617
|
+
|
|
482
618
|
# Cross-detector disambiguation: require the current
|
|
483
619
|
# det_key to be the detector paired with this value
|
|
484
620
|
# by the text-order rule (last detector before; else
|
|
485
621
|
# first detector after). Avoids cross-contamination
|
|
486
622
|
# on multi-detector prose like "TF-IDF achieves
|
|
487
623
|
# 0.971, while LoRA reaches 0.974".
|
|
488
|
-
|
|
624
|
+
detector_match = _nearest_canonical_key(
|
|
489
625
|
detector_positions, val_start_in_full, max_distance_chars
|
|
490
626
|
)
|
|
491
|
-
if
|
|
627
|
+
if detector_match is None or detector_match[0] != det_key:
|
|
628
|
+
continue
|
|
629
|
+
paired_det_pos = detector_match[1]
|
|
630
|
+
|
|
631
|
+
# v1.2.0 T4 (narrative-scope only): reject the
|
|
632
|
+
# detector-value pair if a sentence boundary lies
|
|
633
|
+
# between them. Prevents prose like "X scored
|
|
634
|
+
# 0.291. The random floor is 0.374" from pairing
|
|
635
|
+
# 0.374 with X across the `.` boundary.
|
|
636
|
+
if scope == "narrative" and _crosses_sentence_boundary(
|
|
637
|
+
paired_det_pos, val_start_in_full, sentence_positions
|
|
638
|
+
):
|
|
492
639
|
continue
|
|
493
640
|
|
|
494
641
|
# Require the metric mention be within distance of the value too,
|
|
@@ -514,12 +661,12 @@ def validate_reader_value_bindings(
|
|
|
514
661
|
# (c) paired slice == this binding's slice →
|
|
515
662
|
# fall through to value comparison.
|
|
516
663
|
if slice_key != "any":
|
|
517
|
-
|
|
664
|
+
slice_match = _nearest_canonical_key(
|
|
518
665
|
slice_positions,
|
|
519
666
|
val_start_in_full,
|
|
520
667
|
slice_window_chars,
|
|
521
668
|
)
|
|
522
|
-
if
|
|
669
|
+
if slice_match is None:
|
|
523
670
|
unmatched_slice_count += 1
|
|
524
671
|
_logger.warning(
|
|
525
672
|
"audit_value_bindings: no slice mention "
|
|
@@ -533,9 +680,26 @@ def validate_reader_value_bindings(
|
|
|
533
680
|
canonical_key,
|
|
534
681
|
)
|
|
535
682
|
continue
|
|
683
|
+
paired_slice = slice_match[0]
|
|
536
684
|
if paired_slice != slice_key:
|
|
537
685
|
continue
|
|
538
686
|
|
|
687
|
+
# v1.2.0 T3 (narrative-scope only): suppress
|
|
688
|
+
# duplicate matches of the same binding within one
|
|
689
|
+
# sentence. After a Match is emitted for
|
|
690
|
+
# (canonical_key) at this sentence, subsequent
|
|
691
|
+
# candidate values in the same sentence for the
|
|
692
|
+
# same canonical_key are skipped. Catches dense
|
|
693
|
+
# multi-detector enumerations like "AUPRC 0.556 vs
|
|
694
|
+
# 0.519" where 0.519 is implicitly a contrasting
|
|
695
|
+
# detector's value.
|
|
696
|
+
if sentence_positions:
|
|
697
|
+
sent_id = _sentence_id_of(val_start_in_full, sentence_positions)
|
|
698
|
+
if (sent_id, canonical_key) in consumed_in_sentence:
|
|
699
|
+
continue
|
|
700
|
+
else:
|
|
701
|
+
sent_id = 0 # placeholder; not used when scope="all"
|
|
702
|
+
|
|
539
703
|
line_no = _position_to_line(line_starts, val_start_in_full)
|
|
540
704
|
if abs(found - expected) <= tolerance:
|
|
541
705
|
matched.append(
|
|
@@ -548,6 +712,8 @@ def validate_reader_value_bindings(
|
|
|
548
712
|
)
|
|
549
713
|
)
|
|
550
714
|
matched_keys.add(canonical_key)
|
|
715
|
+
if sentence_positions:
|
|
716
|
+
consumed_in_sentence.add((sent_id, canonical_key))
|
|
551
717
|
else:
|
|
552
718
|
# Widen the surrounding context for diagnostic
|
|
553
719
|
# clarity. Center on the value but include
|
|
@@ -705,6 +871,154 @@ def _is_excluded(pos: int, excluded: Sequence[tuple[int, int]]) -> bool:
|
|
|
705
871
|
return start <= pos < end
|
|
706
872
|
|
|
707
873
|
|
|
874
|
+
# ---------------------------------------------------------------------------
|
|
875
|
+
# v1.2.0 context-aware narrative filters.
|
|
876
|
+
# Helpers below implement T1 (delta/sign), T2 (floor), T3 (consume-on-match
|
|
877
|
+
# per-sentence), and T4 (sentence-boundary detector-pair reject) — all
|
|
878
|
+
# scoped to `scope="narrative"`. Per ADR 0005 §4, these are Tier-1
|
|
879
|
+
# ADDITIVE: legacy `scope="all"` callers see zero behavior change.
|
|
880
|
+
# ---------------------------------------------------------------------------
|
|
881
|
+
|
|
882
|
+
|
|
883
|
+
def _is_sentence_terminator_dot(text: str, dot_pos: int) -> bool:
|
|
884
|
+
"""Return True if the dot at ``dot_pos`` terminates a sentence.
|
|
885
|
+
|
|
886
|
+
False positives the abbreviation guard catches:
|
|
887
|
+
|
|
888
|
+
- Decimal numbers (digit-dot-digit): ``0.5``, ``§5.2``.
|
|
889
|
+
- Letter-dot-letter-dot patterns: ``e.g.``, ``i.e.``, ``c.f.``.
|
|
890
|
+
- Single-token abbreviations preceding the dot (whitespace- /
|
|
891
|
+
punctuation-separated): ``vs.``, ``etc.``, ``cf.``, ``fig.``,
|
|
892
|
+
``eq.``, ``pp.``, ``viz.``, ``ca.``. See ``_ABBREV_BEFORE_DOT``.
|
|
893
|
+
"""
|
|
894
|
+
n = len(text)
|
|
895
|
+
prev_char = text[dot_pos - 1] if dot_pos > 0 else ""
|
|
896
|
+
next_char = text[dot_pos + 1] if dot_pos + 1 < n else ""
|
|
897
|
+
# Decimal: digit-dot-digit.
|
|
898
|
+
if prev_char.isdigit() and next_char.isdigit():
|
|
899
|
+
return False
|
|
900
|
+
# Letter-dot-letter-dot pattern, dot is the SECOND dot in "x.y."
|
|
901
|
+
if (
|
|
902
|
+
dot_pos >= 3
|
|
903
|
+
and prev_char.isalpha()
|
|
904
|
+
and text[dot_pos - 2] == "."
|
|
905
|
+
and text[dot_pos - 3].isalpha()
|
|
906
|
+
):
|
|
907
|
+
return False
|
|
908
|
+
# Letter-dot-letter-dot pattern, dot is the FIRST dot in "x.y."
|
|
909
|
+
if dot_pos + 2 < n and next_char.isalpha() and text[dot_pos + 2] == ".":
|
|
910
|
+
return False
|
|
911
|
+
# Single-token abbreviation preceding the dot.
|
|
912
|
+
j = dot_pos - 1
|
|
913
|
+
while j >= 0 and text[j].isalpha():
|
|
914
|
+
j -= 1
|
|
915
|
+
word = text[j + 1 : dot_pos].lower()
|
|
916
|
+
return word not in _ABBREV_BEFORE_DOT
|
|
917
|
+
|
|
918
|
+
|
|
919
|
+
def _sentence_boundary_positions(text: str) -> list[int]:
|
|
920
|
+
"""Return sorted character positions where each sentence STARTS.
|
|
921
|
+
|
|
922
|
+
Hard breaks (sentence terminators):
|
|
923
|
+
|
|
924
|
+
- ``!`` and ``?`` always terminate.
|
|
925
|
+
- ``.`` terminates unless the abbreviation guard
|
|
926
|
+
(:func:`_is_sentence_terminator_dot`) returns False.
|
|
927
|
+
- ``\\n\\n`` (paragraph break) terminates.
|
|
928
|
+
|
|
929
|
+
Soft breaks (NOT sentence boundaries):
|
|
930
|
+
|
|
931
|
+
- Single ``\\n`` (markdown line-wrap mid-sentence).
|
|
932
|
+
- ``;`` (semicolons in dense list constructions).
|
|
933
|
+
- ``:`` (colons preceding list items or definitions).
|
|
934
|
+
|
|
935
|
+
The first sentence starts at position 0. Subsequent sentence starts
|
|
936
|
+
are recorded at the first non-whitespace character after a hard
|
|
937
|
+
break. Used by T3 (consume-on-match) and T4 (sentence-boundary
|
|
938
|
+
detector-pair reject).
|
|
939
|
+
"""
|
|
940
|
+
positions = [0]
|
|
941
|
+
n = len(text)
|
|
942
|
+
i = 0
|
|
943
|
+
while i < n:
|
|
944
|
+
ch = text[i]
|
|
945
|
+
boundary = False
|
|
946
|
+
skip = 1
|
|
947
|
+
if ch in "!?" or ch == "." and _is_sentence_terminator_dot(text, i):
|
|
948
|
+
boundary = True
|
|
949
|
+
elif ch == "\n" and i + 1 < n and text[i + 1] == "\n":
|
|
950
|
+
boundary = True
|
|
951
|
+
skip = 2
|
|
952
|
+
if boundary:
|
|
953
|
+
j = i + skip
|
|
954
|
+
while j < n and text[j].isspace():
|
|
955
|
+
j += 1
|
|
956
|
+
if j < n and j > positions[-1]:
|
|
957
|
+
positions.append(j)
|
|
958
|
+
i = max(j, i + skip)
|
|
959
|
+
else:
|
|
960
|
+
i += 1
|
|
961
|
+
return positions
|
|
962
|
+
|
|
963
|
+
|
|
964
|
+
def _sentence_id_of(pos: int, sentence_positions: Sequence[int]) -> int:
|
|
965
|
+
"""Return the zero-based sentence index containing ``pos``.
|
|
966
|
+
|
|
967
|
+
Uses binary search over the sorted ``sentence_positions``. Returns
|
|
968
|
+
``0`` for any position before the first sentence start.
|
|
969
|
+
"""
|
|
970
|
+
if not sentence_positions:
|
|
971
|
+
return 0
|
|
972
|
+
idx = bisect.bisect_right(sentence_positions, pos) - 1
|
|
973
|
+
return max(0, idx)
|
|
974
|
+
|
|
975
|
+
|
|
976
|
+
def _crosses_sentence_boundary(pos_a: int, pos_b: int, sentence_positions: Sequence[int]) -> bool:
|
|
977
|
+
"""Return True if a sentence boundary lies strictly between ``pos_a`` and ``pos_b``.
|
|
978
|
+
|
|
979
|
+
Sentence-boundary positions are derived from
|
|
980
|
+
:func:`_sentence_boundary_positions`. Used by T4 to reject
|
|
981
|
+
(detector, value) pairs whose detector mention is in a different
|
|
982
|
+
sentence than the value.
|
|
983
|
+
"""
|
|
984
|
+
if not sentence_positions:
|
|
985
|
+
return False
|
|
986
|
+
lo = min(pos_a, pos_b)
|
|
987
|
+
hi = max(pos_a, pos_b)
|
|
988
|
+
idx = bisect.bisect_right(sentence_positions, lo)
|
|
989
|
+
return idx < len(sentence_positions) and sentence_positions[idx] <= hi
|
|
990
|
+
|
|
991
|
+
|
|
992
|
+
def _is_signed_value(text: str, val_start: int) -> bool:
|
|
993
|
+
"""True if the value at ``val_start`` is immediately preceded by ``+`` or ``-``.
|
|
994
|
+
|
|
995
|
+
The sign marker indicates a paired-delta or comparative magnitude
|
|
996
|
+
(e.g., ``-0.071`` AUPRC delta), not a binding claim. T1 filter
|
|
997
|
+
skips these under ``scope="narrative"``.
|
|
998
|
+
"""
|
|
999
|
+
return val_start > 0 and text[val_start - 1] in "+-"
|
|
1000
|
+
|
|
1001
|
+
|
|
1002
|
+
def _has_keyword_in_window(
|
|
1003
|
+
text: str,
|
|
1004
|
+
val_start: int,
|
|
1005
|
+
pattern: re.Pattern[str],
|
|
1006
|
+
before_chars: int,
|
|
1007
|
+
after_chars: int,
|
|
1008
|
+
) -> bool:
|
|
1009
|
+
"""True if ``pattern`` matches anywhere in the character window around ``val_start``.
|
|
1010
|
+
|
|
1011
|
+
Used by T1 (delta keywords) and T2 (floor keywords) to detect
|
|
1012
|
+
context cues near a candidate value. ``before_chars`` and
|
|
1013
|
+
``after_chars`` control the asymmetric window — floor mentions
|
|
1014
|
+
typically PRECEDE the value (e.g., "random AUPRC is 0.374"),
|
|
1015
|
+
while delta mentions can be on either side.
|
|
1016
|
+
"""
|
|
1017
|
+
start = max(0, val_start - before_chars)
|
|
1018
|
+
end = min(len(text), val_start + after_chars)
|
|
1019
|
+
return bool(pattern.search(text, start, end))
|
|
1020
|
+
|
|
1021
|
+
|
|
708
1022
|
def _build_pattern(
|
|
709
1023
|
canonical: str,
|
|
710
1024
|
aliases: Sequence[str],
|
|
@@ -731,8 +1045,8 @@ def _nearest_canonical_key(
|
|
|
731
1045
|
positions: Sequence[tuple[int, str]],
|
|
732
1046
|
value_pos: int,
|
|
733
1047
|
max_distance: int,
|
|
734
|
-
) -> str | None:
|
|
735
|
-
"""Return
|
|
1048
|
+
) -> tuple[str, int] | None:
|
|
1049
|
+
"""Return ``(key, position)`` paired with ``value_pos`` by text-order, or None.
|
|
736
1050
|
|
|
737
1051
|
Pairing rule: pick the LAST canonical occurrence that appears
|
|
738
1052
|
BEFORE the value (text-order); if none is within ``max_distance``,
|
|
@@ -741,25 +1055,24 @@ def _nearest_canonical_key(
|
|
|
741
1055
|
pattern "<token> ... <value>" (subject-verb-object, predominant)
|
|
742
1056
|
with a fallback for the inverted "<value> ... by <token>" form.
|
|
743
1057
|
|
|
744
|
-
Used for DETECTOR pairing. The
|
|
745
|
-
heuristic was rejected for detectors
|
|
746
|
-
on prose like "TF-IDF achieves
|
|
747
|
-
where 0.971 is closer to LoRA
|
|
748
|
-
semantically belongs to TF-IDF.
|
|
749
|
-
|
|
750
|
-
|
|
751
|
-
|
|
752
|
-
|
|
753
|
-
|
|
754
|
-
subsequent clauses.
|
|
1058
|
+
Used for DETECTOR pairing AND slice pairing. The
|
|
1059
|
+
"absolute-distance nearest" heuristic was rejected for detectors
|
|
1060
|
+
— it produces false positives on prose like "TF-IDF achieves
|
|
1061
|
+
0.971, while LoRA reaches 0.974" where 0.971 is closer to LoRA
|
|
1062
|
+
in raw distance even though it semantically belongs to TF-IDF.
|
|
1063
|
+
|
|
1064
|
+
v1.2.0: now returns ``(key, position)`` instead of just ``key``
|
|
1065
|
+
so callers can apply position-dependent secondary checks (e.g.,
|
|
1066
|
+
T4 sentence-boundary detector-pair reject). The slice-pairing
|
|
1067
|
+
call site discards the position.
|
|
755
1068
|
"""
|
|
756
1069
|
if not positions:
|
|
757
1070
|
return None
|
|
758
1071
|
# Look for the LAST position strictly before the value, within range.
|
|
759
|
-
last_before: str | None = None
|
|
1072
|
+
last_before: tuple[str, int] | None = None
|
|
760
1073
|
for pos, key in positions:
|
|
761
1074
|
if pos < value_pos and (value_pos - pos) <= max_distance:
|
|
762
|
-
last_before = key
|
|
1075
|
+
last_before = (key, pos)
|
|
763
1076
|
elif pos >= value_pos:
|
|
764
1077
|
break
|
|
765
1078
|
if last_before is not None:
|
|
@@ -767,7 +1080,7 @@ def _nearest_canonical_key(
|
|
|
767
1080
|
# Fall back: FIRST position after the value, within range.
|
|
768
1081
|
for pos, key in positions:
|
|
769
1082
|
if pos >= value_pos and (pos - value_pos) <= max_distance:
|
|
770
|
-
return key
|
|
1083
|
+
return (key, pos)
|
|
771
1084
|
return None
|
|
772
1085
|
|
|
773
1086
|
|
|
@@ -1429,7 +1429,7 @@
|
|
|
1429
1429
|
"doc_first_line": "str(object='') -> str",
|
|
1430
1430
|
"kind": "value",
|
|
1431
1431
|
"type": "str",
|
|
1432
|
-
"value": "'1.
|
|
1432
|
+
"value": "'1.2.0'"
|
|
1433
1433
|
},
|
|
1434
1434
|
"apply_operating_points": {
|
|
1435
1435
|
"doc_first_line": "Apply fitted thresholds to a mixed-class or single-class target slice.",
|
|
@@ -1979,7 +1979,7 @@
|
|
|
1979
1979
|
"validate_reader_value_bindings": {
|
|
1980
1980
|
"doc_first_line": "Validate (detector, metric, value) bindings in reader-prose markdown.",
|
|
1981
1981
|
"kind": "function",
|
|
1982
|
-
"signature": "(*, files: 'Sequence[Path | str]', bindings:
|
|
1982
|
+
"signature": "(*, files: 'Sequence[Path | str]', bindings: 'Mapping[BindingKey | tuple[str, str] | tuple[str, str, str], float]', value_pattern: 'str' = '\\\\d+\\\\.\\\\d{2,4}', max_distance_chars: 'int' = 80, metric_aliases: 'Mapping[str, Sequence[str]]' = mappingproxy({}), detector_aliases: 'Mapping[str, Sequence[str]]' = mappingproxy({}), slice_aliases: 'Mapping[str, Sequence[str]] | None' = None, slice_window_chars: 'int' = 120, scope: \"Literal['all', 'narrative']\" = 'all', tolerance: 'float' = 0.0001) -> 'ValueBindingsReport'"
|
|
1983
1983
|
},
|
|
1984
1984
|
"validate_results": {
|
|
1985
1985
|
"doc_first_line": "Validate a serialized ``RunResult`` payload against ``results.v1.json``.",
|