eval-toolkit 1.2.0__tar.gz → 1.3.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/CHANGELOG.md +114 -0
- {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/PKG-INFO +1 -1
- {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/docs/source/adr/README.md +1 -0
- {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/src/eval_toolkit/_version.py +1 -1
- {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/src/eval_toolkit/audit_value_bindings.py +295 -13
- {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/golden/public_api/snapshot.json +1 -1
- {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_audit_value_bindings.py +266 -0
- {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/.gitignore +0 -0
- {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/LICENSE +0 -0
- {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/README.md +0 -0
- {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/STYLE.md +0 -0
- {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/docs/archive/README.md +0 -0
- {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/docs/research/README.md +0 -0
- {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/docs/research/datasets/README.md +0 -0
- {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/docs/research/papers/data-integrity/README.md +0 -0
- {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/docs/research/papers/eval-ecosystem/README.md +0 -0
- {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/docs/research/papers/inference/README.md +0 -0
- {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/docs/research/papers/prompt-injection/README.md +0 -0
- {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/docs/source/methodology/README.md +0 -0
- {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/pyproject.toml +0 -0
- {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/src/eval_toolkit/__init__.py +0 -0
- {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/src/eval_toolkit/__main__.py +0 -0
- {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/src/eval_toolkit/_deprecated.py +0 -0
- {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/src/eval_toolkit/_parallel.py +0 -0
- {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/src/eval_toolkit/_rng.py +0 -0
- {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/src/eval_toolkit/_sweep.py +0 -0
- {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/src/eval_toolkit/adversarial.py +0 -0
- {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/src/eval_toolkit/analysis.py +0 -0
- {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/src/eval_toolkit/artifacts.py +0 -0
- {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/src/eval_toolkit/audit_citation_alignment.py +0 -0
- {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/src/eval_toolkit/audit_sister_doc_concept_drift.py +0 -0
- {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/src/eval_toolkit/bootstrap.py +0 -0
- {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/src/eval_toolkit/calibration.py +0 -0
- {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/src/eval_toolkit/claims.py +0 -0
- {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/src/eval_toolkit/config.py +0 -0
- {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/src/eval_toolkit/docs.py +0 -0
- {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/src/eval_toolkit/embeddings.py +0 -0
- {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/src/eval_toolkit/evidence.py +0 -0
- {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/src/eval_toolkit/harness.py +0 -0
- {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/src/eval_toolkit/leakage.py +0 -0
- {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/src/eval_toolkit/loaders.py +0 -0
- {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/src/eval_toolkit/losses.py +0 -0
- {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/src/eval_toolkit/manifest.py +0 -0
- {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/src/eval_toolkit/metric_specs.py +0 -0
- {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/src/eval_toolkit/metrics.py +0 -0
- {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/src/eval_toolkit/operating_points.py +0 -0
- {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/src/eval_toolkit/paths.py +0 -0
- {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/src/eval_toolkit/plotting.py +0 -0
- {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/src/eval_toolkit/preprocessing.py +0 -0
- {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/src/eval_toolkit/probes.py +0 -0
- {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/src/eval_toolkit/protocols.py +0 -0
- {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/src/eval_toolkit/provenance.py +0 -0
- {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/src/eval_toolkit/py.typed +0 -0
- {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/src/eval_toolkit/schemas/manifest.v1.json +0 -0
- {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/src/eval_toolkit/schemas/manifest.v2.json +0 -0
- {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/src/eval_toolkit/schemas/manifest.v3.json +0 -0
- {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/src/eval_toolkit/schemas/ood_manifest.v1.json +0 -0
- {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/src/eval_toolkit/schemas/results.v1.json +0 -0
- {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/src/eval_toolkit/schemas/results_full.v1.json +0 -0
- {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/src/eval_toolkit/scorecards.py +0 -0
- {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/src/eval_toolkit/seeds.py +0 -0
- {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/src/eval_toolkit/splits.py +0 -0
- {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/src/eval_toolkit/stacking.py +0 -0
- {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/src/eval_toolkit/text_dedup.py +0 -0
- {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/src/eval_toolkit/thresholds.py +0 -0
- {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/baseline/test_plotting_visual/plot_bootstrap_distribution.png +0 -0
- {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/baseline/test_plotting_visual/plot_confusion_matrix_grid.png +0 -0
- {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/baseline/test_plotting_visual/plot_lift_ci.png +0 -0
- {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/baseline/test_plotting_visual/plot_metric_bars.png +0 -0
- {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/baseline/test_plotting_visual/plot_pareto_frontier.png +0 -0
- {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/baseline/test_plotting_visual/plot_pr_curve.png +0 -0
- {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/baseline/test_plotting_visual/plot_reliability_diagram.png +0 -0
- {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/baseline/test_plotting_visual/plot_roc_curve.png +0 -0
- {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/baseline/test_plotting_visual/plot_score_histograms.png +0 -0
- {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/baseline/test_plotting_visual/plot_slice_metric_heatmap.png +0 -0
- {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/benchmarks/__init__.py +0 -0
- {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/benchmarks/test_kernel_benchmarks.py +0 -0
- {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/conftest.py +0 -0
- {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/golden/bootstrap_ci/cases.json +0 -0
- {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/golden/data/dedup_holdout.jsonl +0 -0
- {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/golden/data/dedup_holdout_expected.json +0 -0
- {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/golden/data/dedup_holdout_provenance.md +0 -0
- {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/golden/docs/expected.md +0 -0
- {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/golden/docs/input.md +0 -0
- {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/golden/docs/metrics.json +0 -0
- {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/golden/test_dedup_holdout_calibration.py +0 -0
- {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/strategies.py +0 -0
- {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_adversarial.py +0 -0
- {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_analysis.py +0 -0
- {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_artifacts.py +0 -0
- {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_audit_citation_alignment.py +0 -0
- {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_audit_sister_doc_concept_drift.py +0 -0
- {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_block_bootstrap_on_folds.py +0 -0
- {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_bootstrap_calibration_mc.py +0 -0
- {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_bootstrap_edge_cases.py +0 -0
- {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_bootstrap_golden.py +0 -0
- {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_bootstrap_njobs.py +0 -0
- {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_bootstrap_props.py +0 -0
- {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_bootstrap_research_grounded.py +0 -0
- {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_bootstrap_unit.py +0 -0
- {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_calibration_binary_adapters.py +0 -0
- {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_calibration_bootstrap_chain.py +0 -0
- {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_calibration_determinism.py +0 -0
- {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_calibration_optimization_failures.py +0 -0
- {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_calibration_props.py +0 -0
- {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_calibration_research_grounded.py +0 -0
- {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_calibration_unit.py +0 -0
- {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_claims.py +0 -0
- {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_claims_coverage.py +0 -0
- {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_claims_props.py +0 -0
- {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_cli.py +0 -0
- {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_config.py +0 -0
- {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_coverage_bootstrap.py +0 -0
- {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_coverage_calibration.py +0 -0
- {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_coverage_harness.py +0 -0
- {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_coverage_metrics.py +0 -0
- {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_coverage_plotting.py +0 -0
- {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_croissant_e2e.py +0 -0
- {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_dedup_split_leakage_chain.py +0 -0
- {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_deprecated_scalars_shim.py +0 -0
- {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_deprecations.py +0 -0
- {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_docs_golden.py +0 -0
- {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_docs_props.py +0 -0
- {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_embeddings.py +0 -0
- {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_evidence_validators.py +0 -0
- {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_harness_edge_cases.py +0 -0
- {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_harness_fault_injection.py +0 -0
- {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_harness_folded.py +0 -0
- {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_harness_internals.py +0 -0
- {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_harness_metric_options.py +0 -0
- {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_harness_parallelism.py +0 -0
- {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_harness_smoke.py +0 -0
- {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_import_boundaries.py +0 -0
- {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_is_metric_defined_for_slice.py +0 -0
- {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_lazy_extras_messages.py +0 -0
- {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_leakage.py +0 -0
- {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_leakage_error_paths.py +0 -0
- {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_leakage_props.py +0 -0
- {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_loaders.py +0 -0
- {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_loaders_coverage.py +0 -0
- {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_loaders_props.py +0 -0
- {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_logging.py +0 -0
- {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_losses.py +0 -0
- {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_manifest.py +0 -0
- {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_manifest_contamination_round_trip.py +0 -0
- {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_manifest_props.py +0 -0
- {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_manifest_validation.py +0 -0
- {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_metrics_props.py +0 -0
- {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_metrics_stratified_subsets.py +0 -0
- {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_metrics_unit.py +0 -0
- {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_misc_coverage.py +0 -0
- {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_numeric_edge_cases.py +0 -0
- {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_ood_loader.py +0 -0
- {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_operating_points.py +0 -0
- {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_operating_points_props.py +0 -0
- {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_parallel.py +0 -0
- {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_paths.py +0 -0
- {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_pipeline_e2e.py +0 -0
- {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_plotting_edge.py +0 -0
- {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_plotting_smoke.py +0 -0
- {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_plotting_visual.py +0 -0
- {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_preprocessing.py +0 -0
- {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_probes.py +0 -0
- {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_protocol_conformance.py +0 -0
- {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_provenance.py +0 -0
- {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_public_api.py +0 -0
- {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_recall_at_fpr.py +0 -0
- {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_reference_equivalence.py +0 -0
- {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_reproducibility_integration.py +0 -0
- {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_rng.py +0 -0
- {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_schemas.py +0 -0
- {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_scorecard.py +0 -0
- {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_seeds.py +0 -0
- {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_splits.py +0 -0
- {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_splits_leakage_integration.py +0 -0
- {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_splits_props.py +0 -0
- {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_stacking.py +0 -0
- {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_sweep.py +0 -0
- {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_text_dedup.py +0 -0
- {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_text_dedup_coverage.py +0 -0
- {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_text_dedup_props.py +0 -0
- {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_text_dedup_strategies.py +0 -0
- {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_thresholds.py +0 -0
- {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_thresholds_constant_score.py +0 -0
- {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_thresholds_coverage.py +0 -0
- {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_thresholds_props.py +0 -0
- {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_thresholds_research_grounded.py +0 -0
- {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_tokenization_leakage_check.py +0 -0
- {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_v09_contracts.py +0 -0
|
@@ -5,6 +5,120 @@ All notable changes to this project will be documented in this file.
|
|
|
5
5
|
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
|
|
6
6
|
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
|
7
7
|
|
|
8
|
+
## [1.3.0] — 2026-05-26 — `audit_value_bindings` cross-detector list-grammar pairing rules (closes #81)
|
|
9
|
+
|
|
10
|
+
Tier-1 ADDITIVE per [ADR 0003](docs/source/adr/0003-stability-contract-and-gate3-methodology.md).
|
|
11
|
+
Closes [#81](https://github.com/brandon-behring/eval-toolkit/issues/81)
|
|
12
|
+
— consumer-feedback follow-on after v1.2.0's adoption at
|
|
13
|
+
`prompt-injection-detection-submission@v1.3.12` (4 residual
|
|
14
|
+
warnings, all cross-detector list-grammar or metric-axis
|
|
15
|
+
confusion). Introduces **Layer 3 — pairing rules** as the third
|
|
16
|
+
correctness layer alongside ADR 0005's identity + scope model
|
|
17
|
+
(see new [ADR 0006](docs/source/adr/0006-pairing-rules-for-cross-detector-list-grammar.md)).
|
|
18
|
+
|
|
19
|
+
Consumer-side dogfood result: **4 → 0 warnings**. Combined with
|
|
20
|
+
v1.1.0 + v1.2.0, **100% reduction vs the pre-fix v1.0.5 baseline**
|
|
21
|
+
on the consumer's writeup (95 → 0).
|
|
22
|
+
|
|
23
|
+
### Added — `audit_value_bindings.py` Layer 3 pairing rules
|
|
24
|
+
|
|
25
|
+
All four rules activate ONLY when `scope="narrative"`. Legacy
|
|
26
|
+
`scope="all"` callers see zero behavior change. No new public
|
|
27
|
+
kwargs; keyword sets are hardcoded module-level `frozenset`
|
|
28
|
+
constants.
|
|
29
|
+
|
|
30
|
+
- **Pattern A — `"for {detector}"` postfix override.** When a
|
|
31
|
+
candidate value is followed (within +50 chars) by `"for
|
|
32
|
+
{detector_alias}"` AND no other value lies between, the
|
|
33
|
+
postfix is authoritative: confirms pairing for this binding
|
|
34
|
+
OR skips if it names a different canonical detector.
|
|
35
|
+
Intervening-value check uses the v1.1.0 exclusion-ranges
|
|
36
|
+
infrastructure (CI brackets like `[0.286, 0.301]` don't count
|
|
37
|
+
as intervening values).
|
|
38
|
+
- **Pattern B — `"{detector}'s"` possessive override.** Same
|
|
39
|
+
mechanics; scans −80 chars before the value. Last possessive
|
|
40
|
+
in the pre-window is authoritative if its end is within 30
|
|
41
|
+
chars of the value start. Catches both immediate `"frozen
|
|
42
|
+
probe's 0.515"` and short-clause `"LoRA's ... AUROC is 0.383"`.
|
|
43
|
+
- **Pattern C — group-subject suppression.** When prose contains
|
|
44
|
+
`"for the {trained|frozen|baseline|all|both|other} detectors"`
|
|
45
|
+
within ±60 chars of the value AND on the same side of any
|
|
46
|
+
sentence boundary, the value is suppressed (it refers to a
|
|
47
|
+
multi-detector group statement that doesn't bind to a single
|
|
48
|
+
canonical detector). Multi-detector inference deferred to v1.4.0+
|
|
49
|
+
per ADR 0006.
|
|
50
|
+
- **Pattern D — metric-axis nearest-pairing.** Symmetric to
|
|
51
|
+
detector-axis pairing. Pre-collects ALL metric positions per
|
|
52
|
+
file (across `metric_aliases` keys, not just binding-derived
|
|
53
|
+
metrics). Requires the NEAREST metric to the value to be THIS
|
|
54
|
+
binding's metric. Catches prose like `"AUPRC delta suggests:
|
|
55
|
+
... AUROC is 0.383"` where the wider window-based metric
|
|
56
|
+
proximity check picks up the wrong metric.
|
|
57
|
+
|
|
58
|
+
### Internal changes (no public API impact)
|
|
59
|
+
|
|
60
|
+
- New module-level constants:
|
|
61
|
+
- `_GROUP_SUBJECT_KEYWORDS: frozenset[str]` — group adjectives.
|
|
62
|
+
- `_GROUP_SUBJECT_PATTERN: re.Pattern[str]` — compiled regex
|
|
63
|
+
matching `"for the {kw} detectors?"`.
|
|
64
|
+
- New private helpers:
|
|
65
|
+
- `_build_postfix_pattern(detector_aliases, detector_keys)` —
|
|
66
|
+
per-call regex builder for Pattern A.
|
|
67
|
+
- `_build_possessive_pattern(detector_aliases, detector_keys)` —
|
|
68
|
+
per-call regex builder for Pattern B.
|
|
69
|
+
- `metric_patterns` build extended to use the union of
|
|
70
|
+
`binding-derived` and `metric_aliases.keys()` so Pattern D can
|
|
71
|
+
pair against unbound-but-aliased metrics.
|
|
72
|
+
- Inner loop reordered to apply C-suppress → Pattern A → Pattern B
|
|
73
|
+
before proximity-based detector pairing. Pattern A/B record a
|
|
74
|
+
`pairing_confirmed_pos` that BYPASSES proximity when the override
|
|
75
|
+
confirms THIS binding's detector.
|
|
76
|
+
- Pattern D added as a separate check after the existing
|
|
77
|
+
metric_close proximity test.
|
|
78
|
+
|
|
79
|
+
### Dogfood evidence (compounded across the cycle)
|
|
80
|
+
|
|
81
|
+
| Release | Configuration | Warnings on consumer HEAD | Reduction vs v1.0.5 |
|
|
82
|
+
|---|---|---|---|
|
|
83
|
+
| v1.0.5 | Legacy 2-tuple, no scope | 95 | — |
|
|
84
|
+
| v1.1.0 | BindingKey + scope='narrative' content-type | 23 | -76% |
|
|
85
|
+
| v1.2.0 | + T1–T4 context filters | 7 | -93% |
|
|
86
|
+
| **v1.3.0** | + Pattern A/B/C/D pairing rules | **0** | **-100%** |
|
|
87
|
+
|
|
88
|
+
### Consumer adoption path
|
|
89
|
+
|
|
90
|
+
`prompt-injection-detection-submission` and other consumers using
|
|
91
|
+
`scope="narrative"` get the v1.3.0 pairing rules automatically with
|
|
92
|
+
no code change. Recommended migration:
|
|
93
|
+
|
|
94
|
+
1. Re-pin `eval-toolkit>=1.3.0,<2` (additive; no consumer code
|
|
95
|
+
change required).
|
|
96
|
+
2. **HARD-gate promotion is now credible.** With 0 residual
|
|
97
|
+
warnings, `audit_value_bindings` can be promoted from SOFT to
|
|
98
|
+
HARD (failing CI on violations) bundled with
|
|
99
|
+
`audit_citation_alignment` per the consumer's v1.3.8
|
|
100
|
+
bundled-promotion plan.
|
|
101
|
+
|
|
102
|
+
### Tests
|
|
103
|
+
|
|
104
|
+
43 in `tests/test_audit_value_bindings.py` (36 from v1.2.0 + 7
|
|
105
|
+
new for Pattern A/B/C/D + unknown-alias fall-through + scope='all'
|
|
106
|
+
backward-compat + combined dogfood). All pass. Public API
|
|
107
|
+
snapshot regenerated for `__version__` bump only (no signature
|
|
108
|
+
changes).
|
|
109
|
+
|
|
110
|
+
### Out of scope (deferred)
|
|
111
|
+
|
|
112
|
+
- **Multi-detector inference for Pattern C** — replace
|
|
113
|
+
suppression with explicit iteration over implied group
|
|
114
|
+
detectors. ~250 LOC; v1.4.0+ candidate if consumer demand
|
|
115
|
+
emerges.
|
|
116
|
+
- **Enumeration parsing** — `"X scored Y, Z, W for A, B, C
|
|
117
|
+
respectively"` patterns. Not in #81; v1.4.0+ if needed.
|
|
118
|
+
- **Markdown AST parsing** (ADR 0005 §A4) — v2.0 territory.
|
|
119
|
+
- **Public kwargs for pairing-rule keyword extension** — YAGNI;
|
|
120
|
+
add in v1.3.x patch if demand emerges.
|
|
121
|
+
|
|
8
122
|
## [1.2.0] — 2026-05-26 — `audit_value_bindings` context-aware noise reduction (consumer-feedback follow-on to #80)
|
|
9
123
|
|
|
10
124
|
Tier-1 ADDITIVE per [ADR 0003](docs/source/adr/0003-stability-contract-and-gate3-methodology.md).
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: eval-toolkit
|
|
3
|
-
Version: 1.2.0
|
|
3
|
+
Version: 1.3.0
|
|
4
4
|
Summary: Reusable evaluation contracts for binary classification: metrics, bootstrap CIs, calibration, artifacts, and evidence gates.
|
|
5
5
|
Project-URL: Homepage, https://github.com/brandon-behring/eval-toolkit
|
|
6
6
|
Project-URL: Documentation, https://brandon-behring.github.io/eval-toolkit/
|
|
@@ -78,3 +78,4 @@ What would have to change for this decision to be reopened?
|
|
|
78
78
|
| [0003](0003-stability-contract-and-gate3-methodology.md) | v1.0 stability contract + Gate 3 methodology | Accepted | 2026-05-21 |
|
|
79
79
|
| [0004](0004-naming-conventions.md) | Naming conventions for modules, classes, and parameters | Accepted | 2026-05-23 |
|
|
80
80
|
| [0005](0005-structured-keys-for-audit-validators.md) | Structured keys over positional tuples for canonical-identity types in audit validators | Accepted | 2026-05-26 |
|
|
81
|
+
| [0006](0006-pairing-rules-for-cross-detector-list-grammar.md) | Pairing rules for cross-detector list-grammar in audit validators | Accepted | 2026-05-26 |
|
|
@@ -154,6 +154,51 @@ _DELTA_PATTERN: re.Pattern[str] = _compile_keyword_pattern(_DELTA_KEYWORDS)
|
|
|
154
154
|
_FLOOR_PATTERN: re.Pattern[str] = _compile_keyword_pattern(_FLOOR_KEYWORDS)
|
|
155
155
|
|
|
156
156
|
|
|
157
|
+
# v1.3.0 Layer 3 (pairing rules) per ADR 0006. Three rules that
|
|
158
|
+
# override or suppress the proximity-based detector pairing under
|
|
159
|
+
# explicit grammar cues:
|
|
160
|
+
#
|
|
161
|
+
# - Pattern A: "for {detector}" postfix → re-pair value to that
|
|
162
|
+
# detector (override). Built per-call via _build_postfix_pattern
|
|
163
|
+
# since it depends on the consumer's detector_aliases dict.
|
|
164
|
+
# - Pattern B: "{detector}'s ... is {value}" possessive → re-pair
|
|
165
|
+
# value to the possessor (override). Built per-call via
|
|
166
|
+
# _build_possessive_pattern.
|
|
167
|
+
# - Pattern C: "for the {trained|frozen|baseline|all|both|other}
|
|
168
|
+
# detectors" group subject → suppress the candidate value entirely
|
|
169
|
+
# (it's a group statement that doesn't bind to a single detector).
|
|
170
|
+
# Pattern is detector-independent so it compiles once at module
|
|
171
|
+
# load.
|
|
172
|
+
|
|
173
|
+
# Group-subject adjectives that introduce a multi-detector statement.
|
|
174
|
+
# When prose says "for the trained detectors", the following value
|
|
175
|
+
# refers to a GROUP (LoRA + TF-IDF + ... whatever bindings exist),
|
|
176
|
+
# not a single canonical detector. The validator can't infer which
|
|
177
|
+
# specific detectors own the group value with positional heuristics,
|
|
178
|
+
# so v1.3.0 suppresses the candidate rather than attempting multi-
|
|
179
|
+
# detector inference (a v1.4.0+ candidate per ADR 0006).
|
|
180
|
+
_GROUP_SUBJECT_KEYWORDS: frozenset[str] = frozenset(
|
|
181
|
+
{
|
|
182
|
+
"trained",
|
|
183
|
+
"frozen",
|
|
184
|
+
"baseline",
|
|
185
|
+
"all",
|
|
186
|
+
"both",
|
|
187
|
+
"other",
|
|
188
|
+
}
|
|
189
|
+
)
|
|
190
|
+
|
|
191
|
+
# Module-level: detector-independent group-subject regex. Matches
|
|
192
|
+
# "for the {trained|frozen|...} detectors" (with optional "the"; both
|
|
193
|
+
# singular and plural "detector"/"detectors" tolerated).
|
|
194
|
+
_GROUP_SUBJECT_PATTERN: re.Pattern[str] = re.compile(
|
|
195
|
+
r"\bfor\s+(?:the\s+)?(?:"
|
|
196
|
+
+ "|".join(sorted(re.escape(kw) for kw in _GROUP_SUBJECT_KEYWORDS))
|
|
197
|
+
+ r")\s+detectors?\b",
|
|
198
|
+
re.IGNORECASE,
|
|
199
|
+
)
|
|
200
|
+
|
|
201
|
+
|
|
157
202
|
@dataclass(frozen=True)
|
|
158
203
|
class BindingKey:
|
|
159
204
|
"""Canonical identity for a `(detector, metric, slice)` measurement.
|
|
@@ -469,7 +514,13 @@ def validate_reader_value_bindings(
|
|
|
469
514
|
slice_aliases_dict: dict[str, Sequence[str]] = dict(slice_aliases) if slice_aliases else {}
|
|
470
515
|
|
|
471
516
|
detector_keys = sorted({k.detector for k in canonical_bindings})
|
|
472
|
-
|
|
517
|
+
# v1.3.0 Pattern D (metric-axis nearest-pairing) requires knowing
|
|
518
|
+
# ALL metrics that might appear in prose, not just bound metrics —
|
|
519
|
+
# e.g., when prose mentions AUROC near a value but only AUPRC is
|
|
520
|
+
# bound, Pattern D needs the AUROC pattern to correctly pair the
|
|
521
|
+
# value with the right metric. Union of binding metrics +
|
|
522
|
+
# consumer-supplied metric_aliases keys.
|
|
523
|
+
metric_keys = sorted({k.metric for k in canonical_bindings} | set(metric_aliases_dict.keys()))
|
|
473
524
|
# Only compile slice patterns for non-"any" slice keys; "any"
|
|
474
525
|
# signals legacy 2-tuple semantics (no slice scoping).
|
|
475
526
|
slice_keys = sorted({k.slice for k in canonical_bindings if k.slice != "any"})
|
|
@@ -488,6 +539,43 @@ def validate_reader_value_bindings(
|
|
|
488
539
|
}
|
|
489
540
|
value_re = re.compile(value_pattern)
|
|
490
541
|
|
|
542
|
+
# v1.3.0 Layer 3 pairing rules (per ADR 0006). Built per-call
|
|
543
|
+
# because Patterns A and B depend on the consumer's detector
|
|
544
|
+
# aliases. `None` when scope="all" (legacy; rules don't fire).
|
|
545
|
+
postfix_pat: re.Pattern[str] | None = (
|
|
546
|
+
_build_postfix_pattern(detector_aliases_dict, detector_keys)
|
|
547
|
+
if scope == "narrative"
|
|
548
|
+
else None
|
|
549
|
+
)
|
|
550
|
+
possessive_pat: re.Pattern[str] | None = (
|
|
551
|
+
_build_possessive_pattern(detector_aliases_dict, detector_keys)
|
|
552
|
+
if scope == "narrative"
|
|
553
|
+
else None
|
|
554
|
+
)
|
|
555
|
+
|
|
556
|
+
# Inverse-alias index: alias-regex (string form) → canonical key.
|
|
557
|
+
# Used to resolve a matched postfix/possessive alias-group back to
|
|
558
|
+
# the canonical detector for override resolution. Each alias regex
|
|
559
|
+
# is keyed verbatim; the resolution path tries each canonical key's
|
|
560
|
+
# alias list + canonical-name fallback.
|
|
561
|
+
def _resolve_canonical_from_alias_match(alias_text: str) -> str | None:
|
|
562
|
+
"""Return the canonical detector key whose pattern matched ``alias_text``.
|
|
563
|
+
|
|
564
|
+
Iterates the per-detector patterns and tries to match the
|
|
565
|
+
alias_text. Uses re.IGNORECASE for consistency with the
|
|
566
|
+
outer postfix/possessive patterns. First-match wins (the
|
|
567
|
+
OR-build above means there's only one canonical key per
|
|
568
|
+
match anyway in practice).
|
|
569
|
+
"""
|
|
570
|
+
for det_key in detector_keys:
|
|
571
|
+
det_pat = detector_patterns[det_key]
|
|
572
|
+
# det_pat is the alias OR pattern from _build_pattern,
|
|
573
|
+
# case-insensitive. fullmatch on the alias_text checks
|
|
574
|
+
# whether this alias belongs to det_key's set.
|
|
575
|
+
if det_pat.fullmatch(alias_text):
|
|
576
|
+
return det_key
|
|
577
|
+
return None
|
|
578
|
+
|
|
491
579
|
violations: list[Violation] = []
|
|
492
580
|
matched: list[Match] = []
|
|
493
581
|
matched_keys: set[BindingKey] = set()
|
|
@@ -542,6 +630,19 @@ def validate_reader_value_bindings(
|
|
|
542
630
|
slice_positions.append((s_match.start(), s_key))
|
|
543
631
|
slice_positions.sort()
|
|
544
632
|
|
|
633
|
+
# v1.3.0 Pattern D — metric-axis nearest-pairing (Layer 3 per
|
|
634
|
+
# ADR 0006, narrative-scope only). Pre-collect ALL metric
|
|
635
|
+
# positions so each value can be paired with its NEAREST
|
|
636
|
+
# metric mention (text-order). Catches the case where prose
|
|
637
|
+
# mentions BOTH metrics ("AUPRC delta suggests: AUROC 0.383")
|
|
638
|
+
# and the validator's window-based metric proximity check
|
|
639
|
+
# picks up the wrong metric. Symmetric to detector pairing.
|
|
640
|
+
metric_positions: list[tuple[int, str]] = [] # (position, canonical_metric)
|
|
641
|
+
for m_key, m_re in metric_patterns.items():
|
|
642
|
+
for m_match in m_re.finditer(text):
|
|
643
|
+
metric_positions.append((m_match.start(), m_key))
|
|
644
|
+
metric_positions.sort()
|
|
645
|
+
|
|
545
646
|
# For each canonical binding, look in each file for triples.
|
|
546
647
|
for canonical_key, expected in canonical_bindings.items():
|
|
547
648
|
det_key = canonical_key.detector
|
|
@@ -615,18 +716,110 @@ def validate_reader_value_bindings(
|
|
|
615
716
|
):
|
|
616
717
|
continue
|
|
617
718
|
|
|
618
|
-
#
|
|
619
|
-
#
|
|
620
|
-
#
|
|
621
|
-
#
|
|
622
|
-
#
|
|
623
|
-
#
|
|
624
|
-
|
|
625
|
-
|
|
626
|
-
)
|
|
627
|
-
if
|
|
628
|
-
|
|
629
|
-
|
|
719
|
+
# v1.3.0 Pattern C — group-subject suppression
|
|
720
|
+
# (narrative-scope only). When prose says "for the
|
|
721
|
+
# {trained|frozen|baseline|all|both|other}
|
|
722
|
+
# detectors" within ±60 chars of the value AND on
|
|
723
|
+
# the same side of any sentence boundary, the
|
|
724
|
+
# value refers to a multi-detector group statement
|
|
725
|
+
# that doesn't bind to a single canonical detector.
|
|
726
|
+
# Suppress the candidate (v1.4.0+ may attempt
|
|
727
|
+
# multi-detector inference per ADR 0006).
|
|
728
|
+
if scope == "narrative":
|
|
729
|
+
gs_start = max(0, val_start_in_full - 60)
|
|
730
|
+
gs_end = min(len(text), val_start_in_full + len(val_str) + 60)
|
|
731
|
+
gs_match = _GROUP_SUBJECT_PATTERN.search(text, gs_start, gs_end)
|
|
732
|
+
if gs_match is not None and not _crosses_sentence_boundary(
|
|
733
|
+
gs_match.start(), val_start_in_full, sentence_positions
|
|
734
|
+
):
|
|
735
|
+
continue
|
|
736
|
+
|
|
737
|
+
# v1.3.0 Pattern A / B — Layer 3 pairing-rule
|
|
738
|
+
# OVERRIDES (narrative-scope only). When a postfix
|
|
739
|
+
# or possessive explicitly names a detector, the
|
|
740
|
+
# override is AUTHORITATIVE — it confirms or
|
|
741
|
+
# rejects the binding without falling through to
|
|
742
|
+
# the proximity-based detector pairing below.
|
|
743
|
+
#
|
|
744
|
+
# - postfix_confirmed_pos / possessive_confirmed_pos:
|
|
745
|
+
# the character position of the override match,
|
|
746
|
+
# used as the effective "paired detector
|
|
747
|
+
# position" for downstream T4 (sentence-
|
|
748
|
+
# boundary) check.
|
|
749
|
+
# - If postfix/possessive_canonical == det_key:
|
|
750
|
+
# confirmed; bypass proximity.
|
|
751
|
+
# - If != det_key AND is in bindings: skip (the
|
|
752
|
+
# other detector's loop iteration claims it).
|
|
753
|
+
# - If doesn't resolve / no match: fall through
|
|
754
|
+
# to proximity-based pairing.
|
|
755
|
+
pairing_confirmed_pos: int | None = None
|
|
756
|
+
|
|
757
|
+
# Pattern A — "for {detector}" postfix
|
|
758
|
+
if postfix_pat is not None:
|
|
759
|
+
val_end = val_start_in_full + len(val_str)
|
|
760
|
+
pf_match = postfix_pat.search(text, val_end, min(len(text), val_end + 50))
|
|
761
|
+
if pf_match is not None:
|
|
762
|
+
# Intervening-value guard: prose like
|
|
763
|
+
# "X 0.971 versus 0.293 for LoRA" — the
|
|
764
|
+
# "for LoRA" postfix belongs to 0.293,
|
|
765
|
+
# not 0.971. CI brackets like `[0.283,
|
|
766
|
+
# 0.298]` are excluded from intervening
|
|
767
|
+
# consideration via the existing
|
|
768
|
+
# excluded_ranges (v1.1.0 scope filter):
|
|
769
|
+
# values inside brackets aren't real
|
|
770
|
+
# binding-candidate intervening values.
|
|
771
|
+
intervening: re.Match[str] | None = None
|
|
772
|
+
for m in value_re.finditer(text, val_end, pf_match.start()):
|
|
773
|
+
if not (
|
|
774
|
+
excluded_ranges and _is_excluded(m.start(), excluded_ranges)
|
|
775
|
+
):
|
|
776
|
+
intervening = m
|
|
777
|
+
break
|
|
778
|
+
if intervening is None:
|
|
779
|
+
postfix_canonical = _resolve_canonical_from_alias_match(
|
|
780
|
+
pf_match.group(1)
|
|
781
|
+
)
|
|
782
|
+
if postfix_canonical is not None:
|
|
783
|
+
if postfix_canonical != det_key:
|
|
784
|
+
continue
|
|
785
|
+
pairing_confirmed_pos = pf_match.start()
|
|
786
|
+
|
|
787
|
+
# Pattern B — possessive `'s` (only if Pattern A
|
|
788
|
+
# didn't already confirm). Find the LAST possessive
|
|
789
|
+
# in the −80 char pre-window; if its end is within
|
|
790
|
+
# 30 chars of the value start, apply override.
|
|
791
|
+
if pairing_confirmed_pos is None and possessive_pat is not None:
|
|
792
|
+
ps_matches = list(
|
|
793
|
+
possessive_pat.finditer(
|
|
794
|
+
text, max(0, val_start_in_full - 80), val_start_in_full
|
|
795
|
+
)
|
|
796
|
+
)
|
|
797
|
+
if ps_matches:
|
|
798
|
+
ps_match = ps_matches[-1]
|
|
799
|
+
if val_start_in_full - ps_match.end() <= 30:
|
|
800
|
+
possessive_canonical = _resolve_canonical_from_alias_match(
|
|
801
|
+
ps_match.group(1)
|
|
802
|
+
)
|
|
803
|
+
if possessive_canonical is not None:
|
|
804
|
+
if possessive_canonical != det_key:
|
|
805
|
+
continue
|
|
806
|
+
pairing_confirmed_pos = ps_match.start()
|
|
807
|
+
|
|
808
|
+
# Detector pairing: when a Layer 3 override
|
|
809
|
+
# confirmed the binding (pairing_confirmed_pos
|
|
810
|
+
# set), skip the proximity check — the postfix /
|
|
811
|
+
# possessive is authoritative. Otherwise, fall
|
|
812
|
+
# back to the text-order proximity rule (last
|
|
813
|
+
# detector before; else first detector after).
|
|
814
|
+
if pairing_confirmed_pos is not None:
|
|
815
|
+
paired_det_pos = pairing_confirmed_pos
|
|
816
|
+
else:
|
|
817
|
+
detector_match = _nearest_canonical_key(
|
|
818
|
+
detector_positions, val_start_in_full, max_distance_chars
|
|
819
|
+
)
|
|
820
|
+
if detector_match is None or detector_match[0] != det_key:
|
|
821
|
+
continue
|
|
822
|
+
paired_det_pos = detector_match[1]
|
|
630
823
|
|
|
631
824
|
# v1.2.0 T4 (narrative-scope only): reject the
|
|
632
825
|
# detector-value pair if a sentence boundary lies
|
|
@@ -646,6 +839,22 @@ def validate_reader_value_bindings(
|
|
|
646
839
|
if not met_close:
|
|
647
840
|
continue
|
|
648
841
|
|
|
842
|
+
# v1.3.0 Pattern D — metric-axis nearest-pairing
|
|
843
|
+
# (narrative-scope only). Require the NEAREST
|
|
844
|
+
# metric mention to the value (by text-order
|
|
845
|
+
# last-before-first-after) to be THIS binding's
|
|
846
|
+
# canonical metric. Catches prose like "than the
|
|
847
|
+
# AUPRC delta suggests: LoRA's pooled OOD AUROC
|
|
848
|
+
# is 0.383" where the AUPRC mention from the
|
|
849
|
+
# delta clause is within window of 0.383 but
|
|
850
|
+
# AUROC is the metric semantically owning it.
|
|
851
|
+
if scope == "narrative":
|
|
852
|
+
metric_match = _nearest_canonical_key(
|
|
853
|
+
metric_positions, val_start_in_full, max_distance_chars
|
|
854
|
+
)
|
|
855
|
+
if metric_match is not None and metric_match[0] != met_key:
|
|
856
|
+
continue
|
|
857
|
+
|
|
649
858
|
# Slice disambiguation: when the canonical key is
|
|
650
859
|
# slice-scoped (slice != "any"), pair the value
|
|
651
860
|
# with the NEAREST slice mention by the same
|
|
@@ -1019,6 +1228,79 @@ def _has_keyword_in_window(
|
|
|
1019
1228
|
return bool(pattern.search(text, start, end))
|
|
1020
1229
|
|
|
1021
1230
|
|
|
1231
|
+
def _build_postfix_pattern(
|
|
1232
|
+
detector_aliases: Mapping[str, Sequence[str]],
|
|
1233
|
+
detector_keys: Sequence[str],
|
|
1234
|
+
) -> re.Pattern[str] | None:
|
|
1235
|
+
"""Build a regex matching `"for {detector_alias}"` postfix constructs.
|
|
1236
|
+
|
|
1237
|
+
v1.3.0 Pattern A (Layer 3 pairing rule per ADR 0006). Used to
|
|
1238
|
+
re-pair a candidate value with the detector named in a "for X"
|
|
1239
|
+
postfix (e.g., ``"0.291 [...] for TF-IDF + LR"`` binds 0.291 to
|
|
1240
|
+
TF-IDF + LR via the postfix, overriding proximity-based pairing).
|
|
1241
|
+
|
|
1242
|
+
Each alias is paired with its canonical key in a single named-group
|
|
1243
|
+
OR pattern; the capture group reveals which detector matched. The
|
|
1244
|
+
canonical-key-as-fallback ensures the canonical name itself is
|
|
1245
|
+
matched even if no alias regex is provided for that detector.
|
|
1246
|
+
|
|
1247
|
+
Returns None if there are no detectors to build patterns for
|
|
1248
|
+
(empty bindings).
|
|
1249
|
+
"""
|
|
1250
|
+
if not detector_keys:
|
|
1251
|
+
return None
|
|
1252
|
+
alts: list[str] = []
|
|
1253
|
+
for det_key in detector_keys:
|
|
1254
|
+
# Canonical name as a literal alternative + all alias regexes
|
|
1255
|
+
# (which may themselves contain regex syntax like `\+`).
|
|
1256
|
+
parts = [re.escape(det_key)] + list(detector_aliases.get(det_key, ()))
|
|
1257
|
+
# Each detector's parts collapse into a non-capturing group.
|
|
1258
|
+
alts.append("(?:" + "|".join(parts) + ")")
|
|
1259
|
+
# The outer capture group reveals which detector token matched.
|
|
1260
|
+
# The text-order rule means the first alternative wins per Python
|
|
1261
|
+
# re semantics, which is fine for our use case.
|
|
1262
|
+
return re.compile(
|
|
1263
|
+
r"\bfor\s+(?:the\s+)?(" + "|".join(alts) + r")(?=[\s,;.)\]]|$)",
|
|
1264
|
+
re.IGNORECASE,
|
|
1265
|
+
)
|
|
1266
|
+
|
|
1267
|
+
|
|
1268
|
+
def _build_possessive_pattern(
|
|
1269
|
+
detector_aliases: Mapping[str, Sequence[str]],
|
|
1270
|
+
detector_keys: Sequence[str],
|
|
1271
|
+
) -> re.Pattern[str] | None:
|
|
1272
|
+
"""Build a regex matching `"{detector_alias}'s"` possessive markers.
|
|
1273
|
+
|
|
1274
|
+
v1.3.0 Pattern B (Layer 3 pairing rule per ADR 0006). The
|
|
1275
|
+
possessive ``'s`` construction is a strong binding signal that
|
|
1276
|
+
isn't captured by detector-alias regex matching directly (alias
|
|
1277
|
+
patterns don't typically include the apostrophe). Re-pairs the
|
|
1278
|
+
candidate value with the possessor detector.
|
|
1279
|
+
|
|
1280
|
+
The pattern matches JUST the possessive marker (``{alias}'s``);
|
|
1281
|
+
binding-claim proximity is enforced at the call site (the
|
|
1282
|
+
inner loop's Pattern B block requires the LAST possessive
|
|
1283
|
+
in the pre-window to END within 30 chars of the value, which
|
|
1284
|
+
covers both `"frozen probe's 0.515"` (immediate) and
|
|
1285
|
+
`"LoRA's pooled OOD AUROC is 0.383"` (5-token clause).
|
|
1286
|
+
|
|
1287
|
+
Returns None if there are no detectors (empty bindings).
|
|
1288
|
+
"""
|
|
1289
|
+
if not detector_keys:
|
|
1290
|
+
return None
|
|
1291
|
+
alts: list[str] = []
|
|
1292
|
+
for det_key in detector_keys:
|
|
1293
|
+
parts = [re.escape(det_key)] + list(detector_aliases.get(det_key, ()))
|
|
1294
|
+
alts.append("(?:" + "|".join(parts) + ")")
|
|
1295
|
+
# Match `{alias}'s` (ASCII apostrophe or typographic ’s). Tight
|
|
1296
|
+
# — proximity to the value is enforced at the call site via
|
|
1297
|
+
# `match.end()` against the value position.
|
|
1298
|
+
return re.compile(
|
|
1299
|
+
r"(" + "|".join(alts) + r")[’']s\b",
|
|
1300
|
+
re.IGNORECASE,
|
|
1301
|
+
)
|
|
1302
|
+
|
|
1303
|
+
|
|
1022
1304
|
def _build_pattern(
|
|
1023
1305
|
canonical: str,
|
|
1024
1306
|
aliases: Sequence[str],
|
|
@@ -1429,7 +1429,7 @@
|
|
|
1429
1429
|
"doc_first_line": "str(object='') -> str",
|
|
1430
1430
|
"kind": "value",
|
|
1431
1431
|
"type": "str",
|
|
1432
|
-
"value": "'1.
|
|
1432
|
+
"value": "'1.3.0'"
|
|
1433
1433
|
},
|
|
1434
1434
|
"apply_operating_points": {
|
|
1435
1435
|
"doc_first_line": "Apply fitted thresholds to a mixed-class or single-class target slice.",
|