eval-toolkit 1.3.0__tar.gz → 1.5.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/CHANGELOG.md +139 -0
- {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/PKG-INFO +4 -1
- {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/docs/source/adr/README.md +1 -0
- {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/pyproject.toml +8 -0
- eval_toolkit-1.5.0/src/eval_toolkit/_narrative.py +425 -0
- {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/src/eval_toolkit/_version.py +1 -1
- {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/src/eval_toolkit/audit_citation_alignment.py +188 -8
- {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/src/eval_toolkit/audit_value_bindings.py +23 -383
- eval_toolkit-1.5.0/src/eval_toolkit/eda/__init__.py +80 -0
- eval_toolkit-1.5.0/src/eval_toolkit/eda/data_audit.py +785 -0
- eval_toolkit-1.5.0/src/eval_toolkit/eda/obfuscation.py +622 -0
- {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/src/eval_toolkit/loaders.py +46 -8
- {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/golden/public_api/snapshot.json +3 -3
- eval_toolkit-1.5.0/tests/test_audit_citation_alignment.py +458 -0
- {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_audit_value_bindings.py +6 -2
- eval_toolkit-1.5.0/tests/test_eda.py +330 -0
- eval_toolkit-1.5.0/tests/test_eda_obfuscation.py +448 -0
- {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_loaders.py +107 -0
- eval_toolkit-1.3.0/tests/test_audit_citation_alignment.py +0 -242
- {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/.gitignore +0 -0
- {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/LICENSE +0 -0
- {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/README.md +0 -0
- {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/STYLE.md +0 -0
- {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/docs/archive/README.md +0 -0
- {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/docs/research/README.md +0 -0
- {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/docs/research/datasets/README.md +0 -0
- {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/docs/research/papers/data-integrity/README.md +0 -0
- {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/docs/research/papers/eval-ecosystem/README.md +0 -0
- {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/docs/research/papers/inference/README.md +0 -0
- {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/docs/research/papers/prompt-injection/README.md +0 -0
- {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/docs/source/methodology/README.md +0 -0
- {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/src/eval_toolkit/__init__.py +0 -0
- {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/src/eval_toolkit/__main__.py +0 -0
- {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/src/eval_toolkit/_deprecated.py +0 -0
- {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/src/eval_toolkit/_parallel.py +0 -0
- {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/src/eval_toolkit/_rng.py +0 -0
- {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/src/eval_toolkit/_sweep.py +0 -0
- {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/src/eval_toolkit/adversarial.py +0 -0
- {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/src/eval_toolkit/analysis.py +0 -0
- {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/src/eval_toolkit/artifacts.py +0 -0
- {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/src/eval_toolkit/audit_sister_doc_concept_drift.py +0 -0
- {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/src/eval_toolkit/bootstrap.py +0 -0
- {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/src/eval_toolkit/calibration.py +0 -0
- {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/src/eval_toolkit/claims.py +0 -0
- {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/src/eval_toolkit/config.py +0 -0
- {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/src/eval_toolkit/docs.py +0 -0
- {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/src/eval_toolkit/embeddings.py +0 -0
- {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/src/eval_toolkit/evidence.py +0 -0
- {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/src/eval_toolkit/harness.py +0 -0
- {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/src/eval_toolkit/leakage.py +0 -0
- {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/src/eval_toolkit/losses.py +0 -0
- {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/src/eval_toolkit/manifest.py +0 -0
- {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/src/eval_toolkit/metric_specs.py +0 -0
- {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/src/eval_toolkit/metrics.py +0 -0
- {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/src/eval_toolkit/operating_points.py +0 -0
- {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/src/eval_toolkit/paths.py +0 -0
- {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/src/eval_toolkit/plotting.py +0 -0
- {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/src/eval_toolkit/preprocessing.py +0 -0
- {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/src/eval_toolkit/probes.py +0 -0
- {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/src/eval_toolkit/protocols.py +0 -0
- {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/src/eval_toolkit/provenance.py +0 -0
- {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/src/eval_toolkit/py.typed +0 -0
- {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/src/eval_toolkit/schemas/manifest.v1.json +0 -0
- {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/src/eval_toolkit/schemas/manifest.v2.json +0 -0
- {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/src/eval_toolkit/schemas/manifest.v3.json +0 -0
- {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/src/eval_toolkit/schemas/ood_manifest.v1.json +0 -0
- {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/src/eval_toolkit/schemas/results.v1.json +0 -0
- {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/src/eval_toolkit/schemas/results_full.v1.json +0 -0
- {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/src/eval_toolkit/scorecards.py +0 -0
- {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/src/eval_toolkit/seeds.py +0 -0
- {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/src/eval_toolkit/splits.py +0 -0
- {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/src/eval_toolkit/stacking.py +0 -0
- {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/src/eval_toolkit/text_dedup.py +0 -0
- {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/src/eval_toolkit/thresholds.py +0 -0
- {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/baseline/test_plotting_visual/plot_bootstrap_distribution.png +0 -0
- {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/baseline/test_plotting_visual/plot_confusion_matrix_grid.png +0 -0
- {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/baseline/test_plotting_visual/plot_lift_ci.png +0 -0
- {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/baseline/test_plotting_visual/plot_metric_bars.png +0 -0
- {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/baseline/test_plotting_visual/plot_pareto_frontier.png +0 -0
- {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/baseline/test_plotting_visual/plot_pr_curve.png +0 -0
- {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/baseline/test_plotting_visual/plot_reliability_diagram.png +0 -0
- {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/baseline/test_plotting_visual/plot_roc_curve.png +0 -0
- {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/baseline/test_plotting_visual/plot_score_histograms.png +0 -0
- {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/baseline/test_plotting_visual/plot_slice_metric_heatmap.png +0 -0
- {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/benchmarks/__init__.py +0 -0
- {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/benchmarks/test_kernel_benchmarks.py +0 -0
- {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/conftest.py +0 -0
- {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/golden/bootstrap_ci/cases.json +0 -0
- {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/golden/data/dedup_holdout.jsonl +0 -0
- {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/golden/data/dedup_holdout_expected.json +0 -0
- {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/golden/data/dedup_holdout_provenance.md +0 -0
- {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/golden/docs/expected.md +0 -0
- {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/golden/docs/input.md +0 -0
- {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/golden/docs/metrics.json +0 -0
- {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/golden/test_dedup_holdout_calibration.py +0 -0
- {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/strategies.py +0 -0
- {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_adversarial.py +0 -0
- {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_analysis.py +0 -0
- {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_artifacts.py +0 -0
- {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_audit_sister_doc_concept_drift.py +0 -0
- {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_block_bootstrap_on_folds.py +0 -0
- {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_bootstrap_calibration_mc.py +0 -0
- {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_bootstrap_edge_cases.py +0 -0
- {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_bootstrap_golden.py +0 -0
- {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_bootstrap_njobs.py +0 -0
- {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_bootstrap_props.py +0 -0
- {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_bootstrap_research_grounded.py +0 -0
- {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_bootstrap_unit.py +0 -0
- {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_calibration_binary_adapters.py +0 -0
- {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_calibration_bootstrap_chain.py +0 -0
- {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_calibration_determinism.py +0 -0
- {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_calibration_optimization_failures.py +0 -0
- {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_calibration_props.py +0 -0
- {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_calibration_research_grounded.py +0 -0
- {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_calibration_unit.py +0 -0
- {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_claims.py +0 -0
- {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_claims_coverage.py +0 -0
- {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_claims_props.py +0 -0
- {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_cli.py +0 -0
- {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_config.py +0 -0
- {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_coverage_bootstrap.py +0 -0
- {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_coverage_calibration.py +0 -0
- {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_coverage_harness.py +0 -0
- {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_coverage_metrics.py +0 -0
- {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_coverage_plotting.py +0 -0
- {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_croissant_e2e.py +0 -0
- {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_dedup_split_leakage_chain.py +0 -0
- {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_deprecated_scalars_shim.py +0 -0
- {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_deprecations.py +0 -0
- {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_docs_golden.py +0 -0
- {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_docs_props.py +0 -0
- {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_embeddings.py +0 -0
- {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_evidence_validators.py +0 -0
- {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_harness_edge_cases.py +0 -0
- {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_harness_fault_injection.py +0 -0
- {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_harness_folded.py +0 -0
- {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_harness_internals.py +0 -0
- {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_harness_metric_options.py +0 -0
- {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_harness_parallelism.py +0 -0
- {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_harness_smoke.py +0 -0
- {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_import_boundaries.py +0 -0
- {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_is_metric_defined_for_slice.py +0 -0
- {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_lazy_extras_messages.py +0 -0
- {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_leakage.py +0 -0
- {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_leakage_error_paths.py +0 -0
- {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_leakage_props.py +0 -0
- {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_loaders_coverage.py +0 -0
- {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_loaders_props.py +0 -0
- {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_logging.py +0 -0
- {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_losses.py +0 -0
- {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_manifest.py +0 -0
- {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_manifest_contamination_round_trip.py +0 -0
- {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_manifest_props.py +0 -0
- {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_manifest_validation.py +0 -0
- {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_metrics_props.py +0 -0
- {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_metrics_stratified_subsets.py +0 -0
- {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_metrics_unit.py +0 -0
- {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_misc_coverage.py +0 -0
- {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_numeric_edge_cases.py +0 -0
- {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_ood_loader.py +0 -0
- {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_operating_points.py +0 -0
- {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_operating_points_props.py +0 -0
- {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_parallel.py +0 -0
- {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_paths.py +0 -0
- {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_pipeline_e2e.py +0 -0
- {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_plotting_edge.py +0 -0
- {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_plotting_smoke.py +0 -0
- {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_plotting_visual.py +0 -0
- {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_preprocessing.py +0 -0
- {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_probes.py +0 -0
- {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_protocol_conformance.py +0 -0
- {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_provenance.py +0 -0
- {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_public_api.py +0 -0
- {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_recall_at_fpr.py +0 -0
- {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_reference_equivalence.py +0 -0
- {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_reproducibility_integration.py +0 -0
- {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_rng.py +0 -0
- {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_schemas.py +0 -0
- {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_scorecard.py +0 -0
- {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_seeds.py +0 -0
- {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_splits.py +0 -0
- {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_splits_leakage_integration.py +0 -0
- {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_splits_props.py +0 -0
- {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_stacking.py +0 -0
- {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_sweep.py +0 -0
- {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_text_dedup.py +0 -0
- {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_text_dedup_coverage.py +0 -0
- {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_text_dedup_props.py +0 -0
- {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_text_dedup_strategies.py +0 -0
- {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_thresholds.py +0 -0
- {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_thresholds_constant_score.py +0 -0
- {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_thresholds_coverage.py +0 -0
- {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_thresholds_props.py +0 -0
- {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_thresholds_research_grounded.py +0 -0
- {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_tokenization_leakage_check.py +0 -0
- {eval_toolkit-1.3.0 → eval_toolkit-1.5.0}/tests/test_v09_contracts.py +0 -0
|
@@ -5,6 +5,145 @@ All notable changes to this project will be documented in this file.
|
|
|
5
5
|
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
|
|
6
6
|
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
|
7
7
|
|
|
8
|
+
## [1.5.0] — 2026-05-29 — Tier-2 `eda` layer (#83) + schema-aware `HFDatasetsLoader` (#85)
|
|
9
|
+
|
|
10
|
+
Tier-2 / `loaders` ADDITIVE per [ADR 0003](docs/source/adr/0003-stability-contract-and-gate3-methodology.md) — backward-compatible.
|
|
11
|
+
|
|
12
|
+
- **`eda` Job-1 integrity gate (#83):** `audit_dataset` / `DataAudit` / `SplitSummary` + the
|
|
13
|
+
`class_balance` / `no_cross_split_leakage` / `context_window_fit` gates + the §B2 obfuscation
|
|
14
|
+
prevalence module.
|
|
15
|
+
- **schema-aware `HFDatasetsLoader` (#85):** load real-world dataset schemas without column
|
|
16
|
+
guessing — `feature_cols` + `feature_join` (join multiple columns into one feature; NaN-safe),
|
|
17
|
+
`label_map` (remap raw labels → int; fail-fast `ValueError` lists unmapped values), `revision`
|
|
18
|
+
(pin the HF dataset SHA). All new params default to the prior behavior; a missing feature/label
|
|
19
|
+
column raises `KeyError` listing the observed columns.
|
|
20
|
+
|
|
21
|
+
## [1.4.0] — 2026-05-26 — `audit_citation_alignment` Layer 2 + Layer 3 (closes #82); shared `_narrative` helpers (ADR 0007)
|
|
22
|
+
|
|
23
|
+
Tier-1 ADDITIVE per [ADR 0003](docs/source/adr/0003-stability-contract-and-gate3-methodology.md).
|
|
24
|
+
Closes [#82](https://github.com/brandon-behring/eval-toolkit/issues/82)
|
|
25
|
+
— consumer-feedback follow-on after v1.3.0 closed
|
|
26
|
+
`audit_value_bindings`. The consumer's 188 residual warnings on
|
|
27
|
+
`audit_citation_alignment` were the same architectural-class gap
|
|
28
|
+
(missing Layer 2 + Layer 3 context-awareness) that
|
|
29
|
+
`audit_value_bindings` worked through over v1.1.0 → v1.3.0.
|
|
30
|
+
|
|
31
|
+
Introduces [ADR 0007](docs/source/adr/0007-three-layer-architecture-for-audit-validators.md)
|
|
32
|
+
codifying the **three-layer correctness model** (identity + scope +
|
|
33
|
+
pairing) as the canonical architecture for ALL `audit_*` validators
|
|
34
|
+
in the family. ADR 0005 + ADR 0006 were originally validator-
|
|
35
|
+
specific; ADR 0007 generalizes.
|
|
36
|
+
|
|
37
|
+
### Added — `audit_citation_alignment` Layer 2 + Layer 3 (closes #82)
|
|
38
|
+
|
|
39
|
+
- **`scope: Literal["all", "narrative"] = "all"`** kwarg on
|
|
40
|
+
`validate_citations(...)`. Default `"all"` preserves v1.0.1 /
|
|
41
|
+
v1.3.x behavior exactly (Tier-1 ADDITIVE; byte-identical legacy
|
|
42
|
+
semantics).
|
|
43
|
+
- **Pattern β (Layer 2)** — under `scope="narrative"`, citations
|
|
44
|
+
inside markdown table rows, bracketed expressions, and fenced
|
|
45
|
+
code blocks are excluded. Mirrors `audit_value_bindings`'s
|
|
46
|
+
Layer 2 from v1.1.0. Closes ~67 of the consumer's residual
|
|
47
|
+
warnings (SPEC_SHEET.md table rows).
|
|
48
|
+
- **Pattern γ (Layer 3)** — the category-keyword extraction window
|
|
49
|
+
for a citation is bounded by the SENTENCE containing the
|
|
50
|
+
citation, not by a ±N-line window. Uses
|
|
51
|
+
`_sentence_boundary_positions` (paragraph-aware, abbreviation-
|
|
52
|
+
guarded) from `_narrative`. Catches the consumer's dense
|
|
53
|
+
multi-clause sentences where keywords from prior clauses pull
|
|
54
|
+
through.
|
|
55
|
+
- **Pattern α (Layer 3)** — when MULTIPLE ADR citations appear in
|
|
56
|
+
the same sentence (e.g.,
|
|
57
|
+
`"per ADR-025 + ADR-021 + ADR-034 + ADR-045"`), the validator
|
|
58
|
+
switches from first-match-wins category check to multi-category
|
|
59
|
+
set membership. Each ADR's actual category is accepted if it's
|
|
60
|
+
in the SET of categories matched by the sentence's keywords —
|
|
61
|
+
not just the dominant first-match. Catches the dense multi-ADR
|
|
62
|
+
list pattern where each ADR addresses a different topic.
|
|
63
|
+
|
|
64
|
+
### Refactor — Shared `_narrative.py` helpers
|
|
65
|
+
|
|
66
|
+
Per ADR 0007, narrative-prose helpers are extracted to a private flat
|
|
67
|
+
module `src/eval_toolkit/_narrative.py` (consistent with ADR 0001's
|
|
68
|
+
`_rng.py` / `_parallel.py` / `_sweep.py` precedent — flat-module
|
|
69
|
+
compliant, private/underscore-prefixed). Both validators import:
|
|
70
|
+
|
|
71
|
+
- Keyword frozensets: `_DELTA_KEYWORDS`, `_FLOOR_KEYWORDS`,
|
|
72
|
+
`_GROUP_SUBJECT_KEYWORDS`, `_ABBREV_BEFORE_DOT`.
|
|
73
|
+
- Compiled patterns: `_DELTA_PATTERN`, `_FLOOR_PATTERN`,
|
|
74
|
+
`_GROUP_SUBJECT_PATTERN`.
|
|
75
|
+
- Helpers: `_build_exclusion_ranges`, `_is_excluded`,
|
|
76
|
+
`_is_sentence_terminator_dot`, `_sentence_boundary_positions`,
|
|
77
|
+
`_sentence_id_of`, `_crosses_sentence_boundary`,
|
|
78
|
+
`_is_signed_value`, `_has_keyword_in_window`,
|
|
79
|
+
`_compile_keyword_pattern`.
|
|
80
|
+
|
|
81
|
+
`audit_value_bindings.py` updated to import these from `_narrative`
|
|
82
|
+
instead of defining inline. **Signature-preserving refactor**: all
|
|
83
|
+
43 existing `audit_value_bindings` tests pass UNCHANGED. The
|
|
84
|
+
private helpers are non-public, so no Tier-1 STRICT impact.
|
|
85
|
+
|
|
86
|
+
### Dogfood result
|
|
87
|
+
|
|
88
|
+
| Configuration | Warnings on `prompt-injection-detection-submission` HEAD | Reduction |
|
|
89
|
+
|---|---|---|
|
|
90
|
+
| v1.3.0 (`audit_citation_alignment` with scope='all') | 188 | — (baseline) |
|
|
91
|
+
| **v1.4.0 (`scope='narrative'`)** | **37** | **80%** |
|
|
92
|
+
|
|
93
|
+
Verified locally via `.scratch/dogfood_v1_4_0_citation.py`
|
|
94
|
+
(monkey-patched consumer call with `scope="narrative"`).
|
|
95
|
+
|
|
96
|
+
The residual 37 are a mix of:
|
|
97
|
+
- **Real misalignments** the consumer should triage (e.g., `ADR-025`
|
|
98
|
+
cited for a threshold claim when ADR-025 is the cost ADR — could
|
|
99
|
+
be a wrong-ADR bug or a multi-topic ADR not captured by the
|
|
100
|
+
consumer's category-keyword map).
|
|
101
|
+
- **Single-topic sentences** where the first-match category
|
|
102
|
+
inferred from the sentence genuinely differs from the ADR's
|
|
103
|
+
actual category. The multi-topic Pattern α fallback only fires
|
|
104
|
+
when ≥2 categories match; single-topic prose stays on the
|
|
105
|
+
legacy first-match check.
|
|
106
|
+
- **Edge cases** requiring parser-level understanding of how an
|
|
107
|
+
ADR's scope intersects with a multi-clause sentence's topics.
|
|
108
|
+
|
|
109
|
+
The original #82 acceptance criterion was ≤20 warnings (the
|
|
110
|
+
filer's estimate of "genuinely ambiguous citations"). v1.4.0
|
|
111
|
+
hits 37 — above the target but a 5× reduction overall. The
|
|
112
|
+
remaining gap requires either (a) consumer-side expansion of
|
|
113
|
+
`CATEGORY_KEYWORDS` to capture multi-topic ADRs, (b) consumer
|
|
114
|
+
prose adjustments for the real misalignments, or (c) future
|
|
115
|
+
v1.4.x refinements to the validator's heuristic. Consumer
|
|
116
|
+
HARD-gate promotion remains a judgment call — the residual 37
|
|
117
|
+
includes some real misalignments worth fixing.
|
|
118
|
+
|
|
119
|
+
### Consumer adoption path
|
|
120
|
+
|
|
121
|
+
Consumer (`prompt-injection-detection-submission`):
|
|
122
|
+
1. Re-pin `eval-toolkit>=1.4.0,<2`.
|
|
123
|
+
2. Add `scope="narrative"` to their `validate_citations(...)` call
|
|
124
|
+
in `scripts/audit_citation_alignment.py`.
|
|
125
|
+
3. **Bundled HARD-gate promotion** of BOTH `audit_value_bindings`
|
|
126
|
+
AND `audit_citation_alignment` is now credible per the v1.3.8
|
|
127
|
+
plan. Promotes from SOFT to HARD in their next v1.3.X release.
|
|
128
|
+
|
|
129
|
+
### Tests
|
|
130
|
+
|
|
131
|
+
61 across the audit-validator suite (43 audit_value_bindings + 18
|
|
132
|
+
audit_citation_alignment; 6 new for v1.4.0 — Pattern α / β / γ /
|
|
133
|
+
scope='all' backward-compat / shared-helpers / combined dogfood).
|
|
134
|
+
All pass. Public API snapshot regenerated for `__version__` bump
|
|
135
|
+
+ `validate_citations` signature with new `scope` kwarg.
|
|
136
|
+
|
|
137
|
+
### Out of scope (deferred)
|
|
138
|
+
|
|
139
|
+
- **`audit_sister_doc_concept_drift` Layer 2 / Layer 3** — embedding-
|
|
140
|
+
based validator (v1.0.4); different false-positive surface. Add
|
|
141
|
+
layers only if consumer demand emerges.
|
|
142
|
+
- **Public helper promotion** (`eval_toolkit.audit_narrative`) —
|
|
143
|
+
YAGNI per ADR 0007 §A2.
|
|
144
|
+
- **Configurable category-keyword-window extension kwargs** —
|
|
145
|
+
YAGNI; add in v1.4.x patch if demand emerges.
|
|
146
|
+
|
|
8
147
|
## [1.3.0] — 2026-05-26 — `audit_value_bindings` cross-detector list-grammar pairing rules (closes #81)
|
|
9
148
|
|
|
10
149
|
Tier-1 ADDITIVE per [ADR 0003](docs/source/adr/0003-stability-contract-and-gate3-methodology.md).
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: eval-toolkit
|
|
3
|
-
Version: 1.3.0
|
|
3
|
+
Version: 1.5.0
|
|
4
4
|
Summary: Reusable evaluation contracts for binary classification: metrics, bootstrap CIs, calibration, artifacts, and evidence gates.
|
|
5
5
|
Project-URL: Homepage, https://github.com/brandon-behring/eval-toolkit
|
|
6
6
|
Project-URL: Documentation, https://brandon-behring.github.io/eval-toolkit/
|
|
@@ -60,6 +60,9 @@ Requires-Dist: sphinx-autodoc-typehints>=2.0; extra == 'docs'
|
|
|
60
60
|
Requires-Dist: sphinx-copybutton>=0.5; extra == 'docs'
|
|
61
61
|
Requires-Dist: sphinx-design>=0.6; extra == 'docs'
|
|
62
62
|
Requires-Dist: sphinx>=7.3; extra == 'docs'
|
|
63
|
+
Provides-Extra: eda
|
|
64
|
+
Requires-Dist: matplotlib>=3.8; extra == 'eda'
|
|
65
|
+
Requires-Dist: pandas>=2.0; extra == 'eda'
|
|
63
66
|
Provides-Extra: embeddings
|
|
64
67
|
Requires-Dist: sentence-transformers>=3.0; extra == 'embeddings'
|
|
65
68
|
Provides-Extra: losses
|
|
@@ -79,3 +79,4 @@ What would have to change for this decision to be reopened?
|
|
|
79
79
|
| [0004](0004-naming-conventions.md) | Naming conventions for modules, classes, and parameters | Accepted | 2026-05-23 |
|
|
80
80
|
| [0005](0005-structured-keys-for-audit-validators.md) | Structured keys over positional tuples for canonical-identity types in audit validators | Accepted | 2026-05-26 |
|
|
81
81
|
| [0006](0006-pairing-rules-for-cross-detector-list-grammar.md) | Pairing rules for cross-detector list-grammar in audit validators | Accepted | 2026-05-26 |
|
|
82
|
+
| [0007](0007-three-layer-architecture-for-audit-validators.md) | Three-layer architecture for audit validators (family-wide) | Accepted | 2026-05-26 |
|
|
@@ -74,6 +74,14 @@ probes = ["torch>=2.0", "transformers>=4.40"]
|
|
|
74
74
|
# (granular extras — losses callers should not have to install the larger
|
|
75
75
|
# transformers stack). Shares the torch version pin with [probes].
|
|
76
76
|
losses = ["torch>=2.0"]
|
|
77
|
+
# v1.5.0 (feat/eda-data-audit): eval_toolkit.eda Job-1 integrity-gate layer.
|
|
78
|
+
# Tier-2 surface (ADR 0003) — torch-free by design. pandas powers the
|
|
79
|
+
# DataFrameLoader reuse path; matplotlib is reserved for the EDA layer's
|
|
80
|
+
# future profiling plots. Intentionally NO sentence-transformers / torch:
|
|
81
|
+
# the near-dup / cross-split checks use the lexical TfidfCosineStrategy and
|
|
82
|
+
# token-length quantiles take a caller-supplied tokenizer (no transformers
|
|
83
|
+
# import in this module). NOT folded into [all] / [dev] — opt-in only.
|
|
84
|
+
eda = ["pandas>=2.0", "matplotlib>=3.8"]
|
|
77
85
|
# NO-OP extra kept for backward compatibility (R3 at v0.49.0).
|
|
78
86
|
#
|
|
79
87
|
# jsonschema>=4.21 moved to base deps at v0.16.0; this extra has been a
|
|
@@ -0,0 +1,425 @@
|
|
|
1
|
+
"""Shared narrative-prose helpers for the `audit_*` validator family.
|
|
2
|
+
|
|
3
|
+
This private flat module hosts the Layer 2 (scope) + Layer 3 (pairing-
|
|
4
|
+
rule) building blocks that emerged from the v1.1.0 → v1.3.0 cycle of
|
|
5
|
+
``audit_value_bindings`` and are reused by ``audit_citation_alignment``
|
|
6
|
+
at v1.4.0+. Per ADR 0007, the three-layer correctness model (identity
|
|
7
|
+
+ scope + pairing) applies family-wide; this module is the canonical
|
|
8
|
+
home for the prose-pattern primitives that the scope + pairing layers
|
|
9
|
+
build on.
|
|
10
|
+
|
|
11
|
+
Design notes:
|
|
12
|
+
|
|
13
|
+
- **Private flat module** (underscore-prefixed name): matches ADR 0001's
|
|
14
|
+
`_rng.py` / `_parallel.py` / `_sweep.py` precedent. Not in the
|
|
15
|
+
package's public ``_EXPORTS`` resolver; consumers import via
|
|
16
|
+
``eval_toolkit.audit_*`` modules, which in turn import from here.
|
|
17
|
+
- **Helpers preserve their exact signatures from audit_value_bindings.py**
|
|
18
|
+
(v1.1.0–v1.3.0 vintage) — extraction is a signature-preserving
|
|
19
|
+
refactor. All 43 existing audit_value_bindings tests continue to
|
|
20
|
+
pass unchanged.
|
|
21
|
+
- **Keyword frozensets are audit_value_bindings-specific** (delta /
|
|
22
|
+
floor / group-subject keywords are about value-binding prose, not
|
|
23
|
+
citation prose). Other validators that need similar lists define
|
|
24
|
+
their own constants. The SHARED parts are the regex-compilation
|
|
25
|
+
utility and the structural helpers (exclusion ranges, sentence
|
|
26
|
+
boundaries) that are validator-agnostic.
|
|
27
|
+
|
|
28
|
+
Cross-references:
|
|
29
|
+
- ADR 0005 — identity layer (BindingKey)
|
|
30
|
+
- ADR 0006 — pairing layer (audit_value_bindings-specific)
|
|
31
|
+
- ADR 0007 — three-layer architecture, family-wide
|
|
32
|
+
"""
|
|
33
|
+
|
|
34
|
+
from __future__ import annotations
|
|
35
|
+
|
|
36
|
+
import bisect
|
|
37
|
+
import re
|
|
38
|
+
from collections.abc import Sequence
|
|
39
|
+
|
|
40
|
+
# ---------------------------------------------------------------------------
|
|
41
|
+
# Module-level keyword sets and compiled patterns.
|
|
42
|
+
# Specific to audit_value_bindings' Layer 2 filters (T1/T2/C); kept here
|
|
43
|
+
# so the validator can `from eval_toolkit._narrative import ...` rather
|
|
44
|
+
# than maintaining two copies. Other audit_* validators define their
|
|
45
|
+
# own keyword sets analogously.
|
|
46
|
+
# ---------------------------------------------------------------------------
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
# _DELTA_KEYWORDS: case-insensitive whole-token markers indicating a
|
|
50
|
+
# value is a paired-delta or comparative magnitude, not a binding claim.
|
|
51
|
+
# T1 filter suppresses candidate values when any of these appears within
|
|
52
|
+
# ±30 chars of the value position (under scope="narrative").
|
|
53
|
+
_DELTA_KEYWORDS: frozenset[str] = frozenset(
    (
        # Nouns and verbs that name a difference outright in consumer prose.
        "delta",
        "drop",
        "drops",
        "lift",
        "lifts",
        "gap",
        "margin",
        # Verbs that compare one result against another, i.e. the number
        # next to them is a relative magnitude.
        "regresses",
        "improves",
        "beats",
        "exceeds",
        "trails",
        "underperforms",
        # "vs" / "versus" are deliberately part of the set: they are the
        # usual separator in prose such as "AUPRC 0.556 vs 0.519". With a
        # before-only window, "X vs Y" fires on Y (which is preceded by
        # "vs") and not on X. The same-sentence duplicate-binding case is
        # flagged separately by T3.
        "vs",
        "versus",
        # Directional preposition. Under the before-only window,
        # "drops -0.071 below" suppresses -0.071 (the sign check catches
        # it too), whereas "0.515 (delta -0.132)" leaves 0.515 alone
        # because "delta" comes AFTER 0.515.
        # Not included on purpose: "against", "above", "ahead", "behind".
        # They are common comparison prepositions that also show up in
        # legitimate binding claims, so they are too ambiguous.
        "below",
    )
)
|
|
87
|
+
|
|
88
|
+
# _FLOOR_KEYWORDS: markers indicating a value is a random-baseline or
|
|
89
|
+
# floor reference, not a detector binding. T2 filter suppresses
|
|
90
|
+
# candidate values when any of these appears within −50 / +5 chars
|
|
91
|
+
# (asymmetric: floor mentions canonically precede the value, e.g.,
|
|
92
|
+
# "random AUPRC is 0.374").
|
|
93
|
+
#
|
|
94
|
+
# Intentionally narrow: "baseline", "prior", "majority" are EXCLUDED
|
|
95
|
+
# because they have legitimate non-floor senses ("TF-IDF baseline",
|
|
96
|
+
# "prior work", "majority of detectors"). The consumer's prose
|
|
97
|
+
# patterns with these words ("below the prevalence baseline of 0.374")
|
|
98
|
+
# are caught by T1 via "below" instead ("above" is deliberately not in _DELTA_KEYWORDS) — the comparative
|
|
99
|
+
# preposition is the reliable signal, not the noun.
|
|
100
|
+
# Words that mark a value as a random-baseline / floor reference.
_FLOOR_KEYWORDS: frozenset[str] = frozenset(("random", "floor", "chance", "trivial"))
|
|
108
|
+
|
|
109
|
+
# _ABBREV_BEFORE_DOT: tokens that should NOT trigger a sentence
|
|
110
|
+
# boundary when followed by `.`. The multi-letter pattern (e.g., i.e.,
|
|
111
|
+
# c.f.) is handled separately via letter-dot-letter detection.
|
|
112
|
+
# Lower-case tokens; a "." directly after one of them is not a sentence end.
_ABBREV_BEFORE_DOT: frozenset[str] = frozenset(
    ("vs", "etc", "cf", "fig", "eq", "pp", "viz", "ca")
)
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
def _compile_keyword_pattern(keywords: frozenset[str]) -> re.Pattern[str]:
    r"""Compile a case-insensitive, whole-word regex matching any of ``keywords``.

    Every keyword is passed through :func:`re.escape` and the alternatives
    are sorted, so the pattern text does not depend on set iteration order.
    The alternation is wrapped in ``\b...\b`` so only whole tokens match.

    Args:
        keywords: The literal keywords to match.

    Returns:
        A compiled pattern with ``re.IGNORECASE`` set. For an empty
        ``keywords`` set the pattern never matches anything.
    """
    if not keywords:
        # Joining zero alternatives would give r"\b(?:)\b", which matches the
        # empty string at every word boundary and so "finds a keyword" in
        # almost any text. An empty negative lookahead can never succeed.
        return re.compile(r"(?!)", re.IGNORECASE)
    alternatives = "|".join(sorted(re.escape(kw) for kw in keywords))
    return re.compile(rf"\b(?:{alternatives})\b", re.IGNORECASE)
|
|
130
|
+
|
|
131
|
+
|
|
132
|
+
# Compiled once at import time from the keyword sets defined earlier in this
# module, so window checks can reuse them without recompiling.
_DELTA_PATTERN: re.Pattern[str] = _compile_keyword_pattern(_DELTA_KEYWORDS)
_FLOOR_PATTERN: re.Pattern[str] = _compile_keyword_pattern(_FLOOR_KEYWORDS)
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
# Group-subject adjectives that introduce a multi-detector statement.
|
|
137
|
+
# When prose says "for the trained detectors", the following value
|
|
138
|
+
# refers to a GROUP (LoRA + TF-IDF + ... whatever bindings exist),
|
|
139
|
+
# not a single canonical detector. The validator can't infer which
|
|
140
|
+
# specific detectors own the group value with positional heuristics,
|
|
141
|
+
# so v1.3.0 suppresses the candidate rather than attempting multi-
|
|
142
|
+
# detector inference (a v1.4.0+ candidate per ADR 0006).
|
|
143
|
+
_GROUP_SUBJECT_KEYWORDS: frozenset[str] = frozenset(
    ("trained", "frozen", "baseline", "all", "both", "other")
)

# Detector-independent group-subject regex, built once at import time.
# Shape: "for", an optional "the", exactly one of the adjectives above, then
# "detector" or "detectors". Matching is case-insensitive.
_GROUP_SUBJECT_PATTERN: re.Pattern[str] = re.compile(
    r"\bfor\s+(?:the\s+)?(?:{})\s+detectors?\b".format(
        "|".join(sorted(map(re.escape, _GROUP_SUBJECT_KEYWORDS)))
    ),
    re.IGNORECASE,
)
|
|
163
|
+
|
|
164
|
+
|
|
165
|
+
# ---------------------------------------------------------------------------
|
|
166
|
+
# Layer 2: content-type filtering helpers (`scope="narrative"`).
|
|
167
|
+
# Used by audit_value_bindings (v1.1.0+) and audit_citation_alignment
|
|
168
|
+
# (v1.4.0+) to exclude markdown table rows, bracketed expressions, and
|
|
169
|
+
# fenced code blocks from candidate-value / candidate-citation matching.
|
|
170
|
+
# ---------------------------------------------------------------------------
|
|
171
|
+
|
|
172
|
+
|
|
173
|
+
def _build_exclusion_ranges(
    text: str,
    line_starts: Sequence[int],
) -> list[tuple[int, int]]:
    """Collect the character spans that ``scope="narrative"`` must ignore.

    Three kinds of content are excluded:

    - **Fenced code blocks**: everything from an opening line whose
      left-stripped text starts with three backticks through the matching
      closing fence line (both fence lines included). A fence that is never
      closed excludes the rest of ``text``.
    - **Markdown table rows**: any line outside a fence whose left-stripped
      text starts with ``|``; the whole line is excluded.
    - **Bracketed expressions** ``[...]``: on every remaining line, each
      ``[`` paired with the first ``]`` after it (no nesting support, no
      pairing across lines).

    Args:
        text: The full document.
        line_starts: Offset in ``text`` of the first character of each line,
            in ascending order.

    Returns:
        ``(start, end)`` half-open character intervals, sorted ascending,
        suitable for :func:`_is_excluded`.
    """
    ranges: list[tuple[int, int]] = []
    text_len = len(text)
    # Offset of the currently open fence line, or None when outside a fence.
    fence_open_at: int | None = None
    # A line ends where the next one starts; the last line ends at EOF.
    line_ends = [*line_starts[1:], text_len]

    for begin, finish in zip(line_starts, line_ends):
        body = text[begin:finish]
        content = body.lstrip()

        if content.startswith("```"):
            if fence_open_at is None:
                fence_open_at = begin
            else:
                # Closing fence: emit one span covering the whole block.
                ranges.append((fence_open_at, finish))
                fence_open_at = None
            continue

        if fence_open_at is not None:
            # Interior of a code block; covered by the span emitted on close.
            continue

        if content.startswith("|"):
            ranges.append((begin, finish))
            continue

        # Bracket pairs: first "]" after each "[" wins.
        opener = body.find("[")
        while opener != -1:
            closer = body.find("]", opener + 1)
            if closer == -1:
                break
            ranges.append((begin + opener, begin + closer + 1))
            opener = body.find("[", closer + 1)

    if fence_open_at is not None:
        # Unterminated fence: treat everything up to EOF as code.
        ranges.append((fence_open_at, text_len))

    ranges.sort()
    return ranges
|
|
248
|
+
|
|
249
|
+
|
|
250
|
+
def _is_excluded(pos: int, excluded: Sequence[tuple[int, int]]) -> bool:
    """Tell whether ``pos`` lies inside one of the ``excluded`` ranges.

    ``excluded`` must be sorted by range start. Only the last range whose
    start is at or before ``pos`` is inspected, using half-open semantics:
    ``(start, end)`` covers ``start <= pos < end``.
    """
    # (pos, +inf) compares greater than every range starting at or before
    # ``pos`` and smaller than every range starting after it, so the
    # insertion point is one past the candidate range.
    insert_at = bisect.bisect_right(excluded, (pos, float("inf")))
    if insert_at == 0:
        return False
    range_start, range_end = excluded[insert_at - 1]
    return range_start <= pos < range_end
|
|
270
|
+
|
|
271
|
+
|
|
272
|
+
# ---------------------------------------------------------------------------
|
|
273
|
+
# Sentence-boundary detection (paragraph-aware, abbreviation-guarded).
|
|
274
|
+
# Used by v1.2.0 T3/T4 in audit_value_bindings and the v1.4.0 Layer 3
|
|
275
|
+
# rule γ in audit_citation_alignment (sentence-boundary respect for
|
|
276
|
+
# category-keyword window).
|
|
277
|
+
# ---------------------------------------------------------------------------
|
|
278
|
+
|
|
279
|
+
|
|
280
|
+
def _is_sentence_terminator_dot(text: str, dot_pos: int) -> bool:
    """Decide whether the ``.`` at ``dot_pos`` ends a sentence.

    The dot is NOT a terminator in any of these situations:

    - It sits between two digits (``0.5``, ``§5.2``).
    - It is either dot of a letter-dot-letter-dot run (``e.g.``, ``i.e.``).
    - The run of letters immediately before it, lower-cased, is listed in
      ``_ABBREV_BEFORE_DOT`` (``vs.``, ``etc.``, ``fig.``, ...).

    In every other case the dot counts as a sentence terminator.
    """
    size = len(text)
    before = text[dot_pos - 1] if dot_pos > 0 else ""
    after = text[dot_pos + 1] if dot_pos + 1 < size else ""

    # Decimal point.
    if before.isdigit() and after.isdigit():
        return False

    # This dot closes "x.y." (it is the second dot) ...
    closes_dotted_pair = (
        dot_pos >= 3
        and before.isalpha()
        and text[dot_pos - 2] == "."
        and text[dot_pos - 3].isalpha()
    )
    # ... or opens it (it is the first dot).
    opens_dotted_pair = dot_pos + 2 < size and after.isalpha() and text[dot_pos + 2] == "."
    if closes_dotted_pair or opens_dotted_pair:
        return False

    # Walk left over the alphabetic token that ends at the dot.
    token_start = dot_pos
    while token_start > 0 and text[token_start - 1].isalpha():
        token_start -= 1
    token = text[token_start:dot_pos].lower()
    return token not in _ABBREV_BEFORE_DOT
|
|
314
|
+
|
|
315
|
+
|
|
316
|
+
def _sentence_boundary_positions(text: str) -> list[int]:
    """List the character offsets at which sentences START, in ascending order.

    Offset 0 is always the first entry. A further entry is added after each
    hard break, at the first non-whitespace character that follows it
    (nothing is added when only whitespace or the end of the text follows).

    Hard breaks:

    - ``!`` and ``?``, unconditionally.
    - ``.`` when :func:`_is_sentence_terminator_dot` returns True for it.
    - A paragraph break, i.e. two consecutive newline characters.

    Not breaks: a single newline, ``;`` and ``:``.
    """
    starts = [0]
    length = len(text)
    cursor = 0
    while cursor < length:
        char = text[cursor]
        if char == "\n" and text[cursor + 1 : cursor + 2] == "\n":
            consumed = 2  # paragraph break spans both newlines
        elif char in "!?" or (char == "." and _is_sentence_terminator_dot(text, cursor)):
            consumed = 1
        else:
            cursor += 1
            continue

        # Skip the break itself and any whitespace that trails it.
        upcoming = cursor + consumed
        while upcoming < length and text[upcoming].isspace():
            upcoming += 1
        # Record only real, strictly increasing starts inside the text.
        if starts[-1] < upcoming < length:
            starts.append(upcoming)
        cursor = upcoming
    return starts
|
|
359
|
+
|
|
360
|
+
|
|
361
|
+
def _sentence_id_of(pos: int, sentence_positions: Sequence[int]) -> int:
    """Map ``pos`` to the zero-based index of the sentence it falls in.

    ``sentence_positions`` holds sorted sentence-start offsets. The result
    is the index of the last start at or before ``pos``; positions ahead of
    the first start, and an empty ``sentence_positions``, both yield ``0``.
    """
    if sentence_positions:
        starts_at_or_before = bisect.bisect_right(sentence_positions, pos)
        return starts_at_or_before - 1 if starts_at_or_before > 0 else 0
    return 0
|
|
371
|
+
|
|
372
|
+
|
|
373
|
+
def _crosses_sentence_boundary(pos_a: int, pos_b: int, sentence_positions: Sequence[int]) -> bool:
    """Report whether a sentence starts after the smaller position, up to the larger.

    With ``lo = min(pos_a, pos_b)`` and ``hi = max(pos_a, pos_b)``, the result
    is True when some entry ``s`` of ``sentence_positions`` satisfies
    ``lo < s <= hi``. The argument order does not matter, and an empty
    ``sentence_positions`` always gives False.

    ``sentence_positions`` is expected to be the sorted list of sentence
    starts produced by :func:`_sentence_boundary_positions`.
    """
    if not sentence_positions:
        return False
    left, right = sorted((pos_a, pos_b))
    # Index of the first sentence start strictly after ``left``.
    next_start_idx = bisect.bisect_right(sentence_positions, left)
    if next_start_idx == len(sentence_positions):
        return False
    return sentence_positions[next_start_idx] <= right
|
|
389
|
+
|
|
390
|
+
|
|
391
|
+
# ---------------------------------------------------------------------------
|
|
392
|
+
# Value-context helpers (used by audit_value_bindings T1/T2; kept here
|
|
393
|
+
# for any future audit_* validator that wants the same primitives).
|
|
394
|
+
# ---------------------------------------------------------------------------
|
|
395
|
+
|
|
396
|
+
|
|
397
|
+
def _is_signed_value(text: str, val_start: int) -> bool:
    """Return True if the value at ``val_start`` is directly preceded by a sign.

    A sign marks the number as a paired delta or comparative magnitude
    (e.g. ``-0.071`` AUPRC delta) rather than a binding claim.

    Recognised signs are ASCII ``+`` and ``-`` plus the typographic minus
    sign U+2212, which rendered or word-processed prose commonly uses in
    place of the ASCII hyphen.

    Args:
        text: The full document.
        val_start: Offset in ``text`` of the first character of the value.

    Returns:
        False when ``val_start`` is 0 (nothing precedes the value) or lies
        outside ``text``; otherwise whether the preceding character is a sign.
    """
    # Guard both ends so an out-of-range offset is "unsigned", not an error.
    if not 0 < val_start <= len(text):
        return False
    return text[val_start - 1] in ("+", "-", "\u2212")
|
|
405
|
+
|
|
406
|
+
|
|
407
|
+
def _has_keyword_in_window(
    text: str,
    val_start: int,
    pattern: re.Pattern[str],
    before_chars: int,
    after_chars: int,
) -> bool:
    r"""Return True if ``pattern`` matches entirely inside the window around ``val_start``.

    The window is ``text[val_start - before_chars : val_start + after_chars]``,
    clamped to the bounds of ``text``. The two sizes are independent, so the
    window may be asymmetric (for example, wide before the value and narrow
    after it).

    A match only counts when it ends at or before the window's end AND is
    still a match when the real following character is visible. Passing the
    window end as ``endpos`` would make the regex engine treat it as the end
    of the string, so a trailing ``\b`` would succeed in the middle of a word
    (a window ending right after "below" in "belowground" would wrongly
    match). The window start needs no such care: with ``pos`` the engine
    still sees the characters before it.

    Args:
        text: The full document.
        val_start: Offset of the candidate value in ``text``.
        pattern: Compiled pattern to look for; intended for whole-word
            keyword patterns such as those built by
            ``_compile_keyword_pattern``.
        before_chars: How many characters before ``val_start`` to include.
        after_chars: How many characters from ``val_start`` onward to include.
    """
    total = len(text)
    start = max(0, val_start - before_chars)
    end = max(0, min(total, val_start + after_chars))
    # Expose one extra character of lookahead so word-boundary assertions at
    # the window edge are evaluated against the real text, then discard any
    # match that spills past the window.
    probe_end = min(total, end + 1)
    return any(match.end() <= end for match in pattern.finditer(text, start, probe_end))
|