eval-toolkit 1.0.0__tar.gz → 1.0.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/CHANGELOG.md +86 -0
- {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/PKG-INFO +3 -2
- {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/README.md +2 -1
- {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/src/eval_toolkit/__init__.py +8 -0
- {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/src/eval_toolkit/_version.py +1 -1
- eval_toolkit-1.0.2/src/eval_toolkit/audit_citation_alignment.py +301 -0
- {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/src/eval_toolkit/metrics.py +38 -0
- {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/golden/public_api/snapshot.json +31 -1
- eval_toolkit-1.0.2/tests/test_audit_citation_alignment.py +242 -0
- {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_harness_folded.py +23 -0
- {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/.gitignore +0 -0
- {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/LICENSE +0 -0
- {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/STYLE.md +0 -0
- {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/docs/archive/README.md +0 -0
- {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/docs/research/README.md +0 -0
- {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/docs/research/datasets/README.md +0 -0
- {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/docs/research/papers/data-integrity/README.md +0 -0
- {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/docs/research/papers/eval-ecosystem/README.md +0 -0
- {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/docs/research/papers/inference/README.md +0 -0
- {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/docs/research/papers/prompt-injection/README.md +0 -0
- {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/docs/source/adr/README.md +0 -0
- {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/docs/source/methodology/README.md +0 -0
- {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/pyproject.toml +0 -0
- {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/src/eval_toolkit/__main__.py +0 -0
- {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/src/eval_toolkit/_deprecated.py +0 -0
- {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/src/eval_toolkit/_parallel.py +0 -0
- {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/src/eval_toolkit/_rng.py +0 -0
- {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/src/eval_toolkit/_sweep.py +0 -0
- {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/src/eval_toolkit/adversarial.py +0 -0
- {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/src/eval_toolkit/analysis.py +0 -0
- {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/src/eval_toolkit/artifacts.py +0 -0
- {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/src/eval_toolkit/bootstrap.py +0 -0
- {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/src/eval_toolkit/calibration.py +0 -0
- {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/src/eval_toolkit/claims.py +0 -0
- {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/src/eval_toolkit/config.py +0 -0
- {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/src/eval_toolkit/docs.py +0 -0
- {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/src/eval_toolkit/embeddings.py +0 -0
- {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/src/eval_toolkit/evidence.py +0 -0
- {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/src/eval_toolkit/harness.py +0 -0
- {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/src/eval_toolkit/leakage.py +0 -0
- {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/src/eval_toolkit/loaders.py +0 -0
- {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/src/eval_toolkit/losses.py +0 -0
- {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/src/eval_toolkit/manifest.py +0 -0
- {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/src/eval_toolkit/metric_specs.py +0 -0
- {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/src/eval_toolkit/operating_points.py +0 -0
- {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/src/eval_toolkit/paths.py +0 -0
- {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/src/eval_toolkit/plotting.py +0 -0
- {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/src/eval_toolkit/preprocessing.py +0 -0
- {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/src/eval_toolkit/probes.py +0 -0
- {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/src/eval_toolkit/protocols.py +0 -0
- {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/src/eval_toolkit/provenance.py +0 -0
- {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/src/eval_toolkit/py.typed +0 -0
- {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/src/eval_toolkit/schemas/manifest.v1.json +0 -0
- {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/src/eval_toolkit/schemas/manifest.v2.json +0 -0
- {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/src/eval_toolkit/schemas/manifest.v3.json +0 -0
- {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/src/eval_toolkit/schemas/ood_manifest.v1.json +0 -0
- {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/src/eval_toolkit/schemas/results.v1.json +0 -0
- {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/src/eval_toolkit/schemas/results_full.v1.json +0 -0
- {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/src/eval_toolkit/scorecards.py +0 -0
- {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/src/eval_toolkit/seeds.py +0 -0
- {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/src/eval_toolkit/splits.py +0 -0
- {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/src/eval_toolkit/stacking.py +0 -0
- {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/src/eval_toolkit/text_dedup.py +0 -0
- {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/src/eval_toolkit/thresholds.py +0 -0
- {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/baseline/test_plotting_visual/plot_bootstrap_distribution.png +0 -0
- {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/baseline/test_plotting_visual/plot_confusion_matrix_grid.png +0 -0
- {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/baseline/test_plotting_visual/plot_lift_ci.png +0 -0
- {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/baseline/test_plotting_visual/plot_metric_bars.png +0 -0
- {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/baseline/test_plotting_visual/plot_pareto_frontier.png +0 -0
- {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/baseline/test_plotting_visual/plot_pr_curve.png +0 -0
- {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/baseline/test_plotting_visual/plot_reliability_diagram.png +0 -0
- {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/baseline/test_plotting_visual/plot_roc_curve.png +0 -0
- {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/baseline/test_plotting_visual/plot_score_histograms.png +0 -0
- {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/baseline/test_plotting_visual/plot_slice_metric_heatmap.png +0 -0
- {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/benchmarks/__init__.py +0 -0
- {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/benchmarks/test_kernel_benchmarks.py +0 -0
- {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/conftest.py +0 -0
- {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/golden/bootstrap_ci/cases.json +0 -0
- {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/golden/data/dedup_holdout.jsonl +0 -0
- {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/golden/data/dedup_holdout_expected.json +0 -0
- {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/golden/data/dedup_holdout_provenance.md +0 -0
- {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/golden/docs/expected.md +0 -0
- {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/golden/docs/input.md +0 -0
- {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/golden/docs/metrics.json +0 -0
- {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/golden/test_dedup_holdout_calibration.py +0 -0
- {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/strategies.py +0 -0
- {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_adversarial.py +0 -0
- {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_analysis.py +0 -0
- {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_artifacts.py +0 -0
- {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_block_bootstrap_on_folds.py +0 -0
- {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_bootstrap_calibration_mc.py +0 -0
- {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_bootstrap_edge_cases.py +0 -0
- {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_bootstrap_golden.py +0 -0
- {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_bootstrap_njobs.py +0 -0
- {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_bootstrap_props.py +0 -0
- {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_bootstrap_research_grounded.py +0 -0
- {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_bootstrap_unit.py +0 -0
- {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_calibration_binary_adapters.py +0 -0
- {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_calibration_bootstrap_chain.py +0 -0
- {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_calibration_determinism.py +0 -0
- {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_calibration_optimization_failures.py +0 -0
- {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_calibration_props.py +0 -0
- {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_calibration_research_grounded.py +0 -0
- {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_calibration_unit.py +0 -0
- {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_claims.py +0 -0
- {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_claims_coverage.py +0 -0
- {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_claims_props.py +0 -0
- {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_cli.py +0 -0
- {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_config.py +0 -0
- {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_coverage_bootstrap.py +0 -0
- {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_coverage_calibration.py +0 -0
- {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_coverage_harness.py +0 -0
- {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_coverage_metrics.py +0 -0
- {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_coverage_plotting.py +0 -0
- {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_croissant_e2e.py +0 -0
- {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_dedup_split_leakage_chain.py +0 -0
- {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_deprecated_scalars_shim.py +0 -0
- {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_deprecations.py +0 -0
- {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_docs_golden.py +0 -0
- {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_docs_props.py +0 -0
- {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_embeddings.py +0 -0
- {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_evidence_validators.py +0 -0
- {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_harness_edge_cases.py +0 -0
- {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_harness_fault_injection.py +0 -0
- {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_harness_internals.py +0 -0
- {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_harness_metric_options.py +0 -0
- {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_harness_parallelism.py +0 -0
- {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_harness_smoke.py +0 -0
- {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_import_boundaries.py +0 -0
- {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_is_metric_defined_for_slice.py +0 -0
- {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_lazy_extras_messages.py +0 -0
- {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_leakage.py +0 -0
- {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_leakage_error_paths.py +0 -0
- {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_leakage_props.py +0 -0
- {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_loaders.py +0 -0
- {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_loaders_coverage.py +0 -0
- {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_loaders_props.py +0 -0
- {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_logging.py +0 -0
- {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_losses.py +0 -0
- {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_manifest.py +0 -0
- {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_manifest_contamination_round_trip.py +0 -0
- {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_manifest_props.py +0 -0
- {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_manifest_validation.py +0 -0
- {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_metrics_props.py +0 -0
- {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_metrics_stratified_subsets.py +0 -0
- {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_metrics_unit.py +0 -0
- {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_misc_coverage.py +0 -0
- {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_numeric_edge_cases.py +0 -0
- {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_ood_loader.py +0 -0
- {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_operating_points.py +0 -0
- {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_operating_points_props.py +0 -0
- {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_parallel.py +0 -0
- {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_paths.py +0 -0
- {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_pipeline_e2e.py +0 -0
- {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_plotting_edge.py +0 -0
- {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_plotting_smoke.py +0 -0
- {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_plotting_visual.py +0 -0
- {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_preprocessing.py +0 -0
- {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_probes.py +0 -0
- {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_protocol_conformance.py +0 -0
- {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_provenance.py +0 -0
- {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_public_api.py +0 -0
- {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_recall_at_fpr.py +0 -0
- {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_reference_equivalence.py +0 -0
- {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_reproducibility_integration.py +0 -0
- {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_rng.py +0 -0
- {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_schemas.py +0 -0
- {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_scorecard.py +0 -0
- {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_seeds.py +0 -0
- {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_splits.py +0 -0
- {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_splits_leakage_integration.py +0 -0
- {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_splits_props.py +0 -0
- {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_stacking.py +0 -0
- {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_sweep.py +0 -0
- {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_text_dedup.py +0 -0
- {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_text_dedup_coverage.py +0 -0
- {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_text_dedup_props.py +0 -0
- {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_text_dedup_strategies.py +0 -0
- {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_thresholds.py +0 -0
- {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_thresholds_constant_score.py +0 -0
- {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_thresholds_coverage.py +0 -0
- {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_thresholds_props.py +0 -0
- {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_thresholds_research_grounded.py +0 -0
- {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_tokenization_leakage_check.py +0 -0
- {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_v09_contracts.py +0 -0
|
@@ -5,6 +5,92 @@ All notable changes to this project will be documented in this file.
|
|
|
5
5
|
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
|
|
6
6
|
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
|
7
7
|
|
|
8
|
+
## [1.0.2] — 2026-05-26 — #76 cleanup batch closes (RC2 + RC3 + F-metrics-1/3/4)
|
|
9
|
+
|
|
10
|
+
Closes the GH #76 v1.0.1 cleanup tracker. All 6 items shipped across
|
|
11
|
+
v1.0.1 (RC4) and v1.0.2 (this release). All P3, all NON-BREAKING.
|
|
12
|
+
|
|
13
|
+
### Changed (Tier-2 ADDITIVE: contract clarification only)
|
|
14
|
+
|
|
15
|
+
- **RC2** (#76) — `SimilarityStrategy` Protocol promoted from
|
|
16
|
+
"pre-v0.7 internal interface" (prose framing only) to formal
|
|
17
|
+
10th strict Tier-2 Protocol per [ADR 0003](docs/source/adr/0003-stability-contract-and-gate3-methodology.md).
|
|
18
|
+
Aligns prose surfaces (README, extending.md, strict_tier2_protocols.md,
|
|
19
|
+
api/protocols.md, ADR 0004 §D6, roadmap.md) with the contract
|
|
20
|
+
already locked in `tests/golden/public_api/snapshot.json` +
|
|
21
|
+
`src/eval_toolkit/__init__.py:_EXPORTS` since v1.0.0. **No code
|
|
22
|
+
change — documentation-only reconciliation.** Strict-Tier-2 count
|
|
23
|
+
goes 9 → 10 (+ 1 opt-in `Versioned`).
|
|
24
|
+
|
|
25
|
+
### Fixed
|
|
26
|
+
|
|
27
|
+
- **RC3** (#76) — `tests/test_harness_folded.py::test_evaluate_folded_reseed_splitter_varies_partitions`
|
|
28
|
+
test hardening. Previous assertions covered count + key existence
|
|
29
|
+
only; a regression silently reusing the splitter (R8-C1 pre-fix
|
|
30
|
+
behavior) could still pass. v1.0.2 adds row-content comparison:
|
|
31
|
+
replays `reseed_splitter` against the splitter for `seed=1` vs
|
|
32
|
+
`seed=2` and asserts fold-0 test partitions differ via feature-text
|
|
33
|
+
set membership (robust to `_slice_subset`'s `reset_index(drop=True)`
|
|
34
|
+
via stable text-column identifiers).
|
|
35
|
+
|
|
36
|
+
- **F-metrics-1** (#76) — `brier_score` docstring input-domain clarity.
|
|
37
|
+
Added explicit "Input domain" Notes subsection clarifying binary
|
|
38
|
+
labels in `{0, 1}` + calibrated probabilities in `[0, 1]` are
|
|
39
|
+
required; raw logits or unbounded ranking scores pass the finiteness
|
|
40
|
+
check but produce out-of-range MSE that misrepresents calibration
|
|
41
|
+
quality. Includes calibration-applying recipe pointer.
|
|
42
|
+
|
|
43
|
+
- **F-metrics-3** (#76) — `expected_calibration_error` docstring
|
|
44
|
+
uniform-scores note. Added explicit Notes subsection documenting
|
|
45
|
+
that constant `y_score` returns 0.0 (per-bin formula trivially
|
|
46
|
+
satisfied) but is semantically misleading — uninformative scorers
|
|
47
|
+
look "perfectly calibrated" despite zero discriminative power.
|
|
48
|
+
Callers should filter constant inputs before ECE.
|
|
49
|
+
|
|
50
|
+
- **F-metrics-4** (#76) — `brier_score` docstring single-class
|
|
51
|
+
edge-case explicit. Added Notes subsection with closed-form
|
|
52
|
+
expressions for all-zeros (`BS = mean(p²)`) and all-ones
|
|
53
|
+
(`BS = mean((1-p)²)`) cases. Explicit confirmation that
|
|
54
|
+
per-slice degenerate-class evaluation is supported (unlike
|
|
55
|
+
PR-AUC / ROC-AUC).
|
|
56
|
+
|
|
57
|
+
## [1.0.1] — 2026-05-25 — audit_citation_alignment + RC4 docs polish
|
|
58
|
+
|
|
59
|
+
First v1.x patch release. Ships the `audit_citation_alignment` validator
|
|
60
|
+
that's been pre-staged by consumer `prompt-injection-detection-prototype`
|
|
61
|
+
(see #77), plus the smallest #76 cleanup item (RC4).
|
|
62
|
+
|
|
63
|
+
### Added
|
|
64
|
+
|
|
65
|
+
- **`audit_citation_alignment` module** — flat-module per [ADR 0001](docs/source/adr/0001-flat-module-layout.md)
|
|
66
|
+
(stay-flat-through-v1.x; subpackage restructure deferred to v2.0).
|
|
67
|
+
Exports `validate_citations(...)`, `ADRSubject`, `CitationMisalignment`,
|
|
68
|
+
and `extract_adr_subject_category` as Tier 1 STRICT (per
|
|
69
|
+
[ADR 0003](docs/source/adr/0003-stability-contract-and-gate3-methodology.md)).
|
|
70
|
+
Validator catches the bug class where a markdown surface cites "per
|
|
71
|
+
ADR-NNN" but the cited ADR's actual subject doesn't match the
|
|
72
|
+
surrounding claim category — motivated by the V1.3.2 P1-2 finding in
|
|
73
|
+
the consumer `prompt-injection-detection-prototype` audit where
|
|
74
|
+
`docs/REPRODUCIBILITY.md:76` cited ADR-029 (test markers) for a
|
|
75
|
+
tier-lock claim that should have cited ADR-034 (reproducibility tier
|
|
76
|
+
ladder). The mis-citation went undetected by lychee (URL-resolves
|
|
77
|
+
check), consumer's `audit_numbers.py` (numeric values), and consumer's
|
|
78
|
+
`audit_adr_count_claims.py` (count claims). Closes #73.
|
|
79
|
+
- **Pre-tag dogfood**: `validate_citations()` exercised against
|
|
80
|
+
eval-toolkit's own docs (95 files including README + audit_findings +
|
|
81
|
+
methodology + migration guides) — 0 misalignments found. Validator
|
|
82
|
+
proven in production use before consumers adopt.
|
|
83
|
+
|
|
84
|
+
### Fixed
|
|
85
|
+
|
|
86
|
+
- **RC4** (#76 cleanup) — v0.51 documentation count-tally reconciliation
|
|
87
|
+
across `docs/source/audit_findings.md`, `docs/source/migration/v0.51.md`,
|
|
88
|
+
and `CHANGELOG.md` `[0.51.0]` section. Canonical tally now consistent:
|
|
89
|
+
**13 confirmed → fixed in v0.51 / 3 refuted (R8-G2 + R8-G5 + R8-V1+V2
|
|
90
|
+
paired) / 2 deferred (R8-G3, R8-G4) = 18 total**. Prior drift was
|
|
91
|
+
"left 2 undecided" (migration/v0.51) vs "deferred" (CHANGELOG) +
|
|
92
|
+
ambiguity in the audit_findings ship-status section. Closes RC4 of #76.
|
|
93
|
+
|
|
8
94
|
## [1.0.0] — 2026-05-25 — Stability contract activates per ADR 0003
|
|
9
95
|
|
|
10
96
|
v1.0 is a **stability-contract activation**, not a code delta from v0.51.
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: eval-toolkit
|
|
3
|
-
Version: 1.0.0
|
|
3
|
+
Version: 1.0.2
|
|
4
4
|
Summary: Reusable evaluation contracts for binary classification: metrics, bootstrap CIs, calibration, artifacts, and evidence gates.
|
|
5
5
|
Project-URL: Homepage, https://github.com/brandon-behring/eval-toolkit
|
|
6
6
|
Project-URL: Documentation, https://brandon-behring.github.io/eval-toolkit/
|
|
@@ -115,7 +115,8 @@ format changes.
|
|
|
115
115
|
├─ Tier 2 ─ Protocol-based orchestration ────────────────┤
|
|
116
116
|
│ Scorer / SliceAwareScorer / LeakageCheck / Splitter │
|
|
117
117
|
│ ThresholdSelector / DatasetLoader / MetricSpec │
|
|
118
|
-
│ MetaLearner / Probe / TextTransform │
|
|
118
|
+
│ MetaLearner / Probe / TextTransform / │
|
|
119
|
+
│ SimilarityStrategy (10 strict) │
|
|
119
120
|
│ Versioned (opt-in: per-object versions in manifest) │
|
|
120
121
|
├─ Tier 1 ─ Functional core ─────────────────────────────┤
|
|
121
122
|
│ pr_auc / roc_auc / ECE variants / Brier / bootstrap_ci│
|
|
@@ -32,7 +32,8 @@ format changes.
|
|
|
32
32
|
├─ Tier 2 ─ Protocol-based orchestration ────────────────┤
|
|
33
33
|
│ Scorer / SliceAwareScorer / LeakageCheck / Splitter │
|
|
34
34
|
│ ThresholdSelector / DatasetLoader / MetricSpec │
|
|
35
|
-
│ MetaLearner / Probe / TextTransform │
|
|
35
|
+
│ MetaLearner / Probe / TextTransform / │
|
|
36
|
+
│ SimilarityStrategy (10 strict) │
|
|
36
37
|
│ Versioned (opt-in: per-object versions in manifest) │
|
|
37
38
|
├─ Tier 1 ─ Functional core ─────────────────────────────┤
|
|
38
39
|
│ pr_auc / roc_auc / ECE variants / Brier / bootstrap_ci│
|
|
@@ -52,6 +52,14 @@ _EXPORTS: dict[str, str] = {
|
|
|
52
52
|
# CharacterInjectionStrategy + character_injection SimpleNamespace
|
|
53
53
|
# removed at v0.47 (Decision N + plan §4E). TextTransform Protocol +
|
|
54
54
|
# the 12 concrete dataclasses are now the only public path.
|
|
55
|
+
# --- audit_citation_alignment ---
|
|
56
|
+
# Flat-module per ADR 0001 (Stay flat through v1.x; subpackage
|
|
57
|
+
# restructure deferred to v2.0). Closes #73. Motivated by consumer
|
|
58
|
+
# V1.3.2 P1-2 ADR-029 mis-citation finding.
|
|
59
|
+
"ADRSubject": "eval_toolkit.audit_citation_alignment",
|
|
60
|
+
"CitationMisalignment": "eval_toolkit.audit_citation_alignment",
|
|
61
|
+
"extract_adr_subject_category": "eval_toolkit.audit_citation_alignment",
|
|
62
|
+
"validate_citations": "eval_toolkit.audit_citation_alignment",
|
|
55
63
|
# --- losses ---
|
|
56
64
|
"RecallAtLowFPR": "eval_toolkit.losses",
|
|
57
65
|
# --- preprocessing ---
|
|
@@ -0,0 +1,301 @@
|
|
|
1
|
+
r"""ADR-citation alignment validator.
|
|
2
|
+
|
|
3
|
+
Catches the bug class where a reader-facing markdown surface cites
|
|
4
|
+
"per ADR-NNN" but the cited ADR's actual subject doesn't match the
|
|
5
|
+
surrounding claim category.
|
|
6
|
+
|
|
7
|
+
Motivating test case (from `prompt-injection-detection-prototype` v1.3.2
|
|
8
|
+
audit, file `docs/REPRODUCIBILITY.md:76`)::
|
|
9
|
+
|
|
10
|
+
"Two-tier reproduction (locked at Phase 0-07 via ADR-029):"
|
|
11
|
+
|
|
12
|
+
ADR-029 is the test-marker-strategy ADR (unit / smoke / integration /
|
|
13
|
+
network markers). The actual reproducibility-tier-lock ADR is ADR-034.
|
|
14
|
+
The cited id is off by 5 (029 vs. 034), with surrounding context "Two-tier
|
|
15
|
+
reproduction" clearly in the *reproducibility* category, not the
|
|
16
|
+
*test_markers* category. :func:`validate_citations` flags this case.
|
|
17
|
+
|
|
18
|
+
Design (per ADR 0001 contract-first; ADR 0002 metric-spec style for the
|
|
19
|
+
configurable categories):
|
|
20
|
+
|
|
21
|
+
- The validator is **pure**: pass in markdown text + ADR frontmatter +
|
|
22
|
+
a category-keyword map; get back a list of
|
|
23
|
+
:class:`CitationMisalignment` records. No filesystem I/O inside the
|
|
24
|
+
validator; the CLI wrapper handles globbing.
|
|
25
|
+
- Categories are **consumer-supplied**: this module ships no default
|
|
26
|
+
category map. Consumers wire their project's claim taxonomy
|
|
27
|
+
(reproducibility / cost / calibration / threshold / contamination /
|
|
28
|
+
test_markers / leakage / etc.) into the validator.
|
|
29
|
+
|
|
30
|
+
References
|
|
31
|
+
----------
|
|
32
|
+
.. [1] Nygard, M. "Documenting Architecture Decisions." 2011.
|
|
33
|
+
https://cognitect.com/blog/2011/11/15/documenting-architecture-decisions
|
|
34
|
+
"""
|
|
35
|
+
|
|
36
|
+
from __future__ import annotations
|
|
37
|
+
|
|
38
|
+
import re
|
|
39
|
+
from collections.abc import Sequence
|
|
40
|
+
from dataclasses import dataclass
|
|
41
|
+
from pathlib import Path
|
|
42
|
+
from typing import Final
|
|
43
|
+
|
|
44
|
+
# Default citation pattern: matches "per ADR-NNN", "via ADR-NNN", "by ADR-NNN",
# "under ADR-NNN". The leading inline ``(?i)`` flag makes the *whole* pattern
# case-insensitive (the lead-in word and the "ADR" token alike); ADR-NNN is
# 3-digit-zero-padded by Nygard convention. Group 1 captures the 3-digit id.
#
# ``\b`` anchors the lead-in word so it cannot match as the tail of a longer
# word ("nearby ADR-029" must not count as a "by ADR-029" citation), and
# ``(?!\d)`` rejects ids with more than three digits instead of reporting a
# truncated id ("ADR-0345" must not be read as ADR-034).
DEFAULT_CITATION_PATTERN: Final[str] = (
    r"(?i)\b(?:per|via|by|under)\s+ADR-(\d{3})(?!\d)"
)

# Sniff radius around a citation match for category-keyword matching.
# Locked at ±2 lines so the validator catches citations whose claim
# category is on the immediately-adjacent line (common in Markdown
# tables / bullet lists / wrapped prose).
DEFAULT_CONTEXT_LINES: Final[int] = 2
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
@dataclass(frozen=True)
class ADRSubject:
    """Subject category of a single ADR.

    Declared with ``frozen=True``, so instances are immutable after
    construction. The class only stores the four fields below; it performs
    no validation of their contents.

    Parameters
    ----------
    adr_id : str
        3-digit-zero-padded ADR id, e.g. ``"029"``.
    title : str
        ADR title (from frontmatter ``title:`` field).
    slug : str
        ADR slug (from frontmatter ``slug:`` field). Often informative
        about the actual subject.
    category : str | None
        Claim-taxonomy category the ADR belongs to (e.g.
        ``"test_markers"``, ``"reproducibility"``, ``"cost"``). ``None``
        if no category matched the ADR's title/slug keywords (caller
        decides whether to treat ``None`` as a finding or skip).
    """

    adr_id: str
    title: str
    slug: str
    # ``None`` means "no category keyword matched", not "unknown ADR".
    category: str | None
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
@dataclass(frozen=True)
class CitationMisalignment:
    """A "per ADR-NNN" citation whose category doesn't match the cited ADR's subject.

    Declared with ``frozen=True``, so instances are immutable after
    construction. The class only stores the fields below.

    Parameters
    ----------
    file : Path
        Reader-facing markdown file the citation appears in.
    line : int
        1-indexed line number of the citation.
    cited_adr_id : str
        3-digit-zero-padded ADR id from the citation.
    surrounding_text : str
        ≤120 chars of context around the citation (for human review).
        NOTE(review): the ``_extract_context_text`` helper in this module
        truncates at 300 characters, not 120 — confirm which bound the
        producer of this field actually applies.
    claim_category : str | None
        Category inferred from the surrounding text (None if no
        category keyword matched).
    adr_actual_category : str | None
        Category inferred from the cited ADR's title+slug (None if no
        category keyword matched).
    """

    file: Path
    line: int
    cited_adr_id: str
    surrounding_text: str
    claim_category: str | None
    adr_actual_category: str | None
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
def extract_adr_subject_category(
    title: str,
    slug: str,
    category_keywords: dict[str, list[str]],
) -> str | None:
    """Infer an ADR's claim-taxonomy category from its title + slug.

    Walks each ``(category, keywords)`` entry in ``category_keywords``
    and returns the first category for which at least one keyword appears
    as a substring of the concatenated ``"<title> <slug>"``
    (case-insensitive).

    Parameters
    ----------
    title : str
        ADR title from frontmatter.
    slug : str
        ADR slug from frontmatter or filename.
    category_keywords : dict[str, list[str]]
        Map from category name to a list of keyword substrings. First
        keyword match wins; categories are tested in dict-insertion
        order, so the caller controls priority. Empty or whitespace-only
        keywords are ignored: they are substrings of every haystack and
        would otherwise assign their category to every ADR.

    Returns
    -------
    str | None
        Matching category name, or ``None`` if no keyword matched.

    Examples
    --------
    >>> extract_adr_subject_category(
    ...     title="Reproducibility tier - full ladder T0 + T1 + T3",
    ...     slug="reproducibility-tier-full-ladder",
    ...     category_keywords={
    ...         "test_markers": ["marker", "smoke marker"],
    ...         "reproducibility": ["reproduc", "tier"],
    ...     },
    ... )
    'reproducibility'
    """
    haystack = f"{title} {slug}".lower()
    for category, keywords in category_keywords.items():
        # ``keyword.strip()`` guards against blank keywords, which would
        # match unconditionally ("" and " " are both in the haystack).
        if any(
            keyword.strip() and keyword.lower() in haystack
            for keyword in keywords
        ):
            return category
    return None
|
|
157
|
+
|
|
158
|
+
|
|
159
|
+
def _extract_context_text(
    lines: list[str],
    line_index: int,
    context_lines: int,
) -> str:
    """Return a snippet of at most 300 characters around ``line_index``.

    Parameters
    ----------
    lines : list[str]
        The document split into lines.
    line_index : int
        1-indexed line number of the citation.
    context_lines : int
        Number of lines to include on each side of ``line_index``.
        Negative values are treated as ``0`` so the cited line itself is
        always part of the window.

    Returns
    -------
    str
        The window's lines, each stripped of surrounding whitespace and
        joined with single spaces, truncated to 300 characters. Empty if
        the window falls outside ``lines``.
    """
    max_chars = 300
    # A negative radius would invert the slice bounds and drop the cited
    # line; clamp so the window never shrinks below the line itself.
    radius = max(0, context_lines)
    start = max(0, line_index - 1 - radius)
    end = min(len(lines), line_index + radius)
    return " ".join(line.strip() for line in lines[start:end])[:max_chars]
|
|
168
|
+
|
|
169
|
+
|
|
170
|
+
def _infer_claim_category(
    context: str,
    category_keywords: dict[str, list[str]],
) -> str | None:
    """Infer the claim category of a prose snippet by keyword matching.

    Applies a first-match-wins, case-insensitive substring check to
    ``context``: categories are tried in dict-insertion order and the
    first one with any keyword present in the text is returned.

    Parameters
    ----------
    context : str
        Prose surrounding a citation.
    category_keywords : dict[str, list[str]]
        Map from category name to keyword substrings. Empty or
        whitespace-only keywords are ignored, since they would match any
        non-trivial context and mask the real category.

    Returns
    -------
    str | None
        Matching category name, or ``None`` if no keyword matched.
    """
    haystack = context.lower()
    for category, keywords in category_keywords.items():
        # Blank keywords are skipped: "" is a substring of every string.
        if any(
            keyword.strip() and keyword.lower() in haystack
            for keyword in keywords
        ):
            return category
    return None
|
|
181
|
+
|
|
182
|
+
|
|
183
|
+
def validate_citations(
    *,
    markdown_text: str,
    markdown_path: Path,
    adr_subjects: dict[str, ADRSubject],
    category_keywords: dict[str, list[str]],
    citation_pattern: str = DEFAULT_CITATION_PATTERN,
    context_lines: int = DEFAULT_CONTEXT_LINES,
    known_exempt_citations: Sequence[tuple[Path, int, str]] = (),
) -> list[CitationMisalignment]:
    """Find "per ADR-NNN" citations whose category doesn't match the cited ADR.

    Parameters
    ----------
    markdown_text : str
        Body of the reader-facing markdown file.
    markdown_path : Path
        Path of the markdown file (for misalignment.file annotation).
    adr_subjects : dict[str, ADRSubject]
        Map from 3-digit ADR id to :class:`ADRSubject` records. Caller
        builds this by parsing each ADR's frontmatter; the
        ``ADRSubject.category`` field is populated via
        :func:`extract_adr_subject_category`.
    category_keywords : dict[str, list[str]]
        Map from category name to substring-keyword list (used both for
        ADR subject inference and for surrounding-text category
        inference). Same map MUST be used for both directions.
    citation_pattern : str, optional
        Regex finding the citation surface. Group 1 must capture the
        3-digit ADR id. Default :data:`DEFAULT_CITATION_PATTERN`
        matches "per/via/by/under ADR-NNN".
    context_lines : int, optional
        Number of lines (±) around the citation to consider when
        inferring the claim category. Default
        :data:`DEFAULT_CONTEXT_LINES` (=2).
    known_exempt_citations : Sequence of (Path, int, str), optional
        ``(file, line, cited_adr_id)`` tuples to skip. Useful for
        consumers with known historical drift that's been accepted by
        policy (e.g., immutable ADR bodies with frozen-in errors that
        a superseding ADR has already addressed). Paths are compared
        by their ``str()`` form against ``markdown_path``.

    Returns
    -------
    list[CitationMisalignment]
        One :class:`CitationMisalignment` per misaligned citation.
        Empty if no misalignments OR no citations matched the pattern.

    Notes
    -----
    A citation with ``claim_category=None`` (no category keyword
    matched the surrounding context) is **NOT** flagged as a
    misalignment. The validator defers to the caller's category map:
    if the caller's vocabulary doesn't cover the claim, there's no
    basis for saying the citation is misaligned. To force every
    citation to be flaggable, the caller should ensure their
    ``category_keywords`` has broad coverage.

    A citation of an unknown ADR id (absent from ``adr_subjects``) is
    skipped; checking that the ADR exists is out of scope here.

    The claim category is inferred from the *complete* ±
    ``context_lines`` window. The length-capped snippet is only used
    for the ``surrounding_text`` field of the report, so long lines
    cannot push part of the window out of the inference.

    Examples
    --------
    >>> adr_subjects = {
    ...     "029": ADRSubject(
    ...         adr_id="029",
    ...         title="Test marker strategy",
    ...         slug="test-marker-strategy",
    ...         category="test_markers",
    ...     ),
    ... }
    >>> result = validate_citations(
    ...     markdown_text="Two-tier reproduction locked at Phase 0-07 via ADR-029.\\n",
    ...     markdown_path=Path("docs/REPRODUCIBILITY.md"),
    ...     adr_subjects=adr_subjects,
    ...     category_keywords={
    ...         "reproducibility": ["reproduc", "tier", "T0", "T1", "T3"],
    ...         "test_markers": ["marker"],
    ...     },
    ... )
    >>> len(result)
    1
    >>> result[0].cited_adr_id
    '029'
    >>> result[0].claim_category
    'reproducibility'
    >>> result[0].adr_actual_category
    'test_markers'
    """
    exempt_set = {(str(p), ln, adr) for (p, ln, adr) in known_exempt_citations}
    path_key = str(markdown_path)
    citation_re = re.compile(citation_pattern)
    lines = markdown_text.splitlines()
    misalignments: list[CitationMisalignment] = []

    for line_no, line in enumerate(lines, start=1):
        # Context depends only on the line, not on the individual match, so
        # it is built lazily once per line (``snippet is None`` == not yet).
        snippet: str | None = None
        claim_category: str | None = None
        for match in citation_re.finditer(line):
            adr_id = match.group(1)
            if (path_key, line_no, adr_id) in exempt_set:
                continue
            subject = adr_subjects.get(adr_id)
            if subject is None:
                # Citation references an unknown ADR. Out of scope for
                # this validator (a different validator should check
                # "does ADR-NNN exist"). Skip.
                continue
            if snippet is None:
                # Infer from the whole window; the helper's length cap is
                # meant for display and must not hide context from matching.
                lo = max(0, line_no - 1 - context_lines)
                hi = min(len(lines), line_no + context_lines)
                window = " ".join(text.strip() for text in lines[lo:hi])
                claim_category = _infer_claim_category(window, category_keywords)
                snippet = _extract_context_text(lines, line_no, context_lines)
            if claim_category is None:
                # No category basis for comparison; skip per the Notes above.
                continue
            if claim_category == subject.category:
                continue
            misalignments.append(
                CitationMisalignment(
                    file=markdown_path,
                    line=line_no,
                    cited_adr_id=adr_id,
                    surrounding_text=snippet,
                    claim_category=claim_category,
                    adr_actual_category=subject.category,
                )
            )
    return misalignments
|
|
@@ -792,6 +792,20 @@ def expected_calibration_error(
|
|
|
792
792
|
empirical positive rate in the bin, and :math:`\\mathrm{conf}` is the
|
|
793
793
|
mean predicted score.
|
|
794
794
|
|
|
795
|
+
**Uniform / uninformative scores** (F-metrics-3 v1.0.2 clarity pass):
|
|
796
|
+
when ``y_score`` is constant (e.g., ``[0.5] * n`` — an uninformative
|
|
797
|
+
detector), this function returns ``0.0`` regardless of the true label
|
|
798
|
+
distribution. That's technically correct per the formula —
|
|
799
|
+
:math:`|\\mathrm{acc}(B_m) - \\mathrm{conf}(B_m)|` measures bin-level
|
|
800
|
+
calibration, and a single occupied bin with ``conf = base rate``
|
|
801
|
+
achieves perfect calibration locally. But it is semantically
|
|
802
|
+
misleading: an uninformative scorer looks "perfectly calibrated"
|
|
803
|
+
even though it has zero discriminative power. **Callers should
|
|
804
|
+
detect and filter uninformative inputs before passing to ECE** —
|
|
805
|
+
e.g., reject when ``np.unique(y_score).size == 1`` or when the
|
|
806
|
+
score variance is below a domain-specific threshold. Use
|
|
807
|
+
:func:`brier_score` or :func:`pr_auc` for resolution-aware metrics.
|
|
808
|
+
|
|
795
809
|
References
|
|
796
810
|
----------
|
|
797
811
|
.. [1] DeGroot, M. H. & Fienberg, S. E. "The comparison and evaluation of
|
|
@@ -1240,6 +1254,30 @@ def brier_score(
|
|
|
1240
1254
|
-----
|
|
1241
1255
|
.. math:: \mathrm{BS} = \frac{1}{n} \sum_i (p_i - y_i)^2
|
|
1242
1256
|
|
|
1257
|
+
**Input domain** (F-metrics-1 v1.0.2 clarity pass): ``y_true`` must
|
|
1258
|
+
be binary labels in ``{0, 1}`` (other label values raise
|
|
1259
|
+
``ValueError``). ``y_score`` must be calibrated probabilities in
|
|
1260
|
+
``[0, 1]`` — raw logits or unbounded ranking scores will pass the
|
|
1261
|
+
finiteness check but produce an out-of-range MSE that misrepresents
|
|
1262
|
+
calibration quality. If your scorer produces logits, apply
|
|
1263
|
+
sigmoid / softmax / a fitted calibrator (see
|
|
1264
|
+
:mod:`eval_toolkit.calibration`) before passing to ``brier_score``.
|
|
1265
|
+
|
|
1266
|
+
**Single-class behavior** (F-metrics-4 v1.0.2 clarity pass): unlike
|
|
1267
|
+
PR-AUC / ROC-AUC, ``brier_score`` is well-defined when ``y_true``
|
|
1268
|
+
is all-zeros or all-ones — it degenerates to the MSE around the
|
|
1269
|
+
constant class label. Specifically:
|
|
1270
|
+
|
|
1271
|
+
- All-zeros: :math:`\mathrm{BS} = \frac{1}{n} \sum_i p_i^2` —
|
|
1272
|
+
forecasting any positive probability incurs squared-error loss.
|
|
1273
|
+
- All-ones: :math:`\mathrm{BS} = \frac{1}{n} \sum_i (1 - p_i)^2`
|
|
1274
|
+
— forecasting low probability incurs squared-error loss.
|
|
1275
|
+
|
|
1276
|
+
This is the deliberate Brier-as-strict-proper-scoring-rule behavior
|
|
1277
|
+
(Brier 1950). Per-slice degenerate-class evaluation is supported
|
|
1278
|
+
via the ``empty_strategy`` parameter for ``n=0`` only; non-empty
|
|
1279
|
+
single-class slices score normally.
|
|
1280
|
+
|
|
1243
1281
|
See Also
|
|
1244
1282
|
--------
|
|
1245
1283
|
eval_toolkit.metrics.brier_decomposition :
|
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"__all__": [
|
|
3
|
+
"ADRSubject",
|
|
3
4
|
"ADVANCED_TECHNIQUES",
|
|
4
5
|
"ALL_TECHNIQUES",
|
|
5
6
|
"ANCHOR_RE",
|
|
@@ -11,6 +12,7 @@
|
|
|
11
12
|
"CISafeThresholdSelector",
|
|
12
13
|
"CORE_TECHNIQUES",
|
|
13
14
|
"CaseInjection",
|
|
15
|
+
"CitationMisalignment",
|
|
14
16
|
"ClaimReport",
|
|
15
17
|
"ClaimSpec",
|
|
16
18
|
"CorrectionMethod",
|
|
@@ -156,6 +158,7 @@
|
|
|
156
158
|
"evaluate_folded",
|
|
157
159
|
"evaluate_scorer_on_slice",
|
|
158
160
|
"external_diagnostic_gate",
|
|
161
|
+
"extract_adr_subject_category",
|
|
159
162
|
"fdr_bh_correct",
|
|
160
163
|
"figure_metadata",
|
|
161
164
|
"file_sha256",
|
|
@@ -235,6 +238,7 @@
|
|
|
235
238
|
"stratified_recall",
|
|
236
239
|
"strict_artifact_gate",
|
|
237
240
|
"sweep",
|
|
241
|
+
"validate_citations",
|
|
238
242
|
"validate_manifest",
|
|
239
243
|
"validate_payload",
|
|
240
244
|
"validate_prediction_artifact_ref",
|
|
@@ -248,6 +252,14 @@
|
|
|
248
252
|
"write_run_result"
|
|
249
253
|
],
|
|
250
254
|
"entries": {
|
|
255
|
+
"ADRSubject": {
|
|
256
|
+
"bases": [
|
|
257
|
+
"object"
|
|
258
|
+
],
|
|
259
|
+
"doc_first_line": "Subject category of a single ADR.",
|
|
260
|
+
"kind": "class",
|
|
261
|
+
"signature": "(adr_id: 'str', title: 'str', slug: 'str', category: 'str | None') -> None"
|
|
262
|
+
},
|
|
251
263
|
"ADVANCED_TECHNIQUES": {
|
|
252
264
|
"doc_first_line": "Built-in immutable sequence.",
|
|
253
265
|
"kind": "value",
|
|
@@ -329,6 +341,14 @@
|
|
|
329
341
|
"kind": "class",
|
|
330
342
|
"signature": "(ratio: 'float' = 0.5, seed: 'int' = 42, name: 'str' = 'case_random') -> None"
|
|
331
343
|
},
|
|
344
|
+
"CitationMisalignment": {
|
|
345
|
+
"bases": [
|
|
346
|
+
"object"
|
|
347
|
+
],
|
|
348
|
+
"doc_first_line": "A \"per ADR-NNN\" citation whose category doesn't match the cited ADR's subject.",
|
|
349
|
+
"kind": "class",
|
|
350
|
+
"signature": "(file: 'Path', line: 'int', cited_adr_id: 'str', surrounding_text: 'str', claim_category: 'str | None', adr_actual_category: 'str | None') -> None"
|
|
351
|
+
},
|
|
332
352
|
"ClaimReport": {
|
|
333
353
|
"bases": [
|
|
334
354
|
"object"
|
|
@@ -1353,7 +1373,7 @@
|
|
|
1353
1373
|
"doc_first_line": "str(object='') -> str",
|
|
1354
1374
|
"kind": "value",
|
|
1355
1375
|
"type": "str",
|
|
1356
|
-
"value": "'0.
|
|
1376
|
+
"value": "'1.0.2'"
|
|
1357
1377
|
},
|
|
1358
1378
|
"apply_operating_points": {
|
|
1359
1379
|
"doc_first_line": "Apply fitted thresholds to a mixed-class or single-class target slice.",
|
|
@@ -1480,6 +1500,11 @@
|
|
|
1480
1500
|
"kind": "function",
|
|
1481
1501
|
"signature": "(path: 'str', *, op: \"Literal['<', '<=', '>', '>=', '=='] | None\" = None, threshold: 'float | None' = None, severity: 'GateSeverity' = 'error') -> 'EvidenceGate'"
|
|
1482
1502
|
},
|
|
1503
|
+
"extract_adr_subject_category": {
|
|
1504
|
+
"doc_first_line": "Infer an ADR's claim-taxonomy category from its title + slug.",
|
|
1505
|
+
"kind": "function",
|
|
1506
|
+
"signature": "(title: 'str', slug: 'str', category_keywords: 'dict[str, list[str]]') -> 'str | None'"
|
|
1507
|
+
},
|
|
1483
1508
|
"fdr_bh_correct": {
|
|
1484
1509
|
"doc_first_line": "Benjamini-Hochberg false-discovery-rate correction.",
|
|
1485
1510
|
"kind": "function",
|
|
@@ -1875,6 +1900,11 @@
|
|
|
1875
1900
|
"kind": "function",
|
|
1876
1901
|
"signature": "(strategies: 'Sequence[TextTransform]', texts: 'Sequence[str]', *, scorer: 'Scorer | None' = None, attack_threshold: 'float | None' = None) -> 'pd.DataFrame'"
|
|
1877
1902
|
},
|
|
1903
|
+
"validate_citations": {
|
|
1904
|
+
"doc_first_line": "Find \"per ADR-NNN\" citations whose category doesn't match the cited ADR.",
|
|
1905
|
+
"kind": "function",
|
|
1906
|
+
"signature": "(*, markdown_text: 'str', markdown_path: 'Path', adr_subjects: 'dict[str, ADRSubject]', category_keywords: 'dict[str, list[str]]', citation_pattern: 'str' = '(?i)(?:per|via|by|under)\\\\s+ADR-(\\\\d{3})', context_lines: 'int' = 2, known_exempt_citations: 'Sequence[tuple[Path, int, str]]' = ()) -> 'list[CitationMisalignment]'"
|
|
1907
|
+
},
|
|
1878
1908
|
"validate_manifest": {
|
|
1879
1909
|
"doc_first_line": "Validate a serialized ``RunManifest`` payload.",
|
|
1880
1910
|
"kind": "function",
|