eval-toolkit 1.0.0__tar.gz → 1.0.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (185) hide show
  1. {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/CHANGELOG.md +86 -0
  2. {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/PKG-INFO +3 -2
  3. {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/README.md +2 -1
  4. {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/src/eval_toolkit/__init__.py +8 -0
  5. {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/src/eval_toolkit/_version.py +1 -1
  6. eval_toolkit-1.0.2/src/eval_toolkit/audit_citation_alignment.py +301 -0
  7. {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/src/eval_toolkit/metrics.py +38 -0
  8. {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/golden/public_api/snapshot.json +31 -1
  9. eval_toolkit-1.0.2/tests/test_audit_citation_alignment.py +242 -0
  10. {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_harness_folded.py +23 -0
  11. {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/.gitignore +0 -0
  12. {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/LICENSE +0 -0
  13. {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/STYLE.md +0 -0
  14. {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/docs/archive/README.md +0 -0
  15. {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/docs/research/README.md +0 -0
  16. {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/docs/research/datasets/README.md +0 -0
  17. {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/docs/research/papers/data-integrity/README.md +0 -0
  18. {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/docs/research/papers/eval-ecosystem/README.md +0 -0
  19. {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/docs/research/papers/inference/README.md +0 -0
  20. {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/docs/research/papers/prompt-injection/README.md +0 -0
  21. {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/docs/source/adr/README.md +0 -0
  22. {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/docs/source/methodology/README.md +0 -0
  23. {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/pyproject.toml +0 -0
  24. {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/src/eval_toolkit/__main__.py +0 -0
  25. {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/src/eval_toolkit/_deprecated.py +0 -0
  26. {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/src/eval_toolkit/_parallel.py +0 -0
  27. {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/src/eval_toolkit/_rng.py +0 -0
  28. {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/src/eval_toolkit/_sweep.py +0 -0
  29. {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/src/eval_toolkit/adversarial.py +0 -0
  30. {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/src/eval_toolkit/analysis.py +0 -0
  31. {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/src/eval_toolkit/artifacts.py +0 -0
  32. {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/src/eval_toolkit/bootstrap.py +0 -0
  33. {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/src/eval_toolkit/calibration.py +0 -0
  34. {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/src/eval_toolkit/claims.py +0 -0
  35. {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/src/eval_toolkit/config.py +0 -0
  36. {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/src/eval_toolkit/docs.py +0 -0
  37. {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/src/eval_toolkit/embeddings.py +0 -0
  38. {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/src/eval_toolkit/evidence.py +0 -0
  39. {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/src/eval_toolkit/harness.py +0 -0
  40. {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/src/eval_toolkit/leakage.py +0 -0
  41. {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/src/eval_toolkit/loaders.py +0 -0
  42. {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/src/eval_toolkit/losses.py +0 -0
  43. {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/src/eval_toolkit/manifest.py +0 -0
  44. {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/src/eval_toolkit/metric_specs.py +0 -0
  45. {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/src/eval_toolkit/operating_points.py +0 -0
  46. {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/src/eval_toolkit/paths.py +0 -0
  47. {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/src/eval_toolkit/plotting.py +0 -0
  48. {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/src/eval_toolkit/preprocessing.py +0 -0
  49. {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/src/eval_toolkit/probes.py +0 -0
  50. {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/src/eval_toolkit/protocols.py +0 -0
  51. {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/src/eval_toolkit/provenance.py +0 -0
  52. {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/src/eval_toolkit/py.typed +0 -0
  53. {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/src/eval_toolkit/schemas/manifest.v1.json +0 -0
  54. {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/src/eval_toolkit/schemas/manifest.v2.json +0 -0
  55. {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/src/eval_toolkit/schemas/manifest.v3.json +0 -0
  56. {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/src/eval_toolkit/schemas/ood_manifest.v1.json +0 -0
  57. {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/src/eval_toolkit/schemas/results.v1.json +0 -0
  58. {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/src/eval_toolkit/schemas/results_full.v1.json +0 -0
  59. {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/src/eval_toolkit/scorecards.py +0 -0
  60. {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/src/eval_toolkit/seeds.py +0 -0
  61. {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/src/eval_toolkit/splits.py +0 -0
  62. {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/src/eval_toolkit/stacking.py +0 -0
  63. {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/src/eval_toolkit/text_dedup.py +0 -0
  64. {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/src/eval_toolkit/thresholds.py +0 -0
  65. {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/baseline/test_plotting_visual/plot_bootstrap_distribution.png +0 -0
  66. {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/baseline/test_plotting_visual/plot_confusion_matrix_grid.png +0 -0
  67. {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/baseline/test_plotting_visual/plot_lift_ci.png +0 -0
  68. {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/baseline/test_plotting_visual/plot_metric_bars.png +0 -0
  69. {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/baseline/test_plotting_visual/plot_pareto_frontier.png +0 -0
  70. {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/baseline/test_plotting_visual/plot_pr_curve.png +0 -0
  71. {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/baseline/test_plotting_visual/plot_reliability_diagram.png +0 -0
  72. {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/baseline/test_plotting_visual/plot_roc_curve.png +0 -0
  73. {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/baseline/test_plotting_visual/plot_score_histograms.png +0 -0
  74. {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/baseline/test_plotting_visual/plot_slice_metric_heatmap.png +0 -0
  75. {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/benchmarks/__init__.py +0 -0
  76. {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/benchmarks/test_kernel_benchmarks.py +0 -0
  77. {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/conftest.py +0 -0
  78. {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/golden/bootstrap_ci/cases.json +0 -0
  79. {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/golden/data/dedup_holdout.jsonl +0 -0
  80. {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/golden/data/dedup_holdout_expected.json +0 -0
  81. {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/golden/data/dedup_holdout_provenance.md +0 -0
  82. {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/golden/docs/expected.md +0 -0
  83. {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/golden/docs/input.md +0 -0
  84. {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/golden/docs/metrics.json +0 -0
  85. {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/golden/test_dedup_holdout_calibration.py +0 -0
  86. {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/strategies.py +0 -0
  87. {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_adversarial.py +0 -0
  88. {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_analysis.py +0 -0
  89. {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_artifacts.py +0 -0
  90. {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_block_bootstrap_on_folds.py +0 -0
  91. {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_bootstrap_calibration_mc.py +0 -0
  92. {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_bootstrap_edge_cases.py +0 -0
  93. {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_bootstrap_golden.py +0 -0
  94. {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_bootstrap_njobs.py +0 -0
  95. {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_bootstrap_props.py +0 -0
  96. {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_bootstrap_research_grounded.py +0 -0
  97. {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_bootstrap_unit.py +0 -0
  98. {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_calibration_binary_adapters.py +0 -0
  99. {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_calibration_bootstrap_chain.py +0 -0
  100. {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_calibration_determinism.py +0 -0
  101. {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_calibration_optimization_failures.py +0 -0
  102. {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_calibration_props.py +0 -0
  103. {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_calibration_research_grounded.py +0 -0
  104. {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_calibration_unit.py +0 -0
  105. {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_claims.py +0 -0
  106. {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_claims_coverage.py +0 -0
  107. {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_claims_props.py +0 -0
  108. {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_cli.py +0 -0
  109. {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_config.py +0 -0
  110. {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_coverage_bootstrap.py +0 -0
  111. {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_coverage_calibration.py +0 -0
  112. {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_coverage_harness.py +0 -0
  113. {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_coverage_metrics.py +0 -0
  114. {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_coverage_plotting.py +0 -0
  115. {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_croissant_e2e.py +0 -0
  116. {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_dedup_split_leakage_chain.py +0 -0
  117. {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_deprecated_scalars_shim.py +0 -0
  118. {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_deprecations.py +0 -0
  119. {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_docs_golden.py +0 -0
  120. {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_docs_props.py +0 -0
  121. {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_embeddings.py +0 -0
  122. {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_evidence_validators.py +0 -0
  123. {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_harness_edge_cases.py +0 -0
  124. {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_harness_fault_injection.py +0 -0
  125. {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_harness_internals.py +0 -0
  126. {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_harness_metric_options.py +0 -0
  127. {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_harness_parallelism.py +0 -0
  128. {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_harness_smoke.py +0 -0
  129. {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_import_boundaries.py +0 -0
  130. {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_is_metric_defined_for_slice.py +0 -0
  131. {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_lazy_extras_messages.py +0 -0
  132. {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_leakage.py +0 -0
  133. {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_leakage_error_paths.py +0 -0
  134. {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_leakage_props.py +0 -0
  135. {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_loaders.py +0 -0
  136. {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_loaders_coverage.py +0 -0
  137. {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_loaders_props.py +0 -0
  138. {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_logging.py +0 -0
  139. {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_losses.py +0 -0
  140. {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_manifest.py +0 -0
  141. {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_manifest_contamination_round_trip.py +0 -0
  142. {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_manifest_props.py +0 -0
  143. {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_manifest_validation.py +0 -0
  144. {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_metrics_props.py +0 -0
  145. {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_metrics_stratified_subsets.py +0 -0
  146. {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_metrics_unit.py +0 -0
  147. {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_misc_coverage.py +0 -0
  148. {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_numeric_edge_cases.py +0 -0
  149. {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_ood_loader.py +0 -0
  150. {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_operating_points.py +0 -0
  151. {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_operating_points_props.py +0 -0
  152. {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_parallel.py +0 -0
  153. {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_paths.py +0 -0
  154. {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_pipeline_e2e.py +0 -0
  155. {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_plotting_edge.py +0 -0
  156. {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_plotting_smoke.py +0 -0
  157. {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_plotting_visual.py +0 -0
  158. {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_preprocessing.py +0 -0
  159. {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_probes.py +0 -0
  160. {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_protocol_conformance.py +0 -0
  161. {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_provenance.py +0 -0
  162. {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_public_api.py +0 -0
  163. {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_recall_at_fpr.py +0 -0
  164. {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_reference_equivalence.py +0 -0
  165. {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_reproducibility_integration.py +0 -0
  166. {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_rng.py +0 -0
  167. {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_schemas.py +0 -0
  168. {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_scorecard.py +0 -0
  169. {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_seeds.py +0 -0
  170. {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_splits.py +0 -0
  171. {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_splits_leakage_integration.py +0 -0
  172. {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_splits_props.py +0 -0
  173. {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_stacking.py +0 -0
  174. {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_sweep.py +0 -0
  175. {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_text_dedup.py +0 -0
  176. {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_text_dedup_coverage.py +0 -0
  177. {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_text_dedup_props.py +0 -0
  178. {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_text_dedup_strategies.py +0 -0
  179. {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_thresholds.py +0 -0
  180. {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_thresholds_constant_score.py +0 -0
  181. {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_thresholds_coverage.py +0 -0
  182. {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_thresholds_props.py +0 -0
  183. {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_thresholds_research_grounded.py +0 -0
  184. {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_tokenization_leakage_check.py +0 -0
  185. {eval_toolkit-1.0.0 → eval_toolkit-1.0.2}/tests/test_v09_contracts.py +0 -0
@@ -5,6 +5,92 @@ All notable changes to this project will be documented in this file.
5
5
  The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
6
6
  and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
7
7
 
8
+ ## [1.0.2] — 2026-05-26 — #76 cleanup batch closes (RC2 + RC3 + F-metrics-1/3/4)
9
+
10
+ Closes the GH #76 v1.0.1 cleanup tracker. All 6 items shipped across
11
+ v1.0.1 (RC4) and v1.0.2 (this release). All P3, all NON-BREAKING.
12
+
13
+ ### Changed (Tier-2 ADDITIVE: contract clarification only)
14
+
15
+ - **RC2** (#76) — `SimilarityStrategy` Protocol promoted from
16
+ "pre-v0.7 internal interface" (prose framing only) to formal
17
+ 10th strict Tier-2 Protocol per [ADR 0003](docs/source/adr/0003-stability-contract-and-gate3-methodology.md).
18
+ Aligns prose surfaces (README, extending.md, strict_tier2_protocols.md,
19
+ api/protocols.md, ADR 0004 §D6, roadmap.md) with the contract
20
+ already locked in `tests/golden/public_api/snapshot.json` +
21
+ `src/eval_toolkit/__init__.py:_EXPORTS` since v1.0.0. **No code
22
+ change — documentation-only reconciliation.** Strict-Tier-2 count
23
+ goes 9 → 10 (+ 1 opt-in `Versioned`).
24
+
25
+ ### Fixed
26
+
27
+ - **RC3** (#76) — `tests/test_harness_folded.py::test_evaluate_folded_reseed_splitter_varies_partitions`
28
+ test hardening. Previous assertions covered count + key existence
29
+ only; a regression silently reusing the splitter (R8-C1 pre-fix
30
+ behavior) could still pass. v1.0.2 adds row-content comparison:
31
+ replays `reseed_splitter` against the splitter for `seed=1` vs
32
+ `seed=2` and asserts fold-0 test partitions differ via feature-text
33
+ set membership (robust to `_slice_subset`'s `reset_index(drop=True)`
34
+ via stable text-column identifiers).
35
+
36
+ - **F-metrics-1** (#76) — `brier_score` docstring input-domain clarity.
37
+ Added explicit "Input domain" Notes subsection clarifying binary
38
+ labels in `{0, 1}` + calibrated probabilities in `[0, 1]` are
39
+ required; raw logits or unbounded ranking scores pass the finiteness
40
+ check but produce out-of-range MSE that misrepresents calibration
41
+ quality. Includes calibration-applying recipe pointer.
42
+
43
+ - **F-metrics-3** (#76) — `expected_calibration_error` docstring
44
+ uniform-scores note. Added explicit Notes subsection documenting
45
+ that constant `y_score` returns 0.0 (per-bin formula trivially
46
+ satisfied) but is semantically misleading — uninformative scorers
47
+ look "perfectly calibrated" despite zero discriminative power.
48
+ Callers should filter constant inputs before ECE.
49
+
50
+ - **F-metrics-4** (#76) — `brier_score` docstring single-class
51
+ edge-case explicit. Added Notes subsection with closed-form
52
+ expressions for all-zeros (`BS = mean(p²)`) and all-ones
53
+ (`BS = mean((1-p)²)`) cases. Explicit confirmation that
54
+ per-slice degenerate-class evaluation is supported (unlike
55
+ PR-AUC / ROC-AUC).
56
+
57
+ ## [1.0.1] — 2026-05-25 — audit_citation_alignment + RC4 docs polish
58
+
59
+ First v1.x patch release. Ships the `audit_citation_alignment` validator
60
+ that's been pre-staged by consumer `prompt-injection-detection-prototype`
61
+ (see #77), plus the smallest #76 cleanup item (RC4).
62
+
63
+ ### Added
64
+
65
+ - **`audit_citation_alignment` module** — flat-module per [ADR 0001](docs/source/adr/0001-flat-module-layout.md)
66
+ (stay-flat-through-v1.x; subpackage restructure deferred to v2.0).
67
+ Exports `validate_citations(...)`, `ADRSubject`, `CitationMisalignment`,
68
+ and `extract_adr_subject_category` as Tier 1 STRICT (per
69
+ [ADR 0003](docs/source/adr/0003-stability-contract-and-gate3-methodology.md)).
70
+ Validator catches the bug class where a markdown surface cites "per
71
+ ADR-NNN" but the cited ADR's actual subject doesn't match the
72
+ surrounding claim category — motivated by the V1.3.2 P1-2 finding in
73
+ the consumer `prompt-injection-detection-prototype` audit where
74
+ `docs/REPRODUCIBILITY.md:76` cited ADR-029 (test markers) for a
75
+ tier-lock claim that should have cited ADR-034 (reproducibility tier
76
+ ladder). The mis-citation went undetected by lychee (URL-resolves
77
+ check), consumer's `audit_numbers.py` (numeric values), and consumer's
78
+ `audit_adr_count_claims.py` (count claims). Closes #73.
79
+ - **Pre-tag dogfood**: `validate_citations()` exercised against
80
+ eval-toolkit's own docs (95 files including README + audit_findings +
81
+ methodology + migration guides) — 0 misalignments found. Validator
82
+ proven in production use before consumers adopt.
83
+
84
+ ### Fixed
85
+
86
+ - **RC4** (#76 cleanup) — v0.51 documentation count-tally reconciliation
87
+ across `docs/source/audit_findings.md`, `docs/source/migration/v0.51.md`,
88
+ and `CHANGELOG.md` `[0.51.0]` section. Canonical tally now consistent:
89
+ **13 confirmed → fixed in v0.51 / 3 refuted (R8-G2 + R8-G5 + R8-V1+V2
90
+ paired) / 2 deferred (R8-G3, R8-G4) = 18 total**. Prior drift was
91
+ "left 2 undecided" (migration/v0.51) vs "deferred" (CHANGELOG) +
92
+ ambiguity in the audit_findings ship-status section. Closes RC4 of #76.
93
+
8
94
  ## [1.0.0] — 2026-05-25 — Stability contract activates per ADR 0003
9
95
 
10
96
  v1.0 is a **stability-contract activation**, not a code delta from v0.51.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: eval-toolkit
3
- Version: 1.0.0
3
+ Version: 1.0.2
4
4
  Summary: Reusable evaluation contracts for binary classification: metrics, bootstrap CIs, calibration, artifacts, and evidence gates.
5
5
  Project-URL: Homepage, https://github.com/brandon-behring/eval-toolkit
6
6
  Project-URL: Documentation, https://brandon-behring.github.io/eval-toolkit/
@@ -115,7 +115,8 @@ format changes.
115
115
  ├─ Tier 2 ─ Protocol-based orchestration ────────────────┤
116
116
  │ Scorer / SliceAwareScorer / LeakageCheck / Splitter │
117
117
  │ ThresholdSelector / DatasetLoader / MetricSpec │
118
- │ MetaLearner / Probe / TextTransform (9 strict)
118
+ │ MetaLearner / Probe / TextTransform /
119
+ │ SimilarityStrategy (10 strict) │
119
120
  │ Versioned (opt-in: per-object versions in manifest) │
120
121
  ├─ Tier 1 ─ Functional core ─────────────────────────────┤
121
122
  │ pr_auc / roc_auc / ECE variants / Brier / bootstrap_ci│
@@ -32,7 +32,8 @@ format changes.
32
32
  ├─ Tier 2 ─ Protocol-based orchestration ────────────────┤
33
33
  │ Scorer / SliceAwareScorer / LeakageCheck / Splitter │
34
34
  │ ThresholdSelector / DatasetLoader / MetricSpec │
35
- │ MetaLearner / Probe / TextTransform (9 strict)
35
+ │ MetaLearner / Probe / TextTransform /
36
+ │ SimilarityStrategy (10 strict) │
36
37
  │ Versioned (opt-in: per-object versions in manifest) │
37
38
  ├─ Tier 1 ─ Functional core ─────────────────────────────┤
38
39
  │ pr_auc / roc_auc / ECE variants / Brier / bootstrap_ci│
@@ -52,6 +52,14 @@ _EXPORTS: dict[str, str] = {
52
52
  # CharacterInjectionStrategy + character_injection SimpleNamespace
53
53
  # removed at v0.47 (Decision N + plan §4E). TextTransform Protocol +
54
54
  # the 12 concrete dataclasses are now the only public path.
55
+ # --- audit_citation_alignment ---
56
+ # Flat-module per ADR 0001 (Stay flat through v1.x; subpackage
57
+ # restructure deferred to v2.0). Closes #73. Motivated by consumer
58
+ # V1.3.2 P1-2 ADR-029 mis-citation finding.
59
+ "ADRSubject": "eval_toolkit.audit_citation_alignment",
60
+ "CitationMisalignment": "eval_toolkit.audit_citation_alignment",
61
+ "extract_adr_subject_category": "eval_toolkit.audit_citation_alignment",
62
+ "validate_citations": "eval_toolkit.audit_citation_alignment",
55
63
  # --- losses ---
56
64
  "RecallAtLowFPR": "eval_toolkit.losses",
57
65
  # --- preprocessing ---
@@ -2,4 +2,4 @@
2
2
 
3
3
  __all__ = ["__version__"]
4
4
 
5
- __version__ = "1.0.0"
5
+ __version__ = "1.0.2"
@@ -0,0 +1,301 @@
1
+ r"""ADR-citation alignment validator.
2
+
3
+ Catches the bug class where a reader-facing markdown surface cites
4
+ "per ADR-NNN" but the cited ADR's actual subject doesn't match the
5
+ surrounding claim category.
6
+
7
+ Motivating test case (from `prompt-injection-detection-prototype` v1.3.2
8
+ audit, file `docs/REPRODUCIBILITY.md:76`)::
9
+
10
+ "Two-tier reproduction (locked at Phase 0-07 via ADR-029):"
11
+
12
+ ADR-029 is the test-marker-strategy ADR (unit / smoke / integration /
13
+ network markers). The actual reproducibility-tier-lock ADR is ADR-034.
14
+ The cited id is off by 5 (029 vs. 034), with surrounding context "Two-tier
15
+ reproduction" clearly in the *reproducibility* category, not the
16
+ *test_markers* category. :func:`validate_citations` flags this case.
17
+
18
+ Design (per ADR 0001 contract-first; ADR 0002 metric-spec style for the
19
+ configurable categories):
20
+
21
+ - The validator is **pure**: pass in markdown text + ADR frontmatter +
22
+ a category-keyword map; get back a list of
23
+ :class:`CitationMisalignment` records. No filesystem I/O inside the
24
+ validator; the CLI wrapper handles globbing.
25
+ - Categories are **consumer-supplied**: this module ships no default
26
+ category map. Consumers wire their project's claim taxonomy
27
+ (reproducibility / cost / calibration / threshold / contamination /
28
+ test_markers / leakage / etc.) into the validator.
29
+
30
+ References
31
+ ----------
32
+ .. [1] Nygard, M. "Documenting Architecture Decisions." 2011.
33
+ https://cognitect.com/blog/2011/11/15/documenting-architecture-decisions
34
+ """
35
+
36
+ from __future__ import annotations
37
+
38
+ import re
39
+ from collections.abc import Sequence
40
+ from dataclasses import dataclass
41
+ from pathlib import Path
42
+ from typing import Final
43
+
44
+ # Default citation pattern: matches "per ADR-NNN", "via ADR-NNN", "by ADR-NNN",
45
+ # "under ADR-NNN" — case-insensitive on the citation phrase; ADR-NNN is
46
+ # 3-digit-zero-padded by Nygard convention.
47
+ DEFAULT_CITATION_PATTERN: Final[str] = r"(?i)(?:per|via|by|under)\s+ADR-(\d{3})"
48
+
49
+ # Sniff radius around a citation match for category-keyword matching.
50
+ # Locked at ±2 lines so the validator catches citations whose claim
51
+ # category is on the immediately-adjacent line (common in Markdown
52
+ # tables / bullet lists / wrapped prose).
53
+ DEFAULT_CONTEXT_LINES: Final[int] = 2
54
+
55
+
56
+ @dataclass(frozen=True)
57
+ class ADRSubject:
58
+ """Subject category of a single ADR.
59
+
60
+ Parameters
61
+ ----------
62
+ adr_id : str
63
+ 3-digit-zero-padded ADR id, e.g. ``"029"``.
64
+ title : str
65
+ ADR title (from frontmatter ``title:`` field).
66
+ slug : str
67
+ ADR slug (from frontmatter ``slug:`` field). Often informative
68
+ about the actual subject.
69
+ category : str | None
70
+ Claim-taxonomy category the ADR belongs to (e.g.
71
+ ``"test_markers"``, ``"reproducibility"``, ``"cost"``). ``None``
72
+ if no category matched the ADR's title/slug keywords (caller
73
+ decides whether to treat ``None`` as a finding or skip).
74
+ """
75
+
76
+ adr_id: str
77
+ title: str
78
+ slug: str
79
+ category: str | None
80
+
81
+
82
+ @dataclass(frozen=True)
83
+ class CitationMisalignment:
84
+ """A "per ADR-NNN" citation whose category doesn't match the cited ADR's subject.
85
+
86
+ Parameters
87
+ ----------
88
+ file : Path
89
+ Reader-facing markdown file the citation appears in.
90
+ line : int
91
+ 1-indexed line number of the citation.
92
+ cited_adr_id : str
93
+ 3-digit-zero-padded ADR id from the citation.
94
+ surrounding_text : str
95
+ ≤300 chars of context around the citation (for human review).
96
+ claim_category : str | None
97
+ Category inferred from the surrounding text (None if no
98
+ category keyword matched).
99
+ adr_actual_category : str | None
100
+ Category inferred from the cited ADR's title+slug (None if no
101
+ category keyword matched).
102
+ """
103
+
104
+ file: Path
105
+ line: int
106
+ cited_adr_id: str
107
+ surrounding_text: str
108
+ claim_category: str | None
109
+ adr_actual_category: str | None
110
+
111
+
112
+ def extract_adr_subject_category(
113
+ title: str,
114
+ slug: str,
115
+ category_keywords: dict[str, list[str]],
116
+ ) -> str | None:
117
+ """Infer an ADR's claim-taxonomy category from its title + slug.
118
+
119
+ Walks each ``(category, keywords)`` entry in ``category_keywords``
120
+ and returns the first category whose keywords appear in the
121
+ concatenated title+slug (case-insensitive).
122
+
123
+ Parameters
124
+ ----------
125
+ title : str
126
+ ADR title from frontmatter.
127
+ slug : str
128
+ ADR slug from frontmatter or filename.
129
+ category_keywords : dict[str, list[str]]
130
+ Map from category name to a list of keyword substrings. First
131
+ keyword match wins; categories are tested in dict-insertion
132
+ order, so the caller controls priority.
133
+
134
+ Returns
135
+ -------
136
+ str | None
137
+ Matching category name, or ``None`` if no keyword matched.
138
+
139
+ Examples
140
+ --------
141
+ >>> extract_adr_subject_category(
142
+ ... title="Reproducibility tier - full ladder T0 + T1 + T3",
143
+ ... slug="reproducibility-tier-full-ladder",
144
+ ... category_keywords={
145
+ ... "test_markers": ["marker", "smoke marker"],
146
+ ... "reproducibility": ["reproduc", "tier"],
147
+ ... },
148
+ ... )
149
+ 'reproducibility'
150
+ """
151
+ haystack = f"{title} {slug}".lower()
152
+ for category, keywords in category_keywords.items():
153
+ for keyword in keywords:
154
+ if keyword.lower() in haystack:
155
+ return category
156
+ return None
157
+
158
+
159
+ def _extract_context_text(
160
+ lines: list[str],
161
+ line_index: int,
162
+ context_lines: int,
163
+ ) -> str:
164
+ """Return ≤300-char snippet of context around `line_index` (1-indexed)."""
165
+ start = max(0, line_index - 1 - context_lines)
166
+ end = min(len(lines), line_index + context_lines)
167
+ return " ".join(line.strip() for line in lines[start:end])[:300]
168
+
169
+
170
+ def _infer_claim_category(
171
+ context: str,
172
+ category_keywords: dict[str, list[str]],
173
+ ) -> str | None:
174
+ """Same first-match-wins keyword check as ADR subject extraction, on prose context."""
175
+ haystack = context.lower()
176
+ for category, keywords in category_keywords.items():
177
+ for keyword in keywords:
178
+ if keyword.lower() in haystack:
179
+ return category
180
+ return None
181
+
182
+
183
+ def validate_citations(
184
+ *,
185
+ markdown_text: str,
186
+ markdown_path: Path,
187
+ adr_subjects: dict[str, ADRSubject],
188
+ category_keywords: dict[str, list[str]],
189
+ citation_pattern: str = DEFAULT_CITATION_PATTERN,
190
+ context_lines: int = DEFAULT_CONTEXT_LINES,
191
+ known_exempt_citations: Sequence[tuple[Path, int, str]] = (),
192
+ ) -> list[CitationMisalignment]:
193
+ """Find "per ADR-NNN" citations whose category doesn't match the cited ADR.
194
+
195
+ Parameters
196
+ ----------
197
+ markdown_text : str
198
+ Body of the reader-facing markdown file.
199
+ markdown_path : Path
200
+ Path of the markdown file (for misalignment.file annotation).
201
+ adr_subjects : dict[str, ADRSubject]
202
+ Map from 3-digit ADR id to :class:`ADRSubject` records. Caller
203
+ builds this by parsing each ADR's frontmatter; the
204
+ ``ADRSubject.category`` field is populated via
205
+ :func:`extract_adr_subject_category`.
206
+ category_keywords : dict[str, list[str]]
207
+ Map from category name to substring-keyword list (used both for
208
+ ADR subject inference and for surrounding-text category
209
+ inference). Same map MUST be used for both directions.
210
+ citation_pattern : str, optional
211
+ Regex finding the citation surface. Group 1 must capture the
212
+ 3-digit ADR id. Default :data:`DEFAULT_CITATION_PATTERN`
213
+ matches "per/via/by/under ADR-NNN".
214
+ context_lines : int, optional
215
+ Number of lines (±) around the citation to consider when
216
+ inferring the claim category. Default
217
+ :data:`DEFAULT_CONTEXT_LINES` (=2).
218
+ known_exempt_citations : Sequence of (Path, int, str), optional
219
+ ``(file, line, cited_adr_id)`` tuples to skip. Useful for
220
+ consumers with known historical drift that's been accepted by
221
+ policy (e.g., immutable ADR bodies with frozen-in errors that
222
+ a superseding ADR has already addressed).
223
+
224
+ Returns
225
+ -------
226
+ list[CitationMisalignment]
227
+ One :class:`CitationMisalignment` per misaligned citation.
228
+ Empty if no misalignments OR no citations matched the pattern.
229
+
230
+ Notes
231
+ -----
232
+ A citation with ``claim_category=None`` (no category keyword
233
+ matched the surrounding context) is **NOT** flagged as a
234
+ misalignment. The validator defers to the caller's category map:
235
+ if the caller's vocabulary doesn't cover the claim, there's no
236
+ basis for saying the citation is misaligned. To force every
237
+ citation to be flaggable, the caller should ensure their
238
+ ``category_keywords`` has broad coverage.
239
+
240
+ Examples
241
+ --------
242
+ >>> adr_subjects = {
243
+ ... "029": ADRSubject(
244
+ ... adr_id="029",
245
+ ... title="Test marker strategy",
246
+ ... slug="test-marker-strategy",
247
+ ... category="test_markers",
248
+ ... ),
249
+ ... }
250
+ >>> result = validate_citations(
251
+ ... markdown_text="Two-tier reproduction locked at Phase 0-07 via ADR-029.\\n",
252
+ ... markdown_path=Path("docs/REPRODUCIBILITY.md"),
253
+ ... adr_subjects=adr_subjects,
254
+ ... category_keywords={
255
+ ... "reproducibility": ["reproduc", "tier", "T0", "T1", "T3"],
256
+ ... "test_markers": ["marker"],
257
+ ... },
258
+ ... )
259
+ >>> len(result)
260
+ 1
261
+ >>> result[0].cited_adr_id
262
+ '029'
263
+ >>> result[0].claim_category
264
+ 'reproducibility'
265
+ >>> result[0].adr_actual_category
266
+ 'test_markers'
267
+ """
268
+ exempt_set = {(str(p), ln, adr) for (p, ln, adr) in known_exempt_citations}
269
+ misalignments: list[CitationMisalignment] = []
270
+ lines = markdown_text.splitlines()
271
+ citation_re = re.compile(citation_pattern)
272
+
273
+ for line_no, line in enumerate(lines, start=1):
274
+ for match in citation_re.finditer(line):
275
+ adr_id = match.group(1)
276
+ if (str(markdown_path), line_no, adr_id) in exempt_set:
277
+ continue
278
+ subject = adr_subjects.get(adr_id)
279
+ if subject is None:
280
+ # Citation references an unknown ADR. Out of scope for
281
+ # this validator (a different validator should check
282
+ # "does ADR-NNN exist"). Skip.
283
+ continue
284
+ context = _extract_context_text(lines, line_no, context_lines)
285
+ claim_category = _infer_claim_category(context, category_keywords)
286
+ if claim_category is None:
287
+ # No category basis for comparison; skip per the deferral above.
288
+ continue
289
+ if claim_category == subject.category:
290
+ continue
291
+ misalignments.append(
292
+ CitationMisalignment(
293
+ file=markdown_path,
294
+ line=line_no,
295
+ cited_adr_id=adr_id,
296
+ surrounding_text=context,
297
+ claim_category=claim_category,
298
+ adr_actual_category=subject.category,
299
+ )
300
+ )
301
+ return misalignments
@@ -792,6 +792,20 @@ def expected_calibration_error(
792
792
  empirical positive rate in the bin, and :math:`\\mathrm{conf}` is the
793
793
  mean predicted score.
794
794
 
795
+ **Uniform / uninformative scores** (F-metrics-3 v1.0.2 clarity pass):
796
+ when ``y_score`` is constant (e.g., ``[0.5] * n`` — an uninformative
797
+ detector), this function returns ``0.0`` whenever that constant equals the base rate of the true label
798
+ distribution. That's technically correct per the formula —
799
+ :math:`|\\mathrm{acc}(B_m) - \\mathrm{conf}(B_m)|` measures bin-level
800
+ calibration, and a single occupied bin with ``conf = base rate``
801
+ achieves perfect calibration locally. But it is semantically
802
+ misleading: an uninformative scorer looks "perfectly calibrated"
803
+ even though it has zero discriminative power. **Callers should
804
+ detect and filter uninformative inputs before passing to ECE** —
805
+ e.g., reject when ``np.unique(y_score).size == 1`` or when the
806
+ score variance is below a domain-specific threshold. Use
807
+ :func:`brier_score` or :func:`pr_auc` for resolution-aware metrics.
808
+
795
809
  References
796
810
  ----------
797
811
  .. [1] DeGroot, M. H. & Fienberg, S. E. "The comparison and evaluation of
@@ -1240,6 +1254,30 @@ def brier_score(
1240
1254
  -----
1241
1255
  .. math:: \mathrm{BS} = \frac{1}{n} \sum_i (p_i - y_i)^2
1242
1256
 
1257
+ **Input domain** (F-metrics-1 v1.0.2 clarity pass): ``y_true`` must
1258
+ be binary labels in ``{0, 1}`` (other label values raise
1259
+ ``ValueError``). ``y_score`` must be calibrated probabilities in
1260
+ ``[0, 1]`` — raw logits or unbounded ranking scores will pass the
1261
+ finiteness check but produce an out-of-range MSE that misrepresents
1262
+ calibration quality. If your scorer produces logits, apply
1263
+ sigmoid / softmax / a fitted calibrator (see
1264
+ :mod:`eval_toolkit.calibration`) before passing to ``brier_score``.
1265
+
1266
+ **Single-class behavior** (F-metrics-4 v1.0.2 clarity pass): unlike
1267
+ PR-AUC / ROC-AUC, ``brier_score`` is well-defined when ``y_true``
1268
+ is all-zeros or all-ones — it degenerates to the MSE around the
1269
+ constant class label. Specifically:
1270
+
1271
+ - All-zeros: :math:`\mathrm{BS} = \frac{1}{n} \sum_i p_i^2` —
1272
+ forecasting any positive probability incurs squared-error loss.
1273
+ - All-ones: :math:`\mathrm{BS} = \frac{1}{n} \sum_i (1 - p_i)^2`
1274
+ — forecasting low probability incurs squared-error loss.
1275
+
1276
+ This is the deliberate Brier-as-strictly-proper-scoring-rule behavior
1277
+ (Brier 1950). The ``empty_strategy`` parameter governs only empty
1278
+ (``n=0``) slices; non-empty
1279
+ single-class slices score normally.
1280
+
1243
1281
  See Also
1244
1282
  --------
1245
1283
  eval_toolkit.metrics.brier_decomposition :
@@ -1,5 +1,6 @@
1
1
  {
2
2
  "__all__": [
3
+ "ADRSubject",
3
4
  "ADVANCED_TECHNIQUES",
4
5
  "ALL_TECHNIQUES",
5
6
  "ANCHOR_RE",
@@ -11,6 +12,7 @@
11
12
  "CISafeThresholdSelector",
12
13
  "CORE_TECHNIQUES",
13
14
  "CaseInjection",
15
+ "CitationMisalignment",
14
16
  "ClaimReport",
15
17
  "ClaimSpec",
16
18
  "CorrectionMethod",
@@ -156,6 +158,7 @@
156
158
  "evaluate_folded",
157
159
  "evaluate_scorer_on_slice",
158
160
  "external_diagnostic_gate",
161
+ "extract_adr_subject_category",
159
162
  "fdr_bh_correct",
160
163
  "figure_metadata",
161
164
  "file_sha256",
@@ -235,6 +238,7 @@
235
238
  "stratified_recall",
236
239
  "strict_artifact_gate",
237
240
  "sweep",
241
+ "validate_citations",
238
242
  "validate_manifest",
239
243
  "validate_payload",
240
244
  "validate_prediction_artifact_ref",
@@ -248,6 +252,14 @@
248
252
  "write_run_result"
249
253
  ],
250
254
  "entries": {
255
+ "ADRSubject": {
256
+ "bases": [
257
+ "object"
258
+ ],
259
+ "doc_first_line": "Subject category of a single ADR.",
260
+ "kind": "class",
261
+ "signature": "(adr_id: 'str', title: 'str', slug: 'str', category: 'str | None') -> None"
262
+ },
251
263
  "ADVANCED_TECHNIQUES": {
252
264
  "doc_first_line": "Built-in immutable sequence.",
253
265
  "kind": "value",
@@ -329,6 +341,14 @@
329
341
  "kind": "class",
330
342
  "signature": "(ratio: 'float' = 0.5, seed: 'int' = 42, name: 'str' = 'case_random') -> None"
331
343
  },
344
+ "CitationMisalignment": {
345
+ "bases": [
346
+ "object"
347
+ ],
348
+ "doc_first_line": "A \"per ADR-NNN\" citation whose category doesn't match the cited ADR's subject.",
349
+ "kind": "class",
350
+ "signature": "(file: 'Path', line: 'int', cited_adr_id: 'str', surrounding_text: 'str', claim_category: 'str | None', adr_actual_category: 'str | None') -> None"
351
+ },
332
352
  "ClaimReport": {
333
353
  "bases": [
334
354
  "object"
@@ -1353,7 +1373,7 @@
1353
1373
  "doc_first_line": "str(object='') -> str",
1354
1374
  "kind": "value",
1355
1375
  "type": "str",
1356
- "value": "'0.51.0'"
1376
+ "value": "'1.0.2'"
1357
1377
  },
1358
1378
  "apply_operating_points": {
1359
1379
  "doc_first_line": "Apply fitted thresholds to a mixed-class or single-class target slice.",
@@ -1480,6 +1500,11 @@
1480
1500
  "kind": "function",
1481
1501
  "signature": "(path: 'str', *, op: \"Literal['<', '<=', '>', '>=', '=='] | None\" = None, threshold: 'float | None' = None, severity: 'GateSeverity' = 'error') -> 'EvidenceGate'"
1482
1502
  },
1503
+ "extract_adr_subject_category": {
1504
+ "doc_first_line": "Infer an ADR's claim-taxonomy category from its title + slug.",
1505
+ "kind": "function",
1506
+ "signature": "(title: 'str', slug: 'str', category_keywords: 'dict[str, list[str]]') -> 'str | None'"
1507
+ },
1483
1508
  "fdr_bh_correct": {
1484
1509
  "doc_first_line": "Benjamini-Hochberg false-discovery-rate correction.",
1485
1510
  "kind": "function",
@@ -1875,6 +1900,11 @@
1875
1900
  "kind": "function",
1876
1901
  "signature": "(strategies: 'Sequence[TextTransform]', texts: 'Sequence[str]', *, scorer: 'Scorer | None' = None, attack_threshold: 'float | None' = None) -> 'pd.DataFrame'"
1877
1902
  },
1903
+ "validate_citations": {
1904
+ "doc_first_line": "Find \"per ADR-NNN\" citations whose category doesn't match the cited ADR.",
1905
+ "kind": "function",
1906
+ "signature": "(*, markdown_text: 'str', markdown_path: 'Path', adr_subjects: 'dict[str, ADRSubject]', category_keywords: 'dict[str, list[str]]', citation_pattern: 'str' = '(?i)(?:per|via|by|under)\\\\s+ADR-(\\\\d{3})', context_lines: 'int' = 2, known_exempt_citations: 'Sequence[tuple[Path, int, str]]' = ()) -> 'list[CitationMisalignment]'"
1907
+ },
1878
1908
  "validate_manifest": {
1879
1909
  "doc_first_line": "Validate a serialized ``RunManifest`` payload.",
1880
1910
  "kind": "function",