eval-toolkit 1.0.2__tar.gz → 1.0.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (187)
  1. {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/CHANGELOG.md +37 -0
  2. {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/PKG-INFO +1 -1
  3. {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/src/eval_toolkit/__init__.py +7 -0
  4. {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/src/eval_toolkit/_version.py +1 -1
  5. eval_toolkit-1.0.3/src/eval_toolkit/audit_value_bindings.py +448 -0
  6. {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/golden/public_api/snapshot.json +34 -1
  7. eval_toolkit-1.0.3/tests/test_audit_value_bindings.py +338 -0
  8. {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/.gitignore +0 -0
  9. {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/LICENSE +0 -0
  10. {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/README.md +0 -0
  11. {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/STYLE.md +0 -0
  12. {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/docs/archive/README.md +0 -0
  13. {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/docs/research/README.md +0 -0
  14. {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/docs/research/datasets/README.md +0 -0
  15. {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/docs/research/papers/data-integrity/README.md +0 -0
  16. {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/docs/research/papers/eval-ecosystem/README.md +0 -0
  17. {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/docs/research/papers/inference/README.md +0 -0
  18. {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/docs/research/papers/prompt-injection/README.md +0 -0
  19. {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/docs/source/adr/README.md +0 -0
  20. {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/docs/source/methodology/README.md +0 -0
  21. {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/pyproject.toml +0 -0
  22. {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/src/eval_toolkit/__main__.py +0 -0
  23. {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/src/eval_toolkit/_deprecated.py +0 -0
  24. {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/src/eval_toolkit/_parallel.py +0 -0
  25. {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/src/eval_toolkit/_rng.py +0 -0
  26. {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/src/eval_toolkit/_sweep.py +0 -0
  27. {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/src/eval_toolkit/adversarial.py +0 -0
  28. {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/src/eval_toolkit/analysis.py +0 -0
  29. {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/src/eval_toolkit/artifacts.py +0 -0
  30. {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/src/eval_toolkit/audit_citation_alignment.py +0 -0
  31. {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/src/eval_toolkit/bootstrap.py +0 -0
  32. {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/src/eval_toolkit/calibration.py +0 -0
  33. {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/src/eval_toolkit/claims.py +0 -0
  34. {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/src/eval_toolkit/config.py +0 -0
  35. {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/src/eval_toolkit/docs.py +0 -0
  36. {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/src/eval_toolkit/embeddings.py +0 -0
  37. {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/src/eval_toolkit/evidence.py +0 -0
  38. {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/src/eval_toolkit/harness.py +0 -0
  39. {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/src/eval_toolkit/leakage.py +0 -0
  40. {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/src/eval_toolkit/loaders.py +0 -0
  41. {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/src/eval_toolkit/losses.py +0 -0
  42. {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/src/eval_toolkit/manifest.py +0 -0
  43. {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/src/eval_toolkit/metric_specs.py +0 -0
  44. {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/src/eval_toolkit/metrics.py +0 -0
  45. {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/src/eval_toolkit/operating_points.py +0 -0
  46. {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/src/eval_toolkit/paths.py +0 -0
  47. {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/src/eval_toolkit/plotting.py +0 -0
  48. {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/src/eval_toolkit/preprocessing.py +0 -0
  49. {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/src/eval_toolkit/probes.py +0 -0
  50. {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/src/eval_toolkit/protocols.py +0 -0
  51. {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/src/eval_toolkit/provenance.py +0 -0
  52. {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/src/eval_toolkit/py.typed +0 -0
  53. {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/src/eval_toolkit/schemas/manifest.v1.json +0 -0
  54. {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/src/eval_toolkit/schemas/manifest.v2.json +0 -0
  55. {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/src/eval_toolkit/schemas/manifest.v3.json +0 -0
  56. {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/src/eval_toolkit/schemas/ood_manifest.v1.json +0 -0
  57. {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/src/eval_toolkit/schemas/results.v1.json +0 -0
  58. {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/src/eval_toolkit/schemas/results_full.v1.json +0 -0
  59. {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/src/eval_toolkit/scorecards.py +0 -0
  60. {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/src/eval_toolkit/seeds.py +0 -0
  61. {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/src/eval_toolkit/splits.py +0 -0
  62. {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/src/eval_toolkit/stacking.py +0 -0
  63. {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/src/eval_toolkit/text_dedup.py +0 -0
  64. {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/src/eval_toolkit/thresholds.py +0 -0
  65. {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/baseline/test_plotting_visual/plot_bootstrap_distribution.png +0 -0
  66. {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/baseline/test_plotting_visual/plot_confusion_matrix_grid.png +0 -0
  67. {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/baseline/test_plotting_visual/plot_lift_ci.png +0 -0
  68. {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/baseline/test_plotting_visual/plot_metric_bars.png +0 -0
  69. {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/baseline/test_plotting_visual/plot_pareto_frontier.png +0 -0
  70. {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/baseline/test_plotting_visual/plot_pr_curve.png +0 -0
  71. {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/baseline/test_plotting_visual/plot_reliability_diagram.png +0 -0
  72. {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/baseline/test_plotting_visual/plot_roc_curve.png +0 -0
  73. {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/baseline/test_plotting_visual/plot_score_histograms.png +0 -0
  74. {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/baseline/test_plotting_visual/plot_slice_metric_heatmap.png +0 -0
  75. {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/benchmarks/__init__.py +0 -0
  76. {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/benchmarks/test_kernel_benchmarks.py +0 -0
  77. {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/conftest.py +0 -0
  78. {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/golden/bootstrap_ci/cases.json +0 -0
  79. {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/golden/data/dedup_holdout.jsonl +0 -0
  80. {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/golden/data/dedup_holdout_expected.json +0 -0
  81. {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/golden/data/dedup_holdout_provenance.md +0 -0
  82. {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/golden/docs/expected.md +0 -0
  83. {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/golden/docs/input.md +0 -0
  84. {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/golden/docs/metrics.json +0 -0
  85. {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/golden/test_dedup_holdout_calibration.py +0 -0
  86. {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/strategies.py +0 -0
  87. {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_adversarial.py +0 -0
  88. {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_analysis.py +0 -0
  89. {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_artifacts.py +0 -0
  90. {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_audit_citation_alignment.py +0 -0
  91. {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_block_bootstrap_on_folds.py +0 -0
  92. {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_bootstrap_calibration_mc.py +0 -0
  93. {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_bootstrap_edge_cases.py +0 -0
  94. {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_bootstrap_golden.py +0 -0
  95. {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_bootstrap_njobs.py +0 -0
  96. {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_bootstrap_props.py +0 -0
  97. {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_bootstrap_research_grounded.py +0 -0
  98. {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_bootstrap_unit.py +0 -0
  99. {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_calibration_binary_adapters.py +0 -0
  100. {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_calibration_bootstrap_chain.py +0 -0
  101. {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_calibration_determinism.py +0 -0
  102. {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_calibration_optimization_failures.py +0 -0
  103. {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_calibration_props.py +0 -0
  104. {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_calibration_research_grounded.py +0 -0
  105. {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_calibration_unit.py +0 -0
  106. {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_claims.py +0 -0
  107. {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_claims_coverage.py +0 -0
  108. {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_claims_props.py +0 -0
  109. {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_cli.py +0 -0
  110. {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_config.py +0 -0
  111. {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_coverage_bootstrap.py +0 -0
  112. {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_coverage_calibration.py +0 -0
  113. {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_coverage_harness.py +0 -0
  114. {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_coverage_metrics.py +0 -0
  115. {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_coverage_plotting.py +0 -0
  116. {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_croissant_e2e.py +0 -0
  117. {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_dedup_split_leakage_chain.py +0 -0
  118. {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_deprecated_scalars_shim.py +0 -0
  119. {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_deprecations.py +0 -0
  120. {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_docs_golden.py +0 -0
  121. {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_docs_props.py +0 -0
  122. {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_embeddings.py +0 -0
  123. {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_evidence_validators.py +0 -0
  124. {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_harness_edge_cases.py +0 -0
  125. {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_harness_fault_injection.py +0 -0
  126. {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_harness_folded.py +0 -0
  127. {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_harness_internals.py +0 -0
  128. {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_harness_metric_options.py +0 -0
  129. {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_harness_parallelism.py +0 -0
  130. {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_harness_smoke.py +0 -0
  131. {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_import_boundaries.py +0 -0
  132. {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_is_metric_defined_for_slice.py +0 -0
  133. {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_lazy_extras_messages.py +0 -0
  134. {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_leakage.py +0 -0
  135. {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_leakage_error_paths.py +0 -0
  136. {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_leakage_props.py +0 -0
  137. {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_loaders.py +0 -0
  138. {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_loaders_coverage.py +0 -0
  139. {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_loaders_props.py +0 -0
  140. {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_logging.py +0 -0
  141. {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_losses.py +0 -0
  142. {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_manifest.py +0 -0
  143. {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_manifest_contamination_round_trip.py +0 -0
  144. {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_manifest_props.py +0 -0
  145. {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_manifest_validation.py +0 -0
  146. {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_metrics_props.py +0 -0
  147. {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_metrics_stratified_subsets.py +0 -0
  148. {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_metrics_unit.py +0 -0
  149. {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_misc_coverage.py +0 -0
  150. {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_numeric_edge_cases.py +0 -0
  151. {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_ood_loader.py +0 -0
  152. {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_operating_points.py +0 -0
  153. {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_operating_points_props.py +0 -0
  154. {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_parallel.py +0 -0
  155. {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_paths.py +0 -0
  156. {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_pipeline_e2e.py +0 -0
  157. {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_plotting_edge.py +0 -0
  158. {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_plotting_smoke.py +0 -0
  159. {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_plotting_visual.py +0 -0
  160. {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_preprocessing.py +0 -0
  161. {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_probes.py +0 -0
  162. {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_protocol_conformance.py +0 -0
  163. {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_provenance.py +0 -0
  164. {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_public_api.py +0 -0
  165. {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_recall_at_fpr.py +0 -0
  166. {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_reference_equivalence.py +0 -0
  167. {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_reproducibility_integration.py +0 -0
  168. {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_rng.py +0 -0
  169. {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_schemas.py +0 -0
  170. {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_scorecard.py +0 -0
  171. {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_seeds.py +0 -0
  172. {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_splits.py +0 -0
  173. {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_splits_leakage_integration.py +0 -0
  174. {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_splits_props.py +0 -0
  175. {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_stacking.py +0 -0
  176. {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_sweep.py +0 -0
  177. {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_text_dedup.py +0 -0
  178. {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_text_dedup_coverage.py +0 -0
  179. {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_text_dedup_props.py +0 -0
  180. {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_text_dedup_strategies.py +0 -0
  181. {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_thresholds.py +0 -0
  182. {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_thresholds_constant_score.py +0 -0
  183. {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_thresholds_coverage.py +0 -0
  184. {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_thresholds_props.py +0 -0
  185. {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_thresholds_research_grounded.py +0 -0
  186. {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_tokenization_leakage_check.py +0 -0
  187. {eval_toolkit-1.0.2 → eval_toolkit-1.0.3}/tests/test_v09_contracts.py +0 -0
@@ -5,6 +5,43 @@ All notable changes to this project will be documented in this file.
5
5
  The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
6
6
  and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
7
7
 
8
+ ## [1.0.3] — 2026-05-26 — `audit_value_bindings` module (closes #71)
9
+
10
+ Tier-2 ADDITIVE — second member of the audit-validator family
11
+ following `audit_citation_alignment` (v1.0.1). Flat-module per
12
+ [ADR 0001](docs/source/adr/0001-flat-module-layout.md).
13
+
14
+ ### Added
15
+
16
+ - **`audit_value_bindings` module** exporting
17
+ `validate_reader_value_bindings()` + `Match` + `Violation` +
18
+ `ValueBindingsReport` as Tier 1 STRICT (per
19
+ [ADR 0003](docs/source/adr/0003-stability-contract-and-gate3-methodology.md)).
20
+ Catches the bug class where a markdown surface pairs a detector name
21
+ with the **wrong** canonical value — both values exist in the
22
+ source-of-truth table but the binding is misordered. Motivated by
23
+ the consumer V1.3.1 ADR-080 audit-fix patch closure (2026-05-22)
24
+ where `WRITEUP_NARRATIVE.md:38` said "TF-IDF + logistic regression
25
+ baseline reaches 0.974 AUPRC" but canonical TF-IDF direct val AUPRC
26
+ is 0.971 (0.974 was LoRA's value). The existing `audit_numbers.py`
27
+ validates VALUES against source data but not BINDINGS — this
28
+ validator closes that gap.
29
+ - Cross-detector disambiguation: when multiple detectors and values
30
+ appear in the same paragraph (e.g., "TF-IDF achieves 0.971, while
31
+ LoRA reaches 0.974"), each value pairs with the LAST detector
32
+ appearing before it in text order (falling back to first detector
33
+ after if no before-detector is in range). Avoids false-positive
34
+ bindings across closely-spaced detector mentions.
35
+ - Coverage metric: `ValueBindingsReport.coverage` reports the fraction
36
+ of `(detector, metric)` keys in the canonical `bindings` dict that
37
+ produced at least one `Match` — useful for detecting stale or
38
+ unreferenced bindings in reader prose.
39
+ - 13 tests at `tests/test_audit_value_bindings.py` including the
40
+ verbatim WRITEUP_NARRATIVE seed-case regression, alias resolution
41
+ (detector + metric), distance-window edge, value-without-metric
42
+ skip, coverage fraction, tolerance band, multi-detector
43
+ disambiguation, frozen-dataclass invariants. Closes #71.
44
+
8
45
  ## [1.0.2] — 2026-05-26 — #76 cleanup batch closes (RC2 + RC3 + F-metrics-1/3/4)
9
46
 
10
47
  Closes the GH #76 v1.0.1 cleanup tracker. All 6 items shipped across
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: eval-toolkit
3
- Version: 1.0.2
3
+ Version: 1.0.3
4
4
  Summary: Reusable evaluation contracts for binary classification: metrics, bootstrap CIs, calibration, artifacts, and evidence gates.
5
5
  Project-URL: Homepage, https://github.com/brandon-behring/eval-toolkit
6
6
  Project-URL: Documentation, https://brandon-behring.github.io/eval-toolkit/
@@ -60,6 +60,13 @@ _EXPORTS: dict[str, str] = {
60
60
  "CitationMisalignment": "eval_toolkit.audit_citation_alignment",
61
61
  "extract_adr_subject_category": "eval_toolkit.audit_citation_alignment",
62
62
  "validate_citations": "eval_toolkit.audit_citation_alignment",
63
+ # --- audit_value_bindings ---
64
+ # Flat-module per ADR 0001. Closes #71. Motivated by consumer V1.3.1
65
+ # ADR-080 audit-fix finding (TF-IDF / LoRA 0.974 value mis-binding).
66
+ "Match": "eval_toolkit.audit_value_bindings",
67
+ "ValueBindingsReport": "eval_toolkit.audit_value_bindings",
68
+ "Violation": "eval_toolkit.audit_value_bindings",
69
+ "validate_reader_value_bindings": "eval_toolkit.audit_value_bindings",
63
70
  # --- losses ---
64
71
  "RecallAtLowFPR": "eval_toolkit.losses",
65
72
  # --- preprocessing ---
@@ -2,4 +2,4 @@
2
2
 
3
3
  __all__ = ["__version__"]
4
4
 
5
- __version__ = "1.0.2"
5
+ __version__ = "1.0.3"
@@ -0,0 +1,448 @@
1
+ r"""Reader-prose value-binding validator.
2
+
3
+ Catches the bug class where a reader-facing markdown surface pairs a
4
+ detector name with the **wrong** canonical value — both values are
5
+ present in the source-of-truth table, but the binding is misordered.
6
+
7
+ Motivating test case (from `prompt-injection-detection-prototype`
8
+ v1.3.1 audit-fix, ADR-080 patch closure 2026-05-22)::
9
+
10
+ WRITEUP_NARRATIVE.md:38:
11
+ "The TF-IDF + logistic regression baseline reaches 0.974 AUPRC
12
+ on balanced direct-versus-benign validation."
13
+
14
+ Canonical: TF-IDF direct val AUPRC = 0.971; LoRA direct val AUPRC =
15
+ 0.974. Both values exist in the bindings table; the bug is the wrong
16
+ (detector, value) pairing. The pre-existing ``audit_numbers.py``-style
17
+ primitive validates VALUES against source data; this validator
18
+ validates BINDINGS — that each prose-mentioned (detector_token,
19
+ metric_token, value) triple matches the canonical binding.
20
+
21
+ Design (per ADR 0001 flat-module + ADR 0002 closed-config + ADR 0003
22
+ Tier 1 STRICT public-API contract):
23
+
24
+ - Consumer supplies the canonical-binding table + value/metric/detector
25
+ regex patterns; validator handles position-aware regex scan + binding
26
+ lookup + report assembly.
27
+ - Flat-module: `eval_toolkit.audit_value_bindings.*` (NOT a subpackage
28
+ per ADR 0001 stay-flat-through-v1.x).
29
+ - All Tier-1 STRICT public symbols (`validate_reader_value_bindings`,
30
+ `Match`, `Violation`, `ValueBindingsReport`) re-exported at top level
31
+ via `_EXPORTS` lazy resolver.
32
+
33
+ Closes upstream issue #71. v1.0.3.
34
+ """
35
+
36
+ from __future__ import annotations
37
+
38
+ import re
39
+ from collections.abc import Mapping, Sequence
40
+ from dataclasses import dataclass
41
+ from pathlib import Path
42
+ from types import MappingProxyType
43
+
44
+ __all__ = [
45
+ "Match",
46
+ "ValueBindingsReport",
47
+ "Violation",
48
+ "validate_reader_value_bindings",
49
+ ]
50
+
51
+
52
+ DEFAULT_VALUE_PATTERN: str = r"\d+\.\d{2,4}"
53
+ DEFAULT_MAX_DISTANCE_CHARS: int = 80
54
+ DEFAULT_TOLERANCE: float = 1e-4
55
+
56
+
57
+ @dataclass(frozen=True)
58
+ class Match:
59
+ """A reader-prose (detector, metric, value) triple that matches the canonical binding.
60
+
61
+ Attributes
62
+ ----------
63
+ file : Path
64
+ File where the match was found.
65
+ line : int
66
+ 1-indexed line number of the value occurrence.
67
+ detector : str
68
+ Canonical detector key from the ``bindings`` dict (NOT the
69
+ regex-matched surface form).
70
+ metric : str
71
+ Canonical metric key from the ``bindings`` dict.
72
+ value : float
73
+ The numeric value found in the prose.
74
+ """
75
+
76
+ file: Path
77
+ line: int
78
+ detector: str
79
+ metric: str
80
+ value: float
81
+
82
+
83
+ @dataclass(frozen=True)
84
+ class Violation:
85
+ """A reader-prose (detector, metric, value) triple where the value disagrees with the canonical binding.
86
+
87
+ Attributes
88
+ ----------
89
+ file : Path
90
+ File where the violation was found.
91
+ line : int
92
+ 1-indexed line number of the offending value occurrence.
93
+ detector : str
94
+ Canonical detector key from the ``bindings`` dict (NOT the
95
+ regex-matched surface form).
96
+ metric : str
97
+ Canonical metric key from the ``bindings`` dict.
98
+ found_value : float
99
+ The numeric value the prose claims.
100
+ expected_value : float
101
+ The canonical value from the ``bindings`` dict.
102
+ surrounding_text : str
103
+ Excerpt centered on the value (configurable window) for
104
+ diagnostic display.
105
+ """
106
+
107
+ file: Path
108
+ line: int
109
+ detector: str
110
+ metric: str
111
+ found_value: float
112
+ expected_value: float
113
+ surrounding_text: str
114
+
115
+
116
+ @dataclass(frozen=True)
117
+ class ValueBindingsReport:
118
+ """Result of :func:`validate_reader_value_bindings`.
119
+
120
+ Attributes
121
+ ----------
122
+ violations : tuple[Violation, ...]
123
+ Each detected (detector, metric) → wrong-value triple. Empty
124
+ tuple if all reader-prose bindings match the canonical table.
125
+ matched : tuple[Match, ...]
126
+ Each detected (detector, metric, value) triple that matched
127
+ the canonical binding. Useful for coverage analysis +
128
+ regression-testing that the validator's regexes still fire.
129
+ coverage : float
130
+ Fraction of ``(detector, metric)`` keys in the ``bindings``
131
+ dict that produced at least one :class:`Match`. Range
132
+ ``[0.0, 1.0]``. ``1.0`` means every binding was referenced in
133
+ the scanned prose; lower values flag potentially un-cited
134
+ bindings (which may be expected OR may indicate stale prose).
135
+ """
136
+
137
+ violations: tuple[Violation, ...]
138
+ matched: tuple[Match, ...]
139
+ coverage: float
140
+
141
+
142
+ def validate_reader_value_bindings(
143
+ *,
144
+ files: Sequence[Path | str],
145
+ bindings: Mapping[tuple[str, str], float],
146
+ value_pattern: str = DEFAULT_VALUE_PATTERN,
147
+ max_distance_chars: int = DEFAULT_MAX_DISTANCE_CHARS,
148
+ metric_aliases: Mapping[str, Sequence[str]] = MappingProxyType({}),
149
+ detector_aliases: Mapping[str, Sequence[str]] = MappingProxyType({}),
150
+ tolerance: float = DEFAULT_TOLERANCE,
151
+ ) -> ValueBindingsReport:
152
+ """Validate (detector, metric, value) bindings in reader-prose markdown.
153
+
154
+ For each ``(detector_token, metric_token) -> expected_value`` entry
155
+ in ``bindings``, scan each file for triples of (detector mention,
156
+ metric mention, numeric value) within a ``max_distance_chars``
157
+ window. Compare the found value to the expected value; emit a
158
+ :class:`Violation` on mismatch, a :class:`Match` on agreement.
159
+
160
+ Both the detector and the metric must appear within the window
161
+ surrounding a candidate value for the triple to be considered —
162
+ a value that has only a detector or only a metric nearby is
163
+ ignored (those belong to a value-existence audit, not a binding
164
+ audit).
165
+
166
+ Parameters
167
+ ----------
168
+ files : Sequence[Path | str]
169
+ Markdown files to scan. UTF-8 encoded.
170
+ bindings : Mapping[tuple[str, str], float]
171
+ Canonical (detector_name, metric_name) → expected_value table.
172
+ Keys are the canonical *identifiers* used in the report — the
173
+ regex patterns that match these in prose come from the
174
+ ``*_aliases`` dicts (with the canonical name as a default
175
+ fallback pattern).
176
+ value_pattern : str, optional
177
+ Regex matching numeric values in prose. Default matches
178
+ ``\\d+\\.\\d{2,4}`` (1+ integer part, 2-4 decimals).
179
+ max_distance_chars : int, optional
180
+ Maximum character distance allowed between a detector mention,
181
+ a metric mention, and a numeric value for them to be treated
182
+ as a triple. Default 80.
183
+ metric_aliases : Mapping[str, Sequence[str]], optional
184
+ ``metric_name -> [regex_alternatives, ...]``. Each canonical
185
+ metric name in ``bindings`` may have multiple natural-language
186
+ forms (e.g., ``"direct_val_auprc"`` matches both ``"direct .*?
187
+ AUPRC"`` and ``"validation AUPRC"``). Missing keys default to
188
+ the canonical name itself, escaped.
189
+ detector_aliases : Mapping[str, Sequence[str]], optional
190
+ Same shape as ``metric_aliases``, applied case-insensitively.
191
+ Useful for ``"tf-idf + lr"`` → ``["TF-IDF", "TfIdf", "tfidf"]``.
192
+ tolerance : float, optional
193
+ Absolute tolerance for float comparison. Default ``1e-4``
194
+ (e.g., a prose ``0.9740`` matches a canonical ``0.97403``).
195
+
196
+ Returns
197
+ -------
198
+ ValueBindingsReport
199
+ ``violations``, ``matched``, ``coverage`` per the dataclass.
200
+
201
+ Examples
202
+ --------
203
+ >>> from pathlib import Path
204
+ >>> import tempfile
205
+ >>> import textwrap
206
+ >>> with tempfile.NamedTemporaryFile(suffix=".md", mode="w", delete=False) as f:
207
+ ... _ = f.write("TF-IDF + LR reaches 0.974 AUPRC on direct val.\\n")
208
+ ... path = Path(f.name)
209
+ >>> report = validate_reader_value_bindings(
210
+ ... files=[path],
211
+ ... bindings={("tf-idf + lr", "direct_val_auprc"): 0.971},
212
+ ... detector_aliases={"tf-idf + lr": ["TF-IDF"]},
213
+ ... metric_aliases={"direct_val_auprc": ["direct val"]},
214
+ ... )
215
+ >>> len(report.violations)
216
+ 1
217
+ >>> report.violations[0].found_value
218
+ 0.974
219
+ >>> report.violations[0].expected_value
220
+ 0.971
221
+
222
+ Notes
223
+ -----
224
+ The validator is **pure**: consumer-side scripts glob markdown
225
+ files and parse canonical-binding tables (e.g., from a JSON
226
+ results file); this function does the regex + window + comparison
227
+ work and returns a structured report.
228
+
229
+ Multiple candidate values within the same detector+metric window
230
+ each produce their own Match / Violation entry. Coverage counts
231
+ a (detector, metric) key as covered iff at least one Match was
232
+ emitted for it (Violations don't count toward coverage — a
233
+ misbound mention proves the binding was REACHED but disproves
234
+ it was correct; the report makes both signals available).
235
+
236
+ Case-sensitivity: detector and metric regexes are applied with
237
+ ``re.IGNORECASE``. The canonical names in ``bindings`` are used
238
+ verbatim in report keys regardless of how they were matched in
239
+ prose.
240
+
241
+ See Also
242
+ --------
243
+ eval_toolkit.audit_citation_alignment.validate_citations :
244
+ Sibling validator catching ADR-citation alignment drift.
245
+ """
246
+ files_resolved = tuple(Path(f) for f in files)
247
+
248
+ bindings_dict = dict(bindings)
249
+ metric_aliases_dict = dict(metric_aliases)
250
+ detector_aliases_dict = dict(detector_aliases)
251
+
252
+ detector_keys = sorted({d for d, _ in bindings_dict})
253
+ metric_keys = sorted({m for _, m in bindings_dict})
254
+
255
+ detector_patterns: dict[str, re.Pattern[str]] = {
256
+ d: _build_pattern(d, detector_aliases_dict.get(d, ()), case_insensitive=True)
257
+ for d in detector_keys
258
+ }
259
+ metric_patterns: dict[str, re.Pattern[str]] = {
260
+ m: _build_pattern(m, metric_aliases_dict.get(m, ()), case_insensitive=True)
261
+ for m in metric_keys
262
+ }
263
+ value_re = re.compile(value_pattern)
264
+
265
+ violations: list[Violation] = []
266
+ matched: list[Match] = []
267
+ matched_keys: set[tuple[str, str]] = set()
268
+
269
+ for file_path in files_resolved:
270
+ text = file_path.read_text(encoding="utf-8")
271
+ line_starts = _line_starts(text)
272
+
273
+ # Pre-collect ALL detector positions (across every canonical
274
+ # detector key) so each value pairs with ONE detector, chosen by
275
+ # text order (last before, else first after). Avoids cross-detector contamination — e.g.,
276
+ # "TF-IDF achieves 0.971, while LoRA reaches 0.974" should
277
+ # pair 0.971 with TF-IDF and 0.974 with LoRA, NOT pair the
278
+ # 0.974 with TF-IDF's binding just because they happen to be
279
+ # within max_distance_chars of each other.
280
+ detector_positions: list[tuple[int, str]] = [] # (position, canonical_key)
281
+ for det_key, det_re in detector_patterns.items():
282
+ for det_match in det_re.finditer(text):
283
+ detector_positions.append((det_match.start(), det_key))
284
+ detector_positions.sort()
285
+
286
+ # For each binding, look in each file for triples.
287
+ for (det_key, met_key), expected in bindings_dict.items():
288
+ det_re = detector_patterns[det_key]
289
+ met_re = metric_patterns[met_key]
290
+
291
+ for det_match in det_re.finditer(text):
292
+ window_start = max(0, det_match.start() - max_distance_chars)
293
+ window_end = min(len(text), det_match.end() + max_distance_chars)
294
+ window_text = text[window_start:window_end]
295
+ window_offset = window_start
296
+
297
+ # Both metric and a value must appear in the window.
298
+ met_hits = list(met_re.finditer(window_text))
299
+ if not met_hits:
300
+ continue
301
+
302
+ for val_match in value_re.finditer(window_text):
303
+ # Skip values immediately adjacent to digits (avoid
304
+ # picking up e.g., "0.974" inside "10.974" or version
305
+ # strings like "1.0.974"). Simple heuristic: the
306
+ # character before the match (if any) must not be a
307
+ # digit or dot.
308
+ val_start_in_full = window_offset + val_match.start()
309
+ if val_start_in_full > 0:
310
+ prev_char = text[val_start_in_full - 1]
311
+ if prev_char.isdigit() or prev_char == ".":
312
+ continue
313
+
314
+ val_str = val_match.group(0)
315
+ try:
316
+ found = float(val_str)
317
+ except ValueError: # pragma: no cover
318
+ continue
319
+
320
+ # Cross-detector disambiguation: require the current
321
+ # det_key to be the detector paired with this value
322
+ # by the text-order rule (last detector before; else
323
+ # first detector after). Avoids cross-contamination
324
+ # on multi-detector prose like "TF-IDF achieves
325
+ # 0.971, while LoRA reaches 0.974".
326
+ paired_key = _nearest_detector_key(
327
+ detector_positions, val_start_in_full, max_distance_chars
328
+ )
329
+ if paired_key != det_key:
330
+ continue
331
+
332
+ # Require the metric mention be within distance of the value too,
333
+ # not just within the detector window.
334
+ met_close = any(
335
+ abs(mh.start() - val_match.start()) <= max_distance_chars for mh in met_hits
336
+ )
337
+ if not met_close:
338
+ continue
339
+
340
+ line_no = _position_to_line(line_starts, val_start_in_full)
341
+ if abs(found - expected) <= tolerance:
342
+ matched.append(
343
+ Match(
344
+ file=file_path,
345
+ line=line_no,
346
+ detector=det_key,
347
+ metric=met_key,
348
+ value=found,
349
+ )
350
+ )
351
+ matched_keys.add((det_key, met_key))
352
+ else:
353
+ # Widen the surrounding context for diagnostic
354
+ # clarity. Center on the value but include
355
+ # ±60 chars to typically capture the detector
356
+ # mention.
357
+ ctx_start = max(0, val_start_in_full - 60)
358
+ ctx_end = min(len(text), val_start_in_full + len(val_str) + 60)
359
+ surrounding = text[ctx_start:ctx_end].replace("\n", " ").strip()
360
+ violations.append(
361
+ Violation(
362
+ file=file_path,
363
+ line=line_no,
364
+ detector=det_key,
365
+ metric=met_key,
366
+ found_value=found,
367
+ expected_value=expected,
368
+ surrounding_text=surrounding,
369
+ )
370
+ )
371
+
372
+ coverage = len(matched_keys) / len(bindings_dict) if bindings_dict else 0.0
373
+ return ValueBindingsReport(
374
+ violations=tuple(violations),
375
+ matched=tuple(matched),
376
+ coverage=coverage,
377
+ )
378
+
379
+
380
+ def _build_pattern(
381
+ canonical: str,
382
+ aliases: Sequence[str],
383
+ *,
384
+ case_insensitive: bool,
385
+ ) -> re.Pattern[str]:
386
+ """Build an OR-joined regex covering canonical name + aliases."""
387
+ parts = [re.escape(canonical), *aliases]
388
+ pattern = "|".join(f"(?:{p})" for p in parts)
389
+ flags = re.IGNORECASE if case_insensitive else 0
390
+ return re.compile(pattern, flags)
391
+
392
+
393
+ def _line_starts(text: str) -> list[int]:
394
+ """Return character positions where each line starts. line[i] starts at line_starts[i]."""
395
+ starts = [0]
396
+ for i, ch in enumerate(text):
397
+ if ch == "\n":
398
+ starts.append(i + 1)
399
+ return starts
400
+
401
+
402
+ def _nearest_detector_key(
403
+ detector_positions: Sequence[tuple[int, str]],
404
+ value_pos: int,
405
+ max_distance: int,
406
+ ) -> str | None:
407
+ """Return the canonical detector key paired with ``value_pos``, or None.
408
+
409
+ Pairing rule: pick the LAST detector that appears BEFORE the value
410
+ (text-order); if none is within ``max_distance``, fall back to the
411
+ FIRST detector that appears AFTER the value within the same range.
412
+ This matches natural English prose patterns "<detector> ...
413
+ <value>" (predominant) and "<value> ... by <detector>" (rare).
414
+
415
+ The previous "absolute-distance nearest" heuristic produced false
416
+ positives on prose like "TF-IDF achieves 0.971, while LoRA reaches
417
+ 0.974" where 0.971 is closer to LoRA in raw distance even though
418
+ it semantically belongs to TF-IDF.
419
+ """
420
+ if not detector_positions:
421
+ return None
422
+ # Look for the LAST detector strictly before the value, within range.
423
+ last_before: str | None = None
424
+ for pos, key in detector_positions:
425
+ if pos < value_pos and (value_pos - pos) <= max_distance:
426
+ last_before = key
427
+ elif pos >= value_pos:
428
+ break
429
+ if last_before is not None:
430
+ return last_before
431
+ # Fall back: FIRST detector after the value, within range.
432
+ for pos, key in detector_positions:
433
+ if pos >= value_pos and (pos - value_pos) <= max_distance:
434
+ return key
435
+ return None
436
+
437
+
438
+ def _position_to_line(line_starts: list[int], pos: int) -> int:
439
+ """Convert a 0-indexed character position to a 1-indexed line number."""
440
+ # Binary-search-like; line_starts is sorted.
441
+ lo, hi = 0, len(line_starts) - 1
442
+ while lo < hi:
443
+ mid = (lo + hi + 1) // 2
444
+ if line_starts[mid] <= pos:
445
+ lo = mid
446
+ else:
447
+ hi = mid - 1
448
+ return lo + 1
@@ -68,6 +68,7 @@
68
68
  "LogisticStacker",
69
69
  "MANIFEST_SCHEMA_VERSION",
70
70
  "MDEEstimate",
71
+ "Match",
71
72
  "MaxF1Selector",
72
73
  "MetaLearner",
73
74
  "MetricFn",
@@ -127,7 +128,9 @@
127
128
  "TokenSplittingInjection",
128
129
  "TokenizationLeakageCheck",
129
130
  "UnicodeNormalizationInjection",
131
+ "ValueBindingsReport",
130
132
  "Versioned",
133
+ "Violation",
131
134
  "WhitespaceInjection",
132
135
  "WilsonInterval",
133
136
  "YoudenJSelector",
@@ -242,6 +245,7 @@
242
245
  "validate_manifest",
243
246
  "validate_payload",
244
247
  "validate_prediction_artifact_ref",
248
+ "validate_reader_value_bindings",
245
249
  "validate_results",
246
250
  "validate_source_roles",
247
251
  "walk_path",
@@ -795,6 +799,14 @@
795
799
  "kind": "class",
796
800
  "signature": "(mde: 'float', sigma_delta: 'float', delta_observed: 'float', alpha: 'float', power: 'float', n_resamples: 'int', n: 'int') -> None"
797
801
  },
802
+ "Match": {
803
+ "bases": [
804
+ "object"
805
+ ],
806
+ "doc_first_line": "A reader-prose (detector, metric, value) triple that matches the canonical binding.",
807
+ "kind": "class",
808
+ "signature": "(file: 'Path', line: 'int', detector: 'str', metric: 'str', value: 'float') -> None"
809
+ },
798
810
  "MaxF1Selector": {
799
811
  "bases": [
800
812
  "object"
@@ -1326,6 +1338,14 @@
1326
1338
  "kind": "class",
1327
1339
  "signature": "(form: 'str' = 'NFKC', name: 'str' = 'unicode_normalize') -> None"
1328
1340
  },
1341
+ "ValueBindingsReport": {
1342
+ "bases": [
1343
+ "object"
1344
+ ],
1345
+ "doc_first_line": "Result of :func:`validate_reader_value_bindings`.",
1346
+ "kind": "class",
1347
+ "signature": "(violations: 'tuple[Violation, ...]', matched: 'tuple[Match, ...]', coverage: 'float') -> None"
1348
+ },
1329
1349
  "Versioned": {
1330
1350
  "bases": [
1331
1351
  "Protocol"
@@ -1337,6 +1357,14 @@
1337
1357
  },
1338
1358
  "signature": "(*args, **kwargs)"
1339
1359
  },
1360
+ "Violation": {
1361
+ "bases": [
1362
+ "object"
1363
+ ],
1364
+ "doc_first_line": "A reader-prose (detector, metric, value) triple where the value disagrees with the canonical binding.",
1365
+ "kind": "class",
1366
+ "signature": "(file: 'Path', line: 'int', detector: 'str', metric: 'str', found_value: 'float', expected_value: 'float', surrounding_text: 'str') -> None"
1367
+ },
1340
1368
  "WhitespaceInjection": {
1341
1369
  "bases": [
1342
1370
  "object"
@@ -1373,7 +1401,7 @@
1373
1401
  "doc_first_line": "str(object='') -> str",
1374
1402
  "kind": "value",
1375
1403
  "type": "str",
1376
- "value": "'1.0.2'"
1404
+ "value": "'1.0.3'"
1377
1405
  },
1378
1406
  "apply_operating_points": {
1379
1407
  "doc_first_line": "Apply fitted thresholds to a mixed-class or single-class target slice.",
@@ -1920,6 +1948,11 @@
1920
1948
  "kind": "function",
1921
1949
  "signature": "(payload: 'Mapping[str, object]') -> 'None'"
1922
1950
  },
1951
+ "validate_reader_value_bindings": {
1952
+ "doc_first_line": "Validate (detector, metric, value) bindings in reader-prose markdown.",
1953
+ "kind": "function",
1954
+ "signature": "(*, files: 'Sequence[Path | str]', bindings: 'Mapping[tuple[str, str], float]', value_pattern: 'str' = '\\\\d+\\\\.\\\\d{2,4}', max_distance_chars: 'int' = 80, metric_aliases: 'Mapping[str, Sequence[str]]' = mappingproxy({}), detector_aliases: 'Mapping[str, Sequence[str]]' = mappingproxy({}), tolerance: 'float' = 0.0001) -> 'ValueBindingsReport'"
1955
+ },
1923
1956
  "validate_results": {
1924
1957
  "doc_first_line": "Validate a serialized ``RunResult`` payload against ``results.v1.json``.",
1925
1958
  "kind": "function",