eval-toolkit 1.2.0__tar.gz → 1.3.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (189) hide show
  1. {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/CHANGELOG.md +114 -0
  2. {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/PKG-INFO +1 -1
  3. {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/docs/source/adr/README.md +1 -0
  4. {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/src/eval_toolkit/_version.py +1 -1
  5. {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/src/eval_toolkit/audit_value_bindings.py +295 -13
  6. {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/golden/public_api/snapshot.json +1 -1
  7. {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_audit_value_bindings.py +266 -0
  8. {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/.gitignore +0 -0
  9. {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/LICENSE +0 -0
  10. {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/README.md +0 -0
  11. {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/STYLE.md +0 -0
  12. {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/docs/archive/README.md +0 -0
  13. {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/docs/research/README.md +0 -0
  14. {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/docs/research/datasets/README.md +0 -0
  15. {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/docs/research/papers/data-integrity/README.md +0 -0
  16. {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/docs/research/papers/eval-ecosystem/README.md +0 -0
  17. {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/docs/research/papers/inference/README.md +0 -0
  18. {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/docs/research/papers/prompt-injection/README.md +0 -0
  19. {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/docs/source/methodology/README.md +0 -0
  20. {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/pyproject.toml +0 -0
  21. {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/src/eval_toolkit/__init__.py +0 -0
  22. {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/src/eval_toolkit/__main__.py +0 -0
  23. {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/src/eval_toolkit/_deprecated.py +0 -0
  24. {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/src/eval_toolkit/_parallel.py +0 -0
  25. {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/src/eval_toolkit/_rng.py +0 -0
  26. {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/src/eval_toolkit/_sweep.py +0 -0
  27. {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/src/eval_toolkit/adversarial.py +0 -0
  28. {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/src/eval_toolkit/analysis.py +0 -0
  29. {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/src/eval_toolkit/artifacts.py +0 -0
  30. {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/src/eval_toolkit/audit_citation_alignment.py +0 -0
  31. {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/src/eval_toolkit/audit_sister_doc_concept_drift.py +0 -0
  32. {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/src/eval_toolkit/bootstrap.py +0 -0
  33. {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/src/eval_toolkit/calibration.py +0 -0
  34. {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/src/eval_toolkit/claims.py +0 -0
  35. {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/src/eval_toolkit/config.py +0 -0
  36. {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/src/eval_toolkit/docs.py +0 -0
  37. {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/src/eval_toolkit/embeddings.py +0 -0
  38. {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/src/eval_toolkit/evidence.py +0 -0
  39. {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/src/eval_toolkit/harness.py +0 -0
  40. {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/src/eval_toolkit/leakage.py +0 -0
  41. {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/src/eval_toolkit/loaders.py +0 -0
  42. {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/src/eval_toolkit/losses.py +0 -0
  43. {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/src/eval_toolkit/manifest.py +0 -0
  44. {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/src/eval_toolkit/metric_specs.py +0 -0
  45. {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/src/eval_toolkit/metrics.py +0 -0
  46. {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/src/eval_toolkit/operating_points.py +0 -0
  47. {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/src/eval_toolkit/paths.py +0 -0
  48. {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/src/eval_toolkit/plotting.py +0 -0
  49. {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/src/eval_toolkit/preprocessing.py +0 -0
  50. {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/src/eval_toolkit/probes.py +0 -0
  51. {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/src/eval_toolkit/protocols.py +0 -0
  52. {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/src/eval_toolkit/provenance.py +0 -0
  53. {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/src/eval_toolkit/py.typed +0 -0
  54. {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/src/eval_toolkit/schemas/manifest.v1.json +0 -0
  55. {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/src/eval_toolkit/schemas/manifest.v2.json +0 -0
  56. {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/src/eval_toolkit/schemas/manifest.v3.json +0 -0
  57. {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/src/eval_toolkit/schemas/ood_manifest.v1.json +0 -0
  58. {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/src/eval_toolkit/schemas/results.v1.json +0 -0
  59. {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/src/eval_toolkit/schemas/results_full.v1.json +0 -0
  60. {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/src/eval_toolkit/scorecards.py +0 -0
  61. {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/src/eval_toolkit/seeds.py +0 -0
  62. {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/src/eval_toolkit/splits.py +0 -0
  63. {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/src/eval_toolkit/stacking.py +0 -0
  64. {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/src/eval_toolkit/text_dedup.py +0 -0
  65. {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/src/eval_toolkit/thresholds.py +0 -0
  66. {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/baseline/test_plotting_visual/plot_bootstrap_distribution.png +0 -0
  67. {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/baseline/test_plotting_visual/plot_confusion_matrix_grid.png +0 -0
  68. {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/baseline/test_plotting_visual/plot_lift_ci.png +0 -0
  69. {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/baseline/test_plotting_visual/plot_metric_bars.png +0 -0
  70. {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/baseline/test_plotting_visual/plot_pareto_frontier.png +0 -0
  71. {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/baseline/test_plotting_visual/plot_pr_curve.png +0 -0
  72. {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/baseline/test_plotting_visual/plot_reliability_diagram.png +0 -0
  73. {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/baseline/test_plotting_visual/plot_roc_curve.png +0 -0
  74. {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/baseline/test_plotting_visual/plot_score_histograms.png +0 -0
  75. {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/baseline/test_plotting_visual/plot_slice_metric_heatmap.png +0 -0
  76. {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/benchmarks/__init__.py +0 -0
  77. {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/benchmarks/test_kernel_benchmarks.py +0 -0
  78. {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/conftest.py +0 -0
  79. {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/golden/bootstrap_ci/cases.json +0 -0
  80. {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/golden/data/dedup_holdout.jsonl +0 -0
  81. {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/golden/data/dedup_holdout_expected.json +0 -0
  82. {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/golden/data/dedup_holdout_provenance.md +0 -0
  83. {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/golden/docs/expected.md +0 -0
  84. {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/golden/docs/input.md +0 -0
  85. {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/golden/docs/metrics.json +0 -0
  86. {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/golden/test_dedup_holdout_calibration.py +0 -0
  87. {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/strategies.py +0 -0
  88. {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_adversarial.py +0 -0
  89. {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_analysis.py +0 -0
  90. {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_artifacts.py +0 -0
  91. {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_audit_citation_alignment.py +0 -0
  92. {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_audit_sister_doc_concept_drift.py +0 -0
  93. {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_block_bootstrap_on_folds.py +0 -0
  94. {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_bootstrap_calibration_mc.py +0 -0
  95. {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_bootstrap_edge_cases.py +0 -0
  96. {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_bootstrap_golden.py +0 -0
  97. {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_bootstrap_njobs.py +0 -0
  98. {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_bootstrap_props.py +0 -0
  99. {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_bootstrap_research_grounded.py +0 -0
  100. {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_bootstrap_unit.py +0 -0
  101. {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_calibration_binary_adapters.py +0 -0
  102. {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_calibration_bootstrap_chain.py +0 -0
  103. {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_calibration_determinism.py +0 -0
  104. {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_calibration_optimization_failures.py +0 -0
  105. {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_calibration_props.py +0 -0
  106. {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_calibration_research_grounded.py +0 -0
  107. {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_calibration_unit.py +0 -0
  108. {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_claims.py +0 -0
  109. {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_claims_coverage.py +0 -0
  110. {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_claims_props.py +0 -0
  111. {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_cli.py +0 -0
  112. {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_config.py +0 -0
  113. {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_coverage_bootstrap.py +0 -0
  114. {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_coverage_calibration.py +0 -0
  115. {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_coverage_harness.py +0 -0
  116. {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_coverage_metrics.py +0 -0
  117. {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_coverage_plotting.py +0 -0
  118. {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_croissant_e2e.py +0 -0
  119. {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_dedup_split_leakage_chain.py +0 -0
  120. {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_deprecated_scalars_shim.py +0 -0
  121. {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_deprecations.py +0 -0
  122. {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_docs_golden.py +0 -0
  123. {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_docs_props.py +0 -0
  124. {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_embeddings.py +0 -0
  125. {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_evidence_validators.py +0 -0
  126. {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_harness_edge_cases.py +0 -0
  127. {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_harness_fault_injection.py +0 -0
  128. {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_harness_folded.py +0 -0
  129. {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_harness_internals.py +0 -0
  130. {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_harness_metric_options.py +0 -0
  131. {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_harness_parallelism.py +0 -0
  132. {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_harness_smoke.py +0 -0
  133. {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_import_boundaries.py +0 -0
  134. {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_is_metric_defined_for_slice.py +0 -0
  135. {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_lazy_extras_messages.py +0 -0
  136. {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_leakage.py +0 -0
  137. {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_leakage_error_paths.py +0 -0
  138. {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_leakage_props.py +0 -0
  139. {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_loaders.py +0 -0
  140. {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_loaders_coverage.py +0 -0
  141. {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_loaders_props.py +0 -0
  142. {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_logging.py +0 -0
  143. {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_losses.py +0 -0
  144. {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_manifest.py +0 -0
  145. {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_manifest_contamination_round_trip.py +0 -0
  146. {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_manifest_props.py +0 -0
  147. {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_manifest_validation.py +0 -0
  148. {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_metrics_props.py +0 -0
  149. {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_metrics_stratified_subsets.py +0 -0
  150. {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_metrics_unit.py +0 -0
  151. {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_misc_coverage.py +0 -0
  152. {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_numeric_edge_cases.py +0 -0
  153. {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_ood_loader.py +0 -0
  154. {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_operating_points.py +0 -0
  155. {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_operating_points_props.py +0 -0
  156. {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_parallel.py +0 -0
  157. {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_paths.py +0 -0
  158. {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_pipeline_e2e.py +0 -0
  159. {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_plotting_edge.py +0 -0
  160. {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_plotting_smoke.py +0 -0
  161. {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_plotting_visual.py +0 -0
  162. {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_preprocessing.py +0 -0
  163. {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_probes.py +0 -0
  164. {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_protocol_conformance.py +0 -0
  165. {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_provenance.py +0 -0
  166. {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_public_api.py +0 -0
  167. {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_recall_at_fpr.py +0 -0
  168. {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_reference_equivalence.py +0 -0
  169. {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_reproducibility_integration.py +0 -0
  170. {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_rng.py +0 -0
  171. {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_schemas.py +0 -0
  172. {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_scorecard.py +0 -0
  173. {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_seeds.py +0 -0
  174. {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_splits.py +0 -0
  175. {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_splits_leakage_integration.py +0 -0
  176. {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_splits_props.py +0 -0
  177. {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_stacking.py +0 -0
  178. {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_sweep.py +0 -0
  179. {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_text_dedup.py +0 -0
  180. {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_text_dedup_coverage.py +0 -0
  181. {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_text_dedup_props.py +0 -0
  182. {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_text_dedup_strategies.py +0 -0
  183. {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_thresholds.py +0 -0
  184. {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_thresholds_constant_score.py +0 -0
  185. {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_thresholds_coverage.py +0 -0
  186. {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_thresholds_props.py +0 -0
  187. {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_thresholds_research_grounded.py +0 -0
  188. {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_tokenization_leakage_check.py +0 -0
  189. {eval_toolkit-1.2.0 → eval_toolkit-1.3.0}/tests/test_v09_contracts.py +0 -0
@@ -5,6 +5,120 @@ All notable changes to this project will be documented in this file.
5
5
  The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
6
6
  and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
7
7
 
8
+ ## [1.3.0] — 2026-05-26 — `audit_value_bindings` cross-detector list-grammar pairing rules (closes #81)
9
+
10
+ Tier-1 ADDITIVE per [ADR 0003](docs/source/adr/0003-stability-contract-and-gate3-methodology.md).
11
+ Closes [#81](https://github.com/brandon-behring/eval-toolkit/issues/81)
12
+ — consumer-feedback follow-on after v1.2.0's adoption at
13
+ `prompt-injection-detection-submission@v1.3.12` (4 residual
14
+ warnings, all cross-detector list-grammar or metric-axis
15
+ confusion). Introduces **Layer 3 — pairing rules** as the third
16
+ correctness layer alongside ADR 0005's identity + scope model
17
+ (see new [ADR 0006](docs/source/adr/0006-pairing-rules-for-cross-detector-list-grammar.md)).
18
+
19
+ Consumer-side dogfood result: **4 → 0 warnings**. Combined with
20
+ v1.1.0 + v1.2.0, **100% reduction vs the pre-fix v1.0.5 baseline**
21
+ on the consumer's writeup (95 → 0).
22
+
23
+ ### Added — `audit_value_bindings.py` Layer 3 pairing rules
24
+
25
+ All four rules activate ONLY when `scope="narrative"`. Legacy
26
+ `scope="all"` callers see zero behavior change. No new public
27
+ kwargs; keyword sets are hardcoded module-level `frozenset`
28
+ constants.
29
+
30
+ - **Pattern A — `"for {detector}"` postfix override.** When a
31
+ candidate value is followed (within +50 chars) by `"for
32
+ {detector_alias}"` AND no other value lies between, the
33
+ postfix is authoritative: confirms pairing for this binding
34
+ OR skips the candidate if it names a different canonical detector.
35
+ Intervening-value check uses the v1.1.0 exclusion-ranges
36
+ infrastructure (CI brackets like `[0.286, 0.301]` don't count
37
+ as intervening values).
38
+ - **Pattern B — `"{detector}'s"` possessive override.** Same
39
+ mechanics; scans −80 chars before the value. Last possessive
40
+ in the pre-window is authoritative if its end is within 30
41
+ chars of the value start. Catches both immediate `"frozen
42
+ probe's 0.515"` and short-clause `"LoRA's ... AUROC is 0.383"`.
43
+ - **Pattern C — group-subject suppression.** When prose contains
44
+ `"for the {trained|frozen|baseline|all|both|other} detectors"`
45
+ within ±60 chars of the value AND on the same side of any
46
+ sentence boundary, the value is suppressed (it refers to a
47
+ multi-detector group statement that doesn't bind to a single
48
+ canonical detector). Multi-detector inference deferred to v1.4.0+
49
+ per ADR 0006.
50
+ - **Pattern D — metric-axis nearest-pairing.** Symmetric to
51
+ detector-axis pairing. Pre-collects ALL metric positions per
52
+ file (across `metric_aliases` keys, not just binding-derived
53
+ metrics). Requires the NEAREST metric to the value to be THIS
54
+ binding's metric. Catches prose like `"AUPRC delta suggests:
55
+ ... AUROC is 0.383"` where the wider window-based metric
56
+ proximity check picks up the wrong metric.
57
+
58
+ ### Internal changes (no public API impact)
59
+
60
+ - New module-level constants:
61
+ - `_GROUP_SUBJECT_KEYWORDS: frozenset[str]` — group adjectives.
62
+ - `_GROUP_SUBJECT_PATTERN: re.Pattern[str]` — compiled regex
63
+ matching `"for the {kw} detectors?"`.
64
+ - New private helpers:
65
+ - `_build_postfix_pattern(detector_aliases, detector_keys)` —
66
+ per-call regex builder for Pattern A.
67
+ - `_build_possessive_pattern(detector_aliases, detector_keys)` —
68
+ per-call regex builder for Pattern B.
69
+ - `metric_patterns` build extended to use the union of
70
+ binding-derived metrics and `metric_aliases.keys()` so Pattern D can
71
+ pair against unbound-but-aliased metrics.
72
+ - Inner loop reordered to apply C-suppress → Pattern A → Pattern B
73
+ before proximity-based detector pairing. Pattern A/B record a
74
+ `pairing_confirmed_pos` that BYPASSES proximity when the override
75
+ confirms THIS binding's detector.
76
+ - Pattern D added as a separate check after the existing
77
+ metric_close proximity test.
78
+
79
+ ### Dogfood evidence (compounded across the cycle)
80
+
81
+ | Release | Configuration | Warnings on consumer HEAD | Reduction vs v1.0.5 |
82
+ |---|---|---|---|
83
+ | v1.0.5 | Legacy 2-tuple, no scope | 95 | — |
84
+ | v1.1.0 | BindingKey + scope='narrative' content-type | 23 | -76% |
85
+ | v1.2.0 | + T1–T4 context filters | 7 | -93% |
86
+ | **v1.3.0** | + Pattern A/B/C/D pairing rules | **0** | **-100%** |
87
+
88
+ ### Consumer adoption path
89
+
90
+ `prompt-injection-detection-submission` and other consumers using
91
+ `scope="narrative"` get the v1.3.0 pairing rules automatically with
92
+ no code change. Recommended migration:
93
+
94
+ 1. Re-pin `eval-toolkit>=1.3.0,<2` (additive; no consumer code
95
+ change required).
96
+ 2. **HARD-gate promotion is now credible.** With 0 residual
97
+ warnings, `audit_value_bindings` can be promoted from SOFT to
98
+ HARD (failing CI on violations) bundled with
99
+ `audit_citation_alignment` per the consumer's v1.3.8
100
+ bundled-promotion plan.
101
+
102
+ ### Tests
103
+
104
+ 43 tests in `tests/test_audit_value_bindings.py` (36 from v1.2.0 + 7
105
+ new for Pattern A/B/C/D + unknown-alias fall-through + scope='all'
106
+ backward-compat + combined dogfood). All pass. Public API
107
+ snapshot regenerated for `__version__` bump only (no signature
108
+ changes).
109
+
110
+ ### Out of scope (deferred)
111
+
112
+ - **Multi-detector inference for Pattern C** — replace
113
+ suppression with explicit iteration over implied group
114
+ detectors. ~250 LOC; v1.4.0+ candidate if consumer demand
115
+ emerges.
116
+ - **Enumeration parsing** — `"X scored Y, Z, W for A, B, C
117
+ respectively"` patterns. Not in #81; v1.4.0+ if needed.
118
+ - **Markdown AST parsing** (ADR 0005 §A4) — v2.0 territory.
119
+ - **Public kwargs for pairing-rule keyword extension** — YAGNI;
120
+ add in v1.4.0+ (new kwargs are additive, so a minor release per SemVer) if demand emerges.
121
+
8
122
  ## [1.2.0] — 2026-05-26 — `audit_value_bindings` context-aware noise reduction (consumer-feedback follow-on to #80)
9
123
 
10
124
  Tier-1 ADDITIVE per [ADR 0003](docs/source/adr/0003-stability-contract-and-gate3-methodology.md).
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: eval-toolkit
3
- Version: 1.2.0
3
+ Version: 1.3.0
4
4
  Summary: Reusable evaluation contracts for binary classification: metrics, bootstrap CIs, calibration, artifacts, and evidence gates.
5
5
  Project-URL: Homepage, https://github.com/brandon-behring/eval-toolkit
6
6
  Project-URL: Documentation, https://brandon-behring.github.io/eval-toolkit/
@@ -78,3 +78,4 @@ What would have to change for this decision to be reopened?
78
78
  | [0003](0003-stability-contract-and-gate3-methodology.md) | v1.0 stability contract + Gate 3 methodology | Accepted | 2026-05-21 |
79
79
  | [0004](0004-naming-conventions.md) | Naming conventions for modules, classes, and parameters | Accepted | 2026-05-23 |
80
80
  | [0005](0005-structured-keys-for-audit-validators.md) | Structured keys over positional tuples for canonical-identity types in audit validators | Accepted | 2026-05-26 |
81
+ | [0006](0006-pairing-rules-for-cross-detector-list-grammar.md) | Pairing rules for cross-detector list-grammar in audit validators | Accepted | 2026-05-26 |
@@ -2,4 +2,4 @@
2
2
 
3
3
  __all__ = ["__version__"]
4
4
 
5
- __version__ = "1.2.0"
5
+ __version__ = "1.3.0"
@@ -154,6 +154,51 @@ _DELTA_PATTERN: re.Pattern[str] = _compile_keyword_pattern(_DELTA_KEYWORDS)
154
154
  _FLOOR_PATTERN: re.Pattern[str] = _compile_keyword_pattern(_FLOOR_KEYWORDS)
155
155
 
156
156
 
157
+ # v1.3.0 Layer 3 (pairing rules) per ADR 0006. Three rules that
158
+ # override or suppress the proximity-based detector pairing under
159
+ # explicit grammar cues (the fourth Layer 3 rule, Pattern D, is metric-axis and is described where it is applied):
160
+ #
161
+ # - Pattern A: "for {detector}" postfix → re-pair value to that
162
+ # detector (override). Built per-call via _build_postfix_pattern
163
+ # since it depends on the consumer's detector_aliases dict.
164
+ # - Pattern B: "{detector}'s ... is {value}" possessive → re-pair
165
+ # value to the possessor (override). Built per-call via
166
+ # _build_possessive_pattern.
167
+ # - Pattern C: "for the {trained|frozen|baseline|all|both|other}
168
+ # detectors" group subject → suppress the candidate value entirely
169
+ # (it's a group statement that doesn't bind to a single detector).
170
+ # Pattern is detector-independent so it compiles once at module
171
+ # load.
172
+
173
+ # Group-subject adjectives that introduce a multi-detector statement.
174
+ # When prose says "for the trained detectors", the following value
175
+ # refers to a GROUP (LoRA + TF-IDF + ... whatever bindings exist),
176
+ # not a single canonical detector. The validator can't infer which
177
+ # specific detectors own the group value with positional heuristics,
178
+ # so v1.3.0 suppresses the candidate rather than attempting multi-
179
+ # detector inference (a v1.4.0+ candidate per ADR 0006).
180
+ _GROUP_SUBJECT_KEYWORDS: frozenset[str] = frozenset(
181
+ {
182
+ "trained",
183
+ "frozen",
184
+ "baseline",
185
+ "all",
186
+ "both",
187
+ "other",
188
+ }
189
+ )
190
+
191
+ # Module-level: detector-independent group-subject regex. Matches
192
+ # "for the {trained|frozen|...} detectors" (with optional "the"; both
193
+ # singular and plural "detector"/"detectors" tolerated).
194
+ _GROUP_SUBJECT_PATTERN: re.Pattern[str] = re.compile(
195
+ r"\bfor\s+(?:the\s+)?(?:"
196
+ + "|".join(sorted(re.escape(kw) for kw in _GROUP_SUBJECT_KEYWORDS))
197
+ + r")\s+detectors?\b",
198
+ re.IGNORECASE,
199
+ )
200
+
201
+
157
202
  @dataclass(frozen=True)
158
203
  class BindingKey:
159
204
  """Canonical identity for a `(detector, metric, slice)` measurement.
@@ -469,7 +514,13 @@ def validate_reader_value_bindings(
469
514
  slice_aliases_dict: dict[str, Sequence[str]] = dict(slice_aliases) if slice_aliases else {}
470
515
 
471
516
  detector_keys = sorted({k.detector for k in canonical_bindings})
472
- metric_keys = sorted({k.metric for k in canonical_bindings})
517
+ # v1.3.0 Pattern D (metric-axis nearest-pairing) requires knowing
518
+ # ALL metrics that might appear in prose, not just bound metrics —
519
+ # e.g., when prose mentions AUROC near a value but only AUPRC is
520
+ # bound, Pattern D needs the AUROC pattern to correctly pair the
521
+ # value with the right metric. Union of binding metrics +
522
+ # consumer-supplied metric_aliases keys.
523
+ metric_keys = sorted({k.metric for k in canonical_bindings} | set(metric_aliases_dict.keys()))
473
524
  # Only compile slice patterns for non-"any" slice keys; "any"
474
525
  # signals legacy 2-tuple semantics (no slice scoping).
475
526
  slice_keys = sorted({k.slice for k in canonical_bindings if k.slice != "any"})
@@ -488,6 +539,43 @@ def validate_reader_value_bindings(
488
539
  }
489
540
  value_re = re.compile(value_pattern)
490
541
 
542
+ # v1.3.0 Layer 3 pairing rules (per ADR 0006). Built per-call
543
+ # because Patterns A and B depend on the consumer's detector
544
+ # aliases. `None` when scope="all" (legacy; rules don't fire).
545
+ postfix_pat: re.Pattern[str] | None = (
546
+ _build_postfix_pattern(detector_aliases_dict, detector_keys)
547
+ if scope == "narrative"
548
+ else None
549
+ )
550
+ possessive_pat: re.Pattern[str] | None = (
551
+ _build_possessive_pattern(detector_aliases_dict, detector_keys)
552
+ if scope == "narrative"
553
+ else None
554
+ )
555
+
556
+ # Inverse-alias resolver: matched alias text → canonical key.
557
+ # Used to resolve a matched postfix/possessive alias-group back to
558
+ # the canonical detector for override resolution. No lookup table
559
+ # is built: the helper below walks the canonical detector keys and
560
+ # tests each key's detector pattern against the matched text.
561
+ def _resolve_canonical_from_alias_match(alias_text: str) -> str | None:
562
+ """Return the canonical detector key whose pattern matched ``alias_text``.
563
+
564
+ Iterates the per-detector patterns and tries to match the
565
+ alias_text. Uses re.IGNORECASE for consistency with the
566
+ outer postfix/possessive patterns. First-match wins (the
567
+ OR-build above means there's only one canonical key per
568
+ match anyway in practice).
569
+ """
570
+ for det_key in detector_keys:
571
+ det_pat = detector_patterns[det_key]
572
+ # det_pat is the alias OR pattern from _build_pattern,
573
+ # case-insensitive. fullmatch on the alias_text checks
574
+ # whether this alias belongs to det_key's set.
575
+ if det_pat.fullmatch(alias_text):
576
+ return det_key
577
+ return None
578
+
491
579
  violations: list[Violation] = []
492
580
  matched: list[Match] = []
493
581
  matched_keys: set[BindingKey] = set()
@@ -542,6 +630,19 @@ def validate_reader_value_bindings(
542
630
  slice_positions.append((s_match.start(), s_key))
543
631
  slice_positions.sort()
544
632
 
633
+ # v1.3.0 Pattern D — metric-axis nearest-pairing (Layer 3 per
634
+ # ADR 0006, narrative-scope only). Pre-collect ALL metric
635
+ # positions so each value can be paired with its NEAREST
636
+ # metric mention (text-order). Catches the case where prose
637
+ # mentions BOTH metrics ("AUPRC delta suggests: AUROC 0.383")
638
+ # and the validator's window-based metric proximity check
639
+ # picks up the wrong metric. Symmetric to detector pairing.
640
+ metric_positions: list[tuple[int, str]] = [] # (position, canonical_metric)
641
+ for m_key, m_re in metric_patterns.items():
642
+ for m_match in m_re.finditer(text):
643
+ metric_positions.append((m_match.start(), m_key))
644
+ metric_positions.sort()
645
+
545
646
  # For each canonical binding, look in each file for triples.
546
647
  for canonical_key, expected in canonical_bindings.items():
547
648
  det_key = canonical_key.detector
@@ -615,18 +716,110 @@ def validate_reader_value_bindings(
615
716
  ):
616
717
  continue
617
718
 
618
- # Cross-detector disambiguation: require the current
619
- # det_key to be the detector paired with this value
620
- # by the text-order rule (last detector before; else
621
- # first detector after). Avoids cross-contamination
622
- # on multi-detector prose like "TF-IDF achieves
623
- # 0.971, while LoRA reaches 0.974".
624
- detector_match = _nearest_canonical_key(
625
- detector_positions, val_start_in_full, max_distance_chars
626
- )
627
- if detector_match is None or detector_match[0] != det_key:
628
- continue
629
- paired_det_pos = detector_match[1]
719
+ # v1.3.0 Pattern C group-subject suppression
720
+ # (narrative-scope only). When prose says "for the
721
+ # {trained|frozen|baseline|all|both|other}
722
+ # detectors" within ±60 chars of the value AND on
723
+ # the same side of any sentence boundary, the
724
+ # value refers to a multi-detector group statement
725
+ # that doesn't bind to a single canonical detector.
726
+ # Suppress the candidate (v1.4.0+ may attempt
727
+ # multi-detector inference per ADR 0006).
728
+ if scope == "narrative":
729
+ gs_start = max(0, val_start_in_full - 60)
730
+ gs_end = min(len(text), val_start_in_full + len(val_str) + 60)
731
+ gs_match = _GROUP_SUBJECT_PATTERN.search(text, gs_start, gs_end)
732
+ if gs_match is not None and not _crosses_sentence_boundary(
733
+ gs_match.start(), val_start_in_full, sentence_positions
734
+ ):
735
+ continue
736
+
737
+ # v1.3.0 Pattern A / B — Layer 3 pairing-rule
738
+ # OVERRIDES (narrative-scope only). When a postfix
739
+ # or possessive explicitly names a detector, the
740
+ # override is AUTHORITATIVE — it confirms or
741
+ # rejects the binding without falling through to
742
+ # the proximity-based detector pairing below.
743
+ #
744
+ # - postfix_confirmed_pos / possessive_confirmed_pos:
745
+ # the character position of the override match,
746
+ # used as the effective "paired detector
747
+ # position" for downstream T4 (sentence-
748
+ # boundary) check.
749
+ # - If postfix/possessive_canonical == det_key:
750
+ # confirmed; bypass proximity.
751
+ # - If != det_key AND is in bindings: skip (the
752
+ # other detector's loop iteration claims it).
753
+ # - If doesn't resolve / no match: fall through
754
+ # to proximity-based pairing.
755
+ pairing_confirmed_pos: int | None = None
756
+
757
+ # Pattern A — "for {detector}" postfix
758
+ if postfix_pat is not None:
759
+ val_end = val_start_in_full + len(val_str)
760
+ pf_match = postfix_pat.search(text, val_end, min(len(text), val_end + 50))
761
+ if pf_match is not None:
762
+ # Intervening-value guard: prose like
763
+ # "X 0.971 versus 0.293 for LoRA" — the
764
+ # "for LoRA" postfix belongs to 0.293,
765
+ # not 0.971. CI brackets like `[0.283,
766
+ # 0.298]` are excluded from intervening
767
+ # consideration via the existing
768
+ # excluded_ranges (v1.1.0 scope filter):
769
+ # values inside brackets aren't real
770
+ # binding-candidate intervening values.
771
+ intervening: re.Match[str] | None = None
772
+ for m in value_re.finditer(text, val_end, pf_match.start()):
773
+ if not (
774
+ excluded_ranges and _is_excluded(m.start(), excluded_ranges)
775
+ ):
776
+ intervening = m
777
+ break
778
+ if intervening is None:
779
+ postfix_canonical = _resolve_canonical_from_alias_match(
780
+ pf_match.group(1)
781
+ )
782
+ if postfix_canonical is not None:
783
+ if postfix_canonical != det_key:
784
+ continue
785
+ pairing_confirmed_pos = pf_match.start()
786
+
787
+ # Pattern B — possessive `'s` (only if Pattern A
788
+ # didn't already confirm). Find the LAST possessive
789
+ # in the −80 char pre-window; if its end is within
790
+ # 30 chars of the value start, apply override.
791
+ if pairing_confirmed_pos is None and possessive_pat is not None:
792
+ ps_matches = list(
793
+ possessive_pat.finditer(
794
+ text, max(0, val_start_in_full - 80), val_start_in_full
795
+ )
796
+ )
797
+ if ps_matches:
798
+ ps_match = ps_matches[-1]
799
+ if val_start_in_full - ps_match.end() <= 30:
800
+ possessive_canonical = _resolve_canonical_from_alias_match(
801
+ ps_match.group(1)
802
+ )
803
+ if possessive_canonical is not None:
804
+ if possessive_canonical != det_key:
805
+ continue
806
+ pairing_confirmed_pos = ps_match.start()
807
+
808
+ # Detector pairing: when a Layer 3 override
809
+ # confirmed the binding (pairing_confirmed_pos
810
+ # set), skip the proximity check — the postfix /
811
+ # possessive is authoritative. Otherwise, fall
812
+ # back to the text-order proximity rule (last
813
+ # detector before; else first detector after).
814
+ if pairing_confirmed_pos is not None:
815
+ paired_det_pos = pairing_confirmed_pos
816
+ else:
817
+ detector_match = _nearest_canonical_key(
818
+ detector_positions, val_start_in_full, max_distance_chars
819
+ )
820
+ if detector_match is None or detector_match[0] != det_key:
821
+ continue
822
+ paired_det_pos = detector_match[1]
630
823
 
631
824
  # v1.2.0 T4 (narrative-scope only): reject the
632
825
  # detector-value pair if a sentence boundary lies
@@ -646,6 +839,22 @@ def validate_reader_value_bindings(
646
839
  if not met_close:
647
840
  continue
648
841
 
842
+ # v1.3.0 Pattern D — metric-axis nearest-pairing
843
+ # (narrative-scope only). Require the NEAREST
844
+ # metric mention to the value (by text-order
845
+ # last-before-first-after) to be THIS binding's
846
+ # canonical metric. Catches prose like "than the
847
+ # AUPRC delta suggests: LoRA's pooled OOD AUROC
848
+ # is 0.383" where the AUPRC mention from the
849
+ # delta clause is within window of 0.383 but
850
+ # AUROC is the metric semantically owning it.
851
+ if scope == "narrative":
852
+ metric_match = _nearest_canonical_key(
853
+ metric_positions, val_start_in_full, max_distance_chars
854
+ )
855
+ if metric_match is not None and metric_match[0] != met_key:
856
+ continue
857
+
649
858
  # Slice disambiguation: when the canonical key is
650
859
  # slice-scoped (slice != "any"), pair the value
651
860
  # with the NEAREST slice mention by the same
@@ -1019,6 +1228,79 @@ def _has_keyword_in_window(
1019
1228
  return bool(pattern.search(text, start, end))
1020
1229
 
1021
1230
 
1231
+ def _build_postfix_pattern(
1232
+ detector_aliases: Mapping[str, Sequence[str]],
1233
+ detector_keys: Sequence[str],
1234
+ ) -> re.Pattern[str] | None:
1235
+ """Build a regex matching `"for {detector_alias}"` postfix constructs.
1236
+
1237
+ v1.3.0 Pattern A (Layer 3 pairing rule per ADR 0006). Used to
1238
+ re-pair a candidate value with the detector named in a "for X"
1239
+ postfix (e.g., ``"0.291 [...] for TF-IDF + LR"`` binds 0.291 to
1240
+ TF-IDF + LR via the postfix, overriding proximity-based pairing).
1241
+
1242
+ Each alias is paired with its canonical key in a single named-group
1243
+ OR pattern; the capture group reveals which detector matched. The
1244
+ canonical-key-as-fallback ensures the canonical name itself is
1245
+ matched even if no alias regex is provided for that detector.
1246
+
1247
+ Returns None if there are no detectors to build patterns for
1248
+ (empty bindings).
1249
+ """
1250
+ if not detector_keys:
1251
+ return None
1252
+ alts: list[str] = []
1253
+ for det_key in detector_keys:
1254
+ # Canonical name as a literal alternative + all alias regexes
1255
+ # (which may themselves contain regex syntax like `\+`).
1256
+ parts = [re.escape(det_key)] + list(detector_aliases.get(det_key, ()))
1257
+ # Each detector's parts collapse into a non-capturing group.
1258
+ alts.append("(?:" + "|".join(parts) + ")")
1259
+ # The outer capture group reveals which detector token matched.
1260
+ # The text-order rule means the first alternative wins per Python
1261
+ # re semantics, which is fine for our use case.
1262
+ return re.compile(
1263
+ r"\bfor\s+(?:the\s+)?(" + "|".join(alts) + r")(?=[\s,;.)\]]|$)",
1264
+ re.IGNORECASE,
1265
+ )
1266
+
1267
+
1268
+ def _build_possessive_pattern(
1269
+ detector_aliases: Mapping[str, Sequence[str]],
1270
+ detector_keys: Sequence[str],
1271
+ ) -> re.Pattern[str] | None:
1272
+ """Build a regex matching `"{detector_alias}'s"` possessive markers.
1273
+
1274
+ v1.3.0 Pattern B (Layer 3 pairing rule per ADR 0006). The
1275
+ possessive ``'s`` construction is a strong binding signal that
1276
+ isn't captured by detector-alias regex matching directly (alias
1277
+ patterns don't typically include the apostrophe). Re-pairs the
1278
+ candidate value with the possessor detector.
1279
+
1280
+ The pattern matches JUST the possessive marker (``{alias}'s``);
1281
+ binding-claim proximity is enforced at the call site (the
1282
+ inner loop's Pattern B block requires the LAST possessive
1283
+ in the pre-window to END within 30 chars of the value, which
1284
+ covers both `"frozen probe's 0.515"` (immediate) and
1285
+ `"LoRA's pooled OOD AUROC is 0.383"` (5-token clause).
1286
+
1287
+ Returns None if there are no detectors (empty bindings).
1288
+ """
1289
+ if not detector_keys:
1290
+ return None
1291
+ alts: list[str] = []
1292
+ for det_key in detector_keys:
1293
+ parts = [re.escape(det_key)] + list(detector_aliases.get(det_key, ()))
1294
+ alts.append("(?:" + "|".join(parts) + ")")
1295
+ # Match `{alias}'s` (ASCII apostrophe or typographic ’s). Tight
1296
+ # — proximity to the value is enforced at the call site via
1297
+ # `match.end()` against the value position.
1298
+ return re.compile(
1299
+ r"(" + "|".join(alts) + r")[’']s\b",
1300
+ re.IGNORECASE,
1301
+ )
1302
+
1303
+
1022
1304
  def _build_pattern(
1023
1305
  canonical: str,
1024
1306
  aliases: Sequence[str],
@@ -1429,7 +1429,7 @@
1429
1429
  "doc_first_line": "str(object='') -> str",
1430
1430
  "kind": "value",
1431
1431
  "type": "str",
1432
- "value": "'1.2.0'"
1432
+ "value": "'1.3.0'"
1433
1433
  },
1434
1434
  "apply_operating_points": {
1435
1435
  "doc_first_line": "Apply fitted thresholds to a mixed-class or single-class target slice.",