eval-toolkit 1.1.0__tar.gz → 1.3.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (189) hide show
  1. {eval_toolkit-1.1.0 → eval_toolkit-1.3.0}/CHANGELOG.md +249 -0
  2. {eval_toolkit-1.1.0 → eval_toolkit-1.3.0}/PKG-INFO +1 -1
  3. {eval_toolkit-1.1.0 → eval_toolkit-1.3.0}/docs/source/adr/README.md +3 -0
  4. {eval_toolkit-1.1.0 → eval_toolkit-1.3.0}/src/eval_toolkit/_version.py +1 -1
  5. {eval_toolkit-1.1.0 → eval_toolkit-1.3.0}/src/eval_toolkit/audit_value_bindings.py +625 -30
  6. {eval_toolkit-1.1.0 → eval_toolkit-1.3.0}/tests/golden/public_api/snapshot.json +2 -2
  7. {eval_toolkit-1.1.0 → eval_toolkit-1.3.0}/tests/test_audit_value_bindings.py +534 -0
  8. {eval_toolkit-1.1.0 → eval_toolkit-1.3.0}/.gitignore +0 -0
  9. {eval_toolkit-1.1.0 → eval_toolkit-1.3.0}/LICENSE +0 -0
  10. {eval_toolkit-1.1.0 → eval_toolkit-1.3.0}/README.md +0 -0
  11. {eval_toolkit-1.1.0 → eval_toolkit-1.3.0}/STYLE.md +0 -0
  12. {eval_toolkit-1.1.0 → eval_toolkit-1.3.0}/docs/archive/README.md +0 -0
  13. {eval_toolkit-1.1.0 → eval_toolkit-1.3.0}/docs/research/README.md +0 -0
  14. {eval_toolkit-1.1.0 → eval_toolkit-1.3.0}/docs/research/datasets/README.md +0 -0
  15. {eval_toolkit-1.1.0 → eval_toolkit-1.3.0}/docs/research/papers/data-integrity/README.md +0 -0
  16. {eval_toolkit-1.1.0 → eval_toolkit-1.3.0}/docs/research/papers/eval-ecosystem/README.md +0 -0
  17. {eval_toolkit-1.1.0 → eval_toolkit-1.3.0}/docs/research/papers/inference/README.md +0 -0
  18. {eval_toolkit-1.1.0 → eval_toolkit-1.3.0}/docs/research/papers/prompt-injection/README.md +0 -0
  19. {eval_toolkit-1.1.0 → eval_toolkit-1.3.0}/docs/source/methodology/README.md +0 -0
  20. {eval_toolkit-1.1.0 → eval_toolkit-1.3.0}/pyproject.toml +0 -0
  21. {eval_toolkit-1.1.0 → eval_toolkit-1.3.0}/src/eval_toolkit/__init__.py +0 -0
  22. {eval_toolkit-1.1.0 → eval_toolkit-1.3.0}/src/eval_toolkit/__main__.py +0 -0
  23. {eval_toolkit-1.1.0 → eval_toolkit-1.3.0}/src/eval_toolkit/_deprecated.py +0 -0
  24. {eval_toolkit-1.1.0 → eval_toolkit-1.3.0}/src/eval_toolkit/_parallel.py +0 -0
  25. {eval_toolkit-1.1.0 → eval_toolkit-1.3.0}/src/eval_toolkit/_rng.py +0 -0
  26. {eval_toolkit-1.1.0 → eval_toolkit-1.3.0}/src/eval_toolkit/_sweep.py +0 -0
  27. {eval_toolkit-1.1.0 → eval_toolkit-1.3.0}/src/eval_toolkit/adversarial.py +0 -0
  28. {eval_toolkit-1.1.0 → eval_toolkit-1.3.0}/src/eval_toolkit/analysis.py +0 -0
  29. {eval_toolkit-1.1.0 → eval_toolkit-1.3.0}/src/eval_toolkit/artifacts.py +0 -0
  30. {eval_toolkit-1.1.0 → eval_toolkit-1.3.0}/src/eval_toolkit/audit_citation_alignment.py +0 -0
  31. {eval_toolkit-1.1.0 → eval_toolkit-1.3.0}/src/eval_toolkit/audit_sister_doc_concept_drift.py +0 -0
  32. {eval_toolkit-1.1.0 → eval_toolkit-1.3.0}/src/eval_toolkit/bootstrap.py +0 -0
  33. {eval_toolkit-1.1.0 → eval_toolkit-1.3.0}/src/eval_toolkit/calibration.py +0 -0
  34. {eval_toolkit-1.1.0 → eval_toolkit-1.3.0}/src/eval_toolkit/claims.py +0 -0
  35. {eval_toolkit-1.1.0 → eval_toolkit-1.3.0}/src/eval_toolkit/config.py +0 -0
  36. {eval_toolkit-1.1.0 → eval_toolkit-1.3.0}/src/eval_toolkit/docs.py +0 -0
  37. {eval_toolkit-1.1.0 → eval_toolkit-1.3.0}/src/eval_toolkit/embeddings.py +0 -0
  38. {eval_toolkit-1.1.0 → eval_toolkit-1.3.0}/src/eval_toolkit/evidence.py +0 -0
  39. {eval_toolkit-1.1.0 → eval_toolkit-1.3.0}/src/eval_toolkit/harness.py +0 -0
  40. {eval_toolkit-1.1.0 → eval_toolkit-1.3.0}/src/eval_toolkit/leakage.py +0 -0
  41. {eval_toolkit-1.1.0 → eval_toolkit-1.3.0}/src/eval_toolkit/loaders.py +0 -0
  42. {eval_toolkit-1.1.0 → eval_toolkit-1.3.0}/src/eval_toolkit/losses.py +0 -0
  43. {eval_toolkit-1.1.0 → eval_toolkit-1.3.0}/src/eval_toolkit/manifest.py +0 -0
  44. {eval_toolkit-1.1.0 → eval_toolkit-1.3.0}/src/eval_toolkit/metric_specs.py +0 -0
  45. {eval_toolkit-1.1.0 → eval_toolkit-1.3.0}/src/eval_toolkit/metrics.py +0 -0
  46. {eval_toolkit-1.1.0 → eval_toolkit-1.3.0}/src/eval_toolkit/operating_points.py +0 -0
  47. {eval_toolkit-1.1.0 → eval_toolkit-1.3.0}/src/eval_toolkit/paths.py +0 -0
  48. {eval_toolkit-1.1.0 → eval_toolkit-1.3.0}/src/eval_toolkit/plotting.py +0 -0
  49. {eval_toolkit-1.1.0 → eval_toolkit-1.3.0}/src/eval_toolkit/preprocessing.py +0 -0
  50. {eval_toolkit-1.1.0 → eval_toolkit-1.3.0}/src/eval_toolkit/probes.py +0 -0
  51. {eval_toolkit-1.1.0 → eval_toolkit-1.3.0}/src/eval_toolkit/protocols.py +0 -0
  52. {eval_toolkit-1.1.0 → eval_toolkit-1.3.0}/src/eval_toolkit/provenance.py +0 -0
  53. {eval_toolkit-1.1.0 → eval_toolkit-1.3.0}/src/eval_toolkit/py.typed +0 -0
  54. {eval_toolkit-1.1.0 → eval_toolkit-1.3.0}/src/eval_toolkit/schemas/manifest.v1.json +0 -0
  55. {eval_toolkit-1.1.0 → eval_toolkit-1.3.0}/src/eval_toolkit/schemas/manifest.v2.json +0 -0
  56. {eval_toolkit-1.1.0 → eval_toolkit-1.3.0}/src/eval_toolkit/schemas/manifest.v3.json +0 -0
  57. {eval_toolkit-1.1.0 → eval_toolkit-1.3.0}/src/eval_toolkit/schemas/ood_manifest.v1.json +0 -0
  58. {eval_toolkit-1.1.0 → eval_toolkit-1.3.0}/src/eval_toolkit/schemas/results.v1.json +0 -0
  59. {eval_toolkit-1.1.0 → eval_toolkit-1.3.0}/src/eval_toolkit/schemas/results_full.v1.json +0 -0
  60. {eval_toolkit-1.1.0 → eval_toolkit-1.3.0}/src/eval_toolkit/scorecards.py +0 -0
  61. {eval_toolkit-1.1.0 → eval_toolkit-1.3.0}/src/eval_toolkit/seeds.py +0 -0
  62. {eval_toolkit-1.1.0 → eval_toolkit-1.3.0}/src/eval_toolkit/splits.py +0 -0
  63. {eval_toolkit-1.1.0 → eval_toolkit-1.3.0}/src/eval_toolkit/stacking.py +0 -0
  64. {eval_toolkit-1.1.0 → eval_toolkit-1.3.0}/src/eval_toolkit/text_dedup.py +0 -0
  65. {eval_toolkit-1.1.0 → eval_toolkit-1.3.0}/src/eval_toolkit/thresholds.py +0 -0
  66. {eval_toolkit-1.1.0 → eval_toolkit-1.3.0}/tests/baseline/test_plotting_visual/plot_bootstrap_distribution.png +0 -0
  67. {eval_toolkit-1.1.0 → eval_toolkit-1.3.0}/tests/baseline/test_plotting_visual/plot_confusion_matrix_grid.png +0 -0
  68. {eval_toolkit-1.1.0 → eval_toolkit-1.3.0}/tests/baseline/test_plotting_visual/plot_lift_ci.png +0 -0
  69. {eval_toolkit-1.1.0 → eval_toolkit-1.3.0}/tests/baseline/test_plotting_visual/plot_metric_bars.png +0 -0
  70. {eval_toolkit-1.1.0 → eval_toolkit-1.3.0}/tests/baseline/test_plotting_visual/plot_pareto_frontier.png +0 -0
  71. {eval_toolkit-1.1.0 → eval_toolkit-1.3.0}/tests/baseline/test_plotting_visual/plot_pr_curve.png +0 -0
  72. {eval_toolkit-1.1.0 → eval_toolkit-1.3.0}/tests/baseline/test_plotting_visual/plot_reliability_diagram.png +0 -0
  73. {eval_toolkit-1.1.0 → eval_toolkit-1.3.0}/tests/baseline/test_plotting_visual/plot_roc_curve.png +0 -0
  74. {eval_toolkit-1.1.0 → eval_toolkit-1.3.0}/tests/baseline/test_plotting_visual/plot_score_histograms.png +0 -0
  75. {eval_toolkit-1.1.0 → eval_toolkit-1.3.0}/tests/baseline/test_plotting_visual/plot_slice_metric_heatmap.png +0 -0
  76. {eval_toolkit-1.1.0 → eval_toolkit-1.3.0}/tests/benchmarks/__init__.py +0 -0
  77. {eval_toolkit-1.1.0 → eval_toolkit-1.3.0}/tests/benchmarks/test_kernel_benchmarks.py +0 -0
  78. {eval_toolkit-1.1.0 → eval_toolkit-1.3.0}/tests/conftest.py +0 -0
  79. {eval_toolkit-1.1.0 → eval_toolkit-1.3.0}/tests/golden/bootstrap_ci/cases.json +0 -0
  80. {eval_toolkit-1.1.0 → eval_toolkit-1.3.0}/tests/golden/data/dedup_holdout.jsonl +0 -0
  81. {eval_toolkit-1.1.0 → eval_toolkit-1.3.0}/tests/golden/data/dedup_holdout_expected.json +0 -0
  82. {eval_toolkit-1.1.0 → eval_toolkit-1.3.0}/tests/golden/data/dedup_holdout_provenance.md +0 -0
  83. {eval_toolkit-1.1.0 → eval_toolkit-1.3.0}/tests/golden/docs/expected.md +0 -0
  84. {eval_toolkit-1.1.0 → eval_toolkit-1.3.0}/tests/golden/docs/input.md +0 -0
  85. {eval_toolkit-1.1.0 → eval_toolkit-1.3.0}/tests/golden/docs/metrics.json +0 -0
  86. {eval_toolkit-1.1.0 → eval_toolkit-1.3.0}/tests/golden/test_dedup_holdout_calibration.py +0 -0
  87. {eval_toolkit-1.1.0 → eval_toolkit-1.3.0}/tests/strategies.py +0 -0
  88. {eval_toolkit-1.1.0 → eval_toolkit-1.3.0}/tests/test_adversarial.py +0 -0
  89. {eval_toolkit-1.1.0 → eval_toolkit-1.3.0}/tests/test_analysis.py +0 -0
  90. {eval_toolkit-1.1.0 → eval_toolkit-1.3.0}/tests/test_artifacts.py +0 -0
  91. {eval_toolkit-1.1.0 → eval_toolkit-1.3.0}/tests/test_audit_citation_alignment.py +0 -0
  92. {eval_toolkit-1.1.0 → eval_toolkit-1.3.0}/tests/test_audit_sister_doc_concept_drift.py +0 -0
  93. {eval_toolkit-1.1.0 → eval_toolkit-1.3.0}/tests/test_block_bootstrap_on_folds.py +0 -0
  94. {eval_toolkit-1.1.0 → eval_toolkit-1.3.0}/tests/test_bootstrap_calibration_mc.py +0 -0
  95. {eval_toolkit-1.1.0 → eval_toolkit-1.3.0}/tests/test_bootstrap_edge_cases.py +0 -0
  96. {eval_toolkit-1.1.0 → eval_toolkit-1.3.0}/tests/test_bootstrap_golden.py +0 -0
  97. {eval_toolkit-1.1.0 → eval_toolkit-1.3.0}/tests/test_bootstrap_njobs.py +0 -0
  98. {eval_toolkit-1.1.0 → eval_toolkit-1.3.0}/tests/test_bootstrap_props.py +0 -0
  99. {eval_toolkit-1.1.0 → eval_toolkit-1.3.0}/tests/test_bootstrap_research_grounded.py +0 -0
  100. {eval_toolkit-1.1.0 → eval_toolkit-1.3.0}/tests/test_bootstrap_unit.py +0 -0
  101. {eval_toolkit-1.1.0 → eval_toolkit-1.3.0}/tests/test_calibration_binary_adapters.py +0 -0
  102. {eval_toolkit-1.1.0 → eval_toolkit-1.3.0}/tests/test_calibration_bootstrap_chain.py +0 -0
  103. {eval_toolkit-1.1.0 → eval_toolkit-1.3.0}/tests/test_calibration_determinism.py +0 -0
  104. {eval_toolkit-1.1.0 → eval_toolkit-1.3.0}/tests/test_calibration_optimization_failures.py +0 -0
  105. {eval_toolkit-1.1.0 → eval_toolkit-1.3.0}/tests/test_calibration_props.py +0 -0
  106. {eval_toolkit-1.1.0 → eval_toolkit-1.3.0}/tests/test_calibration_research_grounded.py +0 -0
  107. {eval_toolkit-1.1.0 → eval_toolkit-1.3.0}/tests/test_calibration_unit.py +0 -0
  108. {eval_toolkit-1.1.0 → eval_toolkit-1.3.0}/tests/test_claims.py +0 -0
  109. {eval_toolkit-1.1.0 → eval_toolkit-1.3.0}/tests/test_claims_coverage.py +0 -0
  110. {eval_toolkit-1.1.0 → eval_toolkit-1.3.0}/tests/test_claims_props.py +0 -0
  111. {eval_toolkit-1.1.0 → eval_toolkit-1.3.0}/tests/test_cli.py +0 -0
  112. {eval_toolkit-1.1.0 → eval_toolkit-1.3.0}/tests/test_config.py +0 -0
  113. {eval_toolkit-1.1.0 → eval_toolkit-1.3.0}/tests/test_coverage_bootstrap.py +0 -0
  114. {eval_toolkit-1.1.0 → eval_toolkit-1.3.0}/tests/test_coverage_calibration.py +0 -0
  115. {eval_toolkit-1.1.0 → eval_toolkit-1.3.0}/tests/test_coverage_harness.py +0 -0
  116. {eval_toolkit-1.1.0 → eval_toolkit-1.3.0}/tests/test_coverage_metrics.py +0 -0
  117. {eval_toolkit-1.1.0 → eval_toolkit-1.3.0}/tests/test_coverage_plotting.py +0 -0
  118. {eval_toolkit-1.1.0 → eval_toolkit-1.3.0}/tests/test_croissant_e2e.py +0 -0
  119. {eval_toolkit-1.1.0 → eval_toolkit-1.3.0}/tests/test_dedup_split_leakage_chain.py +0 -0
  120. {eval_toolkit-1.1.0 → eval_toolkit-1.3.0}/tests/test_deprecated_scalars_shim.py +0 -0
  121. {eval_toolkit-1.1.0 → eval_toolkit-1.3.0}/tests/test_deprecations.py +0 -0
  122. {eval_toolkit-1.1.0 → eval_toolkit-1.3.0}/tests/test_docs_golden.py +0 -0
  123. {eval_toolkit-1.1.0 → eval_toolkit-1.3.0}/tests/test_docs_props.py +0 -0
  124. {eval_toolkit-1.1.0 → eval_toolkit-1.3.0}/tests/test_embeddings.py +0 -0
  125. {eval_toolkit-1.1.0 → eval_toolkit-1.3.0}/tests/test_evidence_validators.py +0 -0
  126. {eval_toolkit-1.1.0 → eval_toolkit-1.3.0}/tests/test_harness_edge_cases.py +0 -0
  127. {eval_toolkit-1.1.0 → eval_toolkit-1.3.0}/tests/test_harness_fault_injection.py +0 -0
  128. {eval_toolkit-1.1.0 → eval_toolkit-1.3.0}/tests/test_harness_folded.py +0 -0
  129. {eval_toolkit-1.1.0 → eval_toolkit-1.3.0}/tests/test_harness_internals.py +0 -0
  130. {eval_toolkit-1.1.0 → eval_toolkit-1.3.0}/tests/test_harness_metric_options.py +0 -0
  131. {eval_toolkit-1.1.0 → eval_toolkit-1.3.0}/tests/test_harness_parallelism.py +0 -0
  132. {eval_toolkit-1.1.0 → eval_toolkit-1.3.0}/tests/test_harness_smoke.py +0 -0
  133. {eval_toolkit-1.1.0 → eval_toolkit-1.3.0}/tests/test_import_boundaries.py +0 -0
  134. {eval_toolkit-1.1.0 → eval_toolkit-1.3.0}/tests/test_is_metric_defined_for_slice.py +0 -0
  135. {eval_toolkit-1.1.0 → eval_toolkit-1.3.0}/tests/test_lazy_extras_messages.py +0 -0
  136. {eval_toolkit-1.1.0 → eval_toolkit-1.3.0}/tests/test_leakage.py +0 -0
  137. {eval_toolkit-1.1.0 → eval_toolkit-1.3.0}/tests/test_leakage_error_paths.py +0 -0
  138. {eval_toolkit-1.1.0 → eval_toolkit-1.3.0}/tests/test_leakage_props.py +0 -0
  139. {eval_toolkit-1.1.0 → eval_toolkit-1.3.0}/tests/test_loaders.py +0 -0
  140. {eval_toolkit-1.1.0 → eval_toolkit-1.3.0}/tests/test_loaders_coverage.py +0 -0
  141. {eval_toolkit-1.1.0 → eval_toolkit-1.3.0}/tests/test_loaders_props.py +0 -0
  142. {eval_toolkit-1.1.0 → eval_toolkit-1.3.0}/tests/test_logging.py +0 -0
  143. {eval_toolkit-1.1.0 → eval_toolkit-1.3.0}/tests/test_losses.py +0 -0
  144. {eval_toolkit-1.1.0 → eval_toolkit-1.3.0}/tests/test_manifest.py +0 -0
  145. {eval_toolkit-1.1.0 → eval_toolkit-1.3.0}/tests/test_manifest_contamination_round_trip.py +0 -0
  146. {eval_toolkit-1.1.0 → eval_toolkit-1.3.0}/tests/test_manifest_props.py +0 -0
  147. {eval_toolkit-1.1.0 → eval_toolkit-1.3.0}/tests/test_manifest_validation.py +0 -0
  148. {eval_toolkit-1.1.0 → eval_toolkit-1.3.0}/tests/test_metrics_props.py +0 -0
  149. {eval_toolkit-1.1.0 → eval_toolkit-1.3.0}/tests/test_metrics_stratified_subsets.py +0 -0
  150. {eval_toolkit-1.1.0 → eval_toolkit-1.3.0}/tests/test_metrics_unit.py +0 -0
  151. {eval_toolkit-1.1.0 → eval_toolkit-1.3.0}/tests/test_misc_coverage.py +0 -0
  152. {eval_toolkit-1.1.0 → eval_toolkit-1.3.0}/tests/test_numeric_edge_cases.py +0 -0
  153. {eval_toolkit-1.1.0 → eval_toolkit-1.3.0}/tests/test_ood_loader.py +0 -0
  154. {eval_toolkit-1.1.0 → eval_toolkit-1.3.0}/tests/test_operating_points.py +0 -0
  155. {eval_toolkit-1.1.0 → eval_toolkit-1.3.0}/tests/test_operating_points_props.py +0 -0
  156. {eval_toolkit-1.1.0 → eval_toolkit-1.3.0}/tests/test_parallel.py +0 -0
  157. {eval_toolkit-1.1.0 → eval_toolkit-1.3.0}/tests/test_paths.py +0 -0
  158. {eval_toolkit-1.1.0 → eval_toolkit-1.3.0}/tests/test_pipeline_e2e.py +0 -0
  159. {eval_toolkit-1.1.0 → eval_toolkit-1.3.0}/tests/test_plotting_edge.py +0 -0
  160. {eval_toolkit-1.1.0 → eval_toolkit-1.3.0}/tests/test_plotting_smoke.py +0 -0
  161. {eval_toolkit-1.1.0 → eval_toolkit-1.3.0}/tests/test_plotting_visual.py +0 -0
  162. {eval_toolkit-1.1.0 → eval_toolkit-1.3.0}/tests/test_preprocessing.py +0 -0
  163. {eval_toolkit-1.1.0 → eval_toolkit-1.3.0}/tests/test_probes.py +0 -0
  164. {eval_toolkit-1.1.0 → eval_toolkit-1.3.0}/tests/test_protocol_conformance.py +0 -0
  165. {eval_toolkit-1.1.0 → eval_toolkit-1.3.0}/tests/test_provenance.py +0 -0
  166. {eval_toolkit-1.1.0 → eval_toolkit-1.3.0}/tests/test_public_api.py +0 -0
  167. {eval_toolkit-1.1.0 → eval_toolkit-1.3.0}/tests/test_recall_at_fpr.py +0 -0
  168. {eval_toolkit-1.1.0 → eval_toolkit-1.3.0}/tests/test_reference_equivalence.py +0 -0
  169. {eval_toolkit-1.1.0 → eval_toolkit-1.3.0}/tests/test_reproducibility_integration.py +0 -0
  170. {eval_toolkit-1.1.0 → eval_toolkit-1.3.0}/tests/test_rng.py +0 -0
  171. {eval_toolkit-1.1.0 → eval_toolkit-1.3.0}/tests/test_schemas.py +0 -0
  172. {eval_toolkit-1.1.0 → eval_toolkit-1.3.0}/tests/test_scorecard.py +0 -0
  173. {eval_toolkit-1.1.0 → eval_toolkit-1.3.0}/tests/test_seeds.py +0 -0
  174. {eval_toolkit-1.1.0 → eval_toolkit-1.3.0}/tests/test_splits.py +0 -0
  175. {eval_toolkit-1.1.0 → eval_toolkit-1.3.0}/tests/test_splits_leakage_integration.py +0 -0
  176. {eval_toolkit-1.1.0 → eval_toolkit-1.3.0}/tests/test_splits_props.py +0 -0
  177. {eval_toolkit-1.1.0 → eval_toolkit-1.3.0}/tests/test_stacking.py +0 -0
  178. {eval_toolkit-1.1.0 → eval_toolkit-1.3.0}/tests/test_sweep.py +0 -0
  179. {eval_toolkit-1.1.0 → eval_toolkit-1.3.0}/tests/test_text_dedup.py +0 -0
  180. {eval_toolkit-1.1.0 → eval_toolkit-1.3.0}/tests/test_text_dedup_coverage.py +0 -0
  181. {eval_toolkit-1.1.0 → eval_toolkit-1.3.0}/tests/test_text_dedup_props.py +0 -0
  182. {eval_toolkit-1.1.0 → eval_toolkit-1.3.0}/tests/test_text_dedup_strategies.py +0 -0
  183. {eval_toolkit-1.1.0 → eval_toolkit-1.3.0}/tests/test_thresholds.py +0 -0
  184. {eval_toolkit-1.1.0 → eval_toolkit-1.3.0}/tests/test_thresholds_constant_score.py +0 -0
  185. {eval_toolkit-1.1.0 → eval_toolkit-1.3.0}/tests/test_thresholds_coverage.py +0 -0
  186. {eval_toolkit-1.1.0 → eval_toolkit-1.3.0}/tests/test_thresholds_props.py +0 -0
  187. {eval_toolkit-1.1.0 → eval_toolkit-1.3.0}/tests/test_thresholds_research_grounded.py +0 -0
  188. {eval_toolkit-1.1.0 → eval_toolkit-1.3.0}/tests/test_tokenization_leakage_check.py +0 -0
  189. {eval_toolkit-1.1.0 → eval_toolkit-1.3.0}/tests/test_v09_contracts.py +0 -0
@@ -5,6 +5,255 @@ All notable changes to this project will be documented in this file.
5
5
  The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
6
6
  and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
7
7
 
8
+ ## [1.3.0] — 2026-05-26 — `audit_value_bindings` cross-detector list-grammar pairing rules (closes #81)
9
+
10
+ Tier-1 ADDITIVE per [ADR 0003](docs/source/adr/0003-stability-contract-and-gate3-methodology.md).
11
+ Closes [#81](https://github.com/brandon-behring/eval-toolkit/issues/81)
12
+ — consumer-feedback follow-on after v1.2.0's adoption at
13
+ `prompt-injection-detection-submission@v1.3.12` (4 residual
14
+ warnings, all cross-detector list-grammar or metric-axis
15
+ confusion). Introduces **Layer 3 — pairing rules** as the third
16
+ correctness layer alongside ADR 0005's identity + scope model
17
+ (see new [ADR 0006](docs/source/adr/0006-pairing-rules-for-cross-detector-list-grammar.md)).
18
+
19
+ Consumer-side dogfood result: **4 → 0 warnings**. Combined with
20
+ v1.1.0 + v1.2.0, **100% reduction vs the pre-fix v1.0.5 baseline**
21
+ on the consumer's writeup (95 → 0).
22
+
23
+ ### Added — `audit_value_bindings.py` Layer 3 pairing rules
24
+
25
+ All four rules activate ONLY when `scope="narrative"`. Legacy
26
+ `scope="all"` callers see zero behavior change. No new public
27
+ kwargs; keyword sets are hardcoded module-level `frozenset`
28
+ constants.
29
+
30
+ - **Pattern A — `"for {detector}"` postfix override.** When a
31
+ candidate value is followed (within +50 chars) by `"for
32
+ {detector_alias}"` AND no other value lies between, the
33
+ postfix is authoritative: confirms pairing for this binding
34
+ OR skips if it names a different canonical detector.
35
+ Intervening-value check uses the v1.1.0 exclusion-ranges
36
+ infrastructure (CI brackets like `[0.286, 0.301]` don't count
37
+ as intervening values).
38
+ - **Pattern B — `"{detector}'s"` possessive override.** Same
39
+ mechanics; scans −80 chars before the value. Last possessive
40
+ in the pre-window is authoritative if its end is within 30
41
+ chars of the value start. Catches both immediate `"frozen
42
+ probe's 0.515"` and short-clause `"LoRA's ... AUROC is 0.383"`.
43
+ - **Pattern C — group-subject suppression.** When prose contains
44
+ `"for the {trained|frozen|baseline|all|both|other} detectors"`
45
+ within ±60 chars of the value AND on the same side of any
46
+ sentence boundary, the value is suppressed (it refers to a
47
+ multi-detector group statement that doesn't bind to a single
48
+ canonical detector). Multi-detector inference deferred to v1.4.0+
49
+ per ADR 0006.
50
+ - **Pattern D — metric-axis nearest-pairing.** Symmetric to
51
+ detector-axis pairing. Pre-collects ALL metric positions per
52
+ file (across `metric_aliases` keys, not just binding-derived
53
+ metrics). Requires the NEAREST metric to the value to be THIS
54
+ binding's metric. Catches prose like `"AUPRC delta suggests:
55
+ ... AUROC is 0.383"` where the wider window-based metric
56
+ proximity check picks up the wrong metric.
57
+
58
+ ### Internal changes (no public API impact)
59
+
60
+ - New module-level constants:
61
+ - `_GROUP_SUBJECT_KEYWORDS: frozenset[str]` — group adjectives.
62
+ - `_GROUP_SUBJECT_PATTERN: re.Pattern[str]` — compiled regex
63
+ matching `"for the {kw} detectors?"`.
64
+ - New private helpers:
65
+ - `_build_postfix_pattern(detector_aliases, detector_keys)` —
66
+ per-call regex builder for Pattern A.
67
+ - `_build_possessive_pattern(detector_aliases, detector_keys)` —
68
+ per-call regex builder for Pattern B.
69
+ - `metric_patterns` build extended to use the union of
70
+ binding-derived metrics and `metric_aliases.keys()` so Pattern D can
71
+ pair against unbound-but-aliased metrics.
72
+ - Inner loop reordered to apply C-suppress → Pattern A → Pattern B
73
+ before proximity-based detector pairing. Pattern A/B record a
74
+ `pairing_confirmed_pos` that BYPASSES proximity when the override
75
+ confirms THIS binding's detector.
76
+ - Pattern D added as a separate check after the existing
77
+ metric_close proximity test.
78
+
79
+ ### Dogfood evidence (compounded across the cycle)
80
+
81
+ | Release | Configuration | Warnings on consumer HEAD | Reduction vs v1.0.5 |
82
+ |---|---|---|---|
83
+ | v1.0.5 | Legacy 2-tuple, no scope | 95 | — |
84
+ | v1.1.0 | BindingKey + scope='narrative' content-type | 23 | -76% |
85
+ | v1.2.0 | + T1–T4 context filters | 7 | -93% |
86
+ | **v1.3.0** | + Pattern A/B/C/D pairing rules | **0** | **-100%** |
87
+
88
+ ### Consumer adoption path
89
+
90
+ `prompt-injection-detection-submission` and other consumers using
91
+ `scope="narrative"` get the v1.3.0 pairing rules automatically with
92
+ no code change. Recommended migration:
93
+
94
+ 1. Re-pin `eval-toolkit>=1.3.0,<2` (additive; no consumer code
95
+ change required).
96
+ 2. **HARD-gate promotion is now credible.** With 0 residual
97
+ warnings, `audit_value_bindings` can be promoted from SOFT to
98
+ HARD (failing CI on violations) bundled with
99
+ `audit_citation_alignment` per the consumer's v1.3.8
100
+ bundled-promotion plan.
101
+
102
+ ### Tests
103
+
104
+ 43 in `tests/test_audit_value_bindings.py` (36 from v1.2.0 + 7
105
+ new for Pattern A/B/C/D + unknown-alias fall-through + scope='all'
106
+ backward-compat + combined dogfood). All pass. Public API
107
+ snapshot regenerated for `__version__` bump only (no signature
108
+ changes).
109
+
110
+ ### Out of scope (deferred)
111
+
112
+ - **Multi-detector inference for Pattern C** — replace
113
+ suppression with explicit iteration over implied group
114
+ detectors. ~250 LOC; v1.4.0+ candidate if consumer demand
115
+ emerges.
116
+ - **Enumeration parsing** — `"X scored Y, Z, W for A, B, C
117
+ respectively"` patterns. Not in #81; v1.4.0+ if needed.
118
+ - **Markdown AST parsing** (ADR 0005 §A4) — v2.0 territory.
119
+ - **Public kwargs for pairing-rule keyword extension** — YAGNI;
120
+ add in v1.3.x patch if demand emerges.
121
+
122
+ ## [1.2.0] — 2026-05-26 — `audit_value_bindings` context-aware noise reduction (consumer-feedback follow-on to #80)
123
+
124
+ Tier-1 ADDITIVE per [ADR 0003](docs/source/adr/0003-stability-contract-and-gate3-methodology.md).
125
+ Consumer-feedback follow-on after v1.1.0's adoption at
126
+ `prompt-injection-detection-submission@v1.3.11`. The v1.1.0
127
+ slice-axis fix achieved 62% noise reduction (96 → 36 warnings) on
128
+ the consumer's writeup; the residual 36 were positional-heuristic
129
+ limitations [ADR 0005](docs/source/adr/0005-structured-keys-for-audit-validators.md)
130
+ named as "Future work (deferred)" for v1.2.0+. This release
131
+ addresses 81% of that residual (36 → 7) via four context-aware
132
+ extensions to `scope="narrative"`. Combined with v1.1.0,
133
+ **93% total noise reduction** vs the pre-fix v1.0.5 baseline.
134
+
135
+ ### Added — `audit_value_bindings.py` context-aware narrative filters
136
+
137
+ All four filters activate ONLY when `scope="narrative"`. Legacy
138
+ `scope="all"` callers see zero behavior change (Tier-1 ADDITIVE).
139
+ No new public kwargs; no signature drift; the keyword lists are
140
+ hardcoded module-level `frozenset` constants. Issue [#80](https://github.com/brandon-behring/eval-toolkit/issues/80)'s
141
+ acceptance criterion was ≤5 warnings; v1.2.0 hits 7 (close to the
142
+ target; the remaining 7 are pure cross-detector list-grammar cases
143
+ that require parser-level work — see "Out of scope" below).
144
+
145
+ - **T1: Delta-context filter.** Suppresses values that are
146
+ comparative magnitudes rather than binding claims. Two
147
+ sub-filters:
148
+ - Sign-prefix skip: values immediately preceded by `+` or `-`
149
+ (signed-magnitude markers like `-0.071 AUPRC`,
150
+ `+0.073 lift`) are dropped.
151
+ - Delta-keyword skip: values within 30 chars AFTER a
152
+ delta-marker token are dropped. Looking only before the value
153
+ prevents mis-firing on prose like `"frozen probe's 0.515
154
+ (delta -0.132)"` where the `"delta"` token refers to the
155
+ following `-0.132`, not the preceding `0.515`.
156
+
157
+ Keyword list (`_DELTA_KEYWORDS`, hardcoded frozenset):
158
+ `delta`, `drop`, `drops`, `lift`, `lifts`, `gap`, `margin`,
159
+ `regresses`, `improves`, `beats`, `exceeds`, `trails`,
160
+ `underperforms`, `vs`, `versus`, `below`. Excluded:
161
+ `against`, `above`, `ahead`, `behind` (too ambiguous; common
162
+ comparison prepositions in legitimate binding prose).
163
+
164
+ - **T2: Floor-context filter.** Suppresses values near random-
165
+ baseline / floor mentions. Window is asymmetric (50 chars
166
+ before, 5 chars after) because floor mentions canonically
167
+ precede the value (`"random AUPRC is 0.374"`).
168
+
169
+ Keyword list (`_FLOOR_KEYWORDS`): `random`, `floor`, `chance`,
170
+ `trivial`. Intentionally narrow — `baseline`, `prior`,
171
+ `majority` excluded because they have legitimate non-floor
172
+ senses (`"TF-IDF baseline"`, `"prior work"`). Multi-word
173
+ patterns like `"below the prevalence baseline of 0.374"` are
174
+ caught by T1's `"below"` keyword instead.
175
+
176
+ - **T3: Consume-on-match within sentence.** After a value
177
+ produces a Match for `(detector, metric, slice)`, subsequent
178
+ values for the same canonical binding in the same sentence are
179
+ suppressed. Catches dense multi-detector enumerations like
180
+ `"AUPRC 0.556 vs 0.519"` where the second value is implicitly
181
+ a contrasting detector's binding (cross-detector inference
182
+ remains out of scope per ADR 0005 A4).
183
+
184
+ - **T4: Sentence-boundary detector-pair reject.** When pairing a
185
+ detector mention with a value, if a sentence terminator (`.`,
186
+ `!`, `?`, `\n\n`) lies between them, the pair is rejected.
187
+ Sentence detection uses paragraph-aware abbreviation guarding
188
+ (`vs.`, `e.g.`, `i.e.`, `c.f.`, `etc.`, `cf.`, `fig.`,
189
+ `eq.`, `pp.`, `viz.`, `ca.` excluded; decimal numbers and
190
+ letter-dot-letter patterns also guarded). Single `\n` is a
191
+ soft break (markdown line-wrap, NOT a sentence boundary);
192
+ `\n\n` is hard.
193
+
194
+ ### Internal changes (no public API impact)
195
+
196
+ - `_nearest_canonical_key()` now returns `(key, position)`
197
+ instead of just `key`. The position is needed for T4's
198
+ sentence-boundary check. The slice-pairing call site unpacks
199
+ and discards the position. Private helper; no consumer impact.
200
+ - New private helpers: `_is_sentence_terminator_dot`,
201
+ `_sentence_boundary_positions`, `_sentence_id_of`,
202
+ `_crosses_sentence_boundary`, `_is_signed_value`,
203
+ `_has_keyword_in_window`, `_compile_keyword_pattern`. All
204
+ underscore-prefixed; Tier-3 FREE.
205
+
206
+ ### Dogfood evidence
207
+
208
+ | Configuration | Warnings on `prompt-injection-detection-submission` HEAD | Reduction vs v1.0.5 baseline |
209
+ |---|---|---|
210
+ | v1.0.5 (legacy 2-tuple) | 95 | — |
211
+ | v1.1.0 BindingKey + scope='narrative' (content-type filter only) | 23 | 76% |
212
+ | **v1.2.0 + context filters (this release)** | **7** | **93%** |
213
+
214
+ The 7 v1.2.0 residuals are all cross-detector list constructions
215
+ (e.g., `"0.293 versus 0.364 for the frozen probe and 0.291 for
216
+ TF-IDF + LR"` where the validator can't infer that 0.364 / 0.291
217
+ belong to ProtectAI-v1 and TF-IDF respectively because they're
218
+ introduced by `"and"` / `"for"` without an immediately-preceding
219
+ detector mention). These require true list-grammar parsing
220
+ (rejected for v1.x in ADR 0005 A4) and are tracked for v1.3.0+
221
+ with their own ADR design review.
222
+
223
+ ### Consumer adoption path
224
+
225
+ `prompt-injection-detection-submission` and other consumers using
226
+ `scope="narrative"` get the v1.2.0 filters automatically with no
227
+ code change. Consumers on `scope="all"` (default) continue with
228
+ v1.1.0 behavior. Recommended consumer migration:
229
+
230
+ 1. Re-pin `eval-toolkit>=1.2.0,<2` (additive; no consumer code
231
+ change required).
232
+ 2. HARD-gate promotion is now credible: 7 residual warnings is
233
+ below the actionable threshold; consumer can promote
234
+ `audit_value_bindings` from SOFT to HARD bundled with
235
+ `audit_citation_alignment` per the v1.3.8 plan.
236
+
237
+ ### Tests
238
+
239
+ 36 in `tests/test_audit_value_bindings.py` (28 from v1.1.0 + 8
240
+ new for T1–T4 + sentence-boundary helper unit test). All pass.
241
+ Public API snapshot regenerated for `__version__` bump only (no
242
+ signature changes beyond an inspect-formatting normalization on
243
+ the `validate_reader_value_bindings` `bindings` annotation; same
244
+ type semantically).
245
+
246
+ ### Out of scope (deferred)
247
+
248
+ - **Cross-detector list-grammar parsing** — the 7 residual
249
+ warnings. Requires lookahead context-aware list parsing
250
+ (`"X scored Y vs Z for W and V for U"`). Track as a v1.3.0+
251
+ candidate; needs ADR design before implementation.
252
+ - **Markdown AST parsing** (ADR 0005 A4) — v2.0 territory.
253
+ - **`extra_*_keywords` kwargs** for runtime extension of the
254
+ hardcoded keyword lists — YAGNI for now (consumer's prose is
255
+ covered); add in a v1.2.x patch if concrete demand emerges.
256
+
8
257
  ## [1.1.0] — 2026-05-26 — `audit_value_bindings` slice-aware matching via `BindingKey` (closes #80)
9
258
 
10
259
  Tier-1 ADDITIVE per [ADR 0003](docs/source/adr/0003-stability-contract-and-gate3-methodology.md).
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: eval-toolkit
3
- Version: 1.1.0
3
+ Version: 1.3.0
4
4
  Summary: Reusable evaluation contracts for binary classification: metrics, bootstrap CIs, calibration, artifacts, and evidence gates.
5
5
  Project-URL: Homepage, https://github.com/brandon-behring/eval-toolkit
6
6
  Project-URL: Documentation, https://brandon-behring.github.io/eval-toolkit/
@@ -76,3 +76,6 @@ What would have to change for this decision to be reopened?
76
76
  | [0001](0001-flat-module-layout.md) | Flat single-file modules through v1.x | Accepted | 2026-05-21 |
77
77
  | [0002](0002-scorecard-as-primary-metric-surface.md) | `scorecard()` as the primary v1.0 metric surface | Accepted | 2026-05-21 |
78
78
  | [0003](0003-stability-contract-and-gate3-methodology.md) | v1.0 stability contract + Gate 3 methodology | Accepted | 2026-05-21 |
79
+ | [0004](0004-naming-conventions.md) | Naming conventions for modules, classes, and parameters | Accepted | 2026-05-23 |
80
+ | [0005](0005-structured-keys-for-audit-validators.md) | Structured keys over positional tuples for canonical-identity types in audit validators | Accepted | 2026-05-26 |
81
+ | [0006](0006-pairing-rules-for-cross-detector-list-grammar.md) | Pairing rules for cross-detector list-grammar in audit validators | Accepted | 2026-05-26 |
@@ -2,4 +2,4 @@
2
2
 
3
3
  __all__ = ["__version__"]
4
4
 
5
- __version__ = "1.1.0"
5
+ __version__ = "1.3.0"