eval-toolkit 1.0.5__tar.gz → 1.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (191) hide show
  1. {eval_toolkit-1.0.5 → eval_toolkit-1.2.0}/.gitignore +9 -0
  2. {eval_toolkit-1.0.5 → eval_toolkit-1.2.0}/CHANGELOG.md +293 -0
  3. {eval_toolkit-1.0.5 → eval_toolkit-1.2.0}/PKG-INFO +2 -2
  4. {eval_toolkit-1.0.5 → eval_toolkit-1.2.0}/docs/source/adr/README.md +2 -0
  5. {eval_toolkit-1.0.5 → eval_toolkit-1.2.0}/pyproject.toml +1 -1
  6. {eval_toolkit-1.0.5 → eval_toolkit-1.2.0}/src/eval_toolkit/__init__.py +3 -0
  7. {eval_toolkit-1.0.5 → eval_toolkit-1.2.0}/src/eval_toolkit/_version.py +1 -1
  8. eval_toolkit-1.2.0/src/eval_toolkit/audit_value_bindings.py +1131 -0
  9. {eval_toolkit-1.0.5 → eval_toolkit-1.2.0}/tests/golden/public_api/snapshot.json +12 -3
  10. eval_toolkit-1.2.0/tests/test_audit_value_bindings.py +993 -0
  11. eval_toolkit-1.0.5/src/eval_toolkit/audit_value_bindings.py +0 -448
  12. eval_toolkit-1.0.5/tests/test_audit_value_bindings.py +0 -338
  13. {eval_toolkit-1.0.5 → eval_toolkit-1.2.0}/LICENSE +0 -0
  14. {eval_toolkit-1.0.5 → eval_toolkit-1.2.0}/README.md +0 -0
  15. {eval_toolkit-1.0.5 → eval_toolkit-1.2.0}/STYLE.md +0 -0
  16. {eval_toolkit-1.0.5 → eval_toolkit-1.2.0}/docs/archive/README.md +0 -0
  17. {eval_toolkit-1.0.5 → eval_toolkit-1.2.0}/docs/research/README.md +0 -0
  18. {eval_toolkit-1.0.5 → eval_toolkit-1.2.0}/docs/research/datasets/README.md +0 -0
  19. {eval_toolkit-1.0.5 → eval_toolkit-1.2.0}/docs/research/papers/data-integrity/README.md +0 -0
  20. {eval_toolkit-1.0.5 → eval_toolkit-1.2.0}/docs/research/papers/eval-ecosystem/README.md +0 -0
  21. {eval_toolkit-1.0.5 → eval_toolkit-1.2.0}/docs/research/papers/inference/README.md +0 -0
  22. {eval_toolkit-1.0.5 → eval_toolkit-1.2.0}/docs/research/papers/prompt-injection/README.md +0 -0
  23. {eval_toolkit-1.0.5 → eval_toolkit-1.2.0}/docs/source/methodology/README.md +0 -0
  24. {eval_toolkit-1.0.5 → eval_toolkit-1.2.0}/src/eval_toolkit/__main__.py +0 -0
  25. {eval_toolkit-1.0.5 → eval_toolkit-1.2.0}/src/eval_toolkit/_deprecated.py +0 -0
  26. {eval_toolkit-1.0.5 → eval_toolkit-1.2.0}/src/eval_toolkit/_parallel.py +0 -0
  27. {eval_toolkit-1.0.5 → eval_toolkit-1.2.0}/src/eval_toolkit/_rng.py +0 -0
  28. {eval_toolkit-1.0.5 → eval_toolkit-1.2.0}/src/eval_toolkit/_sweep.py +0 -0
  29. {eval_toolkit-1.0.5 → eval_toolkit-1.2.0}/src/eval_toolkit/adversarial.py +0 -0
  30. {eval_toolkit-1.0.5 → eval_toolkit-1.2.0}/src/eval_toolkit/analysis.py +0 -0
  31. {eval_toolkit-1.0.5 → eval_toolkit-1.2.0}/src/eval_toolkit/artifacts.py +0 -0
  32. {eval_toolkit-1.0.5 → eval_toolkit-1.2.0}/src/eval_toolkit/audit_citation_alignment.py +0 -0
  33. {eval_toolkit-1.0.5 → eval_toolkit-1.2.0}/src/eval_toolkit/audit_sister_doc_concept_drift.py +0 -0
  34. {eval_toolkit-1.0.5 → eval_toolkit-1.2.0}/src/eval_toolkit/bootstrap.py +0 -0
  35. {eval_toolkit-1.0.5 → eval_toolkit-1.2.0}/src/eval_toolkit/calibration.py +0 -0
  36. {eval_toolkit-1.0.5 → eval_toolkit-1.2.0}/src/eval_toolkit/claims.py +0 -0
  37. {eval_toolkit-1.0.5 → eval_toolkit-1.2.0}/src/eval_toolkit/config.py +0 -0
  38. {eval_toolkit-1.0.5 → eval_toolkit-1.2.0}/src/eval_toolkit/docs.py +0 -0
  39. {eval_toolkit-1.0.5 → eval_toolkit-1.2.0}/src/eval_toolkit/embeddings.py +0 -0
  40. {eval_toolkit-1.0.5 → eval_toolkit-1.2.0}/src/eval_toolkit/evidence.py +0 -0
  41. {eval_toolkit-1.0.5 → eval_toolkit-1.2.0}/src/eval_toolkit/harness.py +0 -0
  42. {eval_toolkit-1.0.5 → eval_toolkit-1.2.0}/src/eval_toolkit/leakage.py +0 -0
  43. {eval_toolkit-1.0.5 → eval_toolkit-1.2.0}/src/eval_toolkit/loaders.py +0 -0
  44. {eval_toolkit-1.0.5 → eval_toolkit-1.2.0}/src/eval_toolkit/losses.py +0 -0
  45. {eval_toolkit-1.0.5 → eval_toolkit-1.2.0}/src/eval_toolkit/manifest.py +0 -0
  46. {eval_toolkit-1.0.5 → eval_toolkit-1.2.0}/src/eval_toolkit/metric_specs.py +0 -0
  47. {eval_toolkit-1.0.5 → eval_toolkit-1.2.0}/src/eval_toolkit/metrics.py +0 -0
  48. {eval_toolkit-1.0.5 → eval_toolkit-1.2.0}/src/eval_toolkit/operating_points.py +0 -0
  49. {eval_toolkit-1.0.5 → eval_toolkit-1.2.0}/src/eval_toolkit/paths.py +0 -0
  50. {eval_toolkit-1.0.5 → eval_toolkit-1.2.0}/src/eval_toolkit/plotting.py +0 -0
  51. {eval_toolkit-1.0.5 → eval_toolkit-1.2.0}/src/eval_toolkit/preprocessing.py +0 -0
  52. {eval_toolkit-1.0.5 → eval_toolkit-1.2.0}/src/eval_toolkit/probes.py +0 -0
  53. {eval_toolkit-1.0.5 → eval_toolkit-1.2.0}/src/eval_toolkit/protocols.py +0 -0
  54. {eval_toolkit-1.0.5 → eval_toolkit-1.2.0}/src/eval_toolkit/provenance.py +0 -0
  55. {eval_toolkit-1.0.5 → eval_toolkit-1.2.0}/src/eval_toolkit/py.typed +0 -0
  56. {eval_toolkit-1.0.5 → eval_toolkit-1.2.0}/src/eval_toolkit/schemas/manifest.v1.json +0 -0
  57. {eval_toolkit-1.0.5 → eval_toolkit-1.2.0}/src/eval_toolkit/schemas/manifest.v2.json +0 -0
  58. {eval_toolkit-1.0.5 → eval_toolkit-1.2.0}/src/eval_toolkit/schemas/manifest.v3.json +0 -0
  59. {eval_toolkit-1.0.5 → eval_toolkit-1.2.0}/src/eval_toolkit/schemas/ood_manifest.v1.json +0 -0
  60. {eval_toolkit-1.0.5 → eval_toolkit-1.2.0}/src/eval_toolkit/schemas/results.v1.json +0 -0
  61. {eval_toolkit-1.0.5 → eval_toolkit-1.2.0}/src/eval_toolkit/schemas/results_full.v1.json +0 -0
  62. {eval_toolkit-1.0.5 → eval_toolkit-1.2.0}/src/eval_toolkit/scorecards.py +0 -0
  63. {eval_toolkit-1.0.5 → eval_toolkit-1.2.0}/src/eval_toolkit/seeds.py +0 -0
  64. {eval_toolkit-1.0.5 → eval_toolkit-1.2.0}/src/eval_toolkit/splits.py +0 -0
  65. {eval_toolkit-1.0.5 → eval_toolkit-1.2.0}/src/eval_toolkit/stacking.py +0 -0
  66. {eval_toolkit-1.0.5 → eval_toolkit-1.2.0}/src/eval_toolkit/text_dedup.py +0 -0
  67. {eval_toolkit-1.0.5 → eval_toolkit-1.2.0}/src/eval_toolkit/thresholds.py +0 -0
  68. {eval_toolkit-1.0.5 → eval_toolkit-1.2.0}/tests/baseline/test_plotting_visual/plot_bootstrap_distribution.png +0 -0
  69. {eval_toolkit-1.0.5 → eval_toolkit-1.2.0}/tests/baseline/test_plotting_visual/plot_confusion_matrix_grid.png +0 -0
  70. {eval_toolkit-1.0.5 → eval_toolkit-1.2.0}/tests/baseline/test_plotting_visual/plot_lift_ci.png +0 -0
  71. {eval_toolkit-1.0.5 → eval_toolkit-1.2.0}/tests/baseline/test_plotting_visual/plot_metric_bars.png +0 -0
  72. {eval_toolkit-1.0.5 → eval_toolkit-1.2.0}/tests/baseline/test_plotting_visual/plot_pareto_frontier.png +0 -0
  73. {eval_toolkit-1.0.5 → eval_toolkit-1.2.0}/tests/baseline/test_plotting_visual/plot_pr_curve.png +0 -0
  74. {eval_toolkit-1.0.5 → eval_toolkit-1.2.0}/tests/baseline/test_plotting_visual/plot_reliability_diagram.png +0 -0
  75. {eval_toolkit-1.0.5 → eval_toolkit-1.2.0}/tests/baseline/test_plotting_visual/plot_roc_curve.png +0 -0
  76. {eval_toolkit-1.0.5 → eval_toolkit-1.2.0}/tests/baseline/test_plotting_visual/plot_score_histograms.png +0 -0
  77. {eval_toolkit-1.0.5 → eval_toolkit-1.2.0}/tests/baseline/test_plotting_visual/plot_slice_metric_heatmap.png +0 -0
  78. {eval_toolkit-1.0.5 → eval_toolkit-1.2.0}/tests/benchmarks/__init__.py +0 -0
  79. {eval_toolkit-1.0.5 → eval_toolkit-1.2.0}/tests/benchmarks/test_kernel_benchmarks.py +0 -0
  80. {eval_toolkit-1.0.5 → eval_toolkit-1.2.0}/tests/conftest.py +0 -0
  81. {eval_toolkit-1.0.5 → eval_toolkit-1.2.0}/tests/golden/bootstrap_ci/cases.json +0 -0
  82. {eval_toolkit-1.0.5 → eval_toolkit-1.2.0}/tests/golden/data/dedup_holdout.jsonl +0 -0
  83. {eval_toolkit-1.0.5 → eval_toolkit-1.2.0}/tests/golden/data/dedup_holdout_expected.json +0 -0
  84. {eval_toolkit-1.0.5 → eval_toolkit-1.2.0}/tests/golden/data/dedup_holdout_provenance.md +0 -0
  85. {eval_toolkit-1.0.5 → eval_toolkit-1.2.0}/tests/golden/docs/expected.md +0 -0
  86. {eval_toolkit-1.0.5 → eval_toolkit-1.2.0}/tests/golden/docs/input.md +0 -0
  87. {eval_toolkit-1.0.5 → eval_toolkit-1.2.0}/tests/golden/docs/metrics.json +0 -0
  88. {eval_toolkit-1.0.5 → eval_toolkit-1.2.0}/tests/golden/test_dedup_holdout_calibration.py +0 -0
  89. {eval_toolkit-1.0.5 → eval_toolkit-1.2.0}/tests/strategies.py +0 -0
  90. {eval_toolkit-1.0.5 → eval_toolkit-1.2.0}/tests/test_adversarial.py +0 -0
  91. {eval_toolkit-1.0.5 → eval_toolkit-1.2.0}/tests/test_analysis.py +0 -0
  92. {eval_toolkit-1.0.5 → eval_toolkit-1.2.0}/tests/test_artifacts.py +0 -0
  93. {eval_toolkit-1.0.5 → eval_toolkit-1.2.0}/tests/test_audit_citation_alignment.py +0 -0
  94. {eval_toolkit-1.0.5 → eval_toolkit-1.2.0}/tests/test_audit_sister_doc_concept_drift.py +0 -0
  95. {eval_toolkit-1.0.5 → eval_toolkit-1.2.0}/tests/test_block_bootstrap_on_folds.py +0 -0
  96. {eval_toolkit-1.0.5 → eval_toolkit-1.2.0}/tests/test_bootstrap_calibration_mc.py +0 -0
  97. {eval_toolkit-1.0.5 → eval_toolkit-1.2.0}/tests/test_bootstrap_edge_cases.py +0 -0
  98. {eval_toolkit-1.0.5 → eval_toolkit-1.2.0}/tests/test_bootstrap_golden.py +0 -0
  99. {eval_toolkit-1.0.5 → eval_toolkit-1.2.0}/tests/test_bootstrap_njobs.py +0 -0
  100. {eval_toolkit-1.0.5 → eval_toolkit-1.2.0}/tests/test_bootstrap_props.py +0 -0
  101. {eval_toolkit-1.0.5 → eval_toolkit-1.2.0}/tests/test_bootstrap_research_grounded.py +0 -0
  102. {eval_toolkit-1.0.5 → eval_toolkit-1.2.0}/tests/test_bootstrap_unit.py +0 -0
  103. {eval_toolkit-1.0.5 → eval_toolkit-1.2.0}/tests/test_calibration_binary_adapters.py +0 -0
  104. {eval_toolkit-1.0.5 → eval_toolkit-1.2.0}/tests/test_calibration_bootstrap_chain.py +0 -0
  105. {eval_toolkit-1.0.5 → eval_toolkit-1.2.0}/tests/test_calibration_determinism.py +0 -0
  106. {eval_toolkit-1.0.5 → eval_toolkit-1.2.0}/tests/test_calibration_optimization_failures.py +0 -0
  107. {eval_toolkit-1.0.5 → eval_toolkit-1.2.0}/tests/test_calibration_props.py +0 -0
  108. {eval_toolkit-1.0.5 → eval_toolkit-1.2.0}/tests/test_calibration_research_grounded.py +0 -0
  109. {eval_toolkit-1.0.5 → eval_toolkit-1.2.0}/tests/test_calibration_unit.py +0 -0
  110. {eval_toolkit-1.0.5 → eval_toolkit-1.2.0}/tests/test_claims.py +0 -0
  111. {eval_toolkit-1.0.5 → eval_toolkit-1.2.0}/tests/test_claims_coverage.py +0 -0
  112. {eval_toolkit-1.0.5 → eval_toolkit-1.2.0}/tests/test_claims_props.py +0 -0
  113. {eval_toolkit-1.0.5 → eval_toolkit-1.2.0}/tests/test_cli.py +0 -0
  114. {eval_toolkit-1.0.5 → eval_toolkit-1.2.0}/tests/test_config.py +0 -0
  115. {eval_toolkit-1.0.5 → eval_toolkit-1.2.0}/tests/test_coverage_bootstrap.py +0 -0
  116. {eval_toolkit-1.0.5 → eval_toolkit-1.2.0}/tests/test_coverage_calibration.py +0 -0
  117. {eval_toolkit-1.0.5 → eval_toolkit-1.2.0}/tests/test_coverage_harness.py +0 -0
  118. {eval_toolkit-1.0.5 → eval_toolkit-1.2.0}/tests/test_coverage_metrics.py +0 -0
  119. {eval_toolkit-1.0.5 → eval_toolkit-1.2.0}/tests/test_coverage_plotting.py +0 -0
  120. {eval_toolkit-1.0.5 → eval_toolkit-1.2.0}/tests/test_croissant_e2e.py +0 -0
  121. {eval_toolkit-1.0.5 → eval_toolkit-1.2.0}/tests/test_dedup_split_leakage_chain.py +0 -0
  122. {eval_toolkit-1.0.5 → eval_toolkit-1.2.0}/tests/test_deprecated_scalars_shim.py +0 -0
  123. {eval_toolkit-1.0.5 → eval_toolkit-1.2.0}/tests/test_deprecations.py +0 -0
  124. {eval_toolkit-1.0.5 → eval_toolkit-1.2.0}/tests/test_docs_golden.py +0 -0
  125. {eval_toolkit-1.0.5 → eval_toolkit-1.2.0}/tests/test_docs_props.py +0 -0
  126. {eval_toolkit-1.0.5 → eval_toolkit-1.2.0}/tests/test_embeddings.py +0 -0
  127. {eval_toolkit-1.0.5 → eval_toolkit-1.2.0}/tests/test_evidence_validators.py +0 -0
  128. {eval_toolkit-1.0.5 → eval_toolkit-1.2.0}/tests/test_harness_edge_cases.py +0 -0
  129. {eval_toolkit-1.0.5 → eval_toolkit-1.2.0}/tests/test_harness_fault_injection.py +0 -0
  130. {eval_toolkit-1.0.5 → eval_toolkit-1.2.0}/tests/test_harness_folded.py +0 -0
  131. {eval_toolkit-1.0.5 → eval_toolkit-1.2.0}/tests/test_harness_internals.py +0 -0
  132. {eval_toolkit-1.0.5 → eval_toolkit-1.2.0}/tests/test_harness_metric_options.py +0 -0
  133. {eval_toolkit-1.0.5 → eval_toolkit-1.2.0}/tests/test_harness_parallelism.py +0 -0
  134. {eval_toolkit-1.0.5 → eval_toolkit-1.2.0}/tests/test_harness_smoke.py +0 -0
  135. {eval_toolkit-1.0.5 → eval_toolkit-1.2.0}/tests/test_import_boundaries.py +0 -0
  136. {eval_toolkit-1.0.5 → eval_toolkit-1.2.0}/tests/test_is_metric_defined_for_slice.py +0 -0
  137. {eval_toolkit-1.0.5 → eval_toolkit-1.2.0}/tests/test_lazy_extras_messages.py +0 -0
  138. {eval_toolkit-1.0.5 → eval_toolkit-1.2.0}/tests/test_leakage.py +0 -0
  139. {eval_toolkit-1.0.5 → eval_toolkit-1.2.0}/tests/test_leakage_error_paths.py +0 -0
  140. {eval_toolkit-1.0.5 → eval_toolkit-1.2.0}/tests/test_leakage_props.py +0 -0
  141. {eval_toolkit-1.0.5 → eval_toolkit-1.2.0}/tests/test_loaders.py +0 -0
  142. {eval_toolkit-1.0.5 → eval_toolkit-1.2.0}/tests/test_loaders_coverage.py +0 -0
  143. {eval_toolkit-1.0.5 → eval_toolkit-1.2.0}/tests/test_loaders_props.py +0 -0
  144. {eval_toolkit-1.0.5 → eval_toolkit-1.2.0}/tests/test_logging.py +0 -0
  145. {eval_toolkit-1.0.5 → eval_toolkit-1.2.0}/tests/test_losses.py +0 -0
  146. {eval_toolkit-1.0.5 → eval_toolkit-1.2.0}/tests/test_manifest.py +0 -0
  147. {eval_toolkit-1.0.5 → eval_toolkit-1.2.0}/tests/test_manifest_contamination_round_trip.py +0 -0
  148. {eval_toolkit-1.0.5 → eval_toolkit-1.2.0}/tests/test_manifest_props.py +0 -0
  149. {eval_toolkit-1.0.5 → eval_toolkit-1.2.0}/tests/test_manifest_validation.py +0 -0
  150. {eval_toolkit-1.0.5 → eval_toolkit-1.2.0}/tests/test_metrics_props.py +0 -0
  151. {eval_toolkit-1.0.5 → eval_toolkit-1.2.0}/tests/test_metrics_stratified_subsets.py +0 -0
  152. {eval_toolkit-1.0.5 → eval_toolkit-1.2.0}/tests/test_metrics_unit.py +0 -0
  153. {eval_toolkit-1.0.5 → eval_toolkit-1.2.0}/tests/test_misc_coverage.py +0 -0
  154. {eval_toolkit-1.0.5 → eval_toolkit-1.2.0}/tests/test_numeric_edge_cases.py +0 -0
  155. {eval_toolkit-1.0.5 → eval_toolkit-1.2.0}/tests/test_ood_loader.py +0 -0
  156. {eval_toolkit-1.0.5 → eval_toolkit-1.2.0}/tests/test_operating_points.py +0 -0
  157. {eval_toolkit-1.0.5 → eval_toolkit-1.2.0}/tests/test_operating_points_props.py +0 -0
  158. {eval_toolkit-1.0.5 → eval_toolkit-1.2.0}/tests/test_parallel.py +0 -0
  159. {eval_toolkit-1.0.5 → eval_toolkit-1.2.0}/tests/test_paths.py +0 -0
  160. {eval_toolkit-1.0.5 → eval_toolkit-1.2.0}/tests/test_pipeline_e2e.py +0 -0
  161. {eval_toolkit-1.0.5 → eval_toolkit-1.2.0}/tests/test_plotting_edge.py +0 -0
  162. {eval_toolkit-1.0.5 → eval_toolkit-1.2.0}/tests/test_plotting_smoke.py +0 -0
  163. {eval_toolkit-1.0.5 → eval_toolkit-1.2.0}/tests/test_plotting_visual.py +0 -0
  164. {eval_toolkit-1.0.5 → eval_toolkit-1.2.0}/tests/test_preprocessing.py +0 -0
  165. {eval_toolkit-1.0.5 → eval_toolkit-1.2.0}/tests/test_probes.py +0 -0
  166. {eval_toolkit-1.0.5 → eval_toolkit-1.2.0}/tests/test_protocol_conformance.py +0 -0
  167. {eval_toolkit-1.0.5 → eval_toolkit-1.2.0}/tests/test_provenance.py +0 -0
  168. {eval_toolkit-1.0.5 → eval_toolkit-1.2.0}/tests/test_public_api.py +0 -0
  169. {eval_toolkit-1.0.5 → eval_toolkit-1.2.0}/tests/test_recall_at_fpr.py +0 -0
  170. {eval_toolkit-1.0.5 → eval_toolkit-1.2.0}/tests/test_reference_equivalence.py +0 -0
  171. {eval_toolkit-1.0.5 → eval_toolkit-1.2.0}/tests/test_reproducibility_integration.py +0 -0
  172. {eval_toolkit-1.0.5 → eval_toolkit-1.2.0}/tests/test_rng.py +0 -0
  173. {eval_toolkit-1.0.5 → eval_toolkit-1.2.0}/tests/test_schemas.py +0 -0
  174. {eval_toolkit-1.0.5 → eval_toolkit-1.2.0}/tests/test_scorecard.py +0 -0
  175. {eval_toolkit-1.0.5 → eval_toolkit-1.2.0}/tests/test_seeds.py +0 -0
  176. {eval_toolkit-1.0.5 → eval_toolkit-1.2.0}/tests/test_splits.py +0 -0
  177. {eval_toolkit-1.0.5 → eval_toolkit-1.2.0}/tests/test_splits_leakage_integration.py +0 -0
  178. {eval_toolkit-1.0.5 → eval_toolkit-1.2.0}/tests/test_splits_props.py +0 -0
  179. {eval_toolkit-1.0.5 → eval_toolkit-1.2.0}/tests/test_stacking.py +0 -0
  180. {eval_toolkit-1.0.5 → eval_toolkit-1.2.0}/tests/test_sweep.py +0 -0
  181. {eval_toolkit-1.0.5 → eval_toolkit-1.2.0}/tests/test_text_dedup.py +0 -0
  182. {eval_toolkit-1.0.5 → eval_toolkit-1.2.0}/tests/test_text_dedup_coverage.py +0 -0
  183. {eval_toolkit-1.0.5 → eval_toolkit-1.2.0}/tests/test_text_dedup_props.py +0 -0
  184. {eval_toolkit-1.0.5 → eval_toolkit-1.2.0}/tests/test_text_dedup_strategies.py +0 -0
  185. {eval_toolkit-1.0.5 → eval_toolkit-1.2.0}/tests/test_thresholds.py +0 -0
  186. {eval_toolkit-1.0.5 → eval_toolkit-1.2.0}/tests/test_thresholds_constant_score.py +0 -0
  187. {eval_toolkit-1.0.5 → eval_toolkit-1.2.0}/tests/test_thresholds_coverage.py +0 -0
  188. {eval_toolkit-1.0.5 → eval_toolkit-1.2.0}/tests/test_thresholds_props.py +0 -0
  189. {eval_toolkit-1.0.5 → eval_toolkit-1.2.0}/tests/test_thresholds_research_grounded.py +0 -0
  190. {eval_toolkit-1.0.5 → eval_toolkit-1.2.0}/tests/test_tokenization_leakage_check.py +0 -0
  191. {eval_toolkit-1.0.5 → eval_toolkit-1.2.0}/tests/test_v09_contracts.py +0 -0
@@ -62,6 +62,15 @@ gemini-microaudit-*.md
62
62
  audit-gemini.md
63
63
  comprehensive-audit-codex.md
64
64
  audit-verification-*.md
65
+ # v1.1.0 (#80 cycle): broaden codex-comprehensive-audit-* to cover
66
+ # non-suffixed briefing variants (e.g., codex-comprehensive-audit-v0.50.0.md);
67
+ # the earlier pattern only covered the *-report.md form.
68
+ codex-comprehensive-audit-*.md
69
+
70
+ # Local scratch directory: ad-hoc dogfood scripts, pre-tag validation
71
+ # runs, draft PR bodies, audit prompts. Intentionally untracked.
72
+ # Contents have historical value but are not part of any release.
73
+ .scratch/
65
74
 
66
75
  # Claude Code project settings (machine-local)
67
76
  .claude/
@@ -5,6 +5,299 @@ All notable changes to this project will be documented in this file.
5
5
  The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
6
6
  and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
7
7
 
8
+ ## [1.2.0] — 2026-05-26 — `audit_value_bindings` context-aware noise reduction (consumer-feedback follow-on to #80)
9
+
10
+ Tier-1 ADDITIVE per [ADR 0003](docs/source/adr/0003-stability-contract-and-gate3-methodology.md).
11
+ Consumer-feedback follow-on after v1.1.0's adoption at
12
+ `prompt-injection-detection-submission@v1.3.11`. The v1.1.0
13
+ slice-axis fix achieved 62% noise reduction (96 → 36 warnings) on
14
+ the consumer's writeup; the residual 36 were positional-heuristic
15
+ limitations [ADR 0005](docs/source/adr/0005-structured-keys-for-audit-validators.md)
16
+ named as "Future work (deferred)" for v1.2.0+. This release
17
+ addresses 81% of that residual (36 → 7) via four context-aware
18
+ extensions to `scope="narrative"`. Combined with v1.1.0,
19
+ **93% total noise reduction** vs the pre-fix v1.0.5 baseline.
20
+
21
+ ### Added — `audit_value_bindings.py` context-aware narrative filters
22
+
23
+ All four filters activate ONLY when `scope="narrative"`. Legacy
24
+ `scope="all"` callers see zero behavior change (Tier-1 ADDITIVE).
25
+ No new public kwargs; no signature drift; the keyword lists are
26
+ hardcoded module-level `frozenset` constants. Issue [#80](https://github.com/brandon-behring/eval-toolkit/issues/80)'s
27
+ acceptance criterion was ≤5 warnings; v1.2.0 hits 7 (close to the
28
+ target; the remaining 7 are pure cross-detector list-grammar cases
29
+ that require parser-level work — see "Out of scope" below).
30
+
31
+ - **T1: Delta-context filter.** Suppresses values that are
32
+ comparative magnitudes rather than binding claims. Two
33
+ sub-filters:
34
+ - Sign-prefix skip: values immediately preceded by `+` or `-`
35
+ (signed-magnitude markers like `-0.071 AUPRC`,
36
+ `+0.073 lift`) are dropped.
37
+ - Delta-keyword skip: values within 30 chars AFTER a
38
+ delta-marker token are dropped. The before-only window
39
+ prevents mis-firing on prose like `"frozen probe's 0.515
40
+ (delta -0.132)"` where the `"delta"` token refers to the
41
+ following `-0.132`, not the preceding `0.515`.
42
+
43
+ Keyword list (`_DELTA_KEYWORDS`, hardcoded frozenset):
44
+ `delta`, `drop`, `drops`, `lift`, `lifts`, `gap`, `margin`,
45
+ `regresses`, `improves`, `beats`, `exceeds`, `trails`,
46
+ `underperforms`, `vs`, `versus`, `below`. Excluded:
47
+ `against`, `above`, `ahead`, `behind` (too ambiguous; common
48
+ comparison prepositions in legitimate binding prose).
49
+
50
+ - **T2: Floor-context filter.** Suppresses values near random-
51
+ baseline / floor mentions. Window is asymmetric (50 chars
52
+ before, 5 chars after) because floor mentions canonically
53
+ precede the value (`"random AUPRC is 0.374"`).
54
+
55
+ Keyword list (`_FLOOR_KEYWORDS`): `random`, `floor`, `chance`,
56
+ `trivial`. Intentionally narrow — `baseline`, `prior`,
57
+ `majority` excluded because they have legitimate non-floor
58
+ senses (`"TF-IDF baseline"`, `"prior work"`). Multi-word
59
+ patterns like `"below the prevalence baseline of 0.374"` are
60
+ caught by T1's `"below"` keyword instead.
61
+
62
+ - **T3: Consume-on-match within sentence.** After a value
63
+ produces a Match for `(detector, metric, slice)`, subsequent
64
+ values for the same canonical binding in the same sentence are
65
+ suppressed. Catches dense multi-detector enumerations like
66
+ `"AUPRC 0.556 vs 0.519"` where the second value is implicitly
67
+ a contrasting detector's binding (cross-detector inference
68
+ remains out of scope per ADR 0005 A4).
69
+
70
+ - **T4: Sentence-boundary detector-pair reject.** When pairing a
71
+ detector mention with a value, if a sentence terminator (`.`,
72
+ `!`, `?`, `\n\n`) lies between them, the pair is rejected.
73
+ Sentence detection uses paragraph-aware abbreviation guarding
74
+ (`vs.`, `e.g.`, `i.e.`, `c.f.`, `etc.`, `cf.`, `fig.`,
75
+ `eq.`, `pp.`, `viz.`, `ca.` excluded; decimal numbers and
76
+ letter-dot-letter patterns also guarded). Single `\n` is a
77
+ soft break (markdown line-wrap, NOT a sentence boundary);
78
+ `\n\n` is hard.
79
+
80
+ ### Internal changes (no public API impact)
81
+
82
+ - `_nearest_canonical_key()` now returns `(key, position)`
83
+ instead of just `key`. The position is needed for T4's
84
+ sentence-boundary check. The slice-pairing call site unpacks
85
+ and discards the position. Private helper; no consumer impact.
86
+ - New private helpers: `_is_sentence_terminator_dot`,
87
+ `_sentence_boundary_positions`, `_sentence_id_of`,
88
+ `_crosses_sentence_boundary`, `_is_signed_value`,
89
+ `_has_keyword_in_window`, `_compile_keyword_pattern`. All
90
+ underscore-prefixed; Tier-3 FREE.
91
+
92
+ ### Dogfood evidence
93
+
94
+ | Configuration | Warnings on `prompt-injection-detection-submission` HEAD | Reduction vs v1.0.5 baseline |
95
+ |---|---|---|
96
+ | v1.0.5 (legacy 2-tuple) | 95 | — |
97
+ | v1.1.0 BindingKey + scope='narrative' (content-type filter only) | 23 | 76% |
98
+ | **v1.2.0 + context filters (this release)** | **7** | **93%** |
99
+
100
+ The 7 v1.2.0 residuals are all cross-detector list constructions
101
+ (e.g., `"0.293 versus 0.364 for the frozen probe and 0.291 for
102
+ TF-IDF + LR"` where the validator can't infer that 0.364 / 0.291
103
+ belong to ProtectAI-v1 and TF-IDF respectively because they're
104
+ introduced by `"and"` / `"for"` without an immediately-preceding
105
+ detector mention). These require true list-grammar parsing
106
+ (rejected for v1.x in ADR 0005 A4) and are tracked for v1.3.0+
107
+ with their own ADR design review.
108
+
109
+ ### Consumer adoption path
110
+
111
+ `prompt-injection-detection-submission` and other consumers using
112
+ `scope="narrative"` get the v1.2.0 filters automatically with no
113
+ code change. Consumers on `scope="all"` (default) continue with
114
+ v1.1.0 behavior. Recommended consumer migration:
115
+
116
+ 1. Re-pin `eval-toolkit>=1.2.0,<2` (additive; no consumer code
117
+ change required).
118
+ 2. HARD-gate promotion is now credible: 7 residual warnings is
119
+ below the actionable threshold; consumer can promote
120
+ `audit_value_bindings` from SOFT to HARD bundled with
121
+ `audit_citation_alignment` per the v1.3.8 plan.
122
+
123
+ ### Tests
124
+
125
+ 36 in `tests/test_audit_value_bindings.py` (28 from v1.1.0 + 8
126
+ new for T1–T4 + sentence-boundary helper unit test). All pass.
127
+ Public API snapshot regenerated for `__version__` bump only (no
128
+ signature changes beyond an inspect-formatting normalization on
129
+ the `validate_reader_value_bindings` `bindings` annotation; same
130
+ type semantically).
131
+
132
+ ### Out of scope (deferred)
133
+
134
+ - **Cross-detector list-grammar parsing** — the 7 residual
135
+ warnings. Requires lookahead context-aware list parsing
136
+ (`"X scored Y vs Z for W and V for U"`). Track as a v1.3.0+
137
+ candidate; needs ADR design before implementation.
138
+ - **Markdown AST parsing** (ADR 0005 A4) — v2.0 territory.
139
+ - **`extra_*_keywords` kwargs** for runtime extension of the
140
+ hardcoded keyword lists — YAGNI for now (consumer's prose is
141
+ covered); add in a v1.2.x patch if concrete demand emerges.
142
+
143
+ ## [1.1.0] — 2026-05-26 — `audit_value_bindings` slice-aware matching via `BindingKey` (closes #80)
144
+
145
+ Tier-1 ADDITIVE per [ADR 0003](docs/source/adr/0003-stability-contract-and-gate3-methodology.md).
146
+ Closes [#80](https://github.com/brandon-behring/eval-toolkit/issues/80)
147
+ — consumer-feedback structural fix surfaced by
148
+ `brandon-behring/prompt-injection-detection-prototype@v1.3.9` (96
149
+ warnings, ~95 false positives) where the pre-v1.1 2-tuple
150
+ `(detector, metric)` canonical-binding identity could not
151
+ disambiguate the same `(detector, metric)` across multiple slices
152
+ (`direct_validation`, `pooled_ood`, paired-delta cells,
153
+ random-floor mentions).
154
+
155
+ Pending [ADR 0005](docs/source/adr/0005-structured-keys-for-audit-validators.md)
156
+ codifies the underlying rule: "structured keys over positional
157
+ tuples for canonical-identity types in audit validators."
158
+
159
+ ### Added
160
+
161
+ - **`BindingKey`** (new public class, exported via the lazy
162
+ `_EXPORTS` resolver). Frozen dataclass with fields
163
+ `(detector: str, metric: str, slice: str = "any")`. Forward-
164
+ extensible: future identity axes (split, ci_kind, source_ref, ...)
165
+ can be added as defaulted fields without breaking the dict-key
166
+ schema. Avoids the recur-every-N-months schema-event pattern that
167
+ produced #80.
168
+ - **`validate_reader_value_bindings(bindings=...)`** now accepts
169
+ three input shapes, normalized internally to `dict[BindingKey,
170
+ float]` via a per-key `_normalize_binding_key` adapter:
171
+ 1. Canonical: `BindingKey(detector=..., metric=..., slice=...)`
172
+ (recommended for new consumer code).
173
+ 2. Sugar 3-tuple: `(detector, metric, slice)` (concise dict
174
+ literal; issue #80's proposed schema).
175
+ 3. Legacy 2-tuple: `(detector, metric)` (preserved; treated as
176
+ `slice="any"`). All pre-v1.1 consumer code continues to work
177
+ unchanged.
178
+ Mixed key shapes in a single dict are supported. Invalid key
179
+ shapes raise `TypeError` at the function boundary (loud failure,
180
+ not silent zero-match drift).
181
+ - **New optional kwargs** on `validate_reader_value_bindings`:
182
+ - `slice_aliases: Mapping[str, Sequence[str]] | None = None` —
183
+ canonical-slice-name → regex-alternatives mapping, mirroring
184
+ `detector_aliases` / `metric_aliases`. Used when at least one
185
+ `BindingKey` has `slice != "any"`.
186
+ - `slice_window_chars: int = 240` — character window for slice
187
+ disambiguation (≈ 50 tokens at ~5 chars/token).
188
+ - **Slice disambiguation** in the matching loop. When a binding
189
+ key has `slice != "any"`, the candidate value is paired with
190
+ the nearest slice mention (same last-before-first-after rule as
191
+ detector pairing). If no slice mention falls within
192
+ `slice_window_chars`, the triple is suppressed (warn-only;
193
+ counted in `unmatched_slice_count`). If the paired slice
194
+ differs from the binding's slice, the triple is silently skipped
195
+ (handled by the binding for the correct slice on its own loop
196
+ iteration).
197
+ - **`ValueBindingsReport.unmatched_slice_count: int = 0`** — new
198
+ field surfacing the warn-only signal. Default `0` means full
199
+ backward compatibility for code that constructs reports manually.
200
+ - **`scope: Literal["all", "narrative"] = "all"`** — content-type
201
+ filter on the validator. Default `"all"` preserves legacy v1.0.x
202
+ behavior. Setting `scope="narrative"` excludes from matching:
203
+ - Markdown table rows (lines starting with `|`)
204
+ - Bracketed expressions `[...]` (CI bounds, ranges)
205
+ - Fenced code blocks (triple-backtick)
206
+
207
+ This addresses the broader category of false positives that the
208
+ slice-axis fix alone could not (CI bounds being matched as point
209
+ estimates, table-cell metrics being cross-flagged, code-block
210
+ literals being treated as claims). Compatible with the
211
+ motivating misbinding bug class (V1.3.1 ADR-080) which was in
212
+ narrative prose — no recall loss.
213
+
214
+ Combined dogfood result on `prompt-injection-detection-submission`
215
+ HEAD: **76% noise reduction** (95 warnings at v1.0.5 2-tuple
216
+ baseline → 23 with v1.1.0 BindingKey + `scope="narrative"`). The
217
+ remaining 23 are positional-heuristic limitations (random-floor
218
+ mis-attribution across sentence boundaries; cross-detector
219
+ pairing in dense prose) not addressable without parser-level
220
+ work (future v1.2.0+ scope).
221
+
222
+ ### Changed (Tier-1 ADDITIVE — no breaking changes)
223
+
224
+ - `_nearest_detector_key()` private helper renamed to
225
+ `_nearest_canonical_key()` (used now for both detector and slice
226
+ pairing; the body was already generic). Implementation detail;
227
+ no public API impact.
228
+ - `pyproject.toml` `Development Status` classifier bumped
229
+ `4 - Beta` → `5 - Production/Stable` (post-v1.0 hygiene).
230
+ - `.gitignore` now covers `.scratch/` and the broader
231
+ `codex-comprehensive-audit-*.md` pattern (was previously only
232
+ `*-report.md`).
233
+
234
+ ### Deprecation policy
235
+
236
+ All three `bindings` input shapes remain accepted **indefinitely**
237
+ through the v1.x line. `BindingKey` is the canonical/recommended
238
+ shape per ADR 0005 + docstring guidance; tuples remain valid
239
+ syntactic sugar with no `DeprecationWarning`. Formal deprecation
240
+ deferred to a future v2.0 cleanup pass when there is concrete
241
+ payoff. Consumers can migrate slot-by-slot at their own pace, or
242
+ not at all (legacy 2-tuple semantics survive as `slice="any"`).
243
+
244
+ ### Consumer adoption path
245
+
246
+ `prompt-injection-detection-submission` consumer-side script at
247
+ `scripts/audit_value_bindings.py` can adopt by either:
248
+ - Replacing the 2-tuple `BINDINGS` literal with 3-tuple keys
249
+ (smallest diff; issue body's proposal), OR
250
+ - Migrating to `BindingKey(...)` for forward-extensibility (new
251
+ identity axes won't require re-touching the script).
252
+
253
+ ### Validator design philosophy (introduced this release)
254
+
255
+ This release introduces a two-layer correctness model for audit
256
+ validators, formalized in pending ADR 0005:
257
+
258
+ 1. **Identity correctness** — canonical measurements have
259
+ structured identity (`BindingKey`), not positional tuples.
260
+ Future axes added as defaulted fields without breaking the
261
+ schema.
262
+ 2. **Scope correctness** — the validator should only scan content
263
+ that is plausibly a binding claim. Narrative prose is.
264
+ Markdown tables, CI brackets, and code blocks aren't.
265
+ `scope="narrative"` is the v1.1.0 implementation of this rule;
266
+ it matches the lint-design convention from `ruff`/`mypy`/`bandit`
267
+ (scope predicates aren't optional in production-quality
268
+ linters).
269
+
270
+ Both layers are now in place. Consumers writing typical research
271
+ writeup prose with dense statistical tables should adopt
272
+ `scope="narrative"` for a ~80% noise reduction relative to v1.0.5.
273
+
274
+ ### Known v1.1.0 limitations (residual after slice + narrative fixes)
275
+
276
+ The remaining ~20% of v1.0.5 baseline noise is positional-
277
+ heuristic limitations that the slice + narrative fixes cannot
278
+ address:
279
+
280
+ 1. **"Random floor" / sub-clause values across sentence
281
+ boundaries** — prose like "X scored 0.291. The pooled OOD
282
+ random floor is 0.374" pairs 0.374 with the nearest preceding
283
+ detector (X), because the validator doesn't treat `.` as a
284
+ pairing boundary.
285
+ 2. **Multi-detector cross-pairing in dense prose** — prose like
286
+ "LoRA scored 0.293 [0.286, 0.301] versus 0.364 for the frozen
287
+ probe and 0.291 for TF-IDF + LR" pairs values with
288
+ the text-order-nearest detector, which over-credits the second
289
+ detector in a list construction.
290
+
291
+ Future work (v1.2.0+ candidate): sentence-boundary awareness
292
+ (respecting `.` / `\n` as pairing boundaries) + better
293
+ multi-detector list parsing.
294
+
295
+ HARD-gate promotion of the consumer-side validator at v1.3.10+
296
+ becomes credible at the ~80% reduction level achieved here.
297
+ Remaining false positives can be suppressed via consumer-side
298
+ filtering (e.g., excluding lines containing "random floor" or
299
+ "versus") or accepted as known low-frequency noise.
300
+
8
301
  ## [1.0.5] — 2026-05-26 — publish workflow hardening (infrastructure-only)
9
302
 
10
303
  Tier-3 / infrastructure-only release. **No library code or public API
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: eval-toolkit
3
- Version: 1.0.5
3
+ Version: 1.2.0
4
4
  Summary: Reusable evaluation contracts for binary classification: metrics, bootstrap CIs, calibration, artifacts, and evidence gates.
5
5
  Project-URL: Homepage, https://github.com/brandon-behring/eval-toolkit
6
6
  Project-URL: Documentation, https://brandon-behring.github.io/eval-toolkit/
@@ -11,7 +11,7 @@ Author: Brandon Behring
11
11
  License-Expression: MIT
12
12
  License-File: LICENSE
13
13
  Keywords: binary-classification,bootstrap,calibration,evaluation,machine-learning,metrics
14
- Classifier: Development Status :: 4 - Beta
14
+ Classifier: Development Status :: 5 - Production/Stable
15
15
  Classifier: Intended Audience :: Developers
16
16
  Classifier: Intended Audience :: Science/Research
17
17
  Classifier: License :: OSI Approved :: MIT License
@@ -76,3 +76,5 @@ What would have to change for this decision to be reopened?
76
76
  | [0001](0001-flat-module-layout.md) | Flat single-file modules through v1.x | Accepted | 2026-05-21 |
77
77
  | [0002](0002-scorecard-as-primary-metric-surface.md) | `scorecard()` as the primary v1.0 metric surface | Accepted | 2026-05-21 |
78
78
  | [0003](0003-stability-contract-and-gate3-methodology.md) | v1.0 stability contract + Gate 3 methodology | Accepted | 2026-05-21 |
79
+ | [0004](0004-naming-conventions.md) | Naming conventions for modules, classes, and parameters | Accepted | 2026-05-23 |
80
+ | [0005](0005-structured-keys-for-audit-validators.md) | Structured keys over positional tuples for canonical-identity types in audit validators | Accepted | 2026-05-26 |
@@ -21,7 +21,7 @@ keywords = [
21
21
  "binary-classification",
22
22
  ]
23
23
  classifiers = [
24
- "Development Status :: 4 - Beta",
24
+ "Development Status :: 5 - Production/Stable",
25
25
  "Intended Audience :: Developers",
26
26
  "Intended Audience :: Science/Research",
27
27
  "License :: OSI Approved :: MIT License",
@@ -63,6 +63,9 @@ _EXPORTS: dict[str, str] = {
63
63
  # --- audit_value_bindings ---
64
64
  # Flat-module per ADR 0001. Closes #71. Motivated by consumer V1.3.1
65
65
  # ADR-080 audit-fix finding (TF-IDF / LoRA 0.974 value mis-binding).
66
+ # BindingKey + slice-aware matching added v1.1.0 (closes #80;
67
+ # consumer-feedback structural fix per ADR 0005).
68
+ "BindingKey": "eval_toolkit.audit_value_bindings",
66
69
  "Match": "eval_toolkit.audit_value_bindings",
67
70
  "ValueBindingsReport": "eval_toolkit.audit_value_bindings",
68
71
  "Violation": "eval_toolkit.audit_value_bindings",
@@ -2,4 +2,4 @@
2
2
 
3
3
  __all__ = ["__version__"]
4
4
 
5
- __version__ = "1.0.5"
5
+ __version__ = "1.2.0"