eval-toolkit 1.0.4__tar.gz → 1.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (191) hide show
  1. {eval_toolkit-1.0.4 → eval_toolkit-1.1.0}/.gitignore +9 -0
  2. {eval_toolkit-1.0.4 → eval_toolkit-1.1.0}/CHANGELOG.md +205 -0
  3. {eval_toolkit-1.0.4 → eval_toolkit-1.1.0}/PKG-INFO +2 -2
  4. {eval_toolkit-1.0.4 → eval_toolkit-1.1.0}/pyproject.toml +1 -1
  5. {eval_toolkit-1.0.4 → eval_toolkit-1.1.0}/src/eval_toolkit/__init__.py +3 -0
  6. {eval_toolkit-1.0.4 → eval_toolkit-1.1.0}/src/eval_toolkit/_version.py +1 -1
  7. eval_toolkit-1.1.0/src/eval_toolkit/audit_value_bindings.py +818 -0
  8. {eval_toolkit-1.0.4 → eval_toolkit-1.1.0}/tests/golden/public_api/snapshot.json +12 -3
  9. eval_toolkit-1.1.0/tests/test_audit_value_bindings.py +725 -0
  10. eval_toolkit-1.0.4/src/eval_toolkit/audit_value_bindings.py +0 -448
  11. eval_toolkit-1.0.4/tests/test_audit_value_bindings.py +0 -338
  12. {eval_toolkit-1.0.4 → eval_toolkit-1.1.0}/LICENSE +0 -0
  13. {eval_toolkit-1.0.4 → eval_toolkit-1.1.0}/README.md +0 -0
  14. {eval_toolkit-1.0.4 → eval_toolkit-1.1.0}/STYLE.md +0 -0
  15. {eval_toolkit-1.0.4 → eval_toolkit-1.1.0}/docs/archive/README.md +0 -0
  16. {eval_toolkit-1.0.4 → eval_toolkit-1.1.0}/docs/research/README.md +0 -0
  17. {eval_toolkit-1.0.4 → eval_toolkit-1.1.0}/docs/research/datasets/README.md +0 -0
  18. {eval_toolkit-1.0.4 → eval_toolkit-1.1.0}/docs/research/papers/data-integrity/README.md +0 -0
  19. {eval_toolkit-1.0.4 → eval_toolkit-1.1.0}/docs/research/papers/eval-ecosystem/README.md +0 -0
  20. {eval_toolkit-1.0.4 → eval_toolkit-1.1.0}/docs/research/papers/inference/README.md +0 -0
  21. {eval_toolkit-1.0.4 → eval_toolkit-1.1.0}/docs/research/papers/prompt-injection/README.md +0 -0
  22. {eval_toolkit-1.0.4 → eval_toolkit-1.1.0}/docs/source/adr/README.md +0 -0
  23. {eval_toolkit-1.0.4 → eval_toolkit-1.1.0}/docs/source/methodology/README.md +0 -0
  24. {eval_toolkit-1.0.4 → eval_toolkit-1.1.0}/src/eval_toolkit/__main__.py +0 -0
  25. {eval_toolkit-1.0.4 → eval_toolkit-1.1.0}/src/eval_toolkit/_deprecated.py +0 -0
  26. {eval_toolkit-1.0.4 → eval_toolkit-1.1.0}/src/eval_toolkit/_parallel.py +0 -0
  27. {eval_toolkit-1.0.4 → eval_toolkit-1.1.0}/src/eval_toolkit/_rng.py +0 -0
  28. {eval_toolkit-1.0.4 → eval_toolkit-1.1.0}/src/eval_toolkit/_sweep.py +0 -0
  29. {eval_toolkit-1.0.4 → eval_toolkit-1.1.0}/src/eval_toolkit/adversarial.py +0 -0
  30. {eval_toolkit-1.0.4 → eval_toolkit-1.1.0}/src/eval_toolkit/analysis.py +0 -0
  31. {eval_toolkit-1.0.4 → eval_toolkit-1.1.0}/src/eval_toolkit/artifacts.py +0 -0
  32. {eval_toolkit-1.0.4 → eval_toolkit-1.1.0}/src/eval_toolkit/audit_citation_alignment.py +0 -0
  33. {eval_toolkit-1.0.4 → eval_toolkit-1.1.0}/src/eval_toolkit/audit_sister_doc_concept_drift.py +0 -0
  34. {eval_toolkit-1.0.4 → eval_toolkit-1.1.0}/src/eval_toolkit/bootstrap.py +0 -0
  35. {eval_toolkit-1.0.4 → eval_toolkit-1.1.0}/src/eval_toolkit/calibration.py +0 -0
  36. {eval_toolkit-1.0.4 → eval_toolkit-1.1.0}/src/eval_toolkit/claims.py +0 -0
  37. {eval_toolkit-1.0.4 → eval_toolkit-1.1.0}/src/eval_toolkit/config.py +0 -0
  38. {eval_toolkit-1.0.4 → eval_toolkit-1.1.0}/src/eval_toolkit/docs.py +0 -0
  39. {eval_toolkit-1.0.4 → eval_toolkit-1.1.0}/src/eval_toolkit/embeddings.py +0 -0
  40. {eval_toolkit-1.0.4 → eval_toolkit-1.1.0}/src/eval_toolkit/evidence.py +0 -0
  41. {eval_toolkit-1.0.4 → eval_toolkit-1.1.0}/src/eval_toolkit/harness.py +0 -0
  42. {eval_toolkit-1.0.4 → eval_toolkit-1.1.0}/src/eval_toolkit/leakage.py +0 -0
  43. {eval_toolkit-1.0.4 → eval_toolkit-1.1.0}/src/eval_toolkit/loaders.py +0 -0
  44. {eval_toolkit-1.0.4 → eval_toolkit-1.1.0}/src/eval_toolkit/losses.py +0 -0
  45. {eval_toolkit-1.0.4 → eval_toolkit-1.1.0}/src/eval_toolkit/manifest.py +0 -0
  46. {eval_toolkit-1.0.4 → eval_toolkit-1.1.0}/src/eval_toolkit/metric_specs.py +0 -0
  47. {eval_toolkit-1.0.4 → eval_toolkit-1.1.0}/src/eval_toolkit/metrics.py +0 -0
  48. {eval_toolkit-1.0.4 → eval_toolkit-1.1.0}/src/eval_toolkit/operating_points.py +0 -0
  49. {eval_toolkit-1.0.4 → eval_toolkit-1.1.0}/src/eval_toolkit/paths.py +0 -0
  50. {eval_toolkit-1.0.4 → eval_toolkit-1.1.0}/src/eval_toolkit/plotting.py +0 -0
  51. {eval_toolkit-1.0.4 → eval_toolkit-1.1.0}/src/eval_toolkit/preprocessing.py +0 -0
  52. {eval_toolkit-1.0.4 → eval_toolkit-1.1.0}/src/eval_toolkit/probes.py +0 -0
  53. {eval_toolkit-1.0.4 → eval_toolkit-1.1.0}/src/eval_toolkit/protocols.py +0 -0
  54. {eval_toolkit-1.0.4 → eval_toolkit-1.1.0}/src/eval_toolkit/provenance.py +0 -0
  55. {eval_toolkit-1.0.4 → eval_toolkit-1.1.0}/src/eval_toolkit/py.typed +0 -0
  56. {eval_toolkit-1.0.4 → eval_toolkit-1.1.0}/src/eval_toolkit/schemas/manifest.v1.json +0 -0
  57. {eval_toolkit-1.0.4 → eval_toolkit-1.1.0}/src/eval_toolkit/schemas/manifest.v2.json +0 -0
  58. {eval_toolkit-1.0.4 → eval_toolkit-1.1.0}/src/eval_toolkit/schemas/manifest.v3.json +0 -0
  59. {eval_toolkit-1.0.4 → eval_toolkit-1.1.0}/src/eval_toolkit/schemas/ood_manifest.v1.json +0 -0
  60. {eval_toolkit-1.0.4 → eval_toolkit-1.1.0}/src/eval_toolkit/schemas/results.v1.json +0 -0
  61. {eval_toolkit-1.0.4 → eval_toolkit-1.1.0}/src/eval_toolkit/schemas/results_full.v1.json +0 -0
  62. {eval_toolkit-1.0.4 → eval_toolkit-1.1.0}/src/eval_toolkit/scorecards.py +0 -0
  63. {eval_toolkit-1.0.4 → eval_toolkit-1.1.0}/src/eval_toolkit/seeds.py +0 -0
  64. {eval_toolkit-1.0.4 → eval_toolkit-1.1.0}/src/eval_toolkit/splits.py +0 -0
  65. {eval_toolkit-1.0.4 → eval_toolkit-1.1.0}/src/eval_toolkit/stacking.py +0 -0
  66. {eval_toolkit-1.0.4 → eval_toolkit-1.1.0}/src/eval_toolkit/text_dedup.py +0 -0
  67. {eval_toolkit-1.0.4 → eval_toolkit-1.1.0}/src/eval_toolkit/thresholds.py +0 -0
  68. {eval_toolkit-1.0.4 → eval_toolkit-1.1.0}/tests/baseline/test_plotting_visual/plot_bootstrap_distribution.png +0 -0
  69. {eval_toolkit-1.0.4 → eval_toolkit-1.1.0}/tests/baseline/test_plotting_visual/plot_confusion_matrix_grid.png +0 -0
  70. {eval_toolkit-1.0.4 → eval_toolkit-1.1.0}/tests/baseline/test_plotting_visual/plot_lift_ci.png +0 -0
  71. {eval_toolkit-1.0.4 → eval_toolkit-1.1.0}/tests/baseline/test_plotting_visual/plot_metric_bars.png +0 -0
  72. {eval_toolkit-1.0.4 → eval_toolkit-1.1.0}/tests/baseline/test_plotting_visual/plot_pareto_frontier.png +0 -0
  73. {eval_toolkit-1.0.4 → eval_toolkit-1.1.0}/tests/baseline/test_plotting_visual/plot_pr_curve.png +0 -0
  74. {eval_toolkit-1.0.4 → eval_toolkit-1.1.0}/tests/baseline/test_plotting_visual/plot_reliability_diagram.png +0 -0
  75. {eval_toolkit-1.0.4 → eval_toolkit-1.1.0}/tests/baseline/test_plotting_visual/plot_roc_curve.png +0 -0
  76. {eval_toolkit-1.0.4 → eval_toolkit-1.1.0}/tests/baseline/test_plotting_visual/plot_score_histograms.png +0 -0
  77. {eval_toolkit-1.0.4 → eval_toolkit-1.1.0}/tests/baseline/test_plotting_visual/plot_slice_metric_heatmap.png +0 -0
  78. {eval_toolkit-1.0.4 → eval_toolkit-1.1.0}/tests/benchmarks/__init__.py +0 -0
  79. {eval_toolkit-1.0.4 → eval_toolkit-1.1.0}/tests/benchmarks/test_kernel_benchmarks.py +0 -0
  80. {eval_toolkit-1.0.4 → eval_toolkit-1.1.0}/tests/conftest.py +0 -0
  81. {eval_toolkit-1.0.4 → eval_toolkit-1.1.0}/tests/golden/bootstrap_ci/cases.json +0 -0
  82. {eval_toolkit-1.0.4 → eval_toolkit-1.1.0}/tests/golden/data/dedup_holdout.jsonl +0 -0
  83. {eval_toolkit-1.0.4 → eval_toolkit-1.1.0}/tests/golden/data/dedup_holdout_expected.json +0 -0
  84. {eval_toolkit-1.0.4 → eval_toolkit-1.1.0}/tests/golden/data/dedup_holdout_provenance.md +0 -0
  85. {eval_toolkit-1.0.4 → eval_toolkit-1.1.0}/tests/golden/docs/expected.md +0 -0
  86. {eval_toolkit-1.0.4 → eval_toolkit-1.1.0}/tests/golden/docs/input.md +0 -0
  87. {eval_toolkit-1.0.4 → eval_toolkit-1.1.0}/tests/golden/docs/metrics.json +0 -0
  88. {eval_toolkit-1.0.4 → eval_toolkit-1.1.0}/tests/golden/test_dedup_holdout_calibration.py +0 -0
  89. {eval_toolkit-1.0.4 → eval_toolkit-1.1.0}/tests/strategies.py +0 -0
  90. {eval_toolkit-1.0.4 → eval_toolkit-1.1.0}/tests/test_adversarial.py +0 -0
  91. {eval_toolkit-1.0.4 → eval_toolkit-1.1.0}/tests/test_analysis.py +0 -0
  92. {eval_toolkit-1.0.4 → eval_toolkit-1.1.0}/tests/test_artifacts.py +0 -0
  93. {eval_toolkit-1.0.4 → eval_toolkit-1.1.0}/tests/test_audit_citation_alignment.py +0 -0
  94. {eval_toolkit-1.0.4 → eval_toolkit-1.1.0}/tests/test_audit_sister_doc_concept_drift.py +0 -0
  95. {eval_toolkit-1.0.4 → eval_toolkit-1.1.0}/tests/test_block_bootstrap_on_folds.py +0 -0
  96. {eval_toolkit-1.0.4 → eval_toolkit-1.1.0}/tests/test_bootstrap_calibration_mc.py +0 -0
  97. {eval_toolkit-1.0.4 → eval_toolkit-1.1.0}/tests/test_bootstrap_edge_cases.py +0 -0
  98. {eval_toolkit-1.0.4 → eval_toolkit-1.1.0}/tests/test_bootstrap_golden.py +0 -0
  99. {eval_toolkit-1.0.4 → eval_toolkit-1.1.0}/tests/test_bootstrap_njobs.py +0 -0
  100. {eval_toolkit-1.0.4 → eval_toolkit-1.1.0}/tests/test_bootstrap_props.py +0 -0
  101. {eval_toolkit-1.0.4 → eval_toolkit-1.1.0}/tests/test_bootstrap_research_grounded.py +0 -0
  102. {eval_toolkit-1.0.4 → eval_toolkit-1.1.0}/tests/test_bootstrap_unit.py +0 -0
  103. {eval_toolkit-1.0.4 → eval_toolkit-1.1.0}/tests/test_calibration_binary_adapters.py +0 -0
  104. {eval_toolkit-1.0.4 → eval_toolkit-1.1.0}/tests/test_calibration_bootstrap_chain.py +0 -0
  105. {eval_toolkit-1.0.4 → eval_toolkit-1.1.0}/tests/test_calibration_determinism.py +0 -0
  106. {eval_toolkit-1.0.4 → eval_toolkit-1.1.0}/tests/test_calibration_optimization_failures.py +0 -0
  107. {eval_toolkit-1.0.4 → eval_toolkit-1.1.0}/tests/test_calibration_props.py +0 -0
  108. {eval_toolkit-1.0.4 → eval_toolkit-1.1.0}/tests/test_calibration_research_grounded.py +0 -0
  109. {eval_toolkit-1.0.4 → eval_toolkit-1.1.0}/tests/test_calibration_unit.py +0 -0
  110. {eval_toolkit-1.0.4 → eval_toolkit-1.1.0}/tests/test_claims.py +0 -0
  111. {eval_toolkit-1.0.4 → eval_toolkit-1.1.0}/tests/test_claims_coverage.py +0 -0
  112. {eval_toolkit-1.0.4 → eval_toolkit-1.1.0}/tests/test_claims_props.py +0 -0
  113. {eval_toolkit-1.0.4 → eval_toolkit-1.1.0}/tests/test_cli.py +0 -0
  114. {eval_toolkit-1.0.4 → eval_toolkit-1.1.0}/tests/test_config.py +0 -0
  115. {eval_toolkit-1.0.4 → eval_toolkit-1.1.0}/tests/test_coverage_bootstrap.py +0 -0
  116. {eval_toolkit-1.0.4 → eval_toolkit-1.1.0}/tests/test_coverage_calibration.py +0 -0
  117. {eval_toolkit-1.0.4 → eval_toolkit-1.1.0}/tests/test_coverage_harness.py +0 -0
  118. {eval_toolkit-1.0.4 → eval_toolkit-1.1.0}/tests/test_coverage_metrics.py +0 -0
  119. {eval_toolkit-1.0.4 → eval_toolkit-1.1.0}/tests/test_coverage_plotting.py +0 -0
  120. {eval_toolkit-1.0.4 → eval_toolkit-1.1.0}/tests/test_croissant_e2e.py +0 -0
  121. {eval_toolkit-1.0.4 → eval_toolkit-1.1.0}/tests/test_dedup_split_leakage_chain.py +0 -0
  122. {eval_toolkit-1.0.4 → eval_toolkit-1.1.0}/tests/test_deprecated_scalars_shim.py +0 -0
  123. {eval_toolkit-1.0.4 → eval_toolkit-1.1.0}/tests/test_deprecations.py +0 -0
  124. {eval_toolkit-1.0.4 → eval_toolkit-1.1.0}/tests/test_docs_golden.py +0 -0
  125. {eval_toolkit-1.0.4 → eval_toolkit-1.1.0}/tests/test_docs_props.py +0 -0
  126. {eval_toolkit-1.0.4 → eval_toolkit-1.1.0}/tests/test_embeddings.py +0 -0
  127. {eval_toolkit-1.0.4 → eval_toolkit-1.1.0}/tests/test_evidence_validators.py +0 -0
  128. {eval_toolkit-1.0.4 → eval_toolkit-1.1.0}/tests/test_harness_edge_cases.py +0 -0
  129. {eval_toolkit-1.0.4 → eval_toolkit-1.1.0}/tests/test_harness_fault_injection.py +0 -0
  130. {eval_toolkit-1.0.4 → eval_toolkit-1.1.0}/tests/test_harness_folded.py +0 -0
  131. {eval_toolkit-1.0.4 → eval_toolkit-1.1.0}/tests/test_harness_internals.py +0 -0
  132. {eval_toolkit-1.0.4 → eval_toolkit-1.1.0}/tests/test_harness_metric_options.py +0 -0
  133. {eval_toolkit-1.0.4 → eval_toolkit-1.1.0}/tests/test_harness_parallelism.py +0 -0
  134. {eval_toolkit-1.0.4 → eval_toolkit-1.1.0}/tests/test_harness_smoke.py +0 -0
  135. {eval_toolkit-1.0.4 → eval_toolkit-1.1.0}/tests/test_import_boundaries.py +0 -0
  136. {eval_toolkit-1.0.4 → eval_toolkit-1.1.0}/tests/test_is_metric_defined_for_slice.py +0 -0
  137. {eval_toolkit-1.0.4 → eval_toolkit-1.1.0}/tests/test_lazy_extras_messages.py +0 -0
  138. {eval_toolkit-1.0.4 → eval_toolkit-1.1.0}/tests/test_leakage.py +0 -0
  139. {eval_toolkit-1.0.4 → eval_toolkit-1.1.0}/tests/test_leakage_error_paths.py +0 -0
  140. {eval_toolkit-1.0.4 → eval_toolkit-1.1.0}/tests/test_leakage_props.py +0 -0
  141. {eval_toolkit-1.0.4 → eval_toolkit-1.1.0}/tests/test_loaders.py +0 -0
  142. {eval_toolkit-1.0.4 → eval_toolkit-1.1.0}/tests/test_loaders_coverage.py +0 -0
  143. {eval_toolkit-1.0.4 → eval_toolkit-1.1.0}/tests/test_loaders_props.py +0 -0
  144. {eval_toolkit-1.0.4 → eval_toolkit-1.1.0}/tests/test_logging.py +0 -0
  145. {eval_toolkit-1.0.4 → eval_toolkit-1.1.0}/tests/test_losses.py +0 -0
  146. {eval_toolkit-1.0.4 → eval_toolkit-1.1.0}/tests/test_manifest.py +0 -0
  147. {eval_toolkit-1.0.4 → eval_toolkit-1.1.0}/tests/test_manifest_contamination_round_trip.py +0 -0
  148. {eval_toolkit-1.0.4 → eval_toolkit-1.1.0}/tests/test_manifest_props.py +0 -0
  149. {eval_toolkit-1.0.4 → eval_toolkit-1.1.0}/tests/test_manifest_validation.py +0 -0
  150. {eval_toolkit-1.0.4 → eval_toolkit-1.1.0}/tests/test_metrics_props.py +0 -0
  151. {eval_toolkit-1.0.4 → eval_toolkit-1.1.0}/tests/test_metrics_stratified_subsets.py +0 -0
  152. {eval_toolkit-1.0.4 → eval_toolkit-1.1.0}/tests/test_metrics_unit.py +0 -0
  153. {eval_toolkit-1.0.4 → eval_toolkit-1.1.0}/tests/test_misc_coverage.py +0 -0
  154. {eval_toolkit-1.0.4 → eval_toolkit-1.1.0}/tests/test_numeric_edge_cases.py +0 -0
  155. {eval_toolkit-1.0.4 → eval_toolkit-1.1.0}/tests/test_ood_loader.py +0 -0
  156. {eval_toolkit-1.0.4 → eval_toolkit-1.1.0}/tests/test_operating_points.py +0 -0
  157. {eval_toolkit-1.0.4 → eval_toolkit-1.1.0}/tests/test_operating_points_props.py +0 -0
  158. {eval_toolkit-1.0.4 → eval_toolkit-1.1.0}/tests/test_parallel.py +0 -0
  159. {eval_toolkit-1.0.4 → eval_toolkit-1.1.0}/tests/test_paths.py +0 -0
  160. {eval_toolkit-1.0.4 → eval_toolkit-1.1.0}/tests/test_pipeline_e2e.py +0 -0
  161. {eval_toolkit-1.0.4 → eval_toolkit-1.1.0}/tests/test_plotting_edge.py +0 -0
  162. {eval_toolkit-1.0.4 → eval_toolkit-1.1.0}/tests/test_plotting_smoke.py +0 -0
  163. {eval_toolkit-1.0.4 → eval_toolkit-1.1.0}/tests/test_plotting_visual.py +0 -0
  164. {eval_toolkit-1.0.4 → eval_toolkit-1.1.0}/tests/test_preprocessing.py +0 -0
  165. {eval_toolkit-1.0.4 → eval_toolkit-1.1.0}/tests/test_probes.py +0 -0
  166. {eval_toolkit-1.0.4 → eval_toolkit-1.1.0}/tests/test_protocol_conformance.py +0 -0
  167. {eval_toolkit-1.0.4 → eval_toolkit-1.1.0}/tests/test_provenance.py +0 -0
  168. {eval_toolkit-1.0.4 → eval_toolkit-1.1.0}/tests/test_public_api.py +0 -0
  169. {eval_toolkit-1.0.4 → eval_toolkit-1.1.0}/tests/test_recall_at_fpr.py +0 -0
  170. {eval_toolkit-1.0.4 → eval_toolkit-1.1.0}/tests/test_reference_equivalence.py +0 -0
  171. {eval_toolkit-1.0.4 → eval_toolkit-1.1.0}/tests/test_reproducibility_integration.py +0 -0
  172. {eval_toolkit-1.0.4 → eval_toolkit-1.1.0}/tests/test_rng.py +0 -0
  173. {eval_toolkit-1.0.4 → eval_toolkit-1.1.0}/tests/test_schemas.py +0 -0
  174. {eval_toolkit-1.0.4 → eval_toolkit-1.1.0}/tests/test_scorecard.py +0 -0
  175. {eval_toolkit-1.0.4 → eval_toolkit-1.1.0}/tests/test_seeds.py +0 -0
  176. {eval_toolkit-1.0.4 → eval_toolkit-1.1.0}/tests/test_splits.py +0 -0
  177. {eval_toolkit-1.0.4 → eval_toolkit-1.1.0}/tests/test_splits_leakage_integration.py +0 -0
  178. {eval_toolkit-1.0.4 → eval_toolkit-1.1.0}/tests/test_splits_props.py +0 -0
  179. {eval_toolkit-1.0.4 → eval_toolkit-1.1.0}/tests/test_stacking.py +0 -0
  180. {eval_toolkit-1.0.4 → eval_toolkit-1.1.0}/tests/test_sweep.py +0 -0
  181. {eval_toolkit-1.0.4 → eval_toolkit-1.1.0}/tests/test_text_dedup.py +0 -0
  182. {eval_toolkit-1.0.4 → eval_toolkit-1.1.0}/tests/test_text_dedup_coverage.py +0 -0
  183. {eval_toolkit-1.0.4 → eval_toolkit-1.1.0}/tests/test_text_dedup_props.py +0 -0
  184. {eval_toolkit-1.0.4 → eval_toolkit-1.1.0}/tests/test_text_dedup_strategies.py +0 -0
  185. {eval_toolkit-1.0.4 → eval_toolkit-1.1.0}/tests/test_thresholds.py +0 -0
  186. {eval_toolkit-1.0.4 → eval_toolkit-1.1.0}/tests/test_thresholds_constant_score.py +0 -0
  187. {eval_toolkit-1.0.4 → eval_toolkit-1.1.0}/tests/test_thresholds_coverage.py +0 -0
  188. {eval_toolkit-1.0.4 → eval_toolkit-1.1.0}/tests/test_thresholds_props.py +0 -0
  189. {eval_toolkit-1.0.4 → eval_toolkit-1.1.0}/tests/test_thresholds_research_grounded.py +0 -0
  190. {eval_toolkit-1.0.4 → eval_toolkit-1.1.0}/tests/test_tokenization_leakage_check.py +0 -0
  191. {eval_toolkit-1.0.4 → eval_toolkit-1.1.0}/tests/test_v09_contracts.py +0 -0
@@ -62,6 +62,15 @@ gemini-microaudit-*.md
62
62
  audit-gemini.md
63
63
  comprehensive-audit-codex.md
64
64
  audit-verification-*.md
65
+ # v1.1.0 (#80 cycle): broaden codex-comprehensive-audit-* to cover
66
+ # non-suffixed briefing variants (e.g., codex-comprehensive-audit-v0.50.0.md);
67
+ # the earlier pattern only covered the *-report.md form.
68
+ codex-comprehensive-audit-*.md
69
+
70
+ # Local scratch directory: ad-hoc dogfood scripts, pre-tag validation
71
+ # runs, draft PR bodies, audit prompts. Intentionally untracked.
72
+ # Contents have historical value but are not part of any release.
73
+ .scratch/
65
74
 
66
75
  # Claude Code project settings (machine-local)
67
76
  .claude/
@@ -5,6 +5,211 @@ All notable changes to this project will be documented in this file.
5
5
  The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
6
6
  and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
7
7
 
8
+ ## [1.1.0] — 2026-05-26 — `audit_value_bindings` slice-aware matching via `BindingKey` (closes #80)
9
+
10
+ Tier-1 ADDITIVE per [ADR 0003](docs/source/adr/0003-stability-contract-and-gate3-methodology.md).
11
+ Closes [#80](https://github.com/brandon-behring/eval-toolkit/issues/80)
12
+ — consumer-feedback structural fix surfaced by
13
+ `brandon-behring/prompt-injection-detection-prototype@v1.3.9` (96
14
+ warnings, ~95 false positives) where the pre-v1.1 2-tuple
15
+ `(detector, metric)` canonical-binding identity could not
16
+ disambiguate the same `(detector, metric)` across multiple slices
17
+ (`direct_validation`, `pooled_ood`, paired-delta cells,
18
+ random-floor mentions).
19
+
20
+ Pending [ADR 0005](docs/source/adr/0005-structured-keys-for-audit-validators.md)
21
+ codifies the underlying rule: "structured keys over positional
22
+ tuples for canonical-identity types in audit validators."
23
+
24
+ ### Added
25
+
26
+ - **`BindingKey`** (new public class, exported via the lazy
27
+ `_EXPORTS` resolver). Frozen dataclass with fields
28
+ `(detector: str, metric: str, slice: str = "any")`. Forward-
29
+ extensible: future identity axes (split, ci_kind, source_ref, ...)
30
+ can be added as defaulted fields without breaking the dict-key
31
+ schema. Avoids the recur-every-N-months schema-event pattern that
32
+ produced #80.
33
+ - **`validate_reader_value_bindings(bindings=...)`** now accepts
34
+ three input shapes, normalized internally to `dict[BindingKey,
35
+ float]` via a per-key `_normalize_binding_key` adapter:
36
+ 1. Canonical: `BindingKey(detector=..., metric=..., slice=...)`
37
+ (recommended for new consumer code).
38
+ 2. Sugar 3-tuple: `(detector, metric, slice)` (concise dict
39
+ literal; issue #80's proposed schema).
40
+ 3. Legacy 2-tuple: `(detector, metric)` (preserved; treated as
41
+ `slice="any"`). All pre-v1.1 consumer code continues to work
42
+ unchanged.
43
+ Mixed key shapes in a single dict are supported. Invalid key
44
+ shapes raise `TypeError` at the function boundary (loud failure,
45
+ not silent zero-match drift).
46
+ - **New optional kwargs** on `validate_reader_value_bindings`:
47
+ - `slice_aliases: Mapping[str, Sequence[str]] | None = None` —
48
+ canonical-slice-name → regex-alternatives mapping, mirroring
49
+ `detector_aliases` / `metric_aliases`. Used when at least one
50
+ `BindingKey` has `slice != "any"`.
51
+ - `slice_window_chars: int = 240` — character window for slice
52
+ disambiguation (≈ 50 tokens at ~5 chars/token).
53
+ - **Slice disambiguation** in the matching loop. When a binding
54
+ key has `slice != "any"`, the candidate value is paired with
55
+ the nearest slice mention (same last-before-first-after rule as
56
+ detector pairing). If no slice mention falls within
57
+ `slice_window_chars`, the triple is suppressed (warn-only;
58
+ counted in `unmatched_slice_count`). If the paired slice
59
+ differs from the binding's slice, the triple is silently skipped
60
+ (handled by the binding for the correct slice on its own loop
61
+ iteration).
62
+ - **`ValueBindingsReport.unmatched_slice_count: int = 0`** — new
63
+ field surfacing the warn-only signal. Default `0` means full
64
+ backward compatibility for code that constructs reports manually.
65
+ - **`scope: Literal["all", "narrative"] = "all"`** — content-type
66
+ filter on the validator. Default `"all"` preserves legacy v1.0.x
67
+ behavior. Setting `scope="narrative"` excludes from matching:
68
+ - Markdown table rows (lines starting with `|`)
69
+ - Bracketed expressions `[...]` (CI bounds, ranges)
70
+ - Fenced code blocks (triple-backtick)
71
+
72
+ This addresses the broader category of false positives that the
73
+ slice-axis fix alone could not (CI bounds being matched as point
74
+ estimates, table-cell metrics being cross-flagged, code-block
75
+ literals being treated as claims). Compatible with the
76
+ motivating misbinding bug class (V1.3.1 ADR-080), which occurred in
77
+ narrative prose — no recall loss.
78
+
79
+ Combined dogfood result on `prompt-injection-detection-submission`
80
+ HEAD: **76% noise reduction** (95 warnings at v1.0.5 2-tuple
81
+ baseline → 23 with v1.1.0 BindingKey + `scope="narrative"`). The
82
+ remaining 23 are positional-heuristic limitations (random-floor
83
+ mis-attribution across sentence boundaries; cross-detector
84
+ pairing in dense prose) not addressable without parser-level
85
+ work (future v1.2.0+ scope).
86
+
87
+ ### Changed (Tier-1 ADDITIVE — no breaking changes)
88
+
89
+ - `_nearest_detector_key()` private helper renamed to
90
+ `_nearest_canonical_key()` (used now for both detector and slice
91
+ pairing; the body was already generic). Implementation detail;
92
+ no public API impact.
93
+ - `pyproject.toml` `Development Status` classifier bumped
94
+ `4 - Beta` → `5 - Production/Stable` (post-v1.0 hygiene).
95
+ - `.gitignore` now covers `.scratch/` and the broader
96
+ `codex-comprehensive-audit-*.md` pattern (was previously only
97
+ `*-report.md`).
98
+
99
+ ### Deprecation policy
100
+
101
+ All three `bindings` input shapes remain accepted **indefinitely**
102
+ through the v1.x line. `BindingKey` is the canonical/recommended
103
+ shape per ADR 0005 + docstring guidance; tuples remain valid
104
+ syntactic sugar with no `DeprecationWarning`. Formal deprecation
105
+ is deferred to a future v2.0 cleanup pass when there is concrete
106
+ payoff. Consumers can migrate slot-by-slot at their own pace, or
107
+ not at all (legacy 2-tuple semantics survive as `slice="any"`).
108
+
109
+ ### Consumer adoption path
110
+
111
+ `prompt-injection-detection-submission` consumer-side script at
112
+ `scripts/audit_value_bindings.py` can adopt by either:
113
+ - Replacing the 2-tuple `BINDINGS` literal with 3-tuple keys
114
+ (smallest diff; issue body's proposal), OR
115
+ - Migrating to `BindingKey(...)` for forward-extensibility (new
116
+ identity axes won't require re-touching the script).
117
+
118
+ ### Validator design philosophy (introduced this release)
119
+
120
+ This release introduces a two-layer correctness model for audit
121
+ validators, formalized in pending ADR 0005:
122
+
123
+ 1. **Identity correctness** — canonical measurements have
124
+ structured identity (`BindingKey`), not positional tuples.
125
+ Future axes added as defaulted fields without breaking the
126
+ schema.
127
+ 2. **Scope correctness** — the validator should only scan content
128
+ that is plausibly a binding claim. Narrative prose is.
129
+ Markdown tables, CI brackets, and code blocks aren't.
130
+ `scope="narrative"` is the v1.1.0 implementation of this rule;
131
+ it matches the lint-design convention from `ruff`/`mypy`/`bandit`
132
+ (scope predicates aren't optional in production-quality
133
+ linters).
134
+
135
+ Both layers are now in place. Consumers writing typical research
136
+ writeup prose with dense statistical tables should adopt
137
+ `scope="narrative"` for a ~76% noise reduction relative to v1.0.5.
138
+
139
+ ### Known v1.1.0 limitations (residual after slice + narrative fixes)
140
+
141
+ The remaining ~24% of v1.0.5 baseline noise stems from positional-
142
+ heuristic limitations that the slice + narrative fixes cannot
143
+ address:
144
+
145
+ 1. **"Random floor" / sub-clause values across sentence
146
+ boundaries** — prose like "X scored 0.291. The pooled OOD
147
+ random floor is 0.374" pairs 0.374 with the nearest preceding
148
+ detector (X), because the validator doesn't treat `.` as a
149
+ pairing boundary.
150
+ 2. **Multi-detector cross-pairing in dense prose** — prose like
151
+ "LoRA scored 0.293 [0.286, 0.301] versus 0.364 for the frozen
152
+ probe and 0.291 for TF-IDF + LR" pairs values with
153
+ the text-order-nearest detector, which over-credits the second
154
+ detector in a list construction.
155
+
156
+ Future work (v1.2.0+ candidate): sentence-boundary awareness
157
+ (respecting `.` / `\n` as pairing boundaries) + better
158
+ multi-detector list parsing.
159
+
160
+ HARD-gate promotion of the consumer-side validator at v1.3.10+
161
+ becomes credible at the ~76% reduction level achieved here.
162
+ Remaining false positives can be suppressed via consumer-side
163
+ filtering (e.g., excluding lines containing "random floor" or
164
+ "versus") or accepted as known low-frequency noise.
165
+
166
+ ## [1.0.5] — 2026-05-26 — publish workflow hardening (infrastructure-only)
167
+
168
+ Tier-3 / infrastructure-only release. **No library code or public API
169
+ changes.** Hardens the release pipeline against the failure mode
170
+ observed at v1.0.4, where a documented GitHub Actions CRITICAL
171
+ incident (codeload action download failure across the platform) left
172
+ the v1.0.4 wheel un-published on PyPI despite a successful tag and
173
+ GitHub release. The wheel for v1.0.5 is functionally identical to
174
+ v1.0.4; this release exists primarily as a dress rehearsal for the
175
+ new verification step.
176
+
177
+ ### Added — `.github/workflows/publish.yml`
178
+
179
+ - **`workflow_dispatch:` trigger** — recovery path for failed
180
+ tag-triggered runs. Manually re-trigger via
181
+ `gh workflow run publish.yml --ref vX.Y.Z` or the Actions UI
182
+ "Run workflow" dropdown. Always uses the workflow file from main
183
+ HEAD, so workflow patches take effect immediately for recovery.
184
+ - **Post-publish `Verify PyPI receipt` step** — polls
185
+ `pypi.org/pypi/eval-toolkit/<version>/json` for HTTP 200 over a
186
+ 6-minute window (12 × 30s backoff); fails loudly if the wheel
187
+ never lands. Catches silent half-releases where
188
+ `pypa/gh-action-pypi-publish` returns success but PyPI never
189
+ receives the wheel.
190
+
191
+ ### Added — `docs/source/RELEASING.md`
192
+
193
+ - **"Tag-triggered publish failed; need to re-publish to PyPI"**
194
+ recovery recipe under Known gotchas. Documents both the
195
+ `gh run rerun` path (when the original run can be retried) and
196
+ the `workflow_dispatch` path (when the workflow has been patched
197
+ on main since the original tag). References the v1.0.4 incident
198
+ as the canonical example.
199
+
200
+ ### Notes
201
+
202
+ - `setup-uv@v8.1.0` pin is intentionally unchanged. The v1.0.4
203
+ failure was a documented GitHub Actions/codeload incident, not
204
+ an action-specific issue; replacing setup-uv with a curl-install
205
+ would lose the cache layer + Python integration + version-from-
206
+ pyproject detection it provides, and would not have prevented the
207
+ observed failure (actions/checkout downloaded successfully in the
208
+ same failing run; codeload was the SPOF, not setup-uv).
209
+ - The other 5 workflows (ci/codeql/docs/nightly-benchmarks/
210
+ nightly-mc) are not patched because they self-heal on the next
211
+ push; the SPOF only matters for one-shot tag-triggered runs.
212
+
8
213
  ## [1.0.4] — 2026-05-26 — `audit_sister_doc_concept_drift` module (closes #72)
9
214
 
10
215
  Tier-2 ADDITIVE — third (and final) member of the audit-validator
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: eval-toolkit
3
- Version: 1.0.4
3
+ Version: 1.1.0
4
4
  Summary: Reusable evaluation contracts for binary classification: metrics, bootstrap CIs, calibration, artifacts, and evidence gates.
5
5
  Project-URL: Homepage, https://github.com/brandon-behring/eval-toolkit
6
6
  Project-URL: Documentation, https://brandon-behring.github.io/eval-toolkit/
@@ -11,7 +11,7 @@ Author: Brandon Behring
11
11
  License-Expression: MIT
12
12
  License-File: LICENSE
13
13
  Keywords: binary-classification,bootstrap,calibration,evaluation,machine-learning,metrics
14
- Classifier: Development Status :: 4 - Beta
14
+ Classifier: Development Status :: 5 - Production/Stable
15
15
  Classifier: Intended Audience :: Developers
16
16
  Classifier: Intended Audience :: Science/Research
17
17
  Classifier: License :: OSI Approved :: MIT License
@@ -21,7 +21,7 @@ keywords = [
21
21
  "binary-classification",
22
22
  ]
23
23
  classifiers = [
24
- "Development Status :: 4 - Beta",
24
+ "Development Status :: 5 - Production/Stable",
25
25
  "Intended Audience :: Developers",
26
26
  "Intended Audience :: Science/Research",
27
27
  "License :: OSI Approved :: MIT License",
@@ -63,6 +63,9 @@ _EXPORTS: dict[str, str] = {
63
63
  # --- audit_value_bindings ---
64
64
  # Flat-module per ADR 0001. Closes #71. Motivated by consumer V1.3.1
65
65
  # ADR-080 audit-fix finding (TF-IDF / LoRA 0.974 value mis-binding).
66
+ # BindingKey + slice-aware matching added v1.1.0 (closes #80;
67
+ # consumer-feedback structural fix per pending ADR 0005).
68
+ "BindingKey": "eval_toolkit.audit_value_bindings",
66
69
  "Match": "eval_toolkit.audit_value_bindings",
67
70
  "ValueBindingsReport": "eval_toolkit.audit_value_bindings",
68
71
  "Violation": "eval_toolkit.audit_value_bindings",
@@ -2,4 +2,4 @@
2
2
 
3
3
  __all__ = ["__version__"]
4
4
 
5
- __version__ = "1.0.4"
5
+ __version__ = "1.1.0"