eval-toolkit 1.7.0__tar.gz → 1.9.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (204)
  1. eval_toolkit-1.9.0/.claude/agents/README.md +54 -0
  2. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/.gitignore +6 -2
  3. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/CHANGELOG.md +191 -0
  4. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/PKG-INFO +5 -6
  5. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/README.md +4 -5
  6. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/STYLE.md +24 -9
  7. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/pyproject.toml +6 -1
  8. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/src/eval_toolkit/__init__.py +1 -0
  9. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/src/eval_toolkit/__main__.py +4 -4
  10. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/src/eval_toolkit/_sweep.py +21 -5
  11. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/src/eval_toolkit/_version.py +1 -1
  12. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/src/eval_toolkit/analysis.py +57 -8
  13. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/src/eval_toolkit/artifacts.py +5 -2
  14. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/src/eval_toolkit/audit_sister_doc_concept_drift.py +9 -1
  15. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/src/eval_toolkit/audit_value_bindings.py +15 -1
  16. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/src/eval_toolkit/bootstrap.py +371 -41
  17. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/src/eval_toolkit/config.py +1 -1
  18. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/src/eval_toolkit/docs.py +2 -2
  19. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/src/eval_toolkit/eda/distribution_shift.py +28 -7
  20. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/src/eval_toolkit/eda/lexical_association.py +18 -1
  21. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/src/eval_toolkit/plotting.py +1 -1
  22. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/src/eval_toolkit/scorecards.py +21 -0
  23. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/golden/public_api/snapshot.json +9 -3
  24. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_analysis.py +71 -2
  25. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_audit_sister_doc_concept_drift.py +19 -0
  26. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_audit_value_bindings.py +14 -0
  27. eval_toolkit-1.9.0/tests/test_bootstrap_edge_cases.py +374 -0
  28. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_bootstrap_unit.py +16 -38
  29. eval_toolkit-1.9.0/tests/test_claude_agents.py +148 -0
  30. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_cluster_bootstrap.py +117 -0
  31. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_config.py +16 -0
  32. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_docs_golden.py +14 -0
  33. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_eda_distribution_shift.py +43 -2
  34. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_eda_lexical_association.py +19 -0
  35. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_public_api.py +1 -0
  36. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_reproducibility_integration.py +7 -2
  37. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_scorecard.py +36 -0
  38. eval_toolkit-1.9.0/tests/test_stratified_cluster_bootstrap.py +260 -0
  39. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_sweep.py +23 -0
  40. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_v09_contracts.py +1 -1
  41. eval_toolkit-1.7.0/tests/test_bootstrap_edge_cases.py +0 -185
  42. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/LICENSE +0 -0
  43. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/docs/archive/README.md +0 -0
  44. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/docs/research/README.md +0 -0
  45. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/docs/research/datasets/README.md +0 -0
  46. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/docs/research/papers/data-integrity/README.md +0 -0
  47. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/docs/research/papers/eval-ecosystem/README.md +0 -0
  48. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/docs/research/papers/inference/README.md +0 -0
  49. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/docs/research/papers/prompt-injection/README.md +0 -0
  50. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/docs/source/adr/README.md +0 -0
  51. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/docs/source/methodology/README.md +0 -0
  52. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/src/eval_toolkit/_deprecated.py +0 -0
  53. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/src/eval_toolkit/_narrative.py +0 -0
  54. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/src/eval_toolkit/_parallel.py +0 -0
  55. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/src/eval_toolkit/_rng.py +0 -0
  56. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/src/eval_toolkit/adversarial.py +0 -0
  57. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/src/eval_toolkit/audit_citation_alignment.py +0 -0
  58. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/src/eval_toolkit/calibration.py +0 -0
  59. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/src/eval_toolkit/claims.py +0 -0
  60. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/src/eval_toolkit/eda/__init__.py +0 -0
  61. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/src/eval_toolkit/eda/data_audit.py +0 -0
  62. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/src/eval_toolkit/eda/obfuscation.py +0 -0
  63. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/src/eval_toolkit/embeddings.py +0 -0
  64. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/src/eval_toolkit/evidence.py +0 -0
  65. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/src/eval_toolkit/harness.py +0 -0
  66. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/src/eval_toolkit/leakage.py +0 -0
  67. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/src/eval_toolkit/loaders.py +0 -0
  68. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/src/eval_toolkit/losses.py +0 -0
  69. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/src/eval_toolkit/manifest.py +0 -0
  70. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/src/eval_toolkit/metric_specs.py +0 -0
  71. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/src/eval_toolkit/metrics.py +0 -0
  72. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/src/eval_toolkit/operating_points.py +0 -0
  73. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/src/eval_toolkit/paths.py +0 -0
  74. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/src/eval_toolkit/preprocessing.py +0 -0
  75. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/src/eval_toolkit/probes.py +0 -0
  76. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/src/eval_toolkit/protocols.py +0 -0
  77. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/src/eval_toolkit/provenance.py +0 -0
  78. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/src/eval_toolkit/py.typed +0 -0
  79. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/src/eval_toolkit/schemas/manifest.v1.json +0 -0
  80. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/src/eval_toolkit/schemas/manifest.v2.json +0 -0
  81. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/src/eval_toolkit/schemas/manifest.v3.json +0 -0
  82. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/src/eval_toolkit/schemas/ood_manifest.v1.json +0 -0
  83. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/src/eval_toolkit/schemas/results.v1.json +0 -0
  84. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/src/eval_toolkit/schemas/results_full.v1.json +0 -0
  85. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/src/eval_toolkit/seeds.py +0 -0
  86. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/src/eval_toolkit/splits.py +0 -0
  87. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/src/eval_toolkit/stacking.py +0 -0
  88. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/src/eval_toolkit/text_dedup.py +0 -0
  89. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/src/eval_toolkit/thresholds.py +0 -0
  90. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/baseline/test_plotting_visual/plot_bootstrap_distribution.png +0 -0
  91. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/baseline/test_plotting_visual/plot_confusion_matrix_grid.png +0 -0
  92. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/baseline/test_plotting_visual/plot_lift_ci.png +0 -0
  93. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/baseline/test_plotting_visual/plot_metric_bars.png +0 -0
  94. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/baseline/test_plotting_visual/plot_pareto_frontier.png +0 -0
  95. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/baseline/test_plotting_visual/plot_pr_curve.png +0 -0
  96. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/baseline/test_plotting_visual/plot_reliability_diagram.png +0 -0
  97. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/baseline/test_plotting_visual/plot_roc_curve.png +0 -0
  98. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/baseline/test_plotting_visual/plot_score_histograms.png +0 -0
  99. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/baseline/test_plotting_visual/plot_slice_metric_heatmap.png +0 -0
  100. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/benchmarks/__init__.py +0 -0
  101. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/benchmarks/test_kernel_benchmarks.py +0 -0
  102. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/conftest.py +0 -0
  103. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/golden/bootstrap_ci/cases.json +0 -0
  104. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/golden/data/dedup_holdout.jsonl +0 -0
  105. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/golden/data/dedup_holdout_expected.json +0 -0
  106. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/golden/data/dedup_holdout_provenance.md +0 -0
  107. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/golden/docs/expected.md +0 -0
  108. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/golden/docs/input.md +0 -0
  109. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/golden/docs/metrics.json +0 -0
  110. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/golden/test_dedup_holdout_calibration.py +0 -0
  111. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/strategies.py +0 -0
  112. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_adversarial.py +0 -0
  113. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_artifacts.py +0 -0
  114. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_audit_citation_alignment.py +0 -0
  115. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_block_bootstrap_on_folds.py +0 -0
  116. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_bootstrap_calibration_mc.py +0 -0
  117. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_bootstrap_golden.py +0 -0
  118. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_bootstrap_njobs.py +0 -0
  119. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_bootstrap_props.py +0 -0
  120. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_bootstrap_research_grounded.py +0 -0
  121. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_calibration_binary_adapters.py +0 -0
  122. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_calibration_bootstrap_chain.py +0 -0
  123. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_calibration_determinism.py +0 -0
  124. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_calibration_optimization_failures.py +0 -0
  125. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_calibration_props.py +0 -0
  126. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_calibration_research_grounded.py +0 -0
  127. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_calibration_unit.py +0 -0
  128. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_claims.py +0 -0
  129. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_claims_coverage.py +0 -0
  130. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_claims_props.py +0 -0
  131. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_cli.py +0 -0
  132. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_coverage_bootstrap.py +0 -0
  133. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_coverage_calibration.py +0 -0
  134. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_coverage_harness.py +0 -0
  135. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_coverage_metrics.py +0 -0
  136. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_coverage_plotting.py +0 -0
  137. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_croissant_e2e.py +0 -0
  138. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_dedup_split_leakage_chain.py +0 -0
  139. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_deprecated_scalars_shim.py +0 -0
  140. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_deprecations.py +0 -0
  141. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_docs_props.py +0 -0
  142. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_eda.py +0 -0
  143. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_eda_obfuscation.py +0 -0
  144. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_embeddings.py +0 -0
  145. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_evidence_validators.py +0 -0
  146. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_harness_edge_cases.py +0 -0
  147. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_harness_fault_injection.py +0 -0
  148. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_harness_folded.py +0 -0
  149. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_harness_internals.py +0 -0
  150. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_harness_metric_options.py +0 -0
  151. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_harness_parallelism.py +0 -0
  152. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_harness_smoke.py +0 -0
  153. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_import_boundaries.py +0 -0
  154. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_is_metric_defined_for_slice.py +0 -0
  155. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_lazy_extras_messages.py +0 -0
  156. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_leakage.py +0 -0
  157. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_leakage_error_paths.py +0 -0
  158. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_leakage_props.py +0 -0
  159. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_loaders.py +0 -0
  160. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_loaders_coverage.py +0 -0
  161. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_loaders_props.py +0 -0
  162. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_logging.py +0 -0
  163. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_losses.py +0 -0
  164. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_manifest.py +0 -0
  165. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_manifest_contamination_round_trip.py +0 -0
  166. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_manifest_props.py +0 -0
  167. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_manifest_validation.py +0 -0
  168. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_metrics_props.py +0 -0
  169. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_metrics_stratified_subsets.py +0 -0
  170. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_metrics_unit.py +0 -0
  171. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_misc_coverage.py +0 -0
  172. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_numeric_edge_cases.py +0 -0
  173. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_ood_loader.py +0 -0
  174. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_operating_points.py +0 -0
  175. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_operating_points_props.py +0 -0
  176. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_parallel.py +0 -0
  177. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_paths.py +0 -0
  178. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_pipeline_e2e.py +0 -0
  179. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_plotting_edge.py +0 -0
  180. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_plotting_smoke.py +0 -0
  181. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_plotting_visual.py +0 -0
  182. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_preprocessing.py +0 -0
  183. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_probes.py +0 -0
  184. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_protocol_conformance.py +0 -0
  185. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_provenance.py +0 -0
  186. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_recall_at_fpr.py +0 -0
  187. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_reference_equivalence.py +0 -0
  188. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_rng.py +0 -0
  189. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_schemas.py +0 -0
  190. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_seeds.py +0 -0
  191. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_splits.py +0 -0
  192. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_splits_leakage_integration.py +0 -0
  193. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_splits_props.py +0 -0
  194. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_stacking.py +0 -0
  195. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_text_dedup.py +0 -0
  196. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_text_dedup_coverage.py +0 -0
  197. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_text_dedup_props.py +0 -0
  198. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_text_dedup_strategies.py +0 -0
  199. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_thresholds.py +0 -0
  200. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_thresholds_constant_score.py +0 -0
  201. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_thresholds_coverage.py +0 -0
  202. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_thresholds_props.py +0 -0
  203. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_thresholds_research_grounded.py +0 -0
  204. {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_tokenization_leakage_check.py +0 -0
@@ -0,0 +1,54 @@
1
+ # eval-toolkit review agents
2
+
3
+ Repo-local Claude Code subagents that enforce the **judgment** the deterministic
4
+ gates can't: SemVer impact, audit-validator architecture, silent failures,
5
+ docstring conformance, and dogfood noise. They are **advisory** — `ruff` / `black` / `mypy` / `pytest` /
6
+ coverage / the public-API snapshot remain the authoritative blocking gates. No
7
+ agent re-runs or replaces them.
8
+
9
+ ## The agents
10
+
11
+ | Agent | Catches | Authoritative source |
12
+ |---|---|---|
13
+ | `etk-audit-validator-reviewer` | Three-layer conformance (identity/scope/pairing), `_narrative` reuse, UTF-8 | ADR 0007 / 0005 / 0006, STYLE.md §5 |
14
+ | `etk-api-stability-guardian` | Tier-1/2/3 SemVer class + public-API snapshot regen | ADR 0003, `tests/test_public_api.py`, STYLE.md §17 |
15
+ | `etk-silent-failure-auditor` | NaN/inf finiteness gaps, swallowed exceptions, encoding/IO, non-diagnostic raises | STYLE.md §1 / §6 / §7 |
16
+ | `etk-docstring-conformance-auditor` | NumPy sections, Raises↔code agreement, canonical param names, runnable Examples | STYLE.md §12 / §3a |
17
+ | `etk-dogfood-noise-analyst` | Classifies consumer residuals (real / FP / edge × layer) | runner: `scripts/dogfood_audit.py` |
18
+
19
+ ## How to run
20
+
21
+ You never need to remember the names. Either:
22
+
23
+ - **Describe the task** — "review the changes I made to the citation validator" — and the main agent auto-routes by each agent's `description`; or
24
+ - **Run `/review-eval`** — the one handle that fans them out and synthesizes one verdict.
25
+
26
+ ```
27
+ /review-eval # diff mode: git diff main...HEAD
28
+ /review-eval --pr 84 # review a GitHub PR diff
29
+ /review-eval --audit # full baseline sweep (whole files, no diff)
30
+ /review-eval --audit api # focused: just the public surface
31
+ /review-eval --audit validators # focused: just audit_*.py + _narrative.py
32
+ /review-eval --audit docstrings # focused: just public docstrings
33
+ /review-eval --refute # adversarial second pass (quote-or-reject)
34
+ /review-eval --ledger # persist a review entry under .claude/reviews/
35
+ ```
36
+
37
+ Diff mode prints to the terminal; `--audit` also writes a machine-local entry
38
+ under `.claude/reviews/` (gitignored).
39
+
40
+ ## Conventions baked in
41
+
42
+ - **Read-back discipline** — every finding quotes code with `path:line`; no quote, no finding (counters validation-without-reading).
43
+ - **High-confidence only** — plus a `suppressed N low-confidence` footer so nothing is silently dropped.
44
+ - **Hybrid rubric** — a tight inlined checklist that defers to `STYLE.md` / the ADRs as the single source of truth.
45
+ - **Structured verdict** — `PASS / CONCERNS / BLOCK`, per-agent and overall.
46
+
47
+ `tests/test_claude_agents.py` guards these files against pointer-rot (frontmatter
48
+ parses, `name` matches filename, every cited path exists).
49
+
50
+ ## Escalation
51
+
52
+ For a full multi-round release audit (fan out N finders → dedup → adversarially
53
+ verify → synthesize a ledger), use the **Workflow** tool, not a single subagent.
54
+ `/review-eval --audit` is the lightweight precursor.
@@ -72,8 +72,12 @@ codex-comprehensive-audit-*.md
72
72
  # Contents have historical value but are not part of any release.
73
73
  .scratch/
74
74
 
75
- # Claude Code project settings (machine-local)
76
- .claude/
75
+ # Claude Code: settings are machine-local; the review agents + commands are
76
+ # shared, versioned deliverables (see .claude/agents/README.md). Review ledgers
77
+ # written by `/review-eval --ledger` stay local under .claude/reviews/.
78
+ .claude/*
79
+ !.claude/agents/
80
+ !.claude/commands/
77
81
 
78
82
  # mkdocs build output (Section E.1 v0.28.0)
79
83
  /site/
@@ -5,6 +5,197 @@ All notable changes to this project will be documented in this file.
5
5
  The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
6
6
  and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
7
7
 
8
+ ## [Unreleased]
9
+
10
+ ## [1.9.0] — 2026-06-10 — resample distribution + silent-NaN hardening + UTF-8 batch (#93, #96, #97)
11
+
12
+ ### Fixed — pre-tag adversarial-review completion (silent-NaN gaps in `bootstrap_ci` itself)
13
+
14
+ The pre-tag review panel (whole-repo re-audit + independent reviewers +
15
+ self-refutation) found the #96 silent-NaN failure class surviving in `bootstrap_ci` — the
16
+ most-used entry point, outside #96's enumerated scope:
17
+
18
+ - **Studentized path**: a NaN outer statistic was accepted as a *valid*
19
+ resample, bypassing the 95%-valid gate and poisoning the pivots into a
20
+ silent all-NaN `BootstrapCI` with zero warnings. NaN/inf outer statistics
21
+ (and inner-jackknife LOO values) now count as degenerate draws.
22
+ - **Non-finite CI bounds now raise for ANY method** — the degeneracy check
23
+ was gated to `method="BCa"`, so `percentile` returned NaN bounds with only
24
+ scipy's misdirecting `DegenerateDataWarning` (which always names BCa).
25
+ The finite BCa-collapse case (`ci_low == ci_high == point`) keeps the R9
26
+ `UserWarning` contract. Behavior change: BCa NaN-bounds previously
27
+ warned-and-returned; they now raise (the scorecard/harness per-cell
28
+ isolation converts this into an error/reason cell as before).
29
+ - **Point estimate guarded**: a metric returning NaN on the full data
30
+ previously yielded `point_estimate=nan` beside a finite CI, silently.
31
+ - `analysis.load_prediction_arrays`: labels are now domain-checked before
32
+ the int cast — `dtype=int` coercion silently **truncated** float labels
33
+ (`0.7 → 0`), flipping ground truth with in-domain values no downstream
34
+ gate could catch.
35
+ - Docs: `sweep()` Raises now documents the reachable pandas `ImportError`
36
+ and its Returns lists the always-present `strategy_id` column; STYLE.md
37
+ Tier-2 quick reference corrected to the ten strict Protocols (v1.0.2
38
+ `SimilarityStrategy` promotion).
39
+ - Tests: mutation-verified gap closed (per-stratum NaN filter pinned via a
40
+ non-NaN-propagating `combine`); `samples`↔quantile consistency pinned at
41
+ non-default confidence.
42
+
43
+ ### Added — resample-distribution exposure on the cluster bootstraps (#93)
44
+
45
+ Tier-1 strictly-appended optional parameters, SemVer-MINOR per the
46
+ [ADR 0003](docs/source/adr/0003-stability-contract-and-gate3-methodology.md)
47
+ 2026-06-10 amendment (#101) — backward-compatible; snapshot regenerated in
48
+ the same commit.
49
+
50
+ - **`cluster_bootstrap_ci(..., return_samples=True)`** and
51
+ **`stratified_cluster_bootstrap_ci(..., return_samples=True)`** attach the
52
+ post-filter bootstrap resample statistics to the result as
53
+ **`BootstrapCI.samples`** (read-only `numpy.ndarray`, the same array the
54
+ percentile bounds are computed from; `shape == (n_resamples_used,)`).
55
+ Distribution summaries such as the consumer's `frac_gt0`
56
+ (`float(np.mean(ci.samples > 0.0))`) are now derivable from the *same*
57
+ draws as the CI — previously structurally unrecoverable, blocking the two
58
+ remaining LODO call-site migrations (downstream DF-11).
59
+ - `BootstrapCI` gains the trailing optional field `samples`
60
+ (default `None`, `compare=False`, `repr=False`): positional construction,
61
+ equality/hash semantics, and the **`to_dict()` schema are all unchanged**.
62
+ Note: `dataclasses.asdict()` (which ignores those flags) now includes a
63
+ `samples` key — consumers serializing via `asdict` instead of `to_dict()`
64
+ should drop it (an attached ndarray is not JSON-serializable).
65
+ - scipy precedent: `BootstrapResult.bootstrap_distribution`. Honors the
66
+ n_jobs bit-for-bit reproducibility contract.
67
+
68
+ ### Fixed — silent-NaN hardening batch (#96)
69
+
70
+ Finiteness guards across the numeric surface (STYLE §1 *never fail silently* /
71
+ §7 validation boundary). All are new raises (or error statuses) on garbage
72
+ input that previously produced silently-wrong results:
73
+
74
+ - `cluster_bootstrap_ci` / `stratified_cluster_bootstrap_ci`: NaN/inf scores
75
+ now raise at the validation boundary (previously the per-stratum check was
76
+ shape-only and the result was a silent all-NaN `BootstrapCI`); a non-finite
77
+ point estimate raises with the got-value; resample draws where the statistic
78
+ (or `combine`) **returns** NaN/inf now count toward the >5% degenerate gate
79
+ instead of poisoning the quantile CI (previously only *raising* draws counted).
80
+ - `paired_bootstrap_diff` / `paired_bootstrap_ece_diff` /
81
+ `paired_bootstrap_op_point_diff`: NaN resample deltas now count as degenerate
82
+ draws (same ≤5% tolerance as raising draws — pre-#96 a NaN CI made
83
+ `overlaps_zero` read `False`, i.e. silently "statistically significant");
84
+ a non-finite full-data Δ raises with the got-value; a non-finite-CI-bounds
85
+ raise remains in each constructor as a backstop (mirrors the BCa degeneracy
86
+ guard in `bootstrap_ci`).
87
+ - `scorecard` bootstrap path: BCa-degenerate NaN CI bounds are no longer
88
+ attached to an `"ok"` cell — the CI is dropped and the reason recorded
89
+ (consistent with the existing "bootstrap unavailable" convention).
90
+ - `eda.median_bandwidth`: non-finite input (NaN/inf) now raises at entry —
91
+ NaN bypassed the `sigma <= 0.0` check and escaped as a NaN bandwidth, and a
92
+ NaN row outside the `max_samples` subsample escaped entirely.
93
+ - `eda.maximum_mean_discrepancy`: explicit `bandwidth` must be finite and > 0 —
94
+ `inf` yielded γ = 0 → all-ones Gram → MMD² = 0 → `p_value = 1.0` silently
95
+ reading "no shift".
96
+ - `eda` PAD/MMD/kNN feature matrices are finiteness-checked at the boundary
97
+ (previously NaN embeddings died deep inside sklearn blaming internals).
98
+ - `eda.class_lexical_association`: a `positive_label` matching no label (the
99
+ 1-vs-`"1"` type-mismatch trap) or matching every label now raises listing the
100
+ observed label values, instead of returning a documented all-empty result
101
+ that read "no shortcut signal".
102
+ - `scorecard`: a custom `MetricSpec.compute` returning NaN/inf now yields
103
+ `MetricResult(status="error", reason=...)` through the same path as a raising
104
+ compute — previously `status="ok"` with a NaN value.
105
+ - `sweep`: a NaN `attack_threshold` now raises (it silently zeroed every `asr`
106
+ flag); `±inf` remains a documented unsatisfiable sentinel.
107
+ - `analysis.JsonlPredictionReader`: a row missing (or `null` on) a declared
108
+ column key now fails fast with file + row + key context (the R8-F3 pattern
109
+ already applied to CSV headers) — previously a missing score coerced to NaN
110
+ deep in the metric computation and a missing label died as a bare `TypeError`.
111
+ A malformed JSON row now reports the actual file row (raw `json.JSONDecodeError`
112
+ always said "line 1"). `analysis.load_prediction_arrays` additionally rejects
113
+ non-finite loaded scores (a bare JSON `NaN` token or a CSV `"nan"` cell passes
114
+ per-row key checks) with file + column + row-index context.
115
+
116
+ ### Fixed — explicit UTF-8 encoding batch (#97)
117
+
118
+ Windows (cp1252 locale codec) is the trigger; Linux/macOS hid all of these.
119
+ Locked convention: always pass `encoding="utf-8"` on text-file IO.
120
+
121
+ - `docs.render_files` **apply mode** read and wrote consumer markdown with the
122
+ locale codec — on cp1252 this silently and *cumulatively* corrupted
123
+ non-ASCII user content on every apply (the worst item in the batch).
124
+ - All remaining text IO made explicit: `__main__` schema/payload reads
125
+ (RFC 8259 mandates UTF-8 for JSON), `analysis` CSV/JSONL prediction readers,
126
+ `config.from_yaml`, `artifacts` schema read + report write,
127
+ `plotting` sidecar write, `scripts/audit_raises_sections.py`.
128
+ - `scripts/dogfood_audit.py`: a surface file skipped for `UnicodeDecodeError`
129
+ now emits a stderr warning with the path — previously it silently vanished
130
+ from the acceptance evidence.
131
+ - **Detection locked in permanently**: ruff now enforces `PLW1514`
132
+ (implicit-encoding) across `src/`, `scripts/`, and `tests/` via
133
+ `preview = true` + `explicit-preview-rules = true` (only this rule gets
134
+ preview status; no other behavior changes).
135
+
136
+ ### Fixed
137
+
138
+ - `audit_value_bindings.validate_reader_value_bindings` now raises a
139
+ diagnostic `ValueError` when a scanned file is not valid UTF-8, instead
140
+ of letting an unguarded `read_text(encoding="utf-8")` abort the run with
141
+ a bare `UnicodeDecodeError`. Documented in the function's `Raises` section.
142
+ - `audit_sister_doc_concept_drift.validate_sister_doc_concept_drift` now
143
+ skips non-UTF-8 files with a `warnings.warn` instead of crashing — its
144
+ prior `except OSError` did not catch `UnicodeDecodeError` (a `ValueError`,
145
+ not an `OSError`), so a single non-UTF-8 byte aborted the whole scan.
146
+
147
+ ### Internal
148
+
149
+ - `scripts/` is now covered by `ruff` / `black` / `mypy` across all runners
150
+ (`Makefile`, `ci.yml`, `.pre-commit-config.yaml`, `tox.ini`, `noxfile.py`).
151
+
152
+ ### Fixed — documentation/config consistency batch (2026-06-09 full-repo audit)
153
+
154
+ - [ADR 0003](docs/source/adr/0003-stability-contract-and-gate3-methodology.md)
155
+ amended: records the v1.0.2 `SimilarityStrategy` promotion (strict
156
+ Tier-2 count 9 → 10) in the Tier-1 Protocol list, and replaces the
157
+ unimplemented `STRICT_DOCSTRINGS` plan with the actual contract
158
+ (docstring first lines remain pinned through v1.x).
159
+ - `SimilarityStrategy` registered in
160
+ `tests/test_public_api.py::_TIER2_PROTOCOLS` — the R6-D fail-fast
161
+ list had lagged the v1.0.2 promotion.
162
+ - `docs/source/roadmap.md` post-v1.0 section refreshed to the v1.8.0
163
+ state (was still "v1.0.1 is the next minor"; the referenced
164
+ `v1.0.1 cleanup` issue #76 closed at v1.0.2); broken repo-relative
165
+ link to a machine-local planning document removed.
166
+ - STYLE.md §17 example updated — `pr_auc` left the top level at v0.46
167
+ (Decision L); the example now uses `scorecard`. README Tier-2 box
168
+ disambiguated (10 strict Protocols vs `SliceAwareScorer`/`Versioned`).
169
+ - CONTRIBUTING.md: corrected the `[dev]`-extra claim (heavy optional
170
+ stacks `embeddings`/`transformers`/`probes`/`losses` are not
171
+ included) and documented the docs-extra requirement for `pre-push`.
172
+
173
+ ### Internal
174
+
175
+ - `make test` now collects all three doc-execution surfaces — the
176
+ positional `tests` arg silently bypassed pyproject `testpaths`,
177
+ skipping the 161 README/docs Sybil doc tests (v0.47 §5L incident
178
+ class). `make install` installs `.[dev,docs]` so the sphinx
179
+ pre-push gate works on a fresh environment.
180
+ - CI coverage step excludes `-m integration` (aligns ci.yml with the
181
+ pyproject marker contract and the Makefile coverage target).
182
+ - tox/nox aligned with `requires-python = ">=3.13"`: py313-only
183
+ envlist/`PY_VERSIONS`, monte_carlo/benchmark/integration marker
184
+ exclusions added to their pytest commands, stale "private and
185
+ home-designed" framing removed; Makefile help text and §5H
186
+ notebook-gate comments updated to current reality.
187
+
188
+ ## [1.8.0] — 2026-06-04 — composite multi-stratum cluster bootstrap (#92)
189
+
190
+ ### Added — `bootstrap.stratified_cluster_bootstrap_ci` (composite multi-stratum cluster bootstrap)
191
+
192
+ `eval_toolkit.bootstrap` ADDITIVE per [ADR 0003](docs/source/adr/0003-stability-contract-and-gate3-methodology.md) — backward-compatible. Generalises the v1.7.0 single-block `cluster_bootstrap_ci` to the shape leave-one-group-out transfer gaps actually take: a **composite statistic reduced over several independently-resampled cluster strata**.
193
+
194
+ - **`stratified_cluster_bootstrap_ci(strata, per_stratum_metric, combine, *, resample_labels=(0,1), …)`** — `strata` is a mapping `{key: (y, score, groups)}` of independent resample-units (e.g. `seed`, `(carrier, seed)`, `(attack_type, seed)`); each bootstrap iteration resamples every stratum's `(label, group)` clusters, computes `per_stratum_metric` on each, and reduces the `{key: metric}` map with `combine` to one scalar (a seed-averaged ROC-AUC gap, a mean-over-carriers gap, a top−bottom per-type AUPRC contrast, …). Percentile `BootstrapCI` (`method="stratified_cluster_percentile"`). `cluster_bootstrap_ci` is the single-stratum, identity-reduce special case.
195
+ - **Why:** the v1.7.0 single-block primitive could not express the **seed-averaging** that real LODO estimators do inside the bootstrap (`Gx = val − mean_seed(test_roc)`), so it did not actually fit the consumer portfolio's attack-type / carrier / dialect bootstraps. This is the correct primitive for them.
196
+ - **Parallel + reproducible:** built on `parallel_map` + `spawn_seed_sequences` ⇒ `n_jobs` gives bit-for-bit-identical CIs; `n_jobs=-1` all cores.
197
+ - Exported via `from eval_toolkit import stratified_cluster_bootstrap_ci`; `__all__` + `_EXPORTS` + doctest + n_jobs-reproducibility / seed-averaged / composite-statistic tests; mypy-strict clean.
198
+
8
199
  ## [1.7.0] — 2026-06-04 — label-stratified cluster bootstrap (#90, #91)
9
200
 
10
201
  ### Added — `bootstrap.cluster_bootstrap_ci` (label-stratified cluster bootstrap)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: eval-toolkit
3
- Version: 1.7.0
3
+ Version: 1.9.0
4
4
  Summary: Reusable evaluation contracts for binary classification: metrics, bootstrap CIs, calibration, artifacts, and evidence gates.
5
5
  Project-URL: Homepage, https://github.com/brandon-behring/eval-toolkit
6
6
  Project-URL: Documentation, https://brandon-behring.github.io/eval-toolkit/
@@ -116,11 +116,10 @@ format changes.
116
116
  │ manifest.json + seeds + git_sha + data_hashes + │
117
117
  │ gpu_info + leakage_report (NeurIPS-aligned) │
118
118
  ├─ Tier 2 ─ Protocol-based orchestration ────────────────┤
119
- │ Scorer / SliceAwareScorer / LeakageCheck / Splitter
120
- ThresholdSelector / DatasetLoader / MetricSpec
121
- MetaLearner / Probe / TextTransform /
122
- SimilarityStrategy (10 strict)
123
- │ Versioned (opt-in: per-object versions in manifest) │
119
+ │ Scorer / LeakageCheck / Splitter / ThresholdSelector
120
+ DatasetLoader / MetricSpec / MetaLearner / Probe /
121
+ TextTransform / SimilarityStrategy (10 strict)
122
+ SliceAwareScorer / Versioned (outside the 10 strict)
124
123
  ├─ Tier 1 ─ Functional core ─────────────────────────────┤
125
124
  │ pr_auc / roc_auc / ECE variants / Brier / bootstrap_ci│
126
125
  │ paired_bootstrap_diff / cv_clt_ci / mde_from_ci │
@@ -30,11 +30,10 @@ format changes.
30
30
  │ manifest.json + seeds + git_sha + data_hashes + │
31
31
  │ gpu_info + leakage_report (NeurIPS-aligned) │
32
32
  ├─ Tier 2 ─ Protocol-based orchestration ────────────────┤
33
- │ Scorer / SliceAwareScorer / LeakageCheck / Splitter
34
- ThresholdSelector / DatasetLoader / MetricSpec
35
- MetaLearner / Probe / TextTransform /
36
- SimilarityStrategy (10 strict)
37
- │ Versioned (opt-in: per-object versions in manifest) │
33
+ │ Scorer / LeakageCheck / Splitter / ThresholdSelector
34
+ DatasetLoader / MetricSpec / MetaLearner / Probe /
35
+ TextTransform / SimilarityStrategy (10 strict)
36
+ SliceAwareScorer / Versioned (outside the 10 strict)
38
37
  ├─ Tier 1 ─ Functional core ─────────────────────────────┤
39
38
  │ pr_auc / roc_auc / ECE variants / Brier / bootstrap_ci│
40
39
  │ paired_bootstrap_diff / cv_clt_ci / mde_from_ci │
@@ -1,8 +1,8 @@
1
1
  # eval-toolkit — Coding Standards
2
2
 
3
- Self-contained standards for this repository. External readers do not need
4
- access to any other style document; everything required to contribute lives
5
- here.
3
+ Self-contained quick reference for this repository. The ADRs
4
+ (`docs/source/adr/`) are the authoritative source for the decisions summarized
5
+ here; everything needed for day-to-day contribution lives in this file.
6
6
 
7
7
  ## 1. Foundational principles
8
8
 
@@ -27,7 +27,7 @@ here.
27
27
  | Formatter | `black`, line length 100 |
28
28
  | Linter | `ruff` with `select = ["E", "W", "F", "I", "N", "UP", "B", "SIM", "C4"]`, ignore `E501` (Black handles), `N803`/`N806` (math identifiers) |
29
29
  | Type checker | `mypy` strict (`disallow_untyped_defs`, `disallow_incomplete_defs`, `check_untyped_defs`, `no_implicit_optional`, `warn_redundant_casts`, `warn_unused_ignores`, `warn_no_return`, `strict_equality`, `warn_return_any`) |
30
- | Test runner | `pytest` with markers `unit`, `property`, `smoke`, `golden`; coverage floor `90%` |
30
+ | Test runner | `pytest` with markers `unit`, `property`, `smoke`, `golden`; coverage floor `92%` |
31
31
  | Build backend | `hatchling` |
32
32
  | Env manager | `uv` (`uv venv` → `.venv/`; `uv pip install -e .[dev]`) |
33
33
  | Python | `>=3.13` (RunPod parity floor; py313 tool targets in pyproject.toml) |
@@ -130,7 +130,15 @@ Examples:
130
130
  required.
131
131
  - `from __future__ import annotations` only when forward refs require it.
132
132
  - `Protocol` only at "real seams" — where two or more concrete implementations
133
- exist or are planned. Current seams (as of v0.8.0):
133
+ exist or are planned. The authoritative Tier-2-stable set is `_TIER2_PROTOCOLS` in
134
+ `tests/test_public_api.py` plus
135
+ [ADR 0003](docs/source/adr/0003-stability-contract-and-gate3-methodology.md):
136
+ the ten strict Tier-2 Protocols are `Scorer`, `LeakageCheck`, `Splitter`,
137
+ `ThresholdSelector`, `DatasetLoader`, `MetricSpec`, `MetaLearner`, `Probe`,
138
+ `TextTransform`, and `SimilarityStrategy` (promoted 10th at v1.0.2, #76
139
+ RC2). The seams below are illustrative detail — `SliceAwareScorer` is an
140
+ opt-in subprotocol of `Scorer`, and `Versioned` is a real seam that is
141
+ **not** in the Tier-2 frozenset:
134
142
  - `Scorer` + `SliceAwareScorer` (`harness.py`) — anything with
135
143
  `predict_proba(X) -> np.ndarray`. `SliceAwareScorer` adds opt-in
136
144
  `should_score_slice(name)` for cost-controlled skipping.
@@ -251,7 +259,11 @@ Local imports inside functions are allowed for:
251
259
  ## 11. Logging
252
260
 
253
261
  Use `logging` (library context — consumers configure handlers). Do not use
254
- `print` in `src/eval_toolkit/`.
262
+ `print` in `src/eval_toolkit/`. Log levels: `DEBUG` for internal events; `INFO`
263
+ only for the rare user-relevant harness progress signal; **`WARNING` is reserved
264
+ for `warnings.warn(...)`, not `logger.warning(...)`**; and **`ERROR` must not
265
+ appear in library code — raise an exception instead**. See CONTRIBUTING.md
266
+ §Logging for the full rationale.
255
267
 
256
268
  ## 12. Docstrings
257
269
 
@@ -333,7 +345,7 @@ restate what the code says.
333
345
  `hypothesis.extra.numpy` for arrays.
334
346
  - **Golden tests** only for `docs.py`, where the output is the contract.
335
347
  - **Doctests** for math/algorithmic kernels.
336
- - **Coverage floor**: 90%.
348
+ - **Coverage floor**: 92%.
337
349
  - **`assert` is fine in tests.**
338
350
 
339
351
  ## 15. Packaging
@@ -359,6 +371,9 @@ restate what the code says.
359
371
 
360
372
  - Every module declares `__all__`.
361
373
  - The package's `__init__.py` re-exports the public surface so both
362
- `from eval_toolkit import pr_auc` and `from eval_toolkit.metrics import pr_auc`
363
- work — matches sklearn/pandas/scipy convention.
374
+ `from eval_toolkit import scorecard` and
375
+ `from eval_toolkit.scorecards import scorecard` work — matches
376
+ sklearn/pandas/scipy convention. (Threshold-dependent scalar metrics
377
+ such as `pr_auc` left the top level at v0.46 Decision L — import
378
+ them from `eval_toolkit.metrics`.)
364
379
  - Private helpers are prefixed with `_` and not re-exported.
@@ -160,7 +160,12 @@ line-length = 100
160
160
  target-version = "py313"
161
161
 
162
162
  [tool.ruff.lint]
163
- select = ["E", "F", "W", "I", "N", "UP", "B", "SIM", "C4"]
163
+ # preview + explicit-preview-rules: enable ONLY the explicitly selected
164
+ # preview rules (PLW1514 implicit-encoding, #97) — no other preview-mode
165
+ # behavior changes. Locks the Windows-cp1252 mojibake class out permanently.
166
+ preview = true
167
+ explicit-preview-rules = true
168
+ select = ["E", "F", "W", "I", "N", "UP", "B", "SIM", "C4", "PLW1514"]
164
169
  ignore = [
165
170
  "E501", # line length handled by black
166
171
  "N803", # function arg lowercase — math kernels use π, T, etc. per Decision 14
@@ -140,6 +140,7 @@ _EXPORTS: dict[str, str] = {
140
140
  "paired_bootstrap_ece_diff": "eval_toolkit.bootstrap",
141
141
  "paired_bootstrap_op_point_diff": "eval_toolkit.bootstrap",
142
142
  "paired_mde": "eval_toolkit.bootstrap",
143
+ "stratified_cluster_bootstrap_ci": "eval_toolkit.bootstrap",
143
144
  # --- calibration ---
144
145
  "DEFAULT_FN_COST": "eval_toolkit.calibration",
145
146
  "DEFAULT_FP_COST": "eval_toolkit.calibration",
@@ -47,7 +47,7 @@ def _cmd_schemas_show(args: argparse.Namespace) -> int:
47
47
  else:
48
48
  print(f"unknown schema: {name}", file=sys.stderr)
49
49
  return 2
50
- print(json.dumps(json.loads(candidate.read_text()), indent=2, sort_keys=True))
50
+ print(json.dumps(json.loads(candidate.read_text(encoding="utf-8")), indent=2, sort_keys=True))
51
51
  return 0
52
52
 
53
53
 
@@ -73,7 +73,7 @@ def _cmd_schemas_check(_args: argparse.Namespace) -> int:
73
73
  failures: list[str] = []
74
74
  for f in files:
75
75
  try:
76
- schema = json.loads(f.read_text())
76
+ schema = json.loads(f.read_text(encoding="utf-8"))
77
77
  Draft202012Validator.check_schema(schema)
78
78
  print(f" {f.name}: OK")
79
79
  except (json.JSONDecodeError, SchemaError) as exc:
@@ -109,8 +109,8 @@ def _cmd_validate(args: argparse.Namespace) -> int:
109
109
  if not file_path.exists():
110
110
  print(f"file not found: {args.file}", file=sys.stderr)
111
111
  return 2
112
- schema = json.loads(schema_path.read_text())
113
- payload = json.loads(file_path.read_text())
112
+ schema = json.loads(schema_path.read_text(encoding="utf-8"))
113
+ payload = json.loads(file_path.read_text(encoding="utf-8"))
114
114
  import jsonschema as _js # noqa: PLC0415
115
115
 
116
116
  try:
@@ -72,15 +72,18 @@ def sweep(
72
72
  **Required to materialize ``asr``** — the documented contract refuses
73
73
  a magic default threshold (cf. ``methodology/thresholds.md``).
74
74
  Rejected when ``scorer`` is ``None`` (``ValueError`` if passed
75
- with ``scorer=None`` to surface the API misuse).
75
+ with ``scorer=None`` to surface the API misuse). Must not be NaN
76
+ (every ``asr`` flag would silently be ``False``); ``±inf`` is
77
+ accepted as a deliberately unsatisfiable sentinel.
76
78
 
77
79
  Returns
78
80
  -------
79
81
  pandas.DataFrame
80
82
  Columns vary by which optional kwargs are passed:
81
83
 
82
- - Always: ``text_id`` (int), ``variant`` (str — from
83
- ``strategy.name``), ``transformed_text`` (str).
84
+ - Always: ``text_id`` (int), ``strategy_id`` (str —
85
+ configured-instance identity, Decision R7-B), ``variant`` (str —
86
+ from ``strategy.name``), ``transformed_text`` (str).
84
87
  - With ``scorer``: also ``original_score`` (float) +
85
88
  ``transformed_score`` (float).
86
89
  - With ``scorer`` AND ``attack_threshold``: also ``asr`` (bool —
@@ -90,9 +93,12 @@ def sweep(
90
93
 
91
94
  Raises
92
95
  ------
96
+ ImportError
97
+ If pandas is not installed (install the ``dataframe`` extra).
93
98
  ValueError
94
99
  - If ``strategies`` is empty.
95
100
  - If ``attack_threshold`` is provided without ``scorer``.
101
+ - If ``attack_threshold`` is NaN.
96
102
  - If any strategy doesn't satisfy ``TextTransform`` structurally
97
103
  (typically a missing ``name`` attribute).
98
104
 
@@ -138,6 +144,13 @@ def sweep(
138
144
  "Either pass scorer=<scorer> + attack_threshold=<float>, "
139
145
  "or omit attack_threshold."
140
146
  )
147
+ # NaN comparisons are all False, so a NaN threshold would silently zero
148
+ # every asr flag. (±inf is semantically valid: an unsatisfiable sentinel.)
149
+ if attack_threshold is not None and np.isnan(attack_threshold):
150
+ raise ValueError(
151
+ "sweep(): attack_threshold is NaN — every asr flag would be False. "
152
+ "Pass a finite threshold (or ±inf as an unsatisfiable sentinel)."
153
+ )
141
154
  for i, strategy in enumerate(strategies):
142
155
  if not (hasattr(strategy, "name") and hasattr(strategy, "transform")):
143
156
  raise ValueError(
@@ -177,8 +190,11 @@ def sweep(
177
190
  "transformed_text": transformed,
178
191
  }
179
192
  if scorer is not None:
180
- assert original_scores is not None
181
- assert transformed_scores is not None
193
+ if original_scores is None or transformed_scores is None: # pragma: no cover
194
+ raise RuntimeError(
195
+ "sweep(): internal invariant violated — batch scores not "
196
+ "materialized despite scorer being set"
197
+ )
182
198
  s_orig = float(original_scores[text_id])
183
199
  s_adv = float(transformed_scores[text_id])
184
200
  row["original_score"] = s_orig
@@ -2,4 +2,4 @@
2
2
 
3
3
  __all__ = ["__version__"]
4
4
 
5
- __version__ = "1.7.0"
5
+ __version__ = "1.9.0"
@@ -66,7 +66,7 @@ class CsvPredictionReader:
66
66
  """
67
67
  wanted = set(columns.values())
68
68
  out: dict[str, list[object]] = {col: [] for col in wanted}
69
- with Path(uri).open(newline="") as fh:
69
+ with Path(uri).open(newline="", encoding="utf-8") as fh:
70
70
  reader = csv.DictReader(fh)
71
71
  # R8-F3: validate the header up-front so missing columns
72
72
  # surface as a clear ValueError rather than as a cryptic
@@ -93,14 +93,40 @@ class JsonlPredictionReader:
93
93
  *,
94
94
  columns: Mapping[str, str],
95
95
  ) -> Mapping[str, Sequence[object]]:
96
- """Read a local JSONL file."""
96
+ """Read a local JSONL file.
97
+
98
+ Raises
99
+ ------
100
+ ValueError
101
+ If any non-blank row is not valid JSON, or is missing (or has
102
+ ``null`` for) a key declared in the ``columns`` mapping.
103
+ Validated at read time — the R8-F3 pattern already applied to
104
+ CSV headers — so a missing ``score`` key surfaces with the file
105
+ path + row number instead of being coerced to NaN deep inside
106
+ the metric computation (or, for ``label``, dying as a
107
+ context-free ``TypeError``).
108
+ """
97
109
  wanted = set(columns.values())
98
110
  out: dict[str, list[object]] = {col: [] for col in wanted}
99
- with Path(uri).open() as fh:
100
- for line in fh:
111
+ with Path(uri).open(encoding="utf-8") as fh:
112
+ for line_no, line in enumerate(fh, start=1):
101
113
  if not line.strip():
102
114
  continue
103
- row = json.loads(line)
115
+ try:
116
+ row = json.loads(line)
117
+ except json.JSONDecodeError as exc:
118
+ # json.loads on a single line always reports "line 1",
119
+ # actively misdirecting on which file row is broken.
120
+ raise ValueError(
121
+ f"JSONL file at {uri!r} row {line_no} is not valid JSON: {exc}"
122
+ ) from exc
123
+ missing = sorted(col for col in wanted if row.get(col) is None)
124
+ if missing:
125
+ raise ValueError(
126
+ f"JSONL file at {uri!r} row {line_no} is missing required "
127
+ f"key(s) {missing} (or they are null); "
128
+ f"available keys: {sorted(row)}"
129
+ )
104
130
  for col in wanted:
105
131
  out[col].append(row.get(col))
106
132
  return out
@@ -117,8 +143,13 @@ def load_prediction_arrays(
117
143
  ------
118
144
  ValueError
119
145
  If ``ref`` lacks a ``columns`` mapping, lacks a non-empty ``uri``,
120
- or its ``columns`` mapping is missing the ``label`` / ``score``
121
- keys (re-raised from :func:`_required_column`).
146
+ its ``columns`` mapping is missing the ``label`` / ``score`` keys
147
+ (re-raised from :func:`_required_column`), the loaded scores
148
+ contain non-finite values (a bare ``NaN`` token in JSONL or a
149
+ ``"nan"`` cell in CSV passes the readers' per-row key checks but
150
+ must not flow into metrics as a silent NaN), or the loaded labels
151
+ are not all in ``{0, 1}`` (an int cast would silently truncate
152
+ ``0.7 → 0``, flipping ground truth).
122
153
  """
123
154
  columns = ref.get("columns")
124
155
  if not isinstance(columns, Mapping):
@@ -131,8 +162,26 @@ def load_prediction_arrays(
131
162
  selected_reader = reader or _reader_for_ref(ref)
132
163
  reader_columns = {str(k): str(v) for k, v in columns.items() if isinstance(v, str)}
133
164
  table = selected_reader.read_predictions(uri, columns=reader_columns)
134
- labels = np.asarray(table[label_col], dtype=int)
165
+ # Load labels as float first: np.asarray(..., dtype=int) silently
166
+ # TRUNCATES numeric non-integers (0.7 → 0), flipping ground truth with
167
+ # in-domain values no downstream gate can catch (v1.9.0 pre-tag review).
168
+ labels_raw = np.asarray(table[label_col], dtype=float)
169
+ bad_labels = ~np.isin(labels_raw, (0.0, 1.0))
170
+ if bad_labels.any():
171
+ first_bad = int(np.flatnonzero(bad_labels)[0])
172
+ raise ValueError(
173
+ f"prediction artifact at {uri!r} column {label_col!r} contains "
174
+ f"non-binary label(s); first bad value {labels_raw[first_bad]!r} "
175
+ f"at data row index {first_bad}"
176
+ )
177
+ labels = labels_raw.astype(int)
135
178
  scores = np.asarray(table[score_col], dtype=float)
179
+ if not np.isfinite(scores).all():
180
+ first_bad = int(np.flatnonzero(~np.isfinite(scores))[0])
181
+ raise ValueError(
182
+ f"prediction artifact at {uri!r} column {score_col!r} contains "
183
+ f"non-finite score(s) (NaN/inf); first at data row index {first_bad}"
184
+ )
136
185
  row_id_col = columns.get("row_id")
137
186
  hash_col = columns.get("content_hash")
138
187
  row_ids = tuple(str(v) for v in table.get(str(row_id_col), ())) if row_id_col else ()
@@ -243,7 +243,10 @@ def write_json_strict(
243
243
  out_path = Path(path)
244
244
  out_path.parent.mkdir(parents=True, exist_ok=True)
245
245
  sanitized = sanitize_for_json(payload)
246
- out_path.write_text(json.dumps(sanitized, indent=indent, sort_keys=sort_keys, allow_nan=False))
246
+ out_path.write_text(
247
+ json.dumps(sanitized, indent=indent, sort_keys=sort_keys, allow_nan=False),
248
+ encoding="utf-8",
249
+ )
247
250
  return out_path
248
251
 
249
252
 
@@ -258,7 +261,7 @@ def validate_payload(payload: object, schema_name: str) -> None:
258
261
  from jsonschema import Draft202012Validator # type: ignore[import-untyped]
259
262
 
260
263
  schema_path = resources.files("eval_toolkit") / "schemas" / schema_name
261
- schema = json.loads(schema_path.read_text())
264
+ schema = json.loads(schema_path.read_text(encoding="utf-8"))
262
265
  Draft202012Validator(schema).validate(sanitize_for_json(payload))
263
266
 
264
267
 
@@ -54,6 +54,7 @@ concept_drift v1.0.4).
54
54
  from __future__ import annotations
55
55
 
56
56
  import re
57
+ import warnings
57
58
  from collections.abc import Callable, Sequence
58
59
  from dataclasses import dataclass
59
60
  from pathlib import Path
@@ -220,7 +221,14 @@ def validate_sister_doc_concept_drift(
220
221
  for path in files_resolved:
221
222
  try:
222
223
  file_texts[path] = path.read_text(encoding="utf-8")
223
- except OSError:
224
+ except (OSError, UnicodeDecodeError) as exc:
225
+ # UnicodeDecodeError is a ValueError, not an OSError — without it a
226
+ # single non-UTF-8 byte would crash the whole scan. Skip unreadable
227
+ # or non-UTF-8 files, but warn so the skip is not silent (STYLE §1).
228
+ warnings.warn(
229
+ f"skipping unreadable file {path}: {exc}",
230
+ stacklevel=2,
231
+ )
224
232
  continue
225
233
 
226
234
  drift_clusters: list[DriftCluster] = []