eval-toolkit 1.5.0__tar.gz → 1.6.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (200)
  1. {eval_toolkit-1.5.0 → eval_toolkit-1.6.0}/CHANGELOG.md +44 -0
  2. {eval_toolkit-1.5.0 → eval_toolkit-1.6.0}/PKG-INFO +1 -1
  3. {eval_toolkit-1.5.0 → eval_toolkit-1.6.0}/src/eval_toolkit/_version.py +1 -1
  4. eval_toolkit-1.6.0/src/eval_toolkit/eda/__init__.py +144 -0
  5. eval_toolkit-1.6.0/src/eval_toolkit/eda/distribution_shift.py +634 -0
  6. eval_toolkit-1.6.0/src/eval_toolkit/eda/lexical_association.py +620 -0
  7. {eval_toolkit-1.5.0 → eval_toolkit-1.6.0}/tests/golden/public_api/snapshot.json +1 -1
  8. eval_toolkit-1.6.0/tests/test_eda_distribution_shift.py +302 -0
  9. eval_toolkit-1.6.0/tests/test_eda_lexical_association.py +340 -0
  10. eval_toolkit-1.5.0/src/eval_toolkit/eda/__init__.py +0 -80
  11. {eval_toolkit-1.5.0 → eval_toolkit-1.6.0}/.gitignore +0 -0
  12. {eval_toolkit-1.5.0 → eval_toolkit-1.6.0}/LICENSE +0 -0
  13. {eval_toolkit-1.5.0 → eval_toolkit-1.6.0}/README.md +0 -0
  14. {eval_toolkit-1.5.0 → eval_toolkit-1.6.0}/STYLE.md +0 -0
  15. {eval_toolkit-1.5.0 → eval_toolkit-1.6.0}/docs/archive/README.md +0 -0
  16. {eval_toolkit-1.5.0 → eval_toolkit-1.6.0}/docs/research/README.md +0 -0
  17. {eval_toolkit-1.5.0 → eval_toolkit-1.6.0}/docs/research/datasets/README.md +0 -0
  18. {eval_toolkit-1.5.0 → eval_toolkit-1.6.0}/docs/research/papers/data-integrity/README.md +0 -0
  19. {eval_toolkit-1.5.0 → eval_toolkit-1.6.0}/docs/research/papers/eval-ecosystem/README.md +0 -0
  20. {eval_toolkit-1.5.0 → eval_toolkit-1.6.0}/docs/research/papers/inference/README.md +0 -0
  21. {eval_toolkit-1.5.0 → eval_toolkit-1.6.0}/docs/research/papers/prompt-injection/README.md +0 -0
  22. {eval_toolkit-1.5.0 → eval_toolkit-1.6.0}/docs/source/adr/README.md +0 -0
  23. {eval_toolkit-1.5.0 → eval_toolkit-1.6.0}/docs/source/methodology/README.md +0 -0
  24. {eval_toolkit-1.5.0 → eval_toolkit-1.6.0}/pyproject.toml +0 -0
  25. {eval_toolkit-1.5.0 → eval_toolkit-1.6.0}/src/eval_toolkit/__init__.py +0 -0
  26. {eval_toolkit-1.5.0 → eval_toolkit-1.6.0}/src/eval_toolkit/__main__.py +0 -0
  27. {eval_toolkit-1.5.0 → eval_toolkit-1.6.0}/src/eval_toolkit/_deprecated.py +0 -0
  28. {eval_toolkit-1.5.0 → eval_toolkit-1.6.0}/src/eval_toolkit/_narrative.py +0 -0
  29. {eval_toolkit-1.5.0 → eval_toolkit-1.6.0}/src/eval_toolkit/_parallel.py +0 -0
  30. {eval_toolkit-1.5.0 → eval_toolkit-1.6.0}/src/eval_toolkit/_rng.py +0 -0
  31. {eval_toolkit-1.5.0 → eval_toolkit-1.6.0}/src/eval_toolkit/_sweep.py +0 -0
  32. {eval_toolkit-1.5.0 → eval_toolkit-1.6.0}/src/eval_toolkit/adversarial.py +0 -0
  33. {eval_toolkit-1.5.0 → eval_toolkit-1.6.0}/src/eval_toolkit/analysis.py +0 -0
  34. {eval_toolkit-1.5.0 → eval_toolkit-1.6.0}/src/eval_toolkit/artifacts.py +0 -0
  35. {eval_toolkit-1.5.0 → eval_toolkit-1.6.0}/src/eval_toolkit/audit_citation_alignment.py +0 -0
  36. {eval_toolkit-1.5.0 → eval_toolkit-1.6.0}/src/eval_toolkit/audit_sister_doc_concept_drift.py +0 -0
  37. {eval_toolkit-1.5.0 → eval_toolkit-1.6.0}/src/eval_toolkit/audit_value_bindings.py +0 -0
  38. {eval_toolkit-1.5.0 → eval_toolkit-1.6.0}/src/eval_toolkit/bootstrap.py +0 -0
  39. {eval_toolkit-1.5.0 → eval_toolkit-1.6.0}/src/eval_toolkit/calibration.py +0 -0
  40. {eval_toolkit-1.5.0 → eval_toolkit-1.6.0}/src/eval_toolkit/claims.py +0 -0
  41. {eval_toolkit-1.5.0 → eval_toolkit-1.6.0}/src/eval_toolkit/config.py +0 -0
  42. {eval_toolkit-1.5.0 → eval_toolkit-1.6.0}/src/eval_toolkit/docs.py +0 -0
  43. {eval_toolkit-1.5.0 → eval_toolkit-1.6.0}/src/eval_toolkit/eda/data_audit.py +0 -0
  44. {eval_toolkit-1.5.0 → eval_toolkit-1.6.0}/src/eval_toolkit/eda/obfuscation.py +0 -0
  45. {eval_toolkit-1.5.0 → eval_toolkit-1.6.0}/src/eval_toolkit/embeddings.py +0 -0
  46. {eval_toolkit-1.5.0 → eval_toolkit-1.6.0}/src/eval_toolkit/evidence.py +0 -0
  47. {eval_toolkit-1.5.0 → eval_toolkit-1.6.0}/src/eval_toolkit/harness.py +0 -0
  48. {eval_toolkit-1.5.0 → eval_toolkit-1.6.0}/src/eval_toolkit/leakage.py +0 -0
  49. {eval_toolkit-1.5.0 → eval_toolkit-1.6.0}/src/eval_toolkit/loaders.py +0 -0
  50. {eval_toolkit-1.5.0 → eval_toolkit-1.6.0}/src/eval_toolkit/losses.py +0 -0
  51. {eval_toolkit-1.5.0 → eval_toolkit-1.6.0}/src/eval_toolkit/manifest.py +0 -0
  52. {eval_toolkit-1.5.0 → eval_toolkit-1.6.0}/src/eval_toolkit/metric_specs.py +0 -0
  53. {eval_toolkit-1.5.0 → eval_toolkit-1.6.0}/src/eval_toolkit/metrics.py +0 -0
  54. {eval_toolkit-1.5.0 → eval_toolkit-1.6.0}/src/eval_toolkit/operating_points.py +0 -0
  55. {eval_toolkit-1.5.0 → eval_toolkit-1.6.0}/src/eval_toolkit/paths.py +0 -0
  56. {eval_toolkit-1.5.0 → eval_toolkit-1.6.0}/src/eval_toolkit/plotting.py +0 -0
  57. {eval_toolkit-1.5.0 → eval_toolkit-1.6.0}/src/eval_toolkit/preprocessing.py +0 -0
  58. {eval_toolkit-1.5.0 → eval_toolkit-1.6.0}/src/eval_toolkit/probes.py +0 -0
  59. {eval_toolkit-1.5.0 → eval_toolkit-1.6.0}/src/eval_toolkit/protocols.py +0 -0
  60. {eval_toolkit-1.5.0 → eval_toolkit-1.6.0}/src/eval_toolkit/provenance.py +0 -0
  61. {eval_toolkit-1.5.0 → eval_toolkit-1.6.0}/src/eval_toolkit/py.typed +0 -0
  62. {eval_toolkit-1.5.0 → eval_toolkit-1.6.0}/src/eval_toolkit/schemas/manifest.v1.json +0 -0
  63. {eval_toolkit-1.5.0 → eval_toolkit-1.6.0}/src/eval_toolkit/schemas/manifest.v2.json +0 -0
  64. {eval_toolkit-1.5.0 → eval_toolkit-1.6.0}/src/eval_toolkit/schemas/manifest.v3.json +0 -0
  65. {eval_toolkit-1.5.0 → eval_toolkit-1.6.0}/src/eval_toolkit/schemas/ood_manifest.v1.json +0 -0
  66. {eval_toolkit-1.5.0 → eval_toolkit-1.6.0}/src/eval_toolkit/schemas/results.v1.json +0 -0
  67. {eval_toolkit-1.5.0 → eval_toolkit-1.6.0}/src/eval_toolkit/schemas/results_full.v1.json +0 -0
  68. {eval_toolkit-1.5.0 → eval_toolkit-1.6.0}/src/eval_toolkit/scorecards.py +0 -0
  69. {eval_toolkit-1.5.0 → eval_toolkit-1.6.0}/src/eval_toolkit/seeds.py +0 -0
  70. {eval_toolkit-1.5.0 → eval_toolkit-1.6.0}/src/eval_toolkit/splits.py +0 -0
  71. {eval_toolkit-1.5.0 → eval_toolkit-1.6.0}/src/eval_toolkit/stacking.py +0 -0
  72. {eval_toolkit-1.5.0 → eval_toolkit-1.6.0}/src/eval_toolkit/text_dedup.py +0 -0
  73. {eval_toolkit-1.5.0 → eval_toolkit-1.6.0}/src/eval_toolkit/thresholds.py +0 -0
  74. {eval_toolkit-1.5.0 → eval_toolkit-1.6.0}/tests/baseline/test_plotting_visual/plot_bootstrap_distribution.png +0 -0
  75. {eval_toolkit-1.5.0 → eval_toolkit-1.6.0}/tests/baseline/test_plotting_visual/plot_confusion_matrix_grid.png +0 -0
  76. {eval_toolkit-1.5.0 → eval_toolkit-1.6.0}/tests/baseline/test_plotting_visual/plot_lift_ci.png +0 -0
  77. {eval_toolkit-1.5.0 → eval_toolkit-1.6.0}/tests/baseline/test_plotting_visual/plot_metric_bars.png +0 -0
  78. {eval_toolkit-1.5.0 → eval_toolkit-1.6.0}/tests/baseline/test_plotting_visual/plot_pareto_frontier.png +0 -0
  79. {eval_toolkit-1.5.0 → eval_toolkit-1.6.0}/tests/baseline/test_plotting_visual/plot_pr_curve.png +0 -0
  80. {eval_toolkit-1.5.0 → eval_toolkit-1.6.0}/tests/baseline/test_plotting_visual/plot_reliability_diagram.png +0 -0
  81. {eval_toolkit-1.5.0 → eval_toolkit-1.6.0}/tests/baseline/test_plotting_visual/plot_roc_curve.png +0 -0
  82. {eval_toolkit-1.5.0 → eval_toolkit-1.6.0}/tests/baseline/test_plotting_visual/plot_score_histograms.png +0 -0
  83. {eval_toolkit-1.5.0 → eval_toolkit-1.6.0}/tests/baseline/test_plotting_visual/plot_slice_metric_heatmap.png +0 -0
  84. {eval_toolkit-1.5.0 → eval_toolkit-1.6.0}/tests/benchmarks/__init__.py +0 -0
  85. {eval_toolkit-1.5.0 → eval_toolkit-1.6.0}/tests/benchmarks/test_kernel_benchmarks.py +0 -0
  86. {eval_toolkit-1.5.0 → eval_toolkit-1.6.0}/tests/conftest.py +0 -0
  87. {eval_toolkit-1.5.0 → eval_toolkit-1.6.0}/tests/golden/bootstrap_ci/cases.json +0 -0
  88. {eval_toolkit-1.5.0 → eval_toolkit-1.6.0}/tests/golden/data/dedup_holdout.jsonl +0 -0
  89. {eval_toolkit-1.5.0 → eval_toolkit-1.6.0}/tests/golden/data/dedup_holdout_expected.json +0 -0
  90. {eval_toolkit-1.5.0 → eval_toolkit-1.6.0}/tests/golden/data/dedup_holdout_provenance.md +0 -0
  91. {eval_toolkit-1.5.0 → eval_toolkit-1.6.0}/tests/golden/docs/expected.md +0 -0
  92. {eval_toolkit-1.5.0 → eval_toolkit-1.6.0}/tests/golden/docs/input.md +0 -0
  93. {eval_toolkit-1.5.0 → eval_toolkit-1.6.0}/tests/golden/docs/metrics.json +0 -0
  94. {eval_toolkit-1.5.0 → eval_toolkit-1.6.0}/tests/golden/test_dedup_holdout_calibration.py +0 -0
  95. {eval_toolkit-1.5.0 → eval_toolkit-1.6.0}/tests/strategies.py +0 -0
  96. {eval_toolkit-1.5.0 → eval_toolkit-1.6.0}/tests/test_adversarial.py +0 -0
  97. {eval_toolkit-1.5.0 → eval_toolkit-1.6.0}/tests/test_analysis.py +0 -0
  98. {eval_toolkit-1.5.0 → eval_toolkit-1.6.0}/tests/test_artifacts.py +0 -0
  99. {eval_toolkit-1.5.0 → eval_toolkit-1.6.0}/tests/test_audit_citation_alignment.py +0 -0
  100. {eval_toolkit-1.5.0 → eval_toolkit-1.6.0}/tests/test_audit_sister_doc_concept_drift.py +0 -0
  101. {eval_toolkit-1.5.0 → eval_toolkit-1.6.0}/tests/test_audit_value_bindings.py +0 -0
  102. {eval_toolkit-1.5.0 → eval_toolkit-1.6.0}/tests/test_block_bootstrap_on_folds.py +0 -0
  103. {eval_toolkit-1.5.0 → eval_toolkit-1.6.0}/tests/test_bootstrap_calibration_mc.py +0 -0
  104. {eval_toolkit-1.5.0 → eval_toolkit-1.6.0}/tests/test_bootstrap_edge_cases.py +0 -0
  105. {eval_toolkit-1.5.0 → eval_toolkit-1.6.0}/tests/test_bootstrap_golden.py +0 -0
  106. {eval_toolkit-1.5.0 → eval_toolkit-1.6.0}/tests/test_bootstrap_njobs.py +0 -0
  107. {eval_toolkit-1.5.0 → eval_toolkit-1.6.0}/tests/test_bootstrap_props.py +0 -0
  108. {eval_toolkit-1.5.0 → eval_toolkit-1.6.0}/tests/test_bootstrap_research_grounded.py +0 -0
  109. {eval_toolkit-1.5.0 → eval_toolkit-1.6.0}/tests/test_bootstrap_unit.py +0 -0
  110. {eval_toolkit-1.5.0 → eval_toolkit-1.6.0}/tests/test_calibration_binary_adapters.py +0 -0
  111. {eval_toolkit-1.5.0 → eval_toolkit-1.6.0}/tests/test_calibration_bootstrap_chain.py +0 -0
  112. {eval_toolkit-1.5.0 → eval_toolkit-1.6.0}/tests/test_calibration_determinism.py +0 -0
  113. {eval_toolkit-1.5.0 → eval_toolkit-1.6.0}/tests/test_calibration_optimization_failures.py +0 -0
  114. {eval_toolkit-1.5.0 → eval_toolkit-1.6.0}/tests/test_calibration_props.py +0 -0
  115. {eval_toolkit-1.5.0 → eval_toolkit-1.6.0}/tests/test_calibration_research_grounded.py +0 -0
  116. {eval_toolkit-1.5.0 → eval_toolkit-1.6.0}/tests/test_calibration_unit.py +0 -0
  117. {eval_toolkit-1.5.0 → eval_toolkit-1.6.0}/tests/test_claims.py +0 -0
  118. {eval_toolkit-1.5.0 → eval_toolkit-1.6.0}/tests/test_claims_coverage.py +0 -0
  119. {eval_toolkit-1.5.0 → eval_toolkit-1.6.0}/tests/test_claims_props.py +0 -0
  120. {eval_toolkit-1.5.0 → eval_toolkit-1.6.0}/tests/test_cli.py +0 -0
  121. {eval_toolkit-1.5.0 → eval_toolkit-1.6.0}/tests/test_config.py +0 -0
  122. {eval_toolkit-1.5.0 → eval_toolkit-1.6.0}/tests/test_coverage_bootstrap.py +0 -0
  123. {eval_toolkit-1.5.0 → eval_toolkit-1.6.0}/tests/test_coverage_calibration.py +0 -0
  124. {eval_toolkit-1.5.0 → eval_toolkit-1.6.0}/tests/test_coverage_harness.py +0 -0
  125. {eval_toolkit-1.5.0 → eval_toolkit-1.6.0}/tests/test_coverage_metrics.py +0 -0
  126. {eval_toolkit-1.5.0 → eval_toolkit-1.6.0}/tests/test_coverage_plotting.py +0 -0
  127. {eval_toolkit-1.5.0 → eval_toolkit-1.6.0}/tests/test_croissant_e2e.py +0 -0
  128. {eval_toolkit-1.5.0 → eval_toolkit-1.6.0}/tests/test_dedup_split_leakage_chain.py +0 -0
  129. {eval_toolkit-1.5.0 → eval_toolkit-1.6.0}/tests/test_deprecated_scalars_shim.py +0 -0
  130. {eval_toolkit-1.5.0 → eval_toolkit-1.6.0}/tests/test_deprecations.py +0 -0
  131. {eval_toolkit-1.5.0 → eval_toolkit-1.6.0}/tests/test_docs_golden.py +0 -0
  132. {eval_toolkit-1.5.0 → eval_toolkit-1.6.0}/tests/test_docs_props.py +0 -0
  133. {eval_toolkit-1.5.0 → eval_toolkit-1.6.0}/tests/test_eda.py +0 -0
  134. {eval_toolkit-1.5.0 → eval_toolkit-1.6.0}/tests/test_eda_obfuscation.py +0 -0
  135. {eval_toolkit-1.5.0 → eval_toolkit-1.6.0}/tests/test_embeddings.py +0 -0
  136. {eval_toolkit-1.5.0 → eval_toolkit-1.6.0}/tests/test_evidence_validators.py +0 -0
  137. {eval_toolkit-1.5.0 → eval_toolkit-1.6.0}/tests/test_harness_edge_cases.py +0 -0
  138. {eval_toolkit-1.5.0 → eval_toolkit-1.6.0}/tests/test_harness_fault_injection.py +0 -0
  139. {eval_toolkit-1.5.0 → eval_toolkit-1.6.0}/tests/test_harness_folded.py +0 -0
  140. {eval_toolkit-1.5.0 → eval_toolkit-1.6.0}/tests/test_harness_internals.py +0 -0
  141. {eval_toolkit-1.5.0 → eval_toolkit-1.6.0}/tests/test_harness_metric_options.py +0 -0
  142. {eval_toolkit-1.5.0 → eval_toolkit-1.6.0}/tests/test_harness_parallelism.py +0 -0
  143. {eval_toolkit-1.5.0 → eval_toolkit-1.6.0}/tests/test_harness_smoke.py +0 -0
  144. {eval_toolkit-1.5.0 → eval_toolkit-1.6.0}/tests/test_import_boundaries.py +0 -0
  145. {eval_toolkit-1.5.0 → eval_toolkit-1.6.0}/tests/test_is_metric_defined_for_slice.py +0 -0
  146. {eval_toolkit-1.5.0 → eval_toolkit-1.6.0}/tests/test_lazy_extras_messages.py +0 -0
  147. {eval_toolkit-1.5.0 → eval_toolkit-1.6.0}/tests/test_leakage.py +0 -0
  148. {eval_toolkit-1.5.0 → eval_toolkit-1.6.0}/tests/test_leakage_error_paths.py +0 -0
  149. {eval_toolkit-1.5.0 → eval_toolkit-1.6.0}/tests/test_leakage_props.py +0 -0
  150. {eval_toolkit-1.5.0 → eval_toolkit-1.6.0}/tests/test_loaders.py +0 -0
  151. {eval_toolkit-1.5.0 → eval_toolkit-1.6.0}/tests/test_loaders_coverage.py +0 -0
  152. {eval_toolkit-1.5.0 → eval_toolkit-1.6.0}/tests/test_loaders_props.py +0 -0
  153. {eval_toolkit-1.5.0 → eval_toolkit-1.6.0}/tests/test_logging.py +0 -0
  154. {eval_toolkit-1.5.0 → eval_toolkit-1.6.0}/tests/test_losses.py +0 -0
  155. {eval_toolkit-1.5.0 → eval_toolkit-1.6.0}/tests/test_manifest.py +0 -0
  156. {eval_toolkit-1.5.0 → eval_toolkit-1.6.0}/tests/test_manifest_contamination_round_trip.py +0 -0
  157. {eval_toolkit-1.5.0 → eval_toolkit-1.6.0}/tests/test_manifest_props.py +0 -0
  158. {eval_toolkit-1.5.0 → eval_toolkit-1.6.0}/tests/test_manifest_validation.py +0 -0
  159. {eval_toolkit-1.5.0 → eval_toolkit-1.6.0}/tests/test_metrics_props.py +0 -0
  160. {eval_toolkit-1.5.0 → eval_toolkit-1.6.0}/tests/test_metrics_stratified_subsets.py +0 -0
  161. {eval_toolkit-1.5.0 → eval_toolkit-1.6.0}/tests/test_metrics_unit.py +0 -0
  162. {eval_toolkit-1.5.0 → eval_toolkit-1.6.0}/tests/test_misc_coverage.py +0 -0
  163. {eval_toolkit-1.5.0 → eval_toolkit-1.6.0}/tests/test_numeric_edge_cases.py +0 -0
  164. {eval_toolkit-1.5.0 → eval_toolkit-1.6.0}/tests/test_ood_loader.py +0 -0
  165. {eval_toolkit-1.5.0 → eval_toolkit-1.6.0}/tests/test_operating_points.py +0 -0
  166. {eval_toolkit-1.5.0 → eval_toolkit-1.6.0}/tests/test_operating_points_props.py +0 -0
  167. {eval_toolkit-1.5.0 → eval_toolkit-1.6.0}/tests/test_parallel.py +0 -0
  168. {eval_toolkit-1.5.0 → eval_toolkit-1.6.0}/tests/test_paths.py +0 -0
  169. {eval_toolkit-1.5.0 → eval_toolkit-1.6.0}/tests/test_pipeline_e2e.py +0 -0
  170. {eval_toolkit-1.5.0 → eval_toolkit-1.6.0}/tests/test_plotting_edge.py +0 -0
  171. {eval_toolkit-1.5.0 → eval_toolkit-1.6.0}/tests/test_plotting_smoke.py +0 -0
  172. {eval_toolkit-1.5.0 → eval_toolkit-1.6.0}/tests/test_plotting_visual.py +0 -0
  173. {eval_toolkit-1.5.0 → eval_toolkit-1.6.0}/tests/test_preprocessing.py +0 -0
  174. {eval_toolkit-1.5.0 → eval_toolkit-1.6.0}/tests/test_probes.py +0 -0
  175. {eval_toolkit-1.5.0 → eval_toolkit-1.6.0}/tests/test_protocol_conformance.py +0 -0
  176. {eval_toolkit-1.5.0 → eval_toolkit-1.6.0}/tests/test_provenance.py +0 -0
  177. {eval_toolkit-1.5.0 → eval_toolkit-1.6.0}/tests/test_public_api.py +0 -0
  178. {eval_toolkit-1.5.0 → eval_toolkit-1.6.0}/tests/test_recall_at_fpr.py +0 -0
  179. {eval_toolkit-1.5.0 → eval_toolkit-1.6.0}/tests/test_reference_equivalence.py +0 -0
  180. {eval_toolkit-1.5.0 → eval_toolkit-1.6.0}/tests/test_reproducibility_integration.py +0 -0
  181. {eval_toolkit-1.5.0 → eval_toolkit-1.6.0}/tests/test_rng.py +0 -0
  182. {eval_toolkit-1.5.0 → eval_toolkit-1.6.0}/tests/test_schemas.py +0 -0
  183. {eval_toolkit-1.5.0 → eval_toolkit-1.6.0}/tests/test_scorecard.py +0 -0
  184. {eval_toolkit-1.5.0 → eval_toolkit-1.6.0}/tests/test_seeds.py +0 -0
  185. {eval_toolkit-1.5.0 → eval_toolkit-1.6.0}/tests/test_splits.py +0 -0
  186. {eval_toolkit-1.5.0 → eval_toolkit-1.6.0}/tests/test_splits_leakage_integration.py +0 -0
  187. {eval_toolkit-1.5.0 → eval_toolkit-1.6.0}/tests/test_splits_props.py +0 -0
  188. {eval_toolkit-1.5.0 → eval_toolkit-1.6.0}/tests/test_stacking.py +0 -0
  189. {eval_toolkit-1.5.0 → eval_toolkit-1.6.0}/tests/test_sweep.py +0 -0
  190. {eval_toolkit-1.5.0 → eval_toolkit-1.6.0}/tests/test_text_dedup.py +0 -0
  191. {eval_toolkit-1.5.0 → eval_toolkit-1.6.0}/tests/test_text_dedup_coverage.py +0 -0
  192. {eval_toolkit-1.5.0 → eval_toolkit-1.6.0}/tests/test_text_dedup_props.py +0 -0
  193. {eval_toolkit-1.5.0 → eval_toolkit-1.6.0}/tests/test_text_dedup_strategies.py +0 -0
  194. {eval_toolkit-1.5.0 → eval_toolkit-1.6.0}/tests/test_thresholds.py +0 -0
  195. {eval_toolkit-1.5.0 → eval_toolkit-1.6.0}/tests/test_thresholds_constant_score.py +0 -0
  196. {eval_toolkit-1.5.0 → eval_toolkit-1.6.0}/tests/test_thresholds_coverage.py +0 -0
  197. {eval_toolkit-1.5.0 → eval_toolkit-1.6.0}/tests/test_thresholds_props.py +0 -0
  198. {eval_toolkit-1.5.0 → eval_toolkit-1.6.0}/tests/test_thresholds_research_grounded.py +0 -0
  199. {eval_toolkit-1.5.0 → eval_toolkit-1.6.0}/tests/test_tokenization_leakage_check.py +0 -0
  200. {eval_toolkit-1.5.0 → eval_toolkit-1.6.0}/tests/test_v09_contracts.py +0 -0
@@ -5,6 +5,50 @@ All notable changes to this project will be documented in this file.
5
5
  The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
6
6
  and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
7
7
 
8
+ ## [1.6.0] — 2026-05-29 — Tier-2 `eda` Job-2 + Job-3: shortcut + shift diagnostics (#86, #87)
9
+
10
+ `eval_toolkit.eda.*` ADDITIVE per [ADR 0003](docs/source/adr/0003-stability-contract-and-gate3-methodology.md) — Tier-2, backward-compatible. Completes the EDA-first analytic layer above the v1.5.0 Job-1 integrity gate: **Job-2** lexical shortcut diagnostics (`lexical_association`, #86) and **Job-3** distribution-shift quantification (`distribution_shift`, #87). Both are dogfooded by the consumer portfolio's pre-modeling OOD-wall prediction (V5 + V9).
11
+
12
+ ### Added — Tier-2 `eda.lexical_association` shortcut diagnostics (Job-2: C1 + C2)
13
+
14
+ `eval_toolkit.eda` ADDITIVE per [ADR 0003](docs/source/adr/0003-stability-contract-and-gate3-methodology.md) — Tier-2, torch-free (NumPy + scikit-learn). The analytic layer above the Job-1 integrity gate: *"is the label recoverable from a surface shortcut that will not transfer out-of-distribution?"*
15
+
16
+ - **C1 — `weighted_log_odds` / `class_lexical_association`:** Monroe, Colaresi & Quinn (2008)
17
+ informative-Dirichlet weighted log-odds-ratio z-scores + smoothed PMI per token, with a
18
+ `min_count` rare-token floor (the V5 pitfall). Returns a `LexicalAssociationResult`
19
+ (`top_a` / `top_b` / `to_dict`); tokens ordered by descending z-score.
20
+ - **C2 — `competency_baselines`:** partial-input baselines (length-only, char-n-gram, BoW)
21
+ fit on a train split and scored on a test split → `CompetencyResult` of per-baseline
22
+ average-precision vs the positive-prevalence floor (the *shortcut floor*; Feng, Wallace &
23
+ Boyd-Graber, ACL 2019 caveat documented). Vectorizers fit on train only (no test leakage);
24
+ empty or single-class train/test raises a diagnostic `ValueError`.
25
+ - Exported via `from eval_toolkit.eda import ...`; 100% line+branch coverage; mypy-strict clean.
26
+
27
+ ### Added — Tier-2 `eda.distribution_shift` covariate-shift quantification (Job-3: E1)
28
+
29
+ `eval_toolkit.eda` ADDITIVE per [ADR 0003](docs/source/adr/0003-stability-contract-and-gate3-methodology.md) — Tier-2. Public functions take **feature matrices**, so the module is base-install-safe (NumPy + SciPy + scikit-learn); embed text first with `eval_toolkit.embeddings.make_minilm_embedder` (`[embeddings]` extra) or any vectorizer.
30
+
31
+ - **`proxy_a_distance`:** Ben-David et al. (2006/2010) PAD = `2(1 − 2ε)` from a **linear**
32
+ domain classifier's **k-fold CV** error, with **fixed strong regularization** (small `C`) —
33
+ *not* the high-`C` RBF-SVM-on-`predict_proba` recipe that overfits to `PAD ≈ 2` at small `n`.
34
+ Optional bootstrap CI.
35
+ - **`maximum_mean_discrepancy`:** Gretton et al. (2012) **unbiased** RBF-kernel MMD² U-statistic +
36
+ **median-heuristic bandwidth** (freezable across folds) + **permutation-test** p-value
37
+ (Phipson & Smyth 2010, `(1+count)/(B+1)`, never zero). Optional bootstrap CI.
38
+ - **`knn_purity`:** mean fraction of each point's k nearest neighbours sharing its domain label.
39
+ - **`median_bandwidth`** helper + the **`distribution_shift`** orchestrator (all three) +
40
+ `PadResult` / `MmdResult` / `KnnPurityResult` / `DistributionShiftResult` dataclasses (`to_dict`).
41
+ - Docstrings carry the pre-registered caveats: distance is **necessary-not-sufficient** for OOD
42
+ collapse (fuse with shortcut-exposure); a non-significant MMD p is not "no shift"; cross-dataset
43
+ distances are ordinal-only (covariate vs label-semantics conflation). 100% line+branch coverage.
44
+
45
+ ### Fixed
46
+
47
+ - **Public-API golden `__version__` drift:** the `v1.5.0` release commit bumped
48
+ `_version.py` to `1.5.0` but did not regenerate `tests/golden/public_api/snapshot.json`,
49
+ which still pinned `'1.4.0'` — leaving `test_public_api_drift_guard` red on `main` (and on
50
+ every branch cut from it). Regenerated the golden (the diff is the `__version__` value only).
51
+
8
52
  ## [1.5.0] — 2026-05-29 — Tier-2 `eda` layer (#83) + schema-aware `HFDatasetsLoader` (#85)
9
53
 
10
54
  Tier-2 / `loaders` ADDITIVE per [ADR 0003](docs/source/adr/0003-stability-contract-and-gate3-methodology.md) — backward-compatible.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: eval-toolkit
3
- Version: 1.5.0
3
+ Version: 1.6.0
4
4
  Summary: Reusable evaluation contracts for binary classification: metrics, bootstrap CIs, calibration, artifacts, and evidence gates.
5
5
  Project-URL: Homepage, https://github.com/brandon-behring/eval-toolkit
6
6
  Project-URL: Documentation, https://brandon-behring.github.io/eval-toolkit/
@@ -2,4 +2,4 @@
2
2
 
3
3
  __all__ = ["__version__"]
4
4
 
5
- __version__ = "1.5.0"
5
+ __version__ = "1.6.0"
@@ -0,0 +1,144 @@
1
+ """``eval_toolkit.eda`` — EDA-first integrity, shortcut + shift diagnostics (Tier-2).
2
+
3
+ This subpackage hosts the EDA layer of an EDA-first research program. The
4
+ **Job-1 integrity gate** is thin, composable, torch-free per-split profiling +
5
+ dataset-soundness gates, built by reusing the v1.4.0 :mod:`eval_toolkit.leakage`,
6
+ :mod:`~eval_toolkit.text_dedup`, :mod:`~eval_toolkit.claims`, and
7
+ :mod:`~eval_toolkit.artifacts` primitives; the Job-2 / Job-3 diagnostics sit above it.
8
+
9
+ Stability tier
10
+ --------------
11
+ Public access is ``eval_toolkit.eda.*`` — **Tier-2** per ADR 0003. This layer
12
+ is intentionally evolvable and is **not** part of the v2.0-frozen top-level
13
+ :mod:`eval_toolkit` surface; nothing here is added to the package-root
14
+ ``_EXPORTS`` / ``__all__``. Import explicitly::
15
+
16
+ from eval_toolkit.eda import audit_dataset, DataAudit, SplitSummary
17
+
18
+ Scope
19
+ -----
20
+ - **Job-1 integrity gate** (``data_audit`` + ``obfuscation``): row counts, class
21
+ balance, text-length quantiles, dedup / cross-split leakage, obfuscation
22
+ prevalence.
23
+ - **Job-2 lexical shortcut diagnostics** (``lexical_association``): weighted
24
+ log-odds + PMI (C1) and partial-input / competency baselines (C2) — torch-free
25
+ (NumPy + scikit-learn).
26
+ - **Job-3 distribution shift** (``distribution_shift``): proxy-A-distance, MMD
27
+ (permutation-tested), and kNN purity (E1) — operates on feature matrices, so
28
+ still base-install-safe (NumPy + SciPy + scikit-learn).
29
+
30
+ The shift functions take **feature matrices**, not text — embed first with
31
+ :func:`eval_toolkit.embeddings.make_minilm_embedder` (the optional
32
+ ``[embeddings]`` extra) or any vectorizer. UMAP / 2-D projections stay caller-side.
33
+ """
34
+
35
+ from __future__ import annotations
36
+
37
+ from eval_toolkit.eda.data_audit import (
38
+ DEFAULT_MAX_NEG_POS_RATIO,
39
+ DEFAULT_MIN_NEG_POS_RATIO,
40
+ DEFAULT_PCT_OVER_CONTEXT_THRESHOLD,
41
+ EDA_AUDIT_SCHEMA_VERSION,
42
+ DataAudit,
43
+ SplitSummary,
44
+ Tokenizer,
45
+ audit_dataset,
46
+ class_balance,
47
+ length_quantiles,
48
+ summarize_split,
49
+ )
50
+ from eval_toolkit.eda.distribution_shift import (
51
+ DEFAULT_KNN_K,
52
+ DEFAULT_MMD_PERMUTATIONS,
53
+ DEFAULT_PAD_C,
54
+ DEFAULT_PAD_FOLDS,
55
+ DistributionShiftResult,
56
+ KnnPurityResult,
57
+ MmdResult,
58
+ PadResult,
59
+ distribution_shift,
60
+ knn_purity,
61
+ maximum_mean_discrepancy,
62
+ median_bandwidth,
63
+ proxy_a_distance,
64
+ )
65
+ from eval_toolkit.eda.lexical_association import (
66
+ DEFAULT_CHAR_NGRAM_RANGE,
67
+ DEFAULT_MIN_COUNT,
68
+ DEFAULT_PRIOR_SCALE,
69
+ BaselineScore,
70
+ CompetencyResult,
71
+ LexicalAssociationResult,
72
+ StrTokenizer,
73
+ class_lexical_association,
74
+ competency_baselines,
75
+ default_tokenizer,
76
+ weighted_log_odds,
77
+ )
78
+ from eval_toolkit.eda.obfuscation import (
79
+ BASE64_ENTROPY_THRESHOLD,
80
+ HEX_ENTROPY_THRESHOLD,
81
+ ObfuscationProfile,
82
+ analyze_obfuscation,
83
+ count_invisible_chars,
84
+ has_high_entropy_alnum_run,
85
+ has_rot13_marker,
86
+ is_leeted_token,
87
+ leetspeak_counts,
88
+ nfkc_changed,
89
+ nfkc_char_delta,
90
+ shannon_entropy,
91
+ )
92
+
93
+ __all__ = [
94
+ # --- constants ---
95
+ "BASE64_ENTROPY_THRESHOLD",
96
+ "DEFAULT_CHAR_NGRAM_RANGE",
97
+ "DEFAULT_KNN_K",
98
+ "DEFAULT_MAX_NEG_POS_RATIO",
99
+ "DEFAULT_MIN_COUNT",
100
+ "DEFAULT_MIN_NEG_POS_RATIO",
101
+ "DEFAULT_MMD_PERMUTATIONS",
102
+ "DEFAULT_PAD_C",
103
+ "DEFAULT_PAD_FOLDS",
104
+ "DEFAULT_PCT_OVER_CONTEXT_THRESHOLD",
105
+ "DEFAULT_PRIOR_SCALE",
106
+ "EDA_AUDIT_SCHEMA_VERSION",
107
+ "HEX_ENTROPY_THRESHOLD",
108
+ # --- classes / type aliases ---
109
+ "BaselineScore",
110
+ "CompetencyResult",
111
+ "DataAudit",
112
+ "DistributionShiftResult",
113
+ "KnnPurityResult",
114
+ "LexicalAssociationResult",
115
+ "MmdResult",
116
+ "ObfuscationProfile",
117
+ "PadResult",
118
+ "SplitSummary",
119
+ "StrTokenizer",
120
+ "Tokenizer",
121
+ # --- functions ---
122
+ "analyze_obfuscation",
123
+ "audit_dataset",
124
+ "class_balance",
125
+ "class_lexical_association",
126
+ "competency_baselines",
127
+ "count_invisible_chars",
128
+ "default_tokenizer",
129
+ "distribution_shift",
130
+ "has_high_entropy_alnum_run",
131
+ "has_rot13_marker",
132
+ "is_leeted_token",
133
+ "knn_purity",
134
+ "leetspeak_counts",
135
+ "length_quantiles",
136
+ "maximum_mean_discrepancy",
137
+ "median_bandwidth",
138
+ "nfkc_char_delta",
139
+ "nfkc_changed",
140
+ "proxy_a_distance",
141
+ "shannon_entropy",
142
+ "summarize_split",
143
+ "weighted_log_odds",
144
+ ]