eval-toolkit 1.5.0__tar.gz → 1.7.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (201) hide show
  1. {eval_toolkit-1.5.0 → eval_toolkit-1.7.0}/CHANGELOG.md +59 -0
  2. {eval_toolkit-1.5.0 → eval_toolkit-1.7.0}/PKG-INFO +1 -1
  3. {eval_toolkit-1.5.0 → eval_toolkit-1.7.0}/src/eval_toolkit/__init__.py +1 -0
  4. {eval_toolkit-1.5.0 → eval_toolkit-1.7.0}/src/eval_toolkit/_version.py +1 -1
  5. {eval_toolkit-1.5.0 → eval_toolkit-1.7.0}/src/eval_toolkit/bootstrap.py +200 -0
  6. eval_toolkit-1.7.0/src/eval_toolkit/eda/__init__.py +144 -0
  7. eval_toolkit-1.7.0/src/eval_toolkit/eda/distribution_shift.py +634 -0
  8. eval_toolkit-1.7.0/src/eval_toolkit/eda/lexical_association.py +620 -0
  9. {eval_toolkit-1.5.0 → eval_toolkit-1.7.0}/tests/benchmarks/test_kernel_benchmarks.py +2 -2
  10. {eval_toolkit-1.5.0 → eval_toolkit-1.7.0}/tests/golden/public_api/snapshot.json +7 -1
  11. eval_toolkit-1.7.0/tests/test_cluster_bootstrap.py +131 -0
  12. eval_toolkit-1.7.0/tests/test_eda_distribution_shift.py +302 -0
  13. eval_toolkit-1.7.0/tests/test_eda_lexical_association.py +340 -0
  14. eval_toolkit-1.5.0/src/eval_toolkit/eda/__init__.py +0 -80
  15. {eval_toolkit-1.5.0 → eval_toolkit-1.7.0}/.gitignore +0 -0
  16. {eval_toolkit-1.5.0 → eval_toolkit-1.7.0}/LICENSE +0 -0
  17. {eval_toolkit-1.5.0 → eval_toolkit-1.7.0}/README.md +0 -0
  18. {eval_toolkit-1.5.0 → eval_toolkit-1.7.0}/STYLE.md +0 -0
  19. {eval_toolkit-1.5.0 → eval_toolkit-1.7.0}/docs/archive/README.md +0 -0
  20. {eval_toolkit-1.5.0 → eval_toolkit-1.7.0}/docs/research/README.md +0 -0
  21. {eval_toolkit-1.5.0 → eval_toolkit-1.7.0}/docs/research/datasets/README.md +0 -0
  22. {eval_toolkit-1.5.0 → eval_toolkit-1.7.0}/docs/research/papers/data-integrity/README.md +0 -0
  23. {eval_toolkit-1.5.0 → eval_toolkit-1.7.0}/docs/research/papers/eval-ecosystem/README.md +0 -0
  24. {eval_toolkit-1.5.0 → eval_toolkit-1.7.0}/docs/research/papers/inference/README.md +0 -0
  25. {eval_toolkit-1.5.0 → eval_toolkit-1.7.0}/docs/research/papers/prompt-injection/README.md +0 -0
  26. {eval_toolkit-1.5.0 → eval_toolkit-1.7.0}/docs/source/adr/README.md +0 -0
  27. {eval_toolkit-1.5.0 → eval_toolkit-1.7.0}/docs/source/methodology/README.md +0 -0
  28. {eval_toolkit-1.5.0 → eval_toolkit-1.7.0}/pyproject.toml +0 -0
  29. {eval_toolkit-1.5.0 → eval_toolkit-1.7.0}/src/eval_toolkit/__main__.py +0 -0
  30. {eval_toolkit-1.5.0 → eval_toolkit-1.7.0}/src/eval_toolkit/_deprecated.py +0 -0
  31. {eval_toolkit-1.5.0 → eval_toolkit-1.7.0}/src/eval_toolkit/_narrative.py +0 -0
  32. {eval_toolkit-1.5.0 → eval_toolkit-1.7.0}/src/eval_toolkit/_parallel.py +0 -0
  33. {eval_toolkit-1.5.0 → eval_toolkit-1.7.0}/src/eval_toolkit/_rng.py +0 -0
  34. {eval_toolkit-1.5.0 → eval_toolkit-1.7.0}/src/eval_toolkit/_sweep.py +0 -0
  35. {eval_toolkit-1.5.0 → eval_toolkit-1.7.0}/src/eval_toolkit/adversarial.py +0 -0
  36. {eval_toolkit-1.5.0 → eval_toolkit-1.7.0}/src/eval_toolkit/analysis.py +0 -0
  37. {eval_toolkit-1.5.0 → eval_toolkit-1.7.0}/src/eval_toolkit/artifacts.py +0 -0
  38. {eval_toolkit-1.5.0 → eval_toolkit-1.7.0}/src/eval_toolkit/audit_citation_alignment.py +0 -0
  39. {eval_toolkit-1.5.0 → eval_toolkit-1.7.0}/src/eval_toolkit/audit_sister_doc_concept_drift.py +0 -0
  40. {eval_toolkit-1.5.0 → eval_toolkit-1.7.0}/src/eval_toolkit/audit_value_bindings.py +0 -0
  41. {eval_toolkit-1.5.0 → eval_toolkit-1.7.0}/src/eval_toolkit/calibration.py +0 -0
  42. {eval_toolkit-1.5.0 → eval_toolkit-1.7.0}/src/eval_toolkit/claims.py +0 -0
  43. {eval_toolkit-1.5.0 → eval_toolkit-1.7.0}/src/eval_toolkit/config.py +0 -0
  44. {eval_toolkit-1.5.0 → eval_toolkit-1.7.0}/src/eval_toolkit/docs.py +0 -0
  45. {eval_toolkit-1.5.0 → eval_toolkit-1.7.0}/src/eval_toolkit/eda/data_audit.py +0 -0
  46. {eval_toolkit-1.5.0 → eval_toolkit-1.7.0}/src/eval_toolkit/eda/obfuscation.py +0 -0
  47. {eval_toolkit-1.5.0 → eval_toolkit-1.7.0}/src/eval_toolkit/embeddings.py +0 -0
  48. {eval_toolkit-1.5.0 → eval_toolkit-1.7.0}/src/eval_toolkit/evidence.py +0 -0
  49. {eval_toolkit-1.5.0 → eval_toolkit-1.7.0}/src/eval_toolkit/harness.py +0 -0
  50. {eval_toolkit-1.5.0 → eval_toolkit-1.7.0}/src/eval_toolkit/leakage.py +0 -0
  51. {eval_toolkit-1.5.0 → eval_toolkit-1.7.0}/src/eval_toolkit/loaders.py +0 -0
  52. {eval_toolkit-1.5.0 → eval_toolkit-1.7.0}/src/eval_toolkit/losses.py +0 -0
  53. {eval_toolkit-1.5.0 → eval_toolkit-1.7.0}/src/eval_toolkit/manifest.py +0 -0
  54. {eval_toolkit-1.5.0 → eval_toolkit-1.7.0}/src/eval_toolkit/metric_specs.py +0 -0
  55. {eval_toolkit-1.5.0 → eval_toolkit-1.7.0}/src/eval_toolkit/metrics.py +0 -0
  56. {eval_toolkit-1.5.0 → eval_toolkit-1.7.0}/src/eval_toolkit/operating_points.py +0 -0
  57. {eval_toolkit-1.5.0 → eval_toolkit-1.7.0}/src/eval_toolkit/paths.py +0 -0
  58. {eval_toolkit-1.5.0 → eval_toolkit-1.7.0}/src/eval_toolkit/plotting.py +0 -0
  59. {eval_toolkit-1.5.0 → eval_toolkit-1.7.0}/src/eval_toolkit/preprocessing.py +0 -0
  60. {eval_toolkit-1.5.0 → eval_toolkit-1.7.0}/src/eval_toolkit/probes.py +0 -0
  61. {eval_toolkit-1.5.0 → eval_toolkit-1.7.0}/src/eval_toolkit/protocols.py +0 -0
  62. {eval_toolkit-1.5.0 → eval_toolkit-1.7.0}/src/eval_toolkit/provenance.py +0 -0
  63. {eval_toolkit-1.5.0 → eval_toolkit-1.7.0}/src/eval_toolkit/py.typed +0 -0
  64. {eval_toolkit-1.5.0 → eval_toolkit-1.7.0}/src/eval_toolkit/schemas/manifest.v1.json +0 -0
  65. {eval_toolkit-1.5.0 → eval_toolkit-1.7.0}/src/eval_toolkit/schemas/manifest.v2.json +0 -0
  66. {eval_toolkit-1.5.0 → eval_toolkit-1.7.0}/src/eval_toolkit/schemas/manifest.v3.json +0 -0
  67. {eval_toolkit-1.5.0 → eval_toolkit-1.7.0}/src/eval_toolkit/schemas/ood_manifest.v1.json +0 -0
  68. {eval_toolkit-1.5.0 → eval_toolkit-1.7.0}/src/eval_toolkit/schemas/results.v1.json +0 -0
  69. {eval_toolkit-1.5.0 → eval_toolkit-1.7.0}/src/eval_toolkit/schemas/results_full.v1.json +0 -0
  70. {eval_toolkit-1.5.0 → eval_toolkit-1.7.0}/src/eval_toolkit/scorecards.py +0 -0
  71. {eval_toolkit-1.5.0 → eval_toolkit-1.7.0}/src/eval_toolkit/seeds.py +0 -0
  72. {eval_toolkit-1.5.0 → eval_toolkit-1.7.0}/src/eval_toolkit/splits.py +0 -0
  73. {eval_toolkit-1.5.0 → eval_toolkit-1.7.0}/src/eval_toolkit/stacking.py +0 -0
  74. {eval_toolkit-1.5.0 → eval_toolkit-1.7.0}/src/eval_toolkit/text_dedup.py +0 -0
  75. {eval_toolkit-1.5.0 → eval_toolkit-1.7.0}/src/eval_toolkit/thresholds.py +0 -0
  76. {eval_toolkit-1.5.0 → eval_toolkit-1.7.0}/tests/baseline/test_plotting_visual/plot_bootstrap_distribution.png +0 -0
  77. {eval_toolkit-1.5.0 → eval_toolkit-1.7.0}/tests/baseline/test_plotting_visual/plot_confusion_matrix_grid.png +0 -0
  78. {eval_toolkit-1.5.0 → eval_toolkit-1.7.0}/tests/baseline/test_plotting_visual/plot_lift_ci.png +0 -0
  79. {eval_toolkit-1.5.0 → eval_toolkit-1.7.0}/tests/baseline/test_plotting_visual/plot_metric_bars.png +0 -0
  80. {eval_toolkit-1.5.0 → eval_toolkit-1.7.0}/tests/baseline/test_plotting_visual/plot_pareto_frontier.png +0 -0
  81. {eval_toolkit-1.5.0 → eval_toolkit-1.7.0}/tests/baseline/test_plotting_visual/plot_pr_curve.png +0 -0
  82. {eval_toolkit-1.5.0 → eval_toolkit-1.7.0}/tests/baseline/test_plotting_visual/plot_reliability_diagram.png +0 -0
  83. {eval_toolkit-1.5.0 → eval_toolkit-1.7.0}/tests/baseline/test_plotting_visual/plot_roc_curve.png +0 -0
  84. {eval_toolkit-1.5.0 → eval_toolkit-1.7.0}/tests/baseline/test_plotting_visual/plot_score_histograms.png +0 -0
  85. {eval_toolkit-1.5.0 → eval_toolkit-1.7.0}/tests/baseline/test_plotting_visual/plot_slice_metric_heatmap.png +0 -0
  86. {eval_toolkit-1.5.0 → eval_toolkit-1.7.0}/tests/benchmarks/__init__.py +0 -0
  87. {eval_toolkit-1.5.0 → eval_toolkit-1.7.0}/tests/conftest.py +0 -0
  88. {eval_toolkit-1.5.0 → eval_toolkit-1.7.0}/tests/golden/bootstrap_ci/cases.json +0 -0
  89. {eval_toolkit-1.5.0 → eval_toolkit-1.7.0}/tests/golden/data/dedup_holdout.jsonl +0 -0
  90. {eval_toolkit-1.5.0 → eval_toolkit-1.7.0}/tests/golden/data/dedup_holdout_expected.json +0 -0
  91. {eval_toolkit-1.5.0 → eval_toolkit-1.7.0}/tests/golden/data/dedup_holdout_provenance.md +0 -0
  92. {eval_toolkit-1.5.0 → eval_toolkit-1.7.0}/tests/golden/docs/expected.md +0 -0
  93. {eval_toolkit-1.5.0 → eval_toolkit-1.7.0}/tests/golden/docs/input.md +0 -0
  94. {eval_toolkit-1.5.0 → eval_toolkit-1.7.0}/tests/golden/docs/metrics.json +0 -0
  95. {eval_toolkit-1.5.0 → eval_toolkit-1.7.0}/tests/golden/test_dedup_holdout_calibration.py +0 -0
  96. {eval_toolkit-1.5.0 → eval_toolkit-1.7.0}/tests/strategies.py +0 -0
  97. {eval_toolkit-1.5.0 → eval_toolkit-1.7.0}/tests/test_adversarial.py +0 -0
  98. {eval_toolkit-1.5.0 → eval_toolkit-1.7.0}/tests/test_analysis.py +0 -0
  99. {eval_toolkit-1.5.0 → eval_toolkit-1.7.0}/tests/test_artifacts.py +0 -0
  100. {eval_toolkit-1.5.0 → eval_toolkit-1.7.0}/tests/test_audit_citation_alignment.py +0 -0
  101. {eval_toolkit-1.5.0 → eval_toolkit-1.7.0}/tests/test_audit_sister_doc_concept_drift.py +0 -0
  102. {eval_toolkit-1.5.0 → eval_toolkit-1.7.0}/tests/test_audit_value_bindings.py +0 -0
  103. {eval_toolkit-1.5.0 → eval_toolkit-1.7.0}/tests/test_block_bootstrap_on_folds.py +0 -0
  104. {eval_toolkit-1.5.0 → eval_toolkit-1.7.0}/tests/test_bootstrap_calibration_mc.py +0 -0
  105. {eval_toolkit-1.5.0 → eval_toolkit-1.7.0}/tests/test_bootstrap_edge_cases.py +0 -0
  106. {eval_toolkit-1.5.0 → eval_toolkit-1.7.0}/tests/test_bootstrap_golden.py +0 -0
  107. {eval_toolkit-1.5.0 → eval_toolkit-1.7.0}/tests/test_bootstrap_njobs.py +0 -0
  108. {eval_toolkit-1.5.0 → eval_toolkit-1.7.0}/tests/test_bootstrap_props.py +0 -0
  109. {eval_toolkit-1.5.0 → eval_toolkit-1.7.0}/tests/test_bootstrap_research_grounded.py +0 -0
  110. {eval_toolkit-1.5.0 → eval_toolkit-1.7.0}/tests/test_bootstrap_unit.py +0 -0
  111. {eval_toolkit-1.5.0 → eval_toolkit-1.7.0}/tests/test_calibration_binary_adapters.py +0 -0
  112. {eval_toolkit-1.5.0 → eval_toolkit-1.7.0}/tests/test_calibration_bootstrap_chain.py +0 -0
  113. {eval_toolkit-1.5.0 → eval_toolkit-1.7.0}/tests/test_calibration_determinism.py +0 -0
  114. {eval_toolkit-1.5.0 → eval_toolkit-1.7.0}/tests/test_calibration_optimization_failures.py +0 -0
  115. {eval_toolkit-1.5.0 → eval_toolkit-1.7.0}/tests/test_calibration_props.py +0 -0
  116. {eval_toolkit-1.5.0 → eval_toolkit-1.7.0}/tests/test_calibration_research_grounded.py +0 -0
  117. {eval_toolkit-1.5.0 → eval_toolkit-1.7.0}/tests/test_calibration_unit.py +0 -0
  118. {eval_toolkit-1.5.0 → eval_toolkit-1.7.0}/tests/test_claims.py +0 -0
  119. {eval_toolkit-1.5.0 → eval_toolkit-1.7.0}/tests/test_claims_coverage.py +0 -0
  120. {eval_toolkit-1.5.0 → eval_toolkit-1.7.0}/tests/test_claims_props.py +0 -0
  121. {eval_toolkit-1.5.0 → eval_toolkit-1.7.0}/tests/test_cli.py +0 -0
  122. {eval_toolkit-1.5.0 → eval_toolkit-1.7.0}/tests/test_config.py +0 -0
  123. {eval_toolkit-1.5.0 → eval_toolkit-1.7.0}/tests/test_coverage_bootstrap.py +0 -0
  124. {eval_toolkit-1.5.0 → eval_toolkit-1.7.0}/tests/test_coverage_calibration.py +0 -0
  125. {eval_toolkit-1.5.0 → eval_toolkit-1.7.0}/tests/test_coverage_harness.py +0 -0
  126. {eval_toolkit-1.5.0 → eval_toolkit-1.7.0}/tests/test_coverage_metrics.py +0 -0
  127. {eval_toolkit-1.5.0 → eval_toolkit-1.7.0}/tests/test_coverage_plotting.py +0 -0
  128. {eval_toolkit-1.5.0 → eval_toolkit-1.7.0}/tests/test_croissant_e2e.py +0 -0
  129. {eval_toolkit-1.5.0 → eval_toolkit-1.7.0}/tests/test_dedup_split_leakage_chain.py +0 -0
  130. {eval_toolkit-1.5.0 → eval_toolkit-1.7.0}/tests/test_deprecated_scalars_shim.py +0 -0
  131. {eval_toolkit-1.5.0 → eval_toolkit-1.7.0}/tests/test_deprecations.py +0 -0
  132. {eval_toolkit-1.5.0 → eval_toolkit-1.7.0}/tests/test_docs_golden.py +0 -0
  133. {eval_toolkit-1.5.0 → eval_toolkit-1.7.0}/tests/test_docs_props.py +0 -0
  134. {eval_toolkit-1.5.0 → eval_toolkit-1.7.0}/tests/test_eda.py +0 -0
  135. {eval_toolkit-1.5.0 → eval_toolkit-1.7.0}/tests/test_eda_obfuscation.py +0 -0
  136. {eval_toolkit-1.5.0 → eval_toolkit-1.7.0}/tests/test_embeddings.py +0 -0
  137. {eval_toolkit-1.5.0 → eval_toolkit-1.7.0}/tests/test_evidence_validators.py +0 -0
  138. {eval_toolkit-1.5.0 → eval_toolkit-1.7.0}/tests/test_harness_edge_cases.py +0 -0
  139. {eval_toolkit-1.5.0 → eval_toolkit-1.7.0}/tests/test_harness_fault_injection.py +0 -0
  140. {eval_toolkit-1.5.0 → eval_toolkit-1.7.0}/tests/test_harness_folded.py +0 -0
  141. {eval_toolkit-1.5.0 → eval_toolkit-1.7.0}/tests/test_harness_internals.py +0 -0
  142. {eval_toolkit-1.5.0 → eval_toolkit-1.7.0}/tests/test_harness_metric_options.py +0 -0
  143. {eval_toolkit-1.5.0 → eval_toolkit-1.7.0}/tests/test_harness_parallelism.py +0 -0
  144. {eval_toolkit-1.5.0 → eval_toolkit-1.7.0}/tests/test_harness_smoke.py +0 -0
  145. {eval_toolkit-1.5.0 → eval_toolkit-1.7.0}/tests/test_import_boundaries.py +0 -0
  146. {eval_toolkit-1.5.0 → eval_toolkit-1.7.0}/tests/test_is_metric_defined_for_slice.py +0 -0
  147. {eval_toolkit-1.5.0 → eval_toolkit-1.7.0}/tests/test_lazy_extras_messages.py +0 -0
  148. {eval_toolkit-1.5.0 → eval_toolkit-1.7.0}/tests/test_leakage.py +0 -0
  149. {eval_toolkit-1.5.0 → eval_toolkit-1.7.0}/tests/test_leakage_error_paths.py +0 -0
  150. {eval_toolkit-1.5.0 → eval_toolkit-1.7.0}/tests/test_leakage_props.py +0 -0
  151. {eval_toolkit-1.5.0 → eval_toolkit-1.7.0}/tests/test_loaders.py +0 -0
  152. {eval_toolkit-1.5.0 → eval_toolkit-1.7.0}/tests/test_loaders_coverage.py +0 -0
  153. {eval_toolkit-1.5.0 → eval_toolkit-1.7.0}/tests/test_loaders_props.py +0 -0
  154. {eval_toolkit-1.5.0 → eval_toolkit-1.7.0}/tests/test_logging.py +0 -0
  155. {eval_toolkit-1.5.0 → eval_toolkit-1.7.0}/tests/test_losses.py +0 -0
  156. {eval_toolkit-1.5.0 → eval_toolkit-1.7.0}/tests/test_manifest.py +0 -0
  157. {eval_toolkit-1.5.0 → eval_toolkit-1.7.0}/tests/test_manifest_contamination_round_trip.py +0 -0
  158. {eval_toolkit-1.5.0 → eval_toolkit-1.7.0}/tests/test_manifest_props.py +0 -0
  159. {eval_toolkit-1.5.0 → eval_toolkit-1.7.0}/tests/test_manifest_validation.py +0 -0
  160. {eval_toolkit-1.5.0 → eval_toolkit-1.7.0}/tests/test_metrics_props.py +0 -0
  161. {eval_toolkit-1.5.0 → eval_toolkit-1.7.0}/tests/test_metrics_stratified_subsets.py +0 -0
  162. {eval_toolkit-1.5.0 → eval_toolkit-1.7.0}/tests/test_metrics_unit.py +0 -0
  163. {eval_toolkit-1.5.0 → eval_toolkit-1.7.0}/tests/test_misc_coverage.py +0 -0
  164. {eval_toolkit-1.5.0 → eval_toolkit-1.7.0}/tests/test_numeric_edge_cases.py +0 -0
  165. {eval_toolkit-1.5.0 → eval_toolkit-1.7.0}/tests/test_ood_loader.py +0 -0
  166. {eval_toolkit-1.5.0 → eval_toolkit-1.7.0}/tests/test_operating_points.py +0 -0
  167. {eval_toolkit-1.5.0 → eval_toolkit-1.7.0}/tests/test_operating_points_props.py +0 -0
  168. {eval_toolkit-1.5.0 → eval_toolkit-1.7.0}/tests/test_parallel.py +0 -0
  169. {eval_toolkit-1.5.0 → eval_toolkit-1.7.0}/tests/test_paths.py +0 -0
  170. {eval_toolkit-1.5.0 → eval_toolkit-1.7.0}/tests/test_pipeline_e2e.py +0 -0
  171. {eval_toolkit-1.5.0 → eval_toolkit-1.7.0}/tests/test_plotting_edge.py +0 -0
  172. {eval_toolkit-1.5.0 → eval_toolkit-1.7.0}/tests/test_plotting_smoke.py +0 -0
  173. {eval_toolkit-1.5.0 → eval_toolkit-1.7.0}/tests/test_plotting_visual.py +0 -0
  174. {eval_toolkit-1.5.0 → eval_toolkit-1.7.0}/tests/test_preprocessing.py +0 -0
  175. {eval_toolkit-1.5.0 → eval_toolkit-1.7.0}/tests/test_probes.py +0 -0
  176. {eval_toolkit-1.5.0 → eval_toolkit-1.7.0}/tests/test_protocol_conformance.py +0 -0
  177. {eval_toolkit-1.5.0 → eval_toolkit-1.7.0}/tests/test_provenance.py +0 -0
  178. {eval_toolkit-1.5.0 → eval_toolkit-1.7.0}/tests/test_public_api.py +0 -0
  179. {eval_toolkit-1.5.0 → eval_toolkit-1.7.0}/tests/test_recall_at_fpr.py +0 -0
  180. {eval_toolkit-1.5.0 → eval_toolkit-1.7.0}/tests/test_reference_equivalence.py +0 -0
  181. {eval_toolkit-1.5.0 → eval_toolkit-1.7.0}/tests/test_reproducibility_integration.py +0 -0
  182. {eval_toolkit-1.5.0 → eval_toolkit-1.7.0}/tests/test_rng.py +0 -0
  183. {eval_toolkit-1.5.0 → eval_toolkit-1.7.0}/tests/test_schemas.py +0 -0
  184. {eval_toolkit-1.5.0 → eval_toolkit-1.7.0}/tests/test_scorecard.py +0 -0
  185. {eval_toolkit-1.5.0 → eval_toolkit-1.7.0}/tests/test_seeds.py +0 -0
  186. {eval_toolkit-1.5.0 → eval_toolkit-1.7.0}/tests/test_splits.py +0 -0
  187. {eval_toolkit-1.5.0 → eval_toolkit-1.7.0}/tests/test_splits_leakage_integration.py +0 -0
  188. {eval_toolkit-1.5.0 → eval_toolkit-1.7.0}/tests/test_splits_props.py +0 -0
  189. {eval_toolkit-1.5.0 → eval_toolkit-1.7.0}/tests/test_stacking.py +0 -0
  190. {eval_toolkit-1.5.0 → eval_toolkit-1.7.0}/tests/test_sweep.py +0 -0
  191. {eval_toolkit-1.5.0 → eval_toolkit-1.7.0}/tests/test_text_dedup.py +0 -0
  192. {eval_toolkit-1.5.0 → eval_toolkit-1.7.0}/tests/test_text_dedup_coverage.py +0 -0
  193. {eval_toolkit-1.5.0 → eval_toolkit-1.7.0}/tests/test_text_dedup_props.py +0 -0
  194. {eval_toolkit-1.5.0 → eval_toolkit-1.7.0}/tests/test_text_dedup_strategies.py +0 -0
  195. {eval_toolkit-1.5.0 → eval_toolkit-1.7.0}/tests/test_thresholds.py +0 -0
  196. {eval_toolkit-1.5.0 → eval_toolkit-1.7.0}/tests/test_thresholds_constant_score.py +0 -0
  197. {eval_toolkit-1.5.0 → eval_toolkit-1.7.0}/tests/test_thresholds_coverage.py +0 -0
  198. {eval_toolkit-1.5.0 → eval_toolkit-1.7.0}/tests/test_thresholds_props.py +0 -0
  199. {eval_toolkit-1.5.0 → eval_toolkit-1.7.0}/tests/test_thresholds_research_grounded.py +0 -0
  200. {eval_toolkit-1.5.0 → eval_toolkit-1.7.0}/tests/test_tokenization_leakage_check.py +0 -0
  201. {eval_toolkit-1.5.0 → eval_toolkit-1.7.0}/tests/test_v09_contracts.py +0 -0
@@ -5,6 +5,65 @@ All notable changes to this project will be documented in this file.
5
5
  The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
6
6
  and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
7
7
 
8
+ ## [1.7.0] — 2026-06-04 — label-stratified cluster bootstrap (#90, #91)
9
+
10
+ ### Added — `bootstrap.cluster_bootstrap_ci` (label-stratified cluster bootstrap)
11
+
12
+ `eval_toolkit.bootstrap` ADDITIVE per [ADR 0003](docs/source/adr/0003-stability-contract-and-gate3-methodology.md) — backward-compatible. Closes the gap between the row-level (`bootstrap_ci`) and fold-level (`block_bootstrap_on_folds`) resamplers: **the missing middle — resampling clusters of rows.**
13
+
14
+ - **`cluster_bootstrap_ci(y_true, y_score, groups, statistic, *, resample_labels=(0, 1), …)`** — percentile CI for a single-condition metric that resamples whole **clusters** (`groups`) with replacement, so the CI is honest under intra-cluster correlation (prompts sharing one attack payload; a document contributing both a poisoned and a benign row). The resample unit is `(label, group)`: by default positive- and negative-clusters are resampled **separately** (never a single-class draw); `resample_labels=(1,)` resamples only positive clusters with negatives held fixed (the payload-cluster convention). Returns a `BootstrapCI` with `method="cluster_percentile"`.
15
+ - **Parallel + reproducible:** built on `parallel_map` + `spawn_seed_sequences`, so `n_jobs` gives bit-for-bit-identical CIs across worker counts (the v0.34.0 reproducibility contract). `n_jobs=-1` uses all cores.
16
+ - Motivation: the analytic row-level AUROC-difference CI (`delong_roc_variance`) assumes row independence and under-covers on clustered eval data (LODO transfer gaps with payload/document/page clusters). Dogfooded by the consumer portfolio's attack-type / carrier / dialect leave-one-group-out bootstraps (Rule of Three).
17
+ - Exported via `from eval_toolkit import cluster_bootstrap_ci`; `__all__` + `_EXPORTS` updated; doctest + n_jobs-reproducibility tests; mypy-strict clean.
18
+
19
+ ### Fixed — stale `seed=` kwarg in 2 bootstrap benchmarks (#91)
20
+
21
+ `tests/benchmarks/test_kernel_benchmarks.py` passed `seed=` to `bootstrap_ci` / `paired_bootstrap_diff`, but those parameters migrated to `rng=` (SPEC 7) — the two bootstrap benchmark tests `TypeError`'d on the nightly-benchmarks workflow (excluded from PR CI via `-m "not benchmark"`, so it went unnoticed). 2-line `seed=`→`rng=` rename.
22
+
23
+ ## [1.6.0] — 2026-05-29 — Tier-2 `eda` Job-2 + Job-3: shortcut + shift diagnostics (#86, #87)
24
+
25
+ `eval_toolkit.eda.*` ADDITIVE per [ADR 0003](docs/source/adr/0003-stability-contract-and-gate3-methodology.md) — Tier-2, backward-compatible. Completes the EDA-first analytic layer above the v1.5.0 Job-1 integrity gate: **Job-2** lexical shortcut diagnostics (`lexical_association`, #86) and **Job-3** distribution-shift quantification (`distribution_shift`, #87). Both are dogfooded by the consumer portfolio's pre-modeling OOD-wall prediction (V5 + V9).
26
+
27
+ ### Added — Tier-2 `eda.lexical_association` shortcut diagnostics (Job-2: C1 + C2)
28
+
29
+ `eval_toolkit.eda` ADDITIVE per [ADR 0003](docs/source/adr/0003-stability-contract-and-gate3-methodology.md) — Tier-2, torch-free (NumPy + scikit-learn). The analytic layer above the Job-1 integrity gate: *"is the label recoverable from a surface shortcut that will not transfer out-of-distribution?"*
30
+
31
+ - **C1 — `weighted_log_odds` / `class_lexical_association`:** Monroe, Colaresi & Quinn (2008)
32
+ informative-Dirichlet weighted log-odds-ratio z-scores + smoothed PMI per token, with a
33
+ `min_count` rare-token floor (the V5 pitfall). Returns a `LexicalAssociationResult`
34
+ (`top_a` / `top_b` / `to_dict`); tokens ordered by descending z-score.
35
+ - **C2 — `competency_baselines`:** partial-input baselines (length-only, char-n-gram, BoW)
36
+ fit on a train split and scored on a test split → `CompetencyResult` of per-baseline
37
+ average-precision vs the positive-prevalence floor (the *shortcut floor*; Feng, Wallace &
38
+ Boyd-Graber, ACL 2019 caveat documented). Vectorizers fit on train only (no test leakage);
39
+ empty or single-class train/test raises a diagnostic `ValueError`.
40
+ - Exported via `from eval_toolkit.eda import ...`; 100% line+branch coverage; mypy-strict clean.
41
+
42
+ ### Added — Tier-2 `eda.distribution_shift` covariate-shift quantification (Job-3: E1)
43
+
44
+ `eval_toolkit.eda` ADDITIVE per [ADR 0003](docs/source/adr/0003-stability-contract-and-gate3-methodology.md) — Tier-2. Public functions take **feature matrices**, so the module is base-install-safe (NumPy + SciPy + scikit-learn); embed text first with `eval_toolkit.embeddings.make_minilm_embedder` (`[embeddings]` extra) or any vectorizer.
45
+
46
+ - **`proxy_a_distance`:** Ben-David et al. (2006/2010) PAD = `2(1 − 2ε)` from a **linear**
47
+ domain classifier's **k-fold CV** error, with **fixed strong regularization** (small `C`) —
48
+ *not* the high-`C` RBF-SVM-on-`predict_proba` recipe that overfits to `PAD ≈ 2` at small `n`.
49
+ Optional bootstrap CI.
50
+ - **`maximum_mean_discrepancy`:** Gretton et al. (2012) **unbiased** RBF-kernel MMD² U-statistic +
51
+ **median-heuristic bandwidth** (freezable across folds) + **permutation-test** p-value
52
+ (Phipson & Smyth 2010, `(1+count)/(B+1)`, never zero). Optional bootstrap CI.
53
+ - **`knn_purity`:** mean fraction of each point's k nearest neighbours sharing its domain label.
54
+ - **`median_bandwidth`** helper + the **`distribution_shift`** orchestrator (all three) +
55
+ `PadResult` / `MmdResult` / `KnnPurityResult` / `DistributionShiftResult` dataclasses (`to_dict`).
56
+ - Docstrings carry the pre-registered caveats: distance is **necessary-not-sufficient** for OOD
57
+ collapse (fuse with shortcut-exposure); a non-significant MMD p is not "no shift"; cross-dataset
58
+ distances are ordinal-only (covariate vs label-semantics conflation). 100% line+branch coverage.
59
+
60
+ ### Fixed
61
+
62
+ - **Public-API golden `__version__` drift:** the `v1.5.0` release commit bumped
63
+ `_version.py` to `1.5.0` but did not regenerate `tests/golden/public_api/snapshot.json`,
64
+ which still pinned `'1.4.0'` — leaving `test_public_api_drift_guard` red on `main` (and on
65
+ every branch cut from it). Regenerated the golden (the diff is the `__version__` value only).
66
+
8
67
  ## [1.5.0] — 2026-05-29 — Tier-2 `eda` layer (#83) + schema-aware `HFDatasetsLoader` (#85)
9
68
 
10
69
  Tier-2 / `loaders` ADDITIVE per [ADR 0003](docs/source/adr/0003-stability-contract-and-gate3-methodology.md) — backward-compatible.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: eval-toolkit
3
- Version: 1.5.0
3
+ Version: 1.7.0
4
4
  Summary: Reusable evaluation contracts for binary classification: metrics, bootstrap CIs, calibration, artifacts, and evidence gates.
5
5
  Project-URL: Homepage, https://github.com/brandon-behring/eval-toolkit
6
6
  Project-URL: Documentation, https://brandon-behring.github.io/eval-toolkit/
@@ -129,6 +129,7 @@ _EXPORTS: dict[str, str] = {
129
129
  "block_bootstrap_on_folds": "eval_toolkit.bootstrap",
130
130
  "bonferroni_correct": "eval_toolkit.bootstrap",
131
131
  "bootstrap_ci": "eval_toolkit.bootstrap",
132
+ "cluster_bootstrap_ci": "eval_toolkit.bootstrap",
132
133
  "correct_p_values": "eval_toolkit.bootstrap",
133
134
  "cross_validate_metric": "eval_toolkit.bootstrap",
134
135
  "cv_clt_ci": "eval_toolkit.bootstrap",
@@ -2,4 +2,4 @@
2
2
 
3
3
  __all__ = ["__version__"]
4
4
 
5
- __version__ = "1.5.0"
5
+ __version__ = "1.7.0"
@@ -52,6 +52,7 @@ __all__ = [
52
52
  "block_bootstrap_on_folds",
53
53
  "bonferroni_correct",
54
54
  "bootstrap_ci",
55
+ "cluster_bootstrap_ci",
55
56
  "correct_p_values",
56
57
  "cross_validate_metric",
57
58
  "cv_clt_ci",
@@ -1447,6 +1448,205 @@ def block_bootstrap_on_folds(
1447
1448
  )
1448
1449
 
1449
1450
 
1451
def _label_cluster_units(y_true: np.ndarray, groups: np.ndarray) -> dict[int, list[np.ndarray]]:
    """Bucket row indices into ``(label, group)`` resample units.

    For every distinct value in ``y_true`` the rows carrying that label are partitioned by their
    ``groups`` id; each partition becomes one index array. The result maps ``int(label)`` to the
    list of those arrays, ordered by ascending group id, with row indices inside a unit kept in
    their original (ascending) order thanks to the stable sort.

    A group id that occurs under more than one label therefore yields one unit **per label** — a
    mixed-label group (e.g. a document with both a poisoned and a benign variant sharing one id)
    is split into a positive unit and a negative unit that are resampled independently.
    Helper for :func:`cluster_bootstrap_ci`.
    """
    by_label: dict[int, list[np.ndarray]] = {}
    for label in np.unique(y_true):
        rows = np.flatnonzero(y_true == label)
        row_groups = groups[rows]
        # Stable sort so rows of one group become contiguous without reordering within the group.
        perm = np.argsort(row_groups, kind="stable")
        ordered_groups = row_groups[perm]
        # A unit boundary sits wherever two neighbouring (sorted) group ids differ.
        changed = ordered_groups[1:] != ordered_groups[:-1]
        boundaries = np.flatnonzero(changed) + 1
        by_label[int(label)] = np.split(rows[perm], boundaries)
    return by_label
1469
+
1470
+
1471
def _cluster_bootstrap_step(
    seed_seq: np.random.SeedSequence,
    *,
    y_true: np.ndarray,
    y_score: np.ndarray,
    units: dict[int, list[np.ndarray]],
    statistic: MetricFn,
    resample_labels: tuple[int, ...],
) -> float | None:
    """Evaluate ``statistic`` on a single cluster-resampled draw.

    Kept at module level so it can be pickled and shipped to workers by ``parallel_map``.

    A fresh generator is seeded from ``seed_seq``. Walking ``units`` in its insertion order, each
    label listed in ``resample_labels`` has its ``(label, group)`` units drawn with replacement
    (as many draws as it has units); every other label contributes all of its units unchanged.
    The gathered row indices are concatenated and used to index ``y_true`` / ``y_score``.

    Returns
    -------
    float | None
        The statistic on the gathered rows, or ``None`` when the statistic raises ``ValueError``
        or ``RuntimeError`` (a degenerate draw — e.g. only one class present).
    """
    generator = np.random.default_rng(seed_seq)
    gathered: list[np.ndarray] = []
    for label, clusters in units.items():
        if label not in resample_labels:
            # Held-fixed stratum: every row is included exactly once.
            gathered += clusters
            continue
        n_clusters = len(clusters)
        picks = generator.integers(0, n_clusters, size=n_clusters)
        gathered += [clusters[pick] for pick in picks]
    rows = np.concatenate(gathered)
    try:
        value = statistic(y_true[rows], y_score[rows])
        return float(value)
    except (ValueError, RuntimeError):
        return None
1499
+
1500
+
1501
def cluster_bootstrap_ci(
    y_true: np.ndarray,
    y_score: np.ndarray,
    groups: np.ndarray,
    statistic: MetricFn,
    *,
    resample_labels: tuple[int, ...] = (0, 1),
    n_resamples: int = DEFAULT_N_RESAMPLES,
    confidence: float = DEFAULT_CONFIDENCE,
    rng: RNGLike | SeedLike | None = DEFAULT_SEED,
    n_jobs: int = 1,
) -> BootstrapCI:
    r"""Percentile CI from a label-stratified **cluster** (group) bootstrap of one metric.

    Instead of drawing individual rows, every resample draws whole clusters (``groups``) with
    replacement, which keeps the interval honest when rows inside a cluster are correlated
    (several prompts built from one attack payload; a document that contributes both a poisoned
    and a benign row). The unit being resampled is ``(label, group)``. With the default
    ``resample_labels=(0, 1)`` the positive clusters and the negative clusters are resampled
    **separately**, so the per-class cluster split is preserved and a draw is never single-class.
    ``resample_labels=(1,)`` resamples the positive clusters only and keeps every negative row
    fixed (the payload-cluster convention).

    Relation to the sibling resamplers: :func:`bootstrap_ci` resamples **rows** (i.i.d.
    assumption), :func:`block_bootstrap_on_folds` resamples **per-fold scalars**, and this
    function resamples **clusters of rows** — the missing middle for grouped eval data. The
    analytic row-level AUROC-difference CI (:func:`delong_roc_variance`) assumes independent rows
    and so under-covers on clustered data, which motivates this estimator.

    Parameters
    ----------
    y_true : np.ndarray, shape (n,)
        Binary labels in ``{0, 1}``.
    y_score : np.ndarray, shape (n,)
        Scores aligned with ``y_true``.
    groups : np.ndarray, shape (n,)
        Cluster id per row (any sortable dtype — ints or strings).
    statistic : callable ``(y_true, y_score) -> float``
        Metric to bootstrap (e.g. ``roc_auc``). Must be **picklable** when ``n_jobs != 1`` (a
        named top-level function — lambdas / closures are rejected).
    resample_labels : tuple[int, ...], optional
        Label strata whose clusters are resampled (default ``(0, 1)`` — both). Any label not
        listed is held fixed, i.e. all of its rows appear in every draw. Must be non-empty.
    n_resamples, confidence, rng : standard bootstrap params (``rng`` per SPEC 7).
    n_jobs : int, optional
        Parallel workers (default 1 — sequential). ``n_jobs=-1`` uses all cores; ``n_jobs=0`` is
        rejected. Per-resample seeding via :func:`spawn_seed_sequences` makes the CI **bit-for-bit
        identical across ``n_jobs``** for a fixed ``rng``. See :ref:`methodology/parallelism`.

    Returns
    -------
    BootstrapCI
        ``method="cluster_percentile"``; ``point_estimate = statistic(y_true, y_score)`` on the
        full data; ``[alpha/2, 1 - alpha/2]`` percentile CI over the cluster-resampled
        distribution. ``n_resamples`` on the result counts the usable (non-degenerate) draws.

    Raises
    ------
    ValueError
        On shape mismatch, non-1-D input, ``n < 10``, ``confidence`` outside (0, 1), empty
        ``resample_labels``, a ``resample_labels`` entry absent from ``y_true``, ``n_jobs == 0``,
        or > 5% degenerate resamples.
    TypeError
        If ``n_jobs != 1`` and ``statistic`` is not picklable.

    Examples
    --------
    >>> import numpy as np
    >>> from eval_toolkit.metrics import roc_auc
    >>> rng = np.random.default_rng(0)
    >>> groups = np.repeat(np.arange(40), 5)           # 40 clusters of 5 rows
    >>> y = (groups % 2).astype(int)                   # cluster-pure labels
    >>> s = y + rng.normal(0, 0.3, size=y.size)
    >>> ci = cluster_bootstrap_ci(y, s, groups, roc_auc, n_resamples=200, rng=0)
    >>> ci.method
    'cluster_percentile'
    >>> bool(0.0 <= ci.ci_low <= ci.ci_high <= 1.0)
    True

    Notes
    -----
    For a *gap* statistic with a fixed offset (e.g. ``Gx = val_auc − test_auc`` where ``val_auc``
    is held fixed), bootstrap the variable term and shift the bounds:
    ``Gx_low = val_auc − ci.ci_high``, ``Gx_high = val_auc − ci.ci_low``. For a one-sided 95%
    bound, pass ``confidence=0.90`` and read the relevant bound.

    References
    ----------
    .. [1] Efron, B. & Tibshirani, R. "An Introduction to the Bootstrap." Chapman & Hall, 1993.
       (§8 — bootstrapping stratified / clustered data.)
    .. [2] Field, C. A. & Welsh, A. H. "Bootstrapping clustered data." JRSS-B 69(3), 2007.
    """
    labels = np.asarray(y_true)
    scores = np.asarray(y_score)
    cluster_ids = np.asarray(groups)

    # --- input validation (order matters: callers see the first failing check) ---------------
    shapes_agree = labels.shape == scores.shape == cluster_ids.shape
    if not shapes_agree:
        raise ValueError(
            f"shapes mismatch: y_true {labels.shape}, y_score {scores.shape}, "
            f"groups {cluster_ids.shape}"
        )
    if labels.ndim != 1:
        raise ValueError(f"inputs must be 1-D; got shape {labels.shape}")
    n = int(labels.size)
    if n < 10:
        raise ValueError(f"n={n} too small for cluster bootstrap; need ≥ 10")
    # Written as a negated chained comparison so a NaN confidence is rejected too.
    if not (0.0 < confidence < 1.0):
        raise ValueError(f"confidence must be in (0, 1), got {confidence}")
    resample_labels = tuple(int(lab) for lab in resample_labels)
    if len(resample_labels) == 0:
        raise ValueError("resample_labels must be non-empty (nothing would be resampled)")
    present = {int(v) for v in np.unique(labels).tolist()}
    missing = set(resample_labels).difference(present)
    if missing:
        raise ValueError(
            f"resample_labels {sorted(missing)} absent from y_true (present: {sorted(present)})"
        )

    # --- point estimate + resampling ---------------------------------------------------------
    point = float(statistic(labels, scores))
    units = _label_cluster_units(labels, cluster_ids)
    # One independent seed sequence per resample, consumed by the per-draw worker.
    seed_seqs = spawn_seed_sequences(rng, n_resamples)
    draw = functools.partial(
        _cluster_bootstrap_step,
        y_true=labels,
        y_score=scores,
        units=units,
        statistic=statistic,
        resample_labels=resample_labels,
    )
    raw = parallel_map(draw, seed_seqs, n_jobs=n_jobs, description="cluster_bootstrap_ci")

    # ``None`` marks a degenerate draw (the statistic raised inside the worker).
    failures = sum(outcome is None for outcome in raw)
    vals = [outcome for outcome in raw if outcome is not None]
    if failures > 0.05 * n_resamples:
        raise ValueError(
            f"cluster_bootstrap_ci: {failures}/{n_resamples} resamples degenerate "
            "(statistic raised — e.g. single-class draws); refusing to compute CI on > 5% degenerate"
        )
    if not vals:
        raise ValueError("cluster_bootstrap_ci: no usable resamples")

    # --- percentile interval -----------------------------------------------------------------
    draws = np.asarray(vals, dtype=np.float64)
    alpha = 1.0 - confidence
    lower, upper = np.quantile(draws, [alpha / 2.0, 1.0 - alpha / 2.0])
    return BootstrapCI(
        point_estimate=point,
        ci_low=float(lower),
        ci_high=float(upper),
        confidence=confidence,
        n_resamples=len(vals),
        method="cluster_percentile",
    )
1648
+
1649
+
1450
1650
  def cross_validate_metric(
1451
1651
  y_true: np.ndarray,
1452
1652
  y_score: np.ndarray,
@@ -0,0 +1,144 @@
1
+ """``eval_toolkit.eda`` — EDA-first dataset integrity gating (Tier-2 surface).
2
+
3
+ This subpackage is built around the **Job-1 integrity gate** of an EDA-first research
4
+ program: thin, composable, torch-free per-split profiling + dataset-soundness
5
+ gates, built by reusing the v1.4.0 :mod:`eval_toolkit.leakage`,
6
+ :mod:`~eval_toolkit.text_dedup`, :mod:`~eval_toolkit.claims`, and
7
+ :mod:`~eval_toolkit.artifacts` primitives.
8
+
9
+ Stability tier
10
+ --------------
11
+ Public access is ``eval_toolkit.eda.*`` — **Tier-2** per ADR 0003. This layer
12
+ is intentionally evolvable and is **not** part of the v2.0-frozen top-level
13
+ :mod:`eval_toolkit` surface; nothing here is added to the package-root
14
+ ``_EXPORTS`` / ``__all__``. Import explicitly::
15
+
16
+ from eval_toolkit.eda import audit_dataset, DataAudit, SplitSummary
17
+
18
+ Scope
19
+ -----
20
+ - **Job-1 integrity gate** (``data_audit`` + ``obfuscation``): row counts, class
21
+ balance, text-length quantiles, dedup / cross-split leakage, obfuscation
22
+ prevalence.
23
+ - **Job-2 lexical shortcut diagnostics** (``lexical_association``): weighted
24
+ log-odds + PMI (C1) and partial-input / competency baselines (C2) — torch-free
25
+ (NumPy + scikit-learn).
26
+ - **Job-3 distribution shift** (``distribution_shift``): proxy-A-distance, MMD
27
+ (permutation-tested), and kNN purity (E1) — operates on feature matrices, so
28
+ still base-install-safe (NumPy + SciPy + scikit-learn).
29
+
30
+ The shift functions take **feature matrices**, not text — embed first with
31
+ :func:`eval_toolkit.embeddings.make_minilm_embedder` (the optional
32
+ ``[embeddings]`` extra) or any vectorizer. UMAP / 2-D projections stay caller-side.
33
+ """
34
+
35
+ from __future__ import annotations
36
+
37
+ from eval_toolkit.eda.data_audit import (
38
+ DEFAULT_MAX_NEG_POS_RATIO,
39
+ DEFAULT_MIN_NEG_POS_RATIO,
40
+ DEFAULT_PCT_OVER_CONTEXT_THRESHOLD,
41
+ EDA_AUDIT_SCHEMA_VERSION,
42
+ DataAudit,
43
+ SplitSummary,
44
+ Tokenizer,
45
+ audit_dataset,
46
+ class_balance,
47
+ length_quantiles,
48
+ summarize_split,
49
+ )
50
+ from eval_toolkit.eda.distribution_shift import (
51
+ DEFAULT_KNN_K,
52
+ DEFAULT_MMD_PERMUTATIONS,
53
+ DEFAULT_PAD_C,
54
+ DEFAULT_PAD_FOLDS,
55
+ DistributionShiftResult,
56
+ KnnPurityResult,
57
+ MmdResult,
58
+ PadResult,
59
+ distribution_shift,
60
+ knn_purity,
61
+ maximum_mean_discrepancy,
62
+ median_bandwidth,
63
+ proxy_a_distance,
64
+ )
65
+ from eval_toolkit.eda.lexical_association import (
66
+ DEFAULT_CHAR_NGRAM_RANGE,
67
+ DEFAULT_MIN_COUNT,
68
+ DEFAULT_PRIOR_SCALE,
69
+ BaselineScore,
70
+ CompetencyResult,
71
+ LexicalAssociationResult,
72
+ StrTokenizer,
73
+ class_lexical_association,
74
+ competency_baselines,
75
+ default_tokenizer,
76
+ weighted_log_odds,
77
+ )
78
+ from eval_toolkit.eda.obfuscation import (
79
+ BASE64_ENTROPY_THRESHOLD,
80
+ HEX_ENTROPY_THRESHOLD,
81
+ ObfuscationProfile,
82
+ analyze_obfuscation,
83
+ count_invisible_chars,
84
+ has_high_entropy_alnum_run,
85
+ has_rot13_marker,
86
+ is_leeted_token,
87
+ leetspeak_counts,
88
+ nfkc_changed,
89
+ nfkc_char_delta,
90
+ shannon_entropy,
91
+ )
92
+
93
# Explicit Tier-2 public surface of ``eval_toolkit.eda``.
#
# Every name listed here is re-exported from one of the sibling modules
# imported above (``data_audit``, ``distribution_shift``,
# ``lexical_association``, ``obfuscation``). Entries are grouped by kind and
# kept alphabetical *within* each group so additions produce minimal diffs and
# omissions are easy to spot.
__all__ = [
    # --- constants ---
    "BASE64_ENTROPY_THRESHOLD",
    "DEFAULT_CHAR_NGRAM_RANGE",
    "DEFAULT_KNN_K",
    "DEFAULT_MAX_NEG_POS_RATIO",
    "DEFAULT_MIN_COUNT",
    "DEFAULT_MIN_NEG_POS_RATIO",
    "DEFAULT_MMD_PERMUTATIONS",
    "DEFAULT_PAD_C",
    "DEFAULT_PAD_FOLDS",
    "DEFAULT_PCT_OVER_CONTEXT_THRESHOLD",
    "DEFAULT_PRIOR_SCALE",
    "EDA_AUDIT_SCHEMA_VERSION",
    "HEX_ENTROPY_THRESHOLD",
    # --- classes / type aliases ---
    "BaselineScore",
    "CompetencyResult",
    "DataAudit",
    "DistributionShiftResult",
    "KnnPurityResult",
    "LexicalAssociationResult",
    "MmdResult",
    "ObfuscationProfile",
    "PadResult",
    "SplitSummary",
    "StrTokenizer",
    "Tokenizer",
    # --- functions ---
    "analyze_obfuscation",
    "audit_dataset",
    "class_balance",
    "class_lexical_association",
    "competency_baselines",
    "count_invisible_chars",
    "default_tokenizer",
    "distribution_shift",
    "has_high_entropy_alnum_run",
    "has_rot13_marker",
    "is_leeted_token",
    "knn_purity",
    "leetspeak_counts",
    "length_quantiles",
    "maximum_mean_discrepancy",
    "median_bandwidth",
    # "nfkc_changed" sorts before "nfkc_char_delta" ("n" < "r"); this also
    # matches the order used in the ``obfuscation`` import list.
    "nfkc_changed",
    "nfkc_char_delta",
    "proxy_a_distance",
    "shannon_entropy",
    "summarize_split",
    "weighted_log_odds",
]