eval-toolkit 1.6.0__tar.gz → 1.7.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/CHANGELOG.md +15 -0
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/PKG-INFO +1 -1
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/src/eval_toolkit/__init__.py +1 -0
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/src/eval_toolkit/_version.py +1 -1
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/src/eval_toolkit/bootstrap.py +200 -0
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/tests/benchmarks/test_kernel_benchmarks.py +2 -2
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/tests/golden/public_api/snapshot.json +7 -1
- eval_toolkit-1.7.0/tests/test_cluster_bootstrap.py +131 -0
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/.gitignore +0 -0
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/LICENSE +0 -0
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/README.md +0 -0
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/STYLE.md +0 -0
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/docs/archive/README.md +0 -0
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/docs/research/README.md +0 -0
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/docs/research/datasets/README.md +0 -0
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/docs/research/papers/data-integrity/README.md +0 -0
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/docs/research/papers/eval-ecosystem/README.md +0 -0
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/docs/research/papers/inference/README.md +0 -0
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/docs/research/papers/prompt-injection/README.md +0 -0
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/docs/source/adr/README.md +0 -0
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/docs/source/methodology/README.md +0 -0
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/pyproject.toml +0 -0
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/src/eval_toolkit/__main__.py +0 -0
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/src/eval_toolkit/_deprecated.py +0 -0
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/src/eval_toolkit/_narrative.py +0 -0
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/src/eval_toolkit/_parallel.py +0 -0
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/src/eval_toolkit/_rng.py +0 -0
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/src/eval_toolkit/_sweep.py +0 -0
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/src/eval_toolkit/adversarial.py +0 -0
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/src/eval_toolkit/analysis.py +0 -0
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/src/eval_toolkit/artifacts.py +0 -0
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/src/eval_toolkit/audit_citation_alignment.py +0 -0
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/src/eval_toolkit/audit_sister_doc_concept_drift.py +0 -0
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/src/eval_toolkit/audit_value_bindings.py +0 -0
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/src/eval_toolkit/calibration.py +0 -0
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/src/eval_toolkit/claims.py +0 -0
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/src/eval_toolkit/config.py +0 -0
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/src/eval_toolkit/docs.py +0 -0
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/src/eval_toolkit/eda/__init__.py +0 -0
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/src/eval_toolkit/eda/data_audit.py +0 -0
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/src/eval_toolkit/eda/distribution_shift.py +0 -0
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/src/eval_toolkit/eda/lexical_association.py +0 -0
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/src/eval_toolkit/eda/obfuscation.py +0 -0
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/src/eval_toolkit/embeddings.py +0 -0
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/src/eval_toolkit/evidence.py +0 -0
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/src/eval_toolkit/harness.py +0 -0
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/src/eval_toolkit/leakage.py +0 -0
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/src/eval_toolkit/loaders.py +0 -0
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/src/eval_toolkit/losses.py +0 -0
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/src/eval_toolkit/manifest.py +0 -0
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/src/eval_toolkit/metric_specs.py +0 -0
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/src/eval_toolkit/metrics.py +0 -0
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/src/eval_toolkit/operating_points.py +0 -0
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/src/eval_toolkit/paths.py +0 -0
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/src/eval_toolkit/plotting.py +0 -0
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/src/eval_toolkit/preprocessing.py +0 -0
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/src/eval_toolkit/probes.py +0 -0
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/src/eval_toolkit/protocols.py +0 -0
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/src/eval_toolkit/provenance.py +0 -0
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/src/eval_toolkit/py.typed +0 -0
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/src/eval_toolkit/schemas/manifest.v1.json +0 -0
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/src/eval_toolkit/schemas/manifest.v2.json +0 -0
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/src/eval_toolkit/schemas/manifest.v3.json +0 -0
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/src/eval_toolkit/schemas/ood_manifest.v1.json +0 -0
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/src/eval_toolkit/schemas/results.v1.json +0 -0
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/src/eval_toolkit/schemas/results_full.v1.json +0 -0
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/src/eval_toolkit/scorecards.py +0 -0
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/src/eval_toolkit/seeds.py +0 -0
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/src/eval_toolkit/splits.py +0 -0
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/src/eval_toolkit/stacking.py +0 -0
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/src/eval_toolkit/text_dedup.py +0 -0
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/src/eval_toolkit/thresholds.py +0 -0
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/tests/baseline/test_plotting_visual/plot_bootstrap_distribution.png +0 -0
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/tests/baseline/test_plotting_visual/plot_confusion_matrix_grid.png +0 -0
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/tests/baseline/test_plotting_visual/plot_lift_ci.png +0 -0
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/tests/baseline/test_plotting_visual/plot_metric_bars.png +0 -0
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/tests/baseline/test_plotting_visual/plot_pareto_frontier.png +0 -0
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/tests/baseline/test_plotting_visual/plot_pr_curve.png +0 -0
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/tests/baseline/test_plotting_visual/plot_reliability_diagram.png +0 -0
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/tests/baseline/test_plotting_visual/plot_roc_curve.png +0 -0
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/tests/baseline/test_plotting_visual/plot_score_histograms.png +0 -0
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/tests/baseline/test_plotting_visual/plot_slice_metric_heatmap.png +0 -0
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/tests/benchmarks/__init__.py +0 -0
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/tests/conftest.py +0 -0
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/tests/golden/bootstrap_ci/cases.json +0 -0
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/tests/golden/data/dedup_holdout.jsonl +0 -0
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/tests/golden/data/dedup_holdout_expected.json +0 -0
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/tests/golden/data/dedup_holdout_provenance.md +0 -0
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/tests/golden/docs/expected.md +0 -0
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/tests/golden/docs/input.md +0 -0
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/tests/golden/docs/metrics.json +0 -0
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/tests/golden/test_dedup_holdout_calibration.py +0 -0
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/tests/strategies.py +0 -0
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/tests/test_adversarial.py +0 -0
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/tests/test_analysis.py +0 -0
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/tests/test_artifacts.py +0 -0
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/tests/test_audit_citation_alignment.py +0 -0
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/tests/test_audit_sister_doc_concept_drift.py +0 -0
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/tests/test_audit_value_bindings.py +0 -0
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/tests/test_block_bootstrap_on_folds.py +0 -0
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/tests/test_bootstrap_calibration_mc.py +0 -0
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/tests/test_bootstrap_edge_cases.py +0 -0
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/tests/test_bootstrap_golden.py +0 -0
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/tests/test_bootstrap_njobs.py +0 -0
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/tests/test_bootstrap_props.py +0 -0
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/tests/test_bootstrap_research_grounded.py +0 -0
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/tests/test_bootstrap_unit.py +0 -0
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/tests/test_calibration_binary_adapters.py +0 -0
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/tests/test_calibration_bootstrap_chain.py +0 -0
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/tests/test_calibration_determinism.py +0 -0
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/tests/test_calibration_optimization_failures.py +0 -0
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/tests/test_calibration_props.py +0 -0
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/tests/test_calibration_research_grounded.py +0 -0
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/tests/test_calibration_unit.py +0 -0
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/tests/test_claims.py +0 -0
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/tests/test_claims_coverage.py +0 -0
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/tests/test_claims_props.py +0 -0
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/tests/test_cli.py +0 -0
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/tests/test_config.py +0 -0
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/tests/test_coverage_bootstrap.py +0 -0
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/tests/test_coverage_calibration.py +0 -0
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/tests/test_coverage_harness.py +0 -0
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/tests/test_coverage_metrics.py +0 -0
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/tests/test_coverage_plotting.py +0 -0
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/tests/test_croissant_e2e.py +0 -0
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/tests/test_dedup_split_leakage_chain.py +0 -0
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/tests/test_deprecated_scalars_shim.py +0 -0
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/tests/test_deprecations.py +0 -0
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/tests/test_docs_golden.py +0 -0
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/tests/test_docs_props.py +0 -0
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/tests/test_eda.py +0 -0
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/tests/test_eda_distribution_shift.py +0 -0
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/tests/test_eda_lexical_association.py +0 -0
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/tests/test_eda_obfuscation.py +0 -0
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/tests/test_embeddings.py +0 -0
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/tests/test_evidence_validators.py +0 -0
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/tests/test_harness_edge_cases.py +0 -0
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/tests/test_harness_fault_injection.py +0 -0
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/tests/test_harness_folded.py +0 -0
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/tests/test_harness_internals.py +0 -0
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/tests/test_harness_metric_options.py +0 -0
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/tests/test_harness_parallelism.py +0 -0
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/tests/test_harness_smoke.py +0 -0
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/tests/test_import_boundaries.py +0 -0
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/tests/test_is_metric_defined_for_slice.py +0 -0
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/tests/test_lazy_extras_messages.py +0 -0
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/tests/test_leakage.py +0 -0
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/tests/test_leakage_error_paths.py +0 -0
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/tests/test_leakage_props.py +0 -0
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/tests/test_loaders.py +0 -0
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/tests/test_loaders_coverage.py +0 -0
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/tests/test_loaders_props.py +0 -0
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/tests/test_logging.py +0 -0
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/tests/test_losses.py +0 -0
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/tests/test_manifest.py +0 -0
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/tests/test_manifest_contamination_round_trip.py +0 -0
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/tests/test_manifest_props.py +0 -0
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/tests/test_manifest_validation.py +0 -0
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/tests/test_metrics_props.py +0 -0
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/tests/test_metrics_stratified_subsets.py +0 -0
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/tests/test_metrics_unit.py +0 -0
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/tests/test_misc_coverage.py +0 -0
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/tests/test_numeric_edge_cases.py +0 -0
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/tests/test_ood_loader.py +0 -0
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/tests/test_operating_points.py +0 -0
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/tests/test_operating_points_props.py +0 -0
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/tests/test_parallel.py +0 -0
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/tests/test_paths.py +0 -0
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/tests/test_pipeline_e2e.py +0 -0
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/tests/test_plotting_edge.py +0 -0
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/tests/test_plotting_smoke.py +0 -0
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/tests/test_plotting_visual.py +0 -0
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/tests/test_preprocessing.py +0 -0
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/tests/test_probes.py +0 -0
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/tests/test_protocol_conformance.py +0 -0
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/tests/test_provenance.py +0 -0
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/tests/test_public_api.py +0 -0
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/tests/test_recall_at_fpr.py +0 -0
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/tests/test_reference_equivalence.py +0 -0
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/tests/test_reproducibility_integration.py +0 -0
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/tests/test_rng.py +0 -0
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/tests/test_schemas.py +0 -0
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/tests/test_scorecard.py +0 -0
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/tests/test_seeds.py +0 -0
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/tests/test_splits.py +0 -0
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/tests/test_splits_leakage_integration.py +0 -0
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/tests/test_splits_props.py +0 -0
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/tests/test_stacking.py +0 -0
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/tests/test_sweep.py +0 -0
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/tests/test_text_dedup.py +0 -0
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/tests/test_text_dedup_coverage.py +0 -0
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/tests/test_text_dedup_props.py +0 -0
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/tests/test_text_dedup_strategies.py +0 -0
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/tests/test_thresholds.py +0 -0
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/tests/test_thresholds_constant_score.py +0 -0
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/tests/test_thresholds_coverage.py +0 -0
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/tests/test_thresholds_props.py +0 -0
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/tests/test_thresholds_research_grounded.py +0 -0
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/tests/test_tokenization_leakage_check.py +0 -0
- {eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/tests/test_v09_contracts.py +0 -0
|
@@ -5,6 +5,21 @@ All notable changes to this project will be documented in this file.
|
|
|
5
5
|
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
|
|
6
6
|
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
|
7
7
|
|
|
8
|
+
## [1.7.0] — 2026-06-04 — label-stratified cluster bootstrap (#90, #91)
|
|
9
|
+
|
|
10
|
+
### Added — `bootstrap.cluster_bootstrap_ci` (label-stratified cluster bootstrap)
|
|
11
|
+
|
|
12
|
+
`eval_toolkit.bootstrap` ADDITIVE per [ADR 0003](docs/source/adr/0003-stability-contract-and-gate3-methodology.md) — backward-compatible. Closes the gap between the row-level (`bootstrap_ci`) and fold-level (`block_bootstrap_on_folds`) resamplers: **the missing middle — resampling clusters of rows.**
|
|
13
|
+
|
|
14
|
+
- **`cluster_bootstrap_ci(y_true, y_score, groups, statistic, *, resample_labels=(0, 1), …)`** — percentile CI for a single-condition metric that resamples whole **clusters** (`groups`) with replacement, so the CI is honest under intra-cluster correlation (prompts sharing one attack payload; a document contributing both a poisoned and a benign row). The resample unit is `(label, group)`: by default positive- and negative-clusters are resampled **separately** (never a single-class draw); `resample_labels=(1,)` resamples only positive clusters with negatives held fixed (the payload-cluster convention). Returns a `BootstrapCI` with `method="cluster_percentile"`.
|
|
15
|
+
- **Parallel + reproducible:** built on `parallel_map` + `spawn_seed_sequences`, so `n_jobs` gives bit-for-bit-identical CIs across worker counts (the v0.34.0 reproducibility contract). `n_jobs=-1` uses all cores.
|
|
16
|
+
- Motivation: the analytic row-level AUROC-difference CI (`delong_roc_variance`) assumes row independence and under-covers on clustered eval data (LODO transfer gaps with payload/document/page clusters). Dogfooded by the consumer portfolio's attack-type / carrier / dialect leave-one-group-out bootstraps (Rule of Three).
|
|
17
|
+
- Exported via `from eval_toolkit import cluster_bootstrap_ci`; `__all__` + `_EXPORTS` updated; doctest + n_jobs-reproducibility tests; mypy-strict clean.
|
|
18
|
+
|
|
19
|
+
### Fixed — stale `seed=` kwarg in 2 bootstrap benchmarks (#91)
|
|
20
|
+
|
|
21
|
+
`tests/benchmarks/test_kernel_benchmarks.py` passed `seed=` to `bootstrap_ci` / `paired_bootstrap_diff`, but those parameters migrated to `rng=` (SPEC 7) — the two bootstrap benchmark tests `TypeError`'d on the nightly-benchmarks workflow (excluded from PR CI via `-m "not benchmark"`, so it went unnoticed). 2-line `seed=`→`rng=` rename.
|
|
22
|
+
|
|
8
23
|
## [1.6.0] — 2026-05-29 — Tier-2 `eda` Job-2 + Job-3: shortcut + shift diagnostics (#86, #87)
|
|
9
24
|
|
|
10
25
|
`eval_toolkit.eda.*` ADDITIVE per [ADR 0003](docs/source/adr/0003-stability-contract-and-gate3-methodology.md) — Tier-2, backward-compatible. Completes the EDA-first analytic layer above the v1.5.0 Job-1 integrity gate: **Job-2** lexical shortcut diagnostics (`lexical_association`, #86) and **Job-3** distribution-shift quantification (`distribution_shift`, #87). Both are dogfooded by the consumer portfolio's pre-modeling OOD-wall prediction (V5 + V9).
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: eval-toolkit
|
|
3
|
-
Version: 1.6.0
|
|
3
|
+
Version: 1.7.0
|
|
4
4
|
Summary: Reusable evaluation contracts for binary classification: metrics, bootstrap CIs, calibration, artifacts, and evidence gates.
|
|
5
5
|
Project-URL: Homepage, https://github.com/brandon-behring/eval-toolkit
|
|
6
6
|
Project-URL: Documentation, https://brandon-behring.github.io/eval-toolkit/
|
|
@@ -129,6 +129,7 @@ _EXPORTS: dict[str, str] = {
|
|
|
129
129
|
"block_bootstrap_on_folds": "eval_toolkit.bootstrap",
|
|
130
130
|
"bonferroni_correct": "eval_toolkit.bootstrap",
|
|
131
131
|
"bootstrap_ci": "eval_toolkit.bootstrap",
|
|
132
|
+
"cluster_bootstrap_ci": "eval_toolkit.bootstrap",
|
|
132
133
|
"correct_p_values": "eval_toolkit.bootstrap",
|
|
133
134
|
"cross_validate_metric": "eval_toolkit.bootstrap",
|
|
134
135
|
"cv_clt_ci": "eval_toolkit.bootstrap",
|
|
@@ -52,6 +52,7 @@ __all__ = [
|
|
|
52
52
|
"block_bootstrap_on_folds",
|
|
53
53
|
"bonferroni_correct",
|
|
54
54
|
"bootstrap_ci",
|
|
55
|
+
"cluster_bootstrap_ci",
|
|
55
56
|
"correct_p_values",
|
|
56
57
|
"cross_validate_metric",
|
|
57
58
|
"cv_clt_ci",
|
|
@@ -1447,6 +1448,205 @@ def block_bootstrap_on_folds(
|
|
|
1447
1448
|
)
|
|
1448
1449
|
|
|
1449
1450
|
|
|
1451
|
+
def _label_cluster_units(y_true: np.ndarray, groups: np.ndarray) -> dict[int, list[np.ndarray]]:
    """Index rows by ``(label, group)``: per label, a list of per-group row-index arrays.

    A group that appears under both labels contributes a **separate** index array to each label's
    list — the resample unit is ``(label, group)``, so a mixed-label group (e.g. a document with
    both a poisoned and a benign variant sharing one id) splits into one positive unit and one
    negative unit, resampled independently. Helper for :func:`cluster_bootstrap_ci`.

    Parameters
    ----------
    y_true : np.ndarray
        Label per row. Assumed 1-D (the public caller validates this). Each distinct value
        becomes one key of the result, cast with ``int``.
    groups : np.ndarray
        Cluster id per row, aligned with ``y_true``; must support sorting and ``!=``.

    Returns
    -------
    dict[int, list[np.ndarray]]
        ``{label: [row_indices_unit_0, row_indices_unit_1, ...]}``. Within a label the units
        follow sorted group-id order, and the row indices inside a unit are ascending because
        the sort is stable.

    Notes
    -----
    Unit boundaries are found with ``!=`` on neighbouring sorted ids, so float ``NaN`` ids never
    compare equal and each ``NaN`` row ends up as its own unit.
    """
    units: dict[int, list[np.ndarray]] = {}
    for lab in np.unique(y_true):
        lab_rows = np.flatnonzero(y_true == lab)
        # Gather this label's group ids once and reuse them for the sort and the boundary scan.
        lab_groups = groups[lab_rows]
        order = np.argsort(lab_groups, kind="stable")
        sorted_rows = lab_rows[order]
        sorted_groups = lab_groups[order]
        # Split at the boundaries between consecutive distinct group ids (post-sort).
        cut = np.flatnonzero(sorted_groups[1:] != sorted_groups[:-1]) + 1
        units[int(lab)] = np.split(sorted_rows, cut)
    return units
|
|
1469
|
+
|
|
1470
|
+
|
|
1471
|
+
def _cluster_bootstrap_step(
    seed_seq: np.random.SeedSequence,
    *,
    y_true: np.ndarray,
    y_score: np.ndarray,
    units: dict[int, list[np.ndarray]],
    statistic: MetricFn,
    resample_labels: tuple[int, ...],
) -> float | None:
    """Evaluate ``statistic`` on a single cluster-level resample.

    Defined at module level so that ``parallel_map`` can pickle it.

    Every label in ``units`` is visited in dict order. A label listed in ``resample_labels`` has
    its ``(label, group)`` units drawn with replacement, as many draws as it has units; any other
    label is held fixed and contributes each of its units exactly once.

    Parameters
    ----------
    seed_seq : np.random.SeedSequence
        Seeds the generator used for this draw only.
    y_true, y_score : np.ndarray
        Full label / score arrays; indexed with the gathered row indices.
    units : dict[int, list[np.ndarray]]
        Per-label list of row-index arrays, one array per ``(label, group)`` unit.
    statistic : callable ``(y_true, y_score) -> float``
        Metric evaluated on the resampled rows.
    resample_labels : tuple[int, ...]
        Labels whose units are resampled.

    Returns
    -------
    float | None
        The statistic on the gathered rows, or ``None`` for a degenerate draw, i.e. when
        ``statistic`` raises ``ValueError`` or ``RuntimeError`` (e.g. a single-class draw).
    """
    generator = np.random.default_rng(seed_seq)
    gathered: list[np.ndarray] = []
    for label, label_units in units.items():
        if label not in resample_labels:
            # Held-fixed stratum: keep every unit once, in its stored order.
            gathered.extend(label_units)
            continue
        n_units = len(label_units)
        picks = generator.integers(0, n_units, size=n_units)
        for pick in picks:
            gathered.append(label_units[pick])
    row_idx = np.concatenate(gathered)
    try:
        value = statistic(y_true[row_idx], y_score[row_idx])
        return float(value)
    except (ValueError, RuntimeError):
        return None
|
|
1499
|
+
|
|
1500
|
+
|
|
1501
|
+
def cluster_bootstrap_ci(
    y_true: np.ndarray,
    y_score: np.ndarray,
    groups: np.ndarray,
    statistic: MetricFn,
    *,
    resample_labels: tuple[int, ...] = (0, 1),
    n_resamples: int = DEFAULT_N_RESAMPLES,
    confidence: float = DEFAULT_CONFIDENCE,
    rng: RNGLike | SeedLike | None = DEFAULT_SEED,
    n_jobs: int = 1,
) -> BootstrapCI:
    r"""Label-stratified **cluster** (group) bootstrap percentile CI for a single-condition metric.

    The unit drawn with replacement is a whole cluster (``groups``), not an individual row, which
    keeps the interval honest when rows inside a cluster are correlated — several prompts built
    on one attack payload, or one document that supplies both a poisoned and a benign row.
    Concretely the unit is ``(label, group)``: with the default ``resample_labels=(0, 1)`` the
    positive clusters and the negative clusters are resampled **separately**, so the per-class
    cluster split is preserved and a draw is never single-class. ``resample_labels=(1,)``
    resamples the positive clusters only and keeps every negative row fixed (the payload-cluster
    convention).

    This sits between :func:`bootstrap_ci`, which resamples **rows** under an i.i.d. assumption,
    and :func:`block_bootstrap_on_folds`, which resamples **per-fold scalars**: here **clusters
    of rows** are resampled. It exists because the analytic row-level AUROC-difference CI
    (:func:`delong_roc_variance`) assumes independent rows and therefore under-covers on
    clustered data.

    Parameters
    ----------
    y_true : np.ndarray, shape (n,)
        Binary labels in ``{0, 1}``.
    y_score : np.ndarray, shape (n,)
        Scores aligned with ``y_true``.
    groups : np.ndarray, shape (n,)
        Cluster id per row (any sortable dtype — ints or strings).
    statistic : callable ``(y_true, y_score) -> float``
        Metric to bootstrap (e.g. ``roc_auc``). Must be **picklable** when ``n_jobs != 1`` (a named
        top-level function — lambdas / closures are rejected).
    resample_labels : tuple[int, ...], optional
        Label strata whose clusters are resampled (default ``(0, 1)`` — both). Any label not
        listed is held fixed, i.e. all of its rows appear in every draw. Must be non-empty.
    n_resamples, confidence, rng : standard bootstrap params (``rng`` per SPEC 7).
    n_jobs : int, optional
        Parallel workers (default 1 — sequential). ``n_jobs=-1`` uses all cores; ``n_jobs=0`` is
        rejected. Per-resample seeding via :func:`spawn_seed_sequences` makes the CI **bit-for-bit
        identical across ``n_jobs``** for a fixed ``rng``. See :ref:`methodology/parallelism`.

    Returns
    -------
    BootstrapCI
        ``method="cluster_percentile"``; ``point_estimate = statistic(y_true, y_score)`` on the full
        data; ``[alpha/2, 1 - alpha/2]`` percentile CI over the cluster-resampled distribution.
        ``n_resamples`` on the result counts only the non-degenerate draws.

    Raises
    ------
    ValueError
        On shape mismatch, non-1-D input, ``n < 10``, ``confidence`` outside (0, 1), empty
        ``resample_labels``, a ``resample_labels`` entry absent from ``y_true``, ``n_jobs == 0``, or
        > 5% degenerate resamples.
    TypeError
        If ``n_jobs != 1`` and ``statistic`` is not picklable.

    Examples
    --------
    >>> import numpy as np
    >>> from eval_toolkit.metrics import roc_auc
    >>> rng = np.random.default_rng(0)
    >>> groups = np.repeat(np.arange(40), 5)          # 40 clusters of 5 rows
    >>> y = (groups % 2).astype(int)                   # cluster-pure labels
    >>> s = y + rng.normal(0, 0.3, size=y.size)
    >>> ci = cluster_bootstrap_ci(y, s, groups, roc_auc, n_resamples=200, rng=0)
    >>> ci.method
    'cluster_percentile'
    >>> bool(0.0 <= ci.ci_low <= ci.ci_high <= 1.0)
    True

    Notes
    -----
    For a *gap* statistic with a fixed offset (e.g. ``Gx = val_auc − test_auc`` with ``val_auc``
    held fixed), bootstrap the variable term and shift the bounds: ``Gx_low = val_auc − ci.ci_high``,
    ``Gx_high = val_auc − ci.ci_low``. For a one-sided 95% bound, pass ``confidence=0.90`` and read
    the relevant bound.

    References
    ----------
    .. [1] Efron, B. & Tibshirani, R. "An Introduction to the Bootstrap." Chapman & Hall, 1993.
       (§8 — bootstrapping stratified / clustered data.)
    .. [2] Field, C. A. & Welsh, A. H. "Bootstrapping clustered data." JRSS-B 69(3), 2007.
    """
    labels = np.asarray(y_true)
    scores = np.asarray(y_score)
    cluster_ids = np.asarray(groups)

    # --- Input validation (order fixes which error wins when several apply) ---
    if labels.shape != scores.shape or scores.shape != cluster_ids.shape:
        raise ValueError(
            f"shapes mismatch: y_true {labels.shape}, y_score {scores.shape}, "
            f"groups {cluster_ids.shape}"
        )
    if labels.ndim != 1:
        raise ValueError(f"inputs must be 1-D; got shape {labels.shape}")
    n = int(labels.size)
    if n < 10:
        raise ValueError(f"n={n} too small for cluster bootstrap; need ≥ 10")
    # Written as a negated chained comparison so that a NaN confidence is rejected as well.
    if not 0.0 < confidence < 1.0:
        raise ValueError(f"confidence must be in (0, 1), got {confidence}")
    resample_labels = tuple(int(x) for x in resample_labels)
    if len(resample_labels) == 0:
        raise ValueError("resample_labels must be non-empty (nothing would be resampled)")
    present = {int(v) for v in np.unique(labels).tolist()}
    missing = set(resample_labels).difference(present)
    if missing:
        raise ValueError(
            f"resample_labels {sorted(missing)} absent from y_true (present: {sorted(present)})"
        )

    # --- Point estimate on the full data, then one seeded draw per resample ---
    point = float(statistic(labels, scores))
    units = _label_cluster_units(labels, cluster_ids)
    seed_seqs = spawn_seed_sequences(rng, n_resamples)
    step_kwargs = {
        "y_true": labels,
        "y_score": scores,
        "units": units,
        "statistic": statistic,
        "resample_labels": resample_labels,
    }
    step = functools.partial(_cluster_bootstrap_step, **step_kwargs)
    raw = parallel_map(step, seed_seqs, n_jobs=n_jobs, description="cluster_bootstrap_ci")

    # --- Degenerate draws come back as None; tolerate at most 5% of them ---
    failures = sum(1 for outcome in raw if outcome is None)
    vals = [outcome for outcome in raw if outcome is not None]
    if failures > 0.05 * n_resamples:
        raise ValueError(
            f"cluster_bootstrap_ci: {failures}/{n_resamples} resamples degenerate "
            "(statistic raised — e.g. single-class draws); refusing to compute CI on > 5% degenerate"
        )
    if len(vals) == 0:
        raise ValueError("cluster_bootstrap_ci: no usable resamples")

    # --- Two-sided percentile interval over the usable draws ---
    draws = np.asarray(vals, dtype=np.float64)
    alpha = 1.0 - confidence
    lower, upper = np.quantile(draws, [alpha / 2.0, 1.0 - alpha / 2.0])
    return BootstrapCI(
        point_estimate=point,
        ci_low=float(lower),
        ci_high=float(upper),
        confidence=confidence,
        n_resamples=len(vals),
        method="cluster_percentile",
    )
|
|
1648
|
+
|
|
1649
|
+
|
|
1450
1650
|
def cross_validate_metric(
|
|
1451
1651
|
y_true: np.ndarray,
|
|
1452
1652
|
y_score: np.ndarray,
|
|
@@ -136,7 +136,7 @@ def test_benchmark_bootstrap_ci_pr_auc_n200(
|
|
|
136
136
|
y, s = yspc_n200
|
|
137
137
|
|
|
138
138
|
def _run() -> float:
|
|
139
|
-
return bootstrap_ci(y, s, metric=pr_auc, n_resamples=200,
|
|
139
|
+
return bootstrap_ci(y, s, metric=pr_auc, n_resamples=200, rng=42).point_estimate
|
|
140
140
|
|
|
141
141
|
result = benchmark(_run)
|
|
142
142
|
assert 0.0 <= result <= 1.0
|
|
@@ -155,7 +155,7 @@ def test_benchmark_paired_bootstrap_diff_pr_auc_n200(
|
|
|
155
155
|
y, s_a, s_b = y_two_scorers_n200
|
|
156
156
|
|
|
157
157
|
def _run() -> float:
|
|
158
|
-
return paired_bootstrap_diff(y, s_a, s_b, metric=pr_auc, n_resamples=200,
|
|
158
|
+
return paired_bootstrap_diff(y, s_a, s_b, metric=pr_auc, n_resamples=200, rng=42).delta
|
|
159
159
|
|
|
160
160
|
result = benchmark(_run)
|
|
161
161
|
# Delta can be negative (B worse than A) — just verify it's finite + in expected range
|
|
@@ -148,6 +148,7 @@
|
|
|
148
148
|
"bootstrap_metric_from_predictions",
|
|
149
149
|
"brier_decomposition",
|
|
150
150
|
"capture_git_sha",
|
|
151
|
+
"cluster_bootstrap_ci",
|
|
151
152
|
"compute_file_hash",
|
|
152
153
|
"compute_label_overlap",
|
|
153
154
|
"correct_p_values",
|
|
@@ -1429,7 +1430,7 @@
|
|
|
1429
1430
|
"doc_first_line": "str(object='') -> str",
|
|
1430
1431
|
"kind": "value",
|
|
1431
1432
|
"type": "str",
|
|
1432
|
-
"value": "'1.
|
|
1433
|
+
"value": "'1.7.0'"
|
|
1433
1434
|
},
|
|
1434
1435
|
"apply_operating_points": {
|
|
1435
1436
|
"doc_first_line": "Apply fitted thresholds to a mixed-class or single-class target slice.",
|
|
@@ -1476,6 +1477,11 @@
|
|
|
1476
1477
|
"kind": "function",
|
|
1477
1478
|
"signature": "(repo_root: 'Path | str | None' = None) -> 'str | None'"
|
|
1478
1479
|
},
|
|
1480
|
+
"cluster_bootstrap_ci": {
|
|
1481
|
+
"doc_first_line": "Label-stratified **cluster** (group) bootstrap percentile CI for a single-condition metric.",
|
|
1482
|
+
"kind": "function",
|
|
1483
|
+
"signature": "(y_true: 'np.ndarray', y_score: 'np.ndarray', groups: 'np.ndarray', statistic: 'MetricFn', *, resample_labels: 'tuple[int, ...]' = (0, 1), n_resamples: 'int' = 1000, confidence: 'float' = 0.95, rng: 'RNGLike | SeedLike | None' = 42, n_jobs: 'int' = 1) -> 'BootstrapCI'"
|
|
1484
|
+
},
|
|
1479
1485
|
"compute_file_hash": {
|
|
1480
1486
|
"doc_first_line": "SHA-256 hex digest of an existing file (sentinel-typed).",
|
|
1481
1487
|
"kind": "function",
|
|
@@ -0,0 +1,131 @@
|
|
|
1
|
+
"""Tests for :func:`eval_toolkit.bootstrap.cluster_bootstrap_ci`.
|
|
2
|
+
|
|
3
|
+
Covers the public contract (percentile CI, method tag, validation), the ``(label, group)``
|
|
4
|
+
resample-unit semantics (mixed-label groups split by label), the ``resample_labels`` knob, the
|
|
5
|
+
v0.34.0 n_jobs-reproducibility contract (same seed → identical CI across worker counts), and the
|
|
6
|
+
methodological reason the function exists: under strong intra-cluster correlation the cluster
|
|
7
|
+
bootstrap CI is **wider** than a naive row bootstrap (which would under-cover).
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
import numpy as np
|
|
13
|
+
import pytest
|
|
14
|
+
|
|
15
|
+
from eval_toolkit.bootstrap import (
|
|
16
|
+
BootstrapCI,
|
|
17
|
+
_label_cluster_units,
|
|
18
|
+
bootstrap_ci,
|
|
19
|
+
cluster_bootstrap_ci,
|
|
20
|
+
)
|
|
21
|
+
from eval_toolkit.metrics import roc_auc
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def _clustered_inputs(
|
|
25
|
+
n_clusters: int = 40, rows_per_cluster: int = 5, *, seed: int = 0
|
|
26
|
+
) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
|
|
27
|
+
"""Cluster-pure labels (cluster parity) + a separable score with cluster-level noise."""
|
|
28
|
+
rng = np.random.default_rng(seed)
|
|
29
|
+
groups = np.repeat(np.arange(n_clusters), rows_per_cluster)
|
|
30
|
+
y = (groups % 2).astype(int)
|
|
31
|
+
cluster_offset = rng.normal(0, 0.5, size=n_clusters)[groups]
|
|
32
|
+
s = y + cluster_offset + rng.normal(0, 0.2, size=y.size)
|
|
33
|
+
return y, s, groups
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
@pytest.mark.unit
|
|
37
|
+
def test_returns_bootstrap_ci_with_cluster_method() -> None:
|
|
38
|
+
"""Basic contract: BootstrapCI, cluster_percentile method, ordered CI, point in range."""
|
|
39
|
+
y, s, g = _clustered_inputs()
|
|
40
|
+
ci = cluster_bootstrap_ci(y, s, g, roc_auc, n_resamples=300, rng=0)
|
|
41
|
+
assert isinstance(ci, BootstrapCI)
|
|
42
|
+
assert ci.method == "cluster_percentile"
|
|
43
|
+
assert ci.n_resamples == 300
|
|
44
|
+
assert 0.0 <= ci.ci_low <= ci.ci_high <= 1.0
|
|
45
|
+
assert ci.ci_low <= ci.point_estimate <= ci.ci_high
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
@pytest.mark.unit
|
|
49
|
+
def test_label_cluster_units_splits_mixed_label_group() -> None:
|
|
50
|
+
"""A group id present under both labels splits into one positive unit and one negative unit."""
|
|
51
|
+
y = np.array([1, 1, 0, 0, 1])
|
|
52
|
+
groups = np.array(["g0", "g0", "g0", "g1", "g2"]) # g0 is mixed-label
|
|
53
|
+
units = _label_cluster_units(y, groups)
|
|
54
|
+
assert set(np.concatenate(units[1]).tolist()) == {0, 1, 4} # positives: g0 + g2
|
|
55
|
+
assert set(np.concatenate(units[0]).tolist()) == {2, 3} # negatives: g0 + g1
|
|
56
|
+
assert len(units[1]) == 2 and len(units[0]) == 2 # g0 counted on both sides
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
@pytest.mark.unit
|
|
60
|
+
def test_resample_labels_positive_only_holds_negatives_fixed() -> None:
|
|
61
|
+
"""resample_labels=(1,) resamples positive clusters and keeps all negatives fixed."""
|
|
62
|
+
y, s, g = _clustered_inputs()
|
|
63
|
+
ci_pos = cluster_bootstrap_ci(y, s, g, roc_auc, resample_labels=(1,), n_resamples=300, rng=0)
|
|
64
|
+
ci_both = cluster_bootstrap_ci(y, s, g, roc_auc, resample_labels=(0, 1), n_resamples=300, rng=0)
|
|
65
|
+
# Both valid CIs; holding negatives fixed removes one variance source → no wider than both-sides.
|
|
66
|
+
assert ci_pos.ci_low <= ci_pos.ci_high
|
|
67
|
+
assert (ci_pos.ci_high - ci_pos.ci_low) <= (ci_both.ci_high - ci_both.ci_low) + 1e-9
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
@pytest.mark.unit
|
|
71
|
+
@pytest.mark.slow
|
|
72
|
+
def test_njobs_reproducibility() -> None:
|
|
73
|
+
"""Same seed produces a bit-for-bit-identical CI regardless of n_jobs (spawn_seed_sequences)."""
|
|
74
|
+
y, s, g = _clustered_inputs()
|
|
75
|
+
r1 = cluster_bootstrap_ci(y, s, g, roc_auc, n_resamples=200, rng=42, n_jobs=1)
|
|
76
|
+
r2 = cluster_bootstrap_ci(y, s, g, roc_auc, n_resamples=200, rng=42, n_jobs=2)
|
|
77
|
+
assert (r1.point_estimate, r1.ci_low, r1.ci_high) == (r2.point_estimate, r2.ci_low, r2.ci_high)
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
@pytest.mark.unit
|
|
81
|
+
@pytest.mark.slow
|
|
82
|
+
def test_njobs_minus_one_runs() -> None:
|
|
83
|
+
"""n_jobs=-1 (all cores) completes without error."""
|
|
84
|
+
y, s, g = _clustered_inputs()
|
|
85
|
+
ci = cluster_bootstrap_ci(y, s, g, roc_auc, n_resamples=100, rng=42, n_jobs=-1)
|
|
86
|
+
assert ci.ci_low <= ci.ci_high
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
@pytest.mark.unit
|
|
90
|
+
def test_cluster_ci_wider_than_row_under_intracluster_correlation() -> None:
|
|
91
|
+
"""The reason the function exists: with few, strongly-correlated clusters the cluster bootstrap
|
|
92
|
+
CI is wider than a naive row bootstrap (which would under-cover by treating rows as i.i.d.)."""
|
|
93
|
+
# 10 clusters × 30 rows; the cluster-level offset dominates → rows within a cluster are highly
|
|
94
|
+
# correlated, so the effective sample size is ~10 clusters, not 300 rows.
|
|
95
|
+
rng = np.random.default_rng(7)
|
|
96
|
+
n_clusters, per = 10, 30
|
|
97
|
+
groups = np.repeat(np.arange(n_clusters), per)
|
|
98
|
+
y = (groups % 2).astype(int)
|
|
99
|
+
s = y + rng.normal(0, 0.8, size=n_clusters)[groups] + rng.normal(0, 0.05, size=y.size)
|
|
100
|
+
cluster = cluster_bootstrap_ci(y, s, groups, roc_auc, n_resamples=400, rng=0)
|
|
101
|
+
row = bootstrap_ci(y, s, roc_auc, n_resamples=400, rng=0, method="percentile")
|
|
102
|
+
cluster_width = cluster.ci_high - cluster.ci_low
|
|
103
|
+
row_width = row.ci_high - row.ci_low
|
|
104
|
+
assert cluster_width > row_width, f"cluster {cluster_width:.3f} !> row {row_width:.3f}"
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
@pytest.mark.unit
|
|
108
|
+
@pytest.mark.parametrize(
|
|
109
|
+
("kwargs", "match"),
|
|
110
|
+
[
|
|
111
|
+
({"resample_labels": ()}, "non-empty"),
|
|
112
|
+
({"resample_labels": (2,)}, "absent from y_true"),
|
|
113
|
+
({"confidence": 1.5}, r"confidence must be in \(0, 1\)"),
|
|
114
|
+
({"n_jobs": 0}, "n_jobs"),
|
|
115
|
+
],
|
|
116
|
+
)
|
|
117
|
+
def test_validation_errors(kwargs: dict[str, object], match: str) -> None:
|
|
118
|
+
"""Invalid parameters raise ValueError with a diagnostic message."""
|
|
119
|
+
y, s, g = _clustered_inputs()
|
|
120
|
+
with pytest.raises(ValueError, match=match):
|
|
121
|
+
cluster_bootstrap_ci(y, s, g, roc_auc, n_resamples=50, rng=0, **kwargs) # type: ignore[arg-type]
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
@pytest.mark.unit
|
|
125
|
+
def test_shape_and_size_validation() -> None:
|
|
126
|
+
"""Shape mismatch and n < 10 raise ValueError."""
|
|
127
|
+
y, s, g = _clustered_inputs(n_clusters=4, rows_per_cluster=3) # n=12
|
|
128
|
+
with pytest.raises(ValueError, match="shapes mismatch"):
|
|
129
|
+
cluster_bootstrap_ci(y, s[:-1], g, roc_auc, n_resamples=50, rng=0)
|
|
130
|
+
with pytest.raises(ValueError, match="too small"):
|
|
131
|
+
cluster_bootstrap_ci(y[:8], s[:8], g[:8], roc_auc, n_resamples=50, rng=0)
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/src/eval_toolkit/audit_sister_doc_concept_drift.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/tests/baseline/test_plotting_visual/plot_lift_ci.png
RENAMED
|
File without changes
|
{eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/tests/baseline/test_plotting_visual/plot_metric_bars.png
RENAMED
|
File without changes
|
|
File without changes
|
{eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/tests/baseline/test_plotting_visual/plot_pr_curve.png
RENAMED
|
File without changes
|
|
File without changes
|
{eval_toolkit-1.6.0 → eval_toolkit-1.7.0}/tests/baseline/test_plotting_visual/plot_roc_curve.png
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|