eval-toolkit 1.7.0__tar.gz → 1.8.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/CHANGELOG.md +11 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/PKG-INFO +1 -1
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/src/eval_toolkit/__init__.py +1 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/src/eval_toolkit/_version.py +1 -1
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/src/eval_toolkit/bootstrap.py +189 -1
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/tests/golden/public_api/snapshot.json +7 -1
- eval_toolkit-1.8.0/tests/test_stratified_cluster_bootstrap.py +153 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/.gitignore +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/LICENSE +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/README.md +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/STYLE.md +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/docs/archive/README.md +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/docs/research/README.md +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/docs/research/datasets/README.md +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/docs/research/papers/data-integrity/README.md +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/docs/research/papers/eval-ecosystem/README.md +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/docs/research/papers/inference/README.md +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/docs/research/papers/prompt-injection/README.md +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/docs/source/adr/README.md +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/docs/source/methodology/README.md +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/pyproject.toml +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/src/eval_toolkit/__main__.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/src/eval_toolkit/_deprecated.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/src/eval_toolkit/_narrative.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/src/eval_toolkit/_parallel.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/src/eval_toolkit/_rng.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/src/eval_toolkit/_sweep.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/src/eval_toolkit/adversarial.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/src/eval_toolkit/analysis.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/src/eval_toolkit/artifacts.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/src/eval_toolkit/audit_citation_alignment.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/src/eval_toolkit/audit_sister_doc_concept_drift.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/src/eval_toolkit/audit_value_bindings.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/src/eval_toolkit/calibration.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/src/eval_toolkit/claims.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/src/eval_toolkit/config.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/src/eval_toolkit/docs.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/src/eval_toolkit/eda/__init__.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/src/eval_toolkit/eda/data_audit.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/src/eval_toolkit/eda/distribution_shift.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/src/eval_toolkit/eda/lexical_association.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/src/eval_toolkit/eda/obfuscation.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/src/eval_toolkit/embeddings.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/src/eval_toolkit/evidence.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/src/eval_toolkit/harness.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/src/eval_toolkit/leakage.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/src/eval_toolkit/loaders.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/src/eval_toolkit/losses.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/src/eval_toolkit/manifest.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/src/eval_toolkit/metric_specs.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/src/eval_toolkit/metrics.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/src/eval_toolkit/operating_points.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/src/eval_toolkit/paths.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/src/eval_toolkit/plotting.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/src/eval_toolkit/preprocessing.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/src/eval_toolkit/probes.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/src/eval_toolkit/protocols.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/src/eval_toolkit/provenance.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/src/eval_toolkit/py.typed +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/src/eval_toolkit/schemas/manifest.v1.json +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/src/eval_toolkit/schemas/manifest.v2.json +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/src/eval_toolkit/schemas/manifest.v3.json +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/src/eval_toolkit/schemas/ood_manifest.v1.json +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/src/eval_toolkit/schemas/results.v1.json +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/src/eval_toolkit/schemas/results_full.v1.json +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/src/eval_toolkit/scorecards.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/src/eval_toolkit/seeds.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/src/eval_toolkit/splits.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/src/eval_toolkit/stacking.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/src/eval_toolkit/text_dedup.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/src/eval_toolkit/thresholds.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/tests/baseline/test_plotting_visual/plot_bootstrap_distribution.png +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/tests/baseline/test_plotting_visual/plot_confusion_matrix_grid.png +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/tests/baseline/test_plotting_visual/plot_lift_ci.png +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/tests/baseline/test_plotting_visual/plot_metric_bars.png +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/tests/baseline/test_plotting_visual/plot_pareto_frontier.png +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/tests/baseline/test_plotting_visual/plot_pr_curve.png +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/tests/baseline/test_plotting_visual/plot_reliability_diagram.png +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/tests/baseline/test_plotting_visual/plot_roc_curve.png +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/tests/baseline/test_plotting_visual/plot_score_histograms.png +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/tests/baseline/test_plotting_visual/plot_slice_metric_heatmap.png +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/tests/benchmarks/__init__.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/tests/benchmarks/test_kernel_benchmarks.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/tests/conftest.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/tests/golden/bootstrap_ci/cases.json +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/tests/golden/data/dedup_holdout.jsonl +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/tests/golden/data/dedup_holdout_expected.json +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/tests/golden/data/dedup_holdout_provenance.md +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/tests/golden/docs/expected.md +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/tests/golden/docs/input.md +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/tests/golden/docs/metrics.json +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/tests/golden/test_dedup_holdout_calibration.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/tests/strategies.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/tests/test_adversarial.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/tests/test_analysis.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/tests/test_artifacts.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/tests/test_audit_citation_alignment.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/tests/test_audit_sister_doc_concept_drift.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/tests/test_audit_value_bindings.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/tests/test_block_bootstrap_on_folds.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/tests/test_bootstrap_calibration_mc.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/tests/test_bootstrap_edge_cases.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/tests/test_bootstrap_golden.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/tests/test_bootstrap_njobs.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/tests/test_bootstrap_props.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/tests/test_bootstrap_research_grounded.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/tests/test_bootstrap_unit.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/tests/test_calibration_binary_adapters.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/tests/test_calibration_bootstrap_chain.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/tests/test_calibration_determinism.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/tests/test_calibration_optimization_failures.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/tests/test_calibration_props.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/tests/test_calibration_research_grounded.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/tests/test_calibration_unit.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/tests/test_claims.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/tests/test_claims_coverage.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/tests/test_claims_props.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/tests/test_cli.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/tests/test_cluster_bootstrap.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/tests/test_config.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/tests/test_coverage_bootstrap.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/tests/test_coverage_calibration.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/tests/test_coverage_harness.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/tests/test_coverage_metrics.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/tests/test_coverage_plotting.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/tests/test_croissant_e2e.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/tests/test_dedup_split_leakage_chain.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/tests/test_deprecated_scalars_shim.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/tests/test_deprecations.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/tests/test_docs_golden.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/tests/test_docs_props.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/tests/test_eda.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/tests/test_eda_distribution_shift.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/tests/test_eda_lexical_association.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/tests/test_eda_obfuscation.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/tests/test_embeddings.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/tests/test_evidence_validators.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/tests/test_harness_edge_cases.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/tests/test_harness_fault_injection.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/tests/test_harness_folded.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/tests/test_harness_internals.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/tests/test_harness_metric_options.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/tests/test_harness_parallelism.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/tests/test_harness_smoke.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/tests/test_import_boundaries.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/tests/test_is_metric_defined_for_slice.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/tests/test_lazy_extras_messages.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/tests/test_leakage.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/tests/test_leakage_error_paths.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/tests/test_leakage_props.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/tests/test_loaders.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/tests/test_loaders_coverage.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/tests/test_loaders_props.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/tests/test_logging.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/tests/test_losses.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/tests/test_manifest.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/tests/test_manifest_contamination_round_trip.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/tests/test_manifest_props.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/tests/test_manifest_validation.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/tests/test_metrics_props.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/tests/test_metrics_stratified_subsets.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/tests/test_metrics_unit.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/tests/test_misc_coverage.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/tests/test_numeric_edge_cases.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/tests/test_ood_loader.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/tests/test_operating_points.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/tests/test_operating_points_props.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/tests/test_parallel.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/tests/test_paths.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/tests/test_pipeline_e2e.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/tests/test_plotting_edge.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/tests/test_plotting_smoke.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/tests/test_plotting_visual.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/tests/test_preprocessing.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/tests/test_probes.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/tests/test_protocol_conformance.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/tests/test_provenance.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/tests/test_public_api.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/tests/test_recall_at_fpr.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/tests/test_reference_equivalence.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/tests/test_reproducibility_integration.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/tests/test_rng.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/tests/test_schemas.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/tests/test_scorecard.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/tests/test_seeds.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/tests/test_splits.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/tests/test_splits_leakage_integration.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/tests/test_splits_props.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/tests/test_stacking.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/tests/test_sweep.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/tests/test_text_dedup.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/tests/test_text_dedup_coverage.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/tests/test_text_dedup_props.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/tests/test_text_dedup_strategies.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/tests/test_thresholds.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/tests/test_thresholds_constant_score.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/tests/test_thresholds_coverage.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/tests/test_thresholds_props.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/tests/test_thresholds_research_grounded.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/tests/test_tokenization_leakage_check.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/tests/test_v09_contracts.py +0 -0
|
@@ -5,6 +5,17 @@ All notable changes to this project will be documented in this file.
|
|
|
5
5
|
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
|
|
6
6
|
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
|
7
7
|
|
|
8
|
+
## [1.8.0] — 2026-06-04 — composite multi-stratum cluster bootstrap (#92)
|
|
9
|
+
|
|
10
|
+
### Added — `bootstrap.stratified_cluster_bootstrap_ci` (composite multi-stratum cluster bootstrap)
|
|
11
|
+
|
|
12
|
+
`eval_toolkit.bootstrap` ADDITIVE per [ADR 0003](docs/source/adr/0003-stability-contract-and-gate3-methodology.md) — backward-compatible. Generalises the v1.7.0 single-block `cluster_bootstrap_ci` to the shape leave-one-group-out transfer gaps actually take: a **composite statistic reduced over several independently-resampled cluster strata**.
|
|
13
|
+
|
|
14
|
+
- **`stratified_cluster_bootstrap_ci(strata, per_stratum_metric, combine, *, resample_labels=(0,1), …)`** — `strata` is a mapping `{key: (y, score, groups)}` of independent resample-units (e.g. `seed`, `(carrier, seed)`, `(attack_type, seed)`); each bootstrap iteration resamples every stratum's `(label, group)` clusters, computes `per_stratum_metric` on each, and reduces the `{key: metric}` map with `combine` to one scalar (a seed-averaged ROC-AUC gap, a mean-over-carriers gap, a top−bottom per-type AUPRC contrast, …). Percentile `BootstrapCI` (`method="stratified_cluster_percentile"`). `cluster_bootstrap_ci` is the single-stratum, identity-reduce special case.
|
|
15
|
+
- **Why:** the v1.7.0 single-block primitive could not express the **seed-averaging** that real LODO estimators do inside the bootstrap (`Gx = val − mean_seed(test_roc)`), so it did not actually fit the consumer portfolio's attack-type / carrier / dialect bootstraps. This is the correct primitive for them.
|
|
16
|
+
- **Parallel + reproducible:** built on `parallel_map` + `spawn_seed_sequences` ⇒ `n_jobs` gives bit-for-bit-identical CIs; `n_jobs=-1` all cores.
|
|
17
|
+
- Exported via `from eval_toolkit import stratified_cluster_bootstrap_ci`; `__all__` + `_EXPORTS` + doctest + n_jobs-reproducibility / seed-averaged / composite-statistic tests; mypy-strict clean.
|
|
18
|
+
|
|
8
19
|
## [1.7.0] — 2026-06-04 — label-stratified cluster bootstrap (#90, #91)
|
|
9
20
|
|
|
10
21
|
### Added — `bootstrap.cluster_bootstrap_ci` (label-stratified cluster bootstrap)
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: eval-toolkit
|
|
3
|
-
Version: 1.7.0
|
|
3
|
+
Version: 1.8.0
|
|
4
4
|
Summary: Reusable evaluation contracts for binary classification: metrics, bootstrap CIs, calibration, artifacts, and evidence gates.
|
|
5
5
|
Project-URL: Homepage, https://github.com/brandon-behring/eval-toolkit
|
|
6
6
|
Project-URL: Documentation, https://brandon-behring.github.io/eval-toolkit/
|
|
@@ -140,6 +140,7 @@ _EXPORTS: dict[str, str] = {
|
|
|
140
140
|
"paired_bootstrap_ece_diff": "eval_toolkit.bootstrap",
|
|
141
141
|
"paired_bootstrap_op_point_diff": "eval_toolkit.bootstrap",
|
|
142
142
|
"paired_mde": "eval_toolkit.bootstrap",
|
|
143
|
+
"stratified_cluster_bootstrap_ci": "eval_toolkit.bootstrap",
|
|
143
144
|
# --- calibration ---
|
|
144
145
|
"DEFAULT_FN_COST": "eval_toolkit.calibration",
|
|
145
146
|
"DEFAULT_FP_COST": "eval_toolkit.calibration",
|
|
@@ -22,7 +22,7 @@ from __future__ import annotations
|
|
|
22
22
|
import functools
|
|
23
23
|
import logging
|
|
24
24
|
import warnings
|
|
25
|
-
from collections.abc import Callable
|
|
25
|
+
from collections.abc import Callable, Hashable, Mapping
|
|
26
26
|
from dataclasses import dataclass
|
|
27
27
|
from typing import Final, Literal
|
|
28
28
|
|
|
@@ -63,6 +63,7 @@ __all__ = [
|
|
|
63
63
|
"paired_bootstrap_ece_diff",
|
|
64
64
|
"paired_bootstrap_op_point_diff",
|
|
65
65
|
"paired_mde",
|
|
66
|
+
"stratified_cluster_bootstrap_ci",
|
|
66
67
|
]
|
|
67
68
|
|
|
68
69
|
DEFAULT_N_RESAMPLES: Final[int] = 1000
|
|
@@ -1647,6 +1648,193 @@ def cluster_bootstrap_ci(
|
|
|
1647
1648
|
)
|
|
1648
1649
|
|
|
1649
1650
|
|
|
1651
|
+
def _stratified_cluster_step(
|
|
1652
|
+
seed_seq: np.random.SeedSequence,
|
|
1653
|
+
*,
|
|
1654
|
+
strata_data: dict[Hashable, tuple[np.ndarray, np.ndarray]],
|
|
1655
|
+
strata_units: dict[Hashable, dict[int, list[np.ndarray]]],
|
|
1656
|
+
per_stratum_metric: MetricFn,
|
|
1657
|
+
combine: Callable[[Mapping[Hashable, float]], float],
|
|
1658
|
+
resample_labels: tuple[int, ...],
|
|
1659
|
+
) -> float | None:
|
|
1660
|
+
"""One stratified-cluster draw of ``combine({key: metric})`` (module-level for picklability).
|
|
1661
|
+
|
|
1662
|
+
Each stratum's ``(label, group)`` units are resampled independently (per ``resample_labels``;
|
|
1663
|
+
labels not listed are held fixed) with a single per-resample RNG, the per-stratum metric is
|
|
1664
|
+
computed on the gathered rows, and ``combine`` reduces the ``{key: metric}`` map to one scalar.
|
|
1665
|
+
Returns ``None`` if any stratum metric or ``combine`` raises (a degenerate draw).
|
|
1666
|
+
"""
|
|
1667
|
+
rng = np.random.default_rng(seed_seq)
|
|
1668
|
+
by_key: dict[Hashable, float] = {}
|
|
1669
|
+
for key, units in strata_units.items():
|
|
1670
|
+
y_k, s_k = strata_data[key]
|
|
1671
|
+
parts: list[np.ndarray] = []
|
|
1672
|
+
for lab, group_rows in units.items():
|
|
1673
|
+
if lab in resample_labels:
|
|
1674
|
+
chosen = rng.integers(0, len(group_rows), size=len(group_rows))
|
|
1675
|
+
parts.extend(group_rows[c] for c in chosen)
|
|
1676
|
+
else:
|
|
1677
|
+
parts.extend(group_rows)
|
|
1678
|
+
idx = np.concatenate(parts)
|
|
1679
|
+
try:
|
|
1680
|
+
by_key[key] = float(per_stratum_metric(y_k[idx], s_k[idx]))
|
|
1681
|
+
except (ValueError, RuntimeError):
|
|
1682
|
+
return None
|
|
1683
|
+
try:
|
|
1684
|
+
return float(combine(by_key))
|
|
1685
|
+
except (ValueError, RuntimeError, KeyError, ZeroDivisionError):
|
|
1686
|
+
return None
|
|
1687
|
+
|
|
1688
|
+
|
|
1689
|
+
def stratified_cluster_bootstrap_ci(
|
|
1690
|
+
strata: Mapping[Hashable, tuple[np.ndarray, np.ndarray, np.ndarray]],
|
|
1691
|
+
per_stratum_metric: MetricFn,
|
|
1692
|
+
combine: Callable[[Mapping[Hashable, float]], float],
|
|
1693
|
+
*,
|
|
1694
|
+
resample_labels: tuple[int, ...] = (0, 1),
|
|
1695
|
+
n_resamples: int = DEFAULT_N_RESAMPLES,
|
|
1696
|
+
confidence: float = DEFAULT_CONFIDENCE,
|
|
1697
|
+
rng: RNGLike | SeedLike | None = DEFAULT_SEED,
|
|
1698
|
+
n_jobs: int = 1,
|
|
1699
|
+
) -> BootstrapCI:
|
|
1700
|
+
r"""Cluster bootstrap of a **composite** statistic over several independent strata.
|
|
1701
|
+
|
|
1702
|
+
Generalises :func:`cluster_bootstrap_ci` (one condition, one metric) to a statistic that is a
|
|
1703
|
+
user-supplied **reduction over several independently-resampled cluster strata** — the shape that
|
|
1704
|
+
leave-one-group-out transfer gaps take in practice: a per-seed (and per-group) cluster resample,
|
|
1705
|
+
averaged/combined into one scalar (a seed-averaged ROC-AUC gap, a mean-over-carriers gap, a
|
|
1706
|
+
top-minus-bottom per-type AUPRC contrast, …). Each bootstrap iteration resamples every stratum's
|
|
1707
|
+
``(label, group)`` clusters (label-stratified, per ``resample_labels``; labels not listed are
|
|
1708
|
+
held fixed), computes ``per_stratum_metric`` on each, and reduces the ``{key: metric}`` map with
|
|
1709
|
+
``combine``; the percentile CI is over those reduced values.
|
|
1710
|
+
|
|
1711
|
+
:func:`cluster_bootstrap_ci` is the single-stratum, identity-reduce special case
|
|
1712
|
+
(``strata={0: (y, score, groups)}``, ``combine=lambda m: m[0]``).
|
|
1713
|
+
|
|
1714
|
+
Parameters
|
|
1715
|
+
----------
|
|
1716
|
+
strata : Mapping[key, (y_true, y_score, groups)]
|
|
1717
|
+
Independent resample-units keyed by any hashable (e.g. ``seed`` / ``(carrier, seed)`` /
|
|
1718
|
+
``(attack_type, seed)``). Each value is three aligned 1-D arrays. Iteration order is the
|
|
1719
|
+
mapping's order (stable ⇒ deterministic).
|
|
1720
|
+
per_stratum_metric : callable ``(y_true, y_score) -> float``
|
|
1721
|
+
Metric computed on each stratum's resampled rows (e.g. ``roc_auc``, ``pr_auc``). Must be
|
|
1722
|
+
**picklable** when ``n_jobs != 1``.
|
|
1723
|
+
combine : callable ``Mapping[key, float] -> float``
|
|
1724
|
+
Reduces the per-stratum metrics to the composite statistic (e.g.
|
|
1725
|
+
``val − mean_seed(m)``; ``mean_carrier(val[c] − mean_seed(m[c, ·]))``; a top−bottom contrast).
|
|
1726
|
+
Closes over any fixed quantities (val ROC, the type partition) — pass a **picklable**
|
|
1727
|
+
top-level function or ``functools.partial`` when ``n_jobs != 1`` (lambdas are fine at
|
|
1728
|
+
``n_jobs == 1``).
|
|
1729
|
+
resample_labels : tuple[int, ...], optional
|
|
1730
|
+
Which label strata are cluster-resampled within each stratum (default ``(0, 1)``);
|
|
1731
|
+
``(1,)`` resamples only positive clusters, holding negatives fixed (the payload-cluster
|
|
1732
|
+
convention). Labels not present in a given stratum are simply skipped there.
|
|
1733
|
+
n_resamples, confidence, rng : standard bootstrap params (``rng`` per SPEC 7).
|
|
1734
|
+
n_jobs : int, optional
|
|
1735
|
+
Parallel workers (default 1). Per-resample seeding via :func:`spawn_seed_sequences` makes the
|
|
1736
|
+
CI **bit-for-bit identical across ``n_jobs``**; ``n_jobs=-1`` uses all cores. See
|
|
1737
|
+
:ref:`methodology/parallelism`.
|
|
1738
|
+
|
|
1739
|
+
Returns
|
|
1740
|
+
-------
|
|
1741
|
+
BootstrapCI
|
|
1742
|
+
``method="stratified_cluster_percentile"``; ``point_estimate = combine({key:
|
|
1743
|
+
per_stratum_metric(y, score)})`` on the full data; ``[alpha/2, 1 - alpha/2]`` percentile CI.
|
|
1744
|
+
|
|
1745
|
+
Raises
|
|
1746
|
+
------
|
|
1747
|
+
ValueError
|
|
1748
|
+
On empty ``strata``, a stratum whose arrays mismatch shape / are not 1-D, empty
|
|
1749
|
+
``resample_labels``, ``confidence`` outside (0, 1), ``n_jobs == 0``, or > 5% degenerate
|
|
1750
|
+
resamples.
|
|
1751
|
+
TypeError
|
|
1752
|
+
If ``n_jobs != 1`` and ``per_stratum_metric`` / ``combine`` are not picklable.
|
|
1753
|
+
|
|
1754
|
+
Examples
|
|
1755
|
+
--------
|
|
1756
|
+
>>> import numpy as np
|
|
1757
|
+
>>> from eval_toolkit.metrics import roc_auc
|
|
1758
|
+
>>> def _block(seed):
|
|
1759
|
+
... g = np.repeat(np.arange(20), 4) # 20 clusters of 4
|
|
1760
|
+
... y = (g % 2).astype(int)
|
|
1761
|
+
... s = y + np.random.default_rng(seed).normal(0, 0.3, size=y.size)
|
|
1762
|
+
... return y, s, g
|
|
1763
|
+
>>> strata = {0: _block(0), 1: _block(1)} # two seeds
|
|
1764
|
+
>>> ci = stratified_cluster_bootstrap_ci(
|
|
1765
|
+
... strata, roc_auc, lambda m: float(np.mean(list(m.values()))),
|
|
1766
|
+
... n_resamples=200, rng=0)
|
|
1767
|
+
>>> ci.method
|
|
1768
|
+
'stratified_cluster_percentile'
|
|
1769
|
+
>>> bool(0.0 <= ci.ci_low <= ci.ci_high <= 1.0)
|
|
1770
|
+
True
|
|
1771
|
+
|
|
1772
|
+
Notes
|
|
1773
|
+
-----
|
|
1774
|
+
For a *gap* with a fixed offset (``Gx = val − stat``, ``val`` fixed), fold the offset into
|
|
1775
|
+
``combine`` (``combine`` returns ``val − mean(m.values())``) so the CI is on ``Gx`` directly.
|
|
1776
|
+
|
|
1777
|
+
References
|
|
1778
|
+
----------
|
|
1779
|
+
.. [1] Efron, B. & Tibshirani, R. "An Introduction to the Bootstrap." Chapman & Hall, 1993.
|
|
1780
|
+
(§8 — stratified / clustered resampling.)
|
|
1781
|
+
.. [2] Field, C. A. & Welsh, A. H. "Bootstrapping clustered data." JRSS-B 69(3), 2007.
|
|
1782
|
+
"""
|
|
1783
|
+
if not strata:
|
|
1784
|
+
raise ValueError("strata must be non-empty")
|
|
1785
|
+
if not 0.0 < confidence < 1.0:
|
|
1786
|
+
raise ValueError(f"confidence must be in (0, 1), got {confidence}")
|
|
1787
|
+
resample_labels = tuple(int(x) for x in resample_labels)
|
|
1788
|
+
if not resample_labels:
|
|
1789
|
+
raise ValueError("resample_labels must be non-empty (nothing would be resampled)")
|
|
1790
|
+
|
|
1791
|
+
strata_data: dict[Hashable, tuple[np.ndarray, np.ndarray]] = {}
|
|
1792
|
+
strata_units: dict[Hashable, dict[int, list[np.ndarray]]] = {}
|
|
1793
|
+
for key, triple in strata.items():
|
|
1794
|
+
y_a, s_a, g_a = (np.asarray(triple[0]), np.asarray(triple[1]), np.asarray(triple[2]))
|
|
1795
|
+
if not (y_a.shape == s_a.shape == g_a.shape) or y_a.ndim != 1:
|
|
1796
|
+
raise ValueError(
|
|
1797
|
+
f"stratum {key!r}: y/score/groups must be aligned 1-D arrays; got "
|
|
1798
|
+
f"{y_a.shape}, {s_a.shape}, {g_a.shape}"
|
|
1799
|
+
)
|
|
1800
|
+
strata_data[key] = (y_a, s_a)
|
|
1801
|
+
strata_units[key] = _label_cluster_units(y_a, g_a)
|
|
1802
|
+
|
|
1803
|
+
point = float(combine({k: float(per_stratum_metric(*strata_data[k])) for k in strata_data}))
|
|
1804
|
+
seed_seqs = spawn_seed_sequences(rng, n_resamples)
|
|
1805
|
+
step = functools.partial(
|
|
1806
|
+
_stratified_cluster_step,
|
|
1807
|
+
strata_data=strata_data,
|
|
1808
|
+
strata_units=strata_units,
|
|
1809
|
+
per_stratum_metric=per_stratum_metric,
|
|
1810
|
+
combine=combine,
|
|
1811
|
+
resample_labels=resample_labels,
|
|
1812
|
+
)
|
|
1813
|
+
raw = parallel_map(
|
|
1814
|
+
step, seed_seqs, n_jobs=n_jobs, description="stratified_cluster_bootstrap_ci"
|
|
1815
|
+
)
|
|
1816
|
+
failures = sum(1 for r in raw if r is None)
|
|
1817
|
+
vals = [r for r in raw if r is not None]
|
|
1818
|
+
if failures > 0.05 * n_resamples:
|
|
1819
|
+
raise ValueError(
|
|
1820
|
+
f"stratified_cluster_bootstrap_ci: {failures}/{n_resamples} resamples degenerate "
|
|
1821
|
+
"(a per-stratum metric or combine raised); refusing CI on > 5% degenerate"
|
|
1822
|
+
)
|
|
1823
|
+
if not vals:
|
|
1824
|
+
raise ValueError("stratified_cluster_bootstrap_ci: no usable resamples")
|
|
1825
|
+
arr = np.asarray(vals, dtype=np.float64)
|
|
1826
|
+
alpha = 1.0 - confidence
|
|
1827
|
+
ci_low, ci_high = np.quantile(arr, [alpha / 2.0, 1.0 - alpha / 2.0])
|
|
1828
|
+
return BootstrapCI(
|
|
1829
|
+
point_estimate=point,
|
|
1830
|
+
ci_low=float(ci_low),
|
|
1831
|
+
ci_high=float(ci_high),
|
|
1832
|
+
confidence=confidence,
|
|
1833
|
+
n_resamples=int(len(vals)),
|
|
1834
|
+
method="stratified_cluster_percentile",
|
|
1835
|
+
)
|
|
1836
|
+
|
|
1837
|
+
|
|
1650
1838
|
def cross_validate_metric(
|
|
1651
1839
|
y_true: np.ndarray,
|
|
1652
1840
|
y_score: np.ndarray,
|
|
@@ -242,6 +242,7 @@
|
|
|
242
242
|
"skipped_metric",
|
|
243
243
|
"source_role_gate",
|
|
244
244
|
"split_provenance_config",
|
|
245
|
+
"stratified_cluster_bootstrap_ci",
|
|
245
246
|
"stratified_recall",
|
|
246
247
|
"strict_artifact_gate",
|
|
247
248
|
"sweep",
|
|
@@ -1430,7 +1431,7 @@
|
|
|
1430
1431
|
"doc_first_line": "str(object='') -> str",
|
|
1431
1432
|
"kind": "value",
|
|
1432
1433
|
"type": "str",
|
|
1433
|
-
"value": "'1.7.0'"
|
|
1434
|
+
"value": "'1.8.0'"
|
|
1434
1435
|
},
|
|
1435
1436
|
"apply_operating_points": {
|
|
1436
1437
|
"doc_first_line": "Apply fitted thresholds to a mixed-class or single-class target slice.",
|
|
@@ -1947,6 +1948,11 @@
|
|
|
1947
1948
|
"kind": "function",
|
|
1948
1949
|
"signature": "(config: 'Mapping[str, Any]', repo_root: 'Path | str | None' = None, *, path_keys: 'tuple[str, ...]' = ('path', 'dir', 'file', 'splits_dir', 'model_path')) -> 'dict[str, Any]'"
|
|
1949
1950
|
},
|
|
1951
|
+
"stratified_cluster_bootstrap_ci": {
|
|
1952
|
+
"doc_first_line": "Cluster bootstrap of a **composite** statistic over several independent strata.",
|
|
1953
|
+
"kind": "function",
|
|
1954
|
+
"signature": "(strata: 'Mapping[Hashable, tuple[np.ndarray, np.ndarray, np.ndarray]]', per_stratum_metric: 'MetricFn', combine: 'Callable[[Mapping[Hashable, float]], float]', *, resample_labels: 'tuple[int, ...]' = (0, 1), n_resamples: 'int' = 1000, confidence: 'float' = 0.95, rng: 'RNGLike | SeedLike | None' = 42, n_jobs: 'int' = 1) -> 'BootstrapCI'"
|
|
1955
|
+
},
|
|
1950
1956
|
"stratified_recall": {
|
|
1951
1957
|
"doc_first_line": "Recall (TPR) per categorical stratum.",
|
|
1952
1958
|
"kind": "function",
|
|
@@ -0,0 +1,153 @@
|
|
|
1
|
+
"""Tests for :func:`eval_toolkit.bootstrap.stratified_cluster_bootstrap_ci`.
|
|
2
|
+
|
|
3
|
+
Covers the public contract, the **single-stratum identity reduction == `cluster_bootstrap_ci`**
|
|
4
|
+
equivalence, the **seed-averaged gap** shape (`Gx = val − mean_seed(metric)`), a **composite
|
|
5
|
+
top−bottom** statistic over strata, the v0.34.0 n_jobs-reproducibility contract, and validation.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import functools
|
|
11
|
+
|
|
12
|
+
import numpy as np
|
|
13
|
+
import pytest
|
|
14
|
+
|
|
15
|
+
from eval_toolkit.bootstrap import (
|
|
16
|
+
BootstrapCI,
|
|
17
|
+
cluster_bootstrap_ci,
|
|
18
|
+
stratified_cluster_bootstrap_ci,
|
|
19
|
+
)
|
|
20
|
+
from eval_toolkit.metrics import roc_auc
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def _stratum(
|
|
24
|
+
seed: int, *, n_clusters: int = 30, per: int = 4
|
|
25
|
+
) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
|
|
26
|
+
"""Cluster-pure labels + a separable score with cluster-level noise (one resample-unit)."""
|
|
27
|
+
rng = np.random.default_rng(seed)
|
|
28
|
+
g = np.repeat(np.arange(n_clusters), per)
|
|
29
|
+
y = (g % 2).astype(int)
|
|
30
|
+
s = y + rng.normal(0, 0.3, size=y.size) + rng.normal(0, 0.3, size=n_clusters)[g]
|
|
31
|
+
return y, s, g
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def _mean_combine(m: dict) -> float: # picklable (module-level)
|
|
35
|
+
return float(np.mean(list(m.values())))
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def _gap_combine(m: dict, *, val: float) -> float: # picklable via functools.partial
|
|
39
|
+
return float(val - np.mean(list(m.values())))
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def _top_bottom_combine(m: dict, *, top: tuple, bottom: tuple) -> float: # picklable via partial
|
|
43
|
+
return float(np.mean([m[k] for k in bottom]) - np.mean([m[k] for k in top]))
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
@pytest.mark.unit
|
|
47
|
+
def test_returns_bootstrap_ci_with_stratified_method() -> None:
|
|
48
|
+
"""Basic contract: BootstrapCI, stratified_cluster_percentile method, ordered CI."""
|
|
49
|
+
strata = {0: _stratum(0), 1: _stratum(1)}
|
|
50
|
+
ci = stratified_cluster_bootstrap_ci(strata, roc_auc, _mean_combine, n_resamples=300, rng=0)
|
|
51
|
+
assert isinstance(ci, BootstrapCI)
|
|
52
|
+
assert ci.method == "stratified_cluster_percentile"
|
|
53
|
+
assert ci.ci_low <= ci.point_estimate <= ci.ci_high
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
@pytest.mark.unit
|
|
57
|
+
def test_single_stratum_identity_equals_cluster_bootstrap_ci() -> None:
|
|
58
|
+
"""One stratum + identity reduce reproduces cluster_bootstrap_ci bit-for-bit (same rng path)."""
|
|
59
|
+
y, s, g = _stratum(7)
|
|
60
|
+
strat = stratified_cluster_bootstrap_ci(
|
|
61
|
+
{0: (y, s, g)}, roc_auc, lambda m: m[0], n_resamples=400, rng=11
|
|
62
|
+
)
|
|
63
|
+
single = cluster_bootstrap_ci(y, s, g, roc_auc, n_resamples=400, rng=11)
|
|
64
|
+
assert (strat.point_estimate, strat.ci_low, strat.ci_high) == (
|
|
65
|
+
single.point_estimate,
|
|
66
|
+
single.ci_low,
|
|
67
|
+
single.ci_high,
|
|
68
|
+
)
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
@pytest.mark.unit
|
|
72
|
+
def test_seed_averaged_gap_shape() -> None:
|
|
73
|
+
"""The dialect/carrier shape: Gx = val − mean_seed(roc_auc) folded into combine."""
|
|
74
|
+
strata = {s: _stratum(s) for s in (0, 1, 2)}
|
|
75
|
+
val = 0.95
|
|
76
|
+
point_metric = float(np.mean([roc_auc(strata[s][0], strata[s][1]) for s in strata]))
|
|
77
|
+
ci = stratified_cluster_bootstrap_ci(
|
|
78
|
+
strata, roc_auc, functools.partial(_gap_combine, val=val), n_resamples=400, rng=0
|
|
79
|
+
)
|
|
80
|
+
assert ci.method == "stratified_cluster_percentile"
|
|
81
|
+
assert np.isclose(ci.point_estimate, val - point_metric) # combine reduces correctly
|
|
82
|
+
assert ci.ci_low <= ci.ci_high
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
@pytest.mark.unit
|
|
86
|
+
def test_composite_top_bottom_statistic() -> None:
|
|
87
|
+
"""The §6.5 shape: a top−bottom contrast over strata keyed by group (positives-only resample)."""
|
|
88
|
+
# 4 strata, separability ordered worst->best by key, positives resampled, negatives fixed.
|
|
89
|
+
strata = {k: _stratum(k, n_clusters=20) for k in range(4)}
|
|
90
|
+
ci = stratified_cluster_bootstrap_ci(
|
|
91
|
+
strata,
|
|
92
|
+
roc_auc,
|
|
93
|
+
functools.partial(_top_bottom_combine, top=(0, 1), bottom=(2, 3)),
|
|
94
|
+
resample_labels=(1,),
|
|
95
|
+
n_resamples=300,
|
|
96
|
+
rng=0,
|
|
97
|
+
)
|
|
98
|
+
assert ci.method == "stratified_cluster_percentile"
|
|
99
|
+
assert ci.ci_low <= ci.point_estimate <= ci.ci_high
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
@pytest.mark.unit
|
|
103
|
+
@pytest.mark.slow
|
|
104
|
+
def test_njobs_reproducibility() -> None:
|
|
105
|
+
"""Same seed → bit-for-bit-identical CI across n_jobs (spawn_seed_sequences)."""
|
|
106
|
+
strata = {0: _stratum(0), 1: _stratum(1), 2: _stratum(2)}
|
|
107
|
+
r1 = stratified_cluster_bootstrap_ci(
|
|
108
|
+
strata, roc_auc, _mean_combine, n_resamples=200, rng=42, n_jobs=1
|
|
109
|
+
)
|
|
110
|
+
r2 = stratified_cluster_bootstrap_ci(
|
|
111
|
+
strata, roc_auc, _mean_combine, n_resamples=200, rng=42, n_jobs=2
|
|
112
|
+
)
|
|
113
|
+
assert (r1.point_estimate, r1.ci_low, r1.ci_high) == (r2.point_estimate, r2.ci_low, r2.ci_high)
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
@pytest.mark.unit
|
|
117
|
+
@pytest.mark.slow
|
|
118
|
+
def test_njobs_minus_one_runs() -> None:
|
|
119
|
+
"""n_jobs=-1 (all cores) completes without error."""
|
|
120
|
+
strata = {0: _stratum(0), 1: _stratum(1)}
|
|
121
|
+
ci = stratified_cluster_bootstrap_ci(
|
|
122
|
+
strata, roc_auc, _mean_combine, n_resamples=100, rng=42, n_jobs=-1
|
|
123
|
+
)
|
|
124
|
+
assert ci.ci_low <= ci.ci_high
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
@pytest.mark.unit
|
|
128
|
+
@pytest.mark.parametrize(
|
|
129
|
+
("kwargs", "strata_override", "match"),
|
|
130
|
+
[
|
|
131
|
+
({}, {}, "strata must be non-empty"),
|
|
132
|
+
({"confidence": 1.5}, None, r"confidence must be in \(0, 1\)"),
|
|
133
|
+
({"resample_labels": ()}, None, "non-empty"),
|
|
134
|
+
({"n_jobs": 0}, None, "n_jobs"),
|
|
135
|
+
],
|
|
136
|
+
)
|
|
137
|
+
def test_validation_errors(kwargs: dict, strata_override, match: str) -> None:
|
|
138
|
+
"""Invalid parameters raise ValueError with a diagnostic message."""
|
|
139
|
+
strata = {} if strata_override == {} else {0: _stratum(0), 1: _stratum(1)}
|
|
140
|
+
with pytest.raises(ValueError, match=match):
|
|
141
|
+
stratified_cluster_bootstrap_ci(
|
|
142
|
+
strata, roc_auc, _mean_combine, n_resamples=50, rng=0, **kwargs
|
|
143
|
+
)
|
|
144
|
+
|
|
145
|
+
|
|
146
|
+
@pytest.mark.unit
|
|
147
|
+
def test_shape_mismatch_in_a_stratum_raises() -> None:
|
|
148
|
+
"""A stratum with misaligned arrays raises ValueError naming the stratum."""
|
|
149
|
+
y, s, g = _stratum(0)
|
|
150
|
+
with pytest.raises(ValueError, match="aligned 1-D"):
|
|
151
|
+
stratified_cluster_bootstrap_ci(
|
|
152
|
+
{"bad": (y, s[:-1], g)}, roc_auc, _mean_combine, n_resamples=50, rng=0
|
|
153
|
+
)
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/src/eval_toolkit/audit_sister_doc_concept_drift.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/tests/baseline/test_plotting_visual/plot_lift_ci.png
RENAMED
|
File without changes
|
{eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/tests/baseline/test_plotting_visual/plot_metric_bars.png
RENAMED
|
File without changes
|
|
File without changes
|
{eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/tests/baseline/test_plotting_visual/plot_pr_curve.png
RENAMED
|
File without changes
|
|
File without changes
|
{eval_toolkit-1.7.0 → eval_toolkit-1.8.0}/tests/baseline/test_plotting_visual/plot_roc_curve.png
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|