eval-toolkit 0.49.0__tar.gz → 0.50.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/CHANGELOG.md +56 -0
- {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/PKG-INFO +3 -3
- {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/README.md +2 -2
- {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/STYLE.md +1 -1
- {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/src/eval_toolkit/_rng.py +19 -0
- {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/src/eval_toolkit/_version.py +1 -1
- {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/src/eval_toolkit/analysis.py +5 -4
- {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/src/eval_toolkit/bootstrap.py +42 -33
- {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/src/eval_toolkit/harness.py +31 -24
- {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/src/eval_toolkit/metrics.py +7 -5
- {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/src/eval_toolkit/scorecards.py +14 -11
- {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/src/eval_toolkit/stacking.py +16 -4
- {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/src/eval_toolkit/thresholds.py +5 -4
- {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/golden/bootstrap_ci/cases.json +6 -6
- {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/golden/public_api/snapshot.json +14 -14
- {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_analysis.py +5 -5
- {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_block_bootstrap_on_folds.py +7 -7
- {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_bootstrap_calibration_mc.py +4 -4
- {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_bootstrap_edge_cases.py +3 -3
- {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_bootstrap_golden.py +18 -18
- {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_bootstrap_njobs.py +12 -12
- {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_bootstrap_props.py +11 -11
- {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_bootstrap_research_grounded.py +3 -3
- {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_bootstrap_unit.py +18 -18
- {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_calibration_bootstrap_chain.py +2 -2
- {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_coverage_bootstrap.py +6 -6
- {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_coverage_metrics.py +1 -1
- {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_harness_fault_injection.py +2 -2
- {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_harness_internals.py +3 -3
- {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_harness_metric_options.py +2 -2
- {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_harness_parallelism.py +10 -10
- {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_harness_smoke.py +2 -2
- {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_logging.py +1 -1
- {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_pipeline_e2e.py +5 -5
- {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_reference_equivalence.py +2 -2
- {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_reproducibility_integration.py +10 -10
- {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_scorecard.py +16 -16
- {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_stacking.py +17 -19
- {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_thresholds.py +2 -2
- {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_thresholds_coverage.py +1 -1
- {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_v09_contracts.py +3 -3
- {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/.gitignore +0 -0
- {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/LICENSE +0 -0
- {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/docs/archive/README.md +0 -0
- {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/docs/research/README.md +0 -0
- {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/docs/research/datasets/README.md +0 -0
- {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/docs/research/papers/data-integrity/README.md +0 -0
- {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/docs/research/papers/eval-ecosystem/README.md +0 -0
- {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/docs/research/papers/inference/README.md +0 -0
- {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/docs/research/papers/prompt-injection/README.md +0 -0
- {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/docs/source/adr/README.md +0 -0
- {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/docs/source/methodology/README.md +0 -0
- {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/pyproject.toml +0 -0
- {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/src/eval_toolkit/__init__.py +0 -0
- {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/src/eval_toolkit/__main__.py +0 -0
- {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/src/eval_toolkit/_deprecated.py +0 -0
- {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/src/eval_toolkit/_parallel.py +0 -0
- {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/src/eval_toolkit/_sweep.py +0 -0
- {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/src/eval_toolkit/adversarial.py +0 -0
- {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/src/eval_toolkit/artifacts.py +0 -0
- {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/src/eval_toolkit/calibration.py +0 -0
- {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/src/eval_toolkit/claims.py +0 -0
- {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/src/eval_toolkit/config.py +0 -0
- {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/src/eval_toolkit/docs.py +0 -0
- {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/src/eval_toolkit/embeddings.py +0 -0
- {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/src/eval_toolkit/evidence.py +0 -0
- {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/src/eval_toolkit/leakage.py +0 -0
- {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/src/eval_toolkit/loaders.py +0 -0
- {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/src/eval_toolkit/losses.py +0 -0
- {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/src/eval_toolkit/manifest.py +0 -0
- {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/src/eval_toolkit/metric_specs.py +0 -0
- {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/src/eval_toolkit/operating_points.py +0 -0
- {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/src/eval_toolkit/paths.py +0 -0
- {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/src/eval_toolkit/plotting.py +0 -0
- {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/src/eval_toolkit/preprocessing.py +0 -0
- {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/src/eval_toolkit/probes.py +0 -0
- {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/src/eval_toolkit/protocols.py +0 -0
- {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/src/eval_toolkit/provenance.py +0 -0
- {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/src/eval_toolkit/py.typed +0 -0
- {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/src/eval_toolkit/schemas/manifest.v1.json +0 -0
- {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/src/eval_toolkit/schemas/manifest.v2.json +0 -0
- {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/src/eval_toolkit/schemas/manifest.v3.json +0 -0
- {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/src/eval_toolkit/schemas/ood_manifest.v1.json +0 -0
- {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/src/eval_toolkit/schemas/results.v1.json +0 -0
- {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/src/eval_toolkit/schemas/results_full.v1.json +0 -0
- {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/src/eval_toolkit/seeds.py +0 -0
- {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/src/eval_toolkit/splits.py +0 -0
- {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/src/eval_toolkit/text_dedup.py +0 -0
- {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/baseline/test_plotting_visual/plot_bootstrap_distribution.png +0 -0
- {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/baseline/test_plotting_visual/plot_confusion_matrix_grid.png +0 -0
- {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/baseline/test_plotting_visual/plot_lift_ci.png +0 -0
- {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/baseline/test_plotting_visual/plot_metric_bars.png +0 -0
- {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/baseline/test_plotting_visual/plot_pareto_frontier.png +0 -0
- {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/baseline/test_plotting_visual/plot_pr_curve.png +0 -0
- {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/baseline/test_plotting_visual/plot_reliability_diagram.png +0 -0
- {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/baseline/test_plotting_visual/plot_roc_curve.png +0 -0
- {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/baseline/test_plotting_visual/plot_score_histograms.png +0 -0
- {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/baseline/test_plotting_visual/plot_slice_metric_heatmap.png +0 -0
- {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/benchmarks/__init__.py +0 -0
- {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/benchmarks/test_kernel_benchmarks.py +0 -0
- {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/conftest.py +0 -0
- {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/golden/data/dedup_holdout.jsonl +0 -0
- {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/golden/data/dedup_holdout_expected.json +0 -0
- {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/golden/data/dedup_holdout_provenance.md +0 -0
- {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/golden/docs/expected.md +0 -0
- {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/golden/docs/input.md +0 -0
- {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/golden/docs/metrics.json +0 -0
- {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/golden/test_dedup_holdout_calibration.py +0 -0
- {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/strategies.py +0 -0
- {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_adversarial.py +0 -0
- {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_artifacts.py +0 -0
- {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_calibration_binary_adapters.py +0 -0
- {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_calibration_determinism.py +0 -0
- {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_calibration_optimization_failures.py +0 -0
- {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_calibration_props.py +0 -0
- {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_calibration_research_grounded.py +0 -0
- {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_calibration_unit.py +0 -0
- {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_claims.py +0 -0
- {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_claims_coverage.py +0 -0
- {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_claims_props.py +0 -0
- {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_cli.py +0 -0
- {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_config.py +0 -0
- {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_coverage_calibration.py +0 -0
- {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_coverage_harness.py +0 -0
- {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_coverage_plotting.py +0 -0
- {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_croissant_e2e.py +0 -0
- {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_dedup_split_leakage_chain.py +0 -0
- {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_deprecated_scalars_shim.py +0 -0
- {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_deprecations.py +0 -0
- {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_docs_golden.py +0 -0
- {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_docs_props.py +0 -0
- {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_embeddings.py +0 -0
- {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_evidence_validators.py +0 -0
- {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_harness_edge_cases.py +0 -0
- {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_harness_folded.py +0 -0
- {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_import_boundaries.py +0 -0
- {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_is_metric_defined_for_slice.py +0 -0
- {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_lazy_extras_messages.py +0 -0
- {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_leakage.py +0 -0
- {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_leakage_error_paths.py +0 -0
- {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_leakage_props.py +0 -0
- {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_loaders.py +0 -0
- {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_loaders_coverage.py +0 -0
- {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_loaders_props.py +0 -0
- {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_losses.py +0 -0
- {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_manifest.py +0 -0
- {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_manifest_contamination_round_trip.py +0 -0
- {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_manifest_props.py +0 -0
- {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_manifest_validation.py +0 -0
- {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_metrics_props.py +0 -0
- {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_metrics_stratified_subsets.py +0 -0
- {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_metrics_unit.py +0 -0
- {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_misc_coverage.py +0 -0
- {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_numeric_edge_cases.py +0 -0
- {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_ood_loader.py +0 -0
- {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_operating_points.py +0 -0
- {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_operating_points_props.py +0 -0
- {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_parallel.py +0 -0
- {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_paths.py +0 -0
- {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_plotting_edge.py +0 -0
- {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_plotting_smoke.py +0 -0
- {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_plotting_visual.py +0 -0
- {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_preprocessing.py +0 -0
- {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_probes.py +0 -0
- {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_protocol_conformance.py +0 -0
- {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_provenance.py +0 -0
- {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_public_api.py +0 -0
- {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_recall_at_fpr.py +0 -0
- {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_schemas.py +0 -0
- {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_seeds.py +0 -0
- {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_splits.py +0 -0
- {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_splits_leakage_integration.py +0 -0
- {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_splits_props.py +0 -0
- {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_sweep.py +0 -0
- {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_text_dedup.py +0 -0
- {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_text_dedup_coverage.py +0 -0
- {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_text_dedup_props.py +0 -0
- {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_text_dedup_strategies.py +0 -0
- {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_thresholds_constant_score.py +0 -0
- {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_thresholds_props.py +0 -0
- {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_thresholds_research_grounded.py +0 -0
- {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_tokenization_leakage_check.py +0 -0
|
@@ -5,6 +5,62 @@ All notable changes to this project will be documented in this file.
|
|
|
5
5
|
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
|
|
6
6
|
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
|
7
7
|
|
|
8
|
+
## [0.50.0] — 2026-05-23 — SPEC 7 `rng` parameter adoption
|
|
9
|
+
|
|
10
|
+
The SPEC 7 follow-up to v0.49.0. The `_rng.py` scaffold shipped at
|
|
11
|
+
v0.49.0 (SeedLike + RNGLike type aliases per
|
|
12
|
+
[Scientific Python SPEC 7](https://scientific-python.org/specs/spec-0007/))
|
|
13
|
+
is now wired into every Tier-1 public function that consumes a NumPy RNG.
|
|
14
|
+
|
|
15
|
+
### BREAKING
|
|
16
|
+
|
|
17
|
+
**22 Tier-1 function signatures**: `seed: int = X` / `random_state: int | None` → `rng: RNGLike | SeedLike | None = X`. Pre-v1.0 SemVer-minor BREAKING (v0.34.0 precedent). Defaults preserved (still deterministic-by-default).
|
|
18
|
+
|
|
19
|
+
Affected functions:
|
|
20
|
+
|
|
21
|
+
- `bootstrap.py` (7 public + 1 private): `bootstrap_ci`, `paired_bootstrap_diff`, `paired_bootstrap_ece_diff`, `paired_bootstrap_op_point_diff`, `paired_mde`, `block_bootstrap_on_folds`, `cross_validate_metric`, `_bootstrap_t_ci`.
|
|
22
|
+
- `metrics.py:1063`: `expected_calibration_error_debiased`.
|
|
23
|
+
- `thresholds.py`: `selected_operating_point` + `_bootstrap_threshold_metric_cis`.
|
|
24
|
+
- `analysis.py`: `bootstrap_metric_from_predictions`, `paired_diff_from_prediction_refs`.
|
|
25
|
+
- `harness.py` (6 sites): `evaluate`, `evaluate_scorer_on_slice`, `_bootstrap_auc_ci`, `_evaluate_scores`, `_compute_paired_diffs`, `_score_all_slices`.
|
|
26
|
+
- `scorecards.py`: `scorecard`, `_evaluate_spec`.
|
|
27
|
+
- `stacking.py`: `LogisticStacker.random_state` → `LogisticStacker.rng` class-field rename (sklearn pass-through derives int at the boundary).
|
|
28
|
+
|
|
29
|
+
**Body refactors**:
|
|
30
|
+
|
|
31
|
+
- 4 SeedSequence.spawn() sites converted from `np.random.SeedSequence(seed).spawn(n)` to `rng.bit_generator.seed_seq.spawn(n)` (Option A — preserves existing worker SeedSequence signatures).
|
|
32
|
+
- 2 sklearn-bridge sites in `cross_validate_metric` derive an int seed from `rng` before passing it to `StratifiedKFold`/`KFold(random_state=...)` (defensive across sklearn versions <1.4).
|
|
33
|
+
- `LogisticStacker.fit` derives sklearn int from `self.rng` at the boundary.
|
|
34
|
+
|
|
35
|
+
**Config schema** (Tier-2 additive): `evaluate()` config dict key `"seed"` → `"rng"`. Generator-typed input serializes as `repr(rng)`; int/None serialize as-is (backward-compatible for prior int-seed usage).
|
|
36
|
+
|
|
37
|
+
### Added
|
|
38
|
+
|
|
39
|
+
- **Docstrings**: NumPy-style parameter doc for every renamed function now references `rng : RNGLike | SeedLike | None` with explicit link to SPEC 7.
|
|
40
|
+
- **STYLE.md §3a** + **ADR 0004 D4**: `rng` row flipped from "target convention; adopted in v0.50.0" → "**canonical** convention (adopted v0.50.0)".
|
|
41
|
+
|
|
42
|
+
### Changed
|
|
43
|
+
|
|
44
|
+
- **Test sweep** (~230+ test sites): `seed=X` → `rng=X` in test kwarg calls, EXCEPT in test files that test legitimate `seed`-as-int contexts (`test_adversarial.py` for Python `random.Random`, `test_seeds.py` for `set_global_seeds`, `test_splits*.py` for Splitter dataclass fields, `test_text_dedup*.py` for MinHashLSHStrategy class field).
|
|
45
|
+
- **CHANGELOG header**: this release.
|
|
46
|
+
|
|
47
|
+
### Exceptions to SPEC 7 (KEPT `seed:` — documented in STYLE.md §3a + ADR 0004 D4)
|
|
48
|
+
|
|
49
|
+
- `seeds.set_global_seeds(seed: int)` — global-state setter, not per-function RNG.
|
|
50
|
+
- `adversarial.py` dataclass fields + functional wrappers — use Python stdlib `random.Random(seed)`, not NumPy.
|
|
51
|
+
- `splits.py` Splitter dataclass class-fields (`HoldoutSplitter.seed`, `StratifiedKFoldSplitter.seed`, etc.) — configuration storage, not user-facing RNG parameter.
|
|
52
|
+
- `loaders.py:903` YAML config schema key — declarative; renaming would break consumer YAMLs.
|
|
53
|
+
|
|
54
|
+
### Migration
|
|
55
|
+
|
|
56
|
+
- Consumer (`prompt-injection-detection-submission`) lockstep: bump dep pin `>=0.49.0` → `>=0.50.0`; rename `seed=` → `rng=` on eval-toolkit-bound call sites (estimated 5-8 sites).
|
|
57
|
+
- Bit-for-bit reproducibility preserved when migrating `seed=42` → `rng=42` (int seed is SeedLike; `np.random.default_rng(42)` is the canonical normalization).
|
|
58
|
+
|
|
59
|
+
### Notes
|
|
60
|
+
|
|
61
|
+
- Ships in parallel with Round 8 audit STOP-GATE (Decision Y.2); R8 briefing at commit `6f6839a`, awaiting Codex+Gemini reports.
|
|
62
|
+
- Memory pattern captured at v0.49.0: pre-flight grep MUST cover `README.md`, `.doctest-modules`, and any config files (per `feedback_sybil_runs_readme.md`). Applied to v0.50.0 pre-flight.
|
|
63
|
+
|
|
8
64
|
## [0.49.0] — 2026-05-23 — Global naming-standards sweep + final cleanup before v1.0
|
|
9
65
|
|
|
10
66
|
Final pre-v1.0 minor consolidating the naming-convention standardization
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: eval-toolkit
|
|
3
|
-
Version: 0.49.0
|
|
3
|
+
Version: 0.50.0
|
|
4
4
|
Summary: Reusable evaluation contracts for binary classification: metrics, bootstrap CIs, calibration, artifacts, and evidence gates.
|
|
5
5
|
Project-URL: Homepage, https://github.com/brandon-behring/eval-toolkit
|
|
6
6
|
Project-URL: Documentation, https://brandon-behring.github.io/eval-toolkit/
|
|
@@ -233,12 +233,12 @@ print(f"ECE (10 bins): {expected_calibration_error(y, s, n_bins=10):.3f}")
|
|
|
233
233
|
from eval_toolkit import bootstrap_ci, paired_bootstrap_diff
|
|
234
234
|
from eval_toolkit.metrics import pr_auc
|
|
235
235
|
|
|
236
|
-
ci = bootstrap_ci(y, s, pr_auc, n_resamples=1000, seed=42)
|
|
236
|
+
ci = bootstrap_ci(y, s, pr_auc, n_resamples=1000, rng=42)
|
|
237
237
|
print(f"PR-AUC: {ci.point_estimate:.3f} 95% CI: [{ci.ci_low:.3f}, {ci.ci_high:.3f}]")
|
|
238
238
|
|
|
239
239
|
# Paired bootstrap on the lift between two scorers (s_baseline must be in [0, 1] too).
|
|
240
240
|
s_baseline = np.clip(rng.normal(0.5, 0.3, size=200), 0, 1)
|
|
241
|
-
diff = paired_bootstrap_diff(y, s_baseline, s, pr_auc, n_resamples=1000, seed=42)
|
|
241
|
+
diff = paired_bootstrap_diff(y, s_baseline, s, pr_auc, n_resamples=1000, rng=42)
|
|
242
242
|
print(f"Δ PR-AUC: {diff.delta:.3f} overlaps zero: {diff.overlaps_zero}")
|
|
243
243
|
```
|
|
244
244
|
|
|
@@ -150,12 +150,12 @@ print(f"ECE (10 bins): {expected_calibration_error(y, s, n_bins=10):.3f}")
|
|
|
150
150
|
from eval_toolkit import bootstrap_ci, paired_bootstrap_diff
|
|
151
151
|
from eval_toolkit.metrics import pr_auc
|
|
152
152
|
|
|
153
|
-
ci = bootstrap_ci(y, s, pr_auc, n_resamples=1000, seed=42)
|
|
153
|
+
ci = bootstrap_ci(y, s, pr_auc, n_resamples=1000, rng=42)
|
|
154
154
|
print(f"PR-AUC: {ci.point_estimate:.3f} 95% CI: [{ci.ci_low:.3f}, {ci.ci_high:.3f}]")
|
|
155
155
|
|
|
156
156
|
# Paired bootstrap on the lift between two scorers (s_baseline must be in [0, 1] too).
|
|
157
157
|
s_baseline = np.clip(rng.normal(0.5, 0.3, size=200), 0, 1)
|
|
158
|
-
diff = paired_bootstrap_diff(y, s_baseline, s, pr_auc, n_resamples=1000, seed=42)
|
|
158
|
+
diff = paired_bootstrap_diff(y, s_baseline, s, pr_auc, n_resamples=1000, rng=42)
|
|
159
159
|
print(f"Δ PR-AUC: {diff.delta:.3f} overlaps zero: {diff.overlaps_zero}")
|
|
160
160
|
```
|
|
161
161
|
|
|
@@ -76,7 +76,7 @@ them; deviations need justification in the PR description.
|
|
|
76
76
|
| `n_jobs` | Parallelism (joblib + sklearn convention) |
|
|
77
77
|
| `ax` | Matplotlib axis (matplotlib convention) |
|
|
78
78
|
| `metric` | Callable `(y_true, y_score) -> float` |
|
|
79
|
-
| `rng` | RNG argument per [SPEC 7](https://scientific-python.org/specs/spec-0007/) —
|
|
79
|
+
| `rng` | RNG argument per [SPEC 7](https://scientific-python.org/specs/spec-0007/) — **canonical** convention (adopted v0.50.0). Accepts `int`, `np.random.Generator`, `BitGenerator`, `SeedSequence`, or `None`. |
|
|
80
80
|
|
|
81
81
|
The v0.50.0 SPEC 7 adoption preserves two `seed: int` exceptions:
|
|
82
82
|
`set_global_seeds(seed: int)` (global-state setter, not per-function
|
|
@@ -26,6 +26,7 @@ Exceptions to the SPEC 7 convention — documented in STYLE.md §3a:
|
|
|
26
26
|
from __future__ import annotations
|
|
27
27
|
|
|
28
28
|
from collections.abc import Sequence
|
|
29
|
+
from typing import cast
|
|
29
30
|
|
|
30
31
|
import numpy as np
|
|
31
32
|
|
|
@@ -44,3 +45,21 @@ type RNGLike = np.random.Generator | np.random.BitGenerator
|
|
|
44
45
|
``Generator`` inputs and lifts ``BitGenerator`` inputs into a
|
|
45
46
|
``Generator`` — both forms compose cleanly.
|
|
46
47
|
"""
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def spawn_seed_sequences(rng: RNGLike | SeedLike | None, n: int) -> list[np.random.SeedSequence]:
|
|
51
|
+
"""Spawn ``n`` independent SeedSequences from any SPEC 7 ``rng`` input.
|
|
52
|
+
|
|
53
|
+
Normalizes the input to a ``Generator``, then extracts the underlying
|
|
54
|
+
``SeedSequence`` via the bit-generator and spawns ``n`` children.
|
|
55
|
+
The cast satisfies mypy strict: the ``seed_seq`` attribute on a
|
|
56
|
+
concrete BitGenerator is a ``SeedSequence`` instance, but the type
|
|
57
|
+
stub on ``BitGenerator.seed_seq`` returns the abstract
|
|
58
|
+
``ISeedSequence`` interface (which lacks ``spawn``).
|
|
59
|
+
|
|
60
|
+
Used by the bootstrap parallel workers (which take spawned
|
|
61
|
+
``SeedSequence`` objects to seed their internal ``default_rng()`` calls).
|
|
62
|
+
"""
|
|
63
|
+
gen = np.random.default_rng(rng)
|
|
64
|
+
seed_seq = cast(np.random.SeedSequence, gen.bit_generator.seed_seq)
|
|
65
|
+
return seed_seq.spawn(n)
|
|
@@ -11,6 +11,7 @@ from typing import Any
|
|
|
11
11
|
|
|
12
12
|
import numpy as np
|
|
13
13
|
|
|
14
|
+
from eval_toolkit._rng import RNGLike, SeedLike
|
|
14
15
|
from eval_toolkit.bootstrap import bootstrap_ci, paired_bootstrap_diff
|
|
15
16
|
from eval_toolkit.metrics import pr_auc
|
|
16
17
|
from eval_toolkit.protocols import PredictionReader
|
|
@@ -121,7 +122,7 @@ def bootstrap_metric_from_predictions(
|
|
|
121
122
|
*,
|
|
122
123
|
reader: PredictionReader | None = None,
|
|
123
124
|
n_resamples: int = 1000,
|
|
124
|
-
|
|
125
|
+
rng: RNGLike | SeedLike | None = 42,
|
|
125
126
|
) -> dict[str, object]:
|
|
126
127
|
"""Compute a PR-AUC bootstrap CI from one prediction ref."""
|
|
127
128
|
arrays = load_prediction_arrays(ref, reader=reader)
|
|
@@ -130,7 +131,7 @@ def bootstrap_metric_from_predictions(
|
|
|
130
131
|
arrays.scores,
|
|
131
132
|
pr_auc,
|
|
132
133
|
n_resamples=n_resamples,
|
|
133
|
-
|
|
134
|
+
rng=rng,
|
|
134
135
|
).to_dict()
|
|
135
136
|
|
|
136
137
|
|
|
@@ -141,7 +142,7 @@ def paired_diff_from_prediction_refs(
|
|
|
141
142
|
baseline_reader: PredictionReader | None = None,
|
|
142
143
|
candidate_reader: PredictionReader | None = None,
|
|
143
144
|
n_resamples: int = 1000,
|
|
144
|
-
|
|
145
|
+
rng: RNGLike | SeedLike | None = 42,
|
|
145
146
|
) -> dict[str, object]:
|
|
146
147
|
"""Compute paired PR-AUC delta from two prediction refs.
|
|
147
148
|
|
|
@@ -172,7 +173,7 @@ def paired_diff_from_prediction_refs(
|
|
|
172
173
|
candidate.scores,
|
|
173
174
|
pr_auc,
|
|
174
175
|
n_resamples=n_resamples,
|
|
175
|
-
|
|
176
|
+
rng=rng,
|
|
176
177
|
).to_dict()
|
|
177
178
|
|
|
178
179
|
|
|
@@ -31,6 +31,7 @@ from scipy.stats import norm as _scipy_norm
|
|
|
31
31
|
from scipy.stats import rankdata as _scipy_rankdata
|
|
32
32
|
|
|
33
33
|
from eval_toolkit._parallel import parallel_map
|
|
34
|
+
from eval_toolkit._rng import RNGLike, SeedLike, spawn_seed_sequences
|
|
34
35
|
|
|
35
36
|
_logger = logging.getLogger(__name__)
|
|
36
37
|
|
|
@@ -236,7 +237,7 @@ def bootstrap_ci(
|
|
|
236
237
|
n_resamples: int = DEFAULT_N_RESAMPLES,
|
|
237
238
|
confidence: float = DEFAULT_CONFIDENCE,
|
|
238
239
|
method: Literal["BCa", "percentile", "studentized"] = DEFAULT_METHOD,
|
|
239
|
-
|
|
240
|
+
rng: RNGLike | SeedLike | None = DEFAULT_SEED,
|
|
240
241
|
n_jobs: int = 1,
|
|
241
242
|
) -> BootstrapCI:
|
|
242
243
|
"""Per-condition CI via :func:`scipy.stats.bootstrap`.
|
|
@@ -257,8 +258,9 @@ def bootstrap_ci(
|
|
|
257
258
|
Two-sided confidence level (default 0.95).
|
|
258
259
|
method : {"BCa", "percentile", "studentized"}, optional
|
|
259
260
|
Default "BCa".
|
|
260
|
-
|
|
261
|
-
RNG
|
|
261
|
+
rng : RNGLike | SeedLike | None, optional
|
|
262
|
+
RNG argument per `Scientific Python SPEC 7 <https://scientific-python.org/specs/spec-0007/>`_.
|
|
263
|
+
Int seed (default ``DEFAULT_SEED=42``), ``SeedSequence``, ``BitGenerator``, ``Generator``, or ``None`` (entropy).
|
|
262
264
|
n_jobs : int, optional
|
|
263
265
|
Parallel workers (default 1 — sequential). Only effective when
|
|
264
266
|
``method='studentized'`` (which has the only Python-level outer loop
|
|
@@ -284,7 +286,7 @@ def bootstrap_ci(
|
|
|
284
286
|
>>> rng = np.random.default_rng(42)
|
|
285
287
|
>>> y = rng.integers(0, 2, size=200)
|
|
286
288
|
>>> s = y + rng.normal(0, 0.3, size=200)
|
|
287
|
-
>>> ci = bootstrap_ci(y, s, metric=pr_auc, n_resamples=200,
|
|
289
|
+
>>> ci = bootstrap_ci(y, s, metric=pr_auc, n_resamples=200, rng=42)
|
|
288
290
|
>>> ci.ci_low <= ci.point_estimate <= ci.ci_high
|
|
289
291
|
True
|
|
290
292
|
|
|
@@ -319,13 +321,13 @@ def bootstrap_ci(
|
|
|
319
321
|
)
|
|
320
322
|
|
|
321
323
|
_logger.debug(
|
|
322
|
-
"bootstrap_ci: metric=%s n=%d n_resamples=%d method=%s confidence=%.3f
|
|
324
|
+
"bootstrap_ci: metric=%s n=%d n_resamples=%d method=%s confidence=%.3f rng=%r n_jobs=%d",
|
|
323
325
|
getattr(metric, "__name__", repr(metric)),
|
|
324
326
|
n,
|
|
325
327
|
n_resamples,
|
|
326
328
|
method,
|
|
327
329
|
confidence,
|
|
328
|
-
|
|
330
|
+
rng,
|
|
329
331
|
n_jobs,
|
|
330
332
|
)
|
|
331
333
|
|
|
@@ -342,11 +344,11 @@ def bootstrap_ci(
|
|
|
342
344
|
point,
|
|
343
345
|
n_resamples=n_resamples,
|
|
344
346
|
confidence=confidence,
|
|
345
|
-
|
|
347
|
+
rng=rng,
|
|
346
348
|
n_jobs=n_jobs,
|
|
347
349
|
)
|
|
348
350
|
else:
|
|
349
|
-
rng = np.random.default_rng(
|
|
351
|
+
rng = np.random.default_rng(rng)
|
|
350
352
|
res = _scipy_bootstrap(
|
|
351
353
|
(y_true_arr, y_score_arr),
|
|
352
354
|
statistic=_statistic,
|
|
@@ -423,7 +425,7 @@ def _bootstrap_t_ci(
|
|
|
423
425
|
*,
|
|
424
426
|
n_resamples: int,
|
|
425
427
|
confidence: float,
|
|
426
|
-
|
|
428
|
+
rng: RNGLike | SeedLike | None,
|
|
427
429
|
n_jobs: int = 1,
|
|
428
430
|
) -> tuple[float, float]:
|
|
429
431
|
r"""Studentized bootstrap-t CI per Algeshiemer 2024 / Davison & Hinkley §5.2.
|
|
@@ -441,7 +443,7 @@ def _bootstrap_t_ci(
|
|
|
441
443
|
Skips degenerate resamples (single-class draws causing the metric to
|
|
442
444
|
raise); raises if > 5% of resamples are degenerate.
|
|
443
445
|
"""
|
|
444
|
-
seed_seqs =
|
|
446
|
+
seed_seqs = spawn_seed_sequences(rng, n_resamples)
|
|
445
447
|
step = functools.partial(_bootstrap_t_step, y_true=y_true, y_score=y_score, metric=metric)
|
|
446
448
|
raw_results = parallel_map(step, seed_seqs, n_jobs=n_jobs, description="bootstrap_t")
|
|
447
449
|
valid_pairs = [r for r, _ in raw_results if r is not None]
|
|
@@ -505,7 +507,7 @@ def paired_bootstrap_diff(
|
|
|
505
507
|
*,
|
|
506
508
|
n_resamples: int = DEFAULT_N_RESAMPLES,
|
|
507
509
|
confidence: float = DEFAULT_CONFIDENCE,
|
|
508
|
-
|
|
510
|
+
rng: RNGLike | SeedLike | None = DEFAULT_SEED,
|
|
509
511
|
n_jobs: int = 1,
|
|
510
512
|
) -> PairedBootstrapCI:
|
|
511
513
|
"""Paired-bootstrap CI on ``metric(B) − metric(A)`` using the same resample indices.
|
|
@@ -518,7 +520,7 @@ def paired_bootstrap_diff(
|
|
|
518
520
|
Scores from two scorers on the same rows.
|
|
519
521
|
metric : callable ``(y_true, y_score) -> float``
|
|
520
522
|
Must be picklable when ``n_jobs != 1`` (lambdas not supported).
|
|
521
|
-
n_resamples, confidence,
|
|
523
|
+
n_resamples, confidence, rng : standard bootstrap params (``rng`` per SPEC 7).
|
|
522
524
|
n_jobs : int, optional
|
|
523
525
|
Parallel workers (default 1 — sequential). ``n_jobs > 1`` uses
|
|
524
526
|
joblib loky; ``n_jobs=-1`` uses all cores; ``n_jobs=0`` is rejected.
|
|
@@ -547,7 +549,7 @@ def paired_bootstrap_diff(
|
|
|
547
549
|
>>> y = rng.integers(0, 2, size=200)
|
|
548
550
|
>>> s_a = rng.normal(0, 1, size=200) # random scorer
|
|
549
551
|
>>> s_b = y + rng.normal(0, 0.3, size=200) # signal scorer
|
|
550
|
-
>>> diff = paired_bootstrap_diff(y, s_a, s_b, pr_auc, n_resamples=200,
|
|
552
|
+
>>> diff = paired_bootstrap_diff(y, s_a, s_b, pr_auc, n_resamples=200, rng=42)
|
|
551
553
|
>>> diff.delta > 0 # B beats A
|
|
552
554
|
True
|
|
553
555
|
|
|
@@ -581,7 +583,7 @@ def paired_bootstrap_diff(
|
|
|
581
583
|
raise ValueError(f"n={n} too small for paired bootstrap; need ≥ 10")
|
|
582
584
|
|
|
583
585
|
delta_point = float(metric(y_true_arr, b)) - float(metric(y_true_arr, a))
|
|
584
|
-
seed_seqs =
|
|
586
|
+
seed_seqs = spawn_seed_sequences(rng, n_resamples)
|
|
585
587
|
step = functools.partial(
|
|
586
588
|
_paired_bootstrap_diff_step,
|
|
587
589
|
y_true_arr=y_true_arr,
|
|
@@ -654,7 +656,7 @@ def paired_bootstrap_ece_diff(
|
|
|
654
656
|
ece_fn: Callable[[np.ndarray, np.ndarray, int], float],
|
|
655
657
|
n_resamples: int = DEFAULT_N_RESAMPLES,
|
|
656
658
|
confidence: float = DEFAULT_CONFIDENCE,
|
|
657
|
-
|
|
659
|
+
rng: RNGLike | SeedLike | None = DEFAULT_SEED,
|
|
658
660
|
n_bins: int = 10,
|
|
659
661
|
n_jobs: int = 1,
|
|
660
662
|
) -> PairedBootstrapCI:
|
|
@@ -677,7 +679,7 @@ def paired_bootstrap_ece_diff(
|
|
|
677
679
|
does not depend on calibration. Typical use:
|
|
678
680
|
``from eval_toolkit.metrics import expected_calibration_error``,
|
|
679
681
|
then pass ``ece_fn=expected_calibration_error``.
|
|
680
|
-
n_resamples, confidence,
|
|
682
|
+
n_resamples, confidence, rng : standard bootstrap params (``rng`` per SPEC 7).
|
|
681
683
|
n_bins : int, optional
|
|
682
684
|
Number of ECE bins (passed through to ``ece_fn``).
|
|
683
685
|
n_jobs : int, optional
|
|
@@ -715,7 +717,7 @@ def paired_bootstrap_ece_diff(
|
|
|
715
717
|
raise ValueError(f"n={n} too small for paired bootstrap; need >= 10")
|
|
716
718
|
|
|
717
719
|
delta_point = float(ece_fn(y_true_arr, b, n_bins)) - float(ece_fn(y_true_arr, a, n_bins))
|
|
718
|
-
seed_seqs =
|
|
720
|
+
seed_seqs = spawn_seed_sequences(rng, n_resamples)
|
|
719
721
|
step = functools.partial(
|
|
720
722
|
_paired_bootstrap_ece_diff_step,
|
|
721
723
|
y_true_arr=y_true_arr,
|
|
@@ -798,7 +800,7 @@ def paired_bootstrap_op_point_diff(
|
|
|
798
800
|
*,
|
|
799
801
|
n_resamples: int = DEFAULT_N_RESAMPLES,
|
|
800
802
|
confidence: float = DEFAULT_CONFIDENCE,
|
|
801
|
-
|
|
803
|
+
rng: RNGLike | SeedLike | None = DEFAULT_SEED,
|
|
802
804
|
n_jobs: int = 1,
|
|
803
805
|
) -> PairedBootstrapCI:
|
|
804
806
|
r"""Two-level paired bootstrap for operating-point lifts.
|
|
@@ -826,7 +828,7 @@ def paired_bootstrap_op_point_diff(
|
|
|
826
828
|
``lambda y, s: MaxF1Selector().select(y, s).threshold``).
|
|
827
829
|
metric_fn : callable ``(y_true, y_score, threshold) -> float``
|
|
828
830
|
Operating-point metric (e.g., F1, precision) at the given threshold.
|
|
829
|
-
n_resamples, confidence,
|
|
831
|
+
n_resamples, confidence, rng : standard bootstrap params (``rng`` per SPEC 7).
|
|
830
832
|
n_jobs : int, optional
|
|
831
833
|
Parallel workers (default 1 — sequential). See
|
|
832
834
|
:ref:`methodology/parallelism`. Both ``threshold_fn`` and
|
|
@@ -913,7 +915,7 @@ def paired_bootstrap_op_point_diff(
|
|
|
913
915
|
metric_fn(test_y_arr, test_a, thr_a_full)
|
|
914
916
|
)
|
|
915
917
|
|
|
916
|
-
seed_seqs =
|
|
918
|
+
seed_seqs = spawn_seed_sequences(rng, n_resamples)
|
|
917
919
|
step = functools.partial(
|
|
918
920
|
_paired_bootstrap_op_point_diff_step,
|
|
919
921
|
val_y_arr=val_y_arr,
|
|
@@ -1132,7 +1134,7 @@ def paired_mde(
|
|
|
1132
1134
|
alpha: float = 0.05,
|
|
1133
1135
|
power: float = 0.80,
|
|
1134
1136
|
n_resamples: int = DEFAULT_N_RESAMPLES,
|
|
1135
|
-
|
|
1137
|
+
rng: RNGLike | SeedLike | None = DEFAULT_SEED,
|
|
1136
1138
|
n_jobs: int = 1,
|
|
1137
1139
|
) -> MDEEstimate:
|
|
1138
1140
|
r"""Minimum detectable paired Δ at (α, power).
|
|
@@ -1174,7 +1176,7 @@ def paired_mde(
|
|
|
1174
1176
|
metric,
|
|
1175
1177
|
n_resamples=n_resamples,
|
|
1176
1178
|
confidence=0.95,
|
|
1177
|
-
|
|
1179
|
+
rng=rng,
|
|
1178
1180
|
n_jobs=n_jobs,
|
|
1179
1181
|
)
|
|
1180
1182
|
est = mde_from_ci(paired, alpha=alpha, power=power)
|
|
@@ -1306,7 +1308,7 @@ def block_bootstrap_on_folds(
|
|
|
1306
1308
|
*,
|
|
1307
1309
|
n_resamples: int = DEFAULT_N_RESAMPLES,
|
|
1308
1310
|
confidence: float = DEFAULT_CONFIDENCE,
|
|
1309
|
-
|
|
1311
|
+
rng: RNGLike | SeedLike | None = DEFAULT_SEED,
|
|
1310
1312
|
) -> BootstrapCI:
|
|
1311
1313
|
r"""Block bootstrap on folds: resample K folds with replacement; percentile CI on mean.
|
|
1312
1314
|
|
|
@@ -1341,8 +1343,9 @@ def block_bootstrap_on_folds(
|
|
|
1341
1343
|
the cross-fold sensitivity-check use case (runs in O(seconds)).
|
|
1342
1344
|
confidence : float, optional
|
|
1343
1345
|
Two-sided confidence level (default 0.95).
|
|
1344
|
-
|
|
1345
|
-
RNG
|
|
1346
|
+
rng : RNGLike | SeedLike | None, optional
|
|
1347
|
+
RNG argument per `Scientific Python SPEC 7 <https://scientific-python.org/specs/spec-0007/>`_.
|
|
1348
|
+
Int seed (default ``DEFAULT_SEED=42``), ``Generator``, or ``None`` (entropy).
|
|
1346
1349
|
|
|
1347
1350
|
Returns
|
|
1348
1351
|
-------
|
|
@@ -1360,7 +1363,7 @@ def block_bootstrap_on_folds(
|
|
|
1360
1363
|
--------
|
|
1361
1364
|
>>> import numpy as np
|
|
1362
1365
|
>>> folds = np.array([0.83, 0.81, 0.85, 0.79, 0.84])
|
|
1363
|
-
>>> ci = block_bootstrap_on_folds(folds, n_resamples=2000,
|
|
1366
|
+
>>> ci = block_bootstrap_on_folds(folds, n_resamples=2000, rng=42)
|
|
1364
1367
|
>>> ci.method
|
|
1365
1368
|
'block_bootstrap'
|
|
1366
1369
|
>>> bool(ci.ci_low <= ci.point_estimate <= ci.ci_high)
|
|
@@ -1389,7 +1392,7 @@ def block_bootstrap_on_folds(
|
|
|
1389
1392
|
if not 0.0 < confidence < 1.0:
|
|
1390
1393
|
raise ValueError(f"confidence must be in (0, 1); got {confidence}")
|
|
1391
1394
|
|
|
1392
|
-
rng = np.random.default_rng(
|
|
1395
|
+
rng = np.random.default_rng(rng)
|
|
1393
1396
|
# Vectorized: (n_resamples, K) index draws, gather, mean along axis 1.
|
|
1394
1397
|
idx = rng.integers(0, K, size=(n_resamples, K))
|
|
1395
1398
|
resample_means = arr[idx].mean(axis=1)
|
|
@@ -1412,7 +1415,7 @@ def cross_validate_metric(
|
|
|
1412
1415
|
metric: MetricFn,
|
|
1413
1416
|
k: int = 5,
|
|
1414
1417
|
stratified: bool = True,
|
|
1415
|
-
|
|
1418
|
+
rng: RNGLike | SeedLike | None = DEFAULT_SEED,
|
|
1416
1419
|
) -> np.ndarray:
|
|
1417
1420
|
r"""K-fold cross-validation of a metric on caller-supplied scores.
|
|
1418
1421
|
|
|
@@ -1444,8 +1447,8 @@ def cross_validate_metric(
|
|
|
1444
1447
|
If ``True`` (default), use ``StratifiedKFold`` so each fold
|
|
1445
1448
|
preserves the class balance. Recommended for binary
|
|
1446
1449
|
classification under class imbalance.
|
|
1447
|
-
|
|
1448
|
-
|
|
1450
|
+
rng : RNGLike | SeedLike | None, optional
|
|
1451
|
+
RNG per SPEC 7 — derived to int at the sklearn ``KFold/StratifiedKFold`` boundary.
|
|
1449
1452
|
|
|
1450
1453
|
Returns
|
|
1451
1454
|
-------
|
|
@@ -1467,7 +1470,7 @@ def cross_validate_metric(
|
|
|
1467
1470
|
>>> n = 200
|
|
1468
1471
|
>>> y = rng.binomial(1, 0.3, size=n).astype(int)
|
|
1469
1472
|
>>> s = np.clip(y * 0.6 + rng.normal(0, 0.3, n), 0, 1)
|
|
1470
|
-
>>> folds = cross_validate_metric(y, s, metric=pr_auc, k=5,
|
|
1473
|
+
>>> folds = cross_validate_metric(y, s, metric=pr_auc, k=5, rng=42)
|
|
1471
1474
|
>>> folds.shape
|
|
1472
1475
|
(5,)
|
|
1473
1476
|
>>> bool(np.all(0.0 <= folds[~np.isnan(folds)]))
|
|
@@ -1491,12 +1494,18 @@ def cross_validate_metric(
|
|
|
1491
1494
|
if k > n:
|
|
1492
1495
|
raise ValueError(f"k={k} exceeds n={n}")
|
|
1493
1496
|
|
|
1497
|
+
# Derive an int seed for sklearn — sklearn KFold's random_state accepts
|
|
1498
|
+
# int | None | RandomState (not Generator) across versions <1.4; safer to
|
|
1499
|
+
# derive at the boundary than pin a higher sklearn minimum.
|
|
1500
|
+
rng = np.random.default_rng(rng)
|
|
1501
|
+
sklearn_seed = int(rng.integers(0, 2**31 - 1))
|
|
1502
|
+
|
|
1494
1503
|
splitter: KFold | StratifiedKFold
|
|
1495
1504
|
if stratified:
|
|
1496
|
-
splitter = StratifiedKFold(n_splits=k, shuffle=True, random_state=
|
|
1505
|
+
splitter = StratifiedKFold(n_splits=k, shuffle=True, random_state=sklearn_seed)
|
|
1497
1506
|
fold_iter = splitter.split(np.zeros(n), y_arr)
|
|
1498
1507
|
else:
|
|
1499
|
-
splitter = KFold(n_splits=k, shuffle=True, random_state=
|
|
1508
|
+
splitter = KFold(n_splits=k, shuffle=True, random_state=sklearn_seed)
|
|
1500
1509
|
fold_iter = splitter.split(np.zeros(n))
|
|
1501
1510
|
|
|
1502
1511
|
fold_metrics = np.full(k, np.nan, dtype=np.float64)
|