eval-toolkit 0.48.0__tar.gz → 0.50.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/CHANGELOG.md +163 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/PKG-INFO +6 -6
- {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/README.md +5 -5
- {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/STYLE.md +103 -4
- {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/pyproject.toml +7 -8
- {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/src/eval_toolkit/__init__.py +8 -8
- eval_toolkit-0.50.0/src/eval_toolkit/_rng.py +65 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/src/eval_toolkit/_version.py +1 -1
- {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/src/eval_toolkit/adversarial.py +18 -18
- {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/src/eval_toolkit/analysis.py +5 -4
- {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/src/eval_toolkit/bootstrap.py +42 -33
- {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/src/eval_toolkit/harness.py +31 -24
- {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/src/eval_toolkit/leakage.py +5 -17
- {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/src/eval_toolkit/manifest.py +10 -10
- {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/src/eval_toolkit/metric_specs.py +1 -1
- {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/src/eval_toolkit/metrics.py +7 -5
- eval_toolkit-0.48.0/src/eval_toolkit/_scorecard.py → eval_toolkit-0.50.0/src/eval_toolkit/scorecards.py +19 -15
- {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/src/eval_toolkit/stacking.py +16 -4
- {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/src/eval_toolkit/thresholds.py +5 -4
- {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/golden/bootstrap_ci/cases.json +6 -6
- {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/golden/public_api/snapshot.json +26 -26
- {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_adversarial.py +17 -17
- {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_analysis.py +5 -5
- {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_block_bootstrap_on_folds.py +7 -7
- {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_bootstrap_calibration_mc.py +4 -4
- {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_bootstrap_edge_cases.py +3 -3
- {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_bootstrap_golden.py +18 -18
- {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_bootstrap_njobs.py +12 -12
- {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_bootstrap_props.py +11 -11
- {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_bootstrap_research_grounded.py +3 -3
- {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_bootstrap_unit.py +18 -18
- {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_calibration_bootstrap_chain.py +2 -2
- {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_coverage_bootstrap.py +6 -6
- {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_coverage_metrics.py +1 -1
- {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_harness_fault_injection.py +2 -2
- {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_harness_internals.py +3 -3
- {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_harness_metric_options.py +2 -2
- {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_harness_parallelism.py +10 -10
- {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_harness_smoke.py +2 -2
- {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_lazy_extras_messages.py +2 -2
- {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_logging.py +1 -1
- {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_manifest.py +43 -43
- {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_manifest_contamination_round_trip.py +6 -6
- {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_manifest_props.py +11 -11
- {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_manifest_validation.py +4 -4
- {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_pipeline_e2e.py +5 -5
- {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_preprocessing.py +2 -2
- {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_reference_equivalence.py +2 -2
- {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_reproducibility_integration.py +10 -10
- {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_schemas.py +6 -6
- {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_scorecard.py +16 -16
- {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_stacking.py +17 -19
- {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_thresholds.py +2 -2
- {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_thresholds_coverage.py +1 -1
- {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_v09_contracts.py +5 -5
- {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/.gitignore +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/LICENSE +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/docs/archive/README.md +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/docs/research/README.md +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/docs/research/datasets/README.md +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/docs/research/papers/data-integrity/README.md +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/docs/research/papers/eval-ecosystem/README.md +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/docs/research/papers/inference/README.md +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/docs/research/papers/prompt-injection/README.md +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/docs/source/adr/README.md +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/docs/source/methodology/README.md +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/src/eval_toolkit/__main__.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/src/eval_toolkit/_deprecated.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/src/eval_toolkit/_parallel.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/src/eval_toolkit/_sweep.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/src/eval_toolkit/artifacts.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/src/eval_toolkit/calibration.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/src/eval_toolkit/claims.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/src/eval_toolkit/config.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/src/eval_toolkit/docs.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/src/eval_toolkit/embeddings.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/src/eval_toolkit/evidence.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/src/eval_toolkit/loaders.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/src/eval_toolkit/losses.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/src/eval_toolkit/operating_points.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/src/eval_toolkit/paths.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/src/eval_toolkit/plotting.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/src/eval_toolkit/preprocessing.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/src/eval_toolkit/probes.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/src/eval_toolkit/protocols.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/src/eval_toolkit/provenance.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/src/eval_toolkit/py.typed +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/src/eval_toolkit/schemas/manifest.v1.json +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/src/eval_toolkit/schemas/manifest.v2.json +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/src/eval_toolkit/schemas/manifest.v3.json +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/src/eval_toolkit/schemas/ood_manifest.v1.json +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/src/eval_toolkit/schemas/results.v1.json +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/src/eval_toolkit/schemas/results_full.v1.json +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/src/eval_toolkit/seeds.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/src/eval_toolkit/splits.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/src/eval_toolkit/text_dedup.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/baseline/test_plotting_visual/plot_bootstrap_distribution.png +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/baseline/test_plotting_visual/plot_confusion_matrix_grid.png +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/baseline/test_plotting_visual/plot_lift_ci.png +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/baseline/test_plotting_visual/plot_metric_bars.png +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/baseline/test_plotting_visual/plot_pareto_frontier.png +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/baseline/test_plotting_visual/plot_pr_curve.png +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/baseline/test_plotting_visual/plot_reliability_diagram.png +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/baseline/test_plotting_visual/plot_roc_curve.png +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/baseline/test_plotting_visual/plot_score_histograms.png +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/baseline/test_plotting_visual/plot_slice_metric_heatmap.png +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/benchmarks/__init__.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/benchmarks/test_kernel_benchmarks.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/conftest.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/golden/data/dedup_holdout.jsonl +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/golden/data/dedup_holdout_expected.json +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/golden/data/dedup_holdout_provenance.md +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/golden/docs/expected.md +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/golden/docs/input.md +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/golden/docs/metrics.json +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/golden/test_dedup_holdout_calibration.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/strategies.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_artifacts.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_calibration_binary_adapters.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_calibration_determinism.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_calibration_optimization_failures.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_calibration_props.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_calibration_research_grounded.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_calibration_unit.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_claims.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_claims_coverage.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_claims_props.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_cli.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_config.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_coverage_calibration.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_coverage_harness.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_coverage_plotting.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_croissant_e2e.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_dedup_split_leakage_chain.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_deprecated_scalars_shim.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_deprecations.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_docs_golden.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_docs_props.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_embeddings.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_evidence_validators.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_harness_edge_cases.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_harness_folded.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_import_boundaries.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_is_metric_defined_for_slice.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_leakage.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_leakage_error_paths.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_leakage_props.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_loaders.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_loaders_coverage.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_loaders_props.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_losses.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_metrics_props.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_metrics_stratified_subsets.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_metrics_unit.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_misc_coverage.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_numeric_edge_cases.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_ood_loader.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_operating_points.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_operating_points_props.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_parallel.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_paths.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_plotting_edge.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_plotting_smoke.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_plotting_visual.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_probes.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_protocol_conformance.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_provenance.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_public_api.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_recall_at_fpr.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_seeds.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_splits.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_splits_leakage_integration.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_splits_props.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_sweep.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_text_dedup.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_text_dedup_coverage.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_text_dedup_props.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_text_dedup_strategies.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_thresholds_constant_score.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_thresholds_props.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_thresholds_research_grounded.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_tokenization_leakage_check.py +0 -0
|
@@ -5,6 +5,169 @@ All notable changes to this project will be documented in this file.
|
|
|
5
5
|
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
|
|
6
6
|
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
|
7
7
|
|
|
8
|
+
## [0.50.0] — 2026-05-23 — SPEC 7 `rng` parameter adoption
|
|
9
|
+
|
|
10
|
+
The SPEC 7 follow-up to v0.49.0. The `_rng.py` scaffold shipped at
|
|
11
|
+
v0.49.0 (SeedLike + RNGLike type aliases per
|
|
12
|
+
[Scientific Python SPEC 7](https://scientific-python.org/specs/spec-0007/))
|
|
13
|
+
is now wired into every Tier-1 public function that consumes a NumPy RNG.
|
|
14
|
+
|
|
15
|
+
### BREAKING
|
|
16
|
+
|
|
17
|
+
**22 Tier-1 function signatures**: `seed: int = X` / `random_state: int | None` → `rng: RNGLike | SeedLike | None = X`. Pre-v1.0 SemVer-minor BREAKING (v0.34.0 precedent). Defaults preserved (still deterministic-by-default).
|
|
18
|
+
|
|
19
|
+
Affected functions:
|
|
20
|
+
|
|
21
|
+
- `bootstrap.py` (7 public + 1 private): `bootstrap_ci`, `paired_bootstrap_diff`, `paired_bootstrap_ece_diff`, `paired_bootstrap_op_point_diff`, `paired_mde`, `block_bootstrap_on_folds`, `cross_validate_metric`, `_bootstrap_t_ci`.
|
|
22
|
+
- `metrics.py:1063`: `expected_calibration_error_debiased`.
|
|
23
|
+
- `thresholds.py`: `selected_operating_point` + `_bootstrap_threshold_metric_cis`.
|
|
24
|
+
- `analysis.py`: `bootstrap_metric_from_predictions`, `paired_diff_from_prediction_refs`.
|
|
25
|
+
- `harness.py` (6 sites): `evaluate`, `evaluate_scorer_on_slice`, `_bootstrap_auc_ci`, `_evaluate_scores`, `_compute_paired_diffs`, `_score_all_slices`.
|
|
26
|
+
- `scorecards.py`: `scorecard`, `_evaluate_spec`.
|
|
27
|
+
- `stacking.py`: `LogisticStacker.random_state` → `LogisticStacker.rng` class-field rename (sklearn pass-through derives int at the boundary).
|
|
28
|
+
|
|
29
|
+
**Body refactors**:
|
|
30
|
+
|
|
31
|
+
- 4 SeedSequence.spawn() sites converted from `np.random.SeedSequence(seed).spawn(n)` to `rng.bit_generator.seed_seq.spawn(n)` (Option A — preserves existing worker SeedSequence signatures).
|
|
32
|
+
- 2 sklearn-bridge sites in `cross_validate_metric` derive int from rng before passing to `StratifiedKFold`/`KFold(random_state=...)` (defensive across sklearn versions <1.4).
|
|
33
|
+
- `LogisticStacker.fit` derives sklearn int from `self.rng` at the boundary.
|
|
34
|
+
|
|
35
|
+
**Config schema** (Tier-2 additive): `evaluate()` config dict key `"seed"` → `"rng"`. Generator-typed input serializes as `repr(rng)`; int/None serialize as-is (backward-compatible for prior int-seed usage).
|
|
36
|
+
|
|
37
|
+
### Added
|
|
38
|
+
|
|
39
|
+
- **Docstrings**: NumPy-style parameter doc for every renamed function now references `rng : RNGLike | SeedLike | None` with explicit link to SPEC 7.
|
|
40
|
+
- **STYLE.md §3a** + **ADR 0004 D4**: `rng` row flipped from "target convention; adopted in v0.50.0" → "**canonical** convention (adopted v0.50.0)".
|
|
41
|
+
|
|
42
|
+
### Changed
|
|
43
|
+
|
|
44
|
+
- **Test sweep** (230+ test sites): `seed=X` → `rng=X` in test kwarg calls, EXCEPT in test files that test legitimate `seed`-as-int contexts (`test_adversarial.py` for Python `random.Random`, `test_seeds.py` for `set_global_seeds`, `test_splits*.py` for Splitter dataclass fields, `test_text_dedup*.py` for MinHashLSHStrategy class field).
|
|
45
|
+
- **CHANGELOG header**: this release.
|
|
46
|
+
|
|
47
|
+
### Exceptions to SPEC 7 (KEPT `seed:` — documented in STYLE.md §3a + ADR 0004 D4)
|
|
48
|
+
|
|
49
|
+
- `seeds.set_global_seeds(seed: int)` — global-state setter, not per-function RNG.
|
|
50
|
+
- `adversarial.py` dataclass fields + functional wrappers — use Python stdlib `random.Random(seed)`, not NumPy.
|
|
51
|
+
- `splits.py` Splitter dataclass class-fields (`HoldoutSplitter.seed`, `StratifiedKFoldSplitter.seed`, etc.) — configuration storage, not user-facing RNG parameter.
|
|
52
|
+
- `loaders.py:903` YAML config schema key — declarative; renaming would break consumer YAMLs.
|
|
53
|
+
|
|
54
|
+
### Migration
|
|
55
|
+
|
|
56
|
+
- Consumer (`prompt-injection-detection-submission`) lockstep: bump dep pin `>=0.49.0` → `>=0.50.0`; rename `seed=` → `rng=` on eval-toolkit-bound call sites (estimated 5-8 sites).
|
|
57
|
+
- Bit-for-bit reproducibility preserved when migrating `seed=42` → `rng=42` (int seed is SeedLike; `np.random.default_rng(42)` is the canonical normalization).
|
|
58
|
+
|
|
59
|
+
### Notes
|
|
60
|
+
|
|
61
|
+
- Ships in parallel with Round 8 audit STOP-GATE (Decision Y.2); R8 briefing at commit `6f6839a`, awaiting Codex+Gemini reports.
|
|
62
|
+
- Memory pattern captured at v0.49.0: pre-flight grep MUST cover `README.md`, `.doctest-modules`, and any config files (per `feedback_sybil_runs_readme.md`). Applied to v0.50.0 pre-flight.
|
|
63
|
+
|
|
64
|
+
## [0.49.0] — 2026-05-23 — Global naming-standards sweep + final cleanup before v1.0
|
|
65
|
+
|
|
66
|
+
Final pre-v1.0 minor consolidating the naming-convention standardization
|
|
67
|
+
that locks the v1.0 Tier-1 contract. Audit + industry-research pass
|
|
68
|
+
(PEP 8, scikit-learn, NumPy, Google Python Style Guide, Scientific
|
|
69
|
+
Python SPEC 7) found the repo already 95-99% consistent; this release
|
|
70
|
+
closes the small remaining gaps + documents the conventions as
|
|
71
|
+
[ADR 0004](docs/source/adr/0004-naming-conventions.md). The SPEC 7
|
|
72
|
+
``rng`` parameter convention is documented here and adopted in v0.50.0.
|
|
73
|
+
|
|
74
|
+
### BREAKING
|
|
75
|
+
|
|
76
|
+
Five Tier-1 renames for naming consistency (pre-v1.0; SemVer-minor per
|
|
77
|
+
the v0.34.0 BREAKING-minor precedent). Single-consumer lockstep bump in
|
|
78
|
+
``prompt-injection-detection-submission``; no deprecation aliases.
|
|
79
|
+
|
|
80
|
+
- **``build_manifest`` → ``make_manifest``** (manifest.py). Aligns
|
|
81
|
+
with ``make_minilm_embedder`` / ``make_palette`` / ``make_run_dir``
|
|
82
|
+
factory pattern. ``build_*`` was the only outlier.
|
|
83
|
+
- **``CaseRandomization`` → ``CaseInjection``** (adversarial.py).
|
|
84
|
+
Aligns with ``*Injection`` / ``*Substitution`` adversarial suffix
|
|
85
|
+
convention.
|
|
86
|
+
- **``TokenSplitting`` → ``TokenSplittingInjection``** (adversarial.py).
|
|
87
|
+
Same rationale.
|
|
88
|
+
- **``UnicodeNormalization`` → ``UnicodeNormalizationInjection``**
|
|
89
|
+
(adversarial.py). Same rationale.
|
|
90
|
+
- **``eval_toolkit._scorecard`` → ``eval_toolkit.scorecards``**
|
|
91
|
+
(private → public module promotion). The 4 top-level symbols
|
|
92
|
+
(``scorecard``, ``Scorecard``, ``MetricSpec``, ``MetricResult``)
|
|
93
|
+
remain top-level Tier-1; the new public submodule path
|
|
94
|
+
``from eval_toolkit.scorecards import Scorecard`` is now stable.
|
|
95
|
+
``_scorecard.py`` is gone — old import paths raise
|
|
96
|
+
``ModuleNotFoundError``. Per the asymmetric-promotion principle in
|
|
97
|
+
[ADR 0001](docs/source/adr/0001-flat-module-layout.md): promote
|
|
98
|
+
collection-of-types modules, keep single-function modules underscore
|
|
99
|
+
(``_sweep.py`` stays private).
|
|
100
|
+
|
|
101
|
+
### Added
|
|
102
|
+
|
|
103
|
+
- **[ADR 0004](docs/source/adr/0004-naming-conventions.md)** — Naming
|
|
104
|
+
conventions decision record with industry citations. Covers module
|
|
105
|
+
naming (singular vs plural), class suffixes by domain, function
|
|
106
|
+
verb-prefix conventions, canonical parameter list, fitted-attribute
|
|
107
|
+
trailing underscore (sklearn convention), TypeVar leading underscore
|
|
108
|
+
(Google convention), and the SPEC 7 ``rng`` parameter convention
|
|
109
|
+
(adopted in v0.50.0).
|
|
110
|
+
- **STYLE.md** extended with §3a-d (parameter naming, class suffixes
|
|
111
|
+
by domain, module naming, asymmetric promotion), §4a-b
|
|
112
|
+
(fitted-attribute trailing underscore + TypeVar), §12 (75-col
|
|
113
|
+
docstring prose rule), §14 (test naming convention).
|
|
114
|
+
- **CONTRIBUTING.md** cross-link to ADR 0004 + STYLE.md.
|
|
115
|
+
- **[docs/source/api/strict_tier2_protocols.md](docs/source/api/strict_tier2_protocols.md)** —
|
|
116
|
+
new docs page enumerating the 9 strict Tier-2 Protocols + 1 opt-in
|
|
117
|
+
per [ADR 0003 §1](docs/source/adr/0003-stability-contract-and-gate3-methodology.md),
|
|
118
|
+
with canonical top-level import paths. Resolves #69's discoverability
|
|
119
|
+
concern without breaking the lightweight design intent of
|
|
120
|
+
``eval_toolkit.protocols`` (per ``protocols.py:1-5``).
|
|
121
|
+
- **``src/eval_toolkit/_rng.py``** — private module with SPEC 7 type
|
|
122
|
+
aliases (``SeedLike``, ``RNGLike``). Not yet referenced; scaffold for
|
|
123
|
+
the v0.50.0 SPEC 7 adoption.
|
|
124
|
+
- **[ADR 0001](docs/source/adr/0001-flat-module-layout.md)** amendment
|
|
125
|
+
— added the asymmetric-promotion sub-rule (collection-of-types MAY
|
|
126
|
+
promote, single-function SHOULD stay underscore).
|
|
127
|
+
|
|
128
|
+
### Changed
|
|
129
|
+
|
|
130
|
+
- **Duplicate-type consolidation** (single source of truth):
|
|
131
|
+
- ``Versioned`` Protocol — canonical at ``protocols.py:64``; the
|
|
132
|
+
duplicate at ``leakage.py:82`` removed. Removed
|
|
133
|
+
``"Versioned"`` from ``leakage.__all__``; previously-unused
|
|
134
|
+
``from eval_toolkit.leakage import Versioned`` now raises
|
|
135
|
+
``ImportError``. Use ``from eval_toolkit.protocols import Versioned``
|
|
136
|
+
or top-level ``from eval_toolkit import Versioned``.
|
|
137
|
+
- ``MetricStatus`` ``Literal`` — canonical at ``artifacts.py:30``; the
|
|
138
|
+
duplicate at ``scorecards.py:78`` removed; ``scorecards`` now
|
|
139
|
+
imports from ``artifacts``.
|
|
140
|
+
- **[validation] optional extra** reclassified from "active deprecation
|
|
141
|
+
with removal target v0.33.0" → "permanent no-op kept for backward
|
|
142
|
+
compatibility." Hard removal would break consumer pip pins of the
|
|
143
|
+
form ``eval-toolkit[validation]`` for zero functional benefit
|
|
144
|
+
(R3 in DEPRECATION.md).
|
|
145
|
+
- **Sphinx cross-references** updated from
|
|
146
|
+
``eval_toolkit.leakage.Versioned`` → ``eval_toolkit.protocols.Versioned``
|
|
147
|
+
in ``manifest.py`` docstrings.
|
|
148
|
+
|
|
149
|
+
### Deferred to v0.50.0
|
|
150
|
+
|
|
151
|
+
- **SPEC 7 ``rng`` parameter adoption** across ~30 NumPy-RNG functions.
|
|
152
|
+
Scope deferred from v0.49.0 after the planning audit revealed the
|
|
153
|
+
full blast radius (~30 signature sites + 247 test kwarg sites +
|
|
154
|
+
7 internal helpers + SeedSequence/Generator/sklearn-bridge
|
|
155
|
+
conversions). Splitting matches the "one cleanup per minor" pattern
|
|
156
|
+
per ``feedback_staggered_breaking_releases``. ``_rng.py`` ships in
|
|
157
|
+
v0.49.0 as the scaffold; v0.50.0 wires it into every applicable
|
|
158
|
+
function.
|
|
159
|
+
|
|
160
|
+
### Notes
|
|
161
|
+
|
|
162
|
+
- Round 8 audit STOP-GATE per Decision Y.2 — briefing committed at
|
|
163
|
+
v0.48.0 (commit ``6f6839a``); v0.49.0 ships in parallel since the
|
|
164
|
+
audit-trail synthesis confirmed R8 audits the existing contract
|
|
165
|
+
(does not prescribe new changes). Any R8 finding folds into a v0.49.1
|
|
166
|
+
hotfix if needed.
|
|
167
|
+
- Issue #69 closed by the new strict-Tier-2-Protocols docs page; see
|
|
168
|
+
``docs/source/api/strict_tier2_protocols.md`` and the close
|
|
169
|
+
rationale on the issue itself.
|
|
170
|
+
|
|
8
171
|
## [0.48.0] — 2026-05-22 — Polish + audit-driven tightening before v1.0 (Round 7 follow-on + cross-API consistency + doc-execution gates)
|
|
9
172
|
|
|
10
173
|
Third + final BREAKING minor of the staggered v0.45 → v0.46 → v0.46.1 → v0.47
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: eval-toolkit
|
|
3
|
-
Version: 0.48.0
|
|
3
|
+
Version: 0.50.0
|
|
4
4
|
Summary: Reusable evaluation contracts for binary classification: metrics, bootstrap CIs, calibration, artifacts, and evidence gates.
|
|
5
5
|
Project-URL: Homepage, https://github.com/brandon-behring/eval-toolkit
|
|
6
6
|
Project-URL: Documentation, https://brandon-behring.github.io/eval-toolkit/
|
|
@@ -233,12 +233,12 @@ print(f"ECE (10 bins): {expected_calibration_error(y, s, n_bins=10):.3f}")
|
|
|
233
233
|
from eval_toolkit import bootstrap_ci, paired_bootstrap_diff
|
|
234
234
|
from eval_toolkit.metrics import pr_auc
|
|
235
235
|
|
|
236
|
-
ci = bootstrap_ci(y, s, pr_auc, n_resamples=1000, seed=42)
|
|
236
|
+
ci = bootstrap_ci(y, s, pr_auc, n_resamples=1000, rng=42)
|
|
237
237
|
print(f"PR-AUC: {ci.point_estimate:.3f} 95% CI: [{ci.ci_low:.3f}, {ci.ci_high:.3f}]")
|
|
238
238
|
|
|
239
239
|
# Paired bootstrap on the lift between two scorers (s_baseline must be in [0, 1] too).
|
|
240
240
|
s_baseline = np.clip(rng.normal(0.5, 0.3, size=200), 0, 1)
|
|
241
|
-
diff = paired_bootstrap_diff(y, s_baseline, s, pr_auc, n_resamples=1000, seed=42)
|
|
241
|
+
diff = paired_bootstrap_diff(y, s_baseline, s, pr_auc, n_resamples=1000, rng=42)
|
|
242
242
|
print(f"Δ PR-AUC: {diff.delta:.3f} overlaps zero: {diff.overlaps_zero}")
|
|
243
243
|
```
|
|
244
244
|
|
|
@@ -261,13 +261,13 @@ print(f"NLL: {result['nll_pre']:.3f} -> {result['nll_post']:.3f}")
|
|
|
261
261
|
```python
|
|
262
262
|
import tempfile
|
|
263
263
|
from pathlib import Path
|
|
264
|
-
from eval_toolkit import build_manifest, write_manifest
|
|
264
|
+
from eval_toolkit import make_manifest, write_manifest
|
|
265
265
|
|
|
266
266
|
with tempfile.TemporaryDirectory() as run_dir:
|
|
267
267
|
# data_files: {name: path} → eval_toolkit hashes the files for you;
|
|
268
268
|
# versioned: any object with a `version` attribute (e.g. a scorer or
|
|
269
269
|
# leakage check) is captured by name → version in the manifest.
|
|
270
|
-
manifest = build_manifest(
|
|
270
|
+
manifest = make_manifest(
|
|
271
271
|
run_id="quickstart-demo",
|
|
272
272
|
config={"threshold_criterion": "max_f1", "seed": 42},
|
|
273
273
|
seeds={"global": 42, "bootstrap": 42},
|
|
@@ -290,7 +290,7 @@ with tempfile.TemporaryDirectory() as run_dir:
|
|
|
290
290
|
| `eval_toolkit.leakage` | `LeakageCheck` Protocol + 7 reference impls (exact / near / encoding-obfuscated / cross-split / label-conflict / group / temporal); `Versioned` opt-in Protocol |
|
|
291
291
|
| `eval_toolkit.splits` | `Splitter` Protocol + 5 reference impls (holdout / stratified / group / source-disjoint / time-series) |
|
|
292
292
|
| `eval_toolkit.loaders` | `DatasetLoader` Protocol + 4 reference impls (DataFrame / SingleSlice / ParquetGlob / HF datasets) with Croissant-compatible `describe()` |
|
|
293
|
-
| `eval_toolkit.manifest` | `RunManifest` (NeurIPS-aligned) + source-role / guardrail metadata + `build_manifest` / `write_manifest` |
|
|
293
|
+
| `eval_toolkit.manifest` | `RunManifest` (NeurIPS-aligned) + source-role / guardrail metadata + `make_manifest` / `write_manifest` |
|
|
294
294
|
| `eval_toolkit.claims` | `EvidenceGate` class (frozen dataclass: name + callable check + severity), reference gate factories (`required_metric_gate`, `minimum_slice_size_gate`, `metric_threshold_gate`, etc.), `evaluate_claims()`, and `ClaimReport` for claim-mode vs exploratory-mode checks. See [`docs/extending.md`](docs/extending.md) for writing custom gates and [`docs/examples/claims_and_gates.md`](docs/examples/claims_and_gates.md) for a worked end-to-end example. |
|
|
295
295
|
| `eval_toolkit.text_dedup` | `SimilarityStrategy` Protocol + 5 strategies (TF-IDF / hash / embedding / Jaccard / MinHash-LSH); `near_dedup` / `cross_dedup` orchestrators |
|
|
296
296
|
| `eval_toolkit.plotting` | PR curves, reliability diagrams, confusion matrices, score histograms, lift CIs |
|
|
@@ -150,12 +150,12 @@ print(f"ECE (10 bins): {expected_calibration_error(y, s, n_bins=10):.3f}")
|
|
|
150
150
|
from eval_toolkit import bootstrap_ci, paired_bootstrap_diff
|
|
151
151
|
from eval_toolkit.metrics import pr_auc
|
|
152
152
|
|
|
153
|
-
ci = bootstrap_ci(y, s, pr_auc, n_resamples=1000,
|
|
153
|
+
ci = bootstrap_ci(y, s, pr_auc, n_resamples=1000, rng=42)
|
|
154
154
|
print(f"PR-AUC: {ci.point_estimate:.3f} 95% CI: [{ci.ci_low:.3f}, {ci.ci_high:.3f}]")
|
|
155
155
|
|
|
156
156
|
# Paired bootstrap on the lift between two scorers (s_baseline must be in [0, 1] too).
|
|
157
157
|
s_baseline = np.clip(rng.normal(0.5, 0.3, size=200), 0, 1)
|
|
158
|
-
diff = paired_bootstrap_diff(y, s_baseline, s, pr_auc, n_resamples=1000,
|
|
158
|
+
diff = paired_bootstrap_diff(y, s_baseline, s, pr_auc, n_resamples=1000, rng=42)
|
|
159
159
|
print(f"Δ PR-AUC: {diff.delta:.3f} overlaps zero: {diff.overlaps_zero}")
|
|
160
160
|
```
|
|
161
161
|
|
|
@@ -178,13 +178,13 @@ print(f"NLL: {result['nll_pre']:.3f} -> {result['nll_post']:.3f}")
|
|
|
178
178
|
```python
|
|
179
179
|
import tempfile
|
|
180
180
|
from pathlib import Path
|
|
181
|
-
from eval_toolkit import
|
|
181
|
+
from eval_toolkit import make_manifest, write_manifest
|
|
182
182
|
|
|
183
183
|
with tempfile.TemporaryDirectory() as run_dir:
|
|
184
184
|
# data_files: {name: path} → eval_toolkit hashes the files for you;
|
|
185
185
|
# versioned: any object with a `version` attribute (e.g. a scorer or
|
|
186
186
|
# leakage check) is captured by name → version in the manifest.
|
|
187
|
-
manifest =
|
|
187
|
+
manifest = make_manifest(
|
|
188
188
|
run_id="quickstart-demo",
|
|
189
189
|
config={"threshold_criterion": "max_f1", "seed": 42},
|
|
190
190
|
seeds={"global": 42, "bootstrap": 42},
|
|
@@ -207,7 +207,7 @@ with tempfile.TemporaryDirectory() as run_dir:
|
|
|
207
207
|
| `eval_toolkit.leakage` | `LeakageCheck` Protocol + 7 reference impls (exact / near / encoding-obfuscated / cross-split / label-conflict / group / temporal); `Versioned` opt-in Protocol |
|
|
208
208
|
| `eval_toolkit.splits` | `Splitter` Protocol + 5 reference impls (holdout / stratified / group / source-disjoint / time-series) |
|
|
209
209
|
| `eval_toolkit.loaders` | `DatasetLoader` Protocol + 4 reference impls (DataFrame / SingleSlice / ParquetGlob / HF datasets) with Croissant-compatible `describe()` |
|
|
210
|
-
| `eval_toolkit.manifest` | `RunManifest` (NeurIPS-aligned) + source-role / guardrail metadata + `
|
|
210
|
+
| `eval_toolkit.manifest` | `RunManifest` (NeurIPS-aligned) + source-role / guardrail metadata + `make_manifest` / `write_manifest` |
|
|
211
211
|
| `eval_toolkit.claims` | `EvidenceGate` class (frozen dataclass: name + callable check + severity), reference gate factories (`required_metric_gate`, `minimum_slice_size_gate`, `metric_threshold_gate`, etc.), `evaluate_claims()`, and `ClaimReport` for claim-mode vs exploratory-mode checks. See [`docs/extending.md`](docs/extending.md) for writing custom gates and [`docs/examples/claims_and_gates.md`](docs/examples/claims_and_gates.md) for a worked end-to-end example. |
|
|
212
212
|
| `eval_toolkit.text_dedup` | `SimilarityStrategy` Protocol + 5 strategies (TF-IDF / hash / embedding / Jaccard / MinHash-LSH); `near_dedup` / `cross_dedup` orchestrators |
|
|
213
213
|
| `eval_toolkit.plotting` | PR curves, reliability diagrams, confusion matrices, score histograms, lift CIs |
|
|
@@ -36,6 +36,11 @@ Run via `make lint` (= `ruff check + black --check + mypy`) and `make test`.
|
|
|
36
36
|
|
|
37
37
|
## 3. Naming
|
|
38
38
|
|
|
39
|
+
For the full decision record and industry citations, see
|
|
40
|
+
[ADR 0004 — Naming conventions](docs/source/adr/0004-naming-conventions.md).
|
|
41
|
+
This section is the day-to-day quick reference; the ADR is the
|
|
42
|
+
authoritative source.
|
|
43
|
+
|
|
39
44
|
- Module names: `snake_case`, lowercase package (`eval_toolkit`).
|
|
40
45
|
- Class names: `PascalCase`. Suffixes used in this repo:
|
|
41
46
|
- `*Config` — frozen dataclass for settings
|
|
@@ -55,6 +60,68 @@ Run via `make lint` (= `ruff check + black --check + mypy`) and `make test`.
|
|
|
55
60
|
- Mutation marking: not used. Mutating functions return `None` (Pythonic over
|
|
56
61
|
Julia's `_inplace` suffix).
|
|
57
62
|
|
|
63
|
+
### 3a. Parameter naming (canonical list, locked at v1.0)
|
|
64
|
+
|
|
65
|
+
These names mean these things, everywhere. Future functions MUST use
|
|
66
|
+
them; deviations need justification in the PR description.
|
|
67
|
+
|
|
68
|
+
| Parameter | Meaning |
|
|
69
|
+
|---|---|
|
|
70
|
+
| `y_true` | Ground-truth labels (binary, shape `(n,)`) |
|
|
71
|
+
| `y_score` | Continuous score / probability (shape `(n,)`) |
|
|
72
|
+
| `y_pred` | Discrete prediction (threshold-dependent) |
|
|
73
|
+
| `n_resamples` | Bootstrap iteration count |
|
|
74
|
+
| `confidence` | Two-sided confidence level (0.95 default) |
|
|
75
|
+
| `n_bins` | Binning count for calibration / ECE |
|
|
76
|
+
| `n_jobs` | Parallelism (joblib + sklearn convention) |
|
|
77
|
+
| `ax` | Matplotlib axis (matplotlib convention) |
|
|
78
|
+
| `metric` | Callable `(y_true, y_score) -> float` |
|
|
79
|
+
| `rng` | RNG argument per [SPEC 7](https://scientific-python.org/specs/spec-0007/) — **canonical** convention (adopted v0.50.0). Accepts `int`, `np.random.Generator`, `BitGenerator`, `SeedSequence`, or `None`. |
|
|
80
|
+
|
|
81
|
+
The v0.50.0 SPEC 7 adoption preserves two `seed: int` exceptions:
|
|
82
|
+
`set_global_seeds(seed: int)` (global-state setter, not per-function
|
|
83
|
+
RNG; SPEC 7 doesn't apply) and adversarial dataclass fields (use Python
|
|
84
|
+
`random.Random(seed)`, not a NumPy RNG, so SPEC 7's typing doesn't fit).
|
|
85
|
+
|
|
86
|
+
### 3b. Class suffixes by domain
|
|
87
|
+
|
|
88
|
+
Each suffix maps to a Protocol contract. Stay within the pattern:
|
|
89
|
+
|
|
90
|
+
| Suffix | Domain | Protocol |
|
|
91
|
+
|---|---|---|
|
|
92
|
+
| `*Selector` | Threshold selection | `ThresholdSelector` |
|
|
93
|
+
| `*Splitter` | Cross-validation splits | `Splitter` |
|
|
94
|
+
| `*Check` | Leakage detection | `LeakageCheck` |
|
|
95
|
+
| `*Loader` | Dataset loading | `DatasetLoader` |
|
|
96
|
+
| `*Reader` | Prediction artifact reading | `PredictionReader` |
|
|
97
|
+
| `*Variant` | Preprocessing variant | (functional API) |
|
|
98
|
+
| `*Strategy` | Dedup similarity backend | `SimilarityStrategy` |
|
|
99
|
+
| `*Injection` / `*Substitution` | Adversarial char-injection / -substitution | `TextTransform` |
|
|
100
|
+
|
|
101
|
+
### 3c. Module naming (singular vs plural)
|
|
102
|
+
|
|
103
|
+
- **Plural noun** for collection-of-types modules: `metrics`,
|
|
104
|
+
`loaders`, `protocols`, `losses`, `probes`, `splits`, `paths`,
|
|
105
|
+
`seeds`, `thresholds`, `artifacts`, `claims`, `embeddings`,
|
|
106
|
+
`scorecards`.
|
|
107
|
+
- **Singular noun** for domain-concept modules: `harness`,
|
|
108
|
+
`bootstrap`, `manifest`, `calibration`, `leakage`, `analysis`,
|
|
109
|
+
`provenance`, `evidence`, `stacking`, `text_dedup`.
|
|
110
|
+
- **Gerund** for process-domain modules: `preprocessing`.
|
|
111
|
+
|
|
112
|
+
### 3d. Asymmetric module promotion (private → public)
|
|
113
|
+
|
|
114
|
+
Collection-of-types private modules MAY be promoted to plural-public
|
|
115
|
+
when they hold ≥2 user-relevant types. Single-function private
|
|
116
|
+
modules SHOULD stay underscore. See
|
|
117
|
+
[ADR 0001](docs/source/adr/0001-flat-module-layout.md) for the trigger
|
|
118
|
+
analysis.
|
|
119
|
+
|
|
120
|
+
Examples:
|
|
121
|
+
|
|
122
|
+
- `_scorecard.py` (4 public exports) → `scorecards.py` at v0.49.0. ✓ promote.
|
|
123
|
+
- `_sweep.py` (1 public function `sweep`) → stays `_sweep.py`. ✓ keep private.
|
|
124
|
+
|
|
58
125
|
## 4. Type hints
|
|
59
126
|
|
|
60
127
|
- Every public function has fully typed parameters and return.
|
|
@@ -79,10 +146,13 @@ Run via `make lint` (= `ruff check + black --check + mypy`) and `make test`.
|
|
|
79
146
|
for 4 reference impls.
|
|
80
147
|
- `SimilarityStrategy` (`text_dedup.py`) — pluggable similarity backend for
|
|
81
148
|
`near_dedup` / `cross_dedup` / `NearDuplicateCheck` / `CrossSplitLeakageCheck`.
|
|
82
|
-
- `Versioned` (`
|
|
83
|
-
implementation may expose `version: str`.
|
|
84
|
-
auto-collects them. Mirrors the
|
|
85
|
-
pattern. See
|
|
149
|
+
- `Versioned` (`protocols.py`) — opt-in single-attribute Protocol; any
|
|
150
|
+
Tier-2 implementation may expose `version: str`.
|
|
151
|
+
`RunManifest.versioned_objects` auto-collects them. Mirrors the
|
|
152
|
+
`lm-evaluation-harness` task `VERSION` pattern. See
|
|
153
|
+
`docs/methodology/versioning.md`. (Single source of truth at
|
|
154
|
+
`protocols.py:64` since v0.49.0; the duplicate previously in
|
|
155
|
+
`leakage.py:82` was removed.)
|
|
86
156
|
- All seams are `@runtime_checkable` so callers can `isinstance(obj, Protocol)`.
|
|
87
157
|
- Reference impls are `@dataclass(frozen=True, slots=True)` with config in the
|
|
88
158
|
constructor (`TargetRecallSelector(recall=0.90)`) and the Protocol method as
|
|
@@ -90,6 +160,25 @@ Run via `make lint` (= `ruff check + black --check + mypy`) and `make test`.
|
|
|
90
160
|
- `NamedTuple` for stable public records that benefit from positional access;
|
|
91
161
|
frozen dataclasses with `slots=True` otherwise.
|
|
92
162
|
|
|
163
|
+
### 4a. Fitted-attribute trailing underscore (sklearn convention)
|
|
164
|
+
|
|
165
|
+
Estimator-style classes (`fit`/`predict` pattern) that store
|
|
166
|
+
**learned-from-data attributes** use trailing underscore per scikit-learn
|
|
167
|
+
convention: `coef_`, `classes_`, `n_features_in_`, `feature_importances_`.
|
|
168
|
+
These attributes MUST NOT be set in `__init__` — set them only in `fit()`.
|
|
169
|
+
|
|
170
|
+
Frozen reference-impl dataclasses (`@dataclass(frozen=True, slots=True)`)
|
|
171
|
+
are **exempt** — they hold config, not fitted state.
|
|
172
|
+
|
|
173
|
+
Current canonical example: `stacking.LogisticStacker`.
|
|
174
|
+
|
|
175
|
+
### 4b. TypeVar naming
|
|
176
|
+
|
|
177
|
+
Internal (private) `TypeVar`s use a leading underscore per Google Python
|
|
178
|
+
Style Guide §3.19.10: `_T = TypeVar("_T")`. Public, constrained `TypeVar`s
|
|
179
|
+
without the underscore are allowed only when explicitly part of an
|
|
180
|
+
exported generic API.
|
|
181
|
+
|
|
93
182
|
## 5. Dataclasses
|
|
94
183
|
|
|
95
184
|
1. **`slots=True` always** on repo-owned dataclasses. Catches typos at
|
|
@@ -220,6 +309,10 @@ def fit_temperature(val_logits, val_labels, bounds=(0.05, 20.0)):
|
|
|
220
309
|
- **References** cites arXiv IDs / DOIs / journal cites.
|
|
221
310
|
- For modules where doctests would be contrived (`plotting`, `harness`,
|
|
222
311
|
`provenance`), Examples are optional.
|
|
312
|
+
- **Docstring prose wraps at 75 cols** (numpydoc convention) so that
|
|
313
|
+
`help()` is readable in a terminal. Doctest code blocks inside the
|
|
314
|
+
docstring follow the 100-col Black rule (code stays comfortable in an
|
|
315
|
+
editor even though prose around it is narrower).
|
|
223
316
|
|
|
224
317
|
## 13. Comments
|
|
225
318
|
|
|
@@ -228,6 +321,12 @@ restate what the code says.
|
|
|
228
321
|
|
|
229
322
|
## 14. Tests
|
|
230
323
|
|
|
324
|
+
- **File naming**: `tests/test_<module>.py` mirrors
|
|
325
|
+
`src/eval_toolkit/<module>.py`. Auxiliary tests per module use
|
|
326
|
+
suffixes (`test_<module>_props.py`, `test_<module>_validation.py`,
|
|
327
|
+
`test_<module>_golden.py`).
|
|
328
|
+
- **Function naming**: `test_<thing_under_test>_<scenario>`. No
|
|
329
|
+
class-based test grouping unless fixtures truly demand it (rare).
|
|
231
330
|
- **Markers**: `unit`, `property`, `smoke`, `golden`.
|
|
232
331
|
- **Sklearn-reference + analytical** as the unit-test oracle where available.
|
|
233
332
|
- **Hypothesis** required for math/stat invariants. Strategies use
|
|
@@ -74,15 +74,14 @@ probes = ["torch>=2.0", "transformers>=4.40"]
|
|
|
74
74
|
# (granular extras — losses callers should not have to install the larger
|
|
75
75
|
# transformers stack). Shares the torch version pin with [probes].
|
|
76
76
|
losses = ["torch>=2.0"]
|
|
77
|
-
#
|
|
77
|
+
# NO-OP extra kept for backward compatibility (R3 at v0.49.0).
|
|
78
78
|
#
|
|
79
|
-
#
|
|
80
|
-
#
|
|
81
|
-
# v0.
|
|
82
|
-
#
|
|
83
|
-
#
|
|
84
|
-
#
|
|
85
|
-
# in CHANGELOG ### Deprecated + docs/DEPRECATION.md.
|
|
79
|
+
# jsonschema>=4.21 moved to base deps at v0.16.0; this extra has been a
|
|
80
|
+
# no-op ever since. Originally announced as deprecated in v0.30.1 with
|
|
81
|
+
# target removal at v0.33.0, but reclassified at v0.49.0 (R3 in
|
|
82
|
+
# docs/DEPRECATION.md) as a permanent no-op — hard removal would break
|
|
83
|
+
# consumer pip pins of the form `eval-toolkit[validation]` for zero
|
|
84
|
+
# functional benefit. Retained indefinitely.
|
|
86
85
|
validation = []
|
|
87
86
|
# v0.31.0 docs site: Sphinx + pydata-sphinx-theme (replaces v0.28.0's
|
|
88
87
|
# mkdocs-material). Migration drivers — pain points Q1 in the v0.31.0
|
|
@@ -38,15 +38,15 @@ _EXPORTS: dict[str, str] = {
|
|
|
38
38
|
"ALL_TECHNIQUES": "eval_toolkit.adversarial",
|
|
39
39
|
"BidiRTLInjection": "eval_toolkit.adversarial",
|
|
40
40
|
"CORE_TECHNIQUES": "eval_toolkit.adversarial",
|
|
41
|
-
"
|
|
41
|
+
"CaseInjection": "eval_toolkit.adversarial",
|
|
42
42
|
"DiacriticInjection": "eval_toolkit.adversarial",
|
|
43
43
|
"HomoglyphSubstitution": "eval_toolkit.adversarial",
|
|
44
44
|
"InvisibleCharsInjection": "eval_toolkit.adversarial",
|
|
45
45
|
"PunctuationInjection": "eval_toolkit.adversarial",
|
|
46
46
|
"SynonymSubstitution": "eval_toolkit.adversarial",
|
|
47
47
|
"TagStrippingInjection": "eval_toolkit.adversarial",
|
|
48
|
-
"
|
|
49
|
-
"
|
|
48
|
+
"TokenSplittingInjection": "eval_toolkit.adversarial",
|
|
49
|
+
"UnicodeNormalizationInjection": "eval_toolkit.adversarial",
|
|
50
50
|
"WhitespaceInjection": "eval_toolkit.adversarial",
|
|
51
51
|
"ZeroWidthSpaceInjection": "eval_toolkit.adversarial",
|
|
52
52
|
# CharacterInjectionStrategy + character_injection SimpleNamespace
|
|
@@ -202,7 +202,7 @@ _EXPORTS: dict[str, str] = {
|
|
|
202
202
|
"MANIFEST_SCHEMA_VERSION": "eval_toolkit.manifest",
|
|
203
203
|
"RunManifest": "eval_toolkit.manifest",
|
|
204
204
|
"SourceRoleRecord": "eval_toolkit.manifest",
|
|
205
|
-
"
|
|
205
|
+
"make_manifest": "eval_toolkit.manifest",
|
|
206
206
|
"validate_source_roles": "eval_toolkit.manifest",
|
|
207
207
|
"write_manifest": "eval_toolkit.manifest",
|
|
208
208
|
# --- metrics ---
|
|
@@ -315,10 +315,10 @@ _EXPORTS: dict[str, str] = {
|
|
|
315
315
|
"wilson_interval": "eval_toolkit.thresholds",
|
|
316
316
|
"LogisticStacker": "eval_toolkit.stacking",
|
|
317
317
|
"MetaLearner": "eval_toolkit.stacking",
|
|
318
|
-
"MetricResult": "eval_toolkit.
|
|
319
|
-
"MetricSpec": "eval_toolkit.
|
|
320
|
-
"Scorecard": "eval_toolkit.
|
|
321
|
-
"scorecard": "eval_toolkit.
|
|
318
|
+
"MetricResult": "eval_toolkit.scorecards",
|
|
319
|
+
"MetricSpec": "eval_toolkit.scorecards",
|
|
320
|
+
"Scorecard": "eval_toolkit.scorecards",
|
|
321
|
+
"scorecard": "eval_toolkit.scorecards",
|
|
322
322
|
# --- sweep (top-level v0.47 unification — Decision K + Decision D) ---
|
|
323
323
|
"sweep": "eval_toolkit._sweep",
|
|
324
324
|
}
|
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
"""Private RNG-parameter type aliases per Scientific-Python SPEC 7.
|
|
2
|
+
|
|
3
|
+
This module centralizes the type aliases used to annotate user-facing RNG
|
|
4
|
+
parameters across the toolkit. Per `SPEC 7 — Seeding PRNG
|
|
5
|
+
<https://scientific-python.org/specs/spec-0007/>`_ (Endorsed), eval-toolkit
|
|
6
|
+
exposes a single canonical parameter name ``rng`` typed as
|
|
7
|
+
``RNGLike | SeedLike | None`` on every function that consumes a NumPy
|
|
8
|
+
``Generator``. Bodies normalize via ``np.random.default_rng(rng)``.
|
|
9
|
+
|
|
10
|
+
This module is private (underscore prefix) so the aliases stay an
|
|
11
|
+
implementation detail — public symbols use them only in their annotations.
|
|
12
|
+
If a Tier-2 consumer ever needs them exposed for their own callsite type
|
|
13
|
+
annotations, promote them via ``eval_toolkit.protocols`` per the
|
|
14
|
+
asymmetric-promotion principle in ADR 0001 + STYLE.md §3d.
|
|
15
|
+
|
|
16
|
+
Exceptions to the SPEC 7 convention — documented in STYLE.md §3a:
|
|
17
|
+
|
|
18
|
+
- ``seeds.set_global_seeds(seed: int)`` — global-state setter, not a
|
|
19
|
+
per-function RNG parameter; SPEC 7 is scoped to per-function RNG inputs.
|
|
20
|
+
- ``adversarial.*Injection`` / ``*Substitution`` / ``CaseInjection``
|
|
21
|
+
dataclass fields — they use Python's stdlib ``random.Random(seed)``,
|
|
22
|
+
not NumPy. SPEC 7's typing (``RNGLike = np.random.Generator | ...``) is
|
|
23
|
+
strictly NumPy-scoped.
|
|
24
|
+
"""
|
|
25
|
+
|
|
26
|
+
from __future__ import annotations
|
|
27
|
+
|
|
28
|
+
from collections.abc import Sequence
|
|
29
|
+
from typing import cast
|
|
30
|
+
|
|
31
|
+
import numpy as np
|
|
32
|
+
|
|
33
|
+
type SeedLike = int | np.integer | Sequence[int] | np.random.SeedSequence
|
|
34
|
+
"""Anything that can seed a NumPy bit generator.
|
|
35
|
+
|
|
36
|
+
Per SPEC 7, ``np.random.default_rng`` accepts any of these as a seed
|
|
37
|
+
without further conversion. ``Sequence[int]`` is the entropy-vector form
|
|
38
|
+
used by ``np.random.SeedSequence``.
|
|
39
|
+
"""
|
|
40
|
+
|
|
41
|
+
type RNGLike = np.random.Generator | np.random.BitGenerator
|
|
42
|
+
"""An already-instantiated NumPy bit generator or generator wrapper.
|
|
43
|
+
|
|
44
|
+
``np.random.default_rng(rng)`` is the identity function on
|
|
45
|
+
``Generator`` inputs and lifts ``BitGenerator`` inputs into a
|
|
46
|
+
``Generator`` — both forms compose cleanly.
|
|
47
|
+
"""
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def spawn_seed_sequences(rng: RNGLike | SeedLike | None, n: int) -> list[np.random.SeedSequence]:
|
|
51
|
+
"""Spawn ``n`` independent SeedSequences from any SPEC 7 ``rng`` input.
|
|
52
|
+
|
|
53
|
+
Normalizes the input to a ``Generator``, then extracts the underlying
|
|
54
|
+
``SeedSequence`` via the bit-generator and spawns ``n`` children.
|
|
55
|
+
The cast satisfies mypy strict: the ``seed_seq`` attribute on a
|
|
56
|
+
concrete BitGenerator is a ``SeedSequence`` instance, but the type
|
|
57
|
+
stub on ``BitGenerator.seed_seq`` returns the abstract
|
|
58
|
+
``ISeedSequence`` interface (which lacks ``spawn``).
|
|
59
|
+
|
|
60
|
+
Used by the bootstrap parallel workers (which take spawned
|
|
61
|
+
``SeedSequence`` objects to seed their internal ``default_rng()`` calls).
|
|
62
|
+
"""
|
|
63
|
+
gen = np.random.default_rng(rng)
|
|
64
|
+
seed_seq = cast(np.random.SeedSequence, gen.bit_generator.seed_seq)
|
|
65
|
+
return seed_seq.spawn(n)
|