eval-toolkit 0.48.0__tar.gz → 0.49.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/CHANGELOG.md +107 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/PKG-INFO +4 -4
- {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/README.md +3 -3
- {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/STYLE.md +103 -4
- {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/pyproject.toml +7 -8
- {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/src/eval_toolkit/__init__.py +8 -8
- eval_toolkit-0.49.0/src/eval_toolkit/_rng.py +46 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/src/eval_toolkit/_version.py +1 -1
- {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/src/eval_toolkit/adversarial.py +18 -18
- {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/src/eval_toolkit/leakage.py +5 -17
- {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/src/eval_toolkit/manifest.py +10 -10
- {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/src/eval_toolkit/metric_specs.py +1 -1
- eval_toolkit-0.48.0/src/eval_toolkit/_scorecard.py → eval_toolkit-0.49.0/src/eval_toolkit/scorecards.py +5 -4
- {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/golden/public_api/snapshot.json +13 -13
- {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_adversarial.py +17 -17
- {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_lazy_extras_messages.py +2 -2
- {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_manifest.py +43 -43
- {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_manifest_contamination_round_trip.py +6 -6
- {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_manifest_props.py +11 -11
- {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_manifest_validation.py +4 -4
- {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_preprocessing.py +2 -2
- {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_schemas.py +6 -6
- {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_v09_contracts.py +2 -2
- {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/.gitignore +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/LICENSE +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/docs/archive/README.md +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/docs/research/README.md +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/docs/research/datasets/README.md +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/docs/research/papers/data-integrity/README.md +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/docs/research/papers/eval-ecosystem/README.md +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/docs/research/papers/inference/README.md +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/docs/research/papers/prompt-injection/README.md +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/docs/source/adr/README.md +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/docs/source/methodology/README.md +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/src/eval_toolkit/__main__.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/src/eval_toolkit/_deprecated.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/src/eval_toolkit/_parallel.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/src/eval_toolkit/_sweep.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/src/eval_toolkit/analysis.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/src/eval_toolkit/artifacts.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/src/eval_toolkit/bootstrap.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/src/eval_toolkit/calibration.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/src/eval_toolkit/claims.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/src/eval_toolkit/config.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/src/eval_toolkit/docs.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/src/eval_toolkit/embeddings.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/src/eval_toolkit/evidence.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/src/eval_toolkit/harness.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/src/eval_toolkit/loaders.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/src/eval_toolkit/losses.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/src/eval_toolkit/metrics.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/src/eval_toolkit/operating_points.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/src/eval_toolkit/paths.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/src/eval_toolkit/plotting.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/src/eval_toolkit/preprocessing.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/src/eval_toolkit/probes.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/src/eval_toolkit/protocols.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/src/eval_toolkit/provenance.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/src/eval_toolkit/py.typed +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/src/eval_toolkit/schemas/manifest.v1.json +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/src/eval_toolkit/schemas/manifest.v2.json +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/src/eval_toolkit/schemas/manifest.v3.json +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/src/eval_toolkit/schemas/ood_manifest.v1.json +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/src/eval_toolkit/schemas/results.v1.json +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/src/eval_toolkit/schemas/results_full.v1.json +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/src/eval_toolkit/seeds.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/src/eval_toolkit/splits.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/src/eval_toolkit/stacking.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/src/eval_toolkit/text_dedup.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/src/eval_toolkit/thresholds.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/baseline/test_plotting_visual/plot_bootstrap_distribution.png +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/baseline/test_plotting_visual/plot_confusion_matrix_grid.png +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/baseline/test_plotting_visual/plot_lift_ci.png +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/baseline/test_plotting_visual/plot_metric_bars.png +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/baseline/test_plotting_visual/plot_pareto_frontier.png +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/baseline/test_plotting_visual/plot_pr_curve.png +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/baseline/test_plotting_visual/plot_reliability_diagram.png +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/baseline/test_plotting_visual/plot_roc_curve.png +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/baseline/test_plotting_visual/plot_score_histograms.png +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/baseline/test_plotting_visual/plot_slice_metric_heatmap.png +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/benchmarks/__init__.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/benchmarks/test_kernel_benchmarks.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/conftest.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/golden/bootstrap_ci/cases.json +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/golden/data/dedup_holdout.jsonl +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/golden/data/dedup_holdout_expected.json +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/golden/data/dedup_holdout_provenance.md +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/golden/docs/expected.md +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/golden/docs/input.md +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/golden/docs/metrics.json +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/golden/test_dedup_holdout_calibration.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/strategies.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_analysis.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_artifacts.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_block_bootstrap_on_folds.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_bootstrap_calibration_mc.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_bootstrap_edge_cases.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_bootstrap_golden.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_bootstrap_njobs.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_bootstrap_props.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_bootstrap_research_grounded.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_bootstrap_unit.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_calibration_binary_adapters.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_calibration_bootstrap_chain.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_calibration_determinism.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_calibration_optimization_failures.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_calibration_props.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_calibration_research_grounded.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_calibration_unit.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_claims.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_claims_coverage.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_claims_props.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_cli.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_config.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_coverage_bootstrap.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_coverage_calibration.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_coverage_harness.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_coverage_metrics.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_coverage_plotting.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_croissant_e2e.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_dedup_split_leakage_chain.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_deprecated_scalars_shim.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_deprecations.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_docs_golden.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_docs_props.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_embeddings.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_evidence_validators.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_harness_edge_cases.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_harness_fault_injection.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_harness_folded.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_harness_internals.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_harness_metric_options.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_harness_parallelism.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_harness_smoke.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_import_boundaries.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_is_metric_defined_for_slice.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_leakage.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_leakage_error_paths.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_leakage_props.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_loaders.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_loaders_coverage.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_loaders_props.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_logging.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_losses.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_metrics_props.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_metrics_stratified_subsets.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_metrics_unit.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_misc_coverage.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_numeric_edge_cases.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_ood_loader.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_operating_points.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_operating_points_props.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_parallel.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_paths.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_pipeline_e2e.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_plotting_edge.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_plotting_smoke.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_plotting_visual.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_probes.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_protocol_conformance.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_provenance.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_public_api.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_recall_at_fpr.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_reference_equivalence.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_reproducibility_integration.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_scorecard.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_seeds.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_splits.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_splits_leakage_integration.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_splits_props.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_stacking.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_sweep.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_text_dedup.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_text_dedup_coverage.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_text_dedup_props.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_text_dedup_strategies.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_thresholds.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_thresholds_constant_score.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_thresholds_coverage.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_thresholds_props.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_thresholds_research_grounded.py +0 -0
- {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_tokenization_leakage_check.py +0 -0
|
@@ -5,6 +5,113 @@ All notable changes to this project will be documented in this file.
|
|
|
5
5
|
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
|
|
6
6
|
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
|
7
7
|
|
|
8
|
+
## [0.49.0] — 2026-05-23 — Global naming-standards sweep + final cleanup before v1.0
|
|
9
|
+
|
|
10
|
+
Final pre-v1.0 minor consolidating the naming-convention standardization
|
|
11
|
+
that locks the v1.0 Tier-1 contract. Audit + industry-research pass
|
|
12
|
+
(PEP 8, scikit-learn, NumPy, Google Python Style Guide, Scientific
|
|
13
|
+
Python SPEC 7) found the repo already 95-99% consistent; this release
|
|
14
|
+
closes the small remaining gaps + documents the conventions as
|
|
15
|
+
[ADR 0004](docs/source/adr/0004-naming-conventions.md). The SPEC 7
|
|
16
|
+
``rng`` parameter convention is documented here; its adoption is deferred to v0.50.0.
|
|
17
|
+
|
|
18
|
+
### BREAKING
|
|
19
|
+
|
|
20
|
+
Five Tier-1 renames for naming consistency (pre-v1.0; SemVer-minor per
|
|
21
|
+
the v0.34.0 BREAKING-minor precedent). Single-consumer lockstep bump in
|
|
22
|
+
``prompt-injection-detection-submission``; no deprecation aliases.
|
|
23
|
+
|
|
24
|
+
- **``build_manifest`` → ``make_manifest``** (manifest.py). Aligns
|
|
25
|
+
with ``make_minilm_embedder`` / ``make_palette`` / ``make_run_dir``
|
|
26
|
+
factory pattern. ``build_*`` was the only outlier.
|
|
27
|
+
- **``CaseRandomization`` → ``CaseInjection``** (adversarial.py).
|
|
28
|
+
Aligns with ``*Injection`` / ``*Substitution`` adversarial suffix
|
|
29
|
+
convention.
|
|
30
|
+
- **``TokenSplitting`` → ``TokenSplittingInjection``** (adversarial.py).
|
|
31
|
+
Same rationale.
|
|
32
|
+
- **``UnicodeNormalization`` → ``UnicodeNormalizationInjection``**
|
|
33
|
+
(adversarial.py). Same rationale.
|
|
34
|
+
- **``eval_toolkit._scorecard`` → ``eval_toolkit.scorecards``**
|
|
35
|
+
(private → public module promotion). The 4 top-level symbols
|
|
36
|
+
(``scorecard``, ``Scorecard``, ``MetricSpec``, ``MetricResult``)
|
|
37
|
+
remain top-level Tier-1; the new public submodule path
|
|
38
|
+
``from eval_toolkit.scorecards import Scorecard`` is now stable.
|
|
39
|
+
``_scorecard.py`` is gone — old import paths raise
|
|
40
|
+
``ModuleNotFoundError``. Per the asymmetric-promotion principle in
|
|
41
|
+
[ADR 0001](docs/source/adr/0001-flat-module-layout.md): promote
|
|
42
|
+
collection-of-types modules, keep single-function modules underscore
|
|
43
|
+
(``_sweep.py`` stays private).
|
|
44
|
+
|
|
45
|
+
### Added
|
|
46
|
+
|
|
47
|
+
- **[ADR 0004](docs/source/adr/0004-naming-conventions.md)** — Naming
|
|
48
|
+
conventions decision record with industry citations. Covers module
|
|
49
|
+
naming (singular vs plural), class suffixes by domain, function
|
|
50
|
+
verb-prefix conventions, canonical parameter list, fitted-attribute
|
|
51
|
+
trailing underscore (sklearn convention), TypeVar leading underscore
|
|
52
|
+
(Google convention), and the SPEC 7 ``rng`` parameter convention
|
|
53
|
+
(to be adopted in v0.50.0).
|
|
54
|
+
- **STYLE.md** extended with §3a-d (parameter naming, class suffixes
|
|
55
|
+
by domain, module naming, asymmetric promotion), §4a-b
|
|
56
|
+
(fitted-attribute trailing underscore + TypeVar), §12 (75-col
|
|
57
|
+
docstring prose rule), §14 (test naming convention).
|
|
58
|
+
- **CONTRIBUTING.md** cross-link to ADR 0004 + STYLE.md.
|
|
59
|
+
- **[docs/source/api/strict_tier2_protocols.md](docs/source/api/strict_tier2_protocols.md)** —
|
|
60
|
+
new docs page enumerating the 9 strict Tier-2 Protocols + 1 opt-in Protocol
|
|
61
|
+
per [ADR 0003 §1](docs/source/adr/0003-stability-contract-and-gate3-methodology.md),
|
|
62
|
+
with canonical top-level import paths. Resolves #69's discoverability
|
|
63
|
+
concern without breaking the lightweight design intent of
|
|
64
|
+
``eval_toolkit.protocols`` (per ``protocols.py:1-5``).
|
|
65
|
+
- **``src/eval_toolkit/_rng.py``** — private module with SPEC 7 type
|
|
66
|
+
aliases (``SeedLike``, ``RNGLike``). Not yet referenced; scaffold for
|
|
67
|
+
the v0.50.0 SPEC 7 adoption.
|
|
68
|
+
- **[ADR 0001](docs/source/adr/0001-flat-module-layout.md)** amendment
|
|
69
|
+
— added the asymmetric-promotion sub-rule (collection-of-types MAY
|
|
70
|
+
promote, single-function SHOULD stay underscore).
|
|
71
|
+
|
|
72
|
+
### Changed
|
|
73
|
+
|
|
74
|
+
- **Duplicate-type consolidation** (single source of truth):
|
|
75
|
+
- ``Versioned`` Protocol — canonical at ``protocols.py:64``; the
|
|
76
|
+
duplicate at ``leakage.py:82`` removed. Removed
|
|
77
|
+
``"Versioned"`` from ``leakage.__all__``; previously-unused
|
|
78
|
+
``from eval_toolkit.leakage import Versioned`` now raises
|
|
79
|
+
``ImportError``. Use ``from eval_toolkit.protocols import Versioned``
|
|
80
|
+
or top-level ``from eval_toolkit import Versioned``.
|
|
81
|
+
- ``MetricStatus`` ``Literal`` — canonical at ``artifacts.py:30``; the
|
|
82
|
+
duplicate at ``scorecards.py:78`` removed; ``scorecards`` now
|
|
83
|
+
imports from ``artifacts``.
|
|
84
|
+
- **[validation] optional extra** reclassified from "active deprecation
|
|
85
|
+
with removal target v0.33.0" → "permanent no-op kept for backward
|
|
86
|
+
compatibility." Hard removal would break consumer pip pins of the
|
|
87
|
+
form ``eval-toolkit[validation]`` for zero functional benefit
|
|
88
|
+
(R3 in DEPRECATION.md).
|
|
89
|
+
- **Sphinx cross-references** updated from
|
|
90
|
+
``eval_toolkit.leakage.Versioned`` → ``eval_toolkit.protocols.Versioned``
|
|
91
|
+
in ``manifest.py`` docstrings.
|
|
92
|
+
|
|
93
|
+
### Deferred to v0.50.0
|
|
94
|
+
|
|
95
|
+
- **SPEC 7 ``rng`` parameter adoption** across ~30 NumPy-RNG functions.
|
|
96
|
+
Scope deferred from v0.49.0 after the planning audit revealed the
|
|
97
|
+
full blast radius (~30 signature sites + 247 test kwarg sites +
|
|
98
|
+
7 internal helpers + SeedSequence/Generator/sklearn-bridge
|
|
99
|
+
conversions). Splitting matches the "one cleanup per minor" pattern
|
|
100
|
+
per [feedback_staggered_breaking_releases]. ``_rng.py`` ships in
|
|
101
|
+
v0.49.0 as the scaffold; v0.50.0 wires it into every applicable
|
|
102
|
+
function.
|
|
103
|
+
|
|
104
|
+
### Notes
|
|
105
|
+
|
|
106
|
+
- Round 8 audit STOP-GATE per Decision Y.2 — briefing committed at
|
|
107
|
+
v0.48.0 (commit ``6f6839a``); v0.49.0 ships in parallel since the
|
|
108
|
+
audit-trail synthesis confirmed R8 audits the existing contract
|
|
109
|
+
(does not prescribe new changes). Any R8 finding folds into a v0.49.1
|
|
110
|
+
hotfix if needed.
|
|
111
|
+
- Issue #69 closed by the new strict-Tier-2-Protocols docs page; see
|
|
112
|
+
``docs/source/api/strict_tier2_protocols.md`` and the close
|
|
113
|
+
rationale on the issue itself.
|
|
114
|
+
|
|
8
115
|
## [0.48.0] — 2026-05-22 — Polish + audit-driven tightening before v1.0 (Round 7 follow-on + cross-API consistency + doc-execution gates)
|
|
9
116
|
|
|
10
117
|
Third + final BREAKING minor of the staggered v0.45 → v0.46 → v0.46.1 → v0.47
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: eval-toolkit
|
|
3
|
-
Version: 0.48.0
|
|
3
|
+
Version: 0.49.0
|
|
4
4
|
Summary: Reusable evaluation contracts for binary classification: metrics, bootstrap CIs, calibration, artifacts, and evidence gates.
|
|
5
5
|
Project-URL: Homepage, https://github.com/brandon-behring/eval-toolkit
|
|
6
6
|
Project-URL: Documentation, https://brandon-behring.github.io/eval-toolkit/
|
|
@@ -261,13 +261,13 @@ print(f"NLL: {result['nll_pre']:.3f} -> {result['nll_post']:.3f}")
|
|
|
261
261
|
```python
|
|
262
262
|
import tempfile
|
|
263
263
|
from pathlib import Path
|
|
264
|
-
from eval_toolkit import build_manifest, write_manifest
|
|
264
|
+
from eval_toolkit import make_manifest, write_manifest
|
|
265
265
|
|
|
266
266
|
with tempfile.TemporaryDirectory() as run_dir:
|
|
267
267
|
# data_files: {name: path} → eval_toolkit hashes the files for you;
|
|
268
268
|
# versioned: any object with a `version` attribute (e.g. a scorer or
|
|
269
269
|
# leakage check) is captured by name → version in the manifest.
|
|
270
|
-
manifest = build_manifest(
|
|
270
|
+
manifest = make_manifest(
|
|
271
271
|
run_id="quickstart-demo",
|
|
272
272
|
config={"threshold_criterion": "max_f1", "seed": 42},
|
|
273
273
|
seeds={"global": 42, "bootstrap": 42},
|
|
@@ -290,7 +290,7 @@ with tempfile.TemporaryDirectory() as run_dir:
|
|
|
290
290
|
| `eval_toolkit.leakage` | `LeakageCheck` Protocol + 7 reference impls (exact / near / encoding-obfuscated / cross-split / label-conflict / group / temporal); `Versioned` opt-in Protocol |
|
|
291
291
|
| `eval_toolkit.splits` | `Splitter` Protocol + 5 reference impls (holdout / stratified / group / source-disjoint / time-series) |
|
|
292
292
|
| `eval_toolkit.loaders` | `DatasetLoader` Protocol + 4 reference impls (DataFrame / SingleSlice / ParquetGlob / HF datasets) with Croissant-compatible `describe()` |
|
|
293
|
-
| `eval_toolkit.manifest` | `RunManifest` (NeurIPS-aligned) + source-role / guardrail metadata + `build_manifest` / `write_manifest` |
|
|
293
|
+
| `eval_toolkit.manifest` | `RunManifest` (NeurIPS-aligned) + source-role / guardrail metadata + `make_manifest` / `write_manifest` |
|
|
294
294
|
| `eval_toolkit.claims` | `EvidenceGate` class (frozen dataclass: name + callable check + severity), reference gate factories (`required_metric_gate`, `minimum_slice_size_gate`, `metric_threshold_gate`, etc.), `evaluate_claims()`, and `ClaimReport` for claim-mode vs exploratory-mode checks. See [`docs/extending.md`](docs/extending.md) for writing custom gates and [`docs/examples/claims_and_gates.md`](docs/examples/claims_and_gates.md) for a worked end-to-end example. |
|
|
295
295
|
| `eval_toolkit.text_dedup` | `SimilarityStrategy` Protocol + 5 strategies (TF-IDF / hash / embedding / Jaccard / MinHash-LSH); `near_dedup` / `cross_dedup` orchestrators |
|
|
296
296
|
| `eval_toolkit.plotting` | PR curves, reliability diagrams, confusion matrices, score histograms, lift CIs |
|
|
@@ -178,13 +178,13 @@ print(f"NLL: {result['nll_pre']:.3f} -> {result['nll_post']:.3f}")
|
|
|
178
178
|
```python
|
|
179
179
|
import tempfile
|
|
180
180
|
from pathlib import Path
|
|
181
|
-
from eval_toolkit import build_manifest, write_manifest
|
|
181
|
+
from eval_toolkit import make_manifest, write_manifest
|
|
182
182
|
|
|
183
183
|
with tempfile.TemporaryDirectory() as run_dir:
|
|
184
184
|
# data_files: {name: path} → eval_toolkit hashes the files for you;
|
|
185
185
|
# versioned: any object with a `version` attribute (e.g. a scorer or
|
|
186
186
|
# leakage check) is captured by name → version in the manifest.
|
|
187
|
-
manifest = build_manifest(
|
|
187
|
+
manifest = make_manifest(
|
|
188
188
|
run_id="quickstart-demo",
|
|
189
189
|
config={"threshold_criterion": "max_f1", "seed": 42},
|
|
190
190
|
seeds={"global": 42, "bootstrap": 42},
|
|
@@ -207,7 +207,7 @@ with tempfile.TemporaryDirectory() as run_dir:
|
|
|
207
207
|
| `eval_toolkit.leakage` | `LeakageCheck` Protocol + 7 reference impls (exact / near / encoding-obfuscated / cross-split / label-conflict / group / temporal); `Versioned` opt-in Protocol |
|
|
208
208
|
| `eval_toolkit.splits` | `Splitter` Protocol + 5 reference impls (holdout / stratified / group / source-disjoint / time-series) |
|
|
209
209
|
| `eval_toolkit.loaders` | `DatasetLoader` Protocol + 4 reference impls (DataFrame / SingleSlice / ParquetGlob / HF datasets) with Croissant-compatible `describe()` |
|
|
210
|
-
| `eval_toolkit.manifest` | `RunManifest` (NeurIPS-aligned) + source-role / guardrail metadata + `build_manifest` / `write_manifest` |
|
|
210
|
+
| `eval_toolkit.manifest` | `RunManifest` (NeurIPS-aligned) + source-role / guardrail metadata + `make_manifest` / `write_manifest` |
|
|
211
211
|
| `eval_toolkit.claims` | `EvidenceGate` class (frozen dataclass: name + callable check + severity), reference gate factories (`required_metric_gate`, `minimum_slice_size_gate`, `metric_threshold_gate`, etc.), `evaluate_claims()`, and `ClaimReport` for claim-mode vs exploratory-mode checks. See [`docs/extending.md`](docs/extending.md) for writing custom gates and [`docs/examples/claims_and_gates.md`](docs/examples/claims_and_gates.md) for a worked end-to-end example. |
|
|
212
212
|
| `eval_toolkit.text_dedup` | `SimilarityStrategy` Protocol + 5 strategies (TF-IDF / hash / embedding / Jaccard / MinHash-LSH); `near_dedup` / `cross_dedup` orchestrators |
|
|
213
213
|
| `eval_toolkit.plotting` | PR curves, reliability diagrams, confusion matrices, score histograms, lift CIs |
|
|
@@ -36,6 +36,11 @@ Run via `make lint` (= `ruff check + black --check + mypy`) and `make test`.
|
|
|
36
36
|
|
|
37
37
|
## 3. Naming
|
|
38
38
|
|
|
39
|
+
For the full decision record + industry citations, see
|
|
40
|
+
[ADR 0004 — Naming conventions](docs/source/adr/0004-naming-conventions.md).
|
|
41
|
+
This section is the day-to-day quick reference; the ADR is the
|
|
42
|
+
authoritative source.
|
|
43
|
+
|
|
39
44
|
- Module names: `snake_case`, lowercase package (`eval_toolkit`).
|
|
40
45
|
- Class names: `PascalCase`. Suffixes used in this repo:
|
|
41
46
|
- `*Config` — frozen dataclass for settings
|
|
@@ -55,6 +60,68 @@ Run via `make lint` (= `ruff check + black --check + mypy`) and `make test`.
|
|
|
55
60
|
- Mutation marking: not used. Mutating functions return `None` (Pythonic over
|
|
56
61
|
Julia's `_inplace` suffix).
|
|
57
62
|
|
|
63
|
+
### 3a. Parameter naming (canonical list, locked at v1.0)
|
|
64
|
+
|
|
65
|
+
These names mean these things, everywhere. Future functions MUST use
|
|
66
|
+
them; deviations need justification in the PR description.
|
|
67
|
+
|
|
68
|
+
| Parameter | Meaning |
|
|
69
|
+
|---|---|
|
|
70
|
+
| `y_true` | Ground-truth labels (binary, shape `(n,)`) |
|
|
71
|
+
| `y_score` | Continuous score / probability (shape `(n,)`) |
|
|
72
|
+
| `y_pred` | Discrete prediction (threshold-dependent) |
|
|
73
|
+
| `n_resamples` | Bootstrap iteration count |
|
|
74
|
+
| `confidence` | Two-sided confidence level (0.95 default) |
|
|
75
|
+
| `n_bins` | Binning count for calibration / ECE |
|
|
76
|
+
| `n_jobs` | Parallelism (joblib + sklearn convention) |
|
|
77
|
+
| `ax` | Matplotlib axis (matplotlib convention) |
|
|
78
|
+
| `metric` | Callable `(y_true, y_score) -> float` |
|
|
79
|
+
| `rng` | RNG argument per [SPEC 7](https://scientific-python.org/specs/spec-0007/) — target convention; to be adopted in v0.50.0 |
|
|
80
|
+
|
|
81
|
+
The v0.50.0 SPEC 7 adoption will preserve two `seed: int` exceptions:
|
|
82
|
+
`set_global_seeds(seed: int)` (global-state setter, not per-function
|
|
83
|
+
RNG; SPEC 7 doesn't apply) and adversarial dataclass fields (use Python
|
|
84
|
+
`random.Random(seed)`; not NumPy-RNG, so SPEC 7's typing doesn't fit).
|
|
85
|
+
|
|
86
|
+
### 3b. Class suffixes by domain
|
|
87
|
+
|
|
88
|
+
Each suffix maps to a Protocol contract. Stay within the pattern:
|
|
89
|
+
|
|
90
|
+
| Suffix | Domain | Protocol |
|
|
91
|
+
|---|---|---|
|
|
92
|
+
| `*Selector` | Threshold selection | `ThresholdSelector` |
|
|
93
|
+
| `*Splitter` | Cross-validation splits | `Splitter` |
|
|
94
|
+
| `*Check` | Leakage detection | `LeakageCheck` |
|
|
95
|
+
| `*Loader` | Dataset loading | `DatasetLoader` |
|
|
96
|
+
| `*Reader` | Prediction artifact reading | `PredictionReader` |
|
|
97
|
+
| `*Variant` | Preprocessing variant | (functional API) |
|
|
98
|
+
| `*Strategy` | Dedup similarity backend | `SimilarityStrategy` |
|
|
99
|
+
| `*Injection` / `*Substitution` | Adversarial char-injection / -substitution | `TextTransform` |
|
|
100
|
+
|
|
101
|
+
### 3c. Module naming (singular vs plural)
|
|
102
|
+
|
|
103
|
+
- **Plural noun** for collection-of-types modules: `metrics`,
|
|
104
|
+
`loaders`, `protocols`, `losses`, `probes`, `splits`, `paths`,
|
|
105
|
+
`seeds`, `thresholds`, `artifacts`, `claims`, `embeddings`,
|
|
106
|
+
`scorecards`.
|
|
107
|
+
- **Singular noun** for domain-concept modules: `harness`,
|
|
108
|
+
`bootstrap`, `manifest`, `calibration`, `leakage`, `analysis`,
|
|
109
|
+
`provenance`, `evidence`, `stacking`, `text_dedup`.
|
|
110
|
+
- **Gerund** for process-domain modules: `preprocessing`.
|
|
111
|
+
|
|
112
|
+
### 3d. Asymmetric module promotion (private → public)
|
|
113
|
+
|
|
114
|
+
Collection-of-types private modules MAY be promoted to plural-public
|
|
115
|
+
when they hold ≥2 user-relevant types. Single-function private
|
|
116
|
+
modules SHOULD stay underscore. See
|
|
117
|
+
[ADR 0001](docs/source/adr/0001-flat-module-layout.md) for the trigger
|
|
118
|
+
analysis.
|
|
119
|
+
|
|
120
|
+
Examples:
|
|
121
|
+
|
|
122
|
+
- `_scorecard.py` (4 public exports) → `scorecards.py` at v0.49.0. ✓ promote.
|
|
123
|
+
- `_sweep.py` (1 public function `sweep`) → stays `_sweep.py`. ✓ keep private.
|
|
124
|
+
|
|
58
125
|
## 4. Type hints
|
|
59
126
|
|
|
60
127
|
- Every public function has fully typed parameters and return.
|
|
@@ -79,10 +146,13 @@ Run via `make lint` (= `ruff check + black --check + mypy`) and `make test`.
|
|
|
79
146
|
for 4 reference impls.
|
|
80
147
|
- `SimilarityStrategy` (`text_dedup.py`) — pluggable similarity backend for
|
|
81
148
|
`near_dedup` / `cross_dedup` / `NearDuplicateCheck` / `CrossSplitLeakageCheck`.
|
|
82
|
-
- `Versioned` (`
|
|
83
|
-
implementation may expose `version: str`.
|
|
84
|
-
auto-collects them. Mirrors the
|
|
85
|
-
pattern. See
|
|
149
|
+
- `Versioned` (`protocols.py`) — opt-in single-attribute Protocol; any
|
|
150
|
+
Tier-2 implementation may expose `version: str`.
|
|
151
|
+
`RunManifest.versioned_objects` auto-collects them. Mirrors the
|
|
152
|
+
`lm-evaluation-harness` task `VERSION` pattern. See
|
|
153
|
+
`docs/methodology/versioning.md`. (Single source of truth at
|
|
154
|
+
`protocols.py:64` since v0.49.0; the duplicate previously in
|
|
155
|
+
`leakage.py:82` was removed.)
|
|
86
156
|
- All seams are `@runtime_checkable` so callers can `isinstance(obj, Protocol)`.
|
|
87
157
|
- Reference impls are `@dataclass(frozen=True, slots=True)` with config in the
|
|
88
158
|
constructor (`TargetRecallSelector(recall=0.90)`) and the Protocol method as
|
|
@@ -90,6 +160,25 @@ Run via `make lint` (= `ruff check + black --check + mypy`) and `make test`.
|
|
|
90
160
|
- `NamedTuple` for stable public records that benefit from positional access;
|
|
91
161
|
frozen dataclasses with `slots=True` otherwise.
|
|
92
162
|
|
|
163
|
+
### 4a. Fitted-attribute trailing underscore (sklearn convention)
|
|
164
|
+
|
|
165
|
+
Estimator-style classes (`fit`/`predict` pattern) that store
|
|
166
|
+
**learned-from-data attributes** use trailing underscore per scikit-learn
|
|
167
|
+
convention: `coef_`, `classes_`, `n_features_in_`, `feature_importances_`.
|
|
168
|
+
These attributes MUST NOT be set in `__init__` — set them only in `fit()`.
|
|
169
|
+
|
|
170
|
+
Frozen reference-impl dataclasses (`@dataclass(frozen=True, slots=True)`)
|
|
171
|
+
are **exempt** — they hold config, not fitted state.
|
|
172
|
+
|
|
173
|
+
Current canonical example: `stacking.LogisticStacker`.
|
|
174
|
+
|
|
175
|
+
### 4b. TypeVar naming
|
|
176
|
+
|
|
177
|
+
Internal (private) `TypeVar`s use a leading underscore per Google Python
|
|
178
|
+
Style Guide §3.19.10: `_T = TypeVar("_T")`. Public, constrained `TypeVar`s
|
|
179
|
+
without the underscore are allowed only when explicitly part of an
|
|
180
|
+
exported generic API.
|
|
181
|
+
|
|
93
182
|
## 5. Dataclasses
|
|
94
183
|
|
|
95
184
|
1. **`slots=True` always** on repo-owned dataclasses. Catches typos at
|
|
@@ -220,6 +309,10 @@ def fit_temperature(val_logits, val_labels, bounds=(0.05, 20.0)):
|
|
|
220
309
|
- **References** cites arXiv IDs / DOIs / journal cites.
|
|
221
310
|
- For modules where doctests would be contrived (`plotting`, `harness`,
|
|
222
311
|
`provenance`), Examples are optional.
|
|
312
|
+
- **Docstring prose wraps at 75 cols** (numpydoc convention) so that
|
|
313
|
+
`help()` is readable in a terminal. Doctest code blocks inside the
|
|
314
|
+
docstring follow the 100-col Black rule (code stays comfortable in an
|
|
315
|
+
editor even though prose around it is narrower).
|
|
223
316
|
|
|
224
317
|
## 13. Comments
|
|
225
318
|
|
|
@@ -228,6 +321,12 @@ restate what the code says.
|
|
|
228
321
|
|
|
229
322
|
## 14. Tests
|
|
230
323
|
|
|
324
|
+
- **File naming**: `tests/test_<module>.py` mirrors
|
|
325
|
+
`src/eval_toolkit/<module>.py`. Auxiliary tests per module use
|
|
326
|
+
suffixes (`test_<module>_props.py`, `test_<module>_validation.py`,
|
|
327
|
+
`test_<module>_golden.py`).
|
|
328
|
+
- **Function naming**: `test_<thing_under_test>_<scenario>`. No
|
|
329
|
+
class-based test grouping unless fixtures truly demand it (rare).
|
|
231
330
|
- **Markers**: `unit`, `property`, `smoke`, `golden`.
|
|
232
331
|
- **Sklearn-reference + analytical** as the unit-test oracle where available.
|
|
233
332
|
- **Hypothesis** required for math/stat invariants. Strategies use
|
|
@@ -74,15 +74,14 @@ probes = ["torch>=2.0", "transformers>=4.40"]
|
|
|
74
74
|
# (granular extras — losses callers should not have to install the larger
|
|
75
75
|
# transformers stack). Shares the torch version pin with [probes].
|
|
76
76
|
losses = ["torch>=2.0"]
|
|
77
|
-
#
|
|
77
|
+
# NO-OP extra kept for backward compatibility (R3 at v0.49.0).
|
|
78
78
|
#
|
|
79
|
-
#
|
|
80
|
-
#
|
|
81
|
-
# v0.
|
|
82
|
-
#
|
|
83
|
-
#
|
|
84
|
-
#
|
|
85
|
-
# in CHANGELOG ### Deprecated + docs/DEPRECATION.md.
|
|
79
|
+
# jsonschema>=4.21 moved to base deps at v0.16.0; this extra has been a
|
|
80
|
+
# no-op ever since. Originally announced as deprecated in v0.30.1 with
|
|
81
|
+
# target removal at v0.33.0, but reclassified at v0.49.0 (R3 in
|
|
82
|
+
# docs/DEPRECATION.md) as a permanent no-op — hard removal would break
|
|
83
|
+
# consumer pip pins of the form `eval-toolkit[validation]` for zero
|
|
84
|
+
# functional benefit. Retained indefinitely.
|
|
86
85
|
validation = []
|
|
87
86
|
# v0.31.0 docs site: Sphinx + pydata-sphinx-theme (replaces v0.28.0's
|
|
88
87
|
# mkdocs-material). Migration drivers — pain points Q1 in the v0.31.0
|
|
@@ -38,15 +38,15 @@ _EXPORTS: dict[str, str] = {
|
|
|
38
38
|
"ALL_TECHNIQUES": "eval_toolkit.adversarial",
|
|
39
39
|
"BidiRTLInjection": "eval_toolkit.adversarial",
|
|
40
40
|
"CORE_TECHNIQUES": "eval_toolkit.adversarial",
|
|
41
|
-
"
|
|
41
|
+
"CaseInjection": "eval_toolkit.adversarial",
|
|
42
42
|
"DiacriticInjection": "eval_toolkit.adversarial",
|
|
43
43
|
"HomoglyphSubstitution": "eval_toolkit.adversarial",
|
|
44
44
|
"InvisibleCharsInjection": "eval_toolkit.adversarial",
|
|
45
45
|
"PunctuationInjection": "eval_toolkit.adversarial",
|
|
46
46
|
"SynonymSubstitution": "eval_toolkit.adversarial",
|
|
47
47
|
"TagStrippingInjection": "eval_toolkit.adversarial",
|
|
48
|
-
"
|
|
49
|
-
"
|
|
48
|
+
"TokenSplittingInjection": "eval_toolkit.adversarial",
|
|
49
|
+
"UnicodeNormalizationInjection": "eval_toolkit.adversarial",
|
|
50
50
|
"WhitespaceInjection": "eval_toolkit.adversarial",
|
|
51
51
|
"ZeroWidthSpaceInjection": "eval_toolkit.adversarial",
|
|
52
52
|
# CharacterInjectionStrategy + character_injection SimpleNamespace
|
|
@@ -202,7 +202,7 @@ _EXPORTS: dict[str, str] = {
|
|
|
202
202
|
"MANIFEST_SCHEMA_VERSION": "eval_toolkit.manifest",
|
|
203
203
|
"RunManifest": "eval_toolkit.manifest",
|
|
204
204
|
"SourceRoleRecord": "eval_toolkit.manifest",
|
|
205
|
-
"
|
|
205
|
+
"make_manifest": "eval_toolkit.manifest",
|
|
206
206
|
"validate_source_roles": "eval_toolkit.manifest",
|
|
207
207
|
"write_manifest": "eval_toolkit.manifest",
|
|
208
208
|
# --- metrics ---
|
|
@@ -315,10 +315,10 @@ _EXPORTS: dict[str, str] = {
|
|
|
315
315
|
"wilson_interval": "eval_toolkit.thresholds",
|
|
316
316
|
"LogisticStacker": "eval_toolkit.stacking",
|
|
317
317
|
"MetaLearner": "eval_toolkit.stacking",
|
|
318
|
-
"MetricResult": "eval_toolkit.
|
|
319
|
-
"MetricSpec": "eval_toolkit.
|
|
320
|
-
"Scorecard": "eval_toolkit.
|
|
321
|
-
"scorecard": "eval_toolkit.
|
|
318
|
+
"MetricResult": "eval_toolkit.scorecards",
|
|
319
|
+
"MetricSpec": "eval_toolkit.scorecards",
|
|
320
|
+
"Scorecard": "eval_toolkit.scorecards",
|
|
321
|
+
"scorecard": "eval_toolkit.scorecards",
|
|
322
322
|
# --- sweep (top-level v0.47 unification — Decision K + Decision D) ---
|
|
323
323
|
"sweep": "eval_toolkit._sweep",
|
|
324
324
|
}
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
"""Private RNG-parameter type aliases per Scientific-Python SPEC 7.
|
|
2
|
+
|
|
3
|
+
This module centralizes the type aliases used to annotate user-facing RNG
|
|
4
|
+
parameters across the toolkit. Per `SPEC 7 — Seeding PRNG
|
|
5
|
+
<https://scientific-python.org/specs/spec-0007/>`_ (Endorsed), eval-toolkit
|
|
6
|
+
exposes a single canonical parameter name ``rng`` typed as
|
|
7
|
+
``RNGLike | SeedLike | None`` on every function that consumes a NumPy
|
|
8
|
+
``Generator``. Bodies normalize via ``np.random.default_rng(rng)``.
|
|
9
|
+
|
|
10
|
+
This module is private (underscore prefix) so the aliases stay an
|
|
11
|
+
implementation detail — public symbols use them only in their annotations.
|
|
12
|
+
If a Tier-2 consumer ever needs them exposed for their own callsite type
|
|
13
|
+
annotations, promote them via ``eval_toolkit.protocols`` per the
|
|
14
|
+
asymmetric-promotion principle in ADR 0001 + STYLE.md §3d.
|
|
15
|
+
|
|
16
|
+
Exceptions to the SPEC 7 convention — documented in STYLE.md §3a:
|
|
17
|
+
|
|
18
|
+
- ``seeds.set_global_seeds(seed: int)`` — global-state setter, not a
|
|
19
|
+
per-function RNG parameter; SPEC 7 is scoped to per-function RNG inputs.
|
|
20
|
+
- ``adversarial.*Injection`` / ``*Substitution`` / ``CaseInjection``
|
|
21
|
+
dataclass fields — they use Python's stdlib ``random.Random(seed)``,
|
|
22
|
+
not NumPy. SPEC 7's typing (``RNGLike = np.random.Generator | ...``) is
|
|
23
|
+
strictly NumPy-scoped.
|
|
24
|
+
"""
|
|
25
|
+
|
|
26
|
+
from __future__ import annotations
|
|
27
|
+
|
|
28
|
+
from collections.abc import Sequence
|
|
29
|
+
|
|
30
|
+
import numpy as np
|
|
31
|
+
|
|
32
|
+
type SeedLike = int | np.integer | Sequence[int] | np.random.SeedSequence
|
|
33
|
+
"""Anything that can seed a NumPy bit generator.
|
|
34
|
+
|
|
35
|
+
Per SPEC 7, ``np.random.default_rng`` accepts any of these as a seed
|
|
36
|
+
without further conversion. ``Sequence[int]`` is the entropy-vector form
|
|
37
|
+
used by ``np.random.SeedSequence``.
|
|
38
|
+
"""
|
|
39
|
+
|
|
40
|
+
type RNGLike = np.random.Generator | np.random.BitGenerator
|
|
41
|
+
"""An already-instantiated NumPy bit generator or generator wrapper.
|
|
42
|
+
|
|
43
|
+
``np.random.default_rng(rng)`` is the identity function on
|
|
44
|
+
``Generator`` inputs and lifts ``BitGenerator`` inputs into a
|
|
45
|
+
``Generator`` — both forms compose cleanly.
|
|
46
|
+
"""
|
|
@@ -12,7 +12,7 @@ Core techniques (shipped in v0.43.0):
|
|
|
12
12
|
- :class:`HomoglyphSubstitution` — Latin → Cyrillic/Greek lookalikes
|
|
13
13
|
- :class:`DiacriticInjection` — combining-mark insertion (NFC bypass)
|
|
14
14
|
- :class:`WhitespaceInjection` — variable whitespace padding (regular + NBSP)
|
|
15
|
-
- :class:`
|
|
15
|
+
- :class:`CaseInjection` — random case-flipping per character
|
|
16
16
|
- :class:`PunctuationInjection` — non-semantic punctuation insertion
|
|
17
17
|
|
|
18
18
|
Advanced techniques (shipped in v0.47 per Decision Q11.3):
|
|
@@ -20,8 +20,8 @@ Advanced techniques (shipped in v0.47 per Decision Q11.3):
|
|
|
20
20
|
- :class:`BidiRTLInjection` — U+202E…U+202C override block
|
|
21
21
|
- :class:`TagStrippingInjection` — ``<…>`` tag removal (idempotent)
|
|
22
22
|
- :class:`SynonymSubstitution` — whitelisted-word swap, seed-deterministic
|
|
23
|
-
- :class:`
|
|
24
|
-
- :class:`
|
|
23
|
+
- :class:`TokenSplittingInjection` — mid-word single-space insertion
|
|
24
|
+
- :class:`UnicodeNormalizationInjection` — NFC / NFD / NFKC / NFKD form switch
|
|
25
25
|
- :class:`InvisibleCharsInjection` — 5 invisible code points
|
|
26
26
|
|
|
27
27
|
The convenience tuples :data:`CORE_TECHNIQUES` (6-tuple),
|
|
@@ -54,15 +54,15 @@ __all__ = [
|
|
|
54
54
|
"ALL_TECHNIQUES",
|
|
55
55
|
"BidiRTLInjection",
|
|
56
56
|
"CORE_TECHNIQUES",
|
|
57
|
-
"
|
|
57
|
+
"CaseInjection",
|
|
58
58
|
"DiacriticInjection",
|
|
59
59
|
"HomoglyphSubstitution",
|
|
60
60
|
"InvisibleCharsInjection",
|
|
61
61
|
"PunctuationInjection",
|
|
62
62
|
"SynonymSubstitution",
|
|
63
63
|
"TagStrippingInjection",
|
|
64
|
-
"
|
|
65
|
-
"
|
|
64
|
+
"TokenSplittingInjection",
|
|
65
|
+
"UnicodeNormalizationInjection",
|
|
66
66
|
"WhitespaceInjection",
|
|
67
67
|
"ZeroWidthSpaceInjection",
|
|
68
68
|
]
|
|
@@ -287,7 +287,7 @@ class WhitespaceInjection:
|
|
|
287
287
|
|
|
288
288
|
|
|
289
289
|
@dataclass(frozen=True, slots=True)
|
|
290
|
-
class
|
|
290
|
+
class CaseInjection:
|
|
291
291
|
"""Randomly flip the case of alphabetic characters.
|
|
292
292
|
|
|
293
293
|
Deterministic given the seed. Numeric / punctuation / whitespace pass
|
|
@@ -311,7 +311,7 @@ class CaseRandomization:
|
|
|
311
311
|
|
|
312
312
|
def __post_init__(self) -> None:
|
|
313
313
|
if not 0.0 <= self.ratio <= 1.0:
|
|
314
|
-
raise ValueError(f"
|
|
314
|
+
raise ValueError(f"CaseInjection: ratio must be in [0, 1]; got {self.ratio}")
|
|
315
315
|
|
|
316
316
|
def transform(self, text: str) -> str:
|
|
317
317
|
rng = random.Random(self.seed)
|
|
@@ -524,7 +524,7 @@ class SynonymSubstitution:
|
|
|
524
524
|
|
|
525
525
|
|
|
526
526
|
@dataclass(frozen=True, slots=True)
|
|
527
|
-
class
|
|
527
|
+
class TokenSplittingInjection:
|
|
528
528
|
"""Insert a single space inside each long enough word.
|
|
529
529
|
|
|
530
530
|
Forces subword tokenizers to break a single token into two, often
|
|
@@ -552,10 +552,10 @@ class TokenSplitting:
|
|
|
552
552
|
def __post_init__(self) -> None:
|
|
553
553
|
if self.min_word_length < 2:
|
|
554
554
|
raise ValueError(
|
|
555
|
-
f"
|
|
555
|
+
f"TokenSplittingInjection: min_word_length must be >= 2; got {self.min_word_length}"
|
|
556
556
|
)
|
|
557
557
|
if not 0.0 <= self.ratio <= 1.0:
|
|
558
|
-
raise ValueError(f"
|
|
558
|
+
raise ValueError(f"TokenSplittingInjection: ratio must be in [0, 1]; got {self.ratio}")
|
|
559
559
|
|
|
560
560
|
def transform(self, text: str) -> str:
|
|
561
561
|
rng = random.Random(self.seed)
|
|
@@ -576,7 +576,7 @@ class TokenSplitting:
|
|
|
576
576
|
|
|
577
577
|
|
|
578
578
|
@dataclass(frozen=True, slots=True)
|
|
579
|
-
class
|
|
579
|
+
class UnicodeNormalizationInjection:
|
|
580
580
|
"""Apply a Unicode normalization form to the input.
|
|
581
581
|
|
|
582
582
|
Defaults to NFKC which folds compatibility characters (e.g., ``ABC``
|
|
@@ -598,7 +598,7 @@ class UnicodeNormalization:
|
|
|
598
598
|
def __post_init__(self) -> None:
|
|
599
599
|
if self.form not in {"NFC", "NFD", "NFKC", "NFKD"}:
|
|
600
600
|
raise ValueError(
|
|
601
|
-
f"
|
|
601
|
+
f"UnicodeNormalizationInjection: form must be NFC / NFD / NFKC / NFKD; got {self.form!r}"
|
|
602
602
|
)
|
|
603
603
|
|
|
604
604
|
def transform(self, text: str) -> str:
|
|
@@ -659,15 +659,15 @@ CORE_TECHNIQUES: tuple[type[Any], ...] = (
|
|
|
659
659
|
HomoglyphSubstitution,
|
|
660
660
|
DiacriticInjection,
|
|
661
661
|
WhitespaceInjection,
|
|
662
|
-
|
|
662
|
+
CaseInjection,
|
|
663
663
|
PunctuationInjection,
|
|
664
664
|
)
|
|
665
665
|
ADVANCED_TECHNIQUES: tuple[type[Any], ...] = (
|
|
666
666
|
BidiRTLInjection,
|
|
667
667
|
TagStrippingInjection,
|
|
668
668
|
SynonymSubstitution,
|
|
669
|
-
|
|
670
|
-
|
|
669
|
+
TokenSplittingInjection,
|
|
670
|
+
UnicodeNormalizationInjection,
|
|
671
671
|
InvisibleCharsInjection,
|
|
672
672
|
)
|
|
673
673
|
ALL_TECHNIQUES: tuple[type[Any], ...] = CORE_TECHNIQUES + ADVANCED_TECHNIQUES
|
|
@@ -703,8 +703,8 @@ def _whitespace(
|
|
|
703
703
|
|
|
704
704
|
|
|
705
705
|
def _case_random(text: str, ratio: float = 0.5, seed: int = 42) -> str:
|
|
706
|
-
"""Functional alias for :class:`
|
|
707
|
-
return
|
|
706
|
+
"""Functional alias for :class:`CaseInjection`."""
|
|
707
|
+
return CaseInjection(ratio=ratio, seed=seed).transform(text)
|
|
708
708
|
|
|
709
709
|
|
|
710
710
|
def _punctuation(text: str, ratio: float = 0.1, seed: int = 42) -> str:
|
|
@@ -71,28 +71,16 @@ __all__ = [
|
|
|
71
71
|
"Severity",
|
|
72
72
|
"TemporalLeakageCheck",
|
|
73
73
|
"TokenizationLeakageCheck",
|
|
74
|
-
"Versioned",
|
|
75
74
|
"run_leakage_checks",
|
|
76
75
|
]
|
|
77
76
|
|
|
78
77
|
Severity = Literal["error", "warning", "info"]
|
|
79
78
|
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
Used by :class:`~eval_toolkit.manifest.RunManifest` to capture per-object
|
|
86
|
-
versions of any Tier-2 implementation (Scorer, LeakageCheck, Splitter,
|
|
87
|
-
ThresholdSelector, DatasetLoader). Mirrors the lm-evaluation-harness
|
|
88
|
-
``VERSION`` field pattern, which invalidates cross-version metric
|
|
89
|
-
comparisons. Opt-in: implementations are not required to set ``version``.
|
|
90
|
-
"""
|
|
91
|
-
|
|
92
|
-
@property
|
|
93
|
-
def version(self) -> str: # pragma: no cover
|
|
94
|
-
"""Stable version string for this implementation."""
|
|
95
|
-
...
|
|
79
|
+
# `Versioned` Protocol previously had a duplicate definition here (v0.7+).
|
|
80
|
+
# Removed at v0.49.0 (N5 dedup) — canonical home is `eval_toolkit.protocols`
|
|
81
|
+
# per `protocols.py:1-5` ("Lightweight public Protocols with minimal dependency
|
|
82
|
+
# surface"). Use `from eval_toolkit.protocols import Versioned` (or top-level
|
|
83
|
+
# `from eval_toolkit import Versioned`).
|
|
96
84
|
|
|
97
85
|
|
|
98
86
|
# ---------------------------------------------------------------------------
|