eval-toolkit 0.47.0__tar.gz → 0.48.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/CHANGELOG.md +90 -0
- {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/PKG-INFO +1 -1
- {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/src/eval_toolkit/__init__.py +15 -9
- {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/src/eval_toolkit/_scorecard.py +32 -0
- {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/src/eval_toolkit/_sweep.py +120 -2
- {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/src/eval_toolkit/_version.py +1 -1
- {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/src/eval_toolkit/adversarial.py +40 -18
- {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/src/eval_toolkit/bootstrap.py +69 -16
- {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/src/eval_toolkit/calibration.py +41 -3
- {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/src/eval_toolkit/config.py +1 -1
- {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/src/eval_toolkit/embeddings.py +1 -1
- {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/src/eval_toolkit/loaders.py +2 -3
- {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/src/eval_toolkit/metrics.py +15 -0
- {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/golden/public_api/snapshot.json +1 -1
- {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/test_analysis.py +2 -3
- {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/test_bootstrap_edge_cases.py +57 -0
- {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/test_bootstrap_unit.py +39 -6
- {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/test_coverage_bootstrap.py +4 -2
- {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/test_harness_metric_options.py +4 -2
- eval_toolkit-0.48.0/tests/test_lazy_extras_messages.py +283 -0
- {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/test_loaders.py +8 -2
- {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/test_metrics_unit.py +153 -0
- eval_toolkit-0.48.0/tests/test_sweep.py +426 -0
- eval_toolkit-0.47.0/tests/test_sweep.py +0 -180
- {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/.gitignore +0 -0
- {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/LICENSE +0 -0
- {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/README.md +0 -0
- {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/STYLE.md +0 -0
- {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/docs/archive/README.md +0 -0
- {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/docs/research/README.md +0 -0
- {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/docs/research/datasets/README.md +0 -0
- {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/docs/research/papers/data-integrity/README.md +0 -0
- {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/docs/research/papers/eval-ecosystem/README.md +0 -0
- {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/docs/research/papers/inference/README.md +0 -0
- {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/docs/research/papers/prompt-injection/README.md +0 -0
- {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/docs/source/adr/README.md +0 -0
- {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/docs/source/methodology/README.md +0 -0
- {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/pyproject.toml +0 -0
- {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/src/eval_toolkit/__main__.py +0 -0
- {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/src/eval_toolkit/_deprecated.py +0 -0
- {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/src/eval_toolkit/_parallel.py +0 -0
- {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/src/eval_toolkit/analysis.py +0 -0
- {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/src/eval_toolkit/artifacts.py +0 -0
- {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/src/eval_toolkit/claims.py +0 -0
- {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/src/eval_toolkit/docs.py +0 -0
- {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/src/eval_toolkit/evidence.py +0 -0
- {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/src/eval_toolkit/harness.py +0 -0
- {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/src/eval_toolkit/leakage.py +0 -0
- {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/src/eval_toolkit/losses.py +0 -0
- {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/src/eval_toolkit/manifest.py +0 -0
- {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/src/eval_toolkit/metric_specs.py +0 -0
- {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/src/eval_toolkit/operating_points.py +0 -0
- {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/src/eval_toolkit/paths.py +0 -0
- {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/src/eval_toolkit/plotting.py +0 -0
- {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/src/eval_toolkit/preprocessing.py +0 -0
- {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/src/eval_toolkit/probes.py +0 -0
- {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/src/eval_toolkit/protocols.py +0 -0
- {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/src/eval_toolkit/provenance.py +0 -0
- {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/src/eval_toolkit/py.typed +0 -0
- {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/src/eval_toolkit/schemas/manifest.v1.json +0 -0
- {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/src/eval_toolkit/schemas/manifest.v2.json +0 -0
- {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/src/eval_toolkit/schemas/manifest.v3.json +0 -0
- {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/src/eval_toolkit/schemas/ood_manifest.v1.json +0 -0
- {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/src/eval_toolkit/schemas/results.v1.json +0 -0
- {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/src/eval_toolkit/schemas/results_full.v1.json +0 -0
- {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/src/eval_toolkit/seeds.py +0 -0
- {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/src/eval_toolkit/splits.py +0 -0
- {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/src/eval_toolkit/stacking.py +0 -0
- {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/src/eval_toolkit/text_dedup.py +0 -0
- {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/src/eval_toolkit/thresholds.py +0 -0
- {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/baseline/test_plotting_visual/plot_bootstrap_distribution.png +0 -0
- {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/baseline/test_plotting_visual/plot_confusion_matrix_grid.png +0 -0
- {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/baseline/test_plotting_visual/plot_lift_ci.png +0 -0
- {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/baseline/test_plotting_visual/plot_metric_bars.png +0 -0
- {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/baseline/test_plotting_visual/plot_pareto_frontier.png +0 -0
- {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/baseline/test_plotting_visual/plot_pr_curve.png +0 -0
- {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/baseline/test_plotting_visual/plot_reliability_diagram.png +0 -0
- {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/baseline/test_plotting_visual/plot_roc_curve.png +0 -0
- {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/baseline/test_plotting_visual/plot_score_histograms.png +0 -0
- {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/baseline/test_plotting_visual/plot_slice_metric_heatmap.png +0 -0
- {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/benchmarks/__init__.py +0 -0
- {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/benchmarks/test_kernel_benchmarks.py +0 -0
- {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/conftest.py +0 -0
- {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/golden/bootstrap_ci/cases.json +0 -0
- {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/golden/data/dedup_holdout.jsonl +0 -0
- {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/golden/data/dedup_holdout_expected.json +0 -0
- {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/golden/data/dedup_holdout_provenance.md +0 -0
- {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/golden/docs/expected.md +0 -0
- {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/golden/docs/input.md +0 -0
- {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/golden/docs/metrics.json +0 -0
- {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/golden/test_dedup_holdout_calibration.py +0 -0
- {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/strategies.py +0 -0
- {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/test_adversarial.py +0 -0
- {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/test_artifacts.py +0 -0
- {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/test_block_bootstrap_on_folds.py +0 -0
- {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/test_bootstrap_calibration_mc.py +0 -0
- {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/test_bootstrap_golden.py +0 -0
- {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/test_bootstrap_njobs.py +0 -0
- {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/test_bootstrap_props.py +0 -0
- {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/test_bootstrap_research_grounded.py +0 -0
- {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/test_calibration_binary_adapters.py +0 -0
- {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/test_calibration_bootstrap_chain.py +0 -0
- {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/test_calibration_determinism.py +0 -0
- {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/test_calibration_optimization_failures.py +0 -0
- {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/test_calibration_props.py +0 -0
- {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/test_calibration_research_grounded.py +0 -0
- {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/test_calibration_unit.py +0 -0
- {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/test_claims.py +0 -0
- {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/test_claims_coverage.py +0 -0
- {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/test_claims_props.py +0 -0
- {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/test_cli.py +0 -0
- {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/test_config.py +0 -0
- {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/test_coverage_calibration.py +0 -0
- {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/test_coverage_harness.py +0 -0
- {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/test_coverage_metrics.py +0 -0
- {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/test_coverage_plotting.py +0 -0
- {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/test_croissant_e2e.py +0 -0
- {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/test_dedup_split_leakage_chain.py +0 -0
- {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/test_deprecated_scalars_shim.py +0 -0
- {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/test_deprecations.py +0 -0
- {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/test_docs_golden.py +0 -0
- {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/test_docs_props.py +0 -0
- {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/test_embeddings.py +0 -0
- {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/test_evidence_validators.py +0 -0
- {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/test_harness_edge_cases.py +0 -0
- {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/test_harness_fault_injection.py +0 -0
- {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/test_harness_folded.py +0 -0
- {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/test_harness_internals.py +0 -0
- {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/test_harness_parallelism.py +0 -0
- {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/test_harness_smoke.py +0 -0
- {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/test_import_boundaries.py +0 -0
- {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/test_is_metric_defined_for_slice.py +0 -0
- {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/test_leakage.py +0 -0
- {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/test_leakage_error_paths.py +0 -0
- {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/test_leakage_props.py +0 -0
- {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/test_loaders_coverage.py +0 -0
- {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/test_loaders_props.py +0 -0
- {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/test_logging.py +0 -0
- {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/test_losses.py +0 -0
- {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/test_manifest.py +0 -0
- {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/test_manifest_contamination_round_trip.py +0 -0
- {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/test_manifest_props.py +0 -0
- {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/test_manifest_validation.py +0 -0
- {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/test_metrics_props.py +0 -0
- {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/test_metrics_stratified_subsets.py +0 -0
- {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/test_misc_coverage.py +0 -0
- {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/test_numeric_edge_cases.py +0 -0
- {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/test_ood_loader.py +0 -0
- {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/test_operating_points.py +0 -0
- {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/test_operating_points_props.py +0 -0
- {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/test_parallel.py +0 -0
- {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/test_paths.py +0 -0
- {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/test_pipeline_e2e.py +0 -0
- {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/test_plotting_edge.py +0 -0
- {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/test_plotting_smoke.py +0 -0
- {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/test_plotting_visual.py +0 -0
- {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/test_preprocessing.py +0 -0
- {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/test_probes.py +0 -0
- {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/test_protocol_conformance.py +0 -0
- {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/test_provenance.py +0 -0
- {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/test_public_api.py +0 -0
- {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/test_recall_at_fpr.py +0 -0
- {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/test_reference_equivalence.py +0 -0
- {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/test_reproducibility_integration.py +0 -0
- {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/test_schemas.py +0 -0
- {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/test_scorecard.py +0 -0
- {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/test_seeds.py +0 -0
- {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/test_splits.py +0 -0
- {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/test_splits_leakage_integration.py +0 -0
- {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/test_splits_props.py +0 -0
- {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/test_stacking.py +0 -0
- {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/test_text_dedup.py +0 -0
- {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/test_text_dedup_coverage.py +0 -0
- {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/test_text_dedup_props.py +0 -0
- {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/test_text_dedup_strategies.py +0 -0
- {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/test_thresholds.py +0 -0
- {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/test_thresholds_constant_score.py +0 -0
- {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/test_thresholds_coverage.py +0 -0
- {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/test_thresholds_props.py +0 -0
- {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/test_thresholds_research_grounded.py +0 -0
- {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/test_tokenization_leakage_check.py +0 -0
- {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/test_v09_contracts.py +0 -0
|
@@ -5,6 +5,96 @@ All notable changes to this project will be documented in this file.
|
|
|
5
5
|
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
|
|
6
6
|
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
|
7
7
|
|
|
8
|
+
## [0.48.0] — 2026-05-22 — Polish + audit-driven tightening before v1.0 (Round 7 follow-on + cross-API consistency + doc-execution gates)
|
|
9
|
+
|
|
10
|
+
Third + final BREAKING minor of the staggered v0.45 → v0.46 → v0.46.1 → v0.47
|
|
11
|
+
→ v0.48 → v1.0 release sequence (plan
|
|
12
|
+
``~/.claude/plans/evaluate-all-the-work-twinkly-kite.md``, Step 4). Migration
|
|
13
|
+
guide: ``docs/source/migration/v0.48.md``.
|
|
14
|
+
|
|
15
|
+
Closes:
|
|
16
|
+
|
|
17
|
+
- Round 7 audit STOP-GATE per Decision Y.2 (Codex R7-F1/F2/F3 + 6 Gemini
|
|
18
|
+
observations; see ``docs/source/audit_findings.md`` for the per-finding
|
|
19
|
+
ledger).
|
|
20
|
+
- Audit-as-seed extensions surfaced during plan refinement: full
|
|
21
|
+
module-docstring sweep across ``src/eval_toolkit/``; expanded
|
|
22
|
+
``.doctest-modules`` from 11 → 21 modules; comprehensive cross-API
|
|
23
|
+
shape-validation consistency sweep.
|
|
24
|
+
- Round 5 §5E-prep packet-drift fixes (7 methodology documentation
|
|
25
|
+
corrections).
|
|
26
|
+
|
|
27
|
+
After v0.48 observes ≥1 consumer cycle, the Round 8 audit STOP-GATE
|
|
28
|
+
opens before the ``v1.0.0`` tag.
|
|
29
|
+
|
|
30
|
+
### BREAKING
|
|
31
|
+
|
|
32
|
+
- **``BootstrapCI.to_dict()`` + ``PairedBootstrapCI.to_dict()`` schema
|
|
33
|
+
rewrite** (§5B). Pre-v0.48 hard-coded a ``"ci_95"`` key regardless of
|
|
34
|
+
the actual ``confidence`` field — the key contradicted the data.
|
|
35
|
+
v0.48 schema is self-describing:
|
|
36
|
+
|
|
37
|
+
Before: ``{"point_estimate": p, "ci_95": [l, h], "confidence": 0.95, ...}``
|
|
38
|
+
After: ``{"point": p, "low": l, "high": h, "confidence": 0.95, ...}``
|
|
39
|
+
|
|
40
|
+
Migration: ``d["point_estimate"]`` → ``d["point"]``; ``d["ci_95"]``
|
|
41
|
+
→ ``(d["low"], d["high"])``. Same rewrite for ``PairedBootstrapCI``.
|
|
42
|
+
- **``sweep()`` schema grows by 1 column** (§5I, Decision R7-B option C).
|
|
43
|
+
New ``strategy_id`` column inserted between ``text_id`` and ``variant``
|
|
44
|
+
carries the canonical per-row identifier built from configured
|
|
45
|
+
kwargs. Callers indexing by column position must re-check offsets.
|
|
46
|
+
- **``sweep()`` rejects duplicate ``strategy_id``** (§5I). Mirrors
|
|
47
|
+
R6-B's duplicate ``MetricSpec.name`` rejection in ``scorecard()``.
|
|
48
|
+
- **``sweep()`` validates scorer output shape** (§5J, Decision R7-C).
|
|
49
|
+
Wrong-shape arrays from ``Scorer.predict_proba`` raise contextual
|
|
50
|
+
``ValueError`` at the boundary. Pre-v0.48: silent truncation
|
|
51
|
+
(overlong), ``IndexError`` (short), or ``TypeError`` (matrix-shaped).
|
|
52
|
+
- **``paired_bootstrap_op_point_diff()`` rejects ``val_y is test_y``**
|
|
53
|
+
(§5E-prep). The two-level bootstrap assumes disjoint val + test
|
|
54
|
+
partitions; passing the same array causes ~63.2% silent overlap.
|
|
55
|
+
|
|
56
|
+
### Added
|
|
57
|
+
|
|
58
|
+
- **``make pre-push``** Makefile target (§5L) running all 3 doc-
|
|
59
|
+
execution surfaces — Sybil-collected ``.md`` fences, MyST-NB example
|
|
60
|
+
notebooks, and in-source ``>>>`` docstring examples. Closes the
|
|
61
|
+
v0.47 Sub-PR 7 incident class.
|
|
62
|
+
- **``nb_execution_raise_on_error = True``** in ``docs/source/conf.py``
|
|
63
|
+
(§5H, Decision R7-A). Docs CI now fails on notebook execution errors.
|
|
64
|
+
- **``.doctest-modules`` expanded** from 11 → 21 modules (§5M).
|
|
65
|
+
|
|
66
|
+
### Changed
|
|
67
|
+
|
|
68
|
+
- **Cross-API shape-validation consistency** (§5N). Every public-API
|
|
69
|
+
surface with array inputs now validates shape + raises ``ValueError``
|
|
70
|
+
with context (rather than leaking low-level numpy/sklearn errors).
|
|
71
|
+
- **Standardized ``ImportError`` messages** across lazy-extras (§5C).
|
|
72
|
+
Canonical template: ``"<feature> requires <pkg>. Install with: pip
|
|
73
|
+
install eval-toolkit[<extra>]"``.
|
|
74
|
+
- **Pin-exact-key-set regression-guards** (§5A) for every dict-returning
|
|
75
|
+
metrics function. Audit revealed no drift; the tests pin existing
|
|
76
|
+
key sets so future drift fails CI loudly.
|
|
77
|
+
- **Docs polish** (§5K + §5E-prep): ``SynonymSubstitution`` whitelist
|
|
78
|
+
``Notes``; ``Scorecard.to_pandas()`` dtype coercion ``Notes``;
|
|
79
|
+
``CostSensitiveSelector`` calibrated-prior ``Warning``; ``cv_clt_ci``
|
|
80
|
+
docstring per Bayle et al. (2020) Theorem 3.1; ``methodology/parallelism.md``
|
|
81
|
+
post-v0.36 state; ``methodology/testing.md`` reference-equivalence-gap
|
|
82
|
+
framing; ``methodology/calibration.md`` 4-binary-adapter family;
|
|
83
|
+
``methodology/bootstrap.md`` disjoint-split example; DeLong docs
|
|
84
|
+
aligned to shipped state (Decision U).
|
|
85
|
+
|
|
86
|
+
### Fixed
|
|
87
|
+
|
|
88
|
+
- **R7-F1**: 6 MyST-NB example notebooks (``docs/source/examples/*.md``)
|
|
89
|
+
migrated to v0.47 API; 4 module-level docstrings rewritten; 5
|
|
90
|
+
drifted ``docs/source/api/*.md`` autosummary lists corrected;
|
|
91
|
+
8 missing ``api/*.md`` pages created; roadmap "Sybil-validated
|
|
92
|
+
examples" wording corrected (§5G).
|
|
93
|
+
- **ADR 0001** (flat-module layout) + **ADR 0003** (stability contract
|
|
94
|
+
+ Gate 3 methodology) finalized for v1.0 (§5E + §5F).
|
|
95
|
+
- **schemas.md** + **methodology/claims.md** + **getting-started.md**:
|
|
96
|
+
``BootstrapCI`` schema references updated for the §5B rewrite.
|
|
97
|
+
|
|
8
98
|
## [0.47.0] — 2026-05-21 — Sweep unification + TextTransform + advanced-6 + cleanup + Round 6 follow-on
|
|
9
99
|
|
|
10
100
|
Second BREAKING minor of the staggered v0.45 → v0.46 → v0.46.1 → v0.47 →
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: eval-toolkit
|
|
3
|
-
Version: 0.47.0
|
|
3
|
+
Version: 0.48.0
|
|
4
4
|
Summary: Reusable evaluation contracts for binary classification: metrics, bootstrap CIs, calibration, artifacts, and evidence gates.
|
|
5
5
|
Project-URL: Homepage, https://github.com/brandon-behring/eval-toolkit
|
|
6
6
|
Project-URL: Documentation, https://brandon-behring.github.io/eval-toolkit/
|
|
@@ -1,9 +1,12 @@
|
|
|
1
1
|
"""eval-toolkit — reusable evaluation contracts for binary classification.
|
|
2
2
|
|
|
3
|
-
|
|
3
|
+
The v1.0 primary metric surface is :func:`~eval_toolkit.scorecard` plus the
|
|
4
|
+
:mod:`~eval_toolkit.metric_specs` namespace (ADR 0002). Submodule paths
|
|
5
|
+
remain available for scalar primitives and adapter authors:
|
|
4
6
|
|
|
5
|
-
from eval_toolkit import
|
|
6
|
-
from eval_toolkit
|
|
7
|
+
from eval_toolkit import scorecard, metric_specs as ms
|
|
8
|
+
from eval_toolkit import bootstrap_ci, BootstrapCI
|
|
9
|
+
from eval_toolkit.metrics import pr_auc # internal API, ADR 0002
|
|
7
10
|
|
|
8
11
|
The package root uses lazy exports so importing ``eval_toolkit`` does not
|
|
9
12
|
eagerly import optional-heavy modules such as plotting, loaders, or harnesses.
|
|
@@ -207,12 +210,15 @@ _EXPORTS: dict[str, str] = {
|
|
|
207
210
|
"SINGLE_CLASS_INCOMPATIBLE_METRICS": "eval_toolkit.metrics",
|
|
208
211
|
"ThresholdResult": "eval_toolkit.metrics",
|
|
209
212
|
"brier_decomposition": "eval_toolkit.metrics",
|
|
210
|
-
# `brier_score`, `pr_auc`, `roc_auc`, and the 5 ECE variants removed
|
|
211
|
-
# `_EXPORTS` at v0.46 (Decision L)
|
|
212
|
-
#
|
|
213
|
-
#
|
|
214
|
-
# submodule (`from eval_toolkit.metrics import pr_auc`
|
|
215
|
-
#
|
|
213
|
+
# `brier_score`, `pr_auc`, `roc_auc`, and the 5 ECE variants were removed
|
|
214
|
+
# from `_EXPORTS` at v0.46 (Decision L); the v0.46 `__getattr__`
|
|
215
|
+
# deprecation branch that kept them reachable with `DeprecationWarning`
|
|
216
|
+
# was removed at v0.47. They now raise `AttributeError` at the top level.
|
|
217
|
+
# The metrics submodule (`from eval_toolkit.metrics import pr_auc`)
|
|
218
|
+
# remains the only stable import path for scalar primitives — internal
|
|
219
|
+
# API per ADR 0002, not part of the v1.0 stability contract. The
|
|
220
|
+
# `scorecard()` + `metric_specs` surface is the primary path going
|
|
221
|
+
# forward (`metric_specs.pr_auc`, `metric_specs.roc_auc`, etc.).
|
|
216
222
|
"headline_metrics": "eval_toolkit.metrics",
|
|
217
223
|
"is_metric_defined_for_slice": "eval_toolkit.metrics",
|
|
218
224
|
"metrics_at_threshold": "eval_toolkit.metrics",
|
|
@@ -272,6 +272,38 @@ class Scorecard(Mapping[str, MetricResult]):
|
|
|
272
272
|
``n_resamples`` + ``method`` so the schema is lossless against
|
|
273
273
|
:meth:`BootstrapCI.to_dict` — trace provenance no longer drops in
|
|
274
274
|
the DataFrame view.
|
|
275
|
+
|
|
276
|
+
Notes
|
|
277
|
+
-----
|
|
278
|
+
**Dtype coercion: ``n_resamples`` is ``float64``, not ``Int64``.**
|
|
279
|
+
``BootstrapCI.n_resamples`` is an ``int`` at the Python level, but
|
|
280
|
+
pandas treats a mixed ``int`` + ``NaN`` column as ``float64`` —
|
|
281
|
+
any row with ``status != "ok"`` or ``bootstrap=False`` carries
|
|
282
|
+
``NaN`` in the CI columns, and NaN forces the whole column to
|
|
283
|
+
floating-point. So ``df["pr_auc"]["n_resamples"].dtype`` is
|
|
284
|
+
``float64``, and individual values read back as e.g. ``1000.0``
|
|
285
|
+
rather than ``1000`` (the trade-off Decision R6-C accepted to
|
|
286
|
+
keep the schema lossless).
|
|
287
|
+
|
|
288
|
+
Consumers expecting strict ``Int64`` semantics (e.g., for joins
|
|
289
|
+
against an integer-typed table, or for SQL emission where
|
|
290
|
+
``float64`` would round-trip as ``DOUBLE``) need to cast
|
|
291
|
+
explicitly *after* dropping NaN rows:
|
|
292
|
+
|
|
293
|
+
::
|
|
294
|
+
|
|
295
|
+
df["pr_auc"]["n_resamples"].dropna().astype("Int64")
|
|
296
|
+
|
|
297
|
+
or cast the column to pandas' nullable integer extension dtype after
|
|
298
|
+
construction::
|
|
299
|
+
|
|
300
|
+
df["pr_auc"]["n_resamples"] = df["pr_auc"]["n_resamples"].astype("Int64")
|
|
301
|
+
|
|
302
|
+
which preserves NaN as ``pd.NA`` and the rest as integer.
|
|
303
|
+
``Scorecard.to_pandas()`` does not perform this coercion by
|
|
304
|
+
default because it would force a pandas-nullable-dtype dependency
|
|
305
|
+
on every consumer; the float64 default works under any pandas
|
|
306
|
+
version.
|
|
275
307
|
"""
|
|
276
308
|
try:
|
|
277
309
|
import pandas as pd
|
|
@@ -103,7 +103,7 @@ def sweep(
|
|
|
103
103
|
>>> from eval_toolkit import DelimitVariant, DatamarkVariant, sweep
|
|
104
104
|
>>> df = sweep([DelimitVariant(), DatamarkVariant()], ["hello world"])
|
|
105
105
|
>>> sorted(df.columns.tolist())
|
|
106
|
-
['text_id', 'transformed_text', 'variant']
|
|
106
|
+
['strategy_id', 'text_id', 'transformed_text', 'variant']
|
|
107
107
|
>>> df[df["variant"] == "delimit"].iloc[0]["transformed_text"]
|
|
108
108
|
'<<hello world>>'
|
|
109
109
|
|
|
@@ -144,6 +144,7 @@ def sweep(
|
|
|
144
144
|
f"sweep(): strategy at index {i} ({type(strategy).__name__}) "
|
|
145
145
|
f"does not satisfy TextTransform (missing 'name' or 'transform')."
|
|
146
146
|
)
|
|
147
|
+
_validate_unique_strategy_ids(strategies)
|
|
147
148
|
|
|
148
149
|
text_list = list(texts)
|
|
149
150
|
rows: list[dict[str, object]] = []
|
|
@@ -153,15 +154,25 @@ def sweep(
|
|
|
153
154
|
original_scores: np.ndarray | None = None
|
|
154
155
|
if scorer is not None and text_list:
|
|
155
156
|
original_scores = np.asarray(scorer.predict_proba(text_list))
|
|
157
|
+
_validate_scorer_output(
|
|
158
|
+
original_scores, expected_n=len(text_list), label="original-texts batch"
|
|
159
|
+
)
|
|
156
160
|
|
|
157
161
|
for strategy in strategies:
|
|
162
|
+
sid = _strategy_id_for(strategy)
|
|
158
163
|
transformed_list = [strategy.transform(t) for t in text_list]
|
|
159
164
|
transformed_scores: np.ndarray | None = None
|
|
160
165
|
if scorer is not None and transformed_list:
|
|
161
166
|
transformed_scores = np.asarray(scorer.predict_proba(transformed_list))
|
|
167
|
+
_validate_scorer_output(
|
|
168
|
+
transformed_scores,
|
|
169
|
+
expected_n=len(text_list),
|
|
170
|
+
label=f"transformed-texts batch for strategy {strategy.name!r}",
|
|
171
|
+
)
|
|
162
172
|
for text_id, (_, transformed) in enumerate(zip(text_list, transformed_list, strict=True)):
|
|
163
173
|
row: dict[str, object] = {
|
|
164
174
|
"text_id": text_id,
|
|
175
|
+
"strategy_id": sid,
|
|
165
176
|
"variant": strategy.name,
|
|
166
177
|
"transformed_text": transformed,
|
|
167
178
|
}
|
|
@@ -176,9 +187,116 @@ def sweep(
|
|
|
176
187
|
row["asr"] = bool(s_orig >= attack_threshold > s_adv)
|
|
177
188
|
rows.append(row)
|
|
178
189
|
|
|
179
|
-
base_cols = ["text_id", "variant", "transformed_text"]
|
|
190
|
+
base_cols = ["text_id", "strategy_id", "variant", "transformed_text"]
|
|
180
191
|
if scorer is not None:
|
|
181
192
|
base_cols += ["original_score", "transformed_score"]
|
|
182
193
|
if attack_threshold is not None:
|
|
183
194
|
base_cols += ["asr"]
|
|
184
195
|
return pd.DataFrame(rows, columns=base_cols)
|
|
196
|
+
|
|
197
|
+
|
|
198
|
+
# ─────────────────────────────────────────────────────────────────────────────
|
|
199
|
+
# Helpers — strategy identity (Decision R7-B; v0.48 §5I)
|
|
200
|
+
# ─────────────────────────────────────────────────────────────────────────────
|
|
201
|
+
|
|
202
|
+
|
|
203
|
+
def _strategy_id_for(strategy: TextTransform) -> str:
|
|
204
|
+
"""Build a stable, repr-stable identifier from a strategy's configured state.
|
|
205
|
+
|
|
206
|
+
Decision R7-B (Round 7 audit, Codex R7-F2): a strategy's ``name`` alone is
|
|
207
|
+
not enough to identify a configured instance. Two instances of the same
|
|
208
|
+
dataclass with different kwargs (e.g., ``DelimitVariant(delimiter="<<")``
|
|
209
|
+
and ``DelimitVariant(delimiter="[[")``) share ``name == "delimit"`` and
|
|
210
|
+
would silently merge under ``groupby("variant")``. The ``strategy_id``
|
|
211
|
+
column carries the canonical configured identity so downstream
|
|
212
|
+
analysis can disambiguate.
|
|
213
|
+
|
|
214
|
+
Format (pseudo-URI; chosen for groupby-friendliness + special-char
|
|
215
|
+
safety via ``repr()``):
|
|
216
|
+
|
|
217
|
+
- Frozen dataclass strategies: ``"<name>/<k1>=<repr(v1)>,<k2>=<repr(v2)>,..."``
|
|
218
|
+
with kwargs alphabetized (excluding the ``name`` field itself). Mirrors
|
|
219
|
+
:func:`eval_toolkit.metric_specs.make_spec_name` but uses ``repr(value)``
|
|
220
|
+
instead of ``str(value)`` so string kwargs with special chars (``<<``,
|
|
221
|
+
``[[``, ``^``, etc.) round-trip cleanly.
|
|
222
|
+
- Plain :class:`TextTransform`-Protocol-satisfying objects without
|
|
223
|
+
``__dataclass_fields__``: falls back to ``strategy.name``.
|
|
224
|
+
|
|
225
|
+
Examples
|
|
226
|
+
--------
|
|
227
|
+
>>> from eval_toolkit.preprocessing import DelimitVariant
|
|
228
|
+
>>> _strategy_id_for(DelimitVariant(delimiter="<<", end=">>"))
|
|
229
|
+
"delimit/delimiter='<<',end='>>'"
|
|
230
|
+
>>> from eval_toolkit.adversarial import ZeroWidthSpaceInjection
|
|
231
|
+
>>> _strategy_id_for(ZeroWidthSpaceInjection(ratio=0.5, seed=42))
|
|
232
|
+
'zero_width_space/ratio=0.5,seed=42'
|
|
233
|
+
"""
|
|
234
|
+
fields = getattr(strategy, "__dataclass_fields__", None)
|
|
235
|
+
if fields is None:
|
|
236
|
+
return strategy.name
|
|
237
|
+
kw_pairs = sorted((f, getattr(strategy, f)) for f in fields if f != "name")
|
|
238
|
+
if not kw_pairs:
|
|
239
|
+
return strategy.name
|
|
240
|
+
return f"{strategy.name}/" + ",".join(f"{k}={v!r}" for k, v in kw_pairs)
|
|
241
|
+
|
|
242
|
+
|
|
243
|
+
def _validate_scorer_output(scores: np.ndarray, *, expected_n: int, label: str) -> None:
|
|
244
|
+
"""Validate the shape of a batched ``Scorer.predict_proba`` result.
|
|
245
|
+
|
|
246
|
+
Decision R7-C (Round 7 audit, Codex R7-F3): three failure modes Codex
|
|
247
|
+
surfaced via runtime probe — too many 1-D scores (silent truncation,
|
|
248
|
+
worst class), too few (later ``IndexError``), and matrix-shaped
|
|
249
|
+
(later ``TypeError`` when ``float(...)`` is applied to a row). All
|
|
250
|
+
three become a single API-level ``ValueError`` with context.
|
|
251
|
+
|
|
252
|
+
Style invariants 1 (no silent failures) + 3 (API-level errors, never
|
|
253
|
+
low-level exceptions through the boundary). Drives Decision R7-C.
|
|
254
|
+
|
|
255
|
+
Parameters
|
|
256
|
+
----------
|
|
257
|
+
scores : np.ndarray
|
|
258
|
+
The ``np.asarray()``-wrapped result of ``scorer.predict_proba(...)``.
|
|
259
|
+
expected_n : int
|
|
260
|
+
The expected length — ``len(texts)`` for the current sweep call.
|
|
261
|
+
label : str
|
|
262
|
+
Context for the error message naming the offending batch
|
|
263
|
+
(e.g., ``"original-texts batch"`` or
|
|
264
|
+
``"transformed-texts batch for strategy 'zero_width_space'"``).
|
|
265
|
+
|
|
266
|
+
Raises
|
|
267
|
+
------
|
|
268
|
+
ValueError
|
|
269
|
+
If ``scores.shape != (expected_n,)``.
|
|
270
|
+
"""
|
|
271
|
+
if scores.shape != (expected_n,):
|
|
272
|
+
raise ValueError(
|
|
273
|
+
f"sweep(): scorer.predict_proba({label}) returned shape "
|
|
274
|
+
f"{scores.shape}; expected ({expected_n},). The Scorer Protocol "
|
|
275
|
+
f"requires one float P(positive) per input row (see "
|
|
276
|
+
f"`eval_toolkit.protocols.Scorer`); ensure your adapter returns "
|
|
277
|
+
f"a 1-D array of length len(texts)."
|
|
278
|
+
)
|
|
279
|
+
|
|
280
|
+
|
|
281
|
+
def _validate_unique_strategy_ids(strategies: Sequence[TextTransform]) -> None:
|
|
282
|
+
"""Reject duplicate ``strategy_id`` values in a single ``sweep()`` call.
|
|
283
|
+
|
|
284
|
+
Decision R7-B (Round 7 audit, Codex R7-F2): mirrors R6-B's duplicate
|
|
285
|
+
``MetricSpec.name`` rejection in ``scorecard()`` — same anti-silent-merge
|
|
286
|
+
invariant, applied to the sweep surface. No methodology-honest reason to
|
|
287
|
+
put the same configured strategy twice in one sweep; cache-warming +
|
|
288
|
+
reproducibility re-runs use ``strategy.transform()`` directly outside
|
|
289
|
+
``sweep()``.
|
|
290
|
+
"""
|
|
291
|
+
seen: dict[str, int] = {}
|
|
292
|
+
for i, strategy in enumerate(strategies):
|
|
293
|
+
sid = _strategy_id_for(strategy)
|
|
294
|
+
if sid in seen:
|
|
295
|
+
raise ValueError(
|
|
296
|
+
f"sweep(): duplicate strategy_id {sid!r} at index {i} "
|
|
297
|
+
f"(previously at index {seen[sid]}); each strategy must "
|
|
298
|
+
f"produce a unique strategy_id. If you want two configurations "
|
|
299
|
+
f"of the same dataclass in the same sweep, vary their kwargs "
|
|
300
|
+
f"so the canonical identifier differs."
|
|
301
|
+
)
|
|
302
|
+
seen[sid] = i
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
"""Adversarial robustness: character-injection bypass suite
|
|
1
|
+
"""Adversarial robustness: 12-technique character-injection bypass suite.
|
|
2
2
|
|
|
3
3
|
Implements the character-injection bypass techniques from Microsoft Research
|
|
4
4
|
2024 ([1]_) for testing prompt-injection-detection scorers under adversarial
|
|
@@ -6,7 +6,7 @@ input perturbation. Each technique is deterministic given a ``seed`` and
|
|
|
6
6
|
preserves the surface meaning of the text from a human reader's perspective
|
|
7
7
|
while shifting the tokenizer / scorer's representation.
|
|
8
8
|
|
|
9
|
-
Core techniques shipped in v0.43.0:
|
|
9
|
+
Core techniques (shipped in v0.43.0):
|
|
10
10
|
|
|
11
11
|
- :class:`ZeroWidthSpaceInjection` — insert U+200B zero-width spaces
|
|
12
12
|
- :class:`HomoglyphSubstitution` — Latin → Cyrillic/Greek lookalikes
|
|
@@ -15,25 +15,24 @@ Core techniques shipped in v0.43.0:
|
|
|
15
15
|
- :class:`CaseRandomization` — random case-flipping per character
|
|
16
16
|
- :class:`PunctuationInjection` — non-semantic punctuation insertion
|
|
17
17
|
|
|
18
|
-
|
|
19
|
-
:class:`~eval_toolkit.protocols.Scorer`-Protocol-compliant scorer and
|
|
20
|
-
returns a DataFrame of
|
|
21
|
-
``(text_id, technique, original_score, transformed_score, asr)``
|
|
22
|
-
for adversarial robustness analysis. ASR (attack success rate) is the
|
|
23
|
-
fraction of inputs where the scorer crossed the threshold from positive
|
|
24
|
-
to negative under the transformation.
|
|
18
|
+
Advanced techniques (shipped in v0.47 per Decision Q11.3):
|
|
25
19
|
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
20
|
+
- :class:`BidiRTLInjection` — U+202E…U+202C override block
|
|
21
|
+
- :class:`TagStrippingInjection` — ``<…>`` tag removal (idempotent)
|
|
22
|
+
- :class:`SynonymSubstitution` — whitelisted-word swap, seed-deterministic
|
|
23
|
+
- :class:`TokenSplitting` — mid-word single-space insertion
|
|
24
|
+
- :class:`UnicodeNormalization` — NFC / NFD / NFKC / NFKD form switch
|
|
25
|
+
- :class:`InvisibleCharsInjection` — 5 invisible code points
|
|
30
26
|
|
|
31
|
-
|
|
32
|
-
|
|
27
|
+
The convenience tuples :data:`CORE_TECHNIQUES` (6-tuple),
|
|
28
|
+
:data:`ADVANCED_TECHNIQUES` (6-tuple), and :data:`ALL_TECHNIQUES`
|
|
29
|
+
(12-tuple = core + advanced) enumerate the suite for sweep callers.
|
|
33
30
|
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
31
|
+
Use the v0.47 top-level :func:`eval_toolkit.sweep` to apply any set of
|
|
32
|
+
:class:`~eval_toolkit.TextTransform` strategies against a corpus (and
|
|
33
|
+
optionally a :class:`~eval_toolkit.protocols.Scorer`); the v0.43–v0.46
|
|
34
|
+
module-level ``sweep()`` function and the ``character_injection``
|
|
35
|
+
``SimpleNamespace`` were removed at v0.47 (Decisions D + K + N).
|
|
37
36
|
|
|
38
37
|
References
|
|
39
38
|
----------
|
|
@@ -469,6 +468,29 @@ class SynonymSubstitution:
|
|
|
469
468
|
Random seed for determinism. Default ``42``.
|
|
470
469
|
name : str, optional
|
|
471
470
|
Override technique name. Default ``"synonym"``.
|
|
471
|
+
|
|
472
|
+
Notes
|
|
473
|
+
-----
|
|
474
|
+
The eligible-word set is the module-level ``_SYNONYMS`` dict, a fixed
|
|
475
|
+
6-entry whitelist hand-curated to preserve semantics:
|
|
476
|
+
|
|
477
|
+
- ``ignore`` → ``disregard``, ``overlook``
|
|
478
|
+
- ``instructions`` → ``directions``, ``guidance``
|
|
479
|
+
- ``system`` → ``framework``, ``platform``
|
|
480
|
+
- ``secret`` → ``private``, ``confidential``
|
|
481
|
+
- ``send`` → ``transmit``, ``forward``
|
|
482
|
+
- ``all`` → ``every``, ``all of``
|
|
483
|
+
|
|
484
|
+
Inputs containing none of those whitelist words are returned unchanged
|
|
485
|
+
— the transform is a no-op on such inputs. This is intentional: the
|
|
486
|
+
technique's invariant is "looks like the original," so the substitution
|
|
487
|
+
deliberately stays small. The resulting no-op can be surprising
|
|
488
|
+
when running ``SynonymSubstitution`` on a corpus that doesn't share
|
|
489
|
+
the prompt-injection vocabulary the whitelist was built from. If you
|
|
490
|
+
need broader substitution, the whitelist isn't extension-friendly
|
|
491
|
+
today — fork the dict at the module level, or treat
|
|
492
|
+
``SynonymSubstitution`` as a reference implementation for your own
|
|
493
|
+
text-transform with a richer table.
|
|
472
494
|
"""
|
|
473
495
|
|
|
474
496
|
ratio: float = 1.0
|
|
@@ -120,10 +120,29 @@ class BootstrapCI:
|
|
|
120
120
|
method: str
|
|
121
121
|
|
|
122
122
|
def to_dict(self) -> dict[str, object]:
|
|
123
|
-
"""Serialize to a stable dict schema for JSON output.
|
|
123
|
+
"""Serialize to a stable, self-describing dict schema for JSON output.
|
|
124
|
+
|
|
125
|
+
v0.48 BREAKING (§5B): schema rewritten to drop the hard-coded
|
|
126
|
+
``"ci_95"`` key that lied when ``confidence != 0.95``. The new
|
|
127
|
+
schema names the bounds neutrally and carries the actual
|
|
128
|
+
confidence level in a dedicated field; consumers can read
|
|
129
|
+
``confidence`` to interpret the bound semantics.
|
|
130
|
+
|
|
131
|
+
Before v0.48:
|
|
132
|
+
{"point_estimate": p, "ci_95": [l, h], "confidence": 0.95,
|
|
133
|
+
"n_resamples": N, "method": "BCa"}
|
|
134
|
+
|
|
135
|
+
v0.48+:
|
|
136
|
+
{"point": p, "low": l, "high": h, "confidence": 0.95,
|
|
137
|
+
"n_resamples": N, "method": "BCa"}
|
|
138
|
+
|
|
139
|
+
Migration: rename ``point_estimate`` → ``point``; replace the
|
|
140
|
+
``ci_95`` list-of-two with separate ``low`` + ``high`` keys.
|
|
141
|
+
"""
|
|
124
142
|
return {
|
|
125
|
-
"
|
|
126
|
-
"
|
|
143
|
+
"point": self.point_estimate,
|
|
144
|
+
"low": self.ci_low,
|
|
145
|
+
"high": self.ci_high,
|
|
127
146
|
"confidence": self.confidence,
|
|
128
147
|
"n_resamples": self.n_resamples,
|
|
129
148
|
"method": self.method,
|
|
@@ -185,10 +204,24 @@ class PairedBootstrapCI:
|
|
|
185
204
|
n_resamples: int
|
|
186
205
|
|
|
187
206
|
def to_dict(self) -> dict[str, object]:
|
|
188
|
-
"""Serialize to a stable dict schema for JSON output.
|
|
207
|
+
"""Serialize to a stable, self-describing dict schema for JSON output.
|
|
208
|
+
|
|
209
|
+
v0.48 BREAKING (§5B): same rewrite as :meth:`BootstrapCI.to_dict`.
|
|
210
|
+
``"ci_95"`` is replaced by ``"low"`` + ``"high"``; ``"confidence"``
|
|
211
|
+
carries the actual level.
|
|
212
|
+
|
|
213
|
+
Before v0.48:
|
|
214
|
+
{"delta": d, "ci_95": [l, h], "overlaps_zero": b,
|
|
215
|
+
"confidence": 0.95, "n_resamples": N}
|
|
216
|
+
|
|
217
|
+
v0.48+:
|
|
218
|
+
{"delta": d, "low": l, "high": h, "overlaps_zero": b,
|
|
219
|
+
"confidence": 0.95, "n_resamples": N}
|
|
220
|
+
"""
|
|
189
221
|
return {
|
|
190
222
|
"delta": self.delta,
|
|
191
|
-
"
|
|
223
|
+
"low": self.ci_low,
|
|
224
|
+
"high": self.ci_high,
|
|
192
225
|
"overlaps_zero": self.overlaps_zero,
|
|
193
226
|
"confidence": self.confidence,
|
|
194
227
|
"n_resamples": self.n_resamples,
|
|
@@ -843,6 +876,21 @@ def paired_bootstrap_op_point_diff(
|
|
|
843
876
|
.. [2] Bouckaert, R. R. "Choosing between two learning algorithms
|
|
844
877
|
based on calibrated tests." ICML 2003.
|
|
845
878
|
"""
|
|
879
|
+
# Defensive identity-guard: the two-level bootstrap resamples val + test
|
|
880
|
+
# indices INDEPENDENTLY (see _paired_bootstrap_op_point_diff_step). Passing
|
|
881
|
+
# the same Python object for val and test causes ~63.2% overlap on each
|
|
882
|
+
# resample, violating the val/test independence assumption that lets the
|
|
883
|
+
# CI absorb threshold-selection variance honestly. Partition the data
|
|
884
|
+
# before calling — see docs/source/methodology/thresholds.md.
|
|
885
|
+
if val_y is test_y:
|
|
886
|
+
raise ValueError(
|
|
887
|
+
"paired_bootstrap_op_point_diff: val_y and test_y are the same array. "
|
|
888
|
+
"The two-level bootstrap requires DISJOINT val + test slices; the "
|
|
889
|
+
"resampler draws val_idx and test_idx independently, so identical "
|
|
890
|
+
"arrays cause ~63.2% overlap and violate the independence assumption. "
|
|
891
|
+
"Partition your data first (e.g., val = arr[:n//2], test = arr[n//2:])."
|
|
892
|
+
)
|
|
893
|
+
|
|
846
894
|
val_y_arr = np.asarray(val_y)
|
|
847
895
|
val_a, val_b = np.asarray(val_score_a), np.asarray(val_score_b)
|
|
848
896
|
test_y_arr = np.asarray(test_y)
|
|
@@ -1157,12 +1205,16 @@ def cv_clt_ci(
|
|
|
1157
1205
|
|
|
1158
1206
|
Computes a confidence interval on the cross-validation mean metric
|
|
1159
1207
|
that correctly accounts for fold-level dependence. The standard
|
|
1160
|
-
"naive" CI (compute std-of-folds then divide by sqrt(K))
|
|
1161
|
-
conservative because the folds share
|
|
1162
|
-
|
|
1163
|
-
coverage
|
|
1208
|
+
"naive" CI (compute std-of-folds then divide by sqrt(K)) had long
|
|
1209
|
+
been suspected to be anti-conservative because the folds share
|
|
1210
|
+
training data. Bayle et al. 2020 prove that the naive sample-variance
|
|
1211
|
+
estimator (with ``ddof=1``) gives valid asymptotic coverage under
|
|
1212
|
+
stability conditions, resolving the historical concern that fold
|
|
1213
|
+
correlation makes it anti-conservative. No additional correction
|
|
1214
|
+
factor is applied.
|
|
1164
1215
|
|
|
1165
|
-
The
|
|
1216
|
+
The variance estimator (Bayle 2020 Theorem 3.1) is just the standard
|
|
1217
|
+
sample variance over per-fold metrics:
|
|
1166
1218
|
|
|
1167
1219
|
.. math::
|
|
1168
1220
|
|
|
@@ -1233,9 +1285,9 @@ def cv_clt_ci(
|
|
|
1233
1285
|
raise ValueError(f"confidence must be in (0, 1), got {confidence}")
|
|
1234
1286
|
|
|
1235
1287
|
point = float(arr.mean())
|
|
1236
|
-
# Bayle 2020 Theorem 3.1
|
|
1237
|
-
#
|
|
1238
|
-
#
|
|
1288
|
+
# Bayle 2020 Theorem 3.1: the naive sample-variance estimator (ddof=1)
|
|
1289
|
+
# gives valid asymptotic coverage under stability conditions — no extra
|
|
1290
|
+
# correction factor is applied for fold correlation.
|
|
1239
1291
|
sigma_hat = float(np.std(arr, ddof=1))
|
|
1240
1292
|
z = _normal_quantile(0.5 + confidence / 2.0)
|
|
1241
1293
|
margin = z * sigma_hat / np.sqrt(K)
|
|
@@ -1258,9 +1310,10 @@ def block_bootstrap_on_folds(
|
|
|
1258
1310
|
) -> BootstrapCI:
|
|
1259
1311
|
r"""Block bootstrap on folds: resample K folds with replacement; percentile CI on mean.
|
|
1260
1312
|
|
|
1261
|
-
Sibling primitive to :func:`cv_clt_ci`. Where :func:`cv_clt_ci`
|
|
1262
|
-
|
|
1263
|
-
|
|
1313
|
+
Sibling primitive to :func:`cv_clt_ci`. Where :func:`cv_clt_ci` relies on
|
|
1314
|
+
Bayle et al. 2020's CV-CLT — the naive sample-variance estimator gives
|
|
1315
|
+
valid asymptotic coverage under stability + fold exchangeability — the
|
|
1316
|
+
block bootstrap is more *conservative* under
|
|
1264
1317
|
fold-level **non-exchangeability** — situations where the K folds are
|
|
1265
1318
|
not interchangeable (e.g., source-disjoint LODO folds where one source
|
|
1266
1319
|
is intrinsically harder than the others). The sensitivity-check
|
|
@@ -356,6 +356,20 @@ def bayes_optimal_threshold(π: float, c_fp: float, c_fn: float) -> float:
|
|
|
356
356
|
|
|
357
357
|
.. math:: t^* = \frac{c_{FP} \cdot (1 - π)}{c_{FP} \cdot (1 - π) + c_{FN} \cdot π}
|
|
358
358
|
|
|
359
|
+
.. warning::
|
|
360
|
+
|
|
361
|
+
This formula assumes ``y_score`` is a calibrated probability with
|
|
362
|
+
respect to a **balanced prior** (or equivalently, a raw likelihood
|
|
363
|
+
ratio). If your scores are calibrated to the deployment prior (e.g.,
|
|
364
|
+
via :func:`fit_platt_binary` on a representative validation set), the
|
|
365
|
+
prior is already incorporated into the score and applying this
|
|
366
|
+
formula will **double-count it**. For deployment-prior-calibrated
|
|
367
|
+
scores, use the simpler prior-independent form
|
|
368
|
+
``t* = c_fp / (c_fp + c_fn)`` (no ``π`` argument) — that's literal
|
|
369
|
+
Elkan 2001 §4. The function in this file is the prior-corrected
|
|
370
|
+
variant for raw / balanced-prior scores; see the Examples for both
|
|
371
|
+
usage patterns.
|
|
372
|
+
|
|
359
373
|
Parameters
|
|
360
374
|
----------
|
|
361
375
|
π : float
|
|
@@ -396,6 +410,29 @@ def bayes_optimal_threshold(π: float, c_fp: float, c_fn: float) -> float:
|
|
|
396
410
|
>>> bayes_optimal_threshold(1.0, c_fp=1.0, c_fn=1.0)
|
|
397
411
|
0.0
|
|
398
412
|
|
|
413
|
+
**Two correct usages, side by side.** The choice depends on what your
|
|
414
|
+
``y_score`` is calibrated to.
|
|
415
|
+
|
|
416
|
+
Usage A — raw or balanced-prior scores (use this function, pass ``π``):
|
|
417
|
+
|
|
418
|
+
>>> # Score from a model trained on a balanced (50/50) corpus, deployed
|
|
419
|
+
>>> # at a 1% positive prior, with FN cost 10× the FP cost.
|
|
420
|
+
>>> t_balanced = bayes_optimal_threshold(0.01, c_fp=1.0, c_fn=10.0)
|
|
421
|
+
>>> round(t_balanced, 4)
|
|
422
|
+
0.9083
|
|
423
|
+
|
|
424
|
+
Usage B — deployment-prior-calibrated scores (skip this function, use
|
|
425
|
+
the literal Elkan 2001 §4 prior-independent form):
|
|
426
|
+
|
|
427
|
+
>>> # Score already calibrated to the 1% deployment prior via
|
|
428
|
+
>>> # fit_platt_binary on a representative val slice — DO NOT pass π
|
|
429
|
+
>>> # to this function (you'd double-count it). Threshold the
|
|
430
|
+
>>> # already-prior-corrected probability against the cost ratio:
|
|
431
|
+
>>> c_fp, c_fn = 1.0, 10.0
|
|
432
|
+
>>> t_calibrated = c_fp / (c_fp + c_fn)
|
|
433
|
+
>>> round(t_calibrated, 4)
|
|
434
|
+
0.0909
|
|
435
|
+
|
|
399
436
|
Notes
|
|
400
437
|
-----
|
|
401
438
|
Symmetric costs (c_fp == c_fn) collapse the formula to t* = 1 - π.
|
|
@@ -407,9 +444,10 @@ def bayes_optimal_threshold(π: float, c_fp: float, c_fn: float) -> float:
|
|
|
407
444
|
*Bayes-calibrated* posterior P(y=1 | x). The formula implemented here
|
|
408
445
|
is the **prior-corrected** form for thresholding raw scores at a known
|
|
409
446
|
deployment prior π, which agrees with Elkan only under symmetric costs.
|
|
410
|
-
For our intended use (deployment prior + asymmetric costs
|
|
411
|
-
prior-corrected form is what the user wants
|
|
412
|
-
be read as "Elkan 2001 cost-sensitive
|
|
447
|
+
For our intended use (deployment prior + asymmetric costs on raw /
|
|
448
|
+
balanced-prior scores) the prior-corrected form is what the user wants
|
|
449
|
+
— but the citation should be read as "Elkan 2001 cost-sensitive
|
|
450
|
+
framework", not literal §4.
|
|
413
451
|
|
|
414
452
|
References
|
|
415
453
|
----------
|
|
@@ -89,7 +89,7 @@ def from_yaml[T](path: Path | str, cls: type[T]) -> T:
|
|
|
89
89
|
import yaml # noqa: PLC0415
|
|
90
90
|
except ImportError as exc:
|
|
91
91
|
raise ImportError(
|
|
92
|
-
"from_yaml requires pyyaml
|
|
92
|
+
"from_yaml requires pyyaml. Install with: pip install eval-toolkit[yaml]"
|
|
93
93
|
) from exc
|
|
94
94
|
|
|
95
95
|
if not is_dataclass(cls):
|
|
@@ -85,7 +85,7 @@ def make_minilm_embedder(
|
|
|
85
85
|
except ImportError as e:
|
|
86
86
|
raise ImportError(
|
|
87
87
|
"make_minilm_embedder requires sentence-transformers. "
|
|
88
|
-
"Install
|
|
88
|
+
"Install with: pip install eval-toolkit[embeddings]"
|
|
89
89
|
) from e
|
|
90
90
|
|
|
91
91
|
# sentence-transformers-active path: excluded from CI coverage
|