eval-toolkit 1.7.0__tar.gz → 1.9.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- eval_toolkit-1.9.0/.claude/agents/README.md +54 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/.gitignore +6 -2
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/CHANGELOG.md +191 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/PKG-INFO +5 -6
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/README.md +4 -5
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/STYLE.md +24 -9
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/pyproject.toml +6 -1
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/src/eval_toolkit/__init__.py +1 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/src/eval_toolkit/__main__.py +4 -4
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/src/eval_toolkit/_sweep.py +21 -5
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/src/eval_toolkit/_version.py +1 -1
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/src/eval_toolkit/analysis.py +57 -8
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/src/eval_toolkit/artifacts.py +5 -2
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/src/eval_toolkit/audit_sister_doc_concept_drift.py +9 -1
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/src/eval_toolkit/audit_value_bindings.py +15 -1
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/src/eval_toolkit/bootstrap.py +371 -41
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/src/eval_toolkit/config.py +1 -1
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/src/eval_toolkit/docs.py +2 -2
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/src/eval_toolkit/eda/distribution_shift.py +28 -7
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/src/eval_toolkit/eda/lexical_association.py +18 -1
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/src/eval_toolkit/plotting.py +1 -1
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/src/eval_toolkit/scorecards.py +21 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/golden/public_api/snapshot.json +9 -3
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_analysis.py +71 -2
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_audit_sister_doc_concept_drift.py +19 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_audit_value_bindings.py +14 -0
- eval_toolkit-1.9.0/tests/test_bootstrap_edge_cases.py +374 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_bootstrap_unit.py +16 -38
- eval_toolkit-1.9.0/tests/test_claude_agents.py +148 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_cluster_bootstrap.py +117 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_config.py +16 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_docs_golden.py +14 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_eda_distribution_shift.py +43 -2
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_eda_lexical_association.py +19 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_public_api.py +1 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_reproducibility_integration.py +7 -2
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_scorecard.py +36 -0
- eval_toolkit-1.9.0/tests/test_stratified_cluster_bootstrap.py +260 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_sweep.py +23 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_v09_contracts.py +1 -1
- eval_toolkit-1.7.0/tests/test_bootstrap_edge_cases.py +0 -185
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/LICENSE +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/docs/archive/README.md +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/docs/research/README.md +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/docs/research/datasets/README.md +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/docs/research/papers/data-integrity/README.md +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/docs/research/papers/eval-ecosystem/README.md +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/docs/research/papers/inference/README.md +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/docs/research/papers/prompt-injection/README.md +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/docs/source/adr/README.md +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/docs/source/methodology/README.md +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/src/eval_toolkit/_deprecated.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/src/eval_toolkit/_narrative.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/src/eval_toolkit/_parallel.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/src/eval_toolkit/_rng.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/src/eval_toolkit/adversarial.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/src/eval_toolkit/audit_citation_alignment.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/src/eval_toolkit/calibration.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/src/eval_toolkit/claims.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/src/eval_toolkit/eda/__init__.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/src/eval_toolkit/eda/data_audit.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/src/eval_toolkit/eda/obfuscation.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/src/eval_toolkit/embeddings.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/src/eval_toolkit/evidence.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/src/eval_toolkit/harness.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/src/eval_toolkit/leakage.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/src/eval_toolkit/loaders.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/src/eval_toolkit/losses.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/src/eval_toolkit/manifest.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/src/eval_toolkit/metric_specs.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/src/eval_toolkit/metrics.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/src/eval_toolkit/operating_points.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/src/eval_toolkit/paths.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/src/eval_toolkit/preprocessing.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/src/eval_toolkit/probes.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/src/eval_toolkit/protocols.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/src/eval_toolkit/provenance.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/src/eval_toolkit/py.typed +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/src/eval_toolkit/schemas/manifest.v1.json +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/src/eval_toolkit/schemas/manifest.v2.json +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/src/eval_toolkit/schemas/manifest.v3.json +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/src/eval_toolkit/schemas/ood_manifest.v1.json +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/src/eval_toolkit/schemas/results.v1.json +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/src/eval_toolkit/schemas/results_full.v1.json +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/src/eval_toolkit/seeds.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/src/eval_toolkit/splits.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/src/eval_toolkit/stacking.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/src/eval_toolkit/text_dedup.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/src/eval_toolkit/thresholds.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/baseline/test_plotting_visual/plot_bootstrap_distribution.png +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/baseline/test_plotting_visual/plot_confusion_matrix_grid.png +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/baseline/test_plotting_visual/plot_lift_ci.png +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/baseline/test_plotting_visual/plot_metric_bars.png +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/baseline/test_plotting_visual/plot_pareto_frontier.png +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/baseline/test_plotting_visual/plot_pr_curve.png +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/baseline/test_plotting_visual/plot_reliability_diagram.png +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/baseline/test_plotting_visual/plot_roc_curve.png +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/baseline/test_plotting_visual/plot_score_histograms.png +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/baseline/test_plotting_visual/plot_slice_metric_heatmap.png +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/benchmarks/__init__.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/benchmarks/test_kernel_benchmarks.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/conftest.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/golden/bootstrap_ci/cases.json +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/golden/data/dedup_holdout.jsonl +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/golden/data/dedup_holdout_expected.json +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/golden/data/dedup_holdout_provenance.md +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/golden/docs/expected.md +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/golden/docs/input.md +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/golden/docs/metrics.json +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/golden/test_dedup_holdout_calibration.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/strategies.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_adversarial.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_artifacts.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_audit_citation_alignment.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_block_bootstrap_on_folds.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_bootstrap_calibration_mc.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_bootstrap_golden.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_bootstrap_njobs.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_bootstrap_props.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_bootstrap_research_grounded.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_calibration_binary_adapters.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_calibration_bootstrap_chain.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_calibration_determinism.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_calibration_optimization_failures.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_calibration_props.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_calibration_research_grounded.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_calibration_unit.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_claims.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_claims_coverage.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_claims_props.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_cli.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_coverage_bootstrap.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_coverage_calibration.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_coverage_harness.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_coverage_metrics.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_coverage_plotting.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_croissant_e2e.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_dedup_split_leakage_chain.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_deprecated_scalars_shim.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_deprecations.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_docs_props.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_eda.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_eda_obfuscation.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_embeddings.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_evidence_validators.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_harness_edge_cases.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_harness_fault_injection.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_harness_folded.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_harness_internals.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_harness_metric_options.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_harness_parallelism.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_harness_smoke.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_import_boundaries.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_is_metric_defined_for_slice.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_lazy_extras_messages.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_leakage.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_leakage_error_paths.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_leakage_props.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_loaders.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_loaders_coverage.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_loaders_props.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_logging.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_losses.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_manifest.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_manifest_contamination_round_trip.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_manifest_props.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_manifest_validation.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_metrics_props.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_metrics_stratified_subsets.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_metrics_unit.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_misc_coverage.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_numeric_edge_cases.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_ood_loader.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_operating_points.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_operating_points_props.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_parallel.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_paths.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_pipeline_e2e.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_plotting_edge.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_plotting_smoke.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_plotting_visual.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_preprocessing.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_probes.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_protocol_conformance.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_provenance.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_recall_at_fpr.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_reference_equivalence.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_rng.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_schemas.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_seeds.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_splits.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_splits_leakage_integration.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_splits_props.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_stacking.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_text_dedup.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_text_dedup_coverage.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_text_dedup_props.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_text_dedup_strategies.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_thresholds.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_thresholds_constant_score.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_thresholds_coverage.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_thresholds_props.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_thresholds_research_grounded.py +0 -0
- {eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/tests/test_tokenization_leakage_check.py +0 -0
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
# eval-toolkit review agents
|
|
2
|
+
|
|
3
|
+
Repo-local Claude Code subagents that enforce the **judgment** the deterministic
|
|
4
|
+
gates can't: SemVer impact, audit-validator architecture, silent failures,
|
|
5
|
+
docstring conformance, and dogfood noise. They are **advisory** — `ruff` / `black` / `mypy` / `pytest` /
|
|
6
|
+
coverage / the public-API snapshot remain the authoritative blocking gates. No
|
|
7
|
+
agent re-runs or replaces them.
|
|
8
|
+
|
|
9
|
+
## The agents
|
|
10
|
+
|
|
11
|
+
| Agent | Catches | Authoritative source |
|
|
12
|
+
|---|---|---|
|
|
13
|
+
| `etk-audit-validator-reviewer` | Three-layer conformance (identity/scope/pairing), `_narrative` reuse, UTF-8 | ADR 0007 / 0005 / 0006, STYLE.md §5 |
|
|
14
|
+
| `etk-api-stability-guardian` | Tier-1/2/3 SemVer class + public-API snapshot regen | ADR 0003, `tests/test_public_api.py`, STYLE.md §17 |
|
|
15
|
+
| `etk-silent-failure-auditor` | NaN/inf finiteness gaps, swallowed exceptions, encoding/IO, non-diagnostic raises | STYLE.md §1 / §6 / §7 |
|
|
16
|
+
| `etk-docstring-conformance-auditor` | NumPy sections, Raises↔code agreement, canonical param names, runnable Examples | STYLE.md §12 / §3a |
|
|
17
|
+
| `etk-dogfood-noise-analyst` | Classifies consumer residuals (real / FP / edge × layer) | runner: `scripts/dogfood_audit.py` |
|
|
18
|
+
|
|
19
|
+
## How to run
|
|
20
|
+
|
|
21
|
+
You never need to remember the names. Either:
|
|
22
|
+
|
|
23
|
+
- **Describe the task** — "review the changes I made to the citation validator" — and the main agent auto-routes by each agent's `description`; or
|
|
24
|
+
- **Run `/review-eval`** — the one handle that fans them out and synthesizes one verdict.
|
|
25
|
+
|
|
26
|
+
```
|
|
27
|
+
/review-eval # diff mode: git diff main...HEAD
|
|
28
|
+
/review-eval --pr 84 # review a GitHub PR diff
|
|
29
|
+
/review-eval --audit # full baseline sweep (whole files, no diff)
|
|
30
|
+
/review-eval --audit api # focused: just the public surface
|
|
31
|
+
/review-eval --audit validators # focused: just audit_*.py + _narrative.py
|
|
32
|
+
/review-eval --audit docstrings # focused: just public docstrings
|
|
33
|
+
/review-eval --refute # adversarial second pass (quote-or-reject)
|
|
34
|
+
/review-eval --ledger # persist a review entry under .claude/reviews/
|
|
35
|
+
```
|
|
36
|
+
|
|
37
|
+
Diff mode prints to the terminal; `--audit` also writes a machine-local entry
|
|
38
|
+
under `.claude/reviews/` (gitignored).
|
|
39
|
+
|
|
40
|
+
## Conventions baked in
|
|
41
|
+
|
|
42
|
+
- **Read-back discipline** — every finding quotes code with `path:line`; no quote, no finding (counters validation-without-reading).
|
|
43
|
+
- **High-confidence only** — plus a `suppressed N low-confidence` footer so nothing is silently dropped.
|
|
44
|
+
- **Hybrid rubric** — a tight inlined checklist that defers to `STYLE.md` / the ADRs as the single source of truth.
|
|
45
|
+
- **Structured verdict** — `PASS / CONCERNS / BLOCK`, per-agent and overall.
|
|
46
|
+
|
|
47
|
+
`tests/test_claude_agents.py` guards these files against pointer-rot (frontmatter
|
|
48
|
+
parses, `name` matches filename, every cited path exists).
|
|
49
|
+
|
|
50
|
+
## Escalation
|
|
51
|
+
|
|
52
|
+
For a full multi-round release audit (fan out N finders → dedup → adversarially
|
|
53
|
+
verify → synthesize a ledger), use the **Workflow** tool, not a single subagent.
|
|
54
|
+
`/review-eval --audit` is the lightweight precursor.
|
|
@@ -72,8 +72,12 @@ codex-comprehensive-audit-*.md
|
|
|
72
72
|
# Contents have historical value but are not part of any release.
|
|
73
73
|
.scratch/
|
|
74
74
|
|
|
75
|
-
# Claude Code
|
|
76
|
-
.claude/
|
|
75
|
+
# Claude Code: settings are machine-local; the review agents + commands are
|
|
76
|
+
# shared, versioned deliverables (see .claude/agents/README.md). Review ledgers
|
|
77
|
+
# written by `/review-eval --ledger` stay local under .claude/reviews/.
|
|
78
|
+
.claude/*
|
|
79
|
+
!.claude/agents/
|
|
80
|
+
!.claude/commands/
|
|
77
81
|
|
|
78
82
|
# mkdocs build output (Section E.1 v0.28.0)
|
|
79
83
|
/site/
|
|
@@ -5,6 +5,197 @@ All notable changes to this project will be documented in this file.
|
|
|
5
5
|
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
|
|
6
6
|
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
|
7
7
|
|
|
8
|
+
## [Unreleased]
|
|
9
|
+
|
|
10
|
+
## [1.9.0] — 2026-06-10 — resample distribution + silent-NaN hardening + UTF-8 batch (#93, #96, #97)
|
|
11
|
+
|
|
12
|
+
### Fixed — pre-tag adversarial-review completion (silent-NaN gaps in `bootstrap_ci` itself)
|
|
13
|
+
|
|
14
|
+
The pre-tag review panel (whole-repo re-audit + independent reviewers +
|
|
15
|
+
self-refutation) found the #96 class of silent-NaN defects surviving in `bootstrap_ci` — the
|
|
16
|
+
most-used entry point, outside #96's enumerated scope:
|
|
17
|
+
|
|
18
|
+
- **Studentized path**: a NaN outer statistic was accepted as a *valid*
|
|
19
|
+
resample, bypassing the 95%-valid gate and poisoning the pivots into a
|
|
20
|
+
silent all-NaN `BootstrapCI` with zero warnings. NaN/inf outer statistics
|
|
21
|
+
(and inner-jackknife LOO values) now count as degenerate draws.
|
|
22
|
+
- **Non-finite CI bounds now raise for ANY method** — the degeneracy check
|
|
23
|
+
was gated to `method="BCa"`, so `percentile` returned NaN bounds with only
|
|
24
|
+
scipy's misdirecting `DegenerateDataWarning` (which always names BCa).
|
|
25
|
+
The finite BCa-collapse case (`ci_low == ci_high == point`) keeps the R9
|
|
26
|
+
`UserWarning` contract. Behavior change: BCa NaN-bounds previously
|
|
27
|
+
warned-and-returned; they now raise (the scorecard/harness per-cell
|
|
28
|
+
isolation converts this into an error/reason cell as before).
|
|
29
|
+
- **Point estimate guarded**: a metric returning NaN on the full data
|
|
30
|
+
previously yielded `point_estimate=nan` beside a finite CI, silently.
|
|
31
|
+
- `analysis.load_prediction_arrays`: labels are now domain-checked before
|
|
32
|
+
the int cast — `dtype=int` coercion silently **truncated** float labels
|
|
33
|
+
(`0.7 → 0`), flipping ground truth with in-domain values no downstream
|
|
34
|
+
gate could catch.
|
|
35
|
+
- Docs: `sweep()` Raises now documents the reachable pandas `ImportError`
|
|
36
|
+
and its Returns lists the always-present `strategy_id` column; STYLE.md
|
|
37
|
+
Tier-2 quick reference corrected to the ten strict Protocols (v1.0.2
|
|
38
|
+
`SimilarityStrategy` promotion).
|
|
39
|
+
- Tests: mutation-verified gap closed (per-stratum NaN filter pinned via a
|
|
40
|
+
non-NaN-propagating `combine`); `samples`↔quantile consistency pinned at
|
|
41
|
+
non-default confidence.
|
|
42
|
+
|
|
43
|
+
### Added — resample-distribution exposure on the cluster bootstraps (#93)
|
|
44
|
+
|
|
45
|
+
Tier-1 strictly-appended optional parameters, SemVer-MINOR per the
|
|
46
|
+
[ADR 0003](docs/source/adr/0003-stability-contract-and-gate3-methodology.md)
|
|
47
|
+
2026-06-10 amendment (#101) — backward-compatible; snapshot regenerated in
|
|
48
|
+
the same commit.
|
|
49
|
+
|
|
50
|
+
- **`cluster_bootstrap_ci(..., return_samples=True)`** and
|
|
51
|
+
**`stratified_cluster_bootstrap_ci(..., return_samples=True)`** attach the
|
|
52
|
+
post-filter bootstrap resample statistics to the result as
|
|
53
|
+
**`BootstrapCI.samples`** (read-only `numpy.ndarray`, the same array the
|
|
54
|
+
percentile bounds are computed from; `shape == (n_resamples_used,)`).
|
|
55
|
+
Distribution summaries such as the consumer's `frac_gt0`
|
|
56
|
+
(`float(np.mean(ci.samples > 0.0))`) are now derivable from the *same*
|
|
57
|
+
draws as the CI — previously structurally unrecoverable, blocking the two
|
|
58
|
+
remaining LODO call-site migrations (downstream DF-11).
|
|
59
|
+
- `BootstrapCI` gains the trailing optional field `samples`
|
|
60
|
+
(default `None`, `compare=False`, `repr=False`): positional construction,
|
|
61
|
+
equality/hash semantics, and the **`to_dict()` schema are all unchanged**.
|
|
62
|
+
Note: `dataclasses.asdict()` (which ignores those flags) now includes a
|
|
63
|
+
`samples` key — consumers serializing via `asdict` instead of `to_dict()`
|
|
64
|
+
should drop it (an attached ndarray is not JSON-serializable).
|
|
65
|
+
- scipy precedent: `BootstrapResult.bootstrap_distribution`. Honors the
|
|
66
|
+
n_jobs bit-for-bit reproducibility contract.
|
|
67
|
+
|
|
68
|
+
### Fixed — silent-NaN hardening batch (#96)
|
|
69
|
+
|
|
70
|
+
Finiteness guards across the numeric surface (STYLE §1 *never fail silently* /
|
|
71
|
+
§7 validation boundary). All are new raises (or error statuses) on garbage
|
|
72
|
+
input that previously produced silently-wrong results:
|
|
73
|
+
|
|
74
|
+
- `cluster_bootstrap_ci` / `stratified_cluster_bootstrap_ci`: NaN/inf scores
|
|
75
|
+
now raise at the validation boundary (previously the per-stratum check was
|
|
76
|
+
shape-only and the result was a silent all-NaN `BootstrapCI`); a non-finite
|
|
77
|
+
point estimate raises with the got-value; resample draws where the statistic
|
|
78
|
+
(or `combine`) **returns** NaN/inf now count toward the >5% degenerate gate
|
|
79
|
+
instead of poisoning the quantile CI (previously only *raising* draws counted).
|
|
80
|
+
- `paired_bootstrap_diff` / `paired_bootstrap_ece_diff` /
|
|
81
|
+
`paired_bootstrap_op_point_diff`: NaN resample deltas now count as degenerate
|
|
82
|
+
draws (same ≤5% tolerance as raising draws — pre-#96 a NaN CI made
|
|
83
|
+
`overlaps_zero` read `False`, i.e. silently "statistically significant");
|
|
84
|
+
a non-finite full-data Δ raises with the got-value; a non-finite-CI-bounds
|
|
85
|
+
raise remains in each constructor as a backstop (mirrors the BCa degeneracy
|
|
86
|
+
guard in `bootstrap_ci`).
|
|
87
|
+
- `scorecard` bootstrap path: BCa-degenerate NaN CI bounds are no longer
|
|
88
|
+
attached to an `"ok"` cell — the CI is dropped and the reason recorded
|
|
89
|
+
(consistent with the existing "bootstrap unavailable" convention).
|
|
90
|
+
- `eda.median_bandwidth`: non-finite input (NaN/inf) now raises at entry —
|
|
91
|
+
NaN bypassed the `sigma <= 0.0` check and escaped as a NaN bandwidth, and a
|
|
92
|
+
NaN row outside the `max_samples` subsample escaped entirely.
|
|
93
|
+
- `eda.maximum_mean_discrepancy`: explicit `bandwidth` must be finite and > 0 —
|
|
94
|
+
`inf` yielded γ = 0 → all-ones Gram → MMD² = 0 → `p_value = 1.0` silently
|
|
95
|
+
reading "no shift".
|
|
96
|
+
- `eda` PAD/MMD/kNN feature matrices are finiteness-checked at the boundary
|
|
97
|
+
(previously NaN embeddings died deep inside sklearn blaming internals).
|
|
98
|
+
- `eda.class_lexical_association`: a `positive_label` matching no label (the
|
|
99
|
+
1-vs-`"1"` type-mismatch trap) or matching every label now raises listing the
|
|
100
|
+
observed label values, instead of returning a documented all-empty result
|
|
101
|
+
that read "no shortcut signal".
|
|
102
|
+
- `scorecard`: a custom `MetricSpec.compute` returning NaN/inf now yields
|
|
103
|
+
`MetricResult(status="error", reason=...)` through the same path as a raising
|
|
104
|
+
compute — previously `status="ok"` with a NaN value.
|
|
105
|
+
- `sweep`: a NaN `attack_threshold` now raises (it silently zeroed every `asr`
|
|
106
|
+
flag); `±inf` remains a documented unsatisfiable sentinel.
|
|
107
|
+
- `analysis.JsonlPredictionReader`: a row missing (or holding `null` for) a declared
|
|
108
|
+
column key now fails fast with file + row + key context (the R8-F3 pattern
|
|
109
|
+
already applied to CSV headers) — previously a missing score coerced to NaN
|
|
110
|
+
deep in the metric computation and a missing label died as a bare `TypeError`.
|
|
111
|
+
A malformed JSON row now reports the actual file row (raw `json.JSONDecodeError`
|
|
112
|
+
always said "line 1"). `analysis.load_prediction_arrays` additionally rejects
|
|
113
|
+
non-finite loaded scores (a bare JSON `NaN` token or a CSV `"nan"` cell passes
|
|
114
|
+
per-row key checks) with file + column + row-index context.
|
|
115
|
+
|
|
116
|
+
### Fixed — explicit UTF-8 encoding batch (#97)
|
|
117
|
+
|
|
118
|
+
Windows (cp1252 locale codec) is the trigger; Linux/macOS hid all of these.
|
|
119
|
+
Locked convention: always pass `encoding="utf-8"` on text-file IO.
|
|
120
|
+
|
|
121
|
+
- `docs.render_files` **apply mode** read and wrote consumer markdown with the
|
|
122
|
+
locale codec — on cp1252 this silently and *cumulatively* corrupted
|
|
123
|
+
non-ASCII user content on every apply (the worst item in the batch).
|
|
124
|
+
- All remaining text IO made explicit: `__main__` schema/payload reads
|
|
125
|
+
(RFC 8259 mandates UTF-8 for JSON), `analysis` CSV/JSONL prediction readers,
|
|
126
|
+
`config.from_yaml`, `artifacts` schema read + report write,
|
|
127
|
+
`plotting` sidecar write, `scripts/audit_raises_sections.py`.
|
|
128
|
+
- `scripts/dogfood_audit.py`: a surface file skipped for `UnicodeDecodeError`
|
|
129
|
+
now emits a stderr warning with the path — previously it silently vanished
|
|
130
|
+
from the acceptance evidence.
|
|
131
|
+
- **Bug class locked out permanently**: ruff now enforces `PLW1514`
|
|
132
|
+
(implicit-encoding) across `src/`, `scripts/`, and `tests/` via
|
|
133
|
+
`preview = true` + `explicit-preview-rules = true` (only this rule gets
|
|
134
|
+
preview status; no other behavior changes).
|
|
135
|
+
|
|
136
|
+
### Fixed
|
|
137
|
+
|
|
138
|
+
- `audit_value_bindings.validate_reader_value_bindings` now raises a
|
|
139
|
+
diagnostic `ValueError` when a scanned file is not valid UTF-8, instead
|
|
140
|
+
of letting an unguarded `read_text(encoding="utf-8")` abort the run with
|
|
141
|
+
a bare `UnicodeDecodeError`. Documented in the function's `Raises` section.
|
|
142
|
+
- `audit_sister_doc_concept_drift.validate_sister_doc_concept_drift` now
|
|
143
|
+
skips non-UTF-8 files with a `warnings.warn` instead of crashing — its
|
|
144
|
+
prior `except OSError` did not catch `UnicodeDecodeError` (a `ValueError`,
|
|
145
|
+
not an `OSError`), so a single non-UTF-8 byte aborted the whole scan.
|
|
146
|
+
|
|
147
|
+
### Internal
|
|
148
|
+
|
|
149
|
+
- `scripts/` is now covered by `ruff` / `black` / `mypy` across all runners
|
|
150
|
+
(`Makefile`, `ci.yml`, `.pre-commit-config.yaml`, `tox.ini`, `noxfile.py`).
|
|
151
|
+
|
|
152
|
+
### Fixed — documentation/config consistency batch (2026-06-09 full-repo audit)
|
|
153
|
+
|
|
154
|
+
- [ADR 0003](docs/source/adr/0003-stability-contract-and-gate3-methodology.md)
|
|
155
|
+
amended: records the v1.0.2 `SimilarityStrategy` promotion (strict
|
|
156
|
+
Tier-2 count 9 → 10) in the Tier-1 Protocol list, and replaces the
|
|
157
|
+
unimplemented `STRICT_DOCSTRINGS` plan with the actual contract
|
|
158
|
+
(docstring first lines remain pinned through v1.x).
|
|
159
|
+
- `SimilarityStrategy` registered in
|
|
160
|
+
`tests/test_public_api.py::_TIER2_PROTOCOLS` — the R6-D fail-fast
|
|
161
|
+
list had lagged the v1.0.2 promotion.
|
|
162
|
+
- `docs/source/roadmap.md` post-v1.0 section refreshed to the v1.8.0
|
|
163
|
+
state (was still "v1.0.1 is the next minor"; the referenced
|
|
164
|
+
`v1.0.1 cleanup` issue #76 closed at v1.0.2); broken repo-relative
|
|
165
|
+
link to a machine-local planning document removed.
|
|
166
|
+
- STYLE.md §17 example updated — `pr_auc` left the top level at v0.46
|
|
167
|
+
(Decision L); the example now uses `scorecard`. README Tier-2 box
|
|
168
|
+
disambiguated (10 strict Protocols vs `SliceAwareScorer`/`Versioned`).
|
|
169
|
+
- CONTRIBUTING.md: corrected the `[dev]`-extra claim (heavy optional
|
|
170
|
+
stacks `embeddings`/`transformers`/`probes`/`losses` are not
|
|
171
|
+
included) and documented the docs-extra requirement for `pre-push`.
|
|
172
|
+
|
|
173
|
+
### Internal
|
|
174
|
+
|
|
175
|
+
- `make test` now collects all three doc-execution surfaces — the
|
|
176
|
+
positional `tests` arg silently bypassed pyproject `testpaths`,
|
|
177
|
+
skipping the 161 README/docs Sybil doc tests (v0.47 §5L incident
|
|
178
|
+
class). `make install` installs `.[dev,docs]` so the sphinx
|
|
179
|
+
pre-push gate works on a fresh environment.
|
|
180
|
+
- CI coverage step excludes `-m integration` (aligns ci.yml with the
|
|
181
|
+
pyproject marker contract and the Makefile coverage target).
|
|
182
|
+
- tox/nox aligned with `requires-python = ">=3.13"`: py313-only
|
|
183
|
+
envlist/`PY_VERSIONS`, monte_carlo/benchmark/integration marker
|
|
184
|
+
exclusions added to their pytest commands, stale "private and
|
|
185
|
+
home-designed" framing removed; Makefile help text and §5H
|
|
186
|
+
notebook-gate comments updated to current reality.
|
|
187
|
+
|
|
188
|
+
## [1.8.0] — 2026-06-04 — composite multi-stratum cluster bootstrap (#92)
|
|
189
|
+
|
|
190
|
+
### Added — `bootstrap.stratified_cluster_bootstrap_ci` (composite multi-stratum cluster bootstrap)
|
|
191
|
+
|
|
192
|
+
`eval_toolkit.bootstrap` ADDITIVE per [ADR 0003](docs/source/adr/0003-stability-contract-and-gate3-methodology.md) — backward-compatible. Generalises the v1.7.0 single-block `cluster_bootstrap_ci` to the shape leave-one-group-out transfer gaps actually take: a **composite statistic reduced over several independently-resampled cluster strata**.
|
|
193
|
+
|
|
194
|
+
- **`stratified_cluster_bootstrap_ci(strata, per_stratum_metric, combine, *, resample_labels=(0,1), …)`** — `strata` is a mapping `{key: (y, score, groups)}` of independent resample-units (e.g. `seed`, `(carrier, seed)`, `(attack_type, seed)`); each bootstrap iteration resamples every stratum's `(label, group)` clusters, computes `per_stratum_metric` on each, and reduces the `{key: metric}` map with `combine` to one scalar (a seed-averaged ROC-AUC gap, a mean-over-carriers gap, a top−bottom per-type AUPRC contrast, …). Percentile `BootstrapCI` (`method="stratified_cluster_percentile"`). `cluster_bootstrap_ci` is the single-stratum, identity-reduce special case.
|
|
195
|
+
- **Why:** the v1.7.0 single-block primitive could not express the **seed-averaging** that real LODO estimators do inside the bootstrap (`Gx = val − mean_seed(test_roc)`), so it did not actually fit the consumer portfolio's attack-type / carrier / dialect bootstraps. This is the correct primitive for them.
|
|
196
|
+
- **Parallel + reproducible:** built on `parallel_map` + `spawn_seed_sequences` ⇒ `n_jobs` gives bit-for-bit-identical CIs; `n_jobs=-1` all cores.
|
|
197
|
+
- Exported via `from eval_toolkit import stratified_cluster_bootstrap_ci`; `__all__` + `_EXPORTS` + doctest + n_jobs-reproducibility / seed-averaged / composite-statistic tests; mypy-strict clean.
|
|
198
|
+
|
|
8
199
|
## [1.7.0] — 2026-06-04 — label-stratified cluster bootstrap (#90, #91)
|
|
9
200
|
|
|
10
201
|
### Added — `bootstrap.cluster_bootstrap_ci` (label-stratified cluster bootstrap)
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: eval-toolkit
|
|
3
|
-
Version: 1.7.0
|
|
3
|
+
Version: 1.9.0
|
|
4
4
|
Summary: Reusable evaluation contracts for binary classification: metrics, bootstrap CIs, calibration, artifacts, and evidence gates.
|
|
5
5
|
Project-URL: Homepage, https://github.com/brandon-behring/eval-toolkit
|
|
6
6
|
Project-URL: Documentation, https://brandon-behring.github.io/eval-toolkit/
|
|
@@ -116,11 +116,10 @@ format changes.
|
|
|
116
116
|
│ manifest.json + seeds + git_sha + data_hashes + │
|
|
117
117
|
│ gpu_info + leakage_report (NeurIPS-aligned) │
|
|
118
118
|
├─ Tier 2 ─ Protocol-based orchestration ────────────────┤
|
|
119
|
-
│ Scorer /
|
|
120
|
-
│
|
|
121
|
-
│
|
|
122
|
-
│
|
|
123
|
-
│ Versioned (opt-in: per-object versions in manifest) │
|
|
119
|
+
│ Scorer / LeakageCheck / Splitter / ThresholdSelector │
|
|
120
|
+
│ DatasetLoader / MetricSpec / MetaLearner / Probe / │
|
|
121
|
+
│ TextTransform / SimilarityStrategy (10 strict) │
|
|
122
|
+
│ SliceAwareScorer / Versioned (outside the 10 strict) │
|
|
124
123
|
├─ Tier 1 ─ Functional core ─────────────────────────────┤
|
|
125
124
|
│ pr_auc / roc_auc / ECE variants / Brier / bootstrap_ci│
|
|
126
125
|
│ paired_bootstrap_diff / cv_clt_ci / mde_from_ci │
|
|
@@ -30,11 +30,10 @@ format changes.
|
|
|
30
30
|
│ manifest.json + seeds + git_sha + data_hashes + │
|
|
31
31
|
│ gpu_info + leakage_report (NeurIPS-aligned) │
|
|
32
32
|
├─ Tier 2 ─ Protocol-based orchestration ────────────────┤
|
|
33
|
-
│ Scorer /
|
|
34
|
-
│
|
|
35
|
-
│
|
|
36
|
-
│
|
|
37
|
-
│ Versioned (opt-in: per-object versions in manifest) │
|
|
33
|
+
│ Scorer / LeakageCheck / Splitter / ThresholdSelector │
|
|
34
|
+
│ DatasetLoader / MetricSpec / MetaLearner / Probe / │
|
|
35
|
+
│ TextTransform / SimilarityStrategy (10 strict) │
|
|
36
|
+
│ SliceAwareScorer / Versioned (outside the 10 strict) │
|
|
38
37
|
├─ Tier 1 ─ Functional core ─────────────────────────────┤
|
|
39
38
|
│ pr_auc / roc_auc / ECE variants / Brier / bootstrap_ci│
|
|
40
39
|
│ paired_bootstrap_diff / cv_clt_ci / mde_from_ci │
|
|
@@ -1,8 +1,8 @@
|
|
|
1
1
|
# eval-toolkit — Coding Standards
|
|
2
2
|
|
|
3
|
-
Self-contained
|
|
4
|
-
|
|
5
|
-
here.
|
|
3
|
+
Self-contained quick reference for this repository. The ADRs
|
|
4
|
+
(`docs/source/adr/`) are the authoritative source for the decisions summarized
|
|
5
|
+
here; everything needed for day-to-day contribution lives in this file.
|
|
6
6
|
|
|
7
7
|
## 1. Foundational principles
|
|
8
8
|
|
|
@@ -27,7 +27,7 @@ here.
|
|
|
27
27
|
| Formatter | `black`, line length 100 |
|
|
28
28
|
| Linter | `ruff` with `select = ["E", "W", "F", "I", "N", "UP", "B", "SIM", "C4"]`, ignore `E501` (Black handles), `N803`/`N806` (math identifiers) |
|
|
29
29
|
| Type checker | `mypy` strict (`disallow_untyped_defs`, `disallow_incomplete_defs`, `check_untyped_defs`, `no_implicit_optional`, `warn_redundant_casts`, `warn_unused_ignores`, `warn_no_return`, `strict_equality`, `warn_return_any`) |
|
|
30
|
-
| Test runner | `pytest` with markers `unit`, `property`, `smoke`, `golden`; coverage floor `
|
|
30
|
+
| Test runner | `pytest` with markers `unit`, `property`, `smoke`, `golden`; coverage floor `92%` |
|
|
31
31
|
| Build backend | `hatchling` |
|
|
32
32
|
| Env manager | `uv` (`uv venv` → `.venv/`; `uv pip install -e .[dev]`) |
|
|
33
33
|
| Python | `>=3.13` (RunPod parity floor; py313 tool targets in pyproject.toml) |
|
|
@@ -130,7 +130,15 @@ Examples:
|
|
|
130
130
|
required.
|
|
131
131
|
- `from __future__ import annotations` only when forward refs require it.
|
|
132
132
|
- `Protocol` only at "real seams" — where two or more concrete implementations
|
|
133
|
-
exist or are planned.
|
|
133
|
+
exist or are planned. The authoritative Tier-2-stable set is `_TIER2_PROTOCOLS` in
|
|
134
|
+
`tests/test_public_api.py` plus
|
|
135
|
+
[ADR 0003](docs/source/adr/0003-stability-contract-and-gate3-methodology.md):
|
|
136
|
+
the ten strict Tier-2 Protocols are `Scorer`, `LeakageCheck`, `Splitter`,
|
|
137
|
+
`ThresholdSelector`, `DatasetLoader`, `MetricSpec`, `MetaLearner`, `Probe`,
|
|
138
|
+
`TextTransform`, and `SimilarityStrategy` (promoted 10th at v1.0.2, #76
|
|
139
|
+
RC2). The seams below are illustrative detail — `SliceAwareScorer` is an
|
|
140
|
+
opt-in subprotocol of `Scorer`, and `Versioned` is a real seam that is
|
|
141
|
+
**not** in the Tier-2 frozenset:
|
|
134
142
|
- `Scorer` + `SliceAwareScorer` (`harness.py`) — anything with
|
|
135
143
|
`predict_proba(X) -> np.ndarray`. `SliceAwareScorer` adds opt-in
|
|
136
144
|
`should_score_slice(name)` for cost-controlled skipping.
|
|
@@ -251,7 +259,11 @@ Local imports inside functions are allowed for:
|
|
|
251
259
|
## 11. Logging
|
|
252
260
|
|
|
253
261
|
Use `logging` (library context — consumers configure handlers). Do not use
|
|
254
|
-
`print` in `src/eval_toolkit/`.
|
|
262
|
+
`print` in `src/eval_toolkit/`. Log levels: `DEBUG` for internal events; `INFO`
|
|
263
|
+
only for the rare user-relevant harness progress signal; **`WARNING` is reserved
|
|
264
|
+
for `warnings.warn(...)`, not `logger.warning(...)`**; and **`ERROR` must not
|
|
265
|
+
appear in library code — raise an exception instead**. See CONTRIBUTING.md
|
|
266
|
+
§Logging for the full rationale.
|
|
255
267
|
|
|
256
268
|
## 12. Docstrings
|
|
257
269
|
|
|
@@ -333,7 +345,7 @@ restate what the code says.
|
|
|
333
345
|
`hypothesis.extra.numpy` for arrays.
|
|
334
346
|
- **Golden tests** only for `docs.py`, where the output is the contract.
|
|
335
347
|
- **Doctests** for math/algorithmic kernels.
|
|
336
|
-
- **Coverage floor**:
|
|
348
|
+
- **Coverage floor**: 92%.
|
|
337
349
|
- **`assert` is fine in tests.**
|
|
338
350
|
|
|
339
351
|
## 15. Packaging
|
|
@@ -359,6 +371,9 @@ restate what the code says.
|
|
|
359
371
|
|
|
360
372
|
- Every module declares `__all__`.
|
|
361
373
|
- The package's `__init__.py` re-exports the public surface so both
|
|
362
|
-
`from eval_toolkit import
|
|
363
|
-
work — matches
|
|
374
|
+
`from eval_toolkit import scorecard` and
|
|
375
|
+
`from eval_toolkit.scorecards import scorecard` work — matches
|
|
376
|
+
sklearn/pandas/scipy convention. (Threshold-dependent scalar metrics
|
|
377
|
+
such as `pr_auc` left the top level at v0.46 Decision L — import
|
|
378
|
+
them from `eval_toolkit.metrics`.)
|
|
364
379
|
- Private helpers are prefixed with `_` and not re-exported.
|
|
@@ -160,7 +160,12 @@ line-length = 100
|
|
|
160
160
|
target-version = "py313"
|
|
161
161
|
|
|
162
162
|
[tool.ruff.lint]
|
|
163
|
-
|
|
163
|
+
# preview + explicit-preview-rules: enable ONLY the explicitly selected
|
|
164
|
+
# preview rules (PLW1514 implicit-encoding, #97) — no other preview-mode
|
|
165
|
+
# behavior changes. Locks the Windows-cp1252 mojibake class out permanently.
|
|
166
|
+
preview = true
|
|
167
|
+
explicit-preview-rules = true
|
|
168
|
+
select = ["E", "F", "W", "I", "N", "UP", "B", "SIM", "C4", "PLW1514"]
|
|
164
169
|
ignore = [
|
|
165
170
|
"E501", # line length handled by black
|
|
166
171
|
"N803", # function arg lowercase — math kernels use π, T, etc. per Decision 14
|
|
@@ -140,6 +140,7 @@ _EXPORTS: dict[str, str] = {
|
|
|
140
140
|
"paired_bootstrap_ece_diff": "eval_toolkit.bootstrap",
|
|
141
141
|
"paired_bootstrap_op_point_diff": "eval_toolkit.bootstrap",
|
|
142
142
|
"paired_mde": "eval_toolkit.bootstrap",
|
|
143
|
+
"stratified_cluster_bootstrap_ci": "eval_toolkit.bootstrap",
|
|
143
144
|
# --- calibration ---
|
|
144
145
|
"DEFAULT_FN_COST": "eval_toolkit.calibration",
|
|
145
146
|
"DEFAULT_FP_COST": "eval_toolkit.calibration",
|
|
@@ -47,7 +47,7 @@ def _cmd_schemas_show(args: argparse.Namespace) -> int:
|
|
|
47
47
|
else:
|
|
48
48
|
print(f"unknown schema: {name}", file=sys.stderr)
|
|
49
49
|
return 2
|
|
50
|
-
print(json.dumps(json.loads(candidate.read_text()), indent=2, sort_keys=True))
|
|
50
|
+
print(json.dumps(json.loads(candidate.read_text(encoding="utf-8")), indent=2, sort_keys=True))
|
|
51
51
|
return 0
|
|
52
52
|
|
|
53
53
|
|
|
@@ -73,7 +73,7 @@ def _cmd_schemas_check(_args: argparse.Namespace) -> int:
|
|
|
73
73
|
failures: list[str] = []
|
|
74
74
|
for f in files:
|
|
75
75
|
try:
|
|
76
|
-
schema = json.loads(f.read_text())
|
|
76
|
+
schema = json.loads(f.read_text(encoding="utf-8"))
|
|
77
77
|
Draft202012Validator.check_schema(schema)
|
|
78
78
|
print(f" {f.name}: OK")
|
|
79
79
|
except (json.JSONDecodeError, SchemaError) as exc:
|
|
@@ -109,8 +109,8 @@ def _cmd_validate(args: argparse.Namespace) -> int:
|
|
|
109
109
|
if not file_path.exists():
|
|
110
110
|
print(f"file not found: {args.file}", file=sys.stderr)
|
|
111
111
|
return 2
|
|
112
|
-
schema = json.loads(schema_path.read_text())
|
|
113
|
-
payload = json.loads(file_path.read_text())
|
|
112
|
+
schema = json.loads(schema_path.read_text(encoding="utf-8"))
|
|
113
|
+
payload = json.loads(file_path.read_text(encoding="utf-8"))
|
|
114
114
|
import jsonschema as _js # noqa: PLC0415
|
|
115
115
|
|
|
116
116
|
try:
|
|
@@ -72,15 +72,18 @@ def sweep(
|
|
|
72
72
|
**Required to materialize ``asr``** — the documented contract refuses
|
|
73
73
|
a magic default threshold (cf. ``methodology/thresholds.md``).
|
|
74
74
|
Ignored when ``scorer`` is ``None`` (with ``ValueError`` if passed
|
|
75
|
-
with ``scorer=None`` to surface the API misuse).
|
|
75
|
+
with ``scorer=None`` to surface the API misuse). Must not be NaN
|
|
76
|
+
(every ``asr`` flag would silently be ``False``); ``±inf`` is
|
|
77
|
+
accepted as a deliberately unsatisfiable sentinel.
|
|
76
78
|
|
|
77
79
|
Returns
|
|
78
80
|
-------
|
|
79
81
|
pandas.DataFrame
|
|
80
82
|
Columns vary by which optional kwargs are passed:
|
|
81
83
|
|
|
82
|
-
- Always: ``text_id`` (int), ``
|
|
83
|
-
|
|
84
|
+
- Always: ``text_id`` (int), ``strategy_id`` (str —
|
|
85
|
+
configured-instance identity, Decision R7-B), ``variant`` (str —
|
|
86
|
+
from ``strategy.name``), ``transformed_text`` (str).
|
|
84
87
|
- With ``scorer``: also ``original_score`` (float) +
|
|
85
88
|
``transformed_score`` (float).
|
|
86
89
|
- With ``scorer`` AND ``attack_threshold``: also ``asr`` (bool —
|
|
@@ -90,9 +93,12 @@ def sweep(
|
|
|
90
93
|
|
|
91
94
|
Raises
|
|
92
95
|
------
|
|
96
|
+
ImportError
|
|
97
|
+
If pandas is not installed (install the ``dataframe`` extra).
|
|
93
98
|
ValueError
|
|
94
99
|
- If ``strategies`` is empty.
|
|
95
100
|
- If ``attack_threshold`` is provided without ``scorer``.
|
|
101
|
+
- If ``attack_threshold`` is NaN.
|
|
96
102
|
- If any strategy doesn't satisfy ``TextTransform`` structurally
|
|
97
103
|
(typically a missing ``name`` attribute).
|
|
98
104
|
|
|
@@ -138,6 +144,13 @@ def sweep(
|
|
|
138
144
|
"Either pass scorer=<scorer> + attack_threshold=<float>, "
|
|
139
145
|
"or omit attack_threshold."
|
|
140
146
|
)
|
|
147
|
+
# NaN comparisons are all False, so a NaN threshold would silently zero
|
|
148
|
+
# every asr flag. (±inf is semantically valid: an unsatisfiable sentinel.)
|
|
149
|
+
if attack_threshold is not None and np.isnan(attack_threshold):
|
|
150
|
+
raise ValueError(
|
|
151
|
+
"sweep(): attack_threshold is NaN — every asr flag would be False. "
|
|
152
|
+
"Pass a finite threshold (or ±inf as an unsatisfiable sentinel)."
|
|
153
|
+
)
|
|
141
154
|
for i, strategy in enumerate(strategies):
|
|
142
155
|
if not (hasattr(strategy, "name") and hasattr(strategy, "transform")):
|
|
143
156
|
raise ValueError(
|
|
@@ -177,8 +190,11 @@ def sweep(
|
|
|
177
190
|
"transformed_text": transformed,
|
|
178
191
|
}
|
|
179
192
|
if scorer is not None:
|
|
180
|
-
|
|
181
|
-
|
|
193
|
+
if original_scores is None or transformed_scores is None: # pragma: no cover
|
|
194
|
+
raise RuntimeError(
|
|
195
|
+
"sweep(): internal invariant violated — batch scores not "
|
|
196
|
+
"materialized despite scorer being set"
|
|
197
|
+
)
|
|
182
198
|
s_orig = float(original_scores[text_id])
|
|
183
199
|
s_adv = float(transformed_scores[text_id])
|
|
184
200
|
row["original_score"] = s_orig
|
|
@@ -66,7 +66,7 @@ class CsvPredictionReader:
|
|
|
66
66
|
"""
|
|
67
67
|
wanted = set(columns.values())
|
|
68
68
|
out: dict[str, list[object]] = {col: [] for col in wanted}
|
|
69
|
-
with Path(uri).open(newline="") as fh:
|
|
69
|
+
with Path(uri).open(newline="", encoding="utf-8") as fh:
|
|
70
70
|
reader = csv.DictReader(fh)
|
|
71
71
|
# R8-F3: validate the header up-front so missing columns
|
|
72
72
|
# surface as a clear ValueError rather than as a cryptic
|
|
@@ -93,14 +93,40 @@ class JsonlPredictionReader:
|
|
|
93
93
|
*,
|
|
94
94
|
columns: Mapping[str, str],
|
|
95
95
|
) -> Mapping[str, Sequence[object]]:
|
|
96
|
-
"""Read a local JSONL file.
|
|
96
|
+
"""Read a local JSONL file.
|
|
97
|
+
|
|
98
|
+
Raises
|
|
99
|
+
------
|
|
100
|
+
ValueError
|
|
101
|
+
If any non-blank row is not valid JSON, or is missing (or has
|
|
102
|
+
``null`` for) a key declared in the ``columns`` mapping.
|
|
103
|
+
Validated at read time — the R8-F3 pattern already applied to
|
|
104
|
+
CSV headers — so a missing ``score`` key surfaces with the file
|
|
105
|
+
path + row number instead of being coerced to NaN deep inside
|
|
106
|
+
the metric computation (or, for ``label``, dying as a
|
|
107
|
+
context-free ``TypeError``).
|
|
108
|
+
"""
|
|
97
109
|
wanted = set(columns.values())
|
|
98
110
|
out: dict[str, list[object]] = {col: [] for col in wanted}
|
|
99
|
-
with Path(uri).open() as fh:
|
|
100
|
-
for line in fh:
|
|
111
|
+
with Path(uri).open(encoding="utf-8") as fh:
|
|
112
|
+
for line_no, line in enumerate(fh, start=1):
|
|
101
113
|
if not line.strip():
|
|
102
114
|
continue
|
|
103
|
-
|
|
115
|
+
try:
|
|
116
|
+
row = json.loads(line)
|
|
117
|
+
except json.JSONDecodeError as exc:
|
|
118
|
+
# json.loads on a single line always reports "line 1",
|
|
119
|
+
# actively misdirecting on which file row is broken.
|
|
120
|
+
raise ValueError(
|
|
121
|
+
f"JSONL file at {uri!r} row {line_no} is not valid JSON: {exc}"
|
|
122
|
+
) from exc
|
|
123
|
+
missing = sorted(col for col in wanted if row.get(col) is None)
|
|
124
|
+
if missing:
|
|
125
|
+
raise ValueError(
|
|
126
|
+
f"JSONL file at {uri!r} row {line_no} is missing required "
|
|
127
|
+
f"key(s) {missing} (or they are null); "
|
|
128
|
+
f"available keys: {sorted(row)}"
|
|
129
|
+
)
|
|
104
130
|
for col in wanted:
|
|
105
131
|
out[col].append(row.get(col))
|
|
106
132
|
return out
|
|
@@ -117,8 +143,13 @@ def load_prediction_arrays(
|
|
|
117
143
|
------
|
|
118
144
|
ValueError
|
|
119
145
|
If ``ref`` lacks a ``columns`` mapping, lacks a non-empty ``uri``,
|
|
120
|
-
|
|
121
|
-
|
|
146
|
+
its ``columns`` mapping is missing the ``label`` / ``score`` keys
|
|
147
|
+
(re-raised from :func:`_required_column`), the loaded scores
|
|
148
|
+
contain non-finite values (a bare ``NaN`` token in JSONL or a
|
|
149
|
+
``"nan"`` cell in CSV passes the readers' per-row key checks but
|
|
150
|
+
must not flow into metrics as a silent NaN), or the loaded labels
|
|
151
|
+
are not all in ``{0, 1}`` (an int cast would silently truncate
|
|
152
|
+
``0.7 → 0``, flipping ground truth).
|
|
122
153
|
"""
|
|
123
154
|
columns = ref.get("columns")
|
|
124
155
|
if not isinstance(columns, Mapping):
|
|
@@ -131,8 +162,26 @@ def load_prediction_arrays(
|
|
|
131
162
|
selected_reader = reader or _reader_for_ref(ref)
|
|
132
163
|
reader_columns = {str(k): str(v) for k, v in columns.items() if isinstance(v, str)}
|
|
133
164
|
table = selected_reader.read_predictions(uri, columns=reader_columns)
|
|
134
|
-
labels
|
|
165
|
+
# Load labels as float first: np.asarray(..., dtype=int) silently
|
|
166
|
+
# TRUNCATES numeric non-integers (0.7 → 0), flipping ground truth with
|
|
167
|
+
# in-domain values no downstream gate can catch (v1.9.0 pre-tag review).
|
|
168
|
+
labels_raw = np.asarray(table[label_col], dtype=float)
|
|
169
|
+
bad_labels = ~np.isin(labels_raw, (0.0, 1.0))
|
|
170
|
+
if bad_labels.any():
|
|
171
|
+
first_bad = int(np.flatnonzero(bad_labels)[0])
|
|
172
|
+
raise ValueError(
|
|
173
|
+
f"prediction artifact at {uri!r} column {label_col!r} contains "
|
|
174
|
+
f"non-binary label(s); first bad value {labels_raw[first_bad]!r} "
|
|
175
|
+
f"at data row index {first_bad}"
|
|
176
|
+
)
|
|
177
|
+
labels = labels_raw.astype(int)
|
|
135
178
|
scores = np.asarray(table[score_col], dtype=float)
|
|
179
|
+
if not np.isfinite(scores).all():
|
|
180
|
+
first_bad = int(np.flatnonzero(~np.isfinite(scores))[0])
|
|
181
|
+
raise ValueError(
|
|
182
|
+
f"prediction artifact at {uri!r} column {score_col!r} contains "
|
|
183
|
+
f"non-finite score(s) (NaN/inf); first at data row index {first_bad}"
|
|
184
|
+
)
|
|
136
185
|
row_id_col = columns.get("row_id")
|
|
137
186
|
hash_col = columns.get("content_hash")
|
|
138
187
|
row_ids = tuple(str(v) for v in table.get(str(row_id_col), ())) if row_id_col else ()
|
|
@@ -243,7 +243,10 @@ def write_json_strict(
|
|
|
243
243
|
out_path = Path(path)
|
|
244
244
|
out_path.parent.mkdir(parents=True, exist_ok=True)
|
|
245
245
|
sanitized = sanitize_for_json(payload)
|
|
246
|
-
out_path.write_text(
|
|
246
|
+
out_path.write_text(
|
|
247
|
+
json.dumps(sanitized, indent=indent, sort_keys=sort_keys, allow_nan=False),
|
|
248
|
+
encoding="utf-8",
|
|
249
|
+
)
|
|
247
250
|
return out_path
|
|
248
251
|
|
|
249
252
|
|
|
@@ -258,7 +261,7 @@ def validate_payload(payload: object, schema_name: str) -> None:
|
|
|
258
261
|
from jsonschema import Draft202012Validator # type: ignore[import-untyped]
|
|
259
262
|
|
|
260
263
|
schema_path = resources.files("eval_toolkit") / "schemas" / schema_name
|
|
261
|
-
schema = json.loads(schema_path.read_text())
|
|
264
|
+
schema = json.loads(schema_path.read_text(encoding="utf-8"))
|
|
262
265
|
Draft202012Validator(schema).validate(sanitize_for_json(payload))
|
|
263
266
|
|
|
264
267
|
|
{eval_toolkit-1.7.0 → eval_toolkit-1.9.0}/src/eval_toolkit/audit_sister_doc_concept_drift.py
RENAMED
|
@@ -54,6 +54,7 @@ concept_drift v1.0.4).
|
|
|
54
54
|
from __future__ import annotations
|
|
55
55
|
|
|
56
56
|
import re
|
|
57
|
+
import warnings
|
|
57
58
|
from collections.abc import Callable, Sequence
|
|
58
59
|
from dataclasses import dataclass
|
|
59
60
|
from pathlib import Path
|
|
@@ -220,7 +221,14 @@ def validate_sister_doc_concept_drift(
|
|
|
220
221
|
for path in files_resolved:
|
|
221
222
|
try:
|
|
222
223
|
file_texts[path] = path.read_text(encoding="utf-8")
|
|
223
|
-
except OSError:
|
|
224
|
+
except (OSError, UnicodeDecodeError) as exc:
|
|
225
|
+
# UnicodeDecodeError is a ValueError, not an OSError — without it a
|
|
226
|
+
# single non-UTF-8 byte would crash the whole scan. Skip unreadable
|
|
227
|
+
# or non-UTF-8 files, but warn so the skip is not silent (STYLE §1).
|
|
228
|
+
warnings.warn(
|
|
229
|
+
f"skipping unreadable file {path}: {exc}",
|
|
230
|
+
stacklevel=2,
|
|
231
|
+
)
|
|
224
232
|
continue
|
|
225
233
|
|
|
226
234
|
drift_clusters: list[DriftCluster] = []
|