eval-toolkit 1.4.0__tar.gz → 1.6.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {eval_toolkit-1.4.0 → eval_toolkit-1.6.0}/CHANGELOG.md +57 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.6.0}/PKG-INFO +4 -1
- {eval_toolkit-1.4.0 → eval_toolkit-1.6.0}/pyproject.toml +8 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.6.0}/src/eval_toolkit/_version.py +1 -1
- eval_toolkit-1.6.0/src/eval_toolkit/eda/__init__.py +144 -0
- eval_toolkit-1.6.0/src/eval_toolkit/eda/data_audit.py +785 -0
- eval_toolkit-1.6.0/src/eval_toolkit/eda/distribution_shift.py +634 -0
- eval_toolkit-1.6.0/src/eval_toolkit/eda/lexical_association.py +620 -0
- eval_toolkit-1.6.0/src/eval_toolkit/eda/obfuscation.py +622 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.6.0}/src/eval_toolkit/loaders.py +46 -8
- {eval_toolkit-1.4.0 → eval_toolkit-1.6.0}/tests/golden/public_api/snapshot.json +2 -2
- eval_toolkit-1.6.0/tests/test_eda.py +330 -0
- eval_toolkit-1.6.0/tests/test_eda_distribution_shift.py +302 -0
- eval_toolkit-1.6.0/tests/test_eda_lexical_association.py +340 -0
- eval_toolkit-1.6.0/tests/test_eda_obfuscation.py +448 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.6.0}/tests/test_loaders.py +107 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.6.0}/.gitignore +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.6.0}/LICENSE +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.6.0}/README.md +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.6.0}/STYLE.md +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.6.0}/docs/archive/README.md +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.6.0}/docs/research/README.md +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.6.0}/docs/research/datasets/README.md +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.6.0}/docs/research/papers/data-integrity/README.md +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.6.0}/docs/research/papers/eval-ecosystem/README.md +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.6.0}/docs/research/papers/inference/README.md +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.6.0}/docs/research/papers/prompt-injection/README.md +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.6.0}/docs/source/adr/README.md +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.6.0}/docs/source/methodology/README.md +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.6.0}/src/eval_toolkit/__init__.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.6.0}/src/eval_toolkit/__main__.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.6.0}/src/eval_toolkit/_deprecated.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.6.0}/src/eval_toolkit/_narrative.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.6.0}/src/eval_toolkit/_parallel.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.6.0}/src/eval_toolkit/_rng.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.6.0}/src/eval_toolkit/_sweep.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.6.0}/src/eval_toolkit/adversarial.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.6.0}/src/eval_toolkit/analysis.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.6.0}/src/eval_toolkit/artifacts.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.6.0}/src/eval_toolkit/audit_citation_alignment.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.6.0}/src/eval_toolkit/audit_sister_doc_concept_drift.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.6.0}/src/eval_toolkit/audit_value_bindings.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.6.0}/src/eval_toolkit/bootstrap.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.6.0}/src/eval_toolkit/calibration.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.6.0}/src/eval_toolkit/claims.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.6.0}/src/eval_toolkit/config.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.6.0}/src/eval_toolkit/docs.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.6.0}/src/eval_toolkit/embeddings.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.6.0}/src/eval_toolkit/evidence.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.6.0}/src/eval_toolkit/harness.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.6.0}/src/eval_toolkit/leakage.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.6.0}/src/eval_toolkit/losses.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.6.0}/src/eval_toolkit/manifest.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.6.0}/src/eval_toolkit/metric_specs.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.6.0}/src/eval_toolkit/metrics.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.6.0}/src/eval_toolkit/operating_points.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.6.0}/src/eval_toolkit/paths.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.6.0}/src/eval_toolkit/plotting.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.6.0}/src/eval_toolkit/preprocessing.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.6.0}/src/eval_toolkit/probes.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.6.0}/src/eval_toolkit/protocols.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.6.0}/src/eval_toolkit/provenance.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.6.0}/src/eval_toolkit/py.typed +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.6.0}/src/eval_toolkit/schemas/manifest.v1.json +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.6.0}/src/eval_toolkit/schemas/manifest.v2.json +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.6.0}/src/eval_toolkit/schemas/manifest.v3.json +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.6.0}/src/eval_toolkit/schemas/ood_manifest.v1.json +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.6.0}/src/eval_toolkit/schemas/results.v1.json +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.6.0}/src/eval_toolkit/schemas/results_full.v1.json +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.6.0}/src/eval_toolkit/scorecards.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.6.0}/src/eval_toolkit/seeds.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.6.0}/src/eval_toolkit/splits.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.6.0}/src/eval_toolkit/stacking.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.6.0}/src/eval_toolkit/text_dedup.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.6.0}/src/eval_toolkit/thresholds.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.6.0}/tests/baseline/test_plotting_visual/plot_bootstrap_distribution.png +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.6.0}/tests/baseline/test_plotting_visual/plot_confusion_matrix_grid.png +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.6.0}/tests/baseline/test_plotting_visual/plot_lift_ci.png +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.6.0}/tests/baseline/test_plotting_visual/plot_metric_bars.png +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.6.0}/tests/baseline/test_plotting_visual/plot_pareto_frontier.png +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.6.0}/tests/baseline/test_plotting_visual/plot_pr_curve.png +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.6.0}/tests/baseline/test_plotting_visual/plot_reliability_diagram.png +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.6.0}/tests/baseline/test_plotting_visual/plot_roc_curve.png +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.6.0}/tests/baseline/test_plotting_visual/plot_score_histograms.png +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.6.0}/tests/baseline/test_plotting_visual/plot_slice_metric_heatmap.png +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.6.0}/tests/benchmarks/__init__.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.6.0}/tests/benchmarks/test_kernel_benchmarks.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.6.0}/tests/conftest.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.6.0}/tests/golden/bootstrap_ci/cases.json +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.6.0}/tests/golden/data/dedup_holdout.jsonl +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.6.0}/tests/golden/data/dedup_holdout_expected.json +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.6.0}/tests/golden/data/dedup_holdout_provenance.md +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.6.0}/tests/golden/docs/expected.md +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.6.0}/tests/golden/docs/input.md +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.6.0}/tests/golden/docs/metrics.json +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.6.0}/tests/golden/test_dedup_holdout_calibration.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.6.0}/tests/strategies.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.6.0}/tests/test_adversarial.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.6.0}/tests/test_analysis.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.6.0}/tests/test_artifacts.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.6.0}/tests/test_audit_citation_alignment.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.6.0}/tests/test_audit_sister_doc_concept_drift.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.6.0}/tests/test_audit_value_bindings.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.6.0}/tests/test_block_bootstrap_on_folds.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.6.0}/tests/test_bootstrap_calibration_mc.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.6.0}/tests/test_bootstrap_edge_cases.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.6.0}/tests/test_bootstrap_golden.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.6.0}/tests/test_bootstrap_njobs.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.6.0}/tests/test_bootstrap_props.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.6.0}/tests/test_bootstrap_research_grounded.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.6.0}/tests/test_bootstrap_unit.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.6.0}/tests/test_calibration_binary_adapters.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.6.0}/tests/test_calibration_bootstrap_chain.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.6.0}/tests/test_calibration_determinism.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.6.0}/tests/test_calibration_optimization_failures.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.6.0}/tests/test_calibration_props.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.6.0}/tests/test_calibration_research_grounded.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.6.0}/tests/test_calibration_unit.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.6.0}/tests/test_claims.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.6.0}/tests/test_claims_coverage.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.6.0}/tests/test_claims_props.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.6.0}/tests/test_cli.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.6.0}/tests/test_config.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.6.0}/tests/test_coverage_bootstrap.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.6.0}/tests/test_coverage_calibration.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.6.0}/tests/test_coverage_harness.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.6.0}/tests/test_coverage_metrics.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.6.0}/tests/test_coverage_plotting.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.6.0}/tests/test_croissant_e2e.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.6.0}/tests/test_dedup_split_leakage_chain.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.6.0}/tests/test_deprecated_scalars_shim.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.6.0}/tests/test_deprecations.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.6.0}/tests/test_docs_golden.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.6.0}/tests/test_docs_props.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.6.0}/tests/test_embeddings.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.6.0}/tests/test_evidence_validators.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.6.0}/tests/test_harness_edge_cases.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.6.0}/tests/test_harness_fault_injection.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.6.0}/tests/test_harness_folded.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.6.0}/tests/test_harness_internals.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.6.0}/tests/test_harness_metric_options.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.6.0}/tests/test_harness_parallelism.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.6.0}/tests/test_harness_smoke.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.6.0}/tests/test_import_boundaries.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.6.0}/tests/test_is_metric_defined_for_slice.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.6.0}/tests/test_lazy_extras_messages.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.6.0}/tests/test_leakage.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.6.0}/tests/test_leakage_error_paths.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.6.0}/tests/test_leakage_props.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.6.0}/tests/test_loaders_coverage.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.6.0}/tests/test_loaders_props.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.6.0}/tests/test_logging.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.6.0}/tests/test_losses.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.6.0}/tests/test_manifest.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.6.0}/tests/test_manifest_contamination_round_trip.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.6.0}/tests/test_manifest_props.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.6.0}/tests/test_manifest_validation.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.6.0}/tests/test_metrics_props.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.6.0}/tests/test_metrics_stratified_subsets.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.6.0}/tests/test_metrics_unit.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.6.0}/tests/test_misc_coverage.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.6.0}/tests/test_numeric_edge_cases.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.6.0}/tests/test_ood_loader.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.6.0}/tests/test_operating_points.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.6.0}/tests/test_operating_points_props.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.6.0}/tests/test_parallel.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.6.0}/tests/test_paths.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.6.0}/tests/test_pipeline_e2e.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.6.0}/tests/test_plotting_edge.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.6.0}/tests/test_plotting_smoke.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.6.0}/tests/test_plotting_visual.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.6.0}/tests/test_preprocessing.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.6.0}/tests/test_probes.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.6.0}/tests/test_protocol_conformance.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.6.0}/tests/test_provenance.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.6.0}/tests/test_public_api.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.6.0}/tests/test_recall_at_fpr.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.6.0}/tests/test_reference_equivalence.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.6.0}/tests/test_reproducibility_integration.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.6.0}/tests/test_rng.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.6.0}/tests/test_schemas.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.6.0}/tests/test_scorecard.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.6.0}/tests/test_seeds.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.6.0}/tests/test_splits.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.6.0}/tests/test_splits_leakage_integration.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.6.0}/tests/test_splits_props.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.6.0}/tests/test_stacking.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.6.0}/tests/test_sweep.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.6.0}/tests/test_text_dedup.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.6.0}/tests/test_text_dedup_coverage.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.6.0}/tests/test_text_dedup_props.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.6.0}/tests/test_text_dedup_strategies.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.6.0}/tests/test_thresholds.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.6.0}/tests/test_thresholds_constant_score.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.6.0}/tests/test_thresholds_coverage.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.6.0}/tests/test_thresholds_props.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.6.0}/tests/test_thresholds_research_grounded.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.6.0}/tests/test_tokenization_leakage_check.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.6.0}/tests/test_v09_contracts.py +0 -0
|
@@ -5,6 +5,63 @@ All notable changes to this project will be documented in this file.
|
|
|
5
5
|
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
|
|
6
6
|
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
|
7
7
|
|
|
8
|
+
## [1.6.0] — 2026-05-29 — Tier-2 `eda` Job-2 + Job-3: shortcut + shift diagnostics (#86, #87)
|
|
9
|
+
|
|
10
|
+
`eval_toolkit.eda.*` ADDITIVE per [ADR 0003](docs/source/adr/0003-stability-contract-and-gate3-methodology.md) — Tier-2, backward-compatible. Completes the EDA-first analytic layer above the v1.5.0 Job-1 integrity gate: **Job-2** lexical shortcut diagnostics (`lexical_association`, #86) and **Job-3** distribution-shift quantification (`distribution_shift`, #87). Both are dogfooded by the consumer portfolio's pre-modeling OOD-wall prediction (V5 + V9).
|
|
11
|
+
|
|
12
|
+
### Added — Tier-2 `eda.lexical_association` shortcut diagnostics (Job-2: C1 + C2)
|
|
13
|
+
|
|
14
|
+
`eval_toolkit.eda` ADDITIVE per [ADR 0003](docs/source/adr/0003-stability-contract-and-gate3-methodology.md) — Tier-2, torch-free (NumPy + scikit-learn). The analytic layer above the Job-1 integrity gate: *"is the label recoverable from a surface shortcut that will not transfer out-of-distribution?"*
|
|
15
|
+
|
|
16
|
+
- **C1 — `weighted_log_odds` / `class_lexical_association`:** Monroe, Colaresi & Quinn (2008)
|
|
17
|
+
informative-Dirichlet weighted log-odds-ratio z-scores + smoothed PMI per token, with a
|
|
18
|
+
`min_count` rare-token floor (the V5 pitfall). Returns a `LexicalAssociationResult`
|
|
19
|
+
(`top_a` / `top_b` / `to_dict`); tokens ordered by descending z-score.
|
|
20
|
+
- **C2 — `competency_baselines`:** partial-input baselines (length-only, char-n-gram, BoW)
|
|
21
|
+
fit on a train split and scored on a test split → `CompetencyResult` of per-baseline
|
|
22
|
+
average-precision vs the positive-prevalence floor (the *shortcut floor*; Feng, Wallace &
|
|
23
|
+
Boyd-Graber, ACL 2019 caveat documented). Vectorizers fit on train only (no test leakage);
|
|
24
|
+
empty or single-class train/test raises a diagnostic `ValueError`.
|
|
25
|
+
- Exported via `from eval_toolkit.eda import ...`; 100% line+branch coverage; mypy-strict clean.
|
|
26
|
+
|
|
27
|
+
### Added — Tier-2 `eda.distribution_shift` covariate-shift quantification (Job-3: E1)
|
|
28
|
+
|
|
29
|
+
`eval_toolkit.eda` ADDITIVE per [ADR 0003](docs/source/adr/0003-stability-contract-and-gate3-methodology.md) — Tier-2. Public functions take **feature matrices**, so the module is base-install-safe (NumPy + SciPy + scikit-learn); embed text first with `eval_toolkit.embeddings.make_minilm_embedder` (`[embeddings]` extra) or any vectorizer.
|
|
30
|
+
|
|
31
|
+
- **`proxy_a_distance`:** Ben-David et al. (2006/2010) PAD = `2(1 − 2ε)` from a **linear**
|
|
32
|
+
domain classifier's **k-fold CV** error, with **fixed strong regularization** (small `C`) —
|
|
33
|
+
*not* the high-`C` RBF-SVM-on-`predict_proba` recipe that overfits to `PAD ≈ 2` at small `n`.
|
|
34
|
+
Optional bootstrap CI.
|
|
35
|
+
- **`maximum_mean_discrepancy`:** Gretton et al. (2012) **unbiased** RBF-kernel MMD² U-statistic +
|
|
36
|
+
**median-heuristic bandwidth** (freezable across folds) + **permutation-test** p-value
|
|
37
|
+
(Phipson & Smyth 2010, `(1+count)/(B+1)`, never zero). Optional bootstrap CI.
|
|
38
|
+
- **`knn_purity`:** mean fraction of each point's k nearest neighbours sharing its domain label.
|
|
39
|
+
- **`median_bandwidth`** helper + the **`distribution_shift`** orchestrator (all three) +
|
|
40
|
+
`PadResult` / `MmdResult` / `KnnPurityResult` / `DistributionShiftResult` dataclasses (`to_dict`).
|
|
41
|
+
- Docstrings carry the pre-registered caveats: distance is **necessary-not-sufficient** for OOD
|
|
42
|
+
collapse (fuse with shortcut-exposure); a non-significant MMD p is not "no shift"; cross-dataset
|
|
43
|
+
distances are ordinal-only (covariate vs label-semantics conflation). 100% line+branch coverage.
|
|
44
|
+
|
|
45
|
+
### Fixed
|
|
46
|
+
|
|
47
|
+
- **Public-API golden `__version__` drift:** the `v1.5.0` release commit bumped
|
|
48
|
+
`_version.py` to `1.5.0` but did not regenerate `tests/golden/public_api/snapshot.json`,
|
|
49
|
+
which still pinned `'1.4.0'` — leaving `test_public_api_drift_guard` red on `main` (and on
|
|
50
|
+
every branch cut from it). Regenerated the golden (the diff is the `__version__` value only).
|
|
51
|
+
|
|
52
|
+
## [1.5.0] — 2026-05-29 — Tier-2 `eda` layer (#83) + schema-aware `HFDatasetsLoader` (#85)
|
|
53
|
+
|
|
54
|
+
Tier-2 / `loaders` ADDITIVE per [ADR 0003](docs/source/adr/0003-stability-contract-and-gate3-methodology.md) — backward-compatible.
|
|
55
|
+
|
|
56
|
+
- **`eda` Job-1 integrity gate (#83):** `audit_dataset` / `DataAudit` / `SplitSummary` + the
|
|
57
|
+
`class_balance` / `no_cross_split_leakage` / `context_window_fit` gates + the §B2 obfuscation
|
|
58
|
+
prevalence module.
|
|
59
|
+
- **schema-aware `HFDatasetsLoader` (#85):** load real-world dataset schemas without column
|
|
60
|
+
guessing — `feature_cols` + `feature_join` (join multiple columns into one feature; NaN-safe),
|
|
61
|
+
`label_map` (remap raw labels → int; fail-fast `ValueError` lists unmapped values), `revision`
|
|
62
|
+
(pin the HF dataset SHA). All new params default to the prior behavior; a missing feature/label
|
|
63
|
+
column raises `KeyError` listing the observed columns.
|
|
64
|
+
|
|
8
65
|
## [1.4.0] — 2026-05-26 — `audit_citation_alignment` Layer 2 + Layer 3 (closes #82); shared `_narrative` helpers (ADR 0007)
|
|
9
66
|
|
|
10
67
|
Tier-1 ADDITIVE per [ADR 0003](docs/source/adr/0003-stability-contract-and-gate3-methodology.md).
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: eval-toolkit
|
|
3
|
-
Version: 1.4.0
|
|
3
|
+
Version: 1.6.0
|
|
4
4
|
Summary: Reusable evaluation contracts for binary classification: metrics, bootstrap CIs, calibration, artifacts, and evidence gates.
|
|
5
5
|
Project-URL: Homepage, https://github.com/brandon-behring/eval-toolkit
|
|
6
6
|
Project-URL: Documentation, https://brandon-behring.github.io/eval-toolkit/
|
|
@@ -60,6 +60,9 @@ Requires-Dist: sphinx-autodoc-typehints>=2.0; extra == 'docs'
|
|
|
60
60
|
Requires-Dist: sphinx-copybutton>=0.5; extra == 'docs'
|
|
61
61
|
Requires-Dist: sphinx-design>=0.6; extra == 'docs'
|
|
62
62
|
Requires-Dist: sphinx>=7.3; extra == 'docs'
|
|
63
|
+
Provides-Extra: eda
|
|
64
|
+
Requires-Dist: matplotlib>=3.8; extra == 'eda'
|
|
65
|
+
Requires-Dist: pandas>=2.0; extra == 'eda'
|
|
63
66
|
Provides-Extra: embeddings
|
|
64
67
|
Requires-Dist: sentence-transformers>=3.0; extra == 'embeddings'
|
|
65
68
|
Provides-Extra: losses
|
|
@@ -74,6 +74,14 @@ probes = ["torch>=2.0", "transformers>=4.40"]
|
|
|
74
74
|
# (granular extras — losses callers should not have to install the larger
|
|
75
75
|
# transformers stack). Shares the torch version pin with [probes].
|
|
76
76
|
losses = ["torch>=2.0"]
|
|
77
|
+
# v1.5.0 (feat/eda-data-audit): eval_toolkit.eda Job-1 integrity-gate layer.
|
|
78
|
+
# Tier-2 surface (ADR 0003) — torch-free by design. pandas powers the
|
|
79
|
+
# DataFrameLoader reuse path; matplotlib is reserved for the EDA layer's
|
|
80
|
+
# future profiling plots. Intentionally NO sentence-transformers / torch:
|
|
81
|
+
# the near-dup / cross-split checks use the lexical TfidfCosineStrategy and
|
|
82
|
+
# token-length quantiles take a caller-supplied tokenizer (no transformers
|
|
83
|
+
# import in this module). NOT folded into [all] / [dev] — opt-in only.
|
|
84
|
+
eda = ["pandas>=2.0", "matplotlib>=3.8"]
|
|
77
85
|
# NO-OP extra kept for backward compatibility (R3 at v0.49.0).
|
|
78
86
|
#
|
|
79
87
|
# jsonschema>=4.21 moved to base deps at v0.16.0; this extra has been a
|
|
@@ -0,0 +1,144 @@
|
|
|
1
|
+
"""``eval_toolkit.eda`` — EDA-first dataset integrity gating (Tier-2 surface).
|
|
2
|
+
|
|
3
|
+
This subpackage began as the **Job-1 integrity gate** of an EDA-first research
|
|
4
|
+
program: thin, composable, torch-free per-split profiling + dataset-soundness
|
|
5
|
+
gates, built by reusing the v1.4.0 :mod:`eval_toolkit.leakage`,
|
|
6
|
+
:mod:`~eval_toolkit.text_dedup`, :mod:`~eval_toolkit.claims`, and
|
|
7
|
+
:mod:`~eval_toolkit.artifacts` primitives.
|
|
8
|
+
|
|
9
|
+
Stability tier
|
|
10
|
+
--------------
|
|
11
|
+
Public access is ``eval_toolkit.eda.*`` — **Tier-2** per ADR 0003. This layer
|
|
12
|
+
is intentionally evolvable and is **not** part of the v2.0-frozen top-level
|
|
13
|
+
:mod:`eval_toolkit` surface; nothing here is added to the package-root
|
|
14
|
+
``_EXPORTS`` / ``__all__``. Import explicitly::
|
|
15
|
+
|
|
16
|
+
from eval_toolkit.eda import audit_dataset, DataAudit, SplitSummary
|
|
17
|
+
|
|
18
|
+
Scope
|
|
19
|
+
-----
|
|
20
|
+
- **Job-1 integrity gate** (``data_audit`` + ``obfuscation``): row counts, class
|
|
21
|
+
balance, text-length quantiles, dedup / cross-split leakage, obfuscation
|
|
22
|
+
prevalence.
|
|
23
|
+
- **Job-2 lexical shortcut diagnostics** (``lexical_association``): weighted
|
|
24
|
+
log-odds + PMI (C1) and partial-input / competency baselines (C2) — torch-free
|
|
25
|
+
(NumPy + scikit-learn).
|
|
26
|
+
- **Job-3 distribution shift** (``distribution_shift``): proxy-A-distance, MMD
|
|
27
|
+
(permutation-tested), and kNN purity (E1) — operates on feature matrices, so
|
|
28
|
+
still base-install-safe (NumPy + SciPy + scikit-learn).
|
|
29
|
+
|
|
30
|
+
The shift functions take **feature matrices**, not text — embed first with
|
|
31
|
+
:func:`eval_toolkit.embeddings.make_minilm_embedder` (the optional
|
|
32
|
+
``[embeddings]`` extra) or any vectorizer. UMAP / 2-D projections stay caller-side.
|
|
33
|
+
"""
|
|
34
|
+
|
|
35
|
+
from __future__ import annotations
|
|
36
|
+
|
|
37
|
+
from eval_toolkit.eda.data_audit import (
|
|
38
|
+
DEFAULT_MAX_NEG_POS_RATIO,
|
|
39
|
+
DEFAULT_MIN_NEG_POS_RATIO,
|
|
40
|
+
DEFAULT_PCT_OVER_CONTEXT_THRESHOLD,
|
|
41
|
+
EDA_AUDIT_SCHEMA_VERSION,
|
|
42
|
+
DataAudit,
|
|
43
|
+
SplitSummary,
|
|
44
|
+
Tokenizer,
|
|
45
|
+
audit_dataset,
|
|
46
|
+
class_balance,
|
|
47
|
+
length_quantiles,
|
|
48
|
+
summarize_split,
|
|
49
|
+
)
|
|
50
|
+
from eval_toolkit.eda.distribution_shift import (
|
|
51
|
+
DEFAULT_KNN_K,
|
|
52
|
+
DEFAULT_MMD_PERMUTATIONS,
|
|
53
|
+
DEFAULT_PAD_C,
|
|
54
|
+
DEFAULT_PAD_FOLDS,
|
|
55
|
+
DistributionShiftResult,
|
|
56
|
+
KnnPurityResult,
|
|
57
|
+
MmdResult,
|
|
58
|
+
PadResult,
|
|
59
|
+
distribution_shift,
|
|
60
|
+
knn_purity,
|
|
61
|
+
maximum_mean_discrepancy,
|
|
62
|
+
median_bandwidth,
|
|
63
|
+
proxy_a_distance,
|
|
64
|
+
)
|
|
65
|
+
from eval_toolkit.eda.lexical_association import (
|
|
66
|
+
DEFAULT_CHAR_NGRAM_RANGE,
|
|
67
|
+
DEFAULT_MIN_COUNT,
|
|
68
|
+
DEFAULT_PRIOR_SCALE,
|
|
69
|
+
BaselineScore,
|
|
70
|
+
CompetencyResult,
|
|
71
|
+
LexicalAssociationResult,
|
|
72
|
+
StrTokenizer,
|
|
73
|
+
class_lexical_association,
|
|
74
|
+
competency_baselines,
|
|
75
|
+
default_tokenizer,
|
|
76
|
+
weighted_log_odds,
|
|
77
|
+
)
|
|
78
|
+
from eval_toolkit.eda.obfuscation import (
|
|
79
|
+
BASE64_ENTROPY_THRESHOLD,
|
|
80
|
+
HEX_ENTROPY_THRESHOLD,
|
|
81
|
+
ObfuscationProfile,
|
|
82
|
+
analyze_obfuscation,
|
|
83
|
+
count_invisible_chars,
|
|
84
|
+
has_high_entropy_alnum_run,
|
|
85
|
+
has_rot13_marker,
|
|
86
|
+
is_leeted_token,
|
|
87
|
+
leetspeak_counts,
|
|
88
|
+
nfkc_changed,
|
|
89
|
+
nfkc_char_delta,
|
|
90
|
+
shannon_entropy,
|
|
91
|
+
)
|
|
92
|
+
|
|
93
|
+
__all__ = [
|
|
94
|
+
# --- constants ---
|
|
95
|
+
"BASE64_ENTROPY_THRESHOLD",
|
|
96
|
+
"DEFAULT_CHAR_NGRAM_RANGE",
|
|
97
|
+
"DEFAULT_KNN_K",
|
|
98
|
+
"DEFAULT_MAX_NEG_POS_RATIO",
|
|
99
|
+
"DEFAULT_MIN_COUNT",
|
|
100
|
+
"DEFAULT_MIN_NEG_POS_RATIO",
|
|
101
|
+
"DEFAULT_MMD_PERMUTATIONS",
|
|
102
|
+
"DEFAULT_PAD_C",
|
|
103
|
+
"DEFAULT_PAD_FOLDS",
|
|
104
|
+
"DEFAULT_PCT_OVER_CONTEXT_THRESHOLD",
|
|
105
|
+
"DEFAULT_PRIOR_SCALE",
|
|
106
|
+
"EDA_AUDIT_SCHEMA_VERSION",
|
|
107
|
+
"HEX_ENTROPY_THRESHOLD",
|
|
108
|
+
# --- classes / type aliases ---
|
|
109
|
+
"BaselineScore",
|
|
110
|
+
"CompetencyResult",
|
|
111
|
+
"DataAudit",
|
|
112
|
+
"DistributionShiftResult",
|
|
113
|
+
"KnnPurityResult",
|
|
114
|
+
"LexicalAssociationResult",
|
|
115
|
+
"MmdResult",
|
|
116
|
+
"ObfuscationProfile",
|
|
117
|
+
"PadResult",
|
|
118
|
+
"SplitSummary",
|
|
119
|
+
"StrTokenizer",
|
|
120
|
+
"Tokenizer",
|
|
121
|
+
# --- functions ---
|
|
122
|
+
"analyze_obfuscation",
|
|
123
|
+
"audit_dataset",
|
|
124
|
+
"class_balance",
|
|
125
|
+
"class_lexical_association",
|
|
126
|
+
"competency_baselines",
|
|
127
|
+
"count_invisible_chars",
|
|
128
|
+
"default_tokenizer",
|
|
129
|
+
"distribution_shift",
|
|
130
|
+
"has_high_entropy_alnum_run",
|
|
131
|
+
"has_rot13_marker",
|
|
132
|
+
"is_leeted_token",
|
|
133
|
+
"knn_purity",
|
|
134
|
+
"leetspeak_counts",
|
|
135
|
+
"length_quantiles",
|
|
136
|
+
"maximum_mean_discrepancy",
|
|
137
|
+
"median_bandwidth",
|
|
138
|
+
"nfkc_char_delta",
|
|
139
|
+
"nfkc_changed",
|
|
140
|
+
"proxy_a_distance",
|
|
141
|
+
"shannon_entropy",
|
|
142
|
+
"summarize_split",
|
|
143
|
+
"weighted_log_odds",
|
|
144
|
+
]
|