eval-toolkit 1.4.0__tar.gz → 1.5.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {eval_toolkit-1.4.0 → eval_toolkit-1.5.0}/CHANGELOG.md +13 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.5.0}/PKG-INFO +4 -1
- {eval_toolkit-1.4.0 → eval_toolkit-1.5.0}/pyproject.toml +8 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.5.0}/src/eval_toolkit/_version.py +1 -1
- eval_toolkit-1.5.0/src/eval_toolkit/eda/__init__.py +80 -0
- eval_toolkit-1.5.0/src/eval_toolkit/eda/data_audit.py +785 -0
- eval_toolkit-1.5.0/src/eval_toolkit/eda/obfuscation.py +622 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.5.0}/src/eval_toolkit/loaders.py +46 -8
- {eval_toolkit-1.4.0 → eval_toolkit-1.5.0}/tests/golden/public_api/snapshot.json +1 -1
- eval_toolkit-1.5.0/tests/test_eda.py +330 -0
- eval_toolkit-1.5.0/tests/test_eda_obfuscation.py +448 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.5.0}/tests/test_loaders.py +107 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.5.0}/.gitignore +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.5.0}/LICENSE +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.5.0}/README.md +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.5.0}/STYLE.md +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.5.0}/docs/archive/README.md +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.5.0}/docs/research/README.md +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.5.0}/docs/research/datasets/README.md +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.5.0}/docs/research/papers/data-integrity/README.md +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.5.0}/docs/research/papers/eval-ecosystem/README.md +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.5.0}/docs/research/papers/inference/README.md +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.5.0}/docs/research/papers/prompt-injection/README.md +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.5.0}/docs/source/adr/README.md +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.5.0}/docs/source/methodology/README.md +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.5.0}/src/eval_toolkit/__init__.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.5.0}/src/eval_toolkit/__main__.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.5.0}/src/eval_toolkit/_deprecated.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.5.0}/src/eval_toolkit/_narrative.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.5.0}/src/eval_toolkit/_parallel.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.5.0}/src/eval_toolkit/_rng.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.5.0}/src/eval_toolkit/_sweep.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.5.0}/src/eval_toolkit/adversarial.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.5.0}/src/eval_toolkit/analysis.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.5.0}/src/eval_toolkit/artifacts.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.5.0}/src/eval_toolkit/audit_citation_alignment.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.5.0}/src/eval_toolkit/audit_sister_doc_concept_drift.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.5.0}/src/eval_toolkit/audit_value_bindings.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.5.0}/src/eval_toolkit/bootstrap.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.5.0}/src/eval_toolkit/calibration.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.5.0}/src/eval_toolkit/claims.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.5.0}/src/eval_toolkit/config.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.5.0}/src/eval_toolkit/docs.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.5.0}/src/eval_toolkit/embeddings.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.5.0}/src/eval_toolkit/evidence.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.5.0}/src/eval_toolkit/harness.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.5.0}/src/eval_toolkit/leakage.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.5.0}/src/eval_toolkit/losses.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.5.0}/src/eval_toolkit/manifest.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.5.0}/src/eval_toolkit/metric_specs.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.5.0}/src/eval_toolkit/metrics.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.5.0}/src/eval_toolkit/operating_points.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.5.0}/src/eval_toolkit/paths.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.5.0}/src/eval_toolkit/plotting.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.5.0}/src/eval_toolkit/preprocessing.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.5.0}/src/eval_toolkit/probes.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.5.0}/src/eval_toolkit/protocols.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.5.0}/src/eval_toolkit/provenance.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.5.0}/src/eval_toolkit/py.typed +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.5.0}/src/eval_toolkit/schemas/manifest.v1.json +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.5.0}/src/eval_toolkit/schemas/manifest.v2.json +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.5.0}/src/eval_toolkit/schemas/manifest.v3.json +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.5.0}/src/eval_toolkit/schemas/ood_manifest.v1.json +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.5.0}/src/eval_toolkit/schemas/results.v1.json +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.5.0}/src/eval_toolkit/schemas/results_full.v1.json +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.5.0}/src/eval_toolkit/scorecards.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.5.0}/src/eval_toolkit/seeds.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.5.0}/src/eval_toolkit/splits.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.5.0}/src/eval_toolkit/stacking.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.5.0}/src/eval_toolkit/text_dedup.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.5.0}/src/eval_toolkit/thresholds.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.5.0}/tests/baseline/test_plotting_visual/plot_bootstrap_distribution.png +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.5.0}/tests/baseline/test_plotting_visual/plot_confusion_matrix_grid.png +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.5.0}/tests/baseline/test_plotting_visual/plot_lift_ci.png +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.5.0}/tests/baseline/test_plotting_visual/plot_metric_bars.png +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.5.0}/tests/baseline/test_plotting_visual/plot_pareto_frontier.png +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.5.0}/tests/baseline/test_plotting_visual/plot_pr_curve.png +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.5.0}/tests/baseline/test_plotting_visual/plot_reliability_diagram.png +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.5.0}/tests/baseline/test_plotting_visual/plot_roc_curve.png +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.5.0}/tests/baseline/test_plotting_visual/plot_score_histograms.png +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.5.0}/tests/baseline/test_plotting_visual/plot_slice_metric_heatmap.png +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.5.0}/tests/benchmarks/__init__.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.5.0}/tests/benchmarks/test_kernel_benchmarks.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.5.0}/tests/conftest.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.5.0}/tests/golden/bootstrap_ci/cases.json +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.5.0}/tests/golden/data/dedup_holdout.jsonl +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.5.0}/tests/golden/data/dedup_holdout_expected.json +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.5.0}/tests/golden/data/dedup_holdout_provenance.md +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.5.0}/tests/golden/docs/expected.md +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.5.0}/tests/golden/docs/input.md +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.5.0}/tests/golden/docs/metrics.json +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.5.0}/tests/golden/test_dedup_holdout_calibration.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.5.0}/tests/strategies.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.5.0}/tests/test_adversarial.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.5.0}/tests/test_analysis.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.5.0}/tests/test_artifacts.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.5.0}/tests/test_audit_citation_alignment.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.5.0}/tests/test_audit_sister_doc_concept_drift.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.5.0}/tests/test_audit_value_bindings.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.5.0}/tests/test_block_bootstrap_on_folds.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.5.0}/tests/test_bootstrap_calibration_mc.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.5.0}/tests/test_bootstrap_edge_cases.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.5.0}/tests/test_bootstrap_golden.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.5.0}/tests/test_bootstrap_njobs.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.5.0}/tests/test_bootstrap_props.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.5.0}/tests/test_bootstrap_research_grounded.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.5.0}/tests/test_bootstrap_unit.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.5.0}/tests/test_calibration_binary_adapters.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.5.0}/tests/test_calibration_bootstrap_chain.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.5.0}/tests/test_calibration_determinism.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.5.0}/tests/test_calibration_optimization_failures.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.5.0}/tests/test_calibration_props.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.5.0}/tests/test_calibration_research_grounded.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.5.0}/tests/test_calibration_unit.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.5.0}/tests/test_claims.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.5.0}/tests/test_claims_coverage.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.5.0}/tests/test_claims_props.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.5.0}/tests/test_cli.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.5.0}/tests/test_config.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.5.0}/tests/test_coverage_bootstrap.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.5.0}/tests/test_coverage_calibration.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.5.0}/tests/test_coverage_harness.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.5.0}/tests/test_coverage_metrics.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.5.0}/tests/test_coverage_plotting.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.5.0}/tests/test_croissant_e2e.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.5.0}/tests/test_dedup_split_leakage_chain.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.5.0}/tests/test_deprecated_scalars_shim.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.5.0}/tests/test_deprecations.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.5.0}/tests/test_docs_golden.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.5.0}/tests/test_docs_props.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.5.0}/tests/test_embeddings.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.5.0}/tests/test_evidence_validators.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.5.0}/tests/test_harness_edge_cases.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.5.0}/tests/test_harness_fault_injection.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.5.0}/tests/test_harness_folded.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.5.0}/tests/test_harness_internals.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.5.0}/tests/test_harness_metric_options.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.5.0}/tests/test_harness_parallelism.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.5.0}/tests/test_harness_smoke.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.5.0}/tests/test_import_boundaries.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.5.0}/tests/test_is_metric_defined_for_slice.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.5.0}/tests/test_lazy_extras_messages.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.5.0}/tests/test_leakage.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.5.0}/tests/test_leakage_error_paths.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.5.0}/tests/test_leakage_props.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.5.0}/tests/test_loaders_coverage.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.5.0}/tests/test_loaders_props.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.5.0}/tests/test_logging.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.5.0}/tests/test_losses.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.5.0}/tests/test_manifest.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.5.0}/tests/test_manifest_contamination_round_trip.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.5.0}/tests/test_manifest_props.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.5.0}/tests/test_manifest_validation.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.5.0}/tests/test_metrics_props.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.5.0}/tests/test_metrics_stratified_subsets.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.5.0}/tests/test_metrics_unit.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.5.0}/tests/test_misc_coverage.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.5.0}/tests/test_numeric_edge_cases.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.5.0}/tests/test_ood_loader.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.5.0}/tests/test_operating_points.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.5.0}/tests/test_operating_points_props.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.5.0}/tests/test_parallel.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.5.0}/tests/test_paths.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.5.0}/tests/test_pipeline_e2e.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.5.0}/tests/test_plotting_edge.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.5.0}/tests/test_plotting_smoke.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.5.0}/tests/test_plotting_visual.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.5.0}/tests/test_preprocessing.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.5.0}/tests/test_probes.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.5.0}/tests/test_protocol_conformance.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.5.0}/tests/test_provenance.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.5.0}/tests/test_public_api.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.5.0}/tests/test_recall_at_fpr.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.5.0}/tests/test_reference_equivalence.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.5.0}/tests/test_reproducibility_integration.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.5.0}/tests/test_rng.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.5.0}/tests/test_schemas.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.5.0}/tests/test_scorecard.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.5.0}/tests/test_seeds.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.5.0}/tests/test_splits.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.5.0}/tests/test_splits_leakage_integration.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.5.0}/tests/test_splits_props.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.5.0}/tests/test_stacking.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.5.0}/tests/test_sweep.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.5.0}/tests/test_text_dedup.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.5.0}/tests/test_text_dedup_coverage.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.5.0}/tests/test_text_dedup_props.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.5.0}/tests/test_text_dedup_strategies.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.5.0}/tests/test_thresholds.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.5.0}/tests/test_thresholds_constant_score.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.5.0}/tests/test_thresholds_coverage.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.5.0}/tests/test_thresholds_props.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.5.0}/tests/test_thresholds_research_grounded.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.5.0}/tests/test_tokenization_leakage_check.py +0 -0
- {eval_toolkit-1.4.0 → eval_toolkit-1.5.0}/tests/test_v09_contracts.py +0 -0
|
@@ -5,6 +5,19 @@ All notable changes to this project will be documented in this file.
|
|
|
5
5
|
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
|
|
6
6
|
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
|
7
7
|
|
|
8
|
+
## [1.5.0] — 2026-05-29 — Tier-2 `eda` layer (#83) + schema-aware `HFDatasetsLoader` (#85)
|
|
9
|
+
|
|
10
|
+
Tier-2 / `loaders` ADDITIVE per [ADR 0003](docs/source/adr/0003-stability-contract-and-gate3-methodology.md) — backward-compatible.
|
|
11
|
+
|
|
12
|
+
- **`eda` Job-1 integrity gate (#83):** `audit_dataset` / `DataAudit` / `SplitSummary` + the
|
|
13
|
+
`class_balance` / `no_cross_split_leakage` / `context_window_fit` gates + the §B2 obfuscation
|
|
14
|
+
prevalence module.
|
|
15
|
+
- **schema-aware `HFDatasetsLoader` (#85):** load real-world dataset schemas without column
|
|
16
|
+
guessing — `feature_cols` + `feature_join` (join multiple columns into one feature; NaN-safe),
|
|
17
|
+
`label_map` (remap raw labels → int; fail-fast `ValueError` lists unmapped values), `revision`
|
|
18
|
+
(pin the HF dataset SHA). All new params default to the prior behavior; a missing feature/label
|
|
19
|
+
column raises `KeyError` listing the observed columns.
|
|
20
|
+
|
|
8
21
|
## [1.4.0] — 2026-05-26 — `audit_citation_alignment` Layer 2 + Layer 3 (closes #82); shared `_narrative` helpers (ADR 0007)
|
|
9
22
|
|
|
10
23
|
Tier-1 ADDITIVE per [ADR 0003](docs/source/adr/0003-stability-contract-and-gate3-methodology.md).
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: eval-toolkit
|
|
3
|
-
Version: 1.4.0
|
|
3
|
+
Version: 1.5.0
|
|
4
4
|
Summary: Reusable evaluation contracts for binary classification: metrics, bootstrap CIs, calibration, artifacts, and evidence gates.
|
|
5
5
|
Project-URL: Homepage, https://github.com/brandon-behring/eval-toolkit
|
|
6
6
|
Project-URL: Documentation, https://brandon-behring.github.io/eval-toolkit/
|
|
@@ -60,6 +60,9 @@ Requires-Dist: sphinx-autodoc-typehints>=2.0; extra == 'docs'
|
|
|
60
60
|
Requires-Dist: sphinx-copybutton>=0.5; extra == 'docs'
|
|
61
61
|
Requires-Dist: sphinx-design>=0.6; extra == 'docs'
|
|
62
62
|
Requires-Dist: sphinx>=7.3; extra == 'docs'
|
|
63
|
+
Provides-Extra: eda
|
|
64
|
+
Requires-Dist: matplotlib>=3.8; extra == 'eda'
|
|
65
|
+
Requires-Dist: pandas>=2.0; extra == 'eda'
|
|
63
66
|
Provides-Extra: embeddings
|
|
64
67
|
Requires-Dist: sentence-transformers>=3.0; extra == 'embeddings'
|
|
65
68
|
Provides-Extra: losses
|
|
@@ -74,6 +74,14 @@ probes = ["torch>=2.0", "transformers>=4.40"]
|
|
|
74
74
|
# (granular extras — losses callers should not have to install the larger
|
|
75
75
|
# transformers stack). Shares the torch version pin with [probes].
|
|
76
76
|
losses = ["torch>=2.0"]
|
|
77
|
+
# v1.5.0 (feat/eda-data-audit): eval_toolkit.eda Job-1 integrity-gate layer.
|
|
78
|
+
# Tier-2 surface (ADR 0003) — torch-free by design. pandas powers the
|
|
79
|
+
# DataFrameLoader reuse path; matplotlib is reserved for the EDA layer's
|
|
80
|
+
# future profiling plots. Intentionally NO sentence-transformers / torch:
|
|
81
|
+
# the near-dup / cross-split checks use the lexical TfidfCosineStrategy and
|
|
82
|
+
# token-length quantiles take a caller-supplied tokenizer (no transformers
|
|
83
|
+
# import in this module). NOT folded into [all] / [dev] — opt-in only.
|
|
84
|
+
eda = ["pandas>=2.0", "matplotlib>=3.8"]
|
|
77
85
|
# NO-OP extra kept for backward compatibility (R3 at v0.49.0).
|
|
78
86
|
#
|
|
79
87
|
# jsonschema>=4.21 moved to base deps at v0.16.0; this extra has been a
|
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
"""``eval_toolkit.eda`` — EDA-first dataset integrity gating (Tier-2 surface).
|
|
2
|
+
|
|
3
|
+
This subpackage is the **Job-1 integrity gate** of an EDA-first research
|
|
4
|
+
program: thin, composable, torch-free per-split profiling + dataset-soundness
|
|
5
|
+
gates, built by reusing the v1.4.0 :mod:`eval_toolkit.leakage`,
|
|
6
|
+
:mod:`~eval_toolkit.text_dedup`, :mod:`~eval_toolkit.claims`, and
|
|
7
|
+
:mod:`~eval_toolkit.artifacts` primitives.
|
|
8
|
+
|
|
9
|
+
Stability tier
|
|
10
|
+
--------------
|
|
11
|
+
Public access is ``eval_toolkit.eda.*`` — **Tier-2** per ADR 0003. This layer
|
|
12
|
+
is intentionally evolvable and is **not** part of the v2.0-frozen top-level
|
|
13
|
+
:mod:`eval_toolkit` surface; nothing here is added to the package-root
|
|
14
|
+
``_EXPORTS`` / ``__all__``. Import explicitly::
|
|
15
|
+
|
|
16
|
+
from eval_toolkit.eda import audit_dataset, DataAudit, SplitSummary
|
|
17
|
+
|
|
18
|
+
Scope (deliberately narrow)
|
|
19
|
+
---------------------------
|
|
20
|
+
Integrity gating only: row counts, class balance, text-length quantiles,
|
|
21
|
+
dedup / cross-split leakage. **No** embeddings, semantic similarity,
|
|
22
|
+
contamination scoring, or UMAP — those distribution-shift concerns are
|
|
23
|
+
deferred to a future ``distribution_shift`` module.
|
|
24
|
+
"""
|
|
25
|
+
|
|
26
|
+
from __future__ import annotations
|
|
27
|
+
|
|
28
|
+
from eval_toolkit.eda.data_audit import (
|
|
29
|
+
DEFAULT_MAX_NEG_POS_RATIO,
|
|
30
|
+
DEFAULT_MIN_NEG_POS_RATIO,
|
|
31
|
+
DEFAULT_PCT_OVER_CONTEXT_THRESHOLD,
|
|
32
|
+
EDA_AUDIT_SCHEMA_VERSION,
|
|
33
|
+
DataAudit,
|
|
34
|
+
SplitSummary,
|
|
35
|
+
Tokenizer,
|
|
36
|
+
audit_dataset,
|
|
37
|
+
class_balance,
|
|
38
|
+
length_quantiles,
|
|
39
|
+
summarize_split,
|
|
40
|
+
)
|
|
41
|
+
from eval_toolkit.eda.obfuscation import (
|
|
42
|
+
BASE64_ENTROPY_THRESHOLD,
|
|
43
|
+
HEX_ENTROPY_THRESHOLD,
|
|
44
|
+
ObfuscationProfile,
|
|
45
|
+
analyze_obfuscation,
|
|
46
|
+
count_invisible_chars,
|
|
47
|
+
has_high_entropy_alnum_run,
|
|
48
|
+
has_rot13_marker,
|
|
49
|
+
is_leeted_token,
|
|
50
|
+
leetspeak_counts,
|
|
51
|
+
nfkc_changed,
|
|
52
|
+
nfkc_char_delta,
|
|
53
|
+
shannon_entropy,
|
|
54
|
+
)
|
|
55
|
+
|
|
56
|
+
__all__ = [
|
|
57
|
+
"BASE64_ENTROPY_THRESHOLD",
|
|
58
|
+
"DEFAULT_MAX_NEG_POS_RATIO",
|
|
59
|
+
"DEFAULT_MIN_NEG_POS_RATIO",
|
|
60
|
+
"DEFAULT_PCT_OVER_CONTEXT_THRESHOLD",
|
|
61
|
+
"EDA_AUDIT_SCHEMA_VERSION",
|
|
62
|
+
"HEX_ENTROPY_THRESHOLD",
|
|
63
|
+
"DataAudit",
|
|
64
|
+
"ObfuscationProfile",
|
|
65
|
+
"SplitSummary",
|
|
66
|
+
"Tokenizer",
|
|
67
|
+
"analyze_obfuscation",
|
|
68
|
+
"audit_dataset",
|
|
69
|
+
"class_balance",
|
|
70
|
+
"count_invisible_chars",
|
|
71
|
+
"has_high_entropy_alnum_run",
|
|
72
|
+
"has_rot13_marker",
|
|
73
|
+
"is_leeted_token",
|
|
74
|
+
"leetspeak_counts",
|
|
75
|
+
"length_quantiles",
|
|
76
|
+
"nfkc_char_delta",
|
|
77
|
+
"nfkc_changed",
|
|
78
|
+
"shannon_entropy",
|
|
79
|
+
"summarize_split",
|
|
80
|
+
]
|