eval-toolkit 0.42.0__tar.gz → 0.44.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/CHANGELOG.md +86 -1
- {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/PKG-INFO +6 -17
- {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/README.md +0 -16
- {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/pyproject.toml +12 -1
- {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/src/eval_toolkit/__init__.py +23 -0
- {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/src/eval_toolkit/_version.py +1 -1
- eval_toolkit-0.44.0/src/eval_toolkit/adversarial.py +578 -0
- {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/src/eval_toolkit/loaders.py +479 -0
- eval_toolkit-0.44.0/src/eval_toolkit/losses.py +225 -0
- eval_toolkit-0.44.0/src/eval_toolkit/preprocessing.py +259 -0
- eval_toolkit-0.44.0/src/eval_toolkit/probes.py +469 -0
- eval_toolkit-0.44.0/src/eval_toolkit/schemas/ood_manifest.v1.json +77 -0
- {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/src/eval_toolkit/seeds.py +1 -1
- {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/golden/public_api/snapshot.json +148 -1
- eval_toolkit-0.44.0/tests/test_adversarial.py +351 -0
- eval_toolkit-0.44.0/tests/test_losses.py +189 -0
- eval_toolkit-0.44.0/tests/test_ood_loader.py +353 -0
- eval_toolkit-0.44.0/tests/test_preprocessing.py +241 -0
- eval_toolkit-0.44.0/tests/test_probes.py +321 -0
- {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/test_schemas.py +3 -1
- {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/.gitignore +0 -0
- {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/LICENSE +0 -0
- {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/STYLE.md +0 -0
- {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/docs/archive/README.md +0 -0
- {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/docs/research/README.md +0 -0
- {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/docs/research/datasets/README.md +0 -0
- {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/docs/research/papers/data-integrity/README.md +0 -0
- {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/docs/research/papers/eval-ecosystem/README.md +0 -0
- {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/docs/research/papers/inference/README.md +0 -0
- {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/docs/research/papers/prompt-injection/README.md +0 -0
- {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/docs/source/methodology/README.md +0 -0
- {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/src/eval_toolkit/__main__.py +0 -0
- {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/src/eval_toolkit/_deprecated.py +0 -0
- {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/src/eval_toolkit/_parallel.py +0 -0
- {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/src/eval_toolkit/analysis.py +0 -0
- {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/src/eval_toolkit/artifacts.py +0 -0
- {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/src/eval_toolkit/bootstrap.py +0 -0
- {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/src/eval_toolkit/calibration.py +0 -0
- {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/src/eval_toolkit/claims.py +0 -0
- {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/src/eval_toolkit/config.py +0 -0
- {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/src/eval_toolkit/docs.py +0 -0
- {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/src/eval_toolkit/embeddings.py +0 -0
- {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/src/eval_toolkit/evidence.py +0 -0
- {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/src/eval_toolkit/harness.py +0 -0
- {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/src/eval_toolkit/leakage.py +0 -0
- {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/src/eval_toolkit/manifest.py +0 -0
- {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/src/eval_toolkit/metrics.py +0 -0
- {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/src/eval_toolkit/operating_points.py +0 -0
- {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/src/eval_toolkit/paths.py +0 -0
- {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/src/eval_toolkit/plotting.py +0 -0
- {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/src/eval_toolkit/protocols.py +0 -0
- {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/src/eval_toolkit/provenance.py +0 -0
- {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/src/eval_toolkit/py.typed +0 -0
- {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/src/eval_toolkit/schemas/manifest.v1.json +0 -0
- {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/src/eval_toolkit/schemas/manifest.v2.json +0 -0
- {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/src/eval_toolkit/schemas/manifest.v3.json +0 -0
- {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/src/eval_toolkit/schemas/results.v1.json +0 -0
- {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/src/eval_toolkit/schemas/results_full.v1.json +0 -0
- {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/src/eval_toolkit/splits.py +0 -0
- {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/src/eval_toolkit/text_dedup.py +0 -0
- {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/src/eval_toolkit/thresholds.py +0 -0
- {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/baseline/test_plotting_visual/plot_bootstrap_distribution.png +0 -0
- {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/baseline/test_plotting_visual/plot_confusion_matrix_grid.png +0 -0
- {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/baseline/test_plotting_visual/plot_lift_ci.png +0 -0
- {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/baseline/test_plotting_visual/plot_metric_bars.png +0 -0
- {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/baseline/test_plotting_visual/plot_pareto_frontier.png +0 -0
- {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/baseline/test_plotting_visual/plot_pr_curve.png +0 -0
- {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/baseline/test_plotting_visual/plot_reliability_diagram.png +0 -0
- {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/baseline/test_plotting_visual/plot_roc_curve.png +0 -0
- {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/baseline/test_plotting_visual/plot_score_histograms.png +0 -0
- {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/baseline/test_plotting_visual/plot_slice_metric_heatmap.png +0 -0
- {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/benchmarks/__init__.py +0 -0
- {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/benchmarks/test_kernel_benchmarks.py +0 -0
- {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/conftest.py +0 -0
- {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/golden/bootstrap_ci/cases.json +0 -0
- {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/golden/data/dedup_holdout.jsonl +0 -0
- {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/golden/data/dedup_holdout_expected.json +0 -0
- {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/golden/data/dedup_holdout_provenance.md +0 -0
- {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/golden/docs/expected.md +0 -0
- {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/golden/docs/input.md +0 -0
- {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/golden/docs/metrics.json +0 -0
- {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/golden/test_dedup_holdout_calibration.py +0 -0
- {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/strategies.py +0 -0
- {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/test_analysis.py +0 -0
- {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/test_artifacts.py +0 -0
- {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/test_block_bootstrap_on_folds.py +0 -0
- {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/test_bootstrap_calibration_mc.py +0 -0
- {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/test_bootstrap_edge_cases.py +0 -0
- {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/test_bootstrap_golden.py +0 -0
- {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/test_bootstrap_njobs.py +0 -0
- {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/test_bootstrap_props.py +0 -0
- {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/test_bootstrap_research_grounded.py +0 -0
- {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/test_bootstrap_unit.py +0 -0
- {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/test_calibration_binary_adapters.py +0 -0
- {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/test_calibration_bootstrap_chain.py +0 -0
- {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/test_calibration_determinism.py +0 -0
- {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/test_calibration_optimization_failures.py +0 -0
- {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/test_calibration_props.py +0 -0
- {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/test_calibration_research_grounded.py +0 -0
- {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/test_calibration_unit.py +0 -0
- {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/test_claims.py +0 -0
- {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/test_claims_coverage.py +0 -0
- {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/test_claims_props.py +0 -0
- {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/test_cli.py +0 -0
- {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/test_config.py +0 -0
- {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/test_coverage_bootstrap.py +0 -0
- {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/test_coverage_calibration.py +0 -0
- {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/test_coverage_harness.py +0 -0
- {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/test_coverage_metrics.py +0 -0
- {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/test_coverage_plotting.py +0 -0
- {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/test_croissant_e2e.py +0 -0
- {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/test_dedup_split_leakage_chain.py +0 -0
- {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/test_deprecations.py +0 -0
- {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/test_docs_golden.py +0 -0
- {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/test_docs_props.py +0 -0
- {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/test_embeddings.py +0 -0
- {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/test_evidence_validators.py +0 -0
- {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/test_harness_edge_cases.py +0 -0
- {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/test_harness_fault_injection.py +0 -0
- {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/test_harness_folded.py +0 -0
- {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/test_harness_internals.py +0 -0
- {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/test_harness_metric_options.py +0 -0
- {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/test_harness_parallelism.py +0 -0
- {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/test_harness_smoke.py +0 -0
- {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/test_import_boundaries.py +0 -0
- {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/test_is_metric_defined_for_slice.py +0 -0
- {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/test_leakage.py +0 -0
- {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/test_leakage_error_paths.py +0 -0
- {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/test_leakage_props.py +0 -0
- {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/test_loaders.py +0 -0
- {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/test_loaders_coverage.py +0 -0
- {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/test_loaders_props.py +0 -0
- {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/test_logging.py +0 -0
- {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/test_manifest.py +0 -0
- {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/test_manifest_contamination_round_trip.py +0 -0
- {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/test_manifest_props.py +0 -0
- {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/test_manifest_validation.py +0 -0
- {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/test_metrics_props.py +0 -0
- {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/test_metrics_stratified_subsets.py +0 -0
- {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/test_metrics_unit.py +0 -0
- {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/test_misc_coverage.py +0 -0
- {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/test_numeric_edge_cases.py +0 -0
- {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/test_operating_points.py +0 -0
- {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/test_operating_points_props.py +0 -0
- {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/test_parallel.py +0 -0
- {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/test_paths.py +0 -0
- {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/test_pipeline_e2e.py +0 -0
- {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/test_plotting_edge.py +0 -0
- {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/test_plotting_smoke.py +0 -0
- {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/test_plotting_visual.py +0 -0
- {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/test_protocol_conformance.py +0 -0
- {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/test_provenance.py +0 -0
- {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/test_public_api.py +0 -0
- {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/test_recall_at_fpr.py +0 -0
- {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/test_reference_equivalence.py +0 -0
- {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/test_reproducibility_integration.py +0 -0
- {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/test_seeds.py +0 -0
- {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/test_splits.py +0 -0
- {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/test_splits_leakage_integration.py +0 -0
- {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/test_splits_props.py +0 -0
- {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/test_text_dedup.py +0 -0
- {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/test_text_dedup_coverage.py +0 -0
- {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/test_text_dedup_props.py +0 -0
- {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/test_text_dedup_strategies.py +0 -0
- {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/test_thresholds.py +0 -0
- {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/test_thresholds_constant_score.py +0 -0
- {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/test_thresholds_coverage.py +0 -0
- {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/test_thresholds_props.py +0 -0
- {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/test_thresholds_research_grounded.py +0 -0
- {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/test_tokenization_leakage_check.py +0 -0
- {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/test_v09_contracts.py +0 -0
|
@@ -5,7 +5,92 @@ All notable changes to this project will be documented in this file.
|
|
|
5
5
|
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
|
|
6
6
|
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
|
7
7
|
|
|
8
|
-
## [
|
|
8
|
+
## [0.44.0] — 2026-05-19 — Defenses + losses: Spotlighting variants + RecallAtLowFPR (closes #50, #51)
|
|
9
|
+
|
|
10
|
+
### Added
|
|
11
|
+
|
|
12
|
+
- `eval_toolkit.preprocessing` — new module with 3 Spotlighting
|
|
13
|
+
structural-defense variants from Hines et al. 2024
|
|
14
|
+
(arXiv 2403.14720): `delimit(text, delimiter='<<')`,
|
|
15
|
+
`datamark(text, marker='^')`, `encode(text, encoding='base64')`,
|
|
16
|
+
plus a `sweep(texts, variants=..., kwargs=...)` batch wrapper that
|
|
17
|
+
returns a `(N*3)`-row DataFrame. Includes a `spotlighting`
|
|
18
|
+
SimpleNamespace exposing the upstream issue's function-style API
|
|
19
|
+
(`spotlighting.delimit(text)`, etc.). Base-install safe (pure
|
|
20
|
+
stdlib). Closes #51.
|
|
21
|
+
- `eval_toolkit.losses` — new module with `RecallAtLowFPR` — the
|
|
22
|
+
Meta Prompt Guard 2 (PG2) training recipe: a differentiable
|
|
23
|
+
approximation of recall-at-fixed-FPR via soft-rank, returning a
|
|
24
|
+
scalar `torch.nn.Module` loss for use in standard training loops.
|
|
25
|
+
Optimizes detector ranking at a constrained operating point
|
|
26
|
+
(e.g. `fpr_target=0.01` → "maximize recall while keeping FPR ≤ 1%").
|
|
27
|
+
Closes #50.
|
|
28
|
+
- New optional extra `[losses] = torch>=2.0`. Granular per the v0.43
|
|
29
|
+
plan Decision 4 — separated from `[probes]` so callers wanting only
|
|
30
|
+
the loss don't have to install the larger transformers stack.
|
|
31
|
+
Shares the torch version pin with `[probes]`.
|
|
32
|
+
|
|
33
|
+
## [0.43.0] — 2026-05-19 — P1 batch: OOD manifest loader + character_injection sweep + ActivationDeltaProbe (closes #48, #49, #53)
|
|
34
|
+
|
|
35
|
+
### Added
|
|
36
|
+
|
|
37
|
+
- `ood_dataset_from_manifest(yaml_path, slices=..., cache_dir=...)` —
|
|
38
|
+
declarative loader for multiple OOD eval slates (BIPIA, AgentDojo,
|
|
39
|
+
InjecAgent, NotInject, PINT, LLMail-Inject, …) into a single
|
|
40
|
+
unified DataFrame with columns `text` / `label` / `source` /
|
|
41
|
+
`row_id` / `sha`. Bytes are downloaded once, sha256-verified
|
|
42
|
+
against the manifest, and cached on-disk keyed by content hash
|
|
43
|
+
(default `~/.cache/eval-toolkit/ood/`). Closes #48 — drops the
|
|
44
|
+
per-source loader boilerplate carried by
|
|
45
|
+
`prompt-injection-portfolio` and `prompt-injection-detection-submission`.
|
|
46
|
+
- `OodManifestLoader` — `DatasetLoader`-Protocol-compliant wrapper
|
|
47
|
+
around the factory, returning `{"all": EvalSlice}` with
|
|
48
|
+
`source` as the default strata column for harness pipelines.
|
|
49
|
+
- `src/eval_toolkit/schemas/ood_manifest.v1.json` — Draft 2020-12
|
|
50
|
+
JSON schema for the OOD manifest YAML; auto-validated by
|
|
51
|
+
`uv run eval-toolkit schemas check`.
|
|
52
|
+
- `eval_toolkit.adversarial` — new module with character-injection
|
|
53
|
+
bypass suite (Microsoft Research 2024, arXiv 2404.13208).
|
|
54
|
+
Six core techniques shipped as frozen-dataclass strategies:
|
|
55
|
+
`ZeroWidthSpaceInjection`, `HomoglyphSubstitution`,
|
|
56
|
+
`DiacriticInjection`, `WhitespaceInjection`, `CaseRandomization`,
|
|
57
|
+
`PunctuationInjection`. All implement a `CharacterInjectionStrategy`
|
|
58
|
+
Protocol with `transform(text: str) -> str`. Six advanced techniques
|
|
59
|
+
(bidi RTL, tag stripping, synonym, token splitting, Unicode
|
|
60
|
+
normalization, invisible chars) scheduled for v0.43.1 — the sweep
|
|
61
|
+
API stabilizes in v0.43.0 so the v0.43.1 additions are pure
|
|
62
|
+
extensions. Closes #49 (core-6).
|
|
63
|
+
- `adversarial.sweep(texts, scorer, techniques="all", threshold=0.5)`
|
|
64
|
+
— Scorer-Protocol-compliant adversarial-robustness sweep. Returns
|
|
65
|
+
a DataFrame with `(text_id, technique, original_score,
|
|
66
|
+
transformed_score, asr)` rows for matrix analysis.
|
|
67
|
+
Aggregate ASR with `df.groupby("technique")["asr"].mean()`.
|
|
68
|
+
- `adversarial.character_injection` — `SimpleNamespace` exposing the
|
|
69
|
+
function-style API from the upstream issue spec
|
|
70
|
+
(`character_injection.zero_width_space(text)`,
|
|
71
|
+
`character_injection.sweep(...)`, etc.).
|
|
72
|
+
- `eval_toolkit.probes` — new module with `ActivationDeltaProbe`:
|
|
73
|
+
TaskTracker-style linear probe over HuggingFace transformer
|
|
74
|
+
hidden-state activation deltas (Abdelnabi et al. 2024,
|
|
75
|
+
arXiv 2406.00799). Backbone-agnostic (encoder OR decoder).
|
|
76
|
+
Sklearn-compatible API: `.fit(clean_texts, injected_texts)`,
|
|
77
|
+
`.predict()` → `(n,)`, `.predict_proba()` → `(n, 2)`,
|
|
78
|
+
`.coef_`, `.classes_`. Activations cached to
|
|
79
|
+
`$XDG_CACHE_HOME/eval-toolkit/probes/` keyed by
|
|
80
|
+
`(backbone, layer_index, aggregate, sha256(text))` so re-runs are
|
|
81
|
+
near-instant. Aggregate modes: `mean`, `max`, `cls`. Closes #53.
|
|
82
|
+
- `Probe` Protocol — minimal sklearn-shaped probe surface (`fit`,
|
|
83
|
+
`predict`, `predict_proba`, `coef_`, `classes_`). Distinct from
|
|
84
|
+
`Scorer` (which returns 1-D `P(positive)`); wrap with
|
|
85
|
+
`lambda p, X: p.predict_proba(X)[:, 1]` to adapt.
|
|
86
|
+
- `ActivationExtractor` Protocol — pluggable hidden-state-extraction
|
|
87
|
+
contract for `ActivationDeltaProbe`; injectable for tests to avoid
|
|
88
|
+
loading a real backbone.
|
|
89
|
+
- New optional extra `[probes] = torch>=2.0, transformers>=4.40`.
|
|
90
|
+
Follows the `[embeddings]` precedent — opt-in only, NOT in
|
|
91
|
+
`[all]` or `[dev]`, since the transitive install is ~600MB+.
|
|
92
|
+
Module is base-install-safe: a friendly `ImportError` fires only
|
|
93
|
+
if you try to use the default HF extractor without the extra.
|
|
9
94
|
|
|
10
95
|
## [0.42.0] — 2026-05-19 — fit_isotonic_binary completes 4-calibrator family (closes #44)
|
|
11
96
|
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: eval-toolkit
|
|
3
|
-
Version: 0.42.0
|
|
3
|
+
Version: 0.44.0
|
|
4
4
|
Summary: Reusable evaluation contracts for binary classification: metrics, bootstrap CIs, calibration, artifacts, and evidence gates.
|
|
5
5
|
Project-URL: Homepage, https://github.com/brandon-behring/eval-toolkit
|
|
6
6
|
Project-URL: Documentation, https://brandon-behring.github.io/eval-toolkit/
|
|
@@ -62,11 +62,16 @@ Requires-Dist: sphinx-design>=0.6; extra == 'docs'
|
|
|
62
62
|
Requires-Dist: sphinx>=7.3; extra == 'docs'
|
|
63
63
|
Provides-Extra: embeddings
|
|
64
64
|
Requires-Dist: sentence-transformers>=3.0; extra == 'embeddings'
|
|
65
|
+
Provides-Extra: losses
|
|
66
|
+
Requires-Dist: torch>=2.0; extra == 'losses'
|
|
65
67
|
Provides-Extra: parquet
|
|
66
68
|
Requires-Dist: pyarrow>=15.0; extra == 'parquet'
|
|
67
69
|
Provides-Extra: plotting
|
|
68
70
|
Requires-Dist: matplotlib>=3.8; extra == 'plotting'
|
|
69
71
|
Requires-Dist: pillow>=10.0; extra == 'plotting'
|
|
72
|
+
Provides-Extra: probes
|
|
73
|
+
Requires-Dist: torch>=2.0; extra == 'probes'
|
|
74
|
+
Requires-Dist: transformers>=4.40; extra == 'probes'
|
|
70
75
|
Provides-Extra: property
|
|
71
76
|
Requires-Dist: hypothesis>=6.100; extra == 'property'
|
|
72
77
|
Provides-Extra: transformers
|
|
@@ -308,22 +313,6 @@ tests with large `max_examples` and a few bootstrap tests with
|
|
|
308
313
|
`n_resamples >= 200`). `make fast` keeps the developer iteration loop
|
|
309
314
|
under ~30 seconds.
|
|
310
315
|
|
|
311
|
-
## Downstream contract testing (v4 sibling-smoke)
|
|
312
|
-
|
|
313
|
-
A separate CI workflow (`.github/workflows/v4-smoke.yml`) checks out
|
|
314
|
-
the downstream consumer `prompt-injection-v4` at `main`, installs it
|
|
315
|
-
with this branch's eval-toolkit as an editable sibling dep (via v4's
|
|
316
|
-
`[tool.uv.sources]`), and runs v4's fast `-m smoke` suite. This catches
|
|
317
|
-
contract regressions at PR time rather than in v4's own CI post-merge.
|
|
318
|
-
|
|
319
|
-
The workflow requires a `HF_TOKEN` repo secret (gated HuggingFace
|
|
320
|
-
datasets used by v4's smoke fixtures). Set it at:
|
|
321
|
-
`https://github.com/brandon-behring/eval-toolkit/settings/secrets/actions`
|
|
322
|
-
|
|
323
|
-
The workflow runs with `continue-on-error: true` during a 2-3 week
|
|
324
|
-
trial period; it'll be promoted to a required gate once the false-
|
|
325
|
-
positive rate (from independent v4 main breakage or HF rate-limits)
|
|
326
|
-
is characterized.
|
|
327
316
|
|
|
328
317
|
## Standards
|
|
329
318
|
|
|
@@ -230,22 +230,6 @@ tests with large `max_examples` and a few bootstrap tests with
|
|
|
230
230
|
`n_resamples >= 200`). `make fast` keeps the developer iteration loop
|
|
231
231
|
under ~30 seconds.
|
|
232
232
|
|
|
233
|
-
## Downstream contract testing (v4 sibling-smoke)
|
|
234
|
-
|
|
235
|
-
A separate CI workflow (`.github/workflows/v4-smoke.yml`) checks out
|
|
236
|
-
the downstream consumer `prompt-injection-v4` at `main`, installs it
|
|
237
|
-
with this branch's eval-toolkit as an editable sibling dep (via v4's
|
|
238
|
-
`[tool.uv.sources]`), and runs v4's fast `-m smoke` suite. This catches
|
|
239
|
-
contract regressions at PR time rather than in v4's own CI post-merge.
|
|
240
|
-
|
|
241
|
-
The workflow requires a `HF_TOKEN` repo secret (gated HuggingFace
|
|
242
|
-
datasets used by v4's smoke fixtures). Set it at:
|
|
243
|
-
`https://github.com/brandon-behring/eval-toolkit/settings/secrets/actions`
|
|
244
|
-
|
|
245
|
-
The workflow runs with `continue-on-error: true` during a 2-3 week
|
|
246
|
-
trial period; it'll be promoted to a required gate once the false-
|
|
247
|
-
positive rate (from independent v4 main breakage or HF rate-limits)
|
|
248
|
-
is characterized.
|
|
249
233
|
|
|
250
234
|
## Standards
|
|
251
235
|
|
|
@@ -63,6 +63,17 @@ embeddings = ["sentence-transformers>=3.0"]
|
|
|
63
63
|
# itself does not import transformers, so the optional install is
|
|
64
64
|
# strictly for callers wanting AutoTokenizer.from_pretrained(...).
|
|
65
65
|
transformers = ["transformers>=4.0"]
|
|
66
|
+
# v0.43.0: ActivationDeltaProbe (TaskTracker-style linear activation probe;
|
|
67
|
+
# closes #53). Pulls torch + transformers (~600MB+ transitive). Follows
|
|
68
|
+
# the [embeddings] precedent: opt-in only, NOT in [all] / [dev]. Module
|
|
69
|
+
# is base-install-safe (lazy imports inside ActivationDeltaProbe methods);
|
|
70
|
+
# the extra is strictly for callers wanting to actually fit / predict.
|
|
71
|
+
probes = ["torch>=2.0", "transformers>=4.40"]
|
|
72
|
+
# v0.44.0: RecallAtLowFPR loss (Meta Prompt Guard 2 recipe; closes #50).
|
|
73
|
+
# torch-only (no transformers); separated from [probes] per Decision 4
|
|
74
|
+
# (granular extras — losses callers should not have to install the larger
|
|
75
|
+
# transformers stack). Shares the torch version pin with [probes].
|
|
76
|
+
losses = ["torch>=2.0"]
|
|
66
77
|
# DEPRECATED (announced v0.30.1, removal v0.33.0).
|
|
67
78
|
#
|
|
68
79
|
# Retained as a transitive no-op so `pip install eval-toolkit[validation]`
|
|
@@ -171,7 +182,7 @@ warn_no_return = true
|
|
|
171
182
|
strict_equality = true
|
|
172
183
|
|
|
173
184
|
[[tool.mypy.overrides]]
|
|
174
|
-
module = ["scipy.*", "sklearn.*", "matplotlib.*", "pandas.*", "yaml.*", "sentence_transformers.*", "joblib.*"]
|
|
185
|
+
module = ["scipy.*", "sklearn.*", "matplotlib.*", "pandas.*", "yaml.*", "sentence_transformers.*", "joblib.*", "torch.*", "transformers.*"]
|
|
175
186
|
ignore_missing_imports = true
|
|
176
187
|
|
|
177
188
|
[tool.pytest.ini_options]
|
|
@@ -30,6 +30,27 @@ _logging.getLogger("eval_toolkit").addHandler(_logging.NullHandler())
|
|
|
30
30
|
# dividers below are informational only; the snapshot in
|
|
31
31
|
# tests/golden/public_api/ reads dict keys + values, not comments.
|
|
32
32
|
_EXPORTS: dict[str, str] = {
|
|
33
|
+
# --- adversarial ---
|
|
34
|
+
"CORE_TECHNIQUES": "eval_toolkit.adversarial",
|
|
35
|
+
"CaseRandomization": "eval_toolkit.adversarial",
|
|
36
|
+
"CharacterInjectionStrategy": "eval_toolkit.adversarial",
|
|
37
|
+
"DiacriticInjection": "eval_toolkit.adversarial",
|
|
38
|
+
"HomoglyphSubstitution": "eval_toolkit.adversarial",
|
|
39
|
+
"PunctuationInjection": "eval_toolkit.adversarial",
|
|
40
|
+
"WhitespaceInjection": "eval_toolkit.adversarial",
|
|
41
|
+
"ZeroWidthSpaceInjection": "eval_toolkit.adversarial",
|
|
42
|
+
"character_injection": "eval_toolkit.adversarial",
|
|
43
|
+
# --- losses ---
|
|
44
|
+
"RecallAtLowFPR": "eval_toolkit.losses",
|
|
45
|
+
# --- preprocessing ---
|
|
46
|
+
"datamark": "eval_toolkit.preprocessing",
|
|
47
|
+
"delimit": "eval_toolkit.preprocessing",
|
|
48
|
+
"encode": "eval_toolkit.preprocessing",
|
|
49
|
+
"spotlighting": "eval_toolkit.preprocessing",
|
|
50
|
+
# --- probes ---
|
|
51
|
+
"ActivationDeltaProbe": "eval_toolkit.probes",
|
|
52
|
+
"ActivationExtractor": "eval_toolkit.probes",
|
|
53
|
+
"Probe": "eval_toolkit.probes",
|
|
33
54
|
# --- analysis ---
|
|
34
55
|
"CsvPredictionReader": "eval_toolkit.analysis",
|
|
35
56
|
"JsonlPredictionReader": "eval_toolkit.analysis",
|
|
@@ -156,8 +177,10 @@ _EXPORTS: dict[str, str] = {
|
|
|
156
177
|
"DataFrameLoader": "eval_toolkit.loaders",
|
|
157
178
|
"DatasetLoader": "eval_toolkit.loaders",
|
|
158
179
|
"HFDatasetsLoader": "eval_toolkit.loaders",
|
|
180
|
+
"OodManifestLoader": "eval_toolkit.loaders",
|
|
159
181
|
"ParquetGlobLoader": "eval_toolkit.loaders",
|
|
160
182
|
"SingleSliceLoader": "eval_toolkit.loaders",
|
|
183
|
+
"ood_dataset_from_manifest": "eval_toolkit.loaders",
|
|
161
184
|
# --- manifest ---
|
|
162
185
|
"MANIFEST_SCHEMA_VERSION": "eval_toolkit.manifest",
|
|
163
186
|
"RunManifest": "eval_toolkit.manifest",
|