eval-toolkit 0.41.0__tar.gz → 0.43.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {eval_toolkit-0.41.0 → eval_toolkit-0.43.0}/CHANGELOG.md +113 -1
- {eval_toolkit-0.41.0 → eval_toolkit-0.43.0}/PKG-INFO +4 -17
- {eval_toolkit-0.41.0 → eval_toolkit-0.43.0}/README.md +0 -16
- {eval_toolkit-0.41.0 → eval_toolkit-0.43.0}/pyproject.toml +6 -0
- {eval_toolkit-0.41.0 → eval_toolkit-0.43.0}/src/eval_toolkit/__init__.py +17 -0
- {eval_toolkit-0.41.0 → eval_toolkit-0.43.0}/src/eval_toolkit/_version.py +1 -1
- eval_toolkit-0.43.0/src/eval_toolkit/adversarial.py +578 -0
- {eval_toolkit-0.41.0 → eval_toolkit-0.43.0}/src/eval_toolkit/calibration.py +78 -0
- {eval_toolkit-0.41.0 → eval_toolkit-0.43.0}/src/eval_toolkit/loaders.py +479 -0
- eval_toolkit-0.43.0/src/eval_toolkit/probes.py +469 -0
- eval_toolkit-0.43.0/src/eval_toolkit/schemas/ood_manifest.v1.json +77 -0
- {eval_toolkit-0.41.0 → eval_toolkit-0.43.0}/tests/golden/public_api/snapshot.json +124 -1
- eval_toolkit-0.43.0/tests/test_adversarial.py +351 -0
- {eval_toolkit-0.41.0 → eval_toolkit-0.43.0}/tests/test_calibration_binary_adapters.py +97 -2
- eval_toolkit-0.43.0/tests/test_ood_loader.py +353 -0
- eval_toolkit-0.43.0/tests/test_probes.py +321 -0
- {eval_toolkit-0.41.0 → eval_toolkit-0.43.0}/tests/test_schemas.py +3 -1
- {eval_toolkit-0.41.0 → eval_toolkit-0.43.0}/.gitignore +0 -0
- {eval_toolkit-0.41.0 → eval_toolkit-0.43.0}/LICENSE +0 -0
- {eval_toolkit-0.41.0 → eval_toolkit-0.43.0}/STYLE.md +0 -0
- {eval_toolkit-0.41.0 → eval_toolkit-0.43.0}/docs/archive/README.md +0 -0
- {eval_toolkit-0.41.0 → eval_toolkit-0.43.0}/docs/research/README.md +0 -0
- {eval_toolkit-0.41.0 → eval_toolkit-0.43.0}/docs/research/datasets/README.md +0 -0
- {eval_toolkit-0.41.0 → eval_toolkit-0.43.0}/docs/research/papers/data-integrity/README.md +0 -0
- {eval_toolkit-0.41.0 → eval_toolkit-0.43.0}/docs/research/papers/eval-ecosystem/README.md +0 -0
- {eval_toolkit-0.41.0 → eval_toolkit-0.43.0}/docs/research/papers/inference/README.md +0 -0
- {eval_toolkit-0.41.0 → eval_toolkit-0.43.0}/docs/research/papers/prompt-injection/README.md +0 -0
- {eval_toolkit-0.41.0 → eval_toolkit-0.43.0}/docs/source/methodology/README.md +0 -0
- {eval_toolkit-0.41.0 → eval_toolkit-0.43.0}/src/eval_toolkit/__main__.py +0 -0
- {eval_toolkit-0.41.0 → eval_toolkit-0.43.0}/src/eval_toolkit/_deprecated.py +0 -0
- {eval_toolkit-0.41.0 → eval_toolkit-0.43.0}/src/eval_toolkit/_parallel.py +0 -0
- {eval_toolkit-0.41.0 → eval_toolkit-0.43.0}/src/eval_toolkit/analysis.py +0 -0
- {eval_toolkit-0.41.0 → eval_toolkit-0.43.0}/src/eval_toolkit/artifacts.py +0 -0
- {eval_toolkit-0.41.0 → eval_toolkit-0.43.0}/src/eval_toolkit/bootstrap.py +0 -0
- {eval_toolkit-0.41.0 → eval_toolkit-0.43.0}/src/eval_toolkit/claims.py +0 -0
- {eval_toolkit-0.41.0 → eval_toolkit-0.43.0}/src/eval_toolkit/config.py +0 -0
- {eval_toolkit-0.41.0 → eval_toolkit-0.43.0}/src/eval_toolkit/docs.py +0 -0
- {eval_toolkit-0.41.0 → eval_toolkit-0.43.0}/src/eval_toolkit/embeddings.py +0 -0
- {eval_toolkit-0.41.0 → eval_toolkit-0.43.0}/src/eval_toolkit/evidence.py +0 -0
- {eval_toolkit-0.41.0 → eval_toolkit-0.43.0}/src/eval_toolkit/harness.py +0 -0
- {eval_toolkit-0.41.0 → eval_toolkit-0.43.0}/src/eval_toolkit/leakage.py +0 -0
- {eval_toolkit-0.41.0 → eval_toolkit-0.43.0}/src/eval_toolkit/manifest.py +0 -0
- {eval_toolkit-0.41.0 → eval_toolkit-0.43.0}/src/eval_toolkit/metrics.py +0 -0
- {eval_toolkit-0.41.0 → eval_toolkit-0.43.0}/src/eval_toolkit/operating_points.py +0 -0
- {eval_toolkit-0.41.0 → eval_toolkit-0.43.0}/src/eval_toolkit/paths.py +0 -0
- {eval_toolkit-0.41.0 → eval_toolkit-0.43.0}/src/eval_toolkit/plotting.py +0 -0
- {eval_toolkit-0.41.0 → eval_toolkit-0.43.0}/src/eval_toolkit/protocols.py +0 -0
- {eval_toolkit-0.41.0 → eval_toolkit-0.43.0}/src/eval_toolkit/provenance.py +0 -0
- {eval_toolkit-0.41.0 → eval_toolkit-0.43.0}/src/eval_toolkit/py.typed +0 -0
- {eval_toolkit-0.41.0 → eval_toolkit-0.43.0}/src/eval_toolkit/schemas/manifest.v1.json +0 -0
- {eval_toolkit-0.41.0 → eval_toolkit-0.43.0}/src/eval_toolkit/schemas/manifest.v2.json +0 -0
- {eval_toolkit-0.41.0 → eval_toolkit-0.43.0}/src/eval_toolkit/schemas/manifest.v3.json +0 -0
- {eval_toolkit-0.41.0 → eval_toolkit-0.43.0}/src/eval_toolkit/schemas/results.v1.json +0 -0
- {eval_toolkit-0.41.0 → eval_toolkit-0.43.0}/src/eval_toolkit/schemas/results_full.v1.json +0 -0
- {eval_toolkit-0.41.0 → eval_toolkit-0.43.0}/src/eval_toolkit/seeds.py +0 -0
- {eval_toolkit-0.41.0 → eval_toolkit-0.43.0}/src/eval_toolkit/splits.py +0 -0
- {eval_toolkit-0.41.0 → eval_toolkit-0.43.0}/src/eval_toolkit/text_dedup.py +0 -0
- {eval_toolkit-0.41.0 → eval_toolkit-0.43.0}/src/eval_toolkit/thresholds.py +0 -0
- {eval_toolkit-0.41.0 → eval_toolkit-0.43.0}/tests/baseline/test_plotting_visual/plot_bootstrap_distribution.png +0 -0
- {eval_toolkit-0.41.0 → eval_toolkit-0.43.0}/tests/baseline/test_plotting_visual/plot_confusion_matrix_grid.png +0 -0
- {eval_toolkit-0.41.0 → eval_toolkit-0.43.0}/tests/baseline/test_plotting_visual/plot_lift_ci.png +0 -0
- {eval_toolkit-0.41.0 → eval_toolkit-0.43.0}/tests/baseline/test_plotting_visual/plot_metric_bars.png +0 -0
- {eval_toolkit-0.41.0 → eval_toolkit-0.43.0}/tests/baseline/test_plotting_visual/plot_pareto_frontier.png +0 -0
- {eval_toolkit-0.41.0 → eval_toolkit-0.43.0}/tests/baseline/test_plotting_visual/plot_pr_curve.png +0 -0
- {eval_toolkit-0.41.0 → eval_toolkit-0.43.0}/tests/baseline/test_plotting_visual/plot_reliability_diagram.png +0 -0
- {eval_toolkit-0.41.0 → eval_toolkit-0.43.0}/tests/baseline/test_plotting_visual/plot_roc_curve.png +0 -0
- {eval_toolkit-0.41.0 → eval_toolkit-0.43.0}/tests/baseline/test_plotting_visual/plot_score_histograms.png +0 -0
- {eval_toolkit-0.41.0 → eval_toolkit-0.43.0}/tests/baseline/test_plotting_visual/plot_slice_metric_heatmap.png +0 -0
- {eval_toolkit-0.41.0 → eval_toolkit-0.43.0}/tests/benchmarks/__init__.py +0 -0
- {eval_toolkit-0.41.0 → eval_toolkit-0.43.0}/tests/benchmarks/test_kernel_benchmarks.py +0 -0
- {eval_toolkit-0.41.0 → eval_toolkit-0.43.0}/tests/conftest.py +0 -0
- {eval_toolkit-0.41.0 → eval_toolkit-0.43.0}/tests/golden/bootstrap_ci/cases.json +0 -0
- {eval_toolkit-0.41.0 → eval_toolkit-0.43.0}/tests/golden/data/dedup_holdout.jsonl +0 -0
- {eval_toolkit-0.41.0 → eval_toolkit-0.43.0}/tests/golden/data/dedup_holdout_expected.json +0 -0
- {eval_toolkit-0.41.0 → eval_toolkit-0.43.0}/tests/golden/data/dedup_holdout_provenance.md +0 -0
- {eval_toolkit-0.41.0 → eval_toolkit-0.43.0}/tests/golden/docs/expected.md +0 -0
- {eval_toolkit-0.41.0 → eval_toolkit-0.43.0}/tests/golden/docs/input.md +0 -0
- {eval_toolkit-0.41.0 → eval_toolkit-0.43.0}/tests/golden/docs/metrics.json +0 -0
- {eval_toolkit-0.41.0 → eval_toolkit-0.43.0}/tests/golden/test_dedup_holdout_calibration.py +0 -0
- {eval_toolkit-0.41.0 → eval_toolkit-0.43.0}/tests/strategies.py +0 -0
- {eval_toolkit-0.41.0 → eval_toolkit-0.43.0}/tests/test_analysis.py +0 -0
- {eval_toolkit-0.41.0 → eval_toolkit-0.43.0}/tests/test_artifacts.py +0 -0
- {eval_toolkit-0.41.0 → eval_toolkit-0.43.0}/tests/test_block_bootstrap_on_folds.py +0 -0
- {eval_toolkit-0.41.0 → eval_toolkit-0.43.0}/tests/test_bootstrap_calibration_mc.py +0 -0
- {eval_toolkit-0.41.0 → eval_toolkit-0.43.0}/tests/test_bootstrap_edge_cases.py +0 -0
- {eval_toolkit-0.41.0 → eval_toolkit-0.43.0}/tests/test_bootstrap_golden.py +0 -0
- {eval_toolkit-0.41.0 → eval_toolkit-0.43.0}/tests/test_bootstrap_njobs.py +0 -0
- {eval_toolkit-0.41.0 → eval_toolkit-0.43.0}/tests/test_bootstrap_props.py +0 -0
- {eval_toolkit-0.41.0 → eval_toolkit-0.43.0}/tests/test_bootstrap_research_grounded.py +0 -0
- {eval_toolkit-0.41.0 → eval_toolkit-0.43.0}/tests/test_bootstrap_unit.py +0 -0
- {eval_toolkit-0.41.0 → eval_toolkit-0.43.0}/tests/test_calibration_bootstrap_chain.py +0 -0
- {eval_toolkit-0.41.0 → eval_toolkit-0.43.0}/tests/test_calibration_determinism.py +0 -0
- {eval_toolkit-0.41.0 → eval_toolkit-0.43.0}/tests/test_calibration_optimization_failures.py +0 -0
- {eval_toolkit-0.41.0 → eval_toolkit-0.43.0}/tests/test_calibration_props.py +0 -0
- {eval_toolkit-0.41.0 → eval_toolkit-0.43.0}/tests/test_calibration_research_grounded.py +0 -0
- {eval_toolkit-0.41.0 → eval_toolkit-0.43.0}/tests/test_calibration_unit.py +0 -0
- {eval_toolkit-0.41.0 → eval_toolkit-0.43.0}/tests/test_claims.py +0 -0
- {eval_toolkit-0.41.0 → eval_toolkit-0.43.0}/tests/test_claims_coverage.py +0 -0
- {eval_toolkit-0.41.0 → eval_toolkit-0.43.0}/tests/test_claims_props.py +0 -0
- {eval_toolkit-0.41.0 → eval_toolkit-0.43.0}/tests/test_cli.py +0 -0
- {eval_toolkit-0.41.0 → eval_toolkit-0.43.0}/tests/test_config.py +0 -0
- {eval_toolkit-0.41.0 → eval_toolkit-0.43.0}/tests/test_coverage_bootstrap.py +0 -0
- {eval_toolkit-0.41.0 → eval_toolkit-0.43.0}/tests/test_coverage_calibration.py +0 -0
- {eval_toolkit-0.41.0 → eval_toolkit-0.43.0}/tests/test_coverage_harness.py +0 -0
- {eval_toolkit-0.41.0 → eval_toolkit-0.43.0}/tests/test_coverage_metrics.py +0 -0
- {eval_toolkit-0.41.0 → eval_toolkit-0.43.0}/tests/test_coverage_plotting.py +0 -0
- {eval_toolkit-0.41.0 → eval_toolkit-0.43.0}/tests/test_croissant_e2e.py +0 -0
- {eval_toolkit-0.41.0 → eval_toolkit-0.43.0}/tests/test_dedup_split_leakage_chain.py +0 -0
- {eval_toolkit-0.41.0 → eval_toolkit-0.43.0}/tests/test_deprecations.py +0 -0
- {eval_toolkit-0.41.0 → eval_toolkit-0.43.0}/tests/test_docs_golden.py +0 -0
- {eval_toolkit-0.41.0 → eval_toolkit-0.43.0}/tests/test_docs_props.py +0 -0
- {eval_toolkit-0.41.0 → eval_toolkit-0.43.0}/tests/test_embeddings.py +0 -0
- {eval_toolkit-0.41.0 → eval_toolkit-0.43.0}/tests/test_evidence_validators.py +0 -0
- {eval_toolkit-0.41.0 → eval_toolkit-0.43.0}/tests/test_harness_edge_cases.py +0 -0
- {eval_toolkit-0.41.0 → eval_toolkit-0.43.0}/tests/test_harness_fault_injection.py +0 -0
- {eval_toolkit-0.41.0 → eval_toolkit-0.43.0}/tests/test_harness_folded.py +0 -0
- {eval_toolkit-0.41.0 → eval_toolkit-0.43.0}/tests/test_harness_internals.py +0 -0
- {eval_toolkit-0.41.0 → eval_toolkit-0.43.0}/tests/test_harness_metric_options.py +0 -0
- {eval_toolkit-0.41.0 → eval_toolkit-0.43.0}/tests/test_harness_parallelism.py +0 -0
- {eval_toolkit-0.41.0 → eval_toolkit-0.43.0}/tests/test_harness_smoke.py +0 -0
- {eval_toolkit-0.41.0 → eval_toolkit-0.43.0}/tests/test_import_boundaries.py +0 -0
- {eval_toolkit-0.41.0 → eval_toolkit-0.43.0}/tests/test_is_metric_defined_for_slice.py +0 -0
- {eval_toolkit-0.41.0 → eval_toolkit-0.43.0}/tests/test_leakage.py +0 -0
- {eval_toolkit-0.41.0 → eval_toolkit-0.43.0}/tests/test_leakage_error_paths.py +0 -0
- {eval_toolkit-0.41.0 → eval_toolkit-0.43.0}/tests/test_leakage_props.py +0 -0
- {eval_toolkit-0.41.0 → eval_toolkit-0.43.0}/tests/test_loaders.py +0 -0
- {eval_toolkit-0.41.0 → eval_toolkit-0.43.0}/tests/test_loaders_coverage.py +0 -0
- {eval_toolkit-0.41.0 → eval_toolkit-0.43.0}/tests/test_loaders_props.py +0 -0
- {eval_toolkit-0.41.0 → eval_toolkit-0.43.0}/tests/test_logging.py +0 -0
- {eval_toolkit-0.41.0 → eval_toolkit-0.43.0}/tests/test_manifest.py +0 -0
- {eval_toolkit-0.41.0 → eval_toolkit-0.43.0}/tests/test_manifest_contamination_round_trip.py +0 -0
- {eval_toolkit-0.41.0 → eval_toolkit-0.43.0}/tests/test_manifest_props.py +0 -0
- {eval_toolkit-0.41.0 → eval_toolkit-0.43.0}/tests/test_manifest_validation.py +0 -0
- {eval_toolkit-0.41.0 → eval_toolkit-0.43.0}/tests/test_metrics_props.py +0 -0
- {eval_toolkit-0.41.0 → eval_toolkit-0.43.0}/tests/test_metrics_stratified_subsets.py +0 -0
- {eval_toolkit-0.41.0 → eval_toolkit-0.43.0}/tests/test_metrics_unit.py +0 -0
- {eval_toolkit-0.41.0 → eval_toolkit-0.43.0}/tests/test_misc_coverage.py +0 -0
- {eval_toolkit-0.41.0 → eval_toolkit-0.43.0}/tests/test_numeric_edge_cases.py +0 -0
- {eval_toolkit-0.41.0 → eval_toolkit-0.43.0}/tests/test_operating_points.py +0 -0
- {eval_toolkit-0.41.0 → eval_toolkit-0.43.0}/tests/test_operating_points_props.py +0 -0
- {eval_toolkit-0.41.0 → eval_toolkit-0.43.0}/tests/test_parallel.py +0 -0
- {eval_toolkit-0.41.0 → eval_toolkit-0.43.0}/tests/test_paths.py +0 -0
- {eval_toolkit-0.41.0 → eval_toolkit-0.43.0}/tests/test_pipeline_e2e.py +0 -0
- {eval_toolkit-0.41.0 → eval_toolkit-0.43.0}/tests/test_plotting_edge.py +0 -0
- {eval_toolkit-0.41.0 → eval_toolkit-0.43.0}/tests/test_plotting_smoke.py +0 -0
- {eval_toolkit-0.41.0 → eval_toolkit-0.43.0}/tests/test_plotting_visual.py +0 -0
- {eval_toolkit-0.41.0 → eval_toolkit-0.43.0}/tests/test_protocol_conformance.py +0 -0
- {eval_toolkit-0.41.0 → eval_toolkit-0.43.0}/tests/test_provenance.py +0 -0
- {eval_toolkit-0.41.0 → eval_toolkit-0.43.0}/tests/test_public_api.py +0 -0
- {eval_toolkit-0.41.0 → eval_toolkit-0.43.0}/tests/test_recall_at_fpr.py +0 -0
- {eval_toolkit-0.41.0 → eval_toolkit-0.43.0}/tests/test_reference_equivalence.py +0 -0
- {eval_toolkit-0.41.0 → eval_toolkit-0.43.0}/tests/test_reproducibility_integration.py +0 -0
- {eval_toolkit-0.41.0 → eval_toolkit-0.43.0}/tests/test_seeds.py +0 -0
- {eval_toolkit-0.41.0 → eval_toolkit-0.43.0}/tests/test_splits.py +0 -0
- {eval_toolkit-0.41.0 → eval_toolkit-0.43.0}/tests/test_splits_leakage_integration.py +0 -0
- {eval_toolkit-0.41.0 → eval_toolkit-0.43.0}/tests/test_splits_props.py +0 -0
- {eval_toolkit-0.41.0 → eval_toolkit-0.43.0}/tests/test_text_dedup.py +0 -0
- {eval_toolkit-0.41.0 → eval_toolkit-0.43.0}/tests/test_text_dedup_coverage.py +0 -0
- {eval_toolkit-0.41.0 → eval_toolkit-0.43.0}/tests/test_text_dedup_props.py +0 -0
- {eval_toolkit-0.41.0 → eval_toolkit-0.43.0}/tests/test_text_dedup_strategies.py +0 -0
- {eval_toolkit-0.41.0 → eval_toolkit-0.43.0}/tests/test_thresholds.py +0 -0
- {eval_toolkit-0.41.0 → eval_toolkit-0.43.0}/tests/test_thresholds_constant_score.py +0 -0
- {eval_toolkit-0.41.0 → eval_toolkit-0.43.0}/tests/test_thresholds_coverage.py +0 -0
- {eval_toolkit-0.41.0 → eval_toolkit-0.43.0}/tests/test_thresholds_props.py +0 -0
- {eval_toolkit-0.41.0 → eval_toolkit-0.43.0}/tests/test_thresholds_research_grounded.py +0 -0
- {eval_toolkit-0.41.0 → eval_toolkit-0.43.0}/tests/test_tokenization_leakage_check.py +0 -0
- {eval_toolkit-0.41.0 → eval_toolkit-0.43.0}/tests/test_v09_contracts.py +0 -0
|
@@ -5,7 +5,119 @@ All notable changes to this project will be documented in this file.
|
|
|
5
5
|
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
|
|
6
6
|
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
|
7
7
|
|
|
8
|
-
## [
|
|
8
|
+
## [0.43.0] — 2026-05-19 — P1 batch: OOD manifest loader + character_injection sweep + ActivationDeltaProbe (closes #48, #49, #53)
|
|
9
|
+
|
|
10
|
+
### Added
|
|
11
|
+
|
|
12
|
+
- `ood_dataset_from_manifest(yaml_path, slices=..., cache_dir=...)` —
|
|
13
|
+
declarative loader for multiple OOD eval slates (BIPIA, AgentDojo,
|
|
14
|
+
InjecAgent, NotInject, PINT, LLMail-Inject, …) into a single
|
|
15
|
+
unified DataFrame with columns `text` / `label` / `source` /
|
|
16
|
+
`row_id` / `sha`. Bytes are downloaded once, sha256-verified
|
|
17
|
+
against the manifest, and cached on-disk keyed by content hash
|
|
18
|
+
(default `~/.cache/eval-toolkit/ood/`). Closes #48 — drops the
|
|
19
|
+
per-source loader boilerplate carried by
|
|
20
|
+
`prompt-injection-portfolio` and `prompt-injection-detection-submission`.
|
|
21
|
+
- `OodManifestLoader` — `DatasetLoader`-Protocol-compliant wrapper
|
|
22
|
+
around the factory, returning `{"all": EvalSlice}` with
|
|
23
|
+
`source` as the default strata column for harness pipelines.
|
|
24
|
+
- `src/eval_toolkit/schemas/ood_manifest.v1.json` — Draft 2020-12
|
|
25
|
+
JSON schema for the OOD manifest YAML; auto-validated by
|
|
26
|
+
`uv run eval-toolkit schemas check`.
|
|
27
|
+
- `eval_toolkit.adversarial` — new module with character-injection
|
|
28
|
+
bypass suite (Microsoft Research 2024, arXiv 2404.13208).
|
|
29
|
+
Six core techniques shipped as frozen-dataclass strategies:
|
|
30
|
+
`ZeroWidthSpaceInjection`, `HomoglyphSubstitution`,
|
|
31
|
+
`DiacriticInjection`, `WhitespaceInjection`, `CaseRandomization`,
|
|
32
|
+
`PunctuationInjection`. All implement a `CharacterInjectionStrategy`
|
|
33
|
+
Protocol with `transform(text: str) -> str`. Six advanced techniques
|
|
34
|
+
(bidi RTL, tag stripping, synonym, token splitting, Unicode
|
|
35
|
+
normalization, invisible chars) are scheduled for v0.43.1 — the sweep
|
|
36
|
+
API stabilizes in v0.43.0 so the v0.43.1 additions are pure
|
|
37
|
+
extensions. Closes #49 (core-6).
|
|
38
|
+
- `adversarial.sweep(texts, scorer, techniques="all", threshold=0.5)`
|
|
39
|
+
— Scorer-Protocol-compliant adversarial-robustness sweep. Returns
|
|
40
|
+
a DataFrame with `(text_id, technique, original_score,
|
|
41
|
+
transformed_score, asr)` rows for matrix analysis.
|
|
42
|
+
Aggregate ASR with `df.groupby("technique")["asr"].mean()`.
|
|
43
|
+
- `adversarial.character_injection` — `SimpleNamespace` exposing the
|
|
44
|
+
function-style API from the upstream issue spec
|
|
45
|
+
(`character_injection.zero_width_space(text)`,
|
|
46
|
+
`character_injection.sweep(...)`, etc.).
|
|
47
|
+
- `eval_toolkit.probes` — new module with `ActivationDeltaProbe`:
|
|
48
|
+
TaskTracker-style linear probe over HuggingFace transformer
|
|
49
|
+
hidden-state activation deltas (Abdelnabi et al. 2024,
|
|
50
|
+
arXiv 2406.00799). Backbone-agnostic (encoder OR decoder).
|
|
51
|
+
Sklearn-compatible API: `.fit(clean_texts, injected_texts)`,
|
|
52
|
+
`.predict()` → `(n,)`, `.predict_proba()` → `(n, 2)`,
|
|
53
|
+
`.coef_`, `.classes_`. Activations cached to
|
|
54
|
+
`$XDG_CACHE_HOME/eval-toolkit/probes/` keyed by
|
|
55
|
+
`(backbone, layer_index, aggregate, sha256(text))` so re-runs are
|
|
56
|
+
near-instant. Aggregate modes: `mean`, `max`, `cls`. Closes #53.
|
|
57
|
+
- `Probe` Protocol — minimal sklearn-shaped probe surface (`fit`,
|
|
58
|
+
`predict`, `predict_proba`, `coef_`, `classes_`). Distinct from
|
|
59
|
+
`Scorer` (which returns 1-D `P(positive)`); wrap with
|
|
60
|
+
`lambda p, X: p.predict_proba(X)[:, 1]` to adapt.
|
|
61
|
+
- `ActivationExtractor` Protocol — pluggable hidden-state-extraction
|
|
62
|
+
contract for `ActivationDeltaProbe`; injectable for tests to avoid
|
|
63
|
+
loading a real backbone.
|
|
64
|
+
- New optional extra `[probes] = torch>=2.0, transformers>=4.40`.
|
|
65
|
+
Follows the `[embeddings]` precedent — opt-in only, NOT in
|
|
66
|
+
`[all]` or `[dev]`, since the transitive install is ~600MB+.
|
|
67
|
+
Module is base-install-safe: a friendly `ImportError` fires only
|
|
68
|
+
if you try to use the default HF extractor without the extra.
|
|
69
|
+
|
|
70
|
+
## [0.42.0] — 2026-05-19 — fit_isotonic_binary completes 4-calibrator family (closes #44)
|
|
71
|
+
|
|
72
|
+
Final element of the binary scalar-prob calibrator family started by
|
|
73
|
+
`fit_temperature_binary` (v0.35.0). All four now uniformly return
|
|
74
|
+
`(params, apply)`:
|
|
75
|
+
|
|
76
|
+
| Function | Params | Shipped |
|
|
77
|
+
|---|---|---|
|
|
78
|
+
| `fit_temperature_binary` | `(T,)` — single float | v0.35.0 |
|
|
79
|
+
| `fit_isotonic_binary` | `None` — non-parametric | **v0.42.0** |
|
|
80
|
+
| `fit_platt_binary` | `(a, b)` | v0.40.0 |
|
|
81
|
+
| `fit_beta_binary` | `(a, b, c)` | v0.40.0 |
|
|
82
|
+
|
|
83
|
+
Consumer code can now iterate the family with a single shape and
|
|
84
|
+
distinguish parametric from non-parametric calibrators via
|
|
85
|
+
`if params is not None`:
|
|
86
|
+
|
|
87
|
+
```text
|
|
88
|
+
CALIBRATORS = {
|
|
89
|
+
"temperature": fit_temperature_binary,
|
|
90
|
+
"isotonic": fit_isotonic_binary,
|
|
91
|
+
"platt": fit_platt_binary,
|
|
92
|
+
"beta": fit_beta_binary,
|
|
93
|
+
}
|
|
94
|
+
for name, fit_fn in CALIBRATORS.items():
|
|
95
|
+
params, apply = fit_fn(y_val, p_val)
|
|
96
|
+
calibrated = apply(p_test)
|
|
97
|
+
if params is not None:
|
|
98
|
+
manifest.record(f"{name}_params", params)
|
|
99
|
+
```
|
|
100
|
+
|
|
101
|
+
This matches the consumer's calibration-battery pattern in
|
|
102
|
+
`prompt-injection-detection-prototype` (their ADR-056 supersedes
|
|
103
|
+
ADR-023 to adopt the canonical `(params, apply)` shape across the
|
|
104
|
+
full 4-calibrator audit battery).
|
|
105
|
+
|
|
106
|
+
### Added
|
|
107
|
+
|
|
108
|
+
- **`eval_toolkit.fit_isotonic_binary(y_true, y_score) -> (None,
|
|
109
|
+
apply)`** — thin wrapper over `fit_isotonic_calibrator`. The
|
|
110
|
+
`None` in the params slot encodes "non-parametric" (isotonic
|
|
111
|
+
regression is a monotone step function, no scalar params to log).
|
|
112
|
+
- 6 new unit tests in `tests/test_calibration_binary_adapters.py`
|
|
113
|
+
including a 4-calibrator family-iteration integration test that
|
|
114
|
+
verifies the `None`-vs-tuple convention.
|
|
115
|
+
|
|
116
|
+
### Protocol stability
|
|
117
|
+
|
|
118
|
+
Additive only. No Tier-2 Protocol shape edits. v0.42 is the third
|
|
119
|
+
consecutive minor release without Protocol changes (v0.40 + v0.41 + v0.42). Gate 2
|
|
120
|
+
stays MET.
|
|
9
121
|
|
|
10
122
|
## [0.41.0] — 2026-05-18 — Croissant end-to-end (closes #42, v1.0 Gate 4 MET)
|
|
11
123
|
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: eval-toolkit
|
|
3
|
-
Version: 0.41.0
|
|
3
|
+
Version: 0.43.0
|
|
4
4
|
Summary: Reusable evaluation contracts for binary classification: metrics, bootstrap CIs, calibration, artifacts, and evidence gates.
|
|
5
5
|
Project-URL: Homepage, https://github.com/brandon-behring/eval-toolkit
|
|
6
6
|
Project-URL: Documentation, https://brandon-behring.github.io/eval-toolkit/
|
|
@@ -67,6 +67,9 @@ Requires-Dist: pyarrow>=15.0; extra == 'parquet'
|
|
|
67
67
|
Provides-Extra: plotting
|
|
68
68
|
Requires-Dist: matplotlib>=3.8; extra == 'plotting'
|
|
69
69
|
Requires-Dist: pillow>=10.0; extra == 'plotting'
|
|
70
|
+
Provides-Extra: probes
|
|
71
|
+
Requires-Dist: torch>=2.0; extra == 'probes'
|
|
72
|
+
Requires-Dist: transformers>=4.40; extra == 'probes'
|
|
70
73
|
Provides-Extra: property
|
|
71
74
|
Requires-Dist: hypothesis>=6.100; extra == 'property'
|
|
72
75
|
Provides-Extra: transformers
|
|
@@ -308,22 +311,6 @@ tests with large `max_examples` and a few bootstrap tests with
|
|
|
308
311
|
`n_resamples >= 200`). `make fast` keeps the developer iteration loop
|
|
309
312
|
under ~30 seconds.
|
|
310
313
|
|
|
311
|
-
## Downstream contract testing (v4 sibling-smoke)
|
|
312
|
-
|
|
313
|
-
A separate CI workflow (`.github/workflows/v4-smoke.yml`) checks out
|
|
314
|
-
the downstream consumer `prompt-injection-v4` at `main`, installs it
|
|
315
|
-
with this branch's eval-toolkit as an editable sibling dep (via v4's
|
|
316
|
-
`[tool.uv.sources]`), and runs v4's fast `-m smoke` suite. This catches
|
|
317
|
-
contract regressions at PR time rather than in v4's own CI post-merge.
|
|
318
|
-
|
|
319
|
-
The workflow requires a `HF_TOKEN` repo secret (gated HuggingFace
|
|
320
|
-
datasets used by v4's smoke fixtures). Set it at:
|
|
321
|
-
`https://github.com/brandon-behring/eval-toolkit/settings/secrets/actions`
|
|
322
|
-
|
|
323
|
-
The workflow runs with `continue-on-error: true` during a 2-3 week
|
|
324
|
-
trial period; it'll be promoted to a required gate once the false-
|
|
325
|
-
positive rate (from independent v4 main breakage or HF rate-limits)
|
|
326
|
-
is characterized.
|
|
327
314
|
|
|
328
315
|
## Standards
|
|
329
316
|
|
|
@@ -230,22 +230,6 @@ tests with large `max_examples` and a few bootstrap tests with
|
|
|
230
230
|
`n_resamples >= 200`). `make fast` keeps the developer iteration loop
|
|
231
231
|
under ~30 seconds.
|
|
232
232
|
|
|
233
|
-
## Downstream contract testing (v4 sibling-smoke)
|
|
234
|
-
|
|
235
|
-
A separate CI workflow (`.github/workflows/v4-smoke.yml`) checks out
|
|
236
|
-
the downstream consumer `prompt-injection-v4` at `main`, installs it
|
|
237
|
-
with this branch's eval-toolkit as an editable sibling dep (via v4's
|
|
238
|
-
`[tool.uv.sources]`), and runs v4's fast `-m smoke` suite. This catches
|
|
239
|
-
contract regressions at PR time rather than in v4's own CI post-merge.
|
|
240
|
-
|
|
241
|
-
The workflow requires a `HF_TOKEN` repo secret (gated HuggingFace
|
|
242
|
-
datasets used by v4's smoke fixtures). Set it at:
|
|
243
|
-
`https://github.com/brandon-behring/eval-toolkit/settings/secrets/actions`
|
|
244
|
-
|
|
245
|
-
The workflow runs with `continue-on-error: true` during a 2-3 week
|
|
246
|
-
trial period; it'll be promoted to a required gate once the false-
|
|
247
|
-
positive rate (from independent v4 main breakage or HF rate-limits)
|
|
248
|
-
is characterized.
|
|
249
233
|
|
|
250
234
|
## Standards
|
|
251
235
|
|
|
@@ -63,6 +63,12 @@ embeddings = ["sentence-transformers>=3.0"]
|
|
|
63
63
|
# itself does not import transformers, so the optional install is
|
|
64
64
|
# strictly for callers wanting AutoTokenizer.from_pretrained(...).
|
|
65
65
|
transformers = ["transformers>=4.0"]
|
|
66
|
+
# v0.43.0: ActivationDeltaProbe (TaskTracker-style linear activation probe;
|
|
67
|
+
# closes #53). Pulls torch + transformers (~600MB+ transitive). Follows
|
|
68
|
+
# the [embeddings] precedent: opt-in only, NOT in [all] / [dev]. Module
|
|
69
|
+
# is base-install-safe (lazy imports inside ActivationDeltaProbe methods);
|
|
70
|
+
# the extra is strictly for callers wanting to actually fit / predict.
|
|
71
|
+
probes = ["torch>=2.0", "transformers>=4.40"]
|
|
66
72
|
# DEPRECATED (announced v0.30.1, removal v0.33.0).
|
|
67
73
|
#
|
|
68
74
|
# Retained as a transitive no-op so `pip install eval-toolkit[validation]`
|
|
@@ -30,6 +30,20 @@ _logging.getLogger("eval_toolkit").addHandler(_logging.NullHandler())
|
|
|
30
30
|
# dividers below are informational only; the snapshot in
|
|
31
31
|
# tests/golden/public_api/ reads dict keys + values, not comments.
|
|
32
32
|
_EXPORTS: dict[str, str] = {
|
|
33
|
+
# --- adversarial ---
|
|
34
|
+
"CORE_TECHNIQUES": "eval_toolkit.adversarial",
|
|
35
|
+
"CaseRandomization": "eval_toolkit.adversarial",
|
|
36
|
+
"CharacterInjectionStrategy": "eval_toolkit.adversarial",
|
|
37
|
+
"DiacriticInjection": "eval_toolkit.adversarial",
|
|
38
|
+
"HomoglyphSubstitution": "eval_toolkit.adversarial",
|
|
39
|
+
"PunctuationInjection": "eval_toolkit.adversarial",
|
|
40
|
+
"WhitespaceInjection": "eval_toolkit.adversarial",
|
|
41
|
+
"ZeroWidthSpaceInjection": "eval_toolkit.adversarial",
|
|
42
|
+
"character_injection": "eval_toolkit.adversarial",
|
|
43
|
+
# --- probes ---
|
|
44
|
+
"ActivationDeltaProbe": "eval_toolkit.probes",
|
|
45
|
+
"ActivationExtractor": "eval_toolkit.probes",
|
|
46
|
+
"Probe": "eval_toolkit.probes",
|
|
33
47
|
# --- analysis ---
|
|
34
48
|
"CsvPredictionReader": "eval_toolkit.analysis",
|
|
35
49
|
"JsonlPredictionReader": "eval_toolkit.analysis",
|
|
@@ -85,6 +99,7 @@ _EXPORTS: dict[str, str] = {
|
|
|
85
99
|
"bayes_optimal_threshold": "eval_toolkit.calibration",
|
|
86
100
|
"fit_beta_binary": "eval_toolkit.calibration",
|
|
87
101
|
"fit_beta_calibrator": "eval_toolkit.calibration",
|
|
102
|
+
"fit_isotonic_binary": "eval_toolkit.calibration",
|
|
88
103
|
"fit_isotonic_calibrator": "eval_toolkit.calibration",
|
|
89
104
|
"fit_platt_binary": "eval_toolkit.calibration",
|
|
90
105
|
"fit_platt_calibrator": "eval_toolkit.calibration",
|
|
@@ -155,8 +170,10 @@ _EXPORTS: dict[str, str] = {
|
|
|
155
170
|
"DataFrameLoader": "eval_toolkit.loaders",
|
|
156
171
|
"DatasetLoader": "eval_toolkit.loaders",
|
|
157
172
|
"HFDatasetsLoader": "eval_toolkit.loaders",
|
|
173
|
+
"OodManifestLoader": "eval_toolkit.loaders",
|
|
158
174
|
"ParquetGlobLoader": "eval_toolkit.loaders",
|
|
159
175
|
"SingleSliceLoader": "eval_toolkit.loaders",
|
|
176
|
+
"ood_dataset_from_manifest": "eval_toolkit.loaders",
|
|
160
177
|
# --- manifest ---
|
|
161
178
|
"MANIFEST_SCHEMA_VERSION": "eval_toolkit.manifest",
|
|
162
179
|
"RunManifest": "eval_toolkit.manifest",
|