eval-toolkit 0.43.0__tar.gz → 0.44.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {eval_toolkit-0.43.0 → eval_toolkit-0.44.0}/CHANGELOG.md +25 -0
- {eval_toolkit-0.43.0 → eval_toolkit-0.44.0}/PKG-INFO +3 -1
- {eval_toolkit-0.43.0 → eval_toolkit-0.44.0}/pyproject.toml +6 -1
- {eval_toolkit-0.43.0 → eval_toolkit-0.44.0}/src/eval_toolkit/__init__.py +7 -0
- {eval_toolkit-0.43.0 → eval_toolkit-0.44.0}/src/eval_toolkit/_version.py +1 -1
- eval_toolkit-0.44.0/src/eval_toolkit/losses.py +225 -0
- eval_toolkit-0.44.0/src/eval_toolkit/preprocessing.py +259 -0
- {eval_toolkit-0.43.0 → eval_toolkit-0.44.0}/src/eval_toolkit/probes.py +2 -2
- {eval_toolkit-0.43.0 → eval_toolkit-0.44.0}/src/eval_toolkit/seeds.py +1 -1
- {eval_toolkit-0.43.0 → eval_toolkit-0.44.0}/tests/golden/public_api/snapshot.json +31 -1
- eval_toolkit-0.44.0/tests/test_losses.py +189 -0
- eval_toolkit-0.44.0/tests/test_preprocessing.py +241 -0
- {eval_toolkit-0.43.0 → eval_toolkit-0.44.0}/.gitignore +0 -0
- {eval_toolkit-0.43.0 → eval_toolkit-0.44.0}/LICENSE +0 -0
- {eval_toolkit-0.43.0 → eval_toolkit-0.44.0}/README.md +0 -0
- {eval_toolkit-0.43.0 → eval_toolkit-0.44.0}/STYLE.md +0 -0
- {eval_toolkit-0.43.0 → eval_toolkit-0.44.0}/docs/archive/README.md +0 -0
- {eval_toolkit-0.43.0 → eval_toolkit-0.44.0}/docs/research/README.md +0 -0
- {eval_toolkit-0.43.0 → eval_toolkit-0.44.0}/docs/research/datasets/README.md +0 -0
- {eval_toolkit-0.43.0 → eval_toolkit-0.44.0}/docs/research/papers/data-integrity/README.md +0 -0
- {eval_toolkit-0.43.0 → eval_toolkit-0.44.0}/docs/research/papers/eval-ecosystem/README.md +0 -0
- {eval_toolkit-0.43.0 → eval_toolkit-0.44.0}/docs/research/papers/inference/README.md +0 -0
- {eval_toolkit-0.43.0 → eval_toolkit-0.44.0}/docs/research/papers/prompt-injection/README.md +0 -0
- {eval_toolkit-0.43.0 → eval_toolkit-0.44.0}/docs/source/methodology/README.md +0 -0
- {eval_toolkit-0.43.0 → eval_toolkit-0.44.0}/src/eval_toolkit/__main__.py +0 -0
- {eval_toolkit-0.43.0 → eval_toolkit-0.44.0}/src/eval_toolkit/_deprecated.py +0 -0
- {eval_toolkit-0.43.0 → eval_toolkit-0.44.0}/src/eval_toolkit/_parallel.py +0 -0
- {eval_toolkit-0.43.0 → eval_toolkit-0.44.0}/src/eval_toolkit/adversarial.py +0 -0
- {eval_toolkit-0.43.0 → eval_toolkit-0.44.0}/src/eval_toolkit/analysis.py +0 -0
- {eval_toolkit-0.43.0 → eval_toolkit-0.44.0}/src/eval_toolkit/artifacts.py +0 -0
- {eval_toolkit-0.43.0 → eval_toolkit-0.44.0}/src/eval_toolkit/bootstrap.py +0 -0
- {eval_toolkit-0.43.0 → eval_toolkit-0.44.0}/src/eval_toolkit/calibration.py +0 -0
- {eval_toolkit-0.43.0 → eval_toolkit-0.44.0}/src/eval_toolkit/claims.py +0 -0
- {eval_toolkit-0.43.0 → eval_toolkit-0.44.0}/src/eval_toolkit/config.py +0 -0
- {eval_toolkit-0.43.0 → eval_toolkit-0.44.0}/src/eval_toolkit/docs.py +0 -0
- {eval_toolkit-0.43.0 → eval_toolkit-0.44.0}/src/eval_toolkit/embeddings.py +0 -0
- {eval_toolkit-0.43.0 → eval_toolkit-0.44.0}/src/eval_toolkit/evidence.py +0 -0
- {eval_toolkit-0.43.0 → eval_toolkit-0.44.0}/src/eval_toolkit/harness.py +0 -0
- {eval_toolkit-0.43.0 → eval_toolkit-0.44.0}/src/eval_toolkit/leakage.py +0 -0
- {eval_toolkit-0.43.0 → eval_toolkit-0.44.0}/src/eval_toolkit/loaders.py +0 -0
- {eval_toolkit-0.43.0 → eval_toolkit-0.44.0}/src/eval_toolkit/manifest.py +0 -0
- {eval_toolkit-0.43.0 → eval_toolkit-0.44.0}/src/eval_toolkit/metrics.py +0 -0
- {eval_toolkit-0.43.0 → eval_toolkit-0.44.0}/src/eval_toolkit/operating_points.py +0 -0
- {eval_toolkit-0.43.0 → eval_toolkit-0.44.0}/src/eval_toolkit/paths.py +0 -0
- {eval_toolkit-0.43.0 → eval_toolkit-0.44.0}/src/eval_toolkit/plotting.py +0 -0
- {eval_toolkit-0.43.0 → eval_toolkit-0.44.0}/src/eval_toolkit/protocols.py +0 -0
- {eval_toolkit-0.43.0 → eval_toolkit-0.44.0}/src/eval_toolkit/provenance.py +0 -0
- {eval_toolkit-0.43.0 → eval_toolkit-0.44.0}/src/eval_toolkit/py.typed +0 -0
- {eval_toolkit-0.43.0 → eval_toolkit-0.44.0}/src/eval_toolkit/schemas/manifest.v1.json +0 -0
- {eval_toolkit-0.43.0 → eval_toolkit-0.44.0}/src/eval_toolkit/schemas/manifest.v2.json +0 -0
- {eval_toolkit-0.43.0 → eval_toolkit-0.44.0}/src/eval_toolkit/schemas/manifest.v3.json +0 -0
- {eval_toolkit-0.43.0 → eval_toolkit-0.44.0}/src/eval_toolkit/schemas/ood_manifest.v1.json +0 -0
- {eval_toolkit-0.43.0 → eval_toolkit-0.44.0}/src/eval_toolkit/schemas/results.v1.json +0 -0
- {eval_toolkit-0.43.0 → eval_toolkit-0.44.0}/src/eval_toolkit/schemas/results_full.v1.json +0 -0
- {eval_toolkit-0.43.0 → eval_toolkit-0.44.0}/src/eval_toolkit/splits.py +0 -0
- {eval_toolkit-0.43.0 → eval_toolkit-0.44.0}/src/eval_toolkit/text_dedup.py +0 -0
- {eval_toolkit-0.43.0 → eval_toolkit-0.44.0}/src/eval_toolkit/thresholds.py +0 -0
- {eval_toolkit-0.43.0 → eval_toolkit-0.44.0}/tests/baseline/test_plotting_visual/plot_bootstrap_distribution.png +0 -0
- {eval_toolkit-0.43.0 → eval_toolkit-0.44.0}/tests/baseline/test_plotting_visual/plot_confusion_matrix_grid.png +0 -0
- {eval_toolkit-0.43.0 → eval_toolkit-0.44.0}/tests/baseline/test_plotting_visual/plot_lift_ci.png +0 -0
- {eval_toolkit-0.43.0 → eval_toolkit-0.44.0}/tests/baseline/test_plotting_visual/plot_metric_bars.png +0 -0
- {eval_toolkit-0.43.0 → eval_toolkit-0.44.0}/tests/baseline/test_plotting_visual/plot_pareto_frontier.png +0 -0
- {eval_toolkit-0.43.0 → eval_toolkit-0.44.0}/tests/baseline/test_plotting_visual/plot_pr_curve.png +0 -0
- {eval_toolkit-0.43.0 → eval_toolkit-0.44.0}/tests/baseline/test_plotting_visual/plot_reliability_diagram.png +0 -0
- {eval_toolkit-0.43.0 → eval_toolkit-0.44.0}/tests/baseline/test_plotting_visual/plot_roc_curve.png +0 -0
- {eval_toolkit-0.43.0 → eval_toolkit-0.44.0}/tests/baseline/test_plotting_visual/plot_score_histograms.png +0 -0
- {eval_toolkit-0.43.0 → eval_toolkit-0.44.0}/tests/baseline/test_plotting_visual/plot_slice_metric_heatmap.png +0 -0
- {eval_toolkit-0.43.0 → eval_toolkit-0.44.0}/tests/benchmarks/__init__.py +0 -0
- {eval_toolkit-0.43.0 → eval_toolkit-0.44.0}/tests/benchmarks/test_kernel_benchmarks.py +0 -0
- {eval_toolkit-0.43.0 → eval_toolkit-0.44.0}/tests/conftest.py +0 -0
- {eval_toolkit-0.43.0 → eval_toolkit-0.44.0}/tests/golden/bootstrap_ci/cases.json +0 -0
- {eval_toolkit-0.43.0 → eval_toolkit-0.44.0}/tests/golden/data/dedup_holdout.jsonl +0 -0
- {eval_toolkit-0.43.0 → eval_toolkit-0.44.0}/tests/golden/data/dedup_holdout_expected.json +0 -0
- {eval_toolkit-0.43.0 → eval_toolkit-0.44.0}/tests/golden/data/dedup_holdout_provenance.md +0 -0
- {eval_toolkit-0.43.0 → eval_toolkit-0.44.0}/tests/golden/docs/expected.md +0 -0
- {eval_toolkit-0.43.0 → eval_toolkit-0.44.0}/tests/golden/docs/input.md +0 -0
- {eval_toolkit-0.43.0 → eval_toolkit-0.44.0}/tests/golden/docs/metrics.json +0 -0
- {eval_toolkit-0.43.0 → eval_toolkit-0.44.0}/tests/golden/test_dedup_holdout_calibration.py +0 -0
- {eval_toolkit-0.43.0 → eval_toolkit-0.44.0}/tests/strategies.py +0 -0
- {eval_toolkit-0.43.0 → eval_toolkit-0.44.0}/tests/test_adversarial.py +0 -0
- {eval_toolkit-0.43.0 → eval_toolkit-0.44.0}/tests/test_analysis.py +0 -0
- {eval_toolkit-0.43.0 → eval_toolkit-0.44.0}/tests/test_artifacts.py +0 -0
- {eval_toolkit-0.43.0 → eval_toolkit-0.44.0}/tests/test_block_bootstrap_on_folds.py +0 -0
- {eval_toolkit-0.43.0 → eval_toolkit-0.44.0}/tests/test_bootstrap_calibration_mc.py +0 -0
- {eval_toolkit-0.43.0 → eval_toolkit-0.44.0}/tests/test_bootstrap_edge_cases.py +0 -0
- {eval_toolkit-0.43.0 → eval_toolkit-0.44.0}/tests/test_bootstrap_golden.py +0 -0
- {eval_toolkit-0.43.0 → eval_toolkit-0.44.0}/tests/test_bootstrap_njobs.py +0 -0
- {eval_toolkit-0.43.0 → eval_toolkit-0.44.0}/tests/test_bootstrap_props.py +0 -0
- {eval_toolkit-0.43.0 → eval_toolkit-0.44.0}/tests/test_bootstrap_research_grounded.py +0 -0
- {eval_toolkit-0.43.0 → eval_toolkit-0.44.0}/tests/test_bootstrap_unit.py +0 -0
- {eval_toolkit-0.43.0 → eval_toolkit-0.44.0}/tests/test_calibration_binary_adapters.py +0 -0
- {eval_toolkit-0.43.0 → eval_toolkit-0.44.0}/tests/test_calibration_bootstrap_chain.py +0 -0
- {eval_toolkit-0.43.0 → eval_toolkit-0.44.0}/tests/test_calibration_determinism.py +0 -0
- {eval_toolkit-0.43.0 → eval_toolkit-0.44.0}/tests/test_calibration_optimization_failures.py +0 -0
- {eval_toolkit-0.43.0 → eval_toolkit-0.44.0}/tests/test_calibration_props.py +0 -0
- {eval_toolkit-0.43.0 → eval_toolkit-0.44.0}/tests/test_calibration_research_grounded.py +0 -0
- {eval_toolkit-0.43.0 → eval_toolkit-0.44.0}/tests/test_calibration_unit.py +0 -0
- {eval_toolkit-0.43.0 → eval_toolkit-0.44.0}/tests/test_claims.py +0 -0
- {eval_toolkit-0.43.0 → eval_toolkit-0.44.0}/tests/test_claims_coverage.py +0 -0
- {eval_toolkit-0.43.0 → eval_toolkit-0.44.0}/tests/test_claims_props.py +0 -0
- {eval_toolkit-0.43.0 → eval_toolkit-0.44.0}/tests/test_cli.py +0 -0
- {eval_toolkit-0.43.0 → eval_toolkit-0.44.0}/tests/test_config.py +0 -0
- {eval_toolkit-0.43.0 → eval_toolkit-0.44.0}/tests/test_coverage_bootstrap.py +0 -0
- {eval_toolkit-0.43.0 → eval_toolkit-0.44.0}/tests/test_coverage_calibration.py +0 -0
- {eval_toolkit-0.43.0 → eval_toolkit-0.44.0}/tests/test_coverage_harness.py +0 -0
- {eval_toolkit-0.43.0 → eval_toolkit-0.44.0}/tests/test_coverage_metrics.py +0 -0
- {eval_toolkit-0.43.0 → eval_toolkit-0.44.0}/tests/test_coverage_plotting.py +0 -0
- {eval_toolkit-0.43.0 → eval_toolkit-0.44.0}/tests/test_croissant_e2e.py +0 -0
- {eval_toolkit-0.43.0 → eval_toolkit-0.44.0}/tests/test_dedup_split_leakage_chain.py +0 -0
- {eval_toolkit-0.43.0 → eval_toolkit-0.44.0}/tests/test_deprecations.py +0 -0
- {eval_toolkit-0.43.0 → eval_toolkit-0.44.0}/tests/test_docs_golden.py +0 -0
- {eval_toolkit-0.43.0 → eval_toolkit-0.44.0}/tests/test_docs_props.py +0 -0
- {eval_toolkit-0.43.0 → eval_toolkit-0.44.0}/tests/test_embeddings.py +0 -0
- {eval_toolkit-0.43.0 → eval_toolkit-0.44.0}/tests/test_evidence_validators.py +0 -0
- {eval_toolkit-0.43.0 → eval_toolkit-0.44.0}/tests/test_harness_edge_cases.py +0 -0
- {eval_toolkit-0.43.0 → eval_toolkit-0.44.0}/tests/test_harness_fault_injection.py +0 -0
- {eval_toolkit-0.43.0 → eval_toolkit-0.44.0}/tests/test_harness_folded.py +0 -0
- {eval_toolkit-0.43.0 → eval_toolkit-0.44.0}/tests/test_harness_internals.py +0 -0
- {eval_toolkit-0.43.0 → eval_toolkit-0.44.0}/tests/test_harness_metric_options.py +0 -0
- {eval_toolkit-0.43.0 → eval_toolkit-0.44.0}/tests/test_harness_parallelism.py +0 -0
- {eval_toolkit-0.43.0 → eval_toolkit-0.44.0}/tests/test_harness_smoke.py +0 -0
- {eval_toolkit-0.43.0 → eval_toolkit-0.44.0}/tests/test_import_boundaries.py +0 -0
- {eval_toolkit-0.43.0 → eval_toolkit-0.44.0}/tests/test_is_metric_defined_for_slice.py +0 -0
- {eval_toolkit-0.43.0 → eval_toolkit-0.44.0}/tests/test_leakage.py +0 -0
- {eval_toolkit-0.43.0 → eval_toolkit-0.44.0}/tests/test_leakage_error_paths.py +0 -0
- {eval_toolkit-0.43.0 → eval_toolkit-0.44.0}/tests/test_leakage_props.py +0 -0
- {eval_toolkit-0.43.0 → eval_toolkit-0.44.0}/tests/test_loaders.py +0 -0
- {eval_toolkit-0.43.0 → eval_toolkit-0.44.0}/tests/test_loaders_coverage.py +0 -0
- {eval_toolkit-0.43.0 → eval_toolkit-0.44.0}/tests/test_loaders_props.py +0 -0
- {eval_toolkit-0.43.0 → eval_toolkit-0.44.0}/tests/test_logging.py +0 -0
- {eval_toolkit-0.43.0 → eval_toolkit-0.44.0}/tests/test_manifest.py +0 -0
- {eval_toolkit-0.43.0 → eval_toolkit-0.44.0}/tests/test_manifest_contamination_round_trip.py +0 -0
- {eval_toolkit-0.43.0 → eval_toolkit-0.44.0}/tests/test_manifest_props.py +0 -0
- {eval_toolkit-0.43.0 → eval_toolkit-0.44.0}/tests/test_manifest_validation.py +0 -0
- {eval_toolkit-0.43.0 → eval_toolkit-0.44.0}/tests/test_metrics_props.py +0 -0
- {eval_toolkit-0.43.0 → eval_toolkit-0.44.0}/tests/test_metrics_stratified_subsets.py +0 -0
- {eval_toolkit-0.43.0 → eval_toolkit-0.44.0}/tests/test_metrics_unit.py +0 -0
- {eval_toolkit-0.43.0 → eval_toolkit-0.44.0}/tests/test_misc_coverage.py +0 -0
- {eval_toolkit-0.43.0 → eval_toolkit-0.44.0}/tests/test_numeric_edge_cases.py +0 -0
- {eval_toolkit-0.43.0 → eval_toolkit-0.44.0}/tests/test_ood_loader.py +0 -0
- {eval_toolkit-0.43.0 → eval_toolkit-0.44.0}/tests/test_operating_points.py +0 -0
- {eval_toolkit-0.43.0 → eval_toolkit-0.44.0}/tests/test_operating_points_props.py +0 -0
- {eval_toolkit-0.43.0 → eval_toolkit-0.44.0}/tests/test_parallel.py +0 -0
- {eval_toolkit-0.43.0 → eval_toolkit-0.44.0}/tests/test_paths.py +0 -0
- {eval_toolkit-0.43.0 → eval_toolkit-0.44.0}/tests/test_pipeline_e2e.py +0 -0
- {eval_toolkit-0.43.0 → eval_toolkit-0.44.0}/tests/test_plotting_edge.py +0 -0
- {eval_toolkit-0.43.0 → eval_toolkit-0.44.0}/tests/test_plotting_smoke.py +0 -0
- {eval_toolkit-0.43.0 → eval_toolkit-0.44.0}/tests/test_plotting_visual.py +0 -0
- {eval_toolkit-0.43.0 → eval_toolkit-0.44.0}/tests/test_probes.py +0 -0
- {eval_toolkit-0.43.0 → eval_toolkit-0.44.0}/tests/test_protocol_conformance.py +0 -0
- {eval_toolkit-0.43.0 → eval_toolkit-0.44.0}/tests/test_provenance.py +0 -0
- {eval_toolkit-0.43.0 → eval_toolkit-0.44.0}/tests/test_public_api.py +0 -0
- {eval_toolkit-0.43.0 → eval_toolkit-0.44.0}/tests/test_recall_at_fpr.py +0 -0
- {eval_toolkit-0.43.0 → eval_toolkit-0.44.0}/tests/test_reference_equivalence.py +0 -0
- {eval_toolkit-0.43.0 → eval_toolkit-0.44.0}/tests/test_reproducibility_integration.py +0 -0
- {eval_toolkit-0.43.0 → eval_toolkit-0.44.0}/tests/test_schemas.py +0 -0
- {eval_toolkit-0.43.0 → eval_toolkit-0.44.0}/tests/test_seeds.py +0 -0
- {eval_toolkit-0.43.0 → eval_toolkit-0.44.0}/tests/test_splits.py +0 -0
- {eval_toolkit-0.43.0 → eval_toolkit-0.44.0}/tests/test_splits_leakage_integration.py +0 -0
- {eval_toolkit-0.43.0 → eval_toolkit-0.44.0}/tests/test_splits_props.py +0 -0
- {eval_toolkit-0.43.0 → eval_toolkit-0.44.0}/tests/test_text_dedup.py +0 -0
- {eval_toolkit-0.43.0 → eval_toolkit-0.44.0}/tests/test_text_dedup_coverage.py +0 -0
- {eval_toolkit-0.43.0 → eval_toolkit-0.44.0}/tests/test_text_dedup_props.py +0 -0
- {eval_toolkit-0.43.0 → eval_toolkit-0.44.0}/tests/test_text_dedup_strategies.py +0 -0
- {eval_toolkit-0.43.0 → eval_toolkit-0.44.0}/tests/test_thresholds.py +0 -0
- {eval_toolkit-0.43.0 → eval_toolkit-0.44.0}/tests/test_thresholds_constant_score.py +0 -0
- {eval_toolkit-0.43.0 → eval_toolkit-0.44.0}/tests/test_thresholds_coverage.py +0 -0
- {eval_toolkit-0.43.0 → eval_toolkit-0.44.0}/tests/test_thresholds_props.py +0 -0
- {eval_toolkit-0.43.0 → eval_toolkit-0.44.0}/tests/test_thresholds_research_grounded.py +0 -0
- {eval_toolkit-0.43.0 → eval_toolkit-0.44.0}/tests/test_tokenization_leakage_check.py +0 -0
- {eval_toolkit-0.43.0 → eval_toolkit-0.44.0}/tests/test_v09_contracts.py +0 -0
|
@@ -5,6 +5,31 @@ All notable changes to this project will be documented in this file.
|
|
|
5
5
|
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
|
|
6
6
|
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
|
7
7
|
|
|
8
|
+
## [0.44.0] — 2026-05-19 — Defenses + losses: Spotlighting variants + RecallAtLowFPR (closes #50, #51)
|
|
9
|
+
|
|
10
|
+
### Added
|
|
11
|
+
|
|
12
|
+
- `eval_toolkit.preprocessing` — new module with 3 Spotlighting
|
|
13
|
+
structural-defense variants from Hines et al. 2024
|
|
14
|
+
(arXiv 2403.14720): `delimit(text, delimiter='<<')`,
|
|
15
|
+
`datamark(text, marker='^')`, `encode(text, encoding='base64')`,
|
|
16
|
+
plus a `sweep(texts, variants=..., kwargs=...)` batch wrapper that
|
|
17
|
+
returns a `(N*3)`-row DataFrame. Includes a `spotlighting`
|
|
18
|
+
SimpleNamespace exposing the upstream issue's function-style API
|
|
19
|
+
(`spotlighting.delimit(text)`, etc.). Base-install safe (pure
|
|
20
|
+
stdlib). Closes #51.
|
|
21
|
+
- `eval_toolkit.losses` — new module with `RecallAtLowFPR` — the
|
|
22
|
+
Meta Prompt Guard 2 (PG2) training recipe: a differentiable
|
|
23
|
+
approximation of recall-at-fixed-FPR via soft-rank, returning a
|
|
24
|
+
scalar `torch.nn.Module` loss for use in standard training loops.
|
|
25
|
+
Optimizes detector ranking at a constrained operating point
|
|
26
|
+
(e.g. `fpr_target=0.01` → "maximize recall while keeping FPR ≤ 1%").
|
|
27
|
+
Closes #50.
|
|
28
|
+
- New optional extra `[losses] = torch>=2.0`. Granular per the v0.43
|
|
29
|
+
plan Decision 4 — separated from `[probes]` so callers wanting only
|
|
30
|
+
the loss don't have to install the larger transformers stack.
|
|
31
|
+
Shares the torch version pin with `[probes]`.
|
|
32
|
+
|
|
8
33
|
## [0.43.0] — 2026-05-19 — P1 batch: OOD manifest loader + character_injection sweep + ActivationDeltaProbe (closes #48, #49, #53)
|
|
9
34
|
|
|
10
35
|
### Added
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: eval-toolkit
|
|
3
|
-
Version: 0.43.0
|
|
3
|
+
Version: 0.44.0
|
|
4
4
|
Summary: Reusable evaluation contracts for binary classification: metrics, bootstrap CIs, calibration, artifacts, and evidence gates.
|
|
5
5
|
Project-URL: Homepage, https://github.com/brandon-behring/eval-toolkit
|
|
6
6
|
Project-URL: Documentation, https://brandon-behring.github.io/eval-toolkit/
|
|
@@ -62,6 +62,8 @@ Requires-Dist: sphinx-design>=0.6; extra == 'docs'
|
|
|
62
62
|
Requires-Dist: sphinx>=7.3; extra == 'docs'
|
|
63
63
|
Provides-Extra: embeddings
|
|
64
64
|
Requires-Dist: sentence-transformers>=3.0; extra == 'embeddings'
|
|
65
|
+
Provides-Extra: losses
|
|
66
|
+
Requires-Dist: torch>=2.0; extra == 'losses'
|
|
65
67
|
Provides-Extra: parquet
|
|
66
68
|
Requires-Dist: pyarrow>=15.0; extra == 'parquet'
|
|
67
69
|
Provides-Extra: plotting
|
|
@@ -69,6 +69,11 @@ transformers = ["transformers>=4.0"]
|
|
|
69
69
|
# is base-install-safe (lazy imports inside ActivationDeltaProbe methods);
|
|
70
70
|
# the extra is strictly for callers wanting to actually fit / predict.
|
|
71
71
|
probes = ["torch>=2.0", "transformers>=4.40"]
|
|
72
|
+
# v0.44.0: RecallAtLowFPR loss (Meta Prompt Guard 2 recipe; closes #50).
|
|
73
|
+
# torch-only (no transformers); separated from [probes] per Decision 4
|
|
74
|
+
# (granular extras — losses callers should not have to install the larger
|
|
75
|
+
# transformers stack). Shares the torch version pin with [probes].
|
|
76
|
+
losses = ["torch>=2.0"]
|
|
72
77
|
# DEPRECATED (announced v0.30.1, removal v0.33.0).
|
|
73
78
|
#
|
|
74
79
|
# Retained as a transitive no-op so `pip install eval-toolkit[validation]`
|
|
@@ -177,7 +182,7 @@ warn_no_return = true
|
|
|
177
182
|
strict_equality = true
|
|
178
183
|
|
|
179
184
|
[[tool.mypy.overrides]]
|
|
180
|
-
module = ["scipy.*", "sklearn.*", "matplotlib.*", "pandas.*", "yaml.*", "sentence_transformers.*", "joblib.*"]
|
|
185
|
+
module = ["scipy.*", "sklearn.*", "matplotlib.*", "pandas.*", "yaml.*", "sentence_transformers.*", "joblib.*", "torch.*", "transformers.*"]
|
|
181
186
|
ignore_missing_imports = true
|
|
182
187
|
|
|
183
188
|
[tool.pytest.ini_options]
|
|
@@ -40,6 +40,13 @@ _EXPORTS: dict[str, str] = {
|
|
|
40
40
|
"WhitespaceInjection": "eval_toolkit.adversarial",
|
|
41
41
|
"ZeroWidthSpaceInjection": "eval_toolkit.adversarial",
|
|
42
42
|
"character_injection": "eval_toolkit.adversarial",
|
|
43
|
+
# --- losses ---
|
|
44
|
+
"RecallAtLowFPR": "eval_toolkit.losses",
|
|
45
|
+
# --- preprocessing ---
|
|
46
|
+
"datamark": "eval_toolkit.preprocessing",
|
|
47
|
+
"delimit": "eval_toolkit.preprocessing",
|
|
48
|
+
"encode": "eval_toolkit.preprocessing",
|
|
49
|
+
"spotlighting": "eval_toolkit.preprocessing",
|
|
43
50
|
# --- probes ---
|
|
44
51
|
"ActivationDeltaProbe": "eval_toolkit.probes",
|
|
45
52
|
"ActivationExtractor": "eval_toolkit.probes",
|
|
@@ -0,0 +1,225 @@
|
|
|
1
|
+
"""Differentiable losses for prompt-injection detector training.
|
|
2
|
+
|
|
3
|
+
Implements :class:`RecallAtLowFPR` — the Meta Prompt Guard 2 (PG2) training
|
|
4
|
+
recipe, a differentiable approximation of recall-at-fixed-FPR. Optimizes
|
|
5
|
+
detector ranking at a constrained operating point (e.g. FPR ≤ 0.01)
|
|
6
|
+
rather than the implicit FPR-agnostic posture of cross-entropy.
|
|
7
|
+
|
|
8
|
+
This module is base-install safe: ``torch`` is soft-imported inside the
|
|
9
|
+
class factory on first use. ``pip install eval-toolkit[losses]`` installs torch.
|
|
10
|
+
The lazy-import pattern matches the ``[probes]`` precedent (separate
|
|
11
|
+
extra so callers wanting only the loss don't have to install
|
|
12
|
+
transformers).
|
|
13
|
+
|
|
14
|
+
The formulation follows the soft-rank approximation described in
|
|
15
|
+
Meta's PG2 release notes and similar metric-learning losses (Liu et al.
|
|
16
|
+
NeurIPS 2020 family):
|
|
17
|
+
|
|
18
|
+
1. Compute the empirical FPR-target threshold from the negative-class
|
|
19
|
+
scores in the batch via the ``(1 - fpr_target)``-quantile.
|
|
20
|
+
2. Smooth the indicator ``I(s_i >= threshold)`` with
|
|
21
|
+
``sigmoid(beta * (s_i - threshold))`` so gradients flow.
|
|
22
|
+
3. Recall@FPR ≈ ``Σ approx_indicator * y / Σ y``; the loss returned is
|
|
23
|
+
``1 - Recall@FPR``.
|
|
24
|
+
|
|
25
|
+
References
|
|
26
|
+
----------
|
|
27
|
+
.. [1] Meta. 2024. "Prompt Guard 2 — release notes & training recipe."
|
|
28
|
+
.. [2] Liu, X., et al. 2020. "Black-box ranking under FPR constraints."
|
|
29
|
+
NeurIPS 2020.
|
|
30
|
+
"""
|
|
31
|
+
|
|
32
|
+
from __future__ import annotations
|
|
33
|
+
|
|
34
|
+
from typing import Any, Literal
|
|
35
|
+
|
|
36
|
+
__all__ = [
|
|
37
|
+
"RecallAtLowFPR",
|
|
38
|
+
]
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
ReductionMode = Literal["mean", "sum", "none"]
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def _require_torch() -> Any:
|
|
45
|
+
"""Import torch with a copy-paste install hint if [losses] is missing."""
|
|
46
|
+
try:
|
|
47
|
+
import torch
|
|
48
|
+
except ImportError as exc:
|
|
49
|
+
raise ImportError(
|
|
50
|
+
"RecallAtLowFPR requires torch. Install with: pip install eval-toolkit[losses]"
|
|
51
|
+
) from exc
|
|
52
|
+
return torch
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def _build_module_class() -> Any:
|
|
56
|
+
"""Build the :class:`RecallAtLowFPR` ``nn.Module`` lazily.
|
|
57
|
+
|
|
58
|
+
Defined as a factory so importing :mod:`eval_toolkit.losses` does not
|
|
59
|
+
pull torch at module-import time. The class itself is built on first
|
|
60
|
+
instantiation; :func:`RecallAtLowFPR` caches the class on the module so repeated
|
|
61
|
+
construction is constant-time after the first call.
|
|
62
|
+
"""
|
|
63
|
+
torch = _require_torch()
|
|
64
|
+
nn = torch.nn
|
|
65
|
+
|
|
66
|
+
# ``nn.Module`` is a runtime-constructed base; mypy can't follow the dynamic
|
|
67
|
+
# class creation. The runtime behavior is correct (nn.Module API + autograd).
|
|
68
|
+
class _RecallAtLowFPR(nn.Module): # type: ignore[misc, name-defined]
|
|
69
|
+
def __init__(
|
|
70
|
+
self,
|
|
71
|
+
fpr_target: float = 0.01,
|
|
72
|
+
fpr_smoothing_beta: float = 10.0,
|
|
73
|
+
pos_weight: float = 1.0,
|
|
74
|
+
reduction: ReductionMode = "mean",
|
|
75
|
+
) -> None:
|
|
76
|
+
super().__init__()
|
|
77
|
+
if not 0.0 < fpr_target <= 1.0:
|
|
78
|
+
raise ValueError(f"RecallAtLowFPR: fpr_target must be in (0, 1]; got {fpr_target}")
|
|
79
|
+
if fpr_smoothing_beta <= 0:
|
|
80
|
+
raise ValueError(
|
|
81
|
+
f"RecallAtLowFPR: fpr_smoothing_beta must be > 0; got {fpr_smoothing_beta}"
|
|
82
|
+
)
|
|
83
|
+
if reduction not in ("mean", "sum", "none"):
|
|
84
|
+
raise ValueError(
|
|
85
|
+
f"RecallAtLowFPR: reduction must be 'mean'|'sum'|'none'; got {reduction!r}"
|
|
86
|
+
)
|
|
87
|
+
self.fpr_target = float(fpr_target)
|
|
88
|
+
self.fpr_smoothing_beta = float(fpr_smoothing_beta)
|
|
89
|
+
self.pos_weight = float(pos_weight)
|
|
90
|
+
self.reduction = reduction
|
|
91
|
+
|
|
92
|
+
def forward(
|
|
93
|
+
self,
|
|
94
|
+
logits: Any,
|
|
95
|
+
labels: Any,
|
|
96
|
+
) -> Any:
|
|
97
|
+
"""Compute the (differentiable) 1 - Recall@FPR loss.
|
|
98
|
+
|
|
99
|
+
Parameters
|
|
100
|
+
----------
|
|
101
|
+
logits : torch.Tensor
|
|
102
|
+
Predicted scores, shape ``(B,)`` or ``(B, 1)``. Higher
|
|
103
|
+
value → higher probability of positive class.
|
|
104
|
+
labels : torch.Tensor
|
|
105
|
+
Binary labels in ``{0, 1}``, shape ``(B,)``.
|
|
106
|
+
|
|
107
|
+
Returns
|
|
108
|
+
-------
|
|
109
|
+
torch.Tensor
|
|
110
|
+
Scalar (``reduction="mean"`` or ``"sum"``) or
|
|
111
|
+
per-positive-sample loss (``reduction="none"``).
|
|
112
|
+
"""
|
|
113
|
+
scores = logits.squeeze(-1) if logits.dim() == 2 else logits
|
|
114
|
+
if scores.shape != labels.shape:
|
|
115
|
+
raise ValueError(
|
|
116
|
+
f"RecallAtLowFPR: logits shape {tuple(scores.shape)} != "
|
|
117
|
+
f"labels shape {tuple(labels.shape)}"
|
|
118
|
+
)
|
|
119
|
+
|
|
120
|
+
labels_f = labels.float()
|
|
121
|
+
neg_mask = labels_f < 0.5
|
|
122
|
+
pos_mask = labels_f >= 0.5
|
|
123
|
+
|
|
124
|
+
if not torch.any(pos_mask):
|
|
125
|
+
# No positives → recall is undefined; return zero loss with grad.
|
|
126
|
+
return scores.sum() * 0.0
|
|
127
|
+
|
|
128
|
+
# Threshold = (1 - fpr_target)-th quantile of negative scores.
|
|
129
|
+
# torch.quantile is differentiable: gradients flow to the neg_scores it interpolates between.
|
|
130
|
+
neg_scores = scores[neg_mask]
|
|
131
|
+
if neg_scores.numel() == 0:
|
|
132
|
+
# No negatives → no FPR constraint binds; threshold sits 1.0 below the batch minimum so
|
|
133
|
+
# everything ranks above it (recall = 1 → loss = 0).
|
|
134
|
+
threshold = scores.min().detach() - 1.0
|
|
135
|
+
else:
|
|
136
|
+
# quantile q = 1 - fpr_target means we want the score above which
|
|
137
|
+
# exactly fpr_target fraction of negatives sit.
|
|
138
|
+
q = 1.0 - self.fpr_target
|
|
139
|
+
threshold = torch.quantile(neg_scores, q)
|
|
140
|
+
|
|
141
|
+
# Soft indicator: sigmoid(beta * (s - t)) → near-step function as beta → ∞.
|
|
142
|
+
approx_above = torch.sigmoid(self.fpr_smoothing_beta * (scores - threshold))
|
|
143
|
+
# Recall@FPR = (Σ I(s_i ≥ t) * y_i * pos_weight) / (Σ y_i * pos_weight)
|
|
144
|
+
tp_weighted = approx_above * labels_f * self.pos_weight
|
|
145
|
+
denom = labels_f.sum() * self.pos_weight
|
|
146
|
+
recall_at_fpr = tp_weighted.sum() / denom.clamp(min=1e-9)
|
|
147
|
+
per_pos = 1.0 - approx_above[pos_mask] # per-positive contribution
|
|
148
|
+
|
|
149
|
+
if self.reduction == "mean":
|
|
150
|
+
return torch.tensor(1.0, device=scores.device) - recall_at_fpr
|
|
151
|
+
if self.reduction == "sum":
|
|
152
|
+
return per_pos.sum()
|
|
153
|
+
return per_pos # "none"
|
|
154
|
+
|
|
155
|
+
return _RecallAtLowFPR
|
|
156
|
+
|
|
157
|
+
|
|
158
|
+
_CLASS_CACHE: dict[str, Any] = {}
|
|
159
|
+
|
|
160
|
+
|
|
161
|
+
def RecallAtLowFPR( # noqa: N802 — matches issue spec PascalCase class-like name
|
|
162
|
+
fpr_target: float = 0.01,
|
|
163
|
+
fpr_smoothing_beta: float = 10.0,
|
|
164
|
+
pos_weight: float = 1.0,
|
|
165
|
+
reduction: ReductionMode = "mean",
|
|
166
|
+
) -> Any:
|
|
167
|
+
"""Construct a Recall@LowFPR loss module.
|
|
168
|
+
|
|
169
|
+
Differentiable approximation of recall at a constrained false-positive
|
|
170
|
+
rate, per the Meta Prompt Guard 2 training recipe. Optimizes
|
|
171
|
+
detector ranking at a specific operating point (e.g. ``fpr_target=0.01``
|
|
172
|
+
→ "maximize recall while keeping FPR ≤ 1%").
|
|
173
|
+
|
|
174
|
+
Parameters
|
|
175
|
+
----------
|
|
176
|
+
fpr_target : float, optional
|
|
177
|
+
Target false-positive rate (operating point constraint).
|
|
178
|
+
Must be in ``(0, 1]``. Default ``0.01`` (1% FPR).
|
|
179
|
+
fpr_smoothing_beta : float, optional
|
|
180
|
+
Temperature of the soft-indicator approximation; higher values
|
|
181
|
+
make the loss sharper (closer to the hard step function) but
|
|
182
|
+
produce smaller gradients away from the threshold. Default ``10.0``.
|
|
183
|
+
Increase toward training convergence; start low for stable
|
|
184
|
+
gradient flow.
|
|
185
|
+
pos_weight : float, optional
|
|
186
|
+
Per-positive-sample weight applied to the recall numerator and
|
|
187
|
+
denominator; being a scalar it cancels in the ratio. Default ``1.0`` (unweighted).
|
|
188
|
+
reduction : {"mean", "sum", "none"}, optional
|
|
189
|
+
How to reduce the per-positive loss. Default ``"mean"``.
|
|
190
|
+
``"mean"`` returns the scalar ``1 - Recall@FPR`` (the canonical
|
|
191
|
+
training objective). ``"sum"`` returns the sum of per-positive
|
|
192
|
+
``1 - approx_indicator``. ``"none"`` returns the per-positive
|
|
193
|
+
``1 - approx_indicator`` tensor for custom downstream weighting.
|
|
194
|
+
|
|
195
|
+
Returns
|
|
196
|
+
-------
|
|
197
|
+
torch.nn.Module
|
|
198
|
+
The constructed loss module. Drop into any standard PyTorch
|
|
199
|
+
training loop.
|
|
200
|
+
|
|
201
|
+
Raises
|
|
202
|
+
------
|
|
203
|
+
ImportError
|
|
204
|
+
If the ``[losses]`` extra is not installed.
|
|
205
|
+
ValueError
|
|
206
|
+
On invalid ``fpr_target`` / ``fpr_smoothing_beta`` / ``reduction``.
|
|
207
|
+
|
|
208
|
+
Examples
|
|
209
|
+
--------
|
|
210
|
+
>>> # Requires the [losses] extra.
|
|
211
|
+
>>> # import torch
|
|
212
|
+
>>> # loss = RecallAtLowFPR(fpr_target=0.01)
|
|
213
|
+
>>> # logits = torch.randn(32, requires_grad=True)
|
|
214
|
+
>>> # labels = torch.randint(0, 2, (32,))
|
|
215
|
+
>>> # loss(logits, labels).backward()
|
|
216
|
+
"""
|
|
217
|
+
if "cls" not in _CLASS_CACHE:
|
|
218
|
+
_CLASS_CACHE["cls"] = _build_module_class()
|
|
219
|
+
cls = _CLASS_CACHE["cls"]
|
|
220
|
+
return cls(
|
|
221
|
+
fpr_target=fpr_target,
|
|
222
|
+
fpr_smoothing_beta=fpr_smoothing_beta,
|
|
223
|
+
pos_weight=pos_weight,
|
|
224
|
+
reduction=reduction,
|
|
225
|
+
)
|
|
@@ -0,0 +1,259 @@
|
|
|
1
|
+
"""Structural-defense preprocessing — Spotlighting variants for prompt injection.
|
|
2
|
+
|
|
3
|
+
Implements the 3 Spotlighting transforms from Hines et al. 2024 ([1]_) for
|
|
4
|
+
defending LLMs against indirect prompt injection by *structurally marking*
|
|
5
|
+
untrusted input so the model can distinguish it from system instructions.
|
|
6
|
+
|
|
7
|
+
The three variants:
|
|
8
|
+
|
|
9
|
+
- :func:`delimit` — wrap text in unusual delimiters (default ``<<...>>``)
|
|
10
|
+
- :func:`datamark` — prepend a marker character before each whitespace token
|
|
11
|
+
(default ``^``)
|
|
12
|
+
- :func:`encode` — encode the text (default ``base64``); the LLM is told to
|
|
13
|
+
decode but treat the result as data, not instructions
|
|
14
|
+
|
|
15
|
+
A :data:`spotlighting` namespace (``SimpleNamespace``) exposes the
|
|
16
|
+
function-style API verbatim from the upstream issue spec:
|
|
17
|
+
|
|
18
|
+
>>> from eval_toolkit.preprocessing import spotlighting
|
|
19
|
+
>>> spotlighting.delimit("hello") # doctest: +SKIP
|
|
20
|
+
'<<hello>>'
|
|
21
|
+
|
|
22
|
+
:func:`sweep` applies all 3 variants to a batch of texts and returns a
|
|
23
|
+
``(N*3)``-row DataFrame for downstream evaluation.
|
|
24
|
+
|
|
25
|
+
All three variants are deterministic, side-effect-free, and base-install
|
|
26
|
+
safe — only stdlib used.
|
|
27
|
+
|
|
28
|
+
References
|
|
29
|
+
----------
|
|
30
|
+
.. [1] Hines, K., et al. 2024. "Defending Against Indirect Prompt Injection
|
|
31
|
+
Attacks With Spotlighting." arXiv:2403.14720.
|
|
32
|
+
"""
|
|
33
|
+
|
|
34
|
+
from __future__ import annotations
|
|
35
|
+
|
|
36
|
+
import base64
|
|
37
|
+
import re
|
|
38
|
+
from collections.abc import Sequence
|
|
39
|
+
from types import SimpleNamespace
|
|
40
|
+
from typing import TYPE_CHECKING, Literal
|
|
41
|
+
|
|
42
|
+
if TYPE_CHECKING:
|
|
43
|
+
import pandas as pd
|
|
44
|
+
|
|
45
|
+
__all__ = [
|
|
46
|
+
"datamark",
|
|
47
|
+
"delimit",
|
|
48
|
+
"encode",
|
|
49
|
+
"spotlighting",
|
|
50
|
+
"sweep",
|
|
51
|
+
]
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
# Default constants per the Hines et al. paper § 3
|
|
55
|
+
_DEFAULT_DELIMITER = "<<"
|
|
56
|
+
_DEFAULT_DELIMITER_END = ">>"
|
|
57
|
+
_DEFAULT_MARKER = "^"
|
|
58
|
+
_DEFAULT_ENCODING: Literal["base64"] = "base64"
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def delimit(text: str, *, delimiter: str = _DEFAULT_DELIMITER, end: str | None = None) -> str:
    """Wrap ``text`` in unusual delimiters so the LLM can spot the boundary.

    Recoverable via simple slicing (caller knows the delimiter pair).
    Deterministic: same input + delimiter → same output.

    Parameters
    ----------
    text : str
        Input to wrap.
    delimiter : str, optional
        Opening delimiter. Default ``"<<"``. Choose something unlikely to
        appear in user-generated content.
    end : str or None, optional
        Closing delimiter. If ``None`` (default), it is derived from
        ``delimiter`` by :func:`_mirror_delimiter`: the characters are
        reversed and opening brackets are swapped for their closing
        counterparts (``"<<"`` → ``">>"``, ``"[["`` → ``"]]"``,
        ``"BEGIN"`` → ``"NIGEB"``). Pass an explicit ``end`` for
        asymmetric pairs; an explicit empty string is used as-is.

    Returns
    -------
    str
        ``delimiter + text + end``.

    Examples
    --------
    >>> delimit("hello")
    '<<hello>>'
    >>> delimit("hello", delimiter="[", end="]")
    '[hello]'
    """
    # Test identity with ``None`` (not truthiness) so an explicit ``end=""``
    # is honoured instead of being replaced by the mirrored delimiter.
    closing = _mirror_delimiter(delimiter) if end is None else end
    return f"{delimiter}{text}{closing}"
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
# Translation table mapping each opening bracket to its closing counterpart.
# Built once at import time instead of on every call.
_MIRROR_TABLE = str.maketrans("<[({", ">])}")


def _mirror_delimiter(d: str) -> str:
    """Return the mirrored closing form of an opening delimiter.

    The characters of ``d`` are reversed, then each opening bracket
    (``<``, ``[``, ``(``, ``{``) is replaced by its closing counterpart.
    Every other character — including closing brackets — is kept unchanged.

    Examples: ``"<<"`` → ``">>"``, ``"[["`` → ``"]]"``, ``"BEGIN"`` → ``"NIGEB"``.

    Parameters
    ----------
    d : str
        Opening delimiter.

    Returns
    -------
    str
        The reversed, bracket-mirrored delimiter.
    """
    return d[::-1].translate(_MIRROR_TABLE)
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
def datamark(text: str, *, marker: str = _DEFAULT_MARKER) -> str:
    """Prepend ``marker`` before each non-leading whitespace run.

    The LLM sees a textual signal that every word boundary belongs to
    untrusted data. Recoverable by stripping the marker before each
    whitespace run, provided the original text did not already contain
    ``marker`` directly before whitespace.

    Parameters
    ----------
    text : str
        Input to datamark.
    marker : str, optional
        Character (or short string) to inject. Default ``"^"``.

    Returns
    -------
    str
        Text with ``marker`` inserted before every whitespace run that
        follows a non-whitespace character. Leading whitespace is left
        untouched; a trailing whitespace run is marked like any other.

    Examples
    --------
    >>> datamark("hello world")
    'hello^ world'
    >>> datamark("a b c", marker="*")
    'a* b* c'
    """
    if not text:
        return text
    # ``(\S)(\s+)`` matches a non-whitespace character followed by a whole
    # whitespace run (spaces / tabs / newlines), so one marker is inserted per
    # run. A callable replacement is used so that backslashes or group
    # references inside ``marker`` are inserted literally.
    return re.sub(
        r"(\S)(\s+)",
        lambda match: match.group(1) + marker + match.group(2),
        text,
    )
|
|
137
|
+
|
|
138
|
+
|
|
139
|
+
def encode(text: str, *, encoding: Literal["base64"] = _DEFAULT_ENCODING) -> str:
    """Encode ``text`` so the LLM treats the result as data, not instructions.

    Only ``base64`` is supported in v0.44.0 — the paper's default + the
    most LLM-friendly encoding (most foundation models can decode base64
    on demand).

    Recoverable via ``base64.b64decode`` followed by a UTF-8 decode.

    Parameters
    ----------
    text : str
        Input to encode.
    encoding : {"base64"}, optional
        Encoding scheme. Default ``"base64"``.

    Returns
    -------
    str
        Encoded text as an ASCII string.

    Raises
    ------
    ValueError
        On unknown ``encoding``.

    Examples
    --------
    >>> encode("hello")
    'aGVsbG8='
    """
    # Reject unsupported schemes first so the happy path reads straight through.
    if encoding != "base64":
        raise ValueError(f"encode: unsupported encoding {encoding!r}; supported: 'base64'")
    # The text is serialised as UTF-8 before encoding; base64 output is ASCII.
    return base64.b64encode(text.encode("utf-8")).decode("ascii")
|
|
173
|
+
|
|
174
|
+
|
|
175
|
+
def sweep(
    texts: Sequence[str],
    *,
    variants: Sequence[str] = ("delimit", "datamark", "encode"),
    delimit_kwargs: dict[str, object] | None = None,
    datamark_kwargs: dict[str, object] | None = None,
    encode_kwargs: dict[str, object] | None = None,
) -> pd.DataFrame:
    """Apply one or more Spotlighting variants to each text in ``texts``.

    For each ``(text, variant)`` pair, runs the corresponding transform
    and emits a row in the result DataFrame. Useful for batch evaluation
    of detector accuracy under each defense variant.

    Parameters
    ----------
    texts : sequence of str
        Input texts. Each is identified by its 0-based ``text_id``.
    variants : sequence of str, optional
        Which variants to apply. Default ``("delimit", "datamark", "encode")``
        (all 3). Unknown variant names raise :class:`ValueError`.
    delimit_kwargs, datamark_kwargs, encode_kwargs : dict or None, optional
        Per-variant kwargs forwarded to the underlying transform
        function. Default ``None`` (use each variant's defaults).

    Returns
    -------
    pandas.DataFrame
        Columns: ``text_id`` (int), ``variant`` (str), ``transformed_text`` (str).
        Row order is variant-major: all texts for the first variant, then
        all texts for the next variant, and so on.

    Raises
    ------
    ValueError
        On any unknown variant name in ``variants``. All names are checked
        before any transform runs.

    Examples
    --------
    >>> # Synthetic 2-text sweep — see docs/source/examples/spotlighting.md
    >>> # for a runnable end-to-end demo.
    >>> # df = sweep(["hello", "world"])
    >>> # df.shape  # (6, 3)
    """
    # Imported at call time so that importing this module does not require pandas.
    import pandas as pd

    # Variant name -> (transform function, kwargs forwarded to it).
    dispatch = {
        "delimit": (delimit, delimit_kwargs or {}),
        "datamark": (datamark, datamark_kwargs or {}),
        "encode": (encode, encode_kwargs or {}),
    }

    # Materialise once so validation and the main loop see the same names,
    # then fail fast on the first unknown one.
    selected = tuple(variants)
    for name in selected:
        if name not in dispatch:
            raise ValueError(
                f"sweep: unknown variant {name!r}; "
                f"supported: 'delimit', 'datamark', 'encode'"
            )

    rows: list[dict[str, object]] = []
    for name in selected:
        transform, kwargs = dispatch[name]
        rows.extend(
            {
                "text_id": idx,
                "variant": name,
                "transformed_text": transform(text, **kwargs),  # type: ignore[operator]
            }
            for idx, text in enumerate(texts)
        )
    # Explicit ``columns`` keeps the schema stable even when ``rows`` is empty.
    return pd.DataFrame(rows, columns=["text_id", "variant", "transformed_text"])
|
|
251
|
+
|
|
252
|
+
|
|
253
|
+
# Module-level function namespace (matches issue spec API): the same four
# callables defined above, reachable as attributes, e.g.
# ``spotlighting.delimit("hello")``.
spotlighting = SimpleNamespace(
    delimit=delimit,
    datamark=datamark,
    encode=encode,
    sweep=sweep,
)
|
|
@@ -236,8 +236,8 @@ class ActivationDeltaProbe:
|
|
|
236
236
|
def _build_default_extractor(self) -> ActivationExtractor:
|
|
237
237
|
"""Build the default HF-backed extractor (lazy import)."""
|
|
238
238
|
try:
|
|
239
|
-
import torch
|
|
240
|
-
from transformers import AutoModel, AutoTokenizer
|
|
239
|
+
import torch
|
|
240
|
+
from transformers import AutoModel, AutoTokenizer
|
|
241
241
|
except ImportError as exc:
|
|
242
242
|
raise ImportError(
|
|
243
243
|
"ActivationDeltaProbe requires torch + transformers. "
|
|
@@ -109,7 +109,7 @@ def set_global_seeds(seed: int, *, strict_torch_determinism: bool = False) -> No
|
|
|
109
109
|
np.random.seed(seed)
|
|
110
110
|
|
|
111
111
|
try:
|
|
112
|
-
import torch #
|
|
112
|
+
import torch # noqa: PLC0415
|
|
113
113
|
except ImportError:
|
|
114
114
|
if strict_torch_determinism:
|
|
115
115
|
raise RuntimeError(
|
|
@@ -83,6 +83,7 @@
|
|
|
83
83
|
"RECOMMENDED_SOURCE_ROLES",
|
|
84
84
|
"RUN_RESULT_SCHEMA_VERSION",
|
|
85
85
|
"RecallAtFprResult",
|
|
86
|
+
"RecallAtLowFPR",
|
|
86
87
|
"RunManifest",
|
|
87
88
|
"RunResult",
|
|
88
89
|
"SINGLE_CLASS_INCOMPATIBLE_METRICS",
|
|
@@ -132,7 +133,10 @@
|
|
|
132
133
|
"cross_dedup",
|
|
133
134
|
"cross_validate_metric",
|
|
134
135
|
"cv_clt_ci",
|
|
136
|
+
"datamark",
|
|
137
|
+
"delimit",
|
|
135
138
|
"delong_roc_variance",
|
|
139
|
+
"encode",
|
|
136
140
|
"error_metric",
|
|
137
141
|
"evaluate",
|
|
138
142
|
"evaluate_claims",
|
|
@@ -220,6 +224,7 @@
|
|
|
220
224
|
"skipped_metric",
|
|
221
225
|
"source_role_gate",
|
|
222
226
|
"split_provenance_config",
|
|
227
|
+
"spotlighting",
|
|
223
228
|
"stratified_recall",
|
|
224
229
|
"strict_artifact_gate",
|
|
225
230
|
"validate_manifest",
|
|
@@ -907,6 +912,11 @@
|
|
|
907
912
|
"kind": "class",
|
|
908
913
|
"signature": "(threshold: 'float', recall: 'float', actual_fpr: 'float', n_val_neg: 'int', fp: 'int', tn: 'int') -> None"
|
|
909
914
|
},
|
|
915
|
+
"RecallAtLowFPR": {
|
|
916
|
+
"doc_first_line": "Construct a Recall@LowFPR loss module.",
|
|
917
|
+
"kind": "function",
|
|
918
|
+
"signature": "(fpr_target: 'float' = 0.01, fpr_smoothing_beta: 'float' = 10.0, pos_weight: 'float' = 1.0, reduction: 'ReductionMode' = 'mean') -> 'Any'"
|
|
919
|
+
},
|
|
910
920
|
"RunManifest": {
|
|
911
921
|
"bases": [
|
|
912
922
|
"object"
|
|
@@ -1144,7 +1154,7 @@
|
|
|
1144
1154
|
"doc_first_line": "str(object='') -> str",
|
|
1145
1155
|
"kind": "value",
|
|
1146
1156
|
"type": "str",
|
|
1147
|
-
"value": "'0.43.0'"
|
|
1157
|
+
"value": "'0.44.0'"
|
|
1148
1158
|
},
|
|
1149
1159
|
"apply_operating_points": {
|
|
1150
1160
|
"doc_first_line": "Apply fitted thresholds to a mixed-class or single-class target slice.",
|
|
@@ -1236,11 +1246,26 @@
|
|
|
1236
1246
|
"kind": "function",
|
|
1237
1247
|
"signature": "(fold_metrics: 'np.ndarray', *, confidence: 'float' = 0.95) -> 'BootstrapCI'"
|
|
1238
1248
|
},
|
|
1249
|
+
"datamark": {
|
|
1250
|
+
"doc_first_line": "Prepend ``marker`` before each non-leading whitespace run.",
|
|
1251
|
+
"kind": "function",
|
|
1252
|
+
"signature": "(text: 'str', *, marker: 'str' = '^') -> 'str'"
|
|
1253
|
+
},
|
|
1254
|
+
"delimit": {
|
|
1255
|
+
"doc_first_line": "Wrap ``text`` in unusual delimiters so the LLM can spot the boundary.",
|
|
1256
|
+
"kind": "function",
|
|
1257
|
+
"signature": "(text: 'str', *, delimiter: 'str' = '<<', end: 'str | None' = None) -> 'str'"
|
|
1258
|
+
},
|
|
1239
1259
|
"delong_roc_variance": {
|
|
1240
1260
|
"doc_first_line": "DeLong's variance of the paired ROC-AUC difference.",
|
|
1241
1261
|
"kind": "function",
|
|
1242
1262
|
"signature": "(y_true: 'np.ndarray', y_score_a: 'np.ndarray', y_score_b: 'np.ndarray') -> 'DeLongResult'"
|
|
1243
1263
|
},
|
|
1264
|
+
"encode": {
|
|
1265
|
+
"doc_first_line": "Encode ``text`` so the LLM treats the result as data, not instructions.",
|
|
1266
|
+
"kind": "function",
|
|
1267
|
+
"signature": "(text: 'str', *, encoding: \"Literal['base64']\" = 'base64') -> 'str'"
|
|
1268
|
+
},
|
|
1244
1269
|
"error_metric": {
|
|
1245
1270
|
"doc_first_line": "Return a structured errored-metric payload.",
|
|
1246
1271
|
"kind": "function",
|
|
@@ -1676,6 +1701,11 @@
|
|
|
1676
1701
|
"kind": "function",
|
|
1677
1702
|
"signature": "(config: 'Mapping[str, Any]', repo_root: 'Path | str | None' = None, *, path_keys: 'tuple[str, ...]' = ('path', 'dir', 'file', 'splits_dir', 'model_path')) -> 'dict[str, Any]'"
|
|
1678
1703
|
},
|
|
1704
|
+
"spotlighting": {
|
|
1705
|
+
"doc_first_line": "A simple attribute-based namespace.",
|
|
1706
|
+
"kind": "value",
|
|
1707
|
+
"type": "types.SimpleNamespace"
|
|
1708
|
+
},
|
|
1679
1709
|
"stratified_recall": {
|
|
1680
1710
|
"doc_first_line": "Recall (TPR) per categorical stratum.",
|
|
1681
1711
|
"kind": "function",
|