eval-toolkit 0.42.0__tar.gz → 0.44.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (171) hide show
  1. {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/CHANGELOG.md +86 -1
  2. {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/PKG-INFO +6 -17
  3. {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/README.md +0 -16
  4. {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/pyproject.toml +12 -1
  5. {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/src/eval_toolkit/__init__.py +23 -0
  6. {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/src/eval_toolkit/_version.py +1 -1
  7. eval_toolkit-0.44.0/src/eval_toolkit/adversarial.py +578 -0
  8. {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/src/eval_toolkit/loaders.py +479 -0
  9. eval_toolkit-0.44.0/src/eval_toolkit/losses.py +225 -0
  10. eval_toolkit-0.44.0/src/eval_toolkit/preprocessing.py +259 -0
  11. eval_toolkit-0.44.0/src/eval_toolkit/probes.py +469 -0
  12. eval_toolkit-0.44.0/src/eval_toolkit/schemas/ood_manifest.v1.json +77 -0
  13. {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/src/eval_toolkit/seeds.py +1 -1
  14. {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/golden/public_api/snapshot.json +148 -1
  15. eval_toolkit-0.44.0/tests/test_adversarial.py +351 -0
  16. eval_toolkit-0.44.0/tests/test_losses.py +189 -0
  17. eval_toolkit-0.44.0/tests/test_ood_loader.py +353 -0
  18. eval_toolkit-0.44.0/tests/test_preprocessing.py +241 -0
  19. eval_toolkit-0.44.0/tests/test_probes.py +321 -0
  20. {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/test_schemas.py +3 -1
  21. {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/.gitignore +0 -0
  22. {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/LICENSE +0 -0
  23. {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/STYLE.md +0 -0
  24. {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/docs/archive/README.md +0 -0
  25. {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/docs/research/README.md +0 -0
  26. {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/docs/research/datasets/README.md +0 -0
  27. {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/docs/research/papers/data-integrity/README.md +0 -0
  28. {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/docs/research/papers/eval-ecosystem/README.md +0 -0
  29. {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/docs/research/papers/inference/README.md +0 -0
  30. {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/docs/research/papers/prompt-injection/README.md +0 -0
  31. {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/docs/source/methodology/README.md +0 -0
  32. {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/src/eval_toolkit/__main__.py +0 -0
  33. {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/src/eval_toolkit/_deprecated.py +0 -0
  34. {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/src/eval_toolkit/_parallel.py +0 -0
  35. {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/src/eval_toolkit/analysis.py +0 -0
  36. {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/src/eval_toolkit/artifacts.py +0 -0
  37. {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/src/eval_toolkit/bootstrap.py +0 -0
  38. {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/src/eval_toolkit/calibration.py +0 -0
  39. {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/src/eval_toolkit/claims.py +0 -0
  40. {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/src/eval_toolkit/config.py +0 -0
  41. {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/src/eval_toolkit/docs.py +0 -0
  42. {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/src/eval_toolkit/embeddings.py +0 -0
  43. {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/src/eval_toolkit/evidence.py +0 -0
  44. {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/src/eval_toolkit/harness.py +0 -0
  45. {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/src/eval_toolkit/leakage.py +0 -0
  46. {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/src/eval_toolkit/manifest.py +0 -0
  47. {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/src/eval_toolkit/metrics.py +0 -0
  48. {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/src/eval_toolkit/operating_points.py +0 -0
  49. {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/src/eval_toolkit/paths.py +0 -0
  50. {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/src/eval_toolkit/plotting.py +0 -0
  51. {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/src/eval_toolkit/protocols.py +0 -0
  52. {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/src/eval_toolkit/provenance.py +0 -0
  53. {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/src/eval_toolkit/py.typed +0 -0
  54. {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/src/eval_toolkit/schemas/manifest.v1.json +0 -0
  55. {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/src/eval_toolkit/schemas/manifest.v2.json +0 -0
  56. {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/src/eval_toolkit/schemas/manifest.v3.json +0 -0
  57. {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/src/eval_toolkit/schemas/results.v1.json +0 -0
  58. {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/src/eval_toolkit/schemas/results_full.v1.json +0 -0
  59. {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/src/eval_toolkit/splits.py +0 -0
  60. {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/src/eval_toolkit/text_dedup.py +0 -0
  61. {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/src/eval_toolkit/thresholds.py +0 -0
  62. {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/baseline/test_plotting_visual/plot_bootstrap_distribution.png +0 -0
  63. {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/baseline/test_plotting_visual/plot_confusion_matrix_grid.png +0 -0
  64. {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/baseline/test_plotting_visual/plot_lift_ci.png +0 -0
  65. {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/baseline/test_plotting_visual/plot_metric_bars.png +0 -0
  66. {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/baseline/test_plotting_visual/plot_pareto_frontier.png +0 -0
  67. {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/baseline/test_plotting_visual/plot_pr_curve.png +0 -0
  68. {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/baseline/test_plotting_visual/plot_reliability_diagram.png +0 -0
  69. {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/baseline/test_plotting_visual/plot_roc_curve.png +0 -0
  70. {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/baseline/test_plotting_visual/plot_score_histograms.png +0 -0
  71. {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/baseline/test_plotting_visual/plot_slice_metric_heatmap.png +0 -0
  72. {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/benchmarks/__init__.py +0 -0
  73. {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/benchmarks/test_kernel_benchmarks.py +0 -0
  74. {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/conftest.py +0 -0
  75. {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/golden/bootstrap_ci/cases.json +0 -0
  76. {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/golden/data/dedup_holdout.jsonl +0 -0
  77. {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/golden/data/dedup_holdout_expected.json +0 -0
  78. {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/golden/data/dedup_holdout_provenance.md +0 -0
  79. {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/golden/docs/expected.md +0 -0
  80. {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/golden/docs/input.md +0 -0
  81. {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/golden/docs/metrics.json +0 -0
  82. {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/golden/test_dedup_holdout_calibration.py +0 -0
  83. {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/strategies.py +0 -0
  84. {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/test_analysis.py +0 -0
  85. {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/test_artifacts.py +0 -0
  86. {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/test_block_bootstrap_on_folds.py +0 -0
  87. {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/test_bootstrap_calibration_mc.py +0 -0
  88. {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/test_bootstrap_edge_cases.py +0 -0
  89. {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/test_bootstrap_golden.py +0 -0
  90. {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/test_bootstrap_njobs.py +0 -0
  91. {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/test_bootstrap_props.py +0 -0
  92. {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/test_bootstrap_research_grounded.py +0 -0
  93. {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/test_bootstrap_unit.py +0 -0
  94. {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/test_calibration_binary_adapters.py +0 -0
  95. {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/test_calibration_bootstrap_chain.py +0 -0
  96. {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/test_calibration_determinism.py +0 -0
  97. {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/test_calibration_optimization_failures.py +0 -0
  98. {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/test_calibration_props.py +0 -0
  99. {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/test_calibration_research_grounded.py +0 -0
  100. {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/test_calibration_unit.py +0 -0
  101. {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/test_claims.py +0 -0
  102. {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/test_claims_coverage.py +0 -0
  103. {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/test_claims_props.py +0 -0
  104. {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/test_cli.py +0 -0
  105. {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/test_config.py +0 -0
  106. {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/test_coverage_bootstrap.py +0 -0
  107. {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/test_coverage_calibration.py +0 -0
  108. {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/test_coverage_harness.py +0 -0
  109. {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/test_coverage_metrics.py +0 -0
  110. {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/test_coverage_plotting.py +0 -0
  111. {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/test_croissant_e2e.py +0 -0
  112. {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/test_dedup_split_leakage_chain.py +0 -0
  113. {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/test_deprecations.py +0 -0
  114. {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/test_docs_golden.py +0 -0
  115. {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/test_docs_props.py +0 -0
  116. {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/test_embeddings.py +0 -0
  117. {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/test_evidence_validators.py +0 -0
  118. {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/test_harness_edge_cases.py +0 -0
  119. {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/test_harness_fault_injection.py +0 -0
  120. {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/test_harness_folded.py +0 -0
  121. {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/test_harness_internals.py +0 -0
  122. {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/test_harness_metric_options.py +0 -0
  123. {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/test_harness_parallelism.py +0 -0
  124. {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/test_harness_smoke.py +0 -0
  125. {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/test_import_boundaries.py +0 -0
  126. {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/test_is_metric_defined_for_slice.py +0 -0
  127. {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/test_leakage.py +0 -0
  128. {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/test_leakage_error_paths.py +0 -0
  129. {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/test_leakage_props.py +0 -0
  130. {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/test_loaders.py +0 -0
  131. {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/test_loaders_coverage.py +0 -0
  132. {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/test_loaders_props.py +0 -0
  133. {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/test_logging.py +0 -0
  134. {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/test_manifest.py +0 -0
  135. {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/test_manifest_contamination_round_trip.py +0 -0
  136. {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/test_manifest_props.py +0 -0
  137. {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/test_manifest_validation.py +0 -0
  138. {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/test_metrics_props.py +0 -0
  139. {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/test_metrics_stratified_subsets.py +0 -0
  140. {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/test_metrics_unit.py +0 -0
  141. {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/test_misc_coverage.py +0 -0
  142. {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/test_numeric_edge_cases.py +0 -0
  143. {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/test_operating_points.py +0 -0
  144. {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/test_operating_points_props.py +0 -0
  145. {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/test_parallel.py +0 -0
  146. {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/test_paths.py +0 -0
  147. {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/test_pipeline_e2e.py +0 -0
  148. {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/test_plotting_edge.py +0 -0
  149. {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/test_plotting_smoke.py +0 -0
  150. {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/test_plotting_visual.py +0 -0
  151. {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/test_protocol_conformance.py +0 -0
  152. {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/test_provenance.py +0 -0
  153. {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/test_public_api.py +0 -0
  154. {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/test_recall_at_fpr.py +0 -0
  155. {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/test_reference_equivalence.py +0 -0
  156. {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/test_reproducibility_integration.py +0 -0
  157. {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/test_seeds.py +0 -0
  158. {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/test_splits.py +0 -0
  159. {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/test_splits_leakage_integration.py +0 -0
  160. {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/test_splits_props.py +0 -0
  161. {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/test_text_dedup.py +0 -0
  162. {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/test_text_dedup_coverage.py +0 -0
  163. {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/test_text_dedup_props.py +0 -0
  164. {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/test_text_dedup_strategies.py +0 -0
  165. {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/test_thresholds.py +0 -0
  166. {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/test_thresholds_constant_score.py +0 -0
  167. {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/test_thresholds_coverage.py +0 -0
  168. {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/test_thresholds_props.py +0 -0
  169. {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/test_thresholds_research_grounded.py +0 -0
  170. {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/test_tokenization_leakage_check.py +0 -0
  171. {eval_toolkit-0.42.0 → eval_toolkit-0.44.0}/tests/test_v09_contracts.py +0 -0
@@ -5,7 +5,92 @@ All notable changes to this project will be documented in this file.
5
5
  The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
6
6
  and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
7
7
 
8
- ## [Unreleased]
8
+ ## [0.44.0] — 2026-05-19 — Defenses + losses: Spotlighting variants + RecallAtLowFPR (closes #50, #51)
9
+
10
+ ### Added
11
+
12
+ - `eval_toolkit.preprocessing` — new module with 3 Spotlighting
13
+ structural-defense variants from Hines et al. 2024
14
+ (arXiv 2403.14720): `delimit(text, delimiter='<<')`,
15
+ `datamark(text, marker='^')`, `encode(text, encoding='base64')`,
16
+ plus a `sweep(texts, variants=..., kwargs=...)` batch wrapper that
17
+ returns a `(N*3)`-row DataFrame. Includes a `spotlighting`
18
+ SimpleNamespace exposing the upstream issue's function-style API
19
+ (`spotlighting.delimit(text)`, etc.). Base-install safe (pure
20
+ stdlib). Closes #51.
21
+ - `eval_toolkit.losses` — new module with `RecallAtLowFPR` — the
22
+ Meta Prompt Guard 2 (PG2) training recipe: a differentiable
23
+ approximation of recall-at-fixed-FPR via soft-rank, returning a
24
+ scalar loss; implemented as a `torch.nn.Module` for use in standard training loops.
25
+ Optimizes detector ranking at a constrained operating point
26
+ (e.g. `fpr_target=0.01` → "maximize recall while keeping FPR ≤ 1%").
27
+ Closes #50.
28
+ - New optional extra `[losses] = torch>=2.0`. Kept granular per the v0.43
29
+ plan Decision 4 — separated from `[probes]` so callers wanting only
30
+ the loss don't have to install the larger transformers stack.
31
+ Shares the torch version pin with `[probes]`.
32
+
33
+ ## [0.43.0] — 2026-05-19 — P1 batch: OOD manifest loader + character_injection sweep + ActivationDeltaProbe (closes #48, #49, #53)
34
+
35
+ ### Added
36
+
37
+ - `ood_dataset_from_manifest(yaml_path, slices=..., cache_dir=...)` —
38
+ declarative loader for multiple OOD eval slates (BIPIA, AgentDojo,
39
+ InjecAgent, NotInject, PINT, LLMail-Inject, …) into a single
40
+ unified DataFrame with columns `text` / `label` / `source` /
41
+ `row_id` / `sha`. Bytes are downloaded once, sha256-verified
42
+ against the manifest, and cached on-disk keyed by content hash
43
+ (default `~/.cache/eval-toolkit/ood/`). Closes #48 — drops the
44
+ per-source loader boilerplate carried by
45
+ `prompt-injection-portfolio` and `prompt-injection-detection-submission`.
46
+ - `OodManifestLoader` — `DatasetLoader`-Protocol-compliant wrapper
47
+ around the factory, returning `{"all": EvalSlice}` with
48
+ `source` as the default strata column for harness pipelines.
49
+ - `src/eval_toolkit/schemas/ood_manifest.v1.json` — Draft 2020-12
50
+ JSON schema for the OOD manifest YAML; auto-validated by
51
+ `uv run eval-toolkit schemas check`.
52
+ - `eval_toolkit.adversarial` — new module with character-injection
53
+ bypass suite (Microsoft Research 2024, arXiv 2404.13208).
54
+ Six core techniques shipped as frozen-dataclass strategies:
55
+ `ZeroWidthSpaceInjection`, `HomoglyphSubstitution`,
56
+ `DiacriticInjection`, `WhitespaceInjection`, `CaseRandomization`,
57
+ `PunctuationInjection`. All implement a `CharacterInjectionStrategy`
58
+ Protocol with `transform(text: str) -> str`. Six advanced techniques
59
+ (bidi RTL, tag stripping, synonym, token splitting, Unicode
60
+ normalization, invisible chars) scheduled for v0.43.1 — the sweep
61
+ API stabilizes in v0.43.0 so the v0.43.1 additions are pure
62
+ extensions. Closes #49 (core-6).
63
+ - `adversarial.sweep(texts, scorer, techniques="all", threshold=0.5)`
64
+ — Scorer-Protocol-compliant adversarial-robustness sweep. Returns
65
+ a DataFrame with `(text_id, technique, original_score,
66
+ transformed_score, asr)` rows for matrix analysis.
67
+ Aggregate ASR with `df.groupby("technique")["asr"].mean()`.
68
+ - `adversarial.character_injection` — `SimpleNamespace` exposing the
69
+ function-style API from the upstream issue spec
70
+ (`character_injection.zero_width_space(text)`,
71
+ `character_injection.sweep(...)`, etc.).
72
+ - `eval_toolkit.probes` — new module with `ActivationDeltaProbe`:
73
+ TaskTracker-style linear probe over HuggingFace transformer
74
+ hidden-state activation deltas (Abdelnabi et al. 2024,
75
+ arXiv 2406.00799). Backbone-agnostic (encoder OR decoder).
76
+ Sklearn-compatible API: `.fit(clean_texts, injected_texts)`,
77
+ `.predict()` → `(n,)`, `.predict_proba()` → `(n, 2)`,
78
+ `.coef_`, `.classes_`. Activations cached to
79
+ `$XDG_CACHE_HOME/eval-toolkit/probes/` keyed by
80
+ `(backbone, layer_index, aggregate, sha256(text))` so re-runs are
81
+ near-instant. Aggregate modes: `mean`, `max`, `cls`. Closes #53.
82
+ - `Probe` Protocol — minimal sklearn-shaped probe surface (`fit`,
83
+ `predict`, `predict_proba`, `coef_`, `classes_`). Distinct from
84
+ `Scorer` (which returns 1-D `P(positive)`); wrap with
85
+ `lambda p, X: p.predict_proba(X)[:, 1]` to adapt.
86
+ - `ActivationExtractor` Protocol — pluggable hidden-state-extraction
87
+ contract for `ActivationDeltaProbe`; injectable for tests to avoid
88
+ loading a real backbone.
89
+ - New optional extra `[probes] = torch>=2.0, transformers>=4.40`.
90
+ Follows the `[embeddings]` precedent — opt-in only, NOT in
91
+ `[all]` or `[dev]`, since the transitive install is ~600MB+.
92
+ Module is base-install-safe: a friendly `ImportError` fires only
93
+ if you try to use the default HF extractor without the extra.
9
94
 
10
95
  ## [0.42.0] — 2026-05-19 — fit_isotonic_binary completes 4-calibrator family (closes #44)
11
96
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: eval-toolkit
3
- Version: 0.42.0
3
+ Version: 0.44.0
4
4
  Summary: Reusable evaluation contracts for binary classification: metrics, bootstrap CIs, calibration, artifacts, and evidence gates.
5
5
  Project-URL: Homepage, https://github.com/brandon-behring/eval-toolkit
6
6
  Project-URL: Documentation, https://brandon-behring.github.io/eval-toolkit/
@@ -62,11 +62,16 @@ Requires-Dist: sphinx-design>=0.6; extra == 'docs'
62
62
  Requires-Dist: sphinx>=7.3; extra == 'docs'
63
63
  Provides-Extra: embeddings
64
64
  Requires-Dist: sentence-transformers>=3.0; extra == 'embeddings'
65
+ Provides-Extra: losses
66
+ Requires-Dist: torch>=2.0; extra == 'losses'
65
67
  Provides-Extra: parquet
66
68
  Requires-Dist: pyarrow>=15.0; extra == 'parquet'
67
69
  Provides-Extra: plotting
68
70
  Requires-Dist: matplotlib>=3.8; extra == 'plotting'
69
71
  Requires-Dist: pillow>=10.0; extra == 'plotting'
72
+ Provides-Extra: probes
73
+ Requires-Dist: torch>=2.0; extra == 'probes'
74
+ Requires-Dist: transformers>=4.40; extra == 'probes'
70
75
  Provides-Extra: property
71
76
  Requires-Dist: hypothesis>=6.100; extra == 'property'
72
77
  Provides-Extra: transformers
@@ -308,22 +313,6 @@ tests with large `max_examples` and a few bootstrap tests with
308
313
  `n_resamples >= 200`). `make fast` keeps the developer iteration loop
309
314
  under ~30 seconds.
310
315
 
311
- ## Downstream contract testing (v4 sibling-smoke)
312
-
313
- A separate CI workflow (`.github/workflows/v4-smoke.yml`) checks out
314
- the downstream consumer `prompt-injection-v4` at `main`, installs it
315
- with this branch's eval-toolkit as an editable sibling dep (via v4's
316
- `[tool.uv.sources]`), and runs v4's fast `-m smoke` suite. This catches
317
- contract regressions at PR time rather than in v4's own CI post-merge.
318
-
319
- The workflow requires a `HF_TOKEN` repo secret (gated HuggingFace
320
- datasets used by v4's smoke fixtures). Set it at:
321
- `https://github.com/brandon-behring/eval-toolkit/settings/secrets/actions`
322
-
323
- The workflow runs with `continue-on-error: true` during a 2-3 week
324
- trial period; it'll be promoted to a required gate once the false-
325
- positive rate (from independent v4 main breakage or HF rate-limits)
326
- is characterized.
327
316
 
328
317
  ## Standards
329
318
 
@@ -230,22 +230,6 @@ tests with large `max_examples` and a few bootstrap tests with
230
230
  `n_resamples >= 200`). `make fast` keeps the developer iteration loop
231
231
  under ~30 seconds.
232
232
 
233
- ## Downstream contract testing (v4 sibling-smoke)
234
-
235
- A separate CI workflow (`.github/workflows/v4-smoke.yml`) checks out
236
- the downstream consumer `prompt-injection-v4` at `main`, installs it
237
- with this branch's eval-toolkit as an editable sibling dep (via v4's
238
- `[tool.uv.sources]`), and runs v4's fast `-m smoke` suite. This catches
239
- contract regressions at PR time rather than in v4's own CI post-merge.
240
-
241
- The workflow requires a `HF_TOKEN` repo secret (gated HuggingFace
242
- datasets used by v4's smoke fixtures). Set it at:
243
- `https://github.com/brandon-behring/eval-toolkit/settings/secrets/actions`
244
-
245
- The workflow runs with `continue-on-error: true` during a 2-3 week
246
- trial period; it'll be promoted to a required gate once the false-
247
- positive rate (from independent v4 main breakage or HF rate-limits)
248
- is characterized.
249
233
 
250
234
  ## Standards
251
235
 
@@ -63,6 +63,17 @@ embeddings = ["sentence-transformers>=3.0"]
63
63
  # itself does not import transformers, so the optional install is
64
64
  # strictly for callers wanting AutoTokenizer.from_pretrained(...).
65
65
  transformers = ["transformers>=4.0"]
66
+ # v0.43.0: ActivationDeltaProbe (TaskTracker-style linear activation probe;
67
+ # closes #53). Pulls torch + transformers (~600MB+ transitive). Follows
68
+ # the [embeddings] precedent: opt-in only, NOT in [all] / [dev]. Module
69
+ # is base-install-safe (lazy imports inside ActivationDeltaProbe methods);
70
+ # the extra is strictly for callers wanting to actually fit / predict.
71
+ probes = ["torch>=2.0", "transformers>=4.40"]
72
+ # v0.44.0: RecallAtLowFPR loss (Meta Prompt Guard 2 recipe; closes #50).
73
+ # torch-only (no transformers); separated from [probes] per Decision 4
74
+ # (granular extras — losses callers should not have to install the larger
75
+ # transformers stack). Shares the torch version pin with [probes].
76
+ losses = ["torch>=2.0"]
66
77
  # DEPRECATED (announced v0.30.1, removal v0.33.0).
67
78
  #
68
79
  # Retained as a transitive no-op so `pip install eval-toolkit[validation]`
@@ -171,7 +182,7 @@ warn_no_return = true
171
182
  strict_equality = true
172
183
 
173
184
  [[tool.mypy.overrides]]
174
- module = ["scipy.*", "sklearn.*", "matplotlib.*", "pandas.*", "yaml.*", "sentence_transformers.*", "joblib.*"]
185
+ module = ["scipy.*", "sklearn.*", "matplotlib.*", "pandas.*", "yaml.*", "sentence_transformers.*", "joblib.*", "torch.*", "transformers.*"]
175
186
  ignore_missing_imports = true
176
187
 
177
188
  [tool.pytest.ini_options]
@@ -30,6 +30,27 @@ _logging.getLogger("eval_toolkit").addHandler(_logging.NullHandler())
30
30
  # dividers below are informational only; the snapshot in
31
31
  # tests/golden/public_api/ reads dict keys + values, not comments.
32
32
  _EXPORTS: dict[str, str] = {
33
+ # --- adversarial ---
34
+ "CORE_TECHNIQUES": "eval_toolkit.adversarial",
35
+ "CaseRandomization": "eval_toolkit.adversarial",
36
+ "CharacterInjectionStrategy": "eval_toolkit.adversarial",
37
+ "DiacriticInjection": "eval_toolkit.adversarial",
38
+ "HomoglyphSubstitution": "eval_toolkit.adversarial",
39
+ "PunctuationInjection": "eval_toolkit.adversarial",
40
+ "WhitespaceInjection": "eval_toolkit.adversarial",
41
+ "ZeroWidthSpaceInjection": "eval_toolkit.adversarial",
42
+ "character_injection": "eval_toolkit.adversarial",
43
+ # --- losses ---
44
+ "RecallAtLowFPR": "eval_toolkit.losses",
45
+ # --- preprocessing ---
46
+ "datamark": "eval_toolkit.preprocessing",
47
+ "delimit": "eval_toolkit.preprocessing",
48
+ "encode": "eval_toolkit.preprocessing",
49
+ "spotlighting": "eval_toolkit.preprocessing",
50
+ # --- probes ---
51
+ "ActivationDeltaProbe": "eval_toolkit.probes",
52
+ "ActivationExtractor": "eval_toolkit.probes",
53
+ "Probe": "eval_toolkit.probes",
33
54
  # --- analysis ---
34
55
  "CsvPredictionReader": "eval_toolkit.analysis",
35
56
  "JsonlPredictionReader": "eval_toolkit.analysis",
@@ -156,8 +177,10 @@ _EXPORTS: dict[str, str] = {
156
177
  "DataFrameLoader": "eval_toolkit.loaders",
157
178
  "DatasetLoader": "eval_toolkit.loaders",
158
179
  "HFDatasetsLoader": "eval_toolkit.loaders",
180
+ "OodManifestLoader": "eval_toolkit.loaders",
159
181
  "ParquetGlobLoader": "eval_toolkit.loaders",
160
182
  "SingleSliceLoader": "eval_toolkit.loaders",
183
+ "ood_dataset_from_manifest": "eval_toolkit.loaders",
161
184
  # --- manifest ---
162
185
  "MANIFEST_SCHEMA_VERSION": "eval_toolkit.manifest",
163
186
  "RunManifest": "eval_toolkit.manifest",
@@ -2,4 +2,4 @@
2
2
 
3
3
  __all__ = ["__version__"]
4
4
 
5
- __version__ = "0.42.0"
5
+ __version__ = "0.44.0"