eval-toolkit 0.42.0__tar.gz → 0.43.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (167)
  1. {eval_toolkit-0.42.0 → eval_toolkit-0.43.0}/CHANGELOG.md +61 -1
  2. {eval_toolkit-0.42.0 → eval_toolkit-0.43.0}/PKG-INFO +4 -17
  3. {eval_toolkit-0.42.0 → eval_toolkit-0.43.0}/README.md +0 -16
  4. {eval_toolkit-0.42.0 → eval_toolkit-0.43.0}/pyproject.toml +6 -0
  5. {eval_toolkit-0.42.0 → eval_toolkit-0.43.0}/src/eval_toolkit/__init__.py +16 -0
  6. {eval_toolkit-0.42.0 → eval_toolkit-0.43.0}/src/eval_toolkit/_version.py +1 -1
  7. eval_toolkit-0.43.0/src/eval_toolkit/adversarial.py +578 -0
  8. {eval_toolkit-0.42.0 → eval_toolkit-0.43.0}/src/eval_toolkit/loaders.py +479 -0
  9. eval_toolkit-0.43.0/src/eval_toolkit/probes.py +469 -0
  10. eval_toolkit-0.43.0/src/eval_toolkit/schemas/ood_manifest.v1.json +77 -0
  11. {eval_toolkit-0.42.0 → eval_toolkit-0.43.0}/tests/golden/public_api/snapshot.json +118 -1
  12. eval_toolkit-0.43.0/tests/test_adversarial.py +351 -0
  13. eval_toolkit-0.43.0/tests/test_ood_loader.py +353 -0
  14. eval_toolkit-0.43.0/tests/test_probes.py +321 -0
  15. {eval_toolkit-0.42.0 → eval_toolkit-0.43.0}/tests/test_schemas.py +3 -1
  16. {eval_toolkit-0.42.0 → eval_toolkit-0.43.0}/.gitignore +0 -0
  17. {eval_toolkit-0.42.0 → eval_toolkit-0.43.0}/LICENSE +0 -0
  18. {eval_toolkit-0.42.0 → eval_toolkit-0.43.0}/STYLE.md +0 -0
  19. {eval_toolkit-0.42.0 → eval_toolkit-0.43.0}/docs/archive/README.md +0 -0
  20. {eval_toolkit-0.42.0 → eval_toolkit-0.43.0}/docs/research/README.md +0 -0
  21. {eval_toolkit-0.42.0 → eval_toolkit-0.43.0}/docs/research/datasets/README.md +0 -0
  22. {eval_toolkit-0.42.0 → eval_toolkit-0.43.0}/docs/research/papers/data-integrity/README.md +0 -0
  23. {eval_toolkit-0.42.0 → eval_toolkit-0.43.0}/docs/research/papers/eval-ecosystem/README.md +0 -0
  24. {eval_toolkit-0.42.0 → eval_toolkit-0.43.0}/docs/research/papers/inference/README.md +0 -0
  25. {eval_toolkit-0.42.0 → eval_toolkit-0.43.0}/docs/research/papers/prompt-injection/README.md +0 -0
  26. {eval_toolkit-0.42.0 → eval_toolkit-0.43.0}/docs/source/methodology/README.md +0 -0
  27. {eval_toolkit-0.42.0 → eval_toolkit-0.43.0}/src/eval_toolkit/__main__.py +0 -0
  28. {eval_toolkit-0.42.0 → eval_toolkit-0.43.0}/src/eval_toolkit/_deprecated.py +0 -0
  29. {eval_toolkit-0.42.0 → eval_toolkit-0.43.0}/src/eval_toolkit/_parallel.py +0 -0
  30. {eval_toolkit-0.42.0 → eval_toolkit-0.43.0}/src/eval_toolkit/analysis.py +0 -0
  31. {eval_toolkit-0.42.0 → eval_toolkit-0.43.0}/src/eval_toolkit/artifacts.py +0 -0
  32. {eval_toolkit-0.42.0 → eval_toolkit-0.43.0}/src/eval_toolkit/bootstrap.py +0 -0
  33. {eval_toolkit-0.42.0 → eval_toolkit-0.43.0}/src/eval_toolkit/calibration.py +0 -0
  34. {eval_toolkit-0.42.0 → eval_toolkit-0.43.0}/src/eval_toolkit/claims.py +0 -0
  35. {eval_toolkit-0.42.0 → eval_toolkit-0.43.0}/src/eval_toolkit/config.py +0 -0
  36. {eval_toolkit-0.42.0 → eval_toolkit-0.43.0}/src/eval_toolkit/docs.py +0 -0
  37. {eval_toolkit-0.42.0 → eval_toolkit-0.43.0}/src/eval_toolkit/embeddings.py +0 -0
  38. {eval_toolkit-0.42.0 → eval_toolkit-0.43.0}/src/eval_toolkit/evidence.py +0 -0
  39. {eval_toolkit-0.42.0 → eval_toolkit-0.43.0}/src/eval_toolkit/harness.py +0 -0
  40. {eval_toolkit-0.42.0 → eval_toolkit-0.43.0}/src/eval_toolkit/leakage.py +0 -0
  41. {eval_toolkit-0.42.0 → eval_toolkit-0.43.0}/src/eval_toolkit/manifest.py +0 -0
  42. {eval_toolkit-0.42.0 → eval_toolkit-0.43.0}/src/eval_toolkit/metrics.py +0 -0
  43. {eval_toolkit-0.42.0 → eval_toolkit-0.43.0}/src/eval_toolkit/operating_points.py +0 -0
  44. {eval_toolkit-0.42.0 → eval_toolkit-0.43.0}/src/eval_toolkit/paths.py +0 -0
  45. {eval_toolkit-0.42.0 → eval_toolkit-0.43.0}/src/eval_toolkit/plotting.py +0 -0
  46. {eval_toolkit-0.42.0 → eval_toolkit-0.43.0}/src/eval_toolkit/protocols.py +0 -0
  47. {eval_toolkit-0.42.0 → eval_toolkit-0.43.0}/src/eval_toolkit/provenance.py +0 -0
  48. {eval_toolkit-0.42.0 → eval_toolkit-0.43.0}/src/eval_toolkit/py.typed +0 -0
  49. {eval_toolkit-0.42.0 → eval_toolkit-0.43.0}/src/eval_toolkit/schemas/manifest.v1.json +0 -0
  50. {eval_toolkit-0.42.0 → eval_toolkit-0.43.0}/src/eval_toolkit/schemas/manifest.v2.json +0 -0
  51. {eval_toolkit-0.42.0 → eval_toolkit-0.43.0}/src/eval_toolkit/schemas/manifest.v3.json +0 -0
  52. {eval_toolkit-0.42.0 → eval_toolkit-0.43.0}/src/eval_toolkit/schemas/results.v1.json +0 -0
  53. {eval_toolkit-0.42.0 → eval_toolkit-0.43.0}/src/eval_toolkit/schemas/results_full.v1.json +0 -0
  54. {eval_toolkit-0.42.0 → eval_toolkit-0.43.0}/src/eval_toolkit/seeds.py +0 -0
  55. {eval_toolkit-0.42.0 → eval_toolkit-0.43.0}/src/eval_toolkit/splits.py +0 -0
  56. {eval_toolkit-0.42.0 → eval_toolkit-0.43.0}/src/eval_toolkit/text_dedup.py +0 -0
  57. {eval_toolkit-0.42.0 → eval_toolkit-0.43.0}/src/eval_toolkit/thresholds.py +0 -0
  58. {eval_toolkit-0.42.0 → eval_toolkit-0.43.0}/tests/baseline/test_plotting_visual/plot_bootstrap_distribution.png +0 -0
  59. {eval_toolkit-0.42.0 → eval_toolkit-0.43.0}/tests/baseline/test_plotting_visual/plot_confusion_matrix_grid.png +0 -0
  60. {eval_toolkit-0.42.0 → eval_toolkit-0.43.0}/tests/baseline/test_plotting_visual/plot_lift_ci.png +0 -0
  61. {eval_toolkit-0.42.0 → eval_toolkit-0.43.0}/tests/baseline/test_plotting_visual/plot_metric_bars.png +0 -0
  62. {eval_toolkit-0.42.0 → eval_toolkit-0.43.0}/tests/baseline/test_plotting_visual/plot_pareto_frontier.png +0 -0
  63. {eval_toolkit-0.42.0 → eval_toolkit-0.43.0}/tests/baseline/test_plotting_visual/plot_pr_curve.png +0 -0
  64. {eval_toolkit-0.42.0 → eval_toolkit-0.43.0}/tests/baseline/test_plotting_visual/plot_reliability_diagram.png +0 -0
  65. {eval_toolkit-0.42.0 → eval_toolkit-0.43.0}/tests/baseline/test_plotting_visual/plot_roc_curve.png +0 -0
  66. {eval_toolkit-0.42.0 → eval_toolkit-0.43.0}/tests/baseline/test_plotting_visual/plot_score_histograms.png +0 -0
  67. {eval_toolkit-0.42.0 → eval_toolkit-0.43.0}/tests/baseline/test_plotting_visual/plot_slice_metric_heatmap.png +0 -0
  68. {eval_toolkit-0.42.0 → eval_toolkit-0.43.0}/tests/benchmarks/__init__.py +0 -0
  69. {eval_toolkit-0.42.0 → eval_toolkit-0.43.0}/tests/benchmarks/test_kernel_benchmarks.py +0 -0
  70. {eval_toolkit-0.42.0 → eval_toolkit-0.43.0}/tests/conftest.py +0 -0
  71. {eval_toolkit-0.42.0 → eval_toolkit-0.43.0}/tests/golden/bootstrap_ci/cases.json +0 -0
  72. {eval_toolkit-0.42.0 → eval_toolkit-0.43.0}/tests/golden/data/dedup_holdout.jsonl +0 -0
  73. {eval_toolkit-0.42.0 → eval_toolkit-0.43.0}/tests/golden/data/dedup_holdout_expected.json +0 -0
  74. {eval_toolkit-0.42.0 → eval_toolkit-0.43.0}/tests/golden/data/dedup_holdout_provenance.md +0 -0
  75. {eval_toolkit-0.42.0 → eval_toolkit-0.43.0}/tests/golden/docs/expected.md +0 -0
  76. {eval_toolkit-0.42.0 → eval_toolkit-0.43.0}/tests/golden/docs/input.md +0 -0
  77. {eval_toolkit-0.42.0 → eval_toolkit-0.43.0}/tests/golden/docs/metrics.json +0 -0
  78. {eval_toolkit-0.42.0 → eval_toolkit-0.43.0}/tests/golden/test_dedup_holdout_calibration.py +0 -0
  79. {eval_toolkit-0.42.0 → eval_toolkit-0.43.0}/tests/strategies.py +0 -0
  80. {eval_toolkit-0.42.0 → eval_toolkit-0.43.0}/tests/test_analysis.py +0 -0
  81. {eval_toolkit-0.42.0 → eval_toolkit-0.43.0}/tests/test_artifacts.py +0 -0
  82. {eval_toolkit-0.42.0 → eval_toolkit-0.43.0}/tests/test_block_bootstrap_on_folds.py +0 -0
  83. {eval_toolkit-0.42.0 → eval_toolkit-0.43.0}/tests/test_bootstrap_calibration_mc.py +0 -0
  84. {eval_toolkit-0.42.0 → eval_toolkit-0.43.0}/tests/test_bootstrap_edge_cases.py +0 -0
  85. {eval_toolkit-0.42.0 → eval_toolkit-0.43.0}/tests/test_bootstrap_golden.py +0 -0
  86. {eval_toolkit-0.42.0 → eval_toolkit-0.43.0}/tests/test_bootstrap_njobs.py +0 -0
  87. {eval_toolkit-0.42.0 → eval_toolkit-0.43.0}/tests/test_bootstrap_props.py +0 -0
  88. {eval_toolkit-0.42.0 → eval_toolkit-0.43.0}/tests/test_bootstrap_research_grounded.py +0 -0
  89. {eval_toolkit-0.42.0 → eval_toolkit-0.43.0}/tests/test_bootstrap_unit.py +0 -0
  90. {eval_toolkit-0.42.0 → eval_toolkit-0.43.0}/tests/test_calibration_binary_adapters.py +0 -0
  91. {eval_toolkit-0.42.0 → eval_toolkit-0.43.0}/tests/test_calibration_bootstrap_chain.py +0 -0
  92. {eval_toolkit-0.42.0 → eval_toolkit-0.43.0}/tests/test_calibration_determinism.py +0 -0
  93. {eval_toolkit-0.42.0 → eval_toolkit-0.43.0}/tests/test_calibration_optimization_failures.py +0 -0
  94. {eval_toolkit-0.42.0 → eval_toolkit-0.43.0}/tests/test_calibration_props.py +0 -0
  95. {eval_toolkit-0.42.0 → eval_toolkit-0.43.0}/tests/test_calibration_research_grounded.py +0 -0
  96. {eval_toolkit-0.42.0 → eval_toolkit-0.43.0}/tests/test_calibration_unit.py +0 -0
  97. {eval_toolkit-0.42.0 → eval_toolkit-0.43.0}/tests/test_claims.py +0 -0
  98. {eval_toolkit-0.42.0 → eval_toolkit-0.43.0}/tests/test_claims_coverage.py +0 -0
  99. {eval_toolkit-0.42.0 → eval_toolkit-0.43.0}/tests/test_claims_props.py +0 -0
  100. {eval_toolkit-0.42.0 → eval_toolkit-0.43.0}/tests/test_cli.py +0 -0
  101. {eval_toolkit-0.42.0 → eval_toolkit-0.43.0}/tests/test_config.py +0 -0
  102. {eval_toolkit-0.42.0 → eval_toolkit-0.43.0}/tests/test_coverage_bootstrap.py +0 -0
  103. {eval_toolkit-0.42.0 → eval_toolkit-0.43.0}/tests/test_coverage_calibration.py +0 -0
  104. {eval_toolkit-0.42.0 → eval_toolkit-0.43.0}/tests/test_coverage_harness.py +0 -0
  105. {eval_toolkit-0.42.0 → eval_toolkit-0.43.0}/tests/test_coverage_metrics.py +0 -0
  106. {eval_toolkit-0.42.0 → eval_toolkit-0.43.0}/tests/test_coverage_plotting.py +0 -0
  107. {eval_toolkit-0.42.0 → eval_toolkit-0.43.0}/tests/test_croissant_e2e.py +0 -0
  108. {eval_toolkit-0.42.0 → eval_toolkit-0.43.0}/tests/test_dedup_split_leakage_chain.py +0 -0
  109. {eval_toolkit-0.42.0 → eval_toolkit-0.43.0}/tests/test_deprecations.py +0 -0
  110. {eval_toolkit-0.42.0 → eval_toolkit-0.43.0}/tests/test_docs_golden.py +0 -0
  111. {eval_toolkit-0.42.0 → eval_toolkit-0.43.0}/tests/test_docs_props.py +0 -0
  112. {eval_toolkit-0.42.0 → eval_toolkit-0.43.0}/tests/test_embeddings.py +0 -0
  113. {eval_toolkit-0.42.0 → eval_toolkit-0.43.0}/tests/test_evidence_validators.py +0 -0
  114. {eval_toolkit-0.42.0 → eval_toolkit-0.43.0}/tests/test_harness_edge_cases.py +0 -0
  115. {eval_toolkit-0.42.0 → eval_toolkit-0.43.0}/tests/test_harness_fault_injection.py +0 -0
  116. {eval_toolkit-0.42.0 → eval_toolkit-0.43.0}/tests/test_harness_folded.py +0 -0
  117. {eval_toolkit-0.42.0 → eval_toolkit-0.43.0}/tests/test_harness_internals.py +0 -0
  118. {eval_toolkit-0.42.0 → eval_toolkit-0.43.0}/tests/test_harness_metric_options.py +0 -0
  119. {eval_toolkit-0.42.0 → eval_toolkit-0.43.0}/tests/test_harness_parallelism.py +0 -0
  120. {eval_toolkit-0.42.0 → eval_toolkit-0.43.0}/tests/test_harness_smoke.py +0 -0
  121. {eval_toolkit-0.42.0 → eval_toolkit-0.43.0}/tests/test_import_boundaries.py +0 -0
  122. {eval_toolkit-0.42.0 → eval_toolkit-0.43.0}/tests/test_is_metric_defined_for_slice.py +0 -0
  123. {eval_toolkit-0.42.0 → eval_toolkit-0.43.0}/tests/test_leakage.py +0 -0
  124. {eval_toolkit-0.42.0 → eval_toolkit-0.43.0}/tests/test_leakage_error_paths.py +0 -0
  125. {eval_toolkit-0.42.0 → eval_toolkit-0.43.0}/tests/test_leakage_props.py +0 -0
  126. {eval_toolkit-0.42.0 → eval_toolkit-0.43.0}/tests/test_loaders.py +0 -0
  127. {eval_toolkit-0.42.0 → eval_toolkit-0.43.0}/tests/test_loaders_coverage.py +0 -0
  128. {eval_toolkit-0.42.0 → eval_toolkit-0.43.0}/tests/test_loaders_props.py +0 -0
  129. {eval_toolkit-0.42.0 → eval_toolkit-0.43.0}/tests/test_logging.py +0 -0
  130. {eval_toolkit-0.42.0 → eval_toolkit-0.43.0}/tests/test_manifest.py +0 -0
  131. {eval_toolkit-0.42.0 → eval_toolkit-0.43.0}/tests/test_manifest_contamination_round_trip.py +0 -0
  132. {eval_toolkit-0.42.0 → eval_toolkit-0.43.0}/tests/test_manifest_props.py +0 -0
  133. {eval_toolkit-0.42.0 → eval_toolkit-0.43.0}/tests/test_manifest_validation.py +0 -0
  134. {eval_toolkit-0.42.0 → eval_toolkit-0.43.0}/tests/test_metrics_props.py +0 -0
  135. {eval_toolkit-0.42.0 → eval_toolkit-0.43.0}/tests/test_metrics_stratified_subsets.py +0 -0
  136. {eval_toolkit-0.42.0 → eval_toolkit-0.43.0}/tests/test_metrics_unit.py +0 -0
  137. {eval_toolkit-0.42.0 → eval_toolkit-0.43.0}/tests/test_misc_coverage.py +0 -0
  138. {eval_toolkit-0.42.0 → eval_toolkit-0.43.0}/tests/test_numeric_edge_cases.py +0 -0
  139. {eval_toolkit-0.42.0 → eval_toolkit-0.43.0}/tests/test_operating_points.py +0 -0
  140. {eval_toolkit-0.42.0 → eval_toolkit-0.43.0}/tests/test_operating_points_props.py +0 -0
  141. {eval_toolkit-0.42.0 → eval_toolkit-0.43.0}/tests/test_parallel.py +0 -0
  142. {eval_toolkit-0.42.0 → eval_toolkit-0.43.0}/tests/test_paths.py +0 -0
  143. {eval_toolkit-0.42.0 → eval_toolkit-0.43.0}/tests/test_pipeline_e2e.py +0 -0
  144. {eval_toolkit-0.42.0 → eval_toolkit-0.43.0}/tests/test_plotting_edge.py +0 -0
  145. {eval_toolkit-0.42.0 → eval_toolkit-0.43.0}/tests/test_plotting_smoke.py +0 -0
  146. {eval_toolkit-0.42.0 → eval_toolkit-0.43.0}/tests/test_plotting_visual.py +0 -0
  147. {eval_toolkit-0.42.0 → eval_toolkit-0.43.0}/tests/test_protocol_conformance.py +0 -0
  148. {eval_toolkit-0.42.0 → eval_toolkit-0.43.0}/tests/test_provenance.py +0 -0
  149. {eval_toolkit-0.42.0 → eval_toolkit-0.43.0}/tests/test_public_api.py +0 -0
  150. {eval_toolkit-0.42.0 → eval_toolkit-0.43.0}/tests/test_recall_at_fpr.py +0 -0
  151. {eval_toolkit-0.42.0 → eval_toolkit-0.43.0}/tests/test_reference_equivalence.py +0 -0
  152. {eval_toolkit-0.42.0 → eval_toolkit-0.43.0}/tests/test_reproducibility_integration.py +0 -0
  153. {eval_toolkit-0.42.0 → eval_toolkit-0.43.0}/tests/test_seeds.py +0 -0
  154. {eval_toolkit-0.42.0 → eval_toolkit-0.43.0}/tests/test_splits.py +0 -0
  155. {eval_toolkit-0.42.0 → eval_toolkit-0.43.0}/tests/test_splits_leakage_integration.py +0 -0
  156. {eval_toolkit-0.42.0 → eval_toolkit-0.43.0}/tests/test_splits_props.py +0 -0
  157. {eval_toolkit-0.42.0 → eval_toolkit-0.43.0}/tests/test_text_dedup.py +0 -0
  158. {eval_toolkit-0.42.0 → eval_toolkit-0.43.0}/tests/test_text_dedup_coverage.py +0 -0
  159. {eval_toolkit-0.42.0 → eval_toolkit-0.43.0}/tests/test_text_dedup_props.py +0 -0
  160. {eval_toolkit-0.42.0 → eval_toolkit-0.43.0}/tests/test_text_dedup_strategies.py +0 -0
  161. {eval_toolkit-0.42.0 → eval_toolkit-0.43.0}/tests/test_thresholds.py +0 -0
  162. {eval_toolkit-0.42.0 → eval_toolkit-0.43.0}/tests/test_thresholds_constant_score.py +0 -0
  163. {eval_toolkit-0.42.0 → eval_toolkit-0.43.0}/tests/test_thresholds_coverage.py +0 -0
  164. {eval_toolkit-0.42.0 → eval_toolkit-0.43.0}/tests/test_thresholds_props.py +0 -0
  165. {eval_toolkit-0.42.0 → eval_toolkit-0.43.0}/tests/test_thresholds_research_grounded.py +0 -0
  166. {eval_toolkit-0.42.0 → eval_toolkit-0.43.0}/tests/test_tokenization_leakage_check.py +0 -0
  167. {eval_toolkit-0.42.0 → eval_toolkit-0.43.0}/tests/test_v09_contracts.py +0 -0
@@ -5,7 +5,67 @@ All notable changes to this project will be documented in this file.
5
5
  The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
6
6
  and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
7
7
 
8
- ## [Unreleased]
8
+ ## [0.43.0] — 2026-05-19 — P1 batch: OOD manifest loader + character_injection sweep + ActivationDeltaProbe (closes #48, #49, #53)
9
+
10
+ ### Added
11
+
12
+ - `ood_dataset_from_manifest(yaml_path, slices=..., cache_dir=...)` —
13
+ declarative loader for multiple OOD eval slates (BIPIA, AgentDojo,
14
+ InjecAgent, NotInject, PINT, LLMail-Inject, …) into a single
15
+ unified DataFrame with columns `text` / `label` / `source` /
16
+ `row_id` / `sha`. Bytes are downloaded once, sha256-verified
17
+ against the manifest, and cached on-disk keyed by content hash
18
+ (default `~/.cache/eval-toolkit/ood/`). Closes #48 — drops the
19
+ per-source loader boilerplate carried by
20
+ `prompt-injection-portfolio` and `prompt-injection-detection-submission`.
21
+ - `OodManifestLoader` — `DatasetLoader`-Protocol-compliant wrapper
22
+ around the factory, returning `{"all": EvalSlice}` with
23
+ `source` as the default strata column for harness pipelines.
24
+ - `src/eval_toolkit/schemas/ood_manifest.v1.json` — Draft 2020-12
25
+ JSON schema for the OOD manifest YAML; auto-validated by
26
+ `uv run eval-toolkit schemas check`.
27
+ - `eval_toolkit.adversarial` — new module with character-injection
28
+ bypass suite (Microsoft Research 2024, arXiv 2404.13208).
29
+ Six core techniques shipped as frozen-dataclass strategies:
30
+ `ZeroWidthSpaceInjection`, `HomoglyphSubstitution`,
31
+ `DiacriticInjection`, `WhitespaceInjection`, `CaseRandomization`,
32
+ `PunctuationInjection`. All implement a `CharacterInjectionStrategy`
33
+ Protocol with `transform(text: str) -> str`. Six advanced techniques
34
+ (bidi RTL, tag stripping, synonym, token splitting, Unicode
35
+ normalization, invisible chars) scheduled for v0.43.1 — the sweep
36
+ API stabilizes in v0.43.0 so the v0.43.1 additions are pure
37
+ extensions. Closes #49 (core-6).
38
+ - `adversarial.sweep(texts, scorer, techniques="all", threshold=0.5)`
39
+ — Scorer-Protocol-compliant adversarial-robustness sweep. Returns
40
+ a DataFrame with `(text_id, technique, original_score,
41
+ transformed_score, asr)` rows for matrix analysis.
42
+ Aggregate the attack success rate (ASR) per technique with `df.groupby("technique")["asr"].mean()`.
43
+ - `adversarial.character_injection` — `SimpleNamespace` exposing the
44
+ function-style API from the upstream issue spec
45
+ (`character_injection.zero_width_space(text)`,
46
+ `character_injection.sweep(...)`, etc.).
47
+ - `eval_toolkit.probes` — new module with `ActivationDeltaProbe`:
48
+ TaskTracker-style linear probe over HuggingFace transformer
49
+ hidden-state activation deltas (Abdelnabi et al. 2024,
50
+ arXiv 2406.00799). Backbone-agnostic (encoder OR decoder).
51
+ Sklearn-compatible API: `.fit(clean_texts, injected_texts)`,
52
+ `.predict()` → `(n,)`, `.predict_proba()` → `(n, 2)`,
53
+ `.coef_`, `.classes_`. Activations cached to
54
+ `$XDG_CACHE_HOME/eval-toolkit/probes/` keyed by
55
+ `(backbone, layer_index, aggregate, sha256(text))` so re-runs are
56
+ near-instant. Aggregate modes: `mean`, `max`, `cls`. Closes #53.
57
+ - `Probe` Protocol — minimal sklearn-shaped probe surface (`fit`,
58
+ `predict`, `predict_proba`, `coef_`, `classes_`). Distinct from
59
+ `Scorer` (which returns 1-D `P(positive)`); wrap with
60
+ `lambda p, X: p.predict_proba(X)[:, 1]` to adapt.
61
+ - `ActivationExtractor` Protocol — pluggable hidden-state-extraction
62
+ contract for `ActivationDeltaProbe`; injectable for tests to avoid
63
+ loading a real backbone.
64
+ - New optional extra `[probes] = torch>=2.0, transformers>=4.40`.
65
+ Follows the `[embeddings]` precedent — opt-in only, NOT in
66
+ `[all]` or `[dev]`, since the transitive install is ~600MB+.
67
+ Module is base-install-safe: a friendly `ImportError` fires only
68
+ if you try to use the default HF extractor without the extra.
9
69
 
10
70
  ## [0.42.0] — 2026-05-19 — fit_isotonic_binary completes 4-calibrator family (closes #44)
11
71
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: eval-toolkit
3
- Version: 0.42.0
3
+ Version: 0.43.0
4
4
  Summary: Reusable evaluation contracts for binary classification: metrics, bootstrap CIs, calibration, artifacts, and evidence gates.
5
5
  Project-URL: Homepage, https://github.com/brandon-behring/eval-toolkit
6
6
  Project-URL: Documentation, https://brandon-behring.github.io/eval-toolkit/
@@ -67,6 +67,9 @@ Requires-Dist: pyarrow>=15.0; extra == 'parquet'
67
67
  Provides-Extra: plotting
68
68
  Requires-Dist: matplotlib>=3.8; extra == 'plotting'
69
69
  Requires-Dist: pillow>=10.0; extra == 'plotting'
70
+ Provides-Extra: probes
71
+ Requires-Dist: torch>=2.0; extra == 'probes'
72
+ Requires-Dist: transformers>=4.40; extra == 'probes'
70
73
  Provides-Extra: property
71
74
  Requires-Dist: hypothesis>=6.100; extra == 'property'
72
75
  Provides-Extra: transformers
@@ -308,22 +311,6 @@ tests with large `max_examples` and a few bootstrap tests with
308
311
  `n_resamples >= 200`). `make fast` keeps the developer iteration loop
309
312
  under ~30 seconds.
310
313
 
311
- ## Downstream contract testing (v4 sibling-smoke)
312
-
313
- A separate CI workflow (`.github/workflows/v4-smoke.yml`) checks out
314
- the downstream consumer `prompt-injection-v4` at `main`, installs it
315
- with this branch's eval-toolkit as an editable sibling dep (via v4's
316
- `[tool.uv.sources]`), and runs v4's fast `-m smoke` suite. This catches
317
- contract regressions at PR time rather than in v4's own CI post-merge.
318
-
319
- The workflow requires a `HF_TOKEN` repo secret (gated HuggingFace
320
- datasets used by v4's smoke fixtures). Set it at:
321
- `https://github.com/brandon-behring/eval-toolkit/settings/secrets/actions`
322
-
323
- The workflow runs with `continue-on-error: true` during a 2-3 week
324
- trial period; it'll be promoted to a required gate once the false-
325
- positive rate (from independent v4 main breakage or HF rate-limits)
326
- is characterized.
327
314
 
328
315
  ## Standards
329
316
 
@@ -230,22 +230,6 @@ tests with large `max_examples` and a few bootstrap tests with
230
230
  `n_resamples >= 200`). `make fast` keeps the developer iteration loop
231
231
  under ~30 seconds.
232
232
 
233
- ## Downstream contract testing (v4 sibling-smoke)
234
-
235
- A separate CI workflow (`.github/workflows/v4-smoke.yml`) checks out
236
- the downstream consumer `prompt-injection-v4` at `main`, installs it
237
- with this branch's eval-toolkit as an editable sibling dep (via v4's
238
- `[tool.uv.sources]`), and runs v4's fast `-m smoke` suite. This catches
239
- contract regressions at PR time rather than in v4's own CI post-merge.
240
-
241
- The workflow requires a `HF_TOKEN` repo secret (gated HuggingFace
242
- datasets used by v4's smoke fixtures). Set it at:
243
- `https://github.com/brandon-behring/eval-toolkit/settings/secrets/actions`
244
-
245
- The workflow runs with `continue-on-error: true` during a 2-3 week
246
- trial period; it'll be promoted to a required gate once the false-
247
- positive rate (from independent v4 main breakage or HF rate-limits)
248
- is characterized.
249
233
 
250
234
  ## Standards
251
235
 
@@ -63,6 +63,12 @@ embeddings = ["sentence-transformers>=3.0"]
63
63
  # itself does not import transformers, so the optional install is
64
64
  # strictly for callers wanting AutoTokenizer.from_pretrained(...).
65
65
  transformers = ["transformers>=4.0"]
66
+ # v0.43.0: ActivationDeltaProbe (TaskTracker-style linear activation probe;
67
+ # closes #53). Pulls torch + transformers (~600MB+ transitive). Follows
68
+ # the [embeddings] precedent: opt-in only, NOT in [all] / [dev]. Module
69
+ # is base-install-safe (lazy imports inside ActivationDeltaProbe methods);
70
+ # the extra is only needed by callers using the default HF extractor to fit / predict.
71
+ probes = ["torch>=2.0", "transformers>=4.40"]
66
72
  # DEPRECATED (announced v0.30.1, removal v0.33.0).
67
73
  #
68
74
  # Retained as a transitive no-op so `pip install eval-toolkit[validation]`
@@ -30,6 +30,20 @@ _logging.getLogger("eval_toolkit").addHandler(_logging.NullHandler())
30
30
  # dividers below are informational only; the snapshot in
31
31
  # tests/golden/public_api/ reads dict keys + values, not comments.
32
32
  _EXPORTS: dict[str, str] = {
33
+ # --- adversarial ---
34
+ "CORE_TECHNIQUES": "eval_toolkit.adversarial",
35
+ "CaseRandomization": "eval_toolkit.adversarial",
36
+ "CharacterInjectionStrategy": "eval_toolkit.adversarial",
37
+ "DiacriticInjection": "eval_toolkit.adversarial",
38
+ "HomoglyphSubstitution": "eval_toolkit.adversarial",
39
+ "PunctuationInjection": "eval_toolkit.adversarial",
40
+ "WhitespaceInjection": "eval_toolkit.adversarial",
41
+ "ZeroWidthSpaceInjection": "eval_toolkit.adversarial",
42
+ "character_injection": "eval_toolkit.adversarial",
43
+ # --- probes ---
44
+ "ActivationDeltaProbe": "eval_toolkit.probes",
45
+ "ActivationExtractor": "eval_toolkit.probes",
46
+ "Probe": "eval_toolkit.probes",
33
47
  # --- analysis ---
34
48
  "CsvPredictionReader": "eval_toolkit.analysis",
35
49
  "JsonlPredictionReader": "eval_toolkit.analysis",
@@ -156,8 +170,10 @@ _EXPORTS: dict[str, str] = {
156
170
  "DataFrameLoader": "eval_toolkit.loaders",
157
171
  "DatasetLoader": "eval_toolkit.loaders",
158
172
  "HFDatasetsLoader": "eval_toolkit.loaders",
173
+ "OodManifestLoader": "eval_toolkit.loaders",
159
174
  "ParquetGlobLoader": "eval_toolkit.loaders",
160
175
  "SingleSliceLoader": "eval_toolkit.loaders",
176
+ "ood_dataset_from_manifest": "eval_toolkit.loaders",
161
177
  # --- manifest ---
162
178
  "MANIFEST_SCHEMA_VERSION": "eval_toolkit.manifest",
163
179
  "RunManifest": "eval_toolkit.manifest",
@@ -2,4 +2,4 @@
2
2
 
3
3
  __all__ = ["__version__"]
4
4
 
5
- __version__ = "0.42.0"
5
+ __version__ = "0.43.0"