eval-toolkit 0.43.0__tar.gz → 0.45.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (174) hide show
  1. {eval_toolkit-0.43.0 → eval_toolkit-0.45.0}/.gitignore +6 -0
  2. {eval_toolkit-0.43.0 → eval_toolkit-0.45.0}/CHANGELOG.md +72 -0
  3. {eval_toolkit-0.43.0 → eval_toolkit-0.45.0}/PKG-INFO +3 -1
  4. eval_toolkit-0.45.0/docs/source/adr/README.md +76 -0
  5. {eval_toolkit-0.43.0 → eval_toolkit-0.45.0}/pyproject.toml +6 -1
  6. {eval_toolkit-0.43.0 → eval_toolkit-0.45.0}/src/eval_toolkit/__init__.py +9 -0
  7. {eval_toolkit-0.43.0 → eval_toolkit-0.45.0}/src/eval_toolkit/_version.py +1 -1
  8. eval_toolkit-0.45.0/src/eval_toolkit/losses.py +225 -0
  9. eval_toolkit-0.45.0/src/eval_toolkit/preprocessing.py +259 -0
  10. {eval_toolkit-0.43.0 → eval_toolkit-0.45.0}/src/eval_toolkit/probes.py +2 -2
  11. {eval_toolkit-0.43.0 → eval_toolkit-0.45.0}/src/eval_toolkit/seeds.py +1 -1
  12. eval_toolkit-0.45.0/src/eval_toolkit/stacking.py +412 -0
  13. {eval_toolkit-0.43.0 → eval_toolkit-0.45.0}/tests/golden/public_api/snapshot.json +49 -1
  14. eval_toolkit-0.45.0/tests/test_losses.py +189 -0
  15. eval_toolkit-0.45.0/tests/test_preprocessing.py +241 -0
  16. eval_toolkit-0.45.0/tests/test_stacking.py +369 -0
  17. {eval_toolkit-0.43.0 → eval_toolkit-0.45.0}/LICENSE +0 -0
  18. {eval_toolkit-0.43.0 → eval_toolkit-0.45.0}/README.md +0 -0
  19. {eval_toolkit-0.43.0 → eval_toolkit-0.45.0}/STYLE.md +0 -0
  20. {eval_toolkit-0.43.0 → eval_toolkit-0.45.0}/docs/archive/README.md +0 -0
  21. {eval_toolkit-0.43.0 → eval_toolkit-0.45.0}/docs/research/README.md +0 -0
  22. {eval_toolkit-0.43.0 → eval_toolkit-0.45.0}/docs/research/datasets/README.md +0 -0
  23. {eval_toolkit-0.43.0 → eval_toolkit-0.45.0}/docs/research/papers/data-integrity/README.md +0 -0
  24. {eval_toolkit-0.43.0 → eval_toolkit-0.45.0}/docs/research/papers/eval-ecosystem/README.md +0 -0
  25. {eval_toolkit-0.43.0 → eval_toolkit-0.45.0}/docs/research/papers/inference/README.md +0 -0
  26. {eval_toolkit-0.43.0 → eval_toolkit-0.45.0}/docs/research/papers/prompt-injection/README.md +0 -0
  27. {eval_toolkit-0.43.0 → eval_toolkit-0.45.0}/docs/source/methodology/README.md +0 -0
  28. {eval_toolkit-0.43.0 → eval_toolkit-0.45.0}/src/eval_toolkit/__main__.py +0 -0
  29. {eval_toolkit-0.43.0 → eval_toolkit-0.45.0}/src/eval_toolkit/_deprecated.py +0 -0
  30. {eval_toolkit-0.43.0 → eval_toolkit-0.45.0}/src/eval_toolkit/_parallel.py +0 -0
  31. {eval_toolkit-0.43.0 → eval_toolkit-0.45.0}/src/eval_toolkit/adversarial.py +0 -0
  32. {eval_toolkit-0.43.0 → eval_toolkit-0.45.0}/src/eval_toolkit/analysis.py +0 -0
  33. {eval_toolkit-0.43.0 → eval_toolkit-0.45.0}/src/eval_toolkit/artifacts.py +0 -0
  34. {eval_toolkit-0.43.0 → eval_toolkit-0.45.0}/src/eval_toolkit/bootstrap.py +0 -0
  35. {eval_toolkit-0.43.0 → eval_toolkit-0.45.0}/src/eval_toolkit/calibration.py +0 -0
  36. {eval_toolkit-0.43.0 → eval_toolkit-0.45.0}/src/eval_toolkit/claims.py +0 -0
  37. {eval_toolkit-0.43.0 → eval_toolkit-0.45.0}/src/eval_toolkit/config.py +0 -0
  38. {eval_toolkit-0.43.0 → eval_toolkit-0.45.0}/src/eval_toolkit/docs.py +0 -0
  39. {eval_toolkit-0.43.0 → eval_toolkit-0.45.0}/src/eval_toolkit/embeddings.py +0 -0
  40. {eval_toolkit-0.43.0 → eval_toolkit-0.45.0}/src/eval_toolkit/evidence.py +0 -0
  41. {eval_toolkit-0.43.0 → eval_toolkit-0.45.0}/src/eval_toolkit/harness.py +0 -0
  42. {eval_toolkit-0.43.0 → eval_toolkit-0.45.0}/src/eval_toolkit/leakage.py +0 -0
  43. {eval_toolkit-0.43.0 → eval_toolkit-0.45.0}/src/eval_toolkit/loaders.py +0 -0
  44. {eval_toolkit-0.43.0 → eval_toolkit-0.45.0}/src/eval_toolkit/manifest.py +0 -0
  45. {eval_toolkit-0.43.0 → eval_toolkit-0.45.0}/src/eval_toolkit/metrics.py +0 -0
  46. {eval_toolkit-0.43.0 → eval_toolkit-0.45.0}/src/eval_toolkit/operating_points.py +0 -0
  47. {eval_toolkit-0.43.0 → eval_toolkit-0.45.0}/src/eval_toolkit/paths.py +0 -0
  48. {eval_toolkit-0.43.0 → eval_toolkit-0.45.0}/src/eval_toolkit/plotting.py +0 -0
  49. {eval_toolkit-0.43.0 → eval_toolkit-0.45.0}/src/eval_toolkit/protocols.py +0 -0
  50. {eval_toolkit-0.43.0 → eval_toolkit-0.45.0}/src/eval_toolkit/provenance.py +0 -0
  51. {eval_toolkit-0.43.0 → eval_toolkit-0.45.0}/src/eval_toolkit/py.typed +0 -0
  52. {eval_toolkit-0.43.0 → eval_toolkit-0.45.0}/src/eval_toolkit/schemas/manifest.v1.json +0 -0
  53. {eval_toolkit-0.43.0 → eval_toolkit-0.45.0}/src/eval_toolkit/schemas/manifest.v2.json +0 -0
  54. {eval_toolkit-0.43.0 → eval_toolkit-0.45.0}/src/eval_toolkit/schemas/manifest.v3.json +0 -0
  55. {eval_toolkit-0.43.0 → eval_toolkit-0.45.0}/src/eval_toolkit/schemas/ood_manifest.v1.json +0 -0
  56. {eval_toolkit-0.43.0 → eval_toolkit-0.45.0}/src/eval_toolkit/schemas/results.v1.json +0 -0
  57. {eval_toolkit-0.43.0 → eval_toolkit-0.45.0}/src/eval_toolkit/schemas/results_full.v1.json +0 -0
  58. {eval_toolkit-0.43.0 → eval_toolkit-0.45.0}/src/eval_toolkit/splits.py +0 -0
  59. {eval_toolkit-0.43.0 → eval_toolkit-0.45.0}/src/eval_toolkit/text_dedup.py +0 -0
  60. {eval_toolkit-0.43.0 → eval_toolkit-0.45.0}/src/eval_toolkit/thresholds.py +0 -0
  61. {eval_toolkit-0.43.0 → eval_toolkit-0.45.0}/tests/baseline/test_plotting_visual/plot_bootstrap_distribution.png +0 -0
  62. {eval_toolkit-0.43.0 → eval_toolkit-0.45.0}/tests/baseline/test_plotting_visual/plot_confusion_matrix_grid.png +0 -0
  63. {eval_toolkit-0.43.0 → eval_toolkit-0.45.0}/tests/baseline/test_plotting_visual/plot_lift_ci.png +0 -0
  64. {eval_toolkit-0.43.0 → eval_toolkit-0.45.0}/tests/baseline/test_plotting_visual/plot_metric_bars.png +0 -0
  65. {eval_toolkit-0.43.0 → eval_toolkit-0.45.0}/tests/baseline/test_plotting_visual/plot_pareto_frontier.png +0 -0
  66. {eval_toolkit-0.43.0 → eval_toolkit-0.45.0}/tests/baseline/test_plotting_visual/plot_pr_curve.png +0 -0
  67. {eval_toolkit-0.43.0 → eval_toolkit-0.45.0}/tests/baseline/test_plotting_visual/plot_reliability_diagram.png +0 -0
  68. {eval_toolkit-0.43.0 → eval_toolkit-0.45.0}/tests/baseline/test_plotting_visual/plot_roc_curve.png +0 -0
  69. {eval_toolkit-0.43.0 → eval_toolkit-0.45.0}/tests/baseline/test_plotting_visual/plot_score_histograms.png +0 -0
  70. {eval_toolkit-0.43.0 → eval_toolkit-0.45.0}/tests/baseline/test_plotting_visual/plot_slice_metric_heatmap.png +0 -0
  71. {eval_toolkit-0.43.0 → eval_toolkit-0.45.0}/tests/benchmarks/__init__.py +0 -0
  72. {eval_toolkit-0.43.0 → eval_toolkit-0.45.0}/tests/benchmarks/test_kernel_benchmarks.py +0 -0
  73. {eval_toolkit-0.43.0 → eval_toolkit-0.45.0}/tests/conftest.py +0 -0
  74. {eval_toolkit-0.43.0 → eval_toolkit-0.45.0}/tests/golden/bootstrap_ci/cases.json +0 -0
  75. {eval_toolkit-0.43.0 → eval_toolkit-0.45.0}/tests/golden/data/dedup_holdout.jsonl +0 -0
  76. {eval_toolkit-0.43.0 → eval_toolkit-0.45.0}/tests/golden/data/dedup_holdout_expected.json +0 -0
  77. {eval_toolkit-0.43.0 → eval_toolkit-0.45.0}/tests/golden/data/dedup_holdout_provenance.md +0 -0
  78. {eval_toolkit-0.43.0 → eval_toolkit-0.45.0}/tests/golden/docs/expected.md +0 -0
  79. {eval_toolkit-0.43.0 → eval_toolkit-0.45.0}/tests/golden/docs/input.md +0 -0
  80. {eval_toolkit-0.43.0 → eval_toolkit-0.45.0}/tests/golden/docs/metrics.json +0 -0
  81. {eval_toolkit-0.43.0 → eval_toolkit-0.45.0}/tests/golden/test_dedup_holdout_calibration.py +0 -0
  82. {eval_toolkit-0.43.0 → eval_toolkit-0.45.0}/tests/strategies.py +0 -0
  83. {eval_toolkit-0.43.0 → eval_toolkit-0.45.0}/tests/test_adversarial.py +0 -0
  84. {eval_toolkit-0.43.0 → eval_toolkit-0.45.0}/tests/test_analysis.py +0 -0
  85. {eval_toolkit-0.43.0 → eval_toolkit-0.45.0}/tests/test_artifacts.py +0 -0
  86. {eval_toolkit-0.43.0 → eval_toolkit-0.45.0}/tests/test_block_bootstrap_on_folds.py +0 -0
  87. {eval_toolkit-0.43.0 → eval_toolkit-0.45.0}/tests/test_bootstrap_calibration_mc.py +0 -0
  88. {eval_toolkit-0.43.0 → eval_toolkit-0.45.0}/tests/test_bootstrap_edge_cases.py +0 -0
  89. {eval_toolkit-0.43.0 → eval_toolkit-0.45.0}/tests/test_bootstrap_golden.py +0 -0
  90. {eval_toolkit-0.43.0 → eval_toolkit-0.45.0}/tests/test_bootstrap_njobs.py +0 -0
  91. {eval_toolkit-0.43.0 → eval_toolkit-0.45.0}/tests/test_bootstrap_props.py +0 -0
  92. {eval_toolkit-0.43.0 → eval_toolkit-0.45.0}/tests/test_bootstrap_research_grounded.py +0 -0
  93. {eval_toolkit-0.43.0 → eval_toolkit-0.45.0}/tests/test_bootstrap_unit.py +0 -0
  94. {eval_toolkit-0.43.0 → eval_toolkit-0.45.0}/tests/test_calibration_binary_adapters.py +0 -0
  95. {eval_toolkit-0.43.0 → eval_toolkit-0.45.0}/tests/test_calibration_bootstrap_chain.py +0 -0
  96. {eval_toolkit-0.43.0 → eval_toolkit-0.45.0}/tests/test_calibration_determinism.py +0 -0
  97. {eval_toolkit-0.43.0 → eval_toolkit-0.45.0}/tests/test_calibration_optimization_failures.py +0 -0
  98. {eval_toolkit-0.43.0 → eval_toolkit-0.45.0}/tests/test_calibration_props.py +0 -0
  99. {eval_toolkit-0.43.0 → eval_toolkit-0.45.0}/tests/test_calibration_research_grounded.py +0 -0
  100. {eval_toolkit-0.43.0 → eval_toolkit-0.45.0}/tests/test_calibration_unit.py +0 -0
  101. {eval_toolkit-0.43.0 → eval_toolkit-0.45.0}/tests/test_claims.py +0 -0
  102. {eval_toolkit-0.43.0 → eval_toolkit-0.45.0}/tests/test_claims_coverage.py +0 -0
  103. {eval_toolkit-0.43.0 → eval_toolkit-0.45.0}/tests/test_claims_props.py +0 -0
  104. {eval_toolkit-0.43.0 → eval_toolkit-0.45.0}/tests/test_cli.py +0 -0
  105. {eval_toolkit-0.43.0 → eval_toolkit-0.45.0}/tests/test_config.py +0 -0
  106. {eval_toolkit-0.43.0 → eval_toolkit-0.45.0}/tests/test_coverage_bootstrap.py +0 -0
  107. {eval_toolkit-0.43.0 → eval_toolkit-0.45.0}/tests/test_coverage_calibration.py +0 -0
  108. {eval_toolkit-0.43.0 → eval_toolkit-0.45.0}/tests/test_coverage_harness.py +0 -0
  109. {eval_toolkit-0.43.0 → eval_toolkit-0.45.0}/tests/test_coverage_metrics.py +0 -0
  110. {eval_toolkit-0.43.0 → eval_toolkit-0.45.0}/tests/test_coverage_plotting.py +0 -0
  111. {eval_toolkit-0.43.0 → eval_toolkit-0.45.0}/tests/test_croissant_e2e.py +0 -0
  112. {eval_toolkit-0.43.0 → eval_toolkit-0.45.0}/tests/test_dedup_split_leakage_chain.py +0 -0
  113. {eval_toolkit-0.43.0 → eval_toolkit-0.45.0}/tests/test_deprecations.py +0 -0
  114. {eval_toolkit-0.43.0 → eval_toolkit-0.45.0}/tests/test_docs_golden.py +0 -0
  115. {eval_toolkit-0.43.0 → eval_toolkit-0.45.0}/tests/test_docs_props.py +0 -0
  116. {eval_toolkit-0.43.0 → eval_toolkit-0.45.0}/tests/test_embeddings.py +0 -0
  117. {eval_toolkit-0.43.0 → eval_toolkit-0.45.0}/tests/test_evidence_validators.py +0 -0
  118. {eval_toolkit-0.43.0 → eval_toolkit-0.45.0}/tests/test_harness_edge_cases.py +0 -0
  119. {eval_toolkit-0.43.0 → eval_toolkit-0.45.0}/tests/test_harness_fault_injection.py +0 -0
  120. {eval_toolkit-0.43.0 → eval_toolkit-0.45.0}/tests/test_harness_folded.py +0 -0
  121. {eval_toolkit-0.43.0 → eval_toolkit-0.45.0}/tests/test_harness_internals.py +0 -0
  122. {eval_toolkit-0.43.0 → eval_toolkit-0.45.0}/tests/test_harness_metric_options.py +0 -0
  123. {eval_toolkit-0.43.0 → eval_toolkit-0.45.0}/tests/test_harness_parallelism.py +0 -0
  124. {eval_toolkit-0.43.0 → eval_toolkit-0.45.0}/tests/test_harness_smoke.py +0 -0
  125. {eval_toolkit-0.43.0 → eval_toolkit-0.45.0}/tests/test_import_boundaries.py +0 -0
  126. {eval_toolkit-0.43.0 → eval_toolkit-0.45.0}/tests/test_is_metric_defined_for_slice.py +0 -0
  127. {eval_toolkit-0.43.0 → eval_toolkit-0.45.0}/tests/test_leakage.py +0 -0
  128. {eval_toolkit-0.43.0 → eval_toolkit-0.45.0}/tests/test_leakage_error_paths.py +0 -0
  129. {eval_toolkit-0.43.0 → eval_toolkit-0.45.0}/tests/test_leakage_props.py +0 -0
  130. {eval_toolkit-0.43.0 → eval_toolkit-0.45.0}/tests/test_loaders.py +0 -0
  131. {eval_toolkit-0.43.0 → eval_toolkit-0.45.0}/tests/test_loaders_coverage.py +0 -0
  132. {eval_toolkit-0.43.0 → eval_toolkit-0.45.0}/tests/test_loaders_props.py +0 -0
  133. {eval_toolkit-0.43.0 → eval_toolkit-0.45.0}/tests/test_logging.py +0 -0
  134. {eval_toolkit-0.43.0 → eval_toolkit-0.45.0}/tests/test_manifest.py +0 -0
  135. {eval_toolkit-0.43.0 → eval_toolkit-0.45.0}/tests/test_manifest_contamination_round_trip.py +0 -0
  136. {eval_toolkit-0.43.0 → eval_toolkit-0.45.0}/tests/test_manifest_props.py +0 -0
  137. {eval_toolkit-0.43.0 → eval_toolkit-0.45.0}/tests/test_manifest_validation.py +0 -0
  138. {eval_toolkit-0.43.0 → eval_toolkit-0.45.0}/tests/test_metrics_props.py +0 -0
  139. {eval_toolkit-0.43.0 → eval_toolkit-0.45.0}/tests/test_metrics_stratified_subsets.py +0 -0
  140. {eval_toolkit-0.43.0 → eval_toolkit-0.45.0}/tests/test_metrics_unit.py +0 -0
  141. {eval_toolkit-0.43.0 → eval_toolkit-0.45.0}/tests/test_misc_coverage.py +0 -0
  142. {eval_toolkit-0.43.0 → eval_toolkit-0.45.0}/tests/test_numeric_edge_cases.py +0 -0
  143. {eval_toolkit-0.43.0 → eval_toolkit-0.45.0}/tests/test_ood_loader.py +0 -0
  144. {eval_toolkit-0.43.0 → eval_toolkit-0.45.0}/tests/test_operating_points.py +0 -0
  145. {eval_toolkit-0.43.0 → eval_toolkit-0.45.0}/tests/test_operating_points_props.py +0 -0
  146. {eval_toolkit-0.43.0 → eval_toolkit-0.45.0}/tests/test_parallel.py +0 -0
  147. {eval_toolkit-0.43.0 → eval_toolkit-0.45.0}/tests/test_paths.py +0 -0
  148. {eval_toolkit-0.43.0 → eval_toolkit-0.45.0}/tests/test_pipeline_e2e.py +0 -0
  149. {eval_toolkit-0.43.0 → eval_toolkit-0.45.0}/tests/test_plotting_edge.py +0 -0
  150. {eval_toolkit-0.43.0 → eval_toolkit-0.45.0}/tests/test_plotting_smoke.py +0 -0
  151. {eval_toolkit-0.43.0 → eval_toolkit-0.45.0}/tests/test_plotting_visual.py +0 -0
  152. {eval_toolkit-0.43.0 → eval_toolkit-0.45.0}/tests/test_probes.py +0 -0
  153. {eval_toolkit-0.43.0 → eval_toolkit-0.45.0}/tests/test_protocol_conformance.py +0 -0
  154. {eval_toolkit-0.43.0 → eval_toolkit-0.45.0}/tests/test_provenance.py +0 -0
  155. {eval_toolkit-0.43.0 → eval_toolkit-0.45.0}/tests/test_public_api.py +0 -0
  156. {eval_toolkit-0.43.0 → eval_toolkit-0.45.0}/tests/test_recall_at_fpr.py +0 -0
  157. {eval_toolkit-0.43.0 → eval_toolkit-0.45.0}/tests/test_reference_equivalence.py +0 -0
  158. {eval_toolkit-0.43.0 → eval_toolkit-0.45.0}/tests/test_reproducibility_integration.py +0 -0
  159. {eval_toolkit-0.43.0 → eval_toolkit-0.45.0}/tests/test_schemas.py +0 -0
  160. {eval_toolkit-0.43.0 → eval_toolkit-0.45.0}/tests/test_seeds.py +0 -0
  161. {eval_toolkit-0.43.0 → eval_toolkit-0.45.0}/tests/test_splits.py +0 -0
  162. {eval_toolkit-0.43.0 → eval_toolkit-0.45.0}/tests/test_splits_leakage_integration.py +0 -0
  163. {eval_toolkit-0.43.0 → eval_toolkit-0.45.0}/tests/test_splits_props.py +0 -0
  164. {eval_toolkit-0.43.0 → eval_toolkit-0.45.0}/tests/test_text_dedup.py +0 -0
  165. {eval_toolkit-0.43.0 → eval_toolkit-0.45.0}/tests/test_text_dedup_coverage.py +0 -0
  166. {eval_toolkit-0.43.0 → eval_toolkit-0.45.0}/tests/test_text_dedup_props.py +0 -0
  167. {eval_toolkit-0.43.0 → eval_toolkit-0.45.0}/tests/test_text_dedup_strategies.py +0 -0
  168. {eval_toolkit-0.43.0 → eval_toolkit-0.45.0}/tests/test_thresholds.py +0 -0
  169. {eval_toolkit-0.43.0 → eval_toolkit-0.45.0}/tests/test_thresholds_constant_score.py +0 -0
  170. {eval_toolkit-0.43.0 → eval_toolkit-0.45.0}/tests/test_thresholds_coverage.py +0 -0
  171. {eval_toolkit-0.43.0 → eval_toolkit-0.45.0}/tests/test_thresholds_props.py +0 -0
  172. {eval_toolkit-0.43.0 → eval_toolkit-0.45.0}/tests/test_thresholds_research_grounded.py +0 -0
  173. {eval_toolkit-0.43.0 → eval_toolkit-0.45.0}/tests/test_tokenization_leakage_check.py +0 -0
  174. {eval_toolkit-0.43.0 → eval_toolkit-0.45.0}/tests/test_v09_contracts.py +0 -0
@@ -39,6 +39,12 @@ coverage.json
39
39
  # Logs
40
40
  *.log
41
41
 
42
+ # Local environment overrides (machine-local credentials / config)
43
+ .env.local
44
+
45
+ # Mutation-testing output (mutmut / cargo-mutants — local run artifacts)
46
+ mutants/
47
+
42
48
  # Claude Code project settings (machine-local)
43
49
  .claude/
44
50
 
@@ -5,6 +5,78 @@ All notable changes to this project will be documented in this file.
5
5
  The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
6
6
  and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
7
7
 
8
+ ## [0.45.0] — 2026-05-21 — Stacking: MetaLearner Protocol + LogisticStacker (closes #52)
9
+
10
+ First minor of the staggered v0.45 → v0.46 → v0.47 → v0.48 → v1.0 sequence
11
+ (per the v1.0 plan at `~/.claude/plans/evaluate-all-the-work-twinkly-kite.md`).
12
+ Non-breaking — purely additive. No Protocol shape edits to the existing 6
13
+ Tier-2 contracts (Gate 2 streak continues: 6 of 6 consecutive minors without
14
+ Protocol-shape changes).
15
+
16
+ ### Added
17
+
18
+ - `eval_toolkit.stacking` — new module providing the `MetaLearner` Protocol
19
+ and one reference impl, `LogisticStacker`, for combining outputs from
20
+ multiple binary detectors into a calibrated ensemble. Wraps
21
+ `sklearn.linear_model.LogisticRegression` with a stacker-shaped public API
22
+ (sklearn-style `fit(score_matrix, y)`, `predict(score_matrix)`,
23
+ `predict_proba(score_matrix)`, plus `coef_` / `classes_` / `intercept_`
24
+ attributes). No new dependencies — `scikit-learn` is already core since
25
+ v0.27. Closes #52.
26
+ - `MetaLearner` Protocol — `@runtime_checkable`; sklearn-shape contract
27
+ taking a `(n_samples, n_detectors)` score matrix. Sized as a v1.0 Tier-2
28
+ contract per the v1.0 plan Decision M (tiered stability — strict freeze at
29
+ v1.0; additive subprotocols permitted in minor releases). Mirrors the
30
+ `Probe` Protocol pattern from v0.43.
31
+ - `LogisticStacker` reference impl — configurable C, fit_intercept,
32
+ class_weight, penalty, solver, max_iter, random_state. Class-weight default
33
+ `"balanced"` for the common imbalanced-detection setting. Composes with the
34
+ 4-binary-calibrator family (v0.40 + v0.42) via `fit_platt_binary` /
35
+ `fit_isotonic_binary` chaining on stacked output.
36
+ - 24-test coverage in `tests/test_stacking.py`: Protocol satisfaction (both
37
+ structural and duck-typed), shape contracts (3-detector × 500-sample
38
+ fixtures), regularization behavior (C, L1 penalty), signal ordering,
39
+ calibration chaining (Platt + Isotonic), bootstrap CI on stacker output
40
+ (Audit F6a-aware — uses correct `BootstrapCI.ci_low/ci_high` attribute
41
+ names), determinism under fixed `random_state`, hypothesis property on
42
+ signal monotonicity, input validation (shape mismatch, single-class,
43
+ non-finite, unfit, wrong-n-detectors).
44
+ - `docs/source/examples/stacking.md` — myst-nb worked example: 3 synthetic
45
+ detectors with descending signal-to-noise, stacker fit, post-stacking
46
+ Platt calibration. Cites Wolpert 1992 + Breiman 1996.
47
+
48
+ ### Notes
49
+
50
+ - Sklearn 1.8+ deprecates `LogisticRegression(penalty=...)` in favor of
51
+ `l1_ratio`. The public `LogisticStacker(penalty=...)` API is preserved;
52
+ internal sklearn-side migration to `l1_ratio` will follow once sklearn 1.10
53
+ is released and the warning becomes more visible. No user-facing impact.
54
+
55
+ ## [0.44.0] — 2026-05-19 — Defenses + losses: Spotlighting variants + RecallAtLowFPR (closes #50, #51)
56
+
57
+ ### Added
58
+
59
+ - `eval_toolkit.preprocessing` — new module with 3 Spotlighting
60
+ structural-defense variants from Hines et al. 2024
61
+ (arXiv 2403.14720): `delimit(text, delimiter='<<')`,
62
+ `datamark(text, marker='^')`, `encode(text, encoding='base64')`,
63
+ plus a `sweep(texts, variants=..., kwargs=...)` batch wrapper that
64
+ returns a `(N*3)`-row DataFrame. Includes a `spotlighting`
65
+ SimpleNamespace exposing the upstream issue's function-style API
66
+ (`spotlighting.delimit(text)`, etc.). Base-install safe (pure
67
+ stdlib). Closes #51.
68
+ - `eval_toolkit.losses` — new module with `RecallAtLowFPR` — the
69
+ Meta Prompt Guard 2 (PG2) training recipe: a differentiable
70
+ approximation of recall-at-fixed-FPR via soft-rank, returning a
71
+ scalar `torch.nn.Module` loss for use in standard training loops.
72
+ Optimizes detector ranking at a constrained operating point
73
+ (e.g. `fpr_target=0.01` → "maximize recall while keeping FPR ≤ 1%").
74
+ Closes #50.
75
+ - New optional extra `[losses] = torch>=2.0`. Granular per the v0.43
76
+ plan Decision 4 — separated from `[probes]` so callers wanting only
77
+ the loss don't have to install the larger transformers stack.
78
+ Shares the torch version pin with `[probes]`.
79
+
8
80
  ## [0.43.0] — 2026-05-19 — P1 batch: OOD manifest loader + character_injection sweep + ActivationDeltaProbe (closes #48, #49, #53)
9
81
 
10
82
  ### Added
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: eval-toolkit
3
- Version: 0.43.0
3
+ Version: 0.45.0
4
4
  Summary: Reusable evaluation contracts for binary classification: metrics, bootstrap CIs, calibration, artifacts, and evidence gates.
5
5
  Project-URL: Homepage, https://github.com/brandon-behring/eval-toolkit
6
6
  Project-URL: Documentation, https://brandon-behring.github.io/eval-toolkit/
@@ -62,6 +62,8 @@ Requires-Dist: sphinx-design>=0.6; extra == 'docs'
62
62
  Requires-Dist: sphinx>=7.3; extra == 'docs'
63
63
  Provides-Extra: embeddings
64
64
  Requires-Dist: sentence-transformers>=3.0; extra == 'embeddings'
65
+ Provides-Extra: losses
66
+ Requires-Dist: torch>=2.0; extra == 'losses'
65
67
  Provides-Extra: parquet
66
68
  Requires-Dist: pyarrow>=15.0; extra == 'parquet'
67
69
  Provides-Extra: plotting
@@ -0,0 +1,76 @@
1
+ # Architecture Decision Records
2
+
3
+ This directory captures architecturally-significant decisions that shape
4
+ `eval-toolkit`'s long-term design. ADRs are immutable historical records —
5
+ once accepted, a decision is not edited in place; if it changes, a new ADR
6
+ supersedes it.
7
+
8
+ ## When to file an ADR
9
+
10
+ File a new ADR when a decision:
11
+
12
+ - **Locks in an interface or shape** that future code is expected to
13
+ conform to (e.g., "metrics return type", "Protocol vs ABC").
14
+ - **Closes off alternatives** that were seriously considered, so the
15
+ reasoning isn't lost.
16
+ - **Carries cost** to reverse (e.g., a public-API contract that promises
17
+ stability across a release line).
18
+
19
+ Routine refactors, bug fixes, and internal-only patterns do not need ADRs —
20
+ the commit message + CHANGELOG entry are enough.
21
+
22
+ ## Numbering
23
+
24
+ Sequential, zero-padded: `0001-flat-module-layout.md`,
25
+ `0002-scorecard-as-primary-metric-surface.md`, etc. Number is assigned
26
+ at the time of writing; if two ADRs are drafted in parallel, the second
27
+ to merge takes the next number.
28
+
29
+ ## Format
30
+
31
+ Each ADR uses this skeleton (loosely based on MADR — Markdown ADR — without
32
+ the heavyweight template):
33
+
34
+ ```markdown
35
+ # ADR NNNN: Title
36
+
37
+ **Status:** Proposed | Accepted | Superseded by ADR-MMMM
38
+ **Date:** YYYY-MM-DD
39
+ **Deciders:** (names or roles)
40
+
41
+ ## Context
42
+
43
+ What's the situation that requires a decision? What constraints are at play?
44
+
45
+ ## Decision
46
+
47
+ What did we decide?
48
+
49
+ ## Consequences
50
+
51
+ What follows from this decision? (Both positive and negative.)
52
+
53
+ ## Alternatives considered
54
+
55
+ What else was on the table, and why wasn't it chosen?
56
+
57
+ ## Trigger to revisit
58
+
59
+ What would have to change for this decision to be reopened?
60
+ (Optional but useful — keeps the ADR self-documenting.)
61
+ ```
62
+
63
+ ## Cross-references
64
+
65
+ - [`docs/RELEASING.md`](../../RELEASING.md) — release-flow process; ADRs
66
+ are typically drafted as part of release prep.
67
+ - [`docs/source/roadmap.md`](../roadmap.md) — long-term direction;
68
+ ADRs explain how individual roadmap decisions were made.
69
+
70
+ ## Index
71
+
72
+ (Updated as ADRs are added.)
73
+
74
+ | # | Title | Status | Date |
75
+ |---|---|---|---|
76
+ | _none yet_ | | | |
@@ -69,6 +69,11 @@ transformers = ["transformers>=4.0"]
69
69
  # is base-install-safe (lazy imports inside ActivationDeltaProbe methods);
70
70
  # the extra is strictly for callers wanting to actually fit / predict.
71
71
  probes = ["torch>=2.0", "transformers>=4.40"]
72
+ # v0.44.0: RecallAtLowFPR loss (Meta Prompt Guard 2 recipe; closes #50).
73
+ # torch-only (no transformers); separated from [probes] per Decision 4
74
+ # (granular extras — losses callers should not have to install the larger
75
+ # transformers stack). Shares the torch version pin with [probes].
76
+ losses = ["torch>=2.0"]
72
77
  # DEPRECATED (announced v0.30.1, removal v0.33.0).
73
78
  #
74
79
  # Retained as a transitive no-op so `pip install eval-toolkit[validation]`
@@ -177,7 +182,7 @@ warn_no_return = true
177
182
  strict_equality = true
178
183
 
179
184
  [[tool.mypy.overrides]]
180
- module = ["scipy.*", "sklearn.*", "matplotlib.*", "pandas.*", "yaml.*", "sentence_transformers.*", "joblib.*"]
185
+ module = ["scipy.*", "sklearn.*", "matplotlib.*", "pandas.*", "yaml.*", "sentence_transformers.*", "joblib.*", "torch.*", "transformers.*"]
181
186
  ignore_missing_imports = true
182
187
 
183
188
  [tool.pytest.ini_options]
@@ -40,6 +40,13 @@ _EXPORTS: dict[str, str] = {
40
40
  "WhitespaceInjection": "eval_toolkit.adversarial",
41
41
  "ZeroWidthSpaceInjection": "eval_toolkit.adversarial",
42
42
  "character_injection": "eval_toolkit.adversarial",
43
+ # --- losses ---
44
+ "RecallAtLowFPR": "eval_toolkit.losses",
45
+ # --- preprocessing ---
46
+ "datamark": "eval_toolkit.preprocessing",
47
+ "delimit": "eval_toolkit.preprocessing",
48
+ "encode": "eval_toolkit.preprocessing",
49
+ "spotlighting": "eval_toolkit.preprocessing",
43
50
  # --- probes ---
44
51
  "ActivationDeltaProbe": "eval_toolkit.probes",
45
52
  "ActivationExtractor": "eval_toolkit.probes",
@@ -287,6 +294,8 @@ _EXPORTS: dict[str, str] = {
287
294
  "recall_at_fpr": "eval_toolkit.thresholds",
288
295
  "select_threshold": "eval_toolkit.thresholds",
289
296
  "wilson_interval": "eval_toolkit.thresholds",
297
+ "LogisticStacker": "eval_toolkit.stacking",
298
+ "MetaLearner": "eval_toolkit.stacking",
290
299
  }
291
300
 
292
301
  __all__ = ["__version__", *_EXPORTS.keys()]
@@ -2,4 +2,4 @@
2
2
 
3
3
  __all__ = ["__version__"]
4
4
 
5
- __version__ = "0.43.0"
5
+ __version__ = "0.45.0"
@@ -0,0 +1,225 @@
1
+ """Differentiable losses for prompt-injection detector training.
2
+
3
+ Implements :class:`RecallAtLowFPR` — the Meta Prompt Guard 2 (PG2) training
4
+ recipe, a differentiable approximation of recall-at-fixed-FPR. Optimizes
5
+ detector ranking at a constrained operating point (e.g. FPR ≤ 0.01)
6
+ rather than the implicit FPR-agnostic posture of cross-entropy.
7
+
8
+ This module is base-install safe: ``torch`` is imported lazily, the first
9
+ time a loss is constructed. ``pip install eval-toolkit[losses]`` installs torch.
10
+ The lazy-import pattern matches the ``[probes]`` precedent (separate
11
+ extra so callers wanting only the loss don't have to install
12
+ transformers).
13
+
14
+ The formulation follows the soft-rank approximation described in
15
+ Meta's PG2 release notes and similar metric-learning losses (Liu et al.
16
+ NeurIPS 2020 family):
17
+
18
+ 1. Compute the empirical FPR-target threshold from the negative-class
19
+ scores in the batch via the ``(1 - fpr_target)`` quantile.
20
+ 2. Smooth the indicator ``I(s_i >= threshold)`` with
21
+ ``sigmoid(beta * (s_i - threshold))`` so gradients flow.
22
+ 3. Recall@FPR ≈ ``Σ approx_indicator * y / Σ y``; the loss returned is
23
+ ``1 - Recall@FPR``.
24
+
25
+ References
26
+ ----------
27
+ .. [1] Meta. 2024. "Prompt Guard 2 — release notes & training recipe."
28
+ .. [2] Liu, X., et al. 2020. "Black-box ranking under FPR constraints."
29
+ NeurIPS 2020.
30
+ """
31
+
32
+ from __future__ import annotations
33
+
34
+ from typing import Any, Literal
35
+
36
+ __all__ = [
37
+ "RecallAtLowFPR",
38
+ ]
39
+
40
+
41
+ ReductionMode = Literal["mean", "sum", "none"]
42
+
43
+
44
+ def _require_torch() -> Any:
45
+ """Import torch with a copy-paste install hint if [losses] is missing."""
46
+ try:
47
+ import torch
48
+ except ImportError as exc:
49
+ raise ImportError(
50
+ "RecallAtLowFPR requires torch. Install with: pip install eval-toolkit[losses]"
51
+ ) from exc
52
+ return torch
53
+
54
+
55
+ def _build_module_class() -> Any:
56
+ """Build the :class:`RecallAtLowFPR` ``nn.Module`` lazily.
57
+
58
+ Defined as a factory so importing :mod:`eval_toolkit.losses` does not
59
+ pull torch at module-import time. The class itself is built on first
60
+ instantiation; the public ``RecallAtLowFPR`` wrapper caches it in
61
+ ``_CLASS_CACHE`` so the class is not rebuilt after the first call.
62
+ """
63
+ torch = _require_torch()
64
+ nn = torch.nn
65
+
66
+ # ``nn.Module`` is a runtime-constructed base; mypy can't follow the dynamic
67
+ # class creation. The runtime behavior is correct (nn.Module API + autograd).
68
+ class _RecallAtLowFPR(nn.Module): # type: ignore[misc, name-defined]
69
+ def __init__(
70
+ self,
71
+ fpr_target: float = 0.01,
72
+ fpr_smoothing_beta: float = 10.0,
73
+ pos_weight: float = 1.0,
74
+ reduction: ReductionMode = "mean",
75
+ ) -> None:
76
+ super().__init__()
77
+ if not 0.0 < fpr_target <= 1.0:
78
+ raise ValueError(f"RecallAtLowFPR: fpr_target must be in (0, 1]; got {fpr_target}")
79
+ if fpr_smoothing_beta <= 0:
80
+ raise ValueError(
81
+ f"RecallAtLowFPR: fpr_smoothing_beta must be > 0; got {fpr_smoothing_beta}"
82
+ )
83
+ if reduction not in ("mean", "sum", "none"):
84
+ raise ValueError(
85
+ f"RecallAtLowFPR: reduction must be 'mean'|'sum'|'none'; got {reduction!r}"
86
+ )
87
+ self.fpr_target = float(fpr_target)
88
+ self.fpr_smoothing_beta = float(fpr_smoothing_beta)
89
+ self.pos_weight = float(pos_weight)
90
+ self.reduction = reduction
91
+
92
+ def forward(
93
+ self,
94
+ logits: Any,
95
+ labels: Any,
96
+ ) -> Any:
97
+ """Compute the (differentiable) 1 - Recall@FPR loss.
98
+
99
+ Parameters
100
+ ----------
101
+ logits : torch.Tensor
102
+ Predicted scores, shape ``(B,)`` or ``(B, 1)``. Higher
103
+ value → higher probability of positive class.
104
+ labels : torch.Tensor
105
+ Binary labels in ``{0, 1}``, shape ``(B,)``.
106
+
107
+ Returns
108
+ -------
109
+ torch.Tensor
110
+ Scalar (``reduction="mean"`` or ``"sum"``) or
111
+ per-positive-sample loss (``reduction="none"``).
112
+ """
113
+ scores = logits.squeeze(-1) if logits.dim() == 2 else logits
114
+ if scores.shape != labels.shape:
115
+ raise ValueError(
116
+ f"RecallAtLowFPR: logits shape {tuple(scores.shape)} != "
117
+ f"labels shape {tuple(labels.shape)}"
118
+ )
119
+
120
+ labels_f = labels.float()
121
+ neg_mask = labels_f < 0.5
122
+ pos_mask = labels_f >= 0.5
123
+
124
+ if not torch.any(pos_mask):
125
+ # No positives → recall is undefined; return zero loss with grad.
126
+ return scores.sum() * 0.0
127
+
128
+ # Threshold = (1 - fpr_target)-th quantile of negative scores.
129
+ # torch.quantile is differentiable w.r.t. neg_scores, so gradients reach the threshold.
130
+ neg_scores = scores[neg_mask]
131
+ if neg_scores.numel() == 0:
132
+ # No negatives → no FPR constraint binds; put the threshold one unit below
133
+ # the minimum score so everything ranks above it (recall ≈ 1 → loss ≈ 0).
134
+ threshold = scores.min().detach() - 1.0
135
+ else:
136
+ # quantile q = 1 - fpr_target means we want the score above which
137
+ # exactly fpr_target fraction of negatives sit.
138
+ q = 1.0 - self.fpr_target
139
+ threshold = torch.quantile(neg_scores, q)
140
+
141
+ # Soft indicator: sigmoid(beta * (s - t)) → near-step function as beta → ∞.
142
+ approx_above = torch.sigmoid(self.fpr_smoothing_beta * (scores - threshold))
143
+ # Recall@FPR = (Σ I(s_i ≥ t) * y_i * pos_weight) / (Σ y_i * pos_weight)
144
+ tp_weighted = approx_above * labels_f * self.pos_weight
145
+ denom = labels_f.sum() * self.pos_weight
146
+ recall_at_fpr = tp_weighted.sum() / denom.clamp(min=1e-9)
147
+ per_pos = 1.0 - approx_above[pos_mask] # per-positive contribution
148
+
149
+ if self.reduction == "mean":
150
+ return torch.tensor(1.0, device=scores.device) - recall_at_fpr
151
+ if self.reduction == "sum":
152
+ return per_pos.sum()
153
+ return per_pos # "none"
154
+
155
+ return _RecallAtLowFPR
156
+
157
+
158
+ _CLASS_CACHE: dict[str, Any] = {}
159
+
160
+
161
+ def RecallAtLowFPR( # noqa: N802 — matches issue spec PascalCase class-like name
162
+ fpr_target: float = 0.01,
163
+ fpr_smoothing_beta: float = 10.0,
164
+ pos_weight: float = 1.0,
165
+ reduction: ReductionMode = "mean",
166
+ ) -> Any:
167
+ """Construct a Recall@LowFPR loss module.
168
+
169
+ Differentiable approximation of recall at a constrained false-positive
170
+ rate, per the Meta Prompt Guard 2 training recipe. Optimizes
171
+ detector ranking at a specific operating point (e.g. ``fpr_target=0.01``
172
+ → "maximize recall while keeping FPR ≤ 1%").
173
+
174
+ Parameters
175
+ ----------
176
+ fpr_target : float, optional
177
+ Target false-positive rate (operating point constraint).
178
+ Must be in ``(0, 1]``. Default ``0.01`` (1% FPR).
179
+ fpr_smoothing_beta : float, optional
180
+ Temperature of the soft-indicator approximation; higher values
181
+ make the loss sharper (closer to the hard step function) but
182
+ produce smaller gradients away from the threshold. Default ``10.0``.
183
+ Increase toward training convergence; start low for stable
184
+ gradient flow.
185
+ pos_weight : float, optional
186
+ Per-positive-sample weight applied to the recall numerator and
187
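+ denominator; being a single scalar it cancels in that ratio for any positive value, and it is not applied to the ``"sum"`` / ``"none"`` outputs. Default ``1.0`` (unweighted).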
188
+ reduction : {"mean", "sum", "none"}, optional
189
+ How to reduce the per-positive loss. Default ``"mean"``.
190
+ ``"mean"`` returns the scalar ``1 - Recall@FPR`` (the canonical
191
+ training objective). ``"sum"`` returns the sum of per-positive
192
+ ``1 - approx_indicator``. ``"none"`` returns the per-positive
193
+ ``1 - approx_indicator`` tensor for custom downstream weighting.
194
+
195
+ Returns
196
+ -------
197
+ torch.nn.Module
198
+ The constructed loss module. Drop into any standard PyTorch
199
+ training loop.
200
+
201
+ Raises
202
+ ------
203
+ ImportError
204
+ If the ``[losses]`` extra is not installed.
205
+ ValueError
206
+ On invalid ``fpr_target`` / ``fpr_smoothing_beta`` / ``reduction``.
207
+
208
+ Examples
209
+ --------
210
+ >>> # Requires the [losses] extra.
211
+ >>> # import torch
212
+ >>> # loss = RecallAtLowFPR(fpr_target=0.01)
213
+ >>> # logits = torch.randn(32, requires_grad=True)
214
+ >>> # labels = torch.randint(0, 2, (32,))
215
+ >>> # loss(logits, labels).backward()
216
+ """
217
+ if "cls" not in _CLASS_CACHE:
218
+ _CLASS_CACHE["cls"] = _build_module_class()
219
+ cls = _CLASS_CACHE["cls"]
220
+ return cls(
221
+ fpr_target=fpr_target,
222
+ fpr_smoothing_beta=fpr_smoothing_beta,
223
+ pos_weight=pos_weight,
224
+ reduction=reduction,
225
+ )