eval-toolkit 0.44.0__tar.gz → 0.45.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (174) hide show
  1. {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/.gitignore +6 -0
  2. {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/CHANGELOG.md +47 -0
  3. {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/PKG-INFO +1 -1
  4. eval_toolkit-0.45.0/docs/source/adr/README.md +76 -0
  5. {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/src/eval_toolkit/__init__.py +2 -0
  6. {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/src/eval_toolkit/_version.py +1 -1
  7. eval_toolkit-0.45.0/src/eval_toolkit/stacking.py +412 -0
  8. {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/golden/public_api/snapshot.json +19 -1
  9. eval_toolkit-0.45.0/tests/test_stacking.py +369 -0
  10. {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/LICENSE +0 -0
  11. {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/README.md +0 -0
  12. {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/STYLE.md +0 -0
  13. {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/docs/archive/README.md +0 -0
  14. {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/docs/research/README.md +0 -0
  15. {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/docs/research/datasets/README.md +0 -0
  16. {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/docs/research/papers/data-integrity/README.md +0 -0
  17. {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/docs/research/papers/eval-ecosystem/README.md +0 -0
  18. {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/docs/research/papers/inference/README.md +0 -0
  19. {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/docs/research/papers/prompt-injection/README.md +0 -0
  20. {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/docs/source/methodology/README.md +0 -0
  21. {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/pyproject.toml +0 -0
  22. {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/src/eval_toolkit/__main__.py +0 -0
  23. {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/src/eval_toolkit/_deprecated.py +0 -0
  24. {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/src/eval_toolkit/_parallel.py +0 -0
  25. {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/src/eval_toolkit/adversarial.py +0 -0
  26. {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/src/eval_toolkit/analysis.py +0 -0
  27. {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/src/eval_toolkit/artifacts.py +0 -0
  28. {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/src/eval_toolkit/bootstrap.py +0 -0
  29. {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/src/eval_toolkit/calibration.py +0 -0
  30. {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/src/eval_toolkit/claims.py +0 -0
  31. {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/src/eval_toolkit/config.py +0 -0
  32. {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/src/eval_toolkit/docs.py +0 -0
  33. {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/src/eval_toolkit/embeddings.py +0 -0
  34. {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/src/eval_toolkit/evidence.py +0 -0
  35. {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/src/eval_toolkit/harness.py +0 -0
  36. {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/src/eval_toolkit/leakage.py +0 -0
  37. {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/src/eval_toolkit/loaders.py +0 -0
  38. {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/src/eval_toolkit/losses.py +0 -0
  39. {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/src/eval_toolkit/manifest.py +0 -0
  40. {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/src/eval_toolkit/metrics.py +0 -0
  41. {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/src/eval_toolkit/operating_points.py +0 -0
  42. {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/src/eval_toolkit/paths.py +0 -0
  43. {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/src/eval_toolkit/plotting.py +0 -0
  44. {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/src/eval_toolkit/preprocessing.py +0 -0
  45. {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/src/eval_toolkit/probes.py +0 -0
  46. {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/src/eval_toolkit/protocols.py +0 -0
  47. {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/src/eval_toolkit/provenance.py +0 -0
  48. {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/src/eval_toolkit/py.typed +0 -0
  49. {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/src/eval_toolkit/schemas/manifest.v1.json +0 -0
  50. {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/src/eval_toolkit/schemas/manifest.v2.json +0 -0
  51. {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/src/eval_toolkit/schemas/manifest.v3.json +0 -0
  52. {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/src/eval_toolkit/schemas/ood_manifest.v1.json +0 -0
  53. {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/src/eval_toolkit/schemas/results.v1.json +0 -0
  54. {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/src/eval_toolkit/schemas/results_full.v1.json +0 -0
  55. {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/src/eval_toolkit/seeds.py +0 -0
  56. {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/src/eval_toolkit/splits.py +0 -0
  57. {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/src/eval_toolkit/text_dedup.py +0 -0
  58. {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/src/eval_toolkit/thresholds.py +0 -0
  59. {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/baseline/test_plotting_visual/plot_bootstrap_distribution.png +0 -0
  60. {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/baseline/test_plotting_visual/plot_confusion_matrix_grid.png +0 -0
  61. {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/baseline/test_plotting_visual/plot_lift_ci.png +0 -0
  62. {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/baseline/test_plotting_visual/plot_metric_bars.png +0 -0
  63. {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/baseline/test_plotting_visual/plot_pareto_frontier.png +0 -0
  64. {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/baseline/test_plotting_visual/plot_pr_curve.png +0 -0
  65. {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/baseline/test_plotting_visual/plot_reliability_diagram.png +0 -0
  66. {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/baseline/test_plotting_visual/plot_roc_curve.png +0 -0
  67. {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/baseline/test_plotting_visual/plot_score_histograms.png +0 -0
  68. {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/baseline/test_plotting_visual/plot_slice_metric_heatmap.png +0 -0
  69. {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/benchmarks/__init__.py +0 -0
  70. {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/benchmarks/test_kernel_benchmarks.py +0 -0
  71. {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/conftest.py +0 -0
  72. {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/golden/bootstrap_ci/cases.json +0 -0
  73. {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/golden/data/dedup_holdout.jsonl +0 -0
  74. {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/golden/data/dedup_holdout_expected.json +0 -0
  75. {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/golden/data/dedup_holdout_provenance.md +0 -0
  76. {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/golden/docs/expected.md +0 -0
  77. {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/golden/docs/input.md +0 -0
  78. {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/golden/docs/metrics.json +0 -0
  79. {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/golden/test_dedup_holdout_calibration.py +0 -0
  80. {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/strategies.py +0 -0
  81. {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/test_adversarial.py +0 -0
  82. {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/test_analysis.py +0 -0
  83. {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/test_artifacts.py +0 -0
  84. {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/test_block_bootstrap_on_folds.py +0 -0
  85. {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/test_bootstrap_calibration_mc.py +0 -0
  86. {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/test_bootstrap_edge_cases.py +0 -0
  87. {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/test_bootstrap_golden.py +0 -0
  88. {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/test_bootstrap_njobs.py +0 -0
  89. {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/test_bootstrap_props.py +0 -0
  90. {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/test_bootstrap_research_grounded.py +0 -0
  91. {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/test_bootstrap_unit.py +0 -0
  92. {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/test_calibration_binary_adapters.py +0 -0
  93. {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/test_calibration_bootstrap_chain.py +0 -0
  94. {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/test_calibration_determinism.py +0 -0
  95. {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/test_calibration_optimization_failures.py +0 -0
  96. {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/test_calibration_props.py +0 -0
  97. {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/test_calibration_research_grounded.py +0 -0
  98. {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/test_calibration_unit.py +0 -0
  99. {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/test_claims.py +0 -0
  100. {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/test_claims_coverage.py +0 -0
  101. {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/test_claims_props.py +0 -0
  102. {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/test_cli.py +0 -0
  103. {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/test_config.py +0 -0
  104. {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/test_coverage_bootstrap.py +0 -0
  105. {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/test_coverage_calibration.py +0 -0
  106. {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/test_coverage_harness.py +0 -0
  107. {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/test_coverage_metrics.py +0 -0
  108. {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/test_coverage_plotting.py +0 -0
  109. {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/test_croissant_e2e.py +0 -0
  110. {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/test_dedup_split_leakage_chain.py +0 -0
  111. {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/test_deprecations.py +0 -0
  112. {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/test_docs_golden.py +0 -0
  113. {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/test_docs_props.py +0 -0
  114. {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/test_embeddings.py +0 -0
  115. {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/test_evidence_validators.py +0 -0
  116. {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/test_harness_edge_cases.py +0 -0
  117. {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/test_harness_fault_injection.py +0 -0
  118. {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/test_harness_folded.py +0 -0
  119. {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/test_harness_internals.py +0 -0
  120. {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/test_harness_metric_options.py +0 -0
  121. {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/test_harness_parallelism.py +0 -0
  122. {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/test_harness_smoke.py +0 -0
  123. {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/test_import_boundaries.py +0 -0
  124. {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/test_is_metric_defined_for_slice.py +0 -0
  125. {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/test_leakage.py +0 -0
  126. {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/test_leakage_error_paths.py +0 -0
  127. {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/test_leakage_props.py +0 -0
  128. {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/test_loaders.py +0 -0
  129. {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/test_loaders_coverage.py +0 -0
  130. {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/test_loaders_props.py +0 -0
  131. {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/test_logging.py +0 -0
  132. {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/test_losses.py +0 -0
  133. {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/test_manifest.py +0 -0
  134. {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/test_manifest_contamination_round_trip.py +0 -0
  135. {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/test_manifest_props.py +0 -0
  136. {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/test_manifest_validation.py +0 -0
  137. {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/test_metrics_props.py +0 -0
  138. {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/test_metrics_stratified_subsets.py +0 -0
  139. {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/test_metrics_unit.py +0 -0
  140. {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/test_misc_coverage.py +0 -0
  141. {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/test_numeric_edge_cases.py +0 -0
  142. {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/test_ood_loader.py +0 -0
  143. {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/test_operating_points.py +0 -0
  144. {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/test_operating_points_props.py +0 -0
  145. {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/test_parallel.py +0 -0
  146. {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/test_paths.py +0 -0
  147. {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/test_pipeline_e2e.py +0 -0
  148. {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/test_plotting_edge.py +0 -0
  149. {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/test_plotting_smoke.py +0 -0
  150. {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/test_plotting_visual.py +0 -0
  151. {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/test_preprocessing.py +0 -0
  152. {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/test_probes.py +0 -0
  153. {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/test_protocol_conformance.py +0 -0
  154. {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/test_provenance.py +0 -0
  155. {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/test_public_api.py +0 -0
  156. {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/test_recall_at_fpr.py +0 -0
  157. {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/test_reference_equivalence.py +0 -0
  158. {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/test_reproducibility_integration.py +0 -0
  159. {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/test_schemas.py +0 -0
  160. {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/test_seeds.py +0 -0
  161. {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/test_splits.py +0 -0
  162. {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/test_splits_leakage_integration.py +0 -0
  163. {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/test_splits_props.py +0 -0
  164. {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/test_text_dedup.py +0 -0
  165. {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/test_text_dedup_coverage.py +0 -0
  166. {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/test_text_dedup_props.py +0 -0
  167. {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/test_text_dedup_strategies.py +0 -0
  168. {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/test_thresholds.py +0 -0
  169. {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/test_thresholds_constant_score.py +0 -0
  170. {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/test_thresholds_coverage.py +0 -0
  171. {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/test_thresholds_props.py +0 -0
  172. {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/test_thresholds_research_grounded.py +0 -0
  173. {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/test_tokenization_leakage_check.py +0 -0
  174. {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/test_v09_contracts.py +0 -0
@@ -39,6 +39,12 @@ coverage.json
39
39
  # Logs
40
40
  *.log
41
41
 
42
+ # Local environment overrides (machine-local credentials / config)
43
+ .env.local
44
+
45
+ # Mutation-testing output (mutmut / cargo-mutants — local run artifacts)
46
+ mutants/
47
+
42
48
  # Claude Code project settings (machine-local)
43
49
  .claude/
44
50
 
@@ -5,6 +5,53 @@ All notable changes to this project will be documented in this file.
5
5
  The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
6
6
  and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
7
7
 
8
+ ## [0.45.0] — 2026-05-21 — Stacking: MetaLearner Protocol + LogisticStacker (closes #52)
9
+
10
+ First minor of the staggered v0.45 → v0.46 → v0.47 → v0.48 → v1.0 sequence
11
+ (per the maintainer's v1.0 plan — a machine-local file at `~/.claude/plans/evaluate-all-the-work-twinkly-kite.md`, not part of this repository).
12
+ Non-breaking — purely additive. No Protocol shape edits to the existing 6
13
+ Tier-2 contracts (Gate 2 streak continues: 6 of 6 consecutive minors without
14
+ Protocol-shape changes).
15
+
16
+ ### Added
17
+
18
+ - `eval_toolkit.stacking` — new module providing the `MetaLearner` Protocol
19
+ and one reference impl, `LogisticStacker`, for combining outputs from
20
+ multiple binary detectors into a calibrated ensemble. Wraps
21
+ `sklearn.linear_model.LogisticRegression` with a stacker-shaped public API
22
+ (sklearn-style `fit(score_matrix, y)`, `predict(score_matrix)`,
23
+ `predict_proba(score_matrix)`, plus `coef_` / `classes_` / `intercept_`
24
+ attributes). No new dependencies — `scikit-learn` is already core since
25
+ v0.27. Closes #52.
26
+ - `MetaLearner` Protocol — `@runtime_checkable`; sklearn-shape contract
27
+ taking a `(n_samples, n_detectors)` score matrix. Sized as a v1.0 Tier-2
28
+ contract per the v1.0 plan Decision M (tiered stability — strict freeze at
29
+ v1.0; additive subprotocols permitted in minor releases). Mirrors the
30
+ `Probe` Protocol pattern from v0.43.
31
+ - `LogisticStacker` reference impl — configurable C, fit_intercept,
32
+ class_weight, penalty, solver, max_iter, random_state. Class-weight default
33
+ `"balanced"` for the common imbalanced-detection setting. Composes with the
34
+ 4-binary-calibrator family (v0.40 + v0.42) via `fit_platt_binary` /
35
+ `fit_isotonic_binary` chaining on stacked output.
36
+ - 24-test coverage in `tests/test_stacking.py`: Protocol satisfaction (both
37
+ structural and duck-typed), shape contracts (3-detector × 500-sample
38
+ fixtures), regularization behavior (C, L1 penalty), signal ordering,
39
+ calibration chaining (Platt + Isotonic), bootstrap CI on stacker output
40
+ (Audit F6a-aware — uses correct `BootstrapCI.ci_low/ci_high` attribute
41
+ names), determinism under fixed `random_state`, hypothesis property on
42
+ signal monotonicity, input validation (shape mismatch, single-class,
43
+ non-finite, unfit, wrong-n-detectors).
44
+ - `docs/source/examples/stacking.md` — myst-nb worked example: 3 synthetic
45
+ detectors with descending signal-to-noise, stacker fit, post-stacking
46
+ Platt calibration. Cites Wolpert 1992 + Breiman 1996.
47
+
48
+ ### Notes
49
+
50
+ - Sklearn 1.8+ deprecates `LogisticRegression(penalty=...)` in favor of
51
+ `l1_ratio`. The public `LogisticStacker(penalty=...)` API is preserved;
52
+ the internal sklearn-side migration to `l1_ratio` will follow once sklearn 1.10
53
+ is released and the deprecation warning becomes more visible. No user-facing impact.
54
+
8
55
  ## [0.44.0] — 2026-05-19 — Defenses + losses: Spotlighting variants + RecallAtLowFPR (closes #50, #51)
9
56
 
10
57
  ### Added
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: eval-toolkit
3
- Version: 0.44.0
3
+ Version: 0.45.0
4
4
  Summary: Reusable evaluation contracts for binary classification: metrics, bootstrap CIs, calibration, artifacts, and evidence gates.
5
5
  Project-URL: Homepage, https://github.com/brandon-behring/eval-toolkit
6
6
  Project-URL: Documentation, https://brandon-behring.github.io/eval-toolkit/
@@ -0,0 +1,76 @@
1
+ # Architecture Decision Records
2
+
3
+ This directory captures architecturally-significant decisions that shape
4
+ `eval-toolkit`'s long-term design. ADRs are immutable historical records —
5
+ once accepted, a decision is not edited in place; if it changes, a new ADR
6
+ supersedes it (only the superseded ADR's **Status** line is updated, to "Superseded by ADR-MMMM").
7
+
8
+ ## When to file an ADR
9
+
10
+ File a new ADR when a decision:
11
+
12
+ - **Locks in an interface or shape** that future code is expected to
13
+ conform to (e.g., "metrics return type", "Protocol vs ABC").
14
+ - **Closes off alternatives** that were seriously considered, so the
15
+ reasoning isn't lost.
16
+ - **Carries cost** to reverse (e.g., a public-API contract that promises
17
+ stability across a release line).
18
+
19
+ Routine refactors, bug fixes, and internal-only patterns do not need ADRs —
20
+ the commit message + CHANGELOG entry are enough.
21
+
22
+ ## Numbering
23
+
24
+ Sequential, zero-padded: `0001-flat-module-layout.md`,
25
+ `0002-scorecard-as-primary-metric-surface.md`, etc. A number is provisionally
26
+ assigned at the time of writing; if two ADRs are drafted in parallel, the first
27
+ to merge keeps its number and the second to merge takes the next one.
28
+
29
+ ## Format
30
+
31
+ Each ADR uses this skeleton (loosely based on MADR — Markdown ADR — without
32
+ the heavyweight template):
33
+
34
+ ```markdown
35
+ # ADR NNNN: Title
36
+
37
+ **Status:** Proposed | Accepted | Superseded by ADR-MMMM
38
+ **Date:** YYYY-MM-DD
39
+ **Deciders:** (names or roles)
40
+
41
+ ## Context
42
+
43
+ What's the situation that requires a decision? What constraints are at play?
44
+
45
+ ## Decision
46
+
47
+ What did we decide?
48
+
49
+ ## Consequences
50
+
51
+ What follows from this decision? (Both positive and negative.)
52
+
53
+ ## Alternatives considered
54
+
55
+ What else was on the table, and why wasn't it chosen?
56
+
57
+ ## Trigger to revisit
58
+
59
+ What would have to change for this decision to be reopened?
60
+ (Optional but useful — keeps the ADR self-documenting.)
61
+ ```
62
+
63
+ ## Cross-references
64
+
65
+ - [`docs/RELEASING.md`](../../RELEASING.md) — release-flow process; ADRs
66
+ are typically drafted as part of release prep.
67
+ - [`docs/source/roadmap.md`](../roadmap.md) — long-term direction;
68
+ ADRs explain how individual roadmap decisions were made.
69
+
70
+ ## Index
71
+
72
+ (Updated as ADRs are added.)
73
+
74
+ | # | Title | Status | Date |
75
+ |---|---|---|---|
76
+ | _none yet_ | | | |
@@ -294,6 +294,8 @@ _EXPORTS: dict[str, str] = {
294
294
  "recall_at_fpr": "eval_toolkit.thresholds",
295
295
  "select_threshold": "eval_toolkit.thresholds",
296
296
  "wilson_interval": "eval_toolkit.thresholds",
297
+ "LogisticStacker": "eval_toolkit.stacking",
298
+ "MetaLearner": "eval_toolkit.stacking",
297
299
  }
298
300
 
299
301
  __all__ = ["__version__", *_EXPORTS.keys()]
@@ -2,4 +2,4 @@
2
2
 
3
3
  __all__ = ["__version__"]
4
4
 
5
- __version__ = "0.44.0"
5
+ __version__ = "0.45.0"
@@ -0,0 +1,412 @@
1
+ """Detector stacking — combine multiple binary scorers into a calibrated ensemble.
2
+
3
+ Implements the :class:`MetaLearner` Protocol and one reference impl,
4
+ :class:`LogisticStacker`, for stacking the outputs of multiple base detectors
5
+ into a single P(positive) estimate. The classic use case for prompt-injection
6
+ detection: you have a fine-tuned classifier, an activation probe, and an
7
+ LLM-judge — each emits a per-sample score in ``[0, 1]``. A stacker learns the
8
+ best (regularized) linear combination of those scores.
9
+
10
+ The :class:`MetaLearner` Protocol is intentionally minimal: ``fit`` takes a
11
+ ``(n_samples, n_detectors)`` score matrix plus binary labels; ``predict_proba``
12
+ returns the sklearn-standard ``(n_samples, 2)`` probability matrix. The shape
13
+ mirrors :class:`~eval_toolkit.probes.Probe` so consumers can drop a stacker into
14
+ any sklearn-shaped evaluation harness.
15
+
16
+ Stacking sits AFTER per-detector calibration in a typical pipeline:
17
+
18
+ 1. Train each base detector on training data.
19
+ 2. Calibrate each detector individually (e.g. :func:`fit_platt_binary`).
20
+ 3. On a held-out **stacking** set (disjoint from each detector's training set
21
+ to avoid optimistic stacking), collect the calibrated scores into a matrix.
22
+ 4. Fit a :class:`LogisticStacker` on the matrix + labels.
23
+ 5. Optionally calibrate the stacker's output via another
24
+ :func:`fit_platt_binary` / :func:`fit_isotonic_binary` pass.
25
+
26
+ The Protocol carries an attribute contract (``coef_``, ``classes_``,
27
+ ``intercept_``) and a method contract (``fit``, ``predict``, ``predict_proba``)
28
+ so :class:`MetaLearner` instances are interchangeable inside the harness.
29
+
30
+ References
31
+ ----------
32
+ .. [1] Wolpert, D. H. 1992. "Stacked generalization." Neural Networks
33
+ 5(2), 241–259. doi:10.1016/S0893-6080(05)80023-1.
34
+ .. [2] Breiman, L. 1996. "Stacked regressions." Machine Learning
35
+ 24(1), 49–64.
36
+ """
37
+
38
+ from __future__ import annotations
39
+
40
+ import logging
41
+ from dataclasses import dataclass, field
42
+ from typing import Any, Literal, Protocol, runtime_checkable
43
+
44
+ import numpy as np
45
+ from sklearn.linear_model import LogisticRegression
46
+
47
+ _logger = logging.getLogger(__name__)
48
+
49
+ __all__ = [
50
+ "LogisticStacker",
51
+ "MetaLearner",
52
+ ]
53
+
54
+
55
+ @runtime_checkable
56
+ class MetaLearner(Protocol):
57
+ """Combines per-sample scores from multiple base detectors into P(positive).
58
+
59
+ The Protocol takes a ``(n_samples, n_detectors)`` score matrix — one row per
60
+ sample, one column per base detector — plus binary labels for fitting.
61
+ Output is the sklearn-standard ``(n_samples, 2)`` probability matrix.
62
+
63
+ The contract is sklearn-shaped so stackers compose with the existing
64
+ probe-and-harness machinery. Concrete implementations are expected to be
65
+ deterministic given a fixed ``random_state`` and to expose ``coef_`` /
66
+ ``intercept_`` for interpretability + ``RunManifest`` logging.
67
+
68
+ Attributes
69
+ ----------
70
+ coef_ : numpy.ndarray
71
+ Fitted coefficient vector, shape ``(n_detectors,)`` for binary
72
+ classification. Available after :meth:`fit`. Reading before ``fit``
73
+ raises ``AttributeError`` or returns ``None`` (impl-defined; the
74
+ reference :class:`LogisticStacker` raises).
75
+ classes_ : numpy.ndarray
76
+ Class labels, shape ``(2,)``. For binary stacking, always
77
+ ``array([0, 1])`` after :meth:`fit`.
78
+ intercept_ : numpy.ndarray
79
+ Fitted intercept, shape ``(1,)``. Available after :meth:`fit`.
80
+
81
+ Notes
82
+ -----
83
+ Tier-2 Protocol (frozen at v1.0 per ADR 0003). Additive subprotocols are
84
+ permitted in minor releases; method-signature changes require v2.0.
85
+
86
+ When passed to a parallel-capable harness (``n_jobs > 1``), implementations
87
+ MUST be picklable — joblib's loky backend serializes the entire delayed
88
+ call before worker dispatch. See ``docs/source/methodology/parallelism.md``
89
+ for the picklability contract.
90
+
91
+ Distinct from :class:`~eval_toolkit.protocols.Scorer` (which consumes raw
92
+ feature data and returns 1-D ``P(positive)``). A stacker can be wrapped to
93
+ satisfy ``Scorer`` via ``lambda X: stacker.predict_proba(X)[:, 1]`` once
94
+ callers have collected base-detector scores into ``X``.
95
+
96
+ See Also
97
+ --------
98
+ LogisticStacker : reference implementation wrapping sklearn LogisticRegression.
99
+ """
100
+
101
+ coef_: np.ndarray
102
+ classes_: np.ndarray
103
+ intercept_: np.ndarray
104
+
105
+ def fit(self, score_matrix: np.ndarray, y: np.ndarray) -> MetaLearner: # pragma: no cover
106
+ """Fit the stacker on a ``(n_samples, n_detectors)`` score matrix.
107
+
108
+ Parameters
109
+ ----------
110
+ score_matrix : numpy.ndarray
111
+ Per-detector calibrated scores, shape ``(n_samples, n_detectors)``.
112
+ Values are typically in ``[0, 1]`` but the contract does not
113
+ require it.
114
+ y : numpy.ndarray
115
+ Binary labels in ``{0, 1}``, shape ``(n_samples,)``.
116
+
117
+ Returns
118
+ -------
119
+ MetaLearner
120
+ ``self`` (sklearn convention) — the fitted estimator.
121
+ """
122
+ ...
123
+
124
+ def predict(self, score_matrix: np.ndarray) -> np.ndarray: # pragma: no cover
125
+ """Return binary predictions for ``score_matrix``, shape ``(n_samples,)``."""
126
+ ...
127
+
128
+ def predict_proba(self, score_matrix: np.ndarray) -> np.ndarray: # pragma: no cover
129
+ """Return ``(n_samples, 2)`` probability matrix.
130
+
131
+ Column order matches :attr:`classes_`. Column 1 is ``P(positive)``.
132
+ """
133
+ ...
134
+
135
+
136
+ @dataclass
137
+ class LogisticStacker:
138
+ """Reference :class:`MetaLearner` using :class:`sklearn.linear_model.LogisticRegression`.
139
+
140
+ Wraps sklearn's logistic regression with a stacker-shaped public API.
141
+ Configuration goes in the constructor; fitted state is populated on
142
+ :meth:`fit` and exposed via the standard sklearn ``coef_`` / ``classes_`` /
143
+ ``intercept_`` attributes.
144
+
145
+ Parameters
146
+ ----------
147
+ C : float, optional
148
+ Inverse regularization strength. Smaller ``C`` → stronger L2/L1
149
+ regularization → more shrinkage toward zero on detector weights.
150
+ Default ``1.0`` (sklearn default). For very-few-detector stacking
151
+ (≤3 base scorers), stronger regularization (``C=0.1``) helps prevent
152
+ the stacker from over-fitting to a single dominant detector on small
153
+ stacking sets.
154
+ fit_intercept : bool, optional
155
+ Whether to fit an intercept term. Default ``True``.
156
+ class_weight : str or dict or None, optional
157
+ Per-class sample weighting. Default ``"balanced"`` — automatically
158
+ weights inversely proportional to class frequencies, useful for
159
+ imbalanced injection / non-injection sets.
160
+ penalty : {"l1", "l2", "elasticnet", None}, optional
161
+ Regularization norm. Default ``"l2"``. ``"l1"`` zeros out non-useful
162
+ detector columns (sparsity); ``"l2"`` shrinks them uniformly.
163
+ solver : str, optional
164
+ Optimizer. Default ``"lbfgs"`` (L2-only). Use ``"liblinear"`` for L1
165
+ penalty on small stacking sets; ``"saga"`` for elasticnet.
166
+ max_iter : int, optional
167
+ Maximum iterations. Default ``1000`` (generous; stacking problems are
168
+ small and usually converge in <100).
169
+ random_state : int or None, optional
170
+ Seed for the underlying ``LogisticRegression``. Default ``None``.
171
+ Set for deterministic fitting when the solver involves randomness
172
+ (e.g. ``"saga"``).
173
+
174
+ Attributes
175
+ ----------
176
+ coef_ : numpy.ndarray
177
+ Fitted detector weights, shape ``(n_detectors,)``. Set on
178
+ :meth:`fit`.
179
+ classes_ : numpy.ndarray
180
+ Class labels, shape ``(2,)``. Always ``array([0, 1])`` after a binary
181
+ :meth:`fit`.
182
+ intercept_ : numpy.ndarray
183
+ Fitted intercept, shape ``(1,)``. Set on :meth:`fit`.
184
+
185
+ Examples
186
+ --------
187
+ >>> import numpy as np
188
+ >>> rng = np.random.default_rng(0)
189
+ >>> n = 500
190
+ >>> # Three synthetic detectors with varying noise + signal alignment.
191
+ >>> y = rng.binomial(1, 0.3, size=n)
192
+ >>> scores = np.column_stack([
193
+ ... np.clip(y * 0.7 + rng.normal(0, 0.2, n), 0, 1),
194
+ ... np.clip(y * 0.5 + rng.normal(0, 0.3, n), 0, 1),
195
+ ... np.clip(y * 0.4 + rng.normal(0, 0.4, n), 0, 1),
196
+ ... ])
197
+ >>> stacker = LogisticStacker(C=1.0).fit(scores, y)
198
+ >>> stacker.coef_.shape
199
+ (3,)
200
+ >>> stacker.classes_.tolist()
201
+ [0, 1]
202
+ >>> proba = stacker.predict_proba(scores)
203
+ >>> proba.shape
204
+ (500, 2)
205
+ >>> bool(np.allclose(proba.sum(axis=1), 1.0))
206
+ True
207
+
208
+ Raises
209
+ ------
210
+ ValueError
211
+ On shape mismatch between ``score_matrix`` and ``y``, on empty inputs,
212
+ on non-finite values in ``score_matrix``, or when ``y`` contains only
213
+ one class (logistic regression is undefined).
214
+ RuntimeError
215
+ Propagated from the underlying sklearn solver if it fails to converge.
216
+
217
+ Notes
218
+ -----
219
+ **No new dependencies.** ``scikit-learn`` is already a core eval-toolkit
220
+ dependency since v0.27.
221
+
222
+ **Calibration chaining.** A logistic stacker is not automatically
223
+ well-calibrated on the global P(positive) scale — `LogisticRegression`'s
224
+ sigmoid output is well-calibrated on the training data's class prior but
225
+ can drift on held-out distributions. For downstream calibration metrics
226
+ (ECE, Brier), chain through :func:`fit_platt_binary` or
227
+ :func:`fit_isotonic_binary` on a separate calibration set:
228
+
229
+ >>> from eval_toolkit import fit_platt_binary
230
+ >>> stacker_proba = stacker.predict_proba(scores)[:, 1]
231
+ >>> (_, _), calibrate = fit_platt_binary(y, stacker_proba)
232
+ >>> calibrated = calibrate(stacker.predict_proba(scores)[:, 1])
233
+ >>> calibrated.shape == (500,)
234
+ True
235
+
236
+ See Also
237
+ --------
238
+ eval_toolkit.protocols.Scorer : 1-D ``P(positive)`` contract for raw
239
+ feature inputs.
240
+ eval_toolkit.probes.ActivationDeltaProbe : another sklearn-shaped probe
241
+ producing detector scores.
242
+ eval_toolkit.fit_platt_binary : calibrate stacker output to global prior.
243
+ """
244
+
245
+ C: float = 1.0
246
+ fit_intercept: bool = True
247
+ class_weight: str | dict[Any, float] | None = "balanced"
248
+ penalty: Literal["l1", "l2", "elasticnet"] | None = "l2"
249
+ solver: str = "lbfgs"
250
+ max_iter: int = 1000
251
+ random_state: int | None = None
252
+
253
+ # Fitted state — populated on fit(); excluded from constructor + repr.
254
+ _model: LogisticRegression | None = field(default=None, init=False, repr=False)
255
+ _fitted: bool = field(default=False, init=False, repr=False)
256
+
257
@property
def coef_(self) -> np.ndarray:
    """Return the fitted per-detector weights as a flat ``(n_detectors,)`` array.

    Raises
    ------
    ValueError
        If the stacker has not been fit yet (raised by ``_assert_fitted``).
    """
    self._assert_fitted()
    fitted = self._model
    assert fitted is not None  # guaranteed by _assert_fitted; narrows for mypy
    # The wrapped model stores binary coefficients as (1, n_features).
    # np.asarray turns sklearn's Any-typed attribute into a concrete ndarray
    # before it is flattened to (n_features,).
    weights = np.asarray(fitted.coef_)
    return weights.ravel()
265
+
266
@property
def classes_(self) -> np.ndarray:
    """Return the class labels held by the fitted model, shape ``(2,)``.

    Raises
    ------
    ValueError
        If the stacker has not been fit yet (raised by ``_assert_fitted``).
    """
    self._assert_fitted()
    fitted = self._model
    assert fitted is not None  # guaranteed by _assert_fitted; narrows for mypy
    labels = np.asarray(fitted.classes_)
    return labels
272
+
273
@property
def intercept_(self) -> np.ndarray:
    """Return the fitted intercept term, shape ``(1,)``.

    Raises
    ------
    ValueError
        If the stacker has not been fit yet (raised by ``_assert_fitted``).
    """
    self._assert_fitted()
    fitted = self._model
    assert fitted is not None  # guaranteed by _assert_fitted; narrows for mypy
    bias = np.asarray(fitted.intercept_)
    return bias
279
+
280
def fit(self, score_matrix: np.ndarray, y: np.ndarray) -> LogisticStacker:
    """Train the stacker on per-detector scores and binary labels.

    Parameters
    ----------
    score_matrix : numpy.ndarray
        Detector scores, shape ``(n_samples, n_detectors)``; every entry
        must be finite. A single detector column is accepted, although the
        result then amounts to recalibrating that one detector.
    y : numpy.ndarray
        Binary labels in ``{0, 1}``, shape ``(n_samples,)``.

    Returns
    -------
    LogisticStacker
        ``self``, now carrying the fitted model.

    Raises
    ------
    ValueError
        On shape mismatch, empty inputs, non-finite ``score_matrix``, or
        single-class ``y``.
    """
    features = np.asarray(score_matrix, dtype=float)
    labels = np.asarray(y).ravel()
    _validate_fit_inputs(features, labels)

    # Forward the configured hyperparameters unchanged to the sklearn estimator.
    hyperparams = {
        "C": self.C,
        "fit_intercept": self.fit_intercept,
        "class_weight": self.class_weight,
        "penalty": self.penalty,
        "solver": self.solver,
        "max_iter": self.max_iter,
        "random_state": self.random_state,
    }
    estimator = LogisticRegression(**hyperparams)
    estimator.fit(features, labels)

    # State is published only after the fit call returns, so a failed call
    # leaves any previously fitted model in place.
    self._model = estimator
    self._fitted = True
    return self
320
+
321
def predict(self, score_matrix: np.ndarray) -> np.ndarray:
    """Return hard binary predictions, shape ``(n_samples,)``.

    Threshold is sklearn's default 0.5 on column-1 (``P(positive)``). To
    evaluate a different operating point, feed :meth:`predict_proba` output
    to :func:`metrics_at_threshold` instead.

    Raises
    ------
    ValueError
        If :meth:`fit` has not been called yet, or on shape / finiteness
        issues in ``score_matrix``.
    """
    self._assert_fitted()
    fitted = self._model
    assert fitted is not None  # guaranteed by _assert_fitted; narrows for mypy
    features = np.asarray(score_matrix, dtype=float)
    # The fitted weight vector has one entry per detector column.
    n_detectors = self.coef_.shape[0]
    _validate_predict_inputs(features, expected_n_features=n_detectors)
    labels = fitted.predict(features)
    return np.asarray(labels)
339
+
340
def predict_proba(self, score_matrix: np.ndarray) -> np.ndarray:
    """Return class probabilities as an ``(n_samples, 2)`` matrix.

    Columns follow :attr:`classes_` (``[0, 1]``), so column 1 holds
    ``P(positive)``.

    Raises
    ------
    ValueError
        If :meth:`fit` has not been called yet, or on shape / finiteness
        issues in ``score_matrix``.
    """
    self._assert_fitted()
    fitted = self._model
    assert fitted is not None  # guaranteed by _assert_fitted; narrows for mypy
    features = np.asarray(score_matrix, dtype=float)
    # The fitted weight vector has one entry per detector column.
    n_detectors = self.coef_.shape[0]
    _validate_predict_inputs(features, expected_n_features=n_detectors)
    probabilities = fitted.predict_proba(features)
    return np.asarray(probabilities)
357
+
358
def _assert_fitted(self) -> None:
    """Guard used by the fitted-state accessors and predictors.

    Returns silently when the fitted flag is set and a model object is
    stored; otherwise raises ``ValueError`` telling the caller to run
    :meth:`fit` first.
    """
    is_ready = self._fitted and self._model is not None
    if is_ready:
        return
    raise ValueError(
        "LogisticStacker has not been fit yet. Call `.fit(score_matrix, y)` first."
    )
364
+
365
+
366
def _validate_fit_inputs(score_matrix: np.ndarray, y: np.ndarray) -> None:
    """Shared input validation for :meth:`LogisticStacker.fit`.

    Checks run in the order listed and the first failure raises, each with a
    context-rich message.

    Parameters
    ----------
    score_matrix : numpy.ndarray
        Candidate ``(n_samples, n_detectors)`` score matrix.
    y : numpy.ndarray
        Candidate ``(n_samples,)`` label vector.

    Raises
    ------
    ValueError
        If ``score_matrix`` is not 2-D, is empty, or contains NaN / inf; if
        ``y`` is not 1-D or its length differs from ``score_matrix``'s row
        count; or if ``y`` does not contain exactly two distinct classes.
    """
    if score_matrix.ndim != 2:
        raise ValueError(
            f"score_matrix must be 2-D (n_samples, n_detectors); got ndim={score_matrix.ndim}"
        )
    if score_matrix.size == 0:
        raise ValueError("score_matrix is empty; provide at least one sample")
    if y.ndim != 1:
        raise ValueError(f"y must be 1-D (n_samples,); got ndim={y.ndim}")
    if score_matrix.shape[0] != y.shape[0]:
        raise ValueError(
            "score_matrix and y must have matching n_samples; "
            f"got score_matrix.shape[0]={score_matrix.shape[0]}, y.shape[0]={y.shape[0]}"
        )
    if not np.all(np.isfinite(score_matrix)):
        raise ValueError("score_matrix contains non-finite values (NaN or inf)")
    unique = np.unique(y)
    if unique.size < 2:
        raise ValueError(
            "y is single-class; LogisticStacker requires both classes "
            f"in the training set (got y.unique() = {unique.tolist()})"
        )
    # More than two classes would make sklearn fit one coefficient row per
    # class. LogisticStacker.coef_ flattens that matrix, so predict() and
    # predict_proba() would then expect n_classes * n_detectors columns and
    # reject every correctly shaped input. Fail here instead.
    if unique.size > 2:
        raise ValueError(
            "y must be binary; LogisticStacker supports exactly two classes "
            f"(got y.unique() = {unique.tolist()})"
        )
392
+
393
+
394
def _validate_predict_inputs(score_matrix: np.ndarray, *, expected_n_features: int) -> None:
    """Validate a score matrix before predict / predict_proba use it.

    The matrix must be 2-D, non-empty, entirely finite, and have exactly
    ``expected_n_features`` detector columns. Checks run in that order and
    the first violation raises ``ValueError`` with context.
    """
    n_dims = score_matrix.ndim
    if n_dims != 2:
        raise ValueError(
            f"score_matrix must be 2-D (n_samples, n_detectors); got ndim={n_dims}"
        )

    if not score_matrix.size:
        raise ValueError("score_matrix is empty; provide at least one sample")

    all_finite = bool(np.isfinite(score_matrix).all())
    if not all_finite:
        raise ValueError("score_matrix contains non-finite values (NaN or inf)")

    n_columns = score_matrix.shape[1]
    if n_columns != expected_n_features:
        raise ValueError(
            "score_matrix has wrong number of detectors; "
            f"expected {expected_n_features}, got {n_columns}"
        )
@@ -57,9 +57,11 @@
57
57
  "LeakageCheck",
58
58
  "LeakageFinding",
59
59
  "LeakageReport",
60
+ "LogisticStacker",
60
61
  "MANIFEST_SCHEMA_VERSION",
61
62
  "MDEEstimate",
62
63
  "MaxF1Selector",
64
+ "MetaLearner",
63
65
  "MetricFn",
64
66
  "MetricState",
65
67
  "MinHashLSHStrategy",
@@ -684,6 +686,14 @@
684
686
  "kind": "class",
685
687
  "signature": "(findings: 'list[LeakageFinding]' = <factory>) -> None"
686
688
  },
689
+ "LogisticStacker": {
690
+ "bases": [
691
+ "object"
692
+ ],
693
+ "doc_first_line": "Reference :class:`MetaLearner` using :class:`sklearn.linear_model.LogisticRegression`.",
694
+ "kind": "class",
695
+ "signature": "(C: 'float' = 1.0, fit_intercept: 'bool' = True, class_weight: 'str | dict[Any, float] | None' = 'balanced', penalty: \"Literal['l1', 'l2', 'elasticnet'] | None\" = 'l2', solver: 'str' = 'lbfgs', max_iter: 'int' = 1000, random_state: 'int | None' = None) -> None"
696
+ },
687
697
  "MANIFEST_SCHEMA_VERSION": {
688
698
  "doc_first_line": "str(object='') -> str",
689
699
  "kind": "value",
@@ -706,6 +716,14 @@
706
716
  "kind": "class",
707
717
  "signature": "(criterion: 'str' = 'max_f1') -> None"
708
718
  },
719
+ "MetaLearner": {
720
+ "bases": [
721
+ "Protocol"
722
+ ],
723
+ "doc_first_line": "Combines per-sample scores from multiple base detectors into P(positive).",
724
+ "kind": "class",
725
+ "signature": "(*args, **kwargs)"
726
+ },
709
727
  "MetricFn": {
710
728
  "doc_first_line": "",
711
729
  "kind": "function",
@@ -1154,7 +1172,7 @@
1154
1172
  "doc_first_line": "str(object='') -> str",
1155
1173
  "kind": "value",
1156
1174
  "type": "str",
1157
- "value": "'0.44.0'"
1175
+ "value": "'0.45.0'"
1158
1176
  },
1159
1177
  "apply_operating_points": {
1160
1178
  "doc_first_line": "Apply fitted thresholds to a mixed-class or single-class target slice.",