eval-toolkit 0.44.0__tar.gz → 0.45.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/.gitignore +6 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/CHANGELOG.md +47 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/PKG-INFO +1 -1
- eval_toolkit-0.45.0/docs/source/adr/README.md +76 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/src/eval_toolkit/__init__.py +2 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/src/eval_toolkit/_version.py +1 -1
- eval_toolkit-0.45.0/src/eval_toolkit/stacking.py +412 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/golden/public_api/snapshot.json +19 -1
- eval_toolkit-0.45.0/tests/test_stacking.py +369 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/LICENSE +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/README.md +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/STYLE.md +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/docs/archive/README.md +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/docs/research/README.md +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/docs/research/datasets/README.md +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/docs/research/papers/data-integrity/README.md +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/docs/research/papers/eval-ecosystem/README.md +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/docs/research/papers/inference/README.md +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/docs/research/papers/prompt-injection/README.md +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/docs/source/methodology/README.md +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/pyproject.toml +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/src/eval_toolkit/__main__.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/src/eval_toolkit/_deprecated.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/src/eval_toolkit/_parallel.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/src/eval_toolkit/adversarial.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/src/eval_toolkit/analysis.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/src/eval_toolkit/artifacts.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/src/eval_toolkit/bootstrap.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/src/eval_toolkit/calibration.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/src/eval_toolkit/claims.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/src/eval_toolkit/config.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/src/eval_toolkit/docs.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/src/eval_toolkit/embeddings.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/src/eval_toolkit/evidence.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/src/eval_toolkit/harness.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/src/eval_toolkit/leakage.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/src/eval_toolkit/loaders.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/src/eval_toolkit/losses.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/src/eval_toolkit/manifest.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/src/eval_toolkit/metrics.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/src/eval_toolkit/operating_points.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/src/eval_toolkit/paths.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/src/eval_toolkit/plotting.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/src/eval_toolkit/preprocessing.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/src/eval_toolkit/probes.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/src/eval_toolkit/protocols.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/src/eval_toolkit/provenance.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/src/eval_toolkit/py.typed +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/src/eval_toolkit/schemas/manifest.v1.json +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/src/eval_toolkit/schemas/manifest.v2.json +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/src/eval_toolkit/schemas/manifest.v3.json +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/src/eval_toolkit/schemas/ood_manifest.v1.json +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/src/eval_toolkit/schemas/results.v1.json +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/src/eval_toolkit/schemas/results_full.v1.json +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/src/eval_toolkit/seeds.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/src/eval_toolkit/splits.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/src/eval_toolkit/text_dedup.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/src/eval_toolkit/thresholds.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/baseline/test_plotting_visual/plot_bootstrap_distribution.png +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/baseline/test_plotting_visual/plot_confusion_matrix_grid.png +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/baseline/test_plotting_visual/plot_lift_ci.png +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/baseline/test_plotting_visual/plot_metric_bars.png +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/baseline/test_plotting_visual/plot_pareto_frontier.png +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/baseline/test_plotting_visual/plot_pr_curve.png +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/baseline/test_plotting_visual/plot_reliability_diagram.png +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/baseline/test_plotting_visual/plot_roc_curve.png +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/baseline/test_plotting_visual/plot_score_histograms.png +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/baseline/test_plotting_visual/plot_slice_metric_heatmap.png +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/benchmarks/__init__.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/benchmarks/test_kernel_benchmarks.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/conftest.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/golden/bootstrap_ci/cases.json +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/golden/data/dedup_holdout.jsonl +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/golden/data/dedup_holdout_expected.json +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/golden/data/dedup_holdout_provenance.md +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/golden/docs/expected.md +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/golden/docs/input.md +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/golden/docs/metrics.json +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/golden/test_dedup_holdout_calibration.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/strategies.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/test_adversarial.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/test_analysis.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/test_artifacts.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/test_block_bootstrap_on_folds.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/test_bootstrap_calibration_mc.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/test_bootstrap_edge_cases.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/test_bootstrap_golden.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/test_bootstrap_njobs.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/test_bootstrap_props.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/test_bootstrap_research_grounded.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/test_bootstrap_unit.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/test_calibration_binary_adapters.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/test_calibration_bootstrap_chain.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/test_calibration_determinism.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/test_calibration_optimization_failures.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/test_calibration_props.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/test_calibration_research_grounded.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/test_calibration_unit.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/test_claims.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/test_claims_coverage.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/test_claims_props.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/test_cli.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/test_config.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/test_coverage_bootstrap.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/test_coverage_calibration.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/test_coverage_harness.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/test_coverage_metrics.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/test_coverage_plotting.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/test_croissant_e2e.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/test_dedup_split_leakage_chain.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/test_deprecations.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/test_docs_golden.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/test_docs_props.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/test_embeddings.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/test_evidence_validators.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/test_harness_edge_cases.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/test_harness_fault_injection.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/test_harness_folded.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/test_harness_internals.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/test_harness_metric_options.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/test_harness_parallelism.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/test_harness_smoke.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/test_import_boundaries.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/test_is_metric_defined_for_slice.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/test_leakage.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/test_leakage_error_paths.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/test_leakage_props.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/test_loaders.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/test_loaders_coverage.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/test_loaders_props.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/test_logging.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/test_losses.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/test_manifest.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/test_manifest_contamination_round_trip.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/test_manifest_props.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/test_manifest_validation.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/test_metrics_props.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/test_metrics_stratified_subsets.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/test_metrics_unit.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/test_misc_coverage.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/test_numeric_edge_cases.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/test_ood_loader.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/test_operating_points.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/test_operating_points_props.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/test_parallel.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/test_paths.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/test_pipeline_e2e.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/test_plotting_edge.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/test_plotting_smoke.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/test_plotting_visual.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/test_preprocessing.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/test_probes.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/test_protocol_conformance.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/test_provenance.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/test_public_api.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/test_recall_at_fpr.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/test_reference_equivalence.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/test_reproducibility_integration.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/test_schemas.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/test_seeds.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/test_splits.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/test_splits_leakage_integration.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/test_splits_props.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/test_text_dedup.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/test_text_dedup_coverage.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/test_text_dedup_props.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/test_text_dedup_strategies.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/test_thresholds.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/test_thresholds_constant_score.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/test_thresholds_coverage.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/test_thresholds_props.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/test_thresholds_research_grounded.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/test_tokenization_leakage_check.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.45.0}/tests/test_v09_contracts.py +0 -0
|
@@ -39,6 +39,12 @@ coverage.json
|
|
|
39
39
|
# Logs
|
|
40
40
|
*.log
|
|
41
41
|
|
|
42
|
+
# Local environment overrides (machine-local credentials / config)
|
|
43
|
+
.env.local
|
|
44
|
+
|
|
45
|
+
# Mutation-testing output (mutmut / cargo-mutants — local run artifacts)
|
|
46
|
+
mutants/
|
|
47
|
+
|
|
42
48
|
# Claude Code project settings (machine-local)
|
|
43
49
|
.claude/
|
|
44
50
|
|
|
@@ -5,6 +5,53 @@ All notable changes to this project will be documented in this file.
|
|
|
5
5
|
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
|
|
6
6
|
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
|
7
7
|
|
|
8
|
+
## [0.45.0] — 2026-05-21 — Stacking: MetaLearner Protocol + LogisticStacker (closes #52)
|
|
9
|
+
|
|
10
|
+
First minor of the staggered v0.45 → v0.46 → v0.47 → v0.48 → v1.0 sequence
|
|
11
|
+
(per the v1.0 plan at `~/.claude/plans/evaluate-all-the-work-twinkly-kite.md`).
|
|
12
|
+
Non-breaking — purely additive. No Protocol shape edits to the existing 6
|
|
13
|
+
Tier-2 contracts (Gate 2 streak continues: 6 of 6 consecutive minors without
|
|
14
|
+
Protocol-shape changes).
|
|
15
|
+
|
|
16
|
+
### Added
|
|
17
|
+
|
|
18
|
+
- `eval_toolkit.stacking` — new module providing the `MetaLearner` Protocol
|
|
19
|
+
and one reference impl, `LogisticStacker`, for combining outputs from
|
|
20
|
+
multiple binary detectors into a calibrated ensemble. Wraps
|
|
21
|
+
`sklearn.linear_model.LogisticRegression` with a stacker-shaped public API
|
|
22
|
+
(sklearn-style `fit(score_matrix, y)`, `predict(score_matrix)`,
|
|
23
|
+
`predict_proba(score_matrix)`, plus `coef_` / `classes_` / `intercept_`
|
|
24
|
+
attributes). No new dependencies — `scikit-learn` is already core since
|
|
25
|
+
v0.27. Closes #52.
|
|
26
|
+
- `MetaLearner` Protocol — `@runtime_checkable`; sklearn-shape contract
|
|
27
|
+
taking a `(n_samples, n_detectors)` score matrix. Sized as a v1.0 Tier-2
|
|
28
|
+
contract per the v1.0 plan Decision M (tiered stability — strict freeze at
|
|
29
|
+
v1.0; additive subprotocols permitted in minor releases). Mirrors the
|
|
30
|
+
`Probe` Protocol pattern from v0.43.
|
|
31
|
+
- `LogisticStacker` reference impl — configurable C, fit_intercept,
|
|
32
|
+
class_weight, penalty, solver, max_iter, random_state. Class-weight default
|
|
33
|
+
`"balanced"` for the common imbalanced-detection setting. Composes with the
|
|
34
|
+
4-binary-calibrator family (v0.40 + v0.42) via `fit_platt_binary` /
|
|
35
|
+
`fit_isotonic_binary` chaining on stacked output.
|
|
36
|
+
- 24-test coverage in `tests/test_stacking.py`: Protocol satisfaction (both
|
|
37
|
+
structural and duck-typed), shape contracts (3-detector × 500-sample
|
|
38
|
+
fixtures), regularization behavior (C, L1 penalty), signal ordering,
|
|
39
|
+
calibration chaining (Platt + Isotonic), bootstrap CI on stacker output
|
|
40
|
+
(Audit F6a-aware — uses correct `BootstrapCI.ci_low/ci_high` attribute
|
|
41
|
+
names), determinism under fixed `random_state`, hypothesis property on
|
|
42
|
+
signal monotonicity, input validation (shape mismatch, single-class,
|
|
43
|
+
non-finite, unfit, wrong-n-detectors).
|
|
44
|
+
- `docs/source/examples/stacking.md` — myst-nb worked example: 3 synthetic
|
|
45
|
+
detectors with descending signal-to-noise, stacker fit, post-stacking
|
|
46
|
+
Platt calibration. Cites Wolpert 1992 + Breiman 1996.
|
|
47
|
+
|
|
48
|
+
### Notes
|
|
49
|
+
|
|
50
|
+
- Sklearn 1.8+ deprecates `LogisticRegression(penalty=...)` in favor of
|
|
51
|
+
`l1_ratio`. The public `LogisticStacker(penalty=...)` API is preserved;
|
|
52
|
+
internal sklearn-side migration to `l1_ratio` will land when sklearn 1.10
|
|
53
|
+
is released and the warning becomes more visible. No user-facing impact.
|
|
54
|
+
|
|
8
55
|
## [0.44.0] — 2026-05-19 — Defenses + losses: Spotlighting variants + RecallAtLowFPR (closes #50, #51)
|
|
9
56
|
|
|
10
57
|
### Added
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: eval-toolkit
|
|
3
|
-
Version: 0.44.0
|
|
3
|
+
Version: 0.45.0
|
|
4
4
|
Summary: Reusable evaluation contracts for binary classification: metrics, bootstrap CIs, calibration, artifacts, and evidence gates.
|
|
5
5
|
Project-URL: Homepage, https://github.com/brandon-behring/eval-toolkit
|
|
6
6
|
Project-URL: Documentation, https://brandon-behring.github.io/eval-toolkit/
|
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
# Architecture Decision Records
|
|
2
|
+
|
|
3
|
+
This directory captures architecturally-significant decisions that shape
|
|
4
|
+
`eval-toolkit`'s long-term design. ADRs are immutable historical records —
|
|
5
|
+
once accepted, a decision is not edited in place; if it changes, a new ADR
|
|
6
|
+
supersedes it.
|
|
7
|
+
|
|
8
|
+
## When to file an ADR
|
|
9
|
+
|
|
10
|
+
File a new ADR when a decision:
|
|
11
|
+
|
|
12
|
+
- **Locks in an interface or shape** that future code is expected to
|
|
13
|
+
conform to (e.g., "metrics return type", "Protocol vs ABC").
|
|
14
|
+
- **Closes off alternatives** that were seriously considered, so the
|
|
15
|
+
reasoning isn't lost.
|
|
16
|
+
- **Carries cost** to reverse (e.g., a public-API contract that promises
|
|
17
|
+
stability across a release line).
|
|
18
|
+
|
|
19
|
+
Routine refactors, bug fixes, and internal-only patterns do not need ADRs —
|
|
20
|
+
the commit message + CHANGELOG entry are enough.
|
|
21
|
+
|
|
22
|
+
## Numbering
|
|
23
|
+
|
|
24
|
+
Sequential, zero-padded: `0001-flat-module-layout.md`,
|
|
25
|
+
`0002-scorecard-as-primary-metric-surface.md`, etc. Number is assigned
|
|
26
|
+
at the time of writing; if two ADRs are drafted in parallel, the second
|
|
27
|
+
to merge takes the next number.
|
|
28
|
+
|
|
29
|
+
## Format
|
|
30
|
+
|
|
31
|
+
Each ADR uses this skeleton (loosely based on MADR — Markdown ADR — without
|
|
32
|
+
the heavyweight template):
|
|
33
|
+
|
|
34
|
+
```markdown
|
|
35
|
+
# ADR NNNN: Title
|
|
36
|
+
|
|
37
|
+
**Status:** Proposed | Accepted | Superseded by ADR-MMMM
|
|
38
|
+
**Date:** YYYY-MM-DD
|
|
39
|
+
**Deciders:** (names or roles)
|
|
40
|
+
|
|
41
|
+
## Context
|
|
42
|
+
|
|
43
|
+
What's the situation that requires a decision? What constraints are at play?
|
|
44
|
+
|
|
45
|
+
## Decision
|
|
46
|
+
|
|
47
|
+
What did we decide?
|
|
48
|
+
|
|
49
|
+
## Consequences
|
|
50
|
+
|
|
51
|
+
What follows from this decision? (Both positive and negative.)
|
|
52
|
+
|
|
53
|
+
## Alternatives considered
|
|
54
|
+
|
|
55
|
+
What else was on the table, and why wasn't it chosen?
|
|
56
|
+
|
|
57
|
+
## Trigger to revisit
|
|
58
|
+
|
|
59
|
+
What would have to change for this decision to be reopened?
|
|
60
|
+
(Optional but useful — keeps the ADR self-documenting.)
|
|
61
|
+
```
|
|
62
|
+
|
|
63
|
+
## Cross-references
|
|
64
|
+
|
|
65
|
+
- [`docs/RELEASING.md`](../../RELEASING.md) — release-flow process; ADRs
|
|
66
|
+
are typically drafted as part of release prep.
|
|
67
|
+
- [`docs/source/roadmap.md`](../roadmap.md) — long-term direction;
|
|
68
|
+
ADRs explain how individual roadmap decisions were made.
|
|
69
|
+
|
|
70
|
+
## Index
|
|
71
|
+
|
|
72
|
+
(Updated as ADRs are added.)
|
|
73
|
+
|
|
74
|
+
| # | Title | Status | Date |
|
|
75
|
+
|---|---|---|---|
|
|
76
|
+
| _none yet_ | | | |
|
|
@@ -294,6 +294,8 @@ _EXPORTS: dict[str, str] = {
|
|
|
294
294
|
"recall_at_fpr": "eval_toolkit.thresholds",
|
|
295
295
|
"select_threshold": "eval_toolkit.thresholds",
|
|
296
296
|
"wilson_interval": "eval_toolkit.thresholds",
|
|
297
|
+
"LogisticStacker": "eval_toolkit.stacking",
|
|
298
|
+
"MetaLearner": "eval_toolkit.stacking",
|
|
297
299
|
}
|
|
298
300
|
|
|
299
301
|
__all__ = ["__version__", *_EXPORTS.keys()]
|
|
@@ -0,0 +1,412 @@
|
|
|
1
|
+
"""Detector stacking — combine multiple binary scorers into a calibrated ensemble.
|
|
2
|
+
|
|
3
|
+
Implements the :class:`MetaLearner` Protocol and one reference impl,
|
|
4
|
+
:class:`LogisticStacker`, for stacking the outputs of multiple base detectors
|
|
5
|
+
into a single P(positive) estimate. The classic use case for prompt-injection
|
|
6
|
+
detection: you have a fine-tuned classifier, an activation probe, and an
|
|
7
|
+
LLM-judge — each emits a per-sample score in ``[0, 1]``. A stacker learns the
|
|
8
|
+
best (regularized) linear combination of those scores.
|
|
9
|
+
|
|
10
|
+
The :class:`MetaLearner` Protocol is intentionally minimal: ``fit`` takes a
|
|
11
|
+
``(n_samples, n_detectors)`` score matrix plus binary labels; ``predict_proba``
|
|
12
|
+
returns the sklearn-standard ``(n_samples, 2)`` probability matrix. The shape
|
|
13
|
+
mirrors :class:`~eval_toolkit.probes.Probe` so consumers can drop a stacker into
|
|
14
|
+
any sklearn-shaped evaluation harness.
|
|
15
|
+
|
|
16
|
+
Stacking sits AFTER per-detector calibration in a typical pipeline:
|
|
17
|
+
|
|
18
|
+
1. Train each base detector on training data.
|
|
19
|
+
2. Calibrate each detector individually (e.g. :func:`fit_platt_binary`).
|
|
20
|
+
3. On a held-out **stacking** set (disjoint from each detector's training set
|
|
21
|
+
to avoid optimistic stacking), collect the calibrated scores into a matrix.
|
|
22
|
+
4. Fit a :class:`LogisticStacker` on the matrix + labels.
|
|
23
|
+
5. Optionally calibrate the stacker's output via another
|
|
24
|
+
:func:`fit_platt_binary` / :func:`fit_isotonic_binary` pass.
|
|
25
|
+
|
|
26
|
+
The Protocol carries an attribute contract (``coef_``, ``classes_``,
|
|
27
|
+
``intercept_``) and a method contract (``fit``, ``predict``, ``predict_proba``)
|
|
28
|
+
so :class:`MetaLearner` instances are interchangeable inside the harness.
|
|
29
|
+
|
|
30
|
+
References
|
|
31
|
+
----------
|
|
32
|
+
.. [1] Wolpert, D. H. 1992. "Stacked generalization." Neural Networks
|
|
33
|
+
5(2), 241–259. doi:10.1016/S0893-6080(05)80023-1.
|
|
34
|
+
.. [2] Breiman, L. 1996. "Stacked regressions." Machine Learning
|
|
35
|
+
24(1), 49–64.
|
|
36
|
+
"""
|
|
37
|
+
|
|
38
|
+
from __future__ import annotations
|
|
39
|
+
|
|
40
|
+
import logging
|
|
41
|
+
from dataclasses import dataclass, field
|
|
42
|
+
from typing import Any, Literal, Protocol, runtime_checkable
|
|
43
|
+
|
|
44
|
+
import numpy as np
|
|
45
|
+
from sklearn.linear_model import LogisticRegression
|
|
46
|
+
|
|
47
|
+
_logger = logging.getLogger(__name__)
|
|
48
|
+
|
|
49
|
+
__all__ = [
|
|
50
|
+
"LogisticStacker",
|
|
51
|
+
"MetaLearner",
|
|
52
|
+
]
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
@runtime_checkable
|
|
56
|
+
class MetaLearner(Protocol):
|
|
57
|
+
"""Combines per-sample scores from multiple base detectors into P(positive).
|
|
58
|
+
|
|
59
|
+
The Protocol takes a ``(n_samples, n_detectors)`` score matrix — one row per
|
|
60
|
+
sample, one column per base detector — plus binary labels for fitting.
|
|
61
|
+
Output is the sklearn-standard ``(n_samples, 2)`` probability matrix.
|
|
62
|
+
|
|
63
|
+
The contract is sklearn-shaped so stackers compose with the existing
|
|
64
|
+
probe-and-harness machinery. Concrete implementations are expected to be
|
|
65
|
+
deterministic given a fixed ``random_state`` and to expose ``coef_`` /
|
|
66
|
+
``intercept_`` for interpretability + ``RunManifest`` logging.
|
|
67
|
+
|
|
68
|
+
Attributes
|
|
69
|
+
----------
|
|
70
|
+
coef_ : numpy.ndarray
|
|
71
|
+
Fitted coefficient vector, shape ``(n_detectors,)`` for binary
|
|
72
|
+
classification. Available after :meth:`fit`. Reading before ``fit``
|
|
73
|
+
raises ``AttributeError`` or returns ``None`` (impl-defined; the
|
|
74
|
+
reference :class:`LogisticStacker` raises).
|
|
75
|
+
classes_ : numpy.ndarray
|
|
76
|
+
Class labels, shape ``(2,)``. For binary stacking, always
|
|
77
|
+
``array([0, 1])`` after :meth:`fit`.
|
|
78
|
+
intercept_ : numpy.ndarray
|
|
79
|
+
Fitted intercept, shape ``(1,)``. Available after :meth:`fit`.
|
|
80
|
+
|
|
81
|
+
Notes
|
|
82
|
+
-----
|
|
83
|
+
Tier-2 Protocol (frozen at v1.0 per ADR 0003). Additive subprotocols are
|
|
84
|
+
permitted in minor releases; method-signature changes require v2.0.
|
|
85
|
+
|
|
86
|
+
When passed to a parallel-capable harness (``n_jobs > 1``), implementations
|
|
87
|
+
MUST be picklable — joblib's loky backend serializes the entire delayed
|
|
88
|
+
call before worker dispatch. See ``docs/source/methodology/parallelism.md``
|
|
89
|
+
for the picklability contract.
|
|
90
|
+
|
|
91
|
+
Distinct from :class:`~eval_toolkit.protocols.Scorer` (which consumes raw
|
|
92
|
+
feature data and returns 1-D ``P(positive)``). A stacker can be wrapped to
|
|
93
|
+
satisfy ``Scorer`` via ``lambda X: stacker.predict_proba(X)[:, 1]`` once
|
|
94
|
+
callers have collected base-detector scores into ``X``.
|
|
95
|
+
|
|
96
|
+
See Also
|
|
97
|
+
--------
|
|
98
|
+
LogisticStacker : reference implementation wrapping sklearn LogisticRegression.
|
|
99
|
+
"""
|
|
100
|
+
|
|
101
|
+
coef_: np.ndarray
|
|
102
|
+
classes_: np.ndarray
|
|
103
|
+
intercept_: np.ndarray
|
|
104
|
+
|
|
105
|
+
def fit(self, score_matrix: np.ndarray, y: np.ndarray) -> MetaLearner: # pragma: no cover
|
|
106
|
+
"""Fit the stacker on a ``(n_samples, n_detectors)`` score matrix.
|
|
107
|
+
|
|
108
|
+
Parameters
|
|
109
|
+
----------
|
|
110
|
+
score_matrix : numpy.ndarray
|
|
111
|
+
Per-detector calibrated scores, shape ``(n_samples, n_detectors)``.
|
|
112
|
+
Values are typically in ``[0, 1]`` but the contract does not
|
|
113
|
+
require it.
|
|
114
|
+
y : numpy.ndarray
|
|
115
|
+
Binary labels in ``{0, 1}``, shape ``(n_samples,)``.
|
|
116
|
+
|
|
117
|
+
Returns
|
|
118
|
+
-------
|
|
119
|
+
MetaLearner
|
|
120
|
+
``self`` (sklearn convention) — the fitted estimator.
|
|
121
|
+
"""
|
|
122
|
+
...
|
|
123
|
+
|
|
124
|
+
def predict(self, score_matrix: np.ndarray) -> np.ndarray: # pragma: no cover
|
|
125
|
+
"""Return binary predictions for ``score_matrix``, shape ``(n_samples,)``."""
|
|
126
|
+
...
|
|
127
|
+
|
|
128
|
+
def predict_proba(self, score_matrix: np.ndarray) -> np.ndarray: # pragma: no cover
|
|
129
|
+
"""Return ``(n_samples, 2)`` probability matrix.
|
|
130
|
+
|
|
131
|
+
Column order matches :attr:`classes_`. Column 1 is ``P(positive)``.
|
|
132
|
+
"""
|
|
133
|
+
...
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
@dataclass
class LogisticStacker:
    """Reference :class:`MetaLearner` using :class:`sklearn.linear_model.LogisticRegression`.

    Wraps sklearn's logistic regression with a stacker-shaped public API.
    Configuration goes in the constructor; fitted state is populated on
    :meth:`fit` and exposed via the standard sklearn ``coef_`` / ``classes_`` /
    ``intercept_`` attributes.

    Parameters
    ----------
    C : float, optional
        Inverse regularization strength. Smaller ``C`` → stronger L2/L1
        regularization → more shrinkage toward zero on detector weights.
        Default ``1.0`` (sklearn default). For very-few-detector stacking
        (≤3 base scorers), stronger regularization (``C=0.1``) helps prevent
        the stacker from over-fitting to a single dominant detector on small
        stacking sets.
    fit_intercept : bool, optional
        Whether to fit an intercept term. Default ``True``.
    class_weight : str or dict or None, optional
        Per-class sample weighting. Default ``"balanced"`` — automatically
        weights inversely proportional to class frequencies, useful for
        imbalanced injection / non-injection sets.
    penalty : {"l1", "l2", "elasticnet", None}, optional
        Regularization norm. Default ``"l2"``. ``"l1"`` zeros out non-useful
        detector columns (sparsity); ``"l2"`` shrinks them uniformly.
    solver : str, optional
        Optimizer. Default ``"lbfgs"`` (L2-only). Use ``"liblinear"`` for L1
        penalty on small stacking sets; ``"saga"`` for elasticnet.
    max_iter : int, optional
        Maximum iterations. Default ``1000`` (generous; stacking problems are
        small and usually converge in <100).
    random_state : int or None, optional
        Seed for the underlying ``LogisticRegression``. Default ``None``.
        Set for deterministic fitting when the solver involves randomness
        (e.g. ``"saga"``).

    Attributes
    ----------
    coef_ : numpy.ndarray
        Fitted detector weights, shape ``(n_detectors,)``. Set on
        :meth:`fit`.
    classes_ : numpy.ndarray
        Class labels, shape ``(2,)``. Always ``array([0, 1])`` after a binary
        :meth:`fit`.
    intercept_ : numpy.ndarray
        Fitted intercept, shape ``(1,)``. Set on :meth:`fit`.

    Examples
    --------
    >>> import numpy as np
    >>> rng = np.random.default_rng(0)
    >>> n = 500
    >>> # Three synthetic detectors with varying noise + signal alignment.
    >>> y = rng.binomial(1, 0.3, size=n)
    >>> scores = np.column_stack([
    ...     np.clip(y * 0.7 + rng.normal(0, 0.2, n), 0, 1),
    ...     np.clip(y * 0.5 + rng.normal(0, 0.3, n), 0, 1),
    ...     np.clip(y * 0.4 + rng.normal(0, 0.4, n), 0, 1),
    ... ])
    >>> stacker = LogisticStacker(C=1.0).fit(scores, y)
    >>> stacker.coef_.shape
    (3,)
    >>> stacker.classes_.tolist()
    [0, 1]
    >>> proba = stacker.predict_proba(scores)
    >>> proba.shape
    (500, 2)
    >>> bool(np.allclose(proba.sum(axis=1), 1.0))
    True

    Raises
    ------
    ValueError
        On shape mismatch between ``score_matrix`` and ``y``, on empty inputs,
        on non-finite values in ``score_matrix``, or when ``y`` contains only
        one class (logistic regression is undefined). Also raised when a
        fitted attribute or a predict method is used before :meth:`fit`, and
        by scikit-learn itself for an unsupported ``penalty`` / ``solver``
        pairing (e.g. ``penalty="l1"`` with ``solver="lbfgs"``).

    Warns
    -----
    sklearn.exceptions.ConvergenceWarning
        Emitted (not raised) by the underlying sklearn solver when
        ``max_iter`` is exhausted before convergence. :meth:`fit` still
        returns normally and the stacker is marked as fitted.

    Notes
    -----
    **No new dependencies.** ``scikit-learn`` is already a core eval-toolkit
    dependency since v0.27.

    **Calibration chaining.** A logistic stacker is not automatically
    well-calibrated on the global P(positive) scale — ``LogisticRegression``'s
    sigmoid output is well-calibrated on the training data's class prior but
    can drift on held-out distributions. For downstream calibration metrics
    (ECE, Brier), chain through :func:`fit_platt_binary` or
    :func:`fit_isotonic_binary` on a separate calibration set:

    >>> from eval_toolkit import fit_platt_binary
    >>> stacker_proba = stacker.predict_proba(scores)[:, 1]
    >>> (_, _), calibrate = fit_platt_binary(y, stacker_proba)
    >>> calibrated = calibrate(stacker.predict_proba(scores)[:, 1])
    >>> calibrated.shape == (500,)
    True

    See Also
    --------
    eval_toolkit.protocols.Scorer : 1-D ``P(positive)`` contract for raw
        feature inputs.
    eval_toolkit.probes.ActivationDeltaProbe : another sklearn-shaped probe
        producing detector scores.
    eval_toolkit.fit_platt_binary : calibrate stacker output to global prior.
    """

    C: float = 1.0
    fit_intercept: bool = True
    class_weight: str | dict[Any, float] | None = "balanced"
    penalty: Literal["l1", "l2", "elasticnet"] | None = "l2"
    solver: str = "lbfgs"
    max_iter: int = 1000
    random_state: int | None = None

    # Fitted state — populated on fit(); excluded from constructor + repr.
    _model: LogisticRegression | None = field(default=None, init=False, repr=False)
    _fitted: bool = field(default=False, init=False, repr=False)

    @property
    def coef_(self) -> np.ndarray:
        """Fitted detector weights, shape ``(n_detectors,)``. Raises if unfit."""
        self._assert_fitted()
        assert self._model is not None  # narrowed by _assert_fitted; tell mypy
        # sklearn returns (1, n_features) for binary; flatten to (n_features,)
        # np.asarray() wraps sklearn's Any-typed attribute into a known ndarray.
        return np.asarray(self._model.coef_).ravel()

    @property
    def classes_(self) -> np.ndarray:
        """Class labels, shape ``(2,)``. Raises if unfit."""
        self._assert_fitted()
        assert self._model is not None
        return np.asarray(self._model.classes_)

    @property
    def intercept_(self) -> np.ndarray:
        """Fitted intercept, shape ``(1,)``. Raises if unfit."""
        self._assert_fitted()
        assert self._model is not None
        return np.asarray(self._model.intercept_)

    def fit(self, score_matrix: np.ndarray, y: np.ndarray) -> LogisticStacker:
        """Fit the stacker on a ``(n_samples, n_detectors)`` score matrix.

        Parameters
        ----------
        score_matrix : numpy.ndarray
            Per-detector scores. Shape ``(n_samples, n_detectors)``. Must be
            finite. Single-detector stacking (``n_detectors == 1``) is
            permitted but trivial — equivalent to recalibrating that detector.
        y : numpy.ndarray
            Binary labels in ``{0, 1}``. Shape ``(n_samples,)``. The array is
            flattened before validation, so an ``(n_samples, 1)`` column is
            accepted as well.

        Returns
        -------
        LogisticStacker
            ``self``, with fitted state populated.

        Raises
        ------
        ValueError
            On shape mismatch, empty inputs, non-finite ``score_matrix``, or
            single-class ``y``.
        """
        sm = np.asarray(score_matrix, dtype=float)
        yarr = np.asarray(y).ravel()
        _validate_fit_inputs(sm, yarr)

        model = LogisticRegression(
            C=self.C,
            fit_intercept=self.fit_intercept,
            class_weight=self.class_weight,
            penalty=self.penalty,
            solver=self.solver,
            max_iter=self.max_iter,
            random_state=self.random_state,
        )
        model.fit(sm, yarr)
        # Publish the new state only after a successful fit so a failed refit
        # leaves any previously fitted model untouched.
        self._model = model
        self._fitted = True
        return self

    def predict(self, score_matrix: np.ndarray) -> np.ndarray:
        """Return binary predictions, shape ``(n_samples,)``.

        Threshold is sklearn's default 0.5 on column-1 (``P(positive)``).
        For other operating points, use :func:`metrics_at_threshold` against
        :meth:`predict_proba` output directly.

        Raises
        ------
        ValueError
            If :meth:`fit` has not been called yet, or on shape / finiteness
            issues in ``score_matrix``.
        """
        self._assert_fitted()
        assert self._model is not None  # narrowed by _assert_fitted; tell mypy
        sm = np.asarray(score_matrix, dtype=float)
        _validate_predict_inputs(sm, expected_n_features=self._n_detectors())
        return np.asarray(self._model.predict(sm))

    def predict_proba(self, score_matrix: np.ndarray) -> np.ndarray:
        """Return ``(n_samples, 2)`` probability matrix.

        Column order matches :attr:`classes_` (``[0, 1]``); column 1 is
        ``P(positive)``.

        Raises
        ------
        ValueError
            If :meth:`fit` has not been called yet, or on shape / finiteness
            issues in ``score_matrix``.
        """
        self._assert_fitted()
        assert self._model is not None
        sm = np.asarray(score_matrix, dtype=float)
        _validate_predict_inputs(sm, expected_n_features=self._n_detectors())
        return np.asarray(self._model.predict_proba(sm))

    def _n_detectors(self) -> int:
        """Return the number of detector columns the fitted model expects.

        Reads the last axis of the model's own 2-D ``coef_`` rather than the
        length of the flattened :attr:`coef_` property: the flattened length
        only equals the column count when ``coef_`` has a single row.
        Callers must have checked fitted state first.
        """
        assert self._model is not None  # callers run _assert_fitted() first
        return int(np.asarray(self._model.coef_).shape[-1])

    def _assert_fitted(self) -> None:
        """Raise ``ValueError`` if :meth:`fit` has not been called."""
        if not self._fitted or self._model is None:
            raise ValueError(
                "LogisticStacker has not been fit yet. Call `.fit(score_matrix, y)` first."
            )
|
|
364
|
+
|
|
365
|
+
|
|
366
|
+
def _validate_fit_inputs(score_matrix: np.ndarray, y: np.ndarray) -> None:
    """Shared input validation for :meth:`LogisticStacker.fit`.

    Raises ``ValueError`` with a context-rich message on every failure mode.

    Parameters
    ----------
    score_matrix : numpy.ndarray
        Candidate training matrix; must be 2-D, non-empty and finite.
    y : numpy.ndarray
        Candidate labels; must be 1-D, have one entry per row of
        ``score_matrix`` and contain exactly two distinct values.

    Raises
    ------
    ValueError
        If ``score_matrix`` is not 2-D, is empty, or holds NaN / inf; if ``y``
        is not 1-D or its length differs from ``score_matrix.shape[0]``; or if
        ``y`` does not contain exactly two classes.
    """
    if score_matrix.ndim != 2:
        raise ValueError(
            f"score_matrix must be 2-D (n_samples, n_detectors); got ndim={score_matrix.ndim}"
        )
    if score_matrix.size == 0:
        raise ValueError("score_matrix is empty; provide at least one sample")
    if y.ndim != 1:
        raise ValueError(f"y must be 1-D (n_samples,); got ndim={y.ndim}")
    if score_matrix.shape[0] != y.shape[0]:
        raise ValueError(
            "score_matrix and y must have matching n_samples; "
            f"got score_matrix.shape[0]={score_matrix.shape[0]}, y.shape[0]={y.shape[0]}"
        )
    if not np.all(np.isfinite(score_matrix)):
        raise ValueError("score_matrix contains non-finite values (NaN or inf)")
    unique = np.unique(y)
    if unique.size < 2:
        raise ValueError(
            "y is single-class; LogisticStacker requires both classes "
            f"in the training set (got y.unique() = {unique.tolist()})"
        )
    if unique.size > 2:
        # The stacker's contract is binary: one weight per detector and a
        # two-column probability matrix. Reject extra classes here instead of
        # letting the fit succeed and the stacker misbehave afterwards.
        raise ValueError(
            "y has more than two classes; LogisticStacker is a binary stacker "
            f"(got y.unique() = {unique.tolist()})"
        )
|
|
392
|
+
|
|
393
|
+
|
|
394
|
+
def _validate_predict_inputs(score_matrix: np.ndarray, *, expected_n_features: int) -> None:
    """Check a score matrix before it is handed to predict / predict_proba.

    The checks run in a fixed order — dimensionality, emptiness, finiteness,
    then detector-column count — and the first one that fails raises
    ``ValueError`` with a message describing the problem. Returns ``None``
    when every check passes.
    """
    rank = score_matrix.ndim
    if rank != 2:
        raise ValueError(
            f"score_matrix must be 2-D (n_samples, n_detectors); got ndim={rank}"
        )

    if not score_matrix.size:
        raise ValueError("score_matrix is empty; provide at least one sample")

    if not np.isfinite(score_matrix).all():
        raise ValueError("score_matrix contains non-finite values (NaN or inf)")

    n_columns = score_matrix.shape[1]
    if n_columns != expected_n_features:
        raise ValueError(
            "score_matrix has wrong number of detectors; "
            f"expected {expected_n_features}, got {n_columns}"
        )
|
|
@@ -57,9 +57,11 @@
|
|
|
57
57
|
"LeakageCheck",
|
|
58
58
|
"LeakageFinding",
|
|
59
59
|
"LeakageReport",
|
|
60
|
+
"LogisticStacker",
|
|
60
61
|
"MANIFEST_SCHEMA_VERSION",
|
|
61
62
|
"MDEEstimate",
|
|
62
63
|
"MaxF1Selector",
|
|
64
|
+
"MetaLearner",
|
|
63
65
|
"MetricFn",
|
|
64
66
|
"MetricState",
|
|
65
67
|
"MinHashLSHStrategy",
|
|
@@ -684,6 +686,14 @@
|
|
|
684
686
|
"kind": "class",
|
|
685
687
|
"signature": "(findings: 'list[LeakageFinding]' = <factory>) -> None"
|
|
686
688
|
},
|
|
689
|
+
"LogisticStacker": {
|
|
690
|
+
"bases": [
|
|
691
|
+
"object"
|
|
692
|
+
],
|
|
693
|
+
"doc_first_line": "Reference :class:`MetaLearner` using :class:`sklearn.linear_model.LogisticRegression`.",
|
|
694
|
+
"kind": "class",
|
|
695
|
+
"signature": "(C: 'float' = 1.0, fit_intercept: 'bool' = True, class_weight: 'str | dict[Any, float] | None' = 'balanced', penalty: \"Literal['l1', 'l2', 'elasticnet'] | None\" = 'l2', solver: 'str' = 'lbfgs', max_iter: 'int' = 1000, random_state: 'int | None' = None) -> None"
|
|
696
|
+
},
|
|
687
697
|
"MANIFEST_SCHEMA_VERSION": {
|
|
688
698
|
"doc_first_line": "str(object='') -> str",
|
|
689
699
|
"kind": "value",
|
|
@@ -706,6 +716,14 @@
|
|
|
706
716
|
"kind": "class",
|
|
707
717
|
"signature": "(criterion: 'str' = 'max_f1') -> None"
|
|
708
718
|
},
|
|
719
|
+
"MetaLearner": {
|
|
720
|
+
"bases": [
|
|
721
|
+
"Protocol"
|
|
722
|
+
],
|
|
723
|
+
"doc_first_line": "Combines per-sample scores from multiple base detectors into P(positive).",
|
|
724
|
+
"kind": "class",
|
|
725
|
+
"signature": "(*args, **kwargs)"
|
|
726
|
+
},
|
|
709
727
|
"MetricFn": {
|
|
710
728
|
"doc_first_line": "",
|
|
711
729
|
"kind": "function",
|
|
@@ -1154,7 +1172,7 @@
|
|
|
1154
1172
|
"doc_first_line": "str(object='') -> str",
|
|
1155
1173
|
"kind": "value",
|
|
1156
1174
|
"type": "str",
|
|
1157
|
-
"value": "'0.44.0'"
|
|
1175
|
+
"value": "'0.45.0'"
|
|
1158
1176
|
},
|
|
1159
1177
|
"apply_operating_points": {
|
|
1160
1178
|
"doc_first_line": "Apply fitted thresholds to a mixed-class or single-class target slice.",
|