deup 0.1.1__tar.gz → 0.3.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (85) hide show
  1. {deup-0.1.1 → deup-0.3.0}/CHANGELOG.md +32 -0
  2. {deup-0.1.1 → deup-0.3.0}/PKG-INFO +9 -8
  3. {deup-0.1.1 → deup-0.3.0}/README.md +8 -7
  4. deup-0.3.0/docs/api/calibration.md +7 -0
  5. deup-0.3.0/docs/api/decomposition.md +17 -0
  6. deup-0.3.0/docs/api/diagnostics.md +23 -0
  7. deup-0.3.0/docs/api/domains.md +23 -0
  8. deup-0.3.0/docs/api/estimators.md +7 -0
  9. deup-0.3.0/docs/api/features.md +13 -0
  10. deup-0.3.0/docs/calibration.md +89 -0
  11. deup-0.3.0/docs/decomposition.md +98 -0
  12. deup-0.3.0/docs/domains.md +57 -0
  13. deup-0.3.0/docs/features.md +119 -0
  14. {deup-0.1.1 → deup-0.3.0}/docs/getting-started.md +100 -11
  15. {deup-0.1.1 → deup-0.3.0}/docs/index.md +14 -0
  16. deup-0.3.0/docs/javascripts/mathjax.js +16 -0
  17. deup-0.3.0/docs/reliability.md +96 -0
  18. deup-0.3.0/docs/theory.md +197 -0
  19. deup-0.3.0/examples/mapie_interop.py +65 -0
  20. {deup-0.1.1 → deup-0.3.0}/mkdocs.yml +23 -1
  21. {deup-0.1.1 → deup-0.3.0}/pyproject.toml +2 -2
  22. {deup-0.1.1 → deup-0.3.0}/src/deup/__init__.py +3 -3
  23. deup-0.3.0/src/deup/calibration/__init__.py +21 -0
  24. deup-0.3.0/src/deup/calibration/conformal.py +215 -0
  25. {deup-0.1.1 → deup-0.3.0}/src/deup/core/__init__.py +22 -0
  26. deup-0.3.0/src/deup/core/aleatoric.py +165 -0
  27. deup-0.3.0/src/deup/core/decompose.py +266 -0
  28. deup-0.3.0/src/deup/core/error_estimator.py +136 -0
  29. deup-0.3.0/src/deup/core/features/__init__.py +28 -0
  30. deup-0.3.0/src/deup/core/features/density.py +123 -0
  31. deup-0.3.0/src/deup/core/features/distance.py +47 -0
  32. deup-0.3.0/src/deup/core/features/pipeline.py +61 -0
  33. deup-0.3.0/src/deup/core/features/raw.py +36 -0
  34. deup-0.3.0/src/deup/core/features/residual.py +62 -0
  35. deup-0.3.0/src/deup/core/features/seen_bit.py +55 -0
  36. deup-0.3.0/src/deup/core/features/variance.py +95 -0
  37. deup-0.3.0/src/deup/diagnostics/__init__.py +43 -0
  38. deup-0.3.0/src/deup/diagnostics/aggregation.py +212 -0
  39. deup-0.3.0/src/deup/diagnostics/health.py +195 -0
  40. deup-0.3.0/src/deup/domains/__init__.py +9 -0
  41. deup-0.3.0/src/deup/domains/finance.py +256 -0
  42. deup-0.3.0/src/deup/domains/tabular.py +79 -0
  43. deup-0.3.0/src/deup/domains/vision.py +167 -0
  44. deup-0.3.0/src/deup/estimators.py +429 -0
  45. deup-0.3.0/tests/test_calibration.py +165 -0
  46. deup-0.3.0/tests/test_decompose.py +223 -0
  47. deup-0.3.0/tests/test_diagnostics.py +198 -0
  48. deup-0.3.0/tests/test_domains.py +147 -0
  49. deup-0.3.0/tests/test_estimators.py +239 -0
  50. deup-0.3.0/tests/test_features.py +143 -0
  51. deup-0.1.1/docs/api/estimators.md +0 -3
  52. deup-0.1.1/src/deup/estimators.py +0 -140
  53. deup-0.1.1/tests/test_estimators.py +0 -108
  54. {deup-0.1.1 → deup-0.3.0}/.github/workflows/ci.yml +0 -0
  55. {deup-0.1.1 → deup-0.3.0}/.github/workflows/docs.yml +0 -0
  56. {deup-0.1.1 → deup-0.3.0}/.github/workflows/release.yml +0 -0
  57. {deup-0.1.1 → deup-0.3.0}/.gitignore +0 -0
  58. {deup-0.1.1 → deup-0.3.0}/.pre-commit-config.yaml +0 -0
  59. {deup-0.1.1 → deup-0.3.0}/ARCHITECTURE.md +0 -0
  60. {deup-0.1.1 → deup-0.3.0}/BENCHMARKS.md +0 -0
  61. {deup-0.1.1 → deup-0.3.0}/CITATION.cff +0 -0
  62. {deup-0.1.1 → deup-0.3.0}/LICENSE +0 -0
  63. {deup-0.1.1 → deup-0.3.0}/RELEASING.md +0 -0
  64. {deup-0.1.1 → deup-0.3.0}/benchmarks/__init__.py +0 -0
  65. {deup-0.1.1 → deup-0.3.0}/benchmarks/results/regression_benchmark.json +0 -0
  66. {deup-0.1.1 → deup-0.3.0}/benchmarks/run_regression_benchmark.py +0 -0
  67. {deup-0.1.1 → deup-0.3.0}/docs/api/core.md +0 -0
  68. {deup-0.1.1 → deup-0.3.0}/docs/api/splitters.md +0 -0
  69. {deup-0.1.1 → deup-0.3.0}/docs/benchmarks.md +0 -0
  70. {deup-0.1.1 → deup-0.3.0}/docs/losses.md +0 -0
  71. {deup-0.1.1 → deup-0.3.0}/src/deup/core/grouping.py +0 -0
  72. {deup-0.1.1 → deup-0.3.0}/src/deup/core/losses.py +0 -0
  73. {deup-0.1.1 → deup-0.3.0}/src/deup/core/oof.py +0 -0
  74. {deup-0.1.1 → deup-0.3.0}/src/deup/core/protocols.py +0 -0
  75. {deup-0.1.1 → deup-0.3.0}/src/deup/core/types.py +0 -0
  76. {deup-0.1.1 → deup-0.3.0}/src/deup/py.typed +0 -0
  77. {deup-0.1.1 → deup-0.3.0}/src/deup/splitters.py +0 -0
  78. {deup-0.1.1 → deup-0.3.0}/tests/test_benchmark_smoke.py +0 -0
  79. {deup-0.1.1 → deup-0.3.0}/tests/test_grouping.py +0 -0
  80. {deup-0.1.1 → deup-0.3.0}/tests/test_losses.py +0 -0
  81. {deup-0.1.1 → deup-0.3.0}/tests/test_oof.py +0 -0
  82. {deup-0.1.1 → deup-0.3.0}/tests/test_protocols.py +0 -0
  83. {deup-0.1.1 → deup-0.3.0}/tests/test_smoke.py +0 -0
  84. {deup-0.1.1 → deup-0.3.0}/tests/test_splitters.py +0 -0
  85. {deup-0.1.1 → deup-0.3.0}/tests/test_types.py +0 -0
@@ -1,5 +1,37 @@
1
1
  # Changelog
2
2
 
3
+ ## [Unreleased]
4
+
5
+ ## [0.3.0] — 2026-06-05
6
+
7
+ ### Added
8
+
9
+ - **Reliability diagnostics** (`deup.diagnostics`): `AggregationReliability` /
10
+ `should_trust_aggregate` (Finding 1) and pluggable `HealthIndex` (Finding 2).
11
+ - **Domain presets** (`deup.domains`):
12
+ - `CrossSectionalDEUP` — finance flagship: `PurgedWalkForward`, rank
13
+ residualization, vol/breadth/regime g-features, `HealthIndex`, multi-horizon
14
+ targets, panel DataFrame API.
15
+ - `TabularDEUP` — KFold + raw X + Mahalanobis density.
16
+ - `VisionDEUP` — embedding → density + variance for OOD classification.
17
+ - Docs: `reliability.md`, `domains.md`, API pages.
18
+
19
+ ### Changed
20
+
21
+ - PyPI release bundles P5–P10 features (feature builders through domain presets).
22
+
23
+ ## [0.2.0] — 2026-06-05
24
+
25
+ ### Added
26
+
27
+ - **`DEUPClassifier`** — classification with log-loss / Brier OOF errors + `predict_proba`
28
+ - **`DEUPRanker`** — cross-sectional ranking; `loss="rank"`, `PurgedWalkForward` default,
29
+ rank-geometry residualization ON by default (Finding 3)
30
+ - **`acquire(pool, k)`** — active-learning hook (top-k by epistemic uncertainty)
31
+ - Refactored **`DEUPRegressor`** onto `ErrorEstimator` + optional `features` /
32
+ `aleatoric` / `decompose`
33
+ - Docs updated for all three estimators and `acquire`
34
+
3
35
  ## [0.1.1] — 2026-06-04
4
36
 
5
37
  First release published to PyPI.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: deup
3
- Version: 0.1.1
3
+ Version: 0.3.0
4
4
  Summary: Direct Epistemic Uncertainty Prediction (DEUP) for any scikit-learn model, with first-class time-series support.
5
5
  Project-URL: Homepage, https://github.com/ursinasanderink/deup
6
6
  Project-URL: Repository, https://github.com/ursinasanderink/deup
@@ -67,12 +67,13 @@ model.fit(X_train, y_train)
67
67
  pred, unc = model.predict(X_test, return_uncertainty=True)
68
68
  ```
69
69
 
70
- For time-series / cross-sectional data, pass a leakage-safe splitter:
70
+ For time-series / cross-sectional finance panels:
71
71
 
72
72
  ```python
73
- from deup.splitters import PurgedWalkForward
73
+ from deup.domains.finance import CrossSectionalDEUP
74
74
 
75
- model = DEUPRegressor(base_model=my_model, cv=PurgedWalkForward(embargo=5))
75
+ model = CrossSectionalDEUP(horizon=20).fit(panel_df)
76
+ pred, unc = model.predict(test_df, return_uncertainty=True)
76
77
  ```
77
78
 
78
79
  ## Install
@@ -80,6 +81,7 @@ model = DEUPRegressor(base_model=my_model, cv=PurgedWalkForward(embargo=5))
80
81
  ```bash
81
82
  pip install deup # core (numpy + scikit-learn)
82
83
  pip install "deup[gbm]" # + LightGBM error predictor
84
+ pip install "deup[finance]" # + pandas (CrossSectionalDEUP)
83
85
  pip install "deup[docs]" # + MkDocs site locally
84
86
  ```
85
87
 
@@ -96,11 +98,10 @@ than ensemble disagreement or a conformal residual baseline — see [BENCHMARKS.
96
98
 
97
99
  ## Status / roadmap
98
100
 
99
- **v0.1 (released):** `DEUPRegressor`, OOF collector, splitters, full loss registry
100
- (squared / Brier / pinball / rank), target transforms (log / asinh), benchmark, docs.
101
+ **v0.3 (current):** everything in v0.2 plus aggregation-reliability diagnostics
102
+ (Findings 1–2), domain presets (`CrossSectionalDEUP`, `TabularDEUP`, `VisionDEUP`).
101
103
 
102
- **v0.2:** `DEUPClassifier` / `DEUPRanker`, conformal intervals, aleatoric decomposition,
103
- density/GP features, aggregation-reliability diagnostics.
104
+ **Next:** thesis parity migration (P11), full benchmark suite with N-sweep (P12).
104
105
 
105
106
  ## Citing
106
107
 
@@ -26,12 +26,13 @@ model.fit(X_train, y_train)
26
26
  pred, unc = model.predict(X_test, return_uncertainty=True)
27
27
  ```
28
28
 
29
- For time-series / cross-sectional data, pass a leakage-safe splitter:
29
+ For time-series / cross-sectional finance panels:
30
30
 
31
31
  ```python
32
- from deup.splitters import PurgedWalkForward
32
+ from deup.domains.finance import CrossSectionalDEUP
33
33
 
34
- model = DEUPRegressor(base_model=my_model, cv=PurgedWalkForward(embargo=5))
34
+ model = CrossSectionalDEUP(horizon=20).fit(panel_df)
35
+ pred, unc = model.predict(test_df, return_uncertainty=True)
35
36
  ```
36
37
 
37
38
  ## Install
@@ -39,6 +40,7 @@ model = DEUPRegressor(base_model=my_model, cv=PurgedWalkForward(embargo=5))
39
40
  ```bash
40
41
  pip install deup # core (numpy + scikit-learn)
41
42
  pip install "deup[gbm]" # + LightGBM error predictor
43
+ pip install "deup[finance]" # + pandas (CrossSectionalDEUP)
42
44
  pip install "deup[docs]" # + MkDocs site locally
43
45
  ```
44
46
 
@@ -55,11 +57,10 @@ than ensemble disagreement or a conformal residual baseline — see [BENCHMARKS.
55
57
 
56
58
  ## Status / roadmap
57
59
 
58
- **v0.1 (released):** `DEUPRegressor`, OOF collector, splitters, full loss registry
59
- (squared / Brier / pinball / rank), target transforms (log / asinh), benchmark, docs.
60
+ **v0.3 (current):** everything in v0.2 plus aggregation-reliability diagnostics
61
+ (Findings 1–2), domain presets (`CrossSectionalDEUP`, `TabularDEUP`, `VisionDEUP`).
60
62
 
61
- **v0.2:** `DEUPClassifier` / `DEUPRanker`, conformal intervals, aleatoric decomposition,
62
- density/GP features, aggregation-reliability diagnostics.
63
+ **Next:** thesis parity migration (P11), full benchmark suite with N-sweep (P12).
63
64
 
64
65
  ## Citing
65
66
 
@@ -0,0 +1,7 @@
1
+ # API: Calibration
2
+
3
+ ::: deup.calibration.conformal.UncertaintyCalibrator
4
+
5
+ ::: deup.calibration.conformal.ConformalResult
6
+
7
+ ::: deup.calibration.conformal.deup_normalizer
@@ -0,0 +1,17 @@
1
+ ::: deup.core.error_estimator.ErrorEstimator
2
+
3
+ ::: deup.core.aleatoric.Homoscedastic
4
+
5
+ ::: deup.core.aleatoric.Heteroscedastic
6
+
7
+ ::: deup.core.aleatoric.Quantile
8
+
9
+ ::: deup.core.decompose.decompose_epistemic
10
+
11
+ ::: deup.core.decompose.RankResidualizer
12
+
13
+ ::: deup.core.decompose.coupling_retention_report
14
+
15
+ ::: deup.core.decompose.density_kill_criterion
16
+
17
+ ::: deup.core.decompose.partial_correlation
@@ -0,0 +1,23 @@
1
+ # API: Diagnostics
2
+
3
+ ## Aggregation reliability
4
+
5
+ ::: deup.diagnostics.aggregation.AggregationReliability
6
+
7
+ ::: deup.diagnostics.aggregation.AggregationVerdict
8
+
9
+ ::: deup.diagnostics.aggregation.effective_sample_size
10
+
11
+ ::: deup.diagnostics.aggregation.should_trust_aggregate
12
+
13
+ ## Composite health index
14
+
15
+ ::: deup.diagnostics.health.HealthIndex
16
+
17
+ ::: deup.diagnostics.health.HealthReport
18
+
19
+ ::: deup.diagnostics.health.realized_efficacy
20
+
21
+ ::: deup.diagnostics.health.drift_psi
22
+
23
+ ::: deup.diagnostics.health.model_disagreement
@@ -0,0 +1,23 @@
1
+ # API: Domain presets
2
+
3
+ ## Finance
4
+
5
+ ::: deup.domains.finance.CrossSectionalDEUP
6
+
7
+ ::: deup.domains.finance.enrich_panel
8
+
9
+ ::: deup.domains.finance.FINANCE_G_FEATURES
10
+
11
+ ## Tabular
12
+
13
+ ::: deup.domains.tabular.TabularDEUP
14
+
15
+ ::: deup.domains.tabular.tabular_feature_pipeline
16
+
17
+ ## Vision
18
+
19
+ ::: deup.domains.vision.VisionDEUP
20
+
21
+ ::: deup.domains.vision.EmbeddingUncertaintyFeatures
22
+
23
+ ::: deup.domains.vision.IdentityEmbedding
@@ -0,0 +1,7 @@
1
+ # API: Estimators
2
+
3
+ ::: deup.estimators.DEUPRegressor
4
+
5
+ ::: deup.estimators.DEUPClassifier
6
+
7
+ ::: deup.estimators.DEUPRanker
@@ -0,0 +1,13 @@
1
+ ::: deup.core.features.FeaturePipeline
2
+
3
+ ::: deup.core.features.RawFeatures
4
+
5
+ ::: deup.core.features.DensityFeature
6
+
7
+ ::: deup.core.features.VarianceFeature
8
+
9
+ ::: deup.core.features.DistanceToTrain
10
+
11
+ ::: deup.core.features.SeenBit
12
+
13
+ ::: deup.core.features.ResidualMagnitude
@@ -0,0 +1,89 @@
1
+ # Conformal calibration
2
+
3
+ DEUP's `predict_epistemic` returns an *uncalibrated* score: higher means "less
4
+ trustworthy", but not a probability. **Split-conformal calibration** turns it into
5
+ prediction intervals with finite-sample, distribution-free marginal coverage
6
+ $P(y \in [\hat{y}^-, \hat{y}^+]) \ge 1 - \alpha$ — using the DEUP signal as the
7
+ interval's *width*.
8
+
9
+ ## How it works
10
+
11
+ On a **held-out** calibration set, compute normalized residuals
12
+ $r_i = |y_i - f(x_i)| / g(x_i)$ and take their $(1-\alpha)$ empirical quantile $q$.
13
+ The interval at a new point is
14
+
15
+ $$
16
+ [\,f(x) - q\,g(x),\;\; f(x) + q\,g(x)\,].
17
+ $$
18
+
19
+ Intervals are **narrow where $g$ is small** (confident) and wide where $g$ is large —
20
+ locally adaptive intervals (the coverage guarantee itself stays marginal), unlike a constant-width baseline.
21
+
22
+ ## Usage
23
+
24
+ ```python
25
+ from deup import DEUPRegressor
26
+
27
+ model = DEUPRegressor(base_model=my_model).fit(X_train, y_train)
28
+
29
+ # calibrate on a separate held-out split (NOT the training data)
30
+ model.calibrate(X_cal, y_cal, method="normalized", alpha=0.1)
31
+
32
+ interval = model.predict_interval(X_test)
33
+ interval.lower, interval.upper, interval.width
34
+ ```
35
+
36
+ !!! warning "Use held-out data"
37
+ Coverage guarantees require the calibration set to be unseen by both the base model
38
+ $f$ and the error model $g$. Don't calibrate on training rows.
39
+
40
+ ## Methods
41
+
42
+ | `method` | Score | Use when |
43
+ |---|---|---|
44
+ | `normalized` (default) | $\lvert y-f(x)\rvert / g(x)$ | locally adaptive intervals |
45
+ | `mondrian` | per-group quantile | group/regime-conditional coverage |
46
+ | `cqr` | conformalized quantile regression | you already have quantile models |
47
+
48
+ ```python
49
+ # Mondrian: group-conditional coverage (e.g. per regime)
50
+ model.calibrate(X_cal, y_cal, method="mondrian", alpha=0.1, groups=regime_cal)
51
+ interval = model.predict_interval(X_test, groups=regime_test)
52
+ ```
53
+
54
+ The standalone `UncertaintyCalibrator` works with raw arrays (any base model):
55
+
56
+ ```python
57
+ from deup.calibration import UncertaintyCalibrator
58
+
59
+ cal = UncertaintyCalibrator(method="normalized", alpha=0.1)
60
+ cal.fit(y_cal, y_pred_cal, uncertainty_cal)
61
+ interval = cal.predict_interval(y_pred_test, uncertainty_test)
62
+ ```
63
+
64
+ ## MAPIE interop
65
+
66
+ `deup` is **complementary** to [MAPIE](https://mapie.readthedocs.io/): MAPIE supplies
67
+ mature conformal machinery, DEUP supplies a high-quality per-point scale $g(x)$. Expose
68
+ the DEUP scale as a normalizer:
69
+
70
+ ```python
71
+ from deup.calibration import deup_normalizer
72
+
73
+ normalizer = deup_normalizer(model) # .predict(X) == model.predict_epistemic(X)
74
+ scale = normalizer.predict(X_cal) # feed into MAPIE as a residual scale
75
+ ```
76
+
77
+ See [`examples/mapie_interop.py`](https://github.com/ursinasanderink/deup/blob/main/examples/mapie_interop.py)
78
+ for a runnable script.
79
+
80
+ ## Coverage guarantee
81
+
82
+ Split conformal gives the finite-sample bound (Lei et al., 2018; the upper bound assumes almost surely distinct conformity scores)
83
+
84
+ $$
85
+ 1 - \alpha \;\le\; P(y \in \hat{C}(x)) \;\le\; 1 - \alpha + \tfrac{1}{n_{\text{cal}}+1},
86
+ $$
87
+
88
+ so intervals may *slightly over-cover*; this is correct, not a bug. `deup`'s test suite
89
+ checks empirical coverage within tolerance on i.i.d. and purged time-split fixtures.
@@ -0,0 +1,98 @@
1
+ # Decomposition & rank residualization
2
+
3
+ This page covers the v0.2 components that turn the raw error estimate $g(x)$ into a
4
+ reported epistemic signal: the error estimator, aleatoric estimators, the
5
+ $\hat{e} = \max(0, g - a)$ decomposition, and cross-sectional rank-geometry
6
+ residualization. See [Theory](theory.md) for the underlying math.
7
+
8
+ ## ErrorEstimator
9
+
10
+ `ErrorEstimator` is the reusable DEUP error model $g$ — feature pipeline +
11
+ target transform + non-negativity, fit on out-of-fold errors.
12
+
13
+ ```python
14
+ from deup.core import ErrorEstimator
15
+ from deup.core.features import DensityFeature, FeaturePipeline, RawFeatures
16
+ from deup.core.oof import OOFErrorCollector
17
+ from sklearn.ensemble import RandomForestRegressor
18
+ from sklearn.model_selection import KFold
19
+
20
+ oof = OOFErrorCollector(
21
+ RandomForestRegressor(), cv=KFold(5), loss="squared"
22
+ ).fit_collect(X, y)
23
+
24
+ g = ErrorEstimator(
25
+ features=FeaturePipeline([("raw", RawFeatures()), ("density", DensityFeature())]),
26
+ target_transform="log",
27
+ ).fit(X[oof.indices], oof.errors)
28
+
29
+ error_estimate = g.predict(X_new) # >= 0
30
+ ```
31
+
32
+ ## Aleatoric estimators $a(x)$
33
+
34
+ Model-agnostic estimates of the irreducible noise floor $A(x) = \mathrm{Var}(Y\mid X=x)$
35
+ (variance scale, matching a squared-error target).
36
+
37
+ | Estimator | $a(x)$ | When |
38
+ |---|---|---|
39
+ | `Homoscedastic` | constant $\sigma^2$ | noise ~ constant across $\mathcal{X}$ |
40
+ | `Heteroscedastic` | local k-NN label variance | input-dependent noise |
41
+ | `Quantile` | $((q_{hi}-q_{lo})/z)^2$ from quantile regression | skewed / tail noise |
42
+
43
+ ```python
44
+ from deup.core import Heteroscedastic
45
+
46
+ a = Heteroscedastic(k=20).fit(X, y).predict(X_new)
47
+ ```
48
+
49
+ ## Decomposition
50
+
51
+ ```python
52
+ from deup.core import decompose_epistemic
53
+
54
+ e_hat = decompose_epistemic(error_estimate, a) # max(0, g - a)
55
+ # a=None -> conservative proxy e_hat = g (the v0.1 default)
56
+ ```
57
+
58
+ $\hat{e}$ is always non-negative.
59
+
60
+ ## Rank-geometry residualization (Finding 3)
61
+
62
+ For cross-sectional rankers, $g$ and the loss target can be partly **mechanical rank
63
+ geometry** rather than genuine error. `RankResidualizer` fits an isotonic map from the
64
+ within-group rank of $|\text{score}|$ to the signal and subtracts it, leaving the part *not*
65
+ explained by rank geometry.
66
+
67
+ ```python
68
+ from deup.core import RankResidualizer, coupling_retention_report
69
+
70
+ # decouple g from rank geometry, per date
71
+ res = RankResidualizer().fit(g_values, abs_score, groups=dates)
72
+ g_decoupled = res.transform(g_values, abs_score, groups=dates)
73
+
74
+ # diagnostics: coupling before/after + loss-association retention
75
+ report = coupling_retention_report(g_values, score, loss, groups=dates)
76
+ print(report.coupling_before, report.coupling_after, report.retention)
77
+ ```
78
+
79
+ !!! note "Thesis finding"
80
+ Residualization decoupled the signal (per-date $\rho(\hat{e}, |\text{score}|)$:
81
+ $0.616 \to 0.317$) while **retaining ~92.5%** of the loss association. Residualization is
82
+ **off by default**, except in `DEUPRanker`, where it is **on by default** (P7).
83
+
84
+ ## Density kill criterion (Finding 3 corollary)
85
+
86
+ Density features can be an **informative null** in homogeneous universes. The kill
87
+ criterion drops them when their gain importance is negligible **and** they barely move
88
+ the loss partial-correlation.
89
+
90
+ ```python
91
+ from deup.core import density_kill_criterion
92
+
93
+ decision = density_kill_criterion(gain_importance=1e-5, delta_partial_corr=0.001)
94
+ print(decision.keep, decision.reason) # False, "killed: ..."
95
+ ```
96
+
97
+ Use `partial_correlation(a, b, control)` to compute the $\Delta$ partial-correlation
98
+ with vs without the density feature.
@@ -0,0 +1,57 @@
1
+ # Domain presets
2
+
3
+ The core library is domain-agnostic; these modules are **thin presets** that wire the
4
+ right splitter, features, and diagnostics for common workflows. They do not duplicate
5
+ OOF collection or error-estimator logic — see ``ARCHITECTURE.md``.
6
+
7
+ ## Cross-sectional finance (flagship)
8
+
9
+ ```python
10
+ import pandas as pd
11
+ from deup.domains.finance import CrossSectionalDEUP
12
+
13
+ # long-format panel: one row per (date, asset)
14
+ panel = pd.read_parquet("signals.parquet") # columns: date, score, vol_20d, ...
15
+
16
+ model = CrossSectionalDEUP(horizon=20, cv=5, embargo=1).fit(panel)
17
+ model.calibrate(cal_panel, alpha=0.1)
18
+
19
+ pred, unc = model.predict(test_panel, return_uncertainty=True)
20
+ health = model.health_report(test_panel) # per-date context gating (Finding 2)
21
+ health.gate # bool per date: trust / trade?
22
+ ```
23
+
24
+ Defaults wired in:
25
+
26
+ | Setting | Value |
27
+ |---|---|
28
+ | Estimator | `deup.estimators.DEUPRanker` |
29
+ | CV | `deup.splitters.PurgedWalkForward` + embargo |
30
+ | Rank geometry | residualization **ON** (Finding 3) |
31
+ | g-features | vol / breadth / regime preset columns when present |
32
+ | Context health | `deup.diagnostics.HealthIndex` |
33
+
34
+ Requires ``pip install "deup[finance]"`` (pandas).
35
+
36
+ ## Generic tabular
37
+
38
+ ```python
39
+ from deup.domains.tabular import TabularDEUP
40
+
41
+ model = TabularDEUP(task="regression", cv=5).fit(X, y)
42
+ unc = model.predict_epistemic(X_test)
43
+ ```
44
+
45
+ Wires ``KFold`` + raw ``X`` + Mahalanobis density features for ``g``.
46
+
47
+ ## Vision / OOD classification
48
+
49
+ ```python
50
+ from deup.domains.vision import VisionDEUP
51
+
52
+ model = VisionDEUP(cv=5).fit(images, labels) # tensors OK — auto-flattened
53
+ unc = model.predict_epistemic(images)
54
+ ```
55
+
56
+ Wires embedding → density + variance features for ``g`` (CIFAR-style high-N path).
57
+ Pass a custom ``embedding=`` transformer or callable for CNN embeddings.
@@ -0,0 +1,119 @@
1
+ # Feature builders for $g(x)$
2
+
3
+ The error predictor $g$ in DEUP can use **stationarizing features**
4
+ $\phi_{z^N}(x)$ beyond raw inputs (Lahlou *et al.*, 2023, Sec. 3.2). Each builder
5
+ is a scikit-learn `TransformerMixin` that **fits on training data only** — the same
6
+ leakage discipline as `OOFErrorCollector` (Finding 4).
7
+
8
+ See [Theory](theory.md) for the mathematical definitions.
9
+
10
+ ## Quick example
11
+
12
+ ```python
13
+ import numpy as np
14
+ from sklearn.ensemble import RandomForestRegressor
15
+
16
+ from deup.core.features import (
17
+ DensityFeature,
18
+ DistanceToTrain,
19
+ FeaturePipeline,
20
+ RawFeatures,
21
+ SeenBit,
22
+ )
23
+
24
+ pipe = FeaturePipeline([
25
+ ("raw", RawFeatures()),
26
+ ("density", DensityFeature(method="mahalanobis")),
27
+ ("dist", DistanceToTrain(k=5)),
28
+ ("seen", SeenBit(atol=1e-8)),
29
+ ])
30
+
31
+ X_train = np.random.default_rng(0).normal(size=(500, 8))
32
+ X_test = np.random.default_rng(1).normal(size=(50, 8))
33
+
34
+ phi_train = pipe.fit_transform(X_train)
35
+ phi_test = pipe.transform(X_test)
36
+ print(phi_train.shape, phi_test.shape) # (500, 8+1+1+1), (50, ...)
37
+ ```
38
+
39
+ ## Builders
40
+
41
+ | Class | Output | Methods / notes |
42
+ |---|---|---|
43
+ | `RawFeatures` | $x$ | passthrough |
44
+ | `DensityFeature` | $\log \hat{q}(x)$ column | `mahalanobis`, `knn`, `kde`; `flow` requires `[torch]` |
45
+ | `VarianceFeature` | $\log \hat{V}(x)$ column | `ensemble` (bootstrap); `gp` requires `[torch]` |
46
+ | `DistanceToTrain` | $k$-th NN distance | default `k=5` |
47
+ | `SeenBit` | $s \in \{0,1\}$ | exact / `atol` duplicate detection |
48
+ | `ResidualMagnitude` | kNN-smoothed $\lvert y-f(x)\rvert$ | needs `estimator` + `y` at `fit` |
49
+
50
+ ### DensityFeature
51
+
52
+ ```python
53
+ # Diagonal Gaussian — matches thesis GaussianDensity.log_prob (Lee et al. 2018)
54
+ DensityFeature(method="mahalanobis")
55
+
56
+ # k-NN distance proxy: log q ≈ -log(d_k + ε)
57
+ DensityFeature(method="knn", k=5)
58
+
59
+ # sklearn KernelDensity
60
+ DensityFeature(method="kde", bandwidth=1.0)
61
+ ```
62
+
63
+ !!! warning "Finding 3"
64
+ Density can be an **informative null** in homogeneous tabular panels. Ablate with
65
+ `FeaturePipeline` column importances or drop if $\Delta\rho < 0.005$.
66
+
67
+ ### VarianceFeature (ensemble)
68
+
69
+ Fits `n_estimators` bootstrap replicas of a base model and returns
70
+ $\log(\mathrm{Var}_j f_j(x) + \varepsilon)$.
71
+
72
+ ```python
73
+ VarianceFeature(
74
+ method="ensemble",
75
+ estimator=RandomForestRegressor(n_estimators=50, random_state=0),
76
+ n_estimators=10,
77
+ )
78
+ ```
79
+
80
+ ### ResidualMagnitude
81
+
82
+ At `fit(X, y)` it stores training residuals $|y - f(x)|$. At `transform(X)` it returns
83
+ the mean residual magnitude among $k$ nearest training neighbors — a local error prior
84
+ when $y$ is unavailable at inference.
85
+
86
+ ```python
87
+ ResidualMagnitude(
88
+ estimator=RandomForestRegressor(),
89
+ k=5,
90
+ ).fit(X_train, y_train)
91
+ ```
92
+
93
+ ## FeaturePipeline
94
+
95
+ `FeaturePipeline` horizontally stacks named builders (FeatureUnion-style). Names appear
96
+ in `get_feature_names_out()`.
97
+
98
+ ```python
99
+ from deup.core.features import FeaturePipeline, VarianceFeature, SeenBit
100
+
101
+ pipe = FeaturePipeline([
102
+ ("var", VarianceFeature(method="ensemble")),
103
+ ("seen", SeenBit()),
104
+ ])
105
+ ```
106
+
107
+ ## Torch-dependent methods
108
+
109
+ `DensityFeature(method="flow")` and `VarianceFeature(method="gp")` require
110
+ `pip install "deup[torch]"`. Without torch, construction raises `ImportError` with an
111
+ install hint; the module still imports cleanly on a torch-free install.
112
+
113
+ ## v0.1 vs v0.2
114
+
115
+ **v0.1 (this release):** feature builders + pipeline are available as primitives.
116
+ `DEUPRegressor` still trains $g$ on raw $X$ by default.
117
+
118
+ **v0.2 (P6):** `ErrorEstimator` wires `FeaturePipeline` into the DEUP training loop
119
+ with target transforms and non-negativity clipping.