microimpute 2.0.3__tar.gz → 2.0.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {microimpute-2.0.3 → microimpute-2.0.4}/PKG-INFO +1 -1
- microimpute-2.0.4/microimpute/models/zero_inflated.py +698 -0
- {microimpute-2.0.3 → microimpute-2.0.4}/microimpute/utils/dashboard_formatter.py +47 -11
- {microimpute-2.0.3 → microimpute-2.0.4}/microimpute.egg-info/PKG-INFO +1 -1
- {microimpute-2.0.3 → microimpute-2.0.4}/microimpute.egg-info/SOURCES.txt +1 -0
- {microimpute-2.0.3 → microimpute-2.0.4}/pyproject.toml +1 -1
- {microimpute-2.0.3 → microimpute-2.0.4}/tests/test_dashboard_formatter.py +52 -0
- {microimpute-2.0.3 → microimpute-2.0.4}/README.md +0 -0
- {microimpute-2.0.3 → microimpute-2.0.4}/microimpute/__init__.py +0 -0
- {microimpute-2.0.3 → microimpute-2.0.4}/microimpute/comparisons/__init__.py +0 -0
- {microimpute-2.0.3 → microimpute-2.0.4}/microimpute/comparisons/autoimpute.py +0 -0
- {microimpute-2.0.3 → microimpute-2.0.4}/microimpute/comparisons/autoimpute_helpers.py +0 -0
- {microimpute-2.0.3 → microimpute-2.0.4}/microimpute/comparisons/imputations.py +0 -0
- {microimpute-2.0.3 → microimpute-2.0.4}/microimpute/comparisons/metrics.py +0 -0
- {microimpute-2.0.3 → microimpute-2.0.4}/microimpute/comparisons/validation.py +0 -0
- {microimpute-2.0.3 → microimpute-2.0.4}/microimpute/config.py +0 -0
- {microimpute-2.0.3 → microimpute-2.0.4}/microimpute/evaluations/__init__.py +0 -0
- {microimpute-2.0.3 → microimpute-2.0.4}/microimpute/evaluations/cross_validation.py +0 -0
- {microimpute-2.0.3 → microimpute-2.0.4}/microimpute/evaluations/predictor_analysis.py +0 -0
- {microimpute-2.0.3 → microimpute-2.0.4}/microimpute/models/__init__.py +0 -0
- {microimpute-2.0.3 → microimpute-2.0.4}/microimpute/models/imputer.py +0 -0
- {microimpute-2.0.3 → microimpute-2.0.4}/microimpute/models/matching.py +0 -0
- {microimpute-2.0.3 → microimpute-2.0.4}/microimpute/models/mdn.py +0 -0
- {microimpute-2.0.3 → microimpute-2.0.4}/microimpute/models/ols.py +0 -0
- {microimpute-2.0.3 → microimpute-2.0.4}/microimpute/models/qrf.py +0 -0
- {microimpute-2.0.3 → microimpute-2.0.4}/microimpute/models/quantreg.py +0 -0
- {microimpute-2.0.3 → microimpute-2.0.4}/microimpute/utils/__init__.py +0 -0
- {microimpute-2.0.3 → microimpute-2.0.4}/microimpute/utils/data.py +0 -0
- {microimpute-2.0.3 → microimpute-2.0.4}/microimpute/utils/statmatch_hotdeck.py +0 -0
- {microimpute-2.0.3 → microimpute-2.0.4}/microimpute/utils/type_handling.py +0 -0
- {microimpute-2.0.3 → microimpute-2.0.4}/microimpute/visualizations/__init__.py +0 -0
- {microimpute-2.0.3 → microimpute-2.0.4}/microimpute/visualizations/comparison_plots.py +0 -0
- {microimpute-2.0.3 → microimpute-2.0.4}/microimpute/visualizations/performance_plots.py +0 -0
- {microimpute-2.0.3 → microimpute-2.0.4}/microimpute.egg-info/dependency_links.txt +0 -0
- {microimpute-2.0.3 → microimpute-2.0.4}/microimpute.egg-info/requires.txt +0 -0
- {microimpute-2.0.3 → microimpute-2.0.4}/microimpute.egg-info/top_level.txt +0 -0
- {microimpute-2.0.3 → microimpute-2.0.4}/setup.cfg +0 -0
- {microimpute-2.0.3 → microimpute-2.0.4}/tests/test_autoimpute.py +0 -0
- {microimpute-2.0.3 → microimpute-2.0.4}/tests/test_basic.py +0 -0
- {microimpute-2.0.3 → microimpute-2.0.4}/tests/test_data_preprocessing.py +0 -0
- {microimpute-2.0.3 → microimpute-2.0.4}/tests/test_metrics.py +0 -0
- {microimpute-2.0.3 → microimpute-2.0.4}/tests/test_predictor_analysis.py +0 -0
- {microimpute-2.0.3 → microimpute-2.0.4}/tests/test_quantile_comparison.py +0 -0
- {microimpute-2.0.3 → microimpute-2.0.4}/tests/test_smoke_qrf.py +0 -0
- {microimpute-2.0.3 → microimpute-2.0.4}/tests/test_type_handling.py +0 -0
- {microimpute-2.0.3 → microimpute-2.0.4}/tests/test_visualizations.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: microimpute
|
|
3
|
-
Version: 2.0.3
|
|
3
|
+
Version: 2.0.4
|
|
4
4
|
Summary: Benchmarking imputation methods for microdata
|
|
5
5
|
Author-email: María Juaristi <juaristi@uni.minerva.edu>, Nikhil Woodruff <nikhil.woodruff@outlook.com>
|
|
6
6
|
Requires-Python: <3.15,>=3.12
|
|
@@ -0,0 +1,698 @@
|
|
|
1
|
+
"""Regime-aware zero-inflation wrapper around base imputers.
|
|
2
|
+
|
|
3
|
+
Tabular microdata variables often fall into distinct *regimes* based on
|
|
4
|
+
which of {negative, zero, positive} values appear in the training data.
|
|
5
|
+
Imputing them with a single regressor mixes regimes together, causing
|
|
6
|
+
two recurring bugs in downstream ecosystems:
|
|
7
|
+
|
|
8
|
+
1. **Negative-dropping.** The common "fit QRF on ``y > 0``" pattern
|
|
9
|
+
drops negative training rows along with zeros, so the imputer
|
|
10
|
+
produces zero or positive values only. Variables like
|
|
11
|
+
``short_term_capital_gains`` lose their entire negative tail.
|
|
12
|
+
|
|
13
|
+
2. **Zero-crossing interpolation.** A QRF fit on all nonzero values
|
|
14
|
+
(both signs) learns leaf distributions that interpolate between
|
|
15
|
+
positive and negative training rows. Predictions for records that
|
|
16
|
+
the gate marks "nonzero" can land in the interval between
|
|
17
|
+
``max(train_negatives)`` and ``min(train_positives)``, which is
|
|
18
|
+
not a region any actual record occupies.
|
|
19
|
+
|
|
20
|
+
``ZeroInflatedImputer`` wraps any base ``Imputer`` and:
|
|
21
|
+
|
|
22
|
+
- Detects the regime automatically at fit time from the training
|
|
23
|
+
distribution — no per-variable hand configuration required.
|
|
24
|
+
- Composes the base imputer with appropriate gate(s):
|
|
25
|
+
- Three-sign: gate chooses ``{neg, 0, pos}``; separate base
|
|
26
|
+
imputers on the positive and negative subsets.
|
|
27
|
+
- ZI positive / ZI negative: binary gate (``0`` vs nonzero); base
|
|
28
|
+
imputer on the nonzero-sign subset.
|
|
29
|
+
- Sign-only (no zero): binary sign gate; two base imputers.
|
|
30
|
+
- Single-sign or constant: no gate; direct base imputer or a
|
|
31
|
+
constant imputer.
|
|
32
|
+
- At predict time, routes each record to the base imputer of its
|
|
33
|
+
gate-assigned regime, guaranteeing no sign-interpolation leaks.
|
|
34
|
+
|
|
35
|
+
The wrapper is generic over the base imputer — ``QRF`` is the obvious
|
|
36
|
+
default, but ``MDN``, ``OLS``, or ``Matching`` all compose the same way.
|
|
37
|
+
|
|
38
|
+
Regime detection is parameterized by ``min_class_count`` and
|
|
39
|
+
``min_class_fraction``: a class that falls below either of the
|
|
40
|
+
thresholds collapses into the closest adjacent regime. This avoids
|
|
41
|
+
fitting a full three-sign split on a variable whose negative tail is
|
|
42
|
+
five outlier rows — the cost-benefit flips toward the simpler
|
|
43
|
+
architecture.
|
|
44
|
+
"""
|
|
45
|
+
|
|
46
|
+
from __future__ import annotations
|
|
47
|
+
|
|
48
|
+
import logging
|
|
49
|
+
from typing import Any, Dict, List, Optional, Tuple, Type, Union
|
|
50
|
+
|
|
51
|
+
import numpy as np
|
|
52
|
+
import pandas as pd
|
|
53
|
+
from pydantic import SkipValidation, validate_call
|
|
54
|
+
|
|
55
|
+
from microimpute.config import RANDOM_STATE, VALIDATE_CONFIG
|
|
56
|
+
from microimpute.models.imputer import (
|
|
57
|
+
Imputer,
|
|
58
|
+
ImputerResults,
|
|
59
|
+
_ConstantValueModel,
|
|
60
|
+
)
|
|
61
|
+
from microimpute.models.qrf import QRF
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
# Regime labels. Kept as module-level constants so downstream code can
# match on them without magic strings. Each label names the combination
# of {negative, zero, positive} classes that regime detection found to be
# present in a target's training values.
REGIME_THREE_SIGN = "THREE_SIGN"  # negatives, zeros and positives present
REGIME_ZI_POSITIVE = "ZI_POSITIVE"  # zeros and positives present
REGIME_ZI_NEGATIVE = "ZI_NEGATIVE"  # zeros and negatives present
REGIME_SIGN_ONLY = "SIGN_ONLY"  # positives and negatives present, no zero class
REGIME_POSITIVE_ONLY = "POSITIVE_ONLY"  # only positives present
REGIME_NEGATIVE_ONLY = "NEGATIVE_ONLY"  # only negatives present
REGIME_DEGENERATE_ZERO = "DEGENERATE_ZERO"  # only zeros present (or no rows)
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
def _make_classifier(kind: str, seed: int):
    """Return an unfitted sklearn classifier to use as the gate.

    Supported ``kind`` values:

    - ``"hist_gb"``: ``HistGradientBoostingClassifier`` seeded with
      ``seed``. On the isolated-log-loss benchmark over 26 zero-inflated
      PolicyEngine-US target variables this Pareto-dominated a 50-tree
      RF on log-loss (0.225 vs 0.310), Brier (0.071 vs 0.081), ECE
      (0.005 vs 0.039), and ROC-AUC (0.809 vs 0.737).
    - ``"rf"``: ``RandomForestClassifier`` with 50 trees, seeded with
      ``seed`` and using all cores (``n_jobs=-1``).

    Raises:
        ValueError: If ``kind`` is neither ``"hist_gb"`` nor ``"rf"``.
    """
    # Reject unknown backends up front so the sklearn imports below stay
    # lazy and are only paid for by a valid request.
    if kind not in ("hist_gb", "rf"):
        raise ValueError(
            f"Unknown classifier_type {kind!r}; expected 'hist_gb' or 'rf'."
        )

    if kind == "rf":
        from sklearn.ensemble import RandomForestClassifier

        return RandomForestClassifier(
            n_estimators=50,
            random_state=seed,
            n_jobs=-1,
        )

    from sklearn.ensemble import HistGradientBoostingClassifier

    return HistGradientBoostingClassifier(random_state=seed)
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
def _detect_regime(
    y: np.ndarray,
    *,
    min_class_count: int,
    min_class_fraction: float,
    zero_atol: float,
) -> str:
    """Classify the training distribution into one of seven regimes.

    Values are split into three classes: zero (``|y| <= zero_atol``),
    positive (``y > zero_atol``) and negative (``y < -zero_atol``). A
    class counts as present iff its count is at least
    ``min_class_count`` AND its fraction of total rows is at least
    ``min_class_fraction``. A class that misses either threshold is
    treated as absent, so the regime is chosen from the remaining
    classes (e.g. minority negatives next to solid zero and positive
    classes yield ``ZI_POSITIVE``). This keeps the gate architecture
    stable in the presence of measurement-error outliers.

    NaN entries match none of the three class masks but still count
    toward the total number of rows used for the fraction threshold.

    Args:
        y: Training values of one target. Any array-like that converts
            to a float array is accepted.
        min_class_count: Minimum raw count for a class to be present.
        min_class_fraction: Minimum share of rows for a class to be
            present.
        zero_atol: Absolute tolerance for "equals zero".

    Returns:
        One of the module-level ``REGIME_*`` labels. Empty input yields
        ``REGIME_DEGENERATE_ZERO``.
    """
    # Coerce once so lists / Series behave like the ndarray the masks need.
    y = np.asarray(y, dtype=float)
    n = len(y)
    if n == 0:
        return REGIME_DEGENERATE_ZERO

    n_zero = int(np.count_nonzero(np.abs(y) <= zero_atol))
    n_pos = int(np.count_nonzero(y > zero_atol))
    n_neg = int(np.count_nonzero(y < -zero_atol))

    def _meaningful(count: int) -> bool:
        # Both thresholds must hold for a class to count as present.
        return count >= min_class_count and (count / n) >= min_class_fraction

    has_zero = _meaningful(n_zero)
    has_pos = _meaningful(n_pos)
    has_neg = _meaningful(n_neg)

    if not (has_zero or has_pos or has_neg):
        # All three classes are below threshold. Pick the one with the
        # largest raw count as a degenerate fallback (ties resolve in
        # the order zero, pos, neg).
        counts = {"zero": n_zero, "pos": n_pos, "neg": n_neg}
        majority = max(counts, key=counts.get)
        if majority == "zero":
            return REGIME_DEGENERATE_ZERO
        return REGIME_POSITIVE_ONLY if majority == "pos" else REGIME_NEGATIVE_ONLY

    # Most specific combination first.
    if has_pos and has_neg:
        return REGIME_THREE_SIGN if has_zero else REGIME_SIGN_ONLY
    if has_pos:
        return REGIME_ZI_POSITIVE if has_zero else REGIME_POSITIVE_ONLY
    if has_neg:
        return REGIME_ZI_NEGATIVE if has_zero else REGIME_NEGATIVE_ONLY
    return REGIME_DEGENERATE_ZERO
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
class ZeroInflatedImputer(Imputer):
    """Imputer that wraps a base Imputer with regime-aware zero-gating.

    At fit time every numeric target is assigned a regime by
    ``_detect_regime`` and receives its own bundle: an optional gate
    classifier plus one or two base imputers fitted on the matching
    sign subset of the training rows. All remaining targets are fitted
    jointly by a single auxiliary base imputer.

    Args:
        base_imputer_class: ``Imputer`` subclass to use for the nonzero
            regression step. Defaults to ``QRF``.
        base_imputer_kwargs: Keyword arguments forwarded to the base
            imputer constructor. ``{}`` by default. Base imputers are
            built with ``log_level="ERROR"`` unless this mapping
            supplies its own ``log_level``.
        min_class_count: Minimum raw count per class (neg/0/pos) for
            that class to be considered present. Below this, the class
            collapses into an adjacent regime. Defaults to 10.
        min_class_fraction: Minimum fraction of total rows per class
            for that class to be considered present. Defaults to 0.01.
        zero_atol: Absolute tolerance for "equals zero" in the regime
            detector. Defaults to 1e-6, matching the upstream
            ``_MultiSourceBase`` convention.
        classifier_type: Backend for the gate classifier;
            ``"hist_gb"`` (default) or ``"rf"``.
        seed: Random seed.
        log_level: Python logging level.
    """

    def __init__(
        self,
        base_imputer_class: Optional[Type[Imputer]] = None,
        base_imputer_kwargs: Optional[Dict[str, Any]] = None,
        min_class_count: int = 10,
        min_class_fraction: float = 0.01,
        zero_atol: float = 1e-6,
        classifier_type: str = "hist_gb",
        seed: Optional[int] = RANDOM_STATE,
        log_level: Optional[str] = "WARNING",
    ) -> None:
        super().__init__(seed=seed, log_level=log_level)
        self.base_imputer_class = base_imputer_class or QRF
        # Copy so later mutation of the caller's dict cannot leak in.
        self.base_imputer_kwargs = dict(base_imputer_kwargs or {})
        self.min_class_count = int(min_class_count)
        self.min_class_fraction = float(min_class_fraction)
        self.zero_atol = float(zero_atol)
        self.classifier_type = classifier_type

        # Filled in during fit().
        self._regimes: Dict[str, str] = {}
        self._per_variable: Dict[str, Dict[str, Any]] = {}

    def _fit(self, *args: Any, **kwargs: Any) -> Any:
        """Abstract-method placeholder; this class overrides ``fit`` directly."""
        raise NotImplementedError(
            "ZeroInflatedImputer overrides `fit` directly; `_fit` is not used."
        )

    def get_regime(self, variable: str) -> str:
        """Return the detected regime label for a fitted variable.

        Raises:
            KeyError: If ``variable`` has no regime recorded by the most
                recent ``fit`` call.
        """
        if variable not in self._regimes:
            raise KeyError(f"Variable {variable!r} not fitted; call fit() first.")
        return self._regimes[variable]

    def fit(
        self,
        X_train: pd.DataFrame,
        predictors: List[str],
        imputed_variables: List[str],
        weight_col: Optional[Union[str, np.ndarray, pd.Series]] = None,
        skip_missing: bool = False,
        not_numeric_categorical: Optional[List[str]] = None,
        **kwargs: Any,
    ) -> Any:
        """Fit the regime-aware wrapper.

        Delegates non-numeric targets (categorical / boolean /
        constant) to a single base imputer instance. Numeric targets
        are handled per-variable: regime detection, then composition
        of gate + base imputer(s) as appropriate.

        ``weight_col``, ``skip_missing`` and any extra ``kwargs`` are
        forwarded only to the auxiliary imputer for non-numeric
        targets; the per-regime fits for numeric targets do not
        receive them.

        Returns a ``ZeroInflatedImputerResults`` that routes
        predictions through each target's regime-specific pipeline.
        """
        self._validate_data(X_train, predictors + imputed_variables)

        # Classify target variables as numeric / categorical / boolean /
        # constant using the base Imputer's detector.
        self.identify_target_types(
            X_train,
            imputed_variables,
            not_numeric_categorical=not_numeric_categorical,
        )

        self.predictors = list(predictors)
        self.imputed_variables = list(imputed_variables)
        self._regimes = {}
        self._per_variable = {}

        # Constant numeric targets (e.g. a column that is always 0 in
        # training) are handled alongside the numeric ones so their
        # regime lands in ``_regimes`` rather than being silently
        # passed through.
        constant_numeric_targets = {
            v
            for v in imputed_variables
            if v in self.constant_targets
            and np.issubdtype(
                pd.Series([self.constant_targets[v]["value"]]).dtype,
                np.number,
            )
        }
        numeric_targets = [
            v
            for v in imputed_variables
            if v in self.numeric_targets or v in constant_numeric_targets
        ]
        for var in numeric_targets:
            y = X_train[var].to_numpy(dtype=float, copy=False)
            regime = _detect_regime(
                y,
                min_class_count=self.min_class_count,
                min_class_fraction=self.min_class_fraction,
                zero_atol=self.zero_atol,
            )
            self._regimes[var] = regime
            self._per_variable[var] = self._fit_single_numeric(
                X_train=X_train,
                predictors=predictors,
                variable=var,
                regime=regime,
                y=y,
            )

        # Non-numeric (categorical / boolean / constant) targets are
        # handled by a single auxiliary base imputer over their union.
        handled = set(numeric_targets)
        non_numeric = [v for v in imputed_variables if v not in handled]
        aux_bundle: Optional[Dict[str, Any]] = None
        if non_numeric:
            aux_result = self._new_base_imputer().fit(
                X_train=X_train,
                predictors=predictors,
                imputed_variables=non_numeric,
                weight_col=weight_col,
                skip_missing=skip_missing,
                not_numeric_categorical=not_numeric_categorical,
                **kwargs,
            )
            aux_bundle = {"kind": "passthrough", "result": aux_result}

        return ZeroInflatedImputerResults(
            predictors=self.predictors,
            imputed_variables=self.imputed_variables,
            seed=self.seed,
            regimes=self._regimes,
            per_variable=self._per_variable,
            non_numeric_bundle=aux_bundle,
            log_level="WARNING",
        )

    # ------------------------------------------------------------------
    # Per-variable fit helpers
    # ------------------------------------------------------------------

    def _new_base_imputer(self) -> Imputer:
        """Build an unfitted base imputer from the configured class/kwargs."""
        # Merge rather than pass ``log_level`` next to ``**kwargs``: a
        # user-supplied ``log_level`` would otherwise raise a duplicate
        # keyword TypeError. The user's value wins over the default.
        ctor_kwargs = {"log_level": "ERROR", **self.base_imputer_kwargs}
        return self.base_imputer_class(**ctor_kwargs)

    def _fit_gate(
        self,
        X_train: pd.DataFrame,
        predictors: List[str],
        labels: np.ndarray,
    ) -> Any:
        """Fit a fresh gate classifier on the predictors against ``labels``."""
        X_pred = X_train[predictors].to_numpy(dtype=float, copy=False)
        clf = _make_classifier(self.classifier_type, self.seed)
        clf.fit(X_pred, labels)
        return clf

    def _fit_single_numeric(
        self,
        *,
        X_train: pd.DataFrame,
        predictors: List[str],
        variable: str,
        regime: str,
        y: np.ndarray,
    ) -> Dict[str, Any]:
        """Fit the gate and base imputer(s) for one numeric target.

        Returns a bundle dict whose ``"kind"`` entry identifies the
        pipeline (``constant``, ``single``, ``zi_positive``,
        ``zi_negative``, ``sign_only`` or ``three_sign``), together
        with the gate classifier (when the regime needs one) and the
        base imputer(s) keyed by their role.

        Raises:
            ValueError: If ``regime`` is not a known regime label.
        """
        if regime == REGIME_DEGENERATE_ZERO:
            return {"kind": "constant", "value": 0.0}

        if regime in (REGIME_POSITIVE_ONLY, REGIME_NEGATIVE_ONLY):
            # No gate; single base imputer on the full training set.
            return {
                "kind": "single",
                "base": self._fit_base_single(X_train, predictors, variable),
            }

        # The predictor matrix is only cast to float inside ``_fit_gate``,
        # so the gate-free regimes above never pay for (or fail on) it.

        if regime == REGIME_ZI_POSITIVE:
            pos_mask = y > self.zero_atol
            clf = self._fit_gate(X_train, predictors, pos_mask.astype(int))
            return {
                "kind": "zi_positive",
                "classifier": clf,
                "positive_base": self._fit_base_single(
                    X_train.loc[pos_mask], predictors, variable
                ),
            }

        if regime == REGIME_ZI_NEGATIVE:
            neg_mask = y < -self.zero_atol
            clf = self._fit_gate(X_train, predictors, neg_mask.astype(int))
            return {
                "kind": "zi_negative",
                "classifier": clf,
                "negative_base": self._fit_base_single(
                    X_train.loc[neg_mask], predictors, variable
                ),
            }

        if regime == REGIME_SIGN_ONLY:
            # No zero class, but both signs present. Binary sign gate
            # plus a base imputer per sign. The split is at exactly 0,
            # so any row that is not strictly positive goes to the
            # negative side.
            pos_mask = y > 0
            neg_mask = ~pos_mask
            clf = self._fit_gate(X_train, predictors, pos_mask.astype(int))
            return {
                "kind": "sign_only",
                "classifier": clf,
                "positive_base": self._fit_base_single(
                    X_train.loc[pos_mask], predictors, variable
                ),
                "negative_base": self._fit_base_single(
                    X_train.loc[neg_mask], predictors, variable
                ),
            }

        if regime == REGIME_THREE_SIGN:
            # Three-way gate (0 = negative, 1 = zero, 2 = positive)
            # plus one base imputer per nonzero sign.
            pos_mask = y > self.zero_atol
            neg_mask = y < -self.zero_atol
            labels = np.where(pos_mask, 2, np.where(neg_mask, 0, 1))
            clf = self._fit_gate(X_train, predictors, labels)
            return {
                "kind": "three_sign",
                "classifier": clf,
                "positive_base": self._fit_base_single(
                    X_train.loc[pos_mask], predictors, variable
                ),
                "negative_base": self._fit_base_single(
                    X_train.loc[neg_mask], predictors, variable
                ),
            }

        raise ValueError(f"Unhandled regime {regime!r}")

    def _fit_base_single(
        self,
        X_train: pd.DataFrame,
        predictors: List[str],
        variable: str,
    ) -> ImputerResults:
        """Fit a single base Imputer on a (possibly filtered) slice."""
        return self._new_base_imputer().fit(
            X_train=X_train,
            predictors=predictors,
            imputed_variables=[variable],
        )
|
|
431
|
+
|
|
432
|
+
|
|
433
|
+
class ZeroInflatedImputerResults(ImputerResults):
    """Fitted regime-aware imputer ready for prediction.

    Holds, per numeric target, the regime label and the bundle (gate
    classifier and base imputer results) built at fit time, plus an
    optional bundle wrapping the auxiliary imputer for non-numeric
    targets. Gate decisions are sampled from a generator seeded with
    ``seed``.
    """

    def __init__(
        self,
        predictors: List[str],
        imputed_variables: List[str],
        seed: int,
        regimes: Dict[str, str],
        per_variable: Dict[str, Dict[str, Any]],
        non_numeric_bundle: Optional[Dict[str, Any]] = None,
        imputed_vars_dummy_info: Optional[Dict[str, Any]] = None,
        original_predictors: Optional[List[str]] = None,
        log_level: Optional[str] = "WARNING",
    ) -> None:
        super().__init__(
            predictors=predictors,
            imputed_variables=imputed_variables,
            seed=seed,
            imputed_vars_dummy_info=imputed_vars_dummy_info,
            original_predictors=original_predictors or predictors,
            log_level=log_level,
        )
        self._regimes = regimes
        self._per_variable = per_variable
        self._non_numeric_bundle = non_numeric_bundle
        # Single generator shared by all gate draws of this result.
        self._rng = np.random.default_rng(seed)

    @validate_call(config=VALIDATE_CONFIG)
    def predict(
        self,
        X_test: pd.DataFrame,
        quantiles: Optional[List[float]] = None,
        return_probs: bool = False,
        **kwargs: Any,
    ) -> Union[pd.DataFrame, Dict[float, pd.DataFrame]]:
        """Predict imputed values, routing per-variable by regime.

        For numeric targets, the gate assigns each record to zero,
        positive, or negative regime (depending on the detected
        regime), and the base imputer for that regime produces the
        nonzero draw. Zeros are set exactly to 0.0 (no stochastic
        smearing).

        For non-numeric targets (categorical / boolean / constant),
        delegation is to the single auxiliary base imputer fit at
        training time.

        Args:
            X_test: Records to impute.
            quantiles: When given, one prediction frame is produced per
                quantile and a ``{quantile: DataFrame}`` dict is
                returned. The quantile is handed to the base imputers
                only; the gate is re-sampled independently for every
                quantile.
            return_probs: Accepted but not used by this wrapper.
            **kwargs: Forwarded to the per-regime base imputers'
                ``predict``.
        """
        if quantiles is None:
            return self._predict_single_draw(X_test, quantile=None, **kwargs)

        # One full routing pass (gate draw + base prediction) per
        # requested quantile; there is no joint quantile grid.
        return {
            q: self._predict_single_draw(X_test, quantile=q, **kwargs)
            for q in quantiles
        }

    def _predict_single_draw(
        self,
        X_test: pd.DataFrame,
        quantile: Optional[float],
        **kwargs: Any,
    ) -> pd.DataFrame:
        """Produce one prediction frame covering every imputed variable."""
        out = pd.DataFrame(index=X_test.index)

        for variable in self.imputed_variables:
            if self._regimes.get(variable) is None:
                # Non-numeric target; handled by the auxiliary bundle.
                continue
            out[variable] = self._predict_single_variable(
                X_test,
                variable,
                self._per_variable[variable],
                quantile=quantile,
                **kwargs,
            )

        # Merge in non-numeric target predictions from the auxiliary
        # single base imputer.
        if self._non_numeric_bundle is not None:
            aux_result = self._non_numeric_bundle["result"]
            if quantile is None:
                aux_preds = aux_result.predict(X_test)
            else:
                aux_preds = aux_result.predict(X_test, quantiles=[quantile])[
                    quantile
                ]
            for col in aux_preds.columns:
                # Never overwrite a column already produced above.
                if col not in out.columns:
                    out[col] = aux_preds[col].values

        return out

    def _predict_single_variable(
        self,
        X_test: pd.DataFrame,
        variable: str,
        bundle: Dict[str, Any],
        quantile: Optional[float],
        **kwargs: Any,
    ) -> np.ndarray:
        """Predict one numeric target according to its fitted bundle.

        Returns a float array with one value per row of ``X_test``.
        Rows the gate assigns to the zero class are exactly 0.0.

        Raises:
            ValueError: If the bundle's ``"kind"`` is not recognised.
        """
        n = len(X_test)
        kind = bundle["kind"]

        if kind == "constant":
            return np.full(n, bundle["value"], dtype=float)

        if kind == "single":
            preds = self._invoke_base(
                bundle["base"], X_test, quantile=quantile, **kwargs
            )
            return preds[variable].to_numpy(dtype=float)

        if kind not in ("zi_positive", "zi_negative", "sign_only", "three_sign"):
            raise ValueError(f"Unhandled bundle kind {kind!r}")

        values = np.zeros(n, dtype=float)
        if n == 0:
            # Nothing to route; avoid calling the gate on zero samples.
            return values

        X_pred = X_test[self.predictors].to_numpy(dtype=float, copy=False)
        clf = bundle["classifier"]

        # Work out which rows go to which base imputer. Positive is
        # always served before negative.
        if kind == "three_sign":
            # Classes are [0=neg, 1=zero, 2=pos] per the fit encoding.
            classes = self._categorical_gate_draw(clf, X_pred)
            routes = (
                ("positive_base", classes == 2),
                ("negative_base", classes == 0),
            )
        else:
            hit = self._bernoulli_gate_draw(clf, X_pred) == 1
            if kind == "zi_positive":
                routes = (("positive_base", hit),)
            elif kind == "zi_negative":
                routes = (("negative_base", hit),)
            else:  # sign_only: every row is either positive or negative
                routes = (("positive_base", hit), ("negative_base", ~hit))

        for role, mask in routes:
            if not mask.any():
                continue
            sub_preds = self._invoke_base(
                bundle[role],
                X_test.loc[mask],
                quantile=quantile,
                **kwargs,
            )
            values[mask] = sub_preds[variable].to_numpy(dtype=float)
        return values

    def _categorical_gate_draw(
        self,
        classifier: Any,
        X_pred: np.ndarray,
    ) -> np.ndarray:
        """Sample one class label per row from the classifier's probabilities.

        Uses inverse-CDF sampling over the ``predict_proba`` columns
        and maps the sampled column back through ``classifier.classes_``.
        """
        probas = classifier.predict_proba(X_pred)
        cumulative = np.cumsum(probas, axis=1)
        # Rounding in the running sum can leave the final value a hair
        # below 1; a uniform draw above it would match no column and
        # argmax would then silently pick column 0. Pin the last column.
        cumulative[:, -1] = 1.0
        u = self._rng.random(len(X_pred))
        # First column whose cumulative probability reaches the draw.
        class_indices = (cumulative >= u[:, None]).argmax(axis=1)
        return np.asarray(classifier.classes_)[class_indices]

    def _invoke_base(
        self,
        base_result: ImputerResults,
        X_slice: pd.DataFrame,
        quantile: Optional[float],
        **kwargs: Any,
    ) -> pd.DataFrame:
        """Call a base ImputerResults, returning a DataFrame."""
        if quantile is None:
            result = base_result.predict(X_slice, **kwargs)
            if isinstance(result, dict):
                # Some base imputers always return a dict even without
                # ``quantiles``; pick the first.
                result = next(iter(result.values()))
            return result

        result = base_result.predict(X_slice, quantiles=[quantile], **kwargs)
        if isinstance(result, dict):
            return result[quantile]
        return result

    def _bernoulli_gate_draw(
        self,
        classifier: Any,
        X_pred: np.ndarray,
    ) -> np.ndarray:
        """Stochastic draw from the binary classifier's predicted proba.

        Returns an array of 0/1 integers (length ``len(X_pred)``); 1
        means the row was drawn into the class the classifier saw
        labelled ``1`` at fit time.
        """
        probas = classifier.predict_proba(X_pred)
        u = self._rng.random(len(X_pred))
        # Locate the probability column that belongs to label 1.
        matches = np.flatnonzero(np.asarray(classifier.classes_) == 1)
        if matches.size == 0:
            # The gate never saw label 1, so its probability is zero.
            return np.zeros(len(X_pred), dtype=int)
        positive_prob = probas[:, int(matches[0])]
        return (u < positive_prob).astype(int)

    def _predict(self, *args: Any, **kwargs: Any) -> Any:
        """Abstract-method placeholder.

        ``ImputerResults._predict`` is abstract; this class overrides
        ``predict`` directly and never dispatches through ``_predict``,
        but the abstract method still must be satisfied.
        """
        raise NotImplementedError(
            "ZeroInflatedImputerResults overrides `predict` directly; "
            "`_predict` is not used."
        )
|
|
686
|
+
|
|
687
|
+
|
|
688
|
+
# Names exported by ``from <module> import *``.
__all__ = [
    # REGIME_* constants.
    "REGIME_DEGENERATE_ZERO",
    "REGIME_NEGATIVE_ONLY",
    "REGIME_POSITIVE_ONLY",
    "REGIME_SIGN_ONLY",
    "REGIME_THREE_SIGN",
    "REGIME_ZI_NEGATIVE",
    "REGIME_ZI_POSITIVE",
    # Classes.
    "ZeroInflatedImputer",
    "ZeroInflatedImputerResults",
]
|
|
@@ -3,11 +3,17 @@ Utility function to format various imputation outputs into a unified CSV format
|
|
|
3
3
|
"""
|
|
4
4
|
|
|
5
5
|
import json
|
|
6
|
+
import logging
|
|
6
7
|
from typing import Any, Dict, List, Optional, Union
|
|
7
8
|
|
|
8
9
|
import numpy as np
|
|
9
10
|
import pandas as pd
|
|
10
11
|
|
|
12
|
+
from microimpute.utils.type_handling import VariableTypeDetector
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
log = logging.getLogger(__name__)
|
|
16
|
+
|
|
11
17
|
|
|
12
18
|
def _compute_histogram_data(
|
|
13
19
|
donor_values: np.ndarray,
|
|
@@ -118,7 +124,7 @@ def _compute_categorical_distribution(
|
|
|
118
124
|
pd.Series(receiver_props) / receiver_values.count() * 100
|
|
119
125
|
).tolist()
|
|
120
126
|
else:
|
|
121
|
-
categories = sorted(all_categories)
|
|
127
|
+
categories = sorted(all_categories, key=lambda value: str(value))
|
|
122
128
|
donor_props = [
|
|
123
129
|
(donor_counts.get(cat, 0) / donor_values.count() * 100)
|
|
124
130
|
for cat in categories
|
|
@@ -264,9 +270,38 @@ def _validate_imputed_variables(
|
|
|
264
270
|
)
|
|
265
271
|
|
|
266
272
|
|
|
273
|
+
def _is_categorical_distribution_variable(
    series: pd.Series,
    variable_name: str,
) -> bool:
    """Decide whether a variable gets categorical distribution rows.

    The series is classified with
    ``VariableTypeDetector.categorize_variable`` (using the module
    logger); the answer is ``True`` when the reported type is one of
    ``"bool"``, ``"categorical"`` or ``"numeric_categorical"``.

    Args:
        series: Values of the variable to classify.
        variable_name: Name handed to the detector alongside the series.

    Returns:
        ``True`` for the three categorical-like types, else ``False``.
    """
    categorical_kinds = ["bool", "categorical", "numeric_categorical"]
    detected_kind, _ = VariableTypeDetector().categorize_variable(
        series, variable_name, log
    )
    return detected_kind in categorical_kinds
|
|
281
|
+
|
|
282
|
+
|
|
283
|
+
def _extract_cv_results(autoimpute_result: Any) -> Optional[Dict[str, Dict[str, Any]]]:
|
|
284
|
+
"""Normalize supported autoimpute result shapes to a cv_results dict."""
|
|
285
|
+
if autoimpute_result is None:
|
|
286
|
+
return None
|
|
287
|
+
|
|
288
|
+
if hasattr(autoimpute_result, "cv_results"):
|
|
289
|
+
cv_results = getattr(autoimpute_result, "cv_results")
|
|
290
|
+
return cv_results if isinstance(cv_results, dict) else None
|
|
291
|
+
|
|
292
|
+
if not isinstance(autoimpute_result, dict):
|
|
293
|
+
return None
|
|
294
|
+
|
|
295
|
+
wrapped_cv_results = autoimpute_result.get("cv_results")
|
|
296
|
+
if isinstance(wrapped_cv_results, dict):
|
|
297
|
+
return wrapped_cv_results
|
|
298
|
+
|
|
299
|
+
return autoimpute_result
|
|
300
|
+
|
|
301
|
+
|
|
267
302
|
def format_csv(
|
|
268
303
|
output_path: Optional[str] = None,
|
|
269
|
-
autoimpute_result: Optional[
|
|
304
|
+
autoimpute_result: Optional[Any] = None,
|
|
270
305
|
comparison_metrics_df: Optional[pd.DataFrame] = None,
|
|
271
306
|
distribution_comparison_df: Optional[pd.DataFrame] = None,
|
|
272
307
|
predictor_correlations: Optional[Dict[str, pd.DataFrame]] = None,
|
|
@@ -288,7 +323,8 @@ def format_csv(
|
|
|
288
323
|
|
|
289
324
|
autoimpute_result : Dict, optional
|
|
290
325
|
Result from autoimpute containing cv_results with benchmark losses.
|
|
291
|
-
|
|
326
|
+
Supports an AutoImputeResult object, a {'cv_results': ...} wrapper,
|
|
327
|
+
or the direct structure {method: {'quantile_loss': {...}, 'log_loss': {...}}}.
|
|
292
328
|
|
|
293
329
|
comparison_metrics_df : pd.DataFrame, optional
|
|
294
330
|
DataFrame from compare_metrics() with columns:
|
|
@@ -331,7 +367,8 @@ def format_csv(
|
|
|
331
367
|
-------
|
|
332
368
|
pd.DataFrame
|
|
333
369
|
Unified long-format DataFrame with columns:
|
|
334
|
-
['type', 'method', 'variable', 'quantile', 'metric_name', 'metric_value',
|
|
370
|
+
['type', 'method', 'variable', 'quantile', 'metric_name', 'metric_value',
|
|
371
|
+
'metric_std', 'split', 'additional_info']
|
|
335
372
|
|
|
336
373
|
Raises
|
|
337
374
|
------
|
|
@@ -342,12 +379,13 @@ def format_csv(
|
|
|
342
379
|
rows = []
|
|
343
380
|
|
|
344
381
|
# 1. Process autoimpute benchmark losses from cv_results
|
|
345
|
-
|
|
346
|
-
|
|
382
|
+
cv_results = _extract_cv_results(autoimpute_result)
|
|
383
|
+
if cv_results:
|
|
384
|
+
first_value = next(iter(cv_results.values()), None)
|
|
347
385
|
if isinstance(first_value, dict) and (
|
|
348
386
|
"quantile_loss" in first_value or "log_loss" in first_value
|
|
349
387
|
):
|
|
350
|
-
for method, cv_result in
|
|
388
|
+
for method, cv_result in cv_results.items():
|
|
351
389
|
# Append "_best_method" if this is the best method
|
|
352
390
|
method_label = (
|
|
353
391
|
f"{method}_best_method" if method == best_method_name else method
|
|
@@ -647,9 +685,7 @@ def format_csv(
|
|
|
647
685
|
# Generate histogram data for each imputed variable
|
|
648
686
|
for var in imputed_variables:
|
|
649
687
|
# Check if variable is categorical or numerical
|
|
650
|
-
if
|
|
651
|
-
donor_data[var].dtype, pd.CategoricalDtype
|
|
652
|
-
):
|
|
688
|
+
if _is_categorical_distribution_variable(donor_data[var], var):
|
|
653
689
|
# Categorical variable
|
|
654
690
|
hist_data = _compute_categorical_distribution(
|
|
655
691
|
donor_data[var], receiver_data[var], var
|
|
@@ -700,7 +736,7 @@ def format_csv(
|
|
|
700
736
|
else:
|
|
701
737
|
try:
|
|
702
738
|
return float(q)
|
|
703
|
-
except:
|
|
739
|
+
except (TypeError, ValueError):
|
|
704
740
|
return q
|
|
705
741
|
|
|
706
742
|
df["quantile"] = df["quantile"].apply(convert_quantile)
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: microimpute
|
|
3
|
-
Version: 2.0.3
|
|
3
|
+
Version: 2.0.4
|
|
4
4
|
Summary: Benchmarking imputation methods for microdata
|
|
5
5
|
Author-email: María Juaristi <juaristi@uni.minerva.edu>, Nikhil Woodruff <nikhil.woodruff@outlook.com>
|
|
6
6
|
Requires-Python: <3.15,>=3.12
|
|
@@ -23,6 +23,7 @@ microimpute/models/mdn.py
|
|
|
23
23
|
microimpute/models/ols.py
|
|
24
24
|
microimpute/models/qrf.py
|
|
25
25
|
microimpute/models/quantreg.py
|
|
26
|
+
microimpute/models/zero_inflated.py
|
|
26
27
|
microimpute/utils/__init__.py
|
|
27
28
|
microimpute/utils/dashboard_formatter.py
|
|
28
29
|
microimpute/utils/data.py
|
|
@@ -299,6 +299,33 @@ class TestFormatCSVBasic:
|
|
|
299
299
|
class TestFormatCSVBenchmarkLoss:
|
|
300
300
|
"""Tests for benchmark_loss type formatting."""
|
|
301
301
|
|
|
302
|
+
def test_benchmark_loss_from_cv_results_wrapper(self, sample_autoimpute_result):
    """Benchmark-loss rows are produced from a ``{'cv_results': ...}`` wrapper."""
    wrapped = {"cv_results": sample_autoimpute_result}

    formatted = format_csv(autoimpute_result=wrapped)

    is_benchmark = formatted["type"] == "benchmark_loss"
    benchmark = formatted[is_benchmark]
    assert len(benchmark) > 0
    assert {"OLS", "QRF"} <= set(benchmark["method"])
|
|
311
|
+
|
|
312
|
+
def test_benchmark_loss_from_autoimpute_result_object(
    self, sample_autoimpute_result
):
    """Benchmark-loss rows are produced from an object exposing ``cv_results``."""

    class ResultLike:
        # Minimal stand-in: only the ``cv_results`` attribute is provided.
        def __init__(self, cv_results):
            self.cv_results = cv_results

    formatted = format_csv(autoimpute_result=ResultLike(sample_autoimpute_result))

    is_benchmark = formatted["type"] == "benchmark_loss"
    benchmark = formatted[is_benchmark]
    assert len(benchmark) > 0
    assert {"OLS", "QRF"} <= set(benchmark["method"])
|
|
328
|
+
|
|
302
329
|
def test_benchmark_loss_from_autoimpute(self, sample_autoimpute_result):
|
|
303
330
|
"""Test benchmark loss formatting from autoimpute results."""
|
|
304
331
|
with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".csv") as f:
|
|
@@ -943,6 +970,31 @@ class TestDistributionBins:
|
|
|
943
970
|
imputed_variables=imputed_variables,
|
|
944
971
|
)
|
|
945
972
|
|
|
973
|
+
def test_numeric_categorical_distribution_uses_categorical_rows(self):
    """Integer-coded variables are reported as categorical distributions."""
    donors = pd.DataFrame(
        {"rating": [1, 1, 2, 2, 3, 3], "flag": [0, 1, 1, 0, 1, 0]}
    )
    receivers = pd.DataFrame({"rating": [1, 2, 2, 3], "flag": [1, 1, 0, 0]})

    formatted = format_csv(
        donor_data=donors,
        receiver_data=receivers,
        imputed_variables=["rating", "flag"],
    )

    bins = formatted[formatted["type"] == "distribution_bins"]
    # Both variables must appear, and only as categorical rows.
    assert set(bins["variable"]) == {"rating", "flag"}
    assert set(bins["metric_name"]) == {"categorical_distribution"}
|
|
997
|
+
|
|
946
998
|
|
|
947
999
|
class TestEdgeCases:
|
|
948
1000
|
"""Test edge cases and error handling."""
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|