microimpute 2.0.3__tar.gz → 2.0.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46) hide show
  1. {microimpute-2.0.3 → microimpute-2.0.4}/PKG-INFO +1 -1
  2. microimpute-2.0.4/microimpute/models/zero_inflated.py +698 -0
  3. {microimpute-2.0.3 → microimpute-2.0.4}/microimpute/utils/dashboard_formatter.py +47 -11
  4. {microimpute-2.0.3 → microimpute-2.0.4}/microimpute.egg-info/PKG-INFO +1 -1
  5. {microimpute-2.0.3 → microimpute-2.0.4}/microimpute.egg-info/SOURCES.txt +1 -0
  6. {microimpute-2.0.3 → microimpute-2.0.4}/pyproject.toml +1 -1
  7. {microimpute-2.0.3 → microimpute-2.0.4}/tests/test_dashboard_formatter.py +52 -0
  8. {microimpute-2.0.3 → microimpute-2.0.4}/README.md +0 -0
  9. {microimpute-2.0.3 → microimpute-2.0.4}/microimpute/__init__.py +0 -0
  10. {microimpute-2.0.3 → microimpute-2.0.4}/microimpute/comparisons/__init__.py +0 -0
  11. {microimpute-2.0.3 → microimpute-2.0.4}/microimpute/comparisons/autoimpute.py +0 -0
  12. {microimpute-2.0.3 → microimpute-2.0.4}/microimpute/comparisons/autoimpute_helpers.py +0 -0
  13. {microimpute-2.0.3 → microimpute-2.0.4}/microimpute/comparisons/imputations.py +0 -0
  14. {microimpute-2.0.3 → microimpute-2.0.4}/microimpute/comparisons/metrics.py +0 -0
  15. {microimpute-2.0.3 → microimpute-2.0.4}/microimpute/comparisons/validation.py +0 -0
  16. {microimpute-2.0.3 → microimpute-2.0.4}/microimpute/config.py +0 -0
  17. {microimpute-2.0.3 → microimpute-2.0.4}/microimpute/evaluations/__init__.py +0 -0
  18. {microimpute-2.0.3 → microimpute-2.0.4}/microimpute/evaluations/cross_validation.py +0 -0
  19. {microimpute-2.0.3 → microimpute-2.0.4}/microimpute/evaluations/predictor_analysis.py +0 -0
  20. {microimpute-2.0.3 → microimpute-2.0.4}/microimpute/models/__init__.py +0 -0
  21. {microimpute-2.0.3 → microimpute-2.0.4}/microimpute/models/imputer.py +0 -0
  22. {microimpute-2.0.3 → microimpute-2.0.4}/microimpute/models/matching.py +0 -0
  23. {microimpute-2.0.3 → microimpute-2.0.4}/microimpute/models/mdn.py +0 -0
  24. {microimpute-2.0.3 → microimpute-2.0.4}/microimpute/models/ols.py +0 -0
  25. {microimpute-2.0.3 → microimpute-2.0.4}/microimpute/models/qrf.py +0 -0
  26. {microimpute-2.0.3 → microimpute-2.0.4}/microimpute/models/quantreg.py +0 -0
  27. {microimpute-2.0.3 → microimpute-2.0.4}/microimpute/utils/__init__.py +0 -0
  28. {microimpute-2.0.3 → microimpute-2.0.4}/microimpute/utils/data.py +0 -0
  29. {microimpute-2.0.3 → microimpute-2.0.4}/microimpute/utils/statmatch_hotdeck.py +0 -0
  30. {microimpute-2.0.3 → microimpute-2.0.4}/microimpute/utils/type_handling.py +0 -0
  31. {microimpute-2.0.3 → microimpute-2.0.4}/microimpute/visualizations/__init__.py +0 -0
  32. {microimpute-2.0.3 → microimpute-2.0.4}/microimpute/visualizations/comparison_plots.py +0 -0
  33. {microimpute-2.0.3 → microimpute-2.0.4}/microimpute/visualizations/performance_plots.py +0 -0
  34. {microimpute-2.0.3 → microimpute-2.0.4}/microimpute.egg-info/dependency_links.txt +0 -0
  35. {microimpute-2.0.3 → microimpute-2.0.4}/microimpute.egg-info/requires.txt +0 -0
  36. {microimpute-2.0.3 → microimpute-2.0.4}/microimpute.egg-info/top_level.txt +0 -0
  37. {microimpute-2.0.3 → microimpute-2.0.4}/setup.cfg +0 -0
  38. {microimpute-2.0.3 → microimpute-2.0.4}/tests/test_autoimpute.py +0 -0
  39. {microimpute-2.0.3 → microimpute-2.0.4}/tests/test_basic.py +0 -0
  40. {microimpute-2.0.3 → microimpute-2.0.4}/tests/test_data_preprocessing.py +0 -0
  41. {microimpute-2.0.3 → microimpute-2.0.4}/tests/test_metrics.py +0 -0
  42. {microimpute-2.0.3 → microimpute-2.0.4}/tests/test_predictor_analysis.py +0 -0
  43. {microimpute-2.0.3 → microimpute-2.0.4}/tests/test_quantile_comparison.py +0 -0
  44. {microimpute-2.0.3 → microimpute-2.0.4}/tests/test_smoke_qrf.py +0 -0
  45. {microimpute-2.0.3 → microimpute-2.0.4}/tests/test_type_handling.py +0 -0
  46. {microimpute-2.0.3 → microimpute-2.0.4}/tests/test_visualizations.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: microimpute
3
- Version: 2.0.3
3
+ Version: 2.0.4
4
4
  Summary: Benchmarking imputation methods for microdata
5
5
  Author-email: María Juaristi <juaristi@uni.minerva.edu>, Nikhil Woodruff <nikhil.woodruff@outlook.com>
6
6
  Requires-Python: <3.15,>=3.12
@@ -0,0 +1,698 @@
1
+ """Regime-aware zero-inflation wrapper around base imputers.
2
+
3
+ Tabular microdata variables often fall into distinct *regimes* based on
4
+ which of {negative, zero, positive} values appear in the training data.
5
+ Imputing them with a single regressor mixes regimes together, causing
6
+ two recurring bugs in downstream ecosystems:
7
+
8
+ 1. **Negative-dropping.** The common "fit QRF on ``y > 0``" pattern
9
+ drops negative training rows along with zeros, so the imputer
10
+ produces zero or positive values only. Variables like
11
+ ``short_term_capital_gains`` lose their entire negative tail.
12
+
13
+ 2. **Zero-crossing interpolation.** A QRF fit on all nonzero values
14
+ (both signs) learns leaf distributions that interpolate between
15
+ positive and negative training rows. Predictions for records that
16
+ the gate marks "nonzero" can land in the interval between
17
+ ``max(train_negatives)`` and ``min(train_positives)``, which is
18
+ not a region any actual record occupies.
19
+
20
+ ``ZeroInflatedImputer`` wraps any base ``Imputer`` and:
21
+
22
+ - Detects the regime automatically at fit time from the training
23
+ distribution — no per-variable hand configuration required.
24
+ - Composes the base imputer with appropriate gate(s):
25
+ - Three-sign: gate chooses ``{neg, 0, pos}``; separate base
26
+ imputers on the positive and negative subsets.
27
+ - ZI positive / ZI negative: binary gate (``0`` vs nonzero); base
28
+ imputer on the nonzero-sign subset.
29
+ - Sign-only (no zero): binary sign gate; two base imputers.
30
+ - Single-sign or constant: no gate; direct base imputer or a
31
+ constant imputer.
32
+ - At predict time, routes each record to the base imputer of its
33
+ gate-assigned regime, guaranteeing no sign-interpolation leaks.
34
+
35
+ The wrapper is generic over the base imputer — ``QRF`` is the obvious
36
+ default, but ``MDN``, ``OLS``, or ``Matching`` all compose the same way.
37
+
38
+ Regime detection is parameterized by ``min_class_count`` and
39
+ ``min_class_fraction``: a class with fewer observations than both
40
+ thresholds collapses into the closest adjacent regime. This avoids
41
+ fitting a full three-sign split on a variable whose negative tail is
42
+ five outlier rows — the cost-benefit flips toward the simpler
43
+ architecture.
44
+ """
45
+
46
+ from __future__ import annotations
47
+
48
+ import logging
49
+ from typing import Any, Dict, List, Optional, Tuple, Type, Union
50
+
51
+ import numpy as np
52
+ import pandas as pd
53
+ from pydantic import SkipValidation, validate_call
54
+
55
+ from microimpute.config import RANDOM_STATE, VALIDATE_CONFIG
56
+ from microimpute.models.imputer import (
57
+ Imputer,
58
+ ImputerResults,
59
+ _ConstantValueModel,
60
+ )
61
+ from microimpute.models.qrf import QRF
62
+
63
+
64
+ # Regime labels. Kept as module-level constants so downstream code can
65
+ # match on them without magic strings.
66
+ REGIME_THREE_SIGN = "THREE_SIGN"
67
+ REGIME_ZI_POSITIVE = "ZI_POSITIVE"
68
+ REGIME_ZI_NEGATIVE = "ZI_NEGATIVE"
69
+ REGIME_SIGN_ONLY = "SIGN_ONLY"
70
+ REGIME_POSITIVE_ONLY = "POSITIVE_ONLY"
71
+ REGIME_NEGATIVE_ONLY = "NEGATIVE_ONLY"
72
+ REGIME_DEGENERATE_ZERO = "DEGENERATE_ZERO"
73
+
74
+
75
+ def _make_classifier(kind: str, seed: int):
76
+ """Build a sklearn classifier for the zero-gate.
77
+
78
+ ``hist_gb`` (default): ``HistGradientBoostingClassifier``. On the
79
+ isolated-log-loss benchmark over 26 zero-inflated PolicyEngine-US
80
+ target variables this Pareto-dominated a 50-tree RF on log-loss
81
+ (0.225 vs 0.310), Brier (0.071 vs 0.081), ECE (0.005 vs 0.039),
82
+ and ROC-AUC (0.809 vs 0.737).
83
+ """
84
+ if kind == "hist_gb":
85
+ from sklearn.ensemble import HistGradientBoostingClassifier
86
+
87
+ return HistGradientBoostingClassifier(random_state=seed)
88
+ if kind == "rf":
89
+ from sklearn.ensemble import RandomForestClassifier
90
+
91
+ return RandomForestClassifier(n_estimators=50, random_state=seed, n_jobs=-1)
92
+ raise ValueError(f"Unknown classifier_type {kind!r}; expected 'hist_gb' or 'rf'.")
93
+
94
+
95
+ def _detect_regime(
96
+ y: np.ndarray,
97
+ *,
98
+ min_class_count: int,
99
+ min_class_fraction: float,
100
+ zero_atol: float,
101
+ ) -> str:
102
+ """Classify the training distribution into one of seven regimes.
103
+
104
+ A class (neg/zero/pos) counts as present iff its count is at least
105
+ ``min_class_count`` AND its fraction of total rows is at least
106
+ ``min_class_fraction``. Below both thresholds, the class collapses
107
+ into its closest adjacent regime (minority negatives merge into
108
+ zero → ZI_POSITIVE; minority zeros merge into the majority sign;
109
+ etc.). This keeps the gate architecture stable in the presence of
110
+ measurement-error outliers.
111
+ """
112
+ n = len(y)
113
+ if n == 0:
114
+ return REGIME_DEGENERATE_ZERO
115
+
116
+ is_zero = np.abs(y) <= zero_atol
117
+ is_pos = y > zero_atol
118
+ is_neg = y < -zero_atol
119
+
120
+ n_zero = int(is_zero.sum())
121
+ n_pos = int(is_pos.sum())
122
+ n_neg = int(is_neg.sum())
123
+
124
+ # Apply both thresholds.
125
+ def _meaningful(count: int) -> bool:
126
+ return count >= min_class_count and (count / n) >= min_class_fraction
127
+
128
+ has_zero = _meaningful(n_zero)
129
+ has_pos = _meaningful(n_pos)
130
+ has_neg = _meaningful(n_neg)
131
+
132
+ if not (has_zero or has_pos or has_neg):
133
+ # All three classes are below threshold. Pick the one with the
134
+ # largest raw count as a degenerate fallback.
135
+ counts = {"zero": n_zero, "pos": n_pos, "neg": n_neg}
136
+ majority = max(counts, key=counts.get)
137
+ if majority == "zero":
138
+ return REGIME_DEGENERATE_ZERO
139
+ return REGIME_POSITIVE_ONLY if majority == "pos" else REGIME_NEGATIVE_ONLY
140
+
141
+ if has_pos and has_neg and has_zero:
142
+ return REGIME_THREE_SIGN
143
+ if has_pos and has_neg:
144
+ return REGIME_SIGN_ONLY
145
+ if has_pos and has_zero:
146
+ return REGIME_ZI_POSITIVE
147
+ if has_neg and has_zero:
148
+ return REGIME_ZI_NEGATIVE
149
+ if has_pos:
150
+ return REGIME_POSITIVE_ONLY
151
+ if has_neg:
152
+ return REGIME_NEGATIVE_ONLY
153
+ return REGIME_DEGENERATE_ZERO
154
+
155
+
156
+ class ZeroInflatedImputer(Imputer):
157
+ """Imputer that wraps a base Imputer with regime-aware zero-gating.
158
+
159
+ Args:
160
+ base_imputer_class: ``Imputer`` subclass to use for the nonzero
161
+ regression step. Defaults to ``QRF``.
162
+ base_imputer_kwargs: Keyword arguments forwarded to the base
163
+ imputer constructor. ``{}`` by default.
164
+ min_class_count: Minimum raw count per class (neg/0/pos) for
165
+ that class to be considered present. Below this, the class
166
+ collapses into an adjacent regime. Defaults to 10.
167
+ min_class_fraction: Minimum fraction of total rows per class
168
+ for that class to be considered present. Defaults to 0.01.
169
+ zero_atol: Absolute tolerance for "equals zero" in the regime
170
+ detector. Defaults to 1e-6, matching the upstream
171
+ ``_MultiSourceBase`` convention.
172
+ classifier_type: Backend for the gate classifier;
173
+ ``"hist_gb"`` (default) or ``"rf"``.
174
+ seed: Random seed.
175
+ log_level: Python logging level.
176
+ """
177
+
178
+ def __init__(
179
+ self,
180
+ base_imputer_class: Optional[Type[Imputer]] = None,
181
+ base_imputer_kwargs: Optional[Dict[str, Any]] = None,
182
+ min_class_count: int = 10,
183
+ min_class_fraction: float = 0.01,
184
+ zero_atol: float = 1e-6,
185
+ classifier_type: str = "hist_gb",
186
+ seed: Optional[int] = RANDOM_STATE,
187
+ log_level: Optional[str] = "WARNING",
188
+ ) -> None:
189
+ super().__init__(seed=seed, log_level=log_level)
190
+ self.base_imputer_class = base_imputer_class or QRF
191
+ self.base_imputer_kwargs = dict(base_imputer_kwargs or {})
192
+ self.min_class_count = int(min_class_count)
193
+ self.min_class_fraction = float(min_class_fraction)
194
+ self.zero_atol = float(zero_atol)
195
+ self.classifier_type = classifier_type
196
+
197
+ # Filled in during fit().
198
+ self._regimes: Dict[str, str] = {}
199
+ self._per_variable: Dict[str, Dict[str, Any]] = {}
200
+
201
+ def _fit(self, *args: Any, **kwargs: Any) -> Any:
202
+ """Abstract-method placeholder; this class overrides ``fit`` directly."""
203
+ raise NotImplementedError(
204
+ "ZeroInflatedImputer overrides `fit` directly; `_fit` is not used."
205
+ )
206
+
207
+ def get_regime(self, variable: str) -> str:
208
+ """Return the detected regime label for a fitted variable."""
209
+ if variable not in self._regimes:
210
+ raise KeyError(f"Variable {variable!r} not fitted; call fit() first.")
211
+ return self._regimes[variable]
212
+
213
+ def fit(
214
+ self,
215
+ X_train: pd.DataFrame,
216
+ predictors: List[str],
217
+ imputed_variables: List[str],
218
+ weight_col: Optional[Union[str, np.ndarray, pd.Series]] = None,
219
+ skip_missing: bool = False,
220
+ not_numeric_categorical: Optional[List[str]] = None,
221
+ **kwargs: Any,
222
+ ) -> Any:
223
+ """Fit the regime-aware wrapper.
224
+
225
+ Delegates non-numeric targets (categorical / boolean /
226
+ constant) to a single base imputer instance. Numeric targets
227
+ are handled per-variable: regime detection, then composition
228
+ of gate + base imputer(s) as appropriate.
229
+
230
+ Returns a ``ZeroInflatedImputerResults`` that routes
231
+ predictions through each target's regime-specific pipeline.
232
+ """
233
+ self._validate_data(X_train, predictors + imputed_variables)
234
+
235
+ # Classify target variables as numeric / categorical / boolean /
236
+ # constant using the base Imputer's detector.
237
+ self.identify_target_types(
238
+ X_train,
239
+ imputed_variables,
240
+ not_numeric_categorical=not_numeric_categorical,
241
+ )
242
+
243
+ self.predictors = list(predictors)
244
+ self.imputed_variables = list(imputed_variables)
245
+ self._regimes = {}
246
+ self._per_variable = {}
247
+
248
+ # Per-variable fit for numeric targets. Constant numeric
249
+ # targets (e.g. a column that is always 0 in training) are
250
+ # also treated here so their regime lands in the wrapper's
251
+ # ``_regimes`` map rather than being silently passed through.
252
+ constant_numeric_targets = [
253
+ v
254
+ for v in imputed_variables
255
+ if v in self.constant_targets
256
+ and np.issubdtype(
257
+ pd.Series([self.constant_targets[v]["value"]]).dtype,
258
+ np.number,
259
+ )
260
+ ]
261
+ numeric_targets = [
262
+ v
263
+ for v in imputed_variables
264
+ if v in self.numeric_targets or v in constant_numeric_targets
265
+ ]
266
+ for var in numeric_targets:
267
+ y = X_train[var].to_numpy(dtype=float, copy=False)
268
+ regime = _detect_regime(
269
+ y,
270
+ min_class_count=self.min_class_count,
271
+ min_class_fraction=self.min_class_fraction,
272
+ zero_atol=self.zero_atol,
273
+ )
274
+ self._regimes[var] = regime
275
+ self._per_variable[var] = self._fit_single_numeric(
276
+ X_train=X_train,
277
+ predictors=predictors,
278
+ variable=var,
279
+ regime=regime,
280
+ y=y,
281
+ )
282
+
283
+ # Non-numeric (categorical / boolean / constant) targets are
284
+ # handled by a single auxiliary base imputer over their union.
285
+ non_numeric = [v for v in imputed_variables if v not in numeric_targets]
286
+ if non_numeric:
287
+ aux = self.base_imputer_class(
288
+ log_level="ERROR",
289
+ **self.base_imputer_kwargs,
290
+ )
291
+ aux_result = aux.fit(
292
+ X_train=X_train,
293
+ predictors=predictors,
294
+ imputed_variables=non_numeric,
295
+ weight_col=weight_col,
296
+ skip_missing=skip_missing,
297
+ not_numeric_categorical=not_numeric_categorical,
298
+ **kwargs,
299
+ )
300
+ aux_bundle = {"kind": "passthrough", "result": aux_result}
301
+ else:
302
+ aux_bundle = None
303
+
304
+ return ZeroInflatedImputerResults(
305
+ predictors=self.predictors,
306
+ imputed_variables=self.imputed_variables,
307
+ seed=self.seed,
308
+ regimes=self._regimes,
309
+ per_variable=self._per_variable,
310
+ non_numeric_bundle=aux_bundle,
311
+ log_level="WARNING",
312
+ )
313
+
314
+ # ------------------------------------------------------------------
315
+ # Per-variable fit helpers
316
+ # ------------------------------------------------------------------
317
+
318
+ def _fit_single_numeric(
319
+ self,
320
+ *,
321
+ X_train: pd.DataFrame,
322
+ predictors: List[str],
323
+ variable: str,
324
+ regime: str,
325
+ y: np.ndarray,
326
+ ) -> Dict[str, Any]:
327
+ """Fit the gate and base imputer(s) for one numeric target.
328
+
329
+ Returns a bundle dict with the regime, the gate classifier
330
+ (or None), and the base imputer(s) keyed by their role.
331
+ """
332
+ X_pred = X_train[predictors].to_numpy(dtype=float, copy=False)
333
+
334
+ if regime == REGIME_DEGENERATE_ZERO:
335
+ return {"kind": "constant", "value": 0.0}
336
+
337
+ if regime in (REGIME_POSITIVE_ONLY, REGIME_NEGATIVE_ONLY):
338
+ # No gate; single base imputer on the full training set.
339
+ return {
340
+ "kind": "single",
341
+ "base": self._fit_base_single(X_train, predictors, variable),
342
+ }
343
+
344
+ if regime == REGIME_ZI_POSITIVE:
345
+ labels = (y > self.zero_atol).astype(int)
346
+ clf = _make_classifier(self.classifier_type, self.seed)
347
+ clf.fit(X_pred, labels)
348
+ pos_mask = y > self.zero_atol
349
+ pos_base = self._fit_base_single(
350
+ X_train.loc[pos_mask], predictors, variable
351
+ )
352
+ return {
353
+ "kind": "zi_positive",
354
+ "classifier": clf,
355
+ "positive_base": pos_base,
356
+ }
357
+
358
+ if regime == REGIME_ZI_NEGATIVE:
359
+ labels = (y < -self.zero_atol).astype(int)
360
+ clf = _make_classifier(self.classifier_type, self.seed)
361
+ clf.fit(X_pred, labels)
362
+ neg_mask = y < -self.zero_atol
363
+ neg_base = self._fit_base_single(
364
+ X_train.loc[neg_mask], predictors, variable
365
+ )
366
+ return {
367
+ "kind": "zi_negative",
368
+ "classifier": clf,
369
+ "negative_base": neg_base,
370
+ }
371
+
372
+ if regime == REGIME_SIGN_ONLY:
373
+ # No zero class, but both signs present. Binary sign gate
374
+ # plus a base imputer per sign.
375
+ labels = (y > 0).astype(int)
376
+ clf = _make_classifier(self.classifier_type, self.seed)
377
+ clf.fit(X_pred, labels)
378
+ pos_mask = y > 0
379
+ neg_mask = ~pos_mask
380
+ return {
381
+ "kind": "sign_only",
382
+ "classifier": clf,
383
+ "positive_base": self._fit_base_single(
384
+ X_train.loc[pos_mask], predictors, variable
385
+ ),
386
+ "negative_base": self._fit_base_single(
387
+ X_train.loc[neg_mask], predictors, variable
388
+ ),
389
+ }
390
+
391
+ if regime == REGIME_THREE_SIGN:
392
+ # 0 / neg / pos three-way gate + two base imputers.
393
+ labels = np.where(
394
+ y > self.zero_atol,
395
+ 2,
396
+ np.where(y < -self.zero_atol, 0, 1),
397
+ )
398
+ clf = _make_classifier(self.classifier_type, self.seed)
399
+ clf.fit(X_pred, labels)
400
+ pos_mask = y > self.zero_atol
401
+ neg_mask = y < -self.zero_atol
402
+ return {
403
+ "kind": "three_sign",
404
+ "classifier": clf,
405
+ "positive_base": self._fit_base_single(
406
+ X_train.loc[pos_mask], predictors, variable
407
+ ),
408
+ "negative_base": self._fit_base_single(
409
+ X_train.loc[neg_mask], predictors, variable
410
+ ),
411
+ }
412
+
413
+ raise ValueError(f"Unhandled regime {regime!r}")
414
+
415
+ def _fit_base_single(
416
+ self,
417
+ X_train: pd.DataFrame,
418
+ predictors: List[str],
419
+ variable: str,
420
+ ) -> ImputerResults:
421
+ """Fit a single base Imputer on a (possibly filtered) slice."""
422
+ imputer = self.base_imputer_class(
423
+ log_level="ERROR",
424
+ **self.base_imputer_kwargs,
425
+ )
426
+ return imputer.fit(
427
+ X_train=X_train,
428
+ predictors=predictors,
429
+ imputed_variables=[variable],
430
+ )
431
+
432
+
433
+ class ZeroInflatedImputerResults(ImputerResults):
434
+ """Fitted regime-aware imputer ready for prediction."""
435
+
436
+ def __init__(
437
+ self,
438
+ predictors: List[str],
439
+ imputed_variables: List[str],
440
+ seed: int,
441
+ regimes: Dict[str, str],
442
+ per_variable: Dict[str, Dict[str, Any]],
443
+ non_numeric_bundle: Optional[Dict[str, Any]] = None,
444
+ imputed_vars_dummy_info: Optional[Dict[str, Any]] = None,
445
+ original_predictors: Optional[List[str]] = None,
446
+ log_level: Optional[str] = "WARNING",
447
+ ) -> None:
448
+ super().__init__(
449
+ predictors=predictors,
450
+ imputed_variables=imputed_variables,
451
+ seed=seed,
452
+ imputed_vars_dummy_info=imputed_vars_dummy_info,
453
+ original_predictors=original_predictors or predictors,
454
+ log_level=log_level,
455
+ )
456
+ self._regimes = regimes
457
+ self._per_variable = per_variable
458
+ self._non_numeric_bundle = non_numeric_bundle
459
+ self._rng = np.random.default_rng(seed)
460
+
461
+ @validate_call(config=VALIDATE_CONFIG)
462
+ def predict(
463
+ self,
464
+ X_test: pd.DataFrame,
465
+ quantiles: Optional[List[float]] = None,
466
+ return_probs: bool = False,
467
+ **kwargs: Any,
468
+ ) -> Union[pd.DataFrame, Dict[float, pd.DataFrame]]:
469
+ """Predict imputed values, routing per-variable by regime.
470
+
471
+ For numeric targets, the gate assigns each record to zero,
472
+ positive, or negative regime (depending on the detected
473
+ regime), and the base imputer for that regime produces the
474
+ nonzero draw. Zeros are set exactly to 0.0 (no stochastic
475
+ smearing).
476
+
477
+ For non-numeric targets (categorical / boolean / constant),
478
+ delegation is to the single auxiliary base imputer fit at
479
+ training time.
480
+ """
481
+ if quantiles is not None:
482
+ # Quantile grid not currently supported in the wrapper; the
483
+ # regime routing only produces a single stochastic draw per
484
+ # call. Deterministic-quantile support would require the
485
+ # caller to specify quantile conditional on regime.
486
+ return {
487
+ q: self._predict_single_draw(X_test, quantile=q, **kwargs)
488
+ for q in quantiles
489
+ }
490
+ return self._predict_single_draw(X_test, quantile=None, **kwargs)
491
+
492
+ def _predict_single_draw(
493
+ self,
494
+ X_test: pd.DataFrame,
495
+ quantile: Optional[float],
496
+ **kwargs: Any,
497
+ ) -> pd.DataFrame:
498
+ out = pd.DataFrame(index=X_test.index)
499
+
500
+ for variable in self.imputed_variables:
501
+ regime = self._regimes.get(variable)
502
+ if regime is None:
503
+ # Non-numeric target; handled by the auxiliary bundle.
504
+ continue
505
+ bundle = self._per_variable[variable]
506
+ out[variable] = self._predict_single_variable(
507
+ X_test, variable, bundle, quantile=quantile, **kwargs
508
+ )
509
+
510
+ # Merge in non-numeric target predictions from the auxiliary
511
+ # single base imputer.
512
+ if self._non_numeric_bundle is not None:
513
+ aux_result = self._non_numeric_bundle["result"]
514
+ if quantile is None:
515
+ aux_preds = aux_result.predict(X_test)
516
+ else:
517
+ aux_dict = aux_result.predict(X_test, quantiles=[quantile])
518
+ aux_preds = aux_dict[quantile]
519
+ for col in aux_preds.columns:
520
+ if col not in out.columns:
521
+ out[col] = aux_preds[col].values
522
+
523
+ return out
524
+
525
+ def _predict_single_variable(
526
+ self,
527
+ X_test: pd.DataFrame,
528
+ variable: str,
529
+ bundle: Dict[str, Any],
530
+ quantile: Optional[float],
531
+ **kwargs: Any,
532
+ ) -> np.ndarray:
533
+ n = len(X_test)
534
+ kind = bundle["kind"]
535
+
536
+ if kind == "constant":
537
+ return np.full(n, bundle["value"], dtype=float)
538
+
539
+ if kind == "single":
540
+ preds = self._invoke_base(
541
+ bundle["base"], X_test, quantile=quantile, **kwargs
542
+ )
543
+ return preds[variable].to_numpy(dtype=float)
544
+
545
+ X_pred = X_test[self.predictors].to_numpy(dtype=float, copy=False)
546
+
547
+ if kind == "zi_positive":
548
+ clf = bundle["classifier"]
549
+ draw = self._bernoulli_gate_draw(clf, X_pred)
550
+ values = np.zeros(n, dtype=float)
551
+ positive_mask = draw == 1
552
+ if positive_mask.any():
553
+ sub_preds = self._invoke_base(
554
+ bundle["positive_base"],
555
+ X_test.loc[positive_mask],
556
+ quantile=quantile,
557
+ **kwargs,
558
+ )
559
+ values[positive_mask] = sub_preds[variable].to_numpy(dtype=float)
560
+ return values
561
+
562
+ if kind == "zi_negative":
563
+ clf = bundle["classifier"]
564
+ draw = self._bernoulli_gate_draw(clf, X_pred)
565
+ values = np.zeros(n, dtype=float)
566
+ negative_mask = draw == 1
567
+ if negative_mask.any():
568
+ sub_preds = self._invoke_base(
569
+ bundle["negative_base"],
570
+ X_test.loc[negative_mask],
571
+ quantile=quantile,
572
+ **kwargs,
573
+ )
574
+ values[negative_mask] = sub_preds[variable].to_numpy(dtype=float)
575
+ return values
576
+
577
+ if kind == "sign_only":
578
+ clf = bundle["classifier"]
579
+ draw = self._bernoulli_gate_draw(clf, X_pred)
580
+ positive_mask = draw == 1
581
+ negative_mask = ~positive_mask
582
+ values = np.zeros(n, dtype=float)
583
+ if positive_mask.any():
584
+ sub_preds = self._invoke_base(
585
+ bundle["positive_base"],
586
+ X_test.loc[positive_mask],
587
+ quantile=quantile,
588
+ **kwargs,
589
+ )
590
+ values[positive_mask] = sub_preds[variable].to_numpy(dtype=float)
591
+ if negative_mask.any():
592
+ sub_preds = self._invoke_base(
593
+ bundle["negative_base"],
594
+ X_test.loc[negative_mask],
595
+ quantile=quantile,
596
+ **kwargs,
597
+ )
598
+ values[negative_mask] = sub_preds[variable].to_numpy(dtype=float)
599
+ return values
600
+
601
+ if kind == "three_sign":
602
+ clf = bundle["classifier"]
603
+ probas = clf.predict_proba(X_pred)
604
+ # Classes are [0=neg, 1=zero, 2=pos] per the fit encoding.
605
+ cumulative = np.cumsum(probas, axis=1)
606
+ u = self._rng.random(n)
607
+ # Each row i is assigned to class argmax over k of (cumulative[i,k] >= u[i]).
608
+ class_indices = (cumulative >= u[:, None]).argmax(axis=1)
609
+ classes = clf.classes_[class_indices]
610
+ values = np.zeros(n, dtype=float)
611
+ positive_mask = classes == 2
612
+ negative_mask = classes == 0
613
+ if positive_mask.any():
614
+ sub_preds = self._invoke_base(
615
+ bundle["positive_base"],
616
+ X_test.loc[positive_mask],
617
+ quantile=quantile,
618
+ **kwargs,
619
+ )
620
+ values[positive_mask] = sub_preds[variable].to_numpy(dtype=float)
621
+ if negative_mask.any():
622
+ sub_preds = self._invoke_base(
623
+ bundle["negative_base"],
624
+ X_test.loc[negative_mask],
625
+ quantile=quantile,
626
+ **kwargs,
627
+ )
628
+ values[negative_mask] = sub_preds[variable].to_numpy(dtype=float)
629
+ return values
630
+
631
+ raise ValueError(f"Unhandled bundle kind {kind!r}")
632
+
633
+ def _invoke_base(
634
+ self,
635
+ base_result: ImputerResults,
636
+ X_slice: pd.DataFrame,
637
+ quantile: Optional[float],
638
+ **kwargs: Any,
639
+ ) -> pd.DataFrame:
640
+ """Call a base ImputerResults, returning a DataFrame."""
641
+ if quantile is None:
642
+ result = base_result.predict(X_slice, **kwargs)
643
+ if isinstance(result, dict):
644
+ # Some base imputers always return a dict even without
645
+ # ``quantiles``; pick the first.
646
+ result = next(iter(result.values()))
647
+ return result
648
+ result = base_result.predict(X_slice, quantiles=[quantile], **kwargs)
649
+ if isinstance(result, dict):
650
+ return result[quantile]
651
+ return result
652
+
653
+ def _bernoulli_gate_draw(
654
+ self,
655
+ classifier: Any,
656
+ X_pred: np.ndarray,
657
+ ) -> np.ndarray:
658
+ """Stochastic draw from the binary classifier's predicted proba.
659
+
660
+ Returns an array of 0/1 integers (length ``len(X_pred)``),
661
+ matching classifier.classes_ encoding for class-1.
662
+ """
663
+ probas = classifier.predict_proba(X_pred)
664
+ # Ensure we pull the probability for the "positive-class" index
665
+ # (which is whichever class the classifier labeled 1 at fit time).
666
+ classes = np.asarray(classifier.classes_)
667
+ if 1 in classes:
668
+ positive_idx = int(np.where(classes == 1)[0][0])
669
+ else:
670
+ positive_idx = probas.shape[1] - 1
671
+ positive_prob = probas[:, positive_idx]
672
+ u = self._rng.random(len(X_pred))
673
+ return (u < positive_prob).astype(int)
674
+
675
+ def _predict(self, *args: Any, **kwargs: Any) -> Any:
676
+ """Abstract-method placeholder.
677
+
678
+ ``ImputerResults._predict`` is abstract; this class overrides
679
+ ``predict`` directly and never dispatches through ``_predict``,
680
+ but the abstract method still must be satisfied.
681
+ """
682
+ raise NotImplementedError(
683
+ "ZeroInflatedImputerResults overrides `predict` directly; "
684
+ "`_predict` is not used."
685
+ )
686
+
687
+
688
+ __all__ = [
689
+ "REGIME_DEGENERATE_ZERO",
690
+ "REGIME_NEGATIVE_ONLY",
691
+ "REGIME_POSITIVE_ONLY",
692
+ "REGIME_SIGN_ONLY",
693
+ "REGIME_THREE_SIGN",
694
+ "REGIME_ZI_NEGATIVE",
695
+ "REGIME_ZI_POSITIVE",
696
+ "ZeroInflatedImputer",
697
+ "ZeroInflatedImputerResults",
698
+ ]
@@ -3,11 +3,17 @@ Utility function to format various imputation outputs into a unified CSV format
3
3
  """
4
4
 
5
5
  import json
6
+ import logging
6
7
  from typing import Any, Dict, List, Optional, Union
7
8
 
8
9
  import numpy as np
9
10
  import pandas as pd
10
11
 
12
+ from microimpute.utils.type_handling import VariableTypeDetector
13
+
14
+
15
+ log = logging.getLogger(__name__)
16
+
11
17
 
12
18
  def _compute_histogram_data(
13
19
  donor_values: np.ndarray,
@@ -118,7 +124,7 @@ def _compute_categorical_distribution(
118
124
  pd.Series(receiver_props) / receiver_values.count() * 100
119
125
  ).tolist()
120
126
  else:
121
- categories = sorted(all_categories)
127
+ categories = sorted(all_categories, key=lambda value: str(value))
122
128
  donor_props = [
123
129
  (donor_counts.get(cat, 0) / donor_values.count() * 100)
124
130
  for cat in categories
@@ -264,9 +270,38 @@ def _validate_imputed_variables(
264
270
  )
265
271
 
266
272
 
273
+ def _is_categorical_distribution_variable(
274
+ series: pd.Series,
275
+ variable_name: str,
276
+ ) -> bool:
277
+ """Return whether a variable should use categorical distribution rows."""
278
+ detector = VariableTypeDetector()
279
+ var_type, _ = detector.categorize_variable(series, variable_name, log)
280
+ return var_type in ["bool", "categorical", "numeric_categorical"]
281
+
282
+
283
+ def _extract_cv_results(autoimpute_result: Any) -> Optional[Dict[str, Dict[str, Any]]]:
284
+ """Normalize supported autoimpute result shapes to a cv_results dict."""
285
+ if autoimpute_result is None:
286
+ return None
287
+
288
+ if hasattr(autoimpute_result, "cv_results"):
289
+ cv_results = getattr(autoimpute_result, "cv_results")
290
+ return cv_results if isinstance(cv_results, dict) else None
291
+
292
+ if not isinstance(autoimpute_result, dict):
293
+ return None
294
+
295
+ wrapped_cv_results = autoimpute_result.get("cv_results")
296
+ if isinstance(wrapped_cv_results, dict):
297
+ return wrapped_cv_results
298
+
299
+ return autoimpute_result
300
+
301
+
267
302
  def format_csv(
268
303
  output_path: Optional[str] = None,
269
- autoimpute_result: Optional[Dict] = None,
304
+ autoimpute_result: Optional[Any] = None,
270
305
  comparison_metrics_df: Optional[pd.DataFrame] = None,
271
306
  distribution_comparison_df: Optional[pd.DataFrame] = None,
272
307
  predictor_correlations: Optional[Dict[str, pd.DataFrame]] = None,
@@ -288,7 +323,8 @@ def format_csv(
288
323
 
289
324
  autoimpute_result : Dict, optional
290
325
  Result from autoimpute containing cv_results with benchmark losses.
291
- Expected structure: {method: {'quantile_loss': {...}, 'log_loss': {...}}}
326
+ Supports an AutoImputeResult object, a {'cv_results': ...} wrapper,
327
+ or the direct structure {method: {'quantile_loss': {...}, 'log_loss': {...}}}.
292
328
 
293
329
  comparison_metrics_df : pd.DataFrame, optional
294
330
  DataFrame from compare_metrics() with columns:
@@ -331,7 +367,8 @@ def format_csv(
331
367
  -------
332
368
  pd.DataFrame
333
369
  Unified long-format DataFrame with columns:
334
- ['type', 'method', 'variable', 'quantile', 'metric_name', 'metric_value', 'split', 'additional_info']
370
+ ['type', 'method', 'variable', 'quantile', 'metric_name', 'metric_value',
371
+ 'metric_std', 'split', 'additional_info']
335
372
 
336
373
  Raises
337
374
  ------
@@ -342,12 +379,13 @@ def format_csv(
342
379
  rows = []
343
380
 
344
381
  # 1. Process autoimpute benchmark losses from cv_results
345
- if autoimpute_result and isinstance(autoimpute_result, dict):
346
- first_value = next(iter(autoimpute_result.values()), None)
382
+ cv_results = _extract_cv_results(autoimpute_result)
383
+ if cv_results:
384
+ first_value = next(iter(cv_results.values()), None)
347
385
  if isinstance(first_value, dict) and (
348
386
  "quantile_loss" in first_value or "log_loss" in first_value
349
387
  ):
350
- for method, cv_result in autoimpute_result.items():
388
+ for method, cv_result in cv_results.items():
351
389
  # Append "_best_method" if this is the best method
352
390
  method_label = (
353
391
  f"{method}_best_method" if method == best_method_name else method
@@ -647,9 +685,7 @@ def format_csv(
647
685
  # Generate histogram data for each imputed variable
648
686
  for var in imputed_variables:
649
687
  # Check if variable is categorical or numerical
650
- if pd.api.types.is_string_dtype(donor_data[var]) or isinstance(
651
- donor_data[var].dtype, pd.CategoricalDtype
652
- ):
688
+ if _is_categorical_distribution_variable(donor_data[var], var):
653
689
  # Categorical variable
654
690
  hist_data = _compute_categorical_distribution(
655
691
  donor_data[var], receiver_data[var], var
@@ -700,7 +736,7 @@ def format_csv(
700
736
  else:
701
737
  try:
702
738
  return float(q)
703
- except:
739
+ except (TypeError, ValueError):
704
740
  return q
705
741
 
706
742
  df["quantile"] = df["quantile"].apply(convert_quantile)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: microimpute
3
- Version: 2.0.3
3
+ Version: 2.0.4
4
4
  Summary: Benchmarking imputation methods for microdata
5
5
  Author-email: María Juaristi <juaristi@uni.minerva.edu>, Nikhil Woodruff <nikhil.woodruff@outlook.com>
6
6
  Requires-Python: <3.15,>=3.12
@@ -23,6 +23,7 @@ microimpute/models/mdn.py
23
23
  microimpute/models/ols.py
24
24
  microimpute/models/qrf.py
25
25
  microimpute/models/quantreg.py
26
+ microimpute/models/zero_inflated.py
26
27
  microimpute/utils/__init__.py
27
28
  microimpute/utils/dashboard_formatter.py
28
29
  microimpute/utils/data.py
@@ -7,7 +7,7 @@ build-backend = "setuptools.build_meta"
7
7
 
8
8
  [project]
9
9
  name = "microimpute"
10
- version = "2.0.3"
10
+ version = "2.0.4"
11
11
  description = "Benchmarking imputation methods for microdata"
12
12
  readme = "README.md"
13
13
  authors = [
@@ -299,6 +299,33 @@ class TestFormatCSVBasic:
299
299
  class TestFormatCSVBenchmarkLoss:
300
300
  """Tests for benchmark_loss type formatting."""
301
301
 
302
+ def test_benchmark_loss_from_cv_results_wrapper(self, sample_autoimpute_result):
303
+ """Test benchmark loss formatting from {'cv_results': ...} wrappers."""
304
+ result = format_csv(
305
+ autoimpute_result={"cv_results": sample_autoimpute_result},
306
+ )
307
+
308
+ benchmark_rows = result[result["type"] == "benchmark_loss"]
309
+ assert len(benchmark_rows) > 0
310
+ assert {"OLS", "QRF"}.issubset(set(benchmark_rows["method"]))
311
+
312
+ def test_benchmark_loss_from_autoimpute_result_object(
313
+ self, sample_autoimpute_result
314
+ ):
315
+ """Test benchmark loss formatting from objects exposing cv_results."""
316
+
317
+ class ResultLike:
318
+ pass
319
+
320
+ result_like = ResultLike()
321
+ result_like.cv_results = sample_autoimpute_result
322
+
323
+ result = format_csv(autoimpute_result=result_like)
324
+
325
+ benchmark_rows = result[result["type"] == "benchmark_loss"]
326
+ assert len(benchmark_rows) > 0
327
+ assert {"OLS", "QRF"}.issubset(set(benchmark_rows["method"]))
328
+
302
329
  def test_benchmark_loss_from_autoimpute(self, sample_autoimpute_result):
303
330
  """Test benchmark loss formatting from autoimpute results."""
304
331
  with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".csv") as f:
@@ -943,6 +970,31 @@ class TestDistributionBins:
943
970
  imputed_variables=imputed_variables,
944
971
  )
945
972
 
973
+ def test_numeric_categorical_distribution_uses_categorical_rows(self):
974
+ """Test numeric categorical variables produce categorical distributions."""
975
+ donor_data = pd.DataFrame(
976
+ {
977
+ "rating": [1, 1, 2, 2, 3, 3],
978
+ "flag": [0, 1, 1, 0, 1, 0],
979
+ }
980
+ )
981
+ receiver_data = pd.DataFrame(
982
+ {
983
+ "rating": [1, 2, 2, 3],
984
+ "flag": [1, 1, 0, 0],
985
+ }
986
+ )
987
+
988
+ result = format_csv(
989
+ donor_data=donor_data,
990
+ receiver_data=receiver_data,
991
+ imputed_variables=["rating", "flag"],
992
+ )
993
+
994
+ dist_bins = result[result["type"] == "distribution_bins"]
995
+ assert set(dist_bins["variable"]) == {"rating", "flag"}
996
+ assert set(dist_bins["metric_name"]) == {"categorical_distribution"}
997
+
946
998
 
947
999
  class TestEdgeCases:
948
1000
  """Test edge cases and error handling."""
File without changes
File without changes