microimpute 2.0.3__tar.gz → 2.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46)
  1. {microimpute-2.0.3 → microimpute-2.1.0}/PKG-INFO +1 -1
  2. {microimpute-2.0.3 → microimpute-2.1.0}/microimpute/models/imputer.py +110 -13
  3. {microimpute-2.0.3 → microimpute-2.1.0}/microimpute/models/qrf.py +103 -10
  4. microimpute-2.1.0/microimpute/models/zero_inflated.py +698 -0
  5. {microimpute-2.0.3 → microimpute-2.1.0}/microimpute/utils/dashboard_formatter.py +47 -11
  6. {microimpute-2.0.3 → microimpute-2.1.0}/microimpute.egg-info/PKG-INFO +1 -1
  7. {microimpute-2.0.3 → microimpute-2.1.0}/microimpute.egg-info/SOURCES.txt +1 -0
  8. {microimpute-2.0.3 → microimpute-2.1.0}/pyproject.toml +1 -1
  9. {microimpute-2.0.3 → microimpute-2.1.0}/tests/test_dashboard_formatter.py +52 -0
  10. {microimpute-2.0.3 → microimpute-2.1.0}/README.md +0 -0
  11. {microimpute-2.0.3 → microimpute-2.1.0}/microimpute/__init__.py +0 -0
  12. {microimpute-2.0.3 → microimpute-2.1.0}/microimpute/comparisons/__init__.py +0 -0
  13. {microimpute-2.0.3 → microimpute-2.1.0}/microimpute/comparisons/autoimpute.py +0 -0
  14. {microimpute-2.0.3 → microimpute-2.1.0}/microimpute/comparisons/autoimpute_helpers.py +0 -0
  15. {microimpute-2.0.3 → microimpute-2.1.0}/microimpute/comparisons/imputations.py +0 -0
  16. {microimpute-2.0.3 → microimpute-2.1.0}/microimpute/comparisons/metrics.py +0 -0
  17. {microimpute-2.0.3 → microimpute-2.1.0}/microimpute/comparisons/validation.py +0 -0
  18. {microimpute-2.0.3 → microimpute-2.1.0}/microimpute/config.py +0 -0
  19. {microimpute-2.0.3 → microimpute-2.1.0}/microimpute/evaluations/__init__.py +0 -0
  20. {microimpute-2.0.3 → microimpute-2.1.0}/microimpute/evaluations/cross_validation.py +0 -0
  21. {microimpute-2.0.3 → microimpute-2.1.0}/microimpute/evaluations/predictor_analysis.py +0 -0
  22. {microimpute-2.0.3 → microimpute-2.1.0}/microimpute/models/__init__.py +0 -0
  23. {microimpute-2.0.3 → microimpute-2.1.0}/microimpute/models/matching.py +0 -0
  24. {microimpute-2.0.3 → microimpute-2.1.0}/microimpute/models/mdn.py +0 -0
  25. {microimpute-2.0.3 → microimpute-2.1.0}/microimpute/models/ols.py +0 -0
  26. {microimpute-2.0.3 → microimpute-2.1.0}/microimpute/models/quantreg.py +0 -0
  27. {microimpute-2.0.3 → microimpute-2.1.0}/microimpute/utils/__init__.py +0 -0
  28. {microimpute-2.0.3 → microimpute-2.1.0}/microimpute/utils/data.py +0 -0
  29. {microimpute-2.0.3 → microimpute-2.1.0}/microimpute/utils/statmatch_hotdeck.py +0 -0
  30. {microimpute-2.0.3 → microimpute-2.1.0}/microimpute/utils/type_handling.py +0 -0
  31. {microimpute-2.0.3 → microimpute-2.1.0}/microimpute/visualizations/__init__.py +0 -0
  32. {microimpute-2.0.3 → microimpute-2.1.0}/microimpute/visualizations/comparison_plots.py +0 -0
  33. {microimpute-2.0.3 → microimpute-2.1.0}/microimpute/visualizations/performance_plots.py +0 -0
  34. {microimpute-2.0.3 → microimpute-2.1.0}/microimpute.egg-info/dependency_links.txt +0 -0
  35. {microimpute-2.0.3 → microimpute-2.1.0}/microimpute.egg-info/requires.txt +0 -0
  36. {microimpute-2.0.3 → microimpute-2.1.0}/microimpute.egg-info/top_level.txt +0 -0
  37. {microimpute-2.0.3 → microimpute-2.1.0}/setup.cfg +0 -0
  38. {microimpute-2.0.3 → microimpute-2.1.0}/tests/test_autoimpute.py +0 -0
  39. {microimpute-2.0.3 → microimpute-2.1.0}/tests/test_basic.py +0 -0
  40. {microimpute-2.0.3 → microimpute-2.1.0}/tests/test_data_preprocessing.py +0 -0
  41. {microimpute-2.0.3 → microimpute-2.1.0}/tests/test_metrics.py +0 -0
  42. {microimpute-2.0.3 → microimpute-2.1.0}/tests/test_predictor_analysis.py +0 -0
  43. {microimpute-2.0.3 → microimpute-2.1.0}/tests/test_quantile_comparison.py +0 -0
  44. {microimpute-2.0.3 → microimpute-2.1.0}/tests/test_smoke_qrf.py +0 -0
  45. {microimpute-2.0.3 → microimpute-2.1.0}/tests/test_type_handling.py +0 -0
  46. {microimpute-2.0.3 → microimpute-2.1.0}/tests/test_visualizations.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: microimpute
3
- Version: 2.0.3
3
+ Version: 2.1.0
4
4
  Summary: Benchmarking imputation methods for microdata
5
5
  Author-email: María Juaristi <juaristi@uni.minerva.edu>, Nikhil Woodruff <nikhil.woodruff@outlook.com>
6
6
  Requires-Python: <3.15,>=3.12
@@ -14,7 +14,7 @@ from typing import Any, Dict, List, Optional, Set, Tuple, Union
14
14
 
15
15
  import numpy as np
16
16
  import pandas as pd
17
- from pydantic import SkipValidation, validate_call
17
+ from pydantic import validate_call
18
18
 
19
19
  from microimpute.config import RANDOM_STATE, VALIDATE_CONFIG
20
20
  from microimpute.utils.type_handling import (
@@ -104,6 +104,7 @@ class Imputer(ABC):
104
104
  data: pd.DataFrame,
105
105
  imputed_variables: List[str],
106
106
  not_numeric_categorical: Optional[List[str]] = None,
107
+ target_fit_masks: Optional[Dict[str, pd.Series]] = None,
107
108
  ) -> None:
108
109
  """Identify and track variable types for imputation targets.
109
110
 
@@ -113,21 +114,27 @@ class Imputer(ABC):
113
114
  not_numeric_categorical: Optional list of variable names that should
114
115
  be treated as numeric even if they would normally be detected as
115
116
  numeric_categorical.
117
+ target_fit_masks: Optional target-specific row masks to use when
118
+ inferring target type and constants.
116
119
  """
117
120
  detector = VariableTypeDetector()
118
121
  not_numeric_categorical = not_numeric_categorical or []
122
+ target_fit_masks = target_fit_masks or {}
119
123
 
120
124
  for var in imputed_variables:
121
125
  if var not in data.columns:
122
126
  continue
127
+ target_data = data[var]
128
+ if var in target_fit_masks:
129
+ target_data = target_data.loc[target_fit_masks[var]]
123
130
 
124
131
  # First check if the variable has a constant value
125
- unique_values = data[var].dropna().unique()
132
+ unique_values = target_data.dropna().unique()
126
133
  if len(unique_values) == 1:
127
134
  constant_val = unique_values[0]
128
135
  self.constant_targets[var] = {
129
136
  "value": constant_val,
130
- "dtype": data[var].dtype,
137
+ "dtype": target_data.dtype,
131
138
  }
132
139
  self.logger.warning(
133
140
  f"Target variable '{var}' has constant value {constant_val}. "
@@ -136,7 +143,7 @@ class Imputer(ABC):
136
143
  continue
137
144
 
138
145
  var_type, categories = detector.categorize_variable(
139
- data[var],
146
+ target_data,
140
147
  var,
141
148
  self.logger,
142
149
  force_numeric=(var in not_numeric_categorical),
@@ -145,7 +152,7 @@ class Imputer(ABC):
145
152
  if var_type == "bool":
146
153
  self.boolean_targets[var] = {
147
154
  "type": "boolean",
148
- "dtype": data[var].dtype,
155
+ "dtype": target_data.dtype,
149
156
  }
150
157
  self.logger.info(f"Identified boolean target: {var}")
151
158
 
@@ -153,7 +160,7 @@ class Imputer(ABC):
153
160
  self.categorical_targets[var] = {
154
161
  "type": var_type,
155
162
  "categories": categories,
156
- "dtype": data[var].dtype,
163
+ "dtype": target_data.dtype,
157
164
  }
158
165
  self.logger.info(
159
166
  f"Identified categorical target: {var} with {len(categories) if categories else 0} categories"
@@ -163,6 +170,30 @@ class Imputer(ABC):
163
170
  self.numeric_targets.append(var)
164
171
  self.logger.debug(f"Identified numeric target: {var}")
165
172
 
173
+ def _coerce_fit_filter(
174
+ self,
175
+ X_train: pd.DataFrame,
176
+ fit_filter: Union[str, np.ndarray, pd.Series, List[bool], Tuple[bool, ...]],
177
+ *,
178
+ name: str,
179
+ ) -> pd.Series:
180
+ """Normalize a row-filter input to a boolean Series on ``X_train``."""
181
+ if isinstance(fit_filter, str):
182
+ if fit_filter not in X_train.columns:
183
+ raise ValueError(f"{name} column '{fit_filter}' not found in X_train")
184
+ mask = X_train[fit_filter]
185
+ elif isinstance(fit_filter, pd.Series):
186
+ mask = fit_filter.reindex(X_train.index)
187
+ else:
188
+ mask = pd.Series(fit_filter, index=X_train.index)
189
+
190
+ if len(mask) != len(X_train):
191
+ raise ValueError(f"{name} must have length {len(X_train)}, got {len(mask)}")
192
+ if mask.isna().any():
193
+ raise ValueError(f"{name} contains missing values")
194
+
195
+ return mask.astype(bool)
196
+
166
197
  @validate_call(config=VALIDATE_CONFIG)
167
198
  def preprocess_data_types(
168
199
  self,
@@ -216,6 +247,12 @@ class Imputer(ABC):
216
247
  weight_col: Optional[Union[str, np.ndarray, pd.Series]] = None,
217
248
  skip_missing: bool = False,
218
249
  not_numeric_categorical: Optional[List[str]] = None,
250
+ row_filter: Optional[
251
+ Union[str, np.ndarray, pd.Series, List[bool], Tuple[bool, ...]]
252
+ ] = None,
253
+ target_filters: Optional[
254
+ Dict[str, Union[str, np.ndarray, pd.Series, List[bool], Tuple[bool, ...]]]
255
+ ] = None,
219
256
  **kwargs: Any,
220
257
  ) -> Any: # Returns ImputerResults
221
258
  """Fit the model to the training data.
@@ -229,6 +266,13 @@ class Imputer(ABC):
229
266
  not_numeric_categorical: Optional list of variable names that should
230
267
  be treated as numeric even if they would normally be detected as
231
268
  numeric_categorical.
269
+ row_filter: Optional common row mask, or the name of a boolean
270
+ column in X_train, selecting rows eligible for all targets.
271
+ target_filters: Optional mapping from imputed variable name to a
272
+ target-specific row mask, or the name of a boolean column in
273
+ X_train. Target-specific filters are combined with row_filter.
274
+ They are supported by models that fit one model per target,
275
+ such as QRF.
232
276
  **kwargs: Additional model-specific parameters.
233
277
 
234
278
  Returns:
@@ -240,6 +284,48 @@ class Imputer(ABC):
240
284
  NotImplementedError: If method is not implemented by subclass.
241
285
  """
242
286
  original_predictors = predictors.copy()
287
+ target_filters = target_filters or {}
288
+ unknown_target_filters = set(target_filters) - set(imputed_variables)
289
+ if unknown_target_filters:
290
+ raise ValueError(
291
+ "target_filters contains variables not in imputed_variables: "
292
+ f"{sorted(unknown_target_filters)}"
293
+ )
294
+
295
+ base_mask = pd.Series(True, index=X_train.index)
296
+ if row_filter is not None:
297
+ base_mask = self._coerce_fit_filter(
298
+ X_train,
299
+ row_filter,
300
+ name="row_filter",
301
+ )
302
+
303
+ target_fit_masks = {}
304
+ for variable, target_filter in target_filters.items():
305
+ target_fit_masks[variable] = (
306
+ self._coerce_fit_filter(
307
+ X_train,
308
+ target_filter,
309
+ name=f"target_filters[{variable!r}]",
310
+ )
311
+ & base_mask
312
+ )
313
+
314
+ if target_filters and not getattr(self, "supports_target_filters", False):
315
+ raise NotImplementedError(
316
+ f"{type(self).__name__} does not support target_filters"
317
+ )
318
+
319
+ if not base_mask.all():
320
+ if isinstance(weight_col, np.ndarray):
321
+ weight_col = pd.Series(weight_col, index=base_mask.index).loc[base_mask]
322
+ elif isinstance(weight_col, pd.Series):
323
+ weight_col = weight_col.reindex(base_mask.index).loc[base_mask]
324
+ X_train = X_train.loc[base_mask].copy()
325
+ target_fit_masks = {
326
+ variable: mask.loc[X_train.index]
327
+ for variable, mask in target_fit_masks.items()
328
+ }
243
329
 
244
330
  try:
245
331
  # Handle missing variables if skip_missing is enabled
@@ -288,7 +374,12 @@ class Imputer(ABC):
288
374
  )
289
375
 
290
376
  # Identify target types BEFORE preprocessing
291
- self.identify_target_types(X_train, imputed_variables, not_numeric_categorical)
377
+ self.identify_target_types(
378
+ X_train,
379
+ imputed_variables,
380
+ not_numeric_categorical,
381
+ target_fit_masks=target_fit_masks,
382
+ )
292
383
 
293
384
  X_train, predictors, imputed_variables, imputed_vars_dummy_info = (
294
385
  self.preprocess_data_types(
@@ -319,17 +410,23 @@ class Imputer(ABC):
319
410
  )
320
411
 
321
412
  # Defer actual training to subclass with all parameters
413
+ fit_kwargs = {
414
+ "categorical_targets": self.categorical_targets,
415
+ "boolean_targets": self.boolean_targets,
416
+ "numeric_targets": self.numeric_targets,
417
+ "constant_targets": self.constant_targets,
418
+ "sample_weight": sample_weight,
419
+ **kwargs,
420
+ }
421
+ if target_fit_masks:
422
+ fit_kwargs["target_fit_masks"] = target_fit_masks
423
+
322
424
  fitted_model = self._fit(
323
425
  X_train,
324
426
  self.predictors,
325
427
  self.imputed_variables,
326
428
  self.original_predictors,
327
- categorical_targets=self.categorical_targets,
328
- boolean_targets=self.boolean_targets,
329
- numeric_targets=self.numeric_targets,
330
- constant_targets=self.constant_targets,
331
- sample_weight=sample_weight,
332
- **kwargs,
429
+ **fit_kwargs,
333
430
  )
334
431
  return fitted_model
335
432
 
@@ -567,6 +567,8 @@ class QRF(Imputer):
567
567
  The underlying QRF implementation is from the quantile_forest package.
568
568
  """
569
569
 
570
+ supports_target_filters = True
571
+
570
572
  def __init__(
571
573
  self,
572
574
  log_level: Optional[str] = "WARNING",
@@ -738,6 +740,65 @@ class QRF(Imputer):
738
740
  # Regular QRF fit
739
741
  model.fit(X, y, sample_weight=sample_weight, **model_params)
740
742
 
743
+ def _target_fit_data(
744
+ self,
745
+ X_train: pd.DataFrame,
746
+ variable: str,
747
+ target_fit_masks: Optional[Dict[str, pd.Series]],
748
+ sample_weight: Optional[np.ndarray],
749
+ ) -> Tuple[pd.DataFrame, Optional[np.ndarray]]:
750
+ """Return training rows and weights for one target variable."""
751
+ if not target_fit_masks or variable not in target_fit_masks:
752
+ return X_train, sample_weight
753
+
754
+ mask = (
755
+ target_fit_masks[variable].reindex(X_train.index).fillna(False).astype(bool)
756
+ )
757
+ if not mask.any():
758
+ raise ValueError(f"No training rows selected for target '{variable}'")
759
+
760
+ target_train = X_train.loc[mask]
761
+ target_sample_weight = None
762
+ if sample_weight is not None:
763
+ target_sample_weight = np.asarray(sample_weight, dtype=float)[
764
+ mask.to_numpy()
765
+ ]
766
+
767
+ selected_rows = len(target_train)
768
+ if (
769
+ self.max_train_samples is not None
770
+ and len(target_train) > self.max_train_samples
771
+ ):
772
+ try:
773
+ variable_offset = (self.imputed_variables or []).index(variable)
774
+ except ValueError:
775
+ variable_offset = 0
776
+ seed = None if self.seed is None else self.seed + variable_offset
777
+ rng = np.random.default_rng(seed)
778
+ sel = rng.choice(
779
+ len(target_train), size=self.max_train_samples, replace=False
780
+ )
781
+ target_train = target_train.iloc[sel]
782
+ if target_sample_weight is not None:
783
+ target_sample_weight = target_sample_weight[sel]
784
+ self.logger.info(
785
+ "Subsampling target '%s' training data from %d to %d rows",
786
+ variable,
787
+ selected_rows,
788
+ self.max_train_samples,
789
+ )
790
+
791
+ dropped = len(X_train) - selected_rows
792
+ if dropped:
793
+ self.logger.info(
794
+ "Target filter for '%s' selected %d/%d training rows",
795
+ variable,
796
+ selected_rows,
797
+ len(X_train),
798
+ )
799
+
800
+ return target_train, target_sample_weight
801
+
741
802
  def _get_memory_usage_info(self) -> str:
742
803
  """Get formatted memory usage information."""
743
804
  if PSUTIL_AVAILABLE:
@@ -759,6 +820,7 @@ class QRF(Imputer):
759
820
  constant_targets: Optional[Dict[str, Dict]] = None,
760
821
  tune_hyperparameters: bool = False,
761
822
  sample_weight: Optional[np.ndarray] = None,
823
+ target_fit_masks: Optional[Dict[str, pd.Series]] = None,
762
824
  **qrf_kwargs: Any,
763
825
  ) -> QRFResults:
764
826
  """Fit the QRF model to the training data.
@@ -779,10 +841,17 @@ class QRF(Imputer):
779
841
  RuntimeError: If model fitting fails.
780
842
  """
781
843
  try:
844
+ target_fit_masks = target_fit_masks or {}
845
+ if tune_hyperparameters and target_fit_masks:
846
+ raise NotImplementedError(
847
+ "QRF target_filters are not supported with tune_hyperparameters"
848
+ )
849
+
782
850
  # Subsample training data if max_train_samples is set
783
851
  if (
784
852
  self.max_train_samples is not None
785
853
  and len(X_train) > self.max_train_samples
854
+ and not target_fit_masks
786
855
  ):
787
856
  self.logger.info(
788
857
  f"Subsampling training data from "
@@ -893,12 +962,18 @@ class QRF(Imputer):
893
962
 
894
963
  # Create appropriate model based on variable type
895
964
  model = self._create_model_for_variable(variable)
965
+ target_train, target_sample_weight = self._target_fit_data(
966
+ X_train,
967
+ variable,
968
+ target_fit_masks,
969
+ sample_weight,
970
+ )
896
971
  self._fit_model(
897
972
  model,
898
- X_train[encoded_predictors],
899
- X_train[variable],
973
+ target_train[encoded_predictors],
974
+ target_train[variable],
900
975
  variable,
901
- sample_weight=sample_weight,
976
+ sample_weight=target_sample_weight,
902
977
  **qrf_kwargs,
903
978
  )
904
979
 
@@ -997,6 +1072,7 @@ class QRF(Imputer):
997
1072
  qrf_kwargs,
998
1073
  constant_targets,
999
1074
  sample_weight=sample_weight,
1075
+ target_fit_masks=target_fit_masks,
1000
1076
  )
1001
1077
 
1002
1078
  # Memory cleanup after each batch
@@ -1051,12 +1127,18 @@ class QRF(Imputer):
1051
1127
  model = self._create_model_for_variable(variable)
1052
1128
 
1053
1129
  try:
1130
+ target_train, target_sample_weight = self._target_fit_data(
1131
+ X_train,
1132
+ variable,
1133
+ target_fit_masks,
1134
+ sample_weight,
1135
+ )
1054
1136
  self._fit_model(
1055
1137
  model,
1056
- X_train[encoded_predictors],
1057
- X_train[variable],
1138
+ target_train[encoded_predictors],
1139
+ target_train[variable],
1058
1140
  variable,
1059
- sample_weight=sample_weight,
1141
+ sample_weight=target_sample_weight,
1060
1142
  **qrf_kwargs,
1061
1143
  )
1062
1144
 
@@ -1135,6 +1217,7 @@ class QRF(Imputer):
1135
1217
  qrf_kwargs: Dict[str, Any],
1136
1218
  constant_targets: Optional[Dict[str, Dict]] = None,
1137
1219
  sample_weight: Optional[np.ndarray] = None,
1220
+ target_fit_masks: Optional[Dict[str, pd.Series]] = None,
1138
1221
  ) -> None:
1139
1222
  """Fit models for a batch of variables.
1140
1223
 
@@ -1165,12 +1248,16 @@ class QRF(Imputer):
1165
1248
  current_predictors = _get_sequential_predictors(
1166
1249
  predictors, imputed_variables, i
1167
1250
  )
1251
+ dummy_processor = getattr(self, "dummy_processor", None)
1252
+ encoded_predictors = self._get_encoded_predictors(
1253
+ current_predictors, dummy_processor
1254
+ )
1168
1255
 
1169
1256
  # Log detailed pre-imputation information
1170
1257
  self.logger.info(
1171
1258
  f"[{i + 1}/{len(imputed_variables)}] Starting imputation for '{variable}'"
1172
1259
  )
1173
- self.logger.info(f" Features: {len(current_predictors)} predictors")
1260
+ self.logger.info(f" Features: {len(encoded_predictors)} predictors")
1174
1261
  self.logger.info(f" Memory usage: {self._get_memory_usage_info()}")
1175
1262
 
1176
1263
  # Create and fit model
@@ -1178,12 +1265,18 @@ class QRF(Imputer):
1178
1265
  model = self._create_model_for_variable(variable)
1179
1266
 
1180
1267
  try:
1268
+ target_train, target_sample_weight = self._target_fit_data(
1269
+ X_train,
1270
+ variable,
1271
+ target_fit_masks,
1272
+ sample_weight,
1273
+ )
1181
1274
  self._fit_model(
1182
1275
  model,
1183
- X_train[current_predictors],
1184
- X_train[variable],
1276
+ target_train[encoded_predictors],
1277
+ target_train[variable],
1185
1278
  variable,
1186
- sample_weight=sample_weight,
1279
+ sample_weight=target_sample_weight,
1187
1280
  **qrf_kwargs,
1188
1281
  )
1189
1282