microimpute 2.0.3__tar.gz → 2.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {microimpute-2.0.3 → microimpute-2.1.0}/PKG-INFO +1 -1
- {microimpute-2.0.3 → microimpute-2.1.0}/microimpute/models/imputer.py +110 -13
- {microimpute-2.0.3 → microimpute-2.1.0}/microimpute/models/qrf.py +103 -10
- microimpute-2.1.0/microimpute/models/zero_inflated.py +698 -0
- {microimpute-2.0.3 → microimpute-2.1.0}/microimpute/utils/dashboard_formatter.py +47 -11
- {microimpute-2.0.3 → microimpute-2.1.0}/microimpute.egg-info/PKG-INFO +1 -1
- {microimpute-2.0.3 → microimpute-2.1.0}/microimpute.egg-info/SOURCES.txt +1 -0
- {microimpute-2.0.3 → microimpute-2.1.0}/pyproject.toml +1 -1
- {microimpute-2.0.3 → microimpute-2.1.0}/tests/test_dashboard_formatter.py +52 -0
- {microimpute-2.0.3 → microimpute-2.1.0}/README.md +0 -0
- {microimpute-2.0.3 → microimpute-2.1.0}/microimpute/__init__.py +0 -0
- {microimpute-2.0.3 → microimpute-2.1.0}/microimpute/comparisons/__init__.py +0 -0
- {microimpute-2.0.3 → microimpute-2.1.0}/microimpute/comparisons/autoimpute.py +0 -0
- {microimpute-2.0.3 → microimpute-2.1.0}/microimpute/comparisons/autoimpute_helpers.py +0 -0
- {microimpute-2.0.3 → microimpute-2.1.0}/microimpute/comparisons/imputations.py +0 -0
- {microimpute-2.0.3 → microimpute-2.1.0}/microimpute/comparisons/metrics.py +0 -0
- {microimpute-2.0.3 → microimpute-2.1.0}/microimpute/comparisons/validation.py +0 -0
- {microimpute-2.0.3 → microimpute-2.1.0}/microimpute/config.py +0 -0
- {microimpute-2.0.3 → microimpute-2.1.0}/microimpute/evaluations/__init__.py +0 -0
- {microimpute-2.0.3 → microimpute-2.1.0}/microimpute/evaluations/cross_validation.py +0 -0
- {microimpute-2.0.3 → microimpute-2.1.0}/microimpute/evaluations/predictor_analysis.py +0 -0
- {microimpute-2.0.3 → microimpute-2.1.0}/microimpute/models/__init__.py +0 -0
- {microimpute-2.0.3 → microimpute-2.1.0}/microimpute/models/matching.py +0 -0
- {microimpute-2.0.3 → microimpute-2.1.0}/microimpute/models/mdn.py +0 -0
- {microimpute-2.0.3 → microimpute-2.1.0}/microimpute/models/ols.py +0 -0
- {microimpute-2.0.3 → microimpute-2.1.0}/microimpute/models/quantreg.py +0 -0
- {microimpute-2.0.3 → microimpute-2.1.0}/microimpute/utils/__init__.py +0 -0
- {microimpute-2.0.3 → microimpute-2.1.0}/microimpute/utils/data.py +0 -0
- {microimpute-2.0.3 → microimpute-2.1.0}/microimpute/utils/statmatch_hotdeck.py +0 -0
- {microimpute-2.0.3 → microimpute-2.1.0}/microimpute/utils/type_handling.py +0 -0
- {microimpute-2.0.3 → microimpute-2.1.0}/microimpute/visualizations/__init__.py +0 -0
- {microimpute-2.0.3 → microimpute-2.1.0}/microimpute/visualizations/comparison_plots.py +0 -0
- {microimpute-2.0.3 → microimpute-2.1.0}/microimpute/visualizations/performance_plots.py +0 -0
- {microimpute-2.0.3 → microimpute-2.1.0}/microimpute.egg-info/dependency_links.txt +0 -0
- {microimpute-2.0.3 → microimpute-2.1.0}/microimpute.egg-info/requires.txt +0 -0
- {microimpute-2.0.3 → microimpute-2.1.0}/microimpute.egg-info/top_level.txt +0 -0
- {microimpute-2.0.3 → microimpute-2.1.0}/setup.cfg +0 -0
- {microimpute-2.0.3 → microimpute-2.1.0}/tests/test_autoimpute.py +0 -0
- {microimpute-2.0.3 → microimpute-2.1.0}/tests/test_basic.py +0 -0
- {microimpute-2.0.3 → microimpute-2.1.0}/tests/test_data_preprocessing.py +0 -0
- {microimpute-2.0.3 → microimpute-2.1.0}/tests/test_metrics.py +0 -0
- {microimpute-2.0.3 → microimpute-2.1.0}/tests/test_predictor_analysis.py +0 -0
- {microimpute-2.0.3 → microimpute-2.1.0}/tests/test_quantile_comparison.py +0 -0
- {microimpute-2.0.3 → microimpute-2.1.0}/tests/test_smoke_qrf.py +0 -0
- {microimpute-2.0.3 → microimpute-2.1.0}/tests/test_type_handling.py +0 -0
- {microimpute-2.0.3 → microimpute-2.1.0}/tests/test_visualizations.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: microimpute
|
|
3
|
-
Version: 2.0.3
|
|
3
|
+
Version: 2.1.0
|
|
4
4
|
Summary: Benchmarking imputation methods for microdata
|
|
5
5
|
Author-email: María Juaristi <juaristi@uni.minerva.edu>, Nikhil Woodruff <nikhil.woodruff@outlook.com>
|
|
6
6
|
Requires-Python: <3.15,>=3.12
|
|
@@ -14,7 +14,7 @@ from typing import Any, Dict, List, Optional, Set, Tuple, Union
|
|
|
14
14
|
|
|
15
15
|
import numpy as np
|
|
16
16
|
import pandas as pd
|
|
17
|
-
from pydantic import
|
|
17
|
+
from pydantic import validate_call
|
|
18
18
|
|
|
19
19
|
from microimpute.config import RANDOM_STATE, VALIDATE_CONFIG
|
|
20
20
|
from microimpute.utils.type_handling import (
|
|
@@ -104,6 +104,7 @@ class Imputer(ABC):
|
|
|
104
104
|
data: pd.DataFrame,
|
|
105
105
|
imputed_variables: List[str],
|
|
106
106
|
not_numeric_categorical: Optional[List[str]] = None,
|
|
107
|
+
target_fit_masks: Optional[Dict[str, pd.Series]] = None,
|
|
107
108
|
) -> None:
|
|
108
109
|
"""Identify and track variable types for imputation targets.
|
|
109
110
|
|
|
@@ -113,21 +114,27 @@ class Imputer(ABC):
|
|
|
113
114
|
not_numeric_categorical: Optional list of variable names that should
|
|
114
115
|
be treated as numeric even if they would normally be detected as
|
|
115
116
|
numeric_categorical.
|
|
117
|
+
target_fit_masks: Optional target-specific row masks to use when
|
|
118
|
+
inferring target type and constants.
|
|
116
119
|
"""
|
|
117
120
|
detector = VariableTypeDetector()
|
|
118
121
|
not_numeric_categorical = not_numeric_categorical or []
|
|
122
|
+
target_fit_masks = target_fit_masks or {}
|
|
119
123
|
|
|
120
124
|
for var in imputed_variables:
|
|
121
125
|
if var not in data.columns:
|
|
122
126
|
continue
|
|
127
|
+
target_data = data[var]
|
|
128
|
+
if var in target_fit_masks:
|
|
129
|
+
target_data = target_data.loc[target_fit_masks[var]]
|
|
123
130
|
|
|
124
131
|
# First check if the variable has a constant value
|
|
125
|
-
unique_values =
|
|
132
|
+
unique_values = target_data.dropna().unique()
|
|
126
133
|
if len(unique_values) == 1:
|
|
127
134
|
constant_val = unique_values[0]
|
|
128
135
|
self.constant_targets[var] = {
|
|
129
136
|
"value": constant_val,
|
|
130
|
-
"dtype":
|
|
137
|
+
"dtype": target_data.dtype,
|
|
131
138
|
}
|
|
132
139
|
self.logger.warning(
|
|
133
140
|
f"Target variable '{var}' has constant value {constant_val}. "
|
|
@@ -136,7 +143,7 @@ class Imputer(ABC):
|
|
|
136
143
|
continue
|
|
137
144
|
|
|
138
145
|
var_type, categories = detector.categorize_variable(
|
|
139
|
-
|
|
146
|
+
target_data,
|
|
140
147
|
var,
|
|
141
148
|
self.logger,
|
|
142
149
|
force_numeric=(var in not_numeric_categorical),
|
|
@@ -145,7 +152,7 @@ class Imputer(ABC):
|
|
|
145
152
|
if var_type == "bool":
|
|
146
153
|
self.boolean_targets[var] = {
|
|
147
154
|
"type": "boolean",
|
|
148
|
-
"dtype":
|
|
155
|
+
"dtype": target_data.dtype,
|
|
149
156
|
}
|
|
150
157
|
self.logger.info(f"Identified boolean target: {var}")
|
|
151
158
|
|
|
@@ -153,7 +160,7 @@ class Imputer(ABC):
|
|
|
153
160
|
self.categorical_targets[var] = {
|
|
154
161
|
"type": var_type,
|
|
155
162
|
"categories": categories,
|
|
156
|
-
"dtype":
|
|
163
|
+
"dtype": target_data.dtype,
|
|
157
164
|
}
|
|
158
165
|
self.logger.info(
|
|
159
166
|
f"Identified categorical target: {var} with {len(categories) if categories else 0} categories"
|
|
@@ -163,6 +170,30 @@ class Imputer(ABC):
|
|
|
163
170
|
self.numeric_targets.append(var)
|
|
164
171
|
self.logger.debug(f"Identified numeric target: {var}")
|
|
165
172
|
|
|
173
|
+
def _coerce_fit_filter(
|
|
174
|
+
self,
|
|
175
|
+
X_train: pd.DataFrame,
|
|
176
|
+
fit_filter: Union[str, np.ndarray, pd.Series, List[bool], Tuple[bool, ...]],
|
|
177
|
+
*,
|
|
178
|
+
name: str,
|
|
179
|
+
) -> pd.Series:
|
|
180
|
+
"""Normalize a row-filter input to a boolean Series on ``X_train``."""
|
|
181
|
+
if isinstance(fit_filter, str):
|
|
182
|
+
if fit_filter not in X_train.columns:
|
|
183
|
+
raise ValueError(f"{name} column '{fit_filter}' not found in X_train")
|
|
184
|
+
mask = X_train[fit_filter]
|
|
185
|
+
elif isinstance(fit_filter, pd.Series):
|
|
186
|
+
mask = fit_filter.reindex(X_train.index)
|
|
187
|
+
else:
|
|
188
|
+
mask = pd.Series(fit_filter, index=X_train.index)
|
|
189
|
+
|
|
190
|
+
if len(mask) != len(X_train):
|
|
191
|
+
raise ValueError(f"{name} must have length {len(X_train)}, got {len(mask)}")
|
|
192
|
+
if mask.isna().any():
|
|
193
|
+
raise ValueError(f"{name} contains missing values")
|
|
194
|
+
|
|
195
|
+
return mask.astype(bool)
|
|
196
|
+
|
|
166
197
|
@validate_call(config=VALIDATE_CONFIG)
|
|
167
198
|
def preprocess_data_types(
|
|
168
199
|
self,
|
|
@@ -216,6 +247,12 @@ class Imputer(ABC):
|
|
|
216
247
|
weight_col: Optional[Union[str, np.ndarray, pd.Series]] = None,
|
|
217
248
|
skip_missing: bool = False,
|
|
218
249
|
not_numeric_categorical: Optional[List[str]] = None,
|
|
250
|
+
row_filter: Optional[
|
|
251
|
+
Union[str, np.ndarray, pd.Series, List[bool], Tuple[bool, ...]]
|
|
252
|
+
] = None,
|
|
253
|
+
target_filters: Optional[
|
|
254
|
+
Dict[str, Union[str, np.ndarray, pd.Series, List[bool], Tuple[bool, ...]]]
|
|
255
|
+
] = None,
|
|
219
256
|
**kwargs: Any,
|
|
220
257
|
) -> Any: # Returns ImputerResults
|
|
221
258
|
"""Fit the model to the training data.
|
|
@@ -229,6 +266,13 @@ class Imputer(ABC):
|
|
|
229
266
|
not_numeric_categorical: Optional list of variable names that should
|
|
230
267
|
be treated as numeric even if they would normally be detected as
|
|
231
268
|
numeric_categorical.
|
|
269
|
+
row_filter: Optional common row mask, or the name of a boolean
|
|
270
|
+
column in X_train, selecting rows eligible for all targets.
|
|
271
|
+
target_filters: Optional mapping from imputed variable name to a
|
|
272
|
+
target-specific row mask, or the name of a boolean column in
|
|
273
|
+
X_train. Target-specific filters are combined with row_filter.
|
|
274
|
+
They are supported by models that fit one model per target,
|
|
275
|
+
such as QRF.
|
|
232
276
|
**kwargs: Additional model-specific parameters.
|
|
233
277
|
|
|
234
278
|
Returns:
|
|
@@ -240,6 +284,48 @@ class Imputer(ABC):
|
|
|
240
284
|
NotImplementedError: If method is not implemented by subclass.
|
|
241
285
|
"""
|
|
242
286
|
original_predictors = predictors.copy()
|
|
287
|
+
target_filters = target_filters or {}
|
|
288
|
+
unknown_target_filters = set(target_filters) - set(imputed_variables)
|
|
289
|
+
if unknown_target_filters:
|
|
290
|
+
raise ValueError(
|
|
291
|
+
"target_filters contains variables not in imputed_variables: "
|
|
292
|
+
f"{sorted(unknown_target_filters)}"
|
|
293
|
+
)
|
|
294
|
+
|
|
295
|
+
base_mask = pd.Series(True, index=X_train.index)
|
|
296
|
+
if row_filter is not None:
|
|
297
|
+
base_mask = self._coerce_fit_filter(
|
|
298
|
+
X_train,
|
|
299
|
+
row_filter,
|
|
300
|
+
name="row_filter",
|
|
301
|
+
)
|
|
302
|
+
|
|
303
|
+
target_fit_masks = {}
|
|
304
|
+
for variable, target_filter in target_filters.items():
|
|
305
|
+
target_fit_masks[variable] = (
|
|
306
|
+
self._coerce_fit_filter(
|
|
307
|
+
X_train,
|
|
308
|
+
target_filter,
|
|
309
|
+
name=f"target_filters[{variable!r}]",
|
|
310
|
+
)
|
|
311
|
+
& base_mask
|
|
312
|
+
)
|
|
313
|
+
|
|
314
|
+
if target_filters and not getattr(self, "supports_target_filters", False):
|
|
315
|
+
raise NotImplementedError(
|
|
316
|
+
f"{type(self).__name__} does not support target_filters"
|
|
317
|
+
)
|
|
318
|
+
|
|
319
|
+
if not base_mask.all():
|
|
320
|
+
if isinstance(weight_col, np.ndarray):
|
|
321
|
+
weight_col = pd.Series(weight_col, index=base_mask.index).loc[base_mask]
|
|
322
|
+
elif isinstance(weight_col, pd.Series):
|
|
323
|
+
weight_col = weight_col.reindex(base_mask.index).loc[base_mask]
|
|
324
|
+
X_train = X_train.loc[base_mask].copy()
|
|
325
|
+
target_fit_masks = {
|
|
326
|
+
variable: mask.loc[X_train.index]
|
|
327
|
+
for variable, mask in target_fit_masks.items()
|
|
328
|
+
}
|
|
243
329
|
|
|
244
330
|
try:
|
|
245
331
|
# Handle missing variables if skip_missing is enabled
|
|
@@ -288,7 +374,12 @@ class Imputer(ABC):
|
|
|
288
374
|
)
|
|
289
375
|
|
|
290
376
|
# Identify target types BEFORE preprocessing
|
|
291
|
-
self.identify_target_types(
|
|
377
|
+
self.identify_target_types(
|
|
378
|
+
X_train,
|
|
379
|
+
imputed_variables,
|
|
380
|
+
not_numeric_categorical,
|
|
381
|
+
target_fit_masks=target_fit_masks,
|
|
382
|
+
)
|
|
292
383
|
|
|
293
384
|
X_train, predictors, imputed_variables, imputed_vars_dummy_info = (
|
|
294
385
|
self.preprocess_data_types(
|
|
@@ -319,17 +410,23 @@ class Imputer(ABC):
|
|
|
319
410
|
)
|
|
320
411
|
|
|
321
412
|
# Defer actual training to subclass with all parameters
|
|
413
|
+
fit_kwargs = {
|
|
414
|
+
"categorical_targets": self.categorical_targets,
|
|
415
|
+
"boolean_targets": self.boolean_targets,
|
|
416
|
+
"numeric_targets": self.numeric_targets,
|
|
417
|
+
"constant_targets": self.constant_targets,
|
|
418
|
+
"sample_weight": sample_weight,
|
|
419
|
+
**kwargs,
|
|
420
|
+
}
|
|
421
|
+
if target_fit_masks:
|
|
422
|
+
fit_kwargs["target_fit_masks"] = target_fit_masks
|
|
423
|
+
|
|
322
424
|
fitted_model = self._fit(
|
|
323
425
|
X_train,
|
|
324
426
|
self.predictors,
|
|
325
427
|
self.imputed_variables,
|
|
326
428
|
self.original_predictors,
|
|
327
|
-
|
|
328
|
-
boolean_targets=self.boolean_targets,
|
|
329
|
-
numeric_targets=self.numeric_targets,
|
|
330
|
-
constant_targets=self.constant_targets,
|
|
331
|
-
sample_weight=sample_weight,
|
|
332
|
-
**kwargs,
|
|
429
|
+
**fit_kwargs,
|
|
333
430
|
)
|
|
334
431
|
return fitted_model
|
|
335
432
|
|
|
@@ -567,6 +567,8 @@ class QRF(Imputer):
|
|
|
567
567
|
The underlying QRF implementation is from the quantile_forest package.
|
|
568
568
|
"""
|
|
569
569
|
|
|
570
|
+
supports_target_filters = True
|
|
571
|
+
|
|
570
572
|
def __init__(
|
|
571
573
|
self,
|
|
572
574
|
log_level: Optional[str] = "WARNING",
|
|
@@ -738,6 +740,65 @@ class QRF(Imputer):
|
|
|
738
740
|
# Regular QRF fit
|
|
739
741
|
model.fit(X, y, sample_weight=sample_weight, **model_params)
|
|
740
742
|
|
|
743
|
+
def _target_fit_data(
|
|
744
|
+
self,
|
|
745
|
+
X_train: pd.DataFrame,
|
|
746
|
+
variable: str,
|
|
747
|
+
target_fit_masks: Optional[Dict[str, pd.Series]],
|
|
748
|
+
sample_weight: Optional[np.ndarray],
|
|
749
|
+
) -> Tuple[pd.DataFrame, Optional[np.ndarray]]:
|
|
750
|
+
"""Return training rows and weights for one target variable."""
|
|
751
|
+
if not target_fit_masks or variable not in target_fit_masks:
|
|
752
|
+
return X_train, sample_weight
|
|
753
|
+
|
|
754
|
+
mask = (
|
|
755
|
+
target_fit_masks[variable].reindex(X_train.index).fillna(False).astype(bool)
|
|
756
|
+
)
|
|
757
|
+
if not mask.any():
|
|
758
|
+
raise ValueError(f"No training rows selected for target '{variable}'")
|
|
759
|
+
|
|
760
|
+
target_train = X_train.loc[mask]
|
|
761
|
+
target_sample_weight = None
|
|
762
|
+
if sample_weight is not None:
|
|
763
|
+
target_sample_weight = np.asarray(sample_weight, dtype=float)[
|
|
764
|
+
mask.to_numpy()
|
|
765
|
+
]
|
|
766
|
+
|
|
767
|
+
selected_rows = len(target_train)
|
|
768
|
+
if (
|
|
769
|
+
self.max_train_samples is not None
|
|
770
|
+
and len(target_train) > self.max_train_samples
|
|
771
|
+
):
|
|
772
|
+
try:
|
|
773
|
+
variable_offset = (self.imputed_variables or []).index(variable)
|
|
774
|
+
except ValueError:
|
|
775
|
+
variable_offset = 0
|
|
776
|
+
seed = None if self.seed is None else self.seed + variable_offset
|
|
777
|
+
rng = np.random.default_rng(seed)
|
|
778
|
+
sel = rng.choice(
|
|
779
|
+
len(target_train), size=self.max_train_samples, replace=False
|
|
780
|
+
)
|
|
781
|
+
target_train = target_train.iloc[sel]
|
|
782
|
+
if target_sample_weight is not None:
|
|
783
|
+
target_sample_weight = target_sample_weight[sel]
|
|
784
|
+
self.logger.info(
|
|
785
|
+
"Subsampling target '%s' training data from %d to %d rows",
|
|
786
|
+
variable,
|
|
787
|
+
selected_rows,
|
|
788
|
+
self.max_train_samples,
|
|
789
|
+
)
|
|
790
|
+
|
|
791
|
+
dropped = len(X_train) - selected_rows
|
|
792
|
+
if dropped:
|
|
793
|
+
self.logger.info(
|
|
794
|
+
"Target filter for '%s' selected %d/%d training rows",
|
|
795
|
+
variable,
|
|
796
|
+
selected_rows,
|
|
797
|
+
len(X_train),
|
|
798
|
+
)
|
|
799
|
+
|
|
800
|
+
return target_train, target_sample_weight
|
|
801
|
+
|
|
741
802
|
def _get_memory_usage_info(self) -> str:
|
|
742
803
|
"""Get formatted memory usage information."""
|
|
743
804
|
if PSUTIL_AVAILABLE:
|
|
@@ -759,6 +820,7 @@ class QRF(Imputer):
|
|
|
759
820
|
constant_targets: Optional[Dict[str, Dict]] = None,
|
|
760
821
|
tune_hyperparameters: bool = False,
|
|
761
822
|
sample_weight: Optional[np.ndarray] = None,
|
|
823
|
+
target_fit_masks: Optional[Dict[str, pd.Series]] = None,
|
|
762
824
|
**qrf_kwargs: Any,
|
|
763
825
|
) -> QRFResults:
|
|
764
826
|
"""Fit the QRF model to the training data.
|
|
@@ -779,10 +841,17 @@ class QRF(Imputer):
|
|
|
779
841
|
RuntimeError: If model fitting fails.
|
|
780
842
|
"""
|
|
781
843
|
try:
|
|
844
|
+
target_fit_masks = target_fit_masks or {}
|
|
845
|
+
if tune_hyperparameters and target_fit_masks:
|
|
846
|
+
raise NotImplementedError(
|
|
847
|
+
"QRF target_filters are not supported with tune_hyperparameters"
|
|
848
|
+
)
|
|
849
|
+
|
|
782
850
|
# Subsample training data if max_train_samples is set
|
|
783
851
|
if (
|
|
784
852
|
self.max_train_samples is not None
|
|
785
853
|
and len(X_train) > self.max_train_samples
|
|
854
|
+
and not target_fit_masks
|
|
786
855
|
):
|
|
787
856
|
self.logger.info(
|
|
788
857
|
f"Subsampling training data from "
|
|
@@ -893,12 +962,18 @@ class QRF(Imputer):
|
|
|
893
962
|
|
|
894
963
|
# Create appropriate model based on variable type
|
|
895
964
|
model = self._create_model_for_variable(variable)
|
|
965
|
+
target_train, target_sample_weight = self._target_fit_data(
|
|
966
|
+
X_train,
|
|
967
|
+
variable,
|
|
968
|
+
target_fit_masks,
|
|
969
|
+
sample_weight,
|
|
970
|
+
)
|
|
896
971
|
self._fit_model(
|
|
897
972
|
model,
|
|
898
|
-
|
|
899
|
-
|
|
973
|
+
target_train[encoded_predictors],
|
|
974
|
+
target_train[variable],
|
|
900
975
|
variable,
|
|
901
|
-
sample_weight=
|
|
976
|
+
sample_weight=target_sample_weight,
|
|
902
977
|
**qrf_kwargs,
|
|
903
978
|
)
|
|
904
979
|
|
|
@@ -997,6 +1072,7 @@ class QRF(Imputer):
|
|
|
997
1072
|
qrf_kwargs,
|
|
998
1073
|
constant_targets,
|
|
999
1074
|
sample_weight=sample_weight,
|
|
1075
|
+
target_fit_masks=target_fit_masks,
|
|
1000
1076
|
)
|
|
1001
1077
|
|
|
1002
1078
|
# Memory cleanup after each batch
|
|
@@ -1051,12 +1127,18 @@ class QRF(Imputer):
|
|
|
1051
1127
|
model = self._create_model_for_variable(variable)
|
|
1052
1128
|
|
|
1053
1129
|
try:
|
|
1130
|
+
target_train, target_sample_weight = self._target_fit_data(
|
|
1131
|
+
X_train,
|
|
1132
|
+
variable,
|
|
1133
|
+
target_fit_masks,
|
|
1134
|
+
sample_weight,
|
|
1135
|
+
)
|
|
1054
1136
|
self._fit_model(
|
|
1055
1137
|
model,
|
|
1056
|
-
|
|
1057
|
-
|
|
1138
|
+
target_train[encoded_predictors],
|
|
1139
|
+
target_train[variable],
|
|
1058
1140
|
variable,
|
|
1059
|
-
sample_weight=
|
|
1141
|
+
sample_weight=target_sample_weight,
|
|
1060
1142
|
**qrf_kwargs,
|
|
1061
1143
|
)
|
|
1062
1144
|
|
|
@@ -1135,6 +1217,7 @@ class QRF(Imputer):
|
|
|
1135
1217
|
qrf_kwargs: Dict[str, Any],
|
|
1136
1218
|
constant_targets: Optional[Dict[str, Dict]] = None,
|
|
1137
1219
|
sample_weight: Optional[np.ndarray] = None,
|
|
1220
|
+
target_fit_masks: Optional[Dict[str, pd.Series]] = None,
|
|
1138
1221
|
) -> None:
|
|
1139
1222
|
"""Fit models for a batch of variables.
|
|
1140
1223
|
|
|
@@ -1165,12 +1248,16 @@ class QRF(Imputer):
|
|
|
1165
1248
|
current_predictors = _get_sequential_predictors(
|
|
1166
1249
|
predictors, imputed_variables, i
|
|
1167
1250
|
)
|
|
1251
|
+
dummy_processor = getattr(self, "dummy_processor", None)
|
|
1252
|
+
encoded_predictors = self._get_encoded_predictors(
|
|
1253
|
+
current_predictors, dummy_processor
|
|
1254
|
+
)
|
|
1168
1255
|
|
|
1169
1256
|
# Log detailed pre-imputation information
|
|
1170
1257
|
self.logger.info(
|
|
1171
1258
|
f"[{i + 1}/{len(imputed_variables)}] Starting imputation for '{variable}'"
|
|
1172
1259
|
)
|
|
1173
|
-
self.logger.info(f" Features: {len(
|
|
1260
|
+
self.logger.info(f" Features: {len(encoded_predictors)} predictors")
|
|
1174
1261
|
self.logger.info(f" Memory usage: {self._get_memory_usage_info()}")
|
|
1175
1262
|
|
|
1176
1263
|
# Create and fit model
|
|
@@ -1178,12 +1265,18 @@ class QRF(Imputer):
|
|
|
1178
1265
|
model = self._create_model_for_variable(variable)
|
|
1179
1266
|
|
|
1180
1267
|
try:
|
|
1268
|
+
target_train, target_sample_weight = self._target_fit_data(
|
|
1269
|
+
X_train,
|
|
1270
|
+
variable,
|
|
1271
|
+
target_fit_masks,
|
|
1272
|
+
sample_weight,
|
|
1273
|
+
)
|
|
1181
1274
|
self._fit_model(
|
|
1182
1275
|
model,
|
|
1183
|
-
|
|
1184
|
-
|
|
1276
|
+
target_train[encoded_predictors],
|
|
1277
|
+
target_train[variable],
|
|
1185
1278
|
variable,
|
|
1186
|
-
sample_weight=
|
|
1279
|
+
sample_weight=target_sample_weight,
|
|
1187
1280
|
**qrf_kwargs,
|
|
1188
1281
|
)
|
|
1189
1282
|
|