microimpute 2.0.2__tar.gz → 2.0.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {microimpute-2.0.2 → microimpute-2.0.3}/PKG-INFO +1 -1
- {microimpute-2.0.2 → microimpute-2.0.3}/microimpute/comparisons/autoimpute.py +10 -0
- {microimpute-2.0.2 → microimpute-2.0.3}/microimpute/comparisons/metrics.py +16 -4
- {microimpute-2.0.2 → microimpute-2.0.3}/microimpute/models/imputer.py +31 -11
- {microimpute-2.0.2 → microimpute-2.0.3}/microimpute/models/matching.py +13 -0
- {microimpute-2.0.2 → microimpute-2.0.3}/microimpute/models/mdn.py +35 -10
- {microimpute-2.0.2 → microimpute-2.0.3}/microimpute/models/ols.py +99 -29
- {microimpute-2.0.2 → microimpute-2.0.3}/microimpute/models/qrf.py +137 -29
- {microimpute-2.0.2 → microimpute-2.0.3}/microimpute/models/quantreg.py +45 -18
- {microimpute-2.0.2 → microimpute-2.0.3}/microimpute/utils/statmatch_hotdeck.py +70 -23
- {microimpute-2.0.2 → microimpute-2.0.3}/microimpute/utils/type_handling.py +61 -5
- {microimpute-2.0.2 → microimpute-2.0.3}/microimpute.egg-info/PKG-INFO +1 -1
- {microimpute-2.0.2 → microimpute-2.0.3}/microimpute.egg-info/SOURCES.txt +1 -0
- {microimpute-2.0.2 → microimpute-2.0.3}/pyproject.toml +1 -1
- {microimpute-2.0.2 → microimpute-2.0.3}/tests/test_autoimpute.py +45 -0
- {microimpute-2.0.2 → microimpute-2.0.3}/tests/test_metrics.py +24 -0
- microimpute-2.0.3/tests/test_type_handling.py +142 -0
- {microimpute-2.0.2 → microimpute-2.0.3}/README.md +0 -0
- {microimpute-2.0.2 → microimpute-2.0.3}/microimpute/__init__.py +0 -0
- {microimpute-2.0.2 → microimpute-2.0.3}/microimpute/comparisons/__init__.py +0 -0
- {microimpute-2.0.2 → microimpute-2.0.3}/microimpute/comparisons/autoimpute_helpers.py +0 -0
- {microimpute-2.0.2 → microimpute-2.0.3}/microimpute/comparisons/imputations.py +0 -0
- {microimpute-2.0.2 → microimpute-2.0.3}/microimpute/comparisons/validation.py +0 -0
- {microimpute-2.0.2 → microimpute-2.0.3}/microimpute/config.py +0 -0
- {microimpute-2.0.2 → microimpute-2.0.3}/microimpute/evaluations/__init__.py +0 -0
- {microimpute-2.0.2 → microimpute-2.0.3}/microimpute/evaluations/cross_validation.py +0 -0
- {microimpute-2.0.2 → microimpute-2.0.3}/microimpute/evaluations/predictor_analysis.py +0 -0
- {microimpute-2.0.2 → microimpute-2.0.3}/microimpute/models/__init__.py +0 -0
- {microimpute-2.0.2 → microimpute-2.0.3}/microimpute/utils/__init__.py +0 -0
- {microimpute-2.0.2 → microimpute-2.0.3}/microimpute/utils/dashboard_formatter.py +0 -0
- {microimpute-2.0.2 → microimpute-2.0.3}/microimpute/utils/data.py +0 -0
- {microimpute-2.0.2 → microimpute-2.0.3}/microimpute/visualizations/__init__.py +0 -0
- {microimpute-2.0.2 → microimpute-2.0.3}/microimpute/visualizations/comparison_plots.py +0 -0
- {microimpute-2.0.2 → microimpute-2.0.3}/microimpute/visualizations/performance_plots.py +0 -0
- {microimpute-2.0.2 → microimpute-2.0.3}/microimpute.egg-info/dependency_links.txt +0 -0
- {microimpute-2.0.2 → microimpute-2.0.3}/microimpute.egg-info/requires.txt +0 -0
- {microimpute-2.0.2 → microimpute-2.0.3}/microimpute.egg-info/top_level.txt +0 -0
- {microimpute-2.0.2 → microimpute-2.0.3}/setup.cfg +0 -0
- {microimpute-2.0.2 → microimpute-2.0.3}/tests/test_basic.py +0 -0
- {microimpute-2.0.2 → microimpute-2.0.3}/tests/test_dashboard_formatter.py +0 -0
- {microimpute-2.0.2 → microimpute-2.0.3}/tests/test_data_preprocessing.py +0 -0
- {microimpute-2.0.2 → microimpute-2.0.3}/tests/test_predictor_analysis.py +0 -0
- {microimpute-2.0.2 → microimpute-2.0.3}/tests/test_quantile_comparison.py +0 -0
- {microimpute-2.0.2 → microimpute-2.0.3}/tests/test_smoke_qrf.py +0 -0
- {microimpute-2.0.2 → microimpute-2.0.3}/tests/test_visualizations.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: microimpute
|
|
3
|
-
Version: 2.0.2
|
|
3
|
+
Version: 2.0.3
|
|
4
4
|
Summary: Benchmarking imputation methods for microdata
|
|
5
5
|
Author-email: María Juaristi <juaristi@uni.minerva.edu>, Nikhil Woodruff <nikhil.woodruff@outlook.com>
|
|
6
6
|
Requires-Python: <3.15,>=3.12
|
|
@@ -441,6 +441,16 @@ def autoimpute(
|
|
|
441
441
|
main_progress = tqdm(total=5, desc="AutoImputation progress")
|
|
442
442
|
main_progress.set_description("Input validation")
|
|
443
443
|
|
|
444
|
+
# Defensive copy so that the caller's receiver_data is never
|
|
445
|
+
# mutated. Previously the final assignment
|
|
446
|
+
# ``receiver_data[var] = median_imputations[var]`` would write
|
|
447
|
+
# back through any local binding, and whether the user's frame
|
|
448
|
+
# was affected depended on whether the intermediate ``drop`` call
|
|
449
|
+
# returned a copy or a view (#13). Copying up-front makes this
|
|
450
|
+
# explicit and eliminates the side effect regardless of later
|
|
451
|
+
# pandas internals.
|
|
452
|
+
receiver_data = receiver_data.copy()
|
|
453
|
+
|
|
444
454
|
# Use provided quantiles or defaults
|
|
445
455
|
quantiles = imputation_quantiles if imputation_quantiles else QUANTILES
|
|
446
456
|
|
|
@@ -568,12 +568,24 @@ def kl_divergence(
|
|
|
568
568
|
p_donor = np.array([donor_counts.get(cat, 0.0) for cat in all_categories])
|
|
569
569
|
q_receiver = np.array([receiver_counts.get(cat, 0.0) for cat in all_categories])
|
|
570
570
|
|
|
571
|
-
#
|
|
571
|
+
# Apply the epsilon floor to BOTH distributions symmetrically and
|
|
572
|
+
# renormalise, so KL(p || q) behaves consistently regardless of which
|
|
573
|
+
# side has a missing category (#12). Previously only ``q_receiver``
|
|
574
|
+
# was clipped, which meant a category present in q but absent in p
|
|
575
|
+
# contributed ``rel_entr(0, q) = 0`` (a free pass), while the reverse
|
|
576
|
+
# contributed ``p * log(p / eps)`` (a large finite value depending
|
|
577
|
+
# entirely on ``epsilon``) — asymmetric and arbitrary.
|
|
572
578
|
epsilon = 1e-10
|
|
579
|
+
p_donor = np.maximum(p_donor, epsilon)
|
|
573
580
|
q_receiver = np.maximum(q_receiver, epsilon)
|
|
574
|
-
|
|
575
|
-
#
|
|
576
|
-
#
|
|
581
|
+
# Renormalise so both are still valid probability vectors after the
|
|
582
|
+
# epsilon floor. Without this, the floored mass is "extra" and
|
|
583
|
+
# inflates rel_entr on categories with mass near zero.
|
|
584
|
+
p_donor = p_donor / p_donor.sum()
|
|
585
|
+
q_receiver = q_receiver / q_receiver.sum()
|
|
586
|
+
|
|
587
|
+
# Calculate KL divergence using scipy.special.rel_entr
|
|
588
|
+
# rel_entr(p, q) computes p * log(p/q) element-wise
|
|
577
589
|
kl_values = rel_entr(p_donor, q_receiver)
|
|
578
590
|
|
|
579
591
|
# Sum over all categories to get total KL divergence
|
|
@@ -271,9 +271,21 @@ class Imputer(ABC):
|
|
|
271
271
|
weights = X_train[weight_col]
|
|
272
272
|
elif weight_col is not None and isinstance(weight_col, np.ndarray):
|
|
273
273
|
weights = pd.Series(weight_col, index=X_train.index)
|
|
274
|
+
elif weight_col is not None and isinstance(weight_col, pd.Series):
|
|
275
|
+
weights = weight_col.reindex(X_train.index)
|
|
274
276
|
|
|
275
|
-
if weights is not None
|
|
276
|
-
|
|
277
|
+
if weights is not None:
|
|
278
|
+
# Check for NaN AND non-positive values together. Previously only
|
|
279
|
+
# (weights <= 0).any() was checked, which returns False for NaN
|
|
280
|
+
# weights — those then propagated into .sample() as NaN
|
|
281
|
+
# probabilities or corrupted sample_weight passed to learners.
|
|
282
|
+
weights_arr = np.asarray(weights, dtype=float)
|
|
283
|
+
invalid_mask = np.isnan(weights_arr) | (weights_arr <= 0)
|
|
284
|
+
if invalid_mask.any():
|
|
285
|
+
raise ValueError(
|
|
286
|
+
"Weights must be positive and finite; found "
|
|
287
|
+
f"{int(invalid_mask.sum())} non-positive or NaN weight(s)"
|
|
288
|
+
)
|
|
277
289
|
|
|
278
290
|
# Identify target types BEFORE preprocessing
|
|
279
291
|
self.identify_target_types(X_train, imputed_variables, not_numeric_categorical)
|
|
@@ -284,21 +296,28 @@ class Imputer(ABC):
|
|
|
284
296
|
)
|
|
285
297
|
)
|
|
286
298
|
|
|
287
|
-
if weights is not None:
|
|
288
|
-
weights_normalized = weights / weights.sum()
|
|
289
|
-
X_train = X_train.sample(
|
|
290
|
-
n=len(X_train),
|
|
291
|
-
replace=True,
|
|
292
|
-
weights=weights_normalized,
|
|
293
|
-
random_state=self.seed,
|
|
294
|
-
).reset_index(drop=True)
|
|
295
|
-
|
|
296
299
|
# Save predictors and imputed variables
|
|
297
300
|
self.predictors = predictors
|
|
298
301
|
self.imputed_variables = imputed_variables
|
|
299
302
|
self.imputed_vars_dummy_info = imputed_vars_dummy_info
|
|
300
303
|
self.original_predictors = original_predictors
|
|
301
304
|
|
|
305
|
+
# Pass sample_weight through to the subclass so it can use each
|
|
306
|
+
# learner's native weighted-fit API (QRF, OLS→WLS, logistic, RFC all
|
|
307
|
+
# support sample_weight). This replaces the previous bootstrap
|
|
308
|
+
# resample, which silently discarded weights for the underlying
|
|
309
|
+
# estimator and inflated variance / shrank effective sample size.
|
|
310
|
+
sample_weight = None
|
|
311
|
+
if weights is not None:
|
|
312
|
+
sample_weight = np.asarray(weights_arr, dtype=float)
|
|
313
|
+
# Reindex if preprocess_data_types changed the row ordering
|
|
314
|
+
# (it currently does not, but guard against future drift).
|
|
315
|
+
if len(sample_weight) != len(X_train):
|
|
316
|
+
raise RuntimeError(
|
|
317
|
+
"Internal error: sample_weight length no longer matches "
|
|
318
|
+
"X_train after preprocessing"
|
|
319
|
+
)
|
|
320
|
+
|
|
302
321
|
# Defer actual training to subclass with all parameters
|
|
303
322
|
fitted_model = self._fit(
|
|
304
323
|
X_train,
|
|
@@ -309,6 +328,7 @@ class Imputer(ABC):
|
|
|
309
328
|
boolean_targets=self.boolean_targets,
|
|
310
329
|
numeric_targets=self.numeric_targets,
|
|
311
330
|
constant_targets=self.constant_targets,
|
|
331
|
+
sample_weight=sample_weight,
|
|
312
332
|
**kwargs,
|
|
313
333
|
)
|
|
314
334
|
return fitted_model
|
|
@@ -449,6 +449,7 @@ class Matching(Imputer):
|
|
|
449
449
|
numeric_targets: Optional[List[str]] = None,
|
|
450
450
|
constant_targets: Optional[Dict[str, Dict]] = None,
|
|
451
451
|
tune_hyperparameters: bool = False,
|
|
452
|
+
sample_weight: Optional[np.ndarray] = None,
|
|
452
453
|
**matching_kwargs: Any,
|
|
453
454
|
) -> MatchingResults:
|
|
454
455
|
"""Fit the matching model by storing the donor data and variable names.
|
|
@@ -457,6 +458,11 @@ class Matching(Imputer):
|
|
|
457
458
|
X_train: DataFrame containing the donor data.
|
|
458
459
|
predictors: List of column names to use as predictors.
|
|
459
460
|
imputed_variables: List of column names to impute.
|
|
461
|
+
sample_weight: Optional per-row sample weights for the donor
|
|
462
|
+
dataset. When provided, weights are passed to R StatMatch's
|
|
463
|
+
``NND.hotdeck`` via ``weight.don`` so that donor records are
|
|
464
|
+
matched in proportion to their survey weights rather than
|
|
465
|
+
uniformly.
|
|
460
466
|
matching_kwargs: Additional keyword arguments for hyperparameter
|
|
461
467
|
tuning of the matching function.
|
|
462
468
|
|
|
@@ -468,6 +474,13 @@ class Matching(Imputer):
|
|
|
468
474
|
"""
|
|
469
475
|
try:
|
|
470
476
|
self.donor_data = X_train.copy()
|
|
477
|
+
if sample_weight is not None:
|
|
478
|
+
# Attach donor weights to the matching hyperparameters so
|
|
479
|
+
# they're forwarded into the StatMatch R call (weight.don).
|
|
480
|
+
matching_kwargs = {
|
|
481
|
+
**matching_kwargs,
|
|
482
|
+
"donor_sample_weight": np.asarray(sample_weight, dtype=float),
|
|
483
|
+
}
|
|
471
484
|
|
|
472
485
|
if tune_hyperparameters:
|
|
473
486
|
self.logger.info("Tuning hyperparameters for the matching model")
|
|
@@ -122,17 +122,23 @@ def _suppress_pytorch_logging() -> None:
|
|
|
122
122
|
|
|
123
123
|
|
|
124
124
|
def _generate_data_hash(X: pd.DataFrame, y: pd.Series) -> str:
|
|
125
|
-
"""Generate a hash from the training data for cache
|
|
125
|
+
"""Generate a content-sensitive hash from the training data for cache
|
|
126
|
+
identification.
|
|
126
127
|
|
|
127
|
-
Creates
|
|
128
|
-
|
|
128
|
+
Creates an order-sensitive hash based on data shape, column names, and
|
|
129
|
+
the SHA-256 digest of the per-row hash bytes of X and y. Previously
|
|
130
|
+
this summed the per-row uint64 hashes from
|
|
131
|
+
``hash_pandas_object``, which lost row ordering (permutations hashed
|
|
132
|
+
identically) and was collision-prone across semantically different
|
|
133
|
+
datasets of matching shape — a cache hit on a collision would load a
|
|
134
|
+
stale model for a new dataset (silent correctness bug).
|
|
129
135
|
|
|
130
136
|
Args:
|
|
131
137
|
X: Feature DataFrame.
|
|
132
138
|
y: Target Series.
|
|
133
139
|
|
|
134
140
|
Returns:
|
|
135
|
-
A
|
|
141
|
+
A 16-character hex string identifying the dataset.
|
|
136
142
|
"""
|
|
137
143
|
# Include shape, column names, and data statistics for identification
|
|
138
144
|
hash_components = [
|
|
@@ -142,12 +148,18 @@ def _generate_data_hash(X: pd.DataFrame, y: pd.Series) -> str:
|
|
|
142
148
|
str(len(y)),
|
|
143
149
|
]
|
|
144
150
|
|
|
145
|
-
#
|
|
146
|
-
#
|
|
151
|
+
# Order-sensitive content hash: SHA-256 over the raw bytes of per-row
|
|
152
|
+
# hashes. Any change in row values OR row ordering produces a
|
|
153
|
+
# different digest, eliminating the sum-of-hashes collision trap.
|
|
147
154
|
try:
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
hash_components.extend(
|
|
155
|
+
x_row_hashes = pd.util.hash_pandas_object(X, index=True).values
|
|
156
|
+
y_row_hashes = pd.util.hash_pandas_object(y, index=True).values
|
|
157
|
+
hash_components.extend(
|
|
158
|
+
[
|
|
159
|
+
hashlib.sha256(x_row_hashes.tobytes()).hexdigest(),
|
|
160
|
+
hashlib.sha256(y_row_hashes.tobytes()).hexdigest(),
|
|
161
|
+
]
|
|
162
|
+
)
|
|
151
163
|
except Exception:
|
|
152
164
|
# Fallback to basic stats if hashing fails
|
|
153
165
|
hash_components.extend(
|
|
@@ -158,7 +170,9 @@ def _generate_data_hash(X: pd.DataFrame, y: pd.Series) -> str:
|
|
|
158
170
|
)
|
|
159
171
|
|
|
160
172
|
combined = "_".join(hash_components)
|
|
161
|
-
|
|
173
|
+
# SHA-256 truncated to 16 hex chars (64 bits) — collision-resistant
|
|
174
|
+
# for any realistic cache size while keeping filesystem paths short.
|
|
175
|
+
return hashlib.sha256(combined.encode()).hexdigest()[:16]
|
|
162
176
|
|
|
163
177
|
|
|
164
178
|
def _get_package_versions_hash() -> str:
|
|
@@ -926,6 +940,7 @@ class MDN(Imputer):
|
|
|
926
940
|
numeric_targets: Optional[List[str]] = None,
|
|
927
941
|
constant_targets: Optional[Dict[str, Dict]] = None,
|
|
928
942
|
tune_hyperparameters: bool = False,
|
|
943
|
+
sample_weight: Optional[np.ndarray] = None,
|
|
929
944
|
**kwargs: Any,
|
|
930
945
|
) -> Union[MDNResults, Tuple[MDNResults, Dict[str, Any]]]:
|
|
931
946
|
"""Fit the MDN model to the training data.
|
|
@@ -940,12 +955,22 @@ class MDN(Imputer):
|
|
|
940
955
|
numeric_targets: List of numeric target names.
|
|
941
956
|
constant_targets: Dict of constant target info.
|
|
942
957
|
tune_hyperparameters: If True, tune hyperparameters before fitting.
|
|
958
|
+
sample_weight: Optional per-row sample weights. The underlying
|
|
959
|
+
pytorch_tabular MDN implementation does not accept sample
|
|
960
|
+
weights; when provided, the model raises
|
|
961
|
+
``NotImplementedError`` so callers do not silently get an
|
|
962
|
+
unweighted fit.
|
|
943
963
|
**kwargs: Additional parameters.
|
|
944
964
|
|
|
945
965
|
Returns:
|
|
946
966
|
MDNResults instance with fitted models.
|
|
947
967
|
If tune_hyperparameters=True, returns (MDNResults, best_params).
|
|
948
968
|
"""
|
|
969
|
+
if sample_weight is not None:
|
|
970
|
+
raise NotImplementedError(
|
|
971
|
+
"MDN does not yet support sample weights. Use QRF, OLS, or "
|
|
972
|
+
"Matching for weighted imputation."
|
|
973
|
+
)
|
|
949
974
|
try:
|
|
950
975
|
best_params = None
|
|
951
976
|
|
|
@@ -31,6 +31,7 @@ class _LogisticRegressionModel:
|
|
|
31
31
|
y: pd.Series,
|
|
32
32
|
var_type: str,
|
|
33
33
|
categories: List = None,
|
|
34
|
+
sample_weight: Optional[np.ndarray] = None,
|
|
34
35
|
**lr_kwargs: Any,
|
|
35
36
|
) -> None:
|
|
36
37
|
"""Fit logistic regression for categorical/boolean target.
|
|
@@ -58,20 +59,35 @@ class _LogisticRegressionModel:
|
|
|
58
59
|
)
|
|
59
60
|
y_encoded = y_encoded.fillna(0) # Default to first category
|
|
60
61
|
|
|
61
|
-
# Extract relevant LR parameters from kwargs
|
|
62
|
-
#
|
|
62
|
+
# Extract relevant LR parameters from kwargs.
|
|
63
|
+
# sklearn's LogisticRegression ignores l1_ratio unless
|
|
64
|
+
# penalty="elasticnet" (and solver="saga"). Previously we passed
|
|
65
|
+
# l1_ratio through with the default penalty="l2", so tuning
|
|
66
|
+
# l1_ratio had no effect — silent misconfiguration.
|
|
67
|
+
l1_ratio = lr_kwargs.get("l1_ratio", None)
|
|
63
68
|
classifier_params = {
|
|
64
|
-
"l1_ratio": lr_kwargs.get("l1_ratio", 0),
|
|
65
69
|
"C": lr_kwargs.get("C", 1.0),
|
|
66
70
|
"max_iter": lr_kwargs.get("max_iter", 1000),
|
|
67
|
-
"solver": lr_kwargs.get(
|
|
68
|
-
"solver", "lbfgs" if len(self.categories) <= 2 else "saga"
|
|
69
|
-
),
|
|
70
71
|
"random_state": self.seed,
|
|
71
72
|
}
|
|
73
|
+
if l1_ratio is not None and l1_ratio != 0:
|
|
74
|
+
# Caller explicitly supplied a non-zero l1_ratio: wire up the
|
|
75
|
+
# elasticnet penalty and saga solver so it actually applies.
|
|
76
|
+
classifier_params["penalty"] = "elasticnet"
|
|
77
|
+
classifier_params["l1_ratio"] = float(l1_ratio)
|
|
78
|
+
classifier_params["solver"] = lr_kwargs.get("solver", "saga")
|
|
79
|
+
else:
|
|
80
|
+
# No elasticnet requested — use the default L2 penalty and a
|
|
81
|
+
# solver that supports it.
|
|
82
|
+
classifier_params["solver"] = lr_kwargs.get(
|
|
83
|
+
"solver", "lbfgs" if len(self.categories) <= 2 else "saga"
|
|
84
|
+
)
|
|
72
85
|
|
|
73
86
|
self.classifier = LogisticRegression(**classifier_params)
|
|
74
|
-
|
|
87
|
+
fit_kwargs = {}
|
|
88
|
+
if sample_weight is not None:
|
|
89
|
+
fit_kwargs["sample_weight"] = np.asarray(sample_weight, dtype=float)
|
|
90
|
+
self.classifier.fit(X, y_encoded, **fit_kwargs)
|
|
75
91
|
|
|
76
92
|
def predict(
|
|
77
93
|
self,
|
|
@@ -137,11 +153,26 @@ class _OLSModel:
|
|
|
137
153
|
self.model = None
|
|
138
154
|
self.output_column = None
|
|
139
155
|
|
|
140
|
-
def fit(
|
|
141
|
-
|
|
156
|
+
def fit(
|
|
157
|
+
self,
|
|
158
|
+
X: pd.DataFrame,
|
|
159
|
+
y: pd.Series,
|
|
160
|
+
sample_weight: Optional[np.ndarray] = None,
|
|
161
|
+
**kwargs,
|
|
162
|
+
) -> None:
|
|
163
|
+
"""Fit OLS (or WLS when sample_weight is provided).
|
|
164
|
+
|
|
165
|
+
When ``sample_weight`` is provided, uses ``statsmodels.api.WLS`` to
|
|
166
|
+
perform a genuine weighted least-squares fit rather than ignoring
|
|
167
|
+
the weights.
|
|
168
|
+
"""
|
|
142
169
|
self.output_column = y.name
|
|
143
170
|
X_with_const = sm.add_constant(X)
|
|
144
|
-
|
|
171
|
+
if sample_weight is not None:
|
|
172
|
+
weights = np.asarray(sample_weight, dtype=float)
|
|
173
|
+
self.model = sm.WLS(y, X_with_const, weights=weights).fit()
|
|
174
|
+
else:
|
|
175
|
+
self.model = sm.OLS(y, X_with_const).fit()
|
|
145
176
|
self.scale = self.model.scale
|
|
146
177
|
|
|
147
178
|
def predict(self, X: pd.DataFrame) -> np.ndarray:
|
|
@@ -197,12 +228,22 @@ class OLSResults(ImputerResults):
|
|
|
197
228
|
X_test[self.predictors], return_probs=False, quantile=quantile
|
|
198
229
|
)
|
|
199
230
|
else:
|
|
200
|
-
# Regression for numeric targets
|
|
231
|
+
# Regression for numeric targets.
|
|
232
|
+
# Use the full prediction SE (leverage + residual) rather than
|
|
233
|
+
# just sqrt(model.scale). Previously se = sqrt(scale) used only
|
|
234
|
+
# the residual std and under-dispersed imputations for test
|
|
235
|
+
# rows far from the training centroid; at extreme quantiles
|
|
236
|
+
# (0.01, 0.99) the under-dispersion is material.
|
|
201
237
|
X_test_with_const = sm.add_constant(X_test[self.predictors])
|
|
202
|
-
|
|
203
|
-
|
|
238
|
+
prediction = model.model.get_prediction(X_test_with_const)
|
|
239
|
+
# var_pred_mean is the leverage term (x' (X'X)^-1 x) * scale;
|
|
240
|
+
# adding model.scale (residual variance) gives the prediction
|
|
241
|
+
# variance for a new observation.
|
|
242
|
+
pred_var = np.asarray(prediction.var_pred_mean) + model.scale
|
|
243
|
+
mean_preds = np.asarray(prediction.predicted_mean)
|
|
244
|
+
se = np.sqrt(np.maximum(pred_var, 0.0))
|
|
204
245
|
imputed_values = self._predict_quantile(
|
|
205
|
-
mean_preds=mean_preds,
|
|
246
|
+
mean_preds=pd.Series(mean_preds, index=X_test.index, name=variable),
|
|
206
247
|
se=se,
|
|
207
248
|
mean_quantile=quantile,
|
|
208
249
|
random_sample=random_sample,
|
|
@@ -343,16 +384,18 @@ class OLSResults(ImputerResults):
|
|
|
343
384
|
def _predict_quantile(
|
|
344
385
|
self,
|
|
345
386
|
mean_preds: pd.Series,
|
|
346
|
-
se:
|
|
387
|
+
se: Any,
|
|
347
388
|
mean_quantile: float,
|
|
348
389
|
random_sample: bool,
|
|
349
390
|
count_samples: int = 10,
|
|
350
|
-
) ->
|
|
391
|
+
) -> pd.Series:
|
|
351
392
|
"""Predict values at a specified quantile.
|
|
352
393
|
|
|
353
394
|
Args:
|
|
354
395
|
mean_preds: Mean predictions from the model.
|
|
355
|
-
se: Standard error of the predictions.
|
|
396
|
+
se: Standard error of the predictions. May be a scalar (legacy,
|
|
397
|
+
residual std) or a per-row array (prediction SE including
|
|
398
|
+
leverage).
|
|
356
399
|
mean_quantile: Quantile to predict (the quantile affects the center
|
|
357
400
|
of the beta distribution from which to sample when imputing each data point).
|
|
358
401
|
random_sample: If True, use random quantile sampling for prediction.
|
|
@@ -360,20 +403,31 @@ class OLSResults(ImputerResults):
|
|
|
360
403
|
random_sample is True.
|
|
361
404
|
|
|
362
405
|
Returns:
|
|
363
|
-
|
|
406
|
+
Series of predicted values at the specified quantile, indexed to
|
|
407
|
+
match ``mean_preds``. Returning a Series (rather than a bare
|
|
408
|
+
ndarray) preserves the test-row index so downstream
|
|
409
|
+
``DataFrame[col] = series`` assignments align correctly when a
|
|
410
|
+
numeric column is set before a categorical column whose
|
|
411
|
+
predictions come back indexed.
|
|
364
412
|
|
|
365
413
|
Raises:
|
|
366
414
|
RuntimeError: If prediction fails.
|
|
367
415
|
"""
|
|
416
|
+
# Clip q away from 0 and 1 to avoid ±inf from norm.ppf (and the
|
|
417
|
+
# degenerate a=0/a=inf case in the beta-alpha formula).
|
|
418
|
+
# Previously mean_quantile could be 0.0 or 1.0 with no guard.
|
|
419
|
+
q_eps = 1e-6
|
|
420
|
+
q_clipped = float(np.clip(mean_quantile, q_eps, 1.0 - q_eps))
|
|
368
421
|
try:
|
|
369
|
-
if random_sample
|
|
422
|
+
if random_sample:
|
|
370
423
|
self.logger.info(
|
|
371
|
-
f"Predicting at random quantiles sampled from a beta distribution with mean quantile {
|
|
424
|
+
f"Predicting at random quantiles sampled from a beta distribution with mean quantile {q_clipped}"
|
|
372
425
|
)
|
|
373
426
|
random_generator = np.random.default_rng(self.seed)
|
|
374
427
|
|
|
375
|
-
# Calculate alpha parameter for beta distribution
|
|
376
|
-
|
|
428
|
+
# Calculate alpha parameter for beta distribution (q is
|
|
429
|
+
# safely in (0,1) after clipping).
|
|
430
|
+
a = q_clipped / (1 - q_clipped)
|
|
377
431
|
|
|
378
432
|
# Generate count_samples beta distributed values with parameter a
|
|
379
433
|
beta_samples = random_generator.beta(a, 1, size=count_samples)
|
|
@@ -387,12 +441,15 @@ class OLSResults(ImputerResults):
|
|
|
387
441
|
)
|
|
388
442
|
selected_quantiles = normal_quantiles[sampled_indices]
|
|
389
443
|
|
|
390
|
-
# Adjust each mean prediction by
|
|
391
|
-
|
|
444
|
+
# Adjust each mean prediction by the sampled quantile
|
|
445
|
+
# times its per-row SE (or scalar SE if se is a float).
|
|
446
|
+
values = mean_preds.values + selected_quantiles * np.asarray(se)
|
|
392
447
|
else:
|
|
393
|
-
self.logger.info(f"Predicting at specified quantile {
|
|
394
|
-
specified_quantile = norm.ppf(
|
|
395
|
-
|
|
448
|
+
self.logger.info(f"Predicting at specified quantile {q_clipped}")
|
|
449
|
+
specified_quantile = norm.ppf(q_clipped)
|
|
450
|
+
values = mean_preds.values + specified_quantile * np.asarray(se)
|
|
451
|
+
|
|
452
|
+
return pd.Series(values, index=mean_preds.index, name=mean_preds.name)
|
|
396
453
|
|
|
397
454
|
except Exception as e:
|
|
398
455
|
if isinstance(e, ValueError):
|
|
@@ -431,6 +488,7 @@ class OLS(Imputer):
|
|
|
431
488
|
boolean_targets: Optional[Dict[str, Dict]] = None,
|
|
432
489
|
numeric_targets: Optional[List[str]] = None,
|
|
433
490
|
constant_targets: Optional[Dict[str, Dict]] = None,
|
|
491
|
+
sample_weight: Optional[np.ndarray] = None,
|
|
434
492
|
**kwargs: Any,
|
|
435
493
|
) -> OLSResults:
|
|
436
494
|
"""Fit the OLS model to the training data.
|
|
@@ -439,6 +497,9 @@ class OLS(Imputer):
|
|
|
439
497
|
X_train: DataFrame containing the training data.
|
|
440
498
|
predictors: List of column names to use as predictors.
|
|
441
499
|
imputed_variables: List of column names to impute.
|
|
500
|
+
sample_weight: Optional per-row sample weights, threaded through
|
|
501
|
+
to ``sm.WLS`` (for numeric targets) or
|
|
502
|
+
``LogisticRegression.fit`` (for categorical/boolean).
|
|
442
503
|
|
|
443
504
|
Returns:
|
|
444
505
|
The fitted model instance.
|
|
@@ -476,6 +537,7 @@ class OLS(Imputer):
|
|
|
476
537
|
Y,
|
|
477
538
|
var_type=categorical_targets[variable]["type"],
|
|
478
539
|
categories=categorical_targets[variable].get("categories"),
|
|
540
|
+
sample_weight=sample_weight,
|
|
479
541
|
**kwargs,
|
|
480
542
|
)
|
|
481
543
|
self.logger.info(
|
|
@@ -484,14 +546,22 @@ class OLS(Imputer):
|
|
|
484
546
|
elif variable in (boolean_targets or {}):
|
|
485
547
|
# Use logistic regression for boolean targets
|
|
486
548
|
model = _LogisticRegressionModel(seed=self.seed, logger=self.logger)
|
|
487
|
-
model.fit(
|
|
549
|
+
model.fit(
|
|
550
|
+
X_train[predictors],
|
|
551
|
+
Y,
|
|
552
|
+
var_type="boolean",
|
|
553
|
+
sample_weight=sample_weight,
|
|
554
|
+
**kwargs,
|
|
555
|
+
)
|
|
488
556
|
self.logger.info(
|
|
489
557
|
f"Logistic regression fitted for boolean variable {variable}"
|
|
490
558
|
)
|
|
491
559
|
else:
|
|
492
560
|
# Use OLS for numeric targets
|
|
493
561
|
model = _OLSModel(seed=self.seed, logger=self.logger)
|
|
494
|
-
model.fit(
|
|
562
|
+
model.fit(
|
|
563
|
+
X_train[predictors], Y, sample_weight=sample_weight, **kwargs
|
|
564
|
+
)
|
|
495
565
|
self.logger.info(
|
|
496
566
|
f"OLS regression fitted for numeric variable {variable}"
|
|
497
567
|
)
|