microimpute 2.0.2__tar.gz → 2.0.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46)
  1. {microimpute-2.0.2 → microimpute-2.0.4}/PKG-INFO +1 -1
  2. {microimpute-2.0.2 → microimpute-2.0.4}/microimpute/comparisons/autoimpute.py +10 -0
  3. {microimpute-2.0.2 → microimpute-2.0.4}/microimpute/comparisons/metrics.py +16 -4
  4. {microimpute-2.0.2 → microimpute-2.0.4}/microimpute/models/imputer.py +31 -11
  5. {microimpute-2.0.2 → microimpute-2.0.4}/microimpute/models/matching.py +13 -0
  6. {microimpute-2.0.2 → microimpute-2.0.4}/microimpute/models/mdn.py +35 -10
  7. {microimpute-2.0.2 → microimpute-2.0.4}/microimpute/models/ols.py +99 -29
  8. {microimpute-2.0.2 → microimpute-2.0.4}/microimpute/models/qrf.py +137 -29
  9. {microimpute-2.0.2 → microimpute-2.0.4}/microimpute/models/quantreg.py +45 -18
  10. microimpute-2.0.4/microimpute/models/zero_inflated.py +698 -0
  11. {microimpute-2.0.2 → microimpute-2.0.4}/microimpute/utils/dashboard_formatter.py +47 -11
  12. {microimpute-2.0.2 → microimpute-2.0.4}/microimpute/utils/statmatch_hotdeck.py +70 -23
  13. {microimpute-2.0.2 → microimpute-2.0.4}/microimpute/utils/type_handling.py +61 -5
  14. {microimpute-2.0.2 → microimpute-2.0.4}/microimpute.egg-info/PKG-INFO +1 -1
  15. {microimpute-2.0.2 → microimpute-2.0.4}/microimpute.egg-info/SOURCES.txt +2 -0
  16. {microimpute-2.0.2 → microimpute-2.0.4}/pyproject.toml +1 -1
  17. {microimpute-2.0.2 → microimpute-2.0.4}/tests/test_autoimpute.py +45 -0
  18. {microimpute-2.0.2 → microimpute-2.0.4}/tests/test_dashboard_formatter.py +52 -0
  19. {microimpute-2.0.2 → microimpute-2.0.4}/tests/test_metrics.py +24 -0
  20. microimpute-2.0.4/tests/test_type_handling.py +142 -0
  21. {microimpute-2.0.2 → microimpute-2.0.4}/README.md +0 -0
  22. {microimpute-2.0.2 → microimpute-2.0.4}/microimpute/__init__.py +0 -0
  23. {microimpute-2.0.2 → microimpute-2.0.4}/microimpute/comparisons/__init__.py +0 -0
  24. {microimpute-2.0.2 → microimpute-2.0.4}/microimpute/comparisons/autoimpute_helpers.py +0 -0
  25. {microimpute-2.0.2 → microimpute-2.0.4}/microimpute/comparisons/imputations.py +0 -0
  26. {microimpute-2.0.2 → microimpute-2.0.4}/microimpute/comparisons/validation.py +0 -0
  27. {microimpute-2.0.2 → microimpute-2.0.4}/microimpute/config.py +0 -0
  28. {microimpute-2.0.2 → microimpute-2.0.4}/microimpute/evaluations/__init__.py +0 -0
  29. {microimpute-2.0.2 → microimpute-2.0.4}/microimpute/evaluations/cross_validation.py +0 -0
  30. {microimpute-2.0.2 → microimpute-2.0.4}/microimpute/evaluations/predictor_analysis.py +0 -0
  31. {microimpute-2.0.2 → microimpute-2.0.4}/microimpute/models/__init__.py +0 -0
  32. {microimpute-2.0.2 → microimpute-2.0.4}/microimpute/utils/__init__.py +0 -0
  33. {microimpute-2.0.2 → microimpute-2.0.4}/microimpute/utils/data.py +0 -0
  34. {microimpute-2.0.2 → microimpute-2.0.4}/microimpute/visualizations/__init__.py +0 -0
  35. {microimpute-2.0.2 → microimpute-2.0.4}/microimpute/visualizations/comparison_plots.py +0 -0
  36. {microimpute-2.0.2 → microimpute-2.0.4}/microimpute/visualizations/performance_plots.py +0 -0
  37. {microimpute-2.0.2 → microimpute-2.0.4}/microimpute.egg-info/dependency_links.txt +0 -0
  38. {microimpute-2.0.2 → microimpute-2.0.4}/microimpute.egg-info/requires.txt +0 -0
  39. {microimpute-2.0.2 → microimpute-2.0.4}/microimpute.egg-info/top_level.txt +0 -0
  40. {microimpute-2.0.2 → microimpute-2.0.4}/setup.cfg +0 -0
  41. {microimpute-2.0.2 → microimpute-2.0.4}/tests/test_basic.py +0 -0
  42. {microimpute-2.0.2 → microimpute-2.0.4}/tests/test_data_preprocessing.py +0 -0
  43. {microimpute-2.0.2 → microimpute-2.0.4}/tests/test_predictor_analysis.py +0 -0
  44. {microimpute-2.0.2 → microimpute-2.0.4}/tests/test_quantile_comparison.py +0 -0
  45. {microimpute-2.0.2 → microimpute-2.0.4}/tests/test_smoke_qrf.py +0 -0
  46. {microimpute-2.0.2 → microimpute-2.0.4}/tests/test_visualizations.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: microimpute
3
- Version: 2.0.2
3
+ Version: 2.0.4
4
4
  Summary: Benchmarking imputation methods for microdata
5
5
  Author-email: María Juaristi <juaristi@uni.minerva.edu>, Nikhil Woodruff <nikhil.woodruff@outlook.com>
6
6
  Requires-Python: <3.15,>=3.12
@@ -441,6 +441,16 @@ def autoimpute(
441
441
  main_progress = tqdm(total=5, desc="AutoImputation progress")
442
442
  main_progress.set_description("Input validation")
443
443
 
444
+ # Defensive copy so that the caller's receiver_data is never
445
+ # mutated. Previously the final assignment
446
+ # ``receiver_data[var] = median_imputations[var]`` would write
447
+ # back through any local binding, and whether the user's frame
448
+ # was affected depended on whether the intermediate ``drop`` call
449
+ # returned a copy or a view (#13). Copying up-front makes this
450
+ # explicit and eliminates the side effect regardless of later
451
+ # pandas internals.
452
+ receiver_data = receiver_data.copy()
453
+
444
454
  # Use provided quantiles or defaults
445
455
  quantiles = imputation_quantiles if imputation_quantiles else QUANTILES
446
456
 
@@ -568,12 +568,24 @@ def kl_divergence(
568
568
  p_donor = np.array([donor_counts.get(cat, 0.0) for cat in all_categories])
569
569
  q_receiver = np.array([receiver_counts.get(cat, 0.0) for cat in all_categories])
570
570
 
571
- # Add small epsilon to avoid log(0) and division by zero
571
+ # Apply the epsilon floor to BOTH distributions symmetrically and
572
+ # renormalise, so KL(p || q) behaves consistently regardless of which
573
+ # side has a missing category (#12). Previously only ``q_receiver``
574
+ # was clipped, which meant a category present in q but absent in p
575
+ # contributed ``rel_entr(0, q) = 0`` (a free pass), while the reverse
576
+ # contributed ``p * log(p / eps)`` (a large finite value depending
577
+ # entirely on ``epsilon``) — asymmetric and arbitrary.
572
578
  epsilon = 1e-10
579
+ p_donor = np.maximum(p_donor, epsilon)
573
580
  q_receiver = np.maximum(q_receiver, epsilon)
574
-
575
- # Calculate KL divergence using scipy.special.kl_div
576
- # kl_div(p, q) computes p * log(p/q) element-wise
581
+ # Renormalise so both are still valid probability vectors after the
582
+ # epsilon floor. Without this, the floored mass is "extra" and
583
+ # inflates rel_entr on categories with mass near zero.
584
+ p_donor = p_donor / p_donor.sum()
585
+ q_receiver = q_receiver / q_receiver.sum()
586
+
587
+ # Calculate KL divergence using scipy.special.rel_entr
588
+ # rel_entr(p, q) computes p * log(p/q) element-wise
577
589
  kl_values = rel_entr(p_donor, q_receiver)
578
590
 
579
591
  # Sum over all categories to get total KL divergence
@@ -271,9 +271,21 @@ class Imputer(ABC):
271
271
  weights = X_train[weight_col]
272
272
  elif weight_col is not None and isinstance(weight_col, np.ndarray):
273
273
  weights = pd.Series(weight_col, index=X_train.index)
274
+ elif weight_col is not None and isinstance(weight_col, pd.Series):
275
+ weights = weight_col.reindex(X_train.index)
274
276
 
275
- if weights is not None and (weights <= 0).any():
276
- raise ValueError("Weights must be positive")
277
+ if weights is not None:
278
+ # Check for NaN AND non-positive values together. Previously only
279
+ # (weights <= 0).any() was checked, which returns False for NaN
280
+ # weights — those then propagated into .sample() as NaN
281
+ # probabilities or corrupted sample_weight passed to learners.
282
+ weights_arr = np.asarray(weights, dtype=float)
283
+ invalid_mask = np.isnan(weights_arr) | (weights_arr <= 0)
284
+ if invalid_mask.any():
285
+ raise ValueError(
286
+ "Weights must be positive and finite; found "
287
+ f"{int(invalid_mask.sum())} non-positive or NaN weight(s)"
288
+ )
277
289
 
278
290
  # Identify target types BEFORE preprocessing
279
291
  self.identify_target_types(X_train, imputed_variables, not_numeric_categorical)
@@ -284,21 +296,28 @@ class Imputer(ABC):
284
296
  )
285
297
  )
286
298
 
287
- if weights is not None:
288
- weights_normalized = weights / weights.sum()
289
- X_train = X_train.sample(
290
- n=len(X_train),
291
- replace=True,
292
- weights=weights_normalized,
293
- random_state=self.seed,
294
- ).reset_index(drop=True)
295
-
296
299
  # Save predictors and imputed variables
297
300
  self.predictors = predictors
298
301
  self.imputed_variables = imputed_variables
299
302
  self.imputed_vars_dummy_info = imputed_vars_dummy_info
300
303
  self.original_predictors = original_predictors
301
304
 
305
+ # Pass sample_weight through to the subclass so it can use each
306
+ # learner's native weighted-fit API (QRF, OLS→WLS, logistic, RFC all
307
+ # support sample_weight). This replaces the previous bootstrap
308
+ # resample, which silently discarded weights for the underlying
309
+ # estimator and inflated variance / shrank effective sample size.
310
+ sample_weight = None
311
+ if weights is not None:
312
+ sample_weight = np.asarray(weights_arr, dtype=float)
313
+ # Reindex if preprocess_data_types changed the row ordering
314
+ # (it currently does not, but guard against future drift).
315
+ if len(sample_weight) != len(X_train):
316
+ raise RuntimeError(
317
+ "Internal error: sample_weight length no longer matches "
318
+ "X_train after preprocessing"
319
+ )
320
+
302
321
  # Defer actual training to subclass with all parameters
303
322
  fitted_model = self._fit(
304
323
  X_train,
@@ -309,6 +328,7 @@ class Imputer(ABC):
309
328
  boolean_targets=self.boolean_targets,
310
329
  numeric_targets=self.numeric_targets,
311
330
  constant_targets=self.constant_targets,
331
+ sample_weight=sample_weight,
312
332
  **kwargs,
313
333
  )
314
334
  return fitted_model
@@ -449,6 +449,7 @@ class Matching(Imputer):
449
449
  numeric_targets: Optional[List[str]] = None,
450
450
  constant_targets: Optional[Dict[str, Dict]] = None,
451
451
  tune_hyperparameters: bool = False,
452
+ sample_weight: Optional[np.ndarray] = None,
452
453
  **matching_kwargs: Any,
453
454
  ) -> MatchingResults:
454
455
  """Fit the matching model by storing the donor data and variable names.
@@ -457,6 +458,11 @@ class Matching(Imputer):
457
458
  X_train: DataFrame containing the donor data.
458
459
  predictors: List of column names to use as predictors.
459
460
  imputed_variables: List of column names to impute.
461
+ sample_weight: Optional per-row sample weights for the donor
462
+ dataset. When provided, weights are passed to R StatMatch's
463
+ ``NND.hotdeck`` via ``weight.don`` so that donor records are
464
+ matched in proportion to their survey weights rather than
465
+ uniformly.
460
466
  matching_kwargs: Additional keyword arguments for hyperparameter
461
467
  tuning of the matching function.
462
468
 
@@ -468,6 +474,13 @@ class Matching(Imputer):
468
474
  """
469
475
  try:
470
476
  self.donor_data = X_train.copy()
477
+ if sample_weight is not None:
478
+ # Attach donor weights to the matching hyperparameters so
479
+ # they're forwarded into the StatMatch R call (weight.don).
480
+ matching_kwargs = {
481
+ **matching_kwargs,
482
+ "donor_sample_weight": np.asarray(sample_weight, dtype=float),
483
+ }
471
484
 
472
485
  if tune_hyperparameters:
473
486
  self.logger.info("Tuning hyperparameters for the matching model")
@@ -122,17 +122,23 @@ def _suppress_pytorch_logging() -> None:
122
122
 
123
123
 
124
124
  def _generate_data_hash(X: pd.DataFrame, y: pd.Series) -> str:
125
- """Generate a hash from the training data for cache identification.
125
+ """Generate a content-sensitive hash from the training data for cache
126
+ identification.
126
127
 
127
- Creates a reproducible hash based on the data shape, column names,
128
- and a sample of the data values.
128
+ Creates an order-sensitive hash based on data shape, column names, and
129
+ the SHA-256 digest of the per-row hash bytes of X and y. Previously
130
+ this summed the per-row uint64 hashes from
131
+ ``hash_pandas_object``, which lost row ordering (permutations hashed
132
+ identically) and was collision-prone across semantically different
133
+ datasets of matching shape — a cache hit on a collision would load a
134
+ stale model for a new dataset (silent correctness bug).
129
135
 
130
136
  Args:
131
137
  X: Feature DataFrame.
132
138
  y: Target Series.
133
139
 
134
140
  Returns:
135
- A short hash string identifying the dataset.
141
+ A 16-character hex string identifying the dataset.
136
142
  """
137
143
  # Include shape, column names, and data statistics for identification
138
144
  hash_components = [
@@ -142,12 +148,18 @@ def _generate_data_hash(X: pd.DataFrame, y: pd.Series) -> str:
142
148
  str(len(y)),
143
149
  ]
144
150
 
145
- # Add hash of actual data values for uniqueness
146
- # Use pandas hash_pandas_object for consistent hashing
151
+ # Order-sensitive content hash: SHA-256 over the raw bytes of per-row
152
+ # hashes. Any change in row values OR row ordering produces a
153
+ # different digest, eliminating the sum-of-hashes collision trap.
147
154
  try:
148
- data_hash = pd.util.hash_pandas_object(X).sum()
149
- y_hash = pd.util.hash_pandas_object(y).sum()
150
- hash_components.extend([str(data_hash), str(y_hash)])
155
+ x_row_hashes = pd.util.hash_pandas_object(X, index=True).values
156
+ y_row_hashes = pd.util.hash_pandas_object(y, index=True).values
157
+ hash_components.extend(
158
+ [
159
+ hashlib.sha256(x_row_hashes.tobytes()).hexdigest(),
160
+ hashlib.sha256(y_row_hashes.tobytes()).hexdigest(),
161
+ ]
162
+ )
151
163
  except Exception:
152
164
  # Fallback to basic stats if hashing fails
153
165
  hash_components.extend(
@@ -158,7 +170,9 @@ def _generate_data_hash(X: pd.DataFrame, y: pd.Series) -> str:
158
170
  )
159
171
 
160
172
  combined = "_".join(hash_components)
161
- return hashlib.md5(combined.encode()).hexdigest()[:12]
173
+ # SHA-256 truncated to 16 hex chars (64 bits) — collision-resistant
174
+ # for any realistic cache size while keeping filesystem paths short.
175
+ return hashlib.sha256(combined.encode()).hexdigest()[:16]
162
176
 
163
177
 
164
178
  def _get_package_versions_hash() -> str:
@@ -926,6 +940,7 @@ class MDN(Imputer):
926
940
  numeric_targets: Optional[List[str]] = None,
927
941
  constant_targets: Optional[Dict[str, Dict]] = None,
928
942
  tune_hyperparameters: bool = False,
943
+ sample_weight: Optional[np.ndarray] = None,
929
944
  **kwargs: Any,
930
945
  ) -> Union[MDNResults, Tuple[MDNResults, Dict[str, Any]]]:
931
946
  """Fit the MDN model to the training data.
@@ -940,12 +955,22 @@ class MDN(Imputer):
940
955
  numeric_targets: List of numeric target names.
941
956
  constant_targets: Dict of constant target info.
942
957
  tune_hyperparameters: If True, tune hyperparameters before fitting.
958
+ sample_weight: Optional per-row sample weights. The underlying
959
+ pytorch_tabular MDN implementation does not accept sample
960
+ weights; when provided, the model raises
961
+ ``NotImplementedError`` so callers do not silently get an
962
+ unweighted fit.
943
963
  **kwargs: Additional parameters.
944
964
 
945
965
  Returns:
946
966
  MDNResults instance with fitted models.
947
967
  If tune_hyperparameters=True, returns (MDNResults, best_params).
948
968
  """
969
+ if sample_weight is not None:
970
+ raise NotImplementedError(
971
+ "MDN does not yet support sample weights. Use QRF, OLS, or "
972
+ "Matching for weighted imputation."
973
+ )
949
974
  try:
950
975
  best_params = None
951
976
 
@@ -31,6 +31,7 @@ class _LogisticRegressionModel:
31
31
  y: pd.Series,
32
32
  var_type: str,
33
33
  categories: List = None,
34
+ sample_weight: Optional[np.ndarray] = None,
34
35
  **lr_kwargs: Any,
35
36
  ) -> None:
36
37
  """Fit logistic regression for categorical/boolean target.
@@ -58,20 +59,35 @@ class _LogisticRegressionModel:
58
59
  )
59
60
  y_encoded = y_encoded.fillna(0) # Default to first category
60
61
 
61
- # Extract relevant LR parameters from kwargs
62
- # Use l1_ratio instead of penalty (deprecated in sklearn 1.8)
62
+ # Extract relevant LR parameters from kwargs.
63
+ # sklearn's LogisticRegression ignores l1_ratio unless
64
+ # penalty="elasticnet" (and solver="saga"). Previously we passed
65
+ # l1_ratio through with the default penalty="l2", so tuning
66
+ # l1_ratio had no effect — silent misconfiguration.
67
+ l1_ratio = lr_kwargs.get("l1_ratio", None)
63
68
  classifier_params = {
64
- "l1_ratio": lr_kwargs.get("l1_ratio", 0),
65
69
  "C": lr_kwargs.get("C", 1.0),
66
70
  "max_iter": lr_kwargs.get("max_iter", 1000),
67
- "solver": lr_kwargs.get(
68
- "solver", "lbfgs" if len(self.categories) <= 2 else "saga"
69
- ),
70
71
  "random_state": self.seed,
71
72
  }
73
+ if l1_ratio is not None and l1_ratio != 0:
74
+ # Caller explicitly supplied a non-zero l1_ratio: wire up the
75
+ # elasticnet penalty and saga solver so it actually applies.
76
+ classifier_params["penalty"] = "elasticnet"
77
+ classifier_params["l1_ratio"] = float(l1_ratio)
78
+ classifier_params["solver"] = lr_kwargs.get("solver", "saga")
79
+ else:
80
+ # No elasticnet requested — use the default L2 penalty and a
81
+ # solver that supports it.
82
+ classifier_params["solver"] = lr_kwargs.get(
83
+ "solver", "lbfgs" if len(self.categories) <= 2 else "saga"
84
+ )
72
85
 
73
86
  self.classifier = LogisticRegression(**classifier_params)
74
- self.classifier.fit(X, y_encoded)
87
+ fit_kwargs = {}
88
+ if sample_weight is not None:
89
+ fit_kwargs["sample_weight"] = np.asarray(sample_weight, dtype=float)
90
+ self.classifier.fit(X, y_encoded, **fit_kwargs)
75
91
 
76
92
  def predict(
77
93
  self,
@@ -137,11 +153,26 @@ class _OLSModel:
137
153
  self.model = None
138
154
  self.output_column = None
139
155
 
140
- def fit(self, X: pd.DataFrame, y: pd.Series, **kwargs) -> None:
141
- """Fit OLS model."""
156
+ def fit(
157
+ self,
158
+ X: pd.DataFrame,
159
+ y: pd.Series,
160
+ sample_weight: Optional[np.ndarray] = None,
161
+ **kwargs,
162
+ ) -> None:
163
+ """Fit OLS (or WLS when sample_weight is provided).
164
+
165
+ When ``sample_weight`` is provided, uses ``statsmodels.api.WLS`` to
166
+ perform a genuine weighted least-squares fit rather than ignoring
167
+ the weights.
168
+ """
142
169
  self.output_column = y.name
143
170
  X_with_const = sm.add_constant(X)
144
- self.model = sm.OLS(y, X_with_const).fit()
171
+ if sample_weight is not None:
172
+ weights = np.asarray(sample_weight, dtype=float)
173
+ self.model = sm.WLS(y, X_with_const, weights=weights).fit()
174
+ else:
175
+ self.model = sm.OLS(y, X_with_const).fit()
145
176
  self.scale = self.model.scale
146
177
 
147
178
  def predict(self, X: pd.DataFrame) -> np.ndarray:
@@ -197,12 +228,22 @@ class OLSResults(ImputerResults):
197
228
  X_test[self.predictors], return_probs=False, quantile=quantile
198
229
  )
199
230
  else:
200
- # Regression for numeric targets
231
+ # Regression for numeric targets.
232
+ # Use the full prediction SE (leverage + residual) rather than
233
+ # just sqrt(model.scale). Previously se = sqrt(scale) used only
234
+ # the residual std and under-dispersed imputations for test
235
+ # rows far from the training centroid; at extreme quantiles
236
+ # (0.01, 0.99) the under-dispersion is material.
201
237
  X_test_with_const = sm.add_constant(X_test[self.predictors])
202
- mean_preds = model.predict(X_test_with_const)
203
- se = np.sqrt(model.scale)
238
+ prediction = model.model.get_prediction(X_test_with_const)
239
+ # var_pred_mean is the leverage term (x' (X'X)^-1 x) * scale;
240
+ # adding model.scale (residual variance) gives the prediction
241
+ # variance for a new observation.
242
+ pred_var = np.asarray(prediction.var_pred_mean) + model.scale
243
+ mean_preds = np.asarray(prediction.predicted_mean)
244
+ se = np.sqrt(np.maximum(pred_var, 0.0))
204
245
  imputed_values = self._predict_quantile(
205
- mean_preds=mean_preds,
246
+ mean_preds=pd.Series(mean_preds, index=X_test.index, name=variable),
206
247
  se=se,
207
248
  mean_quantile=quantile,
208
249
  random_sample=random_sample,
@@ -343,16 +384,18 @@ class OLSResults(ImputerResults):
343
384
  def _predict_quantile(
344
385
  self,
345
386
  mean_preds: pd.Series,
346
- se: float,
387
+ se: Any,
347
388
  mean_quantile: float,
348
389
  random_sample: bool,
349
390
  count_samples: int = 10,
350
- ) -> np.ndarray:
391
+ ) -> pd.Series:
351
392
  """Predict values at a specified quantile.
352
393
 
353
394
  Args:
354
395
  mean_preds: Mean predictions from the model.
355
- se: Standard error of the predictions.
396
+ se: Standard error of the predictions. May be a scalar (legacy,
397
+ residual std) or a per-row array (prediction SE including
398
+ leverage).
356
399
  mean_quantile: Quantile to predict (the quantile affects the center
357
400
  of the beta distribution from which to sample when imputing each data point).
358
401
  random_sample: If True, use random quantile sampling for prediction.
@@ -360,20 +403,31 @@ class OLSResults(ImputerResults):
360
403
  random_sample is True.
361
404
 
362
405
  Returns:
363
- Array of predicted values at the specified quantile.
406
+ Series of predicted values at the specified quantile, indexed to
407
+ match ``mean_preds``. Returning a Series (rather than a bare
408
+ ndarray) preserves the test-row index so downstream
409
+ ``DataFrame[col] = series`` assignments align correctly when a
410
+ numeric column is set before a categorical column whose
411
+ predictions come back indexed.
364
412
 
365
413
  Raises:
366
414
  RuntimeError: If prediction fails.
367
415
  """
416
+ # Clip q away from 0 and 1 to avoid ±inf from norm.ppf (and the
417
+ # degenerate a=0/a=inf case in the beta-alpha formula).
418
+ # Previously mean_quantile could be 0.0 or 1.0 with no guard.
419
+ q_eps = 1e-6
420
+ q_clipped = float(np.clip(mean_quantile, q_eps, 1.0 - q_eps))
368
421
  try:
369
- if random_sample == True:
422
+ if random_sample:
370
423
  self.logger.info(
371
- f"Predicting at random quantiles sampled from a beta distribution with mean quantile {mean_quantile}"
424
+ f"Predicting at random quantiles sampled from a beta distribution with mean quantile {q_clipped}"
372
425
  )
373
426
  random_generator = np.random.default_rng(self.seed)
374
427
 
375
- # Calculate alpha parameter for beta distribution
376
- a = mean_quantile / (1 - mean_quantile)
428
+ # Calculate alpha parameter for beta distribution (q is
429
+ # safely in (0,1) after clipping).
430
+ a = q_clipped / (1 - q_clipped)
377
431
 
378
432
  # Generate count_samples beta distributed values with parameter a
379
433
  beta_samples = random_generator.beta(a, 1, size=count_samples)
@@ -387,12 +441,15 @@ class OLSResults(ImputerResults):
387
441
  )
388
442
  selected_quantiles = normal_quantiles[sampled_indices]
389
443
 
390
- # Adjust each mean prediction by corresponding sampled quantile times standard error
391
- return mean_preds + selected_quantiles * se
444
+ # Adjust each mean prediction by the sampled quantile
445
+ # times its per-row SE (or scalar SE if se is a float).
446
+ values = mean_preds.values + selected_quantiles * np.asarray(se)
392
447
  else:
393
- self.logger.info(f"Predicting at specified quantile {mean_quantile}")
394
- specified_quantile = norm.ppf(mean_quantile)
395
- return mean_preds + specified_quantile * se
448
+ self.logger.info(f"Predicting at specified quantile {q_clipped}")
449
+ specified_quantile = norm.ppf(q_clipped)
450
+ values = mean_preds.values + specified_quantile * np.asarray(se)
451
+
452
+ return pd.Series(values, index=mean_preds.index, name=mean_preds.name)
396
453
 
397
454
  except Exception as e:
398
455
  if isinstance(e, ValueError):
@@ -431,6 +488,7 @@ class OLS(Imputer):
431
488
  boolean_targets: Optional[Dict[str, Dict]] = None,
432
489
  numeric_targets: Optional[List[str]] = None,
433
490
  constant_targets: Optional[Dict[str, Dict]] = None,
491
+ sample_weight: Optional[np.ndarray] = None,
434
492
  **kwargs: Any,
435
493
  ) -> OLSResults:
436
494
  """Fit the OLS model to the training data.
@@ -439,6 +497,9 @@ class OLS(Imputer):
439
497
  X_train: DataFrame containing the training data.
440
498
  predictors: List of column names to use as predictors.
441
499
  imputed_variables: List of column names to impute.
500
+ sample_weight: Optional per-row sample weights, threaded through
501
+ to ``sm.WLS`` (for numeric targets) or
502
+ ``LogisticRegression.fit`` (for categorical/boolean).
442
503
 
443
504
  Returns:
444
505
  The fitted model instance.
@@ -476,6 +537,7 @@ class OLS(Imputer):
476
537
  Y,
477
538
  var_type=categorical_targets[variable]["type"],
478
539
  categories=categorical_targets[variable].get("categories"),
540
+ sample_weight=sample_weight,
479
541
  **kwargs,
480
542
  )
481
543
  self.logger.info(
@@ -484,14 +546,22 @@ class OLS(Imputer):
484
546
  elif variable in (boolean_targets or {}):
485
547
  # Use logistic regression for boolean targets
486
548
  model = _LogisticRegressionModel(seed=self.seed, logger=self.logger)
487
- model.fit(X_train[predictors], Y, var_type="boolean", **kwargs)
549
+ model.fit(
550
+ X_train[predictors],
551
+ Y,
552
+ var_type="boolean",
553
+ sample_weight=sample_weight,
554
+ **kwargs,
555
+ )
488
556
  self.logger.info(
489
557
  f"Logistic regression fitted for boolean variable {variable}"
490
558
  )
491
559
  else:
492
560
  # Use OLS for numeric targets
493
561
  model = _OLSModel(seed=self.seed, logger=self.logger)
494
- model.fit(X_train[predictors], Y, **kwargs)
562
+ model.fit(
563
+ X_train[predictors], Y, sample_weight=sample_weight, **kwargs
564
+ )
495
565
  self.logger.info(
496
566
  f"OLS regression fitted for numeric variable {variable}"
497
567
  )