PyPI - microimpute - Versions diffs - 2.0.2__tar.gz → 2.0.3__tar.gz - Mend

microimpute 2.0.2__tar.gz → 2.0.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (45) hide show

{microimpute-2.0.2 → microimpute-2.0.3}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: microimpute
-Version: 2.0.2
+Version: 2.0.3
 Summary: Benchmarking imputation methods for microdata
 Author-email: María Juaristi <juaristi@uni.minerva.edu>, Nikhil Woodruff <nikhil.woodruff@outlook.com>
 Requires-Python: <3.15,>=3.12

{microimpute-2.0.2 → microimpute-2.0.3}/microimpute/comparisons/autoimpute.py RENAMED Viewed

@@ -441,6 +441,16 @@ def autoimpute(
             main_progress = tqdm(total=5, desc="AutoImputation progress")
             main_progress.set_description("Input validation")
+        # Defensive copy so that the caller's receiver_data is never
+        # mutated. Previously the final assignment
+        # ``receiver_data[var] = median_imputations[var]`` would write
+        # back through any local binding, and whether the user's frame
+        # was affected depended on whether the intermediate ``drop`` call
+        # returned a copy or a view (#13). Copying up-front makes this
+        # explicit and eliminates the side effect regardless of later
+        # pandas internals.
+        receiver_data = receiver_data.copy()
         # Use provided quantiles or defaults
         quantiles = imputation_quantiles if imputation_quantiles else QUANTILES

{microimpute-2.0.2 → microimpute-2.0.3}/microimpute/comparisons/metrics.py RENAMED Viewed

@@ -568,12 +568,24 @@ def kl_divergence(
     p_donor = np.array([donor_counts.get(cat, 0.0) for cat in all_categories])
     q_receiver = np.array([receiver_counts.get(cat, 0.0) for cat in all_categories])
-    # Add small epsilon to avoid log(0) and division by zero
+    # Apply the epsilon floor to BOTH distributions symmetrically and
+    # renormalise, so KL(p || q) behaves consistently regardless of which
+    # side has a missing category (#12). Previously only ``q_receiver``
+    # was clipped, which meant a category present in q but absent in p
+    # contributed ``rel_entr(0, q) = 0`` (a free pass), while the reverse
+    # contributed ``p * log(p / eps)`` (a large finite value depending
+    # entirely on ``epsilon``) — asymmetric and arbitrary.
     epsilon = 1e-10
+    p_donor = np.maximum(p_donor, epsilon)
     q_receiver = np.maximum(q_receiver, epsilon)
-    # Calculate KL divergence using scipy.special.kl_div
-    # kl_div(p, q) computes p * log(p/q) element-wise
+    # Renormalise so both are still valid probability vectors after the
+    # epsilon floor. Without this, the floored mass is "extra" and
+    # inflates rel_entr on categories with mass near zero.
+    p_donor = p_donor / p_donor.sum()
+    q_receiver = q_receiver / q_receiver.sum()
+    # Calculate KL divergence using scipy.special.rel_entr
+    # rel_entr(p, q) computes p * log(p/q) element-wise
     kl_values = rel_entr(p_donor, q_receiver)
     # Sum over all categories to get total KL divergence

{microimpute-2.0.2 → microimpute-2.0.3}/microimpute/models/imputer.py RENAMED Viewed

@@ -271,9 +271,21 @@ class Imputer(ABC):
             weights = X_train[weight_col]
         elif weight_col is not None and isinstance(weight_col, np.ndarray):
             weights = pd.Series(weight_col, index=X_train.index)
+        elif weight_col is not None and isinstance(weight_col, pd.Series):
+            weights = weight_col.reindex(X_train.index)
-        if weights is not None and (weights <= 0).any():
-            raise ValueError("Weights must be positive")
+        if weights is not None:
+            # Check for NaN AND non-positive values together. Previously only
+            # (weights <= 0).any() was checked, which returns False for NaN
+            # weights — those then propagated into .sample() as NaN
+            # probabilities or corrupted sample_weight passed to learners.
+            weights_arr = np.asarray(weights, dtype=float)
+            invalid_mask = np.isnan(weights_arr) | (weights_arr <= 0)
+            if invalid_mask.any():
+                raise ValueError(
+                    "Weights must be positive and finite; found "
+                    f"{int(invalid_mask.sum())} non-positive or NaN weight(s)"
+                )
         # Identify target types BEFORE preprocessing
         self.identify_target_types(X_train, imputed_variables, not_numeric_categorical)
@@ -284,21 +296,28 @@ class Imputer(ABC):
             )
         )
-        if weights is not None:
-            weights_normalized = weights / weights.sum()
-            X_train = X_train.sample(
-                n=len(X_train),
-                replace=True,
-                weights=weights_normalized,
-                random_state=self.seed,
-            ).reset_index(drop=True)
         # Save predictors and imputed variables
         self.predictors = predictors
         self.imputed_variables = imputed_variables
         self.imputed_vars_dummy_info = imputed_vars_dummy_info
         self.original_predictors = original_predictors
+        # Pass sample_weight through to the subclass so it can use each
+        # learner's native weighted-fit API (QRF, OLS→WLS, logistic, RFC all
+        # support sample_weight). This replaces the previous bootstrap
+        # resample, which silently discarded weights for the underlying
+        # estimator and inflated variance / shrank effective sample size.
+        sample_weight = None
+        if weights is not None:
+            sample_weight = np.asarray(weights_arr, dtype=float)
+            # Sanity-check that preprocess_data_types kept the row count intact
+            # (it currently does, but guard against future drift).
+            if len(sample_weight) != len(X_train):
+                raise RuntimeError(
+                    "Internal error: sample_weight length no longer matches "
+                    "X_train after preprocessing"
+                )
         # Defer actual training to subclass with all parameters
         fitted_model = self._fit(
             X_train,
@@ -309,6 +328,7 @@ class Imputer(ABC):
             boolean_targets=self.boolean_targets,
             numeric_targets=self.numeric_targets,
             constant_targets=self.constant_targets,
+            sample_weight=sample_weight,
             **kwargs,
         )
         return fitted_model

{microimpute-2.0.2 → microimpute-2.0.3}/microimpute/models/matching.py RENAMED Viewed

@@ -449,6 +449,7 @@ class Matching(Imputer):
         numeric_targets: Optional[List[str]] = None,
         constant_targets: Optional[Dict[str, Dict]] = None,
         tune_hyperparameters: bool = False,
+        sample_weight: Optional[np.ndarray] = None,
         **matching_kwargs: Any,
     ) -> MatchingResults:
         """Fit the matching model by storing the donor data and variable names.
@@ -457,6 +458,11 @@ class Matching(Imputer):
             X_train: DataFrame containing the donor data.
             predictors: List of column names to use as predictors.
             imputed_variables: List of column names to impute.
+            sample_weight: Optional per-row sample weights for the donor
+                dataset. When provided, weights are passed to R StatMatch's
+                ``NND.hotdeck`` via ``weight.don`` so that donor records are
+                matched in proportion to their survey weights rather than
+                uniformly.
             matching_kwargs: Additional keyword arguments for hyperparameter
                 tuning of the matching function.
@@ -468,6 +474,13 @@ class Matching(Imputer):
         """
         try:
             self.donor_data = X_train.copy()
+            if sample_weight is not None:
+                # Attach donor weights to the matching hyperparameters so
+                # they're forwarded into the StatMatch R call (weight.don).
+                matching_kwargs = {
+                    **matching_kwargs,
+                    "donor_sample_weight": np.asarray(sample_weight, dtype=float),
+                }
             if tune_hyperparameters:
                 self.logger.info("Tuning hyperparameters for the matching model")

{microimpute-2.0.2 → microimpute-2.0.3}/microimpute/models/mdn.py RENAMED Viewed

@@ -122,17 +122,23 @@ def _suppress_pytorch_logging() -> None:
 def _generate_data_hash(X: pd.DataFrame, y: pd.Series) -> str:
-    """Generate a hash from the training data for cache identification.
+    """Generate a content-sensitive hash from the training data for cache
+    identification.
-    Creates a reproducible hash based on the data shape, column names,
-    and a sample of the data values.
+    Creates an order-sensitive hash based on data shape, column names, and
+    the SHA-256 digest of the per-row hash bytes of X and y. Previously
+    this summed the per-row uint64 hashes from
+    ``hash_pandas_object``, which lost row ordering (permutations hashed
+    identically) and was collision-prone across semantically different
+    datasets of matching shape — a cache hit on a collision would load a
+    stale model for a new dataset (silent correctness bug).
     Args:
         X: Feature DataFrame.
         y: Target Series.
     Returns:
-        A short hash string identifying the dataset.
+        A 16-character hex string identifying the dataset.
     """
     # Include shape, column names, and data statistics for identification
     hash_components = [
@@ -142,12 +148,18 @@ def _generate_data_hash(X: pd.DataFrame, y: pd.Series) -> str:
         str(len(y)),
     ]
-    # Add hash of actual data values for uniqueness
-    # Use pandas hash_pandas_object for consistent hashing
+    # Order-sensitive content hash: SHA-256 over the raw bytes of per-row
+    # hashes. Any change in row values OR row ordering produces a
+    # different digest, eliminating the sum-of-hashes collision trap.
     try:
-        data_hash = pd.util.hash_pandas_object(X).sum()
-        y_hash = pd.util.hash_pandas_object(y).sum()
-        hash_components.extend([str(data_hash), str(y_hash)])
+        x_row_hashes = pd.util.hash_pandas_object(X, index=True).values
+        y_row_hashes = pd.util.hash_pandas_object(y, index=True).values
+        hash_components.extend(
+            [
+                hashlib.sha256(x_row_hashes.tobytes()).hexdigest(),
+                hashlib.sha256(y_row_hashes.tobytes()).hexdigest(),
+            ]
+        )
     except Exception:
         # Fallback to basic stats if hashing fails
         hash_components.extend(
@@ -158,7 +170,9 @@ def _generate_data_hash(X: pd.DataFrame, y: pd.Series) -> str:
         )
     combined = "_".join(hash_components)
-    return hashlib.md5(combined.encode()).hexdigest()[:12]
+    # SHA-256 truncated to 16 hex chars (64 bits) — collision-resistant
+    # for any realistic cache size while keeping filesystem paths short.
+    return hashlib.sha256(combined.encode()).hexdigest()[:16]
 def _get_package_versions_hash() -> str:
@@ -926,6 +940,7 @@ class MDN(Imputer):
         numeric_targets: Optional[List[str]] = None,
         constant_targets: Optional[Dict[str, Dict]] = None,
         tune_hyperparameters: bool = False,
+        sample_weight: Optional[np.ndarray] = None,
         **kwargs: Any,
     ) -> Union[MDNResults, Tuple[MDNResults, Dict[str, Any]]]:
         """Fit the MDN model to the training data.
@@ -940,12 +955,22 @@ class MDN(Imputer):
             numeric_targets: List of numeric target names.
             constant_targets: Dict of constant target info.
             tune_hyperparameters: If True, tune hyperparameters before fitting.
+            sample_weight: Optional per-row sample weights. The underlying
+                pytorch_tabular MDN implementation does not accept sample
+                weights; when provided, the model raises
+                ``NotImplementedError`` so callers do not silently get an
+                unweighted fit.
             **kwargs: Additional parameters.
         Returns:
             MDNResults instance with fitted models.
             If tune_hyperparameters=True, returns (MDNResults, best_params).
         """
+        if sample_weight is not None:
+            raise NotImplementedError(
+                "MDN does not yet support sample weights. Use QRF, OLS, or "
+                "Matching for weighted imputation."
+            )
         try:
             best_params = None

{microimpute-2.0.2 → microimpute-2.0.3}/microimpute/models/ols.py RENAMED Viewed

@@ -31,6 +31,7 @@ class _LogisticRegressionModel:
         y: pd.Series,
         var_type: str,
         categories: List = None,
+        sample_weight: Optional[np.ndarray] = None,
         **lr_kwargs: Any,
     ) -> None:
         """Fit logistic regression for categorical/boolean target.
@@ -58,20 +59,35 @@ class _LogisticRegressionModel:
                 )
                 y_encoded = y_encoded.fillna(0)  # Default to first category
-        # Extract relevant LR parameters from kwargs
-        # Use l1_ratio instead of penalty (deprecated in sklearn 1.8)
+        # Extract relevant LR parameters from kwargs.
+        # sklearn's LogisticRegression ignores l1_ratio unless
+        # penalty="elasticnet" (and solver="saga"). Previously we passed
+        # l1_ratio through with the default penalty="l2", so tuning
+        # l1_ratio had no effect — silent misconfiguration.
+        l1_ratio = lr_kwargs.get("l1_ratio", None)
         classifier_params = {
-            "l1_ratio": lr_kwargs.get("l1_ratio", 0),
             "C": lr_kwargs.get("C", 1.0),
             "max_iter": lr_kwargs.get("max_iter", 1000),
-            "solver": lr_kwargs.get(
-                "solver", "lbfgs" if len(self.categories) <= 2 else "saga"
-            ),
             "random_state": self.seed,
         }
+        if l1_ratio is not None and l1_ratio != 0:
+            # Caller explicitly supplied a non-zero l1_ratio: wire up the
+            # elasticnet penalty and saga solver so it actually applies.
+            classifier_params["penalty"] = "elasticnet"
+            classifier_params["l1_ratio"] = float(l1_ratio)
+            classifier_params["solver"] = lr_kwargs.get("solver", "saga")
+        else:
+            # No elasticnet requested — use the default L2 penalty and a
+            # solver that supports it.
+            classifier_params["solver"] = lr_kwargs.get(
+                "solver", "lbfgs" if len(self.categories) <= 2 else "saga"
+            )
         self.classifier = LogisticRegression(**classifier_params)
-        self.classifier.fit(X, y_encoded)
+        fit_kwargs = {}
+        if sample_weight is not None:
+            fit_kwargs["sample_weight"] = np.asarray(sample_weight, dtype=float)
+        self.classifier.fit(X, y_encoded, **fit_kwargs)
     def predict(
         self,
@@ -137,11 +153,26 @@ class _OLSModel:
         self.model = None
         self.output_column = None
-    def fit(self, X: pd.DataFrame, y: pd.Series, **kwargs) -> None:
-        """Fit OLS model."""
+    def fit(
+        self,
+        X: pd.DataFrame,
+        y: pd.Series,
+        sample_weight: Optional[np.ndarray] = None,
+        **kwargs,
+    ) -> None:
+        """Fit OLS (or WLS when sample_weight is provided).
+        When ``sample_weight`` is provided, uses ``statsmodels.api.WLS`` to
+        perform a genuine weighted least-squares fit rather than ignoring
+        the weights.
+        """
         self.output_column = y.name
         X_with_const = sm.add_constant(X)
-        self.model = sm.OLS(y, X_with_const).fit()
+        if sample_weight is not None:
+            weights = np.asarray(sample_weight, dtype=float)
+            self.model = sm.WLS(y, X_with_const, weights=weights).fit()
+        else:
+            self.model = sm.OLS(y, X_with_const).fit()
         self.scale = self.model.scale
     def predict(self, X: pd.DataFrame) -> np.ndarray:
@@ -197,12 +228,22 @@ class OLSResults(ImputerResults):
                 X_test[self.predictors], return_probs=False, quantile=quantile
             )
         else:
-            # Regression for numeric targets
+            # Regression for numeric targets.
+            # Use the full prediction SE (leverage + residual) rather than
+            # just sqrt(model.scale). Previously se = sqrt(scale) used only
+            # the residual std and under-dispersed imputations for test
+            # rows far from the training centroid; at extreme quantiles
+            # (0.01, 0.99) the under-dispersion is material.
             X_test_with_const = sm.add_constant(X_test[self.predictors])
-            mean_preds = model.predict(X_test_with_const)
-            se = np.sqrt(model.scale)
+            prediction = model.model.get_prediction(X_test_with_const)
+            # var_pred_mean is the leverage term (x' (X'X)^-1 x) * scale;
+            # adding model.scale (residual variance) gives the prediction
+            # variance for a new observation.
+            pred_var = np.asarray(prediction.var_pred_mean) + model.scale
+            mean_preds = np.asarray(prediction.predicted_mean)
+            se = np.sqrt(np.maximum(pred_var, 0.0))
             imputed_values = self._predict_quantile(
-                mean_preds=mean_preds,
+                mean_preds=pd.Series(mean_preds, index=X_test.index, name=variable),
                 se=se,
                 mean_quantile=quantile,
                 random_sample=random_sample,
@@ -343,16 +384,18 @@ class OLSResults(ImputerResults):
     def _predict_quantile(
         self,
         mean_preds: pd.Series,
-        se: float,
+        se: Any,
         mean_quantile: float,
         random_sample: bool,
         count_samples: int = 10,
-    ) -> np.ndarray:
+    ) -> pd.Series:
         """Predict values at a specified quantile.
         Args:
             mean_preds: Mean predictions from the model.
-            se: Standard error of the predictions.
+            se: Standard error of the predictions. May be a scalar (legacy,
+                residual std) or a per-row array (prediction SE including
+                leverage).
             mean_quantile: Quantile to predict (the quantile affects the center
                 of the beta distribution from which to sample when imputing each data point).
             random_sample: If True, use random quantile sampling for prediction.
@@ -360,20 +403,31 @@ class OLSResults(ImputerResults):
                 random_sample is True.
         Returns:
-            Array of predicted values at the specified quantile.
+            Series of predicted values at the specified quantile, indexed to
+            match ``mean_preds``. Returning a Series (rather than a bare
+            ndarray) preserves the test-row index so downstream
+            ``DataFrame[col] = series`` assignments align correctly when a
+            numeric column is set before a categorical column whose
+            predictions come back indexed.
         Raises:
             RuntimeError: If prediction fails.
         """
+        # Clip q away from 0 and 1 to avoid ±inf from norm.ppf (and the
+        # degenerate a=0/a=inf case in the beta-alpha formula).
+        # Previously mean_quantile could be 0.0 or 1.0 with no guard.
+        q_eps = 1e-6
+        q_clipped = float(np.clip(mean_quantile, q_eps, 1.0 - q_eps))
         try:
-            if random_sample == True:
+            if random_sample:
                 self.logger.info(
-                    f"Predicting at random quantiles sampled from a beta distribution with mean quantile {mean_quantile}"
+                    f"Predicting at random quantiles sampled from a beta distribution with mean quantile {q_clipped}"
                 )
                 random_generator = np.random.default_rng(self.seed)
-                # Calculate alpha parameter for beta distribution
-                a = mean_quantile / (1 - mean_quantile)
+                # Calculate alpha parameter for beta distribution (q is
+                # safely in (0,1) after clipping).
+                a = q_clipped / (1 - q_clipped)
                 # Generate count_samples beta distributed values with parameter a
                 beta_samples = random_generator.beta(a, 1, size=count_samples)
@@ -387,12 +441,15 @@ class OLSResults(ImputerResults):
                 )
                 selected_quantiles = normal_quantiles[sampled_indices]
-                # Adjust each mean prediction by corresponding sampled quantile times standard error
-                return mean_preds + selected_quantiles * se
+                # Adjust each mean prediction by the sampled quantile
+                # times its per-row SE (or scalar SE if se is a float).
+                values = mean_preds.values + selected_quantiles * np.asarray(se)
             else:
-                self.logger.info(f"Predicting at specified quantile {mean_quantile}")
-                specified_quantile = norm.ppf(mean_quantile)
-                return mean_preds + specified_quantile * se
+                self.logger.info(f"Predicting at specified quantile {q_clipped}")
+                specified_quantile = norm.ppf(q_clipped)
+                values = mean_preds.values + specified_quantile * np.asarray(se)
+            return pd.Series(values, index=mean_preds.index, name=mean_preds.name)
         except Exception as e:
             if isinstance(e, ValueError):
@@ -431,6 +488,7 @@ class OLS(Imputer):
         boolean_targets: Optional[Dict[str, Dict]] = None,
         numeric_targets: Optional[List[str]] = None,
         constant_targets: Optional[Dict[str, Dict]] = None,
+        sample_weight: Optional[np.ndarray] = None,
         **kwargs: Any,
     ) -> OLSResults:
         """Fit the OLS model to the training data.
@@ -439,6 +497,9 @@ class OLS(Imputer):
             X_train: DataFrame containing the training data.
             predictors: List of column names to use as predictors.
             imputed_variables: List of column names to impute.
+            sample_weight: Optional per-row sample weights, threaded through
+                to ``sm.WLS`` (for numeric targets) or
+                ``LogisticRegression.fit`` (for categorical/boolean).
         Returns:
             The fitted model instance.
@@ -476,6 +537,7 @@ class OLS(Imputer):
                         Y,
                         var_type=categorical_targets[variable]["type"],
                         categories=categorical_targets[variable].get("categories"),
+                        sample_weight=sample_weight,
                         **kwargs,
                     )
                     self.logger.info(
@@ -484,14 +546,22 @@ class OLS(Imputer):
                 elif variable in (boolean_targets or {}):
                     # Use logistic regression for boolean targets
                     model = _LogisticRegressionModel(seed=self.seed, logger=self.logger)
-                    model.fit(X_train[predictors], Y, var_type="boolean", **kwargs)
+                    model.fit(
+                        X_train[predictors],
+                        Y,
+                        var_type="boolean",
+                        sample_weight=sample_weight,
+                        **kwargs,
+                    )
                     self.logger.info(
                         f"Logistic regression fitted for boolean variable {variable}"
                     )
                 else:
                     # Use OLS for numeric targets
                     model = _OLSModel(seed=self.seed, logger=self.logger)
-                    model.fit(X_train[predictors], Y, **kwargs)
+                    model.fit(
+                        X_train[predictors], Y, sample_weight=sample_weight, **kwargs
+                    )
                     self.logger.info(
                         f"OLS regression fitted for numeric variable {variable}"
                     )

microimpute 2.0.2__tar.gz → 2.0.3__tar.gz

microimpute 2.0.2__tar.gz → 2.0.3__tar.gz