PyPI - diff-diff - Versions diffs - 2.8.2__tar.gz → 2.8.4__tar.gz - Mend

diff-diff 2.8.2.tar.gz → 2.8.4.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (66) hide show

{diff_diff-2.8.2 → diff_diff-2.8.4}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: diff-diff
-Version: 2.8.2
+Version: 2.8.4
 Classifier: Development Status :: 5 - Production/Stable
 Classifier: Intended Audience :: Science/Research
 Classifier: Operating System :: OS Independent

{diff_diff-2.8.2 → diff_diff-2.8.4}/diff_diff/__init__.py RENAMED Viewed

@@ -94,6 +94,7 @@ from diff_diff.prep import (
     make_treatment_indicator,
     rank_control_units,
     summarize_did_data,
+    trim_weights,
     validate_did_data,
     wide_to_long,
 )
@@ -210,7 +211,7 @@ Stacked = StackedDiD
 Bacon = BaconDecomposition
 EDiD = EfficientDiD
-__version__ = "2.8.2"
+__version__ = "2.8.4"
 __all__ = [
     # Estimators
     "DifferenceInDifferences",
@@ -307,6 +308,7 @@ __all__ = [
     "make_post_indicator",
     "wide_to_long",
     "balance_panel",
+    "trim_weights",
     "validate_did_data",
     "summarize_did_data",
     "generate_did_data",

{diff_diff-2.8.2 → diff_diff-2.8.4}/diff_diff/bootstrap_utils.py RENAMED Viewed

@@ -433,6 +433,10 @@ def generate_survey_multiplier_weights_batch(
     is present, weights are scaled by ``sqrt(1 - f_h)`` per stratum so
     the bootstrap variance matches the TSL variance.
+    For ``lonely_psu="adjust"``, singleton PSUs from different strata are
+    pooled into a combined pseudo-stratum and weights are generated for
+    the pooled group (no FPC scaling on pooled singletons).
     Parameters
     ----------
     n_bootstrap : int
@@ -454,11 +458,7 @@ def generate_survey_multiplier_weights_batch(
     psu = resolved_survey.psu
     strata = resolved_survey.strata
-    if resolved_survey.lonely_psu == "adjust":
-        raise NotImplementedError(
-            "lonely_psu='adjust' is not yet supported for survey-aware bootstrap. "
-            "Use lonely_psu='remove' or 'certainty', or use analytical inference."
-        )
+    _lonely_psu = resolved_survey.lonely_psu
     if psu is None:
         # Each observation is its own PSU
@@ -499,6 +499,7 @@ def generate_survey_multiplier_weights_batch(
         psu_to_col = {int(p): i for i, p in enumerate(psu_ids)}
         unique_strata = np.unique(strata)
+        _singleton_cols = []  # For lonely_psu="adjust" pooling
         for h in unique_strata:
             mask_h = strata == h
@@ -511,8 +512,12 @@ def generate_survey_multiplier_weights_batch(
             cols = np.array([psu_to_col[int(p)] for p in psus_in_h])
             if n_h < 2:
-                # Lonely PSU — zero weight (matches remove/certainty behavior)
-                weights[:, cols] = 0.0
+                if _lonely_psu == "adjust":
+                    # Collect for pooled pseudo-stratum processing
+                    _singleton_cols.extend(cols.tolist())
+                else:
+                    # remove / certainty — zero weight
+                    weights[:, cols] = 0.0
                 continue
             # Generate weights for this stratum
@@ -536,6 +541,31 @@ def generate_survey_multiplier_weights_batch(
             weights[:, cols] = stratum_weights
+        # Pool singleton PSUs into a pseudo-stratum for "adjust"
+        if _singleton_cols:
+            n_pooled = len(_singleton_cols)
+            if n_pooled >= 2:
+                pooled_weights = generate_bootstrap_weights_batch_numpy(
+                    n_bootstrap, n_pooled, weight_type, rng
+                )
+                # No FPC scaling for pooled singletons (conservative)
+                pooled_cols = np.array(_singleton_cols)
+                weights[:, pooled_cols] = pooled_weights
+            else:
+                # Single singleton — cannot pool, zero weight (library-specific
+                # fallback; bootstrap adjust with one singleton = remove).
+                import warnings
+                warnings.warn(
+                    "lonely_psu='adjust' with only 1 singleton stratum in "
+                    "bootstrap: singleton PSU contributes zero variance "
+                    "(same as 'remove'). At least 2 singleton strata are "
+                    "needed for pooled pseudo-stratum bootstrap.",
+                    UserWarning,
+                    stacklevel=3,
+                )
+                weights[:, _singleton_cols[0]] = 0.0
     return weights, psu_ids
@@ -553,6 +583,9 @@ def generate_rao_wu_weights(
     With FPC: ``m_h = max(1, round((1 - f_h) * (n_h - 1)))``
     (Rao, Wu & Yue 1992, Section 3).
+    For ``lonely_psu="adjust"``, singleton PSUs are pooled into a combined
+    pseudo-stratum and resampled together (no FPC scaling on pooled group).
     Parameters
     ----------
     resolved_survey : ResolvedSurveyDesign
@@ -570,11 +603,7 @@ def generate_rao_wu_weights(
     psu = resolved_survey.psu
     strata = resolved_survey.strata
-    if resolved_survey.lonely_psu == "adjust":
-        raise NotImplementedError(
-            "lonely_psu='adjust' is not yet supported for survey-aware bootstrap. "
-            "Use lonely_psu='remove' or 'certainty', or use analytical inference."
-        )
+    _lonely_psu_rw = resolved_survey.lonely_psu
     rescaled = np.zeros(n_obs, dtype=np.float64)
@@ -589,14 +618,20 @@ def generate_rao_wu_weights(
         unique_strata = np.unique(strata)
         strata_masks = [strata == h for h in unique_strata]
+    # Collect singleton PSUs for "adjust" pooling
+    _singleton_info = []  # list of (mask_h, unique_psu_h) tuples
     for mask_h in strata_masks:
         psu_h = obs_psu[mask_h]
         unique_psu_h = np.unique(psu_h)
         n_h = len(unique_psu_h)
         if n_h < 2:
-            # Census / lonely PSU — keep original weights (zero variance)
-            rescaled[mask_h] = base_weights[mask_h]
+            if _lonely_psu_rw == "adjust":
+                _singleton_info.append((mask_h, unique_psu_h))
+            else:
+                # remove / certainty — keep original weights (zero variance)
+                rescaled[mask_h] = base_weights[mask_h]
             continue
         # Compute resample size
@@ -629,6 +664,41 @@ def generate_rao_wu_weights(
         local_indices = np.array([psu_to_local[int(obs_psu[idx])] for idx in obs_in_h])
         rescaled[obs_in_h] = base_weights[obs_in_h] * scale_per_psu[local_indices]
+    # Pool singleton PSUs into a pseudo-stratum for "adjust"
+    if _singleton_info:
+        # Combine all singleton PSUs into one group
+        pooled_psus = np.concatenate([p for _, p in _singleton_info])
+        n_pooled = len(pooled_psus)
+        if n_pooled >= 2:
+            m_pooled = n_pooled - 1  # No FPC for pooled singletons
+            drawn = rng.choice(n_pooled, size=m_pooled, replace=True)
+            counts = np.bincount(drawn, minlength=n_pooled)
+            scale_per_psu = (n_pooled / m_pooled) * counts.astype(np.float64)
+            # Build PSU → scale mapping and apply
+            psu_scale_map = {int(pooled_psus[i]): scale_per_psu[i] for i in range(n_pooled)}
+            for mask_h, _ in _singleton_info:
+                obs_in_h = np.where(mask_h)[0]
+                for idx in obs_in_h:
+                    p = int(obs_psu[idx])
+                    rescaled[idx] = base_weights[idx] * psu_scale_map.get(p, 1.0)
+        else:
+            # Single singleton — cannot pool, keep base weights (library-specific
+            # fallback; bootstrap adjust with one singleton = remove).
+            import warnings
+            warnings.warn(
+                "lonely_psu='adjust' with only 1 singleton stratum in "
+                "bootstrap: singleton PSU contributes zero variance "
+                "(same as 'remove'). At least 2 singleton strata are "
+                "needed for pooled pseudo-stratum bootstrap.",
+                UserWarning,
+                stacklevel=2,
+            )
+            for mask_h, _ in _singleton_info:
+                rescaled[mask_h] = base_weights[mask_h]
     return rescaled

{diff_diff-2.8.2 → diff_diff-2.8.4}/diff_diff/continuous_did_results.py RENAMED Viewed

@@ -154,6 +154,15 @@ class ContinuousDiDResults:
             f"n_periods={len(self.time_periods)})"
         )
+    @property
+    def coef_var(self) -> float:
+        """Coefficient of variation: SE / |overall ATT|. NaN when ATT is 0 or SE non-finite."""
+        if not (np.isfinite(self.overall_att_se) and self.overall_att_se >= 0):
+            return np.nan
+        if not np.isfinite(self.overall_att) or self.overall_att == 0:
+            return np.nan
+        return self.overall_att_se / abs(self.overall_att)
     def summary(self, alpha: Optional[float] = None) -> str:
         """Generate formatted summary."""
         alpha = alpha or self.alpha
@@ -223,10 +232,15 @@ class ContinuousDiDResults:
                 f"[{self.overall_att_conf_int[0]:.4f}, {self.overall_att_conf_int[1]:.4f}]",
                 f"{conf_level}% CI for ACRT_glob: "
                 f"[{self.overall_acrt_conf_int[0]:.4f}, {self.overall_acrt_conf_int[1]:.4f}]",
-                "",
             ]
         )
+        cv = self.coef_var
+        if np.isfinite(cv):
+            lines.append(f"{'CV (SE/|ATT|):':<25} {cv:>10.4f}")
+        lines.append("")
         # Dose-response curve summary (first/mid/last points)
         if len(self.dose_grid) > 0:
             lines.extend(

{diff_diff-2.8.2 → diff_diff-2.8.4}/diff_diff/efficient_did_results.py RENAMED Viewed

@@ -172,6 +172,15 @@ class EfficientDiDResults:
             f"n_periods={len(self.time_periods)})"
         )
+    @property
+    def coef_var(self) -> float:
+        """Coefficient of variation: SE / |overall ATT|. NaN when ATT is 0 or SE non-finite."""
+        if not (np.isfinite(self.overall_se) and self.overall_se >= 0):
+            return np.nan
+        if not np.isfinite(self.overall_att) or self.overall_att == 0:
+            return np.nan
+        return self.overall_se / abs(self.overall_att)
     def summary(self, alpha: Optional[float] = None) -> str:
         """Generate formatted summary of estimation results."""
         alpha = alpha or self.alpha
@@ -219,10 +228,15 @@ class EfficientDiDResults:
                 "",
                 f"{conf_level}% Confidence Interval: "
                 f"[{self.overall_conf_int[0]:.4f}, {self.overall_conf_int[1]:.4f}]",
-                "",
             ]
         )
+        cv = self.coef_var
+        if np.isfinite(cv):
+            lines.append(f"{'CV (SE/|ATT|):':<25} {cv:>10.4f}")
+        lines.append("")
         # Event study effects
         if self.event_study_effects:
             lines.extend(

{diff_diff-2.8.2 → diff_diff-2.8.4}/diff_diff/estimators.py RENAMED Viewed

@@ -240,14 +240,14 @@ class DifferenceInDifferences:
         resolved_survey, survey_weights, survey_weight_type, survey_metadata = (
             _resolve_survey_for_fit(survey_design, data, self.inference)
         )
-        # Reject replicate-weight designs — base DiD uses compute_survey_vcov
-        # (TSL) directly, not LinearRegression's replicate dispatch.
-        if resolved_survey is not None and resolved_survey.uses_replicate_variance:
-            raise NotImplementedError(
-                "DifferenceInDifferences does not yet support replicate-weight "
-                "survey designs. Use CallawaySantAnna, EfficientDiD, "
-                "ContinuousDiD, or TripleDifference for replicate-weight "
-                "inference, or use a TSL-based survey design (strata/psu/fpc)."
+        _uses_replicate = (
+            resolved_survey is not None and resolved_survey.uses_replicate_variance
+        )
+        if _uses_replicate and self.inference == "wild_bootstrap":
+            raise ValueError(
+                "Cannot use inference='wild_bootstrap' with replicate-weight "
+                "survey designs. Replicate weights provide their own variance "
+                "estimation."
             )
         # Handle absorbed fixed effects (within-transformation)
@@ -358,6 +358,13 @@ class DifferenceInDifferences:
                 )
                 survey_metadata = compute_survey_metadata(resolved_survey, raw_w)
+        # When absorb + replicate: pass survey_design=None to prevent
+        # LinearRegression from computing replicate vcov on already-demeaned
+        # data (demeaning depends on weights, so replicate refits must re-demean).
+        _lr_survey = resolved_survey
+        if _uses_replicate and absorbed_vars:
+            _lr_survey = None
         reg = LinearRegression(
             include_intercept=False,  # Intercept already in X
             robust=self.robust,
@@ -366,7 +373,7 @@ class DifferenceInDifferences:
             rank_deficient_action=self.rank_deficient_action,
             weights=survey_weights,
             weight_type=survey_weight_type,
-            survey_design=resolved_survey,
+            survey_design=_lr_survey,
         ).fit(X, y, df_adjustment=n_absorbed_effects)
         coefficients = reg.coefficients_
@@ -375,14 +382,69 @@ class DifferenceInDifferences:
         assert coefficients is not None
         att = coefficients[att_idx]
-        # Get inference - either from bootstrap or analytical
-        if self.inference == "wild_bootstrap" and self.cluster is not None:
+        # Get inference - replicate absorb override, bootstrap, or analytical
+        if _uses_replicate and absorbed_vars:
+            # Estimator-level replicate variance: re-demean + re-solve per replicate
+            from diff_diff.survey import compute_replicate_refit_variance
+            from diff_diff.utils import safe_inference
+            _absorb_list = list(absorbed_vars)  # capture for closure
+            # Handle rank-deficient nuisance: refit only identified columns
+            _id_mask = ~np.isnan(coefficients)
+            _id_cols = np.where(_id_mask)[0]
+            _att_idx_reduced = int(np.searchsorted(_id_cols, att_idx))
+            def _refit_did_absorb(w_r):
+                nz = w_r > 0
+                wd = data[nz].copy()
+                w_nz = w_r[nz]
+                wd["_treat_time"] = (
+                    wd[treatment].values.astype(float) * wd[time].values.astype(float)
+                )
+                vars_dm = [outcome, treatment, time, "_treat_time"] + (covariates or [])
+                for ab_var in _absorb_list:
+                    wd, _ = demean_by_group(wd, vars_dm, ab_var, inplace=True, weights=w_nz)
+                y_r = wd[outcome].values.astype(float)
+                d_r = wd[treatment].values.astype(float)
+                t_r = wd[time].values.astype(float)
+                dt_r = wd["_treat_time"].values.astype(float)
+                X_r = np.column_stack([np.ones(len(y_r)), d_r, t_r, dt_r])
+                if covariates:
+                    for cov in covariates:
+                        X_r = np.column_stack([X_r, wd[cov].values.astype(float)])
+                coef_r, _, _ = solve_ols(
+                    X_r[:, _id_cols], y_r,
+                    weights=w_nz, weight_type=survey_weight_type,
+                    rank_deficient_action="silent", return_vcov=False,
+                )
+                return coef_r
+            vcov_reduced, _n_valid_rep = compute_replicate_refit_variance(
+                _refit_did_absorb, coefficients[_id_mask], resolved_survey
+            )
+            vcov = _expand_vcov_with_nan(vcov_reduced, len(coefficients), _id_cols)
+            se = float(np.sqrt(max(vcov[att_idx, att_idx], 0.0)))
+            _df_rep = (
+                survey_metadata.df_survey
+                if survey_metadata and survey_metadata.df_survey
+                else 0  # rank-deficient replicate → NaN inference
+            )
+            if _n_valid_rep < resolved_survey.n_replicates:
+                _df_rep = _n_valid_rep - 1 if _n_valid_rep > 1 else 0
+            if survey_metadata is not None:
+                survey_metadata.df_survey = _df_rep if _df_rep > 0 else None
+            t_stat, p_value, conf_int = safe_inference(
+                att, se, alpha=self.alpha, df=_df_rep
+            )
+        elif self.inference == "wild_bootstrap" and self.cluster is not None:
             # Override with wild cluster bootstrap inference
             se, p_value, conf_int, t_stat, vcov, _ = self._run_wild_bootstrap_inference(
                 X, y, residuals, cluster_ids, att_idx
             )
         else:
             # Use analytical inference from LinearRegression
+            # (handles replicate vcov for no-absorb path automatically)
             vcov = reg.vcov_
             inference = reg.get_inference(att_idx)
             se = inference.se
@@ -1017,14 +1079,14 @@ class MultiPeriodDiD(DifferenceInDifferences):
         resolved_survey, survey_weights, survey_weight_type, survey_metadata = (
             _resolve_survey_for_fit(survey_design, data, effective_inference)
         )
-        # Reject replicate-weight designs — MultiPeriodDiD uses
-        # compute_survey_vcov (TSL) directly without replicate dispatch.
-        if resolved_survey is not None and resolved_survey.uses_replicate_variance:
-            raise NotImplementedError(
-                "MultiPeriodDiD does not yet support replicate-weight survey "
-                "designs. Use CallawaySantAnna for staggered adoption with "
-                "replicate weights, or use a TSL-based survey design "
-                "(strata/psu/fpc)."
+        _uses_replicate_mp = (
+            resolved_survey is not None and resolved_survey.uses_replicate_variance
+        )
+        if _uses_replicate_mp and effective_inference == "wild_bootstrap":
+            raise ValueError(
+                "Cannot use inference='wild_bootstrap' with replicate-weight "
+                "survey designs. Replicate weights provide their own variance "
+                "estimation."
             )
         # Handle absorbed fixed effects (within-transformation)
@@ -1177,7 +1239,80 @@ class MultiPeriodDiD(DifferenceInDifferences):
         )
         # Compute survey vcov if applicable
-        if _use_survey_vcov:
+        _n_valid_rep_mp = None
+        if _use_survey_vcov and _uses_replicate_mp and absorb:
+            # Absorb + replicate: estimator-level refit (demeaning depends on weights)
+            from diff_diff.survey import compute_replicate_refit_variance
+            _absorb_list_mp = list(absorb)
+            # Handle rank-deficient nuisance: refit only identified columns
+            _id_mask_mp = ~np.isnan(coefficients)
+            _id_cols_mp = np.where(_id_mask_mp)[0]
+            def _refit_mp_absorb(w_r):
+                nz = w_r > 0
+                wd = data[nz].copy()
+                w_nz = w_r[nz]
+                d_raw_ = wd[treatment].values.astype(float)
+                t_raw_ = wd[time].values
+                wd["_did_treatment"] = d_raw_
+                for period_ in non_ref_periods:
+                    wd[f"_did_period_{period_}"] = (t_raw_ == period_).astype(float)
+                    wd[f"_did_interact_{period_}"] = d_raw_ * (t_raw_ == period_).astype(float)
+                vars_dm_ = (
+                    [outcome, "_did_treatment"]
+                    + [f"_did_period_{p}" for p in non_ref_periods]
+                    + [f"_did_interact_{p}" for p in non_ref_periods]
+                    + (covariates or [])
+                )
+                for ab_var_ in _absorb_list_mp:
+                    wd, _ = demean_by_group(wd, vars_dm_, ab_var_, inplace=True, weights=w_nz)
+                y_r = wd[outcome].values.astype(float)
+                d_r = wd["_did_treatment"].values.astype(float)
+                X_r = np.column_stack([np.ones(len(y_r)), d_r])
+                for period_ in non_ref_periods:
+                    X_r = np.column_stack(
+                        [X_r, wd[f"_did_period_{period_}"].values.astype(float)]
+                    )
+                for period_ in non_ref_periods:
+                    X_r = np.column_stack(
+                        [X_r, wd[f"_did_interact_{period_}"].values.astype(float)]
+                    )
+                if covariates:
+                    for cov_ in covariates:
+                        X_r = np.column_stack([X_r, wd[cov_].values.astype(float)])
+                coef_r, _, _ = solve_ols(
+                    X_r[:, _id_cols_mp], y_r,
+                    weights=w_nz, weight_type=survey_weight_type,
+                    rank_deficient_action="silent", return_vcov=False,
+                )
+                return coef_r
+            vcov_reduced_mp, _n_valid_rep_mp = compute_replicate_refit_variance(
+                _refit_mp_absorb, coefficients[_id_mask_mp], resolved_survey
+            )
+            vcov = _expand_vcov_with_nan(vcov_reduced_mp, len(coefficients), _id_cols_mp)
+        elif _use_survey_vcov and _uses_replicate_mp:
+            # No absorb + replicate: X is fixed, use compute_replicate_vcov directly
+            from diff_diff.survey import compute_replicate_vcov
+            nan_mask = np.isnan(coefficients)
+            if np.any(nan_mask):
+                kept_cols = np.where(~nan_mask)[0]
+                if len(kept_cols) > 0:
+                    vcov_reduced, _n_valid_rep_mp = compute_replicate_vcov(
+                        X[:, kept_cols], y, coefficients[kept_cols], resolved_survey,
+                        weight_type=survey_weight_type,
+                    )
+                    vcov = _expand_vcov_with_nan(vcov_reduced, X.shape[1], kept_cols)
+                else:
+                    vcov = np.full((X.shape[1], X.shape[1]), np.nan)
+                    _n_valid_rep_mp = 0
+            else:
+                vcov, _n_valid_rep_mp = compute_replicate_vcov(
+                    X, y, coefficients, resolved_survey, weight_type=survey_weight_type,
+                )
+        elif _use_survey_vcov:
             from diff_diff.survey import compute_survey_vcov
             nan_mask = np.isnan(coefficients)
@@ -1201,9 +1336,18 @@ class MultiPeriodDiD(DifferenceInDifferences):
         df = n_eff_df - k_effective - n_absorbed_effects
         if resolved_survey is not None and resolved_survey.df_survey is not None:
             df = resolved_survey.df_survey
+        # Replicate df: rank-deficient → NaN inference; dropped replicates → n_valid-1
+        if _uses_replicate_mp:
+            if resolved_survey.df_survey is None:
+                df = 0  # rank-deficient replicate → NaN inference
+            if _n_valid_rep_mp is not None and _n_valid_rep_mp < resolved_survey.n_replicates:
+                df = _n_valid_rep_mp - 1 if _n_valid_rep_mp > 1 else 0
+                if survey_metadata is not None:
+                    survey_metadata.df_survey = df if df > 0 else None
         # Guard: fall back to normal distribution if df is non-positive
-        if df is not None and df <= 0:
+        # Skip for replicate designs — df=0 is intentional for NaN inference
+        if df is not None and df <= 0 and not _uses_replicate_mp:
             warnings.warn(
                 f"Degrees of freedom is non-positive (df={df}). "
                 "Using normal distribution instead of t-distribution for inference.",

diff-diff 2.8.2__tar.gz → 2.8.4__tar.gz

diff-diff 2.8.2.tar.gz → 2.8.4.tar.gz