diff-diff 2.1.2__tar.gz → 2.1.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (30) hide show
  1. {diff_diff-2.1.2 → diff_diff-2.1.3}/PKG-INFO +1 -1
  2. {diff_diff-2.1.2 → diff_diff-2.1.3}/diff_diff/__init__.py +1 -1
  3. {diff_diff-2.1.2 → diff_diff-2.1.3}/diff_diff/trop.py +184 -46
  4. {diff_diff-2.1.2 → diff_diff-2.1.3}/pyproject.toml +1 -1
  5. {diff_diff-2.1.2 → diff_diff-2.1.3}/rust/Cargo.lock +1 -1
  6. {diff_diff-2.1.2 → diff_diff-2.1.3}/rust/Cargo.toml +1 -1
  7. {diff_diff-2.1.2 → diff_diff-2.1.3}/rust/src/trop.rs +131 -60
  8. {diff_diff-2.1.2 → diff_diff-2.1.3}/README.md +0 -0
  9. {diff_diff-2.1.2 → diff_diff-2.1.3}/diff_diff/_backend.py +0 -0
  10. {diff_diff-2.1.2 → diff_diff-2.1.3}/diff_diff/bacon.py +0 -0
  11. {diff_diff-2.1.2 → diff_diff-2.1.3}/diff_diff/datasets.py +0 -0
  12. {diff_diff-2.1.2 → diff_diff-2.1.3}/diff_diff/diagnostics.py +0 -0
  13. {diff_diff-2.1.2 → diff_diff-2.1.3}/diff_diff/estimators.py +0 -0
  14. {diff_diff-2.1.2 → diff_diff-2.1.3}/diff_diff/honest_did.py +0 -0
  15. {diff_diff-2.1.2 → diff_diff-2.1.3}/diff_diff/linalg.py +0 -0
  16. {diff_diff-2.1.2 → diff_diff-2.1.3}/diff_diff/power.py +0 -0
  17. {diff_diff-2.1.2 → diff_diff-2.1.3}/diff_diff/prep.py +0 -0
  18. {diff_diff-2.1.2 → diff_diff-2.1.3}/diff_diff/pretrends.py +0 -0
  19. {diff_diff-2.1.2 → diff_diff-2.1.3}/diff_diff/results.py +0 -0
  20. {diff_diff-2.1.2 → diff_diff-2.1.3}/diff_diff/staggered.py +0 -0
  21. {diff_diff-2.1.2 → diff_diff-2.1.3}/diff_diff/sun_abraham.py +0 -0
  22. {diff_diff-2.1.2 → diff_diff-2.1.3}/diff_diff/synthetic_did.py +0 -0
  23. {diff_diff-2.1.2 → diff_diff-2.1.3}/diff_diff/triple_diff.py +0 -0
  24. {diff_diff-2.1.2 → diff_diff-2.1.3}/diff_diff/twfe.py +0 -0
  25. {diff_diff-2.1.2 → diff_diff-2.1.3}/diff_diff/utils.py +0 -0
  26. {diff_diff-2.1.2 → diff_diff-2.1.3}/diff_diff/visualization.py +0 -0
  27. {diff_diff-2.1.2 → diff_diff-2.1.3}/rust/src/bootstrap.rs +0 -0
  28. {diff_diff-2.1.2 → diff_diff-2.1.3}/rust/src/lib.rs +0 -0
  29. {diff_diff-2.1.2 → diff_diff-2.1.3}/rust/src/linalg.rs +0 -0
  30. {diff_diff-2.1.2 → diff_diff-2.1.3}/rust/src/weights.rs +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: diff-diff
3
- Version: 2.1.2
3
+ Version: 2.1.3
4
4
  Classifier: Development Status :: 5 - Production/Stable
5
5
  Classifier: Intended Audience :: Science/Research
6
6
  Classifier: Operating System :: OS Independent
@@ -136,7 +136,7 @@ from diff_diff.datasets import (
136
136
  load_mpdta,
137
137
  )
138
138
 
139
- __version__ = "2.1.2"
139
+ __version__ = "2.1.3"
140
140
  __all__ = [
141
141
  # Estimators
142
142
  "DifferenceInDifferences",
@@ -63,7 +63,11 @@ class _PrecomputedStructures(TypedDict):
63
63
  control_obs: List[Tuple[int, int]]
64
64
  """List of (t, i) tuples for valid control observations."""
65
65
  control_unit_idx: np.ndarray
66
- """Array of control unit indices."""
66
+ """Array of never-treated unit indices (for backward compatibility)."""
67
+ D: np.ndarray
68
+ """Treatment indicator matrix (n_periods x n_units) for dynamic control sets."""
69
+ Y: np.ndarray
70
+ """Outcome matrix (n_periods x n_units)."""
67
71
  n_units: int
68
72
  """Number of units."""
69
73
  n_periods: int
@@ -529,6 +533,8 @@ class TROP:
529
533
  "treated_observations": treated_observations,
530
534
  "control_obs": control_obs,
531
535
  "control_unit_idx": control_unit_idx,
536
+ "D": D,
537
+ "Y": Y,
532
538
  "n_units": n_units,
533
539
  "n_periods": n_periods,
534
540
  }
@@ -778,16 +784,14 @@ class TROP:
778
784
  # Prepare inputs for Rust function
779
785
  control_mask_u8 = control_mask.astype(np.uint8)
780
786
  time_dist_matrix = self._precomputed["time_dist_matrix"].astype(np.int64)
781
- unit_dist_matrix = self._precomputed["unit_dist_matrix"]
782
- control_unit_idx_i64 = control_unit_idx.astype(np.int64)
783
787
 
784
788
  lambda_time_arr = np.array(self.lambda_time_grid, dtype=np.float64)
785
789
  lambda_unit_arr = np.array(self.lambda_unit_grid, dtype=np.float64)
786
790
  lambda_nn_arr = np.array(self.lambda_nn_grid, dtype=np.float64)
787
791
 
788
792
  best_lt, best_lu, best_ln, best_score = _rust_loocv_grid_search(
789
- Y, D.astype(np.float64), control_mask_u8, control_unit_idx_i64,
790
- unit_dist_matrix, time_dist_matrix,
793
+ Y, D.astype(np.float64), control_mask_u8,
794
+ time_dist_matrix,
791
795
  lambda_time_arr, lambda_unit_arr, lambda_nn_arr,
792
796
  self.max_loocv_samples, self.max_iter, self.tol,
793
797
  self.seed if self.seed is not None else 0
@@ -953,10 +957,16 @@ class TROP:
953
957
  """
954
958
  Compute observation-specific weight matrix for treated observation (i, t).
955
959
 
956
- Following the paper's Algorithm 2 (page 27):
960
+ Following the paper's Algorithm 2 (page 27) and Equation 2 (page 7):
957
961
  - Time weights θ_s^{i,t} = exp(-λ_time × |t - s|)
958
962
  - Unit weights ω_j^{i,t} = exp(-λ_unit × dist_unit_{-t}(j, i))
959
963
 
964
+ IMPORTANT (Issue A fix): The paper's objective sums over ALL observations
965
+ where (1 - W_js) is non-zero, which includes pre-treatment observations of
966
+ eventually-treated units since W_js = 0 for those. This method computes
967
+ weights for ALL units where D[t, j] = 0 at the target period, not just
968
+ never-treated units.
969
+
960
970
  Uses pre-computed structures when available for efficiency.
961
971
 
962
972
  Parameters
@@ -974,7 +984,8 @@ class TROP:
974
984
  lambda_unit : float
975
985
  Unit weight decay parameter.
976
986
  control_unit_idx : np.ndarray
977
- Indices of control units.
987
+ Indices of never-treated units (for backward compatibility, but not
988
+ used for weight computation - we use D matrix directly).
978
989
  n_units : int
979
990
  Number of units.
980
991
  n_periods : int
@@ -991,21 +1002,30 @@ class TROP:
991
1002
  # time_dist_matrix[t, s] = |t - s|
992
1003
  time_weights = np.exp(-lambda_time * self._precomputed["time_dist_matrix"][t, :])
993
1004
 
994
- # Unit weights from pre-computed unit distance matrix
1005
+ # Unit weights - computed for ALL units where D[t, j] = 0
1006
+ # (Issue A fix: includes pre-treatment obs of eventually-treated units)
995
1007
  unit_weights = np.zeros(n_units)
1008
+ D_stored = self._precomputed["D"]
1009
+ Y_stored = self._precomputed["Y"]
1010
+
1011
+ # Valid control units at time t: D[t, j] == 0
1012
+ valid_control_at_t = D_stored[t, :] == 0
996
1013
 
997
1014
  if lambda_unit == 0:
998
1015
  # Uniform weights when lambda_unit = 0
999
- unit_weights[:] = 1.0
1016
+ # All units not treated at time t get weight 1
1017
+ unit_weights[valid_control_at_t] = 1.0
1000
1018
  else:
1001
- # Use pre-computed distances: unit_dist_matrix[j, i] = dist(j, i)
1002
- dist_matrix = self._precomputed["unit_dist_matrix"]
1003
- for j in control_unit_idx:
1004
- dist = dist_matrix[j, i]
1005
- if np.isinf(dist):
1006
- unit_weights[j] = 0.0
1007
- else:
1008
- unit_weights[j] = np.exp(-lambda_unit * dist)
1019
+ # Use observation-specific distances with target period excluded
1020
+ # (Issue B fix: compute exact per-observation distance)
1021
+ for j in range(n_units):
1022
+ if valid_control_at_t[j] and j != i:
1023
+ # Compute distance excluding target period t
1024
+ dist = self._compute_unit_distance_for_obs(Y_stored, D_stored, j, i, t)
1025
+ if np.isinf(dist):
1026
+ unit_weights[j] = 0.0
1027
+ else:
1028
+ unit_weights[j] = np.exp(-lambda_unit * dist)
1009
1029
 
1010
1030
  # Treated unit i gets weight 1
1011
1031
  unit_weights[i] = 1.0
@@ -1018,19 +1038,25 @@ class TROP:
1018
1038
  dist_time = np.abs(np.arange(n_periods) - t)
1019
1039
  time_weights = np.exp(-lambda_time * dist_time)
1020
1040
 
1021
- # Unit distance: pairwise RMSE from each control j to treated i
1041
+ # Unit weights - computed for ALL units where D[t, j] = 0
1042
+ # (Issue A fix: includes pre-treatment obs of eventually-treated units)
1022
1043
  unit_weights = np.zeros(n_units)
1023
1044
 
1045
+ # Valid control units at time t: D[t, j] == 0
1046
+ valid_control_at_t = D[t, :] == 0
1047
+
1024
1048
  if lambda_unit == 0:
1025
1049
  # Uniform weights when lambda_unit = 0
1026
- unit_weights[:] = 1.0
1050
+ unit_weights[valid_control_at_t] = 1.0
1027
1051
  else:
1028
- for j in control_unit_idx:
1029
- dist = self._compute_unit_distance_for_obs(Y, D, j, i, t)
1030
- if np.isinf(dist):
1031
- unit_weights[j] = 0.0
1032
- else:
1033
- unit_weights[j] = np.exp(-lambda_unit * dist)
1052
+ for j in range(n_units):
1053
+ if valid_control_at_t[j] and j != i:
1054
+ # Compute distance excluding target period t (Issue B fix)
1055
+ dist = self._compute_unit_distance_for_obs(Y, D, j, i, t)
1056
+ if np.isinf(dist):
1057
+ unit_weights[j] = 0.0
1058
+ else:
1059
+ unit_weights[j] = np.exp(-lambda_unit * dist)
1034
1060
 
1035
1061
  # Treated unit i gets weight 1 (or could be omitted since we fit on controls)
1036
1062
  # We include treated unit's own observation for model fitting
@@ -1102,6 +1128,101 @@ class TROP:
1102
1128
 
1103
1129
  return result
1104
1130
 
1131
+ def _weighted_nuclear_norm_solve(
1132
+ self,
1133
+ Y: np.ndarray,
1134
+ W: np.ndarray,
1135
+ L_init: np.ndarray,
1136
+ alpha: np.ndarray,
1137
+ beta: np.ndarray,
1138
+ lambda_nn: float,
1139
+ max_inner_iter: int = 20,
1140
+ ) -> np.ndarray:
1141
+ """
1142
+ Solve weighted nuclear norm problem using iterative weighted soft-impute.
1143
+
1144
+ Issue C fix: Implements the weighted nuclear norm optimization from the
1145
+ paper's Equation 2 (page 7). The full objective is:
1146
+ min_L Σ W_{ti}(R_{ti} - L_{ti})² + λ_nn||L||_*
1147
+
1148
+ This uses a proximal gradient / soft-impute approach (Mazumder et al. 2010):
1149
+ L_{k+1} = prox_{λ||·||_*}(L_k + W ⊙ (R - L_k))
1150
+
1151
+ where W ⊙ denotes element-wise multiplication with normalized weights.
1152
+
1153
+ IMPORTANT: For observations with W=0 (treated observations), we keep
1154
+ L values from the previous iteration rather than setting L = R, which
1155
+ would absorb the treatment effect.
1156
+
1157
+ Parameters
1158
+ ----------
1159
+ Y : np.ndarray
1160
+ Outcome matrix (n_periods x n_units).
1161
+ W : np.ndarray
1162
+ Weight matrix (n_periods x n_units), non-negative. W=0 indicates
1163
+ observations that should not be used for fitting (treated obs).
1164
+ L_init : np.ndarray
1165
+ Initial estimate of L matrix.
1166
+ alpha : np.ndarray
1167
+ Current unit fixed effects estimate.
1168
+ beta : np.ndarray
1169
+ Current time fixed effects estimate.
1170
+ lambda_nn : float
1171
+ Nuclear norm regularization parameter.
1172
+ max_inner_iter : int, default=20
1173
+ Maximum inner iterations for the proximal algorithm.
1174
+
1175
+ Returns
1176
+ -------
1177
+ np.ndarray
1178
+ Updated L matrix estimate.
1179
+ """
1180
+ # Compute target residual R = Y - α - β
1181
+ R = Y - alpha[np.newaxis, :] - beta[:, np.newaxis]
1182
+
1183
+ # Handle invalid values
1184
+ R = np.nan_to_num(R, nan=0.0, posinf=0.0, neginf=0.0)
1185
+
1186
+ # For observations with W=0 (treated obs), keep L_init instead of R
1187
+ # This prevents L from absorbing the treatment effect
1188
+ valid_obs_mask = W > 0
1189
+ R_masked = np.where(valid_obs_mask, R, L_init)
1190
+
1191
+ if lambda_nn <= 0:
1192
+ # No regularization - just return masked residual
1193
+ # Use soft-thresholding with threshold=0 which returns the input
1194
+ return R_masked
1195
+
1196
+ # Normalize weights so max is 1 (for step size stability)
1197
+ W_max = np.max(W)
1198
+ if W_max > 0:
1199
+ W_norm = W / W_max
1200
+ else:
1201
+ W_norm = W
1202
+
1203
+ # Initialize L
1204
+ L = L_init.copy()
1205
+
1206
+ # Proximal gradient iteration with weighted soft-impute
1207
+ # This solves: min_L ||W^{1/2} ⊙ (R - L)||_F^2 + λ||L||_*
1208
+ # Using: L_{k+1} = prox_{λ/η}(L_k + W ⊙ (R - L_k))
1209
+ # where η is the step size (we use η = 1 with normalized weights)
1210
+ for _ in range(max_inner_iter):
1211
+ L_old = L.copy()
1212
+
1213
+ # Gradient step: L_k + W ⊙ (R - L_k)
1214
+ # For W=0 observations, this keeps L_k unchanged
1215
+ gradient_step = L + W_norm * (R_masked - L)
1216
+
1217
+ # Proximal step: soft-threshold singular values
1218
+ L = self._soft_threshold_svd(gradient_step, lambda_nn)
1219
+
1220
+ # Check convergence
1221
+ if np.max(np.abs(L - L_old)) < self.tol:
1222
+ break
1223
+
1224
+ return L
1225
+
1105
1226
  def _estimate_model(
1106
1227
  self,
1107
1228
  Y: np.ndarray,
@@ -1205,14 +1326,13 @@ class TROP:
1205
1326
  beta_numerator = np.sum(weighted_R_minus_alpha, axis=1) # (n_periods,)
1206
1327
  beta = np.where(time_has_obs, beta_numerator / safe_time_denom, 0.0)
1207
1328
 
1208
- # Step 2: Update L with nuclear norm penalty
1209
- # Following Equation 2 (page 7): L = prox_{λ_nn||·||_*}(Y - α - β)
1210
- # The proximal operator for nuclear norm is soft-thresholding of SVD
1211
- R_for_L = Y_safe - alpha[np.newaxis, :] - beta[:, np.newaxis]
1212
- # Impute invalid observations with current L for stable SVD
1213
- R_for_L = np.where(valid_mask, R_for_L, L)
1214
-
1215
- L = self._soft_threshold_svd(R_for_L, lambda_nn)
1329
+ # Step 2: Update L with weighted nuclear norm penalty
1330
+ # Issue C fix: Use weighted soft-impute to properly account for
1331
+ # observation weights in the nuclear norm optimization.
1332
+ # Following Equation 2 (page 7): min_L Σ W_{ti}(Y - α - β - L)² + λ||L||_*
1333
+ L = self._weighted_nuclear_norm_solve(
1334
+ Y_safe, W_masked, L, alpha, beta, lambda_nn, max_inner_iter=10
1335
+ )
1216
1336
 
1217
1337
  # Check convergence
1218
1338
  alpha_diff = np.max(np.abs(alpha - alpha_old))
@@ -1388,21 +1508,15 @@ class TROP:
1388
1508
  # Try Rust backend for parallel bootstrap (5-15x speedup)
1389
1509
  if (HAS_RUST_BACKEND and _rust_bootstrap_trop_variance is not None
1390
1510
  and self._precomputed is not None and Y is not None
1391
- and D is not None and control_unit_idx is not None):
1511
+ and D is not None):
1392
1512
  try:
1393
- # Prepare inputs
1394
- treated_observations = self._precomputed["treated_observations"]
1395
- treated_t = np.array([t for t, i in treated_observations], dtype=np.int64)
1396
- treated_i = np.array([i for t, i in treated_observations], dtype=np.int64)
1397
1513
  control_mask = self._precomputed["control_mask"]
1514
+ time_dist_matrix = self._precomputed["time_dist_matrix"].astype(np.int64)
1398
1515
 
1399
1516
  bootstrap_estimates, se = _rust_bootstrap_trop_variance(
1400
1517
  Y, D.astype(np.float64),
1401
1518
  control_mask.astype(np.uint8),
1402
- control_unit_idx.astype(np.int64),
1403
- treated_t, treated_i,
1404
- self._precomputed["unit_dist_matrix"],
1405
- self._precomputed["time_dist_matrix"].astype(np.int64),
1519
+ time_dist_matrix,
1406
1520
  lambda_time, lambda_unit, lambda_nn,
1407
1521
  self.n_bootstrap, self.max_iter, self.tol,
1408
1522
  self.seed if self.seed is not None else 0
@@ -1422,14 +1536,38 @@ class TROP:
1422
1536
 
1423
1537
  # Python implementation (fallback)
1424
1538
  rng = np.random.default_rng(self.seed)
1425
- all_units = data[unit].unique()
1426
- n_units_data = len(all_units)
1539
+
1540
+ # Issue D fix: Stratified bootstrap sampling
1541
+ # Paper's Algorithm 3 (page 27) specifies sampling N_0 control rows
1542
+ # and N_1 treated rows separately to preserve treatment ratio
1543
+ unit_ever_treated = data.groupby(unit)[treatment].max()
1544
+ treated_units = np.array(unit_ever_treated[unit_ever_treated == 1].index)
1545
+ control_units = np.array(unit_ever_treated[unit_ever_treated == 0].index)
1546
+
1547
+ n_treated_units = len(treated_units)
1548
+ n_control_units = len(control_units)
1427
1549
 
1428
1550
  bootstrap_estimates_list = []
1429
1551
 
1430
1552
  for _ in range(self.n_bootstrap):
1431
- # Sample units with replacement
1432
- sampled_units = rng.choice(all_units, size=n_units_data, replace=True)
1553
+ # Stratified sampling: sample control and treated units separately
1554
+ # This preserves the treatment ratio in each bootstrap sample
1555
+ if n_control_units > 0:
1556
+ sampled_control = rng.choice(
1557
+ control_units, size=n_control_units, replace=True
1558
+ )
1559
+ else:
1560
+ sampled_control = np.array([], dtype=control_units.dtype)
1561
+
1562
+ if n_treated_units > 0:
1563
+ sampled_treated = rng.choice(
1564
+ treated_units, size=n_treated_units, replace=True
1565
+ )
1566
+ else:
1567
+ sampled_treated = np.array([], dtype=treated_units.dtype)
1568
+
1569
+ # Combine stratified samples
1570
+ sampled_units = np.concatenate([sampled_control, sampled_treated])
1433
1571
 
1434
1572
  # Create bootstrap sample with unique unit IDs
1435
1573
  boot_data = pd.concat([
@@ -4,7 +4,7 @@ build-backend = "maturin"
4
4
 
5
5
  [project]
6
6
  name = "diff-diff"
7
- version = "2.1.2"
7
+ version = "2.1.3"
8
8
  description = "A library for Difference-in-Differences causal inference analysis"
9
9
  readme = "README.md"
10
10
  license = "MIT"
@@ -289,7 +289,7 @@ dependencies = [
289
289
 
290
290
  [[package]]
291
291
  name = "diff_diff_rust"
292
- version = "2.1.2"
292
+ version = "2.1.3"
293
293
  dependencies = [
294
294
  "ndarray",
295
295
  "ndarray-linalg",
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "diff_diff_rust"
3
- version = "2.1.2"
3
+ version = "2.1.3"
4
4
  edition = "2021"
5
5
  description = "Rust backend for diff-diff DiD library"
6
6
  license = "MIT"
@@ -172,15 +172,13 @@ fn compute_pair_distance(
172
172
  /// # Returns
173
173
  /// (best_lambda_time, best_lambda_unit, best_lambda_nn, best_score)
174
174
  #[pyfunction]
175
- #[pyo3(signature = (y, d, control_mask, control_unit_idx, unit_dist_matrix, time_dist_matrix, lambda_time_grid, lambda_unit_grid, lambda_nn_grid, max_loocv_samples, max_iter, tol, seed))]
175
+ #[pyo3(signature = (y, d, control_mask, time_dist_matrix, lambda_time_grid, lambda_unit_grid, lambda_nn_grid, max_loocv_samples, max_iter, tol, seed))]
176
176
  #[allow(clippy::too_many_arguments)]
177
177
  pub fn loocv_grid_search<'py>(
178
178
  _py: Python<'py>,
179
179
  y: PyReadonlyArray2<'py, f64>,
180
180
  d: PyReadonlyArray2<'py, f64>,
181
181
  control_mask: PyReadonlyArray2<'py, u8>,
182
- control_unit_idx: PyReadonlyArray1<'py, i64>,
183
- unit_dist_matrix: PyReadonlyArray2<'py, f64>,
184
182
  time_dist_matrix: PyReadonlyArray2<'py, i64>,
185
183
  lambda_time_grid: PyReadonlyArray1<'py, f64>,
186
184
  lambda_unit_grid: PyReadonlyArray1<'py, f64>,
@@ -193,19 +191,11 @@ pub fn loocv_grid_search<'py>(
193
191
  let y_arr = y.as_array();
194
192
  let d_arr = d.as_array();
195
193
  let control_mask_arr = control_mask.as_array();
196
- let control_unit_idx_arr = control_unit_idx.as_array();
197
- let unit_dist_arr = unit_dist_matrix.as_array();
198
194
  let time_dist_arr = time_dist_matrix.as_array();
199
195
  let lambda_time_vec: Vec<f64> = lambda_time_grid.as_array().to_vec();
200
196
  let lambda_unit_vec: Vec<f64> = lambda_unit_grid.as_array().to_vec();
201
197
  let lambda_nn_vec: Vec<f64> = lambda_nn_grid.as_array().to_vec();
202
198
 
203
- // Convert control_unit_idx to Vec<usize>
204
- let control_units: Vec<usize> = control_unit_idx_arr
205
- .iter()
206
- .map(|&idx| idx as usize)
207
- .collect();
208
-
209
199
  // Get control observations for LOOCV
210
200
  let control_obs = get_control_observations(
211
201
  &y_arr,
@@ -232,8 +222,6 @@ pub fn loocv_grid_search<'py>(
232
222
  &y_arr,
233
223
  &d_arr,
234
224
  &control_mask_arr,
235
- &control_units,
236
- &unit_dist_arr,
237
225
  &time_dist_arr,
238
226
  &control_obs,
239
227
  lambda_time,
@@ -291,10 +279,8 @@ fn get_control_observations(
291
279
  /// Compute LOOCV score for a specific parameter combination.
292
280
  fn loocv_score_for_params(
293
281
  y: &ArrayView2<f64>,
294
- _d: &ArrayView2<f64>,
282
+ d: &ArrayView2<f64>,
295
283
  control_mask: &ArrayView2<u8>,
296
- control_units: &[usize],
297
- unit_dist: &ArrayView2<f64>,
298
284
  time_dist: &ArrayView2<i64>,
299
285
  control_obs: &[(usize, usize)],
300
286
  lambda_time: f64,
@@ -312,14 +298,14 @@ fn loocv_score_for_params(
312
298
  for &(t, i) in control_obs {
313
299
  // Compute observation-specific weight matrix
314
300
  let weight_matrix = compute_weight_matrix(
301
+ y,
302
+ d,
315
303
  n_periods,
316
304
  n_units,
317
305
  i,
318
306
  t,
319
307
  lambda_time,
320
308
  lambda_unit,
321
- control_units,
322
- unit_dist,
323
309
  time_dist,
324
310
  );
325
311
 
@@ -352,46 +338,107 @@ fn loocv_score_for_params(
352
338
  }
353
339
  }
354
340
 
341
+ /// Compute observation-specific distance from unit j to unit i, excluding target period.
342
+ ///
343
+ /// Issue B fix: Follows Equation 3 (page 7) which specifies 1{u ≠ t} to exclude target period.
344
+ fn compute_unit_distance_for_obs(
345
+ y: &ArrayView2<f64>,
346
+ d: &ArrayView2<f64>,
347
+ j: usize,
348
+ i: usize,
349
+ target_period: usize,
350
+ ) -> f64 {
351
+ let n_periods = y.nrows();
352
+ let mut sum_sq = 0.0;
353
+ let mut n_valid = 0usize;
354
+
355
+ for t in 0..n_periods {
356
+ // Exclude target period (Issue B fix)
357
+ if t == target_period {
358
+ continue;
359
+ }
360
+ // Both units must be control at this period and have valid values
361
+ if d[[t, i]] == 0.0 && d[[t, j]] == 0.0
362
+ && y[[t, i]].is_finite() && y[[t, j]].is_finite()
363
+ {
364
+ let diff = y[[t, i]] - y[[t, j]];
365
+ sum_sq += diff * diff;
366
+ n_valid += 1;
367
+ }
368
+ }
369
+
370
+ if n_valid > 0 {
371
+ (sum_sq / n_valid as f64).sqrt()
372
+ } else {
373
+ f64::INFINITY
374
+ }
375
+ }
376
+
355
377
  /// Compute observation-specific weight matrix for TROP.
356
378
  ///
357
379
  /// Time weights: θ_s = exp(-λ_time × |t - s|)
358
380
  /// Unit weights: ω_j = exp(-λ_unit × dist(j, i))
381
+ ///
382
+ /// Paper alignment notes:
383
+ /// - ALL units get weights (not just those untreated at target period)
384
+ /// - The (1 - D_js) masking in the loss naturally excludes treated cells
385
+ /// - Weights are normalized to sum to 1 (probability weights)
386
+ /// - Distance excludes target period t per Equation 3
359
387
  fn compute_weight_matrix(
388
+ y: &ArrayView2<f64>,
389
+ d: &ArrayView2<f64>,
360
390
  n_periods: usize,
361
391
  n_units: usize,
362
392
  target_unit: usize,
363
393
  target_period: usize,
364
394
  lambda_time: f64,
365
395
  lambda_unit: f64,
366
- control_units: &[usize],
367
- unit_dist: &ArrayView2<f64>,
368
396
  time_dist: &ArrayView2<i64>,
369
397
  ) -> Array2<f64> {
370
- // Time weights for this target period
371
- let time_weights: Array1<f64> = Array1::from_shape_fn(n_periods, |s| {
398
+ // Time weights for this target period: θ_s = exp(-λ_time × |t - s|)
399
+ let mut time_weights: Array1<f64> = Array1::from_shape_fn(n_periods, |s| {
372
400
  let dist = time_dist[[target_period, s]] as f64;
373
401
  (-lambda_time * dist).exp()
374
402
  });
375
403
 
376
- // Unit weights
404
+ // Normalize time weights to sum to 1
405
+ let time_sum: f64 = time_weights.sum();
406
+ if time_sum > 0.0 {
407
+ time_weights /= time_sum;
408
+ }
409
+
410
+ // Unit weights: ω_j = exp(-λ_unit × dist(j, i))
411
+ // Paper alignment: compute for ALL units, let control masking handle exclusion
377
412
  let mut unit_weights = Array1::<f64>::zeros(n_units);
378
413
 
379
414
  if lambda_unit == 0.0 {
380
415
  // Uniform weights when lambda_unit = 0
416
+ // All units get weight 1 (control masking will handle exclusion)
381
417
  unit_weights.fill(1.0);
382
418
  } else {
383
- for &j in control_units {
384
- let dist = unit_dist[[j, target_unit]];
385
- if dist.is_finite() {
386
- unit_weights[j] = (-lambda_unit * dist).exp();
419
+ // Compute per-observation distance for all units (excluding target unit itself)
420
+ for j in 0..n_units {
421
+ if j != target_unit {
422
+ let dist = compute_unit_distance_for_obs(y, d, j, target_unit, target_period);
423
+ if dist.is_finite() {
424
+ unit_weights[j] = (-lambda_unit * dist).exp();
425
+ }
426
+ // Units with infinite distance (no valid comparison periods) get weight 0
387
427
  }
388
428
  }
389
429
  }
390
430
 
391
- // Target unit gets weight 1
431
+ // Target unit gets weight 1 (will be masked out in estimation anyway)
392
432
  unit_weights[target_unit] = 1.0;
393
433
 
434
+ // Normalize unit weights to sum to 1
435
+ let unit_sum: f64 = unit_weights.sum();
436
+ if unit_sum > 0.0 {
437
+ unit_weights /= unit_sum;
438
+ }
439
+
394
440
  // Outer product: W[t, i] = time_weights[t] * unit_weights[i]
441
+ // Result is normalized since both components sum to 1
395
442
  let mut weight_matrix = Array2::<f64>::zeros((n_periods, n_units));
396
443
  for t in 0..n_periods {
397
444
  for i in 0..n_units {
@@ -406,6 +453,10 @@ fn compute_weight_matrix(
406
453
  ///
407
454
  /// Minimizes: Σ W_{ti}(Y_{ti} - α_i - β_t - L_{ti})² + λ_nn||L||_*
408
455
  ///
456
+ /// Paper alignment: Uses weighted proximal gradient for L update:
457
+ /// L ← prox_{η·λ_nn·||·||_*}(L + η·(W ⊙ (R - L)))
458
+ /// where η ≤ 1/max(W) for convergence.
459
+ ///
409
460
  /// Returns None if estimation fails due to numerical issues.
410
461
  fn estimate_model(
411
462
  y: &ArrayView2<f64>,
@@ -432,7 +483,7 @@ fn estimate_model(
432
483
  y[[t, i]].is_finite() && est_mask[[t, i]]
433
484
  });
434
485
 
435
- // Masked weights
486
+ // Masked weights: W=0 for invalid/treated observations
436
487
  let w_masked = Array2::from_shape_fn((n_periods, n_units), |(t, i)| {
437
488
  if valid_mask[[t, i]] {
438
489
  weight_matrix[[t, i]]
@@ -441,6 +492,10 @@ fn estimate_model(
441
492
  }
442
493
  });
443
494
 
495
+ // Compute step size for proximal gradient: η ≤ 1/max(W)
496
+ let w_max = w_masked.iter().cloned().fold(0.0_f64, f64::max);
497
+ let eta = if w_max > 0.0 { 1.0 / w_max } else { 1.0 };
498
+
444
499
  // Weight sums per unit and time
445
500
  let weight_sum_per_unit: Array1<f64> = w_masked.sum_axis(Axis(0));
446
501
  let weight_sum_per_time: Array1<f64> = w_masked.sum_axis(Axis(1));
@@ -472,7 +527,7 @@ fn estimate_model(
472
527
  let beta_old = beta.clone();
473
528
  let l_old = l.clone();
474
529
 
475
- // Step 1: Update α and β
530
+ // Step 1: Update α and β (weighted least squares)
476
531
  // R = Y - L
477
532
  let r = &y_safe - &l;
478
533
 
@@ -498,25 +553,31 @@ fn estimate_model(
498
553
  }
499
554
  }
500
555
 
501
- // Step 2: Update L with nuclear norm penalty
502
- // R_for_L = Y - α - β
503
- let mut r_for_l = Array2::<f64>::zeros((n_periods, n_units));
556
+ // Step 2: Update L with WEIGHTED nuclear norm penalty
557
+ // Paper alignment: Use proximal gradient instead of direct soft-thresholding
558
+ // L ← prox_{η·λ_nn·||·||_*}(L + η·(W ⊙ (R - L)))
559
+ // where R = Y - α - β
560
+
561
+ // Compute target residual R = Y - α - β
562
+ let mut r_target = Array2::<f64>::zeros((n_periods, n_units));
504
563
  for t in 0..n_periods {
505
564
  for i in 0..n_units {
506
- r_for_l[[t, i]] = y_safe[[t, i]] - alpha[i] - beta[t];
565
+ r_target[[t, i]] = y_safe[[t, i]] - alpha[i] - beta[t];
507
566
  }
508
567
  }
509
568
 
510
- // Impute invalid observations with current L
569
+ // Weighted proximal gradient step:
570
+ // gradient_step = L + η * W ⊙ (R - L)
571
+ // For W=0 cells (treated obs), this keeps L unchanged
572
+ let mut gradient_step = Array2::<f64>::zeros((n_periods, n_units));
511
573
  for t in 0..n_periods {
512
574
  for i in 0..n_units {
513
- if !valid_mask[[t, i]] {
514
- r_for_l[[t, i]] = l[[t, i]];
515
- }
575
+ gradient_step[[t, i]] = l[[t, i]] + eta * w_masked[[t, i]] * (r_target[[t, i]] - l[[t, i]]);
516
576
  }
517
577
  }
518
578
 
519
- l = soft_threshold_svd(&r_for_l, lambda_nn)?;
579
+ // Proximal step: soft-threshold singular values with scaled lambda
580
+ l = soft_threshold_svd(&gradient_step, eta * lambda_nn)?;
520
581
 
521
582
  // Check convergence
522
583
  let alpha_diff = max_abs_diff(&alpha, &alpha_old);
@@ -627,17 +688,13 @@ fn max_abs_diff_2d(a: &Array2<f64>, b: &Array2<f64>) -> f64 {
627
688
  /// # Returns
628
689
  /// (bootstrap_estimates, standard_error)
629
690
  #[pyfunction]
630
- #[pyo3(signature = (y, d, control_mask, control_unit_idx, treated_obs_t, treated_obs_i, unit_dist_matrix, time_dist_matrix, lambda_time, lambda_unit, lambda_nn, n_bootstrap, max_iter, tol, seed))]
691
+ #[pyo3(signature = (y, d, control_mask, time_dist_matrix, lambda_time, lambda_unit, lambda_nn, n_bootstrap, max_iter, tol, seed))]
631
692
  #[allow(clippy::too_many_arguments)]
632
693
  pub fn bootstrap_trop_variance<'py>(
633
694
  py: Python<'py>,
634
695
  y: PyReadonlyArray2<'py, f64>,
635
696
  d: PyReadonlyArray2<'py, f64>,
636
697
  control_mask: PyReadonlyArray2<'py, u8>,
637
- control_unit_idx: PyReadonlyArray1<'py, i64>,
638
- treated_obs_t: PyReadonlyArray1<'py, i64>,
639
- treated_obs_i: PyReadonlyArray1<'py, i64>,
640
- unit_dist_matrix: PyReadonlyArray2<'py, f64>,
641
698
  time_dist_matrix: PyReadonlyArray2<'py, i64>,
642
699
  lambda_time: f64,
643
700
  lambda_unit: f64,
@@ -650,16 +707,25 @@ pub fn bootstrap_trop_variance<'py>(
650
707
  let y_arr = y.as_array().to_owned();
651
708
  let d_arr = d.as_array().to_owned();
652
709
  let control_mask_arr = control_mask.as_array().to_owned();
653
- let unit_dist_arr = unit_dist_matrix.as_array().to_owned();
654
710
  let time_dist_arr = time_dist_matrix.as_array().to_owned();
655
711
 
656
712
  let n_units = y_arr.ncols();
657
713
  let n_periods = y_arr.nrows();
658
714
 
659
- // Note: control_unit_idx, treated_obs_t, treated_obs_i are passed for API
660
- // compatibility but not used directly - each bootstrap iteration recomputes
661
- // control units and treated observations from the resampled data.
662
- let _ = (control_unit_idx, treated_obs_t, treated_obs_i);
715
+ // Issue D fix: Identify treated and control units for stratified sampling
716
+ // Following paper's Algorithm 3 (page 27): sample N_0 control and N_1 treated separately
717
+ let mut original_treated_units: Vec<usize> = Vec::new();
718
+ let mut original_control_units: Vec<usize> = Vec::new();
719
+ for i in 0..n_units {
720
+ let is_ever_treated = (0..n_periods).any(|t| d_arr[[t, i]] == 1.0);
721
+ if is_ever_treated {
722
+ original_treated_units.push(i);
723
+ } else {
724
+ original_control_units.push(i);
725
+ }
726
+ }
727
+ let n_treated_units = original_treated_units.len();
728
+ let n_control_units = original_control_units.len();
663
729
 
664
730
  // Run bootstrap iterations in parallel
665
731
  let bootstrap_estimates: Vec<f64> = (0..n_bootstrap)
@@ -670,16 +736,25 @@ pub fn bootstrap_trop_variance<'py>(
670
736
 
671
737
  let mut rng = Xoshiro256PlusPlus::seed_from_u64(seed.wrapping_add(b as u64));
672
738
 
673
- // Sample units with replacement
674
- let sampled_units: Vec<usize> = (0..n_units)
675
- .map(|_| rng.gen_range(0..n_units))
676
- .collect();
739
+ // Issue D fix: Stratified sampling - sample control and treated units separately
740
+ let mut sampled_units: Vec<usize> = Vec::with_capacity(n_units);
741
+
742
+ // Sample control units with replacement
743
+ for _ in 0..n_control_units {
744
+ let idx = rng.gen_range(0..n_control_units);
745
+ sampled_units.push(original_control_units[idx]);
746
+ }
747
+
748
+ // Sample treated units with replacement
749
+ for _ in 0..n_treated_units {
750
+ let idx = rng.gen_range(0..n_treated_units);
751
+ sampled_units.push(original_treated_units[idx]);
752
+ }
677
753
 
678
754
  // Create bootstrap matrices by selecting columns
679
755
  let mut y_boot = Array2::<f64>::zeros((n_periods, n_units));
680
756
  let mut d_boot = Array2::<f64>::zeros((n_periods, n_units));
681
757
  let mut control_mask_boot = Array2::<u8>::zeros((n_periods, n_units));
682
- let mut unit_dist_boot = Array2::<f64>::zeros((n_units, n_units));
683
758
 
684
759
  for (new_idx, &old_idx) in sampled_units.iter().enumerate() {
685
760
  for t in 0..n_periods {
@@ -687,10 +762,6 @@ pub fn bootstrap_trop_variance<'py>(
687
762
  d_boot[[t, new_idx]] = d_arr[[t, old_idx]];
688
763
  control_mask_boot[[t, new_idx]] = control_mask_arr[[t, old_idx]];
689
764
  }
690
-
691
- for (new_j, &old_j) in sampled_units.iter().enumerate() {
692
- unit_dist_boot[[new_idx, new_j]] = unit_dist_arr[[old_idx, old_j]];
693
- }
694
765
  }
695
766
 
696
767
  // Get treated observations in bootstrap sample
@@ -725,14 +796,14 @@ pub fn bootstrap_trop_variance<'py>(
725
796
 
726
797
  for (t, i) in boot_treated {
727
798
  let weight_matrix = compute_weight_matrix(
799
+ &y_boot.view(),
800
+ &d_boot.view(),
728
801
  n_periods,
729
802
  n_units,
730
803
  i,
731
804
  t,
732
805
  lambda_time,
733
806
  lambda_unit,
734
- &boot_control_units,
735
- &unit_dist_boot.view(),
736
807
  &time_dist_arr.view(),
737
808
  );
738
809
 
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes