diff-diff 2.4.1__tar.gz → 2.4.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {diff_diff-2.4.1 → diff_diff-2.4.2}/PKG-INFO +4 -4
- {diff_diff-2.4.1 → diff_diff-2.4.2}/README.md +3 -3
- {diff_diff-2.4.1 → diff_diff-2.4.2}/diff_diff/__init__.py +1 -1
- {diff_diff-2.4.1 → diff_diff-2.4.2}/diff_diff/_backend.py +21 -0
- {diff_diff-2.4.1 → diff_diff-2.4.2}/diff_diff/imputation.py +13 -16
- {diff_diff-2.4.1 → diff_diff-2.4.2}/diff_diff/imputation_bootstrap.py +41 -18
- {diff_diff-2.4.1 → diff_diff-2.4.2}/diff_diff/imputation_results.py +1 -1
- {diff_diff-2.4.1 → diff_diff-2.4.2}/diff_diff/sun_abraham.py +0 -22
- {diff_diff-2.4.1 → diff_diff-2.4.2}/diff_diff/trop.py +7 -1
- {diff_diff-2.4.1 → diff_diff-2.4.2}/diff_diff/two_stage.py +63 -10
- {diff_diff-2.4.1 → diff_diff-2.4.2}/diff_diff/two_stage_bootstrap.py +18 -8
- {diff_diff-2.4.1 → diff_diff-2.4.2}/diff_diff/two_stage_results.py +1 -1
- {diff_diff-2.4.1 → diff_diff-2.4.2}/diff_diff/utils.py +0 -98
- {diff_diff-2.4.1 → diff_diff-2.4.2}/pyproject.toml +1 -1
- {diff_diff-2.4.1 → diff_diff-2.4.2}/rust/Cargo.lock +28 -1
- {diff_diff-2.4.1 → diff_diff-2.4.2}/rust/Cargo.toml +11 -3
- diff_diff-2.4.2/rust/build.rs +12 -0
- {diff_diff-2.4.1 → diff_diff-2.4.2}/rust/src/lib.rs +25 -0
- {diff_diff-2.4.1 → diff_diff-2.4.2}/rust/src/weights.rs +410 -45
- {diff_diff-2.4.1 → diff_diff-2.4.2}/diff_diff/bacon.py +0 -0
- {diff_diff-2.4.1 → diff_diff-2.4.2}/diff_diff/datasets.py +0 -0
- {diff_diff-2.4.1 → diff_diff-2.4.2}/diff_diff/diagnostics.py +0 -0
- {diff_diff-2.4.1 → diff_diff-2.4.2}/diff_diff/estimators.py +0 -0
- {diff_diff-2.4.1 → diff_diff-2.4.2}/diff_diff/honest_did.py +0 -0
- {diff_diff-2.4.1 → diff_diff-2.4.2}/diff_diff/linalg.py +0 -0
- {diff_diff-2.4.1 → diff_diff-2.4.2}/diff_diff/power.py +0 -0
- {diff_diff-2.4.1 → diff_diff-2.4.2}/diff_diff/prep.py +0 -0
- {diff_diff-2.4.1 → diff_diff-2.4.2}/diff_diff/prep_dgp.py +0 -0
- {diff_diff-2.4.1 → diff_diff-2.4.2}/diff_diff/pretrends.py +0 -0
- {diff_diff-2.4.1 → diff_diff-2.4.2}/diff_diff/results.py +0 -0
- {diff_diff-2.4.1 → diff_diff-2.4.2}/diff_diff/staggered.py +0 -0
- {diff_diff-2.4.1 → diff_diff-2.4.2}/diff_diff/staggered_aggregation.py +0 -0
- {diff_diff-2.4.1 → diff_diff-2.4.2}/diff_diff/staggered_bootstrap.py +0 -0
- {diff_diff-2.4.1 → diff_diff-2.4.2}/diff_diff/staggered_results.py +0 -0
- {diff_diff-2.4.1 → diff_diff-2.4.2}/diff_diff/synthetic_did.py +0 -0
- {diff_diff-2.4.1 → diff_diff-2.4.2}/diff_diff/triple_diff.py +0 -0
- {diff_diff-2.4.1 → diff_diff-2.4.2}/diff_diff/trop_results.py +0 -0
- {diff_diff-2.4.1 → diff_diff-2.4.2}/diff_diff/twfe.py +0 -0
- {diff_diff-2.4.1 → diff_diff-2.4.2}/diff_diff/visualization.py +0 -0
- {diff_diff-2.4.1 → diff_diff-2.4.2}/rust/src/bootstrap.rs +0 -0
- {diff_diff-2.4.1 → diff_diff-2.4.2}/rust/src/linalg.rs +0 -0
- {diff_diff-2.4.1 → diff_diff-2.4.2}/rust/src/trop.rs +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: diff-diff
|
|
3
|
-
Version: 2.4.1
|
|
3
|
+
Version: 2.4.2
|
|
4
4
|
Classifier: Development Status :: 5 - Production/Stable
|
|
5
5
|
Classifier: Intended Audience :: Science/Research
|
|
6
6
|
Classifier: Operating System :: OS Independent
|
|
@@ -2021,7 +2021,7 @@ TROP(
|
|
|
2021
2021
|
max_iter=100, # Max iterations for factor estimation
|
|
2022
2022
|
tol=1e-6, # Convergence tolerance
|
|
2023
2023
|
alpha=0.05, # Significance level for CIs
|
|
2024
|
-
n_bootstrap=200, # Bootstrap replications
|
|
2024
|
+
n_bootstrap=200, # Bootstrap replications (minimum 2; TROP requires bootstrap for SEs)
|
|
2025
2025
|
seed=None # Random seed
|
|
2026
2026
|
)
|
|
2027
2027
|
```
|
|
@@ -2102,8 +2102,6 @@ SunAbraham(
|
|
|
2102
2102
|
| `time` | str | Time period column |
|
|
2103
2103
|
| `first_treat` | str | Column with first treatment period (0 for never-treated) |
|
|
2104
2104
|
| `covariates` | list | Covariate column names |
|
|
2105
|
-
| `min_pre_periods` | int | Minimum pre-treatment periods to include |
|
|
2106
|
-
| `min_post_periods` | int | Minimum post-treatment periods to include |
|
|
2107
2105
|
|
|
2108
2106
|
### SunAbrahamResults
|
|
2109
2107
|
|
|
@@ -2143,6 +2141,7 @@ ImputationDiD(
|
|
|
2143
2141
|
alpha=0.05, # Significance level for CIs
|
|
2144
2142
|
cluster=None, # Column for cluster-robust SEs
|
|
2145
2143
|
n_bootstrap=0, # Bootstrap iterations (0 = analytical)
|
|
2144
|
+
bootstrap_weights='rademacher', # 'rademacher', 'mammen', or 'webb'
|
|
2146
2145
|
seed=None, # Random seed
|
|
2147
2146
|
rank_deficient_action='warn', # 'warn', 'error', or 'silent'
|
|
2148
2147
|
horizon_max=None, # Max event-study horizon
|
|
@@ -2197,6 +2196,7 @@ TwoStageDiD(
|
|
|
2197
2196
|
alpha=0.05, # Significance level for CIs
|
|
2198
2197
|
cluster=None, # Column for cluster-robust SEs (defaults to unit)
|
|
2199
2198
|
n_bootstrap=0, # Bootstrap iterations (0 = analytical GMM SEs)
|
|
2199
|
+
bootstrap_weights='rademacher', # 'rademacher', 'mammen', or 'webb'
|
|
2200
2200
|
seed=None, # Random seed
|
|
2201
2201
|
rank_deficient_action='warn', # 'warn', 'error', or 'silent'
|
|
2202
2202
|
horizon_max=None, # Max event-study horizon
|
|
@@ -1983,7 +1983,7 @@ TROP(
|
|
|
1983
1983
|
max_iter=100, # Max iterations for factor estimation
|
|
1984
1984
|
tol=1e-6, # Convergence tolerance
|
|
1985
1985
|
alpha=0.05, # Significance level for CIs
|
|
1986
|
-
n_bootstrap=200, # Bootstrap replications
|
|
1986
|
+
n_bootstrap=200, # Bootstrap replications (minimum 2; TROP requires bootstrap for SEs)
|
|
1987
1987
|
seed=None # Random seed
|
|
1988
1988
|
)
|
|
1989
1989
|
```
|
|
@@ -2064,8 +2064,6 @@ SunAbraham(
|
|
|
2064
2064
|
| `time` | str | Time period column |
|
|
2065
2065
|
| `first_treat` | str | Column with first treatment period (0 for never-treated) |
|
|
2066
2066
|
| `covariates` | list | Covariate column names |
|
|
2067
|
-
| `min_pre_periods` | int | Minimum pre-treatment periods to include |
|
|
2068
|
-
| `min_post_periods` | int | Minimum post-treatment periods to include |
|
|
2069
2067
|
|
|
2070
2068
|
### SunAbrahamResults
|
|
2071
2069
|
|
|
@@ -2105,6 +2103,7 @@ ImputationDiD(
|
|
|
2105
2103
|
alpha=0.05, # Significance level for CIs
|
|
2106
2104
|
cluster=None, # Column for cluster-robust SEs
|
|
2107
2105
|
n_bootstrap=0, # Bootstrap iterations (0 = analytical)
|
|
2106
|
+
bootstrap_weights='rademacher', # 'rademacher', 'mammen', or 'webb'
|
|
2108
2107
|
seed=None, # Random seed
|
|
2109
2108
|
rank_deficient_action='warn', # 'warn', 'error', or 'silent'
|
|
2110
2109
|
horizon_max=None, # Max event-study horizon
|
|
@@ -2159,6 +2158,7 @@ TwoStageDiD(
|
|
|
2159
2158
|
alpha=0.05, # Significance level for CIs
|
|
2160
2159
|
cluster=None, # Column for cluster-robust SEs (defaults to unit)
|
|
2161
2160
|
n_bootstrap=0, # Bootstrap iterations (0 = analytical GMM SEs)
|
|
2161
|
+
bootstrap_weights='rademacher', # 'rademacher', 'mammen', or 'webb'
|
|
2162
2162
|
seed=None, # Random seed
|
|
2163
2163
|
rank_deficient_action='warn', # 'warn', 'error', or 'silent'
|
|
2164
2164
|
horizon_max=None, # Max event-study horizon
|
|
@@ -35,6 +35,8 @@ try:
|
|
|
35
35
|
compute_time_weights as _rust_compute_time_weights,
|
|
36
36
|
compute_noise_level as _rust_compute_noise_level,
|
|
37
37
|
sc_weight_fw as _rust_sc_weight_fw,
|
|
38
|
+
# Diagnostics
|
|
39
|
+
rust_backend_info as _rust_backend_info,
|
|
38
40
|
)
|
|
39
41
|
_rust_available = True
|
|
40
42
|
except ImportError:
|
|
@@ -56,6 +58,7 @@ except ImportError:
|
|
|
56
58
|
_rust_compute_time_weights = None
|
|
57
59
|
_rust_compute_noise_level = None
|
|
58
60
|
_rust_sc_weight_fw = None
|
|
61
|
+
_rust_backend_info = None
|
|
59
62
|
|
|
60
63
|
# Determine final backend based on environment variable and availability
|
|
61
64
|
if _backend_env == 'python':
|
|
@@ -78,6 +81,7 @@ if _backend_env == 'python':
|
|
|
78
81
|
_rust_compute_time_weights = None
|
|
79
82
|
_rust_compute_noise_level = None
|
|
80
83
|
_rust_sc_weight_fw = None
|
|
84
|
+
_rust_backend_info = None
|
|
81
85
|
elif _backend_env == 'rust':
|
|
82
86
|
# Force Rust mode - fail if not available
|
|
83
87
|
if not _rust_available:
|
|
@@ -90,8 +94,25 @@ else:
|
|
|
90
94
|
# Auto mode - use Rust if available
|
|
91
95
|
HAS_RUST_BACKEND = _rust_available
|
|
92
96
|
|
|
97
|
+
|
|
98
|
+
def rust_backend_info():
|
|
99
|
+
"""Return compile-time BLAS feature information for the Rust backend.
|
|
100
|
+
|
|
101
|
+
Returns a dict with keys:
|
|
102
|
+
- 'blas': True if any BLAS backend is linked
|
|
103
|
+
- 'accelerate': True if Apple Accelerate is linked (macOS)
|
|
104
|
+
- 'openblas': True if OpenBLAS is linked (Linux)
|
|
105
|
+
|
|
106
|
+
If the Rust backend is not available, all values are False.
|
|
107
|
+
"""
|
|
108
|
+
if _rust_backend_info is not None:
|
|
109
|
+
return _rust_backend_info()
|
|
110
|
+
return {"blas": False, "accelerate": False, "openblas": False}
|
|
111
|
+
|
|
112
|
+
|
|
93
113
|
__all__ = [
|
|
94
114
|
'HAS_RUST_BACKEND',
|
|
115
|
+
'rust_backend_info',
|
|
95
116
|
'_rust_bootstrap_weights',
|
|
96
117
|
'_rust_synthetic_weights',
|
|
97
118
|
'_rust_project_simplex',
|
|
@@ -22,7 +22,7 @@ import pandas as pd
|
|
|
22
22
|
from scipy import sparse, stats
|
|
23
23
|
from scipy.sparse.linalg import spsolve
|
|
24
24
|
|
|
25
|
-
from diff_diff.imputation_bootstrap import ImputationDiDBootstrapMixin
|
|
25
|
+
from diff_diff.imputation_bootstrap import ImputationDiDBootstrapMixin, _compute_target_weights
|
|
26
26
|
from diff_diff.imputation_results import ImputationBootstrapResults, ImputationDiDResults # noqa: F401 (re-export)
|
|
27
27
|
from diff_diff.linalg import solve_ols
|
|
28
28
|
from diff_diff.utils import safe_inference
|
|
@@ -63,6 +63,8 @@ class ImputationDiD(ImputationDiDBootstrapMixin):
|
|
|
63
63
|
n_bootstrap : int, default=0
|
|
64
64
|
Number of bootstrap iterations. If 0, uses analytical inference
|
|
65
65
|
(conservative variance from Theorem 3).
|
|
66
|
+
bootstrap_weights : str, default="rademacher"
|
|
67
|
+
Type of bootstrap weights: "rademacher", "mammen", or "webb".
|
|
66
68
|
seed : int, optional
|
|
67
69
|
Random seed for reproducibility.
|
|
68
70
|
rank_deficient_action : str, default="warn"
|
|
@@ -126,6 +128,7 @@ class ImputationDiD(ImputationDiDBootstrapMixin):
|
|
|
126
128
|
alpha: float = 0.05,
|
|
127
129
|
cluster: Optional[str] = None,
|
|
128
130
|
n_bootstrap: int = 0,
|
|
131
|
+
bootstrap_weights: str = "rademacher",
|
|
129
132
|
seed: Optional[int] = None,
|
|
130
133
|
rank_deficient_action: str = "warn",
|
|
131
134
|
horizon_max: Optional[int] = None,
|
|
@@ -136,6 +139,11 @@ class ImputationDiD(ImputationDiDBootstrapMixin):
|
|
|
136
139
|
f"rank_deficient_action must be 'warn', 'error', or 'silent', "
|
|
137
140
|
f"got '{rank_deficient_action}'"
|
|
138
141
|
)
|
|
142
|
+
if bootstrap_weights not in ("rademacher", "mammen", "webb"):
|
|
143
|
+
raise ValueError(
|
|
144
|
+
f"bootstrap_weights must be 'rademacher', 'mammen', or 'webb', "
|
|
145
|
+
f"got '{bootstrap_weights}'"
|
|
146
|
+
)
|
|
139
147
|
if aux_partition not in ("cohort_horizon", "cohort", "horizon"):
|
|
140
148
|
raise ValueError(
|
|
141
149
|
f"aux_partition must be 'cohort_horizon', 'cohort', or 'horizon', "
|
|
@@ -146,6 +154,7 @@ class ImputationDiD(ImputationDiDBootstrapMixin):
|
|
|
146
154
|
self.alpha = alpha
|
|
147
155
|
self.cluster = cluster
|
|
148
156
|
self.n_bootstrap = n_bootstrap
|
|
157
|
+
self.bootstrap_weights = bootstrap_weights
|
|
149
158
|
self.seed = seed
|
|
150
159
|
self.rank_deficient_action = rank_deficient_action
|
|
151
160
|
self.horizon_max = horizon_max
|
|
@@ -1359,15 +1368,7 @@ class ImputationDiD(ImputationDiDBootstrapMixin):
|
|
|
1359
1368
|
effect = float(np.mean(valid_tau))
|
|
1360
1369
|
|
|
1361
1370
|
# Compute SE via conservative variance with horizon-specific weights
|
|
1362
|
-
weights_h =
|
|
1363
|
-
# Map h_mask (relative to df_1) to weights array
|
|
1364
|
-
h_indices_in_omega1 = np.where(h_mask)[0]
|
|
1365
|
-
n_valid = len(valid_tau)
|
|
1366
|
-
# Only weight valid (finite) observations
|
|
1367
|
-
finite_mask = np.isfinite(tau_hat[h_mask])
|
|
1368
|
-
valid_h_indices = h_indices_in_omega1[finite_mask]
|
|
1369
|
-
for idx in valid_h_indices:
|
|
1370
|
-
weights_h[idx] = 1.0 / n_valid
|
|
1371
|
+
weights_h, n_valid = _compute_target_weights(tau_hat, h_mask)
|
|
1371
1372
|
|
|
1372
1373
|
se = self._compute_conservative_variance(
|
|
1373
1374
|
df=df,
|
|
@@ -1477,12 +1478,7 @@ class ImputationDiD(ImputationDiDBootstrapMixin):
|
|
|
1477
1478
|
effect = float(np.mean(valid_tau))
|
|
1478
1479
|
|
|
1479
1480
|
# Compute SE with group-specific weights
|
|
1480
|
-
weights_g =
|
|
1481
|
-
finite_mask = np.isfinite(tau_hat) & g_mask
|
|
1482
|
-
g_indices = np.where(finite_mask)[0]
|
|
1483
|
-
n_valid = len(valid_tau)
|
|
1484
|
-
for idx in g_indices:
|
|
1485
|
-
weights_g[idx] = 1.0 / n_valid
|
|
1481
|
+
weights_g, _ = _compute_target_weights(tau_hat, g_mask)
|
|
1486
1482
|
|
|
1487
1483
|
se = self._compute_conservative_variance(
|
|
1488
1484
|
df=df,
|
|
@@ -1664,6 +1660,7 @@ class ImputationDiD(ImputationDiDBootstrapMixin):
|
|
|
1664
1660
|
"alpha": self.alpha,
|
|
1665
1661
|
"cluster": self.cluster,
|
|
1666
1662
|
"n_bootstrap": self.n_bootstrap,
|
|
1663
|
+
"bootstrap_weights": self.bootstrap_weights,
|
|
1667
1664
|
"seed": self.seed,
|
|
1668
1665
|
"rank_deficient_action": self.rank_deficient_action,
|
|
1669
1666
|
"horizon_max": self.horizon_max,
|
|
@@ -19,6 +19,39 @@ __all__ = [
|
|
|
19
19
|
]
|
|
20
20
|
|
|
21
21
|
|
|
22
|
+
def _compute_target_weights(
|
|
23
|
+
tau_hat: np.ndarray,
|
|
24
|
+
target_mask: np.ndarray,
|
|
25
|
+
) -> "tuple[np.ndarray, int]":
|
|
26
|
+
"""
|
|
27
|
+
Equal weights for finite tau_hat observations within target_mask.
|
|
28
|
+
|
|
29
|
+
Used by both aggregation and bootstrap paths to avoid weight logic
|
|
30
|
+
duplication.
|
|
31
|
+
|
|
32
|
+
Parameters
|
|
33
|
+
----------
|
|
34
|
+
tau_hat : np.ndarray
|
|
35
|
+
Per-observation treatment effects (may contain NaN).
|
|
36
|
+
target_mask : np.ndarray
|
|
37
|
+
Boolean mask selecting the target subset within tau_hat.
|
|
38
|
+
|
|
39
|
+
Returns
|
|
40
|
+
-------
|
|
41
|
+
weights : np.ndarray
|
|
42
|
+
Weight array (same length as tau_hat). 1/n_valid for finite
|
|
43
|
+
observations in target_mask, 0 elsewhere.
|
|
44
|
+
n_valid : int
|
|
45
|
+
Number of finite observations in the target subset.
|
|
46
|
+
"""
|
|
47
|
+
finite_target = np.isfinite(tau_hat) & target_mask
|
|
48
|
+
n_valid = int(finite_target.sum())
|
|
49
|
+
weights = np.zeros(len(tau_hat))
|
|
50
|
+
if n_valid > 0:
|
|
51
|
+
weights[np.where(finite_target)[0]] = 1.0 / n_valid
|
|
52
|
+
return weights, n_valid
|
|
53
|
+
|
|
54
|
+
|
|
22
55
|
class ImputationDiDBootstrapMixin:
|
|
23
56
|
"""Mixin providing bootstrap inference methods for ImputationDiD."""
|
|
24
57
|
|
|
@@ -91,7 +124,8 @@ class ImputationDiDBootstrapMixin:
|
|
|
91
124
|
|
|
92
125
|
For each aggregation target (overall, per-horizon, per-group), computes
|
|
93
126
|
psi_i = sum_t v_it * epsilon_tilde_it for each cluster. The multiplier
|
|
94
|
-
bootstrap then perturbs these psi sums with
|
|
127
|
+
bootstrap then perturbs these psi sums with multiplier weights
|
|
128
|
+
(rademacher/mammen/webb; configurable via ``bootstrap_weights``).
|
|
95
129
|
|
|
96
130
|
Computational cost scales with the number of aggregation targets, since
|
|
97
131
|
each target requires its own v_untreated computation (weight-dependent).
|
|
@@ -120,13 +154,10 @@ class ImputationDiDBootstrapMixin:
|
|
|
120
154
|
result["overall"] = (overall_psi, cluster_ids)
|
|
121
155
|
|
|
122
156
|
# Event study: per-horizon weights
|
|
123
|
-
# NOTE: weight logic duplicated from _aggregate_event_study.
|
|
124
|
-
# If weight scheme changes there, update here too.
|
|
125
157
|
if event_study_effects:
|
|
126
158
|
result["event_study"] = {}
|
|
127
159
|
df_1 = df.loc[omega_1_mask]
|
|
128
160
|
rel_times = df_1["_rel_time"].values
|
|
129
|
-
n_omega_1 = int(omega_1_mask.sum())
|
|
130
161
|
|
|
131
162
|
# Balanced cohort mask (same logic as _aggregate_event_study)
|
|
132
163
|
balanced_mask = None
|
|
@@ -150,24 +181,18 @@ class ImputationDiDBootstrapMixin:
|
|
|
150
181
|
h_mask = rel_times == h
|
|
151
182
|
if balanced_mask is not None:
|
|
152
183
|
h_mask = h_mask & balanced_mask
|
|
153
|
-
weights_h =
|
|
154
|
-
finite_h = np.isfinite(tau_hat) & h_mask
|
|
155
|
-
n_valid_h = int(finite_h.sum())
|
|
184
|
+
weights_h, n_valid_h = _compute_target_weights(tau_hat, h_mask)
|
|
156
185
|
if n_valid_h == 0:
|
|
157
186
|
continue
|
|
158
|
-
weights_h[np.where(finite_h)[0]] = 1.0 / n_valid_h
|
|
159
187
|
|
|
160
188
|
psi_h, _ = self._compute_cluster_psi_sums(**common, weights=weights_h)
|
|
161
189
|
result["event_study"][h] = psi_h
|
|
162
190
|
|
|
163
191
|
# Group effects: per-group weights
|
|
164
|
-
# NOTE: weight logic duplicated from _aggregate_group.
|
|
165
|
-
# If weight scheme changes there, update here too.
|
|
166
192
|
if group_effects:
|
|
167
193
|
result["group"] = {}
|
|
168
194
|
df_1 = df.loc[omega_1_mask]
|
|
169
195
|
cohorts = df_1[first_treat].values
|
|
170
|
-
n_omega_1 = int(omega_1_mask.sum())
|
|
171
196
|
|
|
172
197
|
for g in group_effects:
|
|
173
198
|
if group_effects[g].get("n_obs", 0) == 0:
|
|
@@ -175,12 +200,9 @@ class ImputationDiDBootstrapMixin:
|
|
|
175
200
|
if not np.isfinite(group_effects[g].get("effect", np.nan)):
|
|
176
201
|
continue
|
|
177
202
|
g_mask = cohorts == g
|
|
178
|
-
weights_g =
|
|
179
|
-
finite_g = np.isfinite(tau_hat) & g_mask
|
|
180
|
-
n_valid_g = int(finite_g.sum())
|
|
203
|
+
weights_g, n_valid_g = _compute_target_weights(tau_hat, g_mask)
|
|
181
204
|
if n_valid_g == 0:
|
|
182
205
|
continue
|
|
183
|
-
weights_g[np.where(finite_g)[0]] = 1.0 / n_valid_g
|
|
184
206
|
|
|
185
207
|
psi_g, _ = self._compute_cluster_psi_sums(**common, weights=weights_g)
|
|
186
208
|
result["group"][g] = psi_g
|
|
@@ -197,7 +219,8 @@ class ImputationDiDBootstrapMixin:
|
|
|
197
219
|
"""
|
|
198
220
|
Run multiplier bootstrap on pre-computed influence function sums.
|
|
199
221
|
|
|
200
|
-
Uses T_b = sum_i w_b_i * psi_i where w_b_i are
|
|
222
|
+
Uses T_b = sum_i w_b_i * psi_i where w_b_i are multiplier weights
|
|
223
|
+
(rademacher/mammen/webb; configurable via ``bootstrap_weights``)
|
|
201
224
|
and psi_i are cluster-level influence function sums from Theorem 3.
|
|
202
225
|
SE = std(T_b, ddof=1).
|
|
203
226
|
"""
|
|
@@ -216,7 +239,7 @@ class ImputationDiDBootstrapMixin:
|
|
|
216
239
|
|
|
217
240
|
# Generate ALL weights upfront: shape (n_bootstrap, n_clusters)
|
|
218
241
|
all_weights = _generate_bootstrap_weights_batch(
|
|
219
|
-
self.n_bootstrap, n_clusters,
|
|
242
|
+
self.n_bootstrap, n_clusters, self.bootstrap_weights, rng
|
|
220
243
|
)
|
|
221
244
|
|
|
222
245
|
# Overall ATT bootstrap draws
|
|
@@ -295,7 +318,7 @@ class ImputationDiDBootstrapMixin:
|
|
|
295
318
|
|
|
296
319
|
return ImputationBootstrapResults(
|
|
297
320
|
n_bootstrap=self.n_bootstrap,
|
|
298
|
-
weight_type="rademacher",
|
|
321
|
+
weight_type=self.bootstrap_weights,
|
|
299
322
|
alpha=self.alpha,
|
|
300
323
|
overall_att_se=overall_se,
|
|
301
324
|
overall_att_ci=overall_ci,
|
|
@@ -33,7 +33,7 @@ class ImputationBootstrapResults:
|
|
|
33
33
|
n_bootstrap : int
|
|
34
34
|
Number of bootstrap iterations.
|
|
35
35
|
weight_type : str
|
|
36
|
-
Type of bootstrap weights
|
|
36
|
+
Type of bootstrap weights: "rademacher", "mammen", or "webb".
|
|
37
37
|
alpha : float
|
|
38
38
|
Significance level used for confidence intervals.
|
|
39
39
|
overall_att_se : float
|
|
@@ -433,8 +433,6 @@ class SunAbraham:
|
|
|
433
433
|
time: str,
|
|
434
434
|
first_treat: str,
|
|
435
435
|
covariates: Optional[List[str]] = None,
|
|
436
|
-
min_pre_periods: int = 1,
|
|
437
|
-
min_post_periods: int = 1,
|
|
438
436
|
) -> SunAbrahamResults:
|
|
439
437
|
"""
|
|
440
438
|
Fit the Sun-Abraham estimator using saturated regression.
|
|
@@ -454,10 +452,6 @@ class SunAbraham:
|
|
|
454
452
|
Use 0 (or np.inf) for never-treated units.
|
|
455
453
|
covariates : list, optional
|
|
456
454
|
List of covariate column names to include in regression.
|
|
457
|
-
min_pre_periods : int, default=1
|
|
458
|
-
**Deprecated**: Accepted but ignored. Will be removed in a future version.
|
|
459
|
-
min_post_periods : int, default=1
|
|
460
|
-
**Deprecated**: Accepted but ignored. Will be removed in a future version.
|
|
461
455
|
|
|
462
456
|
Returns
|
|
463
457
|
-------
|
|
@@ -469,22 +463,6 @@ class SunAbraham:
|
|
|
469
463
|
ValueError
|
|
470
464
|
If required columns are missing or data validation fails.
|
|
471
465
|
"""
|
|
472
|
-
# Deprecation warnings for unimplemented parameters
|
|
473
|
-
if min_pre_periods != 1:
|
|
474
|
-
warnings.warn(
|
|
475
|
-
"min_pre_periods is not yet implemented and will be ignored. "
|
|
476
|
-
"This parameter will be removed in a future version.",
|
|
477
|
-
FutureWarning,
|
|
478
|
-
stacklevel=2,
|
|
479
|
-
)
|
|
480
|
-
if min_post_periods != 1:
|
|
481
|
-
warnings.warn(
|
|
482
|
-
"min_post_periods is not yet implemented and will be ignored. "
|
|
483
|
-
"This parameter will be removed in a future version.",
|
|
484
|
-
FutureWarning,
|
|
485
|
-
stacklevel=2,
|
|
486
|
-
)
|
|
487
|
-
|
|
488
466
|
# Validate inputs
|
|
489
467
|
required_cols = [outcome, unit, time, first_treat]
|
|
490
468
|
if covariates:
|
|
@@ -93,7 +93,7 @@ class TROP:
|
|
|
93
93
|
alpha : float, default=0.05
|
|
94
94
|
Significance level for confidence intervals.
|
|
95
95
|
n_bootstrap : int, default=200
|
|
96
|
-
Number of bootstrap replications for variance estimation.
|
|
96
|
+
Number of bootstrap replications for variance estimation. Must be >= 2.
|
|
97
97
|
seed : int, optional
|
|
98
98
|
Random seed for reproducibility.
|
|
99
99
|
|
|
@@ -156,6 +156,12 @@ class TROP:
|
|
|
156
156
|
self.lambda_unit_grid = lambda_unit_grid or [0.0, 0.1, 0.5, 1.0, 2.0, 5.0]
|
|
157
157
|
self.lambda_nn_grid = lambda_nn_grid or [0.0, 0.01, 0.1, 1.0, 10.0]
|
|
158
158
|
|
|
159
|
+
if n_bootstrap < 2:
|
|
160
|
+
raise ValueError(
|
|
161
|
+
"n_bootstrap must be >= 2 for TROP (bootstrap variance "
|
|
162
|
+
"estimation is always used)"
|
|
163
|
+
)
|
|
164
|
+
|
|
159
165
|
self.max_iter = max_iter
|
|
160
166
|
self.tol = tol
|
|
161
167
|
self.alpha = alpha
|
|
@@ -29,6 +29,11 @@ import pandas as pd
|
|
|
29
29
|
from scipy import sparse
|
|
30
30
|
from scipy.sparse.linalg import factorized as sparse_factorized
|
|
31
31
|
|
|
32
|
+
# Maximum number of elements before falling back to per-column sparse aggregation.
|
|
33
|
+
# 10M float64 elements ≈ 80 MB peak allocation. Above this, per-column .getcol()
|
|
34
|
+
# trades throughput for bounded memory. Keep in sync with two_stage_bootstrap.py.
|
|
35
|
+
_SPARSE_DENSE_THRESHOLD = 10_000_000
|
|
36
|
+
|
|
32
37
|
from diff_diff.linalg import solve_ols
|
|
33
38
|
from diff_diff.two_stage_bootstrap import TwoStageDiDBootstrapMixin
|
|
34
39
|
from diff_diff.two_stage_results import TwoStageBootstrapResults, TwoStageDiDResults # noqa: F401 (re-export)
|
|
@@ -67,6 +72,8 @@ class TwoStageDiD(TwoStageDiDBootstrapMixin):
|
|
|
67
72
|
n_bootstrap : int, default=0
|
|
68
73
|
Number of bootstrap iterations. If 0, uses analytical GMM
|
|
69
74
|
sandwich inference.
|
|
75
|
+
bootstrap_weights : str, default="rademacher"
|
|
76
|
+
Type of bootstrap weights: "rademacher", "mammen", or "webb".
|
|
70
77
|
seed : int, optional
|
|
71
78
|
Random seed for reproducibility.
|
|
72
79
|
rank_deficient_action : str, default="warn"
|
|
@@ -125,6 +132,7 @@ class TwoStageDiD(TwoStageDiDBootstrapMixin):
|
|
|
125
132
|
alpha: float = 0.05,
|
|
126
133
|
cluster: Optional[str] = None,
|
|
127
134
|
n_bootstrap: int = 0,
|
|
135
|
+
bootstrap_weights: str = "rademacher",
|
|
128
136
|
seed: Optional[int] = None,
|
|
129
137
|
rank_deficient_action: str = "warn",
|
|
130
138
|
horizon_max: Optional[int] = None,
|
|
@@ -134,11 +142,17 @@ class TwoStageDiD(TwoStageDiDBootstrapMixin):
|
|
|
134
142
|
f"rank_deficient_action must be 'warn', 'error', or 'silent', "
|
|
135
143
|
f"got '{rank_deficient_action}'"
|
|
136
144
|
)
|
|
145
|
+
if bootstrap_weights not in ("rademacher", "mammen", "webb"):
|
|
146
|
+
raise ValueError(
|
|
147
|
+
f"bootstrap_weights must be 'rademacher', 'mammen', or 'webb', "
|
|
148
|
+
f"got '{bootstrap_weights}'"
|
|
149
|
+
)
|
|
137
150
|
|
|
138
151
|
self.anticipation = anticipation
|
|
139
152
|
self.alpha = alpha
|
|
140
153
|
self.cluster = cluster
|
|
141
154
|
self.n_bootstrap = n_bootstrap
|
|
155
|
+
self.bootstrap_weights = bootstrap_weights
|
|
142
156
|
self.seed = seed
|
|
143
157
|
self.rank_deficient_action = rank_deficient_action
|
|
144
158
|
self.horizon_max = horizon_max
|
|
@@ -1065,6 +1079,41 @@ class TwoStageDiD(TwoStageDiDBootstrapMixin):
|
|
|
1065
1079
|
|
|
1066
1080
|
return group_effects
|
|
1067
1081
|
|
|
1082
|
+
# =========================================================================
|
|
1083
|
+
# GMM score computation
|
|
1084
|
+
# =========================================================================
|
|
1085
|
+
|
|
1086
|
+
@staticmethod
|
|
1087
|
+
def _compute_gmm_scores(
|
|
1088
|
+
c_by_cluster: np.ndarray,
|
|
1089
|
+
gamma_hat: np.ndarray,
|
|
1090
|
+
s2_by_cluster: np.ndarray,
|
|
1091
|
+
) -> np.ndarray:
|
|
1092
|
+
"""
|
|
1093
|
+
Compute per-cluster GMM scores S_g = gamma_hat' c_g - X'_{2g} eps_{2g}.
|
|
1094
|
+
|
|
1095
|
+
Handles NaN/overflow from rank-deficient FE by wrapping in errstate
|
|
1096
|
+
and replacing non-finite values with 0.
|
|
1097
|
+
|
|
1098
|
+
Parameters
|
|
1099
|
+
----------
|
|
1100
|
+
c_by_cluster : np.ndarray, shape (G, p)
|
|
1101
|
+
Per-cluster Stage 1 scores.
|
|
1102
|
+
gamma_hat : np.ndarray, shape (p, k)
|
|
1103
|
+
Cross-moment correction matrix.
|
|
1104
|
+
s2_by_cluster : np.ndarray, shape (G, k)
|
|
1105
|
+
Per-cluster Stage 2 scores.
|
|
1106
|
+
|
|
1107
|
+
Returns
|
|
1108
|
+
-------
|
|
1109
|
+
np.ndarray, shape (G, k)
|
|
1110
|
+
Per-cluster influence scores.
|
|
1111
|
+
"""
|
|
1112
|
+
with np.errstate(invalid="ignore", divide="ignore", over="ignore"):
|
|
1113
|
+
correction = np.dot(c_by_cluster, gamma_hat)
|
|
1114
|
+
np.nan_to_num(correction, copy=False, nan=0.0, posinf=0.0, neginf=0.0)
|
|
1115
|
+
return correction - s2_by_cluster
|
|
1116
|
+
|
|
1068
1117
|
# =========================================================================
|
|
1069
1118
|
# GMM Sandwich Variance (Butts & Gardner 2022)
|
|
1070
1119
|
# =========================================================================
|
|
@@ -1178,12 +1227,19 @@ class TwoStageDiD(TwoStageDiDBootstrapMixin):
|
|
|
1178
1227
|
unique_clusters, cluster_indices = np.unique(cluster_ids, return_inverse=True)
|
|
1179
1228
|
G = len(unique_clusters)
|
|
1180
1229
|
|
|
1181
|
-
|
|
1182
|
-
weighted_X10_csc = weighted_X10.tocsc()
|
|
1230
|
+
n_elements = weighted_X10.shape[0] * weighted_X10.shape[1]
|
|
1183
1231
|
c_by_cluster = np.zeros((G, p))
|
|
1184
|
-
|
|
1185
|
-
|
|
1186
|
-
|
|
1232
|
+
if n_elements > _SPARSE_DENSE_THRESHOLD:
|
|
1233
|
+
# Per-column path: limits peak memory for large FE matrices
|
|
1234
|
+
weighted_X10_csc = weighted_X10.tocsc()
|
|
1235
|
+
for j_col in range(p):
|
|
1236
|
+
col_data = weighted_X10_csc.getcol(j_col).toarray().ravel()
|
|
1237
|
+
np.add.at(c_by_cluster[:, j_col], cluster_indices, col_data)
|
|
1238
|
+
else:
|
|
1239
|
+
# Dense path: faster for moderate-size matrices
|
|
1240
|
+
weighted_X10_dense = weighted_X10.toarray()
|
|
1241
|
+
for j_col in range(p):
|
|
1242
|
+
np.add.at(c_by_cluster[:, j_col], cluster_indices, weighted_X10_dense[:, j_col])
|
|
1187
1243
|
|
|
1188
1244
|
# 3. Per-cluster Stage 2 scores: X'_{2g} eps_{2g}
|
|
1189
1245
|
weighted_X2 = X_2 * eps_2[:, None] # (n x k) dense
|
|
@@ -1192,11 +1248,7 @@ class TwoStageDiD(TwoStageDiDBootstrapMixin):
|
|
|
1192
1248
|
np.add.at(s2_by_cluster[:, j_col], cluster_indices, weighted_X2[:, j_col])
|
|
1193
1249
|
|
|
1194
1250
|
# 4. S_g = gamma_hat' c_g - X'_{2g} eps_{2g}
|
|
1195
|
-
|
|
1196
|
-
correction = np.dot(c_by_cluster, gamma_hat) # (G x p) @ (p x k) = (G x k)
|
|
1197
|
-
# Replace NaN/inf from overflow (rank-deficient FE) with 0
|
|
1198
|
-
np.nan_to_num(correction, copy=False, nan=0.0, posinf=0.0, neginf=0.0)
|
|
1199
|
-
S = correction - s2_by_cluster # (G x k)
|
|
1251
|
+
S = self._compute_gmm_scores(c_by_cluster, gamma_hat, s2_by_cluster)
|
|
1200
1252
|
|
|
1201
1253
|
# 5. Meat: sum_g S_g S'_g = S' S
|
|
1202
1254
|
with np.errstate(invalid="ignore", over="ignore"):
|
|
@@ -1304,6 +1356,7 @@ class TwoStageDiD(TwoStageDiDBootstrapMixin):
|
|
|
1304
1356
|
"alpha": self.alpha,
|
|
1305
1357
|
"cluster": self.cluster,
|
|
1306
1358
|
"n_bootstrap": self.n_bootstrap,
|
|
1359
|
+
"bootstrap_weights": self.bootstrap_weights,
|
|
1307
1360
|
"seed": self.seed,
|
|
1308
1361
|
"rank_deficient_action": self.rank_deficient_action,
|
|
1309
1362
|
"horizon_max": self.horizon_max,
|
|
@@ -15,6 +15,9 @@ from scipy.sparse.linalg import factorized as sparse_factorized
|
|
|
15
15
|
|
|
16
16
|
from diff_diff.linalg import solve_ols
|
|
17
17
|
from diff_diff.staggered_bootstrap import _generate_bootstrap_weights_batch
|
|
18
|
+
# Maximum number of elements before falling back to per-column sparse aggregation.
|
|
19
|
+
# Keep in sync with two_stage.py.
|
|
20
|
+
_SPARSE_DENSE_THRESHOLD = 10_000_000
|
|
18
21
|
from diff_diff.two_stage_results import TwoStageBootstrapResults
|
|
19
22
|
|
|
20
23
|
__all__ = [
|
|
@@ -106,19 +109,26 @@ class TwoStageDiDBootstrapMixin:
|
|
|
106
109
|
unique_clusters, cluster_indices = np.unique(cluster_ids, return_inverse=True)
|
|
107
110
|
G = len(unique_clusters)
|
|
108
111
|
|
|
109
|
-
|
|
112
|
+
n_elements = weighted_X10.shape[0] * weighted_X10.shape[1]
|
|
110
113
|
c_by_cluster = np.zeros((G, p))
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
+
if n_elements > _SPARSE_DENSE_THRESHOLD:
|
|
115
|
+
# Per-column path: limits peak memory for large FE matrices
|
|
116
|
+
weighted_X10_csc = weighted_X10.tocsc()
|
|
117
|
+
for j_col in range(p):
|
|
118
|
+
col_data = weighted_X10_csc.getcol(j_col).toarray().ravel()
|
|
119
|
+
np.add.at(c_by_cluster[:, j_col], cluster_indices, col_data)
|
|
120
|
+
else:
|
|
121
|
+
# Dense path: faster for moderate-size matrices
|
|
122
|
+
weighted_X10_dense = weighted_X10.toarray()
|
|
123
|
+
for j_col in range(p):
|
|
124
|
+
np.add.at(c_by_cluster[:, j_col], cluster_indices, weighted_X10_dense[:, j_col])
|
|
114
125
|
|
|
115
126
|
weighted_X2 = X_2 * eps_2[:, None]
|
|
116
127
|
s2_by_cluster = np.zeros((G, k))
|
|
117
128
|
for j_col in range(k):
|
|
118
129
|
np.add.at(s2_by_cluster[:, j_col], cluster_indices, weighted_X2[:, j_col])
|
|
119
130
|
|
|
120
|
-
|
|
121
|
-
S = correction - s2_by_cluster
|
|
131
|
+
S = self._compute_gmm_scores(c_by_cluster, gamma_hat, s2_by_cluster)
|
|
122
132
|
|
|
123
133
|
# Bread
|
|
124
134
|
XtX_2 = np.dot(X_2.T, X_2)
|
|
@@ -201,7 +211,7 @@ class TwoStageDiDBootstrapMixin:
|
|
|
201
211
|
|
|
202
212
|
n_clusters = len(unique_clusters)
|
|
203
213
|
all_weights = _generate_bootstrap_weights_batch(
|
|
204
|
-
self.n_bootstrap, n_clusters,
|
|
214
|
+
self.n_bootstrap, n_clusters, self.bootstrap_weights, rng
|
|
205
215
|
)
|
|
206
216
|
|
|
207
217
|
# T_b = bread @ (sum_g w_bg * S_g) = bread @ (W @ S)' per boot
|
|
@@ -385,7 +395,7 @@ class TwoStageDiDBootstrapMixin:
|
|
|
385
395
|
|
|
386
396
|
return TwoStageBootstrapResults(
|
|
387
397
|
n_bootstrap=self.n_bootstrap,
|
|
388
|
-
weight_type="rademacher",
|
|
398
|
+
weight_type=self.bootstrap_weights,
|
|
389
399
|
alpha=self.alpha,
|
|
390
400
|
overall_att_se=overall_se,
|
|
391
401
|
overall_att_ci=overall_ci,
|
|
@@ -34,7 +34,7 @@ class TwoStageBootstrapResults:
|
|
|
34
34
|
n_bootstrap : int
|
|
35
35
|
Number of bootstrap iterations.
|
|
36
36
|
weight_type : str
|
|
37
|
-
Type of bootstrap weights
|
|
37
|
+
Type of bootstrap weights: "rademacher", "mammen", or "webb".
|
|
38
38
|
alpha : float
|
|
39
39
|
Significance level used for confidence intervals.
|
|
40
40
|
overall_att_se : float
|