diff-diff 3.0.1__cp314-cp314-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- diff_diff/__init__.py +382 -0
- diff_diff/_backend.py +134 -0
- diff_diff/_rust_backend.cp314-win_amd64.pyd +0 -0
- diff_diff/bacon.py +1140 -0
- diff_diff/bootstrap_utils.py +730 -0
- diff_diff/continuous_did.py +1626 -0
- diff_diff/continuous_did_bspline.py +190 -0
- diff_diff/continuous_did_results.py +374 -0
- diff_diff/datasets.py +815 -0
- diff_diff/diagnostics.py +882 -0
- diff_diff/efficient_did.py +1770 -0
- diff_diff/efficient_did_bootstrap.py +359 -0
- diff_diff/efficient_did_covariates.py +899 -0
- diff_diff/efficient_did_results.py +368 -0
- diff_diff/efficient_did_weights.py +617 -0
- diff_diff/estimators.py +1501 -0
- diff_diff/honest_did.py +2585 -0
- diff_diff/imputation.py +2458 -0
- diff_diff/imputation_bootstrap.py +418 -0
- diff_diff/imputation_results.py +448 -0
- diff_diff/linalg.py +2538 -0
- diff_diff/power.py +2588 -0
- diff_diff/practitioner.py +869 -0
- diff_diff/prep.py +1738 -0
- diff_diff/prep_dgp.py +1718 -0
- diff_diff/pretrends.py +1105 -0
- diff_diff/results.py +918 -0
- diff_diff/stacked_did.py +1049 -0
- diff_diff/stacked_did_results.py +339 -0
- diff_diff/staggered.py +3895 -0
- diff_diff/staggered_aggregation.py +864 -0
- diff_diff/staggered_bootstrap.py +752 -0
- diff_diff/staggered_results.py +416 -0
- diff_diff/staggered_triple_diff.py +1545 -0
- diff_diff/staggered_triple_diff_results.py +416 -0
- diff_diff/sun_abraham.py +1685 -0
- diff_diff/survey.py +1981 -0
- diff_diff/synthetic_did.py +1136 -0
- diff_diff/triple_diff.py +2047 -0
- diff_diff/trop.py +952 -0
- diff_diff/trop_global.py +1270 -0
- diff_diff/trop_local.py +1307 -0
- diff_diff/trop_results.py +356 -0
- diff_diff/twfe.py +542 -0
- diff_diff/two_stage.py +1952 -0
- diff_diff/two_stage_bootstrap.py +520 -0
- diff_diff/two_stage_results.py +400 -0
- diff_diff/utils.py +1902 -0
- diff_diff/visualization/__init__.py +61 -0
- diff_diff/visualization/_common.py +328 -0
- diff_diff/visualization/_continuous.py +274 -0
- diff_diff/visualization/_diagnostic.py +817 -0
- diff_diff/visualization/_event_study.py +1086 -0
- diff_diff/visualization/_power.py +661 -0
- diff_diff/visualization/_staggered.py +833 -0
- diff_diff/visualization/_synthetic.py +197 -0
- diff_diff/wooldridge.py +1285 -0
- diff_diff/wooldridge_results.py +349 -0
- diff_diff-3.0.1.dist-info/METADATA +2997 -0
- diff_diff-3.0.1.dist-info/RECORD +62 -0
- diff_diff-3.0.1.dist-info/WHEEL +4 -0
- diff_diff-3.0.1.dist-info/sboms/diff_diff_rust.cyclonedx.json +5843 -0
diff_diff/prep_dgp.py
ADDED
|
@@ -0,0 +1,1718 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Data generation utilities for difference-in-differences analysis.
|
|
3
|
+
|
|
4
|
+
This module provides functions to generate synthetic datasets for testing
|
|
5
|
+
and validating DiD estimators, including basic 2x2 DiD, staggered adoption,
|
|
6
|
+
factor model data, triple difference, and event study designs.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from typing import Dict, List, Optional
|
|
10
|
+
|
|
11
|
+
import numpy as np
|
|
12
|
+
import pandas as pd
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def generate_did_data(
    n_units: int = 100,
    n_periods: int = 4,
    treatment_effect: float = 5.0,
    treatment_fraction: float = 0.5,
    treatment_period: int = 2,
    unit_fe_sd: float = 2.0,
    time_trend: float = 0.5,
    noise_sd: float = 1.0,
    seed: Optional[int] = None,
) -> pd.DataFrame:
    """
    Generate synthetic data for DiD analysis with known treatment effect.

    Creates a balanced panel in which the outcome is the sum of a baseline
    of 10, a unit fixed effect, a linear time trend, the treatment effect
    (treated units in post periods only) and Gaussian noise.

    Parameters
    ----------
    n_units : int, default=100
        Number of units in the panel.
    n_periods : int, default=4
        Number of time periods.
    treatment_effect : float, default=5.0
        True average treatment effect on the treated.
    treatment_fraction : float, default=0.5
        Fraction of units that receive treatment. The number of treated
        units is ``int(n_units * treatment_fraction)`` and they are the
        units with the lowest IDs.
    treatment_period : int, default=2
        First post-treatment period (0-indexed). Periods >= this are post.
    unit_fe_sd : float, default=2.0
        Standard deviation of unit fixed effects.
    time_trend : float, default=0.5
        Linear time trend coefficient.
    noise_sd : float, default=1.0
        Standard deviation of idiosyncratic noise.
    seed : int, optional
        Random seed for reproducibility.

    Returns
    -------
    pd.DataFrame
        Synthetic panel data with columns:
        - unit: Unit identifier
        - period: Time period
        - treated: Treatment indicator (0/1)
        - post: Post-treatment indicator (0/1)
        - outcome: Outcome variable
        - true_effect: The true treatment effect (for validation)

        The columns are present even when the panel has no rows
        (``n_units=0`` or ``n_periods=0``).

    Examples
    --------
    Generate simple data for testing:

    >>> data = generate_did_data(n_units=50, n_periods=4, treatment_effect=3.0, seed=42)
    >>> len(data)
    200
    >>> data.columns.tolist()
    ['unit', 'period', 'treated', 'post', 'outcome', 'true_effect']

    Verify treatment effect recovery:

    >>> from diff_diff import DifferenceInDifferences
    >>> did = DifferenceInDifferences()
    >>> results = did.fit(data, outcome='outcome', treatment='treated', time='post')
    >>> abs(results.att - 3.0) < 1.0  # Close to true effect
    True
    """
    columns = ["unit", "period", "treated", "post", "outcome", "true_effect"]
    rng = np.random.default_rng(seed)

    # The first n_treated unit IDs form the treated group.
    n_treated = int(n_units * treatment_fraction)

    # All unit fixed effects are drawn in one batch before any noise draw;
    # keeping this order keeps seeded output stable.
    unit_fe = rng.normal(0, unit_fe_sd, n_units)

    records = []
    for unit in range(n_units):
        is_treated = unit < n_treated

        for period in range(n_periods):
            is_post = period >= treatment_period

            # Only treated units in post periods receive the effect.
            effect = treatment_effect if (is_treated and is_post) else 0.0

            # baseline + unit FE + time trend + effect + noise
            y = 10.0 + unit_fe[unit] + time_trend * period + effect + rng.normal(0, noise_sd)

            records.append(
                {
                    "unit": unit,
                    "period": period,
                    "treated": int(is_treated),
                    "post": int(is_post),
                    "outcome": y,
                    "true_effect": effect,
                }
            )

    # Passing the columns explicitly keeps the documented schema for an empty panel.
    return pd.DataFrame(records, columns=columns)
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
def generate_staggered_data(
    n_units: int = 100,
    n_periods: int = 10,
    cohort_periods: Optional[List[int]] = None,
    never_treated_frac: float = 0.3,
    treatment_effect: float = 2.0,
    dynamic_effects: bool = True,
    effect_growth: float = 0.1,
    unit_fe_sd: float = 2.0,
    time_trend: float = 0.1,
    noise_sd: float = 0.5,
    seed: Optional[int] = None,
    panel: bool = True,
) -> pd.DataFrame:
    """
    Generate synthetic data for staggered adoption DiD analysis.

    Creates data where different units receive treatment at different
    times (staggered rollout). Useful for testing CallawaySantAnna,
    SunAbraham, and other staggered DiD estimators.

    Parameters
    ----------
    n_units : int, default=100
        Total number of units in the panel (observations per period when
        ``panel=False``).
    n_periods : int, default=10
        Number of time periods.
    cohort_periods : list of int, optional
        Periods when treatment cohorts are first treated. Each must lie
        in ``[1, n_periods - 1]``. If None, defaults to ``[3, 5, 7]`` when
        ``n_periods >= 8``; otherwise to ``n_periods // 3`` and
        ``2 * n_periods // 3``, clamped to be at least 1 and de-duplicated.
    never_treated_frac : float, default=0.3
        Fraction of units that are never treated (cohort 0). The first
        ``int(n_units * never_treated_frac)`` units are never treated.
    treatment_effect : float, default=2.0
        Base treatment effect at time of treatment.
    dynamic_effects : bool, default=True
        If True, treatment effects grow over time since treatment.
    effect_growth : float, default=0.1
        Per-period growth in treatment effect (if dynamic_effects=True).
        Effect at time t since treatment: effect * (1 + effect_growth * t).
    unit_fe_sd : float, default=2.0
        Standard deviation of unit fixed effects.
    time_trend : float, default=0.1
        Linear time trend coefficient.
    noise_sd : float, default=0.5
        Standard deviation of idiosyncratic noise.
    seed : int, optional
        Random seed for reproducibility.
    panel : bool, default=True
        If True (default), generate balanced panel data (same units across
        all periods). If False, generate repeated cross-section data where
        each period draws independent observations with globally unique
        string IDs of the form ``"u{period}_{i}"``.

    Returns
    -------
    pd.DataFrame
        Synthetic staggered adoption data with columns:
        - unit: Unit identifier
        - period: Time period
        - outcome: Outcome variable
        - first_treat: First treatment period (0 = never treated)
        - treated: Binary indicator (1 if treated at this observation)
        - treat: Binary unit-level ever-treated indicator
        - true_effect: The true treatment effect for this observation

        The columns are present even when no rows are generated.

    Raises
    ------
    ValueError
        If any cohort period is outside ``[1, n_periods - 1]``, or if
        ``cohort_periods`` is empty while some units are to be treated.

    Examples
    --------
    Generate staggered adoption data:

    >>> data = generate_staggered_data(n_units=100, n_periods=10, seed=42)
    >>> data.shape
    (1000, 7)
    >>> cohorts = data.groupby('unit')['first_treat'].first()
    >>> int((cohorts == 0).sum())
    30

    Use with Callaway-Sant'Anna estimator:

    >>> from diff_diff import CallawaySantAnna
    >>> cs = CallawaySantAnna()
    >>> results = cs.fit(data, outcome='outcome', unit='unit',
    ...                  time='period', first_treat='first_treat')
    >>> results.overall_att > 0
    True
    """
    columns = [
        "unit",
        "period",
        "outcome",
        "first_treat",
        "treated",
        "treat",
        "true_effect",
    ]
    rng = np.random.default_rng(seed)

    # Default cohort periods if not specified
    if cohort_periods is None:
        if n_periods >= 8:
            cohort_periods = [3, 5, 7]
        else:
            # n_periods // 3 is 0 for very short panels, which would fail the
            # validation below; clamp to the first valid period and drop
            # duplicates (order preserved).
            candidates = [max(1, n_periods // 3), max(1, 2 * n_periods // 3)]
            cohort_periods = list(dict.fromkeys(candidates))

    # Validate cohort periods
    for cp in cohort_periods:
        if cp < 1 or cp >= n_periods:
            raise ValueError(f"Cohort period {cp} must be between 1 and {n_periods - 1}")

    # Determine number of never-treated and treated units
    n_never = int(n_units * never_treated_frac)
    n_treated = n_units - n_never

    if n_treated > 0 and len(cohort_periods) == 0:
        raise ValueError(
            "cohort_periods must contain at least one period when some units are treated"
        )

    def _draw_first_treat() -> np.ndarray:
        # Never-treated units come first (first_treat = 0); the remaining
        # units get a uniformly drawn cohort.
        first = np.zeros(n_units, dtype=int)
        if n_treated > 0:
            picks = rng.choice(len(cohort_periods), size=n_treated)
            first[n_never:] = [cohort_periods[c] for c in picks]
        return first

    def _make_record(unit_id, period, unit_first_treat, unit_effect) -> Dict:
        # Builds one observation; draws exactly one noise value from rng.
        is_ever_treated = unit_first_treat > 0
        is_treated = is_ever_treated and period >= unit_first_treat

        y = 10.0 + unit_effect + time_trend * period

        effect = 0.0
        if is_treated:
            if dynamic_effects:
                time_since_treatment = period - unit_first_treat
                effect = treatment_effect * (1 + effect_growth * time_since_treatment)
            else:
                effect = treatment_effect
            y += effect

        y += rng.normal(0, noise_sd)

        return {
            "unit": unit_id,
            "period": period,
            "outcome": y,
            "first_treat": unit_first_treat,
            "treated": int(is_treated),
            "treat": int(is_ever_treated),
            "true_effect": effect,
        }

    records = []

    if not panel:
        # --- Repeated cross-section mode ---
        # Each period draws fresh cohort assignments and n_units independent
        # observations; the unit-level component is redrawn per observation.
        for period in range(n_periods):
            ft_period = _draw_first_treat()
            for i in range(n_units):
                unit_fe_proxy = rng.normal(0, unit_fe_sd)
                records.append(_make_record(f"u{period}_{i}", period, ft_period[i], unit_fe_proxy))
        return pd.DataFrame(records, columns=columns)

    # --- Panel mode (default) ---
    # Draw order (cohorts, then all unit FEs, then per-observation noise) is
    # kept fixed so seeded output is stable.
    first_treat = _draw_first_treat()
    unit_fe = rng.normal(0, unit_fe_sd, n_units)

    for unit in range(n_units):
        for period in range(n_periods):
            records.append(_make_record(unit, period, first_treat[unit], unit_fe[unit]))

    # Explicit columns keep the documented schema when no rows were generated.
    return pd.DataFrame(records, columns=columns)
|
|
325
|
+
|
|
326
|
+
|
|
327
|
+
def generate_factor_data(
    n_units: int = 50,
    n_pre: int = 10,
    n_post: int = 5,
    n_treated: int = 10,
    n_factors: int = 2,
    treatment_effect: float = 2.0,
    factor_strength: float = 1.0,
    treated_loading_shift: float = 0.5,
    unit_fe_sd: float = 1.0,
    noise_sd: float = 0.5,
    seed: Optional[int] = None,
) -> pd.DataFrame:
    """
    Generate synthetic panel data with interactive fixed effects (factor model).

    Creates data following the DGP:
        Y_it = mu + alpha_i + beta_t + Lambda_i'F_t + tau*D_it + eps_it

    where Lambda_i'F_t is the interactive fixed effects component, mu is
    fixed at 10 and beta_t rises linearly from 0 to 2 over the panel.
    Useful for testing TROP (Triply Robust Panel) and comparing with
    SyntheticDiD.

    Parameters
    ----------
    n_units : int, default=50
        Total number of units in the panel.
    n_pre : int, default=10
        Number of pre-treatment periods.
    n_post : int, default=5
        Number of post-treatment periods.
    n_treated : int, default=10
        Number of treated units (assigned to first n_treated unit IDs).
    n_factors : int, default=2
        Number of latent factors in the interactive fixed effects.
    treatment_effect : float, default=2.0
        True average treatment effect on the treated.
    factor_strength : float, default=1.0
        Scaling factor for interactive fixed effects.
    treated_loading_shift : float, default=0.5
        Shift in factor loadings for treated units (creates confounding).
    unit_fe_sd : float, default=1.0
        Standard deviation of unit fixed effects.
    noise_sd : float, default=0.5
        Standard deviation of idiosyncratic noise.
    seed : int, optional
        Random seed for reproducibility.

    Returns
    -------
    pd.DataFrame
        Synthetic factor model data with columns:
        - unit: Unit identifier
        - period: Time period
        - outcome: Outcome variable
        - treated: Binary indicator (1 if treated at this observation)
        - treat: Binary unit-level ever-treated indicator
        - true_effect: The true treatment effect for this observation

        The columns are present even when ``n_pre + n_post`` is 0.

    Raises
    ------
    ValueError
        If ``n_treated`` exceeds ``n_units`` or is less than 1.

    Examples
    --------
    Generate data with factor structure:

    >>> data = generate_factor_data(n_units=50, n_factors=2, seed=42)
    >>> data.shape
    (750, 6)

    Use with TROP estimator:

    >>> from diff_diff import TROP
    >>> trop = TROP(n_bootstrap=50, seed=42)
    >>> results = trop.fit(data, outcome='outcome', treatment='treated',
    ...                    unit='unit', time='period',
    ...                    post_periods=list(range(10, 15)))

    Notes
    -----
    The treated units have systematically different factor loadings
    (shifted by `treated_loading_shift`) and a unit fixed effect that is
    higher by 1.0, which creates confounding that standard DiD cannot
    address but TROP can handle.
    """
    # Validate before doing any work.
    if n_treated > n_units:
        raise ValueError(f"n_treated ({n_treated}) cannot exceed n_units ({n_units})")
    if n_treated < 1:
        raise ValueError("n_treated must be at least 1")

    columns = ["unit", "period", "outcome", "treated", "treat", "true_effect"]
    rng = np.random.default_rng(seed)
    n_periods = n_pre + n_post

    # Draw order (factors, loadings, unit FEs, then per-observation noise)
    # is kept fixed so seeded output is stable.

    # Factors F: (n_periods, n_factors)
    F = rng.normal(0, 1, (n_periods, n_factors))

    # Loadings Lambda: (n_factors, n_units); treated units are shifted,
    # which correlates treatment status with the factor structure.
    Lambda = rng.normal(0, 1, (n_factors, n_units))
    Lambda[:, :n_treated] += treated_loading_shift

    # Unit fixed effects (treated units have higher baseline)
    alpha = rng.normal(0, unit_fe_sd, n_units)
    alpha[:n_treated] += 1.0

    # Time fixed effects (linear trend from 0 to 2)
    beta = np.linspace(0, 2, n_periods)

    records = []
    for i in range(n_units):
        is_ever_treated = i < n_treated

        for t in range(n_periods):
            is_active = is_ever_treated and t >= n_pre

            # Additive fixed effects plus interactive term Lambda_i' F_t
            y = 10.0 + alpha[i] + beta[t]
            y += factor_strength * (Lambda[:, i] @ F[t, :])

            effect = treatment_effect if is_active else 0.0
            y += effect

            y += rng.normal(0, noise_sd)

            records.append(
                {
                    "unit": i,
                    "period": t,
                    "outcome": y,
                    "treated": int(is_active),
                    "treat": int(is_ever_treated),
                    "true_effect": effect,
                }
            )

    # Explicit columns keep the documented schema for a zero-period panel.
    return pd.DataFrame(records, columns=columns)
|
|
467
|
+
|
|
468
|
+
|
|
469
|
+
def generate_ddd_data(
    n_per_cell: int = 100,
    treatment_effect: float = 2.0,
    group_effect: float = 2.0,
    partition_effect: float = 1.0,
    time_effect: float = 0.5,
    noise_sd: float = 1.0,
    add_covariates: bool = False,
    seed: Optional[int] = None,
) -> pd.DataFrame:
    """
    Generate synthetic data for Triple Difference (DDD) analysis.

    Creates data following the DGP:
        Y = mu + G + P + T + G*P + G*T + P*T + tau*G*P*T + eps

    where G=group, P=partition, T=time. The treatment effect (tau) only
    applies to units that are in the treated group (G=1), eligible
    partition (P=1), and post-treatment period (T=1). The baseline mu is
    50 and the second-order interaction coefficients are fixed at 1.5
    (G*P), 1.0 (G*T) and 0.5 (P*T).

    Parameters
    ----------
    n_per_cell : int, default=100
        Number of observations per cell (8 cells total: 2x2x2).
    treatment_effect : float, default=2.0
        True average treatment effect on the treated (G=1, P=1, T=1).
    group_effect : float, default=2.0
        Main effect of being in treated group.
    partition_effect : float, default=1.0
        Main effect of being in eligible partition.
    time_effect : float, default=0.5
        Main effect of post-treatment period.
    noise_sd : float, default=1.0
        Standard deviation of idiosyncratic noise.
    add_covariates : bool, default=False
        If True, adds age and education covariates that affect outcome
        (``0.1 * age + 0.5 * education``).
    seed : int, optional
        Random seed for reproducibility.

    Returns
    -------
    pd.DataFrame
        Synthetic DDD data with columns:
        - outcome: Outcome variable
        - group: Group indicator (0=control, 1=treated)
        - partition: Partition indicator (0=ineligible, 1=eligible)
        - time: Time indicator (0=pre, 1=post)
        - unit_id: Unique unit identifier
        - true_effect: The true treatment effect for this observation
        - age: Age covariate (if add_covariates=True)
        - education: Education covariate (if add_covariates=True)

        The columns are present even when ``n_per_cell`` is 0.

    Examples
    --------
    Generate DDD data:

    >>> data = generate_ddd_data(n_per_cell=100, treatment_effect=3.0, seed=42)
    >>> data.shape
    (800, 6)
    >>> data.groupby(['group', 'partition', 'time']).size()
    group  partition  time
    0      0          0       100
                      1       100
           1          0       100
                      1       100
    1      0          0       100
                      1       100
           1          0       100
                      1       100
    dtype: int64

    Use with TripleDifference estimator:

    >>> from diff_diff import TripleDifference
    >>> ddd = TripleDifference()
    >>> results = ddd.fit(data, outcome='outcome', group='group',
    ...                   partition='partition', time='time')
    >>> abs(results.att - 3.0) < 1.0
    True
    """
    columns = ["outcome", "group", "partition", "time", "unit_id", "true_effect"]
    if add_covariates:
        columns += ["age", "education"]

    rng = np.random.default_rng(seed)

    education_levels = [12, 14, 16, 18]
    education_probs = [0.3, 0.3, 0.25, 0.15]

    # Cells in group-major, then partition, then time order.
    cells = [(g, p, t) for g in (0, 1) for p in (0, 1) for t in (0, 1)]

    records = []
    unit_id = 0

    for g, p, t in cells:
        # Deterministic part of the outcome is constant within a cell.
        cell_mean = 50 + group_effect * g + partition_effect * p + time_effect * t

        # Second-order interactions (non-treatment)
        cell_mean += 1.5 * g * p  # group-partition interaction
        cell_mean += 1.0 * g * t  # group-time interaction (diff trends)
        cell_mean += 0.5 * p * t  # partition-time interaction

        # Treatment effect: ONLY for G=1, P=1, T=1
        effect = treatment_effect if (g == 1 and p == 1 and t == 1) else 0.0
        cell_mean += effect

        for _ in range(n_per_cell):
            y = cell_mean

            # Covariates are drawn even when unused so that the noise draws
            # are the same with and without add_covariates for a given seed.
            age = rng.normal(40, 10)
            education = rng.choice(education_levels, p=education_probs)

            if add_covariates:
                y += 0.1 * age + 0.5 * education

            y += rng.normal(0, noise_sd)

            record = {
                "outcome": y,
                "group": g,
                "partition": p,
                "time": t,
                "unit_id": unit_id,
                "true_effect": effect,
            }

            if add_covariates:
                record["age"] = age
                record["education"] = education

            records.append(record)
            unit_id += 1

    # Explicit columns keep the documented schema when no rows were generated.
    return pd.DataFrame(records, columns=columns)
|
|
599
|
+
|
|
600
|
+
|
|
601
|
+
def generate_panel_data(
    n_units: int = 100,
    n_periods: int = 8,
    treatment_period: int = 4,
    treatment_fraction: float = 0.5,
    treatment_effect: float = 5.0,
    parallel_trends: bool = True,
    trend_violation: float = 1.0,
    unit_fe_sd: float = 2.0,
    noise_sd: float = 0.5,
    seed: Optional[int] = None,
) -> pd.DataFrame:
    """
    Generate synthetic panel data for parallel trends testing.

    Creates panel data with optional violation of parallel trends, useful
    for testing parallel trends diagnostics, placebo tests, and sensitivity
    analysis methods.

    Parameters
    ----------
    n_units : int, default=100
        Total number of units in the panel.
    n_periods : int, default=8
        Number of time periods.
    treatment_period : int, default=4
        First post-treatment period (0-indexed).
    treatment_fraction : float, default=0.5
        Fraction of units that receive treatment. The first
        ``int(n_units * treatment_fraction)`` units are treated.
    treatment_effect : float, default=5.0
        True average treatment effect on the treated.
    parallel_trends : bool, default=True
        If True, treated and control groups share a common linear trend
        with slope 1.0. If False, the treated group has a steeper trend,
        applied in every period (pre and post).
    trend_violation : float, default=1.0
        Size of the differential trend for treated group when parallel_trends=False.
        Treated units have trend = common_trend + trend_violation.
    unit_fe_sd : float, default=2.0
        Standard deviation of unit fixed effects.
    noise_sd : float, default=0.5
        Standard deviation of idiosyncratic noise.
    seed : int, optional
        Random seed for reproducibility.

    Returns
    -------
    pd.DataFrame
        Synthetic panel data with columns:
        - unit: Unit identifier
        - period: Time period
        - treated: Binary unit-level treatment indicator
        - post: Binary post-treatment indicator
        - outcome: Outcome variable
        - true_effect: The true treatment effect for this observation

        The columns are present even when ``n_units`` is 0.

    Raises
    ------
    ValueError
        If ``treatment_period`` is less than 1 or not less than ``n_periods``.

    Examples
    --------
    Generate data with parallel trends:

    >>> data_parallel = generate_panel_data(parallel_trends=True, seed=42)
    >>> from diff_diff.utils import check_parallel_trends
    >>> result = check_parallel_trends(data_parallel, outcome='outcome',
    ...                                time='period', treatment_group='treated',
    ...                                pre_periods=[0, 1, 2, 3])
    >>> result['parallel_trends_plausible']
    True

    Generate data with trend violation:

    >>> data_violation = generate_panel_data(parallel_trends=False, seed=42)
    >>> result = check_parallel_trends(data_violation, outcome='outcome',
    ...                                time='period', treatment_group='treated',
    ...                                pre_periods=[0, 1, 2, 3])
    >>> result['parallel_trends_plausible']
    False
    """
    if treatment_period < 1:
        raise ValueError("treatment_period must be at least 1")
    if treatment_period >= n_periods:
        raise ValueError(f"treatment_period must be less than n_periods ({n_periods})")

    columns = ["unit", "period", "treated", "post", "outcome", "true_effect"]
    rng = np.random.default_rng(seed)

    n_treated = int(n_units * treatment_fraction)

    records = []
    for unit in range(n_units):
        is_treated = unit < n_treated

        # One fixed-effect draw per unit, taken before that unit's noise
        # draws; this order keeps seeded output stable.
        unit_fe = rng.normal(0, unit_fe_sd)

        # Common slope is 1.0; only treated units deviate, and only when
        # parallel trends are switched off.
        if is_treated and not parallel_trends:
            slope = 1.0 + trend_violation
        else:
            slope = 1.0

        for period in range(n_periods):
            post = period >= treatment_period

            y = 10.0 + unit_fe + period * slope

            # Treatment effect (only for treated in post-period)
            effect = treatment_effect if (is_treated and post) else 0.0
            y += effect

            y += rng.normal(0, noise_sd)

            records.append(
                {
                    "unit": unit,
                    "period": period,
                    "treated": int(is_treated),
                    "post": int(post),
                    "outcome": y,
                    "true_effect": effect,
                }
            )

    # Explicit columns keep the documented schema for an empty panel.
    return pd.DataFrame(records, columns=columns)
|
|
727
|
+
|
|
728
|
+
|
|
729
|
+
def generate_event_study_data(
    n_units: int = 300,
    n_pre: int = 5,
    n_post: int = 5,
    treatment_fraction: float = 0.5,
    treatment_effect: float = 5.0,
    unit_fe_sd: float = 2.0,
    noise_sd: float = 2.0,
    seed: Optional[int] = None,
) -> pd.DataFrame:
    """
    Build a synthetic panel suited to event-study style estimation.

    All treated units switch on at the same calendar period (``n_pre``), so
    the panel has ``n_pre`` pre-treatment and ``n_post`` post-treatment
    periods. Handy for exercising MultiPeriodDiD, pre-trends power analysis,
    and HonestDiD sensitivity analysis.

    Parameters
    ----------
    n_units : int, default=300
        Number of units in the panel.
    n_pre : int, default=5
        How many periods precede treatment.
    n_post : int, default=5
        How many periods follow (and include) treatment onset.
    treatment_fraction : float, default=0.5
        Share of units assigned to treatment; the first
        ``int(n_units * treatment_fraction)`` unit ids are treated.
    treatment_effect : float, default=5.0
        Constant effect added to treated units in post periods.
    unit_fe_sd : float, default=2.0
        Standard deviation of the unit fixed effects.
    noise_sd : float, default=2.0
        Standard deviation of the idiosyncratic noise.
    seed : int, optional
        Seed for the random number generator.

    Returns
    -------
    pd.DataFrame
        One row per unit-period with columns:
        - unit: Unit identifier
        - period: Time period (0-indexed)
        - treated: Binary unit-level treatment indicator
        - post: Binary post-treatment indicator
        - outcome: Outcome variable
        - event_time: Period relative to treatment (negative=pre, 0+=post)
        - true_effect: Treatment effect applied to this observation

    Examples
    --------
    Generate event study data:

    >>> data = generate_event_study_data(n_units=300, n_pre=5, n_post=5, seed=42)
    >>> data['event_time'].unique()
    array([-5, -4, -3, -2, -1, 0, 1, 2, 3, 4])

    Use with MultiPeriodDiD:

    >>> from diff_diff import MultiPeriodDiD
    >>> mp_did = MultiPeriodDiD()
    >>> results = mp_did.fit(data, outcome='outcome', treatment='treated',
    ...                      time='period', post_periods=[5, 6, 7, 8, 9])

    Notes
    -----
    ``event_time`` is measured relative to the treatment period:
    - Negative values: pre-treatment periods
    - 0: first post-treatment period
    - Positive values: later post-treatment periods
    """
    generator = np.random.default_rng(seed)

    first_post_period = n_pre
    total_periods = n_pre + n_post
    treated_cutoff = int(n_units * treatment_fraction)

    rows = []
    for unit_id in range(n_units):
        in_treatment_group = unit_id < treated_cutoff
        # One fixed-effect draw per unit, taken before that unit's noise draws.
        fixed_effect = generator.normal(0, unit_fe_sd)

        for current in range(total_periods):
            is_post = current >= first_post_period

            # Intercept + unit FE + common linear trend (slope 0.5 per period).
            baseline = 10.0 + fixed_effect + current * 0.5

            # The effect only applies to treated units once treatment starts.
            applied = treatment_effect if (in_treatment_group and is_post) else 0.0

            shock = generator.normal(0, noise_sd)

            rows.append(
                dict(
                    unit=unit_id,
                    period=current,
                    treated=int(in_treatment_group),
                    post=int(is_post),
                    outcome=baseline + applied + shock,
                    event_time=current - first_post_period,
                    true_effect=applied,
                )
            )

    return pd.DataFrame(rows)
|
|
841
|
+
|
|
842
|
+
|
|
843
|
+
def generate_continuous_did_data(
    n_units: int = 500,
    n_periods: int = 4,
    cohort_periods: Optional[List[int]] = None,
    never_treated_frac: float = 0.3,
    dose_distribution: str = "lognormal",
    dose_params: Optional[Dict] = None,
    att_function: str = "linear",
    att_slope: float = 2.0,
    att_intercept: float = 1.0,
    unit_fe_sd: float = 2.0,
    time_trend: float = 0.5,
    noise_sd: float = 1.0,
    seed: Optional[int] = None,
) -> pd.DataFrame:
    """
    Generate synthetic data for continuous DiD analysis with known dose-response.

    Creates a balanced panel with continuous treatment doses and known ATT(d)
    function, satisfying strong parallel trends by construction.

    Parameters
    ----------
    n_units : int, default=500
        Number of units in the panel.
    n_periods : int, default=4
        Number of time periods (1-indexed).
    cohort_periods : list of int, optional
        Treatment cohort periods. Default: ``[2]`` (single cohort).
        Must be non-empty when given.
    never_treated_frac : float, default=0.3
        Fraction of units that are never-treated. These occupy the first
        ``int(n_units * never_treated_frac)`` unit ids.
    dose_distribution : str, default="lognormal"
        Distribution for dose: ``"lognormal"``, ``"uniform"``, ``"exponential"``.
    dose_params : dict, optional
        Distribution-specific parameters. Defaults:
        lognormal: ``{"mean": 0.5, "sigma": 0.5}``
        uniform: ``{"low": 0.5, "high": 5.0}``
        exponential: ``{"scale": 2.0}``
    att_function : str, default="linear"
        Functional form of ATT(d): ``"linear"``, ``"quadratic"``, ``"log"``.
    att_slope : float, default=2.0
        Slope parameter for ATT function.
    att_intercept : float, default=1.0
        Intercept parameter for ATT function.
    unit_fe_sd : float, default=2.0
        Standard deviation of unit fixed effects.
    time_trend : float, default=0.5
        Linear time trend coefficient.
    noise_sd : float, default=1.0
        Standard deviation of idiosyncratic noise.
    seed : int, optional
        Random seed for reproducibility.

    Returns
    -------
    pd.DataFrame
        Panel data with columns: ``unit``, ``period``, ``outcome``,
        ``first_treat``, ``dose``, ``true_att``.

    Raises
    ------
    ValueError
        If ``dose_distribution`` or ``att_function`` is not one of the
        supported names, or if ``cohort_periods`` is empty. These checks run
        before any data is generated, so an invalid option is rejected even
        when no treated post-period observation would have exercised it.
    """
    # --- Fail fast on unsupported options -------------------------------
    default_params = {
        "lognormal": {"mean": 0.5, "sigma": 0.5},
        "uniform": {"low": 0.5, "high": 5.0},
        "exponential": {"scale": 2.0},
    }
    if dose_distribution not in default_params:
        raise ValueError(
            f"dose_distribution must be 'lognormal', 'uniform', or 'exponential', "
            f"got '{dose_distribution}'"
        )

    # Dispatch table for ATT(d); looked up once instead of re-comparing the
    # string for every treated observation.
    att_forms = {
        "linear": lambda d: att_intercept + att_slope * d,
        "quadratic": lambda d: att_intercept + att_slope * d**2,
        "log": lambda d: att_intercept + att_slope * np.log1p(d),
    }
    if att_function not in att_forms:
        raise ValueError(
            f"att_function must be 'linear', 'quadratic', or 'log', " f"got '{att_function}'"
        )
    att_of_dose = att_forms[att_function]

    if cohort_periods is None:
        cohort_periods = [2]
    if len(cohort_periods) == 0:
        # Previously surfaced as a ZeroDivisionError from the cohort split.
        raise ValueError("cohort_periods must be a non-empty list of integers")

    rng = np.random.default_rng(seed)

    # --- Assign units to cohorts ----------------------------------------
    # Never-treated units come first (cohort 0); treated units are split
    # evenly across cohorts, with the last cohort absorbing the remainder.
    n_never = int(n_units * never_treated_frac)
    n_treated_total = n_units - n_never
    n_per_cohort = n_treated_total // len(cohort_periods)

    cohort_assignments = np.zeros(n_units, dtype=int)
    idx = 0
    last_cohort = len(cohort_periods) - 1
    for i, g in enumerate(cohort_periods):
        n_this = n_per_cohort if i < last_cohort else n_treated_total - idx
        cohort_assignments[n_never + idx : n_never + idx + n_this] = g
        idx += n_this

    # --- Generate doses (treated units only; never-treated keep dose 0) --
    params = dose_params or default_params.get(dose_distribution, {})

    dose_per_unit = np.zeros(n_units)
    treated_mask = cohort_assignments > 0
    n_treated_actual = int(np.sum(treated_mask))

    if dose_distribution == "lognormal":
        dose_per_unit[treated_mask] = rng.lognormal(
            mean=params.get("mean", 0.5),
            sigma=params.get("sigma", 0.5),
            size=n_treated_actual,
        )
    elif dose_distribution == "uniform":
        dose_per_unit[treated_mask] = rng.uniform(
            low=params.get("low", 0.5),
            high=params.get("high", 5.0),
            size=n_treated_actual,
        )
    else:  # "exponential" (validated above)
        dose_per_unit[treated_mask] = rng.exponential(
            scale=params.get("scale", 2.0),
            size=n_treated_actual,
        )

    # --- Unit fixed effects ----------------------------------------------
    unit_fe = rng.normal(0, unit_fe_sd, size=n_units)

    # --- Build panel -------------------------------------------------------
    periods = np.arange(1, n_periods + 1)
    records = []
    for i in range(n_units):
        g_i = cohort_assignments[i]
        d_i = dose_per_unit[i]
        for t in periods:
            # Potential outcome without treatment
            y0 = unit_fe[i] + time_trend * t + rng.normal(0, noise_sd)

            # Treatment effect applies from the cohort's first treated period on
            if g_i > 0 and t >= g_i:
                att_d = att_of_dose(d_i)
            else:
                att_d = 0.0

            records.append(
                {
                    "unit": i,
                    "period": int(t),
                    "outcome": y0 + att_d,
                    "first_treat": int(g_i) if g_i > 0 else 0,
                    "dose": d_i,
                    "true_att": att_d,
                }
            )

    return pd.DataFrame(records)
|
|
997
|
+
|
|
998
|
+
|
|
999
|
+
def generate_staggered_ddd_data(
    n_units: int = 200,
    n_periods: int = 8,
    cohort_periods: Optional[List[int]] = None,
    never_enabled_frac: float = 0.25,
    eligibility_frac: float = 0.5,
    treatment_effect: float = 3.0,
    dynamic_effects: bool = False,
    effect_growth: float = 0.1,
    eligibility_trend: float = 0.3,
    noise_sd: float = 0.5,
    add_covariates: bool = False,
    seed: Optional[int] = None,
    unit_fe_sd: float = 2.0,
) -> pd.DataFrame:
    """
    Generate synthetic data for staggered triple difference (DDD) analysis.

    Creates a balanced panel with staggered enabling times and a binary
    eligibility dimension. Treatment occurs when a unit is both enabled
    (t >= S_i) and eligible (Q_i = 1). DDD-CPT holds by construction.

    Parameters
    ----------
    n_units : int, default=200
        Number of units.
    n_periods : int, default=8
        Number of time periods (1-indexed).
    cohort_periods : list of int, optional
        Enabling periods. Default: [4, 6]. Must be non-empty when given.
    never_enabled_frac : float, default=0.25
        Fraction of never-enabled units. These occupy the first
        ``int(n_units * never_enabled_frac)`` unit ids.
    eligibility_frac : float, default=0.5
        Fraction of eligible units (Q=1) within each cohort. At least one
        unit per non-empty cohort is always made eligible.
    treatment_effect : float, default=3.0
        True ATT for treated units.
    dynamic_effects : bool, default=False
        If True, effects grow over time since enabling.
    effect_growth : float, default=0.1
        Per-period effect growth rate when dynamic_effects=True.
    eligibility_trend : float, default=0.3
        Differential time trend for eligible vs ineligible units.
        Same across all enabling groups (preserves DDD-CPT).
    noise_sd : float, default=0.5
        Standard deviation of idiosyncratic noise.
    add_covariates : bool, default=False
        If True, add covariates x1 (continuous) and x2 (binary).
    seed : int, optional
        Random seed.
    unit_fe_sd : float, default=2.0
        Standard deviation of unit fixed effects. The default reproduces the
        value that was previously hard-coded, so existing seeds yield the
        same data.

    Returns
    -------
    pd.DataFrame
        Columns: unit, period, outcome, first_treat, eligibility, treated,
        true_effect. Also x1, x2 if add_covariates=True.

    Raises
    ------
    ValueError
        If ``cohort_periods`` is empty.
    """
    if cohort_periods is None:
        cohort_periods = [4, 6]
    if len(cohort_periods) == 0:
        # Previously surfaced as a ZeroDivisionError from the cohort split.
        raise ValueError("cohort_periods must be a non-empty list of integers")

    rng = np.random.default_rng(seed)

    # --- Assign units to cohorts ----------------------------------------
    # Never-enabled units come first (cohort 0); enabled units are split
    # evenly across cohorts, with the last cohort absorbing the remainder.
    n_never = int(n_units * never_enabled_frac)
    n_treated_total = n_units - n_never
    n_per_cohort = n_treated_total // len(cohort_periods)

    unit_cohort = np.zeros(n_units, dtype=float)
    idx = n_never
    last_cohort = len(cohort_periods) - 1
    for i, g in enumerate(cohort_periods):
        # For the last cohort this equals n_treated_total - idx + n_never.
        n_g = n_per_cohort if i < last_cohort else n_units - idx
        unit_cohort[idx : idx + n_g] = g
        idx += n_g

    # --- Assign eligibility (within each cohort, fraction eligible) ------
    unit_elig = np.zeros(n_units, dtype=int)
    for g_val in [0.0] + [float(g) for g in cohort_periods]:
        mask = unit_cohort == g_val
        n_g = int(np.sum(mask))
        if n_g == 0:
            continue
        # Clamp to [1, n_g] so every non-empty cohort has an eligible unit.
        n_eligible = max(1, min(int(n_g * eligibility_frac), n_g))
        indices = np.where(mask)[0]
        eligible_idx = rng.choice(indices, size=n_eligible, replace=False)
        unit_elig[eligible_idx] = 1

    # --- Unit fixed effects ----------------------------------------------
    unit_fe = rng.normal(0, unit_fe_sd, size=n_units)

    # --- Covariates (drawn only when requested) ---------------------------
    x1 = rng.normal(0, 1, size=n_units) if add_covariates else None
    x2 = rng.choice([0, 1], size=n_units) if add_covariates else None

    # --- Generate panel ----------------------------------------------------
    records = []
    for i in range(n_units):
        g_i = unit_cohort[i]
        q_i = unit_elig[i]
        for t in range(1, n_periods + 1):
            # Base: unit FE + time trend + eligibility-time interaction
            gamma_t = 0.1 * t
            y = unit_fe[i] + gamma_t + 1.0 * q_i + eligibility_trend * q_i * gamma_t

            if add_covariates:
                y += 0.5 * x1[i] + 0.3 * x2[i]

            # Treatment effect: enabled AND eligible
            treated = int(g_i > 0 and t >= g_i and q_i == 1)
            true_eff = 0.0
            if treated:
                true_eff = treatment_effect
                if dynamic_effects:
                    # Effect scales linearly with periods since enabling.
                    true_eff *= 1 + effect_growth * (t - g_i)
                y += true_eff

            y += rng.normal(0, noise_sd)

            row = {
                "unit": i,
                "period": t,
                "outcome": y,
                "first_treat": int(g_i) if g_i > 0 else 0,
                "eligibility": q_i,
                "treated": treated,
                "true_effect": true_eff,
            }
            if add_covariates:
                row["x1"] = x1[i]
                row["x2"] = x2[i]

            records.append(row)

    return pd.DataFrame(records)
|
|
1130
|
+
|
|
1131
|
+
|
|
1132
|
+
def _rank_pair_weights(
    unit_weight: np.ndarray,
    unit_stratum: np.ndarray,
    y0: np.ndarray,
    n_strata: int,
) -> None:
    """Align sampling weights with Y(0) rank inside every stratum (in-place).

    Within a stratum, the unit with the largest ``y0`` ends up with the
    largest weight. This mimics informative sampling in which high-outcome
    (hard-to-reach) subpopulations are under-covered and so carry larger
    inverse-selection-probability weights.

    Parameters
    ----------
    unit_weight : np.ndarray
        Per-unit weights; overwritten in place.
    unit_stratum : np.ndarray
        Stratum label of each unit; labels ``0 .. n_strata - 1`` are visited.
    y0 : np.ndarray
        Per-unit untreated outcome used for ranking.
    n_strata : int
        Number of strata to process.

    Returns
    -------
    None
        The result is written into ``unit_weight``.
    """
    for stratum in range(n_strata):
        selector = unit_stratum == stratum
        if selector.sum() <= 1:
            # Nothing to re-pair with zero or one unit.
            continue

        members = np.nonzero(selector)[0]
        current = unit_weight[members].copy()
        outcomes = y0[members]

        if current.std() < 1e-10:
            # Weights are flat inside the stratum: synthesize rank-based
            # weights (1 = lowest outcome) rescaled so the stratum keeps its
            # original mean weight.
            ascending = np.argsort(outcomes)
            ranks = np.empty(ascending.size, dtype=float)
            ranks[ascending] = np.arange(1, ascending.size + 1)
            unit_weight[members] = ranks / ranks.mean() * current.mean()
            continue

        # Weights already vary: hand the existing weights out again so that
        # the heaviest weight goes to the highest outcome, and so on down.
        descending = np.argsort(-outcomes)
        heaviest_first = np.sort(current)[::-1]
        unit_weight[members[descending]] = heaviest_first
|
|
1161
|
+
|
|
1162
|
+
|
|
1163
|
+
def generate_survey_did_data(
|
|
1164
|
+
n_units: int = 200,
|
|
1165
|
+
n_periods: int = 8,
|
|
1166
|
+
cohort_periods: Optional[List[int]] = None,
|
|
1167
|
+
never_treated_frac: float = 0.3,
|
|
1168
|
+
treatment_effect: float = 2.0,
|
|
1169
|
+
dynamic_effects: bool = False,
|
|
1170
|
+
effect_growth: float = 0.3,
|
|
1171
|
+
n_strata: int = 5,
|
|
1172
|
+
psu_per_stratum: int = 8,
|
|
1173
|
+
fpc_per_stratum: float = 200.0,
|
|
1174
|
+
weight_variation: str = "moderate",
|
|
1175
|
+
psu_re_sd: float = 2.0,
|
|
1176
|
+
psu_period_factor: float = 0.5,
|
|
1177
|
+
unit_fe_sd: float = 1.0,
|
|
1178
|
+
noise_sd: float = 0.5,
|
|
1179
|
+
include_replicate_weights: bool = False,
|
|
1180
|
+
add_covariates: bool = False,
|
|
1181
|
+
panel: bool = True,
|
|
1182
|
+
seed: Optional[int] = None,
|
|
1183
|
+
# --- Research-grade DGP parameters ---
|
|
1184
|
+
icc: Optional[float] = None,
|
|
1185
|
+
weight_cv: Optional[float] = None,
|
|
1186
|
+
informative_sampling: bool = False,
|
|
1187
|
+
heterogeneous_te_by_strata: bool = False,
|
|
1188
|
+
strata_sizes: Optional[List[int]] = None,
|
|
1189
|
+
return_true_population_att: bool = False,
|
|
1190
|
+
covariate_effects: Optional[tuple] = None,
|
|
1191
|
+
te_covariate_interaction: float = 0.0,
|
|
1192
|
+
) -> pd.DataFrame:
|
|
1193
|
+
"""
|
|
1194
|
+
Generate synthetic staggered DiD data with survey structure.
|
|
1195
|
+
|
|
1196
|
+
Creates a balanced panel (or repeated cross-section) with stratified
|
|
1197
|
+
multi-stage sampling design (strata, PSUs, FPC, sampling weights) and
|
|
1198
|
+
known treatment effects. The survey structure introduces intra-cluster
|
|
1199
|
+
correlation via PSU random effects, making design-based SEs larger
|
|
1200
|
+
than naive SEs.
|
|
1201
|
+
|
|
1202
|
+
Modeled on ACS/BRFSS-style stratified household surveys: strata
|
|
1203
|
+
represent geographic region types, PSUs are census tracts sampled
|
|
1204
|
+
within each stratum, and weights are inverse selection probabilities.
|
|
1205
|
+
|
|
1206
|
+
Parameters
|
|
1207
|
+
----------
|
|
1208
|
+
n_units : int, default=200
|
|
1209
|
+
Number of units (respondents) per period.
|
|
1210
|
+
n_periods : int, default=8
|
|
1211
|
+
Number of time periods (1-indexed).
|
|
1212
|
+
cohort_periods : list of int, optional
|
|
1213
|
+
Treatment cohort periods (1-indexed, each must be >= 2 for at least
|
|
1214
|
+
one pre-treatment period). Default derived from n_periods; [3, 5]
|
|
1215
|
+
when n_periods >= 8. Requires n_periods >= 4 when not specified.
|
|
1216
|
+
never_treated_frac : float, default=0.3
|
|
1217
|
+
Fraction of units that are never treated.
|
|
1218
|
+
treatment_effect : float, default=2.0
|
|
1219
|
+
True ATT for treated units.
|
|
1220
|
+
dynamic_effects : bool, default=False
|
|
1221
|
+
If True, effects grow over time since treatment.
|
|
1222
|
+
effect_growth : float, default=0.3
|
|
1223
|
+
Per-period effect growth rate when dynamic_effects=True.
|
|
1224
|
+
n_strata : int, default=5
|
|
1225
|
+
Number of geographic strata.
|
|
1226
|
+
psu_per_stratum : int, default=8
|
|
1227
|
+
Number of PSUs (census tracts) per stratum.
|
|
1228
|
+
fpc_per_stratum : float, default=200.0
|
|
1229
|
+
Finite population correction (total tracts per stratum).
|
|
1230
|
+
weight_variation : str, default="moderate"
|
|
1231
|
+
Controls sampling weight dispersion across strata.
|
|
1232
|
+
"none": all weights equal (1.0).
|
|
1233
|
+
"moderate": weights range ~1.0-2.0 across strata.
|
|
1234
|
+
"high": weights range ~1.0-4.0 across strata.
|
|
1235
|
+
psu_re_sd : float, default=2.0
|
|
1236
|
+
Standard deviation of PSU random effects. Controls intra-cluster
|
|
1237
|
+
correlation and drives DEFF > 1.
|
|
1238
|
+
psu_period_factor : float, default=0.5
|
|
1239
|
+
Multiplier for PSU-period interaction shocks (relative to psu_re_sd).
|
|
1240
|
+
Higher values increase time-varying within-cluster correlation,
|
|
1241
|
+
which survives DiD's time-differencing and inflates design-based SEs.
|
|
1242
|
+
unit_fe_sd : float, default=1.0
|
|
1243
|
+
Standard deviation of unit fixed effects.
|
|
1244
|
+
noise_sd : float, default=0.5
|
|
1245
|
+
Standard deviation of idiosyncratic noise.
|
|
1246
|
+
include_replicate_weights : bool, default=False
|
|
1247
|
+
If True, add JK1 (delete-one-PSU) replicate weight columns.
|
|
1248
|
+
Requires at least 2 PSUs.
|
|
1249
|
+
add_covariates : bool, default=False
|
|
1250
|
+
If True, add covariates x1 (continuous) and x2 (binary).
|
|
1251
|
+
panel : bool, default=True
|
|
1252
|
+
If True, generate panel data (same respondents across periods).
|
|
1253
|
+
If False, generate repeated cross-sections with fresh respondent
|
|
1254
|
+
effects and unique unit IDs each period (for use with
|
|
1255
|
+
CallawaySantAnna(panel=False)).
|
|
1256
|
+
seed : int, optional
|
|
1257
|
+
Random seed for reproducibility.
|
|
1258
|
+
icc : float, optional
|
|
1259
|
+
Target intra-class correlation coefficient (0 < icc < 1). Overrides
|
|
1260
|
+
``psu_re_sd`` via the variance decomposition:
|
|
1261
|
+
``psu_re_sd = sqrt(icc * (sigma2_unit + sigma2_noise + sigma2_cov) /
|
|
1262
|
+
((1 - icc) * (1 + psu_period_factor^2)))`` where ``sigma2_cov``
|
|
1263
|
+
includes covariate variance when ``add_covariates=True``.
|
|
1264
|
+
Cannot be combined with a non-default ``psu_re_sd``.
|
|
1265
|
+
weight_cv : float, optional
|
|
1266
|
+
Target coefficient of variation for sampling weights. Generates
|
|
1267
|
+
LogNormal weights normalized to mean 1, bypassing ``weight_variation``.
|
|
1268
|
+
Cannot be combined with a non-default ``weight_variation``.
|
|
1269
|
+
informative_sampling : bool, default=False
|
|
1270
|
+
If True, sampling weights correlate with Y(0) — high-outcome units
|
|
1271
|
+
receive higher weights (under-coverage → larger inverse-selection-
|
|
1272
|
+
probability weights). Uses rank-pairing within each stratum. For
|
|
1273
|
+
panel data, ranking is done once from period-1 outcomes. For
|
|
1274
|
+
repeated cross-sections, ranking is refreshed each period. Within
|
|
1275
|
+
each stratum, rank-based weights are scaled to preserve the
|
|
1276
|
+
stratum's baseline weight level from ``weight_variation``.
|
|
1277
|
+
When ``add_covariates=True``, covariate contributions are
|
|
1278
|
+
included in the Y(0) ranking.
|
|
1279
|
+
heterogeneous_te_by_strata : bool, default=False
|
|
1280
|
+
If True, treatment effect varies by stratum:
|
|
1281
|
+
``TE_h = TE * (1 + 0.5 * (h - mean) / std)``. Creates a gap
|
|
1282
|
+
between unweighted and population ATT. With ``n_strata=1``,
|
|
1283
|
+
all units receive the base ``treatment_effect``.
|
|
1284
|
+
strata_sizes : list of int, optional
|
|
1285
|
+
Custom per-stratum unit counts. Must have length ``n_strata`` and
|
|
1286
|
+
sum to ``n_units``. Replaces equal allocation across strata.
|
|
1287
|
+
return_true_population_att : bool, default=False
|
|
1288
|
+
If True, attaches a diagnostic dict to ``df.attrs["dgp_truth"]``
|
|
1289
|
+
with keys: ``population_att`` (weight-weighted average of treated
|
|
1290
|
+
true effects), ``deff_kish`` (1 + CV(w)^2), ``base_stratum_effects``
|
|
1291
|
+
(base stratum TEs before dynamic/covariate modifiers),
|
|
1292
|
+
``icc_realized`` (ANOVA-based
|
|
1293
|
+
ICC computed on period-1 data).
|
|
1294
|
+
covariate_effects : tuple of (float, float), optional
|
|
1295
|
+
Coefficients ``(beta1, beta2)`` for covariates x1 and x2 in the
|
|
1296
|
+
outcome equation ``y += beta1 * x1 + beta2 * x2``. Default uses
|
|
1297
|
+
``(0.5, 0.3)``. Only used when ``add_covariates=True``. The ICC
|
|
1298
|
+
calibration automatically adjusts for the implied covariate variance.
|
|
1299
|
+
te_covariate_interaction : float, default=0.0
|
|
1300
|
+
Coefficient for treatment-by-covariate interaction:
|
|
1301
|
+
``TE_i = base_TE + te_covariate_interaction * x1_i``. Creates
|
|
1302
|
+
unit-level treatment effect heterogeneity driven by the continuous
|
|
1303
|
+
covariate. Requires ``add_covariates=True``.
|
|
1304
|
+
|
|
1305
|
+
Returns
|
|
1306
|
+
-------
|
|
1307
|
+
pd.DataFrame
|
|
1308
|
+
Columns: unit, period, outcome, first_treat, treated, true_effect,
|
|
1309
|
+
stratum, psu, fpc, weight. Also rep_0..rep_K if
|
|
1310
|
+
include_replicate_weights=True, and x1, x2 if add_covariates=True.
|
|
1311
|
+
If ``return_true_population_att=True``, ``df.attrs["dgp_truth"]``
|
|
1312
|
+
contains DGP diagnostics.
|
|
1313
|
+
"""
|
|
1314
|
+
rng = np.random.default_rng(seed)
|
|
1315
|
+
|
|
1316
|
+
# --- Upfront parameter validation ---
|
|
1317
|
+
if n_units < 1:
|
|
1318
|
+
raise ValueError(f"n_units must be positive, got {n_units}")
|
|
1319
|
+
if n_periods < 1:
|
|
1320
|
+
raise ValueError(f"n_periods must be positive, got {n_periods}")
|
|
1321
|
+
if n_strata < 1:
|
|
1322
|
+
raise ValueError(f"n_strata must be positive, got {n_strata}")
|
|
1323
|
+
if psu_per_stratum < 1:
|
|
1324
|
+
raise ValueError(f"psu_per_stratum must be positive, got {psu_per_stratum}")
|
|
1325
|
+
if not 0.0 <= never_treated_frac <= 1.0:
|
|
1326
|
+
raise ValueError(
|
|
1327
|
+
f"never_treated_frac must be between 0 and 1, got {never_treated_frac}"
|
|
1328
|
+
)
|
|
1329
|
+
if fpc_per_stratum < psu_per_stratum:
|
|
1330
|
+
raise ValueError(
|
|
1331
|
+
f"fpc_per_stratum ({fpc_per_stratum}) must be >= psu_per_stratum "
|
|
1332
|
+
f"({psu_per_stratum})"
|
|
1333
|
+
)
|
|
1334
|
+
|
|
1335
|
+
if cohort_periods is None:
|
|
1336
|
+
# Derive defaults from n_periods. Cohorts need g >= 2 (at least one
|
|
1337
|
+
# pre-period for estimability with CallawaySantAnna).
|
|
1338
|
+
if n_periods >= 8:
|
|
1339
|
+
cohort_periods = [3, 5]
|
|
1340
|
+
elif n_periods >= 4:
|
|
1341
|
+
cohort_periods = [max(2, n_periods // 3), max(3, 2 * n_periods // 3)]
|
|
1342
|
+
else:
|
|
1343
|
+
raise ValueError(
|
|
1344
|
+
f"n_periods={n_periods} is too small for default cohort_periods "
|
|
1345
|
+
f"(need n_periods >= 4 for at least one cohort with a pre-period). "
|
|
1346
|
+
f"Pass cohort_periods explicitly for small panels."
|
|
1347
|
+
)
|
|
1348
|
+
# Coerce array-like to list (handles np.array inputs)
|
|
1349
|
+
cohort_periods = list(cohort_periods)
|
|
1350
|
+
if not cohort_periods:
|
|
1351
|
+
raise ValueError("cohort_periods must be a non-empty list of integers")
|
|
1352
|
+
for cp in cohort_periods:
|
|
1353
|
+
if isinstance(cp, bool) or not isinstance(cp, (int, np.integer)):
|
|
1354
|
+
raise ValueError(
|
|
1355
|
+
f"cohort_periods must contain integers, got {cp!r}"
|
|
1356
|
+
)
|
|
1357
|
+
if cp < 2 or cp > n_periods:
|
|
1358
|
+
raise ValueError(
|
|
1359
|
+
f"Cohort period {cp} must be between 2 and {n_periods} "
|
|
1360
|
+
f"(g >= 2 ensures at least one pre-treatment period)"
|
|
1361
|
+
)
|
|
1362
|
+
|
|
1363
|
+
if not np.isfinite(psu_period_factor) or psu_period_factor < 0:
|
|
1364
|
+
raise ValueError(
|
|
1365
|
+
f"psu_period_factor must be finite and non-negative, "
|
|
1366
|
+
f"got {psu_period_factor}"
|
|
1367
|
+
)
|
|
1368
|
+
|
|
1369
|
+
valid_wv = ("none", "moderate", "high")
|
|
1370
|
+
if weight_variation not in valid_wv:
|
|
1371
|
+
raise ValueError(
|
|
1372
|
+
f"weight_variation must be one of {valid_wv}, got {weight_variation!r}"
|
|
1373
|
+
)
|
|
1374
|
+
|
|
1375
|
+
# --- Validate research-grade DGP parameters ---
|
|
1376
|
+
if icc is not None:
|
|
1377
|
+
if not (0 < icc < 1):
|
|
1378
|
+
raise ValueError(f"icc must be between 0 and 1 (exclusive), got {icc}")
|
|
1379
|
+
if psu_re_sd != 2.0:
|
|
1380
|
+
raise ValueError(
|
|
1381
|
+
"Cannot specify both icc and a non-default psu_re_sd. "
|
|
1382
|
+
"icc overrides psu_re_sd via the ICC formula."
|
|
1383
|
+
)
|
|
1384
|
+
|
|
1385
|
+
if weight_cv is not None:
|
|
1386
|
+
if not np.isfinite(weight_cv) or weight_cv <= 0:
|
|
1387
|
+
raise ValueError(
|
|
1388
|
+
f"weight_cv must be finite and positive, got {weight_cv}"
|
|
1389
|
+
)
|
|
1390
|
+
if weight_variation != "moderate":
|
|
1391
|
+
raise ValueError(
|
|
1392
|
+
"Cannot specify both weight_cv and a non-default "
|
|
1393
|
+
"weight_variation. weight_cv overrides weight_variation."
|
|
1394
|
+
)
|
|
1395
|
+
|
|
1396
|
+
if strata_sizes is not None:
|
|
1397
|
+
strata_sizes = list(strata_sizes)
|
|
1398
|
+
for ss in strata_sizes:
|
|
1399
|
+
if isinstance(ss, bool) or not isinstance(ss, (int, np.integer)):
|
|
1400
|
+
raise ValueError(
|
|
1401
|
+
f"strata_sizes must contain integers, got {ss!r}"
|
|
1402
|
+
)
|
|
1403
|
+
if len(strata_sizes) != n_strata:
|
|
1404
|
+
raise ValueError(
|
|
1405
|
+
f"strata_sizes must have length n_strata={n_strata}, "
|
|
1406
|
+
f"got {len(strata_sizes)}"
|
|
1407
|
+
)
|
|
1408
|
+
if any(s < 1 for s in strata_sizes):
|
|
1409
|
+
raise ValueError("All strata_sizes must be >= 1")
|
|
1410
|
+
if sum(strata_sizes) != n_units:
|
|
1411
|
+
raise ValueError(
|
|
1412
|
+
f"strata_sizes must sum to n_units={n_units}, "
|
|
1413
|
+
f"got {sum(strata_sizes)}"
|
|
1414
|
+
)
|
|
1415
|
+
|
|
1416
|
+
# --- Validate and resolve covariate coefficients ---
|
|
1417
|
+
if covariate_effects is not None:
|
|
1418
|
+
covariate_effects = tuple(covariate_effects)
|
|
1419
|
+
if len(covariate_effects) != 2:
|
|
1420
|
+
raise ValueError(
|
|
1421
|
+
f"covariate_effects must have length 2, got {len(covariate_effects)}"
|
|
1422
|
+
)
|
|
1423
|
+
if not all(np.isfinite(c) for c in covariate_effects):
|
|
1424
|
+
raise ValueError(
|
|
1425
|
+
f"covariate_effects must be finite, got {covariate_effects}"
|
|
1426
|
+
)
|
|
1427
|
+
_beta1, _beta2 = covariate_effects if covariate_effects is not None else (0.5, 0.3)
|
|
1428
|
+
|
|
1429
|
+
if not np.isfinite(te_covariate_interaction):
|
|
1430
|
+
raise ValueError(
|
|
1431
|
+
f"te_covariate_interaction must be finite, got {te_covariate_interaction}"
|
|
1432
|
+
)
|
|
1433
|
+
if te_covariate_interaction != 0.0 and not add_covariates:
|
|
1434
|
+
raise ValueError(
|
|
1435
|
+
"te_covariate_interaction requires add_covariates=True"
|
|
1436
|
+
)
|
|
1437
|
+
|
|
1438
|
+
# --- ICC -> psu_re_sd resolution ---
|
|
1439
|
+
if icc is not None:
|
|
1440
|
+
# Covariate variance: Var(beta1*x1) + Var(beta2*x2)
|
|
1441
|
+
# where x1 ~ N(0,1), x2 ~ Bernoulli(0.5)
|
|
1442
|
+
cov_var = (_beta1**2 * 1.0 + _beta2**2 * 0.25) if add_covariates else 0.0
|
|
1443
|
+
non_psu_var = unit_fe_sd**2 + noise_sd**2 + cov_var
|
|
1444
|
+
if non_psu_var < 1e-12:
|
|
1445
|
+
raise ValueError(
|
|
1446
|
+
"icc requires non-zero non-PSU variance "
|
|
1447
|
+
"(unit_fe_sd, noise_sd, or add_covariates must contribute variance)"
|
|
1448
|
+
)
|
|
1449
|
+
psu_re_sd = np.sqrt(
|
|
1450
|
+
icc * non_psu_var
|
|
1451
|
+
/ ((1 - icc) * (1 + psu_period_factor**2))
|
|
1452
|
+
)
|
|
1453
|
+
|
|
1454
|
+
# --- Survey structure: assign units to strata and PSUs ---
|
|
1455
|
+
n_psu_total = n_strata * psu_per_stratum
|
|
1456
|
+
|
|
1457
|
+
if strata_sizes is not None:
|
|
1458
|
+
stratum_n = strata_sizes
|
|
1459
|
+
else:
|
|
1460
|
+
units_per_stratum = n_units // n_strata
|
|
1461
|
+
remainder = n_units % n_strata
|
|
1462
|
+
stratum_n = [
|
|
1463
|
+
units_per_stratum + (1 if s < remainder else 0)
|
|
1464
|
+
for s in range(n_strata)
|
|
1465
|
+
]
|
|
1466
|
+
|
|
1467
|
+
unit_stratum = np.empty(n_units, dtype=int)
|
|
1468
|
+
unit_psu = np.empty(n_units, dtype=int)
|
|
1469
|
+
idx = 0
|
|
1470
|
+
for s in range(n_strata):
|
|
1471
|
+
n_s = stratum_n[s]
|
|
1472
|
+
unit_stratum[idx : idx + n_s] = s
|
|
1473
|
+
psu_start = s * psu_per_stratum
|
|
1474
|
+
for j in range(n_s):
|
|
1475
|
+
unit_psu[idx + j] = psu_start + (j % psu_per_stratum)
|
|
1476
|
+
idx += n_s
|
|
1477
|
+
|
|
1478
|
+
# Sampling weights
|
|
1479
|
+
if weight_cv is not None:
|
|
1480
|
+
sigma_ln = np.sqrt(np.log(1 + weight_cv**2))
|
|
1481
|
+
raw_w = rng.lognormal(-sigma_ln**2 / 2, sigma_ln, size=n_units)
|
|
1482
|
+
unit_weight = raw_w / raw_w.mean()
|
|
1483
|
+
else:
|
|
1484
|
+
# Stratum-based weights (inverse selection probability)
|
|
1485
|
+
scale_map = {"none": 0.0, "moderate": 1.0, "high": 3.0}
|
|
1486
|
+
scale = scale_map.get(weight_variation, 1.0)
|
|
1487
|
+
denom = max(n_strata - 1, 1)
|
|
1488
|
+
unit_weight = 1.0 + scale * (unit_stratum / denom)
|
|
1489
|
+
|
|
1490
|
+
# --- Treatment assignment (cohort structure) ---
|
|
1491
|
+
n_never = int(n_units * never_treated_frac)
|
|
1492
|
+
n_treated_total = n_units - n_never
|
|
1493
|
+
n_per_cohort = n_treated_total // len(cohort_periods)
|
|
1494
|
+
|
|
1495
|
+
unit_cohort = np.zeros(n_units, dtype=int)
|
|
1496
|
+
ci = n_never
|
|
1497
|
+
for i, g in enumerate(cohort_periods):
|
|
1498
|
+
n_g = (
|
|
1499
|
+
n_per_cohort
|
|
1500
|
+
if i < len(cohort_periods) - 1
|
|
1501
|
+
else n_treated_total - ci + n_never
|
|
1502
|
+
)
|
|
1503
|
+
unit_cohort[ci : ci + n_g] = g
|
|
1504
|
+
ci += n_g
|
|
1505
|
+
|
|
1506
|
+
# --- JK1 early guard (configured count; populated count checked after build) ---
|
|
1507
|
+
if include_replicate_weights and n_psu_total < 2:
|
|
1508
|
+
raise ValueError(
|
|
1509
|
+
"JK1 replicate weights require at least 2 PSUs, "
|
|
1510
|
+
f"got {n_psu_total}."
|
|
1511
|
+
)
|
|
1512
|
+
|
|
1513
|
+
# --- Random effects ---
|
|
1514
|
+
psu_re = rng.normal(0, psu_re_sd, size=n_psu_total)
|
|
1515
|
+
# PSU-period shocks: intra-cluster correlation that survives first-
|
|
1516
|
+
# differencing in DiD. Without these, the time-invariant PSU RE
|
|
1517
|
+
# cancels in the treatment-vs-control time-difference and the
|
|
1518
|
+
# cluster-robust / survey SE would be *smaller* than naive OLS SE.
|
|
1519
|
+
# Controlled by psu_period_factor (default 0.5); higher values
|
|
1520
|
+
# increase time-varying clustering and inflate design-based SEs.
|
|
1521
|
+
psu_period_re = rng.normal(
|
|
1522
|
+
0, psu_re_sd * psu_period_factor, size=(n_psu_total, n_periods)
|
|
1523
|
+
)
|
|
1524
|
+
|
|
1525
|
+
# --- Informative sampling (panel path): pre-draw FEs, rank-pair weights ---
|
|
1526
|
+
if informative_sampling and panel:
|
|
1527
|
+
_panel_unit_fe = rng.normal(0, unit_fe_sd, size=n_units)
|
|
1528
|
+
y0_period1 = (
|
|
1529
|
+
_panel_unit_fe
|
|
1530
|
+
+ psu_re[unit_psu]
|
|
1531
|
+
+ psu_period_re[unit_psu, 0]
|
|
1532
|
+
+ 0.5
|
|
1533
|
+
)
|
|
1534
|
+
if add_covariates:
|
|
1535
|
+
_panel_x1 = rng.normal(0, 1, size=n_units)
|
|
1536
|
+
_panel_x2 = rng.choice([0, 1], size=n_units)
|
|
1537
|
+
y0_period1 = y0_period1 + _beta1 * _panel_x1 + _beta2 * _panel_x2
|
|
1538
|
+
_rank_pair_weights(unit_weight, unit_stratum, y0_period1, n_strata)
|
|
1539
|
+
|
|
1540
|
+
# Save base weights for cross-section informative sampling (reset each period)
|
|
1541
|
+
if informative_sampling and not panel:
|
|
1542
|
+
_base_weight = unit_weight.copy()
|
|
1543
|
+
|
|
1544
|
+
# --- Heterogeneous treatment effects by stratum ---
|
|
1545
|
+
if heterogeneous_te_by_strata:
|
|
1546
|
+
if n_strata == 1:
|
|
1547
|
+
te_by_stratum = np.array([treatment_effect])
|
|
1548
|
+
else:
|
|
1549
|
+
strata_idx = np.arange(n_strata, dtype=float)
|
|
1550
|
+
te_by_stratum = treatment_effect * (
|
|
1551
|
+
1 + 0.5 * (strata_idx - strata_idx.mean()) / strata_idx.std()
|
|
1552
|
+
)
|
|
1553
|
+
else:
|
|
1554
|
+
te_by_stratum = None
|
|
1555
|
+
|
|
1556
|
+
# --- Generate panel or repeated cross-sections ---
|
|
1557
|
+
records = []
|
|
1558
|
+
for t in range(1, n_periods + 1):
|
|
1559
|
+
# For repeated cross-sections, draw fresh respondent effects each period
|
|
1560
|
+
unit_fe = rng.normal(0, unit_fe_sd, size=n_units)
|
|
1561
|
+
if panel and t > 1:
|
|
1562
|
+
pass # reuse unit_fe from first period (set below)
|
|
1563
|
+
if informative_sampling and panel:
|
|
1564
|
+
unit_fe = _panel_unit_fe # use pre-drawn FEs
|
|
1565
|
+
elif panel and t == 1:
|
|
1566
|
+
_panel_unit_fe = unit_fe # save for reuse
|
|
1567
|
+
elif panel and t > 1:
|
|
1568
|
+
unit_fe = _panel_unit_fe # type: ignore[possibly-undefined]
|
|
1569
|
+
|
|
1570
|
+
# Cross-section informative sampling: re-rank weights each period
|
|
1571
|
+
if informative_sampling and not panel:
|
|
1572
|
+
# Draw covariates early so they can be included in Y(0) ranking
|
|
1573
|
+
if add_covariates:
|
|
1574
|
+
x1 = rng.normal(0, 1, size=n_units)
|
|
1575
|
+
x2 = rng.choice([0, 1], size=n_units)
|
|
1576
|
+
unit_weight = _base_weight.copy() # type: ignore[possibly-undefined]
|
|
1577
|
+
y0_t = (
|
|
1578
|
+
unit_fe
|
|
1579
|
+
+ psu_re[unit_psu]
|
|
1580
|
+
+ psu_period_re[unit_psu, t - 1]
|
|
1581
|
+
+ 0.5 * t
|
|
1582
|
+
)
|
|
1583
|
+
if add_covariates:
|
|
1584
|
+
y0_t = y0_t + _beta1 * x1 + _beta2 * x2
|
|
1585
|
+
_rank_pair_weights(unit_weight, unit_stratum, y0_t, n_strata)
|
|
1586
|
+
|
|
1587
|
+
# Covariates — may already be drawn by informative sampling above
|
|
1588
|
+
if informative_sampling and panel and add_covariates:
|
|
1589
|
+
x1 = _panel_x1 # pre-drawn before loop for ranking
|
|
1590
|
+
x2 = _panel_x2
|
|
1591
|
+
elif informative_sampling and not panel and add_covariates:
|
|
1592
|
+
pass # x1, x2 already drawn in cross-section ranking block
|
|
1593
|
+
elif add_covariates:
|
|
1594
|
+
x1 = rng.normal(0, 1, size=n_units)
|
|
1595
|
+
x2 = rng.choice([0, 1], size=n_units)
|
|
1596
|
+
else:
|
|
1597
|
+
x1 = None
|
|
1598
|
+
x2 = None
|
|
1599
|
+
if not informative_sampling and panel and t > 1 and add_covariates:
|
|
1600
|
+
x1 = _panel_x1 # type: ignore[possibly-undefined]
|
|
1601
|
+
x2 = _panel_x2 # type: ignore[possibly-undefined]
|
|
1602
|
+
elif not informative_sampling and panel and t == 1 and add_covariates:
|
|
1603
|
+
_panel_x1 = x1
|
|
1604
|
+
_panel_x2 = x2
|
|
1605
|
+
|
|
1606
|
+
for i in range(n_units):
|
|
1607
|
+
g_i = unit_cohort[i]
|
|
1608
|
+
# Outcome: unit FE + PSU RE + PSU-period shock + time trend
|
|
1609
|
+
y = unit_fe[i] + psu_re[unit_psu[i]] + psu_period_re[unit_psu[i], t - 1] + 0.5 * t
|
|
1610
|
+
|
|
1611
|
+
if add_covariates:
|
|
1612
|
+
y += _beta1 * x1[i] + _beta2 * x2[i]
|
|
1613
|
+
|
|
1614
|
+
treated = int(g_i > 0 and t >= g_i)
|
|
1615
|
+
true_eff = 0.0
|
|
1616
|
+
if treated:
|
|
1617
|
+
if te_by_stratum is not None:
|
|
1618
|
+
true_eff = float(te_by_stratum[unit_stratum[i]])
|
|
1619
|
+
else:
|
|
1620
|
+
true_eff = treatment_effect
|
|
1621
|
+
if te_covariate_interaction != 0.0:
|
|
1622
|
+
true_eff += te_covariate_interaction * x1[i]
|
|
1623
|
+
if dynamic_effects:
|
|
1624
|
+
true_eff *= 1 + effect_growth * (t - g_i)
|
|
1625
|
+
y += true_eff
|
|
1626
|
+
|
|
1627
|
+
y += rng.normal(0, noise_sd)
|
|
1628
|
+
|
|
1629
|
+
# In cross-section mode, each period gets unique unit IDs
|
|
1630
|
+
uid = i if panel else (t - 1) * n_units + i
|
|
1631
|
+
|
|
1632
|
+
row = {
|
|
1633
|
+
"unit": uid,
|
|
1634
|
+
"period": t,
|
|
1635
|
+
"outcome": y,
|
|
1636
|
+
"first_treat": g_i,
|
|
1637
|
+
"treated": treated,
|
|
1638
|
+
"true_effect": true_eff,
|
|
1639
|
+
"stratum": int(unit_stratum[i]),
|
|
1640
|
+
"psu": int(unit_psu[i]),
|
|
1641
|
+
"fpc": fpc_per_stratum,
|
|
1642
|
+
"weight": float(unit_weight[i]),
|
|
1643
|
+
}
|
|
1644
|
+
if add_covariates:
|
|
1645
|
+
row["x1"] = x1[i]
|
|
1646
|
+
row["x2"] = x2[i]
|
|
1647
|
+
records.append(row)
|
|
1648
|
+
|
|
1649
|
+
df = pd.DataFrame(records)
|
|
1650
|
+
|
|
1651
|
+
# --- Replicate weights (JK1 delete-one-PSU) ---
|
|
1652
|
+
if include_replicate_weights:
|
|
1653
|
+
psu_ids = sorted(df["psu"].unique())
|
|
1654
|
+
n_rep = len(psu_ids)
|
|
1655
|
+
if n_rep < 2:
|
|
1656
|
+
raise ValueError(
|
|
1657
|
+
"JK1 replicate weights require at least 2 populated PSUs, "
|
|
1658
|
+
f"got {n_rep}. Increase n_units or decrease psu_per_stratum."
|
|
1659
|
+
)
|
|
1660
|
+
base_w = df["weight"].values
|
|
1661
|
+
for r, psu_id in enumerate(psu_ids):
|
|
1662
|
+
w_r = base_w.copy()
|
|
1663
|
+
mask = df["psu"].values == psu_id
|
|
1664
|
+
w_r[mask] = 0.0
|
|
1665
|
+
# Rescale remaining: k/(k-1) for JK1
|
|
1666
|
+
w_r[w_r > 0] *= n_rep / (n_rep - 1)
|
|
1667
|
+
df[f"rep_{r}"] = w_r
|
|
1668
|
+
|
|
1669
|
+
# --- DGP truth diagnostics ---
|
|
1670
|
+
if return_true_population_att:
|
|
1671
|
+
treated_mask = df["treated"] == 1
|
|
1672
|
+
if treated_mask.any():
|
|
1673
|
+
w_treated = df.loc[treated_mask, "weight"].values
|
|
1674
|
+
te_treated = df.loc[treated_mask, "true_effect"].values
|
|
1675
|
+
population_att = float(np.average(te_treated, weights=w_treated))
|
|
1676
|
+
else:
|
|
1677
|
+
population_att = float("nan")
|
|
1678
|
+
|
|
1679
|
+
if te_by_stratum is not None:
|
|
1680
|
+
stratum_effects = {
|
|
1681
|
+
int(s): float(te_by_stratum[s]) for s in range(n_strata)
|
|
1682
|
+
}
|
|
1683
|
+
else:
|
|
1684
|
+
stratum_effects = {
|
|
1685
|
+
int(s): float(treatment_effect) for s in range(n_strata)
|
|
1686
|
+
}
|
|
1687
|
+
|
|
1688
|
+
# Kish DEFF from weight variation
|
|
1689
|
+
w_all = df.groupby("unit")["weight"].first().values
|
|
1690
|
+
cv_w = float(w_all.std() / w_all.mean()) if w_all.mean() > 0 else 0.0
|
|
1691
|
+
deff_kish = 1 + cv_w**2
|
|
1692
|
+
|
|
1693
|
+
# Realized ICC (ANOVA-based, period-1 only to avoid TE contamination)
|
|
1694
|
+
_p1 = df[df["period"] == 1]
|
|
1695
|
+
_groups = _p1.groupby("psu")["outcome"]
|
|
1696
|
+
_n_total = len(_p1)
|
|
1697
|
+
_n_groups = _groups.ngroups
|
|
1698
|
+
# ICC undefined with < 2 groups or no within-group replication
|
|
1699
|
+
if _n_groups < 2 or _n_total <= _n_groups:
|
|
1700
|
+
icc_realized = float("nan")
|
|
1701
|
+
else:
|
|
1702
|
+
_n_bar = _n_total / _n_groups
|
|
1703
|
+
_grand_mean = _p1["outcome"].mean()
|
|
1704
|
+
_ssb = (_groups.size() * (_groups.mean() - _grand_mean) ** 2).sum()
|
|
1705
|
+
_msb = _ssb / (_n_groups - 1)
|
|
1706
|
+
_ssw = _groups.apply(lambda x: ((x - x.mean()) ** 2).sum()).sum()
|
|
1707
|
+
_msw = _ssw / (_n_total - _n_groups)
|
|
1708
|
+
_denom = _msb + (_n_bar - 1) * _msw
|
|
1709
|
+
icc_realized = float((_msb - _msw) / _denom) if _denom > 0 else float("nan")
|
|
1710
|
+
|
|
1711
|
+
df.attrs["dgp_truth"] = {
|
|
1712
|
+
"population_att": population_att,
|
|
1713
|
+
"deff_kish": float(deff_kish),
|
|
1714
|
+
"base_stratum_effects": stratum_effects,
|
|
1715
|
+
"icc_realized": icc_realized,
|
|
1716
|
+
}
|
|
1717
|
+
|
|
1718
|
+
return df
|