diff-diff 3.0.1__cp314-cp314-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62) hide show
  1. diff_diff/__init__.py +382 -0
  2. diff_diff/_backend.py +134 -0
  3. diff_diff/_rust_backend.cp314-win_amd64.pyd +0 -0
  4. diff_diff/bacon.py +1140 -0
  5. diff_diff/bootstrap_utils.py +730 -0
  6. diff_diff/continuous_did.py +1626 -0
  7. diff_diff/continuous_did_bspline.py +190 -0
  8. diff_diff/continuous_did_results.py +374 -0
  9. diff_diff/datasets.py +815 -0
  10. diff_diff/diagnostics.py +882 -0
  11. diff_diff/efficient_did.py +1770 -0
  12. diff_diff/efficient_did_bootstrap.py +359 -0
  13. diff_diff/efficient_did_covariates.py +899 -0
  14. diff_diff/efficient_did_results.py +368 -0
  15. diff_diff/efficient_did_weights.py +617 -0
  16. diff_diff/estimators.py +1501 -0
  17. diff_diff/honest_did.py +2585 -0
  18. diff_diff/imputation.py +2458 -0
  19. diff_diff/imputation_bootstrap.py +418 -0
  20. diff_diff/imputation_results.py +448 -0
  21. diff_diff/linalg.py +2538 -0
  22. diff_diff/power.py +2588 -0
  23. diff_diff/practitioner.py +869 -0
  24. diff_diff/prep.py +1738 -0
  25. diff_diff/prep_dgp.py +1718 -0
  26. diff_diff/pretrends.py +1105 -0
  27. diff_diff/results.py +918 -0
  28. diff_diff/stacked_did.py +1049 -0
  29. diff_diff/stacked_did_results.py +339 -0
  30. diff_diff/staggered.py +3895 -0
  31. diff_diff/staggered_aggregation.py +864 -0
  32. diff_diff/staggered_bootstrap.py +752 -0
  33. diff_diff/staggered_results.py +416 -0
  34. diff_diff/staggered_triple_diff.py +1545 -0
  35. diff_diff/staggered_triple_diff_results.py +416 -0
  36. diff_diff/sun_abraham.py +1685 -0
  37. diff_diff/survey.py +1981 -0
  38. diff_diff/synthetic_did.py +1136 -0
  39. diff_diff/triple_diff.py +2047 -0
  40. diff_diff/trop.py +952 -0
  41. diff_diff/trop_global.py +1270 -0
  42. diff_diff/trop_local.py +1307 -0
  43. diff_diff/trop_results.py +356 -0
  44. diff_diff/twfe.py +542 -0
  45. diff_diff/two_stage.py +1952 -0
  46. diff_diff/two_stage_bootstrap.py +520 -0
  47. diff_diff/two_stage_results.py +400 -0
  48. diff_diff/utils.py +1902 -0
  49. diff_diff/visualization/__init__.py +61 -0
  50. diff_diff/visualization/_common.py +328 -0
  51. diff_diff/visualization/_continuous.py +274 -0
  52. diff_diff/visualization/_diagnostic.py +817 -0
  53. diff_diff/visualization/_event_study.py +1086 -0
  54. diff_diff/visualization/_power.py +661 -0
  55. diff_diff/visualization/_staggered.py +833 -0
  56. diff_diff/visualization/_synthetic.py +197 -0
  57. diff_diff/wooldridge.py +1285 -0
  58. diff_diff/wooldridge_results.py +349 -0
  59. diff_diff-3.0.1.dist-info/METADATA +2997 -0
  60. diff_diff-3.0.1.dist-info/RECORD +62 -0
  61. diff_diff-3.0.1.dist-info/WHEEL +4 -0
  62. diff_diff-3.0.1.dist-info/sboms/diff_diff_rust.cyclonedx.json +5843 -0
diff_diff/prep_dgp.py ADDED
@@ -0,0 +1,1718 @@
1
+ """
2
+ Data generation utilities for difference-in-differences analysis.
3
+
4
+ This module provides functions to generate synthetic datasets for testing
5
+ and validating DiD estimators, including basic 2x2 DiD, staggered adoption,
6
+ factor model data, triple difference, and event study designs.
7
+ """
8
+
9
+ from typing import Dict, List, Optional
10
+
11
+ import numpy as np
12
+ import pandas as pd
13
+
14
+
15
def generate_did_data(
    n_units: int = 100,
    n_periods: int = 4,
    treatment_effect: float = 5.0,
    treatment_fraction: float = 0.5,
    treatment_period: int = 2,
    unit_fe_sd: float = 2.0,
    time_trend: float = 0.5,
    noise_sd: float = 1.0,
    seed: Optional[int] = None,
) -> pd.DataFrame:
    """
    Generate synthetic data for DiD analysis with known treatment effect.

    Creates a balanced panel dataset with unit fixed effects, a linear time
    trend, and a known treatment effect. Each outcome is built as
    ``10 + unit_fe + time_trend * period + effect + noise``.

    Parameters
    ----------
    n_units : int, default=100
        Number of units in the panel.
    n_periods : int, default=4
        Number of time periods.
    treatment_effect : float, default=5.0
        True average treatment effect on the treated.
    treatment_fraction : float, default=0.5
        Fraction of units that receive treatment. The first
        ``floor(n_units * treatment_fraction)`` unit IDs are treated; the
        product is rounded to 9 decimals before flooring so that binary
        floating-point error (e.g. ``100 * 0.29``) does not drop a unit.
    treatment_period : int, default=2
        First post-treatment period (0-indexed). Periods >= this are post.
    unit_fe_sd : float, default=2.0
        Standard deviation of unit fixed effects.
    time_trend : float, default=0.5
        Linear time trend coefficient.
    noise_sd : float, default=1.0
        Standard deviation of idiosyncratic noise.
    seed : int, optional
        Random seed for reproducibility.

    Returns
    -------
    pd.DataFrame
        Synthetic panel data with columns:
        - unit: Unit identifier
        - period: Time period
        - treated: Treatment indicator (0/1)
        - post: Post-treatment indicator (0/1)
        - outcome: Outcome variable
        - true_effect: The true treatment effect (for validation)

    Examples
    --------
    Generate simple data for testing:

    >>> data = generate_did_data(n_units=50, n_periods=4, treatment_effect=3.0, seed=42)
    >>> len(data)
    200
    >>> data.columns.tolist()
    ['unit', 'period', 'treated', 'post', 'outcome', 'true_effect']

    Verify treatment effect recovery:

    >>> from diff_diff import DifferenceInDifferences
    >>> did = DifferenceInDifferences()
    >>> results = did.fit(data, outcome='outcome', treatment='treated', time='post')
    >>> abs(results.att - 3.0) < 1.0  # Close to true effect
    True
    """
    rng = np.random.default_rng(seed)

    # Floor of n_units * treatment_fraction. Rounding first removes binary
    # floating-point error: 100 * 0.29 evaluates to 28.999999999999996, which
    # a bare int() would truncate to 28 instead of the intended 29.
    n_treated = int(round(n_units * treatment_fraction, 9))

    # Unit fixed effects are drawn in one call before any noise draw.
    unit_fe = rng.normal(0, unit_fe_sd, n_units)

    records = []
    for unit in range(n_units):
        # The first n_treated unit IDs form the treated group.
        is_treated = unit < n_treated

        for period in range(n_periods):
            is_post = period >= treatment_period

            # The effect applies only to treated units in post periods.
            effect = treatment_effect if (is_treated and is_post) else 0.0

            y = 10.0  # Baseline
            y += unit_fe[unit]  # Unit fixed effect
            y += time_trend * period  # Time trend
            y += effect
            y += rng.normal(0, noise_sd)  # Idiosyncratic noise, one draw per row

            records.append(
                {
                    "unit": unit,
                    "period": period,
                    "treated": int(is_treated),
                    "post": int(is_post),
                    "outcome": y,
                    "true_effect": effect,
                }
            )

    return pd.DataFrame(records)
125
+
126
+
127
def _staggered_effect(treatment_effect, dynamic_effects, effect_growth, time_since_treatment):
    """Return the treatment effect for an observation treated `time_since_treatment` periods ago."""
    if dynamic_effects:
        return treatment_effect * (1 + effect_growth * time_since_treatment)
    return treatment_effect


def generate_staggered_data(
    n_units: int = 100,
    n_periods: int = 10,
    cohort_periods: Optional[List[int]] = None,
    never_treated_frac: float = 0.3,
    treatment_effect: float = 2.0,
    dynamic_effects: bool = True,
    effect_growth: float = 0.1,
    unit_fe_sd: float = 2.0,
    time_trend: float = 0.1,
    noise_sd: float = 0.5,
    seed: Optional[int] = None,
    panel: bool = True,
) -> pd.DataFrame:
    """
    Generate synthetic data for staggered adoption DiD analysis.

    Creates data where different units receive treatment at different
    times (staggered rollout). Useful for testing CallawaySantAnna,
    SunAbraham, and other staggered DiD estimators.

    Parameters
    ----------
    n_units : int, default=100
        Total number of units in the panel (per period when ``panel=False``).
    n_periods : int, default=10
        Number of time periods.
    cohort_periods : list of int, optional
        Periods when treatment cohorts are first treated. Each must lie in
        ``[1, n_periods - 1]``. If None, defaults to ``[3, 5, 7]`` when
        ``n_periods >= 8``; otherwise to the distinct values of
        ``n_periods // 3`` and ``2 * n_periods // 3``, each raised to at
        least 1 so that a 2-period panel gets the single cohort ``[1]``.
    never_treated_frac : float, default=0.3
        Fraction of units that are never treated (cohort 0). The first
        ``floor(n_units * never_treated_frac)`` unit positions are never
        treated; the product is rounded to 9 decimals before flooring to
        absorb binary floating-point error.
    treatment_effect : float, default=2.0
        Base treatment effect at time of treatment.
    dynamic_effects : bool, default=True
        If True, treatment effects grow over time since treatment.
    effect_growth : float, default=0.1
        Per-period growth in treatment effect (if dynamic_effects=True).
        Effect at time t since treatment: effect * (1 + effect_growth * t).
    unit_fe_sd : float, default=2.0
        Standard deviation of unit fixed effects.
    time_trend : float, default=0.1
        Linear time trend coefficient.
    noise_sd : float, default=0.5
        Standard deviation of idiosyncratic noise.
    seed : int, optional
        Random seed for reproducibility.
    panel : bool, default=True
        If True (default), generate balanced panel data (same units across
        all periods). If False, generate repeated cross-section data where
        each period draws independent observations with globally unique
        string IDs of the form ``"u{period}_{i}"``.

    Returns
    -------
    pd.DataFrame
        Synthetic staggered adoption data with columns:
        - unit: Unit identifier
        - period: Time period
        - outcome: Outcome variable
        - first_treat: First treatment period (0 = never treated)
        - treated: Binary indicator (1 if treated at this observation)
        - treat: Binary unit-level ever-treated indicator
        - true_effect: The true treatment effect for this observation

    Raises
    ------
    ValueError
        If any cohort period is outside ``[1, n_periods - 1]``, or if
        ``cohort_periods`` is empty while some units are to be treated.

    Examples
    --------
    Generate staggered adoption data:

    >>> data = generate_staggered_data(n_units=100, n_periods=10, seed=42)
    >>> data['period'].nunique()
    10

    Use with Callaway-Sant'Anna estimator:

    >>> from diff_diff import CallawaySantAnna
    >>> cs = CallawaySantAnna()
    >>> results = cs.fit(data, outcome='outcome', unit='unit',
    ...                  time='period', first_treat='first_treat')
    >>> results.overall_att > 0
    True
    """
    rng = np.random.default_rng(seed)

    # Default cohort periods if not specified
    if cohort_periods is None:
        if n_periods >= 8:
            cohort_periods = [3, 5, 7]
        else:
            # Clamp to >= 1: for n_periods == 2 the raw thirds are [0, 1] and
            # period 0 would be rejected by the validation below.
            early = max(1, n_periods // 3)
            late = max(1, 2 * n_periods // 3)
            cohort_periods = [early] if early == late else [early, late]

    # Validate cohort periods
    for cp in cohort_periods:
        if cp < 1 or cp >= n_periods:
            raise ValueError(f"Cohort period {cp} must be between 1 and {n_periods - 1}")

    # Floor of n_units * never_treated_frac, rounded first so that e.g.
    # 100 * 0.29 (= 28.999999999999996 in binary floating point) yields 29.
    n_never = int(round(n_units * never_treated_frac, 9))
    n_treated = n_units - n_never

    if n_treated > 0 and len(cohort_periods) == 0:
        raise ValueError(
            "cohort_periods must contain at least one period when some units are treated"
        )

    if not panel:
        # --- Repeated cross-section mode ---
        # Each period draws n_units independent observations with unique IDs.
        records = []
        for period in range(n_periods):
            # Fresh cohort assignments for this period's sample
            ft_period = np.zeros(n_units, dtype=int)
            if n_treated > 0:
                cohort_assignments = rng.choice(len(cohort_periods), size=n_treated)
                ft_period[n_never:] = [cohort_periods[c] for c in cohort_assignments]

            for i in range(n_units):
                uid = f"u{period}_{i}"
                unit_first_treat = ft_period[i]
                is_ever_treated = unit_first_treat > 0
                is_treated = is_ever_treated and period >= unit_first_treat

                # No unit persists across periods, so the "fixed effect" is
                # drawn fresh for every observation.
                unit_fe_proxy = rng.normal(0, unit_fe_sd)
                y = 10.0 + unit_fe_proxy + time_trend * period

                effect = 0.0
                if is_treated:
                    effect = _staggered_effect(
                        treatment_effect,
                        dynamic_effects,
                        effect_growth,
                        period - unit_first_treat,
                    )
                    y += effect

                y += rng.normal(0, noise_sd)

                records.append(
                    {
                        "unit": uid,
                        "period": period,
                        "outcome": y,
                        "first_treat": unit_first_treat,
                        "treated": int(is_treated),
                        "treat": int(is_ever_treated),
                        "true_effect": effect,
                    }
                )

        return pd.DataFrame(records)

    # --- Panel mode (default) ---
    # Assign treatment cohorts: the first n_never units stay at 0 (never treated)
    first_treat = np.zeros(n_units, dtype=int)
    if n_treated > 0:
        cohort_assignments = rng.choice(len(cohort_periods), size=n_treated)
        first_treat[n_never:] = [cohort_periods[c] for c in cohort_assignments]

    # Generate unit fixed effects
    unit_fe = rng.normal(0, unit_fe_sd, n_units)

    records = []
    for unit in range(n_units):
        unit_first_treat = first_treat[unit]
        is_ever_treated = unit_first_treat > 0

        for period in range(n_periods):
            is_treated = is_ever_treated and period >= unit_first_treat

            # Base outcome: unit FE + time trend
            y = 10.0 + unit_fe[unit] + time_trend * period

            effect = 0.0
            if is_treated:
                effect = _staggered_effect(
                    treatment_effect,
                    dynamic_effects,
                    effect_growth,
                    period - unit_first_treat,
                )
                y += effect

            y += rng.normal(0, noise_sd)

            records.append(
                {
                    "unit": unit,
                    "period": period,
                    "outcome": y,
                    "first_treat": unit_first_treat,
                    "treated": int(is_treated),
                    "treat": int(is_ever_treated),
                    "true_effect": effect,
                }
            )

    return pd.DataFrame(records)
325
+
326
+
327
def generate_factor_data(
    n_units: int = 50,
    n_pre: int = 10,
    n_post: int = 5,
    n_treated: int = 10,
    n_factors: int = 2,
    treatment_effect: float = 2.0,
    factor_strength: float = 1.0,
    treated_loading_shift: float = 0.5,
    unit_fe_sd: float = 1.0,
    noise_sd: float = 0.5,
    seed: Optional[int] = None,
) -> pd.DataFrame:
    """
    Generate synthetic panel data with interactive fixed effects (factor model).

    The data follow the DGP:
        Y_it = mu + alpha_i + beta_t + Lambda_i'F_t + tau*D_it + eps_it

    where Lambda_i'F_t is the interactive fixed effects component. Useful for
    testing TROP (Triply Robust Panel) and comparing with SyntheticDiD.

    Parameters
    ----------
    n_units : int, default=50
        Total number of units in the panel.
    n_pre : int, default=10
        Number of pre-treatment periods.
    n_post : int, default=5
        Number of post-treatment periods.
    n_treated : int, default=10
        Number of treated units (assigned to first n_treated unit IDs).
    n_factors : int, default=2
        Number of latent factors in the interactive fixed effects.
    treatment_effect : float, default=2.0
        True average treatment effect on the treated.
    factor_strength : float, default=1.0
        Scaling factor for interactive fixed effects.
    treated_loading_shift : float, default=0.5
        Shift in factor loadings for treated units (creates confounding).
    unit_fe_sd : float, default=1.0
        Standard deviation of unit fixed effects.
    noise_sd : float, default=0.5
        Standard deviation of idiosyncratic noise.
    seed : int, optional
        Random seed for reproducibility.

    Returns
    -------
    pd.DataFrame
        Synthetic factor model data with columns:
        - unit: Unit identifier
        - period: Time period
        - outcome: Outcome variable
        - treated: Binary indicator (1 if treated at this observation)
        - treat: Binary unit-level ever-treated indicator
        - true_effect: The true treatment effect for this observation

    Raises
    ------
    ValueError
        If ``n_treated`` exceeds ``n_units`` or is smaller than 1.

    Examples
    --------
    Generate data with factor structure:

    >>> data = generate_factor_data(n_units=50, n_factors=2, seed=42)
    >>> data.shape
    (750, 6)

    Use with TROP estimator:

    >>> from diff_diff import TROP
    >>> trop = TROP(n_bootstrap=50, seed=42)
    >>> results = trop.fit(data, outcome='outcome', treatment='treated',
    ...                    unit='unit', time='period',
    ...                    post_periods=list(range(10, 15)))

    Notes
    -----
    Treated units get factor loadings shifted by `treated_loading_shift`
    and a unit fixed effect raised by 1.0, so they differ systematically
    from controls.
    """
    # Reject impossible treated counts before drawing anything.
    if n_treated > n_units:
        raise ValueError(f"n_treated ({n_treated}) cannot exceed n_units ({n_units})")
    if n_treated < 1:
        raise ValueError("n_treated must be at least 1")

    rng = np.random.default_rng(seed)
    total_periods = n_pre + n_post

    # Latent factors, one row per period: (total_periods, n_factors)
    factors = rng.normal(0, 1, (total_periods, n_factors))

    # Loadings, one column per unit: (n_factors, n_units); the treated
    # columns are shifted to induce confounding.
    loadings = rng.normal(0, 1, (n_factors, n_units))
    loadings[:, :n_treated] += treated_loading_shift

    # Unit effects, with a higher baseline for the treated units.
    unit_effects = rng.normal(0, unit_fe_sd, n_units)
    unit_effects[:n_treated] += 1.0

    # Time effects rise linearly from 0 to 2 across the panel.
    time_effects = np.linspace(0, 2, total_periods)

    rows = []
    for unit_idx in range(n_units):
        ever_treated = unit_idx < n_treated

        for t_idx in range(total_periods):
            in_post = t_idx >= n_pre
            active = ever_treated and in_post

            outcome = 10.0 + unit_effects[unit_idx] + time_effects[t_idx]
            # Interactive fixed effects: Lambda_i' F_t
            outcome += factor_strength * (loadings[:, unit_idx] @ factors[t_idx, :])

            tau = 0.0
            if active:
                tau = treatment_effect
                outcome += tau

            outcome += rng.normal(0, noise_sd)

            rows.append(
                {
                    "unit": unit_idx,
                    "period": t_idx,
                    "outcome": outcome,
                    "treated": int(active),
                    "treat": int(ever_treated),
                    "true_effect": tau,
                }
            )

    return pd.DataFrame(rows)
467
+
468
+
469
def generate_ddd_data(
    n_per_cell: int = 100,
    treatment_effect: float = 2.0,
    group_effect: float = 2.0,
    partition_effect: float = 1.0,
    time_effect: float = 0.5,
    noise_sd: float = 1.0,
    add_covariates: bool = False,
    seed: Optional[int] = None,
) -> pd.DataFrame:
    """
    Generate synthetic data for Triple Difference (DDD) analysis.

    The data follow the DGP:
        Y = mu + G + P + T + G*P + G*T + P*T + tau*G*P*T + eps

    where G=group, P=partition, T=time. The treatment effect (tau) applies
    only to observations with G=1, P=1 and T=1. The baseline is 50 and the
    second-order interaction coefficients are fixed at 1.5 (G*P), 1.0 (G*T)
    and 0.5 (P*T).

    Parameters
    ----------
    n_per_cell : int, default=100
        Number of observations per cell (8 cells total: 2x2x2).
    treatment_effect : float, default=2.0
        True average treatment effect on the treated (G=1, P=1, T=1).
    group_effect : float, default=2.0
        Main effect of being in treated group.
    partition_effect : float, default=1.0
        Main effect of being in eligible partition.
    time_effect : float, default=0.5
        Main effect of post-treatment period.
    noise_sd : float, default=1.0
        Standard deviation of idiosyncratic noise.
    add_covariates : bool, default=False
        If True, age and education enter the outcome
        (``0.1 * age + 0.5 * education``) and are returned as columns.
    seed : int, optional
        Random seed for reproducibility.

    Returns
    -------
    pd.DataFrame
        Synthetic DDD data with columns:
        - outcome: Outcome variable
        - group: Group indicator (0=control, 1=treated)
        - partition: Partition indicator (0=ineligible, 1=eligible)
        - time: Time indicator (0=pre, 1=post)
        - unit_id: Unique unit identifier
        - true_effect: The true treatment effect for this observation
        - age: Age covariate (if add_covariates=True)
        - education: Education covariate (if add_covariates=True)

    Examples
    --------
    Generate DDD data:

    >>> data = generate_ddd_data(n_per_cell=100, treatment_effect=3.0, seed=42)
    >>> data.shape
    (800, 6)

    Use with TripleDifference estimator:

    >>> from diff_diff import TripleDifference
    >>> ddd = TripleDifference()
    >>> results = ddd.fit(data, outcome='outcome', group='group',
    ...                   partition='partition', time='time')
    >>> abs(results.att - 3.0) < 1.0
    True
    """
    rng = np.random.default_rng(seed)

    education_levels = [12, 14, 16, 18]
    education_probs = [0.3, 0.3, 0.25, 0.15]

    # Enumerate the 2x2x2 design with group outermost and time innermost.
    cells = [(g, p, t) for g in (0, 1) for p in (0, 1) for t in (0, 1)]

    rows = []
    for g, p, t in cells:
        # The deterministic part of the outcome is the same for every
        # observation in a cell.
        cell_mean = 50 + group_effect * g + partition_effect * p + time_effect * t
        cell_mean += 1.5 * g * p  # group-partition interaction
        cell_mean += 1.0 * g * t  # group-time interaction (diff trends)
        cell_mean += 0.5 * p * t  # partition-time interaction

        tau = 0.0
        if g == 1 and p == 1 and t == 1:
            tau = treatment_effect
            cell_mean += tau

        for _ in range(n_per_cell):
            # Covariates are drawn even when unused, so the random stream
            # consumed per observation does not depend on add_covariates.
            age = rng.normal(40, 10)
            education = rng.choice(education_levels, p=education_probs)

            outcome = cell_mean
            if add_covariates:
                outcome += 0.1 * age + 0.5 * education
            outcome += rng.normal(0, noise_sd)

            row = {
                "outcome": outcome,
                "group": g,
                "partition": p,
                "time": t,
                "unit_id": len(rows),  # sequential ID across all cells
                "true_effect": tau,
            }
            if add_covariates:
                row["age"] = age
                row["education"] = education

            rows.append(row)

    return pd.DataFrame(rows)
599
+
600
+
601
def generate_panel_data(
    n_units: int = 100,
    n_periods: int = 8,
    treatment_period: int = 4,
    treatment_fraction: float = 0.5,
    treatment_effect: float = 5.0,
    parallel_trends: bool = True,
    trend_violation: float = 1.0,
    unit_fe_sd: float = 2.0,
    noise_sd: float = 0.5,
    seed: Optional[int] = None,
) -> pd.DataFrame:
    """
    Generate synthetic panel data for parallel trends testing.

    Creates panel data with optional violation of parallel trends, useful
    for testing parallel trends diagnostics, placebo tests, and sensitivity
    analysis methods. Each outcome is built as
    ``10 + unit_fe + period * slope + effect + noise``.

    Parameters
    ----------
    n_units : int, default=100
        Total number of units in the panel.
    n_periods : int, default=8
        Number of time periods.
    treatment_period : int, default=4
        First post-treatment period (0-indexed).
    treatment_fraction : float, default=0.5
        Fraction of units that receive treatment. The first
        ``floor(n_units * treatment_fraction)`` unit IDs are treated; the
        product is rounded to 9 decimals before flooring so that binary
        floating-point error (e.g. ``100 * 0.29``) does not drop a unit.
    treatment_effect : float, default=5.0
        True average treatment effect on the treated.
    parallel_trends : bool, default=True
        If True, treated and control groups share the same trend (slope 1.0).
        If False, the treated group has a steeper trend.
    trend_violation : float, default=1.0
        Size of the differential trend for treated group when
        parallel_trends=False. Treated units then have slope
        ``1.0 + trend_violation`` in every period, pre and post.
    unit_fe_sd : float, default=2.0
        Standard deviation of unit fixed effects.
    noise_sd : float, default=0.5
        Standard deviation of idiosyncratic noise.
    seed : int, optional
        Random seed for reproducibility.

    Returns
    -------
    pd.DataFrame
        Synthetic panel data with columns:
        - unit: Unit identifier
        - period: Time period
        - treated: Binary unit-level treatment indicator
        - post: Binary post-treatment indicator
        - outcome: Outcome variable
        - true_effect: The true treatment effect for this observation

    Raises
    ------
    ValueError
        If ``treatment_period`` is below 1 or not less than ``n_periods``.

    Examples
    --------
    Generate data with parallel trends:

    >>> data_parallel = generate_panel_data(parallel_trends=True, seed=42)
    >>> from diff_diff.utils import check_parallel_trends
    >>> result = check_parallel_trends(data_parallel, outcome='outcome',
    ...                                time='period', treatment_group='treated',
    ...                                pre_periods=[0, 1, 2, 3])
    >>> result['parallel_trends_plausible']
    True

    Generate data with trend violation:

    >>> data_violation = generate_panel_data(parallel_trends=False, seed=42)
    >>> result = check_parallel_trends(data_violation, outcome='outcome',
    ...                                time='period', treatment_group='treated',
    ...                                pre_periods=[0, 1, 2, 3])
    >>> result['parallel_trends_plausible']
    False
    """
    rng = np.random.default_rng(seed)

    if treatment_period < 1:
        raise ValueError("treatment_period must be at least 1")
    if treatment_period >= n_periods:
        raise ValueError(f"treatment_period must be less than n_periods ({n_periods})")

    # Floor of n_units * treatment_fraction. Rounding first removes binary
    # floating-point error: 100 * 0.29 evaluates to 28.999999999999996, which
    # a bare int() would truncate to 28 instead of the intended 29.
    n_treated = int(round(n_units * treatment_fraction, 9))

    records = []
    for unit in range(n_units):
        is_treated = unit < n_treated
        # One fixed-effect draw per unit, taken before that unit's noise draws.
        unit_fe = rng.normal(0, unit_fe_sd)

        # Only treated units deviate from the common slope, and only when
        # parallel trends are deliberately violated.
        if not parallel_trends and is_treated:
            slope = 1.0 + trend_violation
        else:
            slope = 1.0

        for period in range(n_periods):
            post = period >= treatment_period
            time_effect = period * slope

            y = 10.0 + unit_fe + time_effect

            # Treatment effect (only for treated in post-period)
            effect = 0.0
            if is_treated and post:
                effect = treatment_effect
                y += effect

            y += rng.normal(0, noise_sd)

            records.append(
                {
                    "unit": unit,
                    "period": period,
                    "treated": int(is_treated),
                    "post": int(post),
                    "outcome": y,
                    "true_effect": effect,
                }
            )

    return pd.DataFrame(records)
727
+
728
+
729
def generate_event_study_data(
    n_units: int = 300,
    n_pre: int = 5,
    n_post: int = 5,
    treatment_fraction: float = 0.5,
    treatment_effect: float = 5.0,
    unit_fe_sd: float = 2.0,
    noise_sd: float = 2.0,
    seed: Optional[int] = None,
) -> pd.DataFrame:
    """
    Generate synthetic data for event study analysis.

    Creates panel data with simultaneous treatment at period n_pre.
    Useful for testing MultiPeriodDiD, pre-trends power analysis,
    and HonestDiD sensitivity analysis. Each outcome is built as
    ``10 + unit_fe + 0.5 * period + effect + noise``.

    Parameters
    ----------
    n_units : int, default=300
        Total number of units in the panel.
    n_pre : int, default=5
        Number of pre-treatment periods.
    n_post : int, default=5
        Number of post-treatment periods.
    treatment_fraction : float, default=0.5
        Fraction of units that receive treatment. The first
        ``floor(n_units * treatment_fraction)`` unit IDs are treated; the
        product is rounded to 9 decimals before flooring so that binary
        floating-point error (e.g. ``100 * 0.29``) does not drop a unit.
    treatment_effect : float, default=5.0
        True average treatment effect on the treated.
    unit_fe_sd : float, default=2.0
        Standard deviation of unit fixed effects.
    noise_sd : float, default=2.0
        Standard deviation of idiosyncratic noise.
    seed : int, optional
        Random seed for reproducibility.

    Returns
    -------
    pd.DataFrame
        Synthetic event study data with columns:
        - unit: Unit identifier
        - period: Time period
        - treated: Binary unit-level treatment indicator
        - post: Binary post-treatment indicator
        - outcome: Outcome variable
        - event_time: Time relative to treatment (negative=pre, 0+=post)
        - true_effect: The true treatment effect for this observation

    Examples
    --------
    Generate event study data:

    >>> data = generate_event_study_data(n_units=300, n_pre=5, n_post=5, seed=42)
    >>> data['event_time'].unique()
    array([-5, -4, -3, -2, -1,  0,  1,  2,  3,  4])

    Use with MultiPeriodDiD:

    >>> from diff_diff import MultiPeriodDiD
    >>> mp_did = MultiPeriodDiD()
    >>> results = mp_did.fit(data, outcome='outcome', treatment='treated',
    ...                      time='period', post_periods=[5, 6, 7, 8, 9])

    Notes
    -----
    The event_time column is relative to treatment:
    - Negative values: pre-treatment periods
    - 0: first post-treatment period
    - Positive values: subsequent post-treatment periods
    """
    rng = np.random.default_rng(seed)

    n_periods = n_pre + n_post
    treatment_period = n_pre  # treatment starts right after the pre window

    # Floor of n_units * treatment_fraction. Rounding first removes binary
    # floating-point error: 100 * 0.29 evaluates to 28.999999999999996, which
    # a bare int() would truncate to 28 instead of the intended 29.
    n_treated = int(round(n_units * treatment_fraction, 9))

    records = []
    for unit in range(n_units):
        is_treated = unit < n_treated
        # One fixed-effect draw per unit, taken before that unit's noise draws.
        unit_fe = rng.normal(0, unit_fe_sd)

        for period in range(n_periods):
            post = period >= treatment_period
            event_time = period - treatment_period

            # Common time trend shared by treated and control units
            time_effect = period * 0.5

            y = 10.0 + unit_fe + time_effect

            # Treatment effect (only for treated in post-period)
            effect = 0.0
            if is_treated and post:
                effect = treatment_effect
                y += effect

            y += rng.normal(0, noise_sd)

            records.append(
                {
                    "unit": unit,
                    "period": period,
                    "treated": int(is_treated),
                    "post": int(post),
                    "outcome": y,
                    "event_time": event_time,
                    "true_effect": effect,
                }
            )

    return pd.DataFrame(records)
841
+
842
+
843
def generate_continuous_did_data(
    n_units: int = 500,
    n_periods: int = 4,
    cohort_periods: Optional[List[int]] = None,
    never_treated_frac: float = 0.3,
    dose_distribution: str = "lognormal",
    dose_params: Optional[Dict] = None,
    att_function: str = "linear",
    att_slope: float = 2.0,
    att_intercept: float = 1.0,
    unit_fe_sd: float = 2.0,
    time_trend: float = 0.5,
    noise_sd: float = 1.0,
    seed: Optional[int] = None,
) -> pd.DataFrame:
    """
    Generate synthetic data for continuous DiD analysis with known dose-response.

    Creates a balanced panel with continuous treatment doses and known ATT(d)
    function, satisfying strong parallel trends by construction.

    Parameters
    ----------
    n_units : int, default=500
        Number of units in the panel.
    n_periods : int, default=4
        Number of time periods (1-indexed).
    cohort_periods : list of int, optional
        Treatment cohort periods. Default: ``[2]`` (single cohort).
        Must be non-empty. Treated units are split evenly across cohorts;
        the last cohort absorbs any remainder.
    never_treated_frac : float, default=0.3
        Fraction of units that are never-treated. Must lie in ``[0, 1]``.
    dose_distribution : str, default="lognormal"
        Distribution for dose: ``"lognormal"``, ``"uniform"``, ``"exponential"``.
    dose_params : dict, optional
        Distribution-specific parameters. Keys that are omitted fall back to
        the defaults:
        lognormal: ``{"mean": 0.5, "sigma": 0.5}``
        uniform: ``{"low": 0.5, "high": 5.0}``
        exponential: ``{"scale": 2.0}``
    att_function : str, default="linear"
        Functional form of ATT(d): ``"linear"``, ``"quadratic"``, ``"log"``.
    att_slope : float, default=2.0
        Slope parameter for ATT function.
    att_intercept : float, default=1.0
        Intercept parameter for ATT function.
    unit_fe_sd : float, default=2.0
        Standard deviation of unit fixed effects.
    time_trend : float, default=0.5
        Linear time trend coefficient.
    noise_sd : float, default=1.0
        Standard deviation of idiosyncratic noise.
    seed : int, optional
        Random seed for reproducibility.

    Returns
    -------
    pd.DataFrame
        Panel data with columns: ``unit``, ``period``, ``outcome``,
        ``first_treat``, ``dose``, ``true_att``.

    Raises
    ------
    ValueError
        If ``dose_distribution`` or ``att_function`` is not one of the
        supported names, if ``never_treated_frac`` is outside ``[0, 1]``,
        or if ``cohort_periods`` is empty.
    """
    # --- Upfront validation: fail before any random numbers are consumed ---
    if dose_distribution not in ("lognormal", "uniform", "exponential"):
        raise ValueError(
            f"dose_distribution must be 'lognormal', 'uniform', or 'exponential', "
            f"got '{dose_distribution}'"
        )
    if att_function not in ("linear", "quadratic", "log"):
        raise ValueError(
            f"att_function must be 'linear', 'quadratic', or 'log', " f"got '{att_function}'"
        )
    if not 0.0 <= never_treated_frac <= 1.0:
        raise ValueError(
            f"never_treated_frac must be between 0 and 1, got {never_treated_frac}"
        )

    if cohort_periods is None:
        cohort_periods = [2]
    cohort_periods = list(cohort_periods)
    if not cohort_periods:
        raise ValueError("cohort_periods must be a non-empty list of integers")

    rng = np.random.default_rng(seed)

    # Assign units to cohorts: the first n_never units stay at 0 (never
    # treated); the remainder is split into contiguous cohort blocks.
    n_never = int(n_units * never_treated_frac)
    n_treated_total = n_units - n_never
    n_per_cohort = n_treated_total // len(cohort_periods)

    cohort_assignments = np.zeros(n_units, dtype=int)
    start = n_never
    last_pos = len(cohort_periods) - 1
    for pos, g in enumerate(cohort_periods):
        # The last cohort takes whatever is left after integer division.
        n_this = n_per_cohort if pos < last_pos else n_units - start
        cohort_assignments[start : start + n_this] = g
        start += n_this

    # Generate doses (only treated units receive a non-zero dose)
    default_params = {
        "lognormal": {"mean": 0.5, "sigma": 0.5},
        "uniform": {"low": 0.5, "high": 5.0},
        "exponential": {"scale": 2.0},
    }
    params = {**default_params[dose_distribution], **(dose_params or {})}

    dose_per_unit = np.zeros(n_units)
    treated_mask = cohort_assignments > 0
    n_treated_actual = int(np.sum(treated_mask))

    if dose_distribution == "lognormal":
        dose_per_unit[treated_mask] = rng.lognormal(
            mean=params["mean"],
            sigma=params["sigma"],
            size=n_treated_actual,
        )
    elif dose_distribution == "uniform":
        dose_per_unit[treated_mask] = rng.uniform(
            low=params["low"],
            high=params["high"],
            size=n_treated_actual,
        )
    else:  # "exponential" (validated above)
        dose_per_unit[treated_mask] = rng.exponential(
            scale=params["scale"],
            size=n_treated_actual,
        )

    # ATT function (att_function was validated above)
    def _att_func(d):
        if att_function == "linear":
            return att_intercept + att_slope * d
        if att_function == "quadratic":
            return att_intercept + att_slope * d**2
        return att_intercept + att_slope * np.log1p(d)

    # Unit fixed effects
    unit_fe = rng.normal(0, unit_fe_sd, size=n_units)

    # Build panel. Noise is drawn one scalar at a time in (unit, period)
    # order so that the random stream for a given seed stays stable.
    records = []
    for i in range(n_units):
        g_i = cohort_assignments[i]
        d_i = dose_per_unit[i]
        ever_treated = g_i > 0
        # The effect depends only on the unit's dose, so compute it once.
        att_i = _att_func(d_i) if ever_treated else 0.0
        first_treat = int(g_i) if ever_treated else 0
        for t in range(1, n_periods + 1):
            # Potential outcome without treatment
            y0 = unit_fe[i] + time_trend * t + rng.normal(0, noise_sd)
            # Treatment effect switches on from the cohort period onwards
            att_d = att_i if (ever_treated and t >= g_i) else 0.0

            records.append(
                {
                    "unit": i,
                    "period": t,
                    "outcome": y0 + att_d,
                    "first_treat": first_treat,
                    "dose": d_i,
                    "true_att": att_d,
                }
            )

    return pd.DataFrame(records)
997
+
998
+
999
def generate_staggered_ddd_data(
    n_units: int = 200,
    n_periods: int = 8,
    cohort_periods: Optional[List[int]] = None,
    never_enabled_frac: float = 0.25,
    eligibility_frac: float = 0.5,
    treatment_effect: float = 3.0,
    dynamic_effects: bool = False,
    effect_growth: float = 0.1,
    eligibility_trend: float = 0.3,
    noise_sd: float = 0.5,
    add_covariates: bool = False,
    seed: Optional[int] = None,
) -> pd.DataFrame:
    """
    Generate synthetic data for staggered triple difference (DDD) analysis.

    Creates a balanced panel with staggered enabling times and a binary
    eligibility dimension. Treatment occurs when a unit is both enabled
    (t >= S_i) and eligible (Q_i = 1). DDD-CPT holds by construction.

    Parameters
    ----------
    n_units : int, default=200
        Number of units.
    n_periods : int, default=8
        Number of time periods (1-indexed).
    cohort_periods : list of int, optional
        Enabling periods. Default: [4, 6]. Must be non-empty. Enabled units
        are split evenly across cohorts; the last cohort absorbs any
        remainder.
    never_enabled_frac : float, default=0.25
        Fraction of never-enabled units. Must lie in [0, 1].
    eligibility_frac : float, default=0.5
        Fraction of eligible units (Q=1) within each cohort. The resulting
        count is clamped so that every non-empty cohort (including the
        never-enabled group) has at least one and at most all units eligible.
    treatment_effect : float, default=3.0
        True ATT for treated units.
    dynamic_effects : bool, default=False
        If True, effects grow over time since enabling.
    effect_growth : float, default=0.1
        Per-period effect growth rate when dynamic_effects=True.
    eligibility_trend : float, default=0.3
        Differential time trend for eligible vs ineligible units.
        Same across all enabling groups (preserves DDD-CPT).
    noise_sd : float, default=0.5
        Standard deviation of idiosyncratic noise.
    add_covariates : bool, default=False
        If True, add covariates x1 (continuous) and x2 (binary).
    seed : int, optional
        Random seed.

    Returns
    -------
    pd.DataFrame
        Columns: unit, period, outcome, first_treat, eligibility, treated,
        true_effect. Also x1, x2 if add_covariates=True.

    Raises
    ------
    ValueError
        If ``never_enabled_frac`` is outside [0, 1] or ``cohort_periods``
        is empty.
    """
    if not 0.0 <= never_enabled_frac <= 1.0:
        raise ValueError(
            f"never_enabled_frac must be between 0 and 1, got {never_enabled_frac}"
        )

    if cohort_periods is None:
        cohort_periods = [4, 6]
    cohort_periods = list(cohort_periods)
    if not cohort_periods:
        raise ValueError("cohort_periods must be a non-empty list of integers")

    rng = np.random.default_rng(seed)

    # Assign units to cohorts: the first n_never units stay at 0.0 (never
    # enabled); the remainder is split into contiguous cohort blocks.
    n_never = int(n_units * never_enabled_frac)
    n_treated_total = n_units - n_never
    n_per_cohort = n_treated_total // len(cohort_periods)

    unit_cohort = np.zeros(n_units, dtype=float)
    start = n_never
    last_pos = len(cohort_periods) - 1
    for pos, g in enumerate(cohort_periods):
        # The last cohort takes whatever is left after integer division.
        n_g = n_per_cohort if pos < last_pos else n_units - start
        unit_cohort[start : start + n_g] = g
        start += n_g

    # Assign eligibility (within each cohort, fraction eligible).
    # Iterate over *distinct* group values: a repeated cohort period would
    # otherwise be sampled twice and end up with too many eligible units.
    unit_elig = np.zeros(n_units, dtype=int)
    distinct_groups = dict.fromkeys([0.0] + [float(g) for g in cohort_periods])
    for g_val in distinct_groups:
        members = np.where(unit_cohort == g_val)[0]
        n_g = int(members.size)
        if n_g == 0:
            continue
        n_eligible = max(1, min(int(n_g * eligibility_frac), n_g))
        eligible_idx = rng.choice(members, size=n_eligible, replace=False)
        unit_elig[eligible_idx] = 1

    # Unit fixed effects
    unit_fe = rng.normal(0, 2.0, size=n_units)

    # Covariates (time-invariant, one draw per unit)
    x1 = rng.normal(0, 1, size=n_units) if add_covariates else None
    x2 = rng.choice([0, 1], size=n_units) if add_covariates else None

    # Generate panel. Noise is drawn one scalar at a time in (unit, period)
    # order so that the random stream for a given seed stays stable.
    records = []
    for i in range(n_units):
        g_i = unit_cohort[i]
        q_i = unit_elig[i]
        first_treat = int(g_i) if g_i > 0 else 0
        for t in range(1, n_periods + 1):
            # Base: unit FE + time trend + eligibility-time interaction
            gamma_t = 0.1 * t
            y = unit_fe[i] + gamma_t + 1.0 * q_i + eligibility_trend * q_i * gamma_t

            if add_covariates:
                y += 0.5 * x1[i] + 0.3 * x2[i]

            # Treatment effect: enabled AND eligible
            treated = int(g_i > 0 and t >= g_i and q_i == 1)
            true_eff = 0.0
            if treated:
                true_eff = treatment_effect
                if dynamic_effects:
                    true_eff *= 1 + effect_growth * (t - g_i)
                y += true_eff

            y += rng.normal(0, noise_sd)

            row = {
                "unit": i,
                "period": t,
                "outcome": y,
                "first_treat": first_treat,
                "eligibility": q_i,
                "treated": treated,
                "true_effect": true_eff,
            }
            if add_covariates:
                row["x1"] = x1[i]
                row["x2"] = x2[i]

            records.append(row)

    return pd.DataFrame(records)
1130
+
1131
+
1132
def _rank_pair_weights(
    unit_weight: np.ndarray,
    unit_stratum: np.ndarray,
    y0: np.ndarray,
    n_strata: int,
) -> None:
    """Re-pair sampling weights with Y(0) ranks inside every stratum (in-place).

    Within each stratum the unit with the largest ``y0`` ends up with the
    largest weight. This mimics informative sampling in which hard-to-reach
    (high-outcome) subpopulations are under-covered and therefore carry
    larger inverse-selection-probability weights.

    Parameters
    ----------
    unit_weight : np.ndarray
        Per-unit weights; modified in place.
    unit_stratum : np.ndarray
        Per-unit stratum label, compared against ``0 .. n_strata - 1``.
    y0 : np.ndarray
        Per-unit untreated outcome used for ranking.
    n_strata : int
        Number of stratum labels to visit.

    Returns
    -------
    None
        ``unit_weight`` is updated in place. Strata with at most one unit
        are left untouched.
    """
    for stratum in range(n_strata):
        members = np.nonzero(unit_stratum == stratum)[0]
        if members.size <= 1:
            # Nothing to re-pair with zero or one unit.
            continue

        # Fancy indexing already yields an independent copy of the weights.
        stratum_w = unit_weight[members]
        stratum_y0 = y0[members]

        if stratum_w.std() < 1e-10:
            # Weights are (numerically) constant inside the stratum: build
            # rank-proportional weights whose mean equals the original level.
            ascending_rank = np.argsort(np.argsort(stratum_y0)).astype(float) + 1.0
            unit_weight[members] = ascending_rank / ascending_rank.mean() * stratum_w.mean()
        else:
            # Existing weights vary: hand the heaviest weight to the unit
            # with the highest Y(0), the next heaviest to the runner-up, etc.
            by_y0_desc = np.argsort(-stratum_y0)
            heaviest_first = np.sort(stratum_w)[::-1]
            unit_weight[members[by_y0_desc]] = heaviest_first
1161
+
1162
+
1163
def generate_survey_did_data(
    n_units: int = 200,
    n_periods: int = 8,
    cohort_periods: Optional[List[int]] = None,
    never_treated_frac: float = 0.3,
    treatment_effect: float = 2.0,
    dynamic_effects: bool = False,
    effect_growth: float = 0.3,
    n_strata: int = 5,
    psu_per_stratum: int = 8,
    fpc_per_stratum: float = 200.0,
    weight_variation: str = "moderate",
    psu_re_sd: float = 2.0,
    psu_period_factor: float = 0.5,
    unit_fe_sd: float = 1.0,
    noise_sd: float = 0.5,
    include_replicate_weights: bool = False,
    add_covariates: bool = False,
    panel: bool = True,
    seed: Optional[int] = None,
    # --- Research-grade DGP parameters ---
    icc: Optional[float] = None,
    weight_cv: Optional[float] = None,
    informative_sampling: bool = False,
    heterogeneous_te_by_strata: bool = False,
    strata_sizes: Optional[List[int]] = None,
    return_true_population_att: bool = False,
    covariate_effects: Optional[tuple] = None,
    te_covariate_interaction: float = 0.0,
) -> pd.DataFrame:
    """
    Generate synthetic staggered DiD data with survey structure.

    Creates a balanced panel (or repeated cross-section) with stratified
    multi-stage sampling design (strata, PSUs, FPC, sampling weights) and
    known treatment effects. The survey structure introduces intra-cluster
    correlation via PSU random effects, making design-based SEs larger
    than naive SEs.

    Modeled on ACS/BRFSS-style stratified household surveys: strata
    represent geographic region types, PSUs are census tracts sampled
    within each stratum, and weights are inverse selection probabilities.

    Parameters
    ----------
    n_units : int, default=200
        Number of units (respondents) per period.
    n_periods : int, default=8
        Number of time periods (1-indexed).
    cohort_periods : list of int, optional
        Treatment cohort periods (1-indexed, each must be >= 2 for at least
        one pre-treatment period). Default derived from n_periods; [3, 5]
        when n_periods >= 8. Requires n_periods >= 4 when not specified.
    never_treated_frac : float, default=0.3
        Fraction of units that are never treated.
    treatment_effect : float, default=2.0
        True ATT for treated units.
    dynamic_effects : bool, default=False
        If True, effects grow over time since treatment.
    effect_growth : float, default=0.3
        Per-period effect growth rate when dynamic_effects=True.
    n_strata : int, default=5
        Number of geographic strata.
    psu_per_stratum : int, default=8
        Number of PSUs (census tracts) per stratum.
    fpc_per_stratum : float, default=200.0
        Finite population correction (total tracts per stratum).
    weight_variation : str, default="moderate"
        Controls sampling weight dispersion across strata.
        "none": all weights equal (1.0).
        "moderate": weights range ~1.0-2.0 across strata.
        "high": weights range ~1.0-4.0 across strata.
    psu_re_sd : float, default=2.0
        Standard deviation of PSU random effects. Controls intra-cluster
        correlation and drives DEFF > 1.
    psu_period_factor : float, default=0.5
        Multiplier for PSU-period interaction shocks (relative to psu_re_sd).
        Higher values increase time-varying within-cluster correlation,
        which survives DiD's time-differencing and inflates design-based SEs.
    unit_fe_sd : float, default=1.0
        Standard deviation of unit fixed effects.
    noise_sd : float, default=0.5
        Standard deviation of idiosyncratic noise.
    include_replicate_weights : bool, default=False
        If True, add JK1 (delete-one-PSU) replicate weight columns.
        Requires at least 2 PSUs.
    add_covariates : bool, default=False
        If True, add covariates x1 (continuous) and x2 (binary).
    panel : bool, default=True
        If True, generate panel data (same respondents across periods).
        If False, generate repeated cross-sections with fresh respondent
        effects and unique unit IDs each period (for use with
        CallawaySantAnna(panel=False)).
    seed : int, optional
        Random seed for reproducibility.
    icc : float, optional
        Target intra-class correlation coefficient (0 < icc < 1). Overrides
        ``psu_re_sd`` via the variance decomposition:
        ``psu_re_sd = sqrt(icc * (sigma2_unit + sigma2_noise + sigma2_cov) /
        ((1 - icc) * (1 + psu_period_factor^2)))`` where ``sigma2_cov``
        includes covariate variance when ``add_covariates=True``.
        Cannot be combined with a non-default ``psu_re_sd``.
    weight_cv : float, optional
        Target coefficient of variation for sampling weights. Generates
        LogNormal weights normalized to mean 1, bypassing ``weight_variation``.
        Cannot be combined with a non-default ``weight_variation``.
    informative_sampling : bool, default=False
        If True, sampling weights correlate with Y(0) — high-outcome units
        receive higher weights (under-coverage → larger inverse-selection-
        probability weights). Uses rank-pairing within each stratum. For
        panel data, ranking is done once from period-1 outcomes. For
        repeated cross-sections, ranking is refreshed each period. Within
        each stratum, rank-based weights are scaled to preserve the
        stratum's baseline weight level from ``weight_variation``.
        When ``add_covariates=True``, covariate contributions are
        included in the Y(0) ranking.
    heterogeneous_te_by_strata : bool, default=False
        If True, treatment effect varies by stratum:
        ``TE_h = TE * (1 + 0.5 * (h - mean) / std)``. Creates a gap
        between unweighted and population ATT. With ``n_strata=1``,
        all units receive the base ``treatment_effect``.
    strata_sizes : list of int, optional
        Custom per-stratum unit counts. Must have length ``n_strata`` and
        sum to ``n_units``. Replaces equal allocation across strata.
    return_true_population_att : bool, default=False
        If True, attaches a diagnostic dict to ``df.attrs["dgp_truth"]``
        with keys: ``population_att`` (weight-weighted average of treated
        true effects), ``deff_kish`` (1 + CV(w)^2), ``base_stratum_effects``
        (base stratum TEs before dynamic/covariate modifiers),
        ``icc_realized`` (ANOVA-based
        ICC computed on period-1 data).
    covariate_effects : tuple of (float, float), optional
        Coefficients ``(beta1, beta2)`` for covariates x1 and x2 in the
        outcome equation ``y += beta1 * x1 + beta2 * x2``. Default uses
        ``(0.5, 0.3)``. Only used when ``add_covariates=True``. The ICC
        calibration automatically adjusts for the implied covariate variance.
    te_covariate_interaction : float, default=0.0
        Coefficient for treatment-by-covariate interaction:
        ``TE_i = base_TE + te_covariate_interaction * x1_i``. Creates
        unit-level treatment effect heterogeneity driven by the continuous
        covariate. Requires ``add_covariates=True``.

    Returns
    -------
    pd.DataFrame
        Columns: unit, period, outcome, first_treat, treated, true_effect,
        stratum, psu, fpc, weight. Also rep_0..rep_K if
        include_replicate_weights=True, and x1, x2 if add_covariates=True.
        If ``return_true_population_att=True``, ``df.attrs["dgp_truth"]``
        contains DGP diagnostics.

    Raises
    ------
    ValueError
        If any size/fraction/cohort/weight/ICC/covariate argument fails the
        validation performed at the top of the function, or if JK1
        replicate weights are requested with fewer than 2 (populated) PSUs.
    """
    rng = np.random.default_rng(seed)

    # --- Upfront parameter validation ---
    if n_units < 1:
        raise ValueError(f"n_units must be positive, got {n_units}")
    if n_periods < 1:
        raise ValueError(f"n_periods must be positive, got {n_periods}")
    if n_strata < 1:
        raise ValueError(f"n_strata must be positive, got {n_strata}")
    if psu_per_stratum < 1:
        raise ValueError(f"psu_per_stratum must be positive, got {psu_per_stratum}")
    if not 0.0 <= never_treated_frac <= 1.0:
        raise ValueError(
            f"never_treated_frac must be between 0 and 1, got {never_treated_frac}"
        )
    if fpc_per_stratum < psu_per_stratum:
        raise ValueError(
            f"fpc_per_stratum ({fpc_per_stratum}) must be >= psu_per_stratum "
            f"({psu_per_stratum})"
        )

    if cohort_periods is None:
        # Derive defaults from n_periods. Cohorts need g >= 2 (at least one
        # pre-period for estimability with CallawaySantAnna).
        if n_periods >= 8:
            cohort_periods = [3, 5]
        elif n_periods >= 4:
            cohort_periods = [max(2, n_periods // 3), max(3, 2 * n_periods // 3)]
        else:
            raise ValueError(
                f"n_periods={n_periods} is too small for default cohort_periods "
                f"(need n_periods >= 4 for at least one cohort with a pre-period). "
                f"Pass cohort_periods explicitly for small panels."
            )
    # Coerce array-like to list (handles np.array inputs)
    cohort_periods = list(cohort_periods)
    if not cohort_periods:
        raise ValueError("cohort_periods must be a non-empty list of integers")
    for cp in cohort_periods:
        if isinstance(cp, bool) or not isinstance(cp, (int, np.integer)):
            raise ValueError(
                f"cohort_periods must contain integers, got {cp!r}"
            )
        if cp < 2 or cp > n_periods:
            raise ValueError(
                f"Cohort period {cp} must be between 2 and {n_periods} "
                f"(g >= 2 ensures at least one pre-treatment period)"
            )

    if not np.isfinite(psu_period_factor) or psu_period_factor < 0:
        raise ValueError(
            f"psu_period_factor must be finite and non-negative, "
            f"got {psu_period_factor}"
        )

    valid_wv = ("none", "moderate", "high")
    if weight_variation not in valid_wv:
        raise ValueError(
            f"weight_variation must be one of {valid_wv}, got {weight_variation!r}"
        )

    # --- Validate research-grade DGP parameters ---
    if icc is not None:
        if not (0 < icc < 1):
            raise ValueError(f"icc must be between 0 and 1 (exclusive), got {icc}")
        if psu_re_sd != 2.0:
            raise ValueError(
                "Cannot specify both icc and a non-default psu_re_sd. "
                "icc overrides psu_re_sd via the ICC formula."
            )

    if weight_cv is not None:
        if not np.isfinite(weight_cv) or weight_cv <= 0:
            raise ValueError(
                f"weight_cv must be finite and positive, got {weight_cv}"
            )
        if weight_variation != "moderate":
            raise ValueError(
                "Cannot specify both weight_cv and a non-default "
                "weight_variation. weight_cv overrides weight_variation."
            )

    if strata_sizes is not None:
        strata_sizes = list(strata_sizes)
        for ss in strata_sizes:
            if isinstance(ss, bool) or not isinstance(ss, (int, np.integer)):
                raise ValueError(
                    f"strata_sizes must contain integers, got {ss!r}"
                )
        if len(strata_sizes) != n_strata:
            raise ValueError(
                f"strata_sizes must have length n_strata={n_strata}, "
                f"got {len(strata_sizes)}"
            )
        if any(s < 1 for s in strata_sizes):
            raise ValueError("All strata_sizes must be >= 1")
        if sum(strata_sizes) != n_units:
            raise ValueError(
                f"strata_sizes must sum to n_units={n_units}, "
                f"got {sum(strata_sizes)}"
            )

    # --- Validate and resolve covariate coefficients ---
    if covariate_effects is not None:
        covariate_effects = tuple(covariate_effects)
        if len(covariate_effects) != 2:
            raise ValueError(
                f"covariate_effects must have length 2, got {len(covariate_effects)}"
            )
        if not all(np.isfinite(c) for c in covariate_effects):
            raise ValueError(
                f"covariate_effects must be finite, got {covariate_effects}"
            )
    _beta1, _beta2 = covariate_effects if covariate_effects is not None else (0.5, 0.3)

    if not np.isfinite(te_covariate_interaction):
        raise ValueError(
            f"te_covariate_interaction must be finite, got {te_covariate_interaction}"
        )
    if te_covariate_interaction != 0.0 and not add_covariates:
        raise ValueError(
            "te_covariate_interaction requires add_covariates=True"
        )

    # --- ICC -> psu_re_sd resolution ---
    if icc is not None:
        # Covariate variance: Var(beta1*x1) + Var(beta2*x2)
        # where x1 ~ N(0,1), x2 ~ Bernoulli(0.5)
        cov_var = (_beta1**2 * 1.0 + _beta2**2 * 0.25) if add_covariates else 0.0
        non_psu_var = unit_fe_sd**2 + noise_sd**2 + cov_var
        if non_psu_var < 1e-12:
            raise ValueError(
                "icc requires non-zero non-PSU variance "
                "(unit_fe_sd, noise_sd, or add_covariates must contribute variance)"
            )
        psu_re_sd = np.sqrt(
            icc * non_psu_var
            / ((1 - icc) * (1 + psu_period_factor**2))
        )

    # --- Survey structure: assign units to strata and PSUs ---
    n_psu_total = n_strata * psu_per_stratum

    if strata_sizes is not None:
        stratum_n = strata_sizes
    else:
        # Equal allocation; the first `remainder` strata get one extra unit.
        units_per_stratum, remainder = divmod(n_units, n_strata)
        stratum_n = [
            units_per_stratum + (1 if s < remainder else 0)
            for s in range(n_strata)
        ]

    unit_stratum = np.empty(n_units, dtype=int)
    unit_psu = np.empty(n_units, dtype=int)
    idx = 0
    for s in range(n_strata):
        n_s = stratum_n[s]
        unit_stratum[idx : idx + n_s] = s
        # Round-robin units over this stratum's block of PSU ids.
        unit_psu[idx : idx + n_s] = s * psu_per_stratum + np.arange(n_s) % psu_per_stratum
        idx += n_s

    # Sampling weights
    if weight_cv is not None:
        sigma_ln = np.sqrt(np.log(1 + weight_cv**2))
        raw_w = rng.lognormal(-sigma_ln**2 / 2, sigma_ln, size=n_units)
        unit_weight = raw_w / raw_w.mean()
    else:
        # Stratum-based weights (inverse selection probability)
        scale_map = {"none": 0.0, "moderate": 1.0, "high": 3.0}
        scale = scale_map[weight_variation]
        denom = max(n_strata - 1, 1)
        unit_weight = 1.0 + scale * (unit_stratum / denom)

    # --- Treatment assignment (cohort structure) ---
    # The first n_never units are never treated; the rest is split into
    # contiguous cohort blocks, the last cohort absorbing any remainder.
    n_never = int(n_units * never_treated_frac)
    n_treated_total = n_units - n_never
    n_per_cohort = n_treated_total // len(cohort_periods)

    unit_cohort = np.zeros(n_units, dtype=int)
    ci = n_never
    last_cohort_pos = len(cohort_periods) - 1
    for pos, g in enumerate(cohort_periods):
        n_g = n_per_cohort if pos < last_cohort_pos else n_units - ci
        unit_cohort[ci : ci + n_g] = g
        ci += n_g

    # --- JK1 early guard (configured count; populated count checked after build) ---
    if include_replicate_weights and n_psu_total < 2:
        raise ValueError(
            "JK1 replicate weights require at least 2 PSUs, "
            f"got {n_psu_total}."
        )

    # --- Random effects ---
    psu_re = rng.normal(0, psu_re_sd, size=n_psu_total)
    # PSU-period shocks: intra-cluster correlation that survives first-
    # differencing in DiD. Without these, the time-invariant PSU RE
    # cancels in the treatment-vs-control time-difference and the
    # cluster-robust / survey SE would be *smaller* than naive OLS SE.
    # Controlled by psu_period_factor (default 0.5); higher values
    # increase time-varying clustering and inflate design-based SEs.
    psu_period_re = rng.normal(
        0, psu_re_sd * psu_period_factor, size=(n_psu_total, n_periods)
    )

    # Panel-mode caches: respondent effects / covariates that are held
    # fixed across periods. Filled either by the informative-sampling
    # pre-draw below or on the first loop iteration.
    panel_unit_fe = None
    panel_x1 = None
    panel_x2 = None

    # --- Informative sampling (panel path): pre-draw FEs, rank-pair weights ---
    if informative_sampling and panel:
        panel_unit_fe = rng.normal(0, unit_fe_sd, size=n_units)
        y0_period1 = (
            panel_unit_fe
            + psu_re[unit_psu]
            + psu_period_re[unit_psu, 0]
            + 0.5
        )
        if add_covariates:
            panel_x1 = rng.normal(0, 1, size=n_units)
            panel_x2 = rng.choice([0, 1], size=n_units)
            y0_period1 = y0_period1 + _beta1 * panel_x1 + _beta2 * panel_x2
        _rank_pair_weights(unit_weight, unit_stratum, y0_period1, n_strata)

    # Save base weights for cross-section informative sampling (reset each period)
    rerank_each_period = informative_sampling and not panel
    base_weight = unit_weight.copy() if rerank_each_period else None

    # --- Heterogeneous treatment effects by stratum ---
    if heterogeneous_te_by_strata:
        if n_strata == 1:
            te_by_stratum = np.array([treatment_effect])
        else:
            strata_idx = np.arange(n_strata, dtype=float)
            te_by_stratum = treatment_effect * (
                1 + 0.5 * (strata_idx - strata_idx.mean()) / strata_idx.std()
            )
    else:
        te_by_stratum = None

    # --- Generate panel or repeated cross-sections ---
    records = []
    for t in range(1, n_periods + 1):
        # NOTE: respondent effects are drawn every period even in panel mode
        # (where periods after the first discard the draw). Keeping the draw
        # preserves the random stream, so a given seed keeps producing the
        # same data set.
        fresh_fe = rng.normal(0, unit_fe_sd, size=n_units)
        if panel:
            if panel_unit_fe is None:
                panel_unit_fe = fresh_fe  # first period: save for reuse
            unit_fe = panel_unit_fe
        else:
            unit_fe = fresh_fe

        # Covariates. Same stream-preserving rule: unless they were pre-drawn
        # for panel informative sampling, a fresh draw happens every period
        # and panel mode simply keeps reusing the first one.
        if not add_covariates:
            x1 = None
            x2 = None
        elif informative_sampling and panel:
            x1 = panel_x1  # pre-drawn before loop for ranking
            x2 = panel_x2
        else:
            x1 = rng.normal(0, 1, size=n_units)
            x2 = rng.choice([0, 1], size=n_units)
            if panel:
                if panel_x1 is None:
                    panel_x1, panel_x2 = x1, x2
                else:
                    x1, x2 = panel_x1, panel_x2

        # Cross-section informative sampling: re-rank weights each period
        if rerank_each_period:
            unit_weight = base_weight.copy()
            y0_t = (
                unit_fe
                + psu_re[unit_psu]
                + psu_period_re[unit_psu, t - 1]
                + 0.5 * t
            )
            if add_covariates:
                y0_t = y0_t + _beta1 * x1 + _beta2 * x2
            _rank_pair_weights(unit_weight, unit_stratum, y0_t, n_strata)

        for i in range(n_units):
            g_i = unit_cohort[i]
            # Outcome: unit FE + PSU RE + PSU-period shock + time trend
            y = unit_fe[i] + psu_re[unit_psu[i]] + psu_period_re[unit_psu[i], t - 1] + 0.5 * t

            if add_covariates:
                y += _beta1 * x1[i] + _beta2 * x2[i]

            treated = int(g_i > 0 and t >= g_i)
            true_eff = 0.0
            if treated:
                if te_by_stratum is not None:
                    true_eff = float(te_by_stratum[unit_stratum[i]])
                else:
                    true_eff = treatment_effect
                if te_covariate_interaction != 0.0:
                    true_eff += te_covariate_interaction * x1[i]
                if dynamic_effects:
                    true_eff *= 1 + effect_growth * (t - g_i)
                y += true_eff

            y += rng.normal(0, noise_sd)

            # In cross-section mode, each period gets unique unit IDs
            uid = i if panel else (t - 1) * n_units + i

            row = {
                "unit": uid,
                "period": t,
                "outcome": y,
                "first_treat": int(g_i),
                "treated": treated,
                "true_effect": true_eff,
                "stratum": int(unit_stratum[i]),
                "psu": int(unit_psu[i]),
                "fpc": fpc_per_stratum,
                "weight": float(unit_weight[i]),
            }
            if add_covariates:
                row["x1"] = x1[i]
                row["x2"] = x2[i]
            records.append(row)

    df = pd.DataFrame(records)

    # --- Replicate weights (JK1 delete-one-PSU) ---
    if include_replicate_weights:
        psu_ids = sorted(df["psu"].unique())
        n_rep = len(psu_ids)
        if n_rep < 2:
            raise ValueError(
                "JK1 replicate weights require at least 2 populated PSUs, "
                f"got {n_rep}. Increase n_units or decrease psu_per_stratum."
            )
        base_w = df["weight"].values
        row_psu = df["psu"].values
        jk1_factor = n_rep / (n_rep - 1)
        replicate_columns = {}
        for r, psu_id in enumerate(psu_ids):
            w_r = base_w.copy()
            w_r[row_psu == psu_id] = 0.0
            # Rescale remaining: k/(k-1) for JK1
            w_r[w_r > 0] *= jk1_factor
            replicate_columns[f"rep_{r}"] = w_r
        # Attach all replicate columns in one go; inserting them one by one
        # fragments the frame and makes pandas emit a PerformanceWarning
        # once there are many PSUs.
        df = pd.concat(
            [df, pd.DataFrame(replicate_columns, index=df.index)], axis=1
        )

    # --- DGP truth diagnostics ---
    if return_true_population_att:
        treated_mask = df["treated"] == 1
        if treated_mask.any():
            w_treated = df.loc[treated_mask, "weight"].values
            te_treated = df.loc[treated_mask, "true_effect"].values
            population_att = float(np.average(te_treated, weights=w_treated))
        else:
            population_att = float("nan")

        if te_by_stratum is not None:
            stratum_effects = {
                int(s): float(te_by_stratum[s]) for s in range(n_strata)
            }
        else:
            stratum_effects = {
                int(s): float(treatment_effect) for s in range(n_strata)
            }

        # Kish DEFF from weight variation
        w_all = df.groupby("unit")["weight"].first().values
        cv_w = float(w_all.std() / w_all.mean()) if w_all.mean() > 0 else 0.0
        deff_kish = 1 + cv_w**2

        # Realized ICC (ANOVA-based, period-1 only to avoid TE contamination)
        _p1 = df[df["period"] == 1]
        _groups = _p1.groupby("psu")["outcome"]
        _n_total = len(_p1)
        _n_groups = _groups.ngroups
        # ICC undefined with < 2 groups or no within-group replication
        if _n_groups < 2 or _n_total <= _n_groups:
            icc_realized = float("nan")
        else:
            _n_bar = _n_total / _n_groups
            _grand_mean = _p1["outcome"].mean()
            _ssb = (_groups.size() * (_groups.mean() - _grand_mean) ** 2).sum()
            _msb = _ssb / (_n_groups - 1)
            _ssw = _groups.apply(lambda x: ((x - x.mean()) ** 2).sum()).sum()
            _msw = _ssw / (_n_total - _n_groups)
            _denom = _msb + (_n_bar - 1) * _msw
            icc_realized = float((_msb - _msw) / _denom) if _denom > 0 else float("nan")

        df.attrs["dgp_truth"] = {
            "population_att": population_att,
            "deff_kish": float(deff_kish),
            "base_stratum_effects": stratum_effects,
            "icc_realized": icc_realized,
        }

    return df