diff-diff 2.1.1__tar.gz → 2.1.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (30) hide show
  1. {diff_diff-2.1.1 → diff_diff-2.1.3}/PKG-INFO +1 -1
  2. {diff_diff-2.1.1 → diff_diff-2.1.3}/diff_diff/__init__.py +11 -1
  3. {diff_diff-2.1.1 → diff_diff-2.1.3}/diff_diff/prep.py +655 -0
  4. {diff_diff-2.1.1 → diff_diff-2.1.3}/diff_diff/trop.py +184 -46
  5. {diff_diff-2.1.1 → diff_diff-2.1.3}/pyproject.toml +1 -1
  6. {diff_diff-2.1.1 → diff_diff-2.1.3}/rust/Cargo.lock +1 -1
  7. {diff_diff-2.1.1 → diff_diff-2.1.3}/rust/Cargo.toml +1 -1
  8. {diff_diff-2.1.1 → diff_diff-2.1.3}/rust/src/trop.rs +131 -60
  9. {diff_diff-2.1.1 → diff_diff-2.1.3}/README.md +0 -0
  10. {diff_diff-2.1.1 → diff_diff-2.1.3}/diff_diff/_backend.py +0 -0
  11. {diff_diff-2.1.1 → diff_diff-2.1.3}/diff_diff/bacon.py +0 -0
  12. {diff_diff-2.1.1 → diff_diff-2.1.3}/diff_diff/datasets.py +0 -0
  13. {diff_diff-2.1.1 → diff_diff-2.1.3}/diff_diff/diagnostics.py +0 -0
  14. {diff_diff-2.1.1 → diff_diff-2.1.3}/diff_diff/estimators.py +0 -0
  15. {diff_diff-2.1.1 → diff_diff-2.1.3}/diff_diff/honest_did.py +0 -0
  16. {diff_diff-2.1.1 → diff_diff-2.1.3}/diff_diff/linalg.py +0 -0
  17. {diff_diff-2.1.1 → diff_diff-2.1.3}/diff_diff/power.py +0 -0
  18. {diff_diff-2.1.1 → diff_diff-2.1.3}/diff_diff/pretrends.py +0 -0
  19. {diff_diff-2.1.1 → diff_diff-2.1.3}/diff_diff/results.py +0 -0
  20. {diff_diff-2.1.1 → diff_diff-2.1.3}/diff_diff/staggered.py +0 -0
  21. {diff_diff-2.1.1 → diff_diff-2.1.3}/diff_diff/sun_abraham.py +0 -0
  22. {diff_diff-2.1.1 → diff_diff-2.1.3}/diff_diff/synthetic_did.py +0 -0
  23. {diff_diff-2.1.1 → diff_diff-2.1.3}/diff_diff/triple_diff.py +0 -0
  24. {diff_diff-2.1.1 → diff_diff-2.1.3}/diff_diff/twfe.py +0 -0
  25. {diff_diff-2.1.1 → diff_diff-2.1.3}/diff_diff/utils.py +0 -0
  26. {diff_diff-2.1.1 → diff_diff-2.1.3}/diff_diff/visualization.py +0 -0
  27. {diff_diff-2.1.1 → diff_diff-2.1.3}/rust/src/bootstrap.rs +0 -0
  28. {diff_diff-2.1.1 → diff_diff-2.1.3}/rust/src/lib.rs +0 -0
  29. {diff_diff-2.1.1 → diff_diff-2.1.3}/rust/src/linalg.rs +0 -0
  30. {diff_diff-2.1.1 → diff_diff-2.1.3}/rust/src/weights.rs +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: diff-diff
3
- Version: 2.1.1
3
+ Version: 2.1.3
4
4
  Classifier: Development Status :: 5 - Production/Stable
5
5
  Classifier: Intended Audience :: Science/Research
6
6
  Classifier: Operating System :: OS Independent
@@ -71,6 +71,11 @@ from diff_diff.prep import (
71
71
  balance_panel,
72
72
  create_event_time,
73
73
  generate_did_data,
74
+ generate_ddd_data,
75
+ generate_event_study_data,
76
+ generate_factor_data,
77
+ generate_panel_data,
78
+ generate_staggered_data,
74
79
  make_post_indicator,
75
80
  make_treatment_indicator,
76
81
  rank_control_units,
@@ -131,7 +136,7 @@ from diff_diff.datasets import (
131
136
  load_mpdta,
132
137
  )
133
138
 
134
- __version__ = "2.1.1"
139
+ __version__ = "2.1.3"
135
140
  __all__ = [
136
141
  # Estimators
137
142
  "DifferenceInDifferences",
@@ -190,6 +195,11 @@ __all__ = [
190
195
  "validate_did_data",
191
196
  "summarize_did_data",
192
197
  "generate_did_data",
198
+ "generate_staggered_data",
199
+ "generate_factor_data",
200
+ "generate_ddd_data",
201
+ "generate_panel_data",
202
+ "generate_event_study_data",
193
203
  "create_event_time",
194
204
  "aggregate_to_cohorts",
195
205
  "rank_control_units",
@@ -1336,3 +1336,658 @@ def _suggest_treatment_candidates(
1336
1336
  # Return top candidates
1337
1337
  result = result.nlargest(n_candidates, 'treatment_candidate_score')
1338
1338
  return result.reset_index(drop=True)
1339
+
1340
+
1341
def generate_staggered_data(
    n_units: int = 100,
    n_periods: int = 10,
    cohort_periods: Optional[List[int]] = None,
    never_treated_frac: float = 0.3,
    treatment_effect: float = 2.0,
    dynamic_effects: bool = True,
    effect_growth: float = 0.1,
    unit_fe_sd: float = 2.0,
    time_trend: float = 0.1,
    noise_sd: float = 0.5,
    seed: Optional[int] = None,
) -> pd.DataFrame:
    """
    Generate synthetic data for staggered adoption DiD analysis.

    Creates panel data where different units receive treatment at different
    times (staggered rollout). Useful for testing CallawaySantAnna,
    SunAbraham, and other staggered DiD estimators.

    The outcome for unit ``i`` in period ``t`` is::

        10.0 + unit_fe_i + time_trend * t + effect_it + noise_it

    Parameters
    ----------
    n_units : int, default=100
        Total number of units in the panel.
    n_periods : int, default=10
        Number of time periods (periods are numbered ``0 .. n_periods - 1``).
    cohort_periods : list of int, optional
        Periods when treatment cohorts are first treated. Each value must
        satisfy ``1 <= period < n_periods``. If None, defaults to
        ``[3, 5, 7]`` when ``n_periods >= 8``; for shorter panels the
        defaults are ``n_periods // 3`` and ``2 * n_periods // 3``, clamped
        to be at least 1 and de-duplicated.
    never_treated_frac : float, default=0.3
        Fraction of units that are never treated (``first_treat == 0``).
        Must lie in ``[0, 1]``. The first ``int(n_units * never_treated_frac)``
        unit IDs are the never-treated units.
    treatment_effect : float, default=2.0
        Base treatment effect at time of treatment.
    dynamic_effects : bool, default=True
        If True, treatment effects grow over time since treatment.
    effect_growth : float, default=0.1
        Per-period growth in treatment effect (if dynamic_effects=True).
        Effect at time t since treatment: effect * (1 + effect_growth * t).
    unit_fe_sd : float, default=2.0
        Standard deviation of unit fixed effects.
    time_trend : float, default=0.1
        Linear time trend coefficient.
    noise_sd : float, default=0.5
        Standard deviation of idiosyncratic noise.
    seed : int, optional
        Random seed for reproducibility.

    Returns
    -------
    pd.DataFrame
        Synthetic staggered adoption data with columns:
        - unit: Unit identifier
        - period: Time period
        - outcome: Outcome variable
        - first_treat: First treatment period (0 = never treated)
        - treated: Binary indicator (1 if treated at this observation)
        - treat: Binary unit-level ever-treated indicator
        - true_effect: The true treatment effect for this observation

    Raises
    ------
    ValueError
        If ``never_treated_frac`` is outside ``[0, 1]``, if any cohort period
        is outside ``[1, n_periods - 1]``, or if ``cohort_periods`` is empty
        while some units must be treated.

    Examples
    --------
    Generate staggered adoption data:

    >>> data = generate_staggered_data(n_units=100, n_periods=10, seed=42)
    >>> data.shape
    (1000, 7)
    >>> data.loc[data['first_treat'] == 0, 'unit'].nunique()
    30

    Use with Callaway-Sant'Anna estimator:

    >>> from diff_diff import CallawaySantAnna
    >>> cs = CallawaySantAnna()
    >>> results = cs.fit(data, outcome='outcome', unit='unit',
    ...                  time='period', first_treat='first_treat')
    """
    # Written as a negated chained comparison so that NaN is rejected too.
    if not 0.0 <= never_treated_frac <= 1.0:
        raise ValueError(
            f"never_treated_frac must be between 0 and 1, got {never_treated_frac}"
        )

    rng = np.random.default_rng(seed)

    # Default cohort periods if not specified
    if cohort_periods is None:
        if n_periods >= 8:
            cohort_periods = [3, 5, 7]
        else:
            # Clamp to >= 1 so very short panels (e.g. n_periods=2) do not
            # produce the invalid cohort period 0, and drop duplicates.
            cohort_periods = sorted(
                {max(1, n_periods // 3), max(1, 2 * n_periods // 3)}
            )

    # Validate cohort periods
    for cp in cohort_periods:
        if cp < 1 or cp >= n_periods:
            raise ValueError(
                f"Cohort period {cp} must be between 1 and {n_periods - 1}"
            )

    # Determine number of never-treated and treated units
    n_never = int(n_units * never_treated_frac)
    n_treated = n_units - n_never

    if n_treated > 0 and len(cohort_periods) == 0:
        raise ValueError(
            "cohort_periods must contain at least one period when some units are treated"
        )

    # Assign treatment cohorts: never-treated units come first (first_treat=0),
    # the remaining units are assigned a cohort uniformly at random.
    first_treat = np.zeros(n_units, dtype=int)
    if n_treated > 0:
        cohort_assignments = rng.choice(len(cohort_periods), size=n_treated)
        first_treat[n_never:] = [cohort_periods[c] for c in cohort_assignments]

    # Generate unit fixed effects
    unit_fe = rng.normal(0, unit_fe_sd, n_units)

    # Build data. Noise is drawn one observation at a time, in (unit, period)
    # order, so the random stream for a given seed is unchanged.
    records = []
    for unit in range(n_units):
        unit_first_treat = first_treat[unit]
        is_ever_treated = unit_first_treat > 0

        for period in range(n_periods):
            # Check if treated at this observation
            is_treated = is_ever_treated and period >= unit_first_treat

            # Base outcome: intercept + unit FE + time trend
            y = 10.0 + unit_fe[unit] + time_trend * period

            # Treatment effect
            effect = 0.0
            if is_treated:
                time_since_treatment = period - unit_first_treat
                if dynamic_effects:
                    effect = treatment_effect * (1 + effect_growth * time_since_treatment)
                else:
                    effect = treatment_effect
                y += effect

            # Add noise
            y += rng.normal(0, noise_sd)

            records.append({
                "unit": unit,
                "period": period,
                "outcome": y,
                "first_treat": unit_first_treat,
                "treated": int(is_treated),
                "treat": int(is_ever_treated),
                "true_effect": effect,
            })

    return pd.DataFrame(records)
1484
+
1485
+
1486
def generate_factor_data(
    n_units: int = 50,
    n_pre: int = 10,
    n_post: int = 5,
    n_treated: int = 10,
    n_factors: int = 2,
    treatment_effect: float = 2.0,
    factor_strength: float = 1.0,
    treated_loading_shift: float = 0.5,
    unit_fe_sd: float = 1.0,
    noise_sd: float = 0.5,
    seed: Optional[int] = None,
) -> pd.DataFrame:
    """
    Generate synthetic panel data with interactive fixed effects (factor model).

    Creates data following the DGP:
        Y_it = mu + alpha_i + beta_t + factor_strength * Lambda_i'F_t
               + tau*D_it + eps_it

    where Lambda_i'F_t is the interactive fixed effects component, mu = 10.0,
    and beta_t is a linear trend running from 0 to 2 over the panel. Useful for
    testing TROP (Triply Robust Panel) and comparing with SyntheticDiD.

    Parameters
    ----------
    n_units : int, default=50
        Total number of units in the panel.
    n_pre : int, default=10
        Number of pre-treatment periods.
    n_post : int, default=5
        Number of post-treatment periods.
    n_treated : int, default=10
        Number of treated units (assigned to first n_treated unit IDs).
    n_factors : int, default=2
        Number of latent factors in the interactive fixed effects.
    treatment_effect : float, default=2.0
        True average treatment effect on the treated.
    factor_strength : float, default=1.0
        Scaling factor for interactive fixed effects.
    treated_loading_shift : float, default=0.5
        Shift in factor loadings for treated units (creates confounding).
    unit_fe_sd : float, default=1.0
        Standard deviation of unit fixed effects.
    noise_sd : float, default=0.5
        Standard deviation of idiosyncratic noise.
    seed : int, optional
        Random seed for reproducibility.

    Returns
    -------
    pd.DataFrame
        Synthetic factor model data with columns:
        - unit: Unit identifier
        - period: Time period
        - outcome: Outcome variable
        - treated: Binary indicator (1 if treated at this observation)
        - treat: Binary unit-level ever-treated indicator
        - true_effect: The true treatment effect for this observation

    Raises
    ------
    ValueError
        If ``n_treated`` exceeds ``n_units`` or is less than 1.

    Examples
    --------
    Generate data with factor structure:

    >>> data = generate_factor_data(n_units=50, n_factors=2, seed=42)
    >>> data.shape
    (750, 6)

    Use with TROP estimator:

    >>> from diff_diff import TROP
    >>> trop = TROP(n_bootstrap=50, seed=42)
    >>> results = trop.fit(data, outcome='outcome', treatment='treated',
    ...                    unit='unit', time='period',
    ...                    post_periods=list(range(10, 15)))

    Notes
    -----
    The treated units have systematically different factor loadings
    (shifted by `treated_loading_shift`) and a unit fixed effect that is
    shifted up by 1.0, which creates confounding between treatment status
    and the latent structure.
    """
    # Validate before doing any work so bad arguments fail fast.
    if n_treated > n_units:
        raise ValueError(f"n_treated ({n_treated}) cannot exceed n_units ({n_units})")
    if n_treated < 1:
        raise ValueError("n_treated must be at least 1")

    rng = np.random.default_rng(seed)
    n_periods = n_pre + n_post

    # NOTE: the order of the random draws below (F, Lambda, alpha, then
    # per-observation noise) determines the output for a given seed.

    # Generate factors F: (n_periods, n_factors)
    F = rng.normal(0, 1, (n_periods, n_factors))

    # Generate loadings Lambda: (n_factors, n_units)
    # Treated units have shifted loadings (creates confounding)
    Lambda = rng.normal(0, 1, (n_factors, n_units))
    Lambda[:, :n_treated] += treated_loading_shift

    # Unit fixed effects (treated units have higher baseline)
    alpha = rng.normal(0, unit_fe_sd, n_units)
    alpha[:n_treated] += 1.0

    # Time fixed effects (linear trend)
    beta = np.linspace(0, 2, n_periods)

    # Generate outcomes
    records = []
    for i in range(n_units):
        is_ever_treated = i < n_treated

        for t in range(n_periods):
            # Treatment switches on for treated units from period n_pre onward
            is_treated_obs = is_ever_treated and t >= n_pre

            # Base outcome
            y = 10.0 + alpha[i] + beta[t]

            # Interactive fixed effects: Lambda_i' F_t
            y += factor_strength * (Lambda[:, i] @ F[t, :])

            # Treatment effect
            effect = 0.0
            if is_treated_obs:
                effect = treatment_effect
                y += effect

            # Add noise
            y += rng.normal(0, noise_sd)

            records.append({
                "unit": i,
                "period": t,
                "outcome": y,
                "treated": int(is_treated_obs),
                "treat": int(is_ever_treated),
                "true_effect": effect,
            })

    return pd.DataFrame(records)
1624
+
1625
+
1626
def generate_ddd_data(
    n_per_cell: int = 100,
    treatment_effect: float = 2.0,
    group_effect: float = 2.0,
    partition_effect: float = 1.0,
    time_effect: float = 0.5,
    noise_sd: float = 1.0,
    add_covariates: bool = False,
    seed: Optional[int] = None,
) -> pd.DataFrame:
    """
    Simulate a 2x2x2 design for Triple Difference (DDD) estimation.

    Each observation belongs to one of eight cells defined by
    G (group), P (partition) and T (time), and its outcome is::

        Y = 50 + group_effect*G + partition_effect*P + time_effect*T
              + 1.5*G*P + 1.0*G*T + 0.5*P*T
              + treatment_effect*G*P*T + eps

    so the treatment effect is present only in the cell with
    G=1, P=1 and T=1.

    Parameters
    ----------
    n_per_cell : int, default=100
        Observations drawn for each of the 8 cells.
    treatment_effect : float, default=2.0
        Effect added to the outcome in the (G=1, P=1, T=1) cell.
    group_effect : float, default=2.0
        Main effect of G.
    partition_effect : float, default=1.0
        Main effect of P.
    time_effect : float, default=0.5
        Main effect of T.
    noise_sd : float, default=1.0
        Standard deviation of the normal noise term.
    add_covariates : bool, default=False
        When True, ``0.1 * age + 0.5 * education`` is added to the outcome
        and both covariates are returned as columns.
    seed : int, optional
        Seed for the random number generator.

    Returns
    -------
    pd.DataFrame
        One row per observation with columns:
        - outcome: Outcome variable
        - group: Group indicator (0=control, 1=treated)
        - partition: Partition indicator (0=ineligible, 1=eligible)
        - time: Time indicator (0=pre, 1=post)
        - unit_id: Running identifier, unique per row
        - true_effect: Treatment effect included in this row's outcome
        - age: Age covariate (only if add_covariates=True)
        - education: Education covariate (only if add_covariates=True)

    Examples
    --------
    >>> data = generate_ddd_data(n_per_cell=100, treatment_effect=3.0, seed=42)
    >>> data.shape
    (800, 6)

    The result can be passed to the TripleDifference estimator:

    >>> from diff_diff import TripleDifference
    >>> ddd = TripleDifference()
    >>> results = ddd.fit(data, outcome='outcome', group='group',
    ...                   partition='partition', time='time')

    Notes
    -----
    Age and education are drawn for every row even when
    ``add_covariates=False``; they are simply not used or returned in
    that case.
    """
    rng = np.random.default_rng(seed)

    # Cells are visited with group varying slowest and time fastest.
    cells = [(g, p, t) for g in [0, 1] for p in [0, 1] for t in [0, 1]]

    rows = []
    for g, p, t in cells:
        # Deterministic part of the outcome is the same for the whole cell.
        cell_mean = 50 + group_effect * g + partition_effect * p + time_effect * t
        cell_mean += 1.5 * g * p  # group-partition interaction
        cell_mean += 1.0 * g * t  # group-time interaction
        cell_mean += 0.5 * p * t  # partition-time interaction

        in_treated_cell = g == 1 and p == 1 and t == 1
        if in_treated_cell:
            cell_effect = treatment_effect
            cell_mean += cell_effect
        else:
            cell_effect = 0.0

        for _ in range(n_per_cell):
            # Draw order per row: age, education, noise.
            age = rng.normal(40, 10)
            education = rng.choice([12, 14, 16, 18], p=[0.3, 0.3, 0.25, 0.15])

            value = cell_mean
            if add_covariates:
                value += 0.1 * age + 0.5 * education
            value += rng.normal(0, noise_sd)

            row = {
                "outcome": value,
                "group": g,
                "partition": p,
                "time": t,
                "unit_id": len(rows),  # rows so far == running id
                "true_effect": cell_effect,
            }
            if add_covariates:
                row["age"] = age
                row["education"] = education
            rows.append(row)

    return pd.DataFrame(rows)
1756
+
1757
+
1758
def generate_panel_data(
    n_units: int = 100,
    n_periods: int = 8,
    treatment_period: int = 4,
    treatment_fraction: float = 0.5,
    treatment_effect: float = 5.0,
    parallel_trends: bool = True,
    trend_violation: float = 1.0,
    unit_fe_sd: float = 2.0,
    noise_sd: float = 0.5,
    seed: Optional[int] = None,
) -> pd.DataFrame:
    """
    Simulate a two-group panel, optionally violating parallel trends.

    The outcome for a unit in period ``t`` is::

        10.0 + unit_fe + slope * t + effect + noise

    where ``slope`` is 1.0 for every unit when ``parallel_trends=True``.
    When ``parallel_trends=False`` treated units instead use
    ``1.0 + trend_violation`` (in every period, not only before
    treatment). Intended for exercising parallel-trends diagnostics,
    placebo tests and sensitivity analyses.

    Parameters
    ----------
    n_units : int, default=100
        Total number of units in the panel.
    n_periods : int, default=8
        Number of time periods (numbered from 0).
    treatment_period : int, default=4
        First post-treatment period (0-indexed). Must satisfy
        ``1 <= treatment_period < n_periods``.
    treatment_fraction : float, default=0.5
        Share of units that are treated; the first
        ``int(n_units * treatment_fraction)`` unit IDs form the treated group.
    treatment_effect : float, default=5.0
        Effect added to treated units in post-treatment periods.
    parallel_trends : bool, default=True
        Whether treated and control units share the same time slope.
    trend_violation : float, default=1.0
        Extra slope given to treated units when ``parallel_trends=False``.
    unit_fe_sd : float, default=2.0
        Standard deviation of unit fixed effects.
    noise_sd : float, default=0.5
        Standard deviation of idiosyncratic noise.
    seed : int, optional
        Seed for the random number generator.

    Returns
    -------
    pd.DataFrame
        One row per unit-period with columns:
        - unit: Unit identifier
        - period: Time period
        - treated: Binary unit-level treatment indicator
        - post: Binary post-treatment indicator
        - outcome: Outcome variable
        - true_effect: Treatment effect included in this row's outcome

    Raises
    ------
    ValueError
        If ``treatment_period`` is below 1 or not less than ``n_periods``.

    Examples
    --------
    >>> data = generate_panel_data(parallel_trends=True, seed=42)
    >>> data.shape
    (800, 6)

    The frame is laid out for ``diff_diff.utils.check_parallel_trends``,
    e.g. with ``outcome='outcome'``, ``time='period'``,
    ``treatment_group='treated'`` and ``pre_periods=[0, 1, 2, 3]``.
    """
    rng = np.random.default_rng(seed)

    if treatment_period < 1:
        raise ValueError("treatment_period must be at least 1")
    if treatment_period >= n_periods:
        raise ValueError(f"treatment_period must be less than n_periods ({n_periods})")

    treated_cutoff = int(n_units * treatment_fraction)

    rows = []
    for uid in range(n_units):
        in_treated_group = uid < treated_cutoff

        # One fixed-effect draw per unit, taken before that unit's noise draws.
        fe = rng.normal(0, unit_fe_sd)

        # Only treated units under a trend violation deviate from slope 1.0.
        if in_treated_group and not parallel_trends:
            slope = 1.0 + trend_violation
        else:
            slope = 1.0

        for t in range(n_periods):
            is_post = t >= treatment_period

            value = 10.0 + fe + t * slope

            if in_treated_group and is_post:
                row_effect = treatment_effect
                value += row_effect
            else:
                row_effect = 0.0

            value += rng.normal(0, noise_sd)

            rows.append({
                "unit": uid,
                "period": t,
                "treated": int(in_treated_group),
                "post": int(is_post),
                "outcome": value,
                "true_effect": row_effect,
            })

    return pd.DataFrame(rows)
1882
+
1883
+
1884
def generate_event_study_data(
    n_units: int = 300,
    n_pre: int = 5,
    n_post: int = 5,
    treatment_fraction: float = 0.5,
    treatment_effect: float = 5.0,
    unit_fe_sd: float = 2.0,
    noise_sd: float = 2.0,
    seed: Optional[int] = None,
) -> pd.DataFrame:
    """
    Simulate a panel with a single, simultaneous treatment date.

    All treated units start treatment in period ``n_pre``. The outcome for
    a unit in period ``t`` is::

        10.0 + unit_fe + 0.5 * t + effect + noise

    Useful for testing MultiPeriodDiD, pre-trends power analysis,
    and HonestDiD sensitivity analysis.

    Parameters
    ----------
    n_units : int, default=300
        Total number of units in the panel.
    n_pre : int, default=5
        Number of pre-treatment periods.
    n_post : int, default=5
        Number of post-treatment periods.
    treatment_fraction : float, default=0.5
        Share of units that are treated; the first
        ``int(n_units * treatment_fraction)`` unit IDs form the treated group.
    treatment_effect : float, default=5.0
        Effect added to treated units in post-treatment periods.
    unit_fe_sd : float, default=2.0
        Standard deviation of unit fixed effects.
    noise_sd : float, default=2.0
        Standard deviation of idiosyncratic noise.
    seed : int, optional
        Seed for the random number generator.

    Returns
    -------
    pd.DataFrame
        One row per unit-period with columns:
        - unit: Unit identifier
        - period: Time period
        - treated: Binary unit-level treatment indicator
        - post: Binary post-treatment indicator
        - outcome: Outcome variable
        - event_time: Period minus ``n_pre`` (negative=pre, 0+=post)
        - true_effect: Treatment effect included in this row's outcome

    Examples
    --------
    >>> data = generate_event_study_data(n_units=300, n_pre=5, n_post=5, seed=42)
    >>> data['event_time'].unique().tolist()
    [-5, -4, -3, -2, -1, 0, 1, 2, 3, 4]

    Use with MultiPeriodDiD:

    >>> from diff_diff import MultiPeriodDiD
    >>> mp_did = MultiPeriodDiD()
    >>> results = mp_did.fit(data, outcome='outcome', treatment='treated',
    ...                      time='period', post_periods=[5, 6, 7, 8, 9])

    Notes
    -----
    The event_time column is relative to treatment:
    - Negative values: pre-treatment periods
    - 0: first post-treatment period
    - Positive values: subsequent post-treatment periods
    """
    rng = np.random.default_rng(seed)

    total_periods = n_pre + n_post
    onset = n_pre  # first treated period
    treated_cutoff = int(n_units * treatment_fraction)

    rows = []
    for uid in range(n_units):
        in_treated_group = uid < treated_cutoff

        # One fixed-effect draw per unit, taken before that unit's noise draws.
        fe = rng.normal(0, unit_fe_sd)

        for t in range(total_periods):
            is_post = t >= onset

            # Intercept + unit effect + common linear trend
            value = 10.0 + fe + t * 0.5

            if in_treated_group and is_post:
                row_effect = treatment_effect
                value += row_effect
            else:
                row_effect = 0.0

            value += rng.normal(0, noise_sd)

            rows.append({
                "unit": uid,
                "period": t,
                "treated": int(in_treated_group),
                "post": int(is_post),
                "outcome": value,
                "event_time": t - onset,
                "true_effect": row_effect,
            })

    return pd.DataFrame(rows)