diff-diff 2.9.0__tar.gz → 2.9.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {diff_diff-2.9.0 → diff_diff-2.9.1}/PKG-INFO +1 -1
- {diff_diff-2.9.0 → diff_diff-2.9.1}/diff_diff/__init__.py +1 -1
- {diff_diff-2.9.0 → diff_diff-2.9.1}/diff_diff/prep_dgp.py +312 -23
- {diff_diff-2.9.0 → diff_diff-2.9.1}/pyproject.toml +1 -1
- {diff_diff-2.9.0 → diff_diff-2.9.1}/rust/Cargo.lock +1 -1
- {diff_diff-2.9.0 → diff_diff-2.9.1}/rust/Cargo.toml +1 -1
- {diff_diff-2.9.0 → diff_diff-2.9.1}/README.md +0 -0
- {diff_diff-2.9.0 → diff_diff-2.9.1}/diff_diff/_backend.py +0 -0
- {diff_diff-2.9.0 → diff_diff-2.9.1}/diff_diff/bacon.py +0 -0
- {diff_diff-2.9.0 → diff_diff-2.9.1}/diff_diff/bootstrap_utils.py +0 -0
- {diff_diff-2.9.0 → diff_diff-2.9.1}/diff_diff/continuous_did.py +0 -0
- {diff_diff-2.9.0 → diff_diff-2.9.1}/diff_diff/continuous_did_bspline.py +0 -0
- {diff_diff-2.9.0 → diff_diff-2.9.1}/diff_diff/continuous_did_results.py +0 -0
- {diff_diff-2.9.0 → diff_diff-2.9.1}/diff_diff/datasets.py +0 -0
- {diff_diff-2.9.0 → diff_diff-2.9.1}/diff_diff/diagnostics.py +0 -0
- {diff_diff-2.9.0 → diff_diff-2.9.1}/diff_diff/efficient_did.py +0 -0
- {diff_diff-2.9.0 → diff_diff-2.9.1}/diff_diff/efficient_did_bootstrap.py +0 -0
- {diff_diff-2.9.0 → diff_diff-2.9.1}/diff_diff/efficient_did_covariates.py +0 -0
- {diff_diff-2.9.0 → diff_diff-2.9.1}/diff_diff/efficient_did_results.py +0 -0
- {diff_diff-2.9.0 → diff_diff-2.9.1}/diff_diff/efficient_did_weights.py +0 -0
- {diff_diff-2.9.0 → diff_diff-2.9.1}/diff_diff/estimators.py +0 -0
- {diff_diff-2.9.0 → diff_diff-2.9.1}/diff_diff/honest_did.py +0 -0
- {diff_diff-2.9.0 → diff_diff-2.9.1}/diff_diff/imputation.py +0 -0
- {diff_diff-2.9.0 → diff_diff-2.9.1}/diff_diff/imputation_bootstrap.py +0 -0
- {diff_diff-2.9.0 → diff_diff-2.9.1}/diff_diff/imputation_results.py +0 -0
- {diff_diff-2.9.0 → diff_diff-2.9.1}/diff_diff/linalg.py +0 -0
- {diff_diff-2.9.0 → diff_diff-2.9.1}/diff_diff/power.py +0 -0
- {diff_diff-2.9.0 → diff_diff-2.9.1}/diff_diff/practitioner.py +0 -0
- {diff_diff-2.9.0 → diff_diff-2.9.1}/diff_diff/prep.py +0 -0
- {diff_diff-2.9.0 → diff_diff-2.9.1}/diff_diff/pretrends.py +0 -0
- {diff_diff-2.9.0 → diff_diff-2.9.1}/diff_diff/results.py +0 -0
- {diff_diff-2.9.0 → diff_diff-2.9.1}/diff_diff/stacked_did.py +0 -0
- {diff_diff-2.9.0 → diff_diff-2.9.1}/diff_diff/stacked_did_results.py +0 -0
- {diff_diff-2.9.0 → diff_diff-2.9.1}/diff_diff/staggered.py +0 -0
- {diff_diff-2.9.0 → diff_diff-2.9.1}/diff_diff/staggered_aggregation.py +0 -0
- {diff_diff-2.9.0 → diff_diff-2.9.1}/diff_diff/staggered_bootstrap.py +0 -0
- {diff_diff-2.9.0 → diff_diff-2.9.1}/diff_diff/staggered_results.py +0 -0
- {diff_diff-2.9.0 → diff_diff-2.9.1}/diff_diff/staggered_triple_diff.py +0 -0
- {diff_diff-2.9.0 → diff_diff-2.9.1}/diff_diff/staggered_triple_diff_results.py +0 -0
- {diff_diff-2.9.0 → diff_diff-2.9.1}/diff_diff/sun_abraham.py +0 -0
- {diff_diff-2.9.0 → diff_diff-2.9.1}/diff_diff/survey.py +0 -0
- {diff_diff-2.9.0 → diff_diff-2.9.1}/diff_diff/synthetic_did.py +0 -0
- {diff_diff-2.9.0 → diff_diff-2.9.1}/diff_diff/triple_diff.py +0 -0
- {diff_diff-2.9.0 → diff_diff-2.9.1}/diff_diff/trop.py +0 -0
- {diff_diff-2.9.0 → diff_diff-2.9.1}/diff_diff/trop_global.py +0 -0
- {diff_diff-2.9.0 → diff_diff-2.9.1}/diff_diff/trop_local.py +0 -0
- {diff_diff-2.9.0 → diff_diff-2.9.1}/diff_diff/trop_results.py +0 -0
- {diff_diff-2.9.0 → diff_diff-2.9.1}/diff_diff/twfe.py +0 -0
- {diff_diff-2.9.0 → diff_diff-2.9.1}/diff_diff/two_stage.py +0 -0
- {diff_diff-2.9.0 → diff_diff-2.9.1}/diff_diff/two_stage_bootstrap.py +0 -0
- {diff_diff-2.9.0 → diff_diff-2.9.1}/diff_diff/two_stage_results.py +0 -0
- {diff_diff-2.9.0 → diff_diff-2.9.1}/diff_diff/utils.py +0 -0
- {diff_diff-2.9.0 → diff_diff-2.9.1}/diff_diff/visualization/__init__.py +0 -0
- {diff_diff-2.9.0 → diff_diff-2.9.1}/diff_diff/visualization/_common.py +0 -0
- {diff_diff-2.9.0 → diff_diff-2.9.1}/diff_diff/visualization/_continuous.py +0 -0
- {diff_diff-2.9.0 → diff_diff-2.9.1}/diff_diff/visualization/_diagnostic.py +0 -0
- {diff_diff-2.9.0 → diff_diff-2.9.1}/diff_diff/visualization/_event_study.py +0 -0
- {diff_diff-2.9.0 → diff_diff-2.9.1}/diff_diff/visualization/_power.py +0 -0
- {diff_diff-2.9.0 → diff_diff-2.9.1}/diff_diff/visualization/_staggered.py +0 -0
- {diff_diff-2.9.0 → diff_diff-2.9.1}/diff_diff/visualization/_synthetic.py +0 -0
- {diff_diff-2.9.0 → diff_diff-2.9.1}/diff_diff/wooldridge.py +0 -0
- {diff_diff-2.9.0 → diff_diff-2.9.1}/diff_diff/wooldridge_results.py +0 -0
- {diff_diff-2.9.0 → diff_diff-2.9.1}/rust/build.rs +0 -0
- {diff_diff-2.9.0 → diff_diff-2.9.1}/rust/src/bootstrap.rs +0 -0
- {diff_diff-2.9.0 → diff_diff-2.9.1}/rust/src/lib.rs +0 -0
- {diff_diff-2.9.0 → diff_diff-2.9.1}/rust/src/linalg.rs +0 -0
- {diff_diff-2.9.0 → diff_diff-2.9.1}/rust/src/trop.rs +0 -0
- {diff_diff-2.9.0 → diff_diff-2.9.1}/rust/src/weights.rs +0 -0
|
@@ -1129,6 +1129,37 @@ def generate_staggered_ddd_data(
|
|
|
1129
1129
|
return pd.DataFrame(records)
|
|
1130
1130
|
|
|
1131
1131
|
|
|
1132
|
+
def _rank_pair_weights(
    unit_weight: np.ndarray,
    unit_stratum: np.ndarray,
    y0: np.ndarray,
    n_strata: int,
) -> None:
    """Re-assign weights inside each stratum so they increase with Y(0).

    ``unit_weight`` is modified in place; nothing is returned. Strata are
    visited as the labels ``0 .. n_strata - 1`` and any stratum holding
    zero or one unit is left untouched.

    Parameters
    ----------
    unit_weight : np.ndarray
        Per-unit weights, overwritten stratum by stratum.
    unit_stratum : np.ndarray
        Per-unit stratum label, compared against ``range(n_strata)``.
    y0 : np.ndarray
        Per-unit untreated outcome used for the ranking.
    n_strata : int
        Number of stratum labels to iterate over.

    Notes
    -----
    Two cases per stratum:

    * Weights are (numerically) constant: new weights are proportional to
      the ascending rank of Y(0), rescaled so the stratum keeps its
      original mean weight.
    * Weights already vary: the existing weight values are permuted so the
      unit with the largest Y(0) receives the largest weight, the second
      largest Y(0) the second largest weight, and so on.
    """
    for stratum in range(n_strata):
        members = np.flatnonzero(unit_stratum == stratum)
        if members.size <= 1:
            continue

        # Advanced indexing yields fresh arrays, so these snapshots are
        # unaffected by the in-place writes to ``unit_weight`` below.
        stratum_w = unit_weight[members]
        stratum_y0 = y0[members]

        weights_are_flat = stratum_w.std() < 1e-10
        if not weights_are_flat:
            # Permute the existing weights: position k of the descending
            # Y(0) ordering receives the k-th heaviest weight.
            by_y0_desc = np.argsort(-stratum_y0)
            heaviest_first = np.sort(stratum_w)[::-1]
            unit_weight[members[by_y0_desc]] = heaviest_first
            continue

        # Flat weights: build 1-based ascending ranks of Y(0) (double
        # argsort) and rescale them to the stratum's original mean weight.
        ranks = np.argsort(np.argsort(stratum_y0)).astype(float) + 1.0
        unit_weight[members] = ranks / ranks.mean() * stratum_w.mean()
|
|
1161
|
+
|
|
1162
|
+
|
|
1132
1163
|
def generate_survey_did_data(
|
|
1133
1164
|
n_units: int = 200,
|
|
1134
1165
|
n_periods: int = 8,
|
|
@@ -1149,6 +1180,15 @@ def generate_survey_did_data(
|
|
|
1149
1180
|
add_covariates: bool = False,
|
|
1150
1181
|
panel: bool = True,
|
|
1151
1182
|
seed: Optional[int] = None,
|
|
1183
|
+
# --- Research-grade DGP parameters ---
|
|
1184
|
+
icc: Optional[float] = None,
|
|
1185
|
+
weight_cv: Optional[float] = None,
|
|
1186
|
+
informative_sampling: bool = False,
|
|
1187
|
+
heterogeneous_te_by_strata: bool = False,
|
|
1188
|
+
strata_sizes: Optional[List[int]] = None,
|
|
1189
|
+
return_true_population_att: bool = False,
|
|
1190
|
+
covariate_effects: Optional[tuple] = None,
|
|
1191
|
+
te_covariate_interaction: float = 0.0,
|
|
1152
1192
|
) -> pd.DataFrame:
|
|
1153
1193
|
"""
|
|
1154
1194
|
Generate synthetic staggered DiD data with survey structure.
|
|
@@ -1215,6 +1255,52 @@ def generate_survey_did_data(
|
|
|
1215
1255
|
CallawaySantAnna(panel=False)).
|
|
1216
1256
|
seed : int, optional
|
|
1217
1257
|
Random seed for reproducibility.
|
|
1258
|
+
icc : float, optional
|
|
1259
|
+
Target intra-class correlation coefficient (0 < icc < 1). Overrides
|
|
1260
|
+
``psu_re_sd`` via the variance decomposition:
|
|
1261
|
+
``psu_re_sd = sqrt(icc * (sigma2_unit + sigma2_noise + sigma2_cov) /
|
|
1262
|
+
((1 - icc) * (1 + psu_period_factor^2)))`` where ``sigma2_cov``
|
|
1263
|
+
includes covariate variance when ``add_covariates=True``.
|
|
1264
|
+
Cannot be combined with a non-default ``psu_re_sd``.
|
|
1265
|
+
weight_cv : float, optional
|
|
1266
|
+
Target coefficient of variation for sampling weights. Generates
|
|
1267
|
+
LogNormal weights normalized to mean 1, bypassing ``weight_variation``.
|
|
1268
|
+
Cannot be combined with a non-default ``weight_variation``.
|
|
1269
|
+
informative_sampling : bool, default=False
|
|
1270
|
+
If True, sampling weights correlate with Y(0) — high-outcome units
|
|
1271
|
+
receive higher weights (under-coverage → larger inverse-selection-
|
|
1272
|
+
probability weights). Uses rank-pairing within each stratum. For
|
|
1273
|
+
panel data, ranking is done once from period-1 outcomes. For
|
|
1274
|
+
repeated cross-sections, ranking is refreshed each period. Within
|
|
1275
|
+
each stratum, rank-based weights are scaled to preserve the
|
|
1276
|
+
stratum's baseline weight level from ``weight_variation``.
|
|
1277
|
+
When ``add_covariates=True``, covariate contributions are
|
|
1278
|
+
included in the Y(0) ranking.
|
|
1279
|
+
heterogeneous_te_by_strata : bool, default=False
|
|
1280
|
+
If True, treatment effect varies by stratum:
|
|
1281
|
+
``TE_h = TE * (1 + 0.5 * (h - mean) / std)``. Creates a gap
|
|
1282
|
+
between unweighted and population ATT. With ``n_strata=1``,
|
|
1283
|
+
all units receive the base ``treatment_effect``.
|
|
1284
|
+
strata_sizes : list of int, optional
|
|
1285
|
+
Custom per-stratum unit counts. Must have length ``n_strata`` and
|
|
1286
|
+
sum to ``n_units``. Replaces equal allocation across strata.
|
|
1287
|
+
return_true_population_att : bool, default=False
|
|
1288
|
+
If True, attaches a diagnostic dict to ``df.attrs["dgp_truth"]``
|
|
1289
|
+
with keys: ``population_att`` (weight-weighted average of treated
|
|
1290
|
+
true effects), ``deff_kish`` (1 + CV(w)^2), ``base_stratum_effects``
|
|
1291
|
+
(base stratum TEs before dynamic/covariate modifiers),
|
|
1292
|
+
``icc_realized`` (ANOVA-based
|
|
1293
|
+
ICC computed on period-1 data).
|
|
1294
|
+
covariate_effects : tuple of (float, float), optional
|
|
1295
|
+
Coefficients ``(beta1, beta2)`` for covariates x1 and x2 in the
|
|
1296
|
+
outcome equation ``y += beta1 * x1 + beta2 * x2``. Default uses
|
|
1297
|
+
``(0.5, 0.3)``. Only used when ``add_covariates=True``. The ICC
|
|
1298
|
+
calibration automatically adjusts for the implied covariate variance.
|
|
1299
|
+
te_covariate_interaction : float, default=0.0
|
|
1300
|
+
Coefficient for treatment-by-covariate interaction:
|
|
1301
|
+
``TE_i = base_TE + te_covariate_interaction * x1_i``. Creates
|
|
1302
|
+
unit-level treatment effect heterogeneity driven by the continuous
|
|
1303
|
+
covariate. Requires ``add_covariates=True``.
|
|
1218
1304
|
|
|
1219
1305
|
Returns
|
|
1220
1306
|
-------
|
|
@@ -1222,6 +1308,8 @@ def generate_survey_did_data(
|
|
|
1222
1308
|
Columns: unit, period, outcome, first_treat, treated, true_effect,
|
|
1223
1309
|
stratum, psu, fpc, weight. Also rep_0..rep_K if
|
|
1224
1310
|
include_replicate_weights=True, and x1, x2 if add_covariates=True.
|
|
1311
|
+
If ``return_true_population_att=True``, ``df.attrs["dgp_truth"]``
|
|
1312
|
+
contains DGP diagnostics.
|
|
1225
1313
|
"""
|
|
1226
1314
|
rng = np.random.default_rng(seed)
|
|
1227
1315
|
|
|
@@ -1284,30 +1372,120 @@ def generate_survey_did_data(
|
|
|
1284
1372
|
f"weight_variation must be one of {valid_wv}, got {weight_variation!r}"
|
|
1285
1373
|
)
|
|
1286
1374
|
|
|
1375
|
+
# --- Validate research-grade DGP parameters ---
|
|
1376
|
+
if icc is not None:
|
|
1377
|
+
if not (0 < icc < 1):
|
|
1378
|
+
raise ValueError(f"icc must be between 0 and 1 (exclusive), got {icc}")
|
|
1379
|
+
if psu_re_sd != 2.0:
|
|
1380
|
+
raise ValueError(
|
|
1381
|
+
"Cannot specify both icc and a non-default psu_re_sd. "
|
|
1382
|
+
"icc overrides psu_re_sd via the ICC formula."
|
|
1383
|
+
)
|
|
1384
|
+
|
|
1385
|
+
if weight_cv is not None:
|
|
1386
|
+
if not np.isfinite(weight_cv) or weight_cv <= 0:
|
|
1387
|
+
raise ValueError(
|
|
1388
|
+
f"weight_cv must be finite and positive, got {weight_cv}"
|
|
1389
|
+
)
|
|
1390
|
+
if weight_variation != "moderate":
|
|
1391
|
+
raise ValueError(
|
|
1392
|
+
"Cannot specify both weight_cv and a non-default "
|
|
1393
|
+
"weight_variation. weight_cv overrides weight_variation."
|
|
1394
|
+
)
|
|
1395
|
+
|
|
1396
|
+
if strata_sizes is not None:
|
|
1397
|
+
strata_sizes = list(strata_sizes)
|
|
1398
|
+
for ss in strata_sizes:
|
|
1399
|
+
if isinstance(ss, bool) or not isinstance(ss, (int, np.integer)):
|
|
1400
|
+
raise ValueError(
|
|
1401
|
+
f"strata_sizes must contain integers, got {ss!r}"
|
|
1402
|
+
)
|
|
1403
|
+
if len(strata_sizes) != n_strata:
|
|
1404
|
+
raise ValueError(
|
|
1405
|
+
f"strata_sizes must have length n_strata={n_strata}, "
|
|
1406
|
+
f"got {len(strata_sizes)}"
|
|
1407
|
+
)
|
|
1408
|
+
if any(s < 1 for s in strata_sizes):
|
|
1409
|
+
raise ValueError("All strata_sizes must be >= 1")
|
|
1410
|
+
if sum(strata_sizes) != n_units:
|
|
1411
|
+
raise ValueError(
|
|
1412
|
+
f"strata_sizes must sum to n_units={n_units}, "
|
|
1413
|
+
f"got {sum(strata_sizes)}"
|
|
1414
|
+
)
|
|
1415
|
+
|
|
1416
|
+
# --- Validate and resolve covariate coefficients ---
|
|
1417
|
+
if covariate_effects is not None:
|
|
1418
|
+
covariate_effects = tuple(covariate_effects)
|
|
1419
|
+
if len(covariate_effects) != 2:
|
|
1420
|
+
raise ValueError(
|
|
1421
|
+
f"covariate_effects must have length 2, got {len(covariate_effects)}"
|
|
1422
|
+
)
|
|
1423
|
+
if not all(np.isfinite(c) for c in covariate_effects):
|
|
1424
|
+
raise ValueError(
|
|
1425
|
+
f"covariate_effects must be finite, got {covariate_effects}"
|
|
1426
|
+
)
|
|
1427
|
+
_beta1, _beta2 = covariate_effects if covariate_effects is not None else (0.5, 0.3)
|
|
1428
|
+
|
|
1429
|
+
if not np.isfinite(te_covariate_interaction):
|
|
1430
|
+
raise ValueError(
|
|
1431
|
+
f"te_covariate_interaction must be finite, got {te_covariate_interaction}"
|
|
1432
|
+
)
|
|
1433
|
+
if te_covariate_interaction != 0.0 and not add_covariates:
|
|
1434
|
+
raise ValueError(
|
|
1435
|
+
"te_covariate_interaction requires add_covariates=True"
|
|
1436
|
+
)
|
|
1437
|
+
|
|
1438
|
+
# --- ICC -> psu_re_sd resolution ---
|
|
1439
|
+
if icc is not None:
|
|
1440
|
+
# Covariate variance: Var(beta1*x1) + Var(beta2*x2)
|
|
1441
|
+
# where x1 ~ N(0,1), x2 ~ Bernoulli(0.5)
|
|
1442
|
+
cov_var = (_beta1**2 * 1.0 + _beta2**2 * 0.25) if add_covariates else 0.0
|
|
1443
|
+
non_psu_var = unit_fe_sd**2 + noise_sd**2 + cov_var
|
|
1444
|
+
if non_psu_var < 1e-12:
|
|
1445
|
+
raise ValueError(
|
|
1446
|
+
"icc requires non-zero non-PSU variance "
|
|
1447
|
+
"(unit_fe_sd, noise_sd, or add_covariates must contribute variance)"
|
|
1448
|
+
)
|
|
1449
|
+
psu_re_sd = np.sqrt(
|
|
1450
|
+
icc * non_psu_var
|
|
1451
|
+
/ ((1 - icc) * (1 + psu_period_factor**2))
|
|
1452
|
+
)
|
|
1453
|
+
|
|
1287
1454
|
# --- Survey structure: assign units to strata and PSUs ---
|
|
1288
1455
|
n_psu_total = n_strata * psu_per_stratum
|
|
1289
|
-
|
|
1290
|
-
|
|
1456
|
+
|
|
1457
|
+
if strata_sizes is not None:
|
|
1458
|
+
stratum_n = strata_sizes
|
|
1459
|
+
else:
|
|
1460
|
+
units_per_stratum = n_units // n_strata
|
|
1461
|
+
remainder = n_units % n_strata
|
|
1462
|
+
stratum_n = [
|
|
1463
|
+
units_per_stratum + (1 if s < remainder else 0)
|
|
1464
|
+
for s in range(n_strata)
|
|
1465
|
+
]
|
|
1291
1466
|
|
|
1292
1467
|
unit_stratum = np.empty(n_units, dtype=int)
|
|
1293
1468
|
unit_psu = np.empty(n_units, dtype=int)
|
|
1294
1469
|
idx = 0
|
|
1295
1470
|
for s in range(n_strata):
|
|
1296
|
-
|
|
1297
|
-
n_s = units_per_stratum + (1 if s < remainder else 0)
|
|
1471
|
+
n_s = stratum_n[s]
|
|
1298
1472
|
unit_stratum[idx : idx + n_s] = s
|
|
1299
|
-
|
|
1300
|
-
# Assign PSUs within this stratum
|
|
1301
1473
|
psu_start = s * psu_per_stratum
|
|
1302
1474
|
for j in range(n_s):
|
|
1303
1475
|
unit_psu[idx + j] = psu_start + (j % psu_per_stratum)
|
|
1304
1476
|
idx += n_s
|
|
1305
1477
|
|
|
1306
|
-
# Sampling weights
|
|
1307
|
-
|
|
1308
|
-
|
|
1309
|
-
|
|
1310
|
-
|
|
1478
|
+
# Sampling weights
|
|
1479
|
+
if weight_cv is not None:
|
|
1480
|
+
sigma_ln = np.sqrt(np.log(1 + weight_cv**2))
|
|
1481
|
+
raw_w = rng.lognormal(-sigma_ln**2 / 2, sigma_ln, size=n_units)
|
|
1482
|
+
unit_weight = raw_w / raw_w.mean()
|
|
1483
|
+
else:
|
|
1484
|
+
# Stratum-based weights (inverse selection probability)
|
|
1485
|
+
scale_map = {"none": 0.0, "moderate": 1.0, "high": 3.0}
|
|
1486
|
+
scale = scale_map.get(weight_variation, 1.0)
|
|
1487
|
+
denom = max(n_strata - 1, 1)
|
|
1488
|
+
unit_weight = 1.0 + scale * (unit_stratum / denom)
|
|
1311
1489
|
|
|
1312
1490
|
# --- Treatment assignment (cohort structure) ---
|
|
1313
1491
|
n_never = int(n_units * never_treated_frac)
|
|
@@ -1344,6 +1522,37 @@ def generate_survey_did_data(
|
|
|
1344
1522
|
0, psu_re_sd * psu_period_factor, size=(n_psu_total, n_periods)
|
|
1345
1523
|
)
|
|
1346
1524
|
|
|
1525
|
+
# --- Informative sampling (panel path): pre-draw FEs, rank-pair weights ---
|
|
1526
|
+
if informative_sampling and panel:
|
|
1527
|
+
_panel_unit_fe = rng.normal(0, unit_fe_sd, size=n_units)
|
|
1528
|
+
y0_period1 = (
|
|
1529
|
+
_panel_unit_fe
|
|
1530
|
+
+ psu_re[unit_psu]
|
|
1531
|
+
+ psu_period_re[unit_psu, 0]
|
|
1532
|
+
+ 0.5
|
|
1533
|
+
)
|
|
1534
|
+
if add_covariates:
|
|
1535
|
+
_panel_x1 = rng.normal(0, 1, size=n_units)
|
|
1536
|
+
_panel_x2 = rng.choice([0, 1], size=n_units)
|
|
1537
|
+
y0_period1 = y0_period1 + _beta1 * _panel_x1 + _beta2 * _panel_x2
|
|
1538
|
+
_rank_pair_weights(unit_weight, unit_stratum, y0_period1, n_strata)
|
|
1539
|
+
|
|
1540
|
+
# Save base weights for cross-section informative sampling (reset each period)
|
|
1541
|
+
if informative_sampling and not panel:
|
|
1542
|
+
_base_weight = unit_weight.copy()
|
|
1543
|
+
|
|
1544
|
+
# --- Heterogeneous treatment effects by stratum ---
|
|
1545
|
+
if heterogeneous_te_by_strata:
|
|
1546
|
+
if n_strata == 1:
|
|
1547
|
+
te_by_stratum = np.array([treatment_effect])
|
|
1548
|
+
else:
|
|
1549
|
+
strata_idx = np.arange(n_strata, dtype=float)
|
|
1550
|
+
te_by_stratum = treatment_effect * (
|
|
1551
|
+
1 + 0.5 * (strata_idx - strata_idx.mean()) / strata_idx.std()
|
|
1552
|
+
)
|
|
1553
|
+
else:
|
|
1554
|
+
te_by_stratum = None
|
|
1555
|
+
|
|
1347
1556
|
# --- Generate panel or repeated cross-sections ---
|
|
1348
1557
|
records = []
|
|
1349
1558
|
for t in range(1, n_periods + 1):
|
|
@@ -1351,21 +1560,47 @@ def generate_survey_did_data(
|
|
|
1351
1560
|
unit_fe = rng.normal(0, unit_fe_sd, size=n_units)
|
|
1352
1561
|
if panel and t > 1:
|
|
1353
1562
|
pass # reuse unit_fe from first period (set below)
|
|
1354
|
-
if
|
|
1563
|
+
if informative_sampling and panel:
|
|
1564
|
+
unit_fe = _panel_unit_fe # use pre-drawn FEs
|
|
1565
|
+
elif panel and t == 1:
|
|
1355
1566
|
_panel_unit_fe = unit_fe # save for reuse
|
|
1356
|
-
|
|
1567
|
+
elif panel and t > 1:
|
|
1357
1568
|
unit_fe = _panel_unit_fe # type: ignore[possibly-undefined]
|
|
1358
1569
|
|
|
1359
|
-
|
|
1360
|
-
if
|
|
1570
|
+
# Cross-section informative sampling: re-rank weights each period
|
|
1571
|
+
if informative_sampling and not panel:
|
|
1572
|
+
# Draw covariates early so they can be included in Y(0) ranking
|
|
1573
|
+
if add_covariates:
|
|
1574
|
+
x1 = rng.normal(0, 1, size=n_units)
|
|
1575
|
+
x2 = rng.choice([0, 1], size=n_units)
|
|
1576
|
+
unit_weight = _base_weight.copy() # type: ignore[possibly-undefined]
|
|
1577
|
+
y0_t = (
|
|
1578
|
+
unit_fe
|
|
1579
|
+
+ psu_re[unit_psu]
|
|
1580
|
+
+ psu_period_re[unit_psu, t - 1]
|
|
1581
|
+
+ 0.5 * t
|
|
1582
|
+
)
|
|
1583
|
+
if add_covariates:
|
|
1584
|
+
y0_t = y0_t + _beta1 * x1 + _beta2 * x2
|
|
1585
|
+
_rank_pair_weights(unit_weight, unit_stratum, y0_t, n_strata)
|
|
1586
|
+
|
|
1587
|
+
# Covariates — may already be drawn by informative sampling above
|
|
1588
|
+
if informative_sampling and panel and add_covariates:
|
|
1589
|
+
x1 = _panel_x1 # pre-drawn before loop for ranking
|
|
1590
|
+
x2 = _panel_x2
|
|
1591
|
+
elif informative_sampling and not panel and add_covariates:
|
|
1592
|
+
pass # x1, x2 already drawn in cross-section ranking block
|
|
1593
|
+
elif add_covariates:
|
|
1594
|
+
x1 = rng.normal(0, 1, size=n_units)
|
|
1595
|
+
x2 = rng.choice([0, 1], size=n_units)
|
|
1596
|
+
else:
|
|
1597
|
+
x1 = None
|
|
1598
|
+
x2 = None
|
|
1599
|
+
if not informative_sampling and panel and t > 1 and add_covariates:
|
|
1361
1600
|
x1 = _panel_x1 # type: ignore[possibly-undefined]
|
|
1362
|
-
elif panel and t == 1 and add_covariates:
|
|
1363
|
-
_panel_x1 = x1
|
|
1364
|
-
|
|
1365
|
-
x2 = rng.choice([0, 1], size=n_units) if add_covariates else None
|
|
1366
|
-
if panel and t > 1 and add_covariates:
|
|
1367
1601
|
x2 = _panel_x2 # type: ignore[possibly-undefined]
|
|
1368
|
-
elif panel and t == 1 and add_covariates:
|
|
1602
|
+
elif not informative_sampling and panel and t == 1 and add_covariates:
|
|
1603
|
+
_panel_x1 = x1
|
|
1369
1604
|
_panel_x2 = x2
|
|
1370
1605
|
|
|
1371
1606
|
for i in range(n_units):
|
|
@@ -1374,12 +1609,17 @@ def generate_survey_did_data(
|
|
|
1374
1609
|
y = unit_fe[i] + psu_re[unit_psu[i]] + psu_period_re[unit_psu[i], t - 1] + 0.5 * t
|
|
1375
1610
|
|
|
1376
1611
|
if add_covariates:
|
|
1377
|
-
y +=
|
|
1612
|
+
y += _beta1 * x1[i] + _beta2 * x2[i]
|
|
1378
1613
|
|
|
1379
1614
|
treated = int(g_i > 0 and t >= g_i)
|
|
1380
1615
|
true_eff = 0.0
|
|
1381
1616
|
if treated:
|
|
1382
|
-
|
|
1617
|
+
if te_by_stratum is not None:
|
|
1618
|
+
true_eff = float(te_by_stratum[unit_stratum[i]])
|
|
1619
|
+
else:
|
|
1620
|
+
true_eff = treatment_effect
|
|
1621
|
+
if te_covariate_interaction != 0.0:
|
|
1622
|
+
true_eff += te_covariate_interaction * x1[i]
|
|
1383
1623
|
if dynamic_effects:
|
|
1384
1624
|
true_eff *= 1 + effect_growth * (t - g_i)
|
|
1385
1625
|
y += true_eff
|
|
@@ -1426,4 +1666,53 @@ def generate_survey_did_data(
|
|
|
1426
1666
|
w_r[w_r > 0] *= n_rep / (n_rep - 1)
|
|
1427
1667
|
df[f"rep_{r}"] = w_r
|
|
1428
1668
|
|
|
1669
|
+
# --- DGP truth diagnostics ---
|
|
1670
|
+
if return_true_population_att:
|
|
1671
|
+
treated_mask = df["treated"] == 1
|
|
1672
|
+
if treated_mask.any():
|
|
1673
|
+
w_treated = df.loc[treated_mask, "weight"].values
|
|
1674
|
+
te_treated = df.loc[treated_mask, "true_effect"].values
|
|
1675
|
+
population_att = float(np.average(te_treated, weights=w_treated))
|
|
1676
|
+
else:
|
|
1677
|
+
population_att = float("nan")
|
|
1678
|
+
|
|
1679
|
+
if te_by_stratum is not None:
|
|
1680
|
+
stratum_effects = {
|
|
1681
|
+
int(s): float(te_by_stratum[s]) for s in range(n_strata)
|
|
1682
|
+
}
|
|
1683
|
+
else:
|
|
1684
|
+
stratum_effects = {
|
|
1685
|
+
int(s): float(treatment_effect) for s in range(n_strata)
|
|
1686
|
+
}
|
|
1687
|
+
|
|
1688
|
+
# Kish DEFF from weight variation
|
|
1689
|
+
w_all = df.groupby("unit")["weight"].first().values
|
|
1690
|
+
cv_w = float(w_all.std() / w_all.mean()) if w_all.mean() > 0 else 0.0
|
|
1691
|
+
deff_kish = 1 + cv_w**2
|
|
1692
|
+
|
|
1693
|
+
# Realized ICC (ANOVA-based, period-1 only to avoid TE contamination)
|
|
1694
|
+
_p1 = df[df["period"] == 1]
|
|
1695
|
+
_groups = _p1.groupby("psu")["outcome"]
|
|
1696
|
+
_n_total = len(_p1)
|
|
1697
|
+
_n_groups = _groups.ngroups
|
|
1698
|
+
# ICC undefined with < 2 groups or no within-group replication
|
|
1699
|
+
if _n_groups < 2 or _n_total <= _n_groups:
|
|
1700
|
+
icc_realized = float("nan")
|
|
1701
|
+
else:
|
|
1702
|
+
_n_bar = _n_total / _n_groups
|
|
1703
|
+
_grand_mean = _p1["outcome"].mean()
|
|
1704
|
+
_ssb = (_groups.size() * (_groups.mean() - _grand_mean) ** 2).sum()
|
|
1705
|
+
_msb = _ssb / (_n_groups - 1)
|
|
1706
|
+
_ssw = _groups.apply(lambda x: ((x - x.mean()) ** 2).sum()).sum()
|
|
1707
|
+
_msw = _ssw / (_n_total - _n_groups)
|
|
1708
|
+
_denom = _msb + (_n_bar - 1) * _msw
|
|
1709
|
+
icc_realized = float((_msb - _msw) / _denom) if _denom > 0 else float("nan")
|
|
1710
|
+
|
|
1711
|
+
df.attrs["dgp_truth"] = {
|
|
1712
|
+
"population_att": population_att,
|
|
1713
|
+
"deff_kish": float(deff_kish),
|
|
1714
|
+
"base_stratum_effects": stratum_effects,
|
|
1715
|
+
"icc_realized": icc_realized,
|
|
1716
|
+
}
|
|
1717
|
+
|
|
1429
1718
|
return df
|
|
@@ -4,7 +4,7 @@ build-backend = "maturin"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "diff-diff"
|
|
7
|
-
version = "2.9.0"
|
|
7
|
+
version = "2.9.1"
|
|
8
8
|
description = "Difference-in-Differences causal inference with sklearn-like API. Callaway-Sant'Anna, Synthetic DiD, Honest DiD, event studies, parallel trends."
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
license = "MIT"
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|