diff-diff 2.9.0__tar.gz → 3.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (68) hide show
  1. {diff_diff-2.9.0 → diff_diff-3.0.0}/PKG-INFO +1 -1
  2. {diff_diff-2.9.0 → diff_diff-3.0.0}/diff_diff/__init__.py +1 -1
  3. {diff_diff-2.9.0 → diff_diff-3.0.0}/diff_diff/linalg.py +64 -2
  4. {diff_diff-2.9.0 → diff_diff-3.0.0}/diff_diff/prep_dgp.py +312 -23
  5. {diff_diff-2.9.0 → diff_diff-3.0.0}/diff_diff/staggered.py +1 -20
  6. {diff_diff-2.9.0 → diff_diff-3.0.0}/diff_diff/staggered_bootstrap.py +4 -4
  7. {diff_diff-2.9.0 → diff_diff-3.0.0}/diff_diff/staggered_triple_diff.py +1 -2
  8. {diff_diff-2.9.0 → diff_diff-3.0.0}/diff_diff/synthetic_did.py +4 -3
  9. {diff_diff-2.9.0 → diff_diff-3.0.0}/diff_diff/trop.py +4 -37
  10. {diff_diff-2.9.0 → diff_diff-3.0.0}/diff_diff/wooldridge.py +267 -72
  11. {diff_diff-2.9.0 → diff_diff-3.0.0}/diff_diff/wooldridge_results.py +9 -1
  12. {diff_diff-2.9.0 → diff_diff-3.0.0}/pyproject.toml +1 -1
  13. {diff_diff-2.9.0 → diff_diff-3.0.0}/rust/Cargo.lock +1 -1
  14. {diff_diff-2.9.0 → diff_diff-3.0.0}/rust/Cargo.toml +1 -1
  15. {diff_diff-2.9.0 → diff_diff-3.0.0}/README.md +0 -0
  16. {diff_diff-2.9.0 → diff_diff-3.0.0}/diff_diff/_backend.py +0 -0
  17. {diff_diff-2.9.0 → diff_diff-3.0.0}/diff_diff/bacon.py +0 -0
  18. {diff_diff-2.9.0 → diff_diff-3.0.0}/diff_diff/bootstrap_utils.py +0 -0
  19. {diff_diff-2.9.0 → diff_diff-3.0.0}/diff_diff/continuous_did.py +0 -0
  20. {diff_diff-2.9.0 → diff_diff-3.0.0}/diff_diff/continuous_did_bspline.py +0 -0
  21. {diff_diff-2.9.0 → diff_diff-3.0.0}/diff_diff/continuous_did_results.py +0 -0
  22. {diff_diff-2.9.0 → diff_diff-3.0.0}/diff_diff/datasets.py +0 -0
  23. {diff_diff-2.9.0 → diff_diff-3.0.0}/diff_diff/diagnostics.py +0 -0
  24. {diff_diff-2.9.0 → diff_diff-3.0.0}/diff_diff/efficient_did.py +0 -0
  25. {diff_diff-2.9.0 → diff_diff-3.0.0}/diff_diff/efficient_did_bootstrap.py +0 -0
  26. {diff_diff-2.9.0 → diff_diff-3.0.0}/diff_diff/efficient_did_covariates.py +0 -0
  27. {diff_diff-2.9.0 → diff_diff-3.0.0}/diff_diff/efficient_did_results.py +0 -0
  28. {diff_diff-2.9.0 → diff_diff-3.0.0}/diff_diff/efficient_did_weights.py +0 -0
  29. {diff_diff-2.9.0 → diff_diff-3.0.0}/diff_diff/estimators.py +0 -0
  30. {diff_diff-2.9.0 → diff_diff-3.0.0}/diff_diff/honest_did.py +0 -0
  31. {diff_diff-2.9.0 → diff_diff-3.0.0}/diff_diff/imputation.py +0 -0
  32. {diff_diff-2.9.0 → diff_diff-3.0.0}/diff_diff/imputation_bootstrap.py +0 -0
  33. {diff_diff-2.9.0 → diff_diff-3.0.0}/diff_diff/imputation_results.py +0 -0
  34. {diff_diff-2.9.0 → diff_diff-3.0.0}/diff_diff/power.py +0 -0
  35. {diff_diff-2.9.0 → diff_diff-3.0.0}/diff_diff/practitioner.py +0 -0
  36. {diff_diff-2.9.0 → diff_diff-3.0.0}/diff_diff/prep.py +0 -0
  37. {diff_diff-2.9.0 → diff_diff-3.0.0}/diff_diff/pretrends.py +0 -0
  38. {diff_diff-2.9.0 → diff_diff-3.0.0}/diff_diff/results.py +0 -0
  39. {diff_diff-2.9.0 → diff_diff-3.0.0}/diff_diff/stacked_did.py +0 -0
  40. {diff_diff-2.9.0 → diff_diff-3.0.0}/diff_diff/stacked_did_results.py +0 -0
  41. {diff_diff-2.9.0 → diff_diff-3.0.0}/diff_diff/staggered_aggregation.py +0 -0
  42. {diff_diff-2.9.0 → diff_diff-3.0.0}/diff_diff/staggered_results.py +0 -0
  43. {diff_diff-2.9.0 → diff_diff-3.0.0}/diff_diff/staggered_triple_diff_results.py +0 -0
  44. {diff_diff-2.9.0 → diff_diff-3.0.0}/diff_diff/sun_abraham.py +0 -0
  45. {diff_diff-2.9.0 → diff_diff-3.0.0}/diff_diff/survey.py +0 -0
  46. {diff_diff-2.9.0 → diff_diff-3.0.0}/diff_diff/triple_diff.py +0 -0
  47. {diff_diff-2.9.0 → diff_diff-3.0.0}/diff_diff/trop_global.py +0 -0
  48. {diff_diff-2.9.0 → diff_diff-3.0.0}/diff_diff/trop_local.py +0 -0
  49. {diff_diff-2.9.0 → diff_diff-3.0.0}/diff_diff/trop_results.py +0 -0
  50. {diff_diff-2.9.0 → diff_diff-3.0.0}/diff_diff/twfe.py +0 -0
  51. {diff_diff-2.9.0 → diff_diff-3.0.0}/diff_diff/two_stage.py +0 -0
  52. {diff_diff-2.9.0 → diff_diff-3.0.0}/diff_diff/two_stage_bootstrap.py +0 -0
  53. {diff_diff-2.9.0 → diff_diff-3.0.0}/diff_diff/two_stage_results.py +0 -0
  54. {diff_diff-2.9.0 → diff_diff-3.0.0}/diff_diff/utils.py +0 -0
  55. {diff_diff-2.9.0 → diff_diff-3.0.0}/diff_diff/visualization/__init__.py +0 -0
  56. {diff_diff-2.9.0 → diff_diff-3.0.0}/diff_diff/visualization/_common.py +0 -0
  57. {diff_diff-2.9.0 → diff_diff-3.0.0}/diff_diff/visualization/_continuous.py +0 -0
  58. {diff_diff-2.9.0 → diff_diff-3.0.0}/diff_diff/visualization/_diagnostic.py +0 -0
  59. {diff_diff-2.9.0 → diff_diff-3.0.0}/diff_diff/visualization/_event_study.py +0 -0
  60. {diff_diff-2.9.0 → diff_diff-3.0.0}/diff_diff/visualization/_power.py +0 -0
  61. {diff_diff-2.9.0 → diff_diff-3.0.0}/diff_diff/visualization/_staggered.py +0 -0
  62. {diff_diff-2.9.0 → diff_diff-3.0.0}/diff_diff/visualization/_synthetic.py +0 -0
  63. {diff_diff-2.9.0 → diff_diff-3.0.0}/rust/build.rs +0 -0
  64. {diff_diff-2.9.0 → diff_diff-3.0.0}/rust/src/bootstrap.rs +0 -0
  65. {diff_diff-2.9.0 → diff_diff-3.0.0}/rust/src/lib.rs +0 -0
  66. {diff_diff-2.9.0 → diff_diff-3.0.0}/rust/src/linalg.rs +0 -0
  67. {diff_diff-2.9.0 → diff_diff-3.0.0}/rust/src/trop.rs +0 -0
  68. {diff_diff-2.9.0 → diff_diff-3.0.0}/rust/src/weights.rs +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: diff-diff
3
- Version: 2.9.0
3
+ Version: 3.0.0
4
4
  Classifier: Development Status :: 5 - Production/Stable
5
5
  Classifier: Intended Audience :: Science/Research
6
6
  Classifier: Operating System :: OS Independent
@@ -214,7 +214,7 @@ Bacon = BaconDecomposition
214
214
  EDiD = EfficientDiD
215
215
  ETWFE = WooldridgeDiD
216
216
 
217
- __version__ = "2.9.0"
217
+ __version__ = "3.0.0"
218
218
  __all__ = [
219
219
  # Estimators
220
220
  "DifferenceInDifferences",
@@ -2372,6 +2372,7 @@ def solve_poisson(
2372
2372
  tol: float = 1e-8,
2373
2373
  init_beta: Optional[np.ndarray] = None,
2374
2374
  rank_deficient_action: str = "warn",
2375
+ weights: Optional[np.ndarray] = None,
2375
2376
  ) -> Tuple[np.ndarray, np.ndarray]:
2376
2377
  """Poisson IRLS (Newton-Raphson with log link).
2377
2378
 
@@ -2389,6 +2390,9 @@ def solve_poisson(
2389
2390
  log(mean(y)) to improve convergence for large-scale outcomes.
2390
2391
  rank_deficient_action : {"warn", "error", "silent"}
2391
2392
  How to handle rank-deficient design matrices. Mirrors solve_ols/solve_logit.
2393
+ weights : (n,) optional observation weights (e.g. survey sampling weights).
2394
+ When provided, the weighted pseudo-log-likelihood is maximised:
2395
+ score = X'(w*(y - mu)), Hessian = X'diag(w*mu)X.
2392
2396
 
2393
2397
  Returns
2394
2398
  -------
@@ -2397,6 +2401,20 @@ def solve_poisson(
2397
2401
  """
2398
2402
  n, k_orig = X.shape
2399
2403
 
2404
+ # Validate weights (mirrors solve_logit validation)
2405
+ if weights is not None:
2406
+ weights = np.asarray(weights, dtype=np.float64)
2407
+ if weights.shape != (n,):
2408
+ raise ValueError(f"weights must have shape ({n},), got {weights.shape}")
2409
+ if np.any(np.isnan(weights)):
2410
+ raise ValueError("weights contain NaN values")
2411
+ if np.any(~np.isfinite(weights)):
2412
+ raise ValueError("weights contain Inf values")
2413
+ if np.any(weights < 0):
2414
+ raise ValueError("weights must be non-negative")
2415
+ if np.sum(weights) <= 0:
2416
+ raise ValueError("weights sum to zero — no observations have positive weight")
2417
+
2400
2418
  # Validate rank_deficient_action (same as solve_logit/solve_ols)
2401
2419
  valid_actions = ("warn", "error", "silent")
2402
2420
  if rank_deficient_action not in valid_actions:
@@ -2425,6 +2443,46 @@ def solve_poisson(
2425
2443
  X = X[:, kept_cols]
2426
2444
 
2427
2445
  n, k = X.shape
2446
+
2447
+ # Validate effective weighted sample when weights have zeros
2448
+ # (mirrors solve_logit's positive-weight safeguards)
2449
+ if weights is not None and np.any(weights == 0):
2450
+ pos_mask = weights > 0
2451
+ n_pos = int(np.sum(pos_mask))
2452
+ X_eff = X[pos_mask]
2453
+ eff_rank_info = _detect_rank_deficiency(X_eff)
2454
+ if len(eff_rank_info[1]) > 0:
2455
+ n_dropped_eff = len(eff_rank_info[1])
2456
+ if rank_deficient_action == "error":
2457
+ raise ValueError(
2458
+ f"Effective (positive-weight) sample is rank-deficient: "
2459
+ f"{n_dropped_eff} linearly dependent column(s). "
2460
+ f"Cannot identify Poisson model on this subpopulation."
2461
+ )
2462
+ elif rank_deficient_action == "warn":
2463
+ warnings.warn(
2464
+ f"Effective (positive-weight) sample is rank-deficient: "
2465
+ f"dropping {n_dropped_eff} column(s). Poisson estimates "
2466
+ f"may be unreliable on this subpopulation.",
2467
+ UserWarning,
2468
+ stacklevel=2,
2469
+ )
2470
+ eff_dropped = set(int(d) for d in eff_rank_info[1])
2471
+ eff_kept = np.array([i for i in range(k) if i not in eff_dropped])
2472
+ X = X[:, eff_kept]
2473
+ if len(dropped_cols) > 0:
2474
+ kept_cols = kept_cols[eff_kept]
2475
+ else:
2476
+ kept_cols = eff_kept
2477
+ dropped_cols = list(eff_dropped)
2478
+ n, k = X.shape
2479
+ if n_pos <= k:
2480
+ raise ValueError(
2481
+ f"Only {n_pos} positive-weight observation(s) for "
2482
+ f"{k} parameters (after rank reduction). "
2483
+ f"Cannot identify Poisson model."
2484
+ )
2485
+
2428
2486
  if init_beta is not None:
2429
2487
  beta = init_beta[kept_cols].copy() if len(dropped_cols) > 0 else init_beta.copy()
2430
2488
  else:
@@ -2438,8 +2496,12 @@ def solve_poisson(
2438
2496
  for _ in range(max_iter):
2439
2497
  eta = np.clip(X @ beta, -500, 500)
2440
2498
  mu = np.exp(eta)
2441
- score = X.T @ (y - mu) # gradient of log-likelihood
2442
- hess = X.T @ (mu[:, None] * X) # -Hessian = X'WX, W=diag(mu)
2499
+ if weights is not None:
2500
+ score = X.T @ (weights * (y - mu))
2501
+ hess = X.T @ ((weights * mu)[:, None] * X)
2502
+ else:
2503
+ score = X.T @ (y - mu)
2504
+ hess = X.T @ (mu[:, None] * X)
2443
2505
  try:
2444
2506
  delta = np.linalg.solve(hess + 1e-12 * np.eye(k), score)
2445
2507
  except np.linalg.LinAlgError:
@@ -1129,6 +1129,37 @@ def generate_staggered_ddd_data(
1129
1129
  return pd.DataFrame(records)
1130
1130
 
1131
1131
 
1132
+ def _rank_pair_weights(
1133
+ unit_weight: np.ndarray,
1134
+ unit_stratum: np.ndarray,
1135
+ y0: np.ndarray,
1136
+ n_strata: int,
1137
+ ) -> None:
1138
+ """Rank-pair weights with Y(0) within each stratum (in-place).
1139
+
1140
+ High-outcome units receive higher weights, modeling informative sampling
1141
+ where hard-to-reach (high-outcome) subpopulations are under-covered
1142
+ and therefore carry larger inverse-selection-probability weights.
1143
+ """
1144
+ for s in range(n_strata):
1145
+ mask = unit_stratum == s
1146
+ n_s = mask.sum()
1147
+ if n_s <= 1:
1148
+ continue
1149
+ idx_s = np.where(mask)[0]
1150
+ w_vals = unit_weight[idx_s].copy()
1151
+ if w_vals.std() < 1e-10:
1152
+ # No within-stratum variation: create rank-based weights
1153
+ # scaled to preserve stratum baseline weight level
1154
+ ranks = np.argsort(np.argsort(y0[idx_s])).astype(float) + 1.0
1155
+ unit_weight[idx_s] = ranks / ranks.mean() * w_vals.mean()
1156
+ else:
1157
+ # Rank-pair: highest Y(0) gets heaviest weight
1158
+ y0_order = np.argsort(-y0[idx_s])
1159
+ w_sorted = np.sort(w_vals)[::-1] # heaviest first
1160
+ unit_weight[idx_s[y0_order]] = w_sorted
1161
+
1162
+
1132
1163
  def generate_survey_did_data(
1133
1164
  n_units: int = 200,
1134
1165
  n_periods: int = 8,
@@ -1149,6 +1180,15 @@ def generate_survey_did_data(
1149
1180
  add_covariates: bool = False,
1150
1181
  panel: bool = True,
1151
1182
  seed: Optional[int] = None,
1183
+ # --- Research-grade DGP parameters ---
1184
+ icc: Optional[float] = None,
1185
+ weight_cv: Optional[float] = None,
1186
+ informative_sampling: bool = False,
1187
+ heterogeneous_te_by_strata: bool = False,
1188
+ strata_sizes: Optional[List[int]] = None,
1189
+ return_true_population_att: bool = False,
1190
+ covariate_effects: Optional[tuple] = None,
1191
+ te_covariate_interaction: float = 0.0,
1152
1192
  ) -> pd.DataFrame:
1153
1193
  """
1154
1194
  Generate synthetic staggered DiD data with survey structure.
@@ -1215,6 +1255,52 @@ def generate_survey_did_data(
1215
1255
  CallawaySantAnna(panel=False)).
1216
1256
  seed : int, optional
1217
1257
  Random seed for reproducibility.
1258
+ icc : float, optional
1259
+ Target intra-class correlation coefficient (0 < icc < 1). Overrides
1260
+ ``psu_re_sd`` via the variance decomposition:
1261
+ ``psu_re_sd = sqrt(icc * (sigma2_unit + sigma2_noise + sigma2_cov) /
1262
+ ((1 - icc) * (1 + psu_period_factor^2)))`` where ``sigma2_cov``
1263
+ includes covariate variance when ``add_covariates=True``.
1264
+ Cannot be combined with a non-default ``psu_re_sd``.
1265
+ weight_cv : float, optional
1266
+ Target coefficient of variation for sampling weights. Generates
1267
+ LogNormal weights normalized to mean 1, bypassing ``weight_variation``.
1268
+ Cannot be combined with a non-default ``weight_variation``.
1269
+ informative_sampling : bool, default=False
1270
+ If True, sampling weights correlate with Y(0) — high-outcome units
1271
+ receive higher weights (under-coverage → larger inverse-selection-
1272
+ probability weights). Uses rank-pairing within each stratum. For
1273
+ panel data, ranking is done once from period-1 outcomes. For
1274
+ repeated cross-sections, ranking is refreshed each period. Within
1275
+ each stratum, rank-based weights are scaled to preserve the
1276
+ stratum's baseline weight level from ``weight_variation``.
1277
+ When ``add_covariates=True``, covariate contributions are
1278
+ included in the Y(0) ranking.
1279
+ heterogeneous_te_by_strata : bool, default=False
1280
+ If True, treatment effect varies by stratum:
1281
+ ``TE_h = TE * (1 + 0.5 * (h - mean) / std)``. Creates a gap
1282
+ between unweighted and population ATT. With ``n_strata=1``,
1283
+ all units receive the base ``treatment_effect``.
1284
+ strata_sizes : list of int, optional
1285
+ Custom per-stratum unit counts. Must have length ``n_strata`` and
1286
+ sum to ``n_units``. Replaces equal allocation across strata.
1287
+ return_true_population_att : bool, default=False
1288
+ If True, attaches a diagnostic dict to ``df.attrs["dgp_truth"]``
1289
+ with keys: ``population_att`` (weight-weighted average of treated
1290
+ true effects), ``deff_kish`` (1 + CV(w)^2), ``base_stratum_effects``
1291
+ (base stratum TEs before dynamic/covariate modifiers),
1292
+ ``icc_realized`` (ANOVA-based
1293
+ ICC computed on period-1 data).
1294
+ covariate_effects : tuple of (float, float), optional
1295
+ Coefficients ``(beta1, beta2)`` for covariates x1 and x2 in the
1296
+ outcome equation ``y += beta1 * x1 + beta2 * x2``. Default uses
1297
+ ``(0.5, 0.3)``. Only used when ``add_covariates=True``. The ICC
1298
+ calibration automatically adjusts for the implied covariate variance.
1299
+ te_covariate_interaction : float, default=0.0
1300
+ Coefficient for treatment-by-covariate interaction:
1301
+ ``TE_i = base_TE + te_covariate_interaction * x1_i``. Creates
1302
+ unit-level treatment effect heterogeneity driven by the continuous
1303
+ covariate. Requires ``add_covariates=True``.
1218
1304
 
1219
1305
  Returns
1220
1306
  -------
@@ -1222,6 +1308,8 @@ def generate_survey_did_data(
1222
1308
  Columns: unit, period, outcome, first_treat, treated, true_effect,
1223
1309
  stratum, psu, fpc, weight. Also rep_0..rep_K if
1224
1310
  include_replicate_weights=True, and x1, x2 if add_covariates=True.
1311
+ If ``return_true_population_att=True``, ``df.attrs["dgp_truth"]``
1312
+ contains DGP diagnostics.
1225
1313
  """
1226
1314
  rng = np.random.default_rng(seed)
1227
1315
 
@@ -1284,30 +1372,120 @@ def generate_survey_did_data(
1284
1372
  f"weight_variation must be one of {valid_wv}, got {weight_variation!r}"
1285
1373
  )
1286
1374
 
1375
+ # --- Validate research-grade DGP parameters ---
1376
+ if icc is not None:
1377
+ if not (0 < icc < 1):
1378
+ raise ValueError(f"icc must be between 0 and 1 (exclusive), got {icc}")
1379
+ if psu_re_sd != 2.0:
1380
+ raise ValueError(
1381
+ "Cannot specify both icc and a non-default psu_re_sd. "
1382
+ "icc overrides psu_re_sd via the ICC formula."
1383
+ )
1384
+
1385
+ if weight_cv is not None:
1386
+ if not np.isfinite(weight_cv) or weight_cv <= 0:
1387
+ raise ValueError(
1388
+ f"weight_cv must be finite and positive, got {weight_cv}"
1389
+ )
1390
+ if weight_variation != "moderate":
1391
+ raise ValueError(
1392
+ "Cannot specify both weight_cv and a non-default "
1393
+ "weight_variation. weight_cv overrides weight_variation."
1394
+ )
1395
+
1396
+ if strata_sizes is not None:
1397
+ strata_sizes = list(strata_sizes)
1398
+ for ss in strata_sizes:
1399
+ if isinstance(ss, bool) or not isinstance(ss, (int, np.integer)):
1400
+ raise ValueError(
1401
+ f"strata_sizes must contain integers, got {ss!r}"
1402
+ )
1403
+ if len(strata_sizes) != n_strata:
1404
+ raise ValueError(
1405
+ f"strata_sizes must have length n_strata={n_strata}, "
1406
+ f"got {len(strata_sizes)}"
1407
+ )
1408
+ if any(s < 1 for s in strata_sizes):
1409
+ raise ValueError("All strata_sizes must be >= 1")
1410
+ if sum(strata_sizes) != n_units:
1411
+ raise ValueError(
1412
+ f"strata_sizes must sum to n_units={n_units}, "
1413
+ f"got {sum(strata_sizes)}"
1414
+ )
1415
+
1416
+ # --- Validate and resolve covariate coefficients ---
1417
+ if covariate_effects is not None:
1418
+ covariate_effects = tuple(covariate_effects)
1419
+ if len(covariate_effects) != 2:
1420
+ raise ValueError(
1421
+ f"covariate_effects must have length 2, got {len(covariate_effects)}"
1422
+ )
1423
+ if not all(np.isfinite(c) for c in covariate_effects):
1424
+ raise ValueError(
1425
+ f"covariate_effects must be finite, got {covariate_effects}"
1426
+ )
1427
+ _beta1, _beta2 = covariate_effects if covariate_effects is not None else (0.5, 0.3)
1428
+
1429
+ if not np.isfinite(te_covariate_interaction):
1430
+ raise ValueError(
1431
+ f"te_covariate_interaction must be finite, got {te_covariate_interaction}"
1432
+ )
1433
+ if te_covariate_interaction != 0.0 and not add_covariates:
1434
+ raise ValueError(
1435
+ "te_covariate_interaction requires add_covariates=True"
1436
+ )
1437
+
1438
+ # --- ICC -> psu_re_sd resolution ---
1439
+ if icc is not None:
1440
+ # Covariate variance: Var(beta1*x1) + Var(beta2*x2)
1441
+ # where x1 ~ N(0,1), x2 ~ Bernoulli(0.5)
1442
+ cov_var = (_beta1**2 * 1.0 + _beta2**2 * 0.25) if add_covariates else 0.0
1443
+ non_psu_var = unit_fe_sd**2 + noise_sd**2 + cov_var
1444
+ if non_psu_var < 1e-12:
1445
+ raise ValueError(
1446
+ "icc requires non-zero non-PSU variance "
1447
+ "(unit_fe_sd, noise_sd, or add_covariates must contribute variance)"
1448
+ )
1449
+ psu_re_sd = np.sqrt(
1450
+ icc * non_psu_var
1451
+ / ((1 - icc) * (1 + psu_period_factor**2))
1452
+ )
1453
+
1287
1454
  # --- Survey structure: assign units to strata and PSUs ---
1288
1455
  n_psu_total = n_strata * psu_per_stratum
1289
- units_per_stratum = n_units // n_strata
1290
- remainder = n_units % n_strata
1456
+
1457
+ if strata_sizes is not None:
1458
+ stratum_n = strata_sizes
1459
+ else:
1460
+ units_per_stratum = n_units // n_strata
1461
+ remainder = n_units % n_strata
1462
+ stratum_n = [
1463
+ units_per_stratum + (1 if s < remainder else 0)
1464
+ for s in range(n_strata)
1465
+ ]
1291
1466
 
1292
1467
  unit_stratum = np.empty(n_units, dtype=int)
1293
1468
  unit_psu = np.empty(n_units, dtype=int)
1294
1469
  idx = 0
1295
1470
  for s in range(n_strata):
1296
- # Distribute remainder units across first strata
1297
- n_s = units_per_stratum + (1 if s < remainder else 0)
1471
+ n_s = stratum_n[s]
1298
1472
  unit_stratum[idx : idx + n_s] = s
1299
-
1300
- # Assign PSUs within this stratum
1301
1473
  psu_start = s * psu_per_stratum
1302
1474
  for j in range(n_s):
1303
1475
  unit_psu[idx + j] = psu_start + (j % psu_per_stratum)
1304
1476
  idx += n_s
1305
1477
 
1306
- # Sampling weights: vary by stratum (inverse selection probability)
1307
- scale_map = {"none": 0.0, "moderate": 1.0, "high": 3.0}
1308
- scale = scale_map.get(weight_variation, 1.0)
1309
- denom = max(n_strata - 1, 1)
1310
- unit_weight = 1.0 + scale * (unit_stratum / denom)
1478
+ # Sampling weights
1479
+ if weight_cv is not None:
1480
+ sigma_ln = np.sqrt(np.log(1 + weight_cv**2))
1481
+ raw_w = rng.lognormal(-sigma_ln**2 / 2, sigma_ln, size=n_units)
1482
+ unit_weight = raw_w / raw_w.mean()
1483
+ else:
1484
+ # Stratum-based weights (inverse selection probability)
1485
+ scale_map = {"none": 0.0, "moderate": 1.0, "high": 3.0}
1486
+ scale = scale_map.get(weight_variation, 1.0)
1487
+ denom = max(n_strata - 1, 1)
1488
+ unit_weight = 1.0 + scale * (unit_stratum / denom)
1311
1489
 
1312
1490
  # --- Treatment assignment (cohort structure) ---
1313
1491
  n_never = int(n_units * never_treated_frac)
@@ -1344,6 +1522,37 @@ def generate_survey_did_data(
1344
1522
  0, psu_re_sd * psu_period_factor, size=(n_psu_total, n_periods)
1345
1523
  )
1346
1524
 
1525
+ # --- Informative sampling (panel path): pre-draw FEs, rank-pair weights ---
1526
+ if informative_sampling and panel:
1527
+ _panel_unit_fe = rng.normal(0, unit_fe_sd, size=n_units)
1528
+ y0_period1 = (
1529
+ _panel_unit_fe
1530
+ + psu_re[unit_psu]
1531
+ + psu_period_re[unit_psu, 0]
1532
+ + 0.5
1533
+ )
1534
+ if add_covariates:
1535
+ _panel_x1 = rng.normal(0, 1, size=n_units)
1536
+ _panel_x2 = rng.choice([0, 1], size=n_units)
1537
+ y0_period1 = y0_period1 + _beta1 * _panel_x1 + _beta2 * _panel_x2
1538
+ _rank_pair_weights(unit_weight, unit_stratum, y0_period1, n_strata)
1539
+
1540
+ # Save base weights for cross-section informative sampling (reset each period)
1541
+ if informative_sampling and not panel:
1542
+ _base_weight = unit_weight.copy()
1543
+
1544
+ # --- Heterogeneous treatment effects by stratum ---
1545
+ if heterogeneous_te_by_strata:
1546
+ if n_strata == 1:
1547
+ te_by_stratum = np.array([treatment_effect])
1548
+ else:
1549
+ strata_idx = np.arange(n_strata, dtype=float)
1550
+ te_by_stratum = treatment_effect * (
1551
+ 1 + 0.5 * (strata_idx - strata_idx.mean()) / strata_idx.std()
1552
+ )
1553
+ else:
1554
+ te_by_stratum = None
1555
+
1347
1556
  # --- Generate panel or repeated cross-sections ---
1348
1557
  records = []
1349
1558
  for t in range(1, n_periods + 1):
@@ -1351,21 +1560,47 @@ def generate_survey_did_data(
1351
1560
  unit_fe = rng.normal(0, unit_fe_sd, size=n_units)
1352
1561
  if panel and t > 1:
1353
1562
  pass # reuse unit_fe from first period (set below)
1354
- if panel and t == 1:
1563
+ if informative_sampling and panel:
1564
+ unit_fe = _panel_unit_fe # use pre-drawn FEs
1565
+ elif panel and t == 1:
1355
1566
  _panel_unit_fe = unit_fe # save for reuse
1356
- if panel and t > 1:
1567
+ elif panel and t > 1:
1357
1568
  unit_fe = _panel_unit_fe # type: ignore[possibly-undefined]
1358
1569
 
1359
- x1 = rng.normal(0, 1, size=n_units) if add_covariates else None
1360
- if panel and t > 1 and add_covariates:
1570
+ # Cross-section informative sampling: re-rank weights each period
1571
+ if informative_sampling and not panel:
1572
+ # Draw covariates early so they can be included in Y(0) ranking
1573
+ if add_covariates:
1574
+ x1 = rng.normal(0, 1, size=n_units)
1575
+ x2 = rng.choice([0, 1], size=n_units)
1576
+ unit_weight = _base_weight.copy() # type: ignore[possibly-undefined]
1577
+ y0_t = (
1578
+ unit_fe
1579
+ + psu_re[unit_psu]
1580
+ + psu_period_re[unit_psu, t - 1]
1581
+ + 0.5 * t
1582
+ )
1583
+ if add_covariates:
1584
+ y0_t = y0_t + _beta1 * x1 + _beta2 * x2
1585
+ _rank_pair_weights(unit_weight, unit_stratum, y0_t, n_strata)
1586
+
1587
+ # Covariates — may already be drawn by informative sampling above
1588
+ if informative_sampling and panel and add_covariates:
1589
+ x1 = _panel_x1 # pre-drawn before loop for ranking
1590
+ x2 = _panel_x2
1591
+ elif informative_sampling and not panel and add_covariates:
1592
+ pass # x1, x2 already drawn in cross-section ranking block
1593
+ elif add_covariates:
1594
+ x1 = rng.normal(0, 1, size=n_units)
1595
+ x2 = rng.choice([0, 1], size=n_units)
1596
+ else:
1597
+ x1 = None
1598
+ x2 = None
1599
+ if not informative_sampling and panel and t > 1 and add_covariates:
1361
1600
  x1 = _panel_x1 # type: ignore[possibly-undefined]
1362
- elif panel and t == 1 and add_covariates:
1363
- _panel_x1 = x1
1364
-
1365
- x2 = rng.choice([0, 1], size=n_units) if add_covariates else None
1366
- if panel and t > 1 and add_covariates:
1367
1601
  x2 = _panel_x2 # type: ignore[possibly-undefined]
1368
- elif panel and t == 1 and add_covariates:
1602
+ elif not informative_sampling and panel and t == 1 and add_covariates:
1603
+ _panel_x1 = x1
1369
1604
  _panel_x2 = x2
1370
1605
 
1371
1606
  for i in range(n_units):
@@ -1374,12 +1609,17 @@ def generate_survey_did_data(
1374
1609
  y = unit_fe[i] + psu_re[unit_psu[i]] + psu_period_re[unit_psu[i], t - 1] + 0.5 * t
1375
1610
 
1376
1611
  if add_covariates:
1377
- y += 0.5 * x1[i] + 0.3 * x2[i]
1612
+ y += _beta1 * x1[i] + _beta2 * x2[i]
1378
1613
 
1379
1614
  treated = int(g_i > 0 and t >= g_i)
1380
1615
  true_eff = 0.0
1381
1616
  if treated:
1382
- true_eff = treatment_effect
1617
+ if te_by_stratum is not None:
1618
+ true_eff = float(te_by_stratum[unit_stratum[i]])
1619
+ else:
1620
+ true_eff = treatment_effect
1621
+ if te_covariate_interaction != 0.0:
1622
+ true_eff += te_covariate_interaction * x1[i]
1383
1623
  if dynamic_effects:
1384
1624
  true_eff *= 1 + effect_growth * (t - g_i)
1385
1625
  y += true_eff
@@ -1426,4 +1666,53 @@ def generate_survey_did_data(
1426
1666
  w_r[w_r > 0] *= n_rep / (n_rep - 1)
1427
1667
  df[f"rep_{r}"] = w_r
1428
1668
 
1669
+ # --- DGP truth diagnostics ---
1670
+ if return_true_population_att:
1671
+ treated_mask = df["treated"] == 1
1672
+ if treated_mask.any():
1673
+ w_treated = df.loc[treated_mask, "weight"].values
1674
+ te_treated = df.loc[treated_mask, "true_effect"].values
1675
+ population_att = float(np.average(te_treated, weights=w_treated))
1676
+ else:
1677
+ population_att = float("nan")
1678
+
1679
+ if te_by_stratum is not None:
1680
+ stratum_effects = {
1681
+ int(s): float(te_by_stratum[s]) for s in range(n_strata)
1682
+ }
1683
+ else:
1684
+ stratum_effects = {
1685
+ int(s): float(treatment_effect) for s in range(n_strata)
1686
+ }
1687
+
1688
+ # Kish DEFF from weight variation
1689
+ w_all = df.groupby("unit")["weight"].first().values
1690
+ cv_w = float(w_all.std() / w_all.mean()) if w_all.mean() > 0 else 0.0
1691
+ deff_kish = 1 + cv_w**2
1692
+
1693
+ # Realized ICC (ANOVA-based, period-1 only to avoid TE contamination)
1694
+ _p1 = df[df["period"] == 1]
1695
+ _groups = _p1.groupby("psu")["outcome"]
1696
+ _n_total = len(_p1)
1697
+ _n_groups = _groups.ngroups
1698
+ # ICC undefined with < 2 groups or no within-group replication
1699
+ if _n_groups < 2 or _n_total <= _n_groups:
1700
+ icc_realized = float("nan")
1701
+ else:
1702
+ _n_bar = _n_total / _n_groups
1703
+ _grand_mean = _p1["outcome"].mean()
1704
+ _ssb = (_groups.size() * (_groups.mean() - _grand_mean) ** 2).sum()
1705
+ _msb = _ssb / (_n_groups - 1)
1706
+ _ssw = _groups.apply(lambda x: ((x - x.mean()) ** 2).sum()).sum()
1707
+ _msw = _ssw / (_n_total - _n_groups)
1708
+ _denom = _msb + (_n_bar - 1) * _msw
1709
+ icc_realized = float((_msb - _msw) / _denom) if _denom > 0 else float("nan")
1710
+
1711
+ df.attrs["dgp_truth"] = {
1712
+ "population_att": population_att,
1713
+ "deff_kish": float(deff_kish),
1714
+ "base_stratum_effects": stratum_effects,
1715
+ "icc_realized": icc_realized,
1716
+ }
1717
+
1429
1718
  return df
@@ -153,9 +153,6 @@ class CallawaySantAnna(
153
153
  - "rademacher": +1/-1 with equal probability (standard choice)
154
154
  - "mammen": Two-point distribution (asymptotically valid, matches skewness)
155
155
  - "webb": Six-point distribution (recommended when n_clusters < 20)
156
- bootstrap_weight_type : str, optional
157
- .. deprecated:: 1.0.1
158
- Use ``bootstrap_weights`` instead. Will be removed in v3.0.
159
156
  seed : int, optional
160
157
  Random seed for reproducibility.
161
158
  rank_deficient_action : str, default="warn"
@@ -293,7 +290,6 @@ class CallawaySantAnna(
293
290
  cluster: Optional[str] = None,
294
291
  n_bootstrap: int = 0,
295
292
  bootstrap_weights: Optional[str] = None,
296
- bootstrap_weight_type: Optional[str] = None,
297
293
  seed: Optional[int] = None,
298
294
  rank_deficient_action: str = "warn",
299
295
  base_period: str = "varying",
@@ -323,18 +319,7 @@ class CallawaySantAnna(
323
319
  f"pscore_fallback must be 'error' or 'unconditional', " f"got '{pscore_fallback}'"
324
320
  )
325
321
 
326
- # Handle bootstrap_weight_type deprecation
327
- if bootstrap_weight_type is not None:
328
- warnings.warn(
329
- "bootstrap_weight_type is deprecated and will be removed in v3.0. "
330
- "Use bootstrap_weights instead.",
331
- DeprecationWarning,
332
- stacklevel=2,
333
- )
334
- if bootstrap_weights is None:
335
- bootstrap_weights = bootstrap_weight_type
336
-
337
- # Default to rademacher if neither specified
322
+ # Default to rademacher if not specified
338
323
  if bootstrap_weights is None:
339
324
  bootstrap_weights = "rademacher"
340
325
 
@@ -362,8 +347,6 @@ class CallawaySantAnna(
362
347
  self.cluster = cluster
363
348
  self.n_bootstrap = n_bootstrap
364
349
  self.bootstrap_weights = bootstrap_weights
365
- # Keep bootstrap_weight_type for backward compatibility
366
- self.bootstrap_weight_type = bootstrap_weights
367
350
  self.seed = seed
368
351
  self.rank_deficient_action = rank_deficient_action
369
352
  self.base_period = base_period
@@ -3881,8 +3864,6 @@ class CallawaySantAnna(
3881
3864
  "cluster": self.cluster,
3882
3865
  "n_bootstrap": self.n_bootstrap,
3883
3866
  "bootstrap_weights": self.bootstrap_weights,
3884
- # Deprecated but kept for backward compatibility
3885
- "bootstrap_weight_type": self.bootstrap_weight_type,
3886
3867
  "seed": self.seed,
3887
3868
  "rank_deficient_action": self.rank_deficient_action,
3888
3869
  "base_period": self.base_period,
@@ -118,7 +118,7 @@ class CallawaySantAnnaBootstrapMixin:
118
118
 
119
119
  # Type hints for attributes accessed from the main class
120
120
  n_bootstrap: int
121
- bootstrap_weight_type: str
121
+ bootstrap_weights: str
122
122
  alpha: float
123
123
  seed: Optional[int]
124
124
  anticipation: int
@@ -329,7 +329,7 @@ class CallawaySantAnnaBootstrapMixin:
329
329
  if _use_survey_bootstrap:
330
330
  # PSU-level multiplier weights
331
331
  psu_weights, psu_ids = _generate_survey_multiplier_weights_batch(
332
- self.n_bootstrap, resolved_survey_unit, self.bootstrap_weight_type, rng
332
+ self.n_bootstrap, resolved_survey_unit, self.bootstrap_weights, rng
333
333
  )
334
334
  # Build unit → PSU column map
335
335
  if resolved_survey_unit.psu is not None:
@@ -348,7 +348,7 @@ class CallawaySantAnnaBootstrapMixin:
348
348
  else:
349
349
  # Standard unit-level weights (no survey or weights-only)
350
350
  all_bootstrap_weights = _generate_bootstrap_weights_batch(
351
- self.n_bootstrap, n_units, self.bootstrap_weight_type, rng
351
+ self.n_bootstrap, n_units, self.bootstrap_weights, rng
352
352
  )
353
353
 
354
354
  # Vectorized bootstrap ATT(g,t) computation
@@ -534,7 +534,7 @@ class CallawaySantAnnaBootstrapMixin:
534
534
 
535
535
  return CSBootstrapResults(
536
536
  n_bootstrap=self.n_bootstrap,
537
- weight_type=self.bootstrap_weight_type,
537
+ weight_type=self.bootstrap_weights,
538
538
  alpha=self.alpha,
539
539
  overall_att_se=overall_se,
540
540
  overall_att_ci=overall_ci,
@@ -147,7 +147,6 @@ class StaggeredTripleDifference(
147
147
  self.base_period = base_period
148
148
  self.n_bootstrap = n_bootstrap
149
149
  self.bootstrap_weights = bootstrap_weights
150
- self.bootstrap_weight_type = bootstrap_weights
151
150
  self.seed = seed
152
151
  self.cband = cband
153
152
  self.pscore_trim = pscore_trim
@@ -186,7 +185,7 @@ class StaggeredTripleDifference(
186
185
  raise ValueError(f"Unknown parameter: {key}")
187
186
  setattr(self, key, value)
188
187
  if "bootstrap_weights" in params:
189
- self.bootstrap_weight_type = params["bootstrap_weights"]
188
+ self.bootstrap_weights = params["bootstrap_weights"]
190
189
  return self
191
190
 
192
191
  # ------------------------------------------------------------------
@@ -144,14 +144,14 @@ class SyntheticDiD(DifferenceInDifferences):
144
144
  warnings.warn(
145
145
  "lambda_reg is deprecated and ignored. Regularization is now "
146
146
  "auto-computed from data. Use zeta_omega to override unit weight "
147
- "regularization.",
147
+ "regularization. Will be removed in v3.1.",
148
148
  DeprecationWarning,
149
149
  stacklevel=2,
150
150
  )
151
151
  if zeta is not None:
152
152
  warnings.warn(
153
153
  "zeta is deprecated and ignored. Use zeta_lambda to override "
154
- "time weight regularization.",
154
+ "time weight regularization. Will be removed in v3.1.",
155
155
  DeprecationWarning,
156
156
  stacklevel=2,
157
157
  )
@@ -1124,7 +1124,8 @@ class SyntheticDiD(DifferenceInDifferences):
1124
1124
  for key, value in params.items():
1125
1125
  if key in _deprecated:
1126
1126
  warnings.warn(
1127
- f"{key} is deprecated and ignored. Use zeta_omega/zeta_lambda " f"instead.",
1127
+ f"{key} is deprecated and ignored. Use zeta_omega/zeta_lambda "
1128
+ f"instead. Will be removed in v3.1.",
1128
1129
  DeprecationWarning,
1129
1130
  stacklevel=2,
1130
1131
  )