diff-diff 2.3.2__cp313-cp313-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,858 @@
1
+ """
2
+ Synthetic Difference-in-Differences estimator.
3
+ """
4
+
5
+ import warnings
6
+ from typing import Any, Dict, List, Optional, Tuple
7
+
8
+ import numpy as np
9
+ import pandas as pd
10
+ from numpy.linalg import LinAlgError
11
+
12
+ from diff_diff.estimators import DifferenceInDifferences
13
+ from diff_diff.linalg import solve_ols
14
+ from diff_diff.results import SyntheticDiDResults
15
+ from diff_diff.utils import (
16
+ _compute_regularization,
17
+ _sum_normalize,
18
+ compute_confidence_interval,
19
+ compute_p_value,
20
+ compute_sdid_estimator,
21
+ compute_sdid_unit_weights,
22
+ compute_time_weights,
23
+ validate_binary,
24
+ )
25
+
26
+
27
class SyntheticDiD(DifferenceInDifferences):
    """
    Synthetic Difference-in-Differences (SDID) estimator.

    Combines the strengths of Difference-in-Differences and Synthetic Control
    methods by re-weighting control units to better match treated units'
    pre-treatment trends.

    This method is particularly useful when:
    - You have few treated units (possibly just one)
    - Parallel trends assumption may be questionable
    - Control units are heterogeneous and need reweighting
    - You want robustness to pre-treatment differences

    Parameters
    ----------
    zeta_omega : float, optional
        Regularization for unit weights. If None (default), auto-computed
        from data as ``(N1 * T1)^(1/4) * noise_level`` matching R's synthdid.
    zeta_lambda : float, optional
        Regularization for time weights. If None (default), auto-computed
        from data as ``1e-6 * noise_level`` matching R's synthdid.
    alpha : float, default=0.05
        Significance level for confidence intervals.
    variance_method : str, default="placebo"
        Method for variance estimation:

        - "placebo": Placebo-based variance matching R's
          synthdid::vcov(method="placebo"). Implements Algorithm 4 from
          Arkhangelsky et al. (2021). This is R's default.
        - "bootstrap": Bootstrap at unit level with fixed weights matching
          R's synthdid::vcov(method="bootstrap").
    n_bootstrap : int, default=200
        Number of replications for variance estimation. Used for both:

        - Bootstrap: Number of bootstrap samples
        - Placebo: Number of random permutations (matches R's
          `replications` argument)
    seed : int, optional
        Random seed for reproducibility. If None (default), results
        will vary between runs.

    Attributes
    ----------
    results_ : SyntheticDiDResults
        Estimation results after calling fit().
    is_fitted_ : bool
        Whether the model has been fitted.

    Examples
    --------
    Basic usage with panel data:

    >>> import pandas as pd
    >>> from diff_diff import SyntheticDiD
    >>>
    >>> # Panel data with units observed over multiple time periods
    >>> # Treatment occurs at period 5 for treated units
    >>> data = pd.DataFrame({
    ...     'unit': [...],     # Unit identifier
    ...     'period': [...],   # Time period
    ...     'outcome': [...],  # Outcome variable
    ...     'treated': [...]   # 1 if unit is ever treated, 0 otherwise
    ... })
    >>>
    >>> # Fit SDID model
    >>> sdid = SyntheticDiD()
    >>> results = sdid.fit(
    ...     data,
    ...     outcome='outcome',
    ...     treatment='treated',
    ...     unit='unit',
    ...     time='period',
    ...     post_periods=[5, 6, 7, 8]
    ... )
    >>>
    >>> # View results
    >>> results.print_summary()
    >>> print(f"ATT: {results.att:.3f} (SE: {results.se:.3f})")
    >>>
    >>> # Examine unit weights
    >>> weights_df = results.get_unit_weights_df()
    >>> print(weights_df.head(10))

    Notes
    -----
    The SDID estimator (Arkhangelsky et al., 2021) computes:

        τ̂ = (Ȳ_treated,post - Σ_t λ_t * Y_treated,t)
             - Σ_j ω_j * (Ȳ_j,post - Σ_t λ_t * Y_j,t)

    Where:
    - ω_j are unit weights (sum to 1, non-negative)
    - λ_t are time weights (sum to 1, non-negative)

    Unit weights ω are chosen to match pre-treatment outcomes:

        min ||Σ_j ω_j * Y_j,pre - Y_treated,pre||²

    This interpolates between:
    - Standard DiD (uniform weights): ω_j = 1/N_control
    - Synthetic Control (exact matching): concentrated weights

    References
    ----------
    Arkhangelsky, D., Athey, S., Hirshberg, D. A., Imbens, G. W., & Wager, S.
    (2021). Synthetic Difference-in-Differences. American Economic Review,
    111(12), 4088-4118.
    """
131
+
132
+ def __init__(
133
+ self,
134
+ zeta_omega: Optional[float] = None,
135
+ zeta_lambda: Optional[float] = None,
136
+ alpha: float = 0.05,
137
+ variance_method: str = "placebo",
138
+ n_bootstrap: int = 200,
139
+ seed: Optional[int] = None,
140
+ # Deprecated — accepted for backward compat, ignored with warning
141
+ lambda_reg: Optional[float] = None,
142
+ zeta: Optional[float] = None,
143
+ ):
144
+ if lambda_reg is not None:
145
+ warnings.warn(
146
+ "lambda_reg is deprecated and ignored. Regularization is now "
147
+ "auto-computed from data. Use zeta_omega to override unit weight "
148
+ "regularization.",
149
+ DeprecationWarning,
150
+ stacklevel=2,
151
+ )
152
+ if zeta is not None:
153
+ warnings.warn(
154
+ "zeta is deprecated and ignored. Use zeta_lambda to override "
155
+ "time weight regularization.",
156
+ DeprecationWarning,
157
+ stacklevel=2,
158
+ )
159
+
160
+ super().__init__(robust=True, cluster=None, alpha=alpha)
161
+ self.zeta_omega = zeta_omega
162
+ self.zeta_lambda = zeta_lambda
163
+ self.variance_method = variance_method
164
+ self.n_bootstrap = n_bootstrap
165
+ self.seed = seed
166
+
167
+ # Validate n_bootstrap
168
+ if n_bootstrap < 2:
169
+ raise ValueError(
170
+ f"n_bootstrap must be >= 2 (got {n_bootstrap}). At least 2 "
171
+ f"iterations are needed to estimate standard errors."
172
+ )
173
+
174
+ # Validate variance_method
175
+ valid_methods = ("bootstrap", "placebo")
176
+ if variance_method not in valid_methods:
177
+ raise ValueError(
178
+ f"variance_method must be one of {valid_methods}, "
179
+ f"got '{variance_method}'"
180
+ )
181
+
182
+ self._unit_weights = None
183
+ self._time_weights = None
184
+
185
    def fit(  # type: ignore[override]
        self,
        data: pd.DataFrame,
        outcome: str,
        treatment: str,
        unit: str,
        time: str,
        post_periods: Optional[List[Any]] = None,
        covariates: Optional[List[str]] = None
    ) -> SyntheticDiDResults:
        """
        Fit the Synthetic Difference-in-Differences model.

        Parameters
        ----------
        data : pd.DataFrame
            Panel data with observations for multiple units over multiple
            time periods.
        outcome : str
            Name of the outcome variable column.
        treatment : str
            Name of the treatment group indicator column (0/1).
            Should be 1 for all observations of treated units
            (both pre and post treatment).
        unit : str
            Name of the unit identifier column.
        time : str
            Name of the time period column.
        post_periods : list, optional
            List of time period values that are post-treatment.
            If None, uses the last half of periods.
        covariates : list, optional
            List of covariate column names. Covariates are residualized
            out before computing the SDID estimator.

        Returns
        -------
        SyntheticDiDResults
            Object containing the ATT estimate, standard error,
            unit weights, and time weights.

        Raises
        ------
        ValueError
            If required parameters are missing or data validation fails.
        """
        # Validate inputs
        if outcome is None or treatment is None or unit is None or time is None:
            raise ValueError(
                "Must provide 'outcome', 'treatment', 'unit', and 'time'"
            )

        # Check columns exist
        required_cols = [outcome, treatment, unit, time]
        if covariates:
            required_cols.extend(covariates)

        missing = [c for c in required_cols if c not in data.columns]
        if missing:
            raise ValueError(f"Missing columns: {missing}")

        # Validate treatment is binary
        validate_binary(data[treatment].values, "treatment")

        # Get all unique time periods
        all_periods = sorted(data[time].unique())

        if len(all_periods) < 2:
            raise ValueError("Need at least 2 time periods")

        # Determine pre and post periods.
        # Default split: first half pre, second half post (midpoint goes to post).
        if post_periods is None:
            mid = len(all_periods) // 2
            post_periods = list(all_periods[mid:])
            pre_periods = list(all_periods[:mid])
        else:
            post_periods = list(post_periods)
            pre_periods = [p for p in all_periods if p not in post_periods]

        if len(post_periods) == 0:
            raise ValueError("Must have at least one post-treatment period")
        if len(pre_periods) == 0:
            raise ValueError("Must have at least one pre-treatment period")

        # Validate post_periods are in data
        for p in post_periods:
            if p not in all_periods:
                raise ValueError(f"Post-period '{p}' not found in time column")

        # Identify treated and control units
        # Treatment indicator should be constant within unit
        unit_treatment = data.groupby(unit)[treatment].first()

        # Validate treatment is constant within unit (SDID requires block treatment)
        treatment_nunique = data.groupby(unit)[treatment].nunique()
        varying_units = treatment_nunique[treatment_nunique > 1]
        if len(varying_units) > 0:
            example_unit = varying_units.index[0]
            example_vals = sorted(
                data.loc[data[unit] == example_unit, treatment].unique()
            )
            raise ValueError(
                f"Treatment indicator varies within {len(varying_units)} unit(s) "
                f"(e.g., unit '{example_unit}' has values {example_vals}). "
                f"SyntheticDiD requires 'block' treatment where treatment is "
                f"constant within each unit across all time periods. "
                f"For staggered adoption designs, use CallawaySantAnna or "
                f"ImputationDiD instead."
            )

        treated_units = unit_treatment[unit_treatment == 1].index.tolist()
        control_units = unit_treatment[unit_treatment == 0].index.tolist()

        if len(treated_units) == 0:
            raise ValueError("No treated units found")
        if len(control_units) == 0:
            raise ValueError("No control units found")

        # Validate balanced panel (SDID requires all units observed in all periods)
        periods_per_unit = data.groupby(unit)[time].nunique()
        expected_n_periods = len(all_periods)
        unbalanced_units = periods_per_unit[periods_per_unit != expected_n_periods]
        if len(unbalanced_units) > 0:
            example_unit = unbalanced_units.index[0]
            actual_count = unbalanced_units.iloc[0]
            raise ValueError(
                f"Panel is not balanced: {len(unbalanced_units)} unit(s) do not "
                f"have observations in all {expected_n_periods} periods "
                f"(e.g., unit '{example_unit}' has {actual_count} periods). "
                f"SyntheticDiD requires a balanced panel. Use "
                f"diff_diff.prep.balance_panel() to balance the panel first."
            )

        # Residualize covariates if provided (works on a copy; original
        # `data` is left untouched).
        working_data = data.copy()
        if covariates:
            working_data = self._residualize_covariates(
                working_data, outcome, covariates, unit, time
            )

        # Create outcome matrices
        # Shape: (n_periods, n_units)
        Y_pre_control, Y_post_control, Y_pre_treated, Y_post_treated = \
            self._create_outcome_matrices(
                working_data, outcome, unit, time,
                pre_periods, post_periods, treated_units, control_units
            )

        # Compute auto-regularization (or use user overrides)
        auto_zeta_omega, auto_zeta_lambda = _compute_regularization(
            Y_pre_control, len(treated_units), len(post_periods)
        )
        zeta_omega = self.zeta_omega if self.zeta_omega is not None else auto_zeta_omega
        zeta_lambda = self.zeta_lambda if self.zeta_lambda is not None else auto_zeta_lambda

        # Store noise level for diagnostics
        # NOTE(review): local import of a private helper — presumably kept
        # function-scoped to avoid widening the module's public import surface.
        from diff_diff.utils import _compute_noise_level
        noise_level = _compute_noise_level(Y_pre_control)

        # Data-dependent convergence threshold (matches R's 1e-5 * noise.level).
        # Floor of 1e-5 when noise_level == 0: R would use 0.0, causing FW to
        # run all max_iter iterations. The result is equivalent (zero-noise
        # data has no variation to optimize), but the floor enables early stop.
        min_decrease = 1e-5 * noise_level if noise_level > 0 else 1e-5

        # Compute unit weights (Frank-Wolfe with sparsification).
        # Treated units are collapsed to their cross-sectional mean profile.
        Y_pre_treated_mean = np.mean(Y_pre_treated, axis=1)

        unit_weights = compute_sdid_unit_weights(
            Y_pre_control,
            Y_pre_treated_mean,
            zeta_omega=zeta_omega,
            min_decrease=min_decrease,
        )

        # Compute time weights (Frank-Wolfe on collapsed form)
        time_weights = compute_time_weights(
            Y_pre_control,
            Y_post_control,
            zeta_lambda=zeta_lambda,
            min_decrease=min_decrease,
        )

        # Compute SDID estimate
        Y_post_treated_mean = np.mean(Y_post_treated, axis=1)

        att = compute_sdid_estimator(
            Y_pre_control,
            Y_post_control,
            Y_pre_treated_mean,
            Y_post_treated_mean,
            unit_weights,
            time_weights
        )

        # Compute pre-treatment fit (RMSE) of the synthetic control against
        # the mean treated pre-treatment trajectory.
        synthetic_pre = Y_pre_control @ unit_weights
        pre_fit_rmse = np.sqrt(np.mean((Y_pre_treated_mean - synthetic_pre) ** 2))

        # Warn if pre-treatment fit is poor (Registry requirement).
        # Threshold: 1× SD of treated pre-treatment outcomes — a natural baseline
        # since RMSE exceeding natural variation indicates the synthetic control
        # fails to reproduce the treated series' level or trend.
        pre_treatment_sd = np.std(Y_pre_treated_mean, ddof=1) if len(Y_pre_treated_mean) > 1 else 0.0
        if pre_treatment_sd > 0 and pre_fit_rmse > pre_treatment_sd:
            warnings.warn(
                f"Pre-treatment fit is poor: RMSE ({pre_fit_rmse:.4f}) exceeds "
                f"the standard deviation of treated pre-treatment outcomes "
                f"({pre_treatment_sd:.4f}). The synthetic control may not "
                f"adequately reproduce treated unit trends. Consider adding "
                f"more control units or adjusting regularization.",
                UserWarning,
                stacklevel=2,
            )

        # Compute standard errors based on variance_method
        if self.variance_method == "bootstrap":
            se, bootstrap_estimates = self._bootstrap_se(
                Y_pre_control, Y_post_control,
                Y_pre_treated, Y_post_treated,
                unit_weights, time_weights,
            )
            # Bootstrap draws double as the reference distribution for the
            # p-value computation below.
            placebo_effects = bootstrap_estimates
            inference_method = "bootstrap"
        else:
            # Use placebo-based variance (R's synthdid Algorithm 4)
            se, placebo_effects = self._placebo_variance_se(
                Y_pre_control,
                Y_post_control,
                Y_pre_treated_mean,
                Y_post_treated_mean,
                n_treated=len(treated_units),
                zeta_omega=zeta_omega,
                zeta_lambda=zeta_lambda,
                min_decrease=min_decrease,
                replications=self.n_bootstrap  # Reuse n_bootstrap for replications
            )
            inference_method = "placebo"

        # Compute test statistics
        if np.isfinite(se) and se > 0:
            t_stat = att / se
            # Use placebo distribution for p-value if available
            if len(placebo_effects) > 0:
                # Two-sided p-value from placebo distribution, floored at
                # 1/(n+1) so it is never exactly zero.
                p_value = np.mean(np.abs(placebo_effects) >= np.abs(att))
                p_value = max(p_value, 1.0 / (len(placebo_effects) + 1))
            else:
                p_value = compute_p_value(t_stat)
        else:
            t_stat = np.nan
            p_value = np.nan

        # Confidence interval (normal-approximation; NaN when SE unavailable)
        if np.isfinite(se) and se > 0:
            conf_int = compute_confidence_interval(att, se, self.alpha)
        else:
            conf_int = (np.nan, np.nan)

        # Create weight dictionaries keyed by original unit ids / period labels
        unit_weights_dict = {
            unit_id: w for unit_id, w in zip(control_units, unit_weights)
        }
        time_weights_dict = {
            period: w for period, w in zip(pre_periods, time_weights)
        }

        # Store results
        self.results_ = SyntheticDiDResults(
            att=att,
            se=se,
            t_stat=t_stat,
            p_value=p_value,
            conf_int=conf_int,
            n_obs=len(data),
            n_treated=len(treated_units),
            n_control=len(control_units),
            unit_weights=unit_weights_dict,
            time_weights=time_weights_dict,
            pre_periods=pre_periods,
            post_periods=post_periods,
            alpha=self.alpha,
            variance_method=inference_method,
            noise_level=noise_level,
            zeta_omega=zeta_omega,
            zeta_lambda=zeta_lambda,
            pre_treatment_fit=pre_fit_rmse,
            placebo_effects=placebo_effects if len(placebo_effects) > 0 else None,
            n_bootstrap=self.n_bootstrap if inference_method == "bootstrap" else None
        )

        self._unit_weights = unit_weights
        self._time_weights = time_weights
        self.is_fitted_ = True

        return self.results_
481
+
482
+ def _create_outcome_matrices(
483
+ self,
484
+ data: pd.DataFrame,
485
+ outcome: str,
486
+ unit: str,
487
+ time: str,
488
+ pre_periods: List[Any],
489
+ post_periods: List[Any],
490
+ treated_units: List[Any],
491
+ control_units: List[Any]
492
+ ) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
493
+ """
494
+ Create outcome matrices for SDID estimation.
495
+
496
+ Returns
497
+ -------
498
+ tuple
499
+ (Y_pre_control, Y_post_control, Y_pre_treated, Y_post_treated)
500
+ Each is a 2D array with shape (n_periods, n_units)
501
+ """
502
+ # Pivot data to wide format
503
+ pivot = data.pivot(index=time, columns=unit, values=outcome)
504
+
505
+ # Extract submatrices
506
+ Y_pre_control = pivot.loc[pre_periods, control_units].values
507
+ Y_post_control = pivot.loc[post_periods, control_units].values
508
+ Y_pre_treated = pivot.loc[pre_periods, treated_units].values
509
+ Y_post_treated = pivot.loc[post_periods, treated_units].values
510
+
511
+ return (
512
+ Y_pre_control.astype(float),
513
+ Y_post_control.astype(float),
514
+ Y_pre_treated.astype(float),
515
+ Y_post_treated.astype(float)
516
+ )
517
+
518
+ def _residualize_covariates(
519
+ self,
520
+ data: pd.DataFrame,
521
+ outcome: str,
522
+ covariates: List[str],
523
+ unit: str,
524
+ time: str
525
+ ) -> pd.DataFrame:
526
+ """
527
+ Residualize outcome by regressing out covariates.
528
+
529
+ Uses two-way fixed effects to partial out covariates.
530
+ """
531
+ data = data.copy()
532
+
533
+ # Create design matrix with covariates
534
+ X = data[covariates].values.astype(float)
535
+
536
+ # Add unit and time dummies
537
+ unit_dummies = pd.get_dummies(data[unit], prefix='u', drop_first=True)
538
+ time_dummies = pd.get_dummies(data[time], prefix='t', drop_first=True)
539
+
540
+ X_full = np.column_stack([
541
+ np.ones(len(data)),
542
+ X,
543
+ unit_dummies.values,
544
+ time_dummies.values
545
+ ])
546
+
547
+ y = data[outcome].values.astype(float)
548
+
549
+ # Fit and get residuals using unified backend
550
+ coeffs, residuals, _ = solve_ols(X_full, y, return_vcov=False)
551
+
552
+ # Add back the mean for interpretability
553
+ data[outcome] = residuals + np.mean(y)
554
+
555
+ return data
556
+
557
+ def _bootstrap_se(
558
+ self,
559
+ Y_pre_control: np.ndarray,
560
+ Y_post_control: np.ndarray,
561
+ Y_pre_treated: np.ndarray,
562
+ Y_post_treated: np.ndarray,
563
+ unit_weights: np.ndarray,
564
+ time_weights: np.ndarray,
565
+ ) -> Tuple[float, np.ndarray]:
566
+ """Compute bootstrap standard error matching R's synthdid bootstrap_sample.
567
+
568
+ Resamples all units (control + treated) with replacement, renormalizes
569
+ original unit weights for the resampled controls, and computes the
570
+ SDID estimator with **fixed** weights (no re-estimation).
571
+
572
+ This matches R's ``synthdid::vcov(method="bootstrap")``.
573
+ """
574
+ rng = np.random.default_rng(self.seed)
575
+ n_control = Y_pre_control.shape[1]
576
+ n_treated = Y_pre_treated.shape[1]
577
+ n_total = n_control + n_treated
578
+
579
+ # Build full panel matrix: (n_pre+n_post, n_control+n_treated)
580
+ Y_full = np.block([
581
+ [Y_pre_control, Y_pre_treated],
582
+ [Y_post_control, Y_post_treated]
583
+ ])
584
+ n_pre = Y_pre_control.shape[0]
585
+
586
+ bootstrap_estimates = []
587
+
588
+ for _ in range(self.n_bootstrap):
589
+ # Resample ALL units with replacement
590
+ boot_idx = rng.choice(n_total, size=n_total, replace=True)
591
+
592
+ # Identify which resampled units are control vs treated
593
+ boot_is_control = boot_idx < n_control
594
+ boot_control_idx = boot_idx[boot_is_control]
595
+ boot_treated_idx = boot_idx[~boot_is_control]
596
+
597
+ # Skip if no control or no treated units in bootstrap sample
598
+ if len(boot_control_idx) == 0 or len(boot_treated_idx) == 0:
599
+ continue
600
+
601
+ try:
602
+ # Renormalize original unit weights for the resampled controls
603
+ boot_omega = _sum_normalize(unit_weights[boot_control_idx])
604
+
605
+ # Extract resampled outcome matrices
606
+ Y_boot = Y_full[:, boot_idx]
607
+ Y_boot_pre_c = Y_boot[:n_pre, boot_is_control]
608
+ Y_boot_post_c = Y_boot[n_pre:, boot_is_control]
609
+ Y_boot_pre_t = Y_boot[:n_pre, ~boot_is_control]
610
+ Y_boot_post_t = Y_boot[n_pre:, ~boot_is_control]
611
+
612
+ # Compute ATT with FIXED weights (do NOT re-estimate)
613
+ Y_boot_pre_t_mean = np.mean(Y_boot_pre_t, axis=1)
614
+ Y_boot_post_t_mean = np.mean(Y_boot_post_t, axis=1)
615
+
616
+ tau = compute_sdid_estimator(
617
+ Y_boot_pre_c, Y_boot_post_c,
618
+ Y_boot_pre_t_mean, Y_boot_post_t_mean,
619
+ boot_omega, time_weights # time_weights = original lambda
620
+ )
621
+ if np.isfinite(tau):
622
+ bootstrap_estimates.append(tau)
623
+
624
+ except (ValueError, LinAlgError):
625
+ continue
626
+
627
+ bootstrap_estimates = np.array(bootstrap_estimates)
628
+
629
+ # Check bootstrap success rate and handle failures
630
+ n_successful = len(bootstrap_estimates)
631
+ failure_rate = 1 - (n_successful / self.n_bootstrap)
632
+
633
+ if n_successful == 0:
634
+ raise ValueError(
635
+ f"All {self.n_bootstrap} bootstrap iterations failed. "
636
+ f"This typically occurs when:\n"
637
+ f" - Sample size is too small for reliable resampling\n"
638
+ f" - Weight matrices are singular or near-singular\n"
639
+ f" - Insufficient pre-treatment periods for weight estimation\n"
640
+ f" - Too few control units relative to treated units\n"
641
+ f"Consider using variance_method='placebo' or increasing "
642
+ f"the regularization parameters (zeta_omega, zeta_lambda)."
643
+ )
644
+ elif n_successful == 1:
645
+ warnings.warn(
646
+ f"Only 1/{self.n_bootstrap} bootstrap iteration succeeded. "
647
+ f"Standard error cannot be computed reliably (requires at least 2). "
648
+ f"Returning SE=0.0. Consider using variance_method='placebo' or "
649
+ f"increasing the regularization (zeta_omega, zeta_lambda).",
650
+ UserWarning,
651
+ stacklevel=2,
652
+ )
653
+ se = 0.0
654
+ elif failure_rate > 0.05:
655
+ warnings.warn(
656
+ f"Only {n_successful}/{self.n_bootstrap} bootstrap iterations succeeded "
657
+ f"({failure_rate:.1%} failure rate). Standard errors may be unreliable. "
658
+ f"This can occur with small samples or insufficient pre-treatment periods.",
659
+ UserWarning,
660
+ stacklevel=2,
661
+ )
662
+ se = float(np.std(bootstrap_estimates, ddof=1))
663
+ else:
664
+ se = float(np.std(bootstrap_estimates, ddof=1))
665
+
666
+ return se, bootstrap_estimates
667
+
668
    def _placebo_variance_se(
        self,
        Y_pre_control: np.ndarray,
        Y_post_control: np.ndarray,
        Y_pre_treated_mean: np.ndarray,
        Y_post_treated_mean: np.ndarray,
        n_treated: int,
        zeta_omega: float = 0.0,
        zeta_lambda: float = 0.0,
        min_decrease: float = 1e-5,
        replications: int = 200
    ) -> Tuple[float, np.ndarray]:
        """
        Compute placebo-based variance matching R's synthdid methodology.

        This implements Algorithm 4 from Arkhangelsky et al. (2021),
        matching R's synthdid::vcov(method = "placebo"):

        1. Randomly sample N₀ control indices (permutation)
        2. Designate last N₁ as pseudo-treated, first (N₀-N₁) as pseudo-controls
        3. Re-estimate both omega and lambda on the permuted data (from
           uniform initialization, fresh start), matching R's behavior where
           ``update.omega=TRUE, update.lambda=TRUE`` are passed via ``opts``
        4. Compute SDID estimate with re-estimated weights
        5. Repeat `replications` times
        6. SE = sqrt((r-1)/r) * sd(estimates)

        Parameters
        ----------
        Y_pre_control : np.ndarray
            Control outcomes in pre-treatment periods, shape (n_pre, n_control).
        Y_post_control : np.ndarray
            Control outcomes in post-treatment periods, shape (n_post, n_control).
        Y_pre_treated_mean : np.ndarray
            Mean treated outcomes in pre-treatment periods, shape (n_pre,).
        Y_post_treated_mean : np.ndarray
            Mean treated outcomes in post-treatment periods, shape (n_post,).
        n_treated : int
            Number of treated units in the original estimation.
        zeta_omega : float
            Regularization parameter for unit weights (for re-estimation).
        zeta_lambda : float
            Regularization parameter for time weights (for re-estimation).
        min_decrease : float
            Convergence threshold for Frank-Wolfe (for re-estimation).
        replications : int, default=200
            Number of placebo replications.

        Returns
        -------
        tuple
            (se, placebo_effects) where se is the standard error and
            placebo_effects is the array of placebo treatment effects.

        References
        ----------
        Arkhangelsky, D., Athey, S., Hirshberg, D. A., Imbens, G. W., & Wager, S.
        (2021). Synthetic Difference-in-Differences. American Economic Review,
        111(12), 4088-4118. Algorithm 4.
        """
        rng = np.random.default_rng(self.seed)
        # n_pre is not referenced below; the unpack documents the expected
        # (n_pre, n_control) orientation of Y_pre_control.
        n_pre, n_control = Y_pre_control.shape

        # Ensure we have enough controls for the split
        n_pseudo_control = n_control - n_treated
        if n_pseudo_control < 1:
            warnings.warn(
                f"Not enough control units ({n_control}) for placebo variance "
                f"estimation with {n_treated} treated units. "
                f"Consider using variance_method='bootstrap'.",
                UserWarning,
                stacklevel=3,
            )
            return 0.0, np.array([])

        placebo_estimates = []

        for _ in range(replications):
            try:
                # Random permutation of control indices (Algorithm 4, step 1)
                perm = rng.permutation(n_control)

                # Split into pseudo-controls and pseudo-treated (step 2);
                # the last n_treated permuted controls play the treated role.
                pseudo_control_idx = perm[:n_pseudo_control]
                pseudo_treated_idx = perm[n_pseudo_control:]

                # Get pseudo-control and pseudo-treated outcomes
                Y_pre_pseudo_control = Y_pre_control[:, pseudo_control_idx]
                Y_post_pseudo_control = Y_post_control[:, pseudo_control_idx]
                Y_pre_pseudo_treated_mean = np.mean(
                    Y_pre_control[:, pseudo_treated_idx], axis=1
                )
                Y_post_pseudo_treated_mean = np.mean(
                    Y_post_control[:, pseudo_treated_idx], axis=1
                )

                # Re-estimate weights on permuted data (matching R's behavior)
                # R passes update.omega=TRUE, update.lambda=TRUE via opts,
                # re-estimating weights from uniform initialization (fresh start).
                # Unit weights: re-estimate on pseudo-control/pseudo-treated data
                pseudo_omega = compute_sdid_unit_weights(
                    Y_pre_pseudo_control,
                    Y_pre_pseudo_treated_mean,
                    zeta_omega=zeta_omega,
                    min_decrease=min_decrease,
                )

                # Time weights: re-estimate on pseudo-control data
                pseudo_lambda = compute_time_weights(
                    Y_pre_pseudo_control,
                    Y_post_pseudo_control,
                    zeta_lambda=zeta_lambda,
                    min_decrease=min_decrease,
                )

                # Compute placebo SDID estimate (step 4)
                tau = compute_sdid_estimator(
                    Y_pre_pseudo_control,
                    Y_post_pseudo_control,
                    Y_pre_pseudo_treated_mean,
                    Y_post_pseudo_treated_mean,
                    pseudo_omega,
                    pseudo_lambda
                )
                if np.isfinite(tau):
                    placebo_estimates.append(tau)

            except (ValueError, LinAlgError, ZeroDivisionError):
                # Skip failed iterations
                continue

        placebo_estimates = np.array(placebo_estimates)
        n_successful = len(placebo_estimates)

        if n_successful < 2:
            warnings.warn(
                f"Only {n_successful} placebo replications completed successfully. "
                f"Standard error cannot be estimated reliably. "
                f"Consider using variance_method='bootstrap' or increasing "
                f"the number of control units.",
                UserWarning,
                stacklevel=3,
            )
            return 0.0, placebo_estimates

        # Warn if many replications failed
        failure_rate = 1 - (n_successful / replications)
        if failure_rate > 0.05:
            warnings.warn(
                f"Only {n_successful}/{replications} placebo replications succeeded "
                f"({failure_rate:.1%} failure rate). Standard errors may be unreliable.",
                UserWarning,
                stacklevel=3,
            )

        # Compute SE using R's formula: sqrt((r-1)/r) * sd(estimates)
        # This matches synthdid::vcov.R exactly
        se = np.sqrt((n_successful - 1) / n_successful) * np.std(
            placebo_estimates, ddof=1
        )

        return se, placebo_estimates
830
+
831
+ def get_params(self) -> Dict[str, Any]:
832
+ """Get estimator parameters."""
833
+ return {
834
+ "zeta_omega": self.zeta_omega,
835
+ "zeta_lambda": self.zeta_lambda,
836
+ "alpha": self.alpha,
837
+ "variance_method": self.variance_method,
838
+ "n_bootstrap": self.n_bootstrap,
839
+ "seed": self.seed,
840
+ }
841
+
842
+ def set_params(self, **params) -> "SyntheticDiD":
843
+ """Set estimator parameters."""
844
+ # Deprecated parameter names — emit warning and ignore
845
+ _deprecated = {"lambda_reg", "zeta"}
846
+ for key, value in params.items():
847
+ if key in _deprecated:
848
+ warnings.warn(
849
+ f"{key} is deprecated and ignored. Use zeta_omega/zeta_lambda "
850
+ f"instead.",
851
+ DeprecationWarning,
852
+ stacklevel=2,
853
+ )
854
+ elif hasattr(self, key):
855
+ setattr(self, key, value)
856
+ else:
857
+ raise ValueError(f"Unknown parameter: {key}")
858
+ return self