diff-diff 2.1.0__cp39-cp39-macosx_11_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,738 @@
+ """
+ Synthetic Difference-in-Differences estimator.
+ """
+
+ import warnings
+ from typing import Any, Dict, List, Optional, Tuple
+
+ import numpy as np
+ import pandas as pd
+ from numpy.linalg import LinAlgError
+
+ from diff_diff.estimators import DifferenceInDifferences
+ from diff_diff.linalg import solve_ols
+ from diff_diff.results import SyntheticDiDResults
+ from diff_diff.utils import (
+     compute_confidence_interval,
+     compute_p_value,
+     compute_sdid_estimator,
+     compute_synthetic_weights,
+     compute_time_weights,
+     validate_binary,
+ )
+
+
+ class SyntheticDiD(DifferenceInDifferences):
+     """
+     Synthetic Difference-in-Differences (SDID) estimator.
+
+     Combines the strengths of Difference-in-Differences and Synthetic Control
+     methods by re-weighting control units to better match treated units'
+     pre-treatment trends.
+
+     This method is particularly useful when:
+
+     - You have few treated units (possibly just one)
+     - The parallel trends assumption may be questionable
+     - Control units are heterogeneous and need reweighting
+     - You want robustness to pre-treatment differences
+
+     Parameters
+     ----------
+     lambda_reg : float, default=0.0
+         L2 regularization for unit weights. Larger values shrink weights
+         toward uniform. Useful when n_pre_periods < n_control_units.
+     zeta : float, default=1.0
+         Regularization for time weights. Larger values give more uniform
+         time weights (closer to standard DiD).
+     alpha : float, default=0.05
+         Significance level for confidence intervals.
+     variance_method : str, default="bootstrap"
+         Method for variance estimation:
+
+         - "bootstrap": Block bootstrap at the unit level (default)
+         - "placebo": Placebo-based variance matching R's
+           synthdid::vcov(method="placebo"). Implements Algorithm 4 from
+           Arkhangelsky et al. (2021): randomly permutes control units,
+           designates N₁ as pseudo-treated, renormalizes the original
+           weights for the remaining pseudo-controls, and computes the
+           SDID estimate.
+     n_bootstrap : int, default=200
+         Number of replications for variance estimation. Used for both:
+
+         - Bootstrap: Number of bootstrap samples
+         - Placebo: Number of random permutations (matches R's
+           `replications` argument)
+     seed : int, optional
+         Random seed for reproducibility. If None (default), results
+         will vary between runs.
+
+     Attributes
+     ----------
+     results_ : SyntheticDiDResults
+         Estimation results after calling fit().
+     is_fitted_ : bool
+         Whether the model has been fitted.
+
+     Examples
+     --------
+     Basic usage with panel data:
+
+     >>> import pandas as pd
+     >>> from diff_diff import SyntheticDiD
+     >>>
+     >>> # Panel data with units observed over multiple time periods
+     >>> # Treatment occurs at period 5 for treated units
+     >>> data = pd.DataFrame({
+     ...     'unit': [...],      # Unit identifier
+     ...     'period': [...],    # Time period
+     ...     'outcome': [...],   # Outcome variable
+     ...     'treated': [...]    # 1 if unit is ever treated, 0 otherwise
+     ... })
+     >>>
+     >>> # Fit SDID model
+     >>> sdid = SyntheticDiD()
+     >>> results = sdid.fit(
+     ...     data,
+     ...     outcome='outcome',
+     ...     treatment='treated',
+     ...     unit='unit',
+     ...     time='period',
+     ...     post_periods=[5, 6, 7, 8]
+     ... )
+     >>>
+     >>> # View results
+     >>> results.print_summary()
+     >>> print(f"ATT: {results.att:.3f} (SE: {results.se:.3f})")
+     >>>
+     >>> # Examine unit weights
+     >>> weights_df = results.get_unit_weights_df()
+     >>> print(weights_df.head(10))
+
+     Notes
+     -----
+     The SDID estimator (Arkhangelsky et al., 2021) computes:
+
+         τ̂ = (Ȳ_treated,post - Σ_t λ_t * Y_treated,t)
+             - Σ_j ω_j * (Ȳ_j,post - Σ_t λ_t * Y_j,t)
+
+     Where:
+
+     - ω_j are unit weights (sum to 1, non-negative)
+     - λ_t are time weights (sum to 1, non-negative)
+
+     Unit weights ω are chosen to match pre-treatment outcomes:
+
+         min ||Σ_j ω_j * Y_j,pre - Y_treated,pre||²
+
+     This interpolates between:
+
+     - Standard DiD (uniform weights): ω_j = 1/N_control
+     - Synthetic Control (exact matching): concentrated weights
+
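+     A minimal numeric sketch of this arithmetic, with illustrative values
+     and uniform weights supplied by hand rather than estimated:
+
+     >>> import numpy as np
+     >>> Y_pre_c = np.array([[1.0, 2.0], [1.5, 2.5]])  # (n_pre, n_control)
+     >>> Y_post_c = np.array([[2.0, 3.0]])             # (n_post, n_control)
+     >>> y_pre_t = np.array([2.0, 2.5])                # treated pre-period means
+     >>> y_post_t = np.array([4.0])                    # treated post-period means
+     >>> w = np.array([0.5, 0.5])                      # unit weights ω
+     >>> lam = np.array([0.5, 0.5])                    # time weights λ
+     >>> att = ((y_post_t.mean() - lam @ y_pre_t)
+     ...        - ((Y_post_c @ w).mean() - lam @ (Y_pre_c @ w)))
+     >>> float(att)
+     1.0
+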
+     References
+     ----------
+     Arkhangelsky, D., Athey, S., Hirshberg, D. A., Imbens, G. W., & Wager, S.
+     (2021). Synthetic Difference-in-Differences. American Economic Review,
+     111(12), 4088-4118.
+     """
+
+     def __init__(
+         self,
+         lambda_reg: float = 0.0,
+         zeta: float = 1.0,
+         alpha: float = 0.05,
+         variance_method: str = "bootstrap",
+         n_bootstrap: int = 200,
+         seed: Optional[int] = None
+     ):
+         super().__init__(robust=True, cluster=None, alpha=alpha)
+         self.lambda_reg = lambda_reg
+         self.zeta = zeta
+         self.variance_method = variance_method
+         self.n_bootstrap = n_bootstrap
+         self.seed = seed
+
+         # Validate variance_method
+         valid_methods = ("bootstrap", "placebo")
+         if variance_method not in valid_methods:
+             raise ValueError(
+                 f"variance_method must be one of {valid_methods}, "
+                 f"got '{variance_method}'"
+             )
+
+         self._unit_weights = None
+         self._time_weights = None
+
+     def fit(  # type: ignore[override]
+         self,
+         data: pd.DataFrame,
+         outcome: str,
+         treatment: str,
+         unit: str,
+         time: str,
+         post_periods: Optional[List[Any]] = None,
+         covariates: Optional[List[str]] = None
+     ) -> SyntheticDiDResults:
+         """
+         Fit the Synthetic Difference-in-Differences model.
+
+         Parameters
+         ----------
+         data : pd.DataFrame
+             Panel data with observations for multiple units over multiple
+             time periods.
+         outcome : str
+             Name of the outcome variable column.
+         treatment : str
+             Name of the treatment group indicator column (0/1).
+             Should be 1 for all observations of treated units
+             (both pre and post treatment).
+         unit : str
+             Name of the unit identifier column.
+         time : str
+             Name of the time period column.
+         post_periods : list, optional
+             List of time period values that are post-treatment.
+             If None, uses the last half of periods.
+         covariates : list, optional
+             List of covariate column names. Covariates are residualized
+             out before computing the SDID estimator.
+
+         Returns
+         -------
+         SyntheticDiDResults
+             Object containing the ATT estimate, standard error,
+             unit weights, and time weights.
+
+         Raises
+         ------
+         ValueError
+             If required parameters are missing or data validation fails.
+         """
+         # Validate inputs
+         if outcome is None or treatment is None or unit is None or time is None:
+             raise ValueError(
+                 "Must provide 'outcome', 'treatment', 'unit', and 'time'"
+             )
+
+         # Check columns exist
+         required_cols = [outcome, treatment, unit, time]
+         if covariates:
+             required_cols.extend(covariates)
+
+         missing = [c for c in required_cols if c not in data.columns]
+         if missing:
+             raise ValueError(f"Missing columns: {missing}")
+
+         # Validate treatment is binary
+         validate_binary(data[treatment].values, "treatment")
+
+         # Get all unique time periods
+         all_periods = sorted(data[time].unique())
+
+         if len(all_periods) < 2:
+             raise ValueError("Need at least 2 time periods")
+
+         # Determine pre and post periods
+         if post_periods is None:
+             mid = len(all_periods) // 2
+             post_periods = list(all_periods[mid:])
+             pre_periods = list(all_periods[:mid])
+         else:
+             post_periods = list(post_periods)
+             pre_periods = [p for p in all_periods if p not in post_periods]
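+         # Illustrative example (hypothetical values): with all_periods [1, 2, 3, 4]
+         # and post_periods=None, this split gives pre=[1, 2] and post=[3, 4];
+         # with post_periods=[4] it gives pre=[1, 2, 3] and post=[4].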
+
+         if len(post_periods) == 0:
+             raise ValueError("Must have at least one post-treatment period")
+         if len(pre_periods) == 0:
+             raise ValueError("Must have at least one pre-treatment period")
+
+         # Validate post_periods are in data
+         for p in post_periods:
+             if p not in all_periods:
+                 raise ValueError(f"Post-period '{p}' not found in time column")
+
+         # Identify treated and control units
+         # Treatment indicator should be constant within unit
+         unit_treatment = data.groupby(unit)[treatment].first()
+         treated_units = unit_treatment[unit_treatment == 1].index.tolist()
+         control_units = unit_treatment[unit_treatment == 0].index.tolist()
+
+         if len(treated_units) == 0:
+             raise ValueError("No treated units found")
+         if len(control_units) == 0:
+             raise ValueError("No control units found")
+
+         # Residualize covariates if provided
+         working_data = data.copy()
+         if covariates:
+             working_data = self._residualize_covariates(
+                 working_data, outcome, covariates, unit, time
+             )
+
+         # Create outcome matrices
+         # Shape: (n_periods, n_units)
+         Y_pre_control, Y_post_control, Y_pre_treated, Y_post_treated = \
+             self._create_outcome_matrices(
+                 working_data, outcome, unit, time,
+                 pre_periods, post_periods, treated_units, control_units
+             )
+
+         # Compute unit weights (synthetic control weights)
+         # Average treated outcomes across treated units
+         Y_pre_treated_mean = np.mean(Y_pre_treated, axis=1)
+
+         unit_weights = compute_synthetic_weights(
+             Y_pre_control,
+             Y_pre_treated_mean,
+             lambda_reg=self.lambda_reg
+         )
+
+         # Compute time weights
+         time_weights = compute_time_weights(
+             Y_pre_control,
+             Y_pre_treated_mean,
+             zeta=self.zeta
+         )
+
+         # Compute SDID estimate
+         Y_post_treated_mean = np.mean(Y_post_treated, axis=1)
+
+         att = compute_sdid_estimator(
+             Y_pre_control,
+             Y_post_control,
+             Y_pre_treated_mean,
+             Y_post_treated_mean,
+             unit_weights,
+             time_weights
+         )
+
+         # Compute pre-treatment fit (RMSE)
+         synthetic_pre = Y_pre_control @ unit_weights
+         pre_fit_rmse = np.sqrt(np.mean((Y_pre_treated_mean - synthetic_pre) ** 2))
+
+         # Compute standard errors based on variance_method
+         if self.variance_method == "bootstrap":
+             se, bootstrap_estimates = self._bootstrap_se(
+                 working_data, outcome, unit, time,
+                 pre_periods, post_periods, treated_units, control_units
+             )
+             placebo_effects = bootstrap_estimates
+             inference_method = "bootstrap"
+         else:
+             # Use placebo-based variance (R's synthdid Algorithm 4)
+             se, placebo_effects = self._placebo_variance_se(
+                 Y_pre_control,
+                 Y_post_control,
+                 Y_pre_treated_mean,
+                 Y_post_treated_mean,
+                 unit_weights,
+                 time_weights,
+                 n_treated=len(treated_units),
+                 replications=self.n_bootstrap  # Reuse n_bootstrap for replications
+             )
+             inference_method = "placebo"
+
+         # Compute test statistics
+         if se > 0:
+             t_stat = att / se
+             # Use placebo distribution for p-value if available
+             if len(placebo_effects) > 0:
+                 # Two-sided p-value from placebo distribution
+                 p_value = np.mean(np.abs(placebo_effects) >= np.abs(att))
+                 p_value = max(p_value, 1.0 / (len(placebo_effects) + 1))
+             else:
+                 p_value = compute_p_value(t_stat)
+         else:
+             t_stat = 0.0
+             p_value = 1.0
+
+         # Confidence interval
+         conf_int = compute_confidence_interval(att, se, self.alpha)
+
+         # Create weight dictionaries
+         unit_weights_dict = {
+             unit_id: w for unit_id, w in zip(control_units, unit_weights)
+         }
+         time_weights_dict = {
+             period: w for period, w in zip(pre_periods, time_weights)
+         }
+
+         # Store results
+         self.results_ = SyntheticDiDResults(
+             att=att,
+             se=se,
+             t_stat=t_stat,
+             p_value=p_value,
+             conf_int=conf_int,
+             n_obs=len(data),
+             n_treated=len(treated_units),
+             n_control=len(control_units),
+             unit_weights=unit_weights_dict,
+             time_weights=time_weights_dict,
+             pre_periods=pre_periods,
+             post_periods=post_periods,
+             alpha=self.alpha,
+             variance_method=inference_method,
+             lambda_reg=self.lambda_reg,
+             pre_treatment_fit=pre_fit_rmse,
+             placebo_effects=placebo_effects if len(placebo_effects) > 0 else None,
+             n_bootstrap=self.n_bootstrap if inference_method == "bootstrap" else None
+         )
+
+         self._unit_weights = unit_weights
+         self._time_weights = time_weights
+         self.is_fitted_ = True
+
+         return self.results_
+
+     def _create_outcome_matrices(
+         self,
+         data: pd.DataFrame,
+         outcome: str,
+         unit: str,
+         time: str,
+         pre_periods: List[Any],
+         post_periods: List[Any],
+         treated_units: List[Any],
+         control_units: List[Any]
+     ) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
+         """
+         Create outcome matrices for SDID estimation.
+
+         Returns
+         -------
+         tuple
+             (Y_pre_control, Y_post_control, Y_pre_treated, Y_post_treated)
+             Each is a 2D array with shape (n_periods, n_units)
+         """
+         # Pivot data to wide format
+         pivot = data.pivot(index=time, columns=unit, values=outcome)
+
+         # Extract submatrices
+         Y_pre_control = pivot.loc[pre_periods, control_units].values
+         Y_post_control = pivot.loc[post_periods, control_units].values
+         Y_pre_treated = pivot.loc[pre_periods, treated_units].values
+         Y_post_treated = pivot.loc[post_periods, treated_units].values
+
+         return (
+             Y_pre_control.astype(float),
+             Y_post_control.astype(float),
+             Y_pre_treated.astype(float),
+             Y_post_treated.astype(float)
+         )
+
+     def _residualize_covariates(
+         self,
+         data: pd.DataFrame,
+         outcome: str,
+         covariates: List[str],
+         unit: str,
+         time: str
+     ) -> pd.DataFrame:
+         """
+         Residualize outcome by regressing out covariates.
+
+         Uses two-way fixed effects to partial out covariates.
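+
+         Schematically, the auxiliary regression (with an intercept) is
+
+             Y_it = X_it * β + α_i + γ_t + ε_it
+
+         and the working outcome becomes ε̂_it + mean(Y), i.e. the residual
+         with the overall mean added back.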
+         """
+         data = data.copy()
+
+         # Create design matrix with covariates
+         X = data[covariates].values.astype(float)
+
+         # Add unit and time dummies
+         unit_dummies = pd.get_dummies(data[unit], prefix='u', drop_first=True)
+         time_dummies = pd.get_dummies(data[time], prefix='t', drop_first=True)
+
+         X_full = np.column_stack([
+             np.ones(len(data)),
+             X,
+             unit_dummies.values,
+             time_dummies.values
+         ])
+
+         y = data[outcome].values.astype(float)
+
+         # Fit and get residuals using unified backend
+         coeffs, residuals, _ = solve_ols(X_full, y, return_vcov=False)
+
+         # Add back the mean for interpretability
+         data[outcome] = residuals + np.mean(y)
+
+         return data
+
+     def _bootstrap_se(
+         self,
+         data: pd.DataFrame,
+         outcome: str,
+         unit: str,
+         time: str,
+         pre_periods: List[Any],
+         post_periods: List[Any],
+         treated_units: List[Any],
+         control_units: List[Any]
+     ) -> Tuple[float, np.ndarray]:
+         """
+         Compute bootstrap standard error.
+
+         Uses block bootstrap at the unit level.
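+
+         Each replication resamples unit labels with replacement, keeping
+         every unit's full time series intact, and re-runs the entire SDID
+         pipeline (weights and estimate), so the standard error reflects
+         uncertainty in the weights as well as in the point estimate.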
+         """
+         rng = np.random.default_rng(self.seed)
+
+         all_units = treated_units + control_units
+         n_units = len(all_units)
+
+         bootstrap_estimates = []
+
+         for _ in range(self.n_bootstrap):
+             # Sample units with replacement
+             sampled_units = rng.choice(all_units, size=n_units, replace=True)
+
+             # Create bootstrap sample
+             boot_data = pd.concat([
+                 data[data[unit] == u].assign(**{unit: f"{u}_{i}"})
+                 for i, u in enumerate(sampled_units)
+             ], ignore_index=True)
+
+             # Identify treated/control in bootstrap sample
+             boot_treated = [
+                 f"{u}_{i}" for i, u in enumerate(sampled_units)
+                 if u in treated_units
+             ]
+             boot_control = [
+                 f"{u}_{i}" for i, u in enumerate(sampled_units)
+                 if u in control_units
+             ]
+
+             if len(boot_treated) == 0 or len(boot_control) == 0:
+                 continue
+
+             try:
+                 # Create matrices
+                 Y_pre_c, Y_post_c, Y_pre_t, Y_post_t = self._create_outcome_matrices(
+                     boot_data, outcome, unit, time,
+                     pre_periods, post_periods, boot_treated, boot_control
+                 )
+
+                 # Compute weights
+                 Y_pre_t_mean = np.mean(Y_pre_t, axis=1)
+                 Y_post_t_mean = np.mean(Y_post_t, axis=1)
+
+                 w = compute_synthetic_weights(Y_pre_c, Y_pre_t_mean, self.lambda_reg)
+                 t_w = compute_time_weights(Y_pre_c, Y_pre_t_mean, self.zeta)
+
+                 # Compute estimate
+                 tau = compute_sdid_estimator(
+                     Y_pre_c, Y_post_c, Y_pre_t_mean, Y_post_t_mean, w, t_w
+                 )
+                 bootstrap_estimates.append(tau)
+
+             except (ValueError, LinAlgError, KeyError):
+                 # Skip failed bootstrap iterations (e.g., singular matrices,
+                 # missing data in resampled units, or invalid weight computations)
+                 continue
+
+         bootstrap_estimates = np.array(bootstrap_estimates)
+
+         # Check bootstrap success rate and handle failures appropriately
+         n_successful = len(bootstrap_estimates)
+         failure_rate = 1 - (n_successful / self.n_bootstrap)
+
+         if n_successful == 0:
+             raise ValueError(
+                 f"All {self.n_bootstrap} bootstrap iterations failed. "
+                 f"This typically occurs when:\n"
+                 f" - Sample size is too small for reliable resampling\n"
+                 f" - Weight matrices are singular or near-singular\n"
+                 f" - Insufficient pre-treatment periods for weight estimation\n"
+                 f" - Too few control units relative to treated units\n"
+ f"Consider using n_bootstrap=0 to disable bootstrap inference "
542
+ f"and rely on placebo-based standard errors, or increase "
543
+ f"the regularization parameters (lambda_reg, zeta)."
+             )
+         elif n_successful == 1:
+             warnings.warn(
+                 f"Only 1/{self.n_bootstrap} bootstrap iteration succeeded. "
+                 f"Standard error cannot be computed reliably (requires at least 2). "
+                 f"Returning SE=0.0. Consider the suggestions above for improving "
+                 f"bootstrap convergence.",
+                 UserWarning,
+                 stacklevel=2,
+             )
+             se = 0.0
+         elif failure_rate > 0.05:
+             warnings.warn(
+                 f"Only {n_successful}/{self.n_bootstrap} bootstrap iterations succeeded "
+                 f"({failure_rate:.1%} failure rate). Standard errors may be unreliable. "
+                 f"This can occur with small samples, near-singular weight matrices, "
+                 f"or insufficient pre-treatment periods.",
+                 UserWarning,
+                 stacklevel=2,
+             )
+             se = np.std(bootstrap_estimates, ddof=1)
+         else:
+             se = np.std(bootstrap_estimates, ddof=1)
+
+         return se, bootstrap_estimates
+
+     def _placebo_variance_se(
+         self,
+         Y_pre_control: np.ndarray,
+         Y_post_control: np.ndarray,
+         Y_pre_treated_mean: np.ndarray,
+         Y_post_treated_mean: np.ndarray,
+         unit_weights: np.ndarray,
+         time_weights: np.ndarray,
+         n_treated: int,
+         replications: int = 200
+     ) -> Tuple[float, np.ndarray]:
+         """
+         Compute placebo-based variance matching R's synthdid methodology.
+
+         This implements Algorithm 4 from Arkhangelsky et al. (2021),
+         matching R's synthdid::vcov(method = "placebo"):
+
+         1. Randomly sample N₀ control indices (permutation)
+         2. Designate last N₁ as pseudo-treated, first (N₀-N₁) as pseudo-controls
+         3. Renormalize original unit weights for pseudo-controls
+         4. Compute SDID estimate using renormalized weights
+         5. Repeat `replications` times
+         6. SE = sqrt((r-1)/r) * sd(estimates)
+
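+         The SE in step 6 equals the population (ddof=0) standard deviation
+         of the placebo estimates. A quick numeric sketch with made-up
+         placebo estimates:
+
+         >>> import numpy as np
+         >>> taus = np.array([0.2, -0.1, 0.4, 0.1])  # hypothetical estimates
+         >>> r = len(taus)
+         >>> se = np.sqrt((r - 1) / r) * np.std(taus, ddof=1)
+         >>> bool(np.isclose(se, np.std(taus)))
+         True
+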
+         Parameters
+         ----------
+         Y_pre_control : np.ndarray
+             Control outcomes in pre-treatment periods, shape (n_pre, n_control).
+         Y_post_control : np.ndarray
+             Control outcomes in post-treatment periods, shape (n_post, n_control).
+         Y_pre_treated_mean : np.ndarray
+             Mean treated outcomes in pre-treatment periods, shape (n_pre,).
+         Y_post_treated_mean : np.ndarray
+             Mean treated outcomes in post-treatment periods, shape (n_post,).
+         unit_weights : np.ndarray
+             Original unit weights from main estimation, shape (n_control,).
+         time_weights : np.ndarray
+             Time weights from main estimation, shape (n_pre,).
+         n_treated : int
+             Number of treated units in the original estimation.
+         replications : int, default=200
+             Number of placebo replications.
+
+         Returns
+         -------
+         tuple
+             (se, placebo_effects) where se is the standard error and
+             placebo_effects is the array of placebo treatment effects.
+
+         References
+         ----------
+         Arkhangelsky, D., Athey, S., Hirshberg, D. A., Imbens, G. W., & Wager, S.
+         (2021). Synthetic Difference-in-Differences. American Economic Review,
+         111(12), 4088-4118. Algorithm 4.
+         """
+         rng = np.random.default_rng(self.seed)
+         n_pre, n_control = Y_pre_control.shape
+
+         # Ensure we have enough controls for the split
+         n_pseudo_control = n_control - n_treated
+         if n_pseudo_control < 1:
+             warnings.warn(
+                 f"Not enough control units ({n_control}) for placebo variance "
+                 f"estimation with {n_treated} treated units. "
+                 f"Consider using variance_method='bootstrap'.",
+                 UserWarning,
+                 stacklevel=3,
+             )
+             return 0.0, np.array([])
+
+         placebo_estimates = []
+
+         for _ in range(replications):
+             try:
+                 # Random permutation of control indices (Algorithm 4, step 1)
+                 perm = rng.permutation(n_control)
+
+                 # Split into pseudo-controls and pseudo-treated (step 2)
+                 pseudo_control_idx = perm[:n_pseudo_control]
+                 pseudo_treated_idx = perm[n_pseudo_control:]
+
+                 # Renormalize original weights for pseudo-controls (step 3)
+                 # This keeps the relative importance from the main estimation
+                 pseudo_weights = unit_weights[pseudo_control_idx]
+                 weight_sum = pseudo_weights.sum()
+                 if weight_sum > 0:
+                     pseudo_weights = pseudo_weights / weight_sum
+                 else:
+                     # Fallback to uniform if weights sum to zero
+                     pseudo_weights = np.ones(n_pseudo_control) / n_pseudo_control
+
+                 # Get pseudo-treated outcomes (mean across pseudo-treated units)
+                 Y_pre_pseudo_treated = np.mean(
+                     Y_pre_control[:, pseudo_treated_idx], axis=1
+                 )
+                 Y_post_pseudo_treated = np.mean(
+                     Y_post_control[:, pseudo_treated_idx], axis=1
+                 )
+
+                 # Get pseudo-control outcomes
+                 Y_pre_pseudo_control = Y_pre_control[:, pseudo_control_idx]
+                 Y_post_pseudo_control = Y_post_control[:, pseudo_control_idx]
+
+                 # Compute placebo SDID estimate (step 4)
+                 tau = compute_sdid_estimator(
+                     Y_pre_pseudo_control,
+                     Y_post_pseudo_control,
+                     Y_pre_pseudo_treated,
+                     Y_post_pseudo_treated,
+                     pseudo_weights,
+                     time_weights
+                 )
+                 placebo_estimates.append(tau)
+
+             except (ValueError, LinAlgError, ZeroDivisionError):
+                 # Skip failed iterations
+                 continue
+
+         placebo_estimates = np.array(placebo_estimates)
+         n_successful = len(placebo_estimates)
+
+         if n_successful < 2:
+             warnings.warn(
+                 f"Only {n_successful} placebo replications completed successfully. "
+                 f"Standard error cannot be estimated reliably. "
+                 f"Consider using variance_method='bootstrap' or increasing "
+                 f"the number of control units.",
+                 UserWarning,
+                 stacklevel=3,
+             )
+             return 0.0, placebo_estimates
+
+         # Warn if many replications failed
+         failure_rate = 1 - (n_successful / replications)
+         if failure_rate > 0.05:
+             warnings.warn(
+                 f"Only {n_successful}/{replications} placebo replications succeeded "
+                 f"({failure_rate:.1%} failure rate). Standard errors may be unreliable.",
+                 UserWarning,
+                 stacklevel=3,
+             )
+
+         # Compute SE using R's formula: sqrt((r-1)/r) * sd(estimates)
+         # This matches synthdid::vcov.R exactly
+         se = np.sqrt((n_successful - 1) / n_successful) * np.std(
+             placebo_estimates, ddof=1
+         )
+
+         return se, placebo_estimates
+
+     def get_params(self) -> Dict[str, Any]:
+         """Get estimator parameters."""
+         return {
+             "lambda_reg": self.lambda_reg,
+             "zeta": self.zeta,
+             "alpha": self.alpha,
+             "variance_method": self.variance_method,
+             "n_bootstrap": self.n_bootstrap,
+             "seed": self.seed,
+         }
+
+     def set_params(self, **params) -> "SyntheticDiD":
+         """Set estimator parameters."""
+         for key, value in params.items():
+             if hasattr(self, key):
+                 setattr(self, key, value)
+             else:
+                 raise ValueError(f"Unknown parameter: {key}")
+         return self
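
For orientation, a minimal end-to-end sketch of exercising this estimator on simulated panel data (illustrative only: the column names, sizes, and true effect are made up, and it assumes the diff_diff package from this diff is installed):

    import numpy as np
    import pandas as pd
    from diff_diff import SyntheticDiD

    rng = np.random.default_rng(42)
    rows = []
    for u in range(20):
        treated = int(u < 4)               # first 4 units are ever-treated
        base = rng.normal(0.0, 1.0)        # unit-specific level
        for t in range(8):
            y = base + 0.5 * t + rng.normal(0.0, 0.1)
            if treated and t >= 4:
                y += 2.0                   # true ATT of 2.0 in post periods
            rows.append({'unit': u, 'period': t, 'outcome': y,
                         'treated': treated})
    data = pd.DataFrame(rows)

    sdid = SyntheticDiD(seed=0)
    results = sdid.fit(
        data,
        outcome='outcome',
        treatment='treated',
        unit='unit',
        time='period',
        post_periods=[4, 5, 6, 7],
    )
    results.print_summary()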