diff-diff 3.0.1__cp314-cp314-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62) hide show
  1. diff_diff/__init__.py +382 -0
  2. diff_diff/_backend.py +134 -0
  3. diff_diff/_rust_backend.cp314-win_amd64.pyd +0 -0
  4. diff_diff/bacon.py +1140 -0
  5. diff_diff/bootstrap_utils.py +730 -0
  6. diff_diff/continuous_did.py +1626 -0
  7. diff_diff/continuous_did_bspline.py +190 -0
  8. diff_diff/continuous_did_results.py +374 -0
  9. diff_diff/datasets.py +815 -0
  10. diff_diff/diagnostics.py +882 -0
  11. diff_diff/efficient_did.py +1770 -0
  12. diff_diff/efficient_did_bootstrap.py +359 -0
  13. diff_diff/efficient_did_covariates.py +899 -0
  14. diff_diff/efficient_did_results.py +368 -0
  15. diff_diff/efficient_did_weights.py +617 -0
  16. diff_diff/estimators.py +1501 -0
  17. diff_diff/honest_did.py +2585 -0
  18. diff_diff/imputation.py +2458 -0
  19. diff_diff/imputation_bootstrap.py +418 -0
  20. diff_diff/imputation_results.py +448 -0
  21. diff_diff/linalg.py +2538 -0
  22. diff_diff/power.py +2588 -0
  23. diff_diff/practitioner.py +869 -0
  24. diff_diff/prep.py +1738 -0
  25. diff_diff/prep_dgp.py +1718 -0
  26. diff_diff/pretrends.py +1105 -0
  27. diff_diff/results.py +918 -0
  28. diff_diff/stacked_did.py +1049 -0
  29. diff_diff/stacked_did_results.py +339 -0
  30. diff_diff/staggered.py +3895 -0
  31. diff_diff/staggered_aggregation.py +864 -0
  32. diff_diff/staggered_bootstrap.py +752 -0
  33. diff_diff/staggered_results.py +416 -0
  34. diff_diff/staggered_triple_diff.py +1545 -0
  35. diff_diff/staggered_triple_diff_results.py +416 -0
  36. diff_diff/sun_abraham.py +1685 -0
  37. diff_diff/survey.py +1981 -0
  38. diff_diff/synthetic_did.py +1136 -0
  39. diff_diff/triple_diff.py +2047 -0
  40. diff_diff/trop.py +952 -0
  41. diff_diff/trop_global.py +1270 -0
  42. diff_diff/trop_local.py +1307 -0
  43. diff_diff/trop_results.py +356 -0
  44. diff_diff/twfe.py +542 -0
  45. diff_diff/two_stage.py +1952 -0
  46. diff_diff/two_stage_bootstrap.py +520 -0
  47. diff_diff/two_stage_results.py +400 -0
  48. diff_diff/utils.py +1902 -0
  49. diff_diff/visualization/__init__.py +61 -0
  50. diff_diff/visualization/_common.py +328 -0
  51. diff_diff/visualization/_continuous.py +274 -0
  52. diff_diff/visualization/_diagnostic.py +817 -0
  53. diff_diff/visualization/_event_study.py +1086 -0
  54. diff_diff/visualization/_power.py +661 -0
  55. diff_diff/visualization/_staggered.py +833 -0
  56. diff_diff/visualization/_synthetic.py +197 -0
  57. diff_diff/wooldridge.py +1285 -0
  58. diff_diff/wooldridge_results.py +349 -0
  59. diff_diff-3.0.1.dist-info/METADATA +2997 -0
  60. diff_diff-3.0.1.dist-info/RECORD +62 -0
  61. diff_diff-3.0.1.dist-info/WHEEL +4 -0
  62. diff_diff-3.0.1.dist-info/sboms/diff_diff_rust.cyclonedx.json +5843 -0
@@ -0,0 +1,730 @@
1
+ """
2
+ Shared bootstrap utilities for multiplier bootstrap inference.
3
+
4
+ Provides weight generation, percentile CI, and p-value helpers used by
5
+ both CallawaySantAnna and ContinuousDiD estimators.
6
+ """
7
+
8
+ import warnings
9
+ from typing import Optional, Tuple
10
+
11
+ import numpy as np
12
+
13
+ from diff_diff._backend import HAS_RUST_BACKEND, _rust_bootstrap_weights
14
+
15
+ __all__ = [
16
+ "generate_bootstrap_weights",
17
+ "generate_bootstrap_weights_batch",
18
+ "generate_bootstrap_weights_batch_numpy",
19
+ "generate_survey_multiplier_weights_batch",
20
+ "generate_rao_wu_weights",
21
+ "generate_rao_wu_weights_batch",
22
+ "compute_percentile_ci",
23
+ "compute_bootstrap_pvalue",
24
+ "compute_effect_bootstrap_stats",
25
+ "compute_effect_bootstrap_stats_batch",
26
+ ]
27
+
28
+
29
def generate_bootstrap_weights(
    n_units: int,
    weight_type: str,
    rng: np.random.Generator,
) -> np.ndarray:
    """
    Draw a single vector of multiplier-bootstrap weights.

    Parameters
    ----------
    n_units : int
        Number of units (clusters) that each need one weight.
    weight_type : str
        Multiplier distribution: "rademacher", "mammen", or "webb".
    rng : np.random.Generator
        Random number generator the draws are taken from.

    Returns
    -------
    np.ndarray
        Weights with shape (n_units,).

    Raises
    ------
    ValueError
        If ``weight_type`` is not one of the three supported names.
    """
    if weight_type == "rademacher":
        # Two-point distribution on {-1, +1}; rng.choice is uniform by default.
        return rng.choice([-1.0, 1.0], size=n_units)

    if weight_type == "mammen":
        # Two-point distribution with support built from sqrt(5); the
        # lower point is drawn with probability (sqrt5 + 1) / (2 * sqrt5).
        root5 = np.sqrt(5)
        low_point = -(root5 - 1) / 2
        high_point = (root5 + 1) / 2
        prob_low = (root5 + 1) / (2 * root5)
        return rng.choice(
            [low_point, high_point], size=n_units, p=[prob_low, 1 - prob_low]
        )

    if weight_type == "webb":
        # Six equally likely points: +/-sqrt(1/2), +/-sqrt(2/2), +/-sqrt(3/2),
        # laid out from most negative to most positive.
        magnitudes = [np.sqrt(k / 2) for k in (1, 2, 3)]
        support = np.array([-m for m in reversed(magnitudes)] + magnitudes)
        return rng.choice(support, size=n_units)

    raise ValueError(
        f"weight_type must be 'rademacher', 'mammen', or 'webb', got '{weight_type}'"
    )
75
+
76
+
77
def generate_bootstrap_weights_batch(
    n_bootstrap: int,
    n_units: int,
    weight_type: str,
    rng: np.random.Generator,
) -> np.ndarray:
    """
    Produce the full matrix of bootstrap weights in one call.

    Dispatches to the Rust backend when it is present, otherwise to the
    NumPy implementation.

    Parameters
    ----------
    n_bootstrap : int
        Number of bootstrap iterations.
    n_units : int
        Number of units (clusters) that each need a weight.
    weight_type : str
        Multiplier distribution: "rademacher", "mammen", or "webb".
    rng : np.random.Generator
        Random number generator. On the Rust path it is only used to draw
        the integer seed handed to the backend.

    Returns
    -------
    np.ndarray
        Array of bootstrap weights with shape (n_bootstrap, n_units).
    """
    rust_usable = HAS_RUST_BACKEND and _rust_bootstrap_weights is not None
    if not rust_usable:
        return generate_bootstrap_weights_batch_numpy(n_bootstrap, n_units, weight_type, rng)

    # Derive the backend seed from the caller's generator so the result
    # still depends on the generator state.
    backend_seed = rng.integers(0, 2**63 - 1)
    return _rust_bootstrap_weights(n_bootstrap, n_units, weight_type, backend_seed)
108
+
109
+
110
def generate_bootstrap_weights_batch_numpy(
    n_bootstrap: int,
    n_units: int,
    weight_type: str,
    rng: np.random.Generator,
) -> np.ndarray:
    """
    Pure-NumPy implementation of :func:`generate_bootstrap_weights_batch`.

    Parameters
    ----------
    n_bootstrap : int
        Number of bootstrap iterations (rows of the result).
    n_units : int
        Number of units (clusters) per iteration (columns of the result).
    weight_type : str
        Multiplier distribution: "rademacher", "mammen", or "webb".
    rng : np.random.Generator
        Random number generator the draws are taken from.

    Returns
    -------
    np.ndarray
        Array of bootstrap weights with shape (n_bootstrap, n_units).

    Raises
    ------
    ValueError
        If ``weight_type`` is not one of the three supported names.
    """
    shape = (n_bootstrap, n_units)

    if weight_type == "rademacher":
        # Two-point distribution on {-1, +1}; rng.choice is uniform by default.
        return rng.choice([-1.0, 1.0], size=shape)

    if weight_type == "mammen":
        # Two-point distribution with support built from sqrt(5); the
        # lower point is drawn with probability (sqrt5 + 1) / (2 * sqrt5).
        root5 = np.sqrt(5)
        low_point = -(root5 - 1) / 2
        high_point = (root5 + 1) / 2
        prob_low = (root5 + 1) / (2 * root5)
        return rng.choice([low_point, high_point], size=shape, p=[prob_low, 1 - prob_low])

    if weight_type == "webb":
        # Six equally likely points: +/-sqrt(1/2), +/-sqrt(2/2), +/-sqrt(3/2),
        # laid out from most negative to most positive.
        magnitudes = [np.sqrt(k / 2) for k in (1, 2, 3)]
        support = np.array([-m for m in reversed(magnitudes)] + magnitudes)
        return rng.choice(support, size=shape)

    raise ValueError(
        f"weight_type must be 'rademacher', 'mammen', or 'webb', got '{weight_type}'"
    )
159
+
160
+
161
def compute_percentile_ci(
    boot_dist: np.ndarray,
    alpha: float,
) -> Tuple[float, float]:
    """
    Return the equal-tailed percentile interval of a bootstrap distribution.

    Parameters
    ----------
    boot_dist : np.ndarray
        Bootstrap distribution (1-D array).
    alpha : float
        Significance level (e.g., 0.05 for a 95% interval); ``alpha / 2``
        of the mass is cut from each tail.

    Returns
    -------
    tuple of float
        ``(lower, upper)`` confidence interval bounds.
    """
    lower_q = alpha / 2 * 100
    upper_q = (1 - alpha / 2) * 100
    bounds = (
        float(np.percentile(boot_dist, lower_q)),
        float(np.percentile(boot_dist, upper_q)),
    )
    return bounds
183
+
184
+
185
def compute_bootstrap_pvalue(
    original_effect: float,
    boot_dist: np.ndarray,
    n_valid: Optional[int] = None,
) -> float:
    """
    Two-sided percentile-method p-value from a bootstrap distribution.

    The one-sided share is the fraction of bootstrap draws on the opposite
    side of zero from the point estimate (zero included). It is doubled,
    capped at 1, and floored at ``1 / (n + 1)``.

    Parameters
    ----------
    original_effect : float
        Original point estimate.
    boot_dist : np.ndarray
        Bootstrap distribution of the effect.
    n_valid : int, optional
        Number of valid bootstrap samples used for the p-value floor.
        If None, ``len(boot_dist)`` is used.

    Returns
    -------
    float
        Two-sided bootstrap p-value.
    """
    crosses_zero = (boot_dist <= 0) if original_effect >= 0 else (boot_dist >= 0)
    tail_share = np.mean(crosses_zero)

    floor_n = len(boot_dist) if n_valid is None else n_valid
    capped = min(2 * tail_share, 1.0)
    # The floor keeps the p-value strictly positive even when no draw
    # crosses zero.
    return float(max(capped, 1 / (floor_n + 1)))
217
+
218
+
219
def compute_effect_bootstrap_stats(
    original_effect: float,
    boot_dist: np.ndarray,
    alpha: float = 0.05,
    context: str = "bootstrap distribution",
) -> Tuple[float, Tuple[float, float], float]:
    """
    Summarise one effect's bootstrap distribution as SE, CI and p-value.

    Non-finite bootstrap draws are discarded with a warning. All three
    statistics are returned as NaN when the point estimate is non-finite,
    when fewer than 50% of the draws are finite, or when the resulting
    standard error is zero or non-finite.

    Parameters
    ----------
    original_effect : float
        Original point estimate.
    boot_dist : np.ndarray
        Bootstrap distribution of the effect.
    alpha : float, default=0.05
        Significance level for the percentile interval.
    context : str, optional
        Label inserted into warning messages.

    Returns
    -------
    se : float
        Bootstrap standard error (sample standard deviation, ``ddof=1``).
    ci : tuple of float
        Percentile confidence interval.
    p_value : float
        Bootstrap p-value.
    """
    all_nan = (np.nan, (np.nan, np.nan), np.nan)

    if not np.isfinite(original_effect):
        return all_nan

    is_finite = np.isfinite(boot_dist)
    n_total = len(boot_dist)
    n_valid = np.sum(is_finite)

    if n_valid < n_total:
        n_nonfinite = n_total - n_valid
        warnings.warn(
            f"Dropping {n_nonfinite}/{n_total} non-finite bootstrap samples "
            f"in {context}. Bootstrap estimates based on remaining valid samples.",
            RuntimeWarning,
            stacklevel=3,
        )

    if n_valid < n_total * 0.5:
        warnings.warn(
            f"Too few valid bootstrap samples ({n_valid}/{n_total}) in {context}. "
            "Returning NaN for SE/CI/p-value to signal invalid inference.",
            RuntimeWarning,
            stacklevel=3,
        )
        return all_nan

    usable = boot_dist[is_finite]
    se = float(np.std(usable, ddof=1))

    # A zero or non-finite SE means no usable spread, so every inference
    # field is reported as NaN rather than a misleading number.
    se_is_usable = np.isfinite(se) and se > 0
    if not se_is_usable:
        warnings.warn(
            f"Bootstrap SE is non-finite or zero (n_valid={n_valid}) in {context}. "
            "Returning NaN for SE/CI/p-value.",
            RuntimeWarning,
            stacklevel=3,
        )
        return all_nan

    interval = compute_percentile_ci(usable, alpha)
    p_value = compute_bootstrap_pvalue(original_effect, usable, n_valid=len(usable))
    return se, interval, p_value
292
+
293
+
294
def compute_effect_bootstrap_stats_batch(
    original_effects: np.ndarray,
    bootstrap_matrix: np.ndarray,
    alpha: float = 0.05,
) -> tuple:
    """
    Batch-compute bootstrap statistics for multiple effects at once.

    Effects whose bootstrap column is entirely finite are processed with
    vectorized NumPy operations. Effects with some (but fewer than half)
    non-finite draws are delegated one at a time to
    :func:`compute_effect_bootstrap_stats`. Effects with a non-finite point
    estimate, too few finite draws, or a zero/non-finite standard error keep
    NaN in every output.

    Parameters
    ----------
    original_effects : np.ndarray
        Original point estimates, shape (n_effects,). Any array-like is
        accepted.
    bootstrap_matrix : np.ndarray
        Bootstrap distributions, shape (n_bootstrap, n_effects).
    alpha : float, default=0.05
        Significance level for the percentile intervals.

    Returns
    -------
    ses : np.ndarray
        Bootstrap SEs for each effect.
    ci_lowers : np.ndarray
        Lower CI bounds for each effect.
    ci_uppers : np.ndarray
        Upper CI bounds for each effect.
    p_values : np.ndarray
        Bootstrap p-values for each effect.
    """
    n_bootstrap, n_effects = bootstrap_matrix.shape
    effects = np.asarray(original_effects)

    ses = np.full(n_effects, np.nan)
    ci_lowers = np.full(n_effects, np.nan)
    ci_uppers = np.full(n_effects, np.nan)
    p_values = np.full(n_effects, np.nan)

    # Nothing to do when every point estimate is non-finite.
    valid_effects = np.isfinite(effects)
    if not np.any(valid_effects):
        return ses, ci_lowers, ci_uppers, p_values

    # Count finite bootstrap draws per effect.
    finite_mask = np.isfinite(bootstrap_matrix)  # (n_bootstrap, n_effects)
    n_valid = finite_mask.sum(axis=0)  # (n_effects,)

    # An effect needs a finite point estimate and at least 50% finite draws.
    enough_valid = (n_valid >= n_bootstrap * 0.5) & valid_effects

    # Single warning site: covers both "none usable" and "some unusable".
    n_insufficient = int(np.sum(valid_effects & ~enough_valid))
    if n_insufficient > 0:
        warnings.warn(
            f"{n_insufficient} effect(s) had too few valid bootstrap samples (<50%). "
            "Returning NaN for SE/CI/p-value.",
            RuntimeWarning,
            stacklevel=2,
        )
    if not np.any(enough_valid):
        return ses, ci_lowers, ci_uppers, p_values

    # Common case: columns with no missing draws are handled in one shot.
    all_finite = (n_valid == n_bootstrap) & enough_valid
    if np.any(all_finite):
        idx = np.where(all_finite)[0]
        sub = bootstrap_matrix[:, idx]

        # SE: sample standard deviation across the bootstrap dimension.
        batch_ses = np.std(sub, axis=0, ddof=1)

        # Equal-tailed percentile interval.
        lower_pct = alpha / 2 * 100
        upper_pct = (1 - alpha / 2) * 100
        batch_ci = np.percentile(sub, [lower_pct, upper_pct], axis=0)

        # Two-sided p-value: share of draws on the far side of zero from the
        # point estimate, doubled, capped at 1 and floored at 1/(B+1).
        share_nonpositive = np.mean(sub <= 0, axis=0)
        share_nonnegative = np.mean(sub >= 0, axis=0)
        tail_share = np.where(effects[idx] >= 0, share_nonpositive, share_nonnegative)
        batch_p = np.minimum(2 * tail_share, 1.0)
        batch_p = np.maximum(batch_p, 1 / (n_bootstrap + 1))

        # Only effects with a positive, finite SE receive results.
        se_valid = np.isfinite(batch_ses) & (batch_ses > 0)
        n_bad_se = int(np.sum(~se_valid))
        if n_bad_se > 0:
            warnings.warn(
                f"{n_bad_se} effect(s) had non-finite or zero bootstrap SE. "
                "Returning NaN for SE/CI/p-value.",
                RuntimeWarning,
                stacklevel=2,
            )
        keep = idx[se_valid]
        ses[keep] = batch_ses[se_valid]
        ci_lowers[keep] = batch_ci[0][se_valid]
        ci_uppers[keep] = batch_ci[1][se_valid]
        p_values[keep] = batch_p[se_valid]

    # Rare case: columns with a few non-finite draws go through the scalar
    # routine, which filters them out before computing statistics.
    partial_valid = enough_valid & ~all_finite
    for j in np.where(partial_valid)[0]:
        se, ci, pv = compute_effect_bootstrap_stats(
            effects[j],
            bootstrap_matrix[:, j],
            alpha=alpha,
            context=f"effect {j}",
        )
        ses[j] = se
        ci_lowers[j] = ci[0]
        ci_uppers[j] = ci[1]
        p_values[j] = pv

    return ses, ci_lowers, ci_uppers, p_values
417
+
418
+
419
+ # ---------------------------------------------------------------------------
420
+ # Survey-aware bootstrap weight generators
421
+ # ---------------------------------------------------------------------------
422
+
423
+
424
def generate_survey_multiplier_weights_batch(
    n_bootstrap: int,
    resolved_survey: "ResolvedSurveyDesign",
    weight_type: str,
    rng: np.random.Generator,
) -> Tuple[np.ndarray, np.ndarray]:
    """Generate PSU-level multiplier weights for survey-aware bootstrap.

    Weights are drawn independently within each stratum. When an FPC is
    present, each stratum's weights are multiplied by ``sqrt(1 - f_h)``
    (``f_h`` = sampled PSUs / FPC value); a stratum with ``f_h >= 1`` gets
    all-zero weights.

    Strata with a single PSU receive zero weights, except under
    ``lonely_psu="adjust"``, where singleton PSUs from all strata are pooled
    into one pseudo-stratum and weighted together without FPC scaling. If
    only one singleton exists it cannot be pooled: it gets zero weight and a
    ``UserWarning`` is emitted.

    Parameters
    ----------
    n_bootstrap : int
        Number of bootstrap iterations.
    resolved_survey : ResolvedSurveyDesign
        Resolved survey design. The attributes read here are ``psu``,
        ``strata``, ``lonely_psu``, ``weights`` and ``fpc``.
    weight_type : str
        Multiplier distribution: ``"rademacher"``, ``"mammen"``, or ``"webb"``.
    rng : np.random.Generator
        Random number generator.

    Returns
    -------
    weights : np.ndarray
        Multiplier weights, shape ``(n_bootstrap, n_psu)``.
    psu_ids : np.ndarray
        Unique PSU identifiers aligned to columns of *weights*.

    Raises
    ------
    ValueError
        If an FPC value is smaller than the number of PSUs it applies to.
    """
    psu = resolved_survey.psu
    strata = resolved_survey.strata
    lonely_mode = resolved_survey.lonely_psu

    if psu is not None:
        psu_ids = np.unique(psu)
        n_psu = len(psu_ids)
    else:
        # Without explicit PSUs every observation acts as its own PSU.
        n_psu = len(resolved_survey.weights)
        psu_ids = np.arange(n_psu)

    # ------------------------------------------------------------------
    # Unstratified design: one block of weights for all PSUs.
    # ------------------------------------------------------------------
    if strata is None:
        if n_psu < 2:
            # A single PSU leaves the variance unidentified.
            return np.zeros((n_bootstrap, n_psu), dtype=np.float64), psu_ids

        weights = generate_bootstrap_weights_batch(n_bootstrap, n_psu, weight_type, rng)
        if resolved_survey.fpc is None:
            return weights, psu_ids

        n_units_for_fpc = n_psu if psu is not None else len(resolved_survey.weights)
        if resolved_survey.fpc[0] < n_units_for_fpc:
            raise ValueError(
                f"FPC ({resolved_survey.fpc[0]}) is less than the number of PSUs "
                f"({n_units_for_fpc}). FPC must be >= number of PSUs."
            )
        sampling_fraction = n_units_for_fpc / resolved_survey.fpc[0]
        if sampling_fraction < 1.0:
            weights = weights * np.sqrt(1.0 - sampling_fraction)
        else:
            weights = np.zeros_like(weights)
        return weights, psu_ids

    # ------------------------------------------------------------------
    # Stratified design: independent draws per stratum.
    # ------------------------------------------------------------------
    weights = np.empty((n_bootstrap, n_psu), dtype=np.float64)
    column_of = {int(label): pos for pos, label in enumerate(psu_ids)}
    singleton_cols = []  # columns deferred for lonely_psu="adjust" pooling

    for h in np.unique(strata):
        in_stratum = strata == h
        if psu is None:
            members = np.where(in_stratum)[0]
        else:
            members = np.unique(psu[in_stratum])

        n_h = len(members)
        cols = np.array([column_of[int(label)] for label in members])

        if n_h < 2:
            if lonely_mode == "adjust":
                singleton_cols.extend(cols.tolist())
            else:
                # remove / certainty: the lone PSU gets zero weight.
                weights[:, cols] = 0.0
            continue

        stratum_weights = generate_bootstrap_weights_batch_numpy(
            n_bootstrap, n_h, weight_type, rng
        )

        if resolved_survey.fpc is not None:
            N_h = resolved_survey.fpc[in_stratum][0]
            if N_h < n_h:
                raise ValueError(
                    f"FPC ({N_h}) is less than the number of PSUs "
                    f"({n_h}) in stratum {h}. FPC must be >= n_PSU."
                )
            f_h = n_h / N_h
            if f_h < 1.0:
                stratum_weights = stratum_weights * np.sqrt(1.0 - f_h)
            else:
                stratum_weights = np.zeros_like(stratum_weights)

        weights[:, cols] = stratum_weights

    # Deferred singletons: pool them into one pseudo-stratum.
    if singleton_cols:
        n_pooled = len(singleton_cols)
        if n_pooled >= 2:
            # No FPC scaling is applied to the pooled group.
            pooled_weights = generate_bootstrap_weights_batch_numpy(
                n_bootstrap, n_pooled, weight_type, rng
            )
            weights[:, np.array(singleton_cols)] = pooled_weights
        else:
            # A lone singleton cannot be pooled, so it falls back to zero
            # weight.
            import warnings

            warnings.warn(
                "lonely_psu='adjust' with only 1 singleton stratum in "
                "bootstrap: singleton PSU contributes zero variance "
                "(same as 'remove'). At least 2 singleton strata are "
                "needed for pooled pseudo-stratum bootstrap.",
                UserWarning,
                stacklevel=3,
            )
            weights[:, singleton_cols[0]] = 0.0

    return weights, psu_ids
570
+
571
+
572
def generate_rao_wu_weights(
    resolved_survey: "ResolvedSurveyDesign",
    rng: np.random.Generator,
) -> np.ndarray:
    """Generate one set of Rao-Wu (1988) rescaled observation weights.

    Within each stratum *h* with *n_h* PSUs, draw ``m_h`` PSUs with
    replacement and rescale observation weights by ``(n_h / m_h) * r_hi``
    where ``r_hi`` is the number of times PSU *i* was drawn.

    Without FPC: ``m_h = n_h - 1``.
    With FPC: ``m_h = max(1, round((1 - f_h) * (n_h - 1)))``
    (Rao, Wu & Yue 1992, Section 3). A stratum with ``f_h >= 1`` keeps its
    base weights.

    Strata with a single PSU keep their base weights, except under
    ``lonely_psu="adjust"``, where the singleton strata are pooled into one
    pseudo-stratum and resampled together (no FPC on the pooled group). If
    only one singleton stratum exists it cannot be pooled: it keeps its base
    weights and a ``UserWarning`` is emitted.

    PSU labels are only compared for equality within a stratum, so they may
    be of any dtype ``np.unique`` can sort, and the same label may appear in
    different strata.

    Parameters
    ----------
    resolved_survey : ResolvedSurveyDesign
        Resolved survey design. The attributes read here are ``weights``,
        ``psu``, ``strata``, ``lonely_psu`` and ``fpc``.
    rng : np.random.Generator
        Random number generator.

    Returns
    -------
    np.ndarray
        Rescaled observation weights, shape ``(n_obs,)``.

    Raises
    ------
    ValueError
        If a stratum's FPC value is smaller than its number of PSUs.
    """
    base_weights = resolved_survey.weights
    n_obs = len(base_weights)
    psu = resolved_survey.psu
    strata = resolved_survey.strata
    lonely_mode = resolved_survey.lonely_psu
    fpc = resolved_survey.fpc

    rescaled = np.zeros(n_obs, dtype=np.float64)

    # Without explicit PSUs every observation acts as its own PSU.
    obs_psu = np.arange(n_obs) if psu is None else psu

    if strata is None:
        strata_masks = [np.ones(n_obs, dtype=bool)]
    else:
        strata_masks = [strata == h for h in np.unique(strata)]

    # Masks of singleton strata deferred for lonely_psu="adjust" pooling.
    singleton_masks = []

    for mask_h in strata_masks:
        # local_index maps each observation in the stratum to the position
        # of its PSU in unique_psu_h.
        unique_psu_h, local_index = np.unique(obs_psu[mask_h], return_inverse=True)
        local_index = np.reshape(local_index, -1)
        n_h = len(unique_psu_h)

        if n_h < 2:
            if lonely_mode == "adjust":
                singleton_masks.append(mask_h)
            else:
                # remove / certainty: keep original weights (zero variance).
                rescaled[mask_h] = base_weights[mask_h]
            continue

        # Resample size m_h.
        if fpc is not None:
            N_h = fpc[mask_h][0]
            if N_h < n_h:
                raise ValueError(
                    f"FPC ({N_h}) is less than the number of PSUs "
                    f"({n_h}). FPC must be >= number of PSUs."
                )
            f_h = n_h / N_h
            if f_h >= 1.0:
                # Census stratum: keep original weights (zero variance).
                rescaled[mask_h] = base_weights[mask_h]
                continue
            m_h = max(1, round((1.0 - f_h) * (n_h - 1)))
        else:
            m_h = n_h - 1

        # Draw m_h PSUs with replacement and count hits per PSU.
        drawn_indices = rng.choice(n_h, size=m_h, replace=True)
        counts = np.bincount(drawn_indices, minlength=n_h)

        # Rescale factor per PSU: (n_h / m_h) * r_hi, broadcast to the
        # stratum's observations through local_index.
        scale_per_psu = (n_h / m_h) * counts.astype(np.float64)
        rescaled[mask_h] = base_weights[mask_h] * scale_per_psu[local_index]

    # Pool singleton strata into a pseudo-stratum for "adjust".
    if singleton_masks:
        n_pooled = len(singleton_masks)

        if n_pooled >= 2:
            m_pooled = n_pooled - 1  # No FPC for pooled singletons
            drawn = rng.choice(n_pooled, size=m_pooled, replace=True)
            counts = np.bincount(drawn, minlength=n_pooled)
            scale_per_psu = (n_pooled / m_pooled) * counts.astype(np.float64)

            # Each singleton stratum holds exactly one PSU, so the i-th scale
            # applies to every observation of the i-th singleton stratum.
            for mask_h, scale in zip(singleton_masks, scale_per_psu):
                rescaled[mask_h] = base_weights[mask_h] * scale
        else:
            # A lone singleton cannot be pooled, so it keeps its base
            # weights.
            warnings.warn(
                "lonely_psu='adjust' with only 1 singleton stratum in "
                "bootstrap: singleton PSU contributes zero variance "
                "(same as 'remove'). At least 2 singleton strata are "
                "needed for pooled pseudo-stratum bootstrap.",
                UserWarning,
                stacklevel=2,
            )
            for mask_h in singleton_masks:
                rescaled[mask_h] = base_weights[mask_h]

    return rescaled
703
+
704
+
705
def generate_rao_wu_weights_batch(
    n_bootstrap: int,
    resolved_survey: "ResolvedSurveyDesign",
    rng: np.random.Generator,
) -> np.ndarray:
    """Stack repeated Rao-Wu rescaled weight draws into a matrix.

    Calls :func:`generate_rao_wu_weights` once per bootstrap iteration,
    sharing the same generator so successive rows use successive draws.

    Parameters
    ----------
    n_bootstrap : int
        Number of bootstrap iterations (rows of the result).
    resolved_survey : ResolvedSurveyDesign
        Resolved survey design; its ``weights`` attribute fixes ``n_obs``.
    rng : np.random.Generator
        Random number generator.

    Returns
    -------
    np.ndarray
        Rescaled weights, shape ``(n_bootstrap, n_obs)``.
    """
    replicate_shape = (n_bootstrap, len(resolved_survey.weights))
    replicates = np.empty(replicate_shape, dtype=np.float64)
    # Each row is a view into `replicates`, so assigning through it fills
    # the matrix in place, one replicate per row and in row order.
    for row in replicates:
        row[...] = generate_rao_wu_weights(resolved_survey, rng)
    return replicates