diff-diff 3.0.1__cp314-cp314-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- diff_diff/__init__.py +382 -0
- diff_diff/_backend.py +134 -0
- diff_diff/_rust_backend.cp314-win_amd64.pyd +0 -0
- diff_diff/bacon.py +1140 -0
- diff_diff/bootstrap_utils.py +730 -0
- diff_diff/continuous_did.py +1626 -0
- diff_diff/continuous_did_bspline.py +190 -0
- diff_diff/continuous_did_results.py +374 -0
- diff_diff/datasets.py +815 -0
- diff_diff/diagnostics.py +882 -0
- diff_diff/efficient_did.py +1770 -0
- diff_diff/efficient_did_bootstrap.py +359 -0
- diff_diff/efficient_did_covariates.py +899 -0
- diff_diff/efficient_did_results.py +368 -0
- diff_diff/efficient_did_weights.py +617 -0
- diff_diff/estimators.py +1501 -0
- diff_diff/honest_did.py +2585 -0
- diff_diff/imputation.py +2458 -0
- diff_diff/imputation_bootstrap.py +418 -0
- diff_diff/imputation_results.py +448 -0
- diff_diff/linalg.py +2538 -0
- diff_diff/power.py +2588 -0
- diff_diff/practitioner.py +869 -0
- diff_diff/prep.py +1738 -0
- diff_diff/prep_dgp.py +1718 -0
- diff_diff/pretrends.py +1105 -0
- diff_diff/results.py +918 -0
- diff_diff/stacked_did.py +1049 -0
- diff_diff/stacked_did_results.py +339 -0
- diff_diff/staggered.py +3895 -0
- diff_diff/staggered_aggregation.py +864 -0
- diff_diff/staggered_bootstrap.py +752 -0
- diff_diff/staggered_results.py +416 -0
- diff_diff/staggered_triple_diff.py +1545 -0
- diff_diff/staggered_triple_diff_results.py +416 -0
- diff_diff/sun_abraham.py +1685 -0
- diff_diff/survey.py +1981 -0
- diff_diff/synthetic_did.py +1136 -0
- diff_diff/triple_diff.py +2047 -0
- diff_diff/trop.py +952 -0
- diff_diff/trop_global.py +1270 -0
- diff_diff/trop_local.py +1307 -0
- diff_diff/trop_results.py +356 -0
- diff_diff/twfe.py +542 -0
- diff_diff/two_stage.py +1952 -0
- diff_diff/two_stage_bootstrap.py +520 -0
- diff_diff/two_stage_results.py +400 -0
- diff_diff/utils.py +1902 -0
- diff_diff/visualization/__init__.py +61 -0
- diff_diff/visualization/_common.py +328 -0
- diff_diff/visualization/_continuous.py +274 -0
- diff_diff/visualization/_diagnostic.py +817 -0
- diff_diff/visualization/_event_study.py +1086 -0
- diff_diff/visualization/_power.py +661 -0
- diff_diff/visualization/_staggered.py +833 -0
- diff_diff/visualization/_synthetic.py +197 -0
- diff_diff/wooldridge.py +1285 -0
- diff_diff/wooldridge_results.py +349 -0
- diff_diff-3.0.1.dist-info/METADATA +2997 -0
- diff_diff-3.0.1.dist-info/RECORD +62 -0
- diff_diff-3.0.1.dist-info/WHEEL +4 -0
- diff_diff-3.0.1.dist-info/sboms/diff_diff_rust.cyclonedx.json +5843 -0
|
@@ -0,0 +1,730 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Shared bootstrap utilities for multiplier bootstrap inference.
|
|
3
|
+
|
|
4
|
+
Provides weight generation, percentile CI, and p-value helpers used by
|
|
5
|
+
both CallawaySantAnna and ContinuousDiD estimators.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import warnings
|
|
9
|
+
from typing import Optional, Tuple
|
|
10
|
+
|
|
11
|
+
import numpy as np
|
|
12
|
+
|
|
13
|
+
from diff_diff._backend import HAS_RUST_BACKEND, _rust_bootstrap_weights
|
|
14
|
+
|
|
15
|
+
__all__ = [
|
|
16
|
+
"generate_bootstrap_weights",
|
|
17
|
+
"generate_bootstrap_weights_batch",
|
|
18
|
+
"generate_bootstrap_weights_batch_numpy",
|
|
19
|
+
"generate_survey_multiplier_weights_batch",
|
|
20
|
+
"generate_rao_wu_weights",
|
|
21
|
+
"generate_rao_wu_weights_batch",
|
|
22
|
+
"compute_percentile_ci",
|
|
23
|
+
"compute_bootstrap_pvalue",
|
|
24
|
+
"compute_effect_bootstrap_stats",
|
|
25
|
+
"compute_effect_bootstrap_stats_batch",
|
|
26
|
+
]
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def generate_bootstrap_weights(
    n_units: int,
    weight_type: str,
    rng: np.random.Generator,
) -> np.ndarray:
    """
    Draw one vector of multiplier-bootstrap weights.

    Parameters
    ----------
    n_units : int
        How many weights to draw (one per unit/cluster).
    weight_type : str
        Name of the multiplier distribution: "rademacher", "mammen",
        or "webb".
    rng : np.random.Generator
        Generator the draws are taken from.

    Returns
    -------
    np.ndarray
        Weights with shape (n_units,).

    Raises
    ------
    ValueError
        If ``weight_type`` is not one of the three supported names.
    """
    if weight_type not in ("rademacher", "mammen", "webb"):
        raise ValueError(
            f"weight_type must be 'rademacher', 'mammen', or 'webb', " f"got '{weight_type}'"
        )

    if weight_type == "rademacher":
        # Two equally likely points: -1 and +1.
        return rng.choice([-1.0, 1.0], size=n_units)

    if weight_type == "mammen":
        # Two-point distribution; the negative point is drawn with
        # probability (sqrt(5) + 1) / (2 * sqrt(5)).
        root5 = np.sqrt(5)
        support = [-(root5 - 1) / 2, (root5 + 1) / 2]
        prob_negative = (root5 + 1) / (2 * root5)
        return rng.choice(support, size=n_units, p=[prob_negative, 1 - prob_negative])

    # "webb": six equally likely points +/- sqrt(k / 2) for k = 1, 2, 3.
    webb_support = np.array(
        [
            -np.sqrt(3 / 2),
            -np.sqrt(2 / 2),
            -np.sqrt(1 / 2),
            np.sqrt(1 / 2),
            np.sqrt(2 / 2),
            np.sqrt(3 / 2),
        ]
    )
    return rng.choice(webb_support, size=n_units)
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def generate_bootstrap_weights_batch(
    n_bootstrap: int,
    n_units: int,
    weight_type: str,
    rng: np.random.Generator,
) -> np.ndarray:
    """
    Draw the full matrix of multiplier-bootstrap weights in one call.

    Dispatches to the Rust implementation when it has been loaded, and
    otherwise to :func:`generate_bootstrap_weights_batch_numpy`.

    Parameters
    ----------
    n_bootstrap : int
        Number of bootstrap replicates (rows).
    n_units : int
        Number of units/clusters (columns).
    weight_type : str
        Name of the multiplier distribution: "rademacher", "mammen",
        or "webb".
    rng : np.random.Generator
        Generator used either directly (NumPy path) or to derive the seed
        handed to the Rust backend.

    Returns
    -------
    np.ndarray
        Weights with shape (n_bootstrap, n_units).
    """
    rust_ready = HAS_RUST_BACKEND and _rust_bootstrap_weights is not None
    if not rust_ready:
        return generate_bootstrap_weights_batch_numpy(n_bootstrap, n_units, weight_type, rng)

    # The backend takes an integer seed rather than a Generator, so one
    # integer is drawn from ``rng`` and passed along.
    seed = rng.integers(0, 2**63 - 1)
    return _rust_bootstrap_weights(n_bootstrap, n_units, weight_type, seed)
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
def generate_bootstrap_weights_batch_numpy(
    n_bootstrap: int,
    n_units: int,
    weight_type: str,
    rng: np.random.Generator,
) -> np.ndarray:
    """
    Pure-NumPy generator for a matrix of multiplier-bootstrap weights.

    Parameters
    ----------
    n_bootstrap : int
        Number of bootstrap replicates (rows).
    n_units : int
        Number of units/clusters (columns).
    weight_type : str
        Name of the multiplier distribution: "rademacher", "mammen",
        or "webb".
    rng : np.random.Generator
        Generator the draws are taken from.

    Returns
    -------
    np.ndarray
        Weights with shape (n_bootstrap, n_units).

    Raises
    ------
    ValueError
        If ``weight_type`` is not one of the three supported names.
    """
    shape = (n_bootstrap, n_units)

    if weight_type == "rademacher":
        # Two equally likely points: -1 and +1.
        return rng.choice([-1.0, 1.0], size=shape)

    if weight_type == "mammen":
        # Two-point distribution; the negative point is drawn with
        # probability (sqrt(5) + 1) / (2 * sqrt(5)).
        root5 = np.sqrt(5)
        negative_point = -(root5 - 1) / 2
        positive_point = (root5 + 1) / 2
        prob_negative = (root5 + 1) / (2 * root5)
        return rng.choice(
            [negative_point, positive_point],
            size=shape,
            p=[prob_negative, 1 - prob_negative],
        )

    if weight_type == "webb":
        # Six equally likely points +/- sqrt(k / 2) for k = 1, 2, 3.
        six_points = np.array(
            [
                -np.sqrt(3 / 2),
                -np.sqrt(2 / 2),
                -np.sqrt(1 / 2),
                np.sqrt(1 / 2),
                np.sqrt(2 / 2),
                np.sqrt(3 / 2),
            ]
        )
        return rng.choice(six_points, size=shape)

    raise ValueError(
        f"weight_type must be 'rademacher', 'mammen', or 'webb', " f"got '{weight_type}'"
    )
|
|
159
|
+
|
|
160
|
+
|
|
161
|
+
def compute_percentile_ci(
    boot_dist: np.ndarray,
    alpha: float,
) -> Tuple[float, float]:
    """
    Return the equal-tailed percentile interval of a bootstrap distribution.

    Parameters
    ----------
    boot_dist : np.ndarray
        Bootstrap draws (1-D array).
    alpha : float
        Significance level; ``alpha / 2`` of the mass is cut from each tail
        (e.g. 0.05 gives the 2.5th and 97.5th percentiles).

    Returns
    -------
    tuple of float
        ``(lower, upper)`` interval bounds.
    """
    tail = alpha / 2
    lower_pct = tail * 100
    upper_pct = (1 - tail) * 100
    return (
        float(np.percentile(boot_dist, lower_pct)),
        float(np.percentile(boot_dist, upper_pct)),
    )
|
|
183
|
+
|
|
184
|
+
|
|
185
|
+
def compute_bootstrap_pvalue(
    original_effect: float,
    boot_dist: np.ndarray,
    n_valid: Optional[int] = None,
) -> float:
    """
    Two-sided bootstrap p-value based on the share of draws crossing zero.

    The one-sided tail is the fraction of draws on the opposite side of
    zero from the point estimate (``<= 0`` for a non-negative estimate,
    ``>= 0`` otherwise). It is doubled, capped at 1, and floored at
    ``1 / (n + 1)`` so the result is never exactly zero.

    Parameters
    ----------
    original_effect : float
        Point estimate whose sign selects the tail.
    boot_dist : np.ndarray
        Bootstrap draws of the effect.
    n_valid : int, optional
        Sample count ``n`` used in the floor. Defaults to
        ``len(boot_dist)`` when omitted.

    Returns
    -------
    float
        Two-sided bootstrap p-value.
    """
    # Keep the ">= 0" test as-is: a NaN estimate falls into the ">= 0" tail.
    crossed_zero = boot_dist <= 0 if original_effect >= 0 else boot_dist >= 0
    doubled_tail = min(2 * np.mean(crossed_zero), 1.0)

    floor_n = len(boot_dist) if n_valid is None else n_valid
    return float(max(doubled_tail, 1 / (floor_n + 1)))
|
|
217
|
+
|
|
218
|
+
|
|
219
|
+
def compute_effect_bootstrap_stats(
    original_effect: float,
    boot_dist: np.ndarray,
    alpha: float = 0.05,
    context: str = "bootstrap distribution",
) -> Tuple[float, Tuple[float, float], float]:
    """
    Summarize one effect's bootstrap distribution as SE, CI and p-value.

    Non-finite draws are discarded (with a warning). All three outputs are
    NaN when the point estimate is non-finite, when fewer than half of the
    draws are finite, or when the resulting standard error is non-finite
    or not strictly positive.

    Parameters
    ----------
    original_effect : float
        Point estimate of the effect.
    boot_dist : np.ndarray
        Bootstrap draws of the effect.
    alpha : float, default=0.05
        Significance level for the percentile interval.
    context : str, optional
        Label inserted into warning messages to identify the effect.

    Returns
    -------
    se : float
        Sample standard deviation (``ddof=1``) of the finite draws.
    ci : tuple of float
        Percentile confidence interval from the finite draws.
    p_value : float
        Bootstrap p-value from the finite draws.
    """
    nan_result = (np.nan, (np.nan, np.nan), np.nan)

    if not np.isfinite(original_effect):
        return nan_result

    is_finite = np.isfinite(boot_dist)
    n_total = len(boot_dist)
    n_valid = np.sum(is_finite)
    n_nonfinite = n_total - n_valid

    if n_nonfinite > 0:
        warnings.warn(
            f"Dropping {n_nonfinite}/{n_total} non-finite bootstrap samples "
            f"in {context}. Bootstrap estimates based on remaining valid samples.",
            RuntimeWarning,
            stacklevel=3,
        )

    if n_valid < n_total * 0.5:
        warnings.warn(
            f"Too few valid bootstrap samples ({n_valid}/{n_total}) in {context}. "
            "Returning NaN for SE/CI/p-value to signal invalid inference.",
            RuntimeWarning,
            stacklevel=3,
        )
        return nan_result

    draws = boot_dist[is_finite]
    se = float(np.std(draws, ddof=1))

    # A degenerate SE makes the CI and p-value meaningless too, so all three
    # are reported as NaN together.
    if not (np.isfinite(se) and se > 0):
        warnings.warn(
            f"Bootstrap SE is non-finite or zero (n_valid={n_valid}) in {context}. "
            "Returning NaN for SE/CI/p-value.",
            RuntimeWarning,
            stacklevel=3,
        )
        return nan_result

    interval = compute_percentile_ci(draws, alpha)
    p_value = compute_bootstrap_pvalue(original_effect, draws, n_valid=len(draws))
    return se, interval, p_value
|
|
292
|
+
|
|
293
|
+
|
|
294
|
+
def compute_effect_bootstrap_stats_batch(
    original_effects: np.ndarray,
    bootstrap_matrix: np.ndarray,
    alpha: float = 0.05,
) -> tuple:
    """
    Batch-compute bootstrap statistics for multiple effects at once.

    Effects whose bootstrap column is entirely finite are handled with
    vectorized NumPy operations. Effects with some (but fewer than half)
    non-finite draws fall back to :func:`compute_effect_bootstrap_stats`.
    An effect gets NaN for every statistic when its point estimate is
    non-finite, when fewer than half of its draws are finite, or when its
    bootstrap SE is non-finite or not strictly positive.

    Parameters
    ----------
    original_effects : np.ndarray
        Array of original point estimates, shape (n_effects,).
    bootstrap_matrix : np.ndarray
        Bootstrap distributions, shape (n_bootstrap, n_effects).
    alpha : float, default=0.05
        Significance level.

    Returns
    -------
    ses : np.ndarray
        Bootstrap SEs for each effect.
    ci_lowers : np.ndarray
        Lower CI bounds for each effect.
    ci_uppers : np.ndarray
        Upper CI bounds for each effect.
    p_values : np.ndarray
        Bootstrap p-values for each effect.
    """
    n_bootstrap, n_effects = bootstrap_matrix.shape
    ses = np.full(n_effects, np.nan)
    ci_lowers = np.full(n_effects, np.nan)
    ci_uppers = np.full(n_effects, np.nan)
    p_values = np.full(n_effects, np.nan)

    # Effects with a non-finite point estimate keep their NaN defaults.
    valid_effects = np.isfinite(original_effects)
    if not np.any(valid_effects):
        return ses, ci_lowers, ci_uppers, p_values

    # Count valid bootstrap samples per effect.
    finite_mask = np.isfinite(bootstrap_matrix)  # (n_bootstrap, n_effects)
    n_valid = finite_mask.sum(axis=0)  # (n_effects,)

    # An effect needs at least half of its draws to be finite.
    enough_valid = (n_valid >= n_bootstrap * 0.5) & valid_effects

    # One warning covers every otherwise-valid effect that fails the 50% rule,
    # whether that is some of them or all of them.
    n_insufficient = int(np.sum(valid_effects & ~enough_valid))
    if n_insufficient > 0:
        warnings.warn(
            f"{n_insufficient} effect(s) had too few valid bootstrap samples (<50%). "
            "Returning NaN for SE/CI/p-value.",
            RuntimeWarning,
            stacklevel=2,
        )
    if not np.any(enough_valid):
        return ses, ci_lowers, ci_uppers, p_values

    # Common case: columns with no missing draws are processed together.
    all_finite = (n_valid == n_bootstrap) & enough_valid
    if np.any(all_finite):
        idx = np.where(all_finite)[0]
        sub = bootstrap_matrix[:, idx]

        # SE: sample standard deviation across the bootstrap dimension.
        batch_ses = np.std(sub, axis=0, ddof=1)

        # Percentile CI: row 0 is the lower bound, row 1 the upper bound.
        lower_pct = alpha / 2 * 100
        upper_pct = (1 - alpha / 2) * 100
        batch_ci = np.percentile(sub, [lower_pct, upper_pct], axis=0)

        # p-values: the tail is the share of draws on the opposite side of
        # zero from the point estimate, selected per column by its sign.
        effects = np.asarray(original_effects)[idx]
        one_sided = np.where(
            effects >= 0,
            np.mean(sub <= 0, axis=0),
            np.mean(sub >= 0, axis=0),
        )
        batch_p = np.minimum(2 * one_sided, 1.0)
        batch_p = np.maximum(batch_p, 1 / (n_bootstrap + 1))

        # Guard: only effects with a positive, finite SE get results.
        se_valid = np.isfinite(batch_ses) & (batch_ses > 0)
        n_bad_se = int(np.sum(~se_valid))
        if n_bad_se > 0:
            warnings.warn(
                f"{n_bad_se} effect(s) had non-finite or zero bootstrap SE. "
                "Returning NaN for SE/CI/p-value.",
                RuntimeWarning,
                stacklevel=2,
            )
        keep = idx[se_valid]
        ses[keep] = batch_ses[se_valid]
        ci_lowers[keep] = batch_ci[0][se_valid]
        ci_uppers[keep] = batch_ci[1][se_valid]
        p_values[keep] = batch_p[se_valid]

    # Rare case: columns with some non-finite draws use the scalar routine,
    # which filters the draws itself.
    partial_valid = enough_valid & ~all_finite
    for j in np.where(partial_valid)[0]:
        se, ci, pv = compute_effect_bootstrap_stats(
            original_effects[j],
            bootstrap_matrix[:, j],
            alpha=alpha,
            context=f"effect {j}",
        )
        ses[j] = se
        ci_lowers[j] = ci[0]
        ci_uppers[j] = ci[1]
        p_values[j] = pv

    return ses, ci_lowers, ci_uppers, p_values
|
|
417
|
+
|
|
418
|
+
|
|
419
|
+
# ---------------------------------------------------------------------------
|
|
420
|
+
# Survey-aware bootstrap weight generators
|
|
421
|
+
# ---------------------------------------------------------------------------
|
|
422
|
+
|
|
423
|
+
|
|
424
|
+
def generate_survey_multiplier_weights_batch(
    n_bootstrap: int,
    resolved_survey: "ResolvedSurveyDesign",
    weight_type: str,
    rng: np.random.Generator,
) -> Tuple[np.ndarray, np.ndarray]:
    """Draw PSU-level multiplier weights that respect the survey design.

    Without strata, one block of weights is drawn for all PSUs (all zeros
    when there are fewer than two PSUs). With strata, weights are drawn
    separately inside each stratum. When an FPC is supplied, each block is
    multiplied by ``sqrt(1 - f)`` where ``f = n_PSU / FPC``; a block with
    ``f >= 1`` is set to zero.

    Strata with a single PSU get zero weight, unless
    ``lonely_psu == "adjust"``: then all singleton PSUs are pooled into one
    pseudo-stratum and drawn together without FPC scaling. A lone pooled
    singleton cannot be resampled, so it gets zero weight and a warning.

    Parameters
    ----------
    n_bootstrap : int
        Number of bootstrap replicates (rows of the output).
    resolved_survey : ResolvedSurveyDesign
        Design object; the attributes ``psu``, ``strata``, ``lonely_psu``,
        ``weights`` and ``fpc`` are read. When ``psu`` is None every
        observation is treated as its own PSU.
    weight_type : str
        Multiplier distribution: ``"rademacher"``, ``"mammen"``, or ``"webb"``.
    rng : np.random.Generator
        Generator the draws are taken from.

    Returns
    -------
    weights : np.ndarray
        Multiplier weights, shape ``(n_bootstrap, n_psu)``.
    psu_ids : np.ndarray
        Unique PSU identifiers aligned to columns of *weights*.

    Raises
    ------
    ValueError
        If the FPC is smaller than the number of PSUs it applies to.
    """
    psu_labels = resolved_survey.psu
    stratum_labels = resolved_survey.strata
    pool_singletons = resolved_survey.lonely_psu == "adjust"
    has_psu = psu_labels is not None

    if has_psu:
        psu_ids = np.unique(psu_labels)
    else:
        # Each observation is its own PSU.
        psu_ids = np.arange(len(resolved_survey.weights))
    n_psu = len(psu_ids)

    # ---- Unstratified design: a single block of weights ------------------
    if stratum_labels is None:
        if n_psu < 2:
            # A single PSU carries no variance information.
            return np.zeros((n_bootstrap, n_psu), dtype=np.float64), psu_ids

        weights = generate_bootstrap_weights_batch(n_bootstrap, n_psu, weight_type, rng)
        if resolved_survey.fpc is None:
            return weights, psu_ids

        if resolved_survey.fpc[0] < n_psu:
            raise ValueError(
                f"FPC ({resolved_survey.fpc[0]}) is less than the number of PSUs "
                f"({n_psu}). FPC must be >= number of PSUs."
            )
        sampling_fraction = n_psu / resolved_survey.fpc[0]
        if sampling_fraction < 1.0:
            return weights * np.sqrt(1.0 - sampling_fraction), psu_ids
        return np.zeros_like(weights), psu_ids

    # ---- Stratified design: independent draws per stratum ----------------
    weights = np.empty((n_bootstrap, n_psu), dtype=np.float64)
    column_of = {int(label): col for col, label in enumerate(psu_ids)}
    pooled_cols = []  # columns of singleton PSUs awaiting pooled draws

    for stratum in np.unique(stratum_labels):
        in_stratum = stratum_labels == stratum
        if has_psu:
            members = np.unique(psu_labels[in_stratum])
        else:
            members = np.where(in_stratum)[0]
        n_h = len(members)
        cols = np.array([column_of[int(label)] for label in members])

        if n_h < 2:
            if pool_singletons:
                pooled_cols.extend(cols.tolist())
            else:
                # remove / certainty: the singleton contributes nothing.
                weights[:, cols] = 0.0
            continue

        block = generate_bootstrap_weights_batch_numpy(n_bootstrap, n_h, weight_type, rng)

        if resolved_survey.fpc is not None:
            # The first FPC entry of the stratum is taken as its population size.
            N_h = resolved_survey.fpc[in_stratum][0]
            if N_h < n_h:
                raise ValueError(
                    f"FPC ({N_h}) is less than the number of PSUs "
                    f"({n_h}) in stratum {stratum}. FPC must be >= n_PSU."
                )
            f_h = n_h / N_h
            block = block * np.sqrt(1.0 - f_h) if f_h < 1.0 else np.zeros_like(block)

        weights[:, cols] = block

    # ---- Pooled pseudo-stratum for lonely_psu="adjust" -------------------
    if len(pooled_cols) >= 2:
        # No FPC scaling is applied to the pooled singletons.
        weights[:, np.array(pooled_cols)] = generate_bootstrap_weights_batch_numpy(
            n_bootstrap, len(pooled_cols), weight_type, rng
        )
    elif pooled_cols:
        # Only one singleton: nothing to pool with, so fall back to zero weight.
        warnings.warn(
            "lonely_psu='adjust' with only 1 singleton stratum in "
            "bootstrap: singleton PSU contributes zero variance "
            "(same as 'remove'). At least 2 singleton strata are "
            "needed for pooled pseudo-stratum bootstrap.",
            UserWarning,
            stacklevel=3,
        )
        weights[:, pooled_cols[0]] = 0.0

    return weights, psu_ids
|
|
570
|
+
|
|
571
|
+
|
|
572
|
+
def generate_rao_wu_weights(
    resolved_survey: "ResolvedSurveyDesign",
    rng: np.random.Generator,
) -> np.ndarray:
    """Generate one set of Rao-Wu (1988) rescaled observation weights.

    Within each stratum *h* with *n_h* PSUs, draw ``m_h`` PSUs with
    replacement and rescale observation weights by ``(n_h / m_h) * r_hi``
    where ``r_hi`` is the count of PSU *i* being selected.

    Without FPC: ``m_h = n_h - 1``.
    With FPC: ``m_h = max(1, round((1 - f_h) * (n_h - 1)))``
    (Rao, Wu & Yue 1992, Section 3). A stratum with ``f_h >= 1`` keeps its
    original weights.

    Strata with a single PSU keep their original weights, unless
    ``lonely_psu == "adjust"``: then the singleton PSUs of all such strata
    are pooled into one pseudo-stratum and resampled together (no FPC on
    the pooled group). Each singleton stratum is its own member of the
    pool, even if several strata happen to reuse the same PSU label. A lone
    pooled singleton keeps its original weights and triggers a warning.

    Parameters
    ----------
    resolved_survey : ResolvedSurveyDesign
        Design object; the attributes ``weights``, ``psu``, ``strata``,
        ``lonely_psu`` and ``fpc`` are read. When ``psu`` is None every
        observation is treated as its own PSU.
    rng : np.random.Generator
        Random number generator.

    Returns
    -------
    np.ndarray
        Rescaled observation weights, shape ``(n_obs,)``.

    Raises
    ------
    ValueError
        If a stratum's FPC is smaller than its number of PSUs.
    """
    base_weights = resolved_survey.weights
    n_obs = len(base_weights)
    psu = resolved_survey.psu
    strata = resolved_survey.strata
    lonely_psu = resolved_survey.lonely_psu

    rescaled = np.zeros(n_obs, dtype=np.float64)
    obs_psu = np.arange(n_obs) if psu is None else psu

    if strata is None:
        strata_masks = [np.ones(n_obs, dtype=bool)]
    else:
        strata_masks = [strata == h for h in np.unique(strata)]

    # (mask, n_psu) for every singleton stratum deferred to the pooled draw.
    singleton_strata = []

    for mask_h in strata_masks:
        # ``local_index`` maps each observation in the stratum to the position
        # of its PSU in ``unique_psu_h``.
        unique_psu_h, local_index = np.unique(obs_psu[mask_h], return_inverse=True)
        n_h = len(unique_psu_h)

        if n_h < 2:
            if lonely_psu == "adjust":
                singleton_strata.append((mask_h, n_h))
            else:
                # remove / certainty: keep original weights (zero variance).
                rescaled[mask_h] = base_weights[mask_h]
            continue

        # Resample size for this stratum.
        if resolved_survey.fpc is not None:
            N_h = resolved_survey.fpc[mask_h][0]
            if N_h < n_h:
                raise ValueError(
                    f"FPC ({N_h}) is less than the number of PSUs "
                    f"({n_h}). FPC must be >= number of PSUs."
                )
            f_h = n_h / N_h
            if f_h >= 1.0:
                # Census stratum: keep original weights (zero variance).
                rescaled[mask_h] = base_weights[mask_h]
                continue
            m_h = max(1, round((1.0 - f_h) * (n_h - 1)))
        else:
            m_h = n_h - 1

        # Draw m_h PSUs with replacement and count how often each was hit.
        drawn = rng.choice(n_h, size=m_h, replace=True)
        counts = np.bincount(drawn, minlength=n_h)

        # Rescale factor per PSU: (n_h / m_h) * r_hi.
        scale_per_psu = (n_h / m_h) * counts.astype(np.float64)
        rescaled[mask_h] = base_weights[mask_h] * scale_per_psu[local_index]

    if singleton_strata:
        n_pooled = sum(n for _, n in singleton_strata)

        if n_pooled >= 2:
            # With two or more pooled PSUs every deferred stratum holds
            # exactly one PSU, so pooled position i belongs to the i-th
            # stratum. Indexing by position rather than by PSU label keeps
            # strata that reuse a label from sharing one draw.
            m_pooled = n_pooled - 1  # No FPC for pooled singletons
            drawn = rng.choice(n_pooled, size=m_pooled, replace=True)
            counts = np.bincount(drawn, minlength=n_pooled)
            scale_per_psu = (n_pooled / m_pooled) * counts.astype(np.float64)

            for (mask_h, _), scale in zip(singleton_strata, scale_per_psu):
                rescaled[mask_h] = base_weights[mask_h] * scale
        else:
            # Single singleton: cannot pool, keep base weights
            # (bootstrap "adjust" with one singleton behaves like "remove").
            warnings.warn(
                "lonely_psu='adjust' with only 1 singleton stratum in "
                "bootstrap: singleton PSU contributes zero variance "
                "(same as 'remove'). At least 2 singleton strata are "
                "needed for pooled pseudo-stratum bootstrap.",
                UserWarning,
                stacklevel=2,
            )
            for mask_h, _ in singleton_strata:
                rescaled[mask_h] = base_weights[mask_h]

    return rescaled
|
|
703
|
+
|
|
704
|
+
|
|
705
|
+
def generate_rao_wu_weights_batch(
    n_bootstrap: int,
    resolved_survey: "ResolvedSurveyDesign",
    rng: np.random.Generator,
) -> np.ndarray:
    """Stack ``n_bootstrap`` independent Rao-Wu weight vectors into a matrix.

    Each row is produced by one call to :func:`generate_rao_wu_weights`
    using the shared ``rng``, in row order.

    Parameters
    ----------
    n_bootstrap : int
        Number of bootstrap replicates (rows).
    resolved_survey : ResolvedSurveyDesign
        Design object passed through to :func:`generate_rao_wu_weights`;
        ``len(resolved_survey.weights)`` fixes the number of columns.
    rng : np.random.Generator
        Generator the draws are taken from.

    Returns
    -------
    np.ndarray
        Rescaled weights, shape ``(n_bootstrap, n_obs)``.
    """
    replicates = np.empty((n_bootstrap, len(resolved_survey.weights)), dtype=np.float64)
    # Iterating a 2-D array yields row views, so writing into ``row`` fills
    # ``replicates`` in place.
    for row in replicates:
        row[:] = generate_rao_wu_weights(resolved_survey, rng)
    return replicates
|