cbps 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cbps/__init__.py +3462 -0
- cbps/constants.py +46 -0
- cbps/core/__init__.py +93 -0
- cbps/core/cbps_binary.py +1943 -0
- cbps/core/cbps_continuous.py +945 -0
- cbps/core/cbps_multitreat.py +1123 -0
- cbps/core/cbps_optimal.py +507 -0
- cbps/core/results.py +1447 -0
- cbps/data/Blackwell.csv +571 -0
- cbps/data/LaLonde.csv +3213 -0
- cbps/data/npcbps_continuous_sim.csv +501 -0
- cbps/data/nsw.csv +723 -0
- cbps/data/nsw_dw.csv +446 -0
- cbps/data/political_ads_urban_niebler.csv +16266 -0
- cbps/data/psid_controls.csv +2491 -0
- cbps/data/psid_controls2.csv +254 -0
- cbps/data/psid_controls3.csv +129 -0
- cbps/data/simulation_dgp1_seed12345.csv +201 -0
- cbps/data/simulation_dgp2_seed12345.csv +201 -0
- cbps/data/simulation_dgp3_seed12345.csv +201 -0
- cbps/data/simulation_dgp4_seed12345.csv +201 -0
- cbps/datasets/__init__.py +78 -0
- cbps/datasets/blackwell.py +112 -0
- cbps/datasets/continuous.py +223 -0
- cbps/datasets/lalonde.py +272 -0
- cbps/datasets/npcbps_sim.py +101 -0
- cbps/diagnostics/__init__.py +101 -0
- cbps/diagnostics/balance.py +760 -0
- cbps/diagnostics/balance_cbmsm_addon.py +162 -0
- cbps/diagnostics/continuous_diagnostics.py +259 -0
- cbps/diagnostics/normality.py +173 -0
- cbps/diagnostics/ocbps_conditions.py +197 -0
- cbps/diagnostics/overlap.py +198 -0
- cbps/diagnostics/plots.py +1193 -0
- cbps/diagnostics/weights_diag.py +205 -0
- cbps/highdim/__init__.py +84 -0
- cbps/highdim/gmm_loss.py +340 -0
- cbps/highdim/hdcbps.py +1078 -0
- cbps/highdim/lasso_utils.py +498 -0
- cbps/highdim/weight_funcs.py +298 -0
- cbps/inference/__init__.py +42 -0
- cbps/inference/asyvar.py +621 -0
- cbps/inference/vcov_outcome.py +217 -0
- cbps/iv/__init__.py +48 -0
- cbps/iv/cbiv.py +2603 -0
- cbps/logging_config.py +45 -0
- cbps/msm/__init__.py +45 -0
- cbps/msm/cbmsm.py +1871 -0
- cbps/msm/rank_diagnostics.py +112 -0
- cbps/nonparametric/__init__.py +58 -0
- cbps/nonparametric/cholesky_whitening.py +232 -0
- cbps/nonparametric/empirical_likelihood.py +339 -0
- cbps/nonparametric/npcbps.py +1036 -0
- cbps/nonparametric/taylor_approx.py +207 -0
- cbps/py.typed +0 -0
- cbps/sklearn/__init__.py +42 -0
- cbps/sklearn/estimator.py +378 -0
- cbps/utils/__init__.py +82 -0
- cbps/utils/formula.py +415 -0
- cbps/utils/helpers.py +378 -0
- cbps/utils/numerics.py +438 -0
- cbps/utils/r_compat.py +109 -0
- cbps/utils/validation.py +224 -0
- cbps/utils/variance_transform.py +483 -0
- cbps/utils/weights.py +586 -0
- cbps-0.2.0.dist-info/METADATA +1090 -0
- cbps-0.2.0.dist-info/RECORD +70 -0
- cbps-0.2.0.dist-info/WHEEL +5 -0
- cbps-0.2.0.dist-info/licenses/LICENSE +661 -0
- cbps-0.2.0.dist-info/top_level.txt +1 -0
cbps/utils/helpers.py
ADDED
|
@@ -0,0 +1,378 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Data Preprocessing and Validation Utilities
|
|
3
|
+
|
|
4
|
+
This module provides helper functions for data preprocessing tasks
|
|
5
|
+
commonly needed before CBPS estimation, including sample weight
|
|
6
|
+
normalization, input validation, missing value handling, and
|
|
7
|
+
treatment variable encoding.
|
|
8
|
+
|
|
9
|
+
Functions
|
|
10
|
+
---------
|
|
11
|
+
normalize_sample_weights
|
|
12
|
+
Normalize sampling weights to sum to the sample size.
|
|
13
|
+
validate_arrays
|
|
14
|
+
Validate treatment and covariate array dimensions and types.
|
|
15
|
+
handle_missing
|
|
16
|
+
Remove observations with missing values.
|
|
17
|
+
encode_treatment_factor
|
|
18
|
+
Convert categorical treatment to numeric encoding.
|
|
19
|
+
|
|
20
|
+
References
|
|
21
|
+
----------
|
|
22
|
+
Imai, K. and Ratkovic, M. (2014). Covariate balancing propensity score.
|
|
23
|
+
Journal of the Royal Statistical Society, Series B 76(1), 243-263.
|
|
24
|
+
"""
|
|
25
|
+
|
|
26
|
+
import warnings
|
|
27
|
+
from typing import Optional, Tuple, Union
|
|
28
|
+
|
|
29
|
+
import numpy as np
|
|
30
|
+
import pandas as pd
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def normalize_sample_weights(
    sample_weights: Optional[np.ndarray],
    n: int
) -> np.ndarray:
    """
    Normalize sampling weights to sum to the sample size.

    Applies the transformation ``sw = sw / mean(sw)`` so that
    ``sum(sw) = n``.

    Parameters
    ----------
    sample_weights : np.ndarray or None
        Original sampling weights, shape (n,).
        If None, returns uniform weights (all ones).
    n : int
        Number of observations (target sum).

    Returns
    -------
    np.ndarray
        Normalized weights satisfying ``sum(weights) = n``, dtype=float64.

    Raises
    ------
    ValueError
        If the number of weights differs from ``n``, any weight is NaN or
        infinite, all weights are zero, any weights are negative, or
        normalization produces unexpected results.

    Warns
    -----
    UserWarning
        If any weights are exactly zero (valid but noteworthy).

    Notes
    -----
    Zero weights are permitted for trimmed or survey designs, but a
    warning is issued since those observations are effectively excluded.

    Examples
    --------
    >>> import numpy as np
    >>> sw = np.array([0.5, 1.0, 1.5, 2.0])
    >>> sw_norm = normalize_sample_weights(sw, n=4)
    >>> bool(np.isclose(sw_norm.sum(), 4.0))
    True

    References
    ----------
    Imai, K. and Ratkovic, M. (2014). Covariate balancing propensity score.
    Journal of the Royal Statistical Society, Series B 76(1), 243-263.
    """
    # Step 1: Handle None case
    if sample_weights is None:
        return np.ones(n, dtype=np.float64)

    # Step 2: Convert to float64 array
    sw = np.asarray(sample_weights, dtype=np.float64)

    # Dividing by the mean makes the weights sum to sw.size, not n, so a
    # length mismatch must be caught here. The final np.isclose check cannot
    # be relied on for this: its default relative tolerance lets an
    # off-by-one length slip through once n is large.
    if sw.size != n:
        raise ValueError(
            f"sample_weights has {sw.size} elements but n={n} "
            f"observations were expected."
        )

    # NaN/inf would otherwise propagate through the mean and surface as a
    # misleading "internal error" (or "all non-positive") message below.
    n_bad = int((~np.isfinite(sw)).sum())
    if n_bad > 0:
        raise ValueError(
            f"sample_weights must be finite. "
            f"Found {n_bad} NaN or infinite values."
        )

    # Check for all-zero weights and provide informative error
    if not np.any(sw > 0):  # All weights <= 0
        if np.all(sw == 0):
            raise ValueError(
                f"sample_weights cannot be all zeros. "
                f"Received weights with sum={sw.sum():.1f} for n={n} observations."
            )
        else:
            raise ValueError(
                f"sample_weights must contain at least one positive value. "
                f"Received weights with sum={sw.sum():.6f} (all non-positive)."
            )

    # Step 3: Check for negative weights (not allowed)
    if np.any(sw < 0):
        raise ValueError(
            f"sample_weights must be non-negative (>= 0). "
            f"Found {(sw < 0).sum()} negative weights."
        )

    # Allow zero weights with warning (valid for survey designs and trimmed weights)
    if np.any(sw == 0):
        n_zeros = (sw == 0).sum()
        warnings.warn(
            f"sample_weights contains {n_zeros} zero values. "
            f"These observations will be effectively excluded from the analysis. "
            f"This is valid for trimmed weights or survey designs, but verify this is intentional.",
            UserWarning,
            # Skips this helper and its immediate caller when attributing
            # the warning.
            stacklevel=3
        )

    # Step 4: Normalize by dividing by mean
    sw = sw / sw.mean()

    # Step 5: Sanity-check the result. np.isclose combines atol with its
    # default relative tolerance, so this only guards against gross failures
    # (e.g. overflow in the mean), not length mismatches (handled above).
    sum_sw = sw.sum()
    # Use ValueError instead of assert for better error messages
    if not np.isclose(sum_sw, n, atol=1e-10):
        raise ValueError(
            f"Internal error in weight normalization: "
            f"normalized weights sum to {sum_sw:.10f} instead of {n}. "
            f"Difference: {abs(sum_sw - n):.2e}. This should not happen."
        )

    return sw
|
|
137
|
+
|
|
138
|
+
|
|
139
|
+
def validate_arrays(
    treat: np.ndarray,
    X: np.ndarray,
    check_rank: bool = True
) -> Tuple[np.ndarray, np.ndarray]:
    """
    Validate and standardize treatment and covariate arrays.

    Performs type conversion, dimension checking, missing value detection,
    and optional rank verification for the design matrix.

    Parameters
    ----------
    treat : np.ndarray
        Treatment vector, shape (n,). Any array-like accepted by
        ``np.asarray`` with a float64-convertible dtype works.
    X : np.ndarray
        Covariate matrix, shape (n, k). Any array-like accepted by
        ``np.asarray`` with a float64-convertible dtype works.
    check_rank : bool, default=True
        If True, verify X has full column rank.

    Returns
    -------
    treat : np.ndarray
        Validated treatment vector, dtype=float64.
    X : np.ndarray
        Validated covariate matrix, dtype=float64.

    Raises
    ------
    ValueError
        If either input is a scalar, dimensions mismatch, arrays contain
        NaN, or (when check_rank=True) X is not 2-D or is rank-deficient.

    Notes
    -----
    Full column rank is required for the GMM optimization to have a
    unique solution. Rank deficiency typically indicates collinear
    covariates that should be removed.

    Examples
    --------
    >>> import numpy as np
    >>> treat = np.array([1, 0, 1, 0])
    >>> X = np.array([[1, 25], [1, 30], [1, 35], [1, 40]])
    >>> treat_v, X_v = validate_arrays(treat, X)
    >>> treat_v.dtype
    dtype('float64')

    References
    ----------
    Imai, K. and Ratkovic, M. (2014). Covariate balancing propensity score.
    Journal of the Royal Statistical Society, Series B 76(1), 243-263.
    """
    # Type conversion (enforce float64). Done before any shape access so
    # that list / tuple inputs, which have no ``.shape``, are accepted.
    treat = np.asarray(treat, dtype=np.float64)
    X = np.asarray(X, dtype=np.float64)

    if treat.ndim == 0 or X.ndim == 0:
        raise ValueError(
            "treat and X must be array-like with at least one dimension"
        )

    # Dimension check
    if len(treat) != X.shape[0]:
        raise ValueError(
            f"treat length {len(treat)} != X rows {X.shape[0]}"
        )

    # Missing value check
    if np.isnan(treat).any() or np.isnan(X).any():
        raise ValueError(
            "Arrays contain NaN values. Use handle_missing() first."
        )

    # Full rank check
    if check_rank:
        # The column count below is X.shape[1]; without this guard a 1-D X
        # fails with an opaque IndexError instead of a validation error.
        if X.ndim != 2:
            raise ValueError(
                f"X must be a 2-D array of shape (n, k) to check its rank; "
                f"got an array with {X.ndim} dimension(s)"
            )
        rank = np.linalg.matrix_rank(X)
        if rank < X.shape[1]:
            raise ValueError(
                f"X is not full rank: rank={rank} < ncol={X.shape[1]}"
            )

    return treat, X
|
|
217
|
+
|
|
218
|
+
|
|
219
|
+
def handle_missing(
    data: pd.DataFrame,
    relevant_cols: Optional[list] = None
) -> Tuple[pd.DataFrame, int]:
    """
    Remove observations with missing values.

    Performs listwise deletion of rows containing NA/NaN in the
    specified columns, with a warning indicating how many rows
    were removed. The input DataFrame is not modified.

    Parameters
    ----------
    data : pd.DataFrame
        Input DataFrame.
    relevant_cols : list of str, optional
        Columns to check for missing values. A single column name given
        as a string is treated as a one-element list.
        If None, checks all columns.

    Returns
    -------
    data_clean : pd.DataFrame
        DataFrame with missing-value rows removed.
        Original index is preserved.
    n_dropped : int
        Number of rows removed.

    Warns
    -----
    UserWarning
        If any rows were dropped, indicates the count.

    Examples
    --------
    >>> import pandas as pd
    >>> import numpy as np
    >>> df = pd.DataFrame({
    ...     'treat': [1, 0, np.nan, 1],
    ...     'age': [25, 30, 35, np.nan]
    ... })
    >>> df_clean, n_drop = handle_missing(df)
    >>> len(df_clean)
    2
    >>> n_drop
    2

    References
    ----------
    Imai, K. and Ratkovic, M. (2014). Covariate balancing propensity score.
    Journal of the Royal Statistical Society, Series B 76(1), 243-263.
    """
    if relevant_cols is None:
        relevant_cols = data.columns.tolist()
    elif isinstance(relevant_cols, str):
        # Normalize a bare column name so the subset is always a list of
        # labels rather than depending on how pandas treats a scalar.
        relevant_cols = [relevant_cols]

    # Drop rows containing NA (preserve index); dropna returns a new frame.
    data_clean = data.dropna(subset=relevant_cols)
    n_dropped = len(data) - len(data_clean)

    # Warning message (matches na.omit behavior)
    if n_dropped > 0:
        warnings.warn(
            f"Removed {n_dropped} observations with missing values",
            UserWarning,
            # Attribute the warning to the caller, not to this helper.
            stacklevel=2
        )

    return data_clean, n_dropped
|
|
285
|
+
|
|
286
|
+
|
|
287
|
+
def encode_treatment_factor(
    treat: Union[pd.Series, np.ndarray],
    att: int,
    verbose: int = 1
) -> Tuple[np.ndarray, list, np.ndarray]:
    """
    Encode categorical treatment variable to binary 0/1.

    Converts a two-level categorical treatment to numeric encoding,
    with the encoding direction controlled by the ATT parameter.

    Parameters
    ----------
    treat : pd.Series or np.ndarray
        Categorical treatment variable with exactly 2 levels.
    att : int
        Target estimand controlling encoding:

        - 0: ATE - second level becomes 1
        - 1: ATT - second level as treated group
        - 2: ATT - first level as treated group (inverts encoding)

    verbose : int, default=1
        If > 0, print information about which level is treated.

    Returns
    -------
    treat_numeric : np.ndarray
        Binary treatment vector (0/1), dtype=float64.
    levels : list
        List of the two factor levels, in encoding order.
    treat_orig : np.ndarray
        Original treatment values (for reference).

    Raises
    ------
    ValueError
        If ``treat`` does not have exactly two observed levels.

    Notes
    -----
    Level ordering:

    - For a Series with categorical dtype, the category order is used and
      categories that never occur in the data are ignored.
    - Otherwise the unique values are sorted, so
      ``['control', 'treatment']`` becomes ``[0, 1]`` with
      'treatment' = 1.
    - ATT=2 inverts this, making the first level = 1.

    Examples
    --------
    >>> import pandas as pd
    >>> treat = pd.Categorical(['control', 'treatment', 'control', 'treatment'])
    >>> treat_num, levels, _ = encode_treatment_factor(treat, att=1, verbose=0)
    >>> [float(x) for x in treat_num]
    [0.0, 1.0, 0.0, 1.0]
    >>> levels
    ['control', 'treatment']

    References
    ----------
    Imai, K. and Ratkovic, M. (2014). Covariate balancing propensity score.
    Journal of the Royal Statistical Society, Series B 76(1), 243-263.
    """
    # Save original values
    treat_orig = np.asarray(treat).copy()

    # Extract levels
    if isinstance(treat, pd.Series):
        if isinstance(treat.dtype, pd.CategoricalDtype):
            # Declared-but-unused categories must not count as levels:
            # otherwise levels[1] can name a value that never occurs and
            # every observation would be encoded as 0.
            levels = treat.cat.remove_unused_categories().cat.categories.tolist()
        else:
            levels = sorted(treat.unique())
    else:
        levels = sorted(np.unique(treat))

    # The encoding below indexes levels[0] and levels[1]; anything other
    # than two levels would either crash or be silently mis-encoded.
    if len(levels) != 2:
        shown = ", ".join(repr(lv) for lv in levels[:5])
        if len(levels) > 5:
            shown += ", ..."
        raise ValueError(
            f"treat must have exactly 2 levels for binary encoding; "
            f"found {len(levels)} ({shown})."
        )

    # ATT encoding: second level becomes 1, first level becomes 0
    treat_numeric = (treat_orig == levels[1]).astype(int)

    # ATT=2: Invert treatment assignment
    if att == 2:
        treat_numeric = 1 - treat_numeric

    # Print ATT information (controlled by verbose parameter)
    if verbose > 0:
        if att == 1:
            print(
                f"Finding ATT with T={levels[1]} as the treatment. "
                f"Set ATT=2 to find ATT with T={levels[0]} as the treatment"
            )
        elif att == 2:
            print(
                f"Finding ATT with T={levels[0]} as the treatment. "
                f"Set ATT=1 to find ATT with T={levels[1]} as the treatment"
            )
        # ATT=0: No message (ATE scenario)

    return treat_numeric.astype(np.float64), levels, treat_orig
|
|
378
|
+
|